super_crawler 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 4f566869c9f06df215b047291bbc08a83e25a176
+   data.tar.gz: ee35c2dcee2dae0289e70a8fb0195c63859bf793
+ SHA512:
+   metadata.gz: e7f5c9db1479c83af91879d41f9ea6480142ddf11fba9fcc3bc944f46de90d8b821929582526f245cd6752a4506019f8b2489d08175e5c6fe30faea620c1ea61
+   data.tar.gz: 1c81fc244e6679cfe6782b9651782944306290a9281277658a220155ef6c4a7ccee59b9b7a2f058c0862d52027e7f8453ba01a21c953d2f31ff5980e9fea87ad
data/.gitignore ADDED
@@ -0,0 +1,11 @@
+ /.bundle/
+ /.yardoc
+ /Gemfile.lock
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/
+
+ *.gem
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --format documentation
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,5 @@
+ sudo: false
+ language: ruby
+ rvm:
+ - 2.3.1
+ before_install: gem install bundler -v 1.12.5
data/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,49 @@
+ # Contributor Code of Conduct
+
+ As contributors and maintainers of this project, and in the interest of
+ fostering an open and welcoming community, we pledge to respect all people who
+ contribute through reporting issues, posting feature requests, updating
+ documentation, submitting pull requests or patches, and other activities.
+
+ We are committed to making participation in this project a harassment-free
+ experience for everyone, regardless of level of experience, gender, gender
+ identity and expression, sexual orientation, disability, personal appearance,
+ body size, race, ethnicity, age, religion, or nationality.
+
+ Examples of unacceptable behavior by participants include:
+
+ * The use of sexualized language or imagery
+ * Personal attacks
+ * Trolling or insulting/derogatory comments
+ * Public or private harassment
+ * Publishing other's private information, such as physical or electronic
+   addresses, without explicit permission
+ * Other unethical or unprofessional conduct
+
+ Project maintainers have the right and responsibility to remove, edit, or
+ reject comments, commits, code, wiki edits, issues, and other contributions
+ that are not aligned to this Code of Conduct, or to ban temporarily or
+ permanently any contributor for other behaviors that they deem inappropriate,
+ threatening, offensive, or harmful.
+
+ By adopting this Code of Conduct, project maintainers commit themselves to
+ fairly and consistently applying these principles to every aspect of managing
+ this project. Project maintainers who do not follow or enforce the Code of
+ Conduct may be permanently removed from the project team.
+
+ This code of conduct applies both within project spaces and in public spaces
+ when an individual is representing the project or its community.
+
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
+ reported by contacting a project maintainer at htaidirt@gmail.com. All
+ complaints will be reviewed and investigated and will result in a response that
+ is deemed necessary and appropriate to the circumstances. Maintainers are
+ obligated to maintain confidentiality with regard to the reporter of an
+ incident.
+
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+ version 1.3.0, available at
+ [http://contributor-covenant.org/version/1/3/0/][version]
+
+ [homepage]: http://contributor-covenant.org
+ [version]: http://contributor-covenant.org/version/1/3/0/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in super_crawler.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2016 Hassen Taidirt
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,255 @@
+ # SuperCrawler
+
+ An easy (yet efficient) Ruby gem to crawl your favorite website.
+
+ ## Quick Start
+
+ Open your terminal, then:
+
+ ```bash
+ $ git clone https://github.com/htaidirt/super_crawler
+
+ $ cd super_crawler
+
+ $ bundle
+
+ $ ./bin/console
+ ```
+
+ ```ruby
+ > sc = SuperCrawler::CrawlSite.new('https://gocardless.com')
+
+ > sc.start # => Start crawling the website
+
+ > sc.render(5) # => Show the first 5 crawled pages as a sitemap
+ ```
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'super_crawler'
+ ```
+
+ And then execute:
+
+ ```bash
+ bundle install
+ ```
+
+ Or install it yourself as:
+
+ ```bash
+ gem install super_crawler
+ ```
+
+ Want to experiment with the gem without installing it? Clone the repository (see Quick Start above) and run `bin/console` for an interactive prompt that will let you experiment.
+
+ ## Warning!
+
+ This gem is an experiment and is not meant for production use. Please use it with caution if you include it in your projects.
+
+ It also has a number of known limitations; see the Limitations section below.
+
+ SuperCrawler has only been tested on MRI Ruby 2.3.1.
+
+ ## Philosophy
+
+ Starting from a URL, extract all the internal links and assets within the page. Add every unique link to a list for later exploration. Repeat for each link in the list until no new link is discovered.
+
+ Because fetching and parsing each page is slow, the crawler uses threads to perform near-parallel processing (a simplified sketch of this loop follows the class list below).
+
+ To keep the code readable and structured, the gem defines two classes:
+
+ - `SuperCrawler::CrawlPage`, which is responsible for crawling a single page and extracting all relevant information (internal links and assets)
+ - `SuperCrawler::CrawlSite`, which is responsible for crawling a whole website by collecting links and calling `SuperCrawler::CrawlPage` within threads. This class is also responsible for rendering results.
+
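+ To make the loop concrete, here is a minimal, single-threaded sketch (the `naive_crawl` helper is illustrative only; `CrawlSite` below does the same work with a thread pool and a queue):
+
+ ```ruby
+ require 'super_crawler'
+
+ def naive_crawl(start_url)
+   links   = [start_url] # All unique internal links discovered so far
+   queue   = [start_url] # Links waiting to be crawled
+   results = []          # One entry per crawled page
+
+   while (url = queue.shift)
+     page       = SuperCrawler::CrawlPage.new(url)
+     page_links = page.get_links
+     new_links  = page_links - links # Keep only links we haven't seen yet
+
+     links   += new_links
+     queue   += new_links
+     results << { url: page.url, links: page_links, assets: page.get_assets }
+   end
+
+   results
+ end
+ ```
+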
+ ## More detailed usage
+
+ Open your favorite Ruby console and require the gem:
+
+ ```ruby
+ require 'super_crawler'
+ ```
+
+ ### Crawling a single web page
+
+ Read the following if you would like to crawl a single web page and extract relevant information (internal links and assets).
+
+ ```ruby
+ page = SuperCrawler::CrawlPage.new( url )
+ ```
+
+ where `url` is the URL of the page you would like to crawl.
+
+ **Note:** When the URL is missing a scheme (`http://` or `https://`), SuperCrawler prepends `http://` to it.
+
+ #### Get the encoded URL
+
+ Run
+
+ ```ruby
+ page.url
+ ```
+
+ to get the encoded version of the URL you provided.
+
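+ For example, with a hypothetical URL that has no scheme:
+
+ ```ruby
+ page = SuperCrawler::CrawlPage.new('gocardless.com/pricing')
+ page.url # => "http://gocardless.com/pricing" (the http:// scheme was prepended)
+ ```
+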
+ #### Get internal links of a page
+
+ Run
+
+ ```ruby
+ page.get_links
+ ```
+
+ to get a list of internal links within the crawled page. An internal link is a link that _has the same host as the page URL_. Subdomains are rejected.
+
+ This method searches the `href` attribute of all `<a>` anchor tags.
+
+ **Note:** This method returns an array of absolute URLs (all internal links).
+
+ **Note 2:** Malformed links and special links (like `mailto:` and `javascript:`) are discarded.
+
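+ As an illustration, for a hypothetical page at `http://example.com/` containing `<a href="/about">`, `<a href="http://example.com/blog#news">` and `<a href="mailto:someone@example.com">`, the call would return something like:
+
+ ```ruby
+ page = SuperCrawler::CrawlPage.new('http://example.com/')
+ page.get_links
+ # => ["http://example.com/about", "http://example.com/blog"]
+ # Relative links are made absolute, fragments are stripped and mailto: links are discarded.
+ ```
+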
+ #### Get images of a page
+
+ Run
+
+ ```ruby
+ page.get_images
+ ```
+
+ to get a list of image links within the page. The image links are extracted from the `src="..."` attribute of all `<img>` tags.
+
+ **Note:** Images included via CSS or JavaScript aren't detected by this method.
+
+ **Note 2:** This method returns an array of absolute URLs.
+
+ #### Get stylesheets of a page
+
+ Run
+
+ ```ruby
+ page.get_stylesheets
+ ```
+
+ to get a list of stylesheet links within the page. The stylesheet links are extracted from the `href="..."` attribute of all `<link rel="stylesheet">` tags.
+
+ **Note:** Inline styles aren't yet detected by this method.
+
+ **Note 2:** This method returns an array of absolute URLs.
+
+ #### Get scripts of a page
+
+ Run
+
+ ```ruby
+ page.get_scripts
+ ```
+
+ to get a list of script links within the page. The script links are extracted from the `src="..."` attribute of all `<script>` tags.
+
+ **Note:** Inline scripts aren't yet detected by this method.
+
+ **Note 2:** This method returns an array of absolute URLs.
+
+ #### List all assets of a page
+
+ Run
+
+ ```ruby
+ page.get_assets
+ ```
+
+ to get all assets (image, stylesheet and script links) as a hash of arrays.
+
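+ The returned hash uses symbol keys, one per asset type:
+
+ ```ruby
+ page.get_assets
+ # => {
+ #      images:      [...array of image URLs...],
+ #      stylesheets: [...array of stylesheet URLs...],
+ #      scripts:     [...array of script URLs...]
+ #    }
+ ```
+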
+ ### Crawling a whole web site
+
+ First, instantiate the site crawler:
+
+ ```ruby
+ sc = SuperCrawler::CrawlSite.new(url, count_threads)
+ ```
+
+ where `url` is the URL of the site to crawl, and `count_threads` is the number of threads used for the job (10 by default).
+
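+ The constructor also accepts an options hash as a third argument. Currently the only recognized key is `:debug`, which controls progress logging and defaults to `true`:
+
+ ```ruby
+ # Crawl with 20 threads and without progress output
+ sc = SuperCrawler::CrawlSite.new('https://gocardless.com', 20, debug: false)
+ ```
+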
+ Next, start the crawler:
+
+ ```ruby
+ sc.start
+ ```
+
+ This can take some time, depending on the size of the site.
+
+ To access the crawl results, use the following:
+
+ ```ruby
+ sc.links # The array of internal links
+
+ sc.crawl_results # Array of hashes containing links and assets for every page crawled
+ ```
+
+ To see the crawl results as a sitemap, use:
+
+ ```ruby
+ sc.render(5) # Render the sitemap of the first 5 crawled pages
+ ```
+
+ TODO: Build a more sophisticated rendering class that can write files in different formats (HTML, XML, JSON, ...).
+
+ #### Tips on searching assets and links
+
+ After `sc.start`, you can access all collected resources (links and assets) using `sc.crawl_results`. It has the following structure:
+
+ ```ruby
+ [
+   {
+     url: 'http://example.com/',
+     links: [...array of internal links...],
+     assets: {
+       images: [...array of image links],
+       stylesheets: [...array of stylesheet links],
+       scripts: [...array of script links],
+     }
+   },
+   ...
+ ]
+ ```
+
+ You can use `sc.crawl_results.select{ |resource| ... }` to select particular resources.
+
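+ For example (hypothetical queries over the structure above):
+
+ ```ruby
+ # Pages that load at least one script
+ sc.crawl_results.select { |resource| resource[:assets][:scripts].any? }
+
+ # Every image URL found on the site, without duplicates
+ sc.crawl_results.flat_map { |resource| resource[:assets][:images] }.uniq
+ ```
+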
+ ## Limitations
+
+ Currently, the gem has the following limitations:
+
+ - Subdomains are not considered internal links
+ - HTTP and HTTPS versions of a page are treated as different links. This can increase the number of links found, but it is kept this way because some sites don't serve the same content over HTTP and HTTPS
+ - Only links within `<a href="...">` tags are extracted
+ - Only image links within `<img src="..."/>` tags are extracted
+ - Only stylesheet links within `<link rel="stylesheet" href="..." />` tags are extracted
+ - Only script links within `<script src="...">` tags are extracted
+ - A page that is not accessible (e.g. returns a 404 error) is not retried later
+
+ ## Development
+
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
+
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+
+ ## Contributing
+
+ Bug reports and pull requests are welcome on GitHub at https://github.com/htaidirt/super_crawler. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
+
+ Want to contribute? Please follow this process:
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
+
+ ## License
+
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
+
+ ## Don't forget to have fun coding Ruby...
data/Rakefile ADDED
@@ -0,0 +1,6 @@
+ require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+
+ RSpec::Core::RakeTask.new(:spec)
+
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
+ #!/usr/bin/env ruby
+
+ require "bundler/setup"
+ require "super_crawler"
+
+ # You can add fixtures and/or initialization code here to make experimenting
+ # with your gem easier. You can also use a different console, if you like.
+
+ # (If you use this, don't forget to add pry to your Gemfile!)
+ # require "pry"
+ # Pry.start
+
+ require "irb"
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
data/lib/super_crawler/crawl_page.rb ADDED
@@ -0,0 +1,160 @@
+ require "open-uri"
+ require "open_uri_redirections"
+ require "nokogiri"
+
+ module SuperCrawler
+
+   ###
+   # Crawl a single HTML page
+   # Responsible for extracting all relevant information within a page
+   #
+   class CrawlPage
+
+     attr_reader :url
+
+     def initialize url
+       # Normalize the URL, by adding http(s) if not present in the URL
+       # NOTA: By default, add http:// scheme to an URL that doesn't have one
+       @url = URI.encode( !!(url =~ /^(http(s)?:\/\/)/) ? url : ('http://' + url) )
+     end
+
+     ###
+     # Get INTERNAL links of the page (same host)
+     #
+     def get_links
+       return [] unless page_exists?
+
+       # Get all the links that are within <a> tag, using Nokogiri
+       links = get_doc.css('a').map{ |link| link['href'] }.compact
+
+       # Select only internal links (relative links, or absolute links with the same host)
+       links.select!{ |link| URI.parse(URI.encode link).host.nil? || URI.parse(URI.encode link).host == URI.parse(@url).host }
+
+       # Reject bad and special links (like mailto, tel and javascript)
+       links.reject!{ |link| !!(link =~ /^(mailto:|tel:|javascript:)/) }
+
+       # Clean the links
+       links.map!{ |link| create_absolute_url( link ) }           # Make all links absolute
+            .map!{ |link| link.split('#')[0] }                    # Remove the fragment part from the links (...#...) if any
+            .map!{ |link| URI(URI.encode link).normalize().to_s } # Normalize links
+
+       return links.uniq # Return links without duplicates
+     end
+
+     ###
+     # Get all the images within a page
+     # NOTA: These are images within <img src="..." /> tag.
+     #
+     def get_images
+       return [] unless page_exists?
+
+       # Get all the images sources (URLs), using Nokogiri
+       images_links = get_doc.css('img').map{ |image| image['src'] }.compact
+
+       # Create the absolute path of the images
+       images_links.map!{ |image| create_absolute_url( image ) }
+
+       return images_links.uniq # Return links to images without duplicates
+     end
+
+     ###
+     # Get all the CSS links within a page
+     # NOTA: These are links within <link href="..." /> tag.
+     #
+     def get_stylesheets
+       return [] unless page_exists?
+
+       # Get all the stylesheet links (URLs), using Nokogiri
+       css_links = get_doc.css('link').select{ |css_link| css_link['rel'] == 'stylesheet' }
+                          .map{ |css_link| css_link['href'] }
+                          .compact
+
+       # Create the absolute path of the CSS links
+       css_links.map!{ |css_link| create_absolute_url( css_link ) }
+
+       return css_links.uniq # Return links to CSS files without duplicates
+     end
+
+     ###
+     # Get all the JS scripts within a page
+     # NOTA: These are scripts within <script src="..." /> tag.
+     #
+     def get_scripts
+       return [] unless page_exists?
+
+       # Get all the script sources (URLs), using Nokogiri
+       scripts_links = get_doc.css('script').map{ |script| script['src'] }.compact
+
+       # Create the absolute path of the scripts
+       scripts_links.map!{ |script| create_absolute_url( script ) }
+
+       return scripts_links.uniq # Return links to scripts without duplicates
+     end
+
+     ###
+     # Get all assets within a page
+     # Returns a hash of images, stylesheets and scripts URLs
+     #
+     def get_assets
+       {
+         'images': get_images,
+         'stylesheets': get_stylesheets,
+         'scripts': get_scripts
+       }
+     end
+
+     ###
+     # Get links and assets within a page
+     # Returns a hash of links, images, stylesheets and scripts URLs
+     #
+     def get_all
+       {
+         'links': get_links,
+         'images': get_images,
+         'stylesheets': get_stylesheets,
+         'scripts': get_scripts
+       }
+     end
+
+     ###
+     # Check if the page exists
+     #
+     def page_exists?
+       !!( get_doc rescue false )
+     end
+
+     private
+
+     ###
+     # Get the page `doc` (document) from Nokogiri.
+     # Cache it for performance.
+     #
+     def get_doc
+       begin
+         @doc ||= Nokogiri(open( @url , allow_redirections: :all ))
+       rescue Exception => e
+         raise "Problem with URL #{@url}: #{e}"
+       end
+     end
+
+     ###
+     # Extract the base URL (scheme and host only)
+     #
+     # eg:
+     #   http://mysite.com/abc           -> http://mysite.com
+     #   https://dev.mysite.co.uk/mylink -> https://dev.mysite.co.uk
+     def base_url
+       "#{URI.parse(@url).scheme}://#{URI.parse(@url).host}"
+     end
+
+     ###
+     # Given a URL, return the absolute URL
+     #
+     def create_absolute_url url
+       # Append the base URL (scheme+host) if the provided URL is relative
+       URI.parse(URI.encode url).host.nil? ? (base_url + url) : url
+     end
+
+   end
+
+ end
data/lib/super_crawler/crawl_site.rb ADDED
@@ -0,0 +1,154 @@
+ require 'thread'
+
+ require 'super_crawler/crawl_page'
+
+ module SuperCrawler
+
+   ###
+   # Crawl a whole website
+   #
+   class CrawlSite
+
+     attr_reader :links, :crawl_results
+
+     def initialize start_url, threads = 10, options = {}
+       @start_url = URI(URI.encode start_url).normalize().to_s # Normalize the given URL
+       @links = [@start_url] # Will contain the list of all links found
+       @crawl_results = []   # Will contain the crawl results (links and assets), as array of hashes
+       @threads = threads    # How many threads to use? Default: 10
+
+       @option_debug = options[:debug].nil? ? true : !!(options[:debug]) # Debug by default
+     end
+
+     ###
+     # Start crawling site
+     # Could take a while. Use threads to speed up crawling and logging to inform user.
+     #
+     def start
+
+       crawling_start_notice    # Show message on what will happen
+       threads = []             # Will contain our threads
+       @links_queue = Queue.new # Will contain the links queue that the threads will use
+       @links = [@start_url]    # Re-init the links list
+       @crawl_results = []      # Re-init the crawling results
+
+       start_time = Time.now if @option_debug # Start the timer
+
+       # Let's populate our queue with links and resources from source url
+       process_page( @start_url )
+
+       # Create threads to handle new links
+       @threads.times do # Create many threads
+
+         threads << Thread.new do # Add a new thread
+           begin
+             while current_link = @links_queue.pop(true) # Pop links one after another
+               process_page( current_link ) # Get links and assets
+             end
+           rescue ThreadError # Stop when the links queue is empty
+           end
+         end
+
+       end
+
+       threads.map(&:join) # Wait for all threads to finish
+       crawling_summary_notice(start_time, Time.now) if @option_debug # Display crawling summary
+
+       return true
+     end
+
+     ###
+     # Render sitemap
+     # Show, for each link, internal links and assets
+     # We will limit pages to display, because some sites have more than 1,000 pages
+     #
+     def render max_pages = 10
+       draw_line
+       puts "Showing first #{max_pages} crawled pages and their contents:\n\n"
+       @crawl_results[0..(max_pages-1)].each_with_index do |result, index|
+         puts "[#{index+1}] Content of #{result[:url]}\n"
+
+         puts " + Internal links: #{'None' if result[:links].empty?}"
+         result[:links].each { |link| puts " - #{link}" }
+
+         puts " + Internal images: #{'None' if result[:assets][:images].empty?}"
+         result[:assets][:images].each { |link| puts " - #{link}" }
+
+         puts " + Internal stylesheets: #{'None' if result[:assets][:stylesheets].empty?}"
+         result[:assets][:stylesheets].each { |link| puts " - #{link}" }
+
+         puts " + Internal scripts: #{'None' if result[:assets][:scripts].empty?}"
+         result[:assets][:scripts].each { |link| puts " - #{link}" }
+         puts ""
+       end
+       draw_line
+     end
+
+     private
+
+     ###
+     # Process a page by extracting information and updating links queue, links list and results.
+     #
+     def process_page page_url
+       page = SuperCrawler::CrawlPage.new(page_url) # Crawl the current page
+
+       current_page_links = page.get_links     # Get current page internal links
+       new_links = current_page_links - @links # Select new links
+
+       new_links.each { |link| @links_queue.push(link) } # Add new links to the queue
+       @links += new_links # Add new links to the total links list
+       @crawl_results << { # Provide current page crawl result as a hash
+         url: page.url,             # The crawled page
+         links: current_page_links, # Its internal links
+         assets: page.get_assets    # Its assets
+       }
+
+       log_status( page_url ) if @option_debug # Display site crawling status
+     end
+
+     ###
+     # Display a notice when starting a site crawl
+     #
+     def crawling_start_notice
+       draw_line
+       puts "Start crawling #{@start_url} using #{@threads} threads. Crawling rules:"
+       puts "1. Keep only internal links"
+       puts "2. http and https links are considered different"
+       puts "3. Remove the fragment part from the links (#...)"
+       puts "4. Keep paths with different parameters (?...)"
+       draw_line
+     end
+
+     ###
+     # Log current search status (crawled links / total links)
+     #
+     def log_status url
+       text = "Crawled #{@crawl_results.length.to_s}/#{@links.length.to_s}: #{url}"
+       print "\r#{" "*100}\r" # Clean the previous text
+       print((text.length <= 50) ? text : "#{text[0..46]}...")
+       STDOUT.flush
+     end
+
+     ###
+     # Display final crawling summary after site crawling complete
+     #
+     def crawling_summary_notice time_start, time_end
+       total_time = time_end - time_start
+       puts ""
+       draw_line
+       puts "Crawled #{@links.count} links in #{total_time.to_f.to_s} seconds using #{@threads} threads."
+       puts "Use .crawl_results to access the crawl results as an array of hashes."
+       puts "Use .render to see the crawl_results as a sitemap."
+       draw_line
+     end
+
+     ###
+     # Draw a line (because readability is also important!!)
+     #
+     def draw_line
+       puts "#{'-' * 80}"
+     end
+
+   end
+
+ end
data/lib/super_crawler/version.rb ADDED
@@ -0,0 +1,3 @@
+ module SuperCrawler
+   VERSION = "0.1.0"
+ end
data/lib/super_crawler.rb ADDED
@@ -0,0 +1,4 @@
+ require "super_crawler/version"
+
+ require "super_crawler/crawl_page"
+ require "super_crawler/crawl_site"
data/super_crawler.gemspec ADDED
@@ -0,0 +1,37 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'super_crawler/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "super_crawler"
+   spec.version       = SuperCrawler::VERSION
+   spec.authors       = ["Hassen Taidirt"]
+   spec.email         = ["htaidirt@gmail.com"]
+
+   spec.summary       = %q{Easy (yet efficient) ruby gem to crawl your favorite website.}
+   spec.description   = %q{SuperCrawler allows you to easily crawl full web sites or web pages (extracting internal links and assets) in few seconds.}
+   spec.homepage      = "https://github.com/htaidirt/super_crawler"
+   spec.license       = "MIT"
+
+   # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
+   # to allow pushing to a single host or delete this section to allow pushing to any host.
+   if spec.respond_to?(:metadata)
+     spec.metadata['allowed_push_host'] = "https://rubygems.org"
+   else
+     raise "RubyGems 2.0 or newer is required to protect against public gem pushes."
+   end
+
+   spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+   spec.bindir        = "exe"
+   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency "nokogiri", "~> 1"
+   spec.add_dependency "open_uri_redirections", "~> 0.2"
+   spec.add_dependency "thread", "~> 0.2"
+
+   spec.add_development_dependency "bundler", "~> 1.10"
+   spec.add_development_dependency "rake", "~> 10.0"
+   spec.add_development_dependency "rspec", "~> 3.0"
+ end
metadata ADDED
@@ -0,0 +1,145 @@
+ --- !ruby/object:Gem::Specification
+ name: super_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Hassen Taidirt
+ autorequire:
+ bindir: exe
+ cert_chain: []
+ date: 2016-07-09 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1'
+ - !ruby/object:Gem::Dependency
+   name: open_uri_redirections
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.2'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.2'
+ - !ruby/object:Gem::Dependency
+   name: thread
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.2'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.2'
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.10'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.10'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '3.0'
+ description: SuperCrawler allows you to easily crawl full web sites or web pages (extracting
+   internal links and assets) in few seconds.
+ email:
+ - htaidirt@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - ".rspec"
+ - ".travis.yml"
+ - CODE_OF_CONDUCT.md
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - bin/console
+ - bin/setup
+ - lib/super_crawler.rb
+ - lib/super_crawler/crawl_page.rb
+ - lib/super_crawler/crawl_site.rb
+ - lib/super_crawler/version.rb
+ - super_crawler.gemspec
+ homepage: https://github.com/htaidirt/super_crawler
+ licenses:
+ - MIT
+ metadata:
+   allowed_push_host: https://rubygems.org
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.5.1
+ signing_key:
+ specification_version: 4
+ summary: Easy (yet efficient) ruby gem to crawl your favorite website.
+ test_files: []