simple_crawler 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: ebddf965a3f0021ce3187684afa08db022665878
+   data.tar.gz: afc8d73392ac3918cd11b1a7fd3e1b61340299e9
+ SHA512:
+   metadata.gz: 76da6a650493ea030c63371c529759d8e01a87c7f72131250b5a99dff418320dac908eb5e70a36d50629315f232a0b357a4a48f98d5453ad333a15454028e343
+   data.tar.gz: 060c78ef1d9452efaa042daf3b90208f5d92980de68b136667c468edd493ed022ac063334b87f25c3bb7c437ca95cdaf92bca3dbfde139ae2aae6e152d74ae23
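The checksums above cover the two members of the published `.gem` archive, which is a plain tar file. As a minimal editorial sketch (not part of the package), assuming `metadata.gz` and `data.tar.gz` have already been extracted from `simple_crawler-0.0.1.gem` into the current directory, the digests can be re-derived with Ruby's standard `digest` library:

```ruby
# Editorial sketch: recompute the digests recorded in checksums.yaml.gz.
# Assumes metadata.gz and data.tar.gz have been extracted from the .gem
# archive (e.g. with `tar -xf simple_crawler-0.0.1.gem`).
require 'digest'

%w(metadata.gz data.tar.gz).each do |member|
  puts "#{member}:"
  puts "  SHA1:   #{Digest::SHA1.file(member).hexdigest}"
  puts "  SHA512: #{Digest::SHA512.file(member).hexdigest}"
end
```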
data/.gitignore ADDED
@@ -0,0 +1,20 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .DS_Store
+ .idea/
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+
data/Gemfile ADDED
@@ -0,0 +1,8 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in crawler.gemspec
+ gemspec
+ gem 'awesome_print', '~>1.2.0'
+ gem 'em-synchrony', '~>1.0.3'
+ gem 'em-http-request', '~>1.1.2'
+ gem 'nokogiri', '~>1.6.1'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2014 Anupom Syam
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,49 @@
+ # Crawler
+
+ Crawler is a simple web crawler written in Ruby. Given a URL, it crawls the domain and recursively finds all links
+ associated with it. It also keeps track of the static content related to each of these links.
+
+ It uses EventMachine and Fibers (through em-synchrony) to issue concurrent non-blocking requests.
+ Crawler stores the site map using a variation of the adjacency-list data structure. It can also
+ pretty-print the map once a URL has been crawled.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'crawler'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install crawler
+
+ ## Usage
+
+ ### Using Crawler as a library
+ ```ruby
+ crawler = Crawler.new('http://google.com')
+ # Start crawling the URL
+ crawler.crawl
+ # Generated site map object
+ map = crawler.map
+ # Pretty-print the site map
+ crawler.print
+ ```
+
+ ### Using Crawler as a binary
+ ```sh
+ # Crawl the domain and print the sitemap
+ crawler http://google.com
+ ```
+
+ ## Contributing
+
+ 1. Fork it ( http://github.com/anupom/crawler/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
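The adjacency-list sitemap the README describes corresponds to the `@map` hash and the `Node = Struct.new(:neighbors, :statics)` definition in `lib/crawler.rb` further down. A short editorial sketch (not part of the package) of walking that structure:

```ruby
# Editorial sketch: Crawler#map is a Hash keyed by URL; each value is a
# Node struct with `neighbors` (linked pages on the same domain) and
# `statics` (scripts, stylesheets, images found on that page).
require 'crawler'

crawler = Crawler.new('http://example.com')
crawler.crawl

crawler.map.each do |url, node|
  puts url
  node.neighbors.each { |link| puts "  -> #{link}" }
  node.statics.each   { |asset| puts "  [asset] #{asset}" }
end
```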
data/Rakefile ADDED
@@ -0,0 +1 @@
+ require 'bundler/gem_tasks'
data/bin/crawler ADDED
@@ -0,0 +1,12 @@
+ #!/usr/bin/env ruby
+
+ require 'crawler'
+
+ if ARGV.empty?
+   puts 'Please provide a url to crawl'
+   exit 1
+ end
+
+ crawler = Crawler.new(ARGV[0])
+ crawler.crawl
+ crawler.print
data/crawler.gemspec ADDED
@@ -0,0 +1,24 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+
+ Gem::Specification.new do |spec|
+   spec.name          = 'simple_crawler'
+   spec.version       = '0.0.1'
+   spec.authors       = ['anupom']
+   spec.email         = ['anupom.syam@gmail.com']
+   spec.summary       = %q{Create sitemap from a given url}
+   spec.description   = %q{Simple web crawler to crawl a domain and generate sitemap}
+   spec.homepage      = 'https://github.com/anupom/crawler'
+   spec.license       = 'MIT'
+
+   spec.files         = `git ls-files`.split($/)
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ['lib']
+
+   spec.add_development_dependency 'bundler', '~> 1.5'
+   spec.add_development_dependency 'rake', '~> 10.1'
+   spec.add_development_dependency 'simplecov', '~> 0.8'
+   spec.add_development_dependency 'webmock', '~> 1.17'
+ end
data/lib/crawler.rb ADDED
@@ -0,0 +1,148 @@
+ require 'awesome_print'
+ require 'em-synchrony'
+ require 'em-synchrony/em-http'
+ require 'em-synchrony/fiber_iterator'
+ require 'nokogiri'
+ require 'set'
+
+ class Crawler
+   CONCURRENCY = 5
+   HTTP_OK = 200
+   MAX_REDIRECTS = 3
+   MAX_RETRIES = 3
+   VALID_SCHEMES = %w(http https)
+
+   Node = Struct.new(:neighbors, :statics)
+
+   attr_reader :map
+
+   def initialize(root_url)
+     @map = {}
+     @urls_to_crawl = [root_url]
+     @root_hostname = URI.parse(root_url).hostname
+     @retries = Hash.new { |h, k| h[k] = 0 }
+   end
+
+   def crawl
+     if @urls_to_crawl.empty?
+       EventMachine.stop
+       return
+     end
+
+     EM.synchrony do
+       # Iterate over a copy while we change the main array
+       urls = @urls_to_crawl.dup
+       @urls_to_crawl = crawl_urls(urls)
+       crawl
+     end
+   end
+
+   def print
+     ap @map, { index: false }
+   end
+
+   protected
+
+   def crawl_urls(urls)
+     next_urls = []
+
+     EM::Synchrony::FiberIterator.new(urls, CONCURRENCY).each do |url|
+       next if @map.key?(url)
+
+       http = http_request(url)
+
+       next if http.nil?
+
+       page = Nokogiri::HTML(http.response)
+       neighbors = get_neighbors(page, url)
+       next_urls += neighbors
+
+       statics = get_statics(page)
+
+       @map[url] = Node.new(neighbors, statics)
+     end
+
+     next_urls
+   end
+
+   def http_request(url)
+     http = EventMachine::HttpRequest.new(url)
+            .get redirects: MAX_REDIRECTS
+
+     if http.response_header.status != HTTP_OK
+       queue_for_retry(url)
+       return nil
+     end
+     http
+   rescue Addressable::URI::InvalidURIError
+     nil
+   end
+
+   def queue_for_retry(url)
+     return if @retries[url] == MAX_RETRIES
+     @retries[url] += 1
+     @urls_to_crawl.push(url)
+   end
+
+   def get_neighbors(page, parent_url)
+     neighbors = Set.new
+     links = page.css('a')
+
+     links.each do |link|
+       href = link['href']
+
+       uri = uri_from_href(href)
+
+       next unless valid_uri?(uri)
+
+       uri = URI.join(parent_url, uri) if relative_uri?(uri)
+
+       # Page fragments are ignored for site map
+       uri.fragment = nil
+
+       next if uri.to_s == parent_url
+
+       neighbors.add(uri.to_s)
+     end
+
+     neighbors.to_a
+   end
+
+   def get_statics(page)
+     statics = Set.new
+
+     scripts = page.css('script')
+     scripts.each do |script|
+       statics.add(script['src']) unless script['src'].nil?
+     end
+
+     stylesheets = page.css('link[rel="stylesheet"]')
+     stylesheets.each do |stylesheet|
+       statics.add(stylesheet['href']) unless stylesheet['href'].nil?
+     end
+
+     images = page.css('img')
+     images.each do |image|
+       statics.add(image['src']) unless image['src'].nil?
+     end
+
+     statics.to_a
+   end
+
+   def uri_from_href(href)
+     URI.parse(href)
+   rescue URI::InvalidURIError
+     nil
+   end
+
+   def valid_uri?(uri)
+     return false if uri.nil?
+     return false unless uri.scheme.nil? || VALID_SCHEMES.include?(uri.scheme)
+     return false unless uri.hostname.nil? || uri.hostname == @root_hostname
+     true
+   end
+
+   def relative_uri?(uri)
+     uri.scheme.nil?
+   end
+ end
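To make the link-filtering rules in `valid_uri?` and `relative_uri?` above concrete, here is an editorial sketch that mirrors those checks on a few representative hrefs. The URLs are hypothetical, and the logic is copied from the methods above rather than called through the class (they are protected):

```ruby
# Editorial sketch: mirror the checks in Crawler#valid_uri? and
# Crawler#relative_uri? for a crawl rooted at http://example.com.
require 'uri'

root_hostname = URI.parse('http://example.com').hostname
valid_schemes = %w(http https)

[
  '/about',                      # relative href: kept, later joined with the parent URL
  'http://example.com/contact',  # same host, http scheme: kept
  'https://example.com/login',   # same host, https scheme: kept
  'http://other.example.org/',   # different host: skipped
  'mailto:mail@example.com'      # non-http(s) scheme: skipped
].each do |href|
  uri = URI.parse(href)
  kept = (uri.scheme.nil? || valid_schemes.include?(uri.scheme)) &&
         (uri.hostname.nil? || uri.hostname == root_hostname)
  puts format('%-30s %s', href, kept ? 'kept' : 'skipped')
end
```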
data/test/crawler_test.rb ADDED
@@ -0,0 +1,248 @@
+ require 'crawler'
+ require 'stringio'
+ require 'test/unit'
+ require 'webmock/test_unit'
+
+ class CrawlerTest < Test::Unit::TestCase
+   # Called before every test method runs. Can be used
+   # to set up fixture information.
+   def setup
+     # Do nothing
+   end
+
+   # Called after every test method runs. Can be used to tear
+   # down fixture information.
+
+   def teardown
+     # Do nothing
+   end
+
+   def test_with_nil_url
+     assert_raise URI::InvalidURIError do
+       crawler = Crawler.new(nil)
+       crawler.crawl
+     end
+   end
+
+   def test_with_invalid_url
+     crawler = Crawler.new('https:/xyz.invalidurl.com/')
+     crawler.crawl
+     map = crawler.map
+     assert_equal({}, map)
+   end
+
+   def test_with_valid_url
+     url1 = 'http://www.example.com'
+     url2 = 'http://www.example.com/test.html'
+
+     stub_response_link(url1, url2)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+   end
+
+   def test_with_relative_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = 'http://www.example.com/test3.html'
+
+     stub_response_link(url1, 'test2.html')
+     stub_response_link(url2, '/test3.html')
+     stub_response_empty(url3)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [url3])
+     assert(map.key?(url3))
+     assert_equal(map[url3].neighbors, [])
+   end
+
+   def test_with_unavailable_links
+     url1 = 'http://www.example.com'
+     url2 = 'http://www.example.com/test.html'
+     url3 = 'http://www.example.com/unavailable'
+
+     stub_response_link(url1, url2)
+     stub_response_link(url2, url3)
+     stub_response_not_found(url3)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(!map.key?(url3))
+   end
+
+   def test_with_external_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.google.com/test/test2.html'
+
+     stub_response_link(url1, url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [])
+   end
+
+   def test_with_invalid_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = ':// test/test2.html'
+
+     stub_response_link(url1, url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [])
+   end
+
+   def test_with_multiple_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = 'http://www.example.com/test3.html'
+
+     stub_response_two_links(url1, url2, url3)
+     stub_response_empty(url2)
+     stub_response_empty(url3)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2, url3])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [])
+     assert(map.key?(url3))
+     assert_equal(map[url3].neighbors, [])
+   end
+
+   def test_with_same_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = url1
+
+     stub_response_two_links(url1, url2, url3)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [])
+   end
+
+   def test_with_same_links_with_different_fragments
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = url1 + '#fragment'
+
+     stub_response_two_links(url1, url2, url3)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [])
+   end
+
+   def test_with_non_http_link
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = 'mailto:mail@example.com'
+
+     stub_response_two_links(url1, url2, url3)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [])
+   end
+
+   def test_with_statics
+     url1 = 'http://www.example.com'
+
+     stub_request(:get, url1).to_return(
+       body: '<link rel="stylesheet" href="test.css" />'\
+             '<script src="test.js"></script>'\
+             '<img src="test.png" />'
+     )
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].statics, %w(test.js test.css test.png))
+   end
+
+   def test_print
+     url1 = 'http://www.example.com'
+     url2 = 'http://www.example.com/test.html'
+
+     stub_response_link(url1, url2)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+
+     printed = capture_stdout do
+       crawler.print
+     end
+
+     assert_match(/#{url1}/, printed)
+     assert_match(/#{url2}/, printed)
+   end
+
+   private
+
+   def stub_response_link(url, link)
+     stub_request(:get, url).to_return(
+       body: %(<body><a href="#{link}">test</a></body>")
+     )
+   end
+
+   def stub_response_two_links(url, link1, link2)
+     stub_request(:get, url).to_return(
+       body: %(<body><a href="#{link1}">t1</a><a href="#{link2}">t2</a></body>)
+     )
+   end
+
+   def stub_response_empty(url)
+     stub_request(:get, url).to_return(
+       body: '<body>stub body</body>'
+     )
+   end
+
+   def stub_response_not_found(url)
+     stub_request(:get, url).to_return(
+       status: 404
+     )
+   end
+
+   def capture_stdout(&blk)
+     old = $stdout
+     $stdout = fake = StringIO.new
+     blk.call
+     fake.string
+   ensure
+     $stdout = old
+   end
+ end
metadata ADDED
@@ -0,0 +1,111 @@
+ --- !ruby/object:Gem::Specification
+ name: simple_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - anupom
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-02-18 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.5'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.5'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.1'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.1'
+ - !ruby/object:Gem::Dependency
+   name: simplecov
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+ - !ruby/object:Gem::Dependency
+   name: webmock
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.17'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.17'
+ description: Simple web crawler to crawl a domain and generate sitemap
+ email:
+ - anupom.syam@gmail.com
+ executables:
+ - crawler
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - bin/crawler
+ - crawler.gemspec
+ - lib/crawler.rb
+ - test/crawler_test.rb
+ homepage: https://github.com/anupom/crawler
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Create sitemap from a given url
+ test_files:
+ - test/crawler_test.rb
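
As a closing editorial note, the metadata reproduced above can be read back from the published artifact with RubyGems itself. A minimal sketch, assuming the gem has been downloaded locally as `simple_crawler-0.0.1.gem` (hypothetical path):

```ruby
# Editorial sketch: inspect the gemspec metadata shown above directly
# from a locally downloaded .gem file.
require 'rubygems/package'

spec = Gem::Package.new('simple_crawler-0.0.1.gem').spec
puts spec.name     # => "simple_crawler"
puts spec.version  # => 0.0.1
puts spec.summary  # => "Create sitemap from a given url"
puts spec.files    # mirrors the `files:` list in the metadata above
```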