simple_crawler 0.0.1

checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: ebddf965a3f0021ce3187684afa08db022665878
+   data.tar.gz: afc8d73392ac3918cd11b1a7fd3e1b61340299e9
+ SHA512:
+   metadata.gz: 76da6a650493ea030c63371c529759d8e01a87c7f72131250b5a99dff418320dac908eb5e70a36d50629315f232a0b357a4a48f98d5453ad333a15454028e343
+   data.tar.gz: 060c78ef1d9452efaa042daf3b90208f5d92980de68b136667c468edd493ed022ac063334b87f25c3bb7c437ca95cdaf92bca3dbfde139ae2aae6e152d74ae23
data/.gitignore ADDED
@@ -0,0 +1,20 @@
+ *.gem
+ *.rbc
+ .bundle
+ .config
+ .DS_Store
+ .idea/
+ .yardoc
+ Gemfile.lock
+ InstalledFiles
+ _yardoc
+ coverage
+ doc/
+ lib/bundler/man
+ pkg
+ rdoc
+ spec/reports
+ test/tmp
+ test/version_tmp
+ tmp
+
data/Gemfile ADDED
@@ -0,0 +1,8 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in crawler.gemspec
+ gemspec
+ gem 'awesome_print', '~>1.2.0'
+ gem 'em-synchrony', '~>1.0.3'
+ gem 'em-http-request', '~>1.1.2'
+ gem 'nokogiri', '~>1.6.1'
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2014 Anupom Syam
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,49 @@
+ # Crawler
+
+ Crawler is a simple web crawler written in Ruby. Given a URL, it crawls the domain and recursively finds all links
+ associated with it. It also keeps track of all static content related to each of these links.
+
+ It uses EventMachine and Fibers (through em-synchrony) to issue concurrent, non-blocking requests.
+ Crawler stores the site map using a variation of the adjacency list data structure. It can also
+ pretty-print the map once a URL is crawled.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+     gem 'simple_crawler'
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install simple_crawler
+
+ ## Usage
+
+ ### Using Crawler as a library
+ ```ruby
+ crawler = Crawler.new('http://google.com')
+ # Start crawling the URL
+ crawler.crawl
+ # Generated site map object
+ map = crawler.map
+ # Pretty-print the site map
+ crawler.print
+ ```
+
+ ### Using Crawler as a binary
+ ```sh
+ # Crawl the domain and print the sitemap
+ crawler http://google.com
+ ```
+
+ ## Contributing
+
+ 1. Fork it ( http://github.com/anupom/crawler/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
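
For readers skimming the diff: below is a minimal sketch of the sitemap structure the README describes, based on the `Node = Struct.new(:neighbors, :statics)` definition in `lib/crawler.rb` later in this diff. The root URL and the printing loop are illustrative assumptions, not part of the gem:

```ruby
require 'crawler'

# Crawler#map is a Hash keyed by crawled URL; each value is a Node struct
# holding the page's outgoing links (neighbors) and its static assets
# (scripts, stylesheets, images) in statics. The URL is only an example.
crawler = Crawler.new('http://example.com')
crawler.crawl

crawler.map.each do |url, node|
  puts url
  node.neighbors.each { |link|  puts "  -> #{link}" }
  node.statics.each   { |asset| puts "  [static] #{asset}" }
end
```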
data/Rakefile ADDED
@@ -0,0 +1 @@
+ require 'bundler/gem_tasks'
data/bin/crawler ADDED
@@ -0,0 +1,12 @@
+ #!/usr/bin/env ruby
+
+ require 'crawler'
+
+ if ARGV.empty?
+   puts 'Please provide a url to crawl'
+   exit 1
+ end
+
+ crawler = Crawler.new(ARGV[0])
+ crawler.crawl
+ crawler.print
data/crawler.gemspec ADDED
@@ -0,0 +1,24 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+
+ Gem::Specification.new do |spec|
+   spec.name = 'simple_crawler'
+   spec.version = '0.0.1'
+   spec.authors = ['anupom']
+   spec.email = ['anupom.syam@gmail.com']
+   spec.summary = %q{Create sitemap from a given url}
+   spec.description = %q{Simple web crawler to crawl a domain and generate sitemap}
+   spec.homepage = 'https://github.com/anupom/crawler'
+   spec.license = 'MIT'
+
+   spec.files = `git ls-files`.split($/)
+   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ['lib']
+
+   spec.add_development_dependency 'bundler', '~> 1.5'
+   spec.add_development_dependency 'rake', '~> 10.1'
+   spec.add_development_dependency 'simplecov', '~> 0.8'
+   spec.add_development_dependency 'webmock', '~> 1.17'
+ end
data/lib/crawler.rb ADDED
@@ -0,0 +1,148 @@
+ require 'awesome_print'
+ require 'em-synchrony'
+ require 'em-synchrony/em-http'
+ require 'em-synchrony/fiber_iterator'
+ require 'nokogiri'
+ require 'set'
+
+ class Crawler
+   CONCURRENCY = 5
+   HTTP_OK = 200
+   MAX_REDIRECTS = 3
+   MAX_RETRIES = 3
+   VALID_SCHEMES = %w(http https)
+
+   Node = Struct.new(:neighbors, :statics)
+
+   attr_reader :map
+
+   def initialize(root_url)
+     @map = {}
+     @urls_to_crawl = [root_url]
+     @root_hostname = URI.parse(root_url).hostname
+     @retries = Hash.new { |h, k| h[k] = 0 }
+   end
+
+   def crawl
+     if @urls_to_crawl.empty?
+       EventMachine.stop
+       return
+     end
+
+     EM.synchrony do
+       # Iterate over a copy while we change the main array
+       urls = @urls_to_crawl.dup
+       @urls_to_crawl = crawl_urls(urls)
+       crawl
+     end
+   end
+
+   def print
+     ap @map, { index: false }
+   end
+
+   protected
+
+   def crawl_urls(urls)
+     next_urls = []
+
+     EM::Synchrony::FiberIterator.new(urls, CONCURRENCY).each do |url|
+       next if @map.key?(url)
+
+       http = http_request(url)
+
+       next if http.nil?
+
+       page = Nokogiri::HTML(http.response)
+       neighbors = get_neighbors(page, url)
+       next_urls += neighbors
+
+       statics = get_statics(page)
+
+       @map[url] = Node.new(neighbors, statics)
+     end
+
+     next_urls
+   end
+
+   def http_request(url)
+     http = EventMachine::HttpRequest.new(url)
+            .get redirects: MAX_REDIRECTS
+
+     if http.response_header.status != HTTP_OK
+       queue_for_retry(url)
+       return nil
+     end
+     http
+   rescue Addressable::URI::InvalidURIError
+     nil
+   end
+
+   def queue_for_retry(url)
+     return if @retries[url] == MAX_RETRIES
+     @retries[url] += 1
+     @urls_to_crawl.push(url)
+   end
+
+   def get_neighbors(page, parent_url)
+     neighbors = Set.new
+     links = page.css('a')
+
+     links.each do |link|
+       href = link['href']
+
+       uri = uri_from_href(href)
+
+       next unless valid_uri?(uri)
+
+       uri = URI.join(parent_url, uri) if relative_uri?(uri)
+
+       # Page fragments are ignored for site map
+       uri.fragment = nil
+
+       next if uri.to_s == parent_url
+
+       neighbors.add(uri.to_s)
+     end
+
+     neighbors.to_a
+   end
+
+   def get_statics(page)
+     statics = Set.new
+
+     scripts = page.css('script')
+     scripts.each do |script|
+       statics.add(script['src']) unless script['src'].nil?
+     end
+
+     stylesheets = page.css('link[rel="stylesheet"]')
+     stylesheets.each do |stylesheet|
+       statics.add(stylesheet['href']) unless stylesheet['href'].nil?
+     end
+
+     images = page.css('img')
+     images.each do |image|
+       statics.add(image['src']) unless image['src'].nil?
+     end
+
+     statics.to_a
+   end
+
+   def uri_from_href(href)
+     URI.parse(href)
+   rescue URI::InvalidURIError
+     nil
+   end
+
+   def valid_uri?(uri)
+     return false if uri.nil?
+     return false unless uri.scheme.nil? || VALID_SCHEMES.include?(uri.scheme)
+     return false unless uri.hostname.nil? || uri.hostname == @root_hostname
+     true
+   end
+
+   def relative_uri?(uri)
+     uri.scheme.nil?
+   end
+ end
data/test/crawler_test.rb ADDED
@@ -0,0 +1,248 @@
+ require 'crawler'
+ require 'stringio'
+ require 'test/unit'
+ require 'webmock/test_unit'
+
+ class CrawlerTest < Test::Unit::TestCase
+   # Called before every test method runs. Can be used
+   # to set up fixture information.
+   def setup
+     # Do nothing
+   end
+
+   # Called after every test method runs. Can be used to tear
+   # down fixture information.
+
+   def teardown
+     # Do nothing
+   end
+
+   def test_with_nil_url
+     assert_raise URI::InvalidURIError do
+       crawler = Crawler.new(nil)
+       crawler.crawl
+     end
+   end
+
+   def test_with_invalid_url
+     crawler = Crawler.new('https:/xyz.invalidurl.com/')
+     crawler.crawl
+     map = crawler.map
+     assert_equal({}, map)
+   end
+
+   def test_with_valid_url
+     url1 = 'http://www.example.com'
+     url2 = 'http://www.example.com/test.html'
+
+     stub_response_link(url1, url2)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+   end
+
+   def test_with_relative_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = 'http://www.example.com/test3.html'
+
+     stub_response_link(url1, 'test2.html')
+     stub_response_link(url2, '/test3.html')
+     stub_response_empty(url3)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [url3])
+     assert(map.key?(url3))
+     assert_equal(map[url3].neighbors, [])
+   end
+
+   def test_with_unavailable_links
+     url1 = 'http://www.example.com'
+     url2 = 'http://www.example.com/test.html'
+     url3 = 'http://www.example.com/unavailable'
+
+     stub_response_link(url1, url2)
+     stub_response_link(url2, url3)
+     stub_response_not_found(url3)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(!map.key?(url3))
+   end
+
+   def test_with_external_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.google.com/test/test2.html'
+
+     stub_response_link(url1, url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [])
+   end
+
+   def test_with_invalid_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = ':// test/test2.html'
+
+     stub_response_link(url1, url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [])
+   end
+
+   def test_with_multiple_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = 'http://www.example.com/test3.html'
+
+     stub_response_two_links(url1, url2, url3)
+     stub_response_empty(url2)
+     stub_response_empty(url3)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2, url3])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [])
+     assert(map.key?(url3))
+     assert_equal(map[url3].neighbors, [])
+   end
+
+   def test_with_same_links
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = url1
+
+     stub_response_two_links(url1, url2, url3)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [])
+   end
+
+   def test_with_same_links_with_different_fragments
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = url1 + '#fragment'
+
+     stub_response_two_links(url1, url2, url3)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [])
+   end
+
+   def test_with_non_http_link
+     url1 = 'http://www.example.com/test/test.html'
+     url2 = 'http://www.example.com/test/test2.html'
+     url3 = 'mailto:mail@example.com'
+
+     stub_response_two_links(url1, url2, url3)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].neighbors, [url2])
+     assert(map.key?(url2))
+     assert_equal(map[url2].neighbors, [])
+   end
+
+   def test_with_statics
+     url1 = 'http://www.example.com'
+
+     stub_request(:get, url1).to_return(
+       body: '<link rel="stylesheet" href="test.css" />'\
+             '<script src="test.js"></script>'\
+             '<img src="test.png" />'
+     )
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+     map = crawler.map
+     assert(map.key?(url1))
+     assert_equal(map[url1].statics, %w(test.js test.css test.png))
+   end
+
+   def test_print
+     url1 = 'http://www.example.com'
+     url2 = 'http://www.example.com/test.html'
+
+     stub_response_link(url1, url2)
+     stub_response_empty(url2)
+
+     crawler = Crawler.new(url1)
+     crawler.crawl
+
+     printed = capture_stdout do
+       crawler.print
+     end
+
+     assert_match(/#{url1}/, printed)
+     assert_match(/#{url2}/, printed)
+   end
+
+   private
+
+   def stub_response_link(url, link)
+     stub_request(:get, url).to_return(
+       body: %(<body><a href="#{link}">test</a></body>")
+     )
+   end
+
+   def stub_response_two_links(url, link1, link2)
+     stub_request(:get, url).to_return(
+       body: %(<body><a href="#{link1}">t1</a><a href="#{link2}">t2</a></body>)
+     )
+   end
+
+   def stub_response_empty(url)
+     stub_request(:get, url).to_return(
+       body: '<body>stub body</body>'
+     )
+   end
+
+   def stub_response_not_found(url)
+     stub_request(:get, url).to_return(
+       status: 404
+     )
+   end
+
+   def capture_stdout(&blk)
+     old = $stdout
+     $stdout = fake = StringIO.new
+     blk.call
+     fake.string
+   ensure
+     $stdout = old
+   end
+ end
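
The Rakefile above only loads `bundler/gem_tasks`, so no `rake test` task is defined. A plausible way to run this suite locally, assuming a standard checkout of the repository (this invocation is an assumption, not documented in the gem):

```sh
# Install the development dependencies declared in the Gemfile/gemspec
bundle install

# Invoke the Test::Unit suite directly against lib/ and test/
bundle exec ruby -Ilib -Itest test/crawler_test.rb
```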
metadata ADDED
@@ -0,0 +1,111 @@
+ --- !ruby/object:Gem::Specification
+ name: simple_crawler
+ version: !ruby/object:Gem::Version
+   version: 0.0.1
+ platform: ruby
+ authors:
+ - anupom
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-02-18 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.5'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.5'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.1'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.1'
+ - !ruby/object:Gem::Dependency
+   name: simplecov
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '0.8'
+ - !ruby/object:Gem::Dependency
+   name: webmock
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.17'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.17'
+ description: Simple web crawler to crawl a domain and generate sitemap
+ email:
+ - anupom.syam@gmail.com
+ executables:
+ - crawler
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - bin/crawler
+ - crawler.gemspec
+ - lib/crawler.rb
+ - test/crawler_test.rb
+ homepage: https://github.com/anupom/crawler
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Create sitemap from a given url
+ test_files:
+ - test/crawler_test.rb