spidercrawl 0.3.9

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 1cbd732cf3ba32a6a670cf77b39350eb36577d82
+   data.tar.gz: bb74e55ea3561de5b270978705d5852c3051fbaf
+ SHA512:
+   metadata.gz: 432c2150504fc990558fe00091a6965dda060212ae8427603fc19002ee8a7c5f6c3f65e2019700d3b58fdc282801f45df82f37e329db43118fc98edc6dc1afff
+   data.tar.gz: 18b71483b43c9abbcf012850865b26e99e0cf853b15954f18d0f92cf35d62fc01e31e7deb1b78161df34ee7029875031884d27a8bc4597ecc7881c426fc64bdb
data/.gitignore ADDED
@@ -0,0 +1,15 @@
+ /.bundle/
+ /.yardoc
+ /Gemfile.lock
+ /_yardoc/
+ /coverage/
+ /doc/
+ /pkg/
+ /spec/reports/
+ /tmp/
+ *.bundle
+ *.so
+ *.o
+ *.a
+ mkmf.log
+ .DS_Store
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in spidercrawl.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2014 Belson Heng
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,78 @@
+ # SpiderCrawl
+
+ A Ruby gem that crawls a domain and gives you information about the pages it visits.
+
+ With the help of Nokogiri, SpiderCrawl parses each page and returns its title, links, CSS, words, and much more. You can also customize what happens before and after each fetch request (a short sketch of these hooks follows this README).
+
+ Long story short: feed a URL to SpiderCrawl and it will crawl and scrape the content for you.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ ```ruby
+ gem 'spidercrawl'
+ ```
+
+ And then execute:
+
+     $ bundle
+
+ Or install it yourself as:
+
+     $ gem install spidercrawl
+
+ ## Usage
+
+ Start crawling a domain by calling __Spiderman.shoot__(*url*); it returns a list of the pages it has crawled and scraped:
+
+     pages = Spiderman.shoot('http://forums.hardwarezone.com.sg/hwm-magazine-publication-38/')
+
+ To restrict the crawl to URLs matching a pattern:
+
+     pages = Spiderman.shoot('http://forums.hardwarezone.com.sg/hwm-magazine-publication-38/',
+                             :pattern => Regexp.new('^http:\/\/forums\.hardwarezone\.com\.sg\/hwm-magazine-publication-38\/?(.*\.html)?$'))
+
+ Access the following scraped data:
+
+     pages.each do |page|
+       page.url              # URL of the page
+       page.scheme           # Scheme of the page (http, https, etc.)
+       page.host             # Hostname of the page
+       page.base_url         # Root URL of the page
+       page.doc              # Nokogiri document
+       page.headers          # Response headers of the page
+       page.title            # Title of the page
+       page.links            # Every link found in the page, returned as an array
+       page.internal_links   # Only internal links, returned as an array
+       page.external_links   # Only external links, returned as an array
+       page.emails           # Every email found in the page, returned as an array
+       page.images           # Every image found in the page, returned as an array
+       page.words            # Every word that appears in the page, returned as an array
+       page.css              # Stylesheet nodes (type='text/css') found in the page
+       page.content          # Contents of the HTML document as a string
+       page.content_type     # Content type of the page
+       page.text             # Text of the page without HTML tags
+       page.response_code    # HTTP response code of the page
+       page.response_time    # HTTP response time of the page
+       page.crawled_time     # Time the page was crawled/fetched, in milliseconds since the epoch
+     end
+
+ ## Dependencies
+
+ + Colorize
+ + Curb
+ + Nokogiri
+ + Typhoeus
+
+ ## Contributing
+
+ 1. Fork it ( https://github.com/belsonheng/spidercrawl/fork )
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create a new Pull Request
+
+ ## License
+
+ SpiderCrawl is released under the [MIT license](https://github.com/belsonheng/spidercrawl/blob/master/LICENSE.txt).
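
The README mentions customizing what happens before and after each fetch but never shows it. Below is a minimal, hedged sketch of the block form of `Spiderman.shoot`, using the `before_fetch`, `after_fetch`, and `on_redirect` hooks and the `:delay`/`:timeout` options defined in `lib/spidercrawl/spider_worker.rb` further down; the forum URL is the README's own example, and the printed messages are illustrative, not part of the gem.

```ruby
require 'spidercrawl'

url = 'http://forums.hardwarezone.com.sg/hwm-magazine-publication-38/'

pages = Spiderman.shoot(url, :delay => 1, :timeout => 10) do |spider|
  # Runs before each URL is fetched; returning nil lets SpiderCrawl fetch the page itself.
  spider.before_fetch { |u| puts "about to fetch #{u}"; nil }

  # Runs after each page has been fetched and parsed.
  spider.after_fetch { |page| puts "#{page.response_code} #{page.url} (#{page.links.size} links)" }

  # Runs when a fetch is redirected; nil again means "let SpiderCrawl follow it".
  spider.on_redirect { |location| puts "redirected to #{location}"; nil }
end

pages.each { |page| puts page.title }
```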
data/Rakefile ADDED
@@ -0,0 +1,2 @@
+ require "bundler/gem_tasks"
+
data/lib/spidercrawl.rb ADDED
@@ -0,0 +1,10 @@
+ require "spidercrawl/spider_worker"
+
+ class Spiderman
+   def self.shoot(urls, options = {}, &block)
+     spiderman = Spidercrawl::SpiderWorker.new(urls, options)
+     yield spiderman if block_given?
+     return spiderman.parallel_crawl if options[:parallel]
+     spiderman.crawl
+   end
+ end
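
For orientation, a small sketch of how this entry point is exercised in both modes; the options are the ones read by `SpiderWorker` and `ParallelRequest` below, and the URL is only a placeholder.

```ruby
require 'spidercrawl'

url = 'http://example.com/'  # placeholder

# Serial crawl: one libcurl-backed Request, with an optional politeness delay.
pages = Spiderman.shoot(url, :delay => 2)

# Parallel crawl: Typhoeus/Hydra fan-out, bounded by :threads.
pages = Spiderman.shoot(url, :parallel => true, :threads => 10, :timeout => 20)
```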
data/lib/spidercrawl/page.rb ADDED
@@ -0,0 +1,187 @@
+ require 'nokogiri'
+ require 'uri'
+
+ module Spidercrawl
+   # Parses the content with Nokogiri
+   class Page
+
+     attr_reader :location, :response_time
+     attr_accessor :crawled_time
+
+     def initialize(url, options = {})
+       @url = url
+       @code = options[:response_code]
+       @headers = options[:response_head]
+       @location = options[:redirect_url]
+       @body = options[:response_body]
+       @response_time = options[:response_time]
+       @crawled_time = options[:crawled_time]
+     end
+
+     #
+     # Return the url of the page
+     #
+     def url
+       @url.to_s
+     end
+
+     #
+     # Return the url scheme of the page (e.g. http, https, etc.)
+     #
+     def scheme
+       @url.scheme
+     end
+
+     #
+     # Return the url host of the page
+     #
+     def host
+       @url.host
+     end
+
+     #
+     # Return the base url of the page
+     #
+     def base_url
+       @base_url = "#{scheme}://#{host}"
+     end
+
+     #
+     # Return the Nokogiri html document
+     #
+     def doc
+       @document = Nokogiri::HTML(@body)
+     rescue Exception => e
+       puts e.inspect
+       puts e.backtrace
+     end
+
+     #
+     # Return the response headers of the page
+     #
+     def headers
+       @headers
+     end
+
+     #
+     # Return the title of the page
+     #
+     def title
+       @title = doc.css('head title').inner_text
+     end
+
+     #
+     # Return all links found in the page; exclude empty links
+     #
+     def links
+       @links = doc.css('a').map { |link| link['href'].to_s }.uniq.delete_if { |href| href.empty? }.map { |url| absolutify(url.strip) }
+     end
+
+     #
+     # Return the internal links found in the page
+     #
+     def internal_links
+       @internal_links = links.select { |link| URI.parse(link).host == host } rescue nil
+     end
+
+     #
+     # Return the external links found in the page
+     #
+     def external_links
+       @external_links = links.select { |link| URI.parse(link).host != host } rescue nil
+     end
+
+     #
+     # Return any emails found in the page, as an array
+     #
+     def emails
+       @emails = @body.scan(/[\w.!#\$%+-]+@[\w-]+(?:\.[\w-]+)+/)
+     end
+
+     #
+     # Return all images found in the page
+     #
+     def images
+       @images = doc.css('img').map { |img| img['src'].to_s }.uniq.delete_if { |src| src.empty? }.map { |url| absolutify(url.strip) }
+     end
+
+     #
+     # Return all words found in the page
+     #
+     def words
+       @words = text.split(/[^a-zA-Z]/).delete_if { |word| word.empty? }
+     end
+
+     #
+     # Return stylesheet nodes of the page
+     #
+     def css
+       @css = doc.search("[@type='text/css']")
+     end
+
+     def meta_keywords
+     end
+
+     def meta_descriptions
+     end
+
+     #
+     # Return html content as a string
+     #
+     def content
+       @body.to_s
+     end
+
+     #
+     # Return the content type of the page
+     #
+     def content_type
+       doc.at("meta[@http-equiv='Content-Type']")['content']
+     end
+
+     #
+     # Return plain text of the page without html tags
+     #
+     def text
+       temp_doc = doc
+       temp_doc.css('script, noscript, style, link').each { |node| node.remove }
+       @text = temp_doc.css('body').text.split("\n").collect { |line| line.strip }.join("\n")
+     end
+
+     #
+     # Return the response code
+     #
+     def response_code
+       @code
+     end
+
+     #
+     # Return true if page not found
+     #
+     def not_found?
+       @code == 404
+     end
+
+     #
+     # Return true if page is fetched successfully
+     #
+     def success?
+       @code == 200
+     end
+
+     #
+     # Return true if page is redirected
+     #
+     def redirect?
+       (300..307).include?(@code)
+     end
+
+     #
+     # Return the absolute url
+     #
+     private
+     def absolutify(page_url)
+       return URI.escape(page_url) if page_url =~ /^\w*\:/i
+       return base_url + URI.escape(page_url)
+     end
+   end
+ end
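
`Page` can also be used on its own, outside the crawler. A sketch under the assumption that the gem's `lib` directory is on the load path; the keyword names mirror `Page#initialize` above, and the URL is a placeholder.

```ruby
require 'net/http'
require 'spidercrawl/page'

uri  = URI.parse('http://example.com/')  # placeholder URL
resp = Net::HTTP.get_response(uri)

page = Spidercrawl::Page.new(uri,
                             response_code: resp.code.to_i,
                             response_head: resp.to_hash,
                             response_body: resp.body,
                             crawled_time:  (Time.now.to_f * 1000).to_i)

puts page.title            # contents of <head><title>
puts page.internal_links   # absolute links on the same host
puts page.words.size       # rough word count of the visible text
```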
data/lib/spidercrawl/request.rb ADDED
@@ -0,0 +1,155 @@
+ require 'spidercrawl/page'
+ require 'spidercrawl/user_agents'
+ require 'net/http'
+ require 'curb'
+ require 'colorize'
+ require 'typhoeus'
+
+ module Spidercrawl
+   # Makes the request to the targeted website
+   class Request
+
+     attr_accessor :uri
+
+     def initialize(url, options = {})
+       @uri = URI.parse(url)
+       @threads = options[:threads]
+       @timeout = options[:timeout]
+
+       @http = Net::HTTP.new(@uri.host, @uri.port)
+       @http.open_timeout = @timeout # in seconds
+       @http.read_timeout = @timeout # in seconds
+
+       @c = Curl::Easy.new(@uri.to_s) do |curl|
+         curl.headers['User-Agent'] = UserAgents.random
+       end
+     end
+
+     #
+     # Fetch a page from the given url using libcurl
+     #
+     def curl
+       puts "fetching #{@uri.to_s}".green.on_black
+       start_time = Time.now
+       begin
+         c = @c
+         c.url = @uri.to_s
+         c.perform
+         end_time = Time.now
+         case c.response_code
+         when 200 then
+           page = Page.new(@uri, response_code: c.response_code,
+                                 response_head: c.header_str,
+                                 response_body: c.body_str,
+                                 response_time: ((end_time - start_time) * 1000).round,
+                                 crawled_time: (Time.now.to_f * 1000).to_i)
+         when 300..307 then
+           page = Page.new(@uri, response_code: c.response_code,
+                                 response_head: c.header_str,
+                                 response_body: c.body_str,
+                                 response_time: ((end_time - start_time) * 1000).round,
+                                 redirect_url: c.redirect_url)
+         when 404 then
+           page = Page.new(@uri, response_code: c.response_code,
+                                 response_time: ((end_time - start_time) * 1000).round)
+         end
+       rescue Exception => e
+         puts e.inspect
+         puts e.backtrace
+       end
+     end
+
+     #
+     # Fetch a page from the given url using net/http
+     #
+     def fetch
+       puts "fetching #{@uri.to_s}".green.on_black
+       start_time = Time.now
+       begin
+         request = Net::HTTP::Get.new(@uri.request_uri)
+         request["User-Agent"] = UserAgents.random
+         response = @http.request(request)
+         end_time = Time.now
+         case response
+         when Net::HTTPSuccess then
+           page = Page.new(@uri, response_code: response.code.to_i,
+                                 response_head: response.instance_variable_get("@header"),
+                                 response_body: response.body,
+                                 response_time: (end_time - start_time).to_f,
+                                 crawled_time: (Time.now.to_f * 1000).to_i)
+         when Net::HTTPRedirection then
+           page = Page.new(@uri, response_code: response.code.to_i,
+                                 response_head: response.instance_variable_get("@header"),
+                                 response_body: response.body,
+                                 response_time: (end_time - start_time).to_f,
+                                 redirect_url: response['location'])
+         when Net::HTTPNotFound then
+           page = Page.new(@uri, response_code: response.code.to_i,
+                                 response_time: (end_time - start_time).to_f)
+         end
+       rescue Exception => e
+         puts e.inspect
+         puts e.backtrace
+       end
+     end
+   end
+
+   # Makes parallel requests to the targeted website using typhoeus and hydra
+   class ParallelRequest
+
+     attr_accessor :urls
+
+     def initialize(urls, options = {})
+       @urls = urls
+       @threads = options[:threads]
+       @timeout = options[:timeout]
+     end
+
+     #
+     # Fetch page(s) from the given url(s)
+     #
+     def fetch
+       hydra = Typhoeus::Hydra.new(:max_concurrency => @threads)
+       page, pages = nil, []
+
+       @urls.each do |url|
+         request = Typhoeus::Request.new(url, :timeout => @timeout, :followlocation => false,
+                                              :headers => { "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                                                            "Cache-Control" => "no-cache", "Pragma" => "no-cache",
+                                                            "User-Agent" => UserAgents.random })
+         request.on_complete do |response|
+           uri = URI(url)
+           if response.success?
+             puts "fetching #{url}".green.on_black
+             page = Page.new(uri, response_code: response.code,
+                                  response_head: response.headers,
+                                  response_body: response.body,
+                                  response_time: response.time * 1000,
+                                  crawled_time: (Time.now.to_f * 1000).to_i)
+           elsif (300..307).include?(response.code)
+             puts "fetching #{url}".green.on_black
+             puts "### #{response.code} ### redirect to #{response.headers['Location']}".white.on_black
+             page = Page.new(uri, response_code: response.code,
+                                  response_head: response.headers,
+                                  response_body: response.body,
+                                  response_time: response.time * 1000,
+                                  redirect_url: response.headers['Location'])
+           elsif 404 == response.code
+             puts "fetching #{url}".green.on_black
+             puts "### #{response.code} ### not found #{url}".magenta.on_black
+             page = Page.new(uri, response_code: response.code,
+                                  response_time: response.time * 1000)
+           else
+             puts "fetching #{url}".green.on_black
+             puts "### #{response.code} ### failed #{url}".magenta.on_black
+             puts "### Time: #{response.time} ### #{response.return_message}".magenta.on_black
+             page = Page.new(uri, response_code: response.code,
+                                  response_time: response.time * 1000)
+           end
+           pages << page
+         end
+         hydra.queue(request)
+       end
+       hydra.run
+       return pages
+     end
+   end
+ end
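
The request layer can likewise be driven directly. A sketch, assuming the gem is installed: `Request#fetch` goes through net/http, `Request#curl` through libcurl, and `ParallelRequest#fetch` fans the URLs out over a Typhoeus hydra; each returns `Page` objects (or nil for response codes the case statements above do not handle). The URLs are placeholders.

```ruby
require 'spidercrawl/request'

# One URL, two interchangeable transports.
single = Spidercrawl::Request.new('http://example.com/', :timeout => 10)
page   = single.fetch   # net/http
page   = single.curl    # libcurl via curb

# Several URLs at once, bounded by :threads.
batch = Spidercrawl::ParallelRequest.new(
  ['http://example.com/', 'http://example.org/'],
  :threads => 5, :timeout => 10
)
batch.fetch.each { |p| puts "#{p.response_code} #{p.url}" }
```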
data/lib/spidercrawl/spider_worker.rb ADDED
@@ -0,0 +1,203 @@
+ require 'spidercrawl/request'
+ require 'spidercrawl/page'
+ require 'colorize'
+
+ module Spidercrawl
+   # Start working hard
+   class SpiderWorker
+
+     attr_reader :page
+
+     def initialize(url, options = {})
+       @url = url
+       #@headers = options[:headers]
+       @delay = options[:delay] ? options[:delay] : 0        # default 0 seconds
+       @threads = options[:threads] ? options[:threads] : 10 # default 10 threads
+       @timeout = options[:timeout] ? options[:timeout] : 20 # default 20 seconds
+       @allow_redirections = options[:allow_redirections]
+       @max_pages = options[:max_pages]
+       @pattern = options[:pattern]
+       @setup = nil
+       @teardown = nil
+       @redirect = nil
+       @success = nil
+       @failure = nil
+     end
+
+     def crawl
+       link_queue = Queue.new
+       pages, visited_links = [], []
+       link_queue << @url
+
+       spider_worker = Request.new(@url, :threads => @threads, :timeout => @timeout)
+
+       begin
+         url = link_queue.pop
+         next if visited_links.include?(url) || (@pattern && url !~ @pattern)
+
+         start_time = Time.now
+         response = @setup.yield url unless @setup.nil?
+         end_time = Time.now
+
+         spider_worker.uri = URI.parse(url)
+         page = (response ? setup_page(URI.parse(url), response, ((end_time - start_time).to_f * 1000).to_i) : spider_worker.curl)
+         visited_links << url
+
+         if page.success? || page.redirect? then
+           while page.redirect?
+             puts ("### redirect to #{page.location}" + (visited_links.include?(page.location) ? " which we have already visited!" : "")).white.on_black
+             break if visited_links.include?(page.location)
+
+             start_time = Time.now
+             response = @redirect.yield page.location unless @redirect.nil?
+             end_time = Time.now
+
+             spider_worker.uri = URI.parse(page.location)
+             page = (response ? setup_page(URI.parse(page.location), response, ((end_time - start_time).to_f * 1000).to_i) : spider_worker.curl)
+             visited_links << page.url
+           end
+           if !visited_links.include?(page.location)
+             pages << page unless page.content == ""
+             page.internal_links.each do |link|
+               if !visited_links.include?(link)
+                 link_queue << link if @pattern && link =~ @pattern
+                 link_queue << link unless @pattern
+               end
+             end unless page.internal_links.nil?
+             @teardown.yield page unless @teardown.nil?
+             sleep @delay
+           end
+         elsif page.not_found? then
+           puts "page not found"
+         end
+       end until link_queue.empty?
+       pages
+     end
+
+     def parallel_crawl
+       link_queue = Queue.new
+       pages, visited_links = [], []
+       link_queue << @url
+
+       spider_workers = ParallelRequest.new([@url], :threads => @threads, :timeout => @timeout)
+
+       begin
+         urls = []
+         while !link_queue.empty?
+           url = link_queue.pop
+           next if visited_links.include?(url) || (@pattern && url !~ @pattern)
+           visited_links << url
+
+           start_time = Time.now
+           response = @setup.yield url unless @setup.nil?
+           end_time = Time.now
+
+           if response then
+             pages << (page = setup_page(URI.parse(url), response, ((end_time - start_time).to_f * 1000).to_i))
+             @teardown.yield page unless @teardown.nil?
+
+             page.internal_links.each do |link| # queue up internal links for crawling
+               if !visited_links.include?(link)
+                 link_queue << link if @pattern && link =~ @pattern
+                 link_queue << link unless @pattern
+               end
+             end unless page.internal_links.nil?
+           else # queue up url for crawling
+             urls << url
+             puts "queue: #{url}"
+           end
+         end
+
+         spider_workers.urls = urls
+         responses = spider_workers.fetch
+
+         responses.each do |page|
+           if (503..504).include?(page.response_code) then
+             link_queue << page.url
+           elsif page.success? || page.redirect? then
+             response = nil
+             if page.redirect? then
+               puts ("### redirect to #{page.location}" + (visited_links.include?(page.location) ? " which we have already visited!" : "")).white.on_black
+               unless visited_links.include?(page.location) || (@pattern && page.location !~ @pattern)
+                 start_time = Time.now
+                 response = @redirect.yield page.location unless @redirect.nil?
+                 end_time = Time.now
+
+                 if response then
+                   page = setup_page(URI.parse(page.location), response, ((end_time - start_time).to_f * 1000).to_i)
+                   visited_links << page.url
+                 else
+                   puts "queue: #{page.location}"
+                   link_queue << page.location
+                 end
+               else
+                 puts "discard: #{page.location}"
+               end
+             end
+             if page.success? || response then
+               pages << page unless page.content == ""
+               page.internal_links.each do |link| # queue up internal links for crawling
+                 if !visited_links.include?(link)
+                   link_queue << link if @pattern && link =~ @pattern
+                   link_queue << link unless @pattern
+                 end
+               end unless page.internal_links.nil?
+               page.crawled_time = (Time.now.to_f * 1000).to_i
+               @teardown.yield page unless @teardown.nil?
+             end
+           elsif page.not_found? then
+             puts "page not found"
+           end
+         end
+       end until link_queue.empty?
+       pages
+     end
+
+     #
+     # Code block for before fetch
+     #
+     def before_fetch(&block)
+       @setup = block if block
+     end
+
+     #
+     # Code block for after fetch
+     #
+     def after_fetch(&block)
+       @teardown = block if block
+     end
+
+     #
+     # Code block for on redirect
+     #
+     def on_redirect(&block)
+       @redirect = block if block
+     end
+
+     #
+     # Code block for on success
+     #
+     def on_success(&block)
+       @success = block if block
+     end
+
+     #
+     # Code block for on failure
+     #
+     def on_failure(&block)
+       @failure = block if block
+     end
+
+     #
+     # Setup page based on given response
+     #
+     private
+     def setup_page(uri, response, response_time)
+       page = Page.new(uri, response_code: response.code.to_i,
+                            response_head: response.instance_variable_get("@header"),
+                            response_body: response.body,
+                            response_time: response_time,
+                            crawled_time: (Time.now.to_f * 1000).to_i)
+     end
+   end
+ end
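
One non-obvious detail of the hooks above: whatever a `before_fetch` (or `on_redirect`) block returns is treated as the response for that URL. `setup_page` reads `#code`, `#body`, and the `@header` instance variable from it, which is the shape of a `Net::HTTPResponse`, so a block can fetch selected pages itself and hand them back; returning nil falls through to the built-in fetcher. A sketch under those assumptions, with a placeholder URL:

```ruby
require 'spidercrawl'
require 'net/http'

pages = Spiderman.shoot('http://example.com/') do |spider|  # placeholder URL
  spider.before_fetch do |url|
    uri = URI.parse(url)
    if uri.path.end_with?('.html')
      # Fetch this one ourselves; SpiderWorker#setup_page reads #code, #body
      # and @header from whatever non-nil object we return.
      Net::HTTP.get_response(uri)
    else
      nil  # let the built-in Request/ParallelRequest handle everything else
    end
  end

  # Redirects can be handled the same way; nil means "follow with the built-in fetcher".
  spider.on_redirect { |location| nil }
end
```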
data/lib/spidercrawl/user_agents.rb ADDED
@@ -0,0 +1,25 @@
+ #
+ # Copyright (c) 2013 Charles H Martin, PhD
+ #
+
+ class UserAgents
+   # Return a random user-agent string
+   def self.random
+     case rand(6)
+     when 0
+       "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:#{10+rand(10)}.#{rand(10)}) Gecko/20#{10+rand(3)}#{1000+rand(3)*100+rand(28)} Firefox/20.0"
+     when 1
+       "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.#{10+rand(10)}) Gecko/20#{10+rand(3)}#{1000+rand(3)*100+rand(28)} Ubuntu/10.10 (maverick) Firefox/3.6.#{14+rand(5)}"
+     when 2
+       ver = "#{400+rand(99)}.#{10+rand(75)}"
+       "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/#{ver} (KHTML, like Gecko) Chrome/12.0.#{700+rand(90)}.#{100+rand(200)} Safari/#{ver}"
+     when 3
+       ver = "#{400+rand(99)}.#{rand(9)}"
+       "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/#{ver} (KHTML, like Gecko) Chrome/13.0.#{700+rand(90)}.#{100+rand(200)} Safari/#{ver}"
+     when 4
+       "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20#{10+rand(3)}#{1000+rand(3)*100+rand(28)} Firefox/#{4+rand(1)}.0"
+     when 5
+       "Mozilla/4.0 (compatible; MSIE 8.#{rand(6)}; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.#{50000+rand(7000)}; .NET CLR 3.5.#{30000+rand(8000)}; .NET CLR 3.0.#{30000+rand(8000)}; Media Center PC 6.0; .NET4.0C; .NET4.0E; MS-RTC LM 8; Zune 4.#{6+rand(3)})"
+     end
+   end
+ end
data/lib/spidercrawl/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Spidercrawl
+   VERSION = "0.3.9"
+ end
data/spidercrawl.gemspec ADDED
@@ -0,0 +1,28 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'spidercrawl/version'
+
+ Gem::Specification.new do |spec|
+   spec.name          = "spidercrawl"
+   spec.version       = Spidercrawl::VERSION
+   spec.authors       = ["Belson Heng"]
+   spec.email         = ["belsonheng@gmail.com"]
+   spec.summary       = %q{A ruby gem that can crawl a domain and let you have information about the pages it visits.}
+   spec.description   = %q{With the help of Nokogiri, SpiderCrawl will parse each page and return you its title, links, css, words, and many many more! You can also customize what you want to do before & after each fetch request.}
+   spec.homepage      = "http://github.com/belsonheng/spidercrawl"
+   spec.license       = "MIT"
+
+   spec.files         = `git ls-files -z`.split("\x0")
+   spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency 'nokogiri', '~> 1.6'
+   spec.add_dependency "curb"
+   spec.add_dependency "colorize"
+   spec.add_dependency "typhoeus"
+
+   spec.add_development_dependency "bundler", "~> 1.7"
+   spec.add_development_dependency "rake", "~> 10.0"
+ end
metadata ADDED
@@ -0,0 +1,143 @@
+ --- !ruby/object:Gem::Specification
+ name: spidercrawl
+ version: !ruby/object:Gem::Version
+   version: 0.3.9
+ platform: ruby
+ authors:
+ - Belson Heng
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2015-07-30 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ - !ruby/object:Gem::Dependency
+   name: curb
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: colorize
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: typhoeus
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: bundler
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.7'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.7'
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+   type: :development
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '10.0'
+ description: With the help of Nokogiri, SpiderCrawl will parse each page and return
+   you its title, links, css, words, and many many more! You can also customize what
+   you want to do before & after each fetch request.
+ email:
+ - belsonheng@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - ".gitignore"
+ - Gemfile
+ - LICENSE.txt
+ - README.md
+ - Rakefile
+ - lib/spidercrawl.rb
+ - lib/spidercrawl/page.rb
+ - lib/spidercrawl/request.rb
+ - lib/spidercrawl/spider_worker.rb
+ - lib/spidercrawl/user_agents.rb
+ - lib/spidercrawl/version.rb
+ - spidercrawl.gemspec
+ homepage: http://github.com/belsonheng/spidercrawl
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.4.5
+ signing_key:
+ specification_version: 4
+ summary: A ruby gem that can crawl a domain and let you have information about the
+   pages it visits.
+ test_files: []