spidercrawl 0.3.9
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +78 -0
- data/Rakefile +2 -0
- data/lib/spidercrawl.rb +10 -0
- data/lib/spidercrawl/page.rb +187 -0
- data/lib/spidercrawl/request.rb +155 -0
- data/lib/spidercrawl/spider_worker.rb +203 -0
- data/lib/spidercrawl/user_agents.rb +25 -0
- data/lib/spidercrawl/version.rb +3 -0
- data/spidercrawl.gemspec +28 -0
- metadata +143 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 1cbd732cf3ba32a6a670cf77b39350eb36577d82
+  data.tar.gz: bb74e55ea3561de5b270978705d5852c3051fbaf
+SHA512:
+  metadata.gz: 432c2150504fc990558fe00091a6965dda060212ae8427603fc19002ee8a7c5f6c3f65e2019700d3b58fdc282801f45df82f37e329db43118fc98edc6dc1afff
+  data.tar.gz: 18b71483b43c9abbcf012850865b26e99e0cf853b15954f18d0f92cf35d62fc01e31e7deb1b78161df34ee7029875031884d27a8bc4597ecc7881c426fc64bdb
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
Copyright (c) 2014 Belson Heng

MIT License

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,78 @@
# SpiderCrawl

A Ruby gem that crawls a domain and gives you information about the pages it visits.

With the help of Nokogiri, SpiderCrawl parses each page and returns its title, links, CSS, words, and much more. You can also customize what you want to do before and after each fetch request.

Long story short: feed a URL to SpiderCrawl and it will crawl and scrape the content for you.

## Installation

Add this line to your application's Gemfile:

```ruby
gem 'spidercrawl'
```

And then execute:

    $ bundle

Or install it yourself as:

    $ gem install spidercrawl

## Usage

Start crawling a domain by calling __Spiderman.shoot__(*url*); it returns a list of the pages it has crawled and scraped:

    pages = Spiderman.shoot('http://forums.hardwarezone.com.sg/hwm-magazine-publication-38/')

To restrict crawling to URLs that match a pattern:

    pages = Spiderman.shoot('http://forums.hardwarezone.com.sg/hwm-magazine-publication-38/',
      :pattern => Regexp.new('^http:\/\/forums\.hardwarezone\.com\.sg\/hwm-magazine-publication-38\/?(.*\.html)?$'))

Access the following scraped data:

    pages.each do |page|
      page.url             # URL of the page
      page.scheme          # Scheme of the page (http, https, etc.)
      page.host            # Hostname of the page
      page.base_url        # Root URL of the page
      page.doc             # Nokogiri document
      page.headers         # Response headers of the page
      page.title           # Title of the page
      page.links           # Every link found in the page, returned as an array
      page.internal_links  # Only internal links, returned as an array
      page.external_links  # Only external links, returned as an array
      page.emails          # Emails found in the page
      page.images          # Every image found in the page, returned as an array
      page.words           # Every word that appears in the page, returned as an array
      page.css             # CSS stylesheets used in the page, returned as an array
      page.content         # Contents of the HTML document as a string
      page.content_type    # Content type of the page
      page.text            # Text of the page without HTML tags
      page.response_code   # HTTP response code of the page
      page.response_time   # HTTP response time of the page
      page.crawled_time    # Time the page was crawled/fetched, in milliseconds since the epoch
    end

## Dependencies

+ Colorize
+ Curb
+ Nokogiri
+ Typhoeus

## Contributing

1. Fork it ( https://github.com/belsonheng/spidercrawl/fork )
2. Create your feature branch (`git checkout -b my-new-feature`)
3. Commit your changes (`git commit -am 'Add some feature'`)
4. Push to the branch (`git push origin my-new-feature`)
5. Create a new Pull Request

## License

SpiderCrawl is released under the [MIT license](https://github.com/belsonheng/spidercrawl/blob/master/LICENSE.txt).
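The README mentions customizing what happens before and after each fetch but does not show the hooks. Going by `Spiderman.shoot` and the `SpiderWorker#before_fetch`/`#after_fetch` methods that appear later in this diff, a minimal sketch of that usage could look like the following (the log messages are illustrative, not part of the gem):

```ruby
require 'spidercrawl'

pages = Spiderman.shoot('http://forums.hardwarezone.com.sg/hwm-magazine-publication-38/', {}) do |worker|
  worker.before_fetch do |url|
    puts "about to fetch #{url}"
    nil  # returning nil lets the worker perform its own fetch
  end
  worker.after_fetch do |page|
    puts "#{page.response_code} #{page.url} - #{page.title}"
  end
end
```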
data/Rakefile
ADDED
data/lib/spidercrawl.rb
ADDED
@@ -0,0 +1,10 @@
require "spidercrawl/spider_worker"

class Spiderman
  def self.shoot(urls, options, &block)
    spiderman = Spidercrawl::SpiderWorker.new(urls, options)
    yield spiderman if block_given?
    return spiderman.parallel_crawl if options[:parallel] == true
    return spiderman.crawl unless options[:parallel]
  end
end
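A side note on the entry point above: `options` has no default value here, so a one-argument call like the `Spiderman.shoot(url)` shown in the README would raise an ArgumentError against this version; passing an options hash explicitly avoids that. A minimal sketch (the URL and option values are illustrative):

```ruby
# Sequential crawl (no :parallel option falls through to SpiderWorker#crawl)
pages = Spiderman.shoot('http://example.com/', :delay => 1, :pattern => /example\.com/)

# Parallel crawl via SpiderWorker#parallel_crawl
pages = Spiderman.shoot('http://example.com/', :parallel => true, :threads => 5)
```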
data/lib/spidercrawl/page.rb
ADDED
@@ -0,0 +1,187 @@
require 'nokogiri'

module Spidercrawl
  # Parses the content with Nokogiri
  class Page

    attr_reader :location, :response_time
    attr_accessor :crawled_time

    def initialize(url, options = {})
      @url = url
      @code = options[:response_code]
      @headers = options[:response_head]
      @location = options[:redirect_url]
      @body = options[:response_body]
      @response_time = options[:response_time]
      @crawled_time = options[:crawled_time]
    end

    #
    # Return the url of the page
    #
    def url
      @url.to_s
    end

    #
    # Return the url scheme of the page (e.g. http, https, etc.)
    #
    def scheme
      @url.scheme
    end

    #
    # Return the url host of the page
    #
    def host
      @url.host
    end

    #
    # Return the base url of the page
    #
    def base_url
      @base_url = "#{scheme}://#{host}"
    end

    #
    # Return the Nokogiri html document
    #
    def doc
      @document = Nokogiri::HTML(@body)
    rescue Exception => e
      puts e.inspect
      puts e.backtrace
    end

    #
    # Return the headers of the page
    #
    def headers
      puts @headers
    end

    #
    # Return the title of the page
    #
    def title
      @title = doc.css('head title').inner_text
    end

    #
    # Return all links found in the page; excludes empty links
    #
    def links
      @links = doc.css('a').map { |link| link['href'].to_s }.uniq.delete_if { |href| href.empty? }.map { |url| absolutify(url.strip) }
    end

    #
    # Return the internal links found in the page
    #
    def internal_links
      @internal_links = links.select { |link| URI.parse(link).host == host } rescue nil
    end

    #
    # Return the external links found in the page
    #
    def external_links
      @external_links = links.select { |link| URI.parse(link).host != host } rescue nil
    end

    #
    # Return any emails found in the page
    #
    def emails
      @body.match(/[\w.!#\$%+-]+@[\w-]+(?:\.[\w-]+)+/)
    end

    #
    # Return all images found in the page
    #
    def images
      @images = doc.css('img').map { |img| img['src'].to_s }.uniq.delete_if { |src| src.empty? }.map { |url| absolutify(url.strip) }
    end

    #
    # Return all words found in the page
    #
    def words
      @words = text.split(/[^a-zA-Z]/).delete_if { |word| word.empty? }
    end

    #
    # Return css scripts of the page
    #
    def css
      @css = doc.search("[@type='text/css']")
    end

    def meta_keywords
    end

    def meta_descriptions
    end

    #
    # Return html content as a string
    #
    def content
      @body.to_s
    end

    #
    # Return the content type of the page
    #
    def content_type
      doc.at("meta[@http-equiv='Content-Type']")['content']
    end

    #
    # Return plain text of the page without html tags
    #
    def text
      temp_doc = doc
      temp_doc.css('script, noscript, style, link').each { |node| node.remove }
      @text = temp_doc.css('body').text.split("\n").collect { |line| line.strip }.join("\n")
    end

    #
    # Return the response code
    #
    def response_code
      @code
    end

    #
    # Return true if page not found
    #
    def not_found?
      @code == 404
    end

    #
    # Return true if page is fetched successfully
    #
    def success?
      @code == 200
    end

    #
    # Return true if page is redirected
    #
    def redirect?
      (300..307).include?(@code)
    end

    #
    # Return the absolute url
    #
    private
    def absolutify(page_url)
      return URI.escape(page_url) if page_url =~ /^\w*\:/i
      return base_url + URI.escape(page_url)
    end
  end
end
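Page can be exercised on its own by handing it a response body, mirroring the options that Request passes in. A minimal sketch under that assumption (the hostnames and HTML are illustrative; note that `URI.escape`, used by `absolutify`, was removed in Ruby 3.0, so this code targets older Rubies):

```ruby
require 'spidercrawl/page'
require 'uri'

html = "<html><head><title>Hello</title></head>" \
       "<body><a href='/about'>About</a> <a href='http://other.example/x'>Out</a></body></html>"

page = Spidercrawl::Page.new(URI.parse('http://shop.example/'),
                             response_code: 200,
                             response_body: html)

page.success?        # => true
page.title           # => "Hello"
page.links           # => ["http://shop.example/about", "http://other.example/x"]
page.internal_links  # => ["http://shop.example/about"]
```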
data/lib/spidercrawl/request.rb
ADDED
@@ -0,0 +1,155 @@
require 'spidercrawl/page'
require 'spidercrawl/user_agents'
require 'net/http'
require 'curb'
require 'colorize'
require 'typhoeus'

module Spidercrawl
  # Makes the request to the targeted website
  class Request

    attr_accessor :uri

    def initialize(url, options = {})
      @uri = URI.parse(url)
      @threads = options[:threads]
      @timeout = options[:timeout]

      @http = Net::HTTP.new(@uri.host, @uri.port) do |http|
        http.open_timeout = @timeout # in seconds
        http.read_timeout = @timeout # in seconds
      end

      @c = Curl::Easy.new(@uri.to_s) do |curl|
        curl.headers['User-Agent'] = UserAgents.random
      end
    end

    #
    # Fetch a page from the given url using libcurl
    #
    def curl
      puts "fetching #{@uri.to_s}".green.on_black
      start_time = Time.now
      begin
        c = @c
        c.url = @uri.to_s
        c.perform
        end_time = Time.now
        case c.response_code
        when 200 then
          page = Page.new(@uri, response_code: c.response_code,
                                response_head: c.header_str,
                                response_body: c.body_str,
                                response_time: ((end_time-start_time)*1000).round,
                                crawled_time: (Time.now.to_f*1000).to_i)
        when 300..307 then
          page = Page.new(@uri, response_code: c.response_code,
                                response_head: c.header_str,
                                response_body: c.body_str,
                                response_time: ((end_time-start_time)*1000).round,
                                redirect_url: c.redirect_url)
        when 404 then
          page = Page.new(@uri, response_code: c.response_code,
                                response_time: ((end_time-start_time)*1000).round)
        end
      rescue Exception => e
        puts e.inspect
        puts e.backtrace
      end
    end

    #
    # Fetch a page from the given url using net/http
    #
    def fetch
      puts "fetching #{@uri.to_s}".green.on_black
      start_time = Time.now
      begin
        request = Net::HTTP::Get.new(@uri.request_uri)
        request["User-Agent"] = UserAgents.random
        response = @http.request(request)
        end_time = Time.now
        case response
        when Net::HTTPSuccess then
          page = Page.new(@uri, response_code: response.code.to_i,
                                response_head: response.instance_variable_get("@header"),
                                response_body: response.body,
                                response_time: (end_time-start_time).to_f,
                                crawled_time: (Time.now.to_f*1000).to_i)
        when Net::HTTPRedirection then
          page = Page.new(@uri, response_code: response.code.to_i,
                                response_head: response.instance_variable_get("@header"),
                                response_body: response.body,
                                response_time: (end_time-start_time).to_f,
                                redirect_url: response['location'])
        when Net::HTTPNotFound then
          page = Page.new(@uri, response_code: response.code.to_i,
                                response_time: (end_time-start_time).to_f)
        end
      rescue Exception => e
        puts e.inspect
        puts e.backtrace
      end
    end
  end

  # Makes parallel requests to the targeted website using typhoeus and hydra
  class ParallelRequest

    attr_accessor :urls

    def initialize(urls, options = {})
      @urls = urls
      @threads = options[:threads]
      @timeout = options[:timeout]
    end

    #
    # Fetch page(s) from the given url(s)
    #
    def fetch
      hydra = Typhoeus::Hydra.new(:max_concurrency => @threads)
      page, pages = nil, []

      @urls.each do |url|
        request = Typhoeus::Request.new(url, :timeout => @timeout, :followlocation => false, :headers => {"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "Cache-Control" => "no-cache", "Pragma" => "no-cache", "User-Agent" => UserAgents.random})
        request.on_complete do |response|
          uri = URI(url)
          if response.success?
            puts "fetching #{url}".green.on_black
            page = Page.new(uri, response_code: response.code,
                                 response_head: response.headers,
                                 response_body: response.body,
                                 response_time: response.time*1000,
                                 crawled_time: (Time.now.to_f*1000).to_i)
          elsif (300..307).include?(response.code)
            puts "fetching #{url}".green.on_black
            puts "### #{response.code} ### redirect to #{response.headers['Location']}".white.on_black
            page = Page.new(uri, response_code: response.code,
                                 response_head: response.headers,
                                 response_body: response.body,
                                 response_time: response.time*1000,
                                 redirect_url: response.headers['Location'])
          elsif 404 == response.code
            puts "fetching #{url}".green.on_black
            puts "### #{response.code} ### not found #{url}".magenta.on_black
            page = Page.new(uri, response_code: response.code,
                                 response_time: response.time*1000)
          else
            puts "fetching #{url}".green.on_black
            puts "### #{response.code} ### failed #{url}".magenta.on_black
            puts "### Time: #{response.time} ### #{response.return_message}".magenta.on_black
            page = Page.new(uri, response_code: response.code,
                                 response_time: response.time*1000)
          end
          pages << page
        end
        hydra.queue(request)
      end
      hydra.run
      return pages
    end
  end
end
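The two request paths above can also be used directly without going through SpiderWorker. A minimal sketch (URLs and option values are illustrative):

```ruby
require 'spidercrawl/request'

# Single fetch: Request#curl uses libcurl, Request#fetch uses net/http
req = Spidercrawl::Request.new('http://example.com/', :timeout => 20)
page = req.curl                   # or req.fetch
puts page.response_code if page   # nil when the status code is not handled above

# Batched fetch: ParallelRequest#fetch runs the URLs through a Typhoeus::Hydra
batch = Spidercrawl::ParallelRequest.new(['http://example.com/', 'http://example.org/'],
                                         :threads => 5, :timeout => 20)
batch.fetch.each { |p| puts "#{p.response_code} #{p.url}" }
```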
data/lib/spidercrawl/spider_worker.rb
ADDED
@@ -0,0 +1,203 @@
require 'spidercrawl/request'
require 'spidercrawl/page'
require 'colorize'

module Spidercrawl
  # Start working hard
  class SpiderWorker

    attr_reader :page

    def initialize(url, options = {})
      @url = url
      #@headers = options[:headers]
      @delay = options[:delay] ? options[:delay] : 0 # default 0 seconds
      @threads = options[:threads] ? options[:threads] : 10 # default 10 threads
      @timeout = options[:timeout] ? options[:timeout] : 20 # default 20 seconds
      @allow_redirections = options[:allow_redirections]
      @max_pages = options[:max_pages]
      @pattern = options[:pattern]
      @setup = nil
      @teardown = nil
      @redirect = nil
      @success = nil
      @failure = nil
    end

    def crawl
      link_queue = Queue.new
      pages, visited_links = [], []
      link_queue << @url

      spider_worker = Request.new(@url, :threads => @threads, :timeout => @timeout)

      begin
        url = link_queue.pop
        next if visited_links.include?(url) || (@pattern && url !~ @pattern)

        start_time = Time.now
        response = @setup.yield url unless @setup.nil?
        end_time = Time.now

        spider_worker.uri = URI.parse(url)
        page = (response ? setup_page(URI.parse(url), response, ((end_time - start_time).to_f*1000).to_i) : spider_worker.curl)
        visited_links << url

        if page.success? || page.redirect? then
          while page.redirect?
            puts ("### redirect to #{page.location}" + (visited_links.include?(page.location) ? " which we have already visited!" : "")).white.on_black
            break if visited_links.include?(page.location)

            start_time = Time.now
            response = @redirect.yield page.location unless @redirect.nil?
            end_time = Time.now

            spider_worker.uri = URI.parse(page.location)
            page = (response ? setup_page(URI.parse(page.location), response, ((end_time - start_time).to_f*1000).to_i) : spider_worker.curl)
            visited_links << page.url
          end
          if !visited_links.include?(page.location)
            pages << page unless page.content == ""
            page.internal_links.each do |link|
              if !visited_links.include?(link)
                link_queue << link if @pattern && link =~ @pattern
                link_queue << link unless @pattern
              end
            end unless page.internal_links.nil?
            @teardown.yield page unless @teardown.nil?
            sleep @delay
          end
        elsif page.not_found? then
          puts "page not found"
        end
      end until link_queue.empty?
      pages
    end

    def parallel_crawl
      link_queue = Queue.new
      pages, visited_links = [], []
      link_queue << @url

      spider_workers = ParallelRequest.new([@url], :threads => @threads, :timeout => @timeout)

      begin
        urls = []
        while !link_queue.empty?
          url = link_queue.pop
          next if visited_links.include?(url) || (@pattern && url !~ @pattern)
          visited_links << url

          start_time = Time.now
          response = @setup.yield url unless @setup.nil?
          end_time = Time.now

          if response then
            pages << (page = setup_page(URI.parse(url), response, ((end_time - start_time).to_f*1000).to_i))
            @teardown.yield page unless @teardown.nil?

            page.internal_links.each do |link| # queue up internal links for crawling
              if !visited_links.include?(link)
                link_queue << link if @pattern && link =~ @pattern
                link_queue << link unless @pattern
              end
            end unless page.internal_links.nil?
          else # queue up url for crawling
            urls << url
            puts "queue: #{url}"
          end
        end

        spider_workers.urls = urls
        responses = spider_workers.fetch

        responses.each do |page|
          if (503..504).include?(page.response_code) then
            link_queue << page.url
          elsif page.success? || page.redirect? then
            response = nil
            if page.redirect? then
              puts ("### redirect to #{page.location}" + (visited_links.include?(page.location) ? " which we have already visited!" : "")).white.on_black
              unless visited_links.include?(page.location) || (@pattern && page.location !~ @pattern)
                start_time = Time.now
                response = @redirect.yield page.location unless @redirect.nil?
                end_time = Time.now

                if response then
                  page = setup_page(URI.parse(page.location), response, ((end_time - start_time).to_f*1000).to_i)
                  visited_links << page.url
                else
                  puts "queue: #{page.location}"
                  link_queue << page.location
                end
              else
                puts "discard: #{page.location}"
              end
            end
            if page.success? || response then
              pages << page unless page.content == ""
              page.internal_links.each do |link| # queue up internal links for crawling
                if !visited_links.include?(link)
                  link_queue << link if @pattern && link =~ @pattern
                  link_queue << link unless @pattern
                end
              end unless page.internal_links.nil?
              page.crawled_time = (Time.now.to_f*1000).to_i
              @teardown.yield page unless @teardown.nil?
            end
          elsif page.not_found? then
            puts "page not found"
          end
        end
      end until link_queue.empty?
      pages
    end

    #
    # Code block for before fetch
    #
    def before_fetch(&block)
      @setup = block if block
    end

    #
    # Code block for after fetch
    #
    def after_fetch(&block)
      @teardown = block if block
    end

    #
    # Code block for on redirect
    #
    def on_redirect(&block)
      @redirect = block if block
    end

    #
    # Code block for on success
    #
    def on_success(&block)
      @success = block if block
    end

    #
    # Code block for on failure
    #
    def on_failure(&block)
      @failure = block if block
    end

    #
    # Setup page based on given response
    #
    private
    def setup_page(uri, response, response_time)
      page = Page.new(uri, response_code: response.code.to_i,
                           response_head: response.instance_variable_get("@header"),
                           response_body: response.body,
                           response_time: response_time,
                           crawled_time: (Time.now.to_f*1000).to_i)
    end
  end
end
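On the hook semantics in `crawl`/`parallel_crawl` above: a truthy value returned by the `before_fetch` (or `on_redirect`) block is treated as a ready-made response and fed to `setup_page`, while a nil return lets the worker fetch the URL itself. A minimal sketch of using SpiderWorker directly under those semantics (the URL, delay, and file name are illustrative):

```ruby
require 'spidercrawl'

worker = Spidercrawl::SpiderWorker.new('http://example.com/', :delay => 1)

# Returning nil means "fetch this URL normally"; returning a Net::HTTP-style
# response (responding to #code and #body) would build the Page from it instead.
worker.before_fetch { |url| puts "fetching #{url}"; nil }

# Called with the Location value of a redirect before it is followed.
worker.on_redirect { |location| puts "following redirect to #{location}"; nil }

# Called with the finished Page object after each fetch.
worker.after_fetch { |page| File.write('titles.txt', "#{page.title}\n", mode: 'a') }

pages = worker.crawl
```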
data/lib/spidercrawl/user_agents.rb
ADDED
@@ -0,0 +1,25 @@
#
# Copyright (c) 2013 Charles H Martin, PhD
#

class UserAgents
  # Random agents
  def self.random
    case rand(20)
    when 0
      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:#{10+rand(10)}.#{rand(10)}) Gecko/20#{10+rand(3)}#{1000+rand(3)*100+rand(28)} Firefox/20.0"
    when 1
      "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.#{10+rand(10)}) Gecko/20#{10+rand(3)}#{1000+rand(3)*100+rand(28)} Ubuntu/10.10 (maverick) Firefox/3.6.#{14+rand(5)}"
    when 2
      ver = "#{400+rand(99)}.#{10+rand(75)}"
      "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/#{ver} (KHTML, like Gecko) Chrome/12.0.#{700+rand(90)}.#{100+rand(200)} Safari/#{ver}"
    when 3
      ver = "#{400+rand(99)}.#{rand(9)}"
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/#{ver} (KHTML, like Gecko) Chrome/13.0.#{700+rand(90)}.#{100+rand(200)} Safari/#{ver}"
    when 4
      "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:5.0) Gecko/20#{10+rand(3)}#{1000+rand(3)*100+rand(28)} Firefox/#{4+rand(1)}.0"
    when 5
      "Mozilla/4.0 (compatible; MSIE 8.#{rand(6)}; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.#{50000+rand(7000)}; .NET CLR 3.5.#{30000+rand(8000)}; .NET CLR 3.0.#{30000+rand(8000)}; Media Center PC 6.0; .NET4.0C; .NET4.0E; MS-RTC LM 8; Zune 4.#{6+rand(3)})"
    end
  end
end
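One observation on the code above: `rand(20)` combined with only `when 0` through `when 5` means `UserAgents.random` returns nil for the other fourteen values, roughly 70% of calls, so the User-Agent header set in Request is frequently left unset. A caller-side fallback (the fallback string is illustrative):

```ruby
agent = UserAgents.random || "Mozilla/5.0 (compatible; spidercrawl)"
```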
data/spidercrawl.gemspec
ADDED
@@ -0,0 +1,28 @@
# coding: utf-8
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'spidercrawl/version'

Gem::Specification.new do |spec|
  spec.name          = "spidercrawl"
  spec.version       = Spidercrawl::VERSION
  spec.authors       = ["Belson Heng"]
  spec.email         = ["belsonheng@gmail.com"]
  spec.summary       = %q{A ruby gem that can crawl a domain and let you have information about the pages it visits.}
  spec.description   = %q{With the help of Nokogiri, SpiderCrawl will parse each page and return you its title, links, css, words, and many many more! You can also customize what you want to do before & after each fetch request.}
  spec.homepage      = "http://github.com/belsonheng/spidercrawl"
  spec.license       = "MIT"

  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  spec.add_dependency 'nokogiri', '~> 1.6'
  spec.add_dependency "curb"
  spec.add_dependency "colorize"
  spec.add_dependency "typhoeus"

  spec.add_development_dependency "bundler", "~> 1.7"
  spec.add_development_dependency "rake", "~> 10.0"
end
metadata
ADDED
@@ -0,0 +1,143 @@
--- !ruby/object:Gem::Specification
name: spidercrawl
version: !ruby/object:Gem::Version
  version: 0.3.9
platform: ruby
authors:
- Belson Heng
autorequire:
bindir: bin
cert_chain: []
date: 2015-07-30 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.6'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.6'
- !ruby/object:Gem::Dependency
  name: curb
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: colorize
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: typhoeus
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - ">="
      - !ruby/object:Gem::Version
        version: '0'
- !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.7'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.7'
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '10.0'
description: With the help of Nokogiri, SpiderCrawl will parse each page and return
  you its title, links, css, words, and many many more! You can also customize what
  you want to do before & after each fetch request.
email:
- belsonheng@gmail.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- ".gitignore"
- Gemfile
- LICENSE.txt
- README.md
- Rakefile
- lib/spidercrawl.rb
- lib/spidercrawl/page.rb
- lib/spidercrawl/request.rb
- lib/spidercrawl/spider_worker.rb
- lib/spidercrawl/user_agents.rb
- lib/spidercrawl/version.rb
- spidercrawl.gemspec
homepage: http://github.com/belsonheng/spidercrawl
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.4.5
signing_key:
specification_version: 4
summary: A ruby gem that can crawl a domain and let you have information about the
  pages it visits.
test_files: []