rwspider 0.4.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Changelog.rdoc +5 -0
- data/LICENSE.rdoc +25 -0
- data/Manifest +13 -0
- data/README.rdoc +55 -0
- data/Rakefile +59 -0
- data/lib/rwspider/client.rb +235 -0
- data/lib/rwspider/document.rb +183 -0
- data/lib/rwspider/queue.rb +25 -0
- data/lib/rwspider/version.rb +29 -0
- data/lib/rwspider.rb +57 -0
- data/rwspider.gemspec +44 -0
- data/test/client_test.rb +34 -0
- data/test/document_test.rb +55 -0
- data/test/rwspider_test.rb +30 -0
- metadata +120 -0
data/Changelog.rdoc
ADDED
data/LICENSE.rdoc
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
= License
|
2
|
+
|
3
|
+
(The MIT License)
|
4
|
+
|
5
|
+
Copyright (c) 2009 Simone Rinzivillo <srinzivillo@gmail.com>
|
6
|
+
|
7
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
8
|
+
a copy of this software and associated documentation files (the
|
9
|
+
"Software"), to deal in the Software without restriction, including
|
10
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
11
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
12
|
+
permit persons to whom the Software is furnished to do so, subject to
|
13
|
+
the following conditions:
|
14
|
+
|
15
|
+
The above copyright notice and this permission notice shall be
|
16
|
+
included in all copies or substantial portions of the Software.
|
17
|
+
|
18
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
19
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
20
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
21
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
22
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
23
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
24
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
25
|
+
|
data/Manifest
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
Changelog.rdoc
|
2
|
+
LICENSE.rdoc
|
3
|
+
README.rdoc
|
4
|
+
Rakefile
|
5
|
+
lib/rwspider.rb
|
6
|
+
lib/rwspider/client.rb
|
7
|
+
lib/rwspider/document.rb
|
8
|
+
lib/rwspider/queue.rb
|
9
|
+
lib/rwspider/version.rb
|
10
|
+
test/client_test.rb
|
11
|
+
test/document_test.rb
|
12
|
+
test/rwspider_test.rb
|
13
|
+
Manifest
|
data/README.rdoc
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
= RW Spider
|
2
|
+
|
3
|
+
RW Spider is a multithreading spider client written in Ruby.
|
4
|
+
The library was designed to make easy the development of programs that spider the web.
|
5
|
+
|
6
|
+
RW Spider's design comes from the direct experience of developing another PHP library that is currently used as the engine for a freeware SEO tool.
|
7
|
+
|
8
|
+
== Features
|
9
|
+
|
10
|
+
* Multithreading spider
|
11
|
+
* Customizable options for the spider job
|
12
|
+
* Robots.txt support
|
13
|
+
* Indexing of web pages and others files (images, CSS, JavaScript, PDF and more)
|
14
|
+
* Following redirects
|
15
|
+
|
16
|
+
== Requirements
|
17
|
+
|
18
|
+
* Ruby >= 1.8.7
|
19
|
+
* Hpricot >= 0.8.2
|
20
|
+
* Robotstxt >= 0.5.2
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
== Installation
|
25
|
+
|
26
|
+
This library is intended to be installed via the
|
27
|
+
Gemcutter[http://gemcutter.org] system.
|
28
|
+
|
29
|
+
$ gem install rwspider
|
30
|
+
|
31
|
+
You might need administrator privileges on your system to install it.
|
32
|
+
|
33
|
+
|
34
|
+
|
35
|
+
== Author
|
36
|
+
|
37
|
+
Author:: {Simone Rinzivillo}[http://www.simonerinzivillo.it/] <srinzivillo@gmail.com>
|
38
|
+
|
39
|
+
|
40
|
+
== Resources
|
41
|
+
|
42
|
+
* {Homepage}[http://www.rwspider.com/]
|
43
|
+
* {Author}[http://www.simonerinzivillo.it/]
|
44
|
+
* {GitHub}[http://github.com/rinzi/rwspider/]
|
45
|
+
|
46
|
+
|
47
|
+
== Changelog
|
48
|
+
|
49
|
+
See the CHANGELOG.rdoc file for details.
|
50
|
+
|
51
|
+
|
52
|
+
== License
|
53
|
+
|
54
|
+
Copyright (c) 2009 Simone Rinzivillo, RW Spider is released under the MIT license.
|
55
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__) + "/lib")
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'rake'
|
5
|
+
require 'echoe'
|
6
|
+
require 'rwspider'
|
7
|
+
|
8
|
+
|
9
|
+
# Common package properties
|
10
|
+
PKG_NAME = 'rwspider'
|
11
|
+
PKG_VERSION = Rwspider::VERSION
|
12
|
+
RUBYFORGE_PROJECT = 'rwspider'
|
13
|
+
|
14
|
+
if ENV['SNAPSHOT'].to_i == 1
|
15
|
+
PKG_VERSION << "." << Time.now.utc.strftime("%Y%m%d%H%M%S")
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
Echoe.new(PKG_NAME, PKG_VERSION) do |p|
|
20
|
+
p.author = "Simone Rinzivillo"
|
21
|
+
p.email = "srinzivillo@gmail.com"
|
22
|
+
p.summary = "RW Spider is an multithreading spider client written in Ruby"
|
23
|
+
p.url = "http://www.rwspider.com"
|
24
|
+
p.project = RUBYFORGE_PROJECT
|
25
|
+
p.description = <<-EOD
|
26
|
+
RW Spider is an multithreading spider client written in Ruby designed to make easy \
|
27
|
+
the development of programs that spider the web.
|
28
|
+
EOD
|
29
|
+
|
30
|
+
p.need_zip = true
|
31
|
+
|
32
|
+
p.development_dependencies += ["rake ~>0.8",
|
33
|
+
"hpricot ~>0.8.2",
|
34
|
+
"robotstxt ~>0.5.2",
|
35
|
+
"echoe ~>3.1"]
|
36
|
+
|
37
|
+
p.rcov_options = ["-Itest -x mocha,rcov,Rakefile"]
|
38
|
+
end
|
39
|
+
|
40
|
+
|
41
|
+
desc "Open an irb session preloaded with this library"
|
42
|
+
task :console do
|
43
|
+
sh "irb -rubygems -I lib -r rwspider.rb"
|
44
|
+
end
|
45
|
+
|
46
|
+
begin
|
47
|
+
require 'code_statistics'
|
48
|
+
desc "Show library's code statistics"
|
49
|
+
task :stats do
|
50
|
+
CodeStatistics.new(["Rwspider", "lib"],
|
51
|
+
["Tests", "test"]).to_s
|
52
|
+
end
|
53
|
+
rescue LoadError
|
54
|
+
puts "CodeStatistics (Rails) is not available"
|
55
|
+
end
|
56
|
+
|
57
|
+
Dir["tasks/**/*.rake"].each do |file|
|
58
|
+
load(file)
|
59
|
+
end
|
@@ -0,0 +1,235 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
|
17
|
+
require 'net/http'
|
18
|
+
require 'openssl'
|
19
|
+
require 'uri'
|
20
|
+
require 'robotstxt'
|
21
|
+
require 'rwspider/document'
|
22
|
+
require 'rwspider/queue'
|
23
|
+
require 'rwspider/version'
|
24
|
+
require 'hpricot'
|
25
|
+
|
26
|
+
|
27
|
+
module Rwspider
|
28
|
+
class Client
|
29
|
+
|
30
|
+
# Hash of options for the spider job
|
31
|
+
attr_accessor :opts
|
32
|
+
|
33
|
+
|
34
|
+
# DEFAULT_OPTIONS properties
|
35
|
+
#
|
36
|
+
# useragent: The User Agent that RW Spider must apply in HTTP requests
|
37
|
+
#
|
38
|
+
# robot_name: The Robot name that RW Spider must apply in HTTP requests
|
39
|
+
#
|
40
|
+
# scan_documents_limit: The limit of the documents that RW Spider can download,
|
41
|
+
# set as <tt>nil</tt> for start the indexing job without restriction on the number of the download
|
42
|
+
#
|
43
|
+
# scan_domain_limit: Set to restrict the indexing job to the current domain name
|
44
|
+
#
|
45
|
+
# scan_images -Set as <tt>true</tt> to enable the download of the image files
|
46
|
+
#
|
47
|
+
# scan_other_files: Set as <tt>true</tt> to enable the download of the other files as javascript and css
|
48
|
+
#
|
49
|
+
# follow_robotstxt_directive: Set as <tt>true</tt> to enable the analysis of the Robots.txt rules to check the accessibility of URLs
|
50
|
+
#
|
51
|
+
# follow_HTTP_redirection: Set as <tt>true</tt> to follow the HTTP redirections
|
52
|
+
#
|
53
|
+
# timeout: The timeout of single URL analysis
|
54
|
+
|
55
|
+
DEFAULT_OPTIONS = {
|
56
|
+
:useragent => 'RW Spider/' + Rwspider::VERSION,
|
57
|
+
:robot_name => 'rwspider',
|
58
|
+
:scan_documents_limit => 100,
|
59
|
+
:scan_domain_limit => nil,
|
60
|
+
:scan_images => false,
|
61
|
+
:scan_other_files => false,
|
62
|
+
:follow_robotstxt_directive => true,
|
63
|
+
:follow_HTTP_redirection => true,
|
64
|
+
:timeout => 5
|
65
|
+
}
|
66
|
+
|
67
|
+
# Inizialize a new Rwspider::Client instance, accept an <tt>Hash</tt> of options.
|
68
|
+
# RW Spider apply the Rwspider::Client::DEFAULT_OPTIONS indexing options if you don't customize them
|
69
|
+
#
|
70
|
+
# opts = {
|
71
|
+
# :useragent => 'My user agent',
|
72
|
+
# :robot_name => 'my_spider_name',
|
73
|
+
# :scan_documents_limit => 100,
|
74
|
+
# :scan_domain_limit => nil,
|
75
|
+
# :scan_images => true,
|
76
|
+
# :scan_other_files => false,
|
77
|
+
# :follow_robotstxt_directive => true,
|
78
|
+
# :follow_HTTP_redirection => true,
|
79
|
+
# :timeout => 5
|
80
|
+
# }
|
81
|
+
# spider = Rwspider::Client::new(opts)
|
82
|
+
#
|
83
|
+
def initialize (options = {})
|
84
|
+
|
85
|
+
load_options options
|
86
|
+
@robotstxt_cache = Hash.new()
|
87
|
+
@main_hostname = ''
|
88
|
+
@scanned_documents = 0
|
89
|
+
@queue = Rwspider::Queue.new
|
90
|
+
|
91
|
+
end
|
92
|
+
|
93
|
+
# Start the crawling from the <tt>URL</tt>.
|
94
|
+
#
|
95
|
+
# Rwspider::Client::start yield an instance of Rwspider::Document Class for each page downloaded.
|
96
|
+
# At the end of execution returns an <tt>Array</tt> of Rwspider::Document instances.
|
97
|
+
#
|
98
|
+
# Rwspider::Client::start('http://www.rwspider.com') {do |d|
|
99
|
+
# puts 'Current URL ' + d.url.normalize.to_s
|
100
|
+
# }
|
101
|
+
#
|
102
|
+
# arr = Rwspider::Client::start('http://www.rwspider.com')
|
103
|
+
# arr.each{do |d|
|
104
|
+
# puts 'Current URL ' + d.url.normalize.to_s
|
105
|
+
# }
|
106
|
+
#
|
107
|
+
def start (start_url)
|
108
|
+
|
109
|
+
@queue << Rwspider::Document.new(start_url)
|
110
|
+
|
111
|
+
@queue.each do |link|
|
112
|
+
@main_url = link.url
|
113
|
+
if @opts[:scan_documents_limit].nil? || @scanned_documents < @opts[:scan_documents_limit]
|
114
|
+
set_as_visited link
|
115
|
+
@main_hostname = link.url.host.downcase if @main_hostname.length == 0
|
116
|
+
|
117
|
+
t = Thread.new(link) { |link|
|
118
|
+
begin
|
119
|
+
|
120
|
+
Timeout::timeout(@opts[:timeout]){
|
121
|
+
beginning = Time.now
|
122
|
+
response = get_uri(link.url)
|
123
|
+
link.download_time = Time.now - beginning
|
124
|
+
link.as_downloaded = true
|
125
|
+
link.http_response = response
|
126
|
+
|
127
|
+
yield link if block_given?
|
128
|
+
|
129
|
+
case response
|
130
|
+
when Net::HTTPSuccess then
|
131
|
+
|
132
|
+
if response.content_type == 'text/html' && (@opts[:scan_domain_limit].nil? || link.url.host.downcase.match(@opts[:scan_domain_limit]) )
|
133
|
+
|
134
|
+
link.get_links
|
135
|
+
link.get_images if @opts[:scan_images]
|
136
|
+
link.get_other_files if @opts[:scan_other_files]
|
137
|
+
|
138
|
+
link.documents.each do |doc|
|
139
|
+
add_to_queue doc
|
140
|
+
end
|
141
|
+
end
|
142
|
+
|
143
|
+
when Net::HTTPRedirection then
|
144
|
+
add_to_queue(Document.new(link.normalize_url(Document.new(response['location']).url))) if @opts[:follow_HTTP_redirection]
|
145
|
+
|
146
|
+
end
|
147
|
+
}
|
148
|
+
rescue StandardError => e
|
149
|
+
link.as_downloaded = false
|
150
|
+
yield link if block_given?
|
151
|
+
end
|
152
|
+
}
|
153
|
+
t.join
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
return @queue
|
158
|
+
|
159
|
+
end
|
160
|
+
|
161
|
+
|
162
|
+
|
163
|
+
private
|
164
|
+
|
165
|
+
def add_to_queue (document)
|
166
|
+
|
167
|
+
if follow?(document)
|
168
|
+
@queue.each do |link|
|
169
|
+
if link.url.normalize == document.url.normalize
|
170
|
+
document.as_visited = true
|
171
|
+
link.inbound_links << @main_url.normalize.to_s if !link.inbound_links.include?(@main_url.normalize.to_s)
|
172
|
+
break
|
173
|
+
end
|
174
|
+
end
|
175
|
+
|
176
|
+
document.inbound_links << @main_url.normalize.to_s
|
177
|
+
@queue << document if !document.as_visited
|
178
|
+
end
|
179
|
+
end
|
180
|
+
|
181
|
+
def load_options(opts)
|
182
|
+
@opts = DEFAULT_OPTIONS.merge opts
|
183
|
+
end
|
184
|
+
|
185
|
+
def get_uri(url)
  # Fetch +url+ over HTTP(S) with a single retry on transient network
  # errors.
  #
  # Returns the Net::HTTPResponse, or nil when the request failed on
  # both attempts.
  #
  # NOTE(review): SSL certificate verification is disabled for https
  # URLs (VERIFY_NONE); acceptable for crawling public pages, but worth
  # flagging.
  @ehttp = true
  begin
    http = Net::HTTP.new(url.host, url.port)
    if url.scheme == 'https'
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      http.use_ssl = true
    end

    http.request(Net::HTTP::Get.new(url.request_uri, 'User-Agent' => @opts[:useragent]))
  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
    if @ehttp
      # Bug fix: the original assigned "@ettp = false" (typo), which left
      # @ehttp permanently true and made `retry` loop forever whenever the
      # error persisted.
      @ehttp = false
      retry
    end
  end
end
|
205
|
+
|
206
|
+
|
207
|
+
|
208
|
+
# Mark +var+ (a Rwspider::Document) as already visited and bump the
# running count of documents processed so far.
def set_as_visited(var)
  @scanned_documents += 1
  var.as_visited = true
end
|
212
|
+
|
213
|
+
|
214
|
+
|
215
|
+
# Decide whether +document+ may be crawled.
#
# When :follow_robotstxt_directive is enabled, the robots.txt of the
# document's host is fetched (and memoised in @robotstxt_cache, keyed
# by host) and consulted; otherwise every document is allowed.
def follow?(document)
  return true unless @opts[:follow_robotstxt_directive]

  host = document.url.host
  if @robotstxt_cache.include?(host)
    parser = @robotstxt_cache[host]
  else
    parser = Robotstxt::Parser.new(@opts[:robot_name])
    parser.get(document.url.scheme + '://' + host)
    @robotstxt_cache[host] = parser
  end
  parser.allowed?(document.url.normalize.to_s)
end
|
230
|
+
|
231
|
+
|
232
|
+
|
233
|
+
end
|
234
|
+
|
235
|
+
end
|
@@ -0,0 +1,183 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
require 'uri/generic'
|
17
|
+
require 'hpricot'
|
18
|
+
|
19
|
+
module Rwspider
|
20
|
+
class Document
|
21
|
+
include URI
|
22
|
+
|
23
|
+
# instance of <tt>URI</tt>
|
24
|
+
attr_accessor :url
|
25
|
+
|
26
|
+
# Returns <tt>true</tt> if the Rwspider::Document::url was request
|
27
|
+
attr_accessor :as_visited
|
28
|
+
|
29
|
+
# An instance of Net::HTTPResponse that contains the response returned from the web server
|
30
|
+
attr_accessor :http_response
|
31
|
+
|
32
|
+
# An <tt>Array</tt> of Rwspider::Document found in the HTML code of the current Rwspider::Document
|
33
|
+
attr_accessor :documents
|
34
|
+
|
35
|
+
# The time spent to download the Rwspider::Document
|
36
|
+
attr_accessor :download_time
|
37
|
+
|
38
|
+
# Returns <tt>true</tt> if the Rwspider::Document::url was downloaded correctly
|
39
|
+
attr_accessor :as_downloaded
|
40
|
+
|
41
|
+
# An <tt>Array</tt> of <tt>String</tt> hat contains the URLs of the documents where was found an link at the current Rwspider::Document
|
42
|
+
attr_reader :inbound_links
|
43
|
+
|
44
|
+
|
45
|
+
# Inizialize a new Rwspider::Document instance with the <tt>url</tt>
|
46
|
+
#
|
47
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
48
|
+
#
|
49
|
+
def initialize (url)
|
50
|
+
parse(url)
|
51
|
+
@tag_type = Array.new
|
52
|
+
@tag_type << ['a','href']
|
53
|
+
@tag_type << ['img','src']
|
54
|
+
@tag_type << ['link','href']
|
55
|
+
@inbound_links = []
|
56
|
+
@documents = []
|
57
|
+
@as_visited = false
|
58
|
+
end
|
59
|
+
|
60
|
+
# Rwspider::Document::parse load or replace the Rwspider::Document.url with the new <tt>url</tt>
|
61
|
+
#
|
62
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
63
|
+
# doc.parse('http://www.rwspider.com/sitemap.html')
|
64
|
+
#
|
65
|
+
def parse (url)
|
66
|
+
begin
|
67
|
+
@url = URI.parse(url.gsub(/\\/,'/'))
|
68
|
+
rescue Exception => e
|
69
|
+
nil
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
# Analyze the HTML code of the current Rwspider::Document to extract the links at other documents.
|
74
|
+
#
|
75
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
76
|
+
# http = Net::HTTP.new(doc.url.host, doc.url.port)
|
77
|
+
# doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
|
78
|
+
# arr = doc.get_links
|
79
|
+
#
|
80
|
+
# This method returns an <tt>Array</tt> of instances of Rwspider::Document
|
81
|
+
# and append the Array at the documents attribute.
|
82
|
+
# Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
|
83
|
+
# was different from 'text/html'.
|
84
|
+
#
|
85
|
+
def get_links()
|
86
|
+
get_document(@tag_type[0])
|
87
|
+
end
|
88
|
+
|
89
|
+
# Analyze the HTML code of the current Rwspider::Document to extract the links at images.
|
90
|
+
#
|
91
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
92
|
+
# http = Net::HTTP.new(doc.url.host, doc.url.port)
|
93
|
+
# doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
|
94
|
+
# arr = doc.get_images
|
95
|
+
#
|
96
|
+
# This method returns an <tt>Array</tt> of instances of Rwspider::Document
|
97
|
+
# and append the Array at the documents attribute.
|
98
|
+
# Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
|
99
|
+
# was different from 'text/html'.
|
100
|
+
#
|
101
|
+
def get_images()
|
102
|
+
get_document(@tag_type[1])
|
103
|
+
end
|
104
|
+
|
105
|
+
# Analyze the HTML code of the current Rwspider::Document to extract the links at other files
|
106
|
+
# as javascript and css.
|
107
|
+
#
|
108
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
109
|
+
# http = Net::HTTP.new(doc.url.host, doc.url.port)
|
110
|
+
# doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
|
111
|
+
# arr = doc.get_other_files
|
112
|
+
#
|
113
|
+
# This method returns an <tt>Array</tt> of instances of Rwspider::Document
|
114
|
+
# and append the Array at the documents attribute.
|
115
|
+
# Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
|
116
|
+
# was different from 'text/html'.
|
117
|
+
#
|
118
|
+
def get_other_files()
|
119
|
+
get_document(@tag_type[2])
|
120
|
+
end
|
121
|
+
|
122
|
+
# Normalize the url if the path is relative and returns an <tt>String</tt> with the absolute version.
|
123
|
+
#
|
124
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
125
|
+
# doc.normalize_url(URI.parse('/sitemap.html'))
|
126
|
+
#
|
127
|
+
def normalize_url(var)
  # Convert +var+ (a URI instance) into an absolute URL <tt>String</tt>,
  # resolving relative paths against this document's own #url.
  #
  # Returns nil for mailto:/javascript: schemes (not crawlable).
  querystring = var.query.nil? ? '' : '?' + var.query
  if var.scheme.nil? || (var.scheme.downcase != "mailto" && var.scheme != "javascript")
    if var.relative?
      path = var.path
      # Bug fix: the original tested "if url.path.nil?" and then called
      # url.path.slice inside that branch — a guaranteed NoMethodError on a
      # nil path, and the base directory was never computed when a path was
      # present. Use the directory portion of the current document's path
      # as the base, falling back to '/' when there is no usable path
      # (nil or empty, where rindex('/') would return nil).
      if !url.path.nil? && url.path.include?('/')
        main_path = url.path.slice(0..url.path.rindex('/'))
      else
        main_path = '/'
      end

      if path.match('^\/')
        # Root-relative link: host + path.
        path = url.scheme + '://' + url.host + path + querystring
      else
        # Document-relative link: host + base directory + path.
        path = url.scheme + '://' + url.host + main_path + path + querystring
      end
    else
      # Already absolute: rebuild it with the query string appended.
      path = var.scheme + '://' + var.host + var.path + querystring
    end
  end

  path
end
|
150
|
+
|
151
|
+
private
|
152
|
+
|
153
|
+
def get_document(tag)
  # Extract linked documents from the HTML body of this document.
  #
  # +tag+ is an [element_name, attribute_name] pair, e.g. ['a', 'href']
  # or ['img', 'src'].
  #
  # Returns nil unless an HTTP response is present and its content type
  # is 'text/html'; otherwise returns the Array of new
  # Rwspider::Document instances, which is also appended to #documents.
  return unless !@http_response.nil? && @http_response.content_type == 'text/html'

  sourcecode = Hpricot(@http_response.body)
  lnks = sourcecode.search("//" + tag[0])
  docs = []
  lnks.each { |link|
    # Bug fix: the attribute may be absent (e.g. an <a> tag with no
    # href); the original called .strip on nil and raised NoMethodError.
    raw = link.attributes[tag[1]]
    next if raw.nil?

    doc = Document.new(raw.strip)

    if !doc.nil? && !doc.url.nil?
      path = normalize_url(doc.url)

      if !path.nil?
        doc.parse path
        docs << doc
      end
    end
  }
  @documents = @documents + docs
  docs
end
|
178
|
+
|
179
|
+
|
180
|
+
|
181
|
+
|
182
|
+
end
|
183
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
module Rwspider

  # Queue is the crawl frontier used by Rwspider::Client: a plain Array
  # of Rwspider::Document instances that the client appends to and
  # iterates over while spidering.
  #
  # The original defined an empty #initialize override; it was removed
  # because it added nothing over Array's default behaviour.
  class Queue < Array
  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
module Rwspider

  # Version information, exposed both as individual numeric parts and
  # as the assembled dotted string (Rwspider::VERSION).
  module Version
    MAJOR = 0
    MINOR = 4
    TINY  = 2
    ALPHA = nil

    # nil components (ALPHA here) are dropped before joining, so a
    # release version reads "0.4.2" rather than "0.4.2.".
    STRING = [MAJOR, MINOR, TINY, ALPHA].compact.join('.')
  end

  # Convenience alias used throughout the library (e.g. the default
  # User-Agent string in Rwspider::Client).
  VERSION = Version::STRING

end
|
data/lib/rwspider.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
|
17
|
+
require 'rwspider/client'
|
18
|
+
require 'uri'
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
module Rwspider
|
23
|
+
|
24
|
+
NAME = 'Rwspider'
|
25
|
+
GEM = 'rwspider'
|
26
|
+
AUTHORS = ['Simone Rinzivillo <srinzivillo@gmail.com>']
|
27
|
+
|
28
|
+
# Start the crawling from the <tt>URL</tt> with the personalized <tt>options</tt>.
|
29
|
+
# RW Spider apply the Rwspider::Client::DEFAULT_OPTIONS indexing options if you don't customize them
|
30
|
+
# Rwspider::start yield an instance of Rwspider::Document Class for each page downloaded.
|
31
|
+
#
|
32
|
+
# opts = {
|
33
|
+
# :useragent => 'My user agent',
|
34
|
+
# :robot_name => 'my_spider_name',
|
35
|
+
# :scan_documents_limit => 100,
|
36
|
+
# :scan_domain_limit => nil,
|
37
|
+
# :scan_images => true,
|
38
|
+
# :scan_other_files => false,
|
39
|
+
# :follow_robotstxt_directive => true,
|
40
|
+
# :follow_HTTP_redirection => true,
|
41
|
+
# :timeout => 5
|
42
|
+
# }
|
43
|
+
# Rwspider.start('http://www.rwspider.com', opts) {do |d|
|
44
|
+
# puts 'Current URL ' + d.url.normalize.to_s
|
45
|
+
# }
|
46
|
+
#
|
47
|
+
|
48
|
+
def self.start(url, options = {})
|
49
|
+
|
50
|
+
@client = Rwspider::Client.new(options)
|
51
|
+
@client.start(url)do |doc|
|
52
|
+
yield doc if block_given?
|
53
|
+
end
|
54
|
+
|
55
|
+
end
|
56
|
+
|
57
|
+
end
|
data/rwspider.gemspec
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.name = %q{rwspider}
|
5
|
+
s.version = "0.4.2"
|
6
|
+
|
7
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
|
+
s.authors = ["Simone Rinzivillo"]
|
9
|
+
s.date = %q{2010-02-06}
|
10
|
+
s.description = %q{ RW Spider is an multithreading spider client written in Ruby designed to make easy the development of programs that spider the web.
|
11
|
+
}
|
12
|
+
s.email = %q{srinzivillo@gmail.com}
|
13
|
+
s.extra_rdoc_files = ["LICENSE.rdoc", "README.rdoc", "lib/rwspider.rb", "lib/rwspider/client.rb", "lib/rwspider/document.rb", "lib/rwspider/queue.rb", "lib/rwspider/version.rb"]
|
14
|
+
s.files = ["Changelog.rdoc", "LICENSE.rdoc", "README.rdoc", "Rakefile", "lib/rwspider.rb", "lib/rwspider/client.rb", "lib/rwspider/document.rb", "lib/rwspider/queue.rb", "lib/rwspider/version.rb", "test/client_test.rb", "test/document_test.rb", "test/rwspider_test.rb", "Manifest", "rwspider.gemspec"]
|
15
|
+
s.homepage = %q{http://www.rwspider.com}
|
16
|
+
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Rwspider", "--main", "README.rdoc"]
|
17
|
+
s.require_paths = ["lib"]
|
18
|
+
s.rubyforge_project = %q{rwspider}
|
19
|
+
s.rubygems_version = %q{1.3.5}
|
20
|
+
s.summary = %q{RW Spider is an multithreading spider client written in Ruby}
|
21
|
+
s.test_files = ["test/client_test.rb", "test/document_test.rb", "test/rwspider_test.rb"]
|
22
|
+
|
23
|
+
if s.respond_to? :specification_version then
|
24
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
25
|
+
s.specification_version = 3
|
26
|
+
|
27
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
28
|
+
s.add_development_dependency(%q<rake>, ["~> 0.8"])
|
29
|
+
s.add_development_dependency(%q<hpricot>, ["~> 0.8.2"])
|
30
|
+
s.add_development_dependency(%q<robotstxt>, ["~> 0.5.2"])
|
31
|
+
s.add_development_dependency(%q<echoe>, ["~> 3.1"])
|
32
|
+
else
|
33
|
+
s.add_dependency(%q<rake>, ["~> 0.8"])
|
34
|
+
s.add_dependency(%q<hpricot>, ["~> 0.8.2"])
|
35
|
+
s.add_dependency(%q<robotstxt>, ["~> 0.5.2"])
|
36
|
+
s.add_dependency(%q<echoe>, ["~> 3.1"])
|
37
|
+
end
|
38
|
+
else
|
39
|
+
s.add_dependency(%q<rake>, ["~> 0.8"])
|
40
|
+
s.add_dependency(%q<hpricot>, ["~> 0.8.2"])
|
41
|
+
s.add_dependency(%q<robotstxt>, ["~> 0.5.2"])
|
42
|
+
s.add_dependency(%q<echoe>, ["~> 3.1"])
|
43
|
+
end
|
44
|
+
end
|
data/test/client_test.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'rwspider'
|
5
|
+
|
6
|
+
class TestClient < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
opts = {
|
10
|
+
:useragent => 'My user agent',
|
11
|
+
:robot_name => 'my_spider_name',
|
12
|
+
:scan_documents_limit => 10,
|
13
|
+
:scan_domain_limit => nil,
|
14
|
+
:scan_images => true,
|
15
|
+
:scan_other_files => false,
|
16
|
+
:follow_robotstxt_directive => true,
|
17
|
+
:follow_HTTP_redirection => true,
|
18
|
+
:timeout => 5
|
19
|
+
}
|
20
|
+
@client = Rwspider::Client.new(opts)
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_initialize
|
24
|
+
client = Rwspider::Client.new
|
25
|
+
assert_instance_of Rwspider::Client, client
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_start
|
29
|
+
r = @client.start('http://www.rwspider.com')
|
30
|
+
assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
|
31
|
+
assert_instance_of Rwspider::Queue, r
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'rwspider'
|
5
|
+
|
6
|
+
class TestDocument < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
opts = {
|
10
|
+
:scan_documents_limit => 1,
|
11
|
+
:scan_domain_limit => 'www.rwspider.com',
|
12
|
+
:timeout => 10
|
13
|
+
}
|
14
|
+
client = Rwspider::Client.new(opts)
|
15
|
+
@doc = client.start('http://www.rwspider.com')[0]
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_initialize
|
19
|
+
d = Rwspider::Document.new('http://www.rwspider.com')
|
20
|
+
assert_instance_of Rwspider::Document, d
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_normalize_relative_url
|
24
|
+
doc = Rwspider::Document::new('http://www.rwspider.com')
|
25
|
+
assert_equal('http://www.rwspider.com/sitemap.html', doc.normalize_url(URI.parse('/sitemap.html')))
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_normalize_absolute_url
|
29
|
+
doc = Rwspider::Document::new('http://www.rwspider.com')
|
30
|
+
assert_equal('http://www.rwspider.com/sitemap.html', doc.normalize_url(URI.parse('http://www.rwspider.com/sitemap.html')))
|
31
|
+
end
|
32
|
+
|
33
|
+
def test_parse
|
34
|
+
doc = Rwspider::Document::new('http://www.rwspider.com')
|
35
|
+
doc.parse('http://www.rwspider.com/sitemap.html')
|
36
|
+
assert_equal('http://www.rwspider.com/sitemap.html', doc.url.normalize.to_s)
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_get_links
|
40
|
+
arr = @doc.get_links
|
41
|
+
assert_instance_of Array, arr
|
42
|
+
end
|
43
|
+
|
44
|
+
def test_get_images
|
45
|
+
arr = @doc.get_images
|
46
|
+
assert_instance_of Array, arr
|
47
|
+
end
|
48
|
+
|
49
|
+
def test_get_other_files
|
50
|
+
arr = @doc.get_other_files
|
51
|
+
assert_instance_of Array, arr
|
52
|
+
end
|
53
|
+
|
54
|
+
|
55
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'rwspider'
|
5
|
+
|
6
|
+
class TestRwspider < Test::Unit::TestCase
|
7
|
+
|
8
|
+
|
9
|
+
def test_start
|
10
|
+
opts = {
|
11
|
+
:useragent => 'My user agent',
|
12
|
+
:robot_name => 'my_spider_name',
|
13
|
+
:scan_documents_limit => 100,
|
14
|
+
:scan_domain_limit => nil,
|
15
|
+
:scan_images => true,
|
16
|
+
:scan_other_files => false,
|
17
|
+
:follow_robotstxt_directive => true,
|
18
|
+
:follow_HTTP_redirection => true,
|
19
|
+
:timeout => 5
|
20
|
+
}
|
21
|
+
r = Rwspider.start('http://www.rwspider.com', opts)
|
22
|
+
assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
|
23
|
+
end
|
24
|
+
|
25
|
+
def test_start_without_options
|
26
|
+
r = Rwspider.start('http://www.rwspider.com')
|
27
|
+
assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
metadata
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rwspider
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Simone Rinzivillo
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-06 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rake
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0.8"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: hpricot
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.8.2
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: robotstxt
|
37
|
+
type: :development
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.5.2
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: echoe
|
47
|
+
type: :development
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: "3.1"
|
54
|
+
version:
|
55
|
+
description: " RW Spider is an multithreading spider client written in Ruby designed to make easy \t\tthe development of programs that spider the web.\n"
|
56
|
+
email: srinzivillo@gmail.com
|
57
|
+
executables: []
|
58
|
+
|
59
|
+
extensions: []
|
60
|
+
|
61
|
+
extra_rdoc_files:
|
62
|
+
- LICENSE.rdoc
|
63
|
+
- README.rdoc
|
64
|
+
- lib/rwspider.rb
|
65
|
+
- lib/rwspider/client.rb
|
66
|
+
- lib/rwspider/document.rb
|
67
|
+
- lib/rwspider/queue.rb
|
68
|
+
- lib/rwspider/version.rb
|
69
|
+
files:
|
70
|
+
- Changelog.rdoc
|
71
|
+
- LICENSE.rdoc
|
72
|
+
- README.rdoc
|
73
|
+
- Rakefile
|
74
|
+
- lib/rwspider.rb
|
75
|
+
- lib/rwspider/client.rb
|
76
|
+
- lib/rwspider/document.rb
|
77
|
+
- lib/rwspider/queue.rb
|
78
|
+
- lib/rwspider/version.rb
|
79
|
+
- test/client_test.rb
|
80
|
+
- test/document_test.rb
|
81
|
+
- test/rwspider_test.rb
|
82
|
+
- Manifest
|
83
|
+
- rwspider.gemspec
|
84
|
+
has_rdoc: true
|
85
|
+
homepage: http://www.rwspider.com
|
86
|
+
licenses: []
|
87
|
+
|
88
|
+
post_install_message:
|
89
|
+
rdoc_options:
|
90
|
+
- --line-numbers
|
91
|
+
- --inline-source
|
92
|
+
- --title
|
93
|
+
- Rwspider
|
94
|
+
- --main
|
95
|
+
- README.rdoc
|
96
|
+
require_paths:
|
97
|
+
- lib
|
98
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: "0"
|
103
|
+
version:
|
104
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
105
|
+
requirements:
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: "1.2"
|
109
|
+
version:
|
110
|
+
requirements: []
|
111
|
+
|
112
|
+
rubyforge_project: rwspider
|
113
|
+
rubygems_version: 1.3.5
|
114
|
+
signing_key:
|
115
|
+
specification_version: 3
|
116
|
+
summary: RW Spider is an multithreading spider client written in Ruby
|
117
|
+
test_files:
|
118
|
+
- test/client_test.rb
|
119
|
+
- test/document_test.rb
|
120
|
+
- test/rwspider_test.rb
|