rwspider 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Changelog.rdoc +5 -0
- data/LICENSE.rdoc +25 -0
- data/Manifest +13 -0
- data/README.rdoc +55 -0
- data/Rakefile +59 -0
- data/lib/rwspider/client.rb +235 -0
- data/lib/rwspider/document.rb +183 -0
- data/lib/rwspider/queue.rb +25 -0
- data/lib/rwspider/version.rb +29 -0
- data/lib/rwspider.rb +57 -0
- data/rwspider.gemspec +44 -0
- data/test/client_test.rb +34 -0
- data/test/document_test.rb +55 -0
- data/test/rwspider_test.rb +30 -0
- metadata +120 -0
data/Changelog.rdoc
ADDED
data/LICENSE.rdoc
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
= License
|
2
|
+
|
3
|
+
(The MIT License)
|
4
|
+
|
5
|
+
Copyright (c) 2009 Simone Rinzivillo <srinzivillo@gmail.com>
|
6
|
+
|
7
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
8
|
+
a copy of this software and associated documentation files (the
|
9
|
+
"Software"), to deal in the Software without restriction, including
|
10
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
11
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
12
|
+
permit persons to whom the Software is furnished to do so, subject to
|
13
|
+
the following conditions:
|
14
|
+
|
15
|
+
The above copyright notice and this permission notice shall be
|
16
|
+
included in all copies or substantial portions of the Software.
|
17
|
+
|
18
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
19
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
20
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
21
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
22
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
23
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
24
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
25
|
+
|
data/Manifest
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
Changelog.rdoc
|
2
|
+
LICENSE.rdoc
|
3
|
+
README.rdoc
|
4
|
+
Rakefile
|
5
|
+
lib/rwspider.rb
|
6
|
+
lib/rwspider/client.rb
|
7
|
+
lib/rwspider/document.rb
|
8
|
+
lib/rwspider/queue.rb
|
9
|
+
lib/rwspider/version.rb
|
10
|
+
test/client_test.rb
|
11
|
+
test/document_test.rb
|
12
|
+
test/rwspider_test.rb
|
13
|
+
Manifest
|
data/README.rdoc
ADDED
@@ -0,0 +1,55 @@
|
|
1
|
+
= RW Spider
|
2
|
+
|
3
|
+
RW Spider is a multithreading spider client written in Ruby.
|
4
|
+
The library was designed to ease the development of programs that spider the web.
|
5
|
+
|
6
|
+
RW Spider's design comes from direct experience developing another PHP library that is currently used as the engine for a freeware SEO tool.
|
7
|
+
|
8
|
+
== Features
|
9
|
+
|
10
|
+
* Multithreading spider
|
11
|
+
* Customizable options for the spider job
|
12
|
+
* Robots.txt support
|
13
|
+
* Indexing of web pages and others files (images, CSS, JavaScript, PDF and more)
|
14
|
+
* Following redirects
|
15
|
+
|
16
|
+
== Requirements
|
17
|
+
|
18
|
+
* Ruby >= 1.8.7
|
19
|
+
* Hpricot >= 0.8.2
|
20
|
+
* Robotstxt >= 0.5.2
|
21
|
+
|
22
|
+
|
23
|
+
|
24
|
+
== Installation
|
25
|
+
|
26
|
+
This library is intended to be installed via the
|
27
|
+
Gemcutter[http://gemcutter.org] system.
|
28
|
+
|
29
|
+
$ gem install rwspider
|
30
|
+
|
31
|
+
You might need administrator privileges on your system to install it.
|
32
|
+
|
33
|
+
|
34
|
+
|
35
|
+
== Author
|
36
|
+
|
37
|
+
Author:: {Simone Rinzivillo}[http://www.simonerinzivillo.it/] <srinzivillo@gmail.com>
|
38
|
+
|
39
|
+
|
40
|
+
== Resources
|
41
|
+
|
42
|
+
* {Homepage}[http://www.rwspider.com/]
|
43
|
+
* {Author}[http://www.simonerinzivillo.it/]
|
44
|
+
* {GitHub}[http://github.com/rinzi/rwspider/]
|
45
|
+
|
46
|
+
|
47
|
+
== Changelog
|
48
|
+
|
49
|
+
See the CHANGELOG.rdoc file for details.
|
50
|
+
|
51
|
+
|
52
|
+
== License
|
53
|
+
|
54
|
+
Copyright (c) 2009 Simone Rinzivillo, RW Spider is released under the MIT license.
|
55
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__) + "/lib")
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'rake'
|
5
|
+
require 'echoe'
|
6
|
+
require 'rwspider'
|
7
|
+
|
8
|
+
|
9
|
+
# Common package properties
|
10
|
+
PKG_NAME = 'rwspider'
|
11
|
+
PKG_VERSION = Rwspider::VERSION
|
12
|
+
RUBYFORGE_PROJECT = 'rwspider'
|
13
|
+
|
14
|
+
if ENV['SNAPSHOT'].to_i == 1
|
15
|
+
PKG_VERSION << "." << Time.now.utc.strftime("%Y%m%d%H%M%S")
|
16
|
+
end
|
17
|
+
|
18
|
+
|
19
|
+
Echoe.new(PKG_NAME, PKG_VERSION) do |p|
|
20
|
+
p.author = "Simone Rinzivillo"
|
21
|
+
p.email = "srinzivillo@gmail.com"
|
22
|
+
p.summary = "RW Spider is an multithreading spider client written in Ruby"
|
23
|
+
p.url = "http://www.rwspider.com"
|
24
|
+
p.project = RUBYFORGE_PROJECT
|
25
|
+
p.description = <<-EOD
|
26
|
+
RW Spider is an multithreading spider client written in Ruby designed to make easy \
|
27
|
+
the development of programs that spider the web.
|
28
|
+
EOD
|
29
|
+
|
30
|
+
p.need_zip = true
|
31
|
+
|
32
|
+
p.development_dependencies += ["rake ~>0.8",
|
33
|
+
"hpricot ~>0.8.2",
|
34
|
+
"robotstxt ~>0.5.2",
|
35
|
+
"echoe ~>3.1"]
|
36
|
+
|
37
|
+
p.rcov_options = ["-Itest -x mocha,rcov,Rakefile"]
|
38
|
+
end
|
39
|
+
|
40
|
+
|
41
|
+
desc "Open an irb session preloaded with this library"
|
42
|
+
task :console do
|
43
|
+
sh "irb -rubygems -I lib -r rwspider.rb"
|
44
|
+
end
|
45
|
+
|
46
|
+
begin
|
47
|
+
require 'code_statistics'
|
48
|
+
desc "Show library's code statistics"
|
49
|
+
task :stats do
|
50
|
+
CodeStatistics.new(["Rwspider", "lib"],
|
51
|
+
["Tests", "test"]).to_s
|
52
|
+
end
|
53
|
+
rescue LoadError
|
54
|
+
puts "CodeStatistics (Rails) is not available"
|
55
|
+
end
|
56
|
+
|
57
|
+
Dir["tasks/**/*.rake"].each do |file|
|
58
|
+
load(file)
|
59
|
+
end
|
@@ -0,0 +1,235 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
|
17
|
+
require 'net/http'
|
18
|
+
require 'openssl'
|
19
|
+
require 'uri'
|
20
|
+
require 'robotstxt'
|
21
|
+
require 'rwspider/document'
|
22
|
+
require 'rwspider/queue'
|
23
|
+
require 'rwspider/version'
|
24
|
+
require 'hpricot'
|
25
|
+
|
26
|
+
|
27
|
+
module Rwspider
|
28
|
+
class Client
|
29
|
+
|
30
|
+
# Hash of options for the spider job
|
31
|
+
attr_accessor :opts
|
32
|
+
|
33
|
+
|
34
|
+
# DEFAULT_OPTIONS properties
|
35
|
+
#
|
36
|
+
# useragent: The User Agent that RW Spider must apply in HTTP requests
|
37
|
+
#
|
38
|
+
# robot_name: The Robot name that RW Spider must apply in HTTP requests
|
39
|
+
#
|
40
|
+
# scan_documents_limit: The limit of the documents that RW Spider can download,
|
41
|
+
# set as <tt>nil</tt> for start the indexing job without restriction on the number of the download
|
42
|
+
#
|
43
|
+
# scan_domain_limit: Set to restrict the indexing job to the current domain name
|
44
|
+
#
|
45
|
+
# scan_images -Set as <tt>true</tt> to enable the download of the image files
|
46
|
+
#
|
47
|
+
# scan_other_files: Set as <tt>true</tt> to enable the download of the other files as javascript and css
|
48
|
+
#
|
49
|
+
# follow_robotstxt_directive: Set as <tt>true</tt> to enable the analysis of the Robots.txt rules to check the accessibility of URLs
|
50
|
+
#
|
51
|
+
# follow_HTTP_redirection: Set as <tt>true</tt> to follow the HTTP redirections
|
52
|
+
#
|
53
|
+
# timeout: The timeout of single URL analysis
|
54
|
+
|
55
|
+
DEFAULT_OPTIONS = {
|
56
|
+
:useragent => 'RW Spider/' + Rwspider::VERSION,
|
57
|
+
:robot_name => 'rwspider',
|
58
|
+
:scan_documents_limit => 100,
|
59
|
+
:scan_domain_limit => nil,
|
60
|
+
:scan_images => false,
|
61
|
+
:scan_other_files => false,
|
62
|
+
:follow_robotstxt_directive => true,
|
63
|
+
:follow_HTTP_redirection => true,
|
64
|
+
:timeout => 5
|
65
|
+
}
|
66
|
+
|
67
|
+
# Inizialize a new Rwspider::Client instance, accept an <tt>Hash</tt> of options.
|
68
|
+
# RW Spider apply the Rwspider::Client::DEFAULT_OPTIONS indexing options if you don't customize them
|
69
|
+
#
|
70
|
+
# opts = {
|
71
|
+
# :useragent => 'My user agent',
|
72
|
+
# :robot_name => 'my_spider_name',
|
73
|
+
# :scan_documents_limit => 100,
|
74
|
+
# :scan_domain_limit => nil,
|
75
|
+
# :scan_images => true,
|
76
|
+
# :scan_other_files => false,
|
77
|
+
# :follow_robotstxt_directive => true,
|
78
|
+
# :follow_HTTP_redirection => true,
|
79
|
+
# :timeout => 5
|
80
|
+
# }
|
81
|
+
# spider = Rwspider::Client::new(opts)
|
82
|
+
#
|
83
|
+
# Builds a new client. +options+ is merged over DEFAULT_OPTIONS, so any
# key you omit keeps its default value.
#
#   spider = Rwspider::Client.new(:scan_images => true)
def initialize(options = {})
  load_options(options)

  @robotstxt_cache   = {}   # per-host cache of Robotstxt::Parser instances
  @main_hostname     = ''   # host of the first crawled URL; set lazily in #start
  @scanned_documents = 0    # number of documents processed so far
  @queue             = Rwspider::Queue.new
end
|
92
|
+
|
93
|
+
# Start the crawling from the <tt>URL</tt>.
|
94
|
+
#
|
95
|
+
# Rwspider::Client::start yield an instance of Rwspider::Document Class for each page downloaded.
|
96
|
+
# At the end of execution returns an <tt>Array</tt> of Rwspider::Document instances.
|
97
|
+
#
|
98
|
+
# Rwspider::Client::start('http://www.rwspider.com') {do |d|
|
99
|
+
# puts 'Current URL ' + d.url.normalize.to_s
|
100
|
+
# }
|
101
|
+
#
|
102
|
+
# arr = Rwspider::Client::start('http://www.rwspider.com')
|
103
|
+
# arr.each{do |d|
|
104
|
+
# puts 'Current URL ' + d.url.normalize.to_s
|
105
|
+
# }
|
106
|
+
#
|
107
|
+
# Crawls the web starting from +start_url+.
#
# Yields a Rwspider::Document for every URL processed (successful
# downloads and failures alike) and finally returns the queue, an
# Array-like collection of every Rwspider::Document encountered.
#
#   spider.start('http://www.rwspider.com') do |d|
#     puts 'Current URL ' + d.url.normalize.to_s
#   end
def start(start_url)
  @queue << Rwspider::Document.new(start_url)

  @queue.each do |link|
    @main_url = link.url
    next unless @opts[:scan_documents_limit].nil? || @scanned_documents < @opts[:scan_documents_limit]

    set_as_visited(link)
    @main_hostname = link.url.host.downcase if @main_hostname.empty?

    # One worker thread per URL, joined immediately below, so URLs are
    # in fact processed sequentially; the thread mainly isolates the
    # timeout/rescue handling.
    worker = Thread.new(link) do |doc|
      begin
        Timeout::timeout(@opts[:timeout]) do
          started_at = Time.now
          response = get_uri(doc.url)
          doc.download_time = Time.now - started_at
          doc.as_downloaded = true
          doc.http_response = response

          yield doc if block_given?

          case response
          when Net::HTTPSuccess
            # Only HTML pages within the (optional) domain limit are
            # mined for further URLs.
            if response.content_type == 'text/html' && (@opts[:scan_domain_limit].nil? || doc.url.host.downcase.match(@opts[:scan_domain_limit]))
              doc.get_links
              doc.get_images if @opts[:scan_images]
              doc.get_other_files if @opts[:scan_other_files]

              doc.documents.each { |found| add_to_queue(found) }
            end
          when Net::HTTPRedirection
            add_to_queue(Document.new(doc.normalize_url(Document.new(response['location']).url))) if @opts[:follow_HTTP_redirection]
          end
        end
      rescue StandardError => e
        # Download failed (timeout, DNS, connection error, ...): report
        # the document anyway, flagged as not downloaded.
        doc.as_downloaded = false
        yield doc if block_given?
      end
    end
    worker.join
  end

  return @queue
end
|
160
|
+
|
161
|
+
|
162
|
+
|
163
|
+
private
|
164
|
+
|
165
|
+
# Queues +document+ for crawling unless robots.txt forbids it or an
# equivalent (normalized) URL is already queued. In either case the
# page currently being crawled (@main_url) is recorded among the
# document's inbound links.
def add_to_queue(document)
  return unless follow?(document)

  source = @main_url.normalize.to_s

  # If the same normalized URL is already queued, record the inbound
  # link on the existing entry instead of queueing a duplicate.
  @queue.each do |queued|
    if queued.url.normalize == document.url.normalize
      document.as_visited = true
      queued.inbound_links << source unless queued.inbound_links.include?(source)
      break
    end
  end

  document.inbound_links << source
  @queue << document unless document.as_visited
end
|
180
|
+
|
181
|
+
# Merges the user-supplied options over DEFAULT_OPTIONS and stores the
# result in @opts; keys absent from +opts+ keep their defaults.
def load_options(opts)
  @opts = DEFAULT_OPTIONS.merge(opts)
end
|
184
|
+
|
185
|
+
# Performs an HTTP(S) GET request for +url+ (a URI) and returns the
# raw Net::HTTPResponse, or nil when the request keeps failing.
#
# SSL certificate verification is deliberately disabled for https
# targets. The request is retried exactly once on common transient
# network errors; @ehttp tracks whether a retry is still available.
def get_uri(url)
  @ehttp = true
  begin
    http = Net::HTTP.new(url.host, url.port)
    if url.scheme == 'https'
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      http.use_ssl = true
    end

    r = http.request(Net::HTTP::Get.new(url.request_uri, {'User-Agent' => @opts[:useragent]}))
    return r

  rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
    if @ehttp
      # BUG FIX: original read "@ettp = false" (typo), so @ehttp stayed
      # true and a persistently failing host caused an endless retry
      # loop. Clear the correct flag so we retry at most once.
      @ehttp = false
      retry
    end
  end
end
|
205
|
+
|
206
|
+
|
207
|
+
|
208
|
+
# Marks +var+ (a Rwspider::Document) as visited and bumps the running
# count of documents processed so far.
def set_as_visited(var)
  @scanned_documents += 1
  var.as_visited = true
end
|
212
|
+
|
213
|
+
|
214
|
+
|
215
|
+
# Returns true when +document+ may be fetched. With the
# :follow_robotstxt_directive option enabled, the host's robots.txt is
# fetched once per host (memoized in @robotstxt_cache) and consulted;
# with the option disabled every URL is allowed.
def follow?(document)
  return true unless @opts[:follow_robotstxt_directive]

  host = document.url.host
  if @robotstxt_cache.include?(host)
    robots = @robotstxt_cache[host]
  else
    robots = Robotstxt::Parser.new(@opts[:robot_name])
    robots.get(document.url.scheme + '://' + host)
    @robotstxt_cache[host] = robots
  end

  robots.allowed?(document.url.normalize.to_s)
end
|
230
|
+
|
231
|
+
|
232
|
+
|
233
|
+
end
|
234
|
+
|
235
|
+
end
|
@@ -0,0 +1,183 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
require 'uri/generic'
|
17
|
+
require 'hpricot'
|
18
|
+
|
19
|
+
module Rwspider
|
20
|
+
class Document
|
21
|
+
include URI
|
22
|
+
|
23
|
+
# instance of <tt>URI</tt>
|
24
|
+
attr_accessor :url
|
25
|
+
|
26
|
+
# Returns <tt>true</tt> if the Rwspider::Document::url was requested
|
27
|
+
attr_accessor :as_visited
|
28
|
+
|
29
|
+
# An instance of Net::HTTPResponse that contains the response returned from the web server
|
30
|
+
attr_accessor :http_response
|
31
|
+
|
32
|
+
# An <tt>Array</tt> of Rwspider::Document found in the HTML code of the current Rwspider::Document
|
33
|
+
attr_accessor :documents
|
34
|
+
|
35
|
+
# The time spent to download the Rwspider::Document
|
36
|
+
attr_accessor :download_time
|
37
|
+
|
38
|
+
# Returns <tt>true</tt> if the Rwspider::Document::url was downloaded correctly
|
39
|
+
attr_accessor :as_downloaded
|
40
|
+
|
41
|
+
# An <tt>Array</tt> of <tt>String</tt> that contains the URLs of the documents in which a link to the current Rwspider::Document was found
|
42
|
+
attr_reader :inbound_links
|
43
|
+
|
44
|
+
|
45
|
+
# Inizialize a new Rwspider::Document instance with the <tt>url</tt>
|
46
|
+
#
|
47
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
48
|
+
#
|
49
|
+
# Builds a document for +url+. The URL string is parsed immediately
# into @url; @tag_type lists the [element, attribute] pairs scanned by
# get_links / get_images / get_other_files respectively.
#
#   doc = Rwspider::Document.new('http://www.rwspider.com')
def initialize(url)
  parse(url)

  # [tag name, attribute holding the referenced URL]
  @tag_type = [%w[a href], %w[img src], %w[link href]]

  @inbound_links = []
  @documents     = []
  @as_visited    = false
end
|
59
|
+
|
60
|
+
# Rwspider::Document::parse load or replace the Rwspider::Document.url with the new <tt>url</tt>
|
61
|
+
#
|
62
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
63
|
+
# doc.parse('http://www.rwspider.com/sitemap.html')
|
64
|
+
#
|
65
|
+
# Parses +url+ (after converting any backslashes to forward slashes)
# and stores the resulting URI in @url. Returns the URI, or nil when
# the string cannot be parsed; @url keeps its previous value in that
# case.
#
#   doc = Rwspider::Document.new('http://www.rwspider.com')
#   doc.parse('http://www.rwspider.com/sitemap.html')
def parse(url)
  begin
    @url = URI.parse(url.gsub(/\\/, '/'))
  rescue StandardError => e
    # BUG FIX: originally rescued Exception, which also swallows
    # SignalException, SystemExit, NoMemoryError etc. URI parse
    # failures (URI::InvalidURIError) are StandardErrors.
    nil
  end
end
|
72
|
+
|
73
|
+
# Analyze the HTML code of the current Rwspider::Document to extract the links at other documents.
|
74
|
+
#
|
75
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
76
|
+
# http = Net::HTTP.new(doc.url.host, doc.url.port)
|
77
|
+
# doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
|
78
|
+
# arr = doc.get_links
|
79
|
+
#
|
80
|
+
# This method returns an <tt>Array</tt> of instances of Rwspider::Document
|
81
|
+
# and append the Array at the documents attribute.
|
82
|
+
# Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
|
83
|
+
# was different from 'text/html'.
|
84
|
+
#
|
85
|
+
# Extracts anchor targets ("a" elements' href attributes) from the
# downloaded HTML, appending them as Rwspider::Document instances to
# #documents. Returns the newly found documents, or nil when
# #http_response is missing or not text/html.
def get_links
  get_document(@tag_type[0])
end
|
88
|
+
|
89
|
+
# Analyze the HTML code of the current Rwspider::Document to extract the links at images.
|
90
|
+
#
|
91
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
92
|
+
# http = Net::HTTP.new(doc.url.host, doc.url.port)
|
93
|
+
# doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
|
94
|
+
# arr = doc.get_images
|
95
|
+
#
|
96
|
+
# This method returns an <tt>Array</tt> of instances of Rwspider::Document
|
97
|
+
# and append the Array at the documents attribute.
|
98
|
+
# Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
|
99
|
+
# was different from 'text/html'.
|
100
|
+
#
|
101
|
+
# Extracts image references ("img" elements' src attributes) from the
# downloaded HTML, appending them as Rwspider::Document instances to
# #documents. Returns the newly found documents, or nil when
# #http_response is missing or not text/html.
def get_images
  get_document(@tag_type[1])
end
|
104
|
+
|
105
|
+
# Analyze the HTML code of the current Rwspider::Document to extract the links at other files
|
106
|
+
# as javascript and css.
|
107
|
+
#
|
108
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
109
|
+
# http = Net::HTTP.new(doc.url.host, doc.url.port)
|
110
|
+
# doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
|
111
|
+
# arr = doc.get_other_files
|
112
|
+
#
|
113
|
+
# This method returns an <tt>Array</tt> of instances of Rwspider::Document
|
114
|
+
# and append the Array at the documents attribute.
|
115
|
+
# Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
|
116
|
+
# was different from 'text/html'.
|
117
|
+
#
|
118
|
+
# Extracts linked resources such as stylesheets ("link" elements'
# href attributes) from the downloaded HTML, appending them as
# Rwspider::Document instances to #documents. Returns the newly found
# documents, or nil when #http_response is missing or not text/html.
def get_other_files
  get_document(@tag_type[2])
end
|
121
|
+
|
122
|
+
# Normalize the url if the path is relative and returns an <tt>String</tt> with the absolute version.
|
123
|
+
#
|
124
|
+
# doc = Rwspider::Document::new('http://www.rwspider.com')
|
125
|
+
# doc.normalize_url(URI.parse('/sitemap.html'))
|
126
|
+
#
|
127
|
+
# Resolves +var+ (a URI) against this document's URL and returns the
# absolute URL as a String, or nil for non-HTTP schemes such as
# "mailto:" and "javascript:".
#
#   doc = Rwspider::Document.new('http://www.rwspider.com')
#   doc.normalize_url(URI.parse('/sitemap.html'))
#   # => "http://www.rwspider.com/sitemap.html"
def normalize_url(var)
  querystring = (!var.query.nil?) ? '?' + var.query : ''
  if var.scheme.nil? || (var.scheme.downcase != "mailto" && var.scheme != "javascript")
    if var.relative?
      path = var.path
      # BUG FIX: the original tested "if url.path.nil?" and then called
      # url.path.slice inside that branch, raising NoMethodError on a
      # nil path (and TypeError on a slash-less path). Use the
      # directory part of the current URL's path when one exists,
      # otherwise fall back to the site root.
      if !url.path.nil? && url.path.include?('/')
        main_path = url.path.slice(0..url.path.rindex('/'))
      else
        main_path = '/'
      end

      if path.match('^\/')
        path = url.scheme + '://' + url.host + path + querystring
      else
        path = url.scheme + '://' + url.host + main_path + path + querystring
      end
    else
      path = var.scheme + '://' + var.host + var.path + querystring
    end
  end

  # nil when the scheme branch above was skipped (mailto/javascript).
  return path
end
|
150
|
+
|
151
|
+
private
|
152
|
+
|
153
|
+
# Scans the downloaded HTML for elements matching +tag+ (an
# [element, attribute] pair), resolves each referenced URL against the
# current document, appends the resulting Rwspider::Document instances
# to @documents and returns them. Returns nil unless #http_response
# holds a text/html body.
def get_document(tag)
  return unless !@http_response.nil? && @http_response.content_type == 'text/html'
  sourcecode = Hpricot(@http_response.body)
  lnks = sourcecode.search("//" + tag[0])
  docs = []
  lnks.each { |link|

    url = link.attributes[tag[1]]
    # BUG FIX: the original called .strip unconditionally, raising
    # NoMethodError for matched elements lacking the attribute
    # (e.g. <a name="..."> with no href). Guard nil before stripping.
    doc = Document.new(url.strip) if !url.nil?

    if !doc.nil? && !doc.url.nil?
      path = normalize_url(doc.url)

      # path is nil for mailto:/javascript: targets; skip those.
      if !path.nil?
        doc.parse path
        docs << doc
      end
    end
  }
  @documents = @documents + docs
  docs
end
|
178
|
+
|
179
|
+
|
180
|
+
|
181
|
+
|
182
|
+
end
|
183
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
module Rwspider

  # Collection of Rwspider::Document instances awaiting download.
  # Inherits all of Array's behaviour; the explicit zero-argument
  # initialize preserves the original interface (Queue.new takes no
  # arguments and builds an empty queue).
  class Queue < Array

    def initialize
    end

  end

end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
module Rwspider

  # Gem version, assembled from individual numeric components.
  module Version
    MAJOR = 0
    MINOR = 4
    TINY  = 2

    # Optional pre-release tag; nil for stable releases and dropped
    # from the version string by the #compact below.
    ALPHA = nil

    STRING = [MAJOR, MINOR, TINY, ALPHA].compact.join('.')
  end

  # Canonical dotted version string, e.g. "0.4.2".
  VERSION = Version::STRING

end
|
data/lib/rwspider.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
#
|
2
|
+
# = Ruby RW Spider
|
3
|
+
#
|
4
|
+
# RW Spider is an multithreading spider client written in Ruby.
|
5
|
+
#
|
6
|
+
#
|
7
|
+
# Category:: Net
|
8
|
+
# Package:: RWSpider
|
9
|
+
# Author:: Simone Rinzivillo <srinzivillo@gmail.com>
|
10
|
+
# License:: MIT License
|
11
|
+
#
|
12
|
+
#--
|
13
|
+
#
|
14
|
+
#++
|
15
|
+
|
16
|
+
|
17
|
+
require 'rwspider/client'
|
18
|
+
require 'uri'
|
19
|
+
|
20
|
+
|
21
|
+
|
22
|
+
module Rwspider
|
23
|
+
|
24
|
+
NAME = 'Rwspider'
|
25
|
+
GEM = 'rwspider'
|
26
|
+
AUTHORS = ['Simone Rinzivillo <srinzivillo@gmail.com>']
|
27
|
+
|
28
|
+
# Start the crawling from the <tt>URL</tt> with the personalized <tt>options</tt>.
|
29
|
+
# RW Spider apply the Rwspider::Client::DEFAULT_OPTIONS indexing options if you don't customize them
|
30
|
+
# Rwspider::start yield an instance of Rwspider::Document Class for each page downloaded.
|
31
|
+
#
|
32
|
+
# opts = {
|
33
|
+
# :useragent => 'My user agent',
|
34
|
+
# :robot_name => 'my_spider_name',
|
35
|
+
# :scan_documents_limit => 100,
|
36
|
+
# :scan_domain_limit => nil,
|
37
|
+
# :scan_images => true,
|
38
|
+
# :scan_other_files => false,
|
39
|
+
# :follow_robotstxt_directive => true,
|
40
|
+
# :follow_HTTP_redirection => true,
|
41
|
+
# :timeout => 5
|
42
|
+
# }
|
43
|
+
# Rwspider.start('http://www.rwspider.com', opts) {do |d|
|
44
|
+
# puts 'Current URL ' + d.url.normalize.to_s
|
45
|
+
# }
|
46
|
+
#
|
47
|
+
|
48
|
+
# Convenience entry point: builds a Rwspider::Client configured with
# +options+ (defaults from Rwspider::Client::DEFAULT_OPTIONS) and
# crawls starting from +url+, yielding each downloaded
# Rwspider::Document to the optional block.
def self.start(url, options = {})
  @client = Rwspider::Client.new(options)
  @client.start(url) { |doc| yield doc if block_given? }
end
|
56
|
+
|
57
|
+
end
|
data/rwspider.gemspec
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.name = %q{rwspider}
|
5
|
+
s.version = "0.4.2"
|
6
|
+
|
7
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
|
8
|
+
s.authors = ["Simone Rinzivillo"]
|
9
|
+
s.date = %q{2010-02-06}
|
10
|
+
s.description = %q{ RW Spider is an multithreading spider client written in Ruby designed to make easy the development of programs that spider the web.
|
11
|
+
}
|
12
|
+
s.email = %q{srinzivillo@gmail.com}
|
13
|
+
s.extra_rdoc_files = ["LICENSE.rdoc", "README.rdoc", "lib/rwspider.rb", "lib/rwspider/client.rb", "lib/rwspider/document.rb", "lib/rwspider/queue.rb", "lib/rwspider/version.rb"]
|
14
|
+
s.files = ["Changelog.rdoc", "LICENSE.rdoc", "README.rdoc", "Rakefile", "lib/rwspider.rb", "lib/rwspider/client.rb", "lib/rwspider/document.rb", "lib/rwspider/queue.rb", "lib/rwspider/version.rb", "test/client_test.rb", "test/document_test.rb", "test/rwspider_test.rb", "Manifest", "rwspider.gemspec"]
|
15
|
+
s.homepage = %q{http://www.rwspider.com}
|
16
|
+
s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Rwspider", "--main", "README.rdoc"]
|
17
|
+
s.require_paths = ["lib"]
|
18
|
+
s.rubyforge_project = %q{rwspider}
|
19
|
+
s.rubygems_version = %q{1.3.5}
|
20
|
+
s.summary = %q{RW Spider is an multithreading spider client written in Ruby}
|
21
|
+
s.test_files = ["test/client_test.rb", "test/document_test.rb", "test/rwspider_test.rb"]
|
22
|
+
|
23
|
+
if s.respond_to? :specification_version then
|
24
|
+
current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
|
25
|
+
s.specification_version = 3
|
26
|
+
|
27
|
+
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
28
|
+
s.add_development_dependency(%q<rake>, ["~> 0.8"])
|
29
|
+
s.add_development_dependency(%q<hpricot>, ["~> 0.8.2"])
|
30
|
+
s.add_development_dependency(%q<robotstxt>, ["~> 0.5.2"])
|
31
|
+
s.add_development_dependency(%q<echoe>, ["~> 3.1"])
|
32
|
+
else
|
33
|
+
s.add_dependency(%q<rake>, ["~> 0.8"])
|
34
|
+
s.add_dependency(%q<hpricot>, ["~> 0.8.2"])
|
35
|
+
s.add_dependency(%q<robotstxt>, ["~> 0.5.2"])
|
36
|
+
s.add_dependency(%q<echoe>, ["~> 3.1"])
|
37
|
+
end
|
38
|
+
else
|
39
|
+
s.add_dependency(%q<rake>, ["~> 0.8"])
|
40
|
+
s.add_dependency(%q<hpricot>, ["~> 0.8.2"])
|
41
|
+
s.add_dependency(%q<robotstxt>, ["~> 0.5.2"])
|
42
|
+
s.add_dependency(%q<echoe>, ["~> 3.1"])
|
43
|
+
end
|
44
|
+
end
|
data/test/client_test.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'rwspider'
|
5
|
+
|
6
|
+
class TestClient < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
opts = {
|
10
|
+
:useragent => 'My user agent',
|
11
|
+
:robot_name => 'my_spider_name',
|
12
|
+
:scan_documents_limit => 10,
|
13
|
+
:scan_domain_limit => nil,
|
14
|
+
:scan_images => true,
|
15
|
+
:scan_other_files => false,
|
16
|
+
:follow_robotstxt_directive => true,
|
17
|
+
:follow_HTTP_redirection => true,
|
18
|
+
:timeout => 5
|
19
|
+
}
|
20
|
+
@client = Rwspider::Client.new(opts)
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_initialize
|
24
|
+
client = Rwspider::Client.new
|
25
|
+
assert_instance_of Rwspider::Client, client
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_start
|
29
|
+
r = @client.start('http://www.rwspider.com')
|
30
|
+
assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
|
31
|
+
assert_instance_of Rwspider::Queue, r
|
32
|
+
end
|
33
|
+
|
34
|
+
end
|
@@ -0,0 +1,55 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'rwspider'
|
5
|
+
|
6
|
+
class TestDocument < Test::Unit::TestCase
|
7
|
+
|
8
|
+
def setup
|
9
|
+
opts = {
|
10
|
+
:scan_documents_limit => 1,
|
11
|
+
:scan_domain_limit => 'www.rwspider.com',
|
12
|
+
:timeout => 10
|
13
|
+
}
|
14
|
+
client = Rwspider::Client.new(opts)
|
15
|
+
@doc = client.start('http://www.rwspider.com')[0]
|
16
|
+
end
|
17
|
+
|
18
|
+
def test_initialize
|
19
|
+
d = Rwspider::Document.new('http://www.rwspider.com')
|
20
|
+
assert_instance_of Rwspider::Document, d
|
21
|
+
end
|
22
|
+
|
23
|
+
def test_normalize_relative_url
|
24
|
+
doc = Rwspider::Document::new('http://www.rwspider.com')
|
25
|
+
assert_equal('http://www.rwspider.com/sitemap.html', doc.normalize_url(URI.parse('/sitemap.html')))
|
26
|
+
end
|
27
|
+
|
28
|
+
def test_normalize_absolute_url
|
29
|
+
doc = Rwspider::Document::new('http://www.rwspider.com')
|
30
|
+
assert_equal('http://www.rwspider.com/sitemap.html', doc.normalize_url(URI.parse('http://www.rwspider.com/sitemap.html')))
|
31
|
+
end
|
32
|
+
|
33
|
+
def test_parse
|
34
|
+
doc = Rwspider::Document::new('http://www.rwspider.com')
|
35
|
+
doc.parse('http://www.rwspider.com/sitemap.html')
|
36
|
+
assert_equal('http://www.rwspider.com/sitemap.html', doc.url.normalize.to_s)
|
37
|
+
end
|
38
|
+
|
39
|
+
def test_get_links
|
40
|
+
arr = @doc.get_links
|
41
|
+
assert_instance_of Array, arr
|
42
|
+
end
|
43
|
+
|
44
|
+
def test_get_images
|
45
|
+
arr = @doc.get_images
|
46
|
+
assert_instance_of Array, arr
|
47
|
+
end
|
48
|
+
|
49
|
+
def test_get_other_files
|
50
|
+
arr = @doc.get_other_files
|
51
|
+
assert_instance_of Array, arr
|
52
|
+
end
|
53
|
+
|
54
|
+
|
55
|
+
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
$:.unshift(File.dirname(__FILE__) + '/../lib')
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'rwspider'
|
5
|
+
|
6
|
+
class TestRwspider < Test::Unit::TestCase
|
7
|
+
|
8
|
+
|
9
|
+
def test_start
|
10
|
+
opts = {
|
11
|
+
:useragent => 'My user agent',
|
12
|
+
:robot_name => 'my_spider_name',
|
13
|
+
:scan_documents_limit => 100,
|
14
|
+
:scan_domain_limit => nil,
|
15
|
+
:scan_images => true,
|
16
|
+
:scan_other_files => false,
|
17
|
+
:follow_robotstxt_directive => true,
|
18
|
+
:follow_HTTP_redirection => true,
|
19
|
+
:timeout => 5
|
20
|
+
}
|
21
|
+
r = Rwspider.start('http://www.rwspider.com', opts)
|
22
|
+
assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
|
23
|
+
end
|
24
|
+
|
25
|
+
def test_start_without_options
|
26
|
+
r = Rwspider.start('http://www.rwspider.com')
|
27
|
+
assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
|
28
|
+
end
|
29
|
+
|
30
|
+
end
|
metadata
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rwspider
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.4.2
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Simone Rinzivillo
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2010-02-06 00:00:00 +01:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: rake
|
17
|
+
type: :development
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ~>
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: "0.8"
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: hpricot
|
27
|
+
type: :development
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.8.2
|
34
|
+
version:
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: robotstxt
|
37
|
+
type: :development
|
38
|
+
version_requirement:
|
39
|
+
version_requirements: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 0.5.2
|
44
|
+
version:
|
45
|
+
- !ruby/object:Gem::Dependency
|
46
|
+
name: echoe
|
47
|
+
type: :development
|
48
|
+
version_requirement:
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ~>
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: "3.1"
|
54
|
+
version:
|
55
|
+
description: " RW Spider is an multithreading spider client written in Ruby designed to make easy \t\tthe development of programs that spider the web.\n"
|
56
|
+
email: srinzivillo@gmail.com
|
57
|
+
executables: []
|
58
|
+
|
59
|
+
extensions: []
|
60
|
+
|
61
|
+
extra_rdoc_files:
|
62
|
+
- LICENSE.rdoc
|
63
|
+
- README.rdoc
|
64
|
+
- lib/rwspider.rb
|
65
|
+
- lib/rwspider/client.rb
|
66
|
+
- lib/rwspider/document.rb
|
67
|
+
- lib/rwspider/queue.rb
|
68
|
+
- lib/rwspider/version.rb
|
69
|
+
files:
|
70
|
+
- Changelog.rdoc
|
71
|
+
- LICENSE.rdoc
|
72
|
+
- README.rdoc
|
73
|
+
- Rakefile
|
74
|
+
- lib/rwspider.rb
|
75
|
+
- lib/rwspider/client.rb
|
76
|
+
- lib/rwspider/document.rb
|
77
|
+
- lib/rwspider/queue.rb
|
78
|
+
- lib/rwspider/version.rb
|
79
|
+
- test/client_test.rb
|
80
|
+
- test/document_test.rb
|
81
|
+
- test/rwspider_test.rb
|
82
|
+
- Manifest
|
83
|
+
- rwspider.gemspec
|
84
|
+
has_rdoc: true
|
85
|
+
homepage: http://www.rwspider.com
|
86
|
+
licenses: []
|
87
|
+
|
88
|
+
post_install_message:
|
89
|
+
rdoc_options:
|
90
|
+
- --line-numbers
|
91
|
+
- --inline-source
|
92
|
+
- --title
|
93
|
+
- Rwspider
|
94
|
+
- --main
|
95
|
+
- README.rdoc
|
96
|
+
require_paths:
|
97
|
+
- lib
|
98
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: "0"
|
103
|
+
version:
|
104
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
105
|
+
requirements:
|
106
|
+
- - ">="
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: "1.2"
|
109
|
+
version:
|
110
|
+
requirements: []
|
111
|
+
|
112
|
+
rubyforge_project: rwspider
|
113
|
+
rubygems_version: 1.3.5
|
114
|
+
signing_key:
|
115
|
+
specification_version: 3
|
116
|
+
summary: RW Spider is an multithreading spider client written in Ruby
|
117
|
+
test_files:
|
118
|
+
- test/client_test.rb
|
119
|
+
- test/document_test.rb
|
120
|
+
- test/rwspider_test.rb
|