rwspider 0.4.2

data/Changelog.rdoc ADDED
@@ -0,0 +1,5 @@
+ = Changelog
+
+ == Release 0.4.1
+
+ * First release
data/LICENSE.rdoc ADDED
@@ -0,0 +1,25 @@
+ = License
+
+ (The MIT License)
+
+ Copyright (c) 2009 Simone Rinzivillo <srinzivillo@gmail.com>
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
data/Manifest ADDED
@@ -0,0 +1,13 @@
+ Changelog.rdoc
+ LICENSE.rdoc
+ README.rdoc
+ Rakefile
+ lib/rwspider.rb
+ lib/rwspider/client.rb
+ lib/rwspider/document.rb
+ lib/rwspider/queue.rb
+ lib/rwspider/version.rb
+ test/client_test.rb
+ test/document_test.rb
+ test/rwspider_test.rb
+ Manifest
data/README.rdoc ADDED
@@ -0,0 +1,55 @@
+ = RW Spider
+
+ RW Spider is a multithreaded spider client written in Ruby.
+ The library is designed to make it easy to develop programs that spider the web.
+
+ RW Spider's design draws on direct experience developing another PHP library, currently used as the engine of a freeware SEO tool.
+
+ == Features
+
+ * Multithreaded spider
+ * Customizable options for the spider job
+ * Robots.txt support
+ * Indexing of web pages and other files (images, CSS, JavaScript, PDF and more)
+ * Following of HTTP redirects
+
+ == Requirements
+
+ * Ruby >= 1.8.7
+ * Hpricot >= 0.8.2
+ * Robotstxt >= 0.5.2
+
+
+
+ == Installation
+
+ This library is intended to be installed via the
+ Gemcutter[http://gemcutter.org] system.
+
+   $ gem install rwspider
+
+ You might need administrator privileges on your system to install it.
+
+
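+ == Usage
+
+ A minimal example (the target URL is illustrative); Rwspider.start yields a
+ Rwspider::Document instance for each downloaded resource:
+
+   require 'rubygems'
+   require 'rwspider'
+
+   Rwspider.start('http://www.example.com') do |doc|
+     puts 'Current URL ' + doc.url.normalize.to_s
+   end
+
+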
+ == Author
+
+ Author:: {Simone Rinzivillo}[http://www.simonerinzivillo.it/] <srinzivillo@gmail.com>
+
+
+ == Resources
+
+ * {Homepage}[http://www.rwspider.com/]
+ * {Author}[http://www.simonerinzivillo.it/]
+ * {GitHub}[http://github.com/rinzi/rwspider/]
+
+
+ == Changelog
+
+ See the Changelog.rdoc file for details.
+
+
+ == License
+
+ Copyright (c) 2009 Simone Rinzivillo. RW Spider is released under the MIT license.
+
data/Rakefile ADDED
@@ -0,0 +1,59 @@
+ $:.unshift(File.dirname(__FILE__) + "/lib")
+
+ require 'rubygems'
+ require 'rake'
+ require 'echoe'
+ require 'rwspider'
+
+
+ # Common package properties
+ PKG_NAME = 'rwspider'
+ PKG_VERSION = Rwspider::VERSION
+ RUBYFORGE_PROJECT = 'rwspider'
+
+ if ENV['SNAPSHOT'].to_i == 1
+   PKG_VERSION << "." << Time.now.utc.strftime("%Y%m%d%H%M%S")
+ end
+
+
+ Echoe.new(PKG_NAME, PKG_VERSION) do |p|
+   p.author = "Simone Rinzivillo"
+   p.email = "srinzivillo@gmail.com"
+   p.summary = "RW Spider is a multithreaded spider client written in Ruby"
+   p.url = "http://www.rwspider.com"
+   p.project = RUBYFORGE_PROJECT
+   p.description = <<-EOD
+     RW Spider is a multithreaded spider client written in Ruby designed to make it easy \
+     to develop programs that spider the web.
+   EOD
+
+   p.need_zip = true
+
+   p.development_dependencies += ["rake ~>0.8",
+                                  "hpricot ~>0.8.2",
+                                  "robotstxt ~>0.5.2",
+                                  "echoe ~>3.1"]
+
+   p.rcov_options = ["-Itest -x mocha,rcov,Rakefile"]
+ end
+
+
+ desc "Open an irb session preloaded with this library"
+ task :console do
+   sh "irb -rubygems -I lib -r rwspider.rb"
+ end
+
+ begin
+   require 'code_statistics'
+   desc "Show library's code statistics"
+   task :stats do
+     CodeStatistics.new(["Rwspider", "lib"],
+                        ["Tests", "test"]).to_s
+   end
+ rescue LoadError
+   puts "CodeStatistics (Rails) is not available"
+ end
+
+ Dir["tasks/**/*.rake"].each do |file|
+   load(file)
+ end
data/lib/rwspider/client.rb ADDED
@@ -0,0 +1,235 @@
+ #
+ # = Ruby RW Spider
+ #
+ # RW Spider is a multithreaded spider client written in Ruby.
+ #
+ #
+ # Category:: Net
+ # Package:: RWSpider
+ # Author:: Simone Rinzivillo <srinzivillo@gmail.com>
+ # License:: MIT License
+ #
+ #--
+ #
+ #++
+
+
+ require 'net/http'
+ require 'openssl'
+ require 'uri'
+ require 'robotstxt'
+ require 'rwspider/document'
+ require 'rwspider/queue'
+ require 'rwspider/version'
+ require 'hpricot'
+
+
+ module Rwspider
+   class Client
+
+     # Hash of options for the spider job
+     attr_accessor :opts
+
+
+     # DEFAULT_OPTIONS properties
+     #
+     # useragent: the User-Agent string that RW Spider sends in HTTP requests
+     #
+     # robot_name: the robot name that RW Spider uses when matching robots.txt rules
+     #
+     # scan_documents_limit: the maximum number of documents that RW Spider may download;
+     # set to <tt>nil</tt> to run the indexing job with no limit on the number of downloads
+     #
+     # scan_domain_limit: set to restrict the indexing job to the given domain name
+     #
+     # scan_images: set to <tt>true</tt> to enable the download of image files
+     #
+     # scan_other_files: set to <tt>true</tt> to enable the download of other files such as JavaScript and CSS
+     #
+     # follow_robotstxt_directive: set to <tt>true</tt> to check each URL against the site's robots.txt rules before downloading it
+     #
+     # follow_HTTP_redirection: set to <tt>true</tt> to follow HTTP redirections
+     #
+     # timeout: the timeout, in seconds, for the analysis of a single URL
+
+     DEFAULT_OPTIONS = {
+       :useragent => 'RW Spider/' + Rwspider::VERSION,
+       :robot_name => 'rwspider',
+       :scan_documents_limit => 100,
+       :scan_domain_limit => nil,
+       :scan_images => false,
+       :scan_other_files => false,
+       :follow_robotstxt_directive => true,
+       :follow_HTTP_redirection => true,
+       :timeout => 5
+     }
+
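+     # Any subset of these options can be overridden at construction time;
+     # keys you leave out keep their defaults, since the supplied Hash is
+     # merged over DEFAULT_OPTIONS. A sketch (the host pattern is illustrative):
+     #
+     #   spider = Rwspider::Client.new(:scan_domain_limit => 'example.com',
+     #                                 :scan_documents_limit => nil)
+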
+     # Initialize a new Rwspider::Client instance; accepts a <tt>Hash</tt> of options.
+     # RW Spider applies the Rwspider::Client::DEFAULT_OPTIONS indexing options for any option you don't customize.
+     #
+     #   opts = {
+     #     :useragent => 'My user agent',
+     #     :robot_name => 'my_spider_name',
+     #     :scan_documents_limit => 100,
+     #     :scan_domain_limit => nil,
+     #     :scan_images => true,
+     #     :scan_other_files => false,
+     #     :follow_robotstxt_directive => true,
+     #     :follow_HTTP_redirection => true,
+     #     :timeout => 5
+     #   }
+     #   spider = Rwspider::Client.new(opts)
+     #
+     def initialize(options = {})
+
+       load_options options
+       @robotstxt_cache = Hash.new
+       @main_hostname = ''
+       @scanned_documents = 0
+       @queue = Rwspider::Queue.new
+
+     end
+
+     # Start the crawling from the given <tt>URL</tt>.
+     #
+     # Rwspider::Client#start yields an instance of the Rwspider::Document class for each page downloaded.
+     # At the end of execution it returns a Rwspider::Queue (an <tt>Array</tt>) of Rwspider::Document instances.
+     #
+     #   spider = Rwspider::Client.new
+     #   spider.start('http://www.rwspider.com') do |d|
+     #     puts 'Current URL ' + d.url.normalize.to_s
+     #   end
+     #
+     #   arr = spider.start('http://www.rwspider.com')
+     #   arr.each do |d|
+     #     puts 'Current URL ' + d.url.normalize.to_s
+     #   end
+     #
+     def start(start_url)
+
+       @queue << Rwspider::Document.new(start_url)
+
+       @queue.each do |link|
+         @main_url = link.url
+         if @opts[:scan_documents_limit].nil? || @scanned_documents < @opts[:scan_documents_limit]
+           set_as_visited link
+           @main_hostname = link.url.host.downcase if @main_hostname.length == 0
+
+           t = Thread.new(link) { |link|
+             begin
+
+               Timeout::timeout(@opts[:timeout]) {
+                 beginning = Time.now
+                 response = get_uri(link.url)
+                 link.download_time = Time.now - beginning
+                 link.as_downloaded = true
+                 link.http_response = response
+
+                 yield link if block_given?
+
+                 case response
+                 when Net::HTTPSuccess then
+
+                   if response.content_type == 'text/html' && (@opts[:scan_domain_limit].nil? || link.url.host.downcase.match(@opts[:scan_domain_limit]))
+
+                     link.get_links
+                     link.get_images if @opts[:scan_images]
+                     link.get_other_files if @opts[:scan_other_files]
+
+                     link.documents.each do |doc|
+                       add_to_queue doc
+                     end
+                   end
+
+                 when Net::HTTPRedirection then
+                   add_to_queue(Document.new(link.normalize_url(Document.new(response['location']).url))) if @opts[:follow_HTTP_redirection]
+
+                 end
+               }
+             rescue StandardError
+               link.as_downloaded = false
+               yield link if block_given?
+             end
+           }
+           t.join
+         end
+       end
+
+       return @queue
+
+     end
+
+
+
+     private
+
+     def add_to_queue(document)
+
+       if follow?(document)
+         # If the URL is already queued, just record the new inbound link.
+         @queue.each do |link|
+           if link.url.normalize == document.url.normalize
+             document.as_visited = true
+             link.inbound_links << @main_url.normalize.to_s if !link.inbound_links.include?(@main_url.normalize.to_s)
+             break
+           end
+         end
+
+         document.inbound_links << @main_url.normalize.to_s
+         @queue << document if !document.as_visited
+       end
+     end
+
+     def load_options(opts)
+       @opts = DEFAULT_OPTIONS.merge opts
+     end
+
+     def get_uri(url)
+       @ehttp = true
+       begin
+         http = Net::HTTP.new(url.host, url.port)
+         if url.scheme == 'https'
+           http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+           http.use_ssl = true
+         end
+
+         r = http.request(Net::HTTP::Get.new(url.request_uri, {'User-Agent' => @opts[:useragent]}))
+         return r
+
+       rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError
+         # Retry the request once on transient network errors.
+         if @ehttp
+           @ehttp = false
+           retry
+         end
+       end
+
+     end
+
+
+
+     def set_as_visited(var)
+       @scanned_documents = @scanned_documents + 1
+       var.as_visited = true
+     end
+
+
+
+     def follow?(document)
+       follow = true
+       if @opts[:follow_robotstxt_directive]
+         # robots.txt is fetched once per host and cached.
+         if @robotstxt_cache.include?(document.url.host)
+           r = @robotstxt_cache[document.url.host]
+         else
+           r = Robotstxt::Parser.new(@opts[:robot_name])
+
+           r.get(document.url.scheme + '://' + document.url.host)
+           @robotstxt_cache[document.url.host] = r
+         end
+         follow = r.allowed?(document.url.normalize.to_s)
+       end
+       follow
+     end
+
+
+
+   end
+
+ end
data/lib/rwspider/document.rb ADDED
@@ -0,0 +1,183 @@
+ #
+ # = Ruby RW Spider
+ #
+ # RW Spider is a multithreaded spider client written in Ruby.
+ #
+ #
+ # Category:: Net
+ # Package:: RWSpider
+ # Author:: Simone Rinzivillo <srinzivillo@gmail.com>
+ # License:: MIT License
+ #
+ #--
+ #
+ #++
+
+ require 'uri/generic'
+ require 'hpricot'
+
+ module Rwspider
+   class Document
+     include URI
+
+     # An instance of <tt>URI</tt>
+     attr_accessor :url
+
+     # Returns <tt>true</tt> if the Rwspider::Document#url was requested
+     attr_accessor :as_visited
+
+     # An instance of Net::HTTPResponse that contains the response returned by the web server
+     attr_accessor :http_response
+
+     # An <tt>Array</tt> of Rwspider::Document instances found in the HTML code of the current Rwspider::Document
+     attr_accessor :documents
+
+     # The time spent downloading the Rwspider::Document
+     attr_accessor :download_time
+
+     # Returns <tt>true</tt> if the Rwspider::Document#url was downloaded correctly
+     attr_accessor :as_downloaded
+
+     # An <tt>Array</tt> of <tt>String</tt> that contains the URLs of the documents where a link to the current Rwspider::Document was found
+     attr_reader :inbound_links
+
+
+     # Initialize a new Rwspider::Document instance with the given <tt>url</tt>
+     #
+     #   doc = Rwspider::Document.new('http://www.rwspider.com')
+     #
+     def initialize(url)
+       parse(url)
+       # Pairs of tag name and URL-bearing attribute to scan for.
+       @tag_type = Array.new
+       @tag_type << ['a', 'href']
+       @tag_type << ['img', 'src']
+       @tag_type << ['link', 'href']
+       @inbound_links = []
+       @documents = []
+       @as_visited = false
+     end
+
+     # Rwspider::Document#parse loads or replaces Rwspider::Document#url with the new <tt>url</tt>
+     #
+     #   doc = Rwspider::Document.new('http://www.rwspider.com')
+     #   doc.parse('http://www.rwspider.com/sitemap.html')
+     #
+     def parse(url)
+       begin
+         @url = URI.parse(url.gsub(/\\/, '/'))
+       rescue StandardError
+         nil
+       end
+     end
+
+     # Analyzes the HTML code of the current Rwspider::Document to extract the links to other documents.
+     #
+     #   doc = Rwspider::Document.new('http://www.rwspider.com')
+     #   http = Net::HTTP.new(doc.url.host, doc.url.port)
+     #   doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
+     #   arr = doc.get_links
+     #
+     # This method returns an <tt>Array</tt> of Rwspider::Document instances
+     # and appends it to the documents attribute.
+     # Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
+     # is different from 'text/html'.
+     #
+     def get_links
+       get_document(@tag_type[0])
+     end
+
+     # Analyzes the HTML code of the current Rwspider::Document to extract the links to images.
+     #
+     #   doc = Rwspider::Document.new('http://www.rwspider.com')
+     #   http = Net::HTTP.new(doc.url.host, doc.url.port)
+     #   doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
+     #   arr = doc.get_images
+     #
+     # This method returns an <tt>Array</tt> of Rwspider::Document instances
+     # and appends it to the documents attribute.
+     # Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
+     # is different from 'text/html'.
+     #
+     def get_images
+       get_document(@tag_type[1])
+     end
+
+     # Analyzes the HTML code of the current Rwspider::Document to extract the links to other files
+     # such as JavaScript and CSS.
+     #
+     #   doc = Rwspider::Document.new('http://www.rwspider.com')
+     #   http = Net::HTTP.new(doc.url.host, doc.url.port)
+     #   doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
+     #   arr = doc.get_other_files
+     #
+     # This method returns an <tt>Array</tt> of Rwspider::Document instances
+     # and appends it to the documents attribute.
+     # Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
+     # is different from 'text/html'.
+     #
+     def get_other_files
+       get_document(@tag_type[2])
+     end
+
+     # Normalizes the URL if the path is relative and returns a <tt>String</tt> with the absolute version.
+     #
+     #   doc = Rwspider::Document.new('http://www.rwspider.com')
+     #   doc.normalize_url(URI.parse('/sitemap.html'))
+     #
+     def normalize_url(var)
+       querystring = (!var.query.nil?) ? '?' + var.query : ''
+       if var.scheme.nil? || (var.scheme.downcase != "mailto" && var.scheme != "javascript")
+         if var.relative?
+           path = var.path
+           # Resolve against the directory of the current document's path.
+           if !url.path.nil? && !url.path.empty?
+             main_path = url.path.slice(0..url.path.rindex('/'))
+           else
+             main_path = '/'
+           end
+
+           if path.match('^\/')
+             path = url.scheme + '://' + url.host + path + querystring
+           else
+             path = url.scheme + '://' + url.host + main_path + path + querystring
+           end
+         else
+           path = var.scheme + '://' + var.host + var.path + querystring
+         end
+       end
+
+       return path
+     end
+
+     private
+
+     def get_document(tag)
+       return unless !@http_response.nil? && @http_response.content_type == 'text/html'
+       sourcecode = Hpricot(@http_response.body)
+       lnks = sourcecode.search("//" + tag[0])
+       docs = []
+       lnks.each { |link|
+
+         url = link.attributes[tag[1]]
+         doc = Document.new(url.strip) if !url.nil?
+
+         if !doc.nil? && !doc.url.nil?
+           path = normalize_url(doc.url)
+
+           if !path.nil?
+             doc.parse path
+             docs << doc
+           end
+
+         end
+
+       }
+       @documents = @documents + docs
+       docs
+     end
+
+   end
+ end
data/lib/rwspider/queue.rb ADDED
@@ -0,0 +1,25 @@
+ #
+ # = Ruby RW Spider
+ #
+ # RW Spider is a multithreaded spider client written in Ruby.
+ #
+ #
+ # Category:: Net
+ # Package:: RWSpider
+ # Author:: Simone Rinzivillo <srinzivillo@gmail.com>
+ # License:: MIT License
+ #
+ #--
+ #
+ #++
+
+ module Rwspider
+
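+   # Queue is a thin Array subclass used as the crawl frontier. Array#each
+   # also yields elements appended while the iteration is running, which is
+   # what lets Rwspider::Client#start push newly discovered documents onto
+   # the queue from inside the crawl loop and still have them visited.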
+   class Queue < Array
+
+     def initialize
+     end
+
+   end
+
+ end
data/lib/rwspider/version.rb ADDED
@@ -0,0 +1,29 @@
+ #
+ # = Ruby RW Spider
+ #
+ # RW Spider is a multithreaded spider client written in Ruby.
+ #
+ #
+ # Category:: Net
+ # Package:: RWSpider
+ # Author:: Simone Rinzivillo <srinzivillo@gmail.com>
+ # License:: MIT License
+ #
+ #--
+ #
+ #++
+
+ module Rwspider
+
+   module Version
+     MAJOR = 0
+     MINOR = 4
+     TINY  = 2
+     ALPHA = nil
+
+     STRING = [MAJOR, MINOR, TINY, ALPHA].compact.join('.')
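+     # With the values above, STRING evaluates to "0.4.2"; a non-nil ALPHA
+     # (for example "a1") would produce "0.4.2.a1".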
+   end
+
+   VERSION = Version::STRING
+
+ end
data/lib/rwspider.rb ADDED
@@ -0,0 +1,57 @@
+ #
+ # = Ruby RW Spider
+ #
+ # RW Spider is a multithreaded spider client written in Ruby.
+ #
+ #
+ # Category:: Net
+ # Package:: RWSpider
+ # Author:: Simone Rinzivillo <srinzivillo@gmail.com>
+ # License:: MIT License
+ #
+ #--
+ #
+ #++
+
+
+ require 'rwspider/client'
+ require 'uri'
+
+
+
+ module Rwspider
+
+   NAME = 'Rwspider'
+   GEM = 'rwspider'
+   AUTHORS = ['Simone Rinzivillo <srinzivillo@gmail.com>']
+
+   # Start the crawling from the given <tt>URL</tt> with the given <tt>options</tt>.
+   # RW Spider applies the Rwspider::Client::DEFAULT_OPTIONS indexing options for any option you don't customize.
+   # Rwspider::start yields an instance of the Rwspider::Document class for each page downloaded.
+   #
+   #   opts = {
+   #     :useragent => 'My user agent',
+   #     :robot_name => 'my_spider_name',
+   #     :scan_documents_limit => 100,
+   #     :scan_domain_limit => nil,
+   #     :scan_images => true,
+   #     :scan_other_files => false,
+   #     :follow_robotstxt_directive => true,
+   #     :follow_HTTP_redirection => true,
+   #     :timeout => 5
+   #   }
+   #   Rwspider.start('http://www.rwspider.com', opts) do |d|
+   #     puts 'Current URL ' + d.url.normalize.to_s
+   #   end
+   #
+   def self.start(url, options = {})
+
+     @client = Rwspider::Client.new(options)
+     @client.start(url) do |doc|
+       yield doc if block_given?
+     end
+
+   end
+
+ end
data/rwspider.gemspec ADDED
@@ -0,0 +1,44 @@
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+   s.name = %q{rwspider}
+   s.version = "0.4.2"
+
+   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
+   s.authors = ["Simone Rinzivillo"]
+   s.date = %q{2010-02-06}
+   s.description = %q{RW Spider is a multithreaded spider client written in Ruby designed to make it easy to develop programs that spider the web.}
+   s.email = %q{srinzivillo@gmail.com}
+   s.extra_rdoc_files = ["LICENSE.rdoc", "README.rdoc", "lib/rwspider.rb", "lib/rwspider/client.rb", "lib/rwspider/document.rb", "lib/rwspider/queue.rb", "lib/rwspider/version.rb"]
+   s.files = ["Changelog.rdoc", "LICENSE.rdoc", "README.rdoc", "Rakefile", "lib/rwspider.rb", "lib/rwspider/client.rb", "lib/rwspider/document.rb", "lib/rwspider/queue.rb", "lib/rwspider/version.rb", "test/client_test.rb", "test/document_test.rb", "test/rwspider_test.rb", "Manifest", "rwspider.gemspec"]
+   s.homepage = %q{http://www.rwspider.com}
+   s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Rwspider", "--main", "README.rdoc"]
+   s.require_paths = ["lib"]
+   s.rubyforge_project = %q{rwspider}
+   s.rubygems_version = %q{1.3.5}
+   s.summary = %q{RW Spider is a multithreaded spider client written in Ruby}
+   s.test_files = ["test/client_test.rb", "test/document_test.rb", "test/rwspider_test.rb"]
+
+   if s.respond_to? :specification_version then
+     current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+     s.specification_version = 3
+
+     if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+       s.add_development_dependency(%q<rake>, ["~> 0.8"])
+       s.add_development_dependency(%q<hpricot>, ["~> 0.8.2"])
+       s.add_development_dependency(%q<robotstxt>, ["~> 0.5.2"])
+       s.add_development_dependency(%q<echoe>, ["~> 3.1"])
+     else
+       s.add_dependency(%q<rake>, ["~> 0.8"])
+       s.add_dependency(%q<hpricot>, ["~> 0.8.2"])
+       s.add_dependency(%q<robotstxt>, ["~> 0.5.2"])
+       s.add_dependency(%q<echoe>, ["~> 3.1"])
+     end
+   else
+     s.add_dependency(%q<rake>, ["~> 0.8"])
+     s.add_dependency(%q<hpricot>, ["~> 0.8.2"])
+     s.add_dependency(%q<robotstxt>, ["~> 0.5.2"])
+     s.add_dependency(%q<echoe>, ["~> 3.1"])
+   end
+ end
data/test/client_test.rb ADDED
@@ -0,0 +1,34 @@
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
+
+ require 'test/unit'
+ require 'rwspider'
+
+ class TestClient < Test::Unit::TestCase
+
+   def setup
+     opts = {
+       :useragent => 'My user agent',
+       :robot_name => 'my_spider_name',
+       :scan_documents_limit => 10,
+       :scan_domain_limit => nil,
+       :scan_images => true,
+       :scan_other_files => false,
+       :follow_robotstxt_directive => true,
+       :follow_HTTP_redirection => true,
+       :timeout => 5
+     }
+     @client = Rwspider::Client.new(opts)
+   end
+
+   def test_initialize
+     client = Rwspider::Client.new
+     assert_instance_of Rwspider::Client, client
+   end
+
+   def test_start
+     r = @client.start('http://www.rwspider.com')
+     assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
+     assert_instance_of Rwspider::Queue, r
+   end
+
+ end
data/test/document_test.rb ADDED
@@ -0,0 +1,55 @@
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
+
+ require 'test/unit'
+ require 'rwspider'
+
+ class TestDocument < Test::Unit::TestCase
+
+   def setup
+     opts = {
+       :scan_documents_limit => 1,
+       :scan_domain_limit => 'www.rwspider.com',
+       :timeout => 10
+     }
+     client = Rwspider::Client.new(opts)
+     @doc = client.start('http://www.rwspider.com')[0]
+   end
+
+   def test_initialize
+     d = Rwspider::Document.new('http://www.rwspider.com')
+     assert_instance_of Rwspider::Document, d
+   end
+
+   def test_normalize_relative_url
+     doc = Rwspider::Document.new('http://www.rwspider.com')
+     assert_equal('http://www.rwspider.com/sitemap.html', doc.normalize_url(URI.parse('/sitemap.html')))
+   end
+
+   def test_normalize_absolute_url
+     doc = Rwspider::Document.new('http://www.rwspider.com')
+     assert_equal('http://www.rwspider.com/sitemap.html', doc.normalize_url(URI.parse('http://www.rwspider.com/sitemap.html')))
+   end
+
+   def test_parse
+     doc = Rwspider::Document.new('http://www.rwspider.com')
+     doc.parse('http://www.rwspider.com/sitemap.html')
+     assert_equal('http://www.rwspider.com/sitemap.html', doc.url.normalize.to_s)
+   end
+
+   def test_get_links
+     arr = @doc.get_links
+     assert_instance_of Array, arr
+   end
+
+   def test_get_images
+     arr = @doc.get_images
+     assert_instance_of Array, arr
+   end
+
+   def test_get_other_files
+     arr = @doc.get_other_files
+     assert_instance_of Array, arr
+   end
+
+ end
data/test/rwspider_test.rb ADDED
@@ -0,0 +1,30 @@
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
+
+ require 'test/unit'
+ require 'rwspider'
+
+ class TestRwspider < Test::Unit::TestCase
+
+   def test_start
+     opts = {
+       :useragent => 'My user agent',
+       :robot_name => 'my_spider_name',
+       :scan_documents_limit => 100,
+       :scan_domain_limit => nil,
+       :scan_images => true,
+       :scan_other_files => false,
+       :follow_robotstxt_directive => true,
+       :follow_HTTP_redirection => true,
+       :timeout => 5
+     }
+     r = Rwspider.start('http://www.rwspider.com', opts)
+     assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
+   end
+
+   def test_start_without_options
+     r = Rwspider.start('http://www.rwspider.com')
+     assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
+   end
+
+ end
metadata ADDED
@@ -0,0 +1,120 @@
+ --- !ruby/object:Gem::Specification
+ name: rwspider
+ version: !ruby/object:Gem::Version
+   version: 0.4.2
+ platform: ruby
+ authors:
+ - Simone Rinzivillo
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2010-02-06 00:00:00 +01:00
+ default_executable:
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rake
+   type: :development
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: "0.8"
+     version:
+ - !ruby/object:Gem::Dependency
+   name: hpricot
+   type: :development
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.8.2
+     version:
+ - !ruby/object:Gem::Dependency
+   name: robotstxt
+   type: :development
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: 0.5.2
+     version:
+ - !ruby/object:Gem::Dependency
+   name: echoe
+   type: :development
+   version_requirement:
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ~>
+       - !ruby/object:Gem::Version
+         version: "3.1"
+     version:
+ description: "RW Spider is a multithreaded spider client written in Ruby designed to make it easy to develop programs that spider the web.\n"
+ email: srinzivillo@gmail.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files:
+ - LICENSE.rdoc
+ - README.rdoc
+ - lib/rwspider.rb
+ - lib/rwspider/client.rb
+ - lib/rwspider/document.rb
+ - lib/rwspider/queue.rb
+ - lib/rwspider/version.rb
+ files:
+ - Changelog.rdoc
+ - LICENSE.rdoc
+ - README.rdoc
+ - Rakefile
+ - lib/rwspider.rb
+ - lib/rwspider/client.rb
+ - lib/rwspider/document.rb
+ - lib/rwspider/queue.rb
+ - lib/rwspider/version.rb
+ - test/client_test.rb
+ - test/document_test.rb
+ - test/rwspider_test.rb
+ - Manifest
+ - rwspider.gemspec
+ has_rdoc: true
+ homepage: http://www.rwspider.com
+ licenses: []
+
+ post_install_message:
+ rdoc_options:
+ - --line-numbers
+ - --inline-source
+ - --title
+ - Rwspider
+ - --main
+ - README.rdoc
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "1.2"
+   version:
+ requirements: []
+
+ rubyforge_project: rwspider
+ rubygems_version: 1.3.5
+ signing_key:
+ specification_version: 3
+ summary: RW Spider is a multithreaded spider client written in Ruby
+ test_files:
+ - test/client_test.rb
+ - test/document_test.rb
+ - test/rwspider_test.rb