rwspider 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Changelog.rdoc ADDED
@@ -0,0 +1,5 @@
1
+ = Changelog
2
+
3
+ == Release 0.4.1
4
+
5
+ * First release
data/LICENSE.rdoc ADDED
@@ -0,0 +1,25 @@
1
+ = License
2
+
3
+ (The MIT License)
4
+
5
+ Copyright (c) 2009 Simone Rinzivillo <srinzivillo@gmail.com>
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining
8
+ a copy of this software and associated documentation files (the
9
+ "Software"), to deal in the Software without restriction, including
10
+ without limitation the rights to use, copy, modify, merge, publish,
11
+ distribute, sublicense, and/or sell copies of the Software, and to
12
+ permit persons to whom the Software is furnished to do so, subject to
13
+ the following conditions:
14
+
15
+ The above copyright notice and this permission notice shall be
16
+ included in all copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
21
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
22
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
+
data/Manifest ADDED
@@ -0,0 +1,13 @@
1
+ Changelog.rdoc
2
+ LICENSE.rdoc
3
+ README.rdoc
4
+ Rakefile
5
+ lib/rwspider.rb
6
+ lib/rwspider/client.rb
7
+ lib/rwspider/document.rb
8
+ lib/rwspider/queue.rb
9
+ lib/rwspider/version.rb
10
+ test/client_test.rb
11
+ test/document_test.rb
12
+ test/rwspider_test.rb
13
+ Manifest
data/README.rdoc ADDED
@@ -0,0 +1,55 @@
1
+ = RW Spider
2
+
3
+ RW Spider is a multithreading spider client written in Ruby.
4
+ The library was designed to simplify the development of programs that spider the web.
5
+
6
+ RW Spider's design comes from direct experience developing another PHP library that is currently used as the engine for a freeware SEO tool.
7
+
8
+ == Features
9
+
10
+ * Multithreading spider
11
+ * Customizable options for the spider job
12
+ * Robots.txt support
13
+ * Indexing of web pages and others files (images, CSS, JavaScript, PDF and more)
14
+ * Following redirects
15
+
16
+ == Requirements
17
+
18
+ * Ruby >= 1.8.7
19
+ * Hpricot >= 0.8.2
20
+ * Robotstxt >= 0.5.2
21
+
22
+
23
+
24
+ == Installation
25
+
26
+ This library is intended to be installed via the
27
+ Gemcutter[http://gemcutter.org] system.
28
+
29
+ $ gem install rwspider
30
+
31
+ You might need administrator privileges on your system to install it.
32
+
33
+
34
+
35
+ == Author
36
+
37
+ Author:: {Simone Rinzivillo}[http://www.simonerinzivillo.it/] <srinzivillo@gmail.com>
38
+
39
+
40
+ == Resources
41
+
42
+ * {Homepage}[http://www.rwspider.com/]
43
+ * {Author}[http://www.simonerinzivillo.it/]
44
+ * {GitHub}[http://github.com/rinzi/rwspider/]
45
+
46
+
47
+ == Changelog
48
+
49
+ See the Changelog.rdoc file for details.
50
+
51
+
52
+ == License
53
+
54
+ Copyright (c) 2009 Simone Rinzivillo, RW Spider is released under the MIT license.
55
+
data/Rakefile ADDED
@@ -0,0 +1,59 @@
1
+ $:.unshift(File.dirname(__FILE__) + "/lib")
2
+
3
+ require 'rubygems'
4
+ require 'rake'
5
+ require 'echoe'
6
+ require 'rwspider'
7
+
8
+
9
+ # Common package properties
10
+ PKG_NAME = 'rwspider'
11
+ PKG_VERSION = Rwspider::VERSION
12
+ RUBYFORGE_PROJECT = 'rwspider'
13
+
14
+ if ENV['SNAPSHOT'].to_i == 1
15
+ PKG_VERSION << "." << Time.now.utc.strftime("%Y%m%d%H%M%S")
16
+ end
17
+
18
+
19
+ Echoe.new(PKG_NAME, PKG_VERSION) do |p|
20
+ p.author = "Simone Rinzivillo"
21
+ p.email = "srinzivillo@gmail.com"
22
+ p.summary = "RW Spider is an multithreading spider client written in Ruby"
23
+ p.url = "http://www.rwspider.com"
24
+ p.project = RUBYFORGE_PROJECT
25
+ p.description = <<-EOD
26
+ RW Spider is an multithreading spider client written in Ruby designed to make easy \
27
+ the development of programs that spider the web.
28
+ EOD
29
+
30
+ p.need_zip = true
31
+
32
+ p.development_dependencies += ["rake ~>0.8",
33
+ "hpricot ~>0.8.2",
34
+ "robotstxt ~>0.5.2",
35
+ "echoe ~>3.1"]
36
+
37
+ p.rcov_options = ["-Itest -x mocha,rcov,Rakefile"]
38
+ end
39
+
40
+
41
+ desc "Open an irb session preloaded with this library"
42
+ task :console do
43
+ sh "irb -rubygems -I lib -r rwspider.rb"
44
+ end
45
+
46
+ begin
47
+ require 'code_statistics'
48
+ desc "Show library's code statistics"
49
+ task :stats do
50
+ CodeStatistics.new(["Rwspider", "lib"],
51
+ ["Tests", "test"]).to_s
52
+ end
53
+ rescue LoadError
54
+ puts "CodeStatistics (Rails) is not available"
55
+ end
56
+
57
+ Dir["tasks/**/*.rake"].each do |file|
58
+ load(file)
59
+ end
@@ -0,0 +1,235 @@
1
+ #
2
+ # = Ruby RW Spider
3
+ #
4
+ # RW Spider is a multithreading spider client written in Ruby.
5
+ #
6
+ #
7
+ # Category:: Net
8
+ # Package:: RWSpider
9
+ # Author:: Simone Rinzivillo <srinzivillo@gmail.com>
10
+ # License:: MIT License
11
+ #
12
+ #--
13
+ #
14
+ #++
15
+
16
+
17
+ require 'net/http'
18
+ require 'openssl'
19
+ require 'uri'
20
+ require 'robotstxt'
21
+ require 'rwspider/document'
22
+ require 'rwspider/queue'
23
+ require 'rwspider/version'
24
+ require 'hpricot'
25
+
26
+
27
+ module Rwspider
28
+ class Client
29
+
30
+ # Hash of options for the spider job
31
+ attr_accessor :opts
32
+
33
+
34
+ # DEFAULT_OPTIONS properties
35
+ #
36
+ # useragent: The User Agent that RW Spider must apply in HTTP requests
37
+ #
38
+ # robot_name: The Robot name that RW Spider must apply in HTTP requests
39
+ #
40
+ # scan_documents_limit: The limit of the documents that RW Spider can download,
41
+ # set to <tt>nil</tt> to start the indexing job without restricting the number of downloads
42
+ #
43
+ # scan_domain_limit: Set to restrict the indexing job to the current domain name
44
+ #
45
+ # scan_images -Set as <tt>true</tt> to enable the download of the image files
46
+ #
47
+ # scan_other_files: Set as <tt>true</tt> to enable the download of the other files as javascript and css
48
+ #
49
+ # follow_robotstxt_directive: Set as <tt>true</tt> to enable the analysis of the Robots.txt rules to check the accessibility of URLs
50
+ #
51
+ # follow_HTTP_redirection: Set as <tt>true</tt> to follow the HTTP redirections
52
+ #
53
+ # timeout: The timeout of single URL analysis
54
+
55
+ DEFAULT_OPTIONS = {
56
+ :useragent => 'RW Spider/' + Rwspider::VERSION,
57
+ :robot_name => 'rwspider',
58
+ :scan_documents_limit => 100,
59
+ :scan_domain_limit => nil,
60
+ :scan_images => false,
61
+ :scan_other_files => false,
62
+ :follow_robotstxt_directive => true,
63
+ :follow_HTTP_redirection => true,
64
+ :timeout => 5
65
+ }
66
+
67
+ # Inizialize a new Rwspider::Client instance, accept an <tt>Hash</tt> of options.
68
+ # RW Spider apply the Rwspider::Client::DEFAULT_OPTIONS indexing options if you don't customize them
69
+ #
70
+ # opts = {
71
+ # :useragent => 'My user agent',
72
+ # :robot_name => 'my_spider_name',
73
+ # :scan_documents_limit => 100,
74
+ # :scan_domain_limit => nil,
75
+ # :scan_images => true,
76
+ # :scan_other_files => false,
77
+ # :follow_robotstxt_directive => true,
78
+ # :follow_HTTP_redirection => true,
79
+ # :timeout => 5
80
+ # }
81
+ # spider = Rwspider::Client::new(opts)
82
+ #
83
+ def initialize (options = {})
84
+
85
+ load_options options
86
+ @robotstxt_cache = Hash.new()
87
+ @main_hostname = ''
88
+ @scanned_documents = 0
89
+ @queue = Rwspider::Queue.new
90
+
91
+ end
92
+
93
+ # Start the crawling from the <tt>URL</tt>.
94
+ #
95
+ # Rwspider::Client::start yield an instance of Rwspider::Document Class for each page downloaded.
96
+ # At the end of execution returns an <tt>Array</tt> of Rwspider::Document instances.
97
+ #
98
+ # Rwspider::Client::start('http://www.rwspider.com') { |d|
99
+ # puts 'Current URL ' + d.url.normalize.to_s
100
+ # }
101
+ #
102
+ # arr = Rwspider::Client::start('http://www.rwspider.com')
103
+ # arr.each { |d|
104
+ # puts 'Current URL ' + d.url.normalize.to_s
105
+ # }
106
+ #
107
+ def start (start_url)
108
+
109
+ @queue << Rwspider::Document.new(start_url)
110
+
111
+ @queue.each do |link|
112
+ @main_url = link.url
113
+ if @opts[:scan_documents_limit].nil? || @scanned_documents < @opts[:scan_documents_limit]
114
+ set_as_visited link
115
+ @main_hostname = link.url.host.downcase if @main_hostname.length == 0
116
+
117
+ t = Thread.new(link) { |link|
118
+ begin
119
+
120
+ Timeout::timeout(@opts[:timeout]){
121
+ beginning = Time.now
122
+ response = get_uri(link.url)
123
+ link.download_time = Time.now - beginning
124
+ link.as_downloaded = true
125
+ link.http_response = response
126
+
127
+ yield link if block_given?
128
+
129
+ case response
130
+ when Net::HTTPSuccess then
131
+
132
+ if response.content_type == 'text/html' && (@opts[:scan_domain_limit].nil? || link.url.host.downcase.match(@opts[:scan_domain_limit]) )
133
+
134
+ link.get_links
135
+ link.get_images if @opts[:scan_images]
136
+ link.get_other_files if @opts[:scan_other_files]
137
+
138
+ link.documents.each do |doc|
139
+ add_to_queue doc
140
+ end
141
+ end
142
+
143
+ when Net::HTTPRedirection then
144
+ add_to_queue(Document.new(link.normalize_url(Document.new(response['location']).url))) if @opts[:follow_HTTP_redirection]
145
+
146
+ end
147
+ }
148
+ rescue StandardError => e
149
+ link.as_downloaded = false
150
+ yield link if block_given?
151
+ end
152
+ }
153
+ t.join
154
+ end
155
+ end
156
+
157
+ return @queue
158
+
159
+ end
160
+
161
+
162
+
163
+ private
164
+
165
+ def add_to_queue (document)
166
+
167
+ if follow?(document)
168
+ @queue.each do |link|
169
+ if link.url.normalize == document.url.normalize
170
+ document.as_visited = true
171
+ link.inbound_links << @main_url.normalize.to_s if !link.inbound_links.include?(@main_url.normalize.to_s)
172
+ break
173
+ end
174
+ end
175
+
176
+ document.inbound_links << @main_url.normalize.to_s
177
+ @queue << document if !document.as_visited
178
+ end
179
+ end
180
+
181
+ def load_options(opts)
182
+ @opts = DEFAULT_OPTIONS.merge opts
183
+ end
184
+
185
+ def get_uri(url)
186
+ @ehttp = true
187
+ begin
188
+ http = Net::HTTP.new(url.host, url.port)
189
+ if url.scheme == 'https'
190
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
191
+ http.use_ssl = true
192
+ end
193
+
194
+ r = http.request(Net::HTTP::Get.new(url.request_uri, {'User-Agent' => @opts[:useragent]}))
195
+ return r
196
+
197
+ rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
198
+ if @ehttp
199
+ @ettp = false
200
+ retry
201
+ end
202
+ end
203
+
204
+ end
205
+
206
+
207
+
208
+ def set_as_visited(var)
209
+ @scanned_documents = @scanned_documents + 1
210
+ var.as_visited = true
211
+ end
212
+
213
+
214
+
215
+ def follow?(document)
216
+ follow = true
217
+ if @opts[:follow_robotstxt_directive]
218
+ if @robotstxt_cache.include?(document.url.host)
219
+ r = @robotstxt_cache[document.url.host]
220
+ else
221
+ r = Robotstxt::Parser.new(@opts[:robot_name])
222
+
223
+ r.get(document.url.scheme + '://' + document.url.host)
224
+ @robotstxt_cache[document.url.host] = r
225
+ end
226
+ follow = r.allowed?(document.url.normalize.to_s)
227
+ end
228
+ follow
229
+ end
230
+
231
+
232
+
233
+ end
234
+
235
+ end
@@ -0,0 +1,183 @@
1
+ #
2
+ # = Ruby RW Spider
3
+ #
4
+ # RW Spider is a multithreading spider client written in Ruby.
5
+ #
6
+ #
7
+ # Category:: Net
8
+ # Package:: RWSpider
9
+ # Author:: Simone Rinzivillo <srinzivillo@gmail.com>
10
+ # License:: MIT License
11
+ #
12
+ #--
13
+ #
14
+ #++
15
+
16
+ require 'uri/generic'
17
+ require 'hpricot'
18
+
19
+ module Rwspider
20
+ class Document
21
+ include URI
22
+
23
+ # instance of <tt>URI</tt>
24
+ attr_accessor :url
25
+
26
+ # Returns <tt>true</tt> if the Rwspider::Document::url was requested
27
+ attr_accessor :as_visited
28
+
29
+ # An instance of Net::HTTPResponse that contains the response returned from the web server
30
+ attr_accessor :http_response
31
+
32
+ # An <tt>Array</tt> of Rwspider::Document found in the HTML code of the current Rwspider::Document
33
+ attr_accessor :documents
34
+
35
+ # The time spent to download the Rwspider::Document
36
+ attr_accessor :download_time
37
+
38
+ # Returns <tt>true</tt> if the Rwspider::Document::url was downloaded correctly
39
+ attr_accessor :as_downloaded
40
+
41
+ # An <tt>Array</tt> of <tt>String</tt> that contains the URLs of the documents where a link to the current Rwspider::Document was found
42
+ attr_reader :inbound_links
43
+
44
+
45
+ # Initialize a new Rwspider::Document instance with the <tt>url</tt>
46
+ #
47
+ # doc = Rwspider::Document::new('http://www.rwspider.com')
48
+ #
49
+ def initialize (url)
50
+ parse(url)
51
+ @tag_type = Array.new
52
+ @tag_type << ['a','href']
53
+ @tag_type << ['img','src']
54
+ @tag_type << ['link','href']
55
+ @inbound_links = []
56
+ @documents = []
57
+ @as_visited = false
58
+ end
59
+
60
+ # Rwspider::Document::parse load or replace the Rwspider::Document.url with the new <tt>url</tt>
61
+ #
62
+ # doc = Rwspider::Document::new('http://www.rwspider.com')
63
+ # doc.parse('http://www.rwspider.com/sitemap.html')
64
+ #
65
+ def parse (url)
66
+ begin
67
+ @url = URI.parse(url.gsub(/\\/,'/'))
68
+ rescue Exception => e
69
+ nil
70
+ end
71
+ end
72
+
73
+ # Analyze the HTML code of the current Rwspider::Document to extract the links at other documents.
74
+ #
75
+ # doc = Rwspider::Document::new('http://www.rwspider.com')
76
+ # http = Net::HTTP.new(doc.url.host, doc.url.port)
77
+ # doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
78
+ # arr = doc.get_links
79
+ #
80
+ # This method returns an <tt>Array</tt> of instances of Rwspider::Document
81
+ # and append the Array at the documents attribute.
82
+ # Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
83
+ # was different from 'text/html'.
84
+ #
85
+ def get_links()
86
+ get_document(@tag_type[0])
87
+ end
88
+
89
+ # Analyze the HTML code of the current Rwspider::Document to extract the links at images.
90
+ #
91
+ # doc = Rwspider::Document::new('http://www.rwspider.com')
92
+ # http = Net::HTTP.new(doc.url.host, doc.url.port)
93
+ # doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
94
+ # arr = doc.get_images
95
+ #
96
+ # This method returns an <tt>Array</tt> of instances of Rwspider::Document
97
+ # and append the Array at the documents attribute.
98
+ # Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
99
+ # was different from 'text/html'.
100
+ #
101
+ def get_images()
102
+ get_document(@tag_type[1])
103
+ end
104
+
105
+ # Analyze the HTML code of the current Rwspider::Document to extract the links at other files
106
+ # as javascript and css.
107
+ #
108
+ # doc = Rwspider::Document::new('http://www.rwspider.com')
109
+ # http = Net::HTTP.new(doc.url.host, doc.url.port)
110
+ # doc.http_response = http.request(Net::HTTP::Get.new(doc.url.request_uri))
111
+ # arr = doc.get_other_files
112
+ #
113
+ # This method returns an <tt>Array</tt> of instances of Rwspider::Document
114
+ # and append the Array at the documents attribute.
115
+ # Returns <tt>nil</tt> if the <tt>content-type</tt> returned in the <tt>http_response</tt> attribute
116
+ # was different from 'text/html'.
117
+ #
118
+ def get_other_files()
119
+ get_document(@tag_type[2])
120
+ end
121
+
122
+ # Normalize the url if the path is relative and returns an <tt>String</tt> with the absolute version.
123
+ #
124
+ # doc = Rwspider::Document::new('http://www.rwspider.com')
125
+ # doc.normalize_url(URI.parse('/sitemap.html'))
126
+ #
127
+ def normalize_url(var)
128
+ querystring = (!var.query.nil?) ? '?' + var.query : ''
129
+ if var.scheme.nil? || (var.scheme.downcase != "mailto" && var.scheme != "javascript")
130
+ if var.relative?
131
+ path = var.path
132
+ if url.path.nil?
133
+ main_path = url.path.slice(0..url.path.rindex('/'))
134
+ else
135
+ main_path = '/'
136
+ end
137
+
138
+ if path.match('^\/')
139
+ path = url.scheme + '://' + url.host + path + querystring
140
+ else
141
+ path = url.scheme + '://' + url.host + main_path + path + querystring
142
+ end
143
+ else
144
+ path = var.scheme + '://' + var.host + var.path + querystring
145
+ end
146
+ end
147
+
148
+ return path
149
+ end
150
+
151
+ private
152
+
153
+ def get_document(tag)
154
+ return unless !@http_response.nil? && @http_response.content_type == 'text/html'
155
+ sourcecode = Hpricot(@http_response.body)
156
+ lnks = sourcecode.search("//" + tag[0])
157
+ docs = []
158
+ lnks.each { |link|
159
+
160
+ url = link.attributes[tag[1]].strip
161
+ doc = Document.new(url) if !url.nil?
162
+
163
+ if !doc.nil? && !doc.url.nil?
164
+ path = normalize_url(doc.url)
165
+
166
+ if !path.nil?
167
+ doc.parse path
168
+ docs << doc
169
+
170
+ end
171
+
172
+ end
173
+
174
+ }
175
+ @documents = @documents + docs
176
+ docs
177
+ end
178
+
179
+
180
+
181
+
182
+ end
183
+ end
@@ -0,0 +1,25 @@
1
+ #
2
+ # = Ruby RW Spider
3
+ #
4
+ # RW Spider is a multithreading spider client written in Ruby.
5
+ #
6
+ #
7
+ # Category:: Net
8
+ # Package:: RWSpider
9
+ # Author:: Simone Rinzivillo <srinzivillo@gmail.com>
10
+ # License:: MIT License
11
+ #
12
+ #--
13
+ #
14
+ #++
15
+
16
+ module Rwspider
17
+
18
+ class Queue < Array
19
+
20
+ def initialize ()
21
+ end
22
+
23
+ end
24
+
25
+ end
@@ -0,0 +1,29 @@
1
+ #
2
+ # = Ruby RW Spider
3
+ #
4
+ # RW Spider is a multithreading spider client written in Ruby.
5
+ #
6
+ #
7
+ # Category:: Net
8
+ # Package:: RWSpider
9
+ # Author:: Simone Rinzivillo <srinzivillo@gmail.com>
10
+ # License:: MIT License
11
+ #
12
+ #--
13
+ #
14
+ #++
15
+
16
+ module Rwspider
17
+
18
+ module Version
19
+ MAJOR = 0
20
+ MINOR = 4
21
+ TINY = 2
22
+ ALPHA = nil
23
+
24
+ STRING = [MAJOR, MINOR, TINY, ALPHA].compact.join('.')
25
+ end
26
+
27
+ VERSION = Version::STRING
28
+
29
+ end
data/lib/rwspider.rb ADDED
@@ -0,0 +1,57 @@
1
+ #
2
+ # = Ruby RW Spider
3
+ #
4
+ # RW Spider is a multithreading spider client written in Ruby.
5
+ #
6
+ #
7
+ # Category:: Net
8
+ # Package:: RWSpider
9
+ # Author:: Simone Rinzivillo <srinzivillo@gmail.com>
10
+ # License:: MIT License
11
+ #
12
+ #--
13
+ #
14
+ #++
15
+
16
+
17
+ require 'rwspider/client'
18
+ require 'uri'
19
+
20
+
21
+
22
+ module Rwspider
23
+
24
+ NAME = 'Rwspider'
25
+ GEM = 'rwspider'
26
+ AUTHORS = ['Simone Rinzivillo <srinzivillo@gmail.com>']
27
+
28
+ # Start the crawling from the <tt>URL</tt> with the personalized <tt>options</tt>.
29
+ # RW Spider applies the Rwspider::Client::DEFAULT_OPTIONS indexing options if you don't customize them
30
+ # Rwspider::start yield an instance of Rwspider::Document Class for each page downloaded.
31
+ #
32
+ # opts = {
33
+ # :useragent => 'My user agent',
34
+ # :robot_name => 'my_spider_name',
35
+ # :scan_documents_limit => 100,
36
+ # :scan_domain_limit => nil,
37
+ # :scan_images => true,
38
+ # :scan_other_files => false,
39
+ # :follow_robotstxt_directive => true,
40
+ # :follow_HTTP_redirection => true,
41
+ # :timeout => 5
42
+ # }
43
+ # Rwspider.start('http://www.rwspider.com', opts) { |d|
44
+ # puts 'Current URL ' + d.url.normalize.to_s
45
+ # }
46
+ #
47
+
48
+ def self.start(url, options = {})
49
+
50
+ @client = Rwspider::Client.new(options)
51
+ @client.start(url)do |doc|
52
+ yield doc if block_given?
53
+ end
54
+
55
+ end
56
+
57
+ end
data/rwspider.gemspec ADDED
@@ -0,0 +1,44 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = %q{rwspider}
5
+ s.version = "0.4.2"
6
+
7
+ s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
+ s.authors = ["Simone Rinzivillo"]
9
+ s.date = %q{2010-02-06}
10
+ s.description = %q{ RW Spider is an multithreading spider client written in Ruby designed to make easy the development of programs that spider the web.
11
+ }
12
+ s.email = %q{srinzivillo@gmail.com}
13
+ s.extra_rdoc_files = ["LICENSE.rdoc", "README.rdoc", "lib/rwspider.rb", "lib/rwspider/client.rb", "lib/rwspider/document.rb", "lib/rwspider/queue.rb", "lib/rwspider/version.rb"]
14
+ s.files = ["Changelog.rdoc", "LICENSE.rdoc", "README.rdoc", "Rakefile", "lib/rwspider.rb", "lib/rwspider/client.rb", "lib/rwspider/document.rb", "lib/rwspider/queue.rb", "lib/rwspider/version.rb", "test/client_test.rb", "test/document_test.rb", "test/rwspider_test.rb", "Manifest", "rwspider.gemspec"]
15
+ s.homepage = %q{http://www.rwspider.com}
16
+ s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Rwspider", "--main", "README.rdoc"]
17
+ s.require_paths = ["lib"]
18
+ s.rubyforge_project = %q{rwspider}
19
+ s.rubygems_version = %q{1.3.5}
20
+ s.summary = %q{RW Spider is an multithreading spider client written in Ruby}
21
+ s.test_files = ["test/client_test.rb", "test/document_test.rb", "test/rwspider_test.rb"]
22
+
23
+ if s.respond_to? :specification_version then
24
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
25
+ s.specification_version = 3
26
+
27
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
28
+ s.add_development_dependency(%q<rake>, ["~> 0.8"])
29
+ s.add_development_dependency(%q<hpricot>, ["~> 0.8.2"])
30
+ s.add_development_dependency(%q<robotstxt>, ["~> 0.5.2"])
31
+ s.add_development_dependency(%q<echoe>, ["~> 3.1"])
32
+ else
33
+ s.add_dependency(%q<rake>, ["~> 0.8"])
34
+ s.add_dependency(%q<hpricot>, ["~> 0.8.2"])
35
+ s.add_dependency(%q<robotstxt>, ["~> 0.5.2"])
36
+ s.add_dependency(%q<echoe>, ["~> 3.1"])
37
+ end
38
+ else
39
+ s.add_dependency(%q<rake>, ["~> 0.8"])
40
+ s.add_dependency(%q<hpricot>, ["~> 0.8.2"])
41
+ s.add_dependency(%q<robotstxt>, ["~> 0.5.2"])
42
+ s.add_dependency(%q<echoe>, ["~> 3.1"])
43
+ end
44
+ end
@@ -0,0 +1,34 @@
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
+
3
+ require 'test/unit'
4
+ require 'rwspider'
5
+
6
+ class TestClient < Test::Unit::TestCase
7
+
8
+ def setup
9
+ opts = {
10
+ :useragent => 'My user agent',
11
+ :robot_name => 'my_spider_name',
12
+ :scan_documents_limit => 10,
13
+ :scan_domain_limit => nil,
14
+ :scan_images => true,
15
+ :scan_other_files => false,
16
+ :follow_robotstxt_directive => true,
17
+ :follow_HTTP_redirection => true,
18
+ :timeout => 5
19
+ }
20
+ @client = Rwspider::Client.new(opts)
21
+ end
22
+
23
+ def test_initialize
24
+ client = Rwspider::Client.new
25
+ assert_instance_of Rwspider::Client, client
26
+ end
27
+
28
+ def test_start
29
+ r = @client.start('http://www.rwspider.com')
30
+ assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
31
+ assert_instance_of Rwspider::Queue, r
32
+ end
33
+
34
+ end
@@ -0,0 +1,55 @@
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
+
3
+ require 'test/unit'
4
+ require 'rwspider'
5
+
6
+ class TestDocument < Test::Unit::TestCase
7
+
8
+ def setup
9
+ opts = {
10
+ :scan_documents_limit => 1,
11
+ :scan_domain_limit => 'www.rwspider.com',
12
+ :timeout => 10
13
+ }
14
+ client = Rwspider::Client.new(opts)
15
+ @doc = client.start('http://www.rwspider.com')[0]
16
+ end
17
+
18
+ def test_initialize
19
+ d = Rwspider::Document.new('http://www.rwspider.com')
20
+ assert_instance_of Rwspider::Document, d
21
+ end
22
+
23
+ def test_normalize_relative_url
24
+ doc = Rwspider::Document::new('http://www.rwspider.com')
25
+ assert_equal('http://www.rwspider.com/sitemap.html', doc.normalize_url(URI.parse('/sitemap.html')))
26
+ end
27
+
28
+ def test_normalize_absolute_url
29
+ doc = Rwspider::Document::new('http://www.rwspider.com')
30
+ assert_equal('http://www.rwspider.com/sitemap.html', doc.normalize_url(URI.parse('http://www.rwspider.com/sitemap.html')))
31
+ end
32
+
33
+ def test_parse
34
+ doc = Rwspider::Document::new('http://www.rwspider.com')
35
+ doc.parse('http://www.rwspider.com/sitemap.html')
36
+ assert_equal('http://www.rwspider.com/sitemap.html', doc.url.normalize.to_s)
37
+ end
38
+
39
+ def test_get_links
40
+ arr = @doc.get_links
41
+ assert_instance_of Array, arr
42
+ end
43
+
44
+ def test_get_images
45
+ arr = @doc.get_images
46
+ assert_instance_of Array, arr
47
+ end
48
+
49
+ def test_get_other_files
50
+ arr = @doc.get_other_files
51
+ assert_instance_of Array, arr
52
+ end
53
+
54
+
55
+ end
@@ -0,0 +1,30 @@
1
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
2
+
3
+ require 'test/unit'
4
+ require 'rwspider'
5
+
6
+ class TestRwspider < Test::Unit::TestCase
7
+
8
+
9
+ def test_start
10
+ opts = {
11
+ :useragent => 'My user agent',
12
+ :robot_name => 'my_spider_name',
13
+ :scan_documents_limit => 100,
14
+ :scan_domain_limit => nil,
15
+ :scan_images => true,
16
+ :scan_other_files => false,
17
+ :follow_robotstxt_directive => true,
18
+ :follow_HTTP_redirection => true,
19
+ :timeout => 5
20
+ }
21
+ r = Rwspider.start('http://www.rwspider.com', opts)
22
+ assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
23
+ end
24
+
25
+ def test_start_without_options
26
+ r = Rwspider.start('http://www.rwspider.com')
27
+ assert_equal('http://www.rwspider.com/', r[0].url.normalize.to_s)
28
+ end
29
+
30
+ end
metadata ADDED
@@ -0,0 +1,120 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rwspider
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.4.2
5
+ platform: ruby
6
+ authors:
7
+ - Simone Rinzivillo
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2010-02-06 00:00:00 +01:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: rake
17
+ type: :development
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ~>
22
+ - !ruby/object:Gem::Version
23
+ version: "0.8"
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: hpricot
27
+ type: :development
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 0.8.2
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: robotstxt
37
+ type: :development
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ~>
42
+ - !ruby/object:Gem::Version
43
+ version: 0.5.2
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: echoe
47
+ type: :development
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ~>
52
+ - !ruby/object:Gem::Version
53
+ version: "3.1"
54
+ version:
55
+ description: " RW Spider is an multithreading spider client written in Ruby designed to make easy \t\tthe development of programs that spider the web.\n"
56
+ email: srinzivillo@gmail.com
57
+ executables: []
58
+
59
+ extensions: []
60
+
61
+ extra_rdoc_files:
62
+ - LICENSE.rdoc
63
+ - README.rdoc
64
+ - lib/rwspider.rb
65
+ - lib/rwspider/client.rb
66
+ - lib/rwspider/document.rb
67
+ - lib/rwspider/queue.rb
68
+ - lib/rwspider/version.rb
69
+ files:
70
+ - Changelog.rdoc
71
+ - LICENSE.rdoc
72
+ - README.rdoc
73
+ - Rakefile
74
+ - lib/rwspider.rb
75
+ - lib/rwspider/client.rb
76
+ - lib/rwspider/document.rb
77
+ - lib/rwspider/queue.rb
78
+ - lib/rwspider/version.rb
79
+ - test/client_test.rb
80
+ - test/document_test.rb
81
+ - test/rwspider_test.rb
82
+ - Manifest
83
+ - rwspider.gemspec
84
+ has_rdoc: true
85
+ homepage: http://www.rwspider.com
86
+ licenses: []
87
+
88
+ post_install_message:
89
+ rdoc_options:
90
+ - --line-numbers
91
+ - --inline-source
92
+ - --title
93
+ - Rwspider
94
+ - --main
95
+ - README.rdoc
96
+ require_paths:
97
+ - lib
98
+ required_ruby_version: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: "0"
103
+ version:
104
+ required_rubygems_version: !ruby/object:Gem::Requirement
105
+ requirements:
106
+ - - ">="
107
+ - !ruby/object:Gem::Version
108
+ version: "1.2"
109
+ version:
110
+ requirements: []
111
+
112
+ rubyforge_project: rwspider
113
+ rubygems_version: 1.3.5
114
+ signing_key:
115
+ specification_version: 3
116
+ summary: RW Spider is an multithreading spider client written in Ruby
117
+ test_files:
118
+ - test/client_test.rb
119
+ - test/document_test.rb
120
+ - test/rwspider_test.rb