spider 0.4.1 → 0.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/CHANGES CHANGED
@@ -1,3 +1,8 @@
+ 2008-07-06
+ * Trap interrupts and shutdown gracefully
+ * Support for custom urls-to-crawl objects
+ * Example AmazonSQS urls-to-crawl support (next_urls_in_sqs.rb)
+
  2007-11-09:
  * Handle redirects that assume a base URL.
 
data/README CHANGED
@@ -75,6 +75,31 @@ scraping, collecting, and looping so that you can just handle the data.
    s.check_already_seen_with ExpireLinks.new
  end
 
+ === Store nodes to visit with Amazon SQS
+
+  require 'spider'
+  require 'spider/next_urls_in_sqs'
+  Spider.start_at('http://mike-burns.com') do |s|
+    s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
+  end
+
+ === Store nodes to visit with a custom object
+
+  require 'spider'
+  class MyArray < Array
+    def pop
+      super
+    end
+
+    def push(a_msg)
+      super(a_msg)
+    end
+  end
+
+  Spider.start_at('http://mike-burns.com') do |s|
+    s.store_next_urls_with MyArray.new
+  end
+
  === Create a URL graph
 
   require 'spider'
@@ -106,9 +131,10 @@ scraping, collecting, and looping so that you can just handle the data.
 
  == Author
 
- Mike Burns http://mike-burns.com mike@mike-burns.com
+ John Nagro john.nagro@gmail.com
+ Mike Burns http://mike-burns.com mike@mike-burns.com (original author)
 
- Help from Matt Horan, John Nagro, and Henri Cook.
+ Help from Matt Horan and Henri Cook.
 
  With `robot_rules' from James Edward Gray II via
  http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
data/lib/spider.rb CHANGED
@@ -1,4 +1,4 @@
- # Copyright 2007 Mike Burns
+ # Copyright 2007-2008 Mike Burns & John Nagro
  # :include: README
 
  # Redistribution and use in source and binary forms, with or without
data/lib/spider/included_in_memcached.rb CHANGED
@@ -1,5 +1,6 @@
  # Use memcached to track cycles.
-
+ #
+ # Copyright 2007 Mike Burns
  # Redistribution and use in source and binary forms, with or without
  # modification, are permitted provided that the following conditions are met:
  #  * Redistributions of source code must retain the above copyright
data/lib/spider/next_urls_in_sqs.rb ADDED
@@ -0,0 +1,66 @@
+ # Use AmazonSQS to track nodes to visit.
+ #
+ # Copyright 2008 John Nagro
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #  * Redistributions of source code must retain the above copyright
+ #    notice, this list of conditions and the following disclaimer.
+ #  * Redistributions in binary form must reproduce the above copyright
+ #    notice, this list of conditions and the following disclaimer in the
+ #    documentation and/or other materials provided with the distribution.
+ #  * Neither the name Mike Burns nor the
+ #    names of his contributors may be used to endorse or promote products
+ #    derived from this software without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
+ # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ require 'rubygems'
+ require 'right_aws'
+ require 'yaml'
+
+ # A specialized class using AmazonSQS to track nodes to walk. It supports
+ # two operations: push and pop. Together these can be used to
+ # add items to the queue, then pull items off the queue.
+ #
+ # This is useful if you want multiple Spider processes crawling the same
+ # data set.
+ #
+ # To use it with Spider use the store_next_urls_with method:
+ #
+ #  Spider.start_at('http://example.com/') do |s|
+ #    s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
+ #  end
+ class NextUrlsInSQS
+   # Construct a new NextUrlsInSQS instance. All arguments here are
+   # passed to RightAWS::SqsGen2 (part of the right_aws gem) or used
+   # to set the AmazonSQS queue name (optional).
+   def initialize(aws_access_key, aws_secret_access_key, queue_name = 'ruby-spider')
+     @sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key)
+     @queue = @sqs.queue(queue_name)
+   end
+
+   # Pull an item off the queue, loop until data is found. Data is
+   # encoded with YAML.
+   def pop
+     while true
+       message = @queue.pop
+       return YAML::load(message.to_s) unless message.nil?
+       sleep 5
+     end
+   end
+
+   # Put data on the queue. Data is encoded with YAML.
+   def push(a_msg)
+     encoded_message = YAML::dump(a_msg)
+     @queue.push(encoded_message)
+   end
+ end
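
Each entry the spider hands to this store is a one-pair hash of prior_url => next_url, and the store round-trips it through YAML so it survives the trip through SQS. A minimal sketch of that encode/decode step (plain Ruby, no SQS involved; the URLs are made up for illustration):

    require 'yaml'

    url_hash = { 'http://example.com/' => 'http://example.com/about' }
    encoded  = YAML::dump(url_hash)   # what push hands to the queue
    decoded  = YAML::load(encoded)    # what pop returns to the spider
    raise 'round trip failed' unless decoded == url_hash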
data/lib/spider/spider_instance.rb CHANGED
@@ -1,6 +1,6 @@
  # Specialized spidering rules.
 
- # Copyright 2007 Mike Burns
+ # Copyright 2007-2008 Mike Burns & John Nagro
  # Redistribution and use in source and binary forms, with or without
  # modification, are permitted provided that the following conditions are met:
  #  * Redistributions of source code must retain the above copyright
@@ -51,7 +51,7 @@ class SpiderInstance
    @url_checks = []
    @cache = :memory
    @callbacks = {}
-   @next_urls = next_urls
+   @next_urls = [next_urls]
    @seen = seen
    @rules = rules || RobotRules.new('Ruby Spider 1.0')
    @robots_seen = robots_seen
@@ -96,6 +96,30 @@ class SpiderInstance
    end
  end
 
+ # The Web is a really, really, really big graph; as such, this list
+ # of nodes to visit grows really, really, really big.
+ #
+ # Change the object used to store nodes we have yet to walk. The default
+ # object is an instance of Array. Available with Spider is a wrapper of
+ # AmazonSQS.
+ #
+ # You can implement a custom class for this; any object passed to
+ # store_next_urls_with must understand just push and pop.
+ #
+ #  # default
+ #  store_next_urls_with Array.new
+ #
+ #  # AmazonSQS
+ #  require 'spider/next_urls_in_sqs'
+ #  store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
+ def store_next_urls_with(a_store)
+   tmp_next_urls = @next_urls
+   @next_urls = a_store
+   tmp_next_urls.each do |a_url_hash|
+     @next_urls.push a_url_hash
+   end
+ end
+
  # Add a response handler. A response handler's trigger can be :every,
  # :success, :failure, or any HTTP status code. The handler itself can be
  # either a Proc or a block.
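
Any store passed to store_next_urls_with only has to answer push and pop. As an illustration of that contract, here is a hypothetical store that skips URL hashes it has already queued; the class name and the dedup policy are invented, not part of the release:

    # Array-backed store that drops URL hashes it has already queued,
    # to keep the to-visit list from growing without bound.
    class DedupingUrlStore < Array
      def initialize
        super
        @queued = {}
      end

      # The spider pushes one-pair hashes of prior_url => next_url.
      def push(a_url_hash)
        key = a_url_hash.inspect
        super(a_url_hash) unless @queued[key]
        @queued[key] = true
      end
    end

It is used the same way as the built-in stores: s.store_next_urls_with DedupingUrlStore.new.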
@@ -159,9 +183,11 @@ class SpiderInstance
    @headers = {}
  end
 
- def start! #:nodoc:
-   next_urls = @next_urls
+ def start! #:nodoc:
+   interrupted = false
+   trap("SIGINT") { interrupted = true }
    begin
+     next_urls = @next_urls.pop
      tmp_n_u = {}
      next_urls.each do |prior_url, urls|
        urls.map do |a_url|
@@ -172,13 +198,18 @@ class SpiderInstance
          @setup.call(a_url) unless @setup.nil?
          get_page(parsed_url) do |response|
            do_callbacks(a_url, response, prior_url)
-           tmp_n_u[a_url] = generate_next_urls(a_url, response)
+           #tmp_n_u[a_url] = generate_next_urls(a_url, response)
+           #@next_urls.push tmp_n_u
+           generate_next_urls(a_url, response).each do |a_next_url|
+             @next_urls.push a_url => a_next_url
+           end
+           #exit if interrupted
          end
          @teardown.call(a_url) unless @teardown.nil?
+         exit if interrupted
        end
      end
-     next_urls = tmp_n_u
-   end while !next_urls.empty?
+   end while !@next_urls.empty?
  end
 
  def success_or_failure(code) #:nodoc:
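
With the interrupt trap, Ctrl-C lets the current page finish (callbacks and teardown included) before the process exits, and whatever is still in the URL store stays there. With the SQS-backed store that means a crawl can be picked up again by pointing a new process at the same queue. A hypothetical resume sketch, where AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY and the queue name 'my-crawl' are placeholders, and already-visited tracking would still need a shared backend such as the memcached checker for a true resume:

    require 'spider'
    require 'spider/next_urls_in_sqs'

    # Both the original run and the resumed run point at the same queue,
    # so URLs left unvisited when SIGINT arrived are picked up again.
    Spider.start_at('http://example.com/') do |s|
      s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, 'my-crawl')
      s.on(:every) { |a_url, resp, prior_url| puts a_url }
    end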
data/lib/test.rb ADDED
@@ -0,0 +1,27 @@
+ require 'spider.rb'
+ require 'spider/next_urls_in_sqs.rb'
+
+ class MyArray < Array
+   def pop
+     a_msg = super
+     puts "pop: #{a_msg.inspect}"
+     return a_msg
+   end
+
+   def push(a_msg)
+     puts "push: #{a_msg.inspect}"
+     super(a_msg)
+   end
+ end
+
+ AWS_ACCESS_KEY = '0YA99M8Y09J2D4FEC602'
+ AWS_SECRET_ACCESS_KEY = 'Sc9R9uiwbFYz7XhQqkPvSK3Bbq4tPYPVMWyDlF+a'
+
+ #Spider.start_at("http://docs.huihoo.com/ruby/ruby-man-1.4/function.html") do |s|
+ Spider.start_at("http://www.google.com") do |s|
+   #s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
+   s.store_next_urls_with MyArray.new
+   s.on(:every) do |a_url, resp, prior_url|
+     puts a_url
+   end
+ end
data/spider.gemspec CHANGED
@@ -1,11 +1,12 @@
  require 'rubygems'
 
  spec = Gem::Specification.new do |s|
-   s.author = 'Mike Burns'
-   s.email = 'mike@mike-burns.com'
+   s.author = 'John Nagro'
+   s.email = 'john.nagro@gmail.com'
    s.has_rdoc = true
    s.homepage = 'http://spider.rubyforge.org/'
    s.name = 'spider'
+   s.rubyforge_project = 'spider'
    s.summary = 'A Web spidering library'
    s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i }
    s.require_path = 'lib'
@@ -13,5 +14,5 @@ spec = Gem::Specification.new do |s|
    A Web spidering library: handles robots.txt, scraping, finding more
    links, and doing it all over again.
  EOF
-   s.version = '0.4.1'
+   s.version = '0.4.2'
  end
metadata CHANGED
@@ -1,78 +1,87 @@
  --- !ruby/object:Gem::Specification
- rubygems_version: 0.9.4
- specification_version: 1
  name: spider
  version: !ruby/object:Gem::Version
-   version: 0.4.1
- date: 2007-11-10 00:00:00 -05:00
- summary: A Web spidering library
- require_paths:
- - lib
- email: mike@mike-burns.com
- homepage: http://spider.rubyforge.org/
- rubyforge_project:
- description: "A Web spidering library: handles robots.txt, scraping, finding more links, and doing it all over again."
- autorequire:
- default_executable:
- bindir: bin
- has_rdoc: true
- required_ruby_version: !ruby/object:Gem::Version::Requirement
-   requirements:
-   - - ">"
-     - !ruby/object:Gem::Version
-       version: 0.0.0
-   version:
+   version: 0.4.2
  platform: ruby
- signing_key:
- cert_chain:
- post_install_message:
  authors:
- - Mike Burns
+ - John Nagro
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2008-07-06 00:00:00 -04:00
+ default_executable:
+ dependencies: []
+
+ description: "A Web spidering library: handles robots.txt, scraping, finding more links, and doing it all over again."
+ email: john.nagro@gmail.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
  files:
+ - CHANGES
  - doc
- - doc/rdoc-style.css
+ - doc/classes
+ - doc/classes/IncludedInMemcached.html
+ - doc/classes/Spider.html
+ - doc/classes/SpiderInstance.html
+ - doc/created.rid
  - doc/files
  - doc/files/lib
- - doc/files/lib/spider_rb.html
  - doc/files/lib/spider
- - doc/files/lib/spider/spider_instance_rb.html
  - doc/files/lib/spider/included_in_memcached_rb.html
+ - doc/files/lib/spider/spider_instance_rb.html
+ - doc/files/lib/spider_rb.html
  - doc/files/README.html
- - doc/classes
- - doc/classes/IncludedInMemcached.html
- - doc/classes/SpiderInstance.html
- - doc/classes/Spider.html
- - doc/fr_file_index.html
  - doc/fr_class_index.html
+ - doc/fr_file_index.html
  - doc/fr_method_index.html
  - doc/index.html
- - doc/created.rid
+ - doc/rdoc-style.css
+ - lib
+ - lib/spider
+ - lib/spider/included_in_memcached.rb
+ - lib/spider/next_urls_in_sqs.rb
+ - lib/spider/robot_rules.rb
+ - lib/spider/spider_instance.rb
+ - lib/spider.rb
+ - lib/test.rb
+ - README
  - spec
+ - spec/spec_helper.rb
  - spec/spider
  - spec/spider/included_in_memcached_spec.rb
  - spec/spider/spider_instance_spec.rb
  - spec/spider_spec.rb
- - spec/spec_helper.rb
- - README
  - spider.gemspec
- - CHANGES
- - lib
- - lib/spider.rb
- - lib/spider
- - lib/spider/included_in_memcached.rb
- - lib/spider/robot_rules.rb
- - lib/spider/spider_instance.rb
- test_files: []
-
+ has_rdoc: true
+ homepage: http://spider.rubyforge.org/
+ post_install_message:
  rdoc_options: []
 
- extra_rdoc_files: []
-
- executables: []
-
- extensions: []
-
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
  requirements: []
 
- dependencies: []
+ rubyforge_project: spider
+ rubygems_version: 1.0.1
+ signing_key:
+ specification_version: 2
+ summary: A Web spidering library
+ test_files: []