spider 0.4.1 → 0.4.2

data/CHANGES CHANGED
@@ -1,3 +1,8 @@
+ 2008-07-06
+ * Trap interrupts and shutdown gracefully
+ * Support for custom urls-to-crawl objects
+ * Example AmazonSQS urls-to-crawl support (next_urls_in_sqs.rb)
+
  2007-11-09:
  * Handle redirects that assume a base URL.
data/README CHANGED
@@ -75,6 +75,31 @@ scraping, collecting, and looping so that you can just handle the data.
    s.check_already_seen_with ExpireLinks.new
  end

+ === Store nodes to visit with Amazon SQS
+
+ require 'spider'
+ require 'spider/next_urls_in_sqs'
+ Spider.start_at('http://mike-burns.com') do |s|
+   s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
+ end
+
+ ==== Store nodes to visit with a custom object
+
+ require 'spider'
+ class MyArray < Array
+   def pop
+     super
+   end
+
+   def push(a_msg)
+     super(a_msg)
+   end
+ end
+
+ Spider.start_at('http://mike-burns.com') do |s|
+   s.store_next_urls_with MyArray.new
+ end
+
  === Create a URL graph

  require 'spider'
@@ -106,9 +131,10 @@ scraping, collecting, and looping so that you can just handle the data.

  == Author

- Mike Burns http://mike-burns.com mike@mike-burns.com
+ John Nagro john.nagro@gmail.com
+ Mike Burns http://mike-burns.com mike@mike-burns.com (original author)

- Help from Matt Horan, John Nagro, and Henri Cook.
+ Help from Matt Horan and Henri Cook.

  With `robot_rules' from James Edward Gray II via
  http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
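The README's new examples above reduce the custom-store contract to push and pop; the spider_instance.rb changes further down also call empty? on the store at the end of each crawl pass. As a rough illustration of what a third-party store could look like (this is not part of the gem: the RedisNextUrls name, the redis gem dependency, and the 'spider-next-urls' list key are all assumptions made for this sketch), a Redis-backed equivalent of NextUrlsInSQS might be:

  require 'redis'
  require 'yaml'

  # Hypothetical store satisfying Spider's push/pop/empty? expectations,
  # mirroring the YAML encoding used by NextUrlsInSQS.
  class RedisNextUrls
    def initialize(list_key = 'spider-next-urls')
      @redis = Redis.new
      @list_key = list_key
    end

    # Spider pushes one {prior_url => next_url} hash per discovered link.
    def push(a_msg)
      @redis.rpush(@list_key, YAML.dump(a_msg))
    end

    # Hand back one such hash, or nil when the list is empty.
    def pop
      raw = @redis.lpop(@list_key)
      raw && YAML.load(raw)
    end

    def empty?
      @redis.llen(@list_key).zero?
    end
  end

It would be handed to a crawl the same way as the SQS store: s.store_next_urls_with RedisNextUrls.new.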
data/lib/spider.rb CHANGED
@@ -1,4 +1,4 @@
- # Copyright 2007 Mike Burns
+ # Copyright 2007-2008 Mike Burns & John Nagro
  # :include: README

  # Redistribution and use in source and binary forms, with or without
data/lib/spider/included_in_memcached.rb CHANGED
@@ -1,5 +1,6 @@
  # Use memcached to track cycles.
-
+ #
+ # Copyright 2007 Mike Burns
  # Redistribution and use in source and binary forms, with or without
  # modification, are permitted provided that the following conditions are met:
  #   * Redistributions of source code must retain the above copyright
data/lib/spider/next_urls_in_sqs.rb ADDED
@@ -0,0 +1,66 @@
+ # Use AmazonSQS to track nodes to visit.
+ #
+ # Copyright 2008 John Nagro
+ # Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #   * Redistributions of source code must retain the above copyright
+ #     notice, this list of conditions and the following disclaimer.
+ #   * Redistributions in binary form must reproduce the above copyright
+ #     notice, this list of conditions and the following disclaimer in the
+ #     documentation and/or other materials provided with the distribution.
+ #   * Neither the name Mike Burns nor the
+ #     names of his contributors may be used to endorse or promote products
+ #     derived from this software without specific prior written permission.
+ #
+ # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
+ # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
+ # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ require 'rubygems'
+ require 'right_aws'
+ require 'yaml'
+
+ # A specialized class using AmazonSQS to track nodes to walk. It supports
+ # two operations: push and pop. Together these can be used to
+ # add items to the queue, then pull items off the queue.
+ #
+ # This is useful if you want multiple Spider processes crawling the same
+ # data set.
+ #
+ # To use it with Spider, use the store_next_urls_with method:
+ #
+ #  Spider.start_at('http://example.com/') do |s|
+ #    s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
+ #  end
+ class NextUrlsInSQS
+   # Construct a new NextUrlsInSQS instance. All arguments here are
+   # passed to RightAws::SqsGen2 (part of the right_aws gem) or used
+   # to set the AmazonSQS queue name (optional).
+   def initialize(aws_access_key, aws_secret_access_key, queue_name = 'ruby-spider')
+     @sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key)
+     @queue = @sqs.queue(queue_name)
+   end
+
+   # Pull an item off the queue, looping until data is found. Data is
+   # encoded with YAML.
+   def pop
+     while true
+       message = @queue.pop
+       return YAML::load(message.to_s) unless message.nil?
+       sleep 5
+     end
+   end
+
+   # Put data on the queue. Data is encoded with YAML.
+   def push(a_msg)
+     encoded_message = YAML::dump(a_msg)
+     @queue.push(encoded_message)
+   end
+ end
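Outside of a crawl, the class above can be exercised on its own to see the YAML round-trip it performs. This is illustrative only: the credentials and queue name below are placeholders, and the gem's lib directory is assumed to be on the load path.

  require 'rubygems'
  require 'spider/next_urls_in_sqs'

  store = NextUrlsInSQS.new('YOUR_ACCESS_KEY', 'YOUR_SECRET_KEY', 'my-crawl-queue')

  # push YAML-encodes the hash onto the SQS queue...
  store.push('http://example.com/' => 'http://example.com/about')

  # ...and pop polls the queue (sleeping 5s between attempts) until a message
  # arrives, then decodes it back into the original hash.
  p store.pop   # => {"http://example.com/"=>"http://example.com/about"}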
data/lib/spider/spider_instance.rb CHANGED
@@ -1,6 +1,6 @@
  # Specialized spidering rules.

- # Copyright 2007 Mike Burns
+ # Copyright 2007-2008 Mike Burns & John Nagro
  # Redistribution and use in source and binary forms, with or without
  # modification, are permitted provided that the following conditions are met:
  #   * Redistributions of source code must retain the above copyright
@@ -51,7 +51,7 @@ class SpiderInstance
    @url_checks = []
    @cache = :memory
    @callbacks = {}
-   @next_urls = next_urls
+   @next_urls = [next_urls]
    @seen = seen
    @rules = rules || RobotRules.new('Ruby Spider 1.0')
    @robots_seen = robots_seen
@@ -96,6 +96,30 @@ class SpiderInstance
      end
    end

+ # The Web is a really, really, really big graph; as such, this list
+ # of nodes to visit grows really, really, really big.
+ #
+ # Change the object used to store nodes we have yet to walk. The default
+ # object is an instance of Array. Available with Spider is a wrapper of
+ # AmazonSQS.
+ #
+ # You can implement a custom class for this; any object passed to
+ # store_next_urls_with must understand just push and pop.
+ #
+ #  # default
+ #  store_next_urls_with Array.new
+ #
+ #  # AmazonSQS
+ #  require 'spider/next_urls_in_sqs'
+ #  store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
+ def store_next_urls_with(a_store)
+   tmp_next_urls = @next_urls
+   @next_urls = a_store
+   tmp_next_urls.each do |a_url_hash|
+     @next_urls.push a_url_hash
+   end
+ end
+
  # Add a response handler. A response handler's trigger can be :every,
  # :success, :failure, or any HTTP status code. The handler itself can be
  # either a Proc or a block.
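The data shapes a replacement store sees are not spelled out in the comment above, but they can be inferred from start! just below: each push is a one-entry { prior_url => next_url } hash, pop should hand one such hash back (or the initial hash of starting URLs), and empty? decides when the crawl loop stops. A throwaway sketch of that traffic against the default Array store (the URLs here are made up):

  store = []   # the default store is a plain Array
  store.push('http://example.com/' => 'http://example.com/a.html')
  store.push('http://example.com/' => 'http://example.com/b.html')

  until store.empty?
    prior_url, next_url = store.pop.first   # Hash#first gives the [key, value] pair
    puts "#{prior_url} -> #{next_url}"
  end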
@@ -159,9 +183,11 @@ class SpiderInstance
    @headers = {}
  end

- def start! #:nodoc:
-   next_urls = @next_urls
+ def start! #:nodoc:
+   interrupted = false
+   trap("SIGINT") { interrupted = true }
    begin
+     next_urls = @next_urls.pop
      tmp_n_u = {}
      next_urls.each do |prior_url, urls|
        urls.map do |a_url|
@@ -172,13 +198,18 @@ class SpiderInstance
          @setup.call(a_url) unless @setup.nil?
          get_page(parsed_url) do |response|
            do_callbacks(a_url, response, prior_url)
-           tmp_n_u[a_url] = generate_next_urls(a_url, response)
+           #tmp_n_u[a_url] = generate_next_urls(a_url, response)
+           #@next_urls.push tmp_n_u
+           generate_next_urls(a_url, response).each do |a_next_url|
+             @next_urls.push a_url => a_next_url
+           end
+           #exit if interrupted
          end
          @teardown.call(a_url) unless @teardown.nil?
+         exit if interrupted
        end
      end
-     next_urls = tmp_n_u
-   end while !next_urls.empty?
+   end while !@next_urls.empty?
  end

  def success_or_failure(code) #:nodoc:
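The interrupt handling added above follows the usual trap-and-flag pattern: the signal handler only flips a boolean, and the crawl checks it at a safe point (after a page's teardown) rather than dying mid-request. In isolation the pattern looks like this (a generic sketch, not Spider code):

  interrupted = false
  trap('SIGINT') { interrupted = true }   # Ctrl-C just sets the flag

  work = %w[a b c d e]
  until work.empty?
    item = work.shift
    puts "processing #{item}"
    sleep 1
    exit if interrupted   # shutdown happens between items, never mid-item
  end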
data/lib/test.rb ADDED
@@ -0,0 +1,27 @@
+ require 'spider.rb'
+ require 'spider/next_urls_in_sqs.rb'
+
+ class MyArray < Array
+   def pop
+     a_msg = super
+     puts "pop: #{a_msg.inspect}"
+     return a_msg
+   end
+
+   def push(a_msg)
+     puts "push: #{a_msg.inspect}"
+     super(a_msg)
+   end
+ end
+
+ AWS_ACCESS_KEY = '0YA99M8Y09J2D4FEC602'
+ AWS_SECRET_ACCESS_KEY = 'Sc9R9uiwbFYz7XhQqkPvSK3Bbq4tPYPVMWyDlF+a'
+
+ #Spider.start_at("http://docs.huihoo.com/ruby/ruby-man-1.4/function.html") do |s|
+ Spider.start_at("http://www.google.com") do |s|
+   #s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
+   s.store_next_urls_with MyArray.new
+   s.on(:every) do |a_url, resp, prior_url|
+     puts a_url
+   end
+ end
data/spider.gemspec CHANGED
@@ -1,11 +1,12 @@
  require 'rubygems'

  spec = Gem::Specification.new do |s|
-   s.author = 'Mike Burns'
-   s.email = 'mike@mike-burns.com'
+   s.author = 'John Nagro'
+   s.email = 'john.nagro@gmail.com'
    s.has_rdoc = true
    s.homepage = 'http://spider.rubyforge.org/'
    s.name = 'spider'
+   s.rubyforge_project = 'spider'
    s.summary = 'A Web spidering library'
    s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i }
    s.require_path = 'lib'
@@ -13,5 +14,5 @@ spec = Gem::Specification.new do |s|
    A Web spidering library: handles robots.txt, scraping, finding more
    links, and doing it all over again.
  EOF
-   s.version = '0.4.1'
+   s.version = '0.4.2'
  end
metadata CHANGED
@@ -1,78 +1,87 @@
  --- !ruby/object:Gem::Specification
- rubygems_version: 0.9.4
- specification_version: 1
  name: spider
  version: !ruby/object:Gem::Version
-   version: 0.4.1
- date: 2007-11-10 00:00:00 -05:00
- summary: A Web spidering library
- require_paths:
- - lib
- email: mike@mike-burns.com
- homepage: http://spider.rubyforge.org/
- rubyforge_project:
- description: "A Web spidering library: handles robots.txt, scraping, finding more links, and doing it all over again."
- autorequire:
- default_executable:
- bindir: bin
- has_rdoc: true
- required_ruby_version: !ruby/object:Gem::Version::Requirement
-   requirements:
-   - - ">"
-     - !ruby/object:Gem::Version
-       version: 0.0.0
-   version:
+   version: 0.4.2
  platform: ruby
- signing_key:
- cert_chain:
- post_install_message:
  authors:
- - Mike Burns
+ - John Nagro
+ autorequire:
+ bindir: bin
+ cert_chain: []
+
+ date: 2008-07-06 00:00:00 -04:00
+ default_executable:
+ dependencies: []
+
+ description: "A Web spidering library: handles robots.txt, scraping, finding more links, and doing it all over again."
+ email: john.nagro@gmail.com
+ executables: []
+
+ extensions: []
+
+ extra_rdoc_files: []
+
  files:
+ - CHANGES
  - doc
- - doc/rdoc-style.css
+ - doc/classes
+ - doc/classes/IncludedInMemcached.html
+ - doc/classes/Spider.html
+ - doc/classes/SpiderInstance.html
+ - doc/created.rid
  - doc/files
  - doc/files/lib
- - doc/files/lib/spider_rb.html
  - doc/files/lib/spider
- - doc/files/lib/spider/spider_instance_rb.html
  - doc/files/lib/spider/included_in_memcached_rb.html
+ - doc/files/lib/spider/spider_instance_rb.html
+ - doc/files/lib/spider_rb.html
  - doc/files/README.html
- - doc/classes
- - doc/classes/IncludedInMemcached.html
- - doc/classes/SpiderInstance.html
- - doc/classes/Spider.html
- - doc/fr_file_index.html
  - doc/fr_class_index.html
+ - doc/fr_file_index.html
  - doc/fr_method_index.html
  - doc/index.html
- - doc/created.rid
+ - doc/rdoc-style.css
+ - lib
+ - lib/spider
+ - lib/spider/included_in_memcached.rb
+ - lib/spider/next_urls_in_sqs.rb
+ - lib/spider/robot_rules.rb
+ - lib/spider/spider_instance.rb
+ - lib/spider.rb
+ - lib/test.rb
+ - README
  - spec
+ - spec/spec_helper.rb
  - spec/spider
  - spec/spider/included_in_memcached_spec.rb
  - spec/spider/spider_instance_spec.rb
  - spec/spider_spec.rb
- - spec/spec_helper.rb
- - README
  - spider.gemspec
- - CHANGES
- - lib
- - lib/spider.rb
- - lib/spider
- - lib/spider/included_in_memcached.rb
- - lib/spider/robot_rules.rb
- - lib/spider/spider_instance.rb
- test_files: []
-
+ has_rdoc: true
+ homepage: http://spider.rubyforge.org/
+ post_install_message:
  rdoc_options: []

- extra_rdoc_files: []
-
- executables: []
-
- extensions: []
-
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: "0"
+   version:
  requirements: []

- dependencies: []
+ rubyforge_project: spider
+ rubygems_version: 1.0.1
+ signing_key:
+ specification_version: 2
+ summary: A Web spidering library
+ test_files: []