spider 0.4.4 → 0.5.4

Files changed (77)
  1. checksums.yaml +7 -0
  2. data/AUTHORS +17 -0
  3. data/CHANGES +16 -0
  4. data/LICENSE +21 -0
  5. data/{README → README.md} +73 -44
  6. data/lib/spider.rb +12 -29
  7. data/lib/spider/included_in_file.rb +32 -0
  8. data/lib/spider/included_in_memcached.rb +1 -24
  9. data/lib/spider/included_in_redis.rb +31 -0
  10. data/lib/spider/next_urls_in_sqs.rb +6 -29
  11. data/lib/spider/robot_rules.rb +61 -57
  12. data/lib/spider/spider_instance.rb +16 -35
  13. data/spec/spider/included_in_redis_spec.rb +43 -0
  14. data/spider.gemspec +5 -3
  15. metadata +38 -125
  16. data/doc/classes/BeStaticServerPages.html +0 -197
  17. data/doc/classes/BeStaticServerPages.src/M000030.html +0 -19
  18. data/doc/classes/BeStaticServerPages.src/M000031.html +0 -19
  19. data/doc/classes/BeStaticServerPages.src/M000032.html +0 -18
  20. data/doc/classes/BeStaticServerPages.src/M000033.html +0 -18
  21. data/doc/classes/IncludedInMemcached.html +0 -199
  22. data/doc/classes/IncludedInMemcached.src/M000015.html +0 -18
  23. data/doc/classes/IncludedInMemcached.src/M000016.html +0 -18
  24. data/doc/classes/IncludedInMemcached.src/M000017.html +0 -18
  25. data/doc/classes/LoopingServlet.html +0 -137
  26. data/doc/classes/LoopingServlet.src/M000037.html +0 -23
  27. data/doc/classes/NextUrlsInSQS.html +0 -204
  28. data/doc/classes/NextUrlsInSQS.src/M000018.html +0 -19
  29. data/doc/classes/NextUrlsInSQS.src/M000019.html +0 -22
  30. data/doc/classes/NextUrlsInSQS.src/M000020.html +0 -19
  31. data/doc/classes/QueryServlet.html +0 -137
  32. data/doc/classes/QueryServlet.src/M000038.html +0 -19
  33. data/doc/classes/RobotRules.html +0 -175
  34. data/doc/classes/RobotRules.src/M000034.html +0 -19
  35. data/doc/classes/RobotRules.src/M000035.html +0 -67
  36. data/doc/classes/RobotRules.src/M000036.html +0 -24
  37. data/doc/classes/Spider.html +0 -170
  38. data/doc/classes/Spider.src/M000029.html +0 -21
  39. data/doc/classes/SpiderInstance.html +0 -345
  40. data/doc/classes/SpiderInstance.src/M000021.html +0 -18
  41. data/doc/classes/SpiderInstance.src/M000022.html +0 -22
  42. data/doc/classes/SpiderInstance.src/M000023.html +0 -22
  43. data/doc/classes/SpiderInstance.src/M000024.html +0 -24
  44. data/doc/classes/SpiderInstance.src/M000025.html +0 -18
  45. data/doc/classes/SpiderInstance.src/M000026.html +0 -18
  46. data/doc/classes/SpiderInstance.src/M000027.html +0 -18
  47. data/doc/classes/SpiderInstance.src/M000028.html +0 -18
  48. data/doc/created.rid +0 -1
  49. data/doc/files/README.html +0 -223
  50. data/doc/files/lib/spider/included_in_memcached_rb.html +0 -142
  51. data/doc/files/lib/spider/next_urls_in_sqs_rb.html +0 -144
  52. data/doc/files/lib/spider/robot_rules_rb.html +0 -114
  53. data/doc/files/lib/spider/spider_instance_rb.html +0 -117
  54. data/doc/files/lib/spider_rb.html +0 -254
  55. data/doc/files/spec/spec_helper_rb.html +0 -196
  56. data/doc/files/spec/spec_helper_rb.src/M000001.html +0 -20
  57. data/doc/files/spec/spec_helper_rb.src/M000002.html +0 -26
  58. data/doc/files/spec/spec_helper_rb.src/M000003.html +0 -24
  59. data/doc/files/spec/spec_helper_rb.src/M000004.html +0 -18
  60. data/doc/files/spec/spec_helper_rb.src/M000005.html +0 -23
  61. data/doc/files/spec/spider/included_in_memcached_spec_rb.html +0 -142
  62. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000006.html +0 -19
  63. data/doc/files/spec/spider/included_in_memcached_spec_rb.src/M000007.html +0 -18
  64. data/doc/files/spec/spider/spider_instance_spec_rb.html +0 -210
  65. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000008.html +0 -21
  66. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000009.html +0 -19
  67. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000010.html +0 -19
  68. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000011.html +0 -27
  69. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000012.html +0 -26
  70. data/doc/files/spec/spider/spider_instance_spec_rb.src/M000013.html +0 -27
  71. data/doc/files/spec/spider_spec_rb.html +0 -127
  72. data/doc/files/spec/spider_spec_rb.src/M000014.html +0 -23
  73. data/doc/fr_class_index.html +0 -34
  74. data/doc/fr_file_index.html +0 -35
  75. data/doc/fr_method_index.html +0 -64
  76. data/doc/index.html +0 -24
  77. data/doc/rdoc-style.css +0 -208
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: b87ed979c115546fa802f888fea0baf322f458be6e50e40f8ea8fd9ee392c8ac
+   data.tar.gz: eabe506949614a5622afa2def1da954d352d737dccadd0622d060be13c061115
+ SHA512:
+   metadata.gz: ab52efe227f19067dd52efb0333d5901d3725f4e88fa0b86942c12bd702efa6b3bf4ed72b03d5ce58b32a22c574b0ee31764fa5e37df07a57132c250bf6b0658
+   data.tar.gz: ede92b88eb09867c41c1f7fcca58c99a55e63f02bb54fa094ece40e8875240734aa9abdcaf043db003120a44b239198dc6d063779543927febb493116732b7f3
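
These digests let you verify a downloaded copy of the gem's artifacts. A minimal sketch using Ruby's standard Digest library (the local file paths are hypothetical):

```ruby
require 'digest'

# Hypothetical paths to the unpacked gem artifacts.
%w[metadata.gz data.tar.gz].each do |artifact|
  # Compare these hex digests against the values in checksums.yaml.
  puts "#{artifact} SHA256: #{Digest::SHA256.file(artifact).hexdigest}"
  puts "#{artifact} SHA512: #{Digest::SHA512.file(artifact).hexdigest}"
end
```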
data/AUTHORS ADDED
@@ -0,0 +1,17 @@
+ The Ruby Spider Gem would not be what it is today without the help of
+ the following kind souls:
+
+ Alexandre Rousseau
+ Brian Campbell
+ Henri Cook
+ James Edward Gray II
+ Jeremy Evans
+ Joao Eriberto Mota Filho
+ John Buckley
+ John Nagro
+ Matt Horan
+ Marc (@brigriffin)
+ Mike Burns (original author)
+ Olle Jonsson
+ Sander van der Vliet
+ Stuart Yamartino
data/CHANGES CHANGED
@@ -1,3 +1,19 @@
+ 2018-04-23 v0.5.3
+ * release simply to add missing CHANGES notes
+
+ 2018-04-23 v0.5.2
+ * fixed #2 thanks to @jeremyevans
+ * added Redis as cache wrapper thanks to @brigriffin
+
+ 2016-09-04 v0.5.1
+ * added the ability to stop a crawl
+
+ 2016-05-13 v0.5.0
+ * fixed #1 thanks to @eribertomota
+ * got it running on more recent versions of ruby
+ * cleaned up the docs a bit
+ * cleaned up the licensing and attribution
+
  2009-05-21
  * fixed an issue with robots.txt on ssl hosts
  * fixed an issue with pulling robots.txt from disallowed hosts
data/LICENSE ADDED
@@ -0,0 +1,21 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2007-2016 Spider Team Authors
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
data/{README → README.md} RENAMED
@@ -1,66 +1,100 @@
 
- Spider, a Web spidering library for Ruby. It handles the robots.txt,
- scraping, collecting, and looping so that you can just handle the data.
+ # Spider
+ _a Web spidering library for Ruby. It handles the robots.txt,
+ scraping, collecting, and looping so that you can just handle the data._
 
- == Examples
+ ## Examples
 
- === Crawl the Web, loading each page in turn, until you run out of memory
+ ### Crawl the Web, loading each page in turn, until you run out of memory
 
+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') {}
+ Spider.start_at('http://cashcats.biz/') {}
+ ```
 
- === To handle erroneous responses
+ ### To handle erroneous responses
 
+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
    s.on :failure do |a_url, resp, prior_url|
      puts "URL failed: #{a_url}"
      puts " linked from #{prior_url}"
    end
  end
+ ```
 
- === Or handle successful responses
+ ### Or handle successful responses
 
+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
    s.on :success do |a_url, resp, prior_url|
      puts "#{a_url}: #{resp.code}"
      puts resp.body
      puts
    end
  end
+ ```
 
- === Limit to just one domain
+ ### Limit to just one domain
 
+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
    s.add_url_check do |a_url|
-     a_url =~ %r{^http://mike-burns.com.*}
+     a_url =~ %r{^http://cashcats.biz.*}
    end
  end
+ ```
 
- === Pass headers to some requests
+ ### Pass headers to some requests
 
+ ```ruby
  require 'spider'
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
    s.setup do |a_url|
      if a_url =~ %r{^http://.*wikipedia.*}
        headers['User-Agent'] = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
      end
    end
  end
+ ```
 
- === Use memcached to track cycles
+ ### Use memcached to track cycles
 
+ ```ruby
  require 'spider'
  require 'spider/included_in_memcached'
  SERVERS = ['10.0.10.2:11211','10.0.10.3:11211','10.0.10.4:11211']
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
    s.check_already_seen_with IncludedInMemcached.new(SERVERS)
  end
+ ```
 
- === Track cycles with a custom object
+ ### Use Redis to track cycles
 
+ ```ruby
+ require 'spider'
+ require 'spider/included_in_redis'
+ Spider.start_at('http://cashcats.biz/') do |s|
+   s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379)
+ end
+ ```
+
+ ### Use a plain text file to track cycles
+
+ ```ruby
+ require 'spider'
+ require 'spider/included_in_file'
+ Spider.start_at('http://cashcats.biz/') do |s|
+   s.check_already_seen_with IncludedInFile.new('/tmp/cashcats_crawl.txt')
+ end
+ ```
+
+ ### Track cycles with a custom object
+
+ ```ruby
  require 'spider'
  class ExpireLinks < Hash
    def <<(v)
@@ -71,54 +105,62 @@ scraping, collecting, and looping so that you can just handle the data.
    end
  end
 
- Spider.start_at('http://mike-burns.com/') do |s|
+ Spider.start_at('http://cashcats.biz/') do |s|
    s.check_already_seen_with ExpireLinks.new
  end
+ ```
 
- === Store nodes to visit with Amazon SQS
+ ### Store nodes to visit with Amazon SQS
 
+ ```ruby
  require 'spider'
  require 'spider/next_urls_in_sqs'
- Spider.start_at('http://mike-burns.com') do |s|
+ Spider.start_at('http://cashcats.biz') do |s|
    s.store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)
  end
+ ```
 
- ==== Store nodes to visit with a custom object
+ ### Store nodes to visit with a custom object
 
+ ```ruby
  require 'spider'
  class MyArray < Array
    def pop
-     super
+     super
    end
-
+
    def push(a_msg)
      super(a_msg)
    end
  end
 
- Spider.start_at('http://mike-burns.com') do |s|
+ Spider.start_at('http://cashcats.biz') do |s|
    s.store_next_urls_with MyArray.new
  end
+ ```
 
- === Create a URL graph
+ ### Create a URL graph
 
+ ```ruby
  require 'spider'
  nodes = {}
- Spider.start_at('http://mike-burns.com/') do |s|
-   s.add_url_check {|a_url| a_url =~ %r{^http://mike-burns.com.*} }
+ Spider.start_at('http://cashcats.biz/') do |s|
+   s.add_url_check {|a_url| a_url =~ %r{^http://cashcats.biz.*} }
 
    s.on(:every) do |a_url, resp, prior_url|
      nodes[prior_url] ||= []
      nodes[prior_url] << a_url
    end
  end
+ ```
 
- === Use a proxy
+ ### Use a proxy
 
+ ```ruby
  require 'net/http_configuration'
  require 'spider'
  http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
-                                          :proxy_port => 8881)
+                                          :proxy_port => 8881)
  http_conf.apply do
    Spider.start_at('http://img.4chan.org/b/') do |s|
      s.on(:success) do |a_url, resp, prior_url|
@@ -128,19 +170,6 @@ scraping, collecting, and looping so that you can just handle the data.
      end
    end
  end
+ ```
 
- == Author
-
- John Nagro john.nagro@gmail.com
-
- Mike Burns http://mike-burns.com mike@mike-burns.com (original author)
-
- Many thanks to:
-   Matt Horan
-   Henri Cook
-   Sander van der Vliet
-   John Buckley
-   Brian Campbell
-
- With `robot_rules' from James Edward Gray II via
- http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/177589
+ _Copyright (c) 2007-2016 Spider Team Authors_
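
The hunk boundary above elides the body of the ExpireLinks example. A self-contained sketch of one tracker that satisfies the same new/`<<`/`include?` contract (the one-hour expiry window and the method bodies are assumptions, not the gem's code):

```ruby
require 'spider'

# Hypothetical expiring cycle tracker: a URL is remembered for an hour,
# after which the spider may visit it again.
class ExpireLinks < Hash
  TTL = 3600 # seconds; an assumed window

  # Record when the URL was seen.
  def <<(url)
    self[url.to_s] = Time.now
  end

  # A URL counts as "seen" only while its entry is younger than TTL.
  def include?(url)
    seen_at = self[url.to_s]
    !seen_at.nil? && (Time.now - seen_at) < TTL
  end
end

Spider.start_at('http://cashcats.biz/') do |s|
  s.check_already_seen_with ExpireLinks.new
end
```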
data/lib/spider.rb CHANGED
@@ -1,41 +1,24 @@
- # Copyright 2007-2008 Mike Burns & John Nagro
- # :include: README
-
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #     * Redistributions of source code must retain the above copyright
- #       notice, this list of conditions and the following disclaimer.
- #     * Redistributions in binary form must reproduce the above copyright
- #       notice, this list of conditions and the following disclaimer in the
- #       documentation and/or other materials provided with the distribution.
- #     * Neither the name Mike Burns nor the
- #       names of his contributors may be used to endorse or promote products
- #       derived from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
- # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
  require File.dirname(__FILE__)+'/spider/spider_instance'
 
  # A spidering library for Ruby. Handles robots.txt, scraping, finding more
  # links, and doing it all over again.
  class Spider
+
+   VERSION_INFO = [0, 5, 4] unless defined?(self::VERSION_INFO)
+   VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
+
+   def self.version
+     VERSION
+   end
+
    # Runs the spider starting at the given URL. Also takes a block that is given
    # the SpiderInstance. Use the block to define the rules and handlers for
    # the discovered Web pages. See SpiderInstance for the possible rules and
    # handlers.
    #
-   #  Spider.start_at('http://mike-burns.com/') do |s|
+   #  Spider.start_at('http://cashcats.biz/') do |s|
    #    s.add_url_check do |a_url|
-   #      a_url =~ %r{^http://mike-burns.com.*}
+   #      a_url =~ %r{^http://cashcats.biz.*}
    #    end
    #
    #    s.on 404 do |a_url, resp, prior_url|
@@ -52,8 +35,8 @@ class Spider
    #    end
 
    def self.start_at(a_url, &block)
-     rules = RobotRules.new('Ruby Spider 1.0')
-     a_spider = SpiderInstance.new({nil => a_url}, [], rules, [])
+     rules = RobotRules.new("Ruby Spider #{Spider::VERSION}")
+     a_spider = SpiderInstance.new({nil => [a_url]}, [], rules, [])
      block.call(a_spider)
      a_spider.start!
    end
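
The new constants give callers a programmatic way to read the gem version, and `start_at` now seeds SpiderInstance with `{nil => [a_url]}`, wrapping the start URL in an array. A quick sketch of the surface this adds:

```ruby
require 'spider'

Spider::VERSION   # => "0.5.4"
Spider.version    # => "0.5.4"

# The robots.txt user agent is now derived from the version,
# e.g. "Ruby Spider 0.5.4" instead of the old hard-coded "Ruby Spider 1.0".
```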
data/lib/spider/included_in_file.rb ADDED
@@ -0,0 +1,32 @@
+ # Use a plain text file to track cycles.
+
+ # A specialized class using a plain-text file to track items stored. It supports
+ # three operations: new, <<, and include? . Together these can be used to
+ # add items to the text file, then determine whether the item has been added.
+ #
+ # To use it with Spider use the check_already_seen_with method:
+ #
+ #   Spider.start_at('http://example.com/') do |s|
+ #     s.check_already_seen_with IncludedInFile.new('/tmp/crawled.log')
+ #   end
+ class IncludedInFile
+   # Construct a new IncludedInFile instance.
+   # @param filepath [String] path of the file in which crawled URLs are stored
+   def initialize(filepath)
+     @filepath = filepath
+     # create the file if it does not exist
+     File.write(@filepath, '') unless File.file?(@filepath)
+     @urls = File.readlines(@filepath).map(&:chomp)
+   end
+
+   # Add an item to the file & the in-memory array of URLs.
+   def <<(v)
+     @urls << v.to_s
+     File.write(@filepath, "#{v}\r\n", File.size(@filepath), mode: 'a')
+   end
+
+   # True if the item is in the file.
+   def include?(v)
+     @urls.include? v.to_s
+   end
+ end
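
A short usage sketch for the new file-backed tracker (the path is arbitrary). Because the URL list is reloaded from disk on construction, a crawl can pick up where a previous run left off:

```ruby
require 'spider'
require 'spider/included_in_file'

Spider.start_at('http://example.com/') do |s|
  # Each visited URL is appended to the file, one per line.
  s.check_already_seen_with IncludedInFile.new('/tmp/crawled.log')
end
```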
data/lib/spider/included_in_memcached.rb CHANGED
@@ -1,32 +1,9 @@
  # Use memcached to track cycles.
- #
- # Copyright 2007 Mike Burns
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #     * Redistributions of source code must retain the above copyright
- #       notice, this list of conditions and the following disclaimer.
- #     * Redistributions in binary form must reproduce the above copyright
- #       notice, this list of conditions and the following disclaimer in the
- #       documentation and/or other materials provided with the distribution.
- #     * Neither the name Mike Burns nor the
- #       names of his contributors may be used to endorse or promote products
- #       derived from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
- # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  require 'memcache'
 
  # A specialized class using memcached to track items stored. It supports
- # three operations: new, <<, and include? . Together these can be used to
+ # three operations: new, <<, and include? . Together these can be used to
  # add items to the memcache, then determine whether the item has been added.
  #
  # To use it with Spider use the check_already_seen_with method:
data/lib/spider/included_in_redis.rb ADDED
@@ -0,0 +1,31 @@
+ # Use Redis to track cycles.
+
+ require 'redis'
+ require 'json'
+
+ # A specialized class using Redis to track items stored. It supports
+ # three operations: new, <<, and include? . Together these can be used to
+ # add items to Redis, then determine whether the item has been added.
+ #
+ # To use it with Spider use the check_already_seen_with method:
+ #
+ #   Spider.start_at('http://example.com/') do |s|
+ #     s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379)
+ #   end
+ class IncludedInRedis
+   # Construct a new IncludedInRedis instance. All arguments here are
+   # passed to Redis (part of the redis gem).
+   def initialize(*a)
+     @c = Redis.new(*a)
+   end
+
+   # Add an item to Redis
+   def <<(v)
+     @c.set(v.to_s, v.to_json)
+   end
+
+   # True if the item is in Redis
+   def include?(v)
+     @c.get(v.to_s) == v.to_json
+   end
+ end
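
The Redis-backed tracker keys each URL by its string form and stores the JSON encoding as the value. The duck-typed contract it fulfils can be exercised directly (assuming a Redis server on localhost):

```ruby
require 'spider/included_in_redis'

cache = IncludedInRedis.new(host: '127.0.0.1', port: 6379)
cache << 'http://example.com/'         # SET "http://example.com/" => JSON string
cache.include?('http://example.com/')  # => true once added
```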
data/lib/spider/next_urls_in_sqs.rb CHANGED
@@ -1,34 +1,11 @@
  # Use AmazonSQS to track nodes to visit.
- #
- # Copyright 2008 John Nagro
- # Redistribution and use in source and binary forms, with or without
- # modification, are permitted provided that the following conditions are met:
- #     * Redistributions of source code must retain the above copyright
- #       notice, this list of conditions and the following disclaimer.
- #     * Redistributions in binary form must reproduce the above copyright
- #       notice, this list of conditions and the following disclaimer in the
- #       documentation and/or other materials provided with the distribution.
- #     * Neither the name Mike Burns nor the
- #       names of his contributors may be used to endorse or promote products
- #       derived from this software without specific prior written permission.
- #
- # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
- # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- # DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
- # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  require 'rubygems'
  require 'right_aws'
  require 'yaml'
 
  # A specialized class using AmazonSQS to track nodes to walk. It supports
- # two operations: push and pop . Together these can be used to
+ # two operations: push and pop . Together these can be used to
  # add items to the queue, then pull items off the queue.
  #
  # This is useful if you want multiple Spider processes crawling the same
@@ -47,8 +24,8 @@ class NextUrlsInSQS
    @sqs = RightAws::SqsGen2.new(aws_access_key, aws_secret_access_key)
    @queue = @sqs.queue(queue_name)
  end
-
- # Pull an item off the queue, loop until data is found. Data is
+
+ # Pull an item off the queue, loop until data is found. Data is
  # encoded with YAML.
  def pop
    while true
@@ -57,10 +34,10 @@ class NextUrlsInSQS
      sleep 5
    end
  end
-
+
  # Put data on the queue. Data is encoded with YAML.
  def push(a_msg)
    encoded_message = YAML::dump(a_msg)
    @queue.push(a_msg)
- end
- end
+ end
+ end
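
Any object that answers push and pop can stand in for NextUrlsInSQS via `store_next_urls_with`. A minimal in-memory sketch of that contract (a hypothetical class, not part of the gem):

```ruby
require 'spider'

# Hypothetical FIFO store satisfying the push/pop contract
# expected by store_next_urls_with.
class InMemoryQueue
  def initialize
    @items = []
  end

  # Enqueue a batch of URLs to visit.
  def push(a_msg)
    @items.push(a_msg)
  end

  # Dequeue the oldest batch (nil when empty).
  def pop
    @items.shift
  end
end

Spider.start_at('http://cashcats.biz/') do |s|
  s.store_next_urls_with InMemoryQueue.new
end
```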