spider 0.5.1 → 0.5.2

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ae84ef471855de9c49c3499eb658d961342a4338
4
- data.tar.gz: cf80d52423709af79478ff051f212dea0b8021a2
3
+ metadata.gz: 1303a4df64456b14c15c76ef68959829f83458f4
4
+ data.tar.gz: a178f2d2d961b175eded73c24c96eea5ad763502
5
5
  SHA512:
6
- metadata.gz: 72785824697410005b8738a32e74d71a78d4623522e7f3f4bb56318349c12d693cee8a8a003766c33afec0b70bb195ce079117413543d9a004fe7adfae03b9d8
7
- data.tar.gz: 8bdd202aa793c3f39984e3394e915c915c2fd7d68b7f475c0d317103d31c4a61bdc3ceaa87cfea02a77ec2cdbf2e23cc245a4e080c211c03322ce729b5551bb8
6
+ metadata.gz: 6c556fbf8dc472250d9740b216983a6844263388b8fa18956858052606003685c2f84da8ac2815621295db086d78331181a353676600785df5bfdec7a39dae40
7
+ data.tar.gz: 9680ebca3b44c55c8f952e7f2903526f0267013967a2d38eab524970f1b702817515470cd48a04ba3ad3fbb46ccaf92ff47e911df4c62e1589ef5482e28a8b33
data/AUTHORS CHANGED
@@ -4,9 +4,11 @@ the following kind souls:
4
4
  Brian Campbell
5
5
  Henri Cook
6
6
  James Edward Gray II
7
+ Jeremy Evans
7
8
  Joao Eriberto Mota Filho
8
9
  John Buckley
9
10
  John Nagro
10
11
  Matt Horan
12
+ Marc (@brigriffin)
11
13
  Mike Burns (original author)
12
14
  Sander van der Vliet
data/README.md CHANGED
@@ -72,6 +72,16 @@ scraping, collecting, and looping so that you can just handle the data._
72
72
  end
73
73
  ```
74
74
 
75
+ ### Use Redis to track cycles
76
+
77
+ ```ruby
78
+ require 'spider'
79
+ require 'spider/included_in_redis'
80
+ Spider.start_at('http://cashcats.biz/') do |s|
81
+ s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379)
82
+ end
83
+ ```
84
+
75
85
  ### Track cycles with a custom object
76
86
 
77
87
  ```ruby
data/lib/spider.rb CHANGED
@@ -4,7 +4,7 @@ require File.dirname(__FILE__)+'/spider/spider_instance'
4
4
  # links, and doing it all over again.
5
5
  class Spider
6
6
 
7
- VERSION_INFO = [0, 5, 1] unless defined?(self::VERSION_INFO)
7
+ VERSION_INFO = [0, 5, 2] unless defined?(self::VERSION_INFO)
8
8
  VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
9
9
 
10
10
  def self.version
@@ -0,0 +1,31 @@
1
+ # Use Redis to track cycles.
2
+
3
+ require 'redis'
4
+ require 'json'
5
+
6
+ # A specialized class using Redis to track items stored. It supports
7
+ # three operations (new, <<, and include?). Together these can be used to
8
+ # add items to Redis, then determine whether the item has been added.
9
+ #
10
+ # To use it with Spider use the check_already_seen_with method:
11
+ #
12
+ # Spider.start_at('http://example.com/') do |s|
13
+ # s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379)
14
+ # end
15
+ class IncludedInRedis
16
+ # Construct a new IncludedInRedis instance. All arguments here are
17
+ # passed to Redis (part of the redis gem).
18
+ def initialize(*a)
19
+ @c = Redis.new(*a)
20
+ end
21
+
22
+ # Add an item to Redis
23
+ def <<(v)
24
+ @c.set(v.to_s, v.to_json)
25
+ end
26
+
27
+ # True if the item is in Redis
28
+ def include?(v)
29
+ @c.get(v.to_s) == v.to_json
30
+ end
31
+ end
@@ -123,7 +123,7 @@ class SpiderInstance
123
123
  def on(code, p = nil, &block)
124
124
  f = p ? p : block
125
125
  case code
126
- when Fixnum
126
+ when Integer
127
127
  @callbacks[code] = f
128
128
  else
129
129
  @callbacks[code.to_sym] = f
@@ -0,0 +1,43 @@
1
+ require File.dirname(__FILE__)+'/../spec_helper'
2
+
3
+ def before_specing_redis
4
+ local_require 'spider/included_in_redis'
5
+ system('redis-server 127.0.0.1:6379')
6
+ end
7
+
8
+ def after_specing_redis
9
+ system('kill -KILL `pidof redis-server`')
10
+ end
11
+
12
+ Spec::Runner.configure { |c| c.mock_with :mocha }
13
+
14
+ describe 'Object to halt cycles' do
15
+ before do
16
+ before_specing_redis
17
+ end
18
+
19
+ it 'should understand <<' do
20
+ c = IncludedInRedis.new(host: 'localhost', port: 6379)
21
+ c.should respond_to(:<<)
22
+ end
23
+
24
+ it 'should understand included?' do
25
+ c = IncludedInRedis.new(host: 'localhost', port: 6379)
26
+ c.should respond_to(:include?)
27
+ end
28
+
29
+ it 'should produce false if the object is not included' do
30
+ c = IncludedInRedis.new(host: 'localhost', port: 6379)
31
+ c.include?('a').should be_false
32
+ end
33
+
34
+ it 'should produce true if the object is included' do
35
+ c = IncludedInRedis.new(host: 'localhost', port: 6379)
36
+ c << 'a'
37
+ c.include?('a').should be_true
38
+ end
39
+
40
+ after do
41
+ after_specing_redis
42
+ end
43
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Nagro
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-09-04 00:00:00.000000000 Z
11
+ date: 2018-04-23 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: |
14
14
  A Web spidering library: handles robots.txt, scraping, finding more
@@ -24,11 +24,13 @@ files:
24
24
  - README.md
25
25
  - lib/spider.rb
26
26
  - lib/spider/included_in_memcached.rb
27
+ - lib/spider/included_in_redis.rb
27
28
  - lib/spider/next_urls_in_sqs.rb
28
29
  - lib/spider/robot_rules.rb
29
30
  - lib/spider/spider_instance.rb
30
31
  - spec/spec_helper.rb
31
32
  - spec/spider/included_in_memcached_spec.rb
33
+ - spec/spider/included_in_redis_spec.rb
32
34
  - spec/spider/spider_instance_spec.rb
33
35
  - spec/spider_spec.rb
34
36
  - spider.gemspec
@@ -52,7 +54,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
52
54
  version: '0'
53
55
  requirements: []
54
56
  rubyforge_project: spider
55
- rubygems_version: 2.6.6
57
+ rubygems_version: 2.5.2.1
56
58
  signing_key:
57
59
  specification_version: 4
58
60
  summary: A Web spidering library