spider 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/AUTHORS +2 -0
- data/README.md +10 -0
- data/lib/spider.rb +1 -1
- data/lib/spider/included_in_redis.rb +31 -0
- data/lib/spider/spider_instance.rb +1 -1
- data/spec/spider/included_in_redis_spec.rb +43 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1303a4df64456b14c15c76ef68959829f83458f4
|
4
|
+
data.tar.gz: a178f2d2d961b175eded73c24c96eea5ad763502
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6c556fbf8dc472250d9740b216983a6844263388b8fa18956858052606003685c2f84da8ac2815621295db086d78331181a353676600785df5bfdec7a39dae40
|
7
|
+
data.tar.gz: 9680ebca3b44c55c8f952e7f2903526f0267013967a2d38eab524970f1b702817515470cd48a04ba3ad3fbb46ccaf92ff47e911df4c62e1589ef5482e28a8b33
|
data/AUTHORS
CHANGED
data/README.md
CHANGED
@@ -72,6 +72,16 @@ scraping, collecting, and looping so that you can just handle the data._
|
|
72
72
|
end
|
73
73
|
```
|
74
74
|
|
75
|
+
### Use Redis to track cycles
|
76
|
+
|
77
|
+
```ruby
|
78
|
+
require 'spider'
|
79
|
+
require 'spider/included_in_redis'
|
80
|
+
Spider.start_at('http://cashcats.biz/') do |s|
|
81
|
+
s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379)
|
82
|
+
end
|
83
|
+
```
|
84
|
+
|
75
85
|
### Track cycles with a custom object
|
76
86
|
|
77
87
|
```ruby
|
data/lib/spider.rb
CHANGED
@@ -4,7 +4,7 @@ require File.dirname(__FILE__)+'/spider/spider_instance'
|
|
4
4
|
# links, and doing it all over again.
|
5
5
|
class Spider
|
6
6
|
|
7
|
-
VERSION_INFO = [0, 5, 1] unless defined?(self::VERSION_INFO)
|
7
|
+
VERSION_INFO = [0, 5, 2] unless defined?(self::VERSION_INFO)
|
8
8
|
VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
|
9
9
|
|
10
10
|
def self.version
|
data/lib/spider/included_in_redis.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# Use Redis to track cycles.
|
2
|
+
|
3
|
+
require 'redis'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
# A specialized class using Redis to track items stored. It supports
|
7
|
+
# three operations: new, <<, and include? . Together these can be used to
|
8
|
+
# add items to Redis, then determine whether the item has been added.
|
9
|
+
#
|
10
|
+
# To use it with Spider use the check_already_seen_with method:
|
11
|
+
#
|
12
|
+
# Spider.start_at('http://example.com/') do |s|
|
13
|
+
# s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379)
|
14
|
+
# end
|
15
|
+
class IncludedInRedis
|
16
|
+
# Construct a new IncludedInRedis instance. All arguments here are
|
17
|
+
# passed to Redis (part of the redis gem).
|
18
|
+
def initialize(*a)
|
19
|
+
@c = Redis.new(*a)
|
20
|
+
end
|
21
|
+
|
22
|
+
# Add an item to Redis
|
23
|
+
def <<(v)
|
24
|
+
@c.set(v.to_s, v.to_json)
|
25
|
+
end
|
26
|
+
|
27
|
+
# True if the item is in Redis
|
28
|
+
def include?(v)
|
29
|
+
@c.get(v.to_s) == v.to_json
|
30
|
+
end
|
31
|
+
end
|
data/spec/spider/included_in_redis_spec.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require File.dirname(__FILE__)+'/../spec_helper'
|
2
|
+
|
3
|
+
def before_specing_redis
|
4
|
+
local_require 'spider/included_in_redis'
|
5
|
+
system('redis-server 127.0.0.1:6379')
|
6
|
+
end
|
7
|
+
|
8
|
+
def after_specing_redis
|
9
|
+
system('kill -KILL `pidof redis-server`')
|
10
|
+
end
|
11
|
+
|
12
|
+
Spec::Runner.configure { |c| c.mock_with :mocha }
|
13
|
+
|
14
|
+
describe 'Object to halt cycles' do
|
15
|
+
before do
|
16
|
+
before_specing_redis
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'should understand <<' do
|
20
|
+
c = IncludedInRedis.new(host: 'localhost', port: 6379)
|
21
|
+
c.should respond_to(:<<)
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'should understand included?' do
|
25
|
+
c = IncludedInRedis.new(host: 'localhost', port: 6379)
|
26
|
+
c.should respond_to(:include?)
|
27
|
+
end
|
28
|
+
|
29
|
+
it 'should produce false if the object is not included' do
|
30
|
+
c = IncludedInRedis.new(host: 'localhost', port: 6379)
|
31
|
+
c.include?('a').should be_false
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'should produce true if the object is included' do
|
35
|
+
c = IncludedInRedis.new(host: 'localhost', port: 6379)
|
36
|
+
c << 'a'
|
37
|
+
c.include?('a').should be_true
|
38
|
+
end
|
39
|
+
|
40
|
+
after do
|
41
|
+
after_specing_redis
|
42
|
+
end
|
43
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Nagro
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-04-23 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: |
|
14
14
|
A Web spidering library: handles robots.txt, scraping, finding more
|
@@ -24,11 +24,13 @@ files:
|
|
24
24
|
- README.md
|
25
25
|
- lib/spider.rb
|
26
26
|
- lib/spider/included_in_memcached.rb
|
27
|
+
- lib/spider/included_in_redis.rb
|
27
28
|
- lib/spider/next_urls_in_sqs.rb
|
28
29
|
- lib/spider/robot_rules.rb
|
29
30
|
- lib/spider/spider_instance.rb
|
30
31
|
- spec/spec_helper.rb
|
31
32
|
- spec/spider/included_in_memcached_spec.rb
|
33
|
+
- spec/spider/included_in_redis_spec.rb
|
32
34
|
- spec/spider/spider_instance_spec.rb
|
33
35
|
- spec/spider_spec.rb
|
34
36
|
- spider.gemspec
|
@@ -52,7 +54,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
52
54
|
version: '0'
|
53
55
|
requirements: []
|
54
56
|
rubyforge_project: spider
|
55
|
-
rubygems_version: 2.
|
57
|
+
rubygems_version: 2.5.2.1
|
56
58
|
signing_key:
|
57
59
|
specification_version: 4
|
58
60
|
summary: A Web spidering library
|