spider 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/AUTHORS +2 -0
- data/README.md +10 -0
- data/lib/spider.rb +1 -1
- data/lib/spider/included_in_redis.rb +31 -0
- data/lib/spider/spider_instance.rb +1 -1
- data/spec/spider/included_in_redis_spec.rb +43 -0
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1303a4df64456b14c15c76ef68959829f83458f4
|
4
|
+
data.tar.gz: a178f2d2d961b175eded73c24c96eea5ad763502
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6c556fbf8dc472250d9740b216983a6844263388b8fa18956858052606003685c2f84da8ac2815621295db086d78331181a353676600785df5bfdec7a39dae40
|
7
|
+
data.tar.gz: 9680ebca3b44c55c8f952e7f2903526f0267013967a2d38eab524970f1b702817515470cd48a04ba3ad3fbb46ccaf92ff47e911df4c62e1589ef5482e28a8b33
|
data/AUTHORS
CHANGED
data/README.md
CHANGED
@@ -72,6 +72,16 @@ scraping, collecting, and looping so that you can just handle the data._
|
|
72
72
|
end
|
73
73
|
```
|
74
74
|
|
75
|
+
### Use Redis to track cycles
|
76
|
+
|
77
|
+
```ruby
|
78
|
+
require 'spider'
|
79
|
+
require 'spider/included_in_redis'
|
80
|
+
Spider.start_at('http://cashcats.biz/') do |s|
|
81
|
+
s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379)
|
82
|
+
end
|
83
|
+
```
|
84
|
+
|
75
85
|
### Track cycles with a custom object
|
76
86
|
|
77
87
|
```ruby
|
data/lib/spider.rb
CHANGED
@@ -4,7 +4,7 @@ require File.dirname(__FILE__)+'/spider/spider_instance'
|
|
4
4
|
# links, and doing it all over again.
|
5
5
|
class Spider
|
6
6
|
|
7
|
-
VERSION_INFO = [0, 5, 1] unless defined?(self::VERSION_INFO)
|
7
|
+
VERSION_INFO = [0, 5, 2] unless defined?(self::VERSION_INFO)
|
8
8
|
VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
|
9
9
|
|
10
10
|
def self.version
|
data/lib/spider/included_in_redis.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# Use Redis to track cycles.
|
2
|
+
|
3
|
+
require 'redis'
|
4
|
+
require 'json'
|
5
|
+
|
6
|
+
# A specialized class using Redis to track items stored. It supports
|
7
|
+
# three operations: new, <<, and include? . Together these can be used to
|
8
|
+
# add items to Redis, then determine whether the item has been added.
|
9
|
+
#
|
10
|
+
# To use it with Spider use the check_already_seen_with method:
|
11
|
+
#
|
12
|
+
# Spider.start_at('http://example.com/') do |s|
|
13
|
+
# s.check_already_seen_with IncludedInRedis.new(host: '127.0.0.1', port: 6379)
|
14
|
+
# end
|
15
|
+
class IncludedInRedis
|
16
|
+
# Construct a new IncludedInRedis instance. All arguments here are
|
17
|
+
# passed to Redis (part of the redis gem).
|
18
|
+
def initialize(*a)
|
19
|
+
@c = Redis.new(*a)
|
20
|
+
end
|
21
|
+
|
22
|
+
# Add an item to Redis
|
23
|
+
def <<(v)
|
24
|
+
@c.set(v.to_s, v.to_json)
|
25
|
+
end
|
26
|
+
|
27
|
+
# True if the item is in Redis
|
28
|
+
def include?(v)
|
29
|
+
@c.get(v.to_s) == v.to_json
|
30
|
+
end
|
31
|
+
end
|
data/spec/spider/included_in_redis_spec.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require File.dirname(__FILE__)+'/../spec_helper'
|
2
|
+
|
3
|
+
def before_specing_redis
|
4
|
+
local_require 'spider/included_in_redis'
|
5
|
+
system('redis-server 127.0.0.1:6379')
|
6
|
+
end
|
7
|
+
|
8
|
+
def after_specing_redis
|
9
|
+
system('kill -KILL `pidof redis-server`')
|
10
|
+
end
|
11
|
+
|
12
|
+
Spec::Runner.configure { |c| c.mock_with :mocha }
|
13
|
+
|
14
|
+
describe 'Object to halt cycles' do
|
15
|
+
before do
|
16
|
+
before_specing_redis
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'should understand <<' do
|
20
|
+
c = IncludedInRedis.new(host: 'localhost', port: 6379)
|
21
|
+
c.should respond_to(:<<)
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'should understand included?' do
|
25
|
+
c = IncludedInRedis.new(host: 'localhost', port: 6379)
|
26
|
+
c.should respond_to(:include?)
|
27
|
+
end
|
28
|
+
|
29
|
+
it 'should produce false if the object is not included' do
|
30
|
+
c = IncludedInRedis.new(host: 'localhost', port: 6379)
|
31
|
+
c.include?('a').should be_false
|
32
|
+
end
|
33
|
+
|
34
|
+
it 'should produce true if the object is included' do
|
35
|
+
c = IncludedInRedis.new(host: 'localhost', port: 6379)
|
36
|
+
c << 'a'
|
37
|
+
c.include?('a').should be_true
|
38
|
+
end
|
39
|
+
|
40
|
+
after do
|
41
|
+
after_specing_redis
|
42
|
+
end
|
43
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Nagro
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-04-23 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: |
|
14
14
|
A Web spidering library: handles robots.txt, scraping, finding more
|
@@ -24,11 +24,13 @@ files:
|
|
24
24
|
- README.md
|
25
25
|
- lib/spider.rb
|
26
26
|
- lib/spider/included_in_memcached.rb
|
27
|
+
- lib/spider/included_in_redis.rb
|
27
28
|
- lib/spider/next_urls_in_sqs.rb
|
28
29
|
- lib/spider/robot_rules.rb
|
29
30
|
- lib/spider/spider_instance.rb
|
30
31
|
- spec/spec_helper.rb
|
31
32
|
- spec/spider/included_in_memcached_spec.rb
|
33
|
+
- spec/spider/included_in_redis_spec.rb
|
32
34
|
- spec/spider/spider_instance_spec.rb
|
33
35
|
- spec/spider_spec.rb
|
34
36
|
- spider.gemspec
|
@@ -52,7 +54,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
52
54
|
version: '0'
|
53
55
|
requirements: []
|
54
56
|
rubyforge_project: spider
|
55
|
-
rubygems_version: 2.
|
57
|
+
rubygems_version: 2.5.2.1
|
56
58
|
signing_key:
|
57
59
|
specification_version: 4
|
58
60
|
summary: A Web spidering library
|