spider 0.5.3 → 0.5.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/AUTHORS +3 -0
- data/README.md +12 -2
- data/lib/spider.rb +1 -1
- data/lib/spider/included_in_file.rb +32 -0
- data/spider.gemspec +0 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: b87ed979c115546fa802f888fea0baf322f458be6e50e40f8ea8fd9ee392c8ac
|
4
|
+
data.tar.gz: eabe506949614a5622afa2def1da954d352d737dccadd0622d060be13c061115
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ab52efe227f19067dd52efb0333d5901d3725f4e88fa0b86942c12bd702efa6b3bf4ed72b03d5ce58b32a22c574b0ee31764fa5e37df07a57132c250bf6b0658
|
7
|
+
data.tar.gz: ede92b88eb09867c41c1f7fcca58c99a55e63f02bb54fa094ece40e8875240734aa9abdcaf043db003120a44b239198dc6d063779543927febb493116732b7f3
|
data/AUTHORS
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
The Ruby Spider Gem would not be what it is today without the help of
|
2
2
|
the following kind souls:
|
3
3
|
|
4
|
+
Alexandre Rousseau
|
4
5
|
Brian Campbell
|
5
6
|
Henri Cook
|
6
7
|
James Edward Gray II
|
@@ -11,4 +12,6 @@ John Nagro
|
|
11
12
|
Matt Horan
|
12
13
|
Marc (@brigriffin)
|
13
14
|
Mike Burns (original author)
|
15
|
+
Olle Jonsson
|
14
16
|
Sander van der Vliet
|
17
|
+
Stuart Yamartino
|
data/README.md
CHANGED
@@ -82,6 +82,16 @@ scraping, collecting, and looping so that you can just handle the data._
|
|
82
82
|
end
|
83
83
|
```
|
84
84
|
|
85
|
+
### Use Plain text to track cycles
|
86
|
+
|
87
|
+
```ruby
|
88
|
+
require 'spider'
|
89
|
+
require 'spider/included_in_redis'
|
90
|
+
Spider.start_at('http://cashcats.biz/') do |s|
|
91
|
+
s.check_already_seen_with IncludedInFile.new('/tmp/cashcats_crawl.txt')
|
92
|
+
end
|
93
|
+
```
|
94
|
+
|
85
95
|
### Track cycles with a custom object
|
86
96
|
|
87
97
|
```ruby
|
@@ -116,7 +126,7 @@ scraping, collecting, and looping so that you can just handle the data._
|
|
116
126
|
require 'spider'
|
117
127
|
class MyArray < Array
|
118
128
|
def pop
|
119
|
-
|
129
|
+
super
|
120
130
|
end
|
121
131
|
|
122
132
|
def push(a_msg)
|
@@ -150,7 +160,7 @@ scraping, collecting, and looping so that you can just handle the data._
|
|
150
160
|
require 'net/http_configuration'
|
151
161
|
require 'spider'
|
152
162
|
http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
|
153
|
-
:proxy_port => 8881)
|
163
|
+
:proxy_port => 8881)
|
154
164
|
http_conf.apply do
|
155
165
|
Spider.start_at('http://img.4chan.org/b/') do |s|
|
156
166
|
s.on(:success) do |a_url, resp, prior_url|
|
data/lib/spider.rb
CHANGED
@@ -4,7 +4,7 @@ require File.dirname(__FILE__)+'/spider/spider_instance'
|
|
4
4
|
# links, and doing it all over again.
|
5
5
|
class Spider
|
6
6
|
|
7
|
-
VERSION_INFO = [0, 5, 3] unless defined?(self::VERSION_INFO)
|
7
|
+
VERSION_INFO = [0, 5, 4] unless defined?(self::VERSION_INFO)
|
8
8
|
VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
|
9
9
|
|
10
10
|
def self.version
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# Use plain text file to track cycles.
|
2
|
+
|
3
|
+
# A specialized class using a plain text to track items stored. It supports
|
4
|
+
# three operations: new, <<, and include? . Together these can be used to
|
5
|
+
# add items to the text file, then determine whether the item has been added.
|
6
|
+
#
|
7
|
+
# To use it with Spider use the check_already_seen_with method:
|
8
|
+
#
|
9
|
+
# Spider.start_at('http://example.com/') do |s|
|
10
|
+
# s.check_already_seen_with IncludedInFile.new('/tmp/crawled.log')
|
11
|
+
# end
|
12
|
+
class IncludedInFile
|
13
|
+
# Construct a new IncludedInFile instance.
|
14
|
+
# @param filepath [String] as path of file to store crawled URL
|
15
|
+
def initialize(filepath)
|
16
|
+
@filepath = filepath
|
17
|
+
# create file if not exists
|
18
|
+
File.write(@filepath, '') unless File.file?(@filepath)
|
19
|
+
@urls = File.readlines(@filepath).map(&:chomp)
|
20
|
+
end
|
21
|
+
|
22
|
+
# Add an item to the file & array of URL.
|
23
|
+
def <<(v)
|
24
|
+
@urls << v.to_s
|
25
|
+
File.write(@filepath, "#{v}\r\n", File.size(@filepath), mode: 'a')
|
26
|
+
end
|
27
|
+
|
28
|
+
# True if the item is in the file.
|
29
|
+
def include?(v)
|
30
|
+
@urls.include? v.to_s
|
31
|
+
end
|
32
|
+
end
|
data/spider.gemspec
CHANGED
@@ -9,7 +9,6 @@ spec = Gem::Specification.new do |s|
|
|
9
9
|
s.has_rdoc = true
|
10
10
|
s.homepage = 'https://github.com/johnnagro/spider'
|
11
11
|
s.name = 'spider'
|
12
|
-
s.rubyforge_project = 'spider'
|
13
12
|
s.summary = 'A Web spidering library'
|
14
13
|
s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i }
|
15
14
|
s.require_path = 'lib'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.3
|
4
|
+
version: 0.5.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Nagro
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-06-04 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: |
|
14
14
|
A Web spidering library: handles robots.txt, scraping, finding more
|
@@ -23,6 +23,7 @@ files:
|
|
23
23
|
- LICENSE
|
24
24
|
- README.md
|
25
25
|
- lib/spider.rb
|
26
|
+
- lib/spider/included_in_file.rb
|
26
27
|
- lib/spider/included_in_memcached.rb
|
27
28
|
- lib/spider/included_in_redis.rb
|
28
29
|
- lib/spider/next_urls_in_sqs.rb
|
@@ -53,8 +54,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
53
54
|
- !ruby/object:Gem::Version
|
54
55
|
version: '0'
|
55
56
|
requirements: []
|
56
|
-
rubyforge_project:
|
57
|
-
rubygems_version: 2.
|
57
|
+
rubyforge_project:
|
58
|
+
rubygems_version: 2.7.6
|
58
59
|
signing_key:
|
59
60
|
specification_version: 4
|
60
61
|
summary: A Web spidering library
|