spider 0.5.3 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/AUTHORS +3 -0
- data/README.md +12 -2
- data/lib/spider.rb +1 -1
- data/lib/spider/included_in_file.rb +32 -0
- data/spider.gemspec +0 -1
- metadata +5 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: b87ed979c115546fa802f888fea0baf322f458be6e50e40f8ea8fd9ee392c8ac
|
4
|
+
data.tar.gz: eabe506949614a5622afa2def1da954d352d737dccadd0622d060be13c061115
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ab52efe227f19067dd52efb0333d5901d3725f4e88fa0b86942c12bd702efa6b3bf4ed72b03d5ce58b32a22c574b0ee31764fa5e37df07a57132c250bf6b0658
|
7
|
+
data.tar.gz: ede92b88eb09867c41c1f7fcca58c99a55e63f02bb54fa094ece40e8875240734aa9abdcaf043db003120a44b239198dc6d063779543927febb493116732b7f3
|
data/AUTHORS
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
The Ruby Spider Gem would not be what it is today without the help of
|
2
2
|
the following kind souls:
|
3
3
|
|
4
|
+
Alexandre Rousseau
|
4
5
|
Brian Campbell
|
5
6
|
Henri Cook
|
6
7
|
James Edward Gray II
|
@@ -11,4 +12,6 @@ John Nagro
|
|
11
12
|
Matt Horan
|
12
13
|
Marc (@brigriffin)
|
13
14
|
Mike Burns (original author)
|
15
|
+
Olle Jonsson
|
14
16
|
Sander van der Vliet
|
17
|
+
Stuart Yamartino
|
data/README.md
CHANGED
@@ -82,6 +82,16 @@ scraping, collecting, and looping so that you can just handle the data._
|
|
82
82
|
end
|
83
83
|
```
|
84
84
|
|
85
|
+
### Use a plain text file to track cycles
|
86
|
+
|
87
|
+
```ruby
|
88
|
+
require 'spider'
|
89
|
+
require 'spider/included_in_file'
|
90
|
+
Spider.start_at('http://cashcats.biz/') do |s|
|
91
|
+
s.check_already_seen_with IncludedInFile.new('/tmp/cashcats_crawl.txt')
|
92
|
+
end
|
93
|
+
```
|
94
|
+
|
85
95
|
### Track cycles with a custom object
|
86
96
|
|
87
97
|
```ruby
|
@@ -116,7 +126,7 @@ scraping, collecting, and looping so that you can just handle the data._
|
|
116
126
|
require 'spider'
|
117
127
|
class MyArray < Array
|
118
128
|
def pop
|
119
|
-
|
129
|
+
super
|
120
130
|
end
|
121
131
|
|
122
132
|
def push(a_msg)
|
@@ -150,7 +160,7 @@ scraping, collecting, and looping so that you can just handle the data._
|
|
150
160
|
require 'net/http_configuration'
|
151
161
|
require 'spider'
|
152
162
|
http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
|
153
|
-
:proxy_port => 8881)
|
163
|
+
:proxy_port => 8881)
|
154
164
|
http_conf.apply do
|
155
165
|
Spider.start_at('http://img.4chan.org/b/') do |s|
|
156
166
|
s.on(:success) do |a_url, resp, prior_url|
|
data/lib/spider.rb
CHANGED
@@ -4,7 +4,7 @@ require File.dirname(__FILE__)+'/spider/spider_instance'
|
|
4
4
|
# links, and doing it all over again.
|
5
5
|
class Spider
|
6
6
|
|
7
|
-
VERSION_INFO = [0, 5, 3] unless defined?(self::VERSION_INFO)
|
7
|
+
VERSION_INFO = [0, 5, 4] unless defined?(self::VERSION_INFO)
|
8
8
|
VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
|
9
9
|
|
10
10
|
def self.version
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# Use plain text file to track cycles.
|
2
|
+
|
3
|
+
# A specialized class using a plain text file to track items stored. It supports
|
4
|
+
# three operations: new, <<, and include? . Together these can be used to
|
5
|
+
# add items to the text file, then determine whether the item has been added.
|
6
|
+
#
|
7
|
+
# To use it with Spider use the check_already_seen_with method:
|
8
|
+
#
|
9
|
+
# Spider.start_at('http://example.com/') do |s|
|
10
|
+
# s.check_already_seen_with IncludedInFile.new('/tmp/crawled.log')
|
11
|
+
# end
|
12
|
+
class IncludedInFile
|
13
|
+
# Construct a new IncludedInFile instance.
|
14
|
+
# @param filepath [String] path of the file in which crawled URLs are stored
|
15
|
+
def initialize(filepath)
|
16
|
+
@filepath = filepath
|
17
|
+
# create file if not exists
|
18
|
+
File.write(@filepath, '') unless File.file?(@filepath)
|
19
|
+
@urls = File.readlines(@filepath).map(&:chomp)
|
20
|
+
end
|
21
|
+
|
22
|
+
# Add an item to the file and to the in-memory array of URLs.
|
23
|
+
def <<(v)
|
24
|
+
@urls << v.to_s
|
25
|
+
File.write(@filepath, "#{v}\r\n", File.size(@filepath), mode: 'a')
|
26
|
+
end
|
27
|
+
|
28
|
+
# True if the item is in the file.
|
29
|
+
def include?(v)
|
30
|
+
@urls.include? v.to_s
|
31
|
+
end
|
32
|
+
end
|
data/spider.gemspec
CHANGED
@@ -9,7 +9,6 @@ spec = Gem::Specification.new do |s|
|
|
9
9
|
s.has_rdoc = true
|
10
10
|
s.homepage = 'https://github.com/johnnagro/spider'
|
11
11
|
s.name = 'spider'
|
12
|
-
s.rubyforge_project = 'spider'
|
13
12
|
s.summary = 'A Web spidering library'
|
14
13
|
s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i }
|
15
14
|
s.require_path = 'lib'
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.3
|
4
|
+
version: 0.5.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- John Nagro
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-06-04 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: |
|
14
14
|
A Web spidering library: handles robots.txt, scraping, finding more
|
@@ -23,6 +23,7 @@ files:
|
|
23
23
|
- LICENSE
|
24
24
|
- README.md
|
25
25
|
- lib/spider.rb
|
26
|
+
- lib/spider/included_in_file.rb
|
26
27
|
- lib/spider/included_in_memcached.rb
|
27
28
|
- lib/spider/included_in_redis.rb
|
28
29
|
- lib/spider/next_urls_in_sqs.rb
|
@@ -53,8 +54,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
53
54
|
- !ruby/object:Gem::Version
|
54
55
|
version: '0'
|
55
56
|
requirements: []
|
56
|
-
rubyforge_project:
|
57
|
-
rubygems_version: 2.
|
57
|
+
rubyforge_project:
|
58
|
+
rubygems_version: 2.7.6
|
58
59
|
signing_key:
|
59
60
|
specification_version: 4
|
60
61
|
summary: A Web spidering library
|