spider 0.5.3 → 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 60310268052e7e2cfe120fdd313e314b28124c5a
4
- data.tar.gz: 8747d828232c89f674e765e005df722c170a7839
2
+ SHA256:
3
+ metadata.gz: b87ed979c115546fa802f888fea0baf322f458be6e50e40f8ea8fd9ee392c8ac
4
+ data.tar.gz: eabe506949614a5622afa2def1da954d352d737dccadd0622d060be13c061115
5
5
  SHA512:
6
- metadata.gz: a58cc23eb4d464b41957ca39ac17a3d2c91d86ac71140163b2072bedc317e3b529430fd47630980c0e87668df509031078a6d0a7d63d62878d5a31300b34bc51
7
- data.tar.gz: a2391c2b8e64dd4dd5f895e9530312bd515d00981c85d857ff8261d28bdaab2a1669e1d9fc9e15fa43f62f4eba99666465cbc0bdd6378e6610daa8e043e5f156
6
+ metadata.gz: ab52efe227f19067dd52efb0333d5901d3725f4e88fa0b86942c12bd702efa6b3bf4ed72b03d5ce58b32a22c574b0ee31764fa5e37df07a57132c250bf6b0658
7
+ data.tar.gz: ede92b88eb09867c41c1f7fcca58c99a55e63f02bb54fa094ece40e8875240734aa9abdcaf043db003120a44b239198dc6d063779543927febb493116732b7f3
data/AUTHORS CHANGED
@@ -1,6 +1,7 @@
1
1
  The Ruby Spider Gem would not be what it is today without the help of
2
2
  the following kind souls:
3
3
 
4
+ Alexandre Rousseau
4
5
  Brian Campbell
5
6
  Henri Cook
6
7
  James Edward Gray II
@@ -11,4 +12,6 @@ John Nagro
11
12
  Matt Horan
12
13
  Marc (@brigriffin)
13
14
  Mike Burns (original author)
15
+ Olle Jonsson
14
16
  Sander van der Vliet
17
+ Stuart Yamartino
data/README.md CHANGED
@@ -82,6 +82,16 @@ scraping, collecting, and looping so that you can just handle the data._
82
82
  end
83
83
  ```
84
84
 
85
+ ### Use a plain text file to track cycles
86
+
87
+ ```ruby
88
+ require 'spider'
89
+ require 'spider/included_in_file'
90
+ Spider.start_at('http://cashcats.biz/') do |s|
91
+ s.check_already_seen_with IncludedInFile.new('/tmp/cashcats_crawl.txt')
92
+ end
93
+ ```
94
+
85
95
  ### Track cycles with a custom object
86
96
 
87
97
  ```ruby
@@ -116,7 +126,7 @@ scraping, collecting, and looping so that you can just handle the data._
116
126
  require 'spider'
117
127
  class MyArray < Array
118
128
  def pop
119
- super
129
+ super
120
130
  end
121
131
 
122
132
  def push(a_msg)
@@ -150,7 +160,7 @@ scraping, collecting, and looping so that you can just handle the data._
150
160
  require 'net/http_configuration'
151
161
  require 'spider'
152
162
  http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
153
- :proxy_port => 8881)
163
+ :proxy_port => 8881)
154
164
  http_conf.apply do
155
165
  Spider.start_at('http://img.4chan.org/b/') do |s|
156
166
  s.on(:success) do |a_url, resp, prior_url|
@@ -4,7 +4,7 @@ require File.dirname(__FILE__)+'/spider/spider_instance'
4
4
  # links, and doing it all over again.
5
5
  class Spider
6
6
 
7
- VERSION_INFO = [0, 5, 3] unless defined?(self::VERSION_INFO)
7
+ VERSION_INFO = [0, 5, 4] unless defined?(self::VERSION_INFO)
8
8
  VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
9
9
 
10
10
  def self.version
@@ -0,0 +1,32 @@
1
+ # Use plain text file to track cycles.
2
+
3
+ # A specialized class using a plain text file to track items stored. It supports
4
+ # three operations: new, <<, and include? . Together these can be used to
5
+ # add items to the text file, then determine whether the item has been added.
6
+ #
7
+ # To use it with Spider use the check_already_seen_with method:
8
+ #
9
+ # Spider.start_at('http://example.com/') do |s|
10
+ # s.check_already_seen_with IncludedInFile.new('/tmp/crawled.log')
11
+ # end
12
+ class IncludedInFile
13
+ # Construct a new IncludedInFile instance, loading any URLs already stored in the file.
14
+ # @param filepath [String] path of the file used to store crawled URLs
15
+ def initialize(filepath)
16
+ @filepath = filepath
17
+ # create an empty file if none exists yet, so it can be read just below
18
+ File.write(@filepath, '') unless File.file?(@filepath)
19
+ @urls = File.readlines(@filepath).map(&:chomp)
20
+ end
21
+
22
+ # Add an item's string form to the in-memory URL list and append it to the file as a CRLF-terminated line.
23
+ def <<(v)
24
+ @urls << v.to_s
25
+ File.write(@filepath, "#{v}\r\n", File.size(@filepath), mode: 'a')
26
+ end
27
+
28
+ # True if the item's string form is in the in-memory URL list (the file is not re-read).
29
+ def include?(v)
30
+ @urls.include? v.to_s
31
+ end
32
+ end
@@ -9,7 +9,6 @@ spec = Gem::Specification.new do |s|
9
9
  s.has_rdoc = true
10
10
  s.homepage = 'https://github.com/johnnagro/spider'
11
11
  s.name = 'spider'
12
- s.rubyforge_project = 'spider'
13
12
  s.summary = 'A Web spidering library'
14
13
  s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i }
15
14
  s.require_path = 'lib'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.3
4
+ version: 0.5.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Nagro
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-04-23 00:00:00.000000000 Z
11
+ date: 2020-06-04 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: |
14
14
  A Web spidering library: handles robots.txt, scraping, finding more
@@ -23,6 +23,7 @@ files:
23
23
  - LICENSE
24
24
  - README.md
25
25
  - lib/spider.rb
26
+ - lib/spider/included_in_file.rb
26
27
  - lib/spider/included_in_memcached.rb
27
28
  - lib/spider/included_in_redis.rb
28
29
  - lib/spider/next_urls_in_sqs.rb
@@ -53,8 +54,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
53
54
  - !ruby/object:Gem::Version
54
55
  version: '0'
55
56
  requirements: []
56
- rubyforge_project: spider
57
- rubygems_version: 2.5.2.1
57
+ rubyforge_project:
58
+ rubygems_version: 2.7.6
58
59
  signing_key:
59
60
  specification_version: 4
60
61
  summary: A Web spidering library