spider 0.5.3 → 0.5.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 60310268052e7e2cfe120fdd313e314b28124c5a
4
- data.tar.gz: 8747d828232c89f674e765e005df722c170a7839
2
+ SHA256:
3
+ metadata.gz: b87ed979c115546fa802f888fea0baf322f458be6e50e40f8ea8fd9ee392c8ac
4
+ data.tar.gz: eabe506949614a5622afa2def1da954d352d737dccadd0622d060be13c061115
5
5
  SHA512:
6
- metadata.gz: a58cc23eb4d464b41957ca39ac17a3d2c91d86ac71140163b2072bedc317e3b529430fd47630980c0e87668df509031078a6d0a7d63d62878d5a31300b34bc51
7
- data.tar.gz: a2391c2b8e64dd4dd5f895e9530312bd515d00981c85d857ff8261d28bdaab2a1669e1d9fc9e15fa43f62f4eba99666465cbc0bdd6378e6610daa8e043e5f156
6
+ metadata.gz: ab52efe227f19067dd52efb0333d5901d3725f4e88fa0b86942c12bd702efa6b3bf4ed72b03d5ce58b32a22c574b0ee31764fa5e37df07a57132c250bf6b0658
7
+ data.tar.gz: ede92b88eb09867c41c1f7fcca58c99a55e63f02bb54fa094ece40e8875240734aa9abdcaf043db003120a44b239198dc6d063779543927febb493116732b7f3
data/AUTHORS CHANGED
@@ -1,6 +1,7 @@
1
1
  The Ruby Spider Gem would not be what it is today without the help of
2
2
  the following kind souls:
3
3
 
4
+ Alexandre Rousseau
4
5
  Brian Campbell
5
6
  Henri Cook
6
7
  James Edward Gray II
@@ -11,4 +12,6 @@ John Nagro
11
12
  Matt Horan
12
13
  Marc (@brigriffin)
13
14
  Mike Burns (original author)
15
+ Olle Jonsson
14
16
  Sander van der Vliet
17
+ Stuart Yamartino
data/README.md CHANGED
@@ -82,6 +82,16 @@ scraping, collecting, and looping so that you can just handle the data._
82
82
  end
83
83
  ```
84
84
 
85
+ ### Use a plain text file to track cycles
86
+
87
+ ```ruby
88
+ require 'spider'
89
+ require 'spider/included_in_file'
90
+ Spider.start_at('http://cashcats.biz/') do |s|
91
+ s.check_already_seen_with IncludedInFile.new('/tmp/cashcats_crawl.txt')
92
+ end
93
+ ```
94
+
85
95
  ### Track cycles with a custom object
86
96
 
87
97
  ```ruby
@@ -116,7 +126,7 @@ scraping, collecting, and looping so that you can just handle the data._
116
126
  require 'spider'
117
127
  class MyArray < Array
118
128
  def pop
119
- super
129
+ super
120
130
  end
121
131
 
122
132
  def push(a_msg)
@@ -150,7 +160,7 @@ scraping, collecting, and looping so that you can just handle the data._
150
160
  require 'net/http_configuration'
151
161
  require 'spider'
152
162
  http_conf = Net::HTTP::Configuration.new(:proxy_host => '7proxies.org',
153
- :proxy_port => 8881)
163
+ :proxy_port => 8881)
154
164
  http_conf.apply do
155
165
  Spider.start_at('http://img.4chan.org/b/') do |s|
156
166
  s.on(:success) do |a_url, resp, prior_url|
@@ -4,7 +4,7 @@ require File.dirname(__FILE__)+'/spider/spider_instance'
4
4
  # links, and doing it all over again.
5
5
  class Spider
6
6
 
7
- VERSION_INFO = [0, 5, 3] unless defined?(self::VERSION_INFO)
7
+ VERSION_INFO = [0, 5, 4] unless defined?(self::VERSION_INFO)
8
8
  VERSION = VERSION_INFO.map(&:to_s).join('.') unless defined?(self::VERSION)
9
9
 
10
10
  def self.version
@@ -0,0 +1,32 @@
1
+ # Use plain text file to track cycles.
2
+
3
+ # A specialized class using a plain text file to track stored items. It supports
4
+ # three operations: new, <<, and include? . Together these can be used to
5
+ # add items to the text file, then determine whether the item has been added.
6
+ #
7
+ # To use it with Spider use the check_already_seen_with method:
8
+ #
9
+ # Spider.start_at('http://example.com/') do |s|
10
+ # s.check_already_seen_with IncludedInFile.new('/tmp/crawled.log')
11
+ # end
12
+ class IncludedInFile
13
+ # Construct a new IncludedInFile instance.
14
+ # @param filepath [String] path of the file in which crawled URLs are stored
15
+ def initialize(filepath)
16
+ @filepath = filepath
17
+ # create file if not exists
18
+ File.write(@filepath, '') unless File.file?(@filepath)
19
+ @urls = File.readlines(@filepath).map(&:chomp)
20
+ end
21
+
22
+ # Add an item to the file and to the in-memory array of URLs.
23
+ def <<(v)
24
+ @urls << v.to_s
25
+ File.write(@filepath, "#{v}\r\n", File.size(@filepath), mode: 'a')
26
+ end
27
+
28
+ # True if the item has been recorded (checked against the in-memory list loaded from the file).
29
+ def include?(v)
30
+ @urls.include? v.to_s
31
+ end
32
+ end
@@ -9,7 +9,6 @@ spec = Gem::Specification.new do |s|
9
9
  s.has_rdoc = true
10
10
  s.homepage = 'https://github.com/johnnagro/spider'
11
11
  s.name = 'spider'
12
- s.rubyforge_project = 'spider'
13
12
  s.summary = 'A Web spidering library'
14
13
  s.files = Dir['**/*'].delete_if { |f| f =~ /(cvs|gem|svn)$/i }
15
14
  s.require_path = 'lib'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.3
4
+ version: 0.5.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - John Nagro
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-04-23 00:00:00.000000000 Z
11
+ date: 2020-06-04 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: |
14
14
  A Web spidering library: handles robots.txt, scraping, finding more
@@ -23,6 +23,7 @@ files:
23
23
  - LICENSE
24
24
  - README.md
25
25
  - lib/spider.rb
26
+ - lib/spider/included_in_file.rb
26
27
  - lib/spider/included_in_memcached.rb
27
28
  - lib/spider/included_in_redis.rb
28
29
  - lib/spider/next_urls_in_sqs.rb
@@ -53,8 +54,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
53
54
  - !ruby/object:Gem::Version
54
55
  version: '0'
55
56
  requirements: []
56
- rubyforge_project: spider
57
- rubygems_version: 2.5.2.1
57
+ rubyforge_project:
58
+ rubygems_version: 2.7.6
58
59
  signing_key:
59
60
  specification_version: 4
60
61
  summary: A Web spidering library