spiderkit 0.2.0 → 0.2.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 5d3b757d2369c3d6c9520e8f4b78f60f79fdfd95
-  data.tar.gz: 562120125d785f938f37b102a53e90dc78dc56cc
+  metadata.gz: be49878f0fe9fc2947133b602a98e78070e82cee
+  data.tar.gz: 70045c213e2dc966cdcceb7b4d66b68aa953b09d
 SHA512:
-  metadata.gz: 391d8705750fe6e738152384439da6b512dc29471796c37bd48f1d90bd2b3909fc1a1c6257c1a95baff42533d662140f7cf6a999a3d9cedca17cdd8cb1f43c06
-  data.tar.gz: a243700c54bf0eecaf16a879d76e201c7debcb93094bed3ba32bea47f9615c321ba5f297451955cead398e3fda3f7ecd2892673eea87102339e0ca75c3df34de
+  metadata.gz: bd5e451c53232047a24c050161dde45356706d80ea230fa793375dae97e0f1c629b6adf21919b4572f534f210cf91b4fe73a0ca4edaf3b8a99c78f852cb25cb3
+  data.tar.gz: a6bd3b7be26ad0d56f1f408ef5778734a9753b4b599a74e5968b7b4d620e7781fa9d9b4dd7e2b42f87a43ed98b7393a672505a193e6f080d617fc3a5cf98945d
data/README.md CHANGED
@@ -44,13 +44,14 @@ Since you need to implement page fetching on your own (using any of a number of
 A basic example:
 
 ```ruby
+require 'open-uri'
+
 mybot = Spider::VisitQueue.new
 mybot.push_front('http://someurl.com')
 
 mybot.visit_each do |url|
-  #fetch the url
+  data = open(url).read
   #pull out the links as linklist
-
   mybot.push_back(linklist)
 end
 ```
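
Read as a whole, the updated example amounts to roughly the following runnable sketch. Spiderkit leaves link extraction to the caller, so the Nokogiri-based `linklist` step is only an illustrative assumption, not part of the gem:

```ruby
# Assembled from the updated README text above; a sketch, not part of the gem.
require 'open-uri'
require 'nokogiri'  # assumption: any HTML parser could fill the link-extraction step

mybot = Spider::VisitQueue.new
mybot.push_front('http://someurl.com')

mybot.visit_each do |url|
  data = open(url).read

  # pull out the links as linklist (hypothetical extraction step)
  linklist = Nokogiri::HTML(data).css('a[href]').map { |a| a['href'] }

  mybot.push_back(linklist)
end
```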
@@ -104,7 +105,9 @@ The finalizer, if any, will still be executed after stopping iteration.
 Spiderkit also includes a robots.txt parser that can either work standalone, or be passed as an argument to the visit queue. If passed as an argument, urls that are excluded by the robots.txt will be dropped transparently.
 
 ```
-# fetch robots.txt as variable txt
+require 'open-uri'
+
+txt = open('http://somesite.com/robots.txt').read
 
 # create a stand alone parser
 robots_txt = Spider::ExclusionParser.new(txt)
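
The same paragraph describes handing the parser to the visit queue so that excluded urls are dropped transparently. A minimal sketch of that wiring; the constructor argument is an assumption, since this diff only shows the standalone parser:

```ruby
# Transparent-filtering sketch; the constructor argument is an assumption,
# as this diff only shows the standalone parser.
require 'open-uri'

txt = open('http://somesite.com/robots.txt').read
robots_txt = Spider::ExclusionParser.new(txt)

mybot = Spider::VisitQueue.new(robots_txt)  # assumed: parser accepted at construction
mybot.push_front('http://somesite.com/')

mybot.visit_each do |url|
  # urls excluded by robots.txt never reach this block
  puts "visiting #{url}"
end
```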
@@ -150,9 +153,8 @@ should respond to "googlebot" in robots.txt. By convention, bots and spiders us
 ```ruby
 require 'open-uri'
 
-status = 0
-data = open('http://wikipedia.org/robots.txt') { |f| status = f.status }
-mybot.robot_txt = Spider::ExclusionParser.new(data.read, 'mybot', status)
+data = open('http://wikipedia.org/robots.txt')
+mybot.robot_txt = Spider::ExclusionParser.new(data.read, 'mybot', data.status)
 ```
 
 Finally, as a sanity check / to avoid DoS honeypots with malicious robots.txt files, the exclusion parser will process a maximum of one thousand non-whitespace lines before stopping.
@@ -164,7 +166,10 @@ Ideally a bot should wait for some period of time in between requests to avoid c
 You can create it standalone, or get it from an exclusion parser:
 
 ```ruby
+require 'open-uri'
+
 # download a robots.txt with a crawl-delay 40
+txt = open('http://crawldelay40seconds.com/robots.txt').read
 
 robots_txt = Spider::ExclusionParser.new(txt)
 delay = robots_txt.wait_time
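
Honoring the crawl delay means applying the value returned by `wait_time` between fetches. A minimal sketch, assuming `wait_time` yields the Crawl-delay in seconds (its exact return type is not shown in this diff):

```ruby
# Politeness loop built on the example above; treating the returned delay as
# a number of seconds is an assumption made for illustration.
require 'open-uri'

txt = open('http://crawldelay40seconds.com/robots.txt').read
robots_txt = Spider::ExclusionParser.new(txt)
delay = robots_txt.wait_time

mybot = Spider::VisitQueue.new
mybot.push_front('http://crawldelay40seconds.com/')

mybot.visit_each do |url|
  data = open(url).read
  # ...extract links and push_back here...
  sleep(delay)  # pause between requests to respect the advertised crawl delay
end
```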
@@ -209,14 +214,14 @@ Spider::VisitRecorder.record!
 mybot.visit_each do |url|
 
   data = Spider::VisitRecorder.recall(url) do
-    text = ''
     puts "fetch #{url}"
-    open(url) do |f|
-      text = f.read
-      # doing this is only necessary if you want to
-      # save this information in the recording
-      text.http_status = f.status.first.to_i
-    end
+
+    data = open(url)
+    text = data.read
+
+    # doing this is only necessary if you want to
+    # save this information in the recording
+    text.http_status = data.status.first.to_i
 
     text
   end
data/lib/queue.rb CHANGED
@@ -2,7 +2,7 @@
 # Copyright:: Copyright (c) 2016 Robert Dormer
 # License:: MIT
 
-require 'bloom-filter'
+require 'bloomer'
 require 'exclusion'
 
 module Spider
@@ -27,7 +27,7 @@ module Spider
         url = @pending.pop
         next unless url_okay(url)
         yield url.clone if block_given?
-        @visited.insert(url)
+        @visited.add(url)
         @visit_count += 1
       end
     rescue IterationExit
@@ -46,7 +46,7 @@ module Spider
 
     def mark(urls)
       urls = [urls] unless urls.is_a? Array
-      urls.each { |u| @visited.insert(u) }
+      urls.each { |u| @visited.add(u) }
     end
 
     def size
@@ -62,7 +62,7 @@ module Spider
     end
 
     def clear_visited
-      @visited = BloomFilter.new(size: 10_000, error_rate: 0.001)
+      @visited = Bloomer.new(10_000, 0.001)
     end
 
     def url_okay(url)
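
The queue.rb hunks above are the behavioral core of this release: the visited-url set moves from the bloom-filter gem to bloomer, whose constructor and insertion method differ slightly. A side-by-side sketch; the `include?` membership check does not appear in this diff but both gems provide it:

```ruby
require 'bloomer'

# Old dependency (bloom-filter), as removed above:
#   require 'bloom-filter'
#   visited = BloomFilter.new(size: 10_000, error_rate: 0.001)
#   visited.insert('http://someurl.com')

# New dependency (bloomer), as added above:
visited = Bloomer.new(10_000, 0.001)     # expected capacity, false-positive rate
visited.add('http://someurl.com')        # was BloomFilter#insert
visited.include?('http://someurl.com')   # => true (membership check; not shown in this diff)
```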
data/lib/version.rb CHANGED
@@ -3,5 +3,5 @@
 # License:: MIT
 
 module Spider
-  VERSION = "0.2.0"
+  VERSION = "0.2.1"
 end
data/spiderkit.gemspec CHANGED
@@ -23,5 +23,5 @@ Gem::Specification.new do |spec|
   spec.add_development_dependency "rspec", "~> 3.4.0"
   spec.add_development_dependency "rake"
 
-  spec.add_dependency "bloom-filter", "~> 0.2.0"
+  spec.add_dependency "bloomer", "~> 0.0.5"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: spiderkit
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - Robert Dormer
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-08-06 00:00:00.000000000 Z
+date: 2016-12-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -53,19 +53,19 @@ dependencies:
     - !ruby/object:Gem::Version
       version: '0'
 - !ruby/object:Gem::Dependency
-  name: bloom-filter
+  name: bloomer
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.2.0
+        version: 0.0.5
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.2.0
+        version: 0.0.5
 description: Spiderkit library for basic spiders and bots
 email:
 - rdormer@gmail.com