spiderkit 0.2.0 → 0.2.1
- checksums.yaml +4 -4
- data/README.md +18 -13
- data/lib/queue.rb +4 -4
- data/lib/version.rb +1 -1
- data/spiderkit.gemspec +1 -1
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: be49878f0fe9fc2947133b602a98e78070e82cee
+  data.tar.gz: 70045c213e2dc966cdcceb7b4d66b68aa953b09d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bd5e451c53232047a24c050161dde45356706d80ea230fa793375dae97e0f1c629b6adf21919b4572f534f210cf91b4fe73a0ca4edaf3b8a99c78f852cb25cb3
+  data.tar.gz: a6bd3b7be26ad0d56f1f408ef5778734a9753b4b599a74e5968b7b4d620e7781fa9d9b4dd7e2b42f87a43ed98b7393a672505a193e6f080d617fc3a5cf98945d
data/README.md
CHANGED
@@ -44,13 +44,14 @@ Since you need to implement page fetching on your own (using any of a number of
 A basic example:
 
 ```ruby
+require 'open-uri'
+
 mybot = Spider::VisitQueue.new
 mybot.push_front('http://someurl.com')
 
 mybot.visit_each do |url|
-
+  data = open(url).read
   #pull out the links as linklist
-
   mybot.push_back(linklist)
 end
 ```
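The `linklist` in the example above is left for the caller to produce. One way to build it, shown here as a sketch using the nokogiri gem (an illustration only, not a spiderkit dependency), is to collect every href on the fetched page:

```ruby
require 'nokogiri'
require 'open-uri'

mybot.visit_each do |url|
  data = open(url).read
  # pull out the links on the page as linklist
  linklist = Nokogiri::HTML(data).css('a').map { |a| a['href'] }.compact
  mybot.push_back(linklist)
end
```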
@@ -104,7 +105,9 @@ The finalizer, if any, will still be executed after stopping iteration.
 Spiderkit also includes a robots.txt parser that can either work standalone, or be passed as an argument to the visit queue. If passed as an argument, urls that are excluded by the robots.txt will be dropped transparently.
 
 ```
-
+require 'open-uri'
+
+txt = open('http://somesite.com/robots.txt').read
 
 # create a stand alone parser
 robots_txt = Spider::ExclusionParser.new(txt)
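The hunk above only shows the standalone parser. Wiring it into a visit queue so that excluded urls are dropped transparently would, going by the `robot_txt=` setter that appears later in this diff, look roughly like this sketch (the somesite.com urls are placeholders):

```ruby
mybot = Spider::VisitQueue.new
mybot.robot_txt = robots_txt

# urls excluded by the robots.txt are now dropped transparently
mybot.push_back(['http://somesite.com/page', 'http://somesite.com/private'])
```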
@@ -150,9 +153,8 @@ should respond to "googlebot" in robots.txt. By convention, bots and spiders us
 ```ruby
 require 'open-uri'
 
-
-
-mybot.robot_txt = Spider::ExclusionParser.new(data.read, 'mybot', status)
+data = open('http://wikipedia.org/robots.txt')
+mybot.robot_txt = Spider::ExclusionParser.new(data.read, 'mybot', data.status)
 ```
 
 Finally, as a sanity check / to avoid DoS honeypots with malicious robots.txt files, the exclusion parser will process a maximum of one thousand non-whitespace lines before stopping.
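A note on the `data.status` argument above: open-uri exposes the response status as an array of code string and message, which is why a later example converts it with `.first.to_i`. A quick sketch of the usual open-uri behavior:

```ruby
require 'open-uri'

data = open('http://wikipedia.org/robots.txt')
data.status             # => ["200", "OK"]
data.status.first.to_i  # => 200
```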
@@ -164,7 +166,10 @@ Ideally a bot should wait for some period of time in between requests to avoid c
 You can create it standalone, or get it from an exclusion parser:
 
 ```ruby
+require 'open-uri'
+
 # download a robots.txt with a crawl-delay 40
+txt = open('http://crawldelay40seconds.com/robots.txt').read
 
 robots_txt = Spider::ExclusionParser.new(txt)
 delay = robots_txt.wait_time
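This diff does not show what `wait_time` returns; assuming it yields the crawl delay as a number of seconds, honoring it inside the visit loop would look roughly like this sketch:

```ruby
mybot.visit_each do |url|
  data = open(url).read
  # be polite: pause between requests for the advertised crawl-delay
  sleep(delay)
end
```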
@@ -209,14 +214,14 @@ Spider::VisitRecorder.record!
 mybot.visit_each do |url|
 
   data = Spider::VisitRecorder.recall(url) do
-    text = ''
     puts "fetch #{url}"
-
-
-
-
-
-
+
+    data = open(url)
+    text = data.read
+
+    # doing this is only necessary if you want to
+    # save this information in the recording
+    text.http_status = data.status.first.to_i
 
     text
   end
data/lib/queue.rb
CHANGED
@@ -2,7 +2,7 @@
 # Copyright:: Copyright (c) 2016 Robert Dormer
 # License:: MIT
 
-require '
+require 'bloomer'
 require 'exclusion'
 
 module Spider
@@ -27,7 +27,7 @@ module Spider
 url = @pending.pop
 next unless url_okay(url)
 yield url.clone if block_given?
-@visited.
+@visited.add(url)
 @visit_count += 1
 end
 rescue IterationExit
@@ -46,7 +46,7 @@ module Spider
 
 def mark(urls)
 urls = [urls] unless urls.is_a? Array
-urls.each { |u| @visited.
+urls.each { |u| @visited.add(u) }
 end
 
 def size
@@ -62,7 +62,7 @@ module Spider
 end
 
 def clear_visited
-@visited =
+@visited = Bloomer.new(10_000, 0.001)
 end
 
 def url_okay(url)
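The change above swaps the visited-url store for a Bloom filter from the bloomer gem, which keeps memory bounded no matter how many urls are marked, at the cost of a small false-positive rate (0.1% at a capacity of 10,000 here), so a not-yet-visited url can occasionally be treated as already seen. A minimal sketch of the bloomer API as it is used in this diff (method names assumed from the bloomer gem, ~> 0.0.5):

```ruby
require 'bloomer'

# capacity of 10,000 entries with a 0.1% target false-positive rate,
# matching the arguments passed in clear_visited above
visited = Bloomer.new(10_000, 0.001)

visited.add('http://someurl.com')
visited.include?('http://someurl.com')  # => true
visited.include?('http://other.com')    # => false (except for rare false positives)
```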
data/lib/version.rb
CHANGED
data/spiderkit.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: spiderkit
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.2.1
 platform: ruby
 authors:
 - Robert Dormer
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-
+date: 2016-12-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -53,19 +53,19 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
-  name:
+  name: bloomer
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.
+        version: 0.0.5
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.
+        version: 0.0.5
 description: Spiderkit library for basic spiders and bots
 email:
 - rdormer@gmail.com
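The metadata above records bloomer ~> 0.0.5 as a new runtime dependency; the matching one-line data/spiderkit.gemspec change is collapsed in this listing. In gemspec terms, such a requirement is conventionally declared like this sketch (the `spec` block variable is the usual convention, not taken from this diff):

```ruby
Gem::Specification.new do |spec|
  # ...name, version, authors, and so on...
  spec.add_runtime_dependency 'bloomer', '~> 0.0.5'
end
```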