arachnid2 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +21 -13
- data/lib/arachnid2.rb +76 -27
- data/lib/arachnid2/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0ebb9ed9cdef3106462796f1b7fcc2483d58857bf605f04c800733358ea3f486
|
4
|
+
data.tar.gz: 86eaaf1bd44b85ee564b1bf3aeb08e2be326462c6bf8d3291ba2bc2f55e7c444
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1b8f1d5798379c75502cf36046c3110f02ae6abbd12a8ecaa7c501e1efd0d8d86393abd64212f12e3c3f951ddf5f8f6fed5ed15a9937305cf63ff92523087c89
|
7
|
+
data.tar.gz: 4f02afa25d537346b2cc6daaa16ae7b19956c5478bdfb5f4e2a9188825ad3f687691ad96025570661fd97dd12954c0e367be9cbdf9267f726b34ad13ea50bff0
|
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
arachnid2 (0.1.
|
4
|
+
arachnid2 (0.1.2)
|
5
5
|
addressable
|
6
6
|
adomain
|
7
7
|
bloomfilter-rb
|
@@ -20,7 +20,7 @@ GEM
|
|
20
20
|
diff-lcs (1.3)
|
21
21
|
ethon (0.11.0)
|
22
22
|
ffi (>= 1.3.0)
|
23
|
-
ffi (1.9.
|
23
|
+
ffi (1.9.25)
|
24
24
|
mini_portile2 (2.3.0)
|
25
25
|
nokogiri (1.8.2)
|
26
26
|
mini_portile2 (~> 2.3.0)
|
data/README.md
CHANGED
@@ -63,10 +63,15 @@ require "arachnid2"
|
|
63
63
|
url = "http://sixcolours.com"
|
64
64
|
spider = Arachnid2.new(url)
|
65
65
|
opts = {
|
66
|
+
followlocation: true,
|
67
|
+
timeout: 10000,
|
66
68
|
time_box: 60,
|
67
69
|
max_urls: 50,
|
68
|
-
|
69
|
-
|
70
|
+
:headers => {
|
71
|
+
'Accept-Language' => "en-UK",
|
72
|
+
'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
|
73
|
+
},
|
74
|
+
memory_limit: 89.99,
|
70
75
|
proxy: {
|
71
76
|
ip: "1.2.3.4",
|
72
77
|
port: "1234",
|
@@ -94,16 +99,18 @@ If no valid integer is provided, it will crawl for 50 URLs before exiting.
|
|
94
99
|
10000 seconds is the current maximum,
|
95
100
|
and any value above it will be reduced to 10000.
|
96
101
|
|
97
|
-
#### `
|
102
|
+
#### `headers`
|
98
103
|
|
99
|
-
|
100
|
-
|
101
|
-
|
104
|
+
This is a hash that represents any HTTP header key/value pairs you desire,
|
105
|
+
and is passed directly to Typhoeus. Before it is sent, a default
|
106
|
+
language and user agent are created:
|
107
|
+
|
108
|
+
##### Defaults
|
102
109
|
|
103
|
-
|
110
|
+
The HTTP header `Accept-Language` default is
|
111
|
+
`en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, \*;q=0.4`
|
104
112
|
|
105
|
-
|
106
|
-
default is
|
113
|
+
The HTTP header `User-Agent` default is
|
107
114
|
`Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15`
|
108
115
|
|
109
116
|
#### `proxy`
|
@@ -112,12 +119,13 @@ Provide your IP, port for a proxy. If required, provide credentials for
|
|
112
119
|
authenticating to that proxy. Proxy options and handling are done
|
113
120
|
by Typhoeus.
|
114
121
|
|
115
|
-
|
122
|
+
#### `memory_limit` and Docker
|
116
123
|
|
117
124
|
In case you are operating the crawler within a container, Arachnid2
|
118
|
-
|
119
|
-
|
120
|
-
|
125
|
+
can attempt to prevent the container from running out of memory.
|
126
|
+
By default, it will end the crawl when the container uses >= 80%
|
127
|
+
of its available memory. You can override this with the
|
128
|
+
`memory_limit` option.
|
121
129
|
|
122
130
|
### Non-HTML links
|
123
131
|
|
data/lib/arachnid2.rb
CHANGED
@@ -37,7 +37,11 @@ class Arachnid2
|
|
37
37
|
}
|
38
38
|
MEMORY_USE_FILE = "/sys/fs/cgroup/memory/memory.usage_in_bytes"
|
39
39
|
MEMORY_LIMIT_FILE = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
|
40
|
-
|
40
|
+
DEFAULT_MAXIMUM_LOAD_RATE = 79.9
|
41
|
+
|
42
|
+
DEFAULT_TIMEOUT = 10_000
|
43
|
+
MINIMUM_TIMEOUT = 1
|
44
|
+
MAXIMUM_TIMEOUT = 999_999
|
41
45
|
|
42
46
|
#
|
43
47
|
# Creates the object to execute the crawl
|
@@ -64,11 +68,14 @@ class Arachnid2
|
|
64
68
|
# spider = Arachnid2.new(url)
|
65
69
|
#
|
66
70
|
# opts = {
|
71
|
+
# :followlocation => true,
|
72
|
+
# :timeout => 25000,
|
67
73
|
# :time_box => 30,
|
68
74
|
# :headers => {
|
69
75
|
# 'Accept-Language' => "en-UK",
|
70
76
|
# 'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
|
71
77
|
# },
|
78
|
+
# :memory_limit => 89.99,
|
72
79
|
# :proxy => {
|
73
80
|
# :ip => "1.2.3.4",
|
74
81
|
# :port => "1234",
|
@@ -89,34 +96,35 @@ class Arachnid2
|
|
89
96
|
preflight(opts)
|
90
97
|
|
91
98
|
until @global_queue.empty?
|
92
|
-
@
|
93
|
-
|
94
|
-
q = @global_queue.shift
|
99
|
+
@max_concurrency.times do
|
100
|
+
q = @global_queue.shift
|
95
101
|
|
96
|
-
|
97
|
-
|
98
|
-
|
102
|
+
break if @global_visited.size >= @crawl_options[:max_urls]
|
103
|
+
break if Time.now > @crawl_options[:time_limit]
|
104
|
+
break if memory_danger?
|
99
105
|
|
100
|
-
|
106
|
+
@global_visited.insert(q)
|
101
107
|
|
102
|
-
|
108
|
+
request = Typhoeus::Request.new(q, request_options)
|
103
109
|
|
104
|
-
|
105
|
-
|
106
|
-
|
110
|
+
request.on_complete do |response|
|
111
|
+
links = process(response)
|
112
|
+
next unless links
|
107
113
|
|
108
|
-
|
114
|
+
yield response
|
109
115
|
|
110
|
-
|
111
|
-
end
|
112
|
-
|
113
|
-
request.run
|
114
|
-
ensure
|
115
|
-
@cookie_file.close! if @cookie_file
|
116
|
+
vacuum(links, response)
|
116
117
|
end
|
117
|
-
|
118
|
-
|
119
|
-
|
118
|
+
|
119
|
+
@hydra.queue(request)
|
120
|
+
end # @max_concurrency.times do
|
121
|
+
|
122
|
+
@hydra.run
|
123
|
+
end # until @global_queue.empty?
|
124
|
+
|
125
|
+
ensure
|
126
|
+
@cookie_file.close! if @cookie_file
|
127
|
+
end # def crawl(opts = {})
|
120
128
|
|
121
129
|
private
|
122
130
|
def process(response)
|
@@ -153,12 +161,24 @@ class Arachnid2
|
|
153
161
|
def preflight(opts)
|
154
162
|
@options = opts
|
155
163
|
@crawl_options = crawl_options
|
156
|
-
|
157
|
-
|
164
|
+
@maximum_load_rate = maximum_load_rate
|
165
|
+
@max_concurrency = max_concurrency
|
166
|
+
@hydra = Typhoeus::Hydra.new(:max_concurrency => @max_concurrency)
|
158
167
|
@global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true)
|
159
168
|
@global_queue = [@url]
|
160
169
|
end
|
161
170
|
|
171
|
+
def max_concurrency
|
172
|
+
@max_concurrency ||= nil
|
173
|
+
|
174
|
+
if !@max_concurrency
|
175
|
+
@max_concurrency = "#{@options[:max_concurrency]}".to_i
|
176
|
+
@max_concurrency = 1 unless (@max_concurrency > 0)
|
177
|
+
end
|
178
|
+
|
179
|
+
@max_concurrency
|
180
|
+
end
|
181
|
+
|
162
182
|
def bound_time
|
163
183
|
boundary = "#{@options[:time_box]}".to_i
|
164
184
|
boundary = BASE_CRAWL_TIME if boundary <= 0
|
@@ -175,12 +195,30 @@ class Arachnid2
|
|
175
195
|
amount
|
176
196
|
end
|
177
197
|
|
198
|
+
def followlocation
|
199
|
+
if @followlocation.is_a?(NilClass)
|
200
|
+
@followlocation = @options[:followlocation]
|
201
|
+
@followlocation = true unless @followlocation.is_a?(FalseClass)
|
202
|
+
end
|
203
|
+
@followlocation
|
204
|
+
end
|
205
|
+
|
206
|
+
def timeout
|
207
|
+
if !@timeout
|
208
|
+
@timeout = @options[:timeout]
|
209
|
+
@timeout = DEFAULT_TIMEOUT unless @timeout.is_a?(Integer)
|
210
|
+
@timeout = DEFAULT_TIMEOUT if @timeout > MAXIMUM_TIMEOUT
|
211
|
+
@timeout = DEFAULT_TIMEOUT if @timeout < MINIMUM_TIMEOUT
|
212
|
+
end
|
213
|
+
@timeout
|
214
|
+
end
|
215
|
+
|
178
216
|
def request_options
|
179
217
|
@cookie_file ||= Tempfile.new('cookies')
|
180
218
|
|
181
219
|
@request_options = {
|
182
|
-
timeout:
|
183
|
-
followlocation:
|
220
|
+
timeout: timeout,
|
221
|
+
followlocation: followlocation,
|
184
222
|
cookiefile: @cookie_file.path,
|
185
223
|
cookiejar: @cookie_file.path,
|
186
224
|
headers: @options[:headers]
|
@@ -236,7 +274,7 @@ class Arachnid2
|
|
236
274
|
|
237
275
|
return false unless ( (use > 0.0) && (@limit > 0.0) )
|
238
276
|
|
239
|
-
return ( ( (use / @limit) * 100.0 ) >=
|
277
|
+
return ( ( (use / @limit) * 100.0 ) >= @maximum_load_rate )
|
240
278
|
end
|
241
279
|
|
242
280
|
def in_docker?
|
@@ -244,4 +282,15 @@ class Arachnid2
|
|
244
282
|
true
|
245
283
|
end
|
246
284
|
|
285
|
+
def maximum_load_rate
|
286
|
+
@maximum_load_rate ||= nil
|
287
|
+
|
288
|
+
if !@maximum_load_rate
|
289
|
+
@maximum_load_rate = "#{@options[:memory_limit]}".to_f
|
290
|
+
@maximum_load_rate = DEFAULT_MAXIMUM_LOAD_RATE unless ((@maximum_load_rate > 0.0) && (@maximum_load_rate < 100.0))
|
291
|
+
end
|
292
|
+
|
293
|
+
@maximum_load_rate
|
294
|
+
end
|
295
|
+
|
247
296
|
end
|
data/lib/arachnid2/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: arachnid2
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Nissen
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-07-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|