arachnid2 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +21 -13
- data/lib/arachnid2.rb +76 -27
- data/lib/arachnid2/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0ebb9ed9cdef3106462796f1b7fcc2483d58857bf605f04c800733358ea3f486
+  data.tar.gz: 86eaaf1bd44b85ee564b1bf3aeb08e2be326462c6bf8d3291ba2bc2f55e7c444
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1b8f1d5798379c75502cf36046c3110f02ae6abbd12a8ecaa7c501e1efd0d8d86393abd64212f12e3c3f951ddf5f8f6fed5ed15a9937305cf63ff92523087c89
+  data.tar.gz: 4f02afa25d537346b2cc6daaa16ae7b19956c5478bdfb5f4e2a9188825ad3f687691ad96025570661fd97dd12954c0e367be9cbdf9267f726b34ad13ea50bff0
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    arachnid2 (0.1.
+    arachnid2 (0.1.2)
       addressable
       adomain
       bloomfilter-rb
@@ -20,7 +20,7 @@ GEM
     diff-lcs (1.3)
     ethon (0.11.0)
       ffi (>= 1.3.0)
-    ffi (1.9.
+    ffi (1.9.25)
     mini_portile2 (2.3.0)
     nokogiri (1.8.2)
       mini_portile2 (~> 2.3.0)
data/README.md
CHANGED
@@ -63,10 +63,15 @@ require "arachnid2"
 url = "http://sixcolours.com"
 spider = Arachnid2.new(url)
 opts = {
+  followlocation: true,
+  timeout: 10000,
   time_box: 60,
   max_urls: 50,
-
-
+  :headers => {
+    'Accept-Language' => "en-UK",
+    'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
+  },
+  memory_limit: 89.99,
   proxy: {
     ip: "1.2.3.4",
     port: "1234",
@@ -94,16 +99,18 @@ If no valid integer is provided, it will crawl for 50 URLs before exiting.
 10000 seconds is the current maximum,
 and any value above it will be reduced to 10000.
 
-#### `
+#### `headers`
 
-
-
-
+This is a hash that represents any HTTP header key/value pairs you desire,
+and is passed directly to Typhoeus. Before it is sent, a default
+language and user agent are created:
+
+##### Defaults
 
-
+The HTTP header `Accept-Language` default is
+`en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, \*;0.4`
 
-
-default is
+The HTTP header `User-Agent` default is
 `Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15`
 
 #### `proxy`
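For reference, the defaults documented above correspond to a headers hash like the one sketched below. This is illustrative only: `default_headers` is a hypothetical name, and the gem's internal construction of these defaults is not shown in this diff. The backslash before the asterisk in the README value is a Markdown escape.

```ruby
# Illustrative: the default header values the README describes, written as
# the kind of hash that opts[:headers] would stand in for.
default_headers = {
  'Accept-Language' => "en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, *;0.4",
  'User-Agent'      => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15"
}

# Per the README, opts[:headers] is passed straight to Typhoeus:
opts = { headers: { 'Accept-Language' => "en-UK" } }
```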
@@ -112,12 +119,13 @@ Provide your IP, port for a proxy. If required, provide credentials for
 authenticating to that proxy. Proxy options and handling are done
 by Typhoeus.
 
-
+#### `memory_limit` and Docker
 
 In case you are operating the crawler within a container, Arachnid2
-
-
-
+can attempt to prevent the container from running out of memory.
+By default, it will end the crawl when the container uses >= 80%
+of its available memory. You can override this with the
+`memory_limit` option.
 
 ### Non-HTML links
 
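A minimal crawl sketch using the `memory_limit` option described above. The URL and the 89.99 value come from the README's own examples; the block body is illustrative and simply prints each response URL.

```ruby
require "arachnid2"

spider = Arachnid2.new("http://sixcolours.com")

# memory_limit is a percentage of the container's memory limit; per the README,
# missing or out-of-range values fall back to the ~80% default.
opts = { time_box: 30, memory_limit: 89.99 }

spider.crawl(opts) do |response|
  puts response.effective_url
end
```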
data/lib/arachnid2.rb
CHANGED
@@ -37,7 +37,11 @@ class Arachnid2
   }
   MEMORY_USE_FILE = "/sys/fs/cgroup/memory/memory.usage_in_bytes"
   MEMORY_LIMIT_FILE = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
-
+  DEFAULT_MAXIMUM_LOAD_RATE = 79.9
+
+  DEFAULT_TIMEOUT = 10_000
+  MINIMUM_TIMEOUT = 1
+  MAXIMUM_TIMEOUT = 999_999
 
   #
   # Creates the object to execute the crawl
@@ -64,11 +68,14 @@ class Arachnid2
   # spider = Arachnid2.new(url)
   #
   # opts = {
+  #   :followlocation => true,
+  #   :timeout => 25000,
   #   :time_box => 30,
   #   :headers => {
   #     'Accept-Language' => "en-UK",
   #     'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
   #   },
+  #   :memory_limit => 89.99,
   #   :proxy => {
   #     :ip => "1.2.3.4",
   #     :port => "1234",
@@ -89,34 +96,35 @@ class Arachnid2
     preflight(opts)
 
     until @global_queue.empty?
-      @
-
-      q = @global_queue.shift
+      @max_concurrency.times do
+        q = @global_queue.shift
 
-
-
-
+        break if @global_visited.size >= @crawl_options[:max_urls]
+        break if Time.now > @crawl_options[:time_limit]
+        break if memory_danger?
 
-
+        @global_visited.insert(q)
 
-
+        request = Typhoeus::Request.new(q, request_options)
 
-
-
-
+        request.on_complete do |response|
+          links = process(response)
+          next unless links
 
-
+          yield response
 
-
-      end
-
-      request.run
-    ensure
-      @cookie_file.close! if @cookie_file
+          vacuum(links, response)
         end
-
-
-
+
+        @hydra.queue(request)
+      end # @max_concurrency.times do
+
+      @hydra.run
+    end # until @global_queue.empty?
+
+  ensure
+    @cookie_file.close! if @cookie_file
+  end # def crawl(opts = {})
 
   private
   def process(response)
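The reworked `crawl` above replaces per-request `request.run` with a `Typhoeus::Hydra` that runs batches of up to `@max_concurrency` queued requests. A hedged usage sketch follows; the `:max_concurrency` option is read by the new `max_concurrency` helper later in this diff and defaults to 1 when absent or invalid, and the URL and block body are illustrative.

```ruby
require "arachnid2"

spider = Arachnid2.new("http://sixcolours.com")

opts = {
  time_box: 30,
  max_urls: 50,
  max_concurrency: 5 # up to 5 requests queued per Hydra run; defaults to 1
}

# Each completed Typhoeus::Response is yielded from request.on_complete.
spider.crawl(opts) do |response|
  puts "#{response.code} #{response.effective_url}"
end
```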
@@ -153,12 +161,24 @@ class Arachnid2
   def preflight(opts)
     @options = opts
     @crawl_options = crawl_options
-
-
+    @maximum_load_rate = maximum_load_rate
+    @max_concurrency = max_concurrency
+    @hydra = Typhoeus::Hydra.new(:max_concurrency => @max_concurrency)
     @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true)
     @global_queue = [@url]
   end
 
+  def max_concurrency
+    @max_concurrency ||= nil
+
+    if !@max_concurrency
+      @max_concurrency = "#{@options[:max_concurrency]}".to_i
+      @max_concurrency = 1 unless (@max_concurrency > 0)
+    end
+
+    @max_concurrency
+  end
+
   def bound_time
     boundary = "#{@options[:time_box]}".to_i
     boundary = BASE_CRAWL_TIME if boundary <= 0
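The `max_concurrency` helper above uses the same string-interpolate-then-`to_i` coercion as `bound_time`: anything that does not produce a positive integer becomes 1. A small standalone sketch of that coercion (`coerce_concurrency` is a hypothetical name, not the gem's code):

```ruby
# Mirrors the coercion in max_concurrency (illustrative only).
def coerce_concurrency(value)
  n = "#{value}".to_i # nil => "" => 0; "abc" => 0; "5" => 5
  n > 0 ? n : 1
end

coerce_concurrency(nil) # => 1
coerce_concurrency("8") # => 8
coerce_concurrency(-3)  # => 1
```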
@@ -175,12 +195,30 @@ class Arachnid2
     amount
   end
 
+  def followlocation
+    if @followlocation.is_a?(NilClass)
+      @followlocation = @options[:followlocation]
+      @followlocation = true unless @followlocation.is_a?(FalseClass)
+    end
+    @followlocation
+  end
+
+  def timeout
+    if !@timeout
+      @timeout = @options[:timeout]
+      @timeout = DEFAULT_TIMEOUT unless @timeout.is_a?(Integer)
+      @timeout = DEFAULT_TIMEOUT if @timeout > MAXIMUM_TIMEOUT
+      @timeout = DEFAULT_TIMEOUT if @timeout < MINIMUM_TIMEOUT
+    end
+    @timeout
+  end
+
   def request_options
     @cookie_file ||= Tempfile.new('cookies')
 
     @request_options = {
-      timeout:
-      followlocation:
+      timeout: timeout,
+      followlocation: followlocation,
       cookiefile: @cookie_file.path,
       cookiejar: @cookie_file.path,
       headers: @options[:headers]
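Per the new `timeout` helper above, only an Integer between `MINIMUM_TIMEOUT` and `MAXIMUM_TIMEOUT` is used as-is; anything else falls back to `DEFAULT_TIMEOUT`. A standalone sketch of that mapping (`effective_timeout` is a hypothetical name mirroring the logic shown in the hunk):

```ruby
DEFAULT_TIMEOUT = 10_000
MINIMUM_TIMEOUT = 1
MAXIMUM_TIMEOUT = 999_999

# Mirrors the timeout helper shown above (illustrative only).
def effective_timeout(value)
  return DEFAULT_TIMEOUT unless value.is_a?(Integer)
  return DEFAULT_TIMEOUT if value > MAXIMUM_TIMEOUT || value < MINIMUM_TIMEOUT
  value
end

effective_timeout(nil)       # => 10_000 (not an Integer)
effective_timeout("5000")    # => 10_000 (String, not Integer)
effective_timeout(1_500_000) # => 10_000 (above MAXIMUM_TIMEOUT)
effective_timeout(25_000)    # => 25_000 (used as-is)
```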
@@ -236,7 +274,7 @@ class Arachnid2
 
     return false unless ( (use > 0.0) && (@limit > 0.0) )
 
-    return ( ( (use / @limit) * 100.0 ) >=
+    return ( ( (use / @limit) * 100.0 ) >= @maximum_load_rate )
   end
 
   def in_docker?
@@ -244,4 +282,15 @@ class Arachnid2
     true
   end
 
+  def maximum_load_rate
+    @maximum_load_rate ||= nil
+
+    if !@maximum_load_rate
+      @maximum_load_rate = "#{@options[:memory_limit]}".to_f
+      @maximum_load_rate = DEFAULT_MAXIMUM_LOAD_RATE unless ((@maximum_load_rate > 0.0) && (@maximum_load_rate < 100.0))
+    end
+
+    @maximum_load_rate
+  end
+
 end
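The `maximum_load_rate` value feeds the `memory_danger?` check in the `@@ -236,7 +274,7` hunk: the crawl stops once cgroup memory usage reaches that percentage of the cgroup limit. Below is a self-contained sketch of that comparison, assuming the cgroup v1 files named by `MEMORY_USE_FILE` and `MEMORY_LIMIT_FILE` are readable; the full body of the gem's method is not shown in this diff, so the guard clauses here are illustrative.

```ruby
MEMORY_USE_FILE   = "/sys/fs/cgroup/memory/memory.usage_in_bytes"
MEMORY_LIMIT_FILE = "/sys/fs/cgroup/memory/memory.limit_in_bytes"

# Illustrative re-creation of the comparison used by memory_danger?.
def memory_danger?(maximum_load_rate = 79.9)
  return false unless File.readable?(MEMORY_USE_FILE) && File.readable?(MEMORY_LIMIT_FILE)

  use   = File.read(MEMORY_USE_FILE).to_f
  limit = File.read(MEMORY_LIMIT_FILE).to_f
  return false unless (use > 0.0) && (limit > 0.0)

  ((use / limit) * 100.0) >= maximum_load_rate
end
```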
data/lib/arachnid2/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: arachnid2
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.3
 platform: ruby
 authors:
 - Sam Nissen
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-
+date: 2018-07-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler