arachnid2 0.1.2 → 0.1.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 35ba8ebca2f7ed2459bdf6db38053835a6977bb51b34627fd2f9642bb3b7abb3
-  data.tar.gz: f3ed7cd8b5cbabb1643bb2fee9121a8df97e9ba05c4d5452f53e32fa91bd8090
+  metadata.gz: 0ebb9ed9cdef3106462796f1b7fcc2483d58857bf605f04c800733358ea3f486
+  data.tar.gz: 86eaaf1bd44b85ee564b1bf3aeb08e2be326462c6bf8d3291ba2bc2f55e7c444
 SHA512:
-  metadata.gz: b99ae55035e1064fa2140d8d5c13e924a47ba52905611586090c0e35650e31d77067c39a00245d0be5364945f24a025821f18b45917b8584fd3aec72fd0190b3
-  data.tar.gz: ebf1e8ca73ed4a964b0b33de3ca396ba48b59df1777402ac9cc8d3ba1126cd5ea63b559c5d55a88f51595af8f58acefa592cefebaae8f590d697586b5ac27461
+  metadata.gz: 1b8f1d5798379c75502cf36046c3110f02ae6abbd12a8ecaa7c501e1efd0d8d86393abd64212f12e3c3f951ddf5f8f6fed5ed15a9937305cf63ff92523087c89
+  data.tar.gz: 4f02afa25d537346b2cc6daaa16ae7b19956c5478bdfb5f4e2a9188825ad3f687691ad96025570661fd97dd12954c0e367be9cbdf9267f726b34ad13ea50bff0
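These checksums cover the two archives packaged inside the `.gem` file. A minimal verification sketch, assuming a `checksums.yaml` like the one above sits alongside the unpacked `metadata.gz` and `data.tar.gz` (a `.gem` is a plain tar archive):

```ruby
require "digest"
require "yaml"

# Compare each packaged archive against the SHA256 entries above.
checksums = YAML.load_file("checksums.yaml")

%w[metadata.gz data.tar.gz].each do |file|
  actual   = Digest::SHA256.file(file).hexdigest
  expected = checksums["SHA256"][file]
  puts "#{file}: #{actual == expected ? 'OK' : 'MISMATCH'}"
end
```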
Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    arachnid2 (0.1.1)
+    arachnid2 (0.1.2)
       addressable
       adomain
       bloomfilter-rb
@@ -20,7 +20,7 @@ GEM
     diff-lcs (1.3)
     ethon (0.11.0)
       ffi (>= 1.3.0)
-    ffi (1.9.23)
+    ffi (1.9.25)
     mini_portile2 (2.3.0)
     nokogiri (1.8.2)
       mini_portile2 (~> 2.3.0)
data/README.md CHANGED
@@ -63,10 +63,15 @@ require "arachnid2"
 url = "http://sixcolours.com"
 spider = Arachnid2.new(url)
 opts = {
+  followlocation: true,
+  timeout: 10000,
   time_box: 60,
   max_urls: 50,
-  language: "en-UK",
-  user_agent: "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"
+  :headers => {
+    'Accept-Language' => "en-UK",
+    'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
+  },
+  memory_limit: 89.99,
   proxy: {
     ip: "1.2.3.4",
     port: "1234",
@@ -94,16 +99,18 @@ If no valid integer is provided, it will crawl for 50 URLs before exiting.
 10000 seconds is the current maximum,
 and any value above it will be reduced to 10000.
 
-#### `language`
+#### `headers`
 
-The language is a string mapped to the HTTP header Accept-Language. The
-default is
-`en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, \*;0.4`
+This is a hash of any HTTP header key/value pairs you desire,
+and it is passed directly to Typhoeus. Before it is sent, a default
+language and user agent are created:
+
+##### Defaults
 
-#### `user_agent`
+The HTTP header `Accept-Language` default is
+`en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, en-LU;q=0.6, en;q=0.5, \*;0.4`
 
-This user agent is a string mapped to the HTTP header User-Agent. The
-default is
+The HTTP header `User-Agent` default is
 `Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15`
 
 #### `proxy`
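The defaults listed above only stand in for headers the caller did not supply. As an illustration (this merge is a sketch of the documented behavior, not the gem's exact code):

```ruby
# Sketch only: the documented defaults, merged so user-supplied keys win.
DEFAULT_HEADERS = {
  "Accept-Language" => "en-IE, en-UK;q=0.9, en-NL;q=0.8, en-MT;q=0.7, " \
                       "en-LU;q=0.6, en;q=0.5, *;0.4",
  "User-Agent"      => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) " \
                       "AppleWebKit/605.1.15 (KHTML, like Gecko) " \
                       "Version/11.1 Safari/605.1.15"
}

headers = DEFAULT_HEADERS.merge("User-Agent" => "MyCrawler/1.0")
```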
@@ -112,12 +119,13 @@ Provide your IP, port for a proxy. If required, provide credentials for
 authenticating to that proxy. Proxy options and handling are done
 by Typhoeus.
 
-### Memory use in Docker
+#### `memory_limit` and Docker
 
 In case you are operating the crawler within a container, Arachnid2
-will attempt to prevent the container from running out of memory,
-and crawls will end when the container has <= 20% of its available memory
-free.
+can attempt to prevent the container from running out of memory.
+By default, it will end the crawl when the container uses >= 80%
+of its available memory. You can override this with the
+`memory_limit` option.
 
 ### Non-HTML links
data/lib/arachnid2.rb CHANGED
@@ -37,7 +37,11 @@ class Arachnid2
   }
   MEMORY_USE_FILE = "/sys/fs/cgroup/memory/memory.usage_in_bytes"
   MEMORY_LIMIT_FILE = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
-  MAXIMUM_LOAD_RATE = 79.9
+  DEFAULT_MAXIMUM_LOAD_RATE = 79.9
+
+  DEFAULT_TIMEOUT = 10_000
+  MINIMUM_TIMEOUT = 1
+  MAXIMUM_TIMEOUT = 999_999
 
   #
   # Creates the object to execute the crawl
@@ -64,11 +68,14 @@ class Arachnid2
   #   spider = Arachnid2.new(url)
   #
   #   opts = {
+  #     :followlocation => true,
+  #     :timeout => 25000,
   #     :time_box => 30,
   #     :headers => {
   #       'Accept-Language' => "en-UK",
   #       'User-Agent' => "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0",
   #     },
+  #     :memory_limit => 89.99,
   #     :proxy => {
   #       :ip => "1.2.3.4",
   #       :port => "1234",
@@ -89,34 +96,35 @@ class Arachnid2
     preflight(opts)
 
     until @global_queue.empty?
-      @global_queue.size.times do
-        begin
-          q = @global_queue.shift
+      @max_concurrency.times do
+        q = @global_queue.shift
 
-          break if @global_visited.size >= @crawl_options[:max_urls]
-          break if Time.now > @crawl_options[:time_limit]
-          break if memory_danger?
+        break if @global_visited.size >= @crawl_options[:max_urls]
+        break if Time.now > @crawl_options[:time_limit]
+        break if memory_danger?
 
-          @global_visited.insert(q)
+        @global_visited.insert(q)
 
-          request = Typhoeus::Request.new(q, request_options)
+        request = Typhoeus::Request.new(q, request_options)
 
-          request.on_complete do |response|
-            links = process(response)
-            next unless links
+        request.on_complete do |response|
+          links = process(response)
+          next unless links
 
-            yield response
+          yield response
 
-            vacuum(links, response)
-          end
-
-          request.run
-        ensure
-          @cookie_file.close! if @cookie_file
+          vacuum(links, response)
         end
-      end
-    end
-  end
+
+        @hydra.queue(request)
+      end # @max_concurrency.times do
+
+      @hydra.run
+    end # until @global_queue.empty?
+
+  ensure
+    @cookie_file.close! if @cookie_file
+  end # def crawl(opts = {})
 
   private
   def process(response)
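The rewritten loop above replaces one-at-a-time `request.run` calls with Typhoeus's Hydra: up to `@max_concurrency` requests are queued, then executed as a concurrent batch. A standalone sketch of that pattern (the URLs are placeholders):

```ruby
require "typhoeus"

hydra = Typhoeus::Hydra.new(max_concurrency: 5)

%w[http://example.com/a http://example.com/b].each do |url|
  request = Typhoeus::Request.new(url, followlocation: true)
  request.on_complete do |response|
    puts "#{response.code} #{response.effective_url}"
  end
  hydra.queue(request) # queued, not yet executed
end

hydra.run # runs every queued request concurrently
```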
@@ -153,12 +161,24 @@ class Arachnid2
   def preflight(opts)
     @options = opts
     @crawl_options = crawl_options
-    # TODO: write looping to take advantage of Hydra
-    # @hydra = Typhoeus::Hydra.new(:max_concurrency => 1)
+    @maximum_load_rate = maximum_load_rate
+    @max_concurrency = max_concurrency
+    @hydra = Typhoeus::Hydra.new(:max_concurrency => @max_concurrency)
     @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => true)
     @global_queue = [@url]
   end
 
+  def max_concurrency
+    @max_concurrency ||= nil
+
+    if !@max_concurrency
+      @max_concurrency = "#{@options[:max_concurrency]}".to_i
+      @max_concurrency = 1 unless (@max_concurrency > 0)
+    end
+
+    @max_concurrency
+  end
+
   def bound_time
     boundary = "#{@options[:time_box]}".to_i
     boundary = BASE_CRAWL_TIME if boundary <= 0
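The new `max_concurrency` reader coerces the option through string interpolation and `to_i`, falling back to 1 (the old serial behavior) for anything non-positive. A few illustrative coercions:

```ruby
"#{nil}".to_i # => 0 -> falls back to 1 (serial, the pre-0.1.3 behavior)
"abc".to_i    # => 0 -> falls back to 1
"#{5}".to_i   # => 5 -> up to five requests queued per Hydra batch
"#{5.9}".to_i # => 5
```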
@@ -175,12 +195,30 @@ class Arachnid2
     amount
   end
 
+  def followlocation
+    if @followlocation.is_a?(NilClass)
+      @followlocation = @options[:followlocation]
+      @followlocation = true unless @followlocation.is_a?(FalseClass)
+    end
+    @followlocation
+  end
+
+  def timeout
+    if !@timeout
+      @timeout = @options[:timeout]
+      @timeout = DEFAULT_TIMEOUT unless @timeout.is_a?(Integer)
+      @timeout = DEFAULT_TIMEOUT if @timeout > MAXIMUM_TIMEOUT
+      @timeout = DEFAULT_TIMEOUT if @timeout < MINIMUM_TIMEOUT
+    end
+    @timeout
+  end
+
   def request_options
     @cookie_file ||= Tempfile.new('cookies')
 
     @request_options = {
-      timeout: 10000,
-      followlocation: true,
+      timeout: timeout,
+      followlocation: followlocation,
       cookiefile: @cookie_file.path,
       cookiejar: @cookie_file.path,
       headers: @options[:headers]
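The `timeout` reader accepts only Integers and clamps them to `MINIMUM_TIMEOUT..MAXIMUM_TIMEOUT`, while `followlocation` defaults to `true` unless explicitly set to `false`. A standalone mirror of the timeout clamping (`resolve_timeout` is a hypothetical helper, not part of the gem):

```ruby
# Hypothetical helper mirroring the timeout reader's clamping.
def resolve_timeout(value, default: 10_000, min: 1, max: 999_999)
  return default unless value.is_a?(Integer)
  return default if value > max || value < min
  value
end

resolve_timeout(25_000)    # => 25000
resolve_timeout("25000")   # => 10000 (non-Integer falls back to the default)
resolve_timeout(0)         # => 10000 (below MINIMUM_TIMEOUT)
resolve_timeout(1_000_000) # => 10000 (above MAXIMUM_TIMEOUT)
```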
@@ -236,7 +274,7 @@ class Arachnid2
 
     return false unless ( (use > 0.0) && (@limit > 0.0) )
 
-    return ( ( (use / @limit) * 100.0 ) >= MAXIMUM_LOAD_RATE )
+    return ( ( (use / @limit) * 100.0 ) >= @maximum_load_rate )
   end
 
   def in_docker?
@@ -244,4 +282,15 @@ class Arachnid2
     true
   end
 
+  def maximum_load_rate
+    @maximum_load_rate ||= nil
+
+    if !@maximum_load_rate
+      @maximum_load_rate = "#{@options[:memory_limit]}".to_f
+      @maximum_load_rate = DEFAULT_MAXIMUM_LOAD_RATE unless ((@maximum_load_rate > 0.0) && (@maximum_load_rate < 100.0))
+    end
+
+    @maximum_load_rate
+  end
+
 end
data/lib/arachnid2/version.rb CHANGED
@@ -1,3 +1,3 @@
 class Arachnid2
-  VERSION = "0.1.2"
+  VERSION = "0.1.3"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: arachnid2
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - Sam Nissen
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-06-12 00:00:00.000000000 Z
+date: 2018-07-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler