rhack 1.2.10 → 1.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/rhack.rb +0 -1
- data/lib/rhack/curl/response.rb +6 -4
- data/lib/rhack/frame.rb +13 -3
- data/lib/rhack/page.rb +35 -42
- data/lib/rhack/scout.rb +28 -22
- data/lib/rhack/version.rb +1 -1
- data/rhack.gemspec +1 -5
- metadata +4 -46
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ccd52c86276421476a67a4b379fcb6275bd6f85e
|
4
|
+
data.tar.gz: 46efe342223575cb26eceddca4970173b7f32cdf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: af9bbd50acdc9ee228bc3ccede8fc31da56dd762d39a96f85c847e3c223167f2aff0fff6f4ef30ee68e7d7b3fa91e51d74cf670c7b7968bd9d4ec70861ca3ae9
|
7
|
+
data.tar.gz: 7d0f7418a4af8d3ff2bcbfab5ec26fbca85ad0664570242255fee2822ffbf3c528ab8e4ba9979d2a545b6599a77bd80f15ab6aa2f24297bb947fdc134a7457b0
|
data/lib/rhack.rb
CHANGED
data/lib/rhack/curl/response.rb
CHANGED
@@ -8,7 +8,7 @@ module Curl
|
|
8
8
|
def to_s
|
9
9
|
str = '<#'
|
10
10
|
if @error
|
11
|
-
str << "#{@error
|
11
|
+
str << "#{@error.name}: #{@error.message}"
|
12
12
|
else
|
13
13
|
str << (@header[/\d{3}/] == @code.to_s ? @header : "#{@header[/\S+/]} #{@code}") if @header
|
14
14
|
if @hash.location
|
@@ -77,14 +77,16 @@ module Curl
|
|
77
77
|
|
78
78
|
def is(klass)
|
79
79
|
if @error
|
80
|
-
|
80
|
+
$log.warn "obsolete comparison with Array", caller: 1 if klass == Array
|
81
|
+
klass == Array || klass = Curl::Response # obsolete
|
81
82
|
else
|
82
83
|
klass == Curl::Response
|
83
84
|
end
|
84
85
|
end
|
85
86
|
|
86
|
-
def [](
|
87
|
-
|
87
|
+
def [](key)
|
88
|
+
#@error ? @error[key_or_index] : @hash[key_or_index.downcase] # old
|
89
|
+
@hash[key.downcase]
|
88
90
|
end
|
89
91
|
|
90
92
|
alias :headers :hash
|
data/lib/rhack/frame.rb
CHANGED
@@ -22,6 +22,7 @@ module RHACK
|
|
22
22
|
class Frame
|
23
23
|
__init__
|
24
24
|
attr_reader :loc, :static, :ss, :opts, :use_cache, :write_to
|
25
|
+
alias options opts
|
25
26
|
@@cache = {}
|
26
27
|
|
27
28
|
def initialize *args
|
@@ -56,7 +57,7 @@ module RHACK
|
|
56
57
|
@ss.update to, forced
|
57
58
|
update_loc to
|
58
59
|
end
|
59
|
-
alias
|
60
|
+
alias target= retarget
|
60
61
|
|
61
62
|
def anchor
|
62
63
|
retarget @loc.href
|
@@ -222,7 +223,7 @@ module RHACK
|
|
222
223
|
(opts[:headers] ||= {})['X-Requested-With'] = 'XMLHttpRequest' if opts[:xhr]
|
223
224
|
if opts[:content_type]
|
224
225
|
if opts[:content_type].is Symbol
|
225
|
-
if mime_type =
|
226
|
+
if mime_type = MIME::Types.of(opts[:content_type])[0]
|
226
227
|
(opts[:headers] ||= {})['Content-Type'] = mime_type.content_type
|
227
228
|
else
|
228
229
|
raise ArgumentError, "failed to detect Mime::Type by extension: #{opts[:content_type]}
|
@@ -329,6 +330,15 @@ module RHACK
|
|
329
330
|
page = opts[:result].new
|
330
331
|
# if no spare scouts can be found, squad simply waits for first callbacks to complete
|
331
332
|
s = @ss.next
|
333
|
+
s.http.on_failure {|curl, error|
|
334
|
+
s.process_failure(*error)
|
335
|
+
if opts[:raw]
|
336
|
+
page.res = s.error
|
337
|
+
elsif page.process(curl, opts)
|
338
|
+
run_callbacks! page, opts, &callback
|
339
|
+
# nothing to do here if process returns nil or false
|
340
|
+
end
|
341
|
+
}
|
332
342
|
s.send(*(order << opts)) {|curl|
|
333
343
|
# there is a problem with storing html on disk
|
334
344
|
if order[0] == :loadGet and @write_to
|
@@ -343,7 +353,7 @@ module RHACK
|
|
343
353
|
elsif page.process(curl, opts)
|
344
354
|
@@cache[page.href] = page if order[0] == :loadGet and @use_cache
|
345
355
|
run_callbacks! page, opts, &callback
|
346
|
-
|
356
|
+
# nothing to do here if process returns nil or false
|
347
357
|
end
|
348
358
|
}
|
349
359
|
# > Carier.requests++
|
data/lib/rhack/page.rb
CHANGED
@@ -72,8 +72,8 @@ module RHACK
|
|
72
72
|
|
73
73
|
def inspect
|
74
74
|
sz = size
|
75
|
-
if
|
76
|
-
"<##{self.class.name} (#{@data
|
75
|
+
if @json or @hash
|
76
|
+
"<##{self.class.name} (#{@data ? sz.bytes : 'failed to parse'}) #{@json ? 'json' : 'url params'}>"
|
77
77
|
else
|
78
78
|
"<##{self.class.name} #{sz == 0 ? '(empty)' : "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{sz.bytes})"}#{' js enabled' if @js and @doc}>"
|
79
79
|
end
|
@@ -91,7 +91,7 @@ module RHACK
|
|
91
91
|
|
92
92
|
# override this in a subclass
|
93
93
|
def failed?(*)
|
94
|
-
@curl_res.code != 200
|
94
|
+
@curl_res.error or @curl_res.code != 200
|
95
95
|
end
|
96
96
|
|
97
97
|
# override this in a subclass
|
@@ -103,14 +103,6 @@ module RHACK
|
|
103
103
|
# MUST return self if successful
|
104
104
|
# MAY return false otherwise
|
105
105
|
def parse(opts={})
|
106
|
-
if failed?
|
107
|
-
failed!
|
108
|
-
if opts[:json] or opts[:hash]
|
109
|
-
@data = false
|
110
|
-
end
|
111
|
-
return self
|
112
|
-
end
|
113
|
-
|
114
106
|
if opts[:json]
|
115
107
|
parse_json opts
|
116
108
|
elsif opts[:hash]
|
@@ -120,15 +112,23 @@ module RHACK
|
|
120
112
|
else
|
121
113
|
parse_html opts
|
122
114
|
end
|
123
|
-
|
124
115
|
self
|
125
116
|
end
|
126
117
|
|
127
118
|
private
|
128
119
|
|
120
|
+
# @failed means failure cause
|
121
|
+
# MUST return false if you don't want this client
|
122
|
+
# to call main callback in case of failure
|
123
|
+
# MUST return true otherwise
|
129
124
|
def failed!
|
130
|
-
|
131
|
-
|
125
|
+
if @curl_res.error
|
126
|
+
@failed = @curl_res.error
|
127
|
+
else
|
128
|
+
@body = @curl_res.body
|
129
|
+
@failed = @curl_res.code
|
130
|
+
end
|
131
|
+
false
|
132
132
|
end
|
133
133
|
|
134
134
|
def log_failed(action)
|
@@ -171,7 +171,9 @@ module RHACK
|
|
171
171
|
end
|
172
172
|
end
|
173
173
|
|
174
|
+
# urlencoded
|
174
175
|
def parse_hash(*)
|
176
|
+
@hash = true
|
175
177
|
if @curl_res.body.inline
|
176
178
|
@data = @curl_res.body.to_params
|
177
179
|
else
|
@@ -184,15 +186,22 @@ module RHACK
|
|
184
186
|
public
|
185
187
|
|
186
188
|
# We can then alternate #process in Page subclasses
|
187
|
-
# Frame doesn't mind about value returned by #process
|
188
|
-
|
189
|
-
|
190
|
-
@
|
191
|
-
@
|
189
|
+
# Frame doesn't mind about the value returned by #process
|
190
|
+
# unless it is nil or false
|
191
|
+
def process(curl, opts={})
|
192
|
+
@loc = curl.last_effective_url.parse :uri
|
193
|
+
@curl = curl
|
194
|
+
@curl_res = curl.res
|
192
195
|
|
193
|
-
if
|
194
|
-
|
195
|
-
|
196
|
+
if failed?
|
197
|
+
should_proceed = failed! # false by default
|
198
|
+
if retry?
|
199
|
+
curl.retry!
|
200
|
+
return false # callback will not proceed
|
201
|
+
end
|
202
|
+
unless should_proceed
|
203
|
+
return false # nor callback or retry will not proceed
|
204
|
+
end
|
196
205
|
end
|
197
206
|
|
198
207
|
L.debug "#{@loc.fullpath} -> #{@curl_res}"
|
@@ -519,11 +528,7 @@ module RHACK
|
|
519
528
|
# MUST return self if successful
|
520
529
|
# MAY return false otherwise
|
521
530
|
def parse(opts={})
|
522
|
-
|
523
|
-
failed!
|
524
|
-
else
|
525
|
-
parse_xml opts
|
526
|
-
end
|
531
|
+
parse_xml opts
|
527
532
|
self
|
528
533
|
end
|
529
534
|
|
@@ -537,11 +542,7 @@ module RHACK
|
|
537
542
|
# MUST return self if successful
|
538
543
|
# MAY return false otherwise
|
539
544
|
def parse(opts={})
|
540
|
-
|
541
|
-
failed!
|
542
|
-
else
|
543
|
-
parse_html opts
|
544
|
-
end
|
545
|
+
parse_html opts
|
545
546
|
self
|
546
547
|
end
|
547
548
|
|
@@ -555,11 +556,7 @@ module RHACK
|
|
555
556
|
# MUST return self if successful
|
556
557
|
# MAY return false otherwise
|
557
558
|
def parse(opts={})
|
558
|
-
|
559
|
-
failed!
|
560
|
-
else
|
561
|
-
parse_json opts
|
562
|
-
end
|
559
|
+
parse_json opts
|
563
560
|
self
|
564
561
|
end
|
565
562
|
|
@@ -573,11 +570,7 @@ module RHACK
|
|
573
570
|
# MUST return self if successful
|
574
571
|
# MAY return false otherwise
|
575
572
|
def parse(opts={})
|
576
|
-
|
577
|
-
failed!
|
578
|
-
else
|
579
|
-
parse_hash opts
|
580
|
-
end
|
573
|
+
parse_hash opts
|
581
574
|
self
|
582
575
|
end
|
583
576
|
|
data/lib/rhack/scout.rb
CHANGED
@@ -142,7 +142,7 @@ module RHACK
|
|
142
142
|
name = v.is(Hash) && v[:name] ||
|
143
143
|
File.basename(path)
|
144
144
|
content_type = v.is(Hash) && v[:content_type].to_s ||
|
145
|
-
(
|
145
|
+
(MIME::Types.of(path)[0] || {}).content_type ||
|
146
146
|
"application/octet-stream"
|
147
147
|
Curl::PostField.file(k, type, name, read(path))
|
148
148
|
else
|
@@ -235,6 +235,23 @@ module RHACK
|
|
235
235
|
!loaded?
|
236
236
|
end
|
237
237
|
|
238
|
+
def process_failure(curl_err, message)
|
239
|
+
@error = curl_err.new message
|
240
|
+
#@error = [curl_err, message] # old
|
241
|
+
@http.outdate!
|
242
|
+
# we must clean @http.on_complete, otherwise
|
243
|
+
# it would run right after this function and with broken data
|
244
|
+
@http.on_complete &Proc::NULL
|
245
|
+
if retry? curl_err
|
246
|
+
L.debug "#{curl_err} -> reloading scout"
|
247
|
+
retry!
|
248
|
+
else
|
249
|
+
L.debug "#{curl_err} -> not reloading scout"
|
250
|
+
raise @error if @raise_err
|
251
|
+
#raise *@error if @raise_err # old
|
252
|
+
end
|
253
|
+
end
|
254
|
+
|
238
255
|
def load!
|
239
256
|
unless Curl.carier.add @http
|
240
257
|
Curl.carier.remove @http
|
@@ -253,7 +270,7 @@ module RHACK
|
|
253
270
|
end
|
254
271
|
|
255
272
|
def load(path=@path, headers={}, not_redir=1, relvl=10, &callback)
|
256
|
-
# cache preprocessed data for one time
|
273
|
+
# cache preprocessed data for one time so we can do #retry
|
257
274
|
@__path = path
|
258
275
|
@__headers = headers
|
259
276
|
@__not_redir = not_redir
|
@@ -264,36 +281,25 @@ module RHACK
|
|
264
281
|
@http.headers = mkHeader(path).merge!(headers)
|
265
282
|
@http.timeout = @timeout
|
266
283
|
|
267
|
-
@http.on_complete {|
|
284
|
+
@http.on_complete {|curl| # = @http
|
268
285
|
# > Carier.requests--
|
269
286
|
@error = nil
|
270
287
|
# While not outdated, Curl::Response here may contain pointers on freed
|
271
288
|
# memory, thus throwing exception on #to_s and #inspect
|
272
|
-
|
273
|
-
|
289
|
+
@http.outdate!
|
290
|
+
res = @http.res
|
291
|
+
ProcCookies res if @cookieProc
|
274
292
|
# We cannot just cancel on_complete in on_redirect block
|
275
293
|
# because loadGet will immediately reset on_complete back
|
276
|
-
if
|
294
|
+
if res.code.in(300..399) and !not_redir.b and (relvl -= 1) > -1 and loc = res.hash.location
|
277
295
|
loadGet(loc, headers: headers, relvl: relvl, redir: true, &callback)
|
278
296
|
elsif block_given?
|
279
|
-
yield
|
297
|
+
yield @http
|
280
298
|
end
|
281
299
|
}
|
282
|
-
@http.on_failure {|
|
283
|
-
|
284
|
-
|
285
|
-
c.outdate!
|
286
|
-
# we must clean @http.on_complete, otherwise
|
287
|
-
# it would run right after this function and with broken data
|
288
|
-
@http.on_complete &Proc::NULL
|
289
|
-
if retry? eclass
|
290
|
-
L.debug "#{eclass} -> reloading scout"
|
291
|
-
retry!
|
292
|
-
else
|
293
|
-
L.debug "#{eclass} -> not reloading scout"
|
294
|
-
raise *e if @raise_err
|
295
|
-
end
|
296
|
-
} if !@http.on_failure
|
300
|
+
@http.on_failure {|curl, error|
|
301
|
+
process_failure(*error)
|
302
|
+
} unless @http.on_failure
|
297
303
|
|
298
304
|
load!
|
299
305
|
end
|
data/lib/rhack/version.rb
CHANGED
data/rhack.gemspec
CHANGED
@@ -16,13 +16,9 @@ Gem::Specification.new do |spec|
|
|
16
16
|
spec.files = `git ls-files`.split($/)
|
17
17
|
spec.require_paths = ["lib"]
|
18
18
|
|
19
|
-
spec.add_runtime_dependency "activesupport", "~> 3"
|
20
19
|
#spec.add_runtime_dependency "redis"
|
21
|
-
spec.add_runtime_dependency "rmtools", "~> 2.
|
20
|
+
spec.add_runtime_dependency "rmtools", "~> 2.4"
|
22
21
|
spec.add_runtime_dependency "libxml-ruby", "~> 2.7"
|
23
22
|
|
24
23
|
spec.extensions << 'ext/curb/extconf.rb'
|
25
|
-
|
26
|
-
spec.add_development_dependency "bundler", "~> 0"
|
27
|
-
spec.add_development_dependency "rake", "~> 0"
|
28
24
|
end
|
metadata
CHANGED
@@ -1,43 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rhack
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sergey Baev
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-10-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: activesupport
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ~>
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '3'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ~>
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '3'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: rmtools
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
30
16
|
requirements:
|
31
17
|
- - ~>
|
32
18
|
- !ruby/object:Gem::Version
|
33
|
-
version: '2.
|
19
|
+
version: '2.4'
|
34
20
|
type: :runtime
|
35
21
|
prerelease: false
|
36
22
|
version_requirements: !ruby/object:Gem::Requirement
|
37
23
|
requirements:
|
38
24
|
- - ~>
|
39
25
|
- !ruby/object:Gem::Version
|
40
|
-
version: '2.
|
26
|
+
version: '2.4'
|
41
27
|
- !ruby/object:Gem::Dependency
|
42
28
|
name: libxml-ruby
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -52,34 +38,6 @@ dependencies:
|
|
52
38
|
- - ~>
|
53
39
|
- !ruby/object:Gem::Version
|
54
40
|
version: '2.7'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: bundler
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ~>
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
type: :development
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - ~>
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: rake
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ~>
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - ~>
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
83
41
|
description: 'RHACK is Ruby Http ACcess Kit: curl-based web-client framework created
|
84
42
|
for developing web-scrapers/bots.\n\nFeatures:\nAsynchronous, still EventMachine
|
85
43
|
independent\nFast as on simple queries as on high load\n3 levels of flexible configuration\nWeb-client
|