rhack 1.2.10 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/rhack.rb +0 -1
- data/lib/rhack/curl/response.rb +6 -4
- data/lib/rhack/frame.rb +13 -3
- data/lib/rhack/page.rb +35 -42
- data/lib/rhack/scout.rb +28 -22
- data/lib/rhack/version.rb +1 -1
- data/rhack.gemspec +1 -5
- metadata +4 -46
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ccd52c86276421476a67a4b379fcb6275bd6f85e
|
4
|
+
data.tar.gz: 46efe342223575cb26eceddca4970173b7f32cdf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: af9bbd50acdc9ee228bc3ccede8fc31da56dd762d39a96f85c847e3c223167f2aff0fff6f4ef30ee68e7d7b3fa91e51d74cf670c7b7968bd9d4ec70861ca3ae9
|
7
|
+
data.tar.gz: 7d0f7418a4af8d3ff2bcbfab5ec26fbca85ad0664570242255fee2822ffbf3c528ab8e4ba9979d2a545b6599a77bd80f15ab6aa2f24297bb947fdc134a7457b0
|
data/lib/rhack.rb
CHANGED
data/lib/rhack/curl/response.rb
CHANGED
@@ -8,7 +8,7 @@ module Curl
|
|
8
8
|
def to_s
|
9
9
|
str = '<#'
|
10
10
|
if @error
|
11
|
-
str << "#{@error
|
11
|
+
str << "#{@error.name}: #{@error.message}"
|
12
12
|
else
|
13
13
|
str << (@header[/\d{3}/] == @code.to_s ? @header : "#{@header[/\S+/]} #{@code}") if @header
|
14
14
|
if @hash.location
|
@@ -77,14 +77,16 @@ module Curl
|
|
77
77
|
|
78
78
|
def is(klass)
|
79
79
|
if @error
|
80
|
-
|
80
|
+
$log.warn "obsolete comparison with Array", caller: 1 if klass == Array
|
81
|
+
klass == Array || klass = Curl::Response # obsolete
|
81
82
|
else
|
82
83
|
klass == Curl::Response
|
83
84
|
end
|
84
85
|
end
|
85
86
|
|
86
|
-
def [](
|
87
|
-
|
87
|
+
def [](key)
|
88
|
+
#@error ? @error[key_or_index] : @hash[key_or_index.downcase] # old
|
89
|
+
@hash[key.downcase]
|
88
90
|
end
|
89
91
|
|
90
92
|
alias :headers :hash
|
data/lib/rhack/frame.rb
CHANGED
@@ -22,6 +22,7 @@ module RHACK
|
|
22
22
|
class Frame
|
23
23
|
__init__
|
24
24
|
attr_reader :loc, :static, :ss, :opts, :use_cache, :write_to
|
25
|
+
alias options opts
|
25
26
|
@@cache = {}
|
26
27
|
|
27
28
|
def initialize *args
|
@@ -56,7 +57,7 @@ module RHACK
|
|
56
57
|
@ss.update to, forced
|
57
58
|
update_loc to
|
58
59
|
end
|
59
|
-
alias
|
60
|
+
alias target= retarget
|
60
61
|
|
61
62
|
def anchor
|
62
63
|
retarget @loc.href
|
@@ -222,7 +223,7 @@ module RHACK
|
|
222
223
|
(opts[:headers] ||= {})['X-Requested-With'] = 'XMLHttpRequest' if opts[:xhr]
|
223
224
|
if opts[:content_type]
|
224
225
|
if opts[:content_type].is Symbol
|
225
|
-
if mime_type =
|
226
|
+
if mime_type = MIME::Types.of(opts[:content_type])[0]
|
226
227
|
(opts[:headers] ||= {})['Content-Type'] = mime_type.content_type
|
227
228
|
else
|
228
229
|
raise ArgumentError, "failed to detect Mime::Type by extension: #{opts[:content_type]}
|
@@ -329,6 +330,15 @@ module RHACK
|
|
329
330
|
page = opts[:result].new
|
330
331
|
# if no spare scouts can be found, squad simply waits for first callbacks to complete
|
331
332
|
s = @ss.next
|
333
|
+
s.http.on_failure {|curl, error|
|
334
|
+
s.process_failure(*error)
|
335
|
+
if opts[:raw]
|
336
|
+
page.res = s.error
|
337
|
+
elsif page.process(curl, opts)
|
338
|
+
run_callbacks! page, opts, &callback
|
339
|
+
# nothing to do here if process returns nil or false
|
340
|
+
end
|
341
|
+
}
|
332
342
|
s.send(*(order << opts)) {|curl|
|
333
343
|
# there is a problem with storing html on disk
|
334
344
|
if order[0] == :loadGet and @write_to
|
@@ -343,7 +353,7 @@ module RHACK
|
|
343
353
|
elsif page.process(curl, opts)
|
344
354
|
@@cache[page.href] = page if order[0] == :loadGet and @use_cache
|
345
355
|
run_callbacks! page, opts, &callback
|
346
|
-
|
356
|
+
# nothing to do here if process returns nil or false
|
347
357
|
end
|
348
358
|
}
|
349
359
|
# > Carier.requests++
|
data/lib/rhack/page.rb
CHANGED
@@ -72,8 +72,8 @@ module RHACK
|
|
72
72
|
|
73
73
|
def inspect
|
74
74
|
sz = size
|
75
|
-
if
|
76
|
-
"<##{self.class.name} (#{@data
|
75
|
+
if @json or @hash
|
76
|
+
"<##{self.class.name} (#{@data ? sz.bytes : 'failed to parse'}) #{@json ? 'json' : 'url params'}>"
|
77
77
|
else
|
78
78
|
"<##{self.class.name} #{sz == 0 ? '(empty)' : "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{sz.bytes})"}#{' js enabled' if @js and @doc}>"
|
79
79
|
end
|
@@ -91,7 +91,7 @@ module RHACK
|
|
91
91
|
|
92
92
|
# override this in a subclass
|
93
93
|
def failed?(*)
|
94
|
-
@curl_res.code != 200
|
94
|
+
@curl_res.error or @curl_res.code != 200
|
95
95
|
end
|
96
96
|
|
97
97
|
# override this in a subclass
|
@@ -103,14 +103,6 @@ module RHACK
|
|
103
103
|
# MUST return self if successful
|
104
104
|
# MAY return false otherwise
|
105
105
|
def parse(opts={})
|
106
|
-
if failed?
|
107
|
-
failed!
|
108
|
-
if opts[:json] or opts[:hash]
|
109
|
-
@data = false
|
110
|
-
end
|
111
|
-
return self
|
112
|
-
end
|
113
|
-
|
114
106
|
if opts[:json]
|
115
107
|
parse_json opts
|
116
108
|
elsif opts[:hash]
|
@@ -120,15 +112,23 @@ module RHACK
|
|
120
112
|
else
|
121
113
|
parse_html opts
|
122
114
|
end
|
123
|
-
|
124
115
|
self
|
125
116
|
end
|
126
117
|
|
127
118
|
private
|
128
119
|
|
120
|
+
# @failed means failure cause
|
121
|
+
# MUST return false if you don't want this client
|
122
|
+
# to call main callback in case of failure
|
123
|
+
# MUST return true otherwise
|
129
124
|
def failed!
|
130
|
-
|
131
|
-
|
125
|
+
if @curl_res.error
|
126
|
+
@failed = @curl_res.error
|
127
|
+
else
|
128
|
+
@body = @curl_res.body
|
129
|
+
@failed = @curl_res.code
|
130
|
+
end
|
131
|
+
false
|
132
132
|
end
|
133
133
|
|
134
134
|
def log_failed(action)
|
@@ -171,7 +171,9 @@ module RHACK
|
|
171
171
|
end
|
172
172
|
end
|
173
173
|
|
174
|
+
# urlencoded
|
174
175
|
def parse_hash(*)
|
176
|
+
@hash = true
|
175
177
|
if @curl_res.body.inline
|
176
178
|
@data = @curl_res.body.to_params
|
177
179
|
else
|
@@ -184,15 +186,22 @@ module RHACK
|
|
184
186
|
public
|
185
187
|
|
186
188
|
# We can then alternate #process in Page subclasses
|
187
|
-
# Frame doesn't mind about value returned by #process
|
188
|
-
|
189
|
-
|
190
|
-
@
|
191
|
-
@
|
189
|
+
# Frame doesn't mind about the value returned by #process
|
190
|
+
# unless it is nil or false
|
191
|
+
def process(curl, opts={})
|
192
|
+
@loc = curl.last_effective_url.parse :uri
|
193
|
+
@curl = curl
|
194
|
+
@curl_res = curl.res
|
192
195
|
|
193
|
-
if
|
194
|
-
|
195
|
-
|
196
|
+
if failed?
|
197
|
+
should_proceed = failed! # false by default
|
198
|
+
if retry?
|
199
|
+
curl.retry!
|
200
|
+
return false # callback will not proceed
|
201
|
+
end
|
202
|
+
unless should_proceed
|
203
|
+
return false # nor callback or retry will not proceed
|
204
|
+
end
|
196
205
|
end
|
197
206
|
|
198
207
|
L.debug "#{@loc.fullpath} -> #{@curl_res}"
|
@@ -519,11 +528,7 @@ module RHACK
|
|
519
528
|
# MUST return self if successful
|
520
529
|
# MAY return false otherwise
|
521
530
|
def parse(opts={})
|
522
|
-
|
523
|
-
failed!
|
524
|
-
else
|
525
|
-
parse_xml opts
|
526
|
-
end
|
531
|
+
parse_xml opts
|
527
532
|
self
|
528
533
|
end
|
529
534
|
|
@@ -537,11 +542,7 @@ module RHACK
|
|
537
542
|
# MUST return self if successful
|
538
543
|
# MAY return false otherwise
|
539
544
|
def parse(opts={})
|
540
|
-
|
541
|
-
failed!
|
542
|
-
else
|
543
|
-
parse_html opts
|
544
|
-
end
|
545
|
+
parse_html opts
|
545
546
|
self
|
546
547
|
end
|
547
548
|
|
@@ -555,11 +556,7 @@ module RHACK
|
|
555
556
|
# MUST return self if successful
|
556
557
|
# MAY return false otherwise
|
557
558
|
def parse(opts={})
|
558
|
-
|
559
|
-
failed!
|
560
|
-
else
|
561
|
-
parse_json opts
|
562
|
-
end
|
559
|
+
parse_json opts
|
563
560
|
self
|
564
561
|
end
|
565
562
|
|
@@ -573,11 +570,7 @@ module RHACK
|
|
573
570
|
# MUST return self if successful
|
574
571
|
# MAY return false otherwise
|
575
572
|
def parse(opts={})
|
576
|
-
|
577
|
-
failed!
|
578
|
-
else
|
579
|
-
parse_hash opts
|
580
|
-
end
|
573
|
+
parse_hash opts
|
581
574
|
self
|
582
575
|
end
|
583
576
|
|
data/lib/rhack/scout.rb
CHANGED
@@ -142,7 +142,7 @@ module RHACK
|
|
142
142
|
name = v.is(Hash) && v[:name] ||
|
143
143
|
File.basename(path)
|
144
144
|
content_type = v.is(Hash) && v[:content_type].to_s ||
|
145
|
-
(
|
145
|
+
(MIME::Types.of(path)[0] || {}).content_type ||
|
146
146
|
"application/octet-stream"
|
147
147
|
Curl::PostField.file(k, type, name, read(path))
|
148
148
|
else
|
@@ -235,6 +235,23 @@ module RHACK
|
|
235
235
|
!loaded?
|
236
236
|
end
|
237
237
|
|
238
|
+
def process_failure(curl_err, message)
|
239
|
+
@error = curl_err.new message
|
240
|
+
#@error = [curl_err, message] # old
|
241
|
+
@http.outdate!
|
242
|
+
# we must clean @http.on_complete, otherwise
|
243
|
+
# it would run right after this function and with broken data
|
244
|
+
@http.on_complete &Proc::NULL
|
245
|
+
if retry? curl_err
|
246
|
+
L.debug "#{curl_err} -> reloading scout"
|
247
|
+
retry!
|
248
|
+
else
|
249
|
+
L.debug "#{curl_err} -> not reloading scout"
|
250
|
+
raise @error if @raise_err
|
251
|
+
#raise *@error if @raise_err # old
|
252
|
+
end
|
253
|
+
end
|
254
|
+
|
238
255
|
def load!
|
239
256
|
unless Curl.carier.add @http
|
240
257
|
Curl.carier.remove @http
|
@@ -253,7 +270,7 @@ module RHACK
|
|
253
270
|
end
|
254
271
|
|
255
272
|
def load(path=@path, headers={}, not_redir=1, relvl=10, &callback)
|
256
|
-
# cache preprocessed data for one time
|
273
|
+
# cache preprocessed data for one time so we can do #retry
|
257
274
|
@__path = path
|
258
275
|
@__headers = headers
|
259
276
|
@__not_redir = not_redir
|
@@ -264,36 +281,25 @@ module RHACK
|
|
264
281
|
@http.headers = mkHeader(path).merge!(headers)
|
265
282
|
@http.timeout = @timeout
|
266
283
|
|
267
|
-
@http.on_complete {|
|
284
|
+
@http.on_complete {|curl| # = @http
|
268
285
|
# > Carier.requests--
|
269
286
|
@error = nil
|
270
287
|
# While not outdated, Curl::Response here may contain pointers on freed
|
271
288
|
# memory, thus throwing exception on #to_s and #inspect
|
272
|
-
|
273
|
-
|
289
|
+
@http.outdate!
|
290
|
+
res = @http.res
|
291
|
+
ProcCookies res if @cookieProc
|
274
292
|
# We cannot just cancel on_complete in on_redirect block
|
275
293
|
# because loadGet will immediately reset on_complete back
|
276
|
-
if
|
294
|
+
if res.code.in(300..399) and !not_redir.b and (relvl -= 1) > -1 and loc = res.hash.location
|
277
295
|
loadGet(loc, headers: headers, relvl: relvl, redir: true, &callback)
|
278
296
|
elsif block_given?
|
279
|
-
yield
|
297
|
+
yield @http
|
280
298
|
end
|
281
299
|
}
|
282
|
-
@http.on_failure {|
|
283
|
-
|
284
|
-
|
285
|
-
c.outdate!
|
286
|
-
# we must clean @http.on_complete, otherwise
|
287
|
-
# it would run right after this function and with broken data
|
288
|
-
@http.on_complete &Proc::NULL
|
289
|
-
if retry? eclass
|
290
|
-
L.debug "#{eclass} -> reloading scout"
|
291
|
-
retry!
|
292
|
-
else
|
293
|
-
L.debug "#{eclass} -> not reloading scout"
|
294
|
-
raise *e if @raise_err
|
295
|
-
end
|
296
|
-
} if !@http.on_failure
|
300
|
+
@http.on_failure {|curl, error|
|
301
|
+
process_failure(*error)
|
302
|
+
} unless @http.on_failure
|
297
303
|
|
298
304
|
load!
|
299
305
|
end
|
data/lib/rhack/version.rb
CHANGED
data/rhack.gemspec
CHANGED
@@ -16,13 +16,9 @@ Gem::Specification.new do |spec|
|
|
16
16
|
spec.files = `git ls-files`.split($/)
|
17
17
|
spec.require_paths = ["lib"]
|
18
18
|
|
19
|
-
spec.add_runtime_dependency "activesupport", "~> 3"
|
20
19
|
#spec.add_runtime_dependency "redis"
|
21
|
-
spec.add_runtime_dependency "rmtools", "~> 2.
|
20
|
+
spec.add_runtime_dependency "rmtools", "~> 2.4"
|
22
21
|
spec.add_runtime_dependency "libxml-ruby", "~> 2.7"
|
23
22
|
|
24
23
|
spec.extensions << 'ext/curb/extconf.rb'
|
25
|
-
|
26
|
-
spec.add_development_dependency "bundler", "~> 0"
|
27
|
-
spec.add_development_dependency "rake", "~> 0"
|
28
24
|
end
|
metadata
CHANGED
@@ -1,43 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rhack
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.
|
4
|
+
version: 1.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sergey Baev
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-10-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: activesupport
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - ~>
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '3'
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - ~>
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '3'
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: rmtools
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
30
16
|
requirements:
|
31
17
|
- - ~>
|
32
18
|
- !ruby/object:Gem::Version
|
33
|
-
version: '2.
|
19
|
+
version: '2.4'
|
34
20
|
type: :runtime
|
35
21
|
prerelease: false
|
36
22
|
version_requirements: !ruby/object:Gem::Requirement
|
37
23
|
requirements:
|
38
24
|
- - ~>
|
39
25
|
- !ruby/object:Gem::Version
|
40
|
-
version: '2.
|
26
|
+
version: '2.4'
|
41
27
|
- !ruby/object:Gem::Dependency
|
42
28
|
name: libxml-ruby
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -52,34 +38,6 @@ dependencies:
|
|
52
38
|
- - ~>
|
53
39
|
- !ruby/object:Gem::Version
|
54
40
|
version: '2.7'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: bundler
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ~>
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
type: :development
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - ~>
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: rake
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ~>
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - ~>
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
83
41
|
description: 'RHACK is Ruby Http ACcess Kit: curl-based web-client framework created
|
84
42
|
for developing web-scrapers/bots.\n\nFeatures:\nAsynchronous, still EventMachine
|
85
43
|
independent\nFast as on simple queries as on high load\n3 levels of flexible configuration\nWeb-client
|