rhack 1.2.10 → 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 98ec5c6f4bcfb4bbc988ac69fc4148a7042d348c
4
- data.tar.gz: d12c3aaad05efbc0df476d790213d0bcd9065beb
3
+ metadata.gz: ccd52c86276421476a67a4b379fcb6275bd6f85e
4
+ data.tar.gz: 46efe342223575cb26eceddca4970173b7f32cdf
5
5
  SHA512:
6
- metadata.gz: 8eef5e956ef5b444c33e6ddb28c56e9e4c8e9c2cdaec9ced9124569e461a6aed26bff7a913e8b02009105428171aa30fd5bd9a88e5829e4c7ee9a91abd78bb28
7
- data.tar.gz: c40ca66013388dbd0459d4b1d97cb8c9ebce03ff942d225b77affedbf4dfb76c897691305737c5e8b9e7bec719c603738a7c518b464e8b253d51bd259c6873f1
6
+ metadata.gz: af9bbd50acdc9ee228bc3ccede8fc31da56dd762d39a96f85c847e3c223167f2aff0fff6f4ef30ee68e7d7b3fa91e51d74cf670c7b7968bd9d4ec70861ca3ae9
7
+ data.tar.gz: 7d0f7418a4af8d3ff2bcbfab5ec26fbca85ad0664570242255fee2822ffbf3c528ab8e4ba9979d2a545b6599a77bd80f15ab6aa2f24297bb947fdc134a7457b0
@@ -10,7 +10,6 @@ require "rhack/js/johnson"
10
10
  module RHACK
11
11
  mattr_reader :config, :redis, :useragents
12
12
 
13
- Dir.chdir ENV['APP_ROOT'] if ENV['APP_ROOT']
14
13
  cfgfile = Dir['{config/,}rhack.yml'].first
15
14
  @@config = cfgfile ? YAML.load(IO.read(cfgfile)) : {}
16
15
 
@@ -8,7 +8,7 @@ module Curl
8
8
  def to_s
9
9
  str = '<#'
10
10
  if @error
11
- str << "#{@error[0].self_name}: #{@error[1]}"
11
+ str << "#{@error.name}: #{@error.message}"
12
12
  else
13
13
  str << (@header[/\d{3}/] == @code.to_s ? @header : "#{@header[/\S+/]} #{@code}") if @header
14
14
  if @hash.location
@@ -77,14 +77,16 @@ module Curl
77
77
 
78
78
  def is(klass)
79
79
  if @error
80
- klass == Array || klass = Curl::Response
80
+ $log.warn "obsolete comparison with Array", caller: 1 if klass == Array
81
+ klass == Array || klass = Curl::Response # obsolete
81
82
  else
82
83
  klass == Curl::Response
83
84
  end
84
85
  end
85
86
 
86
- def [](key_or_index)
87
- @error ? @error[key_or_index] : @hash[key_or_index.downcase]
87
+ def [](key)
88
+ #@error ? @error[key_or_index] : @hash[key_or_index.downcase] # old
89
+ @hash[key.downcase]
88
90
  end
89
91
 
90
92
  alias :headers :hash
@@ -22,6 +22,7 @@ module RHACK
22
22
  class Frame
23
23
  __init__
24
24
  attr_reader :loc, :static, :ss, :opts, :use_cache, :write_to
25
+ alias options opts
25
26
  @@cache = {}
26
27
 
27
28
  def initialize *args
@@ -56,7 +57,7 @@ module RHACK
56
57
  @ss.update to, forced
57
58
  update_loc to
58
59
  end
59
- alias :target= :retarget
60
+ alias target= retarget
60
61
 
61
62
  def anchor
62
63
  retarget @loc.href
@@ -222,7 +223,7 @@ module RHACK
222
223
  (opts[:headers] ||= {})['X-Requested-With'] = 'XMLHttpRequest' if opts[:xhr]
223
224
  if opts[:content_type]
224
225
  if opts[:content_type].is Symbol
225
- if mime_type = Mime::Types.of(opts[:content_type])[0]
226
+ if mime_type = MIME::Types.of(opts[:content_type])[0]
226
227
  (opts[:headers] ||= {})['Content-Type'] = mime_type.content_type
227
228
  else
228
229
  raise ArgumentError, "failed to detect Mime::Type by extension: #{opts[:content_type]}
@@ -329,6 +330,15 @@ module RHACK
329
330
  page = opts[:result].new
330
331
  # if no spare scouts can be found, squad simply waits for first callbacks to complete
331
332
  s = @ss.next
333
+ s.http.on_failure {|curl, error|
334
+ s.process_failure(*error)
335
+ if opts[:raw]
336
+ page.res = s.error
337
+ elsif page.process(curl, opts)
338
+ run_callbacks! page, opts, &callback
339
+ # nothing to do here if process returns nil or false
340
+ end
341
+ }
332
342
  s.send(*(order << opts)) {|curl|
333
343
  # there is a problem with storing html on disk
334
344
  if order[0] == :loadGet and @write_to
@@ -343,7 +353,7 @@ module RHACK
343
353
  elsif page.process(curl, opts)
344
354
  @@cache[page.href] = page if order[0] == :loadGet and @use_cache
345
355
  run_callbacks! page, opts, &callback
346
- # nothing to do here if process returns nil or false
356
+ # nothing to do here if process returns nil or false
347
357
  end
348
358
  }
349
359
  # > Carier.requests++
@@ -72,8 +72,8 @@ module RHACK
72
72
 
73
73
  def inspect
74
74
  sz = size
75
- if !@data.nil?
76
- "<##{self.class.name} (#{@data == false ? 'failed to parse' : sz.bytes}) #{@json ? 'json' : 'url params'}>"
75
+ if @json or @hash
76
+ "<##{self.class.name} (#{@data ? sz.bytes : 'failed to parse'}) #{@json ? 'json' : 'url params'}>"
77
77
  else
78
78
  "<##{self.class.name} #{sz == 0 ? '(empty)' : "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{sz.bytes})"}#{' js enabled' if @js and @doc}>"
79
79
  end
@@ -91,7 +91,7 @@ module RHACK
91
91
 
92
92
  # override this in a subclass
93
93
  def failed?(*)
94
- @curl_res.code != 200
94
+ @curl_res.error or @curl_res.code != 200
95
95
  end
96
96
 
97
97
  # override this in a subclass
@@ -103,14 +103,6 @@ module RHACK
103
103
  # MUST return self if successful
104
104
  # MAY return false otherwise
105
105
  def parse(opts={})
106
- if failed?
107
- failed!
108
- if opts[:json] or opts[:hash]
109
- @data = false
110
- end
111
- return self
112
- end
113
-
114
106
  if opts[:json]
115
107
  parse_json opts
116
108
  elsif opts[:hash]
@@ -120,15 +112,23 @@ module RHACK
120
112
  else
121
113
  parse_html opts
122
114
  end
123
-
124
115
  self
125
116
  end
126
117
 
127
118
  private
128
119
 
120
+ # @failed means failure cause
121
+ # MUST return false if you don't want this client
122
+ # to call main callback in case of failure
123
+ # MUST return true otherwise
129
124
  def failed!
130
- @body = @curl_res.body
131
- @failed = @curl_res.code
125
+ if @curl_res.error
126
+ @failed = @curl_res.error
127
+ else
128
+ @body = @curl_res.body
129
+ @failed = @curl_res.code
130
+ end
131
+ false
132
132
  end
133
133
 
134
134
  def log_failed(action)
@@ -171,7 +171,9 @@ module RHACK
171
171
  end
172
172
  end
173
173
 
174
+ # urlencoded
174
175
  def parse_hash(*)
176
+ @hash = true
175
177
  if @curl_res.body.inline
176
178
  @data = @curl_res.body.to_params
177
179
  else
@@ -184,15 +186,22 @@ module RHACK
184
186
  public
185
187
 
186
188
  # We can then alternate #process in Page subclasses
187
- # Frame doesn't mind about value returned by #process
188
- def process(c, opts={})
189
- @loc = c.last_effective_url.parse:uri
190
- @curl = c
191
- @curl_res = c.res
189
+ # Frame doesn't mind about the value returned by #process
190
+ # unless it is nil or false
191
+ def process(curl, opts={})
192
+ @loc = curl.last_effective_url.parse :uri
193
+ @curl = curl
194
+ @curl_res = curl.res
192
195
 
193
- if retry?
194
- c.retry!
195
- return # callback will not proceed
196
+ if failed?
197
+ should_proceed = failed! # false by default
198
+ if retry?
199
+ curl.retry!
200
+ return false # callback will not proceed
201
+ end
202
+ unless should_proceed
203
+ return false # neither callback nor retry will proceed
204
+ end
196
205
  end
197
206
 
198
207
  L.debug "#{@loc.fullpath} -> #{@curl_res}"
@@ -519,11 +528,7 @@ module RHACK
519
528
  # MUST return self if successful
520
529
  # MAY return false otherwise
521
530
  def parse(opts={})
522
- if failed?
523
- failed!
524
- else
525
- parse_xml opts
526
- end
531
+ parse_xml opts
527
532
  self
528
533
  end
529
534
 
@@ -537,11 +542,7 @@ module RHACK
537
542
  # MUST return self if successful
538
543
  # MAY return false otherwise
539
544
  def parse(opts={})
540
- if failed?
541
- failed!
542
- else
543
- parse_html opts
544
- end
545
+ parse_html opts
545
546
  self
546
547
  end
547
548
 
@@ -555,11 +556,7 @@ module RHACK
555
556
  # MUST return self if successful
556
557
  # MAY return false otherwise
557
558
  def parse(opts={})
558
- if failed?
559
- failed!
560
- else
561
- parse_json opts
562
- end
559
+ parse_json opts
563
560
  self
564
561
  end
565
562
 
@@ -573,11 +570,7 @@ module RHACK
573
570
  # MUST return self if successful
574
571
  # MAY return false otherwise
575
572
  def parse(opts={})
576
- if failed?
577
- failed!
578
- else
579
- parse_hash opts
580
- end
573
+ parse_hash opts
581
574
  self
582
575
  end
583
576
 
@@ -142,7 +142,7 @@ module RHACK
142
142
  name = v.is(Hash) && v[:name] ||
143
143
  File.basename(path)
144
144
  content_type = v.is(Hash) && v[:content_type].to_s ||
145
- (Mime::Types.of(path)[0] || {}).content_type ||
145
+ (MIME::Types.of(path)[0] || {}).content_type ||
146
146
  "application/octet-stream"
147
147
  Curl::PostField.file(k, type, name, read(path))
148
148
  else
@@ -235,6 +235,23 @@ module RHACK
235
235
  !loaded?
236
236
  end
237
237
 
238
+ def process_failure(curl_err, message)
239
+ @error = curl_err.new message
240
+ #@error = [curl_err, message] # old
241
+ @http.outdate!
242
+ # we must clean @http.on_complete, otherwise
243
+ # it would run right after this function and with broken data
244
+ @http.on_complete &Proc::NULL
245
+ if retry? curl_err
246
+ L.debug "#{curl_err} -> reloading scout"
247
+ retry!
248
+ else
249
+ L.debug "#{curl_err} -> not reloading scout"
250
+ raise @error if @raise_err
251
+ #raise *@error if @raise_err # old
252
+ end
253
+ end
254
+
238
255
  def load!
239
256
  unless Curl.carier.add @http
240
257
  Curl.carier.remove @http
@@ -253,7 +270,7 @@ module RHACK
253
270
  end
254
271
 
255
272
  def load(path=@path, headers={}, not_redir=1, relvl=10, &callback)
256
- # cache preprocessed data for one time for we can do #retry
273
+ # cache preprocessed data for one time so we can do #retry
257
274
  @__path = path
258
275
  @__headers = headers
259
276
  @__not_redir = not_redir
@@ -264,36 +281,25 @@ module RHACK
264
281
  @http.headers = mkHeader(path).merge!(headers)
265
282
  @http.timeout = @timeout
266
283
 
267
- @http.on_complete {|c|
284
+ @http.on_complete {|curl| # = @http
268
285
  # > Carier.requests--
269
286
  @error = nil
270
287
  # While not outdated, Curl::Response here may contain pointers to freed
271
288
  # memory, thus throwing exception on #to_s and #inspect
272
- c.outdate!
273
- ProcCookies c.res if @cookieProc
289
+ @http.outdate!
290
+ res = @http.res
291
+ ProcCookies res if @cookieProc
274
292
  # We cannot just cancel on_complete in on_redirect block
275
293
  # because loadGet will immediately reset on_complete back
276
- if c.res.code.in(300..399) and !not_redir.b and (relvl -= 1) > -1 and loc = c.res.hash.location
294
+ if res.code.in(300..399) and !not_redir.b and (relvl -= 1) > -1 and loc = res.hash.location
277
295
  loadGet(loc, headers: headers, relvl: relvl, redir: true, &callback)
278
296
  elsif block_given?
279
- yield c
297
+ yield @http
280
298
  end
281
299
  }
282
- @http.on_failure {|c, e|
283
- eclass = e[0]
284
- @error = e
285
- c.outdate!
286
- # we must clean @http.on_complete, otherwise
287
- # it would run right after this function and with broken data
288
- @http.on_complete &Proc::NULL
289
- if retry? eclass
290
- L.debug "#{eclass} -> reloading scout"
291
- retry!
292
- else
293
- L.debug "#{eclass} -> not reloading scout"
294
- raise *e if @raise_err
295
- end
296
- } if !@http.on_failure
300
+ @http.on_failure {|curl, error|
301
+ process_failure(*error)
302
+ } unless @http.on_failure
297
303
 
298
304
  load!
299
305
  end
@@ -1,3 +1,3 @@
1
1
  module RHACK
2
- VERSION = '1.2.10'
2
+ VERSION = '1.3.0'
3
3
  end
@@ -16,13 +16,9 @@ Gem::Specification.new do |spec|
16
16
  spec.files = `git ls-files`.split($/)
17
17
  spec.require_paths = ["lib"]
18
18
 
19
- spec.add_runtime_dependency "activesupport", "~> 3"
20
19
  #spec.add_runtime_dependency "redis"
21
- spec.add_runtime_dependency "rmtools", "~> 2.3"
20
+ spec.add_runtime_dependency "rmtools", "~> 2.4"
22
21
  spec.add_runtime_dependency "libxml-ruby", "~> 2.7"
23
22
 
24
23
  spec.extensions << 'ext/curb/extconf.rb'
25
-
26
- spec.add_development_dependency "bundler", "~> 0"
27
- spec.add_development_dependency "rake", "~> 0"
28
24
  end
metadata CHANGED
@@ -1,43 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rhack
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.10
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sergey Baev
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-06 00:00:00.000000000 Z
11
+ date: 2014-10-15 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: activesupport
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ~>
18
- - !ruby/object:Gem::Version
19
- version: '3'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ~>
25
- - !ruby/object:Gem::Version
26
- version: '3'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: rmtools
29
15
  requirement: !ruby/object:Gem::Requirement
30
16
  requirements:
31
17
  - - ~>
32
18
  - !ruby/object:Gem::Version
33
- version: '2.3'
19
+ version: '2.4'
34
20
  type: :runtime
35
21
  prerelease: false
36
22
  version_requirements: !ruby/object:Gem::Requirement
37
23
  requirements:
38
24
  - - ~>
39
25
  - !ruby/object:Gem::Version
40
- version: '2.3'
26
+ version: '2.4'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: libxml-ruby
43
29
  requirement: !ruby/object:Gem::Requirement
@@ -52,34 +38,6 @@ dependencies:
52
38
  - - ~>
53
39
  - !ruby/object:Gem::Version
54
40
  version: '2.7'
55
- - !ruby/object:Gem::Dependency
56
- name: bundler
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - ~>
60
- - !ruby/object:Gem::Version
61
- version: '0'
62
- type: :development
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - ~>
67
- - !ruby/object:Gem::Version
68
- version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: rake
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ~>
74
- - !ruby/object:Gem::Version
75
- version: '0'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - ~>
81
- - !ruby/object:Gem::Version
82
- version: '0'
83
41
  description: 'RHACK is Ruby Http ACcess Kit: curl-based web-client framework created
84
42
  for developing web-scrapers/bots.\n\nFeatures:\nAsynchronous, still EventMachine
85
43
  independent\nFast as on simple queries as on high load\n3 levels of flexible configuration\nWeb-client