rhack 1.2.10 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 98ec5c6f4bcfb4bbc988ac69fc4148a7042d348c
4
- data.tar.gz: d12c3aaad05efbc0df476d790213d0bcd9065beb
3
+ metadata.gz: ccd52c86276421476a67a4b379fcb6275bd6f85e
4
+ data.tar.gz: 46efe342223575cb26eceddca4970173b7f32cdf
5
5
  SHA512:
6
- metadata.gz: 8eef5e956ef5b444c33e6ddb28c56e9e4c8e9c2cdaec9ced9124569e461a6aed26bff7a913e8b02009105428171aa30fd5bd9a88e5829e4c7ee9a91abd78bb28
7
- data.tar.gz: c40ca66013388dbd0459d4b1d97cb8c9ebce03ff942d225b77affedbf4dfb76c897691305737c5e8b9e7bec719c603738a7c518b464e8b253d51bd259c6873f1
6
+ metadata.gz: af9bbd50acdc9ee228bc3ccede8fc31da56dd762d39a96f85c847e3c223167f2aff0fff6f4ef30ee68e7d7b3fa91e51d74cf670c7b7968bd9d4ec70861ca3ae9
7
+ data.tar.gz: 7d0f7418a4af8d3ff2bcbfab5ec26fbca85ad0664570242255fee2822ffbf3c528ab8e4ba9979d2a545b6599a77bd80f15ab6aa2f24297bb947fdc134a7457b0
@@ -10,7 +10,6 @@ require "rhack/js/johnson"
10
10
  module RHACK
11
11
  mattr_reader :config, :redis, :useragents
12
12
 
13
- Dir.chdir ENV['APP_ROOT'] if ENV['APP_ROOT']
14
13
  cfgfile = Dir['{config/,}rhack.yml'].first
15
14
  @@config = cfgfile ? YAML.load(IO.read(cfgfile)) : {}
16
15
 
@@ -8,7 +8,7 @@ module Curl
8
8
  def to_s
9
9
  str = '<#'
10
10
  if @error
11
- str << "#{@error[0].self_name}: #{@error[1]}"
11
+ str << "#{@error.name}: #{@error.message}"
12
12
  else
13
13
  str << (@header[/\d{3}/] == @code.to_s ? @header : "#{@header[/\S+/]} #{@code}") if @header
14
14
  if @hash.location
@@ -77,14 +77,16 @@ module Curl
77
77
 
78
78
  def is(klass)
79
79
  if @error
80
- klass == Array || klass = Curl::Response
80
+ $log.warn "obsolete comparison with Array", caller: 1 if klass == Array
81
+ klass == Array || klass = Curl::Response # obsolete
81
82
  else
82
83
  klass == Curl::Response
83
84
  end
84
85
  end
85
86
 
86
- def [](key_or_index)
87
- @error ? @error[key_or_index] : @hash[key_or_index.downcase]
87
+ def [](key)
88
+ #@error ? @error[key_or_index] : @hash[key_or_index.downcase] # old
89
+ @hash[key.downcase]
88
90
  end
89
91
 
90
92
  alias :headers :hash
@@ -22,6 +22,7 @@ module RHACK
22
22
  class Frame
23
23
  __init__
24
24
  attr_reader :loc, :static, :ss, :opts, :use_cache, :write_to
25
+ alias options opts
25
26
  @@cache = {}
26
27
 
27
28
  def initialize *args
@@ -56,7 +57,7 @@ module RHACK
56
57
  @ss.update to, forced
57
58
  update_loc to
58
59
  end
59
- alias :target= :retarget
60
+ alias target= retarget
60
61
 
61
62
  def anchor
62
63
  retarget @loc.href
@@ -222,7 +223,7 @@ module RHACK
222
223
  (opts[:headers] ||= {})['X-Requested-With'] = 'XMLHttpRequest' if opts[:xhr]
223
224
  if opts[:content_type]
224
225
  if opts[:content_type].is Symbol
225
- if mime_type = Mime::Types.of(opts[:content_type])[0]
226
+ if mime_type = MIME::Types.of(opts[:content_type])[0]
226
227
  (opts[:headers] ||= {})['Content-Type'] = mime_type.content_type
227
228
  else
228
229
  raise ArgumentError, "failed to detect Mime::Type by extension: #{opts[:content_type]}
@@ -329,6 +330,15 @@ module RHACK
329
330
  page = opts[:result].new
330
331
  # if no spare scouts can be found, squad simply waits for first callbacks to complete
331
332
  s = @ss.next
333
+ s.http.on_failure {|curl, error|
334
+ s.process_failure(*error)
335
+ if opts[:raw]
336
+ page.res = s.error
337
+ elsif page.process(curl, opts)
338
+ run_callbacks! page, opts, &callback
339
+ # nothing to do here if process returns nil or false
340
+ end
341
+ }
332
342
  s.send(*(order << opts)) {|curl|
333
343
  # there is a problem with storing html on disk
334
344
  if order[0] == :loadGet and @write_to
@@ -343,7 +353,7 @@ module RHACK
343
353
  elsif page.process(curl, opts)
344
354
  @@cache[page.href] = page if order[0] == :loadGet and @use_cache
345
355
  run_callbacks! page, opts, &callback
346
- # nothing to do here if process returns nil or false
356
+ # nothing to do here if process returns nil or false
347
357
  end
348
358
  }
349
359
  # > Carier.requests++
@@ -72,8 +72,8 @@ module RHACK
72
72
 
73
73
  def inspect
74
74
  sz = size
75
- if !@data.nil?
76
- "<##{self.class.name} (#{@data == false ? 'failed to parse' : sz.bytes}) #{@json ? 'json' : 'url params'}>"
75
+ if @json or @hash
76
+ "<##{self.class.name} (#{@data ? sz.bytes : 'failed to parse'}) #{@json ? 'json' : 'url params'}>"
77
77
  else
78
78
  "<##{self.class.name} #{sz == 0 ? '(empty)' : "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{sz.bytes})"}#{' js enabled' if @js and @doc}>"
79
79
  end
@@ -91,7 +91,7 @@ module RHACK
91
91
 
92
92
  # override this in a subclass
93
93
  def failed?(*)
94
- @curl_res.code != 200
94
+ @curl_res.error or @curl_res.code != 200
95
95
  end
96
96
 
97
97
  # override this in a subclass
@@ -103,14 +103,6 @@ module RHACK
103
103
  # MUST return self if successful
104
104
  # MAY return false otherwise
105
105
  def parse(opts={})
106
- if failed?
107
- failed!
108
- if opts[:json] or opts[:hash]
109
- @data = false
110
- end
111
- return self
112
- end
113
-
114
106
  if opts[:json]
115
107
  parse_json opts
116
108
  elsif opts[:hash]
@@ -120,15 +112,23 @@ module RHACK
120
112
  else
121
113
  parse_html opts
122
114
  end
123
-
124
115
  self
125
116
  end
126
117
 
127
118
  private
128
119
 
120
+ # @failed means failure cause
121
+ # MUST return false if you don't want this client
122
+ # to call main callback in case of failure
123
+ # MUST return true otherwise
129
124
  def failed!
130
- @body = @curl_res.body
131
- @failed = @curl_res.code
125
+ if @curl_res.error
126
+ @failed = @curl_res.error
127
+ else
128
+ @body = @curl_res.body
129
+ @failed = @curl_res.code
130
+ end
131
+ false
132
132
  end
133
133
 
134
134
  def log_failed(action)
@@ -171,7 +171,9 @@ module RHACK
171
171
  end
172
172
  end
173
173
 
174
+ # urlencoded
174
175
  def parse_hash(*)
176
+ @hash = true
175
177
  if @curl_res.body.inline
176
178
  @data = @curl_res.body.to_params
177
179
  else
@@ -184,15 +186,22 @@ module RHACK
184
186
  public
185
187
 
186
188
  # We can then alternate #process in Page subclasses
187
- # Frame doesn't mind about value returned by #process
188
- def process(c, opts={})
189
- @loc = c.last_effective_url.parse:uri
190
- @curl = c
191
- @curl_res = c.res
189
+ # Frame doesn't mind about the value returned by #process
190
+ # unless it is nil or false
191
+ def process(curl, opts={})
192
+ @loc = curl.last_effective_url.parse :uri
193
+ @curl = curl
194
+ @curl_res = curl.res
192
195
 
193
- if retry?
194
- c.retry!
195
- return # callback will not proceed
196
+ if failed?
197
+ should_proceed = failed! # false by default
198
+ if retry?
199
+ curl.retry!
200
+ return false # callback will not proceed
201
+ end
202
+ unless should_proceed
203
+ return false # neither the callback nor a retry will proceed
204
+ end
196
205
  end
197
206
 
198
207
  L.debug "#{@loc.fullpath} -> #{@curl_res}"
@@ -519,11 +528,7 @@ module RHACK
519
528
  # MUST return self if successful
520
529
  # MAY return false otherwise
521
530
  def parse(opts={})
522
- if failed?
523
- failed!
524
- else
525
- parse_xml opts
526
- end
531
+ parse_xml opts
527
532
  self
528
533
  end
529
534
 
@@ -537,11 +542,7 @@ module RHACK
537
542
  # MUST return self if successful
538
543
  # MAY return false otherwise
539
544
  def parse(opts={})
540
- if failed?
541
- failed!
542
- else
543
- parse_html opts
544
- end
545
+ parse_html opts
545
546
  self
546
547
  end
547
548
 
@@ -555,11 +556,7 @@ module RHACK
555
556
  # MUST return self if successful
556
557
  # MAY return false otherwise
557
558
  def parse(opts={})
558
- if failed?
559
- failed!
560
- else
561
- parse_json opts
562
- end
559
+ parse_json opts
563
560
  self
564
561
  end
565
562
 
@@ -573,11 +570,7 @@ module RHACK
573
570
  # MUST return self if successful
574
571
  # MAY return false otherwise
575
572
  def parse(opts={})
576
- if failed?
577
- failed!
578
- else
579
- parse_hash opts
580
- end
573
+ parse_hash opts
581
574
  self
582
575
  end
583
576
 
@@ -142,7 +142,7 @@ module RHACK
142
142
  name = v.is(Hash) && v[:name] ||
143
143
  File.basename(path)
144
144
  content_type = v.is(Hash) && v[:content_type].to_s ||
145
- (Mime::Types.of(path)[0] || {}).content_type ||
145
+ (MIME::Types.of(path)[0] || {}).content_type ||
146
146
  "application/octet-stream"
147
147
  Curl::PostField.file(k, type, name, read(path))
148
148
  else
@@ -235,6 +235,23 @@ module RHACK
235
235
  !loaded?
236
236
  end
237
237
 
238
+ def process_failure(curl_err, message)
239
+ @error = curl_err.new message
240
+ #@error = [curl_err, message] # old
241
+ @http.outdate!
242
+ # we must clean @http.on_complete, otherwise
243
+ # it would run right after this function and with broken data
244
+ @http.on_complete &Proc::NULL
245
+ if retry? curl_err
246
+ L.debug "#{curl_err} -> reloading scout"
247
+ retry!
248
+ else
249
+ L.debug "#{curl_err} -> not reloading scout"
250
+ raise @error if @raise_err
251
+ #raise *@error if @raise_err # old
252
+ end
253
+ end
254
+
238
255
  def load!
239
256
  unless Curl.carier.add @http
240
257
  Curl.carier.remove @http
@@ -253,7 +270,7 @@ module RHACK
253
270
  end
254
271
 
255
272
  def load(path=@path, headers={}, not_redir=1, relvl=10, &callback)
256
- # cache preprocessed data for one time for we can do #retry
273
+ # cache preprocessed data for one time so we can do #retry
257
274
  @__path = path
258
275
  @__headers = headers
259
276
  @__not_redir = not_redir
@@ -264,36 +281,25 @@ module RHACK
264
281
  @http.headers = mkHeader(path).merge!(headers)
265
282
  @http.timeout = @timeout
266
283
 
267
- @http.on_complete {|c|
284
+ @http.on_complete {|curl| # = @http
268
285
  # > Carier.requests--
269
286
  @error = nil
270
287
  # While not outdated, Curl::Response here may contain pointers to freed
271
288
  # memory, thus throwing exception on #to_s and #inspect
272
- c.outdate!
273
- ProcCookies c.res if @cookieProc
289
+ @http.outdate!
290
+ res = @http.res
291
+ ProcCookies res if @cookieProc
274
292
  # We cannot just cancel on_complete in on_redirect block
275
293
  # because loadGet will immediately reset on_complete back
276
- if c.res.code.in(300..399) and !not_redir.b and (relvl -= 1) > -1 and loc = c.res.hash.location
294
+ if res.code.in(300..399) and !not_redir.b and (relvl -= 1) > -1 and loc = res.hash.location
277
295
  loadGet(loc, headers: headers, relvl: relvl, redir: true, &callback)
278
296
  elsif block_given?
279
- yield c
297
+ yield @http
280
298
  end
281
299
  }
282
- @http.on_failure {|c, e|
283
- eclass = e[0]
284
- @error = e
285
- c.outdate!
286
- # we must clean @http.on_complete, otherwise
287
- # it would run right after this function and with broken data
288
- @http.on_complete &Proc::NULL
289
- if retry? eclass
290
- L.debug "#{eclass} -> reloading scout"
291
- retry!
292
- else
293
- L.debug "#{eclass} -> not reloading scout"
294
- raise *e if @raise_err
295
- end
296
- } if !@http.on_failure
300
+ @http.on_failure {|curl, error|
301
+ process_failure(*error)
302
+ } unless @http.on_failure
297
303
 
298
304
  load!
299
305
  end
@@ -1,3 +1,3 @@
1
1
  module RHACK
2
- VERSION = '1.2.10'
2
+ VERSION = '1.3.0'
3
3
  end
@@ -16,13 +16,9 @@ Gem::Specification.new do |spec|
16
16
  spec.files = `git ls-files`.split($/)
17
17
  spec.require_paths = ["lib"]
18
18
 
19
- spec.add_runtime_dependency "activesupport", "~> 3"
20
19
  #spec.add_runtime_dependency "redis"
21
- spec.add_runtime_dependency "rmtools", "~> 2.3"
20
+ spec.add_runtime_dependency "rmtools", "~> 2.4"
22
21
  spec.add_runtime_dependency "libxml-ruby", "~> 2.7"
23
22
 
24
23
  spec.extensions << 'ext/curb/extconf.rb'
25
-
26
- spec.add_development_dependency "bundler", "~> 0"
27
- spec.add_development_dependency "rake", "~> 0"
28
24
  end
metadata CHANGED
@@ -1,43 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rhack
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.2.10
4
+ version: 1.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sergey Baev
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-08-06 00:00:00.000000000 Z
11
+ date: 2014-10-15 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: activesupport
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - ~>
18
- - !ruby/object:Gem::Version
19
- version: '3'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - ~>
25
- - !ruby/object:Gem::Version
26
- version: '3'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: rmtools
29
15
  requirement: !ruby/object:Gem::Requirement
30
16
  requirements:
31
17
  - - ~>
32
18
  - !ruby/object:Gem::Version
33
- version: '2.3'
19
+ version: '2.4'
34
20
  type: :runtime
35
21
  prerelease: false
36
22
  version_requirements: !ruby/object:Gem::Requirement
37
23
  requirements:
38
24
  - - ~>
39
25
  - !ruby/object:Gem::Version
40
- version: '2.3'
26
+ version: '2.4'
41
27
  - !ruby/object:Gem::Dependency
42
28
  name: libxml-ruby
43
29
  requirement: !ruby/object:Gem::Requirement
@@ -52,34 +38,6 @@ dependencies:
52
38
  - - ~>
53
39
  - !ruby/object:Gem::Version
54
40
  version: '2.7'
55
- - !ruby/object:Gem::Dependency
56
- name: bundler
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - ~>
60
- - !ruby/object:Gem::Version
61
- version: '0'
62
- type: :development
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - ~>
67
- - !ruby/object:Gem::Version
68
- version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: rake
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ~>
74
- - !ruby/object:Gem::Version
75
- version: '0'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - ~>
81
- - !ruby/object:Gem::Version
82
- version: '0'
83
41
  description: 'RHACK is Ruby Http ACcess Kit: curl-based web-client framework created
84
42
  for developing web-scrapers/bots.\n\nFeatures:\nAsynchronous, still EventMachine
85
43
  independent\nFast as on simple queries as on high load\n3 levels of flexible configuration\nWeb-client