rhack 1.2.1 → 1.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +13 -5
  2. data/README.md +21 -9
  3. data/ext/curb/curb.c +977 -977
  4. data/ext/curb/curb.h +52 -52
  5. data/ext/curb/curb_config.h +270 -270
  6. data/ext/curb/curb_easy.c +3437 -3434
  7. data/ext/curb/curb_easy.h +94 -94
  8. data/ext/curb/curb_errors.c +647 -647
  9. data/ext/curb/curb_errors.h +129 -129
  10. data/ext/curb/curb_macros.h +162 -162
  11. data/ext/curb/curb_multi.c +704 -702
  12. data/ext/curb/curb_multi.h +26 -26
  13. data/ext/curb/curb_postfield.c +523 -523
  14. data/ext/curb/curb_postfield.h +40 -40
  15. data/ext/curb/curb_upload.c +80 -80
  16. data/ext/curb/curb_upload.h +30 -30
  17. data/ext/curb-original/curb.c +977 -977
  18. data/ext/curb-original/curb.h +52 -52
  19. data/ext/curb-original/curb_config.h +238 -238
  20. data/ext/curb-original/curb_easy.c +3404 -3404
  21. data/ext/curb-original/curb_easy.h +90 -90
  22. data/ext/curb-original/curb_errors.c +647 -647
  23. data/ext/curb-original/curb_errors.h +129 -129
  24. data/ext/curb-original/curb_macros.h +159 -159
  25. data/ext/curb-original/curb_multi.c +633 -633
  26. data/ext/curb-original/curb_multi.h +26 -26
  27. data/ext/curb-original/curb_postfield.c +523 -523
  28. data/ext/curb-original/curb_postfield.h +40 -40
  29. data/ext/curb-original/curb_upload.c +80 -80
  30. data/ext/curb-original/curb_upload.h +30 -30
  31. data/lib/rhack/clients/base.rb +61 -10
  32. data/lib/rhack/clients/oauth.rb +4 -4
  33. data/lib/rhack/curl/easy.rb +1 -0
  34. data/lib/rhack/curl/global.rb +2 -0
  35. data/lib/rhack/curl/response.rb +4 -2
  36. data/lib/rhack/frame.rb +70 -32
  37. data/lib/rhack/js/browser/env.js +697 -697
  38. data/lib/rhack/js/browser/jquery.js +7180 -7180
  39. data/lib/rhack/js/browser/xmlsax.js +1564 -1564
  40. data/lib/rhack/js/browser/xmlw3cdom_1.js +1443 -1443
  41. data/lib/rhack/js/browser/xmlw3cdom_2.js +2744 -2744
  42. data/lib/rhack/page.rb +227 -68
  43. data/lib/rhack/scout.rb +52 -26
  44. data/lib/rhack/scout_squad.rb +10 -2
  45. data/lib/rhack/version.rb +1 -1
  46. data/rhack.gemspec +1 -1
  47. metadata +17 -17
data/lib/rhack/page.rb CHANGED
@@ -33,89 +33,173 @@ module RHACK
33
33
  # for debug, just enable L#debug, don't write tons of chaotic log-lines
34
34
  __init__
35
35
  attr_writer :title
36
- attr_reader :html, :loc, :hash, :doc, :js, :curl_res, :failed
36
+ attr_reader :body, :loc, :data, :doc, :js, :curl, :curl_res, :failed
37
+ alias :hash :data # DEPRECATED
38
+ alias :html :body # DEPRECATED
39
+
37
40
  # result of page processing been made in frame context
38
41
  attr_accessor :res
39
42
  # for johnson
40
43
  @@ignore = /google|_gat|tracker|adver/i
41
44
 
42
- def initialize(obj='', loc=Hash.new(''), js=Johnson::Runtime.browser||Johnson::Runtime.new)
45
+ # Frame calls it with no args
46
+ def initialize(obj='', loc=Hash.new(''), js=is_a?(HtmlPage)&&(Johnson::Runtime.browser||Johnson::Runtime.new))
43
47
  loc = loc.parse:uri if !loc.is Hash
44
48
  @js = js
45
49
  if obj.is Curl::Easy or obj.kinda Scout
46
50
  c = obj.kinda(Scout) ? obj.http : obj
47
- @html = ''
48
51
  # just (c, loc) would pass to #process opts variable that returns '' on any key
49
52
  process(c, loc.b || {})
50
53
  else
51
- @html = obj
54
+ @body = obj
52
55
  @loc = loc
53
56
  end
54
57
  end
55
58
 
56
59
  def empty?
57
- !(@hash.nil? ? @html : @hash).b
60
+ !@data && !@body.b
61
+ end
62
+
63
+ def size
64
+ if @data.nil?
65
+ (@body || '').size
66
+ elsif @data == false
67
+ 0
68
+ else
69
+ @data.inspect.size
70
+ end
58
71
  end
59
72
 
60
73
  def inspect
61
- if !@hash.nil?
62
- "<##{self.class.name} (#{@hash ? @hash.inspect.size.bytes : 'failed to parse'}) #{@json ? 'json' : 'params hash'}>"
74
+ sz = size
75
+ if !@data.nil?
76
+ "<##{self.class.name} (#{@data == false ? 'failed to parse' : sz.bytes}) #{@json ? 'json' : 'url params'}>"
63
77
  else
64
- "<##{self.class.name} #{@html.b ? "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{@html.size.bytes}" : '(empty'})#{' js enabled' if @js and @doc and @hash.nil?}>"
78
+ "<##{self.class.name} #{sz == 0 ? '(empty)' : "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{sz.bytes})"}#{' js enabled' if @js and @doc}>"
65
79
  end
66
80
  end
67
81
 
68
- def html!(encoding='UTF-8')
69
- @html.force_encoding(encoding)
82
+ def utf!
83
+ @body.utf!
70
84
  end
71
85
 
72
- def url() @loc.href end
86
+ def url
87
+ @loc.href
88
+ end
73
89
  alias :href :url
74
90
 
91
+
92
+ # override this in a subclass
93
+ def failed?(*)
94
+ @curl_res.code != 200
95
+ end
96
+
97
+ # override this in a subclass
98
+ def retry?(*)
99
+ false
100
+ end
101
+
102
+ # override this in a subclass
103
+ # MUST return self if successful
104
+ # MAY return false otherwise
105
+ def parse(opts={})
106
+ if failed?
107
+ failed!
108
+ if opts[:json] or opts[:hash]
109
+ @data = false
110
+ end
111
+ return self
112
+ end
113
+
114
+ if opts[:json]
115
+ parse_json opts
116
+ elsif opts[:hash]
117
+ parse_hash opts
118
+ elsif opts[:xml]
119
+ parse_xml opts
120
+ else
121
+ parse_html opts
122
+ end
123
+
124
+ self
125
+ end
126
+
127
+ private
128
+
129
+ def failed!
130
+ @body = @curl_res.body
131
+ @failed = @curl_res.code
132
+ end
133
+
134
+ def log_failed(action)
135
+ L.debug "Failed #{action} from #{@curl.last_effective_url}, take a look at my @body for info; my object_id is #{object_id}"
136
+ end
137
+
138
+ def parse_xml(*)
139
+ @body = @curl_res.body.xml_to_utf
140
+ to_xml
141
+ rescue StandardError => e
142
+ L.warn "Exception raised during `to_xml': #{e.inspect}"
143
+ log_failed "to parse page as XML"
144
+ failed!
145
+ end
146
+
147
+ def parse_html(opts={})
148
+ @body = @curl_res.body.xml_to_utf
149
+ to_html
150
+ if opts[:eval]
151
+ load_scripts opts[:load_scripts]
152
+ eval_js
153
+ end
154
+ rescue StandardError => e
155
+ L.warn "Exception raised during `to_html': #{e.inspect}"
156
+ log_failed "to parse page as HTML"
157
+ failed!
158
+ end
159
+
160
+ def parse_json(*)
161
+ @json = true
162
+ begin
163
+ @data = @curl_res.body.from_json
164
+ rescue StandardError => e
165
+ L.warn "Exception raised during `from_json': #{e.inspect}"
166
+ end
167
+ if !@data or @data.is String
168
+ log_failed "to get JSON"
169
+ failed!
170
+ @data = false
171
+ end
172
+ end
173
+
174
+ def parse_hash(*)
175
+ if @curl_res.body.inline
176
+ @data = @curl_res.body.to_params
177
+ else
178
+ log_failed "to get url-params hash"
179
+ failed!
180
+ @data = false
181
+ end
182
+ end
183
+
184
+ public
185
+
75
186
  # We can then alternate #process in Page subclasses
76
187
  # Frame doesn't mind about value returned by #process
77
188
  def process(c, opts={})
78
189
  @loc = c.last_effective_url.parse:uri
190
+ @curl = c
79
191
  @curl_res = c.res
80
- L.debug "#{@loc.fullpath} -> #{@curl_res}"
81
- if @curl_res.code == 200
82
- body = @curl_res.body
83
- if opts[:json]
84
- @json = true
85
- @hash = begin; body.from_json
86
- rescue StandardError
87
- false
88
- end
89
- if !@hash or @hash.is String
90
- L.debug "failed to get json from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
91
- @html = body; to_doc
92
- @hash = false
93
- end
94
-
95
- elsif opts[:hash]
96
- if body.inline
97
- @hash = body.to_params
98
- else
99
- @hash = false
100
- L.debug "failed to get params hash from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
101
- @html = body; to_doc
102
- end
103
-
104
- else
105
- @html = body.xml_to_utf
106
- to_doc
107
- if opts[:eval]
108
- load_scripts opts[:load_scripts]
109
- eval_js
110
- end
111
- end
112
- elsif !(opts[:json] or opts[:hash])
113
- @html = @curl_res.body
114
- @failed = @curl_res.code
192
+
193
+ if retry?
194
+ c.retry!
195
+ return # callback will not proceed
115
196
  end
116
- self
197
+
198
+ L.debug "#{@loc.fullpath} -> #{@curl_res}"
199
+ parse(opts)
117
200
  end
118
201
 
202
+
119
203
  def eval_js(frame=nil)
120
204
  eval_string "document.location = window.location = #{@loc.to_json};
121
205
  document.URL = document.baseURI = document.documentURI = location.href;
@@ -149,14 +233,18 @@ module RHACK
149
233
  end
150
234
  end
151
235
 
152
- def to_doc
153
- @doc = @html.to_doc :forceutf
236
+ def to_html
237
+ @doc = @body.to_html
238
+ end
239
+
240
+ def to_xml
241
+ @doc = @body.to_xml
154
242
  end
155
243
 
156
244
  def title(full=true)
157
- if @hash.nil? and !@failed and @html.b
245
+ if @data.nil? and !@failed and @body.b
158
246
  if full
159
- to_doc unless defined? @doc
247
+ to_html unless defined? @doc
160
248
  if @doc.title.b
161
249
  @title = @doc.title
162
250
  else
@@ -257,14 +345,14 @@ module RHACK
257
345
  end
258
346
  end
259
347
 
260
- def __at(xp) (@doc || to_doc).at xp end
348
+ def __at(xp) (@doc || to_html).at xp end
261
349
 
262
- def __find(xp) (@doc || to_doc).find xp end
350
+ def __find(xp) (@doc || to_html).find xp end
263
351
 
264
352
  public
265
353
 
266
354
  def at(selector_or_node, options={})
267
- if selector_or_node and preresult = selector_or_node.is_a?(XML::Node) ?
355
+ if selector_or_node and preresult = selector_or_node.is_a?(LibXML::XML::Node) ?
268
356
  selector_or_node : __at(selector_or_node)
269
357
 
270
358
  preresult = preprocess_search_result(preresult, options[:preprocess])
@@ -277,7 +365,7 @@ module RHACK
277
365
  alias :first :at
278
366
 
279
367
  def find(selector_or_nodes, options={}, &foreach)
280
- preresult = selector_or_nodes.is_a?(XML::XPath::Object, Array) ?
368
+ preresult = selector_or_nodes.is_a?(LibXML::XML::XPath::Object, Array) ?
281
369
  selector_or_nodes : __find(selector_or_nodes)
282
370
 
283
371
  if preresult.size > 0
@@ -349,7 +437,7 @@ module RHACK
349
437
  form = "[action=#{@loc.path.inspect}]" if form == :self
350
438
  if form.is String
351
439
  form_node = at form
352
- raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
440
+ raise LibXML::XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
353
441
  else form_node = form
354
442
  end
355
443
  hash = form_node.inputs_all.merge!(hash)
@@ -376,13 +464,13 @@ module RHACK
376
464
  end
377
465
 
378
466
 
379
- # OLD #
467
+ ### DEPRECATED ###
380
468
 
381
469
  # TODO: make into same form as #get_src and #map
382
470
  def get_srcs(links='img')
383
471
  begin
384
472
  links = find(links).map {|e| e.src} if links.is String
385
- rescue XML::Error
473
+ rescue LibXML::XML::Error
386
474
  links = [links]
387
475
  end
388
476
  links.map {|link| expand_link link}.uniq
@@ -392,7 +480,7 @@ module RHACK
392
480
  #def get_src(link='img')
393
481
  # begin
394
482
  # link = at(link) && at(link).src if link.is String
395
- # rescue XML::Error; nil
483
+ # rescue LibXML::XML::Error; nil
396
484
  # end
397
485
  # expand_link link if link
398
486
  #end
@@ -400,7 +488,7 @@ module RHACK
400
488
  def get_links(links='a')
401
489
  begin
402
490
  links = find(links).map {|e| e.href}.b || find(links+'//a').map {|e| e.href} if links.is String
403
- rescue XML::Error
491
+ rescue LibXML::XML::Error
404
492
  links = [links]
405
493
  end
406
494
  links.map {|link| expand_link link}.uniq
@@ -421,22 +509,93 @@ module RHACK
421
509
  end
422
510
 
423
511
  end
512
+
513
+ ### Pages with specific processing
514
+
515
+ class XmlPage < Page
516
+
517
+ # override this in a subclass
518
+ # MUST return self if successful
519
+ # MAY return false otherwise
520
+ def parse(opts={})
521
+ if failed?
522
+ failed!
523
+ else
524
+ parse_xml opts
525
+ end
526
+ self
527
+ end
528
+
529
+ end
530
+
531
+
532
+ class HtmlPage < Page
533
+
534
+ # override this in a subclass
535
+ # MUST return self if successful
536
+ # MAY return false otherwise
537
+ def parse(opts={})
538
+ if failed?
539
+ failed!
540
+ else
541
+ parse_html opts
542
+ end
543
+ self
544
+ end
545
+
546
+ end
547
+
548
+
549
+ class JsonPage < Page
550
+
551
+ # override this in a subclass
552
+ # MUST return self if successful
553
+ # MAY return false otherwise
554
+ def parse(opts={})
555
+ if failed?
556
+ failed!
557
+ else
558
+ parse_json opts
559
+ end
560
+ self
561
+ end
562
+
563
+ end
564
+
565
+
566
+ class HashPage < Page
567
+
568
+ # override this in a subclass
569
+ # MUST return self if successful
570
+ # MAY return false otherwise
571
+ def parse(opts={})
572
+ if failed?
573
+ failed!
574
+ else
575
+ parse_hash opts
576
+ end
577
+ self
578
+ end
579
+
580
+ end
424
581
 
582
+ ### DEPRECATED ### Use native inheritance and override #retry instead
583
+
425
584
  # using reprocessing of page in case of non-200 response:
426
585
  # page_class = ReloadablePage do
427
586
  # @res and @res.code != 200
428
587
  # end
429
588
  def ReloadablePage(&reload_condition)
430
- rp = Class.new Page
431
- rp.send :define_method, :process do |curl, opts|
432
- super(curl, opts || {})
433
- if curl.instance_eval &reload_condition
434
- curl.retry!
435
- nil # in case of reload_condition.call super's callback will not proceed
436
- else self
589
+ Class.new Page do
590
+ define_method :process do |curl, opts|
591
+ super(curl, opts || {})
592
+ if curl.instance_eval &reload_condition
593
+ curl.retry!
594
+ nil # in case of reload_condition.call super's callback will not proceed
595
+ else self
596
+ end
437
597
  end
438
598
  end
439
- rp
440
599
  end
441
600
 
442
601
  end
data/lib/rhack/scout.rb CHANGED
@@ -46,9 +46,17 @@ module RHACK
46
46
  @timeout = opts[:timeout] || @@timeout || 60
47
47
  @post_proc = @get_proc = @head_proc = @put_proc = @delete_proc = Proc::NULL
48
48
  update uri
49
+
49
50
  @retry = opts[:retry] || {}
50
51
  @retry = {@uri.host => @retry} if @retry.is Array
51
-
52
+ end
53
+
54
+ def setup_curl
55
+ if loaded?
56
+ Curl.carier.remove @http
57
+ end
58
+ @http = Curl::Easy(@webproxy ? @proxy : @root)
59
+ @http.base = self
52
60
  @http.cacert = @@cacert
53
61
  end
54
62
 
@@ -66,8 +74,7 @@ module RHACK
66
74
  if @http
67
75
  @http.url = @webproxy ? @proxy : @root
68
76
  else
69
- @http = Curl::Easy(@webproxy ? @proxy : @root)
70
- @http.base = self
77
+ setup_curl
71
78
  end
72
79
  if @proxy
73
80
  @http.proxy_url = @proxy*':' if !@webproxy
@@ -186,12 +193,23 @@ module RHACK
186
193
  cks.map2 {|k, v| Cookie(k, v)}
187
194
  end
188
195
 
189
- def retry?(err)
190
- # exc = ['0chan.ru', '2-ch.ru', 'www.nomer.org', 'nomer.org'].select_in('http://www.nomer.org') = ['www.nomer.org', 'nomer.org']
191
- exc = (@@retry.keys + @retry.keys).select_in @root
192
- return false if !exc.b
193
- # ['www.nomer.org', 'nomer.org'].every {|www| 'TimeoutError'.in({'nomer.org' => 'TimeoutError'}[www])} ?
194
- exc.no? {|e| err[0].self_name.in((@@retry[e] || []) + @retry[e])}
196
+ def retry?(eclass)
197
+ # sites = ['0chan.ru', '2-ch.ru', 'www.nomer.org', 'nomer.org'].select_in('http://www.nomer.org') = ['www.nomer.org', 'nomer.org']
198
+ sites = (@@retry.keys + @retry.keys).select_in @root
199
+ return false if sites.empty?
200
+ errname = eclass.self_name
201
+ # retry = ['www.nomer.org', 'nomer.org'].any? {|www| {'nomer.org' => ['TimeoutError']}[www].include? 'TimeoutError'}
202
+ sites.any? {|site|
203
+ (@@retry[site] || []).include? errname or
204
+ (@retry[site] || []).include? errname
205
+ }
206
+ end
207
+
208
+ def retry!(path=@__path, headers=@__headers, not_redir=@__not_redir, relvl=@__relvl, callback=@__callback)
209
+ # all external params including post_body are still set
210
+ setup_curl # @http reload here
211
+ # and now we can set @http.on_complete back again
212
+ load(path, headers, not_redir, relvl, &callback)
195
213
  end
196
214
 
197
215
  def loaded?
@@ -209,10 +227,24 @@ module RHACK
209
227
  end
210
228
  rescue RuntimeError => e
211
229
  e.message << ". Failed to load allready loaded? easy handler: Bad file descriptor" unless Curl::Err::CurlError === e
212
- raise e
230
+ L.warn "#{e.inspect}: #{e.message}"
231
+ if loaded?
232
+ Curl.carier.remove @http
233
+ end
234
+ sleep 1
235
+ load!
236
+ #e.message << ". Failed to load allready loaded? easy handler: Bad file descriptor" unless Curl::Err::CurlError === e
237
+ #raise e
213
238
  end
214
239
 
215
240
  def load(path=@path, headers={}, not_redir=1, relvl=10, &callback)
241
+ # cache preprocessed data for one time for we can do #retry
242
+ @__path = path
243
+ @__headers = headers
244
+ @__not_redir = not_redir
245
+ @__relvl = relvl
246
+ @__callback = callback
247
+
216
248
  @http.path = path = fix(path)
217
249
  @http.headers = mkHeader(path).merge!(headers)
218
250
  @http.timeout = @timeout
@@ -233,24 +265,18 @@ module RHACK
233
265
  end
234
266
  }
235
267
  @http.on_failure {|c, e|
268
+ eclass = e[0]
236
269
  @error = e
237
- if e[0] == Curl::Err::CurlOK
238
- # в сорцах on_failure не вызывается по коду 0, это какой-то глюк
239
- # в любом случае такой поворот не означает ошибки
240
- L.warn "Got Curl::Err::CurlOK, response was: #{c.res}"
241
- load!
270
+ c.outdate!
271
+ # we must clean @http.on_complete, otherwise
272
+ # it would run right after this function and with broken data
273
+ @http.on_complete &Proc::NULL
274
+ if retry? eclass
275
+ L.debug "#{eclass} -> reloading scout"
276
+ retry!
242
277
  else
243
- c.outdate!
244
- if retry? e
245
- L.debug "#{e[0]} -> reloading scout"
246
- #load uri, headers, not_redir, relvl, &callback
247
- load! # all params including post_body are still set
248
- # DO they include `on_complete'?
249
- else
250
- @http.on_complete &Proc::NULL
251
- L.debug "#{e[0]} -> not reloading scout"
252
- raise *e if @raise_err
253
- end
278
+ L.debug "#{eclass} -> not reloading scout"
279
+ raise *e if @raise_err
254
280
  end
255
281
  } if !@http.on_failure
256
282
 
@@ -15,12 +15,16 @@ module RHACK
15
15
  if args[0].is Scout
16
16
  s = args[0]
17
17
  else
18
- if !args[0].is String
19
- args.unshift ''
18
+ unless args[0].is String
20
19
  if (opts = args[-1]).is Hash and (opts[:cp] || opts[:ck]).is Hash
21
20
  L.warn "it's useless to setup cookies for untargeted squad!"
22
21
  end
23
22
  end
23
+ if !args[0]
24
+ args[0] = ''
25
+ elsif !args[0].is String
26
+ args.unshift ''
27
+ end
24
28
  if args[1] and args[1][0].is Array
25
29
  proxies = args[1]
26
30
  args[1] = proxies.shift
@@ -43,7 +47,9 @@ module RHACK
43
47
  end
44
48
 
45
49
  def wait_for_available
50
+ #L.debug {"Curl.carier_thread = #{Curl.carier_thread}; Thread.current = #{Thread.current}"}
46
51
  Curl.execute :unless_already
52
+ #L.debug {"Curl.carier_thread = #{Curl.carier_thread}; Thread.current = #{Thread.current}"}
47
53
  # Carier.requests освобождаются ещё до колбека,
48
54
  # но колбеки выполняются последовательно,
49
55
  # поэтому здесь мы можем усыплять тред,
@@ -59,6 +65,7 @@ module RHACK
59
65
  raise PickError if !b
60
66
  # to_a because Array#reject returns object of this class
61
67
  if scout = to_a.rand_by_available?
68
+ L.debug {"randomly picked an available scout##{scout.object_id}"}
62
69
  scout
63
70
  else
64
71
  wait_for_available
@@ -69,6 +76,7 @@ module RHACK
69
76
  def next
70
77
  raise PickError if !b
71
78
  if scout = to_a.find_available?
79
+ L.debug {"picked the next available scout##{scout.object_id}"}
72
80
  scout
73
81
  else
74
82
  wait_for_available
data/lib/rhack/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module RHACK
2
- VERSION = '1.2.1'
2
+ VERSION = '1.2.7'
3
3
  end
data/rhack.gemspec CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
18
18
 
19
19
  spec.add_runtime_dependency "activesupport"
20
20
  #spec.add_runtime_dependency "redis"
21
- spec.add_runtime_dependency "rmtools", ">= 2.3.0"
21
+ spec.add_runtime_dependency "rmtools", ">= 2.3.6"
22
22
  spec.add_runtime_dependency "libxml-ruby"
23
23
 
24
24
  spec.extensions << 'ext/curb/extconf.rb'