rhack 1.2.1 → 1.2.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (47) hide show
  1. checksums.yaml +13 -5
  2. data/README.md +21 -9
  3. data/ext/curb/curb.c +977 -977
  4. data/ext/curb/curb.h +52 -52
  5. data/ext/curb/curb_config.h +270 -270
  6. data/ext/curb/curb_easy.c +3437 -3434
  7. data/ext/curb/curb_easy.h +94 -94
  8. data/ext/curb/curb_errors.c +647 -647
  9. data/ext/curb/curb_errors.h +129 -129
  10. data/ext/curb/curb_macros.h +162 -162
  11. data/ext/curb/curb_multi.c +704 -702
  12. data/ext/curb/curb_multi.h +26 -26
  13. data/ext/curb/curb_postfield.c +523 -523
  14. data/ext/curb/curb_postfield.h +40 -40
  15. data/ext/curb/curb_upload.c +80 -80
  16. data/ext/curb/curb_upload.h +30 -30
  17. data/ext/curb-original/curb.c +977 -977
  18. data/ext/curb-original/curb.h +52 -52
  19. data/ext/curb-original/curb_config.h +238 -238
  20. data/ext/curb-original/curb_easy.c +3404 -3404
  21. data/ext/curb-original/curb_easy.h +90 -90
  22. data/ext/curb-original/curb_errors.c +647 -647
  23. data/ext/curb-original/curb_errors.h +129 -129
  24. data/ext/curb-original/curb_macros.h +159 -159
  25. data/ext/curb-original/curb_multi.c +633 -633
  26. data/ext/curb-original/curb_multi.h +26 -26
  27. data/ext/curb-original/curb_postfield.c +523 -523
  28. data/ext/curb-original/curb_postfield.h +40 -40
  29. data/ext/curb-original/curb_upload.c +80 -80
  30. data/ext/curb-original/curb_upload.h +30 -30
  31. data/lib/rhack/clients/base.rb +61 -10
  32. data/lib/rhack/clients/oauth.rb +4 -4
  33. data/lib/rhack/curl/easy.rb +1 -0
  34. data/lib/rhack/curl/global.rb +2 -0
  35. data/lib/rhack/curl/response.rb +4 -2
  36. data/lib/rhack/frame.rb +70 -32
  37. data/lib/rhack/js/browser/env.js +697 -697
  38. data/lib/rhack/js/browser/jquery.js +7180 -7180
  39. data/lib/rhack/js/browser/xmlsax.js +1564 -1564
  40. data/lib/rhack/js/browser/xmlw3cdom_1.js +1443 -1443
  41. data/lib/rhack/js/browser/xmlw3cdom_2.js +2744 -2744
  42. data/lib/rhack/page.rb +227 -68
  43. data/lib/rhack/scout.rb +52 -26
  44. data/lib/rhack/scout_squad.rb +10 -2
  45. data/lib/rhack/version.rb +1 -1
  46. data/rhack.gemspec +1 -1
  47. metadata +17 -17
data/lib/rhack/page.rb CHANGED
@@ -33,89 +33,173 @@ module RHACK
33
33
  # for debug, just enable L#debug, don't write tons of chaotic log-lines
34
34
  __init__
35
35
  attr_writer :title
36
- attr_reader :html, :loc, :hash, :doc, :js, :curl_res, :failed
36
+ attr_reader :body, :loc, :data, :doc, :js, :curl, :curl_res, :failed
37
+ alias :hash :data # DEPRECATED
38
+ alias :html :body # DEPRECATED
39
+
37
40
  # result of page processing been made in frame context
38
41
  attr_accessor :res
39
42
  # for johnson
40
43
  @@ignore = /google|_gat|tracker|adver/i
41
44
 
42
- def initialize(obj='', loc=Hash.new(''), js=Johnson::Runtime.browser||Johnson::Runtime.new)
45
+ # Frame calls it with no args
46
+ def initialize(obj='', loc=Hash.new(''), js=is_a?(HtmlPage)&&(Johnson::Runtime.browser||Johnson::Runtime.new))
43
47
  loc = loc.parse:uri if !loc.is Hash
44
48
  @js = js
45
49
  if obj.is Curl::Easy or obj.kinda Scout
46
50
  c = obj.kinda(Scout) ? obj.http : obj
47
- @html = ''
48
51
  # just (c, loc) would pass to #process opts variable that returns '' on any key
49
52
  process(c, loc.b || {})
50
53
  else
51
- @html = obj
54
+ @body = obj
52
55
  @loc = loc
53
56
  end
54
57
  end
55
58
 
56
59
  def empty?
57
- !(@hash.nil? ? @html : @hash).b
60
+ !@data && !@body.b
61
+ end
62
+
63
+ def size
64
+ if @data.nil?
65
+ (@body || '').size
66
+ elsif @data == false
67
+ 0
68
+ else
69
+ @data.inspect.size
70
+ end
58
71
  end
59
72
 
60
73
  def inspect
61
- if !@hash.nil?
62
- "<##{self.class.name} (#{@hash ? @hash.inspect.size.bytes : 'failed to parse'}) #{@json ? 'json' : 'params hash'}>"
74
+ sz = size
75
+ if !@data.nil?
76
+ "<##{self.class.name} (#{@data == false ? 'failed to parse' : sz.bytes}) #{@json ? 'json' : 'url params'}>"
63
77
  else
64
- "<##{self.class.name} #{@html.b ? "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{@html.size.bytes}" : '(empty'})#{' js enabled' if @js and @doc and @hash.nil?}>"
78
+ "<##{self.class.name} #{sz == 0 ? '(empty)' : "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{sz.bytes})"}#{' js enabled' if @js and @doc}>"
65
79
  end
66
80
  end
67
81
 
68
- def html!(encoding='UTF-8')
69
- @html.force_encoding(encoding)
82
+ def utf!
83
+ @body.utf!
70
84
  end
71
85
 
72
- def url() @loc.href end
86
+ def url
87
+ @loc.href
88
+ end
73
89
  alias :href :url
74
90
 
91
+
92
+ # override this in a subclass
93
+ def failed?(*)
94
+ @curl_res.code != 200
95
+ end
96
+
97
+ # override this in a subclass
98
+ def retry?(*)
99
+ false
100
+ end
101
+
102
+ # override this in a subclass
103
+ # MUST return self if successful
104
+ # MAY return false otherwise
105
+ def parse(opts={})
106
+ if failed?
107
+ failed!
108
+ if opts[:json] or opts[:hash]
109
+ @data = false
110
+ end
111
+ return self
112
+ end
113
+
114
+ if opts[:json]
115
+ parse_json opts
116
+ elsif opts[:hash]
117
+ parse_hash opts
118
+ elsif opts[:xml]
119
+ parse_xml opts
120
+ else
121
+ parse_html opts
122
+ end
123
+
124
+ self
125
+ end
126
+
127
+ private
128
+
129
+ def failed!
130
+ @body = @curl_res.body
131
+ @failed = @curl_res.code
132
+ end
133
+
134
+ def log_failed(action)
135
+ L.debug "Failed #{action} from #{@curl.last_effective_url}, take a look at my @body for info; my object_id is #{object_id}"
136
+ end
137
+
138
+ def parse_xml(*)
139
+ @body = @curl_res.body.xml_to_utf
140
+ to_xml
141
+ rescue StandardError => e
142
+ L.warn "Exception raised during `to_xml': #{e.inspect}"
143
+ log_failed "to parse page as XML"
144
+ failed!
145
+ end
146
+
147
+ def parse_html(opts={})
148
+ @body = @curl_res.body.xml_to_utf
149
+ to_html
150
+ if opts[:eval]
151
+ load_scripts opts[:load_scripts]
152
+ eval_js
153
+ end
154
+ rescue StandardError => e
155
+ L.warn "Exception raised during `to_html': #{e.inspect}"
156
+ log_failed "to parse page as HTML"
157
+ failed!
158
+ end
159
+
160
+ def parse_json(*)
161
+ @json = true
162
+ begin
163
+ @data = @curl_res.body.from_json
164
+ rescue StandardError => e
165
+ L.warn "Exception raised during `from_json': #{e.inspect}"
166
+ end
167
+ if !@data or @data.is String
168
+ log_failed "to get JSON"
169
+ failed!
170
+ @data = false
171
+ end
172
+ end
173
+
174
+ def parse_hash(*)
175
+ if @curl_res.body.inline
176
+ @data = @curl_res.body.to_params
177
+ else
178
+ log_failed "to get url-params hash"
179
+ failed!
180
+ @data = false
181
+ end
182
+ end
183
+
184
+ public
185
+
75
186
  # We can then alternate #process in Page subclasses
76
187
  # Frame doesn't mind about value returned by #process
77
188
  def process(c, opts={})
78
189
  @loc = c.last_effective_url.parse:uri
190
+ @curl = c
79
191
  @curl_res = c.res
80
- L.debug "#{@loc.fullpath} -> #{@curl_res}"
81
- if @curl_res.code == 200
82
- body = @curl_res.body
83
- if opts[:json]
84
- @json = true
85
- @hash = begin; body.from_json
86
- rescue StandardError
87
- false
88
- end
89
- if !@hash or @hash.is String
90
- L.debug "failed to get json from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
91
- @html = body; to_doc
92
- @hash = false
93
- end
94
-
95
- elsif opts[:hash]
96
- if body.inline
97
- @hash = body.to_params
98
- else
99
- @hash = false
100
- L.debug "failed to get params hash from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
101
- @html = body; to_doc
102
- end
103
-
104
- else
105
- @html = body.xml_to_utf
106
- to_doc
107
- if opts[:eval]
108
- load_scripts opts[:load_scripts]
109
- eval_js
110
- end
111
- end
112
- elsif !(opts[:json] or opts[:hash])
113
- @html = @curl_res.body
114
- @failed = @curl_res.code
192
+
193
+ if retry?
194
+ c.retry!
195
+ return # callback will not proceed
115
196
  end
116
- self
197
+
198
+ L.debug "#{@loc.fullpath} -> #{@curl_res}"
199
+ parse(opts)
117
200
  end
118
201
 
202
+
119
203
  def eval_js(frame=nil)
120
204
  eval_string "document.location = window.location = #{@loc.to_json};
121
205
  document.URL = document.baseURI = document.documentURI = location.href;
@@ -149,14 +233,18 @@ module RHACK
149
233
  end
150
234
  end
151
235
 
152
- def to_doc
153
- @doc = @html.to_doc :forceutf
236
+ def to_html
237
+ @doc = @body.to_html
238
+ end
239
+
240
+ def to_xml
241
+ @doc = @body.to_xml
154
242
  end
155
243
 
156
244
  def title(full=true)
157
- if @hash.nil? and !@failed and @html.b
245
+ if @data.nil? and !@failed and @body.b
158
246
  if full
159
- to_doc unless defined? @doc
247
+ to_html unless defined? @doc
160
248
  if @doc.title.b
161
249
  @title = @doc.title
162
250
  else
@@ -257,14 +345,14 @@ module RHACK
257
345
  end
258
346
  end
259
347
 
260
- def __at(xp) (@doc || to_doc).at xp end
348
+ def __at(xp) (@doc || to_html).at xp end
261
349
 
262
- def __find(xp) (@doc || to_doc).find xp end
350
+ def __find(xp) (@doc || to_html).find xp end
263
351
 
264
352
  public
265
353
 
266
354
  def at(selector_or_node, options={})
267
- if selector_or_node and preresult = selector_or_node.is_a?(XML::Node) ?
355
+ if selector_or_node and preresult = selector_or_node.is_a?(LibXML::XML::Node) ?
268
356
  selector_or_node : __at(selector_or_node)
269
357
 
270
358
  preresult = preprocess_search_result(preresult, options[:preprocess])
@@ -277,7 +365,7 @@ module RHACK
277
365
  alias :first :at
278
366
 
279
367
  def find(selector_or_nodes, options={}, &foreach)
280
- preresult = selector_or_nodes.is_a?(XML::XPath::Object, Array) ?
368
+ preresult = selector_or_nodes.is_a?(LibXML::XML::XPath::Object, Array) ?
281
369
  selector_or_nodes : __find(selector_or_nodes)
282
370
 
283
371
  if preresult.size > 0
@@ -349,7 +437,7 @@ module RHACK
349
437
  form = "[action=#{@loc.path.inspect}]" if form == :self
350
438
  if form.is String
351
439
  form_node = at form
352
- raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
440
+ raise LibXML::XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
353
441
  else form_node = form
354
442
  end
355
443
  hash = form_node.inputs_all.merge!(hash)
@@ -376,13 +464,13 @@ module RHACK
376
464
  end
377
465
 
378
466
 
379
- # OLD #
467
+ ### DEPRECATED ###
380
468
 
381
469
  # TODO: make into same form as #get_src and #map
382
470
  def get_srcs(links='img')
383
471
  begin
384
472
  links = find(links).map {|e| e.src} if links.is String
385
- rescue XML::Error
473
+ rescue LibXML::XML::Error
386
474
  links = [links]
387
475
  end
388
476
  links.map {|link| expand_link link}.uniq
@@ -392,7 +480,7 @@ module RHACK
392
480
  #def get_src(link='img')
393
481
  # begin
394
482
  # link = at(link) && at(link).src if link.is String
395
- # rescue XML::Error; nil
483
+ # rescue LibXML::XML::Error; nil
396
484
  # end
397
485
  # expand_link link if link
398
486
  #end
@@ -400,7 +488,7 @@ module RHACK
400
488
  def get_links(links='a')
401
489
  begin
402
490
  links = find(links).map {|e| e.href}.b || find(links+'//a').map {|e| e.href} if links.is String
403
- rescue XML::Error
491
+ rescue LibXML::XML::Error
404
492
  links = [links]
405
493
  end
406
494
  links.map {|link| expand_link link}.uniq
@@ -421,22 +509,93 @@ module RHACK
421
509
  end
422
510
 
423
511
  end
512
+
513
+ ### Pages with specific processing
514
+
515
+ class XmlPage < Page
516
+
517
+ # override this in a subclass
518
+ # MUST return self if successful
519
+ # MAY return false otherwise
520
+ def parse(opts={})
521
+ if failed?
522
+ failed!
523
+ else
524
+ parse_xml opts
525
+ end
526
+ self
527
+ end
528
+
529
+ end
530
+
531
+
532
+ class HtmlPage < Page
533
+
534
+ # override this in a subclass
535
+ # MUST return self if successful
536
+ # MAY return false otherwise
537
+ def parse(opts={})
538
+ if failed?
539
+ failed!
540
+ else
541
+ parse_html opts
542
+ end
543
+ self
544
+ end
545
+
546
+ end
547
+
548
+
549
+ class JsonPage < Page
550
+
551
+ # override this in a subclass
552
+ # MUST return self if successful
553
+ # MAY return false otherwise
554
+ def parse(opts={})
555
+ if failed?
556
+ failed!
557
+ else
558
+ parse_json opts
559
+ end
560
+ self
561
+ end
562
+
563
+ end
564
+
565
+
566
+ class HashPage < Page
567
+
568
+ # override this in a subclass
569
+ # MUST return self if successful
570
+ # MAY return false otherwise
571
+ def parse(opts={})
572
+ if failed?
573
+ failed!
574
+ else
575
+ parse_hash opts
576
+ end
577
+ self
578
+ end
579
+
580
+ end
424
581
 
582
+ ### DEPRECATED ### Use native inheritance and override #retry instead
583
+
425
584
  # using reprocessing of page in case of non-200 response:
426
585
  # page_class = ReloadablePage do
427
586
  # @res and @res.code != 200
428
587
  # end
429
588
  def ReloadablePage(&reload_condition)
430
- rp = Class.new Page
431
- rp.send :define_method, :process do |curl, opts|
432
- super(curl, opts || {})
433
- if curl.instance_eval &reload_condition
434
- curl.retry!
435
- nil # in case of reload_condition.call super's callback will not proceed
436
- else self
589
+ Class.new Page do
590
+ define_method :process do |curl, opts|
591
+ super(curl, opts || {})
592
+ if curl.instance_eval &reload_condition
593
+ curl.retry!
594
+ nil # in case of reload_condition.call super's callback will not proceed
595
+ else self
596
+ end
437
597
  end
438
598
  end
439
- rp
440
599
  end
441
600
 
442
601
  end
data/lib/rhack/scout.rb CHANGED
@@ -46,9 +46,17 @@ module RHACK
46
46
  @timeout = opts[:timeout] || @@timeout || 60
47
47
  @post_proc = @get_proc = @head_proc = @put_proc = @delete_proc = Proc::NULL
48
48
  update uri
49
+
49
50
  @retry = opts[:retry] || {}
50
51
  @retry = {@uri.host => @retry} if @retry.is Array
51
-
52
+ end
53
+
54
+ def setup_curl
55
+ if loaded?
56
+ Curl.carier.remove @http
57
+ end
58
+ @http = Curl::Easy(@webproxy ? @proxy : @root)
59
+ @http.base = self
52
60
  @http.cacert = @@cacert
53
61
  end
54
62
 
@@ -66,8 +74,7 @@ module RHACK
66
74
  if @http
67
75
  @http.url = @webproxy ? @proxy : @root
68
76
  else
69
- @http = Curl::Easy(@webproxy ? @proxy : @root)
70
- @http.base = self
77
+ setup_curl
71
78
  end
72
79
  if @proxy
73
80
  @http.proxy_url = @proxy*':' if !@webproxy
@@ -186,12 +193,23 @@ module RHACK
186
193
  cks.map2 {|k, v| Cookie(k, v)}
187
194
  end
188
195
 
189
- def retry?(err)
190
- # exc = ['0chan.ru', '2-ch.ru', 'www.nomer.org', 'nomer.org'].select_in('http://www.nomer.org') = ['www.nomer.org', 'nomer.org']
191
- exc = (@@retry.keys + @retry.keys).select_in @root
192
- return false if !exc.b
193
- # ['www.nomer.org', 'nomer.org'].every {|www| 'TimeoutError'.in({'nomer.org' => 'TimeoutError'}[www])} ?
194
- exc.no? {|e| err[0].self_name.in((@@retry[e] || []) + @retry[e])}
196
+ def retry?(eclass)
197
+ # sites = ['0chan.ru', '2-ch.ru', 'www.nomer.org', 'nomer.org'].select_in('http://www.nomer.org') = ['www.nomer.org', 'nomer.org']
198
+ sites = (@@retry.keys + @retry.keys).select_in @root
199
+ return false if sites.empty?
200
+ errname = eclass.self_name
201
+ # retry = ['www.nomer.org', 'nomer.org'].any? {|www| {'nomer.org' => ['TimeoutError']}[www].include? 'TimeoutError'}
202
+ sites.any? {|site|
203
+ (@@retry[site] || []).include? errname or
204
+ (@retry[site] || []).include? errname
205
+ }
206
+ end
207
+
208
+ def retry!(path=@__path, headers=@__headers, not_redir=@__not_redir, relvl=@__relvl, callback=@__callback)
209
+ # all external params including post_body are still set
210
+ setup_curl # @http reload here
211
+ # and now we can set @http.on_complete back again
212
+ load(path, headers, not_redir, relvl, &callback)
195
213
  end
196
214
 
197
215
  def loaded?
@@ -209,10 +227,24 @@ module RHACK
209
227
  end
210
228
  rescue RuntimeError => e
211
229
  e.message << ". Failed to load allready loaded? easy handler: Bad file descriptor" unless Curl::Err::CurlError === e
212
- raise e
230
+ L.warn "#{e.inspect}: #{e.message}"
231
+ if loaded?
232
+ Curl.carier.remove @http
233
+ end
234
+ sleep 1
235
+ load!
236
+ #e.message << ". Failed to load allready loaded? easy handler: Bad file descriptor" unless Curl::Err::CurlError === e
237
+ #raise e
213
238
  end
214
239
 
215
240
  def load(path=@path, headers={}, not_redir=1, relvl=10, &callback)
241
+ # cache preprocessed data for one time for we can do #retry
242
+ @__path = path
243
+ @__headers = headers
244
+ @__not_redir = not_redir
245
+ @__relvl = relvl
246
+ @__callback = callback
247
+
216
248
  @http.path = path = fix(path)
217
249
  @http.headers = mkHeader(path).merge!(headers)
218
250
  @http.timeout = @timeout
@@ -233,24 +265,18 @@ module RHACK
233
265
  end
234
266
  }
235
267
  @http.on_failure {|c, e|
268
+ eclass = e[0]
236
269
  @error = e
237
- if e[0] == Curl::Err::CurlOK
238
- # в сорцах on_failure не вызывается по коду 0, это какой-то глюк
239
- # в любом случае такой поворот не означает ошибки
240
- L.warn "Got Curl::Err::CurlOK, response was: #{c.res}"
241
- load!
270
+ c.outdate!
271
+ # we must clean @http.on_complete, otherwise
272
+ # it would run right after this function and with broken data
273
+ @http.on_complete &Proc::NULL
274
+ if retry? eclass
275
+ L.debug "#{eclass} -> reloading scout"
276
+ retry!
242
277
  else
243
- c.outdate!
244
- if retry? e
245
- L.debug "#{e[0]} -> reloading scout"
246
- #load uri, headers, not_redir, relvl, &callback
247
- load! # all params including post_body are still set
248
- # DO they include `on_complete'?
249
- else
250
- @http.on_complete &Proc::NULL
251
- L.debug "#{e[0]} -> not reloading scout"
252
- raise *e if @raise_err
253
- end
278
+ L.debug "#{eclass} -> not reloading scout"
279
+ raise *e if @raise_err
254
280
  end
255
281
  } if !@http.on_failure
256
282
 
@@ -15,12 +15,16 @@ module RHACK
15
15
  if args[0].is Scout
16
16
  s = args[0]
17
17
  else
18
- if !args[0].is String
19
- args.unshift ''
18
+ unless args[0].is String
20
19
  if (opts = args[-1]).is Hash and (opts[:cp] || opts[:ck]).is Hash
21
20
  L.warn "it's useless to setup cookies for untargeted squad!"
22
21
  end
23
22
  end
23
+ if !args[0]
24
+ args[0] = ''
25
+ elsif !args[0].is String
26
+ args.unshift ''
27
+ end
24
28
  if args[1] and args[1][0].is Array
25
29
  proxies = args[1]
26
30
  args[1] = proxies.shift
@@ -43,7 +47,9 @@ module RHACK
43
47
  end
44
48
 
45
49
  def wait_for_available
50
+ #L.debug {"Curl.carier_thread = #{Curl.carier_thread}; Thread.current = #{Thread.current}"}
46
51
  Curl.execute :unless_already
52
+ #L.debug {"Curl.carier_thread = #{Curl.carier_thread}; Thread.current = #{Thread.current}"}
47
53
  # Carier.requests освобождаются ещё до колбека,
48
54
  # но колбеки выполняются последовательно,
49
55
  # поэтому здесь мы можем усыплять тред,
@@ -59,6 +65,7 @@ module RHACK
59
65
  raise PickError if !b
60
66
  # to_a because Array#reject returns object of this class
61
67
  if scout = to_a.rand_by_available?
68
+ L.debug {"randomly picked an available scout##{scout.object_id}"}
62
69
  scout
63
70
  else
64
71
  wait_for_available
@@ -69,6 +76,7 @@ module RHACK
69
76
  def next
70
77
  raise PickError if !b
71
78
  if scout = to_a.find_available?
79
+ L.debug {"picked the next available scout##{scout.object_id}"}
72
80
  scout
73
81
  else
74
82
  wait_for_available
data/lib/rhack/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module RHACK
2
- VERSION = '1.2.1'
2
+ VERSION = '1.2.7'
3
3
  end
data/rhack.gemspec CHANGED
@@ -18,7 +18,7 @@ Gem::Specification.new do |spec|
18
18
 
19
19
  spec.add_runtime_dependency "activesupport"
20
20
  #spec.add_runtime_dependency "redis"
21
- spec.add_runtime_dependency "rmtools", ">= 2.3.0"
21
+ spec.add_runtime_dependency "rmtools", ">= 2.3.6"
22
22
  spec.add_runtime_dependency "libxml-ruby"
23
23
 
24
24
  spec.extensions << 'ext/curb/extconf.rb'