rhack 0.3.3 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,10 @@
1
1
  # encoding: utf-8
2
2
  module Curl
3
3
 
4
- def execute
5
- # если ты ищешь, откуда оно запускается, то это наверное
6
- # lib/extensions/johnson.rb:37:in `set_browser_for_curl'
4
+ def execute(unless_allready=false)
5
+ if unless_allready and Curl.status
6
+ return L.debug "Non-nil status! Avoid executing"
7
+ end
7
8
  if $CarierThread and s = $CarierThread.status
8
9
  L.debug "Carier thread allready started and has status #{s}"
9
10
  else
@@ -33,13 +34,16 @@ module Curl
33
34
  } unless error
34
35
  error
35
36
  }
37
+ # until the main thread has slept a bit, $CarierThread will have status "run",
38
+ # no matter whether it's idling or performing requests
39
+ sleep 0.001
36
40
  end
37
41
  end
38
42
  alias :run :execute
39
43
  module_function :execute, :run
40
44
 
41
45
  def wait
42
- if $CarierThread
46
+ if $CarierThread and $CarierThread.status
43
47
  if !(within = Thread.current == $CarierThread)
44
48
  # We can't set `perform' timeout lesser than 1 second in the curl binding
45
49
  # because in that case thread status would always be "run"
@@ -84,7 +88,9 @@ module Curl
84
88
  end
85
89
  end
86
90
  else
87
- L.debug "No thread to wait"
91
+ L < "No thread to wait. I guess I should create one"
92
+ execute
93
+ wait
88
94
  end
89
95
  end
90
96
  module_function :wait
@@ -21,11 +21,12 @@ module HTTPAccessKit
21
21
 
22
22
  class Frame
23
23
  __init__
24
- attr_reader :loc, :static, :ss, :opts
24
+ attr_reader :loc, :static, :ss, :opts, :use_cache, :write_to
25
+ @@cache = {}
25
26
 
26
27
  def initialize *args
27
28
  args << 10 unless args[-1].is Fixnum
28
- args[-2] = {} unless args[-2].is Hash
29
+ args.insert -2, {} unless args[-2].is Hash
29
30
  @opts = {:eval => Johnson::Enabled, :redir => true, :cp => true, :result => Page}.merge!(args[-2].kinda(Hash) ? args[-2] : {})
30
31
  args[-2] = @opts
31
32
  if args[0].is String
@@ -39,8 +40,7 @@ module HTTPAccessKit
39
40
  @static = false
40
41
  end
41
42
  @ss = ScoutSquad *args
42
- @pages = []
43
- Curl.run unless Curl.status
43
+ Curl.run :unless_allready
44
44
  end
45
45
 
46
46
  def retarget to, forced=nil
@@ -48,23 +48,40 @@ module HTTPAccessKit
48
48
  @ss.update to, forced
49
49
  @loc = to.parse:uri
50
50
  end
51
-
52
- def target=to
53
- retarget to
54
- end
51
+ alias :target= :retarget
55
52
 
56
53
  def next() @ss.next end
57
54
  def rand() @ss.rand end
58
55
  def each(&block) @ss.each &block end
59
56
  def [](i) @ss[i] end
60
57
 
58
+ def copy_cookies! i=0
59
+ @ss.each {|s| s.cookies.replace @ss[i].cookies}
60
+ end
61
+
62
+ def use_cache! opts={}
63
+ if opts == false
64
+ @use_cache = false
65
+ else
66
+ @@cache = opts[:pages].kinda(Hash) ? opts[:pages] : opts[:pages].map_hash {|p| [p.href, p]} if opts[:pages]
67
+ #@write_to = opts[:write_to] if :write_to.in opts
68
+ @use_cache = true
69
+ end
70
+ end
71
+
72
+ def drop_cache! use=nil
73
+ @@cache.clear
74
+ GC.start
75
+ @use_cache = use if use.in [true, false]
76
+ end
77
+
61
78
  def inspect
62
79
  "<#Frame @ #{@ss.untargeted ? 'no target' : @loc.root}: #{'scout'.x @ss.size}#{', static' if @static}, cookies #{@ss[0].cookieProc ? 'on' : 'off'}>"
63
80
  end
64
81
 
65
82
  # opts are :eval, :json, :hash, :wait, :proc_result, :save_result, :load_scripts,
66
83
  # :zip, :thread_safe, :result, :stream, :raw, :xhr + any opts for Scouts in one hash
67
- def get *args, &callback
84
+ def exec *args, &callback
68
85
  many, order, orders, with_opts = interpret_request *args
69
86
  L.log({:many => many, :order => order, :orders => orders, :with_opts => with_opts})
70
87
 
@@ -79,7 +96,7 @@ module HTTPAccessKit
79
96
  if many then exec_many orders, with_opts, &callback
80
97
  else exec_one order, with_opts, &callback end
81
98
  end
82
- alias :exec :get
99
+ alias :get :exec
83
100
  alias :run :get
84
101
 
85
102
  def interpret_request(*args)
@@ -431,28 +448,46 @@ module HTTPAccessKit
431
448
  uris.map! {|u| validate u}
432
449
  end
433
450
 
434
- def exec_one(order, opts)
451
+ def run_callbacks!(page, opts, &callback)
452
+ if callback
453
+ yres = callback.call page
454
+ if opts[:save_result] or :proc_result.in opts
455
+ page.res = yres
456
+ end
457
+ if opts[:proc_result].is Proc and yres != :skip
458
+ opts[:proc_result].call yres
459
+ end
460
+ end
461
+ end
462
+
463
+ # TODO: find out why/how IO in callbacks breaks the +curl.res.body+ content, and how to fix or avoid it
464
+ def exec_one(order, opts, &callback)
465
+ if @use_cache and order[0] == :loadGet and page = @@cache[order[1]]
466
+ run_callbacks! page, opts, &callback
467
+ res = opts[:wait] && (opts[:save_result] or :proc_result.in opts) ? page.res : page
468
+ return res
469
+ end
435
470
  # must result in Page (default) or it's subclass
436
471
  page = opts[:result].new
437
- # if no spare scouts can be found, squad simply waits for all callbacks to complete
472
+ # if no spare scout can be found, the squad simply waits for the first callbacks to complete
438
473
  s = @ss.next
439
- #s.raise_err = true# Зачем это тут? Можно добавлять :raise=>1 фрейму при запиле
440
474
  s.send(*(order << opts)) {|curl|
475
+ # there is a problem with storing html on disk
476
+ if order[0] == :loadGet and @write_to
477
+ # sometimes (about 2% of the time with 100 download threads), when this line runs,
478
+ # no matter what +curl.res.body+ has contained here
479
+ RMTools.rw @write_to+'/'+order[-2].sub(/^[a-z]+:\/\//, ''), curl.res.body.xml_to_utf
480
+ end
441
481
  if opts[:raw]
442
482
  yield curl
443
- elsif page.process(curl, opts) and block_given?
444
- yres = yield page
445
- if opts[:save_result] or :proc_result.in opts
446
- page.res = yres
447
- end
448
- if opts[:proc_result].is Proc and yres != :skip
449
- opts[:proc_result].call yres
450
- end
483
+ # here +curl.res.body+ becomes empty
484
+ elsif page.process(curl, opts)
485
+ @@cache[page.href] = page if order[0] == :loadGet and @use_cache
486
+ run_callbacks! page, opts, &callback
451
487
  end
452
488
  }
453
489
  if opts[:wait]
454
490
  opts[:thread_safe] ? $Carier.perform : Curl.wait
455
- # почему бы не уменьшить бойлерплейт в сервисах и не возвращать res сразу?
456
491
  (opts[:save_result] or :proc_result.in opts) ? page.res : page
457
492
  else page
458
493
  end
@@ -530,39 +565,49 @@ module HTTPAccessKit
530
565
  # for debug, just enable L#debug, don't write tons of chaotic log-lines
531
566
  __init__
532
567
  # res here is result of page processing made in frame context
533
- attr_accessor :title, :res
534
- attr_reader :html, :loc, :hash, :doc, :js
568
+ attr_writer :title
569
+ attr_reader :html, :loc, :hash, :doc, :js, :curl_res, :failed
570
+ attr_accessor :res
535
571
  @@ignore = /google|_gat|tracker|adver/i
536
572
 
537
573
  def initialize(obj='', loc=Hash.new(''), js=$JSRuntime||Johnson::Runtime.new)
538
574
  loc = loc.parse:uri if !loc.is Hash
539
575
  @js = js
540
576
  if obj.is Curl::Easy or obj.kinda Scout
541
- c = obj.kinda(Scout) ? obj.http : html
577
+ c = obj.kinda(Scout) ? obj.http : obj
542
578
  @html = ''
543
579
  # just (c, loc) would pass to #process opts variable that returns '' on any key
544
- process(c, loc.b || {})
580
+ process(c, loc.b || {})
545
581
  else
546
582
  @html = obj
547
583
  @loc = loc
548
584
  end
549
585
  end
586
+
587
+ def empty?
588
+ !(@hash.nil? ? @html : @hash).b
589
+ end
550
590
 
551
591
  def inspect
552
592
  if !@hash.nil?
553
593
  "<#FramePage (#{@hash ? @hash.inspect.size.bytes : 'failed to parse'}) #{@json ? 'json' : 'params hash'}>"
554
594
  else
555
- "<#FramePage #{@html.b ? "«#{@title}» (#{@html.size.bytes}" : '(empty'})#{' js enabled' if @js and @doc and @hash.nil?}>"
595
+ "<#FramePage #{@html.b ? "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{@html.size.bytes}" : '(empty'})#{' js enabled' if @js and @doc and @hash.nil?}>"
556
596
  end
557
597
  end
558
598
 
599
+ def html!(encoding='UTF-8')
600
+ @html.force_encoding(encoding)
601
+ end
602
+
559
603
  # We can then alternate #process in Page subclasses
560
604
  # Frame doesn't mind about value returned by #process
561
605
  def process(c, opts={})
562
606
  @loc = c.last_effective_url.parse:uri
563
- L.debug "#{@loc.fullpath} -> #{c.res}"
564
- if c.res.code == 200
565
- body = c.res.body
607
+ @curl_res = c.res
608
+ L.debug "#{@loc.fullpath} -> #{@curl_res}"
609
+ if @curl_res.code == 200
610
+ body = @curl_res.body
566
611
  if opts[:json]
567
612
  @json = true
568
613
  @hash = begin; body.from_json
@@ -577,7 +622,7 @@ module HTTPAccessKit
577
622
 
578
623
  elsif opts[:hash]
579
624
  if body.inline
580
- @hash = body.to_hash
625
+ @hash = body.to_params
581
626
  else
582
627
  @hash = false
583
628
  L.debug "failed to get params hash from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
@@ -585,12 +630,16 @@ module HTTPAccessKit
585
630
  end
586
631
 
587
632
  else
588
- @html = body; to_doc
633
+ @html = body.xml_to_utf
634
+ to_doc
589
635
  if opts[:eval]
590
636
  load_scripts opts[:load_scripts]
591
637
  eval_js
592
638
  end
593
639
  end
640
+ elsif !(opts[:json] or opts[:hash])
641
+ @html = @curl_res.body
642
+ @failed = @curl_res.code
594
643
  end
595
644
  self
596
645
  end
@@ -630,17 +679,32 @@ module HTTPAccessKit
630
679
 
631
680
  def to_doc
632
681
  @doc = @html.to_doc :forceutf
633
- if !(@title = @doc.title.b)
634
- @title = @loc.href
635
- @doc.at('head').prepend XML::Node('title', @title) if @doc.at('head')
636
- else
637
- if @title.cyr? and UTF2ANSI[@title].size > 40
638
- @title = ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/]]+'…'
639
- elsif @title.size > 40
640
- @title = @title[/.{1,30}\S*/]+'…'
682
+ end
683
+
684
+ def title(full=true)
685
+ if @hash.nil? and !@failed and @html.b
686
+ if full
687
+ to_doc unless defined? @doc
688
+ if @doc.title.b
689
+ @title = @doc.title
690
+ else
691
+ @title = @loc.href
692
+ @doc.at('head').prepend XML::Node('title', @title) if @doc.at('head')
693
+ @title
694
+ end
695
+ else
696
+ title true unless defined? @title
697
+ if RUBY_VERSION < '1.9' and @title.cyr? and UTF2ANSI[@title].size > 40
698
+ @short_title = ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/][0..38]]+'…'
699
+ elsif @title.size > 40
700
+ @short_title = @title[/.{1,30}\S*/][0..38]+'…'
701
+ else
702
+ @short_title = @title
703
+ end
641
704
  end
705
+ else
706
+ @loc.href
642
707
  end
643
- @doc
644
708
  end
645
709
 
646
710
  def find(xp) (@doc || to_doc).find xp end
@@ -717,7 +781,7 @@ module HTTPAccessKit
717
781
  end
718
782
 
719
783
  def submit(form, frame, hash={}, opts={}, &callback)
720
- (opts[:header] ||= {}).Referer ||= @loc.href if @loc
784
+ (opts[:headers] ||= {}).Referer ||= @loc.href if @loc
721
785
  query = form(form, hash, opts)
722
786
 
723
787
  curr_target, new_target = frame.loc.href, (query[2] || query[0])
@@ -1,19 +1,19 @@
1
- ua file: /path/to/ua/list
2
- cache:
3
- dir: /path/to/cache/dir
4
- table: hack_cache
5
- clean: 30.days
6
- logger:
7
- :out: /path/to/rmlogger/logfile
8
- scout retry:
9
- example.com:
10
- - TimeoutError
11
- db:
12
- reconnect: true
13
- encoding: utf8
14
- username: root
15
- adapter: mysql
16
- database: dbname
17
- pool: 5
18
- password:
19
- socket: /var/run/mysqld/mysqld.sock
1
+ #ua file: db/useragents.txt
2
+ #logger:
3
+ # :out: log/rhack.log
4
+ #scout retry: {host => [Curl::Error subclass, ], }
5
+ # example.com:
6
+ # - TimeoutError
7
+ #db: # defaults file @ to RAILS_PATH/config/RAILS_ENV.yml
8
+ # reconnect: true
9
+ # encoding: utf8
10
+ # username: root
11
+ # adapter: mysql
12
+ # database: dbname
13
+ # pool: 5
14
+ # password:
15
+ # socket: /var/run/mysqld/mysqld.sock
16
+ #cache: # deprecated
17
+ # dir: /path/to/cache/dir
18
+ # table: hack_cache
19
+ # clean: 30.days
@@ -13,6 +13,17 @@ module HTTPAccessKit
13
13
  # first argument should be a string so that frame won't be static
14
14
  @f = frame || Frame(self.class::URI[service] || self.class::URI[:login], *args)
15
15
  end
16
+
17
+ # Usable only for sync requests
18
+ def login(*)
19
+ Curl.run
20
+ @f[0].cookies.clear
21
+ json, wait, @f.opts[:json], @f.opts[:wait] = @f.opts[:json], @f.opts[:wait], false, true
22
+ yield @f.get(self.class::URI[:login])
23
+ @f.get(self.class::URI[:home]) if self.class::URI[:home]
24
+ @f.opts[:json], @f.opts[:wait] = json, wait
25
+ @f.copy_cookies!
26
+ end
16
27
 
17
28
  def go(*args, &block)
18
29
  __send__(@service, *args, &block)
@@ -307,13 +318,9 @@ module HTTPAccessKit
307
318
  end
308
319
 
309
320
  def login params={'email'=>'fshm@bk.ru', 'pass'=>'Riddick2', 'expire'=>nil}
310
- Curl.run
311
- @f[0].cookies.clear
312
- @f.get(URI[:login], :json=>nil) {|login_page|
313
- login_page.submit('form', @f, params, :json=>nil) {|redirection|
314
- redirection.submit('form', @f, {}, :json=>nil) {|logged|
315
- @f.each {|s| s.cookies.replace @f[0].cookies}
316
- }}}
321
+ super {|login_page|
322
+ login_page.submit('form', @f, params).submit('form', @f, {})
323
+ }
317
324
  end
318
325
 
319
326
  def get_links h, pagenum, &block
@@ -114,7 +114,7 @@ module HTTPAccessKit
114
114
  }
115
115
  @name, @value = ck[0].split('=', 2)
116
116
  #@value.gsub!(/^['"]|['"]$/, '')
117
- L.debug args if !@domain
117
+ #L.debug args if !@domain
118
118
  (scout.cookies[scout.uri.host] ||= {})[@name] = self
119
119
  else
120
120
  @name, cookie = args[0]
@@ -337,7 +337,7 @@ module HTTPAccessKit
337
337
  # exc = ['0chan.ru', '2-ch.ru', 'www.nomer.org', 'nomer.org'].select_in('http://www.nomer.org') = ['www.nomer.org', 'nomer.org']
338
338
  exc = (@@retry.keys + @retry.keys).select_in @root
339
339
  return false if !exc.b
340
- # ['www.nomer.org', 'nomer.org'].every |www| 'TimeoutError'.in({'nomer.org' => 'TimeoutError'}[www])} ?
340
+ # ['www.nomer.org', 'nomer.org'].every {|www| 'TimeoutError'.in({'nomer.org' => 'TimeoutError'}[www])} ?
341
341
  exc.no? {|e| err[0].self_name.in((@@retry[e] || []) + @retry[e])}
342
342
  end
343
343
 
@@ -346,7 +346,6 @@ module HTTPAccessKit
346
346
  end
347
347
 
348
348
  def load!
349
- $log <= [$Carier, @http]
350
349
  unless $Carier.add @http
351
350
  $Carier.remove @http
352
351
  $Carier.add @http
@@ -427,9 +426,7 @@ module HTTPAccessKit
427
426
  end
428
427
  emu = lambda {
429
428
  @headers = ''
430
- $log << @headers
431
429
  @http.on_header {|h|
432
- $log << @headers
433
430
  @headers << h
434
431
  h == "\r\n" ? 0 : h.size
435
432
  }
@@ -439,7 +436,6 @@ module HTTPAccessKit
439
436
  if emulate != :always
440
437
  load(uri, headers) {|c|
441
438
  if !@error and c.res.code != 200 and emulate == :if_retry
442
- $log << @headers
443
439
  emu.call
444
440
  else
445
441
  callback[c]
@@ -499,7 +495,8 @@ module HTTPAccessKit
499
495
  if scout = to_a.rand {|_|!_.loaded?}; scout
500
496
  else # Curl should run here, otherwise `next'/`rand'-recursion will cause stack overflow
501
497
  raise "Curl must run in order to use ScoutSquad#rand" if !Curl.status
502
- Curl.wait
498
+ #Curl.wait
499
+ loop {sleep 1; break if $Carier.reqs.size < size}
503
500
  self.rand
504
501
  end
505
502
  end
@@ -509,7 +506,8 @@ module HTTPAccessKit
509
506
  if scout = find {|_|!_.loaded?}; scout
510
507
  else # Curl should run here, otherwise `next'/`rand'-recursion will cause stack overflow
511
508
  raise "Curl must run in order to use ScoutSquad#next" if !Curl.status
512
- Curl.wait
509
+ #Curl.wait
510
+ loop {sleep 1; break if $Carier.reqs.size < size}
513
511
  self.next
514
512
  end
515
513
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rhack
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.4.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-08-07 00:00:00.000000000 Z
12
+ date: 2013-01-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rmtools
@@ -18,7 +18,7 @@ dependencies:
18
18
  requirements:
19
19
  - - ! '>='
20
20
  - !ruby/object:Gem::Version
21
- version: 1.0.0
21
+ version: 1.2.12
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
26
26
  requirements:
27
27
  - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
- version: 1.0.0
29
+ version: 1.2.12
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: rake
32
32
  requirement: !ruby/object:Gem::Requirement
@@ -105,20 +105,28 @@ extra_rdoc_files:
105
105
  files:
106
106
  - ext/curb/curb_errors.c
107
107
  - ext/curb/curb_errors.h
108
+ - ext/curb/curb_errors.o
108
109
  - ext/curb/Makefile
109
110
  - ext/curb/curb_macros.h
110
111
  - ext/curb/curb_multi.c
111
112
  - ext/curb/curb_multi.h
113
+ - ext/curb/curb_multi.o
112
114
  - ext/curb/curb_upload.c
113
115
  - ext/curb/curb_upload.h
116
+ - ext/curb/curb_upload.o
114
117
  - ext/curb/curb_config.h
115
118
  - ext/curb/extconf.rb
116
119
  - ext/curb/curb.c
117
120
  - ext/curb/curb.h
121
+ - ext/curb/curb.o
122
+ - ext/curb/curb_core.so
118
123
  - ext/curb/curb_easy.c
119
124
  - ext/curb/curb_easy.h
125
+ - ext/curb/curb_easy.o
126
+ - ext/curb/mkmf.log
120
127
  - ext/curb/curb_postfield.c
121
128
  - ext/curb/curb_postfield.h
129
+ - ext/curb/curb_postfield.o
122
130
  - ext/curb-original/curb_errors.c
123
131
  - ext/curb-original/curb_errors.h
124
132
  - ext/curb-original/curb_macros.h
@@ -165,7 +173,7 @@ files:
165
173
  - ./Gemfile
166
174
  - ./History.txt
167
175
  - .gemtest
168
- homepage: http://2ch.ru
176
+ homepage: https://github.com/tinbka/rhack
169
177
  licenses: []
170
178
  post_install_message:
171
179
  rdoc_options:
@@ -181,7 +189,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
181
189
  version: '0'
182
190
  segments:
183
191
  - 0
184
- hash: -702291569
192
+ hash: -84270947
185
193
  required_rubygems_version: !ruby/object:Gem::Requirement
186
194
  none: false
187
195
  requirements: