rhack 0.3.3 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,9 +1,10 @@
1
1
  # encoding: utf-8
2
2
  module Curl
3
3
 
4
- def execute
5
- # если ты ищешь, откуда оно запускается, то это наверное
6
- # lib/extensions/johnson.rb:37:in `set_browser_for_curl'
4
+ def execute(unless_allready=false)
5
+ if unless_allready and Curl.status
6
+ return L.debug "Non-nil status! Avoid executing"
7
+ end
7
8
  if $CarierThread and s = $CarierThread.status
8
9
  L.debug "Carier thread allready started and has status #{s}"
9
10
  else
@@ -33,13 +34,16 @@ module Curl
33
34
  } unless error
34
35
  error
35
36
  }
37
+ # until the main thread has slept a bit, $CarierThread will have status "run",
38
+ # no matter whether it's idling or performing requests
39
+ sleep 0.001
36
40
  end
37
41
  end
38
42
  alias :run :execute
39
43
  module_function :execute, :run
40
44
 
41
45
  def wait
42
- if $CarierThread
46
+ if $CarierThread and $CarierThread.status
43
47
  if !(within = Thread.current == $CarierThread)
44
48
  # We can't set the `perform' timeout lower than 1 second in the curl binding
45
49
  # because in that case thread status would always be "run"
@@ -84,7 +88,9 @@ module Curl
84
88
  end
85
89
  end
86
90
  else
87
- L.debug "No thread to wait"
91
+ L < "No thread to wait. I guess I should create one"
92
+ execute
93
+ wait
88
94
  end
89
95
  end
90
96
  module_function :wait
@@ -21,11 +21,12 @@ module HTTPAccessKit
21
21
 
22
22
  class Frame
23
23
  __init__
24
- attr_reader :loc, :static, :ss, :opts
24
+ attr_reader :loc, :static, :ss, :opts, :use_cache, :write_to
25
+ @@cache = {}
25
26
 
26
27
  def initialize *args
27
28
  args << 10 unless args[-1].is Fixnum
28
- args[-2] = {} unless args[-2].is Hash
29
+ args.insert -2, {} unless args[-2].is Hash
29
30
  @opts = {:eval => Johnson::Enabled, :redir => true, :cp => true, :result => Page}.merge!(args[-2].kinda(Hash) ? args[-2] : {})
30
31
  args[-2] = @opts
31
32
  if args[0].is String
@@ -39,8 +40,7 @@ module HTTPAccessKit
39
40
  @static = false
40
41
  end
41
42
  @ss = ScoutSquad *args
42
- @pages = []
43
- Curl.run unless Curl.status
43
+ Curl.run :unless_allready
44
44
  end
45
45
 
46
46
  def retarget to, forced=nil
@@ -48,23 +48,40 @@ module HTTPAccessKit
48
48
  @ss.update to, forced
49
49
  @loc = to.parse:uri
50
50
  end
51
-
52
- def target=to
53
- retarget to
54
- end
51
+ alias :target= :retarget
55
52
 
56
53
  def next() @ss.next end
57
54
  def rand() @ss.rand end
58
55
  def each(&block) @ss.each &block end
59
56
  def [](i) @ss[i] end
60
57
 
58
+ def copy_cookies! i=0
59
+ @ss.each {|s| s.cookies.replace @ss[i].cookies}
60
+ end
61
+
62
+ def use_cache! opts={}
63
+ if opts == false
64
+ @use_cache = false
65
+ else
66
+ @@cache = opts[:pages].kinda(Hash) ? opts[:pages] : opts[:pages].map_hash {|p| [p.href, p]} if opts[:pages]
67
+ #@write_to = opts[:write_to] if :write_to.in opts
68
+ @use_cache = true
69
+ end
70
+ end
71
+
72
+ def drop_cache! use=nil
73
+ @@cache.clear
74
+ GC.start
75
+ @use_cache = use if use.in [true, false]
76
+ end
77
+
61
78
  def inspect
62
79
  "<#Frame @ #{@ss.untargeted ? 'no target' : @loc.root}: #{'scout'.x @ss.size}#{', static' if @static}, cookies #{@ss[0].cookieProc ? 'on' : 'off'}>"
63
80
  end
64
81
 
65
82
  # opts are :eval, :json, :hash, :wait, :proc_result, :save_result, :load_scripts,
66
83
  # :zip, :thread_safe, :result, :stream, :raw, :xhr + any opts for Scouts in one hash
67
- def get *args, &callback
84
+ def exec *args, &callback
68
85
  many, order, orders, with_opts = interpret_request *args
69
86
  L.log({:many => many, :order => order, :orders => orders, :with_opts => with_opts})
70
87
 
@@ -79,7 +96,7 @@ module HTTPAccessKit
79
96
  if many then exec_many orders, with_opts, &callback
80
97
  else exec_one order, with_opts, &callback end
81
98
  end
82
- alias :exec :get
99
+ alias :get :exec
83
100
  alias :run :get
84
101
 
85
102
  def interpret_request(*args)
@@ -431,28 +448,46 @@ module HTTPAccessKit
431
448
  uris.map! {|u| validate u}
432
449
  end
433
450
 
434
- def exec_one(order, opts)
451
+ def run_callbacks!(page, opts, &callback)
452
+ if callback
453
+ yres = callback.call page
454
+ if opts[:save_result] or :proc_result.in opts
455
+ page.res = yres
456
+ end
457
+ if opts[:proc_result].is Proc and yres != :skip
458
+ opts[:proc_result].call yres
459
+ end
460
+ end
461
+ end
462
+
463
+ # TODO: find out why/how IO in callbacks breaks the +curl.res.body+ content and how to fix or avoid it
464
+ def exec_one(order, opts, &callback)
465
+ if @use_cache and order[0] == :loadGet and page = @@cache[order[1]]
466
+ run_callbacks! page, opts, &callback
467
+ res = opts[:wait] && (opts[:save_result] or :proc_result.in opts) ? page.res : page
468
+ return res
469
+ end
435
470
  # must result in Page (default) or its subclass
436
471
  page = opts[:result].new
437
- # if no spare scouts can be found, squad simply waits for all callbacks to complete
472
+ # if no spare scouts can be found, the squad simply waits for the first callbacks to complete
438
473
  s = @ss.next
439
- #s.raise_err = true# Зачем это тут? Можно добавлять :raise=>1 фрейму при запиле
440
474
  s.send(*(order << opts)) {|curl|
475
+ # there is a problem with storing html on disk
476
+ if order[0] == :loadGet and @write_to
477
+ # sometimes (about 2% of the time when downloading in 100 threads), when this line is called,
478
+ # no matter what +curl.res.body+ has contained here
479
+ RMTools.rw @write_to+'/'+order[-2].sub(/^[a-z]+:\/\//, ''), curl.res.body.xml_to_utf
480
+ end
441
481
  if opts[:raw]
442
482
  yield curl
443
- elsif page.process(curl, opts) and block_given?
444
- yres = yield page
445
- if opts[:save_result] or :proc_result.in opts
446
- page.res = yres
447
- end
448
- if opts[:proc_result].is Proc and yres != :skip
449
- opts[:proc_result].call yres
450
- end
483
+ # here +curl.res.body+ becomes empty
484
+ elsif page.process(curl, opts)
485
+ @@cache[page.href] = page if order[0] == :loadGet and @use_cache
486
+ run_callbacks! page, opts, &callback
451
487
  end
452
488
  }
453
489
  if opts[:wait]
454
490
  opts[:thread_safe] ? $Carier.perform : Curl.wait
455
- # почему бы не уменьшить бойлерплейт в сервисах и не возвращать res сразу?
456
491
  (opts[:save_result] or :proc_result.in opts) ? page.res : page
457
492
  else page
458
493
  end
@@ -530,39 +565,49 @@ module HTTPAccessKit
530
565
  # for debug, just enable L#debug, don't write tons of chaotic log-lines
531
566
  __init__
532
567
  # res here is result of page processing made in frame context
533
- attr_accessor :title, :res
534
- attr_reader :html, :loc, :hash, :doc, :js
568
+ attr_writer :title
569
+ attr_reader :html, :loc, :hash, :doc, :js, :curl_res, :failed
570
+ attr_accessor :res
535
571
  @@ignore = /google|_gat|tracker|adver/i
536
572
 
537
573
  def initialize(obj='', loc=Hash.new(''), js=$JSRuntime||Johnson::Runtime.new)
538
574
  loc = loc.parse:uri if !loc.is Hash
539
575
  @js = js
540
576
  if obj.is Curl::Easy or obj.kinda Scout
541
- c = obj.kinda(Scout) ? obj.http : html
577
+ c = obj.kinda(Scout) ? obj.http : obj
542
578
  @html = ''
543
579
  # passing just (c, loc) would give #process an opts variable that returns '' for any key
544
- process(c, loc.b || {})
580
+ process(c, loc.b || {})
545
581
  else
546
582
  @html = obj
547
583
  @loc = loc
548
584
  end
549
585
  end
586
+
587
+ def empty?
588
+ !(@hash.nil? ? @html : @hash).b
589
+ end
550
590
 
551
591
  def inspect
552
592
  if !@hash.nil?
553
593
  "<#FramePage (#{@hash ? @hash.inspect.size.bytes : 'failed to parse'}) #{@json ? 'json' : 'params hash'}>"
554
594
  else
555
- "<#FramePage #{@html.b ? "«#{@title}» (#{@html.size.bytes}" : '(empty'})#{' js enabled' if @js and @doc and @hash.nil?}>"
595
+ "<#FramePage #{@html.b ? "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{@html.size.bytes}" : '(empty'})#{' js enabled' if @js and @doc and @hash.nil?}>"
556
596
  end
557
597
  end
558
598
 
599
+ def html!(encoding='UTF-8')
600
+ @html.force_encoding(encoding)
601
+ end
602
+
559
603
  # We can then override #process in Page subclasses
560
604
  # Frame doesn't care about the value returned by #process
561
605
  def process(c, opts={})
562
606
  @loc = c.last_effective_url.parse:uri
563
- L.debug "#{@loc.fullpath} -> #{c.res}"
564
- if c.res.code == 200
565
- body = c.res.body
607
+ @curl_res = c.res
608
+ L.debug "#{@loc.fullpath} -> #{@curl_res}"
609
+ if @curl_res.code == 200
610
+ body = @curl_res.body
566
611
  if opts[:json]
567
612
  @json = true
568
613
  @hash = begin; body.from_json
@@ -577,7 +622,7 @@ module HTTPAccessKit
577
622
 
578
623
  elsif opts[:hash]
579
624
  if body.inline
580
- @hash = body.to_hash
625
+ @hash = body.to_params
581
626
  else
582
627
  @hash = false
583
628
  L.debug "failed to get params hash from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
@@ -585,12 +630,16 @@ module HTTPAccessKit
585
630
  end
586
631
 
587
632
  else
588
- @html = body; to_doc
633
+ @html = body.xml_to_utf
634
+ to_doc
589
635
  if opts[:eval]
590
636
  load_scripts opts[:load_scripts]
591
637
  eval_js
592
638
  end
593
639
  end
640
+ elsif !(opts[:json] or opts[:hash])
641
+ @html = @curl_res.body
642
+ @failed = @curl_res.code
594
643
  end
595
644
  self
596
645
  end
@@ -630,17 +679,32 @@ module HTTPAccessKit
630
679
 
631
680
  def to_doc
632
681
  @doc = @html.to_doc :forceutf
633
- if !(@title = @doc.title.b)
634
- @title = @loc.href
635
- @doc.at('head').prepend XML::Node('title', @title) if @doc.at('head')
636
- else
637
- if @title.cyr? and UTF2ANSI[@title].size > 40
638
- @title = ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/]]+'…'
639
- elsif @title.size > 40
640
- @title = @title[/.{1,30}\S*/]+'…'
682
+ end
683
+
684
+ def title(full=true)
685
+ if @hash.nil? and !@failed and @html.b
686
+ if full
687
+ to_doc unless defined? @doc
688
+ if @doc.title.b
689
+ @title = @doc.title
690
+ else
691
+ @title = @loc.href
692
+ @doc.at('head').prepend XML::Node('title', @title) if @doc.at('head')
693
+ @title
694
+ end
695
+ else
696
+ title true unless defined? @title
697
+ if RUBY_VERSION < '1.9' and @title.cyr? and UTF2ANSI[@title].size > 40
698
+ @short_title = ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/][0..38]]+'…'
699
+ elsif @title.size > 40
700
+ @short_title = @title[/.{1,30}\S*/][0..38]+'…'
701
+ else
702
+ @short_title = @title
703
+ end
641
704
  end
705
+ else
706
+ @loc.href
642
707
  end
643
- @doc
644
708
  end
645
709
 
646
710
  def find(xp) (@doc || to_doc).find xp end
@@ -717,7 +781,7 @@ module HTTPAccessKit
717
781
  end
718
782
 
719
783
  def submit(form, frame, hash={}, opts={}, &callback)
720
- (opts[:header] ||= {}).Referer ||= @loc.href if @loc
784
+ (opts[:headers] ||= {}).Referer ||= @loc.href if @loc
721
785
  query = form(form, hash, opts)
722
786
 
723
787
  curr_target, new_target = frame.loc.href, (query[2] || query[0])
@@ -1,19 +1,19 @@
1
- ua file: /path/to/ua/list
2
- cache:
3
- dir: /path/to/cache/dir
4
- table: hack_cache
5
- clean: 30.days
6
- logger:
7
- :out: /path/to/rmlogger/logfile
8
- scout retry:
9
- example.com:
10
- - TimeoutError
11
- db:
12
- reconnect: true
13
- encoding: utf8
14
- username: root
15
- adapter: mysql
16
- database: dbname
17
- pool: 5
18
- password:
19
- socket: /var/run/mysqld/mysqld.sock
1
+ #ua file: db/useragents.txt
2
+ #logger:
3
+ # :out: log/rhack.log
4
+ #scout retry: {host => [Curl::Error subclass, ], }
5
+ # example.com:
6
+ # - TimeoutError
7
+ #db: # defaults to the file at RAILS_PATH/config/RAILS_ENV.yml
8
+ # reconnect: true
9
+ # encoding: utf8
10
+ # username: root
11
+ # adapter: mysql
12
+ # database: dbname
13
+ # pool: 5
14
+ # password:
15
+ # socket: /var/run/mysqld/mysqld.sock
16
+ #cache: # deprecated
17
+ # dir: /path/to/cache/dir
18
+ # table: hack_cache
19
+ # clean: 30.days
@@ -13,6 +13,17 @@ module HTTPAccessKit
13
13
  # first argument should be a string so that frame won't be static
14
14
  @f = frame || Frame(self.class::URI[service] || self.class::URI[:login], *args)
15
15
  end
16
+
17
+ # Usable only for sync requests
18
+ def login(*)
19
+ Curl.run
20
+ @f[0].cookies.clear
21
+ json, wait, @f.opts[:json], @f.opts[:wait] = @f.opts[:json], @f.opts[:wait], false, true
22
+ yield @f.get(self.class::URI[:login])
23
+ @f.get(self.class::URI[:home]) if self.class::URI[:home]
24
+ @f.opts[:json], @f.opts[:wait] = json, wait
25
+ @f.copy_cookies!
26
+ end
16
27
 
17
28
  def go(*args, &block)
18
29
  __send__(@service, *args, &block)
@@ -307,13 +318,9 @@ module HTTPAccessKit
307
318
  end
308
319
 
309
320
  def login params={'email'=>'fshm@bk.ru', 'pass'=>'Riddick2', 'expire'=>nil}
310
- Curl.run
311
- @f[0].cookies.clear
312
- @f.get(URI[:login], :json=>nil) {|login_page|
313
- login_page.submit('form', @f, params, :json=>nil) {|redirection|
314
- redirection.submit('form', @f, {}, :json=>nil) {|logged|
315
- @f.each {|s| s.cookies.replace @f[0].cookies}
316
- }}}
321
+ super {|login_page|
322
+ login_page.submit('form', @f, params).submit('form', @f, {})
323
+ }
317
324
  end
318
325
 
319
326
  def get_links h, pagenum, &block
@@ -114,7 +114,7 @@ module HTTPAccessKit
114
114
  }
115
115
  @name, @value = ck[0].split('=', 2)
116
116
  #@value.gsub!(/^['"]|['"]$/, '')
117
- L.debug args if !@domain
117
+ #L.debug args if !@domain
118
118
  (scout.cookies[scout.uri.host] ||= {})[@name] = self
119
119
  else
120
120
  @name, cookie = args[0]
@@ -337,7 +337,7 @@ module HTTPAccessKit
337
337
  # exc = ['0chan.ru', '2-ch.ru', 'www.nomer.org', 'nomer.org'].select_in('http://www.nomer.org') = ['www.nomer.org', 'nomer.org']
338
338
  exc = (@@retry.keys + @retry.keys).select_in @root
339
339
  return false if !exc.b
340
- # ['www.nomer.org', 'nomer.org'].every |www| 'TimeoutError'.in({'nomer.org' => 'TimeoutError'}[www])} ?
340
+ # ['www.nomer.org', 'nomer.org'].every {|www| 'TimeoutError'.in({'nomer.org' => 'TimeoutError'}[www])} ?
341
341
  exc.no? {|e| err[0].self_name.in((@@retry[e] || []) + @retry[e])}
342
342
  end
343
343
 
@@ -346,7 +346,6 @@ module HTTPAccessKit
346
346
  end
347
347
 
348
348
  def load!
349
- $log <= [$Carier, @http]
350
349
  unless $Carier.add @http
351
350
  $Carier.remove @http
352
351
  $Carier.add @http
@@ -427,9 +426,7 @@ module HTTPAccessKit
427
426
  end
428
427
  emu = lambda {
429
428
  @headers = ''
430
- $log << @headers
431
429
  @http.on_header {|h|
432
- $log << @headers
433
430
  @headers << h
434
431
  h == "\r\n" ? 0 : h.size
435
432
  }
@@ -439,7 +436,6 @@ module HTTPAccessKit
439
436
  if emulate != :always
440
437
  load(uri, headers) {|c|
441
438
  if !@error and c.res.code != 200 and emulate == :if_retry
442
- $log << @headers
443
439
  emu.call
444
440
  else
445
441
  callback[c]
@@ -499,7 +495,8 @@ module HTTPAccessKit
499
495
  if scout = to_a.rand {|_|!_.loaded?}; scout
500
496
  else # Curl should run here, otherwise `next'/`rand'-recursion will cause stack overflow
501
497
  raise "Curl must run in order to use ScoutSquad#rand" if !Curl.status
502
- Curl.wait
498
+ #Curl.wait
499
+ loop {sleep 1; break if $Carier.reqs.size < size}
503
500
  self.rand
504
501
  end
505
502
  end
@@ -509,7 +506,8 @@ module HTTPAccessKit
509
506
  if scout = find {|_|!_.loaded?}; scout
510
507
  else # Curl should run here, otherwise `next'/`rand'-recursion will cause stack overflow
511
508
  raise "Curl must run in order to use ScoutSquad#next" if !Curl.status
512
- Curl.wait
509
+ #Curl.wait
510
+ loop {sleep 1; break if $Carier.reqs.size < size}
513
511
  self.next
514
512
  end
515
513
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rhack
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.4.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-08-07 00:00:00.000000000 Z
12
+ date: 2013-01-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rmtools
@@ -18,7 +18,7 @@ dependencies:
18
18
  requirements:
19
19
  - - ! '>='
20
20
  - !ruby/object:Gem::Version
21
- version: 1.0.0
21
+ version: 1.2.12
22
22
  type: :runtime
23
23
  prerelease: false
24
24
  version_requirements: !ruby/object:Gem::Requirement
@@ -26,7 +26,7 @@ dependencies:
26
26
  requirements:
27
27
  - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
- version: 1.0.0
29
+ version: 1.2.12
30
30
  - !ruby/object:Gem::Dependency
31
31
  name: rake
32
32
  requirement: !ruby/object:Gem::Requirement
@@ -105,20 +105,28 @@ extra_rdoc_files:
105
105
  files:
106
106
  - ext/curb/curb_errors.c
107
107
  - ext/curb/curb_errors.h
108
+ - ext/curb/curb_errors.o
108
109
  - ext/curb/Makefile
109
110
  - ext/curb/curb_macros.h
110
111
  - ext/curb/curb_multi.c
111
112
  - ext/curb/curb_multi.h
113
+ - ext/curb/curb_multi.o
112
114
  - ext/curb/curb_upload.c
113
115
  - ext/curb/curb_upload.h
116
+ - ext/curb/curb_upload.o
114
117
  - ext/curb/curb_config.h
115
118
  - ext/curb/extconf.rb
116
119
  - ext/curb/curb.c
117
120
  - ext/curb/curb.h
121
+ - ext/curb/curb.o
122
+ - ext/curb/curb_core.so
118
123
  - ext/curb/curb_easy.c
119
124
  - ext/curb/curb_easy.h
125
+ - ext/curb/curb_easy.o
126
+ - ext/curb/mkmf.log
120
127
  - ext/curb/curb_postfield.c
121
128
  - ext/curb/curb_postfield.h
129
+ - ext/curb/curb_postfield.o
122
130
  - ext/curb-original/curb_errors.c
123
131
  - ext/curb-original/curb_errors.h
124
132
  - ext/curb-original/curb_macros.h
@@ -165,7 +173,7 @@ files:
165
173
  - ./Gemfile
166
174
  - ./History.txt
167
175
  - .gemtest
168
- homepage: http://2ch.ru
176
+ homepage: https://github.com/tinbka/rhack
169
177
  licenses: []
170
178
  post_install_message:
171
179
  rdoc_options:
@@ -181,7 +189,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
181
189
  version: '0'
182
190
  segments:
183
191
  - 0
184
- hash: -702291569
192
+ hash: -84270947
185
193
  required_rubygems_version: !ruby/object:Gem::Requirement
186
194
  none: false
187
195
  requirements: