rhack 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. data/.gemtest +0 -0
  2. data/CURB-LICENSE +51 -0
  3. data/Gemfile +4 -0
  4. data/History.txt +4 -0
  5. data/LICENSE +51 -0
  6. data/License.txt +17 -0
  7. data/Manifest.txt +61 -0
  8. data/README.txt +12 -0
  9. data/Rakefile +34 -0
  10. data/ext/curb-original/curb.c +977 -0
  11. data/ext/curb-original/curb.h +52 -0
  12. data/ext/curb-original/curb_config.h +235 -0
  13. data/ext/curb-original/curb_easy.c +3455 -0
  14. data/ext/curb-original/curb_easy.h +90 -0
  15. data/ext/curb-original/curb_errors.c +647 -0
  16. data/ext/curb-original/curb_errors.h +129 -0
  17. data/ext/curb-original/curb_macros.h +159 -0
  18. data/ext/curb-original/curb_multi.c +704 -0
  19. data/ext/curb-original/curb_multi.h +26 -0
  20. data/ext/curb-original/curb_postfield.c +523 -0
  21. data/ext/curb-original/curb_postfield.h +40 -0
  22. data/ext/curb-original/curb_upload.c +80 -0
  23. data/ext/curb-original/curb_upload.h +30 -0
  24. data/ext/curb/Makefile +157 -0
  25. data/ext/curb/curb.c +977 -0
  26. data/ext/curb/curb.h +52 -0
  27. data/ext/curb/curb_config.h +235 -0
  28. data/ext/curb/curb_easy.c +3430 -0
  29. data/ext/curb/curb_easy.h +94 -0
  30. data/ext/curb/curb_errors.c +647 -0
  31. data/ext/curb/curb_errors.h +129 -0
  32. data/ext/curb/curb_macros.h +159 -0
  33. data/ext/curb/curb_multi.c +710 -0
  34. data/ext/curb/curb_multi.h +26 -0
  35. data/ext/curb/curb_postfield.c +523 -0
  36. data/ext/curb/curb_postfield.h +40 -0
  37. data/ext/curb/curb_upload.c +80 -0
  38. data/ext/curb/curb_upload.h +30 -0
  39. data/ext/curb/extconf.rb +399 -0
  40. data/lib/cache.rb +44 -0
  41. data/lib/curl-global.rb +151 -0
  42. data/lib/extensions/browser/env.js +697 -0
  43. data/lib/extensions/browser/jquery.js +7180 -0
  44. data/lib/extensions/browser/xmlsax.js +1564 -0
  45. data/lib/extensions/browser/xmlw3cdom_1.js +1444 -0
  46. data/lib/extensions/browser/xmlw3cdom_2.js +2744 -0
  47. data/lib/extensions/curb.rb +125 -0
  48. data/lib/extensions/declarative.rb +153 -0
  49. data/lib/extensions/johnson.rb +63 -0
  50. data/lib/frame.rb +766 -0
  51. data/lib/init.rb +36 -0
  52. data/lib/rhack.rb +16 -0
  53. data/lib/rhack.yml.template +19 -0
  54. data/lib/rhack/proxy/checker.rb +226 -0
  55. data/lib/rhack/proxy/list.rb +196 -0
  56. data/lib/rhack/services.rb +445 -0
  57. data/lib/rhack_in.rb +2 -0
  58. data/lib/scout.rb +591 -0
  59. data/lib/words.rb +37 -0
  60. data/test/test_frame.rb +107 -0
  61. data/test/test_rhack.rb +5 -0
  62. data/test/test_scout.rb +53 -0
  63. metadata +195 -0
@@ -0,0 +1,125 @@
1
+ # encoding: utf-8
2
+ module Curl
3
+
4
class Easy
  __init__
  attr_accessor :base

  # Builds a Response for this handle through the Response() helper.
  def res
    Response(self)
  end
  alias response res

  # Request that belongs to the current response.
  def req
    res.req
  end
  alias request req

  # Root of the currently configured url.
  def host
    url.parse(:uri).root
  end

  # Re-points the handle to the full path of +href+ under the current host.
  def path=(href)
    root = host
    self.url = root + href.parse(:uri).fullpath
  end

  # Delegates the retry to the object stored in +base+.
  def retry!
    @base.retry!
  end

  # curb changed its accessor interface, so the shortcuts below are
  # modelled on curb/lib/curl/easy.rb.

  # Sets a libcurl option given either as a Symbol (resolved through
  # #sym2curl) or as anything convertible with #to_i.
  def set(opt, val)
    code = opt.is_a?(Symbol) ? sym2curl(opt) : opt.to_i
    setopt(code, val)
  end

  # Maps :some_option to the Curl::CURLOPT_SOME_OPTION constant.
  def sym2curl(opt)
    Curl.const_get("CURLOPT_#{opt.to_s.upcase}")
  end

  def interface=(value)
    set(:interface, value)
  end

  def url=(u)
    set(:url, u)
  end

  def proxy_url=(url)
    set(:proxy, url)
  end

  def userpwd=(value)
    set(:userpwd, value)
  end

  def proxypwd=(value)
    set(:proxyuserpwd, value)
  end

  def follow_location=(onoff)
    set(:followlocation, onoff)
  end

  # The two switches below coerce their argument to a strict boolean.
  def head=(onoff)
    set(:nobody, onoff ? true : false)
  end

  def get=(onoff)
    set(:httpget, onoff ? true : false)
  end

end
76
+
77
class PostField

  # Short "name=value" description of the field, meant for logs.
  #
  # The value part is, in order of precedence:
  # * the inspected content proc,
  # * the first 20 characters of the content (followed by the total size
  #   when the content is longer),
  # * the inspected File for the local file name,
  # * nothing at all when none of the above is set.
  #
  # Raises RuntimeError when the field has no name.
  def to_s
    raise "Cannot convert unnamed field to string" if !name
    display_content = if (cp = content_proc)
      cp.inspect
    elsif (c = content)
      "#{c[0...20].inspect}#{"… (#{c.size.bytes})" if c.size > 20}"
    elsif (ln = local_name)
      # Block form: the handle is closed as soon as it has been inspected,
      # instead of staying open until garbage collection.
      File.open(ln) {|f| f.inspect}
    end
    "#{name}=#{display_content}"
  end

end
92
+
93
class Multi
  # Some curb builds expose the handle list as #requests; make it
  # reachable under the name used by the methods below.
  if method_defined? :requests
    alias :reqs :requests
  end

  # Removes every registered handle (errors from #remove are ignored) and
  # replaces the global $Carier with a fresh pipelining Multi.
  def reset
    reqs.each {|k| remove k rescue()}
    $Carier = Multi.new
    $Carier.pipeline = true
    # GC.start
  end

  # Keeps performing until nothing is running (errors from #perform are
  # ignored), then calls Curl.recall.
  def drop
    while running > 0 do perform rescue() end
    Curl.recall
  end

  # #drop, followed by #reset if anything is still registered or running.
  def drop!
    drop
    reset if reqs.size + running > 0
  end

  # True when at least one transfer is running and the running count does
  # not exceed the number of registered handles.
  def sheduled
    0 < running and running <= reqs.size
  end
  # Correctly spelled name; the historical one is kept for existing callers.
  alias :scheduled :sheduled

  def inspect
    "<#Carier #{'unit'.x reqs.size}, #{running} executing>"
  end

end
124
+
125
+ end
@@ -0,0 +1,153 @@
1
+ # encoding: utf-8
2
+ module ActiveRecord
3
+
4
+ module ConnectionAdapters
5
+ AbstractAdapter
6
+
7
# Table wrapper that makes schema statements conditional: every statement
# first checks whether the column/index is already there and reaches the
# real Table implementation only when it would change something.
# When built with a map, the columns mentioned in the declaration are also
# recorded in @map so that #map! can bring the table in line with them.
class VirtualTable < Table

  # Log line describing whether +meth+ was forwarded and why.
  def debug_str meth, called, exist, *args
    "Table.#{meth}(#{args.inspects*', '}) was#{' NOT' if !called} called due to #{'in' if !exist}existance"
  end

  # Without a leading :all - true if ANY of the named columns exists.
  # With a leading :all  - true only if ALL of them exist.
  # A trailing options hash is ignored for the check.
  def column_exists *args
    column_names = @base.columns(@table_name).names
    options = args.extract_options!
    names = args.dup
    args << options
    # _or_ is the value to return as soon as a column is found whose
    # presence equals it: true in "any" mode, false in "all" mode.
    _or_ = (names[0] == :all) ? !names.shift : true
    names.each {|name| return _or_ if name.to_s.in(column_names) == _or_}
    !_or_
  end

  # Same contract as #column_exists, for indexed columns.
  def index_exists *indexes
    column_indexes = @base.indexes(@table_name).columnss.flatten
    _or_ = (indexes[0] == :all) ? !indexes.shift : true
    indexes.each {|index| return _or_ if index.to_s.in(column_indexes) == _or_}
    !_or_
  end

  # map: true starts an empty column map, an Array is used as the map
  # itself, anything else disables mapping.
  def initialize name, connection, map=nil
    super name, connection
    case map
    when true; @map = []
    when Array; @map = map
    end
  end

  # Drops the existing columns that the map does not mention, forgets the
  # mapped columns that already exist and creates the remaining ones.
  def map!
    map_names = @map.firsts.to_ss
    @base.columns(@table_name).names.each {|name|
      name.in(map_names) ? @map.reject! {|_| _[0] == name} : remove(name)
    }
    # #column appends to @map, so walk a snapshot: iterating @map itself
    # would keep finding the entries added during the loop and never end.
    @map.dup.each {|col| column(*col)}
  end

  def column name, *args
    to_be_called = !column_exists(name)
    super if to_be_called
    $log.debug {debug_str :column, to_be_called, !to_be_called, name, *args}
    @map << [name, *args] if @map
  end

  %w{string text integer float decimal
      datetime timestamp time date binary boolean}.each {|column_type|
    define_method(column_type) {|*args|
      to_be_called = !column_exists(*args)
      # Arguments are spelled out: an argument-less `super` is rejected
      # inside methods created with define_method.
      super(*args) if to_be_called
      $log.debug {debug_str column_type, to_be_called, !to_be_called, *args}
      if @map
        options = args.extract_options!
        args = args.xprod(column_type)
        args = args.xprod(options) if options
        @map.concat args
      end
    } }

  def index name, *args
    to_be_called = !index_exists(name)
    super if to_be_called
    $log.debug {debug_str :index, to_be_called, !to_be_called, name, *args}
  end

  def timestamps
    to_be_called = !column_exists('created_at', 'updated_at')
    super if to_be_called
    $log.debug {debug_str :timestamps, to_be_called, !to_be_called}
    @map.concat [[:created_at, :datetime], [:updated_at, :datetime]] if @map
  end

  # Altering statements make no sense in an idempotent declaration.
  def change *args
    raise NotImplementedError, "don't use #change in declaration!"
  end

  def change_default *args
    raise NotImplementedError, "don't use #change_default in declaration!"
  end

  # Forwarded only while the new name is still free.
  def rename column_name, new_column_name
    to_be_called = !column_exists(new_column_name)
    super if to_be_called
    $log.debug {debug_str :rename, to_be_called, !to_be_called, column_name, new_column_name}
  end

  def references *args
    to_be_called = !column_exists(*args.map {|col| "#{col}_id"})
    super if to_be_called
    $log.debug {debug_str :references, to_be_called, !to_be_called, *args}
  end
  alias :belongs_to :references

  # The remove_* statements are forwarded only when their target exists.
  def remove *args
    to_be_called = column_exists :all, *args
    super if to_be_called
    $log.debug {debug_str :remove, to_be_called, to_be_called, *args}
  end

  def remove_references *args
    to_be_called = column_exists(:all, *args.map {|col| "#{col}_id"})
    super if to_be_called
    $log.debug {debug_str :remove_references, to_be_called, to_be_called, *args}
  end
  alias :remove_belongs_to :remove_references

  # Accepts either column name(s) or a hash with :column; raises
  # ArgumentError when neither yields anything.
  def remove_index options
    indexes = options.is(Hash) ? options[:column] : options
    raise ArgumentError, "can remove only default format named indexes in declaration!" if !indexes
    to_be_called = index_exists :all, *indexes
    super if to_be_called
    $log.debug {debug_str :remove_index, to_be_called, to_be_called, options}
  end

  def remove_timestamps
    to_be_called = column_exists 'created_at', 'updated_at'
    super if to_be_called
    $log.debug {debug_str :remove_timestamps, to_be_called, to_be_called}
  end

end
129
+
130
+ end
131
+
132
class Base

  # Declares the table behind the model right in the model body.
  #
  # * table missing or options[:force] given - create_table(name, options)
  #   is run with the block;
  # * options[:map] given - the block runs against a mapping VirtualTable
  #   and the table is then synchronized through VirtualTable#map!;
  # * otherwise the block runs against a plain VirtualTable.
  #
  # Column information is reset afterwards in every case.
  def self.declare name, options={}, &block
    self.table_name = name
    table_missing = !table_exists?
    if table_missing or options[:force]
      if options[:force]
        $log < "with options[:force] the `#{table_name}` table will have been recreated each time you load the #{model_name} model"
      end
      if options[:id] != false and options[:primary_key]
        self.primary_key = options[:primary_key]
      end
      $log.debug "connection.create_table(#{name}, #{options.inspect}) {}"
      connection.create_table(name, options, &block)
    elsif options[:map]
      mapped = ConnectionAdapters::VirtualTable.new(name, connection, options[:map])
      yield mapped
      mapped.map!
    else
      yield ConnectionAdapters::VirtualTable.new(name, connection)
    end
    reset_column_information
  end

end
151
+
152
+ end
153
+
@@ -0,0 +1,63 @@
1
+ # encoding: utf-8
2
+ module Johnson
3
# Enabled tells the rest of the library whether Johnson can be used:
# false when the gem cannot be loaded, and false for the combination of
# VERSION <= "2.0.0" with RUBY_VERSION > "1.9" (plain string comparison).
begin
  require 'johnson'
rescue LoadError
  Enabled = false
else
  Enabled = !(VERSION <= "2.0.0" and RUBY_VERSION > "1.9")
end
13
+ ### JavaScript interface DOM emulation ###
14
+
15
class Runtime
  attr_accessor :thread_id
  # True when no runtime is needed (:eval is off) or when $JSRuntime is
  # already bound to the current carier thread.
  Runtime_is_set = lambda {|o| !o[:eval].b or ($JSRuntime and $JSRuntime.thread_id == $CarierThread.object_id)}
  BROWSER_PATH = File.expand_path('browser', File.dirname(__FILE__))

  # CarierThread breaks if Multi has no work && CarierThread
  # is joined, so it won't last forever.
  #
  # Johnson is not thread safe =>
  # a Runtime created in this thread will become unusable after
  # CarierThread dies.
  #
  # So we don't use Curl.wait until Carier has got the whole
  # request for this Runtime.
  def self.set_browser_for_curl(opts)
    return if Runtime_is_set[opts]

    if Curl.status
      Curl.recall
      $log.debug 'recalled'
    end
    if opts[:thread_safe].b
      # The caller vouches for thread safety: build the runtime right here.
      $JSRuntime = new_browser(opts[:jq])
      $log.debug "#{$JSRuntime} initialized in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
    else
      # Otherwise build it inside Curl.execute and poll until it is in place.
      $log.debug 'about to run carier'
      Curl.execute do
        $JSRuntime = new_browser(opts[:jq])
        $log.debug "#{$JSRuntime} initialized in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
      end
      sleep 0.01 until Runtime_is_set[opts]
    end
  end

  # New runtime with the DOM emulation scripts (and jquery when +jq+ is
  # set) evaluated and an empty document installed.
  def self.new_browser(jq=false)
    rt = new
    scripts = %w{xmlw3cdom_1 xmlw3cdom_2 xmlsax env}
    scripts << 'jquery' if jq
    scripts.each do |script|
      path = "#{BROWSER_PATH}/#{script}.js"
      rt.evaluate IO.read(path), path, 1
    end
    rt.document = ''
    rt
  end

  # Replaces the JS-side `document` with a DOMDocument built from +html+.
  def document=(html)
    evaluate "var document = new DOMDocument(#{html.to_doc.to_xhtml.inspect})"
  end

end
62
+
63
+ end
data/lib/frame.rb ADDED
@@ -0,0 +1,766 @@
1
+ # encoding: utf-8
2
+ module HTTPAccessKit
3
+
4
+ # Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, ... ) ) =>
5
+ # Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, ...
6
+
7
# Raised when the parts of a zipped request do not line up.
# +debug+ supplies the values for the placeholders of the format +str+.
class ZippingError < ArgumentError
  def initialize(debug, str="invalid use of :zip option, uri and body must be an arrays with the same size\n uri: %s(%s), body: %s(%s)")
    message = str % debug
    super(message)
  end
end
11
+
12
# Raised when a uri cannot be resolved against the target of the frame.
class TargetError < ArgumentError
  def initialize(msg="only static frame can use local paths")
    super(msg)
  end
end
16
+
17
# Raised on an unusable frame configuration; the message is mandatory.
class ConfigError < ArgumentError
  def initialize(msg)
    super(msg)
  end
end
21
+
22
+ class Frame
23
+ __init__
24
+ attr_reader :loc, :static, :ss, :opts
25
+
26
# Frame([uri], [opts hash], [scouts count = 10])
#
# With a String as the first argument the frame is bound to that uri
# ("http://" is prepended in place when no scheme is given) and is static
# unless :static => false is passed. Without it the frame has no target
# and is never static.
# All arguments are handed over to ScoutSquad as they are.
def initialize *args
  # Integer rather than Fixnum: Fixnum is gone from current Rubies, and
  # Integer matches it on the old ones.
  args << 10 unless args[-1].is Integer
  @opts = {:eval => Johnson::Enabled, :redir => true, :cp => true, :result => Page}.merge!(args[-2].is(Hash) ? args[-2] : {})
  if args[0].is String
    uri = args[0]
    'http://' >> uri if uri !~ /^\w+:\/\//
    @loc = uri.parse:uri
    # be careful, if you set :static => false, frame will be unable to use implicit url
    @static = @opts.fetch(:static, true).b
  else
    @loc = {}
    @static = false
  end
  @ss = ScoutSquad *args
  @pages = []
  Curl.run unless Curl.status
end
43
+
44
# Points the squad and the frame at a new target; "http://" is assumed
# when +to+ carries no scheme. Returns the new parsed location.
def retarget(to, forced=nil)
  unless to =~ /^\w+:/
    to = 'http://' + to
  end
  @ss.update(to, forced)
  @loc = to.parse(:uri)
end
49
+
50
# Assignment form of #retarget (never forced).
def target=(to)
  retarget(to)
end
53
+
54
# Thin delegators to the scout squad.

def next
  @ss.next
end

def rand
  @ss.rand
end

def each(&block)
  @ss.each(&block)
end

def [](i)
  @ss[i]
end
58
+
59
# One-line summary: target, squad size, static flag and cookie state of
# the first scout.
def inspect
  target  = @ss.untargeted ? 'no target' : @loc.root
  squad   = 'scout'.x @ss.size
  mode    = (', static' if @static)
  cookies = @ss[0].cookieProc ? 'on' : 'off'
  "<#Frame @ #{target}: #{squad}#{mode}, cookies #{cookies}>"
end
62
+
63
# Runs a single request or a batch, depending on what #interpret_request
# makes of the arguments.
# opts are :eval, :json, :hash, :wait, :proc_result, :save_result, :load_scripts,
# :zip, :thread_safe, :result, :stream, :raw + any opts for Scouts in one hash
def get(*args, &callback)
  many, order, orders, with_opts = interpret_request(*args)
  L.log({:many => many, :order => order, :orders => orders, :with_opts => with_opts})

  unless Johnson::Enabled
    if with_opts[:eval]
      L < "failed to use option :eval because Johnson is disabled"
      with_opts.delete :eval
    end
  end
  # JS Runtime is not thread-safe and must be created in curl thread
  # unless we are explicitly told otherwise
  Johnson::Runtime.set_browser_for_curl with_opts

  return exec_many(orders, with_opts, &callback) if many
  exec_one(order, with_opts, &callback)
end
alias :exec :get
alias :run :get
82
+
83
# Turns the free-form arguments of #get into
#   [many, order, orders, opts]
# where +order+ is a single [:loadGet, uri] / [:loadPost, body, mp, uri]
# call spec, +orders+ is an array of such specs and +many+ tells which of
# the two is filled in.
# Positional arguments are read as (body, mp, uri) through get_opts, with
# the frame's @opts as option defaults.
# Raises TypeError for a post body that is neither a hash nor hashes, and
# ArgumentError when no request could be built.
+ def interpret_request(*args)
83
+ body, mp, uri, opts = args.dup.get_opts [nil, false, nil], @opts
84
+ L.log [body, mp, uri, opts]
85
+ zip = opts.delete :zip
86
+ many = order = orders = post = false
87
+ # Default options set is for POST
88
# `and`/`or` share one precedence level and group left to right, so the
# next condition reads: (mp is String or Array) and not (uri is String or Array).
+ if mp.is String or mp.kinda Array and !(uri.is String or uri.kinda Array)
89
+ # if second arg is String, then that's uri
90
+ uri, mp, post = mp.dup, false, true
91
+ # L.debug "uri #{uri.inspect} has been passed as second argument instead of third"
92
+ # But if we have only one argument actually passed
93
+ # except for options hash, then believe it's GET
94
+ elsif body.is String or body.kinda [String]
95
# NOTE(review): ' '+body.class adds a Class to a String, which core Ruby
# rejects with TypeError; reached when body is an array and uri is given — confirm.
+ L.debug "first parameter (#{body.inspect}) was implicitly taken as uri#{' '+body.class if body.kinda Array}, but last paramter is of type #{uri.class}, too" if uri
96
+ uri = body.dup
97
+ elsif !body then uri = nil
98
+ else
99
+ uri = uri.dup if uri
100
+ mp, post = !!mp, true
101
+ end
102
+ if post
103
+ unless body.is Hash or body.kinda [Hash]
104
+ raise TypeError, "body of post request must be a hash or hash array, params was
105
+ (#{args.inspect[1..-2]})"
106
+ end
107
+ validate_zip uri, body if zip
108
+ if zip or uri.kinda Array or body.kinda Array
109
+ many = true
110
+ if zip or uri.kinda Array
111
+ validate_some uri
112
# zip pairs bodies with uris one to one; otherwise xprod combines them
+ orders = zip ? body.zip(uri) : uri.xprod(body, :inverse)
113
+ else
114
+ uri = validate uri
115
+ orders = body.xprod uri
116
+ end
117
# every [body, uri] pair becomes [:loadPost, body, mp, uri]
+ orders.each {|o| o.unshift :loadPost and o.insert 2, mp}
118
+ else
119
+ uri = validate uri
120
+ order = [:loadPost, body, mp, uri]
121
+ end
122
+ else
123
+ if uri.kinda Array
124
+ many = true
125
+ validate_some uri
126
+ orders = [:loadGet].xprod uri
127
+ else
128
+ uri = validate uri
129
+ order = [:loadGet, uri]
130
+ end
131
+ end
132
+ if !order.b and !orders.b
133
+ raise ArgumentError, "failed to run blank request#{'s' if many}, params was
134
+ (#{args.inspect[1..-2]})"
135
+ else
136
# option normalization: :sync is an alias of :wait; waiting is implied
# when a result is to be saved or :proc_result is present but falsy;
# :json/:hash/:raw turn :eval off; :raw implies :stream
+ opts[:wait] = opts[:sync] if :sync.in opts
137
+ opts[:wait] = true if !:wait.in(opts) and
138
+ :proc_result.in(opts) ? !opts[:proc_result] : opts[:save_result]
139
+ opts[:eval] = false if opts[:json] or opts[:hash] or opts[:raw]
140
# a truthy :load_scripts is replaced by the frame itself
+ opts[:load_scripts] = self if opts[:load_scripts]
141
+ opts[:stream] = true if opts[:raw]
142
+ [many, order, orders, opts]
143
+ end
144
+ end
146
+
147
# Fetches the bodies of +links+ through Cache, downloading what is missing.
# A trailing :expire makes cached entries get revalidated with an
# If-Modified-Since request.
# Links whose path matches /ads|count|stats/ are skipped.
#
# Returns the body (nil when nothing was obtained) for a single link,
# otherwise the obtained bodies in the order of +links+.
def get_cached(*links)
  res = []
  expire = links[-1] == :expire ? links.pop : false
  # A cache entry is either the content itself or an object with a path.
  cached_body = lambda {|file| file.is(String) ? file : read(file.path)}
  links.parses(:uri).each_with_index {|uri, i|
    next if uri.path[/ads|count|stats/]
    file = Cache.load uri, !expire
    if file
      if expire
        @ss.next.loadGet(uri.href, :headers=>{'If-Modified-Since'=>file.date}) {|c|
          if c.res.code == 200
            res << [i, (data = c.res.body)]
            Cache.save uri, data, false
          else
            res << [i, cached_body[file]]
          end
        }
      else
        res << [i, cached_body[file]]
      end
    else
      @ss.next.loadGet(uri.href) {|c|
        if c.res.code == 200
          res << [i, (data = c.res.body)]
          Cache.save uri, data, !expire
        end
      }
    end
  }
  Curl.wait
  if links.size == 1
    # The only link may have been skipped or may have failed to load.
    res[0] && res[0][1]
  else
    res.sort!.lasts
  end
end
178
+
179
# Downloads +uri+ in ranged parts spread over the scouts.
# A probing request is started on one scout; its header callback decides
# how to continue once the status line and Content-Length are seen.
# The block is yielded (len, position, body) for each obtained part, and
# (len, psize, :careful_dl) once beforehand for files above the
# :careful_dl threshold (10.mb unless set in the frame opts).
# Raises ConfigError when the frame has fewer than two scouts.
+ def get_distr(uri, psize, threads, start=0, print_progress=$verbose)
179
+ raise ConfigError, "Insufficient Scouts in the Frame for distributed downloading" if @ss.size < 2
180
+ @print_progress, code, stop_download, @ss_reserve = print_progress, nil, false, []
181
+ (s = @ss.next).http.on_header {|h|
182
# only the status line, Content-Length and the blank line ending the
# headers are of interest; other header lines are acknowledged as is
+ next h.size unless h[/Content-Length: (\d+)|HTTP\/1\.[01] (\d+)[^\r]+|^\s*$/]
183
+ if code = $2
184
+ if code != '200'
185
+ L << "#$& getting #{uri}; interrupting request."
186
+ s.http.on_header() # set default process
187
# presumably a return value other than the header size aborts the
# transfer (libcurl callback convention) — confirm
+ next 0
188
+ end
189
+ next h.size
190
+ end
191
+
192
+ s.http.on_header() # set default process
193
+ if !$1 # end of headers reached, Content-Length is absent
194
+ L << "No Content-Length header; trying to load a whole #{uri} at once!"
195
+ s.loadGet {|c| yield c.res.body.size, 0, c.res.body}
196
+ next 0
197
+ end
198
+
199
+ len = $1.to_i - start
200
+ psize = configure_psize(len, psize, threads)
201
+ parts = (len/psize.to_f).ceil
202
+ setup_speedometer(uri, parts, len)
203
+ yield len, psize, :careful_dl if len > (@opts[:careful_dl] || 10.mb)
204
+
205
# the squad is cut down to threads+1 scouts for the download; the rest
# is parked in @ss_reserve and given back in the ensure clause
+ @ss_reserve = @ss[threads+1..-1]
206
+ @ss = @ss[0..threads]
207
+ (0...parts).each {|n|
208
+ break if stop_download
209
+
210
+ s = @ss.next
211
+ run_speedometer(s, len, n)
212
+ s.loadGet(uri, :headers => {
213
+ 'Range' => "bytes=#{start + n*psize}-#{start + (n+1)*psize - 1}"
214
+ }) {|c|
215
+ clear_speedometer(s)
216
# integer division: accepts the codes 200..209
+ if c.res.code/10 == 20
217
+ yield len, n*psize, c.res.body
218
+ else
219
+ L << "#{c.res} during get #{uri.inspect}; interrupting request."
220
+ stop_download = true
221
+ end
222
+ }
223
+ }
224
+ 0
225
+ }
226
+ s.raise_err = false
227
+ s.loadGet validate uri
228
+ ensure
229
+ @ss.concat @ss_reserve || []
230
+ end
232
+
233
# Downloads +uri+ into the file +df+ by means of #get_distr.
# opts: :threads (all scouts but one by default) and :start (offset).
# With a block the call does not wait and yields +df+ once every byte has
# been written; without one it waits for the transfers to finish.
# Returns +df+.
def dl(uri, df=File.basename(uri.parse(:uri).path), psize=:auto, opts={})
  dled = 0
  lock = ''
  on_part = lambda do |len, pos, body|
    if body == :careful_dl
      # announcement of a big download rather than a part
      lock = lock_file df, len, pos # filename, filesize, partsize
    else
      begin
        write(df, body, pos)
      rescue => e
        binding.start_interaction
        raise
      end
      dled += body.size
      if dled == len
        File.delete lock if File.file? lock
        yield df if block_given?
      end
    end
  end
  opts[:threads] ||= @ss.size-1
  get_distr(uri, psize, opts[:threads], opts[:start].to_i, &on_part)
  Curl.wait unless block_given?
  df
end
257
+
258
# Resumable download of +uri+ into +df+; progress is tracked in the
# "<df>.map" file through read_mapfile/check_mapfile/write_mapfile.
# opts: :psize, :threads, :print_progress, :len, :start, :scout.
# When the length is known neither from opts[:len] nor from the mapfile,
# a HEAD request is issued and the method re-enters itself with :len set.
# Yields +df+ (if a block is given) when write_mapfile reports completion.
+ def simple_dl(uri, df=File.basename(uri.parse(:uri).path), opts={})
259
+ opts.reverse_merge! :psize => :auto, :threads => 1, :print_progress => $verbose
260
+ L << opts
261
+
262
+ @print_progress = opts[:print_progress]
263
+ unless len = opts[:len] || (map = read_mapfile(df) and map.len)
264
+ return @ss.next.loadHead(uri) {|c| $log << c
265
+ if len = c.res['Content-Length']
266
+ simple_dl(uri, df, opts.merge(:len => len.to_i))
267
+ else L.warn "Can't get file size, so it has no sence to download this way. Or maybe it's just an error. Check ObjectSpace.find(#{c.res.object_id}) out."
268
+ end
269
+ }
270
+ end
271
+
272
+ psize, parts = check_mapfile(df, opts)
273
# check_mapfile returns [] for an already completed file
+ return unless psize
274
+ L << [psize, parts]
275
+ setup_speedometer(uri, parts.size, len)
276
+
277
+ obtained uri do |uri|
278
+ if opts[:threads] == 1
279
# NOTE(review): to_i never returns nil, so the alternatives after `||`
# cannot be reached — confirm the intended precedence.
+ start = opts[:start].to_i || (parts[0] && parts[0].begin) || 0
280
+ scout = opts[:scout] || @ss.next
281
+ $log << [uri, scout]
282
# single scout: part n+1 is requested from the callback of part n
+ (loadget = lambda {|n|
283
+ run_speedometer(scout, len, n)
284
+ from = start + n*psize
285
+ to = start + (n+1)*psize - 1
286
+ scout.loadGet(uri, :headers => {'Range' => "bytes=#{from}-#{to}"}) {|c|
287
+ begin
288
+ $log << "writing #{df} from #{from}: #{c.res.body.inspect}"
289
+ write(df, c.res.body, from)
290
+ rescue => e
291
+ binding.start_interaction
292
+ raise
293
+ end
294
+ if write_mapfile(df, from, to)
295
+ clear_speedometer(scout)
296
+ L.warn "file completely dl'ed, but (n+1)*psize <= len: (#{n}+1)*#{psize} <= #{len}" if (n+1)*psize <= len
297
+ yield df if block_given?
298
+ elsif (n+1)*psize <= len
299
+ loadget[n+1]
300
+ end
301
+ }
302
+ })[0]
303
+ else
304
# several scouts: one raw ranged request per remaining part
+ exec(uri, opts.merge(:raw => true, :ranges => parts)) {|c|
305
+ L << c.res
306
+ range = c.req.range
307
+ begin
308
+ write(df, c.res.body, range.begin)
309
+ rescue => e
310
+ binding.start_interaction
311
+ raise
312
+ end
313
+ if write_mapfile(df, range.begin, range.end)
314
+ @ss.each {|s| s.http.on_progress} if @print_progress
315
+ yield df if block_given?
316
+ end
317
+ }
318
+ end
319
+ end
320
+ end
321
+
322
# Works out what is left to download for +df+.
# Returns [part size, remaining ranges], or [] when the mapfile says the
# file is complete (the mapfile is deleted in that case).
# A missing mapfile is created first. Raises RuntimeError when
# opts[:len] disagrees with the length stored in the mapfile.
def check_mapfile(df, opts={})
  opts.reverse_merge! :psize => :auto, :threads => 1
  map = read_mapfile df

  unless map
    write_mapfile df, opts[:len]
    psize = configure_psize(*opts.values_at(:len, :psize, :threads))
    $log << (0...opts[:len]).div(psize)
    return [psize, (0...opts[:len]).div(psize)]
  end

  L << map
  if map.rest.empty?
    puts "#{df} is loaded"
    $log << 'deleting mapfile'
    File.delete df+'.map'
    return []
  end

  raise "Incorrect file size for #{df}" if opts[:len] and map.len != opts[:len]
  psize = configure_psize(*opts.values_at(:len, :psize, :threads))
  [psize, map.rest.div(psize)]
end
346
+
347
# Reads "<df>.map" and returns a hash-like map, or nil when there is
# nothing to read.
# * header with two equal numbers - map.rest is empty;
# * otherwise map.len is the number on the first line, map.parts are the
#   ranges listed on the following lines and map.rest is what they leave
#   uncovered of 0...len.
def read_mapfile(df)
  df += '.map'
  text = read df
  $log << "mapfile read: #{text}"
  if text.b
    text[/^(\d+)\0+(\d+)\0*\n/]
    map = {}
    $log << [$1,$2]
    if $1 and $1 == $2
      map.rest = []
    else
      map.len, *map.parts = text.chop/"\n"
      map.len = map.len.to_i
      # write_mapfile records parts as "from..to"; splitting on '-' alone
      # dropped the upper bound, so both separators are accepted.
      map.parts.map! {|part|
        bounds = part.split(/\.\.|-/)
        bounds[0].to_i..bounds[1].to_i
      }
      $log << map.parts
      map.rest = (0...map.len) - XRange(*map.parts)
    end
    map
  end
end
367
+
368
# Updates "<df>.map".
#   write_mapfile(df, len)      - initialization call
#   write_mapfile(df, from, to) - records a downloaded range
# Returns true when the recorded byte count reaches the length read from
# the mapfile header (nothing is written then), nil otherwise.
+ def write_mapfile(df, *args)
368
+ df += '.map'
369
+ map = ''
370
+ if args.size != 2
371
+ len = args.shift
372
# NOTE(review): the header line is added only when the mapfile already
# exists — confirm that `if` is not meant to be `unless`.
+ map << len.to_s.ljust(22, "\0") << "\n" if File.file? df
373
+ end
374
+ if args.any?
375
# $1 = first number of the header, $2 = second number (bytes so far)
+ read(df)[/^(\d+)\0+(\d+)\0*\n/]
376
+ $log << "mapfile read"
377
+ $log << [$1,$2]
378
+ dled = $2.to_i + args[1] - args[0] + 1
379
+ return true if dled == $1.to_i
380
+ map << "#{args[0]}..#{args[1]}\n"
381
+ $log << 'writing mapfile'
382
# the new byte count goes to offset 11, padded with NULs to 11 chars
+ write(df, dled.to_s.ljust(11, "\0"), 11)
383
+ end
384
+ $log << [df, map]
385
+ $log << 'writing mapfile'
386
+ write df, map
387
+ nil
388
+ end
390
+
391
# Resolves the part size setting into a number:
# * a Numeric is truncated to an integer,
# * :auto splits files above 100000 bytes evenly between +threads+
#   (smaller ones are taken in one piece),
# * :mb stands for 1.mb.
# Anything else raises ArgumentError.
def configure_psize(len, psize, threads)
  return psize.to_i if Numeric === psize
  return(len > 100000 ? len/threads+1 : len) if :auto === psize
  return 1.mb if :mb === psize
  raise ArgumentError, "Incorrect value for part size #{psize}:#{psize.class}"
end
399
+
400
+ private
401
# :zip needs uri and body to be arrays of one size; raises ZippingError
# (with the classes and sizes involved) otherwise.
def validate_zip(uri, body)
  both_arrays = (uri.kinda Array and body.kinda Array)
  raise ZippingError, [uri.class, nil, body.class, nil] unless both_arrays
  raise ZippingError, [uri.class, uri.size, body.class, body.size] if uri.size != body.size
end
408
+
409
# Resolves +uri+ against the frame target and returns the uri to request.
# * nil          - the target itself (static frames only);
# * rootless uri - joined with the target root (static frames only);
# * foreign root - rejected by static frames, otherwise adopted as the
#                  new root of the frame;
# * same root    - returned as is.
# Raises TargetError in the rejected cases.
def validate(uri)
  unless uri
    raise TargetError if !@static
    return @loc.href
  end

  root = uri.parse(:uri).root
  if !root
    raise TargetError if !@static
    File.join @loc.root, uri
  elsif root != @loc.root
    raise TargetError, "failed to get #{uri} by static frame #{@loc.host}, you should first update it with new target" if @static
    @loc.root = root
    uri
  else
    uri
  end
end
426
+
427
# Validates every uri in place; returns the same array.
def validate_some(uris)
  uris.each_index {|i| uris[i] = validate(uris[i])}
  uris
end
430
+
431
# Sends one order through the next scout.
# The block receives the raw curl handle with :raw, otherwise the
# processed page; its result is stored in page.res when :save_result or
# :proc_result is in use and is passed to a :proc_result Proc unless it
# equals :skip.
# Returns the page, or page.res when waiting and a result is being kept.
def exec_one(order, opts)
  # must result in Page (default) or its subclass
  page = opts[:result].new
  # if no spare scouts can be found, squad simply waits for all callbacks to complete
  s = @ss.next
  # s.raise_err = true  # why would this be here? :raise=>1 can be given to the frame on creation
  order << opts
  s.send(*order) do |curl|
    if opts[:raw]
      yield curl
    elsif page.process(curl, opts) and block_given?
      yres = yield page
      page.res = yres if opts[:save_result] or :proc_result.in opts
      opts[:proc_result].call yres if opts[:proc_result].is Proc and yres != :skip
    end
  end
  return page unless opts[:wait]

  opts[:thread_safe] ? $Carier.perform : Curl.wait
  # why not cut the boilerplate in services and return res right away?
  (opts[:save_result] or :proc_result.in opts) ? page.res : page
end
457
+
458
# Runs every order through #exec_one.
# With :ranges each order gets its own Range header; the number of ranges
# must equal the number of orders (ZippingError otherwise).
# Waits for completion when :wait was set.
# Returns true-ish :stream value when streaming, the pages otherwise.
def exec_many(orders, with_opts, &callback)
  w = with_opts.delete :wait
  iterator = with_opts[:stream] ? :each : :map
  if with_opts[:ranges]
    if orders.size != with_opts[:ranges].size
      # Built explicitly: in the `raise Class, a, b` form the third
      # argument is a backtrace, so the format string never reached
      # the constructor.
      raise ZippingError.new([orders.size, with_opts[:ranges].size], "orders quantity (%s) is not equal ranges quantity (%s)")
    end
    pages = orders.zip(with_opts[:ranges]).send(iterator) {|order, range|
      (with_opts[:headers] ||= {}).Range = "bytes=#{range.begin}-#{range.end}"
      exec_one order, with_opts, &callback
    }
  else
    pages = orders.send(iterator) {|order| exec_one order, with_opts, &callback }
  end
  with_opts[:thread_safe] ? $Carier.perform : Curl.wait if w
  with_opts[:stream] || pages
end
475
+
476
+
477
# Prepares the progress display state and starts a thread that refreshes
# the speed figure every 0.2 s until @stop_print is set.
# Does nothing unless @print_progress is on.
def setup_speedometer(uri, parts, len)
  return unless @print_progress
  @progress = Array.new(parts, 0)
  @stop_print = false
  @speed = ''
  @sum = 0
  # [time of the last measurement, bytes received by then]
  @speedometer = [Time.now, 0]
  @str = "Downloading #{uri.gsub '%', '%%'} (#{len.bytes}) in %03s streams, %07s/s:"
  @newlines = (uri.unpack('U*').size+len.bytes.size+42)/(ENV['COLUMNS'] || 80).to_i
  @bs = "\b\r"*@newlines
  Thread.new do
    until @stop_print
      sleep 0.2
      now = Time.now
      if now > @speedometer[0] and @sum > @speedometer[1]
        rate = (@sum - @speedometer[1])/(now - @speedometer[0])
        @speed.replace(rate.to_i.bytes)
        @speedometer.replace [now, @sum]
      end
    end
  end
end
494
+
495
# Hooks the progress callback of +scout+ so that part +n+ feeds the
# shared progress line; printing stops once 100 percent is reached.
# Does nothing unless @print_progress is on.
def run_speedometer(scout, len, n)
  return unless @print_progress
  scout.http.on_progress do |dl_need, dl_now, *ul|
    unless @stop_print
      @progress[n] = dl_now
      @sum = @progress.sum
      percents = @sum*100/len
      print @str%[@progress.select_b.size, @speed]+"\n%%[#{'@'*percents}#{' '*(100-percents)}]\r\b\r"+@bs
      if percents == 100
        puts "\v"*@newlines
        @stop_print = true
      end
    end
    true
  end
end
510
+
511
# Calls on_progress without a block on the scout's handle, when progress
# printing is on.
def clear_speedometer(scout)
  scout.http.on_progress if @print_progress
end
515
+
516
+ end
517
+
518
# One-call download: starts Curl, builds a throw-away frame with
# +threads+ scouts and the given timeout, and downloads +uri+ into +df+.
# The block, if any, is passed on to Frame#dl.
def dl(uri, df=File.basename(uri.parse(:uri).path), threads=5, timeout=600, &block)
  Curl.run
  # Frame#dl takes its fourth argument as an options hash, so the thread
  # count has to travel under the :threads key.
  Frame({:timeout=>timeout}, threads).dl(uri, df, :auto, {:threads => threads}, &block)
end
module_function :dl
523
+
524
+
525
+
526
+ class Page
527
+ # for debug, just enable L#debug, don't write tons of chaotic log-lines
528
+ __init__
529
+ # res here is result of page processing made in frame context
530
+ attr_accessor :title, :res
531
+ attr_reader :html, :loc, :hash, :doc, :js
532
+ @@ignore = /google|_gat|tracker|adver/i
533
+
534
# Page(html, [loc], [js])       - wraps ready markup;
# Page(curl_or_scout, [opts])   - processes a finished request at once,
#                                 the second argument then being the
#                                 options for #process.
def initialize(obj='', loc=Hash.new(''), js=$JSRuntime||Johnson::Runtime.new)
  loc = loc.parse:uri if !loc.is Hash
  @js = js
  if obj.is Curl::Easy or obj.kinda Scout
    # A scout carries its handle in #http; a Curl::Easy is the handle.
    # (The #html reader was used here before, which is still nil.)
    c = obj.kinda(Scout) ? obj.http : obj
    @html = ''
    # just (c, loc) would pass to #process opts variable that returns '' on any key
    process(c, loc.b || {})
  else
    @html = obj
    @loc = loc
  end
end
547
+
548
# Summary of the page: title and markup size for html pages, data size
# and kind for pages parsed as json / params hash.
def inspect
  if @hash.nil?
    summary = @html.b ? "«#{@title}» (#{@html.size.bytes}" : '(empty'
    js_note = (' js enabled' if @js and @doc and @hash.nil?)
    "<#FramePage #{summary})#{js_note}>"
  else
    size = @hash ? @hash.inspect.size.bytes : 'failed to parse'
    kind = @json ? 'json' : 'params hash'
    "<#FramePage (#{size}) #{kind}>"
  end
end
555
+
556
# Fills the page from a finished request. Subclasses may override it;
# Frame treats a false/nil result as "do not call the user block".
#
# Only 200 responses are looked into:
# * :json - body is parsed into @hash (false on failure, the body is
#           then kept as markup for inspection);
# * :hash - an inline body is converted into @hash (same fallback);
# * else  - body becomes the document; with :eval the scripts are
#           loaded and evaluated.
# Returns self.
def process(c, opts={})
  @loc = c.last_effective_url.parse(:uri)
  L.debug "#{@loc.fullpath} -> #{c.res}"
  return self unless c.res.code == 200

  body = c.res.body
  if opts[:json]
    @json = true
    @hash = begin
      body.from_json
    rescue StandardError
      false
    end
    if !@hash or @hash.is String
      L.debug "failed to get json from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
      @html = body
      to_doc
      @hash = false
    end
  elsif opts[:hash]
    unless body.inline
      @hash = false
      L.debug "failed to get params hash from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
      @html = body
      to_doc
    else
      @hash = body.to_hash
    end
  else
    @html = body
    to_doc
    if opts[:eval]
      load_scripts opts[:load_scripts]
      eval_js
    end
  end
  self
end
594
+
595
# Evaluates the scripts of the page inside the JS runtime.
# Location-related properties of `document`/`window` are set first; then
# each <script> node is visited: inline code is evaluated, whatever it
# left in js[:write_output] is inserted after the node and the node is
# removed; a script with src is fetched through frame.get_cached, which
# happens only when +frame+ is given.
+ def eval_js(frame=nil)
595
+ eval_string "document.location = window.location = #{@loc.to_json};
596
+ document.URL = document.baseURI = document.documentURI = location.href;
597
+ document.domain = location.host;"
598
+ find("script").each {|n|
599
+ L.debug n.text.strip
600
+ if text = n.text.strip.b
601
# reset the buffer that collects the output of the script
+ js[:write_output] = ''
602
+ eval_string text
603
+ if res = js[:write_output].b then n.after res end
604
+ n.remove!
605
+ elsif frame and n.src
606
+ eval_string frame.get_cached expand_link n.src
607
+ end
608
+ }
609
+ end
611
+
612
# Evaluates +str+ in the page runtime (created on demand).
# A Johnson::Error is logged as a warning instead of being raised; at
# debug level the source is logged too, with the offending identifier
# highlighted when the message names one.
def eval_string(str)
  @js ||= Johnson::Runtime.new
  L.debug "#{@js} evaluating in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
  begin
    @js.evaluate(str)
  rescue Johnson::Error => e
    L.warn e.message
    L.debug do
      culprit = e.message.match(/(\w+) is undefined|([\w.]+) is not a function/)
      if culprit
        L.clr.hl! str, /\b#{culprit[1] || culprit[2]}\b/
      end
      "\n\t#{str}"
    end
  end
end
627
+
628
# Parses @html into @doc and settles @title:
# * no title in the document - the page url is used and, when there is
#   a <head>, a <title> node is prepended to it;
# * a title longer than 40 characters is cut near 30 (at a word end) and
#   gets an ellipsis; cyrillic titles are measured and cut in their ANSI
#   form.
# Returns the document.
def to_doc
  @doc = @html.to_doc :forceutf
  if (@title = @doc.title.b)
    if @title.cyr? and UTF2ANSI[@title].size > 40
      @title = ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/]]+'…'
    elsif @title.size > 40
      @title = @title[/.{1,30}\S*/]+'…'
    end
  else
    @title = @loc.href
    head = @doc.at('head')
    head.prepend XML::Node('title', @title) if head
  end
  @doc
end
642
+
643
# Queries run against the parsed document, which is built on first use.

# All nodes matching +xp+.
def find(xp)
  (@doc || to_doc).find(xp)
end

# First node matching +xp+.
def at(xp)
  (@doc || to_doc).at(xp)
end

# Address of the page.
def url
  @loc.href
end
alias :href :url
649
+
650
# Unique expanded src values. A String is used as a query for nodes (and
# is taken for a link itself if the query raises XML::Error); anything
# else is treated as a list of links.
def get_srcs(links='img')
  begin
    if links.is String
      links = find(links).map {|node| node.src}
    end
  rescue XML::Error
    links = [links]
  end
  expanded = links.map {|src| expand_link src}
  expanded.uniq
end
658
+
659
# Expanded src of the first node matching the query, or nil when there
# is none. A String that fails as a query (XML::Error) is expanded as a
# link itself.
def get_src(link='img')
  begin
    if link.is String
      link = at(link) && at(link).src
    end
  rescue XML::Error
    nil
  end
  link ? expand_link(link) : nil
end
666
+
667
# Unique expanded href values. A String is used as a query; when the
# matched nodes give no hrefs, the anchors below them are tried. A
# String that fails as a query (XML::Error) is taken for a link itself.
def get_links(links='a')
  begin
    if links.is String
      direct = find(links).map {|node| node.href}.b
      links = direct || find(links+'//a').map {|node| node.href}
    end
  rescue XML::Error
    links = [links]
  end
  expanded = links.map {|href| expand_link href}
  expanded.uniq
end
675
+
676
# Expanded href of the first node matching the query (or of the first
# anchor below it), nil when the query matches nothing. A String that
# fails as a query (XML::Error) is expanded as a link itself.
def get_link(link='a')
  begin
    if link.is String
      link = at(link) && (at(link).href || at(link+'//a').href)
    end
  rescue XML::Error
    nil
  end
  link ? expand_link(link) : nil
end
alias :get_hrefs :get_links
alias :links :get_links
alias :get_href :get_link
alias :link :get_link
687
+
688
# Expands a link found on the page:
# * "scheme://..." is returned untouched,
# * "//host/..."   gets the protocol of the page,
# * "/path"        gets the root of the page,
# * anything else  is joined with the directory of the page path (or
#                  with the root when the page has no path).
def expand_link(link)
  return link if /^\w+:\/\// === link
  return @loc.protocol+link if /^\/\// === link
  return @loc.root+link if /^\// === link
  base = @loc.path.b ? File.dirname(@loc.path) : @loc.root
  File.join(base, link)
end
696
+
697
# Builds the arguments for Frame#exec out of a form on the page.
# +form+ is a query, a node, or :self (the form whose action is the path
# of this page); +hash+ overrides the values collected from the inputs.
#
# Returns [hash, multipart?, action, opts] for a post form and
# [action with the query string, opts] for any other.
# Raises XML::Error when the query finds no form.
def form(form='form', hash={}, opts={})
  form = "[action=#{@loc.path.inspect}]" if form == :self
  if form.is String
    form_node = at form
    raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node
  else form_node = form
  end
  hash = form_node.inputs_all.merge!(hash)
  action = expand_link(form_node.action || @loc.path)
  # to_s: a form without the method attribute is a GET form, not an error.
  if form_node['method'].to_s.downcase == 'post'
    [hash, form_node.enctype =~ /multipart/, action, opts]
  else
    action = "#{action}#{action['?'] ? '&' : '?'}#{hash.urlencode}" if hash.b
    [action, opts]
  end
end
713
+
714
# Submits a form of this page through +frame+, with this page as the
# Referer. A static frame aimed elsewhere is retargeted for the request
# and forcibly pointed back afterwards.
# Returns what frame.exec returns.
def submit(form, frame, hash={}, opts={}, &callback)
  if @loc
    # NOTE(review): the key is :header here while the request code in
    # Frame uses :headers — confirm the scouts honour both.
    (opts[:header] ||= {}).Referer ||= @loc.href
  end
  query = form(form, hash, opts)

  curr_target = frame.loc.href
  # post queries carry the action third, get queries carry it first
  new_target = query[2] || query[0]
  need_retargeting = (frame.static && curr_target != new_target)
  frame.retarget new_target if need_retargeting
  page = frame.exec(*query, &callback)
  frame.retarget curr_target, :forced if need_retargeting
  page
end
726
+
727
# Evaluates every external script of the page, fetched through the
# cache of +frame+. Without a frame nothing happens and the falsy
# argument is returned.
def load_scripts(frame)
  return frame unless frame
  sources = get_srcs("script[src]")
  frame.get_cached(*sources).each {|code| eval_string code}
end
730
+
731
+ end
732
+
733
# Builds a Page subclass that retries the request whenever the given
# condition, evaluated in the context of the curl handle, holds, e.g.
# to reprocess the page on a non-200 response:
#   page_class = ReloadablePage do
#     @res and @res.code != 200
#   end
def ReloadablePage(&reload_condition)
  Class.new(Page) do
    define_method(:process) do |curl, opts|
      super(curl, opts || {})
      if curl.instance_eval(&reload_condition)
        curl.retry!
        nil # a nil result keeps the user callback from running for this attempt
      else
        self
      end
    end
  end
end
749
+
750
+ end
751
+
752
+
753
+
754
+
755
+
756
+
757
+
758
+
759
+
760
+
761
+
762
+
763
+
764
+
765
+
766
+