rhack 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. data/.gemtest +0 -0
  2. data/CURB-LICENSE +51 -0
  3. data/Gemfile +4 -0
  4. data/History.txt +4 -0
  5. data/LICENSE +51 -0
  6. data/License.txt +17 -0
  7. data/Manifest.txt +61 -0
  8. data/README.txt +12 -0
  9. data/Rakefile +34 -0
  10. data/ext/curb-original/curb.c +977 -0
  11. data/ext/curb-original/curb.h +52 -0
  12. data/ext/curb-original/curb_config.h +235 -0
  13. data/ext/curb-original/curb_easy.c +3455 -0
  14. data/ext/curb-original/curb_easy.h +90 -0
  15. data/ext/curb-original/curb_errors.c +647 -0
  16. data/ext/curb-original/curb_errors.h +129 -0
  17. data/ext/curb-original/curb_macros.h +159 -0
  18. data/ext/curb-original/curb_multi.c +704 -0
  19. data/ext/curb-original/curb_multi.h +26 -0
  20. data/ext/curb-original/curb_postfield.c +523 -0
  21. data/ext/curb-original/curb_postfield.h +40 -0
  22. data/ext/curb-original/curb_upload.c +80 -0
  23. data/ext/curb-original/curb_upload.h +30 -0
  24. data/ext/curb/Makefile +157 -0
  25. data/ext/curb/curb.c +977 -0
  26. data/ext/curb/curb.h +52 -0
  27. data/ext/curb/curb_config.h +235 -0
  28. data/ext/curb/curb_easy.c +3430 -0
  29. data/ext/curb/curb_easy.h +94 -0
  30. data/ext/curb/curb_errors.c +647 -0
  31. data/ext/curb/curb_errors.h +129 -0
  32. data/ext/curb/curb_macros.h +159 -0
  33. data/ext/curb/curb_multi.c +710 -0
  34. data/ext/curb/curb_multi.h +26 -0
  35. data/ext/curb/curb_postfield.c +523 -0
  36. data/ext/curb/curb_postfield.h +40 -0
  37. data/ext/curb/curb_upload.c +80 -0
  38. data/ext/curb/curb_upload.h +30 -0
  39. data/ext/curb/extconf.rb +399 -0
  40. data/lib/cache.rb +44 -0
  41. data/lib/curl-global.rb +151 -0
  42. data/lib/extensions/browser/env.js +697 -0
  43. data/lib/extensions/browser/jquery.js +7180 -0
  44. data/lib/extensions/browser/xmlsax.js +1564 -0
  45. data/lib/extensions/browser/xmlw3cdom_1.js +1444 -0
  46. data/lib/extensions/browser/xmlw3cdom_2.js +2744 -0
  47. data/lib/extensions/curb.rb +125 -0
  48. data/lib/extensions/declarative.rb +153 -0
  49. data/lib/extensions/johnson.rb +63 -0
  50. data/lib/frame.rb +766 -0
  51. data/lib/init.rb +36 -0
  52. data/lib/rhack.rb +16 -0
  53. data/lib/rhack.yml.template +19 -0
  54. data/lib/rhack/proxy/checker.rb +226 -0
  55. data/lib/rhack/proxy/list.rb +196 -0
  56. data/lib/rhack/services.rb +445 -0
  57. data/lib/rhack_in.rb +2 -0
  58. data/lib/scout.rb +591 -0
  59. data/lib/words.rb +37 -0
  60. data/test/test_frame.rb +107 -0
  61. data/test/test_rhack.rb +5 -0
  62. data/test/test_scout.rb +53 -0
  63. metadata +195 -0
@@ -0,0 +1,125 @@
1
+ # encoding: utf-8
2
+ module Curl
3
+
4
# Project-specific conveniences on top of curb's Curl::Easy handle:
# response/request wrappers, URL helpers and symbolic option setters.
class Easy
  __init__
  attr_accessor :base

  # Builds a Response() wrapper around this handle.
  def res
    Response(self)
  end
  alias_method :response, :res

  # Request object exposed by the response wrapper.
  def req
    res.req
  end
  alias_method :request, :req

  # Root part of the current URL, as reported by url.parse(:uri).
  def host
    parsed = url.parse(:uri)
    parsed.root
  end

  # Points the handle at the full path of +href+ on the current host.
  def path=(href)
    root = host
    self.url = root + href.parse(:uri).fullpath
  end

  # Delegates the retry to the object stored in @base.
  def retry!
    @base.retry!
  end

  # curb changed its getter interface, so these shortcuts are borrowed
  # from curb/lib/curl/easy.rb.

  # Sets a curl option given either a symbolic name (resolved through
  # #sym2curl) or anything convertible to an integer option code.
  def set(opt,val)
    option_code = opt.is_a?(Symbol) ? sym2curl(opt) : opt.to_i
    setopt(option_code, val)
  end

  # Maps :some_option to the Curl::CURLOPT_SOME_OPTION constant.
  def sym2curl(opt)
    constant_name = "CURLOPT_#{opt.to_s.upcase}"
    Curl.const_get(constant_name)
  end

  def interface=(value)
    set(:interface, value)
  end

  def url=(u)
    set(:url, u)
  end

  def proxy_url=(url)
    set(:proxy, url)
  end

  def userpwd=(value)
    set(:userpwd, value)
  end

  def proxypwd=(value)
    set(:proxyuserpwd, value)
  end

  def follow_location=(onoff)
    set(:followlocation, onoff)
  end

  # The flag is coerced to a strict boolean before being handed to curl.
  def head=(onoff)
    set(:nobody, !!onoff)
  end

  # The flag is coerced to a strict boolean before being handed to curl.
  def get=(onoff)
    set(:httpget, !!onoff)
  end

end
76
+
77
class PostField

  # Human-readable "name=content" form of the field.
  #
  # The content part is, in order of preference: the inspected content proc,
  # the first 20 characters of the content (plus its size when it is longer),
  # or the inspected File for +local_name+. It is empty when none is set.
  #
  # Raises RuntimeError when the field has no name.
  def to_s
    raise "Cannot convert unnamed field to string" if !name
    display_content =
      if (cp = content_proc)
        cp.inspect
      elsif (c = content)
        "#{c[0...20].inspect}#{"… (#{c.size.bytes})" if c.size > 20}"
      elsif (ln = local_name)
        # Block form closes the handle right away; a bare File.new here
        # left a descriptor open until the garbage collector reclaimed it.
        File.open(ln) {|file| file.inspect}
      end
    "#{name}=#{display_content}"
  end

end
92
+
93
# Helpers for the shared multi handle (the "Carier").
class Multi
  # Some curb versions expose the queued handles as #requests; mirror it as #reqs.
  alias_method :reqs, :requests if method_defined?(:requests)

  # Detaches every queued handle (errors from #remove are ignored) and
  # replaces the global $Carier with a fresh pipelined Multi.
  def reset
    reqs.each do |queued|
      begin
        remove queued
      rescue StandardError
        nil
      end
    end
    $Carier = Multi.new
    $Carier.pipeline = true
    # GC.start
  end

  # Performs until nothing is running (errors from #perform are ignored),
  # then calls Curl.recall.
  def drop
    while running > 0
      begin
        perform
      rescue StandardError
        nil
      end
    end
    Curl.recall
  end

  # #drop, followed by #reset when anything is still queued or running.
  def drop!
    drop
    leftovers = reqs.size + running
    reset if leftovers > 0
  end

  # True when something is running and no more than the queued handles.
  def sheduled
    running > 0 && running <= reqs.size
  end

  def inspect
    units = 'unit'.x(reqs.size)
    "<#Carier #{units}, #{running} executing>"
  end

end
124
+
125
+ end
@@ -0,0 +1,153 @@
1
+ # encoding: utf-8
2
+ module ActiveRecord
3
+
4
+ module ConnectionAdapters
5
+ AbstractAdapter
6
+
7
+ class VirtualTable < Table
8
+
9
+ def debug_str meth, called, exist, *args
10
+ "Table.#{meth}(#{args.inspects*', '}) was#{' NOT' if !called} called due to #{'in' if !exist}existance"
11
+ end
12
+
13
# Checks the table's current columns for the given names.
#
# By default answers "does ANY of these columns exist?"; when the first
# argument is :all it answers "do ALL of them exist?". A trailing options
# hash is stripped and plays no part in the check.
def column_exists(*args)
  known = @base.columns(@table_name).names
  args.extract_options!
  candidates = args.dup
  any_mode = true
  if candidates[0] == :all
    candidates.shift
    any_mode = false
  end
  # In "any" mode the first hit decides; in "all" mode the first miss does.
  candidates.each do |candidate|
    return any_mode if candidate.to_s.in(known) == any_mode
  end
  !any_mode
end
22
+
23
# Checks the table's indexed columns for the given names.
#
# By default answers "is ANY of these columns indexed?"; when the first
# argument is :all it answers "are ALL of them indexed?".
def index_exists(*indexes)
  indexed_columns = @base.indexes(@table_name).columnss.flatten
  any_mode = true
  if indexes[0] == :all
    indexes.shift
    any_mode = false
  end
  # In "any" mode the first hit decides; in "all" mode the first miss does.
  indexes.each do |index|
    return any_mode if index.to_s.in(indexed_columns) == any_mode
  end
  !any_mode
end
29
+
30
+ def initialize name, connection, map=nil
31
+ super name, connection
32
+ case map
33
+ when true; @map = []
34
+ when Array; @map = map
35
+ end
36
+ end
37
+
38
+ def map!
39
+ map_names = @map.firsts.to_ss
40
+ @base.columns(@table_name).names.each {|name|
41
+ name.in(map_names) ? @map.reject! {|_| _[0] == name} : remove(name)
42
+ }
43
+ @map.each {|col| column *col}
44
+ end
45
+
46
+ def column name, *args
47
+ to_be_called = !column_exists(name)
48
+ super if to_be_called
49
+ $log.debug {debug_str :column, to_be_called, !to_be_called, name, *args}
50
+ @map << [name, *args] if @map
51
+ end
52
+
53
# Redefine every typed column helper (t.string, t.integer, ...) so that the
# parent implementation runs only when column_exists reports that none of
# the named columns is present yet. When a map is being collected, each
# [name, type(, options)] entry is also recorded in @map.
%w{string text integer float decimal
   datetime timestamp time date binary boolean}.each {|column_type|
  define_method(column_type) {|*args|
    to_be_called = !column_exists(*args)
    # A bare `super` is rejected inside define_method ("implicit argument
    # passing of super from method defined by define_method() is not
    # supported"), so the arguments are forwarded explicitly.
    super(*args) if to_be_called
    $log.debug {debug_str column_type, to_be_called, !to_be_called, *args}
    if @map
      options = args.extract_options!
      args = args.xprod(column_type)
      args = args.xprod(options) if options
      @map.concat args
    end
  } }
66
+
67
+ def index name, *args
68
+ to_be_called = !index_exists(name)
69
+ super if to_be_called
70
+ $log.debug {debug_str :index, to_be_called, !to_be_called, name, *args}
71
+ end
72
+
73
+ def timestamps
74
+ to_be_called = !column_exists('created_at', 'updated_at')
75
+ super if to_be_called
76
+ $log.debug {debug_str :timestamps, to_be_called, !to_be_called}
77
+ @map.concat [[:created_at, :datetime], [:updated_at, :datetime]] if @map
78
+ end
79
+
80
+ def change *args
81
+ raise NotImplementedError, "don't use #change in declaration!"
82
+ end
83
+
84
+ def change_default *args
85
+ raise NotImplementedError, "don't use #change_default in declaration!"
86
+ end
87
+
88
+ def rename column_name, new_column_name
89
+ to_be_called = !column_exists(new_column_name)
90
+ super if to_be_called
91
+ $log.debug {debug_str :rename, to_be_called, !to_be_called, column_name, new_column_name}
92
+ end
93
+
94
+ def references *args
95
+ to_be_called = !column_exists(*args.map {|col| "#{col}_id"})
96
+ super if to_be_called
97
+ $log.debug {debug_str :references, to_be_called, !to_be_called, *args}
98
+ end
99
+ alias :belongs_to :references
100
+
101
+ def remove *args
102
+ to_be_called = column_exists :all, *args
103
+ super if to_be_called
104
+ $log.debug {debug_str :remove, to_be_called, to_be_called, *args}
105
+ end
106
+
107
+ def remove_references *args
108
+ to_be_called = column_exists(:all, *args.map {|col| "#{col}_id"})
109
+ super if to_be_called
110
+ $log.debug {debug_str :remove_references, to_be_called, to_be_called, *args}
111
+ end
112
+ alias :remove_belongs_to :remove_references
113
+
114
+ def remove_index options
115
+ indexes = options.is(Hash) ? options[:column] : options
116
+ raise ArgumentError, "can remove only default format named indexes in declaration!" if !indexes
117
+ to_be_called = index_exists :all, *indexes
118
+ super if to_be_called
119
+ $log.debug {debug_str :remove_index, to_be_called, to_be_called, options}
120
+ end
121
+
122
+ def remove_timestamps
123
+ to_be_called = column_exists 'created_at', 'updated_at'
124
+ super if to_be_called
125
+ $log.debug {debug_str :remove_timestamps, to_be_called, to_be_called}
126
+ end
127
+
128
+ end
129
+
130
+ end
131
+
132
+ class Base
133
+
134
# Declares the model's table from inside the model class.
#
# Creates the table through the connection when it does not exist or when
# options[:force] is set. Otherwise yields a VirtualTable to the block and,
# with options[:map], calls map! on it afterwards. Column information is
# reset at the end in every case.
def self.declare(name, options = {}, &block)
  self.table_name = name
  must_create = !table_exists? || options[:force]
  if must_create
    if options[:force]
      $log < "with options[:force] the `#{table_name}` table will have been recreated each time you load the #{model_name} model"
    end
    wants_custom_pk = options[:id] != false && options[:primary_key]
    self.primary_key = options[:primary_key] if wants_custom_pk
    $log.debug "connection.create_table(#{name}, #{options.inspect}) {}"
    connection.create_table(name, options, &block)
  elsif options[:map]
    mapped_table = ConnectionAdapters::VirtualTable.new(name, connection, options[:map])
    yield mapped_table
    mapped_table.map!
  else
    yield ConnectionAdapters::VirtualTable.new(name, connection)
  end
  reset_column_information
end
149
+
150
+ end
151
+
152
+ end
153
+
@@ -0,0 +1,63 @@
1
+ # encoding: utf-8
2
+ module Johnson
3
+ begin
4
+ require 'johnson'
5
+ rescue LoadError
6
+ Enabled = false
7
+ else
8
+ if VERSION <= "2.0.0" and RUBY_VERSION > "1.9"
9
+ Enabled = false
10
+ else Enabled = true
11
+ end
12
+ end
13
+ ### JavaScript interface DOM emulation ###
14
+
15
+ class Runtime
16
+ attr_accessor :thread_id
17
+ Runtime_is_set = lambda {|o| !o[:eval].b or ($JSRuntime and $JSRuntime.thread_id == $CarierThread.object_id)}
18
+ BROWSER_PATH = File.expand_path "#{File.dirname(__FILE__)}/browser"
19
+
20
+ # CarierThread breaks if Multi has no work && CarierThread
21
+ # is joined, so it won't last forever.
22
+ #
23
+ # Johnson is not thread safe =>
24
+ # Runtime created in this thread will become unusable after
25
+ # CarierThread dies.
26
+ #
27
+ # So we don't use Curl.wait until Carier has got the whole
28
+ # request for this Runtime.
29
+ def self.set_browser_for_curl(opts)
30
+ if !Runtime_is_set[opts]
31
+ if Curl.status
32
+ Curl.recall
33
+ $log.debug 'recalled'
34
+ end
35
+ if opts[:thread_safe].b
36
+ $JSRuntime = new_browser(opts[:jq])
37
+ $log.debug "#{$JSRuntime} initialized in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
38
+ else
39
+ $log.debug 'about to run carier'
40
+ Curl.execute {$JSRuntime = new_browser(opts[:jq])
41
+ $log.debug "#{$JSRuntime} initialized in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"}
42
+ sleep 0.01 until Runtime_is_set[opts]
43
+ end
44
+ end
45
+ end
46
+
47
# Builds a runtime with the browser emulation scripts from BROWSER_PATH
# evaluated in it (plus jquery when +jq+ is truthy) and an empty document.
def self.new_browser(jq=false)
  runtime = new
  scripts = %w{xmlw3cdom_1 xmlw3cdom_2 xmlsax env}
  scripts << 'jquery' if jq
  scripts.each do |script|
    source_path = "#{BROWSER_PATH}/#{script}.js"
    runtime.evaluate IO.read(source_path), source_path, 1
  end
  runtime.document = ''
  runtime
end
56
+
57
+ def document=(html)
58
+ evaluate "var document = new DOMDocument(#{html.to_doc.to_xhtml.inspect})"
59
+ end
60
+
61
+ end
62
+
63
+ end
data/lib/frame.rb ADDED
@@ -0,0 +1,766 @@
1
+ # encoding: utf-8
2
+ module HTTPAccessKit
3
+
4
+ # Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, ... ) ) =>
5
+ # Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, ...
6
+
7
+ class ZippingError < ArgumentError
8
+ def initialize debug, str="invalid use of :zip option, uri and body must be an arrays with the same size\n uri: %s(%s), body: %s(%s)"
9
+ super str%debug end
10
+ end
11
+
12
+ class TargetError < ArgumentError
13
+ def initialize msg="only static frame can use local paths"
14
+ super end
15
+ end
16
+
17
+ class ConfigError < ArgumentError
18
+ def initialize msg
19
+ super end
20
+ end
21
+
22
+ class Frame
23
+ __init__
24
+ attr_reader :loc, :static, :ss, :opts
25
+
26
+ def initialize *args
27
+ args << 10 unless args[-1].is Fixnum
28
+ @opts = {:eval => Johnson::Enabled, :redir => true, :cp => true, :result => Page}.merge!(args[-2].is(Hash) ? args[-2] : {})
29
+ if args[0].is String
30
+ uri = args[0]
31
+ 'http://' >> uri if uri !~ /^\w+:\/\//
32
+ @loc = uri.parse:uri
33
+ # be careful, if you set :static => false, frame will be unable to use implicit url
34
+ @static = @opts.fetch(:static, true).b
35
+ else
36
+ @loc = {}
37
+ @static = false
38
+ end
39
+ @ss = ScoutSquad *args
40
+ @pages = []
41
+ Curl.run unless Curl.status
42
+ end
43
+
44
# Points the frame and its scout squad at a new target.
#
# to     - target URL; "http://" is prepended when it carries no scheme.
# forced - passed through to the squad's #update.
#
# Returns the parsed location, which is also stored in @loc.
def retarget to, forced=nil
  # Require "scheme://" like Frame#initialize does: a bare "\w+:" test
  # mistook "host:port/path" for a URL that already has a scheme.
  to = 'http://' + to if to !~ /^\w+:\/\//
  @ss.update to, forced
  @loc = to.parse(:uri)
end
49
+
50
+ def target=to
51
+ retarget to
52
+ end
53
+
54
+ def next() @ss.next end
55
+ def rand() @ss.rand end
56
+ def each(&block) @ss.each &block end
57
+ def [](i) @ss[i] end
58
+
59
+ def inspect
60
+ "<#Frame @ #{@ss.untargeted ? 'no target' : @loc.root}: #{'scout'.x @ss.size}#{', static' if @static}, cookies #{@ss[0].cookieProc ? 'on' : 'off'}>"
61
+ end
62
+
63
+ # opts are :eval, :json, :hash, :wait, :proc_result, :save_result, :load_scripts,
64
+ # :zip, :thread_safe, :result, :stream, :raw + any opts for Scouts in one hash
65
+ def get *args, &callback
66
+ many, order, orders, with_opts = interpret_request *args
67
+ L.log({:many => many, :order => order, :orders => orders, :with_opts => with_opts})
68
+
69
+ if !Johnson::Enabled and with_opts[:eval]
70
+ L < "failed to use option :eval because Johnson is disabled"
71
+ with_opts.delete :eval
72
+ end
73
+ # JS Runtime is not thread-safe and must be created in curl thread
74
+ # if we aren't said explicitly about the opposite
75
+ Johnson::Runtime.set_browser_for_curl with_opts
76
+
77
+ if many then exec_many orders, with_opts, &callback
78
+ else exec_one order, with_opts, &callback end
79
+ end
80
+ alias :exec :get
81
+ alias :run :get
82
+
83
# Turns the arguments of #get into a request plan.
#
# The arguments are read as (body, multipart flag, uri) plus options; they
# are unpacked by get_opts with @opts (presumably as option defaults; the
# helper is defined elsewhere).
#
# Returns [many, order, orders, opts]:
#   many   - true when several requests have to be made
#   order  - a single [:loadGet, uri] or [:loadPost, body, mp, uri] order
#   orders - an array of such orders when +many+
#   opts   - the options with :wait, :eval, :load_scripts and :stream adjusted
#
# Raises TypeError for a POST body that is neither a hash nor a hash array,
# and ArgumentError when no order could be built.
def interpret_request(*args)
  body, mp, uri, opts = args.dup.get_opts [nil, false, nil], @opts
  L.log [body, mp, uri, opts]
  zip = opts.delete :zip
  many = order = orders = post = false
  # Default options set is for POST
  if (mp.is(String) or mp.kinda(Array)) and !(uri.is(String) or uri.kinda(Array))
    # if second arg is String, then that's uri
    uri, mp, post = mp.dup, false, true
    # L.debug "uri #{uri.inspect} has been passed as second argument instead of third"
  # But if we have only one argument actually passed
  # except for options hash, then believe it's GET
  elsif body.is(String) or body.kinda([String])
    if uri
      # The class is interpolated: `' ' + body.class` raised TypeError
      # because a Class cannot be concatenated to a String.
      L.debug "first parameter (#{body.inspect}) was implicitly taken as uri#{" #{body.class}" if body.kinda Array}, but last paramter is of type #{uri.class}, too"
    end
    uri = body.dup
  elsif !body
    uri = nil
  else
    uri = uri.dup if uri
    mp, post = !!mp, true
  end
  if post
    unless body.is(Hash) or body.kinda([Hash])
      raise TypeError, "body of post request must be a hash or hash array, params was\n(#{args.inspect[1..-2]})"
    end
    validate_zip uri, body if zip
    if zip or uri.kinda(Array) or body.kinda(Array)
      many = true
      if zip or uri.kinda(Array)
        validate_some uri
        orders = zip ? body.zip(uri) : uri.xprod(body, :inverse)
      else
        uri = validate uri
        orders = body.xprod uri
      end
      # every order becomes [:loadPost, body, mp, uri]
      orders.each {|o| o.unshift :loadPost and o.insert 2, mp}
    else
      uri = validate uri
      order = [:loadPost, body, mp, uri]
    end
  else
    if uri.kinda(Array)
      many = true
      validate_some uri
      orders = [:loadGet].xprod uri
    else
      uri = validate uri
      order = [:loadGet, uri]
    end
  end
  if !order.b and !orders.b
    raise ArgumentError, "failed to run blank request#{'s' if many}, params was\n(#{args.inspect[1..-2]})"
  else
    opts[:wait] = opts[:sync] if :sync.in opts
    # Without an explicit :wait, wait when :proc_result is present but
    # falsy, or when it is absent and :save_result is set.
    opts[:wait] = true if !:wait.in(opts) and
      (:proc_result.in(opts) ? !opts[:proc_result] : opts[:save_result])
    opts[:eval] = false if opts[:json] or opts[:hash] or opts[:raw]
    opts[:load_scripts] = self if opts[:load_scripts]
    opts[:stream] = true if opts[:raw]
    [many, order, orders, opts]
  end
end
146
+
147
# Fetches the bodies of +links+, going through Cache.
#
# A trailing :expire argument makes cached entries get revalidated with an
# If-Modified-Since request. Links whose path matches /ads|count|stats/ are
# skipped. Blocks on Curl.wait until all requests have completed.
#
# Returns the body for a single link (nil when it was skipped or could not
# be loaded), otherwise the bodies ordered by link position.
def get_cached(*links)
  res = []
  expire = links[-1] == :expire ? links.pop : false
  # A cache entry is either the body itself or an object pointing to a file.
  cached_body = lambda {|file| file.is(String) ? file : read(file.path)}
  links.parses(:uri).each_with_index {|uri, i|
    next if uri.path[/ads|count|stats/]
    file = Cache.load uri, !expire
    if file
      if expire
        @ss.next.loadGet(uri.href, :headers=>{'If-Modified-Since'=>file.date}) {|c|
          if c.res.code == 200
            res << [i, (data = c.res.body)]
            Cache.save uri, data, false
          else
            res << [i, cached_body[file]]
          end
        }
      else
        res << [i, cached_body[file]]
      end
    else
      @ss.next.loadGet(uri.href) {|c|
        if c.res.code == 200
          res << [i, (data = c.res.body)]
          Cache.save uri, data, !expire
        end
      }
    end
  }
  Curl.wait
  if links.size == 1
    # The only link may have been skipped or may have failed, which leaves
    # res empty; answer nil instead of crashing on nil[1].
    res[0] && res[0][1]
  else
    res.sort!.lasts
  end
end
178
+
179
+ def get_distr(uri, psize, threads, start=0, print_progress=$verbose)
180
+ raise ConfigError, "Insufficient Scouts in the Frame for distributed downloading" if @ss.size < 2
181
+ @print_progress, code, stop_download, @ss_reserve = print_progress, nil, false, []
182
+ (s = @ss.next).http.on_header {|h|
183
+ next h.size unless h[/Content-Length: (\d+)|HTTP\/1\.[01] (\d+)[^\r]+|^\s*$/]
184
+ if code = $2
185
+ if code != '200'
186
+ L << "#$& getting #{uri}; interrupting request."
187
+ s.http.on_header() # set default process
188
+ next 0
189
+ end
190
+ next h.size
191
+ end
192
+
193
+ s.http.on_header() # set default process
194
+ if !$1 # конец хедера, content-length отсутствует
195
+ L << "No Content-Length header; trying to load a whole #{uri} at once!"
196
+ s.loadGet {|c| yield c.res.body.size, 0, c.res.body}
197
+ next 0
198
+ end
199
+
200
+ len = $1.to_i - start
201
+ psize = configure_psize(len, psize, threads)
202
+ parts = (len/psize.to_f).ceil
203
+ setup_speedometer(uri, parts, len)
204
+ yield len, psize, :careful_dl if len > (@opts[:careful_dl] || 10.mb)
205
+
206
+ @ss_reserve = @ss[threads+1..-1]
207
+ @ss = @ss[0..threads]
208
+ (0...parts).each {|n|
209
+ break if stop_download
210
+
211
+ s = @ss.next
212
+ run_speedometer(s, len, n)
213
+ s.loadGet(uri, :headers => {
214
+ 'Range' => "bytes=#{start + n*psize}-#{start + (n+1)*psize - 1}"
215
+ }) {|c|
216
+ clear_speedometer(s)
217
+ if c.res.code/10 == 20
218
+ yield len, n*psize, c.res.body
219
+ else
220
+ L << "#{c.res} during get #{uri.inspect}; interrupting request."
221
+ stop_download = true
222
+ end
223
+ }
224
+ }
225
+ 0
226
+ }
227
+ s.raise_err = false
228
+ s.loadGet validate uri
229
+ ensure
230
+ @ss.concat @ss_reserve || []
231
+ end
232
+
233
+ def dl(uri, df=File.basename(uri.parse(:uri).path), psize=:auto, opts={})
234
+ dled = 0
235
+ lock = ''
236
+ callback = lambda {|len, pos, body|
237
+ if body != :careful_dl
238
+ begin
239
+ write(df, body, pos)
240
+ rescue => e
241
+ binding.start_interaction
242
+ raise
243
+ end
244
+ if (dled += body.size) == len
245
+ File.delete lock if File.file? lock
246
+ yield df if block_given?
247
+ end
248
+ else
249
+ lock = lock_file df, len, pos # filename, filesize, partsize
250
+ end
251
+ }
252
+ opts[:threads] ||= @ss.size-1
253
+ get_distr(uri, psize, opts[:threads], opts[:start].to_i, &callback)
254
+ Curl.wait unless block_given?
255
+ df
256
+ end
257
+
258
+ def simple_dl(uri, df=File.basename(uri.parse(:uri).path), opts={})
259
+ opts.reverse_merge! :psize => :auto, :threads => 1, :print_progress => $verbose
260
+ L << opts
261
+
262
+ @print_progress = opts[:print_progress]
263
+ unless len = opts[:len] || (map = read_mapfile(df) and map.len)
264
+ return @ss.next.loadHead(uri) {|c| $log << c
265
+ if len = c.res['Content-Length']
266
+ simple_dl(uri, df, opts.merge(:len => len.to_i))
267
+ else L.warn "Can't get file size, so it has no sence to download this way. Or maybe it's just an error. Check ObjectSpace.find(#{c.res.object_id}) out."
268
+ end
269
+ }
270
+ end
271
+
272
+ psize, parts = check_mapfile(df, opts)
273
+ return unless psize
274
+ L << [psize, parts]
275
+ setup_speedometer(uri, parts.size, len)
276
+
277
+ obtained uri do |uri|
278
+ if opts[:threads] == 1
279
+ start = opts[:start].to_i || (parts[0] && parts[0].begin) || 0
280
+ scout = opts[:scout] || @ss.next
281
+ $log << [uri, scout]
282
+ (loadget = lambda {|n|
283
+ run_speedometer(scout, len, n)
284
+ from = start + n*psize
285
+ to = start + (n+1)*psize - 1
286
+ scout.loadGet(uri, :headers => {'Range' => "bytes=#{from}-#{to}"}) {|c|
287
+ begin
288
+ $log << "writing #{df} from #{from}: #{c.res.body.inspect}"
289
+ write(df, c.res.body, from)
290
+ rescue => e
291
+ binding.start_interaction
292
+ raise
293
+ end
294
+ if write_mapfile(df, from, to)
295
+ clear_speedometer(scout)
296
+ L.warn "file completely dl'ed, but (n+1)*psize <= len: (#{n}+1)*#{psize} <= #{len}" if (n+1)*psize <= len
297
+ yield df if block_given?
298
+ elsif (n+1)*psize <= len
299
+ loadget[n+1]
300
+ end
301
+ }
302
+ })[0]
303
+ else
304
+ exec(uri, opts.merge(:raw => true, :ranges => parts)) {|c|
305
+ L << c.res
306
+ range = c.req.range
307
+ begin
308
+ write(df, c.res.body, range.begin)
309
+ rescue => e
310
+ binding.start_interaction
311
+ raise
312
+ end
313
+ if write_mapfile(df, range.begin, range.end)
314
+ @ss.each {|s| s.http.on_progress} if @print_progress
315
+ yield df if block_given?
316
+ end
317
+ }
318
+ end
319
+ end
320
+ end
321
+
322
+ def check_mapfile(df, opts={})
323
+ opts.reverse_merge! :psize => :auto, :threads => 1
324
+ map = read_mapfile df
325
+ if map
326
+ L << map
327
+ if map.rest.empty?
328
+ puts "#{df} is loaded"
329
+ $log << 'deleting mapfile'
330
+ File.delete df+'.map'
331
+ []
332
+ else
333
+ if opts[:len] and map.len != opts[:len]
334
+ raise "Incorrect file size for #{df}"
335
+ end
336
+ psize = configure_psize *opts.values_at(:len, :psize, :threads)
337
+ [psize, map.rest.div(psize)]
338
+ end
339
+ else
340
+ write_mapfile df, opts[:len]
341
+ psize = configure_psize *opts.values_at(:len, :psize, :threads)
342
+ $log << (0...opts[:len]).div(psize)
343
+ [psize, (0...opts[:len]).div(psize)]
344
+ end
345
+ end
346
+
347
# Reads the "<df>.map" progress file of a download.
#
# Returns nil when the file has no content. Otherwise returns a map with
# +rest+ (ranges still to be fetched) and, unless the header says the
# download is complete, +len+ (total size) and +parts+ (fetched ranges).
def read_mapfile(df)
  df += '.map'
  text = read df
  $log << "mapfile read: #{text}"
  if text.b
    # header: "<len>\0...<downloaded>\0...\n"
    text[/^(\d+)\0+(\d+)\0*\n/]
    map = {}
    $log << [$1,$2]
    if $1 and $1 == $2
      map.rest = []
    else
      map.len, *map.parts = text.chop/"\n"
      map.len = map.len.to_i
      # write_mapfile stores each part as "from..to". Splitting on '-' alone
      # never separated the two bounds, so every part collapsed to from..0.
      # Both separators are accepted here.
      map.parts.map! {|part|
        bounds = part.split(/\.\.|-/)
        bounds[0].to_i..bounds[1].to_i
      }
      $log << map.parts
      map.rest = (0...map.len) - XRange(*map.parts)
    end
    map
  end
end
367
+
368
# Updates the "<df>.map" progress file of a download.
#
# Called as write_mapfile(df, len) to start a map, or as
# write_mapfile(df, from, to) to record a downloaded byte range.
#
# Returns true when the recorded range completes the download (the
# downloaded counter reaches the length stored in the header), nil otherwise.
+ def write_mapfile(df, *args)
369
+ df += '.map'
370
+ map = ''
371
+ if args.size != 2
372
+ len = args.shift
# Header line: the length padded with NUL bytes to 22 characters.
# NOTE(review): it is only emitted when the map file already exists, which
# looks inverted for a freshly started download - confirm against `write`.
373
+ map << len.to_s.ljust(22, "\0") << "\n" if File.file? df
374
+ end
375
+ if args.any?
# $1 is the length from the header, $2 the downloaded counter.
376
+ read(df)[/^(\d+)\0+(\d+)\0*\n/]
377
+ $log << "mapfile read"
378
+ $log << [$1,$2]
# args are the inclusive bounds [from, to] of the part just written.
379
+ dled = $2.to_i + args[1] - args[0] + 1
380
+ return true if dled == $1.to_i
381
+ map << "#{args[0]}..#{args[1]}\n"
382
+ $log << 'writing mapfile'
# Stores the new counter at offset 11, i.e. in the second half of the header.
383
+ write(df, dled.to_s.ljust(11, "\0"), 11)
384
+ end
385
+ $log << [df, map]
386
+ $log << 'writing mapfile'
# presumably appends +map+ to the file; `write` is defined elsewhere - verify
387
+ write df, map
388
+ nil
389
+ end
390
+
391
# Resolves the size of one download part.
#
# len     - total number of bytes to download
# psize   - a Numeric (used as is, truncated to an integer), :auto
#           (len split between threads when len exceeds 100000, otherwise
#           the whole len) or :mb (1.mb)
# threads - number of parallel downloads, used by :auto only
#
# Raises ArgumentError for any other +psize+.
def configure_psize(len, psize, threads)
  return psize.to_i if Numeric === psize
  if :auto === psize
    if len > 100000
      return len/threads+1
    else
      return len
    end
  end
  return 1.mb if :mb === psize
  raise ArgumentError, "Incorrect value for part size #{psize}:#{psize.class}"
end
399
+
400
+ private
401
# Guards the :zip option: +uri+ and +body+ must both be arrays of the same
# size. Raises ZippingError otherwise, returns nil when they fit.
def validate_zip(uri, body)
  both_arrays = uri.kinda(Array) && body.kinda(Array)
  unless both_arrays
    raise ZippingError, [uri.class, nil, body.class, nil]
  end
  return if uri.size == body.size
  raise ZippingError, [uri.class, uri.size, body.class, body.size]
end
408
+
409
+ def validate(uri)
410
+ if uri
411
+ loc = uri.parse:uri
412
+ if loc.root and loc.root != @loc.root
413
+ raise TargetError, "failed to get #{uri} by static frame #{@loc.host}, you should first update it with new target" if @static
414
+ @loc.root = loc.root
415
+ uri
416
+ elsif !loc.root
417
+ raise TargetError if !@static
418
+ File.join @loc.root, uri
419
+ else uri
420
+ end
421
+ else
422
+ raise TargetError if !@static
423
+ @loc.href
424
+ end
425
+ end
426
+
427
+ def validate_some(uris)
428
+ uris.map! {|u| validate u}
429
+ end
430
+
431
# Runs a single order on the next scout of the squad.
#
# order - [method name, *arguments] for the scout; +opts+ is appended to it
#         before it is sent.
# opts  - request options; :result, :raw, :save_result, :proc_result, :wait
#         and :thread_safe are read here.
#
# Returns the page, or page.res when waiting and a result was requested
# through :save_result or :proc_result.
+ def exec_one(order, opts)
432
+ # must result in Page (default) or its subclass
433
+ page = opts[:result].new
434
+ # if no spare scouts can be found, squad simply waits for all callbacks to complete
435
+ s = @ss.next
436
+ #s.raise_err = true# Why is this here? :raise=>1 can be given to the frame when it is built
437
+ s.send(*(order << opts)) {|curl|
438
+ if opts[:raw]
# raw mode hands the curl handle to the block without page processing
439
+ yield curl
# the block runs only when #process returned a truthy value
440
+ elsif page.process(curl, opts) and block_given?
441
+ yres = yield page
442
+ if opts[:save_result] or :proc_result.in opts
443
+ page.res = yres
444
+ end
# a block result of :skip bypasses the :proc_result proc
445
+ if opts[:proc_result].is Proc and yres != :skip
446
+ opts[:proc_result].call yres
447
+ end
448
+ end
449
+ }
450
+ if opts[:wait]
451
+ opts[:thread_safe] ? $Carier.perform : Curl.wait
452
+ # why not reduce the boilerplate in services and return res right away?
453
+ (opts[:save_result] or :proc_result.in opts) ? page.res : page
454
+ else page
455
+ end
456
+ end
457
+
458
+ def exec_many(orders, with_opts, &callback)
459
+ w = with_opts.delete :wait
460
+ iterator = with_opts[:stream] ? :each : :map
461
+ if with_opts[:ranges]
462
+ if orders.size != with_opts[:ranges].size
463
+ raise ZippingError, [orders.size, with_opts[:ranges].size], "orders quantity (%s) is not equal ranges quantity (%s)"
464
+ end
465
+ pages = orders.zip(with_opts[:ranges]).send(iterator) {|order, range|
466
+ (with_opts[:headers] ||= {}).Range = "bytes=#{range.begin}-#{range.end}"
467
+ exec_one order, with_opts, &callback
468
+ }
469
+ else
470
+ pages = orders.send(iterator) {|order| exec_one order, with_opts, &callback }
471
+ end
472
+ with_opts[:thread_safe] ? $Carier.perform : Curl.wait if w
473
+ with_opts[:stream] || pages
474
+ end
475
+
476
+
477
+ def setup_speedometer(uri, parts, len)
478
+ return unless @print_progress
479
+ @progress = Array.new(parts, 0)
480
+ @stop_print, @speed, @sum, *@speedometer = false, '', 0, Time.now, 0
481
+ @str = "Downloading #{uri.gsub '%', '%%'} (#{len.bytes}) in %03s streams, %07s/s:"
482
+ @bs = "\b\r"*(@newlines = (uri.unpack('U*').size+len.bytes.size+42)/(ENV['COLUMNS'] || 80).to_i)
483
+ Thread.new {
484
+ until @stop_print
485
+ sleep 0.2
486
+ now = Time.now
487
+ if now > @speedometer[0] and @sum > @speedometer[1]
488
+ @speed.replace(((@sum - @speedometer[1])/(now - @speedometer[0])).to_i.bytes)
489
+ @speedometer.replace [now, @sum]
490
+ end
491
+ end
492
+ }
493
+ end
494
+
495
+ def run_speedometer(scout, len, n)
496
+ return unless @print_progress
497
+ scout.http.on_progress {|dl_need, dl_now, *ul|
498
+ if !@stop_print
499
+ @progress[n] = dl_now
500
+ percents = (@sum = @progress.sum)*100/len
501
+ print @str%[@progress.select_b.size, @speed]+"\n%%[#{'@'*percents}#{' '*(100-percents)}]\r\b\r"+@bs
502
+ if percents == 100
503
+ puts "\v"*@newlines
504
+ @stop_print = true
505
+ end
506
+ end
507
+ true
508
+ }
509
+ end
510
+
511
+ def clear_speedometer(scout)
512
+ return unless @print_progress
513
+ scout.http.on_progress
514
+ end
515
+
516
+ end
517
+
518
# One-shot download helper: starts Curl, builds a Frame with +threads+
# scouts and the given timeout, and downloads +uri+ into +df+.
#
# uri     - URL to download
# df      - destination file name (defaults to the basename of the URL path)
# threads - number of scouts and of download threads
# timeout - :timeout option handed to the Frame
# block   - forwarded to Frame#dl
def dl(uri, df=File.basename(uri.parse(:uri).path), threads=5, timeout=600, &block)
  Curl.run
  # Frame#dl takes an options hash as its fourth argument; passing the bare
  # Integer made `opts[:threads] ||= ...` fail on Integer#[].
  Frame({:timeout=>timeout}, threads).dl(uri, df, :auto, {:threads => threads}, &block)
end
522
+ module_function :dl
523
+
524
+
525
+
526
+ class Page
527
+ # for debug, just enable L#debug, don't write tons of chaotic log-lines
528
+ __init__
529
+ # res here is result of page processing made in frame context
530
+ attr_accessor :title, :res
531
+ attr_reader :html, :loc, :hash, :doc, :js
532
+ @@ignore = /google|_gat|tracker|adver/i
533
+
534
# Builds a page from HTML text, from a Curl::Easy handle or from a Scout.
#
# obj - HTML source, or a Curl::Easy / Scout whose response gets processed
# loc - location of the page: a Hash, or anything that responds to
#       parse(:uri)
# js  - JavaScript runtime; defaults to $JSRuntime or a new Johnson::Runtime
def initialize(obj='', loc=Hash.new(''), js=$JSRuntime||Johnson::Runtime.new)
  loc = loc.parse(:uri) if !loc.is Hash
  @js = js
  if obj.is Curl::Easy or obj.kinda Scout
    # A Scout carries its handle in #http; a Curl::Easy is the handle itself.
    # This used to read `html`, which is still nil at this point, so a
    # Curl::Easy argument always ended in a NoMethodError inside #process.
    c = obj.kinda(Scout) ? obj.http : obj
    @html = ''
    # just (c, loc) would pass to #process opts variable that returns '' on any key
    process(c, loc.b || {})
  else
    @html = obj
    @loc = loc
  end
end
547
+
548
+ def inspect
549
+ if !@hash.nil?
550
+ "<#FramePage (#{@hash ? @hash.inspect.size.bytes : 'failed to parse'}) #{@json ? 'json' : 'params hash'}>"
551
+ else
552
+ "<#FramePage #{@html.b ? "«#{@title}» (#{@html.size.bytes}" : '(empty'})#{' js enabled' if @js and @doc and @hash.nil?}>"
553
+ end
554
+ end
555
+
556
+ # Page subclasses may override #process.
556
+ # Frame runs the request callback only when #process returns a truthy value.
558
+ def process(c, opts={})
559
+ @loc = c.last_effective_url.parse:uri
560
+ L.debug "#{@loc.fullpath} -> #{c.res}"
561
+ if c.res.code == 200
562
+ body = c.res.body
563
+ if opts[:json]
564
+ @json = true
565
+ @hash = begin; body.from_json
566
+ rescue StandardError
567
+ false
568
+ end
569
+ if !@hash or @hash.is String
570
+ L.debug "failed to get json from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
571
+ @html = body; to_doc
572
+ @hash = false
573
+ end
574
+
575
+ elsif opts[:hash]
576
+ if body.inline
577
+ @hash = body.to_hash
578
+ else
579
+ @hash = false
580
+ L.debug "failed to get params hash from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
581
+ @html = body; to_doc
582
+ end
583
+
584
+ else
585
+ @html = body; to_doc
586
+ if opts[:eval]
587
+ load_scripts opts[:load_scripts]
588
+ eval_js
589
+ end
590
+ end
591
+ end
592
+ self
593
+ end
594
+
595
+ def eval_js(frame=nil)
596
+ eval_string "document.location = window.location = #{@loc.to_json};
597
+ document.URL = document.baseURI = document.documentURI = location.href;
598
+ document.domain = location.host;"
599
+ find("script").each {|n|
600
+ L.debug n.text.strip
601
+ if text = n.text.strip.b
602
+ js[:write_output] = ''
603
+ eval_string text
604
+ if res = js[:write_output].b then n.after res end
605
+ n.remove!
606
+ elsif frame and n.src
607
+ eval_string frame.get_cached expand_link n.src
608
+ end
609
+ }
610
+ end
611
+
612
+ def eval_string(str)
613
+ @js ||= Johnson::Runtime.new
614
+ L.debug "#{@js} evaluating in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
615
+ begin
616
+ @js.evaluate(str)
617
+ rescue Johnson::Error => e
618
+ L.warn e.message
619
+ L.debug {
620
+ if m = e.message.match(/(\w+) is undefined|([\w.]+) is not a function/)
621
+ L.clr.hl! str, /\b#{m[1] || m[2]}\b/
622
+ end
623
+ "\n\t#{str}"
624
+ }
625
+ end
626
+ end
627
+
628
+ def to_doc
629
+ @doc = @html.to_doc :forceutf
630
+ if !(@title = @doc.title.b)
631
+ @title = @loc.href
632
+ @doc.at('head').prepend XML::Node('title', @title) if @doc.at('head')
633
+ else
634
+ if @title.cyr? and UTF2ANSI[@title].size > 40
635
+ @title = ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/]]+'…'
636
+ elsif @title.size > 40
637
+ @title = @title[/.{1,30}\S*/]+'…'
638
+ end
639
+ end
640
+ @doc
641
+ end
642
+
643
+ def find(xp) (@doc || to_doc).find xp end
644
+
645
+ def at(xp) (@doc || to_doc).at xp end
646
+
647
+ def url() @loc.href end
648
+ alias :href :url
649
+
650
+ def get_srcs(links='img')
651
+ begin
652
+ links = find(links).map {|e| e.src} if links.is String
653
+ rescue XML::Error
654
+ links = [links]
655
+ end
656
+ links.map {|link| expand_link link}.uniq
657
+ end
658
+
659
+ def get_src(link='img')
660
+ begin
661
+ link = at(link) && at(link).src if link.is String
662
+ rescue XML::Error; nil
663
+ end
664
+ expand_link link if link
665
+ end
666
+
667
+ def get_links(links='a')
668
+ begin
669
+ links = find(links).map {|e| e.href}.b || find(links+'//a').map {|e| e.href} if links.is String
670
+ rescue XML::Error
671
+ links = [links]
672
+ end
673
+ links.map {|link| expand_link link}.uniq
674
+ end
675
+
676
# Expanded href of the first node matching +link+.
#
# link - an xpath (String) or an already extracted link value. For an xpath
#        the node's own href is used, falling back to the href of the first
#        <a> nested in it. When the lookup raises XML::Error the string
#        itself is expanded as a link.
#
# Returns the expanded link, or nil when nothing was found.
def get_link(link='a')
  begin
    if link.is String
      node = at(link)
      # The fallback lookup may find nothing: guard it instead of calling
      # #href on nil, which raised a NoMethodError the rescue does not catch.
      link = node && (node.href || ((nested = at(link+'//a')) && nested.href))
    end
  rescue XML::Error
    nil
  end
  expand_link link if link
end
683
+ alias :get_hrefs :get_links
684
+ alias :links :get_links
685
+ alias :get_href :get_link
686
+ alias :link :get_link
687
+
688
+ def expand_link(link)
689
+ case link
690
+ when /^\w+:\/\// then link
691
+ when /^\/\// then @loc.protocol+link
692
+ when /^\// then @loc.root+link
693
+ else File.join((@loc.path.b ? File.dirname(@loc.path) : @loc.root), link)
694
+ end
695
+ end
696
+
697
# Builds the argument list for Frame#exec that submits a form of this page.
#
# form - xpath of the form, :self for the form whose action is this page's
#        path, or a form node
# hash - values merged over the form's own inputs
# opts - request options appended to the result
#
# Returns [hash, multipart match, action, opts] for a POST form and
# [action with query string, opts] otherwise.
# Raises XML::Error when no form matches the xpath.
def form(form='form', hash={}, opts={})
  form = "[action=#{@loc.path.inspect}]" if form == :self
  if form.is String
    form_node = at form
    raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node
  else
    form_node = form
  end
  hash = form_node.inputs_all.merge!(hash)
  action = expand_link(form_node.action || @loc.path)
  # A form without a method attribute is submitted with GET; to_s keeps the
  # missing attribute from raising NoMethodError on nil.
  if form_node['method'].to_s.downcase == 'post'
    [hash, form_node.enctype =~ /multipart/, action, opts]
  else
    action = "#{action}#{action['?'] ? '&' : '?'}#{hash.urlencode}" if hash.b
    [action, opts]
  end
end
713
+
714
# Submits a form of this page through +frame+.
#
# form     - form selector or node, as accepted by #form
# frame    - Frame that performs the request
# hash     - values merged over the form's inputs
# opts     - request options; the page location is added as Referer
# callback - forwarded to frame.exec
#
# A static frame is retargeted to the form's action for the duration of
# the request and then forced back to its previous target.
#
# Returns whatever frame.exec returns.
def submit(form, frame, hash={}, opts={}, &callback)
  # Request headers travel under :headers everywhere else in this file;
  # the singular :header key is not used by any other visible call site.
  (opts[:headers] ||= {}).Referer ||= @loc.href if @loc
  query = form(form, hash, opts)

  # POST queries hold the action at index 2, GET queries at index 0.
  curr_target, new_target = frame.loc.href, (query[2] || query[0])
  if need_retargeting = (frame.static && curr_target != new_target)
    frame.retarget new_target
  end
  page = frame.exec(*query, &callback)
  frame.retarget curr_target, :forced if need_retargeting
  page
end
726
+
727
# Loads every external script of the page through +frame+ and evaluates it.
# Does nothing (and returns +frame+) when +frame+ is nil or false.
def load_scripts(frame)
  return frame unless frame
  # For a single link get_cached answers with the bare body rather than a
  # list; Array() normalizes that (and nil) so #each is always valid.
  Array(frame.get_cached(*get_srcs("script[src]"))).each {|js| eval_string js}
end
730
+
731
+ end
732
+
733
+ # using reprocessing of page in case of non-200 response:
734
+ # page_class = ReloadablePage do
735
+ # @res and @res.code != 200
736
+ # end
737
+ def ReloadablePage(&reload_condition)
738
+ rp = Class.new Page
739
+ rp.send :define_method, :process do |curl, opts|
740
+ super(curl, opts || {})
741
+ if curl.instance_eval &reload_condition
742
+ curl.retry!
743
+ nil # in case of reload_condition.call super's callback will not proceed
744
+ else self
745
+ end
746
+ end
747
+ rp
748
+ end
749
+
750
+ end
751
+
752
+
753
+
754
+
755
+
756
+
757
+
758
+
759
+
760
+
761
+
762
+
763
+
764
+
765
+
766
+