rhack 0.4.1 → 1.0.0.rc4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. data/.gitignore +22 -0
  2. data/Gemfile +2 -5
  3. data/LICENSE +19 -15
  4. data/README.md +66 -26
  5. data/Rakefile +42 -31
  6. data/config/cacert.pem +3895 -0
  7. data/config/rhack.yml.template +40 -0
  8. data/ext/curb-original/curb_config.h +3 -0
  9. data/ext/curb-original/curb_easy.c +3 -54
  10. data/ext/curb-original/curb_multi.c +69 -140
  11. data/ext/curb/curb_multi.c +1 -1
  12. data/lib/rhack.rb +82 -12
  13. data/lib/rhack/cookie.rb +49 -0
  14. data/lib/rhack/curl.rb +6 -0
  15. data/lib/{extensions/curb.rb → rhack/curl/easy.rb} +26 -48
  16. data/lib/rhack/curl/global.rb +175 -0
  17. data/lib/rhack/curl/itt.rb +11 -0
  18. data/lib/rhack/curl/multi.rb +37 -0
  19. data/lib/rhack/curl/post_field.rb +20 -0
  20. data/lib/rhack/curl/response.rb +91 -0
  21. data/lib/rhack/dl.rb +308 -0
  22. data/lib/rhack/frame.rb +316 -0
  23. data/lib/{extensions → rhack/js}/browser/env.js +0 -0
  24. data/lib/{extensions → rhack/js}/browser/jquery.js +0 -0
  25. data/lib/{extensions → rhack/js}/browser/xmlsax.js +0 -0
  26. data/lib/{extensions → rhack/js}/browser/xmlw3cdom_1.js +0 -0
  27. data/lib/{extensions → rhack/js}/browser/xmlw3cdom_2.js +0 -0
  28. data/lib/rhack/js/johnson.rb +71 -0
  29. data/lib/rhack/page.rb +263 -0
  30. data/lib/rhack/proxy.rb +3 -0
  31. data/lib/rhack/proxy/checker.rb +1 -1
  32. data/lib/rhack/scout.rb +342 -0
  33. data/lib/rhack/scout_squad.rb +98 -0
  34. data/lib/rhack/services.rb +1 -464
  35. data/lib/rhack/services/base.rb +59 -0
  36. data/lib/rhack/services/examples.rb +423 -0
  37. data/lib/rhack/version.rb +3 -0
  38. data/lib/rhack_in.rb +3 -2
  39. data/rhack.gemspec +28 -0
  40. metadata +104 -85
  41. data/.gemtest +0 -0
  42. data/Gemfile.lock +0 -23
  43. data/Manifest.txt +0 -60
  44. data/ext/curb/Makefile +0 -217
  45. data/lib/cache.rb +0 -44
  46. data/lib/curl-global.rb +0 -164
  47. data/lib/extensions/declarative.rb +0 -153
  48. data/lib/extensions/johnson.rb +0 -63
  49. data/lib/frame.rb +0 -848
  50. data/lib/init.rb +0 -49
  51. data/lib/rhack.yml.template +0 -19
  52. data/lib/scout.rb +0 -589
  53. data/lib/words.rb +0 -25
data/lib/cache.rb DELETED
@@ -1,44 +0,0 @@
1
- # encoding: utf-8
2
- module HTTPAccessKit
3
-
4
- class Cache < ActiveRecord::Base
5
- declare CacheTable do |t|
6
- t.integer :url_hash
7
- t.string :url
8
- t.string :path
9
- t.string :date
10
- t.string :ext
11
- t.timestamps
12
- end if DB
13
- RAMCache = {}
14
-
15
- def self.clean(time=7.days)
16
- destroy_all("created_at < '#{time.ago}'").each {|c|
17
- FileUtils.remove c.path if c.path and File.file?(c.path)}
18
- end
19
- CacheTTL and clean CacheTTL
20
-
21
- def self.save(url, data, cache_data=true)
22
- new(url, data).save
23
- RAMCache[url.href] = data if cache_data
24
- end
25
-
26
- def self.load(url, cache_data=true)
27
- if data = RAMCache[url.href]
28
- data
29
- elsif file = first(:select => 'date, path', :conditions => {:url_hash => url.href.hash})
30
- RAMCache[url.href] = read(file.path) if cache_data
31
- file
32
- end
33
- end
34
-
35
- def initialize(url, data)
36
- t = Time.now
37
- path = "#{CacheDir}/#{t.to_i}-#{File.split(url.path)[1]}"
38
- rw path, data
39
- super :url => url.href, :url_hash => url.href.hash, :date => t.httpdate, :path => path, :ext => url.ext
40
- end
41
-
42
- end
43
-
44
- end
data/lib/curl-global.rb DELETED
@@ -1,164 +0,0 @@
1
- # encoding: utf-8
2
- module Curl
3
-
4
- def execute(unless_allready=false)
5
- if unless_allready and Curl.status
6
- return L.log "Non-nil status! Avoid executing"
7
- end
8
- if $CarierThread and s = $CarierThread.status
9
- L.log "Carier thread allready started and has status #{s}"
10
- else
11
- if s = Curl.status(false) then L.warn s end
12
- L.log($CarierThread ? "Resetting Carier thread" : "Setting Carier thread up")
13
- $CarierThread = Thread.new {
14
- error = nil
15
- begin
16
- # "why Thread#value is raising since it never raised before?"
17
- yield if block_given?
18
- rescue => error
19
- nil
20
- end
21
- loop {
22
- begin
23
- # with true argument (idle) it would break only if no requests to perform
24
- break unless $Carier.perform true
25
- L.log "Nothing to perform; idling..."
26
- rescue => error
27
- break
28
- # but ruby mystically crashes if next sequence occur:
29
- # Multi performs and can't see any requests so entering idle mode
30
- # we add some requests and multi load them
31
- # one of requests' callbacks raises error in *main* thread
32
- # so we can't allow any raises here, instead, keep them in 'wait' section
33
- end
34
- } unless error
35
- error
36
- }
37
- # until main thread has sleep a bit, $CarierThread will have status "run",
38
- # no matter whether it's idling or performing requests
39
- sleep 0.001
40
- end
41
- end
42
- alias :run :execute
43
- module_function :execute, :run
44
-
45
- def wait
46
- if $CarierThread and $CarierThread.status
47
- unless within = Thread.current == $CarierThread
48
- # We can't set `perform' timeout lesser than 1 second in the curl binding
49
- # because in that case thread status would always be "run"
50
- # so here we wait for exactly 1 sec
51
- sleep 1
52
- end
53
- # Also, if thread do Kernel.sleep, it would skip Curl.wait here
54
- if !$Carier.sheduled and ($CarierThread.status == 'sleep' or within && $Carier.reqs.empty?)
55
- L.log "No shedule to wait"
56
- else
57
- this_thread = within ? 'it\'s thread' : Thread.main == Thread.current ? 'main thread' : 'thread '+Thread.current.object_id
58
- L.log "Waiting for Carier to complete in #{this_thread}"
59
- begin
60
- L.log { "Trying to change $CarierThreadIsJoined #{$CarierThreadIsJoined} -> true from #{this_thread}" }
61
- if within
62
- L.log "calling this from one of callbacks to wait for the rest to complete"
63
- begin
64
- $Carier.perform
65
- rescue RuntimeError => e
66
- L.warn [e, e.message]
67
- L.info "$Carier $Carier.sheduled $CarierThread $CarierThread.status", binding
68
- L.warn "Failed to run Multi#perform: nothing to perform"
69
- end
70
- else
71
- $CarierThreadIsJoined = true
72
- $CarierThread.join
73
- end
74
- rescue (defined?(IRB) ? IRB::Abort : NilClass)
75
- recall!
76
- L.info "Carier thread recalled by keyboard"
77
- ensure
78
- L.log "trying to change $CarierThreadIsJoined #{$CarierThreadIsJoined} -> false from #{this_thread}"
79
- if !within
80
- $CarierThreadIsJoined = false
81
- # using Curl#execute from different threads may cause problems here when you don't control input,
82
- # for example, in a daemonized ruby process
83
- # just do not get $CarierThread joined from non-main thread
84
- if $CarierThread and e = $CarierThread.value
85
- # this will raise thread-safely in main thread
86
- # in case of unrescued error in CarierThread
87
- L.log(([e.message]+RMTools.format_trace(e.backtrace))*"\n")
88
- recall!
89
- raise e
90
- end
91
- execute
92
- end
93
- end
94
- end
95
- else
96
- L < "No thread to wait. I guess I should create one"
97
- execute
98
- wait
99
- end
100
- end
101
- module_function :wait
102
-
103
- def recall
104
- L.debug caller
105
- if $CarierThread
106
- L.log "Recalling Carier thread"
107
- $CarierThread.kill
108
- sleep 1
109
- else
110
- L.log "No thread to recall"
111
- end
112
- end
113
- alias :stop :recall
114
-
115
- def recall!
116
- if $CarierThread
117
- L.warn "Recalling thread and resetting Carier!!!"
118
- $CarierThread.kill
119
- $CarierThread = nil
120
- $Carier.reset
121
- else
122
- L.log "No thread to recall!"
123
- end
124
- end
125
- alias :stop! :recall!
126
- module_function :recall!, :stop!, :recall, :stop
127
-
128
- def reset
129
- recall
130
- execute
131
- end
132
- alias :reload :reset
133
-
134
- def reset!
135
- recall!
136
- execute
137
- end
138
- alias :reload! :reset!
139
- module_function :reset!, :reset, :reload!, :reload
140
-
141
- def status(raise_e=true)
142
- if $CarierThread and (s = $CarierThread.status)
143
- L.log "Carier thread responding with status #{s}"
144
- s
145
- elsif $CarierThread
146
- if e = $CarierThread.value
147
- if raise_e
148
- recall!
149
- raise e
150
- else
151
- L.log "Carier Thread returned #{e.inspect}"
152
- e
153
- end
154
- else
155
- L.log "Carier Thread is exited without error"
156
- end
157
- else
158
- L.log "There is no Carier Thread atm"
159
- end
160
- end
161
- alias :st :status
162
- module_function :status, :st
163
-
164
- end
data/lib/extensions/declarative.rb DELETED
@@ -1,153 +0,0 @@
1
- # encoding: utf-8
2
- module ActiveRecord
3
-
4
- module ConnectionAdapters
5
- AbstractAdapter
6
-
7
- class VirtualTable < Table
8
-
9
- def debug_str meth, called, exist, *args
10
- "Table.#{meth}(#{args.inspects*', '}) was#{' NOT' if !called} called due to #{'in' if !exist}existance"
11
- end
12
-
13
- def column_exists *args
14
- column_names = @base.columns(@table_name).names
15
- options = args.extract_options!
16
- names = args.dup
17
- args << options
18
- _or_ = (names[0] == :all) ? !names.shift : true
19
- names.each {|name| return _or_ if name.to_s.in(column_names) == _or_}
20
- !_or_
21
- end
22
-
23
- def index_exists *indexes
24
- column_indexes = @base.indexes(@table_name).columnss.flatten
25
- _or_ = (indexes[0] == :all) ? !indexes.shift : true
26
- indexes.each {|index| return _or_ if index.to_s.in(column_indexes) == _or_}
27
- !_or_
28
- end
29
-
30
- def initialize name, connection, map=nil
31
- super name, connection
32
- case map
33
- when true; @map = []
34
- when Array; @map = map
35
- end
36
- end
37
-
38
- def map!
39
- map_names = @map.firsts.to_ss
40
- @base.columns(@table_name).names.each {|name|
41
- name.in(map_names) ? @map.reject! {|_| _[0] == name} : remove(name)
42
- }
43
- @map.each {|col| column *col}
44
- end
45
-
46
- def column name, *args
47
- to_be_called = !column_exists(name)
48
- super if to_be_called
49
- $log.debug {debug_str :column, to_be_called, !to_be_called, name, *args}
50
- @map << [name, *args] if @map
51
- end
52
-
53
- %w{string text integer float decimal
54
- datetime timestamp time date binary boolean}.each {|column_type|
55
- define_method(column_type) {|*args|
56
- to_be_called = !column_exists(*args)
57
- super if to_be_called
58
- $log.debug {debug_str column_type, to_be_called, !to_be_called, *args}
59
- if @map
60
- options = args.extract_options!
61
- args = args.xprod(column_type)
62
- args = args.xprod(options) if options
63
- @map.concat args
64
- end
65
- } }
66
-
67
- def index name, *args
68
- to_be_called = !index_exists(name)
69
- super if to_be_called
70
- $log.debug {debug_str :index, to_be_called, !to_be_called, name, *args}
71
- end
72
-
73
- def timestamps
74
- to_be_called = !column_exists('created_at', 'updated_at')
75
- super if to_be_called
76
- $log.debug {debug_str :timestamps, to_be_called, !to_be_called}
77
- @map.concat [[:created_at, :datetime], [:updated_at, :datetime]] if @map
78
- end
79
-
80
- def change *args
81
- raise NotImplementedError, "don't use #change in declaration!"
82
- end
83
-
84
- def change_default *args
85
- raise NotImplementedError, "don't use #change_default in declaration!"
86
- end
87
-
88
- def rename column_name, new_column_name
89
- to_be_called = !column_exists(new_column_name)
90
- super if to_be_called
91
- $log.debug {debug_str :rename, to_be_called, !to_be_called, column_name, new_column_name}
92
- end
93
-
94
- def references *args
95
- to_be_called = !column_exists(*args.map {|col| "#{col}_id"})
96
- super if to_be_called
97
- $log.debug {debug_str :references, to_be_called, !to_be_called, *args}
98
- end
99
- alias :belongs_to :references
100
-
101
- def remove *args
102
- to_be_called = column_exists :all, *args
103
- super if to_be_called
104
- $log.debug {debug_str :remove, to_be_called, to_be_called, *args}
105
- end
106
-
107
- def remove_references *args
108
- to_be_called = column_exists(:all, *args.map {|col| "#{col}_id"})
109
- super if to_be_called
110
- $log.debug {debug_str :remove_references, to_be_called, to_be_called, *args}
111
- end
112
- alias :remove_belongs_to :remove_references
113
-
114
- def remove_index options
115
- indexes = options.is(Hash) ? options[:column] : options
116
- raise ArgumentError, "can remove only default format named indexes in declaration!" if !indexes
117
- to_be_called = index_exists :all, *indexes
118
- super if to_be_called
119
- $log.debug {debug_str :remove_index, to_be_called, to_be_called, options}
120
- end
121
-
122
- def remove_timestamps
123
- to_be_called = column_exists 'created_at', 'updated_at'
124
- super if to_be_called
125
- $log.debug {debug_str :remove_timestamps, to_be_called, to_be_called}
126
- end
127
-
128
- end
129
-
130
- end
131
-
132
- class Base
133
-
134
- def self.declare name, options={}, &block
135
- self.table_name = name
136
- if !table_exists? or options[:force]
137
- $log < "with options[:force] the `#{table_name}` table will have been recreated each time you load the #{model_name} model" if options[:force]
138
- self.primary_key = options[:primary_key] if options[:id] != false and options[:primary_key]
139
- $log.debug "connection.create_table(#{name}, #{options.inspect}) {}"
140
- connection.create_table(name, options, &block)
141
- elsif options[:map]
142
- table = ConnectionAdapters::VirtualTable.new(name, connection, options[:map])
143
- yield table
144
- table.map!
145
- else yield ConnectionAdapters::VirtualTable.new(name, connection)
146
- end
147
- reset_column_information
148
- end
149
-
150
- end
151
-
152
- end
153
-
data/lib/extensions/johnson.rb DELETED
@@ -1,63 +0,0 @@
1
- # encoding: utf-8
2
- module Johnson
3
- begin
4
- require 'johnson'
5
- rescue LoadError
6
- Enabled = false
7
- else
8
- if VERSION <= "2.0.0" and RUBY_VERSION > "1.9"
9
- Enabled = false
10
- else Enabled = true
11
- end
12
- end
13
- ### JavaScript interface DOM emulation ###
14
-
15
- class Runtime
16
- attr_accessor :thread_id
17
- Runtime_is_set = lambda {|o| !o[:eval].b or ($JSRuntime and $JSRuntime.thread_id == $CarierThread.object_id)}
18
- BROWSER_PATH = File.expand_path "#{File.dirname(__FILE__)}/browser"
19
-
20
- # CarierThread breaks if Multi has no work && CarierThread
21
- # is joined so itwon't last forever.
22
- #
23
- # Johnson is not thread safe =>
24
- # Runtime created in this thread will become unusable after
25
- # CarierThread dies.
26
- #
27
- # So we don't use Curl.wait until Carier haven't got whole
28
- # request for this Runtime.
29
- def self.set_browser_for_curl(opts)
30
- if !Runtime_is_set[opts]
31
- if Curl.status
32
- Curl.recall
33
- $log.debug 'recalled'
34
- end
35
- if opts[:thread_safe].b
36
- $JSRuntime = new_browser(opts[:jq])
37
- $log.debug "#{$JSRuntime} initialized in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
38
- else
39
- $log.debug 'about to run carier'
40
- Curl.execute {$JSRuntime = new_browser(opts[:jq])
41
- $log.debug "#{$JSRuntime} initialized in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"}
42
- sleep 0.01 until Runtime_is_set[opts]
43
- end
44
- end
45
- end
46
-
47
- def self.new_browser(jq=false)
48
- rt = new
49
- %w{xmlw3cdom_1 xmlw3cdom_2 xmlsax env}.concat(jq ? ['jquery'] : []).each {|f|
50
- path = "#{BROWSER_PATH}/#{f}.js"
51
- rt.evaluate IO.read(path), path, 1
52
- }
53
- rt.document = ''
54
- rt
55
- end
56
-
57
- def document=(html)
58
- evaluate "var document = new DOMDocument(#{html.to_doc.to_xhtml.inspect})"
59
- end
60
-
61
- end
62
-
63
- end
data/lib/frame.rb DELETED
@@ -1,848 +0,0 @@
1
- # encoding: utf-8
2
- module HTTPAccessKit
3
-
4
- # Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, ... ) ) =>
5
- # Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, ...
6
-
7
- class ZippingError < ArgumentError
8
- def initialize debug, str="invalid use of :zip option, uri and body must be an arrays with the same size\n uri: %s(%s), body: %s(%s)"
9
- super str%debug end
10
- end
11
-
12
- class TargetError < ArgumentError
13
- def initialize msg="only static frame can use local paths"
14
- super end
15
- end
16
-
17
- class ConfigError < ArgumentError
18
- def initialize msg
19
- super end
20
- end
21
-
22
- class Frame
23
- __init__
24
- attr_reader :loc, :static, :ss, :opts, :use_cache, :write_to
25
- @@cache = {}
26
-
27
- def initialize *args
28
- args << 10 unless args[-1].is Fixnum
29
- args.insert -2, {} unless args[-2].is Hash
30
- @opts = {:eval => Johnson::Enabled, :redir => true, :cp => true, :result => Page}.merge!(args[-2].kinda(Hash) ? args[-2] : {})
31
- args[-2] = @opts
32
- if args[0].is String
33
- uri = args[0]
34
- 'http://' >> uri if uri !~ /^\w+:\/\//
35
- @loc = uri.parse:uri
36
- # be careful, if you set :static => false, frame will be unable to use implicit url
37
- @static = @opts.fetch(:static, true)
38
- else
39
- @loc = {}
40
- @static = false
41
- end
42
- @ss = ScoutSquad *args
43
- Curl.run :unless_allready
44
- end
45
-
46
- def retarget to, forced=nil
47
- to = 'http://' + to if to !~ /^\w+:/
48
- @ss.update to, forced
49
- @loc = to.parse:uri
50
- end
51
- alias :target= :retarget
52
-
53
- def next() @ss.next end
54
- def rand() @ss.rand end
55
- def each(&block) @ss.each &block end
56
- def [](i) @ss[i] end
57
-
58
- def copy_cookies! i=0
59
- @ss.each {|s| s.cookies.replace @ss[i].cookies}
60
- end
61
-
62
- def use_cache! opts={}
63
- if opts == false
64
- @use_cache = false
65
- else
66
- @@cache = opts[:pages].kinda(Hash) ? opts[:pages] : opts[:pages].map_hash {|p| [p.href, p]} if opts[:pages]
67
- #@write_to = opts[:write_to] if :write_to.in opts
68
- @use_cache = true
69
- end
70
- end
71
-
72
- def drop_cache! use=nil
73
- @@cache.clear
74
- GC.start
75
- @use_cache = use if use.in [true, false]
76
- end
77
-
78
- def inspect
79
- "<#Frame @ #{@ss.untargeted ? 'no target' : @loc.root}: #{'scout'.x @ss.size}#{', static'+(' => '+@static.protocol if @static.is(Hash)) if @static}, cookies #{@ss[0].cookieProc ? 'on' : 'off'}>"
80
- end
81
-
82
- # opts are :eval, :json, :hash, :wait, :proc_result, :save_result, :load_scripts,
83
- # :zip, :thread_safe, :result, :stream, :raw, :xhr + any opts for Scouts in one hash
84
- def exec *args, &callback
85
- many, order, orders, with_opts = interpret_request *args
86
- L.log({:many => many, :order => order, :orders => orders, :with_opts => with_opts})
87
-
88
- if !Johnson::Enabled and with_opts[:eval]
89
- L < "failed to use option :eval because Johnson is disabled"
90
- with_opts.delete :eval
91
- end
92
- # JS Runtime is not thread-safe and must be created in curl thread
93
- # if we aren't said explicitly about the opposite
94
- Johnson::Runtime.set_browser_for_curl with_opts
95
-
96
- if many then exec_many orders, with_opts, &callback
97
- else exec_one order, with_opts, &callback end
98
- end
99
- alias :get :exec
100
- alias :run :get
101
-
102
- def interpret_request(*args)
103
- body, mp, uri, opts = args.dup.get_opts [nil, false, nil], @opts
104
- L.log [body, mp, uri, opts]
105
- zip = opts.delete :zip
106
- many = order = orders = post = false
107
- # Default options set is for POST
108
- if mp.is String or mp.kinda Array and !(uri.is String or uri.kinda Array)
109
- # if second arg is String, then that's uri
110
- uri, mp, post = mp.dup, false, true
111
- # L.debug "uri #{uri.inspect} has been passed as second argument instead of third"
112
- # But if we have only one argument actually passed
113
- # except for options hash, then believe it's GET
114
- elsif body.is String or body.kinda [String]
115
- L.debug "first parameter (#{body.inspect}) was implicitly taken as uri#{' '+body.class if body.kinda Array}, but last paramter is of type #{uri.class}, too" if uri
116
- uri = body.dup
117
- elsif !body then uri = nil
118
- else
119
- uri = uri.dup if uri
120
- mp, post = !!mp, true
121
- end
122
- if post
123
- unless body.is Hash or body.kinda [Hash]
124
- raise TypeError, "body of post request must be a hash or hash array, params was
125
- (#{args.inspect[1..-2]})"
126
- end
127
- validate_zip uri, body if zip
128
- if zip or uri.kinda Array or body.kinda Array
129
- many = true
130
- if zip or uri.kinda Array
131
- validate_some uri
132
- orders = zip ? body.zip(uri) : uri.xprod(body, :inverse)
133
- else
134
- uri = validate uri
135
- orders = body.xprod uri
136
- end
137
- orders.each {|o| o.unshift :loadPost and o.insert 2, mp}
138
- else
139
- uri = validate uri
140
- order = [:loadPost, body, mp, uri]
141
- end
142
- else
143
- if uri.kinda Array
144
- many = true
145
- validate_some uri
146
- orders = [:loadGet].xprod uri
147
- else
148
- uri = validate uri
149
- order = [:loadGet, uri]
150
- end
151
- end
152
- if !order.b and !orders.b
153
- raise ArgumentError, "failed to run blank request#{'s' if many}, params was
154
- (#{args.inspect[1..-2]})"
155
- end
156
-
157
- opts[:wait] = opts[:sync] if :sync.in opts
158
- opts[:wait] = true if !:wait.in(opts) and
159
- :proc_result.in(opts) ? !opts[:proc_result] : opts[:save_result]
160
- opts[:eval] = false if opts[:json] or opts[:hash] or opts[:raw]
161
- opts[:load_scripts] = self if opts[:load_scripts]
162
- opts[:stream] = true if opts[:raw]
163
- (opts[:headers] ||= {})['X-Requested-With'] = 'XMLHttpRequest' if opts[:xhr]
164
- [many, order, orders, opts]
165
- end
166
-
167
- def get_cached(*links)
168
- res = []
169
- expire = links[-1] == :expire ? links.pop : false
170
- links.parses(:uri).each_with_index {|uri, i|
171
- next if uri.path[/ads|count|stats/]
172
- file = Cache.load uri, !expire
173
- if file
174
- if expire
175
- @ss.next.loadGet(uri.href, :headers=>{'If-Modified-Since'=>file.date}) {|c|
176
- if c.res.code == 200
177
- res << [i, (data = c.res.body)]
178
- Cache.save uri, data, false
179
- else
180
- res << [i, file.is(String) ? file : read(file.path)]
181
- end
182
- }
183
- else
184
- res << [i, file.is(String) ? file : read(file.path)]
185
- end
186
- else
187
- @ss.next.loadGet(uri.href) {|c|
188
- if c.res.code == 200
189
- res << [i, (data = c.res.body)]
190
- Cache.save uri, data, !expire
191
- end
192
- }
193
- end
194
- }
195
- Curl.wait
196
- links.size == 1 ? res[0][1] : res.sort!.lasts
197
- end
198
-
199
- def get_distr(uri, psize, threads, start=0, print_progress=$verbose)
200
- raise ConfigError, "Insufficient Scouts in the Frame for distributed downloading" if @ss.size < 2
201
- @print_progress, code, stop_download, @ss_reserve = print_progress, nil, false, []
202
- (s = @ss.next).http.on_header {|h|
203
- next h.size unless h[/Content-Length: (\d+)|HTTP\/1\.[01] (\d+)[^\r]+|^\s*$/]
204
- if code = $2
205
- if code != '200'
206
- L << "#$& getting #{uri}; interrupting request."
207
- s.http.on_header() # set default process
208
- next 0
209
- end
210
- next h.size
211
- end
212
-
213
- s.http.on_header() # set default process
214
- if !$1 # конец хедера, content-length отсутствует
215
- L << "No Content-Length header; trying to load a whole #{uri} at once!"
216
- s.loadGet {|c| yield c.res.body.size, 0, c.res.body}
217
- next 0
218
- end
219
-
220
- len = $1.to_i - start
221
- psize = configure_psize(len, psize, threads)
222
- parts = (len/psize.to_f).ceil
223
- setup_speedometer(uri, parts, len)
224
- yield len, psize, :careful_dl if len > (@opts[:careful_dl] || 10.mb)
225
-
226
- @ss_reserve = @ss[threads+1..-1]
227
- @ss = @ss[0..threads]
228
- (0...parts).each {|n|
229
- break if stop_download
230
-
231
- s = @ss.next
232
- run_speedometer(s, len, n)
233
- s.loadGet(uri, :headers => {
234
- 'Range' => "bytes=#{start + n*psize}-#{start + (n+1)*psize - 1}"
235
- }) {|c|
236
- clear_speedometer(s)
237
- if c.res.code/10 == 20
238
- yield len, n*psize, c.res.body
239
- else
240
- L << "#{c.res} during get #{uri.inspect}; interrupting request."
241
- stop_download = true
242
- end
243
- }
244
- }
245
- 0
246
- }
247
- s.raise_err = false
248
- s.loadGet validate uri
249
- ensure
250
- @ss.concat @ss_reserve || []
251
- end
252
-
253
- def dl(uri, df=File.basename(uri.parse(:uri).path), psize=:auto, opts={})
254
- dled = 0
255
- lock = ''
256
- callback = lambda {|len, pos, body|
257
- if body != :careful_dl
258
- begin
259
- write(df, body, pos)
260
- rescue => e
261
- binding.start_interaction
262
- raise
263
- end
264
- if (dled += body.size) == len
265
- File.delete lock if File.file? lock
266
- yield df if block_given?
267
- end
268
- else
269
- lock = lock_file df, len, pos # filename, filesize, partsize
270
- end
271
- }
272
- opts[:threads] ||= @ss.size-1
273
- get_distr(uri, psize, opts[:threads], opts[:start].to_i, &callback)
274
- Curl.wait unless block_given?
275
- df
276
- end
277
-
278
- def simple_dl(uri, df=File.basename(uri.parse(:uri).path), opts={})
279
- opts.reverse_merge! :psize => :auto, :threads => 1, :print_progress => $verbose
280
- L << opts
281
-
282
- @print_progress = opts[:print_progress]
283
- unless len = opts[:len] || (map = read_mapfile(df) and map.len)
284
- return @ss.next.loadHead(uri) {|c| $log << c
285
- if len = c.res['Content-Length']
286
- simple_dl(uri, df, opts.merge(:len => len.to_i))
287
- else L.warn "Can't get file size, so it has no sence to download this way. Or maybe it's just an error. Check ObjectSpace.find(#{c.res.object_id}) out."
288
- end
289
- }
290
- end
291
-
292
- psize, parts = check_mapfile(df, opts)
293
- return unless psize
294
- L << [psize, parts]
295
- setup_speedometer(uri, parts.size, len)
296
-
297
- obtained uri do |uri|
298
- if opts[:threads] == 1
299
- start = opts[:start].to_i || (parts[0] && parts[0].begin) || 0
300
- scout = opts[:scout] || @ss.next
301
- $log << [uri, scout]
302
- (loadget = lambda {|n|
303
- run_speedometer(scout, len, n)
304
- from = start + n*psize
305
- to = start + (n+1)*psize - 1
306
- scout.loadGet(uri, :headers => {'Range' => "bytes=#{from}-#{to}"}) {|c|
307
- begin
308
- $log << "writing #{df} from #{from}: #{c.res.body.inspect}"
309
- write(df, c.res.body, from)
310
- rescue => e
311
- binding.start_interaction
312
- raise
313
- end
314
- if write_mapfile(df, from, to)
315
- clear_speedometer(scout)
316
- L.warn "file completely dl'ed, but (n+1)*psize <= len: (#{n}+1)*#{psize} <= #{len}" if (n+1)*psize <= len
317
- yield df if block_given?
318
- elsif (n+1)*psize <= len
319
- loadget[n+1]
320
- end
321
- }
322
- })[0]
323
- else
324
- exec(uri, opts.merge(:raw => true, :ranges => parts)) {|c|
325
- L << c.res
326
- range = c.req.range
327
- begin
328
- write(df, c.res.body, range.begin)
329
- rescue => e
330
- binding.start_interaction
331
- raise
332
- end
333
- if write_mapfile(df, range.begin, range.end)
334
- @ss.each {|s| s.http.on_progress} if @print_progress
335
- yield df if block_given?
336
- end
337
- }
338
- end
339
- end
340
- end
341
-
342
- def check_mapfile(df, opts={})
343
- opts.reverse_merge! :psize => :auto, :threads => 1
344
- map = read_mapfile df
345
- if map
346
- L << map
347
- if map.rest.empty?
348
- puts "#{df} is loaded"
349
- $log << 'deleting mapfile'
350
- File.delete df+'.map'
351
- []
352
- else
353
- if opts[:len] and map.len != opts[:len]
354
- raise "Incorrect file size for #{df}"
355
- end
356
- psize = configure_psize *opts.values_at(:len, :psize, :threads)
357
- [psize, map.rest.div(psize)]
358
- end
359
- else
360
- write_mapfile df, opts[:len]
361
- psize = configure_psize *opts.values_at(:len, :psize, :threads)
362
- $log << (0...opts[:len]).div(psize)
363
- [psize, (0...opts[:len]).div(psize)]
364
- end
365
- end
366
-
367
- def read_mapfile(df)
368
- df += '.map'
369
- text = read df
370
- $log << "mapfile read: #{text}"
371
- if text.b
372
- text[/^(\d+)\0+(\d+)\0*\n/]
373
- map = {}
374
- $log << [$1,$2]
375
- if $1 and $1 == $2
376
- map.rest = []
377
- else
378
- map.len, *map.parts = text.chop/"\n"
379
- map.len = map.len.to_i
380
- map.parts.map! {|part| part /= '-'; part[0].to_i..part[1].to_i}
381
- $log << map.parts
382
- map.rest = (0...map.len) - XRange(*map.parts)
383
- end
384
- map
385
- end
386
- end
387
-
388
- def write_mapfile(df, *args)
389
- df += '.map'
390
- map = ''
391
- if args.size != 2
392
- len = args.shift
393
- map << len.to_s.ljust(22, "\0") << "\n" if File.file? df
394
- end
395
- if args.any?
396
- read(df)[/^(\d+)\0+(\d+)\0*\n/]
397
- $log << "mapfile read"
398
- $log << [$1,$2]
399
- dled = $2.to_i + args[1] - args[0] + 1
400
- return true if dled == $1.to_i
401
- map << "#{args[0]}..#{args[1]}\n"
402
- $log << 'writing mapfile'
403
- write(df, dled.to_s.ljust(11, "\0"), 11)
404
- end
405
- $log << [df, map]
406
- $log << 'writing mapfile'
407
- write df, map
408
- nil
409
- end
410
-
411
- def configure_psize(len, psize, threads)
412
- case psize
413
- when Numeric; psize.to_i
414
- when :auto; len > 100000 ? len/threads+1 : len
415
- when :mb; 1.mb
416
- else raise ArgumentError, "Incorrect value for part size #{psize}:#{psize.class}"
417
- end
418
- end
419
-
420
- private
421
- def validate_zip(uri, body)
422
- if !(uri.kinda Array and body.kinda Array)
423
- raise ZippingError, [uri.class, nil, body.class, nil]
424
- elsif uri.size != body.size
425
- raise ZippingError, [uri.class, uri.size, body.class, body.size]
426
- end
427
- end
428
-
429
- # :static option now can accept hash with :procotol key, in that case Frame can be relocated to the same domain on another protocol and default protocol would be the value of @static.protocol
430
- def validate(uri)
431
- if uri
432
- loc = uri.parse:uri
433
- if loc.root and loc.root != @loc.root
434
- if @static
435
- if @static.is Hash
436
- if loc.host != @loc.host
437
- raise TargetError, "unable to get #{uri} by static frame [#{@static.protocol}://]#{@loc.host}, you should first update it with new target"
438
- end
439
- else
440
- raise TargetError, "unable to get #{uri} by static frame #{@loc.root}, you should first update it with new target"
441
- end
442
- end
443
- @loc.root, @loc.host, @loc.protocol = loc.root, loc.host, loc.protocol
444
- uri
445
- elsif !loc.root
446
- raise TargetError if !@static
447
- if @static.is Hash
448
- @loc.protocol = @static.protocol
449
- @loc.root = @loc.protocol+'://'+@loc.host
450
- end
451
- File.join @loc.root, uri
452
- else uri
453
- end
454
- else
455
- raise TargetError if !@static
456
- @loc.href
457
- end
458
- end
459
-
460
- def validate_some(uris)
461
- uris.map! {|u| validate u}
462
- end
463
-
464
- def run_callbacks!(page, opts, &callback)
465
- if callback
466
- yres = callback.call page
467
- if opts[:save_result] or :proc_result.in opts
468
- page.res = yres
469
- end
470
- if opts[:proc_result].is Proc and yres != :skip
471
- opts[:proc_result].call yres
472
- end
473
- end
474
- end
475
-
476
# TODO: find out why/how IO inside callbacks breaks +curl.res.body+ content
# and how to fix or avoid it.
#
# Executes a single +order+ (a method name followed by its arguments) on the
# next scout taken from +@ss+.
#
# With +@use_cache+ set, a +:loadGet+ order whose second element is found in
# +@@cache+ is served from there: callbacks are run on the cached page and
# no request is sent.
#
# Returns the page, or +page.res+ when +opts[:wait]+ is set and a callback
# result was requested through +opts[:save_result]+ / +:proc_result+.
#
# NOTE: +opts+ is appended to the passed +order+ array itself; the
# +order[-2]+ lookup below relies on that.
def exec_one(order, opts, &callback)
  cached = (@use_cache and order[0] == :loadGet) ? @@cache[order[1]] : nil
  if cached
    run_callbacks! cached, opts, &callback
    wants_res = opts[:wait] && (opts[:save_result] or :proc_result.in opts)
    return wants_res ? cached.res : cached
  end

  # must result in Page (default) or its subclass
  page = opts[:result].new
  # if no spare scouts can be found, squad simply waits for first callbacks to complete
  scout = @ss.next
  scout.send(*(order << opts)) do |curl|
    # there is a problem with storing html on disk
    if order[0] == :loadGet and @write_to
      # sometimes (about 2% for 100-threads-dling) when this line runs
      # it does not matter what +curl.res.body+ contained here
      target = @write_to+'/'+order[-2].sub(/^[a-z]+:\/\//, '')
      RMTools.rw target, curl.res.body.xml_to_utf
    end

    if opts[:raw]
      yield curl
    # here +curl.res.body+ becomes empty
    elsif page.process(curl, opts)
      @@cache[page.href] = page if order[0] == :loadGet and @use_cache
      run_callbacks! page, opts, &callback
    end
  end

  return page unless opts[:wait]

  opts[:thread_safe] ? $Carier.perform : Curl.wait
  (opts[:save_result] or :proc_result.in opts) ? page.res : page
end
508
-
509
# Executes every order of +orders+ through #exec_one with the shared
# +with_opts+ hash.
#
# * +:wait+ is removed from +with_opts+ and applied once, after all orders
#   have been issued.
# * With +with_opts[:ranges]+ each order is paired with a range and a
#   "Range" header is set on +with_opts[:headers]+ before the order is
#   executed; a size mismatch between orders and ranges raises ZippingError.
# * With +with_opts[:stream]+ the orders are walked with +each+ and the
#   value of +with_opts[:stream]+ is returned; otherwise the collected
#   results of #exec_one are returned.
def exec_many(orders, with_opts, &callback)
  wait = with_opts.delete :wait
  iterator = with_opts[:stream] ? :each : :map
  ranges = with_opts[:ranges]

  if ranges
    unless orders.size == ranges.size
      raise ZippingError, [orders.size, ranges.size], "orders quantity (%s) is not equal ranges quantity (%s)"
    end
    pages = orders.zip(ranges).send(iterator) { |order, range|
      (with_opts[:headers] ||= {}).Range = "bytes=#{range.begin}-#{range.end}"
      exec_one order, with_opts, &callback
    }
  else
    pages = orders.send(iterator) { |order| exec_one order, with_opts, &callback }
  end

  (with_opts[:thread_safe] ? $Carier.perform : Curl.wait) if wait
  with_opts[:stream] || pages
end
526
-
527
-
528
# Prepares the state used by the download progress printer and starts a
# thread that refreshes the +@speed+ string about every 0.2 seconds until
# +@stop_print+ becomes true. Does nothing unless +@print_progress+ is set.
#
# uri   - address shown in the progress line
# parts - number of slots in +@progress+, one per stream
# len   - total length; only used here for display (+len.bytes+)
def setup_speedometer(uri, parts, len)
  return unless @print_progress

  @progress = Array.new(parts, 0)
  @stop_print = false
  @speed = ''
  @sum = 0
  # [time of the last measurement, value of @sum at that time]
  @speedometer = [Time.now, 0]
  @str = "Downloading #{uri.gsub '%', '%%'} (#{len.bytes}) in %03s streams, %07s/s:"
  # number of extra terminal lines the progress line is expected to wrap onto
  @newlines = (uri.unpack('U*').size+len.bytes.size+42)/(ENV['COLUMNS'] || 80).to_i
  @bs = "\b\r"*@newlines

  Thread.new do
    until @stop_print
      sleep 0.2
      now = Time.now
      # only re-measure when both the clock and the byte counter advanced
      next unless now > @speedometer[0] and @sum > @speedometer[1]

      @speed.replace(((@sum - @speedometer[1])/(now - @speedometer[0])).to_i.bytes)
      @speedometer.replace [now, @sum]
    end
  end
end
545
-
546
# Installs an +on_progress+ handler on +scout.http+ that records the bytes
# downloaded by stream number +n+, recomputes the total in +@sum+ and
# reprints the progress line. Once the total reaches 100% of +len+ the
# handler sets +@stop_print+. Does nothing unless +@print_progress+ is set.
def run_speedometer(scout, len, n)
  return unless @print_progress

  scout.http.on_progress do |dl_need, dl_now, *ul|
    unless @stop_print
      @progress[n] = dl_now
      @sum = @progress.sum
      percents = @sum*100/len
      status = @str%[@progress.select_b.size, @speed]
      print status+"\n%%[#{'@'*percents}#{' '*(100-percents)}]\r\b\r"+@bs
      if percents == 100
        puts "\v"*@newlines
        @stop_print = true
      end
    end
    # the handler always answers true
    true
  end
end
561
-
562
# Calls +on_progress+ on +scout.http+ without a block (presumably removing
# the handler installed by #run_speedometer -- confirm against curb).
# Does nothing unless +@print_progress+ is set.
def clear_speedometer(scout)
  scout.http.on_progress if @print_progress
end
566
-
567
- end
568
-
569
# Downloads +uri+ through a temporary Frame.
#
# uri     - address to download
# df      - passed to Frame#dl as its second argument; defaults to the
#           basename of the uri path
# threads - passed both to the Frame constructor and to Frame#dl
# timeout - passed to the Frame constructor as the +:timeout+ option
# block   - forwarded to Frame#dl
def dl(uri, df=File.basename(uri.parse(:uri).path), threads=5, timeout=600, &block)
  Curl.run
  frame = Frame({:timeout=>timeout}, threads)
  frame.dl(uri, df, :auto, threads, &block)
end
module_function :dl
574
-
575
-
576
-
577
# Wraps the outcome of one request: the response body (raw html and its
# parsed document, or a hash decoded from json / url-encoded params), the
# location it was loaded from and an optional javascript runtime.
class Page
  # for debug, just enable L#debug, don't write tons of chaotic log-lines
  __init__
  # res here is result of page processing made in frame context
  attr_writer :title
  # NOTE: the +hash+ reader shadows Object#hash for Page instances.
  attr_reader :html, :loc, :hash, :doc, :js, :curl_res, :failed
  attr_accessor :res
  @@ignore = /google|_gat|tracker|adver/i

  # obj - a Curl::Easy / Scout whose response is processed right away,
  #       or anything else, which is stored as the page html
  # loc - location hash, or a value that is parsed into one with parse(:uri)
  # js  - javascript runtime used by #eval_string
  def initialize(obj='', loc=Hash.new(''), js=$JSRuntime||Johnson::Runtime.new)
    loc = loc.parse(:uri) if !loc.is Hash
    @js = js
    if obj.is Curl::Easy or obj.kinda Scout
      c = obj.kinda(Scout) ? obj.http : obj
      @html = ''
      # just (c, loc) would pass to #process opts variable that returns '' on any key
      process(c, loc.b || {})
    else
      @html = obj
      @loc = loc
    end
  end

  # True when the page carries no payload: checks +@hash+ if it was set
  # (hash/json mode), otherwise +@html+.
  def empty?
    !(@hash.nil? ? @html : @hash).b
  end

  def inspect
    if !@hash.nil?
      "<#FramePage (#{@hash ? @hash.inspect.size.bytes : 'failed to parse'}) #{@json ? 'json' : 'params hash'}>"
    else
      "<#FramePage #{@html.b ? "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{@html.size.bytes}" : '(empty'})#{' js enabled' if @js and @doc and @hash.nil?}>"
    end
  end

  # Re-tags +@html+ with +encoding+ in place and returns it.
  def html!(encoding='UTF-8')
    @html.force_encoding(encoding)
  end

  # We can then alternate #process in Page subclasses
  # Frame doesn't mind about value returned by #process
  #
  # Fills the page from the response held by +c+:
  # * on a 200 response with +opts[:json]+ / +opts[:hash]+ the body is
  #   decoded into +@hash+ (+false+ on failure, in which case the body is
  #   kept as html for inspection);
  # * on any other 200 response the body becomes +@html+ / +@doc+ and, with
  #   +opts[:eval]+, scripts are loaded and evaluated;
  # * on a non-200 response outside json/hash mode the body is kept and the
  #   response code is stored in +@failed+.
  # Returns self.
  def process(c, opts={})
    @loc = c.last_effective_url.parse(:uri)
    @curl_res = c.res
    L.debug "#{@loc.fullpath} -> #{@curl_res}"
    if @curl_res.code == 200
      body = @curl_res.body
      if opts[:json]
        @json = true
        @hash = begin; body.from_json
        rescue StandardError
          false
        end
        if !@hash or @hash.is String
          L.debug "failed to get json from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
          @html = body; to_doc
          @hash = false
        end

      elsif opts[:hash]
        if body.inline
          @hash = body.to_params
        else
          @hash = false
          L.debug "failed to get params hash from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
          @html = body; to_doc
        end

      else
        @html = body.xml_to_utf
        to_doc
        if opts[:eval]
          load_scripts opts[:load_scripts]
          eval_js
        end
      end
    elsif !(opts[:json] or opts[:hash])
      @html = @curl_res.body
      @failed = @curl_res.code
    end
    self
  end

  # Points the javascript document/window location at +@loc+, then walks
  # every <script> node: inline code is evaluated, whatever it left in
  # +js[:write_output]+ is inserted after the node, and the node is removed.
  # Scripts referenced by +src+ are evaluated only when +frame+ is given.
  def eval_js(frame=nil)
    eval_string "document.location = window.location = #{@loc.to_json};\n" \
                "document.URL = document.baseURI = document.documentURI = location.href;\n" \
                "document.domain = location.host;"
    find("script").each {|n|
      L.debug n.text.strip
      if text = n.text.strip.b
        js[:write_output] = ''
        eval_string text
        if res = js[:write_output].b then n.after res end
        n.remove!
      elsif frame and n.src
        eval_string frame.get_cached expand_link n.src
      end
    }
  end

  # Evaluates +str+ in the page runtime (created on demand). Johnson errors
  # are logged instead of being raised.
  def eval_string(str)
    @js ||= Johnson::Runtime.new
    L.debug "#{@js} evaluating in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
    begin
      @js.evaluate(str)
    rescue Johnson::Error => e
      L.warn e.message
      L.debug {
        if m = e.message.match(/(\w+) is undefined|([\w.]+) is not a function/)
          L.clr.hl! str, /\b#{m[1] || m[2]}\b/
        end
        "\n\t#{str}"
      }
    end
  end

  # (Re)parses +@html+ into +@doc+ and returns the document.
  def to_doc
    @doc = @html.to_doc :forceutf
  end

  # Page title. With +full+ false a version shortened to about 40 characters
  # is returned. Pages that failed, are empty or hold a hash answer with
  # their href instead.
  def title(full=true)
    if @hash.nil? and !@failed and @html.b
      if full
        to_doc unless defined? @doc
        if @doc.title.b
          @title = @doc.title
        else
          # no <title> in the document: fall back to the href and, when a
          # <head> exists, write that title into it
          @title = @loc.href
          @doc.at('head').prepend XML::Node('title', @title) if @doc.at('head')
          @title
        end
      else
        title true unless defined? @title
        if RUBY_VERSION < '1.9' and @title.cyr? and UTF2ANSI[@title].size > 40
          @short_title = ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/][0..38]]+'…'
        elsif @title.size > 40
          @short_title = @title[/.{1,30}\S*/][0..38]+'…'
        else
          @short_title = @title
        end
      end
    else
      @loc.href
    end
  end

  # All nodes matching +xp+ in the (lazily parsed) document.
  def find(xp)
    (@doc || to_doc).find xp
  end

  # First node matching +xp+ in the (lazily parsed) document.
  def at(xp)
    (@doc || to_doc).at xp
  end

  def url
    @loc.href
  end
  alias :href :url

  # Expanded +src+ values of the nodes matching +links+. A String that is
  # not a valid selector (XML::Error) is treated as a link itself; a
  # non-String is treated as a ready list of links.
  def get_srcs(links='img')
    begin
      links = find(links).map {|e| e.src} if links.is String
    rescue XML::Error
      links = [links]
    end
    links.map {|link| expand_link link}.uniq
  end

  # Single-node variant of #get_srcs; nil when nothing was found.
  def get_src(link='img')
    begin
      link = at(link) && at(link).src if link.is String
    rescue XML::Error; nil
    end
    expand_link link if link
  end

  # Expanded +href+ values of the nodes matching +links+; when the matched
  # nodes give no hrefs, the <a> nodes nested inside them are used.
  def get_links(links='a')
    begin
      links = find(links).map {|e| e.href}.b || find(links+'//a').map {|e| e.href} if links.is String
    rescue XML::Error
      links = [links]
    end
    links.map {|link| expand_link link}.uniq
  end

  # Single-node variant of #get_links; nil when nothing was found.
  def get_link(link='a')
    begin
      if link.is String
        node = at(link)
        # the nested <a> may be missing as well: answer nil then instead of
        # calling #href on nil
        link = node && (node.href || ((nested = at(link+'//a')) && nested.href))
      end
    rescue XML::Error; nil
    end
    expand_link link if link
  end
  alias :get_hrefs :get_links
  alias :links :get_links
  alias :get_href :get_link
  alias :link :get_link
  alias :srcs :get_srcs
  alias :src :get_src

  # Turns +link+ found on this page into an absolute url:
  # * "scheme://..." is returned untouched;
  # * "//host/..."   gets this page's protocol;
  # * "/path"        gets this page's root;
  # * anything else is resolved against the directory of this page's path.
  def expand_link(link)
    case link
    when /^\w+:\/\// then link
    # @loc.protocol carries no ':' (root is built as protocol+'://'+host)
    when /^\/\// then @loc.protocol+':'+link
    when /^\// then @loc.root+link
    else
      # keep the root in front of the directory so that the result is an
      # absolute url like in the branches above
      base = @loc.path.b ? File.join(@loc.root, File.dirname(@loc.path)) : @loc.root
      File.join(base, link)
    end
  end

  # Builds the arguments of the request that submits a form.
  #
  # form - selector of the form node, the node itself, or +:self+ for the
  #        form whose action equals this page's path
  # hash - values merged over the inputs collected from the form
  # opts - passed through as the last element of the result
  #
  # Returns [hash, multipart?, action, opts] for POST forms and
  # [action_with_query, opts] otherwise.
  # Raises XML::Error when a selector does not lead to a <form> node.
  def form(form='form', hash={}, opts={})
    form = "[action=#{@loc.path.inspect}]" if form == :self
    if form.is String
      form_node = at form
      raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
    else form_node = form
    end
    hash = form_node.inputs_all.merge!(hash)
    action = expand_link(form_node.action || @loc.path)
    # a form without a method attribute is submitted with GET
    if (form_node['method'] || 'get').downcase == 'post'
      [hash, form_node.enctype =~ /multipart/, action, opts]
    else
      action = "#{action}#{action['?'] ? '&' : '?'}#{hash.urlencode}" if hash.b
      [action, opts]
    end
  end

  # Submits +form+ through +frame+, using this page as the Referer unless
  # one is already set. A static frame is temporarily retargeted to the form
  # action and returned to its previous target afterwards.
  def submit(form, frame, hash={}, opts={}, &callback)
    (opts[:headers] ||= {}).Referer ||= @loc.href if @loc
    query = form(form, hash, opts)

    curr_target, new_target = frame.loc.href, (query[2] || query[0])
    if need_retargeting = (frame.static && curr_target != new_target)
      frame.retarget new_target
    end
    page = frame.exec(*query, &callback)
    frame.retarget curr_target, :forced if need_retargeting
    page
  end

  # Evaluates every script referenced by a <script src> node, fetched with
  # +frame.get_cached+. Does nothing without a frame.
  def load_scripts(frame)
    frame && frame.get_cached(*get_srcs("script[src]")).each {|js| eval_string js}
  end

end
814
-
815
# Builds a Page subclass that re-requests the page when +reload_condition+
# holds. The condition is instance_eval'ed on the curl object passed to
# #process, e.g. to reprocess the page in case of a non-200 response:
#   page_class = ReloadablePage do
#     @res and @res.code != 200
#   end
def ReloadablePage(&reload_condition)
  Class.new(Page) do
    define_method(:process) do |curl, opts|
      super(curl, opts || {})
      if curl.instance_eval(&reload_condition)
        curl.retry!
        # nil tells the caller not to proceed with callbacks for this attempt
        nil
      else
        self
      end
    end
  end
end
831
-
832
- end
833
-
834
-
835
-
836
-
837
-
838
-
839
-
840
-
841
-
842
-
843
-
844
-
845
-
846
-
847
-
848
-