rhack 0.4.1 → 1.0.0.rc4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. data/.gitignore +22 -0
  2. data/Gemfile +2 -5
  3. data/LICENSE +19 -15
  4. data/README.md +66 -26
  5. data/Rakefile +42 -31
  6. data/config/cacert.pem +3895 -0
  7. data/config/rhack.yml.template +40 -0
  8. data/ext/curb-original/curb_config.h +3 -0
  9. data/ext/curb-original/curb_easy.c +3 -54
  10. data/ext/curb-original/curb_multi.c +69 -140
  11. data/ext/curb/curb_multi.c +1 -1
  12. data/lib/rhack.rb +82 -12
  13. data/lib/rhack/cookie.rb +49 -0
  14. data/lib/rhack/curl.rb +6 -0
  15. data/lib/{extensions/curb.rb → rhack/curl/easy.rb} +26 -48
  16. data/lib/rhack/curl/global.rb +175 -0
  17. data/lib/rhack/curl/itt.rb +11 -0
  18. data/lib/rhack/curl/multi.rb +37 -0
  19. data/lib/rhack/curl/post_field.rb +20 -0
  20. data/lib/rhack/curl/response.rb +91 -0
  21. data/lib/rhack/dl.rb +308 -0
  22. data/lib/rhack/frame.rb +316 -0
  23. data/lib/{extensions → rhack/js}/browser/env.js +0 -0
  24. data/lib/{extensions → rhack/js}/browser/jquery.js +0 -0
  25. data/lib/{extensions → rhack/js}/browser/xmlsax.js +0 -0
  26. data/lib/{extensions → rhack/js}/browser/xmlw3cdom_1.js +0 -0
  27. data/lib/{extensions → rhack/js}/browser/xmlw3cdom_2.js +0 -0
  28. data/lib/rhack/js/johnson.rb +71 -0
  29. data/lib/rhack/page.rb +263 -0
  30. data/lib/rhack/proxy.rb +3 -0
  31. data/lib/rhack/proxy/checker.rb +1 -1
  32. data/lib/rhack/scout.rb +342 -0
  33. data/lib/rhack/scout_squad.rb +98 -0
  34. data/lib/rhack/services.rb +1 -464
  35. data/lib/rhack/services/base.rb +59 -0
  36. data/lib/rhack/services/examples.rb +423 -0
  37. data/lib/rhack/version.rb +3 -0
  38. data/lib/rhack_in.rb +3 -2
  39. data/rhack.gemspec +28 -0
  40. metadata +104 -85
  41. data/.gemtest +0 -0
  42. data/Gemfile.lock +0 -23
  43. data/Manifest.txt +0 -60
  44. data/ext/curb/Makefile +0 -217
  45. data/lib/cache.rb +0 -44
  46. data/lib/curl-global.rb +0 -164
  47. data/lib/extensions/declarative.rb +0 -153
  48. data/lib/extensions/johnson.rb +0 -63
  49. data/lib/frame.rb +0 -848
  50. data/lib/init.rb +0 -49
  51. data/lib/rhack.yml.template +0 -19
  52. data/lib/scout.rb +0 -589
  53. data/lib/words.rb +0 -25
data/lib/cache.rb DELETED
@@ -1,44 +0,0 @@
1
- # encoding: utf-8
2
- module HTTPAccessKit
3
-
4
- class Cache < ActiveRecord::Base
5
- declare CacheTable do |t|
6
- t.integer :url_hash
7
- t.string :url
8
- t.string :path
9
- t.string :date
10
- t.string :ext
11
- t.timestamps
12
- end if DB
13
- RAMCache = {}
14
-
15
- def self.clean(time=7.days)
16
- destroy_all("created_at < '#{time.ago}'").each {|c|
17
- FileUtils.remove c.path if c.path and File.file?(c.path)}
18
- end
19
- CacheTTL and clean CacheTTL
20
-
21
- def self.save(url, data, cache_data=true)
22
- new(url, data).save
23
- RAMCache[url.href] = data if cache_data
24
- end
25
-
26
- def self.load(url, cache_data=true)
27
- if data = RAMCache[url.href]
28
- data
29
- elsif file = first(:select => 'date, path', :conditions => {:url_hash => url.href.hash})
30
- RAMCache[url.href] = read(file.path) if cache_data
31
- file
32
- end
33
- end
34
-
35
- def initialize(url, data)
36
- t = Time.now
37
- path = "#{CacheDir}/#{t.to_i}-#{File.split(url.path)[1]}"
38
- rw path, data
39
- super :url => url.href, :url_hash => url.href.hash, :date => t.httpdate, :path => path, :ext => url.ext
40
- end
41
-
42
- end
43
-
44
- end
data/lib/curl-global.rb DELETED
@@ -1,164 +0,0 @@
1
- # encoding: utf-8
2
- module Curl
3
-
4
- def execute(unless_allready=false)
5
- if unless_allready and Curl.status
6
- return L.log "Non-nil status! Avoid executing"
7
- end
8
- if $CarierThread and s = $CarierThread.status
9
- L.log "Carier thread allready started and has status #{s}"
10
- else
11
- if s = Curl.status(false) then L.warn s end
12
- L.log($CarierThread ? "Resetting Carier thread" : "Setting Carier thread up")
13
- $CarierThread = Thread.new {
14
- error = nil
15
- begin
16
- # "why Thread#value is raising since it never raised before?"
17
- yield if block_given?
18
- rescue => error
19
- nil
20
- end
21
- loop {
22
- begin
23
- # with true argument (idle) it would break only if no requests to perform
24
- break unless $Carier.perform true
25
- L.log "Nothing to perform; idling..."
26
- rescue => error
27
- break
28
- # but ruby mystically crashes if next sequence occur:
29
- # Multi performs and can't see any requests so entering idle mode
30
- # we add some requests and multi load them
31
- # one of requests' callbacks raises error in *main* thread
32
- # so we can't allow any raises here, instead, keep them in 'wait' section
33
- end
34
- } unless error
35
- error
36
- }
37
- # until main thread has sleep a bit, $CarierThread will have status "run",
38
- # no matter whether it's idling or performing requests
39
- sleep 0.001
40
- end
41
- end
42
- alias :run :execute
43
- module_function :execute, :run
44
-
45
- def wait
46
- if $CarierThread and $CarierThread.status
47
- unless within = Thread.current == $CarierThread
48
- # We can't set `perform' timeout lesser than 1 second in the curl binding
49
- # because in that case thread status would always be "run"
50
- # so here we wait for exactly 1 sec
51
- sleep 1
52
- end
53
- # Also, if thread do Kernel.sleep, it would skip Curl.wait here
54
- if !$Carier.sheduled and ($CarierThread.status == 'sleep' or within && $Carier.reqs.empty?)
55
- L.log "No shedule to wait"
56
- else
57
- this_thread = within ? 'it\'s thread' : Thread.main == Thread.current ? 'main thread' : 'thread '+Thread.current.object_id
58
- L.log "Waiting for Carier to complete in #{this_thread}"
59
- begin
60
- L.log { "Trying to change $CarierThreadIsJoined #{$CarierThreadIsJoined} -> true from #{this_thread}" }
61
- if within
62
- L.log "calling this from one of callbacks to wait for the rest to complete"
63
- begin
64
- $Carier.perform
65
- rescue RuntimeError => e
66
- L.warn [e, e.message]
67
- L.info "$Carier $Carier.sheduled $CarierThread $CarierThread.status", binding
68
- L.warn "Failed to run Multi#perform: nothing to perform"
69
- end
70
- else
71
- $CarierThreadIsJoined = true
72
- $CarierThread.join
73
- end
74
- rescue (defined?(IRB) ? IRB::Abort : NilClass)
75
- recall!
76
- L.info "Carier thread recalled by keyboard"
77
- ensure
78
- L.log "trying to change $CarierThreadIsJoined #{$CarierThreadIsJoined} -> false from #{this_thread}"
79
- if !within
80
- $CarierThreadIsJoined = false
81
- # using Curl#execute from different threads may cause problems here when you don't control input,
82
- # for example, in a daemonized ruby process
83
- # just do not get $CarierThread joined from non-main thread
84
- if $CarierThread and e = $CarierThread.value
85
- # this will raise thread-safely in main thread
86
- # in case of unrescued error in CarierThread
87
- L.log(([e.message]+RMTools.format_trace(e.backtrace))*"\n")
88
- recall!
89
- raise e
90
- end
91
- execute
92
- end
93
- end
94
- end
95
- else
96
- L < "No thread to wait. I guess I should create one"
97
- execute
98
- wait
99
- end
100
- end
101
- module_function :wait
102
-
103
- def recall
104
- L.debug caller
105
- if $CarierThread
106
- L.log "Recalling Carier thread"
107
- $CarierThread.kill
108
- sleep 1
109
- else
110
- L.log "No thread to recall"
111
- end
112
- end
113
- alias :stop :recall
114
-
115
- def recall!
116
- if $CarierThread
117
- L.warn "Recalling thread and resetting Carier!!!"
118
- $CarierThread.kill
119
- $CarierThread = nil
120
- $Carier.reset
121
- else
122
- L.log "No thread to recall!"
123
- end
124
- end
125
- alias :stop! :recall!
126
- module_function :recall!, :stop!, :recall, :stop
127
-
128
- def reset
129
- recall
130
- execute
131
- end
132
- alias :reload :reset
133
-
134
- def reset!
135
- recall!
136
- execute
137
- end
138
- alias :reload! :reset!
139
- module_function :reset!, :reset, :reload!, :reload
140
-
141
- def status(raise_e=true)
142
- if $CarierThread and (s = $CarierThread.status)
143
- L.log "Carier thread responding with status #{s}"
144
- s
145
- elsif $CarierThread
146
- if e = $CarierThread.value
147
- if raise_e
148
- recall!
149
- raise e
150
- else
151
- L.log "Carier Thread returned #{e.inspect}"
152
- e
153
- end
154
- else
155
- L.log "Carier Thread is exited without error"
156
- end
157
- else
158
- L.log "There is no Carier Thread atm"
159
- end
160
- end
161
- alias :st :status
162
- module_function :status, :st
163
-
164
- end
data/lib/extensions/declarative.rb DELETED
@@ -1,153 +0,0 @@
1
- # encoding: utf-8
2
- module ActiveRecord
3
-
4
- module ConnectionAdapters
5
- AbstractAdapter
6
-
7
- class VirtualTable < Table
8
-
9
- def debug_str meth, called, exist, *args
10
- "Table.#{meth}(#{args.inspects*', '}) was#{' NOT' if !called} called due to #{'in' if !exist}existance"
11
- end
12
-
13
- def column_exists *args
14
- column_names = @base.columns(@table_name).names
15
- options = args.extract_options!
16
- names = args.dup
17
- args << options
18
- _or_ = (names[0] == :all) ? !names.shift : true
19
- names.each {|name| return _or_ if name.to_s.in(column_names) == _or_}
20
- !_or_
21
- end
22
-
23
- def index_exists *indexes
24
- column_indexes = @base.indexes(@table_name).columnss.flatten
25
- _or_ = (indexes[0] == :all) ? !indexes.shift : true
26
- indexes.each {|index| return _or_ if index.to_s.in(column_indexes) == _or_}
27
- !_or_
28
- end
29
-
30
- def initialize name, connection, map=nil
31
- super name, connection
32
- case map
33
- when true; @map = []
34
- when Array; @map = map
35
- end
36
- end
37
-
38
- def map!
39
- map_names = @map.firsts.to_ss
40
- @base.columns(@table_name).names.each {|name|
41
- name.in(map_names) ? @map.reject! {|_| _[0] == name} : remove(name)
42
- }
43
- @map.each {|col| column *col}
44
- end
45
-
46
- def column name, *args
47
- to_be_called = !column_exists(name)
48
- super if to_be_called
49
- $log.debug {debug_str :column, to_be_called, !to_be_called, name, *args}
50
- @map << [name, *args] if @map
51
- end
52
-
53
- %w{string text integer float decimal
54
- datetime timestamp time date binary boolean}.each {|column_type|
55
- define_method(column_type) {|*args|
56
- to_be_called = !column_exists(*args)
57
- super if to_be_called
58
- $log.debug {debug_str column_type, to_be_called, !to_be_called, *args}
59
- if @map
60
- options = args.extract_options!
61
- args = args.xprod(column_type)
62
- args = args.xprod(options) if options
63
- @map.concat args
64
- end
65
- } }
66
-
67
- def index name, *args
68
- to_be_called = !index_exists(name)
69
- super if to_be_called
70
- $log.debug {debug_str :index, to_be_called, !to_be_called, name, *args}
71
- end
72
-
73
- def timestamps
74
- to_be_called = !column_exists('created_at', 'updated_at')
75
- super if to_be_called
76
- $log.debug {debug_str :timestamps, to_be_called, !to_be_called}
77
- @map.concat [[:created_at, :datetime], [:updated_at, :datetime]] if @map
78
- end
79
-
80
- def change *args
81
- raise NotImplementedError, "don't use #change in declaration!"
82
- end
83
-
84
- def change_default *args
85
- raise NotImplementedError, "don't use #change_default in declaration!"
86
- end
87
-
88
- def rename column_name, new_column_name
89
- to_be_called = !column_exists(new_column_name)
90
- super if to_be_called
91
- $log.debug {debug_str :rename, to_be_called, !to_be_called, column_name, new_column_name}
92
- end
93
-
94
- def references *args
95
- to_be_called = !column_exists(*args.map {|col| "#{col}_id"})
96
- super if to_be_called
97
- $log.debug {debug_str :references, to_be_called, !to_be_called, *args}
98
- end
99
- alias :belongs_to :references
100
-
101
- def remove *args
102
- to_be_called = column_exists :all, *args
103
- super if to_be_called
104
- $log.debug {debug_str :remove, to_be_called, to_be_called, *args}
105
- end
106
-
107
- def remove_references *args
108
- to_be_called = column_exists(:all, *args.map {|col| "#{col}_id"})
109
- super if to_be_called
110
- $log.debug {debug_str :remove_references, to_be_called, to_be_called, *args}
111
- end
112
- alias :remove_belongs_to :remove_references
113
-
114
- def remove_index options
115
- indexes = options.is(Hash) ? options[:column] : options
116
- raise ArgumentError, "can remove only default format named indexes in declaration!" if !indexes
117
- to_be_called = index_exists :all, *indexes
118
- super if to_be_called
119
- $log.debug {debug_str :remove_index, to_be_called, to_be_called, options}
120
- end
121
-
122
- def remove_timestamps
123
- to_be_called = column_exists 'created_at', 'updated_at'
124
- super if to_be_called
125
- $log.debug {debug_str :remove_timestamps, to_be_called, to_be_called}
126
- end
127
-
128
- end
129
-
130
- end
131
-
132
- class Base
133
-
134
- def self.declare name, options={}, &block
135
- self.table_name = name
136
- if !table_exists? or options[:force]
137
- $log < "with options[:force] the `#{table_name}` table will have been recreated each time you load the #{model_name} model" if options[:force]
138
- self.primary_key = options[:primary_key] if options[:id] != false and options[:primary_key]
139
- $log.debug "connection.create_table(#{name}, #{options.inspect}) {}"
140
- connection.create_table(name, options, &block)
141
- elsif options[:map]
142
- table = ConnectionAdapters::VirtualTable.new(name, connection, options[:map])
143
- yield table
144
- table.map!
145
- else yield ConnectionAdapters::VirtualTable.new(name, connection)
146
- end
147
- reset_column_information
148
- end
149
-
150
- end
151
-
152
- end
153
-
data/lib/extensions/johnson.rb DELETED
@@ -1,63 +0,0 @@
1
- # encoding: utf-8
2
- module Johnson
3
- begin
4
- require 'johnson'
5
- rescue LoadError
6
- Enabled = false
7
- else
8
- if VERSION <= "2.0.0" and RUBY_VERSION > "1.9"
9
- Enabled = false
10
- else Enabled = true
11
- end
12
- end
13
- ### JavaScript interface DOM emulation ###
14
-
15
- class Runtime
16
- attr_accessor :thread_id
17
- Runtime_is_set = lambda {|o| !o[:eval].b or ($JSRuntime and $JSRuntime.thread_id == $CarierThread.object_id)}
18
- BROWSER_PATH = File.expand_path "#{File.dirname(__FILE__)}/browser"
19
-
20
- # CarierThread breaks if Multi has no work && CarierThread
21
- # is joined so itwon't last forever.
22
- #
23
- # Johnson is not thread safe =>
24
- # Runtime created in this thread will become unusable after
25
- # CarierThread dies.
26
- #
27
- # So we don't use Curl.wait until Carier haven't got whole
28
- # request for this Runtime.
29
- def self.set_browser_for_curl(opts)
30
- if !Runtime_is_set[opts]
31
- if Curl.status
32
- Curl.recall
33
- $log.debug 'recalled'
34
- end
35
- if opts[:thread_safe].b
36
- $JSRuntime = new_browser(opts[:jq])
37
- $log.debug "#{$JSRuntime} initialized in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
38
- else
39
- $log.debug 'about to run carier'
40
- Curl.execute {$JSRuntime = new_browser(opts[:jq])
41
- $log.debug "#{$JSRuntime} initialized in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"}
42
- sleep 0.01 until Runtime_is_set[opts]
43
- end
44
- end
45
- end
46
-
47
- def self.new_browser(jq=false)
48
- rt = new
49
- %w{xmlw3cdom_1 xmlw3cdom_2 xmlsax env}.concat(jq ? ['jquery'] : []).each {|f|
50
- path = "#{BROWSER_PATH}/#{f}.js"
51
- rt.evaluate IO.read(path), path, 1
52
- }
53
- rt.document = ''
54
- rt
55
- end
56
-
57
- def document=(html)
58
- evaluate "var document = new DOMDocument(#{html.to_doc.to_xhtml.inspect})"
59
- end
60
-
61
- end
62
-
63
- end
data/lib/frame.rb DELETED
@@ -1,848 +0,0 @@
1
- # encoding: utf-8
2
- module HTTPAccessKit
3
-
4
- # Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, ... ) ) =>
5
- # Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, ...
6
-
7
- class ZippingError < ArgumentError
8
- def initialize debug, str="invalid use of :zip option, uri and body must be an arrays with the same size\n uri: %s(%s), body: %s(%s)"
9
- super str%debug end
10
- end
11
-
12
- class TargetError < ArgumentError
13
- def initialize msg="only static frame can use local paths"
14
- super end
15
- end
16
-
17
- class ConfigError < ArgumentError
18
- def initialize msg
19
- super end
20
- end
21
-
22
- class Frame
23
- __init__
24
- attr_reader :loc, :static, :ss, :opts, :use_cache, :write_to
25
- @@cache = {}
26
-
27
- def initialize *args
28
- args << 10 unless args[-1].is Fixnum
29
- args.insert -2, {} unless args[-2].is Hash
30
- @opts = {:eval => Johnson::Enabled, :redir => true, :cp => true, :result => Page}.merge!(args[-2].kinda(Hash) ? args[-2] : {})
31
- args[-2] = @opts
32
- if args[0].is String
33
- uri = args[0]
34
- 'http://' >> uri if uri !~ /^\w+:\/\//
35
- @loc = uri.parse:uri
36
- # be careful, if you set :static => false, frame will be unable to use implicit url
37
- @static = @opts.fetch(:static, true)
38
- else
39
- @loc = {}
40
- @static = false
41
- end
42
- @ss = ScoutSquad *args
43
- Curl.run :unless_allready
44
- end
45
-
46
- def retarget to, forced=nil
47
- to = 'http://' + to if to !~ /^\w+:/
48
- @ss.update to, forced
49
- @loc = to.parse:uri
50
- end
51
- alias :target= :retarget
52
-
53
- def next() @ss.next end
54
- def rand() @ss.rand end
55
- def each(&block) @ss.each &block end
56
- def [](i) @ss[i] end
57
-
58
- def copy_cookies! i=0
59
- @ss.each {|s| s.cookies.replace @ss[i].cookies}
60
- end
61
-
62
- def use_cache! opts={}
63
- if opts == false
64
- @use_cache = false
65
- else
66
- @@cache = opts[:pages].kinda(Hash) ? opts[:pages] : opts[:pages].map_hash {|p| [p.href, p]} if opts[:pages]
67
- #@write_to = opts[:write_to] if :write_to.in opts
68
- @use_cache = true
69
- end
70
- end
71
-
72
- def drop_cache! use=nil
73
- @@cache.clear
74
- GC.start
75
- @use_cache = use if use.in [true, false]
76
- end
77
-
78
- def inspect
79
- "<#Frame @ #{@ss.untargeted ? 'no target' : @loc.root}: #{'scout'.x @ss.size}#{', static'+(' => '+@static.protocol if @static.is(Hash)) if @static}, cookies #{@ss[0].cookieProc ? 'on' : 'off'}>"
80
- end
81
-
82
- # opts are :eval, :json, :hash, :wait, :proc_result, :save_result, :load_scripts,
83
- # :zip, :thread_safe, :result, :stream, :raw, :xhr + any opts for Scouts in one hash
84
- def exec *args, &callback
85
- many, order, orders, with_opts = interpret_request *args
86
- L.log({:many => many, :order => order, :orders => orders, :with_opts => with_opts})
87
-
88
- if !Johnson::Enabled and with_opts[:eval]
89
- L < "failed to use option :eval because Johnson is disabled"
90
- with_opts.delete :eval
91
- end
92
- # JS Runtime is not thread-safe and must be created in curl thread
93
- # if we aren't said explicitly about the opposite
94
- Johnson::Runtime.set_browser_for_curl with_opts
95
-
96
- if many then exec_many orders, with_opts, &callback
97
- else exec_one order, with_opts, &callback end
98
- end
99
- alias :get :exec
100
- alias :run :get
101
-
102
- def interpret_request(*args)
103
- body, mp, uri, opts = args.dup.get_opts [nil, false, nil], @opts
104
- L.log [body, mp, uri, opts]
105
- zip = opts.delete :zip
106
- many = order = orders = post = false
107
- # Default options set is for POST
108
- if mp.is String or mp.kinda Array and !(uri.is String or uri.kinda Array)
109
- # if second arg is String, then that's uri
110
- uri, mp, post = mp.dup, false, true
111
- # L.debug "uri #{uri.inspect} has been passed as second argument instead of third"
112
- # But if we have only one argument actually passed
113
- # except for options hash, then believe it's GET
114
- elsif body.is String or body.kinda [String]
115
- L.debug "first parameter (#{body.inspect}) was implicitly taken as uri#{' '+body.class if body.kinda Array}, but last paramter is of type #{uri.class}, too" if uri
116
- uri = body.dup
117
- elsif !body then uri = nil
118
- else
119
- uri = uri.dup if uri
120
- mp, post = !!mp, true
121
- end
122
- if post
123
- unless body.is Hash or body.kinda [Hash]
124
- raise TypeError, "body of post request must be a hash or hash array, params was
125
- (#{args.inspect[1..-2]})"
126
- end
127
- validate_zip uri, body if zip
128
- if zip or uri.kinda Array or body.kinda Array
129
- many = true
130
- if zip or uri.kinda Array
131
- validate_some uri
132
- orders = zip ? body.zip(uri) : uri.xprod(body, :inverse)
133
- else
134
- uri = validate uri
135
- orders = body.xprod uri
136
- end
137
- orders.each {|o| o.unshift :loadPost and o.insert 2, mp}
138
- else
139
- uri = validate uri
140
- order = [:loadPost, body, mp, uri]
141
- end
142
- else
143
- if uri.kinda Array
144
- many = true
145
- validate_some uri
146
- orders = [:loadGet].xprod uri
147
- else
148
- uri = validate uri
149
- order = [:loadGet, uri]
150
- end
151
- end
152
- if !order.b and !orders.b
153
- raise ArgumentError, "failed to run blank request#{'s' if many}, params was
154
- (#{args.inspect[1..-2]})"
155
- end
156
-
157
- opts[:wait] = opts[:sync] if :sync.in opts
158
- opts[:wait] = true if !:wait.in(opts) and
159
- :proc_result.in(opts) ? !opts[:proc_result] : opts[:save_result]
160
- opts[:eval] = false if opts[:json] or opts[:hash] or opts[:raw]
161
- opts[:load_scripts] = self if opts[:load_scripts]
162
- opts[:stream] = true if opts[:raw]
163
- (opts[:headers] ||= {})['X-Requested-With'] = 'XMLHttpRequest' if opts[:xhr]
164
- [many, order, orders, opts]
165
- end
166
-
167
- def get_cached(*links)
168
- res = []
169
- expire = links[-1] == :expire ? links.pop : false
170
- links.parses(:uri).each_with_index {|uri, i|
171
- next if uri.path[/ads|count|stats/]
172
- file = Cache.load uri, !expire
173
- if file
174
- if expire
175
- @ss.next.loadGet(uri.href, :headers=>{'If-Modified-Since'=>file.date}) {|c|
176
- if c.res.code == 200
177
- res << [i, (data = c.res.body)]
178
- Cache.save uri, data, false
179
- else
180
- res << [i, file.is(String) ? file : read(file.path)]
181
- end
182
- }
183
- else
184
- res << [i, file.is(String) ? file : read(file.path)]
185
- end
186
- else
187
- @ss.next.loadGet(uri.href) {|c|
188
- if c.res.code == 200
189
- res << [i, (data = c.res.body)]
190
- Cache.save uri, data, !expire
191
- end
192
- }
193
- end
194
- }
195
- Curl.wait
196
- links.size == 1 ? res[0][1] : res.sort!.lasts
197
- end
198
-
199
- def get_distr(uri, psize, threads, start=0, print_progress=$verbose)
200
- raise ConfigError, "Insufficient Scouts in the Frame for distributed downloading" if @ss.size < 2
201
- @print_progress, code, stop_download, @ss_reserve = print_progress, nil, false, []
202
- (s = @ss.next).http.on_header {|h|
203
- next h.size unless h[/Content-Length: (\d+)|HTTP\/1\.[01] (\d+)[^\r]+|^\s*$/]
204
- if code = $2
205
- if code != '200'
206
- L << "#$& getting #{uri}; interrupting request."
207
- s.http.on_header() # set default process
208
- next 0
209
- end
210
- next h.size
211
- end
212
-
213
- s.http.on_header() # set default process
214
- if !$1 # конец хедера, content-length отсутствует
215
- L << "No Content-Length header; trying to load a whole #{uri} at once!"
216
- s.loadGet {|c| yield c.res.body.size, 0, c.res.body}
217
- next 0
218
- end
219
-
220
- len = $1.to_i - start
221
- psize = configure_psize(len, psize, threads)
222
- parts = (len/psize.to_f).ceil
223
- setup_speedometer(uri, parts, len)
224
- yield len, psize, :careful_dl if len > (@opts[:careful_dl] || 10.mb)
225
-
226
- @ss_reserve = @ss[threads+1..-1]
227
- @ss = @ss[0..threads]
228
- (0...parts).each {|n|
229
- break if stop_download
230
-
231
- s = @ss.next
232
- run_speedometer(s, len, n)
233
- s.loadGet(uri, :headers => {
234
- 'Range' => "bytes=#{start + n*psize}-#{start + (n+1)*psize - 1}"
235
- }) {|c|
236
- clear_speedometer(s)
237
- if c.res.code/10 == 20
238
- yield len, n*psize, c.res.body
239
- else
240
- L << "#{c.res} during get #{uri.inspect}; interrupting request."
241
- stop_download = true
242
- end
243
- }
244
- }
245
- 0
246
- }
247
- s.raise_err = false
248
- s.loadGet validate uri
249
- ensure
250
- @ss.concat @ss_reserve || []
251
- end
252
-
253
- def dl(uri, df=File.basename(uri.parse(:uri).path), psize=:auto, opts={})
254
- dled = 0
255
- lock = ''
256
- callback = lambda {|len, pos, body|
257
- if body != :careful_dl
258
- begin
259
- write(df, body, pos)
260
- rescue => e
261
- binding.start_interaction
262
- raise
263
- end
264
- if (dled += body.size) == len
265
- File.delete lock if File.file? lock
266
- yield df if block_given?
267
- end
268
- else
269
- lock = lock_file df, len, pos # filename, filesize, partsize
270
- end
271
- }
272
- opts[:threads] ||= @ss.size-1
273
- get_distr(uri, psize, opts[:threads], opts[:start].to_i, &callback)
274
- Curl.wait unless block_given?
275
- df
276
- end
277
-
278
- def simple_dl(uri, df=File.basename(uri.parse(:uri).path), opts={})
279
- opts.reverse_merge! :psize => :auto, :threads => 1, :print_progress => $verbose
280
- L << opts
281
-
282
- @print_progress = opts[:print_progress]
283
- unless len = opts[:len] || (map = read_mapfile(df) and map.len)
284
- return @ss.next.loadHead(uri) {|c| $log << c
285
- if len = c.res['Content-Length']
286
- simple_dl(uri, df, opts.merge(:len => len.to_i))
287
- else L.warn "Can't get file size, so it has no sence to download this way. Or maybe it's just an error. Check ObjectSpace.find(#{c.res.object_id}) out."
288
- end
289
- }
290
- end
291
-
292
- psize, parts = check_mapfile(df, opts)
293
- return unless psize
294
- L << [psize, parts]
295
- setup_speedometer(uri, parts.size, len)
296
-
297
- obtained uri do |uri|
298
- if opts[:threads] == 1
299
- start = opts[:start].to_i || (parts[0] && parts[0].begin) || 0
300
- scout = opts[:scout] || @ss.next
301
- $log << [uri, scout]
302
- (loadget = lambda {|n|
303
- run_speedometer(scout, len, n)
304
- from = start + n*psize
305
- to = start + (n+1)*psize - 1
306
- scout.loadGet(uri, :headers => {'Range' => "bytes=#{from}-#{to}"}) {|c|
307
- begin
308
- $log << "writing #{df} from #{from}: #{c.res.body.inspect}"
309
- write(df, c.res.body, from)
310
- rescue => e
311
- binding.start_interaction
312
- raise
313
- end
314
- if write_mapfile(df, from, to)
315
- clear_speedometer(scout)
316
- L.warn "file completely dl'ed, but (n+1)*psize <= len: (#{n}+1)*#{psize} <= #{len}" if (n+1)*psize <= len
317
- yield df if block_given?
318
- elsif (n+1)*psize <= len
319
- loadget[n+1]
320
- end
321
- }
322
- })[0]
323
- else
324
- exec(uri, opts.merge(:raw => true, :ranges => parts)) {|c|
325
- L << c.res
326
- range = c.req.range
327
- begin
328
- write(df, c.res.body, range.begin)
329
- rescue => e
330
- binding.start_interaction
331
- raise
332
- end
333
- if write_mapfile(df, range.begin, range.end)
334
- @ss.each {|s| s.http.on_progress} if @print_progress
335
- yield df if block_given?
336
- end
337
- }
338
- end
339
- end
340
- end
341
-
342
- def check_mapfile(df, opts={})
343
- opts.reverse_merge! :psize => :auto, :threads => 1
344
- map = read_mapfile df
345
- if map
346
- L << map
347
- if map.rest.empty?
348
- puts "#{df} is loaded"
349
- $log << 'deleting mapfile'
350
- File.delete df+'.map'
351
- []
352
- else
353
- if opts[:len] and map.len != opts[:len]
354
- raise "Incorrect file size for #{df}"
355
- end
356
- psize = configure_psize *opts.values_at(:len, :psize, :threads)
357
- [psize, map.rest.div(psize)]
358
- end
359
- else
360
- write_mapfile df, opts[:len]
361
- psize = configure_psize *opts.values_at(:len, :psize, :threads)
362
- $log << (0...opts[:len]).div(psize)
363
- [psize, (0...opts[:len]).div(psize)]
364
- end
365
- end
366
-
367
- def read_mapfile(df)
368
- df += '.map'
369
- text = read df
370
- $log << "mapfile read: #{text}"
371
- if text.b
372
- text[/^(\d+)\0+(\d+)\0*\n/]
373
- map = {}
374
- $log << [$1,$2]
375
- if $1 and $1 == $2
376
- map.rest = []
377
- else
378
- map.len, *map.parts = text.chop/"\n"
379
- map.len = map.len.to_i
380
- map.parts.map! {|part| part /= '-'; part[0].to_i..part[1].to_i}
381
- $log << map.parts
382
- map.rest = (0...map.len) - XRange(*map.parts)
383
- end
384
- map
385
- end
386
- end
387
-
388
- def write_mapfile(df, *args)
389
- df += '.map'
390
- map = ''
391
- if args.size != 2
392
- len = args.shift
393
- map << len.to_s.ljust(22, "\0") << "\n" if File.file? df
394
- end
395
- if args.any?
396
- read(df)[/^(\d+)\0+(\d+)\0*\n/]
397
- $log << "mapfile read"
398
- $log << [$1,$2]
399
- dled = $2.to_i + args[1] - args[0] + 1
400
- return true if dled == $1.to_i
401
- map << "#{args[0]}..#{args[1]}\n"
402
- $log << 'writing mapfile'
403
- write(df, dled.to_s.ljust(11, "\0"), 11)
404
- end
405
- $log << [df, map]
406
- $log << 'writing mapfile'
407
- write df, map
408
- nil
409
- end
410
-
411
- def configure_psize(len, psize, threads)
412
- case psize
413
- when Numeric; psize.to_i
414
- when :auto; len > 100000 ? len/threads+1 : len
415
- when :mb; 1.mb
416
- else raise ArgumentError, "Incorrect value for part size #{psize}:#{psize.class}"
417
- end
418
- end
419
-
420
- private
421
- def validate_zip(uri, body)
422
- if !(uri.kinda Array and body.kinda Array)
423
- raise ZippingError, [uri.class, nil, body.class, nil]
424
- elsif uri.size != body.size
425
- raise ZippingError, [uri.class, uri.size, body.class, body.size]
426
- end
427
- end
428
-
429
- # :static option now can accept hash with :procotol key, in that case Frame can be relocated to the same domain on another protocol and default protocol would be the value of @static.protocol
430
- def validate(uri)
431
- if uri
432
- loc = uri.parse:uri
433
- if loc.root and loc.root != @loc.root
434
- if @static
435
- if @static.is Hash
436
- if loc.host != @loc.host
437
- raise TargetError, "unable to get #{uri} by static frame [#{@static.protocol}://]#{@loc.host}, you should first update it with new target"
438
- end
439
- else
440
- raise TargetError, "unable to get #{uri} by static frame #{@loc.root}, you should first update it with new target"
441
- end
442
- end
443
- @loc.root, @loc.host, @loc.protocol = loc.root, loc.host, loc.protocol
444
- uri
445
- elsif !loc.root
446
- raise TargetError if !@static
447
- if @static.is Hash
448
- @loc.protocol = @static.protocol
449
- @loc.root = @loc.protocol+'://'+@loc.host
450
- end
451
- File.join @loc.root, uri
452
- else uri
453
- end
454
- else
455
- raise TargetError if !@static
456
- @loc.href
457
- end
458
- end
459
-
460
- def validate_some(uris)
461
- uris.map! {|u| validate u}
462
- end
463
-
464
- def run_callbacks!(page, opts, &callback)
465
- if callback
466
- yres = callback.call page
467
- if opts[:save_result] or :proc_result.in opts
468
- page.res = yres
469
- end
470
- if opts[:proc_result].is Proc and yres != :skip
471
- opts[:proc_result].call yres
472
- end
473
- end
474
- end
475
-
476
# Executes a single request +order+ (an array: scout method name first, then
# its arguments) and returns a page object, or that page's +res+.
#
# order    - request description; +opts+ is appended to it before it is
#            splatted into the scout call, so the array is mutated.
# opts     - options hash. Keys read here: :wait, :save_result,
#            :proc_result, :result (class instantiated for the new page),
#            :raw, :thread_safe.
# callback - optional block, run through #run_callbacks! (or yielded the
#            raw curl object when opts[:raw] is set).
#
# Returns +page.res+ when opts[:wait] is set together with :save_result or
# :proc_result, otherwise the page itself.
#
# TODO: find out why/how IO inside callbacks breaks the +curl.res.body+
# content, and how to fix or avoid it.
def exec_one(order, opts, &callback)
  get_request = order[0] == :loadGet

  # Serve a previously processed GET page straight from the class-level cache.
  if @use_cache && get_request && (cached = @@cache[order[1]])
    run_callbacks!(cached, opts, &callback)
    return cached.res if opts[:wait] && (opts[:save_result] || :proc_result.in(opts))
    return cached
  end

  # Must result in a Page (the default) or one of its subclasses.
  page = opts[:result].new
  # If no spare scout can be found, the squad simply waits for the first
  # callbacks to complete.
  scout = @ss.next
  request = order << opts
  scout.send(*request) do |curl|
    # Original author's note: there is a problem with storing html on disk --
    # sometimes (about 2% of cases with 100 downloading threads) it shows up
    # when the line below runs, no matter what +curl.res.body+ contained.
    if get_request && @write_to
      # opts is the last element now, so order[-2] is the element before it
      # (used as the url: the scheme is stripped to build the local path).
      local_path = @write_to + '/' + order[-2].sub(/^[a-z]+:\/\//, '')
      RMTools.rw local_path, curl.res.body.xml_to_utf
    end

    if opts[:raw]
      yield curl
    # Original author's note: at this point +curl.res.body+ becomes empty.
    elsif page.process(curl, opts)
      @@cache[page.href] = page if get_request && @use_cache
      run_callbacks!(page, opts, &callback)
    end
  end

  return page unless opts[:wait]

  opts[:thread_safe] ? $Carier.perform : Curl.wait
  (opts[:save_result] || :proc_result.in(opts)) ? page.res : page
end
-
509
# Executes several request +orders+ through #exec_one with shared options.
#
# orders    - array of orders (see #exec_one).
# with_opts - options hash shared by all orders. :wait is removed from it
#             here, so the individual #exec_one calls never wait themselves;
#             the wait happens once, after everything has been queued.
#             :ranges, when present, must have one range per order; each
#             range is written into the shared headers as a
#             "bytes=begin-end" Range value before the order is executed.
#             :stream switches iteration from +map+ to +each+.
# callback  - optional block forwarded to every #exec_one call.
#
# Raises ZippingError when the number of ranges differs from the number of
# orders.
# Returns with_opts[:stream] when it is truthy, otherwise whatever the
# iteration produced.
def exec_many(orders, with_opts, &callback)
  wait     = with_opts.delete(:wait)
  iterator = with_opts[:stream] ? :each : :map
  ranges   = with_opts[:ranges]

  if ranges
    unless orders.size == ranges.size
      raise ZippingError, [orders.size, ranges.size], "orders quantity (%s) is not equal ranges quantity (%s)"
    end
    pages = orders.zip(ranges).send(iterator) do |order, range|
      headers = (with_opts[:headers] ||= {})
      headers.Range = "bytes=#{range.begin}-#{range.end}"
      exec_one(order, with_opts, &callback)
    end
  else
    pages = orders.send(iterator) { |order| exec_one(order, with_opts, &callback) }
  end

  if wait
    with_opts[:thread_safe] ? $Carier.perform : Curl.wait
  end
  with_opts[:stream] || pages
end
-
527
-
528
# Prepares the state used for download progress output and starts a
# background thread that refreshes the speed string. Does nothing (returns
# nil) unless @print_progress is set; otherwise returns the started Thread.
#
# uri   - the url being downloaded (used in the status line).
# parts - number of progress slots to allocate, one per stream.
# len   - total size; formatted through +len.bytes+.
def setup_speedometer(uri, parts, len)
  return unless @print_progress

  @progress    = Array.new(parts, 0)
  @stop_print  = false
  @speed       = ''
  @sum         = 0
  # [time of the last sample, byte total at the last sample]
  @speedometer = [Time.now, 0]
  # '%' inside the uri is doubled because @str is used as a format string later.
  @str = "Downloading #{uri.gsub '%', '%%'} (#{len.bytes}) in %03s streams, %07s/s:"
  # Estimated number of extra terminal rows the status line wraps onto.
  @newlines = (uri.unpack('U*').size + len.bytes.size + 42) / (ENV['COLUMNS'] || 80).to_i
  @bs = "\b\r" * @newlines

  Thread.new do
    until @stop_print
      sleep 0.2
      now = Time.now
      last_time, last_sum = @speedometer
      if now > last_time && @sum > last_sum
        # @speed is mutated in place so the same string object stays referenced.
        @speed.replace(((@sum - last_sum) / (now - last_time)).to_i.bytes)
        @speedometer.replace [now, @sum]
      end
    end
  end
end
-
546
# Installs a progress handler on +scout.http+ that records how much stream
# number +n+ has downloaded and reprints the status line with a 100-column
# bar. Does nothing unless @print_progress is set.
#
# scout - object whose +http+ responds to +on_progress+.
# len   - total size used to compute the percentage.
# n     - index of this stream's slot in @progress.
def run_speedometer(scout, len, n)
  return unless @print_progress

  scout.http.on_progress do |_dl_total, dl_now, *_upload|
    unless @stop_print
      @progress[n] = dl_now
      @sum = @progress.sum
      percents = @sum * 100 / len
      bar = "\n%%[#{'@'*percents}#{' '*(100-percents)}]\r\b\r"
      # Only @str goes through the format operator; the bar and the trailing
      # cursor-control string are appended to the formatted result.
      print((@str % [@progress.select_b.size, @speed]) + bar + @bs)
      if percents == 100
        puts "\v" * @newlines
        @stop_print = true
      end
    end
    # The handler always returns true (presumably what keeps the transfer
    # going -- confirm against the curl binding in use).
    true
  end
end
-
562
# Calls +on_progress+ on +scout.http+ without a block (presumably this
# removes the handler installed by #run_speedometer -- confirm against the
# curl binding). Does nothing unless @print_progress is set.
def clear_speedometer(scout)
  scout.http.on_progress if @print_progress
end
-
567
- end
568
-
569
# Convenience downloader: starts Curl, builds a Frame with the given
# +timeout+ and number of +threads+, and delegates to Frame#dl.
#
# uri     - url to download.
# df      - destination; defaults to the basename of the uri's path.
# threads - passed both to the Frame constructor and to Frame#dl.
# timeout - stored under :timeout in the Frame options.
# block   - forwarded to Frame#dl.
def dl(uri, df=File.basename(uri.parse(:uri).path), threads=5, timeout=600, &block)
  Curl.run
  frame = Frame({:timeout => timeout}, threads)
  frame.dl(uri, df, :auto, threads, &block)
end
module_function :dl
-
575
-
576
-
577
# Result of one processed HTTP response. Depending on the options given to
# #process it holds either the html plus a parsed document (+html+, +doc+),
# or a parsed json / params hash (+hash+). On a non-200 response in html
# mode, +failed+ holds the response code.
class Page
  # for debug, just enable L#debug, don't write tons of chaotic log-lines
  __init__
  # res here is result of page processing made in frame context
  attr_writer :title
  attr_reader :html, :loc, :hash, :doc, :js, :curl_res, :failed
  attr_accessor :res
  @@ignore = /google|_gat|tracker|adver/i

  # obj - either the page source, or a Curl::Easy / Scout whose response is
  #       processed immediately.
  # loc - location hash; anything that is not a Hash is parsed as a uri.
  # js  - javascript runtime used by #eval_string.
  def initialize(obj='', loc=Hash.new(''), js=$JSRuntime||Johnson::Runtime.new)
    loc = loc.parse(:uri) if !loc.is Hash
    @js = js
    if obj.is Curl::Easy or obj.kinda Scout
      c = obj.kinda(Scout) ? obj.http : obj
      @html = ''
      # just (c, loc) would pass to #process opts variable that returns '' on any key
      process(c, loc.b || {})
    else
      @html = obj
      @loc = loc
    end
  end

  # True when the page carries no content: checks @html while @hash is nil,
  # otherwise checks @hash.
  def empty?
    !(@hash.nil? ? @html : @hash).b
  end

  def inspect
    if !@hash.nil?
      "<#FramePage (#{@hash ? @hash.inspect.size.bytes : 'failed to parse'}) #{@json ? 'json' : 'params hash'}>"
    else
      "<#FramePage #{@html.b ? "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{@html.size.bytes}" : '(empty'})#{' js enabled' if @js and @doc and @hash.nil?}>"
    end
  end

  # Forces +encoding+ on the stored html string (in place) and returns it.
  def html!(encoding='UTF-8')
    @html.force_encoding(encoding)
  end

  # We can then alternate #process in Page subclasses
  # Frame doesn't mind about value returned by #process
  #
  # c    - curl object; +last_effective_url+ and +res+ are read from it.
  # opts - :json  parse the body as json into @hash;
  #        :hash  parse the body as a params hash into @hash;
  #        :eval  (html mode only) load scripts and evaluate inline js.
  #        On a parse failure @hash is set to false and the body is kept in
  #        @html / @doc for inspection.
  # Returns self.
  def process(c, opts={})
    @loc = c.last_effective_url.parse(:uri)
    @curl_res = c.res
    L.debug "#{@loc.fullpath} -> #{@curl_res}"
    if @curl_res.code == 200
      body = @curl_res.body
      if opts[:json]
        @json = true
        @hash = begin; body.from_json
        rescue StandardError
          false
        end
        if !@hash or @hash.is String
          L.debug "failed to get json from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
          @html = body; to_doc
          @hash = false
        end

      elsif opts[:hash]
        if body.inline
          @hash = body.to_params
        else
          @hash = false
          L.debug "failed to get params hash from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
          @html = body; to_doc
        end

      else
        @html = body.xml_to_utf
        to_doc
        if opts[:eval]
          load_scripts opts[:load_scripts]
          eval_js
        end
      end
    elsif !(opts[:json] or opts[:hash])
      @html = @curl_res.body
      @failed = @curl_res.code
    end
    self
  end

  # Sets the document location inside the js runtime, then evaluates every
  # <script> node: inline text is evaluated and the node removed (anything
  # collected in js[:write_output] is inserted after it); scripts with a
  # +src+ are fetched through +frame+ when one is given.
  def eval_js(frame=nil)
    eval_string "document.location = window.location = #{@loc.to_json};
      document.URL = document.baseURI = document.documentURI = location.href;
      document.domain = location.host;"
    find("script").each {|n|
      L.debug n.text.strip
      if text = n.text.strip.b
        js[:write_output] = ''
        eval_string text
        if res = js[:write_output].b then n.after res end
        n.remove!
      elsif frame and n.src
        eval_string frame.get_cached expand_link n.src
      end
    }
  end

  # Evaluates +str+ in the js runtime (created lazily). Johnson errors are
  # logged as warnings instead of being raised.
  def eval_string(str)
    @js ||= Johnson::Runtime.new
    L.debug "#{@js} evaluating in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
    begin
      @js.evaluate(str)
    rescue Johnson::Error => e
      L.warn e.message
      L.debug {
        if m = e.message.match(/(\w+) is undefined|([\w.]+) is not a function/)
          L.clr.hl! str, /\b#{m[1] || m[2]}\b/
        end
        "\n\t#{str}"
      }
    end
  end

  # (Re)parses @html into @doc.
  def to_doc
    @doc = @html.to_doc :forceutf
  end

  # Page title. With +full+ false a shortened variant (about 40 characters
  # at most, ending with an ellipsis) is returned. Falls back to the page
  # url when the page is a hash/json page, failed, or has no html.
  def title(full=true)
    if @hash.nil? and !@failed and @html.b
      if full
        to_doc unless defined? @doc
        if @doc.title.b
          @title = @doc.title
        else
          @title = @loc.href
          @doc.at('head').prepend XML::Node('title', @title) if @doc.at('head')
          @title
        end
      else
        title true unless defined? @title
        if RUBY_VERSION < '1.9' and @title.cyr? and UTF2ANSI[@title].size > 40
          @short_title = ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/][0..38]]+'…'
        elsif @title.size > 40
          @short_title = @title[/.{1,30}\S*/][0..38]+'…'
        else
          @short_title = @title
        end
      end
    else
      @loc.href
    end
  end

  def find(xp) (@doc || to_doc).find xp end

  def at(xp) (@doc || to_doc).at xp end

  def url() @loc.href end
  alias :href :url

  # Expanded +src+ values of all nodes matching +links+. When +links+ is not
  # a valid expression (XML::Error) the string itself is treated as a link;
  # an array of links is expanded as is.
  def get_srcs(links='img')
    begin
      links = find(links).map {|e| e.src} if links.is String
    rescue XML::Error
      links = [links]
    end
    links.map {|link| expand_link link}.uniq
  end

  # Single-node variant of #get_srcs; returns nil when nothing is found.
  def get_src(link='img')
    begin
      link = at(link) && at(link).src if link.is String
    rescue XML::Error; nil
    end
    expand_link link if link
  end

  # Expanded +href+ values of all nodes matching +links+; when the matched
  # nodes yield nothing, anchors nested inside them are used instead.
  def get_links(links='a')
    begin
      links = find(links).map {|e| e.href}.b || find(links+'//a').map {|e| e.href} if links.is String
    rescue XML::Error
      links = [links]
    end
    links.map {|link| expand_link link}.uniq
  end

  # Single-node variant of #get_links; returns nil when no href is found.
  def get_link(link='a')
    begin
      if link.is String
        node = at(link)
        if node
          # Fall back to a nested <a>; it may be absent, in which case the
          # result is nil rather than a NoMethodError on nil.
          nested = node.href ? nil : at(link+'//a')
          link = node.href || (nested && nested.href)
        else
          link = nil
        end
      end
    rescue XML::Error; nil
    end
    expand_link link if link
  end
  alias :get_hrefs :get_links
  alias :links :get_links
  alias :get_href :get_link
  alias :link :get_link
  alias :srcs :get_srcs
  alias :src :get_src

  # Turns +link+ as written in the document into an absolute url, using the
  # page location:
  #   scheme://...    - returned untouched;
  #   //host/path     - prefixed with the page protocol and ':' (the protocol
  #                     is stored without the colon);
  #   /path           - prefixed with the site root;
  #   anything else   - resolved against the directory of the page path,
  #                     under the site root (the path is host-relative).
  def expand_link(link)
    case link
    when /^\w+:\/\// then link
    when /^\/\// then @loc.protocol+':'+link
    when /^\// then @loc.root+link
    else
      base = @loc.path.b ? File.join(@loc.root, File.dirname(@loc.path)) : @loc.root
      File.join(base, link)
    end
  end

  # Builds request arguments for submitting a form.
  #
  # form - xpath/selector string, a form node, or :self for the form whose
  #        action equals the current path.
  # hash - values overriding the form's own inputs.
  # opts - passed through as the last element of the result.
  #
  # Returns [hash, multipart?, action, opts] for POST forms and
  # [action_with_query, opts] otherwise. A form without a +method+
  # attribute is treated as GET.
  # Raises XML::Error when +form+ is a string that does not match a <form>.
  def form(form='form', hash={}, opts={})
    form = "[action=#{@loc.path.inspect}]" if form == :self
    if form.is String
      form_node = at form
      raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
    else form_node = form
    end
    hash = form_node.inputs_all.merge!(hash)
    action = expand_link(form_node.action || @loc.path)
    if (form_node['method'] || 'get').downcase == 'post'
      [hash, form_node.enctype =~ /multipart/, action, opts]
    else
      action = "#{action}#{action['?'] ? '&' : '?'}#{hash.urlencode}" if hash.b
      [action, opts]
    end
  end

  # Submits +form+ through +frame+ with the current page as Referer (unless
  # one is already set). A static frame is temporarily retargeted to the
  # form action and restored afterwards.
  def submit(form, frame, hash={}, opts={}, &callback)
    (opts[:headers] ||= {}).Referer ||= @loc.href if @loc
    query = form(form, hash, opts)

    # POST queries carry the action at index 2, GET queries at index 0.
    curr_target, new_target = frame.loc.href, (query[2] || query[0])
    if need_retargeting = (frame.static && curr_target != new_target)
      frame.retarget new_target
    end
    page = frame.exec(*query, &callback)
    frame.retarget curr_target, :forced if need_retargeting
    page
  end

  # Fetches every external script of the page through +frame+ and evaluates
  # it; does nothing when +frame+ is nil/false.
  def load_scripts(frame)
    frame && frame.get_cached(*get_srcs("script[src]")).each {|js| eval_string js}
  end

end
-
815
# Builds an anonymous Page subclass whose #process re-queues the request
# when +reload_condition+ holds. The condition block is instance_eval'ed on
# the curl object, e.g. to reprocess a page on a non-200 response:
#   page_class = ReloadablePage do
#     @res and @res.code != 200
#   end
#
# The generated #process(curl, opts = {}) returns nil after calling
# +curl.retry!+ and returns the page itself otherwise.
def ReloadablePage(&reload_condition)
  rp = Class.new Page
  # +opts+ is optional, mirroring Page#process(c, opts={}). A splat is used
  # instead of a block default argument to stay valid on Ruby 1.8.
  rp.send :define_method, :process do |curl, *rest|
    opts = rest.first
    # define_method bodies need super with explicit arguments.
    super(curl, opts || {})
    if curl.instance_eval(&reload_condition)
      curl.retry!
      nil # in case of reload_condition.call super's callback will not proceed
    else
      self
    end
  end
  rp
end
-
832
- end
833
-
834
-
835
-
836
-
837
-
838
-
839
-
840
-
841
-
842
-
843
-
844
-
845
-
846
-
847
-
848
-