rhack 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/CURB-LICENSE +51 -0
- data/Gemfile +4 -0
- data/History.txt +4 -0
- data/LICENSE +51 -0
- data/License.txt +17 -0
- data/Manifest.txt +61 -0
- data/README.txt +12 -0
- data/Rakefile +34 -0
- data/ext/curb-original/curb.c +977 -0
- data/ext/curb-original/curb.h +52 -0
- data/ext/curb-original/curb_config.h +235 -0
- data/ext/curb-original/curb_easy.c +3455 -0
- data/ext/curb-original/curb_easy.h +90 -0
- data/ext/curb-original/curb_errors.c +647 -0
- data/ext/curb-original/curb_errors.h +129 -0
- data/ext/curb-original/curb_macros.h +159 -0
- data/ext/curb-original/curb_multi.c +704 -0
- data/ext/curb-original/curb_multi.h +26 -0
- data/ext/curb-original/curb_postfield.c +523 -0
- data/ext/curb-original/curb_postfield.h +40 -0
- data/ext/curb-original/curb_upload.c +80 -0
- data/ext/curb-original/curb_upload.h +30 -0
- data/ext/curb/Makefile +157 -0
- data/ext/curb/curb.c +977 -0
- data/ext/curb/curb.h +52 -0
- data/ext/curb/curb_config.h +235 -0
- data/ext/curb/curb_easy.c +3430 -0
- data/ext/curb/curb_easy.h +94 -0
- data/ext/curb/curb_errors.c +647 -0
- data/ext/curb/curb_errors.h +129 -0
- data/ext/curb/curb_macros.h +159 -0
- data/ext/curb/curb_multi.c +710 -0
- data/ext/curb/curb_multi.h +26 -0
- data/ext/curb/curb_postfield.c +523 -0
- data/ext/curb/curb_postfield.h +40 -0
- data/ext/curb/curb_upload.c +80 -0
- data/ext/curb/curb_upload.h +30 -0
- data/ext/curb/extconf.rb +399 -0
- data/lib/cache.rb +44 -0
- data/lib/curl-global.rb +151 -0
- data/lib/extensions/browser/env.js +697 -0
- data/lib/extensions/browser/jquery.js +7180 -0
- data/lib/extensions/browser/xmlsax.js +1564 -0
- data/lib/extensions/browser/xmlw3cdom_1.js +1444 -0
- data/lib/extensions/browser/xmlw3cdom_2.js +2744 -0
- data/lib/extensions/curb.rb +125 -0
- data/lib/extensions/declarative.rb +153 -0
- data/lib/extensions/johnson.rb +63 -0
- data/lib/frame.rb +766 -0
- data/lib/init.rb +36 -0
- data/lib/rhack.rb +16 -0
- data/lib/rhack.yml.template +19 -0
- data/lib/rhack/proxy/checker.rb +226 -0
- data/lib/rhack/proxy/list.rb +196 -0
- data/lib/rhack/services.rb +445 -0
- data/lib/rhack_in.rb +2 -0
- data/lib/scout.rb +591 -0
- data/lib/words.rb +37 -0
- data/test/test_frame.rb +107 -0
- data/test/test_rhack.rb +5 -0
- data/test/test_scout.rb +53 -0
- metadata +195 -0
@@ -0,0 +1,125 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Curl
|
3
|
+
|
4
|
+
# Convenience layer over curb's Curl::Easy handle: response/request
# shortcuts, URL helpers and symbolic option setters built on #setopt.
class Easy
  __init__
  attr_accessor :base

  # Result of calling Response() with this handle.
  def res
    Response(self)
  end
  alias response res

  # Request object exposed by the current response.
  def req
    res.req
  end
  alias request req

  # Root of the currently configured URL.
  def host
    url.parse(:uri).root
  end

  # Re-targets the handle to the full path of +href+ on the current host.
  def path=(href)
    root = host
    self.url = root + href.parse(:uri).fullpath
  end

  # Delegates the retry to the owning object stored in @base.
  def retry!
    @base.retry!
  end

  # curb changed getters interface, so i get some shortcuts from curb/lib/curl/easy.rb

  # Sets a curl option given either its symbolic name (:url, :proxy, ...)
  # or anything convertible to the numeric option code.
  def set(opt, val)
    code = opt.is_a?(Symbol) ? sym2curl(opt) : opt.to_i
    setopt(code, val)
  end

  # Resolves a symbolic option name to the matching Curl::CURLOPT_* constant.
  def sym2curl(opt)
    const_name = "CURLOPT_#{opt.to_s.upcase}"
    Curl.const_get(const_name)
  end

  def interface=(value)
    set(:interface, value)
  end

  def url=(u)
    set(:url, u)
  end

  def proxy_url=(url)
    set(:proxy, url)
  end

  def userpwd=(value)
    set(:userpwd, value)
  end

  def proxypwd=(value)
    set(:proxyuserpwd, value)
  end

  def follow_location=(onoff)
    set(:followlocation, onoff)
  end

  # The flag is coerced to a strict boolean before being handed to curl.
  def head=(onoff)
    set(:nobody, onoff ? true : false)
  end

  # The flag is coerced to a strict boolean before being handed to curl.
  def get=(onoff)
    set(:httpget, onoff ? true : false)
  end

end
|
76
|
+
|
77
|
+
class PostField

  # Short "name=content" description of the field.
  #
  # The content part is, in order of preference: the inspected content
  # proc, the first 20 characters of the inline content (followed by its
  # total size when longer), or the inspected local file. Nothing follows
  # the "=" when none of them is set.
  #
  # Raises RuntimeError when the field has no name.
  def to_s
    raise "Cannot convert unnamed field to string" if !name
    display_content = if (cp = content_proc)
      cp.inspect
    elsif (c = content)
      "#{c[0...20].inspect}#{"… (#{c.size.bytes})" if c.size > 20}"
    elsif (ln = local_name)
      # Block form closes the descriptor right after inspecting it;
      # File.new(ln).inspect left one open handle behind per call.
      File.open(ln) {|file| file.inspect}
    end
    "#{name}=#{display_content}"
  end

end
|
92
|
+
|
93
|
+
# Helpers for the shared multi handle kept in the $Carier global.
class Multi
  alias :reqs :requests if method_defined?(:requests)

  # Detaches every registered request (errors while removing are ignored)
  # and replaces the global $Carier with a fresh pipelining Multi.
  def reset
    reqs.each do |easy|
      begin
        remove easy
      rescue StandardError
        nil
      end
    end
    $Carier = Multi.new
    $Carier.pipeline = true
    # GC.start
  end

  # Keeps performing until nothing is running (errors raised by #perform
  # are ignored), then calls Curl.recall.
  def drop
    while running > 0
      begin
        perform
      rescue StandardError
        nil
      end
    end
    Curl.recall
  end

  # Like #drop, followed by #reset when anything is still registered or running.
  def drop!
    drop
    pending = reqs.size + running
    reset if pending > 0
  end

  # True when something is running and the running count does not exceed
  # the number of registered requests.
  def sheduled
    running > 0 && running <= reqs.size
  end

  def inspect
    units = 'unit'.x reqs.size
    "<#Carier #{units}, #{running} executing>"
  end

end
|
124
|
+
|
125
|
+
end
|
@@ -0,0 +1,153 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module ActiveRecord
|
3
|
+
|
4
|
+
module ConnectionAdapters
|
5
|
+
AbstractAdapter
|
6
|
+
|
7
|
+
class VirtualTable < Table
|
8
|
+
|
9
|
+
def debug_str meth, called, exist, *args
|
10
|
+
"Table.#{meth}(#{args.inspects*', '}) was#{' NOT' if !called} called due to #{'in' if !exist}existance"
|
11
|
+
end
|
12
|
+
|
13
|
+
def column_exists *args
|
14
|
+
column_names = @base.columns(@table_name).names
|
15
|
+
options = args.extract_options!
|
16
|
+
names = args.dup
|
17
|
+
args << options
|
18
|
+
_or_ = (names[0] == :all) ? !names.shift : true
|
19
|
+
names.each {|name| return _or_ if name.to_s.in(column_names) == _or_}
|
20
|
+
!_or_
|
21
|
+
end
|
22
|
+
|
23
|
+
def index_exists *indexes
|
24
|
+
column_indexes = @base.indexes(@table_name).columnss.flatten
|
25
|
+
_or_ = (indexes[0] == :all) ? !indexes.shift : true
|
26
|
+
indexes.each {|index| return _or_ if index.to_s.in(column_indexes) == _or_}
|
27
|
+
!_or_
|
28
|
+
end
|
29
|
+
|
30
|
+
def initialize name, connection, map=nil
|
31
|
+
super name, connection
|
32
|
+
case map
|
33
|
+
when true; @map = []
|
34
|
+
when Array; @map = map
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
def map!
|
39
|
+
map_names = @map.firsts.to_ss
|
40
|
+
@base.columns(@table_name).names.each {|name|
|
41
|
+
name.in(map_names) ? @map.reject! {|_| _[0] == name} : remove(name)
|
42
|
+
}
|
43
|
+
@map.each {|col| column *col}
|
44
|
+
end
|
45
|
+
|
46
|
+
def column name, *args
|
47
|
+
to_be_called = !column_exists(name)
|
48
|
+
super if to_be_called
|
49
|
+
$log.debug {debug_str :column, to_be_called, !to_be_called, name, *args}
|
50
|
+
@map << [name, *args] if @map
|
51
|
+
end
|
52
|
+
|
53
|
+
# Defines one guarded method per column type (string, text, ...).
# Each one calls the inherited implementation only when none of the named
# columns exists yet, logs the decision, and, when a column map is being
# collected in @map, records the requested columns there.
%w{string text integer float decimal
   datetime timestamp time date binary boolean}.each {|column_type|
  define_method(column_type) {|*args|
    to_be_called = !column_exists(*args)
    # A bare `super` is rejected inside define_method ("implicit argument
    # passing of super from method defined by define_method() is not
    # supported"), so the arguments are forwarded explicitly.
    super(*args) if to_be_called
    $log.debug {debug_str column_type, to_be_called, !to_be_called, *args}
    if @map
      options = args.extract_options!
      args = args.xprod(column_type)
      args = args.xprod(options) if options
      @map.concat args
    end
} }
|
66
|
+
|
67
|
+
def index name, *args
|
68
|
+
to_be_called = !index_exists(name)
|
69
|
+
super if to_be_called
|
70
|
+
$log.debug {debug_str :index, to_be_called, !to_be_called, name, *args}
|
71
|
+
end
|
72
|
+
|
73
|
+
def timestamps
|
74
|
+
to_be_called = !column_exists('created_at', 'updated_at')
|
75
|
+
super if to_be_called
|
76
|
+
$log.debug {debug_str :timestamps, to_be_called, !to_be_called}
|
77
|
+
@map.concat [[:created_at, :datetime], [:updated_at, :datetime]] if @map
|
78
|
+
end
|
79
|
+
|
80
|
+
def change *args
|
81
|
+
raise NotImplementedError, "don't use #change in declaration!"
|
82
|
+
end
|
83
|
+
|
84
|
+
def change_default *args
|
85
|
+
raise NotImplementedError, "don't use #change_default in declaration!"
|
86
|
+
end
|
87
|
+
|
88
|
+
def rename column_name, new_column_name
|
89
|
+
to_be_called = !column_exists(new_column_name)
|
90
|
+
super if to_be_called
|
91
|
+
$log.debug {debug_str :rename, to_be_called, !to_be_called, column_name, new_column_name}
|
92
|
+
end
|
93
|
+
|
94
|
+
def references *args
|
95
|
+
to_be_called = !column_exists(*args.map {|col| "#{col}_id"})
|
96
|
+
super if to_be_called
|
97
|
+
$log.debug {debug_str :references, to_be_called, !to_be_called, *args}
|
98
|
+
end
|
99
|
+
alias :belongs_to :references
|
100
|
+
|
101
|
+
def remove *args
|
102
|
+
to_be_called = column_exists :all, *args
|
103
|
+
super if to_be_called
|
104
|
+
$log.debug {debug_str :remove, to_be_called, to_be_called, *args}
|
105
|
+
end
|
106
|
+
|
107
|
+
def remove_references *args
|
108
|
+
to_be_called = column_exists(:all, *args.map {|col| "#{col}_id"})
|
109
|
+
super if to_be_called
|
110
|
+
$log.debug {debug_str :remove_references, to_be_called, to_be_called, *args}
|
111
|
+
end
|
112
|
+
alias :remove_belongs_to :remove_references
|
113
|
+
|
114
|
+
def remove_index options
|
115
|
+
indexes = options.is(Hash) ? options[:column] : options
|
116
|
+
raise ArgumentError, "can remove only default format named indexes in declaration!" if !indexes
|
117
|
+
to_be_called = index_exists :all, *indexes
|
118
|
+
super if to_be_called
|
119
|
+
$log.debug {debug_str :remove_index, to_be_called, to_be_called, options}
|
120
|
+
end
|
121
|
+
|
122
|
+
def remove_timestamps
|
123
|
+
to_be_called = column_exists 'created_at', 'updated_at'
|
124
|
+
super if to_be_called
|
125
|
+
$log.debug {debug_str :remove_timestamps, to_be_called, to_be_called}
|
126
|
+
end
|
127
|
+
|
128
|
+
end
|
129
|
+
|
130
|
+
end
|
131
|
+
|
132
|
+
class Base
|
133
|
+
|
134
|
+
# Binds the model to table +name+ and brings the table in line with the
# declaration given in the block.
#
# * Table missing, or options[:force] set: the table is created through
#   connection.create_table with the same options and block.
# * Table present and options[:map] set: the block runs against a
#   VirtualTable built with that map, then VirtualTable#map! is called.
# * Otherwise the block runs against a plain VirtualTable.
#
# Column information is reset afterwards in every case.
def self.declare(name, options={}, &block)
  self.table_name = name
  if table_exists? && !options[:force]
    if options[:map]
      table = ConnectionAdapters::VirtualTable.new(name, connection, options[:map])
      yield table
      table.map!
    else
      yield ConnectionAdapters::VirtualTable.new(name, connection)
    end
  else
    if options[:force]
      $log < "with options[:force] the `#{table_name}` table will have been recreated each time you load the #{model_name} model"
    end
    if options[:id] != false && options[:primary_key]
      self.primary_key = options[:primary_key]
    end
    $log.debug "connection.create_table(#{name}, #{options.inspect}) {}"
    connection.create_table(name, options, &block)
  end
  reset_column_information
end
|
149
|
+
|
150
|
+
end
|
151
|
+
|
152
|
+
end
|
153
|
+
|
@@ -0,0 +1,63 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module Johnson
|
3
|
+
# Enabled tells the rest of the library whether the Johnson runtime may be
# used: false when the gem cannot be loaded, and false for VERSION up to
# "2.0.0" (compared as strings) on a Ruby newer than "1.9".
begin
  require 'johnson'
rescue LoadError
  Enabled = false
else
  Enabled = !(VERSION <= "2.0.0" && RUBY_VERSION > "1.9")
end
|
13
|
+
### JavaScript interface DOM emulation ###
|
14
|
+
|
15
|
+
class Runtime
|
16
|
+
attr_accessor :thread_id
|
17
|
+
Runtime_is_set = lambda {|o| !o[:eval].b or ($JSRuntime and $JSRuntime.thread_id == $CarierThread.object_id)}
|
18
|
+
BROWSER_PATH = File.expand_path "#{File.dirname(__FILE__)}/browser"
|
19
|
+
|
20
|
+
# CarierThread breaks if Multi has no work && CarierThread
|
21
|
+
# is joined so it won't last forever.
|
22
|
+
#
|
23
|
+
# Johnson is not thread safe =>
|
24
|
+
# Runtime created in this thread will become unusable after
|
25
|
+
# CarierThread dies.
|
26
|
+
#
|
27
|
+
# So we don't use Curl.wait until Carier haven't got whole
|
28
|
+
# request for this Runtime.
|
29
|
+
def self.set_browser_for_curl(opts)
|
30
|
+
if !Runtime_is_set[opts]
|
31
|
+
if Curl.status
|
32
|
+
Curl.recall
|
33
|
+
$log.debug 'recalled'
|
34
|
+
end
|
35
|
+
if opts[:thread_safe].b
|
36
|
+
$JSRuntime = new_browser(opts[:jq])
|
37
|
+
$log.debug "#{$JSRuntime} initialized in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
|
38
|
+
else
|
39
|
+
$log.debug 'about to run carier'
|
40
|
+
Curl.execute {$JSRuntime = new_browser(opts[:jq])
|
41
|
+
$log.debug "#{$JSRuntime} initialized in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"}
|
42
|
+
sleep 0.01 until Runtime_is_set[opts]
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
# Builds a runtime with the bundled browser-emulation scripts evaluated in
# order (plus jquery when +jq+ is truthy) and an empty document assigned.
def self.new_browser(jq=false)
  runtime = new
  scripts = %w{xmlw3cdom_1 xmlw3cdom_2 xmlsax env}
  scripts << 'jquery' if jq
  scripts.each do |script|
    file = "#{BROWSER_PATH}/#{script}.js"
    # the file name and starting line are passed along with the source
    runtime.evaluate IO.read(file), file, 1
  end
  runtime.document = ''
  runtime
end
|
56
|
+
|
57
|
+
def document=(html)
|
58
|
+
evaluate "var document = new DOMDocument(#{html.to_doc.to_xhtml.inspect})"
|
59
|
+
end
|
60
|
+
|
61
|
+
end
|
62
|
+
|
63
|
+
end
|
data/lib/frame.rb
ADDED
@@ -0,0 +1,766 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module HTTPAccessKit
|
3
|
+
|
4
|
+
# Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, ... ) ) =>
|
5
|
+
# Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, ...
|
6
|
+
|
7
|
+
# Raised when paired collections do not line up (e.g. the :zip option).
# +debug+ supplies the values substituted into the format string +str+.
class ZippingError < ArgumentError
  def initialize(debug, str="invalid use of :zip option, uri and body must be an arrays with the same size\n uri: %s(%s), body: %s(%s)")
    message = str % debug
    super(message)
  end
end
|
11
|
+
|
12
|
+
# Raised when a frame is asked for a location it cannot target;
# the default message covers local paths on a non-static frame.
class TargetError < ArgumentError
  def initialize(msg="only static frame can use local paths")
    super(msg)
  end
end
|
16
|
+
|
17
|
+
# Raised for an unusable frame configuration; a message is mandatory.
class ConfigError < ArgumentError
  def initialize(msg)
    super(msg)
  end
end
|
21
|
+
|
22
|
+
class Frame
|
23
|
+
__init__
|
24
|
+
attr_reader :loc, :static, :ss, :opts
|
25
|
+
|
26
|
+
# Builds a frame around a ScoutSquad created from the same arguments.
#
# Accepted arguments, all optional: a target uri String first, an options
# Hash, and an Integer last; 10 is appended when the last argument is not
# an Integer. Options are merged over the defaults
# (:eval, :redir, :cp, :result).
#
# With a String target the frame is static unless :static says otherwise;
# without one it has an empty location and is never static.
def initialize *args
  # Fixnum was removed in Ruby 3.2; Integer matches the same values on
  # every Ruby version.
  args << 10 unless args[-1].is_a?(Integer)
  @opts = {:eval => Johnson::Enabled, :redir => true, :cp => true, :result => Page}.merge!(args[-2].is(Hash) ? args[-2] : {})
  if args[0].is String
    uri = args[0]
    # prefix a scheme when the target has none
    'http://' >> uri if uri !~ /^\w+:\/\//
    @loc = uri.parse:uri
    # be careful, if you set :static => false, frame will be unable to use implicit url
    @static = @opts.fetch(:static, true).b
  else
    @loc = {}
    @static = false
  end
  @ss = ScoutSquad *args
  @pages = []
  Curl.run unless Curl.status
end
|
43
|
+
|
44
|
+
def retarget to, forced=nil
|
45
|
+
to = 'http://' + to if to !~ /^\w+:/
|
46
|
+
@ss.update to, forced
|
47
|
+
@loc = to.parse:uri
|
48
|
+
end
|
49
|
+
|
50
|
+
def target=to
|
51
|
+
retarget to
|
52
|
+
end
|
53
|
+
|
54
|
+
def next() @ss.next end
|
55
|
+
def rand() @ss.rand end
|
56
|
+
def each(&block) @ss.each &block end
|
57
|
+
def [](i) @ss[i] end
|
58
|
+
|
59
|
+
def inspect
|
60
|
+
"<#Frame @ #{@ss.untargeted ? 'no target' : @loc.root}: #{'scout'.x @ss.size}#{', static' if @static}, cookies #{@ss[0].cookieProc ? 'on' : 'off'}>"
|
61
|
+
end
|
62
|
+
|
63
|
+
# opts are :eval, :json, :hash, :wait, :proc_result, :save_result, :load_scripts,
|
64
|
+
# :zip, :thread_safe, :result, :stream, :raw + any opts for Scouts in one hash
|
65
|
+
def get *args, &callback
|
66
|
+
many, order, orders, with_opts = interpret_request *args
|
67
|
+
L.log({:many => many, :order => order, :orders => orders, :with_opts => with_opts})
|
68
|
+
|
69
|
+
if !Johnson::Enabled and with_opts[:eval]
|
70
|
+
L < "failed to use option :eval because Johnson is disabled"
|
71
|
+
with_opts.delete :eval
|
72
|
+
end
|
73
|
+
# JS Runtime is not thread-safe and must be created in curl thread
|
74
|
+
# if we aren't said explicitly about the opposite
|
75
|
+
Johnson::Runtime.set_browser_for_curl with_opts
|
76
|
+
|
77
|
+
if many then exec_many orders, with_opts, &callback
|
78
|
+
else exec_one order, with_opts, &callback end
|
79
|
+
end
|
80
|
+
alias :exec :get
|
81
|
+
alias :run :get
|
82
|
+
|
83
|
+
def interpret_request(*args)
|
84
|
+
body, mp, uri, opts = args.dup.get_opts [nil, false, nil], @opts
|
85
|
+
L.log [body, mp, uri, opts]
|
86
|
+
zip = opts.delete :zip
|
87
|
+
many = order = orders = post = false
|
88
|
+
# Default options set is for POST
|
89
|
+
if mp.is String or mp.kinda Array and !(uri.is String or uri.kinda Array)
|
90
|
+
# if second arg is String, then that's uri
|
91
|
+
uri, mp, post = mp.dup, false, true
|
92
|
+
# L.debug "uri #{uri.inspect} has been passed as second argument instead of third"
|
93
|
+
# But if we have only one argument actually passed
|
94
|
+
# except for options hash, then believe it's GET
|
95
|
+
elsif body.is String or body.kinda [String]
|
96
|
+
L.debug "first parameter (#{body.inspect}) was implicitly taken as uri#{' '+body.class if body.kinda Array}, but last paramter is of type #{uri.class}, too" if uri
|
97
|
+
uri = body.dup
|
98
|
+
elsif !body then uri = nil
|
99
|
+
else
|
100
|
+
uri = uri.dup if uri
|
101
|
+
mp, post = !!mp, true
|
102
|
+
end
|
103
|
+
if post
|
104
|
+
unless body.is Hash or body.kinda [Hash]
|
105
|
+
raise TypeError, "body of post request must be a hash or hash array, params was
|
106
|
+
(#{args.inspect[1..-2]})"
|
107
|
+
end
|
108
|
+
validate_zip uri, body if zip
|
109
|
+
if zip or uri.kinda Array or body.kinda Array
|
110
|
+
many = true
|
111
|
+
if zip or uri.kinda Array
|
112
|
+
validate_some uri
|
113
|
+
orders = zip ? body.zip(uri) : uri.xprod(body, :inverse)
|
114
|
+
else
|
115
|
+
uri = validate uri
|
116
|
+
orders = body.xprod uri
|
117
|
+
end
|
118
|
+
orders.each {|o| o.unshift :loadPost and o.insert 2, mp}
|
119
|
+
else
|
120
|
+
uri = validate uri
|
121
|
+
order = [:loadPost, body, mp, uri]
|
122
|
+
end
|
123
|
+
else
|
124
|
+
if uri.kinda Array
|
125
|
+
many = true
|
126
|
+
validate_some uri
|
127
|
+
orders = [:loadGet].xprod uri
|
128
|
+
else
|
129
|
+
uri = validate uri
|
130
|
+
order = [:loadGet, uri]
|
131
|
+
end
|
132
|
+
end
|
133
|
+
if !order.b and !orders.b
|
134
|
+
raise ArgumentError, "failed to run blank request#{'s' if many}, params was
|
135
|
+
(#{args.inspect[1..-2]})"
|
136
|
+
else
|
137
|
+
opts[:wait] = opts[:sync] if :sync.in opts
|
138
|
+
opts[:wait] = true if !:wait.in(opts) and
|
139
|
+
:proc_result.in(opts) ? !opts[:proc_result] : opts[:save_result]
|
140
|
+
opts[:eval] = false if opts[:json] or opts[:hash] or opts[:raw]
|
141
|
+
opts[:load_scripts] = self if opts[:load_scripts]
|
142
|
+
opts[:stream] = true if opts[:raw]
|
143
|
+
[many, order, orders, opts]
|
144
|
+
end
|
145
|
+
end
|
146
|
+
|
147
|
+
# Fetches the given links through Cache, downloading what is missing.
#
# A trailing :expire argument switches to revalidation: cached entries are
# re-requested with an If-Modified-Since header and refreshed on a 200
# response, otherwise the cached body is used.
# Links whose path matches /ads|count|stats/ are skipped.
#
# Returns the body for a single link (nil when it was skipped or could not
# be loaded), or the collected bodies in link order for several links.
def get_cached(*links)
  res = []
  expire = links[-1] == :expire ? links.pop : false
  # a cache entry is either the body itself or an object with a path to read
  cached_body = lambda {|file| file.is(String) ? file : read(file.path)}
  links.parses(:uri).each_with_index {|uri, i|
    next if uri.path[/ads|count|stats/]
    file = Cache.load uri, !expire
    if !file
      @ss.next.loadGet(uri.href) {|c|
        if c.res.code == 200
          res << [i, (data = c.res.body)]
          Cache.save uri, data, !expire
        end
      }
    elsif expire
      @ss.next.loadGet(uri.href, :headers=>{'If-Modified-Since'=>file.date}) {|c|
        if c.res.code == 200
          res << [i, (data = c.res.body)]
          Cache.save uri, data, false
        else
          res << [i, cached_body[file]]
        end
      }
    else
      res << [i, cached_body[file]]
    end
  }
  Curl.wait
  if links.size == 1
    # res is empty when the only link was skipped or the request failed;
    # res[0][1] used to raise NoMethodError on nil in that case
    res[0] && res[0][1]
  else
    res.sort!.lasts
  end
end
|
178
|
+
|
179
|
+
def get_distr(uri, psize, threads, start=0, print_progress=$verbose)
|
180
|
+
raise ConfigError, "Insufficient Scouts in the Frame for distributed downloading" if @ss.size < 2
|
181
|
+
@print_progress, code, stop_download, @ss_reserve = print_progress, nil, false, []
|
182
|
+
(s = @ss.next).http.on_header {|h|
|
183
|
+
next h.size unless h[/Content-Length: (\d+)|HTTP\/1\.[01] (\d+)[^\r]+|^\s*$/]
|
184
|
+
if code = $2
|
185
|
+
if code != '200'
|
186
|
+
L << "#$& getting #{uri}; interrupting request."
|
187
|
+
s.http.on_header() # set default process
|
188
|
+
next 0
|
189
|
+
end
|
190
|
+
next h.size
|
191
|
+
end
|
192
|
+
|
193
|
+
s.http.on_header() # set default process
|
194
|
+
if !$1 # конец хедера, content-length отсутствует
|
195
|
+
L << "No Content-Length header; trying to load a whole #{uri} at once!"
|
196
|
+
s.loadGet {|c| yield c.res.body.size, 0, c.res.body}
|
197
|
+
next 0
|
198
|
+
end
|
199
|
+
|
200
|
+
len = $1.to_i - start
|
201
|
+
psize = configure_psize(len, psize, threads)
|
202
|
+
parts = (len/psize.to_f).ceil
|
203
|
+
setup_speedometer(uri, parts, len)
|
204
|
+
yield len, psize, :careful_dl if len > (@opts[:careful_dl] || 10.mb)
|
205
|
+
|
206
|
+
@ss_reserve = @ss[threads+1..-1]
|
207
|
+
@ss = @ss[0..threads]
|
208
|
+
(0...parts).each {|n|
|
209
|
+
break if stop_download
|
210
|
+
|
211
|
+
s = @ss.next
|
212
|
+
run_speedometer(s, len, n)
|
213
|
+
s.loadGet(uri, :headers => {
|
214
|
+
'Range' => "bytes=#{start + n*psize}-#{start + (n+1)*psize - 1}"
|
215
|
+
}) {|c|
|
216
|
+
clear_speedometer(s)
|
217
|
+
if c.res.code/10 == 20
|
218
|
+
yield len, n*psize, c.res.body
|
219
|
+
else
|
220
|
+
L << "#{c.res} during get #{uri.inspect}; interrupting request."
|
221
|
+
stop_download = true
|
222
|
+
end
|
223
|
+
}
|
224
|
+
}
|
225
|
+
0
|
226
|
+
}
|
227
|
+
s.raise_err = false
|
228
|
+
s.loadGet validate uri
|
229
|
+
ensure
|
230
|
+
@ss.concat @ss_reserve || []
|
231
|
+
end
|
232
|
+
|
233
|
+
def dl(uri, df=File.basename(uri.parse(:uri).path), psize=:auto, opts={})
|
234
|
+
dled = 0
|
235
|
+
lock = ''
|
236
|
+
callback = lambda {|len, pos, body|
|
237
|
+
if body != :careful_dl
|
238
|
+
begin
|
239
|
+
write(df, body, pos)
|
240
|
+
rescue => e
|
241
|
+
binding.start_interaction
|
242
|
+
raise
|
243
|
+
end
|
244
|
+
if (dled += body.size) == len
|
245
|
+
File.delete lock if File.file? lock
|
246
|
+
yield df if block_given?
|
247
|
+
end
|
248
|
+
else
|
249
|
+
lock = lock_file df, len, pos # filename, filesize, partsize
|
250
|
+
end
|
251
|
+
}
|
252
|
+
opts[:threads] ||= @ss.size-1
|
253
|
+
get_distr(uri, psize, opts[:threads], opts[:start].to_i, &callback)
|
254
|
+
Curl.wait unless block_given?
|
255
|
+
df
|
256
|
+
end
|
257
|
+
|
258
|
+
# Downloads +uri+ into the file +df+ in ranged parts, tracking progress in
# the "<df>.map" file through check_mapfile/write_mapfile.
#
# opts: :psize (default :auto), :threads (default 1), :print_progress
# (default $verbose), plus optional :len, :start and :scout.
#
# When the length is known neither from opts[:len] nor from the mapfile,
# a HEAD request is issued first and the method re-enters itself with the
# Content-Length it reports. The optional block is yielded +df+ once
# write_mapfile reports the file complete.
def simple_dl(uri, df=File.basename(uri.parse(:uri).path), opts={}, &block)
  opts.reverse_merge! :psize => :auto, :threads => 1, :print_progress => $verbose
  L << opts

  @print_progress = opts[:print_progress]
  unless len = opts[:len] || (map = read_mapfile(df) and map.len)
    return @ss.next.loadHead(uri) {|c| $log << c
      if len = c.res['Content-Length']
        # forward the completion block; it used to be dropped here
        simple_dl(uri, df, opts.merge(:len => len.to_i), &block)
      else L.warn "Can't get file size, so it has no sence to download this way. Or maybe it's just an error. Check ObjectSpace.find(#{c.res.object_id}) out."
      end
    }
  end

  psize, parts = check_mapfile(df, opts)
  return unless psize
  L << [psize, parts]
  setup_speedometer(uri, parts.size, len)

  obtained uri do |uri|
    if opts[:threads] == 1
      # nil.to_i is 0, which is truthy, so `opts[:start].to_i || ...` never
      # reached the fallbacks; only convert an explicitly given :start.
      start = (opts[:start] && opts[:start].to_i) || (parts[0] && parts[0].begin) || 0
      scout = opts[:scout] || @ss.next
      $log << [uri, scout]
      # sequential chain: each finished part requests the next one
      (loadget = lambda {|n|
        run_speedometer(scout, len, n)
        from = start + n*psize
        to = start + (n+1)*psize - 1
        scout.loadGet(uri, :headers => {'Range' => "bytes=#{from}-#{to}"}) {|c|
          begin
            $log << "writing #{df} from #{from}: #{c.res.body.inspect}"
            write(df, c.res.body, from)
          rescue => e
            binding.start_interaction
            raise
          end
          if write_mapfile(df, from, to)
            clear_speedometer(scout)
            L.warn "file completely dl'ed, but (n+1)*psize <= len: (#{n}+1)*#{psize} <= #{len}" if (n+1)*psize <= len
            yield df if block_given?
          elsif (n+1)*psize <= len
            loadget[n+1]
          end
        }
      })[0]
    else
      # several threads: one raw ranged request per remaining part
      exec(uri, opts.merge(:raw => true, :ranges => parts)) {|c|
        L << c.res
        range = c.req.range
        begin
          write(df, c.res.body, range.begin)
        rescue => e
          binding.start_interaction
          raise
        end
        if write_mapfile(df, range.begin, range.end)
          @ss.each {|s| s.http.on_progress} if @print_progress
          yield df if block_given?
        end
      }
    end
  end
end
|
321
|
+
|
322
|
+
def check_mapfile(df, opts={})
|
323
|
+
opts.reverse_merge! :psize => :auto, :threads => 1
|
324
|
+
map = read_mapfile df
|
325
|
+
if map
|
326
|
+
L << map
|
327
|
+
if map.rest.empty?
|
328
|
+
puts "#{df} is loaded"
|
329
|
+
$log << 'deleting mapfile'
|
330
|
+
File.delete df+'.map'
|
331
|
+
[]
|
332
|
+
else
|
333
|
+
if opts[:len] and map.len != opts[:len]
|
334
|
+
raise "Incorrect file size for #{df}"
|
335
|
+
end
|
336
|
+
psize = configure_psize *opts.values_at(:len, :psize, :threads)
|
337
|
+
[psize, map.rest.div(psize)]
|
338
|
+
end
|
339
|
+
else
|
340
|
+
write_mapfile df, opts[:len]
|
341
|
+
psize = configure_psize *opts.values_at(:len, :psize, :threads)
|
342
|
+
$log << (0...opts[:len]).div(psize)
|
343
|
+
[psize, (0...opts[:len]).div(psize)]
|
344
|
+
end
|
345
|
+
end
|
346
|
+
|
347
|
+
def read_mapfile(df)
|
348
|
+
df += '.map'
|
349
|
+
text = read df
|
350
|
+
$log << "mapfile read: #{text}"
|
351
|
+
if text.b
|
352
|
+
text[/^(\d+)\0+(\d+)\0*\n/]
|
353
|
+
map = {}
|
354
|
+
$log << [$1,$2]
|
355
|
+
if $1 and $1 == $2
|
356
|
+
map.rest = []
|
357
|
+
else
|
358
|
+
map.len, *map.parts = text.chop/"\n"
|
359
|
+
map.len = map.len.to_i
|
360
|
+
map.parts.map! {|part| part /= '-'; part[0].to_i..part[1].to_i}
|
361
|
+
$log << map.parts
|
362
|
+
map.rest = (0...map.len) - XRange(*map.parts)
|
363
|
+
end
|
364
|
+
map
|
365
|
+
end
|
366
|
+
end
|
367
|
+
|
368
|
+
def write_mapfile(df, *args)
|
369
|
+
df += '.map'
|
370
|
+
map = ''
|
371
|
+
if args.size != 2
|
372
|
+
len = args.shift
|
373
|
+
map << len.to_s.ljust(22, "\0") << "\n" if File.file? df
|
374
|
+
end
|
375
|
+
if args.any?
|
376
|
+
read(df)[/^(\d+)\0+(\d+)\0*\n/]
|
377
|
+
$log << "mapfile read"
|
378
|
+
$log << [$1,$2]
|
379
|
+
dled = $2.to_i + args[1] - args[0] + 1
|
380
|
+
return true if dled == $1.to_i
|
381
|
+
map << "#{args[0]}..#{args[1]}\n"
|
382
|
+
$log << 'writing mapfile'
|
383
|
+
write(df, dled.to_s.ljust(11, "\0"), 11)
|
384
|
+
end
|
385
|
+
$log << [df, map]
|
386
|
+
$log << 'writing mapfile'
|
387
|
+
write df, map
|
388
|
+
nil
|
389
|
+
end
|
390
|
+
|
391
|
+
# Resolves the part-size setting to a number of bytes.
#
# * a Numeric is truncated to an integer;
# * :auto gives the whole length up to 100000 bytes, otherwise the length
#   divided by +threads+ plus one;
# * :mb gives 1.mb.
#
# Anything else raises ArgumentError.
def configure_psize(len, psize, threads)
  return psize.to_i if Numeric === psize
  return (len > 100000 ? len/threads+1 : len) if :auto === psize
  return 1.mb if :mb === psize
  raise ArgumentError, "Incorrect value for part size #{psize}:#{psize.class}"
end
|
399
|
+
|
400
|
+
private
|
401
|
+
# Checks the arguments of a :zip request: both +uri+ and +body+ must pass
# the `kinda Array` check and have equal sizes. Raises ZippingError with
# the offending classes (and sizes, when both are arrays) otherwise.
def validate_zip(uri, body)
  unless uri.kinda(Array) && body.kinda(Array)
    raise ZippingError, [uri.class, nil, body.class, nil]
  end
  if uri.size != body.size
    raise ZippingError, [uri.class, uri.size, body.class, body.size]
  end
end
|
408
|
+
|
409
|
+
def validate(uri)
|
410
|
+
if uri
|
411
|
+
loc = uri.parse:uri
|
412
|
+
if loc.root and loc.root != @loc.root
|
413
|
+
raise TargetError, "failed to get #{uri} by static frame #{@loc.host}, you should first update it with new target" if @static
|
414
|
+
@loc.root = loc.root
|
415
|
+
uri
|
416
|
+
elsif !loc.root
|
417
|
+
raise TargetError if !@static
|
418
|
+
File.join @loc.root, uri
|
419
|
+
else uri
|
420
|
+
end
|
421
|
+
else
|
422
|
+
raise TargetError if !@static
|
423
|
+
@loc.href
|
424
|
+
end
|
425
|
+
end
|
426
|
+
|
427
|
+
def validate_some(uris)
|
428
|
+
uris.map! {|u| validate u}
|
429
|
+
end
|
430
|
+
|
431
|
+
def exec_one(order, opts)
|
432
|
+
# must result in Page (default) or it's subclass
|
433
|
+
page = opts[:result].new
|
434
|
+
# if no spare scouts can be found, squad simply waits for all callbacks to complete
|
435
|
+
s = @ss.next
|
436
|
+
#s.raise_err = true# Зачем это тут? Можно добавлять :raise=>1 фрейму при запиле
|
437
|
+
s.send(*(order << opts)) {|curl|
|
438
|
+
if opts[:raw]
|
439
|
+
yield curl
|
440
|
+
elsif page.process(curl, opts) and block_given?
|
441
|
+
yres = yield page
|
442
|
+
if opts[:save_result] or :proc_result.in opts
|
443
|
+
page.res = yres
|
444
|
+
end
|
445
|
+
if opts[:proc_result].is Proc and yres != :skip
|
446
|
+
opts[:proc_result].call yres
|
447
|
+
end
|
448
|
+
end
|
449
|
+
}
|
450
|
+
if opts[:wait]
|
451
|
+
opts[:thread_safe] ? $Carier.perform : Curl.wait
|
452
|
+
# почему бы не уменьшить бойлерплейт в сервисах и не возвращать res сразу?
|
453
|
+
(opts[:save_result] or :proc_result.in opts) ? page.res : page
|
454
|
+
else page
|
455
|
+
end
|
456
|
+
end
|
457
|
+
|
458
|
+
def exec_many(orders, with_opts, &callback)
|
459
|
+
w = with_opts.delete :wait
|
460
|
+
iterator = with_opts[:stream] ? :each : :map
|
461
|
+
if with_opts[:ranges]
|
462
|
+
if orders.size != with_opts[:ranges].size
|
463
|
+
raise ZippingError, [orders.size, with_opts[:ranges].size], "orders quantity (%s) is not equal ranges quantity (%s)"
|
464
|
+
end
|
465
|
+
pages = orders.zip(with_opts[:ranges]).send(iterator) {|order, range|
|
466
|
+
(with_opts[:headers] ||= {}).Range = "bytes=#{range.begin}-#{range.end}"
|
467
|
+
exec_one order, with_opts, &callback
|
468
|
+
}
|
469
|
+
else
|
470
|
+
pages = orders.send(iterator) {|order| exec_one order, with_opts, &callback }
|
471
|
+
end
|
472
|
+
with_opts[:thread_safe] ? $Carier.perform : Curl.wait if w
|
473
|
+
with_opts[:stream] || pages
|
474
|
+
end
|
475
|
+
|
476
|
+
|
477
|
+
def setup_speedometer(uri, parts, len)
|
478
|
+
return unless @print_progress
|
479
|
+
@progress = Array.new(parts, 0)
|
480
|
+
@stop_print, @speed, @sum, *@speedometer = false, '', 0, Time.now, 0
|
481
|
+
@str = "Downloading #{uri.gsub '%', '%%'} (#{len.bytes}) in %03s streams, %07s/s:"
|
482
|
+
@bs = "\b\r"*(@newlines = (uri.unpack('U*').size+len.bytes.size+42)/(ENV['COLUMNS'] || 80).to_i)
|
483
|
+
Thread.new {
|
484
|
+
until @stop_print
|
485
|
+
sleep 0.2
|
486
|
+
now = Time.now
|
487
|
+
if now > @speedometer[0] and @sum > @speedometer[1]
|
488
|
+
@speed.replace(((@sum - @speedometer[1])/(now - @speedometer[0])).to_i.bytes)
|
489
|
+
@speedometer.replace [now, @sum]
|
490
|
+
end
|
491
|
+
end
|
492
|
+
}
|
493
|
+
end
|
494
|
+
|
495
|
+
def run_speedometer(scout, len, n)
|
496
|
+
return unless @print_progress
|
497
|
+
scout.http.on_progress {|dl_need, dl_now, *ul|
|
498
|
+
if !@stop_print
|
499
|
+
@progress[n] = dl_now
|
500
|
+
percents = (@sum = @progress.sum)*100/len
|
501
|
+
print @str%[@progress.select_b.size, @speed]+"\n%%[#{'@'*percents}#{' '*(100-percents)}]\r\b\r"+@bs
|
502
|
+
if percents == 100
|
503
|
+
puts "\v"*@newlines
|
504
|
+
@stop_print = true
|
505
|
+
end
|
506
|
+
end
|
507
|
+
true
|
508
|
+
}
|
509
|
+
end
|
510
|
+
|
511
|
+
def clear_speedometer(scout)
|
512
|
+
return unless @print_progress
|
513
|
+
scout.http.on_progress
|
514
|
+
end
|
515
|
+
|
516
|
+
end
|
517
|
+
|
518
|
+
# Convenience downloader: starts Curl, builds a Frame with the given timeout
# and thread count, and delegates to Frame#dl.
#
# uri     - URL to download.
# df      - destination file name; defaults to the basename of the URL path.
# threads - passed both to the Frame constructor and to Frame#dl.
# timeout - stored under :timeout in the Frame options.
# block   - forwarded to Frame#dl.
def dl(uri, df=File.basename(uri.parse(:uri).path), threads=5, timeout=600, &block)
  Curl.run
  frame_opts = {:timeout => timeout}
  downloader = Frame(frame_opts, threads)
  downloader.dl(uri, df, :auto, threads, &block)
end
|
522
|
+
module_function :dl
|
523
|
+
|
524
|
+
|
525
|
+
|
526
|
+
class Page
|
527
|
+
# for debug, just enable L#debug, don't write tons of chaotic log-lines
|
528
|
+
__init__
|
529
|
+
# res here is result of page processing made in frame context
|
530
|
+
attr_accessor :title, :res
|
531
|
+
attr_reader :html, :loc, :hash, :doc, :js
|
532
|
+
@@ignore = /google|_gat|tracker|adver/i
|
533
|
+
|
534
|
+
# Builds a page either from a finished request or from raw content.
#
# obj - a Curl::Easy, a Scout (its #http handle is used), or anything else,
#       which is stored as-is in @html.
# loc - a Hash, or a value that responds to parse(:uri); used as @loc for raw
#       content and as the options passed to #process for request objects.
# js  - JavaScript runtime kept in @js.
def initialize(obj='', loc=Hash.new(''), js=$JSRuntime||Johnson::Runtime.new)
  loc = loc.parse(:uri) unless loc.is Hash
  @js = js
  if obj.is(Curl::Easy) || obj.kinda(Scout)
    # Fixed: a bare Curl::Easy must be processed itself. The previous code
    # passed `html` (the reader for @html, still nil here) to #process.
    c = obj.kinda(Scout) ? obj.http : obj
    @html = ''
    # Passing loc directly would hand #process the default Hash.new(''), which
    # answers '' for every key; loc.b || {} is used instead.
    process(c, loc.b || {})
  else
    @html = obj
    @loc = loc
  end
end
|
547
|
+
|
548
|
+
# Short human-readable description of the page.
# When @hash has been set (json / params-hash mode) it reports the size of
# the parsed data or 'failed to parse'; otherwise it reports the title and
# size of the html and whether a js runtime and a parsed document are present.
def inspect
  unless @hash.nil?
    size_note = @hash ? @hash.inspect.size.bytes : 'failed to parse'
    mode_note = @json ? 'json' : 'params hash'
    return "<#FramePage (#{size_note}) #{mode_note}>"
  end

  summary = @html.b ? "«#{@title}» (#{@html.size.bytes}" : '(empty'
  js_note = (@js && @doc) ? ' js enabled' : nil
  "<#FramePage #{summary})#{js_note}>"
end
|
555
|
+
|
556
|
+
# We can then alternate #process in Page subclasses
|
557
|
+
# Frame doesn't mind about value returned by #process
|
558
|
+
# Fills the page from a finished request.
#
# c    - request handle answering #last_effective_url and #res (with #code
#        and #body).
# opts - :json          parse the body as JSON into @hash;
#        :hash          parse the body into @hash via String#to_hash when
#                       body.inline is truthy;
#        :eval          after building the document, load external scripts
#                       (using opts[:load_scripts]) and evaluate inline ones.
#
# Only responses with code 200 are processed. When JSON or params parsing
# fails, @hash is set to false and the body is kept as html and parsed into
# @doc for inspection. Always returns self.
def process(c, opts={})
  @loc = c.last_effective_url.parse(:uri)
  L.debug "#{@loc.fullpath} -> #{c.res}"
  return self unless c.res.code == 200

  body = c.res.body
  if opts[:json]
    @json = true
    @hash =
      begin
        body.from_json
      rescue StandardError
        false
      end
    # A bare String result is treated as a parse failure too.
    if !@hash || @hash.is(String)
      L.debug "failed to get json from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
      @html = body
      to_doc
      @hash = false
    end
  elsif opts[:hash]
    if body.inline
      @hash = body.to_hash
    else
      @hash = false
      L.debug "failed to get params hash from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
      @html = body
      to_doc
    end
  else
    @html = body
    to_doc
    if opts[:eval]
      load_scripts opts[:load_scripts]
      eval_js
    end
  end
  self
end
|
594
|
+
|
595
|
+
# Evaluates the page's <script> elements in the js runtime.
# First seeds document/window location from @loc, then walks every script
# node: inline code is evaluated, anything it left in js[:write_output] is
# inserted after the node, and the node is removed; a script without inline
# code is fetched through frame.get_cached when a frame is given and the node
# has a src.
def eval_js(frame=nil)
  eval_string "document.location = window.location = #{@loc.to_json};
document.URL = document.baseURI = document.documentURI = location.href;
document.domain = location.host;"
  find("script").each do |script|
    inline_code = script.text.strip
    L.debug inline_code
    if (source = inline_code.b)
      js[:write_output] = ''
      eval_string source
      written = js[:write_output].b
      script.after written if written
      script.remove!
    elsif frame && script.src
      eval_string frame.get_cached(expand_link(script.src))
    end
  end
end
|
611
|
+
|
612
|
+
# Evaluates +str+ in the page's js runtime, creating the runtime on first use.
# A Johnson::Error is not propagated: its message is logged as a warning and,
# at debug level, the offending source is logged with the identifier named in
# the error message highlighted.
#
# str - JavaScript source. NOTE: L.clr.hl! is a bang method and appears to
#       highlight +str+ in place when debug logging is active.
def eval_string(str)
  @js ||= Johnson::Runtime.new
  L.debug "#{@js} evaluating in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
  begin
    @js.evaluate(str)
  rescue Johnson::Error => e
    L.warn e.message
    L.debug {
      if m = e.message.match(/(\w+) is undefined|([\w.]+) is not a function/)
        # The captured name may contain dots ("foo.bar"); escape it so they
        # are matched literally rather than as regexp wildcards.
        L.clr.hl! str, /\b#{Regexp.escape(m[1] || m[2])}\b/
      end
      "\n\t#{str}"
    }
  end
end
|
627
|
+
|
628
|
+
# Parses @html into @doc and derives @title.
# When the document has no title, the page URL is used and a <title> node is
# prepended to <head> if there is one. A title longer than 40 characters is
# cut after roughly 30 characters (at the end of the current word) and gets a
# trailing ellipsis; cyrillic titles are measured and cut in their ANSI form.
# Returns @doc.
def to_doc
  @doc = @html.to_doc :forceutf
  if (@title = @doc.title.b)
    if @title.cyr? && UTF2ANSI[@title].size > 40
      @title = ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/]] + '…'
    elsif @title.size > 40
      @title = @title[/.{1,30}\S*/] + '…'
    end
  else
    @title = @loc.href
    head = @doc.at('head')
    head.prepend XML::Node('title', @title) if head
  end
  @doc
end
|
642
|
+
|
643
|
+
# All nodes matching the query +xp+; the document is parsed on first use.
def find(xp)
  document = @doc || to_doc
  document.find xp
end

# First node matching the query +xp+; the document is parsed on first use.
def at(xp)
  document = @doc || to_doc
  document.at xp
end

# Address of the page as reported by @loc.
def url
  @loc.href
end
alias :href :url
|
649
|
+
|
650
|
+
# Collects expanded, de-duplicated src values.
#
# links - a String query (the src of every matching node is taken) or a
#         ready-made collection of links. If evaluating the query raises
#         XML::Error, the string itself is treated as a single link.
def get_srcs(links='img')
  if links.is String
    begin
      links = find(links).map { |node| node.src }
    rescue XML::Error
      links = [links]
    end
  end
  links.map { |src| expand_link src }.uniq
end
|
658
|
+
|
659
|
+
# Expanded src of the first node matching +link+.
#
# link - a String query, or an already extracted link. Returns nil when the
#        query matches nothing or the node has no src. If evaluating the query
#        raises XML::Error, the string is kept and expanded as a link.
def get_src(link='img')
  if link.is String
    begin
      node = at(link)
      link = node && node.src
    rescue XML::Error
      # +link+ keeps its original value and is expanded below.
    end
  end
  expand_link link if link
end
|
666
|
+
|
667
|
+
# Collects expanded, de-duplicated href values.
#
# links - a String query or a ready-made collection of links. For a query the
#         href of each matching node is taken; when that list is blank, the
#         hrefs of <a> descendants of the matches are used instead. If
#         evaluating the query raises XML::Error, the string itself is treated
#         as a single link.
def get_links(links='a')
  if links.is String
    begin
      direct = find(links).map { |node| node.href }.b
      links = direct || find(links + '//a').map { |node| node.href }
    rescue XML::Error
      links = [links]
    end
  end
  links.map { |href| expand_link href }.uniq
end
|
675
|
+
|
676
|
+
# Expanded href of the first node matching +link+.
#
# link - a String query, or an already extracted link. For a query, the href
#        of the first match is used; when it has none, the href of the first
#        <a> below the match is tried.
#
# Returns nil when nothing matches or no href can be found. Previously a
# match without an href and without a nested <a> raised NoMethodError on nil.
# If evaluating the query raises XML::Error, the string is kept and expanded
# as a link.
def get_link(link='a')
  if link.is String
    begin
      node = at(link)
      link = node && (node.href || ((nested = at(link + '//a')) && nested.href))
    rescue XML::Error
      # +link+ keeps its original value and is expanded below.
    end
  end
  expand_link link if link
end
|
683
|
+
alias :get_hrefs :get_links
|
684
|
+
alias :links :get_links
|
685
|
+
alias :get_href :get_link
|
686
|
+
alias :link :get_link
|
687
|
+
|
688
|
+
# Resolves +link+ against the page location:
#   * links that already start with a scheme ("word://") are returned as-is;
#   * links starting with "//" get @loc.protocol prepended;
#   * links starting with "/" get @loc.root prepended;
#   * anything else is joined to the directory of @loc.path, or to @loc.root
#     when the path is blank.
def expand_link(link)
  case link
  when /^\w+:\/\//
    link
  when /^\/\//
    @loc.protocol + link
  when /^\//
    @loc.root + link
  else
    base = @loc.path.b ? File.dirname(@loc.path) : @loc.root
    File.join(base, link)
  end
end
|
696
|
+
|
697
|
+
# Builds the request arguments needed to submit a form found on this page.
#
# form - a String query locating the form, :self (the form whose action is
#        the current path), or an already located form node.
# hash - field values merged over the values read from the form's inputs.
# opts - passed through untouched as the last element of the result.
#
# Returns [fields, multipart_match, action, opts] for POST forms and
# [action_with_query, opts] otherwise.
# Raises XML::Error when a String query matches nothing.
def form(form='form', hash={}, opts={})
  form = "[action=#{@loc.path.inspect}]" if form == :self
  if form.is String
    form_node = at form
    raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node
  else
    form_node = form
  end
  hash = form_node.inputs_all.merge!(hash)
  action = expand_link(form_node.action || @loc.path)
  # A form without a method attribute is submitted with GET; previously the
  # missing attribute crashed on nil.downcase.
  http_method = (form_node['method'] || 'get').to_s.downcase
  if http_method == 'post'
    [hash, form_node.enctype =~ /multipart/, action, opts]
  else
    action = "#{action}#{action['?'] ? '&' : '?'}#{hash.urlencode}" if hash.b
    [action, opts]
  end
end
|
713
|
+
|
714
|
+
# Submits a form from this page through +frame+.
#
# form     - anything accepted by #form (query string, :self or a form node).
# frame    - object that performs the request (#exec) and can be pointed at
#            another address (#retarget).
# hash     - field values overriding those read from the form.
# opts     - request options; a Referer header with this page's address is
#            added under opts[:headers] unless one is already set.
# callback - forwarded to frame.exec.
#
# A static frame whose current address differs from the form target is
# retargeted for the request and restored afterwards, even if exec raises.
# Returns whatever frame.exec returns.
def submit(form, frame, hash={}, opts={}, &callback)
  # Headers live under :headers (the key the Frame code uses when it sets
  # Range); the previous :header key did not match it.
  (opts[:headers] ||= {}).Referer ||= @loc.href if @loc
  query = form(form, hash, opts)

  curr_target = frame.loc.href
  # POST queries carry the action third, GET queries carry it first.
  new_target = query[2] || query[0]
  need_retargeting = frame.static && curr_target != new_target
  frame.retarget new_target if need_retargeting
  begin
    frame.exec(*query, &callback)
  ensure
    frame.retarget curr_target, :forced if need_retargeting
  end
end
|
726
|
+
|
727
|
+
# Fetches every external script referenced by the page through
# frame.get_cached and evaluates each result in the js runtime.
# Without a frame nothing happens and the falsy argument is returned.
def load_scripts(frame)
  return frame unless frame

  sources = frame.get_cached(*get_srcs("script[src]"))
  sources.each { |source| eval_string source }
end
|
730
|
+
|
731
|
+
end
|
732
|
+
|
733
|
+
# using reprocessing of page in case of non-200 response:
|
734
|
+
# page_class = ReloadablePage do
|
735
|
+
# @res and @res.code != 200
|
736
|
+
# end
|
737
|
+
# Builds an anonymous Page subclass whose #process retries the request when
# +reload_condition+, evaluated in the context of the request handle, is
# truthy.
#
# The generated #process returns nil after asking for a retry and self
# otherwise. It accepts the same arguments as Page#process: the options may
# now be omitted (previously a one-argument call raised ArgumentError).
def ReloadablePage(&reload_condition)
  rp = Class.new Page
  rp.send :define_method, :process do |curl, *rest|
    opts = rest.first
    super(curl, opts || {})
    if curl.instance_eval(&reload_condition)
      curl.retry!
      nil # a nil result tells the caller not to proceed with its callback
    else
      self
    end
  end
  rp
end
|
749
|
+
|
750
|
+
end
|
751
|
+
|
752
|
+
|
753
|
+
|
754
|
+
|
755
|
+
|
756
|
+
|
757
|
+
|
758
|
+
|
759
|
+
|
760
|
+
|
761
|
+
|
762
|
+
|
763
|
+
|
764
|
+
|
765
|
+
|
766
|
+
|