rhack 0.4.1 → 1.0.0.rc4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +22 -0
- data/Gemfile +2 -5
- data/LICENSE +19 -15
- data/README.md +66 -26
- data/Rakefile +42 -31
- data/config/cacert.pem +3895 -0
- data/config/rhack.yml.template +40 -0
- data/ext/curb-original/curb_config.h +3 -0
- data/ext/curb-original/curb_easy.c +3 -54
- data/ext/curb-original/curb_multi.c +69 -140
- data/ext/curb/curb_multi.c +1 -1
- data/lib/rhack.rb +82 -12
- data/lib/rhack/cookie.rb +49 -0
- data/lib/rhack/curl.rb +6 -0
- data/lib/{extensions/curb.rb → rhack/curl/easy.rb} +26 -48
- data/lib/rhack/curl/global.rb +175 -0
- data/lib/rhack/curl/itt.rb +11 -0
- data/lib/rhack/curl/multi.rb +37 -0
- data/lib/rhack/curl/post_field.rb +20 -0
- data/lib/rhack/curl/response.rb +91 -0
- data/lib/rhack/dl.rb +308 -0
- data/lib/rhack/frame.rb +316 -0
- data/lib/{extensions → rhack/js}/browser/env.js +0 -0
- data/lib/{extensions → rhack/js}/browser/jquery.js +0 -0
- data/lib/{extensions → rhack/js}/browser/xmlsax.js +0 -0
- data/lib/{extensions → rhack/js}/browser/xmlw3cdom_1.js +0 -0
- data/lib/{extensions → rhack/js}/browser/xmlw3cdom_2.js +0 -0
- data/lib/rhack/js/johnson.rb +71 -0
- data/lib/rhack/page.rb +263 -0
- data/lib/rhack/proxy.rb +3 -0
- data/lib/rhack/proxy/checker.rb +1 -1
- data/lib/rhack/scout.rb +342 -0
- data/lib/rhack/scout_squad.rb +98 -0
- data/lib/rhack/services.rb +1 -464
- data/lib/rhack/services/base.rb +59 -0
- data/lib/rhack/services/examples.rb +423 -0
- data/lib/rhack/version.rb +3 -0
- data/lib/rhack_in.rb +3 -2
- data/rhack.gemspec +28 -0
- metadata +104 -85
- data/.gemtest +0 -0
- data/Gemfile.lock +0 -23
- data/Manifest.txt +0 -60
- data/ext/curb/Makefile +0 -217
- data/lib/cache.rb +0 -44
- data/lib/curl-global.rb +0 -164
- data/lib/extensions/declarative.rb +0 -153
- data/lib/extensions/johnson.rb +0 -63
- data/lib/frame.rb +0 -848
- data/lib/init.rb +0 -49
- data/lib/rhack.yml.template +0 -19
- data/lib/scout.rb +0 -589
- data/lib/words.rb +0 -25
data/lib/cache.rb
DELETED
@@ -1,44 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
module HTTPAccessKit
|
3
|
-
|
4
|
-
class Cache < ActiveRecord::Base
|
5
|
-
declare CacheTable do |t|
|
6
|
-
t.integer :url_hash
|
7
|
-
t.string :url
|
8
|
-
t.string :path
|
9
|
-
t.string :date
|
10
|
-
t.string :ext
|
11
|
-
t.timestamps
|
12
|
-
end if DB
|
13
|
-
RAMCache = {}
|
14
|
-
|
15
|
-
def self.clean(time=7.days)
|
16
|
-
destroy_all("created_at < '#{time.ago}'").each {|c|
|
17
|
-
FileUtils.remove c.path if c.path and File.file?(c.path)}
|
18
|
-
end
|
19
|
-
CacheTTL and clean CacheTTL
|
20
|
-
|
21
|
-
def self.save(url, data, cache_data=true)
|
22
|
-
new(url, data).save
|
23
|
-
RAMCache[url.href] = data if cache_data
|
24
|
-
end
|
25
|
-
|
26
|
-
def self.load(url, cache_data=true)
|
27
|
-
if data = RAMCache[url.href]
|
28
|
-
data
|
29
|
-
elsif file = first(:select => 'date, path', :conditions => {:url_hash => url.href.hash})
|
30
|
-
RAMCache[url.href] = read(file.path) if cache_data
|
31
|
-
file
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
def initialize(url, data)
|
36
|
-
t = Time.now
|
37
|
-
path = "#{CacheDir}/#{t.to_i}-#{File.split(url.path)[1]}"
|
38
|
-
rw path, data
|
39
|
-
super :url => url.href, :url_hash => url.href.hash, :date => t.httpdate, :path => path, :ext => url.ext
|
40
|
-
end
|
41
|
-
|
42
|
-
end
|
43
|
-
|
44
|
-
end
|
data/lib/curl-global.rb
DELETED
@@ -1,164 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
module Curl
|
3
|
-
|
4
|
-
def execute(unless_allready=false)
|
5
|
-
if unless_allready and Curl.status
|
6
|
-
return L.log "Non-nil status! Avoid executing"
|
7
|
-
end
|
8
|
-
if $CarierThread and s = $CarierThread.status
|
9
|
-
L.log "Carier thread allready started and has status #{s}"
|
10
|
-
else
|
11
|
-
if s = Curl.status(false) then L.warn s end
|
12
|
-
L.log($CarierThread ? "Resetting Carier thread" : "Setting Carier thread up")
|
13
|
-
$CarierThread = Thread.new {
|
14
|
-
error = nil
|
15
|
-
begin
|
16
|
-
# "why Thread#value is raising since it never raised before?"
|
17
|
-
yield if block_given?
|
18
|
-
rescue => error
|
19
|
-
nil
|
20
|
-
end
|
21
|
-
loop {
|
22
|
-
begin
|
23
|
-
# with true argument (idle) it would break only if no requests to perform
|
24
|
-
break unless $Carier.perform true
|
25
|
-
L.log "Nothing to perform; idling..."
|
26
|
-
rescue => error
|
27
|
-
break
|
28
|
-
# but ruby mystically crashes if next sequence occur:
|
29
|
-
# Multi performs and can't see any requests so entering idle mode
|
30
|
-
# we add some requests and multi load them
|
31
|
-
# one of requests' callbacks raises error in *main* thread
|
32
|
-
# so we can't allow any raises here, instead, keep them in 'wait' section
|
33
|
-
end
|
34
|
-
} unless error
|
35
|
-
error
|
36
|
-
}
|
37
|
-
# until main thread has sleep a bit, $CarierThread will have status "run",
|
38
|
-
# no matter whether it's idling or performing requests
|
39
|
-
sleep 0.001
|
40
|
-
end
|
41
|
-
end
|
42
|
-
alias :run :execute
|
43
|
-
module_function :execute, :run
|
44
|
-
|
45
|
-
def wait
|
46
|
-
if $CarierThread and $CarierThread.status
|
47
|
-
unless within = Thread.current == $CarierThread
|
48
|
-
# We can't set `perform' timeout lesser than 1 second in the curl binding
|
49
|
-
# because in that case thread status would always be "run"
|
50
|
-
# so here we wait for exactly 1 sec
|
51
|
-
sleep 1
|
52
|
-
end
|
53
|
-
# Also, if thread do Kernel.sleep, it would skip Curl.wait here
|
54
|
-
if !$Carier.sheduled and ($CarierThread.status == 'sleep' or within && $Carier.reqs.empty?)
|
55
|
-
L.log "No shedule to wait"
|
56
|
-
else
|
57
|
-
this_thread = within ? 'it\'s thread' : Thread.main == Thread.current ? 'main thread' : 'thread '+Thread.current.object_id
|
58
|
-
L.log "Waiting for Carier to complete in #{this_thread}"
|
59
|
-
begin
|
60
|
-
L.log { "Trying to change $CarierThreadIsJoined #{$CarierThreadIsJoined} -> true from #{this_thread}" }
|
61
|
-
if within
|
62
|
-
L.log "calling this from one of callbacks to wait for the rest to complete"
|
63
|
-
begin
|
64
|
-
$Carier.perform
|
65
|
-
rescue RuntimeError => e
|
66
|
-
L.warn [e, e.message]
|
67
|
-
L.info "$Carier $Carier.sheduled $CarierThread $CarierThread.status", binding
|
68
|
-
L.warn "Failed to run Multi#perform: nothing to perform"
|
69
|
-
end
|
70
|
-
else
|
71
|
-
$CarierThreadIsJoined = true
|
72
|
-
$CarierThread.join
|
73
|
-
end
|
74
|
-
rescue (defined?(IRB) ? IRB::Abort : NilClass)
|
75
|
-
recall!
|
76
|
-
L.info "Carier thread recalled by keyboard"
|
77
|
-
ensure
|
78
|
-
L.log "trying to change $CarierThreadIsJoined #{$CarierThreadIsJoined} -> false from #{this_thread}"
|
79
|
-
if !within
|
80
|
-
$CarierThreadIsJoined = false
|
81
|
-
# using Curl#execute from different threads may cause problems here when you don't control input,
|
82
|
-
# for example, in a daemonized ruby process
|
83
|
-
# just do not get $CarierThread joined from non-main thread
|
84
|
-
if $CarierThread and e = $CarierThread.value
|
85
|
-
# this will raise thread-safely in main thread
|
86
|
-
# in case of unrescued error in CarierThread
|
87
|
-
L.log(([e.message]+RMTools.format_trace(e.backtrace))*"\n")
|
88
|
-
recall!
|
89
|
-
raise e
|
90
|
-
end
|
91
|
-
execute
|
92
|
-
end
|
93
|
-
end
|
94
|
-
end
|
95
|
-
else
|
96
|
-
L < "No thread to wait. I guess I should create one"
|
97
|
-
execute
|
98
|
-
wait
|
99
|
-
end
|
100
|
-
end
|
101
|
-
module_function :wait
|
102
|
-
|
103
|
-
def recall
|
104
|
-
L.debug caller
|
105
|
-
if $CarierThread
|
106
|
-
L.log "Recalling Carier thread"
|
107
|
-
$CarierThread.kill
|
108
|
-
sleep 1
|
109
|
-
else
|
110
|
-
L.log "No thread to recall"
|
111
|
-
end
|
112
|
-
end
|
113
|
-
alias :stop :recall
|
114
|
-
|
115
|
-
def recall!
|
116
|
-
if $CarierThread
|
117
|
-
L.warn "Recalling thread and resetting Carier!!!"
|
118
|
-
$CarierThread.kill
|
119
|
-
$CarierThread = nil
|
120
|
-
$Carier.reset
|
121
|
-
else
|
122
|
-
L.log "No thread to recall!"
|
123
|
-
end
|
124
|
-
end
|
125
|
-
alias :stop! :recall!
|
126
|
-
module_function :recall!, :stop!, :recall, :stop
|
127
|
-
|
128
|
-
def reset
|
129
|
-
recall
|
130
|
-
execute
|
131
|
-
end
|
132
|
-
alias :reload :reset
|
133
|
-
|
134
|
-
def reset!
|
135
|
-
recall!
|
136
|
-
execute
|
137
|
-
end
|
138
|
-
alias :reload! :reset!
|
139
|
-
module_function :reset!, :reset, :reload!, :reload
|
140
|
-
|
141
|
-
def status(raise_e=true)
|
142
|
-
if $CarierThread and (s = $CarierThread.status)
|
143
|
-
L.log "Carier thread responding with status #{s}"
|
144
|
-
s
|
145
|
-
elsif $CarierThread
|
146
|
-
if e = $CarierThread.value
|
147
|
-
if raise_e
|
148
|
-
recall!
|
149
|
-
raise e
|
150
|
-
else
|
151
|
-
L.log "Carier Thread returned #{e.inspect}"
|
152
|
-
e
|
153
|
-
end
|
154
|
-
else
|
155
|
-
L.log "Carier Thread is exited without error"
|
156
|
-
end
|
157
|
-
else
|
158
|
-
L.log "There is no Carier Thread atm"
|
159
|
-
end
|
160
|
-
end
|
161
|
-
alias :st :status
|
162
|
-
module_function :status, :st
|
163
|
-
|
164
|
-
end
|
data/lib/extensions/declarative.rb
DELETED
@@ -1,153 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
module ActiveRecord
|
3
|
-
|
4
|
-
module ConnectionAdapters
|
5
|
-
AbstractAdapter
|
6
|
-
|
7
|
-
class VirtualTable < Table
|
8
|
-
|
9
|
-
def debug_str meth, called, exist, *args
|
10
|
-
"Table.#{meth}(#{args.inspects*', '}) was#{' NOT' if !called} called due to #{'in' if !exist}existance"
|
11
|
-
end
|
12
|
-
|
13
|
-
def column_exists *args
|
14
|
-
column_names = @base.columns(@table_name).names
|
15
|
-
options = args.extract_options!
|
16
|
-
names = args.dup
|
17
|
-
args << options
|
18
|
-
_or_ = (names[0] == :all) ? !names.shift : true
|
19
|
-
names.each {|name| return _or_ if name.to_s.in(column_names) == _or_}
|
20
|
-
!_or_
|
21
|
-
end
|
22
|
-
|
23
|
-
def index_exists *indexes
|
24
|
-
column_indexes = @base.indexes(@table_name).columnss.flatten
|
25
|
-
_or_ = (indexes[0] == :all) ? !indexes.shift : true
|
26
|
-
indexes.each {|index| return _or_ if index.to_s.in(column_indexes) == _or_}
|
27
|
-
!_or_
|
28
|
-
end
|
29
|
-
|
30
|
-
def initialize name, connection, map=nil
|
31
|
-
super name, connection
|
32
|
-
case map
|
33
|
-
when true; @map = []
|
34
|
-
when Array; @map = map
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
def map!
|
39
|
-
map_names = @map.firsts.to_ss
|
40
|
-
@base.columns(@table_name).names.each {|name|
|
41
|
-
name.in(map_names) ? @map.reject! {|_| _[0] == name} : remove(name)
|
42
|
-
}
|
43
|
-
@map.each {|col| column *col}
|
44
|
-
end
|
45
|
-
|
46
|
-
def column name, *args
|
47
|
-
to_be_called = !column_exists(name)
|
48
|
-
super if to_be_called
|
49
|
-
$log.debug {debug_str :column, to_be_called, !to_be_called, name, *args}
|
50
|
-
@map << [name, *args] if @map
|
51
|
-
end
|
52
|
-
|
53
|
-
%w{string text integer float decimal
|
54
|
-
datetime timestamp time date binary boolean}.each {|column_type|
|
55
|
-
define_method(column_type) {|*args|
|
56
|
-
to_be_called = !column_exists(*args)
|
57
|
-
super if to_be_called
|
58
|
-
$log.debug {debug_str column_type, to_be_called, !to_be_called, *args}
|
59
|
-
if @map
|
60
|
-
options = args.extract_options!
|
61
|
-
args = args.xprod(column_type)
|
62
|
-
args = args.xprod(options) if options
|
63
|
-
@map.concat args
|
64
|
-
end
|
65
|
-
} }
|
66
|
-
|
67
|
-
def index name, *args
|
68
|
-
to_be_called = !index_exists(name)
|
69
|
-
super if to_be_called
|
70
|
-
$log.debug {debug_str :index, to_be_called, !to_be_called, name, *args}
|
71
|
-
end
|
72
|
-
|
73
|
-
def timestamps
|
74
|
-
to_be_called = !column_exists('created_at', 'updated_at')
|
75
|
-
super if to_be_called
|
76
|
-
$log.debug {debug_str :timestamps, to_be_called, !to_be_called}
|
77
|
-
@map.concat [[:created_at, :datetime], [:updated_at, :datetime]] if @map
|
78
|
-
end
|
79
|
-
|
80
|
-
def change *args
|
81
|
-
raise NotImplementedError, "don't use #change in declaration!"
|
82
|
-
end
|
83
|
-
|
84
|
-
def change_default *args
|
85
|
-
raise NotImplementedError, "don't use #change_default in declaration!"
|
86
|
-
end
|
87
|
-
|
88
|
-
def rename column_name, new_column_name
|
89
|
-
to_be_called = !column_exists(new_column_name)
|
90
|
-
super if to_be_called
|
91
|
-
$log.debug {debug_str :rename, to_be_called, !to_be_called, column_name, new_column_name}
|
92
|
-
end
|
93
|
-
|
94
|
-
def references *args
|
95
|
-
to_be_called = !column_exists(*args.map {|col| "#{col}_id"})
|
96
|
-
super if to_be_called
|
97
|
-
$log.debug {debug_str :references, to_be_called, !to_be_called, *args}
|
98
|
-
end
|
99
|
-
alias :belongs_to :references
|
100
|
-
|
101
|
-
def remove *args
|
102
|
-
to_be_called = column_exists :all, *args
|
103
|
-
super if to_be_called
|
104
|
-
$log.debug {debug_str :remove, to_be_called, to_be_called, *args}
|
105
|
-
end
|
106
|
-
|
107
|
-
def remove_references *args
|
108
|
-
to_be_called = column_exists(:all, *args.map {|col| "#{col}_id"})
|
109
|
-
super if to_be_called
|
110
|
-
$log.debug {debug_str :remove_references, to_be_called, to_be_called, *args}
|
111
|
-
end
|
112
|
-
alias :remove_belongs_to :remove_references
|
113
|
-
|
114
|
-
def remove_index options
|
115
|
-
indexes = options.is(Hash) ? options[:column] : options
|
116
|
-
raise ArgumentError, "can remove only default format named indexes in declaration!" if !indexes
|
117
|
-
to_be_called = index_exists :all, *indexes
|
118
|
-
super if to_be_called
|
119
|
-
$log.debug {debug_str :remove_index, to_be_called, to_be_called, options}
|
120
|
-
end
|
121
|
-
|
122
|
-
def remove_timestamps
|
123
|
-
to_be_called = column_exists 'created_at', 'updated_at'
|
124
|
-
super if to_be_called
|
125
|
-
$log.debug {debug_str :remove_timestamps, to_be_called, to_be_called}
|
126
|
-
end
|
127
|
-
|
128
|
-
end
|
129
|
-
|
130
|
-
end
|
131
|
-
|
132
|
-
class Base
|
133
|
-
|
134
|
-
def self.declare name, options={}, &block
|
135
|
-
self.table_name = name
|
136
|
-
if !table_exists? or options[:force]
|
137
|
-
$log < "with options[:force] the `#{table_name}` table will have been recreated each time you load the #{model_name} model" if options[:force]
|
138
|
-
self.primary_key = options[:primary_key] if options[:id] != false and options[:primary_key]
|
139
|
-
$log.debug "connection.create_table(#{name}, #{options.inspect}) {}"
|
140
|
-
connection.create_table(name, options, &block)
|
141
|
-
elsif options[:map]
|
142
|
-
table = ConnectionAdapters::VirtualTable.new(name, connection, options[:map])
|
143
|
-
yield table
|
144
|
-
table.map!
|
145
|
-
else yield ConnectionAdapters::VirtualTable.new(name, connection)
|
146
|
-
end
|
147
|
-
reset_column_information
|
148
|
-
end
|
149
|
-
|
150
|
-
end
|
151
|
-
|
152
|
-
end
|
153
|
-
|
data/lib/extensions/johnson.rb
DELETED
@@ -1,63 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
module Johnson
|
3
|
-
begin
|
4
|
-
require 'johnson'
|
5
|
-
rescue LoadError
|
6
|
-
Enabled = false
|
7
|
-
else
|
8
|
-
if VERSION <= "2.0.0" and RUBY_VERSION > "1.9"
|
9
|
-
Enabled = false
|
10
|
-
else Enabled = true
|
11
|
-
end
|
12
|
-
end
|
13
|
-
### JavaScript interface DOM emulation ###
|
14
|
-
|
15
|
-
class Runtime
|
16
|
-
attr_accessor :thread_id
|
17
|
-
Runtime_is_set = lambda {|o| !o[:eval].b or ($JSRuntime and $JSRuntime.thread_id == $CarierThread.object_id)}
|
18
|
-
BROWSER_PATH = File.expand_path "#{File.dirname(__FILE__)}/browser"
|
19
|
-
|
20
|
-
# CarierThread breaks if Multi has no work && CarierThread
|
21
|
-
# is joined so itwon't last forever.
|
22
|
-
#
|
23
|
-
# Johnson is not thread safe =>
|
24
|
-
# Runtime created in this thread will become unusable after
|
25
|
-
# CarierThread dies.
|
26
|
-
#
|
27
|
-
# So we don't use Curl.wait until Carier haven't got whole
|
28
|
-
# request for this Runtime.
|
29
|
-
def self.set_browser_for_curl(opts)
|
30
|
-
if !Runtime_is_set[opts]
|
31
|
-
if Curl.status
|
32
|
-
Curl.recall
|
33
|
-
$log.debug 'recalled'
|
34
|
-
end
|
35
|
-
if opts[:thread_safe].b
|
36
|
-
$JSRuntime = new_browser(opts[:jq])
|
37
|
-
$log.debug "#{$JSRuntime} initialized in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
|
38
|
-
else
|
39
|
-
$log.debug 'about to run carier'
|
40
|
-
Curl.execute {$JSRuntime = new_browser(opts[:jq])
|
41
|
-
$log.debug "#{$JSRuntime} initialized in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"}
|
42
|
-
sleep 0.01 until Runtime_is_set[opts]
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
def self.new_browser(jq=false)
|
48
|
-
rt = new
|
49
|
-
%w{xmlw3cdom_1 xmlw3cdom_2 xmlsax env}.concat(jq ? ['jquery'] : []).each {|f|
|
50
|
-
path = "#{BROWSER_PATH}/#{f}.js"
|
51
|
-
rt.evaluate IO.read(path), path, 1
|
52
|
-
}
|
53
|
-
rt.document = ''
|
54
|
-
rt
|
55
|
-
end
|
56
|
-
|
57
|
-
def document=(html)
|
58
|
-
evaluate "var document = new DOMDocument(#{html.to_doc.to_xhtml.inspect})"
|
59
|
-
end
|
60
|
-
|
61
|
-
end
|
62
|
-
|
63
|
-
end
|
data/lib/frame.rb
DELETED
@@ -1,848 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
module HTTPAccessKit
|
3
|
-
|
4
|
-
# Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, ... ) ) =>
|
5
|
-
# Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, ...
|
6
|
-
|
7
|
-
class ZippingError < ArgumentError
|
8
|
-
def initialize debug, str="invalid use of :zip option, uri and body must be an arrays with the same size\n uri: %s(%s), body: %s(%s)"
|
9
|
-
super str%debug end
|
10
|
-
end
|
11
|
-
|
12
|
-
class TargetError < ArgumentError
|
13
|
-
def initialize msg="only static frame can use local paths"
|
14
|
-
super end
|
15
|
-
end
|
16
|
-
|
17
|
-
class ConfigError < ArgumentError
|
18
|
-
def initialize msg
|
19
|
-
super end
|
20
|
-
end
|
21
|
-
|
22
|
-
class Frame
|
23
|
-
__init__
|
24
|
-
attr_reader :loc, :static, :ss, :opts, :use_cache, :write_to
|
25
|
-
@@cache = {}
|
26
|
-
|
27
|
-
def initialize *args
|
28
|
-
args << 10 unless args[-1].is Fixnum
|
29
|
-
args.insert -2, {} unless args[-2].is Hash
|
30
|
-
@opts = {:eval => Johnson::Enabled, :redir => true, :cp => true, :result => Page}.merge!(args[-2].kinda(Hash) ? args[-2] : {})
|
31
|
-
args[-2] = @opts
|
32
|
-
if args[0].is String
|
33
|
-
uri = args[0]
|
34
|
-
'http://' >> uri if uri !~ /^\w+:\/\//
|
35
|
-
@loc = uri.parse:uri
|
36
|
-
# be careful, if you set :static => false, frame will be unable to use implicit url
|
37
|
-
@static = @opts.fetch(:static, true)
|
38
|
-
else
|
39
|
-
@loc = {}
|
40
|
-
@static = false
|
41
|
-
end
|
42
|
-
@ss = ScoutSquad *args
|
43
|
-
Curl.run :unless_allready
|
44
|
-
end
|
45
|
-
|
46
|
-
def retarget to, forced=nil
|
47
|
-
to = 'http://' + to if to !~ /^\w+:/
|
48
|
-
@ss.update to, forced
|
49
|
-
@loc = to.parse:uri
|
50
|
-
end
|
51
|
-
alias :target= :retarget
|
52
|
-
|
53
|
-
def next() @ss.next end
|
54
|
-
def rand() @ss.rand end
|
55
|
-
def each(&block) @ss.each &block end
|
56
|
-
def [](i) @ss[i] end
|
57
|
-
|
58
|
-
def copy_cookies! i=0
|
59
|
-
@ss.each {|s| s.cookies.replace @ss[i].cookies}
|
60
|
-
end
|
61
|
-
|
62
|
-
def use_cache! opts={}
|
63
|
-
if opts == false
|
64
|
-
@use_cache = false
|
65
|
-
else
|
66
|
-
@@cache = opts[:pages].kinda(Hash) ? opts[:pages] : opts[:pages].map_hash {|p| [p.href, p]} if opts[:pages]
|
67
|
-
#@write_to = opts[:write_to] if :write_to.in opts
|
68
|
-
@use_cache = true
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
def drop_cache! use=nil
|
73
|
-
@@cache.clear
|
74
|
-
GC.start
|
75
|
-
@use_cache = use if use.in [true, false]
|
76
|
-
end
|
77
|
-
|
78
|
-
def inspect
|
79
|
-
"<#Frame @ #{@ss.untargeted ? 'no target' : @loc.root}: #{'scout'.x @ss.size}#{', static'+(' => '+@static.protocol if @static.is(Hash)) if @static}, cookies #{@ss[0].cookieProc ? 'on' : 'off'}>"
|
80
|
-
end
|
81
|
-
|
82
|
-
# opts are :eval, :json, :hash, :wait, :proc_result, :save_result, :load_scripts,
|
83
|
-
# :zip, :thread_safe, :result, :stream, :raw, :xhr + any opts for Scouts in one hash
|
84
|
-
def exec *args, &callback
|
85
|
-
many, order, orders, with_opts = interpret_request *args
|
86
|
-
L.log({:many => many, :order => order, :orders => orders, :with_opts => with_opts})
|
87
|
-
|
88
|
-
if !Johnson::Enabled and with_opts[:eval]
|
89
|
-
L < "failed to use option :eval because Johnson is disabled"
|
90
|
-
with_opts.delete :eval
|
91
|
-
end
|
92
|
-
# JS Runtime is not thread-safe and must be created in curl thread
|
93
|
-
# if we aren't said explicitly about the opposite
|
94
|
-
Johnson::Runtime.set_browser_for_curl with_opts
|
95
|
-
|
96
|
-
if many then exec_many orders, with_opts, &callback
|
97
|
-
else exec_one order, with_opts, &callback end
|
98
|
-
end
|
99
|
-
alias :get :exec
|
100
|
-
alias :run :get
|
101
|
-
|
102
|
-
def interpret_request(*args)
|
103
|
-
body, mp, uri, opts = args.dup.get_opts [nil, false, nil], @opts
|
104
|
-
L.log [body, mp, uri, opts]
|
105
|
-
zip = opts.delete :zip
|
106
|
-
many = order = orders = post = false
|
107
|
-
# Default options set is for POST
|
108
|
-
if mp.is String or mp.kinda Array and !(uri.is String or uri.kinda Array)
|
109
|
-
# if second arg is String, then that's uri
|
110
|
-
uri, mp, post = mp.dup, false, true
|
111
|
-
# L.debug "uri #{uri.inspect} has been passed as second argument instead of third"
|
112
|
-
# But if we have only one argument actually passed
|
113
|
-
# except for options hash, then believe it's GET
|
114
|
-
elsif body.is String or body.kinda [String]
|
115
|
-
L.debug "first parameter (#{body.inspect}) was implicitly taken as uri#{' '+body.class if body.kinda Array}, but last paramter is of type #{uri.class}, too" if uri
|
116
|
-
uri = body.dup
|
117
|
-
elsif !body then uri = nil
|
118
|
-
else
|
119
|
-
uri = uri.dup if uri
|
120
|
-
mp, post = !!mp, true
|
121
|
-
end
|
122
|
-
if post
|
123
|
-
unless body.is Hash or body.kinda [Hash]
|
124
|
-
raise TypeError, "body of post request must be a hash or hash array, params was
|
125
|
-
(#{args.inspect[1..-2]})"
|
126
|
-
end
|
127
|
-
validate_zip uri, body if zip
|
128
|
-
if zip or uri.kinda Array or body.kinda Array
|
129
|
-
many = true
|
130
|
-
if zip or uri.kinda Array
|
131
|
-
validate_some uri
|
132
|
-
orders = zip ? body.zip(uri) : uri.xprod(body, :inverse)
|
133
|
-
else
|
134
|
-
uri = validate uri
|
135
|
-
orders = body.xprod uri
|
136
|
-
end
|
137
|
-
orders.each {|o| o.unshift :loadPost and o.insert 2, mp}
|
138
|
-
else
|
139
|
-
uri = validate uri
|
140
|
-
order = [:loadPost, body, mp, uri]
|
141
|
-
end
|
142
|
-
else
|
143
|
-
if uri.kinda Array
|
144
|
-
many = true
|
145
|
-
validate_some uri
|
146
|
-
orders = [:loadGet].xprod uri
|
147
|
-
else
|
148
|
-
uri = validate uri
|
149
|
-
order = [:loadGet, uri]
|
150
|
-
end
|
151
|
-
end
|
152
|
-
if !order.b and !orders.b
|
153
|
-
raise ArgumentError, "failed to run blank request#{'s' if many}, params was
|
154
|
-
(#{args.inspect[1..-2]})"
|
155
|
-
end
|
156
|
-
|
157
|
-
opts[:wait] = opts[:sync] if :sync.in opts
|
158
|
-
opts[:wait] = true if !:wait.in(opts) and
|
159
|
-
:proc_result.in(opts) ? !opts[:proc_result] : opts[:save_result]
|
160
|
-
opts[:eval] = false if opts[:json] or opts[:hash] or opts[:raw]
|
161
|
-
opts[:load_scripts] = self if opts[:load_scripts]
|
162
|
-
opts[:stream] = true if opts[:raw]
|
163
|
-
(opts[:headers] ||= {})['X-Requested-With'] = 'XMLHttpRequest' if opts[:xhr]
|
164
|
-
[many, order, orders, opts]
|
165
|
-
end
|
166
|
-
|
167
|
-
def get_cached(*links)
|
168
|
-
res = []
|
169
|
-
expire = links[-1] == :expire ? links.pop : false
|
170
|
-
links.parses(:uri).each_with_index {|uri, i|
|
171
|
-
next if uri.path[/ads|count|stats/]
|
172
|
-
file = Cache.load uri, !expire
|
173
|
-
if file
|
174
|
-
if expire
|
175
|
-
@ss.next.loadGet(uri.href, :headers=>{'If-Modified-Since'=>file.date}) {|c|
|
176
|
-
if c.res.code == 200
|
177
|
-
res << [i, (data = c.res.body)]
|
178
|
-
Cache.save uri, data, false
|
179
|
-
else
|
180
|
-
res << [i, file.is(String) ? file : read(file.path)]
|
181
|
-
end
|
182
|
-
}
|
183
|
-
else
|
184
|
-
res << [i, file.is(String) ? file : read(file.path)]
|
185
|
-
end
|
186
|
-
else
|
187
|
-
@ss.next.loadGet(uri.href) {|c|
|
188
|
-
if c.res.code == 200
|
189
|
-
res << [i, (data = c.res.body)]
|
190
|
-
Cache.save uri, data, !expire
|
191
|
-
end
|
192
|
-
}
|
193
|
-
end
|
194
|
-
}
|
195
|
-
Curl.wait
|
196
|
-
links.size == 1 ? res[0][1] : res.sort!.lasts
|
197
|
-
end
|
198
|
-
|
199
|
-
def get_distr(uri, psize, threads, start=0, print_progress=$verbose)
|
200
|
-
raise ConfigError, "Insufficient Scouts in the Frame for distributed downloading" if @ss.size < 2
|
201
|
-
@print_progress, code, stop_download, @ss_reserve = print_progress, nil, false, []
|
202
|
-
(s = @ss.next).http.on_header {|h|
|
203
|
-
next h.size unless h[/Content-Length: (\d+)|HTTP\/1\.[01] (\d+)[^\r]+|^\s*$/]
|
204
|
-
if code = $2
|
205
|
-
if code != '200'
|
206
|
-
L << "#$& getting #{uri}; interrupting request."
|
207
|
-
s.http.on_header() # set default process
|
208
|
-
next 0
|
209
|
-
end
|
210
|
-
next h.size
|
211
|
-
end
|
212
|
-
|
213
|
-
s.http.on_header() # set default process
|
214
|
-
if !$1 # конец хедера, content-length отсутствует
|
215
|
-
L << "No Content-Length header; trying to load a whole #{uri} at once!"
|
216
|
-
s.loadGet {|c| yield c.res.body.size, 0, c.res.body}
|
217
|
-
next 0
|
218
|
-
end
|
219
|
-
|
220
|
-
len = $1.to_i - start
|
221
|
-
psize = configure_psize(len, psize, threads)
|
222
|
-
parts = (len/psize.to_f).ceil
|
223
|
-
setup_speedometer(uri, parts, len)
|
224
|
-
yield len, psize, :careful_dl if len > (@opts[:careful_dl] || 10.mb)
|
225
|
-
|
226
|
-
@ss_reserve = @ss[threads+1..-1]
|
227
|
-
@ss = @ss[0..threads]
|
228
|
-
(0...parts).each {|n|
|
229
|
-
break if stop_download
|
230
|
-
|
231
|
-
s = @ss.next
|
232
|
-
run_speedometer(s, len, n)
|
233
|
-
s.loadGet(uri, :headers => {
|
234
|
-
'Range' => "bytes=#{start + n*psize}-#{start + (n+1)*psize - 1}"
|
235
|
-
}) {|c|
|
236
|
-
clear_speedometer(s)
|
237
|
-
if c.res.code/10 == 20
|
238
|
-
yield len, n*psize, c.res.body
|
239
|
-
else
|
240
|
-
L << "#{c.res} during get #{uri.inspect}; interrupting request."
|
241
|
-
stop_download = true
|
242
|
-
end
|
243
|
-
}
|
244
|
-
}
|
245
|
-
0
|
246
|
-
}
|
247
|
-
s.raise_err = false
|
248
|
-
s.loadGet validate uri
|
249
|
-
ensure
|
250
|
-
@ss.concat @ss_reserve || []
|
251
|
-
end
|
252
|
-
|
253
|
-
def dl(uri, df=File.basename(uri.parse(:uri).path), psize=:auto, opts={})
|
254
|
-
dled = 0
|
255
|
-
lock = ''
|
256
|
-
callback = lambda {|len, pos, body|
|
257
|
-
if body != :careful_dl
|
258
|
-
begin
|
259
|
-
write(df, body, pos)
|
260
|
-
rescue => e
|
261
|
-
binding.start_interaction
|
262
|
-
raise
|
263
|
-
end
|
264
|
-
if (dled += body.size) == len
|
265
|
-
File.delete lock if File.file? lock
|
266
|
-
yield df if block_given?
|
267
|
-
end
|
268
|
-
else
|
269
|
-
lock = lock_file df, len, pos # filename, filesize, partsize
|
270
|
-
end
|
271
|
-
}
|
272
|
-
opts[:threads] ||= @ss.size-1
|
273
|
-
get_distr(uri, psize, opts[:threads], opts[:start].to_i, &callback)
|
274
|
-
Curl.wait unless block_given?
|
275
|
-
df
|
276
|
-
end
|
277
|
-
|
278
|
-
def simple_dl(uri, df=File.basename(uri.parse(:uri).path), opts={})
|
279
|
-
opts.reverse_merge! :psize => :auto, :threads => 1, :print_progress => $verbose
|
280
|
-
L << opts
|
281
|
-
|
282
|
-
@print_progress = opts[:print_progress]
|
283
|
-
unless len = opts[:len] || (map = read_mapfile(df) and map.len)
|
284
|
-
return @ss.next.loadHead(uri) {|c| $log << c
|
285
|
-
if len = c.res['Content-Length']
|
286
|
-
simple_dl(uri, df, opts.merge(:len => len.to_i))
|
287
|
-
else L.warn "Can't get file size, so it has no sence to download this way. Or maybe it's just an error. Check ObjectSpace.find(#{c.res.object_id}) out."
|
288
|
-
end
|
289
|
-
}
|
290
|
-
end
|
291
|
-
|
292
|
-
psize, parts = check_mapfile(df, opts)
|
293
|
-
return unless psize
|
294
|
-
L << [psize, parts]
|
295
|
-
setup_speedometer(uri, parts.size, len)
|
296
|
-
|
297
|
-
obtained uri do |uri|
|
298
|
-
if opts[:threads] == 1
|
299
|
-
start = opts[:start].to_i || (parts[0] && parts[0].begin) || 0
|
300
|
-
scout = opts[:scout] || @ss.next
|
301
|
-
$log << [uri, scout]
|
302
|
-
(loadget = lambda {|n|
|
303
|
-
run_speedometer(scout, len, n)
|
304
|
-
from = start + n*psize
|
305
|
-
to = start + (n+1)*psize - 1
|
306
|
-
scout.loadGet(uri, :headers => {'Range' => "bytes=#{from}-#{to}"}) {|c|
|
307
|
-
begin
|
308
|
-
$log << "writing #{df} from #{from}: #{c.res.body.inspect}"
|
309
|
-
write(df, c.res.body, from)
|
310
|
-
rescue => e
|
311
|
-
binding.start_interaction
|
312
|
-
raise
|
313
|
-
end
|
314
|
-
if write_mapfile(df, from, to)
|
315
|
-
clear_speedometer(scout)
|
316
|
-
L.warn "file completely dl'ed, but (n+1)*psize <= len: (#{n}+1)*#{psize} <= #{len}" if (n+1)*psize <= len
|
317
|
-
yield df if block_given?
|
318
|
-
elsif (n+1)*psize <= len
|
319
|
-
loadget[n+1]
|
320
|
-
end
|
321
|
-
}
|
322
|
-
})[0]
|
323
|
-
else
|
324
|
-
exec(uri, opts.merge(:raw => true, :ranges => parts)) {|c|
|
325
|
-
L << c.res
|
326
|
-
range = c.req.range
|
327
|
-
begin
|
328
|
-
write(df, c.res.body, range.begin)
|
329
|
-
rescue => e
|
330
|
-
binding.start_interaction
|
331
|
-
raise
|
332
|
-
end
|
333
|
-
if write_mapfile(df, range.begin, range.end)
|
334
|
-
@ss.each {|s| s.http.on_progress} if @print_progress
|
335
|
-
yield df if block_given?
|
336
|
-
end
|
337
|
-
}
|
338
|
-
end
|
339
|
-
end
|
340
|
-
end
|
341
|
-
|
342
|
-
def check_mapfile(df, opts={})
|
343
|
-
opts.reverse_merge! :psize => :auto, :threads => 1
|
344
|
-
map = read_mapfile df
|
345
|
-
if map
|
346
|
-
L << map
|
347
|
-
if map.rest.empty?
|
348
|
-
puts "#{df} is loaded"
|
349
|
-
$log << 'deleting mapfile'
|
350
|
-
File.delete df+'.map'
|
351
|
-
[]
|
352
|
-
else
|
353
|
-
if opts[:len] and map.len != opts[:len]
|
354
|
-
raise "Incorrect file size for #{df}"
|
355
|
-
end
|
356
|
-
psize = configure_psize *opts.values_at(:len, :psize, :threads)
|
357
|
-
[psize, map.rest.div(psize)]
|
358
|
-
end
|
359
|
-
else
|
360
|
-
write_mapfile df, opts[:len]
|
361
|
-
psize = configure_psize *opts.values_at(:len, :psize, :threads)
|
362
|
-
$log << (0...opts[:len]).div(psize)
|
363
|
-
[psize, (0...opts[:len]).div(psize)]
|
364
|
-
end
|
365
|
-
end
|
366
|
-
|
367
|
-
def read_mapfile(df)
|
368
|
-
df += '.map'
|
369
|
-
text = read df
|
370
|
-
$log << "mapfile read: #{text}"
|
371
|
-
if text.b
|
372
|
-
text[/^(\d+)\0+(\d+)\0*\n/]
|
373
|
-
map = {}
|
374
|
-
$log << [$1,$2]
|
375
|
-
if $1 and $1 == $2
|
376
|
-
map.rest = []
|
377
|
-
else
|
378
|
-
map.len, *map.parts = text.chop/"\n"
|
379
|
-
map.len = map.len.to_i
|
380
|
-
map.parts.map! {|part| part /= '-'; part[0].to_i..part[1].to_i}
|
381
|
-
$log << map.parts
|
382
|
-
map.rest = (0...map.len) - XRange(*map.parts)
|
383
|
-
end
|
384
|
-
map
|
385
|
-
end
|
386
|
-
end
|
387
|
-
|
388
|
-
def write_mapfile(df, *args)
|
389
|
-
df += '.map'
|
390
|
-
map = ''
|
391
|
-
if args.size != 2
|
392
|
-
len = args.shift
|
393
|
-
map << len.to_s.ljust(22, "\0") << "\n" if File.file? df
|
394
|
-
end
|
395
|
-
if args.any?
|
396
|
-
read(df)[/^(\d+)\0+(\d+)\0*\n/]
|
397
|
-
$log << "mapfile read"
|
398
|
-
$log << [$1,$2]
|
399
|
-
dled = $2.to_i + args[1] - args[0] + 1
|
400
|
-
return true if dled == $1.to_i
|
401
|
-
map << "#{args[0]}..#{args[1]}\n"
|
402
|
-
$log << 'writing mapfile'
|
403
|
-
write(df, dled.to_s.ljust(11, "\0"), 11)
|
404
|
-
end
|
405
|
-
$log << [df, map]
|
406
|
-
$log << 'writing mapfile'
|
407
|
-
write df, map
|
408
|
-
nil
|
409
|
-
end
|
410
|
-
|
411
|
-
# Resolves the size of one download part.
#
# len     - total length; used only for :auto
# psize   - a Numeric (truncated with to_i), :auto or :mb
# threads - divisor used by :auto when len is above 100000
#
# Returns the part size.
# Raises ArgumentError for any other +psize+ value.
def configure_psize(len, psize, threads)
  if Numeric === psize
    psize.to_i
  elsif :auto === psize
    if len > 100000
      len/threads+1
    else
      len
    end
  elsif :mb === psize
    1.mb
  else
    raise ArgumentError, "Incorrect value for part size #{psize}:#{psize.class}"
  end
end
|
419
|
-
|
420
|
-
private
|
421
|
-
# Checks the arguments of a zipped request: +uri+ and +body+ must both pass
# the +kinda Array+ check and have the same size.
#
# Raises ZippingError carrying [uri class, uri size, body class, body size]
# (sizes are nil when the Array check itself fails).
# Returns nil when everything is fine.
def validate_zip(uri, body)
  both_arrays = uri.kinda(Array) && body.kinda(Array)
  unless both_arrays
    raise ZippingError, [uri.class, nil, body.class, nil]
  end
  if uri.size != body.size
    raise ZippingError, [uri.class, uri.size, body.class, body.size]
  end
end
|
428
|
-
|
429
|
-
# The :static option can now accept a hash with a :protocol key; in that case the Frame can be relocated to the same domain on another protocol, and the default protocol is the value of @static.protocol
|
430
|
-
# Resolves +uri+ against the frame's current location (@loc).
#
# * nil uri: returns @loc.href (only allowed for a static frame).
# * uri without a root: joined onto @loc.root (only allowed for a static
#   frame); when @static is a Hash, its protocol is applied to @loc first.
# * uri with the same root as @loc: returned unchanged.
# * uri with another root: @loc is switched to the new root/host/protocol
#   and the uri is returned, unless the frame is static. A static frame
#   raises, except when @static is a Hash and only the protocol differs.
#
# Raises TargetError when the request is not allowed for this frame.
def validate(uri)
  unless uri
    raise TargetError if !@static
    return @loc.href
  end

  loc = uri.parse(:uri)

  if !loc.root
    raise TargetError if !@static
    if @static.is Hash
      @loc.protocol = @static.protocol
      @loc.root = @loc.protocol+'://'+@loc.host
    end
    return File.join(@loc.root, uri)
  end

  return uri if loc.root == @loc.root

  # from here on the uri points to a root other than the current one
  if @static
    if !@static.is(Hash)
      raise TargetError, "unable to get #{uri} by static frame #{@loc.root}, you should first update it with new target"
    elsif loc.host != @loc.host
      raise TargetError, "unable to get #{uri} by static frame [#{@static.protocol}://]#{@loc.host}, you should first update it with new target"
    end
  end
  @loc.root, @loc.host, @loc.protocol = loc.root, loc.host, loc.protocol
  uri
end
|
459
|
-
|
460
|
-
# Runs #validate over every element of +uris+, replacing the elements in place.
# Returns the same (mutated) array.
def validate_some(uris)
  uris.map! do |target|
    validate(target)
  end
end
|
463
|
-
|
464
|
-
# Feeds +page+ to the user callback and routes the callback's result.
#
# page     - the page handed to the callback
# opts     - request options; :save_result (truthy) or the mere presence of
#            :proc_result makes the result be stored in page.res; a Proc in
#            :proc_result is additionally called with the result, unless
#            the result is :skip
# callback - optional block; without it nothing happens
#
# Returns whatever the :proc_result proc returned, or nil.
def run_callbacks!(page, opts, &callback)
  return unless callback
  outcome = callback.call(page)
  if opts[:save_result] || :proc_result.in(opts)
    page.res = outcome
  end
  opts[:proc_result].call(outcome) if opts[:proc_result].is(Proc) && outcome != :skip
end
|
475
|
-
|
476
|
-
# TODO: find out why/how IO in callbacks breaks the +curl.res.body+ content, and how to fix or avoid it
|
477
|
-
# Executes a single request order on the next scout taken from the squad.
#
# order    - request description; order[0] is the scout method name
#            (e.g. :loadGet) and order[1] the cache key for cached GETs.
#            +opts+ is appended to this array before it is sent.
# opts     - request options read here: :wait, :save_result, :proc_result,
#            :result (class instantiated for the page), :raw, :thread_safe
# callback - block run through #run_callbacks! (or yielded the raw curl
#            handle when opts[:raw] is set)
#
# Returns the page, or page.res when waiting was requested and the result
# is to be saved / post-processed.
def exec_one(order, opts, &callback)
  cached = (@use_cache && order[0] == :loadGet) ? @@cache[order[1]] : nil
  if cached
    run_callbacks! cached, opts, &callback
    return (opts[:wait] && (opts[:save_result] || :proc_result.in(opts))) ? cached.res : cached
  end

  # must result in Page (default) or its subclass
  page = opts[:result].new
  # if no spare scouts can be found, squad simply waits for first callbacks to complete
  scout = @ss.next
  request = order << opts
  scout.send(*request) do |curl|
    # Storing html on disk is known to be flaky: per the original author's
    # note, in roughly 2% of cases under 100 parallel downloads the dump
    # below interferes with the +curl.res.body+ content.
    if order[0] == :loadGet and @write_to
      dump_path = @write_to+'/'+order[-2].sub(/^[a-z]+:\/\//, '')
      RMTools.rw dump_path, curl.res.body.xml_to_utf
    end
    if opts[:raw]
      yield curl
    # per the original author's note, +curl.res.body+ may already be empty here
    elsif page.process(curl, opts)
      @@cache[page.href] = page if order[0] == :loadGet and @use_cache
      run_callbacks! page, opts, &callback
    end
  end

  return page unless opts[:wait]
  opts[:thread_safe] ? $Carier.perform : Curl.wait
  (opts[:save_result] || :proc_result.in(opts)) ? page.res : page
end
|
508
|
-
|
509
|
-
# Executes several request orders, each one through #exec_one.
#
# orders    - array of request orders
# with_opts - options shared by all orders; :wait is removed from the hash
#             and applied once after everything is queued. :stream switches
#             iteration from map to each, and :ranges (one range per order)
#             sets a "Range" header before each request.
# callback  - block forwarded to #exec_one
#
# Returns with_opts[:stream] when it is set, otherwise the collected results.
# Raises ZippingError when the number of ranges differs from the number of orders.
def exec_many(orders, with_opts, &callback)
  wait_after = with_opts.delete :wait
  iterate = with_opts[:stream] ? :each : :map
  ranges = with_opts[:ranges]
  if ranges
    if orders.size != ranges.size
      # NOTE(review): Kernel#raise treats a third argument as the backtrace, so
      # the format string below does not become the message — confirm against
      # ZippingError's constructor.
      raise ZippingError, [orders.size, ranges.size], "orders quantity (%s) is not equal ranges quantity (%s)"
    end
    pages = orders.zip(ranges).send(iterate) do |order, range|
      (with_opts[:headers] ||= {}).Range = "bytes=#{range.begin}-#{range.end}"
      exec_one order, with_opts, &callback
    end
  else
    pages = orders.send(iterate) do |order|
      exec_one order, with_opts, &callback
    end
  end
  if wait_after
    with_opts[:thread_safe] ? $Carier.perform : Curl.wait
  end
  with_opts[:stream] || pages
end
|
526
|
-
|
527
|
-
|
528
|
-
# Prepares the state used for console progress output and starts the thread
# that recalculates the download speed every 0.2 seconds until @stop_print
# becomes true. Does nothing (returns nil) unless @print_progress is set.
#
# uri   - downloaded address, shown in the headline
# parts - number of parallel parts; sizes the @progress array
# len   - total length; +len.bytes+ is interpolated into the headline
#
# Returns the started Thread.
def setup_speedometer(uri, parts, len)
  return unless @print_progress
  @progress = Array.new(parts, 0)
  @stop_print = false
  @speed = ''
  @sum = 0
  # [time of the last measurement, byte count at that time]
  @speedometer = [Time.now, 0]
  @str = "Downloading #{uri.gsub '%', '%%'} (#{len.bytes}) in %03s streams, %07s/s:"
  # number of extra console lines the headline is expected to wrap onto
  @newlines = (uri.unpack('U*').size+len.bytes.size+42)/(ENV['COLUMNS'] || 80).to_i
  @bs = "\b\r"*@newlines
  Thread.new do
    until @stop_print
      sleep 0.2
      now = Time.now
      if now > @speedometer[0] && @sum > @speedometer[1]
        bytes_per_second = (@sum - @speedometer[1])/(now - @speedometer[0])
        @speed.replace(bytes_per_second.to_i.bytes)
        @speedometer.replace [now, @sum]
      end
    end
  end
end
|
545
|
-
|
546
|
-
# Hooks console progress output onto +scout+'s transfer.
# Does nothing (returns nil) unless @print_progress is set.
#
# scout - object whose +http+ handle receives the on_progress block
# len   - total length, used to turn the byte sum into percents
# n     - index of this part inside @progress
#
# The installed block always returns true. Once 100 percent is reached it
# sets @stop_print, which also silences every later call.
def run_speedometer(scout, len, n)
  return unless @print_progress
  scout.http.on_progress do |_dl_total, dl_now, *_ul|
    unless @stop_print
      @progress[n] = dl_now
      @sum = @progress.sum
      percents = @sum*100/len
      headline = @str%[@progress.select_b.size, @speed]
      bar = "\n%%[#{'@'*percents}#{' '*(100-percents)}]\r\b\r"
      print headline+bar+@bs
      if percents == 100
        puts "\v"*@newlines
        @stop_print = true
      end
    end
    true
  end
end
|
561
|
-
|
562
|
-
# Calls +on_progress+ on the scout's http handle without a block, which
# presumably drops the handler installed by #run_speedometer (confirm against
# the curl binding). Does nothing (returns nil) unless @print_progress is set.
def clear_speedometer(scout)
  scout.http.on_progress if @print_progress
end
|
566
|
-
|
567
|
-
end
|
568
|
-
|
569
|
-
# Convenience entry point: downloads +uri+ into +df+ with a freshly built Frame.
#
# uri     - address to download
# df      - destination file name; defaults to the basename of the uri's path
# threads - passed both to the Frame constructor and to Frame#dl (default 5)
# timeout - :timeout option of the Frame (default 600)
# block   - forwarded to Frame#dl
#
# Returns whatever Frame#dl returns.
def dl(uri, df=File.basename(uri.parse(:uri).path), threads=5, timeout=600, &block)
  Curl.run
  frame = Frame({:timeout=>timeout}, threads)
  frame.dl(uri, df, :auto, threads, &block)
end
|
573
|
-
module_function :dl
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
# A loaded document: the raw body, its parsed location and, depending on the
# options given to #process, a parsed XML/HTML document, a JSON-derived object
# or a params hash. Also offers helpers for link extraction, form handling
# and script evaluation through a Johnson runtime.
class Page
  # for debug, just enable L#debug, don't write tons of chaotic log-lines
  __init__
  # res here is result of page processing made in frame context
  attr_writer :title
  attr_reader :html, :loc, :hash, :doc, :js, :curl_res, :failed
  attr_accessor :res
  @@ignore = /google|_gat|tracker|adver/i

  # obj - a body string, or a Curl::Easy / Scout whose response is processed
  #       right away
  # loc - location of the page (parsed with +parse:uri+ unless it is a Hash);
  #       when +obj+ is a curl handle or scout it is passed to #process as
  #       the options instead
  # js  - javascript runtime used by #eval_string
  def initialize(obj='', loc=Hash.new(''), js=$JSRuntime||Johnson::Runtime.new)
    loc = loc.parse:uri if !loc.is Hash
    @js = js
    if obj.is Curl::Easy or obj.kinda Scout
      c = obj.kinda(Scout) ? obj.http : obj
      @html = ''
      # just (c, loc) would pass to #process opts variable that returns '' on any key
      process(c, loc.b || {})
    else
      @html = obj
      @loc = loc
    end
  end

  # True when the relevant payload (@hash if one was requested, @html otherwise)
  # is blank according to the rmtools-style +b+ check.
  def empty?
    !(@hash.nil? ? @html : @hash).b
  end

  # Short human-readable summary: payload kind and size, or the response
  # header for a failed request.
  def inspect
    if !@hash.nil?
      "<#FramePage (#{@hash ? @hash.inspect.size.bytes : 'failed to parse'}) #{@json ? 'json' : 'params hash'}>"
    else
      "<#FramePage #{@html.b ? "#{@failed ? @curl_res.header : '«'+title(false)+'»'} (#{@html.size.bytes}" : '(empty'})#{' js enabled' if @js and @doc and @hash.nil?}>"
    end
  end

  # Re-tags @html with +encoding+ in place and returns it.
  def html!(encoding='UTF-8')
    @html.force_encoding(encoding)
  end

  # We can then alternate #process in Page subclasses
  # Frame doesn't mind about value returned by #process
  #
  # Fills the page from a finished curl handle +c+.
  #
  # opts[:json] - parse the body as JSON into @hash (false on failure, in which
  #               case the body is kept in @html / @doc for inspection)
  # opts[:hash] - parse an inline body as a params hash into @hash
  # opts[:eval] - after building @doc, load external scripts through
  #               opts[:load_scripts] (a frame) and evaluate inline ones
  #
  # For a non-200 response without :json/:hash the body is kept in @html and
  # the status code in @failed. Returns self.
  def process(c, opts={})
    @loc = c.last_effective_url.parse:uri
    @curl_res = c.res
    L.debug "#{@loc.fullpath} -> #{@curl_res}"
    if @curl_res.code == 200
      body = @curl_res.body
      if opts[:json]
        @json = true
        @hash = begin; body.from_json
        rescue StandardError
          false
        end
        if !@hash or @hash.is String
          L.debug "failed to get json from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
          @html = body; to_doc
          @hash = false
        end

      elsif opts[:hash]
        if body.inline
          @hash = body.to_params
        else
          @hash = false
          L.debug "failed to get params hash from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
          @html = body; to_doc
        end

      else
        @html = body.xml_to_utf
        to_doc
        if opts[:eval]
          load_scripts opts[:load_scripts]
          eval_js
        end
      end
    elsif !(opts[:json] or opts[:hash])
      @html = @curl_res.body
      @failed = @curl_res.code
    end
    self
  end

  # Evaluates the page's inline scripts in document order after pointing the
  # javascript location objects at @loc. Text collected in js[:write_output]
  # is inserted after the script node, which is then removed. Scripts with a
  # +src+ are fetched through +frame+ when one is given.
  def eval_js(frame=nil)
    eval_string "document.location = window.location = #{@loc.to_json};\n" \
                "document.URL = document.baseURI = document.documentURI = location.href;\n" \
                "document.domain = location.host;"
    find("script").each {|n|
      L.debug n.text.strip
      if text = n.text.strip.b
        js[:write_output] = ''
        eval_string text
        if res = js[:write_output].b then n.after res end
        n.remove!
      elsif frame and n.src
        eval_string frame.get_cached expand_link n.src
      end
    }
  end

  # Evaluates +str+ in the page's runtime (created on demand).
  # Johnson errors are logged instead of being raised.
  def eval_string(str)
    @js ||= Johnson::Runtime.new
    L.debug "#{@js} evaluating in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
    begin
      @js.evaluate(str)
    rescue Johnson::Error => e
      L.warn e.message
      L.debug {
        if m = e.message.match(/(\w+) is undefined|([\w.]+) is not a function/)
          L.clr.hl! str, /\b#{m[1] || m[2]}\b/
        end
        "\n\t#{str}"
      }
    end
  end

  # (Re)builds @doc from @html and returns it.
  def to_doc
    @doc = @html.to_doc :forceutf
  end

  # Title of the page.
  #
  # full - true: the document title, falling back to the page address (which
  #        is then also written into the document head when there is one);
  #        false: a version shortened to about 40 characters.
  #
  # For hash/json pages, failed pages and blank bodies the address is returned.
  def title(full=true)
    if @hash.nil? and !@failed and @html.b
      if full
        to_doc unless defined? @doc
        if @doc.title.b
          @title = @doc.title
        else
          @title = @loc.href
          @doc.at('head').prepend XML::Node('title', @title) if @doc.at('head')
          @title
        end
      else
        title true unless defined? @title
        if RUBY_VERSION < '1.9' and @title.cyr? and UTF2ANSI[@title].size > 40
          @short_title = ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/][0..38]]+'…'
        elsif @title.size > 40
          @short_title = @title[/.{1,30}\S*/][0..38]+'…'
        else
          @short_title = @title
        end
      end
    else
      @loc.href
    end
  end

  # All nodes matching +xp+ in the (lazily built) document.
  def find(xp) (@doc || to_doc).find xp end

  # First node matching +xp+ in the (lazily built) document.
  def at(xp) (@doc || to_doc).at xp end

  # Address of the page.
  def url() @loc.href end
  alias :href :url

  # Expanded, de-duplicated +src+ values of the nodes matching +links+.
  # When the selector cannot be evaluated (XML::Error), the string itself is
  # treated as a single link.
  def get_srcs(links='img')
    begin
      links = find(links).map {|e| e.src} if links.is String
    rescue XML::Error
      links = [links]
    end
    links.map {|link| expand_link link}.uniq
  end

  # Expanded +src+ of the first node matching +link+, or nil when nothing matches.
  # When the selector cannot be evaluated, the string itself is expanded.
  def get_src(link='img')
    begin
      link = at(link) && at(link).src if link.is String
    rescue XML::Error; nil
    end
    expand_link link if link
  end

  # Expanded, de-duplicated +href+ values of the nodes matching +links+; when
  # those yield nothing, of the anchors nested inside them.
  def get_links(links='a')
    begin
      links = find(links).map {|e| e.href}.b || find(links+'//a').map {|e| e.href} if links.is String
    rescue XML::Error
      links = [links]
    end
    links.map {|link| expand_link link}.uniq
  end

  # Expanded +href+ of the first node matching +link+ (or of an anchor nested
  # inside it), or nil when nothing matches.
  def get_link(link='a')
    begin
      link = at(link) && (at(link).href || at(link+'//a').href) if link.is String
    rescue XML::Error; nil
    end
    expand_link link if link
  end
  alias :get_hrefs :get_links
  alias :links :get_links
  alias :get_href :get_link
  alias :link :get_link
  alias :srcs :get_srcs
  alias :src :get_src

  # Resolves +link+ against the page location:
  #   scheme://...  -> unchanged
  #   //host/...    -> prefixed with the page's protocol
  #   /path         -> prefixed with the page's root
  #   anything else -> joined onto the directory of the page's path (or onto
  #                    the root when the path is blank)
  def expand_link(link)
    case link
      when /^\w+:\/\// then link
      # @loc.protocol carries no ':' (elsewhere the root is assembled as
      # protocol+'://'+host), so plain concatenation produced "http//host/...".
      when /^\/\// then "#{@loc.protocol}:#{link}"
      when /^\// then @loc.root+link
      # NOTE(review): with a non-blank path this branch returns a root-less
      # path, unlike the branches above — confirm that callers expect it.
      else File.join((@loc.path.b ? File.dirname(@loc.path) : @loc.root), link)
    end
  end

  # Builds request arguments for a form.
  #
  # form - xpath/selector string, a form node, or :self for the form whose
  #        action equals the current path
  # hash - values merged over the form's own inputs
  # opts - passed through as the last element of the result
  #
  # Returns [hash, multipart?, action, opts] for a POST form and
  # [action_with_query, opts] otherwise.
  # Raises XML::Error when a selector string does not lead to a <form> node.
  def form(form='form', hash={}, opts={})
    form = "[action=#{@loc.path.inspect}]" if form == :self
    if form.is String
      form_node = at form
      raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
    else form_node = form
    end
    hash = form_node.inputs_all.merge!(hash)
    action = expand_link(form_node.action || @loc.path)
    if form_node['method'].downcase == 'post'
      [hash, form_node.enctype =~ /multipart/, action, opts]
    else
      action = "#{action}#{action['?'] ? '&' : '?'}#{hash.urlencode}" if hash.b
      [action, opts]
    end
  end

  # Submits a form through +frame+, using this page as the Referer unless one
  # is already set. A static frame is temporarily retargeted to the form's
  # action and restored afterwards. Returns what frame.exec returns.
  def submit(form, frame, hash={}, opts={}, &callback)
    (opts[:headers] ||= {}).Referer ||= @loc.href if @loc
    query = form(form, hash, opts)

    curr_target, new_target = frame.loc.href, (query[2] || query[0])
    if need_retargeting = (frame.static && curr_target != new_target)
      frame.retarget new_target
    end
    page = frame.exec(*query, &callback)
    frame.retarget curr_target, :forced if need_retargeting
    page
  end

  # Fetches every <script src> of the page through +frame+ and evaluates the
  # results; does nothing when +frame+ is nil/false.
  def load_scripts(frame)
    frame && frame.get_cached(*get_srcs("script[src]")).each {|js| eval_string js}
  end

end
|
814
|
-
|
815
|
-
# using reprocessing of page in case of non-200 response:
|
816
|
-
# page_class = ReloadablePage do
|
817
|
-
# @res and @res.code != 200
|
818
|
-
# end
|
819
|
-
# Builds an anonymous Page subclass whose #process retries the request
# whenever +reload_condition+, evaluated in the context of the curl handle,
# is truthy.
#
# The generated #process returns nil after scheduling a retry and self
# otherwise. Its callers in this file skip their callbacks on a falsy result.
def ReloadablePage(&reload_condition)
  reloadable = Class.new(Page)
  reloadable.send(:define_method, :process) do |curl, opts|
    super(curl, opts || {})
    needs_reload = curl.instance_eval(&reload_condition)
    if needs_reload
      curl.retry!
      nil # in case of reload_condition.call super's callback will not proceed
    else
      self
    end
  end
  reloadable
end
|
831
|
-
|
832
|
-
end
|
833
|
-
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|