rhack 0.4.1 → 1.0.0.rc4
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +22 -0
- data/Gemfile +2 -5
- data/LICENSE +19 -15
- data/README.md +66 -26
- data/Rakefile +42 -31
- data/config/cacert.pem +3895 -0
- data/config/rhack.yml.template +40 -0
- data/ext/curb-original/curb_config.h +3 -0
- data/ext/curb-original/curb_easy.c +3 -54
- data/ext/curb-original/curb_multi.c +69 -140
- data/ext/curb/curb_multi.c +1 -1
- data/lib/rhack.rb +82 -12
- data/lib/rhack/cookie.rb +49 -0
- data/lib/rhack/curl.rb +6 -0
- data/lib/{extensions/curb.rb → rhack/curl/easy.rb} +26 -48
- data/lib/rhack/curl/global.rb +175 -0
- data/lib/rhack/curl/itt.rb +11 -0
- data/lib/rhack/curl/multi.rb +37 -0
- data/lib/rhack/curl/post_field.rb +20 -0
- data/lib/rhack/curl/response.rb +91 -0
- data/lib/rhack/dl.rb +308 -0
- data/lib/rhack/frame.rb +316 -0
- data/lib/{extensions → rhack/js}/browser/env.js +0 -0
- data/lib/{extensions → rhack/js}/browser/jquery.js +0 -0
- data/lib/{extensions → rhack/js}/browser/xmlsax.js +0 -0
- data/lib/{extensions → rhack/js}/browser/xmlw3cdom_1.js +0 -0
- data/lib/{extensions → rhack/js}/browser/xmlw3cdom_2.js +0 -0
- data/lib/rhack/js/johnson.rb +71 -0
- data/lib/rhack/page.rb +263 -0
- data/lib/rhack/proxy.rb +3 -0
- data/lib/rhack/proxy/checker.rb +1 -1
- data/lib/rhack/scout.rb +342 -0
- data/lib/rhack/scout_squad.rb +98 -0
- data/lib/rhack/services.rb +1 -464
- data/lib/rhack/services/base.rb +59 -0
- data/lib/rhack/services/examples.rb +423 -0
- data/lib/rhack/version.rb +3 -0
- data/lib/rhack_in.rb +3 -2
- data/rhack.gemspec +28 -0
- metadata +104 -85
- data/.gemtest +0 -0
- data/Gemfile.lock +0 -23
- data/Manifest.txt +0 -60
- data/ext/curb/Makefile +0 -217
- data/lib/cache.rb +0 -44
- data/lib/curl-global.rb +0 -164
- data/lib/extensions/declarative.rb +0 -153
- data/lib/extensions/johnson.rb +0 -63
- data/lib/frame.rb +0 -848
- data/lib/init.rb +0 -49
- data/lib/rhack.yml.template +0 -19
- data/lib/scout.rb +0 -589
- data/lib/words.rb +0 -25
data/lib/cache.rb
DELETED
@@ -1,44 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
module HTTPAccessKit
|
3
|
-
|
4
|
-
class Cache < ActiveRecord::Base
|
5
|
-
declare CacheTable do |t|
|
6
|
-
t.integer :url_hash
|
7
|
-
t.string :url
|
8
|
-
t.string :path
|
9
|
-
t.string :date
|
10
|
-
t.string :ext
|
11
|
-
t.timestamps
|
12
|
-
end if DB
|
13
|
-
RAMCache = {}
|
14
|
-
|
15
|
-
def self.clean(time=7.days)
|
16
|
-
destroy_all("created_at < '#{time.ago}'").each {|c|
|
17
|
-
FileUtils.remove c.path if c.path and File.file?(c.path)}
|
18
|
-
end
|
19
|
-
CacheTTL and clean CacheTTL
|
20
|
-
|
21
|
-
def self.save(url, data, cache_data=true)
|
22
|
-
new(url, data).save
|
23
|
-
RAMCache[url.href] = data if cache_data
|
24
|
-
end
|
25
|
-
|
26
|
-
def self.load(url, cache_data=true)
|
27
|
-
if data = RAMCache[url.href]
|
28
|
-
data
|
29
|
-
elsif file = first(:select => 'date, path', :conditions => {:url_hash => url.href.hash})
|
30
|
-
RAMCache[url.href] = read(file.path) if cache_data
|
31
|
-
file
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
def initialize(url, data)
|
36
|
-
t = Time.now
|
37
|
-
path = "#{CacheDir}/#{t.to_i}-#{File.split(url.path)[1]}"
|
38
|
-
rw path, data
|
39
|
-
super :url => url.href, :url_hash => url.href.hash, :date => t.httpdate, :path => path, :ext => url.ext
|
40
|
-
end
|
41
|
-
|
42
|
-
end
|
43
|
-
|
44
|
-
end
|
data/lib/curl-global.rb
DELETED
@@ -1,164 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
module Curl
|
3
|
-
|
4
|
-
def execute(unless_allready=false)
|
5
|
-
if unless_allready and Curl.status
|
6
|
-
return L.log "Non-nil status! Avoid executing"
|
7
|
-
end
|
8
|
-
if $CarierThread and s = $CarierThread.status
|
9
|
-
L.log "Carier thread allready started and has status #{s}"
|
10
|
-
else
|
11
|
-
if s = Curl.status(false) then L.warn s end
|
12
|
-
L.log($CarierThread ? "Resetting Carier thread" : "Setting Carier thread up")
|
13
|
-
$CarierThread = Thread.new {
|
14
|
-
error = nil
|
15
|
-
begin
|
16
|
-
# "why Thread#value is raising since it never raised before?"
|
17
|
-
yield if block_given?
|
18
|
-
rescue => error
|
19
|
-
nil
|
20
|
-
end
|
21
|
-
loop {
|
22
|
-
begin
|
23
|
-
# with true argument (idle) it would break only if no requests to perform
|
24
|
-
break unless $Carier.perform true
|
25
|
-
L.log "Nothing to perform; idling..."
|
26
|
-
rescue => error
|
27
|
-
break
|
28
|
-
# but ruby mystically crashes if next sequence occur:
|
29
|
-
# Multi performs and can't see any requests so entering idle mode
|
30
|
-
# we add some requests and multi load them
|
31
|
-
# one of requests' callbacks raises error in *main* thread
|
32
|
-
# so we can't allow any raises here, instead, keep them in 'wait' section
|
33
|
-
end
|
34
|
-
} unless error
|
35
|
-
error
|
36
|
-
}
|
37
|
-
# until main thread has sleep a bit, $CarierThread will have status "run",
|
38
|
-
# no matter whether it's idling or performing requests
|
39
|
-
sleep 0.001
|
40
|
-
end
|
41
|
-
end
|
42
|
-
alias :run :execute
|
43
|
-
module_function :execute, :run
|
44
|
-
|
45
|
-
def wait
|
46
|
-
if $CarierThread and $CarierThread.status
|
47
|
-
unless within = Thread.current == $CarierThread
|
48
|
-
# We can't set `perform' timeout lesser than 1 second in the curl binding
|
49
|
-
# because in that case thread status would always be "run"
|
50
|
-
# so here we wait for exactly 1 sec
|
51
|
-
sleep 1
|
52
|
-
end
|
53
|
-
# Also, if thread do Kernel.sleep, it would skip Curl.wait here
|
54
|
-
if !$Carier.sheduled and ($CarierThread.status == 'sleep' or within && $Carier.reqs.empty?)
|
55
|
-
L.log "No shedule to wait"
|
56
|
-
else
|
57
|
-
this_thread = within ? 'it\'s thread' : Thread.main == Thread.current ? 'main thread' : 'thread '+Thread.current.object_id
|
58
|
-
L.log "Waiting for Carier to complete in #{this_thread}"
|
59
|
-
begin
|
60
|
-
L.log { "Trying to change $CarierThreadIsJoined #{$CarierThreadIsJoined} -> true from #{this_thread}" }
|
61
|
-
if within
|
62
|
-
L.log "calling this from one of callbacks to wait for the rest to complete"
|
63
|
-
begin
|
64
|
-
$Carier.perform
|
65
|
-
rescue RuntimeError => e
|
66
|
-
L.warn [e, e.message]
|
67
|
-
L.info "$Carier $Carier.sheduled $CarierThread $CarierThread.status", binding
|
68
|
-
L.warn "Failed to run Multi#perform: nothing to perform"
|
69
|
-
end
|
70
|
-
else
|
71
|
-
$CarierThreadIsJoined = true
|
72
|
-
$CarierThread.join
|
73
|
-
end
|
74
|
-
rescue (defined?(IRB) ? IRB::Abort : NilClass)
|
75
|
-
recall!
|
76
|
-
L.info "Carier thread recalled by keyboard"
|
77
|
-
ensure
|
78
|
-
L.log "trying to change $CarierThreadIsJoined #{$CarierThreadIsJoined} -> false from #{this_thread}"
|
79
|
-
if !within
|
80
|
-
$CarierThreadIsJoined = false
|
81
|
-
# using Curl#execute from different threads may cause problems here when you don't control input,
|
82
|
-
# for example, in a daemonized ruby process
|
83
|
-
# just do not get $CarierThread joined from non-main thread
|
84
|
-
if $CarierThread and e = $CarierThread.value
|
85
|
-
# this will raise thread-safely in main thread
|
86
|
-
# in case of unrescued error in CarierThread
|
87
|
-
L.log(([e.message]+RMTools.format_trace(e.backtrace))*"\n")
|
88
|
-
recall!
|
89
|
-
raise e
|
90
|
-
end
|
91
|
-
execute
|
92
|
-
end
|
93
|
-
end
|
94
|
-
end
|
95
|
-
else
|
96
|
-
L < "No thread to wait. I guess I should create one"
|
97
|
-
execute
|
98
|
-
wait
|
99
|
-
end
|
100
|
-
end
|
101
|
-
module_function :wait
|
102
|
-
|
103
|
-
def recall
|
104
|
-
L.debug caller
|
105
|
-
if $CarierThread
|
106
|
-
L.log "Recalling Carier thread"
|
107
|
-
$CarierThread.kill
|
108
|
-
sleep 1
|
109
|
-
else
|
110
|
-
L.log "No thread to recall"
|
111
|
-
end
|
112
|
-
end
|
113
|
-
alias :stop :recall
|
114
|
-
|
115
|
-
def recall!
|
116
|
-
if $CarierThread
|
117
|
-
L.warn "Recalling thread and resetting Carier!!!"
|
118
|
-
$CarierThread.kill
|
119
|
-
$CarierThread = nil
|
120
|
-
$Carier.reset
|
121
|
-
else
|
122
|
-
L.log "No thread to recall!"
|
123
|
-
end
|
124
|
-
end
|
125
|
-
alias :stop! :recall!
|
126
|
-
module_function :recall!, :stop!, :recall, :stop
|
127
|
-
|
128
|
-
def reset
|
129
|
-
recall
|
130
|
-
execute
|
131
|
-
end
|
132
|
-
alias :reload :reset
|
133
|
-
|
134
|
-
def reset!
|
135
|
-
recall!
|
136
|
-
execute
|
137
|
-
end
|
138
|
-
alias :reload! :reset!
|
139
|
-
module_function :reset!, :reset, :reload!, :reload
|
140
|
-
|
141
|
-
def status(raise_e=true)
|
142
|
-
if $CarierThread and (s = $CarierThread.status)
|
143
|
-
L.log "Carier thread responding with status #{s}"
|
144
|
-
s
|
145
|
-
elsif $CarierThread
|
146
|
-
if e = $CarierThread.value
|
147
|
-
if raise_e
|
148
|
-
recall!
|
149
|
-
raise e
|
150
|
-
else
|
151
|
-
L.log "Carier Thread returned #{e.inspect}"
|
152
|
-
e
|
153
|
-
end
|
154
|
-
else
|
155
|
-
L.log "Carier Thread is exited without error"
|
156
|
-
end
|
157
|
-
else
|
158
|
-
L.log "There is no Carier Thread atm"
|
159
|
-
end
|
160
|
-
end
|
161
|
-
alias :st :status
|
162
|
-
module_function :status, :st
|
163
|
-
|
164
|
-
end
|
@@ -1,153 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
module ActiveRecord
|
3
|
-
|
4
|
-
module ConnectionAdapters
|
5
|
-
AbstractAdapter
|
6
|
-
|
7
|
-
class VirtualTable < Table
|
8
|
-
|
9
|
-
def debug_str meth, called, exist, *args
|
10
|
-
"Table.#{meth}(#{args.inspects*', '}) was#{' NOT' if !called} called due to #{'in' if !exist}existance"
|
11
|
-
end
|
12
|
-
|
13
|
-
def column_exists *args
|
14
|
-
column_names = @base.columns(@table_name).names
|
15
|
-
options = args.extract_options!
|
16
|
-
names = args.dup
|
17
|
-
args << options
|
18
|
-
_or_ = (names[0] == :all) ? !names.shift : true
|
19
|
-
names.each {|name| return _or_ if name.to_s.in(column_names) == _or_}
|
20
|
-
!_or_
|
21
|
-
end
|
22
|
-
|
23
|
-
def index_exists *indexes
|
24
|
-
column_indexes = @base.indexes(@table_name).columnss.flatten
|
25
|
-
_or_ = (indexes[0] == :all) ? !indexes.shift : true
|
26
|
-
indexes.each {|index| return _or_ if index.to_s.in(column_indexes) == _or_}
|
27
|
-
!_or_
|
28
|
-
end
|
29
|
-
|
30
|
-
def initialize name, connection, map=nil
|
31
|
-
super name, connection
|
32
|
-
case map
|
33
|
-
when true; @map = []
|
34
|
-
when Array; @map = map
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
def map!
|
39
|
-
map_names = @map.firsts.to_ss
|
40
|
-
@base.columns(@table_name).names.each {|name|
|
41
|
-
name.in(map_names) ? @map.reject! {|_| _[0] == name} : remove(name)
|
42
|
-
}
|
43
|
-
@map.each {|col| column *col}
|
44
|
-
end
|
45
|
-
|
46
|
-
def column name, *args
|
47
|
-
to_be_called = !column_exists(name)
|
48
|
-
super if to_be_called
|
49
|
-
$log.debug {debug_str :column, to_be_called, !to_be_called, name, *args}
|
50
|
-
@map << [name, *args] if @map
|
51
|
-
end
|
52
|
-
|
53
|
-
%w{string text integer float decimal
|
54
|
-
datetime timestamp time date binary boolean}.each {|column_type|
|
55
|
-
define_method(column_type) {|*args|
|
56
|
-
to_be_called = !column_exists(*args)
|
57
|
-
super if to_be_called
|
58
|
-
$log.debug {debug_str column_type, to_be_called, !to_be_called, *args}
|
59
|
-
if @map
|
60
|
-
options = args.extract_options!
|
61
|
-
args = args.xprod(column_type)
|
62
|
-
args = args.xprod(options) if options
|
63
|
-
@map.concat args
|
64
|
-
end
|
65
|
-
} }
|
66
|
-
|
67
|
-
def index name, *args
|
68
|
-
to_be_called = !index_exists(name)
|
69
|
-
super if to_be_called
|
70
|
-
$log.debug {debug_str :index, to_be_called, !to_be_called, name, *args}
|
71
|
-
end
|
72
|
-
|
73
|
-
def timestamps
|
74
|
-
to_be_called = !column_exists('created_at', 'updated_at')
|
75
|
-
super if to_be_called
|
76
|
-
$log.debug {debug_str :timestamps, to_be_called, !to_be_called}
|
77
|
-
@map.concat [[:created_at, :datetime], [:updated_at, :datetime]] if @map
|
78
|
-
end
|
79
|
-
|
80
|
-
def change *args
|
81
|
-
raise NotImplementedError, "don't use #change in declaration!"
|
82
|
-
end
|
83
|
-
|
84
|
-
def change_default *args
|
85
|
-
raise NotImplementedError, "don't use #change_default in declaration!"
|
86
|
-
end
|
87
|
-
|
88
|
-
def rename column_name, new_column_name
|
89
|
-
to_be_called = !column_exists(new_column_name)
|
90
|
-
super if to_be_called
|
91
|
-
$log.debug {debug_str :rename, to_be_called, !to_be_called, column_name, new_column_name}
|
92
|
-
end
|
93
|
-
|
94
|
-
def references *args
|
95
|
-
to_be_called = !column_exists(*args.map {|col| "#{col}_id"})
|
96
|
-
super if to_be_called
|
97
|
-
$log.debug {debug_str :references, to_be_called, !to_be_called, *args}
|
98
|
-
end
|
99
|
-
alias :belongs_to :references
|
100
|
-
|
101
|
-
def remove *args
|
102
|
-
to_be_called = column_exists :all, *args
|
103
|
-
super if to_be_called
|
104
|
-
$log.debug {debug_str :remove, to_be_called, to_be_called, *args}
|
105
|
-
end
|
106
|
-
|
107
|
-
def remove_references *args
|
108
|
-
to_be_called = column_exists(:all, *args.map {|col| "#{col}_id"})
|
109
|
-
super if to_be_called
|
110
|
-
$log.debug {debug_str :remove_references, to_be_called, to_be_called, *args}
|
111
|
-
end
|
112
|
-
alias :remove_belongs_to :remove_references
|
113
|
-
|
114
|
-
def remove_index options
|
115
|
-
indexes = options.is(Hash) ? options[:column] : options
|
116
|
-
raise ArgumentError, "can remove only default format named indexes in declaration!" if !indexes
|
117
|
-
to_be_called = index_exists :all, *indexes
|
118
|
-
super if to_be_called
|
119
|
-
$log.debug {debug_str :remove_index, to_be_called, to_be_called, options}
|
120
|
-
end
|
121
|
-
|
122
|
-
def remove_timestamps
|
123
|
-
to_be_called = column_exists 'created_at', 'updated_at'
|
124
|
-
super if to_be_called
|
125
|
-
$log.debug {debug_str :remove_timestamps, to_be_called, to_be_called}
|
126
|
-
end
|
127
|
-
|
128
|
-
end
|
129
|
-
|
130
|
-
end
|
131
|
-
|
132
|
-
class Base
|
133
|
-
|
134
|
-
def self.declare name, options={}, &block
|
135
|
-
self.table_name = name
|
136
|
-
if !table_exists? or options[:force]
|
137
|
-
$log < "with options[:force] the `#{table_name}` table will have been recreated each time you load the #{model_name} model" if options[:force]
|
138
|
-
self.primary_key = options[:primary_key] if options[:id] != false and options[:primary_key]
|
139
|
-
$log.debug "connection.create_table(#{name}, #{options.inspect}) {}"
|
140
|
-
connection.create_table(name, options, &block)
|
141
|
-
elsif options[:map]
|
142
|
-
table = ConnectionAdapters::VirtualTable.new(name, connection, options[:map])
|
143
|
-
yield table
|
144
|
-
table.map!
|
145
|
-
else yield ConnectionAdapters::VirtualTable.new(name, connection)
|
146
|
-
end
|
147
|
-
reset_column_information
|
148
|
-
end
|
149
|
-
|
150
|
-
end
|
151
|
-
|
152
|
-
end
|
153
|
-
|
data/lib/extensions/johnson.rb
DELETED
@@ -1,63 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
module Johnson
|
3
|
-
begin
|
4
|
-
require 'johnson'
|
5
|
-
rescue LoadError
|
6
|
-
Enabled = false
|
7
|
-
else
|
8
|
-
if VERSION <= "2.0.0" and RUBY_VERSION > "1.9"
|
9
|
-
Enabled = false
|
10
|
-
else Enabled = true
|
11
|
-
end
|
12
|
-
end
|
13
|
-
### JavaScript interface DOM emulation ###
|
14
|
-
|
15
|
-
class Runtime
|
16
|
-
attr_accessor :thread_id
|
17
|
-
Runtime_is_set = lambda {|o| !o[:eval].b or ($JSRuntime and $JSRuntime.thread_id == $CarierThread.object_id)}
|
18
|
-
BROWSER_PATH = File.expand_path "#{File.dirname(__FILE__)}/browser"
|
19
|
-
|
20
|
-
# CarierThread breaks if Multi has no work && CarierThread
|
21
|
-
# is joined so itwon't last forever.
|
22
|
-
#
|
23
|
-
# Johnson is not thread safe =>
|
24
|
-
# Runtime created in this thread will become unusable after
|
25
|
-
# CarierThread dies.
|
26
|
-
#
|
27
|
-
# So we don't use Curl.wait until Carier haven't got whole
|
28
|
-
# request for this Runtime.
|
29
|
-
def self.set_browser_for_curl(opts)
|
30
|
-
if !Runtime_is_set[opts]
|
31
|
-
if Curl.status
|
32
|
-
Curl.recall
|
33
|
-
$log.debug 'recalled'
|
34
|
-
end
|
35
|
-
if opts[:thread_safe].b
|
36
|
-
$JSRuntime = new_browser(opts[:jq])
|
37
|
-
$log.debug "#{$JSRuntime} initialized in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
|
38
|
-
else
|
39
|
-
$log.debug 'about to run carier'
|
40
|
-
Curl.execute {$JSRuntime = new_browser(opts[:jq])
|
41
|
-
$log.debug "#{$JSRuntime} initialized in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"}
|
42
|
-
sleep 0.01 until Runtime_is_set[opts]
|
43
|
-
end
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
def self.new_browser(jq=false)
|
48
|
-
rt = new
|
49
|
-
%w{xmlw3cdom_1 xmlw3cdom_2 xmlsax env}.concat(jq ? ['jquery'] : []).each {|f|
|
50
|
-
path = "#{BROWSER_PATH}/#{f}.js"
|
51
|
-
rt.evaluate IO.read(path), path, 1
|
52
|
-
}
|
53
|
-
rt.document = ''
|
54
|
-
rt
|
55
|
-
end
|
56
|
-
|
57
|
-
def document=(html)
|
58
|
-
evaluate "var document = new DOMDocument(#{html.to_doc.to_xhtml.inspect})"
|
59
|
-
end
|
60
|
-
|
61
|
-
end
|
62
|
-
|
63
|
-
end
|
data/lib/frame.rb
DELETED
@@ -1,848 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
module HTTPAccessKit
|
3
|
-
|
4
|
-
# Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, ... ) ) =>
|
5
|
-
# Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, ...
|
6
|
-
|
7
|
-
class ZippingError < ArgumentError
|
8
|
-
def initialize debug, str="invalid use of :zip option, uri and body must be an arrays with the same size\n uri: %s(%s), body: %s(%s)"
|
9
|
-
super str%debug end
|
10
|
-
end
|
11
|
-
|
12
|
-
class TargetError < ArgumentError
|
13
|
-
def initialize msg="only static frame can use local paths"
|
14
|
-
super end
|
15
|
-
end
|
16
|
-
|
17
|
-
class ConfigError < ArgumentError
|
18
|
-
def initialize msg
|
19
|
-
super end
|
20
|
-
end
|
21
|
-
|
22
|
-
class Frame
|
23
|
-
__init__
|
24
|
-
attr_reader :loc, :static, :ss, :opts, :use_cache, :write_to
|
25
|
-
@@cache = {}
|
26
|
-
|
27
|
-
def initialize *args
|
28
|
-
args << 10 unless args[-1].is Fixnum
|
29
|
-
args.insert -2, {} unless args[-2].is Hash
|
30
|
-
@opts = {:eval => Johnson::Enabled, :redir => true, :cp => true, :result => Page}.merge!(args[-2].kinda(Hash) ? args[-2] : {})
|
31
|
-
args[-2] = @opts
|
32
|
-
if args[0].is String
|
33
|
-
uri = args[0]
|
34
|
-
'http://' >> uri if uri !~ /^\w+:\/\//
|
35
|
-
@loc = uri.parse:uri
|
36
|
-
# be careful, if you set :static => false, frame will be unable to use implicit url
|
37
|
-
@static = @opts.fetch(:static, true)
|
38
|
-
else
|
39
|
-
@loc = {}
|
40
|
-
@static = false
|
41
|
-
end
|
42
|
-
@ss = ScoutSquad *args
|
43
|
-
Curl.run :unless_allready
|
44
|
-
end
|
45
|
-
|
46
|
-
def retarget to, forced=nil
|
47
|
-
to = 'http://' + to if to !~ /^\w+:/
|
48
|
-
@ss.update to, forced
|
49
|
-
@loc = to.parse:uri
|
50
|
-
end
|
51
|
-
alias :target= :retarget
|
52
|
-
|
53
|
-
def next() @ss.next end
|
54
|
-
def rand() @ss.rand end
|
55
|
-
def each(&block) @ss.each &block end
|
56
|
-
def [](i) @ss[i] end
|
57
|
-
|
58
|
-
def copy_cookies! i=0
|
59
|
-
@ss.each {|s| s.cookies.replace @ss[i].cookies}
|
60
|
-
end
|
61
|
-
|
62
|
-
def use_cache! opts={}
|
63
|
-
if opts == false
|
64
|
-
@use_cache = false
|
65
|
-
else
|
66
|
-
@@cache = opts[:pages].kinda(Hash) ? opts[:pages] : opts[:pages].map_hash {|p| [p.href, p]} if opts[:pages]
|
67
|
-
#@write_to = opts[:write_to] if :write_to.in opts
|
68
|
-
@use_cache = true
|
69
|
-
end
|
70
|
-
end
|
71
|
-
|
72
|
-
def drop_cache! use=nil
|
73
|
-
@@cache.clear
|
74
|
-
GC.start
|
75
|
-
@use_cache = use if use.in [true, false]
|
76
|
-
end
|
77
|
-
|
78
|
-
def inspect
|
79
|
-
"<#Frame @ #{@ss.untargeted ? 'no target' : @loc.root}: #{'scout'.x @ss.size}#{', static'+(' => '+@static.protocol if @static.is(Hash)) if @static}, cookies #{@ss[0].cookieProc ? 'on' : 'off'}>"
|
80
|
-
end
|
81
|
-
|
82
|
-
# opts are :eval, :json, :hash, :wait, :proc_result, :save_result, :load_scripts,
|
83
|
-
# :zip, :thread_safe, :result, :stream, :raw, :xhr + any opts for Scouts in one hash
|
84
|
-
def exec *args, &callback
|
85
|
-
many, order, orders, with_opts = interpret_request *args
|
86
|
-
L.log({:many => many, :order => order, :orders => orders, :with_opts => with_opts})
|
87
|
-
|
88
|
-
if !Johnson::Enabled and with_opts[:eval]
|
89
|
-
L < "failed to use option :eval because Johnson is disabled"
|
90
|
-
with_opts.delete :eval
|
91
|
-
end
|
92
|
-
# JS Runtime is not thread-safe and must be created in curl thread
|
93
|
-
# if we aren't said explicitly about the opposite
|
94
|
-
Johnson::Runtime.set_browser_for_curl with_opts
|
95
|
-
|
96
|
-
if many then exec_many orders, with_opts, &callback
|
97
|
-
else exec_one order, with_opts, &callback end
|
98
|
-
end
|
99
|
-
alias :get :exec
|
100
|
-
alias :run :get
|
101
|
-
|
102
|
-
def interpret_request(*args)
|
103
|
-
body, mp, uri, opts = args.dup.get_opts [nil, false, nil], @opts
|
104
|
-
L.log [body, mp, uri, opts]
|
105
|
-
zip = opts.delete :zip
|
106
|
-
many = order = orders = post = false
|
107
|
-
# Default options set is for POST
|
108
|
-
if mp.is String or mp.kinda Array and !(uri.is String or uri.kinda Array)
|
109
|
-
# if second arg is String, then that's uri
|
110
|
-
uri, mp, post = mp.dup, false, true
|
111
|
-
# L.debug "uri #{uri.inspect} has been passed as second argument instead of third"
|
112
|
-
# But if we have only one argument actually passed
|
113
|
-
# except for options hash, then believe it's GET
|
114
|
-
elsif body.is String or body.kinda [String]
|
115
|
-
L.debug "first parameter (#{body.inspect}) was implicitly taken as uri#{' '+body.class if body.kinda Array}, but last paramter is of type #{uri.class}, too" if uri
|
116
|
-
uri = body.dup
|
117
|
-
elsif !body then uri = nil
|
118
|
-
else
|
119
|
-
uri = uri.dup if uri
|
120
|
-
mp, post = !!mp, true
|
121
|
-
end
|
122
|
-
if post
|
123
|
-
unless body.is Hash or body.kinda [Hash]
|
124
|
-
raise TypeError, "body of post request must be a hash or hash array, params was
|
125
|
-
(#{args.inspect[1..-2]})"
|
126
|
-
end
|
127
|
-
validate_zip uri, body if zip
|
128
|
-
if zip or uri.kinda Array or body.kinda Array
|
129
|
-
many = true
|
130
|
-
if zip or uri.kinda Array
|
131
|
-
validate_some uri
|
132
|
-
orders = zip ? body.zip(uri) : uri.xprod(body, :inverse)
|
133
|
-
else
|
134
|
-
uri = validate uri
|
135
|
-
orders = body.xprod uri
|
136
|
-
end
|
137
|
-
orders.each {|o| o.unshift :loadPost and o.insert 2, mp}
|
138
|
-
else
|
139
|
-
uri = validate uri
|
140
|
-
order = [:loadPost, body, mp, uri]
|
141
|
-
end
|
142
|
-
else
|
143
|
-
if uri.kinda Array
|
144
|
-
many = true
|
145
|
-
validate_some uri
|
146
|
-
orders = [:loadGet].xprod uri
|
147
|
-
else
|
148
|
-
uri = validate uri
|
149
|
-
order = [:loadGet, uri]
|
150
|
-
end
|
151
|
-
end
|
152
|
-
if !order.b and !orders.b
|
153
|
-
raise ArgumentError, "failed to run blank request#{'s' if many}, params was
|
154
|
-
(#{args.inspect[1..-2]})"
|
155
|
-
end
|
156
|
-
|
157
|
-
opts[:wait] = opts[:sync] if :sync.in opts
|
158
|
-
opts[:wait] = true if !:wait.in(opts) and
|
159
|
-
:proc_result.in(opts) ? !opts[:proc_result] : opts[:save_result]
|
160
|
-
opts[:eval] = false if opts[:json] or opts[:hash] or opts[:raw]
|
161
|
-
opts[:load_scripts] = self if opts[:load_scripts]
|
162
|
-
opts[:stream] = true if opts[:raw]
|
163
|
-
(opts[:headers] ||= {})['X-Requested-With'] = 'XMLHttpRequest' if opts[:xhr]
|
164
|
-
[many, order, orders, opts]
|
165
|
-
end
|
166
|
-
|
167
|
-
def get_cached(*links)
|
168
|
-
res = []
|
169
|
-
expire = links[-1] == :expire ? links.pop : false
|
170
|
-
links.parses(:uri).each_with_index {|uri, i|
|
171
|
-
next if uri.path[/ads|count|stats/]
|
172
|
-
file = Cache.load uri, !expire
|
173
|
-
if file
|
174
|
-
if expire
|
175
|
-
@ss.next.loadGet(uri.href, :headers=>{'If-Modified-Since'=>file.date}) {|c|
|
176
|
-
if c.res.code == 200
|
177
|
-
res << [i, (data = c.res.body)]
|
178
|
-
Cache.save uri, data, false
|
179
|
-
else
|
180
|
-
res << [i, file.is(String) ? file : read(file.path)]
|
181
|
-
end
|
182
|
-
}
|
183
|
-
else
|
184
|
-
res << [i, file.is(String) ? file : read(file.path)]
|
185
|
-
end
|
186
|
-
else
|
187
|
-
@ss.next.loadGet(uri.href) {|c|
|
188
|
-
if c.res.code == 200
|
189
|
-
res << [i, (data = c.res.body)]
|
190
|
-
Cache.save uri, data, !expire
|
191
|
-
end
|
192
|
-
}
|
193
|
-
end
|
194
|
-
}
|
195
|
-
Curl.wait
|
196
|
-
links.size == 1 ? res[0][1] : res.sort!.lasts
|
197
|
-
end
|
198
|
-
|
199
|
-
def get_distr(uri, psize, threads, start=0, print_progress=$verbose)
|
200
|
-
raise ConfigError, "Insufficient Scouts in the Frame for distributed downloading" if @ss.size < 2
|
201
|
-
@print_progress, code, stop_download, @ss_reserve = print_progress, nil, false, []
|
202
|
-
(s = @ss.next).http.on_header {|h|
|
203
|
-
next h.size unless h[/Content-Length: (\d+)|HTTP\/1\.[01] (\d+)[^\r]+|^\s*$/]
|
204
|
-
if code = $2
|
205
|
-
if code != '200'
|
206
|
-
L << "#$& getting #{uri}; interrupting request."
|
207
|
-
s.http.on_header() # set default process
|
208
|
-
next 0
|
209
|
-
end
|
210
|
-
next h.size
|
211
|
-
end
|
212
|
-
|
213
|
-
s.http.on_header() # set default process
|
214
|
-
if !$1 # конец хедера, content-length отсутствует
|
215
|
-
L << "No Content-Length header; trying to load a whole #{uri} at once!"
|
216
|
-
s.loadGet {|c| yield c.res.body.size, 0, c.res.body}
|
217
|
-
next 0
|
218
|
-
end
|
219
|
-
|
220
|
-
len = $1.to_i - start
|
221
|
-
psize = configure_psize(len, psize, threads)
|
222
|
-
parts = (len/psize.to_f).ceil
|
223
|
-
setup_speedometer(uri, parts, len)
|
224
|
-
yield len, psize, :careful_dl if len > (@opts[:careful_dl] || 10.mb)
|
225
|
-
|
226
|
-
@ss_reserve = @ss[threads+1..-1]
|
227
|
-
@ss = @ss[0..threads]
|
228
|
-
(0...parts).each {|n|
|
229
|
-
break if stop_download
|
230
|
-
|
231
|
-
s = @ss.next
|
232
|
-
run_speedometer(s, len, n)
|
233
|
-
s.loadGet(uri, :headers => {
|
234
|
-
'Range' => "bytes=#{start + n*psize}-#{start + (n+1)*psize - 1}"
|
235
|
-
}) {|c|
|
236
|
-
clear_speedometer(s)
|
237
|
-
if c.res.code/10 == 20
|
238
|
-
yield len, n*psize, c.res.body
|
239
|
-
else
|
240
|
-
L << "#{c.res} during get #{uri.inspect}; interrupting request."
|
241
|
-
stop_download = true
|
242
|
-
end
|
243
|
-
}
|
244
|
-
}
|
245
|
-
0
|
246
|
-
}
|
247
|
-
s.raise_err = false
|
248
|
-
s.loadGet validate uri
|
249
|
-
ensure
|
250
|
-
@ss.concat @ss_reserve || []
|
251
|
-
end
|
252
|
-
|
253
|
-
def dl(uri, df=File.basename(uri.parse(:uri).path), psize=:auto, opts={})
|
254
|
-
dled = 0
|
255
|
-
lock = ''
|
256
|
-
callback = lambda {|len, pos, body|
|
257
|
-
if body != :careful_dl
|
258
|
-
begin
|
259
|
-
write(df, body, pos)
|
260
|
-
rescue => e
|
261
|
-
binding.start_interaction
|
262
|
-
raise
|
263
|
-
end
|
264
|
-
if (dled += body.size) == len
|
265
|
-
File.delete lock if File.file? lock
|
266
|
-
yield df if block_given?
|
267
|
-
end
|
268
|
-
else
|
269
|
-
lock = lock_file df, len, pos # filename, filesize, partsize
|
270
|
-
end
|
271
|
-
}
|
272
|
-
opts[:threads] ||= @ss.size-1
|
273
|
-
get_distr(uri, psize, opts[:threads], opts[:start].to_i, &callback)
|
274
|
-
Curl.wait unless block_given?
|
275
|
-
df
|
276
|
-
end
|
277
|
-
|
278
|
-
def simple_dl(uri, df=File.basename(uri.parse(:uri).path), opts={})
|
279
|
-
opts.reverse_merge! :psize => :auto, :threads => 1, :print_progress => $verbose
|
280
|
-
L << opts
|
281
|
-
|
282
|
-
@print_progress = opts[:print_progress]
|
283
|
-
unless len = opts[:len] || (map = read_mapfile(df) and map.len)
|
284
|
-
return @ss.next.loadHead(uri) {|c| $log << c
|
285
|
-
if len = c.res['Content-Length']
|
286
|
-
simple_dl(uri, df, opts.merge(:len => len.to_i))
|
287
|
-
else L.warn "Can't get file size, so it has no sence to download this way. Or maybe it's just an error. Check ObjectSpace.find(#{c.res.object_id}) out."
|
288
|
-
end
|
289
|
-
}
|
290
|
-
end
|
291
|
-
|
292
|
-
psize, parts = check_mapfile(df, opts)
|
293
|
-
return unless psize
|
294
|
-
L << [psize, parts]
|
295
|
-
setup_speedometer(uri, parts.size, len)
|
296
|
-
|
297
|
-
obtained uri do |uri|
|
298
|
-
if opts[:threads] == 1
|
299
|
-
start = opts[:start].to_i || (parts[0] && parts[0].begin) || 0
|
300
|
-
scout = opts[:scout] || @ss.next
|
301
|
-
$log << [uri, scout]
|
302
|
-
(loadget = lambda {|n|
|
303
|
-
run_speedometer(scout, len, n)
|
304
|
-
from = start + n*psize
|
305
|
-
to = start + (n+1)*psize - 1
|
306
|
-
scout.loadGet(uri, :headers => {'Range' => "bytes=#{from}-#{to}"}) {|c|
|
307
|
-
begin
|
308
|
-
$log << "writing #{df} from #{from}: #{c.res.body.inspect}"
|
309
|
-
write(df, c.res.body, from)
|
310
|
-
rescue => e
|
311
|
-
binding.start_interaction
|
312
|
-
raise
|
313
|
-
end
|
314
|
-
if write_mapfile(df, from, to)
|
315
|
-
clear_speedometer(scout)
|
316
|
-
L.warn "file completely dl'ed, but (n+1)*psize <= len: (#{n}+1)*#{psize} <= #{len}" if (n+1)*psize <= len
|
317
|
-
yield df if block_given?
|
318
|
-
elsif (n+1)*psize <= len
|
319
|
-
loadget[n+1]
|
320
|
-
end
|
321
|
-
}
|
322
|
-
})[0]
|
323
|
-
else
|
324
|
-
exec(uri, opts.merge(:raw => true, :ranges => parts)) {|c|
|
325
|
-
L << c.res
|
326
|
-
range = c.req.range
|
327
|
-
begin
|
328
|
-
write(df, c.res.body, range.begin)
|
329
|
-
rescue => e
|
330
|
-
binding.start_interaction
|
331
|
-
raise
|
332
|
-
end
|
333
|
-
if write_mapfile(df, range.begin, range.end)
|
334
|
-
@ss.each {|s| s.http.on_progress} if @print_progress
|
335
|
-
yield df if block_given?
|
336
|
-
end
|
337
|
-
}
|
338
|
-
end
|
339
|
-
end
|
340
|
-
end
|
341
|
-
|
342
|
-
def check_mapfile(df, opts={})
|
343
|
-
opts.reverse_merge! :psize => :auto, :threads => 1
|
344
|
-
map = read_mapfile df
|
345
|
-
if map
|
346
|
-
L << map
|
347
|
-
if map.rest.empty?
|
348
|
-
puts "#{df} is loaded"
|
349
|
-
$log << 'deleting mapfile'
|
350
|
-
File.delete df+'.map'
|
351
|
-
[]
|
352
|
-
else
|
353
|
-
if opts[:len] and map.len != opts[:len]
|
354
|
-
raise "Incorrect file size for #{df}"
|
355
|
-
end
|
356
|
-
psize = configure_psize *opts.values_at(:len, :psize, :threads)
|
357
|
-
[psize, map.rest.div(psize)]
|
358
|
-
end
|
359
|
-
else
|
360
|
-
write_mapfile df, opts[:len]
|
361
|
-
psize = configure_psize *opts.values_at(:len, :psize, :threads)
|
362
|
-
$log << (0...opts[:len]).div(psize)
|
363
|
-
[psize, (0...opts[:len]).div(psize)]
|
364
|
-
end
|
365
|
-
end
|
366
|
-
|
367
|
-
def read_mapfile(df)
|
368
|
-
df += '.map'
|
369
|
-
text = read df
|
370
|
-
$log << "mapfile read: #{text}"
|
371
|
-
if text.b
|
372
|
-
text[/^(\d+)\0+(\d+)\0*\n/]
|
373
|
-
map = {}
|
374
|
-
$log << [$1,$2]
|
375
|
-
if $1 and $1 == $2
|
376
|
-
map.rest = []
|
377
|
-
else
|
378
|
-
map.len, *map.parts = text.chop/"\n"
|
379
|
-
map.len = map.len.to_i
|
380
|
-
map.parts.map! {|part| part /= '-'; part[0].to_i..part[1].to_i}
|
381
|
-
$log << map.parts
|
382
|
-
map.rest = (0...map.len) - XRange(*map.parts)
|
383
|
-
end
|
384
|
-
map
|
385
|
-
end
|
386
|
-
end
|
387
|
-
|
388
|
-
def write_mapfile(df, *args)
|
389
|
-
df += '.map'
|
390
|
-
map = ''
|
391
|
-
if args.size != 2
|
392
|
-
len = args.shift
|
393
|
-
map << len.to_s.ljust(22, "\0") << "\n" if File.file? df
|
394
|
-
end
|
395
|
-
if args.any?
|
396
|
-
read(df)[/^(\d+)\0+(\d+)\0*\n/]
|
397
|
-
$log << "mapfile read"
|
398
|
-
$log << [$1,$2]
|
399
|
-
dled = $2.to_i + args[1] - args[0] + 1
|
400
|
-
return true if dled == $1.to_i
|
401
|
-
map << "#{args[0]}..#{args[1]}\n"
|
402
|
-
$log << 'writing mapfile'
|
403
|
-
write(df, dled.to_s.ljust(11, "\0"), 11)
|
404
|
-
end
|
405
|
-
$log << [df, map]
|
406
|
-
$log << 'writing mapfile'
|
407
|
-
write df, map
|
408
|
-
nil
|
409
|
-
end
|
410
|
-
|
411
|
-
def configure_psize(len, psize, threads)
|
412
|
-
case psize
|
413
|
-
when Numeric; psize.to_i
|
414
|
-
when :auto; len > 100000 ? len/threads+1 : len
|
415
|
-
when :mb; 1.mb
|
416
|
-
else raise ArgumentError, "Incorrect value for part size #{psize}:#{psize.class}"
|
417
|
-
end
|
418
|
-
end
|
419
|
-
|
420
|
-
private
|
421
|
-
# Ensures +uri+ and +body+ can be zipped together: both must be arrays of the
# same size. Raises ZippingError describing the classes (and sizes, when both
# are arrays) otherwise; returns nil when everything matches.
def validate_zip(uri, body)
  both_arrays = (uri.kinda Array and body.kinda Array)
  raise ZippingError, [uri.class, nil, body.class, nil] unless both_arrays
  raise ZippingError, [uri.class, uri.size, body.class, body.size] if uri.size != body.size
end
|
428
|
-
|
429
|
-
# The :static option can now be a hash with a :protocol key; in that case the Frame may be relocated to the same domain on another protocol, and the default protocol is the value of @static.protocol
|
430
|
-
# Checks +uri+ against the frame's current location (@loc) and returns the
# address to request.
#
# * nil/false uri: the current @loc.href (static frames only).
# * uri without a root: joined onto @loc.root (static frames only); when
#   @static is a Hash, @loc's protocol and root are first reset from it.
# * uri with a root differing from @loc.root: a static frame refuses it,
#   unless @static is a Hash and the host is unchanged; otherwise @loc's
#   root/host/protocol are switched to the new target and +uri+ is returned.
# * uri with the same root: returned as is.
#
# Raises TargetError whenever the frame is not allowed to make the request.
def validate(uri)
  unless uri
    raise TargetError if !@static
    return @loc.href
  end

  loc = uri.parse:uri
  if !loc.root
    raise TargetError if !@static
    if @static.is Hash
      @loc.protocol = @static.protocol
      @loc.root = @loc.protocol+'://'+@loc.host
    end
    File.join @loc.root, uri
  elsif loc.root != @loc.root
    if @static
      if !@static.is(Hash)
        raise TargetError, "unable to get #{uri} by static frame #{@loc.root}, you should first update it with new target"
      elsif loc.host != @loc.host
        raise TargetError, "unable to get #{uri} by static frame [#{@static.protocol}://]#{@loc.host}, you should first update it with new target"
      end
    end
    @loc.root, @loc.host, @loc.protocol = loc.root, loc.host, loc.protocol
    uri
  else
    uri
  end
end
|
459
|
-
|
460
|
-
# Runs #validate over every element of +uris+, replacing each element in
# place with the validated value; returns the same (mutated) array.
def validate_some(uris)
  uris.map! { |target| validate(target) }
end
|
463
|
-
|
464
|
-
# Calls +callback+ (if any) with +page+ and dispatches its result:
# the result is stored in page.res when opts[:save_result] is set or opts has
# a :proc_result key, and it is passed on to opts[:proc_result] when that is
# a Proc and the result is not :skip.
def run_callbacks!(page, opts, &callback)
  return unless callback

  yres = callback.call page
  page.res = yres if opts[:save_result] or :proc_result.in opts
  opts[:proc_result].call yres if opts[:proc_result].is(Proc) and yres != :skip
end
|
475
|
-
|
476
|
-
# TODO: find out why/how IO in callbacks breaks +curl.res.body+ content and how to fix or avoid it
|
477
|
-
# Executes a single +order+ (a scout method name followed by its arguments)
# with +opts+ and returns a page, or page.res when opts[:wait] is set together
# with opts[:save_result] or a :proc_result key.
#
# A cached page is reused for :loadGet orders when caching is enabled.
# Otherwise the order is sent to the next scout of the squad; in its callback
# the response is either yielded raw (opts[:raw]) or processed into the page,
# and callbacks run only if #process returned a truthy value.
def exec_one(order, opts, &callback)
  if @use_cache and order[0] == :loadGet and page = @@cache[order[1]]
    run_callbacks! page, opts, &callback
    if opts[:wait] && (opts[:save_result] or :proc_result.in opts)
      return page.res
    else
      return page
    end
  end

  # must result in Page (default) or its subclass
  page = opts[:result].new
  # if no spare scouts can be found, squad simply waits for first callbacks to complete
  scout = @ss.next
  scout.send(*(order << opts)) do |curl|
    # there is a problem with storing html on disk
    if order[0] == :loadGet and @write_to
      # sometimes (about 2% for 100-threads-dling) when this line is executed,
      # no matter what +curl.res.body+ has contained here
      RMTools.rw @write_to+'/'+order[-2].sub(/^[a-z]+:\/\//, ''), curl.res.body.xml_to_utf
    end
    if opts[:raw]
      yield curl
    # here +curl.res.body+ becomes empty
    elsif page.process(curl, opts)
      @@cache[page.href] = page if order[0] == :loadGet and @use_cache
      run_callbacks! page, opts, &callback
    end
  end

  return page unless opts[:wait]

  opts[:thread_safe] ? $Carier.perform : Curl.wait
  (opts[:save_result] or :proc_result.in opts) ? page.res : page
end
|
508
|
-
|
509
|
-
# Executes every order of +orders+ through #exec_one with the shared
# +with_opts+ (its :wait key is removed and applied once, after all orders
# have been issued).
#
# with_opts[:ranges], when present, must have one range per order; each range
# is set as the "bytes=from-to" Range header before its order is executed.
# Returns with_opts[:stream] when it is set, otherwise the collected results.
def exec_many(orders, with_opts, &callback)
  wait_after = with_opts.delete :wait
  iterator = with_opts[:stream] ? :each : :map
  ranges = with_opts[:ranges]
  pages =
    if ranges
      if orders.size != ranges.size
        # NOTE(review): Kernel#raise treats a third positional argument as the
        # backtrace, so this format string is probably not used as a message -- confirm intent.
        raise ZippingError, [orders.size, ranges.size], "orders quantity (%s) is not equal ranges quantity (%s)"
      end
      orders.zip(ranges).send(iterator) do |order, range|
        (with_opts[:headers] ||= {}).Range = "bytes=#{range.begin}-#{range.end}"
        exec_one order, with_opts, &callback
      end
    else
      orders.send(iterator) { |order| exec_one order, with_opts, &callback }
    end
  if wait_after
    with_opts[:thread_safe] ? $Carier.perform : Curl.wait
  end
  with_opts[:stream] || pages
end
|
526
|
-
|
527
|
-
|
528
|
-
# Prepares progress-printing state for a download of +uri+ split into +parts+
# streams and starts a thread that recomputes the speed string every 0.2s
# until @stop_print is set. Does nothing unless @print_progress is set.
def setup_speedometer(uri, parts, len)
  return unless @print_progress

  @progress = Array.new(parts, 0)
  @stop_print = false
  @speed = ''
  @sum = 0
  # [time of the last measurement, amount counted at that time]
  @speedometer = [Time.now, 0]
  @str = "Downloading #{uri.gsub '%', '%%'} (#{len.bytes}) in %03s streams, %07s/s:"
  # number of extra terminal lines the status string wraps onto
  @newlines = (uri.unpack('U*').size+len.bytes.size+42)/(ENV['COLUMNS'] || 80).to_i
  @bs = "\b\r"*@newlines
  Thread.new do
    until @stop_print
      sleep 0.2
      now = Time.now
      next unless now > @speedometer[0] and @sum > @speedometer[1]

      rate = (@sum - @speedometer[1])/(now - @speedometer[0])
      @speed.replace(rate.to_i.bytes)
      @speedometer.replace [now, @sum]
    end
  end
end
|
545
|
-
|
546
|
-
# Installs an on_progress handler on +scout+'s http object that records the
# downloaded amount of stream +n+, reprints the status line with a 100-char
# progress bar and stops printing once 100% is reached.
# The handler always returns true. Does nothing unless @print_progress is set.
def run_speedometer(scout, len, n)
  return unless @print_progress

  scout.http.on_progress do |_dl_need, dl_now, *_ul|
    unless @stop_print
      @progress[n] = dl_now
      @sum = @progress.sum
      percents = @sum*100/len
      bar = "\n%%[#{'@'*percents}#{' '*(100-percents)}]\r\b\r"
      print((@str % [@progress.select_b.size, @speed]) + bar + @bs)
      if percents == 100
        puts "\v"*@newlines
        @stop_print = true
      end
    end
    true
  end
end
|
561
|
-
|
562
|
-
# Calls on_progress without a block on +scout+'s http object (presumably
# removing the handler set by #run_speedometer -- confirm against curb).
# Does nothing unless @print_progress is set.
def clear_speedometer(scout)
  scout.http.on_progress if @print_progress
end
|
566
|
-
|
567
|
-
end
|
568
|
-
|
569
|
-
# Convenience download helper: starts Curl, builds a Frame with the given
# +timeout+ and +threads+ and delegates to Frame#dl with :auto part size.
# +df+ defaults to the basename of the uri path.
def dl(uri, df=File.basename(uri.parse(:uri).path), threads=5, timeout=600, &block)
  Curl.run
  frame = Frame({:timeout=>timeout}, threads)
  frame.dl(uri, df, :auto, threads, &block)
end
|
573
|
-
module_function :dl
|
574
|
-
|
575
|
-
|
576
|
-
|
577
|
-
class Page
|
578
|
-
# for debug, just enable L#debug, don't write tons of chaotic log-lines
|
579
|
-
__init__
|
580
|
-
# res here is result of page processing made in frame context
|
581
|
-
attr_writer :title
|
582
|
-
attr_reader :html, :loc, :hash, :doc, :js, :curl_res, :failed
|
583
|
-
attr_accessor :res
|
584
|
-
@@ignore = /google|_gat|tracker|adver/i
|
585
|
-
|
586
|
-
# Builds a page either from raw html (+obj+ is then stored as @html and +loc+
# as @loc) or from a Curl::Easy / Scout, in which case the response is run
# through #process with +loc+ serving as the options hash.
# A non-Hash +loc+ is parsed as a uri first.
def initialize(obj='', loc=Hash.new(''), js=$JSRuntime||Johnson::Runtime.new)
  loc = loc.parse:uri unless loc.is Hash
  @js = js
  unless obj.is Curl::Easy or obj.kinda Scout
    @html, @loc = obj, loc
    return
  end

  c = obj.kinda(Scout) ? obj.http : obj
  @html = ''
  # just (c, loc) would pass to #process an opts variable that returns '' on any key
  process(c, loc.b || {})
end
|
599
|
-
|
600
|
-
# True when the page content is blank: @html is checked while @hash is nil,
# @hash otherwise.
def empty?
  content = @hash.nil? ? @html : @hash
  !content.b
end
|
603
|
-
|
604
|
-
# Short description of the page: for hash/json pages the size of the parsed
# data (or 'failed to parse'), for html pages the short title (or the
# response header of a failed request) and the html size.
def inspect
  unless @hash.nil?
    size = @hash ? @hash.inspect.size.bytes : 'failed to parse'
    kind = @json ? 'json' : 'params hash'
    return "<#FramePage (#{size}) #{kind}>"
  end

  summary =
    if @html.b
      heading = @failed ? @curl_res.header : '«'+title(false)+'»'
      "#{heading} (#{@html.size.bytes}"
    else
      '(empty'
    end
  js_note = (' js enabled' if @js and @doc and @hash.nil?)
  "<#FramePage #{summary})#{js_note}>"
end
|
611
|
-
|
612
|
-
# Re-tags @html with +encoding+ in place (no transcoding) and returns it.
def html!(encoding='UTF-8')
  @html.force_encoding encoding
end
|
615
|
-
|
616
|
-
# #process can be overridden in Page subclasses
|
617
|
-
# Frame only checks the value returned by #process for truthiness: callbacks are run only if it is truthy
|
618
|
-
# Fills the page from the finished curl handle +c+.
#
# Always sets @loc (from the last effective url) and @curl_res.
# On a 200 response:
#   opts[:json] - @hash is the parsed json, or false when parsing failed or
#                 produced a String (the body is then kept as @html/@doc)
#   opts[:hash] - @hash is the params hash of an inline body, otherwise false
#                 (the body is then kept as @html/@doc)
#   otherwise   - @html is the utf-converted body and @doc is built; with
#                 opts[:eval] scripts are loaded and evaluated
# On any other response code, unless json/hash was requested, the body is
# stored as @html and the code as @failed.
# Returns self.
def process(c, opts={})
  @loc = c.last_effective_url.parse:uri
  @curl_res = c.res
  L.debug "#{@loc.fullpath} -> #{@curl_res}"

  if @curl_res.code != 200
    unless opts[:json] or opts[:hash]
      @html = @curl_res.body
      @failed = @curl_res.code
    end
    return self
  end

  body = @curl_res.body
  if opts[:json]
    @json = true
    @hash =
      begin
        body.from_json
      rescue StandardError
        false
      end
    if !@hash or @hash.is String
      L.debug "failed to get json from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
      @html = body
      to_doc
      @hash = false
    end
  elsif opts[:hash]
    if body.inline
      @hash = body.to_params
    else
      @hash = false
      L.debug "failed to get params hash from #{c.last_effective_url}, take a look at my @doc for info; my object_id is #{object_id}"
      @html = body
      to_doc
    end
  else
    @html = body.xml_to_utf
    to_doc
    if opts[:eval]
      load_scripts opts[:load_scripts]
      eval_js
    end
  end
  self
end
|
659
|
-
|
660
|
-
# Sets up document/window location in the js runtime and evaluates every
# <script> node of the page.
# Inline scripts are evaluated, whatever they put into js[:write_output] is
# inserted after the node, and the node is removed. Scripts with a src are
# fetched through +frame+ (when given) and evaluated.
def eval_js(frame=nil)
  eval_string("document.location = window.location = #{@loc.to_json};\n" \
              "document.URL = document.baseURI = document.documentURI = location.href;\n" \
              "document.domain = location.host;")
  find("script").each do |n|
    script = n.text.strip
    L.debug script
    if text = script.b
      js[:write_output] = ''
      eval_string text
      written = js[:write_output].b
      n.after written if written
      n.remove!
    elsif frame and n.src
      eval_string frame.get_cached expand_link n.src
    end
  end
end
|
676
|
-
|
677
|
-
# Evaluates +str+ in the page's js runtime (created on demand).
# Johnson errors are not propagated: the message is logged as a warning and,
# at debug level, the source is logged with the offending identifier highlighted.
def eval_string(str)
  @js ||= Johnson::Runtime.new
  L.debug "#{@js} evaluating in #{Thread.current}\nmain: #{Thread.main}; carier: #{$CarierThread}"
  begin
    @js.evaluate(str)
  rescue Johnson::Error => error
    L.warn error.message
    L.debug do
      culprit = error.message.match(/(\w+) is undefined|([\w.]+) is not a function/)
      L.clr.hl! str, /\b#{culprit[1] || culprit[2]}\b/ if culprit
      "\n\t#{str}"
    end
  end
end
|
692
|
-
|
693
|
-
# (Re)builds @doc from @html via String#to_doc with the :forceutf flag and returns it.
def to_doc
  @doc = @html.to_doc(:forceutf)
end
|
696
|
-
|
697
|
-
# Page title.
#
# For hash/json, failed or blank pages this is simply @loc.href.
# With +full+ the document title is returned (and cached in @title); when the
# document has none, the page url is used and also inserted as a <title> node
# into <head>, if there is one.
# Without +full+ a version shortened to about 40 characters is returned.
def title(full=true)
  return @loc.href unless @hash.nil? and !@failed and @html.b

  if full
    to_doc unless defined? @doc
    if @doc.title.b
      @title = @doc.title
    else
      @title = @loc.href
      head = @doc.at('head')
      head.prepend XML::Node('title', @title) if head
      @title
    end
  else
    title true unless defined? @title
    @short_title =
      if RUBY_VERSION < '1.9' and @title.cyr? and UTF2ANSI[@title].size > 40
        ANSI2UTF[UTF2ANSI[@title][/.{1,30}\S*/][0..38]]+'…'
      elsif @title.size > 40
        @title[/.{1,30}\S*/][0..38]+'…'
      else
        @title
      end
  end
end
|
722
|
-
|
723
|
-
# Finds all nodes matching +xp+ in the document (built on demand).
def find(xp)
  doc = @doc || to_doc
  doc.find xp
end
|
724
|
-
|
725
|
-
# Returns what the document's #at gives for +xp+ (document is built on demand).
def at(xp)
  doc = @doc || to_doc
  doc.at xp
end
|
726
|
-
|
727
|
-
# Address of the page (@loc.href).
def url
  @loc.href
end
|
728
|
-
alias :href :url
|
729
|
-
|
730
|
-
# Expanded, unique src values.
# A String +links+ is used as a selector whose matching nodes provide the
# srcs; if the lookup raises XML::Error the string itself is treated as the
# only link. Anything else is taken as a ready list of links.
def get_srcs(links='img')
  if links.is String
    begin
      links = find(links).map { |node| node.src }
    rescue XML::Error
      links = [links]
    end
  end
  links.map { |link| expand_link link }.uniq
end
|
738
|
-
|
739
|
-
# Expanded src of the first node matching the selector +link+, or nil when
# nothing matches. If the lookup raises XML::Error, +link+ itself is expanded.
# A non-String +link+ is used as is.
def get_src(link='img')
  if link.is String
    begin
      node = at(link)
      link = node && at(link).src
    rescue XML::Error
      nil
    end
  end
  expand_link link if link
end
|
746
|
-
|
747
|
-
# Expanded, unique href values.
# A String +links+ is used as a selector; when the matching nodes yield
# nothing, anchors nested in them ("<selector>//a") are tried instead. If the
# lookup raises XML::Error the string itself is treated as the only link.
# Anything else is taken as a ready list of links.
def get_links(links='a')
  if links.is String
    begin
      direct = find(links).map { |node| node.href }
      links = direct.b || find(links+'//a').map { |node| node.href }
    rescue XML::Error
      links = [links]
    end
  end
  links.map { |link| expand_link link }.uniq
end
|
755
|
-
|
756
|
-
# Expanded href of the first node matching the selector +link+; when that
# node has no href, the first anchor nested in it ("<selector>//a") is tried.
# Returns nil when nothing suitable is found (previously a matching node
# without href and without a nested anchor raised NoMethodError on nil).
# If the lookup raises XML::Error, +link+ itself is expanded.
# A non-String +link+ is used as is.
def get_link(link='a')
  if link.is String
    begin
      node = at(link)
      href = node && node.href
      if node && !href
        nested = at(link+'//a')
        href = nested && nested.href
      end
      link = href
    rescue XML::Error
      nil
    end
  end
  expand_link link if link
end
|
763
|
-
alias :get_hrefs :get_links
|
764
|
-
alias :links :get_links
|
765
|
-
alias :get_href :get_link
|
766
|
-
alias :link :get_link
|
767
|
-
alias :srcs :get_srcs
|
768
|
-
alias :src :get_src
|
769
|
-
|
770
|
-
# Expands +link+ relative to the page location (@loc):
#   "scheme://..."  - returned untouched
#   "//host/..."    - protocol-relative, prefixed with "<protocol>:"
#                     (@loc.protocol carries no colon, so it is added here;
#                     it used to be omitted, producing "http//host/...")
#   "/path"         - prefixed with @loc.root
#   anything else   - joined onto the directory of @loc.path, or onto
#                     @loc.root when the path is blank
def expand_link(link)
  case link
  when /^\w+:\/\// then link
  when /^\/\// then @loc.protocol+':'+link
  when /^\// then @loc.root+link
  else File.join((@loc.path.b ? File.dirname(@loc.path) : @loc.root), link)
  end
end
|
778
|
-
|
779
|
-
# Builds the arguments of the request that submitting a form would make.
#
# form - a selector String, :self (the form whose action is the page's own
#        path) or an already found form node
# hash - values overriding/extending the form's own inputs
# opts - passed through as the last element of the result
#
# Returns [hash, multipart?, action, opts] for a POST form and
# [action_with_query, opts] otherwise. A form without a method attribute is
# treated as GET, the HTML default (it used to raise NoMethodError on nil).
# Raises XML::Error when the selector does not lead to a <form> node.
def form(form='form', hash={}, opts={})
  form = "[action=#{@loc.path.inspect}]" if form == :self
  if form.is String
    form_node = at form
    raise XML::Error, "Can't find form by xpath `#{form}` on page #{inspect}" if !form_node or form_node.name != 'form'
  else
    form_node = form
  end
  hash = form_node.inputs_all.merge!(hash)
  action = expand_link(form_node.action || @loc.path)
  http_method = (form_node['method'] || 'get').downcase
  if http_method == 'post'
    [hash, form_node.enctype =~ /multipart/, action, opts]
  else
    action = "#{action}#{action['?'] ? '&' : '?'}#{hash.urlencode}" if hash.b
    [action, opts]
  end
end
|
795
|
-
|
796
|
-
# Submits +form+ (see #form) through +frame+ and returns the result of
# frame.exec. The page's address is used as Referer unless one is already
# set. A static frame pointing elsewhere is retargeted to the form action for
# the request and forcibly retargeted back afterwards.
def submit(form, frame, hash={}, opts={}, &callback)
  (opts[:headers] ||= {}).Referer ||= @loc.href if @loc
  query = form(form, hash, opts)

  original_target = frame.loc.href
  form_target = query[2] || query[0]
  need_retargeting = frame.static && original_target != form_target
  frame.retarget form_target if need_retargeting
  page = frame.exec(*query, &callback)
  frame.retarget original_target, :forced if need_retargeting
  page
end
|
808
|
-
|
809
|
-
# Fetches every external script of the page ("script[src]") through +frame+
# and evaluates each result. Does nothing (returning +frame+) when +frame+ is nil/false.
def load_scripts(frame)
  return frame unless frame

  frame.get_cached(*get_srcs("script[src]")).each { |source| eval_string source }
end
|
812
|
-
|
813
|
-
end
|
814
|
-
|
815
|
-
# using reprocessing of page in case of non-200 response:
|
816
|
-
# page_class = ReloadablePage do
|
817
|
-
# @res and @res.code != 200
|
818
|
-
# end
|
819
|
-
# Builds an anonymous Page subclass whose #process, after the regular
# processing, evaluates +reload_condition+ in the context of the curl handle.
# When the condition holds, the request is retried and #process returns nil;
# otherwise it returns the page itself.
def ReloadablePage(&reload_condition)
  Class.new(Page) do
    define_method(:process) do |curl, opts|
      super(curl, opts || {})
      next self unless curl.instance_eval(&reload_condition)

      curl.retry!
      nil # in case of reload_condition.call super's callback will not proceed
    end
  end
end
|
831
|
-
|
832
|
-
end
|
833
|
-
|
834
|
-
|
835
|
-
|
836
|
-
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
841
|
-
|
842
|
-
|
843
|
-
|
844
|
-
|
845
|
-
|
846
|
-
|
847
|
-
|
848
|
-
|