list_spider 0.1.1 → 0.1.2
- checksums.yaml +4 -4
- data/lib/list_spider.rb +209 -57
- metadata +2 -3
- data/lib/spider_base.rb +0 -298
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 45ea1dba6db98ca7a9cdaecde7f744728cd20b03
+  data.tar.gz: 118764345cebb58a37e15af591b3f007451c2486
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 673150361b67fd16cf7dc86560c0bbe17d3d432f3f40dc4456019e9700d0d68f3b1d9eea8d6c036fc3ea904866497d248b51a36007e345a9233a43b827d0846b
+  data.tar.gz: 5c2b99885733c979d9e1f9f2426521b125fce8dd951a3f51c96d25c33ae1c180b0aeb70654b5b4422b0691bb337fdd517834cb28fa0edbee2798e895c6aa2465
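For reference, the checksums.yaml shipped inside a .gem records one digest per payload file, keyed by algorithm, and both sets change on every release. Below is a minimal verification sketch (not part of the gem), assuming the archive has been unpacked with `tar xf list_spider-0.1.2.gem` so that checksums.yaml, metadata.gz and data.tar.gz sit in the current directory:

    require 'digest'
    require 'yaml'

    # checksums.yaml maps algorithm name -> { file name -> hex digest }.
    expected = YAML.load_file('checksums.yaml')

    %w[metadata.gz data.tar.gz].each do |name|
      actual = Digest::SHA512.file(name).hexdigest
      puts "#{name}: #{actual == expected['SHA512'][name] ? 'OK' : 'MISMATCH'}"
    end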
data/lib/list_spider.rb
CHANGED
@@ -1,81 +1,233 @@
-require
+require 'em-http-request'
+require 'nokogiri'
+require 'fileutils'
+require 'set'
+require "addressable/uri"
+require File.expand_path('../spider_helper', __FILE__)
 require File.expand_path('../delete_unvalid', __FILE__)
 
-class
-
-
-
+class TaskStruct
+  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
+    @origin_href = href
+    @href = href
+    if @href.class == "".class
+      @href = SpiderHelper.string_to_uri(@href)
+    end
+    @local_path = local_path
+    @http_method = http_method
+    @params = params
+    @extra_data = extra_data
+    @parse_method = parse_method
+  end
 
-
+  def ==(o)
+    o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
+  end
 
-
+  attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method
 
-
-
-
-
-
-
-    @failed_size = 0
-  end
+end
+
+module ListSpider
+
+  RANDOM_TIME = -1
+  NO_LIMIT_CONCURRENT = -1
 
-
+  @@random_time_range = 3..10
+  @@conver_to_utf8 = false
+  @@connection_opts = {:connect_timeout => 2*60}
+  @@overwrite_exist = false
+  @@max_redirects = 10
+  @@url_set = Set.new
 
   class << self
 
-    attr_accessor :random_time_range
+    attr_accessor :random_time_range, :conver_to_utf8, :overwrite_exist, :max_redirects
 
-
+    def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
+      @@connection_opts = {
+        :proxy => {
+          :host => proxy_addr,
+          :port => proxy_port
+        }
+      }
+      @@connection_opts[:proxy][:authorization] = [username, password] if username && password
+    end
 
-
-
-      @down_list = @down_list + task
-    elsif task.is_a?TaskStruct
-      @down_list << task
-    else
-      puts "error task type:#{task.class}"
+    def connect_timeout(max_connect_time)
+      @@connection_opts[:connect_timeout] = max_connect_time
     end
-  end
 
-
-
-    @failed_size += failed_list.size
-    # puts "success size:#{success_list.size}"
-    # puts "failed size:#{failed_list.size}"
-    success_list.each do |e|
-      e.parse_method.call(e.local_path, e.extra_data, self) if e.parse_method
+    def set_header_option(header_option)
+      @@header_option = optHash
     end
-
-
-
-
-
-
-
-
-
-
-
+
+    def event_machine_down(link_struct_list, callback = nil)
+      failed_list = []
+      succeed_list = []
+      multi = EventMachine::MultiRequest.new
+      # no_job = true
+      begin_time = Time.now
+
+      for_each_proc = proc do |e|
+        # if !@@overwrite_exist && File.exist?(e.local_path)
+        #   succeed_list << e
+        # else
+        next unless @@url_set.add?(e.href)
+        # no_job = false
+        opt = {}
+        opt = {:redirects => @@max_redirects}
+        opt[:head] = @@header_option if defined? @@header_option
+        if e.http_method == :post
+          opt[:body] = e.params unless e.params.empty?
+          if @@connection_opts
+            w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
+          else
+            w = EventMachine::HttpRequest.new(e.href).post opt
+          end
         else
-
+          if @@connection_opts
+            opt[:query] = e.params unless e.params.empty?
+            w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
+          else
+            w = EventMachine::HttpRequest.new(e.href).get opt
+          end
         end
+
+        w.callback {
+          @@url_set.delete(e.href)
+          # puts "complete:#{w.response_header}"
+          s = w.response_header.status
+          puts s
+          if s != 404
+            local_dir = File.dirname(e.local_path)
+            FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+            begin
+              File.open(e.local_path, "w") do |f|
+                if @@conver_to_utf8 == true
+                  f << SpiderHelper.to_utf8(w.response)
+                else
+                  f << w.response
+                end
+              end
+              succeed_list << e
+            rescue Exception => e
+              puts e
+            end
+          end
+        }
+        w.errback {
+          @@url_set.delete(e.href)
+          puts "errback:#{w.response_header}"
+          puts e.origin_href
+          puts e.href
+          puts w.response_header.status
+          failed_list << e
+          if e.http_method == :get
+            SpiderHelper.direct_http_get(e.href, e.local_path)
+          elsif e.http_method == :post
+            SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
+          end
+        }
+        multi.add e.local_path, w
+        # end
+      end
+
+      cb = Proc.new do
+        end_time = Time.now
+        puts "use time:#{end_time-begin_time} seconds"
+        if callback.nil?
+          puts "success size:#{self.succeed_size}"
+          puts "failed size:#{self.failed_size}"
+          EventMachine.stop
+        else
+          callback.call(multi, succeed_list, failed_list)
         end
       end
-
+
+      link_struct_list.each &for_each_proc
+      multi.callback &cb
     end
-end
 
-
-
-
-
+    def complete(multi, success_list, failed_list)
+      @@succeed_size += success_list.size
+      @@failed_size += failed_list.size
+      success_list.each do |e|
+        e.parse_method.call(e.local_path, e.extra_data) if e.parse_method
+      end
 
-
-
-
+      todo = @@down_list.slice!(0, @@max)
+      if todo.empty?
+        puts "success size:#{@@succeed_size}"
+        puts "failed size:#{@@failed_size}"
+        EventMachine.stop
+      else
+        if @@inter_val != 0
+          if success_list.size != 0 || failed_list.size != 0
+            if @@inter_val == RANDOM_TIME
+              sleep(rand(@@random_time_range))
+            else
+              sleep(@@inter_val)
+            end
+          end
+        end
+        event_machine_down(todo, method(:complete))
+      end
+    end
 
-
-
-
+    def event_machine_start_list(down_list, callback = nil)
+      EventMachine.run {
+        if down_list.empty?
+          callback.call(nil, [], []) if callback
+        else
+          event_machine_down(down_list, callback)
+        end
+      }
+    end
 
+    def filter_list(down_list)
+      need_down_list = []
+      down_list.each do |ts|
+        if !@@overwrite_exist && File.exist?(ts.local_path)
+          ts.parse_method.call(ts.local_path, ts.extra_data) if ts.parse_method
+        else
+          need_down_list << ts
+        end
+      end
+      return need_down_list
+    end
+
+    def get_list(down_list, inter_val: 0, max: 30)
+      @@down_list = []
+
+      need_down_list = filter_list(down_list)
+
+      @@down_list = @@down_list + need_down_list
+      @@inter_val = inter_val
+      @@max = max
+      @@max = @@down_list.size if @@max == NO_LIMIT_CONCURRENT
+      @@succeed_size = 0
+      @@failed_size = 0
+
+      puts "total size:#{@@down_list.size}"
+      event_machine_start_list(@@down_list.slice!(0, @@max), method(:complete))
+    end
+
+    def get_one(task)
+      get_list([task])
+    end
+
+    def add_task(task)
+      if task.is_a?Array
+        need_down_list = filter_list(task)
+        @@down_list = @@down_list + need_down_list
+      elsif task.is_a?TaskStruct
+        need_down_list = filter_list([task])
+        @@down_list = @@down_list + need_down_list
+      else
+        puts "error task type:#{task.class}"
+      end
+    end
+
+  end
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: list_spider
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.1.2
 platform: ruby
 authors:
 - Charles Zhang
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-04
+date: 2016-05-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: em-http-request
@@ -58,7 +58,6 @@ extra_rdoc_files: []
 files:
 - lib/delete_unvalid.rb
 - lib/list_spider.rb
-- lib/spider_base.rb
 - lib/spider_helper.rb
 homepage: https://github.com/chinazhangchao/list_spider
 licenses:
data/lib/spider_base.rb
DELETED
@@ -1,298 +0,0 @@
-require 'em-http-request'
-require 'nokogiri'
-require 'fileutils'
-require 'set'
-require File.expand_path('../spider_helper', __FILE__)
-require "addressable/uri"
-
-class TaskStruct
-  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
-    @origin_href = href
-    @href = href
-    if @href.class == "".class
-      @href = SpiderHelper.string_to_uri(@href)
-    end
-    @local_path = local_path
-    @http_method = http_method
-    @params = params
-    @extra_data = extra_data
-    @parse_method = parse_method
-  end
-
-  def ==(o)
-    o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
-  end
-
-  attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method
-
-end
-
-module SpiderBase
-
-  @@conver_to_utf8 = false
-  @@connection_opts = {:connect_timeout => 2*60}
-  @@overwrite_exist = false
-  @@max_redirects = 10
-
-  class << self
-
-    attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
-
-    def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
-      @@connection_opts = {
-        :proxy => {
-          :host => proxy_addr,
-          :port => proxy_port
-        }
-      }
-      @@connection_opts[:proxy][:authorization] = [username, password] if username && password
-    end
-
-    def connect_timeout(max_connect_time)
-      @@connection_opts[:connect_timeout] = max_connect_time
-    end
-
-    def set_header_option(header_option)
-      @@header_option = optHash
-    end
-
-    def event_machine_down(link_struct_list, callback = nil)
-      failed_list = []
-      succeed_list = []
-      # puts "event_machine_down callback:#{callback}"
-      multi = EventMachine::MultiRequest.new
-      no_job = true
-      begin_time = Time.now
-
-      for_each_proc = proc do |e|
-        if !@@overwrite_exist && File.exist?(e.local_path)
-          succeed_list << e
-        else
-          no_job = false
-          opt = {}
-          opt = {:redirects => @@max_redirects}
-          opt[:head] = @@header_option if defined? @@header_option
-          if e.http_method == :post
-            opt[:body] = e.params unless e.params.empty?
-            if @@connection_opts
-              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
-            else
-              w = EventMachine::HttpRequest.new(e.href).post opt
-            end
-          else
-            if @@connection_opts
-              opt[:query] = e.params unless e.params.empty?
-              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
-            else
-              w = EventMachine::HttpRequest.new(e.href).get opt
-            end
-          end
-
-          w.callback {
-            # puts "complete:#{w.response_header}"
-            s = w.response_header.status
-            puts s
-            if s == 403 || s == 502 # Forbidden
-              # EventMachine.stop
-            elsif s != 404
-              local_dir = File.dirname(e.local_path)
-              FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
-              begin
-                File.open(e.local_path, "w") do |f|
-                  if @@conver_to_utf8 == true
-                    f << SpiderHelper.to_utf8(w.response)
-                  else
-                    f << w.response
-                  end
-                end
-                succeed_list << e
-              rescue Exception => e
-                puts e
-              end
-            end
-          }
-          w.errback {
-            puts "errback:#{w.response_header}"
-            puts e.origin_href
-            puts e.href
-            puts w.response_header.status
-            failed_list << e
-            if e.http_method == :get
-              SpiderHelper.direct_http_get(e.href, e.local_path)
-            elsif e.http_method == :post
-              SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
-            end
-          }
-          multi.add e.local_path, w
-        end
-      end
-
-      # em_for_each_proc = proc do |e, iter|
-      #   for_each_proc.call(e)
-      #   iter.next
-      # end
-
-      cb = Proc.new do
-        end_time = Time.now
-        puts "use time:#{end_time-begin_time} seconds"
-        if callback.nil?
-          puts "success size:#{self.succeed_size}"
-          puts "failed size:#{self.failed_size}"
-          EventMachine.stop
-        else
-          callback.call(multi, succeed_list, failed_list)
-        end
-      end
-
-      after_proc = proc {
-        if no_job # no pending jobs: invoke the callback directly
-          cb.call
-        else
-          multi.callback &cb
-        end
-      }
-
-      # if DownLoadConfig::MaxConcurrent <= 0
-      link_struct_list.each &for_each_proc
-      after_proc.call
-      # else
-      #   EM::Iterator.new(link_struct_list, DownLoadConfig::MaxConcurrent).each(em_for_each_proc, after_proc)
-      # end
-    end
-
-    def event_machine_start(url, down_dir, file_name, callback = nil)
-      down_dir << "/" unless down_dir.end_with?("/")
-      FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
-      down_list = []
-      down_list << TaskStruct.new(url, down_dir + file_name)
-      EventMachine.run {
-        index = 0
-        begin_time = Time.now
-        event_machine_down(down_list, callback)
-        end_time = Time.now
-      }
-    end
-
-    def event_machine_start_list(down_list, callback = nil)
-      EventMachine.run {
-        index = 0
-        begin_time = Time.now
-        event_machine_down(down_list, callback)
-        end_time = Time.now
-      }
-    end
-
-  end # self end
-end # SpiderBase end
-
-def batch_down_list(down_list, callback = nil)
-  SpiderBase.event_machine_down(down_list, callback)
-end
-
-def event_machine_start_list(down_list, callback = nil)
-  SpiderBase.event_machine_start_list(down_list, callback)
-end
-
-def parse_down_load_url(url, down_dir, file_name, callback = nil)
-  SpiderBase.event_machine_start(url, down_dir, file_name, callback)
-end
-
-class GetRelative
-
-  def initialize(base_url, down_dir, get_depth = 2, suffix = ".html")
-    @get_depth = get_depth
-    @base_url = base_url
-    @down_dir = down_dir
-    @suffix = suffix
-  end
-
-  def down_node(multi, succeed_list, failed_list, base_url, down_dir, callback)
-    puts "success"
-    puts succeed_list.size
-    puts "error"
-    puts failed_list.size
-    puts failed_list
-    puts "get index complete"
-    if succeed_list.size > 0
-      link_list = []
-      succeed_list.each do |e|
-        doc = Nokogiri::HTML(open(e.local_path))
-        link_list.concat(doc.css("a"))
-      end
-      puts "extract href complete"
-
-      down_dir << "/" unless down_dir.end_with?("/")
-      FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
-
-      down_list = []
-      set_list = Set.new
-      link_list.each do |link|
-        href = link['href']
-        next if href.nil? || !href.include?(@suffix)
-        # process such as "scheme_2.html#SEC15"
-        href = href[0, href.index(@suffix) + 5]
-        # process such as "./preface.html"
-        href = href[2..-1] if href.start_with?("./")
-
-        next if !set_list.add?(href)
-        unless base_url.end_with?("/")
-          i = base_url.rindex "/"
-          base_url = base_url[0..i]
-        end
-
-        # process such as "http://www.ccs.neu.edu/~dorai"
-        next if href.start_with?("http:") || href.start_with?("https:")
-
-        local_path = down_dir + href
-
-        down_list.push(TaskStruct.new(base_url + href, local_path))
-      end
-      puts "down list complete,size:#{down_list.size}"
-      batch_down_list(down_list, callback)
-    end
-  end
-
-  def down_other_node(multi, succeed_list, failed_list)
-    puts "down_other_node"
-    @get_depth = @get_depth - 1
-    puts "depth:#{@get_depth}"
-    if @get_depth <= 0
-      down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:event_all_complete))
-    else
-      down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:down_other_node))
-    end
-  end
-
-  def event_all_complete(multi, succeed_list, failed_list)
-    puts "all complete"
-    puts "success"
-    puts succeed_list.size
-    puts "error"
-    puts failed_list.size
-    puts failed_list
-    EventMachine.stop
-  end
-
-  attr_writer :get_depth, :base_url, :down_dir
-
-  def start
-    index_file_name = "index.html"
-    # http://www.ccs.neu.edu/home/dorai/t-y-scheme/t-y-scheme-Z-H-1.html
-    unless @base_url.end_with?("/")
-      i = @base_url.rindex "/"
-      index_file_name = @base_url[i+1 .. -1]
-    end
-
-    @get_depth = @get_depth - 1
-    puts @get_depth
-    if @get_depth <= 0
-      parse_down_load_url(@base_url, @down_dir, index_file_name, method(:event_all_complete))
-    else
-      parse_down_load_url(@base_url, @down_dir, index_file_name, method(:down_other_node))
-    end
-  end
-
-  def self.Get(base_url, down_dir, get_depth = 2, suffix = ".html")
-    GetRelative.new(base_url, down_dir, get_depth, suffix).start
-  end
-end #GetRelative