list_spider 2.2.0 → 2.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +84 -84
- data/.rdoc_options +23 -23
- data/.rubocop.yml +48 -48
- data/English_README.md +169 -169
- data/Gemfile +6 -6
- data/Gemfile.lock +12 -11
- data/README.md +181 -181
- data/Rakefile +2 -2
- data/bin/console +14 -14
- data/bin/setup +8 -8
- data/check_code.sh +2 -2
- data/lib/file_filter.rb +72 -72
- data/lib/list_spider.rb +297 -297
- data/lib/list_spider/version.rb +3 -3
- data/lib/spider_helper.rb +110 -110
- data/list_spider.gemspec +31 -31
- data/spider_example.rb +27 -27
- data/spider_example_2.rb +29 -29
- metadata +6 -5
data/bin/setup
CHANGED
@@ -1,8 +1,8 @@
|
|
1
|
-
#!/usr/bin/env bash
|
2
|
-
set -euo pipefail
|
3
|
-
IFS=$'\n\t'
|
4
|
-
set -vx
|
5
|
-
|
6
|
-
bundle install
|
7
|
-
|
8
|
-
# Do any other automated setup that you need to do here
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
set -euo pipefail
|
3
|
+
IFS=$'\n\t'
|
4
|
+
set -vx
|
5
|
+
|
6
|
+
bundle install
|
7
|
+
|
8
|
+
# Do any other automated setup that you need to do here
|
data/check_code.sh
CHANGED
@@ -1,3 +1,3 @@
|
|
1
|
-
#!/bin/sh
|
2
|
-
|
1
|
+
#!/bin/sh
|
2
|
+
|
3
3
|
rubocop -a -D -f simple -o rubocopresult
|
data/lib/file_filter.rb
CHANGED
@@ -1,72 +1,72 @@
|
|
1
|
-
|
2
|
-
class FileFilter
|
3
|
-
# 4033
|
4
|
-
# 920
|
5
|
-
def initialize(dir_pattern, size_threshold: 1000,
|
6
|
-
cust_judge: nil, process_block: nil)
|
7
|
-
@dir_pattern = dir_pattern
|
8
|
-
@size_threshold = size_threshold
|
9
|
-
@cust_judge = cust_judge ? cust_judge : method(:default_judge)
|
10
|
-
@total = 0
|
11
|
-
@process_block = process_block
|
12
|
-
end
|
13
|
-
|
14
|
-
def default_judge(f)
|
15
|
-
File.size(f) <= @size_threshold
|
16
|
-
end
|
17
|
-
|
18
|
-
def filter_file(f)
|
19
|
-
if @cust_judge.call(f)
|
20
|
-
@total += 1
|
21
|
-
@process_block.call(f)
|
22
|
-
end
|
23
|
-
end
|
24
|
-
|
25
|
-
def start
|
26
|
-
Dir.glob(@dir_pattern) do |f|
|
27
|
-
filter_file(f)
|
28
|
-
end
|
29
|
-
puts "total:#{@total}"
|
30
|
-
end
|
31
|
-
|
32
|
-
def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
|
33
|
-
FileFilter.new(
|
34
|
-
dir_pattern,
|
35
|
-
size_threshold: size_threshold,
|
36
|
-
cust_judge: cust_judge,
|
37
|
-
process_block:
|
38
|
-
proc do |f|
|
39
|
-
puts "deleted file: #{f}"
|
40
|
-
File.delete(f)
|
41
|
-
end
|
42
|
-
).start
|
43
|
-
end
|
44
|
-
|
45
|
-
def self.check(dir_pattern, size_threshold: 1000, cust_judge: nil)
|
46
|
-
FileFilter.new(
|
47
|
-
dir_pattern,
|
48
|
-
size_threshold: size_threshold,
|
49
|
-
cust_judge: cust_judge,
|
50
|
-
process_block:
|
51
|
-
proc do |f|
|
52
|
-
puts "filterd file: #{f}"
|
53
|
-
end
|
54
|
-
).start
|
55
|
-
end
|
56
|
-
|
57
|
-
def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
|
58
|
-
size_threshold: 1000, cust_judge: nil)
|
59
|
-
result_file = File.open(save_file_name, 'wt')
|
60
|
-
FileFilter.new(
|
61
|
-
dir_pattern,
|
62
|
-
size_threshold: size_threshold,
|
63
|
-
cust_judge: cust_judge,
|
64
|
-
process_block:
|
65
|
-
proc do |f|
|
66
|
-
puts "filterd file: #{f}"
|
67
|
-
result_file << f << "\n"
|
68
|
-
end
|
69
|
-
).start
|
70
|
-
result_file.close
|
71
|
-
end
|
72
|
-
end
|
1
|
+
|
2
|
+
# Walks every path matched by a glob pattern, asks a "judge" callable whether
# the path qualifies, and hands each qualifying path to a processing callable.
#
# The default judge selects paths whose size is at most +size_threshold+
# bytes. The class-level helpers (+delete+, +check+, +check_save_result+)
# wire in ready-made processing blocks.
#
# NOTE(review): the original source carried the bare comments "4033" and
# "920" at this position; their meaning is not evident from the code.
class FileFilter
  # Number of paths accepted by the judge so far.
  attr_reader :total

  # @param dir_pattern [String] glob pattern passed to Dir.glob
  # @param size_threshold [Integer] upper size bound (bytes) used by the
  #   default judge
  # @param cust_judge [#call, nil] optional predicate receiving a path;
  #   when nil, #default_judge is used
  # @param process_block [#call, nil] invoked with every accepted path;
  #   when nil, accepted paths are only counted
  def initialize(dir_pattern, size_threshold: 1000,
                 cust_judge: nil, process_block: nil)
    @dir_pattern = dir_pattern
    @size_threshold = size_threshold
    @cust_judge = cust_judge || method(:default_judge)
    @total = 0
    @process_block = process_block
  end

  # @param f [String] path to test
  # @return [Boolean] true when the path's size does not exceed the threshold
  def default_judge(f)
    File.size(f) <= @size_threshold
  end

  # Runs the judge on one path; on acceptance bumps the counter and calls the
  # processing block (if one was supplied).
  #
  # @param f [String] path to test
  # @return [Object, nil] the processing block's result, or nil
  def filter_file(f)
    return unless @cust_judge.call(f)

    @total += 1
    # process_block defaults to nil, so guard the call instead of crashing.
    @process_block&.call(f)
  end

  # Filters every path matching the pattern, then prints the accepted count.
  def start
    Dir.glob(@dir_pattern) { |f| filter_file(f) }
    puts "total:#{@total}"
  end

  # Deletes every path accepted by the judge, printing each one first.
  def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
    remover = proc do |f|
      puts "deleted file: #{f}"
      File.delete(f)
    end

    FileFilter.new(
      dir_pattern,
      size_threshold: size_threshold,
      cust_judge: cust_judge,
      process_block: remover
    ).start
  end

  # Prints every path accepted by the judge without touching it.
  def self.check(dir_pattern, size_threshold: 1000, cust_judge: nil)
    FileFilter.new(
      dir_pattern,
      size_threshold: size_threshold,
      cust_judge: cust_judge,
      process_block: proc { |f| puts "filterd file: #{f}" }
    ).start
  end

  # Like .check, but additionally writes each accepted path (one per line)
  # to +save_file_name+.
  #
  # The result file is opened in block form so the handle is closed even if
  # the judge, the glob or the write raises part-way through.
  def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
                             size_threshold: 1000, cust_judge: nil)
    File.open(save_file_name, 'wt') do |result_file|
      FileFilter.new(
        dir_pattern,
        size_threshold: size_threshold,
        cust_judge: cust_judge,
        process_block:
          proc do |f|
            puts "filterd file: #{f}"
            result_file << f << "\n"
          end
      ).start
    end
  end
end
|
data/lib/list_spider.rb
CHANGED
@@ -1,297 +1,297 @@
|
|
1
|
-
require 'list_spider/version'
|
2
|
-
require 'em-http-request'
|
3
|
-
require 'nokogiri'
|
4
|
-
require 'fileutils'
|
5
|
-
require 'set'
|
6
|
-
require 'addressable/uri'
|
7
|
-
require File.expand_path('spider_helper', __dir__)
|
8
|
-
require File.expand_path('file_filter', __dir__)
|
9
|
-
|
10
|
-
# 爬取任务类
|
11
|
-
class TaskStruct
|
12
|
-
# * href 请求链接
|
13
|
-
# * local_path 保存数据的本地路径(此路径作为去重标准)
|
14
|
-
# * http_method http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
15
|
-
# * custom_data 自定义数据
|
16
|
-
# * parse_method 解析保存文件的回调,参数是TaskStruct对象本身
|
17
|
-
def initialize(href, # 请求链接
|
18
|
-
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
19
|
-
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
20
|
-
http_method: :get,
|
21
|
-
custom_data: nil, # 自定义数据
|
22
|
-
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
23
|
-
# 请求成功后的回调,此时可能没有保存文件,比如301,404
|
24
|
-
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
25
|
-
# http_req.response_header.status 状态码
|
26
|
-
# http_req.response_header 返回头
|
27
|
-
# http_req.response 返回体
|
28
|
-
callback: nil,
|
29
|
-
# 请求失败后的回调
|
30
|
-
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
31
|
-
errback: nil,
|
32
|
-
stream_callback: nil, # 流数据处理回调
|
33
|
-
convert_to_utf8: false, # 是否转换为utf8编码
|
34
|
-
overwrite_exist: false, # 是否覆盖现有文件
|
35
|
-
# 请求设置
|
36
|
-
redirects: 3, # 重定向次数
|
37
|
-
keepalive: nil, # (暂不支持复用)
|
38
|
-
file: nil, # 要上传的文件路径
|
39
|
-
path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
40
|
-
query: nil, # 查询字符串,可以是string或hash类型
|
41
|
-
body: nil, # 请求体,可以是string或hash类型
|
42
|
-
head: nil, # 请求头
|
43
|
-
# 连接设置
|
44
|
-
connect_timeout: 60, # 连接超时时间
|
45
|
-
inactivity_timeout: nil, # 连接后超时时间
|
46
|
-
# ssl设置
|
47
|
-
# ssl: {
|
48
|
-
# :private_key_file => '/tmp/server.key',
|
49
|
-
# :cert_chain_file => '/tmp/server.crt',
|
50
|
-
# :verify_peer => false
|
51
|
-
# }
|
52
|
-
ssl: nil,
|
53
|
-
# bind: {
|
54
|
-
# :host => '123.123.123.123', # use a specific interface for outbound request
|
55
|
-
# :port => '123'
|
56
|
-
# }
|
57
|
-
bind: nil,
|
58
|
-
# 代理设置
|
59
|
-
# proxy: {
|
60
|
-
# :host => '127.0.0.1', # proxy address
|
61
|
-
# :port => 9000, # proxy port
|
62
|
-
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
63
|
-
|
64
|
-
# :authorization => ['user', 'pass'] # proxy authorization header
|
65
|
-
# }
|
66
|
-
proxy: nil)
|
67
|
-
@href = href
|
68
|
-
@local_path = local_path
|
69
|
-
@http_method = http_method
|
70
|
-
@custom_data = custom_data
|
71
|
-
@parse_method = parse_method
|
72
|
-
@callback = callback
|
73
|
-
@errback = errback
|
74
|
-
@stream_callback = stream_callback
|
75
|
-
@convert_to_utf8 = convert_to_utf8
|
76
|
-
@overwrite_exist = overwrite_exist
|
77
|
-
|
78
|
-
@request_options = {
|
79
|
-
redirects: redirects,
|
80
|
-
keepalive: keepalive,
|
81
|
-
file: file,
|
82
|
-
path: path,
|
83
|
-
query: query,
|
84
|
-
body: body,
|
85
|
-
head: head
|
86
|
-
}.compact
|
87
|
-
|
88
|
-
@connection_options = {
|
89
|
-
connect_timeout: connect_timeout,
|
90
|
-
inactivity_timeout: inactivity_timeout,
|
91
|
-
ssl: ssl,
|
92
|
-
bind: bind,
|
93
|
-
proxy: proxy
|
94
|
-
}.compact
|
95
|
-
end
|
96
|
-
|
97
|
-
attr_accessor :href, :local_path,
|
98
|
-
:http_method,
|
99
|
-
:custom_data,
|
100
|
-
:request_object,
|
101
|
-
:parse_method,
|
102
|
-
:callback,
|
103
|
-
:errback,
|
104
|
-
:stream_callback,
|
105
|
-
:convert_to_utf8,
|
106
|
-
:overwrite_exist,
|
107
|
-
:request_options,
|
108
|
-
:connection_options
|
109
|
-
end
|
110
|
-
|
111
|
-
module ListSpider
|
112
|
-
RANDOM_TIME = -1
|
113
|
-
NO_LIMIT_CONCURRENT = -1
|
114
|
-
DEFAULT_CONCURRNET_MAX = 50
|
115
|
-
DEFAULT_INTERVAL = 0
|
116
|
-
|
117
|
-
@random_time_range = 3..10
|
118
|
-
@local_path_set = Set.new
|
119
|
-
|
120
|
-
class << self
|
121
|
-
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
122
|
-
if interval.is_a? Range
|
123
|
-
@random_time_range = interval
|
124
|
-
interval = RANDOM_TIME
|
125
|
-
end
|
126
|
-
|
127
|
-
@down_list = filter_list(down_list)
|
128
|
-
@interval = interval
|
129
|
-
@max = max
|
130
|
-
@max = @down_list.size if @max == NO_LIMIT_CONCURRENT
|
131
|
-
@succeed_size = 0
|
132
|
-
@failed_size = 0
|
133
|
-
|
134
|
-
puts "total size:#{@down_list.size}"
|
135
|
-
event_machine_start_list(next_task, method(:complete))
|
136
|
-
end
|
137
|
-
|
138
|
-
def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
139
|
-
get_list([task], interval: interval, max: max)
|
140
|
-
end
|
141
|
-
|
142
|
-
def add_task(task)
|
143
|
-
if task.is_a? Array
|
144
|
-
need_down_list = filter_list(task)
|
145
|
-
@down_list += need_down_list
|
146
|
-
elsif task.is_a?TaskStruct
|
147
|
-
need_down_list = filter_list([task])
|
148
|
-
@down_list += need_down_list
|
149
|
-
else
|
150
|
-
puts "error task type:#{task.class}"
|
151
|
-
end
|
152
|
-
end
|
153
|
-
|
154
|
-
private
|
155
|
-
|
156
|
-
def event_machine_down(link_struct_list, callback = nil)
|
157
|
-
failed_list = []
|
158
|
-
succeed_list = []
|
159
|
-
multi = EventMachine::MultiRequest.new
|
160
|
-
begin_time = Time.now
|
161
|
-
|
162
|
-
for_each_proc =
|
163
|
-
proc do |task_struct|
|
164
|
-
http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
|
165
|
-
http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
|
166
|
-
task_struct.request_object = http_req
|
167
|
-
|
168
|
-
http_req.callback do
|
169
|
-
s = http_req.response_header.status
|
170
|
-
puts "#{Time.now}, http status code: #{s}"
|
171
|
-
|
172
|
-
if s == 200
|
173
|
-
local_dir = File.dirname(task_struct.local_path)
|
174
|
-
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
175
|
-
begin
|
176
|
-
File.open(task_struct.local_path, 'wb') do |f|
|
177
|
-
f << if @convert_to_utf8 == true
|
178
|
-
SpiderHelper.to_utf8(http_req.response)
|
179
|
-
else
|
180
|
-
http_req.response
|
181
|
-
end
|
182
|
-
end
|
183
|
-
call_parse_method(task_struct)
|
184
|
-
succeed_list << task_struct
|
185
|
-
rescue StandardError => exception
|
186
|
-
puts exception
|
187
|
-
end
|
188
|
-
end
|
189
|
-
task_struct.callback.call(task_struct, http_req) if task_struct.callback
|
190
|
-
end
|
191
|
-
|
192
|
-
http_req.errback do
|
193
|
-
puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"
|
194
|
-
|
195
|
-
task_struct.errback.call(task_struct, http_req) if task_struct.errback
|
196
|
-
end
|
197
|
-
|
198
|
-
begin
|
199
|
-
multi.add task_struct.local_path, http_req
|
200
|
-
rescue StandardError => exception
|
201
|
-
puts exception
|
202
|
-
puts task_struct.href
|
203
|
-
puts task_struct.local_path
|
204
|
-
stop_machine
|
205
|
-
end
|
206
|
-
end
|
207
|
-
|
208
|
-
cb =
|
209
|
-
proc do
|
210
|
-
end_time = Time.now
|
211
|
-
puts "use time:#{end_time - begin_time} seconds"
|
212
|
-
if callback.nil?
|
213
|
-
stop_machine
|
214
|
-
else
|
215
|
-
callback.call(multi, succeed_list, failed_list)
|
216
|
-
end
|
217
|
-
end
|
218
|
-
link_struct_list.each(&for_each_proc)
|
219
|
-
multi.callback(&cb)
|
220
|
-
end
|
221
|
-
|
222
|
-
def stop_machine
|
223
|
-
puts "success size:#{@succeed_size}"
|
224
|
-
puts "failed size:#{@failed_size}"
|
225
|
-
@end_time = Time.now
|
226
|
-
puts "total use time:#{@end_time - @begin_time} seconds"
|
227
|
-
EventMachine.stop
|
228
|
-
@local_path_set.clear
|
229
|
-
end
|
230
|
-
|
231
|
-
def next_task
|
232
|
-
@down_list.shift(@max)
|
233
|
-
end
|
234
|
-
|
235
|
-
def call_parse_method(task_struct)
|
236
|
-
task_struct.parse_method.call(task_struct) if task_struct.parse_method
|
237
|
-
end
|
238
|
-
|
239
|
-
def complete(_multi, success_list, failed_list)
|
240
|
-
@succeed_size += success_list.size
|
241
|
-
@failed_size += failed_list.size
|
242
|
-
@succeed_list.concat(success_list)
|
243
|
-
@failed_list.concat(failed_list)
|
244
|
-
|
245
|
-
todo = next_task
|
246
|
-
|
247
|
-
if todo.empty?
|
248
|
-
stop_machine
|
249
|
-
else
|
250
|
-
if @interval != 0
|
251
|
-
if !success_list.empty? || !failed_list.empty?
|
252
|
-
if @interval == RANDOM_TIME
|
253
|
-
sleep(rand(@random_time_range))
|
254
|
-
else
|
255
|
-
sleep(@interval)
|
256
|
-
end
|
257
|
-
end
|
258
|
-
end
|
259
|
-
event_machine_down(todo, method(:complete))
|
260
|
-
end
|
261
|
-
end
|
262
|
-
|
263
|
-
def event_machine_start_list(down_list, callback = nil)
|
264
|
-
EventMachine.run do
|
265
|
-
@succeed_list = []
|
266
|
-
@failed_list = []
|
267
|
-
@begin_time = Time.now
|
268
|
-
if down_list.empty?
|
269
|
-
if callback
|
270
|
-
callback.call(nil, [], [])
|
271
|
-
else
|
272
|
-
stop_machine
|
273
|
-
end
|
274
|
-
else
|
275
|
-
event_machine_down(down_list, callback)
|
276
|
-
end
|
277
|
-
end
|
278
|
-
end
|
279
|
-
|
280
|
-
def filter_list(down_list)
|
281
|
-
need_down_list = []
|
282
|
-
down_list.each do |ts|
|
283
|
-
if !ts.overwrite_exist && File.exist?(ts.local_path)
|
284
|
-
call_parse_method(ts)
|
285
|
-
elsif @local_path_set.add?(ts.local_path)
|
286
|
-
need_down_list << ts
|
287
|
-
end
|
288
|
-
end
|
289
|
-
need_down_list
|
290
|
-
end
|
291
|
-
end
|
292
|
-
|
293
|
-
Signal.trap('INT') do
|
294
|
-
ListSpider.stop_machine
|
295
|
-
exit!
|
296
|
-
end
|
297
|
-
end
|
1
|
+
require 'list_spider/version'
|
2
|
+
require 'em-http-request'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'set'
|
6
|
+
require 'addressable/uri'
|
7
|
+
require File.expand_path('spider_helper', __dir__)
|
8
|
+
require File.expand_path('file_filter', __dir__)
|
9
|
+
|
10
|
+
# Describes one crawl task: what to request, where to store the response and
# which callbacks to run around it.
class TaskStruct
  attr_accessor :href, :local_path,
                :http_method,
                :custom_data,
                :request_object,
                :parse_method,
                :callback,
                :errback,
                :stream_callback,
                :convert_to_utf8,
                :overwrite_exist,
                :request_options,
                :connection_options

  # @param href [String] URL to request
  # @param local_path [String] local path the data is saved to (this path is
  #   also the de-duplication key)
  # @param http_method [Symbol] one of :get, :head, :delete, :put, :post,
  #   :patch, :options
  # @param custom_data [Object] arbitrary user data carried with the task
  # @param parse_method [#call] callback that parses the saved file; receives
  #   the TaskStruct itself
  # @param callback [#call] invoked after the request succeeds; a file may
  #   not have been saved at that point (e.g. 301, 404). Receives the
  #   TaskStruct and the corresponding EventMachine::HttpRequest object
  #   (http_req.response_header.status is the status code,
  #   http_req.response_header the headers, http_req.response the body)
  # @param errback [#call] invoked after the request fails; receives the
  #   TaskStruct and the corresponding EventMachine::HttpRequest object
  # @param stream_callback [#call] callback for streamed data
  # @param convert_to_utf8 [Boolean] whether to convert the data to UTF-8
  # @param overwrite_exist [Boolean] whether to overwrite an existing file
  #
  # Request settings:
  # @param redirects [Integer] number of redirects
  # @param keepalive [Object] (connection reuse is not supported yet)
  # @param file [String] path of a file to upload
  # @param path [String] request path, useful for pipelined requests
  #   (not supported yet)
  # @param query [String, Hash] query string
  # @param body [String, Hash] request body
  # @param head [Hash] request headers
  #
  # Connection settings:
  # @param connect_timeout [Numeric] connect timeout
  # @param inactivity_timeout [Numeric] timeout once connected
  # @param ssl [Hash] SSL settings, e.g.
  #   { private_key_file: '/tmp/server.key',
  #     cert_chain_file: '/tmp/server.crt',
  #     verify_peer: false }
  # @param bind [Hash] e.g.
  #   { host: '123.123.123.123', # use a specific interface for outbound request
  #     port: '123' }
  # @param proxy [Hash] proxy settings, e.g.
  #   { host: '127.0.0.1',                 # proxy address
  #     port: 9000,                        # proxy port
  #     type: :socks5,                     # default proxy mode is HTTP proxy,
  #                                        # change to :socks5 if required
  #     authorization: ['user', 'pass'] }  # proxy authorization header
  def initialize(href,
                 local_path,
                 http_method: :get,
                 custom_data: nil,
                 parse_method: nil,
                 callback: nil,
                 errback: nil,
                 stream_callback: nil,
                 convert_to_utf8: false,
                 overwrite_exist: false,
                 redirects: 3,
                 keepalive: nil,
                 file: nil,
                 path: nil,
                 query: nil,
                 body: nil,
                 head: nil,
                 connect_timeout: 60,
                 inactivity_timeout: nil,
                 ssl: nil,
                 bind: nil,
                 proxy: nil)
    @href, @local_path, @http_method = href, local_path, http_method
    @custom_data, @parse_method = custom_data, parse_method
    @callback, @errback, @stream_callback = callback, errback, stream_callback
    @convert_to_utf8, @overwrite_exist = convert_to_utf8, overwrite_exist

    # Only settings that were actually supplied (non-nil) are kept.
    request_settings = {
      redirects: redirects, keepalive: keepalive, file: file, path: path,
      query: query, body: body, head: head
    }
    @request_options = request_settings.reject { |_name, value| value.nil? }

    connection_settings = {
      connect_timeout: connect_timeout,
      inactivity_timeout: inactivity_timeout,
      ssl: ssl, bind: bind, proxy: proxy
    }
    @connection_options = connection_settings.reject { |_name, value| value.nil? }
  end
end
|
110
|
+
|
111
|
+
# Downloads a list of tasks in batches through EventMachine / em-http-request.
#
# Each task is expected to respond to the TaskStruct accessors used below
# (href, local_path, http_method, request_options, connection_options,
# parse_method, callback, errback, stream_callback, convert_to_utf8,
# overwrite_exist, request_object=).
#
# A batch of up to +max+ requests is issued at once; when the whole batch has
# finished, the next batch is started (optionally after a pause) until the
# queue is empty, at which point the reactor is stopped.
module ListSpider
  # Sentinel stored in @interval when the pause is drawn from a Range.
  RANDOM_TIME = -1
  # Pass as +max+ to issue the whole initial list as a single batch.
  NO_LIMIT_CONCURRENT = -1
  DEFAULT_CONCURRNET_MAX = 50
  DEFAULT_INTERVAL = 0

  @random_time_range = 3..10
  # local_path values already queued; used to drop duplicate tasks.
  @local_path_set = Set.new

  class << self
    # Starts the reactor and downloads +down_list+.
    #
    # @param down_list [Array] tasks to download
    # @param interval [Numeric, Range] pause in seconds between batches; a
    #   Range means "random value from this range"
    # @param max [Integer] batch size, or NO_LIMIT_CONCURRENT
    def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
      if interval.is_a?(Range)
        @random_time_range = interval
        interval = RANDOM_TIME
      end

      @down_list = filter_list(down_list)
      @interval = interval
      @max = max == NO_LIMIT_CONCURRENT ? @down_list.size : max
      @succeed_size = 0
      @failed_size = 0

      puts "total size:#{@down_list.size}"
      event_machine_start_list(next_task, method(:complete))
    end

    # Convenience wrapper: downloads a single task.
    def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
      get_list([task], interval: interval, max: max)
    end

    # Appends one task or an array of tasks to the pending queue (after the
    # same filtering get_list applies). Anything else is reported and ignored.
    def add_task(task)
      case task
      when Array
        @down_list += filter_list(task)
      when TaskStruct
        @down_list += filter_list([task])
      else
        puts "error task type:#{task.class}"
      end
    end

    private

    # Issues one batch of requests and arranges for +callback+ (or
    # stop_machine when it is nil) to run once the whole batch has finished.
    #
    # @param link_struct_list [Array] tasks of this batch
    # @param callback [#call, nil] called with (multi, succeed_list, failed_list)
    def event_machine_down(link_struct_list, callback = nil)
      failed_list = []
      succeed_list = []
      multi = EventMachine::MultiRequest.new
      begin_time = Time.now

      link_struct_list.each do |task_struct|
        http_req = EventMachine::HttpRequest
                   .new(task_struct.href, task_struct.connection_options)
                   .public_send(task_struct.http_method, task_struct.request_options)
        # The callback lives on the task, not in local scope.
        http_req.stream { |chunk| task_struct.stream_callback.call(chunk) } if task_struct.stream_callback
        task_struct.request_object = http_req

        http_req.callback { handle_response(task_struct, http_req, succeed_list, failed_list) }

        http_req.errback do
          puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"
          failed_list << task_struct
          task_struct.errback&.call(task_struct, http_req)
        end

        begin
          multi.add task_struct.local_path, http_req
        rescue StandardError => e
          puts e
          puts task_struct.href
          puts task_struct.local_path
          stop_machine
        end
      end

      multi.callback do
        puts "use time:#{Time.now - begin_time} seconds"
        if callback.nil?
          stop_machine
        else
          callback.call(multi, succeed_list, failed_list)
        end
      end
    end

    # Handles a completed request: on HTTP 200 the body is saved and parsed
    # and the task is recorded as succeeded; if saving or parsing raises, the
    # error is printed and the task is recorded as failed. Other status codes
    # are recorded in neither list. The task's own callback runs in every case.
    def handle_response(task_struct, http_req, succeed_list, failed_list)
      status = http_req.response_header.status
      puts "#{Time.now}, http status code: #{status}"

      if status == 200
        begin
          save_response(task_struct, http_req)
          call_parse_method(task_struct)
          succeed_list << task_struct
        rescue StandardError => e
          puts e
          failed_list << task_struct
        end
      end

      task_struct.callback&.call(task_struct, http_req)
    end

    # Writes the response body to the task's local_path, creating the parent
    # directory when needed and converting to UTF-8 when the task asks for it.
    def save_response(task_struct, http_req)
      FileUtils.mkdir_p(File.dirname(task_struct.local_path))
      content = http_req.response
      content = SpiderHelper.to_utf8(content) if task_struct.convert_to_utf8
      File.open(task_struct.local_path, 'wb') { |f| f << content }
    end

    # Prints the run summary, stops the reactor if it is running and forgets
    # the de-duplication set so a later run starts clean.
    def stop_machine
      puts "success size:#{@succeed_size}"
      puts "failed size:#{@failed_size}"
      @end_time = Time.now
      # @begin_time is only set once the reactor has been started.
      puts "total use time:#{@end_time - @begin_time} seconds" if @begin_time
      EventMachine.stop if EventMachine.reactor_running?
      @local_path_set.clear
    end

    # Removes and returns the next batch (up to @max tasks) from the queue.
    def next_task
      @down_list.shift(@max)
    end

    def call_parse_method(task_struct)
      task_struct.parse_method&.call(task_struct)
    end

    # Batch-finished callback: accumulates the results, then either stops the
    # reactor (queue empty) or pauses and starts the next batch.
    def complete(_multi, success_list, failed_list)
      @succeed_size += success_list.size
      @failed_size += failed_list.size
      @succeed_list.concat(success_list)
      @failed_list.concat(failed_list)

      todo = next_task
      return stop_machine if todo.empty?

      # Only pause when the finished batch recorded at least one result.
      wait_for_interval unless success_list.empty? && failed_list.empty?
      event_machine_down(todo, method(:complete))
    end

    # Sleeps for the configured interval (random within @random_time_range
    # when the interval was given as a Range); no-op for an interval of 0.
    def wait_for_interval
      return if @interval == 0

      sleep(@interval == RANDOM_TIME ? rand(@random_time_range) : @interval)
    end

    # Runs the reactor and kicks off the first batch.
    def event_machine_start_list(down_list, callback = nil)
      EventMachine.run do
        @succeed_list = []
        @failed_list = []
        @begin_time = Time.now
        if down_list.empty?
          if callback
            callback.call(nil, [], [])
          else
            stop_machine
          end
        else
          event_machine_down(down_list, callback)
        end
      end
    end

    # Returns the tasks that still need downloading. A task whose file already
    # exists (and must not be overwritten) is parsed immediately instead;
    # a task whose local_path is already queued is dropped.
    def filter_list(down_list)
      down_list.each_with_object([]) do |ts, need_down_list|
        if !ts.overwrite_exist && File.exist?(ts.local_path)
          call_parse_method(ts)
        elsif @local_path_set.add?(ts.local_path)
          need_down_list << ts
        end
      end
    end
  end

  # Ctrl-C: print the summary, stop the reactor and exit immediately.
  # stop_machine is private, so it has to be invoked via send here.
  Signal.trap('INT') do
    ListSpider.send(:stop_machine)
    exit!
  end
end
|