list_spider 2.3.0 → 2.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +84 -84
- data/.rdoc_options +23 -23
- data/.rubocop.yml +48 -48
- data/English_README.md +169 -169
- data/Gemfile +6 -6
- data/README.md +181 -181
- data/Rakefile +2 -2
- data/bin/console +14 -14
- data/bin/setup +8 -8
- data/check_code.sh +2 -2
- data/lib/file_filter.rb +72 -72
- data/lib/list_spider.rb +298 -297
- data/lib/list_spider/version.rb +3 -3
- data/lib/spider_helper.rb +110 -110
- data/list_spider.gemspec +31 -31
- data/spider_example.rb +27 -27
- data/spider_example_2.rb +29 -29
- metadata +3 -4
data/lib/list_spider.rb
CHANGED
@@ -1,297 +1,298 @@
|
|
1
|
-
require 'list_spider/version'
|
2
|
-
require 'em-http-request'
|
3
|
-
require 'nokogiri'
|
4
|
-
require 'fileutils'
|
5
|
-
require 'set'
|
6
|
-
require 'addressable/uri'
|
7
|
-
require File.expand_path('spider_helper', __dir__)
|
8
|
-
require File.expand_path('file_filter', __dir__)
|
9
|
-
|
10
|
-
# Crawl task: describes one request, where its response is stored and which
# callbacks are attached to it.
class TaskStruct
  attr_accessor :href, :local_path, :http_method, :custom_data,
                :request_object, :parse_method, :callback, :errback,
                :stream_callback, :convert_to_utf8, :overwrite_exist,
                :request_options, :connection_options

  # Positional arguments:
  # * href        request URL
  # * local_path  local path the data is saved to (this path is also the
  #               de-duplication key)
  #
  # Task options:
  # * http_method      one of :get, :head, :delete, :put, :post, :patch, :options
  # * custom_data      user-defined data
  # * parse_method     callback for parsing the saved file; receives the
  #                    TaskStruct itself
  # * callback         called after the request succeeded; the file may not
  #                    have been saved at that point (e.g. 301, 404). Receives
  #                    the TaskStruct and the matching EventMachine::HttpRequest
  #                    object:
  #                      http_req.response_header.status  status code
  #                      http_req.response_header         response headers
  #                      http_req.response                response body
  # * errback          called after the request failed; same arguments as
  #                    callback
  # * stream_callback  callback for streamed data
  # * convert_to_utf8  whether to convert the data to UTF-8
  # * overwrite_exist  whether to overwrite an existing file
  #
  # Request settings (nil entries are dropped from request_options):
  # * redirects  number of redirects
  # * keepalive  (connection reuse is not supported yet)
  # * file       path of a file to upload
  # * path       request path, useful for pipelined requests (not supported yet)
  # * query      query string, String or Hash
  # * body       request body, String or Hash
  # * head       request headers
  #
  # Connection settings (nil entries are dropped from connection_options):
  # * connect_timeout     connection timeout
  # * inactivity_timeout  timeout once connected
  # * ssl    e.g. { :private_key_file => '/tmp/server.key',
  #                 :cert_chain_file => '/tmp/server.crt',
  #                 :verify_peer => false }
  # * bind   e.g. { :host => '123.123.123.123', # use a specific interface for outbound request
  #                 :port => '123' }
  # * proxy  e.g. { :host => '127.0.0.1',  # proxy address
  #                 :port => 9000,         # proxy port
  #                 :type => :socks5,      # default proxy mode is HTTP proxy, change to :socks5 if required
  #                 :authorization => ['user', 'pass'] } # proxy authorization header
  def initialize(href,
                 local_path,
                 http_method: :get,
                 custom_data: nil,
                 parse_method: nil,
                 callback: nil,
                 errback: nil,
                 stream_callback: nil,
                 convert_to_utf8: false,
                 overwrite_exist: false,
                 redirects: 3,
                 keepalive: nil,
                 file: nil,
                 path: nil,
                 query: nil,
                 body: nil,
                 head: nil,
                 connect_timeout: 60,
                 inactivity_timeout: nil,
                 ssl: nil,
                 bind: nil,
                 proxy: nil)
    @href, @local_path, @http_method = href, local_path, http_method
    @custom_data = custom_data
    @parse_method, @callback, @errback = parse_method, callback, errback
    @stream_callback = stream_callback
    @convert_to_utf8, @overwrite_exist = convert_to_utf8, overwrite_exist

    request_settings = {
      redirects: redirects, keepalive: keepalive, file: file, path: path,
      query: query, body: body, head: head
    }
    connection_settings = {
      connect_timeout: connect_timeout, inactivity_timeout: inactivity_timeout,
      ssl: ssl, bind: bind, proxy: proxy
    }

    # Only settings that were actually supplied (non-nil) are kept.
    @request_options = request_settings.reject { |_name, value| value.nil? }
    @connection_options = connection_settings.reject { |_name, value| value.nil? }
  end
end
|
110
|
-
|
111
|
-
module ListSpider
|
112
|
-
RANDOM_TIME = -1
|
113
|
-
NO_LIMIT_CONCURRENT = -1
|
114
|
-
DEFAULT_CONCURRNET_MAX = 50
|
115
|
-
DEFAULT_INTERVAL = 0
|
116
|
-
|
117
|
-
@random_time_range = 3..10
|
118
|
-
@local_path_set = Set.new
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
@
|
129
|
-
@
|
130
|
-
@max =
|
131
|
-
@
|
132
|
-
@
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
http_req
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
puts
|
203
|
-
puts task_struct.
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
puts "
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
@
|
242
|
-
@
|
243
|
-
@
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
@
|
267
|
-
@
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
end
|
1
|
+
require 'list_spider/version'
|
2
|
+
require 'em-http-request'
|
3
|
+
require 'nokogiri'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'set'
|
6
|
+
require 'addressable/uri'
|
7
|
+
require File.expand_path('spider_helper', __dir__)
|
8
|
+
require File.expand_path('file_filter', __dir__)
|
9
|
+
|
10
|
+
# Crawl task: describes one request, where its response is stored and which
# callbacks are attached to it.
class TaskStruct
  attr_accessor :href, :local_path, :http_method, :custom_data,
                :request_object, :parse_method, :callback, :errback,
                :stream_callback, :convert_to_utf8, :overwrite_exist,
                :request_options, :connection_options

  # Positional arguments:
  # * href        request URL
  # * local_path  local path the data is saved to (this path is also the
  #               de-duplication key)
  #
  # Task options:
  # * http_method      one of :get, :head, :delete, :put, :post, :patch, :options
  # * custom_data      user-defined data
  # * parse_method     callback for parsing the saved file; receives the
  #                    TaskStruct itself
  # * callback         called after the request succeeded; the file may not
  #                    have been saved at that point (e.g. 301, 404). Receives
  #                    the TaskStruct and the matching EventMachine::HttpRequest
  #                    object:
  #                      http_req.response_header.status  status code
  #                      http_req.response_header         response headers
  #                      http_req.response                response body
  # * errback          called after the request failed; same arguments as
  #                    callback
  # * stream_callback  callback for streamed data
  # * convert_to_utf8  whether to convert the data to UTF-8
  # * overwrite_exist  whether to overwrite an existing file
  #
  # Request settings (nil entries are dropped from request_options):
  # * redirects  number of redirects
  # * keepalive  (connection reuse is not supported yet)
  # * file       path of a file to upload
  # * path       request path, useful for pipelined requests (not supported yet)
  # * query      query string, String or Hash
  # * body       request body, String or Hash
  # * head       request headers
  #
  # Connection settings (nil entries are dropped from connection_options):
  # * connect_timeout     connection timeout
  # * inactivity_timeout  timeout once connected
  # * ssl    e.g. { :private_key_file => '/tmp/server.key',
  #                 :cert_chain_file => '/tmp/server.crt',
  #                 :verify_peer => false }
  # * bind   e.g. { :host => '123.123.123.123', # use a specific interface for outbound request
  #                 :port => '123' }
  # * proxy  e.g. { :host => '127.0.0.1',  # proxy address
  #                 :port => 9000,         # proxy port
  #                 :type => :socks5,      # default proxy mode is HTTP proxy, change to :socks5 if required
  #                 :authorization => ['user', 'pass'] } # proxy authorization header
  def initialize(href,
                 local_path,
                 http_method: :get,
                 custom_data: nil,
                 parse_method: nil,
                 callback: nil,
                 errback: nil,
                 stream_callback: nil,
                 convert_to_utf8: false,
                 overwrite_exist: false,
                 redirects: 3,
                 keepalive: nil,
                 file: nil,
                 path: nil,
                 query: nil,
                 body: nil,
                 head: nil,
                 connect_timeout: 60,
                 inactivity_timeout: nil,
                 ssl: nil,
                 bind: nil,
                 proxy: nil)
    @href, @local_path, @http_method = href, local_path, http_method
    @custom_data = custom_data
    @parse_method, @callback, @errback = parse_method, callback, errback
    @stream_callback = stream_callback
    @convert_to_utf8, @overwrite_exist = convert_to_utf8, overwrite_exist

    request_settings = {
      redirects: redirects, keepalive: keepalive, file: file, path: path,
      query: query, body: body, head: head
    }
    connection_settings = {
      connect_timeout: connect_timeout, inactivity_timeout: inactivity_timeout,
      ssl: ssl, bind: bind, proxy: proxy
    }

    # Only settings that were actually supplied (non-nil) are kept.
    @request_options = request_settings.reject { |_name, value| value.nil? }
    @connection_options = connection_settings.reject { |_name, value| value.nil? }
  end
end
|
110
|
+
|
111
|
+
# Batch downloader driven by EventMachine. Tasks are requested in batches of
# at most `max` at a time; the body of every response with status 200 is
# written to the task's local_path and the task's parse_method is invoked.
# Tasks whose local_path already exists on disk (and that do not set
# overwrite_exist) are not requested again; only their parse_method is called.
module ListSpider
  # Sentinel interval: pause a random number of seconds between batches.
  RANDOM_TIME = -1
  # Sentinel for `max`: put the whole filtered list into a single batch.
  NO_LIMIT_CONCURRENT = -1
  DEFAULT_CONCURRNET_MAX = 50
  # Correctly spelled alias; the misspelled constant stays for existing callers.
  DEFAULT_CONCURRENT_MAX = DEFAULT_CONCURRNET_MAX
  DEFAULT_INTERVAL = 0

  @random_time_range = 3..10
  @local_path_set = Set.new
  @down_list = []

  class << self
    # Downloads every task in down_list and blocks until the reactor stops.
    #
    # * down_list  Array of task objects
    # * interval   seconds to sleep between batches; a Range selects a random
    #              pause taken from that range
    # * max        maximum number of tasks per batch, or NO_LIMIT_CONCURRENT
    def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
      if interval.is_a?(Range)
        @random_time_range = interval
        interval = RANDOM_TIME
      end

      @down_list = filter_list(down_list)
      @interval = interval
      @max = max == NO_LIMIT_CONCURRENT ? @down_list.size : max
      @succeed_size = 0
      @failed_size = 0

      puts "total size:#{@down_list.size}"
      event_machine_start_list(next_task, method(:complete))
    end

    # Convenience wrapper around get_list for a single task.
    def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
      get_list([task], interval: interval, max: max)
    end

    # Appends a task (or an Array of tasks) to the pending list after
    # filtering out already-saved and duplicate entries. Any other argument
    # type is reported on stdout and ignored.
    def add_task(task)
      case task
      when Array
        @down_list += filter_list(task)
      when TaskStruct
        @down_list += filter_list([task])
      else
        puts "error task type:#{task.class}"
      end
    end

    private

    # Issues one request per task in link_struct_list and, once the
    # MultiRequest reports completion, calls
    # callback.call(multi, succeed_list, failed_list) or stops the reactor
    # when no callback is given.
    def event_machine_down(link_struct_list, callback = nil)
      failed_list = []
      succeed_list = []
      multi = EventMachine::MultiRequest.new
      begin_time = Time.now

      link_struct_list.each do |task_struct|
        http_req = build_request(task_struct)

        http_req.callback do
          status = http_req.response_header.status
          puts "#{Time.now}, http status code: #{status}"

          if status == 200
            # A save that raised is counted as a failure rather than dropped.
            (save_response(task_struct, http_req) ? succeed_list : failed_list) << task_struct
          end
          task_struct.callback.call(task_struct, http_req) if task_struct.callback
        end

        http_req.errback do
          puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"

          # Record the failure so the "failed size" summary is meaningful.
          failed_list << task_struct
          task_struct.errback.call(task_struct, http_req) if task_struct.errback
        end

        begin
          multi.add task_struct.local_path, http_req
        rescue StandardError => exception
          puts exception
          puts task_struct.href
          puts task_struct.local_path
          stop_machine
        end
      end

      multi.callback do
        puts "use time:#{Time.now - begin_time} seconds"
        if callback.nil?
          stop_machine
        else
          callback.call(multi, succeed_list, failed_list)
        end
      end
    end

    # Starts the HTTP request for task_struct, wires its stream callback (if
    # any) and stores the request object on the task.
    def build_request(task_struct)
      connection = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options)
      http_req = connection.public_send(task_struct.http_method, task_struct.request_options)
      if task_struct.stream_callback
        # The callback lives on the task; there is no local of that name here.
        http_req.stream { |chunk| task_struct.stream_callback.call(chunk) }
      end
      task_struct.request_object = http_req
      http_req
    end

    # Writes the response body to the task's local_path (creating the parent
    # directory) and runs the task's parse_method. Returns true on success;
    # prints the exception and returns false if anything raised.
    def save_response(task_struct, http_req)
      FileUtils.mkdir_p(File.dirname(task_struct.local_path))
      content = http_req.response
      # The flag is a per-task setting, not module state.
      content = SpiderHelper.to_utf8(content) if task_struct.convert_to_utf8
      File.open(task_struct.local_path, 'wb') { |f| f << content }
      call_parse_method(task_struct)
      true
    rescue StandardError => exception
      puts exception
      false
    end

    # Prints the run summary, stops the reactor and forgets the seen paths.
    def stop_machine
      puts "success size:#{@succeed_size}"
      puts "failed size:#{@failed_size}"
      @end_time = Time.now
      # @begin_time is only set once a run has started (e.g. not when an
      # interrupt arrives before get_list was called).
      puts "total use time:#{@end_time - @begin_time} seconds" if @begin_time
      EventMachine.stop if EventMachine.reactor_running?
      @local_path_set.clear
    end

    # Removes and returns the next batch (at most @max tasks).
    def next_task
      @down_list.shift(@max)
    end

    # Invokes the task's parse_method with the task itself, when one is set.
    def call_parse_method(task_struct)
      task_struct.parse_method.call(task_struct) if task_struct.parse_method
    end

    # Batch-completion handler: accumulates the results, then either stops the
    # reactor (nothing left) or pauses and starts the next batch.
    def complete(_multi, success_list, failed_list)
      @succeed_size += success_list.size
      @failed_size += failed_list.size
      @succeed_list.concat(success_list)
      @failed_list.concat(failed_list)

      todo = next_task
      return stop_machine if todo.empty?

      pause_between_batches unless success_list.empty? && failed_list.empty?
      event_machine_down(todo, method(:complete))
    end

    # Sleeps for the configured interval (random when @interval is
    # RANDOM_TIME); does nothing for an interval of 0.
    # NOTE(review): sleep blocks the calling (reactor) thread; consider
    # EventMachine.add_timer if other reactor work must keep running.
    def pause_between_batches
      return if @interval == 0

      sleep(@interval == RANDOM_TIME ? rand(@random_time_range) : @interval)
    end

    # Runs the reactor and starts the first batch. An empty first batch goes
    # straight to the callback (or stops the reactor when there is none).
    def event_machine_start_list(down_list, callback = nil)
      EventMachine.run do
        @succeed_list = []
        @failed_list = []
        @begin_time = Time.now
        if !down_list.empty?
          event_machine_down(down_list, callback)
        elsif callback
          callback.call(nil, [], [])
        else
          stop_machine
        end
      end
    end

    # Returns the tasks that still need downloading. A task whose file
    # already exists (and that does not request overwriting) only gets its
    # parse_method called; a local_path seen before is dropped as a duplicate.
    def filter_list(down_list)
      need_down_list = []
      down_list.each do |ts|
        if !ts.overwrite_exist && File.exist?(ts.local_path)
          call_parse_method(ts)
        elsif @local_path_set.add?(ts.local_path)
          need_down_list << ts
        end
      end
      need_down_list
    end
  end

  Signal.trap('INT') do
    # stop_machine is private on the singleton class, so an explicit-receiver
    # call would raise NoMethodError; go through send instead.
    ListSpider.send(:stop_machine)
    exit!
  end
end
|