list_spider 2.3.0 → 2.4.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/list_spider.rb CHANGED
@@ -1,297 +1,298 @@
1
- require 'list_spider/version'
2
- require 'em-http-request'
3
- require 'nokogiri'
4
- require 'fileutils'
5
- require 'set'
6
- require 'addressable/uri'
7
- require File.expand_path('spider_helper', __dir__)
8
- require File.expand_path('file_filter', __dir__)
9
-
10
- # 爬取任务类
11
- class TaskStruct
12
- # * href 请求链接
13
- # * local_path 保存数据的本地路径(此路径作为去重标准)
14
- # * http_method http方法,取值::get, :head, :delete, :put, :post, :patch, :options
15
- # * custom_data 自定义数据
16
- # * parse_method 解析保存文件的回调,参数是TaskStruct对象本身
17
- def initialize(href, # 请求链接
18
- local_path, # 保存数据的本地路径(此路径作为去重标准)
19
- # http方法,取值::get, :head, :delete, :put, :post, :patch, :options
20
- http_method: :get,
21
- custom_data: nil, # 自定义数据
22
- parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
23
- # 请求成功后的回调,此时可能没有保存文件,比如301,404
24
- # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
25
- # http_req.response_header.status 状态码
26
- # http_req.response_header 返回头
27
- # http_req.response 返回体
28
- callback: nil,
29
- # 请求失败后的回调
30
- # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
31
- errback: nil,
32
- stream_callback: nil, # 流数据处理回调
33
- convert_to_utf8: false, # 是否转换为utf8编码
34
- overwrite_exist: false, # 是否覆盖现有文件
35
- # 请求设置
36
- redirects: 3, # 重定向次数
37
- keepalive: nil, # (暂不支持复用)
38
- file: nil, # 要上传的文件路径
39
- path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
40
- query: nil, # 查询字符串,可以是string或hash类型
41
- body: nil, # 请求体,可以是string或hash类型
42
- head: nil, # 请求头
43
- # 连接设置
44
- connect_timeout: 60, # 连接超时时间
45
- inactivity_timeout: nil, # 连接后超时时间
46
- # ssl设置
47
- # ssl: {
48
- # :private_key_file => '/tmp/server.key',
49
- # :cert_chain_file => '/tmp/server.crt',
50
- # :verify_peer => false
51
- # }
52
- ssl: nil,
53
- # bind: {
54
- # :host => '123.123.123.123', # use a specific interface for outbound request
55
- # :port => '123'
56
- # }
57
- bind: nil,
58
- # 代理设置
59
- # proxy: {
60
- # :host => '127.0.0.1', # proxy address
61
- # :port => 9000, # proxy port
62
- # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
63
-
64
- # :authorization => ['user', 'pass'] # proxy authorization header
65
- # }
66
- proxy: nil)
67
- @href = href
68
- @local_path = local_path
69
- @http_method = http_method
70
- @custom_data = custom_data
71
- @parse_method = parse_method
72
- @callback = callback
73
- @errback = errback
74
- @stream_callback = stream_callback
75
- @convert_to_utf8 = convert_to_utf8
76
- @overwrite_exist = overwrite_exist
77
-
78
- @request_options = {
79
- redirects: redirects,
80
- keepalive: keepalive,
81
- file: file,
82
- path: path,
83
- query: query,
84
- body: body,
85
- head: head
86
- }.compact
87
-
88
- @connection_options = {
89
- connect_timeout: connect_timeout,
90
- inactivity_timeout: inactivity_timeout,
91
- ssl: ssl,
92
- bind: bind,
93
- proxy: proxy
94
- }.compact
95
- end
96
-
97
- attr_accessor :href, :local_path,
98
- :http_method,
99
- :custom_data,
100
- :request_object,
101
- :parse_method,
102
- :callback,
103
- :errback,
104
- :stream_callback,
105
- :convert_to_utf8,
106
- :overwrite_exist,
107
- :request_options,
108
- :connection_options
109
- end
110
-
111
- module ListSpider
112
- RANDOM_TIME = -1
113
- NO_LIMIT_CONCURRENT = -1
114
- DEFAULT_CONCURRNET_MAX = 50
115
- DEFAULT_INTERVAL = 0
116
-
117
- @random_time_range = 3..10
118
- @local_path_set = Set.new
119
-
120
- class << self
121
- def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
122
- if interval.is_a? Range
123
- @random_time_range = interval
124
- interval = RANDOM_TIME
125
- end
126
-
127
- @down_list = filter_list(down_list)
128
- @interval = interval
129
- @max = max
130
- @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
131
- @succeed_size = 0
132
- @failed_size = 0
133
-
134
- puts "total size:#{@down_list.size}"
135
- event_machine_start_list(next_task, method(:complete))
136
- end
137
-
138
- def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
139
- get_list([task], interval: interval, max: max)
140
- end
141
-
142
- def add_task(task)
143
- if task.is_a? Array
144
- need_down_list = filter_list(task)
145
- @down_list += need_down_list
146
- elsif task.is_a?TaskStruct
147
- need_down_list = filter_list([task])
148
- @down_list += need_down_list
149
- else
150
- puts "error task type:#{task.class}"
151
- end
152
- end
153
-
154
- private
155
-
156
- def event_machine_down(link_struct_list, callback = nil)
157
- failed_list = []
158
- succeed_list = []
159
- multi = EventMachine::MultiRequest.new
160
- begin_time = Time.now
161
-
162
- for_each_proc =
163
- proc do |task_struct|
164
- http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
165
- http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
166
- task_struct.request_object = http_req
167
-
168
- http_req.callback do
169
- s = http_req.response_header.status
170
- puts "#{Time.now}, http status code: #{s}"
171
-
172
- if s == 200
173
- local_dir = File.dirname(task_struct.local_path)
174
- FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
175
- begin
176
- File.open(task_struct.local_path, 'wb') do |f|
177
- f << if @convert_to_utf8 == true
178
- SpiderHelper.to_utf8(http_req.response)
179
- else
180
- http_req.response
181
- end
182
- end
183
- call_parse_method(task_struct)
184
- succeed_list << task_struct
185
- rescue StandardError => exception
186
- puts exception
187
- end
188
- end
189
- task_struct.callback.call(task_struct, http_req) if task_struct.callback
190
- end
191
-
192
- http_req.errback do
193
- puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"
194
-
195
- task_struct.errback.call(task_struct, http_req) if task_struct.errback
196
- end
197
-
198
- begin
199
- multi.add task_struct.local_path, http_req
200
- rescue StandardError => exception
201
- puts exception
202
- puts task_struct.href
203
- puts task_struct.local_path
204
- stop_machine
205
- end
206
- end
207
-
208
- cb =
209
- proc do
210
- end_time = Time.now
211
- puts "use time:#{end_time - begin_time} seconds"
212
- if callback.nil?
213
- stop_machine
214
- else
215
- callback.call(multi, succeed_list, failed_list)
216
- end
217
- end
218
- link_struct_list.each(&for_each_proc)
219
- multi.callback(&cb)
220
- end
221
-
222
- def stop_machine
223
- puts "success size:#{@succeed_size}"
224
- puts "failed size:#{@failed_size}"
225
- @end_time = Time.now
226
- puts "total use time:#{@end_time - @begin_time} seconds"
227
- EventMachine.stop
228
- @local_path_set.clear
229
- end
230
-
231
- def next_task
232
- @down_list.shift(@max)
233
- end
234
-
235
- def call_parse_method(task_struct)
236
- task_struct.parse_method.call(task_struct) if task_struct.parse_method
237
- end
238
-
239
- def complete(_multi, success_list, failed_list)
240
- @succeed_size += success_list.size
241
- @failed_size += failed_list.size
242
- @succeed_list.concat(success_list)
243
- @failed_list.concat(failed_list)
244
-
245
- todo = next_task
246
-
247
- if todo.empty?
248
- stop_machine
249
- else
250
- if @interval != 0
251
- if !success_list.empty? || !failed_list.empty?
252
- if @interval == RANDOM_TIME
253
- sleep(rand(@random_time_range))
254
- else
255
- sleep(@interval)
256
- end
257
- end
258
- end
259
- event_machine_down(todo, method(:complete))
260
- end
261
- end
262
-
263
- def event_machine_start_list(down_list, callback = nil)
264
- EventMachine.run do
265
- @succeed_list = []
266
- @failed_list = []
267
- @begin_time = Time.now
268
- if down_list.empty?
269
- if callback
270
- callback.call(nil, [], [])
271
- else
272
- stop_machine
273
- end
274
- else
275
- event_machine_down(down_list, callback)
276
- end
277
- end
278
- end
279
-
280
- def filter_list(down_list)
281
- need_down_list = []
282
- down_list.each do |ts|
283
- if !ts.overwrite_exist && File.exist?(ts.local_path)
284
- call_parse_method(ts)
285
- elsif @local_path_set.add?(ts.local_path)
286
- need_down_list << ts
287
- end
288
- end
289
- need_down_list
290
- end
291
- end
292
-
293
- Signal.trap('INT') do
294
- ListSpider.stop_machine
295
- exit!
296
- end
297
- end
1
+ require 'list_spider/version'
2
+ require 'em-http-request'
3
+ require 'nokogiri'
4
+ require 'fileutils'
5
+ require 'set'
6
+ require 'addressable/uri'
7
+ require File.expand_path('spider_helper', __dir__)
8
+ require File.expand_path('file_filter', __dir__)
9
+
10
+ # 爬取任务类
11
+ class TaskStruct
12
+ # * href 请求链接
13
+ # * local_path 保存数据的本地路径(此路径作为去重标准)
14
+ # * http_method http方法,取值::get, :head, :delete, :put, :post, :patch, :options
15
+ # * custom_data 自定义数据
16
+ # * parse_method 解析保存文件的回调,参数是TaskStruct对象本身
17
+ def initialize(href, # 请求链接
18
+ local_path, # 保存数据的本地路径(此路径作为去重标准)
19
+ # http方法,取值::get, :head, :delete, :put, :post, :patch, :options
20
+ http_method: :get,
21
+ custom_data: nil, # 自定义数据
22
+ parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
23
+ # 请求成功后的回调,此时可能没有保存文件,比如301,404
24
+ # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
25
+ # http_req.response_header.status 状态码
26
+ # http_req.response_header 返回头
27
+ # http_req.response 返回体
28
+ callback: nil,
29
+ # 请求失败后的回调
30
+ # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
31
+ errback: nil,
32
+ stream_callback: nil, # 流数据处理回调
33
+ convert_to_utf8: false, # 是否转换为utf8编码
34
+ overwrite_exist: false, # 是否覆盖现有文件
35
+ # 请求设置
36
+ redirects: 3, # 重定向次数
37
+ keepalive: nil, # (暂不支持复用)
38
+ file: nil, # 要上传的文件路径
39
+ path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
40
+ query: nil, # 查询字符串,可以是string或hash类型
41
+ body: nil, # 请求体,可以是string或hash类型
42
+ head: nil, # 请求头
43
+ # 连接设置
44
+ connect_timeout: 60, # 连接超时时间
45
+ inactivity_timeout: nil, # 连接后超时时间
46
+ # ssl设置
47
+ # ssl: {
48
+ # :private_key_file => '/tmp/server.key',
49
+ # :cert_chain_file => '/tmp/server.crt',
50
+ # :verify_peer => false
51
+ # }
52
+ ssl: nil,
53
+ # bind: {
54
+ # :host => '123.123.123.123', # use a specific interface for outbound request
55
+ # :port => '123'
56
+ # }
57
+ bind: nil,
58
+ # 代理设置
59
+ # proxy: {
60
+ # :host => '127.0.0.1', # proxy address
61
+ # :port => 9000, # proxy port
62
+ # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
63
+
64
+ # :authorization => ['user', 'pass'] # proxy authorization header
65
+ # }
66
+ proxy: nil)
67
+ @href = href
68
+ @local_path = local_path
69
+ @http_method = http_method
70
+ @custom_data = custom_data
71
+ @parse_method = parse_method
72
+ @callback = callback
73
+ @errback = errback
74
+ @stream_callback = stream_callback
75
+ @convert_to_utf8 = convert_to_utf8
76
+ @overwrite_exist = overwrite_exist
77
+
78
+ @request_options = {
79
+ redirects: redirects,
80
+ keepalive: keepalive,
81
+ file: file,
82
+ path: path,
83
+ query: query,
84
+ body: body,
85
+ head: head
86
+ }.compact
87
+
88
+ @connection_options = {
89
+ connect_timeout: connect_timeout,
90
+ inactivity_timeout: inactivity_timeout,
91
+ ssl: ssl,
92
+ bind: bind,
93
+ proxy: proxy
94
+ }.compact
95
+ end
96
+
97
+ attr_accessor :href, :local_path,
98
+ :http_method,
99
+ :custom_data,
100
+ :request_object,
101
+ :parse_method,
102
+ :callback,
103
+ :errback,
104
+ :stream_callback,
105
+ :convert_to_utf8,
106
+ :overwrite_exist,
107
+ :request_options,
108
+ :connection_options
109
+ end
110
+
111
+ module ListSpider
112
+ RANDOM_TIME = -1
113
+ NO_LIMIT_CONCURRENT = -1
114
+ DEFAULT_CONCURRNET_MAX = 50
115
+ DEFAULT_INTERVAL = 0
116
+
117
+ @random_time_range = 3..10
118
+ @local_path_set = Set.new
119
+ @down_list = []
120
+
121
+ class << self
122
+ def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
123
+ if interval.is_a? Range
124
+ @random_time_range = interval
125
+ interval = RANDOM_TIME
126
+ end
127
+
128
+ @down_list = filter_list(down_list)
129
+ @interval = interval
130
+ @max = max
131
+ @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
132
+ @succeed_size = 0
133
+ @failed_size = 0
134
+
135
+ puts "total size:#{@down_list.size}"
136
+ event_machine_start_list(next_task, method(:complete))
137
+ end
138
+
139
+ def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
140
+ get_list([task], interval: interval, max: max)
141
+ end
142
+
143
+ def add_task(task)
144
+ if task.is_a? Array
145
+ need_down_list = filter_list(task)
146
+ @down_list += need_down_list
147
+ elsif task.is_a?TaskStruct
148
+ need_down_list = filter_list([task])
149
+ @down_list += need_down_list
150
+ else
151
+ puts "error task type:#{task.class}"
152
+ end
153
+ end
154
+
155
+ private
156
+
157
+ def event_machine_down(link_struct_list, callback = nil)
158
+ failed_list = []
159
+ succeed_list = []
160
+ multi = EventMachine::MultiRequest.new
161
+ begin_time = Time.now
162
+
163
+ for_each_proc =
164
+ proc do |task_struct|
165
+ http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
166
+ http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
167
+ task_struct.request_object = http_req
168
+
169
+ http_req.callback do
170
+ s = http_req.response_header.status
171
+ puts "#{Time.now}, http status code: #{s}"
172
+
173
+ if s == 200
174
+ local_dir = File.dirname(task_struct.local_path)
175
+ FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
176
+ begin
177
+ File.open(task_struct.local_path, 'wb') do |f|
178
+ f << if @convert_to_utf8 == true
179
+ SpiderHelper.to_utf8(http_req.response)
180
+ else
181
+ http_req.response
182
+ end
183
+ end
184
+ call_parse_method(task_struct)
185
+ succeed_list << task_struct
186
+ rescue StandardError => exception
187
+ puts exception
188
+ end
189
+ end
190
+ task_struct.callback.call(task_struct, http_req) if task_struct.callback
191
+ end
192
+
193
+ http_req.errback do
194
+ puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"
195
+
196
+ task_struct.errback.call(task_struct, http_req) if task_struct.errback
197
+ end
198
+
199
+ begin
200
+ multi.add task_struct.local_path, http_req
201
+ rescue StandardError => exception
202
+ puts exception
203
+ puts task_struct.href
204
+ puts task_struct.local_path
205
+ stop_machine
206
+ end
207
+ end
208
+
209
+ cb =
210
+ proc do
211
+ end_time = Time.now
212
+ puts "use time:#{end_time - begin_time} seconds"
213
+ if callback.nil?
214
+ stop_machine
215
+ else
216
+ callback.call(multi, succeed_list, failed_list)
217
+ end
218
+ end
219
+ link_struct_list.each(&for_each_proc)
220
+ multi.callback(&cb)
221
+ end
222
+
223
+ def stop_machine
224
+ puts "success size:#{@succeed_size}"
225
+ puts "failed size:#{@failed_size}"
226
+ @end_time = Time.now
227
+ puts "total use time:#{@end_time - @begin_time} seconds"
228
+ EventMachine.stop
229
+ @local_path_set.clear
230
+ end
231
+
232
+ def next_task
233
+ @down_list.shift(@max)
234
+ end
235
+
236
+ def call_parse_method(task_struct)
237
+ task_struct.parse_method.call(task_struct) if task_struct.parse_method
238
+ end
239
+
240
+ def complete(_multi, success_list, failed_list)
241
+ @succeed_size += success_list.size
242
+ @failed_size += failed_list.size
243
+ @succeed_list.concat(success_list)
244
+ @failed_list.concat(failed_list)
245
+
246
+ todo = next_task
247
+
248
+ if todo.empty?
249
+ stop_machine
250
+ else
251
+ if @interval != 0
252
+ if !success_list.empty? || !failed_list.empty?
253
+ if @interval == RANDOM_TIME
254
+ sleep(rand(@random_time_range))
255
+ else
256
+ sleep(@interval)
257
+ end
258
+ end
259
+ end
260
+ event_machine_down(todo, method(:complete))
261
+ end
262
+ end
263
+
264
+ def event_machine_start_list(down_list, callback = nil)
265
+ EventMachine.run do
266
+ @succeed_list = []
267
+ @failed_list = []
268
+ @begin_time = Time.now
269
+ if down_list.empty?
270
+ if callback
271
+ callback.call(nil, [], [])
272
+ else
273
+ stop_machine
274
+ end
275
+ else
276
+ event_machine_down(down_list, callback)
277
+ end
278
+ end
279
+ end
280
+
281
+ def filter_list(down_list)
282
+ need_down_list = []
283
+ down_list.each do |ts|
284
+ if !ts.overwrite_exist && File.exist?(ts.local_path)
285
+ call_parse_method(ts)
286
+ elsif @local_path_set.add?(ts.local_path)
287
+ need_down_list << ts
288
+ end
289
+ end
290
+ need_down_list
291
+ end
292
+ end
293
+
294
+ Signal.trap('INT') do
295
+ ListSpider.stop_machine
296
+ exit!
297
+ end
298
+ end