list_spider 2.3.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/list_spider.rb CHANGED
@@ -1,297 +1,298 @@
1
- require 'list_spider/version'
2
- require 'em-http-request'
3
- require 'nokogiri'
4
- require 'fileutils'
5
- require 'set'
6
- require 'addressable/uri'
7
- require File.expand_path('spider_helper', __dir__)
8
- require File.expand_path('file_filter', __dir__)
9
-
10
- # 爬取任务类
11
- class TaskStruct
12
- # * href 请求链接
13
- # * local_path 保存数据的本地路径(此路径作为去重标准)
14
- # * http_method http方法,取值::get, :head, :delete, :put, :post, :patch, :options
15
- # * custom_data 自定义数据
16
- # * parse_method 解析保存文件的回调,参数是TaskStruct对象本身
17
- def initialize(href, # 请求链接
18
- local_path, # 保存数据的本地路径(此路径作为去重标准)
19
- # http方法,取值::get, :head, :delete, :put, :post, :patch, :options
20
- http_method: :get,
21
- custom_data: nil, # 自定义数据
22
- parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
23
- # 请求成功后的回调,此时可能没有保存文件,比如301,404
24
- # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
25
- # http_req.response_header.status 状态码
26
- # http_req.response_header 返回头
27
- # http_req.response 返回体
28
- callback: nil,
29
- # 请求失败后的回调
30
- # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
31
- errback: nil,
32
- stream_callback: nil, # 流数据处理回调
33
- convert_to_utf8: false, # 是否转换为utf8编码
34
- overwrite_exist: false, # 是否覆盖现有文件
35
- # 请求设置
36
- redirects: 3, # 重定向次数
37
- keepalive: nil, # (暂不支持复用)
38
- file: nil, # 要上传的文件路径
39
- path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
40
- query: nil, # 查询字符串,可以是string或hash类型
41
- body: nil, # 请求体,可以是string或hash类型
42
- head: nil, # 请求头
43
- # 连接设置
44
- connect_timeout: 60, # 连接超时时间
45
- inactivity_timeout: nil, # 连接后超时时间
46
- # ssl设置
47
- # ssl: {
48
- # :private_key_file => '/tmp/server.key',
49
- # :cert_chain_file => '/tmp/server.crt',
50
- # :verify_peer => false
51
- # }
52
- ssl: nil,
53
- # bind: {
54
- # :host => '123.123.123.123', # use a specific interface for outbound request
55
- # :port => '123'
56
- # }
57
- bind: nil,
58
- # 代理设置
59
- # proxy: {
60
- # :host => '127.0.0.1', # proxy address
61
- # :port => 9000, # proxy port
62
- # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
63
-
64
- # :authorization => ['user', 'pass'] # proxy authorization header
65
- # }
66
- proxy: nil)
67
- @href = href
68
- @local_path = local_path
69
- @http_method = http_method
70
- @custom_data = custom_data
71
- @parse_method = parse_method
72
- @callback = callback
73
- @errback = errback
74
- @stream_callback = stream_callback
75
- @convert_to_utf8 = convert_to_utf8
76
- @overwrite_exist = overwrite_exist
77
-
78
- @request_options = {
79
- redirects: redirects,
80
- keepalive: keepalive,
81
- file: file,
82
- path: path,
83
- query: query,
84
- body: body,
85
- head: head
86
- }.compact
87
-
88
- @connection_options = {
89
- connect_timeout: connect_timeout,
90
- inactivity_timeout: inactivity_timeout,
91
- ssl: ssl,
92
- bind: bind,
93
- proxy: proxy
94
- }.compact
95
- end
96
-
97
- attr_accessor :href, :local_path,
98
- :http_method,
99
- :custom_data,
100
- :request_object,
101
- :parse_method,
102
- :callback,
103
- :errback,
104
- :stream_callback,
105
- :convert_to_utf8,
106
- :overwrite_exist,
107
- :request_options,
108
- :connection_options
109
- end
110
-
111
- module ListSpider
112
- RANDOM_TIME = -1
113
- NO_LIMIT_CONCURRENT = -1
114
- DEFAULT_CONCURRNET_MAX = 50
115
- DEFAULT_INTERVAL = 0
116
-
117
- @random_time_range = 3..10
118
- @local_path_set = Set.new
119
-
120
- class << self
121
- def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
122
- if interval.is_a? Range
123
- @random_time_range = interval
124
- interval = RANDOM_TIME
125
- end
126
-
127
- @down_list = filter_list(down_list)
128
- @interval = interval
129
- @max = max
130
- @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
131
- @succeed_size = 0
132
- @failed_size = 0
133
-
134
- puts "total size:#{@down_list.size}"
135
- event_machine_start_list(next_task, method(:complete))
136
- end
137
-
138
- def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
139
- get_list([task], interval: interval, max: max)
140
- end
141
-
142
- def add_task(task)
143
- if task.is_a? Array
144
- need_down_list = filter_list(task)
145
- @down_list += need_down_list
146
- elsif task.is_a?TaskStruct
147
- need_down_list = filter_list([task])
148
- @down_list += need_down_list
149
- else
150
- puts "error task type:#{task.class}"
151
- end
152
- end
153
-
154
- private
155
-
156
- def event_machine_down(link_struct_list, callback = nil)
157
- failed_list = []
158
- succeed_list = []
159
- multi = EventMachine::MultiRequest.new
160
- begin_time = Time.now
161
-
162
- for_each_proc =
163
- proc do |task_struct|
164
- http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
165
- http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
166
- task_struct.request_object = http_req
167
-
168
- http_req.callback do
169
- s = http_req.response_header.status
170
- puts "#{Time.now}, http status code: #{s}"
171
-
172
- if s == 200
173
- local_dir = File.dirname(task_struct.local_path)
174
- FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
175
- begin
176
- File.open(task_struct.local_path, 'wb') do |f|
177
- f << if @convert_to_utf8 == true
178
- SpiderHelper.to_utf8(http_req.response)
179
- else
180
- http_req.response
181
- end
182
- end
183
- call_parse_method(task_struct)
184
- succeed_list << task_struct
185
- rescue StandardError => exception
186
- puts exception
187
- end
188
- end
189
- task_struct.callback.call(task_struct, http_req) if task_struct.callback
190
- end
191
-
192
- http_req.errback do
193
- puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"
194
-
195
- task_struct.errback.call(task_struct, http_req) if task_struct.errback
196
- end
197
-
198
- begin
199
- multi.add task_struct.local_path, http_req
200
- rescue StandardError => exception
201
- puts exception
202
- puts task_struct.href
203
- puts task_struct.local_path
204
- stop_machine
205
- end
206
- end
207
-
208
- cb =
209
- proc do
210
- end_time = Time.now
211
- puts "use time:#{end_time - begin_time} seconds"
212
- if callback.nil?
213
- stop_machine
214
- else
215
- callback.call(multi, succeed_list, failed_list)
216
- end
217
- end
218
- link_struct_list.each(&for_each_proc)
219
- multi.callback(&cb)
220
- end
221
-
222
- def stop_machine
223
- puts "success size:#{@succeed_size}"
224
- puts "failed size:#{@failed_size}"
225
- @end_time = Time.now
226
- puts "total use time:#{@end_time - @begin_time} seconds"
227
- EventMachine.stop
228
- @local_path_set.clear
229
- end
230
-
231
- def next_task
232
- @down_list.shift(@max)
233
- end
234
-
235
- def call_parse_method(task_struct)
236
- task_struct.parse_method.call(task_struct) if task_struct.parse_method
237
- end
238
-
239
- def complete(_multi, success_list, failed_list)
240
- @succeed_size += success_list.size
241
- @failed_size += failed_list.size
242
- @succeed_list.concat(success_list)
243
- @failed_list.concat(failed_list)
244
-
245
- todo = next_task
246
-
247
- if todo.empty?
248
- stop_machine
249
- else
250
- if @interval != 0
251
- if !success_list.empty? || !failed_list.empty?
252
- if @interval == RANDOM_TIME
253
- sleep(rand(@random_time_range))
254
- else
255
- sleep(@interval)
256
- end
257
- end
258
- end
259
- event_machine_down(todo, method(:complete))
260
- end
261
- end
262
-
263
- def event_machine_start_list(down_list, callback = nil)
264
- EventMachine.run do
265
- @succeed_list = []
266
- @failed_list = []
267
- @begin_time = Time.now
268
- if down_list.empty?
269
- if callback
270
- callback.call(nil, [], [])
271
- else
272
- stop_machine
273
- end
274
- else
275
- event_machine_down(down_list, callback)
276
- end
277
- end
278
- end
279
-
280
- def filter_list(down_list)
281
- need_down_list = []
282
- down_list.each do |ts|
283
- if !ts.overwrite_exist && File.exist?(ts.local_path)
284
- call_parse_method(ts)
285
- elsif @local_path_set.add?(ts.local_path)
286
- need_down_list << ts
287
- end
288
- end
289
- need_down_list
290
- end
291
- end
292
-
293
- Signal.trap('INT') do
294
- ListSpider.stop_machine
295
- exit!
296
- end
297
- end
1
+ require 'list_spider/version'
2
+ require 'em-http-request'
3
+ require 'nokogiri'
4
+ require 'fileutils'
5
+ require 'set'
6
+ require 'addressable/uri'
7
+ require File.expand_path('spider_helper', __dir__)
8
+ require File.expand_path('file_filter', __dir__)
9
+
10
# Describes one crawl task: what to request, where to store the result, and
# which callbacks to run.
class TaskStruct
  attr_accessor :href, :local_path, :http_method, :custom_data,
                :request_object, :parse_method, :callback, :errback,
                :stream_callback, :convert_to_utf8, :overwrite_exist,
                :request_options, :connection_options

  # @param href [String] request URL
  # @param local_path [String] local path the data is saved to (this path is
  #   also the de-duplication key)
  # @param http_method [Symbol] one of :get, :head, :delete, :put, :post,
  #   :patch, :options
  # @param custom_data [Object] arbitrary caller data carried with the task
  # @param parse_method [#call] callback that parses the saved file; receives
  #   the TaskStruct itself
  # @param callback [#call] called after the request succeeds; the file may
  #   not have been saved at that point (e.g. 301, 404). Receives the
  #   TaskStruct and the matching EventMachine::HttpRequest object:
  #   * http_req.response_header.status - status code
  #   * http_req.response_header        - response headers
  #   * http_req.response               - response body
  # @param errback [#call] called after the request fails; receives the
  #   TaskStruct and the matching EventMachine::HttpRequest object
  # @param stream_callback [#call] callback for streamed data
  # @param convert_to_utf8 [Boolean] whether to convert the data to UTF-8
  # @param overwrite_exist [Boolean] whether to overwrite an existing file
  #
  # Request settings:
  # @param redirects [Integer] number of redirects
  # @param keepalive [Boolean] keep-alive (connection reuse is not supported yet)
  # @param file [String] path of a file to upload
  # @param path [String] request path, useful for pipelined requests (not
  #   supported yet)
  # @param query [String, Hash] query string
  # @param body [String, Hash] request body
  # @param head [Hash] request headers
  #
  # Connection settings:
  # @param connect_timeout [Numeric] connect timeout
  # @param inactivity_timeout [Numeric] inactivity timeout once connected
  # @param ssl [Hash] SSL settings, e.g.
  #   { private_key_file: '/tmp/server.key',
  #     cert_chain_file: '/tmp/server.crt',
  #     verify_peer: false }
  # @param bind [Hash] interface to use for the outbound request, e.g.
  #   { host: '123.123.123.123', port: '123' }
  # @param proxy [Hash] proxy settings, e.g.
  #   { host: '127.0.0.1',                 # proxy address
  #     port: 9000,                        # proxy port
  #     type: :socks5,                     # default mode is HTTP proxy
  #     authorization: ['user', 'pass'] }  # proxy authorization header
  def initialize(href, local_path,
                 http_method: :get, custom_data: nil, parse_method: nil,
                 callback: nil, errback: nil, stream_callback: nil,
                 convert_to_utf8: false, overwrite_exist: false,
                 redirects: 3, keepalive: nil, file: nil, path: nil,
                 query: nil, body: nil, head: nil,
                 connect_timeout: 60, inactivity_timeout: nil,
                 ssl: nil, bind: nil, proxy: nil)
    @href, @local_path, @http_method = href, local_path, http_method
    @custom_data = custom_data
    @parse_method, @callback, @errback = parse_method, callback, errback
    @stream_callback = stream_callback
    @convert_to_utf8, @overwrite_exist = convert_to_utf8, overwrite_exist

    # Only explicitly provided (non-nil) settings are kept in either hash.
    per_request = { redirects: redirects, keepalive: keepalive, file: file,
                    path: path, query: query, body: body, head: head }
    @request_options = per_request.reject { |_name, value| value.nil? }

    per_connection = { connect_timeout: connect_timeout,
                       inactivity_timeout: inactivity_timeout,
                       ssl: ssl, bind: bind, proxy: proxy }
    @connection_options = per_connection.reject { |_name, value| value.nil? }
  end
end
110
+
111
# Downloads a list of TaskStruct tasks in batches using EventMachine and
# em-http-request. Each response with status 200 is written to the task's
# +local_path+; tasks whose +local_path+ was already queued in the current run
# are skipped, and tasks whose file already exists are only parsed (unless
# +overwrite_exist+ is set).
#
# Loading this module installs an INT (Ctrl-C) signal handler that stops the
# reactor, if one is running, and exits the process immediately.
module ListSpider
  # Value of @interval meaning "wait a random time taken from @random_time_range".
  RANDOM_TIME = -1
  # Pass as +max:+ to request the whole list in a single batch.
  NO_LIMIT_CONCURRENT = -1
  # Default batch size. The misspelled name is kept for backward compatibility.
  DEFAULT_CONCURRNET_MAX = 50
  # Correctly spelled alias of DEFAULT_CONCURRNET_MAX.
  DEFAULT_CONCURRENT_MAX = DEFAULT_CONCURRNET_MAX
  # Default wait between two batches (0 = start the next batch immediately).
  DEFAULT_INTERVAL = 0

  @random_time_range = 3..10
  @local_path_set = Set.new
  @down_list = []

  class << self
    # Runs the reactor and downloads +down_list+ in batches of at most +max+.
    #
    # @param down_list [Array<TaskStruct>] tasks to download
    # @param interval [Numeric, Range] wait between batches; a Range (or
    #   RANDOM_TIME) picks a random wait from the range for every batch
    # @param max [Integer] batch size, or NO_LIMIT_CONCURRENT for one batch
    def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
      if interval.is_a?(Range)
        @random_time_range = interval
        interval = RANDOM_TIME
      end

      @down_list = filter_list(down_list)
      @interval = interval
      @max = max == NO_LIMIT_CONCURRENT ? @down_list.size : max
      @succeed_size = 0
      @failed_size = 0

      puts "total size:#{@down_list.size}"
      event_machine_start_list(next_task, method(:complete))
    end

    # Convenience wrapper: downloads a single task via #get_list.
    def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
      get_list([task], interval: interval, max: max)
    end

    # Appends one TaskStruct, or an Array of them, to the pending list after
    # filtering. Any other argument type is reported on stdout and ignored.
    def add_task(task)
      case task
      when Array
        @down_list += filter_list(task)
      when TaskStruct
        @down_list += filter_list([task])
      else
        puts "error task type:#{task.class}"
      end
    end

    private

    # Issues one batch of requests. When every request of the batch has
    # finished, +callback+ is called with (multi, succeed_list, failed_list);
    # without a callback the reactor is stopped instead.
    def event_machine_down(link_struct_list, callback = nil)
      failed_list = []
      succeed_list = []
      multi = EventMachine::MultiRequest.new
      begin_time = Time.now

      link_struct_list.each do |task_struct|
        http_req = start_request(task_struct, succeed_list, failed_list)
        begin
          multi.add task_struct.local_path, http_req
        rescue StandardError => exception
          puts exception
          puts task_struct.href
          puts task_struct.local_path
          stop_machine
        end
      end

      multi.callback do
        puts "use time:#{Time.now - begin_time} seconds"
        if callback.nil?
          stop_machine
        else
          callback.call(multi, succeed_list, failed_list)
        end
      end
    end

    # Starts the request for one task and wires up its stream, success and
    # failure handlers. Returns the request object.
    def start_request(task_struct, succeed_list, failed_list)
      connection = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options)
      http_req = connection.public_send(task_struct.http_method, task_struct.request_options)
      if task_struct.stream_callback
        # The callback lives on the task; a bare `stream_callback` is not
        # defined in this scope.
        http_req.stream { |chunk| task_struct.stream_callback.call(chunk) }
      end
      task_struct.request_object = http_req

      http_req.callback do
        status = http_req.response_header.status
        puts "#{Time.now}, http status code: #{status}"

        # Only 200 responses are saved; other statuses are counted neither as
        # success nor as failure and are left to the task's own callback.
        if status == 200
          target = save_response(task_struct, http_req) ? succeed_list : failed_list
          target << task_struct
        end
        task_struct.callback.call(task_struct, http_req) if task_struct.callback
      end

      http_req.errback do
        puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"
        failed_list << task_struct
        task_struct.errback.call(task_struct, http_req) if task_struct.errback
      end

      http_req
    end

    # Writes the response body to the task's local_path (creating the parent
    # directory) and then runs the task's parse_method. Returns true on
    # success; on any StandardError the error is printed and false is returned.
    def save_response(task_struct, http_req)
      FileUtils.mkdir_p(File.dirname(task_struct.local_path))
      # Build the body before opening the file so a conversion error does not
      # leave a truncated file behind.
      # NOTE(review): SpiderHelper.to_utf8 is defined elsewhere; presumably it
      # re-encodes the body as UTF-8 - confirm.
      body = task_struct.convert_to_utf8 ? SpiderHelper.to_utf8(http_req.response) : http_req.response
      File.open(task_struct.local_path, 'wb') { |f| f << body }
      call_parse_method(task_struct)
      true
    rescue StandardError => exception
      puts exception
      false
    end

    # Prints the run summary, stops the reactor and forgets the queued paths.
    def stop_machine
      puts "success size:#{@succeed_size}"
      puts "failed size:#{@failed_size}"
      @end_time = Time.now
      puts "total use time:#{@end_time - @begin_time} seconds"
      EventMachine.stop
      @local_path_set.clear
    end

    # Removes and returns the next batch (at most @max tasks).
    def next_task
      @down_list.shift(@max)
    end

    def call_parse_method(task_struct)
      task_struct.parse_method.call(task_struct) if task_struct.parse_method
    end

    # Batch-completion handler: accumulates the results, then either stops the
    # reactor (nothing left) or starts the next batch, after the configured
    # wait if there is one.
    def complete(_multi, success_list, failed_list)
      @succeed_size += success_list.size
      @failed_size += failed_list.size
      @succeed_list.concat(success_list)
      @failed_list.concat(failed_list)

      todo = next_task
      return stop_machine if todo.empty?

      delay = batch_delay(success_list, failed_list)
      if delay > 0
        # A reactor timer instead of Kernel#sleep: sleeping here would block
        # the EventMachine loop for the whole wait.
        EventMachine.add_timer(delay) { event_machine_down(todo, method(:complete)) }
      else
        event_machine_down(todo, method(:complete))
      end
    end

    # Wait before the next batch: 0 when no interval is configured or when the
    # finished batch recorded no success and no failure.
    def batch_delay(success_list, failed_list)
      return 0 if @interval == 0
      return 0 if success_list.empty? && failed_list.empty?

      @interval == RANDOM_TIME ? rand(@random_time_range) : @interval
    end

    # Runs the reactor and kicks off the first batch.
    def event_machine_start_list(down_list, callback = nil)
      EventMachine.run do
        @succeed_list = []
        @failed_list = []
        @begin_time = Time.now
        if down_list.empty?
          if callback
            callback.call(nil, [], [])
          else
            stop_machine
          end
        else
          event_machine_down(down_list, callback)
        end
      end
    end

    # Returns the tasks that still need downloading. A task whose file already
    # exists (and may not be overwritten) is only parsed; a task whose
    # local_path is already queued is dropped.
    def filter_list(down_list)
      down_list.each_with_object([]) do |ts, need_down_list|
        if !ts.overwrite_exist && File.exist?(ts.local_path)
          call_parse_method(ts)
        elsif @local_path_set.add?(ts.local_path)
          need_down_list << ts
        end
      end
    end
  end

  Signal.trap('INT') do
    # stop_machine is private, so it cannot be called with an explicit
    # receiver; it also needs the start time that only exists while a run is
    # in progress.
    send(:stop_machine) if EventMachine.reactor_running?
    exit!
  end
end