list_spider 2.3.0 → 2.8.0

data/bin/setup CHANGED
@@ -1,8 +1,8 @@
- #!/usr/bin/env bash
- set -euo pipefail
- IFS=$'\n\t'
- set -vx
-
- bundle install
-
- # Do any other automated setup that you need to do here
+ #!/usr/bin/env bash
+ set -euo pipefail
+ IFS=$'\n\t'
+ set -vx
+
+ bundle install
+
+ # Do any other automated setup that you need to do here
@@ -1,3 +1,3 @@
- #!/bin/sh
-
+ #!/bin/sh
+
  rubocop -a -D -f simple -o rubocopresult
@@ -1,72 +1,72 @@
-
- class FileFilter
-   # 4033
-   # 920
-   def initialize(dir_pattern, size_threshold: 1000,
-                  cust_judge: nil, process_block: nil)
-     @dir_pattern = dir_pattern
-     @size_threshold = size_threshold
-     @cust_judge = cust_judge ? cust_judge : method(:default_judge)
-     @total = 0
-     @process_block = process_block
-   end
-
-   def default_judge(f)
-     File.size(f) <= @size_threshold
-   end
-
-   def filter_file(f)
-     if @cust_judge.call(f)
-       @total += 1
-       @process_block.call(f)
-     end
-   end
-
-   def start
-     Dir.glob(@dir_pattern) do |f|
-       filter_file(f)
-     end
-     puts "total:#{@total}"
-   end
-
-   def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
-     FileFilter.new(
-       dir_pattern,
-       size_threshold: size_threshold,
-       cust_judge: cust_judge,
-       process_block:
-         proc do |f|
-           puts "deleted file: #{f}"
-           File.delete(f)
-         end
-     ).start
-   end
-
-   def self.check(dir_pattern, size_threshold: 1000, cust_judge: nil)
-     FileFilter.new(
-       dir_pattern,
-       size_threshold: size_threshold,
-       cust_judge: cust_judge,
-       process_block:
-         proc do |f|
-           puts "filterd file: #{f}"
-         end
-     ).start
-   end
-
-   def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
-                              size_threshold: 1000, cust_judge: nil)
-     result_file = File.open(save_file_name, 'wt')
-     FileFilter.new(
-       dir_pattern,
-       size_threshold: size_threshold,
-       cust_judge: cust_judge,
-       process_block:
-         proc do |f|
-           puts "filterd file: #{f}"
-           result_file << f << "\n"
-         end
-     ).start
-     result_file.close
-   end
- end
+
+ class FileFilter
+   # 4033
+   # 920
+   def initialize(dir_pattern, size_threshold: 1000,
+                  cust_judge: nil, process_block: nil)
+     @dir_pattern = dir_pattern
+     @size_threshold = size_threshold
+     @cust_judge = cust_judge ? cust_judge : method(:default_judge)
+     @total = 0
+     @process_block = process_block
+   end
+
+   def default_judge(f)
+     File.size(f) <= @size_threshold
+   end
+
+   def filter_file(f)
+     if @cust_judge.call(f)
+       @total += 1
+       @process_block.call(f)
+     end
+   end
+
+   def start
+     Dir.glob(@dir_pattern) do |f|
+       filter_file(f)
+     end
+     puts "total:#{@total}"
+   end
+
+   def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
+     FileFilter.new(
+       dir_pattern,
+       size_threshold: size_threshold,
+       cust_judge: cust_judge,
+       process_block:
+         proc do |f|
+           puts "deleted file: #{f}"
+           File.delete(f)
+         end
+     ).start
+   end
+
+   def self.check(dir_pattern, size_threshold: 1000, cust_judge: nil)
+     FileFilter.new(
+       dir_pattern,
+       size_threshold: size_threshold,
+       cust_judge: cust_judge,
+       process_block:
+         proc do |f|
+           puts "filterd file: #{f}"
+         end
+     ).start
+   end
+
+   def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
+                              size_threshold: 1000, cust_judge: nil)
+     result_file = File.open(save_file_name, 'wt')
+     FileFilter.new(
+       dir_pattern,
+       size_threshold: size_threshold,
+       cust_judge: cust_judge,
+       process_block:
+         proc do |f|
+           puts "filterd file: #{f}"
+           result_file << f << "\n"
+         end
+     ).start
+     result_file.close
+   end
+ end
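
For reference, a minimal usage sketch of the FileFilter helper shown above (its code is unchanged between 2.3.0 and 2.8.0). The glob pattern, threshold, result file name, and custom judge below are illustrative values, not part of the gem:

# Illustrative only: scan ./data for small HTML files and act on them.
require 'list_spider'

# Files at or below 200 bytes are reported via the built-in size judge.
FileFilter.check('./data/**/*.html', size_threshold: 200)

# Same scan, but matched paths are also appended to a result file.
FileFilter.check_save_result('./data/**/*.html', save_file_name: 'small_files.txt',
                             size_threshold: 200)

# A custom judge replaces the size check entirely; here only empty files are deleted.
FileFilter.delete('./data/**/*.html', cust_judge: proc { |f| File.size(f).zero? })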
@@ -1,297 +1,308 @@
- require 'list_spider/version'
- require 'em-http-request'
- require 'nokogiri'
- require 'fileutils'
- require 'set'
- require 'addressable/uri'
- require File.expand_path('spider_helper', __dir__)
- require File.expand_path('file_filter', __dir__)
-
- # Crawl task class
- class TaskStruct
-   # * href request URL
-   # * local_path local path where the response is saved (used as the de-duplication key)
-   # * http_method HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
-   # * custom_data user-defined data
-   # * parse_method callback for parsing the saved file; receives the TaskStruct itself
-   def initialize(href, # request URL
-                  local_path, # local path where the response is saved (used as the de-duplication key)
-                  # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
-                  http_method: :get,
-                  custom_data: nil, # user-defined data
-                  parse_method: nil, # callback for parsing the saved file; receives the TaskStruct itself
-                  # callback invoked after a successful request; a file may not have been saved (e.g. 301, 404)
-                  # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
-                  # http_req.response_header.status status code
-                  # http_req.response_header response headers
-                  # http_req.response response body
-                  callback: nil,
-                  # callback invoked after a failed request
-                  # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
-                  errback: nil,
-                  stream_callback: nil, # callback for handling streamed data
-                  convert_to_utf8: false, # whether to convert the response to UTF-8
-                  overwrite_exist: false, # whether to overwrite an existing file
-                  # request options
-                  redirects: 3, # number of redirects to follow
-                  keepalive: nil, # (connection reuse not supported yet)
-                  file: nil, # path of a file to upload
-                  path: nil, # request path, useful for pipelined requests (not supported yet)
-                  query: nil, # query string, either a String or a Hash
-                  body: nil, # request body, either a String or a Hash
-                  head: nil, # request headers
-                  # connection options
-                  connect_timeout: 60, # connection timeout
-                  inactivity_timeout: nil, # inactivity timeout after the connection is established
-                  # SSL options
-                  # ssl: {
-                  #   :private_key_file => '/tmp/server.key',
-                  #   :cert_chain_file => '/tmp/server.crt',
-                  #   :verify_peer => false
-                  # }
-                  ssl: nil,
-                  # bind: {
-                  #   :host => '123.123.123.123', # use a specific interface for outbound request
-                  #   :port => '123'
-                  # }
-                  bind: nil,
-                  # proxy options
-                  # proxy: {
-                  #   :host => '127.0.0.1', # proxy address
-                  #   :port => 9000, # proxy port
-                  #   :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
-
-                  #   :authorization => ['user', 'pass'] # proxy authorization header
-                  # }
-                  proxy: nil)
-     @href = href
-     @local_path = local_path
-     @http_method = http_method
-     @custom_data = custom_data
-     @parse_method = parse_method
-     @callback = callback
-     @errback = errback
-     @stream_callback = stream_callback
-     @convert_to_utf8 = convert_to_utf8
-     @overwrite_exist = overwrite_exist
-
-     @request_options = {
-       redirects: redirects,
-       keepalive: keepalive,
-       file: file,
-       path: path,
-       query: query,
-       body: body,
-       head: head
-     }.compact
-
-     @connection_options = {
-       connect_timeout: connect_timeout,
-       inactivity_timeout: inactivity_timeout,
-       ssl: ssl,
-       bind: bind,
-       proxy: proxy
-     }.compact
-   end
-
-   attr_accessor :href, :local_path,
-                 :http_method,
-                 :custom_data,
-                 :request_object,
-                 :parse_method,
-                 :callback,
-                 :errback,
-                 :stream_callback,
-                 :convert_to_utf8,
-                 :overwrite_exist,
-                 :request_options,
-                 :connection_options
- end
-
- module ListSpider
-   RANDOM_TIME = -1
-   NO_LIMIT_CONCURRENT = -1
-   DEFAULT_CONCURRNET_MAX = 50
-   DEFAULT_INTERVAL = 0
-
-   @random_time_range = 3..10
-   @local_path_set = Set.new
-
-   class << self
-     def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
-       if interval.is_a? Range
-         @random_time_range = interval
-         interval = RANDOM_TIME
-       end
-
-       @down_list = filter_list(down_list)
-       @interval = interval
-       @max = max
-       @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
-       @succeed_size = 0
-       @failed_size = 0
-
-       puts "total size:#{@down_list.size}"
-       event_machine_start_list(next_task, method(:complete))
-     end
-
-     def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
-       get_list([task], interval: interval, max: max)
-     end
-
-     def add_task(task)
-       if task.is_a? Array
-         need_down_list = filter_list(task)
-         @down_list += need_down_list
-       elsif task.is_a?TaskStruct
-         need_down_list = filter_list([task])
-         @down_list += need_down_list
-       else
-         puts "error task type:#{task.class}"
-       end
-     end
-
-     private
-
-     def event_machine_down(link_struct_list, callback = nil)
-       failed_list = []
-       succeed_list = []
-       multi = EventMachine::MultiRequest.new
-       begin_time = Time.now
-
-       for_each_proc =
-         proc do |task_struct|
-           http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
-           http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
-           task_struct.request_object = http_req
-
-           http_req.callback do
-             s = http_req.response_header.status
-             puts "#{Time.now}, http status code: #{s}"
-
-             if s == 200
-               local_dir = File.dirname(task_struct.local_path)
-               FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
-               begin
-                 File.open(task_struct.local_path, 'wb') do |f|
-                   f << if @convert_to_utf8 == true
-                          SpiderHelper.to_utf8(http_req.response)
-                        else
-                          http_req.response
-                        end
-                 end
-                 call_parse_method(task_struct)
-                 succeed_list << task_struct
-               rescue StandardError => exception
-                 puts exception
-               end
-             end
-             task_struct.callback.call(task_struct, http_req) if task_struct.callback
-           end
-
-           http_req.errback do
-             puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"
-
-             task_struct.errback.call(task_struct, http_req) if task_struct.errback
-           end
-
-           begin
-             multi.add task_struct.local_path, http_req
-           rescue StandardError => exception
-             puts exception
-             puts task_struct.href
-             puts task_struct.local_path
-             stop_machine
-           end
-         end
-
-       cb =
-         proc do
-           end_time = Time.now
-           puts "use time:#{end_time - begin_time} seconds"
-           if callback.nil?
-             stop_machine
-           else
-             callback.call(multi, succeed_list, failed_list)
-           end
-         end
-       link_struct_list.each(&for_each_proc)
-       multi.callback(&cb)
-     end
-
-     def stop_machine
-       puts "success size:#{@succeed_size}"
-       puts "failed size:#{@failed_size}"
-       @end_time = Time.now
-       puts "total use time:#{@end_time - @begin_time} seconds"
-       EventMachine.stop
-       @local_path_set.clear
-     end
-
-     def next_task
-       @down_list.shift(@max)
-     end
-
-     def call_parse_method(task_struct)
-       task_struct.parse_method.call(task_struct) if task_struct.parse_method
-     end
-
-     def complete(_multi, success_list, failed_list)
-       @succeed_size += success_list.size
-       @failed_size += failed_list.size
-       @succeed_list.concat(success_list)
-       @failed_list.concat(failed_list)
-
-       todo = next_task
-
-       if todo.empty?
-         stop_machine
-       else
-         if @interval != 0
-           if !success_list.empty? || !failed_list.empty?
-             if @interval == RANDOM_TIME
-               sleep(rand(@random_time_range))
-             else
-               sleep(@interval)
-             end
-           end
-         end
-         event_machine_down(todo, method(:complete))
-       end
-     end
-
-     def event_machine_start_list(down_list, callback = nil)
-       EventMachine.run do
-         @succeed_list = []
-         @failed_list = []
-         @begin_time = Time.now
-         if down_list.empty?
-           if callback
-             callback.call(nil, [], [])
-           else
-             stop_machine
-           end
-         else
-           event_machine_down(down_list, callback)
-         end
-       end
-     end
-
-     def filter_list(down_list)
-       need_down_list = []
-       down_list.each do |ts|
-         if !ts.overwrite_exist && File.exist?(ts.local_path)
-           call_parse_method(ts)
-         elsif @local_path_set.add?(ts.local_path)
-           need_down_list << ts
-         end
-       end
-       need_down_list
-     end
-   end
-
-   Signal.trap('INT') do
-     ListSpider.stop_machine
-     exit!
-   end
- end
+ require 'list_spider/version'
+ require 'em-http-request'
+ require 'nokogiri'
+ require 'fileutils'
+ require 'set'
+ require 'securerandom'
+ require 'addressable/uri'
+ require File.expand_path('spider_helper', __dir__)
+ require File.expand_path('file_filter', __dir__)
+
+ # Crawl task class
+ class TaskStruct
+   # * href request URL
+   # * local_path local path where the response is saved (used as the de-duplication key when saving files)
+   # * http_method HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+   # * custom_data user-defined data
+   # * parse_method callback for parsing the saved file; receives the TaskStruct itself
+   def initialize(href, # request URL
+                  local_path = :nil, # local path where the response is saved (used as the de-duplication key when saving files)
+                  # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+                  http_method: :get,
+                  custom_data: nil, # user-defined data
+                  parse_method: nil, # callback for parsing the saved file; receives the TaskStruct itself
+                  # callback invoked after a successful request; a file may not have been saved (e.g. 301, 404)
+                  # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+                  # http_req.response_header.status status code
+                  # http_req.response_header response headers
+                  # http_req.response response body
+                  callback: nil,
+                  # callback invoked after a failed request
+                  # receives the TaskStruct itself and the corresponding EventMachine::HttpRequest object
+                  errback: nil,
+                  stream_callback: nil, # callback for handling streamed data
+                  convert_to_utf8: false, # whether to convert the response to UTF-8
+                  overwrite_exist: false, # whether to overwrite an existing file
+                  # request options
+                  redirects: 3, # number of redirects to follow
+                  keepalive: nil, # (connection reuse not supported yet)
+                  file: nil, # path of a file to upload
+                  path: nil, # request path, useful for pipelined requests (not supported yet)
+                  query: nil, # query string, either a String or a Hash
+                  body: nil, # request body, either a String or a Hash
+                  head: nil, # request headers
+                  # connection options
+                  connect_timeout: 60, # connection timeout
+                  inactivity_timeout: nil, # inactivity timeout after the connection is established
+                  # SSL options
+                  # ssl: {
+                  #   :private_key_file => '/tmp/server.key',
+                  #   :cert_chain_file => '/tmp/server.crt',
+                  #   :verify_peer => false
+                  # }
+                  ssl: nil,
+                  # bind: {
+                  #   :host => '123.123.123.123', # use a specific interface for outbound request
+                  #   :port => '123'
+                  # }
+                  bind: nil,
+                  # proxy options
+                  # proxy: {
+                  #   :host => '127.0.0.1', # proxy address
+                  #   :port => 9000, # proxy port
+                  #   :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+
+                  #   :authorization => ['user', 'pass'] # proxy authorization header
+                  # }
+                  proxy: nil)
+     @href = href
+     @local_path = local_path
+     @http_method = http_method
+     @custom_data = custom_data
+     @parse_method = parse_method
+     @callback = callback
+     @errback = errback
+     @stream_callback = stream_callback
+     @convert_to_utf8 = convert_to_utf8
+     @overwrite_exist = overwrite_exist
+
+     @request_options = {
+       redirects: redirects,
+       keepalive: keepalive,
+       file: file,
+       path: path,
+       query: query,
+       body: body,
+       head: head
+     }.compact
+
+     @connection_options = {
+       connect_timeout: connect_timeout,
+       inactivity_timeout: inactivity_timeout,
+       ssl: ssl,
+       bind: bind,
+       proxy: proxy
+     }.compact
+   end
+
+   attr_accessor :href, :local_path,
+                 :http_method,
+                 :custom_data,
+                 :request_object,
+                 :parse_method,
+                 :callback,
+                 :errback,
+                 :stream_callback,
+                 :convert_to_utf8,
+                 :overwrite_exist,
+                 :request_options,
+                 :connection_options
+ end
+
+ module ListSpider
+   RANDOM_TIME = -1
+   NO_LIMIT_CONCURRENT = -1
+   DEFAULT_CONCURRNET_MAX = 50
+   DEFAULT_INTERVAL = 0
+
+   @random_time_range = 3..10
+   @local_path_set = Set.new
+   @down_list = []
+   @save_file = true
+
+   class << self
+     attr_accessor :save_file
+
+     def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+       if interval.is_a? Range
+         @random_time_range = interval
+         interval = RANDOM_TIME
+       end
+
+       filter_list(down_list)
+       @interval = interval
+       @max = max
+       @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
+       @succeed_size = 0
+       @failed_size = 0
+
+       puts "total size:#{@down_list.size}"
+       event_machine_start_list(next_task, method(:complete))
+     end
+
+     def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
+       get_list([task], interval: interval, max: max)
+     end
+
+     def add_task(task)
+       if task.is_a? Array
+         filter_list(task)
+       elsif task.is_a?TaskStruct
+         filter_list([task])
+       else
+         puts "error task type:#{task.class}"
+       end
+     end
+
+     def stop
+       stop_machine
+     end
+
+     private
+
+     def event_machine_down(link_struct_list, callback = nil)
+       failed_list = []
+       succeed_list = []
+       multi = EventMachine::MultiRequest.new
+       begin_time = Time.now
+
+       for_each_proc =
+         proc do |task_struct|
+           http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
+           http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
+           task_struct.request_object = http_req
+
+           http_req.callback do
+             s = http_req.response_header.status
+             puts "#{Time.now}, http status code: #{s}"
+
+             if s == 200 && @save_file
+               local_dir = File.dirname(task_struct.local_path)
+               FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
+               begin
+                 File.open(task_struct.local_path, 'wb') do |f|
+                   f << if @convert_to_utf8 == true
+                          SpiderHelper.to_utf8(http_req.response)
+                        else
+                          http_req.response
+                        end
+                 end
+                 call_parse_method(task_struct)
+                 succeed_list << task_struct
+               rescue StandardError => exception
+                 puts exception
+               end
+             end
+             task_struct.callback.call(task_struct, http_req) if task_struct.callback
+           end
+
+           http_req.errback do
+             puts "#{Time.now}, #{task_struct.href}, error: #{http_req.error}"
+
+             task_struct.errback.call(task_struct, http_req) if task_struct.errback
+           end
+
+           begin
+             if @save_file
+               multi.add task_struct.local_path, http_req
+             else
+               multi.add SecureRandom.uuid, http_req
+             end
+           rescue StandardError => exception
+             puts exception
+             puts task_struct.href
+             puts task_struct.local_path
+             stop_machine
+           end
+         end
+
+       cb =
+         proc do
+           end_time = Time.now
+           puts "use time:#{end_time - begin_time} seconds"
+           if callback.nil?
+             stop_machine
+           else
+             callback.call(multi, succeed_list, failed_list)
+           end
+         end
+       link_struct_list.each(&for_each_proc)
+       multi.callback(&cb)
+     end
+
+     def stop_machine
+       puts "success size:#{@succeed_size}"
+       puts "failed size:#{@failed_size}"
+       @end_time = Time.now
+       puts "total use time:#{@end_time - @begin_time} seconds"
+       EventMachine.stop
+       @local_path_set.clear
+     end
+
+     def next_task
+       @down_list.shift(@max)
+     end
+
+     def call_parse_method(task_struct)
+       task_struct.parse_method.call(task_struct) if task_struct.parse_method
+     end
+
+     def complete(_multi, success_list, failed_list)
+       @succeed_size += success_list.size
+       @failed_size += failed_list.size
+       @succeed_list.concat(success_list)
+       @failed_list.concat(failed_list)
+
+       todo = next_task
+
+       if todo.empty?
+         stop_machine
+       else
+         if @interval != 0
+           if !success_list.empty? || !failed_list.empty?
+             if @interval == RANDOM_TIME
+               sleep(rand(@random_time_range))
+             else
+               sleep(@interval)
+             end
+           end
+         end
+         event_machine_down(todo, method(:complete))
+       end
+     end
+
+     def event_machine_start_list(down_list, callback = nil)
+       EventMachine.run do
+         @succeed_list = []
+         @failed_list = []
+         @begin_time = Time.now
+         if down_list.empty?
+           if callback
+             callback.call(nil, [], [])
+           else
+             stop_machine
+           end
+         else
+           event_machine_down(down_list, callback)
+         end
+       end
+     end
+
+     def filter_list(down_list)
+       return unless @save_file
+
+       down_list.each do |ts|
+         if !ts.overwrite_exist && File.exist?(ts.local_path)
+           call_parse_method(ts)
+         elsif @local_path_set.add?(ts.local_path)
+           @down_list << ts
+         end
+       end
+     end
+   end
+
+   Signal.trap('INT') do
+     ListSpider.stop_machine
+     exit!
+   end
+ end
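
For reference, a minimal sketch of driving the crawler shown in the hunk above. The URLs, save paths, and parse callback are illustrative values, not part of the gem; the example sticks to the API that exists in both 2.3.0 and 2.8.0:

# Illustrative only: fetch a few pages, save them locally, and parse each saved file.
require 'list_spider'

parse_page = proc do |task|
  doc = Nokogiri::HTML(File.read(task.local_path))
  puts "#{task.local_path}: #{doc.title}"
end

tasks = (1..3).map do |i|
  TaskStruct.new("http://example.com/page/#{i}", "html/page_#{i}.html",
                 parse_method: parse_page)
end

# A Range interval switches on the RANDOM_TIME behaviour (sleep rand(2..5)
# seconds between batches), with at most 10 concurrent requests.
ListSpider.get_list(tasks, interval: 2..5, max: 10)

Per the diff, 2.8.0 additionally exposes ListSpider.save_file (when set to false, responses are not written to disk and requests are keyed by SecureRandom.uuid instead of local_path, which also becomes optional) and a public ListSpider.stop; those additions are not exercised in the sketch above.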