list_spider 0.1.9 → 0.1.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/list_spider.rb +27 -27
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 540f6b583b38b40fe7c6096d372a43b93dd20002
|
4
|
+
data.tar.gz: 99b187103019165b5c87dd094c748d0b889d306b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6bd249021510b182d67912573feb5f9204c82f201340d7515f6dcb0d024fd9920abd1acb7724c6670967455a1a0b92426f40cf3d7550da40f00decfdd8e7f903
|
7
|
+
data.tar.gz: 6a28ca654fbb5f906dacfcccc121db52c09b6efa70c1e3724a1db3937e7be15922286c476ff390c48c227a7b492866c39ef1c3187e9636a0fd3ec9ac391d88fc
|
data/lib/list_spider.rb
CHANGED
@@ -40,11 +40,11 @@ module ListSpider
|
|
40
40
|
@connection_opts = {connect_timeout: 60}
|
41
41
|
@overwrite_exist = false
|
42
42
|
@max_redirects = 10
|
43
|
-
|
43
|
+
@local_path_set = Set.new
|
44
44
|
|
45
45
|
class << self
|
46
46
|
|
47
|
-
attr_accessor :
|
47
|
+
attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
|
48
48
|
|
49
49
|
def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
|
50
50
|
@connection_opts = {
|
@@ -61,7 +61,7 @@ module ListSpider
|
|
61
61
|
end
|
62
62
|
|
63
63
|
def set_header_option(header_option)
|
64
|
-
|
64
|
+
@header_option = header_option
|
65
65
|
end
|
66
66
|
|
67
67
|
def event_machine_down(link_struct_list, callback = nil)
|
@@ -73,7 +73,7 @@ module ListSpider
|
|
73
73
|
for_each_proc = proc do |e|
|
74
74
|
opt = {}
|
75
75
|
opt = {:redirects => @max_redirects}
|
76
|
-
opt[:head] =
|
76
|
+
opt[:head] = @header_option if defined? @header_option
|
77
77
|
if e.http_method == :post
|
78
78
|
opt[:body] = e.params unless e.params.empty?
|
79
79
|
if @connection_opts
|
@@ -149,16 +149,16 @@ module ListSpider
|
|
149
149
|
end
|
150
150
|
|
151
151
|
def stop_machine
|
152
|
-
puts "success size:#{
|
153
|
-
puts "failed size:#{
|
154
|
-
|
155
|
-
puts "total use time:#{
|
152
|
+
puts "success size:#{@succeed_size}"
|
153
|
+
puts "failed size:#{@failed_size}"
|
154
|
+
@end_time = Time.now
|
155
|
+
puts "total use time:#{@end_time-@begin_time} seconds"
|
156
156
|
EventMachine.stop
|
157
|
-
|
157
|
+
@local_path_set.clear
|
158
158
|
end
|
159
159
|
|
160
160
|
def get_next_task
|
161
|
-
return
|
161
|
+
return @down_list.shift(@max)
|
162
162
|
end
|
163
163
|
|
164
164
|
def call_parse_method(e)
|
@@ -188,8 +188,8 @@ module ListSpider
|
|
188
188
|
end
|
189
189
|
|
190
190
|
def complete(multi, success_list, failed_list)
|
191
|
-
|
192
|
-
|
191
|
+
@succeed_size += success_list.size
|
192
|
+
@failed_size += failed_list.size
|
193
193
|
success_list.each do |e|
|
194
194
|
call_parse_method(e)
|
195
195
|
end
|
@@ -199,12 +199,12 @@ module ListSpider
|
|
199
199
|
if todo.empty?
|
200
200
|
stop_machine
|
201
201
|
else
|
202
|
-
if
|
202
|
+
if @interval != 0
|
203
203
|
if success_list.size != 0 || failed_list.size != 0
|
204
|
-
if
|
204
|
+
if @interval == RANDOM_TIME
|
205
205
|
sleep(rand(@random_time_range))
|
206
206
|
else
|
207
|
-
sleep(
|
207
|
+
sleep(@interval)
|
208
208
|
end
|
209
209
|
end
|
210
210
|
end
|
@@ -214,7 +214,7 @@ module ListSpider
|
|
214
214
|
|
215
215
|
def event_machine_start_list(down_list, callback = nil)
|
216
216
|
EventMachine.run {
|
217
|
-
|
217
|
+
@begin_time = Time.now
|
218
218
|
if down_list.empty?
|
219
219
|
if callback
|
220
220
|
callback.call(nil, [], [])
|
@@ -232,7 +232,7 @@ module ListSpider
|
|
232
232
|
down_list.each do |ts|
|
233
233
|
if !@overwrite_exist && File.exist?(ts.local_path)
|
234
234
|
call_parse_method(ts)
|
235
|
-
elsif
|
235
|
+
elsif @local_path_set.add?(ts.local_path)
|
236
236
|
need_down_list << ts
|
237
237
|
end
|
238
238
|
end
|
@@ -245,18 +245,18 @@ module ListSpider
|
|
245
245
|
interval = RANDOM_TIME
|
246
246
|
end
|
247
247
|
|
248
|
-
|
248
|
+
@down_list = []
|
249
249
|
|
250
250
|
need_down_list = filter_list(down_list)
|
251
251
|
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
252
|
+
@down_list = @down_list + need_down_list
|
253
|
+
@interval = interval
|
254
|
+
@max = max
|
255
|
+
@max = @down_list.size if @max == NO_LIMIT_CONCURRENT
|
256
|
+
@succeed_size = 0
|
257
|
+
@failed_size = 0
|
258
258
|
|
259
|
-
puts "total size:#{
|
259
|
+
puts "total size:#{@down_list.size}"
|
260
260
|
event_machine_start_list(get_next_task, method(:complete))
|
261
261
|
end
|
262
262
|
|
@@ -267,10 +267,10 @@ module ListSpider
|
|
267
267
|
def add_task(task)
|
268
268
|
if task.is_a?Array
|
269
269
|
need_down_list = filter_list(task)
|
270
|
-
|
270
|
+
@down_list = @down_list + need_down_list
|
271
271
|
elsif task.is_a?TaskStruct
|
272
272
|
need_down_list = filter_list([task])
|
273
|
-
|
273
|
+
@down_list = @down_list + need_down_list
|
274
274
|
else
|
275
275
|
puts "error task type:#{task.class}"
|
276
276
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.9
|
4
|
+
version: 0.1.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: em-http-request
|