list_spider 0.1.9 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/list_spider.rb +27 -27
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 540f6b583b38b40fe7c6096d372a43b93dd20002
|
4
|
+
data.tar.gz: 99b187103019165b5c87dd094c748d0b889d306b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6bd249021510b182d67912573feb5f9204c82f201340d7515f6dcb0d024fd9920abd1acb7724c6670967455a1a0b92426f40cf3d7550da40f00decfdd8e7f903
|
7
|
+
data.tar.gz: 6a28ca654fbb5f906dacfcccc121db52c09b6efa70c1e3724a1db3937e7be15922286c476ff390c48c227a7b492866c39ef1c3187e9636a0fd3ec9ac391d88fc
|
data/lib/list_spider.rb
CHANGED
@@ -40,11 +40,11 @@ module ListSpider
|
|
40
40
|
@connection_opts = {connect_timeout: 60}
|
41
41
|
@overwrite_exist = false
|
42
42
|
@max_redirects = 10
|
43
|
-
|
43
|
+
@local_path_set = Set.new
|
44
44
|
|
45
45
|
class << self
|
46
46
|
|
47
|
-
attr_accessor :
|
47
|
+
attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
|
48
48
|
|
49
49
|
def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
|
50
50
|
@connection_opts = {
|
@@ -61,7 +61,7 @@ module ListSpider
|
|
61
61
|
end
|
62
62
|
|
63
63
|
def set_header_option(header_option)
|
64
|
-
|
64
|
+
@header_option = header_option
|
65
65
|
end
|
66
66
|
|
67
67
|
def event_machine_down(link_struct_list, callback = nil)
|
@@ -73,7 +73,7 @@ module ListSpider
|
|
73
73
|
for_each_proc = proc do |e|
|
74
74
|
opt = {}
|
75
75
|
opt = {:redirects => @max_redirects}
|
76
|
-
opt[:head] =
|
76
|
+
opt[:head] = @header_option if defined? @header_option
|
77
77
|
if e.http_method == :post
|
78
78
|
opt[:body] = e.params unless e.params.empty?
|
79
79
|
if @connection_opts
|
@@ -149,16 +149,16 @@ module ListSpider
|
|
149
149
|
end
|
150
150
|
|
151
151
|
def stop_machine
|
152
|
-
puts "success size:#{
|
153
|
-
puts "failed size:#{
|
154
|
-
|
155
|
-
puts "total use time:#{
|
152
|
+
puts "success size:#{@succeed_size}"
|
153
|
+
puts "failed size:#{@failed_size}"
|
154
|
+
@end_time = Time.now
|
155
|
+
puts "total use time:#{@end_time-@begin_time} seconds"
|
156
156
|
EventMachine.stop
|
157
|
-
|
157
|
+
@local_path_set.clear
|
158
158
|
end
|
159
159
|
|
160
160
|
def get_next_task
|
161
|
-
return
|
161
|
+
return @down_list.shift(@max)
|
162
162
|
end
|
163
163
|
|
164
164
|
def call_parse_method(e)
|
@@ -188,8 +188,8 @@ module ListSpider
|
|
188
188
|
end
|
189
189
|
|
190
190
|
def complete(multi, success_list, failed_list)
|
191
|
-
|
192
|
-
|
191
|
+
@succeed_size += success_list.size
|
192
|
+
@failed_size += failed_list.size
|
193
193
|
success_list.each do |e|
|
194
194
|
call_parse_method(e)
|
195
195
|
end
|
@@ -199,12 +199,12 @@ module ListSpider
|
|
199
199
|
if todo.empty?
|
200
200
|
stop_machine
|
201
201
|
else
|
202
|
-
if
|
202
|
+
if @interval != 0
|
203
203
|
if success_list.size != 0 || failed_list.size != 0
|
204
|
-
if
|
204
|
+
if @interval == RANDOM_TIME
|
205
205
|
sleep(rand(@random_time_range))
|
206
206
|
else
|
207
|
-
sleep(
|
207
|
+
sleep(@interval)
|
208
208
|
end
|
209
209
|
end
|
210
210
|
end
|
@@ -214,7 +214,7 @@ module ListSpider
|
|
214
214
|
|
215
215
|
def event_machine_start_list(down_list, callback = nil)
|
216
216
|
EventMachine.run {
|
217
|
-
|
217
|
+
@begin_time = Time.now
|
218
218
|
if down_list.empty?
|
219
219
|
if callback
|
220
220
|
callback.call(nil, [], [])
|
@@ -232,7 +232,7 @@ module ListSpider
|
|
232
232
|
down_list.each do |ts|
|
233
233
|
if !@overwrite_exist && File.exist?(ts.local_path)
|
234
234
|
call_parse_method(ts)
|
235
|
-
elsif
|
235
|
+
elsif @local_path_set.add?(ts.local_path)
|
236
236
|
need_down_list << ts
|
237
237
|
end
|
238
238
|
end
|
@@ -245,18 +245,18 @@ module ListSpider
|
|
245
245
|
interval = RANDOM_TIME
|
246
246
|
end
|
247
247
|
|
248
|
-
|
248
|
+
@down_list = []
|
249
249
|
|
250
250
|
need_down_list = filter_list(down_list)
|
251
251
|
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
252
|
+
@down_list = @down_list + need_down_list
|
253
|
+
@interval = interval
|
254
|
+
@max = max
|
255
|
+
@max = @down_list.size if @max == NO_LIMIT_CONCURRENT
|
256
|
+
@succeed_size = 0
|
257
|
+
@failed_size = 0
|
258
258
|
|
259
|
-
puts "total size:#{
|
259
|
+
puts "total size:#{@down_list.size}"
|
260
260
|
event_machine_start_list(get_next_task, method(:complete))
|
261
261
|
end
|
262
262
|
|
@@ -267,10 +267,10 @@ module ListSpider
|
|
267
267
|
def add_task(task)
|
268
268
|
if task.is_a?Array
|
269
269
|
need_down_list = filter_list(task)
|
270
|
-
|
270
|
+
@down_list = @down_list + need_down_list
|
271
271
|
elsif task.is_a?TaskStruct
|
272
272
|
need_down_list = filter_list([task])
|
273
|
-
|
273
|
+
@down_list = @down_list + need_down_list
|
274
274
|
else
|
275
275
|
puts "error task type:#{task.class}"
|
276
276
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: em-http-request
|