list_spider 0.1.6 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/list_spider.rb +40 -16
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aa975b345606cc3995e08310737afd1338ad4c80
|
4
|
+
data.tar.gz: a5d15c9dcf5f1f78395f75911370617c9481b1ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8fd86f3186bf640d2d895028d1373984046183e941619bf01b0c2d611deb7a62d4a52098da23e3299d20949212b776a6b368a4923c37e152d2e3e3038b433a1a
|
7
|
+
data.tar.gz: 722ee573500abe11549e679b3def10def9830ef58ae6f4bf24e0e98d0fc872524d97c964ce17df3dc9718eedfb3e66067f09fdeda89bce6d39f0c2fa1bbca750
|
data/lib/list_spider.rb
CHANGED
@@ -24,7 +24,7 @@ class TaskStruct
|
|
24
24
|
o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
|
25
25
|
end
|
26
26
|
|
27
|
-
attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method
|
27
|
+
attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object
|
28
28
|
|
29
29
|
end
|
30
30
|
|
@@ -37,10 +37,10 @@ module ListSpider
|
|
37
37
|
|
38
38
|
@random_time_range = 3..10
|
39
39
|
@conver_to_utf8 = false
|
40
|
-
@connection_opts = {connect_timeout:
|
40
|
+
@connection_opts = {connect_timeout: 60}
|
41
41
|
@overwrite_exist = false
|
42
42
|
@max_redirects = 10
|
43
|
-
@@
|
43
|
+
@@local_path_set = Set.new
|
44
44
|
|
45
45
|
class << self
|
46
46
|
|
@@ -61,7 +61,7 @@ module ListSpider
|
|
61
61
|
end
|
62
62
|
|
63
63
|
def set_header_option(header_option)
|
64
|
-
@@header_option =
|
64
|
+
@@header_option = header_option
|
65
65
|
end
|
66
66
|
|
67
67
|
def event_machine_down(link_struct_list, callback = nil)
|
@@ -90,6 +90,8 @@ module ListSpider
|
|
90
90
|
end
|
91
91
|
end
|
92
92
|
|
93
|
+
e.request_object = w
|
94
|
+
|
93
95
|
w.callback {
|
94
96
|
s = w.response_header.status
|
95
97
|
puts s
|
@@ -122,7 +124,15 @@ module ListSpider
|
|
122
124
|
SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
|
123
125
|
end
|
124
126
|
}
|
125
|
-
|
127
|
+
|
128
|
+
begin
|
129
|
+
multi.add e.local_path, w
|
130
|
+
rescue Exception => exception
|
131
|
+
puts exception
|
132
|
+
puts e.href
|
133
|
+
puts e.local_path
|
134
|
+
stop_machine
|
135
|
+
end
|
126
136
|
end
|
127
137
|
|
128
138
|
cb = Proc.new do
|
@@ -144,27 +154,36 @@ module ListSpider
|
|
144
154
|
@@end_time = Time.now
|
145
155
|
puts "total use time:#{@@end_time-@@begin_time} seconds"
|
146
156
|
EventMachine.stop
|
147
|
-
@@
|
157
|
+
@@local_path_set.clear
|
148
158
|
end
|
149
159
|
|
150
160
|
def get_next_task
|
151
|
-
|
161
|
+
return @@down_list.shift(@@max)
|
162
|
+
end
|
152
163
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
164
|
+
def call_parse_method(e)
|
165
|
+
pm = e.parse_method
|
166
|
+
if pm
|
167
|
+
case pm.arity
|
168
|
+
when 1
|
169
|
+
pm.call(e.local_path)
|
170
|
+
when 2
|
171
|
+
pm.call(e.local_path, e.extra_data)
|
172
|
+
when 3
|
173
|
+
res_header = nil
|
174
|
+
res_header = e.request_object.response_header if e.request_object
|
175
|
+
pm.call(e.local_path, e.extra_data, res_header)
|
176
|
+
else
|
177
|
+
puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3"
|
157
178
|
end
|
158
179
|
end
|
159
|
-
|
160
|
-
return todo
|
161
180
|
end
|
162
181
|
|
163
182
|
def complete(multi, success_list, failed_list)
|
164
183
|
@@succeed_size += success_list.size
|
165
184
|
@@failed_size += failed_list.size
|
166
185
|
success_list.each do |e|
|
167
|
-
|
186
|
+
call_parse_method(e)
|
168
187
|
end
|
169
188
|
|
170
189
|
todo = get_next_task
|
@@ -204,8 +223,8 @@ module ListSpider
|
|
204
223
|
need_down_list = []
|
205
224
|
down_list.each do |ts|
|
206
225
|
if !@overwrite_exist && File.exist?(ts.local_path)
|
207
|
-
|
208
|
-
|
226
|
+
call_parse_method(ts)
|
227
|
+
elsif @@local_path_set.add?(ts.local_path)
|
209
228
|
need_down_list << ts
|
210
229
|
end
|
211
230
|
end
|
@@ -243,6 +262,11 @@ module ListSpider
|
|
243
262
|
puts "error task type:#{task.class}"
|
244
263
|
end
|
245
264
|
end
|
265
|
+
end
|
246
266
|
|
267
|
+
Signal.trap("INT") do
|
268
|
+
ListSpider.stop_machine
|
269
|
+
exit!
|
247
270
|
end
|
271
|
+
|
248
272
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: em-http-request
|