list_spider 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/list_spider.rb +40 -16
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: aa975b345606cc3995e08310737afd1338ad4c80
|
4
|
+
data.tar.gz: a5d15c9dcf5f1f78395f75911370617c9481b1ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8fd86f3186bf640d2d895028d1373984046183e941619bf01b0c2d611deb7a62d4a52098da23e3299d20949212b776a6b368a4923c37e152d2e3e3038b433a1a
|
7
|
+
data.tar.gz: 722ee573500abe11549e679b3def10def9830ef58ae6f4bf24e0e98d0fc872524d97c964ce17df3dc9718eedfb3e66067f09fdeda89bce6d39f0c2fa1bbca750
|
data/lib/list_spider.rb
CHANGED
@@ -24,7 +24,7 @@ class TaskStruct
|
|
24
24
|
o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
|
25
25
|
end
|
26
26
|
|
27
|
-
attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method
|
27
|
+
attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object
|
28
28
|
|
29
29
|
end
|
30
30
|
|
@@ -37,10 +37,10 @@ module ListSpider
|
|
37
37
|
|
38
38
|
@random_time_range = 3..10
|
39
39
|
@conver_to_utf8 = false
|
40
|
-
@connection_opts = {connect_timeout:
|
40
|
+
@connection_opts = {connect_timeout: 60}
|
41
41
|
@overwrite_exist = false
|
42
42
|
@max_redirects = 10
|
43
|
-
@@
|
43
|
+
@@local_path_set = Set.new
|
44
44
|
|
45
45
|
class << self
|
46
46
|
|
@@ -61,7 +61,7 @@ module ListSpider
|
|
61
61
|
end
|
62
62
|
|
63
63
|
def set_header_option(header_option)
|
64
|
-
@@header_option =
|
64
|
+
@@header_option = header_option
|
65
65
|
end
|
66
66
|
|
67
67
|
def event_machine_down(link_struct_list, callback = nil)
|
@@ -90,6 +90,8 @@ module ListSpider
|
|
90
90
|
end
|
91
91
|
end
|
92
92
|
|
93
|
+
e.request_object = w
|
94
|
+
|
93
95
|
w.callback {
|
94
96
|
s = w.response_header.status
|
95
97
|
puts s
|
@@ -122,7 +124,15 @@ module ListSpider
|
|
122
124
|
SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
|
123
125
|
end
|
124
126
|
}
|
125
|
-
|
127
|
+
|
128
|
+
begin
|
129
|
+
multi.add e.local_path, w
|
130
|
+
rescue Exception => exception
|
131
|
+
puts exception
|
132
|
+
puts e.href
|
133
|
+
puts e.local_path
|
134
|
+
stop_machine
|
135
|
+
end
|
126
136
|
end
|
127
137
|
|
128
138
|
cb = Proc.new do
|
@@ -144,27 +154,36 @@ module ListSpider
|
|
144
154
|
@@end_time = Time.now
|
145
155
|
puts "total use time:#{@@end_time-@@begin_time} seconds"
|
146
156
|
EventMachine.stop
|
147
|
-
@@
|
157
|
+
@@local_path_set.clear
|
148
158
|
end
|
149
159
|
|
150
160
|
def get_next_task
|
151
|
-
|
161
|
+
return @@down_list.shift(@@max)
|
162
|
+
end
|
152
163
|
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
164
|
+
def call_parse_method(e)
|
165
|
+
pm = e.parse_method
|
166
|
+
if pm
|
167
|
+
case pm.arity
|
168
|
+
when 1
|
169
|
+
pm.call(e.local_path)
|
170
|
+
when 2
|
171
|
+
pm.call(e.local_path, e.extra_data)
|
172
|
+
when 3
|
173
|
+
res_header = nil
|
174
|
+
res_header = e.request_object.response_header if e.request_object
|
175
|
+
pm.call(e.local_path, e.extra_data, res_header)
|
176
|
+
else
|
177
|
+
puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3"
|
157
178
|
end
|
158
179
|
end
|
159
|
-
|
160
|
-
return todo
|
161
180
|
end
|
162
181
|
|
163
182
|
def complete(multi, success_list, failed_list)
|
164
183
|
@@succeed_size += success_list.size
|
165
184
|
@@failed_size += failed_list.size
|
166
185
|
success_list.each do |e|
|
167
|
-
|
186
|
+
call_parse_method(e)
|
168
187
|
end
|
169
188
|
|
170
189
|
todo = get_next_task
|
@@ -204,8 +223,8 @@ module ListSpider
|
|
204
223
|
need_down_list = []
|
205
224
|
down_list.each do |ts|
|
206
225
|
if !@overwrite_exist && File.exist?(ts.local_path)
|
207
|
-
|
208
|
-
|
226
|
+
call_parse_method(ts)
|
227
|
+
elsif @@local_path_set.add?(ts.local_path)
|
209
228
|
need_down_list << ts
|
210
229
|
end
|
211
230
|
end
|
@@ -243,6 +262,11 @@ module ListSpider
|
|
243
262
|
puts "error task type:#{task.class}"
|
244
263
|
end
|
245
264
|
end
|
265
|
+
end
|
246
266
|
|
267
|
+
Signal.trap("INT") do
|
268
|
+
ListSpider.stop_machine
|
269
|
+
exit!
|
247
270
|
end
|
271
|
+
|
248
272
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.6
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-10 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: em-http-request
|