list_spider 0.1.6 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3):
  1. checksums.yaml +4 -4
  2. data/lib/list_spider.rb +40 -16
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ffb9c26b0fc7db4fd6a2a3b831832964afbaa949
4
- data.tar.gz: 2c829fb860ad3c0c8b3825c1575988cadfccaa15
3
+ metadata.gz: aa975b345606cc3995e08310737afd1338ad4c80
4
+ data.tar.gz: a5d15c9dcf5f1f78395f75911370617c9481b1ea
5
5
  SHA512:
6
- metadata.gz: 1673f2f5daed52295ee567c2d1c6b672f1bdf532fd31a334a704cd9ac5fbdcef9aea0f27e9828cd816769f74bfcbd8498294f1d190e73defb0fb6c15fb990e04
7
- data.tar.gz: 31247ae66dfac70a69671c336dcc942812d26fd9572c2e527dbd511bf40ed680304fcb24d7f1fac01e51b52d666061ffacaaae49c960a51aecff61f07ddcf9be
6
+ metadata.gz: 8fd86f3186bf640d2d895028d1373984046183e941619bf01b0c2d611deb7a62d4a52098da23e3299d20949212b776a6b368a4923c37e152d2e3e3038b433a1a
7
+ data.tar.gz: 722ee573500abe11549e679b3def10def9830ef58ae6f4bf24e0e98d0fc872524d97c964ce17df3dc9718eedfb3e66067f09fdeda89bce6d39f0c2fa1bbca750
@@ -24,7 +24,7 @@ class TaskStruct
24
24
  o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
25
25
  end
26
26
 
27
- attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method
27
+ attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object
28
28
 
29
29
  end
30
30
 
@@ -37,10 +37,10 @@ module ListSpider
37
37
 
38
38
  @random_time_range = 3..10
39
39
  @conver_to_utf8 = false
40
- @connection_opts = {connect_timeout: 2*60}
40
+ @connection_opts = {connect_timeout: 60}
41
41
  @overwrite_exist = false
42
42
  @max_redirects = 10
43
- @@url_set = Set.new
43
+ @@local_path_set = Set.new
44
44
 
45
45
  class << self
46
46
 
@@ -61,7 +61,7 @@ module ListSpider
61
61
  end
62
62
 
63
63
  def set_header_option(header_option)
64
- @@header_option = optHash
64
+ @@header_option = header_option
65
65
  end
66
66
 
67
67
  def event_machine_down(link_struct_list, callback = nil)
@@ -90,6 +90,8 @@ module ListSpider
90
90
  end
91
91
  end
92
92
 
93
+ e.request_object = w
94
+
93
95
  w.callback {
94
96
  s = w.response_header.status
95
97
  puts s
@@ -122,7 +124,15 @@ module ListSpider
122
124
  SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
123
125
  end
124
126
  }
125
- multi.add e.local_path, w
127
+
128
+ begin
129
+ multi.add e.local_path, w
130
+ rescue Exception => exception
131
+ puts exception
132
+ puts e.href
133
+ puts e.local_path
134
+ stop_machine
135
+ end
126
136
  end
127
137
 
128
138
  cb = Proc.new do
@@ -144,27 +154,36 @@ module ListSpider
144
154
  @@end_time = Time.now
145
155
  puts "total use time:#{@@end_time-@@begin_time} seconds"
146
156
  EventMachine.stop
147
- @@url_set.clear
157
+ @@local_path_set.clear
148
158
  end
149
159
 
150
160
  def get_next_task
151
- todo = []
161
+ return @@down_list.shift(@@max)
162
+ end
152
163
 
153
- until todo.size >= @@max || @@down_list.empty? do
154
- e = @@down_list.shift
155
- if @@url_set.add?(e.href)
156
- todo << e
164
+ def call_parse_method(e)
165
+ pm = e.parse_method
166
+ if pm
167
+ case pm.arity
168
+ when 1
169
+ pm.call(e.local_path)
170
+ when 2
171
+ pm.call(e.local_path, e.extra_data)
172
+ when 3
173
+ res_header = nil
174
+ res_header = e.request_object.response_header if e.request_object
175
+ pm.call(e.local_path, e.extra_data, res_header)
176
+ else
177
+ puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3"
157
178
  end
158
179
  end
159
-
160
- return todo
161
180
  end
162
181
 
163
182
  def complete(multi, success_list, failed_list)
164
183
  @@succeed_size += success_list.size
165
184
  @@failed_size += failed_list.size
166
185
  success_list.each do |e|
167
- e.parse_method.call(e.local_path, e.extra_data) if e.parse_method
186
+ call_parse_method(e)
168
187
  end
169
188
 
170
189
  todo = get_next_task
@@ -204,8 +223,8 @@ module ListSpider
204
223
  need_down_list = []
205
224
  down_list.each do |ts|
206
225
  if !@overwrite_exist && File.exist?(ts.local_path)
207
- ts.parse_method.call(ts.local_path, ts.extra_data) if ts.parse_method
208
- else
226
+ call_parse_method(ts)
227
+ elsif @@local_path_set.add?(ts.local_path)
209
228
  need_down_list << ts
210
229
  end
211
230
  end
@@ -243,6 +262,11 @@ module ListSpider
243
262
  puts "error task type:#{task.class}"
244
263
  end
245
264
  end
265
+ end
246
266
 
267
+ Signal.trap("INT") do
268
+ ListSpider.stop_machine
269
+ exit!
247
270
  end
271
+
248
272
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-08 00:00:00.000000000 Z
11
+ date: 2016-05-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: em-http-request