list_spider 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/list_spider.rb +81 -68
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 45ea1dba6db98ca7a9cdaecde7f744728cd20b03
4
- data.tar.gz: 118764345cebb58a37e15af591b3f007451c2486
3
+ metadata.gz: facd4c45a64f0cca934e70de9cfda4f05cf1a1d9
4
+ data.tar.gz: b0e5940faa51be59c13a0efcfc114b43981082b7
5
5
  SHA512:
6
- metadata.gz: 673150361b67fd16cf7dc86560c0bbe17d3d432f3f40dc4456019e9700d0d68f3b1d9eea8d6c036fc3ea904866497d248b51a36007e345a9233a43b827d0846b
7
- data.tar.gz: 5c2b99885733c979d9e1f9f2426521b125fce8dd951a3f51c96d25c33ae1c180b0aeb70654b5b4422b0691bb337fdd517834cb28fa0edbee2798e895c6aa2465
6
+ metadata.gz: a21daf8996b6aa27714a6407511a094f6381f739a65066077ca89bb0f4783ba0e6576b48080dbdaacee73d1a2ea4e6c77195ba832ee8f0bc28bde295f8e50b8c
7
+ data.tar.gz: e5bb44f131f82f480d9b5ebd146587c6473f6d3a7433fea32dd46bb74ac2c76adebc362048ee1c6228ca563ae4aa3d1b69bcc6d8735a4ee683e5c848f60944a4
data/lib/list_spider.rb CHANGED
@@ -66,89 +66,98 @@ module ListSpider
66
66
  failed_list = []
67
67
  succeed_list = []
68
68
  multi = EventMachine::MultiRequest.new
69
- # no_job = true
70
69
  begin_time = Time.now
71
70
 
72
71
  for_each_proc = proc do |e|
73
- # if !@@overwrite_exist && File.exist?(e.local_path)
74
- # succeed_list << e
75
- # else
76
- next unless @@url_set.add?(e.href)
77
- # no_job = false
78
- opt = {}
79
- opt = {:redirects => @@max_redirects}
80
- opt[:head] = @@header_option if defined? @@header_option
81
- if e.http_method == :post
82
- opt[:body] = e.params unless e.params.empty?
83
- if @@connection_opts
84
- w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
85
- else
86
- w = EventMachine::HttpRequest.new(e.href).post opt
87
- end
72
+ opt = {}
73
+ opt = {:redirects => @@max_redirects}
74
+ opt[:head] = @@header_option if defined? @@header_option
75
+ if e.http_method == :post
76
+ opt[:body] = e.params unless e.params.empty?
77
+ if @@connection_opts
78
+ w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
88
79
  else
89
- if @@connection_opts
90
- opt[:query] = e.params unless e.params.empty?
91
- w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
92
- else
93
- w = EventMachine::HttpRequest.new(e.href).get opt
94
- end
80
+ w = EventMachine::HttpRequest.new(e.href).post opt
81
+ end
82
+ else
83
+ if @@connection_opts
84
+ opt[:query] = e.params unless e.params.empty?
85
+ w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
86
+ else
87
+ w = EventMachine::HttpRequest.new(e.href).get opt
95
88
  end
89
+ end
96
90
 
97
- w.callback {
98
- @@url_set.delete(e.href)
99
- # puts "complete:#{w.response_header}"
100
- s = w.response_header.status
101
- puts s
102
- if s != 404
103
- local_dir = File.dirname(e.local_path)
104
- FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
105
- begin
106
- File.open(e.local_path, "w") do |f|
107
- if @@conver_to_utf8 == true
108
- f << SpiderHelper.to_utf8( w.response)
109
- else
110
- f << w.response
111
- end
91
+ w.callback {
92
+ s = w.response_header.status
93
+ puts s
94
+ if s != 404
95
+ local_dir = File.dirname(e.local_path)
96
+ FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
97
+ begin
98
+ File.open(e.local_path, "w") do |f|
99
+ if @@conver_to_utf8 == true
100
+ f << SpiderHelper.to_utf8( w.response)
101
+ else
102
+ f << w.response
112
103
  end
113
- succeed_list << e
114
- rescue Exception => e
115
- puts e
116
104
  end
105
+ succeed_list << e
106
+ rescue Exception => e
107
+ puts e
117
108
  end
118
- }
119
- w.errback {
120
- @@url_set.delete(e.href)
121
- puts "errback:#{w.response_header}"
122
- puts e.origin_href
123
- puts e.href
124
- puts w.response_header.status
125
- failed_list << e
126
- if e.http_method == :get
127
- SpiderHelper.direct_http_get(e.href, e.local_path)
128
- elsif e.http_method == :post
129
- SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
130
- end
131
- }
132
- multi.add e.local_path, w
133
- # end
109
+ end
110
+ }
111
+ w.errback {
112
+ puts "errback:#{w.response_header}"
113
+ puts e.origin_href
114
+ puts e.href
115
+ puts w.response_header.status
116
+ failed_list << e
117
+ if e.http_method == :get
118
+ SpiderHelper.direct_http_get(e.href, e.local_path)
119
+ elsif e.http_method == :post
120
+ SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
121
+ end
122
+ }
123
+ multi.add e.local_path, w
134
124
  end
135
125
 
136
126
  cb = Proc.new do
137
127
  end_time = Time.now
138
128
  puts "use time:#{end_time-begin_time} seconds"
139
129
  if callback.nil?
140
- puts "success size:#{self.succeed_size}"
141
- puts "failed size:#{self.failed_size}"
142
- EventMachine.stop
130
+ stop_machine
143
131
  else
144
132
  callback.call(multi, succeed_list, failed_list)
145
133
  end
146
134
  end
147
-
148
135
  link_struct_list.each &for_each_proc
149
136
  multi.callback &cb
150
137
  end
151
138
 
139
+ def stop_machine
140
+ puts "success size:#{@@succeed_size}"
141
+ puts "failed size:#{@@failed_size}"
142
+ @@end_time = Time.now
143
+ puts "total use time:#{@@end_time-@@begin_time} seconds"
144
+ EventMachine.stop
145
+ @@url_set.clear
146
+ end
147
+
148
+ def get_next_task
149
+ todo = []
150
+
151
+ until todo.size >= @@max || @@down_list.empty? do
152
+ e = @@down_list.shift
153
+ if @@url_set.add?(e.href)
154
+ todo << e
155
+ end
156
+ end
157
+
158
+ return todo
159
+ end
160
+
152
161
  def complete(multi, success_list, failed_list)
153
162
  @@succeed_size += success_list.size
154
163
  @@failed_size += failed_list.size
@@ -156,14 +165,13 @@ module ListSpider
156
165
  e.parse_method.call(e.local_path, e.extra_data) if e.parse_method
157
166
  end
158
167
 
159
- todo = @@down_list.slice!(0, @@max)
168
+ todo = get_next_task
169
+
160
170
  if todo.empty?
161
- puts "success size:#{@@succeed_size}"
162
- puts "failed size:#{@@failed_size}"
163
- EventMachine.stop
171
+ stop_machine
164
172
  else
165
173
  if @@inter_val != 0
166
- if success_list.size != 0 || failed_list.size !=0
174
+ if success_list.size != 0 || failed_list.size != 0
167
175
  if @@inter_val == RANDOM_TIME
168
176
  sleep(rand(@@random_time_range))
169
177
  else
@@ -177,8 +185,13 @@ module ListSpider
177
185
 
178
186
  def event_machine_start_list(down_list, callback = nil)
179
187
  EventMachine.run {
188
+ @@begin_time = Time.now
180
189
  if down_list.empty?
181
- callback.call(nil, [], []) if callback
190
+ if callback
191
+ callback.call(nil, [], [])
192
+ else
193
+ stop_machine
194
+ end
182
195
  else
183
196
  event_machine_down(down_list, callback)
184
197
  end
@@ -199,7 +212,7 @@ module ListSpider
199
212
 
200
213
  def get_list(down_list, inter_val: 0, max: 30)
201
214
  @@down_list = []
202
-
215
+
203
216
  need_down_list = filter_list(down_list)
204
217
 
205
218
  @@down_list = @@down_list + need_down_list
@@ -210,7 +223,7 @@ module ListSpider
210
223
  @@failed_size = 0
211
224
 
212
225
  puts "total size:#{@@down_list.size}"
213
- event_machine_start_list(@@down_list.slice!(0, @@max), method(:complete))
226
+ event_machine_start_list(get_next_task, method(:complete))
214
227
  end
215
228
 
216
229
  def get_one(task)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-04 00:00:00.000000000 Z
11
+ date: 2016-05-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: em-http-request