list_spider 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) [hide | show]
  1. checksums.yaml +4 -4
  2. data/lib/list_spider.rb +81 -68
  3. metadata +2 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 45ea1dba6db98ca7a9cdaecde7f744728cd20b03
4
- data.tar.gz: 118764345cebb58a37e15af591b3f007451c2486
3
+ metadata.gz: facd4c45a64f0cca934e70de9cfda4f05cf1a1d9
4
+ data.tar.gz: b0e5940faa51be59c13a0efcfc114b43981082b7
5
5
  SHA512:
6
- metadata.gz: 673150361b67fd16cf7dc86560c0bbe17d3d432f3f40dc4456019e9700d0d68f3b1d9eea8d6c036fc3ea904866497d248b51a36007e345a9233a43b827d0846b
7
- data.tar.gz: 5c2b99885733c979d9e1f9f2426521b125fce8dd951a3f51c96d25c33ae1c180b0aeb70654b5b4422b0691bb337fdd517834cb28fa0edbee2798e895c6aa2465
6
+ metadata.gz: a21daf8996b6aa27714a6407511a094f6381f739a65066077ca89bb0f4783ba0e6576b48080dbdaacee73d1a2ea4e6c77195ba832ee8f0bc28bde295f8e50b8c
7
+ data.tar.gz: e5bb44f131f82f480d9b5ebd146587c6473f6d3a7433fea32dd46bb74ac2c76adebc362048ee1c6228ca563ae4aa3d1b69bcc6d8735a4ee683e5c848f60944a4
data/lib/list_spider.rb CHANGED
@@ -66,89 +66,98 @@ module ListSpider
66
66
  failed_list = []
67
67
  succeed_list = []
68
68
  multi = EventMachine::MultiRequest.new
69
- # no_job = true
70
69
  begin_time = Time.now
71
70
 
72
71
  for_each_proc = proc do |e|
73
- # if !@@overwrite_exist && File.exist?(e.local_path)
74
- # succeed_list << e
75
- # else
76
- next unless @@url_set.add?(e.href)
77
- # no_job = false
78
- opt = {}
79
- opt = {:redirects => @@max_redirects}
80
- opt[:head] = @@header_option if defined? @@header_option
81
- if e.http_method == :post
82
- opt[:body] = e.params unless e.params.empty?
83
- if @@connection_opts
84
- w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
85
- else
86
- w = EventMachine::HttpRequest.new(e.href).post opt
87
- end
72
+ opt = {}
73
+ opt = {:redirects => @@max_redirects}
74
+ opt[:head] = @@header_option if defined? @@header_option
75
+ if e.http_method == :post
76
+ opt[:body] = e.params unless e.params.empty?
77
+ if @@connection_opts
78
+ w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
88
79
  else
89
- if @@connection_opts
90
- opt[:query] = e.params unless e.params.empty?
91
- w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
92
- else
93
- w = EventMachine::HttpRequest.new(e.href).get opt
94
- end
80
+ w = EventMachine::HttpRequest.new(e.href).post opt
81
+ end
82
+ else
83
+ if @@connection_opts
84
+ opt[:query] = e.params unless e.params.empty?
85
+ w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
86
+ else
87
+ w = EventMachine::HttpRequest.new(e.href).get opt
95
88
  end
89
+ end
96
90
 
97
- w.callback {
98
- @@url_set.delete(e.href)
99
- # puts "complete:#{w.response_header}"
100
- s = w.response_header.status
101
- puts s
102
- if s != 404
103
- local_dir = File.dirname(e.local_path)
104
- FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
105
- begin
106
- File.open(e.local_path, "w") do |f|
107
- if @@conver_to_utf8 == true
108
- f << SpiderHelper.to_utf8( w.response)
109
- else
110
- f << w.response
111
- end
91
+ w.callback {
92
+ s = w.response_header.status
93
+ puts s
94
+ if s != 404
95
+ local_dir = File.dirname(e.local_path)
96
+ FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
97
+ begin
98
+ File.open(e.local_path, "w") do |f|
99
+ if @@conver_to_utf8 == true
100
+ f << SpiderHelper.to_utf8( w.response)
101
+ else
102
+ f << w.response
112
103
  end
113
- succeed_list << e
114
- rescue Exception => e
115
- puts e
116
104
  end
105
+ succeed_list << e
106
+ rescue Exception => e
107
+ puts e
117
108
  end
118
- }
119
- w.errback {
120
- @@url_set.delete(e.href)
121
- puts "errback:#{w.response_header}"
122
- puts e.origin_href
123
- puts e.href
124
- puts w.response_header.status
125
- failed_list << e
126
- if e.http_method == :get
127
- SpiderHelper.direct_http_get(e.href, e.local_path)
128
- elsif e.http_method == :post
129
- SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
130
- end
131
- }
132
- multi.add e.local_path, w
133
- # end
109
+ end
110
+ }
111
+ w.errback {
112
+ puts "errback:#{w.response_header}"
113
+ puts e.origin_href
114
+ puts e.href
115
+ puts w.response_header.status
116
+ failed_list << e
117
+ if e.http_method == :get
118
+ SpiderHelper.direct_http_get(e.href, e.local_path)
119
+ elsif e.http_method == :post
120
+ SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
121
+ end
122
+ }
123
+ multi.add e.local_path, w
134
124
  end
135
125
 
136
126
  cb = Proc.new do
137
127
  end_time = Time.now
138
128
  puts "use time:#{end_time-begin_time} seconds"
139
129
  if callback.nil?
140
- puts "success size:#{self.succeed_size}"
141
- puts "failed size:#{self.failed_size}"
142
- EventMachine.stop
130
+ stop_machine
143
131
  else
144
132
  callback.call(multi, succeed_list, failed_list)
145
133
  end
146
134
  end
147
-
148
135
  link_struct_list.each &for_each_proc
149
136
  multi.callback &cb
150
137
  end
151
138
 
139
+ def stop_machine
140
+ puts "success size:#{@@succeed_size}"
141
+ puts "failed size:#{@@failed_size}"
142
+ @@end_time = Time.now
143
+ puts "total use time:#{@@end_time-@@begin_time} seconds"
144
+ EventMachine.stop
145
+ @@url_set.clear
146
+ end
147
+
148
+ def get_next_task
149
+ todo = []
150
+
151
+ until todo.size >= @@max || @@down_list.empty? do
152
+ e = @@down_list.shift
153
+ if @@url_set.add?(e.href)
154
+ todo << e
155
+ end
156
+ end
157
+
158
+ return todo
159
+ end
160
+
152
161
  def complete(multi, success_list, failed_list)
153
162
  @@succeed_size += success_list.size
154
163
  @@failed_size += failed_list.size
@@ -156,14 +165,13 @@ module ListSpider
156
165
  e.parse_method.call(e.local_path, e.extra_data) if e.parse_method
157
166
  end
158
167
 
159
- todo = @@down_list.slice!(0, @@max)
168
+ todo = get_next_task
169
+
160
170
  if todo.empty?
161
- puts "success size:#{@@succeed_size}"
162
- puts "failed size:#{@@failed_size}"
163
- EventMachine.stop
171
+ stop_machine
164
172
  else
165
173
  if @@inter_val != 0
166
- if success_list.size != 0 || failed_list.size !=0
174
+ if success_list.size != 0 || failed_list.size != 0
167
175
  if @@inter_val == RANDOM_TIME
168
176
  sleep(rand(@@random_time_range))
169
177
  else
@@ -177,8 +185,13 @@ module ListSpider
177
185
 
178
186
  def event_machine_start_list(down_list, callback = nil)
179
187
  EventMachine.run {
188
+ @@begin_time = Time.now
180
189
  if down_list.empty?
181
- callback.call(nil, [], []) if callback
190
+ if callback
191
+ callback.call(nil, [], [])
192
+ else
193
+ stop_machine
194
+ end
182
195
  else
183
196
  event_machine_down(down_list, callback)
184
197
  end
@@ -199,7 +212,7 @@ module ListSpider
199
212
 
200
213
  def get_list(down_list, inter_val: 0, max: 30)
201
214
  @@down_list = []
202
-
215
+
203
216
  need_down_list = filter_list(down_list)
204
217
 
205
218
  @@down_list = @@down_list + need_down_list
@@ -210,7 +223,7 @@ module ListSpider
210
223
  @@failed_size = 0
211
224
 
212
225
  puts "total size:#{@@down_list.size}"
213
- event_machine_start_list(@@down_list.slice!(0, @@max), method(:complete))
226
+ event_machine_start_list(get_next_task, method(:complete))
214
227
  end
215
228
 
216
229
  def get_one(task)
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-04 00:00:00.000000000 Z
11
+ date: 2016-05-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: em-http-request