list_spider 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/list_spider.rb +81 -68
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: facd4c45a64f0cca934e70de9cfda4f05cf1a1d9
|
4
|
+
data.tar.gz: b0e5940faa51be59c13a0efcfc114b43981082b7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a21daf8996b6aa27714a6407511a094f6381f739a65066077ca89bb0f4783ba0e6576b48080dbdaacee73d1a2ea4e6c77195ba832ee8f0bc28bde295f8e50b8c
|
7
|
+
data.tar.gz: e5bb44f131f82f480d9b5ebd146587c6473f6d3a7433fea32dd46bb74ac2c76adebc362048ee1c6228ca563ae4aa3d1b69bcc6d8735a4ee683e5c848f60944a4
|
data/lib/list_spider.rb
CHANGED
@@ -66,89 +66,98 @@ module ListSpider
|
|
66
66
|
failed_list = []
|
67
67
|
succeed_list = []
|
68
68
|
multi = EventMachine::MultiRequest.new
|
69
|
-
# no_job = true
|
70
69
|
begin_time = Time.now
|
71
70
|
|
72
71
|
for_each_proc = proc do |e|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
opt[:head] = @@header_option if defined? @@header_option
|
81
|
-
if e.http_method == :post
|
82
|
-
opt[:body] = e.params unless e.params.empty?
|
83
|
-
if @@connection_opts
|
84
|
-
w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
|
85
|
-
else
|
86
|
-
w = EventMachine::HttpRequest.new(e.href).post opt
|
87
|
-
end
|
72
|
+
opt = {}
|
73
|
+
opt = {:redirects => @@max_redirects}
|
74
|
+
opt[:head] = @@header_option if defined? @@header_option
|
75
|
+
if e.http_method == :post
|
76
|
+
opt[:body] = e.params unless e.params.empty?
|
77
|
+
if @@connection_opts
|
78
|
+
w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
|
88
79
|
else
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
80
|
+
w = EventMachine::HttpRequest.new(e.href).post opt
|
81
|
+
end
|
82
|
+
else
|
83
|
+
if @@connection_opts
|
84
|
+
opt[:query] = e.params unless e.params.empty?
|
85
|
+
w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
|
86
|
+
else
|
87
|
+
w = EventMachine::HttpRequest.new(e.href).get opt
|
95
88
|
end
|
89
|
+
end
|
96
90
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
else
|
110
|
-
f << w.response
|
111
|
-
end
|
91
|
+
w.callback {
|
92
|
+
s = w.response_header.status
|
93
|
+
puts s
|
94
|
+
if s != 404
|
95
|
+
local_dir = File.dirname(e.local_path)
|
96
|
+
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
97
|
+
begin
|
98
|
+
File.open(e.local_path, "w") do |f|
|
99
|
+
if @@conver_to_utf8 == true
|
100
|
+
f << SpiderHelper.to_utf8( w.response)
|
101
|
+
else
|
102
|
+
f << w.response
|
112
103
|
end
|
113
|
-
succeed_list << e
|
114
|
-
rescue Exception => e
|
115
|
-
puts e
|
116
104
|
end
|
105
|
+
succeed_list << e
|
106
|
+
rescue Exception => e
|
107
|
+
puts e
|
117
108
|
end
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
# end
|
109
|
+
end
|
110
|
+
}
|
111
|
+
w.errback {
|
112
|
+
puts "errback:#{w.response_header}"
|
113
|
+
puts e.origin_href
|
114
|
+
puts e.href
|
115
|
+
puts w.response_header.status
|
116
|
+
failed_list << e
|
117
|
+
if e.http_method == :get
|
118
|
+
SpiderHelper.direct_http_get(e.href, e.local_path)
|
119
|
+
elsif e.http_method == :post
|
120
|
+
SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
|
121
|
+
end
|
122
|
+
}
|
123
|
+
multi.add e.local_path, w
|
134
124
|
end
|
135
125
|
|
136
126
|
cb = Proc.new do
|
137
127
|
end_time = Time.now
|
138
128
|
puts "use time:#{end_time-begin_time} seconds"
|
139
129
|
if callback.nil?
|
140
|
-
|
141
|
-
puts "failed size:#{self.failed_size}"
|
142
|
-
EventMachine.stop
|
130
|
+
stop_machine
|
143
131
|
else
|
144
132
|
callback.call(multi, succeed_list, failed_list)
|
145
133
|
end
|
146
134
|
end
|
147
|
-
|
148
135
|
link_struct_list.each &for_each_proc
|
149
136
|
multi.callback &cb
|
150
137
|
end
|
151
138
|
|
139
|
+
def stop_machine
|
140
|
+
puts "success size:#{@@succeed_size}"
|
141
|
+
puts "failed size:#{@@failed_size}"
|
142
|
+
@@end_time = Time.now
|
143
|
+
puts "total use time:#{@@end_time-@@begin_time} seconds"
|
144
|
+
EventMachine.stop
|
145
|
+
@@url_set.clear
|
146
|
+
end
|
147
|
+
|
148
|
+
def get_next_task
|
149
|
+
todo = []
|
150
|
+
|
151
|
+
until todo.size >= @@max || @@down_list.empty? do
|
152
|
+
e = @@down_list.shift
|
153
|
+
if @@url_set.add?(e.href)
|
154
|
+
todo << e
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
return todo
|
159
|
+
end
|
160
|
+
|
152
161
|
def complete(multi, success_list, failed_list)
|
153
162
|
@@succeed_size += success_list.size
|
154
163
|
@@failed_size += failed_list.size
|
@@ -156,14 +165,13 @@ module ListSpider
|
|
156
165
|
e.parse_method.call(e.local_path, e.extra_data) if e.parse_method
|
157
166
|
end
|
158
167
|
|
159
|
-
todo =
|
168
|
+
todo = get_next_task
|
169
|
+
|
160
170
|
if todo.empty?
|
161
|
-
|
162
|
-
puts "failed size:#{@@failed_size}"
|
163
|
-
EventMachine.stop
|
171
|
+
stop_machine
|
164
172
|
else
|
165
173
|
if @@inter_val != 0
|
166
|
-
if success_list.size != 0 || failed_list.size !=0
|
174
|
+
if success_list.size != 0 || failed_list.size != 0
|
167
175
|
if @@inter_val == RANDOM_TIME
|
168
176
|
sleep(rand(@@random_time_range))
|
169
177
|
else
|
@@ -177,8 +185,13 @@ module ListSpider
|
|
177
185
|
|
178
186
|
def event_machine_start_list(down_list, callback = nil)
|
179
187
|
EventMachine.run {
|
188
|
+
@@begin_time = Time.now
|
180
189
|
if down_list.empty?
|
181
|
-
|
190
|
+
if callback
|
191
|
+
callback.call(nil, [], [])
|
192
|
+
else
|
193
|
+
stop_machine
|
194
|
+
end
|
182
195
|
else
|
183
196
|
event_machine_down(down_list, callback)
|
184
197
|
end
|
@@ -199,7 +212,7 @@ module ListSpider
|
|
199
212
|
|
200
213
|
def get_list(down_list, inter_val: 0, max: 30)
|
201
214
|
@@down_list = []
|
202
|
-
|
215
|
+
|
203
216
|
need_down_list = filter_list(down_list)
|
204
217
|
|
205
218
|
@@down_list = @@down_list + need_down_list
|
@@ -210,7 +223,7 @@ module ListSpider
|
|
210
223
|
@@failed_size = 0
|
211
224
|
|
212
225
|
puts "total size:#{@@down_list.size}"
|
213
|
-
event_machine_start_list(
|
226
|
+
event_machine_start_list(get_next_task, method(:complete))
|
214
227
|
end
|
215
228
|
|
216
229
|
def get_one(task)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: em-http-request
|