list_spider 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/list_spider.rb +81 -68
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: facd4c45a64f0cca934e70de9cfda4f05cf1a1d9
|
4
|
+
data.tar.gz: b0e5940faa51be59c13a0efcfc114b43981082b7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a21daf8996b6aa27714a6407511a094f6381f739a65066077ca89bb0f4783ba0e6576b48080dbdaacee73d1a2ea4e6c77195ba832ee8f0bc28bde295f8e50b8c
|
7
|
+
data.tar.gz: e5bb44f131f82f480d9b5ebd146587c6473f6d3a7433fea32dd46bb74ac2c76adebc362048ee1c6228ca563ae4aa3d1b69bcc6d8735a4ee683e5c848f60944a4
|
data/lib/list_spider.rb
CHANGED
@@ -66,89 +66,98 @@ module ListSpider
|
|
66
66
|
failed_list = []
|
67
67
|
succeed_list = []
|
68
68
|
multi = EventMachine::MultiRequest.new
|
69
|
-
# no_job = true
|
70
69
|
begin_time = Time.now
|
71
70
|
|
72
71
|
for_each_proc = proc do |e|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
opt[:head] = @@header_option if defined? @@header_option
|
81
|
-
if e.http_method == :post
|
82
|
-
opt[:body] = e.params unless e.params.empty?
|
83
|
-
if @@connection_opts
|
84
|
-
w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
|
85
|
-
else
|
86
|
-
w = EventMachine::HttpRequest.new(e.href).post opt
|
87
|
-
end
|
72
|
+
opt = {}
|
73
|
+
opt = {:redirects => @@max_redirects}
|
74
|
+
opt[:head] = @@header_option if defined? @@header_option
|
75
|
+
if e.http_method == :post
|
76
|
+
opt[:body] = e.params unless e.params.empty?
|
77
|
+
if @@connection_opts
|
78
|
+
w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
|
88
79
|
else
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
80
|
+
w = EventMachine::HttpRequest.new(e.href).post opt
|
81
|
+
end
|
82
|
+
else
|
83
|
+
if @@connection_opts
|
84
|
+
opt[:query] = e.params unless e.params.empty?
|
85
|
+
w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
|
86
|
+
else
|
87
|
+
w = EventMachine::HttpRequest.new(e.href).get opt
|
95
88
|
end
|
89
|
+
end
|
96
90
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
else
|
110
|
-
f << w.response
|
111
|
-
end
|
91
|
+
w.callback {
|
92
|
+
s = w.response_header.status
|
93
|
+
puts s
|
94
|
+
if s != 404
|
95
|
+
local_dir = File.dirname(e.local_path)
|
96
|
+
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
97
|
+
begin
|
98
|
+
File.open(e.local_path, "w") do |f|
|
99
|
+
if @@conver_to_utf8 == true
|
100
|
+
f << SpiderHelper.to_utf8( w.response)
|
101
|
+
else
|
102
|
+
f << w.response
|
112
103
|
end
|
113
|
-
succeed_list << e
|
114
|
-
rescue Exception => e
|
115
|
-
puts e
|
116
104
|
end
|
105
|
+
succeed_list << e
|
106
|
+
rescue Exception => e
|
107
|
+
puts e
|
117
108
|
end
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
# end
|
109
|
+
end
|
110
|
+
}
|
111
|
+
w.errback {
|
112
|
+
puts "errback:#{w.response_header}"
|
113
|
+
puts e.origin_href
|
114
|
+
puts e.href
|
115
|
+
puts w.response_header.status
|
116
|
+
failed_list << e
|
117
|
+
if e.http_method == :get
|
118
|
+
SpiderHelper.direct_http_get(e.href, e.local_path)
|
119
|
+
elsif e.http_method == :post
|
120
|
+
SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
|
121
|
+
end
|
122
|
+
}
|
123
|
+
multi.add e.local_path, w
|
134
124
|
end
|
135
125
|
|
136
126
|
cb = Proc.new do
|
137
127
|
end_time = Time.now
|
138
128
|
puts "use time:#{end_time-begin_time} seconds"
|
139
129
|
if callback.nil?
|
140
|
-
|
141
|
-
puts "failed size:#{self.failed_size}"
|
142
|
-
EventMachine.stop
|
130
|
+
stop_machine
|
143
131
|
else
|
144
132
|
callback.call(multi, succeed_list, failed_list)
|
145
133
|
end
|
146
134
|
end
|
147
|
-
|
148
135
|
link_struct_list.each &for_each_proc
|
149
136
|
multi.callback &cb
|
150
137
|
end
|
151
138
|
|
139
|
+
def stop_machine
|
140
|
+
puts "success size:#{@@succeed_size}"
|
141
|
+
puts "failed size:#{@@failed_size}"
|
142
|
+
@@end_time = Time.now
|
143
|
+
puts "total use time:#{@@end_time-@@begin_time} seconds"
|
144
|
+
EventMachine.stop
|
145
|
+
@@url_set.clear
|
146
|
+
end
|
147
|
+
|
148
|
+
def get_next_task
|
149
|
+
todo = []
|
150
|
+
|
151
|
+
until todo.size >= @@max || @@down_list.empty? do
|
152
|
+
e = @@down_list.shift
|
153
|
+
if @@url_set.add?(e.href)
|
154
|
+
todo << e
|
155
|
+
end
|
156
|
+
end
|
157
|
+
|
158
|
+
return todo
|
159
|
+
end
|
160
|
+
|
152
161
|
def complete(multi, success_list, failed_list)
|
153
162
|
@@succeed_size += success_list.size
|
154
163
|
@@failed_size += failed_list.size
|
@@ -156,14 +165,13 @@ module ListSpider
|
|
156
165
|
e.parse_method.call(e.local_path, e.extra_data) if e.parse_method
|
157
166
|
end
|
158
167
|
|
159
|
-
todo =
|
168
|
+
todo = get_next_task
|
169
|
+
|
160
170
|
if todo.empty?
|
161
|
-
|
162
|
-
puts "failed size:#{@@failed_size}"
|
163
|
-
EventMachine.stop
|
171
|
+
stop_machine
|
164
172
|
else
|
165
173
|
if @@inter_val != 0
|
166
|
-
if success_list.size != 0 || failed_list.size !=0
|
174
|
+
if success_list.size != 0 || failed_list.size != 0
|
167
175
|
if @@inter_val == RANDOM_TIME
|
168
176
|
sleep(rand(@@random_time_range))
|
169
177
|
else
|
@@ -177,8 +185,13 @@ module ListSpider
|
|
177
185
|
|
178
186
|
def event_machine_start_list(down_list, callback = nil)
|
179
187
|
EventMachine.run {
|
188
|
+
@@begin_time = Time.now
|
180
189
|
if down_list.empty?
|
181
|
-
|
190
|
+
if callback
|
191
|
+
callback.call(nil, [], [])
|
192
|
+
else
|
193
|
+
stop_machine
|
194
|
+
end
|
182
195
|
else
|
183
196
|
event_machine_down(down_list, callback)
|
184
197
|
end
|
@@ -199,7 +212,7 @@ module ListSpider
|
|
199
212
|
|
200
213
|
def get_list(down_list, inter_val: 0, max: 30)
|
201
214
|
@@down_list = []
|
202
|
-
|
215
|
+
|
203
216
|
need_down_list = filter_list(down_list)
|
204
217
|
|
205
218
|
@@down_list = @@down_list + need_down_list
|
@@ -210,7 +223,7 @@ module ListSpider
|
|
210
223
|
@@failed_size = 0
|
211
224
|
|
212
225
|
puts "total size:#{@@down_list.size}"
|
213
|
-
event_machine_start_list(
|
226
|
+
event_machine_start_list(get_next_task, method(:complete))
|
214
227
|
end
|
215
228
|
|
216
229
|
def get_one(task)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-05 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: em-http-request
|