list_spider 0.1.0
- checksums.yaml +7 -0
- data/lib/delete_unvalid.rb +40 -0
- data/lib/list_spider.rb +77 -0
- data/lib/spider_base.rb +296 -0
- data/lib/spider_helper.rb +103 -0
- metadata +88 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: 0f897ec31adc2d2a6c9713729030e8decc85cb9d
  data.tar.gz: c81ed9cab5fc1bbd4395aeb9692924887042c27f
SHA512:
  metadata.gz: 907529b23336256e2c72232ec2b413c50ebee99df49695bac368ae4dc029afaa81424fe542c63d6c7247714964382e2eaeaae0ea34ab51658e34ce67e5c7b9e7
  data.tar.gz: 22dff7b012b1c12f8cefb50606e0a9b0874c5b013a305c268ae2a950374760ba77ed24af075f6ca5217e128d905b31bae754107e0cf063e0d90171dd51b94f20
data/lib/delete_unvalid.rb
ADDED
@@ -0,0 +1,40 @@
class DeleteUnvalid
  # 4033
  # 920
  def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil)
    @dir_pattern = dir_pattern
    @size_threshold = size_threshold
    @cust_judge = cust_judge || method(:default_judge)
    @total = 0
  end

  def default_judge(f)
    File.size(f) <= @size_threshold
  end

  def delete_unvalid(f)
    if @cust_judge.call(f)
      @total += 1
      puts "deleted file: #{f}"
      File.delete(f)
    end
  end

  def start
    Dir.glob(@dir_pattern) do |f|
      # puts f
      delete_unvalid(f)
    end
    puts "delete total:#{@total}"
  end

  def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
    DeleteUnvalid.new(dir_pattern, size_threshold: size_threshold, cust_judge: cust_judge).start
  end
end
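A minimal usage sketch for DeleteUnvalid, not part of the diff itself; the glob pattern and the custom predicate below are made-up examples. By default a file is considered invalid when File.size is at or below size_threshold (1000 bytes).

require 'list_spider'

# Delete downloaded files of 1 KB or less (default judge).
DeleteUnvalid.delete('download/**/*.html')

# Or supply a custom judge, e.g. drop only empty files (hypothetical predicate).
DeleteUnvalid.delete('download/**/*.html', cust_judge: ->(f) { File.zero?(f) })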
data/lib/list_spider.rb
ADDED
@@ -0,0 +1,77 @@
require File.expand_path('../spider_base', __FILE__)
require File.expand_path('../delete_unvalid', __FILE__)

class ListSpider
  RANDOM_TIME = -1
  NO_LIMIT_CONCURRENT = -1

  @@random_time_range = 3..10

  include SpiderBase

  def initialize(down_list, inter_val: 0, max: 30)
    @down_list = down_list
    @inter_val = inter_val
    @max = max
    @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
    @succeed_size = 0
    @failed_size = 0
  end

  attr_reader :succeed_size, :failed_size

  class << self
    # Wired to the class variable read in #complete; a plain attr_accessor
    # would only set an instance variable on the class and have no effect.
    def random_time_range
      @@random_time_range
    end

    def random_time_range=(range)
      @@random_time_range = range
    end
  end

  def add_task(task)
    if task.is_a?(Array)
      @down_list += task
    elsif task.is_a?(TaskStruct)
      @down_list << task
    else
      puts "error task type:#{task.class}"
    end
  end

  def complete(multi, success_list, failed_list)
    @succeed_size += success_list.size
    @failed_size += failed_list.size
    # puts "success size:#{success_list.size}"
    # puts "failed size:#{failed_list.size}"
    success_list.each do |e|
      e.parse_method.call(e.local_path, e.extra_data, self) if e.parse_method
    end

    todo = @down_list.slice!(0, @max)
    if todo.empty?
      puts "success size:#{@succeed_size}"
      puts "failed size:#{@failed_size}"
      EventMachine.stop
    else
      if @inter_val != 0 && (success_list.size != 0 || failed_list.size != 0)
        if @inter_val == RANDOM_TIME
          sleep(rand(@@random_time_range))
        else
          sleep(@inter_val)
        end
      end
      batch_down_list(todo, method(:complete))
    end
  end

  def start
    puts "total size:#{@down_list.size}"
    event_machine_start_list(@down_list.slice!(0, @max), method(:complete))
  end

  def self.get_list(down_list, inter_val: 0, max: 30)
    ListSpider.new(down_list, inter_val: inter_val, max: max).start
  end
end
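For orientation, a short usage sketch of the list API; the URLs, paths, and callback below are placeholders, not taken from the gem. Each TaskStruct pairs a URL with a local file path, an optional parse_method runs after a successful download, and ListSpider.get_list drives the whole list.

require 'list_spider'

# Hypothetical callback: runs for each successfully downloaded task.
parse = proc do |local_path, extra_data, spider|
  puts "saved page #{extra_data} to #{local_path}"
  # spider.add_task(...) could enqueue follow-up TaskStructs here.
end

down_list = (1..3).map do |i|
  TaskStruct.new("http://example.com/page_#{i}.html", "download/page_#{i}.html",
                 extra_data: i, parse_method: parse)
end

# inter_val: seconds to sleep between batches (ListSpider::RANDOM_TIME for a random pause);
# max: number of tasks requested concurrently per batch.
ListSpider.get_list(down_list, inter_val: 0, max: 2)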
data/lib/spider_base.rb
ADDED
@@ -0,0 +1,296 @@
require 'em-http-request'
require 'nokogiri'
require 'fileutils'
require 'set'
require File.expand_path('../spider_helper', __FILE__)
require "addressable/uri"

class TaskStruct
  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
    @href = href
    if @href.class == "".class
      @href = SpiderHelper.string_to_uri(@href)
    end
    @local_path = local_path
    @http_method = http_method
    @params = params
    @extra_data = extra_data
    @parse_method = parse_method
  end

  def ==(o)
    o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
  end

  attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method
end

module SpiderBase
  @@conver_to_utf8 = false
  @@connection_opts = {:connect_timeout => 2 * 60}
  @@overwrite_exist = false
  @@max_redirects = 10

  class << self
    # Accessors wired to the class variables used below; a plain attr_accessor
    # would only touch module instance variables and have no effect.
    def conver_to_utf8; @@conver_to_utf8; end
    def conver_to_utf8=(v); @@conver_to_utf8 = v; end
    def overwrite_exist; @@overwrite_exist; end
    def overwrite_exist=(v); @@overwrite_exist = v; end
    def max_redirects; @@max_redirects; end
    def max_redirects=(v); @@max_redirects = v; end

    def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
      @@connection_opts = {
        :proxy => {
          :host => proxy_addr,
          :port => proxy_port
        }
      }
      @@connection_opts[:proxy][:authorization] = [username, password] if username && password
    end

    def connect_timeout(max_connect_time)
      @@connection_opts[:connect_timeout] = max_connect_time
    end

    def set_header_option(header_option)
      @@header_option = header_option
    end

    def event_machine_down(link_struct_list, callback = nil)
      failed_list = []
      succeed_list = []
      # puts "event_machine_down callback:#{callback}"
      multi = EventMachine::MultiRequest.new
      no_job = true
      begin_time = Time.now

      for_each_proc = proc do |e|
        if !@@overwrite_exist && File.exist?(e.local_path)
          succeed_list << e
        else
          no_job = false
          opt = {:redirects => @@max_redirects}
          opt[:head] = @@header_option if defined? @@header_option
          if e.http_method == :post
            opt[:body] = e.params unless e.params.empty?
            if @@connection_opts
              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).post opt
            else
              w = EventMachine::HttpRequest.new(e.href).post opt
            end
          else
            opt[:query] = e.params unless e.params.empty?
            if @@connection_opts
              w = EventMachine::HttpRequest.new(e.href, @@connection_opts).get opt
            else
              w = EventMachine::HttpRequest.new(e.href).get opt
            end
          end

          w.callback {
            # puts "complete:#{w.response_header}"
            s = w.response_header.status
            puts s
            if s == 403 || s == 502 # Forbidden
              # EventMachine.stop
            elsif s != 404
              local_dir = File.dirname(e.local_path)
              FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
              begin
                File.open(e.local_path, "w") do |f|
                  if @@conver_to_utf8 == true
                    f << SpiderHelper.to_utf8(w.response)
                  else
                    f << w.response
                  end
                end
                succeed_list << e
              rescue Exception => err
                puts err
              end
            end
          }
          w.errback {
            puts "errback:#{w.response_header}"
            puts e.href
            puts w.response_header.status
            failed_list << e
            if e.http_method == :get
              SpiderHelper.direct_http_get(e.href, e.local_path)
            elsif e.http_method == :post
              SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
            end
          }
          multi.add e.local_path, w
        end
      end

      # em_for_each_proc = proc do |e, iter|
      #   for_each_proc.call(e)
      #   iter.next
      # end

      cb = Proc.new do
        end_time = Time.now
        puts "use time:#{end_time - begin_time} seconds"
        if callback.nil?
          puts "success size:#{succeed_list.size}"
          puts "failed size:#{failed_list.size}"
          EventMachine.stop
        else
          callback.call(multi, succeed_list, failed_list)
        end
      end

      after_proc = proc {
        if no_job # no pending requests, invoke the callback directly
          cb.call
        else
          multi.callback(&cb)
        end
      }

      # if DownLoadConfig::MaxConcurrent <= 0
      link_struct_list.each(&for_each_proc)
      after_proc.call
      # else
      #   EM::Iterator.new(link_struct_list, DownLoadConfig::MaxConcurrent).each(em_for_each_proc, after_proc)
      # end
    end

    def event_machine_start(url, down_dir, file_name, callback = nil)
      down_dir << "/" unless down_dir.end_with?("/")
      FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)
      down_list = []
      down_list << TaskStruct.new(url, down_dir + file_name)
      EventMachine.run {
        event_machine_down(down_list, callback)
      }
    end

    def event_machine_start_list(down_list, callback = nil)
      EventMachine.run {
        event_machine_down(down_list, callback)
      }
    end
  end # self end
end # SpiderBase end

def batch_down_list(down_list, callback = nil)
  SpiderBase.event_machine_down(down_list, callback)
end

def event_machine_start_list(down_list, callback = nil)
  SpiderBase.event_machine_start_list(down_list, callback)
end

def parse_down_load_url(url, down_dir, file_name, callback = nil)
  SpiderBase.event_machine_start(url, down_dir, file_name, callback)
end

class GetRelative
  def initialize(base_url, down_dir, get_depth = 2, suffix = ".html")
    @get_depth = get_depth
    @base_url = base_url
    @down_dir = down_dir
    @suffix = suffix
  end

  def down_node(multi, succeed_list, failed_list, base_url, down_dir, callback)
    puts "success"
    puts succeed_list.size
    puts "error"
    puts failed_list.size
    puts failed_list
    puts "get index complete"
    if succeed_list.size > 0
      link_list = []
      succeed_list.each do |e|
        doc = Nokogiri::HTML(File.open(e.local_path))
        link_list.concat(doc.css("a"))
      end
      puts "extract href complete"

      down_dir << "/" unless down_dir.end_with?("/")
      FileUtils.mkdir_p(down_dir) unless Dir.exist?(down_dir)

      down_list = []
      set_list = Set.new
      link_list.each do |link|
        href = link['href']
        next if href.nil? || !href.include?(@suffix)
        # strip fragments such as "scheme_2.html#SEC15"
        href = href[0, href.index(@suffix) + @suffix.length]
        # strip leading "./" such as "./preface.html"
        href = href[2..-1] if href.start_with?("./")

        next if !set_list.add?(href)
        unless base_url.end_with?("/")
          i = base_url.rindex("/")
          base_url = base_url[0..i]
        end

        # skip absolute links such as "http://www.ccs.neu.edu/~dorai"
        next if href.start_with?("http:") || href.start_with?("https:")

        local_path = down_dir + href

        down_list.push(TaskStruct.new(base_url + href, local_path))
      end
      puts "down list complete,size:#{down_list.size}"
      batch_down_list(down_list, callback)
    end
  end

  def down_other_node(multi, succeed_list, failed_list)
    puts "down_other_node"
    @get_depth = @get_depth - 1
    puts "depth:#{@get_depth}"
    if @get_depth <= 0
      down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:event_all_complete))
    else
      down_node(multi, succeed_list, failed_list, @base_url, @down_dir, method(:down_other_node))
    end
  end

  def event_all_complete(multi, succeed_list, failed_list)
    puts "all complete"
    puts "success"
    puts succeed_list.size
    puts "error"
    puts failed_list.size
    puts failed_list
    EventMachine.stop
  end

  attr_writer :get_depth, :base_url, :down_dir

  def start
    index_file_name = "index.html"
    # e.g. http://www.ccs.neu.edu/home/dorai/t-y-scheme/t-y-scheme-Z-H-1.html
    unless @base_url.end_with?("/")
      i = @base_url.rindex("/")
      index_file_name = @base_url[i + 1..-1]
    end

    @get_depth = @get_depth - 1
    puts @get_depth
    if @get_depth <= 0
      parse_down_load_url(@base_url, @down_dir, index_file_name, method(:event_all_complete))
    else
      parse_down_load_url(@base_url, @down_dir, index_file_name, method(:down_other_node))
    end
  end

  def self.Get(base_url, down_dir, get_depth = 2, suffix = ".html")
    GetRelative.new(base_url, down_dir, get_depth, suffix).start
  end
end # GetRelative
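A short sketch of the module-level switches and the GetRelative helper. This is not from the gem's own docs; the proxy address, target URL, and download directory are placeholder values.

require 'list_spider'

# Optional global settings kept in SpiderBase class variables.
SpiderBase.conver_to_utf8 = true           # convert downloaded pages to UTF-8
SpiderBase.overwrite_exist = false         # skip files that already exist locally
SpiderBase.connect_timeout(60)             # connection timeout in seconds
# SpiderBase.set_proxy('127.0.0.1', 8080)  # route requests through a proxy

# Mirror a page and the ".html" pages it links to, two levels deep.
GetRelative.Get('http://example.com/book/index.html', 'download/book', 2, '.html')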
data/lib/spider_helper.rb
ADDED
@@ -0,0 +1,103 @@
require 'rchardet'
require 'net/http'
require 'fileutils'
require 'addressable/uri'

module SpiderHelper

  class << self

    def direct_http_get(href, local_path, params: nil, header: nil)
      if href.class == "".class
        href = string_to_uri(href)
      end

      begin
        href.query = URI.encode_www_form(params) if params
        req = Net::HTTP::Get.new(href)
        header.each { |k, v| req[k] = v } if header

        res = Net::HTTP.start(href.hostname, href.port) do |http|
          http.request(req)
        end

        if res.is_a?(Net::HTTPSuccess)
          local_dir = File.dirname(local_path)
          FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
          File.write(local_path, res.body)
          puts "succeed"
        else
          puts res
        end
      rescue Exception => e
        puts e.backtrace
        puts e
      end
    end

    def direct_http_post(href, local_path, params, header: nil)
      if href.class == "".class
        href = string_to_uri(href)
      end

      begin
        req = Net::HTTP::Post.new(href)
        req.set_form_data(params)
        header.each { |k, v| req[k] = v } if header

        res = Net::HTTP.start(href.hostname, href.port) do |http|
          http.request(req)
        end

        if res.is_a?(Net::HTTPSuccess)
          local_dir = File.dirname(local_path)
          FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
          File.write(local_path, res.body)
        else
          puts res
        end
      rescue Exception => e
        puts e
      end
    end

    def extract_href_last(origin_href)
      origin_href.split('/')[-1]
    end

    def string_to_uri(href)
      l = href
      l.sub!('http:///', 'http://') if l.start_with?('http:///')
      l = Addressable::URI.parse(l)
      l.normalize!
    end

    BomHeaderMap = {"UTF-8"    => "\xEF\xBB\xBF".force_encoding("UTF-8"),
                    "UTF-16BE" => "\xFE\xFF".force_encoding("UTF-16BE"),
                    "UTF-16LE" => "\xFF\xFE".force_encoding("UTF-16LE"),
                    "UTF-32BE" => "\x00\x00\xFE\xFF".force_encoding("UTF-32BE"),
                    "UTF-32LE" => "\xFF\xFE\x00\x00".force_encoding("UTF-32LE")}

    # This check is sometimes wrong; call to_utf8 to convert unconditionally.
    def smart_to_utf8(str)
      return str if str.encoding == Encoding::UTF_8
      to_utf8(str)
    end

    def to_utf8(str)
      # Work around the CharDet gem returning ASCII_8BIT strings on Windows,
      # which are not compatible with UTF-8.
      str.force_encoding(Encoding::ASCII_8BIT)
      cd = CharDet.detect(str)
      if cd["confidence"] > 0.6
        puts cd["encoding"]
        str.force_encoding(cd["encoding"])
        # strip the BOM header
        bom_header = BomHeaderMap[cd["encoding"]]
        str.sub!(bom_header, "") if bom_header
      end
      str.encode!(Encoding::UTF_8, :undef => :replace, :replace => "?", :invalid => :replace)

      return str
    end

  end

end
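SpiderHelper is also usable on its own as a blocking Net::HTTP fallback outside the EventMachine loop. A quick sketch; the URL and file paths are placeholders.

require 'list_spider'

# One-off download of a single resource.
SpiderHelper.direct_http_get('http://example.com/data.html', 'download/data.html',
                             header: { 'User-Agent' => 'list_spider' })

# Detect the encoding of a saved page and re-encode it as UTF-8.
raw  = File.binread('download/data.html')
utf8 = SpiderHelper.to_utf8(raw)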
metadata
ADDED
@@ -0,0 +1,88 @@
--- !ruby/object:Gem::Specification
name: list_spider
version: !ruby/object:Gem::Version
  version: 0.1.0
platform: ruby
authors:
- Charles Zhang
autorequire:
bindir: bin
cert_chain: []
date: 2016-04-29 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: em-http-request
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.1'
    - - ">="
      - !ruby/object:Gem::Version
        version: 1.1.3
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.1'
    - - ">="
      - !ruby/object:Gem::Version
        version: 1.1.3
- !ruby/object:Gem::Dependency
  name: nokogiri
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.6'
    - - ">="
      - !ruby/object:Gem::Version
        version: 1.6.7
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '1.6'
    - - ">="
      - !ruby/object:Gem::Version
        version: 1.6.7
description: A url list spider based on em-http-request.
email: gis05zc@163.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- lib/delete_unvalid.rb
- lib/list_spider.rb
- lib/spider_base.rb
- lib/spider_helper.rb
homepage: https://github.com/chinazhangchao/list_spider
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.6.4
signing_key:
specification_version: 4
summary: List Spider
test_files: []
has_rdoc: