list_spider 0.3.1 → 0.3.2
Sign up for free protection for your applications and access to all the features.
- checksums.yaml +4 -4
- data/lib/file_filter.rb +31 -32
- data/lib/list_spider.rb +94 -98
- data/lib/spider_helper.rb +31 -37
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9e16f58e39ec18d181da107dfdc76626c23dc44c
|
4
|
+
data.tar.gz: c6fd2518111983f1bdcf9c471a93cfe279a722fe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d3374e67436fc68afdd770c9ece27cfd734e82dc74917e7db226f311f743cbd51c69129f0c400bad6ec602c15f1cb08697c2b85a91805c8c3c549e519a68f74
|
7
|
+
data.tar.gz: 9ef2fdb5d99fcf018ce02eeb395eedf4e72917018b21b48a23ae12601fa7b2b566ba6c08595674900326d8f4d1d18c6e992c2cf544f468dffa96f3cbeefecf7b
|
data/lib/file_filter.rb
CHANGED
@@ -5,11 +5,7 @@ class FileFilter
|
|
5
5
|
def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
|
6
6
|
@dir_pattern = dir_pattern
|
7
7
|
@size_threshold = size_threshold
|
8
|
-
|
9
|
-
@cust_judge = cust_judge
|
10
|
-
else
|
11
|
-
@cust_judge = method(:default_judge)
|
12
|
-
end
|
8
|
+
@cust_judge = cust_judge ? cust_judge : method(:default_judge)
|
13
9
|
@total = 0
|
14
10
|
@process_block = process_block
|
15
11
|
end
|
@@ -33,39 +29,42 @@ class FileFilter
|
|
33
29
|
end
|
34
30
|
|
35
31
|
def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
|
36
|
-
FileFilter.new(
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
32
|
+
FileFilter.new(
|
33
|
+
dir_pattern,
|
34
|
+
size_threshold: size_threshold,
|
35
|
+
cust_judge: cust_judge,
|
36
|
+
process_block:
|
37
|
+
proc do |f|
|
38
|
+
puts "deleted file: #{f}"
|
39
|
+
File.delete(f)
|
40
|
+
end
|
41
|
+
).start
|
45
42
|
end
|
46
43
|
|
47
44
|
def self.check(dir_pattern, size_threshold: 1000, cust_judge: nil)
|
48
|
-
FileFilter.new(
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
45
|
+
FileFilter.new(
|
46
|
+
dir_pattern,
|
47
|
+
size_threshold: size_threshold,
|
48
|
+
cust_judge: cust_judge,
|
49
|
+
process_block:
|
50
|
+
proc do |f|
|
51
|
+
puts "filterd file: #{f}"
|
52
|
+
end
|
53
|
+
).start
|
56
54
|
end
|
57
55
|
|
58
56
|
def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
|
59
57
|
result_file = File.open(save_file_name, 'wt')
|
60
|
-
FileFilter.new(
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
58
|
+
FileFilter.new(
|
59
|
+
dir_pattern,
|
60
|
+
size_threshold: size_threshold,
|
61
|
+
cust_judge: cust_judge,
|
62
|
+
process_block:
|
63
|
+
proc do |f|
|
64
|
+
puts "filterd file: #{f}"
|
65
|
+
result_file << f << "\n"
|
66
|
+
end
|
67
|
+
).start
|
68
|
+
result_file.close
|
69
69
|
end
|
70
|
-
|
71
70
|
end
|
data/lib/list_spider.rb
CHANGED
@@ -2,7 +2,7 @@ require 'em-http-request'
|
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'fileutils'
|
4
4
|
require 'set'
|
5
|
-
require
|
5
|
+
require 'addressable/uri'
|
6
6
|
require File.expand_path('../spider_helper', __FILE__)
|
7
7
|
require File.expand_path('../file_filter', __FILE__)
|
8
8
|
|
@@ -10,9 +10,7 @@ class TaskStruct
|
|
10
10
|
def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
|
11
11
|
@origin_href = href
|
12
12
|
@href = href
|
13
|
-
if @href.class ==
|
14
|
-
@href = SpiderHelper.string_to_uri(@href)
|
15
|
-
end
|
13
|
+
@href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
|
16
14
|
@local_path = local_path
|
17
15
|
@http_method = http_method
|
18
16
|
@params = params
|
@@ -21,16 +19,14 @@ class TaskStruct
|
|
21
19
|
@header = header
|
22
20
|
end
|
23
21
|
|
24
|
-
def ==
|
25
|
-
|
22
|
+
def ==(other)
|
23
|
+
other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
|
26
24
|
end
|
27
25
|
|
28
|
-
attr_accessor :origin_href
|
29
|
-
|
26
|
+
attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
|
30
27
|
end
|
31
28
|
|
32
29
|
module ListSpider
|
33
|
-
|
34
30
|
RANDOM_TIME = -1
|
35
31
|
NO_LIMIT_CONCURRENT = -1
|
36
32
|
DEFAULT_CONCURRNET_MAX = 50
|
@@ -38,21 +34,20 @@ module ListSpider
|
|
38
34
|
|
39
35
|
@random_time_range = 3..10
|
40
36
|
@conver_to_utf8 = false
|
41
|
-
@connection_opts = {connect_timeout: 60}
|
37
|
+
@connection_opts = { connect_timeout: 60 }
|
42
38
|
@overwrite_exist = false
|
43
39
|
@max_redirects = 10
|
44
40
|
@local_path_set = Set.new
|
45
41
|
|
46
42
|
class << self
|
47
|
-
|
48
43
|
attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
|
49
44
|
|
50
45
|
def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
|
51
46
|
@connection_opts = {
|
52
|
-
:
|
53
|
-
|
54
|
-
|
55
|
-
|
47
|
+
proxy: {
|
48
|
+
host: proxy_addr,
|
49
|
+
port: proxy_port
|
50
|
+
}
|
56
51
|
}
|
57
52
|
@connection_opts[:proxy][:authorization] = [username, password] if username && password
|
58
53
|
end
|
@@ -71,100 +66,102 @@ module ListSpider
|
|
71
66
|
multi = EventMachine::MultiRequest.new
|
72
67
|
begin_time = Time.now
|
73
68
|
|
74
|
-
for_each_proc =
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
end
|
82
|
-
|
83
|
-
if e.http_method == :post
|
84
|
-
opt[:body] = e.params unless e.params.empty?
|
85
|
-
if @connection_opts
|
86
|
-
w = EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
|
87
|
-
else
|
88
|
-
w = EventMachine::HttpRequest.new(e.href).post opt
|
69
|
+
for_each_proc =
|
70
|
+
proc do |e|
|
71
|
+
opt = { redirects: @max_redirects }
|
72
|
+
if e.header
|
73
|
+
opt[:head] = e.header
|
74
|
+
elsif defined? @header_option
|
75
|
+
opt[:head] = @header_option
|
89
76
|
end
|
90
|
-
|
91
|
-
if
|
92
|
-
opt[:
|
93
|
-
w =
|
77
|
+
|
78
|
+
if e.http_method == :post
|
79
|
+
opt[:body] = e.params unless e.params.empty?
|
80
|
+
w =
|
81
|
+
if @connection_opts
|
82
|
+
EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
|
83
|
+
else
|
84
|
+
EventMachine::HttpRequest.new(e.href).post opt
|
85
|
+
end
|
94
86
|
else
|
95
|
-
|
87
|
+
if @connection_opts
|
88
|
+
opt[:query] = e.params unless e.params.empty?
|
89
|
+
w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
|
90
|
+
else
|
91
|
+
w = EventMachine::HttpRequest.new(e.href).get opt
|
92
|
+
end
|
96
93
|
end
|
97
|
-
end
|
98
94
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
95
|
+
e.request_object = w
|
96
|
+
|
97
|
+
w.callback do
|
98
|
+
s = w.response_header.status
|
99
|
+
puts s
|
100
|
+
if s != 404
|
101
|
+
local_dir = File.dirname(e.local_path)
|
102
|
+
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
103
|
+
begin
|
104
|
+
File.open(e.local_path, 'w') do |f|
|
105
|
+
f << if @conver_to_utf8 == true
|
106
|
+
SpiderHelper.to_utf8(w.response)
|
107
|
+
else
|
108
|
+
w.response
|
109
|
+
end
|
113
110
|
end
|
111
|
+
succeed_list << e
|
112
|
+
rescue => e
|
113
|
+
puts e
|
114
114
|
end
|
115
|
-
succeed_list << e
|
116
|
-
rescue Exception => e
|
117
|
-
puts e
|
118
115
|
end
|
119
116
|
end
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
117
|
+
w.errback do
|
118
|
+
puts "errback:#{w.response_header}"
|
119
|
+
puts e.origin_href
|
120
|
+
puts e.href
|
121
|
+
puts w.response_header.status
|
122
|
+
failed_list << e
|
123
|
+
if e.http_method == :get
|
124
|
+
SpiderHelper.direct_http_get(e.href, e.local_path)
|
125
|
+
elsif e.http_method == :post
|
126
|
+
SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
|
127
|
+
end
|
131
128
|
end
|
132
|
-
}
|
133
129
|
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
130
|
+
begin
|
131
|
+
multi.add e.local_path, w
|
132
|
+
rescue => exception
|
133
|
+
puts exception
|
134
|
+
puts e.href
|
135
|
+
puts e.local_path
|
136
|
+
stop_machine
|
137
|
+
end
|
141
138
|
end
|
142
|
-
end
|
143
139
|
|
144
|
-
cb =
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
140
|
+
cb =
|
141
|
+
proc do
|
142
|
+
end_time = Time.now
|
143
|
+
puts "use time:#{end_time - begin_time} seconds"
|
144
|
+
if callback.nil?
|
145
|
+
stop_machine
|
146
|
+
else
|
147
|
+
callback.call(multi, succeed_list, failed_list)
|
148
|
+
end
|
151
149
|
end
|
152
|
-
|
153
|
-
|
154
|
-
multi.callback &cb
|
150
|
+
link_struct_list.each(&for_each_proc)
|
151
|
+
multi.callback(&cb)
|
155
152
|
end
|
156
153
|
|
157
154
|
def stop_machine
|
158
155
|
puts "success size:#{@succeed_size}"
|
159
156
|
puts "failed size:#{@failed_size}"
|
160
157
|
@end_time = Time.now
|
161
|
-
puts "total use time:#{@end_time
|
158
|
+
puts "total use time:#{@end_time - @begin_time} seconds"
|
162
159
|
EventMachine.stop
|
163
160
|
@local_path_set.clear
|
164
161
|
end
|
165
162
|
|
166
|
-
def
|
167
|
-
|
163
|
+
def next_task
|
164
|
+
@down_list.shift(@max)
|
168
165
|
end
|
169
166
|
|
170
167
|
def call_parse_method(e)
|
@@ -193,20 +190,20 @@ module ListSpider
|
|
193
190
|
end
|
194
191
|
end
|
195
192
|
|
196
|
-
def complete(
|
193
|
+
def complete(_multi, success_list, failed_list)
|
197
194
|
@succeed_size += success_list.size
|
198
195
|
@failed_size += failed_list.size
|
199
196
|
success_list.each do |e|
|
200
197
|
call_parse_method(e)
|
201
198
|
end
|
202
199
|
|
203
|
-
todo =
|
200
|
+
todo = next_task
|
204
201
|
|
205
202
|
if todo.empty?
|
206
203
|
stop_machine
|
207
204
|
else
|
208
205
|
if @interval != 0
|
209
|
-
if success_list.
|
206
|
+
if !success_list.empty? || !failed_list.empty?
|
210
207
|
if @interval == RANDOM_TIME
|
211
208
|
sleep(rand(@random_time_range))
|
212
209
|
else
|
@@ -219,7 +216,7 @@ module ListSpider
|
|
219
216
|
end
|
220
217
|
|
221
218
|
def event_machine_start_list(down_list, callback = nil)
|
222
|
-
EventMachine.run
|
219
|
+
EventMachine.run do
|
223
220
|
@begin_time = Time.now
|
224
221
|
if down_list.empty?
|
225
222
|
if callback
|
@@ -230,7 +227,7 @@ module ListSpider
|
|
230
227
|
else
|
231
228
|
event_machine_down(down_list, callback)
|
232
229
|
end
|
233
|
-
|
230
|
+
end
|
234
231
|
end
|
235
232
|
|
236
233
|
def filter_list(down_list)
|
@@ -242,7 +239,7 @@ module ListSpider
|
|
242
239
|
need_down_list << ts
|
243
240
|
end
|
244
241
|
end
|
245
|
-
|
242
|
+
need_down_list
|
246
243
|
end
|
247
244
|
|
248
245
|
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
@@ -255,7 +252,7 @@ module ListSpider
|
|
255
252
|
|
256
253
|
need_down_list = filter_list(down_list)
|
257
254
|
|
258
|
-
@down_list
|
255
|
+
@down_list += need_down_list
|
259
256
|
@interval = interval
|
260
257
|
@max = max
|
261
258
|
@max = @down_list.size if @max == NO_LIMIT_CONCURRENT
|
@@ -263,7 +260,7 @@ module ListSpider
|
|
263
260
|
@failed_size = 0
|
264
261
|
|
265
262
|
puts "total size:#{@down_list.size}"
|
266
|
-
event_machine_start_list(
|
263
|
+
event_machine_start_list(next_task, method(:complete))
|
267
264
|
end
|
268
265
|
|
269
266
|
def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
@@ -273,19 +270,18 @@ module ListSpider
|
|
273
270
|
def add_task(task)
|
274
271
|
if task.is_a?Array
|
275
272
|
need_down_list = filter_list(task)
|
276
|
-
@down_list
|
273
|
+
@down_list += need_down_list
|
277
274
|
elsif task.is_a?TaskStruct
|
278
275
|
need_down_list = filter_list([task])
|
279
|
-
@down_list
|
276
|
+
@down_list += need_down_list
|
280
277
|
else
|
281
278
|
puts "error task type:#{task.class}"
|
282
279
|
end
|
283
280
|
end
|
284
281
|
end
|
285
282
|
|
286
|
-
Signal.trap(
|
283
|
+
Signal.trap('INT') do
|
287
284
|
ListSpider.stop_machine
|
288
285
|
exit!
|
289
286
|
end
|
290
|
-
|
291
287
|
end
|
data/lib/spider_helper.rb
CHANGED
@@ -2,50 +2,46 @@ require 'rchardet'
|
|
2
2
|
require 'net/http'
|
3
3
|
|
4
4
|
module SpiderHelper
|
5
|
-
|
6
5
|
class << self
|
7
|
-
|
8
6
|
def direct_http_get(href, local_path, params: nil, header: nil)
|
9
|
-
if href.class ==
|
10
|
-
href = string_to_uri(href)
|
11
|
-
end
|
7
|
+
href = string_to_uri(href) if href.class == ''.class
|
12
8
|
|
13
9
|
begin
|
14
10
|
href.query = URI.encode_www_form(params) if params
|
15
11
|
req = Net::HTTP::Get.new(href)
|
16
|
-
header.each{|k,v| req[k] = v} if header
|
12
|
+
header.each { |k, v| req[k] = v } if header
|
17
13
|
|
18
|
-
res =
|
19
|
-
|
20
|
-
|
14
|
+
res =
|
15
|
+
Net::HTTP.start(href.hostname, href.port) do |http|
|
16
|
+
http.request(req)
|
17
|
+
end
|
21
18
|
|
22
19
|
if res.is_a?(Net::HTTPSuccess)
|
23
20
|
local_dir = File.dirname(local_path)
|
24
21
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
25
22
|
File.write(local_path, res.body)
|
26
|
-
puts
|
23
|
+
puts 'succeed'
|
27
24
|
else
|
28
25
|
puts res
|
29
26
|
end
|
30
|
-
rescue
|
27
|
+
rescue => e
|
31
28
|
puts e.backtrace
|
32
29
|
puts e
|
33
30
|
end
|
34
31
|
end
|
35
32
|
|
36
33
|
def direct_http_post(href, local_path, params, header: nil)
|
37
|
-
if href.class ==
|
38
|
-
href = string_to_uri(href)
|
39
|
-
end
|
34
|
+
href = string_to_uri(href) if href.class == ''.class
|
40
35
|
|
41
36
|
begin
|
42
37
|
req = Net::HTTP::Post.new(href)
|
43
38
|
req.set_form_data(params)
|
44
|
-
header.each{|k,v| req[k] = v} if header
|
39
|
+
header.each { |k, v| req[k] = v } if header
|
45
40
|
|
46
|
-
res =
|
47
|
-
|
48
|
-
|
41
|
+
res =
|
42
|
+
Net::HTTP.start(href.hostname, href.port) do |http|
|
43
|
+
http.request(req)
|
44
|
+
end
|
49
45
|
|
50
46
|
if res.is_a?(Net::HTTPSuccess)
|
51
47
|
local_dir = File.dirname(local_path)
|
@@ -54,7 +50,7 @@ module SpiderHelper
|
|
54
50
|
else
|
55
51
|
puts res
|
56
52
|
end
|
57
|
-
rescue
|
53
|
+
rescue => e
|
58
54
|
puts e
|
59
55
|
end
|
60
56
|
end
|
@@ -70,34 +66,32 @@ module SpiderHelper
|
|
70
66
|
l.normalize!
|
71
67
|
end
|
72
68
|
|
73
|
-
BomHeaderMap = {
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
69
|
+
BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
|
70
|
+
'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
|
71
|
+
'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
|
72
|
+
'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
|
73
|
+
'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
|
78
74
|
|
79
|
-
|
75
|
+
# 此函数有时此判断有误,使用to_utf8函数直接转换
|
80
76
|
def smart_to_utf8(str)
|
81
77
|
return str if str.encoding == Encoding::UTF_8
|
82
78
|
to_utf8(str)
|
83
79
|
end
|
84
80
|
|
85
81
|
def to_utf8(str)
|
86
|
-
|
82
|
+
# 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
|
87
83
|
str.force_encoding(Encoding::ASCII_8BIT)
|
88
84
|
cd = CharDet.detect(str)
|
89
|
-
if cd[
|
90
|
-
puts cd[
|
91
|
-
str.force_encoding(cd[
|
92
|
-
|
93
|
-
|
94
|
-
str.sub!(
|
85
|
+
if cd['confidence'] > 0.6
|
86
|
+
puts cd['encoding']
|
87
|
+
str.force_encoding(cd['encoding'])
|
88
|
+
# 移除BOM头
|
89
|
+
bom_header = BomHeaderMap[cd['encoding']]
|
90
|
+
str.sub!(bom_header, '') if bom_header
|
95
91
|
end
|
96
|
-
str.encode!(Encoding::UTF_8, :
|
97
|
-
|
98
|
-
return str
|
99
|
-
end
|
92
|
+
str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
|
100
93
|
|
94
|
+
str
|
95
|
+
end
|
101
96
|
end
|
102
|
-
|
103
97
|
end
|