list_spider 0.3.1 → 0.3.2
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/file_filter.rb +31 -32
- data/lib/list_spider.rb +94 -98
- data/lib/spider_helper.rb +31 -37
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9e16f58e39ec18d181da107dfdc76626c23dc44c
|
4
|
+
data.tar.gz: c6fd2518111983f1bdcf9c471a93cfe279a722fe
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d3374e67436fc68afdd770c9ece27cfd734e82dc74917e7db226f311f743cbd51c69129f0c400bad6ec602c15f1cb08697c2b85a91805c8c3c549e519a68f74
|
7
|
+
data.tar.gz: 9ef2fdb5d99fcf018ce02eeb395eedf4e72917018b21b48a23ae12601fa7b2b566ba6c08595674900326d8f4d1d18c6e992c2cf544f468dffa96f3cbeefecf7b
|
data/lib/file_filter.rb
CHANGED
@@ -5,11 +5,7 @@ class FileFilter
|
|
5
5
|
def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
|
6
6
|
@dir_pattern = dir_pattern
|
7
7
|
@size_threshold = size_threshold
|
8
|
-
|
9
|
-
@cust_judge = cust_judge
|
10
|
-
else
|
11
|
-
@cust_judge = method(:default_judge)
|
12
|
-
end
|
8
|
+
@cust_judge = cust_judge ? cust_judge : method(:default_judge)
|
13
9
|
@total = 0
|
14
10
|
@process_block = process_block
|
15
11
|
end
|
@@ -33,39 +29,42 @@ class FileFilter
|
|
33
29
|
end
|
34
30
|
|
35
31
|
def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
|
36
|
-
FileFilter.new(
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
32
|
+
FileFilter.new(
|
33
|
+
dir_pattern,
|
34
|
+
size_threshold: size_threshold,
|
35
|
+
cust_judge: cust_judge,
|
36
|
+
process_block:
|
37
|
+
proc do |f|
|
38
|
+
puts "deleted file: #{f}"
|
39
|
+
File.delete(f)
|
40
|
+
end
|
41
|
+
).start
|
45
42
|
end
|
46
43
|
|
47
44
|
def self.check(dir_pattern, size_threshold: 1000, cust_judge: nil)
|
48
|
-
FileFilter.new(
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
45
|
+
FileFilter.new(
|
46
|
+
dir_pattern,
|
47
|
+
size_threshold: size_threshold,
|
48
|
+
cust_judge: cust_judge,
|
49
|
+
process_block:
|
50
|
+
proc do |f|
|
51
|
+
puts "filterd file: #{f}"
|
52
|
+
end
|
53
|
+
).start
|
56
54
|
end
|
57
55
|
|
58
56
|
def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
|
59
57
|
result_file = File.open(save_file_name, 'wt')
|
60
|
-
FileFilter.new(
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
58
|
+
FileFilter.new(
|
59
|
+
dir_pattern,
|
60
|
+
size_threshold: size_threshold,
|
61
|
+
cust_judge: cust_judge,
|
62
|
+
process_block:
|
63
|
+
proc do |f|
|
64
|
+
puts "filterd file: #{f}"
|
65
|
+
result_file << f << "\n"
|
66
|
+
end
|
67
|
+
).start
|
68
|
+
result_file.close
|
69
69
|
end
|
70
|
-
|
71
70
|
end
|
data/lib/list_spider.rb
CHANGED
@@ -2,7 +2,7 @@ require 'em-http-request'
|
|
2
2
|
require 'nokogiri'
|
3
3
|
require 'fileutils'
|
4
4
|
require 'set'
|
5
|
-
require
|
5
|
+
require 'addressable/uri'
|
6
6
|
require File.expand_path('../spider_helper', __FILE__)
|
7
7
|
require File.expand_path('../file_filter', __FILE__)
|
8
8
|
|
@@ -10,9 +10,7 @@ class TaskStruct
|
|
10
10
|
def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
|
11
11
|
@origin_href = href
|
12
12
|
@href = href
|
13
|
-
if @href.class ==
|
14
|
-
@href = SpiderHelper.string_to_uri(@href)
|
15
|
-
end
|
13
|
+
@href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
|
16
14
|
@local_path = local_path
|
17
15
|
@http_method = http_method
|
18
16
|
@params = params
|
@@ -21,16 +19,14 @@ class TaskStruct
|
|
21
19
|
@header = header
|
22
20
|
end
|
23
21
|
|
24
|
-
def ==
|
25
|
-
|
22
|
+
def ==(other)
|
23
|
+
other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
|
26
24
|
end
|
27
25
|
|
28
|
-
attr_accessor :origin_href
|
29
|
-
|
26
|
+
attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
|
30
27
|
end
|
31
28
|
|
32
29
|
module ListSpider
|
33
|
-
|
34
30
|
RANDOM_TIME = -1
|
35
31
|
NO_LIMIT_CONCURRENT = -1
|
36
32
|
DEFAULT_CONCURRNET_MAX = 50
|
@@ -38,21 +34,20 @@ module ListSpider
|
|
38
34
|
|
39
35
|
@random_time_range = 3..10
|
40
36
|
@conver_to_utf8 = false
|
41
|
-
@connection_opts = {connect_timeout: 60}
|
37
|
+
@connection_opts = { connect_timeout: 60 }
|
42
38
|
@overwrite_exist = false
|
43
39
|
@max_redirects = 10
|
44
40
|
@local_path_set = Set.new
|
45
41
|
|
46
42
|
class << self
|
47
|
-
|
48
43
|
attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
|
49
44
|
|
50
45
|
def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
|
51
46
|
@connection_opts = {
|
52
|
-
:
|
53
|
-
|
54
|
-
|
55
|
-
|
47
|
+
proxy: {
|
48
|
+
host: proxy_addr,
|
49
|
+
port: proxy_port
|
50
|
+
}
|
56
51
|
}
|
57
52
|
@connection_opts[:proxy][:authorization] = [username, password] if username && password
|
58
53
|
end
|
@@ -71,100 +66,102 @@ module ListSpider
|
|
71
66
|
multi = EventMachine::MultiRequest.new
|
72
67
|
begin_time = Time.now
|
73
68
|
|
74
|
-
for_each_proc =
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
end
|
82
|
-
|
83
|
-
if e.http_method == :post
|
84
|
-
opt[:body] = e.params unless e.params.empty?
|
85
|
-
if @connection_opts
|
86
|
-
w = EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
|
87
|
-
else
|
88
|
-
w = EventMachine::HttpRequest.new(e.href).post opt
|
69
|
+
for_each_proc =
|
70
|
+
proc do |e|
|
71
|
+
opt = { redirects: @max_redirects }
|
72
|
+
if e.header
|
73
|
+
opt[:head] = e.header
|
74
|
+
elsif defined? @header_option
|
75
|
+
opt[:head] = @header_option
|
89
76
|
end
|
90
|
-
|
91
|
-
if
|
92
|
-
opt[:
|
93
|
-
w =
|
77
|
+
|
78
|
+
if e.http_method == :post
|
79
|
+
opt[:body] = e.params unless e.params.empty?
|
80
|
+
w =
|
81
|
+
if @connection_opts
|
82
|
+
EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
|
83
|
+
else
|
84
|
+
EventMachine::HttpRequest.new(e.href).post opt
|
85
|
+
end
|
94
86
|
else
|
95
|
-
|
87
|
+
if @connection_opts
|
88
|
+
opt[:query] = e.params unless e.params.empty?
|
89
|
+
w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
|
90
|
+
else
|
91
|
+
w = EventMachine::HttpRequest.new(e.href).get opt
|
92
|
+
end
|
96
93
|
end
|
97
|
-
end
|
98
94
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
95
|
+
e.request_object = w
|
96
|
+
|
97
|
+
w.callback do
|
98
|
+
s = w.response_header.status
|
99
|
+
puts s
|
100
|
+
if s != 404
|
101
|
+
local_dir = File.dirname(e.local_path)
|
102
|
+
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
103
|
+
begin
|
104
|
+
File.open(e.local_path, 'w') do |f|
|
105
|
+
f << if @conver_to_utf8 == true
|
106
|
+
SpiderHelper.to_utf8(w.response)
|
107
|
+
else
|
108
|
+
w.response
|
109
|
+
end
|
113
110
|
end
|
111
|
+
succeed_list << e
|
112
|
+
rescue => e
|
113
|
+
puts e
|
114
114
|
end
|
115
|
-
succeed_list << e
|
116
|
-
rescue Exception => e
|
117
|
-
puts e
|
118
115
|
end
|
119
116
|
end
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
117
|
+
w.errback do
|
118
|
+
puts "errback:#{w.response_header}"
|
119
|
+
puts e.origin_href
|
120
|
+
puts e.href
|
121
|
+
puts w.response_header.status
|
122
|
+
failed_list << e
|
123
|
+
if e.http_method == :get
|
124
|
+
SpiderHelper.direct_http_get(e.href, e.local_path)
|
125
|
+
elsif e.http_method == :post
|
126
|
+
SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
|
127
|
+
end
|
131
128
|
end
|
132
|
-
}
|
133
129
|
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
130
|
+
begin
|
131
|
+
multi.add e.local_path, w
|
132
|
+
rescue => exception
|
133
|
+
puts exception
|
134
|
+
puts e.href
|
135
|
+
puts e.local_path
|
136
|
+
stop_machine
|
137
|
+
end
|
141
138
|
end
|
142
|
-
end
|
143
139
|
|
144
|
-
cb =
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
140
|
+
cb =
|
141
|
+
proc do
|
142
|
+
end_time = Time.now
|
143
|
+
puts "use time:#{end_time - begin_time} seconds"
|
144
|
+
if callback.nil?
|
145
|
+
stop_machine
|
146
|
+
else
|
147
|
+
callback.call(multi, succeed_list, failed_list)
|
148
|
+
end
|
151
149
|
end
|
152
|
-
|
153
|
-
|
154
|
-
multi.callback &cb
|
150
|
+
link_struct_list.each(&for_each_proc)
|
151
|
+
multi.callback(&cb)
|
155
152
|
end
|
156
153
|
|
157
154
|
def stop_machine
|
158
155
|
puts "success size:#{@succeed_size}"
|
159
156
|
puts "failed size:#{@failed_size}"
|
160
157
|
@end_time = Time.now
|
161
|
-
puts "total use time:#{@end_time
|
158
|
+
puts "total use time:#{@end_time - @begin_time} seconds"
|
162
159
|
EventMachine.stop
|
163
160
|
@local_path_set.clear
|
164
161
|
end
|
165
162
|
|
166
|
-
def
|
167
|
-
|
163
|
+
def next_task
|
164
|
+
@down_list.shift(@max)
|
168
165
|
end
|
169
166
|
|
170
167
|
def call_parse_method(e)
|
@@ -193,20 +190,20 @@ module ListSpider
|
|
193
190
|
end
|
194
191
|
end
|
195
192
|
|
196
|
-
def complete(
|
193
|
+
def complete(_multi, success_list, failed_list)
|
197
194
|
@succeed_size += success_list.size
|
198
195
|
@failed_size += failed_list.size
|
199
196
|
success_list.each do |e|
|
200
197
|
call_parse_method(e)
|
201
198
|
end
|
202
199
|
|
203
|
-
todo =
|
200
|
+
todo = next_task
|
204
201
|
|
205
202
|
if todo.empty?
|
206
203
|
stop_machine
|
207
204
|
else
|
208
205
|
if @interval != 0
|
209
|
-
if success_list.
|
206
|
+
if !success_list.empty? || !failed_list.empty?
|
210
207
|
if @interval == RANDOM_TIME
|
211
208
|
sleep(rand(@random_time_range))
|
212
209
|
else
|
@@ -219,7 +216,7 @@ module ListSpider
|
|
219
216
|
end
|
220
217
|
|
221
218
|
def event_machine_start_list(down_list, callback = nil)
|
222
|
-
EventMachine.run
|
219
|
+
EventMachine.run do
|
223
220
|
@begin_time = Time.now
|
224
221
|
if down_list.empty?
|
225
222
|
if callback
|
@@ -230,7 +227,7 @@ module ListSpider
|
|
230
227
|
else
|
231
228
|
event_machine_down(down_list, callback)
|
232
229
|
end
|
233
|
-
|
230
|
+
end
|
234
231
|
end
|
235
232
|
|
236
233
|
def filter_list(down_list)
|
@@ -242,7 +239,7 @@ module ListSpider
|
|
242
239
|
need_down_list << ts
|
243
240
|
end
|
244
241
|
end
|
245
|
-
|
242
|
+
need_down_list
|
246
243
|
end
|
247
244
|
|
248
245
|
def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
@@ -255,7 +252,7 @@ module ListSpider
|
|
255
252
|
|
256
253
|
need_down_list = filter_list(down_list)
|
257
254
|
|
258
|
-
@down_list
|
255
|
+
@down_list += need_down_list
|
259
256
|
@interval = interval
|
260
257
|
@max = max
|
261
258
|
@max = @down_list.size if @max == NO_LIMIT_CONCURRENT
|
@@ -263,7 +260,7 @@ module ListSpider
|
|
263
260
|
@failed_size = 0
|
264
261
|
|
265
262
|
puts "total size:#{@down_list.size}"
|
266
|
-
event_machine_start_list(
|
263
|
+
event_machine_start_list(next_task, method(:complete))
|
267
264
|
end
|
268
265
|
|
269
266
|
def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
|
@@ -273,19 +270,18 @@ module ListSpider
|
|
273
270
|
def add_task(task)
|
274
271
|
if task.is_a?Array
|
275
272
|
need_down_list = filter_list(task)
|
276
|
-
@down_list
|
273
|
+
@down_list += need_down_list
|
277
274
|
elsif task.is_a?TaskStruct
|
278
275
|
need_down_list = filter_list([task])
|
279
|
-
@down_list
|
276
|
+
@down_list += need_down_list
|
280
277
|
else
|
281
278
|
puts "error task type:#{task.class}"
|
282
279
|
end
|
283
280
|
end
|
284
281
|
end
|
285
282
|
|
286
|
-
Signal.trap(
|
283
|
+
Signal.trap('INT') do
|
287
284
|
ListSpider.stop_machine
|
288
285
|
exit!
|
289
286
|
end
|
290
|
-
|
291
287
|
end
|
data/lib/spider_helper.rb
CHANGED
@@ -2,50 +2,46 @@ require 'rchardet'
|
|
2
2
|
require 'net/http'
|
3
3
|
|
4
4
|
module SpiderHelper
|
5
|
-
|
6
5
|
class << self
|
7
|
-
|
8
6
|
def direct_http_get(href, local_path, params: nil, header: nil)
|
9
|
-
if href.class ==
|
10
|
-
href = string_to_uri(href)
|
11
|
-
end
|
7
|
+
href = string_to_uri(href) if href.class == ''.class
|
12
8
|
|
13
9
|
begin
|
14
10
|
href.query = URI.encode_www_form(params) if params
|
15
11
|
req = Net::HTTP::Get.new(href)
|
16
|
-
header.each{|k,v| req[k] = v} if header
|
12
|
+
header.each { |k, v| req[k] = v } if header
|
17
13
|
|
18
|
-
res =
|
19
|
-
|
20
|
-
|
14
|
+
res =
|
15
|
+
Net::HTTP.start(href.hostname, href.port) do |http|
|
16
|
+
http.request(req)
|
17
|
+
end
|
21
18
|
|
22
19
|
if res.is_a?(Net::HTTPSuccess)
|
23
20
|
local_dir = File.dirname(local_path)
|
24
21
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
25
22
|
File.write(local_path, res.body)
|
26
|
-
puts
|
23
|
+
puts 'succeed'
|
27
24
|
else
|
28
25
|
puts res
|
29
26
|
end
|
30
|
-
rescue
|
27
|
+
rescue => e
|
31
28
|
puts e.backtrace
|
32
29
|
puts e
|
33
30
|
end
|
34
31
|
end
|
35
32
|
|
36
33
|
def direct_http_post(href, local_path, params, header: nil)
|
37
|
-
if href.class ==
|
38
|
-
href = string_to_uri(href)
|
39
|
-
end
|
34
|
+
href = string_to_uri(href) if href.class == ''.class
|
40
35
|
|
41
36
|
begin
|
42
37
|
req = Net::HTTP::Post.new(href)
|
43
38
|
req.set_form_data(params)
|
44
|
-
header.each{|k,v| req[k] = v} if header
|
39
|
+
header.each { |k, v| req[k] = v } if header
|
45
40
|
|
46
|
-
res =
|
47
|
-
|
48
|
-
|
41
|
+
res =
|
42
|
+
Net::HTTP.start(href.hostname, href.port) do |http|
|
43
|
+
http.request(req)
|
44
|
+
end
|
49
45
|
|
50
46
|
if res.is_a?(Net::HTTPSuccess)
|
51
47
|
local_dir = File.dirname(local_path)
|
@@ -54,7 +50,7 @@ module SpiderHelper
|
|
54
50
|
else
|
55
51
|
puts res
|
56
52
|
end
|
57
|
-
rescue
|
53
|
+
rescue => e
|
58
54
|
puts e
|
59
55
|
end
|
60
56
|
end
|
@@ -70,34 +66,32 @@ module SpiderHelper
|
|
70
66
|
l.normalize!
|
71
67
|
end
|
72
68
|
|
73
|
-
BomHeaderMap = {
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
69
|
+
BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
|
70
|
+
'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
|
71
|
+
'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
|
72
|
+
'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
|
73
|
+
'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
|
78
74
|
|
79
|
-
|
75
|
+
# 此函数有时此判断有误,使用to_utf8函数直接转换
|
80
76
|
def smart_to_utf8(str)
|
81
77
|
return str if str.encoding == Encoding::UTF_8
|
82
78
|
to_utf8(str)
|
83
79
|
end
|
84
80
|
|
85
81
|
def to_utf8(str)
|
86
|
-
|
82
|
+
# 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
|
87
83
|
str.force_encoding(Encoding::ASCII_8BIT)
|
88
84
|
cd = CharDet.detect(str)
|
89
|
-
if cd[
|
90
|
-
puts cd[
|
91
|
-
str.force_encoding(cd[
|
92
|
-
|
93
|
-
|
94
|
-
str.sub!(
|
85
|
+
if cd['confidence'] > 0.6
|
86
|
+
puts cd['encoding']
|
87
|
+
str.force_encoding(cd['encoding'])
|
88
|
+
# 移除BOM头
|
89
|
+
bom_header = BomHeaderMap[cd['encoding']]
|
90
|
+
str.sub!(bom_header, '') if bom_header
|
95
91
|
end
|
96
|
-
str.encode!(Encoding::UTF_8, :
|
97
|
-
|
98
|
-
return str
|
99
|
-
end
|
92
|
+
str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
|
100
93
|
|
94
|
+
str
|
95
|
+
end
|
101
96
|
end
|
102
|
-
|
103
97
|
end
|