list_spider 1.0.0 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +5 -5
- data/lib/file_filter.rb +4 -2
- data/lib/list_spider/version.rb +1 -1
- data/lib/list_spider.rb +133 -115
- data/lib/spider_helper.rb +7 -5
- data/spider_example.rb +4 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 837d9e4cb2b3aa829466cf9eaa4f48a24b5d4ff5067bbc27fb67fbdb37eec291
|
4
|
+
data.tar.gz: 8d378b9e3240b8d9c3bdc9c7e32aceb39a16fc63310224dc7ce6a68a2c570893
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dd2c77aa71d8ff3d7ecba93fc6e30ec158b479dcffed9e3cc744944e2bcea3cb5425fc59f85acc22573bcbe3d1eb9a0967e7d0b1e11d3c9cb8d04a58450a0a7e
|
7
|
+
data.tar.gz: ec0e3ac5b2a09a3986eea20c69efc31c9536d1d96f77507e50755bfa07531c4bf7303317bc657a573dd8347bd304d8e93c9adbabf868918f0bdbe56c480e82e6
|
data/README.md
CHANGED
@@ -86,9 +86,9 @@ def parse_response(file_name)
|
|
86
86
|
end
|
87
87
|
|
88
88
|
|
89
|
-
#
|
89
|
+
# custom_data is passed by TaskStruct's custom_data param
|
90
90
|
|
91
|
-
def parse_response(file_name,
|
91
|
+
def parse_response(file_name, custom_data)
|
92
92
|
#...
|
93
93
|
end
|
94
94
|
|
@@ -99,7 +99,7 @@ end
|
|
99
99
|
# response_header.cookie
|
100
100
|
# response_header['Last-Modified']
|
101
101
|
|
102
|
-
def parse_response(file_name,
|
102
|
+
def parse_response(file_name, custom_data, response_header)
|
103
103
|
response_header.status
|
104
104
|
response_header['Last-Modified']
|
105
105
|
|
@@ -113,7 +113,7 @@ end
|
|
113
113
|
# req.uri
|
114
114
|
# req.host
|
115
115
|
# req.port
|
116
|
-
def parse_response(file_name,
|
116
|
+
def parse_response(file_name, custom_data, response_header, req)
|
117
117
|
puts req.body
|
118
118
|
puts req.headers
|
119
119
|
puts req.uri
|
@@ -128,7 +128,7 @@ end
|
|
128
128
|
## And there are many options you can use
|
129
129
|
|
130
130
|
```ruby
|
131
|
-
TaskStruct.new(href, local_path, http_method: :get, params: {},
|
131
|
+
TaskStruct.new(href, local_path, http_method: :get, params: {}, custom_data: nil, parse_method: nil, header: nil)
|
132
132
|
```
|
133
133
|
|
134
134
|
```ruby
|
data/lib/file_filter.rb
CHANGED
@@ -2,7 +2,8 @@
|
|
2
2
|
class FileFilter
|
3
3
|
# 4033
|
4
4
|
# 920
|
5
|
-
def initialize(dir_pattern, size_threshold: 1000,
|
5
|
+
def initialize(dir_pattern, size_threshold: 1000,
|
6
|
+
cust_judge: nil, process_block: nil)
|
6
7
|
@dir_pattern = dir_pattern
|
7
8
|
@size_threshold = size_threshold
|
8
9
|
@cust_judge = cust_judge ? cust_judge : method(:default_judge)
|
@@ -53,7 +54,8 @@ class FileFilter
|
|
53
54
|
).start
|
54
55
|
end
|
55
56
|
|
56
|
-
def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
|
57
|
+
def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
|
58
|
+
size_threshold: 1000, cust_judge: nil)
|
57
59
|
result_file = File.open(save_file_name, 'wt')
|
58
60
|
FileFilter.new(
|
59
61
|
dir_pattern,
|
data/lib/list_spider/version.rb
CHANGED
data/lib/list_spider.rb
CHANGED
@@ -8,22 +8,98 @@ require File.expand_path('../spider_helper', __FILE__)
|
|
8
8
|
require File.expand_path('../file_filter', __FILE__)
|
9
9
|
|
10
10
|
class TaskStruct
|
11
|
-
def initialize(href,
|
11
|
+
def initialize(href, # 请求链接
|
12
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
13
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
14
|
+
http_method: :get,
|
15
|
+
custom_data: nil, # 自定义数据
|
16
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
17
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,
|
18
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
19
|
+
# http.response_header.status 状态码
|
20
|
+
# http.response_header 返回头
|
21
|
+
# http.response 返回体
|
22
|
+
callback: nil,
|
23
|
+
# 请求失败后的回调
|
24
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
25
|
+
errback: nil,
|
26
|
+
stream_callback: nil, # 流数据处理回调
|
27
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
28
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
29
|
+
# request options
|
30
|
+
redirects: 3, # 重定向次数
|
31
|
+
# keepalive: nil, # (暂不支持)
|
32
|
+
file: nil, # 要上传的文件路径
|
33
|
+
# path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
34
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
35
|
+
body: nil, # 请求体,可以是string或hash类型
|
36
|
+
head: nil, # 请求头
|
37
|
+
# connection options
|
38
|
+
connect_timeout: 60, # 连接超时时间
|
39
|
+
inactivity_timeout: nil, # 连接后超时时间
|
40
|
+
# ssl设置
|
41
|
+
# ssl: {
|
42
|
+
# :private_key_file => '/tmp/server.key',
|
43
|
+
# :cert_chain_file => '/tmp/server.crt',
|
44
|
+
# :verify_peer => false
|
45
|
+
# }
|
46
|
+
ssl: nil,
|
47
|
+
# bind: {
|
48
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
49
|
+
# :port => '123'
|
50
|
+
# }
|
51
|
+
bind: nil,
|
52
|
+
# 代理设置
|
53
|
+
# proxy: {
|
54
|
+
# :host => '127.0.0.1', # proxy address
|
55
|
+
# :port => 9000, # proxy port
|
56
|
+
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
57
|
+
|
58
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
59
|
+
# }
|
60
|
+
proxy: nil)
|
12
61
|
@href = href
|
13
|
-
@href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
|
14
62
|
@local_path = local_path
|
15
63
|
@http_method = http_method
|
16
|
-
@
|
17
|
-
@extra_data = extra_data
|
64
|
+
@custom_data = custom_data
|
18
65
|
@parse_method = parse_method
|
19
|
-
@
|
66
|
+
@callback = callback
|
67
|
+
@errback = errback
|
68
|
+
@stream_callback = stream_callback
|
69
|
+
@convert_to_utf8 = convert_to_utf8
|
70
|
+
@overwrite_exist = overwrite_exist
|
71
|
+
|
72
|
+
@request_options = {
|
73
|
+
redirects: redirects,
|
74
|
+
# keepalive: keepalive,
|
75
|
+
file: file,
|
76
|
+
# path: path,
|
77
|
+
query: query,
|
78
|
+
body: body,
|
79
|
+
head: head
|
80
|
+
}.compact
|
81
|
+
|
82
|
+
@connection_options = {
|
83
|
+
connect_timeout: connect_timeout,
|
84
|
+
inactivity_timeout: inactivity_timeout,
|
85
|
+
ssl: ssl,
|
86
|
+
bind: bind,
|
87
|
+
proxy: proxy
|
88
|
+
}.compact
|
20
89
|
end
|
21
90
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
91
|
+
attr_accessor :href, :local_path,
|
92
|
+
:http_method,
|
93
|
+
:custom_data,
|
94
|
+
:request_object,
|
95
|
+
:parse_method,
|
96
|
+
:callback,
|
97
|
+
:errback,
|
98
|
+
:stream_callback,
|
99
|
+
:convert_to_utf8,
|
100
|
+
:overwrite_exist,
|
101
|
+
:request_options,
|
102
|
+
:connection_options
|
27
103
|
end
|
28
104
|
|
29
105
|
module ListSpider
|
@@ -33,33 +109,9 @@ module ListSpider
|
|
33
109
|
DEFAULT_INTERVAL = 0
|
34
110
|
|
35
111
|
@random_time_range = 3..10
|
36
|
-
@convert_to_utf8 = false
|
37
|
-
@connection_opts = { connect_timeout: 60 }
|
38
|
-
@overwrite_exist = false
|
39
|
-
@max_redirects = 10
|
40
112
|
@local_path_set = Set.new
|
41
113
|
|
42
114
|
class << self
|
43
|
-
attr_accessor :convert_to_utf8, :overwrite_exist, :max_redirects
|
44
|
-
|
45
|
-
def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
|
46
|
-
@connection_opts = {
|
47
|
-
proxy: {
|
48
|
-
host: proxy_addr,
|
49
|
-
port: proxy_port
|
50
|
-
}
|
51
|
-
}
|
52
|
-
@connection_opts[:proxy][:authorization] = [username, password] if username && password
|
53
|
-
end
|
54
|
-
|
55
|
-
def connect_timeout(max_connect_time)
|
56
|
-
@connection_opts[:connect_timeout] = max_connect_time
|
57
|
-
end
|
58
|
-
|
59
|
-
def set_header_option(header_option)
|
60
|
-
@header_option = header_option
|
61
|
-
end
|
62
|
-
|
63
115
|
def event_machine_down(link_struct_list, callback = nil)
|
64
116
|
failed_list = []
|
65
117
|
succeed_list = []
|
@@ -67,78 +119,65 @@ module ListSpider
|
|
67
119
|
begin_time = Time.now
|
68
120
|
|
69
121
|
for_each_proc =
|
70
|
-
proc do |
|
71
|
-
|
72
|
-
if
|
73
|
-
|
74
|
-
elsif defined? @header_option
|
75
|
-
opt[:head] = @header_option
|
76
|
-
end
|
77
|
-
|
78
|
-
if e.http_method == :post
|
79
|
-
opt[:body] = e.params unless e.params.empty?
|
80
|
-
w =
|
81
|
-
if @connection_opts
|
82
|
-
EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
|
83
|
-
else
|
84
|
-
EventMachine::HttpRequest.new(e.href).post opt
|
85
|
-
end
|
86
|
-
else
|
87
|
-
if @connection_opts
|
88
|
-
opt[:query] = e.params unless e.params.empty?
|
89
|
-
w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
|
90
|
-
else
|
91
|
-
w = EventMachine::HttpRequest.new(e.href).get opt
|
92
|
-
end
|
93
|
-
end
|
122
|
+
proc do |task_struct|
|
123
|
+
http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
|
124
|
+
http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
|
125
|
+
task_struct.request_object = http_req
|
94
126
|
|
95
|
-
|
96
|
-
|
97
|
-
w.callback do
|
98
|
-
s = w.response_header.status
|
127
|
+
http_req.callback do
|
128
|
+
s = http_req.response_header.status
|
99
129
|
puts s
|
100
|
-
|
101
|
-
|
130
|
+
|
131
|
+
if s == 200
|
132
|
+
local_dir = File.dirname(task_struct.local_path)
|
102
133
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
103
134
|
begin
|
104
|
-
File.open(
|
135
|
+
File.open(task_struct.local_path, 'wb') do |f|
|
105
136
|
f << if @convert_to_utf8 == true
|
106
|
-
SpiderHelper.to_utf8(
|
137
|
+
SpiderHelper.to_utf8(http_req.response)
|
107
138
|
else
|
108
|
-
|
139
|
+
http_req.response
|
109
140
|
end
|
110
141
|
end
|
111
|
-
|
112
|
-
|
113
|
-
|
142
|
+
call_parse_method(task_struct)
|
143
|
+
succeed_list << task_struct
|
144
|
+
rescue StandardError => exception
|
145
|
+
puts exception
|
114
146
|
end
|
115
147
|
end
|
148
|
+
task_struct.callback.call(task_struct, http_req) if task_struct.callback
|
116
149
|
end
|
117
|
-
w.errback do
|
118
|
-
puts "errback:#{w.response_header},retry..."
|
119
|
-
puts e.href
|
120
|
-
puts w.response_header.status
|
121
|
-
|
122
|
-
ret = false
|
123
|
-
if e.http_method == :get
|
124
|
-
ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
|
125
|
-
elsif e.http_method == :post
|
126
|
-
ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
|
127
|
-
end
|
128
150
|
|
129
|
-
|
130
|
-
|
151
|
+
http_req.errback do
|
152
|
+
puts "errback:#{http_req.response_header},retry..."
|
153
|
+
puts task_struct.href
|
154
|
+
puts http_req.response_header.status
|
155
|
+
|
156
|
+
if task_struct.errback
|
157
|
+
task_struct.errback.call(task_struct, http_req)
|
131
158
|
else
|
132
|
-
|
159
|
+
ret = false
|
160
|
+
if task_struct.http_method == :get
|
161
|
+
ret = SpiderHelper.direct_http_get(task_struct.href, task_struct.local_path, convert_to_utf8: @convert_to_utf8)
|
162
|
+
elsif task_struct.http_method == :post
|
163
|
+
ret = SpiderHelper.direct_http_post(task_struct.href, task_struct.local_path, task_struct.params, convert_to_utf8: @convert_to_utf8)
|
164
|
+
end
|
165
|
+
|
166
|
+
if ret
|
167
|
+
call_parse_method(task_struct)
|
168
|
+
succeed_list << task_struct
|
169
|
+
else
|
170
|
+
failed_list << task_struct
|
171
|
+
end
|
133
172
|
end
|
134
173
|
end
|
135
174
|
|
136
175
|
begin
|
137
|
-
multi.add
|
176
|
+
multi.add task_struct.local_path, http_req
|
138
177
|
rescue StandardError => exception
|
139
178
|
puts exception
|
140
|
-
puts
|
141
|
-
puts
|
179
|
+
puts task_struct.href
|
180
|
+
puts task_struct.local_path
|
142
181
|
stop_machine
|
143
182
|
end
|
144
183
|
end
|
@@ -170,38 +209,15 @@ module ListSpider
|
|
170
209
|
@down_list.shift(@max)
|
171
210
|
end
|
172
211
|
|
173
|
-
def call_parse_method(
|
174
|
-
|
175
|
-
if pm
|
176
|
-
case pm.arity
|
177
|
-
when 1
|
178
|
-
pm.call(e.local_path)
|
179
|
-
when 2
|
180
|
-
pm.call(e.local_path, e.extra_data)
|
181
|
-
when 3
|
182
|
-
res_header = nil
|
183
|
-
res_header = e.request_object.response_header if e.request_object
|
184
|
-
pm.call(e.local_path, e.extra_data, res_header)
|
185
|
-
when 4
|
186
|
-
res_header = nil
|
187
|
-
res_header = e.request_object.response_header if e.request_object
|
188
|
-
|
189
|
-
req = nil
|
190
|
-
req = e.request_object.req if e.request_object
|
191
|
-
|
192
|
-
pm.call(e.local_path, e.extra_data, res_header, req)
|
193
|
-
else
|
194
|
-
puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3, 4"
|
195
|
-
end
|
196
|
-
end
|
212
|
+
def call_parse_method(task_struct)
|
213
|
+
task_struct.parse_method.call(task_struct) if task_struct.parse_method
|
197
214
|
end
|
198
215
|
|
199
216
|
def complete(_multi, success_list, failed_list)
|
200
217
|
@succeed_size += success_list.size
|
201
218
|
@failed_size += failed_list.size
|
202
|
-
success_list
|
203
|
-
|
204
|
-
end
|
219
|
+
@succeed_list.concat(success_list)
|
220
|
+
@failed_list.concat(failed_list)
|
205
221
|
|
206
222
|
todo = next_task
|
207
223
|
|
@@ -223,6 +239,8 @@ module ListSpider
|
|
223
239
|
|
224
240
|
def event_machine_start_list(down_list, callback = nil)
|
225
241
|
EventMachine.run do
|
242
|
+
@succeed_list = []
|
243
|
+
@failed_list = []
|
226
244
|
@begin_time = Time.now
|
227
245
|
if down_list.empty?
|
228
246
|
if callback
|
@@ -239,7 +257,7 @@ module ListSpider
|
|
239
257
|
def filter_list(down_list)
|
240
258
|
need_down_list = []
|
241
259
|
down_list.each do |ts|
|
242
|
-
if
|
260
|
+
if !ts.overwrite_exist && File.exist?(ts.local_path)
|
243
261
|
call_parse_method(ts)
|
244
262
|
elsif @local_path_set.add?(ts.local_path)
|
245
263
|
need_down_list << ts
|
data/lib/spider_helper.rb
CHANGED
@@ -3,8 +3,9 @@ require 'net/http'
|
|
3
3
|
|
4
4
|
module SpiderHelper
|
5
5
|
class << self
|
6
|
-
def direct_http_get(href, local_path, params: nil,
|
7
|
-
|
6
|
+
def direct_http_get(href, local_path, params: nil,
|
7
|
+
header: nil, convert_to_utf8: false)
|
8
|
+
href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
|
8
9
|
|
9
10
|
begin
|
10
11
|
href.query = URI.encode_www_form(params) if params
|
@@ -35,8 +36,9 @@ module SpiderHelper
|
|
35
36
|
false
|
36
37
|
end
|
37
38
|
|
38
|
-
def direct_http_post(href, local_path, params,
|
39
|
-
|
39
|
+
def direct_http_post(href, local_path, params,
|
40
|
+
header: nil, convert_to_utf8: false)
|
41
|
+
href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
|
40
42
|
|
41
43
|
begin
|
42
44
|
req = Net::HTTP::Post.new(href)
|
@@ -72,7 +74,7 @@ module SpiderHelper
|
|
72
74
|
|
73
75
|
def string_to_uri(href)
|
74
76
|
l = href
|
75
|
-
l.sub!('http:///', 'http://')
|
77
|
+
l.sub!('http:///', 'http://')
|
76
78
|
l = Addressable::URI.parse(l)
|
77
79
|
l.normalize!
|
78
80
|
end
|
data/spider_example.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
-
require 'list_spider'
|
2
|
-
|
1
|
+
# require 'list_spider'
|
2
|
+
require File.expand_path('../lib/list_spider', __FILE__)
|
3
3
|
|
4
4
|
DOWNLOAD_DIR = 'coolshell/'.freeze
|
5
5
|
|
6
|
-
def parse_index_item(
|
7
|
-
content = File.read(
|
6
|
+
def parse_index_item(e)
|
7
|
+
content = File.read(e.local_path)
|
8
8
|
doc = Nokogiri::HTML(content)
|
9
9
|
list_group = doc.css('h2.entry-title')
|
10
10
|
link_list = list_group.css('a')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-02-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|