list_spider 1.0.0 → 2.0.0
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +5 -5
- data/lib/file_filter.rb +4 -2
- data/lib/list_spider/version.rb +1 -1
- data/lib/list_spider.rb +133 -115
- data/lib/spider_helper.rb +7 -5
- data/spider_example.rb +4 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 837d9e4cb2b3aa829466cf9eaa4f48a24b5d4ff5067bbc27fb67fbdb37eec291
|
4
|
+
data.tar.gz: 8d378b9e3240b8d9c3bdc9c7e32aceb39a16fc63310224dc7ce6a68a2c570893
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dd2c77aa71d8ff3d7ecba93fc6e30ec158b479dcffed9e3cc744944e2bcea3cb5425fc59f85acc22573bcbe3d1eb9a0967e7d0b1e11d3c9cb8d04a58450a0a7e
|
7
|
+
data.tar.gz: ec0e3ac5b2a09a3986eea20c69efc31c9536d1d96f77507e50755bfa07531c4bf7303317bc657a573dd8347bd304d8e93c9adbabf868918f0bdbe56c480e82e6
|
data/README.md
CHANGED
@@ -86,9 +86,9 @@ def parse_response(file_name)
|
|
86
86
|
end
|
87
87
|
|
88
88
|
|
89
|
-
#
|
89
|
+
# custom_data is passed by TaskStruct's custom_data param
|
90
90
|
|
91
|
-
def parse_response(file_name,
|
91
|
+
def parse_response(file_name, custom_data)
|
92
92
|
#...
|
93
93
|
end
|
94
94
|
|
@@ -99,7 +99,7 @@ end
|
|
99
99
|
# response_header.cookie
|
100
100
|
# response_header['Last-Modified']
|
101
101
|
|
102
|
-
def parse_response(file_name,
|
102
|
+
def parse_response(file_name, custom_data, response_header)
|
103
103
|
response_header.status
|
104
104
|
response_header['Last-Modified']
|
105
105
|
|
@@ -113,7 +113,7 @@ end
|
|
113
113
|
# req.uri
|
114
114
|
# req.host
|
115
115
|
# req.port
|
116
|
-
def parse_response(file_name,
|
116
|
+
def parse_response(file_name, custom_data, response_header, req)
|
117
117
|
puts req.body
|
118
118
|
puts req.headers
|
119
119
|
puts req.uri
|
@@ -128,7 +128,7 @@ end
|
|
128
128
|
## And there are many options you can use
|
129
129
|
|
130
130
|
```ruby
|
131
|
-
TaskStruct.new(href, local_path, http_method: :get, params: {},
|
131
|
+
TaskStruct.new(href, local_path, http_method: :get, params: {}, custom_data: nil, parse_method: nil, header: nil)
|
132
132
|
```
|
133
133
|
|
134
134
|
```ruby
|
data/lib/file_filter.rb
CHANGED
@@ -2,7 +2,8 @@
|
|
2
2
|
class FileFilter
|
3
3
|
# 4033
|
4
4
|
# 920
|
5
|
-
def initialize(dir_pattern, size_threshold: 1000,
|
5
|
+
def initialize(dir_pattern, size_threshold: 1000,
|
6
|
+
cust_judge: nil, process_block: nil)
|
6
7
|
@dir_pattern = dir_pattern
|
7
8
|
@size_threshold = size_threshold
|
8
9
|
@cust_judge = cust_judge ? cust_judge : method(:default_judge)
|
@@ -53,7 +54,8 @@ class FileFilter
|
|
53
54
|
).start
|
54
55
|
end
|
55
56
|
|
56
|
-
def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
|
57
|
+
def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
|
58
|
+
size_threshold: 1000, cust_judge: nil)
|
57
59
|
result_file = File.open(save_file_name, 'wt')
|
58
60
|
FileFilter.new(
|
59
61
|
dir_pattern,
|
data/lib/list_spider/version.rb
CHANGED
data/lib/list_spider.rb
CHANGED
@@ -8,22 +8,98 @@ require File.expand_path('../spider_helper', __FILE__)
|
|
8
8
|
require File.expand_path('../file_filter', __FILE__)
|
9
9
|
|
10
10
|
class TaskStruct
|
11
|
-
def initialize(href,
|
11
|
+
def initialize(href, # 请求链接
|
12
|
+
local_path, # 保存数据的本地路径(此路径作为去重标准)
|
13
|
+
# http方法,取值::get, :head, :delete, :put, :post, :patch, :options
|
14
|
+
http_method: :get,
|
15
|
+
custom_data: nil, # 自定义数据
|
16
|
+
parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
|
17
|
+
# 请求成功后的回调,此时可能没有保存文件,比如301,
|
18
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
19
|
+
# http.response_header.status 状态码
|
20
|
+
# http.response_header 返回头
|
21
|
+
# http.response 返回体
|
22
|
+
callback: nil,
|
23
|
+
# 请求失败后的回调
|
24
|
+
# 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
|
25
|
+
errback: nil,
|
26
|
+
stream_callback: nil, # 流数据处理回调
|
27
|
+
convert_to_utf8: false, # 是否转换为utf8编码
|
28
|
+
overwrite_exist: false, # 是否覆盖现有文件
|
29
|
+
# request options
|
30
|
+
redirects: 3, # 重定向次数
|
31
|
+
# keepalive: nil, # (暂不支持)
|
32
|
+
file: nil, # 要上传的文件路径
|
33
|
+
# path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
|
34
|
+
query: nil, # 查询字符串,可以是string或hash类型
|
35
|
+
body: nil, # 请求体,可以是string或hash类型
|
36
|
+
head: nil, # 请求头
|
37
|
+
# connection options
|
38
|
+
connect_timeout: 60, # 连接超时时间
|
39
|
+
inactivity_timeout: nil, # 连接后超时时间
|
40
|
+
# ssl设置
|
41
|
+
# ssl: {
|
42
|
+
# :private_key_file => '/tmp/server.key',
|
43
|
+
# :cert_chain_file => '/tmp/server.crt',
|
44
|
+
# :verify_peer => false
|
45
|
+
# }
|
46
|
+
ssl: nil,
|
47
|
+
# bind: {
|
48
|
+
# :host => '123.123.123.123', # use a specific interface for outbound request
|
49
|
+
# :port => '123'
|
50
|
+
# }
|
51
|
+
bind: nil,
|
52
|
+
# 代理设置
|
53
|
+
# proxy: {
|
54
|
+
# :host => '127.0.0.1', # proxy address
|
55
|
+
# :port => 9000, # proxy port
|
56
|
+
# :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
|
57
|
+
|
58
|
+
# :authorization => ['user', 'pass'] # proxy authorization header
|
59
|
+
# }
|
60
|
+
proxy: nil)
|
12
61
|
@href = href
|
13
|
-
@href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
|
14
62
|
@local_path = local_path
|
15
63
|
@http_method = http_method
|
16
|
-
@
|
17
|
-
@extra_data = extra_data
|
64
|
+
@custom_data = custom_data
|
18
65
|
@parse_method = parse_method
|
19
|
-
@
|
66
|
+
@callback = callback
|
67
|
+
@errback = errback
|
68
|
+
@stream_callback = stream_callback
|
69
|
+
@convert_to_utf8 = convert_to_utf8
|
70
|
+
@overwrite_exist = overwrite_exist
|
71
|
+
|
72
|
+
@request_options = {
|
73
|
+
redirects: redirects,
|
74
|
+
# keepalive: keepalive,
|
75
|
+
file: file,
|
76
|
+
# path: path,
|
77
|
+
query: query,
|
78
|
+
body: body,
|
79
|
+
head: head
|
80
|
+
}.compact
|
81
|
+
|
82
|
+
@connection_options = {
|
83
|
+
connect_timeout: connect_timeout,
|
84
|
+
inactivity_timeout: inactivity_timeout,
|
85
|
+
ssl: ssl,
|
86
|
+
bind: bind,
|
87
|
+
proxy: proxy
|
88
|
+
}.compact
|
20
89
|
end
|
21
90
|
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
91
|
+
attr_accessor :href, :local_path,
|
92
|
+
:http_method,
|
93
|
+
:custom_data,
|
94
|
+
:request_object,
|
95
|
+
:parse_method,
|
96
|
+
:callback,
|
97
|
+
:errback,
|
98
|
+
:stream_callback,
|
99
|
+
:convert_to_utf8,
|
100
|
+
:overwrite_exist,
|
101
|
+
:request_options,
|
102
|
+
:connection_options
|
27
103
|
end
|
28
104
|
|
29
105
|
module ListSpider
|
@@ -33,33 +109,9 @@ module ListSpider
|
|
33
109
|
DEFAULT_INTERVAL = 0
|
34
110
|
|
35
111
|
@random_time_range = 3..10
|
36
|
-
@convert_to_utf8 = false
|
37
|
-
@connection_opts = { connect_timeout: 60 }
|
38
|
-
@overwrite_exist = false
|
39
|
-
@max_redirects = 10
|
40
112
|
@local_path_set = Set.new
|
41
113
|
|
42
114
|
class << self
|
43
|
-
attr_accessor :convert_to_utf8, :overwrite_exist, :max_redirects
|
44
|
-
|
45
|
-
def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
|
46
|
-
@connection_opts = {
|
47
|
-
proxy: {
|
48
|
-
host: proxy_addr,
|
49
|
-
port: proxy_port
|
50
|
-
}
|
51
|
-
}
|
52
|
-
@connection_opts[:proxy][:authorization] = [username, password] if username && password
|
53
|
-
end
|
54
|
-
|
55
|
-
def connect_timeout(max_connect_time)
|
56
|
-
@connection_opts[:connect_timeout] = max_connect_time
|
57
|
-
end
|
58
|
-
|
59
|
-
def set_header_option(header_option)
|
60
|
-
@header_option = header_option
|
61
|
-
end
|
62
|
-
|
63
115
|
def event_machine_down(link_struct_list, callback = nil)
|
64
116
|
failed_list = []
|
65
117
|
succeed_list = []
|
@@ -67,78 +119,65 @@ module ListSpider
|
|
67
119
|
begin_time = Time.now
|
68
120
|
|
69
121
|
for_each_proc =
|
70
|
-
proc do |
|
71
|
-
|
72
|
-
if
|
73
|
-
|
74
|
-
elsif defined? @header_option
|
75
|
-
opt[:head] = @header_option
|
76
|
-
end
|
77
|
-
|
78
|
-
if e.http_method == :post
|
79
|
-
opt[:body] = e.params unless e.params.empty?
|
80
|
-
w =
|
81
|
-
if @connection_opts
|
82
|
-
EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
|
83
|
-
else
|
84
|
-
EventMachine::HttpRequest.new(e.href).post opt
|
85
|
-
end
|
86
|
-
else
|
87
|
-
if @connection_opts
|
88
|
-
opt[:query] = e.params unless e.params.empty?
|
89
|
-
w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
|
90
|
-
else
|
91
|
-
w = EventMachine::HttpRequest.new(e.href).get opt
|
92
|
-
end
|
93
|
-
end
|
122
|
+
proc do |task_struct|
|
123
|
+
http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
|
124
|
+
http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
|
125
|
+
task_struct.request_object = http_req
|
94
126
|
|
95
|
-
|
96
|
-
|
97
|
-
w.callback do
|
98
|
-
s = w.response_header.status
|
127
|
+
http_req.callback do
|
128
|
+
s = http_req.response_header.status
|
99
129
|
puts s
|
100
|
-
|
101
|
-
|
130
|
+
|
131
|
+
if s == 200
|
132
|
+
local_dir = File.dirname(task_struct.local_path)
|
102
133
|
FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
|
103
134
|
begin
|
104
|
-
File.open(
|
135
|
+
File.open(task_struct.local_path, 'wb') do |f|
|
105
136
|
f << if @convert_to_utf8 == true
|
106
|
-
SpiderHelper.to_utf8(
|
137
|
+
SpiderHelper.to_utf8(http_req.response)
|
107
138
|
else
|
108
|
-
|
139
|
+
http_req.response
|
109
140
|
end
|
110
141
|
end
|
111
|
-
|
112
|
-
|
113
|
-
|
142
|
+
call_parse_method(task_struct)
|
143
|
+
succeed_list << task_struct
|
144
|
+
rescue StandardError => exception
|
145
|
+
puts exception
|
114
146
|
end
|
115
147
|
end
|
148
|
+
task_struct.callback.call(task_struct, http_req) if task_struct.callback
|
116
149
|
end
|
117
|
-
w.errback do
|
118
|
-
puts "errback:#{w.response_header},retry..."
|
119
|
-
puts e.href
|
120
|
-
puts w.response_header.status
|
121
|
-
|
122
|
-
ret = false
|
123
|
-
if e.http_method == :get
|
124
|
-
ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
|
125
|
-
elsif e.http_method == :post
|
126
|
-
ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
|
127
|
-
end
|
128
150
|
|
129
|
-
|
130
|
-
|
151
|
+
http_req.errback do
|
152
|
+
puts "errback:#{http_req.response_header},retry..."
|
153
|
+
puts task_struct.href
|
154
|
+
puts http_req.response_header.status
|
155
|
+
|
156
|
+
if task_struct.errback
|
157
|
+
task_struct.errback.call(task_struct, http_req)
|
131
158
|
else
|
132
|
-
|
159
|
+
ret = false
|
160
|
+
if task_struct.http_method == :get
|
161
|
+
ret = SpiderHelper.direct_http_get(task_struct.href, task_struct.local_path, convert_to_utf8: @convert_to_utf8)
|
162
|
+
elsif task_struct.http_method == :post
|
163
|
+
ret = SpiderHelper.direct_http_post(task_struct.href, task_struct.local_path, task_struct.params, convert_to_utf8: @convert_to_utf8)
|
164
|
+
end
|
165
|
+
|
166
|
+
if ret
|
167
|
+
call_parse_method(task_struct)
|
168
|
+
succeed_list << task_struct
|
169
|
+
else
|
170
|
+
failed_list << task_struct
|
171
|
+
end
|
133
172
|
end
|
134
173
|
end
|
135
174
|
|
136
175
|
begin
|
137
|
-
multi.add
|
176
|
+
multi.add task_struct.local_path, http_req
|
138
177
|
rescue StandardError => exception
|
139
178
|
puts exception
|
140
|
-
puts
|
141
|
-
puts
|
179
|
+
puts task_struct.href
|
180
|
+
puts task_struct.local_path
|
142
181
|
stop_machine
|
143
182
|
end
|
144
183
|
end
|
@@ -170,38 +209,15 @@ module ListSpider
|
|
170
209
|
@down_list.shift(@max)
|
171
210
|
end
|
172
211
|
|
173
|
-
def call_parse_method(
|
174
|
-
|
175
|
-
if pm
|
176
|
-
case pm.arity
|
177
|
-
when 1
|
178
|
-
pm.call(e.local_path)
|
179
|
-
when 2
|
180
|
-
pm.call(e.local_path, e.extra_data)
|
181
|
-
when 3
|
182
|
-
res_header = nil
|
183
|
-
res_header = e.request_object.response_header if e.request_object
|
184
|
-
pm.call(e.local_path, e.extra_data, res_header)
|
185
|
-
when 4
|
186
|
-
res_header = nil
|
187
|
-
res_header = e.request_object.response_header if e.request_object
|
188
|
-
|
189
|
-
req = nil
|
190
|
-
req = e.request_object.req if e.request_object
|
191
|
-
|
192
|
-
pm.call(e.local_path, e.extra_data, res_header, req)
|
193
|
-
else
|
194
|
-
puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3, 4"
|
195
|
-
end
|
196
|
-
end
|
212
|
+
def call_parse_method(task_struct)
|
213
|
+
task_struct.parse_method.call(task_struct) if task_struct.parse_method
|
197
214
|
end
|
198
215
|
|
199
216
|
def complete(_multi, success_list, failed_list)
|
200
217
|
@succeed_size += success_list.size
|
201
218
|
@failed_size += failed_list.size
|
202
|
-
success_list
|
203
|
-
|
204
|
-
end
|
219
|
+
@succeed_list.concat(success_list)
|
220
|
+
@failed_list.concat(failed_list)
|
205
221
|
|
206
222
|
todo = next_task
|
207
223
|
|
@@ -223,6 +239,8 @@ module ListSpider
|
|
223
239
|
|
224
240
|
def event_machine_start_list(down_list, callback = nil)
|
225
241
|
EventMachine.run do
|
242
|
+
@succeed_list = []
|
243
|
+
@failed_list = []
|
226
244
|
@begin_time = Time.now
|
227
245
|
if down_list.empty?
|
228
246
|
if callback
|
@@ -239,7 +257,7 @@ module ListSpider
|
|
239
257
|
def filter_list(down_list)
|
240
258
|
need_down_list = []
|
241
259
|
down_list.each do |ts|
|
242
|
-
if
|
260
|
+
if !ts.overwrite_exist && File.exist?(ts.local_path)
|
243
261
|
call_parse_method(ts)
|
244
262
|
elsif @local_path_set.add?(ts.local_path)
|
245
263
|
need_down_list << ts
|
data/lib/spider_helper.rb
CHANGED
@@ -3,8 +3,9 @@ require 'net/http'
|
|
3
3
|
|
4
4
|
module SpiderHelper
|
5
5
|
class << self
|
6
|
-
def direct_http_get(href, local_path, params: nil,
|
7
|
-
|
6
|
+
def direct_http_get(href, local_path, params: nil,
|
7
|
+
header: nil, convert_to_utf8: false)
|
8
|
+
href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
|
8
9
|
|
9
10
|
begin
|
10
11
|
href.query = URI.encode_www_form(params) if params
|
@@ -35,8 +36,9 @@ module SpiderHelper
|
|
35
36
|
false
|
36
37
|
end
|
37
38
|
|
38
|
-
def direct_http_post(href, local_path, params,
|
39
|
-
|
39
|
+
def direct_http_post(href, local_path, params,
|
40
|
+
header: nil, convert_to_utf8: false)
|
41
|
+
href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
|
40
42
|
|
41
43
|
begin
|
42
44
|
req = Net::HTTP::Post.new(href)
|
@@ -72,7 +74,7 @@ module SpiderHelper
|
|
72
74
|
|
73
75
|
def string_to_uri(href)
|
74
76
|
l = href
|
75
|
-
l.sub!('http:///', 'http://')
|
77
|
+
l.sub!('http:///', 'http://')
|
76
78
|
l = Addressable::URI.parse(l)
|
77
79
|
l.normalize!
|
78
80
|
end
|
data/spider_example.rb
CHANGED
@@ -1,10 +1,10 @@
|
|
1
|
-
require 'list_spider'
|
2
|
-
|
1
|
+
# require 'list_spider'
|
2
|
+
require File.expand_path('../lib/list_spider', __FILE__)
|
3
3
|
|
4
4
|
DOWNLOAD_DIR = 'coolshell/'.freeze
|
5
5
|
|
6
|
-
def parse_index_item(
|
7
|
-
content = File.read(
|
6
|
+
def parse_index_item(e)
|
7
|
+
content = File.read(e.local_path)
|
8
8
|
doc = Nokogiri::HTML(content)
|
9
9
|
list_group = doc.css('h2.entry-title')
|
10
10
|
link_list = list_group.css('a')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: list_spider
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Charles Zhang
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-02-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|