list_spider 1.0.0 → 2.0.0

This diff shows the content of these publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 197035f7521ba4c326c0181c7133afe4c5d7bacfc3246795dc32758dce40da64
-  data.tar.gz: 89d14776f4c041806b6b9e164b31e651d03746c74d83505d5a32c1aeeaa62aa2
+  metadata.gz: 837d9e4cb2b3aa829466cf9eaa4f48a24b5d4ff5067bbc27fb67fbdb37eec291
+  data.tar.gz: 8d378b9e3240b8d9c3bdc9c7e32aceb39a16fc63310224dc7ce6a68a2c570893
 SHA512:
-  metadata.gz: a1b38832345203ec036ff4f8e11fba1d92e8ec58674d05ef129784a9e274dcd03ef421fa3db6e38bc38d7bb1cf3c54b7d56cbb321a5340bbe197fe57099ed077
-  data.tar.gz: 43de7e093004c823abb3c51a053869fd294af7fee9f9724c499af572ead7d5ba79d7ab9bb16b2baae1e00a1d198f89fcfbbedc35f57a3a8ed00f7f785d40cbfc
+  metadata.gz: dd2c77aa71d8ff3d7ecba93fc6e30ec158b479dcffed9e3cc744944e2bcea3cb5425fc59f85acc22573bcbe3d1eb9a0967e7d0b1e11d3c9cb8d04a58450a0a7e
+  data.tar.gz: ec0e3ac5b2a09a3986eea20c69efc31c9536d1d96f77507e50755bfa07531c4bf7303317bc657a573dd8347bd304d8e93c9adbabf868918f0bdbe56c480e82e6
data/README.md CHANGED
@@ -86,9 +86,9 @@ def parse_response(file_name)
 end
 
 
-# extra_data is passed by TaskStruct's extra_data param
+# custom_data is passed by TaskStruct's custom_data param
 
-def parse_response(file_name, extra_data)
+def parse_response(file_name, custom_data)
   #...
 end
 
@@ -99,7 +99,7 @@ end
 # response_header.cookie
 # response_header['Last-Modified']
 
-def parse_response(file_name, extra_data, response_header)
+def parse_response(file_name, custom_data, response_header)
   response_header.status
   response_header['Last-Modified']
 
@@ -113,7 +113,7 @@ end
 # req.uri
 # req.host
 # req.port
-def parse_response(file_name, extra_data, response_header, req)
+def parse_response(file_name, custom_data, response_header, req)
   puts req.body
   puts req.headers
   puts req.uri
@@ -128,7 +128,7 @@ end
 ## And there are many options you can use
 
 ```ruby
-TaskStruct.new(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
+TaskStruct.new(href, local_path, http_method: :get, params: {}, custom_data: nil, parse_method: nil, header: nil)
 ```
 
 ```ruby
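
The README hunks above only rename extra_data to custom_data; the constructor line still documents the 1.0.0-era params: and header: keywords, which the 2.0.0 TaskStruct shown later in this diff no longer accepts, and in 2.0.0 parse_method receives the TaskStruct itself. A minimal sketch of the renamed option under that 2.0.0 behaviour follows; the URL, paths, and callback name are placeholders, and ListSpider.get_list is assumed to be the unchanged public entry point (it is not part of this diff).

```ruby
require 'list_spider'

# Hypothetical parse callback: in 2.0.0 parse_method receives the TaskStruct itself.
parse_page = proc do |task|
  puts task.local_path   # where the response body was written
  puts task.custom_data  # renamed from extra_data in 1.0.0
end

task = TaskStruct.new(
  'http://www.example.com/1.html', # placeholder URL
  'download/1.html',               # local_path, used as the de-duplication key
  custom_data: { page: 1 },        # was extra_data: in 1.0.0
  parse_method: parse_page
)

# ListSpider.get_list([task]) # assumed entry point, unchanged by this diff
```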
data/lib/file_filter.rb CHANGED
@@ -2,7 +2,8 @@
 class FileFilter
   # 4033
   # 920
-  def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
+  def initialize(dir_pattern, size_threshold: 1000,
+                 cust_judge: nil, process_block: nil)
     @dir_pattern = dir_pattern
     @size_threshold = size_threshold
     @cust_judge = cust_judge ? cust_judge : method(:default_judge)
@@ -53,7 +54,8 @@ class FileFilter
     ).start
   end
 
-  def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
+  def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
+                             size_threshold: 1000, cust_judge: nil)
     result_file = File.open(save_file_name, 'wt')
     FileFilter.new(
       dir_pattern,
@@ -1,3 +1,3 @@
 module ListSpider
-  VERSION = '1.0.0'.freeze
+  VERSION = '2.0.0'.freeze
 end
data/lib/list_spider.rb CHANGED
@@ -8,22 +8,98 @@ require File.expand_path('../spider_helper', __FILE__)
 require File.expand_path('../file_filter', __FILE__)
 
 class TaskStruct
-  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
+  def initialize(href, # request URL
+                 local_path, # local path to save the data (used as the de-duplication key)
+                 # HTTP method, one of :get, :head, :delete, :put, :post, :patch, :options
+                 http_method: :get,
+                 custom_data: nil, # custom user data
+                 parse_method: nil, # callback for parsing the saved file; receives the TaskStruct object itself
+                 # callback invoked after a successful request, possibly before any file is saved (e.g. a 301);
+                 # receives the TaskStruct object itself and the corresponding EventMachine::HttpRequest object
+                 # http.response_header.status  status code
+                 # http.response_header  response headers
+                 # http.response  response body
+                 callback: nil,
+                 # callback invoked after a failed request;
+                 # receives the TaskStruct object itself and the corresponding EventMachine::HttpRequest object
+                 errback: nil,
+                 stream_callback: nil, # callback for handling streamed data
+                 convert_to_utf8: false, # whether to convert the response to UTF-8
+                 overwrite_exist: false, # whether to overwrite an existing file
+                 # request options
+                 redirects: 3, # number of redirects to follow
+                 # keepalive: nil, # (not supported yet)
+                 file: nil, # path of a file to upload
+                 # path: nil, # request path, useful for pipelined requests (not supported yet)
+                 query: nil, # query string, either a String or a Hash
+                 body: nil, # request body, either a String or a Hash
+                 head: nil, # request headers
+                 # connection options
+                 connect_timeout: 60, # connection timeout
+                 inactivity_timeout: nil, # timeout after the connection is established
+                 # SSL settings
+                 # ssl: {
+                 #   :private_key_file => '/tmp/server.key',
+                 #   :cert_chain_file => '/tmp/server.crt',
+                 #   :verify_peer => false
+                 # }
+                 ssl: nil,
+                 # bind: {
+                 #   :host => '123.123.123.123', # use a specific interface for outbound request
+                 #   :port => '123'
+                 # }
+                 bind: nil,
+                 # proxy settings
+                 # proxy: {
+                 #   :host => '127.0.0.1', # proxy address
+                 #   :port => 9000, # proxy port
+                 #   :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
+
+                 #   :authorization => ['user', 'pass'] # proxy authorization header
+                 # }
+                 proxy: nil)
     @href = href
-    @href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
     @local_path = local_path
     @http_method = http_method
-    @params = params
-    @extra_data = extra_data
+    @custom_data = custom_data
     @parse_method = parse_method
-    @header = header
+    @callback = callback
+    @errback = errback
+    @stream_callback = stream_callback
+    @convert_to_utf8 = convert_to_utf8
+    @overwrite_exist = overwrite_exist
+
+    @request_options = {
+      redirects: redirects,
+      # keepalive: keepalive,
+      file: file,
+      # path: path,
+      query: query,
+      body: body,
+      head: head
+    }.compact
+
+    @connection_options = {
+      connect_timeout: connect_timeout,
+      inactivity_timeout: inactivity_timeout,
+      ssl: ssl,
+      bind: bind,
+      proxy: proxy
+    }.compact
   end
 
-  def ==(other)
-    other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
-  end
-
-  attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
+  attr_accessor :href, :local_path,
+                :http_method,
+                :custom_data,
+                :request_object,
+                :parse_method,
+                :callback,
+                :errback,
+                :stream_callback,
+                :convert_to_utf8,
+                :overwrite_exist,
+                :request_options,
+                :connection_options
 end
 
 module ListSpider
@@ -33,33 +109,9 @@ module ListSpider
   DEFAULT_INTERVAL = 0
 
   @random_time_range = 3..10
-  @convert_to_utf8 = false
-  @connection_opts = { connect_timeout: 60 }
-  @overwrite_exist = false
-  @max_redirects = 10
   @local_path_set = Set.new
 
   class << self
-    attr_accessor :convert_to_utf8, :overwrite_exist, :max_redirects
-
-    def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
-      @connection_opts = {
-        proxy: {
-          host: proxy_addr,
-          port: proxy_port
-        }
-      }
-      @connection_opts[:proxy][:authorization] = [username, password] if username && password
-    end
-
-    def connect_timeout(max_connect_time)
-      @connection_opts[:connect_timeout] = max_connect_time
-    end
-
-    def set_header_option(header_option)
-      @header_option = header_option
-    end
-
     def event_machine_down(link_struct_list, callback = nil)
       failed_list = []
       succeed_list = []
@@ -67,78 +119,65 @@ module ListSpider
       begin_time = Time.now
 
      for_each_proc =
-        proc do |e|
-          opt = { redirects: @max_redirects }
-          if e.header
-            opt[:head] = e.header
-          elsif defined? @header_option
-            opt[:head] = @header_option
-          end
-
-          if e.http_method == :post
-            opt[:body] = e.params unless e.params.empty?
-            w =
-              if @connection_opts
-                EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
-              else
-                EventMachine::HttpRequest.new(e.href).post opt
-              end
-          else
-            if @connection_opts
-              opt[:query] = e.params unless e.params.empty?
-              w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
-            else
-              w = EventMachine::HttpRequest.new(e.href).get opt
-            end
-          end
+        proc do |task_struct|
+          http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
+          http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
+          task_struct.request_object = http_req
 
-          e.request_object = w
-
-          w.callback do
-            s = w.response_header.status
+          http_req.callback do
+            s = http_req.response_header.status
             puts s
-            if s != 404
-              local_dir = File.dirname(e.local_path)
+
+            if s == 200
+              local_dir = File.dirname(task_struct.local_path)
              FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
              begin
-                File.open(e.local_path, 'wb') do |f|
+                File.open(task_struct.local_path, 'wb') do |f|
                  f << if @convert_to_utf8 == true
-                         SpiderHelper.to_utf8(w.response)
+                         SpiderHelper.to_utf8(http_req.response)
                       else
-                         w.response
+                         http_req.response
                       end
                end
-                succeed_list << e
-              rescue StandardError => e
-                puts e
+                call_parse_method(task_struct)
+                succeed_list << task_struct
+              rescue StandardError => exception
+                puts exception
              end
            end
+            task_struct.callback.call(task_struct, http_req) if task_struct.callback
          end
-          w.errback do
-            puts "errback:#{w.response_header},retry..."
-            puts e.href
-            puts w.response_header.status
-
-            ret = false
-            if e.http_method == :get
-              ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
-            elsif e.http_method == :post
-              ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
-            end
 
-            if ret
-              succeed_list << e
+          http_req.errback do
+            puts "errback:#{http_req.response_header},retry..."
+            puts task_struct.href
+            puts http_req.response_header.status
+
+            if task_struct.errback
+              task_struct.errback.call(task_struct, http_req)
            else
-              failed_list << e
+              ret = false
+              if task_struct.http_method == :get
+                ret = SpiderHelper.direct_http_get(task_struct.href, task_struct.local_path, convert_to_utf8: @convert_to_utf8)
+              elsif task_struct.http_method == :post
+                ret = SpiderHelper.direct_http_post(task_struct.href, task_struct.local_path, task_struct.params, convert_to_utf8: @convert_to_utf8)
+              end
+
+              if ret
+                call_parse_method(task_struct)
+                succeed_list << task_struct
+              else
+                failed_list << task_struct
+              end
            end
          end
 
          begin
-            multi.add e.local_path, w
+            multi.add task_struct.local_path, http_req
          rescue StandardError => exception
            puts exception
-            puts e.href
-            puts e.local_path
+            puts task_struct.href
+            puts task_struct.local_path
            stop_machine
          end
        end
@@ -170,38 +209,15 @@ module ListSpider
       @down_list.shift(@max)
     end
 
-    def call_parse_method(e)
-      pm = e.parse_method
-      if pm
-        case pm.arity
-        when 1
-          pm.call(e.local_path)
-        when 2
-          pm.call(e.local_path, e.extra_data)
-        when 3
-          res_header = nil
-          res_header = e.request_object.response_header if e.request_object
-          pm.call(e.local_path, e.extra_data, res_header)
-        when 4
-          res_header = nil
-          res_header = e.request_object.response_header if e.request_object
-
-          req = nil
-          req = e.request_object.req if e.request_object
-
-          pm.call(e.local_path, e.extra_data, res_header, req)
-        else
-          puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3, 4"
-        end
-      end
+    def call_parse_method(task_struct)
+      task_struct.parse_method.call(task_struct) if task_struct.parse_method
     end
 
     def complete(_multi, success_list, failed_list)
       @succeed_size += success_list.size
       @failed_size += failed_list.size
-      success_list.each do |e|
-        call_parse_method(e)
-      end
+      @succeed_list.concat(success_list)
+      @failed_list.concat(failed_list)
 
       todo = next_task
 
@@ -223,6 +239,8 @@
 
     def event_machine_start_list(down_list, callback = nil)
       EventMachine.run do
+        @succeed_list = []
+        @failed_list = []
        @begin_time = Time.now
        if down_list.empty?
          if callback
@@ -239,7 +257,7 @@ module ListSpider
     def filter_list(down_list)
       need_down_list = []
       down_list.each do |ts|
-        if !@overwrite_exist && File.exist?(ts.local_path)
+        if !ts.overwrite_exist && File.exist?(ts.local_path)
          call_parse_method(ts)
        elsif @local_path_set.add?(ts.local_path)
          need_down_list << ts
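
As a usage sketch only (not taken from the gem's docs): per the hunks above, 2.0.0 invokes the new hooks from event_machine_down with the TaskStruct and its EventMachine::HttpRequest, and a supplied errback replaces the built-in direct_http_* retry. Several settings that were module-wide in 1.0.0 are now per-task keywords. The URL and paths below are placeholders.

```ruby
require 'list_spider'

task = TaskStruct.new(
  'http://www.example.com/data.json', # placeholder URL
  'download/data.json',
  custom_data: { id: 42 },
  overwrite_exist: true, # per-task now, replacing ListSpider.overwrite_exist
  redirects: 5,          # per-task now, replacing ListSpider.max_redirects
  connect_timeout: 30,   # per-task now, replacing ListSpider.connect_timeout
  # Runs after the response is saved; receives the TaskStruct itself.
  parse_method: ->(t) { puts "saved #{t.local_path} for #{t.custom_data[:id]}" },
  # Runs on a completed request; receives the TaskStruct and the HttpRequest.
  callback: ->(t, http) { puts http.response_header.status },
  # Runs on failure; when present, the built-in direct_http_* retry is skipped.
  errback: ->(t, http) { warn "failed: #{t.href}" }
)
```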
data/lib/spider_helper.rb CHANGED
@@ -3,8 +3,9 @@ require 'net/http'
 
 module SpiderHelper
   class << self
-    def direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false)
-      href = string_to_uri(href) if href.class == ''.class
+    def direct_http_get(href, local_path, params: nil,
+                        header: nil, convert_to_utf8: false)
+      href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
 
       begin
        href.query = URI.encode_www_form(params) if params
@@ -35,8 +36,9 @@ module SpiderHelper
       false
     end
 
-    def direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false)
-      href = string_to_uri(href) if href.class == ''.class
+    def direct_http_post(href, local_path, params,
+                         header: nil, convert_to_utf8: false)
+      href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
 
       begin
        req = Net::HTTP::Post.new(href)
@@ -72,7 +74,7 @@
 
     def string_to_uri(href)
       l = href
-      l.sub!('http:///', 'http://') if l.start_with?('http:///')
+      l.sub!('http:///', 'http://')
      l = Addressable::URI.parse(l)
      l.normalize!
    end
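
A short sketch of the relaxed href handling above, assuming nothing beyond the signatures shown in these hunks: a String href is now normalized through string_to_uri (via href.to_s), while an Addressable::URI passes through unchanged, so both forms should work. Paths and params are placeholders.

```ruby
require 'list_spider'
require 'addressable/uri'

# String href: converted with SpiderHelper.string_to_uri before the request.
SpiderHelper.direct_http_get('http://www.example.com/a.html', 'download/a.html')

# Pre-parsed Addressable::URI href: used as-is by the 2.0.0 helpers.
uri = Addressable::URI.parse('http://www.example.com/b.html')
SpiderHelper.direct_http_post(
  uri,
  'download/b.html',
  { q: 'list_spider' },  # POST params (positional argument)
  convert_to_utf8: true
)
```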
data/spider_example.rb CHANGED
@@ -1,10 +1,10 @@
-require 'list_spider'
-# require File.expand_path('../lib/list_spider', __FILE__)
+# require 'list_spider'
+require File.expand_path('../lib/list_spider', __FILE__)
 
 DOWNLOAD_DIR = 'coolshell/'.freeze
 
-def parse_index_item(file_name)
-  content = File.read(file_name)
+def parse_index_item(e)
+  content = File.read(e.local_path)
   doc = Nokogiri::HTML(content)
   list_group = doc.css('h2.entry-title')
   link_list = list_group.css('a')
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: list_spider
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 2.0.0
 platform: ruby
 authors:
 - Charles Zhang
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-01-29 00:00:00.000000000 Z
+date: 2018-02-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler