list_spider 1.0.0 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 197035f7521ba4c326c0181c7133afe4c5d7bacfc3246795dc32758dce40da64
4
- data.tar.gz: 89d14776f4c041806b6b9e164b31e651d03746c74d83505d5a32c1aeeaa62aa2
3
+ metadata.gz: 837d9e4cb2b3aa829466cf9eaa4f48a24b5d4ff5067bbc27fb67fbdb37eec291
4
+ data.tar.gz: 8d378b9e3240b8d9c3bdc9c7e32aceb39a16fc63310224dc7ce6a68a2c570893
5
5
  SHA512:
6
- metadata.gz: a1b38832345203ec036ff4f8e11fba1d92e8ec58674d05ef129784a9e274dcd03ef421fa3db6e38bc38d7bb1cf3c54b7d56cbb321a5340bbe197fe57099ed077
7
- data.tar.gz: 43de7e093004c823abb3c51a053869fd294af7fee9f9724c499af572ead7d5ba79d7ab9bb16b2baae1e00a1d198f89fcfbbedc35f57a3a8ed00f7f785d40cbfc
6
+ metadata.gz: dd2c77aa71d8ff3d7ecba93fc6e30ec158b479dcffed9e3cc744944e2bcea3cb5425fc59f85acc22573bcbe3d1eb9a0967e7d0b1e11d3c9cb8d04a58450a0a7e
7
+ data.tar.gz: ec0e3ac5b2a09a3986eea20c69efc31c9536d1d96f77507e50755bfa07531c4bf7303317bc657a573dd8347bd304d8e93c9adbabf868918f0bdbe56c480e82e6
data/README.md CHANGED
@@ -86,9 +86,9 @@ def parse_response(file_name)
86
86
  end
87
87
 
88
88
 
89
- # extra_data is passed by TaskStruct's extra_data param
89
+ # custom_data is passed by TaskStruct's custom_data param
90
90
 
91
- def parse_response(file_name, extra_data)
91
+ def parse_response(file_name, custom_data)
92
92
  #...
93
93
  end
94
94
 
@@ -99,7 +99,7 @@ end
99
99
  # response_header.cookie
100
100
  # response_header['Last-Modified']
101
101
 
102
- def parse_response(file_name, extra_data, response_header)
102
+ def parse_response(file_name, custom_data, response_header)
103
103
  response_header.status
104
104
  response_header['Last-Modified']
105
105
 
@@ -113,7 +113,7 @@ end
113
113
  # req.uri
114
114
  # req.host
115
115
  # req.port
116
- def parse_response(file_name, extra_data, response_header, req)
116
+ def parse_response(file_name, custom_data, response_header, req)
117
117
  puts req.body
118
118
  puts req.headers
119
119
  puts req.uri
@@ -128,7 +128,7 @@ end
128
128
  ## And there are many options you can use
129
129
 
130
130
  ```ruby
131
- TaskStruct.new(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
131
+ TaskStruct.new(href, local_path, http_method: :get, params: {}, custom_data: nil, parse_method: nil, header: nil)
132
132
  ```
133
133
 
134
134
  ```ruby
data/lib/file_filter.rb CHANGED
@@ -2,7 +2,8 @@
2
2
  class FileFilter
3
3
  # 4033
4
4
  # 920
5
- def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
5
+ def initialize(dir_pattern, size_threshold: 1000,
6
+ cust_judge: nil, process_block: nil)
6
7
  @dir_pattern = dir_pattern
7
8
  @size_threshold = size_threshold
8
9
  @cust_judge = cust_judge ? cust_judge : method(:default_judge)
@@ -53,7 +54,8 @@ class FileFilter
53
54
  ).start
54
55
  end
55
56
 
56
- def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
57
+ def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt',
58
+ size_threshold: 1000, cust_judge: nil)
57
59
  result_file = File.open(save_file_name, 'wt')
58
60
  FileFilter.new(
59
61
  dir_pattern,
@@ -1,3 +1,3 @@
1
1
  module ListSpider
2
- VERSION = '1.0.0'.freeze
2
+ VERSION = '2.0.0'.freeze
3
3
  end
data/lib/list_spider.rb CHANGED
@@ -8,22 +8,98 @@ require File.expand_path('../spider_helper', __FILE__)
8
8
  require File.expand_path('../file_filter', __FILE__)
9
9
 
10
10
  class TaskStruct
11
- def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
11
+ def initialize(href, # 请求链接
12
+ local_path, # 保存数据的本地路径(此路径作为去重标准)
13
+ # http方法,取值::get, :head, :delete, :put, :post, :patch, :options
14
+ http_method: :get,
15
+ custom_data: nil, # 自定义数据
16
+ parse_method: nil, # 解析保存文件的回调,参数是TaskStruct对象本身
17
+ # 请求成功后的回调,此时可能没有保存文件,比如301,
18
+ # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
19
+ # http.response_header.status 状态码
20
+ # http.response_header 返回头
21
+ # http.response 返回体
22
+ callback: nil,
23
+ # 请求失败后的回调
24
+ # 参数是TaskStruct对象本身和对应的EventMachine::HttpRequest对象
25
+ errback: nil,
26
+ stream_callback: nil, # 流数据处理回调
27
+ convert_to_utf8: false, # 是否转换为utf8编码
28
+ overwrite_exist: false, # 是否覆盖现有文件
29
+ # request options
30
+ redirects: 3, # 重定向次数
31
+ # keepalive: nil, # (暂不支持)
32
+ file: nil, # 要上传的文件路径
33
+ # path: nil, # 请求路径,在流水线方式请求时有用(暂不支持)
34
+ query: nil, # 查询字符串,可以是string或hash类型
35
+ body: nil, # 请求体,可以是string或hash类型
36
+ head: nil, # 请求头
37
+ # connection options
38
+ connect_timeout: 60, # 连接超时时间
39
+ inactivity_timeout: nil, # 连接后超时时间
40
+ # ssl设置
41
+ # ssl: {
42
+ # :private_key_file => '/tmp/server.key',
43
+ # :cert_chain_file => '/tmp/server.crt',
44
+ # :verify_peer => false
45
+ # }
46
+ ssl: nil,
47
+ # bind: {
48
+ # :host => '123.123.123.123', # use a specific interface for outbound request
49
+ # :port => '123'
50
+ # }
51
+ bind: nil,
52
+ # 代理设置
53
+ # proxy: {
54
+ # :host => '127.0.0.1', # proxy address
55
+ # :port => 9000, # proxy port
56
+ # :type => :socks5 # default proxy mode is HTTP proxy, change to :socks5 if required
57
+
58
+ # :authorization => ['user', 'pass'] # proxy authorization header
59
+ # }
60
+ proxy: nil)
12
61
  @href = href
13
- @href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
14
62
  @local_path = local_path
15
63
  @http_method = http_method
16
- @params = params
17
- @extra_data = extra_data
64
+ @custom_data = custom_data
18
65
  @parse_method = parse_method
19
- @header = header
66
+ @callback = callback
67
+ @errback = errback
68
+ @stream_callback = stream_callback
69
+ @convert_to_utf8 = convert_to_utf8
70
+ @overwrite_exist = overwrite_exist
71
+
72
+ @request_options = {
73
+ redirects: redirects,
74
+ # keepalive: keepalive,
75
+ file: file,
76
+ # path: path,
77
+ query: query,
78
+ body: body,
79
+ head: head
80
+ }.compact
81
+
82
+ @connection_options = {
83
+ connect_timeout: connect_timeout,
84
+ inactivity_timeout: inactivity_timeout,
85
+ ssl: ssl,
86
+ bind: bind,
87
+ proxy: proxy
88
+ }.compact
20
89
  end
21
90
 
22
- def ==(other)
23
- other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
24
- end
25
-
26
- attr_accessor :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
91
+ attr_accessor :href, :local_path,
92
+ :http_method,
93
+ :custom_data,
94
+ :request_object,
95
+ :parse_method,
96
+ :callback,
97
+ :errback,
98
+ :stream_callback,
99
+ :convert_to_utf8,
100
+ :overwrite_exist,
101
+ :request_options,
102
+ :connection_options
27
103
  end
28
104
 
29
105
  module ListSpider
@@ -33,33 +109,9 @@ module ListSpider
33
109
  DEFAULT_INTERVAL = 0
34
110
 
35
111
  @random_time_range = 3..10
36
- @convert_to_utf8 = false
37
- @connection_opts = { connect_timeout: 60 }
38
- @overwrite_exist = false
39
- @max_redirects = 10
40
112
  @local_path_set = Set.new
41
113
 
42
114
  class << self
43
- attr_accessor :convert_to_utf8, :overwrite_exist, :max_redirects
44
-
45
- def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
46
- @connection_opts = {
47
- proxy: {
48
- host: proxy_addr,
49
- port: proxy_port
50
- }
51
- }
52
- @connection_opts[:proxy][:authorization] = [username, password] if username && password
53
- end
54
-
55
- def connect_timeout(max_connect_time)
56
- @connection_opts[:connect_timeout] = max_connect_time
57
- end
58
-
59
- def set_header_option(header_option)
60
- @header_option = header_option
61
- end
62
-
63
115
  def event_machine_down(link_struct_list, callback = nil)
64
116
  failed_list = []
65
117
  succeed_list = []
@@ -67,78 +119,65 @@ module ListSpider
67
119
  begin_time = Time.now
68
120
 
69
121
  for_each_proc =
70
- proc do |e|
71
- opt = { redirects: @max_redirects }
72
- if e.header
73
- opt[:head] = e.header
74
- elsif defined? @header_option
75
- opt[:head] = @header_option
76
- end
77
-
78
- if e.http_method == :post
79
- opt[:body] = e.params unless e.params.empty?
80
- w =
81
- if @connection_opts
82
- EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
83
- else
84
- EventMachine::HttpRequest.new(e.href).post opt
85
- end
86
- else
87
- if @connection_opts
88
- opt[:query] = e.params unless e.params.empty?
89
- w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
90
- else
91
- w = EventMachine::HttpRequest.new(e.href).get opt
92
- end
93
- end
122
+ proc do |task_struct|
123
+ http_req = EventMachine::HttpRequest.new(task_struct.href, task_struct.connection_options).public_send(task_struct.http_method, task_struct.request_options)
124
+ http_req.stream { |chunk| stream_callback.call(chunk) } if task_struct.stream_callback
125
+ task_struct.request_object = http_req
94
126
 
95
- e.request_object = w
96
-
97
- w.callback do
98
- s = w.response_header.status
127
+ http_req.callback do
128
+ s = http_req.response_header.status
99
129
  puts s
100
- if s != 404
101
- local_dir = File.dirname(e.local_path)
130
+
131
+ if s == 200
132
+ local_dir = File.dirname(task_struct.local_path)
102
133
  FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
103
134
  begin
104
- File.open(e.local_path, 'wb') do |f|
135
+ File.open(task_struct.local_path, 'wb') do |f|
105
136
  f << if @convert_to_utf8 == true
106
- SpiderHelper.to_utf8(w.response)
137
+ SpiderHelper.to_utf8(http_req.response)
107
138
  else
108
- w.response
139
+ http_req.response
109
140
  end
110
141
  end
111
- succeed_list << e
112
- rescue StandardError => e
113
- puts e
142
+ call_parse_method(task_struct)
143
+ succeed_list << task_struct
144
+ rescue StandardError => exception
145
+ puts exception
114
146
  end
115
147
  end
148
+ task_struct.callback.call(task_struct, http_req) if task_struct.callback
116
149
  end
117
- w.errback do
118
- puts "errback:#{w.response_header},retry..."
119
- puts e.href
120
- puts w.response_header.status
121
-
122
- ret = false
123
- if e.http_method == :get
124
- ret = SpiderHelper.direct_http_get(e.href, e.local_path, convert_to_utf8: @convert_to_utf8)
125
- elsif e.http_method == :post
126
- ret = SpiderHelper.direct_http_post(e.href, e.local_path, e.params, convert_to_utf8: @convert_to_utf8)
127
- end
128
150
 
129
- if ret
130
- succeed_list << e
151
+ http_req.errback do
152
+ puts "errback:#{http_req.response_header},retry..."
153
+ puts task_struct.href
154
+ puts http_req.response_header.status
155
+
156
+ if task_struct.errback
157
+ task_struct.errback.call(task_struct, http_req)
131
158
  else
132
- failed_list << e
159
+ ret = false
160
+ if task_struct.http_method == :get
161
+ ret = SpiderHelper.direct_http_get(task_struct.href, task_struct.local_path, convert_to_utf8: @convert_to_utf8)
162
+ elsif task_struct.http_method == :post
163
+ ret = SpiderHelper.direct_http_post(task_struct.href, task_struct.local_path, task_struct.params, convert_to_utf8: @convert_to_utf8)
164
+ end
165
+
166
+ if ret
167
+ call_parse_method(task_struct)
168
+ succeed_list << task_struct
169
+ else
170
+ failed_list << task_struct
171
+ end
133
172
  end
134
173
  end
135
174
 
136
175
  begin
137
- multi.add e.local_path, w
176
+ multi.add task_struct.local_path, http_req
138
177
  rescue StandardError => exception
139
178
  puts exception
140
- puts e.href
141
- puts e.local_path
179
+ puts task_struct.href
180
+ puts task_struct.local_path
142
181
  stop_machine
143
182
  end
144
183
  end
@@ -170,38 +209,15 @@ module ListSpider
170
209
  @down_list.shift(@max)
171
210
  end
172
211
 
173
- def call_parse_method(e)
174
- pm = e.parse_method
175
- if pm
176
- case pm.arity
177
- when 1
178
- pm.call(e.local_path)
179
- when 2
180
- pm.call(e.local_path, e.extra_data)
181
- when 3
182
- res_header = nil
183
- res_header = e.request_object.response_header if e.request_object
184
- pm.call(e.local_path, e.extra_data, res_header)
185
- when 4
186
- res_header = nil
187
- res_header = e.request_object.response_header if e.request_object
188
-
189
- req = nil
190
- req = e.request_object.req if e.request_object
191
-
192
- pm.call(e.local_path, e.extra_data, res_header, req)
193
- else
194
- puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3, 4"
195
- end
196
- end
212
+ def call_parse_method(task_struct)
213
+ task_struct.parse_method.call(task_struct) if task_struct.parse_method
197
214
  end
198
215
 
199
216
  def complete(_multi, success_list, failed_list)
200
217
  @succeed_size += success_list.size
201
218
  @failed_size += failed_list.size
202
- success_list.each do |e|
203
- call_parse_method(e)
204
- end
219
+ @succeed_list.concat(success_list)
220
+ @failed_list.concat(failed_list)
205
221
 
206
222
  todo = next_task
207
223
 
@@ -223,6 +239,8 @@ module ListSpider
223
239
 
224
240
  def event_machine_start_list(down_list, callback = nil)
225
241
  EventMachine.run do
242
+ @succeed_list = []
243
+ @failed_list = []
226
244
  @begin_time = Time.now
227
245
  if down_list.empty?
228
246
  if callback
@@ -239,7 +257,7 @@ module ListSpider
239
257
  def filter_list(down_list)
240
258
  need_down_list = []
241
259
  down_list.each do |ts|
242
- if !@overwrite_exist && File.exist?(ts.local_path)
260
+ if !ts.overwrite_exist && File.exist?(ts.local_path)
243
261
  call_parse_method(ts)
244
262
  elsif @local_path_set.add?(ts.local_path)
245
263
  need_down_list << ts
data/lib/spider_helper.rb CHANGED
@@ -3,8 +3,9 @@ require 'net/http'
3
3
 
4
4
  module SpiderHelper
5
5
  class << self
6
- def direct_http_get(href, local_path, params: nil, header: nil, convert_to_utf8: false)
7
- href = string_to_uri(href) if href.class == ''.class
6
+ def direct_http_get(href, local_path, params: nil,
7
+ header: nil, convert_to_utf8: false)
8
+ href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
8
9
 
9
10
  begin
10
11
  href.query = URI.encode_www_form(params) if params
@@ -35,8 +36,9 @@ module SpiderHelper
35
36
  false
36
37
  end
37
38
 
38
- def direct_http_post(href, local_path, params, header: nil, convert_to_utf8: false)
39
- href = string_to_uri(href) if href.class == ''.class
39
+ def direct_http_post(href, local_path, params,
40
+ header: nil, convert_to_utf8: false)
41
+ href = string_to_uri(href.to_s) unless href.is_a?(Addressable::URI)
40
42
 
41
43
  begin
42
44
  req = Net::HTTP::Post.new(href)
@@ -72,7 +74,7 @@ module SpiderHelper
72
74
 
73
75
  def string_to_uri(href)
74
76
  l = href
75
- l.sub!('http:///', 'http://') if l.start_with?('http:///')
77
+ l.sub!('http:///', 'http://')
76
78
  l = Addressable::URI.parse(l)
77
79
  l.normalize!
78
80
  end
data/spider_example.rb CHANGED
@@ -1,10 +1,10 @@
1
- require 'list_spider'
2
- # require File.expand_path('../lib/list_spider', __FILE__)
1
+ # require 'list_spider'
2
+ require File.expand_path('../lib/list_spider', __FILE__)
3
3
 
4
4
  DOWNLOAD_DIR = 'coolshell/'.freeze
5
5
 
6
- def parse_index_item(file_name)
7
- content = File.read(file_name)
6
+ def parse_index_item(e)
7
+ content = File.read(e.local_path)
8
8
  doc = Nokogiri::HTML(content)
9
9
  list_group = doc.css('h2.entry-title')
10
10
  link_list = list_group.css('a')
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-01-29 00:00:00.000000000 Z
11
+ date: 2018-02-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler