list_spider 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2daa1e0b7fbebdfcc8d4b7299d7c2378953721e9
4
- data.tar.gz: c7975f3e4eda6c80204ce92713b39c6eca0e2c7e
3
+ metadata.gz: 9e16f58e39ec18d181da107dfdc76626c23dc44c
4
+ data.tar.gz: c6fd2518111983f1bdcf9c471a93cfe279a722fe
5
5
  SHA512:
6
- metadata.gz: 87bca1533b16e09032b02dbb90830c1d9db03699d8fe2138574e389df63c2293dc8af6221f5a315d13ccd1bc09be4c10c8040ee13145d70ee523df37b6720912
7
- data.tar.gz: fd29efa797377f01b99d7d878bec5b9263628b21f36090e08b9e62aa4196ad2273c704423b0dc15afb4e853f5757c58aa3f64a689d51c32fffe222aa6bb3279a
6
+ metadata.gz: 8d3374e67436fc68afdd770c9ece27cfd734e82dc74917e7db226f311f743cbd51c69129f0c400bad6ec602c15f1cb08697c2b85a91805c8c3c549e519a68f74
7
+ data.tar.gz: 9ef2fdb5d99fcf018ce02eeb395eedf4e72917018b21b48a23ae12601fa7b2b566ba6c08595674900326d8f4d1d18c6e992c2cf544f468dffa96f3cbeefecf7b
data/lib/file_filter.rb CHANGED
@@ -5,11 +5,7 @@ class FileFilter
5
5
  def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
6
6
  @dir_pattern = dir_pattern
7
7
  @size_threshold = size_threshold
8
- if cust_judge
9
- @cust_judge = cust_judge
10
- else
11
- @cust_judge = method(:default_judge)
12
- end
8
+ @cust_judge = cust_judge ? cust_judge : method(:default_judge)
13
9
  @total = 0
14
10
  @process_block = process_block
15
11
  end
@@ -33,39 +29,42 @@ class FileFilter
33
29
  end
34
30
 
35
31
  def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
36
- FileFilter.new(dir_pattern,
37
- size_threshold: size_threshold,
38
- cust_judge: cust_judge,
39
- process_block:
40
- proc do |f|
41
- puts "deleted file: #{f}"
42
- File.delete(f)
43
- end
44
- ).start
32
+ FileFilter.new(
33
+ dir_pattern,
34
+ size_threshold: size_threshold,
35
+ cust_judge: cust_judge,
36
+ process_block:
37
+ proc do |f|
38
+ puts "deleted file: #{f}"
39
+ File.delete(f)
40
+ end
41
+ ).start
45
42
  end
46
43
 
47
44
  def self.check(dir_pattern, size_threshold: 1000, cust_judge: nil)
48
- FileFilter.new(dir_pattern,
49
- size_threshold: size_threshold,
50
- cust_judge: cust_judge,
51
- process_block:
52
- proc do |f|
53
- puts "filterd file: #{f}"
54
- end
55
- ).start
45
+ FileFilter.new(
46
+ dir_pattern,
47
+ size_threshold: size_threshold,
48
+ cust_judge: cust_judge,
49
+ process_block:
50
+ proc do |f|
51
+ puts "filterd file: #{f}"
52
+ end
53
+ ).start
56
54
  end
57
55
 
58
56
  def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
59
57
  result_file = File.open(save_file_name, 'wt')
60
- FileFilter.new(dir_pattern,
61
- size_threshold: size_threshold,
62
- cust_judge: cust_judge,
63
- process_block:
64
- proc do |f|
65
- puts "filterd file: #{f}"
66
- result_file << f << "\n"
67
- end
68
- ).start
58
+ FileFilter.new(
59
+ dir_pattern,
60
+ size_threshold: size_threshold,
61
+ cust_judge: cust_judge,
62
+ process_block:
63
+ proc do |f|
64
+ puts "filterd file: #{f}"
65
+ result_file << f << "\n"
66
+ end
67
+ ).start
68
+ result_file.close
69
69
  end
70
-
71
70
  end
data/lib/list_spider.rb CHANGED
@@ -2,7 +2,7 @@ require 'em-http-request'
2
2
  require 'nokogiri'
3
3
  require 'fileutils'
4
4
  require 'set'
5
- require "addressable/uri"
5
+ require 'addressable/uri'
6
6
  require File.expand_path('../spider_helper', __FILE__)
7
7
  require File.expand_path('../file_filter', __FILE__)
8
8
 
@@ -10,9 +10,7 @@ class TaskStruct
10
10
  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
11
11
  @origin_href = href
12
12
  @href = href
13
- if @href.class == "".class
14
- @href = SpiderHelper.string_to_uri(@href)
15
- end
13
+ @href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
16
14
  @local_path = local_path
17
15
  @http_method = http_method
18
16
  @params = params
@@ -21,16 +19,14 @@ class TaskStruct
21
19
  @header = header
22
20
  end
23
21
 
24
- def == (o)
25
- o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data && o.header == header
22
+ def ==(other)
23
+ other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
26
24
  end
27
25
 
28
- attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
29
-
26
+ attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
30
27
  end
31
28
 
32
29
  module ListSpider
33
-
34
30
  RANDOM_TIME = -1
35
31
  NO_LIMIT_CONCURRENT = -1
36
32
  DEFAULT_CONCURRNET_MAX = 50
@@ -38,21 +34,20 @@ module ListSpider
38
34
 
39
35
  @random_time_range = 3..10
40
36
  @conver_to_utf8 = false
41
- @connection_opts = {connect_timeout: 60}
37
+ @connection_opts = { connect_timeout: 60 }
42
38
  @overwrite_exist = false
43
39
  @max_redirects = 10
44
40
  @local_path_set = Set.new
45
41
 
46
42
  class << self
47
-
48
43
  attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
49
44
 
50
45
  def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
51
46
  @connection_opts = {
52
- :proxy => {
53
- :host => proxy_addr,
54
- :port => proxy_port
55
- }
47
+ proxy: {
48
+ host: proxy_addr,
49
+ port: proxy_port
50
+ }
56
51
  }
57
52
  @connection_opts[:proxy][:authorization] = [username, password] if username && password
58
53
  end
@@ -71,100 +66,102 @@ module ListSpider
71
66
  multi = EventMachine::MultiRequest.new
72
67
  begin_time = Time.now
73
68
 
74
- for_each_proc = proc do |e|
75
- opt = {}
76
- opt = {:redirects => @max_redirects}
77
- if e.header
78
- opt[:head] = e.header
79
- elsif defined? @header_option
80
- opt[:head] = @header_option
81
- end
82
-
83
- if e.http_method == :post
84
- opt[:body] = e.params unless e.params.empty?
85
- if @connection_opts
86
- w = EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
87
- else
88
- w = EventMachine::HttpRequest.new(e.href).post opt
69
+ for_each_proc =
70
+ proc do |e|
71
+ opt = { redirects: @max_redirects }
72
+ if e.header
73
+ opt[:head] = e.header
74
+ elsif defined? @header_option
75
+ opt[:head] = @header_option
89
76
  end
90
- else
91
- if @connection_opts
92
- opt[:query] = e.params unless e.params.empty?
93
- w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
77
+
78
+ if e.http_method == :post
79
+ opt[:body] = e.params unless e.params.empty?
80
+ w =
81
+ if @connection_opts
82
+ EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
83
+ else
84
+ EventMachine::HttpRequest.new(e.href).post opt
85
+ end
94
86
  else
95
- w = EventMachine::HttpRequest.new(e.href).get opt
87
+ if @connection_opts
88
+ opt[:query] = e.params unless e.params.empty?
89
+ w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
90
+ else
91
+ w = EventMachine::HttpRequest.new(e.href).get opt
92
+ end
96
93
  end
97
- end
98
94
 
99
- e.request_object = w
100
-
101
- w.callback {
102
- s = w.response_header.status
103
- puts s
104
- if s != 404
105
- local_dir = File.dirname(e.local_path)
106
- FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
107
- begin
108
- File.open(e.local_path, "w") do |f|
109
- if @conver_to_utf8 == true
110
- f << SpiderHelper.to_utf8( w.response)
111
- else
112
- f << w.response
95
+ e.request_object = w
96
+
97
+ w.callback do
98
+ s = w.response_header.status
99
+ puts s
100
+ if s != 404
101
+ local_dir = File.dirname(e.local_path)
102
+ FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
103
+ begin
104
+ File.open(e.local_path, 'w') do |f|
105
+ f << if @conver_to_utf8 == true
106
+ SpiderHelper.to_utf8(w.response)
107
+ else
108
+ w.response
109
+ end
113
110
  end
111
+ succeed_list << e
112
+ rescue => e
113
+ puts e
114
114
  end
115
- succeed_list << e
116
- rescue Exception => e
117
- puts e
118
115
  end
119
116
  end
120
- }
121
- w.errback {
122
- puts "errback:#{w.response_header}"
123
- puts e.origin_href
124
- puts e.href
125
- puts w.response_header.status
126
- failed_list << e
127
- if e.http_method == :get
128
- SpiderHelper.direct_http_get(e.href, e.local_path)
129
- elsif e.http_method == :post
130
- SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
117
+ w.errback do
118
+ puts "errback:#{w.response_header}"
119
+ puts e.origin_href
120
+ puts e.href
121
+ puts w.response_header.status
122
+ failed_list << e
123
+ if e.http_method == :get
124
+ SpiderHelper.direct_http_get(e.href, e.local_path)
125
+ elsif e.http_method == :post
126
+ SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
127
+ end
131
128
  end
132
- }
133
129
 
134
- begin
135
- multi.add e.local_path, w
136
- rescue Exception => exception
137
- puts exception
138
- puts e.href
139
- puts e.local_path
140
- stop_machine
130
+ begin
131
+ multi.add e.local_path, w
132
+ rescue => exception
133
+ puts exception
134
+ puts e.href
135
+ puts e.local_path
136
+ stop_machine
137
+ end
141
138
  end
142
- end
143
139
 
144
- cb = Proc.new do
145
- end_time = Time.now
146
- puts "use time:#{end_time-begin_time} seconds"
147
- if callback.nil?
148
- stop_machine
149
- else
150
- callback.call(multi, succeed_list, failed_list)
140
+ cb =
141
+ proc do
142
+ end_time = Time.now
143
+ puts "use time:#{end_time - begin_time} seconds"
144
+ if callback.nil?
145
+ stop_machine
146
+ else
147
+ callback.call(multi, succeed_list, failed_list)
148
+ end
151
149
  end
152
- end
153
- link_struct_list.each &for_each_proc
154
- multi.callback &cb
150
+ link_struct_list.each(&for_each_proc)
151
+ multi.callback(&cb)
155
152
  end
156
153
 
157
154
  def stop_machine
158
155
  puts "success size:#{@succeed_size}"
159
156
  puts "failed size:#{@failed_size}"
160
157
  @end_time = Time.now
161
- puts "total use time:#{@end_time-@begin_time} seconds"
158
+ puts "total use time:#{@end_time - @begin_time} seconds"
162
159
  EventMachine.stop
163
160
  @local_path_set.clear
164
161
  end
165
162
 
166
- def get_next_task
167
- return @down_list.shift(@max)
163
+ def next_task
164
+ @down_list.shift(@max)
168
165
  end
169
166
 
170
167
  def call_parse_method(e)
@@ -193,20 +190,20 @@ module ListSpider
193
190
  end
194
191
  end
195
192
 
196
- def complete(multi, success_list, failed_list)
193
+ def complete(_multi, success_list, failed_list)
197
194
  @succeed_size += success_list.size
198
195
  @failed_size += failed_list.size
199
196
  success_list.each do |e|
200
197
  call_parse_method(e)
201
198
  end
202
199
 
203
- todo = get_next_task
200
+ todo = next_task
204
201
 
205
202
  if todo.empty?
206
203
  stop_machine
207
204
  else
208
205
  if @interval != 0
209
- if success_list.size != 0 || failed_list.size != 0
206
+ if !success_list.empty? || !failed_list.empty?
210
207
  if @interval == RANDOM_TIME
211
208
  sleep(rand(@random_time_range))
212
209
  else
@@ -219,7 +216,7 @@ module ListSpider
219
216
  end
220
217
 
221
218
  def event_machine_start_list(down_list, callback = nil)
222
- EventMachine.run {
219
+ EventMachine.run do
223
220
  @begin_time = Time.now
224
221
  if down_list.empty?
225
222
  if callback
@@ -230,7 +227,7 @@ module ListSpider
230
227
  else
231
228
  event_machine_down(down_list, callback)
232
229
  end
233
- }
230
+ end
234
231
  end
235
232
 
236
233
  def filter_list(down_list)
@@ -242,7 +239,7 @@ module ListSpider
242
239
  need_down_list << ts
243
240
  end
244
241
  end
245
- return need_down_list
242
+ need_down_list
246
243
  end
247
244
 
248
245
  def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
@@ -255,7 +252,7 @@ module ListSpider
255
252
 
256
253
  need_down_list = filter_list(down_list)
257
254
 
258
- @down_list = @down_list + need_down_list
255
+ @down_list += need_down_list
259
256
  @interval = interval
260
257
  @max = max
261
258
  @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
@@ -263,7 +260,7 @@ module ListSpider
263
260
  @failed_size = 0
264
261
 
265
262
  puts "total size:#{@down_list.size}"
266
- event_machine_start_list(get_next_task, method(:complete))
263
+ event_machine_start_list(next_task, method(:complete))
267
264
  end
268
265
 
269
266
  def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
@@ -273,19 +270,18 @@ module ListSpider
273
270
  def add_task(task)
274
271
  if task.is_a?Array
275
272
  need_down_list = filter_list(task)
276
- @down_list = @down_list + need_down_list
273
+ @down_list += need_down_list
277
274
  elsif task.is_a?TaskStruct
278
275
  need_down_list = filter_list([task])
279
- @down_list = @down_list + need_down_list
276
+ @down_list += need_down_list
280
277
  else
281
278
  puts "error task type:#{task.class}"
282
279
  end
283
280
  end
284
281
  end
285
282
 
286
- Signal.trap("INT") do
283
+ Signal.trap('INT') do
287
284
  ListSpider.stop_machine
288
285
  exit!
289
286
  end
290
-
291
287
  end
data/lib/spider_helper.rb CHANGED
@@ -2,50 +2,46 @@ require 'rchardet'
2
2
  require 'net/http'
3
3
 
4
4
  module SpiderHelper
5
-
6
5
  class << self
7
-
8
6
  def direct_http_get(href, local_path, params: nil, header: nil)
9
- if href.class == "".class
10
- href = string_to_uri(href)
11
- end
7
+ href = string_to_uri(href) if href.class == ''.class
12
8
 
13
9
  begin
14
10
  href.query = URI.encode_www_form(params) if params
15
11
  req = Net::HTTP::Get.new(href)
16
- header.each{|k,v| req[k] = v} if header
12
+ header.each { |k, v| req[k] = v } if header
17
13
 
18
- res = Net::HTTP.start(href.hostname, href.port) do |http|
19
- http.request(req)
20
- end
14
+ res =
15
+ Net::HTTP.start(href.hostname, href.port) do |http|
16
+ http.request(req)
17
+ end
21
18
 
22
19
  if res.is_a?(Net::HTTPSuccess)
23
20
  local_dir = File.dirname(local_path)
24
21
  FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
25
22
  File.write(local_path, res.body)
26
- puts "succeed"
23
+ puts 'succeed'
27
24
  else
28
25
  puts res
29
26
  end
30
- rescue Exception => e
27
+ rescue => e
31
28
  puts e.backtrace
32
29
  puts e
33
30
  end
34
31
  end
35
32
 
36
33
  def direct_http_post(href, local_path, params, header: nil)
37
- if href.class == "".class
38
- href = string_to_uri(href)
39
- end
34
+ href = string_to_uri(href) if href.class == ''.class
40
35
 
41
36
  begin
42
37
  req = Net::HTTP::Post.new(href)
43
38
  req.set_form_data(params)
44
- header.each{|k,v| req[k] = v} if header
39
+ header.each { |k, v| req[k] = v } if header
45
40
 
46
- res = Net::HTTP.start(href.hostname, href.port) do |http|
47
- http.request(req)
48
- end
41
+ res =
42
+ Net::HTTP.start(href.hostname, href.port) do |http|
43
+ http.request(req)
44
+ end
49
45
 
50
46
  if res.is_a?(Net::HTTPSuccess)
51
47
  local_dir = File.dirname(local_path)
@@ -54,7 +50,7 @@ module SpiderHelper
54
50
  else
55
51
  puts res
56
52
  end
57
- rescue Exception => e
53
+ rescue => e
58
54
  puts e
59
55
  end
60
56
  end
@@ -70,34 +66,32 @@ module SpiderHelper
70
66
  l.normalize!
71
67
  end
72
68
 
73
- BomHeaderMap = {"UTF-8" => "\xEF\xBB\xBF".force_encoding("UTF-8"),
74
- "UTF-16BE"=>"\xFE\xFF".force_encoding("UTF-16BE"),
75
- "UTF-16LE"=>"\xFF\xFE".force_encoding("UTF-16LE"),
76
- "UTF-32BE"=>"\x00\x00\xFE\xFF".force_encoding("UTF-32BE"),
77
- "UTF-32LE"=>"\xFF\xFE\x00\x00".force_encoding("UTF-32LE")}
69
+ BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
70
+ 'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
71
+ 'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
72
+ 'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
73
+ 'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
78
74
 
79
- #此函数有时此判断有误,使用to_utf8函数直接转换
75
+ # 此函数有时此判断有误,使用to_utf8函数直接转换
80
76
  def smart_to_utf8(str)
81
77
  return str if str.encoding == Encoding::UTF_8
82
78
  to_utf8(str)
83
79
  end
84
80
 
85
81
  def to_utf8(str)
86
- #解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
82
+ # 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
87
83
  str.force_encoding(Encoding::ASCII_8BIT)
88
84
  cd = CharDet.detect(str)
89
- if cd["confidence"] > 0.6
90
- puts cd["encoding"]
91
- str.force_encoding(cd["encoding"])
92
- #移除BOM头
93
- bomHeader = BomHeaderMap[cd["encoding"]]
94
- str.sub!(bomHeader, "") if bomHeader
85
+ if cd['confidence'] > 0.6
86
+ puts cd['encoding']
87
+ str.force_encoding(cd['encoding'])
88
+ # 移除BOM头
89
+ bom_header = BomHeaderMap[cd['encoding']]
90
+ str.sub!(bom_header, '') if bom_header
95
91
  end
96
- str.encode!(Encoding::UTF_8, :undef => :replace, :replace => "?", :invalid => :replace)
97
-
98
- return str
99
- end
92
+ str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
100
93
 
94
+ str
95
+ end
101
96
  end
102
-
103
97
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang