list_spider 0.3.1 → 0.3.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2daa1e0b7fbebdfcc8d4b7299d7c2378953721e9
4
- data.tar.gz: c7975f3e4eda6c80204ce92713b39c6eca0e2c7e
3
+ metadata.gz: 9e16f58e39ec18d181da107dfdc76626c23dc44c
4
+ data.tar.gz: c6fd2518111983f1bdcf9c471a93cfe279a722fe
5
5
  SHA512:
6
- metadata.gz: 87bca1533b16e09032b02dbb90830c1d9db03699d8fe2138574e389df63c2293dc8af6221f5a315d13ccd1bc09be4c10c8040ee13145d70ee523df37b6720912
7
- data.tar.gz: fd29efa797377f01b99d7d878bec5b9263628b21f36090e08b9e62aa4196ad2273c704423b0dc15afb4e853f5757c58aa3f64a689d51c32fffe222aa6bb3279a
6
+ metadata.gz: 8d3374e67436fc68afdd770c9ece27cfd734e82dc74917e7db226f311f743cbd51c69129f0c400bad6ec602c15f1cb08697c2b85a91805c8c3c549e519a68f74
7
+ data.tar.gz: 9ef2fdb5d99fcf018ce02eeb395eedf4e72917018b21b48a23ae12601fa7b2b566ba6c08595674900326d8f4d1d18c6e992c2cf544f468dffa96f3cbeefecf7b
data/lib/file_filter.rb CHANGED
@@ -5,11 +5,7 @@ class FileFilter
5
5
  def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
6
6
  @dir_pattern = dir_pattern
7
7
  @size_threshold = size_threshold
8
- if cust_judge
9
- @cust_judge = cust_judge
10
- else
11
- @cust_judge = method(:default_judge)
12
- end
8
+ @cust_judge = cust_judge ? cust_judge : method(:default_judge)
13
9
  @total = 0
14
10
  @process_block = process_block
15
11
  end
@@ -33,39 +29,42 @@ class FileFilter
33
29
  end
34
30
 
35
31
  def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
36
- FileFilter.new(dir_pattern,
37
- size_threshold: size_threshold,
38
- cust_judge: cust_judge,
39
- process_block:
40
- proc do |f|
41
- puts "deleted file: #{f}"
42
- File.delete(f)
43
- end
44
- ).start
32
+ FileFilter.new(
33
+ dir_pattern,
34
+ size_threshold: size_threshold,
35
+ cust_judge: cust_judge,
36
+ process_block:
37
+ proc do |f|
38
+ puts "deleted file: #{f}"
39
+ File.delete(f)
40
+ end
41
+ ).start
45
42
  end
46
43
 
47
44
  def self.check(dir_pattern, size_threshold: 1000, cust_judge: nil)
48
- FileFilter.new(dir_pattern,
49
- size_threshold: size_threshold,
50
- cust_judge: cust_judge,
51
- process_block:
52
- proc do |f|
53
- puts "filterd file: #{f}"
54
- end
55
- ).start
45
+ FileFilter.new(
46
+ dir_pattern,
47
+ size_threshold: size_threshold,
48
+ cust_judge: cust_judge,
49
+ process_block:
50
+ proc do |f|
51
+ puts "filterd file: #{f}"
52
+ end
53
+ ).start
56
54
  end
57
55
 
58
56
  def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
59
57
  result_file = File.open(save_file_name, 'wt')
60
- FileFilter.new(dir_pattern,
61
- size_threshold: size_threshold,
62
- cust_judge: cust_judge,
63
- process_block:
64
- proc do |f|
65
- puts "filterd file: #{f}"
66
- result_file << f << "\n"
67
- end
68
- ).start
58
+ FileFilter.new(
59
+ dir_pattern,
60
+ size_threshold: size_threshold,
61
+ cust_judge: cust_judge,
62
+ process_block:
63
+ proc do |f|
64
+ puts "filterd file: #{f}"
65
+ result_file << f << "\n"
66
+ end
67
+ ).start
68
+ result_file.close
69
69
  end
70
-
71
70
  end
data/lib/list_spider.rb CHANGED
@@ -2,7 +2,7 @@ require 'em-http-request'
2
2
  require 'nokogiri'
3
3
  require 'fileutils'
4
4
  require 'set'
5
- require "addressable/uri"
5
+ require 'addressable/uri'
6
6
  require File.expand_path('../spider_helper', __FILE__)
7
7
  require File.expand_path('../file_filter', __FILE__)
8
8
 
@@ -10,9 +10,7 @@ class TaskStruct
10
10
  def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
11
11
  @origin_href = href
12
12
  @href = href
13
- if @href.class == "".class
14
- @href = SpiderHelper.string_to_uri(@href)
15
- end
13
+ @href = SpiderHelper.string_to_uri(@href) if @href.class == ''.class
16
14
  @local_path = local_path
17
15
  @http_method = http_method
18
16
  @params = params
@@ -21,16 +19,14 @@ class TaskStruct
21
19
  @header = header
22
20
  end
23
21
 
24
- def == (o)
25
- o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data && o.header == header
22
+ def ==(other)
23
+ other.class == self.class && other.href == href && other.local_path == local_path && other.http_method == http_method && other.params == params && other.extra_data == extra_data && other.header == header
26
24
  end
27
25
 
28
- attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
29
-
26
+ attr_accessor :origin_href, :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
30
27
  end
31
28
 
32
29
  module ListSpider
33
-
34
30
  RANDOM_TIME = -1
35
31
  NO_LIMIT_CONCURRENT = -1
36
32
  DEFAULT_CONCURRNET_MAX = 50
@@ -38,21 +34,20 @@ module ListSpider
38
34
 
39
35
  @random_time_range = 3..10
40
36
  @conver_to_utf8 = false
41
- @connection_opts = {connect_timeout: 60}
37
+ @connection_opts = { connect_timeout: 60 }
42
38
  @overwrite_exist = false
43
39
  @max_redirects = 10
44
40
  @local_path_set = Set.new
45
41
 
46
42
  class << self
47
-
48
43
  attr_accessor :conver_to_utf8, :overwrite_exist, :max_redirects
49
44
 
50
45
  def set_proxy(proxy_addr, proxy_port, username: nil, password: nil)
51
46
  @connection_opts = {
52
- :proxy => {
53
- :host => proxy_addr,
54
- :port => proxy_port
55
- }
47
+ proxy: {
48
+ host: proxy_addr,
49
+ port: proxy_port
50
+ }
56
51
  }
57
52
  @connection_opts[:proxy][:authorization] = [username, password] if username && password
58
53
  end
@@ -71,100 +66,102 @@ module ListSpider
71
66
  multi = EventMachine::MultiRequest.new
72
67
  begin_time = Time.now
73
68
 
74
- for_each_proc = proc do |e|
75
- opt = {}
76
- opt = {:redirects => @max_redirects}
77
- if e.header
78
- opt[:head] = e.header
79
- elsif defined? @header_option
80
- opt[:head] = @header_option
81
- end
82
-
83
- if e.http_method == :post
84
- opt[:body] = e.params unless e.params.empty?
85
- if @connection_opts
86
- w = EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
87
- else
88
- w = EventMachine::HttpRequest.new(e.href).post opt
69
+ for_each_proc =
70
+ proc do |e|
71
+ opt = { redirects: @max_redirects }
72
+ if e.header
73
+ opt[:head] = e.header
74
+ elsif defined? @header_option
75
+ opt[:head] = @header_option
89
76
  end
90
- else
91
- if @connection_opts
92
- opt[:query] = e.params unless e.params.empty?
93
- w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
77
+
78
+ if e.http_method == :post
79
+ opt[:body] = e.params unless e.params.empty?
80
+ w =
81
+ if @connection_opts
82
+ EventMachine::HttpRequest.new(e.href, @connection_opts).post opt
83
+ else
84
+ EventMachine::HttpRequest.new(e.href).post opt
85
+ end
94
86
  else
95
- w = EventMachine::HttpRequest.new(e.href).get opt
87
+ if @connection_opts
88
+ opt[:query] = e.params unless e.params.empty?
89
+ w = EventMachine::HttpRequest.new(e.href, @connection_opts).get opt
90
+ else
91
+ w = EventMachine::HttpRequest.new(e.href).get opt
92
+ end
96
93
  end
97
- end
98
94
 
99
- e.request_object = w
100
-
101
- w.callback {
102
- s = w.response_header.status
103
- puts s
104
- if s != 404
105
- local_dir = File.dirname(e.local_path)
106
- FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
107
- begin
108
- File.open(e.local_path, "w") do |f|
109
- if @conver_to_utf8 == true
110
- f << SpiderHelper.to_utf8( w.response)
111
- else
112
- f << w.response
95
+ e.request_object = w
96
+
97
+ w.callback do
98
+ s = w.response_header.status
99
+ puts s
100
+ if s != 404
101
+ local_dir = File.dirname(e.local_path)
102
+ FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
103
+ begin
104
+ File.open(e.local_path, 'w') do |f|
105
+ f << if @conver_to_utf8 == true
106
+ SpiderHelper.to_utf8(w.response)
107
+ else
108
+ w.response
109
+ end
113
110
  end
111
+ succeed_list << e
112
+ rescue => e
113
+ puts e
114
114
  end
115
- succeed_list << e
116
- rescue Exception => e
117
- puts e
118
115
  end
119
116
  end
120
- }
121
- w.errback {
122
- puts "errback:#{w.response_header}"
123
- puts e.origin_href
124
- puts e.href
125
- puts w.response_header.status
126
- failed_list << e
127
- if e.http_method == :get
128
- SpiderHelper.direct_http_get(e.href, e.local_path)
129
- elsif e.http_method == :post
130
- SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
117
+ w.errback do
118
+ puts "errback:#{w.response_header}"
119
+ puts e.origin_href
120
+ puts e.href
121
+ puts w.response_header.status
122
+ failed_list << e
123
+ if e.http_method == :get
124
+ SpiderHelper.direct_http_get(e.href, e.local_path)
125
+ elsif e.http_method == :post
126
+ SpiderHelper.direct_http_post(e.href, e.local_path, e.params)
127
+ end
131
128
  end
132
- }
133
129
 
134
- begin
135
- multi.add e.local_path, w
136
- rescue Exception => exception
137
- puts exception
138
- puts e.href
139
- puts e.local_path
140
- stop_machine
130
+ begin
131
+ multi.add e.local_path, w
132
+ rescue => exception
133
+ puts exception
134
+ puts e.href
135
+ puts e.local_path
136
+ stop_machine
137
+ end
141
138
  end
142
- end
143
139
 
144
- cb = Proc.new do
145
- end_time = Time.now
146
- puts "use time:#{end_time-begin_time} seconds"
147
- if callback.nil?
148
- stop_machine
149
- else
150
- callback.call(multi, succeed_list, failed_list)
140
+ cb =
141
+ proc do
142
+ end_time = Time.now
143
+ puts "use time:#{end_time - begin_time} seconds"
144
+ if callback.nil?
145
+ stop_machine
146
+ else
147
+ callback.call(multi, succeed_list, failed_list)
148
+ end
151
149
  end
152
- end
153
- link_struct_list.each &for_each_proc
154
- multi.callback &cb
150
+ link_struct_list.each(&for_each_proc)
151
+ multi.callback(&cb)
155
152
  end
156
153
 
157
154
  def stop_machine
158
155
  puts "success size:#{@succeed_size}"
159
156
  puts "failed size:#{@failed_size}"
160
157
  @end_time = Time.now
161
- puts "total use time:#{@end_time-@begin_time} seconds"
158
+ puts "total use time:#{@end_time - @begin_time} seconds"
162
159
  EventMachine.stop
163
160
  @local_path_set.clear
164
161
  end
165
162
 
166
- def get_next_task
167
- return @down_list.shift(@max)
163
+ def next_task
164
+ @down_list.shift(@max)
168
165
  end
169
166
 
170
167
  def call_parse_method(e)
@@ -193,20 +190,20 @@ module ListSpider
193
190
  end
194
191
  end
195
192
 
196
- def complete(multi, success_list, failed_list)
193
+ def complete(_multi, success_list, failed_list)
197
194
  @succeed_size += success_list.size
198
195
  @failed_size += failed_list.size
199
196
  success_list.each do |e|
200
197
  call_parse_method(e)
201
198
  end
202
199
 
203
- todo = get_next_task
200
+ todo = next_task
204
201
 
205
202
  if todo.empty?
206
203
  stop_machine
207
204
  else
208
205
  if @interval != 0
209
- if success_list.size != 0 || failed_list.size != 0
206
+ if !success_list.empty? || !failed_list.empty?
210
207
  if @interval == RANDOM_TIME
211
208
  sleep(rand(@random_time_range))
212
209
  else
@@ -219,7 +216,7 @@ module ListSpider
219
216
  end
220
217
 
221
218
  def event_machine_start_list(down_list, callback = nil)
222
- EventMachine.run {
219
+ EventMachine.run do
223
220
  @begin_time = Time.now
224
221
  if down_list.empty?
225
222
  if callback
@@ -230,7 +227,7 @@ module ListSpider
230
227
  else
231
228
  event_machine_down(down_list, callback)
232
229
  end
233
- }
230
+ end
234
231
  end
235
232
 
236
233
  def filter_list(down_list)
@@ -242,7 +239,7 @@ module ListSpider
242
239
  need_down_list << ts
243
240
  end
244
241
  end
245
- return need_down_list
242
+ need_down_list
246
243
  end
247
244
 
248
245
  def get_list(down_list, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
@@ -255,7 +252,7 @@ module ListSpider
255
252
 
256
253
  need_down_list = filter_list(down_list)
257
254
 
258
- @down_list = @down_list + need_down_list
255
+ @down_list += need_down_list
259
256
  @interval = interval
260
257
  @max = max
261
258
  @max = @down_list.size if @max == NO_LIMIT_CONCURRENT
@@ -263,7 +260,7 @@ module ListSpider
263
260
  @failed_size = 0
264
261
 
265
262
  puts "total size:#{@down_list.size}"
266
- event_machine_start_list(get_next_task, method(:complete))
263
+ event_machine_start_list(next_task, method(:complete))
267
264
  end
268
265
 
269
266
  def get_one(task, interval: DEFAULT_INTERVAL, max: DEFAULT_CONCURRNET_MAX)
@@ -273,19 +270,18 @@ module ListSpider
273
270
  def add_task(task)
274
271
  if task.is_a?Array
275
272
  need_down_list = filter_list(task)
276
- @down_list = @down_list + need_down_list
273
+ @down_list += need_down_list
277
274
  elsif task.is_a?TaskStruct
278
275
  need_down_list = filter_list([task])
279
- @down_list = @down_list + need_down_list
276
+ @down_list += need_down_list
280
277
  else
281
278
  puts "error task type:#{task.class}"
282
279
  end
283
280
  end
284
281
  end
285
282
 
286
- Signal.trap("INT") do
283
+ Signal.trap('INT') do
287
284
  ListSpider.stop_machine
288
285
  exit!
289
286
  end
290
-
291
287
  end
data/lib/spider_helper.rb CHANGED
@@ -2,50 +2,46 @@ require 'rchardet'
2
2
  require 'net/http'
3
3
 
4
4
  module SpiderHelper
5
-
6
5
  class << self
7
-
8
6
  def direct_http_get(href, local_path, params: nil, header: nil)
9
- if href.class == "".class
10
- href = string_to_uri(href)
11
- end
7
+ href = string_to_uri(href) if href.class == ''.class
12
8
 
13
9
  begin
14
10
  href.query = URI.encode_www_form(params) if params
15
11
  req = Net::HTTP::Get.new(href)
16
- header.each{|k,v| req[k] = v} if header
12
+ header.each { |k, v| req[k] = v } if header
17
13
 
18
- res = Net::HTTP.start(href.hostname, href.port) do |http|
19
- http.request(req)
20
- end
14
+ res =
15
+ Net::HTTP.start(href.hostname, href.port) do |http|
16
+ http.request(req)
17
+ end
21
18
 
22
19
  if res.is_a?(Net::HTTPSuccess)
23
20
  local_dir = File.dirname(local_path)
24
21
  FileUtils.mkdir_p(local_dir) unless Dir.exist?(local_dir)
25
22
  File.write(local_path, res.body)
26
- puts "succeed"
23
+ puts 'succeed'
27
24
  else
28
25
  puts res
29
26
  end
30
- rescue Exception => e
27
+ rescue => e
31
28
  puts e.backtrace
32
29
  puts e
33
30
  end
34
31
  end
35
32
 
36
33
  def direct_http_post(href, local_path, params, header: nil)
37
- if href.class == "".class
38
- href = string_to_uri(href)
39
- end
34
+ href = string_to_uri(href) if href.class == ''.class
40
35
 
41
36
  begin
42
37
  req = Net::HTTP::Post.new(href)
43
38
  req.set_form_data(params)
44
- header.each{|k,v| req[k] = v} if header
39
+ header.each { |k, v| req[k] = v } if header
45
40
 
46
- res = Net::HTTP.start(href.hostname, href.port) do |http|
47
- http.request(req)
48
- end
41
+ res =
42
+ Net::HTTP.start(href.hostname, href.port) do |http|
43
+ http.request(req)
44
+ end
49
45
 
50
46
  if res.is_a?(Net::HTTPSuccess)
51
47
  local_dir = File.dirname(local_path)
@@ -54,7 +50,7 @@ module SpiderHelper
54
50
  else
55
51
  puts res
56
52
  end
57
- rescue Exception => e
53
+ rescue => e
58
54
  puts e
59
55
  end
60
56
  end
@@ -70,34 +66,32 @@ module SpiderHelper
70
66
  l.normalize!
71
67
  end
72
68
 
73
- BomHeaderMap = {"UTF-8" => "\xEF\xBB\xBF".force_encoding("UTF-8"),
74
- "UTF-16BE"=>"\xFE\xFF".force_encoding("UTF-16BE"),
75
- "UTF-16LE"=>"\xFF\xFE".force_encoding("UTF-16LE"),
76
- "UTF-32BE"=>"\x00\x00\xFE\xFF".force_encoding("UTF-32BE"),
77
- "UTF-32LE"=>"\xFF\xFE\x00\x00".force_encoding("UTF-32LE")}
69
+ BomHeaderMap = { 'UTF-8' => "\xEF\xBB\xBF".force_encoding('UTF-8'),
70
+ 'UTF-16BE' => "\xFE\xFF".force_encoding('UTF-16BE'),
71
+ 'UTF-16LE' => "\xFF\xFE".force_encoding('UTF-16LE'),
72
+ 'UTF-32BE' => "\x00\x00\xFE\xFF".force_encoding('UTF-32BE'),
73
+ 'UTF-32LE' => "\xFF\xFE\x00\x00".force_encoding('UTF-32LE') }.freeze
78
74
 
79
- #此函数有时此判断有误,使用to_utf8函数直接转换
75
+ # 此函数有时此判断有误,使用to_utf8函数直接转换
80
76
  def smart_to_utf8(str)
81
77
  return str if str.encoding == Encoding::UTF_8
82
78
  to_utf8(str)
83
79
  end
84
80
 
85
81
  def to_utf8(str)
86
- #解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
82
+ # 解决windows下CharDet库编译为ASCII_8BIT,无法与UTF-8兼容问题
87
83
  str.force_encoding(Encoding::ASCII_8BIT)
88
84
  cd = CharDet.detect(str)
89
- if cd["confidence"] > 0.6
90
- puts cd["encoding"]
91
- str.force_encoding(cd["encoding"])
92
- #移除BOM头
93
- bomHeader = BomHeaderMap[cd["encoding"]]
94
- str.sub!(bomHeader, "") if bomHeader
85
+ if cd['confidence'] > 0.6
86
+ puts cd['encoding']
87
+ str.force_encoding(cd['encoding'])
88
+ # 移除BOM头
89
+ bom_header = BomHeaderMap[cd['encoding']]
90
+ str.sub!(bom_header, '') if bom_header
95
91
  end
96
- str.encode!(Encoding::UTF_8, :undef => :replace, :replace => "?", :invalid => :replace)
97
-
98
- return str
99
- end
92
+ str.encode!(Encoding::UTF_8, undef: :replace, replace: '?', invalid: :replace)
100
93
 
94
+ str
95
+ end
101
96
  end
102
-
103
97
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.3.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang