list_spider 0.2.0 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dffcea81ca28eb242e97c493175682e01ddddbc6
4
- data.tar.gz: c721f41b7f763988a3b29745d303f124cea0ca71
3
+ metadata.gz: 38d3b1cc9c12998f7c0112f9f328d93e0c3909c8
4
+ data.tar.gz: 2e46c3201a979f17f3bafcbd7c2a215c666b29aa
5
5
  SHA512:
6
- metadata.gz: 99714407feee9e92043d8be1e19dc41bfc50c76f588a067ffc953ce9bea9d28dd42581c70a88c53815529f02036ea192b371b49a6a4ea20b42257728bc3ccb8b
7
- data.tar.gz: 18e625bd37040159fc1394bf93ab9d88464ac6459d8da51ed975a91367c2458e0f3f76c483fbee9362979c81a45c0508e29f4acae49a6896a0cfce7e5de1779c
6
+ metadata.gz: 25a9be2402259455a04e5c1cee6767de829841670a0967a2435e7a9616e94552e71371ab8ef69fd5e57fd640f71b205c2ae894798682a3583ec80ad2d7ac2b6d
7
+ data.tar.gz: cbe88b9d30421f6c1cf600d54755ad7df699e2186003c8df76b748317a3bedadd1b8c3536434c6cc11e34a9a59ece4ed4cfbfe712238538ab595d3ac4a121654
lib/file_filter.rb ADDED
@@ -0,0 +1,71 @@
1
+
2
+ class FileFilter
3
+ # 4033
4
+ # 920
5
+ def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
6
+ @dir_pattern = dir_pattern
7
+ @size_threshold = size_threshold
8
+ if cust_judge
9
+ @cust_judge = cust_judge
10
+ else
11
+ @cust_judge = method(:default_judge)
12
+ end
13
+ @total = 0
14
+ @process_block = process_block
15
+ end
16
+
17
+ def default_judge(f)
18
+ File.size(f) <= @size_threshold
19
+ end
20
+
21
+ def filter_file(f)
22
+ if @cust_judge.call(f)
23
+ @total += 1
24
+ @process_block.call(f)
25
+ end
26
+ end
27
+
28
+ def start
29
+ Dir.glob(@dir_pattern) do |f|
30
+ filter_file(f)
31
+ end
32
+ puts "total:#{@total}"
33
+ end
34
+
35
+ def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
36
+ FileFilter.new(dir_pattern,
37
+ size_threshold: size_threshold,
38
+ cust_judge: cust_judge,
39
+ process_block:
40
+ proc do |f|
41
+ puts "deleted file: #{f}"
42
+ File.delete(f)
43
+ end
44
+ ).start
45
+ end
46
+
47
+ def self.check(dir_pattern, size_threshold: 1000, cust_judge: nil)
48
+ FileFilter.new(dir_pattern,
49
+ size_threshold: size_threshold,
50
+ cust_judge: cust_judge,
51
+ process_block:
52
+ proc do |f|
53
+ puts "filterd file: #{f}"
54
+ end
55
+ ).start
56
+ end
57
+
58
+ def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
59
+ result_file = File.open(save_file_name, 'wt')
60
+ FileFilter.new(dir_pattern,
61
+ size_threshold: size_threshold,
62
+ cust_judge: cust_judge,
63
+ process_block:
64
+ proc do |f|
65
+ puts "filterd file: #{f}"
66
+ result_file << f << "\n"
67
+ end
68
+ ).start
69
+ end
70
+
71
+ end
lib/list_spider.rb CHANGED
@@ -4,10 +4,10 @@ require 'fileutils'
4
4
  require 'set'
5
5
  require "addressable/uri"
6
6
  require File.expand_path('../spider_helper', __FILE__)
7
- require File.expand_path('../delete_unvalid', __FILE__)
7
+ require File.expand_path('../file_filter', __FILE__)
8
8
 
9
9
  class TaskStruct
10
- def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
10
+ def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
11
11
  @origin_href = href
12
12
  @href = href
13
13
  if @href.class == "".class
@@ -18,13 +18,14 @@ class TaskStruct
18
18
  @params = params
19
19
  @extra_data = extra_data
20
20
  @parse_method = parse_method
21
+ @header = header
21
22
  end
22
23
 
23
24
  def == (o)
24
- o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
25
+ o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data && o.header == header
25
26
  end
26
27
 
27
- attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object
28
+ attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
28
29
 
29
30
  end
30
31
 
@@ -73,7 +74,12 @@ module ListSpider
73
74
  for_each_proc = proc do |e|
74
75
  opt = {}
75
76
  opt = {:redirects => @max_redirects}
76
- opt[:head] = @header_option if defined? @header_option
77
+ if e.header
78
+ opt[:head] = e.header
79
+ elsif defined? @header_option
80
+ opt[:head] = @header_option
81
+ end
82
+
77
83
  if e.http_method == :post
78
84
  opt[:body] = e.params unless e.params.empty?
79
85
  if @connection_opts
@@ -182,7 +188,7 @@ module ListSpider
182
188
 
183
189
  pm.call(e.local_path, e.extra_data, res_header, req)
184
190
  else
185
- puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3"
191
+ puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3, 4"
186
192
  end
187
193
  end
188
194
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-13 00:00:00.000000000 Z
11
+ date: 2016-05-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: em-http-request
@@ -56,7 +56,7 @@ executables: []
56
56
  extensions: []
57
57
  extra_rdoc_files: []
58
58
  files:
59
- - lib/delete_unvalid.rb
59
+ - lib/file_filter.rb
60
60
  - lib/list_spider.rb
61
61
  - lib/spider_helper.rb
62
62
  homepage: https://github.com/chinazhangchao/list_spider
lib/delete_unvalid.rb DELETED
@@ -1,40 +0,0 @@
1
-
2
- class DeleteUnvalid
3
- # 4033
4
- # 920
5
- def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil)
6
- @dir_pattern = dir_pattern
7
- @size_threshold = size_threshold
8
- if cust_judge
9
- @cust_judge = cust_judge
10
- else
11
- @cust_judge = method(:default_judge)
12
- end
13
- @total = 0
14
- end
15
-
16
- def default_judge(f)
17
- File.size(f) <= @size_threshold
18
- end
19
-
20
- def delete_unvaild(f)
21
- if @cust_judge.call(f)
22
- @total += 1
23
- puts "deleted file: #{f}"
24
- File.delete(f)
25
- end
26
- end
27
-
28
- def start
29
- Dir.glob(@dir_pattern) do |f|
30
- # puts f
31
- delete_unvaild(f)
32
- end
33
- puts "delete total:#{@total}"
34
- end
35
-
36
- def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
37
- DeleteUnvalid.new(dir_pattern, size_threshold: size_threshold, cust_judge: cust_judge).start
38
- end
39
-
40
- end