list_spider 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dffcea81ca28eb242e97c493175682e01ddddbc6
4
- data.tar.gz: c721f41b7f763988a3b29745d303f124cea0ca71
3
+ metadata.gz: 38d3b1cc9c12998f7c0112f9f328d93e0c3909c8
4
+ data.tar.gz: 2e46c3201a979f17f3bafcbd7c2a215c666b29aa
5
5
  SHA512:
6
- metadata.gz: 99714407feee9e92043d8be1e19dc41bfc50c76f588a067ffc953ce9bea9d28dd42581c70a88c53815529f02036ea192b371b49a6a4ea20b42257728bc3ccb8b
7
- data.tar.gz: 18e625bd37040159fc1394bf93ab9d88464ac6459d8da51ed975a91367c2458e0f3f76c483fbee9362979c81a45c0508e29f4acae49a6896a0cfce7e5de1779c
6
+ metadata.gz: 25a9be2402259455a04e5c1cee6767de829841670a0967a2435e7a9616e94552e71371ab8ef69fd5e57fd640f71b205c2ae894798682a3583ec80ad2d7ac2b6d
7
+ data.tar.gz: cbe88b9d30421f6c1cf600d54755ad7df699e2186003c8df76b748317a3bedadd1b8c3536434c6cc11e34a9a59ece4ed4cfbfe712238538ab595d3ac4a121654
@@ -0,0 +1,71 @@
1
+
2
+ class FileFilter
3
+ # 4033
4
+ # 920
5
+ def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil, process_block: nil)
6
+ @dir_pattern = dir_pattern
7
+ @size_threshold = size_threshold
8
+ if cust_judge
9
+ @cust_judge = cust_judge
10
+ else
11
+ @cust_judge = method(:default_judge)
12
+ end
13
+ @total = 0
14
+ @process_block = process_block
15
+ end
16
+
17
+ def default_judge(f)
18
+ File.size(f) <= @size_threshold
19
+ end
20
+
21
+ def filter_file(f)
22
+ if @cust_judge.call(f)
23
+ @total += 1
24
+ @process_block.call(f)
25
+ end
26
+ end
27
+
28
+ def start
29
+ Dir.glob(@dir_pattern) do |f|
30
+ filter_file(f)
31
+ end
32
+ puts "total:#{@total}"
33
+ end
34
+
35
+ def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
36
+ FileFilter.new(dir_pattern,
37
+ size_threshold: size_threshold,
38
+ cust_judge: cust_judge,
39
+ process_block:
40
+ proc do |f|
41
+ puts "deleted file: #{f}"
42
+ File.delete(f)
43
+ end
44
+ ).start
45
+ end
46
+
47
+ def self.check(dir_pattern, size_threshold: 1000, cust_judge: nil)
48
+ FileFilter.new(dir_pattern,
49
+ size_threshold: size_threshold,
50
+ cust_judge: cust_judge,
51
+ process_block:
52
+ proc do |f|
53
+ puts "filterd file: #{f}"
54
+ end
55
+ ).start
56
+ end
57
+
58
+ def self.check_save_result(dir_pattern, save_file_name: 'filtered_file.txt', size_threshold: 1000, cust_judge: nil)
59
+ result_file = File.open(save_file_name, 'wt')
60
+ FileFilter.new(dir_pattern,
61
+ size_threshold: size_threshold,
62
+ cust_judge: cust_judge,
63
+ process_block:
64
+ proc do |f|
65
+ puts "filterd file: #{f}"
66
+ result_file << f << "\n"
67
+ end
68
+ ).start
69
+ end
70
+
71
+ end
@@ -4,10 +4,10 @@ require 'fileutils'
4
4
  require 'set'
5
5
  require "addressable/uri"
6
6
  require File.expand_path('../spider_helper', __FILE__)
7
- require File.expand_path('../delete_unvalid', __FILE__)
7
+ require File.expand_path('../file_filter', __FILE__)
8
8
 
9
9
  class TaskStruct
10
- def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil)
10
+ def initialize(href, local_path, http_method: :get, params: {}, extra_data: nil, parse_method: nil, header: nil)
11
11
  @origin_href = href
12
12
  @href = href
13
13
  if @href.class == "".class
@@ -18,13 +18,14 @@ class TaskStruct
18
18
  @params = params
19
19
  @extra_data = extra_data
20
20
  @parse_method = parse_method
21
+ @header = header
21
22
  end
22
23
 
23
24
  def == (o)
24
- o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data
25
+ o.class == self.class && o.href == href && o.local_path == local_path && o.http_method == http_method && o.params == params && o.extra_data == extra_data && o.header == header
25
26
  end
26
27
 
27
- attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object
28
+ attr_accessor :origin_href , :href, :local_path, :http_method, :params, :extra_data, :parse_method, :request_object, :header
28
29
 
29
30
  end
30
31
 
@@ -73,7 +74,12 @@ module ListSpider
73
74
  for_each_proc = proc do |e|
74
75
  opt = {}
75
76
  opt = {:redirects => @max_redirects}
76
- opt[:head] = @header_option if defined? @header_option
77
+ if e.header
78
+ opt[:head] = e.header
79
+ elsif defined? @header_option
80
+ opt[:head] = @header_option
81
+ end
82
+
77
83
  if e.http_method == :post
78
84
  opt[:body] = e.params unless e.params.empty?
79
85
  if @connection_opts
@@ -182,7 +188,7 @@ module ListSpider
182
188
 
183
189
  pm.call(e.local_path, e.extra_data, res_header, req)
184
190
  else
185
- puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3"
191
+ puts "Error! The number of arguments is:#{pm.arity}. While expected number is 1, 2, 3, 4"
186
192
  end
187
193
  end
188
194
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: list_spider
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Charles Zhang
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-05-13 00:00:00.000000000 Z
11
+ date: 2016-05-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: em-http-request
@@ -56,7 +56,7 @@ executables: []
56
56
  extensions: []
57
57
  extra_rdoc_files: []
58
58
  files:
59
- - lib/delete_unvalid.rb
59
+ - lib/file_filter.rb
60
60
  - lib/list_spider.rb
61
61
  - lib/spider_helper.rb
62
62
  homepage: https://github.com/chinazhangchao/list_spider
@@ -1,40 +0,0 @@
1
-
2
- class DeleteUnvalid
3
- # 4033
4
- # 920
5
- def initialize(dir_pattern, size_threshold: 1000, cust_judge: nil)
6
- @dir_pattern = dir_pattern
7
- @size_threshold = size_threshold
8
- if cust_judge
9
- @cust_judge = cust_judge
10
- else
11
- @cust_judge = method(:default_judge)
12
- end
13
- @total = 0
14
- end
15
-
16
- def default_judge(f)
17
- File.size(f) <= @size_threshold
18
- end
19
-
20
- def delete_unvaild(f)
21
- if @cust_judge.call(f)
22
- @total += 1
23
- puts "deleted file: #{f}"
24
- File.delete(f)
25
- end
26
- end
27
-
28
- def start
29
- Dir.glob(@dir_pattern) do |f|
30
- # puts f
31
- delete_unvaild(f)
32
- end
33
- puts "delete total:#{@total}"
34
- end
35
-
36
- def self.delete(dir_pattern, size_threshold: 1000, cust_judge: nil)
37
- DeleteUnvalid.new(dir_pattern, size_threshold: size_threshold, cust_judge: cust_judge).start
38
- end
39
-
40
- end