filetype_validation 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e3dfb2058caae6c57e4e98be1640073fed50ad26
4
- data.tar.gz: f7788e378e8a863164dd9a6c36dfc2b0b6c946a2
3
+ metadata.gz: c6b53afe61d6bd6d8c0b5f836dff50200d54e7f2
4
+ data.tar.gz: 2ea109699389980fb07cfad40c9ca158dd92367e
5
5
  SHA512:
6
- metadata.gz: ef2105050bc1f563a1c15423ce9274d2c1441c43bb88babaed75622ae85e2f097a88de019be37529c8ff6f2fdf8d5121a2c499570417c49423bd0ca29c94cbed
7
- data.tar.gz: 7c632a760d7a762c46c13d68a808df3b3b3ba1a85c2e2f61492ff8ec39a5eb58a4f25a3eef6c62c5236956888747e0eafda238d0b2f3328c4b79ca73b6dcb1f0
6
+ metadata.gz: 26aad3870f438d218e2b29845280375df4b490ecd490758791f5bfa0e51f30aaaa0fbf34eab6b0dc8d3860d49208c069f2a3cc0507891fb4385219f98a5e124b
7
+ data.tar.gz: dbc20c0e58ba168ae52be90be80e4d8292a8a06859bc54da5c123adf3454375461a073f10f3ba27df8bf213086c7d0ca7466662f772439c5ecf06ad73c8c4217
data/.gitignore CHANGED
@@ -50,3 +50,5 @@ coverage
50
50
 
51
51
  # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
52
52
  .rvmrc
53
+
54
+ .*.swp
@@ -38,4 +38,5 @@ Gem::Specification.new do |spec|
38
38
  spec.add_development_dependency 'rspec', '~> 3.4'
39
39
  spec.add_development_dependency 'rubocop', '~> 0.40'
40
40
  spec.add_development_dependency 'simplecov', '~> 0.11'
41
+ spec.add_development_dependency 'rubocop-checkstyle_formatter', '~> 0.2.0'
41
42
  end
@@ -1,9 +1,24 @@
1
1
  require 'filetype_validation/version'
2
+ require 'filetype_validation/plaintext_validator'
2
3
  require 'filetype_validation/csv_validator'
3
4
 
4
5
  # API entry point
5
6
  module FiletypeValidation
7
+ # Evaluates whether given file is a csv
8
+ #
9
+ # @param file [File, String] the file or filepath
10
+ # @param options [Hash{Symbol => Number}] validation customizations
11
+ # @return [Boolean] true if file is a csv
6
12
  def self.csv?(file, options = {})
7
13
  CsvValidator.new(file, options).valid?
8
14
  end
15
+
16
+ # Evaluates whether given file is plaintext
17
+ #
18
+ # @param file [File, String] the file or filepath
19
+ # @param options [Hash{Symbol => Number}] validation customizations
20
+ # @return [Boolean] true if file is plaintext
21
+ def self.plaintext?(file, options = {})
22
+ PlaintextValidator.new(file, options).valid?
23
+ end
9
24
  end
@@ -7,25 +7,29 @@ module FiletypeValidation
7
7
  MAX_LINES = 25
8
8
  HEADERS = true
9
9
 
10
+ # Evaluates whether the file is a csv
11
+ #
12
+ # @return [Boolean] true if it's a csv file
10
13
  def valid?
11
- csv_extension? && parse_csv?
14
+ csv_extension? && FiletypeValidation.plaintext?(file, options) &&
15
+ parse_csv?
12
16
  end
13
17
 
14
18
  private
15
19
 
20
+ # Checks if the file has a .csv extension
16
21
  #
17
- # Simply checks if the file has a .csv extension
18
- #
22
+ # @return [Boolean] true if the file has a csv extension
19
23
  def csv_extension?
20
24
  '.csv'.casecmp(File.extname(file.path))
21
25
  .zero?
22
26
  end
23
27
 
24
- #
25
- # This method checks to see if the file can have its
28
+ # Checks to see if the file can have its
26
29
  # first number_of_lines parsed as a csv file using the CSV
27
30
  # library.
28
31
  #
32
+ # @return [Boolean] true if file can be parsed as csv
29
33
  def parse_csv?
30
34
  file_sample = File.foreach(file.path)
31
35
  .first(options[:max_lines] || MAX_LINES)
@@ -0,0 +1,134 @@
1
+ require 'filetype_validation/base_validator'
2
+
3
+ module FiletypeValidation
4
+ # Validator that classifies a file as plaintext or binary
5
+ class PlaintextValidator < BaseValidator
6
+ # counting constants
7
+ LINE_FEED = 10
8
+ CARRIAGE_RETURN = 13
9
+ CTRL_CHAR = 0...32
10
+ ASCII_CHAR = 33...128
11
+ BIN_CHAR_8 = 129...(2**8)
12
+
13
+ # heuristic magic nums
14
+ LINE_TH = 1000
15
+ CTRL_TH = 0.1
16
+ BIN_TH = 0.05
17
+ LINE_NORM = 1 / 10_000
18
+
19
+ # file reading
20
+ BYTES_TO_READ = 4096
21
+
22
+ def initialize(file, options = {})
23
+ super(file, options)
24
+
25
+ @line_len = 0
26
+ @max_line_len = 0
27
+ @char_counts = { ascii: 0, ctrl: 0, bin_8: 0 }
28
+
29
+ @file_sample = make_sample_file
30
+ end
31
+
32
+ # Evaluates the given file to determine whether it is plaintext or binary
33
+ #
34
+ # @return [Boolean] true if it's plaintext
35
+ def valid?
36
+ return true if @file_sample.nil?
37
+
38
+ calculate_counts
39
+
40
+ return true if within_threshold?
41
+
42
+ false
43
+ end
44
+
45
+ private
46
+
47
+ # Source: https://www.ruby-forum.com/topic/122170
48
+ # Combination of Simon Krahnke and Robert Klemme's algorithms
49
+ # to weight character and line counts in order to form a heuristic
50
+ #
51
+ # @return [Boolean] true if the file sample data satisfies the heuristic
52
+ def within_threshold?
53
+ if @max_line_len > LINE_TH
54
+ ctrl_line_weight < (CTRL_TH * 2) || bin_line_weight < (BIN_TH * 2)
55
+ else
56
+ ctrl_ratio < CTRL_TH || bin_ratio < BIN_TH
57
+ end
58
+ end
59
+
60
+ # Calculate ratio of control characters to ascii characters
61
+ #
62
+ # @return [Number] the ratio
63
+ def ctrl_ratio
64
+ @char_counts[:ctrl].to_f / @char_counts[:ascii]
65
+ end
66
+
67
+ # Calculate ratio of binary characters to ascii characters
68
+ #
69
+ # @return [Number] the ratio
70
+ def bin_ratio
71
+ @char_counts[:bin_8].to_f / @char_counts[:ascii]
72
+ end
73
+
74
+ # Include max_line_len in the control weight
75
+ #
76
+ # @return [Number] the combined control and line weight
77
+ def ctrl_line_weight
78
+ ctrl_ratio + @max_line_len * LINE_NORM
79
+ end
80
+
81
+ # Include max_line_len in the binary weight
82
+ #
83
+ # @return [Number] the combined binary and line weight
84
+ def bin_line_weight
85
+ bin_ratio + @max_line_len * LINE_NORM * (CTRL_TH / BIN_TH)
86
+ end
87
+
88
+ # Computes line and character counts used for the heuristic
89
+ def calculate_counts
90
+ @file_sample.each_byte do |bt|
91
+ update_line_count(bt)
92
+ update_char_count(bt)
93
+ end
94
+
95
+ # save length of last line
96
+ update_line_count(LINE_FEED)
97
+ end
98
+
99
+ # Increments line length count and updates the max
100
+ #
101
+ # @param char [Integer] the byte value being parsed
102
+ def update_line_count(char)
103
+ if char == LINE_FEED || char == CARRIAGE_RETURN
104
+ @max_line_len = @line_len if @line_len > @max_line_len
105
+ @line_len = 0
106
+ else
107
+ @line_len += 1
108
+ end
109
+ end
110
+
111
+ # Updates corresponding value in character count hash
112
+ #
113
+ # @param char [Integer] the byte value being parsed
114
+ def update_char_count(char)
115
+ case char
116
+ when CTRL_CHAR
117
+ @char_counts[:ctrl] += 1
118
+ when ASCII_CHAR
119
+ @char_counts[:ascii] += 1
120
+ when BIN_CHAR_8
121
+ @char_counts[:bin_8] += 1
122
+ end
123
+ end
124
+
125
+ # Reads the first options[:bytes_to_read] or BYTES_TO_READ
126
+ # bytes of the given file
127
+ #
128
+ # @return [String, nil] the leading bytes of the file, or nil if nothing could be read
129
+ def make_sample_file
130
+ read_length = options[:bytes_to_read] || BYTES_TO_READ
131
+ File.open(file.path, 'rb') { |io| io.read(read_length) }
132
+ end
133
+ end
134
+ end
@@ -1,3 +1,3 @@
1
1
  module FiletypeValidation
2
- VERSION = '0.1.0'.freeze
2
+ VERSION = '0.1.1'.freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: filetype_validation
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joe Jacob
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-06-30 00:00:00.000000000 Z
11
+ date: 2016-07-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - ~>
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0.11'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rubocop-checkstyle_formatter
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: 0.2.0
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: 0.2.0
83
97
  description: FiletypeValidation is a Ruby gem for validating file content types.
84
98
  email: jjacob@optoro.com
85
99
  executables: []
@@ -98,6 +112,7 @@ files:
98
112
  - lib/filetype_validation.rb
99
113
  - lib/filetype_validation/base_validator.rb
100
114
  - lib/filetype_validation/csv_validator.rb
115
+ - lib/filetype_validation/plaintext_validator.rb
101
116
  - lib/filetype_validation/version.rb
102
117
  homepage: https://github.com/optoro/filetype_validation/
103
118
  licenses: