filetype_validation 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e3dfb2058caae6c57e4e98be1640073fed50ad26
4
- data.tar.gz: f7788e378e8a863164dd9a6c36dfc2b0b6c946a2
3
+ metadata.gz: c6b53afe61d6bd6d8c0b5f836dff50200d54e7f2
4
+ data.tar.gz: 2ea109699389980fb07cfad40c9ca158dd92367e
5
5
  SHA512:
6
- metadata.gz: ef2105050bc1f563a1c15423ce9274d2c1441c43bb88babaed75622ae85e2f097a88de019be37529c8ff6f2fdf8d5121a2c499570417c49423bd0ca29c94cbed
7
- data.tar.gz: 7c632a760d7a762c46c13d68a808df3b3b3ba1a85c2e2f61492ff8ec39a5eb58a4f25a3eef6c62c5236956888747e0eafda238d0b2f3328c4b79ca73b6dcb1f0
6
+ metadata.gz: 26aad3870f438d218e2b29845280375df4b490ecd490758791f5bfa0e51f30aaaa0fbf34eab6b0dc8d3860d49208c069f2a3cc0507891fb4385219f98a5e124b
7
+ data.tar.gz: dbc20c0e58ba168ae52be90be80e4d8292a8a06859bc54da5c123adf3454375461a073f10f3ba27df8bf213086c7d0ca7466662f772439c5ecf06ad73c8c4217
data/.gitignore CHANGED
@@ -50,3 +50,5 @@ coverage
50
50
 
51
51
  # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
52
52
  .rvmrc
53
+
54
+ .*.swp
@@ -38,4 +38,5 @@ Gem::Specification.new do |spec|
38
38
  spec.add_development_dependency 'rspec', '~> 3.4'
39
39
  spec.add_development_dependency 'rubocop', '~> 0.40'
40
40
  spec.add_development_dependency 'simplecov', '~> 0.11'
41
+ spec.add_development_dependency 'rubocop-checkstyle_formatter', '~> 0.2.0'
41
42
  end
@@ -1,9 +1,24 @@
1
1
  require 'filetype_validation/version'
2
+ require 'filetype_validation/plaintext_validator'
2
3
  require 'filetype_validation/csv_validator'
3
4
 
4
5
  # API entry point
5
6
  module FiletypeValidation
7
+ # Evaluates whether given file is a csv
8
+ #
9
+ # @param file [File, String] the file or filepath
10
+ # @param options [Hash{Symbol => Number}] validation customizations
11
+ # @return [Boolean] true if file is a csv
6
12
  def self.csv?(file, options = {})
7
13
  CsvValidator.new(file, options).valid?
8
14
  end
15
+
16
+ # Evaluates whether given file is plaintext
17
+ #
18
+ # @param file [File, String] the file or filepath
19
+ # @param options [Hash{Symbol => Number}] validation customizations
20
+ # @return [Boolean] true if file is plaintext
21
+ def self.plaintext?(file, options = {})
22
+ PlaintextValidator.new(file, options).valid?
23
+ end
9
24
  end
@@ -7,25 +7,29 @@ module FiletypeValidation
7
7
  MAX_LINES = 25
8
8
  HEADERS = true
9
9
 
10
+ # Evaluates whether the file is a csv
11
+ #
12
+ # @return [Boolean] true if it's a csv file
10
13
  def valid?
11
- csv_extension? && parse_csv?
14
+ csv_extension? && FiletypeValidation.plaintext?(file, options) &&
15
+ parse_csv?
12
16
  end
13
17
 
14
18
  private
15
19
 
20
+ # Checks if the file has a .csv extension
16
21
  #
17
- # Simply checks if the file has a .csv extension
18
- #
22
+ # @return [Boolean] true if the file has a csv extension
19
23
  def csv_extension?
20
24
  '.csv'.casecmp(File.extname(file.path))
21
25
  .zero?
22
26
  end
23
27
 
24
- #
25
- # This method checks to see if the file can have its
28
+ # Checks to see if the file can have its
26
29
  # first number_of_lines parsed as a csv file using the CSV
27
30
  # library.
28
31
  #
32
+ # @return [Boolean] true if file can be parsed as csv
29
33
  def parse_csv?
30
34
  file_sample = File.foreach(file.path)
31
35
  .first(options[:max_lines] || MAX_LINES)
@@ -0,0 +1,134 @@
1
+ require 'filetype_validation/base_validator'
2
+
3
module FiletypeValidation
  # Classifies a file as plaintext or binary by sampling its leading bytes
  # and comparing how many control and high-bit bytes it contains relative
  # to printable ASCII bytes.
  class PlaintextValidator < BaseValidator
    # byte values that end a line
    LINE_FEED = 10
    CARRIAGE_RETURN = 13

    # byte classes; the ranges are half-open and contiguous so that every
    # value in 0..255 lands in exactly one bucket. Tab, line feed and
    # carriage return fall inside CTRL_CHAR and are counted as control bytes.
    CTRL_CHAR = 0...32
    ASCII_CHAR = 32...128
    BIN_CHAR_8 = 128...(2**8)

    # heuristic magic nums
    LINE_TH = 1000
    CTRL_TH = 0.1
    BIN_TH = 0.05
    # Must be a Float: `1 / 10_000` is Integer division and evaluates to 0,
    # which silently removed the line length from both weights.
    LINE_NORM = 1.0 / 10_000

    # default number of bytes sampled from the start of the file
    BYTES_TO_READ = 4096

    # Reads the sample once at construction time and zeroes the counters.
    #
    # @param file [File, String] the file or filepath
    #   (NOTE(review): this class calls file.path, so the superclass
    #   presumably normalizes a String path - confirm in BaseValidator)
    # @param options [Hash{Symbol => Number}] validation customizations;
    #   :bytes_to_read overrides BYTES_TO_READ
    def initialize(file, options = {})
      super(file, options)

      @line_len = 0
      @max_line_len = 0
      @char_counts = { ascii: 0, ctrl: 0, bin_8: 0 }

      @file_sample = make_sample_file
    end

    # Evaluates the given file to determine if it's plaintext
    #
    # @return [Boolean] true if it's plaintext; a file that yields no
    #   sample (IO#read returned nil) is treated as plaintext
    def valid?
      return true if @file_sample.nil?

      calculate_counts

      within_threshold?
    end

    private

    # Source: https://www.ruby-forum.com/topic/122170
    # Combination of Simon Krahnke and Robert Klemme's algorithms
    # to weight character and line counts in order to form a heuristic.
    #
    # When the longest sampled line exceeds LINE_TH the line length is
    # folded into the weights and the thresholds are doubled; otherwise the
    # plain ratios are compared. Either measure being under its threshold
    # is enough to pass.
    #
    # NOTE(review): with LINE_NORM as a Float, bin_line_weight is at least
    # 0.2 whenever this branch is taken (max_line_len > 1000), which is
    # never below BIN_TH * 2; confirm the intended scaling factor.
    #
    # @return [Boolean] true if the file sample data satisfies the heuristic
    def within_threshold?
      if @max_line_len > LINE_TH
        ctrl_line_weight < (CTRL_TH * 2) || bin_line_weight < (BIN_TH * 2)
      else
        ctrl_ratio < CTRL_TH || bin_ratio < BIN_TH
      end
    end

    # Calculate ratio of control characters to ascii characters
    #
    # @return [Float] the ratio
    def ctrl_ratio
      ratio_to_ascii(@char_counts[:ctrl])
    end

    # Calculate ratio of binary characters to ascii characters
    #
    # @return [Float] the ratio
    def bin_ratio
      ratio_to_ascii(@char_counts[:bin_8])
    end

    # Divides a byte count by the ascii byte count.
    #
    # A zero count short-circuits to 0.0 so that 0 / 0 does not produce
    # NaN (every comparison against NaN is false, which used to reject
    # samples holding nothing but whitespace). A positive count over zero
    # ascii bytes still yields Infinity.
    #
    # @param count [Integer] number of bytes in the class being measured
    # @return [Float] the ratio
    def ratio_to_ascii(count)
      return 0.0 if count.zero?

      count.to_f / @char_counts[:ascii]
    end

    # Include max_line_len in the control weight
    #
    # @return [Float] the combined control and line weight
    def ctrl_line_weight
      ctrl_ratio + @max_line_len * LINE_NORM
    end

    # Include max_line_len in the binary weight
    #
    # @return [Float] the combined binary and line weight
    def bin_line_weight
      bin_ratio + @max_line_len * LINE_NORM * (CTRL_TH / BIN_TH)
    end

    # Computes line and character counts used for the heuristic
    def calculate_counts
      @file_sample.each_byte do |byte|
        update_line_count(byte)
        update_char_count(byte)
      end

      # save length of last line, which may lack a trailing terminator
      update_line_count(LINE_FEED)
    end

    # Increments line length count and updates the max
    #
    # @param char [Integer] the byte value being parsed
    def update_line_count(char)
      if char == LINE_FEED || char == CARRIAGE_RETURN
        @max_line_len = @line_len if @line_len > @max_line_len
        @line_len = 0
      else
        @line_len += 1
      end
    end

    # Updates corresponding value in character count hash
    #
    # @param char [Integer] the byte value being parsed
    def update_char_count(char)
      case char
      when CTRL_CHAR
        @char_counts[:ctrl] += 1
      when ASCII_CHAR
        @char_counts[:ascii] += 1
      when BIN_CHAR_8
        @char_counts[:bin_8] += 1
      end
    end

    # Reads the first options[:bytes_to_read] or BYTES_TO_READ
    # bytes of the given file in binary mode
    #
    # @return [String, nil] the sampled bytes, or nil when IO#read hits
    #   end of file immediately (an empty file)
    def make_sample_file
      read_length = options[:bytes_to_read] || BYTES_TO_READ
      File.open(file.path, 'rb') { |io| io.read(read_length) }
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module FiletypeValidation
2
- VERSION = '0.1.0'.freeze
2
+ VERSION = '0.1.1'.freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: filetype_validation
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Joe Jacob
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-06-30 00:00:00.000000000 Z
11
+ date: 2016-07-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -80,6 +80,20 @@ dependencies:
80
80
  - - ~>
81
81
  - !ruby/object:Gem::Version
82
82
  version: '0.11'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rubocop-checkstyle_formatter
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ~>
88
+ - !ruby/object:Gem::Version
89
+ version: 0.2.0
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ~>
95
+ - !ruby/object:Gem::Version
96
+ version: 0.2.0
83
97
  description: FiletypeValidation is a Ruby gem for validating file content types.
84
98
  email: jjacob@optoro.com
85
99
  executables: []
@@ -98,6 +112,7 @@ files:
98
112
  - lib/filetype_validation.rb
99
113
  - lib/filetype_validation/base_validator.rb
100
114
  - lib/filetype_validation/csv_validator.rb
115
+ - lib/filetype_validation/plaintext_validator.rb
101
116
  - lib/filetype_validation/version.rb
102
117
  homepage: https://github.com/optoro/filetype_validation/
103
118
  licenses: