filetype_validation 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/filetype_validation.gemspec +1 -0
- data/lib/filetype_validation.rb +15 -0
- data/lib/filetype_validation/csv_validator.rb +9 -5
- data/lib/filetype_validation/plaintext_validator.rb +134 -0
- data/lib/filetype_validation/version.rb +1 -1
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c6b53afe61d6bd6d8c0b5f836dff50200d54e7f2
|
4
|
+
data.tar.gz: 2ea109699389980fb07cfad40c9ca158dd92367e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 26aad3870f438d218e2b29845280375df4b490ecd490758791f5bfa0e51f30aaaa0fbf34eab6b0dc8d3860d49208c069f2a3cc0507891fb4385219f98a5e124b
|
7
|
+
data.tar.gz: dbc20c0e58ba168ae52be90be80e4d8292a8a06859bc54da5c123adf3454375461a073f10f3ba27df8bf213086c7d0ca7466662f772439c5ecf06ad73c8c4217
|
data/.gitignore
CHANGED
data/filetype_validation.gemspec
CHANGED
@@ -38,4 +38,5 @@ Gem::Specification.new do |spec|
|
|
38
38
|
spec.add_development_dependency 'rspec', '~> 3.4'
|
39
39
|
spec.add_development_dependency 'rubocop', '~> 0.40'
|
40
40
|
spec.add_development_dependency 'simplecov', '~> 0.11'
|
41
|
+
spec.add_development_dependency 'rubocop-checkstyle_formatter', '~> 0.2.0'
|
41
42
|
end
|
data/lib/filetype_validation.rb
CHANGED
@@ -1,9 +1,24 @@
|
|
1
1
|
require 'filetype_validation/version'
|
2
|
+
require 'filetype_validation/plaintext_validator'
|
2
3
|
require 'filetype_validation/csv_validator'
|
3
4
|
|
4
5
|
# API entry point
|
5
6
|
module FiletypeValidation
|
7
|
+
# Evaluates whether given file is a csv
|
8
|
+
#
|
9
|
+
# @param file [File, String] the file or filepath
|
10
|
+
# @param options [Hash{Symbol => Number}] validation customizations
|
11
|
+
# @return [Boolean] true if file is a csv
|
6
12
|
def self.csv?(file, options = {})
|
7
13
|
CsvValidator.new(file, options).valid?
|
8
14
|
end
|
15
|
+
|
16
|
+
# Evaluates whether given file is plaintext
|
17
|
+
#
|
18
|
+
# @param file [File, String] the file or filepath
|
19
|
+
# @param options [Hash{Symbol => Number}] validation customizations
|
20
|
+
# @return [Boolean] true if file is plaintext
|
21
|
+
def self.plaintext?(file, options = {})
|
22
|
+
PlaintextValidator.new(file, options).valid?
|
23
|
+
end
|
9
24
|
end
|
@@ -7,25 +7,29 @@ module FiletypeValidation
|
|
7
7
|
MAX_LINES = 25
|
8
8
|
HEADERS = true
|
9
9
|
|
10
|
+
# Evaluates whether the file is a csv
|
11
|
+
#
|
12
|
+
# @return [Boolean] true if it's a csv file
|
10
13
|
def valid?
|
11
|
-
csv_extension? &&
|
14
|
+
csv_extension? && FiletypeValidation.plaintext?(file, options) &&
|
15
|
+
parse_csv?
|
12
16
|
end
|
13
17
|
|
14
18
|
private
|
15
19
|
|
20
|
+
# Checks if the file has a .csv extension
|
16
21
|
#
|
17
|
-
#
|
18
|
-
#
|
22
|
+
# @return [Boolean] true if the file has a csv extension
|
19
23
|
def csv_extension?
|
20
24
|
'.csv'.casecmp(File.extname(file.path))
|
21
25
|
.zero?
|
22
26
|
end
|
23
27
|
|
24
|
-
#
|
25
|
-
# This method checks to see if the file can have its
|
28
|
+
# Checks to see if the file can have its
|
26
29
|
# first number_of_lines parsed as a csv file using the CSV
|
27
30
|
# library.
|
28
31
|
#
|
32
|
+
# @return [Boolean] true if file can be parsed as csv
|
29
33
|
def parse_csv?
|
30
34
|
file_sample = File.foreach(file.path)
|
31
35
|
.first(options[:max_lines] || MAX_LINES)
|
@@ -0,0 +1,134 @@
|
|
1
|
+
require 'filetype_validation/base_validator'
|
2
|
+
|
3
|
+
module FiletypeValidation
|
4
|
+
# validator to classify a file as plaintext or bin
|
5
|
+
class PlaintextValidator < BaseValidator
|
6
|
+
# counting constants
|
7
|
+
LINE_FEED = 10
|
8
|
+
CARRIAGE_RETURN = 13
|
9
|
+
CTRL_CHAR = 0...32
|
10
|
+
# Byte values 32 through 127; the range starts at 32 so the space
# character is counted as ascii (CTRL_CHAR ends, exclusively, at 32)
ASCII_CHAR = 32...128
|
11
|
+
# High-bit byte values 128 through 255; the range starts at 128 so that
# byte 128 is counted as binary rather than falling outside every range
BIN_CHAR_8 = 128...(2**8)
|
12
|
+
|
13
|
+
# heuristic magic nums
|
14
|
+
LINE_TH = 1000
|
15
|
+
CTRL_TH = 0.1
|
16
|
+
BIN_TH = 0.05
|
17
|
+
# Float division is required here: the integer expression 1 / 10_000
# evaluates to 0, which would zero out every max_line_len * LINE_NORM term
LINE_NORM = 1.0 / 10_000
|
18
|
+
|
19
|
+
# file reading
|
20
|
+
BYTES_TO_READ = 4096
|
21
|
+
|
22
|
+
def initialize(file, options = {})
|
23
|
+
super(file, options)
|
24
|
+
|
25
|
+
@line_len = 0
|
26
|
+
@max_line_len = 0
|
27
|
+
@char_counts = { ascii: 0, ctrl: 0, bin_8: 0 }
|
28
|
+
|
29
|
+
@file_sample = make_sample_file
|
30
|
+
end
|
31
|
+
|
32
|
+
# Evaluates the given file to determine whether it is plaintext or binary
|
33
|
+
#
|
34
|
+
# @return [Boolean] true if it's plaintext
|
35
|
+
def valid?
|
36
|
+
return true if @file_sample.nil?
|
37
|
+
|
38
|
+
calculate_counts
|
39
|
+
|
40
|
+
return true if within_threshold?
|
41
|
+
|
42
|
+
false
|
43
|
+
end
|
44
|
+
|
45
|
+
private
|
46
|
+
|
47
|
+
# Source: https://www.ruby-forum.com/topic/122170
|
48
|
+
# Combination of Simon Krahnke and Robert Klemme's algorithms
|
49
|
+
# to weight character and line counts in order to form a heuristic
|
50
|
+
#
|
51
|
+
# @return [Boolean] true if the file sample data satisfies the heuristic
|
52
|
+
def within_threshold?
|
53
|
+
if @max_line_len > LINE_TH
|
54
|
+
ctrl_line_weight < (CTRL_TH * 2) || bin_line_weight < (BIN_TH * 2)
|
55
|
+
else
|
56
|
+
ctrl_ratio < CTRL_TH || bin_ratio < BIN_TH
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
60
|
+
# Calculate ratio of control characters to ascii characters
|
61
|
+
#
|
62
|
+
# @return [Number] the ratio
|
63
|
+
def ctrl_ratio
|
64
|
+
@char_counts[:ctrl].to_f / @char_counts[:ascii]
|
65
|
+
end
|
66
|
+
|
67
|
+
# Calculate ratio of binary characters to ascii characters
|
68
|
+
#
|
69
|
+
# @return [Number] the ratio
|
70
|
+
def bin_ratio
|
71
|
+
@char_counts[:bin_8].to_f / @char_counts[:ascii]
|
72
|
+
end
|
73
|
+
|
74
|
+
# Include max_line_len in the control weight
|
75
|
+
#
|
76
|
+
# @return [Number] the combined control and line weight
|
77
|
+
def ctrl_line_weight
|
78
|
+
ctrl_ratio + @max_line_len * LINE_NORM
|
79
|
+
end
|
80
|
+
|
81
|
+
# Include max_line_len in the binary weight
|
82
|
+
#
|
83
|
+
# @return [Number] the combined binary and line weight
|
84
|
+
def bin_line_weight
|
85
|
+
bin_ratio + @max_line_len * LINE_NORM * (CTRL_TH / BIN_TH)
|
86
|
+
end
|
87
|
+
|
88
|
+
# Computes line and character counts used for the heuristic
|
89
|
+
def calculate_counts
|
90
|
+
@file_sample.each_byte do |bt|
|
91
|
+
update_line_count(bt)
|
92
|
+
update_char_count(bt)
|
93
|
+
end
|
94
|
+
|
95
|
+
# save length of last line
|
96
|
+
update_line_count(LINE_FEED)
|
97
|
+
end
|
98
|
+
|
99
|
+
# Increments line length count and updates the max
|
100
|
+
#
|
101
|
+
# @param char [Integer] the byte value being parsed
|
102
|
+
def update_line_count(char)
|
103
|
+
if char == LINE_FEED || char == CARRIAGE_RETURN
|
104
|
+
@max_line_len = @line_len if @line_len > @max_line_len
|
105
|
+
@line_len = 0
|
106
|
+
else
|
107
|
+
@line_len += 1
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
# Updates corresponding value in character count hash
|
112
|
+
#
|
113
|
+
# @param char [Integer] the byte value being parsed
|
114
|
+
def update_char_count(char)
|
115
|
+
case char
|
116
|
+
when CTRL_CHAR
|
117
|
+
@char_counts[:ctrl] += 1
|
118
|
+
when ASCII_CHAR
|
119
|
+
@char_counts[:ascii] += 1
|
120
|
+
when BIN_CHAR_8
|
121
|
+
@char_counts[:bin_8] += 1
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
# Reads the first options[:bytes_to_read] or BYTES_TO_READ
|
126
|
+
# bytes of the given file
|
127
|
+
#
|
128
|
+
# @return [String, nil] the leading bytes of the file, or nil if the file is empty
|
129
|
+
def make_sample_file
|
130
|
+
read_length = options[:bytes_to_read] || BYTES_TO_READ
|
131
|
+
File.open(file.path, 'rb') { |io| io.read(read_length) }
|
132
|
+
end
|
133
|
+
end
|
134
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: filetype_validation
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Joe Jacob
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-07-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -80,6 +80,20 @@ dependencies:
|
|
80
80
|
- - ~>
|
81
81
|
- !ruby/object:Gem::Version
|
82
82
|
version: '0.11'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rubocop-checkstyle_formatter
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ~>
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.2.0
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ~>
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.2.0
|
83
97
|
description: FiletypeValidation is a Ruby gem for validating file content types.
|
84
98
|
email: jjacob@optoro.com
|
85
99
|
executables: []
|
@@ -98,6 +112,7 @@ files:
|
|
98
112
|
- lib/filetype_validation.rb
|
99
113
|
- lib/filetype_validation/base_validator.rb
|
100
114
|
- lib/filetype_validation/csv_validator.rb
|
115
|
+
- lib/filetype_validation/plaintext_validator.rb
|
101
116
|
- lib/filetype_validation/version.rb
|
102
117
|
homepage: https://github.com/optoro/filetype_validation/
|
103
118
|
licenses:
|