fixed_width_file_validator 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 3669e738753290e5ebd2c0d3fb5514db3045c103
4
+ data.tar.gz: bfd9a7a87b5a743f4ce04148f01a1513f736c768
5
+ SHA512:
6
+ metadata.gz: de02afb537d74585f2fe60bb12710bd91b32b8466630c3280b355301619917e0f5f21c2853f8b1aa75e6a542e6ab6ef14154c9d8de2b75d094523eaef507ab71
7
+ data.tar.gz: 383f7f1bd0e385aeb981c8f94d12b2bd6568c2fad69436e4ae3d04a8d0e1f42336e9dab52edd889af5dfedc3d0d6b91c2d475dbb31dea3fed0196a11dac0f720
data/README.md ADDED
@@ -0,0 +1,27 @@
1
+ # FixedWidthFileValidator
2
+
3
+ Validates a fixed-width text file based on configuration.
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application's Gemfile:
8
+
9
+ ```ruby
10
+ gem 'fixed_width_file_validator'
11
+ ```
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ ## Usage
18
+
19
+ See the [sample configuration](test/data/sample_format_1.yml) for details.
20
+
21
+ ## Contributing
22
+
23
+ Bug reports and pull requests are welcome on GitHub at https://github.com/sloppycoder/fixed_width_file_validator.
24
+
25
+ ## License
26
+
27
+ The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
@@ -0,0 +1,7 @@
1
+ require 'fixed_width_file_validator/version'
2
+ require 'fixed_width_file_validator/string_helper'
3
+ require 'fixed_width_file_validator/file_format'
4
+ require 'fixed_width_file_validator/report_formatter'
5
+
6
# Top-level namespace of the gem. It is intentionally empty here: the classes and
# helpers live in the files required above.
module FixedWidthFileValidator
end
@@ -0,0 +1,131 @@
1
+ require 'fixed_width_file_validator/file_reader'
2
+ require 'fixed_width_file_validator/validator'
3
+ require 'fixed_width_file_validator/record_parser'
4
+
5
+ require 'yaml'
6
+
7
module FixedWidthFileValidator
  # Layout and validation rules for one record type, loaded from a YAML configuration file.
  #
  # The configuration maps record type names to a hash holding a :fields list (each entry
  # may carry :name, :width, :starts_at and :validate) plus the optional settings
  # :inherit_from, :skip_top_lines, :skip_bottom_lines and :encoding.
  class FileFormat
    attr_reader :record_type, :fields, :unique_fields, :file_settings

    @all_formats = {}

    # Returns the cached format for +record_type+, building it on first use.
    # The cache is keyed by record type only, so +config_file+ is ignored once a
    # type has been cached. Not threadsafe.
    def self.for(record_type, config_file = nil)
      @all_formats[record_type.to_sym] ||= FileFormat.new(record_type.to_sym, config_file)
    end

    # @param record_type [String, Symbol] key of the record type inside the configuration
    # @param config_file [String] path of the YAML configuration file
    # @raise [ArgumentError] when no config file is given or the record type is not configured
    def initialize(record_type, config_file = nil)
      raise ArgumentError, 'config_file is required' if config_file.nil?

      @record_type = record_type.to_sym
      @fields = {}
      @unique_fields = []
      @parser_field_list = []
      @file_settings = {}
      @column = 1
      @config_file = config_file
      @raw_config = symbolize(load_yaml(File.read(@config_file)))

      load_config(@record_type)
      find_unique_fields
    end

    # Returns the validation list of +field_name+, or nil for an unknown field.
    def field_validations(field_name)
      fields[field_name]&.fetch(:validations)
    end

    def create_file_reader(data_file_path)
      FixedWidthFileValidator::FileReader.new(data_file_path, record_parser, file_settings)
    end

    def create_record_validator
      FixedWidthFileValidator::RecordValidator.new fields
    end

    def create_record_validator_with_reader(data_file_path)
      # in this scenario the reader will read through the file to find unique values
      # so we need to create a new reader and close it when done
      reader = create_file_reader(data_file_path)
      FixedWidthFileValidator::RecordValidator.new fields, unique_fields, reader
    ensure
      # reader is still nil when create_file_reader itself raised; do not mask that error
      reader&.close
    end

    private

    # Parses the YAML text with aliases enabled, using whichever safe_load
    # calling convention the installed Psych understands.
    def load_yaml(text)
      keyword_api = YAML.method(:safe_load).parameters.include?(%i[key aliases])
      keyword_api ? YAML.safe_load(text, aliases: true) : YAML.safe_load(text, [], [], true)
    end

    def record_parser
      # rubocop:disable Naming/MemoizedInstanceVariableName
      @parser ||= RecordParser.new(@parser_field_list, file_settings[:encoding])
      # rubocop:enable Naming/MemoizedInstanceVariableName
    end

    # Builds @fields, the parser field list and the file settings for +record_type+.
    def load_config(record_type)
      format_config = config_for(record_type)
      format_config[:fields].each do |raw_field|
        field_config = parse_field_config(raw_field)
        # entries without a positive width cannot be positioned and are skipped
        next unless field_config

        key = field_config[:field_name].to_sym
        @fields[key] = field_config
        @parser_field_list << parser_params(key)
        # the next field without an explicit :starts_at begins right after this one
        @column = field_config[:end_column] + 1
      end
      @file_settings = { skip_top_lines: format_config[:skip_top_lines] || 0,
                         skip_bottom_lines: format_config[:skip_bottom_lines] || 0,
                         encoding: format_config[:encoding] }
    end

    # Normalizes one configured field; returns nil unless it has a positive width.
    def parse_field_config(field_config)
      width = field_config[:width]
      return unless width&.positive?

      start_column = field_config[:starts_at] || @column
      validations = field_config[:validate]
      # a missing :validate means "no rules", not a single nil rule
      validations = validations.nil? ? [] : [validations] unless validations.is_a?(Array)

      {
        field_name: field_config[:name] || "field_#{@column}",
        start_column: start_column,
        end_column: start_column + width - 1,
        validations: validations
      }
    end

    # Collects the names of all fields that carry the 'unique' validation.
    def find_unique_fields
      fields.each do |field_name, field_rule|
        unique_fields << field_name if field_rule[:validations].include?('unique')
      end
    end

    def parser_params(field_name)
      f = fields[field_name]
      # in config file column starts with 1 but when parsing line begins at 0
      { name: field_name, position: (f[:start_column] - 1..f[:end_column] - 1) }
    end

    # Returns the configuration of +record_type+ with the fields of its :inherit_from
    # parent appended, skipping parent fields whose name the record type already defines.
    def config_for(record_type)
      format_config = @raw_config.is_a?(Hash) ? @raw_config[record_type] : nil
      raise ArgumentError, "record type #{record_type} not found in #{@config_file}" unless format_config

      format_fields = (format_config[:fields] ||= [])
      inherit_format = format_config[:inherit_from]&.to_sym

      if inherit_format
        inherit_config = @raw_config[inherit_format]
        inherit_fields = (inherit_config && inherit_config[:fields]) || []

        inherit_fields.each do |field|
          format_fields << field if format_fields.none? { |f| f[:name] == field[:name] }
        end
      end

      format_config
    end

    # Deep-copies +obj+, converting hash keys to symbols where possible.
    def symbolize(obj)
      case obj
      when Hash
        obj.each_with_object({}) do |(k, v), memo|
          memo[k.respond_to?(:to_sym) ? k.to_sym : k] = symbolize(v)
        end
      when Array
        obj.map { |v| symbolize(v) }
      else
        obj
      end
    end
  end
end
@@ -0,0 +1,100 @@
1
module FixedWidthFileValidator
  # Reads a data file line by line, skipping a configurable number of lines at the
  # top and at the bottom, and optionally turns each line into a record via a parser.
  #
  # The file is opened lazily on the first read. #close (also called at the end of
  # #each_record) rewinds all reading state, so the same reader can be iterated again.
  class FileReader
    # @param file_name [String] path of the data file
    # @param parser [#parse, nil] object responding to parse(line, line_num, raw_line);
    #   without one, records are the stripped lines themselves
    # @param settings [Hash] optional :skip_top_lines and :skip_bottom_lines counts
    def initialize(file_name, parser = nil, settings = {})
      @skip_top_lines = settings[:skip_top_lines] || 0
      @skip_bottom_lines = settings[:skip_bottom_lines] || 0
      @data_file_path = file_name
      @line_num = 0
      @buffer = []
      @parser = parser
      @skip_top_done = false
    end

    # Returns the next record, or nil once the readable part of the file is exhausted.
    def next_record
      line_num, content = readline_with_skip
      return unless line_num

      @parser ? @parser.parse(content, line_num, content) : content.strip
    end

    # Yields every record in file order and closes the file afterwards,
    # even when the block raises.
    def each_record
      record = next_record
      until record.nil?
        yield record
        record = next_record
      end
    ensure
      close
    end

    # Closes the file and resets line counter, look-ahead buffer and top-skip flag,
    # so that the next read starts over from the first line of the file.
    def close
      @file&.close
    ensure
      @file = nil
      @line_num = 0
      @buffer = []
      @skip_top_done = false
    end

    # Reads the whole file and returns, per field, the values that occur more than once.
    #
    # @param field_list [Array<Symbol>, nil] fields to check
    # @return [Hash{Symbol => Hash}, nil] field name => { value => [line numbers] },
    #   or nil when +field_list+ is nil or empty
    def find_non_unique_values(field_list = [])
      return if field_list.nil? || field_list.empty?

      lookup_hash = build_unique_value_lookup_hash(field_list)

      field_list.each_with_object({}) do |field_name, result|
        # a file without records leaves no entry for the field at all
        result[field_name] = (lookup_hash[field_name] || {}).select { |_value, lines| lines.count > 1 }
      end
    end

    private

    def readline_with_skip
      @file ||= File.open(@data_file_path, 'r')
      skip_top
      skip_bottom
      readline
    end

    # Returns the oldest buffered [line_num, text] pair after reading one more line.
    # Once the file is at EOF the lines still buffered are the bottom lines to skip,
    # so nil is returned instead.
    def readline
      return nil if @file.eof?

      buffer_line
      @buffer.shift
    end

    def next_line
      return nil if @file.eof?

      @line_num += 1
      [@line_num, @file.readline]
    end

    def buffer_line
      return nil if @file.eof?

      @buffer << next_line
    end

    def skip_top
      return if @skip_top_done

      @skip_top_lines.times { next_line }
      @skip_top_done = true
    end

    # Pre-fills the look-ahead buffer with as many lines as must be withheld at the bottom.
    def skip_bottom
      @skip_bottom_lines.times { buffer_line } if @buffer.empty?
    end

    # Maps field name => { value => [line numbers where the value occurs] }.
    def build_unique_value_lookup_hash(field_list)
      tmp_store = {}

      each_record do |record|
        field_list.each do |field_name|
          tmp_store[field_name] ||= {}
          tmp_store[field_name][record[field_name]] ||= []
          # @line_num runs ahead of the record by the look-ahead buffer size,
          # so take the line number stored on the record itself
          tmp_store[field_name][record[field_name]] << record[:_line_num]
        end
      end

      tmp_store
    end
  end
end
@@ -0,0 +1,19 @@
1
module FixedWidthFileValidator
  # Cuts a line of a fixed width file into named field values.
  class RecordParser
    attr_reader :field_list, :encoding

    # @param field_list [Array<Hash>] entries with :name and :position (a Range of
    #   zero-based character offsets)
    # @param encoding [String, nil] target encoding; 'ISO-8859-1' when nil
    def initialize(field_list, encoding)
      @field_list = field_list
      @encoding = encoding || 'ISO-8859-1'
    end

    # Builds a record hash holding :_line_num, :_raw and one stripped value per field.
    # A field whose position lies outside the line gets nil.
    def parse(line, line_num, raw_line)
      # the line is transcoded from 'binary'; invalid or undefined bytes are replaced
      text = line.encode(@encoding, 'binary', invalid: :replace, undef: :replace)

      field_list.each_with_object({ _line_num: line_num, _raw: raw_line }) do |field, record|
        record[field[:name].to_sym] = text[field[:position]]&.strip
      end
    end
  end
end
@@ -0,0 +1,36 @@
1
+ require 'fixed_width_file_validator/validator'
2
+
3
module FixedWidthFileValidator
  # Writes validation errors as plain text: the offending line prefixed with its
  # zero-padded line number, a row of carets under the failed field and a message.
  class TextReportFormatter
    attr_accessor :line_no_width

    # @param args [Hash] optional :line_no_width, the number of digits line numbers
    #   are padded to (default 5)
    def initialize(args = {})
      @line_no_width = args[:line_no_width] || 5
    end

    # Writes one field validation error followed by a blank line.
    #
    # @param err [#line_num, #pos, #width, #raw, #failed_field, #failed_validation, nil]
    # @param file [#puts] output stream
    def write(err, file = $stderr)
      return if err.nil?

      line_prefix = prefix_for(err)
      indent = ' ' * (err.pos - 1)
      # chomp, not chop: a last line without trailing newline must keep its final character
      file.puts line_prefix + err.raw.chomp
      file.puts line_prefix + indent + '^' * err.width
      file.puts line_prefix + indent + "field #{err.failed_field} does not satisfy #{err.failed_validation}"
      file.puts
    end

    # Writes a group of uniqueness errors for the same field: every offending line
    # with its own line number, then one marker and one message for the group.
    def write_unique_errors(errors, file = $stderr)
      return if errors.nil? || errors.empty?

      err = errors.first
      indent = ' ' * (err.pos - 1)
      line_prefix = prefix_for(err)

      errors.each { |e| file.puts prefix_for(e) + e.raw.chomp }
      file.puts line_prefix + indent + '^' * err.width
      file.puts line_prefix + indent + "field #{err.failed_field} is not #{err.failed_validation}"
    end

    private

    # Zero-padded line number of +err+ followed by a colon.
    def prefix_for(err)
      format("%0#{line_no_width}i:", err.line_num)
    end
  end
end
@@ -0,0 +1,76 @@
1
+ require 'date'
2
+ require 'time'
3
+
4
module FixedWidthFileValidator
  # Validation predicates that can be named in a field's validation list.
  # The module is mixed into String below, so every method runs with the field
  # value as +self+ and returns a truthy value when the value is acceptable.
  module StringHelper
    # Accepts every value.
    def any
      true
    end

    # True when the value is empty or whitespace only.
    def blank
      strip.empty?
    end

    def not_blank
      !blank
    end

    # True when the value is exactly +num+ characters long.
    def width(num)
      size == num
    end

    # Returns the parsed Time when the value matches +format+, false otherwise.
    def date_time(format = '%Y%m%d%H%M%S')
      Time.strptime(self, format)
    rescue ArgumentError
      false
    end

    # Truthy when the value is a valid calendar date in +format+, false otherwise.
    def date(format = '%Y%m%d')
      # since exception is slow, optimize for known format here
      if format == '%Y%m%d'
        # to_i silently ignores trailing garbage ("2x" => 2), so insist on eight digits
        return false unless /\A\d{8}\z/ =~ self

        Date.valid_date?(slice(0..3).to_i, slice(4..5).to_i, slice(6..7).to_i)
      else
        Date.strptime(self, format)
      end
    rescue ArgumentError
      false
    end

    # True when the value is a valid time of day written as six digits (HHMMSS).
    def time
      # digits only: to_i would otherwise turn "5x" into 5 and accept it
      return false unless /\A\d{6}\z/ =~ self

      self[0..1].to_i < 24 && self[2..3].to_i < 60 && self[4..5].to_i < 60
    end

    def time_or_blank
      blank || time
    end

    def date_or_blank(format = '%Y%m%d')
      blank || date(format)
    end

    # True when the leading integer of the value is greater than zero.
    def positive
      to_i.positive?
    end

    # Truthy when the value is an unsigned decimal number whose integer part has
    # between +min+ and +max+ digits and whose fraction has exactly +precision+ digits.
    def numeric(max = 32, precision = 0, min = 1)
      # \A and \z anchor the whole string; ^ and $ would accept one matching line
      # inside a multi-line value
      m = /\A(\d*)\.?(\d*)\z/.match(self)
      m && m[1] && (min..max).cover?(m[1].size) && m[2].size == precision
    end

    def numeric_or_blank(max = 32, precision = 0, min = 1)
      blank || numeric(max, precision, min)
    end
  end
end

# Makes the validation predicates callable on every String.
class String
  include FixedWidthFileValidator::StringHelper
end
@@ -0,0 +1,126 @@
1
+ require 'ripper'
2
+
3
module FixedWidthFileValidator
  # Describes one failed validation: which rule failed for which field of which record,
  # plus the column position and width of the field for reporting.
  class FieldValidationError
    attr_reader :raw, :record, :line_num, :failed_field, :failed_value, :failed_validation, :pos, :width

    def initialize(validation, record, field_name, pos, width)
      @raw = record[:_raw]
      @line_num = record[:_line_num]
      @record = record
      @failed_field = field_name
      @failed_validation = validation
      @failed_value = record[field_name]
      @pos = pos
      @width = width
    end
  end

  # rubocop:disable Style/ClassVars
  # Applies the validation rules of a single field to a record.
  #
  # A rule is either an Array of allowed values or a String, interpreted as:
  # * 'unique'             - the value must not be among #non_unique_values
  # * a method name        - a public method of the value, called without arguments
  # * '^' plus Ruby code   - evaluated with the value as self
  # * code starting with a method of the value - evaluated the same way
  # * anything else        - a literal the value must equal
  class FieldValidator
    attr_accessor :field_name, :non_unique_values, :validations, :pos, :width

    # first token of each rule string, shared by all validators
    @@token_cache = {}

    def initialize(field_name, pos, width, validations = nil)
      self.field_name = field_name
      self.non_unique_values = []
      self.validations = validations
      self.pos = pos
      self.width = width
    end

    # return an array of error objects
    # empty array if all validation passes
    def validate(record, field_name, bindings = {})
      if validations
        validations.reject { |validation| valid_value?(validation, record, field_name, bindings) }
                   .map { |validation| FieldValidationError.new(validation, record, field_name, pos, width) }
      elsif record && record[field_name]
        # when no validation rules exist for the field, just check if the field exists in the record
        []
      else
        raise "found field value nil in #{record} field #{field_name}, shouldn't be possible?"
      end
    end

    private

    # Truthy when the field value satisfies +validation+; a nil value never does.
    def valid_value?(validation, record, field_name, bindings)
      value = record[field_name]
      return false if value.nil?

      case validation
      when String then valid_string_rule?(validation, value, record, bindings)
      when Array then validation.include?(value)
      else raise "Unknown validation #{validation.inspect} for field #{field_name}"
      end
    end

    def valid_string_rule?(validation, value, record, bindings)
      return !non_unique_values.include?(value) if validation == 'unique'

      keyword = keyword_for(validation)
      if keyword.nil?
        # an empty rule has no tokens; treat it as a literal
        value == validation
      elsif validation == keyword && value.respond_to?(keyword)
        # this scenario can be handled by instance_eval too
        # we do this as an optimization since it is much faster
        value.public_send(validation)
      elsif keyword == '^' || value.respond_to?(keyword)
        # NOTE(security): the rule text is evaluated as Ruby code, so the
        # configuration file must come from a trusted source.
        expression = keyword == '^' ? validation[1..-1] : validation
        value.instance_eval("lambda { |r, _g| #{expression} }").call(record, bindings)
      else
        value == validation
      end
    end

    # First Ruby token of +validation+, or nil for an empty string.
    def keyword_for(validation)
      @@token_cache[validation] ||= Ripper.tokenize(validation).first
    end
  end
  # rubocop:enable Style/ClassVars

  # Validates whole records by running one FieldValidator per configured field.
  class RecordValidator
    attr_accessor :bindings

    # @param fields [Hash] field name => { start_column:, end_column:, validations: }
    # @param unique_field_list [Array<Symbol>, nil] fields carrying the 'unique' rule
    # @param reader [#find_non_unique_values, nil] used to collect duplicate values up
    #   front; without a reader no value is considered a duplicate
    def initialize(fields, unique_field_list = nil, reader = nil)
      @field_validators = {}
      @bindings = {}

      unique_field_list ||= []
      # the reader returns nil when there is nothing to look up
      non_unique_values = reader&.find_non_unique_values(unique_field_list) || {}

      fields.each do |field_name, conf|
        pos = conf[:start_column]
        width = conf[:end_column] - pos + 1
        validator = FieldValidator.new(field_name, pos, width, conf[:validations])
        validator.non_unique_values = non_unique_values[field_name] || {} if unique_field_list.include?(field_name)
        @field_validators[field_name] = validator
      end
    end

    # Returns a flat array with the errors of all fields of +record+ (empty when valid).
    def validate(record)
      @field_validators.flat_map do |field, validator|
        validator.validate(record, field, @bindings)
      end
    end

    # Yields every error of every record delivered by +file_reader+.each_record.
    def each_error(file_reader)
      file_reader.each_record do |record|
        validate(record).each { |err| yield err }
      end
    end

    # Returns all errors found in the file as an array.
    def find_all_errors(file_reader)
      errors = []
      each_error(file_reader) { |err| errors << err }
      errors
    end
  end
end
@@ -0,0 +1,3 @@
1
module FixedWidthFileValidator
  # Version of the gem, as a frozen string.
  VERSION = '0.3.0'.freeze
end
metadata ADDED
@@ -0,0 +1,95 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fixed_width_file_validator
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.3.0
5
+ platform: ruby
6
+ authors:
7
+ - Li Lin
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2018-03-22 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - "~>"
17
+ - !ruby/object:Gem::Version
18
+ version: '1.16'
19
+ name: bundler
20
+ prerelease: false
21
+ type: :development
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.16'
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - "~>"
31
+ - !ruby/object:Gem::Version
32
+ version: '5.0'
33
+ name: minitest
34
+ prerelease: false
35
+ type: :development
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '5.0'
41
+ - !ruby/object:Gem::Dependency
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '10.0'
47
+ name: rake
48
+ prerelease: false
49
+ type: :development
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '10.0'
55
+ description: validate fixed width text file base on configuration
56
+ email:
57
+ - guru.lin@gmail.com
58
+ executables: []
59
+ extensions: []
60
+ extra_rdoc_files: []
61
+ files:
62
+ - README.md
63
+ - lib/fixed_width_file_validator.rb
64
+ - lib/fixed_width_file_validator/file_format.rb
65
+ - lib/fixed_width_file_validator/file_reader.rb
66
+ - lib/fixed_width_file_validator/record_parser.rb
67
+ - lib/fixed_width_file_validator/report_formatter.rb
68
+ - lib/fixed_width_file_validator/string_helper.rb
69
+ - lib/fixed_width_file_validator/validator.rb
70
+ - lib/fixed_width_file_validator/version.rb
71
+ homepage: https://github.com/sloppycoder/fixed_width_file_validator.git
72
+ licenses:
73
+ - MIT
74
+ metadata: {}
75
+ post_install_message:
76
+ rdoc_options: []
77
+ require_paths:
78
+ - lib
79
+ required_ruby_version: !ruby/object:Gem::Requirement
80
+ requirements:
81
+ - - ">="
82
+ - !ruby/object:Gem::Version
83
+ version: '0'
84
+ required_rubygems_version: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ requirements: []
90
+ rubyforge_project:
91
+ rubygems_version: 2.6.14.1
92
+ signing_key:
93
+ specification_version: 4
94
+ summary: validate fixed width text file base on configuration
95
+ test_files: []