ndr_import 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +14 -0
- data/.rubocop.yml +27 -0
- data/.ruby-version +1 -0
- data/.travis.yml +22 -0
- data/CODE_OF_CONDUCT.md +13 -0
- data/Gemfile +4 -0
- data/Guardfile +16 -0
- data/LICENSE.txt +21 -0
- data/README.md +69 -0
- data/Rakefile +13 -0
- data/code_safety.yml +374 -0
- data/gemfiles/Gemfile.rails32 +5 -0
- data/gemfiles/Gemfile.rails32.lock +142 -0
- data/gemfiles/Gemfile.rails41 +5 -0
- data/gemfiles/Gemfile.rails41.lock +145 -0
- data/gemfiles/Gemfile.rails42 +5 -0
- data/gemfiles/Gemfile.rails42.lock +145 -0
- data/lib/ndr_import.rb +13 -0
- data/lib/ndr_import/csv_library.rb +40 -0
- data/lib/ndr_import/file/all.rb +8 -0
- data/lib/ndr_import/file/base.rb +76 -0
- data/lib/ndr_import/file/delimited.rb +86 -0
- data/lib/ndr_import/file/excel.rb +131 -0
- data/lib/ndr_import/file/pdf.rb +38 -0
- data/lib/ndr_import/file/registry.rb +50 -0
- data/lib/ndr_import/file/text.rb +52 -0
- data/lib/ndr_import/file/word.rb +30 -0
- data/lib/ndr_import/file/zip.rb +67 -0
- data/lib/ndr_import/helpers/file/delimited.rb +105 -0
- data/lib/ndr_import/helpers/file/excel.rb +181 -0
- data/lib/ndr_import/helpers/file/pdf.rb +29 -0
- data/lib/ndr_import/helpers/file/word.rb +27 -0
- data/lib/ndr_import/helpers/file/xml.rb +45 -0
- data/lib/ndr_import/helpers/file/zip.rb +44 -0
- data/lib/ndr_import/mapper.rb +220 -0
- data/lib/ndr_import/mapping_error.rb +5 -0
- data/lib/ndr_import/non_tabular/column_mapping.rb +73 -0
- data/lib/ndr_import/non_tabular/line.rb +46 -0
- data/lib/ndr_import/non_tabular/mapping.rb +35 -0
- data/lib/ndr_import/non_tabular/record.rb +99 -0
- data/lib/ndr_import/non_tabular/table.rb +193 -0
- data/lib/ndr_import/non_tabular_file_helper.rb +160 -0
- data/lib/ndr_import/standard_mappings.rb +23 -0
- data/lib/ndr_import/table.rb +179 -0
- data/lib/ndr_import/version.rb +4 -0
- data/ndr_import.gemspec +44 -0
- data/test/file/base_test.rb +54 -0
- data/test/file/delimited_test.rb +143 -0
- data/test/file/excel_test.rb +85 -0
- data/test/file/pdf_test.rb +35 -0
- data/test/file/registry_test.rb +60 -0
- data/test/file/text_test.rb +92 -0
- data/test/file/word_test.rb +35 -0
- data/test/file/zip_test.rb +47 -0
- data/test/helpers/file/delimited_test.rb +113 -0
- data/test/helpers/file/excel_test.rb +97 -0
- data/test/helpers/file/pdf_test.rb +26 -0
- data/test/helpers/file/word_test.rb +26 -0
- data/test/helpers/file/xml_test.rb +131 -0
- data/test/helpers/file/zip_test.rb +75 -0
- data/test/mapper_test.rb +551 -0
- data/test/non_tabular/mapping_test.rb +36 -0
- data/test/non_tabular/table_test.rb +510 -0
- data/test/non_tabular_file_helper_test.rb +501 -0
- data/test/readme_test.rb +53 -0
- data/test/resources/bomd.csv +3 -0
- data/test/resources/broken.csv +3 -0
- data/test/resources/filesystem_paths.yml +26 -0
- data/test/resources/flat_file.pdf +0 -0
- data/test/resources/flat_file.txt +27 -0
- data/test/resources/flat_file.yml +20 -0
- data/test/resources/hello_utf16be.txt +0 -0
- data/test/resources/hello_utf16le.txt +0 -0
- data/test/resources/hello_utf8.txt +2 -0
- data/test/resources/hello_windows.txt +2 -0
- data/test/resources/hello_world.doc +0 -0
- data/test/resources/hello_world.pdf +0 -0
- data/test/resources/hello_world.txt +2 -0
- data/test/resources/high_ascii_delimited.txt +2 -0
- data/test/resources/malformed.xml +6 -0
- data/test/resources/normal.csv +3 -0
- data/test/resources/normal.csv.zip +0 -0
- data/test/resources/normal_pipe.csv +3 -0
- data/test/resources/normal_thorn.csv +3 -0
- data/test/resources/not_a_pdf.pdf +0 -0
- data/test/resources/not_a_word_file.doc +0 -0
- data/test/resources/sample_xls.xls +0 -0
- data/test/resources/sample_xlsx.xlsx +0 -0
- data/test/resources/standard_mappings.yml +39 -0
- data/test/resources/txt_file_xls_extension.xls +1 -0
- data/test/resources/txt_file_xlsx_extension.xlsx +1 -0
- data/test/resources/utf-16be_xml.xml +0 -0
- data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
- data/test/resources/utf-16le_xml.xml +0 -0
- data/test/resources/utf-8_xml.xml +9 -0
- data/test/resources/windows-1252_xml.xml +9 -0
- data/test/resources/windows.csv +5 -0
- data/test/resources/xlsx_file_xls_extension.xls +0 -0
- data/test/standard_mappings_test.rb +22 -0
- data/test/table_test.rb +288 -0
- data/test/test_helper.rb +13 -0
- metadata +443 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
require 'ndr_import/non_tabular/table'
|
|
3
|
+
|
|
4
|
+
module NdrImport
|
|
5
|
+
module NonTabular
|
|
6
|
+
# This class stores the mapping used to break an incoming file into multiple rows/records
|
|
7
|
+
class Mapping < Table
  # The options accepted by the parent class, plus 'non_tabular_row'.
  def self.all_valid_options
    super + ['non_tabular_row']
  end

  # options - the mapping hash; it must hold a truthy 'non_tabular_row' entry.
  #
  # Fails with NdrImport::MappingError when 'non_tabular_row' is absent.
  def initialize(options)
    row_options = options['non_tabular_row']
    unless row_options
      # validate presence of non_tabular_row
      fail NdrImport::MappingError,
           I18n.t('mapping.errors.missing_non_tabular_row')
    end

    # The row options are copied onto this instance before the parent initializer runs.
    initialize_non_tabular_mappings(row_options)
    super(options)
  end

  private

  # Sets an instance variable for each NON_TABULAR_OPTIONS key that has a
  # truthy value in +non_tabular_mappings+; other keys are left untouched.
  def initialize_non_tabular_mappings(non_tabular_mappings)
    NON_TABULAR_OPTIONS.each do |option|
      value = non_tabular_mappings[option]
      instance_variable_set("@#{option}", value) if value
    end
  end
end
|
|
34
|
+
end
|
|
35
|
+
end
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
|
|
3
|
+
module NdrImport
|
|
4
|
+
module NonTabular
|
|
5
|
+
# This class behaves like an array of NdrImport::NonTabular::Line elements
|
|
6
|
+
# that contains all the source lines of text that relate to a single record of data.
|
|
7
|
+
# It also encapsulates the logic that tabulates the data.
|
|
8
|
+
class Record
  attr_reader :lines

  def initialize
    @lines = []
  end

  # Adds +line+ to the record, flagging it as being in a record and stamping it
  # with its position within the record. Lines flagged as removed are ignored.
  def <<(line)
    unless line.removed
      line.in_a_record = true
      line.record_line_number = @lines.size
      @lines.push(line)
    end
  end

  # True when no lines have been collected.
  def empty?
    lines.empty?
  end

  # Call this if it turns out that the collected lines do not form a record.
  # Every collected line is flagged as not being in a record.
  def not_a_record!
    @lines.each do |line|
      line.in_a_record = false
    end
  end

  # Builds the array of "cells" for the collected lines, which together represent
  # a single "row" of data, so that the output can be mapped by the standard mapper.
  # One cell is produced per column mapping: the non-blank captured values joined
  # with the mapping's join string (or '' when it has none). A column whose capture
  # raises RegexpRange::PatternMatchError yields a nil cell.
  #
  # ==== Signature
  #
  #   tabulate(mappings)
  #
  # ==== Examples
  #
  # If the YAML mapping is
  #
  #   ---
  #   - standard_mapping: nhsnumber
  #     non_tabular_cell:
  #       lines: 0
  #       capture:
  #       - !ruby/regexp /^D\|([^|]*).*/
  #   - column: fulltextreport
  #     non_tabular_cell:
  #       lines: !ruby/range
  #         begin: 1
  #         end: -1
  #         excl: false
  #       capture: !ruby/regexp /^(?:R|\d+)\|(.*)$/i
  #       join: \n
  #
  #   lines = [
  #     "D|1111111111|...",
  #     "R|This is a",
  #     "1|multiline report"
  #   ]
  #
  #   tabulate(mappings)
  #
  #   # =>
  #   [
  #     "1111111111",
  #     "This is a\nmultiline report"
  #   ]
  #
  def tabulate(mappings)
    mappings.map do |column_mapping|
      begin
        # Keep only the non-blank captures and join them into a single cell
        captured = get_matches(column_mapping).reject(&:blank?)
        captured.join(column_mapping.join || '')
      rescue RegexpRange::PatternMatchError
        nil
      end
    end
  end

  # Returns the array of values captured from the line (or lines) that the
  # column mapping selects, recording on each line that it was captured for,
  # and what it matched for, this column.
  def get_matches(column_mapping)
    selection = column_mapping.matching_lines(@lines)
    # The selection may pick out one line or several; treat both uniformly
    Array(@lines[selection]).map do |line|
      line.captured_for(column_mapping.name)
      column_mapping.capture_value(line).tap do |value|
        line.matches_for(column_mapping.name, value)
      end
    end
  end
end
|
|
98
|
+
end
|
|
99
|
+
end
|
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
require 'ndr_import/table'
|
|
2
|
+
|
|
3
|
+
module NdrImport
|
|
4
|
+
module NonTabular
|
|
5
|
+
# This class maintains the state of a non tabular table mapping and encapsulates
|
|
6
|
+
# the logic required to transform a table of data into "records". Particular
|
|
7
|
+
# attention has been paid to using enumerables throughout to help with the
|
|
8
|
+
# transformation of large quantities of data.
|
|
9
|
+
class Table < ::NdrImport::Table
  require 'i18n'
  require 'ndr_support/regexp_range' # TODO: unneeded?
  require 'ndr_support/utf8_encoding'
  require 'ndr_import/non_tabular/column_mapping'
  require 'ndr_import/non_tabular/record'
  require 'ndr_import/non_tabular/line'

  include UTF8Encoding

  NON_TABULAR_OPTIONS = %w(capture_start_line start_line_pattern end_line_pattern remove_lines
                           start_in_a_record end_in_a_record)

  # The parent's options without tablename_pattern, header_lines and footer_lines,
  # plus the non-tabular specific options.
  def self.all_valid_options
    super - %w(tablename_pattern header_lines footer_lines) + NON_TABULAR_OPTIONS
  end

  attr_reader(*NON_TABULAR_OPTIONS)
  attr_reader :non_tabular_lines

  # Always 0 for a non-tabular table.
  def header_lines
    0
  end

  # Always 0 for a non-tabular table.
  def footer_lines
    0
  end

  # Builds the table from +options+ and then checks that a start_line_pattern
  # was supplied, failing with NdrImport::MappingError otherwise.
  def initialize(options = {})
    super(options)

    validate_presence_of_start_line_pattern
  end

  # tablename_pattern may not be set on a non-tabular table.
  def tablename_pattern=(_value)
    fail NdrImport::MappingError, 'Should not define tablename_pattern'
  end

  # Fails with NdrImport::MappingError unless @start_line_pattern is set.
  def validate_presence_of_start_line_pattern
    fail NdrImport::MappingError,
         I18n.t('mapping.errors.missing_start_line_pattern') unless @start_line_pattern
  end

  # This method transforms a table of data, given a line array/enumerator and yields
  # klass, fields and index (input row number) for each record that it would create
  # as a result of the transformation process. Without a block an enumerator is returned.
  def transform(lines, &block)
    return enum_for(:transform, lines) if block.nil?

    self.non_tabular_lines = ensure_utf8_enum!(lines)
    remove_unwanted_lines

    # The lines are tabulated first; the parent then transforms the tabulated rows
    super(read_non_tabular_array, &block)
  end

  # The header is unconditionally flagged as valid; both arguments are ignored.
  def validate_header(_line, _column_mappings)
    @header_valid = true
  end

  protected

  # Yields each of +lines+ after passing it through ensure_utf8_object!.
  # Returns an enumerator when no block is given.
  def ensure_utf8_enum!(lines)
    return enum_for(:ensure_utf8_enum!, lines) unless block_given?

    lines.each { |line| yield ensure_utf8_object!(line) }
  end

  # This method flags unwanted lines, typically page headers and footers, as removed,
  # preventing them from being captured in the non tabular record. Especially useful
  # when there are page headers and footers that are out of step with the start and end
  # of each record and could therefore appear anywhere in an individual record if kept.
  # Does nothing unless @remove_lines is a Hash.
  def remove_unwanted_lines
    return unless @remove_lines.is_a?(Hash)

    @non_tabular_lines.each_with_index do |_line, offset|
      @remove_lines.each_value do |lines_to_remove|
        # Compare the run of lines starting at this offset with the unwanted lines
        candidates = @non_tabular_lines[offset, lines_to_remove.length]
        if lines_equal(candidates, lines_to_remove)
          # All lines are equal, so flag them as removed
          candidates.each { |line| line.removed = true }
        end
      end
    end
  end

  # Resets the tabulation state, partitions the stored lines into records and
  # returns the resulting array of tabulated rows.
  def read_non_tabular_array
    @tabular_array      = []
    @in_a_record        = @start_in_a_record
    @non_tabular_record = NdrImport::NonTabular::Record.new

    partition_and_process_non_tabular_lines
    process_end_of_record

    @tabular_array
  end

  # Reads the array of lines, looking to see if a line matches the start_line_pattern,
  # identifying the start of a record. It then collects all the lines until a line
  # matches the end_line_pattern (if defined, otherwise when it matches the next
  # start_line_pattern) and sends these lines to NdrImport::NonTabular::Record#tabulate.
  #
  # NOTE: Currently the end line is consumed and does not form part of the
  # collected array.
  def partition_and_process_non_tabular_lines
    non_tabular_lines.each do |line|
      if line =~ @start_line_pattern
        # This is a start line
        start_record(line)
      elsif line =~ @end_line_pattern
        # This is an end line
        end_record
      elsif @in_a_record
        @non_tabular_record << line
      end
    end
  end

  # Checks to see if we get the start of a new record before getting the end of the previous
  # one and fails if so. Otherwise it tabulates the previous record. A fresh record is then
  # started, which includes the start line itself when capture_start_line is set.
  def start_record(line)
    if @in_a_record
      if @end_line_pattern
        fail NdrImport::MappingError,
             I18n.t('mapping.errors.start_pattern_before_end')
      else
        # No endline mapping
        @tabular_array << @non_tabular_record.tabulate(column_mappings)
      end
    end

    @non_tabular_record = NdrImport::NonTabular::Record.new
    @non_tabular_record << line if @capture_start_line
    @in_a_record = true
  end

  # Tabulates the record (if in one), flags that we are no longer in a record
  # and sets the record to be a new one.
  def end_record
    if @in_a_record
      @tabular_array << @non_tabular_record.tabulate(column_mappings)
    end
    @in_a_record = false
    @non_tabular_record = NdrImport::NonTabular::Record.new
  end

  # If the non-tabular data ends in a record (i.e. the last record is terminated by the EOF)
  # then we need to process the last record manually or flag those lines as not being part
  # of a record
  def process_end_of_record
    return if @non_tabular_record.empty?

    if !@end_in_a_record
      @non_tabular_record.not_a_record!
    elsif @in_a_record
      @tabular_array << @non_tabular_record.tabulate(column_mappings)
    end
  end

  # Store the source lines as instances of NdrImport::NonTabular::Line,
  # each created with its index in the source.
  def non_tabular_lines=(lines)
    @non_tabular_lines = lines.each_with_index.map do |text, index|
      NdrImport::NonTabular::Line.new(text, index)
    end
  end

  # Create and memoize the column mappings
  def column_mappings
    @column_mappings ||= raw_column_mappings.map do |raw_mapping|
      NdrImport::NonTabular::ColumnMapping.new(raw_mapping)
    end
  end

  # The raw column definitions held in @columns, or an empty array when there are none.
  def raw_column_mappings
    @columns || []
  end

  # This method compares two arrays, where the first must be an array of
  # NdrImport::NonTabular::Line or string elements
  # and the second can be a mix of strings and/or regular expressions.
  # Arrays of differing length are never equal.
  def lines_equal(lines, other_lines)
    return false unless lines.length == other_lines.length

    lines.each_with_index.all? do |line, index|
      expected = other_lines[index]
      text = line.to_s
      expected.is_a?(Regexp) ? text =~ expected : text == expected
    end
  end
end
|
|
192
|
+
end
|
|
193
|
+
end
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
|
|
3
|
+
module NdrImport
|
|
4
|
+
# This mixin adds (multiline) non-tabular file functionality to unified importers.
|
|
5
|
+
# It provides a file reader method and method to capture the rawtext value
|
|
6
|
+
# appropriately. These methods can be overridden or aliased as required.
|
|
7
|
+
#
|
|
8
|
+
# The YAML mapping must define the start_line_pattern which identifies the start
|
|
9
|
+
# of a multiline record (or "row") and can optionally define an end_line_pattern.
|
|
10
|
+
module NonTabularFileHelper
  require 'i18n'
  require 'ndr_support/regexp_range' # TODO: unneeded?
  require 'ndr_support/utf8_encoding'
  require 'ndr_import/non_tabular/column_mapping'
  require 'ndr_import/non_tabular/record'
  require 'ndr_import/non_tabular/line'
  require 'ndr_import/non_tabular/mapping'

  include UTF8Encoding

  attr_reader :non_tabular_lines

  protected

  # Reads a non-tabular text file and returns an array of tabulated rows of data,
  # where each row is an array of cells.
  # NOTE(review): relies on the including class providing +filename+ and @mappings.
  def read_non_tabular_file
    self.non_tabular_lines = ensure_utf8_object! SafeFile.readlines(filename)
    remove_unwanted_lines
    read_non_tabular_array
  end

  # Reads a string and returns an array of tabulated data. Use only for prototyping.
  def read_non_tabular_string(text)
    self.non_tabular_lines = ensure_utf8_object!(text).split("\n")
    remove_unwanted_lines
    read_non_tabular_array
  end

  # This method flags unwanted lines, typically page headers and footers, as removed,
  # preventing them from being captured in the non tabular record. Especially useful
  # when there are page headers and footers that are out of step with the start and end
  # of each record and could therefore appear anywhere in an individual record if kept.
  # Does nothing unless the row mapping's remove_lines is a Hash.
  def remove_unwanted_lines
    return unless row_mapping.remove_lines.is_a?(Hash)

    @non_tabular_lines.each_with_index do |_line, i|
      row_mapping.remove_lines.each do |_key, lines_to_remove|
        comparable_lines = @non_tabular_lines[i, lines_to_remove.length]
        next unless lines_equal(comparable_lines, lines_to_remove)
        # All lines are equal, so flag them as removed
        comparable_lines.each { |line| line.removed = true }
      end
    end
  end

  # Resets the tabulation state, partitions the stored lines into records and
  # returns the resulting array of tabulated rows.
  def read_non_tabular_array
    @tabular_array = []
    @in_a_record = row_mapping.start_in_a_record
    @non_tabular_record = NdrImport::NonTabular::Record.new

    partition_and_process_non_tabular_lines
    process_end_of_record

    # We change the mapping instance variable to only contain the column mappings.
    # This enables the standard mapper to work unaltered.
    @mappings = raw_column_mappings
    @tabular_array
  end

  # Reads the array of lines, looking to see if a line matches the start_line_pattern,
  # identifying the start of a record. It then collects all the lines until a line
  # matches the end_line_pattern (if defined, otherwise when it matches the next
  # start_line_pattern) and sends these lines to NdrImport::NonTabular::Record#tabulate.
  #
  # NOTE: Currently the end line is consumed and does not form part of the
  # collected array.
  def partition_and_process_non_tabular_lines
    non_tabular_lines.each do |line|
      if line =~ row_mapping.start_line_pattern
        # This is a start line
        start_record(line)
      elsif line =~ row_mapping.end_line_pattern
        # This is an end line
        end_record
      else
        @non_tabular_record << line if @in_a_record
      end
    end
  end

  # Checks to see if we get the start of a new record before getting the end of the previous
  # one and fails if so. Otherwise it tabulates the previous record. A fresh record is then
  # started, which includes the start line itself when capture_start_line is set.
  def start_record(line)
    if row_mapping.end_line_pattern
      fail NdrImport::MappingError,
           I18n.t('mapping.errors.start_pattern_before_end') if @in_a_record
    else
      # No endline mapping
      @tabular_array << @non_tabular_record.tabulate(column_mappings) if @in_a_record
    end
    @non_tabular_record = NdrImport::NonTabular::Record.new
    @non_tabular_record << line if row_mapping.capture_start_line
    @in_a_record = true
  end

  # Tabulates the record (if in one), flags that we are no longer in a record
  # and sets the record to be a new one.
  def end_record
    @tabular_array << @non_tabular_record.tabulate(column_mappings) if @in_a_record
    @in_a_record = false
    @non_tabular_record = NdrImport::NonTabular::Record.new
  end

  # If the non-tabular data ends in a record (i.e. the last record is terminated by the EOF)
  # then we need to process the last record manually or flag those lines as not being part
  # of a record
  def process_end_of_record
    return if @non_tabular_record.empty?

    if row_mapping.end_in_a_record
      @tabular_array << @non_tabular_record.tabulate(column_mappings) if @in_a_record
    else
      @non_tabular_record.not_a_record!
    end
  end

  # Store the source lines as instances of NdrImport::NonTabular::Line,
  # each created with its index in the source.
  def non_tabular_lines=(lines)
    @non_tabular_lines = lines.map.with_index do |line, i|
      NdrImport::NonTabular::Line.new(line, i)
    end
  end

  # Create and memoize the row mappings
  def row_mapping
    @row_mapping ||= NdrImport::NonTabular::Mapping.new(@mappings)
  end

  # Create and memoize the column mappings
  def column_mappings
    @column_mappings ||= raw_column_mappings.map do |column_mapping|
      NdrImport::NonTabular::ColumnMapping.new(column_mapping)
    end
  end

  # Returns the raw column definitions, or an empty array when there are none.
  #
  # read_non_tabular_array replaces @mappings with this array once a read has
  # completed, so on a later read @mappings is already the column array. It is
  # returned as-is in that case, rather than being indexed with a String key
  # (which raises TypeError on an Array).
  def raw_column_mappings
    return @mappings if @mappings.is_a?(Array)

    @mappings['columns'] || []
  end

  # This method compares two arrays, where the first must be an array of
  # NdrImport::NonTabular::Line or string elements
  # and the second can be a mix of strings and/or regular expressions.
  # Arrays of differing length are never equal; comparison stops at the
  # first pair that does not match.
  def lines_equal(lines, other_lines)
    return false unless lines.length == other_lines.length

    lines.each_with_index.all? do |line, i|
      other_line = other_lines[i]
      other_line.is_a?(Regexp) ? line.to_s =~ other_line : line.to_s == other_line
    end
  end
end
|
|
160
|
+
end
|