ndr_import 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.gitignore +14 -0
- data/.rubocop.yml +27 -0
- data/.ruby-version +1 -0
- data/.travis.yml +22 -0
- data/CODE_OF_CONDUCT.md +13 -0
- data/Gemfile +4 -0
- data/Guardfile +16 -0
- data/LICENSE.txt +21 -0
- data/README.md +69 -0
- data/Rakefile +13 -0
- data/code_safety.yml +374 -0
- data/gemfiles/Gemfile.rails32 +5 -0
- data/gemfiles/Gemfile.rails32.lock +142 -0
- data/gemfiles/Gemfile.rails41 +5 -0
- data/gemfiles/Gemfile.rails41.lock +145 -0
- data/gemfiles/Gemfile.rails42 +5 -0
- data/gemfiles/Gemfile.rails42.lock +145 -0
- data/lib/ndr_import.rb +13 -0
- data/lib/ndr_import/csv_library.rb +40 -0
- data/lib/ndr_import/file/all.rb +8 -0
- data/lib/ndr_import/file/base.rb +76 -0
- data/lib/ndr_import/file/delimited.rb +86 -0
- data/lib/ndr_import/file/excel.rb +131 -0
- data/lib/ndr_import/file/pdf.rb +38 -0
- data/lib/ndr_import/file/registry.rb +50 -0
- data/lib/ndr_import/file/text.rb +52 -0
- data/lib/ndr_import/file/word.rb +30 -0
- data/lib/ndr_import/file/zip.rb +67 -0
- data/lib/ndr_import/helpers/file/delimited.rb +105 -0
- data/lib/ndr_import/helpers/file/excel.rb +181 -0
- data/lib/ndr_import/helpers/file/pdf.rb +29 -0
- data/lib/ndr_import/helpers/file/word.rb +27 -0
- data/lib/ndr_import/helpers/file/xml.rb +45 -0
- data/lib/ndr_import/helpers/file/zip.rb +44 -0
- data/lib/ndr_import/mapper.rb +220 -0
- data/lib/ndr_import/mapping_error.rb +5 -0
- data/lib/ndr_import/non_tabular/column_mapping.rb +73 -0
- data/lib/ndr_import/non_tabular/line.rb +46 -0
- data/lib/ndr_import/non_tabular/mapping.rb +35 -0
- data/lib/ndr_import/non_tabular/record.rb +99 -0
- data/lib/ndr_import/non_tabular/table.rb +193 -0
- data/lib/ndr_import/non_tabular_file_helper.rb +160 -0
- data/lib/ndr_import/standard_mappings.rb +23 -0
- data/lib/ndr_import/table.rb +179 -0
- data/lib/ndr_import/version.rb +4 -0
- data/ndr_import.gemspec +44 -0
- data/test/file/base_test.rb +54 -0
- data/test/file/delimited_test.rb +143 -0
- data/test/file/excel_test.rb +85 -0
- data/test/file/pdf_test.rb +35 -0
- data/test/file/registry_test.rb +60 -0
- data/test/file/text_test.rb +92 -0
- data/test/file/word_test.rb +35 -0
- data/test/file/zip_test.rb +47 -0
- data/test/helpers/file/delimited_test.rb +113 -0
- data/test/helpers/file/excel_test.rb +97 -0
- data/test/helpers/file/pdf_test.rb +26 -0
- data/test/helpers/file/word_test.rb +26 -0
- data/test/helpers/file/xml_test.rb +131 -0
- data/test/helpers/file/zip_test.rb +75 -0
- data/test/mapper_test.rb +551 -0
- data/test/non_tabular/mapping_test.rb +36 -0
- data/test/non_tabular/table_test.rb +510 -0
- data/test/non_tabular_file_helper_test.rb +501 -0
- data/test/readme_test.rb +53 -0
- data/test/resources/bomd.csv +3 -0
- data/test/resources/broken.csv +3 -0
- data/test/resources/filesystem_paths.yml +26 -0
- data/test/resources/flat_file.pdf +0 -0
- data/test/resources/flat_file.txt +27 -0
- data/test/resources/flat_file.yml +20 -0
- data/test/resources/hello_utf16be.txt +0 -0
- data/test/resources/hello_utf16le.txt +0 -0
- data/test/resources/hello_utf8.txt +2 -0
- data/test/resources/hello_windows.txt +2 -0
- data/test/resources/hello_world.doc +0 -0
- data/test/resources/hello_world.pdf +0 -0
- data/test/resources/hello_world.txt +2 -0
- data/test/resources/high_ascii_delimited.txt +2 -0
- data/test/resources/malformed.xml +6 -0
- data/test/resources/normal.csv +3 -0
- data/test/resources/normal.csv.zip +0 -0
- data/test/resources/normal_pipe.csv +3 -0
- data/test/resources/normal_thorn.csv +3 -0
- data/test/resources/not_a_pdf.pdf +0 -0
- data/test/resources/not_a_word_file.doc +0 -0
- data/test/resources/sample_xls.xls +0 -0
- data/test/resources/sample_xlsx.xlsx +0 -0
- data/test/resources/standard_mappings.yml +39 -0
- data/test/resources/txt_file_xls_extension.xls +1 -0
- data/test/resources/txt_file_xlsx_extension.xlsx +1 -0
- data/test/resources/utf-16be_xml.xml +0 -0
- data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
- data/test/resources/utf-16le_xml.xml +0 -0
- data/test/resources/utf-8_xml.xml +9 -0
- data/test/resources/windows-1252_xml.xml +9 -0
- data/test/resources/windows.csv +5 -0
- data/test/resources/xlsx_file_xls_extension.xls +0 -0
- data/test/standard_mappings_test.rb +22 -0
- data/test/table_test.rb +288 -0
- data/test/test_helper.rb +13 -0
- metadata +443 -0
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
require 'ndr_support/safe_file'
|
|
2
|
+
|
|
3
|
+
module NdrImport
  module Helpers
    module File
      # Mixin that gives unified importers the ability to read Word documents.
      # Only the legacy binary .doc format (Word 97-2003) is supported; .docx
      # files are not.
      module Word
        private

        # Extracts the text of the Word document at +path+ and returns it as an
        # array of lines (the document text split on "\n").
        #
        # The extractor library is only loaded when this method is first called.
        # Any StandardError raised while loading or reading the document is
        # re-raised as a RuntimeError whose message is the file's basename
        # followed by the original error's class and message.
        def read_word_file(path)
          require 'msworddoc-extractor'

          begin
            document = MSWordDoc::Extractor.load(SafeFile.safepath_to_string(path))
            document.whole_contents.split("\n")
          rescue => error
            raise("#{SafeFile.basename(path)} [#{error.class}: #{error.message}]")
          end
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
require 'ndr_support/safe_file'
|
|
2
|
+
require 'ndr_support/utf8_encoding'
|
|
3
|
+
|
|
4
|
+
module NdrImport
  module Helpers
    module File
      # Mixin that gives unified importers the ability to read XML files.
      module Xml
        include UTF8Encoding

        private

        # Reads the file at +path+, passes its contents through +ensure_utf8!+
        # (presumably provided by UTF8Encoding), parses the result with
        # Nokogiri and returns the parsed document with its encoding set to
        # 'UTF-8'.
        #
        # Raises Nokogiri::XML::SyntaxError if the parser recorded any fatal
        # error other than the tolerated encoding-label mismatch (see
        # emulate_strict_mode_fatal_check!).
        def read_xml_file(path)
          file_data = SafeFile.new(path).read

          require 'nokogiri'

          document = Nokogiri::XML(ensure_utf8!(file_data))
          document.encoding = 'UTF-8'
          emulate_strict_mode_fatal_check!(document)
          document
        end

        # Nokogiri can pass a `STRICT` parse option to libxml, but our friendly
        # handling of muddled encodings would make XML that is explicitly
        # declared as something other than UTF-8 fail under it, because the
        # data has already been recoded to UTF-8 by the time Nokogiri / libxml
        # sees it.
        #
        # Instead, this inspects the errors collected on +document+ and raises
        # a Nokogiri::XML::SyntaxError if any of them is fatal, ignoring
        # "labelled as X but has UTF-8 content" errors where X is one of the
        # AUTO_ENCODINGS. Returns nil when there is nothing fatal to report.
        def emulate_strict_mode_fatal_check!(document)
          labels    = AUTO_ENCODINGS.map { |name| Regexp.escape(name) }.join('|')
          tolerated = /\ADocument labelled (#{labels}) but has UTF-8 content\z/

          fatal_errors = document.errors.select(&:fatal?).reject do |error|
            tolerated =~ error.message
          end

          return if fatal_errors.empty?
          fail Nokogiri::XML::SyntaxError, "The file had #{fatal_errors.length} fatal error(s)!"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
require 'ndr_support/safe_file'
|
|
2
|
+
|
|
3
|
+
module NdrImport
  module Helpers
    module File
      # Mixin that gives unified importers the ability to unpack zip archives.
      module Zip
        private

        # Extracts the files held in the zip archive +source+ into the
        # +destination+ directory, creating that directory if necessary.
        # Only entries whose basename matches +pattern+ are extracted; the
        # default pattern matches everything.
        #
        # Files that already exist in +destination+ are overwritten.
        #
        # Raises a RuntimeError when running in a Rails "external" environment.
        # SecurityError and ArgumentError are propagated to the caller; any
        # other StandardError is printed with +puts+ and swallowed.
        def unzip_file(source, destination, pattern = //)
          # SECURE TVB Mon Aug 13 14:41:05 BST 2012 : SafePath will raise exception if insecure
          # path is constructed
          # SafeFile.safepath_to_string will make sure that the arguments are from type SafePath

          # SECURE: BNS 2010-09-21 (for external access)
          fail 'Not allowed in external environment' if defined?(::Rails) && ::Rails.env.external?

          require 'zip'
          # TODO: Abort if destination directory already exists...
          FileUtils.mkdir_p(SafeFile.safepath_to_string(destination))

          ::Zip::File.open(SafeFile.safepath_to_string(source)) do |zipfile|
            zipfile.entries.each do |entry|
              # SECURE: TPG 2010-11-1: The path is stripped from the zipfile entry when extracted
              basename = ::File.basename(entry.name)
              next unless entry.file? && basename.match(pattern)

              # The block answers rubyzip's "destination already exists" question:
              # returning true overwrites the file. Rescuing the resulting error
              # instead would abandon the remaining entries of the archive.
              zipfile.extract(entry, destination.join(basename)) { true }
            end
          end
        rescue SecurityError, ArgumentError
          # Security and argument problems must always reach the caller.
          raise
        rescue => ex
          puts ex
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
require 'ndr_support/string/cleaning'
|
|
2
|
+
require 'ndr_support/string/conversions'
|
|
3
|
+
require 'ndr_import/standard_mappings'
|
|
4
|
+
require 'base64'
|
|
5
|
+
require 'msworddoc-extractor'
|
|
6
|
+
|
|
7
|
+
# This module provides helper logic for mapping unified sources for import into the system
|
|
8
|
+
module NdrImport::Mapper
|
|
9
|
+
private
|
|
10
|
+
|
|
11
|
+
# Splits a fixed width +line+ into its columns. The 'unpack_pattern' of every
# mapping in +line_mappings+ is concatenated, in order, into one String#unpack
# directive; the array of unpacked values is returned.
def fixed_width_columns(line, line_mappings)
  patterns = line_mappings.map { |column_mapping| column_mapping['unpack_pattern'] }
  line.unpack(patterns.join)
end
|
|
17
|
+
|
|
18
|
+
# Applies the optional 'replace' option of +field_mapping+ to +original_value+
# so that replacements take effect before any other mapping option.
#
# 'replace' may be a single hash of pattern => replacement pairs or an array of
# such hashes; they are applied in order. +original_value+ is modified in place
# (via gsub!). Nothing happens, and nil is returned, when the mapping has no
# 'replace' key or +original_value+ is nil/false.
def replace_before_mapping(original_value, field_mapping)
  return unless field_mapping.include?('replace') && original_value

  replacement_sets = [field_mapping['replace']].flatten
  replacement_sets.each do |replacement_set|
    replacement_set.each { |pattern, replacement| original_value.gsub!(pattern, replacement) }
  end
end
|
|
28
|
+
|
|
29
|
+
# Returns the effective mapping for a column that uses the standard mapping
# called +mapping_name+: a new hash holding the standard mapping with the
# settings of +column_mapping+ merged over it. When both define 'mappings', the
# column's field mappings are appended to the standard ones rather than
# replacing them.
#
# Returns nil if no standard mapping of that name exists.
#
# Neither the shared standard mapping nor +column_mapping+ is modified, so
# column-specific field mappings cannot leak into other columns (or later
# lines) that use the same standard mapping.
def standard_mapping(mapping_name, column_mapping)
  standard = NdrImport::StandardMappings.mappings[mapping_name]
  return nil if standard.nil?

  standard.merge(column_mapping) do |key, standard_value, column_value|
    if key == 'mappings' && column_value
      # tolerate a standard mapping whose 'mappings' is present but nil
      (standard_value || []) + column_value
    else
      column_value
    end
  end
end
|
|
39
|
+
|
|
40
|
+
# Maps one +line+ (an array of raw values) to an attribute hash, pairing each
# value with the column mapping at the same index in +line_mappings+.
#
# For every captured column the raw value is optionally decoded ('decode'),
# optionally cast by the including object's +cast_raw_value+ hook (only used
# when the object responds to it, which lets different sources treat raw values
# differently) and stored under the lowercased 'rawtext_name' / 'column' name.
# Each of the column's field mappings then contributes a mapped value to its
# target field:
#
# * mappings with an 'order' are parts of a joined field; the parts are
#   concatenated in order using the 'join' string, and blank parts are dropped
#   unless 'compact' is false. The 'join' is assumed to be given on the first
#   joined mapping.
# * otherwise the non-blank value with the lowest 'priority' wins (mappings
#   without a priority count as priority 1).
#
# Returns the attribute hash, with the raw values under the :rawtext key.
# Raises ArgumentError if the line has more values than there are mappings.
def mapped_line(line, line_mappings)
  validate_line_mappings(line_mappings)

  collected = {}
  rawtext = {}

  line.each_with_index do |raw_value, col|
    column_mapping = line_mappings[col]
    if column_mapping.nil?
      fail ArgumentError,
           "Line has too many columns (expected #{line_mappings.size} but got #{line.size})"
    end

    next if column_mapping['do_not_capture']

    standard_name = column_mapping['standard_mapping']
    column_mapping = standard_mapping(standard_name, column_mapping) if standard_name
    field_mappings = column_mapping['mappings'] || []

    # Establish the rawtext column name we are to use for this column
    rawtext_column_name = (column_mapping['rawtext_name'] || column_mapping['column']).downcase

    # Run the raw value through each requested decoding, in order
    raw_value = Array(column_mapping['decode']).inject(raw_value) do |decoded, encoding|
      decode_raw_value(decoded, encoding)
    end

    # raw value casting can vary between sources, so the including object may apply it here
    if respond_to?(:cast_raw_value)
      raw_value = cast_raw_value(rawtext_column_name, raw_value, column_mapping)
    end

    rawtext[rawtext_column_name] = raw_value

    field_mappings.each do |field_mapping|
      # work on a copy, so that the stored raw value is never altered
      working_value = raw_value ? raw_value.dup : nil

      replace_before_mapping(working_value, field_mapping)
      value = mapped_value(working_value, field_mapping)

      # Blank values are only recorded for mappings that carry the join string
      joined = field_mapping['join'] ? true : false
      next if value.blank? && !joined

      # Currently assuming already validated YAML, s.t. no fields have the
      # same priorities
      field = field_mapping['field']
      slot = (collected[field] ||= {})
      slot[:priority] ||= {}

      if field_mapping['order']
        slot[field_mapping['order']] = value
        slot[:join] = field_mapping['join'] if joined
        slot[:compact] = field_mapping['compact'] if field_mapping.include?('compact')
      elsif field_mapping['priority']
        slot[:priority][field_mapping['priority']] = value
      else
        # No explicit priority: record the value with the default priority
        slot[:priority][1] = value
        slot[:value] = value
      end
    end
  end

  # Collapse the per-field working hashes into final values: joined parts for
  # many-to-one fields, the best-priority value for everything else.
  collected.keys.each do |field|
    slot = collected[field]

    if slot.include?(:join)
      join_string = slot.delete(:join) || ','
      compact = slot.delete(:compact) { true }
      slot.delete(:value)
      slot.delete(:priority)

      # what remains is keyed by 'order'
      parts = slot.sort.map { |_part_order, part_value| part_value.blank? ? nil : part_value }
      parts = parts.compact if compact
      collected[field] = parts.join(join_string)
    else
      candidates = slot[:priority].reject { |_priority, candidate| candidate.blank? }
      collected[field] = candidates.sort.first[1]
    end
  end

  collected[:rawtext] = rawtext
  collected
end
|
|
142
|
+
|
|
143
|
+
# Converts +original_value+ according to the first option found in
# +field_mapping+, checked in this order:
#
# 'format'    :: parses the value with +to_date+ using the given format
#                (nil for blank values). An ArgumentError raised while parsing
#                is re-raised with the offending value appended to its message.
# 'clean'     :: cleans the value with +clean+ (nil for blank values).
# 'map'       :: looks the value up in the given hash (nil if the map is nil).
# 'match'     :: returns the stripped first capture group of the given regular
#                expression, or nil if the value does not match, the pattern
#                has no capture group, or the group did not take part in the
#                match.
# 'daysafter' :: treats an integer value as a number of days after the given
#                date and returns the resulting date; non-integer values are
#                returned unchanged.
#
# Without any of these options, blank values become nil, strings are stripped
# and anything else is returned as is.
#
# NOTE(review): +blank?+, +to_date+, +clean+ and +days+ are not defined in this
# file; presumably they come from ndr_support / ActiveSupport.
def mapped_value(original_value, field_mapping)
  if field_mapping.include?('format')
    begin
      return original_value.blank? ? nil : original_value.to_date(field_mapping['format'])
    rescue ArgumentError => e
      # report which value could not be parsed, keeping the original backtrace
      raise ArgumentError, "#{e} value #{original_value.inspect}", e.backtrace
    end
  elsif field_mapping.include?('clean')
    return original_value.blank? ? nil : original_value.clean(field_mapping['clean'])
  elsif field_mapping.include?('map')
    return field_mapping['map'] ? field_mapping['map'][original_value] : nil
  elsif field_mapping.include?('match')
    # WARNING:TVB Thu Aug 9 17:09:25 BST 2012 field_mapping['match'] regexp
    # may need to be escaped
    matches = Regexp.new(field_mapping['match']).match(original_value)
    # Group 1 is nil when the pattern has no group or the group was skipped
    captured = matches && matches[1]
    return captured ? captured.strip : nil
  elsif field_mapping.include?('daysafter')
    return original_value unless original_value.to_i.to_s == original_value.to_s
    return original_value.to_i.days.since(field_mapping['daysafter'].to_time).to_date
  else
    return nil if original_value.blank?
    return original_value.is_a?(String) ? original_value.strip : original_value
  end
end
|
|
169
|
+
|
|
170
|
+
# Sanity checks +line_mappings+ before they are used, returning true when they
# pass.
#
# Raises a RuntimeError if a column names a standard mapping that does not
# exist, or if a field mapping gives a field the same explicit priority as the
# priority most recently recorded for that field (mappings without a priority
# are recorded as priority 1).
def validate_line_mappings(line_mappings)
  last_priority = {}

  line_mappings.each do |column_mapping|
    standard_name = column_mapping['standard_mapping']
    if standard_name && standard_mapping(standard_name, column_mapping).nil?
      fail "Standard mapping \"#{standard_name}\" does not exist"
    end

    (column_mapping['mappings'] || []).each do |field_mapping|
      field = field_mapping['field']
      explicit = field_mapping['priority']

      fail 'Cannot have duplicate priorities' if explicit && last_priority[field] == explicit
      last_priority[field] = explicit || 1
    end
  end

  true
end
|
|
192
|
+
|
|
193
|
+
# Decodes +raw_value+ using the named +encoding+ and returns the result.
# Supported encodings are :base64 and :word_doc; anything else raises a
# RuntimeError.
#
# Decodings can be chained on a column, e.g.
#
#   - column: base64
#     decode:
#     - :base64
#     - :word_doc
#
# would base64 decode a word document and then 'decode' that word document
# into plain text.
def decode_raw_value(raw_value, encoding)
  if :base64 == encoding
    Base64.decode64(raw_value)
  elsif :word_doc == encoding
    # the extractor reads from an IO, so wrap the string in a read-only stream
    read_word_stream(StringIO.new(raw_value, 'r'))
  else
    fail "Cannot decode: #{encoding}"
  end
end
|
|
212
|
+
|
|
213
|
+
# Extracts and returns the text of the .doc Word document held in the IO
# +stream+, using the same extractor as
# NdrImport::Helpers::File::Word#read_word_file.
def read_word_stream(stream)
  text = MSWordDoc::Extractor.load(stream).whole_contents
  # whole_contents adds a "\n" to the end of the text; remove that one newline
  text.sub(/\n\z/, '')
end
|
|
220
|
+
end
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
|
|
3
|
+
module NdrImport
  module NonTabular
    # This class stores the mapping for an individual non-tabular column, encapsulating
    # the logic associated with finding matching lines of source data and subsequently
    # capturing arrays of values within them.
    class ColumnMapping
      attr_accessor :name, :cell_mapping, :lines, :capture, :join

      # Builds the mapping from a +column_mapping+ hash. The name is taken from
      # 'rawtext_name', 'column' or 'standard_mapping' (first one present); the
      # 'non_tabular_cell' hash supplies 'lines', 'capture' and 'join'.
      #
      # Raises NdrImport::MappingError if 'non_tabular_cell', or its 'lines' or
      # 'capture' entry, is missing.
      def initialize(column_mapping)
        @name = column_mapping['rawtext_name'] ||
                column_mapping['column'] ||
                column_mapping['standard_mapping']
        @cell_mapping = column_mapping['non_tabular_cell']

        validate_cell_mapping

        @lines   = @cell_mapping['lines']
        # expose the capture pattern(s) too, so the declared accessor is not always nil
        @capture = @cell_mapping['capture']
        @join    = @cell_mapping['join']
      end

      # This method returns the range of matching source data lines. If the range is a
      # RegexpRange then it will calculate it for the text provided.
      def matching_lines(text)
        @lines.is_a?(RegexpRange) ? @lines.to_range(text) : @lines
      end

      # Captures the required part of +line+ by repeatedly replacing the value
      # with the first captured group of each 'capture' pattern in turn (a single
      # pattern or an array of patterns). A pattern that does not match yields
      # nil, and later patterns are then tried against the empty string. Always
      # using the first group is hardcoded in an attempt to preserve the rawtext
      # as much as possible.
      def capture_value(line)
        [@cell_mapping['capture']].flatten.inject(line.dup) do |value, pattern|
          matchdata = value.to_s.match(pattern)
          matchdata ? matchdata[1] : nil
        end
      end

      # Runs all of the cell mapping validations; each raises
      # NdrImport::MappingError on failure.
      def validate_cell_mapping
        validate_presence_of_non_tabular_cell
        validate_presence_of_non_tabular_cell_lines
        validate_presence_of_non_tabular_cell_capture
      end

      # Raises unless the column mapping has a 'non_tabular_cell' section.
      def validate_presence_of_non_tabular_cell
        return if @cell_mapping
        fail NdrImport::MappingError,
             I18n.t('mapping.errors.missing_non_tabular_cell', :name => @name)
      end

      # Raises unless the 'non_tabular_cell' section specifies 'lines'.
      def validate_presence_of_non_tabular_cell_lines
        return if @cell_mapping['lines']
        fail NdrImport::MappingError,
             I18n.t('mapping.errors.missing_non_tabular_cell_lines', :name => @name)
      end

      # Raises unless the 'non_tabular_cell' section specifies 'capture'.
      def validate_presence_of_non_tabular_cell_capture
        return if @cell_mapping['capture']
        fail NdrImport::MappingError,
             I18n.t('mapping.errors.missing_non_tabular_cell_capture', :name => @name)
      end
    end
  end
end
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
|
|
3
|
+
module NdrImport
  module NonTabular
    # Stands in for a single line of source text. It offers a small string-like
    # interface (=~, match, to_s) over the text, with trailing whitespace
    # removed, and carries bookkeeping about how the line has been used: its
    # line numbers, whether it is within a record, whether it has been removed,
    # and which fields / values have been captured from it.
    class Line
      attr_accessor :absolute_line_number, :record_line_number
      attr_accessor :in_a_record, :removed
      attr_accessor :captured_fields, :captures_values

      # +line+ is the source text and +absolute_line_number+ its position in the
      # source. The line starts outside any record, not removed and with
      # nothing captured.
      def initialize(line, absolute_line_number)
        @line = line.rstrip
        @absolute_line_number = absolute_line_number
        @in_a_record = @removed = false
        @captured_fields = []
        @captures_values = []
      end

      # Matches the line's text against +other+, as String#=~ does.
      def =~(other)
        @line =~ other
      end

      # Delegates to String#match on the line's text.
      def match(*args)
        @line.match(*args)
      end

      # The line's text (without trailing whitespace).
      def to_s
        @line
      end

      # Records that +field+ has been captured from this line. Each field is
      # only recorded once, and nil/false fields are ignored.
      def captured_for(field)
        return unless field
        return if @captured_fields.include?(field)

        @captured_fields << field
      end

      # Records a [field, value] pair captured from this line.
      def matches_for(field, value)
        @captures_values << [field, value]
      end
    end
  end
end
|