ndr_import 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +14 -0
  3. data/.rubocop.yml +27 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +22 -0
  6. data/CODE_OF_CONDUCT.md +13 -0
  7. data/Gemfile +4 -0
  8. data/Guardfile +16 -0
  9. data/LICENSE.txt +21 -0
  10. data/README.md +69 -0
  11. data/Rakefile +13 -0
  12. data/code_safety.yml +374 -0
  13. data/gemfiles/Gemfile.rails32 +5 -0
  14. data/gemfiles/Gemfile.rails32.lock +142 -0
  15. data/gemfiles/Gemfile.rails41 +5 -0
  16. data/gemfiles/Gemfile.rails41.lock +145 -0
  17. data/gemfiles/Gemfile.rails42 +5 -0
  18. data/gemfiles/Gemfile.rails42.lock +145 -0
  19. data/lib/ndr_import.rb +13 -0
  20. data/lib/ndr_import/csv_library.rb +40 -0
  21. data/lib/ndr_import/file/all.rb +8 -0
  22. data/lib/ndr_import/file/base.rb +76 -0
  23. data/lib/ndr_import/file/delimited.rb +86 -0
  24. data/lib/ndr_import/file/excel.rb +131 -0
  25. data/lib/ndr_import/file/pdf.rb +38 -0
  26. data/lib/ndr_import/file/registry.rb +50 -0
  27. data/lib/ndr_import/file/text.rb +52 -0
  28. data/lib/ndr_import/file/word.rb +30 -0
  29. data/lib/ndr_import/file/zip.rb +67 -0
  30. data/lib/ndr_import/helpers/file/delimited.rb +105 -0
  31. data/lib/ndr_import/helpers/file/excel.rb +181 -0
  32. data/lib/ndr_import/helpers/file/pdf.rb +29 -0
  33. data/lib/ndr_import/helpers/file/word.rb +27 -0
  34. data/lib/ndr_import/helpers/file/xml.rb +45 -0
  35. data/lib/ndr_import/helpers/file/zip.rb +44 -0
  36. data/lib/ndr_import/mapper.rb +220 -0
  37. data/lib/ndr_import/mapping_error.rb +5 -0
  38. data/lib/ndr_import/non_tabular/column_mapping.rb +73 -0
  39. data/lib/ndr_import/non_tabular/line.rb +46 -0
  40. data/lib/ndr_import/non_tabular/mapping.rb +35 -0
  41. data/lib/ndr_import/non_tabular/record.rb +99 -0
  42. data/lib/ndr_import/non_tabular/table.rb +193 -0
  43. data/lib/ndr_import/non_tabular_file_helper.rb +160 -0
  44. data/lib/ndr_import/standard_mappings.rb +23 -0
  45. data/lib/ndr_import/table.rb +179 -0
  46. data/lib/ndr_import/version.rb +4 -0
  47. data/ndr_import.gemspec +44 -0
  48. data/test/file/base_test.rb +54 -0
  49. data/test/file/delimited_test.rb +143 -0
  50. data/test/file/excel_test.rb +85 -0
  51. data/test/file/pdf_test.rb +35 -0
  52. data/test/file/registry_test.rb +60 -0
  53. data/test/file/text_test.rb +92 -0
  54. data/test/file/word_test.rb +35 -0
  55. data/test/file/zip_test.rb +47 -0
  56. data/test/helpers/file/delimited_test.rb +113 -0
  57. data/test/helpers/file/excel_test.rb +97 -0
  58. data/test/helpers/file/pdf_test.rb +26 -0
  59. data/test/helpers/file/word_test.rb +26 -0
  60. data/test/helpers/file/xml_test.rb +131 -0
  61. data/test/helpers/file/zip_test.rb +75 -0
  62. data/test/mapper_test.rb +551 -0
  63. data/test/non_tabular/mapping_test.rb +36 -0
  64. data/test/non_tabular/table_test.rb +510 -0
  65. data/test/non_tabular_file_helper_test.rb +501 -0
  66. data/test/readme_test.rb +53 -0
  67. data/test/resources/bomd.csv +3 -0
  68. data/test/resources/broken.csv +3 -0
  69. data/test/resources/filesystem_paths.yml +26 -0
  70. data/test/resources/flat_file.pdf +0 -0
  71. data/test/resources/flat_file.txt +27 -0
  72. data/test/resources/flat_file.yml +20 -0
  73. data/test/resources/hello_utf16be.txt +0 -0
  74. data/test/resources/hello_utf16le.txt +0 -0
  75. data/test/resources/hello_utf8.txt +2 -0
  76. data/test/resources/hello_windows.txt +2 -0
  77. data/test/resources/hello_world.doc +0 -0
  78. data/test/resources/hello_world.pdf +0 -0
  79. data/test/resources/hello_world.txt +2 -0
  80. data/test/resources/high_ascii_delimited.txt +2 -0
  81. data/test/resources/malformed.xml +6 -0
  82. data/test/resources/normal.csv +3 -0
  83. data/test/resources/normal.csv.zip +0 -0
  84. data/test/resources/normal_pipe.csv +3 -0
  85. data/test/resources/normal_thorn.csv +3 -0
  86. data/test/resources/not_a_pdf.pdf +0 -0
  87. data/test/resources/not_a_word_file.doc +0 -0
  88. data/test/resources/sample_xls.xls +0 -0
  89. data/test/resources/sample_xlsx.xlsx +0 -0
  90. data/test/resources/standard_mappings.yml +39 -0
  91. data/test/resources/txt_file_xls_extension.xls +1 -0
  92. data/test/resources/txt_file_xlsx_extension.xlsx +1 -0
  93. data/test/resources/utf-16be_xml.xml +0 -0
  94. data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
  95. data/test/resources/utf-16le_xml.xml +0 -0
  96. data/test/resources/utf-8_xml.xml +9 -0
  97. data/test/resources/windows-1252_xml.xml +9 -0
  98. data/test/resources/windows.csv +5 -0
  99. data/test/resources/xlsx_file_xls_extension.xls +0 -0
  100. data/test/standard_mappings_test.rb +22 -0
  101. data/test/table_test.rb +288 -0
  102. data/test/test_helper.rb +13 -0
  103. metadata +443 -0
@@ -0,0 +1,27 @@
1
+ require 'ndr_support/safe_file'
2
+
3
module NdrImport
  module Helpers
    module File
      # Mixin that gives unified importers the ability to read Word documents.
      # Per the original note, only .doc (97-2003) files are handled, not .docx.
      module Word
        private

        # Extracts the text of the Word document at +path+ and returns it as an
        # array of lines (the document's whole contents split on "\n").
        #
        # Any StandardError raised while loading or reading the document is
        # re-raised as a RuntimeError whose message is the file's basename
        # followed by the class and message of the underlying error.
        def read_word_file(path)
          # Loaded lazily, so the dependency is only needed when a Word file is read.
          require 'msworddoc-extractor'

          begin
            document = MSWordDoc::Extractor.load(SafeFile.safepath_to_string(path))
            document.whole_contents.split("\n")
          rescue => error
            raise("#{SafeFile.basename(path)} [#{error.class}: #{error.message}]")
          end
        end
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'ndr_support/safe_file'
2
+ require 'ndr_support/utf8_encoding'
3
+
4
module NdrImport
  module Helpers
    module File
      # Mixin that gives unified importers the ability to read XML files.
      module Xml
        include UTF8Encoding

        private

        # Reads the file at +path+, passes its contents through +ensure_utf8!+
        # and parses the result with Nokogiri.
        #
        # Returns the parsed document with its encoding set to 'UTF-8'.
        # Raises Nokogiri::XML::SyntaxError if the parse recorded fatal errors
        # (see #emulate_strict_mode_fatal_check!).
        def read_xml_file(path)
          file_data = SafeFile.new(path).read

          # Loaded lazily, after the file has been read.
          require 'nokogiri'

          document = Nokogiri::XML(ensure_utf8!(file_data))
          document.encoding = 'UTF-8'
          emulate_strict_mode_fatal_check!(document)
          document
        end

        # Nokogiri can pass a `STRICT` parse option to libxml, but because the
        # data has already been recoded to UTF-8 before Nokogiri / libxml sees
        # it, XML that explicitly declares another encoding would fail under
        # strict mode. Instead, the errors collected during a normal parse are
        # inspected here.
        #
        # Raises Nokogiri::XML::SyntaxError if the document has any fatal error
        # other than the tolerated "labelled X but has UTF-8 content" message
        # for one of the AUTO_ENCODINGS; otherwise returns nil.
        def emulate_strict_mode_fatal_check!(document)
          # Messages about a declared auto-encoding that was parsed as UTF-8 are let slide:
          tolerated_names   = AUTO_ENCODINGS.map { |name| Regexp.escape(name) }.join('|')
          tolerated_warning = /\ADocument labelled (#{tolerated_names}) but has UTF-8 content\z/

          fatal_errors = document.errors.select do |error|
            error.fatal? && (tolerated_warning !~ error.message)
          end

          return if fatal_errors.empty?
          fail Nokogiri::XML::SyntaxError, "The file had #{fatal_errors.length} fatal error(s)!"
        end
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ require 'ndr_support/safe_file'
2
+
3
module NdrImport
  module Helpers
    module File
      # This mixin adds Zip functionality to unified importers.
      module Zip
        private

        # Unzips +source+ into the +destination+ directory, creating the
        # directory if necessary. Only regular file entries whose basename
        # matches +pattern+ are extracted (the default pattern matches all).
        #
        # Every entry is written directly into +destination+ using only the
        # basename of its archived name, so any directory component stored in
        # the archive is discarded. Files that already exist in +destination+
        # are overwritten.
        #
        # Raises RuntimeError when running under Rails in the "external"
        # environment. SecurityError and ArgumentError are propagated to the
        # caller; any other StandardError is printed with +puts+ and swallowed,
        # in which case nil is returned.
        def unzip_file(source, destination, pattern = //)
          # SECURE TVB Mon Aug 13 14:41:05 BST 2012 : SafePath will raise exception if
          # an insecure path is constructed; per that note, SafeFile.safepath_to_string
          # checks that the arguments are of type SafePath.

          # SECURE: BNS 2010-09-21 (for external access)
          fail 'Not allowed in external environment' if defined?(::Rails) && ::Rails.env.external?

          require 'fileutils'
          require 'zip'
          # TODO: Abort if destination directory already exists...
          FileUtils.mkdir_p(SafeFile.safepath_to_string(destination))

          ::Zip::File.open(SafeFile.safepath_to_string(source)) do |zipfile|
            zipfile.entries.each do |entry|
              # SECURE: TPG 2010-11-1: The path is stripped from the zipfile entry when extracted
              basename = ::File.basename(entry.name)
              next unless entry.file? && basename.match(pattern)

              # The block is rubyzip's "destination already exists" callback; returning
              # true overwrites the existing file. Previously the resulting error was
              # rescued at method level, which silently abandoned all remaining entries
              # instead of overwriting.
              zipfile.extract(entry, destination.join(basename)) { true }
            end
          end
        rescue SecurityError, ArgumentError
          # Path-safety and argument problems must reach the caller unchanged.
          raise
        rescue => ex
          # NOTE(review): every other failure (e.g. an unreadable archive) is only printed,
          # so callers cannot detect it. Kept as-is for backward compatibility.
          puts ex
        end
      end
    end
  end
end
@@ -0,0 +1,220 @@
1
+ require 'ndr_support/string/cleaning'
2
+ require 'ndr_support/string/conversions'
3
+ require 'ndr_import/standard_mappings'
4
+ require 'base64'
5
+ require 'msworddoc-extractor'
6
+
7
+ # This module provides helper logic for mapping unified sources for import into the system
8
+ module NdrImport::Mapper
9
+ private
10
+
11
# Splits a fixed-width +line+ into its columns.
# The 'unpack_pattern' of every column mapping is concatenated, in order, into a
# single String#unpack format, and the array of unpacked columns is returned.
def fixed_width_columns(line, line_mappings)
  directives = line_mappings.map { |column_mapping| column_mapping['unpack_pattern'] }
  line.unpack(directives.join)
end
17
+
18
# Applies the 'replace' option of +field_mapping+, which can be used before any
# other mapping option. 'replace' may be a single pattern => replacement hash or
# an array of them; each pair is applied in turn with gsub!, so +original_value+
# is modified in place. Nothing happens when there is no 'replace' option or no
# value to work on.
def replace_before_mapping(original_value, field_mapping)
  return unless field_mapping.include?('replace') && original_value

  replacement_sets = [field_mapping['replace']].flatten
  replacement_sets.each do |replacement_set|
    replacement_set.each { |pattern, substitute| original_value.gsub!(pattern, substitute) }
  end
end
28
+
29
# Returns the standard mapping registered under +mapping_name+ combined with the
# column's own settings, or nil if no such standard mapping exists.
#
# Keys present in +column_mapping+ override those of the standard mapping, except
# for 'mappings', where the column's field mappings are appended to the standard
# ones.
#
# Neither the shared NdrImport::StandardMappings definition nor +column_mapping+
# is modified, so repeated calls give the same result and one column's field
# mappings cannot leak into other users of the same standard mapping. The result
# is a shallow copy: nested values are still shared with the standard mapping.
def standard_mapping(mapping_name, column_mapping)
  mapping = NdrImport::StandardMappings.mappings[mapping_name]
  return nil if mapping.nil?

  mapping.merge(column_mapping) do |key, standard_value, column_value|
    if key == 'mappings'
      # Array#+ builds a new array, leaving both source arrays untouched.
      (standard_value || []) + (column_value || [])
    else
      column_value
    end
  end
end
39
+
40
# Takes an array of raw values (+line+) and the column mappings at the same
# positions (+line_mappings+) and returns an attribute hash.
#
# For each captured column the raw value is decoded (the 'decode' option), then
# optionally cast by the including object's +cast_raw_value+ (only if it responds
# to it), which lets different sources adjust what is stored as raw text. The
# result is stored under :rawtext, keyed by the downcased 'rawtext_name' or
# 'column'. Each field mapping of the column then contributes a mapped value to
# its 'field', either as an ordered part of a joined field ('order' / 'join' /
# 'compact') or as a prioritised candidate ('priority', defaulting to 1).
#
# Raises ArgumentError when the line has more values than there are mappings,
# plus whatever +validate_line_mappings+ raises.
def mapped_line(line, line_mappings)
  attributes = {}
  rawtext = {}
  validate_line_mappings(line_mappings)

  line.each_with_index do |raw_value, col|
    column_mapping = line_mappings[col]
    if column_mapping.nil?
      fail ArgumentError,
           "Line has too many columns (expected #{line_mappings.size} but got #{line.size})"
    end

    next if column_mapping['do_not_capture']

    standard_name = column_mapping['standard_mapping']
    column_mapping = standard_mapping(standard_name, column_mapping) if standard_name
    field_mappings = column_mapping['mappings'] || []

    # Establish the rawtext column name we are to use for this column
    rawtext_column_name = (column_mapping['rawtext_name'] || column_mapping['column']).downcase

    # Run the raw value through each requested decoder, in order
    raw_value = Array(column_mapping['decode']).inject(raw_value) do |decoded, encoding|
      decode_raw_value(decoded, encoding)
    end

    # Raw value casting can vary between sources, so the caller may apply it here
    if respond_to?(:cast_raw_value)
      raw_value = cast_raw_value(rawtext_column_name, raw_value, column_mapping)
    end

    # Store the raw column value
    rawtext[rawtext_column_name] = raw_value

    field_mappings.each do |field_mapping|
      # Work on a copy so the stored raw value is never altered
      original_value = (raw_value.dup if raw_value)

      replace_before_mapping(original_value, field_mapping)
      value = mapped_value(original_value, field_mapping)

      field = field_mapping['field']

      # Assumes join is specified in first joined field
      joined = field_mapping['join'] ? true : false

      # Blank values are only recorded for joined fields.
      # Currently assuming already validated YAML, s.t. no fields have the
      # same priorities.
      next if value.blank? && !joined

      slot = (attributes[field] ||= {})
      slot[:priority] ||= {}

      if field_mapping['order']
        slot[field_mapping['order']] = value
        slot[:join] = field_mapping['join'] if field_mapping['join']
        slot[:compact] = field_mapping['compact'] if field_mapping.include?('compact')
      elsif field_mapping['priority']
        slot[:priority][field_mapping['priority']] = value
      else
        # No explicit priority: record the value at the default priority of 1
        slot[:priority][1] = value
        slot[:value] = value
      end
    end
  end

  # Tidy up many-to-one field mappings (joins), and one-to-many
  # (priorities), for cross-populating
  attributes.each do |field, value|
    if value.include?(:join)
      join_string = value.delete(:join) || ','
      value.delete(:value)
      value.delete(:priority)
      compact = value.include?(:compact) ? value.delete(:compact) : true

      # What remains in the hash are the ordered parts; blanks become nil
      parts = value.sort.map do |_part_order, part_value|
        part_value unless part_value.blank?
      end
      parts = parts.compact if compact
      attributes[field] = parts.join(join_string)
    else
      # Keep the non-blank candidate whose priority key sorts first
      priorities = value[:priority]
      priorities.reject! { |_k, v| v.blank? }
      attributes[field] = priorities.sort.first[1]
    end
  end

  attributes[:rawtext] = rawtext
  attributes
end
142
+
143
# Converts +original_value+ according to the first option found in
# +field_mapping+, checked in this order:
#
# 'format'    :: blank values give nil; otherwise the value is parsed with
#                +to_date+ using the given format. An ArgumentError raised while
#                parsing is re-raised with the offending value appended.
# 'clean'     :: blank values give nil; otherwise +clean+ is called with the
#                option's value.
# 'map'       :: looks the value up in the given hash (nil if the hash is nil).
# 'match'     :: returns the stripped first capture group of the pattern, or nil
#                when the pattern does not match or captured nothing in group 1.
# 'daysafter' :: when the value is a whole number, returns the date that many
#                days after the given date; otherwise the value is returned as is.
# (none)      :: blank values give nil, strings are stripped, anything else is
#                returned unchanged.
def mapped_value(original_value, field_mapping)
  if field_mapping.include?('format')
    begin
      original_value.blank? ? nil : original_value.to_date(field_mapping['format'])
    rescue ArgumentError => e
      # Same error class and backtrace, but with the offending value in the message
      raise ArgumentError, "#{e} value #{original_value.inspect}", e.backtrace
    end
  elsif field_mapping.include?('clean')
    original_value.blank? ? nil : original_value.clean(field_mapping['clean'])
  elsif field_mapping.include?('map')
    field_mapping['map'] ? field_mapping['map'][original_value] : nil
  elsif field_mapping.include?('match')
    # WARNING:TVB Thu Aug  9 17:09:25 BST 2012 field_mapping['match'] regexp
    # may need to be escaped
    matches = Regexp.new(field_mapping['match']).match(original_value)
    # Group 1 is nil when the pattern has no capture group, or the group did not
    # take part in the match; give nil rather than failing on nil.strip.
    captured = matches && matches[1]
    captured ? captured.strip : nil
  elsif field_mapping.include?('daysafter')
    return original_value unless original_value.to_i.to_s == original_value.to_s
    original_value.to_i.days.since(field_mapping['daysafter'].to_time).to_date
  else
    return nil if original_value.blank?
    original_value.is_a?(String) ? original_value.strip : original_value
  end
end
169
+
170
# Validates +line_mappings+ before they are used; returns true when they pass.
#
# Raises RuntimeError if a column names a standard mapping that does not exist,
# or if a field mapping declares the same 'priority' as the one most recently
# recorded for that field (mappings without a priority are recorded as 1).
# Note that only the most recently recorded priority of each field is compared,
# so non-adjacent repeats are not detected.
def validate_line_mappings(line_mappings)
  last_priority = {}

  line_mappings.each do |column_mapping|
    standard_name = column_mapping['standard_mapping']
    if standard_name && standard_mapping(standard_name, column_mapping).nil?
      fail "Standard mapping \"#{standard_name}\" does not exist"
    end

    (column_mapping['mappings'] || []).each do |field_mapping|
      field = field_mapping['field']
      declared = field_mapping['priority']

      fail 'Cannot have duplicate priorities' if declared && last_priority[field] == declared
      last_priority[field] = declared || 1
    end
  end

  true
end
192
+
193
# Decodes +raw_value+ with a single +encoding+ step:
#
# :base64   :: Base64-decodes the value
# :word_doc :: treats the value as the bytes of a .doc file and extracts its text
#
# Any other encoding raises a RuntimeError. Steps are chained by listing several
# under a column's 'decode' option, e.g.
#
#   - column: base64
#     decode:
#     - :base64
#     - :word_doc
#
# would base64 decode a word document and then 'decode' the word document into
# plain text.
def decode_raw_value(raw_value, encoding)
  return Base64.decode64(raw_value) if :base64 == encoding
  return read_word_stream(StringIO.new(raw_value, 'r')) if :word_doc == encoding

  fail "Cannot decode: #{encoding}"
end
212
+
213
# Given an IO +stream+ holding a .doc Word document, returns the document's text,
# extracted with the same library as NdrImport::Helpers::File::Word#read_word_file.
def read_word_stream(stream)
  document = MSWordDoc::Extractor.load(stream)
  text = document.whole_contents

  # whole_contents adds "\n" to the end of the stream; remove that one newline
  text.sub(/\n\z/, '')
end
220
+ end
@@ -0,0 +1,5 @@
1
module NdrImport
  # Raised if there is a problem with an import mapping.
  class MappingError < StandardError; end
end
@@ -0,0 +1,73 @@
1
+ # encoding: UTF-8
2
+
3
module NdrImport
  module NonTabular
    # Holds the mapping for one non-tabular column: which lines of the source
    # text belong to the column, and how the value is captured from such a line.
    class ColumnMapping
      attr_accessor :name, :cell_mapping, :lines, :capture, :join

      # Builds the mapping from a column mapping hash. The name is taken from
      # 'rawtext_name', 'column' or 'standard_mapping' (first one present); the
      # 'non_tabular_cell' hash must be present and have 'lines' and 'capture',
      # otherwise NdrImport::MappingError is raised.
      def initialize(column_mapping)
        @cell_mapping = column_mapping['non_tabular_cell']
        @name = column_mapping['rawtext_name'] ||
                column_mapping['column'] ||
                column_mapping['standard_mapping']

        validate_cell_mapping

        @lines, @join = @cell_mapping.values_at('lines', 'join')
      end

      # Returns the range of source lines that belong to this column. When the
      # mapping's 'lines' is a RegexpRange, it is resolved against +text+;
      # anything else is returned as configured.
      def matching_lines(text)
        @lines.is_a?(RegexpRange) ? @lines.to_range(text) : @lines
      end

      # Captures the required part of +line+ by applying each 'capture' pattern
      # in turn, every time replacing the current value with the first captured
      # group (or nil when the pattern does not match). Working on a copy of the
      # line is an attempt to preserve the rawtext as much as possible.
      def capture_value(line)
        [@cell_mapping['capture']].flatten.inject(line.dup) do |value, pattern|
          matchdata = value.to_s.match(pattern)
          matchdata ? matchdata[1] : nil
        end
      end

      # Runs all cell mapping checks; the first failing check raises
      # NdrImport::MappingError.
      def validate_cell_mapping
        validate_presence_of_non_tabular_cell
        validate_presence_of_non_tabular_cell_lines
        validate_presence_of_non_tabular_cell_capture
      end

      # Raises NdrImport::MappingError unless a 'non_tabular_cell' was given.
      def validate_presence_of_non_tabular_cell
        fail NdrImport::MappingError,
             I18n.t('mapping.errors.missing_non_tabular_cell', :name => @name) unless @cell_mapping
      end

      # Raises NdrImport::MappingError unless the cell mapping has 'lines'.
      def validate_presence_of_non_tabular_cell_lines
        fail NdrImport::MappingError,
             I18n.t('mapping.errors.missing_non_tabular_cell_lines',
                    :name => @name) unless @cell_mapping['lines']
      end

      # Raises NdrImport::MappingError unless the cell mapping has 'capture'.
      def validate_presence_of_non_tabular_cell_capture
        fail NdrImport::MappingError,
             I18n.t('mapping.errors.missing_non_tabular_cell_capture',
                    :name => @name) unless @cell_mapping['capture']
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
+ # encoding: UTF-8
2
+
3
module NdrImport
  module NonTabular
    # Stands in for a line of source text. It supports the string operations
    # defined below (=~, match, to_s) while also carrying bookkeeping about how
    # the line has been used, e.g. whether it is within a record and for which
    # fields it has been used to capture a value.
    class Line
      attr_accessor :absolute_line_number, :captured_fields, :captures_values,
                    :in_a_record, :record_line_number, :removed

      # Wraps +line+ (with trailing whitespace removed) together with its
      # +absolute_line_number+. A new line is neither in a record nor removed,
      # and has no captures.
      def initialize(line, absolute_line_number)
        @line = line.rstrip
        @absolute_line_number = absolute_line_number
        @in_a_record = false
        @removed = false
        @captured_fields = []
        @captures_values = []
      end

      # Matches the wrapped text against +other+, as String#=~ does.
      def =~(other)
        @line =~ other
      end

      # Delegates to String#match on the wrapped text.
      def match(*args)
        @line.match(*args)
      end

      # Returns the wrapped text.
      def to_s
        @line
      end

      # Records that this line was used to capture +field+. Falsy fields and
      # fields that are already recorded are ignored.
      def captured_for(field)
        return unless field
        return if @captured_fields.include?(field)

        @captured_fields << field
      end

      # Records a [field, value] pair captured from this line.
      def matches_for(field, value)
        @captures_values << [field, value]
      end
    end
  end
end