ndr_import 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (103) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +14 -0
  3. data/.rubocop.yml +27 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +22 -0
  6. data/CODE_OF_CONDUCT.md +13 -0
  7. data/Gemfile +4 -0
  8. data/Guardfile +16 -0
  9. data/LICENSE.txt +21 -0
  10. data/README.md +69 -0
  11. data/Rakefile +13 -0
  12. data/code_safety.yml +374 -0
  13. data/gemfiles/Gemfile.rails32 +5 -0
  14. data/gemfiles/Gemfile.rails32.lock +142 -0
  15. data/gemfiles/Gemfile.rails41 +5 -0
  16. data/gemfiles/Gemfile.rails41.lock +145 -0
  17. data/gemfiles/Gemfile.rails42 +5 -0
  18. data/gemfiles/Gemfile.rails42.lock +145 -0
  19. data/lib/ndr_import.rb +13 -0
  20. data/lib/ndr_import/csv_library.rb +40 -0
  21. data/lib/ndr_import/file/all.rb +8 -0
  22. data/lib/ndr_import/file/base.rb +76 -0
  23. data/lib/ndr_import/file/delimited.rb +86 -0
  24. data/lib/ndr_import/file/excel.rb +131 -0
  25. data/lib/ndr_import/file/pdf.rb +38 -0
  26. data/lib/ndr_import/file/registry.rb +50 -0
  27. data/lib/ndr_import/file/text.rb +52 -0
  28. data/lib/ndr_import/file/word.rb +30 -0
  29. data/lib/ndr_import/file/zip.rb +67 -0
  30. data/lib/ndr_import/helpers/file/delimited.rb +105 -0
  31. data/lib/ndr_import/helpers/file/excel.rb +181 -0
  32. data/lib/ndr_import/helpers/file/pdf.rb +29 -0
  33. data/lib/ndr_import/helpers/file/word.rb +27 -0
  34. data/lib/ndr_import/helpers/file/xml.rb +45 -0
  35. data/lib/ndr_import/helpers/file/zip.rb +44 -0
  36. data/lib/ndr_import/mapper.rb +220 -0
  37. data/lib/ndr_import/mapping_error.rb +5 -0
  38. data/lib/ndr_import/non_tabular/column_mapping.rb +73 -0
  39. data/lib/ndr_import/non_tabular/line.rb +46 -0
  40. data/lib/ndr_import/non_tabular/mapping.rb +35 -0
  41. data/lib/ndr_import/non_tabular/record.rb +99 -0
  42. data/lib/ndr_import/non_tabular/table.rb +193 -0
  43. data/lib/ndr_import/non_tabular_file_helper.rb +160 -0
  44. data/lib/ndr_import/standard_mappings.rb +23 -0
  45. data/lib/ndr_import/table.rb +179 -0
  46. data/lib/ndr_import/version.rb +4 -0
  47. data/ndr_import.gemspec +44 -0
  48. data/test/file/base_test.rb +54 -0
  49. data/test/file/delimited_test.rb +143 -0
  50. data/test/file/excel_test.rb +85 -0
  51. data/test/file/pdf_test.rb +35 -0
  52. data/test/file/registry_test.rb +60 -0
  53. data/test/file/text_test.rb +92 -0
  54. data/test/file/word_test.rb +35 -0
  55. data/test/file/zip_test.rb +47 -0
  56. data/test/helpers/file/delimited_test.rb +113 -0
  57. data/test/helpers/file/excel_test.rb +97 -0
  58. data/test/helpers/file/pdf_test.rb +26 -0
  59. data/test/helpers/file/word_test.rb +26 -0
  60. data/test/helpers/file/xml_test.rb +131 -0
  61. data/test/helpers/file/zip_test.rb +75 -0
  62. data/test/mapper_test.rb +551 -0
  63. data/test/non_tabular/mapping_test.rb +36 -0
  64. data/test/non_tabular/table_test.rb +510 -0
  65. data/test/non_tabular_file_helper_test.rb +501 -0
  66. data/test/readme_test.rb +53 -0
  67. data/test/resources/bomd.csv +3 -0
  68. data/test/resources/broken.csv +3 -0
  69. data/test/resources/filesystem_paths.yml +26 -0
  70. data/test/resources/flat_file.pdf +0 -0
  71. data/test/resources/flat_file.txt +27 -0
  72. data/test/resources/flat_file.yml +20 -0
  73. data/test/resources/hello_utf16be.txt +0 -0
  74. data/test/resources/hello_utf16le.txt +0 -0
  75. data/test/resources/hello_utf8.txt +2 -0
  76. data/test/resources/hello_windows.txt +2 -0
  77. data/test/resources/hello_world.doc +0 -0
  78. data/test/resources/hello_world.pdf +0 -0
  79. data/test/resources/hello_world.txt +2 -0
  80. data/test/resources/high_ascii_delimited.txt +2 -0
  81. data/test/resources/malformed.xml +6 -0
  82. data/test/resources/normal.csv +3 -0
  83. data/test/resources/normal.csv.zip +0 -0
  84. data/test/resources/normal_pipe.csv +3 -0
  85. data/test/resources/normal_thorn.csv +3 -0
  86. data/test/resources/not_a_pdf.pdf +0 -0
  87. data/test/resources/not_a_word_file.doc +0 -0
  88. data/test/resources/sample_xls.xls +0 -0
  89. data/test/resources/sample_xlsx.xlsx +0 -0
  90. data/test/resources/standard_mappings.yml +39 -0
  91. data/test/resources/txt_file_xls_extension.xls +1 -0
  92. data/test/resources/txt_file_xlsx_extension.xlsx +1 -0
  93. data/test/resources/utf-16be_xml.xml +0 -0
  94. data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
  95. data/test/resources/utf-16le_xml.xml +0 -0
  96. data/test/resources/utf-8_xml.xml +9 -0
  97. data/test/resources/windows-1252_xml.xml +9 -0
  98. data/test/resources/windows.csv +5 -0
  99. data/test/resources/xlsx_file_xls_extension.xls +0 -0
  100. data/test/standard_mappings_test.rb +22 -0
  101. data/test/table_test.rb +288 -0
  102. data/test/test_helper.rb +13 -0
  103. metadata +443 -0
@@ -0,0 +1,27 @@
1
+ require 'ndr_support/safe_file'
2
+
3
module NdrImport
  module Helpers
    module File
      # This mixin adds Word document functionality to unified importers.
      # It provides a file reader method.
      # Currently only works on .doc (97-2003) files, not .docx.
      module Word
        private

        # Reads the Word document at +path+ and returns its text as an array
        # of lines (the whole contents split on "\n").
        #
        # +path+ is converted with SafeFile.safepath_to_string before loading.
        # Any StandardError raised while loading or reading the document is
        # re-raised as a RuntimeError whose message starts with the file's
        # basename and includes the original error class and message.
        def read_word_file(path)
          # Required here, at call time, rather than when this file is loaded.
          require 'msworddoc-extractor'
          doc = nil
          begin
            doc = MSWordDoc::Extractor.load(SafeFile.safepath_to_string(path))
            doc.whole_contents.split("\n")
          rescue => e
            raise("#{SafeFile.basename(path)} [#{e.class}: #{e.message}]")
          ensure
            # Release the underlying document once its text has been read.
            # NOTE(review): relies on Extractor#close as described in the
            # msworddoc-extractor README; guarded in case it is unavailable.
            doc.close if doc.respond_to?(:close)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,45 @@
1
+ require 'ndr_support/safe_file'
2
+ require 'ndr_support/utf8_encoding'
3
+
4
module NdrImport
  module Helpers
    module File
      # This mixin adds XML functionality to unified importers.
      module Xml
        include UTF8Encoding

        private

        # Reads the file at +path+ through SafeFile, passes its contents
        # through ensure_utf8! and parses the result with Nokogiri.
        #
        # Returns the parsed document, with its encoding set to UTF-8.
        # Raises Nokogiri::XML::SyntaxError if the document has fatal errors
        # (see emulate_strict_mode_fatal_check!).
        def read_xml_file(path)
          file_data = SafeFile.new(path).read

          require 'nokogiri'

          document = Nokogiri::XML(ensure_utf8!(file_data))
          document.encoding = 'UTF-8'
          emulate_strict_mode_fatal_check!(document)
          document
        end

        # Nokogiri can be given a `STRICT` parse option for libxml, but our
        # friendly handling of muddled encodings causes XML explicitly declared
        # as something other than UTF-8 to fail (because it has been recoded to
        # UTF-8 by the time it is given to Nokogiri / libxml).
        #
        # Instead, this inspects the errors recorded on the parsed +document+
        # and raises a SyntaxError if any fatal error remains once the expected
        # "labelled X but has UTF-8 content" messages have been set aside.
        def emulate_strict_mode_fatal_check!(document)
          # Messages about XML declared as one of our auto encodings, but
          # parsed as UTF-8, are tolerated:
          escaped_names    = AUTO_ENCODINGS.map { |name| Regexp.escape(name) }
          encoding_warning =
            /\ADocument labelled (#{escaped_names.join('|')}) but has UTF-8 content\z/

          fatal_errors = document.errors.select do |error|
            error.fatal? && !(encoding_warning =~ error.message)
          end

          return if fatal_errors.empty?
          fail Nokogiri::XML::SyntaxError, "The file had #{fatal_errors.length} fatal error(s)!"
        end
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
+ require 'ndr_support/safe_file'
2
+
3
module NdrImport
  module Helpers
    module File
      # This mixin adds Zip functionality to unified importers.
      module Zip
        private

        # Unzip the file, creating the destination directory if necessary.
        # A pattern can be provided to only extract required files.
        #
        # source      - path of the zip archive; passed through
        #               SafeFile.safepath_to_string.
        # destination - directory to extract into; passed through
        #               SafeFile.safepath_to_string and must respond to #join.
        # pattern     - only entries whose basename matches are extracted
        #               (defaults to //, which matches everything).
        #
        # Only file entries are extracted, each under its basename, so any
        # directory structure inside the archive is flattened. Files already
        # present at the destination are overwritten.
        #
        # SecurityError and ArgumentError propagate to the caller; any other
        # StandardError is printed with `puts` and swallowed.
        def unzip_file(source, destination, pattern = //)
          # SECURE TVB Mon Aug 13 14:41:05 BST 2012 : SafePath will raise exception if insecure
          # path is constructed
          # SafeFile.safepath_to_string will make sure that the arguments are from type SafePath

          # SECURE: BNS 2010-09-21 (for external access)
          fail 'Not allowed in external environment' if defined?(::Rails) && ::Rails.env.external?

          require 'zip'
          # TODO: Abort if destination directory already exists...
          FileUtils.mkdir_p(SafeFile.safepath_to_string(destination))

          ::Zip::File.open(SafeFile.safepath_to_string(source)) do |zipfile|
            zipfile.entries.each do |entry|
              # SECURE: TPG 2010-11-1: The path is stripped from the zipfile entry when extracted
              basename = ::File.basename(entry.name)
              next unless entry.file? && basename.match(pattern)

              # The block answers rubyzip's "destination exists" question with
              # "overwrite". Previously the resulting error was rescued at
              # method level, which silently abandoned every remaining entry
              # instead of overwriting as intended.
              zipfile.extract(entry, destination.join(basename)) { true }
            end
          end

        rescue ::Zip::ZipDestinationFileExistsError
          # Kept for backwards compatibility; not expected now that existing
          # files are overwritten.
        rescue SecurityError, ArgumentError
          raise
        rescue => ex
          puts ex
        end
      end
    end
  end
end
@@ -0,0 +1,220 @@
1
+ require 'ndr_support/string/cleaning'
2
+ require 'ndr_support/string/conversions'
3
+ require 'ndr_import/standard_mappings'
4
+ require 'base64'
5
+ require 'msworddoc-extractor'
6
+
7
+ # This module provides helper logic for mapping unified sources for import into the system
8
+ module NdrImport::Mapper
9
+ private
10
+
11
# Splits a fixed width +line+ into its columns.
# The 'unpack_pattern' of every column mapping is concatenated, in order,
# into a single String#unpack template; the unpacked array is returned.
def fixed_width_columns(line, line_mappings)
  patterns = line_mappings.map { |column_mapping| column_mapping['unpack_pattern'] }
  line.unpack(patterns.join)
end
17
+
18
# Applies the 'replace' option of +field_mapping+, which can be used before
# any other mapping option.
#
# 'replace' may be a single hash of pattern => replacement pairs or an array
# of such hashes; every pair is applied to +original_value+ IN PLACE (gsub!).
# Nothing happens when there is no 'replace' option or no value.
def replace_before_mapping(original_value, field_mapping)
  return unless field_mapping.include?('replace')
  return unless original_value

  replacement_sets = [field_mapping['replace']].flatten
  replacement_sets.each do |replacement_set|
    replacement_set.each { |pattern, replacement| original_value.gsub!(pattern, replacement) }
  end
end
28
+
29
# Returns the named standard mapping merged with +column_mapping+, or nil
# when no standard mapping called +mapping_name+ exists.
#
# Keys in +column_mapping+ take precedence over the standard mapping, except
# for 'mappings': the column's own field mappings are appended after the
# standard ones rather than replacing them.
#
# Neither the shared standard mapping nor +column_mapping+ is modified, so
# repeated calls give the same result and field mappings cannot leak from
# one column into another that uses the same standard mapping.
def standard_mapping(mapping_name, column_mapping)
  mapping = NdrImport::StandardMappings.mappings[mapping_name]
  return nil if mapping.nil?

  merged = mapping.merge(column_mapping)
  if column_mapping['mappings']
    # Array() copes with a standard mapping that defines no 'mappings'.
    merged['mappings'] = Array(mapping['mappings']) + column_mapping['mappings']
  end
  merged
end
39
+
40
# Takes an array of raw values (+line+) and their associated column mappings
# (+line_mappings+, matched by position) and returns an attribute hash.
#
# For each column not flagged 'do_not_capture':
# * a 'standard_mapping', if named, is merged in via standard_mapping;
# * the raw value is passed through each 'decode' step in turn;
# * if the receiver responds to :cast_raw_value, that hook may alter the raw
#   value (casting can vary between sources);
# * the raw value is stored under the lowercased 'rawtext_name' / 'column';
# * every field mapping is applied to a copy of the raw value.
#
# Mapped values are gathered per field and then collapsed: fields using
# 'join' have their ordered parts joined (blank parts dropped unless
# 'compact' is false); other fields take the non-blank value with the lowest
# priority key. The raw values are returned under the :rawtext key.
#
# Raises ArgumentError when +line+ has more columns than +line_mappings+.
def mapped_line(line, line_mappings)
  attributes = {}
  rawtext = {}
  validate_line_mappings(line_mappings)

  line.each_with_index do |raw_value, col|
    column_mapping = line_mappings[col]
    if column_mapping.nil?
      fail ArgumentError,
           "Line has too many columns (expected #{line_mappings.size} but got #{line.size})"
    end

    next if column_mapping['do_not_capture']

    standard_name = column_mapping['standard_mapping']
    column_mapping = standard_mapping(standard_name, column_mapping) if standard_name

    # Establish the rawtext column name we are to use for this column
    rawtext_column_name = (column_mapping['rawtext_name'] || column_mapping['column']).downcase

    # Replace raw_value with decoded raw_value, one decode step at a time
    raw_value = Array(column_mapping['decode']).inject(raw_value) do |decoded, encoding|
      decode_raw_value(decoded, encoding)
    end

    # raw value casting can vary between sources, so the receiver may apply it here
    if respond_to?(:cast_raw_value)
      raw_value = cast_raw_value(rawtext_column_name, raw_value, column_mapping)
    end

    # Store the raw column value
    rawtext[rawtext_column_name] = raw_value

    (column_mapping['mappings'] || []).each do |field_mapping|
      # work on a duplicate of the raw value, so the rawtext stays untouched
      original_value = raw_value ? raw_value.dup : nil

      replace_before_mapping(original_value, field_mapping)
      value = mapped_value(original_value, field_mapping)

      # Blank values are only recorded for joined fields.
      # Assumes join is specified in first joined field
      next if value.blank? && !field_mapping['join']

      # Currently assuming already validated YAML, s.t. no fields have the
      # same priorities
      details = (attributes[field_mapping['field']] ||= {})
      details[:priority] ||= {}

      if field_mapping['order']
        details[field_mapping['order']] = value
        details[:join] = field_mapping['join'] if field_mapping['join']
        details[:compact] = field_mapping['compact'] if field_mapping.include?('compact')
      elsif field_mapping['priority']
        details[:priority][field_mapping['priority']] = value
      else
        # No explicit priority: record under the default priority of 1
        details[:priority][1] = value
        details[:value] = value
      end
    end
  end

  # tidy up many to one field mappings
  # and one to many, for cross-populating
  attributes.each do |field, details|
    attributes[field] =
      if details.include?(:join)
        separator = details.delete(:join) || ','
        details.delete(:value)
        details.delete(:priority)
        compact = details.include?(:compact) ? details.delete(:compact) : true

        # What remains in details is keyed by 'order'
        parts = details.sort.map { |_part_order, part_value| part_value.blank? ? nil : part_value }
        parts = parts.compact if compact
        parts.join(separator)
      else
        details[:priority].delete_if { |_priority, candidate| candidate.blank? }
        details[:priority].sort.first[1]
      end
  end

  attributes[:rawtext] = rawtext
  attributes
end
142
+
143
# Applies a single field mapping to +original_value+ and returns the result.
# The first of these options present in +field_mapping+ decides what happens:
#
# 'format'    - blank values give nil; otherwise the value is converted with
#               to_date(format). An ArgumentError is re-raised with the
#               offending value appended to its message.
# 'clean'     - blank values give nil; otherwise value.clean(clean).
# 'map'       - looks the value up in the 'map' hash (nil if 'map' is nil).
# 'match'     - returns the stripped first capture group of the pattern, or
#               nil when the pattern does not match or captures nothing.
# 'daysafter' - integer-like values become a date that many days after the
#               'daysafter' date; anything else is returned unchanged.
# (none)      - blank values give nil; strings are stripped; other values
#               are returned as they are.
def mapped_value(original_value, field_mapping)
  if field_mapping.include?('format')
    begin
      original_value.blank? ? nil : original_value.to_date(field_mapping['format'])
    rescue ArgumentError => e
      e2 = ArgumentError.new("#{e} value #{original_value.inspect}")
      e2.set_backtrace(e.backtrace)
      raise e2
    end
  elsif field_mapping.include?('clean')
    original_value.blank? ? nil : original_value.clean(field_mapping['clean'])
  elsif field_mapping.include?('map')
    field_mapping['map'] ? field_mapping['map'][original_value] : nil
  elsif field_mapping.include?('match')
    # WARNING:TVB Thu Aug 9 17:09:25 BST 2012 field_mapping['match'] regexp
    # may need to be escaped
    matches = Regexp.new(field_mapping['match']).match(original_value)
    # The first group is nil when the pattern has no capture group, or when
    # an optional group took no part in the match; do not call strip on nil.
    captured = matches && matches[1]
    captured && captured.strip
  elsif field_mapping.include?('daysafter')
    if original_value.to_i.to_s == original_value.to_s
      original_value.to_i.days.since(field_mapping['daysafter'].to_time).to_date
    else
      original_value
    end
  elsif original_value.blank?
    nil
  else
    original_value.is_a?(String) ? original_value.strip : original_value
  end
end
169
+
170
# Validates +line_mappings+ before they are used. Returns true when valid.
#
# Raises a RuntimeError when:
# * a column names a 'standard_mapping' that does not exist; or
# * two field mappings for the same field use the same priority. A field
#   mapping without a 'priority' counts as priority 1 for later mappings,
#   but is itself never rejected.
#
# Every priority already used for a field is remembered, so a duplicate is
# detected even when other mappings for that field come in between
# (e.g. priorities 1, 2, 1).
def validate_line_mappings(line_mappings)
  seen_priorities = Hash.new { |seen, field| seen[field] = [] }

  line_mappings.each do |column_mapping|
    standard_name = column_mapping['standard_mapping']
    if standard_name && standard_mapping(standard_name, column_mapping).nil?
      fail "Standard mapping \"#{standard_name}\" does not exist"
    end

    (column_mapping['mappings'] || []).each do |field_mapping|
      field = field_mapping['field']
      explicit_priority = field_mapping['priority']

      if explicit_priority && seen_priorities[field].include?(explicit_priority)
        fail 'Cannot have duplicate priorities'
      end
      seen_priorities[field] << (explicit_priority || 1)
    end
  end
  true
end
192
+
193
# Decodes +raw_value+ according to +encoding+ and returns the result.
# E.g. adding decode to a column:
#
#   - column: base64
#     decode:
#     - :base64
#     - :word_doc
#
# would base64 decode a word document and then 'decode' the word document
# into plain text.
#
# Supported encodings are the symbols :base64 and :word_doc; anything else
# raises a RuntimeError.
def decode_raw_value(raw_value, encoding)
  if encoding.equal?(:base64)
    Base64.decode64(raw_value)
  elsif encoding.equal?(:word_doc)
    read_word_stream(StringIO.new(raw_value, 'r'))
  else
    fail "Cannot decode: #{encoding}"
  end
end
212
+
213
# Given an IO stream representing a .doc word document, returns the text of
# the document as a single string, extracted in the same way as
# NdrImport::Helpers::File::Word#read_word_file.
def read_word_stream(stream)
  contents = MSWordDoc::Extractor.load(stream).whole_contents
  # whole_contents adds "\n" to the end of the stream; drop that one newline
  contents.sub(/\n\z/, '')
end
220
+ end
@@ -0,0 +1,5 @@
1
module NdrImport
  # Raised if there is a problem with an import mapping.
  class MappingError < StandardError; end
end
@@ -0,0 +1,73 @@
1
+ # encoding: UTF-8
2
+
3
module NdrImport
  module NonTabular
    # Holds the mapping for a single non-tabular column, together with the
    # logic for finding the matching lines of source data and capturing
    # values from them.
    class ColumnMapping
      attr_accessor :name, :cell_mapping, :lines, :capture, :join

      # Builds the mapping from a +column_mapping+ hash. The name is taken
      # from 'rawtext_name', 'column' or 'standard_mapping' (first one set).
      # The 'non_tabular_cell' hash must be present and define both 'lines'
      # and 'capture', otherwise NdrImport::MappingError is raised.
      def initialize(column_mapping)
        @name = column_mapping['rawtext_name'] ||
                column_mapping['column'] ||
                column_mapping['standard_mapping']
        @cell_mapping = column_mapping['non_tabular_cell']

        validate_cell_mapping

        @lines, @join = @cell_mapping.values_at('lines', 'join')
      end

      # Returns the range of matching source data lines. A RegexpRange is
      # resolved against the given +text+; anything else is returned as is.
      def matching_lines(text)
        @lines.is_a?(RegexpRange) ? @lines.to_range(text) : @lines
      end

      # Captures the required part of +line+. Each 'capture' pattern is
      # applied in turn to the result of the previous one, keeping the first
      # captured group each time (nil when a pattern does not match). Only
      # the first group is ever used, in an attempt to preserve the rawtext
      # as much as possible.
      def capture_value(line)
        [@cell_mapping['capture']].flatten.inject(line.dup) do |current, pattern|
          matchdata = current.to_s.match(pattern)
          matchdata ? matchdata[1] : nil
        end
      end

      # Runs all of the cell mapping checks, raising NdrImport::MappingError
      # on the first one that fails.
      def validate_cell_mapping
        validate_presence_of_non_tabular_cell
        validate_presence_of_non_tabular_cell_lines
        validate_presence_of_non_tabular_cell_capture
      end

      # The column mapping must contain a 'non_tabular_cell' section.
      def validate_presence_of_non_tabular_cell
        unless @cell_mapping
          fail NdrImport::MappingError,
               I18n.t('mapping.errors.missing_non_tabular_cell', :name => @name)
        end
      end

      # The 'non_tabular_cell' section must define 'lines'.
      def validate_presence_of_non_tabular_cell_lines
        unless @cell_mapping['lines']
          fail NdrImport::MappingError,
               I18n.t('mapping.errors.missing_non_tabular_cell_lines', :name => @name)
        end
      end

      # The 'non_tabular_cell' section must define 'capture'.
      def validate_presence_of_non_tabular_cell_capture
        unless @cell_mapping['capture']
          fail NdrImport::MappingError,
               I18n.t('mapping.errors.missing_non_tabular_cell_capture', :name => @name)
        end
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
+ # encoding: UTF-8
2
+
3
module NdrImport
  module NonTabular
    # Stands in for a source line of text and can be matched against like a
    # string. It also carries additional information about how the line has
    # been used, e.g. whether it is within a record, and for which fields it
    # has been used to capture a value.
    class Line
      attr_accessor :absolute_line_number, :captured_fields, :captures_values,
                    :in_a_record, :record_line_number, :removed

      # +line+ is stored with trailing whitespace removed;
      # +absolute_line_number+ is stored as given.
      def initialize(line, absolute_line_number)
        @line                 = line.rstrip
        @absolute_line_number = absolute_line_number
        @captured_fields      = []
        @captures_values      = []
        @in_a_record          = false
        @removed              = false
      end

      # Pattern match against the text of the line.
      def =~(other)
        @line =~ other
      end

      # Delegates to String#match on the text of the line.
      def match(*args)
        @line.match(*args)
      end

      # The text of the line (trailing whitespace already removed).
      def to_s
        @line
      end

      # Records that this line was used to capture +field+. Blank (nil/false)
      # fields, and fields already recorded, are ignored.
      def captured_for(field)
        return unless field
        return if @captured_fields.include?(field)
        @captured_fields << field
      end

      # Records a [field, value] pair captured from this line.
      def matches_for(field, value)
        @captures_values.push([field, value])
      end
    end
  end
end