ndr_import 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +14 -0
  3. data/.rubocop.yml +27 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +22 -0
  6. data/CODE_OF_CONDUCT.md +13 -0
  7. data/Gemfile +4 -0
  8. data/Guardfile +16 -0
  9. data/LICENSE.txt +21 -0
  10. data/README.md +69 -0
  11. data/Rakefile +13 -0
  12. data/code_safety.yml +374 -0
  13. data/gemfiles/Gemfile.rails32 +5 -0
  14. data/gemfiles/Gemfile.rails32.lock +142 -0
  15. data/gemfiles/Gemfile.rails41 +5 -0
  16. data/gemfiles/Gemfile.rails41.lock +145 -0
  17. data/gemfiles/Gemfile.rails42 +5 -0
  18. data/gemfiles/Gemfile.rails42.lock +145 -0
  19. data/lib/ndr_import.rb +13 -0
  20. data/lib/ndr_import/csv_library.rb +40 -0
  21. data/lib/ndr_import/file/all.rb +8 -0
  22. data/lib/ndr_import/file/base.rb +76 -0
  23. data/lib/ndr_import/file/delimited.rb +86 -0
  24. data/lib/ndr_import/file/excel.rb +131 -0
  25. data/lib/ndr_import/file/pdf.rb +38 -0
  26. data/lib/ndr_import/file/registry.rb +50 -0
  27. data/lib/ndr_import/file/text.rb +52 -0
  28. data/lib/ndr_import/file/word.rb +30 -0
  29. data/lib/ndr_import/file/zip.rb +67 -0
  30. data/lib/ndr_import/helpers/file/delimited.rb +105 -0
  31. data/lib/ndr_import/helpers/file/excel.rb +181 -0
  32. data/lib/ndr_import/helpers/file/pdf.rb +29 -0
  33. data/lib/ndr_import/helpers/file/word.rb +27 -0
  34. data/lib/ndr_import/helpers/file/xml.rb +45 -0
  35. data/lib/ndr_import/helpers/file/zip.rb +44 -0
  36. data/lib/ndr_import/mapper.rb +220 -0
  37. data/lib/ndr_import/mapping_error.rb +5 -0
  38. data/lib/ndr_import/non_tabular/column_mapping.rb +73 -0
  39. data/lib/ndr_import/non_tabular/line.rb +46 -0
  40. data/lib/ndr_import/non_tabular/mapping.rb +35 -0
  41. data/lib/ndr_import/non_tabular/record.rb +99 -0
  42. data/lib/ndr_import/non_tabular/table.rb +193 -0
  43. data/lib/ndr_import/non_tabular_file_helper.rb +160 -0
  44. data/lib/ndr_import/standard_mappings.rb +23 -0
  45. data/lib/ndr_import/table.rb +179 -0
  46. data/lib/ndr_import/version.rb +4 -0
  47. data/ndr_import.gemspec +44 -0
  48. data/test/file/base_test.rb +54 -0
  49. data/test/file/delimited_test.rb +143 -0
  50. data/test/file/excel_test.rb +85 -0
  51. data/test/file/pdf_test.rb +35 -0
  52. data/test/file/registry_test.rb +60 -0
  53. data/test/file/text_test.rb +92 -0
  54. data/test/file/word_test.rb +35 -0
  55. data/test/file/zip_test.rb +47 -0
  56. data/test/helpers/file/delimited_test.rb +113 -0
  57. data/test/helpers/file/excel_test.rb +97 -0
  58. data/test/helpers/file/pdf_test.rb +26 -0
  59. data/test/helpers/file/word_test.rb +26 -0
  60. data/test/helpers/file/xml_test.rb +131 -0
  61. data/test/helpers/file/zip_test.rb +75 -0
  62. data/test/mapper_test.rb +551 -0
  63. data/test/non_tabular/mapping_test.rb +36 -0
  64. data/test/non_tabular/table_test.rb +510 -0
  65. data/test/non_tabular_file_helper_test.rb +501 -0
  66. data/test/readme_test.rb +53 -0
  67. data/test/resources/bomd.csv +3 -0
  68. data/test/resources/broken.csv +3 -0
  69. data/test/resources/filesystem_paths.yml +26 -0
  70. data/test/resources/flat_file.pdf +0 -0
  71. data/test/resources/flat_file.txt +27 -0
  72. data/test/resources/flat_file.yml +20 -0
  73. data/test/resources/hello_utf16be.txt +0 -0
  74. data/test/resources/hello_utf16le.txt +0 -0
  75. data/test/resources/hello_utf8.txt +2 -0
  76. data/test/resources/hello_windows.txt +2 -0
  77. data/test/resources/hello_world.doc +0 -0
  78. data/test/resources/hello_world.pdf +0 -0
  79. data/test/resources/hello_world.txt +2 -0
  80. data/test/resources/high_ascii_delimited.txt +2 -0
  81. data/test/resources/malformed.xml +6 -0
  82. data/test/resources/normal.csv +3 -0
  83. data/test/resources/normal.csv.zip +0 -0
  84. data/test/resources/normal_pipe.csv +3 -0
  85. data/test/resources/normal_thorn.csv +3 -0
  86. data/test/resources/not_a_pdf.pdf +0 -0
  87. data/test/resources/not_a_word_file.doc +0 -0
  88. data/test/resources/sample_xls.xls +0 -0
  89. data/test/resources/sample_xlsx.xlsx +0 -0
  90. data/test/resources/standard_mappings.yml +39 -0
  91. data/test/resources/txt_file_xls_extension.xls +1 -0
  92. data/test/resources/txt_file_xlsx_extension.xlsx +1 -0
  93. data/test/resources/utf-16be_xml.xml +0 -0
  94. data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
  95. data/test/resources/utf-16le_xml.xml +0 -0
  96. data/test/resources/utf-8_xml.xml +9 -0
  97. data/test/resources/windows-1252_xml.xml +9 -0
  98. data/test/resources/windows.csv +5 -0
  99. data/test/resources/xlsx_file_xls_extension.xls +0 -0
  100. data/test/standard_mappings_test.rb +22 -0
  101. data/test/table_test.rb +288 -0
  102. data/test/test_helper.rb +13 -0
  103. metadata +443 -0
@@ -0,0 +1,52 @@
1
+ require 'ndr_support/safe_file'
2
+ require 'ndr_support/utf8_encoding'
3
+ require_relative 'registry'
4
+
5
+ module NdrImport
6
+ # This is one of a collection of file handlers that deal with individual formats of data.
7
+ # They can be instantiated directly or via the factory method Registry.tables
8
+ module File
9
+ # This class is a text file handler that returns a single table.
10
+ class Text < Base
11
+ include UTF8Encoding
12
+
13
+ private
14
+
15
# Streams the file at @filename line by line, yielding each line after it
# has been passed through ensure_utf8! and had its line ending chomped.
#
# Returns an Enumerator when called without a block.
# Raises RuntimeError (naming the file and the underlying exception) if
# anything fails while reading, including errors raised by the given block.
def rows(&block)
  return enum_for(:rows) unless block

  # Encoding:
  # Lines are yielded as the file is streamed (rather than slurped in
  # advance), so the working read mode has to be chosen before we start.
  path = SafeFile.safepath_to_string(@filename)
  mode = read_mode_for(path)

  # SECURE: TG 13 Oct 2015 SafeFile.safepath_to_string ensures that the path is SafePath.
  # The block form of File.open closes the handle even when reading (or the
  # caller's block) raises; a bare File.new would stay open until GC.
  ::File.open(path, mode) do |file|
    file.each { |line| block.call ensure_utf8!(line).chomp }
  end
rescue => e
  raise "Failed to read #{SafeFile.basename(@filename)} as text [#{e.class}: #{e.message}]"
end
31
+
32
+ # TODO: In Ruby 2.0+, a mode of "rb:bom|utf-16:utf-8" seemed to fix all cases,
33
+ # but this doesn't work on Ruby 1.9.3, which we are currently still supporting.
34
+ # Therefore, we have to test multiple modes in advance, hence #read_mode_for.
35
# Returns the first read mode that lets the whole file at +trusted_path+ be
# iterated without an Encoding::InvalidByteSequenceError.
#
# Raises RuntimeError if none of the candidate modes works.
def read_mode_for(trusted_path)
  # These are the read modes we will try, in order:
  modes = ['rb:utf-16:utf-8', 'r:utf-8']

  working_mode = modes.find do |mode|
    begin
      # Read to the end purely to see whether this mode raises; the block
      # form of File.open guarantees each probe's handle is closed again.
      ::File.open(trusted_path, mode) { |file| file.each { |_line| } }
      true
    rescue Encoding::InvalidByteSequenceError
      false # That one didn't work...
    end
  end

  working_mode || fail('Unable to determine working stream encoding!')
end
48
+ end
49
+
50
+ Registry.register(Text, 'txt') # TODO: Add 'nontabular'?
51
+ end
52
+ end
@@ -0,0 +1,30 @@
1
+ require 'msworddoc-extractor'
2
+ require 'ndr_support/safe_file'
3
+ require_relative 'registry'
4
+
5
+ module NdrImport
6
+ # This is one of a collection of file handlers that deal with individual formats of data.
7
+ # They can be instantiated directly or via the factory method Registry.tables
8
+ module File
9
+ # This class is a Word document file handler that returns a single table.
10
+ # Currently only works on .doc (97-2003), not .docx
11
+ class Word < Base
12
+ private
13
+
14
# Yields each newline-separated line of the Word document at @filename.
#
# Returns an Enumerator when no block is supplied. Any StandardError is
# re-raised as a RuntimeError prefixed with the file's basename.
def rows(&block)
  return enum_for(:rows) unless block

  word_path = SafeFile.safepath_to_string(@filename)
  contents = MSWordDoc::Extractor.load(word_path).whole_contents

  contents.split("\n").each { |text| block.call(text) }
rescue => error
  raise("#{SafeFile.basename(@filename)} [#{error.class}: #{error.message}]")
end
26
+ end
27
+
28
+ Registry.register(Word, 'doc') # TODO: Add 'word'?
29
+ end
30
+ end
@@ -0,0 +1,67 @@
1
+ require 'zip'
2
+ require 'ndr_support/safe_file'
3
+ require_relative 'registry'
4
+
5
+ module NdrImport
6
+ # This is one of a collection of file handlers that deal with individual formats of data.
7
+ # They can be instantiated directly or via the factory method Registry.tables
8
+ module File
9
+ # This class is a zip file handler that returns tables from the extracted files.
10
+ class Zip < Base
11
# Builds a zip handler. After the superclass initialiser has run, reads two
# string-keyed options:
# * 'pattern'    - matched against entry basenames; defaults to // (match all)
# * 'unzip_path' - directory under which entries are extracted; validated here
def initialize(filename, format, options = {})
  super

  configured_pattern = options['pattern']
  @pattern = configured_pattern ? configured_pattern : //
  @unzip_path = options['unzip_path']

  validate_unzip_path_is_safe!
end
18
+
19
# Extracts the matching entries of the zip at @filename into a fresh,
# time-stamped directory under @unzip_path and yields the file(s) produced
# for each extracted entry.
#
# Returns an Enumerator when no block is given.
# Raises RuntimeError when Rails is loaded and reports an external environment.
def files(&block)
  in_external_env = defined?(::Rails) && ::Rails.env.external?
  fail 'Not allowed in external environment' if in_external_env

  return enum_for(:files) unless block

  # Directory name is the current time as hours/minutes/seconds/milliseconds.
  timestamp = Time.current.strftime('%H%M%S%L')
  destination = @unzip_path.join(timestamp)
  FileUtils.mkdir_p(SafeFile.safepath_to_string(destination))

  zip_path = SafeFile.safepath_to_string(@filename)
  ::Zip::File.open(zip_path) { |zipfile| unzip_entries(zipfile, destination, &block) }
end
31
+
32
+ # Zip files produce files, never tables.
33
# Zip handlers only ever produce files, so asking one for tables is a
# programming error and always raises RuntimeError.
def tables
  raise 'Zip#tables should never be called'
end
36
+
37
+ private
38
+
39
+ # Unzip the zip file entry and enumerate over it
40
# Extracts every file entry whose basename matches @pattern into
# +destination+, then enumerates the extracted file via unzipped_files,
# passing the caller's block along.
def unzip_entries(zipfile, destination, &block)
  zipfile.entries.each do |zip_entry|
    # SECURE: TPG 2010-11-1: The path is stripped from the zipfile entry when extracted
    flat_name = ::File.basename(zip_entry.name)
    wanted = zip_entry.file? && flat_name.match(@pattern)
    next unless wanted

    target = destination.join(flat_name)
    zipfile.extract(zip_entry, target)

    unzipped_files(target, &block)
  end
end
52
+
53
+ # Enumerate over an unzipped file like any other
54
# Hands an extracted file to Registry.files (with this handler's @options)
# and passes every filename it produces to the caller's block.
def unzipped_files(unzipped_filename, &block)
  produced = Registry.files(unzipped_filename, @options)
  produced.each do |produced_file|
    block.call(produced_file)
  end
end
59
+
60
# Runs @unzip_path through SafeFile.safepath_to_string, so whatever that
# call raises for an unacceptable path surfaces at construction time.
def validate_unzip_path_is_safe!
  candidate = @unzip_path
  SafeFile.safepath_to_string(candidate)
end
63
+ end
64
+
65
+ Registry.register(Zip, 'zip')
66
+ end
67
+ end
@@ -0,0 +1,105 @@
1
+ require 'ndr_support/safe_file'
2
+ require 'ndr_import/csv_library'
3
+
4
+ module NdrImport
5
+ module Helpers
6
+ module File
7
+ # This mixin adds delimited file functionality to unified importers.
8
+ module Delimited
9
+ # Read a plain text CSV file, return an array of the content
10
# Reads a plain-text CSV file and returns its content as an array.
# Delegates to read_delimited_file with the default column separator.
def read_csv_file(path)
  # If a CSV file generated on Mac OS fails with "CSV::IllegalFormatError", see:
  # http://stackoverflow.com/questions/1549139/ruby-cannot-parse-excel-file-exported-as-csv-in-os-x
  content = read_delimited_file(path)
  content
end
17
+
18
+ # Slurp the entire file into an array of lines.
19
# Slurps the whole delimited file into memory: returns everything that
# delimited_rows produces for +path+ / +col_sep+ as a single array.
def read_delimited_file(path, col_sep = nil)
  row_source = delimited_rows(path, col_sep)
  row_source.to_a
end
22
+
23
+ # Iterate through the file table by table, yielding each one in turn.
24
# Enumerates the file table by table. A delimited file holds one unnamed
# table, so this yields exactly once: a nil table name plus the result of
# delimited_rows. Returns an Enumerator when no block is given.
def delimited_tables(path, col_sep = nil)
  if block_given?
    yield nil, delimited_rows(path, col_sep)
  else
    enum_for(:delimited_tables, path, col_sep)
  end
end
29
+
30
+ # Deprecated method
31
# Legacy alias for delimited_tables: prints a warning, then delegates with
# the same arguments and block.
def each_delimited_table(path, col_sep = nil, &block)
  notice = '[warning] each_delimited_table will be deprecated, please use delimited_tables instead.'
  Kernel.warn(notice)
  delimited_tables(path, col_sep, &block)
end
36
+
37
+ # Iterate through the file line by line, yielding each one in turn.
38
# Streams the delimited file at +path+ row by row, yielding each row as an
# array of strings (every field is converted with to_s).
#
# NOTE(review): rows with five or fewer fields are silently skipped; the
# reason for that threshold is not visible here - confirm against callers.
#
# Returns an Enumerator when no block is given.
def delimited_rows(path, col_sep = nil)
  return enum_for(:delimited_rows, path, col_sep) unless block_given?

  file_path = SafeFile.safepath_to_string(path)
  csv_options = determine_encodings!(file_path, col_sep)

  # determine_encodings! has already read the file end to end with these
  # options, so any failure from here on is unexpected and should propagate.
  CSVLibrary.foreach(file_path, csv_options) do |fields|
    next if fields.length <= 5

    yield fields.map(&:to_s)
  end
end
50
+
51
+ # Deprecated method
52
# Legacy alias for delimited_rows: prints a warning, then delegates with
# the same arguments and block.
def each_delimited_row(path, col_sep = nil, &block)
  notice = '[warning] each_delimited_row will be deprecated, please use delimited_rows instead.'
  Kernel.warn(notice)
  delimited_rows(path, col_sep, &block)
end
57
+
58
+ private
59
+
60
+ # Derive the source encoding by trying all supported encodings.
61
+ # Returns first set of working options, or raises if none could be found.
62
# Derives the source encoding by trying each supported encoding in turn.
#
# safe_path - path string handed straight to CSVLibrary.foreach
# col_sep   - column separator, or nil for a comma
#
# Returns the first options hash (:col_sep and :encoding) with which the
# whole file could be iterated.
# Raises CSVLibrary::MalformedCSVError (with row number and file name) if
# the file is malformed, re-raises any ArgumentError that is not an
# "invalid byte sequence", and raises RuntimeError if no encoding works.
def determine_encodings!(safe_path, col_sep = nil)
  # delimiter encoding => # FasterCSV encoding string
  supported_encodings = {
    'UTF-8'        => 'bom|utf-8',
    'Windows-1252' => 'windows-1252:utf-8'
  }

  successful_options = nil
  supported_encodings.each do |delimiter_encoding, csv_encoding|
    begin
      options = {
        # force_encoding re-tags its receiver in place, so work on a copy:
        # otherwise the caller's separator would be modified (and a frozen
        # separator would raise).
        :col_sep  => (col_sep || ',').dup.force_encoding(delimiter_encoding),
        :encoding => csv_encoding
      }

      row_num = 0
      # Iterate through the file; if we reach the end, this encoding worked:
      CSVLibrary.foreach(safe_path, options) { |_line| row_num += 1 }
    rescue ArgumentError => e
      next if e.message =~ /invalid byte sequence/ # This encoding didn't work
      raise(e)
    rescue CSVLibrary::MalformedCSVError => e
      description = (col_sep ? col_sep.inspect + ' delimited' : 'CSV')

      raise(CSVLibrary::MalformedCSVError, "Invalid #{description} format " \
        "on row #{row_num + 1} of #{::File.basename(safe_path)}. Original: #{e.message}")
    end

    # We got this far => encoding choice worked:
    successful_options = options
    break
  end

  # We tried them all, and none worked:
  unless successful_options
    fail "None of the encodings #{supported_encodings.values.inspect} were successful!"
  end

  successful_options
end
102
+ end
103
+ end
104
+ end
105
+ end
@@ -0,0 +1,181 @@
1
+ require 'ndr_support/safe_file'
2
+
3
+ module NdrImport
4
+ module Helpers
5
+ module File
6
+ # This mixin adds excel spreadsheet functionality to unified importers.
7
+ # It provides a file reader method and methods to cast raw values
8
+ # appropriately. These methods can be overridden or aliased as required.
9
+ #
10
+ module Excel
11
+ require 'roo'
12
+ require 'roo-xls'
13
+ require 'ole/storage'
14
+
15
+ protected
16
+
17
# Converts a raw spreadsheet cell value into the string form used by the
# importer:
# * nil is returned unchanged
# * Date / DateTime / Time values go through cast_excel_datetime_as_date
# * whole-number Floats become integer strings ("3.0" -> "3")
# * other Floats keep their Float string form
# * anything else is converted with to_s and stripped
def cast_excel_value(raw_value)
  return raw_value if raw_value.nil?

  case raw_value
  when Date, Time # DateTime is a Date subclass
    cast_excel_datetime_as_date(raw_value)
  when Float
    # NaN and +/-Infinity cannot be converted with to_i (FloatDomainError),
    # so only finite values are checked for being whole numbers.
    if raw_value.finite? && raw_value == raw_value.to_i
      raw_value.to_i.to_s
    else
      raw_value.to_s
    end
  else
    raw_value.to_s.strip
  end
end
33
+
34
# Formats a date/time cell by calling to_s with the :db format on it.
# Kept as a separate method so it can be overridden or aliased.
def cast_excel_datetime_as_date(raw_value)
  format_name = :db
  raw_value.to_s(format_name)
end
37
+
38
+ # Iterate through the file table by table, yielding each one in turn.
39
# Enumerates the workbook at +path+ sheet by sheet, yielding each sheet's
# name together with the result of excel_rows for that sheet.
# Returns an Enumerator when no block is given.
def excel_tables(path)
  return enum_for(:excel_tables, path) unless block_given?

  book = load_workbook(path)
  book.each_with_pagename do |sheet_name, worksheet|
    yield sheet_name, excel_rows(book, worksheet)
  end
end
47
+
48
+ # Deprecated method
49
# Legacy alias for excel_tables: prints a warning, then delegates with the
# same path and block.
def each_excel_table(path, &block)
  notice = '[warning] each_excel_table will be deprecated, please use excel_tables instead.'
  Kernel.warn(notice)
  excel_tables(path, &block)
end
54
+
55
+ private
56
+
57
# Reads one worksheet of the workbook at +path+ into a nested array.
#
# selected_sheet - name of the sheet to read; when nil, or when the workbook
#                  has no sheet of that name, the first sheet is used.
def read_excel_file(path, selected_sheet = nil)
  # SECURE: TVB Mon Aug 13 15:30:32 BST 2012 SafeFile.safepath_to_string makes sure that
  # the path is SafePath.
  book = load_workbook(path)

  # Use the requested worksheet only if it was given and actually exists.
  use_requested = !selected_sheet.nil? && book.sheets.include?(selected_sheet.to_s)
  book.default_sheet = use_requested ? selected_sheet.to_s : book.sheets.first

  # The workbook itself stands in for the sheet: it reads from default_sheet.
  excel_rows(book, book).to_a
end
75
+
76
+ # Iterate through the sheet line by line, yielding each one in turn.
77
# Enumerates +sheet+ row by row, passing each row to the block.
# Returns an Enumerator when no block is given.
#
# Both workbook types are currently read through xls_rows; the branch is
# kept so that xlsx files can switch to xlsx_rows later (see FIXME).
def excel_rows(workbook, sheet, &block)
  return enum_for(:excel_rows, workbook, sheet) unless block

  case workbook
  when Roo::Excelx
    # FIXME: xlsx_rows(sheet, &block) should produce the same output as xls_rows
    xls_rows(sheet, &block)
  else
    xls_rows(sheet, &block)
  end
end
87
+
88
+ # Deprecated method
89
# Legacy alias for excel_rows: prints a warning, then delegates with the
# same arguments and block.
def each_excel_row(workbook, sheet, &block)
  notice = '[warning] each_excel_row will be deprecated, please use excel_rows instead.'
  Kernel.warn(notice)
  excel_rows(workbook, sheet, &block)
end
94
+
95
+ # Iterate through an xls sheet line by line, yielding each one in turn.
96
# Iterates through a sheet row by row, from its first to its last populated
# row, yielding each row as an array of cast cell values covering the
# sheet's first to last populated column.
#
# Returns an Enumerator when no block is given. Yields nothing for a sheet
# that reports no first row (i.e. an empty sheet).
def xls_rows(sheet)
  return enum_for(:xls_rows, sheet) unless block_given?

  first_row = sheet.first_row
  # An empty sheet has nil bounds; without this guard nil.upto would raise.
  return if first_row.nil?

  # Column bounds do not change while iterating, so look them up once.
  columns = (sheet.first_column..sheet.last_column)

  first_row.upto(sheet.last_row) do |row|
    yield columns.map { |col| cast_excel_value(sheet.cell(row, col)) }
  end
end
107
+
108
+ # Deprecated method
109
# Legacy alias for xls_rows: prints a warning, then delegates with the same
# sheet and block.
def each_xls_row(sheet, &block)
  notice = '[warning] each_xls_row will be deprecated, please use xls_rows instead.'
  Kernel.warn(notice)
  xls_rows(sheet, &block)
end
114
+
115
+ # Iterate through an xlsx sheet line by line, yielding each one in turn.
116
+ # This method uses streaming https://github.com/roo-rb/roo#excel-xlsx-and-xlsm-support
117
# Iterates through an xlsx sheet row by row via the sheet's
# each_row_streaming (called with :pad_cells => true), yielding each row as
# an array of cast cell values. Returns an Enumerator when no block is given.
# Streaming is described at https://github.com/roo-rb/roo#excel-xlsx-and-xlsm-support
def xlsx_rows(sheet)
  return enum_for(:xlsx_rows, sheet) unless block_given?

  sheet.each_row_streaming(:pad_cells => true) do |cells|
    casted = cells.map { |streamed_cell| cast_excel_value(streamed_cell.value) }
    yield casted
  end
end
124
+
125
+ # Deprecated method
126
# Legacy alias for xlsx_rows: prints a warning, then delegates with the
# same sheet and block.
def each_xlsx_row(sheet, &block)
  notice = '[warning] each_xlsx_row will be deprecated, please use xlsx_rows instead.'
  Kernel.warn(notice)
  xlsx_rows(sheet, &block)
end
131
+
132
# Returns the sheet names of the workbook at +path+ (whatever the loaded
# workbook's #sheets reports).
def get_excel_sheets_name(path)
  load_workbook(path).sheets
end
136
+
137
# Opens the spreadsheet at +path+ with the Roo class matching its
# (case-insensitive) extension: Roo::Excel for .xls, Roo::Excelx for .xlsx.
#
# If Ole reports a format error for a path ending in ".xls", the file is
# copied alongside itself as "<name>_amend.xlsx" and loaded from that copy.
#
# Raises RuntimeError for any other extension or any other failure; the
# message of the general rescue names the path and the original message.
def load_workbook(path)
  extension = SafeFile.extname(path).downcase

  if extension == '.xls'
    Roo::Excel.new(SafeFile.safepath_to_string(path))
  elsif extension == '.xlsx'
    Roo::Excelx.new(SafeFile.safepath_to_string(path))
  else
    fail "Received file path with unexpected extension #{SafeFile.extname(path)}"
  end
rescue Ole::Storage::FormatError => e
  # TODO: Do we need to remove the new_file after using it?

  # Try to load the .xls file as an .xlsx file, useful for sources like USOM.
  # roo checks file extensions in file_type_check (GenericSpreadsheet),
  # so we create a duplicate file with an xlsx extension.
  raise e.message unless /(.*)\.xls$/.match(path)

  amended_name = SafeFile.basename(path).gsub(/(.*)\.xls$/, '\1_amend.xlsx')
  amended_path = SafeFile.dirname(path).join(amended_name)
  copy_file(path, amended_path)

  load_workbook(amended_path)
rescue => e
  raise "Unable to read the file '#{path}'; #{e.message}"
end
164
+
165
+ # Note that this method can produce insecure calls. All callers must protect
166
+ # their arguments.
167
+ # Arguments:
168
+ # * source - SafeFile
169
+ # * dest - SafeFile
170
+ #
171
# Copies +source+ to +dest+, creating dest's directory first if needed.
#
# Note that this method can produce insecure calls. All callers must protect
# their arguments.
# Arguments:
#   * source - SafeFile
#   * dest - SafeFile
def copy_file(source, dest)
  # SECURE: TVB Mon Aug 13 13:53:02 BST 2012 : Secure SafePath will do the security checks
  # before it is converted to string.
  # SafeFile will make sure that the arguments are actually SafePath
  target_dir = SafeFile.safepath_to_string(SafeFile.dirname(dest))
  FileUtils.mkdir_p(target_dir)

  from = SafeFile.safepath_to_string(source)
  to = SafeFile.safepath_to_string(dest)
  FileUtils.cp(from, to)
end
178
+ end
179
+ end
180
+ end
181
+ end
@@ -0,0 +1,29 @@
1
+ require 'ndr_support/safe_file'
2
+
3
+ module NdrImport
4
+ module Helpers
5
+ module File
6
+ # This mixin adds PDF functionality to unified importers. It provides a file reader method.
7
+ module Pdf
8
+ private
9
+
10
# Reads the PDF at +path+ and returns the text of all its pages as one flat
# array of lines (each page's text split on newlines).
#
# Raises RuntimeError naming the page being processed (1-based), the file's
# basename and the underlying exception if reading fails.
def read_pdf_file(path)
  require 'pdf-reader' # loaded lazily, only when a PDF is actually read

  collected = []
  pages_done = 0

  begin
    pdf = PDF::Reader.new(SafeFile.safepath_to_string(path))

    pdf.pages.each do |page|
      collected.concat(page.text.split("\n"))
      pages_done += 1
    end
  rescue => error
    raise("Invalid format on page #{pages_done + 1} of #{SafeFile.basename(path)} [#{error.class}: #{error.message}]")
  end

  collected
end
26
+ end
27
+ end
28
+ end
29
+ end