ndr_import 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (103) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +14 -0
  3. data/.rubocop.yml +27 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +22 -0
  6. data/CODE_OF_CONDUCT.md +13 -0
  7. data/Gemfile +4 -0
  8. data/Guardfile +16 -0
  9. data/LICENSE.txt +21 -0
  10. data/README.md +69 -0
  11. data/Rakefile +13 -0
  12. data/code_safety.yml +374 -0
  13. data/gemfiles/Gemfile.rails32 +5 -0
  14. data/gemfiles/Gemfile.rails32.lock +142 -0
  15. data/gemfiles/Gemfile.rails41 +5 -0
  16. data/gemfiles/Gemfile.rails41.lock +145 -0
  17. data/gemfiles/Gemfile.rails42 +5 -0
  18. data/gemfiles/Gemfile.rails42.lock +145 -0
  19. data/lib/ndr_import.rb +13 -0
  20. data/lib/ndr_import/csv_library.rb +40 -0
  21. data/lib/ndr_import/file/all.rb +8 -0
  22. data/lib/ndr_import/file/base.rb +76 -0
  23. data/lib/ndr_import/file/delimited.rb +86 -0
  24. data/lib/ndr_import/file/excel.rb +131 -0
  25. data/lib/ndr_import/file/pdf.rb +38 -0
  26. data/lib/ndr_import/file/registry.rb +50 -0
  27. data/lib/ndr_import/file/text.rb +52 -0
  28. data/lib/ndr_import/file/word.rb +30 -0
  29. data/lib/ndr_import/file/zip.rb +67 -0
  30. data/lib/ndr_import/helpers/file/delimited.rb +105 -0
  31. data/lib/ndr_import/helpers/file/excel.rb +181 -0
  32. data/lib/ndr_import/helpers/file/pdf.rb +29 -0
  33. data/lib/ndr_import/helpers/file/word.rb +27 -0
  34. data/lib/ndr_import/helpers/file/xml.rb +45 -0
  35. data/lib/ndr_import/helpers/file/zip.rb +44 -0
  36. data/lib/ndr_import/mapper.rb +220 -0
  37. data/lib/ndr_import/mapping_error.rb +5 -0
  38. data/lib/ndr_import/non_tabular/column_mapping.rb +73 -0
  39. data/lib/ndr_import/non_tabular/line.rb +46 -0
  40. data/lib/ndr_import/non_tabular/mapping.rb +35 -0
  41. data/lib/ndr_import/non_tabular/record.rb +99 -0
  42. data/lib/ndr_import/non_tabular/table.rb +193 -0
  43. data/lib/ndr_import/non_tabular_file_helper.rb +160 -0
  44. data/lib/ndr_import/standard_mappings.rb +23 -0
  45. data/lib/ndr_import/table.rb +179 -0
  46. data/lib/ndr_import/version.rb +4 -0
  47. data/ndr_import.gemspec +44 -0
  48. data/test/file/base_test.rb +54 -0
  49. data/test/file/delimited_test.rb +143 -0
  50. data/test/file/excel_test.rb +85 -0
  51. data/test/file/pdf_test.rb +35 -0
  52. data/test/file/registry_test.rb +60 -0
  53. data/test/file/text_test.rb +92 -0
  54. data/test/file/word_test.rb +35 -0
  55. data/test/file/zip_test.rb +47 -0
  56. data/test/helpers/file/delimited_test.rb +113 -0
  57. data/test/helpers/file/excel_test.rb +97 -0
  58. data/test/helpers/file/pdf_test.rb +26 -0
  59. data/test/helpers/file/word_test.rb +26 -0
  60. data/test/helpers/file/xml_test.rb +131 -0
  61. data/test/helpers/file/zip_test.rb +75 -0
  62. data/test/mapper_test.rb +551 -0
  63. data/test/non_tabular/mapping_test.rb +36 -0
  64. data/test/non_tabular/table_test.rb +510 -0
  65. data/test/non_tabular_file_helper_test.rb +501 -0
  66. data/test/readme_test.rb +53 -0
  67. data/test/resources/bomd.csv +3 -0
  68. data/test/resources/broken.csv +3 -0
  69. data/test/resources/filesystem_paths.yml +26 -0
  70. data/test/resources/flat_file.pdf +0 -0
  71. data/test/resources/flat_file.txt +27 -0
  72. data/test/resources/flat_file.yml +20 -0
  73. data/test/resources/hello_utf16be.txt +0 -0
  74. data/test/resources/hello_utf16le.txt +0 -0
  75. data/test/resources/hello_utf8.txt +2 -0
  76. data/test/resources/hello_windows.txt +2 -0
  77. data/test/resources/hello_world.doc +0 -0
  78. data/test/resources/hello_world.pdf +0 -0
  79. data/test/resources/hello_world.txt +2 -0
  80. data/test/resources/high_ascii_delimited.txt +2 -0
  81. data/test/resources/malformed.xml +6 -0
  82. data/test/resources/normal.csv +3 -0
  83. data/test/resources/normal.csv.zip +0 -0
  84. data/test/resources/normal_pipe.csv +3 -0
  85. data/test/resources/normal_thorn.csv +3 -0
  86. data/test/resources/not_a_pdf.pdf +0 -0
  87. data/test/resources/not_a_word_file.doc +0 -0
  88. data/test/resources/sample_xls.xls +0 -0
  89. data/test/resources/sample_xlsx.xlsx +0 -0
  90. data/test/resources/standard_mappings.yml +39 -0
  91. data/test/resources/txt_file_xls_extension.xls +1 -0
  92. data/test/resources/txt_file_xlsx_extension.xlsx +1 -0
  93. data/test/resources/utf-16be_xml.xml +0 -0
  94. data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
  95. data/test/resources/utf-16le_xml.xml +0 -0
  96. data/test/resources/utf-8_xml.xml +9 -0
  97. data/test/resources/windows-1252_xml.xml +9 -0
  98. data/test/resources/windows.csv +5 -0
  99. data/test/resources/xlsx_file_xls_extension.xls +0 -0
  100. data/test/standard_mappings_test.rb +22 -0
  101. data/test/table_test.rb +288 -0
  102. data/test/test_helper.rb +13 -0
  103. metadata +443 -0
@@ -0,0 +1,52 @@
1
+ require 'ndr_support/safe_file'
2
+ require 'ndr_support/utf8_encoding'
3
+ require_relative 'registry'
4
+
5
+ module NdrImport
6
+ # This is one of a collection of file handlers that deal with individual formats of data.
7
+ # They can be instantiated directly or via the factory method Registry.tables
8
+ module File
9
+ # This class is a text file handler that returns a single table.
10
+ class Text < Base
11
+ include UTF8Encoding
12
+
13
+ private
14
+
15
+ def rows(&block)
16
+ return enum_for(:rows) unless block
17
+
18
+ # Encoding:
19
+ # As we're going to be yielding the lines of the file as it is streamed
20
+ # (rather than slurped in advance), we need to know which encoding / mode
21
+ # is going to work in advance.
22
+ #
23
+ path = SafeFile.safepath_to_string(@filename)
24
+ mode = read_mode_for(path)
25
+
26
+ # SECURE: TG 13 Oct 2015 SafeFile.safepath_to_string ensures that the path is SafePath.
27
+ ::File.new(path, mode).each { |line| block.call ensure_utf8!(line).chomp }
28
+ rescue => e
29
+ raise "Failed to read #{SafeFile.basename(@filename)} as text [#{e.class}: #{e.message}]"
30
+ end
31
+
32
+ # TODO: In Ruby 2.0+, a mode of "rb:bom|utf-16:utf-8" seemed to fix all cases,
33
+ # but this doesn't work on Ruby 1.9.3, which we are currently still supporting.
34
+ # Therefore, we have to test multiple modes in advance, hence #read_mode_for.
35
+ def read_mode_for(trusted_path)
36
+ # These are the read modes we will try, in order:
37
+ modes = ['rb:utf-16:utf-8', 'r:utf-8']
38
+
39
+ begin
40
+ ::File.new(trusted_path, modes.first).each { |_line| }
41
+ rescue Encoding::InvalidByteSequenceError
42
+ modes.shift # That one didn't work...
43
+ retry if modes.any?
44
+ end
45
+
46
+ modes.first || fail('Unable to determine working stream encoding!')
47
+ end
48
+ end
49
+
50
+ Registry.register(Text, 'txt') # TODO: Add 'nontabular'?
51
+ end
52
+ end
@@ -0,0 +1,30 @@
1
+ require 'msworddoc-extractor'
2
+ require 'ndr_support/safe_file'
3
+ require_relative 'registry'
4
+
5
+ module NdrImport
6
+ # This is one of a collection of file handlers that deal with individual formats of data.
7
+ # They can be instantiated directly or via the factory method Registry.tables
8
+ module File
9
+ # This class is a Word document file handler that returns a single table.
10
+ # Currently only works on .doc (97-2003), not .docx
11
+ class Word < Base
12
+ private
13
+
14
+ def rows(&block)
15
+ return enum_for(:rows) unless block
16
+
17
+ doc = MSWordDoc::Extractor.load(SafeFile.safepath_to_string(@filename))
18
+
19
+ doc.whole_contents.split("\n").each do |line|
20
+ block.call(line)
21
+ end
22
+
23
+ rescue => e
24
+ raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
25
+ end
26
+ end
27
+
28
+ Registry.register(Word, 'doc') # TODO: Add 'word'?
29
+ end
30
+ end
@@ -0,0 +1,67 @@
1
+ require 'zip'
2
+ require 'ndr_support/safe_file'
3
+ require_relative 'registry'
4
+
5
+ module NdrImport
6
+ # This is one of a collection of file handlers that deal with individual formats of data.
7
+ # They can be instantiated directly or via the factory method Registry.tables
8
+ module File
9
+ # This class is a zip file handler that returns tables from the extracted files.
10
+ class Zip < Base
11
+ def initialize(filename, format, options = {})
12
+ super
13
+ @pattern = options['pattern'] || //
14
+ @unzip_path = options['unzip_path']
15
+
16
+ validate_unzip_path_is_safe!
17
+ end
18
+
19
+ def files(&block)
20
+ fail 'Not allowed in external environment' if defined?(::Rails) && ::Rails.env.external?
21
+
22
+ return enum_for(:files) unless block
23
+
24
+ destination = @unzip_path.join(Time.current.strftime('%H%M%S%L'))
25
+ FileUtils.mkdir_p(SafeFile.safepath_to_string(destination))
26
+
27
+ ::Zip::File.open(SafeFile.safepath_to_string(@filename)) do |zipfile|
28
+ unzip_entries(zipfile, destination, &block)
29
+ end
30
+ end
31
+
32
+ # Zip files produce files, never tables.
33
+ def tables
34
+ fail 'Zip#tables should never be called'
35
+ end
36
+
37
+ private
38
+
39
+ # Unzip the zip file entry and enumerate over it
40
+ def unzip_entries(zipfile, destination, &block)
41
+ zipfile.entries.each do |entry|
42
+ # SECURE: TPG 2010-11-1: The path is stripped from the zipfile entry when extracted
43
+ basename = ::File.basename(entry.name)
44
+ next unless entry.file? && basename.match(@pattern)
45
+
46
+ unzipped_filename = destination.join(basename)
47
+ zipfile.extract(entry, unzipped_filename)
48
+
49
+ unzipped_files(unzipped_filename, &block)
50
+ end
51
+ end
52
+
53
+ # Enumerate over an unzipped file like any other
54
+ def unzipped_files(unzipped_filename, &block)
55
+ Registry.files(unzipped_filename, @options).each do |filename|
56
+ block.call(filename)
57
+ end
58
+ end
59
+
60
+ def validate_unzip_path_is_safe!
61
+ SafeFile.safepath_to_string(@unzip_path)
62
+ end
63
+ end
64
+
65
+ Registry.register(Zip, 'zip')
66
+ end
67
+ end
@@ -0,0 +1,105 @@
1
+ require 'ndr_support/safe_file'
2
+ require 'ndr_import/csv_library'
3
+
4
+ module NdrImport
5
+ module Helpers
6
+ module File
7
+ # This mixin adds delimited file functionality to unified importers.
8
+ module Delimited
9
+ # Read a plain text CSV file, return an array of the content
10
+ def read_csv_file(path)
11
+ # See the page below if you encounter a "CSV::IllegalFormatError" caused by a CSV
12
+ # file generated on Mac OS:
13
+ # http://stackoverflow.com/questions/1549139/ruby-cannot-parse-excel-file-exported-as-csv-in-os-x
14
+
15
+ read_delimited_file(path)
16
+ end
17
+
18
+ # Slurp the entire file into an array of lines.
19
+ def read_delimited_file(path, col_sep = nil)
20
+ delimited_rows(path, col_sep).to_a
21
+ end
22
+
23
+ # Iterate through the file table by table, yielding each one in turn.
24
+ def delimited_tables(path, col_sep = nil)
25
+ return enum_for(:delimited_tables, path, col_sep) unless block_given?
26
+
27
+ yield nil, delimited_rows(path, col_sep)
28
+ end
29
+
30
+ # Deprecated method
31
+ def each_delimited_table(path, col_sep = nil, &block)
32
+ Kernel.warn '[warning] each_delimited_table will be deprecated,' \
33
+ ' please use delimited_tables instead.'
34
+ delimited_tables(path, col_sep, &block)
35
+ end
36
+
37
+ # Iterate through the file line by line, yielding each one in turn.
38
+ def delimited_rows(path, col_sep = nil)
39
+ return enum_for(:delimited_rows, path, col_sep) unless block_given?
40
+
41
+ safe_path = SafeFile.safepath_to_string(path)
42
+ encodings = determine_encodings!(safe_path, col_sep)
43
+
44
+ # By now, we know `encodings` should let us read the whole
45
+ # file successfully; if there are problems, we should crash.
46
+ CSVLibrary.foreach(safe_path, encodings) do |line|
47
+ yield line.map(&:to_s) unless line.length <= 5
48
+ end
49
+ end
50
+
51
+ # Deprecated method
52
+ def each_delimited_row(path, col_sep = nil, &block)
53
+ Kernel.warn '[warning] each_delimited_row will be deprecated,' \
54
+ ' please use delimited_rows instead.'
55
+ delimited_rows(path, col_sep, &block)
56
+ end
57
+
58
+ private
59
+
60
+ # Derive the source encoding by trying all supported encodings.
61
+ # Returns first set of working options, or raises if none could be found.
62
+ def determine_encodings!(safe_path, col_sep = nil)
63
+ # delimiter encoding => FasterCSV encoding string
64
+ supported_encodings = {
65
+ 'UTF-8' => 'bom|utf-8',
66
+ 'Windows-1252' => 'windows-1252:utf-8'
67
+ }
68
+
69
+ successful_options = nil
70
+ supported_encodings.each do |delimiter_encoding, csv_encoding|
71
+ begin
72
+ options = {
73
+ :col_sep => (col_sep || ',').force_encoding(delimiter_encoding),
74
+ :encoding => csv_encoding
75
+ }
76
+
77
+ row_num = 0
78
+ # Iterate through the file; if we reach the end, this encoding worked:
79
+ CSVLibrary.foreach(safe_path, options) { |_line| row_num += 1 }
80
+ rescue ArgumentError => e
81
+ next if e.message =~ /invalid byte sequence/ # This encoding didn't work
82
+ raise(e)
83
+ rescue CSVLibrary::MalformedCSVError => e
84
+ description = (col_sep ? col_sep.inspect + ' delimited' : 'CSV')
85
+
86
+ raise(CSVLibrary::MalformedCSVError, "Invalid #{description} format " \
87
+ "on row #{row_num + 1} of #{::File.basename(safe_path)}. Original: #{e.message}")
88
+ end
89
+
90
+ # We got this far => encoding choice worked:
91
+ successful_options = options
92
+ break
93
+ end
94
+
95
+ # We tried them all, and none worked:
96
+ unless successful_options
97
+ fail "None of the encodings #{supported_encodings.values.inspect} were successful!"
98
+ end
99
+
100
+ successful_options
101
+ end
102
+ end
103
+ end
104
+ end
105
+ end
@@ -0,0 +1,181 @@
1
+ require 'ndr_support/safe_file'
2
+
3
+ module NdrImport
4
+ module Helpers
5
+ module File
6
+ # This mixin adds excel spreadsheet functionality to unified importers.
7
+ # It provides a file reader method and methods to cast raw values
8
+ # appropriately. These methods can be overridden or aliased as required.
9
+ #
10
+ module Excel
11
+ require 'roo'
12
+ require 'roo-xls'
13
+ require 'ole/storage'
14
+
15
+ protected
16
+
17
+ def cast_excel_value(raw_value)
18
+ return raw_value if raw_value.nil?
19
+
20
+ if raw_value.is_a?(Date) || raw_value.is_a?(DateTime) || raw_value.is_a?(Time)
21
+ cast_excel_datetime_as_date(raw_value)
22
+ elsif raw_value.is_a?(Float)
23
+ if raw_value.to_f == raw_value.to_i
24
+ # Whole number
25
+ return raw_value.to_i.to_s
26
+ else
27
+ return raw_value.to_f.to_s
28
+ end
29
+ else
30
+ return raw_value.to_s.strip
31
+ end
32
+ end
33
+
34
+ def cast_excel_datetime_as_date(raw_value)
35
+ raw_value.to_s(:db)
36
+ end
37
+
38
+ # Iterate through the file table by table, yielding each one in turn.
39
+ def excel_tables(path)
40
+ return enum_for(:excel_tables, path) unless block_given?
41
+
42
+ workbook = load_workbook(path)
43
+ workbook.each_with_pagename do |tablename, sheet|
44
+ yield tablename, excel_rows(workbook, sheet)
45
+ end
46
+ end
47
+
48
+ # Deprecated method
49
+ def each_excel_table(path, &block)
50
+ Kernel.warn '[warning] each_excel_table will be deprecated,' \
51
+ ' please use excel_tables instead.'
52
+ excel_tables(path, &block)
53
+ end
54
+
55
+ private
56
+
57
+ def read_excel_file(path, selected_sheet = nil)
58
+ # SECURE: TVB Mon Aug 13 15:30:32 BST 2012 SafeFile.safepath_to_string makes sure that
59
+ # the path is SafePath.
60
+
61
+ # Load the workbook
62
+ workbook = load_workbook(path)
63
+
64
+ # Choose the selected worksheet (if provided and it exists) or the first worksheet
65
+ workbook.default_sheet =
66
+ if selected_sheet.nil? || !workbook.sheets.include?(selected_sheet.to_s)
67
+ workbook.sheets.first
68
+ else
69
+ selected_sheet.to_s
70
+ end
71
+
72
+ # Read the cells from the working worksheet into a nested array
73
+ excel_rows(workbook, workbook).to_a
74
+ end
75
+
76
+ # Iterate through the sheet line by line, yielding each one in turn.
77
+ def excel_rows(workbook, sheet, &block)
78
+ return enum_for(:excel_rows, workbook, sheet) unless block
79
+
80
+ if workbook.is_a?(Roo::Excelx)
81
+ # FIXME: xlsx_rows(sheet, &block) should produce the same output as xls_rows
82
+ xls_rows(sheet, &block)
83
+ else
84
+ xls_rows(sheet, &block)
85
+ end
86
+ end
87
+
88
+ # Deprecated method
89
+ def each_excel_row(workbook, sheet, &block)
90
+ Kernel.warn '[warning] each_excel_row will be deprecated,' \
91
+ ' please use excel_rows instead.'
92
+ excel_rows(workbook, sheet, &block)
93
+ end
94
+
95
+ # Iterate through an xls sheet line by line, yielding each one in turn.
96
+ def xls_rows(sheet)
97
+ return enum_for(:xls_rows, sheet) unless block_given?
98
+
99
+ sheet.first_row.upto(sheet.last_row) do |row|
100
+ line = []
101
+ sheet.first_column.upto(sheet.last_column) do |col|
102
+ line << cast_excel_value(sheet.cell(row, col))
103
+ end
104
+ yield line
105
+ end
106
+ end
107
+
108
+ # Deprecated method
109
+ def each_xls_row(sheet, &block)
110
+ Kernel.warn '[warning] each_xls_row will be deprecated,' \
111
+ ' please use xls_rows instead.'
112
+ xls_rows(sheet, &block)
113
+ end
114
+
115
+ # Iterate through an xlsx sheet line by line, yielding each one in turn.
116
+ # This method uses streaming https://github.com/roo-rb/roo#excel-xlsx-and-xlsm-support
117
+ def xlsx_rows(sheet)
118
+ return enum_for(:xlsx_rows, sheet) unless block_given?
119
+
120
+ sheet.each_row_streaming(:pad_cells => true) do |row|
121
+ yield row.map { |cell| cast_excel_value(cell.value) }
122
+ end
123
+ end
124
+
125
+ # Deprecated method
126
+ def each_xlsx_row(sheet, &block)
127
+ Kernel.warn '[warning] each_xlsx_row will be deprecated,' \
128
+ ' please use xlsx_rows instead.'
129
+ xlsx_rows(sheet, &block)
130
+ end
131
+
132
+ def get_excel_sheets_name(path)
133
+ workbook = load_workbook(path)
134
+ workbook.sheets
135
+ end
136
+
137
+ def load_workbook(path)
138
+ case SafeFile.extname(path).downcase
139
+ when '.xls'
140
+ Roo::Excel.new(SafeFile.safepath_to_string(path))
141
+ when '.xlsx'
142
+ Roo::Excelx.new(SafeFile.safepath_to_string(path))
143
+ else
144
+ fail "Received file path with unexpected extension #{SafeFile.extname(path)}"
145
+ end
146
+ rescue Ole::Storage::FormatError => e
147
+ # TODO: Do we need to remove the new_file after using it?
148
+
149
+ # Try to load the .xls file as an .xlsx file (useful for sources like USOM).
150
+ # Roo checks file extensions in file_type_check (GenericSpreadsheet),
151
+ # so we create a duplicate of the file with an .xlsx extension.
152
+ if /(.*)\.xls$/.match(path)
153
+ new_file_name = SafeFile.basename(path).gsub(/(.*)\.xls$/, '\1_amend.xlsx')
154
+ new_file_path = SafeFile.dirname(path).join(new_file_name)
155
+ copy_file(path, new_file_path)
156
+
157
+ load_workbook(new_file_path)
158
+ else
159
+ raise e.message
160
+ end
161
+ rescue => e
162
+ raise ["Unable to read the file '#{path}'", e.message].join('; ')
163
+ end
164
+
165
+ # Note that this method can produce insecure calls. All callers must protect
166
+ # their arguments.
167
+ # Arguments:
168
+ # * source - SafeFile
169
+ # * dest - SafeFile
170
+ #
171
+ def copy_file(source, dest)
172
+ # SECURE: TVB Mon Aug 13 13:53:02 BST 2012 : Secure SafePath will do the security checks
173
+ # before it is converted to string.
174
+ # SafeFile will make sure that the arguments are actually SafePath
175
+ FileUtils.mkdir_p(SafeFile.safepath_to_string(SafeFile.dirname(dest)))
176
+ FileUtils.cp(SafeFile.safepath_to_string(source), SafeFile.safepath_to_string(dest))
177
+ end
178
+ end
179
+ end
180
+ end
181
+ end
@@ -0,0 +1,29 @@
1
+ require 'ndr_support/safe_file'
2
+
3
+ module NdrImport
4
+ module Helpers
5
+ module File
6
+ # This mixin adds PDF functionality to unified importers. It provides a file reader method.
7
+ module Pdf
8
+ private
9
+
10
+ def read_pdf_file(path)
11
+ require 'pdf-reader'
12
+ lines = []
13
+ pagenum = 0
14
+ begin
15
+ reader = PDF::Reader.new(SafeFile.safepath_to_string(path))
16
+
17
+ reader.pages.each do |page|
18
+ lines.concat page.text.split("\n")
19
+ pagenum += 1
20
+ end
21
+ rescue => e
22
+ raise("Invalid format on page #{pagenum + 1} of #{SafeFile.basename(path)} [#{e.class}: #{e.message}]")
23
+ end
24
+ lines
25
+ end
26
+ end
27
+ end
28
+ end
29
+ end