ndr_import 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (103) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +14 -0
  3. data/.rubocop.yml +27 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +22 -0
  6. data/CODE_OF_CONDUCT.md +13 -0
  7. data/Gemfile +4 -0
  8. data/Guardfile +16 -0
  9. data/LICENSE.txt +21 -0
  10. data/README.md +69 -0
  11. data/Rakefile +13 -0
  12. data/code_safety.yml +374 -0
  13. data/gemfiles/Gemfile.rails32 +5 -0
  14. data/gemfiles/Gemfile.rails32.lock +142 -0
  15. data/gemfiles/Gemfile.rails41 +5 -0
  16. data/gemfiles/Gemfile.rails41.lock +145 -0
  17. data/gemfiles/Gemfile.rails42 +5 -0
  18. data/gemfiles/Gemfile.rails42.lock +145 -0
  19. data/lib/ndr_import.rb +13 -0
  20. data/lib/ndr_import/csv_library.rb +40 -0
  21. data/lib/ndr_import/file/all.rb +8 -0
  22. data/lib/ndr_import/file/base.rb +76 -0
  23. data/lib/ndr_import/file/delimited.rb +86 -0
  24. data/lib/ndr_import/file/excel.rb +131 -0
  25. data/lib/ndr_import/file/pdf.rb +38 -0
  26. data/lib/ndr_import/file/registry.rb +50 -0
  27. data/lib/ndr_import/file/text.rb +52 -0
  28. data/lib/ndr_import/file/word.rb +30 -0
  29. data/lib/ndr_import/file/zip.rb +67 -0
  30. data/lib/ndr_import/helpers/file/delimited.rb +105 -0
  31. data/lib/ndr_import/helpers/file/excel.rb +181 -0
  32. data/lib/ndr_import/helpers/file/pdf.rb +29 -0
  33. data/lib/ndr_import/helpers/file/word.rb +27 -0
  34. data/lib/ndr_import/helpers/file/xml.rb +45 -0
  35. data/lib/ndr_import/helpers/file/zip.rb +44 -0
  36. data/lib/ndr_import/mapper.rb +220 -0
  37. data/lib/ndr_import/mapping_error.rb +5 -0
  38. data/lib/ndr_import/non_tabular/column_mapping.rb +73 -0
  39. data/lib/ndr_import/non_tabular/line.rb +46 -0
  40. data/lib/ndr_import/non_tabular/mapping.rb +35 -0
  41. data/lib/ndr_import/non_tabular/record.rb +99 -0
  42. data/lib/ndr_import/non_tabular/table.rb +193 -0
  43. data/lib/ndr_import/non_tabular_file_helper.rb +160 -0
  44. data/lib/ndr_import/standard_mappings.rb +23 -0
  45. data/lib/ndr_import/table.rb +179 -0
  46. data/lib/ndr_import/version.rb +4 -0
  47. data/ndr_import.gemspec +44 -0
  48. data/test/file/base_test.rb +54 -0
  49. data/test/file/delimited_test.rb +143 -0
  50. data/test/file/excel_test.rb +85 -0
  51. data/test/file/pdf_test.rb +35 -0
  52. data/test/file/registry_test.rb +60 -0
  53. data/test/file/text_test.rb +92 -0
  54. data/test/file/word_test.rb +35 -0
  55. data/test/file/zip_test.rb +47 -0
  56. data/test/helpers/file/delimited_test.rb +113 -0
  57. data/test/helpers/file/excel_test.rb +97 -0
  58. data/test/helpers/file/pdf_test.rb +26 -0
  59. data/test/helpers/file/word_test.rb +26 -0
  60. data/test/helpers/file/xml_test.rb +131 -0
  61. data/test/helpers/file/zip_test.rb +75 -0
  62. data/test/mapper_test.rb +551 -0
  63. data/test/non_tabular/mapping_test.rb +36 -0
  64. data/test/non_tabular/table_test.rb +510 -0
  65. data/test/non_tabular_file_helper_test.rb +501 -0
  66. data/test/readme_test.rb +53 -0
  67. data/test/resources/bomd.csv +3 -0
  68. data/test/resources/broken.csv +3 -0
  69. data/test/resources/filesystem_paths.yml +26 -0
  70. data/test/resources/flat_file.pdf +0 -0
  71. data/test/resources/flat_file.txt +27 -0
  72. data/test/resources/flat_file.yml +20 -0
  73. data/test/resources/hello_utf16be.txt +0 -0
  74. data/test/resources/hello_utf16le.txt +0 -0
  75. data/test/resources/hello_utf8.txt +2 -0
  76. data/test/resources/hello_windows.txt +2 -0
  77. data/test/resources/hello_world.doc +0 -0
  78. data/test/resources/hello_world.pdf +0 -0
  79. data/test/resources/hello_world.txt +2 -0
  80. data/test/resources/high_ascii_delimited.txt +2 -0
  81. data/test/resources/malformed.xml +6 -0
  82. data/test/resources/normal.csv +3 -0
  83. data/test/resources/normal.csv.zip +0 -0
  84. data/test/resources/normal_pipe.csv +3 -0
  85. data/test/resources/normal_thorn.csv +3 -0
  86. data/test/resources/not_a_pdf.pdf +0 -0
  87. data/test/resources/not_a_word_file.doc +0 -0
  88. data/test/resources/sample_xls.xls +0 -0
  89. data/test/resources/sample_xlsx.xlsx +0 -0
  90. data/test/resources/standard_mappings.yml +39 -0
  91. data/test/resources/txt_file_xls_extension.xls +1 -0
  92. data/test/resources/txt_file_xlsx_extension.xlsx +1 -0
  93. data/test/resources/utf-16be_xml.xml +0 -0
  94. data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
  95. data/test/resources/utf-16le_xml.xml +0 -0
  96. data/test/resources/utf-8_xml.xml +9 -0
  97. data/test/resources/windows-1252_xml.xml +9 -0
  98. data/test/resources/windows.csv +5 -0
  99. data/test/resources/xlsx_file_xls_extension.xls +0 -0
  100. data/test/standard_mappings_test.rb +22 -0
  101. data/test/table_test.rb +288 -0
  102. data/test/test_helper.rb +13 -0
  103. metadata +443 -0
data/lib/ndr_import.rb ADDED
@@ -0,0 +1,13 @@
1
+ require 'ndr_import/version'
2
+ require 'ndr_import/csv_library'
3
+ require 'ndr_import/mapping_error'
4
+ require 'ndr_import/mapper'
5
+ require 'ndr_import/non_tabular_file_helper'
6
+ require 'ndr_import/table'
7
+ require 'ndr_import/non_tabular/table'
8
+
9
# Top-level namespace of the ndr_import gem.
module NdrImport
  class << self
    # Returns the absolute path of the directory two levels above this file.
    def root
      ::File.expand_path('../..', __FILE__)
    end
  end
end
@@ -0,0 +1,40 @@
1
+ # This file allows us to choose the CSV library we want to use.
2
+
3
+ require 'csv'
4
# Using relevant core CSV library.
CSVLibrary = CSV

# Convenience helpers defined on the chosen CSV library itself.
class << CSVLibrary
  # Is the library we're using FasterCSV?
  # Answers true when the library does not define a `Reader` constant.
  def fastercsv?
    !const_defined?(:Reader)
  end

  # Serialises `data` (a collection of rows) and returns the CSV text.
  def write_csv_to_string(data)
    generate do |csv|
      data.each { |row| csv << row }
    end
  end

  # Writes `data` (a collection of rows) to `filepath`, opened with `mode`
  # ('w' by default).
  def write_csv_to_file(data, filepath, mode = 'w')
    # Explicit receiver, so this reads clearly as the library's own `open`.
    self.open(filepath, mode) do |csv|
      data.each { |row| csv << row }
    end
  end

  # Reads the whole of `filepath` using the library's `read` method.
  def read_csv_from_file(filepath)
    read(filepath)
  end
end
29
+
30
# Forward port CSV::Cell, as it is sometimes
# serialised in YAML. :-(
class CSV::Cell < String
  # data    - the text of the cell (an empty string by default)
  # is_null - when truthy, `data` is ignored and the cell is empty
  def initialize(data = '', is_null = false)
    if is_null
      super('')
    else
      super(data)
    end
  end

  # Returns the cell's text (via `to_s`).
  def data
    to_s
  end
end
@@ -0,0 +1,8 @@
1
+ require_relative 'base'
2
+ require_relative 'delimited'
3
+ require_relative 'excel'
4
+ require_relative 'pdf'
5
+ require_relative 'text'
6
+ require_relative 'word'
7
+ # # require_relative 'xml'
8
+ require_relative 'zip'
@@ -0,0 +1,76 @@
1
+ require 'ndr_support/safe_file'
2
+ require 'ndr_import/csv_library'
3
+ require_relative 'registry'
4
+
5
module NdrImport
  # This is the base of a collection of file handlers that deal with individual formats of data.
  # They can be instantiated directly or via the factory method Registry.tables
  module File
    # Logic shared by every file handler lives in this class.
    class Base
      # Stores the filename, format and options, then checks that the
      # filename is a safe path that may be read from.
      def initialize(filename, format, options = {})
        @filename = filename
        @format   = format
        @options  = options

        validate_filename_is_safe_and_readable!
      end

      # Enumerates the files held by the given file, yielding each filename.
      # A container format (e.g. a zip file) would yield once per contained file;
      # every other handler (currently) yields only its own filename.
      #
      # Most formats are not containers, so this default suits them. A handler for
      # a container format should override this method, and will probably also want
      # its tables method to raise, since such a handler produces files, never tables.
      #
      # Returns an Enumerator when called without a block.
      def files
        if block_given?
          yield @filename
        else
          enum_for(:files)
        end
      end

      # Enumerates the tables in the given file, yielding two arguments each time:
      # a tablename and a row enumerator for that table. A spreadsheet handler may
      # yield once per worksheet, whereas a CSV handler yields exactly once (the
      # whole file being a single table).
      #
      # Single table files are the common case, so this default handles them and a
      # subclass need only supply the rows iterator. Handlers for multi-table
      # formats should override this method instead.
      #
      # NOTE: for single table handlers, the tablename argument should be nil.
      #
      # Returns an Enumerator when called without a block.
      def tables
        if block_given?
          yield nil, rows
        else
          enum_for(:tables)
        end
      end

      private

      # Single table handlers must implement this method in the subclass. It walks
      # the rows of the current table (here, the whole file) and has to support both
      # the block form and the blockless form, the latter returning an Enumerator.
      # The recommended shape is:
      #
      #   def rows
      #     return enum_for(:rows) unless block_given?
      #
      #     ... your code goes here ...
      #   end
      #
      def rows
        raise "Implement #{self.class}#rows"
      end

      # Passes the filename through SafeFile's path conversion and then its
      # read-mode verification.
      def validate_filename_is_safe_and_readable!
        SafeFile.safepath_to_string(@filename)

        # Ensure that we're allowed to read from the safe path:
        # (they can be configured to be write-only, for example)
        SafeFile.verify_mode(@filename, 'r')
      end
    end
  end
end
@@ -0,0 +1,86 @@
1
+ require 'ndr_support/safe_file'
2
+ require 'ndr_import/csv_library'
3
+ require_relative 'registry'
4
+
5
module NdrImport
  # This is one of a collection of file handlers that deal with individual formats of data.
  # They can be instantiated directly or via the factory method Registry.tables
  module File
    # This class is a delimited file handler that returns a single table.
    class Delimited < Base
      # Default column separator per registered format; nil means "use a comma".
      DELIMITED_COL_SEP = {
        'csv' => nil,
        'pipe' => '|',
        'thorn' => "\xfe" # high-ascii (lower case thorn) delimited files
      }

      # filename - the file to read
      # format   - one of the keys of DELIMITED_COL_SEP
      # options  - may contain 'col_sep' to override the format's default separator
      def initialize(filename, format, options = {})
        super

        # Build a private copy rather than writing into the caller's hash: a hash
        # reused for several files would otherwise carry the separator chosen for
        # one format over to files of another format.
        @options = @options.merge('col_sep' => @options['col_sep'] || DELIMITED_COL_SEP[format])
      end

      private

      # Iterate through the file line by line, yielding each one in turn
      # (every value converted with to_s). Returns an Enumerator without a block.
      def rows
        return enum_for(:rows) unless block_given?

        safe_path = SafeFile.safepath_to_string(@filename)
        encodings = determine_encodings!(safe_path)

        # By now, we know `encodings` should let us read the whole
        # file successfully; if there are problems, we should crash.
        CSVLibrary.foreach(safe_path, encodings) do |line|
          # NOTE(review): rows with five or fewer fields are silently skipped;
          # the reason is not visible here - confirm this filter is intended.
          yield line.map(&:to_s) unless line.length <= 5
        end
      end

      # Derive the source encoding by trying all supported encodings.
      # Returns first set of working options, or raises if none could be found.
      #
      # Raises CSVLibrary::MalformedCSVError (with the row number and file name
      # added to the message) if the file is malformed, and a RuntimeError if
      # every supported encoding was rejected.
      def determine_encodings!(safe_path)
        # delimiter encoding => # FasterCSV encoding string
        supported_encodings = {
          'UTF-8'        => 'bom|utf-8',
          'Windows-1252' => 'windows-1252:utf-8'
        }

        successful_options = nil
        supported_encodings.each do |delimiter_encoding, csv_encoding|
          begin
            col_sep = @options['col_sep']
            options = {
              # force_encoding changes its receiver, so tag a copy: the separator
              # may be a string shared via DELIMITED_COL_SEP or owned by the caller.
              :col_sep => (col_sep || ',').dup.force_encoding(delimiter_encoding),
              :encoding => csv_encoding
            }

            row_num = 0
            # Iterate through the file; if we reach the end, this encoding worked:
            CSVLibrary.foreach(safe_path, options) { |_line| row_num += 1 }
          rescue ArgumentError => e
            next if e.message =~ /invalid byte sequence/ # This encoding didn't work
            raise(e)
          rescue CSVLibrary::MalformedCSVError => e
            description = (col_sep ? col_sep.inspect + ' delimited' : 'CSV')

            raise(CSVLibrary::MalformedCSVError, "Invalid #{description} format " \
              "on row #{row_num + 1} of #{::File.basename(safe_path)}. Original: #{e.message}")
          end

          # We got this far => encoding choice worked:
          successful_options = options
          break
        end

        # We tried them all, and none worked:
        unless successful_options
          fail "None of the encodings #{supported_encodings.values.inspect} were successful!"
        end

        successful_options
      end
    end

    Registry.register(Delimited, 'csv', 'pipe', 'thorn')
  end
end
@@ -0,0 +1,131 @@
1
+ require 'roo'
2
+ require 'roo-xls'
3
+ require 'ole/storage'
4
+ require 'ndr_support/safe_file'
5
+ require_relative 'registry'
6
+
7
module NdrImport
  # This is one of a collection of file handlers that deal with individual formats of data.
  # They can be instantiated directly or via the factory method Registry.tables
  module File
    # Excel file handler whose tables are the workbook's worksheets.
    # Besides reading the file it offers methods that cast raw cell values;
    # those methods can be overridden or aliased as required.
    #
    class Excel < Base
      # Walks the workbook sheet by sheet, yielding each sheet's name together
      # with an enumerator over its rows. Returns an Enumerator without a block.
      def tables
        return enum_for(:tables) unless block_given?

        workbook = load_workbook(@filename)
        workbook.each_with_pagename do |tablename, sheet|
          yield tablename, excel_rows(workbook, sheet)
        end
      end

      protected

      # Converts a raw cell value to a string: nil stays nil, dates and times go
      # through cast_excel_datetime_as_date, whole-number floats lose their
      # fractional part, and anything else is stringified and stripped.
      def cast_excel_value(raw_value)
        case raw_value
        when nil
          raw_value
        when Date, DateTime, Time
          cast_excel_datetime_as_date(raw_value)
        when Float
          whole_number = raw_value.to_f == raw_value.to_i
          whole_number ? raw_value.to_i.to_s : raw_value.to_f.to_s
        else
          raw_value.to_s.strip
        end
      end

      # Formats a date/time value by calling to_s(:db) on it.
      def cast_excel_datetime_as_date(raw_value)
        raw_value.to_s(:db)
      end

      private

      # Walks the sheet row by row, yielding each row in turn.
      # Returns an Enumerator when no block is supplied.
      def excel_rows(workbook, sheet, &block)
        return enum_for(:excel_rows, workbook, sheet) unless block

        case workbook
        when Roo::Excelx
          # FIXME: xlsx_rows(sheet, &block) should produce the same output as xls_rows
          xls_rows(sheet, &block)
        else
          xls_rows(sheet, &block)
        end
      end

      # Walks an xls sheet row by row, yielding each row as an array of cast values.
      def xls_rows(sheet)
        return enum_for(:xls_rows, sheet) unless block_given?

        sheet.first_row.upto(sheet.last_row) do |row|
          columns = sheet.first_column.upto(sheet.last_column)
          yield columns.map { |col| cast_excel_value(sheet.cell(row, col)) }
        end
      end

      # Walks an xlsx sheet row by row, yielding each row as an array of cast values.
      # This method uses streaming https://github.com/roo-rb/roo#excel-xlsx-and-xlsm-support
      def xlsx_rows(sheet)
        return enum_for(:xlsx_rows, sheet) unless block_given?

        sheet.each_row_streaming(:pad_cells => true) do |row|
          yield row.map { |cell| cast_excel_value(cell.value) }
        end
      end

      # Opens the workbook at `path`, choosing the Roo class from the extension.
      # Any failure is re-raised as a RuntimeError.
      def load_workbook(path)
        case SafeFile.extname(path).downcase
        when '.xls'
          Roo::Excel.new(SafeFile.safepath_to_string(path))
        when '.xlsx'
          Roo::Excelx.new(SafeFile.safepath_to_string(path))
        else
          raise "Received file path with unexpected extension #{SafeFile.extname(path)}"
        end
      rescue Ole::Storage::FormatError => e
        # TODO: Do we need to remove the new_file after using it?

        # try to load the .xls file as an .xlsx file, useful for sources like USOM
        # roo check file extensions in file_type_check (GenericSpreadsheet),
        # so we create a duplicate file in xlsx extension
        raise e.message unless /(.*)\.xls$/.match(path)

        new_file_name = SafeFile.basename(path).gsub(/(.*)\.xls$/, '\1_amend.xlsx')
        new_file_path = SafeFile.dirname(path).join(new_file_name)
        copy_file(path, new_file_path)

        load_workbook(new_file_path)
      rescue => e
        raise "Unable to read the file '#{path}'; #{e.message}"
      end

      # Note that this method can produce insecure calls. All callers must protect
      # their arguments.
      # Arguments:
      #   * source - SafeFile
      #   * dest - SafeFile
      #
      def copy_file(source, dest)
        # SECURE: TVB Mon Aug 13 13:53:02 BST 2012 : Secure SafePath will do the security checks
        # before it is converted to string.
        # SafeFile will make sure that the arguments are actually SafePath
        target_dir = SafeFile.safepath_to_string(SafeFile.dirname(dest))
        FileUtils.mkdir_p(target_dir)

        FileUtils.cp(SafeFile.safepath_to_string(source), SafeFile.safepath_to_string(dest))
      end
    end

    Registry.register(Excel, 'xls', 'xlsx')
  end
end
@@ -0,0 +1,38 @@
1
+ require 'pdf-reader'
2
+ require 'ndr_support/safe_file'
3
+ require_relative 'registry'
4
+
5
module NdrImport
  # This is one of a collection of file handlers that deal with individual formats of data.
  # They can be instantiated directly or via the factory method Registry.tables
  module File
    # PDF file handler; the whole document is presented as a single table.
    class Pdf < Base
      private

      # Yields every line of text in the document, page by page.
      # Returns an Enumerator when no block is supplied. A NoMethodError raised
      # while reading is reported as a failure to read the file as a PDF.
      def rows(&block)
        return enum_for(:rows) unless block

        pdf_path = SafeFile.safepath_to_string(@filename)
        reader = PDF::Reader.new(pdf_path)

        reader.pages.each { |page| process_page(page, &block) }
      rescue NoMethodError
        raise "Failed to read #{SafeFile.basename(@filename)} as a PDF"
      end

      # Splits the page's text on newlines and hands each line to the block.
      # Any error is re-raised with the page number and file name attached.
      def process_page(page, &block)
        text_lines = page.text.split("\n")
        text_lines.each { |text_line| block.call(text_line) }
      rescue => e
        raise "Invalid format on page #{page.number} of #{SafeFile.basename(@filename)} " \
              "[#{e.class}: #{e.message}]"
      end
    end

    Registry.register(Pdf, 'pdf')
  end
end
+ end
@@ -0,0 +1,50 @@
1
module NdrImport
  module File
    # Registry of file handler classes, keyed by format string. It also acts as
    # the factory that picks a handler for a file and enumerates its files/tables.
    module Registry
      class << self
        attr_writer :handlers

        # Returns the format => handler class hash, creating it on first use so
        # that every method below works even before anything has been registered.
        def handlers
          @handlers ||= {}
        end

        # Registers `klass` as the handler for each of the given formats,
        # replacing any handler previously registered for a format.
        def register(klass, *formats)
          formats.each do |format|
            handlers[format] = klass
          end
        end

        # Removes the handlers for the given formats. Formats that were never
        # registered are ignored.
        def unregister(*formats)
          formats.each do |format|
            handlers.delete(format)
          end
        end

        # Yields whatever the handler's `files` method yields for `filename`; the
        # handler is chosen from the file extension. Returns an Enumerator when
        # called without a block.
        def files(filename, options = {}, &block)
          return enum_for(:files, filename, options) unless block

          klass_factory(filename, nil, options).files(&block)
        end

        # Yields whatever the handler's `tables` method yields for `filename`.
        # `format` selects the handler; when nil it is taken from the file
        # extension. Returns an Enumerator when called without a block.
        def tables(filename, format = nil, options = {}, &block)
          return enum_for(:tables, filename, format, options) unless block

          klass_factory(filename, format, options).tables(&block)
        end

        private

        # Instantiates the handler registered for `format` (defaulting to the
        # lower-cased file extension without its dot).
        #
        # Raises a RuntimeError when no handler is registered for the format.
        def klass_factory(filename, format, options)
          format ||= SafeFile.extname(filename).delete('.').downcase
          klass = handlers[format]
          fail "Error: Unknown file format #{format.inspect}" unless klass

          klass.new(filename, format, options)
        end
      end
    end
  end
end
49
+
50
+ require_relative 'all'