ndr_import 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +14 -0
  3. data/.rubocop.yml +27 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +22 -0
  6. data/CODE_OF_CONDUCT.md +13 -0
  7. data/Gemfile +4 -0
  8. data/Guardfile +16 -0
  9. data/LICENSE.txt +21 -0
  10. data/README.md +69 -0
  11. data/Rakefile +13 -0
  12. data/code_safety.yml +374 -0
  13. data/gemfiles/Gemfile.rails32 +5 -0
  14. data/gemfiles/Gemfile.rails32.lock +142 -0
  15. data/gemfiles/Gemfile.rails41 +5 -0
  16. data/gemfiles/Gemfile.rails41.lock +145 -0
  17. data/gemfiles/Gemfile.rails42 +5 -0
  18. data/gemfiles/Gemfile.rails42.lock +145 -0
  19. data/lib/ndr_import.rb +13 -0
  20. data/lib/ndr_import/csv_library.rb +40 -0
  21. data/lib/ndr_import/file/all.rb +8 -0
  22. data/lib/ndr_import/file/base.rb +76 -0
  23. data/lib/ndr_import/file/delimited.rb +86 -0
  24. data/lib/ndr_import/file/excel.rb +131 -0
  25. data/lib/ndr_import/file/pdf.rb +38 -0
  26. data/lib/ndr_import/file/registry.rb +50 -0
  27. data/lib/ndr_import/file/text.rb +52 -0
  28. data/lib/ndr_import/file/word.rb +30 -0
  29. data/lib/ndr_import/file/zip.rb +67 -0
  30. data/lib/ndr_import/helpers/file/delimited.rb +105 -0
  31. data/lib/ndr_import/helpers/file/excel.rb +181 -0
  32. data/lib/ndr_import/helpers/file/pdf.rb +29 -0
  33. data/lib/ndr_import/helpers/file/word.rb +27 -0
  34. data/lib/ndr_import/helpers/file/xml.rb +45 -0
  35. data/lib/ndr_import/helpers/file/zip.rb +44 -0
  36. data/lib/ndr_import/mapper.rb +220 -0
  37. data/lib/ndr_import/mapping_error.rb +5 -0
  38. data/lib/ndr_import/non_tabular/column_mapping.rb +73 -0
  39. data/lib/ndr_import/non_tabular/line.rb +46 -0
  40. data/lib/ndr_import/non_tabular/mapping.rb +35 -0
  41. data/lib/ndr_import/non_tabular/record.rb +99 -0
  42. data/lib/ndr_import/non_tabular/table.rb +193 -0
  43. data/lib/ndr_import/non_tabular_file_helper.rb +160 -0
  44. data/lib/ndr_import/standard_mappings.rb +23 -0
  45. data/lib/ndr_import/table.rb +179 -0
  46. data/lib/ndr_import/version.rb +4 -0
  47. data/ndr_import.gemspec +44 -0
  48. data/test/file/base_test.rb +54 -0
  49. data/test/file/delimited_test.rb +143 -0
  50. data/test/file/excel_test.rb +85 -0
  51. data/test/file/pdf_test.rb +35 -0
  52. data/test/file/registry_test.rb +60 -0
  53. data/test/file/text_test.rb +92 -0
  54. data/test/file/word_test.rb +35 -0
  55. data/test/file/zip_test.rb +47 -0
  56. data/test/helpers/file/delimited_test.rb +113 -0
  57. data/test/helpers/file/excel_test.rb +97 -0
  58. data/test/helpers/file/pdf_test.rb +26 -0
  59. data/test/helpers/file/word_test.rb +26 -0
  60. data/test/helpers/file/xml_test.rb +131 -0
  61. data/test/helpers/file/zip_test.rb +75 -0
  62. data/test/mapper_test.rb +551 -0
  63. data/test/non_tabular/mapping_test.rb +36 -0
  64. data/test/non_tabular/table_test.rb +510 -0
  65. data/test/non_tabular_file_helper_test.rb +501 -0
  66. data/test/readme_test.rb +53 -0
  67. data/test/resources/bomd.csv +3 -0
  68. data/test/resources/broken.csv +3 -0
  69. data/test/resources/filesystem_paths.yml +26 -0
  70. data/test/resources/flat_file.pdf +0 -0
  71. data/test/resources/flat_file.txt +27 -0
  72. data/test/resources/flat_file.yml +20 -0
  73. data/test/resources/hello_utf16be.txt +0 -0
  74. data/test/resources/hello_utf16le.txt +0 -0
  75. data/test/resources/hello_utf8.txt +2 -0
  76. data/test/resources/hello_windows.txt +2 -0
  77. data/test/resources/hello_world.doc +0 -0
  78. data/test/resources/hello_world.pdf +0 -0
  79. data/test/resources/hello_world.txt +2 -0
  80. data/test/resources/high_ascii_delimited.txt +2 -0
  81. data/test/resources/malformed.xml +6 -0
  82. data/test/resources/normal.csv +3 -0
  83. data/test/resources/normal.csv.zip +0 -0
  84. data/test/resources/normal_pipe.csv +3 -0
  85. data/test/resources/normal_thorn.csv +3 -0
  86. data/test/resources/not_a_pdf.pdf +0 -0
  87. data/test/resources/not_a_word_file.doc +0 -0
  88. data/test/resources/sample_xls.xls +0 -0
  89. data/test/resources/sample_xlsx.xlsx +0 -0
  90. data/test/resources/standard_mappings.yml +39 -0
  91. data/test/resources/txt_file_xls_extension.xls +1 -0
  92. data/test/resources/txt_file_xlsx_extension.xlsx +1 -0
  93. data/test/resources/utf-16be_xml.xml +0 -0
  94. data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
  95. data/test/resources/utf-16le_xml.xml +0 -0
  96. data/test/resources/utf-8_xml.xml +9 -0
  97. data/test/resources/windows-1252_xml.xml +9 -0
  98. data/test/resources/windows.csv +5 -0
  99. data/test/resources/xlsx_file_xls_extension.xls +0 -0
  100. data/test/standard_mappings_test.rb +22 -0
  101. data/test/table_test.rb +288 -0
  102. data/test/test_helper.rb +13 -0
  103. metadata +443 -0
@@ -0,0 +1,23 @@
1
module NdrImport
  # NdrImport::StandardMappings stores the standard mappings hash
  class StandardMappings
    # mappings are stored as a class level instance variable
    class << self
      # Gets the standard mappings.
      # Fails with a RuntimeError if no mappings have been assigned yet.
      def mappings
        return @standard_mappings if defined?(@standard_mappings)

        fail 'NdrImport::StandardMappings not configured!'
      end

      # Sets the standard mappings.
      # Fails with an ArgumentError, naming the class that was received,
      # unless the given value is a Hash.
      def mappings=(hash)
        unless hash.is_a?(Hash)
          fail ArgumentError, "NdrImport::StandardMappings expects a Hash, got #{hash.class}"
        end

        @standard_mappings = hash
      end
    end
  end
end
@@ -0,0 +1,179 @@
1
+ require 'ndr_import/mapper'
2
+ require 'active_support/core_ext/hash'
3
+
4
module NdrImport
  # This class maintains the state of a table mapping and encapsulates the logic
  # required to transform a table of data into "records". Particular attention
  # has been made to use enumerables throughout to help with the transformation
  # of large quantities of data.
  class Table
    include NdrImport::Mapper

    # The option names accepted by #initialize; a reader is defined for each of them.
    def self.all_valid_options
      %w(canonical_name filename_pattern tablename_pattern header_lines footer_lines format klass columns)
    end

    # Instance level access to the class level list of valid options.
    def all_valid_options
      self.class.all_valid_options
    end

    attr_reader(*all_valid_options)
    attr_writer :notifier

    # Builds a table mapping from an options hash (keys may be strings or symbols).
    # Fails with an ArgumentError if options is not a Hash, or if it contains keys
    # that are not listed in all_valid_options.
    def initialize(options = {})
      # Stringify a copy rather than in place, so the caller's hash is left untouched.
      options = options.stringify_keys if options.is_a?(Hash)
      validate_options(options)

      all_valid_options.each do |key|
        # This pattern is used to only set attributes if option specified,
        # which makes for more concise YAML serialization.
        options[key] && instance_variable_set("@#{key}", options[key])
      end

      @row_index = 0
    end

    # Tests the file's basename against filename_pattern and, when a tablename is
    # given, the tablename against tablename_pattern. An unset pattern matches anything.
    def match(filename, tablename)
      ::File.basename(filename) =~ (filename_pattern || /\A.*\z/) &&
        (tablename.nil? || tablename =~ (tablename_pattern || /\A.*\z/))
    end

    # This method transforms a table of data, given a line array/enumerator and yields
    # klass, fields and index (input row number) for each record that it would create
    # as a result of the transformation process.
    # Without a block, an enumerator over the same values is returned.
    def transform(lines, &block)
      return enum_for(:transform, lines) unless block

      @row_index = 0
      @header_valid = false
      @notifier.try(:started)

      skip_footer_lines(lines, footer_line_count).each do |line|
        process_line(line, &block)
      end

      @notifier.try(:finished)
    end

    # This method processes a line of data. If it is a header line it validates it,
    # otherwise it transforms it. It also increments the row index and notifies the
    # notifier of the number of lines processed so far.
    def process_line(line, &block)
      return enum_for(:process_line, line) unless block

      if @row_index < header_line_count
        validate_header(line, @columns)
      else
        fail 'Header is not valid' if header_line_count > 0 && !header_valid?
        transform_line(line, @row_index, &block)
      end

      @row_index += 1
      @notifier.try(:processed, @row_index)
    end

    # This method transforms an incoming line of data by applying each of the klass masked
    # mappings to the line and yielding the klass and fields for each mapped klass.
    # Mapped lines whose :skip field reads 'true' are not yielded.
    def transform_line(line, index)
      return enum_for(:transform_line, line, index) unless block_given?

      masked_mappings.each do |klass, klass_mappings|
        fields = mapped_line(line, klass_mappings)
        next if fields[:skip].to_s == 'true'
        yield(klass, fields, index)
      end
    end

    # True only once a header line has matched the expected column names.
    def header_valid?
      @header_valid == true
    end

    private

    # Number of leading header lines; an unset header_lines option counts as none,
    # so that tables configured without it can still be processed.
    def header_line_count
      header_lines.to_i
    end

    # Number of trailing footer lines; an unset footer_lines option counts as none.
    def footer_line_count
      footer_lines.to_i
    end

    # This method uses a buffer to not yield the last <buffer_size> iterations of an enumerable.
    # We use it to skip footer lines (without having to convert the enumerable to an array).
    def skip_footer_lines(lines, buffer_size)
      return enum_for(:skip_footer_lines, lines, buffer_size) unless block_given?

      buffer = []
      lines.each do |line|
        buffer.unshift(line)

        # Only release the oldest line once more than buffer_size lines are held back
        yield buffer.pop if buffer.length > buffer_size
      end
    end

    # This method memoizes the klass masked mappings. Where a table level
    # klass is defined it is used with the whole mapping, otherwise the masks are generated.
    def masked_mappings
      @masked_mappings ||= begin
        if @klass
          { @klass => @columns }
        else
          column_level_klass_masked_mappings
        end
      end
    end

    # This method generates a hash of klass based mappings, one for each defined klass
    # where the whole line mapping is masked to just the data items of that klass.
    def column_level_klass_masked_mappings
      ensure_mappings_define_klass

      # Loop through each klass
      masked_mappings = {}
      @columns.map { |mapping| mapping['klass'] }.flatten.compact.uniq.each do |klass|
        # Duplicate the column mappings and do not capture fields that relate to other klasses
        masked_mappings[klass] = mask_mappings_by_klass(klass)
      end
      masked_mappings
    end

    # This method ensures that every column mapping defines a klass (unless it is a column that
    # we do not capture). It is only used where a table level klass is not defined.
    def ensure_mappings_define_klass
      klassless_mappings = @columns.
                           select { |mapping| mapping.nil? || mapping['klass'].nil? }.
                           # nil mappings are kept, so that they are reported rather than crashing here
                           reject { |mapping| mapping && mapping['do_not_capture'] }.
                           map { |mapping| mapping ? (mapping['column'] || mapping['standard_mapping']) : '<nil mapping>' }

      return if klassless_mappings.empty?

      # All column mappings for the single item file require a klass definition.
      fail "Missing klass for column(s): #{klassless_mappings.to_sentence}"
    end

    # This method duplicates the mappings and applies a do_not_capture mask to those that do not
    # relate to this klass, returning the masked mappings
    def mask_mappings_by_klass(klass)
      @columns.dup.map do |mapping|
        if Array(mapping['klass']).include?(klass)
          mapping
        else
          { 'do_not_capture' => true }
        end
      end
    end

    # Fails with an ArgumentError unless given a Hash whose keys are all valid options.
    def validate_options(hash)
      fail ArgumentError unless hash.is_a?(Hash)

      unrecognised_options = hash.keys - all_valid_options
      return if unrecognised_options.empty?
      fail ArgumentError, "Unrecognised options: #{unrecognised_options.inspect}"
    end

    # if there is a header, then check the column headings are as expected in the correct order
    def validate_header(line, column_mappings)
      columns = column_names(column_mappings)
      fail 'Number of columns does not match' if line.length != columns.length

      # to_s guards against blank (nil) or non-string header cells, which simply fail to match
      return unless line.map { |cell| cell.to_s.downcase } == columns
      @header_valid = true
    end

    # returns the column names as we expect to receive them
    def column_names(column_mappings)
      column_mappings.map { |c| (c['column'] || c['standard_mapping']).downcase }
    end
  end # class Table
end
@@ -0,0 +1,4 @@
1
# This stores the current version of the NdrImport gem
module NdrImport
  # Frozen so that the version string cannot be modified in place at runtime
  VERSION = '3.0.0'.freeze
end
@@ -0,0 +1,44 @@
1
# coding: utf-8
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.include?(lib_dir) || $LOAD_PATH.unshift(lib_dir)
require 'ndr_import/version'

Gem::Specification.new do |spec|
  spec.name          = 'ndr_import'
  spec.version       = NdrImport::VERSION
  spec.authors       = ['NCRS Development Team']
  spec.email         = []
  spec.summary       = 'NDR Import'
  spec.description   = 'NDR ETL Importer'
  spec.homepage      = 'https://github.com/PublicHealthEngland/ndr_import'
  spec.license       = 'MIT'

  # Package every file tracked by git, except older versions of this gem under pkg/.
  tracked_files      = `git ls-files -z`.split("\x0")
  spec.files         = tracked_files.reject { |path| path =~ %r{^pkg/} }
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  # Runtime dependencies, declared in order as [name, *version constraints]
  [
    ['activesupport', '>= 3.2.18', '< 5.0.0'],
    ['ndr_support', '~> 3.0'],
    ['rubyzip', '~> 1.1'],
    ['roo', '~> 2.0'],
    # roo requires nokogiri >=1.5, but nokogiri (1.6.1) requires Ruby version >= 1.9.2.
    ['nokogiri', '~> 1.6'],
    ['roo-xls'],
    ['spreadsheet', '1.0.3'], # Aligning with encore
    ['pdf-reader', '1.2.0'],
    ['msworddoc-extractor', '0.2.0']
  ].each { |name_and_constraints| spec.add_dependency(*name_and_constraints) }

  spec.required_ruby_version = '>= 1.9.3'

  spec.add_development_dependency 'bundler', '~> 1.7'
  spec.add_development_dependency 'rake', '~> 10.0'

  # Development dependencies without a version constraint; the notifier is
  # only added when the gemspec is evaluated on a darwin platform.
  unversioned_dev_gems = %w(minitest guard guard-rubocop guard-test)
  unversioned_dev_gems << 'terminal-notifier-guard' if RUBY_PLATFORM =~ /darwin/
  unversioned_dev_gems << 'simplecov'
  unversioned_dev_gems.each { |gem_name| spec.add_development_dependency gem_name }
end
@@ -0,0 +1,54 @@
1
+ require 'test_helper'
2
+ require 'ndr_import/file/registry'
3
+
4
module NdrImport
  module File
    # Tests for the base file handler
    class BaseTest < ActiveSupport::TestCase
      # Stands in for a single table file handler but, for test purposes,
      # nobody could be bothered to implement it fully
      class SingleTableLazyDeveloper < ::NdrImport::File::Base
      end

      def setup
        @permanent_test_files = SafePath.new('permanent_test_files')
      end

      test 'should fail on not implementing rows' do
        begin
          Registry.register(SingleTableLazyDeveloper, 'lazy_dev')

          error = assert_raises(RuntimeError) do
            csv_path = @permanent_test_files.join('normal.csv')
            lazy_handler = SingleTableLazyDeveloper.new(csv_path, 'lazy_dev')

            lazy_handler.tables.each do |name, rows|
              assert_nil name
              assert_instance_of Enumerator, rows
              # Force enumeration of the rows
              rows.to_a
            end
          end

          expected = 'Implement NdrImport::File::BaseTest::SingleTableLazyDeveloper#rows'
          assert_equal expected, error.message
        ensure
          # Always remove the temporary registration again
          Registry.unregister('lazy_dev')
        end
      end

      test 'should not fail when set up with an readable safepath' do
        %w(test_space_r test_space_rw).each do |path_space|
          assert Base.new(SafePath.new(path_space), nil)
        end
      end

      test 'should fail when set up with an unreadable safepath' do
        write_only = SafePath.new('test_space_w')
        assert_raises(SecurityError) { Base.new(write_only, nil) }
      end

      test 'should fail when set up with a non-safepath' do
        error = assert_raises(ArgumentError) { Base.new(NdrImport.root, nil) }
        assert_match(/file_name should be of type SafePath, but it is String/, error.message)
      end
    end
  end
end
@@ -0,0 +1,143 @@
1
+ require 'test_helper'
2
+ require 'ndr_import/file/delimited'
3
+
4
module NdrImport
  module File
    # Tests for the delimited file handler
    class DelimitedTest < ActiveSupport::TestCase
      def setup
        @permanent_test_files = SafePath.new('permanent_test_files')
      end

      test 'should read csv correctly' do
        delimited_handler('normal.csv', 'csv', 'col_sep' => nil).tables.each do |name, rows|
          assert_nil name
          assert_alphabet_table rows.to_a
        end
      end

      test 'should read pipe correctly' do
        delimited_handler('normal_pipe.csv', 'pipe', 'col_sep' => nil).tables.each do |name, rows|
          assert_nil name
          assert_alphabet_table rows.to_a
        end
      end

      test 'should read thorn correctly' do
        delimited_handler('normal_thorn.csv', 'thorn', 'col_sep' => nil).tables.each do |name, rows|
          assert_nil name
          assert_alphabet_table rows.to_a
        end
      end

      test 'should read csv with a BOM' do
        delimited_handler('bomd.csv', 'csv', 'col_sep' => nil).tables.each do |name, rows|
          assert_unnamed_enumerator name, rows
          assert_alphabet_table rows.to_a
        end
      end

      test 'should read windows-1252 csv' do
        delimited_handler('windows.csv', 'csv', 'col_sep' => nil).tables.each do |name, rows|
          assert_unnamed_enumerator name, rows
          assert_equal 1, rows.to_a.length
        end
      end

      test 'should read acsii-delimited csv' do
        handler = delimited_handler('high_ascii_delimited.txt', 'csv', 'col_sep' => "\xfe")
        handler.tables.each do |name, rows|
          assert_unnamed_enumerator name, rows
          loaded = rows.to_a
          assert_equal 2, loaded.length
          assert_equal '1234567890', loaded[0][1]
          assert_equal '1234567890', loaded[1][1]
        end
      end

      test 'should read line-by-line' do
        collected = []

        delimited_handler('normal.csv', 'csv').tables.each do |name, rows|
          assert_unnamed_enumerator name, rows
          rows.each { |row| collected << row }
        end

        assert_alphabet_table collected
      end

      test 'should report addition details upon failure to slurp csv' do
        error = assert_raises(CSVLibrary::MalformedCSVError) do
          delimited_handler('broken.csv', 'csv', 'col_sep' => nil).tables.each do |name, rows|
            assert_unnamed_enumerator name, rows
            rows.to_a
          end
        end

        assert_equal broken_csv_message, error.message
      end

      test 'should report addition details upon failure to read csv line-by-line' do
        collected = []
        error = assert_raises(CSVLibrary::MalformedCSVError) do
          delimited_handler('broken.csv', 'csv').tables.each do |name, rows|
            assert_unnamed_enumerator name, rows
            rows.each { |row| collected << row }
          end
        end

        assert collected.empty?, 'no rows should have been yielded'
        assert_equal broken_csv_message, error.message
      end

      private

      # Builds a delimited handler for the named fixture in permanent_test_files,
      # passing any further arguments straight through to the handler.
      def delimited_handler(fixture_name, format, *handler_options)
        fixture_path = @permanent_test_files.join(fixture_name)
        NdrImport::File::Delimited.new(fixture_path, format, *handler_options)
      end

      # Asserts that a table was yielded without a name, as an Enumerator of rows
      def assert_unnamed_enumerator(tablename, sheet)
        assert_nil tablename
        assert_instance_of Enumerator, sheet
      end

      # Asserts the expected leading rows: A..Z, then twenty-six 1s, then twenty-six 2s
      def assert_alphabet_table(rows)
        assert_equal(('A'..'Z').to_a, rows[0])
        assert_equal ['1'] * 26, rows[1]
        assert_equal ['2'] * 26, rows[2]
      end

      # The enriched error message expected when reading broken.csv
      def broken_csv_message
        'Invalid CSV format on row 2 of broken.csv. ' \
        'Original: Missing or stray quote in line 2'
      end
    end
  end
end