ndr_import 3.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (103) hide show
  1. checksums.yaml +15 -0
  2. data/.gitignore +14 -0
  3. data/.rubocop.yml +27 -0
  4. data/.ruby-version +1 -0
  5. data/.travis.yml +22 -0
  6. data/CODE_OF_CONDUCT.md +13 -0
  7. data/Gemfile +4 -0
  8. data/Guardfile +16 -0
  9. data/LICENSE.txt +21 -0
  10. data/README.md +69 -0
  11. data/Rakefile +13 -0
  12. data/code_safety.yml +374 -0
  13. data/gemfiles/Gemfile.rails32 +5 -0
  14. data/gemfiles/Gemfile.rails32.lock +142 -0
  15. data/gemfiles/Gemfile.rails41 +5 -0
  16. data/gemfiles/Gemfile.rails41.lock +145 -0
  17. data/gemfiles/Gemfile.rails42 +5 -0
  18. data/gemfiles/Gemfile.rails42.lock +145 -0
  19. data/lib/ndr_import.rb +13 -0
  20. data/lib/ndr_import/csv_library.rb +40 -0
  21. data/lib/ndr_import/file/all.rb +8 -0
  22. data/lib/ndr_import/file/base.rb +76 -0
  23. data/lib/ndr_import/file/delimited.rb +86 -0
  24. data/lib/ndr_import/file/excel.rb +131 -0
  25. data/lib/ndr_import/file/pdf.rb +38 -0
  26. data/lib/ndr_import/file/registry.rb +50 -0
  27. data/lib/ndr_import/file/text.rb +52 -0
  28. data/lib/ndr_import/file/word.rb +30 -0
  29. data/lib/ndr_import/file/zip.rb +67 -0
  30. data/lib/ndr_import/helpers/file/delimited.rb +105 -0
  31. data/lib/ndr_import/helpers/file/excel.rb +181 -0
  32. data/lib/ndr_import/helpers/file/pdf.rb +29 -0
  33. data/lib/ndr_import/helpers/file/word.rb +27 -0
  34. data/lib/ndr_import/helpers/file/xml.rb +45 -0
  35. data/lib/ndr_import/helpers/file/zip.rb +44 -0
  36. data/lib/ndr_import/mapper.rb +220 -0
  37. data/lib/ndr_import/mapping_error.rb +5 -0
  38. data/lib/ndr_import/non_tabular/column_mapping.rb +73 -0
  39. data/lib/ndr_import/non_tabular/line.rb +46 -0
  40. data/lib/ndr_import/non_tabular/mapping.rb +35 -0
  41. data/lib/ndr_import/non_tabular/record.rb +99 -0
  42. data/lib/ndr_import/non_tabular/table.rb +193 -0
  43. data/lib/ndr_import/non_tabular_file_helper.rb +160 -0
  44. data/lib/ndr_import/standard_mappings.rb +23 -0
  45. data/lib/ndr_import/table.rb +179 -0
  46. data/lib/ndr_import/version.rb +4 -0
  47. data/ndr_import.gemspec +44 -0
  48. data/test/file/base_test.rb +54 -0
  49. data/test/file/delimited_test.rb +143 -0
  50. data/test/file/excel_test.rb +85 -0
  51. data/test/file/pdf_test.rb +35 -0
  52. data/test/file/registry_test.rb +60 -0
  53. data/test/file/text_test.rb +92 -0
  54. data/test/file/word_test.rb +35 -0
  55. data/test/file/zip_test.rb +47 -0
  56. data/test/helpers/file/delimited_test.rb +113 -0
  57. data/test/helpers/file/excel_test.rb +97 -0
  58. data/test/helpers/file/pdf_test.rb +26 -0
  59. data/test/helpers/file/word_test.rb +26 -0
  60. data/test/helpers/file/xml_test.rb +131 -0
  61. data/test/helpers/file/zip_test.rb +75 -0
  62. data/test/mapper_test.rb +551 -0
  63. data/test/non_tabular/mapping_test.rb +36 -0
  64. data/test/non_tabular/table_test.rb +510 -0
  65. data/test/non_tabular_file_helper_test.rb +501 -0
  66. data/test/readme_test.rb +53 -0
  67. data/test/resources/bomd.csv +3 -0
  68. data/test/resources/broken.csv +3 -0
  69. data/test/resources/filesystem_paths.yml +26 -0
  70. data/test/resources/flat_file.pdf +0 -0
  71. data/test/resources/flat_file.txt +27 -0
  72. data/test/resources/flat_file.yml +20 -0
  73. data/test/resources/hello_utf16be.txt +0 -0
  74. data/test/resources/hello_utf16le.txt +0 -0
  75. data/test/resources/hello_utf8.txt +2 -0
  76. data/test/resources/hello_windows.txt +2 -0
  77. data/test/resources/hello_world.doc +0 -0
  78. data/test/resources/hello_world.pdf +0 -0
  79. data/test/resources/hello_world.txt +2 -0
  80. data/test/resources/high_ascii_delimited.txt +2 -0
  81. data/test/resources/malformed.xml +6 -0
  82. data/test/resources/normal.csv +3 -0
  83. data/test/resources/normal.csv.zip +0 -0
  84. data/test/resources/normal_pipe.csv +3 -0
  85. data/test/resources/normal_thorn.csv +3 -0
  86. data/test/resources/not_a_pdf.pdf +0 -0
  87. data/test/resources/not_a_word_file.doc +0 -0
  88. data/test/resources/sample_xls.xls +0 -0
  89. data/test/resources/sample_xlsx.xlsx +0 -0
  90. data/test/resources/standard_mappings.yml +39 -0
  91. data/test/resources/txt_file_xls_extension.xls +1 -0
  92. data/test/resources/txt_file_xlsx_extension.xlsx +1 -0
  93. data/test/resources/utf-16be_xml.xml +0 -0
  94. data/test/resources/utf-16be_xml_with_declaration.xml +0 -0
  95. data/test/resources/utf-16le_xml.xml +0 -0
  96. data/test/resources/utf-8_xml.xml +9 -0
  97. data/test/resources/windows-1252_xml.xml +9 -0
  98. data/test/resources/windows.csv +5 -0
  99. data/test/resources/xlsx_file_xls_extension.xls +0 -0
  100. data/test/standard_mappings_test.rb +22 -0
  101. data/test/table_test.rb +288 -0
  102. data/test/test_helper.rb +13 -0
  103. metadata +443 -0
@@ -0,0 +1,23 @@
1
+ module NdrImport
2
+ # NdrImport::StandardMappings stores the standard mappings hash
3
+ class StandardMappings
4
+ # mappings are stored as a class level instance variable
5
+ class << self
6
+ # Gets the standard mappings
7
+ def mappings
8
+ if defined?(@standard_mappings)
9
+ @standard_mappings
10
+ else
11
+ fail 'NdrImport::StandardMappings not configured!'
12
+ end
13
+ end
14
+
15
+ # Sets the standard mappings
16
+ def mappings=(hash)
17
+ fail ArgumentError unless hash.is_a?(Hash)
18
+
19
+ @standard_mappings = hash
20
+ end
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,179 @@
1
+ require 'ndr_import/mapper'
2
+ require 'active_support/core_ext/hash'
3
+
4
+ module NdrImport
5
+ # This class maintains the state of a table mapping and encapsulates the logic
6
+ # required to transform a table of data into "records". Particular attention
7
+ # has been paid to using enumerables throughout to help with the transformation
8
+ # of large quantities of data.
9
+ class Table
10
+ include NdrImport::Mapper
11
+
12
+ def self.all_valid_options
13
+ %w(canonical_name filename_pattern tablename_pattern header_lines footer_lines format klass columns)
14
+ end
15
+
16
+ def all_valid_options
17
+ self.class.all_valid_options
18
+ end
19
+
20
+ attr_reader(*all_valid_options)
21
+ attr_writer :notifier
22
+
23
+ def initialize(options = {})
24
+ options.stringify_keys! if options.is_a?(Hash)
25
+ validate_options(options)
26
+
27
+ all_valid_options.each do |key|
28
+ # This pattern is used to only set attributes if option specified,
29
+ # which makes for more concise YAML serialization.
30
+ options[key] && instance_variable_set("@#{key}", options[key])
31
+ end
32
+
33
+ @row_index = 0
34
+ end
35
+
36
+ def match(filename, tablename)
37
+ ::File.basename(filename) =~ (filename_pattern || /\A.*\z/) &&
38
+ (tablename.nil? || tablename =~ (tablename_pattern || /\A.*\z/))
39
+ end
40
+
41
+ # This method transforms a table of data, given a line array/enumerator and yields
42
+ # klass, fields and index (input row number) for each record that it would create
43
+ # as a result of the transformation process.
44
+ def transform(lines, &block)
45
+ return enum_for(:transform, lines) unless block
46
+
47
+ @row_index = 0
48
+ @header_valid = false
49
+ @notifier.try(:started)
50
+
51
+ skip_footer_lines(lines, footer_lines).each do |line|
52
+ process_line(line, &block)
53
+ end
54
+
55
+ @notifier.try(:finished)
56
+ end
57
+
58
+ # This method processes a line of data. If it is a header line it validates it, otherwise
59
+ # transforms it. It also increments the row index and notifies the number of lines processed.
60
+ def process_line(line, &block)
61
+ return enum_for(:process_line, line) unless block
62
+
63
+ if @row_index < header_lines
64
+ validate_header(line, @columns)
65
+ else
66
+ fail 'Header is not valid' if header_lines > 0 && !header_valid?
67
+ transform_line(line, @row_index, &block)
68
+ end
69
+
70
+ @row_index += 1
71
+ @notifier.try(:processed, @row_index)
72
+ end
73
+
74
+ # This method transforms an incoming line of data by applying each of the klass masked
75
+ # mappings to the line and yielding the klass and fields for each mapped klass.
76
+ def transform_line(line, index)
77
+ return enum_for(:transform_line, line, index) unless block_given?
78
+
79
+ masked_mappings.each do |klass, klass_mappings|
80
+ fields = mapped_line(line, klass_mappings)
81
+ next if fields[:skip].to_s == 'true'
82
+ yield(klass, fields, index)
83
+ end
84
+ end
85
+
86
+ def header_valid?
87
+ @header_valid == true
88
+ end
89
+
90
+ private
91
+
92
+ # This method uses a buffer to not yield the last <buffer_size> iterations of an enumerable.
93
+ # We use it to skip footer lines (without having to convert the enumerable to an array).
94
+ def skip_footer_lines(lines, buffer_size)
95
+ return enum_for(:skip_footer_lines, lines, buffer_size) unless block_given?
96
+
97
+ buffer = []
98
+ lines.each do |line|
99
+ buffer.unshift(line)
100
+
101
+ yield buffer.pop if buffer.length > buffer_size
102
+ end
103
+ end
104
+
105
+ # This method memoizes the klass masked mappings. Where a table level
106
+ # klass is defined it is used with the whole mapping, otherwise the masks are generated.
107
+ def masked_mappings
108
+ @masked_mappings ||= begin
109
+ if @klass
110
+ { @klass => @columns }
111
+ else
112
+ column_level_klass_masked_mappings
113
+ end
114
+ end
115
+ end
116
+
117
+ # This method generates a hash of klass based mappings, one for each defined klass
118
+ # where the whole line mapping is masked to just the data items of that klass.
119
+ def column_level_klass_masked_mappings
120
+ ensure_mappings_define_klass
121
+
122
+ # Loop through each klass
123
+ masked_mappings = {}
124
+ @columns.map { |mapping| mapping['klass'] }.flatten.compact.uniq.each do |klass|
125
+ # Duplicate the column mappings and do not capture fields that relate to other klasses
126
+ masked_mappings[klass] = mask_mappings_by_klass(klass)
127
+ end
128
+ masked_mappings
129
+ end
130
+
131
+ # This method ensures that every column mapping defines a klass (unless it is a column that
132
+ # we do not capture). It is only used where a table level klass is not defined.
133
+ def ensure_mappings_define_klass
134
+ klassless_mappings = @columns.
135
+ select { |mapping| mapping.nil? || mapping['klass'].nil? }.
136
+ reject { |mapping| mapping['do_not_capture'] }.
137
+ map { |mapping| mapping['column'] || mapping['standard_mapping'] }
138
+
139
+ return if klassless_mappings.empty?
140
+
141
+ # All column mappings for the single item file require a klass definition.
142
+ fail "Missing klass for column(s): #{klassless_mappings.to_sentence}"
143
+ end
144
+
145
+ # This method duplicates the mappings and applies a do_not_capture mask to those that do not
146
+ # relate to this klass, returning the masked mappings
147
+ def mask_mappings_by_klass(klass)
148
+ @columns.dup.map do |mapping|
149
+ if Array(mapping['klass']).include?(klass)
150
+ mapping
151
+ else
152
+ { 'do_not_capture' => true }
153
+ end
154
+ end
155
+ end
156
+
157
+ def validate_options(hash)
158
+ fail ArgumentError unless hash.is_a?(Hash)
159
+
160
+ unrecognised_options = hash.keys - all_valid_options
161
+ return if unrecognised_options.empty?
162
+ fail ArgumentError, "Unrecognised options: #{unrecognised_options.inspect}"
163
+ end
164
+
165
+ # if there is a header, then check the column headings are as expected in the correct order
166
+ def validate_header(line, column_mappings)
167
+ columns = column_names(column_mappings)
168
+ fail 'Number of columns does not match' if line.length != columns.length
169
+
170
+ return unless line.map(&:downcase) == columns
171
+ @header_valid = true
172
+ end
173
+
174
+ # returns the column names as we expect to receive them
175
+ def column_names(column_mappings)
176
+ column_mappings.map { |c| (c['column'] || c['standard_mapping']).downcase }
177
+ end
178
+ end # class Table
179
+ end
@@ -0,0 +1,4 @@
1
+ # This stores the current version of the NdrImport gem
2
+ module NdrImport
3
+ VERSION = '3.0.0'
4
+ end
@@ -0,0 +1,44 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'ndr_import/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'ndr_import'
8
+ spec.version = NdrImport::VERSION
9
+ spec.authors = ['NCRS Development Team']
10
+ spec.email = []
11
+ spec.summary = 'NDR Import'
12
+ spec.description = 'NDR ETL Importer'
13
+ spec.homepage = 'https://github.com/PublicHealthEngland/ndr_import'
14
+ spec.license = 'MIT'
15
+
16
+ # Exclude older versions of this gem from the package.
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |s| s =~ %r{^pkg/} }
18
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
19
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
20
+ spec.require_paths = ['lib']
21
+
22
+ spec.add_dependency 'activesupport', '>= 3.2.18', '< 5.0.0'
23
+ spec.add_dependency 'ndr_support', '~> 3.0'
24
+
25
+ spec.add_dependency 'rubyzip', '~> 1.1'
26
+ spec.add_dependency 'roo', '~> 2.0'
27
+ # roo requires nokogiri >=1.5, but nokogiri (1.6.1) requires Ruby version >= 1.9.2.
28
+ spec.add_dependency 'nokogiri', '~> 1.6'
29
+ spec.add_dependency 'roo-xls'
30
+ spec.add_dependency 'spreadsheet', '1.0.3' # Aligning with encore
31
+ spec.add_dependency 'pdf-reader', '1.2.0'
32
+ spec.add_dependency 'msworddoc-extractor', '0.2.0'
33
+
34
+ spec.required_ruby_version = '>= 1.9.3'
35
+
36
+ spec.add_development_dependency 'bundler', '~> 1.7'
37
+ spec.add_development_dependency 'rake', '~> 10.0'
38
+ spec.add_development_dependency 'minitest'
39
+ spec.add_development_dependency 'guard'
40
+ spec.add_development_dependency 'guard-rubocop'
41
+ spec.add_development_dependency 'guard-test'
42
+ spec.add_development_dependency 'terminal-notifier-guard' if RUBY_PLATFORM =~ /darwin/
43
+ spec.add_development_dependency 'simplecov'
44
+ end
@@ -0,0 +1,54 @@
1
+ require 'test_helper'
2
+ require 'ndr_import/file/registry'
3
+
4
+ module NdrImport
5
+ module File
6
+ # Base file handler tests
7
+ class BaseTest < ActiveSupport::TestCase
8
+ # Handles a single table file, but for test purposes,
9
+ # I couldn't be bothered to implement it fully
10
+ class SingleTableLazyDeveloper < ::NdrImport::File::Base
11
+ end
12
+
13
+ def setup
14
+ @permanent_test_files = SafePath.new('permanent_test_files')
15
+ end
16
+
17
+ test 'should fail on not implementing rows' do
18
+ begin
19
+ Registry.register(SingleTableLazyDeveloper, 'lazy_dev')
20
+
21
+ exception = assert_raises(RuntimeError) do
22
+ file_path = @permanent_test_files.join('normal.csv')
23
+ handler = SingleTableLazyDeveloper.new(file_path, 'lazy_dev')
24
+
25
+ handler.tables.each do |tablename, sheet|
26
+ assert_nil tablename
27
+ assert_instance_of Enumerator, sheet
28
+ sheet.to_a
29
+ end
30
+ end
31
+
32
+ msg = 'Implement NdrImport::File::BaseTest::SingleTableLazyDeveloper#rows'
33
+ assert_equal msg, exception.message
34
+ ensure
35
+ Registry.unregister('lazy_dev')
36
+ end
37
+ end
38
+
39
+ test 'should not fail when set up with an readable safepath' do
40
+ assert Base.new(SafePath.new('test_space_r'), nil)
41
+ assert Base.new(SafePath.new('test_space_rw'), nil)
42
+ end
43
+
44
+ test 'should fail when set up with an unreadable safepath' do
45
+ assert_raises(SecurityError) { Base.new(SafePath.new('test_space_w'), nil) }
46
+ end
47
+
48
+ test 'should fail when set up with a non-safepath' do
49
+ exception = assert_raises(ArgumentError) { Base.new(NdrImport.root, nil) }
50
+ assert exception.message =~ /file_name should be of type SafePath, but it is String/
51
+ end
52
+ end
53
+ end
54
+ end
@@ -0,0 +1,143 @@
1
+ require 'test_helper'
2
+ require 'ndr_import/file/delimited'
3
+
4
+ module NdrImport
5
+ module File
6
+ # Delimited file handler tests
7
+ class DelimitedTest < ActiveSupport::TestCase
8
+ def setup
9
+ @permanent_test_files = SafePath.new('permanent_test_files')
10
+ end
11
+
12
+ test 'should read csv correctly' do
13
+ file_path = @permanent_test_files.join('normal.csv')
14
+ handler = NdrImport::File::Delimited.new(file_path, 'csv', 'col_sep' => nil)
15
+ handler.tables.each do |tablename, sheet|
16
+ assert_nil tablename
17
+ sheet = sheet.to_a
18
+ assert_equal(('A'..'Z').to_a, sheet[0])
19
+ assert_equal ['1'] * 26, sheet[1]
20
+ assert_equal ['2'] * 26, sheet[2]
21
+ end
22
+ end
23
+
24
+ test 'should read pipe correctly' do
25
+ file_path = @permanent_test_files.join('normal_pipe.csv')
26
+ handler = NdrImport::File::Delimited.new(file_path, 'pipe', 'col_sep' => nil)
27
+ handler.tables.each do |tablename, sheet|
28
+ assert_nil tablename
29
+ sheet = sheet.to_a
30
+ assert_equal(('A'..'Z').to_a, sheet[0])
31
+ assert_equal ['1'] * 26, sheet[1]
32
+ assert_equal ['2'] * 26, sheet[2]
33
+ end
34
+ end
35
+
36
+ test 'should read thorn correctly' do
37
+ file_path = @permanent_test_files.join('normal_thorn.csv')
38
+ handler = NdrImport::File::Delimited.new(file_path, 'thorn', 'col_sep' => nil)
39
+ handler.tables.each do |tablename, sheet|
40
+ assert_nil tablename
41
+ sheet = sheet.to_a
42
+ assert_equal(('A'..'Z').to_a, sheet[0])
43
+ assert_equal ['1'] * 26, sheet[1]
44
+ assert_equal ['2'] * 26, sheet[2]
45
+ end
46
+ end
47
+
48
+ test 'should read csv with a BOM' do
49
+ file_path = @permanent_test_files.join('bomd.csv')
50
+ handler = NdrImport::File::Delimited.new(file_path, 'csv', 'col_sep' => nil)
51
+ handler.tables.each do |tablename, sheet|
52
+ assert_nil tablename
53
+ assert_instance_of Enumerator, sheet
54
+ sheet = sheet.to_a
55
+ assert_equal(('A'..'Z').to_a, sheet[0])
56
+ assert_equal ['1'] * 26, sheet[1]
57
+ assert_equal ['2'] * 26, sheet[2]
58
+ end
59
+ end
60
+
61
+ test 'should read windows-1252 csv' do
62
+ file_path = @permanent_test_files.join('windows.csv')
63
+ handler = NdrImport::File::Delimited.new(file_path, 'csv', 'col_sep' => nil)
64
+ handler.tables.each do |tablename, sheet|
65
+ assert_nil tablename
66
+ assert_instance_of Enumerator, sheet
67
+ sheet = sheet.to_a
68
+ assert_equal 1, sheet.length
69
+ end
70
+ end
71
+
72
+ test 'should read acsii-delimited csv' do
73
+ file_path = @permanent_test_files.join('high_ascii_delimited.txt')
74
+ handler = NdrImport::File::Delimited.new(file_path, 'csv', 'col_sep' => "\xfe")
75
+ handler.tables.each do |tablename, sheet|
76
+ assert_nil tablename
77
+ assert_instance_of Enumerator, sheet
78
+ sheet = sheet.to_a
79
+ assert_equal 2, sheet.length
80
+ assert_equal '1234567890', sheet[0][1]
81
+ assert_equal '1234567890', sheet[1][1]
82
+ end
83
+ end
84
+
85
+ test 'should read line-by-line' do
86
+ rows = []
87
+ file_path = @permanent_test_files.join('normal.csv')
88
+ handler = NdrImport::File::Delimited.new(file_path, 'csv')
89
+
90
+ handler.tables.each do |tablename, sheet|
91
+ assert_nil tablename
92
+ assert_instance_of Enumerator, sheet
93
+ sheet.each do |row|
94
+ rows << row
95
+ end
96
+ end
97
+
98
+ assert_equal(('A'..'Z').to_a, rows[0])
99
+ assert_equal ['1'] * 26, rows[1]
100
+ assert_equal ['2'] * 26, rows[2]
101
+ end
102
+
103
+ test 'should report addition details upon failure to slurp csv' do
104
+ exception = assert_raises(CSVLibrary::MalformedCSVError) do
105
+ file_path = @permanent_test_files.join('broken.csv')
106
+ handler = NdrImport::File::Delimited.new(file_path, 'csv', 'col_sep' => nil)
107
+
108
+ handler.tables.each do |tablename, sheet|
109
+ assert_nil tablename
110
+ assert_instance_of Enumerator, sheet
111
+ sheet.to_a
112
+ end
113
+ end
114
+
115
+ msg = 'Invalid CSV format on row 2 of broken.csv. ' \
116
+ 'Original: Missing or stray quote in line 2'
117
+ assert_equal msg, exception.message
118
+ end
119
+
120
+ test 'should report addition details upon failure to read csv line-by-line' do
121
+ rows_yielded = []
122
+ exception = assert_raises(CSVLibrary::MalformedCSVError) do
123
+ file_path = @permanent_test_files.join('broken.csv')
124
+ handler = NdrImport::File::Delimited.new(file_path, 'csv')
125
+
126
+ handler.tables.each do |tablename, sheet|
127
+ assert_nil tablename
128
+ assert_instance_of Enumerator, sheet
129
+ sheet.each do |row|
130
+ rows_yielded << row
131
+ end
132
+ end
133
+ end
134
+
135
+ assert rows_yielded.empty?, 'no rows should have been yielded'
136
+
137
+ msg = 'Invalid CSV format on row 2 of broken.csv. ' \
138
+ 'Original: Missing or stray quote in line 2'
139
+ assert_equal msg, exception.message
140
+ end
141
+ end
142
+ end
143
+ end