ndr_import 6.1.1 → 6.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0f55e93045c35c6bdbe7e87d04727c07ba4dc45aa780bbee7ce6715edf9dcfc1
4
- data.tar.gz: 2418dd97d27989fa10e327e9b7111fdfcae08ff18bf05f83b2683e616e93487e
3
+ metadata.gz: 33380c2e5d1abc43cb56a040c8b6255ec6315bb9860506c173dcd573df777f11
4
+ data.tar.gz: fe9a51d153f9f64cdeaf872198f06887ba0c0becf9877e9a0f6cbb511160ae10
5
5
  SHA512:
6
- metadata.gz: fdf1fba48a41e4e75f8a0d6cec5a08afc959a06bc887330b635ff801e33f3fbbd73632e1c4f24f5ea7c4e76c5499d7caa1470f908cc44ac8ba80e780cfaa2153
7
- data.tar.gz: 8bcfc1bcb1fd6fc6848cb2362980d932d04a8cb67e5fd8bb05cbd36815d937f8fd06ec8bee14e200de1b7ca054958ecf13415fb49e6bbb304a1fd13c62056d1e
6
+ metadata.gz: e11257edc7d89f30f05943c9eb992f7c3660d66a15100092639840768843456e15a8b3babe16194f04387bd890f399fab58238842276e1a8c40209cfc127ddf4
7
+ data.tar.gz: 27f159c69d12780b967caa9c5223f8ab91a33fa6cd21b62ebaa3811ab7a8d115d388fc084a8e2501663cbb24d81a97ae4cf090bd377ae4ccd1bbdb617e0ea9ab
data/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  This is the Public Health England (PHE) National Disease Registers (NDR) Import ETL ruby gem, providing:
4
4
 
5
- 1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), .xls(x) spreadsheets, .doc word documents, PDF, XML and Zip files.
5
+ 1. file import handlers for *extracting* data from delimited files (csv, pipe, tab, thorn), .xls(x) spreadsheets, .doc(x) word documents, PDF, XML and Zip files.
6
6
  2. table mappers for *transforming* tabular and non-tabular data into key value pairs grouped by a common "klass".
7
7
 
8
8
  ## Installation
data/code_safety.yml CHANGED
@@ -35,7 +35,7 @@ file safety:
35
35
  README.md:
36
36
  comments:
37
37
  reviewed_by: josh.pencheon
38
- safe_revision: ea7acbd1bbe99eb57f65c1dd381c309299ae3e08
38
+ safe_revision: 1b66cfcbb61dfac93c44889ca0ced5836101c20c
39
39
  Rakefile:
40
40
  comments:
41
41
  reviewed_by: josh.pencheon
@@ -67,7 +67,7 @@ file safety:
67
67
  lib/ndr_import/file/all.rb:
68
68
  comments:
69
69
  reviewed_by: josh.pencheon
70
- safe_revision: 2104514689a1a1286195fff18144a8cecb93048b
70
+ safe_revision: 1b66cfcbb61dfac93c44889ca0ced5836101c20c
71
71
  lib/ndr_import/file/base.rb:
72
72
  comments:
73
73
  reviewed_by: timgentry
@@ -76,6 +76,10 @@ file safety:
76
76
  comments:
77
77
  reviewed_by: josh.pencheon
78
78
  safe_revision: 902f5326d85372d9632de9869d6f56fc02b83a10
79
+ lib/ndr_import/file/docx.rb:
80
+ comments:
81
+ reviewed_by: josh.pencheon
82
+ safe_revision: 1b66cfcbb61dfac93c44889ca0ced5836101c20c
79
83
  lib/ndr_import/file/excel.rb:
80
84
  comments:
81
85
  reviewed_by: joshpencheon
@@ -159,7 +163,7 @@ file safety:
159
163
  lib/ndr_import/non_tabular/table.rb:
160
164
  comments:
161
165
  reviewed_by: josh.pencheon
162
- safe_revision: f7670cf27e137064c19490abc7df020fb6e95801
166
+ safe_revision: 337bf56e39f0f08cf7593b03867bb2da48630663
163
167
  lib/ndr_import/non_tabular_file_helper.rb:
164
168
  comments:
165
169
  reviewed_by: josh.pencheon
@@ -179,11 +183,11 @@ file safety:
179
183
  lib/ndr_import/version.rb:
180
184
  comments: another check?
181
185
  reviewed_by: josh.pencheon
182
- safe_revision: e350bd3f45bac82e5a05bd75eb87e9ae82486cf2
186
+ safe_revision: b85d90430543f238706d569bacd750ee50cb5493
183
187
  ndr_import.gemspec:
184
188
  comments:
185
189
  reviewed_by: josh.pencheon
186
- safe_revision: a1bd9cb14dd5a660119a339795fb6a9bf0c1819e
190
+ safe_revision: 1b66cfcbb61dfac93c44889ca0ced5836101c20c
187
191
  test/file/base_test.rb:
188
192
  comments:
189
193
  reviewed_by: timgentry
@@ -192,6 +196,10 @@ file safety:
192
196
  comments:
193
197
  reviewed_by: josh.pencheon
194
198
  safe_revision: 902f5326d85372d9632de9869d6f56fc02b83a10
199
+ test/file/docx_test.rb:
200
+ comments:
201
+ reviewed_by: josh.pencheon
202
+ safe_revision: 1b66cfcbb61dfac93c44889ca0ced5836101c20c
195
203
  test/file/excel_test.rb:
196
204
  comments:
197
205
  reviewed_by: joshpencheon
@@ -203,15 +211,15 @@ file safety:
203
211
  test/file/registry_test.rb:
204
212
  comments:
205
213
  reviewed_by: josh.pencheon
206
- safe_revision: dfe367e64c6e0ff80495cd7989fc50311d5b258f
214
+ safe_revision: 1b66cfcbb61dfac93c44889ca0ced5836101c20c
207
215
  test/file/text_test.rb:
208
216
  comments:
209
217
  reviewed_by: timgentry
210
218
  safe_revision: 3107f711805b6d3b89d32ec923178425aa600dac
211
219
  test/file/word_test.rb:
212
220
  comments:
213
- reviewed_by: timgentry
214
- safe_revision: c88000b32401b5ae9ef7f5878a9b630506ab5a94
221
+ reviewed_by: josh.pencheon
222
+ safe_revision: 1b66cfcbb61dfac93c44889ca0ced5836101c20c
215
223
  test/file/zip_test.rb:
216
224
  comments:
217
225
  reviewed_by: timgentry
@@ -255,7 +263,7 @@ file safety:
255
263
  test/non_tabular/table_test.rb:
256
264
  comments:
257
265
  reviewed_by: josh.pencheon
258
- safe_revision: f7670cf27e137064c19490abc7df020fb6e95801
266
+ safe_revision: 337bf56e39f0f08cf7593b03867bb2da48630663
259
267
  test/non_tabular_file_helper_test.rb:
260
268
  comments:
261
269
  reviewed_by: timgentry
@@ -312,6 +320,10 @@ file safety:
312
320
  comments:
313
321
  reviewed_by: timgentry
314
322
  safe_revision: 45da71ebd3acbc0fe53755bcd75483ba17cb6924
323
+ test/resources/hello_world.docx:
324
+ comments:
325
+ reviewed_by: josh.pencheon
326
+ safe_revision: 1b66cfcbb61dfac93c44889ca0ced5836101c20c
315
327
  test/resources/hello_world.pdf:
316
328
  comments:
317
329
  reviewed_by: josh.pencheon
@@ -356,6 +368,10 @@ file safety:
356
368
  comments:
357
369
  reviewed_by: timgentry
358
370
  safe_revision: 45da71ebd3acbc0fe53755bcd75483ba17cb6924
371
+ test/resources/not_a_word_file.docx:
372
+ comments:
373
+ reviewed_by: josh.pencheon
374
+ safe_revision: 1b66cfcbb61dfac93c44889ca0ced5836101c20c
359
375
  test/resources/not_sign_delimited.txt:
360
376
  comments:
361
377
  reviewed_by: josh.pencheon
@@ -1,5 +1,6 @@
1
1
  require_relative 'base'
2
2
  require_relative 'delimited'
3
+ require_relative 'docx'
3
4
  require_relative 'excel'
4
5
  require_relative 'pdf'
5
6
  require_relative 'text'
@@ -0,0 +1,29 @@
1
+ require 'docx'
2
+ require 'ndr_support/safe_file'
3
+ require_relative 'registry'
4
+
5
+ module NdrImport
6
+ # This is one of a collection of file handlers that deal with individual formats of data.
7
+ # They can be instantiated directly or via the factory method Registry.tables
8
+ module File
9
+ # This class is a modern Word document file handler that returns a single table.
10
+ # It only works on .docx documents
11
+ class Docx < Base
12
+ private
13
+
14
+ def rows(&block)
15
+ return enum_for(:rows) unless block
16
+
17
+ doc = ::Docx::Document.open(SafeFile.safepath_to_string(@filename))
18
+
19
+ doc.paragraphs.each do |p|
20
+ yield(p.to_s)
21
+ end
22
+ rescue StandardError => e
23
+ raise("#{SafeFile.basename(@filename)} [#{e.class}: #{e.message}]")
24
+ end
25
+ end
26
+
27
+ Registry.register(Docx, 'docx')
28
+ end
29
+ end
@@ -16,8 +16,9 @@ module NdrImport
16
16
 
17
17
  include UTF8Encoding
18
18
 
19
- NON_TABULAR_OPTIONS = %w(capture_start_line start_line_pattern end_line_pattern remove_lines
20
- start_in_a_record end_in_a_record)
19
+ NON_TABULAR_OPTIONS = %w(capture_end_line capture_start_line start_line_pattern
20
+ end_line_pattern remove_lines start_in_a_record
21
+ end_in_a_record).freeze
21
22
 
22
23
  def self.all_valid_options
23
24
  super - %w(delimiter tablename_pattern header_lines footer_lines) + NON_TABULAR_OPTIONS
@@ -118,7 +119,7 @@ module NdrImport
118
119
  start_record(line)
119
120
  elsif line =~ @end_line_pattern
120
121
  # This is an end line
121
- end_record
122
+ end_record(line)
122
123
  else
123
124
  @non_tabular_record << line if @in_a_record
124
125
  end
@@ -142,7 +143,9 @@ module NdrImport
142
143
 
143
144
  # Tabulate the record (if in one), flag it as no longer being in a record
144
145
  # and set the record to be a new one.
145
- def end_record
146
+ def end_record(line)
147
+ # Add the end line to the @non_tabular_record (if required) before ending the record
148
+ @non_tabular_record << line if @capture_end_line
146
149
  @tabular_array << @non_tabular_record.tabulate(column_mappings) if @in_a_record
147
150
  @in_a_record = false
148
151
  @non_tabular_record = NdrImport::NonTabular::Record.new
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
  # This stores the current version of the NdrImport gem
3
3
  module NdrImport
4
- VERSION = '6.1.1'.freeze
4
+ VERSION = '6.2.0'.freeze
5
5
  end
data/ndr_import.gemspec CHANGED
@@ -1,4 +1,3 @@
1
- # coding: utf-8
2
1
  lib = File.expand_path('../lib', __FILE__)
3
2
  $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
3
  require 'ndr_import/version'
@@ -27,11 +26,12 @@ Gem::Specification.new do |spec|
27
26
  spec.add_dependency 'rubyzip', '~> 1.2', '>= 1.2.2'
28
27
  spec.add_dependency 'roo', '~> 2.0'
29
28
 
29
+ spec.add_dependency 'docx', '~> 0.3'
30
+ spec.add_dependency 'msworddoc-extractor', '0.2.0'
30
31
  spec.add_dependency 'nokogiri', '~> 1.8', '>= 1.8.2'
32
+ spec.add_dependency 'pdf-reader', '1.2.0' # Raises warnings on Ruby 2.4+
31
33
  spec.add_dependency 'roo-xls'
32
34
  spec.add_dependency 'spreadsheet', '1.0.3'
33
- spec.add_dependency 'pdf-reader', '1.2.0' # Raises warnings on Ruby 2.4+
34
- spec.add_dependency 'msworddoc-extractor', '0.2.0'
35
35
 
36
36
  spec.required_ruby_version = '>= 2.2'
37
37
 
@@ -0,0 +1,39 @@
1
+ require 'test_helper'
2
+ require 'ndr_import/file/docx'
3
+
4
+ module NdrImport
5
+ module File
6
+ # Word .docx document file handler tests
7
+ class DocxTest < ActiveSupport::TestCase
8
+ def setup
9
+ @permanent_test_files = SafePath.new('permanent_test_files')
10
+ end
11
+
12
+ test 'should read word file' do
13
+ file_path = @permanent_test_files.join('hello_world.docx')
14
+ handler = NdrImport::File::Docx.new(file_path, nil)
15
+ handler.tables.each do |tablename, sheet|
16
+ assert_nil tablename
17
+ assert_instance_of Enumerator, sheet
18
+ assert_equal [
19
+ 'Hello world, this is a modern word document',
20
+ 'With more than one line of text',
21
+ 'Three in fact'
22
+ ], sheet.to_a
23
+ end
24
+ end
25
+
26
+ test 'should raise exception on invalid word file' do
27
+ assert_raises RuntimeError do
28
+ file_path = @permanent_test_files.join('not_a_word_file.docx')
29
+ handler = NdrImport::File::Docx.new(file_path, nil)
30
+ handler.tables.each do |tablename, sheet|
31
+ assert_nil tablename
32
+ assert_instance_of Enumerator, sheet
33
+ sheet.to_a
34
+ end
35
+ end
36
+ end
37
+ end
38
+ end
39
+ end
@@ -12,7 +12,7 @@ module NdrImport
12
12
 
13
13
  test 'Registry.handlers' do
14
14
  assert_instance_of Hash, NdrImport::File::Registry.handlers
15
- assert_equal %w[csv delimited doc nontabular pdf text txt xls xlsx zip],
15
+ assert_equal %w[csv delimited doc docx nontabular pdf text txt xls xlsx zip],
16
16
  NdrImport::File::Registry.handlers.keys.sort
17
17
  end
18
18
 
@@ -1,5 +1,5 @@
1
1
  require 'test_helper'
2
- require 'ndr_import/file/pdf'
2
+ require 'ndr_import/file/word'
3
3
 
4
4
  module NdrImport
5
5
  module File
@@ -48,7 +48,7 @@ STR
48
48
 
49
49
  def test_all_valid_options
50
50
  valid_options = %w[
51
- canonical_name capture_start_line columns end_in_a_record end_line_pattern
51
+ canonical_name capture_end_line capture_start_line columns end_in_a_record end_line_pattern
52
52
  filename_pattern format klass remove_lines start_in_a_record start_line_pattern
53
53
  ]
54
54
  assert_equal valid_options.sort,
@@ -322,6 +322,41 @@ STR
322
322
  refute results.any? { |result| result =~ /This is never captured/ }
323
323
  end
324
324
 
325
+ def test_should_capture_end_line
326
+ data = <<~STR.each_line
327
+ 111
328
+ Lorem ipsum dolor sit amet.
329
+ CAPTURE THIS CODE ABC
330
+ 111
331
+ Lorem ipsum dolor sit amet.
332
+ CAPTURE THIS CODE XYZ
333
+ 111
334
+ Lorem ipsum dolor sit amet.
335
+ CAPTURE THIS CODE 123
336
+ STR
337
+
338
+ table = YAML.load <<-YML.strip_heredoc
339
+ --- !ruby/object:NdrImport::NonTabular::Table
340
+ start_line_pattern: !ruby/regexp /\\A111\\z/
341
+ end_line_pattern: !ruby/regexp /\\ACAPTURE THIS CODE/
342
+ capture_start_line: true
343
+ capture_end_line: true
344
+ klass: SomeTestKlass
345
+ columns:
346
+ - column: one
347
+ non_tabular_cell:
348
+ lines: -1
349
+ capture: !ruby/regexp /\\A(.*)\\z/i
350
+ YML
351
+ enum = table.transform(data)
352
+ assert_instance_of Enumerator, enum
353
+
354
+ results = enum.map { |_klass, fields, _index| fields[:rawtext]['one'] }
355
+
356
+ assert_equal 3, results.count
357
+ assert_equal 'CAPTURE THIS CODE ABC', results.first
358
+ end
359
+
325
360
  def test_should_capture
326
361
  table = YAML.load <<-YML.strip_heredoc
327
362
  --- !ruby/object:NdrImport::NonTabular::Table
Binary file
File without changes
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ndr_import
3
3
  version: !ruby/object:Gem::Version
4
- version: 6.1.1
4
+ version: 6.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - NCRS Development Team
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-09-14 00:00:00.000000000 Z
11
+ date: 2018-09-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -85,53 +85,53 @@ dependencies:
85
85
  - !ruby/object:Gem::Version
86
86
  version: '2.0'
87
87
  - !ruby/object:Gem::Dependency
88
- name: nokogiri
88
+ name: docx
89
89
  requirement: !ruby/object:Gem::Requirement
90
90
  requirements:
91
91
  - - "~>"
92
92
  - !ruby/object:Gem::Version
93
- version: '1.8'
94
- - - ">="
95
- - !ruby/object:Gem::Version
96
- version: 1.8.2
93
+ version: '0.3'
97
94
  type: :runtime
98
95
  prerelease: false
99
96
  version_requirements: !ruby/object:Gem::Requirement
100
97
  requirements:
101
98
  - - "~>"
102
99
  - !ruby/object:Gem::Version
103
- version: '1.8'
104
- - - ">="
105
- - !ruby/object:Gem::Version
106
- version: 1.8.2
100
+ version: '0.3'
107
101
  - !ruby/object:Gem::Dependency
108
- name: roo-xls
102
+ name: msworddoc-extractor
109
103
  requirement: !ruby/object:Gem::Requirement
110
104
  requirements:
111
- - - ">="
105
+ - - '='
112
106
  - !ruby/object:Gem::Version
113
- version: '0'
107
+ version: 0.2.0
114
108
  type: :runtime
115
109
  prerelease: false
116
110
  version_requirements: !ruby/object:Gem::Requirement
117
111
  requirements:
118
- - - ">="
112
+ - - '='
119
113
  - !ruby/object:Gem::Version
120
- version: '0'
114
+ version: 0.2.0
121
115
  - !ruby/object:Gem::Dependency
122
- name: spreadsheet
116
+ name: nokogiri
123
117
  requirement: !ruby/object:Gem::Requirement
124
118
  requirements:
125
- - - '='
119
+ - - "~>"
126
120
  - !ruby/object:Gem::Version
127
- version: 1.0.3
121
+ version: '1.8'
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: 1.8.2
128
125
  type: :runtime
129
126
  prerelease: false
130
127
  version_requirements: !ruby/object:Gem::Requirement
131
128
  requirements:
132
- - - '='
129
+ - - "~>"
133
130
  - !ruby/object:Gem::Version
134
- version: 1.0.3
131
+ version: '1.8'
132
+ - - ">="
133
+ - !ruby/object:Gem::Version
134
+ version: 1.8.2
135
135
  - !ruby/object:Gem::Dependency
136
136
  name: pdf-reader
137
137
  requirement: !ruby/object:Gem::Requirement
@@ -147,19 +147,33 @@ dependencies:
147
147
  - !ruby/object:Gem::Version
148
148
  version: 1.2.0
149
149
  - !ruby/object:Gem::Dependency
150
- name: msworddoc-extractor
150
+ name: roo-xls
151
+ requirement: !ruby/object:Gem::Requirement
152
+ requirements:
153
+ - - ">="
154
+ - !ruby/object:Gem::Version
155
+ version: '0'
156
+ type: :runtime
157
+ prerelease: false
158
+ version_requirements: !ruby/object:Gem::Requirement
159
+ requirements:
160
+ - - ">="
161
+ - !ruby/object:Gem::Version
162
+ version: '0'
163
+ - !ruby/object:Gem::Dependency
164
+ name: spreadsheet
151
165
  requirement: !ruby/object:Gem::Requirement
152
166
  requirements:
153
167
  - - '='
154
168
  - !ruby/object:Gem::Version
155
- version: 0.2.0
169
+ version: 1.0.3
156
170
  type: :runtime
157
171
  prerelease: false
158
172
  version_requirements: !ruby/object:Gem::Requirement
159
173
  requirements:
160
174
  - - '='
161
175
  - !ruby/object:Gem::Version
162
- version: 0.2.0
176
+ version: 1.0.3
163
177
  - !ruby/object:Gem::Dependency
164
178
  name: bundler
165
179
  requirement: !ruby/object:Gem::Requirement
@@ -331,6 +345,7 @@ files:
331
345
  - lib/ndr_import/file/all.rb
332
346
  - lib/ndr_import/file/base.rb
333
347
  - lib/ndr_import/file/delimited.rb
348
+ - lib/ndr_import/file/docx.rb
334
349
  - lib/ndr_import/file/excel.rb
335
350
  - lib/ndr_import/file/pdf.rb
336
351
  - lib/ndr_import/file/registry.rb
@@ -360,6 +375,7 @@ files:
360
375
  - ndr_import.gemspec
361
376
  - test/file/base_test.rb
362
377
  - test/file/delimited_test.rb
378
+ - test/file/docx_test.rb
363
379
  - test/file/excel_test.rb
364
380
  - test/file/pdf_test.rb
365
381
  - test/file/registry_test.rb
@@ -390,6 +406,7 @@ files:
390
406
  - test/resources/hello_utf8.txt
391
407
  - test/resources/hello_windows.txt
392
408
  - test/resources/hello_world.doc
409
+ - test/resources/hello_world.docx
393
410
  - test/resources/hello_world.pdf
394
411
  - test/resources/hello_world.txt
395
412
  - test/resources/high_ascii_delimited.txt
@@ -401,6 +418,7 @@ files:
401
418
  - test/resources/normal_thorn.csv
402
419
  - test/resources/not_a_pdf.pdf
403
420
  - test/resources/not_a_word_file.doc
421
+ - test/resources/not_a_word_file.docx
404
422
  - test/resources/not_sign_delimited.txt
405
423
  - test/resources/sample_xls.xls
406
424
  - test/resources/sample_xlsx.xlsx
@@ -447,6 +465,7 @@ summary: NDR Import
447
465
  test_files:
448
466
  - test/file/base_test.rb
449
467
  - test/file/delimited_test.rb
468
+ - test/file/docx_test.rb
450
469
  - test/file/excel_test.rb
451
470
  - test/file/pdf_test.rb
452
471
  - test/file/registry_test.rb
@@ -477,6 +496,7 @@ test_files:
477
496
  - test/resources/hello_utf8.txt
478
497
  - test/resources/hello_windows.txt
479
498
  - test/resources/hello_world.doc
499
+ - test/resources/hello_world.docx
480
500
  - test/resources/hello_world.pdf
481
501
  - test/resources/hello_world.txt
482
502
  - test/resources/high_ascii_delimited.txt
@@ -488,6 +508,7 @@ test_files:
488
508
  - test/resources/normal_thorn.csv
489
509
  - test/resources/not_a_pdf.pdf
490
510
  - test/resources/not_a_word_file.doc
511
+ - test/resources/not_a_word_file.docx
491
512
  - test/resources/not_sign_delimited.txt
492
513
  - test/resources/sample_xls.xls
493
514
  - test/resources/sample_xlsx.xlsx