ndr_import 11.2.1 → 11.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 77f2f2adb4de01a7dca5f2e004976a0f96e8d486f4bf573e24543c1a27b894dd
4
- data.tar.gz: eb83c8b95408ee9761b513c9e9309a6b565fd937e7cfba585088bd3d7e7b2573
3
+ metadata.gz: 81d7c1189af2322610f0e094c3b469dc187a2738d4e97edfd0a03a2f5bec079a
4
+ data.tar.gz: 5a2553b6bbe30e6b74142395abde887d02298a268e893adb49674effb6047e31
5
5
  SHA512:
6
- metadata.gz: 6dcfa45678b041cbe5c298b43c935aa2dacc47da9421cfbc6d6f8e646049f89326fb7a7a41e6e5ea283a1114ce367e7cff92d77dd4b83db56775e1d69a93fc6e
7
- data.tar.gz: 2443a1c7b4ab1f1c8fcc105cf7c3d29b1885f996e41ad55b65f0ea74d64fbe43042d50a2bb3943a5563374aa8bdc0dd3d47e066226f010922f1fd7e69c167641
6
+ metadata.gz: c7ce3014831428ab2e0c151af76aa45b0b6f04f6c61232d170a2c88952b3a20c4e8a7e48631cf0f6776da137371d709dc3de8cbe25781083861f218676c80def
7
+ data.tar.gz: 76839142c01dcab91c21e69d6f96b36de761f89e71621ad4f6a86c9e11becf029b358e30aabbc628590763f6b66f018189c587fe1f9b139858bdaf4df7d1d6f1
data/CHANGELOG.md CHANGED
@@ -1,6 +1,20 @@
1
1
  ## [Unreleased]
2
2
  * no unreleased changes *
3
3
 
4
+ ## 11.3.1 / 2025-09-02
5
+ ### Enhancement
6
+ * Performance improvements for XML mask mappings
7
+
8
+ ## 11.3.0 / 2025-02-11
9
+ ### Fixed
10
+ * Fix CSV parsing bug
11
+
12
+ ### Added
13
+ * Column zipping functionality
14
+ * Capturing Column name
15
+ * Regular expression column names
16
+ * VCF file/table metadata storage
17
+
4
18
  ## 11.2.1 / 2024-11-18
5
19
  ### Fixed
6
20
  * Support Ruby 3.2 and 3.3, Rails 7.1, 7.2 and 8.0
@@ -8,13 +8,38 @@ module NdrImport
8
8
  module File
9
9
  # This class is a vcf file handler that returns a single table.
10
10
  class Vcf < Base
11
+ attr_accessor :vcf_file_metadata
12
+
13
+ def initialize(*)
14
+ super
15
+
16
+ @vcf_file_metadata = @options['vcf_file_metadata']
17
+ assign_file_metadata
18
+ end
19
+
11
20
  private
12
21
 
22
# Populates file_metadata from the "##" header lines of the VCF file at @filename.
#
# vcf_file_metadata is expected to be a Hash of attribute => pattern. For every "##" line
# that a pattern matches, the first capture group (or nil when that capture is blank) is
# stored under the attribute; if several lines match the same pattern, the last one wins.
# Does nothing unless vcf_file_metadata is a Hash.
def assign_file_metadata
  return unless vcf_file_metadata.is_a?(Hash)

  file_metadata_hash = {}

  # Stream the file line by line instead of loading it all into memory first
  ::File.foreach(@filename) do |line|
    next unless line.start_with?('##')

    vcf_file_metadata.each do |attribute, pattern|
      # Match once and reuse the result, rather than match? followed by match
      match = line.match(pattern)
      file_metadata_hash[attribute] = match[1].presence if match
    end
  end

  self.file_metadata = file_metadata_hash
end
37
+
13
38
  def rows(&block)
14
39
  return enum_for(:rows) unless block
15
40
 
16
41
  ::File.read(@filename).each_line do |line|
17
- next if line =~ /^##/
42
+ next if line.match?(/^##/)
18
43
 
19
44
  yield BioVcf::VcfLine.parse(line)
20
45
  end
@@ -28,7 +28,7 @@ module NdrImport
28
28
  end
29
29
 
30
30
  # Iterate through the file line by line, yielding each one in turn.
31
- def delimited_rows(path, col_sep = nil, liberal = false)
31
+ def delimited_rows(path, col_sep = nil, liberal = false) # rubocop:disable Style/OptionalBooleanParameter
32
32
  return enum_for(:delimited_rows, path, col_sep, liberal) unless block_given?
33
33
 
34
34
  safe_path = SafeFile.safepath_to_string(path)
@@ -36,7 +36,7 @@ module NdrImport
36
36
 
37
37
  # By now, we know `options` should let us read the whole
38
38
  # file successfully; if there are problems, we should crash.
39
- CSV.foreach(safe_path, options.delete(:mode), **options) do |line|
39
+ CSV.foreach(safe_path, options[:mode], **options.except(:mode)) do |line|
40
40
  yield line.map(&:to_s)
41
41
  end
42
42
  end
@@ -30,6 +30,9 @@ module NdrImport::Mapper
30
30
  STANDARD_MAPPING = 'standard_mapping'.freeze
31
31
  UNPACK_PATTERN = 'unpack_pattern'.freeze
32
32
  VALIDATES = 'validates'.freeze
33
+ ZIP_ORDER = 'zip_order'.freeze
34
+ SPLIT_CHAR = 'split_char'.freeze
35
+ MAP_COLUMNAME_TO = 'map_columname_to'.freeze
33
36
  end
34
37
 
35
38
  private
@@ -118,10 +121,18 @@ module NdrImport::Mapper
118
121
  # Store the raw column value
119
122
  rawtext[rawtext_column_name] = raw_value
120
123
 
124
+ # If configured, store the column name in the given field
125
+ if column_mapping[Strings::MAP_COLUMNAME_TO].present?
126
+ data[column_mapping[Strings::MAP_COLUMNAME_TO]] ||= {}
127
+ data[column_mapping[Strings::MAP_COLUMNAME_TO]][:values] = [column_mapping['column']]
128
+ rawtext[column_mapping[Strings::MAP_COLUMNAME_TO]] = column_mapping['column']
129
+ end
130
+
121
131
  next unless column_mapping.key?(Strings::MAPPINGS)
132
+
122
133
  column_mapping[Strings::MAPPINGS].each do |field_mapping|
123
134
  # create a duplicate of the raw value we can manipulate
124
- original_value = raw_value ? raw_value.dup : nil
135
+ original_value = raw_value&.dup
125
136
 
126
137
  replace_before_mapping(original_value, field_mapping)
127
138
  value = mapped_value(original_value, field_mapping)
@@ -137,7 +148,8 @@ module NdrImport::Mapper
137
148
 
138
149
  data[field] ||= {}
139
150
  data[field][:values] ||= [] # "better" values come earlier
140
- data[field][:compact] = true unless data[field].key?(:compact)
151
+ data[field][:zipped_values] ||= []
152
+ data[field][:compact] = true unless data[field].key?(:compact)
141
153
 
142
154
  if field_mapping[Strings::ORDER]
143
155
  data[field][:join] ||= field_mapping[Strings::JOIN]
@@ -148,6 +160,9 @@ module NdrImport::Mapper
148
160
  data[field][:values][field_mapping[Strings::ORDER] - 1] = value
149
161
  elsif field_mapping[Strings::PRIORITY]
150
162
  data[field][:values][field_mapping[Strings::PRIORITY]] = value
163
+ elsif field_zippable?(field_mapping, data[field])
164
+ data[field][:split_char] ||= field_mapping[Strings::SPLIT_CHAR]
165
+ data[field][:zipped_values][field_mapping[Strings::ZIP_ORDER] - 1] = value
151
166
  else
152
167
  data[field][:values].unshift(value) # new "best" value
153
168
  end
@@ -160,6 +175,7 @@ module NdrImport::Mapper
160
175
  # and one to many, for cross-populating
161
176
  data.each do |field, field_data|
162
177
  values = field_data[:values]
178
+ zipped_values = field_data[:zipped_values]
163
179
 
164
180
  attributes[field] =
165
181
  if field_data.key?(:join)
@@ -167,6 +183,9 @@ module NdrImport::Mapper
167
183
  values = values.map(&:presence)
168
184
  values.compact! if field_data[:compact]
169
185
  values.join(field_data[:join])
186
+ elsif zipped_values.present?
187
+ values = zipped_values.map { |value| value.split(field_data[:split_char]) }
188
+ values.first.zip(*values[1..])
170
189
  else
171
190
  values.detect(&:present?)
172
191
  end
@@ -176,6 +195,12 @@ module NdrImport::Mapper
176
195
  attributes
177
196
  end
178
197
 
198
+ def field_zippable?(field_mapping, data_field)
199
+ return false if field_mapping[Strings::ZIP_ORDER].blank?
200
+
201
+ data_field[:split_char].present? || field_mapping[Strings::SPLIT_CHAR].present?
202
+ end
203
+
179
204
  def mapped_value(original_value, field_mapping)
180
205
  if field_mapping.include?(Strings::FORMAT)
181
206
  begin
@@ -66,6 +66,7 @@ module NdrImport
66
66
  return enum_for(:process_line, line) unless block
67
67
 
68
68
  if @row_index < header_lines
69
+ mutate_regexp_columns(line)
69
70
  consume_header_line(line, @columns)
70
71
  else
71
72
  transform_line(line, @row_index, &block)
@@ -79,6 +80,15 @@ module NdrImport
79
80
  @notifier.try(:processed, @row_index)
80
81
  end
81
82
 
83
+ # Update 'column' values expressed as a regular expression
84
# Replaces each Regexp 'column' definition with the header text it matches.
#
# line - the header row; the cell at position N is compared with column mapping N.
#
# Mappings whose 'column' is not a Regexp are left untouched, as are Regexp columns whose
# header cell is missing or does not match the pattern. Matching mappings in @columns are
# updated in place.
def mutate_regexp_columns(line)
  @columns.each_with_index do |column, index|
    pattern = column['column']
    next unless pattern.is_a?(Regexp)

    header = line[index]
    # A header row shorter than the mappings has no cell here; skip instead of failing on nil
    next if header.nil?

    column['column'] = header if header.match?(pattern)
  end
end
91
+
82
92
  # This method transforms an incoming line of data by applying each of the klass masked
83
93
  # mappings to the line and yielding the klass and fields for each mapped klass.
84
94
  def transform_line(line, index)
@@ -227,7 +237,7 @@ module NdrImport
227
237
 
228
238
  # returns the column names as we expect to receive them
229
239
  def column_names(column_mappings)
230
- column_mappings.map { |c| (c['column'] || c['standard_mapping']).downcase }
240
+ column_mappings.map { |c| (c['column'] || c['standard_mapping']).try(:downcase) }
231
241
  end
232
242
 
233
243
  # If specified in the mapping, stop transforming data at a given index (column)
@@ -51,22 +51,25 @@ module NdrImport
51
51
  NdrImport::File::Registry.files(source_file, 'unzip_path' => unzip_path).each do |filename|
52
52
  # now at the individual file level, can we find the table mapping?
53
53
  table_mapping = get_table_mapping(filename, nil)
54
-
55
- options = { 'unzip_path' => unzip_path,
56
- 'col_sep' => table_mapping.try(:delimiter),
57
- 'file_password' => table_mapping.try(:file_password),
58
- 'liberal_parsing' => table_mapping.try(:liberal_parsing),
59
- 'xml_record_xpath' => table_mapping.try(:xml_record_xpath),
60
- 'slurp' => table_mapping.try(:slurp),
61
- 'yield_xml_record' => table_mapping.try(:yield_xml_record),
62
- 'pattern_match_record_xpath' => table_mapping.try(:pattern_match_record_xpath),
63
- 'xml_file_metadata' => table_mapping.try(:xml_file_metadata) }
54
+ options = table_options_from(table_mapping).merge { 'unzip_path' => unzip_path }
64
55
 
65
56
  tables = NdrImport::File::Registry.tables(filename, table_mapping.try(:format), options)
66
57
  yield_tables_and_their_content(filename, tables, &block)
67
58
  end
68
59
  end
69
60
 
61
+ def table_options_from(table_mapping)
62
+ { 'col_sep' => table_mapping.try(:delimiter),
63
+ 'file_password' => table_mapping.try(:file_password),
64
+ 'liberal_parsing' => table_mapping.try(:liberal_parsing),
65
+ 'xml_record_xpath' => table_mapping.try(:xml_record_xpath),
66
+ 'slurp' => table_mapping.try(:slurp),
67
+ 'yield_xml_record' => table_mapping.try(:yield_xml_record),
68
+ 'pattern_match_record_xpath' => table_mapping.try(:pattern_match_record_xpath),
69
+ 'xml_file_metadata' => table_mapping.try(:xml_file_metadata),
70
+ 'vcf_file_metadata' => table_mapping.try(:vcf_file_metadata) }
71
+ end
72
+
70
73
  # This method does the table row yielding for the extract method, setting the notifier
71
74
  # so that we can monitor progress
72
75
  def yield_tables_and_their_content(filename, tables, &block)
@@ -6,7 +6,7 @@ module NdrImport
6
6
  # All other Table logic is inherited from `NdrImport::Table`
7
7
  class Table < ::NdrImport::Table
8
8
  def self.all_valid_options
9
- super - %w[delimiter header_lines footer_lines]
9
+ super - %w[delimiter header_lines footer_lines] + %w[vcf_file_metadata]
10
10
  end
11
11
 
12
12
  def header_lines
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
  # This stores the current version of the NdrImport gem
3
3
  module NdrImport
4
- VERSION = '11.2.1'
4
+ VERSION = '11.3.1'
5
5
  end
@@ -4,70 +4,156 @@ module NdrImport
4
4
  # Overriding the NdrImport::Table method to avoid memoizing. This is by design, column mappings
5
5
  # can change if new mappings are added on the fly where repeating sections are present
6
6
  class MaskedMappings
7
- attr_accessor :klass, :augmented_columns
7
+ DO_NOT_CAPTURE_MAPPING = { 'do_not_capture' => true }.freeze
8
+ KLASS_KEY = 'klass'.freeze
9
+ COLUMN_KEY = 'column'.freeze
10
+ STANDARD_MAPPING_KEY = 'standard_mapping'.freeze
11
+ DO_NOT_CAPTURE_KEY = 'do_not_capture'.freeze
12
+ XML_CELL_KEY = 'xml_cell'.freeze
13
+ KEEP_KLASS_KEY = 'keep_klass'.freeze
14
+
15
+ # Pre-compiled regex for numbered variants
16
+ NUMBERED_VARIANT_PATTERN = /#\d+\z/
17
+
18
+ attr_reader :klass, :augmented_columns
8
19
 
9
20
  def initialize(klass, augmented_columns)
10
21
  @klass = klass
11
22
  @augmented_columns = augmented_columns
23
+ @column_count = augmented_columns.size
24
+ @has_klass = !klass.nil?
25
+
26
+ freeze
12
27
  end
13
28
 
14
29
  def call
15
- return { klass => augmented_columns } if klass.present?
16
-
17
- masked_mappings = column_level_klass_masked_mappings
18
-
19
- augmented_masked_mappings = masked_mappings
20
- # Remove any masked klasses where additional columns mappings
21
- # have been added for repeated sections
22
- # e.g. SomeTestKlass column mappings are not needed if SomeTestKlass#1
23
- # have been added
24
- masked_mappings.each do |masked_key, columns|
25
- # There may be occasions where the e.g. SomeTestKlass should be kept,
26
- # This can be flagged in the one the klass's column mappings
27
- next if columns.any? { |column| column.dig('xml_cell', 'keep_klass') }
28
-
29
- if masked_mappings.keys.any? { |key| key =~ /\A#{masked_key}#\d+\z/ }
30
- augmented_masked_mappings.delete(masked_key)
30
+ return { @klass => @augmented_columns } if @has_klass
31
+
32
+ masked_mappings = build_masked_mappings
33
+ remove_superseded_base_klasses(masked_mappings)
34
+ end
35
+
36
+ private
37
+
38
# Builds the klass => masked column mappings hash used when no table-level klass is set.
#
# Collects the klasses referenced by the column mappings and, for each one, stores the
# result of mask_mappings_for_klass under that klass.
#
# Returns a Hash of klass => Array of column mappings.
# Raises RuntimeError if any capturable column mapping does not define a klass.
def build_masked_mappings
  all_klasses, klassless_column_names = extract_klasses_and_validate

  raise "Missing klass for column(s): #{klassless_column_names.join(', ')}" unless klassless_column_names.empty?

  # A plain hash is used deliberately: Hash.new(n) would set n as the default value for
  # missing keys rather than pre-sizing the hash.
  all_klasses.each_with_object({}) do |current_klass, result|
    result[current_klass] = mask_mappings_for_klass(current_klass)
  end
end
55
+
56
+ def extract_klasses_and_validate
57
+ klasses_set = Set.new
58
+ klassless_column_names = []
59
+
60
+ @augmented_columns.each do |mapping|
61
+ mapping_klass = mapping[KLASS_KEY]
62
+
63
+ if mapping_klass.nil?
64
+ # Only collect klassless mappings that aren't marked as do_not_capture
65
+ klassless_column_names << column_name_from(mapping) unless mapping[DO_NOT_CAPTURE_KEY]
66
+ elsif mapping_klass.is_a?(Array)
67
+ klasses_set.merge(mapping_klass.compact)
68
+ else
69
+ klasses_set.add(mapping_klass)
31
70
  end
32
71
  end
33
72
 
34
- augmented_masked_mappings
73
+ [klasses_set, klassless_column_names]
35
74
  end
36
75
 
37
- private
76
+ def column_name_from(mapping)
77
+ mapping[COLUMN_KEY] || mapping[STANDARD_MAPPING_KEY]
78
+ end
38
79
 
39
- # This method duplicates the mappings and applies a do_not_capture mask to those that do not
40
- # relate to this klass, returning the masked mappings
41
- def mask_mappings_by_klass(klass)
42
- augmented_columns.deep_dup.map do |mapping|
43
- Array(mapping['klass']).flatten.include?(klass) ? mapping : { 'do_not_capture' => true }
80
+ def mask_mappings_for_klass(target_klass)
81
+ # Pre-allocate array with exact size
82
+ result = Array.new(@column_count)
83
+
84
+ # Single pass with index tracking
85
+ @augmented_columns.each_with_index do |mapping, index|
86
+ result[index] = if mapping_applies_to_klass?(mapping, target_klass)
87
+ mapping.deep_dup
88
+ else
89
+ DO_NOT_CAPTURE_MAPPING
90
+ end
91
+ end
92
+
93
+ result
94
+ end
95
+
96
+ def mapping_applies_to_klass?(mapping, target_klass)
97
+ mapping_klass = mapping[KLASS_KEY]
98
+ return false unless mapping_klass
99
+
100
+ # Optimized type checking and inclusion
101
+ case mapping_klass
102
+ when Array
103
+ mapping_klass.include?(target_klass)
104
+ when String
105
+ mapping_klass == target_klass
106
+ else
107
+ false
44
108
  end
45
109
  end
46
110
 
47
- def column_level_klass_masked_mappings
48
- ensure_mappings_define_klass
111
+ def remove_superseded_base_klasses(masked_mappings)
112
+ return masked_mappings if masked_mappings.size <= 1
113
+
114
+ # Pre-build numbered variants lookup for O(1) access
115
+ numbered_klasses = build_numbered_klasses_lookup(masked_mappings.keys)
116
+ return masked_mappings if numbered_klasses.empty?
117
+
118
+ klasses_to_keep = compute_klasses_to_keep(masked_mappings)
49
119
 
50
- # Loop through each klass
51
- masked_mappings = {}
52
- augmented_columns.pluck('klass').flatten.compact.uniq.each do |klass|
53
- # Do not capture fields that relate to other klasses
54
- masked_mappings[klass] = mask_mappings_by_klass(klass)
120
+ masked_mappings.select do |klass, _columns|
121
+ klasses_to_keep.include?(klass) || numbered_klasses.exclude?(klass)
55
122
  end
56
- masked_mappings
57
123
  end
58
124
 
59
- # This method ensures that every column mapping defines a klass (unless it is a column that
60
- # we do not capture). It is only used where a table level klass is not defined.
61
- def ensure_mappings_define_klass
62
- klassless_mappings = augmented_columns.
63
- select { |mapping| mapping.nil? || mapping['klass'].nil? }.
64
- reject { |mapping| mapping['do_not_capture'] }.
65
- map { |mapping| mapping['column'] || mapping['standard_mapping'] }
125
+ def build_numbered_klasses_lookup(klass_keys)
126
+ numbered_klasses = Set.new
66
127
 
67
- return if klassless_mappings.empty?
128
+ klass_keys.each do |key|
129
+ next unless key.match?(NUMBERED_VARIANT_PATTERN)
130
+
131
+ # Extract base klass name (everything before #)
132
+ base_klass = key.split(NUMBERED_VARIANT_PATTERN, 2).first
133
+ numbered_klasses.add(base_klass)
134
+ end
135
+
136
+ numbered_klasses
137
+ end
138
+
139
+ def compute_klasses_to_keep(masked_mappings)
140
+ klasses_to_keep = Set.new
141
+
142
+ masked_mappings.each do |klass, columns|
143
+ klasses_to_keep.add(klass) if should_keep_base_klass?(columns)
144
+ end
145
+
146
+ klasses_to_keep
147
+ end
148
+
149
+ def should_keep_base_klass?(columns)
150
+ # Fast iteration with early termination
151
+ columns.each do |column|
152
+ xml_cell = column[XML_CELL_KEY]
153
+ return true if xml_cell && xml_cell[KEEP_KLASS_KEY]
154
+ end
68
155
 
69
- # All column mappings for the single item file require a klass definition.
70
- raise "Missing klass for column(s): #{klassless_mappings.to_sentence}"
156
+ false
71
157
  end
72
158
  end
73
159
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ndr_import
3
3
  version: !ruby/object:Gem::Version
4
- version: 11.2.1
4
+ version: 11.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - NCRS Development Team
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-11-18 00:00:00.000000000 Z
11
+ date: 2025-09-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activemodel
@@ -479,7 +479,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
479
479
  - !ruby/object:Gem::Version
480
480
  version: '0'
481
481
  requirements: []
482
- rubygems_version: 3.3.27
482
+ rubygems_version: 3.4.19
483
483
  signing_key:
484
484
  specification_version: 4
485
485
  summary: NDR Import