ds-convert 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +294 -0
  3. data/Rakefile +12 -0
  4. data/config/settings.yml +150 -0
  5. data/exe/ds-convert +149 -0
  6. data/exe/ds-recon +275 -0
  7. data/exe/ds-validate-csv +40 -0
  8. data/exe/marc-mrc-to-xml.rb +80 -0
  9. data/lib/ds/cli.rb +102 -0
  10. data/lib/ds/constants.rb +166 -0
  11. data/lib/ds/converter/converter.rb +124 -0
  12. data/lib/ds/converter/writer.rb +50 -0
  13. data/lib/ds/converter.rb +7 -0
  14. data/lib/ds/csv_util.rb +43 -0
  15. data/lib/ds/data/berkeley-arks.txt +4000 -0
  16. data/lib/ds/data/getty-aat-centuries.csv +71 -0
  17. data/lib/ds/data/iiif_manifests.csv +122 -0
  18. data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
  19. data/lib/ds/ds_error.rb +1 -0
  20. data/lib/ds/extractor/base_record_locator.rb +24 -0
  21. data/lib/ds/extractor/base_term.rb +79 -0
  22. data/lib/ds/extractor/csv_record_locator.rb +13 -0
  23. data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
  24. data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
  25. data/lib/ds/extractor/genre.rb +45 -0
  26. data/lib/ds/extractor/language.rb +31 -0
  27. data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
  28. data/lib/ds/extractor/material.rb +12 -0
  29. data/lib/ds/extractor/name.rb +50 -0
  30. data/lib/ds/extractor/place.rb +11 -0
  31. data/lib/ds/extractor/subject.rb +58 -0
  32. data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
  33. data/lib/ds/extractor/title.rb +52 -0
  34. data/lib/ds/extractor/xml_record_locator.rb +38 -0
  35. data/lib/ds/extractor.rb +24 -0
  36. data/lib/ds/institutions.rb +55 -0
  37. data/lib/ds/manifest/base_id_validator.rb +76 -0
  38. data/lib/ds/manifest/constants.rb +67 -0
  39. data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
  40. data/lib/ds/manifest/entry.rb +133 -0
  41. data/lib/ds/manifest/manifest.rb +74 -0
  42. data/lib/ds/manifest/manifest_validator.rb +256 -0
  43. data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
  44. data/lib/ds/manifest.rb +30 -0
  45. data/lib/ds/mapper/base_mapper.rb +221 -0
  46. data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
  47. data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
  48. data/lib/ds/mapper/marc_mapper.rb +87 -0
  49. data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
  50. data/lib/ds/mapper.rb +13 -0
  51. data/lib/ds/recon/constants.rb +56 -0
  52. data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
  53. data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
  54. data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
  55. data/lib/ds/recon/recon_builder.rb +183 -0
  56. data/lib/ds/recon/recon_data.rb +37 -0
  57. data/lib/ds/recon/recon_manager.rb +92 -0
  58. data/lib/ds/recon/source_enumerator.rb +21 -0
  59. data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
  60. data/lib/ds/recon/type/all_subjects.rb +18 -0
  61. data/lib/ds/recon/type/genres.rb +50 -0
  62. data/lib/ds/recon/type/languages.rb +38 -0
  63. data/lib/ds/recon/type/materials.rb +40 -0
  64. data/lib/ds/recon/type/named_subjects.rb +20 -0
  65. data/lib/ds/recon/type/names.rb +65 -0
  66. data/lib/ds/recon/type/places.rb +40 -0
  67. data/lib/ds/recon/type/recon_type.rb +136 -0
  68. data/lib/ds/recon/type/splits.rb +34 -0
  69. data/lib/ds/recon/type/subjects.rb +65 -0
  70. data/lib/ds/recon/type/titles.rb +38 -0
  71. data/lib/ds/recon/url_lookup.rb +52 -0
  72. data/lib/ds/recon.rb +292 -0
  73. data/lib/ds/source/base_source.rb +32 -0
  74. data/lib/ds/source/ds_csv.rb +18 -0
  75. data/lib/ds/source/ds_mets_xml.rb +20 -0
  76. data/lib/ds/source/marc_xml.rb +22 -0
  77. data/lib/ds/source/source_cache.rb +69 -0
  78. data/lib/ds/source/tei_xml.rb +22 -0
  79. data/lib/ds/source.rb +20 -0
  80. data/lib/ds/util/cache.rb +111 -0
  81. data/lib/ds/util/csv_validator.rb +209 -0
  82. data/lib/ds/util/csv_writer.rb +42 -0
  83. data/lib/ds/util/strings.rb +194 -0
  84. data/lib/ds/util.rb +37 -0
  85. data/lib/ds/version.rb +5 -0
  86. data/lib/ds.rb +237 -0
  87. metadata +246 -0
@@ -0,0 +1,209 @@
1
+ # frozen_string_literal: true
2
+
3
module DS
  module Util
    ##
    # Stateless validators for rows of CSV data.
    #
    # Every method is a class method that returns an array of error-message
    # strings; an empty array means no problem was found. Row numbers in the
    # messages are whatever the caller passes as +row_num+
    # (+validate_all_rows+ numbers rows starting at 1).
    #
    # NOTE(review): blank checks rely on +blank?+, which this file neither
    # defines nor requires -- presumably ActiveSupport is loaded elsewhere;
    # confirm.
    class CsvValidator

      ERROR_UNBALANCED_SUBFIELDS = 'Row has subfields of different lengths'
      ERROR_BLANK_SUBFIELDS = 'Row has blank subfields'
      ERROR_MISSING_REQUIRED_COLUMNS = "CSV is missing required column(s)"
      ERROR_TRAILING_WHITESPACE = 'Row contains trailing whitespace'

      # split on pipes that are not escaped with '\'
      PIPE_SPLIT_REGEXP = %r{(?<!\\)\|}
      # split on pipes and semicolons that are not escaped with '\'
      PIPE_SEMICOLON_REGEXP = %r{(?<!\\)[;|]}

      # Maximum number of subfields to allow in a row; this number is
      # arbitrarily set to 100,000. Passing a positive limit to
      # String#split keeps trailing empty values in the output array.
      MAX_SPLITS = 100000

      # Validates all rows of data against a set of required columns,
      # balanced columns, and nested columns.
      #
      # The required columns are first checked against the first row only;
      # if any are missing, that single error is returned and no other
      # validation is run.
      #
      # @param rows [Array<Hash,CSV::Row>] The rows of data to be validated.
      # @param required_columns [Array<Symbol>] The required columns for each row.
      # @param balanced_columns [Hash<Symbol, Array<Symbol>>] A hash of groups of balanced columns.
      # @param nested_columns [Hash, Array] The columns that may also be subdivided by semicolons.
      # @param allow_blank [Boolean] Whether to allow blank subfields in balanced columns.
      # @return [Array<String>] An array of error messages, if any.
      def self.validate_all_rows rows, required_columns: [], balanced_columns: {}, nested_columns: {}, allow_blank: false
        errors = validate_required_columns(rows.first, row_num: 1, required_columns: required_columns)
        return errors unless errors.blank?

        rows.each_with_index do |row, index|
          errors += validate_row(
            row, row_num: index + 1,
            required_columns: required_columns,
            balanced_columns: balanced_columns,
            nested_columns: nested_columns,
            allow_blank: allow_blank
          )
        end
        errors
      end

      # Validates a row of data against a set of required columns,
      # balanced columns and the trailing-whitespace rule.
      #
      #     # validate a CSV row for required columns and balanced columns
      #     # columns a and b are required,
      #     # columns a and b, and c and d are balanced
      #     # balanced_columns keys are used as labels for the error messages
      #     required_columns = [:a, :b]
      #     balanced_columns = { group1: [:a, :b], group2: [:c, :d] }
      #     CsvValidator.validate_row(
      #       row, row_num: 1,
      #       required_columns: required_columns,
      #       balanced_columns: balanced_columns
      #     )
      #
      # If a required column is missing, only that error is returned; the
      # remaining checks are skipped.
      #
      # @param row [Hash,CSV::Row] The row of data to be validated.
      # @param row_num [Integer] The row number used in error messages.
      # @param required_columns [Array<Symbol>] The required columns for the row.
      # @param balanced_columns [Hash<Symbol, Array<Symbol>>] a hash of groups of balanced columns; see example above
      # @param nested_columns [Hash, Array] The columns that may also be subdivided by semicolons.
      # @param allow_blank [Boolean] Whether to allow blank subfields in balanced columns
      # @return [Array<String>] An array of error messages, if any.
      def self.validate_row row, row_num:, required_columns: [], balanced_columns: {}, nested_columns: {}, allow_blank: false
        errors = []
        errors += validate_required_columns(row, row_num: row_num, required_columns: required_columns)
        return errors unless errors.blank?

        errors += validate_balanced_columns(row, row_num: row_num, balanced_columns: balanced_columns, allow_blank: allow_blank)
        errors += validate_whitespace(row, row_num: row_num, nested_columns: nested_columns)
        errors
      end

      # Validates the presence of required columns in a given row of data.
      #
      # Column names are compared to the row's keys as-is (no String/Symbol
      # conversion). A +nil+ row is treated as a row with no columns.
      #
      # @param row [Hash, CSV::Row] The row of data to be validated.
      # @param row_num [Integer] The row number used in the error message.
      # @param required_columns [Array<Symbol>] The required columns for the row.
      # @return [Array<String>] An array of error messages, if any; otherwise, an empty array.
      def self.validate_required_columns row, row_num:, required_columns:
        return [] if required_columns.blank?

        missing = required_columns - row.to_h.keys
        return [] if missing.empty?

        ["#{ERROR_MISSING_REQUIRED_COLUMNS}: #{missing.map(&:inspect).join(', ')} row #{row_num}"]
      end

      # Validates the balanced columns in a given row of data.
      #
      # +balanced_columns+ is a hash of groups of balanced columns. Each
      # column is looked up in the row by its String name first, then by
      # its Symbol name.
      #
      # @param row [Hash] The row of data to be validated.
      # @param row_num [Integer] The row number used in error messages.
      # @param balanced_columns [Hash<Symbol, Array<Symbol>>] A hash of groups of balanced columns.
      # @param allow_blank [Boolean] Whether to allow blank subfields in balanced columns.
      # @return [Array<String>] An array of error messages, if any; otherwise, an empty array.
      #
      # @example
      #     # row has unbalanced columns :a and :b
      #     row = { a: 'a', b: 'b|b', c: 'c', d: 'd' }
      #     balanced_columns = { group1: [:a, :b] }
      #     CsvValidator.validate_balanced_columns(
      #       row, row_num: 1, balanced_columns: balanced_columns
      #     ) # => ["Row has subfields of different lengths: group: :group1, sizes: [1, 2], row: [\"a\", \"b|b\"] (row 1)"]
      def self.validate_balanced_columns row, row_num:, balanced_columns: {}, allow_blank: false
        return [] if balanced_columns.blank?

        errors = []
        balanced_columns.each do |group, columns|
          values = columns.map { |column| row[column.to_s] || row[column.to_sym] }
          errors += validate_row_splits(group: group, row_num: row_num, row_values: values, allow_blank: allow_blank)
        end
        errors
      end

      ##
      # Check that each value in +row_values+ has the same number of
      # subfields **and** that none of the subfields is blank. Return an
      # array with one message per problem found, or an empty array when
      # the values are valid.
      #
      # If +allow_blank+ is +true+, ignore blanks, only check for balanced
      # subfields.
      #
      # Separators escaped with a backslash (e.g., '\|') are not treated as
      # subfield boundaries, in agreement with PIPE_SPLIT_REGEXP and
      # PIPE_SEMICOLON_REGEXP.
      #
      # Note: It is always allowed for every value to be blank (empty string).
      # When row values are +nil+ they are treated as empty strings.
      # Blank values are treated as single values.
      #
      # So:
      #
      #     [ 'a|b|c', '1|2|3' ]    # => valid, return []
      #     [ '', '' ]              # => valid, return []
      #     [ 'a', '']              # => valid, return []
      #     [ 'a\|b', '1' ]         # => valid (escaped pipe), return []
      #     [ 'a|b|c', '1|2' ]      # => not valid, return ERROR_UNBALANCED_SUBFIELDS
      #     [ 'a|b', '']            # => not valid, return ERROR_UNBALANCED_SUBFIELDS
      #     [ 'a||c', '1|2|3' ]     # => not valid, return ERROR_BLANK_SUBFIELDS
      #     [ 'a||c', '1|2|3' ]     # => valid if allow_blank == true, return []
      #
      # @param [Array<String>] row_values an array of strings from one or more columns
      # @param [Integer] row_num the row number used in error messages
      # @param [String] separators a list of allowed subfield separators; e.g., ';', '|', ';|'
      # @param [Boolean] allow_blank whether any of the subfields may be blank
      # @param [Object] group label of the column group, used in error messages
      # @return [Array<String>] the row errors, or [] if there are no errors
      def self.validate_row_splits row_values: [], row_num:, separators: '|;', allow_blank: false, group: nil
        errors = []
        return errors if row_values.all? { |val| val.blank? }

        # Build the splitter once per call; the negative lookbehind skips
        # separators that are escaped with a backslash.
        splitter = %r{(?<!\\)[#{Regexp.escape separators}]}

        # Each value must split into the same number of subfields:
        #
        #   ['a|b|c', '1|2|3'] => [['a', 'b', 'c'], ['1', '2', '3']]  # 3 and 3; valid
        #   ['a|b|c', '1|2']   => [['a', 'b', 'c'], ['1', '2']]       # 3 and 2; not valid
        splits = row_values.map { |value| value.to_s.split splitter, MAX_SPLITS }
        sizes = splits.map(&:size)

        # Nothing to balance when no value has more than one subfield.
        return errors if sizes.all? { |size| [0, 1].include? size }

        if sizes.uniq.size > 1
          errors << "#{ERROR_UNBALANCED_SUBFIELDS}: group: #{group.inspect}, sizes: #{sizes.inspect}, row: #{row_values.inspect} (row #{row_num})"
        end

        # stop here if we don't have to check for blanks
        return errors if allow_blank

        # report an error if any of the subfields is blank
        if splits.flatten.any?(&:blank?)
          errors << "#{ERROR_BLANK_SUBFIELDS}: group: #{group.inspect}, row: #{row_values.inspect} (row #{row_num})"
        end
        errors
      end

      # Validates a row of data for trailing whitespace. Returns an
      # error for each column that contains trailing whitespace.
      #
      # +nested_columns+ names the columns that can be subdivided by
      # semicolons as well as pipes. It is only queried with +include?+,
      # so it may be an array of column names or a hash with column names
      # as keys and group names as values; e.g.,
      #
      #     nested_columns = {
      #       "subject_label" => :subjects,
      #       "subject"       => :subjects,
      #       "genre_label"   => :genres,
      #       "genre"         => :genres
      #     }
      #
      # Column names are compared to the row's keys as-is.
      #
      # @param row [Hash] The row of data to be validated.
      # @param row_num [Integer] The row number used in error messages.
      # @param nested_columns [Hash, Array] The nested columns; see above.
      # @return [Array<String>] An array of error messages, if any.
      def self.validate_whitespace row, row_num:, nested_columns: []
        errors = []

        row.each do |column, value|
          # Assume all columns can have subfields delimited by pipes;
          # some columns are "nested"; that is, they can be further
          # subdivided by semicolons. Select the regexp for the
          # subfield type
          split_chars = nested_columns.include?(column) ? PIPE_SEMICOLON_REGEXP : PIPE_SPLIT_REGEXP
          if value.to_s.split(split_chars).any? { |sub| sub =~ %r{\s+$} }
            errors << "#{ERROR_TRAILING_WHITESPACE}: column #{column.inspect}, value: #{value.inspect} (row #{row_num})"
          end
        end

        errors
      end

    end
  end
end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
module DS
  module Util
    ##
    # Writes a CSV file consisting of a header row followed by data rows.
    #
    # The output file is opened in 'w+' mode on every call to +write+, so
    # each call replaces the file's previous content.
    #
    # NOTE(review): this file does not require 'csv' itself; it relies on
    # CSV having been loaded elsewhere -- confirm.
    class CSVWriter
      attr_reader :headers
      attr_reader :outfile

      ##
      # @param outfile [String] path of the CSV file to write
      # @param headers [Array] values of the header row
      def initialize outfile:, headers: []
        @headers = headers
        @outfile = outfile
      end

      ##
      # Write the header row and then the data rows to +outfile+.
      #
      # When a block is given, it receives the open CSV object after the
      # header row has been written and +rows+ is ignored; otherwise each
      # element of +rows+ is appended as a row.
      #
      # @param rows [Enumerable, nil] the rows to write when no block is given
      # @yieldparam csv [CSV] the open CSV, positioned after the header row
      # @raise [RuntimeError] if no block is given and +rows+ is not Enumerable
      def write rows=nil, &block
        if block_given?
          _write_with_block(&block)
        elsif rows.is_a? Enumerable
          _write_all rows
        else
          raise "#{self.class.name}#write requires a block or an Enumerable of rows; got: #{rows.inspect}"
        end
      end

      private

      # Open +outfile+, write the header row, and yield the CSV object.
      def _write_with_block
        CSV.open outfile, 'w+', headers: true do |csv|
          csv << headers
          yield csv
        end
      end

      # Write the header row followed by every row in +rows+.
      def _write_all rows
        _write_with_block do |csv|
          rows.each { |row| csv << row }
        end
      end
    end
  end
end
@@ -0,0 +1,194 @@
1
+ # frozen_string_literal: true
2
+
3
module DS
  module Util
    ##
    # String-cleaning helpers: normalization, pipe escaping and terminal
    # punctuation handling.
    #
    # NOTE(review): +terminate+ relies on +blank?+ and +is_url?+ on the URI
    # library; this file requires neither -- presumably both are loaded
    # elsewhere; confirm.
    module Strings

      # Upper bound on the number of cleaning passes made by +clean_string+;
      # guards against a terminator that never reaches a stable result.
      MAX_CLEAN_PASSES = 10

      ##
      # This method calls
      #
      # - +convert_mets_superscript+
      # - +remove_brackets+
      # - +fix_double_periods+
      # - +escape_pipes+
      # - +normalize_string+
      #
      # If +terminator+ is non-nil, the result is also passed to
      # +terminate+, and the whole sequence is repeated (with the same
      # +terminator+ and +force+) until a pass makes no further change or
      # MAX_CLEAN_PASSES is reached.
      #
      # Set +terminator+ to +''+ (empty string) to remove trailing
      # punctuation.
      #
      # @param [String] string the string to clean
      # @param [String] terminator the terminator to use, if any
      # @param [Boolean] force use exact termination with +terminator+
      # @return [String] the cleaned string
      def clean_string string, terminator: nil, force: false
        current = string
        MAX_CLEAN_PASSES.times do
          normal = normalize_string(
            escape_pipes(
              fix_double_periods(
                remove_brackets(
                  convert_mets_superscript(current.to_s)
                )
              )
            )
          )

          return normal if terminator.nil?

          cleaned = terminate normal, terminator: terminator, force: force
          # keep cleaning until no changes are made
          return cleaned if cleaned == current

          current = cleaned
        end
        current
      end

      # TERMINAL_PUNCT_REGEX matches strings terminated by any of +.,;:!+,
      # optionally followed by a double-quote; a final +?+ is handled by
      # FINAL_QUESTION_REGEX
      TERMINAL_PUNCT_REGEX = %r{\s*([.,;:!]+)("?)$}

      # ELLIPSIS_REGEX matches strings terminated by +...+
      ELLIPSIS_REGEX = %r{\.\.\."?$}

      # ABBREV_REGEX matches values like 'N.T.', 'O.T.'
      ABBREV_REGEX = %r{\W[A-Z]\.$}

      # Final ? regex
      FINAL_QUESTION_REGEX = %r{\s*\?(\s*"?\s*)$}

      ##
      # Add termination to string if it lacks terminal punctuation.
      # Terminal punctuation is one of
      #
      #     . , ; : ? !
      #
      # When +:terminator+ is +''+ or +nil+, trailing punctuation is
      # removed, subject to the exceptions below.
      #
      # Strings ending with an ellipsis ('...' or '..."'), with an
      # abbreviation like 'N.T.', or with a question mark are returned
      # stripped of surrounding whitespace but otherwise unaltered. This
      # behavior cannot be overridden with `:force`.
      #
      # The argument is never modified; a new string is returned.
      #
      # @param [String] str the string to terminate
      # @param [String] terminator the terminator to use; default: +.+
      # @param [Boolean] force use exact termination with +terminator+
      # @return [String]
      def terminate str, terminator: '.', force: false
        # work on a stripped copy so the caller's (possibly frozen) string
        # is left untouched
        str = str.to_s.strip
        # DE 2022.08.12 Note the \s* to match and replace whitespace before
        # punctuation; this addresses a bug where some strings were returned
        # with trailing whitespace: 'value :' => 'value '
        # TODO: Refactor? Two functions: strip_punctuation(), terminate() ??

        # don't strip ellipses
        return str if str =~ ELLIPSIS_REGEX
        # don't strip final periods for strings like "N.T."
        return str if str =~ ABBREV_REGEX

        # don't strip final question marks
        return str if str =~ FINAL_QUESTION_REGEX

        # if :terminator is '' or nil, remove any terminal punctuation;
        # \2 => keep final '"' (double-quote)
        return str.sub TERMINAL_PUNCT_REGEX, '\2' if terminator.blank?

        # str is already terminated
        return str if str.end_with? terminator
        return str if str.end_with? %Q{#{terminator}"}

        # if string ends with '?', don't add terminator
        return str if str.end_with? '?'

        # str lacks terminal punctuation; add it;
        # \\1 => keep final '"' (double-quote)
        return str.sub %r{("?)$}, "#{terminator}\\1" if str !~ TERMINAL_PUNCT_REGEX
        # str has to have exact terminal punctuation
        # \\2 => keep final '"' (double-quote)
        return str.sub TERMINAL_PUNCT_REGEX, "#{terminator}\\2" if force

        # string has some terminal punctuation; return it
        str
      end

      ##
      # Strip and replace all sequences of white space with single
      # spaces and apply Unicode normalization. NFC normalization is
      # used for all strings except URLs, to which NFKC normalization
      # is applied. See RFC 3987:
      #
      # https://datatracker.ietf.org/doc/html/rfc3987#section-5.3.2.2
      #
      # Unescaped pipes in the result are escaped; see +escape_pipes+.
      #
      # @param [String] value the string to normalize
      # @return [String] the normalized string
      def normalize_string value
        form = is_url?(value) ? :nfkc : :nfc
        escape_pipes(
          clean_white_space(
            unicode_normalize(value, form)
          )
        )
      end

      ##
      # converts encoded DS 1.0 encoded superscripts to parenthetical
      # values; e.g., 'XVI#^4/4#' is converted to 'XVI(4/4)'
      #
      # @param [String] value the string to convert
      # @return [String]
      def convert_mets_superscript value
        value.to_s.gsub(%r{#\^([^#]+)#}, '(\1)')
      end

      ##
      # Escape pipe characters in source strings so split operations
      # can avoid splitting on them.
      #
      # Pipes already preceded by a backslash are left alone, so the
      # method can safely be applied more than once to the same value.
      #
      #     escape_pipes('a|b')   # => 'a\|b'
      #     escape_pipes('a\|b')  # => 'a\|b'
      #
      # @param [String] value the string to escape; +nil+ is treated as ''
      # @return [String]
      def escape_pipes value
        # block form: the replacement is inserted literally
        value.to_s.gsub(%r{(?<!\\)\|}) { '\|' }
      end

      ##
      # Strip leading and trailing white space and collapse each inner run
      # of white space to a single space.
      #
      # @param [String] value the string to clean
      # @return [String]
      def clean_white_space value
        value.to_s.strip.gsub(%r{\s+}, ' ')
      end

      ##
      # Return the string using unicode normalization form +form+.
      # Use +NFC+ normalization by default. NFC normalization is
      # recommended best practice. See
      #
      # https://www.honeybadger.io/blog/ruby-unicode-normalization/
      #
      # In short: NFC should be used for most strings, but NFKC for
      # URLs. See RFC 3987:
      #
      # https://datatracker.ietf.org/doc/html/rfc3987#section-5.3.2.2
      #
      # Wikibase uses NFC normalization:
      #
      # https://doc.wikimedia.org/Wikibase/REL1_28/php/classWikibase_1_1Repo_1_1Parsers_1_1WikibaseStringValueNormalizer.html
      #
      # @param [String] value the string to normalize
      # @param [Symbol] form the normalization form: +:nfc+, +:nfkc+,
      #   +:nfd+, or +:nfkd+; default: +:nfc+
      # @return [String] the normalized string
      def unicode_normalize value, form = :nfc
        value.to_s.unicode_normalize form
      end

      ##
      # Strip white space, then remove one leading '[' and one trailing
      # ']' if present; each is removed independently of the other.
      #
      # @param [String] value the string to process
      # @return [String]
      def remove_brackets value
        value.to_s.strip.delete_prefix('[').delete_suffix(']')
      end

      ##
      # Replace any sequence of two '..' with a single period.
      # Ellipses, that is, sequences of three periods '...', are
      # ignored.
      #
      #     fix_double_periods('....')      # => "...."
      #     fix_double_periods('.. ..')     # => ". ."
      #     fix_double_periods('... ..')    # => "... ."
      #     fix_double_periods('... a..')   # => "... a."
      #     fix_double_periods('a... a..')  # => "a... a."
      #
      # @param [String] value the string to process
      # @return [String]
      def fix_double_periods value
        value.to_s.gsub(%r{(?<!\.)\.\.(?!\.)}, '.')
      end

      ##
      # Test whether +value+ contains a URI, using URI::regexp.
      #
      # @param [String] value the string to test
      # @return [Integer, nil] the index of the first match, or +nil+
      def is_url? value
        value.to_s =~ URI::regexp
      end
    end
  end
end
data/lib/ds/util.rb ADDED
@@ -0,0 +1,37 @@
1
+ require 'nokogiri'
2
+
3
+ require_relative 'util/strings'
4
+ require_relative 'util/cache'
5
+ require_relative 'util/csv_writer'
6
+ require_relative 'util//csv_validator'
7
+
8
module DS
  module Util

    # Make the string helpers callable directly on DS::Util.
    extend DS::Util::Strings

    ##
    # Parse each of the XML files named in +files+ and hand every parsed
    # document to the given block, one at a time. Namespaces are stripped
    # from a document before it is yielded when +remove_namespaces+ is true.
    #
    #     data = []
    #     process_xml files, remove_namespaces: true do |xml|
    #       data << xml.xpath('//some/path/text()').text
    #     end
    #
    # Only the parsed document is yielded; collecting results (as +data+
    # does in the example) is up to the block.
    #
    # @param files [Enumerable<String>] XML files to process
    # @param remove_namespaces [Boolean] whether to strip namespaces from
    #   the parsed XML
    # @yieldparam xml [Nokogiri::XML::Document] the parsed document
    def process_xml files, remove_namespaces: false, &block
      files.each do |path|
        # the file list may come from STDIN, so drop any trailing \r or \n
        file_name = path.chomp
        document = File.open(file_name) { |io| Nokogiri::XML io }
        document.remove_namespaces! if remove_namespaces
        yield document
      end
    end
  end
end
data/lib/ds/version.rb ADDED
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module DS
  # Version string of this release.
  VERSION = '0.1.1'
end