ds-convert 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +294 -0
- data/Rakefile +12 -0
- data/config/settings.yml +150 -0
- data/exe/ds-convert +149 -0
- data/exe/ds-recon +275 -0
- data/exe/ds-validate-csv +40 -0
- data/exe/marc-mrc-to-xml.rb +80 -0
- data/lib/ds/cli.rb +102 -0
- data/lib/ds/constants.rb +166 -0
- data/lib/ds/converter/converter.rb +124 -0
- data/lib/ds/converter/writer.rb +50 -0
- data/lib/ds/converter.rb +7 -0
- data/lib/ds/csv_util.rb +43 -0
- data/lib/ds/data/berkeley-arks.txt +4000 -0
- data/lib/ds/data/getty-aat-centuries.csv +71 -0
- data/lib/ds/data/iiif_manifests.csv +122 -0
- data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
- data/lib/ds/ds_error.rb +1 -0
- data/lib/ds/extractor/base_record_locator.rb +24 -0
- data/lib/ds/extractor/base_term.rb +79 -0
- data/lib/ds/extractor/csv_record_locator.rb +13 -0
- data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
- data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
- data/lib/ds/extractor/genre.rb +45 -0
- data/lib/ds/extractor/language.rb +31 -0
- data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
- data/lib/ds/extractor/material.rb +12 -0
- data/lib/ds/extractor/name.rb +50 -0
- data/lib/ds/extractor/place.rb +11 -0
- data/lib/ds/extractor/subject.rb +58 -0
- data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
- data/lib/ds/extractor/title.rb +52 -0
- data/lib/ds/extractor/xml_record_locator.rb +38 -0
- data/lib/ds/extractor.rb +24 -0
- data/lib/ds/institutions.rb +55 -0
- data/lib/ds/manifest/base_id_validator.rb +76 -0
- data/lib/ds/manifest/constants.rb +67 -0
- data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
- data/lib/ds/manifest/entry.rb +133 -0
- data/lib/ds/manifest/manifest.rb +74 -0
- data/lib/ds/manifest/manifest_validator.rb +256 -0
- data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
- data/lib/ds/manifest.rb +30 -0
- data/lib/ds/mapper/base_mapper.rb +221 -0
- data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
- data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
- data/lib/ds/mapper/marc_mapper.rb +87 -0
- data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
- data/lib/ds/mapper.rb +13 -0
- data/lib/ds/recon/constants.rb +56 -0
- data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
- data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
- data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
- data/lib/ds/recon/recon_builder.rb +183 -0
- data/lib/ds/recon/recon_data.rb +37 -0
- data/lib/ds/recon/recon_manager.rb +92 -0
- data/lib/ds/recon/source_enumerator.rb +21 -0
- data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
- data/lib/ds/recon/type/all_subjects.rb +18 -0
- data/lib/ds/recon/type/genres.rb +50 -0
- data/lib/ds/recon/type/languages.rb +38 -0
- data/lib/ds/recon/type/materials.rb +40 -0
- data/lib/ds/recon/type/named_subjects.rb +20 -0
- data/lib/ds/recon/type/names.rb +65 -0
- data/lib/ds/recon/type/places.rb +40 -0
- data/lib/ds/recon/type/recon_type.rb +136 -0
- data/lib/ds/recon/type/splits.rb +34 -0
- data/lib/ds/recon/type/subjects.rb +65 -0
- data/lib/ds/recon/type/titles.rb +38 -0
- data/lib/ds/recon/url_lookup.rb +52 -0
- data/lib/ds/recon.rb +292 -0
- data/lib/ds/source/base_source.rb +32 -0
- data/lib/ds/source/ds_csv.rb +18 -0
- data/lib/ds/source/ds_mets_xml.rb +20 -0
- data/lib/ds/source/marc_xml.rb +22 -0
- data/lib/ds/source/source_cache.rb +69 -0
- data/lib/ds/source/tei_xml.rb +22 -0
- data/lib/ds/source.rb +20 -0
- data/lib/ds/util/cache.rb +111 -0
- data/lib/ds/util/csv_validator.rb +209 -0
- data/lib/ds/util/csv_writer.rb +42 -0
- data/lib/ds/util/strings.rb +194 -0
- data/lib/ds/util.rb +37 -0
- data/lib/ds/version.rb +5 -0
- data/lib/ds.rb +237 -0
- metadata +246 -0
@@ -0,0 +1,209 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
|
4
|
+
module Util
|
5
|
+
class CsvValidator

  ERROR_UNBALANCED_SUBFIELDS = 'Row has subfields of different lengths'
  ERROR_BLANK_SUBFIELDS = 'Row has blank subfields'
  ERROR_MISSING_REQUIRED_COLUMNS = "CSV is missing required column(s)"
  ERROR_TRAILING_WHITESPACE = 'Row contains trailing whitespace'

  # split on pipes that are not escaped with '\'
  PIPE_SPLIT_REGEXP = %r{(?<!\\)\|}
  # split on pipes and semicolons that are not escaped with '\'
  PIPE_SEMICOLON_REGEXP = %r{(?<!\\)[;|]}

  # matches strings that are empty or consist only of whitespace
  BLANK_REGEXP = %r{\A[[:space:]]*\z}

  # matches whitespace at the end of a line
  TRAILING_WHITESPACE_REGEXP = %r{\s+$}

  # Validates all rows of data against a set of required columns and
  # balanced columns, and checks every value for trailing whitespace.
  #
  # The required columns are checked against the first row before any
  # other validation; if any are missing, only those errors are returned.
  #
  # @param rows [Array<Hash,CSV::Row>] The rows of data to be validated.
  # @param required_columns [Array] The required columns for each row; compared
  #   to the row's keys as given
  # @param balanced_columns [Hash<Symbol, Array<Symbol>>] A hash of groups of balanced columns.
  # @param nested_columns [Hash,Array] The columns whose subfields may be further
  #   divided by semicolons; see {validate_whitespace}
  # @param allow_blank [Boolean] Whether to allow blank subfields in balanced columns.
  # @return [Array<String>] An array of error messages, if any.
  def self.validate_all_rows rows, required_columns: [], balanced_columns: {}, nested_columns: {}, allow_blank: false
    errors = validate_required_columns(rows.first, row_num: 1, required_columns: required_columns)
    return errors unless errors.empty?

    rows.each_with_index do |row, index|
      # row numbers in messages are 1-based
      errors.concat validate_row(
        row, row_num: index + 1,
        required_columns: required_columns,
        balanced_columns: balanced_columns,
        nested_columns: nested_columns,
        allow_blank: allow_blank
      )
    end
    errors
  end

  # Validates a row of data against a set of required columns and balanced columns,
  # and checks its values for trailing whitespace.
  #
  #     # validate a CSV row for required columns and balanced columns
  #     # columns a and b are required,
  #     # columns a and b, and c and d are balanced
  #     # balanced_columns keys are used as labels for the error messages
  #     required_columns = [:a, :b]
  #     balanced_columns = { group1: [:a, :b], group2: [:c, :d] }
  #     CsvValidator.validate_row(row, row_num: 1, required_columns: required_columns, balanced_columns: balanced_columns)
  #
  # If any required column is missing, only the missing-column errors are
  # returned and the remaining checks are skipped.
  #
  # @param row [Hash,CSV::Row] The row of data to be validated.
  # @param row_num [Integer] the row number reported in error messages
  # @param required_columns [Array] The required columns for the row.
  # @param balanced_columns [Hash<Symbol, Array<Symbol>>] a hash of groups of balanced columns; see example above
  # @param nested_columns [Hash,Array] the columns that may also be split on semicolons
  # @param allow_blank [Boolean] Whether to allow blank subfields in balanced columns
  # @return [Array<String>] An array of error messages, if any.
  def self.validate_row row, row_num:, required_columns: [], balanced_columns: {}, nested_columns: {}, allow_blank: false
    missing = validate_required_columns(row, row_num: row_num, required_columns: required_columns)
    return missing unless missing.empty?

    validate_balanced_columns(row, row_num: row_num, balanced_columns: balanced_columns, allow_blank: allow_blank) +
      validate_whitespace(row, row_num: row_num, nested_columns: nested_columns)
  end

  # Validates the presence of required columns in a given row of data.
  #
  # Column names are compared to the keys of +row.to_h+ without any
  # String/Symbol conversion.
  #
  # @param row [Hash, CSV::Row] The row of data to be validated.
  # @param row_num [Integer] the row number reported in error messages
  # @param required_columns [Array] The required columns for the row.
  # @return [Array<String>] An array of error messages, if any; otherwise, an empty array.
  def self.validate_required_columns row, row_num:, required_columns:
    return [] if blank_value?(required_columns)

    missing = required_columns - row.to_h.keys
    return [] if missing.empty?

    ["#{ERROR_MISSING_REQUIRED_COLUMNS}: #{missing.map(&:inspect).join(', ')} row #{row_num}"]
  end

  # Validates the balanced columns in a given row of data.
  #
  # +balanced_columns+ is a hash of groups of balanced columns. Each
  # column is looked up in +row+ first by String key, then by Symbol key.
  #
  # @param row [Hash] The row of data to be validated.
  # @param row_num [Integer] the row number reported in error messages
  # @param balanced_columns [Hash<Symbol, Array<Symbol>>] A hash of groups of balanced columns.
  # @param allow_blank [Boolean] Whether to allow blank subfields in balanced columns.
  # @return [Array<String>] An array of error messages, if any; otherwise, an empty array.
  #
  # @example
  #     # row has unbalanced columns :a and :b
  #     row = { a: 'a|b|c', b: 'b|b', c: 'c', d: 'd' }
  #     balanced_columns = { group1: [:a, :b] }
  #     CsvValidator.validate_balanced_columns(
  #       row, row_num: 1, balanced_columns: balanced_columns
  #     ) # => ["Row has subfields of different lengths: group: :group1, sizes: [3, 2], row: [\"a|b|c\", \"b|b\"] (row 1)"]
  def self.validate_balanced_columns row, row_num:, balanced_columns: {}, allow_blank: false
    return [] if blank_value?(balanced_columns)

    balanced_columns.flat_map do |group, columns|
      values = columns.map { |column| row[column.to_s] || row[column.to_sym] }
      validate_row_splits(group: group, row_num: row_num, row_values: values, allow_blank: allow_blank)
    end
  end

  # Maximum number of subfields to allow in a row; this number is
  # arbitrarily set to 100,000 to ensure all trailing empty
  # values are included in the array output by split.
  MAX_SPLITS = 100000

  ##
  # Return an empty array if every value in +row_values+ has the same
  # number of subfields **and** none of the subfields are blank;
  # otherwise, return an array of error messages.
  #
  # If +allow_blank+ is +true+, ignore blanks, only check for balanced
  # subfields.
  #
  # Note: It is always allowed for every value to be blank (empty string).
  # When row values are +nil+ they are treated as empty strings.
  # Blank values are treated as single values.
  #
  # Separators preceded by a backslash (e.g., '\|') are escaped and are
  # not split on; this matches PIPE_SPLIT_REGEXP and
  # PIPE_SEMICOLON_REGEXP.
  #
  # So:
  #
  #     [ 'a|b|c', '1|2|3' ]  # => valid, return []
  #     [ '', '' ]            # => valid, return []
  #     [ 'a', '']            # => valid, return []
  #     [ 'a\|b', '1']        # => valid (escaped pipe), return []
  #     [ 'a|b|c', '1|2' ]    # => not valid, return ERROR_UNBALANCED_SUBFIELDS
  #     [ 'a|b', '']          # => not valid, return ERROR_UNBALANCED_SUBFIELDS
  #     [ 'a||c', '1|2|3' ]   # => not valid, return ERROR_BLANK_SUBFIELDS
  #     [ 'a||c', '1|2|3' ]   # => valid if allow_blank == true, return []
  #
  # @param [Array<String>] row_values an array of strings from one or more columns
  # @param [Integer] row_num the row number reported in error messages
  # @param [String] separators a list of allowed subfield separators; e.g., ';', '|', ';|'
  # @param [Boolean] allow_blank whether any of the subfields may be blank
  # @param [Object] group label for the group of columns, used in error messages
  # @return [Array<String>] the row errors, or [] if there are no errors
  def self.validate_row_splits row_values: [], row_num:, separators: '|;', allow_blank: false, group: nil
    errors = []
    return errors if row_values.all? { |val| blank_value?(val) }

    # Split each value into its subfields:
    #
    #   ['a|b|c', '1|2|3'] => [['a', 'b', 'c'],
    #                          ['1', '2', '3']]
    #   ['a|b|c', '1|2']   => [['a', 'b', 'c'],
    #                          ['1', '2']]
    #
    # The regexp is built once per call; the lookbehind skips separators
    # escaped with a backslash. MAX_SPLITS keeps trailing empty subfields.
    splitter = %r{(?<!\\)[#{Regexp.escape separators}]}
    splits = row_values.map { |value| value.to_s.split splitter, MAX_SPLITS }

    # all sizes should be 0 or 1; or there should be only one
    # subfield length
    sizes = splits.map(&:size)
    return errors if sizes.all? { |size| size <= 1 }

    if sizes.uniq.size > 1
      errors << "#{ERROR_UNBALANCED_SUBFIELDS}: group: #{group.inspect}, sizes: #{sizes.inspect}, row: #{row_values.inspect} (row #{row_num})"
    end

    # stop here if we don't have to check for blanks
    return errors if allow_blank

    # add an error if any of the subfields are blank
    if splits.flatten.any? { |subfield| blank_value?(subfield) }
      errors << "#{ERROR_BLANK_SUBFIELDS}: group: #{group.inspect}, row: #{row_values.inspect} (row #{row_num})"
    end
    errors
  end

  # Validates a row of data for trailing whitespace. Returns an
  # error for each column that contains trailing whitespace.
  #
  # Nested columns is a hash with column names as keys and group
  # names as values; e.g.,
  #
  #     nested_columns = {
  #       "subject_label" => :subjects,
  #       "subject"       => :subjects,
  #       "genre_label"   => :genres,
  #       "genre"         => :genres
  #     }
  #
  # Only membership is tested (+nested_columns.include?(column)+), so an
  # array of column names works as well. The column name must match the
  # row's key exactly; no String/Symbol conversion is performed.
  #
  # @param row [Hash] The row of data to be validated.
  # @param row_num [Integer] the row number reported in error messages
  # @param nested_columns [Hash,Array] The nested columns.
  # @return [Array<String>] An array of error messages, if any.
  def self.validate_whitespace row, row_num:, nested_columns: []
    errors = []

    row.each do |column, value|
      # Assume all columns can have subfields delimited by pipes;
      # some columns are "nested"; that is, they can be further
      # subdivided by semicolons. Select the regexp for the
      # subfield type
      splitter = nested_columns.include?(column) ? PIPE_SEMICOLON_REGEXP : PIPE_SPLIT_REGEXP
      next unless value.to_s.split(splitter).any? { |sub| sub.match?(TRAILING_WHITESPACE_REGEXP) }

      errors << "#{ERROR_TRAILING_WHITESPACE}: column #{column.inspect}, value: #{value.inspect} (row #{row_num})"
    end

    errors
  end

  # Return +true+ if +value+ is +nil+, +false+, a whitespace-only string,
  # or an empty collection. Standard-library stand-in for ActiveSupport's
  # +blank?+, so this class does not depend on ActiveSupport being loaded.
  #
  # @param value [Object] the value to test
  # @return [Boolean]
  def self.blank_value? value
    return true if value.nil? || value == false
    return value.match?(BLANK_REGEXP) if value.is_a?(String)

    value.respond_to?(:empty?) ? value.empty? : false
  end
  private_class_method :blank_value?

end
|
208
|
+
end
|
209
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
|
4
|
+
module Util
|
5
|
+
##
# Writes rows to a CSV file, always emitting +headers+ as the first row.
#
# The output file is opened in 'w+' mode, so an existing file is
# overwritten on every call to {#write}.
class CSVWriter
  # @return [Array] the header row written at the top of the file
  attr_reader :headers
  # @return [String] the path of the file to write
  attr_reader :outfile

  # @param outfile [String] path of the CSV file to write
  # @param headers [Array] the header row; default: +[]+
  def initialize outfile:, headers: []
    @headers = headers
    @outfile = outfile
  end

  ##
  # Write the CSV file. Either pass an enumerable of rows, or give a
  # block that receives the open CSV and appends rows itself. When both
  # are supplied the block wins and +rows+ is ignored.
  #
  #     writer.write rows
  #     writer.write { |csv| csv << %w[a b c] }
  #
  # @param rows [Enumerable, nil] the rows to write when no block is given
  # @yieldparam csv [CSV] the open CSV, with the header row already written
  # @raise [RuntimeError] if no block is given and +rows+ is not Enumerable
  def write rows=nil, &block
    return _write_with_block(&block) if block_given?
    return _write_all(rows) if rows.is_a? Enumerable

    raise "#{self.class.name}#write requires a block or an Enumerable of rows; got: #{rows.inspect}"
  end

  private

  # Open the output file, write the headers and yield the CSV to the block.
  def _write_with_block
    CSV.open outfile, 'w+', headers: true do |csv|
      csv << headers
      yield csv
    end
  end

  # Open the output file, write the headers and then each of +rows+.
  def _write_all rows
    _write_with_block do |csv|
      rows.each do |row|
        csv << row
      end
    end
  end
end
|
41
|
+
end
|
42
|
+
end
|
@@ -0,0 +1,194 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module DS
|
4
|
+
module Util
|
5
|
+
module Strings

  # Maximum number of passes #clean_string makes while waiting for the
  # value to stop changing; a guard against endless loops.
  MAX_CLEAN_PASSES = 10

  # UNESCAPED_PIPE_REGEX matches pipes that are not preceded by '\'
  UNESCAPED_PIPE_REGEX = %r{(?<!\\)\|}

  ##
  # This method calls
  #
  # - +convert_mets_superscript+
  # - +remove_brackets+
  # - +fix_double_periods+
  # - +escape_pipes+
  # - +normalize_string+
  #
  # If +terminator+ is non-nil, the method removes any trailing
  # punctuation and whitespace and appends +terminator+; the cleaning
  # steps and termination are repeated until the value stops changing
  # (at most MAX_CLEAN_PASSES times).
  #
  # Set +terminator+ to +''+ (empty string) to remove trailing
  # punctuation.
  #
  # @param [String] string the string to clean
  # @param [String] terminator the terminator to use, if any
  # @param [Boolean] force use exact termination with +terminator+
  # @return [String] the cleaned string
  def clean_string string, terminator: nil, force: false
    current = string.to_s

    MAX_CLEAN_PASSES.times do
      normal = normalize_string(
        escape_pipes(
          fix_double_periods(
            remove_brackets(
              convert_mets_superscript(current)
            )
          )
        )
      )

      # without a terminator a single cleaning pass is all that is done
      return normal if terminator.nil?

      cleaned = terminate normal, terminator: terminator, force: force
      # keep cleaning until no changes are made; stripping punctuation
      # can expose a bracket, and removing it can expose more punctuation
      return cleaned if cleaned == current

      current = cleaned
    end

    current
  end

  # TERMINAL_PUNCT_REGEX matches strings terminated by any of +.,;:!+,
  # optionally followed by a double quote; a final +?+ is handled
  # separately by FINAL_QUESTION_REGEX
  TERMINAL_PUNCT_REGEX = %r{\s*([.,;:!]+)("?)$}

  # ELLIPSIS_REGEX matches strings terminated by +...+
  ELLIPSIS_REGEX = %r{\.\.\."?$}

  # ABBREV_REGEX matches values like 'N.T.', 'O.T.'
  ABBREV_REGEX = %r{\W[A-Z]\.$}

  # FINAL_QUESTION_REGEX matches strings terminated by +?+, optionally
  # followed by a double quote
  FINAL_QUESTION_REGEX = %r{\s*\?(\s*"?\s*)$}

  ##
  # Add termination to string if it lacks terminal punctuation.
  # Terminal punctuation is one of
  #
  #   . , ; : ? !
  #
  # When +:terminator+ is +''+ or +nil+, trailing punctuation is
  # removed.
  #
  # Strings ending with an ellipsis ('...' or '..."'), an abbreviation
  # like 'N.T.', or a question mark are returned unaltered apart from
  # stripping surrounding whitespace. This behavior cannot be overridden
  # with +:force+.
  #
  # The argument is never modified; a new string is returned.
  #
  # @param [String] str the string to terminate
  # @param [String] terminator the terminator to use; default: +.+
  # @param [Boolean] force use exact termination with +terminator+
  # @return [String]
  def terminate str, terminator: '.', force: false
    # work on a stripped copy; the caller's string may be frozen or shared
    str = str.to_s.strip

    # DE 2022.08.12 Note the \s* in TERMINAL_PUNCT_REGEX to match and
    # replace whitespace before punctuation; this addresses a bug where
    # some strings were returned with trailing whitespace:
    # 'value :' => 'value '
    # TODO: Refactor? Two functions: strip_punctuation(), terminate() ??

    # don't strip ellipses
    return str if str.match? ELLIPSIS_REGEX
    # don't strip final periods for strings like "N.T."
    return str if str.match? ABBREV_REGEX
    # don't strip, or add a terminator after, final question marks
    return str if str.match? FINAL_QUESTION_REGEX

    # if :terminator is '' or nil, remove any terminal punctuation;
    # \2 => keep final '"' (double-quote)
    if terminator.nil? || terminator.to_s.strip.empty?
      return str.sub TERMINAL_PUNCT_REGEX, '\2'
    end

    # str is already terminated
    return str if str.end_with? terminator
    return str if str.end_with? %Q{#{terminator}"}

    # str lacks terminal punctuation; add it;
    # \\1 => keep final '"' (double-quote)
    return str.sub %r{("?)$}, "#{terminator}\\1" unless str.match? TERMINAL_PUNCT_REGEX
    # str has to have exact terminal punctuation
    # \\2 => keep final '"' (double-quote)
    return str.sub TERMINAL_PUNCT_REGEX, "#{terminator}\\2" if force

    # string has some terminal punctuation; return it
    str
  end

  ##
  # Strip and replace all sequences of white space with single
  # spaces, apply Unicode normalization and escape pipes. NFC
  # normalization is used for all strings except URLs, to which NFKC
  # normalization is applied. See RFC 3987:
  #
  # https://datatracker.ietf.org/doc/html/rfc3987#section-5.3.2.2
  #
  # @param [String] value the string to normalize
  # @return [String] the normalized string
  def normalize_string value
    form = is_url?(value) ? :nfkc : :nfc
    escape_pipes(
      clean_white_space(
        unicode_normalize(value, form)
      )
    )
  end

  ##
  # Converts DS 1.0 encoded superscripts to parenthetical
  # values; e.g., 'XVI#^4/4#' is converted to 'XVI(4/4)'
  #
  # @param [String] value the string to convert
  # @return [String]
  def convert_mets_superscript value
    value.to_s.gsub(%r{#\^([^#]+)#}, '(\1)')
  end

  ##
  # Escape pipe characters in source strings so split operations
  # can avoid splitting on them.
  #
  # Pipes already preceded by a backslash are left alone, so the
  # method can safely be applied more than once to the same value:
  #
  #   escape_pipes('a|b')   # => 'a\|b'
  #   escape_pipes('a\|b')  # => 'a\|b'
  #
  # @param [String] value the string to escape
  # @return [String]
  def escape_pipes value
    # block form: the replacement is used literally, with no
    # backslash-sequence interpretation
    value.to_s.gsub(UNESCAPED_PIPE_REGEX) { '\|' }
  end

  ##
  # Strip leading and trailing white space and replace each run of
  # white space with a single space.
  #
  # @param [String] value the string to clean
  # @return [String]
  def clean_white_space value
    value.to_s.strip.gsub(%r{\s+}, ' ')
  end

  ##
  # Return the string using unicode normalization form +form+.
  # Use +NFC+ normalization by default. NFC normalization is
  # recommended best practice. See
  #
  # https://www.honeybadger.io/blog/ruby-unicode-normalization/
  #
  # In short: NFC should be used for most strings, but NFKC for
  # URLs. See RFC 3987:
  #
  # https://datatracker.ietf.org/doc/html/rfc3987#section-5.3.2.2
  #
  # Wikibase uses NFC normalization:
  #
  # https://doc.wikimedia.org/Wikibase/REL1_28/php/classWikibase_1_1Repo_1_1Parsers_1_1WikibaseStringValueNormalizer.html
  #
  # @param [String] value the string to normalize
  # @param [Symbol] form the normalization form: +:nfc+, +:nfkc+,
  #   +:nfd+, or +:nfkd+; default: +:nfc+
  # @return [String] the normalized string
  def unicode_normalize value, form = :nfc
    value.to_s.unicode_normalize form
  end

  ##
  # Strip white space, then remove one leading '[' and one trailing ']',
  # if present.
  #
  # @param [String] value the string to process
  # @return [String]
  def remove_brackets value
    value.to_s.strip.delete_prefix('[').delete_suffix(']')
  end

  ##
  # Replace any sequence of two '..' with a single period.
  # Ellipses, that is, sequences of three periods '...', are
  # ignored.
  #
  #     fix_double_periods('....')      # => "...."
  #     fix_double_periods('.. ..')     # => ". ."
  #     fix_double_periods('... ..')    # => "... ."
  #     fix_double_periods('... a..')   # => "... a."
  #     fix_double_periods('a... a..')  # => "a... a."
  #
  # @param [String] value the string to process
  # @return [String]
  def fix_double_periods value
    value.to_s.gsub(%r{(?<!\.)\.\.(?!\.)}, '.')
  end

  ##
  # Return a truthy value (the match position) if +value+ contains
  # something matching +URI::regexp+; otherwise, +nil+. The match is
  # not anchored.
  #
  # @param [String] value the string to test
  # @return [Integer, nil]
  def is_url? value
    value.to_s =~ URI::regexp
  end
end
|
193
|
+
end
|
194
|
+
end
|
data/lib/ds/util.rb
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
require_relative 'util/strings'
|
4
|
+
require_relative 'util/cache'
|
5
|
+
require_relative 'util/csv_writer'
|
6
|
+
require_relative 'util//csv_validator'
|
7
|
+
|
8
|
+
module DS
|
9
|
+
module Util
|
10
|
+
|
11
|
+
extend DS::Util::Strings
|
12
|
+
##
|
13
|
+
# Open and parse each XML file in +files+, optionally stripping namespaces
|
14
|
+
# from the parsed XML, running block on each XML document:
|
15
|
+
#
|
16
|
+
# data = []
|
17
|
+
# process_xml files, remove_namespaces: true do |xml|
|
18
|
+
# data << xml.xpath('//some/path/text()').text
|
19
|
+
# end
|
20
|
+
#
|
21
|
+
# @yield [xml] yields each parsed Nokogiri XML document to the block;
|
22
|
+
# the block is responsible for extracting whatever data it needs
|
23
|
+
# from the document (e.g., appending to an array, as shown above)
|
24
|
+
#
|
25
|
+
# @param files [Enumerable<String>] XML files to process
|
26
|
+
# @param remove_namespaces [Boolean] whether strip namespaces from parsed XML
|
27
|
+
# @yieldparam xml [Nokogiri::XML::Document] the parsed document
|
28
|
+
def process_xml files, remove_namespaces: false, &block
  files.each { |path|
    # the file list may come from STDIN; drop any trailing \r or \n
    # before opening the file
    document = File.open(path.chomp) { |io| Nokogiri::XML io }
    # optionally strip namespaces from the parsed document
    document.remove_namespaces! if remove_namespaces
    # hand the parsed document to the caller's block
    yield document
  }
end
|
36
|
+
end
|
37
|
+
end
|