ds-convert 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +294 -0
- data/Rakefile +12 -0
- data/config/settings.yml +150 -0
- data/exe/ds-convert +149 -0
- data/exe/ds-recon +275 -0
- data/exe/ds-validate-csv +40 -0
- data/exe/marc-mrc-to-xml.rb +80 -0
- data/lib/ds/cli.rb +102 -0
- data/lib/ds/constants.rb +166 -0
- data/lib/ds/converter/converter.rb +124 -0
- data/lib/ds/converter/writer.rb +50 -0
- data/lib/ds/converter.rb +7 -0
- data/lib/ds/csv_util.rb +43 -0
- data/lib/ds/data/berkeley-arks.txt +4000 -0
- data/lib/ds/data/getty-aat-centuries.csv +71 -0
- data/lib/ds/data/iiif_manifests.csv +122 -0
- data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
- data/lib/ds/ds_error.rb +1 -0
- data/lib/ds/extractor/base_record_locator.rb +24 -0
- data/lib/ds/extractor/base_term.rb +79 -0
- data/lib/ds/extractor/csv_record_locator.rb +13 -0
- data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
- data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
- data/lib/ds/extractor/genre.rb +45 -0
- data/lib/ds/extractor/language.rb +31 -0
- data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
- data/lib/ds/extractor/material.rb +12 -0
- data/lib/ds/extractor/name.rb +50 -0
- data/lib/ds/extractor/place.rb +11 -0
- data/lib/ds/extractor/subject.rb +58 -0
- data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
- data/lib/ds/extractor/title.rb +52 -0
- data/lib/ds/extractor/xml_record_locator.rb +38 -0
- data/lib/ds/extractor.rb +24 -0
- data/lib/ds/institutions.rb +55 -0
- data/lib/ds/manifest/base_id_validator.rb +76 -0
- data/lib/ds/manifest/constants.rb +67 -0
- data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
- data/lib/ds/manifest/entry.rb +133 -0
- data/lib/ds/manifest/manifest.rb +74 -0
- data/lib/ds/manifest/manifest_validator.rb +256 -0
- data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
- data/lib/ds/manifest.rb +30 -0
- data/lib/ds/mapper/base_mapper.rb +221 -0
- data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
- data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
- data/lib/ds/mapper/marc_mapper.rb +87 -0
- data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
- data/lib/ds/mapper.rb +13 -0
- data/lib/ds/recon/constants.rb +56 -0
- data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
- data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
- data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
- data/lib/ds/recon/recon_builder.rb +183 -0
- data/lib/ds/recon/recon_data.rb +37 -0
- data/lib/ds/recon/recon_manager.rb +92 -0
- data/lib/ds/recon/source_enumerator.rb +21 -0
- data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
- data/lib/ds/recon/type/all_subjects.rb +18 -0
- data/lib/ds/recon/type/genres.rb +50 -0
- data/lib/ds/recon/type/languages.rb +38 -0
- data/lib/ds/recon/type/materials.rb +40 -0
- data/lib/ds/recon/type/named_subjects.rb +20 -0
- data/lib/ds/recon/type/names.rb +65 -0
- data/lib/ds/recon/type/places.rb +40 -0
- data/lib/ds/recon/type/recon_type.rb +136 -0
- data/lib/ds/recon/type/splits.rb +34 -0
- data/lib/ds/recon/type/subjects.rb +65 -0
- data/lib/ds/recon/type/titles.rb +38 -0
- data/lib/ds/recon/url_lookup.rb +52 -0
- data/lib/ds/recon.rb +292 -0
- data/lib/ds/source/base_source.rb +32 -0
- data/lib/ds/source/ds_csv.rb +18 -0
- data/lib/ds/source/ds_mets_xml.rb +20 -0
- data/lib/ds/source/marc_xml.rb +22 -0
- data/lib/ds/source/source_cache.rb +69 -0
- data/lib/ds/source/tei_xml.rb +22 -0
- data/lib/ds/source.rb +20 -0
- data/lib/ds/util/cache.rb +111 -0
- data/lib/ds/util/csv_validator.rb +209 -0
- data/lib/ds/util/csv_writer.rb +42 -0
- data/lib/ds/util/strings.rb +194 -0
- data/lib/ds/util.rb +37 -0
- data/lib/ds/version.rb +5 -0
- data/lib/ds.rb +237 -0
- metadata +246 -0
data/exe/ds-recon
ADDED
@@ -0,0 +1,275 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'thor'
|
4
|
+
require 'csv'
|
5
|
+
require_relative '../lib/ds/cli'
|
6
|
+
|
7
|
+
class ReconCLI < DS::CLI
|
8
|
+
|
9
|
+
# Thor hook: answering true makes the executable exit with a non-zero status
# when a command fails.
# NOTE(review): DS::CLI, the superclass, defines the same hook — this override
# looks redundant; confirm before removing.
def self.exit_on_failure?
  true
end
|
12
|
+
|
13
|
+
RECON_SETS = Settings.recon.sets.map { |set| set[:name] }.freeze
|
14
|
+
# We don't validate the aggregate subjects, named subjects set
|
15
|
+
|
16
|
+
class_option :directory, banner: 'PATH', desc: "Output directory [ignored by recon-update, validate]", aliases: '-o', default: '.'
|
17
|
+
class_option :'source-type',
|
18
|
+
banner: 'TYPE',
|
19
|
+
desc: "One of #{DS::VALID_SOURCE_TYPES.join(', ')}; REQUIRED except for recon-update, validate",
|
20
|
+
enum: DS::VALID_SOURCE_TYPES,
|
21
|
+
aliases: '-t',
|
22
|
+
required: (ARGV.size > 0 && ! %w{recon-update validate}.include?(ARGV[0]))
|
23
|
+
class_option :verbose, desc: "Print full error messages", aliases: '-v', type: :boolean, default: false
|
24
|
+
|
25
|
+
desc "validate [RECON_CSV]", "Validate RECON CSV structure and data"
|
26
|
+
long_desc <<~LONGDESC
|
27
|
+
Validate the recon CSV using set configuration 'config/recon.yml'
|
28
|
+
|
29
|
+
-t/-source-type option is NOT REQUIRED
|
30
|
+
|
31
|
+
ds-recon validate -s names path/to/names.csv
|
32
|
+
LONGDESC
|
33
|
+
option :'recon-set', banner: 'SET', desc: "One of #{Recon::RECON_VALIDATION_SETS.join ', '}; REQUIRED", aliases: '-s', required: true
|
34
|
+
# Validate recon CSVs for the set named by --recon-set and print a summary.
#
# @param files [Array<String>] CSV paths; when none are given, the CSVs
#   returned by Recon.csv_files for the set are validated instead
# @return [true] always, whether or not errors were found
def validate(*files)
  set_name = options[:'recon-set']
  targets  = files.present? ? files : Recon.csv_files(set_name.to_sym)
  # Recon.validate is called once per CSV; nil results are discarded
  problems = targets.flat_map { |csv| Recon.validate set_name, csv }.compact
  listing  = targets.join ', '

  if problems.empty?
    puts "SUCCESS no errors found for: #{listing}"
  else
    puts "ERRORS validating #{listing}:#{$/}#{$/}#{problems.join $/}"
  end

  true
end
|
47
|
+
|
48
|
+
|
49
|
+
desc "names FILES", "Extract names from one or more FILEs and write to CSV"
|
50
|
+
long_desc <<-LONGDESC
|
51
|
+
Extract names from one or more FILEs.
|
52
|
+
|
53
|
+
Use '-' to read a list of files from standard input:
|
54
|
+
|
55
|
+
cat list_of_files | recon names -t mets -
|
56
|
+
LONGDESC
|
57
|
+
# Write the names recon CSV for the given files and report its path.
# NOTE(review): unlike the sibling commands (places, subjects, ...), this one
# does not invoke :recon_update first — confirm that is intentional.
#
# @param files [Array<String>] input files
# @return [Boolean] false when the arguments fail validation, true otherwise
def names(*files)
  # TODO: Add role column to names
  return false unless validate_args(files)

  puts "Wrote: #{write_csv_for(:names, files)}"
  true
end
|
66
|
+
|
67
|
+
desc "places FILES", "Extract place names from one or more FILEs and write to CSV"
|
68
|
+
long_desc <<-LONGDESC
|
69
|
+
Extract place names from one or more FILEs.
|
70
|
+
|
71
|
+
Use '-' to read a list of files from standard input:
|
72
|
+
|
73
|
+
cat list_of_files | recon places -t mets -
|
74
|
+
LONGDESC
|
75
|
+
# Refresh the recon data, then write the places recon CSV and report its path.
#
# @param files [Array<String>] input files
# @return [Boolean] false when the arguments fail validation, true otherwise
def places(*files)
  return false unless validate_args(files)

  invoke :recon_update
  puts "Wrote: #{write_csv_for(:places, files)}"
  true
end
|
84
|
+
|
85
|
+
desc "subjects FILES", "Extract subjects from one or more FILEs and write to CSV"
|
86
|
+
long_desc <<-LONGDESC
|
87
|
+
Extract LC subjects from one or more FILEs.
|
88
|
+
|
89
|
+
Use '-' to read a list of files from standard input:
|
90
|
+
|
91
|
+
cat list_of_files | recon subjects -t marc -
|
92
|
+
|
93
|
+
By default MARC fields 650, 651 are extracted, but fields 600, 610 are extracted if `--named-subjects` is given.
|
94
|
+
|
95
|
+
NOTE: Not implemented for mets or tei source types.
|
96
|
+
|
97
|
+
LONGDESC
|
98
|
+
option :'named-subjects', desc: 'Extract named subjects', aliases: '-n', type: :boolean, default: false
|
99
|
+
# Refresh the recon data, then write the subjects recon CSV and report its path.
#
# @param files [Array<String>] input files
# @return [Boolean] false when the arguments fail validation, true otherwise
def subjects(*files)
  return false unless validate_args(files)

  invoke :recon_update
  puts "Wrote: #{write_csv_for(:subjects, files)}"
  true
end
|
109
|
+
|
110
|
+
desc "genres FILES", "Extract genre terms from one or more FILEs and write to CSV"
|
111
|
+
long_desc <<-LONGDESC
|
112
|
+
Extract genre terms from one or more FILEs.
|
113
|
+
|
114
|
+
Use '-' to read a list of files from standard input:
|
115
|
+
|
116
|
+
cat list_of_files | recon genres -t marc -
|
117
|
+
|
118
|
+
NOTE: Not implemented for mets or tei source types.
|
119
|
+
|
120
|
+
LONGDESC
|
121
|
+
# Refresh the recon data, then write the genres recon CSV and report its path.
#
# @param files [Array<String>] input files
# @return [Boolean] false when the arguments fail validation, true otherwise
def genres(*files)
  return false unless validate_args(files)

  invoke :recon_update
  puts "Wrote: #{write_csv_for(:genres, files)}"
  true
end
|
130
|
+
|
131
|
+
desc "materials FILES", "Extract materials terms from one or more FILEs and write to CSV"
|
132
|
+
long_desc <<-LONGDESC
|
133
|
+
Extract materials terms from one or more FILEs.
|
134
|
+
|
135
|
+
Use '-' to read a list of files from standard input:
|
136
|
+
|
137
|
+
cat list_of_files | recon materials -t mets -
|
138
|
+
LONGDESC
|
139
|
+
# Refresh the recon data, then write the materials recon CSV and report its path.
#
# @param files [Array<String>] input files
# @return [Boolean] false when the arguments fail validation, true otherwise
def materials(*files)
  return false unless validate_args(files)

  invoke :recon_update
  puts "Wrote: #{write_csv_for(:materials, files)}"
  true
end
|
148
|
+
|
149
|
+
desc "languages FILES", "Extract languages from one or more FILEs and write to CSV"
|
150
|
+
long_desc <<-LONGDESC
|
151
|
+
Extract languages from one or more files.
|
152
|
+
|
153
|
+
Use '-' to read a list of files from standard input:
|
154
|
+
|
155
|
+
cat list_of_files | recon languages -t mets -
|
156
|
+
LONGDESC
|
157
|
+
# Refresh the recon data, then write the languages recon CSV and report its path.
#
# @param files [Array<String>] input files
# @return [Boolean] false when the arguments fail validation, true otherwise
def languages(*files)
  return false unless validate_args(files)

  invoke :recon_update
  puts "Wrote: #{write_csv_for(:languages, files)}"
  true
end
|
166
|
+
|
167
|
+
desc "titles FILES", "Extract titles from one or more FILEs and write to CSV"
|
168
|
+
long_desc <<-LONGDESC
|
169
|
+
Extract titles from one or more files.
|
170
|
+
|
171
|
+
Use '-' to read a list of files from standard input:
|
172
|
+
|
173
|
+
cat list_of_files | recon titles -t mets -
|
174
|
+
LONGDESC
|
175
|
+
# Refresh the recon data, then write the titles recon CSV and report its path.
#
# @param files [Array<String>] input files
# @return [Boolean] false when the arguments fail validation, true otherwise
def titles(*files)
  return false unless validate_args(files)

  invoke :recon_update
  puts "Wrote: #{write_csv_for(:titles, files)}"
  true
end
|
184
|
+
|
185
|
+
desc 'write-all FILES', 'Extract recons and write to CSV for all recon types'
|
186
|
+
# Refresh the recon data, then write one CSV per entry in Recon::RECON_TYPES
# using a single ReconManager, printing each output path as it is written.
#
# @param files [Array<String>] input files
# @return [Boolean] false when the arguments fail validation, true otherwise
def write_all(*files)
  return false unless validate_args(files)

  invoke :recon_update

  manager = build_recon_manager(files)
  Recon::RECON_TYPES.each { |type| puts "Wrote: #{manager.write_csv(type)}" }
  true
end
|
197
|
+
|
198
|
+
desc "splits FILES", "Extract long lines"
|
199
|
+
long_desc <<-LONGDESC
|
200
|
+
Extract strings longer than 400 characters
|
201
|
+
|
202
|
+
Use '-' to read a list of files from standard input:
|
203
|
+
|
204
|
+
cat list_of_files | recon splits -t mets -
|
205
|
+
LONGDESC
|
206
|
+
# Extract over-long strings from the input files and write them to the
# 'splits' CSV.
#
# Fixes relative to the previous version:
# * The error message interpolated +options[:source_type]+, a key that is never
#   set (the option is named 'source-type'), so the offending type was always
#   blank.
# * The +options[:skip_validation]+ guard used a key that is never set, so
#   +validate!+ was always called anyway; +validate!+ itself honours
#   --skip-validation, so it is now called directly.
# * The --source-type option is declared with +enum: DS::VALID_SOURCE_TYPES+,
#   whose visible values are the DS::Constants names ('marc-xml', ...), so the
#   short names alone could never match. Both spellings are accepted now.
#   NOTE(review): assumes Splits.from_marc/from_mets/from_tei accept the same
#   input for either spelling — confirm against Splits.
#
# @param files [Array<String>] input files, or ['-'] to read the list from stdin
# @return [Boolean] false when the arguments fail validation, true otherwise;
#   aborts the process with the error message on any failure
def splits(*files)
  return false unless validate_args files

  out_csv = out_file 'splits', options
  begin
    invoke :recon_update

    source_type = options[:'source-type']
    data = case source_type
           when 'marc', DS::Constants::MARC_XML
             Splits.from_marc select_input(files)
           when 'mets', DS::Constants::DS_METS
             Splits.from_mets select_input(files)
           when 'tei', DS::Constants::TEI_XML
             Splits.from_tei select_input(files)
           else
             raise NotImplementedError, "No method to process splits for source type: '#{source_type}'"
           end

    validate! data
    write_csv out_csv, Splits::CSV_HEADERS, data

  # NotImplementedError is a ScriptError, not a StandardError, so it must be
  # listed explicitly to be rescued here.
  rescue NotImplementedError, StandardError => e
    STDERR.puts e.backtrace if options[:verbose]
    abort e.to_s
  end

  true
end
|
234
|
+
|
235
|
+
protected
|
236
|
+
|
237
|
+
# Extract recon values from files for given recon type, and write to CSV.
|
238
|
+
#
|
239
|
+
# @param recon_type_name [String] the name of the recon type
|
240
|
+
# @param files [Array<String>] the list of files to process
|
241
|
+
# @return [String] the path of the output CSV file
|
242
|
+
def write_csv_for(recon_type_name, files)
  # Build the manager first, then resolve the recon type, then write; the
  # manager's write_csv result is returned as-is.
  manager = build_recon_manager(files)
  manager.write_csv(Recon.find_recon_type(recon_type_name))
end
|
248
|
+
|
249
|
+
# Builds a ReconManager object with the given files.
|
250
|
+
#
|
251
|
+
# @param files [Array<String>] the list of files to process
|
252
|
+
# @return [Recon::ReconManager] the ReconManager object
|
253
|
+
def build_recon_manager(files)
  # Source type and output directory come from the command-line options.
  manager_args = {
    source_type: options[:'source-type'],
    out_dir:     options[:directory],
    files:       files
  }
  Recon::ReconManager.new(**manager_args)
end
|
260
|
+
|
261
|
+
# Ensures that the list of files is not empty and that they exist.
|
262
|
+
#
|
263
|
+
# @param files [Array<String>] the list of files to validate
|
264
|
+
# @return [Boolean] true if the arguments are valid, false otherwise
|
265
|
+
def validate_args(files)
  # Nothing to check on disk when the file list is to be read from stdin.
  return true if read_from_stdin?(files)

  missing = files.select { |path| !File.exist?(path) }
  unless missing.empty?
    puts "Can't find input file(s): #{missing.join '; ' }"
    return false
  end

  true
end
|
273
|
+
end
|
274
|
+
|
275
|
+
ReconCLI.start ARGV
|
data/exe/ds-validate-csv
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'csv'
|
4
|
+
require 'optparse'
|
5
|
+
|
6
|
+
require_relative '../lib/ds/csv_util'
|
7
|
+
|
8
|
+
##
|
9
|
+
# Check output CSV values for trailing whitespace.
|
10
|
+
#
|
11
|
+
# Trailing whitespace is not permitted in Wikibase values.
|
12
|
+
parser = OptionParser.new do |opts|

  opts.banner = <<EOF
Usage: #{File.basename __FILE__} CSV [CSV...]

Check output CSV values for trailing whitespace.

Trailing whitespace is not permitted in Wikibase values.

EOF
  help_help = 'Prints this help'
  opts.on "-h", "--help", help_help do
    puts opts
    exit
  end
end

parser.parse!

abort 'Please provide a CSV' if ARGV.empty?

# Validate every CSV (no short-circuit) so all results are reported in one
# run, remembering whether each one passed.
results = ARGV.map do |csv|
  rows = CSV.readlines(csv, headers: true).map(&:to_h)
  is_valid = DS::CSVUtil.validate rows

  flash = is_valid ? 'SUCCESS!' :'ERROR! '
  result = is_valid ? 'CSV is valid' : 'CSV is NOT valid'
  puts "#{flash} #{result} -- '#{csv}'"

  is_valid
end

# Previously the script exited 0 even after reporting an invalid CSV, so
# calling scripts could not detect a failure; signal it via the exit status.
exit 1 unless results.all?
|
@@ -0,0 +1,80 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
######
|
4
|
+
# Script to convert MARC MRC files to MARC XML, writing one XML file per record.
|
5
|
+
#
|
6
|
+
|
7
|
+
|
8
|
+
require 'marc'
|
9
|
+
require 'csv'
|
10
|
+
require 'optionparser'
|
11
|
+
require 'rexml'
|
12
|
+
require_relative '../lib/ds'
|
13
|
+
|
14
|
+
# Defaults; each may be overridden by the command-line options parsed below.
options = {}
options[:directory] = '.'
options[:prefix] = ''
options[:encoding] = 'UTF-8'

OptionParser.new do |opts|

  opts.banner = <<EOF
Usage: #{File.basename __FILE__} [options] MRC [MRC ..]

Convert MARC MRC to MARC XML.

EOF

  p_help = "String prefix for file name; e.g., 'inst-marc-' [default: '#{options[:prefix]}']"
  opts.on('-p PREFIX', '--prefix=PREFIX', p_help) do |prefix|
    options[:prefix] = prefix
  end

  d_help = "Path of an directory to output XML files to [default: '#{options[:directory]}']"
  opts.on('-d PATH', '--directory=PATH', d_help) do |directory|
    options[:directory] = directory
  end

  e_help = "Encoding of the incoming MARC MRC/DAT file: [default: #{options[:encoding]}]"
  opts.on('-e ENCODING', '--marc-encoding=ENCODING', e_help) do |encoding|
    options[:encoding] = encoding
  end

  l_help = "List encodings available on this computer; WARNING: long list"
  opts.on('-l', '--list-encodings', l_help) do
    puts "Known encodings: "
    puts Encoding.list.map { |enc| enc.names.join ', ' }
    exit
  end

  h_help = <<~EOF
    Prints this help

    Note on encodings: Legacy MARC files can use MARC-8. If the default fails,
    try that. To see a list of available encodings (a long list), use the -l option.
  EOF
  opts.on("-h", "--help", h_help) do
    puts opts
    exit
  end
end.parse!

marc_mrc = ARGV.dup

abort 'Please provide an input MRC' if marc_mrc.empty?
cannot_find = marc_mrc.reject { |f| File.exist?(f) }
abort "Can't find input MRC: #{cannot_find.join '; ' }" unless cannot_find.empty?
abort "Cannot find output path: #{options[:directory]}" unless File.directory? options[:directory]

# Number records across ALL input files. The index used to restart at zero for
# every MRC, so a second input overwrote the XML files written for the first.
record_number = 0

marc_mrc.each do |mrc|
  # Honour --marc-encoding; the reader previously hard-coded "UTF-8", which
  # made the option a no-op.
  reader = MARC::Reader.new(mrc, external_encoding: options[:encoding])
  reader.each do |record|
    base = sprintf '%s%03d.xml', options[:prefix], record_number
    record_number += 1
    marc_out = File.join options[:directory], base
    # Use REXML::Document to format and preserve whitespace in outputs
    doc = REXML::Document.new(record.to_xml.to_s)
    # Block form closes the file handle, which was previously left open.
    File.open(marc_out, 'w+') do |out|
      # transitive: true prevents the formatter compressing whitespace
      doc.write(output: out, indent: 2, transitive: true)
    end
    puts "Wrote #{marc_out}"
  end
end
|
data/lib/ds/cli.rb
ADDED
@@ -0,0 +1,102 @@
|
|
1
|
+
require 'thor'
|
2
|
+
require 'colorize'
|
3
|
+
require_relative '../ds'
|
4
|
+
|
5
|
+
module DS
|
6
|
+
class CLI < Thor
|
7
|
+
include Recon
|
8
|
+
include ActiveSupport::NumberHelper
|
9
|
+
|
10
|
+
DS.env = (ENV['DS_ENV'].present? ? ENV['DS_ENV'] : 'production')
|
11
|
+
DS.configure!
|
12
|
+
|
13
|
+
class_option :'skip-recon-update', desc: "Skip CSV update from git [ignored by recon-update, validate]", aliases: '-G', type: :boolean, default: false
|
14
|
+
class_option :'skip-validation', desc: "Skip validation of CSV values [same as SKIP_OUTPUT_VALIDATION=true, ignored by recon-update, validate]", aliases: '-V', type: :boolean, default: false
|
15
|
+
class_option :verbose, desc: "Be chatty; print stacktraces; overrides --quiet", aliases: '-v', type: :boolean, default: false
|
16
|
+
class_option :quiet, desc: "Don't print messages", aliases: '-q', type: :boolean, default: false
|
17
|
+
|
18
|
+
|
19
|
+
desc "recon-update", "Update Recon CSVs from git"
|
20
|
+
long_desc <<-LONGDESC
|
21
|
+
Update Recon CSVs from #{Settings.recon.git_repo}.
|
22
|
+
|
23
|
+
NOTE: This command ignores all options, including `--skip-recon-update`; set
|
24
|
+
the SKIP_RECON_UPDATE environment variable to override.
|
25
|
+
|
26
|
+
LONGDESC
|
27
|
+
def recon_update(*args)
  # allow any args so this command can be invoked by any other
  unless skip_git?(options)
    repo = Settings.recon.git_repo
    STDOUT.print "Updating Recon CSVs from #{repo}..."
    Recon::ReconData.update!
    STDOUT.puts "done."
    return
  end

  # Skipping was requested: say so (in verbose mode only) and do nothing else.
  print_message(options, verbose_only: true) do
    <<~EOF.squish
      WARNING: SKIP_RECON_UPDATE or
      --skip-recon-update set; skipping git pull
    EOF
  end
  nil
end
|
40
|
+
|
41
|
+
##
|
42
|
+
# Needed to return a non-zero exit code on failure. See:
|
43
|
+
#
|
44
|
+
# https://github.com/rails/thor/wiki/Making-An-Executable
|
45
|
+
def self.exit_on_failure?
  # Answering true is what yields the non-zero exit code on failure.
  true
end
|
48
|
+
|
49
|
+
protected
|
50
|
+
# Whether the git update of the recon CSVs should be skipped.
#
# @param options [Hash] command options; +:'skip-recon-update'+ is read
# @return [Boolean] true when the option is truthy or the SKIP_RECON_UPDATE
#   environment variable is set to any value, false otherwise
def skip_git?(options)
  skip_requested = options[:'skip-recon-update'] || ENV['SKIP_RECON_UPDATE']
  skip_requested ? true : false
end
|
55
|
+
|
56
|
+
##
|
57
|
+
# See if the user has signaled input is coming from STDIN
|
58
|
+
#
|
59
|
+
# @param files [Enumerable<String>] the file list from ARGV
|
60
|
+
# (by way of Thor)
|
61
|
+
# @return [Boolean]
|
62
|
+
def read_from_stdin?(files)
  # True only when the file list is exactly the single marker '-'.
  ['-'] == files
end
|
65
|
+
|
66
|
+
##
|
67
|
+
# @param [String] msg the message to print
|
68
|
+
# @param [Hash] options the command options; the following are used here:
|
69
|
+
# @option options [Boolean] :verbose whether to print all messages
|
70
|
+
# @option options [Boolean] :quiet suppress all messages (except errors); overrides +:verbose+
|
71
|
+
# @param [Boolean] verbose_only print message only if +:verbose+ is true
|
72
|
+
def print_message(options, verbose_only: false, &msg)
  # :quiet always wins; a verbose-only message also needs :verbose to be set.
  suppressed = options[:quiet] || (verbose_only && !options[:verbose])

  # The block is evaluated only when the message is actually printed.
  puts yield unless suppressed
end
|
79
|
+
|
80
|
+
##
|
81
|
+
# Return the input to read from based on whether input is stdin.
|
82
|
+
# If `read_from_stdin?` returns true, return +ARGF+; otherwise,
|
83
|
+
# return +files+.
|
84
|
+
#
|
85
|
+
# @param files [Enumerable<String>] the file list from ARGV
|
86
|
+
# (by way of Thor)
|
87
|
+
# @return [Enumerable] +files+ or +ARGF+
|
88
|
+
def select_input(files)
  if read_from_stdin?(files)
    # Empty ARGV before handing back ARGF so the '-' marker is not left in it.
    ARGV.clear
    ARGF
  else
    files
  end
end
|
93
|
+
|
94
|
+
# Raise unless the output rows pass CSVUtil.validate.
#
# Validation is skipped when the +skip-validation+ option is set or the
# SKIP_OUTPUT_VALIDATION environment variable is set to any value.
#
# @param rows the rows handed to CSVUtil.validate
# @return [nil] when validation is skipped or succeeds
# @raise [StandardError] when CSVUtil.validate returns a falsy value
def validate!(rows)
  skipped = options[:'skip-validation'] || ENV['SKIP_OUTPUT_VALIDATION']
  return if skipped || CSVUtil.validate(rows)

  raise StandardError, "Validation errors found in output"
end
|
101
|
+
end
|
102
|
+
end
|
data/lib/ds/constants.rb
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
module DS
  # Shared constant values for the DS code base.
  #
  # The collection constants are frozen so they cannot be modified by accident
  # at runtime; HEADINGS, NESTED_COLUMNS and XML_NAMESPACES were previously
  # mutable while their siblings were already frozen.
  module Constants
    # Ordered list of field names (symbols).
    # NOTE(review): presumably the column headings of the DS output CSV —
    # confirm against the writer.
    HEADINGS = %i[
      ds_id
      date_added
      date_last_updated
      source_type
      cataloging_convention
      holding_institution_ds_qid
      holding_institution_as_recorded
      holding_institution_id_number
      holding_institution_shelfmark
      link_to_holding_institution_record
      iiif_manifest
      production_place_as_recorded
      production_place_ds_qid
      production_date_as_recorded
      production_date
      century
      century_aat
      dated
      title_as_recorded
      title_as_recorded_agr
      uniform_title_as_recorded
      uniform_title_agr
      standard_title_ds_qid
      genre_as_recorded
      genre_ds_qid
      subject_as_recorded
      subject_ds_qid
      author_as_recorded
      author_as_recorded_agr
      author_ds_qid
      artist_as_recorded
      artist_as_recorded_agr
      artist_ds_qid
      scribe_as_recorded
      scribe_as_recorded_agr
      scribe_ds_qid
      associated_agent_as_recorded
      associated_agent_as_recorded_agr
      associated_agent_ds_qid
      former_owner_as_recorded
      former_owner_as_recorded_agr
      former_owner_ds_qid
      language_as_recorded
      language_ds_qid
      material_as_recorded
      material_ds_qid
      physical_description
      note
      acknowledgments
      data_processed_at
      data_source_modified
      source_file
    ].freeze

    NESTED_COLUMNS = %i[
      subject subject_label
      genre genre_label
      production_place production_place_label
      language language_label
    ].freeze

    # Institutions dependent on DS and their DS IDs
    # Some institutions have more than one collection
    #
    # conception 15
    # csl 12, 9
    # cuny 5
    # grolier 24
    # gts 23
    # indiana 40
    # kansas 30
    # nelsonatkins 46
    # nyu 25
    # providence 28
    # rutgers 6
    # ucb 1, 8, 11
    # wellesley 50

    # Maps a numeric DS ID to an institution short name; several IDs may map
    # to the same institution.
    INSTITUTION_DS_IDS = {
      1 => 'ucb',
      2 => 'harvard',
      3 => 'fordham',
      4 => 'freelib',
      5 => 'cuny',
      6 => 'rutgers',
      7 => 'ucd',
      8 => 'ucb',
      9 => 'csl',
      10 => 'ucr',
      11 => 'ucb',
      12 => 'csl',
      13 => 'sfu',
      14 => 'notredame',
      15 => 'conception',
      16 => 'columbia',
      17 => 'columbia',
      18 => 'columbia',
      19 => 'columbia',
      20 => 'columbia',
      21 => 'columbia',
      22 => 'columbia',
      23 => 'gts',
      24 => 'grolier',
      25 => 'nyu',
      26 => 'oberlin',
      27 => 'penn',
      28 => 'providence',
      29 => 'rome',
      30 => 'kansas',
      31 => 'jhopkins',
      32 => 'jhopkins',
      33 => 'jhopkins',
      34 => 'jhopkins',
      35 => 'walters',
      36 => 'pittsburgh',
      37 => 'txaustin',
      38 => 'uvm',
      39 => 'jtsa',
      40 => 'indiana',
      41 => 'nypl',
      42 => 'nypl',
      43 => 'huntington',
      44 => 'slu',
      45 => 'missouri',
      46 => 'nelsonatkins',
      47 => 'beinecke',
      48 => 'smith',
      50 => 'wellesley',
      52 => 'tufts'
    }.freeze

    # Matches a run of commas, periods, colons, '!', '?', ';' or whitespace
    # at the end of a line.
    TRAILING_PUNCTUATION_RE = %r{[,.:!?;[:space:]]+$}

    MAX_WIKIBASE_FIELD_LENGTH = 400

    # The distinct institution short names, in first-appearance order.
    INSTITUTIONS = INSTITUTION_DS_IDS.values.uniq.freeze

    MARC_XML = 'marc-xml'
    TEI_XML = 'tei-xml'
    DS_CSV = 'ds-csv'
    DS_METS = 'ds-mets-xml'

    # The source type names defined above.
    VALID_SOURCE_TYPES = [
      MARC_XML,
      TEI_XML,
      DS_CSV,
      DS_METS
    ].freeze

    # XML namespace URIs keyed by prefix.
    XML_NAMESPACES = {
      marc: 'http://www.loc.gov/MARC21/slim',
      mets: 'http://www.loc.gov/METS/',
      mods: 'http://www.loc.gov/mods/v3',
      rts: 'http://cosimo.stanford.edu/sdr/metsrights/',
      mix: 'http://www.loc.gov/mix/v10',
      xlink: 'http://www.w3.org/1999/xlink',
      xsi: 'http://www.w3.org/2001/XMLSchema-instance',
      xs: 'http://www.w3.org/2001/XMLSchema',
      xd: 'http://www.oxygenxml.com/ns/doc/xsl',
      tei: 'http://www.tei-c.org/ns/1.0'
    }.freeze

  end
end
|