ds-convert 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. checksums.yaml +7 -0
  2. data/README.md +294 -0
  3. data/Rakefile +12 -0
  4. data/config/settings.yml +150 -0
  5. data/exe/ds-convert +149 -0
  6. data/exe/ds-recon +275 -0
  7. data/exe/ds-validate-csv +40 -0
  8. data/exe/marc-mrc-to-xml.rb +80 -0
  9. data/lib/ds/cli.rb +102 -0
  10. data/lib/ds/constants.rb +166 -0
  11. data/lib/ds/converter/converter.rb +124 -0
  12. data/lib/ds/converter/writer.rb +50 -0
  13. data/lib/ds/converter.rb +7 -0
  14. data/lib/ds/csv_util.rb +43 -0
  15. data/lib/ds/data/berkeley-arks.txt +4000 -0
  16. data/lib/ds/data/getty-aat-centuries.csv +71 -0
  17. data/lib/ds/data/iiif_manifests.csv +122 -0
  18. data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
  19. data/lib/ds/ds_error.rb +1 -0
  20. data/lib/ds/extractor/base_record_locator.rb +24 -0
  21. data/lib/ds/extractor/base_term.rb +79 -0
  22. data/lib/ds/extractor/csv_record_locator.rb +13 -0
  23. data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
  24. data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
  25. data/lib/ds/extractor/genre.rb +45 -0
  26. data/lib/ds/extractor/language.rb +31 -0
  27. data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
  28. data/lib/ds/extractor/material.rb +12 -0
  29. data/lib/ds/extractor/name.rb +50 -0
  30. data/lib/ds/extractor/place.rb +11 -0
  31. data/lib/ds/extractor/subject.rb +58 -0
  32. data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
  33. data/lib/ds/extractor/title.rb +52 -0
  34. data/lib/ds/extractor/xml_record_locator.rb +38 -0
  35. data/lib/ds/extractor.rb +24 -0
  36. data/lib/ds/institutions.rb +55 -0
  37. data/lib/ds/manifest/base_id_validator.rb +76 -0
  38. data/lib/ds/manifest/constants.rb +67 -0
  39. data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
  40. data/lib/ds/manifest/entry.rb +133 -0
  41. data/lib/ds/manifest/manifest.rb +74 -0
  42. data/lib/ds/manifest/manifest_validator.rb +256 -0
  43. data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
  44. data/lib/ds/manifest.rb +30 -0
  45. data/lib/ds/mapper/base_mapper.rb +221 -0
  46. data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
  47. data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
  48. data/lib/ds/mapper/marc_mapper.rb +87 -0
  49. data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
  50. data/lib/ds/mapper.rb +13 -0
  51. data/lib/ds/recon/constants.rb +56 -0
  52. data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
  53. data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
  54. data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
  55. data/lib/ds/recon/recon_builder.rb +183 -0
  56. data/lib/ds/recon/recon_data.rb +37 -0
  57. data/lib/ds/recon/recon_manager.rb +92 -0
  58. data/lib/ds/recon/source_enumerator.rb +21 -0
  59. data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
  60. data/lib/ds/recon/type/all_subjects.rb +18 -0
  61. data/lib/ds/recon/type/genres.rb +50 -0
  62. data/lib/ds/recon/type/languages.rb +38 -0
  63. data/lib/ds/recon/type/materials.rb +40 -0
  64. data/lib/ds/recon/type/named_subjects.rb +20 -0
  65. data/lib/ds/recon/type/names.rb +65 -0
  66. data/lib/ds/recon/type/places.rb +40 -0
  67. data/lib/ds/recon/type/recon_type.rb +136 -0
  68. data/lib/ds/recon/type/splits.rb +34 -0
  69. data/lib/ds/recon/type/subjects.rb +65 -0
  70. data/lib/ds/recon/type/titles.rb +38 -0
  71. data/lib/ds/recon/url_lookup.rb +52 -0
  72. data/lib/ds/recon.rb +292 -0
  73. data/lib/ds/source/base_source.rb +32 -0
  74. data/lib/ds/source/ds_csv.rb +18 -0
  75. data/lib/ds/source/ds_mets_xml.rb +20 -0
  76. data/lib/ds/source/marc_xml.rb +22 -0
  77. data/lib/ds/source/source_cache.rb +69 -0
  78. data/lib/ds/source/tei_xml.rb +22 -0
  79. data/lib/ds/source.rb +20 -0
  80. data/lib/ds/util/cache.rb +111 -0
  81. data/lib/ds/util/csv_validator.rb +209 -0
  82. data/lib/ds/util/csv_writer.rb +42 -0
  83. data/lib/ds/util/strings.rb +194 -0
  84. data/lib/ds/util.rb +37 -0
  85. data/lib/ds/version.rb +5 -0
  86. data/lib/ds.rb +237 -0
  87. metadata +246 -0
data/exe/ds-recon ADDED
@@ -0,0 +1,275 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'thor'
4
+ require 'csv'
5
+ require_relative '../lib/ds/cli'
6
+
7
##
# Thor command-line interface for the +ds-recon+ executable.
#
# Each extraction command reads one or more source files (or a list of file
# names from standard input when the only argument is '-'), builds a
# Recon::ReconManager for them and writes one CSV per recon type to the
# output directory. The +validate+ command checks existing recon CSVs.
class ReconCLI < DS::CLI

  ##
  # Tell Thor to exit with a non-zero status when a command fails.
  def self.exit_on_failure?
    true
  end

  RECON_SETS = Settings.recon.sets.map { |set| set[:name] }.freeze
  # We don't validate the aggregate subjects, named subjects set

  class_option :directory, banner: 'PATH', desc: "Output directory [ignored by recon-update, validate]", aliases: '-o', default: '.'
  class_option :'source-type',
               banner: 'TYPE',
               desc: "One of #{DS::VALID_SOURCE_TYPES.join(', ')}; REQUIRED except for recon-update, validate",
               enum: DS::VALID_SOURCE_TYPES,
               aliases: '-t',
               # Decided once, at class-load time, from the raw command line:
               # the option is required for every command except
               # recon-update and validate.
               required: (ARGV.size > 0 && !%w[recon-update validate].include?(ARGV[0]))
  class_option :verbose, desc: "Print full error messages", aliases: '-v', type: :boolean, default: false

  desc "validate [RECON_CSV]", "Validate RECON CSV structure and data"
  long_desc <<~LONGDESC
    Validate the recon CSV using set configuration 'config/recon.yml'

    -t/--source-type option is NOT REQUIRED

    ds-recon validate -s names path/to/names.csv
  LONGDESC
  option :'recon-set', banner: 'SET', desc: "One of #{Recon::RECON_VALIDATION_SETS.join ', '}; REQUIRED", aliases: '-s', required: true
  ##
  # Validate the given recon CSVs, or the CSVs returned by Recon.csv_files
  # for the set when no files are given, and print the outcome.
  #
  # @param files [Array<String>] optional paths of recon CSVs to validate
  # @return [Boolean] true if no errors were found, false otherwise
  def validate(*files)
    set_name   = options[:'recon-set']
    recon_csvs = files.present? ? files : Recon.csv_files(set_name.to_sym)
    errors     = recon_csvs.flat_map { |csv| Recon.validate set_name, csv }.compact

    if errors.empty?
      puts "SUCCESS no errors found for: #{recon_csvs.join ', '}"
    else
      puts "ERRORS validating #{recon_csvs.join ', '}:#{$/}#{$/}#{errors.join $/}"
    end

    errors.empty?
  end

  desc "names FILES", "Extract names from one or more FILEs and write to CSV"
  long_desc <<~LONGDESC
    Extract names from one or more FILEs.

    Use '-' to read a list of files from standard input:

    cat list_of_files | ds-recon names -t ds-mets-xml -
  LONGDESC
  ##
  # Extract names. Unlike the other extraction commands this one does not
  # invoke +recon_update+ first.
  #
  # @param files [Array<String>] input files, or ['-'] for standard input
  # @return [Boolean] true on success, false if the arguments are invalid
  def names(*files)
    # TODO: Add role column to names
    run_recon_extraction :names, files, update_recon: false
  end

  desc "places FILES", "Extract place names from one or more FILEs and write to CSV"
  long_desc <<~LONGDESC
    Extract place names from one or more FILEs.

    Use '-' to read a list of files from standard input:

    cat list_of_files | ds-recon places -t ds-mets-xml -
  LONGDESC
  ##
  # @param files [Array<String>] input files, or ['-'] for standard input
  # @return [Boolean] true on success, false if the arguments are invalid
  def places(*files)
    run_recon_extraction :places, files
  end

  desc "subjects FILES", "Extract subjects from one or more FILEs and write to CSV"
  long_desc <<~LONGDESC
    Extract LC subjects from one or more FILEs.

    Use '-' to read a list of files from standard input:

    cat list_of_files | ds-recon subjects -t marc-xml -

    By default MARC fields 650, 651 are extracted, but fields 600, 610 are extracted if `--named-subjects` is given.

    NOTE: Not implemented for mets or tei source types.

  LONGDESC
  option :'named-subjects', desc: 'Extract named subjects', aliases: '-n', type: :boolean, default: false
  ##
  # @param files [Array<String>] input files, or ['-'] for standard input
  # @return [Boolean] true on success, false if the arguments are invalid
  def subjects(*files)
    run_recon_extraction :subjects, files
  end

  desc "genres FILES", "Extract genre terms from one or more FILEs and write to CSV"
  long_desc <<~LONGDESC
    Extract genre terms from one or more FILEs.

    Use '-' to read a list of files from standard input:

    cat list_of_files | ds-recon genres -t marc-xml -

    NOTE: Not implemented for mets or tei source types.

  LONGDESC
  ##
  # @param files [Array<String>] input files, or ['-'] for standard input
  # @return [Boolean] true on success, false if the arguments are invalid
  def genres(*files)
    run_recon_extraction :genres, files
  end

  desc "materials FILES", "Extract materials terms from one or more FILEs and write to CSV"
  long_desc <<~LONGDESC
    Extract materials terms from one or more FILEs.

    Use '-' to read a list of files from standard input:

    cat list_of_files | ds-recon materials -t ds-mets-xml -
  LONGDESC
  ##
  # @param files [Array<String>] input files, or ['-'] for standard input
  # @return [Boolean] true on success, false if the arguments are invalid
  def materials(*files)
    run_recon_extraction :materials, files
  end

  desc "languages FILES", "Extract languages from one or more FILEs and write to CSV"
  long_desc <<~LONGDESC
    Extract languages from one or more files.

    Use '-' to read a list of files from standard input:

    cat list_of_files | ds-recon languages -t ds-mets-xml -
  LONGDESC
  ##
  # @param files [Array<String>] input files, or ['-'] for standard input
  # @return [Boolean] true on success, false if the arguments are invalid
  def languages(*files)
    run_recon_extraction :languages, files
  end

  desc "titles FILES", "Extract titles from one or more FILEs and write to CSV"
  long_desc <<~LONGDESC
    Extract titles from one or more files.

    Use '-' to read a list of files from standard input:

    cat list_of_files | ds-recon titles -t ds-mets-xml -
  LONGDESC
  ##
  # @param files [Array<String>] input files, or ['-'] for standard input
  # @return [Boolean] true on success, false if the arguments are invalid
  def titles(*files)
    run_recon_extraction :titles, files
  end

  desc 'write-all FILES', 'Extract recons and write to CSV for all recon types'
  ##
  # Write one CSV for every type in Recon::RECON_TYPES, sharing a single
  # ReconManager across all of them.
  #
  # @param files [Array<String>] input files, or ['-'] for standard input
  # @return [Boolean] true on success, false if the arguments are invalid
  def write_all(*files)
    return false unless validate_args files
    invoke :recon_update

    recon_manager = build_recon_manager files
    Recon::RECON_TYPES.each do |recon_type|
      out_csv = recon_manager.write_csv recon_type
      puts "Wrote: #{out_csv}"
    end
    true
  end

  desc "splits FILES", "Extract long lines"
  long_desc <<~LONGDESC
    Extract strings longer than 400 characters

    Use '-' to read a list of files from standard input:

    cat list_of_files | ds-recon splits -t ds-mets-xml -
  LONGDESC
  ##
  # Extract over-long strings and write them to the splits CSV. Any error,
  # including an unsupported source type, aborts the process with the error
  # message (and the backtrace when +--verbose+ is set).
  #
  # @param files [Array<String>] input files, or ['-'] for standard input
  # @return [Boolean] true on success, false if the arguments are invalid
  def splits(*files)
    return false unless validate_args files

    out_csv = out_file 'splits', options
    begin
      invoke :recon_update

      source_type = options[:'source-type']
      # NOTE(review): the --source-type enum only admits the values in
      # DS::VALID_SOURCE_TYPES, none of which equal 'marc', 'mets' or 'tei',
      # so this looks like it always falls through to the error below --
      # confirm against Splits and update the branch labels if so.
      data = case source_type
             when 'marc'
               Splits.from_marc select_input files
             when 'mets'
               Splits.from_mets select_input files
             when 'tei'
               Splits.from_tei select_input files
             else
               raise NotImplementedError, "No method to process splits for source type: '#{source_type}'"
             end

      # validate! itself honours --skip-validation and SKIP_OUTPUT_VALIDATION
      validate! data
      write_csv out_csv, Splits::CSV_HEADERS, data

    # NotImplementedError is a ScriptError, not a StandardError, so it must
    # be named explicitly to be rescued here.
    rescue NotImplementedError, StandardError => e
      STDERR.puts e.backtrace if options[:verbose]
      abort e.to_s
    end

    true
  end

  protected

  ##
  # Shared body of the single-type extraction commands: check the file
  # arguments, optionally update the recon CSVs, write the CSV for the
  # recon type and report its path.
  #
  # @param recon_type_name [Symbol] the name of the recon type
  # @param files [Array<String>] the list of files to process
  # @param update_recon [Boolean] whether to invoke +recon_update+ first
  # @return [Boolean] true on success, false if the arguments are invalid
  def run_recon_extraction(recon_type_name, files, update_recon: true)
    return false unless validate_args files
    invoke :recon_update if update_recon

    out_csv = write_csv_for recon_type_name, files
    puts "Wrote: #{out_csv}"

    true
  end

  # Extract recon values from files for given recon type, and write to CSV.
  #
  # @param recon_type_name [String] the name of the recon type
  # @param files [Array<String>] the list of files to process
  # @return [String] the path of the output CSV file
  def write_csv_for(recon_type_name, files)
    recon_manager = build_recon_manager files
    recon_type    = Recon.find_recon_type recon_type_name
    recon_manager.write_csv recon_type
  end

  # Builds a ReconManager object with the given files.
  #
  # @param files [Array<String>] the list of files to process
  # @return [Recon::ReconManager] the ReconManager object
  def build_recon_manager(files)
    Recon::ReconManager.new(
      source_type: options[:'source-type'],
      out_dir: options[:directory],
      files: files
    )
  end

  # Ensures that the list of files is not empty and that they exist.
  # A lone '-' (read the file list from standard input) is always accepted.
  #
  # @param files [Array<String>] the list of files to validate
  # @return [Boolean] true if the arguments are valid, false otherwise
  def validate_args(files)
    if files.empty?
      puts "Please provide at least one input file, or '-' to read a list of files from standard input"
      return false
    end
    return true if read_from_stdin? files

    cannot_find = files.reject { |f| File.exist?(f) }
    return true if cannot_find.empty?

    puts "Can't find input file(s): #{cannot_find.join '; '}"
    false
  end
end

ReconCLI.start ARGV
@@ -0,0 +1,40 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'csv'
4
+ require 'optparse'
5
+
6
+ require_relative '../lib/ds/csv_util'
7
+
8
##
# For every CSV named on the command line, read its rows as hashes, pass
# them to DS::CSVUtil.validate and print a one-line verdict per file.
#
# Trailing whitespace is not permitted in Wikibase values.
usage = <<~EOF
  Usage: #{File.basename __FILE__} CSV [CSV...]

  Check output CSV values for trailing whitespace.

  Trailing whitespace is not permitted in Wikibase values.

EOF

option_parser = OptionParser.new do |opts|
  opts.banner = usage

  opts.on('-h', '--help', 'Prints this help') do
    puts opts
    exit
  end
end

# parse! removes recognised switches from ARGV, leaving only the CSV paths
option_parser.parse!

abort 'Please provide a CSV' if ARGV.empty?

ARGV.each do |csv_path|
  records = CSV.readlines(csv_path, headers: true).map(&:to_h)

  flash, result =
    if DS::CSVUtil.validate(records)
      ['SUCCESS!', 'CSV is valid']
    else
      ['ERROR! ', 'CSV is NOT valid']
    end

  puts "#{flash} #{result} -- '#{csv_path}'"
end
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ ######
4
+ # Script to convert MARC MRC (binary MARC) files to MARC XML, writing one XML file per record.
5
+ #
6
+
7
+
8
+ require 'marc'
9
+ require 'csv'
10
+ require 'optionparser'
11
+ require 'rexml'
12
+ require_relative '../lib/ds'
13
+
14
# Defaults; each can be overridden by the command-line switches below.
options = {
  directory: '.',
  prefix:    '',
  encoding:  'UTF-8'
}

OptionParser.new do |opts|

  opts.banner = <<~EOF
    Usage: #{File.basename __FILE__} [options] MRC [MRC ..]

    Convert MARC MRC to MARC XML.

  EOF

  p_help = "String prefix for file name; e.g., 'inst-marc-' [default: '#{options[:prefix]}']"
  opts.on('-p PREFIX', '--prefix=PREFIX', p_help) do |prefix|
    options[:prefix] = prefix
  end

  d_help = "Path of a directory to output XML files to [default: '#{options[:directory]}']"
  opts.on('-d PATH', '--directory=PATH', d_help) do |directory|
    options[:directory] = directory
  end

  e_help = "Encoding of the incoming MARC MRC/DAT file: [default: #{options[:encoding]}]"
  opts.on('-e ENCODING', '--marc-encoding=ENCODING', e_help) do |encoding|
    options[:encoding] = encoding
  end

  l_help = "List encodings available on this computer; WARNING: long list"
  opts.on('-l', '--list-encodings', l_help) do
    puts "Known encodings: "
    puts Encoding.list.map { |enc| enc.names.join ', ' }
    exit
  end

  h_help = <<~EOF
    Prints this help

    Note on encodings: Legacy MARC files can use MARC-8. If the default fails,
    try that. To see a list of available encodings (a long list), use the -l option.
  EOF
  opts.on("-h", "--help", h_help) do
    puts opts
    exit
  end
end.parse!

# parse! has removed the switches; what remains in ARGV are the input files
marc_mrc = ARGV.dup

abort 'Please provide an input MRC' if marc_mrc.empty?
cannot_find = marc_mrc.reject { |f| File.exist?(f) }
abort "Can't find input MRC: #{cannot_find.join '; '}" unless cannot_find.empty?
abort "Cannot find output path: #{options[:directory]}" unless File.directory? options[:directory]

# Number records across ALL input files so that the records of a second MRC
# do not overwrite the XML files written for the first one.
record_index = 0

marc_mrc.each do |mrc|
  # Honour --marc-encoding rather than always assuming UTF-8
  reader = MARC::Reader.new(mrc, external_encoding: options[:encoding])
  reader.each do |record|
    base     = format '%s%03d.xml', options[:prefix], record_index
    marc_out = File.join options[:directory], base
    # Use REXML::Document to format and preserve whitespace in outputs
    doc = REXML::Document.new(record.to_xml.to_s)
    # Block form closes (and flushes) the file as soon as it is written.
    File.open(marc_out, 'w+') do |out|
      # transitive: true prevents the formatter compressing whitespace
      doc.write(output: out, indent: 2, transitive: true)
    end
    puts "Wrote #{marc_out}"
    record_index += 1
  end
end
data/lib/ds/cli.rb ADDED
@@ -0,0 +1,102 @@
1
+ require 'thor'
2
+ require 'colorize'
3
+ require_relative '../ds'
4
+
5
module DS
  ##
  # Base Thor class for the DS command-line executables. It configures DS
  # when the class is loaded, declares the options shared by all commands,
  # provides the +recon-update+ command and a few helpers for subclasses.
  class CLI < Thor
    include Recon
    include ActiveSupport::NumberHelper

    # Runs once, when this class body is evaluated: use DS_ENV if it is set
    # and non-blank, otherwise 'production'.
    DS.env = (ENV['DS_ENV'].present? ? ENV['DS_ENV'] : 'production')
    DS.configure!

    class_option :'skip-recon-update', desc: "Skip CSV update from git [ignored by recon-update, validate]", aliases: '-G', type: :boolean, default: false
    class_option :'skip-validation', desc: "Skip validation of CSV values [same as SKIP_OUTPUT_VALIDATION=true, ignored by recon-update, validate]", aliases: '-V', type: :boolean, default: false
    # print_message checks :quiet before :verbose, so --quiet wins when both
    # are given.
    class_option :verbose, desc: "Be chatty; print stacktraces; overridden by --quiet", aliases: '-v', type: :boolean, default: false
    class_option :quiet, desc: "Don't print messages", aliases: '-q', type: :boolean, default: false


    desc "recon-update", "Update Recon CSVs from git"
    long_desc <<-LONGDESC
      Update Recon CSVs from #{Settings.recon.git_repo}.

      NOTE: This command ignores all options, including `--skip-recon-update`; set
      the SKIP_RECON_UPDATE environment variable to override.

    LONGDESC
    ##
    # Update the recon CSVs via Recon::ReconData.update!, unless skipping
    # was requested (see #skip_git?).
    #
    # @param args [Array] ignored
    # @return [void]
    def recon_update(*args)
      # allow any args so this command can be invoked by any other
      if skip_git? options
        print_message(options, verbose_only: true) { <<~EOF.squish }
          WARNING: SKIP_RECON_UPDATE or
          --skip-recon-update set; skipping git pull
        EOF
        return
      end
      STDOUT.print "Updating Recon CSVs from #{Settings.recon.git_repo}..."
      Recon::ReconData.update!
      STDOUT.puts "done."
    end

    ##
    # Needed to return a non-zero exit code on failure. See:
    #
    #   https://github.com/rails/thor/wiki/Making-An-Executable
    def self.exit_on_failure?
      true
    end

    protected

    ##
    # Whether the recon CSV update should be skipped.
    #
    # @param options [Hash] the command options
    # @return [Boolean] true if +:'skip-recon-update'+ is set or the
    #   SKIP_RECON_UPDATE environment variable has any value at all
    def skip_git?(options)
      return true if options[:'skip-recon-update']
      return true if ENV['SKIP_RECON_UPDATE']
      false
    end

    ##
    # See if the user has signaled input is coming from STDIN
    #
    # @param files [Enumerable<String>] the file list from ARGV
    #     (by way of Thor)
    # @return [Boolean] true only if +files+ is exactly +['-']+
    def read_from_stdin?(files)
      files == ['-']
    end

    ##
    # Print the message returned by the given block, subject to the
    # +:quiet+ and +:verbose+ options. The block is only called when the
    # message is actually going to be printed.
    #
    # @param [Hash] options the command options; the following are used here:
    # @option options [Boolean] :verbose whether to print all messages
    # @option options [Boolean] :quiet suppress all messages; overrides +:verbose+
    # @param [Boolean] verbose_only print message only if +:verbose+ is true
    # @yieldreturn [String] the message to print
    # @return [void]
    def print_message(options, verbose_only: false, &msg)
      return if options[:quiet]
      # if +verbose_only+ is true, return unless +options[:verbose]+ is true
      return if verbose_only && !options[:verbose]

      puts yield
    end

    ##
    # Return the input to read from based on whether input is stdin.
    # If `read_from_stdin?` returns true, return +ARGF+; otherwise,
    # return +files+.
    #
    # @param files [Enumerable<String>] the file list from ARGV
    #     (by way of Thor)
    # @return [Enumerable] +files+ or +ARGF+
    def select_input(files)
      return files unless read_from_stdin? files
      # ARGF reads from the files named in ARGV, and from standard input
      # only when ARGV is empty, so empty it first.
      ARGV.clear
      ARGF
    end

    ##
    # Validate output rows with CSVUtil.validate unless validation has been
    # switched off with +--skip-validation+ or by setting the
    # SKIP_OUTPUT_VALIDATION environment variable to any value.
    #
    # @param rows [Object] the rows, passed unchanged to CSVUtil.validate
    # @raise [StandardError] if CSVUtil.validate returns a falsy value
    # @return [nil]
    def validate!(rows)
      return if options[:'skip-validation']
      return if ENV['SKIP_OUTPUT_VALIDATION']
      return if CSVUtil.validate rows

      raise StandardError, "Validation errors found in output"
    end
  end
end
@@ -0,0 +1,166 @@
1
module DS
  ##
  # Constants shared across the DS code: column headings, institution
  # identifiers, source type names and XML namespaces. All collections are
  # frozen so they cannot be modified by accident at runtime.
  module Constants
    # Column names, in order, as symbols.
    HEADINGS = %i[
      ds_id
      date_added
      date_last_updated
      source_type
      cataloging_convention
      holding_institution_ds_qid
      holding_institution_as_recorded
      holding_institution_id_number
      holding_institution_shelfmark
      link_to_holding_institution_record
      iiif_manifest
      production_place_as_recorded
      production_place_ds_qid
      production_date_as_recorded
      production_date
      century
      century_aat
      dated
      title_as_recorded
      title_as_recorded_agr
      uniform_title_as_recorded
      uniform_title_agr
      standard_title_ds_qid
      genre_as_recorded
      genre_ds_qid
      subject_as_recorded
      subject_ds_qid
      author_as_recorded
      author_as_recorded_agr
      author_ds_qid
      artist_as_recorded
      artist_as_recorded_agr
      artist_ds_qid
      scribe_as_recorded
      scribe_as_recorded_agr
      scribe_ds_qid
      associated_agent_as_recorded
      associated_agent_as_recorded_agr
      associated_agent_ds_qid
      former_owner_as_recorded
      former_owner_as_recorded_agr
      former_owner_ds_qid
      language_as_recorded
      language_ds_qid
      material_as_recorded
      material_ds_qid
      physical_description
      note
      acknowledgments
      data_processed_at
      data_source_modified
      source_file
    ].freeze

    NESTED_COLUMNS = %i[
      subject subject_label
      genre genre_label
      production_place production_place_label
      language language_label
    ].freeze

    # Institutions dependent on DS and their DS IDs
    # Some institutions have more than one collection
    #
    #     conception    15
    #     csl           12, 9
    #     cuny          5
    #     grolier       24
    #     gts           23
    #     indiana       40
    #     kansas        30
    #     nelsonatkins  46
    #     nyu           25
    #     providence    28
    #     rutgers       6
    #     ucb           1, 8, 11
    #     wellesley     50

    # Maps a numeric DS ID to an institution name; several IDs can map to
    # the same institution. IDs 49 and 51 have no entry.
    INSTITUTION_DS_IDS = {
      1  => 'ucb',
      2  => 'harvard',
      3  => 'fordham',
      4  => 'freelib',
      5  => 'cuny',
      6  => 'rutgers',
      7  => 'ucd',
      8  => 'ucb',
      9  => 'csl',
      10 => 'ucr',
      11 => 'ucb',
      12 => 'csl',
      13 => 'sfu',
      14 => 'notredame',
      15 => 'conception',
      16 => 'columbia',
      17 => 'columbia',
      18 => 'columbia',
      19 => 'columbia',
      20 => 'columbia',
      21 => 'columbia',
      22 => 'columbia',
      23 => 'gts',
      24 => 'grolier',
      25 => 'nyu',
      26 => 'oberlin',
      27 => 'penn',
      28 => 'providence',
      29 => 'rome',
      30 => 'kansas',
      31 => 'jhopkins',
      32 => 'jhopkins',
      33 => 'jhopkins',
      34 => 'jhopkins',
      35 => 'walters',
      36 => 'pittsburgh',
      37 => 'txaustin',
      38 => 'uvm',
      39 => 'jtsa',
      40 => 'indiana',
      41 => 'nypl',
      42 => 'nypl',
      43 => 'huntington',
      44 => 'slu',
      45 => 'missouri',
      46 => 'nelsonatkins',
      47 => 'beinecke',
      48 => 'smith',
      50 => 'wellesley',
      52 => 'tufts'
    }.freeze

    # One or more punctuation marks (, . : ! ? ;) or whitespace characters
    # at the end of a line.
    TRAILING_PUNCTUATION_RE = %r{[,.:!?;[:space:]]+$}

    MAX_WIKIBASE_FIELD_LENGTH = 400

    # Each institution name once, in order of first appearance in
    # INSTITUTION_DS_IDS.
    INSTITUTIONS = INSTITUTION_DS_IDS.values.uniq.freeze

    MARC_XML = 'marc-xml'
    TEI_XML  = 'tei-xml'
    DS_CSV   = 'ds-csv'
    DS_METS  = 'ds-mets-xml'

    # The names of all valid source types.
    VALID_SOURCE_TYPES = [
      MARC_XML,
      TEI_XML,
      DS_CSV,
      DS_METS
    ].freeze

    # Namespace prefix => namespace URI.
    XML_NAMESPACES = {
      marc:  'http://www.loc.gov/MARC21/slim',
      mets:  'http://www.loc.gov/METS/',
      mods:  'http://www.loc.gov/mods/v3',
      rts:   'http://cosimo.stanford.edu/sdr/metsrights/',
      mix:   'http://www.loc.gov/mix/v10',
      xlink: 'http://www.w3.org/1999/xlink',
      xsi:   'http://www.w3.org/2001/XMLSchema-instance',
      xs:    'http://www.w3.org/2001/XMLSchema',
      xd:    'http://www.oxygenxml.com/ns/doc/xsl',
      tei:   'http://www.tei-c.org/ns/1.0'
    }.freeze

  end
end