ds-convert 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. checksums.yaml +7 -0
  2. data/README.md +294 -0
  3. data/Rakefile +12 -0
  4. data/config/settings.yml +150 -0
  5. data/exe/ds-convert +149 -0
  6. data/exe/ds-recon +275 -0
  7. data/exe/ds-validate-csv +40 -0
  8. data/exe/marc-mrc-to-xml.rb +80 -0
  9. data/lib/ds/cli.rb +102 -0
  10. data/lib/ds/constants.rb +166 -0
  11. data/lib/ds/converter/converter.rb +124 -0
  12. data/lib/ds/converter/writer.rb +50 -0
  13. data/lib/ds/converter.rb +7 -0
  14. data/lib/ds/csv_util.rb +43 -0
  15. data/lib/ds/data/berkeley-arks.txt +4000 -0
  16. data/lib/ds/data/getty-aat-centuries.csv +71 -0
  17. data/lib/ds/data/iiif_manifests.csv +122 -0
  18. data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
  19. data/lib/ds/ds_error.rb +1 -0
  20. data/lib/ds/extractor/base_record_locator.rb +24 -0
  21. data/lib/ds/extractor/base_term.rb +79 -0
  22. data/lib/ds/extractor/csv_record_locator.rb +13 -0
  23. data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
  24. data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
  25. data/lib/ds/extractor/genre.rb +45 -0
  26. data/lib/ds/extractor/language.rb +31 -0
  27. data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
  28. data/lib/ds/extractor/material.rb +12 -0
  29. data/lib/ds/extractor/name.rb +50 -0
  30. data/lib/ds/extractor/place.rb +11 -0
  31. data/lib/ds/extractor/subject.rb +58 -0
  32. data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
  33. data/lib/ds/extractor/title.rb +52 -0
  34. data/lib/ds/extractor/xml_record_locator.rb +38 -0
  35. data/lib/ds/extractor.rb +24 -0
  36. data/lib/ds/institutions.rb +55 -0
  37. data/lib/ds/manifest/base_id_validator.rb +76 -0
  38. data/lib/ds/manifest/constants.rb +67 -0
  39. data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
  40. data/lib/ds/manifest/entry.rb +133 -0
  41. data/lib/ds/manifest/manifest.rb +74 -0
  42. data/lib/ds/manifest/manifest_validator.rb +256 -0
  43. data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
  44. data/lib/ds/manifest.rb +30 -0
  45. data/lib/ds/mapper/base_mapper.rb +221 -0
  46. data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
  47. data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
  48. data/lib/ds/mapper/marc_mapper.rb +87 -0
  49. data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
  50. data/lib/ds/mapper.rb +13 -0
  51. data/lib/ds/recon/constants.rb +56 -0
  52. data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
  53. data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
  54. data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
  55. data/lib/ds/recon/recon_builder.rb +183 -0
  56. data/lib/ds/recon/recon_data.rb +37 -0
  57. data/lib/ds/recon/recon_manager.rb +92 -0
  58. data/lib/ds/recon/source_enumerator.rb +21 -0
  59. data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
  60. data/lib/ds/recon/type/all_subjects.rb +18 -0
  61. data/lib/ds/recon/type/genres.rb +50 -0
  62. data/lib/ds/recon/type/languages.rb +38 -0
  63. data/lib/ds/recon/type/materials.rb +40 -0
  64. data/lib/ds/recon/type/named_subjects.rb +20 -0
  65. data/lib/ds/recon/type/names.rb +65 -0
  66. data/lib/ds/recon/type/places.rb +40 -0
  67. data/lib/ds/recon/type/recon_type.rb +136 -0
  68. data/lib/ds/recon/type/splits.rb +34 -0
  69. data/lib/ds/recon/type/subjects.rb +65 -0
  70. data/lib/ds/recon/type/titles.rb +38 -0
  71. data/lib/ds/recon/url_lookup.rb +52 -0
  72. data/lib/ds/recon.rb +292 -0
  73. data/lib/ds/source/base_source.rb +32 -0
  74. data/lib/ds/source/ds_csv.rb +18 -0
  75. data/lib/ds/source/ds_mets_xml.rb +20 -0
  76. data/lib/ds/source/marc_xml.rb +22 -0
  77. data/lib/ds/source/source_cache.rb +69 -0
  78. data/lib/ds/source/tei_xml.rb +22 -0
  79. data/lib/ds/source.rb +20 -0
  80. data/lib/ds/util/cache.rb +111 -0
  81. data/lib/ds/util/csv_validator.rb +209 -0
  82. data/lib/ds/util/csv_writer.rb +42 -0
  83. data/lib/ds/util/strings.rb +194 -0
  84. data/lib/ds/util.rb +37 -0
  85. data/lib/ds/version.rb +5 -0
  86. data/lib/ds.rb +237 -0
  87. metadata +246 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d88babed86e60cc72a333832fd8480af964e54978d37ed9a0005e7cc140787af
4
+ data.tar.gz: d83d3d595c0ea0299dfc83ce4ee4d1d5e955ce6edb06a8b6ddc0ce0207a16262
5
+ SHA512:
6
+ metadata.gz: 62e11d6bf7cfb5397228177127464ef7f1e5ad4bc948b1afdd7e358d1658974abd2d0f1bfbd8d884cc9b33e31769060bd64480c456fc84ebe7727f4dbb4da3cd
7
+ data.tar.gz: cfc8df9d5f25b63367689429e9e763f0720abf2b44a8b9e0b4a872bf8c11d42cccff16245daa4c0b6c0a23aae9b74fbfd36002c7406ac6d9e572bcd6ddea25d4
data/README.md ADDED
@@ -0,0 +1,294 @@
1
+ # DS Convert
2
+
3
+ RubyGem that provides scripts to transform and manage input from multiple sources to generate a DS 2.0 imports CSV. Also includes scripts to extract strings from sources for authority reconciliation.
4
+
5
+ # Requirements
6
+
7
+ * Ruby version >= 3.4.0
8
+
9
+ # Installation
10
+
11
+ Run
12
+
13
+ ```
14
+ gem install ds-convert
15
+ ```
16
+
17
+ ## Transformation scripts
18
+
19
+ There are three main scripts and one utility script:
20
+
21
+ ds-convert # Generate DS import CSV from member source data
22
+ ds-recon # Extract string values from source data for reconciliation
23
+ ds-validate-csv # Check DS Import CSV for values with trailing whitespace
24
+ marc-mrc-to-xml.rb # Utility script to convert MARC MRC files to MARC XML
25
+
26
+ The `ds-convert` script outputs a standard DS import CSV. Column names and order are defined in `lib/ds/constants.rb` and can be accessed via `DS::HEADINGS`.
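A minimal sketch of inspecting those column names from Ruby; it assumes `require 'ds'` loads the gem's entry point (`lib/ds.rb`), which the file list above suggests but the README does not state:

```ruby
# Peek at the standard DS import CSV column names.
# Assumes `require 'ds'` defines DS::HEADINGS, as described above.
require 'ds'

puts DS::HEADINGS.length           # number of import CSV columns
puts DS::HEADINGS.first(5).inspect # the first few column names
```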
27
+
28
+ The `ds-recon` script outputs a number of CSVs with extracted values for names (authors, artists, scribes, former owners), places, subjects, and genres (from various vocabularies). CSVs output by `ds-recon` have different columns according to the content type.
29
+
30
+ ### `ds-convert` process
31
+
32
+ Usage:
33
+
34
+ ```
35
+ ds-convert convert OPTIONS MANIFEST [SOURCE_DIR]
36
+ ```
37
+
38
+ For example,
39
+
40
+ ```
41
+ ds-convert convert --output path/to/outputdir/output.csv ../path/to/manifest.csv
42
+ ```
43
+
44
+ Given a directory containing a set of source records (MARC XML, DS 1.0
45
+ METS, OPenn TEI XML, or a CSV) and a `manifest.csv` file, `ds-convert` generates a DS
46
+ import CSV for all records listed in `manifest.csv`. The output import
47
+ CSV is used by the DS Import scripts to import data into the DS
48
+ Wikibase instance.
49
+
50
+ The values found in the `manifest.csv` are described in the [DS import
51
+ manifest data
52
+ dictionary](https://docs.google.com/spreadsheets/d/195ItCa2Qg69lp0lMuVlq2eLWJzIAmWHUzDP170_af3I/edit?usp=sharing).
53
+ The `DS::Manifest::ManifestValidator` validates the manifest and the
54
+ designated source records. Here is a sample manifest: [manifest.csv](https://github.com/DigitalScriptorium/ds-convert/blob/main/spec/fixtures/marc_xml/manifest.csv).
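The same validation can be run from Ruby. This is a hedged sketch that mirrors what the `ds-convert validate` subcommand does; the class names and methods come from this gem's executable, while `require 'ds'` and the paths are assumptions:

```ruby
# Validate a manifest CSV against its source directory before conversion.
require 'ds'

manifest  = DS::Manifest::Manifest.new 'path/to/manifest.csv', 'path/to/source_dir'
validator = DS::Manifest::ManifestValidator.new manifest

if validator.valid?
  puts 'Manifest is valid'
else
  puts validator.errors   # validation problems found
end
```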
55
+
56
+ ### `ds-recon` process
57
+
58
+ Given a list of source files, `ds-recon` generates one or more CSVs listing reconcilable values from the sources: names, subjects, places, etc.
59
+
60
+ Usage:
61
+
62
+ ```
63
+ ds-recon --source-type=TYPE genres FILES
64
+ ```
65
+
66
+ Source type is one of `marc-xml`, `tei-xml`, `ds-csv`, or `ds-mets-xml`.
67
+
68
+ Example:
69
+
70
+ ```
71
+ ds-recon genres --source-type=marc-xml --directory=path/to/output_dir/ path/to/marc/*.xml
72
+ ```
73
+
74
+ The `ds-recon` subcommands are:
75
+
76
+ - `write-all` - output all recon CSVs
77
+ - `genres` - output `genres.csv`
78
+ - `languages` - output `languages.csv`
79
+ - `materials` - output `materials.csv`
80
+ - `names` - output `names.csv`
81
+ - `places` - output `places.csv`
82
+ - `subjects` - output `subjects.csv`
83
+ - `titles` - output `titles.csv`
84
+ - `splits` - output `splits.csv` (see below)
85
+ - `validate` - validate a recon CSV for format and well-formedness
86
+
87
+ Splits: `splits.csv` is an ad hoc list of long lines in source records that exceed the Wikibase 400-character limit for fields. When such long lines occur, the data management team splits them into smaller chunks and adds them to the [`splits.csv`](https://github.com/DigitalScriptorium/ds-data/blob/main/terms/reconciled/splits.csv).
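For illustration only, here is one way such a long value could be broken into chunks under the 400-character limit. The real `splits.csv` entries are curated by hand, and this helper is not part of the gem:

```ruby
# Split a long field value into whitespace-delimited chunks of at most
# `limit` characters. (A single word longer than the limit still becomes
# its own, oversized chunk.)
def split_long_value value, limit: 400
  chunks = ['']
  value.split(/\s+/).each do |word|
    if !chunks.last.empty? && chunks.last.length + word.length + 1 > limit
      chunks << word.dup
    else
      chunks.last << (chunks.last.empty? ? word : " #{word}")
    end
  end
  chunks
end

chunks = split_long_value 'a long provenance note ' * 40
chunks.each { |c| puts c.length }  # each chunk is at most 400 characters
```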
88
+
89
+ ## Scripts folder
90
+
91
+ The `/scripts` directory contains utility scripts for managing DS data.
92
+
93
+ scripts/
94
+ ├── ds_mets_manifest.rb # Generate an import manifest for DS METS
95
+ ├── flp_modification_dates.txt # List of Free Lib TEI mod dates; used by gen-tei-manifest.rb
96
+ ├── gen-tei-manifest.rb # Generate an import manifest for OPenn TEI
97
+ ├── marc-tag.rb # Find MARC records by MARC tag/code
98
+ └── run-test-data.sh # Test ds-recon, ds-convert with all source types
99
+
100
+ # Development
101
+
102
+ ## Requirements
103
+
104
+ * Ruby version >= 3.4.0
105
+ * bundler Ruby gem
106
+
107
+ If you need to install Ruby or a compatible version of Ruby, you can use
108
+ [rbenv][rbenv], [rvm][rvm] or the [asdf][asdf] [ruby plugin][asdf-ruby].
109
+
110
+ [rbenv]: https://github.com/rbenv/rbenv "rbenv on github"
111
+ [rvm]: https://rvm.io "Ruby Version Manger home"
112
+ [asdf]: https://asdf-vm.com/guide/getting-started.html "ASDF getting started"
113
+ [asdf-ruby]: https://github.com/asdf-vm/asdf-ruby "ASDF Ruby plugin"
114
+
115
+ If you don't have the bundler gem installed, run:
116
+
117
+ ```shell
118
+ $ gem install bundler
119
+ ```
120
+
121
+ ## Setup
122
+
123
+ Clone the repository, then:
124
+
125
+ ```shell
126
+ cd ds-convert
127
+ bundle install
128
+ ```
129
+
130
+ Run the RSpec specs to confirm everything is working as expected:
131
+
132
+ ```
133
+ bundle exec rspec
134
+ ```
135
+
136
+ Generate the API documentation:
137
+
138
+ ```
139
+ gem install yard webrick
140
+ yard doc # open doc/index.html
141
+ # or run a yard server at http://localhost:8808/
142
+ yard server
143
+ ```
144
+
145
+ Open `doc/index.html` to view API docs.
146
+
147
+
148
+ ### Testing
149
+
150
+ This project uses RSpec for testing. To run the tests:
151
+
152
+ ```
153
+ bundle exec rspec
154
+ ```
155
+
156
+ ### Configuration
157
+
158
+ #### Institution/QID mappings
159
+
160
+ TODO: These mappings are probably no longer used. Investigate and remove if possible.
161
+
162
+ Several of the scripts rely on mappings from institution names to Wikidata QIDs
163
+ for CSV output. These have to be entered manually in `config/settings.yml`.
164
+
165
+ Wikidata QIDs for institutions are mapped to institution names in
166
+ `config/settings.yml`. These values are used to create a reverse hash,
167
+ `Constants::INSTITUTION_NAMES_TO_QID`, which maps institution names and their
168
+ one-word aliases to Wikidata QID URLs.
169
+
170
+ `config/settings.yml`:
171
+
172
+ ```yaml
173
+ ---
174
+ institutions:
175
+ Q814779:
176
+ - Beinecke Rare Book & Manuscript Library
177
+ - beinecke
178
+ Q995265:
179
+ - Bryn Mawr College
180
+ - brynmawr
181
+ Q63969940:
182
+ - Burke Library at Union Theological Seminary
183
+ - burke
184
+ ```
185
+
186
+ Lists can be any length to allow for a number of variant names. The
187
+ preferred name for the institution should be first in the list, and
188
+ alias(es) should come at the end. The last item in each list should
189
+ be the preferred short name for the institution; e.g., 'beinecke',
190
+ 'burke', 'penn'.
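A minimal sketch of the reverse lookup described above, using the sample entries from the YAML block; the constant name comes from the text, and building it from an inline hash here is only illustrative:

```ruby
# Build a reverse hash from institution names and aliases to Wikidata QID URLs.
institutions = {
  'Q814779'   => ['Beinecke Rare Book & Manuscript Library', 'beinecke'],
  'Q995265'   => ['Bryn Mawr College', 'brynmawr'],
  'Q63969940' => ['Burke Library at Union Theological Seminary', 'burke'],
}

INSTITUTION_NAMES_TO_QID = institutions.each_with_object({}) do |(qid, names), memo|
  names.each { |name| memo[name] = "https://www.wikidata.org/wiki/#{qid}" }
end

INSTITUTION_NAMES_TO_QID['beinecke'] # => "https://www.wikidata.org/wiki/Q814779"
```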
191
+
192
+ #### Reconciliation values
193
+
194
+ Reconciliation CSVs are maintained in git and loaded at runtime.
195
+
196
+ The file `config/settings.yml` defines the location of the git repository,
197
+ path to each reconciliation CSV, and key columns:
198
+
199
+ ```yaml
200
+ ---
201
+ recon:
202
+ local_dir: <%= ENV['DS_DATA_DIR'] || '/tmp' %>
203
+ git_repo: 'https://github.com/DigitalScriptorium/ds-data.git'
204
+ git_branch: main
205
+ git_local_name: ds-data
206
+ iiif_manifests: iiif/legacy-iiif-manifests.csv
207
+ legacy_ia_urls: internet_archive/legacy-ia-urls.csv
208
+ sets:
209
+ - name: names
210
+ repo_path: terms/names.csv
211
+ key_column: name
212
+ - name: genres
213
+ repo_path: terms/genres.csv
214
+ key_column: term
215
+ subset_column: vocabulary
216
+ - name: places
217
+ repo_path: terms/places.csv
218
+ key_column: place_as_recorded
219
+ ```
220
+
221
+ Values are:
222
+
223
+ - `sets`: each CSV set loaded by the `Recon` module
224
+ - `name`: name of each set, used by `Recon.find_set(name)`
225
+ - `repo_path`: path of the CSV file or files in the repository
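A hedged sketch of how a configured set might be looked up at runtime; `Recon.find_set` is named above, but the returned structure and the `Settings` access shown here are assumptions:

```ruby
require 'ds'

# Look up the configuration for the 'names' reconciliation set.
set = Recon.find_set 'names'
# assumed shape: { name: 'names', repo_path: 'terms/names.csv', key_column: 'name' }

# The CSV itself would live in the local clone of the ds-data repository.
csv_path = File.join Settings.recon.local_dir, Settings.recon.git_local_name, set[:repo_path]
```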
226
+
227
+ # DS Convert Architecture
228
+
229
+ DS Convert handles two responsibilities within the Digital Scriptorium source-data-to-Wikibase workflow.
230
+
231
+ ## Overall DS data workflow
232
+
233
+ The overall process from source file to Web publication is shown in the image below.
234
+
235
+ ![DSWorkflow1.jpg](docs/DSWorkflow1.jpg)
236
+
237
+ DS members provide their manuscript data in structured form as METS, MARC, TEI, CSV, or (forthcoming) EAD. That data is then converted to an agnostic DS import spreadsheet, with certain values enhanced by links to authorities, like Wikidata, the Getty Art and Architecture Thesaurus, and OCLC FAST. The import CSV is parsed and loaded into Wikibase. The Wikibase records are then extracted and ingested into the DS Search site.
238
+
239
+ The _**DS Convert**_ scripts `ds-recon` and `ds-convert` are responsible for the extraction and transformation of structured member data to generate the Agnostic Transition Spreadsheet, referred to here as the DS import CSV.
240
+
241
+ The full workflow, from source to Web, is:
242
+
243
+ 1. Source records for extraction are delivered to the DS data manager (as MARC, CSV, etc.)
244
+ 2. Reconciliation values are extracted as CSVs from the source records for names, places, languages, etc. (DS Convert's `ds-recon`)
245
+ 3. The DS data manager reconciles unreconciled values from the extracted reconciliation CSVs and adds them to the [DS data dictionaries][DS Data Dictionaries].
246
+ 4. The data manager generates a manifest listing all manuscript records to be extracted from the source.
247
+ 5. The manifest, the data dictionary CSVs, and source data are used to generate the import CSV (DS Convert's `ds-convert`)
248
+ 6. The generated import CSV is loaded into Wikibase by the data manager (DS Import service, not publicly available)
249
+ 7. The Wikibase data is exported and staged for ingest into the DS Search application (DS Import service)
250
+ 8. The DS Search application downloads exported Wikibase JSON and converts it into Solr records
251
+
252
+ [DS Data Dictionaries]: https://github.com/DigitalScriptorium/ds-data/tree/main/terms/reconciled "DS Data Dictionaries"
253
+
254
+ ## DS Convert workflow
255
+
256
+ The DS Convert scripts `ds-recon` and `ds-convert` are responsible for the extraction of reconciliation CSV data and the generation of DS import CSVs from source data provided by DS members. In more detail:
257
+
258
+ - `ds-recon` extracts values, like names, places, and languages, that are to be reconciled with authorities and added to the DS [authority data dictionaries][DS Data Dictionaries], and
259
+ - `ds-convert` generates the import CSV from source records, enhancing those records with authority values from the project data dictionaries
260
+
261
+ ## DS Convert components
262
+
263
+ The main work of `ds-recon` and `ds-convert` is done by extractors: [DS::Extractor::TeiXmlExtractor][TEI Extractor], [DS::Extractor::DsMetsXmlExtractor][METS Extractor], [DS::Extractor::DsCsvExtractor][CSV Extractor], and [DS::Extractor::MarcXmlExtractor][MARC Extractor].
264
+
265
+ [CSV Extractor]: lib/ds/extractor/ds_csv_extractor.rb "DS::Extractor::DsCsvExtractor"
266
+ [MARC Extractor]: lib/ds/extractor/marc_xml_extractor.rb "DS::Extractor::MarcXmlExtractor"
267
+ [METS Extractor]: lib/ds/extractor/ds_mets_xml_extractor.rb "DS::Extractor::DsMetsXmlExtractor"
268
+ [TEI Extractor]: lib/ds/extractor/tei_xml_extractor.rb "DS::Extractor::TeiXmlExtractor"
269
+
270
+ These classes are responsible for extracting reconciliation and manuscript description values from source files.
271
+
272
+ Separate modules are responsible for transforming the extracted data into either reconciliation CSVs or an import CSV.
273
+ Reconciliation transformations are managed by the [Recon][Recon] module and its components.
274
+
275
+ The `ds-convert` process is more complex and comprises several modules:
276
+
277
+ - [Mappers][Mappers] that generate hashes for each source record (i.e., each manuscript description) using extractors
278
+ - Format-specific [Extractors][Extractors] that extract data from source files (MARC XML, DS CSV, etc.)
279
+ - A [Convertor][Convertor] that orchestrates the mapping and collects hashes for output as an import CSV
280
+ - A [Manifest][Manifest] that lists all records in a data source and provides information to a record locator (see below)
281
+ - Record locators, like [DS::Extractor::XmlRecordLocator][XmlRecordLocator], that are used by a mapper to retrieve records for hash generation
282
+ - [Sources][Sources] that are responsible for opening and returning parsed source files _in the format expected by a mapper_ (see below)
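A minimal sketch of how these pieces fit together, using the same calls the `ds-convert` executable makes; `require 'ds'` and the output path are assumptions:

```ruby
require 'csv'
require 'ds'

# The manifest lists the records to convert; the converter maps each one
# to a hash of import CSV values.
manifest  = DS::Manifest::Manifest.new 'path/to/manifest.csv', 'path/to/source_dir'
converter = DS::Converter::Converter.new manifest

CSV.open 'output.csv', 'w', headers: true do |csv|
  csv << DS::HEADINGS                      # column names from lib/ds/constants.rb
  converter.convert { |row| csv << row }   # one row per manuscript record
end
```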
283
+
284
+ [Extractors]: lib/ds/extractor
285
+ [Recon]: lib/ds/recon.rb "DS Recon module"
286
+ [Mappers]: lib/ds/mapper "DS Record mappers"
287
+ [Convertor]: lib/ds/converter/converter.rb "DS Converter"
288
+ [Manifest]: lib/ds/manifest "DS Manifest classes"
289
+ [Sources]: lib/ds/source "DS Sources"
290
+ [XmlRecordLocator]: lib/ds/extractor/xml_record_locator.rb "DS::Extractor::XmlRecordLocator"
291
+
292
+
293
+ ### A note on Sources
294
+ A source (`DS::Source`) is used by a mapper to request a parsed source file; for example, a MARC XML file as a `Nokogiri::XML` document. The primary motivation for the Source is its inclusion of the [SourceCache](lib/ds/source/source_cache.rb), which is responsible for caching parsed source files. This is useful for source files that have many records and are slow to open and parse, such as DS CSVs and MARC XML files, which may have a thousand or more records.
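The caching idea can be illustrated in a few lines of Ruby; the class and method names below are assumptions for illustration, not the gem's actual `SourceCache` API:

```ruby
require 'nokogiri'

# Parse each XML source file once and reuse the parsed document on
# subsequent requests, keyed by file path.
class CachedXmlSource
  def initialize
    @cache = {}
  end

  def load_source path
    @cache[path] ||= File.open(path) { |f| Nokogiri::XML f }
  end
end

source = CachedXmlSource.new
doc = source.load_source 'path/to/records.xml'  # parsed on first call
doc = source.load_source 'path/to/records.xml'  # returned from the cache
```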
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rspec/core/rake_task"
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+
8
+ require "rubocop/rake_task"
9
+
10
+ RuboCop::RakeTask.new
11
+
12
+ task default: %i[spec rubocop]
data/config/settings.yml ADDED
@@ -0,0 +1,150 @@
1
+ ---
2
+ ds:
3
+ log_level: <%= ENV['DS_LOG_LEVEL'] || 'WARN' %>
4
+ recon:
5
+ local_dir: <%= ENV['DS_DATA_DIR'] || '/tmp' %>
6
+ git_repo: 'https://github.com/DigitalScriptorium/ds-data.git'
7
+ git_branch: main
8
+ git_local_name: ds-data
9
+ iiif_manifests: iiif/legacy-iiif-manifests.csv
10
+ legacy_ia_urls: internet_archive/legacy-ia-urls.csv
11
+ # TODO: make sets a hash so you can do Settings.recon.sets.genres
12
+ splits:
13
+ columns:
14
+
15
+ sets:
16
+ - name: :names
17
+ repo_path: terms/reconciled/names.csv
18
+ - name: :genres
19
+ repo_path: terms/reconciled/genres.csv
20
+ - name: :materials
21
+ repo_path: terms/reconciled/materials.csv
22
+ - name: :languages
23
+ repo_path: terms/reconciled/languages.csv
24
+ - name: :places
25
+ repo_path: terms/reconciled/places.csv
26
+ - name: :subjects
27
+ repo_path: terms/reconciled/subjects.csv
28
+ - name: :'named-subjects'
29
+ repo_path: terms/reconciled/named-subjects.csv
30
+ - name: :'all-subjects'
31
+ repo_path:
32
+ - terms/reconciled/subjects.csv
33
+ - terms/reconciled/named-subjects.csv
34
+ - name: :titles
35
+ repo_path: terms/reconciled/titles.csv
36
+ - name: :splits
37
+ repo_path: terms/reconciled/splits.csv
38
+
39
+ institutions:
40
+ Q463271:
41
+ - American Academy in Rome
42
+ - rome
43
+ Q814779:
44
+ - Beinecke Rare Book & Manuscript Library
45
+ - beinecke
46
+ Q995265:
47
+ - Bryn Mawr College
48
+ - brynmawr
49
+ Q63969940:
50
+ - Burke Library at Union Theological Seminary
51
+ - burke
52
+ Q5146808:
53
+ - The College of Physicians of Philadelphia
54
+ Q30257935:
55
+ - Conception Abbey and Seminary
56
+ - Conception Seminary College
57
+ - conception
58
+ Q1093910:
59
+ - City College of New York
60
+ - cuny
61
+ - ccny
62
+ Q5021042:
63
+ - State of California
64
+ - California State Library
65
+ - csl
66
+ Q49088:
67
+ - Columbia University
68
+ - columbia
69
+ Q49115:
70
+ - Cornell University
71
+ - cornell
72
+ Q5671855:
73
+ - Harry Ransom Center
74
+ - hrc
75
+ Q3087288:
76
+ - Free Library of Philadelphia
77
+ - flp
78
+ Q1501676:
79
+ - General Theological Seminary
80
+ - gts
81
+ Q5174002:
82
+ - Grolier Club
83
+ - grolier
84
+ Q13371:
85
+ - Harvard University
86
+ - harvard
87
+ Q1400558:
88
+ - Huntington Library, Art Museum, and Botanical Gardens
89
+ - The Huntington Library, Art Museum, and Botanical Gardens
90
+ - huntington
91
+ Q1079140:
92
+ - Indiana University, Bloomington
93
+ - Indiana University
94
+ - indiana
95
+ Q52413:
96
+ - University of Kansas
97
+ - kansas
98
+ Q1976985:
99
+ - Nelson-Atkins Museum of Art
100
+ - nelsonatkins
101
+ Q49210:
102
+ - New York University
103
+ - nyu
104
+ Q510324:
105
+ - Philadelphia Museum of Art
106
+ Q21578:
107
+ - Princeton University
108
+ - princeton
109
+ Q20745482:
110
+ - Providence Public Library
111
+ - providence
112
+ Q499451:
113
+ - Rutgers, The State University of New Jersey
114
+ - rutgers
115
+ Q5090408:
116
+ - Science History Institute
117
+ - Chemical Heritage Foundation
118
+ - shi
119
+ Q734774:
120
+ - Saint Louis University
121
+ - slu
122
+ Q49204:
123
+ - Smith College
124
+ - smith
125
+ Q1378320:
126
+ - Swarthmore College
127
+ - swarthmore
128
+ Q168756:
129
+ - University of California, Berkeley
130
+ - ucb
131
+ Q579968:
132
+ - University of Missouri
133
+ - mizzou
134
+ - missouri
135
+ Q766145:
136
+ - University of Oregon
137
+ - oregon
138
+ Q49117:
139
+ - University of Pennsylvania
140
+ - upenn
141
+ - penn
142
+ Q49205:
143
+ - Wellesley College
144
+ - wellesley
145
+ Q49112:
146
+ - Yale University
147
+ - yale
148
+ Q129421:
149
+ - University of California, Davis
150
+ - ucdavis
data/exe/ds-convert ADDED
@@ -0,0 +1,149 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'thor'
4
+ require 'csv'
5
+ require_relative '../lib/ds/cli'
6
+
7
+ # TODO: No URIs for Scribes in prototype data set; need test data
8
+ # Given a directory containing a set of source records (MARC XML, DS 1.0
9
+ # METS, OPenn TEI XML, a CSV) and a `manifest.csv` file, generate a DS
10
+ # import CSV for all records listed in `manifest.csv`. The output import
11
+ # CSV is used by the DS Import scripts to import data into the DS
12
+ # Wikibase instance.
13
+ class DSConvertCLI < DS::CLI
14
+
15
+ def self.exit_on_failure?
16
+ true
17
+ end
18
+
19
+ class_option :'output-csv', banner: 'FILE', desc: "Name of the output CSV file [default: output.csv]", default: 'output.csv', aliases: '-o'
20
+
21
+ desc "convert OPTIONS MANIFEST [SOURCE_DIR]", "Generate import CSV for MANIFEST and SOURCE_DIR"
22
+ long_desc <<-LONGDESC
23
+ Generate an import CSV for all files listed in the MANIFEST CSV that are found in SOURCE_DIR.
24
+
25
+ If SOURCE_DIR is not specified, the parent directory of MANIFEST is used to locate source records.
26
+
27
+ The MANIFEST is validated before the import CSV is generated. See 'validate' for details.
28
+ LONGDESC
29
+ ##
30
+ # Convert all source records listed in +manifest_csv+; all source
31
+ # records must be in +source_dir+. If +source_dir+ is nil, +convert+
32
+ # expects to find source records in the parent directory of
33
+ # +manifest_csv+.
34
+ #
35
+ # @param [String] manifest_csv path to the DS manifest CSV
36
+ # @param [String] source_dir path to the directory containing all
37
+ # source records listed in +manifest_csv+
38
+ def convert manifest_csv, source_dir = nil
39
+
40
+ invoke :recon_update
41
+ # validate the manifest csv before preceding
42
+ invoke :validate
43
+
44
+ # TODO: This is pretty busy; simplify; push to another class?
45
+ manifest = DS::Manifest::Manifest.new manifest_csv, source_dir
46
+ converter = DS::Converter::Converter.new manifest
47
+ count = 0
48
+ valid = true
49
+ outfile = options[:'output-csv']
50
+ write outfile: outfile do |csv|
51
+ converter.convert do |row|
52
+ count += 1
53
+ csv << row
54
+ end
55
+ end
56
+
57
+ print_message(options) { "Wrote #{number_to_delimited(count)} rows to '#{outfile}'" }
58
+ if converter.csv_valid?
59
+ print_message(options) { "CSV is valid: #{outfile}".colorize(:green) }
60
+ else
61
+ puts converter.errors
62
+ raise DSError, "Validation errors found; see previous messages".colorize(:red)
63
+ end
64
+ end
65
+
66
+ desc "validate MANIFEST SOURCE_DIR", "Validate the import MANIFEST"
67
+ # Ugh. I can't figure out how to make thor not join consecutive
68
+ # so I've separated each list item.
69
+ long_desc <<LONGDESC
70
+ Validate the import MANIFEST and the SOURCE_DIR.
71
+
72
+ NOTE: Works only for MARC at present.
73
+
74
+ The validator:
75
+
76
+ - Confirms all required columns are present
77
+
78
+ - Confirms all all required values are present
79
+
80
+ - Confirms all column values are the correct type
81
+
82
+ - Confirms all listed input files are present
83
+
84
+ - Confirms all listed input files match the record
85
+ identifier provided in the manifest
86
+
87
+ LONGDESC
88
+
89
+ def validate manifest_path, source_dir=nil
90
+ print_message(options, verbose_only: true) {
91
+ "Validating manifest CSV: '#{manifest_path}'"
92
+ }
93
+
94
+ manifest = DS::Manifest::Manifest.new manifest_path, source_dir
95
+
96
+ manifest_validator = DS::Manifest::ManifestValidator.new manifest
97
+ if manifest_validator.valid?
98
+ print_message(options) {
99
+ "SUCCESS! Manifest is valid: '#{manifest_path}'".colorize(:green)
100
+ }
101
+ else
102
+ print_message(options) { manifest_validator.errors.take(10).join("\n") }
103
+ print_message(options) {
104
+ "ERROR: Manifest has validation errors: #{manifest_path}".colorize(:red)
105
+ }
106
+
107
+ exit 1
108
+ end
109
+ end
110
+
111
+ protected
112
+
113
+ ##
114
+ # Create +outfile+ and yield +csv+ to caller.
115
+ #
116
+ # This method should replace {#write_csv}.
117
+ #
118
+ # @param [String] outfile the outfile name and path
119
+ # @yield [CSV] the CSV object
120
+ def write outfile:, &block
121
+ CSV.open outfile, "w", headers: true do |csv|
122
+ csv << DS::HEADINGS
123
+ yield csv
124
+ end
125
+ end
126
+
127
+ def write_csv rows, options
128
+ out_csv = options[:'output-csv']
129
+ CSV.open out_csv, "w", headers: true do |csv|
130
+ csv << DS::HEADINGS
131
+ rows.each do |row|
132
+ csv << row
133
+ end
134
+ end
135
+ puts "Wrote: '#{out_csv}'"
136
+ end
137
+
138
+ def validate_args files
139
+ return true if read_from_stdin? files
140
+
141
+ cannot_find = files.reject { |f| File.exist?(f) }
142
+ return true if cannot_find.empty?
143
+
144
+ puts "Can't find input file(s): #{cannot_find.join '; ' }"
145
+ false
146
+ end
147
+ end
148
+
149
+ DSConvertCLI.start ARGV