ds-convert 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +294 -0
- data/Rakefile +12 -0
- data/config/settings.yml +150 -0
- data/exe/ds-convert +149 -0
- data/exe/ds-recon +275 -0
- data/exe/ds-validate-csv +40 -0
- data/exe/marc-mrc-to-xml.rb +80 -0
- data/lib/ds/cli.rb +102 -0
- data/lib/ds/constants.rb +166 -0
- data/lib/ds/converter/converter.rb +124 -0
- data/lib/ds/converter/writer.rb +50 -0
- data/lib/ds/converter.rb +7 -0
- data/lib/ds/csv_util.rb +43 -0
- data/lib/ds/data/berkeley-arks.txt +4000 -0
- data/lib/ds/data/getty-aat-centuries.csv +71 -0
- data/lib/ds/data/iiif_manifests.csv +122 -0
- data/lib/ds/data/legacy-iiif-manifests.csv +77 -0
- data/lib/ds/ds_error.rb +1 -0
- data/lib/ds/extractor/base_record_locator.rb +24 -0
- data/lib/ds/extractor/base_term.rb +79 -0
- data/lib/ds/extractor/csv_record_locator.rb +13 -0
- data/lib/ds/extractor/ds_csv_extractor.rb +695 -0
- data/lib/ds/extractor/ds_mets_xml_extractor.rb +1114 -0
- data/lib/ds/extractor/genre.rb +45 -0
- data/lib/ds/extractor/language.rb +31 -0
- data/lib/ds/extractor/marc_xml_extractor.rb +1172 -0
- data/lib/ds/extractor/material.rb +12 -0
- data/lib/ds/extractor/name.rb +50 -0
- data/lib/ds/extractor/place.rb +11 -0
- data/lib/ds/extractor/subject.rb +58 -0
- data/lib/ds/extractor/tei_xml_extractor.rb +687 -0
- data/lib/ds/extractor/title.rb +52 -0
- data/lib/ds/extractor/xml_record_locator.rb +38 -0
- data/lib/ds/extractor.rb +24 -0
- data/lib/ds/institutions.rb +55 -0
- data/lib/ds/manifest/base_id_validator.rb +76 -0
- data/lib/ds/manifest/constants.rb +67 -0
- data/lib/ds/manifest/ds_csv_id_validator.rb +15 -0
- data/lib/ds/manifest/entry.rb +133 -0
- data/lib/ds/manifest/manifest.rb +74 -0
- data/lib/ds/manifest/manifest_validator.rb +256 -0
- data/lib/ds/manifest/simple_xml_id_validator.rb +42 -0
- data/lib/ds/manifest.rb +30 -0
- data/lib/ds/mapper/base_mapper.rb +221 -0
- data/lib/ds/mapper/ds_csv_mapper.rb +77 -0
- data/lib/ds/mapper/ds_mets_mapper.rb +85 -0
- data/lib/ds/mapper/marc_mapper.rb +87 -0
- data/lib/ds/mapper/tei_xml_mapper.rb +79 -0
- data/lib/ds/mapper.rb +13 -0
- data/lib/ds/recon/constants.rb +56 -0
- data/lib/ds/recon/ds_csv_enumerator.rb +16 -0
- data/lib/ds/recon/ds_mets_xml_enumerator.rb +14 -0
- data/lib/ds/recon/marc_xml_enumerator.rb +15 -0
- data/lib/ds/recon/recon_builder.rb +183 -0
- data/lib/ds/recon/recon_data.rb +37 -0
- data/lib/ds/recon/recon_manager.rb +92 -0
- data/lib/ds/recon/source_enumerator.rb +21 -0
- data/lib/ds/recon/tei_xml_enumerator.rb +14 -0
- data/lib/ds/recon/type/all_subjects.rb +18 -0
- data/lib/ds/recon/type/genres.rb +50 -0
- data/lib/ds/recon/type/languages.rb +38 -0
- data/lib/ds/recon/type/materials.rb +40 -0
- data/lib/ds/recon/type/named_subjects.rb +20 -0
- data/lib/ds/recon/type/names.rb +65 -0
- data/lib/ds/recon/type/places.rb +40 -0
- data/lib/ds/recon/type/recon_type.rb +136 -0
- data/lib/ds/recon/type/splits.rb +34 -0
- data/lib/ds/recon/type/subjects.rb +65 -0
- data/lib/ds/recon/type/titles.rb +38 -0
- data/lib/ds/recon/url_lookup.rb +52 -0
- data/lib/ds/recon.rb +292 -0
- data/lib/ds/source/base_source.rb +32 -0
- data/lib/ds/source/ds_csv.rb +18 -0
- data/lib/ds/source/ds_mets_xml.rb +20 -0
- data/lib/ds/source/marc_xml.rb +22 -0
- data/lib/ds/source/source_cache.rb +69 -0
- data/lib/ds/source/tei_xml.rb +22 -0
- data/lib/ds/source.rb +20 -0
- data/lib/ds/util/cache.rb +111 -0
- data/lib/ds/util/csv_validator.rb +209 -0
- data/lib/ds/util/csv_writer.rb +42 -0
- data/lib/ds/util/strings.rb +194 -0
- data/lib/ds/util.rb +37 -0
- data/lib/ds/version.rb +5 -0
- data/lib/ds.rb +237 -0
- metadata +246 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: d88babed86e60cc72a333832fd8480af964e54978d37ed9a0005e7cc140787af
  data.tar.gz: d83d3d595c0ea0299dfc83ce4ee4d1d5e955ce6edb06a8b6ddc0ce0207a16262
SHA512:
  metadata.gz: 62e11d6bf7cfb5397228177127464ef7f1e5ad4bc948b1afdd7e358d1658974abd2d0f1bfbd8d884cc9b33e31769060bd64480c456fc84ebe7727f4dbb4da3cd
  data.tar.gz: cfc8df9d5f25b63367689429e9e763f0720abf2b44a8b9e0b4a872bf8c11d42cccff16245daa4c0b6c0a23aae9b74fbfd36002c7406ac6d9e572bcd6ddea25d4
data/README.md
ADDED
@@ -0,0 +1,294 @@
# DS Convert

RubyGem that provides scripts to transform and manage input from multiple sources to generate a DS 2.0 import CSV. Also includes scripts to extract strings from sources for authority reconciliation.

# Requirements

* Ruby version >= 3.4.0

# Installation

Run

```
gem install ds-convert
```

## Transformation scripts

There are three main scripts and one utility script:

    ds-convert          # Generate DS import CSV from member source data
    ds-recon            # Extract string values from source data for reconciliation
    ds-validate-csv     # Check DS Import CSV for values with trailing whitespace
    marc-mrc-to-xml.rb  # Utility script to convert MARC MRC files to MARC XML

The `ds-convert` script outputs a standard DS import CSV. Column names and order are defined in `lib/ds/constants.rb` and can be accessed via `DS::HEADINGS`.

The `ds-recon` script outputs a number of CSVs with extracted values for names (authors, artists, scribes, former owners), places, subjects, and genres (from various vocabularies). CSVs output by `ds-recon` have different columns according to the content type.
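For instance, a script that needs the standard header row can read it straight from that constant. A minimal sketch, assuming the gem loads via `require 'ds'` (the entry point is an assumption based on `lib/ds.rb`):

```ruby
require 'csv'
require 'ds'  # assumed entry point for the gem's lib/ds.rb

# Write an empty import CSV containing only the standard DS heading row.
CSV.open('empty-import.csv', 'w') do |csv|
  csv << DS::HEADINGS  # column names in their defined order
end
```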
### `ds-convert` process

Usage:

```
ds-convert convert OPTIONS MANIFEST [SOURCE_DIR]
```

For example,

```
ds-convert convert --output path/to/outputdir/output.csv ../path/to/manifest.csv
```

Given a directory containing a set of source records (MARC XML, DS 1.0
METS, OPenn TEI XML, a CSV) and a `manifest.csv` file, `ds-convert` generates a DS
import CSV for all records listed in `manifest.csv`. The output import
CSV is used by the DS Import scripts to import data into the DS
Wikibase instance.

The values found in the `manifest.csv` are described in the [DS import
manifest data
dictionary](https://docs.google.com/spreadsheets/d/195ItCa2Qg69lp0lMuVlq2eLWJzIAmWHUzDP170_af3I/edit?usp=sharing).
The `DS::Manifest::ManifestValidator` validates the manifest and the
designated source records. Here is a sample manifest: [manifest.csv](https://github.com/DigitalScriptorium/ds-convert/blob/main/spec/fixtures/marc_xml/manifest.csv).
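The same validation can be run from Ruby with the classes the CLI itself uses (see `exe/ds-convert`). A minimal sketch; the `require 'ds'` entry point is an assumption:

```ruby
require 'ds'  # assumed entry point for the gem's lib/ds.rb

# Validate a manifest the way the ds-convert CLI does before converting.
manifest  = DS::Manifest::Manifest.new 'path/to/manifest.csv'
validator = DS::Manifest::ManifestValidator.new manifest

if validator.valid?
  puts "Manifest is valid"
else
  puts validator.errors.take(10)
end
```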
### `ds-recon` process

Given a list of source files, `ds-recon` generates one or more CSVs listing reconcilable values from the sources: names, subjects, places, etc.

Usage:

```
ds-recon --source-type=TYPE genres FILES
```

Source type is one of `marc-xml`, `tei-xml`, `ds-csv`, or `ds-mets-xml`.

Example:

```
ds-recon genres --source-type=marc-xml --directory=path/to/output_dir/ path/to/marc/*.xml
```

The `ds-recon` subcommands are:

- `write-all` - output all recon CSVs
- `genres` - output `genres.csv`
- `languages` - output `languages.csv`
- `materials` - output `materials.csv`
- `names` - output `names.csv`
- `places` - output `places.csv`
- `subjects` - output `subjects.csv`
- `titles` - output `titles.csv`
- `splits` - output `splits.csv` (see below)
- `validate` - validate a recon CSV for format and well-formedness

Splits: `splits.csv` is an ad hoc list of long lines in source records that exceed the Wikibase 400-character limit for fields. When such long lines occur, the data management team splits them into smaller chunks and adds them to [`splits.csv`](https://github.com/DigitalScriptorium/ds-data/blob/main/terms/reconciled/splits.csv).
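For a sense of what that manual splitting amounts to, here is an illustrative chunking of a long value. This is not the project's code; the real `splits.csv` entries are curated by hand:

```ruby
# Illustration only: break a long extracted value into chunks that fit
# under the 400-character Wikibase field limit.
WIKIBASE_FIELD_LIMIT = 400

def split_long_value value, limit = WIKIBASE_FIELD_LIMIT
  value.chars.each_slice(limit).map(&:join)
end

# split_long_value(long_note).each { |chunk| ... add a row to splits.csv ... }
```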
## Scripts folder

The `/scripts` directory contains utility scripts for managing DS data.

    scripts/
    ├── ds_mets_manifest.rb         # Generate an import manifest for DS METS
    ├── flp_modification_dates.txt  # List of Free Lib TEI mod dates; used by gen-tei-manifest.rb
    ├── gen-tei-manifest.rb         # Generate an import manifest for OPenn TEI
    ├── marc-tag.rb                 # Find MARC records by MARC tag/code
    └── run-test-data.sh            # Test ds-recon, ds-convert with all source types

# Development

## Requirements

* Ruby version >= 3.4.0
* bundler Ruby gem

If you need to install Ruby or a compatible version of Ruby, you can use
[rbenv][rbenv], [rvm][rvm] or the [asdf][asdf] [ruby plugin][asdf-ruby].

[rbenv]: https://github.com/rbenv/rbenv "rbenv on github"
[rvm]: https://rvm.io "Ruby Version Manager home"
[asdf]: https://asdf-vm.com/guide/getting-started.html "ASDF getting started"
[asdf-ruby]: https://github.com/asdf-vm/asdf-ruby "ASDF Ruby plugin"

If you don't have the bundler gem installed, run:

```shell
$ gem install bundler
```

## Setup

Clone the repository, then:

```shell
cd ds-convert
bundle install
```

Run the RSpec specs to confirm everything is working as expected:

```
bundle exec rspec
```

Generate the API documentation:

```
gem install yard webrick
yard doc     # open doc/index.html
# or run a yard server at http://localhost:8808/
yard server
```

Open `doc/index.html` to view API docs.

### Testing

This project uses RSpec for testing. To run the tests:

```
bundle exec rspec
```

### Configuration

#### Institution/QID mappings

TODO: These mappings are probably no longer used. Investigate and remove if possible.

Several of the scripts rely on mappings from institution names to Wikidata QIDs
for CSV output. These have to be entered manually in `config/settings.yml`.

Wikidata QIDs for institutions are mapped to institution names in
`config/settings.yml`. These values are used to create a reverse hash,
`Constants::INSTITUTION_NAMES_TO_QID`, which maps institution names and their
one-word aliases to Wikidata QID URLs.

`config/settings.yml`:

```yaml
---
institutions:
  Q814779:
    - Beinecke Rare Book & Manuscript Library
    - beinecke
  Q995265:
    - Bryn Mawr College
    - brynmawr
  Q63969940:
    - Burke Library at Union Theological Seminary
    - burke
```

Lists can be any length to allow for a number of variant names. The
preferred name for the institution should be first in the list, and
alias(es) should come at the end. The last item in each list should
be the preferred short name for the institution; e.g., 'beinecke',
'burke', 'penn'.
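A minimal sketch of how such a reverse lookup can be derived from that mapping; the constant name comes from this README, but the construction below is illustrative, not the gem's code:

```ruby
# Illustrative only: build a name/alias => Wikidata QID URL hash from the
# institutions mapping (QIDs shown are taken from config/settings.yml).
institutions = {
  'Q814779' => ['Beinecke Rare Book & Manuscript Library', 'beinecke'],
  'Q49117'  => ['University of Pennsylvania', 'upenn', 'penn'],
}

names_to_qid = institutions.each_with_object({}) do |(qid, names), memo|
  names.each { |name| memo[name] = "https://www.wikidata.org/wiki/#{qid}" }
end

names_to_qid['penn'] # => "https://www.wikidata.org/wiki/Q49117"
```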
#### Reconciliation values

Reconciliation CSVs are maintained in git and loaded at runtime.

The file `config/settings.yml` defines the location of the git repository,
the path to each reconciliation CSV, and the key columns:

```yaml
---
recon:
  local_dir: <%= ENV['DS_DATA_DIR'] || '/tmp' %>
  git_repo: 'https://github.com/DigitalScriptorium/ds-data.git'
  git_branch: main
  git_local_name: ds-data
  iiif_manifests: iiif/legacy-iiif-manifests.csv
  legacy_ia_urls: internet_archive/legacy-ia-urls.csv
  sets:
    - name: names
      repo_path: terms/names.csv
      key_column: name
    - name: genres
      repo_path: terms/genres.csv
      key_column: term
      subset_column: vocabulary
    - name: places
      repo_path: terms/places.csv
      key_column: place_as_recorded
```

Values are:

- `sets`: each CSV set loaded by the `Recon` module
- `name`: name of each set, used by `Recon.find_set(name)`
- `repo_path`: path of the CSV file or files in the repository
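A sketch of how a set's CSV can be located from settings like these; the key names follow the example above, and the gem's own loader (including `Recon.find_set`) may differ:

```ruby
require 'erb'
require 'yaml'

# Resolve the local path of the 'genres' reconciliation CSV from the example
# settings above. The file is ERB-templated, so render it before parsing.
settings = YAML.safe_load(
  ERB.new(File.read('config/settings.yml')).result,
  permitted_classes: [Symbol]  # the shipped file uses symbol set names like :genres
)
recon = settings['recon']

set  = recon['sets'].find { |s| s['name'].to_s == 'genres' }
path = File.join(recon['local_dir'], recon['git_local_name'], set['repo_path'])
# => e.g. "/tmp/ds-data/terms/genres.csv"
```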
# DS Convert Architecture

DS Convert handles two responsibilities within the Digital Scriptorium source-data-to-Wikibase workflow.

## Overall DS data workflow

The overall process from source file to Web publication is shown in the image below.



DS members provide their manuscript data in structured form as METS, MARC, TEI, CSV, or (forthcoming) EAD. That data is then converted to an agnostic DS import spreadsheet, with certain values enhanced by links to authorities, like Wikidata, the Getty Art and Architecture Thesaurus, and OCLC FAST. The import CSV is parsed and loaded into Wikibase. The Wikibase records are then extracted and ingested into the DS Search site.

The _**DS Convert**_ scripts `ds-recon` and `ds-convert` are responsible for the extraction and transformation of structured member data to generate the Agnostic Transition Spreadsheet, referred to here as the DS import CSV.

The full workflow, from source to Web, is:

1. Source records for extraction are delivered to the DS data manager (as MARC, CSV, etc.)
2. Reconciliation values are extracted as CSVs from the source records for names, places, languages, etc. (DS Convert's `ds-recon`)
3. The DS data manager reconciles unreconciled values from the extracted reconciliation CSVs and adds them to the [DS data dictionaries][DS Data Dictionaries].
4. The data manager generates a manifest listing all manuscript records to be extracted from the source.
5. The manifest, the data dictionary CSVs, and source data are used to generate the import CSV (DS Convert's `ds-convert`)
6. The generated import CSV is loaded into Wikibase by the data manager (DS Import service, not publicly available)
7. The Wikibase data is exported and staged for ingest into the DS Search application (DS Import service)
8. The DS Search application downloads and converts exported Wikibase JSON into Solr records

[DS Data Dictionaries]: https://github.com/DigitalScriptorium/ds-data/tree/main/terms/reconciled "DS Data Dictionaries"
## DS Convert workflow

The DS Convert scripts `ds-recon` and `ds-convert` are responsible for the extraction of reconciliation CSV data and the generation of DS import CSVs from source data provided by DS members. In more detail:

- `ds-recon` extracts values, like names, places, and languages, from sources that are to be reconciled with authorities and added to the DS [authority data dictionaries][DS Data Dictionaries], and
- `ds-convert` generates the import CSV from source records, enhancing those records with authority values from the project data dictionaries

## DS Convert components

The main work of `ds-recon` and `ds-convert` is done by extractors: [DS::Extractor::TeiXml][TEI Extractor], [DS::Extractor::DsMetsXmlExtractor][METS Extractor], [DS::Extractor::DsCsvExtractor][CSV Extractor], and [DS::Extractor::MarcXmlExtractor][MARC Extractor].

[CSV Extractor]: lib/ds/extractor/ds_csv_extractor.rb "DS::Extractor::DsCsvExtractor"
[MARC Extractor]: lib/ds/extractor/marc_xml_extractor.rb "DS::Extractor::MarcXmlExtractor"
[METS Extractor]: lib/ds/extractor/ds_mets_xml_extractor.rb "DS::Extractor::DsMetsXmlExtractor"
[TEI Extractor]: lib/ds/extractor/tei_xml_extractor.rb "DS::Extractor::TeiXml"

These classes are responsible for extracting reconciliation and manuscript description values from source files.

Separate modules are responsible for transforming the extracted data into either reconciliation CSVs or an import CSV.
Reconciliation transformations are managed by the [Recon][Recon] module and its components.

The `ds-convert` process is more complex and comprises several modules (see the sketch following this list):

- [Mappers][Mappers] that generate hashes for each source record (i.e., each manuscript description) using extractors
- Format-specific [Extractors][Extractors] that extract data from source files (MARC XML, DS CSV, etc.)
- A [Converter][Converter] that orchestrates the mapping and collects hashes for output as an import CSV
- A [Manifest][Manifest] that lists all records in a data source and provides information to a record locator (see below)
- Record locators, like [DS::Extractor::XmlRecordLocator][XmlRecordLocator], that are used by a mapper to retrieve records for hash generation
- [Sources][Sources] that are responsible for opening and returning parsed source files _in the format expected by a mapper_ (see below)

[Extractors]: lib/ds/extractor
[Recon]: lib/ds/recon.rb "DS Recon module"
[Mappers]: lib/ds/mapper "DS Record mappers"
[Converter]: lib/ds/converter/converter.rb "DS Converter"
[Manifest]: lib/ds/manifest "DS Manifest classes"
[Sources]: lib/ds/source "DS Sources"
[XmlRecordLocator]: lib/ds/extractor/xml_record_locator.rb "DS::Extractor::XmlRecordLocator"
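Following the CLI in `exe/ds-convert`, the orchestration looks roughly like this; the `require 'ds'` entry point is an assumption, while the class names and the `convert` block mirror the CLI code:

```ruby
require 'csv'
require 'ds'  # assumed entry point for the gem's lib/ds.rb

# The converter walks the manifest; each entry is handed to a format-specific
# mapper, and every mapped hash comes back as one import-CSV row.
manifest  = DS::Manifest::Manifest.new 'path/to/manifest.csv', 'path/to/source_dir'
converter = DS::Converter::Converter.new manifest

CSV.open('output.csv', 'w', headers: true) do |csv|
  csv << DS::HEADINGS
  converter.convert { |row| csv << row }  # one row per manuscript record
end
```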
### A note on Sources

A `DS::Source` is used by a mapper to request a parsed source file; for example, a MARC XML file as a Nokogiri::XML document. The primary motivation for the Source is its inclusion of the [SourceCache](lib/ds/source/source_cache.rb), which is responsible for caching opened source files. This is useful for source files that have many records and are slow to open and parse, as with DS CSVs and MARC XML files, which may have a thousand or more records.
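The caching is, at heart, memoization keyed on the file path. A minimal sketch of the pattern (not the gem's `SourceCache` implementation, which may differ in detail):

```ruby
require 'nokogiri'

# Parse each XML source file once; later requests return the cached document.
class ParsedFileCache
  def initialize
    @cache = {}
  end

  def fetch path
    @cache[path] ||= Nokogiri::XML(File.read(path))
  end
end

# cache = ParsedFileCache.new
# doc   = cache.fetch 'path/to/marc.xml'  # parsed on first call, cached after
```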
data/Rakefile
ADDED
data/config/settings.yml
ADDED
@@ -0,0 +1,150 @@
---
ds:
  log_level: <%= ENV['DS_LOG_LEVEL'] || 'WARN' %>
recon:
  local_dir: <%= ENV['DS_DATA_DIR'] || '/tmp' %>
  git_repo: 'https://github.com/DigitalScriptorium/ds-data.git'
  git_branch: main
  git_local_name: ds-data
  iiif_manifests: iiif/legacy-iiif-manifests.csv
  legacy_ia_urls: internet_archive/legacy-ia-urls.csv
  # TODO: make sets a hash so you can do Settings.recon.sets.genres
  splits:
    columns:

  sets:
    - name: :names
      repo_path: terms/reconciled/names.csv
    - name: :genres
      repo_path: terms/reconciled/genres.csv
    - name: :materials
      repo_path: terms/reconciled/materials.csv
    - name: :languages
      repo_path: terms/reconciled/languages.csv
    - name: :places
      repo_path: terms/reconciled/places.csv
    - name: :subjects
      repo_path: terms/reconciled/subjects.csv
    - name: :'named-subjects'
      repo_path: terms/reconciled/named-subjects.csv
    - name: :'all-subjects'
      repo_path:
        - terms/reconciled/subjects.csv
        - terms/reconciled/named-subjects.csv
    - name: :titles
      repo_path: terms/reconciled/titles.csv
    - name: :splits
      repo_path: terms/reconciled/splits.csv

institutions:
  Q463271:
    - American Academy in Rome
    - rome
  Q814779:
    - Beinecke Rare Book & Manuscript Library
    - beinecke
  Q995265:
    - Bryn Mawr College
    - brynmawr
  Q63969940:
    - Burke Library at Union Theological Seminary
    - burke
  Q5146808:
    - The College of Physicians of Philadelphia
  Q30257935:
    - Conception Abbey and Seminary
    - Conception Seminary College
    - conception
  Q1093910:
    - City College of New York
    - cuny
    - ccny
  Q5021042:
    - State of California
    - California State Library
    - csl
  Q49088:
    - Columbia University
    - columbia
  Q49115:
    - Cornell University
    - cornell
  Q5671855:
    - Harry Ransom Center
    - hrc
  Q3087288:
    - Free Library of Philadelphia
    - flp
  Q1501676:
    - General Theological Seminary
    - gts
  Q5174002:
    - Grolier Club
    - grolier
  Q13371:
    - Harvard University
    - harvard
  Q1400558:
    - Huntington Library, Art Museum, and Botanical Gardens
    - The Huntington Library, Art Museum, and Botanical Gardens
    - huntington
  Q1079140:
    - Indiana University, Bloomington
    - Indiana University
    - indiana
  Q52413:
    - University of Kansas
    - kansas
  Q1976985:
    - Nelson-Atkins Museum of Art
    - nelsonatkins
  Q49210:
    - New York University
    - nyu
  Q510324:
    - Philadelphia Museum of Art
  Q21578:
    - Princeton University
    - princeton
  Q20745482:
    - Providence Public Library
    - providence
  Q499451:
    - Rutgers, The State University of New Jersey
    - rutgers
  Q5090408:
    - Science History Institute
    - Chemical Heritage Foundation
    - shi
  Q734774:
    - Saint Louis University
    - slu
  Q49204:
    - Smith College
    - smith
  Q1378320:
    - Swarthmore College
    - swarthmore
  Q168756:
    - University of California, Berkeley
    - ucb
  Q579968:
    - University of Missouri
    - mizzou
    - missouri
  Q766145:
    - University of Oregon
    - oregon
  Q49117:
    - University of Pennsylvania
    - upenn
    - penn
  Q49205:
    - Wellesley College
    - wellesley
  Q49112:
    - Yale University
    - yale
  Q129421:
    - University of California, Davis
    - ucdavis
data/exe/ds-convert
ADDED
@@ -0,0 +1,149 @@
#!/usr/bin/env ruby

require 'thor'
require 'csv'
require_relative '../lib/ds/cli'

# TODO: No URIs for Scribes in prototype data set; need test data
# Given a directory containing a set of source records (MARC XML, DS 1.0
# METS, OPenn TEI XML, a CSV) and a `manifest.csv` file, generate a DS
# import CSV for all records listed in `manifest.csv`. The output import
# CSV is used by the DS Import scripts to import data into the DS
# Wikibase instance.
class DSConvertCLI < DS::CLI

  def self.exit_on_failure?
    true
  end

  class_option :'output-csv', banner: 'FILE', desc: "Name of the output CSV file [default: output.csv]", default: 'output.csv', aliases: '-o'

  desc "convert OPTIONS MANIFEST [SOURCE_DIR]", "Generate import CSV for MANIFEST and SOURCE_DIR"
  long_desc <<-LONGDESC
    Generate an import CSV for all files listed in the MANIFEST CSV that are found in SOURCE_DIR.

    If SOURCE_DIR is not specified, the parent directory of MANIFEST is used to locate source records.

    The MANIFEST is validated before the import CSV is generated. See 'validate' for details.
  LONGDESC
  ##
  # Convert all source records listed in +manifest_csv+; all source
  # records must be in +source_dir+. If +source_dir+ is nil, +convert+
  # expects to find source records in the parent directory of
  # +manifest_csv+.
  #
  # @param [String] manifest_csv path to the DS manifest CSV
  # @param [String] source_dir path to the directory containing all
  #   source records listed in +manifest_csv+
  def convert manifest_csv, source_dir = nil

    invoke :recon_update
    # validate the manifest csv before proceeding
    invoke :validate

    # TODO: This is pretty busy; simplify; push to another class?
    manifest  = DS::Manifest::Manifest.new manifest_csv, source_dir
    converter = DS::Converter::Converter.new manifest
    count     = 0
    valid     = true
    outfile   = options[:'output-csv']
    write outfile: outfile do |csv|
      converter.convert do |row|
        count += 1
        csv << row
      end
    end

    print_message(options) { "Wrote #{number_to_delimited(count)} rows to '#{outfile}'" }
    if converter.csv_valid?
      print_message(options) { "CSV is valid: #{outfile}".colorize(:green) }
    else
      puts converter.errors
      raise DSError, "Validation errors found; see previous messages".colorize(:red)
    end
  end

  desc "validate MANIFEST SOURCE_DIR", "Validate the import MANIFEST"
  # Ugh. I can't figure out how to make thor not join consecutive
  # lines, so I've separated each list item.
  long_desc <<LONGDESC
Validate the import MANIFEST and the SOURCE_DIR.

NOTE: Works only for MARC at present.

The validator:

- Confirms all required columns are present

- Confirms all required values are present

- Confirms all column values are the correct type

- Confirms all listed input files are present

- Confirms all listed input files match the record
  identifier provided in the manifest

LONGDESC

  def validate manifest_path, source_dir=nil
    print_message(options, verbose_only: true) {
      "Validating manifest CSV: '#{manifest_path}'"
    }

    manifest = DS::Manifest::Manifest.new manifest_path, source_dir

    manifest_validator = DS::Manifest::ManifestValidator.new manifest
    if manifest_validator.valid?
      print_message(options) {
        "SUCCESS! Manifest is valid: '#{manifest_path}'".colorize(:green)
      }
    else
      print_message(options) { manifest_validator.errors.take(10).join("\n") }
      print_message(options) {
        "ERROR: Manifest has validation errors: #{manifest_path}".colorize(:red)
      }

      exit 1
    end
  end

  protected

  ##
  # Create +outfile+ and yield +csv+ to caller.
  #
  # This method should replace {#write_csv}.
  #
  # @param [String] outfile the outfile name and path
  # @yield [CSV] the CSV object
  def write outfile:, &block
    CSV.open outfile, "w", headers: true do |csv|
      csv << DS::HEADINGS
      yield csv
    end
  end

  def write_csv rows, options
    out_csv = options[:'output-csv']
    CSV.open out_csv, "w", headers: true do |csv|
      csv << DS::HEADINGS
      rows.each do |row|
        csv << row
      end
    end
    puts "Wrote: '#{out_csv}'"
  end

  def validate_args files
    return true if read_from_stdin? files

    cannot_find = files.reject { |f| File.exist?(f) }
    return true if cannot_find.empty?

    puts "Can't find input file(s): #{cannot_find.join '; ' }"
    false
  end
end

DSConvertCLI.start ARGV