cocina-models 0.112.1 → 0.114.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.circleci/config.yml +1 -1
- data/Gemfile +1 -0
- data/Gemfile.lock +30 -15
- data/README.md +10 -18
- data/bin/validate-data +282 -0
- data/description_types.yml +2 -0
- data/docs/description_types.md +2 -0
- data/lib/cocina/generator/datatype.rb +1 -1
- data/lib/cocina/generator/generator.rb +1 -1
- data/lib/cocina/generator/schema.rb +2 -2
- data/lib/cocina/generator/schema_array.rb +1 -1
- data/lib/cocina/generator/schema_base.rb +2 -2
- data/lib/cocina/generator/schema_ref.rb +1 -1
- data/lib/cocina/generator/schema_value.rb +1 -1
- data/lib/cocina/generator.rb +1 -1
- data/lib/cocina/json_schema_wrapper.rb +1 -2
- data/lib/cocina/models/access.rb +4 -4
- data/lib/cocina/models/admin_policy_access_template.rb +4 -4
- data/lib/cocina/models/admin_policy_lite.rb +2 -2
- data/lib/cocina/models/collection_lite.rb +4 -4
- data/lib/cocina/models/descriptive_basic_value.rb +1 -1
- data/lib/cocina/models/descriptive_parallel_event.rb +1 -1
- data/lib/cocina/models/descriptive_value.rb +1 -1
- data/lib/cocina/models/dro_access.rb +4 -4
- data/lib/cocina/models/dro_lite.rb +6 -6
- data/lib/cocina/models/embargo.rb +4 -4
- data/lib/cocina/models/event.rb +1 -1
- data/lib/cocina/models/file_access.rb +4 -4
- data/lib/cocina/models/language.rb +1 -1
- data/lib/cocina/models/mapping/from_mods/description.rb +1 -1
- data/lib/cocina/models/mapping/from_mods/form.rb +0 -6
- data/lib/cocina/models/mapping/to_mods/contributor.rb +4 -4
- data/lib/cocina/models/related_resource.rb +1 -1
- data/lib/cocina/models/title.rb +1 -1
- data/lib/cocina/models/validatable.rb +6 -1
- data/lib/cocina/models/validators/catalog_links_validator.rb +1 -1
- data/lib/cocina/models/version.rb +1 -1
- data/lib/cocina/models.rb +3 -5
- data/lib/cocina/rspec/factories.rb +3 -3
- data/schema.json +5 -5
- metadata +4 -5
- data/docs/index.html +0 -20
- data/openapi.yml +0 -1930
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 2b8514cca9a06bbf3e7c36d84a1f27d093e16a263e8ac1cb92289b6640d28bec
|
|
4
|
+
data.tar.gz: '09f25bf203314c534eba5e21a16b163f56b21ac2e858cf9a5a25f309086efa12'
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2dce9b0cc4eb70cc6a2ddbc2dda88e5249b9ac58a0ec96523ec9c3e23ebe82108d87eb1e36b70e8e44a0d8c0abf75682c6f284dbda26e4cefe469b4acf5bf715
|
|
7
|
+
data.tar.gz: 795ba280b81c888fd005cc5af39d671b942f3d5b718b61b44e902581d476958654f28485aec4f8d18de4d725a6b9338c84c8a0acc4a7c315b04e5cd2ec22c9a8
|
data/.circleci/config.yml
CHANGED
data/Gemfile
CHANGED
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
cocina-models (0.112.1)
|
|
4
|
+
cocina-models (0.114.0)
|
|
5
5
|
activesupport
|
|
6
6
|
deprecation
|
|
7
7
|
dry-struct (~> 1.0)
|
|
@@ -32,6 +32,8 @@ GEM
|
|
|
32
32
|
securerandom (>= 0.3)
|
|
33
33
|
tzinfo (~> 2.0, >= 2.0.5)
|
|
34
34
|
uri (>= 0.13.1)
|
|
35
|
+
addressable (2.8.9)
|
|
36
|
+
public_suffix (>= 2.0.2, < 8.0)
|
|
35
37
|
ast (2.4.3)
|
|
36
38
|
attr_extras (7.1.0)
|
|
37
39
|
base64 (0.3.0)
|
|
@@ -57,7 +59,7 @@ GEM
|
|
|
57
59
|
concurrent-ruby (~> 1.0)
|
|
58
60
|
dry-core (~> 1.1)
|
|
59
61
|
zeitwerk (~> 2.6)
|
|
60
|
-
dry-struct (1.8.
|
|
62
|
+
dry-struct (1.8.1)
|
|
61
63
|
dry-core (~> 1.1)
|
|
62
64
|
dry-types (~> 1.8, >= 1.8.2)
|
|
63
65
|
ice_nine (~> 0.11)
|
|
@@ -73,7 +75,7 @@ GEM
|
|
|
73
75
|
activesupport (>= 3.0, < 9.0)
|
|
74
76
|
equivalent-xml (0.6.0)
|
|
75
77
|
nokogiri (>= 1.4.3)
|
|
76
|
-
erb (6.0.
|
|
78
|
+
erb (6.0.2)
|
|
77
79
|
hana (1.3.7)
|
|
78
80
|
i18n (1.14.8)
|
|
79
81
|
concurrent-ruby (~> 1.0)
|
|
@@ -84,7 +86,10 @@ GEM
|
|
|
84
86
|
prism (>= 1.3.0)
|
|
85
87
|
rdoc (>= 4.0.0)
|
|
86
88
|
reline (>= 0.4.2)
|
|
87
|
-
json (2.
|
|
89
|
+
json (2.19.1)
|
|
90
|
+
json-schema (6.2.0)
|
|
91
|
+
addressable (~> 2.8)
|
|
92
|
+
bigdecimal (>= 3.1, < 5)
|
|
88
93
|
json_schemer (2.5.0)
|
|
89
94
|
bigdecimal
|
|
90
95
|
hana (~> 1.3)
|
|
@@ -95,7 +100,10 @@ GEM
|
|
|
95
100
|
language_server-protocol (3.17.0.5)
|
|
96
101
|
lint_roller (1.1.0)
|
|
97
102
|
logger (1.7.0)
|
|
98
|
-
|
|
103
|
+
mcp (0.8.0)
|
|
104
|
+
json-schema (>= 4.1)
|
|
105
|
+
minitest (6.0.2)
|
|
106
|
+
drb (~> 2.0)
|
|
99
107
|
prism (~> 1.5)
|
|
100
108
|
multi_json (1.19.1)
|
|
101
109
|
nokogiri (1.19.1-arm64-darwin)
|
|
@@ -116,6 +124,7 @@ GEM
|
|
|
116
124
|
psych (5.3.1)
|
|
117
125
|
date
|
|
118
126
|
stringio
|
|
127
|
+
public_suffix (7.0.5)
|
|
119
128
|
racc (1.8.1)
|
|
120
129
|
rainbow (3.1.1)
|
|
121
130
|
rake (13.3.1)
|
|
@@ -135,16 +144,17 @@ GEM
|
|
|
135
144
|
rspec-expectations (3.13.5)
|
|
136
145
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
137
146
|
rspec-support (~> 3.13.0)
|
|
138
|
-
rspec-mocks (3.13.
|
|
147
|
+
rspec-mocks (3.13.8)
|
|
139
148
|
diff-lcs (>= 1.2.0, < 2.0)
|
|
140
149
|
rspec-support (~> 3.13.0)
|
|
141
150
|
rspec-support (3.13.7)
|
|
142
151
|
rspec_junit_formatter (0.6.0)
|
|
143
152
|
rspec-core (>= 2, < 4, != 2.12.0)
|
|
144
|
-
rubocop (1.
|
|
153
|
+
rubocop (1.85.1)
|
|
145
154
|
json (~> 2.3)
|
|
146
155
|
language_server-protocol (~> 3.17.0.2)
|
|
147
156
|
lint_roller (~> 1.1.0)
|
|
157
|
+
mcp (~> 0.6)
|
|
148
158
|
parallel (~> 1.10)
|
|
149
159
|
parser (>= 3.3.0.2)
|
|
150
160
|
rainbow (>= 2.2.2, < 4.0)
|
|
@@ -199,15 +209,17 @@ DEPENDENCIES
|
|
|
199
209
|
rubocop (~> 1.24)
|
|
200
210
|
rubocop-rake
|
|
201
211
|
rubocop-rspec
|
|
212
|
+
ruby-progressbar
|
|
202
213
|
simplecov
|
|
203
214
|
|
|
204
215
|
CHECKSUMS
|
|
205
216
|
activesupport (8.1.2) sha256=88842578ccd0d40f658289b0e8c842acfe9af751afee2e0744a7873f50b6fdae
|
|
217
|
+
addressable (2.8.9) sha256=cc154fcbe689711808a43601dee7b980238ce54368d23e127421753e46895485
|
|
206
218
|
ast (2.4.3) sha256=954615157c1d6a382bc27d690d973195e79db7f55e9765ac7c481c60bdb4d383
|
|
207
219
|
attr_extras (7.1.0) sha256=d96fc9a9dd5d85ba2d37762440a816f840093959ae26bb90da994c2d9f1fc827
|
|
208
220
|
base64 (0.3.0) sha256=27337aeabad6ffae05c265c450490628ef3ebd4b67be58257393227588f5a97b
|
|
209
221
|
bigdecimal (4.0.1) sha256=8b07d3d065a9f921c80ceaea7c9d4ae596697295b584c296fe599dd0ad01c4a7
|
|
210
|
-
cocina-models (0.112.1)
|
|
222
|
+
cocina-models (0.114.0)
|
|
211
223
|
concurrent-ruby (1.3.6) sha256=6b56837e1e7e5292f9864f34b69c5a2cbc75c0cf5338f1ce9903d10fa762d5ab
|
|
212
224
|
connection_pool (3.0.2) sha256=33fff5ba71a12d2aa26cb72b1db8bba2a1a01823559fb01d29eb74c286e62e0a
|
|
213
225
|
date (3.5.1) sha256=750d06384d7b9c15d562c76291407d89e368dda4d4fff957eb94962d325a0dc0
|
|
@@ -219,23 +231,25 @@ CHECKSUMS
|
|
|
219
231
|
dry-core (1.2.0) sha256=0cc5a7da88df397f153947eeeae42e876e999c1e30900f3c536fb173854e96a1
|
|
220
232
|
dry-inflector (1.3.1) sha256=7fb0c2bb04f67638f25c52e7ba39ab435d922a3a5c3cd196120f63accb682dcc
|
|
221
233
|
dry-logic (1.6.0) sha256=da6fedbc0f90fc41f9b0cc7e6f05f5d529d1efaef6c8dcc8e0733f685745cea2
|
|
222
|
-
dry-struct (1.8.
|
|
234
|
+
dry-struct (1.8.1) sha256=033868594c45241540172bf1ebbc8bb76b72b4f0717072325deba38ac13e80f1
|
|
223
235
|
dry-types (1.9.1) sha256=baebeecdb9f8395d6c9d227b62011279440943e3ef2468fe8ccc1ba11467f178
|
|
224
236
|
edtf (3.2.0) sha256=a15a0ee274e49c8047a3ebb5d61d793ba44f7f8ffbf0595392c467e3ea8d2447
|
|
225
237
|
equivalent-xml (0.6.0) sha256=8919761efa848ad0846369ff8be1f646b17e5061698c4867b09829000cc3f487
|
|
226
|
-
erb (6.0.
|
|
238
|
+
erb (6.0.2) sha256=9fe6264d44f79422c87490a1558479bd0e7dad4dd0e317656e67ea3077b5242b
|
|
227
239
|
hana (1.3.7) sha256=5425db42d651fea08859811c29d20446f16af196308162894db208cac5ce9b0d
|
|
228
240
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
229
241
|
ice_nine (0.11.2) sha256=5d506a7d2723d5592dc121b9928e4931742730131f22a1a37649df1c1e2e63db
|
|
230
242
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
231
243
|
irb (1.17.0) sha256=168c4ddb93d8a361a045c41d92b2952c7a118fa73f23fe14e55609eb7a863aae
|
|
232
|
-
json (2.
|
|
244
|
+
json (2.19.1) sha256=dd94fdc59e48bff85913829a32350b3148156bc4fd2a95a2568a78b11344082d
|
|
245
|
+
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
233
246
|
json_schemer (2.5.0) sha256=2f01fb4cce721a4e08dd068fc2030cffd0702a7f333f1ea2be6e8991f00ae396
|
|
234
247
|
jsonpath (1.1.5) sha256=29f70467193a2dc93ab864ec3d3326d54267961acc623f487340eb9c34931dbe
|
|
235
248
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
236
249
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
237
250
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
238
|
-
|
|
251
|
+
mcp (0.8.0) sha256=ae8bd146bb8e168852866fd26f805f52744f6326afb3211e073f78a95e0c34fb
|
|
252
|
+
minitest (6.0.2) sha256=db6e57956f6ecc6134683b4c87467d6dd792323c7f0eea7b93f66bd284adbc3d
|
|
239
253
|
multi_json (1.19.1) sha256=7aefeff8f2c854bf739931a238e4aea64592845e0c0395c8a7d2eea7fdd631b7
|
|
240
254
|
nokogiri (1.19.1-arm64-darwin) sha256=dfe2d337e6700eac47290407c289d56bcf85805d128c1b5a6434ddb79731cb9e
|
|
241
255
|
nokogiri (1.19.1-x86_64-linux-gnu) sha256=1a4902842a186b4f901078e692d12257678e6133858d0566152fe29cdb98456a
|
|
@@ -247,6 +261,7 @@ CHECKSUMS
|
|
|
247
261
|
prettyprint (0.2.0) sha256=2bc9e15581a94742064a3cc8b0fb9d45aae3d03a1baa6ef80922627a0766f193
|
|
248
262
|
prism (1.9.0) sha256=7b530c6a9f92c24300014919c9dcbc055bf4cdf51ec30aed099b06cd6674ef85
|
|
249
263
|
psych (5.3.1) sha256=eb7a57cef10c9d70173ff74e739d843ac3b2c019a003de48447b2963d81b1974
|
|
264
|
+
public_suffix (7.0.5) sha256=1a8bb08f1bbea19228d3bed6e5ed908d1cb4f7c2726d18bd9cadf60bc676f623
|
|
250
265
|
racc (1.8.1) sha256=4a7f6929691dbec8b5209a0b373bc2614882b55fc5d2e447a21aaa691303d62f
|
|
251
266
|
rainbow (3.1.1) sha256=039491aa3a89f42efa1d6dec2fc4e62ede96eb6acd95e52f1ad581182b79bc6a
|
|
252
267
|
rake (13.3.1) sha256=8c9e89d09f66a26a01264e7e3480ec0607f0c497a861ef16063604b1b08eb19c
|
|
@@ -256,10 +271,10 @@ CHECKSUMS
|
|
|
256
271
|
rspec (3.13.2) sha256=206284a08ad798e61f86d7ca3e376718d52c0bc944626b2349266f239f820587
|
|
257
272
|
rspec-core (3.13.6) sha256=a8823c6411667b60a8bca135364351dda34cd55e44ff94c4be4633b37d828b2d
|
|
258
273
|
rspec-expectations (3.13.5) sha256=33a4d3a1d95060aea4c94e9f237030a8f9eae5615e9bd85718fe3a09e4b58836
|
|
259
|
-
rspec-mocks (3.13.
|
|
274
|
+
rspec-mocks (3.13.8) sha256=086ad3d3d17533f4237643de0b5c42f04b66348c28bf6b9c2d3f4a3b01af1d47
|
|
260
275
|
rspec-support (3.13.7) sha256=0640e5570872aafefd79867901deeeeb40b0c9875a36b983d85f54fb7381c47c
|
|
261
276
|
rspec_junit_formatter (0.6.0) sha256=40dde674e6ae4e6cc0ff560da25497677e34fefd2338cc467a8972f602b62b15
|
|
262
|
-
rubocop (1.
|
|
277
|
+
rubocop (1.85.1) sha256=3dbcf9e961baa4c376eeeb2a03913dca5e3987033b04d38fa538aa1e7406cc77
|
|
263
278
|
rubocop-ast (1.49.0) sha256=49c3676d3123a0923d333e20c6c2dbaaae2d2287b475273fddee0c61da9f71fd
|
|
264
279
|
rubocop-rake (0.7.1) sha256=3797f2b6810c3e9df7376c26d5f44f3475eda59eb1adc38e6f62ecf027cbae4d
|
|
265
280
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
@@ -280,4 +295,4 @@ CHECKSUMS
|
|
|
280
295
|
zeitwerk (2.7.5) sha256=d8da92128c09ea6ec62c949011b00ed4a20242b255293dd66bf41545398f73dd
|
|
281
296
|
|
|
282
297
|
BUNDLED WITH
|
|
283
|
-
4.0.
|
|
298
|
+
4.0.7
|
data/README.md
CHANGED
|
@@ -1,18 +1,21 @@
|
|
|
1
1
|
[](https://circleci.com/gh/sul-dlss/cocina-models)
|
|
2
2
|
[](https://codecov.io/github/sul-dlss/cocina-models)
|
|
3
3
|
[](https://badge.fury.io/rb/cocina-models)
|
|
4
|
-
[](http://validator.swagger.io/validator/debug?url=https://raw.githubusercontent.com/sul-dlss/cocina-models/main/openapi.yml)
|
|
5
4
|
|
|
6
5
|
# Cocina::Models
|
|
7
6
|
|
|
8
7
|
The cocina-models gem is a Ruby implementation of the Stanford Digital Repository (SDR) data model, which we named "Cocina." The data being modeled is oriented around digital repository objects.
|
|
9
8
|
|
|
10
|
-
The data model is expressed in an
|
|
9
|
+
The data model is expressed in a JSON Schema specification that lives in this codebase. Expressing the model in such a spec allows for rich validation (using gems such as `json_schemer`). The gem provides a set of generators (see below) to generate Ruby classes from the specification, with modeling provided by dry-struct / dry-types. Together, these provide a way for consumers to validate objects against models and to manipulate those objects.
|
|
11
10
|
|
|
12
11
|
Note that the data model encodes properties as camelCase, which the team believes to be consistent with other HTTP APIs and the original design of the Cocina data model. While using camelCase in Ruby code may look and feel wrong, we did explore automagic conversion between camelCase in the model and snake_case in the Ruby context. We ultimately concluded that we have enough representations of the data model in enough codebases to reasonably worry about data inconsistency problems, none of which we need in our work on SDR.
|
|
13
12
|
|
|
14
13
|
For more about the model for description see https://consul.stanford.edu/display/DIGMETADATA/Digital+Object+Metadata+Documentation#DigitalObjectMetadataDocumentation-Cocinamodel
|
|
15
14
|
|
|
15
|
+
## Schema
|
|
16
|
+
|
|
17
|
+
The [schema.json](schema.json) can also be viewed via JSON HERO: https://jsonhero.io/j/rynNH3NLBhcf and DOR Services App's openapi.yml: https://sul-dlss.github.io/dor-services-app/
|
|
18
|
+
|
|
16
19
|
## Configuration
|
|
17
20
|
|
|
18
21
|
Set the PURL url base:
|
|
@@ -52,7 +55,10 @@ Beyond what is necessary to test the generator, the Cocina model classes are not
|
|
|
52
55
|
|
|
53
56
|
## Testing validation changes
|
|
54
57
|
|
|
55
|
-
If there is a possibility that a model, mapping, or validation change will conflict with some existing objects then
|
|
58
|
+
If there is a possibility that a model, mapping, or validation change will conflict with some existing objects then `bin/validate-data` should be used for testing. This operates on a sample of objects from the repository and reports any validation errors. You may get the sample by running the script [bin/export-cocina-head-versions](https://github.com/sul-dlss/dor-services-app/pull/5854) and downloading the data file to your computer.
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
Alternatively, you can use [validate-cocina](https://github.com/sul-dlss/dor-services-app/blob/main/bin/validate-cocina) for testing. This must be run on the `sdr-infra` VM since it requires deploying a branch of cocina-models. It is slower than using `bin/validate-data`, but all of the data is completely up to date.
|
|
56
62
|
|
|
57
63
|
For background on object validation, as it relates to migrating versions, see: https://github.com/sul-dlss/dor-services-app/wiki/Migrating-Cocina
|
|
58
64
|
|
|
@@ -150,21 +156,7 @@ This list of services is known to include:
|
|
|
150
156
|
* [sul-dlss/sdr-api](https://github.com/sul-dlss/sdr-api)
|
|
151
157
|
* [sul-dlss/dor-services-app](https://github.com/sul-dlss/dor-services-app/)
|
|
152
158
|
|
|
153
|
-
|
|
154
|
-
#### Step 3A: Update API specifications
|
|
155
|
-
|
|
156
|
-
**NOTE**: You can skip step 3A if there have not been any changes to the `cocina-models` OpenAPI spec since the prior release.
|
|
157
|
-
|
|
158
|
-
The cocina-models gem is used in applications that have an API specification that accepts Cocina models.
|
|
159
|
-
|
|
160
|
-
#### Step 3B: Bump gems and create the PRs
|
|
161
|
-
|
|
162
|
-
If you updated the `schema.json` in step 3A, use the same PR for step 3B. Why? When [dor-services-app](https://github.com/sul-dlss/dor-services-app), for example, is updated to use the new models (via the auto-update script), these clients should be updated at the same time or there is risk of models produced by dor-services-app not being acceptable to the clients.
|
|
163
|
-
|
|
164
|
-
1. Perform `bundle update --conservative cocina-models dor-services-client` in the services above and make PRs for those repos if they don't already exist. You may first need to update how these gems are pinned in the `Gemfile` in order to bump them.
|
|
165
|
-
2. Note that sdr-client is not currently used in these applications, but if it were, would also need to be bumped to the latest release.
|
|
166
|
-
|
|
167
|
-
#### Step 3C: Merge 'em
|
|
159
|
+
Perform `bundle update --conservative cocina-models dor-services-client` in the services above and make PRs for those repos. You may first need to update how these gems are pinned in the `Gemfile` in order to bump them.
|
|
168
160
|
|
|
169
161
|
Get the directly coupled services PRs merged before the deploy in step 5.
|
|
170
162
|
|
data/bin/validate-data
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Validate JSONL (XZ compressed) files against the schema
|
|
5
|
+
|
|
6
|
+
# Trap Ctrl+C to exit gracefully
|
|
7
|
+
Signal.trap('INT') do
|
|
8
|
+
puts "\nTerminated"
|
|
9
|
+
exit(1)
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
Signal.trap('TERM') do
|
|
13
|
+
puts "\nTerminated"
|
|
14
|
+
exit(1)
|
|
15
|
+
end
|
|
16
|
+
|
|
17
|
+
require 'bundler/setup'
|
|
18
|
+
require 'cocina/models'
|
|
19
|
+
require 'json'
|
|
20
|
+
require 'ruby-progressbar'
|
|
21
|
+
require 'optparse'
|
|
22
|
+
|
|
23
|
+
# Parse command line options
|
|
24
|
+
def parse_options # rubocop:disable Metrics/MethodLength
|
|
25
|
+
options = {
|
|
26
|
+
processes: 12,
|
|
27
|
+
count: nil,
|
|
28
|
+
batch_size: 100
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
OptionParser.new do |opts|
|
|
32
|
+
opts.banner = 'Usage: validate-data FILENAME [options]'
|
|
33
|
+
|
|
34
|
+
opts.on('-p', '--processes NUM', Integer, 'Number of processes to use (default: 12)') do |p|
|
|
35
|
+
options[:processes] = p
|
|
36
|
+
end
|
|
37
|
+
|
|
38
|
+
opts.on('-c', '--count NUM', Integer, 'Total line count (skips counting pass if provided)') do |c|
|
|
39
|
+
options[:count] = c
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
opts.on('-b', '--batch-size NUM', Integer, 'Batch size for worker processing (default: 100)') do |b|
|
|
43
|
+
options[:batch_size] = b
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
opts.on('-h', '--help', 'Display this help message') do
|
|
47
|
+
puts opts
|
|
48
|
+
exit
|
|
49
|
+
end
|
|
50
|
+
end.parse!
|
|
51
|
+
|
|
52
|
+
# Filename is required as a positional argument
|
|
53
|
+
if ARGV.empty?
|
|
54
|
+
puts 'Error: FILENAME is required'
|
|
55
|
+
puts 'Usage: validate-data FILENAME [options]'
|
|
56
|
+
puts 'Run with --help for more information'
|
|
57
|
+
exit 1
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
options[:filename] = ARGV[0]
|
|
61
|
+
options
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# Count lines in the file
|
|
65
|
+
def count_lines(filename)
|
|
66
|
+
count = 0
|
|
67
|
+
IO.popen(['xzcat', filename]) do |io|
|
|
68
|
+
io.each_line { count += 1 }
|
|
69
|
+
end
|
|
70
|
+
count
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
# Get total line count (either from option or by counting)
|
|
74
|
+
def get_total_lines(filename, provided_count)
|
|
75
|
+
if provided_count
|
|
76
|
+
puts "Using provided line count: #{provided_count}"
|
|
77
|
+
provided_count
|
|
78
|
+
else
|
|
79
|
+
puts 'Counting lines...'
|
|
80
|
+
total = count_lines(filename)
|
|
81
|
+
puts "Total lines to validate: #{total}"
|
|
82
|
+
total
|
|
83
|
+
end
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
# Worker process that reads batches from a pipe and validates
|
|
87
|
+
def worker_process(reader) # rubocop:disable Metrics/MethodLength
|
|
88
|
+
errors = []
|
|
89
|
+
|
|
90
|
+
loop do
|
|
91
|
+
# Read length prefix (4 bytes)
|
|
92
|
+
length_data = reader.read(4)
|
|
93
|
+
break if length_data.nil? || length_data.empty?
|
|
94
|
+
|
|
95
|
+
length = length_data.unpack1('N')
|
|
96
|
+
data = reader.read(length)
|
|
97
|
+
batch = Marshal.load(data) # rubocop:disable Security/MarshalLoad
|
|
98
|
+
|
|
99
|
+
# Process each line in the batch
|
|
100
|
+
batch.each do |line_num, line_content|
|
|
101
|
+
json = JSON.parse(line_content)
|
|
102
|
+
Cocina::Models.build(json)
|
|
103
|
+
rescue JSON::ParserError => e
|
|
104
|
+
errors << { line: line_num, error: "JSON Parse Error: #{e.message}" }
|
|
105
|
+
rescue Cocina::Models::ValidationError => e
|
|
106
|
+
errors << { line: line_num, error: "Validation Error: #{e.message}" }
|
|
107
|
+
rescue Cocina::Models::UnknownTypeError => e
|
|
108
|
+
errors << { line: line_num, error: "Unknown Type Error: #{e.message}" }
|
|
109
|
+
rescue StandardError => e
|
|
110
|
+
errors << { line: line_num, error: "Error: #{e.class} - #{e.message}" }
|
|
111
|
+
end
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
errors
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
# Spawn worker processes
|
|
118
|
+
def spawn_workers(num_processes) # rubocop:disable Metrics/MethodLength
|
|
119
|
+
workers = []
|
|
120
|
+
result_readers = []
|
|
121
|
+
|
|
122
|
+
num_processes.times do
|
|
123
|
+
work_reader, work_writer = IO.pipe
|
|
124
|
+
result_reader, result_writer = IO.pipe
|
|
125
|
+
|
|
126
|
+
pid = fork do
|
|
127
|
+
# Child process
|
|
128
|
+
work_writer.close
|
|
129
|
+
result_reader.close
|
|
130
|
+
|
|
131
|
+
errors = worker_process(work_reader)
|
|
132
|
+
|
|
133
|
+
# Send results back
|
|
134
|
+
result_writer.write(Marshal.dump(errors))
|
|
135
|
+
result_writer.close
|
|
136
|
+
work_reader.close
|
|
137
|
+
exit(0)
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
# Parent process
|
|
141
|
+
work_reader.close
|
|
142
|
+
result_writer.close
|
|
143
|
+
|
|
144
|
+
workers << { pid: pid, writer: work_writer }
|
|
145
|
+
result_readers << result_reader
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
[workers, result_readers]
|
|
149
|
+
end
|
|
150
|
+
|
|
151
|
+
# Send a batch to a worker
|
|
152
|
+
def send_batch(worker, batch)
|
|
153
|
+
return if batch.empty?
|
|
154
|
+
|
|
155
|
+
data = Marshal.dump(batch)
|
|
156
|
+
worker[:writer].write([data.bytesize].pack('N'))
|
|
157
|
+
worker[:writer].write(data)
|
|
158
|
+
end
|
|
159
|
+
|
|
160
|
+
# Stream file and distribute work to workers
|
|
161
|
+
def distribute_work(filename, workers, batch_size, total_lines) # rubocop:disable Metrics/MethodLength
|
|
162
|
+
line_number = 0
|
|
163
|
+
current_worker = 0
|
|
164
|
+
batch = []
|
|
165
|
+
|
|
166
|
+
# Create progress bar
|
|
167
|
+
progressbar = ProgressBar.create(
|
|
168
|
+
title: 'Validating',
|
|
169
|
+
total: total_lines,
|
|
170
|
+
format: '%t: |%B| %p%% %c/%C %a %e',
|
|
171
|
+
throttle_rate: 0.1
|
|
172
|
+
)
|
|
173
|
+
|
|
174
|
+
IO.popen(['xzcat', filename]) do |io|
|
|
175
|
+
io.each_line do |line|
|
|
176
|
+
line_number += 1
|
|
177
|
+
batch << [line_number, line]
|
|
178
|
+
|
|
179
|
+
# When batch is full, send to worker
|
|
180
|
+
if batch.size >= batch_size
|
|
181
|
+
worker = workers[current_worker]
|
|
182
|
+
send_batch(worker, batch)
|
|
183
|
+
batch = []
|
|
184
|
+
current_worker = (current_worker + 1) % workers.length
|
|
185
|
+
end
|
|
186
|
+
|
|
187
|
+
# Update progress bar
|
|
188
|
+
progressbar.increment
|
|
189
|
+
end
|
|
190
|
+
end
|
|
191
|
+
|
|
192
|
+
# Send any remaining lines in the last batch
|
|
193
|
+
if batch.any?
|
|
194
|
+
worker = workers[current_worker]
|
|
195
|
+
send_batch(worker, batch)
|
|
196
|
+
end
|
|
197
|
+
|
|
198
|
+
# Final progress update
|
|
199
|
+
progressbar.finish
|
|
200
|
+
end
|
|
201
|
+
|
|
202
|
+
# Collect results from all workers
|
|
203
|
+
def collect_results(workers, result_readers)
|
|
204
|
+
# Close all worker input pipes to signal completion
|
|
205
|
+
workers.each { |w| w[:writer].close }
|
|
206
|
+
|
|
207
|
+
puts 'Collecting results from workers...'
|
|
208
|
+
all_errors = []
|
|
209
|
+
|
|
210
|
+
result_readers.each do |reader|
|
|
211
|
+
data = reader.read
|
|
212
|
+
worker_errors = Marshal.load(data) # rubocop:disable Security/MarshalLoad
|
|
213
|
+
all_errors.concat(worker_errors)
|
|
214
|
+
reader.close
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
# Wait for all workers to complete
|
|
218
|
+
workers.each { |w| Process.wait(w[:pid]) }
|
|
219
|
+
|
|
220
|
+
all_errors
|
|
221
|
+
end
|
|
222
|
+
|
|
223
|
+
# Print validation summary
|
|
224
|
+
def print_summary(total_lines, errors, elapsed_time) # rubocop:disable Metrics/MethodLength
|
|
225
|
+
puts '=' * 80
|
|
226
|
+
puts 'VALIDATION SUMMARY'
|
|
227
|
+
puts '=' * 80
|
|
228
|
+
puts "Total lines processed: #{total_lines}"
|
|
229
|
+
puts "Lines with errors: #{errors.length}"
|
|
230
|
+
puts "Success rate: #{((total_lines - errors.length).to_f / total_lines * 100).round(2)}%"
|
|
231
|
+
puts "Time elapsed: #{elapsed_time.round(2)} seconds"
|
|
232
|
+
puts "Throughput: #{(total_lines / elapsed_time).round(0)} lines/second"
|
|
233
|
+
|
|
234
|
+
return unless errors.any?
|
|
235
|
+
|
|
236
|
+
puts "\n"
|
|
237
|
+
puts 'Error details:'
|
|
238
|
+
puts '-' * 80
|
|
239
|
+
# Sort errors by line number for better readability
|
|
240
|
+
errors.sort_by { |e| e[:line] }.each do |error|
|
|
241
|
+
puts "Line #{error[:line]}: #{error[:error]}"
|
|
242
|
+
end
|
|
243
|
+
puts "\n"
|
|
244
|
+
puts "Line numbers with errors: #{errors.sort_by { |e| e[:line] }.map { |e| e[:line] }.join(', ')}"
|
|
245
|
+
end
|
|
246
|
+
|
|
247
|
+
# Main execution
|
|
248
|
+
def main
|
|
249
|
+
options = parse_options
|
|
250
|
+
|
|
251
|
+
puts "Validating file: #{options[:filename]}"
|
|
252
|
+
puts "Using #{options[:processes]} processes with batch size #{options[:batch_size]}"
|
|
253
|
+
puts '=' * 80
|
|
254
|
+
|
|
255
|
+
# Get total line count
|
|
256
|
+
total_lines = get_total_lines(options[:filename], options[:count])
|
|
257
|
+
puts '=' * 80
|
|
258
|
+
|
|
259
|
+
# Spawn worker processes
|
|
260
|
+
workers, result_readers = spawn_workers(options[:processes])
|
|
261
|
+
|
|
262
|
+
# Start timing
|
|
263
|
+
start_time = Time.now
|
|
264
|
+
|
|
265
|
+
# Distribute work to workers
|
|
266
|
+
distribute_work(options[:filename], workers, options[:batch_size], total_lines)
|
|
267
|
+
|
|
268
|
+
# Collect results
|
|
269
|
+
all_errors = collect_results(workers, result_readers)
|
|
270
|
+
|
|
271
|
+
# Calculate elapsed time
|
|
272
|
+
elapsed_time = Time.now - start_time
|
|
273
|
+
|
|
274
|
+
# Print summary
|
|
275
|
+
print_summary(total_lines, all_errors, elapsed_time)
|
|
276
|
+
|
|
277
|
+
# Exit with appropriate code
|
|
278
|
+
exit(all_errors.empty? ? 0 : 1)
|
|
279
|
+
end
|
|
280
|
+
|
|
281
|
+
# Run the script
|
|
282
|
+
main
|
data/description_types.yml
CHANGED
|
@@ -432,6 +432,8 @@ identifier:
|
|
|
432
432
|
- value: "West Mat #"
|
|
433
433
|
- value: Wikidata
|
|
434
434
|
code: wikidata
|
|
435
|
+
- value: International Article Number
|
|
436
|
+
- value: ProQuest Module ID
|
|
435
437
|
note:
|
|
436
438
|
- value: abstract
|
|
437
439
|
description: A short overview of a research article or other work.
|
data/docs/description_types.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
module Cocina
|
|
4
4
|
module Generator
|
|
5
|
-
# Class for generating from
|
|
5
|
+
# Class for generating from a JSON schema
|
|
6
6
|
class Schema < SchemaBase
|
|
7
7
|
def schema_properties
|
|
8
8
|
@schema_properties ||= (properties + all_of_properties + one_of_properties).uniq(&:key)
|
|
@@ -112,7 +112,7 @@ module Cocina
|
|
|
112
112
|
key: key,
|
|
113
113
|
# The property does less validation because it may vary between
|
|
114
114
|
# different oneOf schemas. Validation is still performed
|
|
115
|
-
# by
|
|
115
|
+
# by JSON Schema.
|
|
116
116
|
relaxed: true,
|
|
117
117
|
parent: self,
|
|
118
118
|
schemas: schemas)
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
module Cocina
|
|
4
4
|
module Generator
|
|
5
|
-
# Base class for generating from
|
|
5
|
+
# Base class for generating from a JSON Schema
|
|
6
6
|
class SchemaBase
|
|
7
7
|
attr_reader :schema_doc, :key, :required, :nullable, :parent, :relaxed, :schemas, :lite
|
|
8
8
|
|
|
@@ -74,7 +74,7 @@ module Cocina
|
|
|
74
74
|
def relaxed_comment
|
|
75
75
|
return '' unless relaxed
|
|
76
76
|
|
|
77
|
-
"# Validation of this property is relaxed. See the
|
|
77
|
+
"# Validation of this property is relaxed. See the schema.json for full validation.\n"
|
|
78
78
|
end
|
|
79
79
|
|
|
80
80
|
# dry-types-based types contain the word `Types` (e.g., `Types::String`), and custom types (e.g., `SourceId`) do not
|
data/lib/cocina/generator.rb
CHANGED
|
@@ -3,8 +3,7 @@
|
|
|
3
3
|
module Cocina
|
|
4
4
|
# Wrapper for JSON Schema support using json_schemer
|
|
5
5
|
class JsonSchemaWrapper
|
|
6
|
-
class
|
|
7
|
-
class MissingReferenceError < OpenApiError; end
|
|
6
|
+
class MissingReferenceError < StandardError; end
|
|
8
7
|
|
|
9
8
|
def initialize(spec_hash, strict_reference_validation: true)
|
|
10
9
|
@spec = spec_hash
|