glossarist 2.6.4 → 2.6.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/release.yml +1 -4
- data/.rubocop_todo.yml +25 -74
- data/CLAUDE.md +27 -2
- data/Gemfile +0 -2
- data/README.adoc +650 -29
- data/config.yml +68 -1
- data/glossarist.gemspec +1 -1
- data/lib/glossarist/asset_reference.rb +16 -0
- data/lib/glossarist/bibliographic_reference.rb +16 -0
- data/lib/glossarist/concept.rb +1 -1
- data/lib/glossarist/concept_data.rb +4 -0
- data/lib/glossarist/concept_enricher.rb +1 -0
- data/lib/glossarist/concept_reference.rb +14 -17
- data/lib/glossarist/concept_validator.rb +27 -56
- data/lib/glossarist/dataset_validator.rb +30 -34
- data/lib/glossarist/designation/abbreviation.rb +0 -2
- data/lib/glossarist/designation/base.rb +21 -1
- data/lib/glossarist/designation/expression.rb +3 -0
- data/lib/glossarist/designation/letter_symbol.rb +0 -4
- data/lib/glossarist/designation/symbol.rb +0 -2
- data/lib/glossarist/gcr_validator.rb +26 -101
- data/lib/glossarist/glossary_definition.rb +5 -0
- data/lib/glossarist/managed_concept_data.rb +21 -2
- data/lib/glossarist/non_verb_rep.rb +21 -6
- data/lib/glossarist/pronunciation.rb +32 -0
- data/lib/glossarist/reference_extractor.rb +78 -10
- data/lib/glossarist/reference_resolver.rb +1 -0
- data/lib/glossarist/urn_resolver.rb +13 -1
- data/lib/glossarist/v1/concept.rb +7 -0
- data/lib/glossarist/validation/asset_index.rb +114 -0
- data/lib/glossarist/validation/bibliography_index.rb +121 -0
- data/lib/glossarist/validation/rules/asciidoc_xref_rule.rb +60 -0
- data/lib/glossarist/validation/rules/authoritative_source_rule.rb +47 -0
- data/lib/glossarist/validation/rules/base.rb +46 -0
- data/lib/glossarist/validation/rules/bibliography_yaml_rule.rb +37 -0
- data/lib/glossarist/validation/rules/citation_completeness_rule.rb +63 -0
- data/lib/glossarist/validation/rules/concept_context.rb +45 -0
- data/lib/glossarist/validation/rules/concept_count_rule.rb +34 -0
- data/lib/glossarist/validation/rules/concept_id_rule.rb +29 -0
- data/lib/glossarist/validation/rules/concept_id_uniqueness_rule.rb +42 -0
- data/lib/glossarist/validation/rules/concept_mention_rule.rb +44 -0
- data/lib/glossarist/validation/rules/concept_status_rule.rb +36 -0
- data/lib/glossarist/validation/rules/concept_uri_rule.rb +30 -0
- data/lib/glossarist/validation/rules/dataset_context.rb +99 -0
- data/lib/glossarist/validation/rules/date_type_rule.rb +54 -0
- data/lib/glossarist/validation/rules/date_validity_rule.rb +66 -0
- data/lib/glossarist/validation/rules/definition_content_rule.rb +41 -0
- data/lib/glossarist/validation/rules/designation_status_rule.rb +45 -0
- data/lib/glossarist/validation/rules/designation_type_rule.rb +55 -0
- data/lib/glossarist/validation/rules/duplicate_term_rule.rb +63 -0
- data/lib/glossarist/validation/rules/entry_status_rule.rb +39 -0
- data/lib/glossarist/validation/rules/filename_id_rule.rb +35 -0
- data/lib/glossarist/validation/rules/gcr_context.rb +92 -0
- data/lib/glossarist/validation/rules/image_reference_rule.rb +73 -0
- data/lib/glossarist/validation/rules/l10n_uuid_integrity_rule.rb +40 -0
- data/lib/glossarist/validation/rules/language_code_format_rule.rb +39 -0
- data/lib/glossarist/validation/rules/language_coverage_rule.rb +37 -0
- data/lib/glossarist/validation/rules/language_list_rule.rb +46 -0
- data/lib/glossarist/validation/rules/localization_presence_rule.rb +25 -0
- data/lib/glossarist/validation/rules/orphaned_bibliography_rule.rb +64 -0
- data/lib/glossarist/validation/rules/orphaned_images_rule.rb +68 -0
- data/lib/glossarist/validation/rules/orphaned_l10n_files_rule.rb +39 -0
- data/lib/glossarist/validation/rules/preferred_term_rule.rb +41 -0
- data/lib/glossarist/validation/rules/registry.rb +42 -0
- data/lib/glossarist/validation/rules/related_concept_cycle_rule.rb +102 -0
- data/lib/glossarist/validation/rules/related_concept_rule.rb +40 -0
- data/lib/glossarist/validation/rules/related_concept_symmetry_rule.rb +87 -0
- data/lib/glossarist/validation/rules/source_type_rule.rb +63 -0
- data/lib/glossarist/validation/rules/terms_presence_rule.rb +39 -0
- data/lib/glossarist/validation/rules.rb +85 -0
- data/lib/glossarist/validation/validation_issue.rb +39 -0
- data/lib/glossarist/validation.rb +12 -0
- data/lib/glossarist/validation_result.rb +26 -9
- data/lib/glossarist/version.rb +1 -1
- data/lib/glossarist.rb +4 -0
- metadata +62 -16
data/README.adoc
CHANGED
|
@@ -101,7 +101,7 @@ related:: Array of <<related-concept,RelatedConcept>>
|
|
|
101
101
|
status:: Enum for the normative status of the term.
|
|
102
102
|
dates:: Array of <<concept-date,ConceptDate>>
|
|
103
103
|
localized_concepts:: Hash of all localizations where keys are language codes and values are uuid of the localized concept.
|
|
104
|
-
|
|
104
|
+
domains:: Array of <<concept-reference,ConceptReference>> — upper concepts (subject areas, concept schemes, organizing concepts) that this concept belongs to across all languages. Each domain is a typed reference (e.g. `{ concept_id: "103", ref_type: "domain" }`).
|
|
105
105
|
localizations:: Hash of all localizations for this concept where keys are language codes and values are instances of <<localized-concept,LocalizedConcept>>.
|
|
106
106
|
|
|
107
107
|
There are two ways to initialize and populate a managed concept
|
|
@@ -118,9 +118,8 @@ concept = Glossarist::ManagedConcept.new({
|
|
|
118
118
|
"eng" => "<uuid>"
|
|
119
119
|
},
|
|
120
120
|
"localizations" => <Array of localized concepts or localized concept hashes>,
|
|
121
|
-
"
|
|
122
|
-
"
|
|
123
|
-
"bar",
|
|
121
|
+
"domains" => [
|
|
122
|
+
{ "concept_id" => "103", "ref_type" => "domain" },
|
|
124
123
|
],
|
|
125
124
|
},
|
|
126
125
|
})
|
|
@@ -132,7 +131,9 @@ concept = Glossarist::ManagedConcept.new({
|
|
|
132
131
|
----
|
|
133
132
|
concept = Glossarist::ManagedConcept.new
|
|
134
133
|
concept.id = "123"
|
|
135
|
-
concept.
|
|
134
|
+
concept.data.domains = [
|
|
135
|
+
Glossarist::ConceptReference.new(concept_id: "103", ref_type: "domain"),
|
|
136
|
+
]
|
|
136
137
|
concept.localizations = <Array of localized concepts or localized concept hashes>
|
|
137
138
|
----
|
|
138
139
|
|
|
@@ -146,55 +147,252 @@ Localized concept has the following fields
|
|
|
146
147
|
id:: An optional identifier for the term, to be used in cross-references.
|
|
147
148
|
uuid:: UUID for the concept
|
|
148
149
|
designations:: Array of <<designation,Designations>> under which the term being defined is known. This method will also accept an array of hashes for designation and will convert them to their respective classes.
|
|
149
|
-
domain::
|
|
150
|
+
domain:: URI reference to the subject area or section concept. Can be a relative URI (e.g. `section-103-01`), a URN (e.g. `urn:iec:std:iec:60050-103-01`), or a URL (e.g. `https://www.electropedia.org/iev/iev.nsf/display?openform&ievref=103-01`). This is the per-language upper concept reference — the subject area for this specific localization. Different languages may assign the same abstract concept to different domains.
|
|
151
|
+
related:: Array of <<related-concept,RelatedConcept>> — per-language concept relationships. Concept hierarchies can differ across languages (e.g. Russian distinguishes голубой/синий as coordinate basic colors, while English unifies them under "blue"). Language-specific broader/narrower/equivalent relationships go here.
|
|
150
152
|
subject:: Subject of the term.
|
|
151
153
|
definition:: Array of <<detailed-definition,Detailed Definition>> of the term.
|
|
152
154
|
non_verb_rep:: Array of <<non-verbal,non-verbal>> representations used to help define the term.
|
|
153
155
|
notes:: Zero or more notes about the term. A note is in <<detailed-definition,Detailed Definition>> format.
|
|
154
156
|
examples:: Zero or more examples of how the term is to be used in <<detailed-definition,Detailed Definition>> format.
|
|
155
157
|
language_code:: The language of the localization, as an ISO-639 3-letter code.
|
|
158
|
+
script:: The script of the localization, as an ISO 15924 4-letter code (e.g. `Hans` for Simplified Chinese, `Latn` for Latin, `Cyrl` for Cyrillic). Optional — when omitted, the default script for the language is assumed.
|
|
159
|
+
system:: The ISO 24229 conversion system code used to produce this localization (e.g. `Var:jpn-Hrkt:Latn:Hepburn-1886` for Hepburn-romanized Japanese). Optional — only set when the localization is a romanization or transliteration.
|
|
156
160
|
entry_status:: Entry status of the concept. Must be one of the following: +notValid+, +valid+, +superseded+, +retired+.
|
|
157
161
|
classification:: Classification of the concept. Must be one of the following: +preferred+, +admitted+, +deprecated+.
|
|
158
162
|
|
|
159
163
|
[[designation]]
|
|
160
|
-
=== Designation
|
|
164
|
+
=== Designation
|
|
165
|
+
|
|
166
|
+
A name under which a managed term is known. Designations follow an
|
|
167
|
+
inheritance hierarchy based on ISO 10241-1 and the Metanorma concept model.
|
|
168
|
+
|
|
169
|
+
==== Designation::Base (common to all types)
|
|
170
|
+
|
|
171
|
+
designation:: String — the term text or symbol.
|
|
172
|
+
normative_status:: Enum — one of `preferred`, `admitted`, `deprecated`, `superseded`.
|
|
173
|
+
geographical_area:: String — geographic usage region (ISO 3166-1 country code).
|
|
174
|
+
language:: String — language of this designation (ISO 639 code). Usually inherited from the LocalizedConcept's `language_code`, but can differ for borrowed terms.
|
|
175
|
+
script:: String — script of the designation text (ISO 15924 code, e.g. `Hani` for Kanji, `Latn` for Latin, `Cyrl` for Cyrillic).
|
|
176
|
+
system:: String — ISO 24229 conversion system code used to produce this designation (e.g. `Var:jpn-Hrkt:Latn:Hepburn-1886` for Hepburn romanization). Optional — only set when the designation is a romanization or transliteration.
|
|
177
|
+
international:: Boolean — whether the designation is used internationally.
|
|
178
|
+
absent:: Boolean — whether the designation is intentionally absent in this language.
|
|
179
|
+
pronunciation:: Collection of `Pronunciation` entries — phonetic or romanized representations of the designation.
|
|
180
|
+
sources:: Collection of `ConceptSource` entries — bibliographic sources for this designation (ISO 10241-1 §6.8).
|
|
181
|
+
term_type:: Enum (ISO 12620) — optional classification of the designation's term type. See <<iso12620-term-types>> below.
|
|
182
|
+
related:: Collection of `RelatedConcept` entries — term-level (designation-to-designation) relationships within the same concept entry. Used for linking abbreviated forms to full forms, short forms to expanded forms, etc. (TBX xref types).
|
|
183
|
+
+
|
|
184
|
+
Each `Pronunciation` entry has:
|
|
185
|
+
+
|
|
186
|
+
[cols="1,1,2"]
|
|
187
|
+
|===
|
|
188
|
+
|Attribute |Standard |Description
|
|
189
|
+
|
|
190
|
+
|`content` |— |The pronunciation text
|
|
191
|
+
|`language` |ISO 639 |Language/dialect being pronounced (3-letter code)
|
|
192
|
+
|`script` |ISO 15924 |Script of the pronunciation text (4-letter code)
|
|
193
|
+
|`country` |ISO 3166-1 |Country variant (2-letter code, optional)
|
|
194
|
+
|`system` |ISO 24229 |Conversion system code or identifier (e.g. `IPA`, `Var:jpn-Hrkt:Latn:Hepburn-1886`)
|
|
195
|
+
|===
|
|
196
|
+
+
|
|
197
|
+
Example:
|
|
198
|
+
+
|
|
199
|
+
[,yaml]
|
|
200
|
+
----
|
|
201
|
+
pronunciation:
|
|
202
|
+
- content: "toːkjoː"
|
|
203
|
+
language: jpn
|
|
204
|
+
script: Latn
|
|
205
|
+
system: IPA
|
|
206
|
+
- content: "Tōkyō"
|
|
207
|
+
language: jpn
|
|
208
|
+
script: Latn
|
|
209
|
+
system: "Var:jpn-Hrkt:Latn:Hepburn-1886"
|
|
210
|
+
----
|
|
211
|
+
|
|
212
|
+
==== Designation::Expression (text-based, inherits Base)
|
|
213
|
+
|
|
214
|
+
prefix:: String — text before the designation.
|
|
215
|
+
usage_info:: String — disambiguation text for the designation.
|
|
216
|
+
field_of_application:: String — IEC "specific use", appears in angle brackets after the designation (e.g. "in communication theory").
|
|
217
|
+
grammar_info:: Array of GrammarInfo — gender, number, part of speech.
|
|
218
|
+
|
|
219
|
+
==== Designation::Abbreviation (inherits Expression)
|
|
220
|
+
|
|
221
|
+
acronym:: Boolean — is this an acronym?
|
|
222
|
+
initialism:: Boolean — is this an initialism?
|
|
223
|
+
truncation:: Boolean — is this a truncation?
|
|
224
|
+
|
|
225
|
+
==== Designation::Symbol (inherits Base)
|
|
226
|
+
|
|
227
|
+
No additional attributes beyond Base.
|
|
228
|
+
|
|
229
|
+
==== Designation::LetterSymbol (inherits Symbol)
|
|
230
|
+
|
|
231
|
+
text:: String — the letter symbol text.
|
|
161
232
|
|
|
162
|
-
|
|
233
|
+
==== Designation::GraphicalSymbol (inherits Symbol)
|
|
163
234
|
|
|
164
|
-
|
|
165
|
-
|
|
235
|
+
text:: String — description of the symbol.
|
|
236
|
+
image:: String — the graphical symbol (emoji, path, or data URL).
|
|
237
|
+
|
|
238
|
+
==== Factory Method
|
|
239
|
+
|
|
240
|
+
`Designation::Base.from_h(options)` creates a new designation instance based on the specified type.
|
|
166
241
|
|
|
167
242
|
Parameters::
|
|
168
243
|
* options (Hash) - The options for creating the designation.
|
|
169
|
-
* "type" (String) - The type of designation (expression
|
|
244
|
+
* "type" (String) - The type of designation (`expression`, `symbol`, `abbreviation`, `graphical_symbol`, `letter_symbol`). Note: the "type" key must be a String, not a Symbol, so `{ type: "expression" }` will not work.
|
|
170
245
|
* Additional options depend on the specific designation type.
|
|
171
246
|
|
|
172
247
|
Returns::
|
|
173
|
-
Designation::{type}::: A new instance of specified type.
|
|
248
|
+
Designation::{type}::: A new instance of specified type.
|
|
174
249
|
|
|
175
250
|
Example
|
|
176
251
|
[,ruby]
|
|
177
252
|
----
|
|
178
|
-
#
|
|
253
|
+
# Expression with field of application
|
|
254
|
+
expr = Designation::Base.from_h({
|
|
255
|
+
"type" => "expression",
|
|
256
|
+
"designation" => "information",
|
|
257
|
+
"normative_status" => "preferred",
|
|
258
|
+
"field_of_application" => "in communication theory",
|
|
259
|
+
})
|
|
260
|
+
|
|
261
|
+
# International abbreviation
|
|
262
|
+
abbr = Designation::Base.from_h({
|
|
263
|
+
"type" => "abbreviation",
|
|
264
|
+
"designation" => "ISO",
|
|
265
|
+
"international" => true,
|
|
266
|
+
"acronym" => true,
|
|
267
|
+
})
|
|
268
|
+
----
|
|
269
|
+
|
|
270
|
+
[[iso12620-term-types]]
|
|
271
|
+
==== ISO 12620 Term Types
|
|
272
|
+
|
|
273
|
+
The `term_type` attribute on `Designation::Base` classifies designations
|
|
274
|
+
according to ISO 12620 (also used as TBX `termType`). This is orthogonal to
|
|
275
|
+
the structural designation `type` (expression/abbreviation/symbol): the
|
|
276
|
+
structural type determines how the designation is serialized, while
|
|
277
|
+
`term_type` provides ISO 12620 semantic classification.
|
|
278
|
+
|
|
279
|
+
[cols="1,2"]
|
|
280
|
+
|===
|
|
281
|
+
|Term type |Description
|
|
282
|
+
|
|
283
|
+
|`abbreviation`
|
|
284
|
+
|A shortened form of a word or phrase (general category)
|
|
285
|
+
|
|
286
|
+
|`acronym`
|
|
287
|
+
|An abbreviation pronounced as a word (e.g. NATO, laser)
|
|
288
|
+
|
|
289
|
+
|`clipped_term`
|
|
290
|
+
|A term formed by clipping part of a longer term (e.g. "phone" from "telephone")
|
|
291
|
+
|
|
292
|
+
|`common_name`
|
|
293
|
+
|A name in common use for a concept (e.g. "water" vs H₂O)
|
|
294
|
+
|
|
295
|
+
|`entry_term`
|
|
296
|
+
|The headword or main term in a terminological entry
|
|
297
|
+
|
|
298
|
+
|`equation`
|
|
299
|
+
|A mathematical equation used as a designation
|
|
300
|
+
|
|
301
|
+
|`formula`
|
|
302
|
+
|A chemical or mathematical formula (e.g. H₂O, E=mc²)
|
|
179
303
|
|
|
180
|
-
|
|
181
|
-
|
|
304
|
+
|`full_form`
|
|
305
|
+
|The complete, unabbreviated form of a designation (e.g. "World Wide Web")
|
|
182
306
|
|
|
183
|
-
|
|
184
|
-
|
|
307
|
+
|`initialism`
|
|
308
|
+
|An abbreviation pronounced letter by letter (e.g. "URL", "FBI")
|
|
185
309
|
|
|
310
|
+
|`internationalism`
|
|
311
|
+
|A term used with the same meaning across many languages (e.g. "computer", "algorithm")
|
|
312
|
+
|
|
313
|
+
|`international_scientific_term`
|
|
314
|
+
|A term established by international scientific agreement (e.g. "hydrogen")
|
|
315
|
+
|
|
316
|
+
|`logical_expression`
|
|
317
|
+
|A logical or Boolean expression used as a designation
|
|
318
|
+
|
|
319
|
+
|`part_number`
|
|
320
|
+
|A part number or catalog identifier used as a designation
|
|
321
|
+
|
|
322
|
+
|`phraseological_unit`
|
|
323
|
+
|A multi-word expression or phrase functioning as a term (e.g. "software engineering")
|
|
324
|
+
|
|
325
|
+
|`transcribed_form`
|
|
326
|
+
|A designation produced by phonetic transcription from another script
|
|
327
|
+
|
|
328
|
+
|`transliterated_form`
|
|
329
|
+
|A designation produced by transliteration from another script (e.g. "Moskva" from "Москва")
|
|
330
|
+
|
|
331
|
+
|`short_form`
|
|
332
|
+
|A shortened form of a designation that is not an abbreviation (e.g. "US" for "United States")
|
|
333
|
+
|
|
334
|
+
|`shortcut`
|
|
335
|
+
|A keyboard shortcut or command sequence (e.g. "Ctrl+V" for paste)
|
|
336
|
+
|
|
337
|
+
|`sku`
|
|
338
|
+
|A stock keeping unit identifier
|
|
339
|
+
|
|
340
|
+
|`standard_text`
|
|
341
|
+
|A standardized text passage used as a designation
|
|
342
|
+
|
|
343
|
+
|`symbol`
|
|
344
|
+
|A non-verbal symbol representing the concept (e.g. Ω for ohm)
|
|
345
|
+
|
|
346
|
+
|`synonym`
|
|
347
|
+
|A term with the same meaning in the same language, used as an alternative designation
|
|
348
|
+
|
|
349
|
+
|`synonymous_phrase`
|
|
350
|
+
|A phrase that is synonymous with the preferred designation
|
|
351
|
+
|
|
352
|
+
|`variant`
|
|
353
|
+
|A spelling, regional, or stylistic variant of another designation
|
|
354
|
+
|===
|
|
355
|
+
|
|
356
|
+
==== Designation-Level Relationships (TBX xref)
|
|
357
|
+
|
|
358
|
+
Designations can have intra-entry relationships — links between
|
|
359
|
+
designations of the *same* concept. These correspond to TBX `xref`
|
|
360
|
+
elements on term information groups (`<tig>`).
|
|
361
|
+
|
|
362
|
+
[cols="1,2"]
|
|
363
|
+
|===
|
|
364
|
+
|Relationship type |Description
|
|
365
|
+
|
|
366
|
+
|`abbreviated_form_for`
|
|
367
|
+
|This designation is an abbreviated form of the target (e.g. "WWW" → "World Wide Web")
|
|
368
|
+
|
|
369
|
+
|`short_form_for`
|
|
370
|
+
|This designation is a short form of the target (e.g. "US" → "United States of America")
|
|
371
|
+
|===
|
|
372
|
+
|
|
373
|
+
Example:
|
|
374
|
+
[,yaml]
|
|
375
|
+
----
|
|
376
|
+
terms:
|
|
377
|
+
- designation: WWW
|
|
378
|
+
type: abbreviation
|
|
379
|
+
term_type: initialism
|
|
380
|
+
related:
|
|
381
|
+
- type: abbreviated_form_for
|
|
382
|
+
content: "World Wide Web"
|
|
383
|
+
- designation: World Wide Web
|
|
384
|
+
type: expression
|
|
385
|
+
term_type: full_form
|
|
186
386
|
----
|
|
187
387
|
|
|
188
388
|
[[related-concept]]
|
|
189
389
|
=== RelatedConcept
|
|
190
390
|
|
|
191
|
-
A
|
|
192
|
-
|
|
193
|
-
Following fields are available for the Related Concept
|
|
391
|
+
A concept related to the current concept with a typed relationship.
|
|
194
392
|
|
|
195
|
-
type::
|
|
196
|
-
content::
|
|
197
|
-
ref:: A <<citation,
|
|
393
|
+
type:: Enum — the relationship type (see <<relationship-types,Relationship Types>> below).
|
|
394
|
+
content:: String — free-text content describing the related concept.
|
|
395
|
+
ref:: A <<citation,Citation>> reference to the related concept.
|
|
198
396
|
|
|
199
397
|
There are two ways to initialize and populate a related concept
|
|
200
398
|
|
|
@@ -219,6 +417,208 @@ related_concept.content = "designation of the related concept"
|
|
|
219
417
|
related_concept.ref = <Citation object>
|
|
220
418
|
----
|
|
221
419
|
|
|
420
|
+
[[relationship-types]]
|
|
421
|
+
==== Relationship Types
|
|
422
|
+
|
|
423
|
+
Relationship types are drawn from ISO 10241-1, ISO 25964/SKOS, and ISO 12620/TBX. The table below shows each type with its provenance and cross-standard equivalents.
|
|
424
|
+
|
|
425
|
+
[cols="1,1,1,1,1"]
|
|
426
|
+
|===
|
|
427
|
+
|Glossarist type |Category |ISO 10241-1 |ISO 25964 / SKOS |ISO 12620 / TBX
|
|
428
|
+
|
|
429
|
+
|`deprecates`
|
|
430
|
+
|Lifecycle
|
|
431
|
+
|deprecates
|
|
432
|
+
|—
|
|
433
|
+
|—
|
|
434
|
+
|
|
435
|
+
|`supersedes`
|
|
436
|
+
|Lifecycle
|
|
437
|
+
|supersedes
|
|
438
|
+
|—
|
|
439
|
+
|—
|
|
440
|
+
|
|
441
|
+
|`superseded_by`
|
|
442
|
+
|Lifecycle
|
|
443
|
+
|superseded by
|
|
444
|
+
|—
|
|
445
|
+
|—
|
|
446
|
+
|
|
447
|
+
|`broader`
|
|
448
|
+
|Hierarchical
|
|
449
|
+
|broader concept
|
|
450
|
+
|BT (broaderTerm)
|
|
451
|
+
|broaderTerm
|
|
452
|
+
|
|
453
|
+
|`narrower`
|
|
454
|
+
|Hierarchical
|
|
455
|
+
|narrower concept
|
|
456
|
+
|NT (narrowerTerm)
|
|
457
|
+
|narrowerTerm
|
|
458
|
+
|
|
459
|
+
|`broader_generic`
|
|
460
|
+
|Hierarchical (generic)
|
|
461
|
+
|—
|
|
462
|
+
|BTG (broaderGeneric, is-a)
|
|
463
|
+
|broaderTermGeneric
|
|
464
|
+
|
|
465
|
+
|`narrower_generic`
|
|
466
|
+
|Hierarchical (generic)
|
|
467
|
+
|—
|
|
468
|
+
|NTG (narrowerGeneric)
|
|
469
|
+
|narrowerTermGeneric
|
|
470
|
+
|
|
471
|
+
|`broader_partitive`
|
|
472
|
+
|Hierarchical (partitive)
|
|
473
|
+
|—
|
|
474
|
+
|BTP (broaderPartitive, part-whole)
|
|
475
|
+
|broaderTermPartitive
|
|
476
|
+
|
|
477
|
+
|`narrower_partitive`
|
|
478
|
+
|Hierarchical (partitive)
|
|
479
|
+
|—
|
|
480
|
+
|NTP (narrowerPartitive)
|
|
481
|
+
|narrowerTermPartitive
|
|
482
|
+
|
|
483
|
+
|`broader_instantial`
|
|
484
|
+
|Hierarchical (instantial)
|
|
485
|
+
|—
|
|
486
|
+
|BTI (broaderInstantial, instance-of)
|
|
487
|
+
|broaderTermInstantial
|
|
488
|
+
|
|
489
|
+
|`narrower_instantial`
|
|
490
|
+
|Hierarchical (instantial)
|
|
491
|
+
|—
|
|
492
|
+
|NTI (narrowerInstantial)
|
|
493
|
+
|narrowerTermInstantial
|
|
494
|
+
|
|
495
|
+
|`equivalent`
|
|
496
|
+
|Equivalence
|
|
497
|
+
|equivalent
|
|
498
|
+
|exactMatch
|
|
499
|
+
|—
|
|
500
|
+
|
|
501
|
+
|`close_match`
|
|
502
|
+
|Approx. equiv.
|
|
503
|
+
|—
|
|
504
|
+
|closeMatch
|
|
505
|
+
|—
|
|
506
|
+
|
|
507
|
+
|`broad_match`
|
|
508
|
+
|Cross-vocab mapping
|
|
509
|
+
|—
|
|
510
|
+
|broadMatch
|
|
511
|
+
|—
|
|
512
|
+
|
|
513
|
+
|`narrow_match`
|
|
514
|
+
|Cross-vocab mapping
|
|
515
|
+
|—
|
|
516
|
+
|narrowMatch
|
|
517
|
+
|—
|
|
518
|
+
|
|
519
|
+
|`related_match`
|
|
520
|
+
|Cross-vocab mapping
|
|
521
|
+
|—
|
|
522
|
+
|relatedMatch
|
|
523
|
+
|—
|
|
524
|
+
|
|
525
|
+
|`compare`
|
|
526
|
+
|Comparative
|
|
527
|
+
|compare
|
|
528
|
+
|—
|
|
529
|
+
|—
|
|
530
|
+
|
|
531
|
+
|`contrast`
|
|
532
|
+
|Comparative
|
|
533
|
+
|contrast
|
|
534
|
+
|—
|
|
535
|
+
|—
|
|
536
|
+
|
|
537
|
+
|`see`
|
|
538
|
+
|Associative
|
|
539
|
+
|see also
|
|
540
|
+
|RT (relatedTerm)
|
|
541
|
+
|crossReference
|
|
542
|
+
|
|
543
|
+
|`related_concept`
|
|
544
|
+
|Associative
|
|
545
|
+
|—
|
|
546
|
+
|—
|
|
547
|
+
|relatedConcept
|
|
548
|
+
|
|
549
|
+
|`related_concept_broader`
|
|
550
|
+
|Associative (broader)
|
|
551
|
+
|—
|
|
552
|
+
|—
|
|
553
|
+
|relatedConceptBroader
|
|
554
|
+
|
|
555
|
+
|`related_concept_narrower`
|
|
556
|
+
|Associative (narrower)
|
|
557
|
+
|—
|
|
558
|
+
|—
|
|
559
|
+
|relatedConceptNarrower
|
|
560
|
+
|
|
561
|
+
|`sequentially_related_concept`
|
|
562
|
+
|Associative (sequential)
|
|
563
|
+
|—
|
|
564
|
+
|—
|
|
565
|
+
|sequentiallyRelatedConcept
|
|
566
|
+
|
|
567
|
+
|`spatially_related_concept`
|
|
568
|
+
|Associative (spatial)
|
|
569
|
+
|—
|
|
570
|
+
|—
|
|
571
|
+
|spatiallyRelatedConcept
|
|
572
|
+
|
|
573
|
+
|`temporally_related_concept`
|
|
574
|
+
|Associative (temporal)
|
|
575
|
+
|—
|
|
576
|
+
|—
|
|
577
|
+
|temporallyRelatedConcept
|
|
578
|
+
|
|
579
|
+
|`homograph`
|
|
580
|
+
|Lexical
|
|
581
|
+
|—
|
|
582
|
+
|—
|
|
583
|
+
|homograph
|
|
584
|
+
|
|
585
|
+
|`false_friend`
|
|
586
|
+
|Lexical
|
|
587
|
+
|—
|
|
588
|
+
|—
|
|
589
|
+
|falseFriend
|
|
590
|
+
|===
|
|
591
|
+
|
|
592
|
+
[[concept-reference]]
|
|
593
|
+
=== ConceptReference
|
|
594
|
+
|
|
595
|
+
A typed reference to another concept, either local (within the same glossary) or external (in another concept registry).
|
|
596
|
+
|
|
597
|
+
term:: String — the display text for the referenced concept.
|
|
598
|
+
concept_id:: String — the identifier of the target concept.
|
|
599
|
+
source:: String — the registry URI prefix for external references (e.g. `urn:iec:std:iec:60050`).
|
|
600
|
+
ref_type:: String — the reference type, e.g. `local`, `designation`, `urn`, or `domain` (the value used for entries in a managed concept's `domains`).
|
|
601
|
+
urn:: String — a direct URN for the target concept (e.g. `urn:iec:std:iec:60050-102-01-01`).
|
|
602
|
+
|
|
603
|
+
Local references use `concept_id` without `source`. External references use `source` + `concept_id` or a direct `urn`.
|
|
604
|
+
|
|
605
|
+
[,ruby]
|
|
606
|
+
----
|
|
607
|
+
# Local reference
|
|
608
|
+
ref = Glossarist::ConceptReference.new(term: "latitude", concept_id: "200", ref_type: "local")
|
|
609
|
+
|
|
610
|
+
# External reference via URN
|
|
611
|
+
ref = Glossarist::ConceptReference.new(
|
|
612
|
+
term: "equality",
|
|
613
|
+
concept_id: "102-01-01",
|
|
614
|
+
source: "urn:iec:std:iec:60050",
|
|
615
|
+
ref_type: "urn",
|
|
616
|
+
)
|
|
617
|
+
|
|
618
|
+
ref.local? # => false
|
|
619
|
+
ref.external? # => true
|
|
620
|
+
----
|
|
621
|
+
|
|
222
622
|
[[concept-date]]
|
|
223
623
|
=== Concept Date
|
|
224
624
|
|
|
@@ -324,12 +724,27 @@ citation.clause = "some clause"
|
|
|
324
724
|
|
|
325
725
|
=== NonVerbRep
|
|
326
726
|
|
|
327
|
-
Non-verbal
|
|
727
|
+
Non-verbal representations are associated resources (images, tables, formulas) used to help define a concept (ISO 10241-1 §6.5). They live outside the concept model and are referenced by URI. Resources can be shared across concepts and belong either to the dataset package (relative path) or are externally referenced (URL/URN).
|
|
728
|
+
|
|
729
|
+
type:: String — the type of representation: `image`, `table`, or `formula`.
|
|
730
|
+
ref:: String — URI reference to the resource (relative path within the GCR package, URN, or URL).
|
|
731
|
+
text:: String — optional text description or alt text.
|
|
732
|
+
sources:: Collection of <<concept-source,ConceptSource>> entries — bibliographic sources for the representation.
|
|
328
733
|
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
734
|
+
Example:
|
|
735
|
+
+
|
|
736
|
+
[,yaml]
|
|
737
|
+
----
|
|
738
|
+
non_verb_rep:
|
|
739
|
+
- type: image
|
|
740
|
+
ref: assets/images/figure-1.svg
|
|
741
|
+
text: Diagram showing the concept hierarchy
|
|
742
|
+
- type: formula
|
|
743
|
+
ref: urn:gcr:assets:formula-eq1
|
|
744
|
+
sources:
|
|
745
|
+
- type: authoritative
|
|
746
|
+
status: identical
|
|
747
|
+
----
|
|
333
748
|
|
|
334
749
|
[[concept-source]]
|
|
335
750
|
=== ConceptSource
|
|
@@ -626,12 +1041,14 @@ skipped_count:: `Integer` — concepts skipped due to duplicates (strategy: skip
|
|
|
626
1041
|
|
|
627
1042
|
=== validate
|
|
628
1043
|
|
|
629
|
-
Validate a dataset directory or `.gcr` file for schema compliance
|
|
1044
|
+
Validate a dataset directory or `.gcr` file for schema compliance, structural
|
|
1045
|
+
integrity, cross-reference resolution, and data quality.
|
|
630
1046
|
|
|
631
1047
|
[,bash]
|
|
632
1048
|
----
|
|
633
1049
|
glossarist validate PATH
|
|
634
1050
|
glossarist validate PATH --reference-path path/to/gcrs/
|
|
1051
|
+
glossarist validate PATH --strict
|
|
635
1052
|
----
|
|
636
1053
|
|
|
637
1054
|
Options:
|
|
@@ -657,6 +1074,210 @@ result.errors # => [...]
|
|
|
657
1074
|
result.warnings # => [...]
|
|
658
1075
|
----
|
|
659
1076
|
|
|
1077
|
+
== Validation System
|
|
1078
|
+
|
|
1079
|
+
Glossarist provides a rule-based validation framework that checks dataset
|
|
1080
|
+
directories and GCR packages for structural, schema, reference, integrity,
|
|
1081
|
+
quality, and localization issues.
|
|
1082
|
+
|
|
1083
|
+
=== Architecture
|
|
1084
|
+
|
|
1085
|
+
The validation system uses the **rule-registry pattern** (Open/Closed
|
|
1086
|
+
Principle). Each check is a self-describing rule class that subclasses
|
|
1087
|
+
`Glossarist::Validation::Rules::Base`. New rules are added by subclassing and
|
|
1088
|
+
registering — no existing code is modified.
|
|
1089
|
+
|
|
1090
|
+
```
|
|
1091
|
+
Glossarist::Validation
|
|
1092
|
+
├── Rules
|
|
1093
|
+
│ ├── Base # Abstract rule: code, category, severity, scope, check
|
|
1094
|
+
│ ├── Registry # Global registry: register, all, for_category, for_scope
|
|
1095
|
+
│ ├── DatasetContext # Lazy-loaded access to a directory dataset
|
|
1096
|
+
│ ├── GcrContext # Lazy-loaded access to a .gcr package
|
|
1097
|
+
│ └── (26 rule classes) # One file per rule
|
|
1098
|
+
├── ValidationIssue # Single finding: severity, code, message, location, suggestion
|
|
1099
|
+
├── BibliographyIndex # Index of bibliography anchors from sources + bibliography.yaml
|
|
1100
|
+
├── AssetIndex # Index of asset paths from images/ directory or GCR ZIP
|
|
1101
|
+
├── ConceptValidator # Orchestrator: runs per-concept rules
|
|
1102
|
+
├── GcrValidator # Orchestrator: runs GCR-level rules
|
|
1103
|
+
└── DatasetValidator # Orchestrator: runs directory-level + collection rules
|
|
1104
|
+
```
|
|
1105
|
+
|
|
1106
|
+
=== Rule Categories
|
|
1107
|
+
|
|
1108
|
+
Rules are classified into six MECE (Mutually Exclusive, Collectively
|
|
1109
|
+
Exhaustive) categories:
|
|
1110
|
+
|
|
1111
|
+
[cols="1,2"]
|
|
1112
|
+
|===
|
|
1113
|
+
|Category |What it checks
|
|
1114
|
+
|
|
1115
|
+
|`structure` |File/directory layout, ZIP contents, required parts
|
|
1116
|
+
|`schema` |Field types, enum values, required fields, YAML syntax
|
|
1117
|
+
|`references` |Cross-references between concepts, bibliography, assets
|
|
1118
|
+
|`integrity` |Metadata vs. reality, filename vs. ID, UUID cross-references
|
|
1119
|
+
|`quality` |Empty content, missing preferred terms, duplicate terms
|
|
1120
|
+
|`localization` |Language coverage, orphaned/missing localization files
|
|
1121
|
+
|===
|
|
1122
|
+
|
|
1123
|
+
=== Built-in Rules
|
|
1124
|
+
|
|
1125
|
+
The following rules are registered by default. Each rule has a unique code
|
|
1126
|
+
(e.g. `GLS-001`), a severity (`error` or `warning`), and a scope (`:concept`
|
|
1127
|
+
for per-concept checks or `:collection` for dataset-wide checks).
|
|
1128
|
+
|
|
1129
|
+
==== Structure Rules
|
|
1130
|
+
|
|
1131
|
+
[cols="1,2,1,1"]
|
|
1132
|
+
|===
|
|
1133
|
+
|Code |Rule |Severity |Scope
|
|
1134
|
+
|
|
1135
|
+
|GLS-001 |Concept ID is present |error |`:concept`
|
|
1136
|
+
|GLS-002 |At least one localization per concept |error |`:concept`
|
|
1137
|
+
|GLS-005 |Each localization has at least 1 term |error |`:concept`
|
|
1138
|
+
|GLS-020-YAML |bibliography.yaml is valid YAML |error |`:collection`
|
|
1139
|
+
|===
|
|
1140
|
+
|
|
1141
|
+
==== Schema Rules
|
|
1142
|
+
|
|
1143
|
+
[cols="1,2,1,1"]
|
|
1144
|
+
|===
|
|
1145
|
+
|Code |Rule |Severity |Scope
|
|
1146
|
+
|
|
1147
|
+
|GLS-003 |Entry status is a valid enum value |error |`:concept`
|
|
1148
|
+
|GLS-201 |Concept status is a valid enum value |error |`:concept`
|
|
1149
|
+
|GLS-202/203 |Source type and status are valid enums |error |`:concept`
|
|
1150
|
+
|GLS-200 |Related concept type is valid |error |`:concept`
|
|
1151
|
+
|GLS-204 |Designation normative_status is valid |error |`:concept`
|
|
1152
|
+
|GLS-205 |Date type is a valid enum |warning |`:concept`
|
|
1153
|
+
|GLS-206 |Language code is exactly 3 lowercase letters |error |`:concept`
|
|
1154
|
+
|GLS-207 |Designation type maps to a known subclass |error |`:concept`
|
|
1155
|
+
|===
|
|
1156
|
+
|
|
1157
|
+
==== Reference Rules
|
|
1158
|
+
|
|
1159
|
+
[cols="1,2,1,1"]
|
|
1160
|
+
|===
|
|
1161
|
+
|Code |Rule |Severity |Scope
|
|
1162
|
+
|
|
1163
|
+
|GLS-100 |`{{...}}` concept mentions resolve locally |warning |`:concept`
|
|
1164
|
+
|GLS-102 |`+<<anchor>>+` AsciiDoc xrefs resolve in bibliography index |warning |`:concept`
|
|
1165
|
+
|GLS-103-105 |Image references resolve in asset index |warning |`:concept`
|
|
1166
|
+
|GLS-110 |Related concept references resolve |warning |`:concept`
|
|
1167
|
+
|GLS-020 |Orphaned bibliography entries |warning |`:collection`
|
|
1168
|
+
|GLS-021 |Orphaned images |warning |`:collection`
|
|
1169
|
+
|GLS-112 |Supersedes/superseded_by symmetry check |warning |`:collection`
|
|
1170
|
+
|GLS-113 |No circular related-concept chains |error |`:collection`
|
|
1171
|
+
|===
|
|
1172
|
+
|
|
1173
|
+
==== Integrity Rules
|
|
1174
|
+
|
|
1175
|
+
[cols="1,2,1,1"]
|
|
1176
|
+
|===
|
|
1177
|
+
|Code |Rule |Severity |Scope
|
|
1178
|
+
|
|
1179
|
+
|GLS-001-U |Concept IDs are unique |error |`:collection`
|
|
1180
|
+
|GLS-011 |Concept count matches metadata |error |`:collection`
|
|
1181
|
+
|GLS-012 |Language list matches actual languages |warning |`:collection`
|
|
1182
|
+
|GLS-013 |Language coverage per concept |warning |`:concept`
|
|
1183
|
+
|GLS-015 |Filename matches concept ID (GCR) |error |`:concept`
|
|
1184
|
+
|GLS-016 |Concept URI is set or template is applicable |warning |`:collection`
|
|
1185
|
+
|GLS-018 |Localized concept UUID cross-references resolve |error |`:concept`
|
|
1186
|
+
|GLS-019 |Orphaned localization files |warning |`:collection`
|
|
1187
|
+
|===
|
|
1188
|
+
|
|
1189
|
+
==== Quality Rules
|
|
1190
|
+
|
|
1191
|
+
[cols="1,2,1,1"]
|
|
1192
|
+
|===
|
|
1193
|
+
|Code |Rule |Severity |Scope
|
|
1194
|
+
|
|
1195
|
+
|GLS-300 |Definition content is non-empty |warning |`:concept`
|
|
1196
|
+
|GLS-301 |At least one preferred designation per localization |warning |`:concept`
|
|
1197
|
+
|GLS-302 |No duplicate preferred terms within a language |warning |`:collection`
|
|
1198
|
+
|GLS-304 |Source citation is not empty |warning |`:concept`
|
|
1199
|
+
|GLS-306 |At least one authoritative source |warning |`:concept`
|
|
1200
|
+
|GLS-307 |Date values are parseable |warning |`:concept`
|
|
1201
|
+
|===
|
|
1202
|
+
|
|
1203
|
+
=== Cross-Reference Validation
|
|
1204
|
+
|
|
1205
|
+
The validation system checks that all references in concept content point to
|
|
1206
|
+
resources that actually exist:
|
|
1207
|
+
|
|
1208
|
+
* **Bibliographic cross-references** — AsciiDoc `+<<anchor>>+` xrefs are checked
|
|
1209
|
+
against a `BibliographyIndex` built from all `ConceptSource` entries and
|
|
1210
|
+
optional `bibliography.yaml`.
|
|
1211
|
+
* **Image/asset references** — `image::path[]` references and model-level asset
|
|
1212
|
+
paths (`NonVerbRep`, `GraphicalSymbol`) are checked against an `AssetIndex`
|
|
1213
|
+
built from the `images/` directory or GCR ZIP entries.
|
|
1214
|
+
* **Inter-concept references** — `{{...}}` concept mentions are checked against
|
|
1215
|
+
the concept collection for local references, and against registered GCR
|
|
1216
|
+
packages for inter-set URN references.
|
|
1217
|
+
|
|
1218
|
+
=== Validation Result
|
|
1219
|
+
|
|
1220
|
+
`ValidationResult` holds the aggregated findings from all rules:
|
|
1221
|
+
|
|
1222
|
+
[,ruby]
|
|
1223
|
+
----
|
|
1224
|
+
result = DatasetValidator.new.validate("path/to/dataset")
|
|
1225
|
+
result.valid? # => true if no errors
|
|
1226
|
+
result.errors # => Array of error strings
|
|
1227
|
+
result.warnings # => Array of warning strings
|
|
1228
|
+
result.issues # => Array of ValidationIssue objects (full detail)
|
|
1229
|
+
----
|
|
1230
|
+
|
|
1231
|
+
Each `ValidationIssue` carries structured metadata:
|
|
1232
|
+
|
|
1233
|
+
[,ruby]
|
|
1234
|
+
----
|
|
1235
|
+
issue = result.issues.first
|
|
1236
|
+
issue.severity # => "error" or "warning"
|
|
1237
|
+
issue.code # => "GLS-300"
|
|
1238
|
+
issue.message # => "definition 1 has empty content"
|
|
1239
|
+
issue.location # => "concepts/100.yaml/eng"
|
|
1240
|
+
issue.suggestion # => "Add definition text or remove the empty entry"
|
|
1241
|
+
issue.to_s # => "[WARNING] [GLS-300] concepts/100.yaml/eng: definition 1 has empty content"
|
|
1242
|
+
----
|
|
1243
|
+
|
|
1244
|
+
=== Adding Custom Rules
|
|
1245
|
+
|
|
1246
|
+
New validation rules are added by subclassing `Base` and registering with the
|
|
1247
|
+
global `Registry`. This extends validation without modifying existing code:
|
|
1248
|
+
|
|
1249
|
+
[,ruby]
|
|
1250
|
+
----
|
|
1251
|
+
class MyCustomRule < Glossarist::Validation::Rules::Base
|
|
1252
|
+
def code = "CUSTOM-001"
|
|
1253
|
+
def category = :quality
|
|
1254
|
+
def severity = "warning"
|
|
1255
|
+
def scope = :concept
|
|
1256
|
+
|
|
1257
|
+
def applicable?(context)
|
|
1258
|
+
context.concept&.localizations&.any?
|
|
1259
|
+
end
|
|
1260
|
+
|
|
1261
|
+
def check(context)
|
|
1262
|
+
issues = []
|
|
1263
|
+
context.concept.localizations.each do |l10n|
|
|
1264
|
+
# ... your check logic ...
|
|
1265
|
+
if some_condition
|
|
1266
|
+
issues << issue("something is wrong",
|
|
1267
|
+
location: context.file_name,
|
|
1268
|
+
suggestion: "how to fix it")
|
|
1269
|
+
end
|
|
1270
|
+
end
|
|
1271
|
+
issues
|
|
1272
|
+
end
|
|
1273
|
+
end
|
|
1274
|
+
|
|
1275
|
+
Glossarist::Validation::Rules::Registry.register(MyCustomRule)
|
|
1276
|
+
----
|
|
1277
|
+
|
|
1278
|
+
Custom rules are automatically picked up by `DatasetValidator`, `GcrValidator`,
|
|
1279
|
+
and `ConceptValidator` on the next validation run.
|
|
1280
|
+
|
|
660
1281
|
=== upgrade
|
|
661
1282
|
|
|
662
1283
|
Upgrade a dataset to the current schema version.
|