glossarist 2.6.4 → 2.6.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +4 -4
  2. data/.rubocop_todo.yml +11 -111
  3. data/Gemfile +0 -2
  4. data/README.adoc +207 -1
  5. data/glossarist.gemspec +1 -1
  6. data/lib/glossarist/asset_reference.rb +16 -0
  7. data/lib/glossarist/bibliographic_reference.rb +16 -0
  8. data/lib/glossarist/concept_enricher.rb +1 -0
  9. data/lib/glossarist/concept_reference.rb +4 -0
  10. data/lib/glossarist/concept_validator.rb +27 -56
  11. data/lib/glossarist/dataset_validator.rb +30 -34
  12. data/lib/glossarist/gcr_validator.rb +26 -101
  13. data/lib/glossarist/reference_extractor.rb +80 -10
  14. data/lib/glossarist/reference_resolver.rb +1 -0
  15. data/lib/glossarist/validation/asset_index.rb +113 -0
  16. data/lib/glossarist/validation/bibliography_index.rb +121 -0
  17. data/lib/glossarist/validation/rules/asciidoc_xref_rule.rb +60 -0
  18. data/lib/glossarist/validation/rules/authoritative_source_rule.rb +47 -0
  19. data/lib/glossarist/validation/rules/base.rb +46 -0
  20. data/lib/glossarist/validation/rules/bibliography_yaml_rule.rb +37 -0
  21. data/lib/glossarist/validation/rules/citation_completeness_rule.rb +63 -0
  22. data/lib/glossarist/validation/rules/concept_context.rb +45 -0
  23. data/lib/glossarist/validation/rules/concept_count_rule.rb +34 -0
  24. data/lib/glossarist/validation/rules/concept_id_rule.rb +29 -0
  25. data/lib/glossarist/validation/rules/concept_id_uniqueness_rule.rb +42 -0
  26. data/lib/glossarist/validation/rules/concept_mention_rule.rb +44 -0
  27. data/lib/glossarist/validation/rules/concept_status_rule.rb +36 -0
  28. data/lib/glossarist/validation/rules/concept_uri_rule.rb +30 -0
  29. data/lib/glossarist/validation/rules/dataset_context.rb +99 -0
  30. data/lib/glossarist/validation/rules/date_type_rule.rb +54 -0
  31. data/lib/glossarist/validation/rules/date_validity_rule.rb +66 -0
  32. data/lib/glossarist/validation/rules/definition_content_rule.rb +41 -0
  33. data/lib/glossarist/validation/rules/designation_status_rule.rb +45 -0
  34. data/lib/glossarist/validation/rules/designation_type_rule.rb +55 -0
  35. data/lib/glossarist/validation/rules/duplicate_term_rule.rb +63 -0
  36. data/lib/glossarist/validation/rules/entry_status_rule.rb +39 -0
  37. data/lib/glossarist/validation/rules/filename_id_rule.rb +35 -0
  38. data/lib/glossarist/validation/rules/gcr_context.rb +92 -0
  39. data/lib/glossarist/validation/rules/image_reference_rule.rb +73 -0
  40. data/lib/glossarist/validation/rules/l10n_uuid_integrity_rule.rb +40 -0
  41. data/lib/glossarist/validation/rules/language_code_format_rule.rb +39 -0
  42. data/lib/glossarist/validation/rules/language_coverage_rule.rb +37 -0
  43. data/lib/glossarist/validation/rules/language_list_rule.rb +46 -0
  44. data/lib/glossarist/validation/rules/localization_presence_rule.rb +25 -0
  45. data/lib/glossarist/validation/rules/orphaned_bibliography_rule.rb +64 -0
  46. data/lib/glossarist/validation/rules/orphaned_images_rule.rb +68 -0
  47. data/lib/glossarist/validation/rules/orphaned_l10n_files_rule.rb +39 -0
  48. data/lib/glossarist/validation/rules/preferred_term_rule.rb +41 -0
  49. data/lib/glossarist/validation/rules/registry.rb +42 -0
  50. data/lib/glossarist/validation/rules/related_concept_cycle_rule.rb +102 -0
  51. data/lib/glossarist/validation/rules/related_concept_rule.rb +40 -0
  52. data/lib/glossarist/validation/rules/related_concept_symmetry_rule.rb +87 -0
  53. data/lib/glossarist/validation/rules/source_type_rule.rb +63 -0
  54. data/lib/glossarist/validation/rules/terms_presence_rule.rb +39 -0
  55. data/lib/glossarist/validation/rules.rb +85 -0
  56. data/lib/glossarist/validation/validation_issue.rb +39 -0
  57. data/lib/glossarist/validation.rb +12 -0
  58. data/lib/glossarist/validation_result.rb +26 -9
  59. data/lib/glossarist/version.rb +1 -1
  60. data/lib/glossarist.rb +3 -0
  61. metadata +60 -15
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: facc3d87fcd7c04944cdb267bd8a0cf5b259bd53b95fe20259b1708b6120fb82
4
- data.tar.gz: 89676d88ae921c18ad4c932074f3dabe47674ede8f82c3fac2a7c5fa8330d780
3
+ metadata.gz: 0e3cd8f02f83acf4b2c43139fe2ec0afd8b635860e02e31e530827297648168b
4
+ data.tar.gz: 755e65d489c0d889ae3d9f4652703d55055aa0b0f0b669b588380070746fc63a
5
5
  SHA512:
6
- metadata.gz: 3f08e797184f6c4b006c33bb56e61f31de44ab57143697492dc60397eba400b0cde8ce7d6d92a616ef345271d477b77ce94d327b5421c4a335f2282b1d3aafc2
7
- data.tar.gz: cf150e9819f6a0242fd63617099a5ce34e321a559687e13f9f04c0edfb571972e960603eb35d3df112e68d959f317ee47db844aba0b86b68391414db84e4c421
6
+ metadata.gz: 5794f170646fa14bae1f7e9d3bf82499c4298f3976804d6cf1b0d33dea90cd26715c5c394cfd6335f8616c4a55d05d78b9cdeb0f653b7a86d629a7b3d0bfe799
7
+ data.tar.gz: 7681ef4afc518c7cf5bc4fe99a62dc3b41a44c66778257fc5f1e81004867ed86b98ef4c9e4026b8c5a11791d3d618849cf4bf809f11e07d4d87dfef56ffbec5c
data/.rubocop_todo.yml CHANGED
@@ -1,94 +1,23 @@
1
1
  # This configuration was generated by
2
2
  # `rubocop --auto-gen-config`
3
- # on 2026-05-12 04:13:45 UTC using RuboCop version 1.86.1.
3
+ # on 2026-05-12 10:41:50 UTC using RuboCop version 1.86.1.
4
4
  # The point is for the user to remove these configuration records
5
5
  # one by one as the offenses are removed from the code base.
6
6
  # Note that changes in the inspected code, or installation of new
7
7
  # versions of RuboCop, may require this file to be generated again.
8
8
 
9
- # Offense count: 7
10
- # This cop supports safe autocorrection (--autocorrect).
11
- # Configuration parameters: TreatCommentsAsGroupSeparators, ConsiderPunctuation.
12
- Bundler/OrderedGems:
13
- Exclude:
14
- - 'Gemfile'
15
-
16
9
  # Offense count: 1
17
10
  Gemspec/RequiredRubyVersion:
18
11
  Exclude:
19
12
  - 'glossarist.gemspec'
20
13
 
21
- # Offense count: 4
22
- # This cop supports safe autocorrection (--autocorrect).
23
- # Configuration parameters: EnforcedStyle, IndentationWidth.
24
- # SupportedStyles: with_first_argument, with_fixed_indentation
25
- Layout/ArgumentAlignment:
26
- Exclude:
27
- - 'lib/glossarist/sts/import_result.rb'
28
- - 'lib/glossarist/sts/importer.rb'
29
- - 'lib/glossarist/sts/term_mapper.rb'
30
-
31
- # Offense count: 1
32
- # This cop supports safe autocorrection (--autocorrect).
33
- # Configuration parameters: IndentationWidth.
34
- Layout/AssignmentIndentation:
35
- Exclude:
36
- - 'lib/glossarist/sts/term_mapper.rb'
37
-
38
- # Offense count: 6
39
- # This cop supports safe autocorrection (--autocorrect).
40
- # Configuration parameters: EnforcedStyleAlignWith.
41
- # SupportedStylesAlignWith: either, start_of_block, start_of_line
42
- Layout/BlockAlignment:
43
- Exclude:
44
- - 'lib/glossarist/sts/term_extractor.rb'
45
- - 'spec/unit/sts/term_extractor_spec.rb'
46
- - 'spec/unit/sts/term_mapper_spec.rb'
47
-
48
- # Offense count: 6
49
- # This cop supports safe autocorrection (--autocorrect).
50
- Layout/BlockEndNewline:
51
- Exclude:
52
- - 'lib/glossarist/sts/term_extractor.rb'
53
- - 'spec/unit/sts/term_extractor_spec.rb'
54
- - 'spec/unit/sts/term_mapper_spec.rb'
55
-
56
- # Offense count: 1
57
- # This cop supports safe autocorrection (--autocorrect).
58
- # Configuration parameters: AllowMultipleStyles, EnforcedHashRocketStyle, EnforcedColonStyle, EnforcedLastArgumentHashStyle.
59
- # SupportedHashRocketStyles: key, separator, table
60
- # SupportedColonStyles: key, separator, table
61
- # SupportedLastArgumentHashStyles: always_inspect, always_ignore, ignore_implicit, ignore_explicit
62
- Layout/HashAlignment:
63
- Exclude:
64
- - 'lib/glossarist/sts/importer.rb'
65
-
66
- # Offense count: 12
67
- # This cop supports safe autocorrection (--autocorrect).
68
- # Configuration parameters: Width, EnforcedStyleAlignWith, AllowedPatterns.
69
- # SupportedStylesAlignWith: start_of_line, relative_to_receiver
70
- Layout/IndentationWidth:
71
- Exclude:
72
- - 'lib/glossarist/sts/term_extractor.rb'
73
- - 'spec/unit/sts/term_extractor_spec.rb'
74
- - 'spec/unit/sts/term_mapper_spec.rb'
75
-
76
- # Offense count: 236
14
+ # Offense count: 254
77
15
  # This cop supports safe autocorrection (--autocorrect).
78
16
  # Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
79
17
  # URISchemes: http, https
80
18
  Layout/LineLength:
81
19
  Enabled: false
82
20
 
83
- # Offense count: 7
84
- # This cop supports safe autocorrection (--autocorrect).
85
- # Configuration parameters: AllowInHeredoc.
86
- Layout/TrailingWhitespace:
87
- Exclude:
88
- - 'lib/glossarist/sts/import_result.rb'
89
- - 'lib/glossarist/sts/importer.rb'
90
- - 'lib/glossarist/sts/term_mapper.rb'
91
-
92
21
  # Offense count: 1
93
22
  # Configuration parameters: AllowedMethods.
94
23
  # AllowedMethods: enums
@@ -104,26 +33,12 @@ Lint/UnusedMethodArgument:
104
33
  Exclude:
105
34
  - 'lib/glossarist/dataset_validator.rb'
106
35
 
107
- # Offense count: 24
36
+ # Offense count: 37
108
37
  # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
109
38
  Metrics/AbcSize:
110
- Exclude:
111
- - 'lib/glossarist/cli/export_command.rb'
112
- - 'lib/glossarist/cli/package_command.rb'
113
- - 'lib/glossarist/cli/validate_command.rb'
114
- - 'lib/glossarist/concept_manager.rb'
115
- - 'lib/glossarist/gcr_metadata.rb'
116
- - 'lib/glossarist/gcr_package.rb'
117
- - 'lib/glossarist/reference_extractor.rb'
118
- - 'lib/glossarist/reference_resolver.rb'
119
- - 'lib/glossarist/resolution_adapter/local.rb'
120
- - 'lib/glossarist/schema_migration.rb'
121
- - 'lib/glossarist/transforms/concept_to_skos_transform.rb'
122
- - 'lib/glossarist/transforms/concept_to_tbx_transform.rb'
123
- - 'lib/glossarist/utilities/uuid.rb'
124
- - 'spec/unit/concept_collector_spec.rb'
39
+ Enabled: false
125
40
 
126
- # Offense count: 19
41
+ # Offense count: 24
127
42
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
128
43
  Metrics/CyclomaticComplexity:
129
44
  Exclude:
@@ -132,6 +47,7 @@ Metrics/CyclomaticComplexity:
132
47
  - 'lib/glossarist/designation/expression.rb'
133
48
  - 'lib/glossarist/gcr_metadata.rb'
134
49
  - 'lib/glossarist/gcr_statistics.rb'
50
+ - 'lib/glossarist/gcr_validator.rb'
135
51
  - 'lib/glossarist/managed_concept.rb'
136
52
  - 'lib/glossarist/reference_extractor.rb'
137
53
  - 'lib/glossarist/reference_resolver.rb'
@@ -139,8 +55,9 @@ Metrics/CyclomaticComplexity:
139
55
  - 'lib/glossarist/schema_migration.rb'
140
56
  - 'lib/glossarist/transforms/concept_to_skos_transform.rb'
141
57
  - 'lib/glossarist/transforms/concept_to_tbx_transform.rb'
58
+ - 'lib/glossarist/validation/bibliography_index.rb'
142
59
 
143
- # Offense count: 47
60
+ # Offense count: 48
144
61
  # Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
145
62
  Metrics/MethodLength:
146
63
  Max: 42
@@ -150,19 +67,21 @@ Metrics/MethodLength:
150
67
  Metrics/ParameterLists:
151
68
  Max: 6
152
69
 
153
- # Offense count: 13
70
+ # Offense count: 18
154
71
  # Configuration parameters: AllowedMethods, AllowedPatterns, Max.
155
72
  Metrics/PerceivedComplexity:
156
73
  Exclude:
157
74
  - 'lib/glossarist/concept_validator.rb'
158
75
  - 'lib/glossarist/designation/expression.rb'
159
76
  - 'lib/glossarist/gcr_metadata.rb'
77
+ - 'lib/glossarist/gcr_validator.rb'
160
78
  - 'lib/glossarist/reference_extractor.rb'
161
79
  - 'lib/glossarist/reference_resolver.rb'
162
80
  - 'lib/glossarist/resolution_adapter/local.rb'
163
81
  - 'lib/glossarist/schema_migration.rb'
164
82
  - 'lib/glossarist/transforms/concept_to_skos_transform.rb'
165
83
  - 'lib/glossarist/transforms/concept_to_tbx_transform.rb'
84
+ - 'lib/glossarist/validation/bibliography_index.rb'
166
85
 
167
86
  # Offense count: 6
168
87
  # Configuration parameters: MinNameLength, AllowNamesEndingInNumbers, AllowedNames, ForbiddenNames.
@@ -179,19 +98,6 @@ Naming/VariableNumber:
179
98
  Exclude:
180
99
  - 'spec/unit/rdf/skos_vocabulary_spec.rb'
181
100
 
182
- # Offense count: 9
183
- # This cop supports safe autocorrection (--autocorrect).
184
- # Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
185
- # SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
186
- # ProceduralMethods: benchmark, bm, bmbm, create, each_with_object, measure, new, realtime, tap, with_object
187
- # FunctionalMethods: let, let!, subject, watch
188
- # AllowedMethods: lambda, proc, it
189
- Style/BlockDelimiters:
190
- Exclude:
191
- - 'lib/glossarist/sts/term_extractor.rb'
192
- - 'spec/unit/sts/term_extractor_spec.rb'
193
- - 'spec/unit/sts/term_mapper_spec.rb'
194
-
195
101
  # Offense count: 6
196
102
  # This cop supports safe autocorrection (--autocorrect).
197
103
  # Configuration parameters: MaxUnannotatedPlaceholdersAllowed, Mode, AllowedMethods, AllowedPatterns.
@@ -199,12 +105,6 @@ Style/BlockDelimiters:
199
105
  Style/FormatStringToken:
200
106
  EnforcedStyle: unannotated
201
107
 
202
- # Offense count: 2
203
- # This cop supports safe autocorrection (--autocorrect).
204
- Style/MultilineIfModifier:
205
- Exclude:
206
- - 'lib/glossarist/sts/importer.rb'
207
-
208
108
  # Offense count: 1
209
109
  # Configuration parameters: AllowedClasses.
210
110
  Style/OneClassPerFile:
data/Gemfile CHANGED
@@ -14,5 +14,3 @@ gem "rubocop"
14
14
  gem "rubocop-performance"
15
15
  gem "rubocop-rake"
16
16
  gem "rubocop-rspec"
17
-
18
-
data/README.adoc CHANGED
@@ -626,12 +626,14 @@ skipped_count:: `Integer` — concepts skipped due to duplicates (strategy: skip
626
626
 
627
627
  === validate
628
628
 
629
- Validate a dataset directory or `.gcr` file for schema compliance.
629
+ Validate a dataset directory or `.gcr` file for schema compliance, structural
630
+ integrity, cross-reference resolution, and data quality.
630
631
 
631
632
  [,bash]
632
633
  ----
633
634
  glossarist validate PATH
634
635
  glossarist validate PATH --reference-path path/to/gcrs/
636
+ glossarist validate PATH --strict
635
637
  ----
636
638
 
637
639
  Options:
@@ -657,6 +659,210 @@ result.errors # => [...]
657
659
  result.warnings # => [...]
658
660
  ----
659
661
 
662
+ == Validation System
663
+
664
+ Glossarist provides a rule-based validation framework that checks dataset
665
+ directories and GCR packages for structural, schema, reference, integrity,
666
+ quality, and localization issues.
667
+
668
+ === Architecture
669
+
670
+ The validation system uses the **rule-registry pattern** (Open/Closed
671
+ Principle). Each check is a self-describing rule class that subclasses
672
+ `Glossarist::Validation::Rules::Base`. New rules are added by subclassing and
673
+ registering — no existing code is modified.
674
+
675
+ ```
676
+ Glossarist::Validation
677
+ ├── Rules
678
+ │ ├── Base # Abstract rule: code, category, severity, scope, check
679
+ │ ├── Registry # Global registry: register, all, for_category, for_scope
680
+ │ ├── DatasetContext # Lazy-loaded access to a directory dataset
681
+ │ ├── GcrContext # Lazy-loaded access to a .gcr package
682
+ │ └── (26 rule classes) # One file per rule
683
+ ├── ValidationIssue # Single finding: severity, code, message, location, suggestion
684
+ ├── BibliographyIndex # Index of bibliography anchors from sources + bibliography.yaml
685
+ ├── AssetIndex # Index of asset paths from images/ directory or GCR ZIP
686
+ ├── ConceptValidator # Orchestrator: runs per-concept rules
687
+ ├── GcrValidator # Orchestrator: runs GCR-level rules
688
+ └── DatasetValidator # Orchestrator: runs directory-level + collection rules
689
+ ```
690
+
691
+ === Rule Categories
692
+
693
+ Rules are classified into six MECE (Mutually Exclusive, Collectively
694
+ Exhaustive) categories:
695
+
696
+ [cols="1,2"]
697
+ |===
698
+ |Category |What it checks
699
+
700
+ |`structure` |File/directory layout, ZIP contents, required parts
701
+ |`schema` |Field types, enum values, required fields, YAML syntax
702
+ |`references` |Cross-references between concepts, bibliography, assets
703
+ |`integrity` |Metadata vs. reality, filename vs. ID, UUID cross-references
704
+ |`quality` |Empty content, missing preferred terms, duplicate terms
705
+ |`localization` |Language coverage, orphaned/missing localization files
706
+ |===
707
+
708
+ === Built-in Rules
709
+
710
+ The following rules are registered by default. Each rule has a unique code
711
+ (e.g. `GLS-001`), a severity (`error` or `warning`), and a scope (`:concept`
712
+ for per-concept checks or `:collection` for dataset-wide checks).
713
+
714
+ ==== Structure Rules
715
+
716
+ [cols="1,2,1,1"]
717
+ |===
718
+ |Code |Rule |Severity |Scope
719
+
720
+ |GLS-001 |Concept ID is present |error |`:concept`
721
+ |GLS-002 |At least one localization per concept |error |`:concept`
722
+ |GLS-005 |Each localization has at least 1 term |error |`:concept`
723
+ |GLS-020-YAML |bibliography.yaml is valid YAML |error |`:collection`
724
+ |===
725
+
726
+ ==== Schema Rules
727
+
728
+ [cols="1,2,1,1"]
729
+ |===
730
+ |Code |Rule |Severity |Scope
731
+
732
+ |GLS-003 |Entry status is a valid enum value |error |`:concept`
733
+ |GLS-201 |Concept status is a valid enum value |error |`:concept`
734
+ |GLS-202/203 |Source type and status are valid enums |error |`:concept`
735
+ |GLS-200 |Related concept type is valid |error |`:concept`
736
+ |GLS-204 |Designation normative_status is valid |error |`:concept`
737
+ |GLS-205 |Date type is a valid enum |warning |`:concept`
738
+ |GLS-206 |Language code is exactly 3 lowercase letters |error |`:concept`
739
+ |GLS-207 |Designation type maps to a known subclass |error |`:concept`
740
+ |===
741
+
742
+ ==== Reference Rules
743
+
744
+ [cols="1,2,1,1"]
745
+ |===
746
+ |Code |Rule |Severity |Scope
747
+
748
+ |GLS-100 |`{{...}}` concept mentions resolve locally |warning |`:concept`
749
+ |GLS-102 |`<<anchor>>` AsciiDoc xrefs resolve in bibliography index |warning |`:concept`
750
+ |GLS-103-105 |Image references resolve in asset index |warning |`:concept`
751
+ |GLS-110 |Related concept references resolve |warning |`:concept`
752
+ |GLS-020 |Orphaned bibliography entries |warning |`:collection`
753
+ |GLS-021 |Orphaned images |warning |`:collection`
754
+ |GLS-112 |Supersedes/superseded_by symmetry check |warning |`:collection`
755
+ |GLS-113 |No circular related-concept chains |error |`:collection`
756
+ |===
757
+
758
+ ==== Integrity Rules
759
+
760
+ [cols="1,2,1,1"]
761
+ |===
762
+ |Code |Rule |Severity |Scope
763
+
764
+ |GLS-001-U |Concept IDs are unique |error |`:collection`
765
+ |GLS-011 |Concept count matches metadata |error |`:collection`
766
+ |GLS-012 |Language list matches actual languages |warning |`:collection`
767
+ |GLS-013 |Language coverage per concept |warning |`:concept`
768
+ |GLS-015 |Filename matches concept ID (GCR) |error |`:concept`
769
+ |GLS-016 |Concept URI is set or template is applicable |warning |`:collection`
770
+ |GLS-018 |Localized concept UUID cross-references resolve |error |`:concept`
771
+ |GLS-019 |Orphaned localization files |warning |`:collection`
772
+ |===
773
+
774
+ ==== Quality Rules
775
+
776
+ [cols="1,2,1,1"]
777
+ |===
778
+ |Code |Rule |Severity |Scope
779
+
780
+ |GLS-300 |Definition content is non-empty |warning |`:concept`
781
+ |GLS-301 |At least one preferred designation per localization |warning |`:concept`
782
+ |GLS-302 |No duplicate preferred terms within a language |warning |`:collection`
783
+ |GLS-304 |Source citation is not empty |warning |`:concept`
784
+ |GLS-306 |At least one authoritative source |warning |`:concept`
785
+ |GLS-307 |Date values are parseable |warning |`:concept`
786
+ |===
787
+
788
+ === Cross-Reference Validation
789
+
790
+ The validation system checks that all references in concept content point to
791
+ resources that actually exist:
792
+
793
+ * **Bibliographic cross-references** — AsciiDoc `<<anchor>>` xrefs are checked
794
+ against a `BibliographyIndex` built from all `ConceptSource` entries and
795
+ optional `bibliography.yaml`.
796
+ * **Image/asset references** — `image::path[]` references and model-level asset
797
+ paths (`NonVerbRep`, `GraphicalSymbol`) are checked against an `AssetIndex`
798
+ built from the `images/` directory or GCR ZIP entries.
799
+ * **Inter-concept references** — `{{...}}` concept mentions are checked against
800
+ the concept collection for local references, and against registered GCR
801
+ packages for inter-set URN references.
802
+
803
+ === Validation Result
804
+
805
+ `ValidationResult` holds the aggregated findings from all rules:
806
+
807
+ [,ruby]
808
+ ----
809
+ result = DatasetValidator.new.validate("path/to/dataset")
810
+ result.valid? # => true if no errors
811
+ result.errors # => Array of error strings
812
+ result.warnings # => Array of warning strings
813
+ result.issues # => Array of ValidationIssue objects (full detail)
814
+ ----
815
+
816
+ Each `ValidationIssue` carries structured metadata:
817
+
818
+ [,ruby]
819
+ ----
820
+ issue = result.issues.first
821
+ issue.severity # => "error" or "warning"
822
+ issue.code # => "GLS-300"
823
+ issue.message # => "definition 1 has empty content"
824
+ issue.location # => "concepts/100.yaml/eng"
825
+ issue.suggestion # => "Add definition text or remove the empty entry"
826
+ issue.to_s # => "[ERROR] [GLS-300] concepts/100.yaml/eng: definition 1 has empty content"
827
+ ----
828
+
829
+ === Adding Custom Rules
830
+
831
+ New validation rules are added by subclassing `Base` and registering with the
832
+ global `Registry`. This extends validation without modifying existing code:
833
+
834
+ [,ruby]
835
+ ----
836
+ class MyCustomRule < Glossarist::Validation::Rules::Base
837
+ def code = "CUSTOM-001"
838
+ def category = :quality
839
+ def severity = "warning"
840
+ def scope = :concept
841
+
842
+ def applicable?(context)
843
+ context.concept&.localizations&.any?
844
+ end
845
+
846
+ def check(context)
847
+ issues = []
848
+ context.concept.localizations.each do |l10n|
849
+ # ... your check logic ...
850
+ if some_condition
851
+ issues << issue("something is wrong",
852
+ location: context.file_name,
853
+ suggestion: "how to fix it")
854
+ end
855
+ end
856
+ issues
857
+ end
858
+ end
859
+
860
+ Glossarist::Validation::Rules::Registry.register(MyCustomRule)
861
+ ----
862
+
863
+ Custom rules are automatically picked up by `DatasetValidator`, `GcrValidator`,
864
+ and `ConceptValidator` on the next validation run.
865
+
660
866
  === upgrade
661
867
 
662
868
  Upgrade a dataset to the current schema version.
data/glossarist.gemspec CHANGED
@@ -32,9 +32,9 @@ Gem::Specification.new do |spec|
32
32
  spec.require_paths = ["lib"]
33
33
 
34
34
  spec.add_dependency "lutaml-model", "~> 0.8.5"
35
- spec.add_dependency "sts", "~> 0.5.6"
36
35
  spec.add_dependency "relaton", ">= 2.0.0", "< 3"
37
36
  spec.add_dependency "rubyzip", ">= 2.3", "< 3"
37
+ spec.add_dependency "sts", "~> 0.5.6"
38
38
  spec.add_dependency "tbx", "~> 0.1"
39
39
  spec.add_dependency "thor"
40
40
  end
data/lib/glossarist/asset_reference.rb ADDED
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Glossarist
4
+ class AssetReference
5
+ attr_reader :path, :location
6
+
7
+ def initialize(path:, location: nil)
8
+ @path = path
9
+ @location = location
10
+ end
11
+
12
+ def dedup_key
13
+ path
14
+ end
15
+ end
16
+ end
data/lib/glossarist/bibliographic_reference.rb ADDED
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Glossarist
4
+ class BibliographicReference
5
+ attr_reader :anchor, :location
6
+
7
+ def initialize(anchor:, location: nil)
8
+ @anchor = anchor
9
+ @location = location
10
+ end
11
+
12
+ def dedup_key
13
+ anchor
14
+ end
15
+ end
16
+ end
data/lib/glossarist/concept_enricher.rb CHANGED
@@ -8,6 +8,7 @@ module Glossarist
8
8
  concepts.each do |mc|
9
9
  mc.localizations.each do |l10n|
10
10
  refs = extractor.extract_from_localized_concept(l10n)
11
+ .grep(ConceptReference)
11
12
  next if refs.empty?
12
13
 
13
14
  existing = l10n.data.references || []
data/lib/glossarist/concept_reference.rb CHANGED
@@ -41,5 +41,9 @@ module Glossarist
41
41
  h["ref_type"] = ref_type if ref_type
42
42
  h.compact
43
43
  end
44
+
45
+ def dedup_key
46
+ concept_id ? [source, concept_id] : [source, concept_id, term]
47
+ end
44
48
  end
45
49
  end
data/lib/glossarist/concept_validator.rb CHANGED
@@ -2,9 +2,6 @@
2
2
 
3
3
  module Glossarist
4
4
  class ConceptValidator
5
- LANG_CODES = Glossarist::LANG_CODES
6
- VALID_ENTRY_STATUSES = %w[valid superseded withdrawn draft].freeze
7
-
8
5
  attr_reader :path, :errors, :warnings
9
6
 
10
7
  def initialize(path)
@@ -14,23 +11,46 @@ module Glossarist
14
11
  end
15
12
 
16
13
  def validate_all
17
- seen_ids = {}
14
+ result = ValidationResult.new
15
+ context = Validation::Rules::DatasetContext.new(@path)
16
+ concept_rules = Validation::Rules::Registry.for_scope(:concept)
18
17
  file_idx = 0
19
18
 
20
19
  ConceptCollector.each_concept(@path) do |concept|
21
20
  fname = concept_file_name(concept, file_idx)
22
- validate_concept(concept, fname, seen_ids)
21
+ concept_context = Validation::Rules::ConceptContext.new(
22
+ concept, file_name: fname, collection_context: context
23
+ )
24
+
25
+ concept_rules.each do |rule|
26
+ next unless rule.applicable?(concept_context)
27
+
28
+ rule.check(concept_context).each { |i| result.add_issue(i) }
29
+ end
30
+
23
31
  file_idx += 1
24
32
  end
25
33
 
26
34
  if file_idx.zero?
27
35
  yaml_files = find_yaml_files
28
36
  if yaml_files.any?
29
- @errors << "YAML files found but no parseable concepts"
37
+ result.add_error("YAML files found but no parseable concepts")
30
38
  end
31
39
  end
32
40
 
33
- ValidationResult.new(errors: @errors, warnings: @warnings)
41
+ # Run collection-level rules
42
+ collection_rules = Validation::Rules::Registry.for_scope(:collection)
43
+ collection_rules.each do |rule|
44
+ next unless rule.applicable?(context)
45
+
46
+ rule.check(context).each { |i| result.add_issue(i) }
47
+ end
48
+
49
+ # Sync legacy arrays for backward compatibility
50
+ @errors = result.errors
51
+ @warnings = result.warnings
52
+
53
+ result
34
54
  end
35
55
 
36
56
  private
@@ -48,54 +68,5 @@ module Glossarist
48
68
  id = concept.data&.id
49
69
  id ? "concept-#{id}.yaml" : "concept-#{idx}.yaml"
50
70
  end
51
-
52
- def validate_concept(concept, fname, seen_ids)
53
- validate_id(concept, fname, seen_ids)
54
- validate_localizations(concept, fname)
55
- validate_entry_statuses(concept, fname)
56
- end
57
-
58
- def validate_id(concept, fname, seen_ids)
59
- id = concept.data&.id
60
- unless id
61
- @errors << "#{fname}: missing concept id"
62
- return
63
- end
64
-
65
- id_str = id.to_s
66
- if seen_ids[id_str]
67
- @errors << "#{fname}: duplicate id '#{id_str}' (first seen in #{seen_ids[id_str]})"
68
- else
69
- seen_ids[id_str] = fname
70
- end
71
- end
72
-
73
- def validate_localizations(concept, fname)
74
- l10ns = concept.localizations&.values || []
75
- if l10ns.empty?
76
- @errors << "#{fname}: no localizations found"
77
- return
78
- end
79
-
80
- l10ns.each do |l10n|
81
- lang = l10n.language_code || "unknown"
82
- terms = l10n.data&.terms
83
- unless terms.is_a?(Array) && terms.any?
84
- @errors << "#{fname}/#{lang}: must have at least 1 term"
85
- end
86
- end
87
- end
88
-
89
- def validate_entry_statuses(concept, fname)
90
- (concept.localizations&.values || []).each do |l10n|
91
- lang = l10n.language_code || "unknown"
92
- status = l10n.data&.entry_status
93
- next unless status
94
-
95
- unless VALID_ENTRY_STATUSES.include?(status)
96
- @errors << "#{fname}/#{lang}: invalid entry_status '#{status}' (expected one of: #{VALID_ENTRY_STATUSES.join(', ')})"
97
- end
98
- end
99
- end
100
71
  end
101
72
  end