glossarist 2.6.4 → 2.6.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. checksums.yaml +4 -4
  2. data/.github/workflows/release.yml +1 -4
  3. data/.rubocop_todo.yml +25 -74
  4. data/CLAUDE.md +27 -2
  5. data/Gemfile +0 -2
  6. data/README.adoc +650 -29
  7. data/config.yml +68 -1
  8. data/glossarist.gemspec +1 -1
  9. data/lib/glossarist/asset_reference.rb +16 -0
  10. data/lib/glossarist/bibliographic_reference.rb +16 -0
  11. data/lib/glossarist/concept.rb +1 -1
  12. data/lib/glossarist/concept_data.rb +4 -0
  13. data/lib/glossarist/concept_enricher.rb +1 -0
  14. data/lib/glossarist/concept_reference.rb +14 -17
  15. data/lib/glossarist/concept_validator.rb +27 -56
  16. data/lib/glossarist/dataset_validator.rb +30 -34
  17. data/lib/glossarist/designation/abbreviation.rb +0 -2
  18. data/lib/glossarist/designation/base.rb +21 -1
  19. data/lib/glossarist/designation/expression.rb +3 -0
  20. data/lib/glossarist/designation/letter_symbol.rb +0 -4
  21. data/lib/glossarist/designation/symbol.rb +0 -2
  22. data/lib/glossarist/gcr_validator.rb +26 -101
  23. data/lib/glossarist/glossary_definition.rb +5 -0
  24. data/lib/glossarist/managed_concept_data.rb +21 -2
  25. data/lib/glossarist/non_verb_rep.rb +21 -6
  26. data/lib/glossarist/pronunciation.rb +32 -0
  27. data/lib/glossarist/reference_extractor.rb +78 -10
  28. data/lib/glossarist/reference_resolver.rb +1 -0
  29. data/lib/glossarist/urn_resolver.rb +13 -1
  30. data/lib/glossarist/v1/concept.rb +7 -0
  31. data/lib/glossarist/validation/asset_index.rb +114 -0
  32. data/lib/glossarist/validation/bibliography_index.rb +121 -0
  33. data/lib/glossarist/validation/rules/asciidoc_xref_rule.rb +60 -0
  34. data/lib/glossarist/validation/rules/authoritative_source_rule.rb +47 -0
  35. data/lib/glossarist/validation/rules/base.rb +46 -0
  36. data/lib/glossarist/validation/rules/bibliography_yaml_rule.rb +37 -0
  37. data/lib/glossarist/validation/rules/citation_completeness_rule.rb +63 -0
  38. data/lib/glossarist/validation/rules/concept_context.rb +45 -0
  39. data/lib/glossarist/validation/rules/concept_count_rule.rb +34 -0
  40. data/lib/glossarist/validation/rules/concept_id_rule.rb +29 -0
  41. data/lib/glossarist/validation/rules/concept_id_uniqueness_rule.rb +42 -0
  42. data/lib/glossarist/validation/rules/concept_mention_rule.rb +44 -0
  43. data/lib/glossarist/validation/rules/concept_status_rule.rb +36 -0
  44. data/lib/glossarist/validation/rules/concept_uri_rule.rb +30 -0
  45. data/lib/glossarist/validation/rules/dataset_context.rb +99 -0
  46. data/lib/glossarist/validation/rules/date_type_rule.rb +54 -0
  47. data/lib/glossarist/validation/rules/date_validity_rule.rb +66 -0
  48. data/lib/glossarist/validation/rules/definition_content_rule.rb +41 -0
  49. data/lib/glossarist/validation/rules/designation_status_rule.rb +45 -0
  50. data/lib/glossarist/validation/rules/designation_type_rule.rb +55 -0
  51. data/lib/glossarist/validation/rules/duplicate_term_rule.rb +63 -0
  52. data/lib/glossarist/validation/rules/entry_status_rule.rb +39 -0
  53. data/lib/glossarist/validation/rules/filename_id_rule.rb +35 -0
  54. data/lib/glossarist/validation/rules/gcr_context.rb +92 -0
  55. data/lib/glossarist/validation/rules/image_reference_rule.rb +73 -0
  56. data/lib/glossarist/validation/rules/l10n_uuid_integrity_rule.rb +40 -0
  57. data/lib/glossarist/validation/rules/language_code_format_rule.rb +39 -0
  58. data/lib/glossarist/validation/rules/language_coverage_rule.rb +37 -0
  59. data/lib/glossarist/validation/rules/language_list_rule.rb +46 -0
  60. data/lib/glossarist/validation/rules/localization_presence_rule.rb +25 -0
  61. data/lib/glossarist/validation/rules/orphaned_bibliography_rule.rb +64 -0
  62. data/lib/glossarist/validation/rules/orphaned_images_rule.rb +68 -0
  63. data/lib/glossarist/validation/rules/orphaned_l10n_files_rule.rb +39 -0
  64. data/lib/glossarist/validation/rules/preferred_term_rule.rb +41 -0
  65. data/lib/glossarist/validation/rules/registry.rb +42 -0
  66. data/lib/glossarist/validation/rules/related_concept_cycle_rule.rb +102 -0
  67. data/lib/glossarist/validation/rules/related_concept_rule.rb +40 -0
  68. data/lib/glossarist/validation/rules/related_concept_symmetry_rule.rb +87 -0
  69. data/lib/glossarist/validation/rules/source_type_rule.rb +63 -0
  70. data/lib/glossarist/validation/rules/terms_presence_rule.rb +39 -0
  71. data/lib/glossarist/validation/rules.rb +85 -0
  72. data/lib/glossarist/validation/validation_issue.rb +39 -0
  73. data/lib/glossarist/validation.rb +12 -0
  74. data/lib/glossarist/validation_result.rb +26 -9
  75. data/lib/glossarist/version.rb +1 -1
  76. data/lib/glossarist.rb +4 -0
  77. metadata +62 -16
@@ -4,7 +4,7 @@ require "zip"
4
4
 
5
5
  module Glossarist
6
6
  class GcrValidator
7
- def validate(zip_path) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
7
+ def validate(zip_path)
8
8
  result = ValidationResult.new
9
9
 
10
10
  unless File.exist?(zip_path)
@@ -13,123 +13,48 @@ module Glossarist
13
13
  end
14
14
 
15
15
  begin
16
- Zip::File.open(zip_path) do |zip_file|
17
- validate_zip_contents(zip_file, result)
18
- end
16
+ zip_entries = Zip::File.open(zip_path) { |zf| zf.entries.to_set(&:name) }
19
17
  rescue StandardError => e
20
18
  result.add_error("Failed to read ZIP: #{e.message}")
19
+ return result
21
20
  end
22
21
 
23
- result
24
- end
25
-
26
- private
27
-
28
- def validate_zip_contents(zip_file, result) # rubocop:disable Metrics/AbcSize
29
- unless zip_file.find_entry("metadata.yaml")
22
+ unless zip_entries.include?("metadata.yaml")
30
23
  result.add_error("Missing metadata.yaml")
31
- return
32
- end
33
-
34
- metadata = GcrMetadata.from_yaml(
35
- zip_file.find_entry("metadata.yaml").get_input_stream.read,
36
- )
37
- validate_metadata(metadata, result)
38
-
39
- concept_entries = zip_file.entries.select do |e|
40
- e.name.start_with?("concepts/") && e.name.end_with?(".yaml")
41
- end
42
- if concept_entries.empty?
43
- result.add_error("No concept files found in concepts/")
44
- end
45
-
46
- concept_entries.each do |entry|
47
- validate_concept_entry(entry, metadata, result)
48
- end
49
-
50
- validate_assets(zip_file, result)
51
- end
52
-
53
- def validate_metadata(metadata, result)
54
- unless metadata&.concept_count
55
- result.add_error("metadata.yaml missing required fields (concept_count)")
56
- end
57
-
58
- unless metadata&.shortname
59
- result.add_error("metadata.yaml missing shortname")
24
+ return result
60
25
  end
61
26
 
62
- unless metadata&.version
63
- result.add_error("metadata.yaml missing version")
27
+ begin
28
+ context = Validation::Rules::GcrContext.new(zip_path)
29
+ rescue StandardError => e
30
+ result.add_error("Failed to load GCR: #{e.message}")
31
+ return result
64
32
  end
65
- end
66
33
 
67
- def validate_concept_entry(entry, metadata, result) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
68
- raw = entry.get_input_stream.read
69
- doc = ConceptDocument.from_yamls(raw)
70
- rescue Psych::SyntaxError => e
71
- result.add_error("#{entry.name}: invalid YAML at line #{e.line}: #{e.message}")
72
- rescue StandardError => e
73
- result.add_error("#{entry.name}: parse error: #{e.message}")
74
- else
75
- concept = doc.concept
76
- unless concept&.data&.id
77
- result.add_error("#{entry.name}: document 0 missing data.identifier")
78
- end
34
+ # Collection-level rules (metadata, structure, integrity)
35
+ collection_rules = Validation::Rules::Registry.for_scope(:collection)
36
+ collection_rules.each do |rule|
37
+ next unless rule.applicable?(context)
79
38
 
80
- localizations = doc.localizations
81
- if localizations.empty?
82
- result.add_error("#{entry.name}: expected at least 1 localization document")
83
- else
84
- localizations.each_with_index do |l10n, idx|
85
- unless l10n&.language_code
86
- result.add_error("#{entry.name}: document #{idx + 1} missing data.language_code")
87
- end
88
- end
39
+ rule.check(context).each { |i| result.add_issue(i) }
89
40
  end
90
41
 
91
- validate_concept_uri(entry, concept, metadata, result)
92
- end
93
-
94
- def validate_concept_uri(entry, concept, metadata, result) # rubocop:disable Metrics/CyclomaticComplexity
95
- concept_uri = concept&.data&.uri
96
- template = metadata&.concept_uri_template
97
- uri_prefix = metadata&.uri_prefix
42
+ # Per-concept rules
43
+ concept_rules = Validation::Rules::Registry.for_scope(:concept)
44
+ context.concepts.each_with_index do |concept, idx|
45
+ fname = concept.data&.id ? "concepts/#{concept.data.id}.yaml" : "concepts/concept-#{idx}.yaml"
46
+ concept_context = Validation::Rules::ConceptContext.new(
47
+ concept, file_name: fname, collection_context: context
48
+ )
98
49
 
99
- if concept_uri.nil? && template.nil? && uri_prefix.nil?
100
- result.add_warning("#{entry.name}: no concept URI (data.uri) and no concept_uri_template or uri_prefix in metadata")
101
- end
102
- end
50
+ concept_rules.each do |rule|
51
+ next unless rule.applicable?(concept_context)
103
52
 
104
- def validate_assets(zip_file, result)
105
- GcrPackage::DATASET_ASSETS.each do |asset|
106
- case asset[:type]
107
- when :file
108
- validate_file_asset_entry(zip_file, asset[:path], result)
109
- when :directory
110
- validate_directory_asset(zip_file, asset[:path], result)
53
+ rule.check(concept_context).each { |i| result.add_issue(i) }
111
54
  end
112
55
  end
113
- end
114
-
115
- def validate_file_asset_entry(zip_file, path, result)
116
- entry = zip_file.find_entry(path)
117
- return unless entry
118
-
119
- YAML.safe_load(entry.get_input_stream.read)
120
- rescue Psych::SyntaxError => e
121
- result.add_error("#{path}: invalid YAML at line #{e.line}: #{e.message}")
122
- end
123
56
 
124
- def validate_directory_asset(zip_file, dir_path, result)
125
- dir_entries = zip_file.entries.select do |e|
126
- e.name.start_with?("#{dir_path}/")
127
- end
128
- return unless dir_entries.any? && dir_entries.all? do |e|
129
- e.name.end_with?("/")
130
- end
131
-
132
- result.add_warning("#{dir_path}/ directory is empty")
57
+ result
133
58
  end
134
59
  end
135
60
  end
@@ -28,5 +28,10 @@ module Glossarist
28
28
  CONCEPT_DATE_TYPES = config.dig("concept_date", "type").freeze
29
29
 
30
30
  CONCEPT_STATUSES = config.dig("concept", "status").freeze
31
+
32
+ DESIGNATION_RELATIONSHIP_TYPES = config.dig("designation",
33
+ "relationship_type")&.freeze
34
+
35
+ ISO12620_TERM_TYPES = config.dig("iso12620", "term_type").freeze
31
36
  end
32
37
  end
@@ -5,7 +5,7 @@ module Glossarist
5
5
  attribute :id, :string
6
6
  attribute :uri, :string
7
7
  attribute :localized_concepts, :hash
8
- attribute :groups, :string, collection: true
8
+ attribute :domains, ConceptReference, collection: true
9
9
  attribute :sources, ConceptSource, collection: true
10
10
  attribute :localizations, LocalizedConcept,
11
11
  collection: Collections::LocalizationCollection,
@@ -16,7 +16,8 @@ module Glossarist
16
16
  with: { to: :id_to_yaml, from: :id_from_yaml }
17
17
  map :uri, to: :uri
18
18
  map %i[localized_concepts localizedConcepts], to: :localized_concepts
19
- map :groups, to: :groups
19
+ map %i[domains groups], to: :domains,
20
+ with: { from: :domains_from_yaml, to: :domains_to_yaml }
20
21
  map :sources, to: :sources
21
22
  map :localizations, to: :localizations,
22
23
  with: { from: :localizations_from_yaml, to: :localizations_to_yaml }
@@ -41,6 +42,24 @@ module Glossarist
41
42
 
42
43
  def localizations_to_yaml(model, doc); end
43
44
 
45
# Custom YAML reader for the `domains` attribute (the mapping also accepts
# the `groups` key).
#
# Values that are not arrays are ignored. Hash items are parsed with
# ConceptReference.of_yaml; every other item is stringified and wrapped in
# a ConceptReference with ref_type "domain".
#
# model - object receiving the parsed references via `domains=`
# value - raw value read from the YAML document
def domains_from_yaml(model, value)
  return unless value.is_a?(Array)

  model.domains = value.map do |entry|
    next ConceptReference.of_yaml(entry) if entry.is_a?(Hash)

    ConceptReference.new(concept_id: entry.to_s, ref_type: "domain")
  end
end
56
+
57
# Custom YAML writer for the `domains` attribute.
#
# Writes each domain's `to_hash` result under the "domains" key of +doc+.
# Nothing is written when the model has no domains (nil or empty).
#
# model - object exposing `domains`
# doc   - hash-like document being built
def domains_to_yaml(model, doc)
  domains = model.domains
  return if domains.nil? || domains.empty?

  doc["domains"] = domains.map { |reference| reference.to_hash }
end
62
+
44
63
  def authoritative_source
45
64
  return [] unless sources
46
65
 
@@ -1,14 +1,29 @@
1
+ # frozen_string_literal: true
2
+
1
3
  module Glossarist
4
+ # A non-verbal representation used to help define a concept, following
5
+ # ISO 10241-1 §6.5.
6
+ #
7
+ # Non-verbal representations are associated resources (images, tables,
8
+ # formulas) that live outside the concept model. They are referenced by URI
9
+ # and can be shared across concepts. The resource belongs either to the
10
+ # dataset package (relative path) or is externally referenced (URL/URN).
11
+ #
12
+ # Each non-verbal representation specifies:
13
+ # - +type+: one of "image", "table", "formula"
14
+ # - +ref+: URI reference to the resource (relative path, URN, or URL)
15
+ # - +text+: optional text description or alt text
16
+ # - +sources+: bibliographic sources for the representation
2
17
  class NonVerbRep < Lutaml::Model::Serializable
3
- attribute :image, :string
4
- attribute :table, :string
5
- attribute :formula, :string
18
+ attribute :type, :string
19
+ attribute :ref, :string
20
+ attribute :text, :string
6
21
  attribute :sources, ConceptSource, collection: true
7
22
 
8
23
  key_value do
9
- map :image, to: :image
10
- map :table, to: :table
11
- map :formula, to: :formula
24
+ map :type, to: :type
25
+ map :ref, to: :ref
26
+ map :text, to: :text
12
27
  map :sources, to: :sources
13
28
  end
14
29
  end
@@ -0,0 +1,32 @@
1
# frozen_string_literal: true

module Glossarist
  # A pronunciation or transcription of a designation, following ISO 24229
  # spelling system conventions.
  #
  # Each entry carries the pronunciation text (+content+) and the context in
  # which it is expressed:
  # - +language+ (ISO 639) identifies the language or dialect being pronounced
  # - +script+ (ISO 15924) identifies the script used for the pronunciation text
  # - +country+ (ISO 3166-1) identifies the country variant
  # - +system+ identifies the transcription/romanization system used (an
  #   ISO 24229 conversion system code or a simple identifier such as "IPA")
  #
  # A designation can have multiple pronunciations, e.g.:
  # - IPA: { content: "toːkjoː", script: "Latn", language: "jpn", system: "IPA" }
  # - Hepburn: { content: "Tōkyō", script: "Latn", language: "jpn", system: "Var:jpn-Hrkt:Latn:Hepburn-1886" }
  # - Cyrillic: { content: "Токио", script: "Cyrl", language: "rus", system: "polivanov" }
  class Pronunciation < Lutaml::Model::Serializable
    attribute :content, :string
    attribute :language, :string
    attribute :script, :string
    attribute :country, :string
    attribute :system, :string

    # Every attribute is serialized under a key of the same name.
    key_value do
      map :content, to: :content
      map :language, to: :language
      map :script, to: :script
      map :country, to: :country
      map :system, to: :system
    end
  end
end
@@ -152,6 +152,63 @@ module Glossarist
152
152
 
153
153
  LANG_CODES = Glossarist::LANG_CODES
154
154
 
155
# Collects asset references declared directly on the model: the +ref+ of
# every NonVerbRep and the +image+ of every graphical-symbol designation,
# across all localizations of +concept+.
#
# Values are stripped; nil or blank values are skipped. Returns an Array of
# AssetReference, ordered per localization (non-verbal representations
# first, then graphical symbols).
def extract_asset_refs_from_concept(concept)
  collected = []

  concept.localizations.each do |localization|
    Array(localization.non_verb_rep).each do |representation|
      next unless representation.is_a?(NonVerbRep)

      location = representation.ref
      next if !location || location.strip.empty?

      collected << AssetReference.new(path: location.strip)
    end

    designations = localization.data&.terms || []
    designations.each do |designation|
      next unless designation.is_a?(Designation::GraphicalSymbol)

      image = designation.image
      next if !image || image.strip.empty?

      collected << AssetReference.new(path: image.strip)
    end
  end

  collected
end
174
+
175
# Builds bibliographic references from the source citations attached to
# each localization (as returned by gather_all_sources).
#
# For every source that has an origin:
# - a non-blank +origin.text+ yields one reference anchored on that text
#   (the text is used as-is, not stripped);
# - when both +origin.source+ and +origin.id+ are present, two more
#   references are added: "<source> <id>" and the id alone.
#
# Returns an Array of BibliographicReference.
def extract_bib_refs_from_concept(concept)
  found = []

  concept.localizations.each do |localization|
    gather_all_sources(localization).each do |citation|
      origin = citation.origin
      next unless origin

      label = origin.text
      found << BibliographicReference.new(anchor: label) if label && !label.strip.empty?

      if origin.source && origin.id
        found << BibliographicReference.new(anchor: "#{origin.source} #{origin.id}")
        found << BibliographicReference.new(anchor: origin.id.to_s)
      end
    end
  end

  found
end
196
+
197
# Returns the references found by extract_from_managed_concept followed by
# the model-level asset references from extract_asset_refs_from_concept.
# Model-level bibliographic references (extract_bib_refs_from_concept) are
# not part of the result.
def extract_all_from_managed_concept(concept)
  extract_from_managed_concept(concept) + extract_asset_refs_from_concept(concept)
end
203
+
204
# Turns the target of an AsciiDoc cross-reference into a
# BibliographicReference anchored on the whitespace-stripped target.
def resolve_asciidoc_xref(target)
  anchor = target.strip
  BibliographicReference.new(anchor: anchor)
end
207
+
208
# Turns an image path captured from text into an AssetReference whose path
# is the whitespace-stripped input.
def resolve_image_ref(path)
  cleaned = path.strip
  AssetReference.new(path: cleaned)
end
211
+
155
212
  private
156
213
 
157
214
  def gather_texts(lc_hash)
@@ -170,16 +227,7 @@ module Glossarist
170
227
 
171
228
  def deduplicate(refs)
172
229
  seen = Set.new
173
- refs.select do |ref|
174
- key = if ref.concept_id
175
- [ref.source,
176
- ref.concept_id]
177
- else
178
- [ref.source, ref.concept_id,
179
- ref.term]
180
- end
181
- seen.add?(key)
182
- end
230
+ refs.select { |ref| seen.add?(ref.dedup_key) }
183
231
  end
184
232
 
185
233
  def extract_term_id_from_urn_tail(tail)
@@ -212,6 +260,18 @@ module Glossarist
212
260
  regex: /\{\{([^}]+)\}\}/,
213
261
  ) { |ext, content| ext.resolve_mention(content) }
214
262
 
263
+ # AsciiDoc cross-references: <<anchor>> or <<anchor,display text>>
264
+ register_pattern(
265
+ name: :asciidoc_xref,
266
+ regex: /<<([^,>\n]+?)(?:,[^>\n]*)?>>/,
267
+ ) { |ext, target| ext.resolve_asciidoc_xref(target) }
268
+
269
+ # Image references: image::path[] or image:path[]
270
+ register_pattern(
271
+ name: :asciidoc_image,
272
+ regex: /image::?([^\[\]]+)\[/,
273
+ ) { |ext, path| ext.resolve_image_ref(path) }
274
+
215
275
  register_identifier_resolver("urn:iec:std:iec:60050") do |ext, identifier, display|
216
276
  ext.resolve_iec_urn(identifier, display)
217
277
  end
@@ -223,5 +283,13 @@ module Glossarist
223
283
  register_identifier_resolver("urn:") do |ext, identifier, display|
224
284
  ext.resolve_generic_urn(identifier, display)
225
285
  end
286
+
287
# Gathers every source attached to a localization: the localization-level
# sources first, then the sources of each definition, note and example
# (in that order). Nil entries coming from the nested items are dropped.
#
# Returns an Array; an absent +data+ or absent collections contribute
# nothing.
def gather_all_sources(l10n)
  collected = Array(l10n.data&.sources)

  %i[definition notes examples].each do |section|
    statements = l10n.data&.public_send(section) || []
    # += builds a new array, so the model's own sources array is untouched.
    collected += statements.flat_map(&:sources).compact
  end

  collected
end
226
294
  end
227
295
  end
@@ -153,6 +153,7 @@ module Glossarist
153
153
  def extract_refs(concept, extractor)
154
154
  if concept.is_a?(ManagedConcept)
155
155
  extractor.extract_from_managed_concept(concept)
156
+ .grep(ConceptReference)
156
157
  else
157
158
  extractor.extract_from_concept_hash(concept)
158
159
  end
@@ -64,7 +64,19 @@ module Glossarist
64
64
  def to_urn(urn_or_reference)
65
65
  case urn_or_reference
66
66
  when String then urn_or_reference
67
- when ConceptReference then urn_or_reference.to_urn
67
+ when ConceptReference then concept_reference_to_urn(urn_or_reference)
68
+ end
69
+ end
70
+
71
# Derives a URN string from a concept reference.
#
# An explicit, non-blank +ref.urn+ is returned unchanged. Otherwise a URN is
# only built for external references that have both a source and a concept
# id; the separator between them depends on the source prefix:
#   urn:iec… -> "<source>-<concept_id>"
#   urn:iso… -> "<source>:term:<concept_id>"
#   anything else -> "<source>/<concept_id>"
#
# Returns the URN String, or nil when none can be derived.
def concept_reference_to_urn(ref)
  explicit = ref.urn
  return explicit if explicit && !explicit.strip.empty?
  return nil unless ref.external? && ref.source && ref.concept_id

  separator =
    case ref.source
    when /\Aurn:iec/ then "-"
    when /\Aurn:iso/ then ":term:"
    else "/"
    end

  "#{ref.source}#{separator}#{ref.concept_id}"
end
70
82
  end
@@ -42,6 +42,7 @@ module Glossarist
42
42
  mc.add_localization(LocalizedConcept.of_yaml({ "data" => data }))
43
43
  end
44
44
 
45
+ assign_domains(mc) if groups.is_a?(Array) && groups.any?
45
46
  assign_references(mc) if references.is_a?(Array) && references.any?
46
47
 
47
48
  mc
@@ -49,6 +50,12 @@ module Glossarist
49
50
 
50
51
  private
51
52
 
53
# Converts this V1 concept's +groups+ into domain references on the given
# managed concept: each group is stringified and wrapped in a
# ConceptReference with ref_type "domain", then stored in
# +concept.data.domains+.
def assign_domains(concept)
  references = groups.map do |group|
    ConceptReference.new(concept_id: group.to_s, ref_type: "domain")
  end

  concept.data.domains = references
end
58
+
52
59
  def assign_references(concept)
53
60
  l10n = concept.localization("eng") || concept.localizations.values.first
54
61
  return unless l10n
@@ -0,0 +1,114 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "set"
4
+ require "zip"
5
+
6
module Glossarist
  module Validation
    # Set of asset paths known to a dataset, used to check whether an asset
    # reference resolves.
    #
    # Paths are stored without a leading "/", so "/images/a.png" and
    # "images/a.png" are the same entry. An index is filled from two places:
    # the files found under the dataset's "images/" directory (or ZIP
    # prefix), and the asset paths referenced by the concepts themselves
    # (NonVerbRep#ref and graphical-symbol images).
    class AssetIndex
      # NOTE(review): not referenced anywhere in this class — confirm whether
      # it is still needed.
      IMAGE_TERMS = %w[id ref text anchor].freeze
      private_constant :IMAGE_TERMS

      # Set of normalized paths registered so far.
      attr_reader :paths

      def initialize
        @paths = Set.new
      end

      # Adds +path+ (leading "/" removed) to the index.
      def register(path)
        @paths.add(normalize_path(path))
      end

      # True when +path+ (leading "/" removed) has been registered.
      def resolve?(path)
        @paths.include?(normalize_path(path))
      end

      # Yields every registered path.
      def each_path(&block)
        @paths.each(&block)
      end

      # Builds an index for a dataset directory: image files on disk plus
      # asset paths referenced by the concepts collected from that directory.
      def self.build_from_directory(dataset_path)
        index = new
        index_image_files(index, dataset_path)
        index_model_assets(index, dataset_path)
        index
      end

      # Builds an index for a packaged dataset: "images/" entries of the ZIP
      # plus asset paths referenced by the package's concepts.
      def self.build_from_zip(zip_path)
        index = new
        index_zip_images(index, zip_path)
        index_zip_concept_assets(index, zip_path)
        index
      end

      private

      # Paths are compared without a leading slash.
      def normalize_path(path)
        path.to_s.delete_prefix("/")
      end

      class << self
        private

        # Registers every regular file below <dataset_path>/images as
        # "images/<path relative to that directory>".
        #
        # The glob is evaluated relative to the images directory, so the
        # registered path does not depend on whether +dataset_path+ is
        # relative, absolute, or contains "." / ".." segments.
        def index_image_files(index, dataset_path)
          images_dir = File.join(dataset_path, "images")
          return unless File.directory?(images_dir)

          Dir.glob("**/*", base: images_dir).each do |relative|
            next unless File.file?(File.join(images_dir, relative))

            index.register(File.join("images", relative))
          end
        end

        # Registers the asset paths referenced by the concepts collected
        # from the dataset directory.
        def index_model_assets(index, dataset_path)
          concepts = ConceptCollector.collect(dataset_path)
          index_concept_assets(index, concepts)
        end

        # Registers every non-directory ZIP entry whose name starts with
        # "images/".
        def index_zip_images(index, zip_path)
          Zip::File.open(zip_path) do |zf|
            zf.entries.each do |entry|
              next if entry.name.end_with?("/")
              next unless entry.name.start_with?("images/")

              index.register(entry.name)
            end
          end
        end

        # Registers the asset paths referenced by the concepts of the
        # package loaded from +zip_path+.
        def index_zip_concept_assets(index, zip_path)
          pkg = GcrPackage.load(zip_path)
          index_concept_assets(index, pkg.concepts)
        end

        def index_concept_assets(index, concepts)
          concepts.each do |concept|
            concept.localizations.each do |l10n|
              register_non_verb_rep(index, l10n)
              register_graphical_symbols(index, l10n)
            end
          end
        end

        # Registers the stripped +ref+ of each NonVerbRep; blank refs are
        # skipped.
        def register_non_verb_rep(index, l10n)
          Array(l10n.non_verb_rep).each do |nvr|
            next unless nvr.is_a?(NonVerbRep) && nvr.ref && !nvr.ref.strip.empty?

            index.register(nvr.ref.strip)
          end
        end

        # Registers the stripped +image+ of each graphical-symbol
        # designation; blank images are skipped, mirroring the handling of
        # NonVerbRep refs above.
        def register_graphical_symbols(index, l10n)
          (l10n.data&.terms || []).each do |term|
            next unless term.is_a?(Designation::GraphicalSymbol)

            image = term.image
            next unless image && !image.strip.empty?

            index.register(image.strip)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
module Glossarist
  module Validation
    # Lookup table of bibliographic anchors known to a dataset.
    #
    # Anchors are normalized before storage and lookup: spaces, "/" and ":"
    # become "_" and runs of "_" collapse to one, so "ISO 9000:2015" and
    # "ISO_9000_2015" address the same entry. Entries come from the source
    # citations of the concepts and, optionally, from bibliography YAML.
    class BibliographyIndex
      # Keys of a bibliography YAML entry whose values are registered as
      # anchors.
      BIB_ENTRY_KEYS = %w[id ref text anchor].freeze
      private_constant :BIB_ENTRY_KEYS

      # Hash of normalized anchor => { anchor: <as given>, source: <object> }.
      attr_reader :entries

      def initialize
        @entries = {}
      end

      # Registers +anchor+ under its normalized form; a later registration
      # of the same normalized anchor replaces the earlier one.
      def register(anchor, source = nil)
        @entries[normalize_anchor(anchor)] = { anchor: anchor, source: source }
      end

      # True when the normalized form of +anchor+ has been registered.
      def resolve?(anchor)
        @entries.key?(normalize_anchor(anchor))
      end

      # All registered anchors, in normalized form.
      def anchors
        @entries.keys
      end

      # Yields each stored { anchor:, source: } hash.
      def each_entry(&block)
        @entries.each_value(&block)
      end

      # Builds an index from the sources cited by +concepts+, plus the
      # entries of a bibliography YAML document.
      #
      # bibliography_yaml - YAML text; when nil, <dataset_path>/bibliography.yaml
      #                     is read if +dataset_path+ is given and the file exists.
      def self.build_from_concepts(concepts, dataset_path: nil,
                                   bibliography_yaml: nil)
        index = new

        concepts.each { |concept| index_concept_sources(index, concept) }

        yaml = bibliography_yaml || read_bibliography_file(dataset_path)
        index_bibliography_yaml(index, yaml) if yaml

        index
      end

      private

      def normalize_anchor(anchor)
        anchor.to_s.gsub(/[ \/:]/, "_").gsub(/__+/, "_")
      end

      class << self
        private

        def read_bibliography_file(dataset_path)
          return nil unless dataset_path

          bib_path = File.join(dataset_path, "bibliography.yaml")
          File.exist?(bib_path) ? File.read(bib_path) : nil
        end

        def index_concept_sources(index, concept)
          concept.localizations.each do |l10n|
            index_l10n_sources(index, l10n)
          end
        end

        # Registers the localization-level sources and the sources of each
        # definition, note and example.
        def index_l10n_sources(index, l10n)
          data = l10n.data
          return unless data

          register_source_collection(index, data.sources)
          register_source_collection(index,
                                     data.definition&.flat_map(&:sources))
          register_source_collection(index, data.notes&.flat_map(&:sources))
          register_source_collection(index, data.examples&.flat_map(&:sources))
        end

        def register_source_collection(index, sources)
          Array(sources).compact.each { |s| register_source(index, s) }
        end

        def register_source(index, source)
          origin = source.origin
          return unless origin

          register_origin_text(index, origin)
          register_origin_ref(index, origin)
        end

        # Registers the origin's free text as an anchor when it is not blank.
        def register_origin_text(index, origin)
          return unless origin.text && !origin.text.strip.empty?

          index.register(origin.text, origin)
        end

        # Registers "<source> <id>" and the bare id when both are present.
        def register_origin_ref(index, origin)
          return unless origin.source && origin.id

          key = "#{origin.source} #{origin.id}"
          index.register(key, origin)
          index.register(origin.id.to_s, origin)
        end

        # Registers the BIB_ENTRY_KEYS values of every hash entry found in
        # the YAML document (the values of a top-level mapping, or the items
        # of a top-level sequence).
        #
        # Indexing is best effort: a document that cannot be safely loaded
        # contributes nothing. Psych::Exception is the common base of
        # Psych::SyntaxError, Psych::DisallowedClass and Psych::BadAlias, so
        # a bibliography that uses YAML aliases (rejected by safe_load) is
        # ignored like any other unparsable document instead of aborting
        # index construction.
        def index_bibliography_yaml(index, yaml_content)
          data = YAML.safe_load(yaml_content)
          return unless data.is_a?(Hash) || data.is_a?(Array)

          entries = data.is_a?(Hash) ? data.values : data
          entries.each do |entry|
            next unless entry.is_a?(Hash)

            BIB_ENTRY_KEYS.each do |key|
              val = entry[key]
              index.register(val.to_s, entry) if val && !val.to_s.strip.empty?
            end
          end
        rescue Psych::Exception
          nil
        end
      end
    end
  end
end