glossarist 2.6.6 → 2.6.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. checksums.yaml +4 -4
  2. data/README.adoc +90 -29
  3. data/glossarist.gemspec +2 -0
  4. data/lib/glossarist/citation.rb +26 -123
  5. data/lib/glossarist/cli/compare_command.rb +106 -0
  6. data/lib/glossarist/cli/export_command.rb +11 -14
  7. data/lib/glossarist/cli/validate_command.rb +111 -20
  8. data/lib/glossarist/cli.rb +18 -0
  9. data/lib/glossarist/collections/bibliography_collection.rb +4 -2
  10. data/lib/glossarist/collections/localization_collection.rb +2 -0
  11. data/lib/glossarist/comparison_result.rb +35 -0
  12. data/lib/glossarist/concept_collector.rb +44 -0
  13. data/lib/glossarist/concept_comparator.rb +72 -0
  14. data/lib/glossarist/concept_data.rb +16 -0
  15. data/lib/glossarist/concept_diff.rb +15 -0
  16. data/lib/glossarist/concept_document.rb +11 -0
  17. data/lib/glossarist/concept_manager.rb +19 -5
  18. data/lib/glossarist/concept_ref.rb +13 -0
  19. data/lib/glossarist/concept_validator.rb +6 -1
  20. data/lib/glossarist/context_configuration.rb +90 -0
  21. data/lib/glossarist/dataset_validator.rb +8 -4
  22. data/lib/glossarist/designation/prefix.rb +17 -0
  23. data/lib/glossarist/designation/suffix.rb +17 -0
  24. data/lib/glossarist/gcr_metadata.rb +7 -14
  25. data/lib/glossarist/gcr_package.rb +35 -23
  26. data/lib/glossarist/gcr_validator.rb +38 -17
  27. data/lib/glossarist/localized_concept.rb +8 -0
  28. data/lib/glossarist/managed_concept.rb +39 -6
  29. data/lib/glossarist/managed_concept_data.rb +2 -1
  30. data/lib/glossarist/rdf/ext/jsonld_transform_ext.rb +208 -0
  31. data/lib/glossarist/rdf/ext/mapping_ext.rb +37 -0
  32. data/lib/glossarist/rdf/ext/mapping_rule_ext.rb +27 -0
  33. data/lib/glossarist/rdf/ext/member_rule_ext.rb +34 -0
  34. data/lib/glossarist/rdf/ext/turtle_transform_ext.rb +222 -0
  35. data/lib/glossarist/rdf/ext.rb +39 -0
  36. data/lib/glossarist/rdf/gloss_citation.rb +36 -0
  37. data/lib/glossarist/rdf/gloss_concept.rb +58 -0
  38. data/lib/glossarist/rdf/gloss_concept_date.rb +24 -0
  39. data/lib/glossarist/rdf/gloss_concept_reference.rb +29 -0
  40. data/lib/glossarist/rdf/gloss_concept_source.rb +37 -0
  41. data/lib/glossarist/rdf/gloss_designation.rb +146 -0
  42. data/lib/glossarist/rdf/gloss_detailed_definition.rb +24 -0
  43. data/lib/glossarist/rdf/gloss_grammar_info.rb +57 -0
  44. data/lib/glossarist/rdf/gloss_locality.rb +25 -0
  45. data/lib/glossarist/rdf/gloss_localized_concept.rb +67 -0
  46. data/lib/glossarist/rdf/gloss_non_verbal_rep.rb +31 -0
  47. data/lib/glossarist/rdf/gloss_pronunciation.rb +32 -0
  48. data/lib/glossarist/rdf/gloss_reference.rb +55 -0
  49. data/lib/glossarist/rdf/namespaces/glossarist_namespace.rb +12 -0
  50. data/lib/glossarist/rdf/namespaces/iso_thes_namespace.rb +12 -0
  51. data/lib/glossarist/rdf/namespaces/owl_namespace.rb +12 -0
  52. data/lib/glossarist/rdf/namespaces/prov_namespace.rb +12 -0
  53. data/lib/glossarist/rdf/namespaces/rdf_namespace.rb +12 -0
  54. data/lib/glossarist/rdf/namespaces/skosxl_namespace.rb +12 -0
  55. data/lib/glossarist/rdf/namespaces.rb +8 -2
  56. data/lib/glossarist/rdf/relationships.rb +19 -0
  57. data/lib/glossarist/rdf/v3/configuration.rb +15 -0
  58. data/lib/glossarist/rdf/v3.rb +79 -0
  59. data/lib/glossarist/rdf.rb +22 -2
  60. data/lib/glossarist/reference_extractor.rb +12 -19
  61. data/lib/glossarist/reference_resolver.rb +3 -3
  62. data/lib/glossarist/related_concept.rb +2 -10
  63. data/lib/glossarist/schema_migration.rb +39 -0
  64. data/lib/glossarist/sts/term_mapper.rb +2 -2
  65. data/lib/glossarist/transforms/concept_to_gloss_transform.rb +355 -0
  66. data/lib/glossarist/transforms.rb +2 -2
  67. data/lib/glossarist/v1/concept.rb +17 -17
  68. data/lib/glossarist/v2/citation.rb +36 -0
  69. data/lib/glossarist/v2/concept_data.rb +46 -0
  70. data/lib/glossarist/v2/concept_document.rb +18 -0
  71. data/lib/glossarist/v2/concept_ref.rb +8 -0
  72. data/lib/glossarist/v2/concept_source.rb +16 -0
  73. data/lib/glossarist/v2/configuration.rb +13 -0
  74. data/lib/glossarist/v2/detailed_definition.rb +14 -0
  75. data/lib/glossarist/v2/localized_concept.rb +9 -0
  76. data/lib/glossarist/v2/managed_concept.rb +25 -0
  77. data/lib/glossarist/v2/managed_concept_data.rb +49 -0
  78. data/lib/glossarist/v2/related_concept.rb +15 -0
  79. data/lib/glossarist/v2.rb +28 -0
  80. data/lib/glossarist/v3/bibliography_entry.rb +19 -0
  81. data/lib/glossarist/v3/bibliography_file.rb +27 -0
  82. data/lib/glossarist/v3/citation.rb +30 -0
  83. data/lib/glossarist/v3/concept_data.rb +46 -0
  84. data/lib/glossarist/v3/concept_document.rb +18 -0
  85. data/lib/glossarist/v3/concept_ref.rb +8 -0
  86. data/lib/glossarist/v3/concept_source.rb +16 -0
  87. data/lib/glossarist/v3/configuration.rb +13 -0
  88. data/lib/glossarist/v3/detailed_definition.rb +14 -0
  89. data/lib/glossarist/v3/image_entry.rb +21 -0
  90. data/lib/glossarist/v3/image_file.rb +31 -0
  91. data/lib/glossarist/v3/localized_concept.rb +9 -0
  92. data/lib/glossarist/v3/managed_concept.rb +26 -0
  93. data/lib/glossarist/v3/managed_concept_data.rb +34 -0
  94. data/lib/glossarist/v3/related_concept.rb +15 -0
  95. data/lib/glossarist/v3.rb +36 -0
  96. data/lib/glossarist/validation/bibliography_index.rb +61 -30
  97. data/lib/glossarist/validation/rules/asciidoc_xref_rule.rb +2 -15
  98. data/lib/glossarist/validation/rules/authoritative_source_rule.rb +2 -15
  99. data/lib/glossarist/validation/rules/base.rb +5 -0
  100. data/lib/glossarist/validation/rules/bibliography_yaml_rule.rb +2 -3
  101. data/lib/glossarist/validation/rules/citation_completeness_rule.rb +5 -27
  102. data/lib/glossarist/validation/rules/dataset_context.rb +8 -3
  103. data/lib/glossarist/validation/rules/date_validity_rule.rb +1 -1
  104. data/lib/glossarist/validation/rules/designation_status_rule.rb +0 -1
  105. data/lib/glossarist/validation/rules/designation_type_rule.rb +1 -5
  106. data/lib/glossarist/validation/rules/domain_ref_rule.rb +37 -0
  107. data/lib/glossarist/validation/rules/domain_target_rule.rb +56 -0
  108. data/lib/glossarist/validation/rules/gcr_context.rb +12 -13
  109. data/lib/glossarist/validation/rules/image_reference_rule.rb +2 -17
  110. data/lib/glossarist/validation/rules/locality_completeness_rule.rb +58 -0
  111. data/lib/glossarist/validation/rules/localization_consistency_rule.rb +72 -0
  112. data/lib/glossarist/validation/rules/localization_presence_rule.rb +1 -1
  113. data/lib/glossarist/validation/rules/model_validity_rule.rb +71 -0
  114. data/lib/glossarist/validation/rules/orphaned_bibliography_rule.rb +1 -13
  115. data/lib/glossarist/validation/rules/orphaned_images_rule.rb +16 -11
  116. data/lib/glossarist/validation/rules/ref_shape_rule.rb +68 -0
  117. data/lib/glossarist/validation/rules/related_concept_cycle_rule.rb +1 -3
  118. data/lib/glossarist/validation/rules/related_concept_symmetry_rule.rb +1 -3
  119. data/lib/glossarist/validation/rules/related_concept_target_rule.rb +64 -0
  120. data/lib/glossarist/validation/rules/schema_version_rule.rb +41 -0
  121. data/lib/glossarist/validation/rules/source_type_rule.rb +1 -15
  122. data/lib/glossarist/validation/rules/source_urn_format_rule.rb +65 -0
  123. data/lib/glossarist/validation/rules/uuid_format_rule.rb +33 -0
  124. data/lib/glossarist/validation/rules.rb +10 -43
  125. data/lib/glossarist/validation/validation_issue.rb +14 -11
  126. data/lib/glossarist/validation_result.rb +12 -22
  127. data/lib/glossarist/version.rb +1 -1
  128. data/lib/glossarist.rb +9 -0
  129. data/memory/project-status.md +43 -0
  130. data/scripts/migrate_dataset.rb +180 -0
  131. data/scripts/migrate_isotc204_to_v3.rb +134 -0
  132. data/scripts/migrate_isotc211_to_v3.rb +153 -0
  133. data/scripts/migrate_osgeo_to_v3.rb +155 -0
  134. data/scripts/upgrade_dataset_to_v3.rb +47 -0
  135. metadata +111 -6
  136. data/TODO.integration/01-gcr-package-cli.md +0 -180
  137. data/lib/glossarist/rdf/skos_concept.rb +0 -43
  138. data/lib/glossarist/rdf/skos_vocabulary.rb +0 -25
  139. data/lib/glossarist/transforms/concept_to_skos_transform.rb +0 -131
@@ -0,0 +1,180 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Migrate glossarist datasets to current schema format.
4
+ #
5
+ # Usage:
6
+ # bundle exec ruby scripts/migrate_dataset.rb SOURCE_DIR OUTPUT_DIR [--add-iev-domains]
7
+ #
8
+ # --add-iev-domains: Add domain ConceptReference objects for IEV-style identifiers
9
+ # (e.g. "426-24-74" → area-426, section-426-24)
10
+
11
+ require "glossarist"
12
+ require "fileutils"
13
+
14
# Separate option flags from positional paths so that --add-iev-domains is
# recognised wherever it appears on the command line and is never mistaken
# for SOURCE_DIR or OUTPUT_DIR.
flags, paths = ARGV.partition { |arg| arg.start_with?("--") }
source_dir, output_dir = paths
add_iev_domains = flags.include?("--add-iev-domains")

unless source_dir && output_dir
  abort "Usage: bundle exec ruby scripts/migrate_dataset.rb SOURCE_DIR OUTPUT_DIR [--add-iev-domains]"
end

# Work with absolute paths from here on.
source_dir = File.expand_path(source_dir)
output_dir = File.expand_path(output_dir)

abort "Error: #{source_dir} is not a directory" unless File.directory?(source_dir)
28
+
29
# Adds a ManagedConcept for every "area-*" and "section-*" domain that is
# referenced from +collection+ but has no concept of its own yet.
#
# Area concepts get "narrower" relations to the referenced sections whose id
# starts with "section-<area code>-". Section concepts get a "broader"
# relation to "area-<first id segment>". Every new concept receives one "eng"
# localization built by build_domain_localization.
#
# @param collection [Glossarist::ManagedConceptCollection] scanned for domain
#   references and extended in place through #store
def add_subject_area_concepts(collection)
  areas = {}
  sections = {}

  # Collect the distinct area/section ids referenced by any concept.
  collection.each do |concept|
    domains = concept.data.domains
    next unless domains

    domains.each do |ref|
      next unless ref.is_a?(Glossarist::ConceptReference) && ref.concept_id

      id = ref.concept_id
      if id.start_with?("area-")
        areas[id] = true
      elsif id.start_with?("section-")
        sections[id] = true
      end
    end
  end

  # Snapshot of the ids present before anything is added. A plain Hash is used
  # for the lookup so the script does not depend on Set being loaded.
  existing_ids = {}
  collection.each { |concept| existing_ids[concept.data.id] = true }

  areas.each_key do |area_id|
    next if existing_ids.key?(area_id)

    mc = Glossarist::ManagedConcept.new(
      data: Glossarist::ManagedConceptData.new(
        id: area_id,
        domains: [Glossarist::ConceptReference.domain(area_id)],
      ),
    )

    code = area_id.delete_prefix("area-")
    narrower = sections.keys.select { |s| s.start_with?("section-#{code}-") }
    mc.related = narrower.map { |s| Glossarist::RelatedConcept.new(type: "narrower", content: s) }

    mc.add_l10n(build_domain_localization(area_id, code, "eng"))

    collection.store(mc)
  end

  sections.each_key do |section_id|
    next if existing_ids.key?(section_id)

    parts = section_id.delete_prefix("section-").split("-")
    area_id = "area-#{parts[0]}"

    mc = Glossarist::ManagedConcept.new(
      data: Glossarist::ManagedConceptData.new(
        id: section_id,
        domains: [
          Glossarist::ConceptReference.domain(area_id),
          Glossarist::ConceptReference.domain(section_id),
        ],
      ),
    )

    mc.related = [Glossarist::RelatedConcept.new(type: "broader", content: area_id)]

    # Label is "<area>-<section>" when the id has both segments, else the
    # single segment that is present.
    section_code = parts.length > 1 ? parts[0..1].join("-") : parts[0]
    l10n = build_domain_localization(section_id, section_code, "eng")
    l10n.data.domain = area_id
    mc.add_l10n(l10n)

    collection.store(mc)
  end
end
96
+
97
# Builds a "valid" LocalizedConcept holding a single preferred expression
# whose designation is +id+.
#
# @param id [String] used both as the concept data id and as the designation
# @param label [String] accepted for the callers' benefit but not used here
# @param lang_code [String] language code stored on the concept data
# @return [Glossarist::LocalizedConcept]
def build_domain_localization(id, label, lang_code)
  preferred_term = Glossarist::Designation::Expression.new(
    type: "expression",
    designation: id,
    normative_status: "preferred",
  )
  concept_data = Glossarist::ConceptData.new(
    id: id,
    language_code: lang_code,
    terms: [preferred_term],
  )

  Glossarist::LocalizedConcept.new.tap do |localization|
    localization.data = concept_data
    localization.entry_status = "valid"
  end
end
115
+
116
# A "concept" subdirectory marks the managed layout (concept/ +
# localized_concept/); anything else is treated as grouped (*.yaml).
is_managed_format = File.directory?(File.join(source_dir, "concept"))
format_name = is_managed_format ? "managed" : "grouped"

puts "Loading concepts from #{source_dir} (#{format_name} format)..."

collection = Glossarist::ManagedConceptCollection.new
collection.load_from_files(source_dir)

puts "Loaded #{collection.count} concepts"

# Optional step: derive area/section domains from IEV-style identifiers.
if add_iev_domains
  puts "Adding IEV domain references..."

  collection.each do |concept|
    current_domains = concept.data.domains
    # Concepts that already carry domains are left untouched.
    next if current_domains && !current_domains.empty?

    identifier = concept.data.id.to_s
    next if identifier.empty?
    next if identifier.start_with?("area-", "section-")

    # "426-24-74" -> area "426", section "24"; ids without a second segment are skipped.
    area_code, section_code = identifier.split("-")
    next unless section_code

    concept.data.domains = [
      Glossarist::ConceptReference.domain("area-#{area_code}"),
      Glossarist::ConceptReference.domain("section-#{area_code}-#{section_code}"),
    ]
  end

  puts "Adding subject area hierarchy concepts..."
  add_subject_area_concepts(collection)

  puts "Domains added. Total concepts: #{collection.count}"
end

# Write the collection back out in the same layout it was read in.
puts "Saving to #{output_dir}..."

concepts_out = is_managed_format ? File.join(output_dir, "concepts") : File.join(output_dir)
FileUtils.mkdir_p(concepts_out)

if is_managed_format
  collection.save_to_files(concepts_out)
else
  collection.save_grouped_concepts_to_files(concepts_out)
end
166
+
167
# Copy register.yaml, if the source dataset has one next to its concepts
# directory, to the matching place next to the output directory. An existing
# destination file is never overwritten.
# NOTE(review): the destination is the parent of OUTPUT_DIR for both layouts,
# as before; confirm that this is the intended location for managed output.
register_src = File.join(File.dirname(source_dir), "register.yaml")
register_dst = File.join(File.dirname(output_dir), "register.yaml")

if File.exist?(register_src) && !File.exist?(register_dst)
  FileUtils.mkdir_p(File.dirname(register_dst))
  FileUtils.cp(register_src, register_dst)
end

puts "Done. #{collection.count} concepts migrated."
@@ -0,0 +1,134 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Migration script for isotc204-glossary: adds v3 fields to existing concepts.
5
+ #
6
+ # Adds:
7
+ # - status: "valid" at managed concept level
8
+ # - date_accepted at managed concept level
9
+ # - domains (section ConceptReference) at managed concept data level
10
+ # - related (broader to section) at managed concept level
11
+ # - Creates section hierarchy concepts with narrower relations
12
+ #
13
+ # Usage:
14
+ # cd /Users/mulgogi/src/glossarist/glossarist-ruby
15
+ # bundle exec ruby scripts/migrate_isotc204_to_v3.rb
16
+ #
17
+ # Safe to run multiple times (idempotent).
18
+
19
+ require "glossarist"
20
+
21
# Directory holding the isotc204-glossary concept files. The first
# command-line argument overrides the built-in default path.
DIR = ARGV.fetch(0, "/Users/mulgogi/src/geolexica/isotc204-glossary/concepts")

# Section number => section title; one hierarchy concept is created per entry.
ISO_TS_14812_SECTIONS = {
  "3.1" => "General concepts",
  "3.2" => "Transport information and control",
  "3.3" => "ITS station",
  "3.4" => "Communications",
  "3.5" => "ITS services",
  "3.6" => "Geospatial",
  "3.7" => "Driving automation",
}.freeze

# Values stamped onto the migrated concepts.
ISO_TS_14812_DATE = "2022-01-01T00:00:00+00:00"
ISO_TS_14812_SOURCE = "urn:iso:std:iso:ts:14812"
35
+
36
collection = Glossarist::ManagedConceptCollection.new
collection.load_from_files(DIR)

puts "Loaded #{collection.count} concepts"

# Track which concepts belong to which section for narrower relations
section_children = Hash.new { |h, k| h[k] = [] }
updated = 0

collection.each do |concept|
  identifier = concept.data.id.to_s

  # Section hierarchy concepts saved by an earlier run are loaded back from
  # DIR together with the terms. They are rebuilt further down, so skip them
  # here; otherwise a rerun would file them under "section-section-…".
  next if identifier.start_with?("section-")

  # "3.1.2.4" -> section code "3.1" -> section id "section-3-1"
  section_code = identifier.split(".")[0..1].join(".")
  section_uri = "section-#{section_code.gsub('.', '-')}"
  section_children[section_uri] << identifier

  # Set status
  concept.status = "valid" unless concept.status

  # Set date_accepted
  unless concept.date_accepted
    concept.date_accepted = Glossarist::ConceptDate.new(
      type: "accepted",
      date: ISO_TS_14812_DATE,
    )
  end

  # Add domain ConceptReference (only once per section)
  concept.data.domains ||= []
  unless concept.data.domains.any? { |d| d.concept_id == section_uri }
    concept.data.domains << Glossarist::ConceptReference.new(
      concept_id: section_uri,
      source: ISO_TS_14812_SOURCE,
      ref_type: "domain",
    )
  end

  # Add broader relation to section (only once per section)
  concept.related ||= []
  unless concept.related.any? { |r| r.type == "broader" && r.ref&.id == section_uri }
    concept.related << Glossarist::RelatedConcept.new(
      type: "broader",
      content: section_uri,
      ref: Glossarist::Citation.new(source: "ISO/TS 14812", id: section_uri),
    )
  end

  updated += 1
end

puts "Updated #{updated} concepts with status, date_accepted, domains, related"
83
+
84
# Build one hierarchy concept per ISO/TS 14812 section and link it to the
# concepts recorded for that section.
ISO_TS_14812_SECTIONS.each do |code, title|
  section_uri = "section-#{code.tr('.', '-')}"

  section_domain = Glossarist::ConceptReference.new(
    concept_id: section_uri,
    source: ISO_TS_14812_SOURCE,
    ref_type: "domain",
  )
  mc = Glossarist::ManagedConcept.new(
    data: Glossarist::ManagedConceptData.new(id: section_uri, domains: [section_domain]),
  )
  mc.status = "valid"
  mc.date_accepted = Glossarist::ConceptDate.new(type: "accepted", date: ISO_TS_14812_DATE)

  # English localization whose preferred term is the section title.
  l10n = Glossarist::LocalizedConcept.new
  title_term = Glossarist::Designation::Expression.new(
    type: "expression",
    designation: title,
    normative_status: "preferred",
  )
  l10n.data = Glossarist::ConceptData.new(
    id: section_uri,
    language_code: "eng",
    terms: [title_term],
  )
  l10n.entry_status = "valid"
  mc.add_l10n(l10n)

  # Narrower relations are only assigned when the section has children.
  children = section_children[section_uri]
  unless children.empty?
    mc.related = children.sort.map do |child_id|
      child_ref = Glossarist::Citation.new(source: "ISO/TS 14812", id: child_id)
      Glossarist::RelatedConcept.new(type: "narrower", content: child_id, ref: child_ref)
    end
  end

  collection.store(mc)
  puts "Created section concept: #{section_uri} (#{title}) — #{children.length} narrower"
end

collection.save_grouped_concepts_to_files(DIR)
puts "Saved #{collection.count} concepts to #{DIR}"
@@ -0,0 +1,153 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Migration script for isotc211-glossary: adds v3 fields to existing concepts.
5
+ #
6
+ # Adds:
7
+ # - domains (ISO standard ConceptReference) at managed concept data level
8
+ # - related (broader to standard domain) at managed concept level
9
+ # - Creates domain concepts per ISO standard (with narrower relations)
10
+ #
11
+ # Idempotent: safe to run multiple times.
12
+ #
13
+ # Usage:
14
+ # cd /Users/mulgogi/src/glossarist/glossarist-ruby
15
+ # bundle exec ruby scripts/migrate_isotc211_to_v3.rb
16
+
17
+ require "glossarist"
18
+
19
# Directory holding the isotc211-glossary concept files. The first
# command-line argument overrides the built-in default path.
DIR = ARGV.fetch(0, "/Users/mulgogi/src/geolexica/isotc211-glossary/concepts")

# Source URN set on the domain ConceptReference objects this script creates.
ISO_SOURCE_URN = "urn:iso:std:iso"
22
+
23
# Reference shapes recognised by extract_domain_id. They are tried in the
# order listed, and the first pattern that matches anywhere in the text wins.
ISO_DOMAIN_ID_PATTERNS = [
  %r{ISO/IEC/IEEE\s+([\d-]+)},     # ISO/IEC/IEEE 24765:2017
  %r{ISO/IEC\s+([\d-]+)},          # ISO/IEC 19501:2005
  %r{ISO/TS\s+([\d-]+)},           # ISO/TS 19130:2010
  %r{ISO/TR\s+([\d-]+)},           # ISO/TR 19120:2001
  %r{ISO/IEC\s+Guide\s+([\d-]+)},  # ISO/IEC Guide 98-3:2008
  %r{ISO\s+DIS\s+([\d-]+)},        # ISO DIS 19123-1:2022
  %r{ISO\s+([\d]+-?[\d]*)},        # ISO 19136-1:2020
].freeze

# Extract a stable domain ID from an authoritative source reference string.
#
# The ID is the matched organisation prefix plus the document number,
# lower-cased, with every run of whitespace or "/" replaced by "-". The
# publication year is not part of the ID.
#
# @param ref_text [String, nil] e.g. "ISO 19136-1:2020", "ISO/IEC 19501:2005"
# @return [String, nil] domain ID e.g. "iso-19136-1", "iso-iec-19501";
#   nil when ref_text is nil or matches none of the patterns
def extract_domain_id(ref_text)
  return nil unless ref_text

  ISO_DOMAIN_ID_PATTERNS.each do |pattern|
    m = ref_text.match(pattern)
    next unless m

    # Everything the pattern matched ahead of the captured number,
    # e.g. "ISO/IEC Guide".
    prefix = ref_text[m.begin(0)...m.begin(1)].strip
    return "#{prefix} #{m[1]}".downcase.gsub(%r{[\s/]+}, "-")
  end

  nil
end
50
+
51
collection = Glossarist::ManagedConceptCollection.new
collection.load_from_files(DIR)

puts "Loaded #{collection.count} concepts"

# domain_id => ids of the concepts assigned to that domain
domain_index = {}
concepts_with_domain = 0
concepts_without_domain = 0

collection.each do |concept|
  # Only concepts with an English authoritative source that has an origin
  # are considered; the rest are skipped without being counted.
  english = concept.localization("eng")
  authoritative = english&.data&.sources&.find { |s| s.type == "authoritative" }
  origin = authoritative&.origin
  next unless origin

  ref_text = origin.text || origin.ref
  domain_id = ref_text && extract_domain_id(ref_text)
  unless domain_id
    concepts_without_domain += 1
    next
  end

  (domain_index[domain_id] ||= []) << concept.data.id
  concepts_with_domain += 1

  # Attach the domain reference unless it is already present.
  concept.data.domains ||= []
  if concept.data.domains.none? { |d| d.concept_id == domain_id }
    concept.data.domains << Glossarist::ConceptReference.new(
      concept_id: domain_id,
      source: ISO_SOURCE_URN,
      ref_type: "domain",
    )
  end

  # Attach the broader relation unless it is already present.
  concept.related ||= []
  if concept.related.none? { |r| r.type == "broader" && r.ref&.id == domain_id }
    concept.related << Glossarist::RelatedConcept.new(
      type: "broader",
      content: domain_id,
      ref: Glossarist::Citation.new(source: "ISO", id: domain_id),
    )
  end
end

puts "Added domains to #{concepts_with_domain} concepts"
puts "No domain extracted for #{concepts_without_domain} concepts"
109
+
110
# Build one hierarchy concept per extracted domain, in domain id order.
domain_index.sort_by(&:first).each do |domain_id, child_ids|
  domain_ref = Glossarist::ConceptReference.new(
    concept_id: domain_id,
    source: ISO_SOURCE_URN,
    ref_type: "domain",
  )
  mc = Glossarist::ManagedConcept.new(
    data: Glossarist::ManagedConceptData.new(id: domain_id, domains: [domain_ref]),
  )
  mc.status = "valid"

  # Basic English localization that uses the domain ID itself as the term.
  l10n = Glossarist::LocalizedConcept.new
  id_term = Glossarist::Designation::Expression.new(
    type: "expression",
    designation: domain_id,
    normative_status: "preferred",
  )
  l10n.data = Glossarist::ConceptData.new(
    id: domain_id,
    language_code: "eng",
    terms: [id_term],
  )
  l10n.entry_status = "valid"
  mc.add_l10n(l10n)

  # One narrower relation per child concept, sorted by id.
  mc.related = child_ids.sort.map do |child_id|
    child_ref = Glossarist::Citation.new(source: "ISO", id: child_id.to_s)
    Glossarist::RelatedConcept.new(type: "narrower", content: child_id.to_s, ref: child_ref)
  end

  collection.store(mc)
  puts "Created domain: #{domain_id} — #{child_ids.size} narrower"
end

collection.save_grouped_concepts_to_files(DIR)
puts "Saved #{collection.count} concepts to #{DIR}"
@@ -0,0 +1,155 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Migration script for osgeo-glossary: adds v3 fields to existing concepts.
5
+ #
6
+ # Adds:
7
+ # - status: "valid" at managed concept level
8
+ # - date_accepted at managed concept level (set to 2011-01-01 as earliest
9
+ # known publication date for the OSGeo Lexicon)
10
+ # - domains for concepts with identifiable ISO standard sources
11
+ # - related (broader to domain) for those concepts
12
+ # - Creates domain concepts for identified ISO standards
13
+ #
14
+ # Idempotent: safe to run multiple times.
15
+ #
16
+ # Usage:
17
+ # cd /Users/mulgogi/src/glossarist/glossarist-ruby
18
+ # bundle exec ruby scripts/migrate_osgeo_to_v3.rb
19
+
20
+ require "glossarist"
21
+
22
# Directory holding the osgeo-glossary concept files. The first
# command-line argument overrides the built-in default path.
DIR = ARGV.fetch(0, "/Users/mulgogi/src/geolexica/osgeo-glossary/concepts")

# Date used for date_accepted on concepts that do not have one yet.
OSGEO_DATE = "2011-01-01T00:00:00+00:00"
25
+
26
# Derives a domain ID of the form "iso-<number>" from a source reference.
#
# The reference shapes are tried in the order listed and the first one that
# matches anywhere in the text decides the number. The organisation prefix
# (IEC, TS, TR, ...) is not kept in the result.
#
# @param ref_text [String, nil] e.g. "ISO/TS 19130:2010"
# @return [String, nil] e.g. "iso-19130"; nil for nil input or no match
def extract_domain_id(ref_text)
  return nil unless ref_text

  [
    %r{ISO/IEC/IEEE\s+([\d-]+)},
    %r{ISO/IEC\s+([\d-]+)},
    %r{ISO/TS\s+([\d-]+)},
    %r{ISO/TR\s+([\d-]+)},
    %r{ISO\s+([\d]+-?[\d]*)},
  ].each do |pattern|
    number = ref_text[pattern, 1]
    return "iso-#{number}" if number
  end

  nil
end
46
+
47
collection = Glossarist::ManagedConceptCollection.new
collection.load_from_files(DIR)

puts "Loaded #{collection.count} concepts"

# domain_id => ids of the concepts assigned to that domain
domain_index = {}
concepts_with_domain = 0

collection.each do |concept|
  # Fill in status and date_accepted only where they are missing.
  concept.status ||= "valid"
  concept.date_accepted ||= Glossarist::ConceptDate.new(
    type: "accepted",
    date: OSGEO_DATE,
  )

  # A domain is derived only from an English authoritative source with an origin.
  english = concept.localization("eng")
  authoritative = english&.data&.sources&.find { |s| s.type == "authoritative" }
  origin = authoritative&.origin
  next unless origin

  ref_text = origin.text || origin.ref
  domain_id = ref_text && extract_domain_id(ref_text)
  next unless domain_id

  (domain_index[domain_id] ||= []) << concept.data.id
  concepts_with_domain += 1

  # Attach the domain reference unless it is already present.
  concept.data.domains ||= []
  if concept.data.domains.none? { |d| d.concept_id == domain_id }
    concept.data.domains << Glossarist::ConceptReference.new(
      concept_id: domain_id,
      source: "urn:iso:std:iso",
      ref_type: "domain",
    )
  end

  # Attach the broader relation unless it is already present.
  concept.related ||= []
  if concept.related.none? { |r| r.type == "broader" && r.ref&.id == domain_id }
    concept.related << Glossarist::RelatedConcept.new(
      type: "broader",
      content: domain_id,
      ref: Glossarist::Citation.new(source: "ISO", id: domain_id),
    )
  end
end

puts "Added status and date_accepted to #{collection.count} concepts"
puts "Added domains to #{concepts_with_domain} concepts with ISO sources"
109
+
110
# Build one hierarchy concept per extracted domain, in domain id order.
domain_index.sort_by(&:first).each do |domain_id, child_ids|
  domain_ref = Glossarist::ConceptReference.new(
    concept_id: domain_id,
    source: "urn:iso:std:iso",
    ref_type: "domain",
  )
  mc = Glossarist::ManagedConcept.new(
    data: Glossarist::ManagedConceptData.new(id: domain_id, domains: [domain_ref]),
  )
  mc.status = "valid"
  mc.date_accepted = Glossarist::ConceptDate.new(type: "accepted", date: OSGEO_DATE)

  # English localization that uses the domain ID itself as the preferred term.
  l10n = Glossarist::LocalizedConcept.new
  id_term = Glossarist::Designation::Expression.new(
    type: "expression",
    designation: domain_id,
    normative_status: "preferred",
  )
  l10n.data = Glossarist::ConceptData.new(
    id: domain_id,
    language_code: "eng",
    terms: [id_term],
  )
  l10n.entry_status = "valid"
  mc.add_l10n(l10n)

  # One narrower relation per child concept, sorted by id.
  mc.related = child_ids.sort.map do |child_id|
    child_ref = Glossarist::Citation.new(source: "OSGeo", id: child_id.to_s)
    Glossarist::RelatedConcept.new(type: "narrower", content: child_id.to_s, ref: child_ref)
  end

  collection.store(mc)
  puts "Created domain: #{domain_id} — #{child_ids.size} narrower"
end

collection.save_grouped_concepts_to_files(DIR)
puts "Saved #{collection.count} concepts to #{DIR}"
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ # Upgrades a dataset directory to v3 format.
5
+ #
6
+ # Usage: bundle exec ruby scripts/upgrade_dataset_to_v3.rb <concepts_dir> [--grouped|--separate]
7
+ #
8
+ # Formats:
9
+ # --grouped (default): each concept + localizations in a single YAML file
10
+ # --separate: concept/ and localized_concept/ in separate files
11
+ #
12
+ # For each concept:
13
+ # 1. Loads with ConceptManager (auto-detects v2/v3)
14
+ # 2. Promotes data.related -> top-level related (if needed)
15
+ # 3. Sets schema_version = "3"
16
+ # 4. Re-saves in v3 format
17
+
18
+ require "glossarist"
19
+
20
# First argument is the concepts directory; --separate anywhere on the
# command line selects the concept/ + localized_concept/ layout for saving.
dir = ARGV[0]
separate_files = ARGV.include?("--separate")

abort "Usage: #{$PROGRAM_NAME} <concepts_dir> [--grouped|--separate]" unless dir && Dir.exist?(dir)

manager = Glossarist::ConceptManager.new(path: dir)
collection = Glossarist::ManagedConceptCollection.new

puts "Loading concepts from #{dir}..."
manager.load_from_files(collection: collection)

# Run the schema migration on every loaded concept, counting as we go.
migrated = 0
collection.each do |concept|
  Glossarist::SchemaMigration.migrate_concept(concept)
  migrated += 1
end

puts "Migrated #{migrated} concepts. Saving..."

if separate_files
  manager.save_to_files(collection)
else
  manager.save_grouped_concepts_to_files(collection)
end

puts "Done. All concepts now in v3 format."