glossarist 2.6.6 → 2.6.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +90 -29
- data/glossarist.gemspec +2 -0
- data/lib/glossarist/citation.rb +26 -123
- data/lib/glossarist/cli/compare_command.rb +106 -0
- data/lib/glossarist/cli/export_command.rb +11 -14
- data/lib/glossarist/cli/validate_command.rb +111 -20
- data/lib/glossarist/cli.rb +18 -0
- data/lib/glossarist/collections/bibliography_collection.rb +4 -2
- data/lib/glossarist/collections/localization_collection.rb +2 -0
- data/lib/glossarist/comparison_result.rb +35 -0
- data/lib/glossarist/concept_collector.rb +44 -0
- data/lib/glossarist/concept_comparator.rb +72 -0
- data/lib/glossarist/concept_data.rb +16 -0
- data/lib/glossarist/concept_diff.rb +15 -0
- data/lib/glossarist/concept_document.rb +11 -0
- data/lib/glossarist/concept_manager.rb +19 -5
- data/lib/glossarist/concept_ref.rb +13 -0
- data/lib/glossarist/concept_validator.rb +6 -1
- data/lib/glossarist/context_configuration.rb +90 -0
- data/lib/glossarist/dataset_validator.rb +8 -4
- data/lib/glossarist/designation/prefix.rb +17 -0
- data/lib/glossarist/designation/suffix.rb +17 -0
- data/lib/glossarist/gcr_metadata.rb +7 -14
- data/lib/glossarist/gcr_package.rb +35 -23
- data/lib/glossarist/gcr_validator.rb +38 -17
- data/lib/glossarist/localized_concept.rb +8 -0
- data/lib/glossarist/managed_concept.rb +39 -6
- data/lib/glossarist/managed_concept_data.rb +2 -1
- data/lib/glossarist/rdf/ext/jsonld_transform_ext.rb +208 -0
- data/lib/glossarist/rdf/ext/mapping_ext.rb +37 -0
- data/lib/glossarist/rdf/ext/mapping_rule_ext.rb +27 -0
- data/lib/glossarist/rdf/ext/member_rule_ext.rb +34 -0
- data/lib/glossarist/rdf/ext/turtle_transform_ext.rb +222 -0
- data/lib/glossarist/rdf/ext.rb +39 -0
- data/lib/glossarist/rdf/gloss_citation.rb +36 -0
- data/lib/glossarist/rdf/gloss_concept.rb +58 -0
- data/lib/glossarist/rdf/gloss_concept_date.rb +24 -0
- data/lib/glossarist/rdf/gloss_concept_reference.rb +29 -0
- data/lib/glossarist/rdf/gloss_concept_source.rb +37 -0
- data/lib/glossarist/rdf/gloss_designation.rb +146 -0
- data/lib/glossarist/rdf/gloss_detailed_definition.rb +24 -0
- data/lib/glossarist/rdf/gloss_grammar_info.rb +57 -0
- data/lib/glossarist/rdf/gloss_locality.rb +25 -0
- data/lib/glossarist/rdf/gloss_localized_concept.rb +67 -0
- data/lib/glossarist/rdf/gloss_non_verbal_rep.rb +31 -0
- data/lib/glossarist/rdf/gloss_pronunciation.rb +32 -0
- data/lib/glossarist/rdf/gloss_reference.rb +55 -0
- data/lib/glossarist/rdf/namespaces/glossarist_namespace.rb +12 -0
- data/lib/glossarist/rdf/namespaces/iso_thes_namespace.rb +12 -0
- data/lib/glossarist/rdf/namespaces/owl_namespace.rb +12 -0
- data/lib/glossarist/rdf/namespaces/prov_namespace.rb +12 -0
- data/lib/glossarist/rdf/namespaces/rdf_namespace.rb +12 -0
- data/lib/glossarist/rdf/namespaces/skosxl_namespace.rb +12 -0
- data/lib/glossarist/rdf/namespaces.rb +8 -2
- data/lib/glossarist/rdf/relationships.rb +19 -0
- data/lib/glossarist/rdf/v3/configuration.rb +15 -0
- data/lib/glossarist/rdf/v3.rb +79 -0
- data/lib/glossarist/rdf.rb +22 -2
- data/lib/glossarist/reference_extractor.rb +12 -19
- data/lib/glossarist/reference_resolver.rb +3 -3
- data/lib/glossarist/related_concept.rb +2 -10
- data/lib/glossarist/schema_migration.rb +39 -0
- data/lib/glossarist/sts/term_mapper.rb +2 -2
- data/lib/glossarist/transforms/concept_to_gloss_transform.rb +355 -0
- data/lib/glossarist/transforms.rb +2 -2
- data/lib/glossarist/v1/concept.rb +17 -17
- data/lib/glossarist/v2/citation.rb +36 -0
- data/lib/glossarist/v2/concept_data.rb +46 -0
- data/lib/glossarist/v2/concept_document.rb +18 -0
- data/lib/glossarist/v2/concept_ref.rb +8 -0
- data/lib/glossarist/v2/concept_source.rb +16 -0
- data/lib/glossarist/v2/configuration.rb +13 -0
- data/lib/glossarist/v2/detailed_definition.rb +14 -0
- data/lib/glossarist/v2/localized_concept.rb +9 -0
- data/lib/glossarist/v2/managed_concept.rb +25 -0
- data/lib/glossarist/v2/managed_concept_data.rb +49 -0
- data/lib/glossarist/v2/related_concept.rb +15 -0
- data/lib/glossarist/v2.rb +28 -0
- data/lib/glossarist/v3/bibliography_entry.rb +19 -0
- data/lib/glossarist/v3/bibliography_file.rb +27 -0
- data/lib/glossarist/v3/citation.rb +30 -0
- data/lib/glossarist/v3/concept_data.rb +46 -0
- data/lib/glossarist/v3/concept_document.rb +18 -0
- data/lib/glossarist/v3/concept_ref.rb +8 -0
- data/lib/glossarist/v3/concept_source.rb +16 -0
- data/lib/glossarist/v3/configuration.rb +13 -0
- data/lib/glossarist/v3/detailed_definition.rb +14 -0
- data/lib/glossarist/v3/image_entry.rb +21 -0
- data/lib/glossarist/v3/image_file.rb +31 -0
- data/lib/glossarist/v3/localized_concept.rb +9 -0
- data/lib/glossarist/v3/managed_concept.rb +26 -0
- data/lib/glossarist/v3/managed_concept_data.rb +34 -0
- data/lib/glossarist/v3/related_concept.rb +15 -0
- data/lib/glossarist/v3.rb +36 -0
- data/lib/glossarist/validation/bibliography_index.rb +61 -30
- data/lib/glossarist/validation/rules/asciidoc_xref_rule.rb +2 -15
- data/lib/glossarist/validation/rules/authoritative_source_rule.rb +2 -15
- data/lib/glossarist/validation/rules/base.rb +5 -0
- data/lib/glossarist/validation/rules/bibliography_yaml_rule.rb +2 -3
- data/lib/glossarist/validation/rules/citation_completeness_rule.rb +5 -27
- data/lib/glossarist/validation/rules/dataset_context.rb +8 -3
- data/lib/glossarist/validation/rules/date_validity_rule.rb +1 -1
- data/lib/glossarist/validation/rules/designation_status_rule.rb +0 -1
- data/lib/glossarist/validation/rules/designation_type_rule.rb +1 -5
- data/lib/glossarist/validation/rules/domain_ref_rule.rb +37 -0
- data/lib/glossarist/validation/rules/domain_target_rule.rb +56 -0
- data/lib/glossarist/validation/rules/gcr_context.rb +12 -13
- data/lib/glossarist/validation/rules/image_reference_rule.rb +2 -17
- data/lib/glossarist/validation/rules/locality_completeness_rule.rb +58 -0
- data/lib/glossarist/validation/rules/localization_consistency_rule.rb +72 -0
- data/lib/glossarist/validation/rules/localization_presence_rule.rb +1 -1
- data/lib/glossarist/validation/rules/model_validity_rule.rb +71 -0
- data/lib/glossarist/validation/rules/orphaned_bibliography_rule.rb +1 -13
- data/lib/glossarist/validation/rules/orphaned_images_rule.rb +16 -11
- data/lib/glossarist/validation/rules/ref_shape_rule.rb +68 -0
- data/lib/glossarist/validation/rules/related_concept_cycle_rule.rb +1 -3
- data/lib/glossarist/validation/rules/related_concept_symmetry_rule.rb +1 -3
- data/lib/glossarist/validation/rules/related_concept_target_rule.rb +64 -0
- data/lib/glossarist/validation/rules/schema_version_rule.rb +41 -0
- data/lib/glossarist/validation/rules/source_type_rule.rb +1 -15
- data/lib/glossarist/validation/rules/source_urn_format_rule.rb +65 -0
- data/lib/glossarist/validation/rules/uuid_format_rule.rb +33 -0
- data/lib/glossarist/validation/rules.rb +10 -43
- data/lib/glossarist/validation/validation_issue.rb +14 -11
- data/lib/glossarist/validation_result.rb +12 -22
- data/lib/glossarist/version.rb +1 -1
- data/lib/glossarist.rb +9 -0
- data/memory/project-status.md +43 -0
- data/scripts/migrate_dataset.rb +180 -0
- data/scripts/migrate_isotc204_to_v3.rb +134 -0
- data/scripts/migrate_isotc211_to_v3.rb +153 -0
- data/scripts/migrate_osgeo_to_v3.rb +155 -0
- data/scripts/upgrade_dataset_to_v3.rb +47 -0
- metadata +111 -6
- data/TODO.integration/01-gcr-package-cli.md +0 -180
- data/lib/glossarist/rdf/skos_concept.rb +0 -43
- data/lib/glossarist/rdf/skos_vocabulary.rb +0 -25
- data/lib/glossarist/transforms/concept_to_skos_transform.rb +0 -131
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Migrate glossarist datasets to current schema format.
|
|
4
|
+
#
|
|
5
|
+
# Usage:
|
|
6
|
+
# bundle exec ruby scripts/migrate_dataset.rb SOURCE_DIR OUTPUT_DIR [--add-iev-domains]
|
|
7
|
+
#
|
|
8
|
+
# --add-iev-domains: Add domain ConceptReference objects for IEV-style identifiers
|
|
9
|
+
# (e.g. "426-24-74" → area-426, section-426-24)
|
|
10
|
+
|
|
11
|
+
require "glossarist"
|
|
12
|
+
require "fileutils"
|
|
13
|
+
|
|
14
|
+
# Command-line handling. The first two arguments are taken as the source and
# output directories; the --add-iev-domains flag may appear anywhere in ARGV.
source_dir, output_dir = ARGV.first(2)
add_iev_domains = ARGV.include?("--add-iev-domains")

if source_dir.nil? || output_dir.nil?
  abort "Usage: bundle exec ruby scripts/migrate_dataset.rb SOURCE_DIR OUTPUT_DIR [--add-iev-domains]"
end

# Work with absolute paths from here on.
source_dir, output_dir = [source_dir, output_dir].map { |path| File.expand_path(path) }

abort "Error: #{source_dir} is not a directory" unless File.directory?(source_dir)
|
|
28
|
+
|
|
29
|
+
# Creates hierarchy concepts for the "area-*" and "section-*" ids that the
# concepts in +collection+ reference as domains.
#
# Every concept's domain references are scanned first. For each distinct area
# or section id that is not already the id of a concept in the collection, a
# new managed concept with an English localization is built and stored:
# * an area concept gets "narrower" relations to the referenced sections whose
#   id starts with "section-<area code>-";
# * a section concept gets a "broader" relation to "area-<first code part>".
#
# @param collection the concept collection; it is read and then extended in place
# @return the hash of collected section ids (callers in this script ignore it)
def add_subject_area_concepts(collection)
  area_ids = {}
  section_ids = {}

  # Pass 1: collect the distinct area/section ids used as domain references.
  collection.each do |concept|
    domain_refs = concept.data.domains
    next unless domain_refs

    domain_refs.each do |ref|
      next unless ref.is_a?(Glossarist::ConceptReference) && ref.concept_id

      ref_id = ref.concept_id
      bucket =
        if ref_id.start_with?("area-")
          area_ids
        elsif ref_id.start_with?("section-")
          section_ids
        end
      bucket[ref_id] = true if bucket
    end
  end

  # Ids present before any hierarchy concept is added.
  known_ids = collection.map { |c| c.data.id }.to_set

  # Pass 2: one concept per missing area.
  area_ids.each_key do |area_id|
    next if known_ids.include?(area_id)

    area_code = area_id.sub("area-", "")
    child_sections = section_ids.keys.select { |sid| sid.start_with?("section-#{area_code}-") }

    area_data = Glossarist::ManagedConceptData.new(
      id: area_id,
      domains: [Glossarist::ConceptReference.domain(area_id)],
    )
    area_concept = Glossarist::ManagedConcept.new(data: area_data)
    area_concept.related = child_sections.map do |sid|
      Glossarist::RelatedConcept.new(type: "narrower", content: sid)
    end
    area_concept.add_l10n(build_domain_localization(area_id, area_code, "eng"))

    collection.store(area_concept)
  end

  # Pass 3: one concept per missing section, linked to its parent area.
  section_ids.each_key do |section_id|
    next if known_ids.include?(section_id)

    code_parts = section_id.sub("section-", "").split("-")
    parent_area_id = "area-#{code_parts[0]}"

    section_data = Glossarist::ManagedConceptData.new(
      id: section_id,
      domains: [
        Glossarist::ConceptReference.domain(parent_area_id),
        Glossarist::ConceptReference.domain(section_id),
      ],
    )
    section_concept = Glossarist::ManagedConcept.new(data: section_data)
    section_concept.related = [
      Glossarist::RelatedConcept.new(type: "broader", content: parent_area_id),
    ]

    section_label =
      if code_parts.length > 1
        code_parts[0..1].join("-")
      else
        code_parts[0]
      end

    localized = build_domain_localization(section_id, section_label, "eng")
    localized.data.domain = parent_area_id
    section_concept.add_l10n(localized)

    collection.store(section_concept)
  end
end
|
|
96
|
+
|
|
97
|
+
# Builds a minimal localized concept for a synthesized area/section concept.
#
# The localization carries a single preferred expression designation whose
# text is +id+, and its entry status is set to "valid".
#
# NOTE(review): +label+ is accepted but never used; the designation text is
# the id. Confirm whether the label was meant to be the designation.
#
# @param id [String] concept id; also used as the designation text
# @param label [String] currently unused
# @param lang_code [String] language code of the localization, e.g. "eng"
# @return [Glossarist::LocalizedConcept] the new localization
def build_domain_localization(id, label, lang_code)
  preferred_term = Glossarist::Designation::Expression.new(
    type: "expression",
    designation: id,
    normative_status: "preferred",
  )

  Glossarist::LocalizedConcept.new.tap do |localized|
    localized.data = Glossarist::ConceptData.new(
      id: id,
      language_code: lang_code,
      terms: [preferred_term],
    )
    localized.entry_status = "valid"
  end
end
|
|
115
|
+
|
|
116
|
+
# Detect format: managed (concept/ + localized_concept/) vs grouped (*.yaml).
# A "concept" subdirectory in the source marks the managed layout.
is_managed_format = File.directory?(File.join(source_dir, "concept"))
format_name = is_managed_format ? "managed" : "grouped"

puts "Loading concepts from #{source_dir} (#{format_name} format)..."

collection = Glossarist::ManagedConceptCollection.new
collection.load_from_files(source_dir)

puts "Loaded #{collection.count} concepts"

# Add IEV domain references if requested
if add_iev_domains
  puts "Adding IEV domain references..."

  collection.each do |concept|
    # Concepts that already carry domains are left untouched.
    current_domains = concept.data.domains
    next unless !current_domains || current_domains.empty?

    concept_id = concept.data.id.to_s
    next if concept_id.empty? || concept_id.start_with?("area-", "section-")

    # "426-24-74" -> area "426", section "24"; ids with fewer than two
    # dash-separated parts are skipped.
    area_code, section_code = concept_id.split("-")
    next unless section_code

    concept.data.domains = [
      Glossarist::ConceptReference.domain("area-#{area_code}"),
      Glossarist::ConceptReference.domain("section-#{area_code}-#{section_code}"),
    ]
  end

  puts "Adding subject area hierarchy concepts..."
  add_subject_area_concepts(collection)

  puts "Domains added. Total concepts: #{collection.count}"
end

# Save output: managed datasets go to OUTPUT_DIR/concepts, grouped datasets
# directly into OUTPUT_DIR.
puts "Saving to #{output_dir}..."

concepts_out = is_managed_format ? File.join(output_dir, "concepts") : File.join(output_dir)
FileUtils.mkdir_p(concepts_out)

if is_managed_format
  collection.save_to_files(concepts_out)
else
  collection.save_grouped_concepts_to_files(concepts_out)
end
|
|
166
|
+
|
|
167
|
+
# Copy register.yaml if present.
#
# The register is read from the directory that contains SOURCE_DIR and is
# written to the directory that contains OUTPUT_DIR, for managed and grouped
# datasets alike. An existing destination file is never overwritten, which
# also covers the case where source and destination are the same file.
register_src = File.join(File.dirname(source_dir), "register.yaml")
register_dst = File.join(File.dirname(output_dir), "register.yaml")

if File.exist?(register_src) && !File.exist?(register_dst)
  FileUtils.mkdir_p(File.dirname(register_dst))
  FileUtils.cp(register_src, register_dst)
end

puts "Done. #{collection.count} concepts migrated."
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Migration script for isotc204-glossary: adds v3 fields to existing concepts.
|
|
5
|
+
#
|
|
6
|
+
# Adds:
|
|
7
|
+
# - status: "valid" at managed concept level
|
|
8
|
+
# - date_accepted at managed concept level
|
|
9
|
+
# - domains (section ConceptReference) at managed concept data level
|
|
10
|
+
# - related (broader to section) at managed concept level
|
|
11
|
+
# - Creates section hierarchy concepts with narrower relations
|
|
12
|
+
#
|
|
13
|
+
# Usage:
|
|
14
|
+
# cd /Users/mulgogi/src/glossarist/glossarist-ruby
|
|
15
|
+
# bundle exec ruby scripts/migrate_isotc204_to_v3.rb
|
|
16
|
+
#
|
|
17
|
+
# Safe to run multiple times (idempotent).
|
|
18
|
+
|
|
19
|
+
require "glossarist"
|
|
20
|
+
|
|
21
|
+
# Directory holding the isotc204-glossary concept files. Defaults to the
# original author's checkout; pass another directory as the first
# command-line argument to run the migration against a different location.
DIR = ARGV.fetch(0, "/Users/mulgogi/src/geolexica/isotc204-glossary/concepts")

# Section code => section title. One section hierarchy concept is created
# per entry further down in this script.
ISO_TS_14812_SECTIONS = {
  "3.1" => "General concepts",
  "3.2" => "Transport information and control",
  "3.3" => "ITS station",
  "3.4" => "Communications",
  "3.5" => "ITS services",
  "3.6" => "Geospatial",
  "3.7" => "Driving automation",
}.freeze

# Value used for date_accepted on concepts that do not have one yet.
ISO_TS_14812_DATE = "2022-01-01T00:00:00+00:00"

# Source URN attached to every domain reference this script creates.
ISO_TS_14812_SOURCE = "urn:iso:std:iso:ts:14812"
|
|
35
|
+
|
|
36
|
+
collection = Glossarist::ManagedConceptCollection.new
collection.load_from_files(DIR)

puts "Loaded #{collection.count} concepts"

# Track which concepts belong to which section for narrower relations
section_children = Hash.new { |h, k| h[k] = [] }
updated_count = 0

collection.each do |concept|
  identifier = concept.data.id.to_s

  # Skip concepts without an id, and the section hierarchy concepts that an
  # earlier run of this script saved into DIR. Without this guard a re-run
  # would give "section-3-1" the parent "section-section-3-1", so the script
  # would not be idempotent as its header states.
  next if identifier.empty? || identifier.start_with?("section-")

  # "3.1.2" -> "section-3-1": the first two dot-separated parts name the section.
  section_uri = "section-#{identifier.split('.').first(2).join('-')}"
  section_children[section_uri] << identifier

  # Set status
  concept.status = "valid" unless concept.status

  # Set date_accepted
  unless concept.date_accepted
    concept.date_accepted = Glossarist::ConceptDate.new(
      type: "accepted",
      date: ISO_TS_14812_DATE,
    )
  end

  # Add domain ConceptReference (only once per section)
  concept.data.domains ||= []
  unless concept.data.domains.any? { |d| d.concept_id == section_uri }
    concept.data.domains << Glossarist::ConceptReference.new(
      concept_id: section_uri,
      source: ISO_TS_14812_SOURCE,
      ref_type: "domain",
    )
  end

  # Add broader relation to section (only once per section)
  concept.related ||= []
  unless concept.related.any? { |r| r.type == "broader" && r.ref&.id == section_uri }
    concept.related << Glossarist::RelatedConcept.new(
      type: "broader",
      content: section_uri,
      ref: Glossarist::Citation.new(source: "ISO/TS 14812", id: section_uri),
    )
  end

  updated_count += 1
end

puts "Updated #{updated_count} concepts with status, date_accepted, domains, related"
|
|
83
|
+
|
|
84
|
+
# Create section hierarchy concepts: one managed concept per known section,
# titled with the section title and linked to its member concepts.
ISO_TS_14812_SECTIONS.each do |code, title|
  section_uri = "section-#{code.tr('.', '-')}"

  self_reference = Glossarist::ConceptReference.new(
    concept_id: section_uri,
    source: ISO_TS_14812_SOURCE,
    ref_type: "domain",
  )
  section_concept = Glossarist::ManagedConcept.new(
    data: Glossarist::ManagedConceptData.new(id: section_uri, domains: [self_reference]),
  )
  section_concept.status = "valid"
  section_concept.date_accepted = Glossarist::ConceptDate.new(
    type: "accepted",
    date: ISO_TS_14812_DATE,
  )

  # English localization whose preferred term is the section title.
  title_term = Glossarist::Designation::Expression.new(
    type: "expression",
    designation: title,
    normative_status: "preferred",
  )
  localized = Glossarist::LocalizedConcept.new
  localized.data = Glossarist::ConceptData.new(
    id: section_uri,
    language_code: "eng",
    terms: [title_term],
  )
  localized.entry_status = "valid"
  section_concept.add_l10n(localized)

  # Add narrower relations to child concepts (left unset when there are none)
  children = section_children[section_uri]
  unless children.empty?
    section_concept.related = children.sort.map do |child_id|
      Glossarist::RelatedConcept.new(
        type: "narrower",
        content: child_id,
        ref: Glossarist::Citation.new(source: "ISO/TS 14812", id: child_id),
      )
    end
  end

  collection.store(section_concept)
  puts "Created section concept: #{section_uri} (#{title}) — #{children.size} narrower"
end

collection.save_grouped_concepts_to_files(DIR)
puts "Saved #{collection.count} concepts to #{DIR}"
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Migration script for isotc211-glossary: adds v3 fields to existing concepts.
|
|
5
|
+
#
|
|
6
|
+
# Adds:
|
|
7
|
+
# - domains (ISO standard ConceptReference) at managed concept data level
|
|
8
|
+
# - related (broader to standard domain) at managed concept level
|
|
9
|
+
# - Creates domain concepts per ISO standard (with narrower relations)
|
|
10
|
+
#
|
|
11
|
+
# Idempotent: safe to run multiple times.
|
|
12
|
+
#
|
|
13
|
+
# Usage:
|
|
14
|
+
# cd /Users/mulgogi/src/glossarist/glossarist-ruby
|
|
15
|
+
# bundle exec ruby scripts/migrate_isotc211_to_v3.rb
|
|
16
|
+
|
|
17
|
+
require "glossarist"
|
|
18
|
+
|
|
19
|
+
# Directory holding the isotc211-glossary concept files. Defaults to the
# original author's checkout; pass another directory as the first
# command-line argument to run the migration against a different location.
DIR = ARGV.fetch(0, "/Users/mulgogi/src/geolexica/isotc211-glossary/concepts")

# Source URN attached to every domain reference this script creates.
ISO_SOURCE_URN = "urn:iso:std:iso"
|
|
22
|
+
|
|
23
|
+
# Extract a stable domain ID from an authoritative source reference string.
#
# The patterns are tried in the order listed, with the bare "ISO <number>"
# form last; the first pattern that matches anywhere in the text wins. The
# ID is the matched prefix plus the document number, lower-cased, with runs
# of whitespace and slashes turned into "-". Anything after the number, such
# as the ":2020" year, is not part of the ID.
#
# @param ref_text [String, nil] e.g. "ISO 19136-1:2020", "ISO/IEC 19501:2005"
# @return [String, nil] domain ID e.g. "iso-19136-1", "iso-iec-19501";
#   nil when ref_text is nil or no pattern matches
def extract_domain_id(ref_text)
  return nil unless ref_text

  # Match various ISO reference patterns
  patterns = [
    %r{ISO/IEC/IEEE\s+([\d-]+)},    # ISO/IEC/IEEE 24765:2017
    %r{ISO/IEC\s+([\d-]+)},         # ISO/IEC 19501:2005
    %r{ISO/TS\s+([\d-]+)},          # ISO/TS 19130:2010
    %r{ISO/TR\s+([\d-]+)},          # ISO/TR 19120:2001
    %r{ISO/IEC\s+Guide\s+([\d-]+)}, # ISO/IEC Guide 98-3:2008
    %r{ISO\s+DIS\s+([\d-]+)},       # ISO DIS 19123-1:2022
    %r{ISO\s+([\d]+-?[\d]*)},       # ISO 19136-1:2020
  ]

  patterns.each do |pat|
    m = ref_text.match(pat)
    next unless m

    # The prefix is whatever the pattern matched before the captured number.
    prefix = ref_text[m.begin(0)...m.begin(1)].strip
    return "#{prefix} #{m[1]}".downcase.gsub(%r{[\s/]+}, "-")
  end

  nil
end
|
|
50
|
+
|
|
51
|
+
collection = Glossarist::ManagedConceptCollection.new
collection.load_from_files(DIR)

puts "Loaded #{collection.count} concepts"

# Build index: domain_id -> [concept_id]
domain_index = {}
concepts_with_domain = 0
concepts_without_domain = 0

collection.each do |concept|
  # Only concepts with an English localization that has an authoritative
  # source with an origin are considered; the others are skipped uncounted.
  english = concept.localization("eng")
  next unless english

  source_list = english.data&.sources
  next unless source_list

  authoritative = source_list.find { |src| src.type == "authoritative" }
  origin = authoritative&.origin
  next unless origin

  # A missing reference text and an unrecognized one are counted alike.
  ref_text = origin.text || origin.ref
  domain_id = ref_text && extract_domain_id(ref_text)
  unless domain_id
    concepts_without_domain += 1
    next
  end

  (domain_index[domain_id] ||= []).push(concept.data.id)
  concepts_with_domain += 1

  # Add domain ConceptReference unless the concept already has it
  concept.data.domains ||= []
  if concept.data.domains.none? { |d| d.concept_id == domain_id }
    concept.data.domains << Glossarist::ConceptReference.new(
      concept_id: domain_id,
      source: ISO_SOURCE_URN,
      ref_type: "domain",
    )
  end

  # Add broader relation to domain concept unless it is already there
  concept.related ||= []
  if concept.related.none? { |r| r.type == "broader" && r.ref&.id == domain_id }
    concept.related << Glossarist::RelatedConcept.new(
      type: "broader",
      content: domain_id,
      ref: Glossarist::Citation.new(source: "ISO", id: domain_id),
    )
  end
end

puts "Added domains to #{concepts_with_domain} concepts"
puts "No domain extracted for #{concepts_without_domain} concepts"
|
|
109
|
+
|
|
110
|
+
# Create domain hierarchy concepts: one managed concept per extracted domain
# id, processed in sorted order.
domain_index.sort.each do |domain_id, child_ids|
  self_reference = Glossarist::ConceptReference.new(
    concept_id: domain_id,
    source: ISO_SOURCE_URN,
    ref_type: "domain",
  )
  domain_concept = Glossarist::ManagedConcept.new(
    data: Glossarist::ManagedConceptData.new(id: domain_id, domains: [self_reference]),
  )
  domain_concept.status = "valid"

  # Create a basic English localization with the domain ID as the term
  id_term = Glossarist::Designation::Expression.new(
    type: "expression",
    designation: domain_id,
    normative_status: "preferred",
  )
  localized = Glossarist::LocalizedConcept.new
  localized.data = Glossarist::ConceptData.new(
    id: domain_id,
    language_code: "eng",
    terms: [id_term],
  )
  localized.entry_status = "valid"
  domain_concept.add_l10n(localized)

  # Add narrower relations to child concepts
  domain_concept.related = child_ids.sort.map do |child_id|
    child_key = child_id.to_s
    Glossarist::RelatedConcept.new(
      type: "narrower",
      content: child_key,
      ref: Glossarist::Citation.new(source: "ISO", id: child_key),
    )
  end

  collection.store(domain_concept)
  puts "Created domain: #{domain_id} — #{child_ids.size} narrower"
end

collection.save_grouped_concepts_to_files(DIR)
puts "Saved #{collection.count} concepts to #{DIR}"
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Migration script for osgeo-glossary: adds v3 fields to existing concepts.
|
|
5
|
+
#
|
|
6
|
+
# Adds:
|
|
7
|
+
# - status: "valid" at managed concept level
|
|
8
|
+
# - date_accepted at managed concept level (set to 2011-01-01 as earliest
|
|
9
|
+
# known publication date for the OSGeo Lexicon)
|
|
10
|
+
# - domains for concepts with identifiable ISO standard sources
|
|
11
|
+
# - related (broader to domain) for those concepts
|
|
12
|
+
# - Creates domain concepts for identified ISO standards
|
|
13
|
+
#
|
|
14
|
+
# Idempotent: safe to run multiple times.
|
|
15
|
+
#
|
|
16
|
+
# Usage:
|
|
17
|
+
# cd /Users/mulgogi/src/glossarist/glossarist-ruby
|
|
18
|
+
# bundle exec ruby scripts/migrate_osgeo_to_v3.rb
|
|
19
|
+
|
|
20
|
+
require "glossarist"
|
|
21
|
+
|
|
22
|
+
# Directory holding the osgeo-glossary concept files. Defaults to the
# original author's checkout; pass another directory as the first
# command-line argument to run the migration against a different location.
DIR = ARGV.fetch(0, "/Users/mulgogi/src/geolexica/osgeo-glossary/concepts")

# Value used for date_accepted on concepts that do not have one yet.
OSGEO_DATE = "2011-01-01T00:00:00+00:00"
|
|
25
|
+
|
|
26
|
+
# Extract a stable domain ID from an authoritative source reference string.
#
# The patterns are tried in order and the first match wins. Whatever prefix
# matched (ISO/IEC/IEEE, ISO/IEC, ISO/TS, ISO/TR or plain ISO), the result is
# always "iso-" followed by the captured document number.
#
# @param ref_text [String, nil] reference text, e.g. "ISO/TS 19130:2010"
# @return [String, nil] e.g. "iso-19130"; nil for nil input or no match
def extract_domain_id(ref_text)
  return nil unless ref_text

  matchers = [
    %r{ISO/IEC/IEEE\s+([\d-]+)},
    %r{ISO/IEC\s+([\d-]+)},
    %r{ISO/TS\s+([\d-]+)},
    %r{ISO/TR\s+([\d-]+)},
    %r{ISO\s+([\d]+-?[\d]*)},
  ]

  # String#[] with a regexp and group index yields the capture, or nil when
  # the pattern does not match; `find` stops at the first pattern that hits.
  number = nil
  matchers.find { |rx| number = ref_text[rx, 1] }

  number && "iso-#{number}"
end
|
|
46
|
+
|
|
47
|
+
collection = Glossarist::ManagedConceptCollection.new
collection.load_from_files(DIR)

puts "Loaded #{collection.count} concepts"

# domain_id -> ids of the concepts whose authoritative source maps to it
domain_index = {}
concepts_with_domain = 0

collection.each do |concept|
  # Fill in status and date_accepted only where they are missing.
  concept.status ||= "valid"
  concept.date_accepted ||= Glossarist::ConceptDate.new(
    type: "accepted",
    date: OSGEO_DATE,
  )

  # Extract domain from the English localization's authoritative source;
  # concepts without one get no domain.
  english = concept.localization("eng")
  next unless english

  source_list = english.data&.sources
  next unless source_list

  authoritative = source_list.find { |src| src.type == "authoritative" }
  origin = authoritative&.origin
  next unless origin

  ref_text = origin.text || origin.ref
  domain_id = ref_text && extract_domain_id(ref_text)
  next unless domain_id

  (domain_index[domain_id] ||= []).push(concept.data.id)
  concepts_with_domain += 1

  # Add domain ConceptReference unless the concept already has it
  concept.data.domains ||= []
  if concept.data.domains.none? { |d| d.concept_id == domain_id }
    concept.data.domains << Glossarist::ConceptReference.new(
      concept_id: domain_id,
      source: "urn:iso:std:iso",
      ref_type: "domain",
    )
  end

  # Add broader relation to domain concept unless it is already there
  concept.related ||= []
  if concept.related.none? { |r| r.type == "broader" && r.ref&.id == domain_id }
    concept.related << Glossarist::RelatedConcept.new(
      type: "broader",
      content: domain_id,
      ref: Glossarist::Citation.new(source: "ISO", id: domain_id),
    )
  end
end

puts "Added status and date_accepted to #{collection.count} concepts"
puts "Added domains to #{concepts_with_domain} concepts with ISO sources"
|
|
109
|
+
|
|
110
|
+
# Create domain hierarchy concepts: one managed concept per extracted domain
# id, processed in sorted order.
domain_index.sort.each do |domain_id, child_ids|
  self_reference = Glossarist::ConceptReference.new(
    concept_id: domain_id,
    source: "urn:iso:std:iso",
    ref_type: "domain",
  )
  domain_concept = Glossarist::ManagedConcept.new(
    data: Glossarist::ManagedConceptData.new(id: domain_id, domains: [self_reference]),
  )
  domain_concept.status = "valid"
  domain_concept.date_accepted = Glossarist::ConceptDate.new(
    type: "accepted",
    date: OSGEO_DATE,
  )

  # English localization whose preferred term is the domain id itself.
  id_term = Glossarist::Designation::Expression.new(
    type: "expression",
    designation: domain_id,
    normative_status: "preferred",
  )
  localized = Glossarist::LocalizedConcept.new
  localized.data = Glossarist::ConceptData.new(
    id: domain_id,
    language_code: "eng",
    terms: [id_term],
  )
  localized.entry_status = "valid"
  domain_concept.add_l10n(localized)

  # Narrower relations to the member concepts, cited with source "OSGeo".
  domain_concept.related = child_ids.sort.map do |child_id|
    child_key = child_id.to_s
    Glossarist::RelatedConcept.new(
      type: "narrower",
      content: child_key,
      ref: Glossarist::Citation.new(source: "OSGeo", id: child_key),
    )
  end

  collection.store(domain_concept)
  puts "Created domain: #{domain_id} — #{child_ids.size} narrower"
end

collection.save_grouped_concepts_to_files(DIR)
puts "Saved #{collection.count} concepts to #{DIR}"
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
# Upgrades a dataset directory to v3 format.
|
|
5
|
+
#
|
|
6
|
+
# Usage: bundle exec ruby scripts/upgrade_dataset_to_v3.rb <concepts_dir> [--grouped|--separate]
|
|
7
|
+
#
|
|
8
|
+
# Formats:
|
|
9
|
+
# --grouped (default): each concept + localizations in a single YAML file
|
|
10
|
+
# --separate: concept/ and localized_concept/ in separate files
|
|
11
|
+
#
|
|
12
|
+
# For each concept:
|
|
13
|
+
# 1. Loads with ConceptManager (auto-detects v2/v3)
|
|
14
|
+
# 2. Promotes data.related -> top-level related (if needed)
|
|
15
|
+
# 3. Sets schema_version = "3"
|
|
16
|
+
# 4. Re-saves in v3 format
|
|
17
|
+
|
|
18
|
+
require "glossarist"
|
|
19
|
+
|
|
20
|
+
# Arguments: the concepts directory comes first; --separate selects the
# separate-file layout, anything else leaves the grouped default in place.
dir = ARGV.first
mode = :grouped
mode = :separate if ARGV.include?("--separate")

if dir.nil? || !Dir.exist?(dir)
  abort "Usage: #{$PROGRAM_NAME} <concepts_dir> [--grouped|--separate]"
end

manager = Glossarist::ConceptManager.new(path: dir)
collection = Glossarist::ManagedConceptCollection.new

puts "Loading concepts from #{dir}..."
manager.load_from_files(collection: collection)

# Migrate every loaded concept, counting them for the progress message.
count = 0
collection.each do |concept|
  Glossarist::SchemaMigration.migrate_concept(concept)
  count += 1
end

puts "Migrated #{count} concepts. Saving..."

# Write back through the same manager, in the requested layout.
case mode
when :separate
  manager.save_to_files(collection)
else
  manager.save_grouped_concepts_to_files(collection)
end

puts "Done. All concepts now in v3 format."
|