glossarist 2.8.7 → 2.8.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/.rubocop_todo.yml +128 -11
  4. data/CLAUDE.md +34 -3
  5. data/Gemfile +1 -0
  6. data/lib/glossarist/cli/compare_command.rb +2 -2
  7. data/lib/glossarist/cli/export_command.rb +1 -3
  8. data/lib/glossarist/collection.rb +1 -1
  9. data/lib/glossarist/collections/bibliography_collection.rb +1 -1
  10. data/lib/glossarist/concept_data.rb +3 -2
  11. data/lib/glossarist/concept_reference.rb +7 -1
  12. data/lib/glossarist/concept_set.rb +5 -1
  13. data/lib/glossarist/concept_source.rb +2 -2
  14. data/lib/glossarist/concept_validator.rb +3 -1
  15. data/lib/glossarist/dataset_validator.rb +1 -1
  16. data/lib/glossarist/{error.rb → errors/base.rb} +3 -1
  17. data/lib/glossarist/errors/cache_version_mismatch_error.rb +12 -0
  18. data/lib/glossarist/errors/invalid_language_code_error.rb +19 -0
  19. data/lib/glossarist/errors/invalid_type_error.rb +8 -0
  20. data/lib/glossarist/errors/load_error.rb +22 -0
  21. data/lib/glossarist/errors/parse_error.rb +24 -0
  22. data/lib/glossarist/errors.rb +14 -0
  23. data/lib/glossarist/gcr_package.rb +4 -2
  24. data/lib/glossarist/glossary_store.rb +175 -1
  25. data/lib/glossarist/managed_concept.rb +31 -17
  26. data/lib/glossarist/managed_concept_collection.rb +52 -8
  27. data/lib/glossarist/reference_extractor.rb +22 -2
  28. data/lib/glossarist/reference_resolver.rb +38 -3
  29. data/lib/glossarist/resolution_adapter/bibliography.rb +22 -0
  30. data/lib/glossarist/resolution_adapter.rb +1 -0
  31. data/lib/glossarist/schema_migration/v0_to_v1.rb +200 -0
  32. data/lib/glossarist/schema_migration/v2_to_v3.rb +50 -0
  33. data/lib/glossarist/schema_migration.rb +10 -224
  34. data/lib/glossarist/sts/importer.rb +11 -12
  35. data/lib/glossarist/sts/term_extractor.rb +105 -6
  36. data/lib/glossarist/transforms/concept_to_gloss_transform.rb +1 -1
  37. data/lib/glossarist/v2/managed_concept.rb +2 -4
  38. data/lib/glossarist/v3/managed_concept.rb +2 -4
  39. data/lib/glossarist/validation/asset_index.rb +1 -1
  40. data/lib/glossarist/validation/rules/asciidoc_xref_rule.rb +11 -21
  41. data/lib/glossarist/validation/rules/cite_ref_integrity_rule.rb +74 -0
  42. data/lib/glossarist/validation/rules/concept_context.rb +24 -0
  43. data/lib/glossarist/validation/rules/concept_mention_rule.rb +1 -3
  44. data/lib/glossarist/validation/rules/image_reference_rule.rb +10 -21
  45. data/lib/glossarist/version.rb +1 -1
  46. data/lib/glossarist.rb +5 -13
  47. data/scripts/upgrade_dataset_to_v3.rb +1 -1
  48. metadata +13 -9
  49. data/lib/glossarist/concept_collector.rb +0 -231
  50. data/lib/glossarist/concept_manager.rb +0 -183
  51. data/lib/glossarist/error/cache_version_mismatch_error.rb +0 -8
  52. data/lib/glossarist/error/invalid_language_code_error.rb +0 -15
  53. data/lib/glossarist/error/invalid_type_error.rb +0 -4
  54. data/lib/glossarist/error/parse_error.rb +0 -16
@@ -0,0 +1,200 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Glossarist
4
+ class SchemaMigration
5
+ class V0ToV1
6
+ ENTRY_STATUS_MAP = {
7
+ "Standard" => "valid",
8
+ "Confirmed" => "valid",
9
+ "Proposed" => "draft",
10
+ }.freeze
11
+
12
+ LANG_CODES = Glossarist::LANG_CODES
13
+
14
+ IEV_PATTERN = /\{\{([^,}]+),\s*IEV:([^}]+)\}\}/
15
+ URN_PATTERN = /\{urn:iso:std:iso:(\d+):([^,}]+),([^}]+)\}/
16
+
17
+ attr_reader :from_version, :to_version
18
+
19
+ def initialize(concept_hash, from_version: "0",
20
+ to_version: SchemaMigration::CURRENT_SCHEMA_VERSION,
21
+ ref_maps: {})
22
+ @concept = concept_hash
23
+ @from_version = from_version
24
+ @to_version = to_version
25
+ @ref_maps = ref_maps
26
+ end
27
+
28
+ def migrate
29
+ case [from_version, to_version]
30
+ when ["0", "1"] then migrate_v0_to_v1
31
+ else
32
+ raise Errors::Base,
33
+ "Unsupported migration: #{from_version} -> #{to_version}"
34
+ end
35
+ @concept
36
+ end
37
+
38
+ private
39
+
40
+ def migrate_v0_to_v1
41
+ migrate_termid
42
+ LANG_CODES.each do |lang|
43
+ migrate_language_block(lang) if @concept[lang]
44
+ end
45
+ strip_revisions
46
+ end
47
+
48
+ def migrate_termid
49
+ if @concept.key?("termid")
50
+ @concept["termid"] =
51
+ String(@concept["termid"])
52
+ end
53
+ end
54
+
55
+ def migrate_language_block(lang)
56
+ l10n_block = @concept[lang]
57
+ return unless l10n_block.is_a?(Hash)
58
+
59
+ migrate_definition(l10n_block)
60
+ migrate_authoritative_source(l10n_block)
61
+ migrate_dates(l10n_block)
62
+ migrate_entry_status(l10n_block)
63
+ migrate_terms_abbrev(l10n_block)
64
+ extract_inline_refs(l10n_block)
65
+ strip_revisions(l10n_block)
66
+ end
67
+
68
+ def migrate_definition(l10n_block)
69
+ return unless l10n_block.key?("definition")
70
+ return unless l10n_block["definition"].is_a?(String)
71
+
72
+ l10n_block["definition"] = [{ "content" => l10n_block["definition"] }]
73
+ end
74
+
75
+ def migrate_authoritative_source(l10n_block)
76
+ return unless l10n_block.key?("authoritative_source")
77
+
78
+ src = l10n_block.delete("authoritative_source")
79
+ return if l10n_block.key?("sources")
80
+
81
+ sources = (src.is_a?(Array) ? src : [src]).filter_map do |s|
82
+ next unless s.is_a?(Hash)
83
+
84
+ origin = {}
85
+ origin["ref"] = s["ref"] if s["ref"]
86
+ origin["clause"] = s["clause"] if s["clause"]
87
+ origin["link"] = s["link"] if s["link"]
88
+
89
+ entry = { "type" => "authoritative", "origin" => origin }
90
+ if s["relationship"]
91
+ entry["status"] = s["relationship"]["type"] || "identical"
92
+ if s["relationship"]["modification"]
93
+ entry["modification"] =
94
+ s["relationship"]["modification"]
95
+ end
96
+ end
97
+ entry
98
+ end
99
+
100
+ l10n_block["sources"] = sources if sources.any?
101
+ end
102
+
103
+ def migrate_dates(l10n_block)
104
+ return if l10n_block.key?("dates")
105
+
106
+ dates = []
107
+ if l10n_block["date_accepted"]
108
+ dates << { "type" => "accepted",
109
+ "date" => l10n_block["date_accepted"] }
110
+ end
111
+ if l10n_block["date_amended"]
112
+ dates << { "type" => "amended", "date" => l10n_block["date_amended"] }
113
+ end
114
+ l10n_block["dates"] = dates if dates.any?
115
+ end
116
+
117
+ def migrate_entry_status(l10n_block)
118
+ return unless l10n_block.key?("entry_status")
119
+
120
+ mapped = ENTRY_STATUS_MAP[l10n_block["entry_status"]]
121
+ l10n_block["entry_status"] = mapped if mapped
122
+ end
123
+
124
+ def migrate_terms_abbrev(l10n_block)
125
+ return unless l10n_block["terms"].is_a?(Array)
126
+
127
+ l10n_block["terms"].each do |term|
128
+ next unless term.is_a?(Hash)
129
+ next unless term["abbrev"] == true
130
+
131
+ term["type"] = "abbreviation"
132
+ term.delete("abbrev")
133
+ end
134
+ end
135
+
136
+ def extract_inline_refs(l10n_block)
137
+ texts = []
138
+
139
+ if l10n_block["definition"].is_a?(Array)
140
+ l10n_block["definition"].each do |d|
141
+ texts << (d.is_a?(Hash) ? d["content"].to_s : d.to_s)
142
+ end
143
+ elsif l10n_block["definition"].is_a?(String)
144
+ texts << l10n_block["definition"]
145
+ end
146
+
147
+ Array(l10n_block["notes"]).each do |n|
148
+ texts << (n.is_a?(Hash) ? n["content"].to_s : n.to_s)
149
+ end
150
+ Array(l10n_block["examples"]).each do |e|
151
+ texts << (e.is_a?(Hash) ? e["content"].to_s : e.to_s)
152
+ end
153
+
154
+ full_text = texts.join(" ")
155
+
156
+ refs = []
157
+
158
+ full_text.scan(IEV_PATTERN) do |term, id|
159
+ refs << {
160
+ "term" => term.strip,
161
+ "concept_id" => id.strip,
162
+ "source" => "urn:iec:std:iec:60050",
163
+ "ref_type" => "urn",
164
+ }
165
+ end
166
+
167
+ full_text.scan(URN_PATTERN) do |std_num, id, term|
168
+ refs << {
169
+ "term" => term.strip,
170
+ "concept_id" => id.strip,
171
+ "source" => "urn:iso:std:iso:#{std_num}",
172
+ "ref_type" => "urn",
173
+ }
174
+ end
175
+
176
+ return if refs.empty?
177
+
178
+ existing = l10n_block["references"] || []
179
+ seen_ids = existing.to_set { |r| r["concept_id"] || r["id"] }
180
+ refs.each do |ref|
181
+ key = ref["concept_id"] || ref["id"]
182
+ next if seen_ids.include?(key)
183
+
184
+ seen_ids.add(key)
185
+ existing << ref
186
+ end
187
+ l10n_block["references"] = existing
188
+ end
189
+
190
+ def strip_revisions(hash = @concept)
191
+ hash.delete("_revisions")
192
+ LANG_CODES.each do |lang|
193
+ next unless hash[lang].is_a?(Hash)
194
+
195
+ hash[lang].delete("_revisions")
196
+ end
197
+ end
198
+ end
199
+ end
200
+ end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Glossarist
4
+ class SchemaMigration
5
+ module V2ToV3
6
+ def self.migrate_concept(concept, target_version: Glossarist::SCHEMA_VERSION)
7
+ current = concept_version(concept)
8
+ target = target_version.to_s
9
+
10
+ return concept if current == target
11
+
12
+ max_steps = 5
13
+ max_steps.times do
14
+ break if current == target
15
+
16
+ case current
17
+ when "2" then current = step_v2_to_v3(concept)
18
+ else
19
+ raise Errors::Base,
20
+ "No concept migration step from version #{current}"
21
+ end
22
+ end
23
+
24
+ unless current == target
25
+ raise Errors::Base,
26
+ "Migration chain too long or unresolvable"
27
+ end
28
+
29
+ concept.schema_version = target
30
+ concept
31
+ end
32
+
33
+ def self.concept_version(concept)
34
+ version = concept.schema_version
35
+ return version.to_s if version && !version.to_s.empty?
36
+
37
+ ManagedConcept.detect_schema_version(concept)
38
+ end
39
+
40
+ def self.step_v2_to_v3(concept)
41
+ if concept.data&.related&.any?
42
+ concept.related ||= []
43
+ concept.related = (concept.related + concept.data.related).uniq
44
+ concept.data.related = []
45
+ end
46
+ "3"
47
+ end
48
+ end
49
+ end
50
+ end
@@ -6,76 +6,19 @@ module Glossarist
6
6
  class SchemaMigration
7
7
  CURRENT_SCHEMA_VERSION = "1"
8
8
 
9
- def self.migrate_concept(concept, target_version: Glossarist::SCHEMA_VERSION)
10
- current = concept_version(concept)
11
- target = target_version.to_s
12
-
13
- return concept if current == target
14
-
15
- max_steps = 5
16
- max_steps.times do
17
- break if current == target
18
-
19
- case current
20
- when "2" then current = step_v2_to_v3(concept)
21
- else
22
- raise Error, "No concept migration step from version #{current}"
23
- end
24
- end
25
-
26
- unless current == target
27
- raise Error,
28
- "Migration chain too long or unresolvable"
29
- end
30
-
31
- concept.schema_version = target
32
- concept
33
- end
34
-
35
- def self.concept_version(concept)
36
- version = concept.schema_version
37
- return version.to_s if version && !version.to_s.empty?
38
-
39
- ManagedConcept.detect_schema_version(concept)
40
- end
9
+ autoload :V0ToV1, "glossarist/schema_migration/v0_to_v1"
10
+ autoload :V2ToV3, "glossarist/schema_migration/v2_to_v3"
41
11
 
42
- def self.step_v2_to_v3(concept)
43
- if concept.data&.related&.any?
44
- concept.related ||= []
45
- concept.related = (concept.related + concept.data.related).uniq
46
- concept.data.related = []
47
- end
48
- "3"
12
+ def self.new(...)
13
+ V0ToV1.new(...)
49
14
  end
50
15
 
51
- ENTRY_STATUS_MAP = {
52
- "Standard" => "valid",
53
- "Confirmed" => "valid",
54
- "Proposed" => "draft",
55
- }.freeze
56
-
57
- LANG_CODES = Glossarist::LANG_CODES
58
-
59
- IEV_PATTERN = /\{\{([^,}]+),\s*IEV:([^}]+)\}\}/
60
- URN_PATTERN = /\{urn:iso:std:iso:(\d+):([^,}]+),([^}]+)\}/
61
-
62
- attr_reader :from_version, :to_version
63
-
64
- def initialize(concept_hash, from_version: "0",
65
- to_version: CURRENT_SCHEMA_VERSION, ref_maps: {})
66
- @concept = concept_hash
67
- @from_version = from_version
68
- @to_version = to_version
69
- @ref_maps = ref_maps
16
+ def self.migrate_concept(concept, target_version: Glossarist::SCHEMA_VERSION)
17
+ V2ToV3.migrate_concept(concept, target_version: target_version)
70
18
  end
71
19
 
72
- def migrate
73
- case [from_version, to_version]
74
- when ["0", "1"] then migrate_v0_to_v1
75
- else
76
- raise Error, "Unsupported migration: #{from_version} -> #{to_version}"
77
- end
78
- @concept
20
+ def self.concept_version(concept)
21
+ V2ToV3.concept_version(concept)
79
22
  end
80
23
 
81
24
  def self.upgrade_directory(source_dir, output:, # rubocop:disable Metrics/MethodLength, Metrics/ParameterLists
@@ -112,163 +55,6 @@ module Glossarist
112
55
  }
113
56
  end
114
57
 
115
- private
116
-
117
- def migrate_v0_to_v1
118
- migrate_termid
119
- LANG_CODES.each do |lang|
120
- migrate_language_block(lang) if @concept[lang]
121
- end
122
- strip_revisions
123
- end
124
-
125
- def migrate_termid
126
- @concept["termid"] = String(@concept["termid"]) if @concept.key?("termid")
127
- end
128
-
129
- def migrate_language_block(lang)
130
- lc = @concept[lang]
131
- return unless lc.is_a?(Hash)
132
-
133
- migrate_definition(lc)
134
- migrate_authoritative_source(lc)
135
- migrate_dates(lc)
136
- migrate_entry_status(lc)
137
- migrate_terms_abbrev(lc)
138
- extract_inline_refs(lc)
139
- strip_revisions(lc)
140
- end
141
-
142
- def migrate_definition(lc)
143
- return unless lc.key?("definition")
144
- return unless lc["definition"].is_a?(String)
145
-
146
- lc["definition"] = [{ "content" => lc["definition"] }]
147
- end
148
-
149
- def migrate_authoritative_source(lc)
150
- return unless lc.key?("authoritative_source")
151
-
152
- src = lc.delete("authoritative_source")
153
- return if lc.key?("sources")
154
-
155
- sources = (src.is_a?(Array) ? src : [src]).filter_map do |s|
156
- next unless s.is_a?(Hash)
157
-
158
- origin = {}
159
- origin["ref"] = s["ref"] if s["ref"]
160
- origin["clause"] = s["clause"] if s["clause"]
161
- origin["link"] = s["link"] if s["link"]
162
-
163
- entry = { "type" => "authoritative", "origin" => origin }
164
- if s["relationship"]
165
- entry["status"] = s["relationship"]["type"] || "identical"
166
- if s["relationship"]["modification"]
167
- entry["modification"] =
168
- s["relationship"]["modification"]
169
- end
170
- end
171
- entry
172
- end
173
-
174
- lc["sources"] = sources if sources.any?
175
- end
176
-
177
- def migrate_dates(lc)
178
- return if lc.key?("dates")
179
-
180
- dates = []
181
- if lc["date_accepted"]
182
- dates << { "type" => "accepted", "date" => lc["date_accepted"] }
183
- end
184
- if lc["date_amended"]
185
- dates << { "type" => "amended", "date" => lc["date_amended"] }
186
- end
187
- lc["dates"] = dates if dates.any?
188
- end
189
-
190
- def migrate_entry_status(lc)
191
- return unless lc.key?("entry_status")
192
-
193
- mapped = ENTRY_STATUS_MAP[lc["entry_status"]]
194
- lc["entry_status"] = mapped if mapped
195
- end
196
-
197
- def migrate_terms_abbrev(lc)
198
- return unless lc["terms"].is_a?(Array)
199
-
200
- lc["terms"].each do |term|
201
- next unless term.is_a?(Hash)
202
- next unless term["abbrev"] == true
203
-
204
- term["type"] = "abbreviation"
205
- term.delete("abbrev")
206
- end
207
- end
208
-
209
- def extract_inline_refs(lc)
210
- texts = []
211
-
212
- if lc["definition"].is_a?(Array)
213
- lc["definition"].each do |d|
214
- texts << (d.is_a?(Hash) ? d["content"].to_s : d.to_s)
215
- end
216
- elsif lc["definition"].is_a?(String)
217
- texts << lc["definition"]
218
- end
219
-
220
- Array(lc["notes"]).each do |n|
221
- texts << (n.is_a?(Hash) ? n["content"].to_s : n.to_s)
222
- end
223
- Array(lc["examples"]).each do |e|
224
- texts << (e.is_a?(Hash) ? e["content"].to_s : e.to_s)
225
- end
226
-
227
- full_text = texts.join(" ")
228
-
229
- refs = []
230
-
231
- full_text.scan(IEV_PATTERN) do |term, id|
232
- refs << {
233
- "term" => term.strip,
234
- "concept_id" => id.strip,
235
- "source" => "urn:iec:std:iec:60050",
236
- "ref_type" => "urn",
237
- }
238
- end
239
-
240
- full_text.scan(URN_PATTERN) do |std_num, id, term|
241
- refs << {
242
- "term" => term.strip,
243
- "concept_id" => id.strip,
244
- "source" => "urn:iso:std:iso:#{std_num}",
245
- "ref_type" => "urn",
246
- }
247
- end
248
-
249
- return if refs.empty?
250
-
251
- existing = lc["references"] || []
252
- seen_ids = existing.to_set { |r| r["concept_id"] || r["id"] }
253
- refs.each do |ref|
254
- key = ref["concept_id"] || ref["id"]
255
- next if seen_ids.include?(key)
256
-
257
- seen_ids.add(key)
258
- existing << ref
259
- end
260
- lc["references"] = existing
261
- end
262
-
263
- def strip_revisions(hash = @concept)
264
- hash.delete("_revisions")
265
- LANG_CODES.each do |lang|
266
- next unless hash[lang].is_a?(Hash)
267
-
268
- hash[lang].delete("_revisions")
269
- end
270
- end
271
-
272
58
  class << self
273
59
  private
274
60
 
@@ -301,14 +87,14 @@ module Glossarist
301
87
  v1 = V1::Concept.from_file(file)
302
88
  next unless v1
303
89
 
304
- migration = new(
90
+ migration = V0ToV1.new(
305
91
  v1.to_yaml_hash,
306
92
  from_version: source_version,
307
93
  to_version: target_version,
308
94
  ref_maps: ref_maps,
309
95
  )
310
96
  concepts << migration.migrate
311
- rescue StandardError => e
97
+ rescue Errors::Base, Psych::SyntaxError => e
312
98
  errors += 1
313
99
  warn " Error migrating #{File.basename(file)}: #{e.message}" if errors <= 5
314
100
  end
@@ -184,8 +184,9 @@ module Glossarist
184
184
  package = GcrPackage.load(path)
185
185
  package.concepts.each { |mc| collection.store(mc) }
186
186
  else
187
- concepts = ConceptCollector.collect(path)
188
- concepts.each { |mc| collection.store(mc) }
187
+ GlossaryStore.new.tap do |s|
188
+ s.load(path)
189
+ end.each_concept { |mc| collection.store(mc) }
189
190
  end
190
191
  collection
191
192
  end
@@ -214,11 +215,10 @@ module Glossarist
214
215
  end
215
216
 
216
217
  def save_dataset(concepts, dir)
217
- concepts_dir = File.join(dir, "concepts")
218
- FileUtils.mkdir_p(concepts_dir)
219
- collection = ManagedConceptCollection.new
220
- concepts.each { |mc| collection.store(mc) }
221
- collection.save_grouped_concepts_to_files(concepts_dir)
218
+ FileUtils.mkdir_p(dir)
219
+ store = GlossaryStore.new
220
+ concepts.each { |mc| store.add_concept(mc) }
221
+ store.save_directory(dir)
222
222
  end
223
223
 
224
224
  def create_gcr(concepts, output, shortname:, version:, **opts)
@@ -238,12 +238,11 @@ module Glossarist
238
238
 
239
239
  def build_temp_dataset(concepts)
240
240
  tmpdir = Dir.mktmpdir("glossarist-sts-import")
241
- concepts_dir = File.join(tmpdir, "concepts")
242
- FileUtils.mkdir_p(concepts_dir)
241
+ FileUtils.mkdir_p(tmpdir)
243
242
 
244
- collection = ManagedConceptCollection.new
245
- concepts.each { |mc| collection.store(mc) }
246
- collection.save_grouped_concepts_to_files(concepts_dir)
243
+ store = GlossaryStore.new
244
+ concepts.each { |mc| store.add_concept(mc) }
245
+ store.save_directory(tmpdir)
247
246
 
248
247
  tmpdir
249
248
  end