glossarist 2.6.1 → 2.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@
2
2
 
3
3
  require "zip"
4
4
  require "fileutils"
5
+ require "pathname"
5
6
 
6
7
  module Glossarist
7
8
  class GcrPackage
@@ -14,12 +15,18 @@ module Glossarist
14
15
 
15
16
  KNOWN_COMPILED_FORMATS = COMPILED_EXTENSIONS.keys.freeze
16
17
 
17
- attr_reader :zip_path, :metadata, :concepts
18
+ DATASET_ASSETS = [
19
+ { path: "bibliography.yaml", type: :file, attr: :bibliography },
20
+ { path: "images", type: :directory },
21
+ ].freeze
22
+
23
+ attr_reader :zip_path, :metadata, :concepts, :bibliography
18
24
 
19
25
  def initialize(zip_path)
20
26
  @zip_path = zip_path
21
27
  @metadata = nil
22
28
  @concepts = []
29
+ @bibliography = nil
23
30
  end
24
31
 
25
32
  def self.create(concepts:, metadata:, output_path:, register_data: nil,
@@ -66,12 +73,43 @@ module Glossarist
66
73
  end
67
74
  end
68
75
 
76
# Walks every entry of DATASET_ASSETS below +source_dir+ and yields
# (entry_name, binary_content) pairs to the caller's block.
#
# :file assets are delegated to yield_file_asset, :directory assets to
# yield_directory_assets; any other asset type is ignored.
#
# @param source_dir [String] dataset root directory
# @yield [entry_name, content]
def self.each_dataset_asset(source_dir)
  root = Pathname.new(source_dir)

  DATASET_ASSETS.each do |asset|
    location = File.join(source_dir, asset[:path])

    if asset[:type] == :file
      yield_file_asset(location, asset[:path]) { |*args| yield(*args) }
    elsif asset[:type] == :directory
      yield_directory_assets(location, root) { |*args| yield(*args) }
    end
  end
end
88
+
89
# Yields a single file asset as (entry_name, binary_content).
#
# Nothing is yielded when +path+ is missing or is not a regular file.
# File.file? is used rather than File.exist? because the latter is also
# true for directories, on which File.binread raises Errno::EISDIR.
#
# @param path [String] location of the asset on disk
# @param entry_name [String] name passed to the block for this asset
# @yield [entry_name, content]
def self.yield_file_asset(path, entry_name)
  return unless File.file?(path)

  yield entry_name, File.binread(path)
end
94
+
95
# Yields every regular file found recursively below +dir_path+ as
# (relative_name, binary_content), where relative_name is the file's path
# relative to +base_path+.
#
# The directory is passed to Dir.glob through +base:+ instead of being
# embedded in the pattern, so glob metacharacters in the directory name
# ("[", "{", "*", "?") cannot change what is matched. Results are sorted
# so the yield order does not depend on the Ruby version or file system.
#
# @param dir_path [String] directory to scan
# @param base_path [Pathname] root used to compute the yielded names
# @yield [relative_name, content]
def self.yield_directory_assets(dir_path, base_path)
  return unless File.directory?(dir_path)

  Dir.glob(File.join("**", "*"), base: dir_path).sort.each do |inner|
    file = File.join(dir_path, inner)
    next unless File.file?(file)

    relative = Pathname.new(file).relative_path_from(base_path).to_s
    yield relative, File.binread(file)
  end
end
105
+
69
106
  def validate
70
107
  GcrValidator.new.validate(@zip_path)
71
108
  end
72
109
 
73
- def write(concepts, metadata, register_data, compiled_formats: [],
74
- shortname: nil, **opts)
110
+ def write(concepts, metadata, register_data, # rubocop:disable Metrics/ParameterLists
111
+ compiled_formats: [],
112
+ shortname: nil, source_dir: nil, **opts)
75
113
  Zip::File.open(@zip_path, create: true) do |zf|
76
114
  zf.get_output_stream("metadata.yaml") do |f|
77
115
  f.write(metadata.to_yaml)
@@ -87,6 +125,12 @@ module Glossarist
87
125
  write_concept(zf, mc)
88
126
  end
89
127
 
128
+ if source_dir
129
+ self.class.each_dataset_asset(source_dir) do |name, content|
130
+ zf.get_output_stream(name) { |f| f.write(content) }
131
+ end
132
+ end
133
+
90
134
  if compiled_formats.any?
91
135
  write_compiled(zf, concepts, compiled_formats, shortname: shortname,
92
136
  **opts)
@@ -94,29 +138,51 @@ module Glossarist
94
138
  end
95
139
  end
96
140
 
97
- def write_concept(zip_file, concept)
98
- termid = concept.data.id.to_s
99
- doc = ConceptDocument.from_managed_concept(concept)
100
- zip_file.get_output_stream("concepts/#{termid}.yaml") do |f|
101
- f.write(doc.to_yamls)
102
- end
103
- end
104
-
105
141
  def read
106
142
  @concepts = []
107
143
 
108
144
  Zip::File.open(@zip_path) do |zf|
109
- if (entry = zf.find_entry("metadata.yaml"))
110
- @metadata = GcrMetadata.from_yaml(entry.get_input_stream.read)
111
- end
145
+ read_metadata(zf)
146
+ read_file_assets(zf)
147
+ read_concepts(zf)
148
+ end
149
+ end
112
150
 
113
- zf.entries.each do |entry|
114
- next unless entry.name.start_with?("concepts/") && entry.name.end_with?(".yaml")
151
# Parses "metadata.yaml" from the archive into @metadata.
# Leaves @metadata untouched when the archive has no such entry.
#
# @param zip_file [#find_entry] opened archive
def read_metadata(zip_file)
  metadata_entry = zip_file.find_entry("metadata.yaml")
  @metadata = GcrMetadata.from_yaml(metadata_entry.get_input_stream.read) if metadata_entry
end
157
+
158
# Copies the raw content of every :file asset that declares an :attr into
# the instance variable named after that attribute. Assets that are
# absent from the archive are skipped.
#
# @param zip_file [#find_entry] opened archive
def read_file_assets(zip_file)
  DATASET_ASSETS.each do |asset|
    attr_name = asset[:attr]
    next if asset[:type] != :file || !attr_name

    found = zip_file.find_entry(asset[:path])
    instance_variable_set("@#{attr_name}", found.get_input_stream.read) if found
  end
end
168
+
169
# Appends one managed concept to @concepts for every archive entry whose
# name starts with "concepts/" and ends with ".yaml".
#
# @param zip_file [#entries] opened archive
def read_concepts(zip_file)
  zip_file.entries.each do |entry|
    name = entry.name
    next unless name.start_with?("concepts/") && name.end_with?(".yaml")

    document = ConceptDocument.from_yamls(entry.get_input_stream.read)
    @concepts << document.to_managed_concept
  end
end
178
+
179
+ private
180
+
181
# Serialises one managed concept into the archive as
# "concepts/<id>.yaml", where <id> is concept.data.id.
#
# @param zip_file [#get_output_stream] archive being written
# @param concept [#data] managed concept to serialise
def write_concept(zip_file, concept)
  entry_name = "concepts/#{concept.data.id}.yaml"
  document = ConceptDocument.from_managed_concept(concept)

  zip_file.get_output_stream(entry_name) { |stream| stream.write(document.to_yamls) }
end
122
188
 
@@ -206,6 +272,7 @@ compiled_formats: [], **opts)
206
272
  output_path: File.expand_path(output),
207
273
  compiled_formats: compiled_formats,
208
274
  shortname: shortname,
275
+ source_dir: dir,
209
276
  **opts,
210
277
  )
211
278
  end
@@ -219,7 +286,7 @@ compiled_formats: [], **opts)
219
286
  concept_count = 0
220
287
  languages = Set.new
221
288
 
222
- Zip::OutputStream.open(output_path) do |zos|
289
+ Zip::OutputStream.open(output_path) do |zos| # rubocop:disable Metrics/BlockLength
223
290
  if register_data
224
291
  zos.put_next_entry("register.yaml")
225
292
  zos.write(register_data.to_yaml)
@@ -253,6 +320,11 @@ compiled_formats: [], **opts)
253
320
  register_data: register_data, **opts)
254
321
  zos.put_next_entry("metadata.yaml")
255
322
  zos.write(metadata.to_yaml)
323
+
324
+ each_dataset_asset(dir) do |name, content|
325
+ zos.put_next_entry(name)
326
+ zos.write(content)
327
+ end
256
328
  end
257
329
 
258
330
  new(output_path)
@@ -13,27 +13,8 @@ module Glossarist
13
13
  end
14
14
 
15
15
  begin
16
- Zip::File.open(zip_path) do |zf|
17
- unless zf.find_entry("metadata.yaml")
18
- result.add_error("Missing metadata.yaml")
19
- return result
20
- end
21
-
22
- metadata = GcrMetadata.from_yaml(
23
- zf.find_entry("metadata.yaml").get_input_stream.read,
24
- )
25
- validate_metadata(metadata, result)
26
-
27
- concept_entries = zf.entries.select do |e|
28
- e.name.start_with?("concepts/") && e.name.end_with?(".yaml")
29
- end
30
- if concept_entries.empty?
31
- result.add_error("No concept files found in concepts/")
32
- end
33
-
34
- concept_entries.each do |entry|
35
- validate_concept_entry(entry, metadata, result)
36
- end
16
+ Zip::File.open(zip_path) do |zip_file|
17
+ validate_zip_contents(zip_file, result)
37
18
  end
38
19
  rescue StandardError => e
39
20
  result.add_error("Failed to read ZIP: #{e.message}")
@@ -44,6 +25,31 @@ module Glossarist
44
25
 
45
26
  private
46
27
 
28
# Runs all content checks on an opened archive, recording findings on
# +result+: metadata presence and fields, presence of concept files,
# each concept entry, and finally the dataset assets.
#
# Stops after recording an error when "metadata.yaml" is missing.
def validate_zip_contents(zip_file, result) # rubocop:disable Metrics/AbcSize
  if zip_file.find_entry("metadata.yaml").nil?
    result.add_error("Missing metadata.yaml")
    return
  end

  raw_metadata = zip_file.find_entry("metadata.yaml").get_input_stream.read
  metadata = GcrMetadata.from_yaml(raw_metadata)
  validate_metadata(metadata, result)

  concept_entries = zip_file.entries.select do |candidate|
    candidate.name.start_with?("concepts/") && candidate.name.end_with?(".yaml")
  end
  result.add_error("No concept files found in concepts/") if concept_entries.empty?

  concept_entries.each { |entry| validate_concept_entry(entry, metadata, result) }

  validate_assets(zip_file, result)
end
52
+
47
53
  def validate_metadata(metadata, result)
48
54
  unless metadata&.concept_count
49
55
  result.add_error("metadata.yaml missing required fields (concept_count)")
@@ -94,5 +100,36 @@ module Glossarist
94
100
  result.add_warning("#{entry.name}: no concept URI (data.uri) and no concept_uri_template or uri_prefix in metadata")
95
101
  end
96
102
  end
103
+
104
# Validates every asset declared in GcrPackage::DATASET_ASSETS, routing
# :file assets to validate_file_asset_entry and :directory assets to
# validate_directory_asset. Other asset types are ignored.
def validate_assets(zip_file, result)
  GcrPackage::DATASET_ASSETS.each do |asset|
    location = asset[:path]

    if asset[:type] == :file
      validate_file_asset_entry(zip_file, location, result)
    elsif asset[:type] == :directory
      validate_directory_asset(zip_file, location, result)
    end
  end
end
114
+
115
# Checks that an optional file asset, when present, is loadable YAML.
#
# A missing entry is not an error. Syntax errors are reported with their
# line number. Other Psych failures raised by YAML.safe_load (for example
# Psych::DisallowedClass or Psych::BadAlias) are reported against the
# asset path as well, instead of propagating to the caller.
#
# @param zip_file [#find_entry] opened archive
# @param path [String] entry name of the asset
# @param result [#add_error] collector for validation findings
def validate_file_asset_entry(zip_file, path, result)
  entry = zip_file.find_entry(path)
  return unless entry

  YAML.safe_load(entry.get_input_stream.read)
rescue Psych::SyntaxError => e
  result.add_error("#{path}: invalid YAML at line #{e.line}: #{e.message}")
rescue Psych::Exception => e
  # Must come after SyntaxError, which is a subclass of Psych::Exception.
  result.add_error("#{path}: unsupported YAML content: #{e.message}")
end
123
+
124
# Warns when the archive contains entries under "<dir_path>/" but every
# one of them is itself a directory entry (name ending in "/").
# No warning is issued when nothing lies under the directory.
#
# @param zip_file [#entries] opened archive
# @param dir_path [String] directory name without trailing slash
# @param result [#add_warning] collector for validation findings
def validate_directory_asset(zip_file, dir_path, result)
  prefix = "#{dir_path}/"
  names = zip_file.entries.map(&:name).select { |name| name.start_with?(prefix) }

  return if names.empty?
  return unless names.all? { |name| name.end_with?("/") }

  result.add_warning("#{dir_path}/ directory is empty")
end
97
134
  end
98
135
  end
@@ -1,4 +1,4 @@
1
- require_relative "localized_concept"
1
+ # frozen_string_literal: true
2
2
 
3
3
  module Glossarist
4
4
  class ManagedConcept < Lutaml::Model::Serializable
@@ -2,7 +2,6 @@
2
2
 
3
3
  require "lutaml/turtle"
4
4
  require "lutaml/jsonld"
5
- require_relative "../rdf"
6
5
 
7
6
  module Glossarist
8
7
  module Rdf
@@ -2,7 +2,6 @@
2
2
 
3
3
  require "lutaml/turtle"
4
4
  require "lutaml/jsonld"
5
- require_relative "../rdf"
6
5
 
7
6
  module Glossarist
8
7
  module Rdf
@@ -0,0 +1,14 @@
1
+ # frozen_string_literal: true
2
+
3
module Glossarist
  module Sts
    # Keyword-initialised record describing one extracted designation.
    # Members: term, type, normative_status, part_of_speech,
    # abbreviation_type.
    ExtractedDesignation = Struct.new(
      *%i[term type normative_status part_of_speech abbreviation_type],
      keyword_init: true,
    )
  end
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
module Glossarist
  module Sts
    # Keyword-initialised record describing the extracted content of one
    # language section. Members: language_code, definition_text,
    # note_texts, example_texts, source_texts, domain, designations.
    ExtractedLangSet = Struct.new(
      *%i[
        language_code definition_text note_texts example_texts
        source_texts domain designations
      ],
      keyword_init: true,
    )
  end
end
+ end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
module Glossarist
  module Sts
    # Keyword-initialised record describing one extracted term.
    # Members: id, label, source_ref, lang_sets.
    ExtractedTerm = Struct.new(*%i[id label source_ref lang_sets], keyword_init: true)
  end
end
+ end
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
module Glossarist
  module Sts
    # Keyword-initialised record pairing an incoming concept with the
    # concept it collided with, together with the key they share.
    DuplicateConflict = Struct.new(
      *%i[new_concept existing_concept key],
      keyword_init: true,
    )

    # Outcome of an import run: the resulting concepts, the duplicate
    # conflicts encountered, the source files, and the skipped count.
    class ImportResult
      attr_reader :concepts, :conflicts, :source_files, :skipped_count

      # @param concepts [Array] concepts produced by the import
      # @param conflicts [Array] duplicate conflicts that were detected
      # @param source_files [Array] files the import was given
      # @param skipped_count [Integer] number of skipped concepts
      def initialize(concepts:, conflicts: [], source_files: [], skipped_count: 0)
        @concepts = concepts
        @conflicts = conflicts
        @source_files = source_files
        @skipped_count = skipped_count
      end

      # @return [Boolean] true when at least one conflict was recorded
      def conflict?
        return false if conflicts.empty?

        true
      end
    end
  end
end
@@ -0,0 +1,253 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "tmpdir"
4
+ require_relative "import_result"
5
+
6
module Glossarist
  module Sts
    # Imports concepts extracted from XML files, either into a fresh
    # dataset (directory or .gcr package) or into an existing one.
    #
    # Two concepts are duplicates when they share the same key: the
    # lower-cased, stripped default designation plus the lower-cased,
    # stripped domain of the default localization. Concepts whose
    # designation is empty have no usable key and are never treated as
    # duplicates of anything.
    class Importer
      STRATEGIES = %i[skip replace merge].freeze

      attr_reader :duplicate_strategy

      # @param duplicate_strategy [Symbol] one of STRATEGIES
      # @raise [ArgumentError] when the strategy is not supported
      def initialize(duplicate_strategy: :skip)
        unless STRATEGIES.include?(duplicate_strategy)
          raise ArgumentError,
                "duplicate_strategy must be one of #{STRATEGIES.join(', ')}, got #{duplicate_strategy}"
        end

        @duplicate_strategy = duplicate_strategy
        @mapper = TermMapper.new
      end

      # Extracts concepts from +xml_files+, de-duplicates them and writes
      # them to +output+: a .gcr package when the name ends in ".gcr",
      # otherwise a dataset directory.
      #
      # @param xml_files [Array<String>] paths of the files to import
      # @param output [String] target package file or dataset directory
      # @param shortname [String, nil] required for .gcr output
      # @param version [String, nil] required for .gcr output
      # @param opts [Hash] forwarded to GcrPackage.create_from_directory
      # @return [ImportResult]
      # @raise [ArgumentError] when .gcr output lacks shortname or version
      def import_new(xml_files, output:, shortname: nil, version: nil, **opts)
        gcr_output = output.end_with?(".gcr")
        # Checked up front so an incomplete invocation fails before any
        # extraction work is done.
        ensure_gcr_options(shortname, version) if gcr_output

        raw_concepts = extract_all_concepts(xml_files)
        concepts, conflicts, skipped = dedup_concepts(raw_concepts)

        if gcr_output
          create_gcr(concepts, output, shortname: shortname, version: version,
                                       **opts)
        else
          save_dataset(concepts, output)
        end

        ImportResult.new(
          concepts: concepts,
          conflicts: conflicts,
          source_files: xml_files,
          skipped_count: skipped,
        )
      end

      # Extracts concepts from +xml_files+ and adds them to the dataset at
      # +dataset_path+, resolving duplicates with the configured strategy,
      # then saves the dataset back to the same path.
      #
      # @param xml_files [Array<String>] paths of the files to import
      # @param dataset_path [String] existing .gcr file or dataset directory
      # @return [ImportResult]
      def import_into_existing(xml_files, dataset_path)
        existing = load_existing(dataset_path)
        new_concepts = extract_all_concepts(xml_files)
        index = build_concept_index(existing)

        result_state = apply_with_dedup(new_concepts, existing, index)

        save_to_path(existing, dataset_path)

        ImportResult.new(
          concepts: existing.managed_concepts,
          conflicts: result_state.conflicts,
          source_files: xml_files,
          skipped_count: result_state.skipped,
        )
      end

      # Running totals collected while merging into an existing dataset.
      DedupState = Struct.new(:conflicts, :skipped, keyword_init: true)

      private

      # Raises unless both options needed for .gcr output are present.
      def ensure_gcr_options(shortname, version)
        raise ArgumentError, "--shortname is required for GCR output" unless shortname
        raise ArgumentError, "--version is required for GCR output" unless version
      end

      # Adds +new_concepts+ to +existing+, consulting and updating +index+
      # to detect duplicates. Concepts with an empty designation are always
      # stored and never indexed, matching build_concept_index and
      # dedup_concepts.
      #
      # @return [DedupState] conflicts found and number of skipped concepts
      def apply_with_dedup(new_concepts, existing, index)
        state = DedupState.new(conflicts: [], skipped: 0)

        new_concepts.each do |mc|
          key = concept_key(mc)
          keyed = !key.first.empty?
          existing_mc = keyed ? index[key] : nil

          if existing_mc.nil?
            existing.store(mc)
            index[key] = mc if keyed
          else
            state.conflicts << DuplicateConflict.new(
              new_concept: mc, existing_concept: existing_mc, key: key,
            )
            handle_duplicate(existing, existing_mc, mc, index, key, state)
          end
        end

        state
      end

      # Applies the configured strategy to one duplicate found while
      # merging into an existing collection.
      def handle_duplicate(existing, old_mc, new_mc, index, key, state)
        case duplicate_strategy
        when :skip
          state.skipped += 1
        when :replace
          replace_in_collection(existing, old_mc, new_mc)
          index[key] = new_mc
        when :merge
          merge_concept(old_mc, new_mc)
        end
      end

      # Runs the extractor over every file and maps each extracted term
      # through the mapper, returning one flat list.
      def extract_all_concepts(xml_files)
        xml_files.flat_map do |path|
          TermExtractor.new(path).extract.map { |term| @mapper.map(term) }
        end
      end

      # De-duplicates a freshly extracted list of concepts.
      #
      # @return [Array(Array, Array, Integer)] unique concepts, conflicts,
      #   and the number of skipped concepts
      def dedup_concepts(concepts) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity
        seen = {}
        conflicts = []
        skipped = 0
        unique = []

        concepts.each do |mc|
          key = concept_key(mc)
          if key.first.empty? || seen[key].nil?
            unique << mc
            seen[key] = mc unless key.first.empty?
          else
            conflicts << DuplicateConflict.new(
              new_concept: mc, existing_concept: seen[key], key: key,
            )
            skipped += apply_dedup_to_unique(unique, seen, mc, key)
          end
        end

        [unique, conflicts, skipped]
      end

      # Applies the configured strategy to one duplicate found within the
      # extracted list.
      #
      # @return [Integer] 1 when the new concept was skipped, otherwise 0
      def apply_dedup_to_unique(unique, seen, new_mc, key)
        case duplicate_strategy
        when :skip
          1
        when :replace
          unique.delete(seen[key])
          unique << new_mc
          seen[key] = new_mc
          0
        when :merge
          merge_concept(seen[key], new_mc)
          0
        end
      end

      # Builds the duplicate-detection key for a concept.
      #
      # @return [Array(String, String)] [designation, domain], both
      #   lower-cased and stripped; missing values become ""
      def concept_key(managed_concept)
        designation = managed_concept.default_designation.to_s.downcase.strip
        domain = managed_concept.default_lang&.data&.domain.to_s.downcase.strip
        [designation, domain]
      end

      # Indexes a collection by concept key, leaving out concepts whose
      # designation is empty.
      def build_concept_index(collection)
        index = {}
        collection.each do |mc|
          key = concept_key(mc)
          index[key] = mc unless key.first.empty?
        end
        index
      end

      # Copies into +existing_mc+ every localization of +new_mc+ whose
      # language it does not have yet. Existing localizations are kept.
      def merge_concept(existing_mc, new_mc)
        new_mc.localizations.each do |l10n|
          next unless existing_mc.localization(l10n.language_code).nil?

          existing_mc.add_localization(l10n)
        end
      end

      # Swaps +old_mc+ for +new_mc+ inside +collection+.
      def replace_in_collection(collection, old_mc, new_mc)
        collection.managed_concepts.delete(old_mc)
        collection.store(new_mc)
      end

      # Loads the concepts of an existing dataset (.gcr package or
      # directory) into a new collection. Only concepts are read.
      def load_existing(path)
        concepts =
          if path.end_with?(".gcr")
            GcrPackage.load(path).concepts
          else
            ConceptCollector.collect(path)
          end
        build_collection(concepts)
      end

      # Saves +collection+ back to +path+. A .gcr target is rebuilt from a
      # temporary dataset and then moved over the original file.
      #
      # NOTE(review): the rebuilt package is created from a temporary
      # directory that contains only concepts, with version fixed at
      # "1.0.0" and shortname taken from the file name. Anything else the
      # original package carried looks like it is not preserved — confirm
      # whether that is intended.
      def save_to_path(collection, path)
        if path.end_with?(".gcr")
          tmpdir = build_temp_dataset(collection.managed_concepts)
          begin
            # NOTE(review): presumably forces release of lingering file
            # handles on the package before it is replaced — confirm.
            GC.start
            tmp_gcr = "#{path}.tmp.#{Process.pid}"
            GcrPackage.create_from_directory(
              tmpdir,
              output: tmp_gcr,
              shortname: File.basename(path, ".gcr"),
              version: "1.0.0",
            )
            FileUtils.rm_f(path)
            FileUtils.mv(tmp_gcr, path)
          ensure
            FileUtils.rm_rf(tmpdir)
            FileUtils.rm_f(tmp_gcr)
          end
        else
          save_dataset(collection.managed_concepts, path)
        end
      end

      # Writes +concepts+ as grouped concept files under "<dir>/concepts",
      # creating the directory when needed.
      def save_dataset(concepts, dir)
        concepts_dir = File.join(dir, "concepts")
        FileUtils.mkdir_p(concepts_dir)
        build_collection(concepts).save_grouped_concepts_to_files(concepts_dir)
      end

      # Packages +concepts+ into a .gcr file at +output+ by way of a
      # temporary dataset directory, which is always removed afterwards.
      def create_gcr(concepts, output, shortname:, version:, **opts)
        tmpdir = build_temp_dataset(concepts)
        begin
          GcrPackage.create_from_directory(
            tmpdir,
            output: output,
            shortname: shortname,
            version: version,
            **opts,
          )
        ensure
          FileUtils.rm_rf(tmpdir)
        end
      end

      # Creates a temporary dataset directory holding +concepts+ and
      # returns its path. The caller owns the directory and must remove
      # it. If writing fails, the directory is removed here before the
      # error is re-raised, since the caller never receives its path.
      def build_temp_dataset(concepts)
        tmpdir = Dir.mktmpdir("glossarist-sts-import")
        save_dataset(concepts, tmpdir)
        tmpdir
      rescue StandardError
        FileUtils.rm_rf(tmpdir) if tmpdir
        raise
      end

      # Stores every concept into a fresh ManagedConceptCollection.
      def build_collection(concepts)
        collection = ManagedConceptCollection.new
        concepts.each { |mc| collection.store(mc) }
        collection
      end
    end
  end
end