glossarist 2.6.1 → 2.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,186 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Glossarist
4
+ module Sts
5
# Extracts terminology entries from an ISO STS XML document.
#
# The file is parsed with ::Sts::IsoSts::Standard.from_xml. Every term
# section reachable from the standard's body that carries a term entry is
# converted into an Sts::ExtractedTerm.
#
# Collections read from the parsed model are coerced with Array(), so an
# absent (nil) collection is treated as empty, and text values may be nil,
# a single String or a list of fragments.
class TermExtractor
  # @param xml_path [String] path of the STS XML file to read and parse
  def initialize(xml_path)
    raw = File.read(xml_path)
    @standard = ::Sts::IsoSts::Standard.from_xml(raw)
    @source_ref = extract_source_ref
  end

  # @return [Array<Sts::ExtractedTerm>] one item per term section that has
  #   a term entry, in traversal order
  def extract
    collect_term_secs.filter_map do |term_sec|
      build_extracted_term(term_sec) if term_sec.term_entry
    end
  end

  private

  # Gathers every term section found below the standard's body.
  def collect_term_secs
    secs = []
    walk_sections(@standard.body, secs) if @standard.body
    secs
  end

  # Depth-first walk: the container's own term sections come first (each
  # immediately followed by its nested term sections), then its child
  # sections.
  def walk_sections(container, collected)
    collect_term_secs_from(container, collected)
    walk_child_secs(container, collected)
  end

  def collect_term_secs_from(container, collected)
    Array(container.term_sec).each do |term_sec|
      collected << term_sec
      # Only descend when the term section nests further term sections.
      next unless Array(term_sec.term_sec).any?

      walk_sections(term_sec, collected)
    end
  end

  def walk_child_secs(container, collected)
    Array(container_child_secs(container)).each do |sec|
      walk_sections(sec, collected)
    end
  end

  # Only Body and Sec containers are asked for child sections; any other
  # container (e.g. a term section) yields nil.
  def container_child_secs(container)
    case container
    when ::Sts::IsoSts::Body, ::Sts::IsoSts::Sec
      container.sec
    end
  end

  def build_extracted_term(term_sec)
    entry = term_sec.term_entry
    lang_sets = Array(entry.lang_set).map { |ls| build_lang_set(ls) }

    Sts::ExtractedTerm.new(
      id: entry.id,
      label: extract_label(term_sec),
      source_ref: @source_ref,
      lang_sets: lang_sets,
    )
  end

  # @return [String, nil] the stripped label text, or nil when the term
  #   section has no label element
  def extract_label(term_sec)
    label = term_sec.label
    return nil unless label

    joined_text(label.content)
  end

  def build_lang_set(lang_set) # rubocop:disable Metrics/AbcSize
    lang_code = Sts.convert_language_code(lang_set.lang.to_s)
    designations = Array(lang_set.tig).map { |tig| build_designation(tig) }

    Sts::ExtractedLangSet.new(
      language_code: lang_code,
      definition_text: extract_definition_text(lang_set),
      note_texts: extract_note_texts(lang_set),
      example_texts: extract_example_texts(lang_set),
      source_texts: extract_source_texts(lang_set),
      domain: extract_subject_field(lang_set),
      designations: designations,
    )
  end

  # Joins the fragments of a text value and strips surrounding whitespace.
  # Accepts nil (=> ""), a single String, or a list of fragments.
  def joined_text(fragments)
    Array(fragments).join.strip
  end

  # Stripped text of every element's value, skipping elements whose text
  # is empty.
  def non_blank_texts(elements)
    Array(elements).filter_map do |element|
      text = joined_text(element.value)
      text unless text.empty?
    end
  end

  # @return [String] text of the first definition, "" when there is none
  def extract_definition_text(lang_set)
    joined_text(Array(lang_set.definition).first&.value)
  end

  def extract_note_texts(lang_set)
    non_blank_texts(lang_set.note)
  end

  def extract_example_texts(lang_set)
    non_blank_texts(lang_set.example)
  end

  def extract_source_texts(lang_set)
    non_blank_texts(lang_set.source)
  end

  # @return [String, nil] text of the first subject field, nil when it is
  #   absent or empty
  def extract_subject_field(lang_set)
    text = joined_text(Array(lang_set.subject_field).first&.value)
    text unless text.empty?
  end

  def build_designation(tig)
    Sts::ExtractedDesignation.new(
      term: resolve_term_text(tig),
      type: map_term_type(tig),
      normative_status: map_normative_status(tig),
      part_of_speech: tig.pos&.value,
      abbreviation_type: map_abbreviation_type(tig),
    )
  end

  def resolve_term_text(tig)
    joined_text(tig.term&.value)
  end

  # Missing or unmapped term types are treated as "expression".
  def map_term_type(tig)
    TERM_TYPE_MAP.fetch(tig.term_type&.value.to_s, "expression")
  end

  # @return [String, nil] "acronym" or "truncation" for term types that map
  #   to "abbreviation", nil for everything else
  def map_abbreviation_type(tig)
    raw = tig.term_type&.value.to_s
    return nil unless TERM_TYPE_MAP[raw] == "abbreviation"

    raw == "acronym" ? "acronym" : "truncation"
  end

  def map_normative_status(tig)
    NORMATIVE_STATUS_MAP[tig.normative_authorization&.value.to_s]
  end

  # Picks the standard reference from the front matter, preferring a
  # "dated" reference, then an "undated" one, then whichever comes first.
  #
  # @return [String, nil] nil when front matter, metadata or references
  #   are missing
  def extract_source_ref # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
    front = @standard.front
    return nil unless front

    meta = front.iso_meta || front.std_meta
    return nil unless meta

    refs = Array(meta.std_ref)
    return nil if refs.empty?

    best_ref = refs.find { |r| r.type == "dated" } ||
      refs.find { |r| r.type == "undated" } ||
      refs.first

    extract_ref_text(best_ref)
  end

  # Uses the reference's String value when it has one, otherwise its
  # content fragments.
  def extract_ref_text(ref)
    joined_text(ref.value.is_a?(String) ? ref.value : ref.content)
  end
end
185
+ end
186
+ end
@@ -0,0 +1,118 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Glossarist
4
+ module Sts
5
# Maps extracted STS terms onto Glossarist concept objects.
#
# Each extracted term becomes a Glossarist::ManagedConcept with one
# localization per language set. Missing (nil) collections on the extracted
# objects are treated as empty.
class TermMapper
  # @param extracted_term [#id, #label, #source_ref, #lang_sets]
  # @return [Glossarist::ManagedConcept] concept whose id is the term's
  #   label, or the term's id when the label is nil or blank
  def map(extracted_term)
    mc = Glossarist::ManagedConcept.new(
      data: { id: concept_id_for(extracted_term) },
    )

    Array(extracted_term.lang_sets).each do |lang_set|
      mc.add_localization(
        build_localized_concept(lang_set, extracted_term.source_ref),
      )
    end

    mc
  end

  private

  # An empty string is truthy in Ruby, so `label || id` would keep a blank
  # label; check for blankness explicitly before falling back to the id.
  def concept_id_for(extracted_term)
    label = extracted_term.label
    label.to_s.strip.empty? ? extracted_term.id : label
  end

  def build_localized_concept(lang_set, source_ref)
    terms = Array(lang_set.designations).map { |d| build_designation(d) }

    Glossarist::LocalizedConcept.of_yaml(
      "data" => {
        "language_code" => lang_set.language_code,
        "terms" => terms,
        "definition" => build_definitions(lang_set.definition_text),
        "notes" => build_detailed_definitions(lang_set.note_texts),
        "examples" => build_detailed_definitions(lang_set.example_texts),
        "sources" => build_sources(lang_set.source_texts, source_ref),
        "domain" => lang_set.domain,
        "entry_status" => "valid",
      },
    )
  end

  # @return [Array<Hash>] a single content hash, or [] for nil/empty text
  def build_definitions(text)
    return [] if text.nil? || text.empty?

    [{ "content" => text }]
  end

  # @return [Array<Hash>] one content hash per non-empty text
  def build_detailed_definitions(texts)
    Array(texts).filter_map do |text|
      next if text.nil? || text.empty?

      { "content" => text }
    end
  end

  # Anything that is not an abbreviation or a symbol is emitted as an
  # expression.
  def build_designation(ext_desig)
    case ext_desig.type
    when "abbreviation"
      build_abbreviation_designation(ext_desig)
    when "symbol"
      build_symbol_designation(ext_desig)
    else
      build_expression_designation(ext_desig)
    end
  end

  # Fields shared by every designation kind; nil values are left out.
  def designation_base(type, ext_desig)
    {
      "type" => type,
      "designation" => ext_desig.term,
      "normative_status" => ext_desig.normative_status,
    }.compact
  end

  def build_expression_designation(ext_desig)
    hash = designation_base("expression", ext_desig)

    if ext_desig.part_of_speech
      hash["grammar_info"] =
        [{ "part_of_speech" => ext_desig.part_of_speech }]
    end

    hash
  end

  def build_abbreviation_designation(ext_desig)
    designation_base("abbreviation", ext_desig)
      .merge("abbreviation_type" => ext_desig.abbreviation_type)
      .compact
  end

  def build_symbol_designation(ext_desig)
    designation_base("symbol", ext_desig)
  end

  # The document-level reference (when present) comes first and is marked
  # "identical"; each non-empty per-term source text follows without a
  # status.
  def build_sources(source_texts, source_ref)
    sources = []
    if source_ref
      sources << {
        "status" => "identical",
        "type" => "authoritative",
        "origin" => { "text" => source_ref },
      }
    end

    Array(source_texts).each do |text|
      next if text.nil? || text.empty?

      sources << {
        "type" => "authoritative",
        "origin" => { "text" => text },
      }
    end

    sources
  end
end
117
+ end
118
+ end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "sts"
4
+
5
+ module Glossarist
6
+ module Sts
7
+ autoload :ExtractedDesignation, "#{__dir__}/sts/extracted_designation"
8
+ autoload :ExtractedLangSet, "#{__dir__}/sts/extracted_lang_set"
9
+ autoload :ExtractedTerm, "#{__dir__}/sts/extracted_term"
10
+ autoload :ImportResult, "#{__dir__}/sts/import_result"
11
+ autoload :Importer, "#{__dir__}/sts/importer"
12
+ autoload :TermExtractor, "#{__dir__}/sts/term_extractor"
13
+ autoload :TermMapper, "#{__dir__}/sts/term_mapper"
14
+
15
+ ISO_639_1_TO_639_2 = {
16
+ "aa" => "aar", "ab" => "abk", "af" => "afr", "ak" => "aka",
17
+ "am" => "amh", "an" => "arg", "ar" => "ara", "as" => "asm",
18
+ "av" => "ava", "ay" => "aym", "az" => "aze", "ba" => "bak",
19
+ "be" => "bel", "bg" => "bul", "bh" => "bih", "bi" => "bis",
20
+ "bm" => "bam", "bn" => "ben", "bo" => "bod", "br" => "bre",
21
+ "bs" => "bos", "ca" => "cat", "ce" => "che", "ch" => "cha",
22
+ "co" => "cos", "cr" => "cre", "cs" => "ces", "cu" => "chu",
23
+ "cv" => "chv", "cy" => "cym", "da" => "dan", "de" => "deu",
24
+ "dv" => "div", "dz" => "dzo", "ee" => "ewe", "el" => "ell",
25
+ "en" => "eng", "eo" => "epo", "es" => "spa", "et" => "est",
26
+ "eu" => "eus", "fa" => "fas", "ff" => "ful", "fi" => "fin",
27
+ "fj" => "fij", "fo" => "fao", "fr" => "fra", "fy" => "fry",
28
+ "ga" => "gle", "gd" => "gla", "gl" => "glg", "gn" => "grn",
29
+ "gu" => "guj", "gv" => "glv", "ha" => "hau", "he" => "heb",
30
+ "hi" => "hin", "ho" => "hmo", "hr" => "hrv", "ht" => "hat",
31
+ "hu" => "hun", "hy" => "hye", "hz" => "her", "ia" => "ina",
32
+ "id" => "ind", "ie" => "ile", "ig" => "ibo", "ii" => "iii",
33
+ "ik" => "ipk", "io" => "ido", "is" => "isl", "it" => "ita",
34
+ "iu" => "iku", "ja" => "jpn", "jv" => "jav", "ka" => "kat",
35
+ "kg" => "kon", "ki" => "kik", "kj" => "kua", "kk" => "kaz",
36
+ "kl" => "kal", "km" => "khm", "kn" => "kan", "ko" => "kor",
37
+ "kr" => "kau", "ks" => "kas", "ku" => "kur", "kv" => "kom",
38
+ "kw" => "cor", "ky" => "kir", "la" => "lat", "lb" => "ltz",
39
+ "lg" => "lug", "li" => "lim", "ln" => "lin", "lo" => "lao",
40
+ "lt" => "lit", "lu" => "lub", "lv" => "lav", "mg" => "mlg",
41
+ "mh" => "mah", "mi" => "mri", "mk" => "mkd", "ml" => "mal",
42
+ "mn" => "mon", "mr" => "mar", "ms" => "msa", "mt" => "mlt",
43
+ "my" => "mya", "na" => "nau", "nb" => "nob", "nd" => "nde",
44
+ "ne" => "nep", "ng" => "ndo", "nl" => "nld", "nn" => "nno",
45
+ "no" => "nor", "nr" => "nbl", "nv" => "nav", "ny" => "nya",
46
+ "oc" => "oci", "oj" => "oji", "om" => "orm", "or" => "ori",
47
+ "os" => "oss", "pa" => "pan", "pi" => "pli", "pl" => "pol",
48
+ "ps" => "pus", "pt" => "por", "qu" => "que", "rm" => "roh",
49
+ "rn" => "run", "ro" => "ron", "ru" => "rus", "rw" => "kin",
50
+ "sa" => "san", "sc" => "srd", "sd" => "snd", "se" => "sme",
51
+ "sg" => "sag", "si" => "sin", "sk" => "slk", "sl" => "slv",
52
+ "sm" => "smo", "sn" => "sna", "so" => "som", "sq" => "sqi",
53
+ "sr" => "srp", "ss" => "ssw", "st" => "sot", "su" => "sun",
54
+ "sv" => "swe", "sw" => "swa", "ta" => "tam", "te" => "tel",
55
+ "tg" => "tgk", "th" => "tha", "ti" => "tir", "tk" => "tuk",
56
+ "tl" => "tgl", "tn" => "tsn", "to" => "ton", "tr" => "tur",
57
+ "ts" => "tso", "tt" => "tat", "tw" => "twi", "ty" => "tah",
58
+ "ug" => "uig", "uk" => "ukr", "ur" => "urd", "uz" => "uzb",
59
+ "ve" => "ven", "vi" => "vie", "vo" => "vol", "wa" => "wln",
60
+ "wo" => "wol", "xh" => "xho", "yi" => "yid", "yo" => "yor",
61
+ "za" => "zha", "zh" => "zho", "zu" => "zul"
62
+ }.freeze
63
+
64
+ TERM_TYPE_MAP = {
65
+ "acronym" => "abbreviation",
66
+ "abbreviation" => "abbreviation",
67
+ "fullForm" => "expression",
68
+ "symbol" => "symbol",
69
+ "variant" => "expression",
70
+ "equation" => "expression",
71
+ "formula" => "expression",
72
+ }.freeze
73
+
74
+ NORMATIVE_STATUS_MAP = {
75
+ "preferredTerm" => "preferred",
76
+ "admittedTerm" => "admitted",
77
+ "deprecatedTerm" => "deprecated",
78
+ }.freeze
79
+
80
+ def self.convert_language_code(code)
81
+ return code if code.nil?
82
+ return code if code.length == 3
83
+
84
+ ISO_639_1_TO_639_2[code] || code
85
+ end
86
+ end
87
+ end
@@ -1,7 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative "../rdf"
4
-
5
3
  module Glossarist
6
4
  module Transforms
7
5
  class ConceptToSkosTransform
@@ -4,5 +4,5 @@
4
4
  #
5
5
 
6
6
  module Glossarist
7
- VERSION = "2.6.1"
7
+ VERSION = "2.6.3"
8
8
  end
data/lib/glossarist.rb CHANGED
@@ -7,14 +7,13 @@ require "psych"
7
7
  require "thor"
8
8
  require "lutaml/model"
9
9
 
10
- require_relative "glossarist/glossary_definition"
11
-
12
10
  module Glossarist
13
11
  autoload :Asset, "glossarist/asset"
14
12
  autoload :Citation, "glossarist/citation"
15
13
  autoload :CLI, "glossarist/cli"
16
14
  autoload :CollectionConfig, "glossarist/collection_config"
17
15
  autoload :Collection, "glossarist/collection"
16
+ autoload :Collections, "glossarist/collections"
18
17
  autoload :Concept, "glossarist/concept"
19
18
  autoload :ConceptData, "glossarist/concept_data"
20
19
  autoload :ConceptReference, "glossarist/concept_reference"
@@ -35,10 +34,10 @@ module Glossarist
35
34
  autoload :DetailedDefinition, "glossarist/detailed_definition"
36
35
  autoload :Designation, "glossarist/designation"
37
36
  autoload :Error, "glossarist/error"
38
- autoload :GcrPackage, "glossarist/gcr_package"
39
- autoload :GcrMetadata, "glossarist/gcr_metadata"
40
- autoload :GcrStatistics, "glossarist/gcr_statistics"
41
- autoload :GcrValidator, "glossarist/gcr_validator"
37
+ autoload :GcrPackage, "glossarist/gcr_package"
38
+ autoload :GcrMetadata, "glossarist/gcr_metadata"
39
+ autoload :GcrStatistics, "glossarist/gcr_statistics"
40
+ autoload :GcrValidator, "glossarist/gcr_validator"
42
41
  autoload :InvalidTypeError, "glossarist/error/invalid_type_error"
43
42
  autoload :InvalidLanguageCodeError,
44
43
  "glossarist/error/invalid_language_code_error"
@@ -52,16 +51,20 @@ module Glossarist
52
51
  autoload :ManagedConceptData, "glossarist/managed_concept_data"
53
52
  autoload :NonVerbRep, "glossarist/non_verb_rep"
54
53
  autoload :RelatedConcept, "glossarist/related_concept"
54
+ autoload :Rdf, "glossarist/rdf"
55
+ autoload :Sts, "glossarist/sts"
56
+ autoload :Transforms, "glossarist/transforms"
55
57
  autoload :SchemaMigration, "glossarist/schema_migration"
56
58
  autoload :UrnResolver, "glossarist/urn_resolver"
57
59
  autoload :Utilities, "glossarist/utilities"
58
- autoload :RegisterData, "glossarist/register_data"
60
+ autoload :RegisterData, "glossarist/register_data"
59
61
  autoload :ValidationResult, "glossarist/validation_result"
60
62
  autoload :V1, "glossarist/v1"
61
63
  end
62
64
 
63
65
  require_relative "glossarist/version"
64
66
  require_relative "glossarist/collections"
67
+ require_relative "glossarist/glossary_definition"
65
68
 
66
69
  module Glossarist
67
70
  LANG_CODES = %w[eng ara deu fra spa ita jpn kor pol por srp swe zho rus fin
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: glossarist
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.6.1
4
+ version: 2.6.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-06 00:00:00.000000000 Z
11
+ date: 2026-05-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: lutaml-model
@@ -122,6 +122,7 @@ files:
122
122
  - lib/glossarist/citation.rb
123
123
  - lib/glossarist/cli.rb
124
124
  - lib/glossarist/cli/export_command.rb
125
+ - lib/glossarist/cli/import_command.rb
125
126
  - lib/glossarist/cli/package_command.rb
126
127
  - lib/glossarist/cli/upgrade_command.rb
127
128
  - lib/glossarist/cli/validate_command.rb
@@ -192,6 +193,14 @@ files:
192
193
  - lib/glossarist/resolution_adapter/remote.rb
193
194
  - lib/glossarist/resolution_adapter/route.rb
194
195
  - lib/glossarist/schema_migration.rb
196
+ - lib/glossarist/sts.rb
197
+ - lib/glossarist/sts/extracted_designation.rb
198
+ - lib/glossarist/sts/extracted_lang_set.rb
199
+ - lib/glossarist/sts/extracted_term.rb
200
+ - lib/glossarist/sts/import_result.rb
201
+ - lib/glossarist/sts/importer.rb
202
+ - lib/glossarist/sts/term_extractor.rb
203
+ - lib/glossarist/sts/term_mapper.rb
195
204
  - lib/glossarist/transforms.rb
196
205
  - lib/glossarist/transforms/concept_to_skos_transform.rb
197
206
  - lib/glossarist/transforms/concept_to_tbx_transform.rb