RubyGems - glossarist - Versions diffs - 2.6.2 → 2.6.4 - Mend

glossarist 2.6.2 → 2.6.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

checksums.yaml +4 -4
data/.gitignore +2 -1
data/.rubocop_todo.yml +58 -16
data/Gemfile +3 -19
data/README.adoc +117 -0
data/glossarist.gemspec +1 -0
data/lib/glossarist/cli/import_command.rb +54 -0
data/lib/glossarist/cli.rb +29 -8
data/lib/glossarist/designation/expression.rb +1 -2
data/lib/glossarist/designation/graphical_symbol.rb +1 -1
data/lib/glossarist/managed_concept.rb +1 -1
data/lib/glossarist/rdf/skos_concept.rb +0 -1
data/lib/glossarist/rdf/skos_vocabulary.rb +0 -1
data/lib/glossarist/sts/extracted_designation.rb +14 -0
data/lib/glossarist/sts/extracted_lang_set.rb +16 -0
data/lib/glossarist/sts/extracted_term.rb +13 -0
data/lib/glossarist/sts/import_result.rb +24 -0
data/lib/glossarist/sts/importer.rb +253 -0
data/lib/glossarist/sts/term_extractor.rb +186 -0
data/lib/glossarist/sts/term_mapper.rb +118 -0
data/lib/glossarist/sts.rb +87 -0
data/lib/glossarist/transforms/concept_to_skos_transform.rb +0 -2
data/lib/glossarist/version.rb +1 -1
data/lib/glossarist.rb +10 -7
metadata +25 -2

data/lib/glossarist/sts/importer.rb ADDED Viewed

@@ -0,0 +1,253 @@
+# frozen_string_literal: true
+require "tmpdir"
+require_relative "import_result"
+module Glossarist
+  module Sts
+    class Importer
+      STRATEGIES = %i[skip replace merge].freeze
+      attr_reader :duplicate_strategy
+      def initialize(duplicate_strategy: :skip)
+        unless STRATEGIES.include?(duplicate_strategy)
+          raise ArgumentError,
+                "duplicate_strategy must be one of #{STRATEGIES.join(', ')}, got #{duplicate_strategy}"
+        end
+        @duplicate_strategy = duplicate_strategy
+        @mapper = TermMapper.new
+      end
+      def import_new(xml_files, output:, shortname: nil, version: nil, **opts)
+        raw_concepts = extract_all_concepts(xml_files)
+        concepts, conflicts, skipped = dedup_concepts(raw_concepts)
+        if output.end_with?(".gcr")
+          unless shortname
+            raise ArgumentError,
+                  "--shortname is required for GCR output"
+          end
+          unless version
+            raise ArgumentError,
+                  "--version is required for GCR output"
+          end
+          create_gcr(concepts, output, shortname: shortname, version: version,
+                                       **opts)
+        else
+          save_dataset(concepts, output)
+        end
+        ImportResult.new(
+          concepts: concepts,
+          conflicts: conflicts,
+          source_files: xml_files,
+          skipped_count: skipped,
+        )
+      end
+      def import_into_existing(xml_files, dataset_path)
+        existing = load_existing(dataset_path)
+        new_concepts = extract_all_concepts(xml_files)
+        index = build_concept_index(existing)
+        result_state = apply_with_dedup(new_concepts, existing, index)
+        save_to_path(existing, dataset_path)
+        ImportResult.new(
+          concepts: existing.managed_concepts,
+          conflicts: result_state.conflicts,
+          source_files: xml_files,
+          skipped_count: result_state.skipped,
+        )
+      end
+      DedupState = Struct.new(:conflicts, :skipped, keyword_init: true)
+      private
+      def apply_with_dedup(new_concepts, existing, index)
+        state = DedupState.new(conflicts: [], skipped: 0)
+        new_concepts.each do |mc|
+          key = concept_key(mc)
+          existing_mc = index[key]
+          if existing_mc.nil?
+            existing.store(mc)
+            index[key] = mc
+          else
+            state.conflicts << DuplicateConflict.new(
+              new_concept: mc, existing_concept: existing_mc, key: key,
+            )
+            handle_duplicate(existing, existing_mc, mc, index, key, state)
+          end
+        end
+        state
+      end
+      def handle_duplicate(existing, old_mc, new_mc, index, key, state)
+        case duplicate_strategy
+        when :skip
+          state.skipped += 1
+        when :replace
+          replace_in_collection(existing, old_mc, new_mc)
+          index[key] = new_mc
+        when :merge
+          merge_concept(old_mc, new_mc)
+        end
+      end
+      def extract_all_concepts(xml_files)
+        xml_files.flat_map do |path|
+          extractor = TermExtractor.new(path)
+          terms = extractor.extract
+          terms.map { |t| @mapper.map(t) }
+        end
+      end
+      def dedup_concepts(concepts) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity
+        seen = {}
+        conflicts = []
+        skipped = 0
+        unique = []
+        concepts.each do |mc|
+          key = concept_key(mc)
+          if key.first.empty? || seen[key].nil?
+            unique << mc
+            seen[key] = mc unless key.first.empty?
+          else
+            conflicts << DuplicateConflict.new(
+              new_concept: mc, existing_concept: seen[key], key: key,
+            )
+            skipped += apply_dedup_to_unique(unique, seen, mc, key)
+          end
+        end
+        [unique, conflicts, skipped]
+      end
+      def apply_dedup_to_unique(unique, seen, new_mc, key)
+        case duplicate_strategy
+        when :skip
+          1
+        when :replace
+          unique.delete(seen[key])
+          unique << new_mc
+          seen[key] = new_mc
+          0
+        when :merge
+          merge_concept(seen[key], new_mc)
+          0
+        end
+      end
+      def concept_key(managed_concept)
+        designation = managed_concept.default_designation.to_s.downcase.strip
+        domain = begin
+          l10n = managed_concept.default_lang
+          l10n&.data&.domain.to_s.downcase.strip
+        end
+        [designation, domain]
+      end
+      def build_concept_index(collection)
+        index = {}
+        collection.each do |mc|
+          key = concept_key(mc)
+          index[key] = mc unless key.first.empty?
+        end
+        index
+      end
+      def merge_concept(existing_mc, new_mc)
+        new_mc.localizations.each do |l10n|
+          lang = l10n.language_code
+          if existing_mc.localization(lang).nil?
+            existing_mc.add_localization(l10n)
+          end
+        end
+      end
+      def replace_in_collection(collection, old_mc, new_mc)
+        collection.managed_concepts.delete(old_mc)
+        collection.store(new_mc)
+      end
+      def load_existing(path)
+        collection = ManagedConceptCollection.new
+        if path.end_with?(".gcr")
+          package = GcrPackage.load(path)
+          package.concepts.each { |mc| collection.store(mc) }
+        else
+          concepts = ConceptCollector.collect(path)
+          concepts.each { |mc| collection.store(mc) }
+        end
+        collection
+      end
+      def save_to_path(collection, path)
+        if path.end_with?(".gcr")
+          tmpdir = build_temp_dataset(collection.managed_concepts)
+          begin
+            GC.start
+            tmp_gcr = "#{path}.tmp.#{Process.pid}"
+            GcrPackage.create_from_directory(
+              tmpdir,
+              output: tmp_gcr,
+              shortname: File.basename(path, ".gcr"),
+              version: "1.0.0",
+            )
+            FileUtils.rm_f(path)
+            FileUtils.mv(tmp_gcr, path)
+          ensure
+            FileUtils.rm_rf(tmpdir)
+            FileUtils.rm_f(tmp_gcr)
+          end
+        else
+          save_dataset(collection.managed_concepts, path)
+        end
+      end
+      def save_dataset(concepts, dir)
+        concepts_dir = File.join(dir, "concepts")
+        FileUtils.mkdir_p(concepts_dir)
+        collection = ManagedConceptCollection.new
+        concepts.each { |mc| collection.store(mc) }
+        collection.save_grouped_concepts_to_files(concepts_dir)
+      end
+      def create_gcr(concepts, output, shortname:, version:, **opts)
+        tmpdir = build_temp_dataset(concepts)
+        begin
+          GcrPackage.create_from_directory(
+            tmpdir,
+            output: output,
+            shortname: shortname,
+            version: version,
+            **opts,
+          )
+        ensure
+          FileUtils.rm_rf(tmpdir)
+        end
+      end
+      def build_temp_dataset(concepts)
+        tmpdir = Dir.mktmpdir("glossarist-sts-import")
+        concepts_dir = File.join(tmpdir, "concepts")
+        FileUtils.mkdir_p(concepts_dir)
+        collection = ManagedConceptCollection.new
+        concepts.each { |mc| collection.store(mc) }
+        collection.save_grouped_concepts_to_files(concepts_dir)
+        tmpdir
+      end
+    end
+  end
+end

data/lib/glossarist/sts/term_extractor.rb ADDED Viewed

@@ -0,0 +1,186 @@
+# frozen_string_literal: true
+module Glossarist
+  module Sts
+    class TermExtractor
+      def initialize(xml_path)
+        raw = File.read(xml_path)
+        @standard = ::Sts::IsoSts::Standard.from_xml(raw)
+        @source_ref = extract_source_ref
+      end
+      def extract
+        term_secs = collect_term_secs
+        term_secs.filter_map do |ts|
+          next unless ts.term_entry
+          build_extracted_term(ts)
+        end
+      end
+      private
+      def collect_term_secs
+        secs = []
+        walk_sections(@standard.body, secs) if @standard.body
+        secs
+      end
+      def walk_sections(container, collected)
+        collect_term_secs_from(container, collected)
+        walk_child_secs(container, collected)
+      end
+      def collect_term_secs_from(container, collected)
+        secs = container.term_sec
+        secs&.each do |ts|
+          collected << ts
+          walk_sections(ts, collected) if ts.term_sec&.any?
+        end
+      end
+      def walk_child_secs(container, collected)
+        secs = container_child_secs(container)
+        secs&.each { |s| walk_sections(s, collected) }
+      end
+      def container_child_secs(container)
+        case container
+        when ::Sts::IsoSts::Body, ::Sts::IsoSts::Sec
+          container.sec
+        end
+      end
+      def build_extracted_term(term_sec)
+        entry = term_sec.term_entry
+        label_text = extract_label(term_sec)
+        lang_sets = entry.lang_set.filter_map do |ls|
+          build_lang_set(ls)
+        end
+        Sts::ExtractedTerm.new(
+          id: entry.id,
+          label: label_text,
+          source_ref: @source_ref,
+          lang_sets: lang_sets,
+        )
+      end
+      def extract_label(term_sec)
+        label = term_sec.label
+        return nil unless label
+        label.content&.join.to_s.strip
+      end
+      def build_lang_set(lang_set) # rubocop:disable Metrics/AbcSize
+        lang_code = Sts.convert_language_code(lang_set.lang.to_s)
+        Sts::ExtractedLangSet.new(
+          language_code: lang_code,
+          definition_text: extract_definition_text(lang_set),
+          note_texts: extract_note_texts(lang_set),
+          example_texts: extract_example_texts(lang_set),
+          source_texts: extract_source_texts(lang_set),
+          domain: extract_subject_field(lang_set),
+          designations: lang_set.tig.filter_map do |tig|
+            build_designation(tig)
+          end,
+        )
+      end
+      def extract_definition_text(lang_set)
+        definitions = lang_set.definition
+        return "" unless definitions&.any?
+        definitions.first.value&.join.to_s.strip
+      end
+      def extract_note_texts(lang_set)
+        lang_set.note.filter_map do |n|
+          text = n.value&.join.to_s.strip
+          text unless text.empty?
+        end
+      end
+      def extract_example_texts(lang_set)
+        lang_set.example.filter_map do |e|
+          text = e.value&.join.to_s.strip
+          text unless text.empty?
+        end
+      end
+      def extract_source_texts(lang_set)
+        lang_set.source.filter_map do |s|
+          text = s.value&.join.to_s.strip
+          text unless text.empty?
+        end
+      end
+      def extract_subject_field(lang_set)
+        fields = lang_set.subject_field
+        return nil unless fields&.any?
+        text = fields.first.value&.join.to_s.strip
+        text unless text.empty?
+      end
+      def build_designation(tig)
+        Sts::ExtractedDesignation.new(
+          term: resolve_term_text(tig),
+          type: map_term_type(tig),
+          normative_status: map_normative_status(tig),
+          part_of_speech: tig.pos&.value,
+          abbreviation_type: map_abbreviation_type(tig),
+        )
+      end
+      def resolve_term_text(tig)
+        tig.term&.value&.join.to_s.strip
+      end
+      def map_term_type(tig)
+        raw = tig.term_type&.value.to_s
+        mapped = TERM_TYPE_MAP[raw]
+        mapped.nil? || raw.empty? ? "expression" : mapped
+      end
+      def map_abbreviation_type(tig)
+        raw = tig.term_type&.value.to_s
+        return nil unless TERM_TYPE_MAP[raw] == "abbreviation"
+        raw == "acronym" ? "acronym" : "truncation"
+      end
+      def map_normative_status(tig)
+        NORMATIVE_STATUS_MAP[tig.normative_authorization&.value.to_s]
+      end
+      def extract_source_ref # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
+        front = @standard.front
+        return nil unless front
+        meta = front.iso_meta || front.std_meta
+        return nil unless meta
+        refs = meta.std_ref
+        return nil unless refs&.any?
+        best_ref = refs.find { |r| r.type == "dated" } ||
+          refs.find { |r| r.type == "undated" } ||
+          refs.first
+        extract_ref_text(best_ref)
+      end
+      def extract_ref_text(ref)
+        if ref.value.is_a?(String)
+          ref.value.to_s.strip
+        else
+          ref.content&.join.to_s.strip
+        end
+      end
+    end
+  end
+end

data/lib/glossarist/sts/term_mapper.rb ADDED Viewed

@@ -0,0 +1,118 @@
+# frozen_string_literal: true
+module Glossarist
+  module Sts
+    class TermMapper
+      def map(extracted_term)
+        concept_id = extracted_term.label || extracted_term.id
+        mc = Glossarist::ManagedConcept.new(data: { id: concept_id })
+        extracted_term.lang_sets.each do |ls|
+          mc.add_localization(build_localized_concept(ls,
+                                                      extracted_term.source_ref))
+        end
+        mc
+      end
+      private
+      def build_localized_concept(lang_set, source_ref)
+        terms = lang_set.designations.map { |d| build_designation(d) }
+        Glossarist::LocalizedConcept.of_yaml(
+          "data" => {
+            "language_code" => lang_set.language_code,
+            "terms" => terms,
+            "definition" => build_definitions(lang_set.definition_text),
+            "notes" => build_detailed_definitions(lang_set.note_texts),
+            "examples" => build_detailed_definitions(lang_set.example_texts),
+            "sources" => build_sources(lang_set.source_texts, source_ref),
+            "domain" => lang_set.domain,
+            "entry_status" => "valid",
+          },
+        )
+      end
+      def build_definitions(text)
+        return [] unless text && !text.empty?
+        [{ "content" => text }]
+      end
+      def build_detailed_definitions(texts)
+        texts.filter_map do |text|
+          next if text.empty?
+          { "content" => text }
+        end
+      end
+      def build_designation(ext_desig)
+        case ext_desig.type
+        when "abbreviation"
+          build_abbreviation_designation(ext_desig)
+        when "symbol"
+          build_symbol_designation(ext_desig)
+        else
+          build_expression_designation(ext_desig)
+        end
+      end
+      def build_expression_designation(ext_desig)
+        hash = {
+          "type" => "expression",
+          "designation" => ext_desig.term,
+          "normative_status" => ext_desig.normative_status,
+        }.compact
+        if ext_desig.part_of_speech
+          hash["grammar_info"] =
+            [{ "part_of_speech" => ext_desig.part_of_speech }]
+        end
+        hash
+      end
+      def build_abbreviation_designation(ext_desig)
+        {
+          "type" => "abbreviation",
+          "designation" => ext_desig.term,
+          "normative_status" => ext_desig.normative_status,
+          "abbreviation_type" => ext_desig.abbreviation_type,
+        }.compact
+      end
+      def build_symbol_designation(ext_desig)
+        {
+          "type" => "symbol",
+          "designation" => ext_desig.term,
+          "normative_status" => ext_desig.normative_status,
+        }.compact
+      end
+      def build_sources(source_texts, source_ref)
+        sources = []
+        if source_ref
+          sources << {
+            "status" => "identical",
+            "type" => "authoritative",
+            "origin" => { "text" => source_ref },
+          }
+        end
+        source_texts.each do |text|
+          next if text.empty?
+          sources << {
+            "type" => "authoritative",
+            "origin" => { "text" => text },
+          }
+        end
+        sources
+      end
+    end
+  end
+end

data/lib/glossarist/sts.rb ADDED Viewed

@@ -0,0 +1,87 @@
+# frozen_string_literal: true
+require "sts"
+module Glossarist
+  module Sts
+    autoload :ExtractedDesignation, "#{__dir__}/sts/extracted_designation"
+    autoload :ExtractedLangSet,     "#{__dir__}/sts/extracted_lang_set"
+    autoload :ExtractedTerm,        "#{__dir__}/sts/extracted_term"
+    autoload :ImportResult,         "#{__dir__}/sts/import_result"
+    autoload :Importer,             "#{__dir__}/sts/importer"
+    autoload :TermExtractor,        "#{__dir__}/sts/term_extractor"
+    autoload :TermMapper,           "#{__dir__}/sts/term_mapper"
+    ISO_639_1_TO_639_2 = {
+      "aa" => "aar", "ab" => "abk", "af" => "afr", "ak" => "aka",
+      "am" => "amh", "an" => "arg", "ar" => "ara", "as" => "asm",
+      "av" => "ava", "ay" => "aym", "az" => "aze", "ba" => "bak",
+      "be" => "bel", "bg" => "bul", "bh" => "bih", "bi" => "bis",
+      "bm" => "bam", "bn" => "ben", "bo" => "bod", "br" => "bre",
+      "bs" => "bos", "ca" => "cat", "ce" => "che", "ch" => "cha",
+      "co" => "cos", "cr" => "cre", "cs" => "ces", "cu" => "chu",
+      "cv" => "chv", "cy" => "cym", "da" => "dan", "de" => "deu",
+      "dv" => "div", "dz" => "dzo", "ee" => "ewe", "el" => "ell",
+      "en" => "eng", "eo" => "epo", "es" => "spa", "et" => "est",
+      "eu" => "eus", "fa" => "fas", "ff" => "ful", "fi" => "fin",
+      "fj" => "fij", "fo" => "fao", "fr" => "fra", "fy" => "fry",
+      "ga" => "gle", "gd" => "gla", "gl" => "glg", "gn" => "grn",
+      "gu" => "guj", "gv" => "glv", "ha" => "hau", "he" => "heb",
+      "hi" => "hin", "ho" => "hmo", "hr" => "hrv", "ht" => "hat",
+      "hu" => "hun", "hy" => "hye", "hz" => "her", "ia" => "ina",
+      "id" => "ind", "ie" => "ile", "ig" => "ibo", "ii" => "iii",
+      "ik" => "ipk", "io" => "ido", "is" => "isl", "it" => "ita",
+      "iu" => "iku", "ja" => "jpn", "jv" => "jav", "ka" => "kat",
+      "kg" => "kon", "ki" => "kik", "kj" => "kua", "kk" => "kaz",
+      "kl" => "kal", "km" => "khm", "kn" => "kan", "ko" => "kor",
+      "kr" => "kau", "ks" => "kas", "ku" => "kur", "kv" => "kom",
+      "kw" => "cor", "ky" => "kir", "la" => "lat", "lb" => "ltz",
+      "lg" => "lug", "li" => "lim", "ln" => "lin", "lo" => "lao",
+      "lt" => "lit", "lu" => "lub", "lv" => "lav", "mg" => "mlg",
+      "mh" => "mah", "mi" => "mri", "mk" => "mkd", "ml" => "mal",
+      "mn" => "mon", "mr" => "mar", "ms" => "msa", "mt" => "mlt",
+      "my" => "mya", "na" => "nau", "nb" => "nob", "nd" => "nde",
+      "ne" => "nep", "ng" => "ndo", "nl" => "nld", "nn" => "nno",
+      "no" => "nor", "nr" => "nbl", "nv" => "nav", "ny" => "nya",
+      "oc" => "oci", "oj" => "oji", "om" => "orm", "or" => "ori",
+      "os" => "oss", "pa" => "pan", "pi" => "pli", "pl" => "pol",
+      "ps" => "pus", "pt" => "por", "qu" => "que", "rm" => "roh",
+      "rn" => "run", "ro" => "ron", "ru" => "rus", "rw" => "kin",
+      "sa" => "san", "sc" => "srd", "sd" => "snd", "se" => "sme",
+      "sg" => "sag", "si" => "sin", "sk" => "slk", "sl" => "slv",
+      "sm" => "smo", "sn" => "sna", "so" => "som", "sq" => "sqi",
+      "sr" => "srp", "ss" => "ssw", "st" => "sot", "su" => "sun",
+      "sv" => "swe", "sw" => "swa", "ta" => "tam", "te" => "tel",
+      "tg" => "tgk", "th" => "tha", "ti" => "tir", "tk" => "tuk",
+      "tl" => "tgl", "tn" => "tsn", "to" => "ton", "tr" => "tur",
+      "ts" => "tso", "tt" => "tat", "tw" => "twi", "ty" => "tah",
+      "ug" => "uig", "uk" => "ukr", "ur" => "urd", "uz" => "uzb",
+      "ve" => "ven", "vi" => "vie", "vo" => "vol", "wa" => "wln",
+      "wo" => "wol", "xh" => "xho", "yi" => "yid", "yo" => "yor",
+      "za" => "zha", "zh" => "zho", "zu" => "zul"
+    }.freeze
+    TERM_TYPE_MAP = {
+      "acronym" => "abbreviation",
+      "abbreviation" => "abbreviation",
+      "fullForm" => "expression",
+      "symbol" => "symbol",
+      "variant" => "expression",
+      "equation" => "expression",
+      "formula" => "expression",
+    }.freeze
+    NORMATIVE_STATUS_MAP = {
+      "preferredTerm" => "preferred",
+      "admittedTerm" => "admitted",
+      "deprecatedTerm" => "deprecated",
+    }.freeze
+    def self.convert_language_code(code)
+      return code if code.nil?
+      return code if code.length == 3
+      ISO_639_1_TO_639_2[code] || code
+    end
+  end
+end

data/lib/glossarist/transforms/concept_to_skos_transform.rb CHANGED Viewed

@@ -1,7 +1,5 @@
 # frozen_string_literal: true
-require_relative "../rdf"
 module Glossarist
   module Transforms
     class ConceptToSkosTransform

data/lib/glossarist/version.rb CHANGED Viewed

@@ -4,5 +4,5 @@
 #
 module Glossarist
-  VERSION = "2.6.2"
+  VERSION = "2.6.4"
 end