glossarist 2.6.2 → 2.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +2 -1
- data/.rubocop_todo.yml +58 -16
- data/Gemfile +3 -19
- data/README.adoc +117 -0
- data/lib/glossarist/cli/import_command.rb +54 -0
- data/lib/glossarist/cli.rb +29 -8
- data/lib/glossarist/designation/expression.rb +1 -2
- data/lib/glossarist/designation/graphical_symbol.rb +1 -1
- data/lib/glossarist/managed_concept.rb +1 -1
- data/lib/glossarist/rdf/skos_concept.rb +0 -1
- data/lib/glossarist/rdf/skos_vocabulary.rb +0 -1
- data/lib/glossarist/sts/extracted_designation.rb +14 -0
- data/lib/glossarist/sts/extracted_lang_set.rb +16 -0
- data/lib/glossarist/sts/extracted_term.rb +13 -0
- data/lib/glossarist/sts/import_result.rb +24 -0
- data/lib/glossarist/sts/importer.rb +253 -0
- data/lib/glossarist/sts/term_extractor.rb +186 -0
- data/lib/glossarist/sts/term_mapper.rb +118 -0
- data/lib/glossarist/sts.rb +87 -0
- data/lib/glossarist/transforms/concept_to_skos_transform.rb +0 -2
- data/lib/glossarist/version.rb +1 -1
- data/lib/glossarist.rb +10 -7
- metadata +11 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1f3a8ec372c1c3e7a93ed7c2bad8ed2837f8f5bcd5ce4ae340bbb9f3b5ddaa75
|
|
4
|
+
data.tar.gz: e7c0672fc648ea748cff12bfc00a1ea62665aeaa20e4cf8a86dde1419a6094df
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5a3654b99b5137104e26830fe77b1b6bad3eb2e0ce4ffa45d479b909399c469c41edbd7460ede72dc4f10bf94cc2f40649e78ce8510113ea9d97f0715750af15
|
|
7
|
+
data.tar.gz: eec5c75fd4a6a434999830038642ce387d74c2b7df976343d4b342aa919d1aaf7c4beccff4f92fbb18a0f0a4acf762885bf837edad7f16dd9889a93b33ed5613
|
data/.gitignore
CHANGED
data/.rubocop_todo.yml
CHANGED
|
@@ -1,59 +1,93 @@
|
|
|
1
1
|
# This configuration was generated by
|
|
2
2
|
# `rubocop --auto-gen-config`
|
|
3
|
-
# on 2026-05-
|
|
3
|
+
# on 2026-05-12 04:13:45 UTC using RuboCop version 1.86.1.
|
|
4
4
|
# The point is for the user to remove these configuration records
|
|
5
5
|
# one by one as the offenses are removed from the code base.
|
|
6
6
|
# Note that changes in the inspected code, or installation of new
|
|
7
7
|
# versions of RuboCop, may require this file to be generated again.
|
|
8
8
|
|
|
9
|
+
# Offense count: 7
|
|
10
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
11
|
+
# Configuration parameters: TreatCommentsAsGroupSeparators, ConsiderPunctuation.
|
|
12
|
+
Bundler/OrderedGems:
|
|
13
|
+
Exclude:
|
|
14
|
+
- 'Gemfile'
|
|
15
|
+
|
|
9
16
|
# Offense count: 1
|
|
10
17
|
Gemspec/RequiredRubyVersion:
|
|
11
18
|
Exclude:
|
|
12
19
|
- 'glossarist.gemspec'
|
|
13
20
|
|
|
14
|
-
# Offense count:
|
|
21
|
+
# Offense count: 4
|
|
15
22
|
# This cop supports safe autocorrection (--autocorrect).
|
|
16
23
|
# Configuration parameters: EnforcedStyle, IndentationWidth.
|
|
17
24
|
# SupportedStyles: with_first_argument, with_fixed_indentation
|
|
18
25
|
Layout/ArgumentAlignment:
|
|
19
26
|
Exclude:
|
|
20
|
-
- '
|
|
27
|
+
- 'lib/glossarist/sts/import_result.rb'
|
|
28
|
+
- 'lib/glossarist/sts/importer.rb'
|
|
29
|
+
- 'lib/glossarist/sts/term_mapper.rb'
|
|
21
30
|
|
|
22
|
-
# Offense count:
|
|
31
|
+
# Offense count: 1
|
|
32
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
33
|
+
# Configuration parameters: IndentationWidth.
|
|
34
|
+
Layout/AssignmentIndentation:
|
|
35
|
+
Exclude:
|
|
36
|
+
- 'lib/glossarist/sts/term_mapper.rb'
|
|
37
|
+
|
|
38
|
+
# Offense count: 6
|
|
23
39
|
# This cop supports safe autocorrection (--autocorrect).
|
|
24
40
|
# Configuration parameters: EnforcedStyleAlignWith.
|
|
25
41
|
# SupportedStylesAlignWith: either, start_of_block, start_of_line
|
|
26
42
|
Layout/BlockAlignment:
|
|
27
43
|
Exclude:
|
|
28
|
-
- 'lib/glossarist/
|
|
44
|
+
- 'lib/glossarist/sts/term_extractor.rb'
|
|
45
|
+
- 'spec/unit/sts/term_extractor_spec.rb'
|
|
46
|
+
- 'spec/unit/sts/term_mapper_spec.rb'
|
|
29
47
|
|
|
30
|
-
# Offense count:
|
|
48
|
+
# Offense count: 6
|
|
31
49
|
# This cop supports safe autocorrection (--autocorrect).
|
|
32
50
|
Layout/BlockEndNewline:
|
|
33
51
|
Exclude:
|
|
34
|
-
- 'lib/glossarist/
|
|
52
|
+
- 'lib/glossarist/sts/term_extractor.rb'
|
|
53
|
+
- 'spec/unit/sts/term_extractor_spec.rb'
|
|
54
|
+
- 'spec/unit/sts/term_mapper_spec.rb'
|
|
35
55
|
|
|
36
|
-
# Offense count:
|
|
56
|
+
# Offense count: 1
|
|
57
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
58
|
+
# Configuration parameters: AllowMultipleStyles, EnforcedHashRocketStyle, EnforcedColonStyle, EnforcedLastArgumentHashStyle.
|
|
59
|
+
# SupportedHashRocketStyles: key, separator, table
|
|
60
|
+
# SupportedColonStyles: key, separator, table
|
|
61
|
+
# SupportedLastArgumentHashStyles: always_inspect, always_ignore, ignore_implicit, ignore_explicit
|
|
62
|
+
Layout/HashAlignment:
|
|
63
|
+
Exclude:
|
|
64
|
+
- 'lib/glossarist/sts/importer.rb'
|
|
65
|
+
|
|
66
|
+
# Offense count: 12
|
|
37
67
|
# This cop supports safe autocorrection (--autocorrect).
|
|
38
68
|
# Configuration parameters: Width, EnforcedStyleAlignWith, AllowedPatterns.
|
|
39
69
|
# SupportedStylesAlignWith: start_of_line, relative_to_receiver
|
|
40
70
|
Layout/IndentationWidth:
|
|
41
71
|
Exclude:
|
|
42
|
-
- 'lib/glossarist/
|
|
72
|
+
- 'lib/glossarist/sts/term_extractor.rb'
|
|
73
|
+
- 'spec/unit/sts/term_extractor_spec.rb'
|
|
74
|
+
- 'spec/unit/sts/term_mapper_spec.rb'
|
|
43
75
|
|
|
44
|
-
# Offense count:
|
|
76
|
+
# Offense count: 236
|
|
45
77
|
# This cop supports safe autocorrection (--autocorrect).
|
|
46
78
|
# Configuration parameters: Max, AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
|
|
47
79
|
# URISchemes: http, https
|
|
48
80
|
Layout/LineLength:
|
|
49
81
|
Enabled: false
|
|
50
82
|
|
|
51
|
-
# Offense count:
|
|
83
|
+
# Offense count: 7
|
|
52
84
|
# This cop supports safe autocorrection (--autocorrect).
|
|
53
85
|
# Configuration parameters: AllowInHeredoc.
|
|
54
86
|
Layout/TrailingWhitespace:
|
|
55
87
|
Exclude:
|
|
56
|
-
- '
|
|
88
|
+
- 'lib/glossarist/sts/import_result.rb'
|
|
89
|
+
- 'lib/glossarist/sts/importer.rb'
|
|
90
|
+
- 'lib/glossarist/sts/term_mapper.rb'
|
|
57
91
|
|
|
58
92
|
# Offense count: 1
|
|
59
93
|
# Configuration parameters: AllowedMethods.
|
|
@@ -106,12 +140,12 @@ Metrics/CyclomaticComplexity:
|
|
|
106
140
|
- 'lib/glossarist/transforms/concept_to_skos_transform.rb'
|
|
107
141
|
- 'lib/glossarist/transforms/concept_to_tbx_transform.rb'
|
|
108
142
|
|
|
109
|
-
# Offense count:
|
|
143
|
+
# Offense count: 47
|
|
110
144
|
# Configuration parameters: CountComments, CountAsOne, AllowedMethods, AllowedPatterns.
|
|
111
145
|
Metrics/MethodLength:
|
|
112
146
|
Max: 42
|
|
113
147
|
|
|
114
|
-
# Offense count:
|
|
148
|
+
# Offense count: 4
|
|
115
149
|
# Configuration parameters: CountKeywordArgs, MaxOptionalParameters.
|
|
116
150
|
Metrics/ParameterLists:
|
|
117
151
|
Max: 6
|
|
@@ -145,7 +179,7 @@ Naming/VariableNumber:
|
|
|
145
179
|
Exclude:
|
|
146
180
|
- 'spec/unit/rdf/skos_vocabulary_spec.rb'
|
|
147
181
|
|
|
148
|
-
# Offense count:
|
|
182
|
+
# Offense count: 9
|
|
149
183
|
# This cop supports safe autocorrection (--autocorrect).
|
|
150
184
|
# Configuration parameters: EnforcedStyle, ProceduralMethods, FunctionalMethods, AllowedMethods, AllowedPatterns, AllowBracesOnProceduralOneLiners, BracesRequiredMethods.
|
|
151
185
|
# SupportedStyles: line_count_based, semantic, braces_for_chaining, always_braces
|
|
@@ -154,7 +188,9 @@ Naming/VariableNumber:
|
|
|
154
188
|
# AllowedMethods: lambda, proc, it
|
|
155
189
|
Style/BlockDelimiters:
|
|
156
190
|
Exclude:
|
|
157
|
-
- 'lib/glossarist/
|
|
191
|
+
- 'lib/glossarist/sts/term_extractor.rb'
|
|
192
|
+
- 'spec/unit/sts/term_extractor_spec.rb'
|
|
193
|
+
- 'spec/unit/sts/term_mapper_spec.rb'
|
|
158
194
|
|
|
159
195
|
# Offense count: 6
|
|
160
196
|
# This cop supports safe autocorrection (--autocorrect).
|
|
@@ -163,6 +199,12 @@ Style/BlockDelimiters:
|
|
|
163
199
|
Style/FormatStringToken:
|
|
164
200
|
EnforcedStyle: unannotated
|
|
165
201
|
|
|
202
|
+
# Offense count: 2
|
|
203
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
204
|
+
Style/MultilineIfModifier:
|
|
205
|
+
Exclude:
|
|
206
|
+
- 'lib/glossarist/sts/importer.rb'
|
|
207
|
+
|
|
166
208
|
# Offense count: 1
|
|
167
209
|
# Configuration parameters: AllowedClasses.
|
|
168
210
|
Style/OneClassPerFile:
|
data/Gemfile
CHANGED
|
@@ -6,29 +6,13 @@ gemspec
|
|
|
6
6
|
|
|
7
7
|
gem "canon"
|
|
8
8
|
gem "lutaml-model", "~> 0.8.0"
|
|
9
|
+
gem "nokogiri"
|
|
9
10
|
gem "rake", "~> 13.0"
|
|
11
|
+
gem "relaton", "~> 2.1.0"
|
|
10
12
|
gem "rspec", "~> 3.0"
|
|
11
13
|
gem "rubocop"
|
|
12
14
|
gem "rubocop-performance"
|
|
13
15
|
gem "rubocop-rake"
|
|
14
16
|
gem "rubocop-rspec"
|
|
17
|
+
gem "sts", "~> 0.5.6"
|
|
15
18
|
gem "tbx", "~> 0.1"
|
|
16
|
-
|
|
17
|
-
# Override relaton gems with lutaml-model 0.8 compatible versions.
|
|
18
|
-
# Released 2.0.0 gems have untyped lutaml-model attributes that fail with 0.8+.
|
|
19
|
-
# lutaml-integration branches have typed attributes and relaton-bib ~> 2.1.0.
|
|
20
|
-
# TODO: Remove once relaton gems release versions with lutaml-model 0.8 support.
|
|
21
|
-
gem "relaton-3gpp", github: "relaton/relaton-3gpp",
|
|
22
|
-
branch: "lutaml-integration"
|
|
23
|
-
gem "relaton-bib", github: "relaton/relaton-bib", branch: "lutaml-integration"
|
|
24
|
-
gem "relaton-bipm", github: "relaton/relaton-bipm",
|
|
25
|
-
branch: "lutaml-integration"
|
|
26
|
-
gem "relaton-bsi", github: "relaton/relaton-bsi", branch: "lutaml-integration"
|
|
27
|
-
gem "relaton-calconnect", github: "relaton/relaton-calconnect",
|
|
28
|
-
branch: "lutaml-integration"
|
|
29
|
-
gem "relaton-ccsds", github: "relaton/relaton-ccsds",
|
|
30
|
-
branch: "lutaml-integration"
|
|
31
|
-
gem "relaton-cen", github: "relaton/relaton-cen", branch: "lutaml-integration"
|
|
32
|
-
gem "relaton-iec", github: "relaton/relaton-iec", branch: "lutaml-integration"
|
|
33
|
-
gem "relaton-iso", github: "relaton/relaton-iso", branch: "lutaml-integration"
|
|
34
|
-
gem "relaton-itu", github: "relaton/relaton-itu", branch: "lutaml-integration"
|
data/README.adoc
CHANGED
|
@@ -507,6 +507,123 @@ puts skos.to_jsonld
|
|
|
507
507
|
puts skos.to_turtle
|
|
508
508
|
----
|
|
509
509
|
|
|
510
|
+
=== import
|
|
511
|
+
|
|
512
|
+
Import terminology concepts from STS XML files into a new or existing dataset.
|
|
513
|
+
|
|
514
|
+
[,bash]
|
|
515
|
+
----
|
|
516
|
+
# Import one or more STS XML files into a new dataset directory
|
|
517
|
+
glossarist import iso-8373.xml -o output_dir
|
|
518
|
+
|
|
519
|
+
# Import into a new GCR package (--shortname and --version required)
|
|
520
|
+
glossarist import iso-8373.xml -o iso-8373.gcr \
|
|
521
|
+
--shortname iso-8373 --version 1.0.0 --title "ISO 8373 Robotics"
|
|
522
|
+
|
|
523
|
+
# Import multiple files into a new dataset
|
|
524
|
+
glossarist import iso-8373.xml iso-9000.xml -o combined_dataset
|
|
525
|
+
|
|
526
|
+
# Import into an existing dataset (dedup by designation + domain)
|
|
527
|
+
glossarist import iso-8373.xml --into existing_dataset/
|
|
528
|
+
|
|
529
|
+
# Import into an existing GCR (re-packages automatically)
|
|
530
|
+
glossarist import iso-8373.xml --into existing.gcr
|
|
531
|
+
|
|
532
|
+
# Control duplicate handling
|
|
533
|
+
glossarist import iso-8373.xml --into existing_dataset/ --on-duplicate replace
|
|
534
|
+
----
|
|
535
|
+
|
|
536
|
+
Deduplication is based on **designation + domain** (case-insensitive). When
|
|
537
|
+
duplicates are found, the `--on-duplicate` strategy determines the behavior:
|
|
538
|
+
|
|
539
|
+
[cols="1,2"]
|
|
540
|
+
|===
|
|
541
|
+
|`skip` (default)
|
|
542
|
+
|Keep the existing concept, skip the new one
|
|
543
|
+
|
|
544
|
+
|`replace`
|
|
545
|
+
|Replace the existing concept with the new one
|
|
546
|
+
|
|
547
|
+
|`merge`
|
|
548
|
+
|Add new localizations to the existing concept (e.g. add French to an English-only concept)
|
|
549
|
+
|===
|
|
550
|
+
|
|
551
|
+
Options:
|
|
552
|
+
[cols="1,1"]
|
|
553
|
+
|===
|
|
554
|
+
|o, --output
|
|
555
|
+
|Output directory or `.gcr` file path (new dataset)
|
|
556
|
+
|
|
557
|
+
|--into
|
|
558
|
+
|Path to existing dataset directory or `.gcr` file to merge into
|
|
559
|
+
|
|
560
|
+
|--shortname
|
|
561
|
+
|Dataset shortname (required for GCR output)
|
|
562
|
+
|
|
563
|
+
|--version
|
|
564
|
+
|Dataset version (required for GCR output)
|
|
565
|
+
|
|
566
|
+
|--title
|
|
567
|
+
|Dataset title
|
|
568
|
+
|
|
569
|
+
|--description
|
|
570
|
+
|Dataset description
|
|
571
|
+
|
|
572
|
+
|--owner
|
|
573
|
+
|Dataset owner
|
|
574
|
+
|
|
575
|
+
|--uri-prefix
|
|
576
|
+
|URI prefix for the dataset
|
|
577
|
+
|
|
578
|
+
|--on-duplicate
|
|
579
|
+
|How to handle duplicates: `skip`, `replace`, or `merge`
|
|
580
|
+
|===
|
|
581
|
+
|
|
582
|
+
Ruby API:
|
|
583
|
+
[,ruby]
|
|
584
|
+
----
|
|
585
|
+
require "glossarist/sts"
|
|
586
|
+
|
|
587
|
+
importer = Glossarist::Sts::Importer.new
|
|
588
|
+
|
|
589
|
+
# Import into a new dataset directory
|
|
590
|
+
result = importer.import_new(
|
|
591
|
+
["iso-8373.xml", "iso-9000.xml"],
|
|
592
|
+
output: "output_dir",
|
|
593
|
+
)
|
|
594
|
+
puts result.concepts.length # total concepts imported
|
|
595
|
+
puts result.conflicts.length # duplicates detected
|
|
596
|
+
puts result.skipped_count # skipped (strategy: skip)
|
|
597
|
+
|
|
598
|
+
# Import into a new GCR package
|
|
599
|
+
result = importer.import_new(
|
|
600
|
+
["iso-8373.xml"],
|
|
601
|
+
output: "iso-8373.gcr",
|
|
602
|
+
shortname: "iso-8373",
|
|
603
|
+
version: "1.0.0",
|
|
604
|
+
title: "ISO 8373 Robotics Vocabulary",
|
|
605
|
+
)
|
|
606
|
+
|
|
607
|
+
# Import into an existing dataset with merge strategy
|
|
608
|
+
importer = Glossarist::Sts::Importer.new(duplicate_strategy: :merge)
|
|
609
|
+
result = importer.import_into_existing(
|
|
610
|
+
["french_supplement.xml"],
|
|
611
|
+
"existing_dataset/",
|
|
612
|
+
)
|
|
613
|
+
result.concepts.each do |mc|
|
|
614
|
+
puts "#{mc.data.id}: #{mc.localizations.keys.join(', ')}"
|
|
615
|
+
end
|
|
616
|
+
----
|
|
617
|
+
|
|
618
|
+
==== Import result
|
|
619
|
+
|
|
620
|
+
`import_new` and `import_into_existing` return an `ImportResult` with:
|
|
621
|
+
|
|
622
|
+
concepts:: `Array<ManagedConcept>` — the imported concepts
|
|
623
|
+
conflicts:: `Array<DuplicateConflict>` — duplicate pairs detected by designation + domain
|
|
624
|
+
source_files:: `Array<String>` — the input file paths
|
|
625
|
+
skipped_count:: `Integer` — concepts skipped due to duplicates (strategy: skip)
|
|
626
|
+
|
|
510
627
|
=== validate
|
|
511
628
|
|
|
512
629
|
Validate a dataset directory or `.gcr` file for schema compliance.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Glossarist
|
|
4
|
+
class CLI
|
|
5
|
+
class ImportCommand
|
|
6
|
+
def initialize(files, options)
|
|
7
|
+
@files = files
|
|
8
|
+
@options = options
|
|
9
|
+
end
|
|
10
|
+
|
|
11
|
+
def run
|
|
12
|
+
importer = Sts::Importer.new(
|
|
13
|
+
duplicate_strategy: @options[:on_duplicate]&.to_sym || :skip,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
result = if @options[:into]
|
|
17
|
+
importer.import_into_existing(@files, @options[:into])
|
|
18
|
+
else
|
|
19
|
+
importer.import_new(@files, **import_new_args)
|
|
20
|
+
end
|
|
21
|
+
|
|
22
|
+
print_summary(result)
|
|
23
|
+
rescue ArgumentError => e
|
|
24
|
+
warn "Error: #{e.message}"
|
|
25
|
+
exit 1
|
|
26
|
+
end
|
|
27
|
+
|
|
28
|
+
private
|
|
29
|
+
|
|
30
|
+
def import_new_args
|
|
31
|
+
{
|
|
32
|
+
output: @options[:output],
|
|
33
|
+
shortname: @options[:shortname],
|
|
34
|
+
version: @options[:version],
|
|
35
|
+
title: @options[:title],
|
|
36
|
+
description: @options[:description],
|
|
37
|
+
owner: @options[:owner],
|
|
38
|
+
uri_prefix: @options[:uri_prefix],
|
|
39
|
+
}
|
|
40
|
+
end
|
|
41
|
+
|
|
42
|
+
def print_summary(result) # rubocop:disable Metrics/AbcSize
|
|
43
|
+
dest = @options[:into] || @options[:output]
|
|
44
|
+
puts "Imported #{result.concepts.length} concepts to #{dest}"
|
|
45
|
+
puts " Source files: #{@files.join(', ')}" if @files.any?
|
|
46
|
+
return unless result.conflict?
|
|
47
|
+
|
|
48
|
+
puts " #{result.conflicts.length} duplicate(s) detected " \
|
|
49
|
+
"(strategy: #{@options[:on_duplicate] || 'skip'})"
|
|
50
|
+
puts " #{result.skipped_count} concept(s) skipped" if result.skipped_count.positive?
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
end
|
|
54
|
+
end
|
data/lib/glossarist/cli.rb
CHANGED
|
@@ -4,6 +4,11 @@ require "thor"
|
|
|
4
4
|
|
|
5
5
|
module Glossarist
|
|
6
6
|
class CLI < Thor
|
|
7
|
+
autoload :UpgradeCommand, "#{__dir__}/cli/upgrade_command"
|
|
8
|
+
autoload :PackageCommand, "#{__dir__}/cli/package_command"
|
|
9
|
+
autoload :ValidateCommand, "#{__dir__}/cli/validate_command"
|
|
10
|
+
autoload :ImportCommand, "#{__dir__}/cli/import_command"
|
|
11
|
+
autoload :ExportCommand, "#{__dir__}/cli/export_command"
|
|
7
12
|
desc "generate_latex", "Convert Concepts to Latex format"
|
|
8
13
|
|
|
9
14
|
option :concepts_path, aliases: :p, required: true,
|
|
@@ -38,8 +43,7 @@ module Glossarist
|
|
|
38
43
|
option :dry_run, type: :boolean, default: false,
|
|
39
44
|
desc: "Show what would change without writing"
|
|
40
45
|
def upgrade(source_dir)
|
|
41
|
-
|
|
42
|
-
Glossarist::CLI::UpgradeCommand.new(source_dir, options).run
|
|
46
|
+
CLI::UpgradeCommand.new(source_dir, options).run
|
|
43
47
|
end
|
|
44
48
|
|
|
45
49
|
desc "package DIR", "Create a .gcr ZIP archive from a schema v1 dataset"
|
|
@@ -62,8 +66,7 @@ module Glossarist
|
|
|
62
66
|
option :concept_uri_template, type: :string,
|
|
63
67
|
desc: "URI template for concept URIs"
|
|
64
68
|
def package(dir)
|
|
65
|
-
|
|
66
|
-
Glossarist::CLI::PackageCommand.new(dir, options).run
|
|
69
|
+
CLI::PackageCommand.new(dir, options).run
|
|
67
70
|
end
|
|
68
71
|
|
|
69
72
|
desc "validate PATH",
|
|
@@ -76,8 +79,27 @@ module Glossarist
|
|
|
76
79
|
option :reference_path, type: :string,
|
|
77
80
|
desc: "Path to directory of .gcr files for cross-dataset reference validation"
|
|
78
81
|
def validate(path)
|
|
79
|
-
|
|
80
|
-
|
|
82
|
+
CLI::ValidateCommand.new(path, options).run
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
desc "import FILES...", "Import terms from STS XML files"
|
|
86
|
+
option :output, aliases: :o, type: :string,
|
|
87
|
+
desc: "Output directory or .gcr file path (new dataset)"
|
|
88
|
+
option :into, type: :string,
|
|
89
|
+
desc: "Path to existing dataset directory or .gcr file to merge into"
|
|
90
|
+
option :shortname, type: :string,
|
|
91
|
+
desc: "Dataset shortname (required for GCR output)"
|
|
92
|
+
option :version, type: :string,
|
|
93
|
+
desc: "Dataset version (required for GCR output)"
|
|
94
|
+
option :title, type: :string, desc: "Dataset title"
|
|
95
|
+
option :description, type: :string, desc: "Dataset description"
|
|
96
|
+
option :owner, type: :string, desc: "Dataset owner"
|
|
97
|
+
option :uri_prefix, type: :string, desc: "URI prefix for the dataset"
|
|
98
|
+
option :on_duplicate, type: :string, default: "skip",
|
|
99
|
+
enum: %w[skip replace merge],
|
|
100
|
+
desc: "How to handle duplicate concepts (designation + domain)"
|
|
101
|
+
def import(*files)
|
|
102
|
+
CLI::ImportCommand.new(files, options).run
|
|
81
103
|
end
|
|
82
104
|
|
|
83
105
|
desc "export PATH", "Export concepts in machine-readable formats"
|
|
@@ -95,8 +117,7 @@ module Glossarist
|
|
|
95
117
|
option :title, type: :string,
|
|
96
118
|
desc: "Dataset title for document header"
|
|
97
119
|
def export(path)
|
|
98
|
-
|
|
99
|
-
Glossarist::CLI::ExportCommand.new(path, options).run
|
|
120
|
+
CLI::ExportCommand.new(path, options).run
|
|
100
121
|
end
|
|
101
122
|
|
|
102
123
|
def method_missing(*args)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Glossarist
|
|
4
|
+
module Sts
|
|
5
|
+
ExtractedLangSet = Struct.new(
|
|
6
|
+
:language_code,
|
|
7
|
+
:definition_text,
|
|
8
|
+
:note_texts,
|
|
9
|
+
:example_texts,
|
|
10
|
+
:source_texts,
|
|
11
|
+
:domain,
|
|
12
|
+
:designations,
|
|
13
|
+
keyword_init: true,
|
|
14
|
+
)
|
|
15
|
+
end
|
|
16
|
+
end
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Glossarist
|
|
4
|
+
module Sts
|
|
5
|
+
DuplicateConflict = Struct.new(:new_concept, :existing_concept, :key,
|
|
6
|
+
keyword_init: true)
|
|
7
|
+
|
|
8
|
+
class ImportResult
|
|
9
|
+
attr_reader :concepts, :conflicts, :source_files, :skipped_count
|
|
10
|
+
|
|
11
|
+
def initialize(concepts:, conflicts: [], source_files: [],
|
|
12
|
+
skipped_count: 0)
|
|
13
|
+
@concepts = concepts
|
|
14
|
+
@conflicts = conflicts
|
|
15
|
+
@source_files = source_files
|
|
16
|
+
@skipped_count = skipped_count
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
def conflict?
|
|
20
|
+
!conflicts.empty?
|
|
21
|
+
end
|
|
22
|
+
end
|
|
23
|
+
end
|
|
24
|
+
end
|
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "tmpdir"
|
|
4
|
+
require_relative "import_result"
|
|
5
|
+
|
|
6
|
+
module Glossarist
|
|
7
|
+
module Sts
|
|
8
|
+
class Importer
|
|
9
|
+
STRATEGIES = %i[skip replace merge].freeze
|
|
10
|
+
|
|
11
|
+
attr_reader :duplicate_strategy
|
|
12
|
+
|
|
13
|
+
def initialize(duplicate_strategy: :skip)
|
|
14
|
+
unless STRATEGIES.include?(duplicate_strategy)
|
|
15
|
+
raise ArgumentError,
|
|
16
|
+
"duplicate_strategy must be one of #{STRATEGIES.join(', ')}, got #{duplicate_strategy}"
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
@duplicate_strategy = duplicate_strategy
|
|
20
|
+
@mapper = TermMapper.new
|
|
21
|
+
end
|
|
22
|
+
|
|
23
|
+
def import_new(xml_files, output:, shortname: nil, version: nil, **opts)
|
|
24
|
+
raw_concepts = extract_all_concepts(xml_files)
|
|
25
|
+
concepts, conflicts, skipped = dedup_concepts(raw_concepts)
|
|
26
|
+
|
|
27
|
+
if output.end_with?(".gcr")
|
|
28
|
+
unless shortname
|
|
29
|
+
raise ArgumentError,
|
|
30
|
+
"--shortname is required for GCR output"
|
|
31
|
+
end
|
|
32
|
+
unless version
|
|
33
|
+
raise ArgumentError,
|
|
34
|
+
"--version is required for GCR output"
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
create_gcr(concepts, output, shortname: shortname, version: version,
|
|
38
|
+
**opts)
|
|
39
|
+
else
|
|
40
|
+
save_dataset(concepts, output)
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
ImportResult.new(
|
|
44
|
+
concepts: concepts,
|
|
45
|
+
conflicts: conflicts,
|
|
46
|
+
source_files: xml_files,
|
|
47
|
+
skipped_count: skipped,
|
|
48
|
+
)
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def import_into_existing(xml_files, dataset_path)
|
|
52
|
+
existing = load_existing(dataset_path)
|
|
53
|
+
new_concepts = extract_all_concepts(xml_files)
|
|
54
|
+
index = build_concept_index(existing)
|
|
55
|
+
|
|
56
|
+
result_state = apply_with_dedup(new_concepts, existing, index)
|
|
57
|
+
|
|
58
|
+
save_to_path(existing, dataset_path)
|
|
59
|
+
|
|
60
|
+
ImportResult.new(
|
|
61
|
+
concepts: existing.managed_concepts,
|
|
62
|
+
conflicts: result_state.conflicts,
|
|
63
|
+
source_files: xml_files,
|
|
64
|
+
skipped_count: result_state.skipped,
|
|
65
|
+
)
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
DedupState = Struct.new(:conflicts, :skipped, keyword_init: true)
|
|
69
|
+
|
|
70
|
+
private
|
|
71
|
+
|
|
72
|
+
def apply_with_dedup(new_concepts, existing, index)
|
|
73
|
+
state = DedupState.new(conflicts: [], skipped: 0)
|
|
74
|
+
|
|
75
|
+
new_concepts.each do |mc|
|
|
76
|
+
key = concept_key(mc)
|
|
77
|
+
existing_mc = index[key]
|
|
78
|
+
|
|
79
|
+
if existing_mc.nil?
|
|
80
|
+
existing.store(mc)
|
|
81
|
+
index[key] = mc
|
|
82
|
+
else
|
|
83
|
+
state.conflicts << DuplicateConflict.new(
|
|
84
|
+
new_concept: mc, existing_concept: existing_mc, key: key,
|
|
85
|
+
)
|
|
86
|
+
handle_duplicate(existing, existing_mc, mc, index, key, state)
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
state
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
def handle_duplicate(existing, old_mc, new_mc, index, key, state)
|
|
94
|
+
case duplicate_strategy
|
|
95
|
+
when :skip
|
|
96
|
+
state.skipped += 1
|
|
97
|
+
when :replace
|
|
98
|
+
replace_in_collection(existing, old_mc, new_mc)
|
|
99
|
+
index[key] = new_mc
|
|
100
|
+
when :merge
|
|
101
|
+
merge_concept(old_mc, new_mc)
|
|
102
|
+
end
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
def extract_all_concepts(xml_files)
|
|
106
|
+
xml_files.flat_map do |path|
|
|
107
|
+
extractor = TermExtractor.new(path)
|
|
108
|
+
terms = extractor.extract
|
|
109
|
+
terms.map { |t| @mapper.map(t) }
|
|
110
|
+
end
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
def dedup_concepts(concepts) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity
|
|
114
|
+
seen = {}
|
|
115
|
+
conflicts = []
|
|
116
|
+
skipped = 0
|
|
117
|
+
unique = []
|
|
118
|
+
|
|
119
|
+
concepts.each do |mc|
|
|
120
|
+
key = concept_key(mc)
|
|
121
|
+
if key.first.empty? || seen[key].nil?
|
|
122
|
+
unique << mc
|
|
123
|
+
seen[key] = mc unless key.first.empty?
|
|
124
|
+
else
|
|
125
|
+
conflicts << DuplicateConflict.new(
|
|
126
|
+
new_concept: mc, existing_concept: seen[key], key: key,
|
|
127
|
+
)
|
|
128
|
+
skipped += apply_dedup_to_unique(unique, seen, mc, key)
|
|
129
|
+
end
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
[unique, conflicts, skipped]
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
def apply_dedup_to_unique(unique, seen, new_mc, key)
|
|
136
|
+
case duplicate_strategy
|
|
137
|
+
when :skip
|
|
138
|
+
1
|
|
139
|
+
when :replace
|
|
140
|
+
unique.delete(seen[key])
|
|
141
|
+
unique << new_mc
|
|
142
|
+
seen[key] = new_mc
|
|
143
|
+
0
|
|
144
|
+
when :merge
|
|
145
|
+
merge_concept(seen[key], new_mc)
|
|
146
|
+
0
|
|
147
|
+
end
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
def concept_key(managed_concept)
|
|
151
|
+
designation = managed_concept.default_designation.to_s.downcase.strip
|
|
152
|
+
domain = begin
|
|
153
|
+
l10n = managed_concept.default_lang
|
|
154
|
+
l10n&.data&.domain.to_s.downcase.strip
|
|
155
|
+
end
|
|
156
|
+
[designation, domain]
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
def build_concept_index(collection)
|
|
160
|
+
index = {}
|
|
161
|
+
collection.each do |mc|
|
|
162
|
+
key = concept_key(mc)
|
|
163
|
+
index[key] = mc unless key.first.empty?
|
|
164
|
+
end
|
|
165
|
+
index
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
def merge_concept(existing_mc, new_mc)
|
|
169
|
+
new_mc.localizations.each do |l10n|
|
|
170
|
+
lang = l10n.language_code
|
|
171
|
+
if existing_mc.localization(lang).nil?
|
|
172
|
+
existing_mc.add_localization(l10n)
|
|
173
|
+
end
|
|
174
|
+
end
|
|
175
|
+
end
|
|
176
|
+
|
|
177
|
+
def replace_in_collection(collection, old_mc, new_mc)
|
|
178
|
+
collection.managed_concepts.delete(old_mc)
|
|
179
|
+
collection.store(new_mc)
|
|
180
|
+
end
|
|
181
|
+
|
|
182
|
+
def load_existing(path)
|
|
183
|
+
collection = ManagedConceptCollection.new
|
|
184
|
+
if path.end_with?(".gcr")
|
|
185
|
+
package = GcrPackage.load(path)
|
|
186
|
+
package.concepts.each { |mc| collection.store(mc) }
|
|
187
|
+
else
|
|
188
|
+
concepts = ConceptCollector.collect(path)
|
|
189
|
+
concepts.each { |mc| collection.store(mc) }
|
|
190
|
+
end
|
|
191
|
+
collection
|
|
192
|
+
end
|
|
193
|
+
|
|
194
|
+
def save_to_path(collection, path)
|
|
195
|
+
if path.end_with?(".gcr")
|
|
196
|
+
tmpdir = build_temp_dataset(collection.managed_concepts)
|
|
197
|
+
begin
|
|
198
|
+
GC.start
|
|
199
|
+
tmp_gcr = "#{path}.tmp.#{Process.pid}"
|
|
200
|
+
GcrPackage.create_from_directory(
|
|
201
|
+
tmpdir,
|
|
202
|
+
output: tmp_gcr,
|
|
203
|
+
shortname: File.basename(path, ".gcr"),
|
|
204
|
+
version: "1.0.0",
|
|
205
|
+
)
|
|
206
|
+
FileUtils.rm_f(path)
|
|
207
|
+
FileUtils.mv(tmp_gcr, path)
|
|
208
|
+
ensure
|
|
209
|
+
FileUtils.rm_rf(tmpdir)
|
|
210
|
+
FileUtils.rm_f(tmp_gcr)
|
|
211
|
+
end
|
|
212
|
+
else
|
|
213
|
+
save_dataset(collection.managed_concepts, path)
|
|
214
|
+
end
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
def save_dataset(concepts, dir)
|
|
218
|
+
concepts_dir = File.join(dir, "concepts")
|
|
219
|
+
FileUtils.mkdir_p(concepts_dir)
|
|
220
|
+
collection = ManagedConceptCollection.new
|
|
221
|
+
concepts.each { |mc| collection.store(mc) }
|
|
222
|
+
collection.save_grouped_concepts_to_files(concepts_dir)
|
|
223
|
+
end
|
|
224
|
+
|
|
225
|
+
def create_gcr(concepts, output, shortname:, version:, **opts)
|
|
226
|
+
tmpdir = build_temp_dataset(concepts)
|
|
227
|
+
begin
|
|
228
|
+
GcrPackage.create_from_directory(
|
|
229
|
+
tmpdir,
|
|
230
|
+
output: output,
|
|
231
|
+
shortname: shortname,
|
|
232
|
+
version: version,
|
|
233
|
+
**opts,
|
|
234
|
+
)
|
|
235
|
+
ensure
|
|
236
|
+
FileUtils.rm_rf(tmpdir)
|
|
237
|
+
end
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
def build_temp_dataset(concepts)
|
|
241
|
+
tmpdir = Dir.mktmpdir("glossarist-sts-import")
|
|
242
|
+
concepts_dir = File.join(tmpdir, "concepts")
|
|
243
|
+
FileUtils.mkdir_p(concepts_dir)
|
|
244
|
+
|
|
245
|
+
collection = ManagedConceptCollection.new
|
|
246
|
+
concepts.each { |mc| collection.store(mc) }
|
|
247
|
+
collection.save_grouped_concepts_to_files(concepts_dir)
|
|
248
|
+
|
|
249
|
+
tmpdir
|
|
250
|
+
end
|
|
251
|
+
end
|
|
252
|
+
end
|
|
253
|
+
end
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Glossarist
  module Sts
    # Walks an ISO STS XML document (parsed via the `sts` gem) and pulls
    # every <term-sec> out of the body, turning each one into an
    # ExtractedTerm value object.
    class TermExtractor
      # @param xml_path [String] path to an ISO STS XML file
      def initialize(xml_path)
        xml = File.read(xml_path)
        @standard = ::Sts::IsoSts::Standard.from_xml(xml)
        @source_ref = extract_source_ref
      end

      # @return [Array<Sts::ExtractedTerm>] one entry per term-sec that
      #   carries a term-entry; term-secs without one are skipped.
      def extract
        collect_term_secs.filter_map do |sec|
          build_extracted_term(sec) if sec.term_entry
        end
      end

      private

      # Gathers every term-sec reachable from the document body, in
      # document order. Empty array when the body is missing.
      def collect_term_secs
        found = []
        walk_sections(@standard.body, found) if @standard.body
        found
      end

      # Recursive traversal: harvest this container's own term-secs,
      # then descend into its ordinary child sections.
      def walk_sections(container, found)
        collect_term_secs_from(container, found)
        walk_child_secs(container, found)
      end

      # Collects direct term-secs and recurses into nested ones
      # (ISO STS allows term-secs inside term-secs).
      def collect_term_secs_from(container, found)
        container.term_sec&.each do |sec|
          found << sec
          walk_sections(sec, found) if sec.term_sec&.any?
        end
      end

      def walk_child_secs(container, found)
        container_child_secs(container)&.each do |child|
          walk_sections(child, found)
        end
      end

      # Only Body and Sec expose plain child sections; any other
      # container type yields nil (no descent).
      def container_child_secs(container)
        case container
        when ::Sts::IsoSts::Body, ::Sts::IsoSts::Sec
          container.sec
        end
      end

      def build_extracted_term(term_sec)
        entry = term_sec.term_entry

        Sts::ExtractedTerm.new(
          id: entry.id,
          label: extract_label(term_sec),
          source_ref: @source_ref,
          lang_sets: entry.lang_set.filter_map { |set| build_lang_set(set) },
        )
      end

      # Stripped label text, or nil when the term-sec has no label
      # element. NOTE(review): an empty <label/> yields "" (not nil),
      # which downstream consumers may treat as a present label —
      # confirm that is intended.
      def extract_label(term_sec)
        label = term_sec.label
        return nil unless label

        label.content&.join.to_s.strip
      end

      def build_lang_set(lang_set)
        Sts::ExtractedLangSet.new(
          language_code: Sts.convert_language_code(lang_set.lang.to_s),
          definition_text: extract_definition_text(lang_set),
          note_texts: joined_texts(lang_set.note),
          example_texts: joined_texts(lang_set.example),
          source_texts: joined_texts(lang_set.source),
          domain: extract_subject_field(lang_set),
          designations: lang_set.tig.filter_map { |tig| build_designation(tig) },
        )
      end

      # Only the first <def> is used; additional definitions are
      # ignored. Missing definition maps to "".
      def extract_definition_text(lang_set)
        first_definition = lang_set.definition&.first
        return "" unless first_definition

        first_definition.value&.join.to_s.strip
      end

      # Shared text extraction for notes/examples/sources: join each
      # node's value parts, strip, and drop blank results.
      def joined_texts(nodes)
        nodes.filter_map do |node|
          text = node.value&.join.to_s.strip
          text unless text.empty?
        end
      end

      # First subject-field, stripped; nil when absent or blank.
      def extract_subject_field(lang_set)
        first_field = lang_set.subject_field&.first
        return nil unless first_field

        text = first_field.value&.join.to_s.strip
        text.empty? ? nil : text
      end

      def build_designation(tig)
        Sts::ExtractedDesignation.new(
          term: resolve_term_text(tig),
          type: map_term_type(tig),
          normative_status: map_normative_status(tig),
          part_of_speech: tig.pos&.value,
          abbreviation_type: map_abbreviation_type(tig),
        )
      end

      def resolve_term_text(tig)
        tig.term&.value&.join.to_s.strip
      end

      # Unknown or missing term types fall back to "expression".
      def map_term_type(tig)
        type_attr = tig.term_type&.value.to_s
        return "expression" if type_attr.empty?

        TERM_TYPE_MAP.fetch(type_attr, "expression")
      end

      # Distinguishes acronyms from other abbreviations; nil for
      # non-abbreviation term types.
      def map_abbreviation_type(tig)
        type_attr = tig.term_type&.value.to_s
        return nil unless TERM_TYPE_MAP[type_attr] == "abbreviation"

        type_attr == "acronym" ? "acronym" : "truncation"
      end

      def map_normative_status(tig)
        NORMATIVE_STATUS_MAP[tig.normative_authorization&.value.to_s]
      end

      # Picks the best std-ref from the front matter: a dated reference
      # beats an undated one, which beats whatever comes first. Returns
      # nil when the document carries no usable metadata.
      def extract_source_ref
        front = @standard.front
        return nil unless front

        meta = front.iso_meta || front.std_meta
        refs = meta&.std_ref
        return nil unless refs&.any?

        best = refs.find { |r| r.type == "dated" }
        best ||= refs.find { |r| r.type == "undated" }
        best ||= refs.first
        extract_ref_text(best)
      end

      # std-ref content may be a plain string value or mixed-content
      # parts; both normalize to a stripped string.
      def extract_ref_text(ref)
        text = ref.value.is_a?(String) ? ref.value : ref.content&.join
        text.to_s.strip
      end
    end
  end
end
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# frozen_string_literal: true

module Glossarist
  module Sts
    # Converts an ExtractedTerm (the STS-side intermediate produced by
    # TermExtractor) into a Glossarist::ManagedConcept with one
    # localized concept per language set.
    class TermMapper
      # @param extracted_term [Sts::ExtractedTerm]
      # @return [Glossarist::ManagedConcept]
      def map(extracted_term)
        # The human-readable label is preferred as concept id; the raw
        # XML id is only a fallback.
        concept_id = extracted_term.label || extracted_term.id
        managed = Glossarist::ManagedConcept.new(data: { id: concept_id })

        extracted_term.lang_sets.each do |lang_set|
          localized = build_localized_concept(lang_set, extracted_term.source_ref)
          managed.add_localization(localized)
        end

        managed
      end

      private

      def build_localized_concept(lang_set, source_ref)
        data = {
          "language_code" => lang_set.language_code,
          "terms" => lang_set.designations.map { |d| build_designation(d) },
          "definition" => build_definitions(lang_set.definition_text),
          "notes" => build_detailed_definitions(lang_set.note_texts),
          "examples" => build_detailed_definitions(lang_set.example_texts),
          "sources" => build_sources(lang_set.source_texts, source_ref),
          "domain" => lang_set.domain,
          "entry_status" => "valid",
        }

        Glossarist::LocalizedConcept.of_yaml("data" => data)
      end

      # A blank or missing definition maps to an empty list.
      def build_definitions(text)
        text.nil? || text.empty? ? [] : [{ "content" => text }]
      end

      # Wraps each non-blank text in a detailed-definition hash.
      def build_detailed_definitions(texts)
        texts.reject(&:empty?).map { |text| { "content" => text } }
      end

      # Dispatches on the designation type; anything unrecognised is
      # treated as a plain expression.
      def build_designation(ext_desig)
        case ext_desig.type
        when "abbreviation" then build_abbreviation_designation(ext_desig)
        when "symbol"       then build_symbol_designation(ext_desig)
        else                     build_expression_designation(ext_desig)
        end
      end

      # Keys shared by every designation kind; nil values (e.g. a
      # missing normative status) are dropped.
      def base_designation(type, ext_desig)
        {
          "type" => type,
          "designation" => ext_desig.term,
          "normative_status" => ext_desig.normative_status,
        }.compact
      end

      def build_expression_designation(ext_desig)
        designation = base_designation("expression", ext_desig)
        pos = ext_desig.part_of_speech
        designation["grammar_info"] = [{ "part_of_speech" => pos }] if pos
        designation
      end

      def build_abbreviation_designation(ext_desig)
        designation = base_designation("abbreviation", ext_desig)
        if ext_desig.abbreviation_type
          designation["abbreviation_type"] = ext_desig.abbreviation_type
        end
        designation
      end

      def build_symbol_designation(ext_desig)
        base_designation("symbol", ext_desig)
      end

      # One "identical"/authoritative entry for the standard's own
      # reference (when known), followed by an authoritative entry per
      # non-blank <source> text.
      def build_sources(source_texts, source_ref)
        document_entries =
          if source_ref
            [{
              "status" => "identical",
              "type" => "authoritative",
              "origin" => { "text" => source_ref },
            }]
          else
            []
          end

        text_entries = source_texts.reject(&:empty?).map do |text|
          { "type" => "authoritative", "origin" => { "text" => text } }
        end

        document_entries + text_entries
      end
    end
  end
end
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# frozen_string_literal: true

require "sts"

module Glossarist
  # Namespace for the ISO STS import pipeline: extraction of terms from
  # STS XML, mapping into Glossarist concepts, and the shared lookup
  # tables both sides use.
  module Sts
    autoload :ExtractedDesignation, "#{__dir__}/sts/extracted_designation"
    autoload :ExtractedLangSet, "#{__dir__}/sts/extracted_lang_set"
    autoload :ExtractedTerm, "#{__dir__}/sts/extracted_term"
    autoload :ImportResult, "#{__dir__}/sts/import_result"
    autoload :Importer, "#{__dir__}/sts/importer"
    autoload :TermExtractor, "#{__dir__}/sts/term_extractor"
    autoload :TermMapper, "#{__dir__}/sts/term_mapper"

    # ISO 639-1 (two-letter) to ISO 639-2/T (three-letter) language
    # code lookup table.
    ISO_639_1_TO_639_2 = {
      "aa" => "aar", "ab" => "abk", "af" => "afr", "ak" => "aka",
      "am" => "amh", "an" => "arg", "ar" => "ara", "as" => "asm",
      "av" => "ava", "ay" => "aym", "az" => "aze", "ba" => "bak",
      "be" => "bel", "bg" => "bul", "bh" => "bih", "bi" => "bis",
      "bm" => "bam", "bn" => "ben", "bo" => "bod", "br" => "bre",
      "bs" => "bos", "ca" => "cat", "ce" => "che", "ch" => "cha",
      "co" => "cos", "cr" => "cre", "cs" => "ces", "cu" => "chu",
      "cv" => "chv", "cy" => "cym", "da" => "dan", "de" => "deu",
      "dv" => "div", "dz" => "dzo", "ee" => "ewe", "el" => "ell",
      "en" => "eng", "eo" => "epo", "es" => "spa", "et" => "est",
      "eu" => "eus", "fa" => "fas", "ff" => "ful", "fi" => "fin",
      "fj" => "fij", "fo" => "fao", "fr" => "fra", "fy" => "fry",
      "ga" => "gle", "gd" => "gla", "gl" => "glg", "gn" => "grn",
      "gu" => "guj", "gv" => "glv", "ha" => "hau", "he" => "heb",
      "hi" => "hin", "ho" => "hmo", "hr" => "hrv", "ht" => "hat",
      "hu" => "hun", "hy" => "hye", "hz" => "her", "ia" => "ina",
      "id" => "ind", "ie" => "ile", "ig" => "ibo", "ii" => "iii",
      "ik" => "ipk", "io" => "ido", "is" => "isl", "it" => "ita",
      "iu" => "iku", "ja" => "jpn", "jv" => "jav", "ka" => "kat",
      "kg" => "kon", "ki" => "kik", "kj" => "kua", "kk" => "kaz",
      "kl" => "kal", "km" => "khm", "kn" => "kan", "ko" => "kor",
      "kr" => "kau", "ks" => "kas", "ku" => "kur", "kv" => "kom",
      "kw" => "cor", "ky" => "kir", "la" => "lat", "lb" => "ltz",
      "lg" => "lug", "li" => "lim", "ln" => "lin", "lo" => "lao",
      "lt" => "lit", "lu" => "lub", "lv" => "lav", "mg" => "mlg",
      "mh" => "mah", "mi" => "mri", "mk" => "mkd", "ml" => "mal",
      "mn" => "mon", "mr" => "mar", "ms" => "msa", "mt" => "mlt",
      "my" => "mya", "na" => "nau", "nb" => "nob", "nd" => "nde",
      "ne" => "nep", "ng" => "ndo", "nl" => "nld", "nn" => "nno",
      "no" => "nor", "nr" => "nbl", "nv" => "nav", "ny" => "nya",
      "oc" => "oci", "oj" => "oji", "om" => "orm", "or" => "ori",
      "os" => "oss", "pa" => "pan", "pi" => "pli", "pl" => "pol",
      "ps" => "pus", "pt" => "por", "qu" => "que", "rm" => "roh",
      "rn" => "run", "ro" => "ron", "ru" => "rus", "rw" => "kin",
      "sa" => "san", "sc" => "srd", "sd" => "snd", "se" => "sme",
      "sg" => "sag", "si" => "sin", "sk" => "slk", "sl" => "slv",
      "sm" => "smo", "sn" => "sna", "so" => "som", "sq" => "sqi",
      "sr" => "srp", "ss" => "ssw", "st" => "sot", "su" => "sun",
      "sv" => "swe", "sw" => "swa", "ta" => "tam", "te" => "tel",
      "tg" => "tgk", "th" => "tha", "ti" => "tir", "tk" => "tuk",
      "tl" => "tgl", "tn" => "tsn", "to" => "ton", "tr" => "tur",
      "ts" => "tso", "tt" => "tat", "tw" => "twi", "ty" => "tah",
      "ug" => "uig", "uk" => "ukr", "ur" => "urd", "uz" => "uzb",
      "ve" => "ven", "vi" => "vie", "vo" => "vol", "wa" => "wln",
      "wo" => "wol", "xh" => "xho", "yi" => "yid", "yo" => "yor",
      "za" => "zha", "zh" => "zho", "zu" => "zul"
    }.freeze

    # STS term-type attribute values mapped onto Glossarist designation
    # types; values not listed here are handled as "expression" by the
    # callers.
    TERM_TYPE_MAP = {
      "acronym" => "abbreviation",
      "abbreviation" => "abbreviation",
      "fullForm" => "expression",
      "symbol" => "symbol",
      "variant" => "expression",
      "equation" => "expression",
      "formula" => "expression",
    }.freeze

    # STS normative-authorization values mapped onto Glossarist
    # normative statuses; unlisted values yield nil.
    NORMATIVE_STATUS_MAP = {
      "preferredTerm" => "preferred",
      "admittedTerm" => "admitted",
      "deprecatedTerm" => "deprecated",
    }.freeze

    # Normalizes a language code to three letters where possible.
    # Three-letter (and nil) input is returned untouched; unknown
    # two-letter codes fall through unchanged.
    # NOTE(review): lookup is case-sensitive — an uppercase "EN" would
    # not be converted; confirm upstream codes are always lowercase.
    def self.convert_language_code(code)
      return code if code.nil? || code.length == 3

      ISO_639_1_TO_639_2.fetch(code, code)
    end
  end
end
|
data/lib/glossarist/version.rb
CHANGED
data/lib/glossarist.rb
CHANGED
|
@@ -7,14 +7,13 @@ require "psych"
|
|
|
7
7
|
require "thor"
|
|
8
8
|
require "lutaml/model"
|
|
9
9
|
|
|
10
|
-
require_relative "glossarist/glossary_definition"
|
|
11
|
-
|
|
12
10
|
module Glossarist
|
|
13
11
|
autoload :Asset, "glossarist/asset"
|
|
14
12
|
autoload :Citation, "glossarist/citation"
|
|
15
13
|
autoload :CLI, "glossarist/cli"
|
|
16
14
|
autoload :CollectionConfig, "glossarist/collection_config"
|
|
17
15
|
autoload :Collection, "glossarist/collection"
|
|
16
|
+
autoload :Collections, "glossarist/collections"
|
|
18
17
|
autoload :Concept, "glossarist/concept"
|
|
19
18
|
autoload :ConceptData, "glossarist/concept_data"
|
|
20
19
|
autoload :ConceptReference, "glossarist/concept_reference"
|
|
@@ -35,10 +34,10 @@ module Glossarist
|
|
|
35
34
|
autoload :DetailedDefinition, "glossarist/detailed_definition"
|
|
36
35
|
autoload :Designation, "glossarist/designation"
|
|
37
36
|
autoload :Error, "glossarist/error"
|
|
38
|
-
autoload :GcrPackage,
|
|
39
|
-
autoload :GcrMetadata,
|
|
40
|
-
autoload :GcrStatistics,
|
|
41
|
-
autoload :GcrValidator,
|
|
37
|
+
autoload :GcrPackage, "glossarist/gcr_package"
|
|
38
|
+
autoload :GcrMetadata, "glossarist/gcr_metadata"
|
|
39
|
+
autoload :GcrStatistics, "glossarist/gcr_statistics"
|
|
40
|
+
autoload :GcrValidator, "glossarist/gcr_validator"
|
|
42
41
|
autoload :InvalidTypeError, "glossarist/error/invalid_type_error"
|
|
43
42
|
autoload :InvalidLanguageCodeError,
|
|
44
43
|
"glossarist/error/invalid_language_code_error"
|
|
@@ -52,16 +51,20 @@ module Glossarist
|
|
|
52
51
|
autoload :ManagedConceptData, "glossarist/managed_concept_data"
|
|
53
52
|
autoload :NonVerbRep, "glossarist/non_verb_rep"
|
|
54
53
|
autoload :RelatedConcept, "glossarist/related_concept"
|
|
54
|
+
autoload :Rdf, "glossarist/rdf"
|
|
55
|
+
autoload :Sts, "glossarist/sts"
|
|
56
|
+
autoload :Transforms, "glossarist/transforms"
|
|
55
57
|
autoload :SchemaMigration, "glossarist/schema_migration"
|
|
56
58
|
autoload :UrnResolver, "glossarist/urn_resolver"
|
|
57
59
|
autoload :Utilities, "glossarist/utilities"
|
|
58
|
-
autoload :RegisterData,
|
|
60
|
+
autoload :RegisterData, "glossarist/register_data"
|
|
59
61
|
autoload :ValidationResult, "glossarist/validation_result"
|
|
60
62
|
autoload :V1, "glossarist/v1"
|
|
61
63
|
end
|
|
62
64
|
|
|
63
65
|
require_relative "glossarist/version"
|
|
64
66
|
require_relative "glossarist/collections"
|
|
67
|
+
require_relative "glossarist/glossary_definition"
|
|
65
68
|
|
|
66
69
|
module Glossarist
|
|
67
70
|
LANG_CODES = %w[eng ara deu fra spa ita jpn kor pol por srp swe zho rus fin
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: glossarist
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.6.
|
|
4
|
+
version: 2.6.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-05-
|
|
11
|
+
date: 2026-05-12 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: lutaml-model
|
|
@@ -122,6 +122,7 @@ files:
|
|
|
122
122
|
- lib/glossarist/citation.rb
|
|
123
123
|
- lib/glossarist/cli.rb
|
|
124
124
|
- lib/glossarist/cli/export_command.rb
|
|
125
|
+
- lib/glossarist/cli/import_command.rb
|
|
125
126
|
- lib/glossarist/cli/package_command.rb
|
|
126
127
|
- lib/glossarist/cli/upgrade_command.rb
|
|
127
128
|
- lib/glossarist/cli/validate_command.rb
|
|
@@ -192,6 +193,14 @@ files:
|
|
|
192
193
|
- lib/glossarist/resolution_adapter/remote.rb
|
|
193
194
|
- lib/glossarist/resolution_adapter/route.rb
|
|
194
195
|
- lib/glossarist/schema_migration.rb
|
|
196
|
+
- lib/glossarist/sts.rb
|
|
197
|
+
- lib/glossarist/sts/extracted_designation.rb
|
|
198
|
+
- lib/glossarist/sts/extracted_lang_set.rb
|
|
199
|
+
- lib/glossarist/sts/extracted_term.rb
|
|
200
|
+
- lib/glossarist/sts/import_result.rb
|
|
201
|
+
- lib/glossarist/sts/importer.rb
|
|
202
|
+
- lib/glossarist/sts/term_extractor.rb
|
|
203
|
+
- lib/glossarist/sts/term_mapper.rb
|
|
195
204
|
- lib/glossarist/transforms.rb
|
|
196
205
|
- lib/glossarist/transforms/concept_to_skos_transform.rb
|
|
197
206
|
- lib/glossarist/transforms/concept_to_tbx_transform.rb
|