RubyGems - iev - Versions diffs - 0.3.9 → 0.4.1 - Mend

iev 0.3.9 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

checksums.yaml +4 -4
data/.github/workflows/rake.yml +1 -3
data/.github/workflows/release.yml +3 -1
data/.gitignore +3 -1
data/CLAUDE.md +50 -0
data/Gemfile +3 -0
data/README.adoc +65 -15
data/exe/iev +11 -0
data/iev.gemspec +5 -4
data/lib/iev/cli/command.rb +122 -76
data/lib/iev/cli/command_helper.rb +55 -36
data/lib/iev/config.rb +31 -0
data/lib/iev/converter/mathml_to_asciimath.rb +137 -159
data/lib/iev/data_source.rb +124 -0
data/lib/iev/exporter.rb +138 -0
data/lib/iev/scraper/page_parser.rb +176 -0
data/lib/iev/scraper.rb +135 -0
data/lib/iev/source_parser.rb +39 -19
data/lib/iev/supersession_parser.rb +9 -13
data/lib/iev/term_attrs_parser.rb +21 -7
data/lib/iev/term_builder.rb +102 -94
data/lib/iev/utilities.rb +129 -42
data/lib/iev/version.rb +1 -1
data/lib/iev.rb +47 -35
metadata +34 -13
data/lib/iev/db.rb +0 -82
data/lib/iev/db_cache.rb +0 -124

data/lib/iev/converter/mathml_to_asciimath.rb CHANGED

@@ -5,190 +5,168 @@ module Iev
     class MathmlToAsciimath
       using DataConversions
-      def self.convert(input)
-        new.convert(input)
-      end
+      GREEK_ENTITIES = {
+        "&alpha;" => "alpha",
+        "&beta;" => "beta",
+        "&gamma;" => "gamma",
+        "&Gamma;" => "Gamma",
+        "&delta;" => "delta",
+        "&Delta;" => "Delta",
+        "&epsilon;" => "epsilon",
+        "&varepsilon;" => "varepsilon",
+        "&zeta;" => "zeta",
+        "&eta;" => "eta",
+        "&theta;" => "theta",
+        "&Theta;" => "Theta",
+        "&vartheta;" => "vartheta",
+        "&iota;" => "iota",
+        "&kappa;" => "kappa",
+        "&lambda;" => "lambda",
+        "&Lambda;" => "Lambda",
+        "&mu;" => "mu",
+        "&nu;" => "nu",
+        "&xi;" => "xi",
+        "&Xi;" => "Xi",
+        "&pi;" => "pi",
+        "&Pi;" => "Pi",
+        "&rho;" => "rho",
+        "&sigma;" => "sigma",
+        "&Sigma;" => "Sigma",
+        "&tau;" => "tau",
+        "&upsilon;" => "upsilon",
+        "&phi;" => "phi",
+        "&Phi;" => "Phi",
+        "&varphi;" => "varphi",
+        "&chi;" => "chi",
+        "&psi;" => "psi",
+        "&Psi;" => "Psi",
+        "&omega;" => "omega",
+      }.freeze
+      class << self
+        def convert(input)
+          mathml_to_asciimath(input)
+        end
-      def convert(input)
-        mathml_to_asciimath(input)
-      end
+        # Clear the Plurimath expression cache. Call between export runs.
+        def clear_cache
+          @math_cache = nil
+        end
-      private
+        private
-      def mathml_to_asciimath(input)
-        # If given string does not include '<' (for elements) nor '&'
-        # (for entities), then it's certain that it doesn't contain
-        # any MathML or HTML formula.
-        return input unless input&.match?(/<|&/)
+        def math_cache
+          @math_cache ||= {}
+        end
+        def mathml_to_asciimath(input)
+          return input unless input&.match?(/<|&/)
-        return html_to_asciimath(input) unless input.include?("<math>")
+          return html_to_asciimath(input) unless input.include?("<math>")
-        # puts "GOING TO MATHML MATH"
-        # puts input
-        to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
-        # to_asciimath.remove_namespaces!
+          to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
-        to_asciimath.css("math").each do |math_element|
-          asciimath = Plurimath::Math.parse(
-            text_to_asciimath(math_element.to_xml), :mathml
-          ).to_asciimath.strip
-          # puts"ASCIIMATH!!  #{asciimath}"
+          to_asciimath.css("math").each do |math_element|
+            math_xml = math_element.to_xml
+            asciimath = math_cache[math_xml] ||= begin
+              Plurimath::Math.parse(math_xml, :mathml).to_asciimath.strip
+            rescue Plurimath::Math::ParseError
+              ""
+            end
-          if asciimath.empty?
-            math_element.remove
-          else
-            math_element.replace "stem:[#{asciimath}]"
+            if asciimath.empty?
+              math_element.remove
+            else
+              math_element.replace "stem:[#{asciimath}]"
+            end
           end
+          html_to_asciimath(
+            to_asciimath.children.to_s,
+          )
         end
-        html_to_asciimath(
-          to_asciimath.children.to_s,
-        )
-      end
+        def html_to_asciimath(input)
+          return input if input.nil? || input.empty?
-      def html_to_asciimath(input)
-        return input if input.nil? || input.empty?
-        to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
-        to_asciimath.css("i").each do |math_element|
-          # puts "HTML MATH!!  #{math_element.to_xml}"
-          # puts "HTML MATH!!  #{math_element.text}"
-          decoded = text_to_asciimath(math_element.text)
-          case decoded.length
-          when 1..12
-            # puts "(#{math_element.text} to => #{decoded})"
-            math_element.replace "stem:[#{decoded}]"
-          when 0
-            math_element.remove
-          else
-            math_element.replace "_#{decoded}_"
+          # Fast path: if no HTML elements remain that need Nokogiri processing
+          # (after parse_anchor_tag handles <i>/<sub>/<sup>/<ol>/<ul>/<font>),
+          # just do the Greek entity replacement.
+          unless input.match?(/<([iI]|sub|sup|ol|ul|font)\b/)
+            return html_entities_to_stem(input)
           end
-        end
-        to_asciimath.css("sub").each do |math_element|
-          case math_element.text.length
-          when 0
-            math_element.remove
-          else
-            math_element.replace "~#{text_to_asciimath(math_element.text)}~"
+          to_asciimath = Nokogiri::HTML.fragment(input, "UTF-8")
+          to_asciimath.css("i").each do |math_element|
+            decoded = text_to_asciimath(math_element.text)
+            case decoded.length
+            when 1..12
+              math_element.replace "stem:[#{decoded}]"
+            when 0
+              math_element.remove
+            else
+              math_element.replace "_#{decoded}_"
+            end
           end
-        end
-        to_asciimath.css("sup").each do |math_element|
-          case math_element.text.length
-          when 0
-            math_element.remove
-          else
-            math_element.replace "^#{text_to_asciimath(math_element.text)}^"
+          to_asciimath.css("sub").each do |math_element|
+            case math_element.text.length
+            when 0
+              math_element.remove
+            else
+              math_element.replace "~#{text_to_asciimath(math_element.text)}~"
+            end
           end
-        end
-        to_asciimath.css("ol").each do |element|
-          element.css("li").each do |li|
-            li.replace ". #{li.text}"
+          to_asciimath.css("sup").each do |math_element|
+            case math_element.text.length
+            when 0
+              math_element.remove
+            else
+              math_element.replace "^#{text_to_asciimath(math_element.text)}^"
+            end
           end
-        end
-        to_asciimath.css("ul").each do |element|
-          element.css("li").each do |li|
-            li.replace "* #{li.text}"
+          to_asciimath.css("ol").each do |element|
+            element.css("li").each do |li|
+              li.replace ". #{li.text}"
+            end
           end
-        end
-        # Replace sans-serif font with monospace
-        to_asciimath.css('font[style*="sans-serif"]').each do |x|
-          x.replace "`#{x.text}`"
-        end
+          to_asciimath.css("ul").each do |element|
+            element.css("li").each do |li|
+              li.replace "* #{li.text}"
+            end
+          end
-        html_entities_to_stem(
-          to_asciimath
-            .children.to_s
-            .gsub(/\]stem:\[/, "")
-            .gsub(%r{</?[uo]l>}, ""),
-        )
-      end
+          to_asciimath.css('font[style*="sans-serif"]').each do |x|
+            x.replace "`#{x.text}`"
+          end
-      def text_to_asciimath(text)
-        html_entities_to_asciimath(text.decode_html)
-      end
+          html_entities_to_stem(
+            to_asciimath
+              .children.to_s
+              .gsub("]stem:[", "")
+              .gsub(%r{</?[uo]l>}, ""),
+          )
+        end
-      def html_entities_to_asciimath(input)
-        input.gsub("&alpha;", "alpha")
-          .gsub("&beta;", "beta")
-          .gsub("&gamma;", "gamma")
-          .gsub("&Gamma;", "Gamma")
-          .gsub("&delta;", "delta")
-          .gsub("&Delta;", "Delta")
-          .gsub("&epsilon;", "epsilon")
-          .gsub("&varepsilon;", "varepsilon")
-          .gsub("&zeta;", "zeta")
-          .gsub("&eta;", "eta")
-          .gsub("&theta;", "theta")
-          .gsub("&Theta;", "Theta")
-          .gsub("&vartheta;", "vartheta")
-          .gsub("&iota;", "iota")
-          .gsub("&kappa;", "kappa")
-          .gsub("&lambda;", "lambda")
-          .gsub("&Lambda;", "Lambda")
-          .gsub("&mu;", "mu")
-          .gsub("&nu;", "nu")
-          .gsub("&xi;", "xi")
-          .gsub("&Xi;", "Xi")
-          .gsub("&pi;", "pi")
-          .gsub("&Pi;", "Pi")
-          .gsub("&rho;", "rho")
-          .gsub("&beta;", "beta")
-          .gsub("&sigma;", "sigma")
-          .gsub("&Sigma;", "Sigma")
-          .gsub("&tau;", "tau")
-          .gsub("&upsilon;", "upsilon")
-          .gsub("&phi;", "phi")
-          .gsub("&Phi;", "Phi")
-          .gsub("&varphi;", "varphi")
-          .gsub("&chi;", "chi")
-          .gsub("&psi;", "psi")
-          .gsub("&Psi;", "Psi")
-          .gsub("&omega;", "omega")
-      end
+        def text_to_asciimath(text)
+          html_entities_to_asciimath(text.decode_html)
+        end
+        def html_entities_to_asciimath(input)
+          GREEK_ENTITIES.reduce(input) do |str, (entity, name)|
+            str.gsub(entity, name)
+          end
+        end
-      def html_entities_to_stem(input)
-        input.gsub("&alpha;", "stem:[alpha]")
-          .gsub("&beta;", "stem:[beta]")
-          .gsub("&gamma;", "stem:[gamma]")
-          .gsub("&Gamma;", "stem:[Gamma]")
-          .gsub("&delta;", "stem:[delta]")
-          .gsub("&Delta;", "stem:[Delta]")
-          .gsub("&epsilon;", "stem:[epsilon]")
-          .gsub("&varepsilon;", "stem:[varepsilon]")
-          .gsub("&zeta;", "stem:[zeta]")
-          .gsub("&eta;", "stem:[eta]")
-          .gsub("&theta;", "stem:[theta]")
-          .gsub("&Theta;", "stem:[Theta]")
-          .gsub("&vartheta;", "stem:[vartheta]")
-          .gsub("&iota;", "stem:[iota]")
-          .gsub("&kappa;", "stem:[kappa]")
-          .gsub("&lambda;", "stem:[lambda]")
-          .gsub("&Lambda;", "stem:[Lambda]")
-          .gsub("&mu;", "stem:[mu]")
-          .gsub("&nu;", "stem:[nu]")
-          .gsub("&xi;", "stem:[xi]")
-          .gsub("&Xi;", "stem:[Xi]")
-          .gsub("&pi;", "stem:[pi]")
-          .gsub("&Pi;", "stem:[Pi]")
-          .gsub("&rho;", "stem:[rho]")
-          .gsub("&beta;", "stem:[beta]")
-          .gsub("&sigma;", "stem:[sigma]")
-          .gsub("&Sigma;", "stem:[Sigma]")
-          .gsub("&tau;", "stem:[tau]")
-          .gsub("&upsilon;", "stem:[upsilon]")
-          .gsub("&phi;", "stem:[phi]")
-          .gsub("&Phi;", "stem:[Phi]")
-          .gsub("&varphi;", "stem:[varphi]")
-          .gsub("&chi;", "stem:[chi]")
-          .gsub("&psi;", "stem:[psi]")
-          .gsub("&Psi;", "stem:[Psi]")
-          .gsub("&omega;", "stem:[omega]")
+        def html_entities_to_stem(input)
+          GREEK_ENTITIES.reduce(input) do |str, (entity, name)|
+            str.gsub(entity, "stem:[#{name}]")
+          end
+        end
       end
     end
   end

data/lib/iev/data_source.rb ADDED

@@ -0,0 +1,124 @@
+# frozen_string_literal: true
+require "net/http"
+require "uri"
+require "yaml"
+module Iev
+  module DataSource
+    class NotFoundError < StandardError; end
+    class << self
+      # Fetch full concept data (all languages) for a given IEV code.
+      #
+      # @param code [String] IEV code, e.g. "103-01-02"
+      # @return [Hash, nil] concept data hash or nil if not found
+      def fetch_concept(code)
+        fetch_concept_data(code)
+      end
+      # Fetch localized term data for a given IEV code and language.
+      #
+      # @param code [String] IEV code, e.g. "103-01-02"
+      # @param lang [String] language code, e.g. "en" or "eng"
+      # @return [Hash, nil] localized concept data or nil
+      def fetch_term(code, lang)
+        concept = fetch_concept(code)
+        return nil unless concept
+        lang_key = normalize_lang(lang)
+        concept[lang_key]
+      end
+      # Fetch the term designation string for a given IEV code and language.
+      # This is the backward-compatible replacement for the scraping-based Iev.get.
+      #
+      # @param code [String] IEV code, e.g. "103-01-02"
+      # @param lang [String] language code, e.g. "en"
+      # @return [String, nil] term designation or nil
+      def fetch_term_designation(code, lang)
+        term_data = fetch_term(code, lang)
+        return nil unless term_data
+        terms = term_data["terms"]
+        return nil unless terms&.any?
+        preferred = terms.find { |t| t["normative_status"] == "preferred" }
+        (preferred || terms.first)["designation"]
+      end
+      private
+      def fetch_concept_data(code)
+        from_local(code) || from_remote(code)
+      end
+      def from_local(code)
+        data_path = Iev.config.data_path
+        return nil unless data_path
+        path = File.join(data_path, "concept-#{code}.yaml")
+        return nil unless File.exist?(path)
+        YAML.safe_load(File.read(path, encoding: "utf-8"), permitted_classes: [Date, Time])
+      end
+      def from_remote(code)
+        cache_key = "concept-#{code}.yaml"
+        cached = read_cache(cache_key)
+        return cached if cached
+        url = "#{Iev.config.remote_base_url}/#{cache_key}"
+        data = http_get_yaml(url)
+        return nil unless data
+        write_cache(cache_key, data)
+        data
+      end
+      def http_get_yaml(url)
+        uri = URI(url)
+        response = Net::HTTP.get_response(uri)
+        case response.code
+        when "200"
+          YAML.safe_load(response.body, permitted_classes: [Date, Time])
+        when "404"
+          nil
+        else
+          warn "IEV: Failed to fetch #{url}: HTTP #{response.code}"
+          nil
+        end
+      rescue SocketError, Timeout::Error => e
+        warn "IEV: Network error fetching #{url}: #{e.message}"
+        nil
+      end
+      def read_cache(filename)
+        cache_path = cache_file_path(filename)
+        return nil unless File.exist?(cache_path)
+        YAML.safe_load(File.read(cache_path, encoding: "utf-8"), permitted_classes: [Date, Time])
+      end
+      def write_cache(filename, data)
+        cache_path = cache_file_path(filename)
+        FileUtils.mkdir_p(File.dirname(cache_path))
+        File.write(cache_path, YAML.dump(data), encoding: "utf-8")
+      end
+      def cache_file_path(filename)
+        File.join(Iev.config.cache_dir, filename)
+      end
+      # Normalize language code: "en" → "eng", "de" → "deu", etc.
+      def normalize_lang(lang)
+        return lang if lang.length == 3
+        Iso639Code.three_char_code(lang).first
+      rescue StandardError
+        lang
+      end
+    end
+  end
+end

data/lib/iev/exporter.rb ADDED

@@ -0,0 +1,138 @@
+# frozen_string_literal: true
+module Iev
+  # Exports IEV data to Glossarist YAML format.
+  #
+  # Automatically detects input format from file extension:
+  #   .xlsx / .xls   → Excel IEV export
+  #   .sqlite3 / .sqlite / .db → SQLite database
+  #
+  # @example Programmatic usage
+  #   exporter = Iev::Exporter.new("data.xlsx", output_dir: "/tmp/output")
+  #   collection = exporter.export
+  #
+  # @example With filters
+  #   Iev::Exporter.new("data.sqlite3",
+  #     output_dir: "/tmp/output",
+  #     only_concepts: "103-%",
+  #     only_languages: "en,fr",
+  #   ).export
+  class Exporter
+    XLSX_EXTENSIONS = %w[.xlsx .xls].freeze
+    SQLITE_EXTENSIONS = %w[.sqlite3 .sqlite .db].freeze
+    attr_reader :input_path, :output_dir, :filters
+    # @param input_path [String, Pathname] path to Excel or SQLite file
+    # @param output_dir [String, Pathname] destination for YAML files
+    # @param only_concepts [String, nil] SQL LIKE pattern for IEVREF filtering
+    # @param only_languages [String, nil] comma-separated language codes
+    # @param fetch_relaton_links [Boolean] whether to fetch source URLs via Relaton
+    def initialize(input_path, output_dir: Dir.pwd,
+                   only_concepts: nil, only_languages: nil,
+                   fetch_relaton_links: false)
+      @input_path = Pathname.new(input_path)
+      validate_input!
+      @output_dir = Pathname.new(output_dir)
+      @fetch_relaton_links = fetch_relaton_links
+      @filters = {
+        only_concepts: only_concepts,
+        only_languages: only_languages,
+      }.compact
+    end
+    # Run the export pipeline: load → transform → save.
+    # @return [Glossarist::ManagedConceptCollection]
+    def export
+      dataset = load_dataset
+      collection = build_collection(dataset)
+      save_collection(collection)
+      collection
+    end
+    private
+    def supported_format?
+      ext = input_path.extname.downcase
+      XLSX_EXTENSIONS.include?(ext) || SQLITE_EXTENSIONS.include?(ext)
+    end
+    def validate_input!
+      unless input_path.exist?
+        raise ArgumentError, "Input file not found: #{input_path}"
+      end
+      return if supported_format?
+      exts = (XLSX_EXTENSIONS + SQLITE_EXTENSIONS).join(", ")
+      raise ArgumentError,
+        "Unsupported format: #{input_path.extname}. Supported: #{exts}"
+    end
+    def input_format
+      ext = input_path.extname.downcase
+      XLSX_EXTENSIONS.include?(ext) ? :xlsx : :sqlite
+    end
+    def load_dataset
+      case input_format
+      when :xlsx then load_from_xlsx
+      when :sqlite then load_from_sqlite
+      end
+    end
+    def load_from_xlsx
+      require "creek"
+      db = Sequel.sqlite
+      DbWriter.new(db).import_spreadsheet(input_path.to_s)
+      apply_filters(db)
+    end
+    def load_from_sqlite
+      apply_filters(Sequel.sqlite(input_path.to_s))
+    end
+    def apply_filters(db)
+      query = db[:concepts]
+      if filters[:only_concepts]
+        query = query.where(Sequel.ilike(:ievref, filters[:only_concepts]))
+      end
+      if filters[:only_languages]
+        query = query.where(language: filters[:only_languages].split(","))
+      end
+      query
+    end
+    def build_collection(dataset)
+      SourceParser.relaton_enabled = @fetch_relaton_links
+      # Use a hash index for O(1) concept lookup instead of
+      # Glossarist's O(n) fetch_or_initialize which does linear scan.
+      concept_index = {}
+      collection = Glossarist::ManagedConceptCollection.new
+      dataset.each do |row|
+        term = TermBuilder.build_from(row)
+        next unless term
+        concept = concept_index[term.id] ||= begin
+          c = Glossarist::ManagedConcept.new(data: { "id" => term.id })
+          collection.store(c)
+          c
+        end
+        concept.add_l10n(term)
+      end
+      collection
+    ensure
+      SourceParser.relaton_enabled = true
+    end
+    def save_collection(collection)
+      concepts_dir = output_dir.expand_path.join("concepts")
+      FileUtils.mkdir_p(concepts_dir)
+      collection.save_to_files(concepts_dir.to_s)
+    end
+  end
+end