RubyGems - iev - Versions diffs - 0.3.9 → 0.4.0 - Mend

iev 0.3.9 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27)

checksums.yaml +4 -4
data/.github/workflows/rake.yml +0 -2
data/.github/workflows/release.yml +3 -1
data/.gitignore +3 -1
data/CLAUDE.md +50 -0
data/Gemfile +3 -0
data/README.adoc +65 -15
data/exe/iev +11 -0
data/iev.gemspec +5 -4
data/lib/iev/cli/command.rb +119 -76
data/lib/iev/cli/command_helper.rb +55 -36
data/lib/iev/config.rb +31 -0
data/lib/iev/converter/mathml_to_asciimath.rb +119 -158
data/lib/iev/data_source.rb +124 -0
data/lib/iev/exporter.rb +122 -0
data/lib/iev/scraper/page_parser.rb +176 -0
data/lib/iev/scraper.rb +135 -0
data/lib/iev/source_parser.rb +31 -18
data/lib/iev/supersession_parser.rb +9 -13
data/lib/iev/term_attrs_parser.rb +21 -7
data/lib/iev/term_builder.rb +100 -94
data/lib/iev/utilities.rb +91 -42
data/lib/iev/version.rb +1 -1
data/lib/iev.rb +47 -35
metadata +34 -13
data/lib/iev/db.rb +0 -82
data/lib/iev/db_cache.rb +0 -124

data/lib/iev/term_builder.rb CHANGED

@@ -44,44 +44,46 @@ module Iev
       split_definition
-      Glossarist::LocalizedConcept.from_hash(term_hash)
+      concept_data = build_concept_data
+      concept = Glossarist::LocalizedConcept.new
+      concept.data = concept_data
+      concept.id = term_id
+      concept.entry_status = extract_entry_status
+      concept.classification = extract_classification
+      concept
     end
-    def term_hash
-      dates = nil
-      if flesh_date(find_value_for("PUBLICATIONDATE"))
-        dates = [
-          {
-            type: :accepted,
-            date: flesh_date(find_value_for("PUBLICATIONDATE")),
-          },
-          {
-            type: :amended,
-            date: flesh_date(find_value_for("PUBLICATIONDATE")),
-          },
+    def build_concept_data
+      cd = Glossarist::ConceptData.new
+      cd.id = term_id
+      cd.language_code = term_language
+      pub_date = flesh_date(find_value_for("PUBLICATIONDATE"))
+      if pub_date
+        cd.dates = [
+          Glossarist::ConceptDate.new(type: "accepted", date: pub_date),
+          Glossarist::ConceptDate.new(type: "amended", date: pub_date),
         ]
+        cd.review_date = pub_date
+        cd.review_decision_date = pub_date
       end
+      cd.review_decision_event = "published"
-      {
-        id: term_id,
-        classification: extract_classification,
-        entry_status: extract_entry_status,
-        data: {
-          id: term_id,
-          dates: dates,
-          definition: [{ "content" => extract_definition_value }],
-          examples: extract_examples,
-          notes: extract_notes,
-          terms: extract_terms,
-          review_date: flesh_date(find_value_for("PUBLICATIONDATE")),
-          review_decision_date: flesh_date(find_value_for("PUBLICATIONDATE")),
-          review_decision_event: "published",
-          language_code: term_language,
-          sources: extract_authoritative_source,
-          related: extract_superseded_concepts,
-        }.compact,
-      }.compact
+      definition = extract_definition_value
+      cd.definition = [definition] if definition
+      cd.examples = extract_examples
+      cd.notes = extract_notes
+      cd.terms = extract_terms
+      sources = extract_authoritative_source
+      cd.sources = sources if sources&.any?
+      related = extract_superseded_concepts
+      cd.related = related if related&.any?
+      cd
     end
     def term_id
@@ -121,10 +123,10 @@ module Iev
             Note&nbsp;\d+\sto\sentry: |
             Note\s*\d+\sto\sthe\sentry: |
             Note\sto\sentry\s*\d+: |
-            Note\s*\d+?\sà\sl['’]article: |
-            <NOTE/?>?\s*\d?\s+.*?– |
-            NOTE(?:\s+-)? |
-            Note\s+\d+\s– |
+            Note\s*\d+?\sà\sl['']article: |
+            <NOTE/?>?\s*\d?\s+[–-]\s* |
+            NOTE(?:\s+-)?\s* |
+            Note\s+\d+\s[–-]\s* |
             Note&nbsp;\d+\s
           )
         )
@@ -140,28 +142,14 @@ module Iev
       while (md = remaining_str&.match(slicer_rx))
         next_part = md.pre_match
-        next_part.sub!(/^\[:Ex(a|e)mple\]/, 'Ex\\1mple')
+        next_part.sub!(/^\[:Ex(a|e)mple\]/, 'Ex\1mple')
         next_part_arr.push(next_part)
         next_part_arr = md[:example] ? @examples : @notes
-        # 112-03-17
-        # supplements the name of a quantity, especially for a component in a
-        # system, to indicate the quotient of that quantity by the total
-        # volume
-        # <NOTE – Examples: amount-of-substance volume concentration of
-        # component B (or concentration of B, in particular, ion
-        # concentration), molecular concentration of B, electron concentration
-        # (or electron density).
-        #
-        # In the above case the `Example` is part of the note but the regex
-        # above will capture it as an example and will add an empty `Note`
-        # and put the rest in an `Example`. So In this case we will replace
-        # the `Example` with `[:Example]` and revert it in the next iteration
-        # so it will not be caught by the regex.
         remaining_str = md.post_match
-        remaining_str.sub!(/^Ex(a|e)mple/, '[:Ex\\1mple]') if md[:note]
+        remaining_str.sub!(/^Ex(a|e)mple/, '[:Ex\1mple]') if md[:note]
       end
-      remaining_str&.sub!(/^\[:Ex(a|e)mple\]/, 'Ex\\1mple')
+      remaining_str&.sub!(/^\[:Ex(a|e)mple\]/, 'Ex\1mple')
       next_part_arr.push(remaining_str)
       @definition = definition_arr.first
       @definition = nil if @definition&.empty?
@@ -211,28 +199,21 @@ module Iev
     def extract_definition_value
       return unless @definition
-      Iev::Converter.mathml_to_asciimath(
-        replace_newlines(parse_anchor_tag(@definition, term_domain)),
-      ).strip
+      content = convert_content(@definition)
+      Glossarist::DetailedDefinition.new(content: content)
     end
     def extract_examples
       @examples.map do |str|
-        {
-          content: Iev::Converter.mathml_to_asciimath(
-            replace_newlines(parse_anchor_tag(str, term_domain)),
-          ).strip,
-        }
+        content = convert_content(clean_extracted_text(str))
+        Glossarist::DetailedDefinition.new(content: content)
       end
     end
     def extract_notes
       @notes.map do |str|
-        {
-          content: Iev::Converter.mathml_to_asciimath(
-            replace_newlines(parse_anchor_tag(str, term_domain)),
-          ).strip,
-        }
+        content = convert_content(clean_extracted_text(str))
+        Glossarist::DetailedDefinition.new(content: content)
       end
     end
@@ -246,14 +227,14 @@ module Iev
       classification_val = find_value_for("SYNONYM1STATUS")
       case classification_val
-      when ""
-        "admitted"
+      when nil, ""
+        nil
       when "认可的", "допустимый", "admitido"
         "admitted"
       when "首选的", "suositettava", "suositeltava", "рекомендуемый", "preferente"
         "preferred"
       else
-        classification_val
+        classification_val.downcase
       end
     end
@@ -261,12 +242,12 @@ module Iev
       source_val = find_value_for("SOURCE")
       return nil if source_val.nil?
-      SourceParser.new(source_val, term_domain)
+      sources = SourceParser.new(source_val, term_domain)
         .parsed_sources
         .compact
-        .map do |source|
-        source.merge({ "type" => "authoritative" })
-      end
+      sources.each { |src| src.type = "authoritative" }
+      sources.empty? ? nil : sources
     end
     def extract_superseded_concepts
@@ -279,9 +260,7 @@ module Iev
     private
     def build_expression_designation(raw_term, attribute_data:, status:)
-      term = Iev::Converter.mathml_to_asciimath(
-        parse_anchor_tag(raw_term, term_domain),
-      )
+      term = convert_content(raw_term)
       term_attributes = TermAttrsParser.new(attribute_data.to_s)
       statuses = {
@@ -289,29 +268,56 @@ module Iev
         "напуштен" => "deprecated",
       }
-      {
-        "type" => "expression",
-        "prefix" => term_attributes.prefix,
-        "normative_status" => statuses[status] || status,
-        "usage_info" => term_attributes.usage_info,
-        "designation" => term,
-        "part_of_speech" => term_attributes.part_of_speech,
-        "geographical_area" => term_attributes.geographical_area,
-        "gender" => term_attributes.gender,
-        "plurality" => term_attributes.plurality,
+      grammar_info = term_attributes.to_grammar_info
+      attrs = {
+        designation: term,
+        normative_status: statuses[status] || status,
+        geographical_area: term_attributes.geographical_area,
+        prefix: term_attributes.prefix,
+        usage_info: term_attributes.usage_info,
+        grammar_info: grammar_info ? [grammar_info] : nil,
       }.compact
+      Glossarist::Designation::Expression.new(**attrs)
     end
     def build_symbol_designation(raw_term)
-      term = Iev::Converter.mathml_to_asciimath(
-        parse_anchor_tag(raw_term, term_domain),
+      term = convert_content(raw_term)
+      Glossarist::Designation::Symbol.new(
+        designation: term,
+        international: true,
       )
+    end
-      {
-        "type" => "symbol",
-        "designation" => term,
-        "international" => true,
-      }.compact
+    def convert_content(str)
+      stripped = strip_html_comments(str.to_s)
+      Iev::Converter.mathml_to_asciimath(
+        replace_newlines(parse_anchor_tag(stripped, term_domain)),
+      ).strip
+    end
+    def strip_html_comments(str)
+      doc = Nokogiri::HTML::DocumentFragment.parse(str)
+      comments = doc.children.select(&:comment?)
+      return str if comments.empty?
+      result = str.dup
+      comments.each { |c| result = result.gsub("<!--#{c.content}-->", "") }
+      result
+    end
+    # Remove leading numbering artifacts from extracted notes/examples.
+    # The definition text sometimes duplicates note/example numbers:
+    #   "1  A time interval comprises..." (note)
+    #   "1: In a vending machine..." (example)
+    #   "2 à l'article: ..." (French note)
+    #   ": Par la réticulation..." (French note)
+    def clean_extracted_text(str)
+      # Strip leading number + optional separator (colon, em-space, etc.)
+      str.gsub(/\A\s*\d+[\s: ]*\s*/, "")
+        # Strip leading standalone colon (French style: ": text")
+        .gsub(/\A\s*:\s*/, "")
     end
   end
 end

data/lib/iev/utilities.rb CHANGED

@@ -2,57 +2,106 @@
 module Iev
   module Utilities
-    SIMG_PATH_REGEX = "<simg .*\\/\\$file\\/([\\d\\-\\w\.]+)>"
-    FIGURE_ONE_REGEX =
-      '<p><b>\\s*Figure\\s+(\\d)\\s+[–-]\\s+(.+)\\s*<\\/b>(<\\/p>)?'
-    FIGURE_TWO_REGEX = "#{FIGURE_ONE_REGEX}\\s*#{FIGURE_ONE_REGEX}".freeze
     IMAGE_PATH_PREFIX = "image::/assets/images/parts"
+    IEV_CODE_RE = /\A(IEV)?\s*(\d{2,3}-\d{2,3}-\d{2,3})\z/
+    # SIMG/Figure patterns — custom IEV XML, pre-processed before Nokogiri.
+    # Uses [^>] and [^<] instead of . to avoid polynomial backtracking.
+    SIMG_PATH_REGEX = /<simg [^>]*\/\$file\/([\d\-\w.]+)>/
+    FIGURE_ONE_REGEX = '<p><b>\\s*Figure\\s+(\\d)\\s+[–-]\\s+([^<]+)\\s*<\\/b>(<\\/p>)?'
+    FIGURE_TWO_REGEX = "#{FIGURE_ONE_REGEX}\\s*#{FIGURE_ONE_REGEX}".freeze
     def parse_anchor_tag(text, term_domain)
-      return unless text
-      # Convert IEV term references
-      # Convert href links
-      # Need to take care of this pattern:
-      #  `inverse de la <a href="IEV103-06-01">période<a>`
-      text.gsub(
-        %r{<a href="?(IEV)\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)</?a>},
-        '{{\3, \1:\2}}',
-      ).gsub(
-        %r{<a href="?\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)</?a>},
-        '{{\3, IEV:\2}}',
-      ).gsub(
-        # To handle <a> tags without ending tag like
-        #  `Voir <a href=IEV103-05-21>IEV 103-05-21`
-        #  for concept '702-03-11' in `fr`
-        /<a href="?(IEV)?\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)$/,
-        '{{\3, IEV:\2}}',
-      ).gsub(
-        %r{<a href="?([^<>]*?)"?>(.*?)</a>},
-        '\1[\2]',
-      ).gsub(
-        Regexp.new([SIMG_PATH_REGEX, '\\s*', FIGURE_TWO_REGEX].join),
-        "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3; \\6]",
-      ).gsub(
-        Regexp.new([SIMG_PATH_REGEX, '\\s*', FIGURE_ONE_REGEX].join),
-        "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3]",
-      ).gsub(
-        /<img\s+([^<>]+?)\s*>/,
-        "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[]",
-      ).gsub(
-        /<br>/,
-        "\n",
-      ).gsub(
-        %r{<b>(.*?)</b>},
-        '*\\1*',
-      )
+      return nil if text.nil?
+      text = process_simg_figures(text, term_domain)
+      text = fix_unquoted_href(text)
+      doc = Nokogiri::HTML::DocumentFragment.parse(text)
+      nodes_to_adoc(doc.children, term_domain)
     end
     def replace_newlines(input)
-      input.gsub('\n', "\n\n")
+      input
+        .gsub('\n', "\n\n")
         .gsub(/<[pbr]+>/, "\n\n")
+        .gsub(/<br\s*\/?>/, "\n\n")
         .gsub(/\s*\n[\n\s]+/, "\n\n")
         .strip
     end
+    private
+    # IEV data has unquoted href with spaces, e.g.
+    #   <a href=IEV 102-01-10>...</a>
+    # Nokogiri stops at first space, so add quotes.
+    # Uses a specific IEV code pattern to avoid regex backtracking.
+    def fix_unquoted_href(text)
+      text.gsub(/href=(IEV\s\d{2,3}-\d{2,3}-\d{2,3})(?=[>\s])/) do
+        "href=\"#{Regexp.last_match(1)}\""
+      end
+    end
+    def process_simg_figures(text, term_domain)
+      text = text.gsub(
+        Regexp.new([SIMG_PATH_REGEX.source, '\s*', FIGURE_TWO_REGEX].join),
+        "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3; \\6 - \\7]",
+      )
+      text = text.gsub(
+        Regexp.new([SIMG_PATH_REGEX.source, '\s*', FIGURE_ONE_REGEX].join),
+        "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3]",
+      )
+      text.gsub(SIMG_PATH_REGEX, "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[]")
+    end
+    def nodes_to_adoc(nodes, term_domain)
+      nodes.map { |n| node_to_adoc(n, term_domain) }.join
+    end
+    def node_to_adoc(node, term_domain)
+      case node
+      when Nokogiri::XML::Text
+        node.text
+      when Nokogiri::XML::Comment
+        ""
+      when Nokogiri::XML::Element
+        element_to_adoc(node, term_domain)
+      else
+        ""
+      end
+    end
+    def element_to_adoc(node, term_domain)
+      inner = nodes_to_adoc(node.children, term_domain)
+      case node.name
+      when "a"
+        convert_link(node, inner)
+      when "b"
+        "*#{inner}*"
+      when "br"
+        "\n"
+      when "img"
+        src = node["src"] || node.attributes.keys.first.to_s
+        "#{IMAGE_PATH_PREFIX}/#{term_domain}/#{src}[]"
+      when "p", "div", "span"
+        inner
+      else
+        node.to_s
+      end
+    end
+    def convert_link(node, inner)
+      href = (node["href"] || "").to_s.strip
+      if href.match?(IEV_CODE_RE)
+        iev_code = href.sub(/\AIEV\s*/, "")
+        "{{#{inner}, IEV:#{iev_code}}}"
+      elsif !href.empty?
+        "#{href}[#{inner}]"
+      else
+        inner
+      end
+    end
   end
 end

data/lib/iev/version.rb CHANGED

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Iev
-  VERSION = "0.3.9"
+  VERSION = "0.4.0"
 end

data/lib/iev.rb CHANGED

@@ -1,68 +1,80 @@
 # frozen_string_literal: true
 require "iev/version"
-require "iev/db"
-require "mechanize"
-require "nokogiri"
+require "iev/config"
+require "iev/data_source"
-require "benchmark"
-require "creek"
-require "unitsml"
-require "plurimath"
-require "glossarist"
-require "relaton"
-require "relaton_bib"
-require "sequel"
-require "thor"
 require "yaml"
+# plurimath and unitsml both depend on mml, which has a transitive
+# dependency version mismatch with lutaml-model in some environments.
+# Load them when available; the DataSource APIs work without them.
+begin
+  require "plurimath"
+rescue LoadError
+  nil
+end
+begin
+  require "unitsml"
+rescue LoadError
+  nil
+end
 module Iev
   autoload :Cli, "iev/cli"
+  autoload :Config, "iev/config"
   autoload :Converter, "iev/converter"
   autoload :DataConversions, "iev/data_conversions"
-  autoload :Db, "iev/db"
-  autoload :DbCache, "iev/db_cache"
+  autoload :DataSource, "iev/data_source"
   autoload :DbWriter, "iev/db_writer"
+  autoload :Exporter, "iev/exporter"
   autoload :Iso639Code, "iev/iso_639_code"
   autoload :Profiler, "iev/profiler"
   autoload :RelatonDb, "iev/relaton_db"
+  autoload :Scraper, "iev/scraper"
   autoload :SourceParser, "iev/source_parser"
   autoload :SupersessionParser, "iev/supersession_parser"
   autoload :TermAttrsParser, "iev/term_attrs_parser"
   autoload :TermBuilder, "iev/term_builder"
   autoload :Utilities, "iev/utilities"
-  #
-  # Scrape Electropedia for term.
+  # Fetch term designation from IEV data.
   #
   # @param [String] code for example "103-01-02"
   # @param [String] lang language code, for example "en"
   #
-  # @return [String, nil] if found than term,
-  # if code not found then empty string,
+  # @return [String, nil] if found then term,
+  # if code not found then nil,
   #   if language not found then nil.
   #
   def self.get(code, lang)
-    doc = get_doc(code)
-    xpath = "//table/tr/td/div/font[.=\"#{lang}\"]/../../"\
-            "following-sibling::td[2]"
-    a = doc&.at(xpath)&.children&.to_xml
-    a&.sub(%r{<br/>.*$}, "")
-      &.sub(/, &lt;.*$/, "")
-      &.gsub(/<[^<>]*>/, "")&.strip
+    DataSource.fetch_term_designation(code, lang)
   end
-  def self.get_doc(code)
-    url = "https://www.electropedia.org/iev/iev.nsf/"\
-         "display?openform&ievref=#{code}"
+  # Fetch full concept data (all languages) for a given IEV code.
+  #
+  # @param [String] code IEV code, e.g. "103-01-02"
+  # @return [Hash, nil] concept data hash with all languages
+  def self.fetch_concept(code)
+    DataSource.fetch_concept(code)
+  end
-    # Use Mechanize with User-Agent to avoid 403 Forbidden errors from bot detection
-    agent = Mechanize.new
-    agent.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+  # Fetch localized term data for a given IEV code and language.
+  #
+  # @param [String] code IEV code, e.g. "103-01-02"
+  # @param [String] lang language code, e.g. "en" or "eng"
+  # @return [Hash, nil] localized concept data
+  def self.fetch_term(code, lang)
+    DataSource.fetch_term(code, lang)
+  end
-    page = agent.get(url)
-    page.parser # Nokogiri document
+  # Scrape concept data from Electropedia for a given IEV code.
+  # Uses Ferrum (headless Chrome) to handle AWS WAF challenge.
+  #
+  # @param code [String] IEV code, e.g. "103-01-02"
+  # @return [Hash, nil] concept data hash or nil if not found
+  def self.scrape_concept(code)
+    Scraper.new.fetch_concept(code)
   end
 end
-require_relative "iev/cli"