iev 0.3.9 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,176 @@
1
# frozen_string_literal: true

module Iev
  class Scraper
    # Parses an Electropedia HTML page into a concept data hash.
    #
    # The Electropedia HTML structure is a table with rows for each language:
    # - Language row: <div align="center"><font color="#800080">en</font></div>
    # - Term cell: <b>term text</b> in the third <td>
    # - Definition row: next row's third <td> (if present)
    # - Empty/separator rows with <hr> or spacer images
    class PageParser
      # Map Electropedia HTML language codes to ISO 639-2/3 three-char codes.
      LANG_CODE_MAP = {
        "en" => "eng",
        "fr" => "fra",
        "ar" => "ara",
        "de" => "deu",
        "es" => "spa",
        "it" => "ita",
        "ko" => "kor",
        "ja" => "jpn",
        "pl" => "pol",
        "pt" => "por",
        "sr" => "srp",
        "sv" => "swe",
        "zh" => "zho",
        "nl" => "nld",
        "fi" => "fin",
        "cs" => "ces",
        "no" => "nor",
        "ru" => "rus",
        "sl" => "slv",
        "sk" => "slk",
      }.freeze

      # @param doc [Nokogiri::HTML::Document] parsed Electropedia page
      # @param code [String] IEV reference, e.g. "103-01-02"
      def initialize(doc, code)
        @doc = doc
        @code = code
      end

      # Builds the concept hash for the page, or returns nil when the page
      # does not contain the requested IEV reference.
      #
      # @return [Hash, nil] {"id" => code, "data" => {...}} or nil
      def parse
        return nil unless find_iev_ref

        {
          "id" => @code,
          "data" => {
            "identifier" => @code,
            "localized_concepts" => localized_concepts,
          },
        }
      end

      private

      # Locates the IEV reference cell to confirm the page is a valid
      # term page (bold cell containing the reference string).
      def find_iev_ref
        @doc.at_css("b:contains('#{@code}')") ||
          @doc.at_xpath("//td/b[contains(text(), '#{@code}')]")
      end

      # Collects per-language entries:
      # {lang => {"term" => ..., "definition" => ...}}.
      # Languages whose term cell is missing or blank are dropped.
      def localized_concepts
        lang_sections.each_with_object({}) do |(lang, term_row, def_row), acc|
          term = extract_term(term_row)
          next unless term

          entry = { "term" => term }
          definition = extract_definition(def_row)
          entry["definition"] = definition if definition

          acc[lang] = entry
        end
      end

      # Finds all language sections in the table.
      # Returns an array of [lang_code, term_row, definition_row] tuples;
      # definition_row may be nil when no usable follow-up row exists.
      def lang_sections
        rows = content_rows

        rows.each_with_index.filter_map do |row, idx|
          lang = extract_lang(row)
          # The definition is in the next non-empty, non-separator row.
          [lang, row, find_definition_row(rows, idx + 1)] if lang
        end
      end

      # The main content table (the one carrying IEV language data) is
      # assumed to be the table with the most rows on the page.
      def content_rows
        main_table = @doc.css("table").max_by { |table| table.css("tr").length }
        main_table ? main_table.css("tr").to_a : []
      end

      # Maps a row's purple centered language marker to a three-letter
      # code via LANG_CODE_MAP; nil when the row is not a language row.
      def extract_lang(row)
        marker = row.at_css("div[align='center'] font[color='#800080']")
        marker && LANG_CODE_MAP[marker.text.strip.downcase]
      end

      # Term is in the third <td> — may be in a <b> tag (en, fr) or plain
      # text. Returns nil when the cell is absent or blank.
      def extract_term(row)
        cells = row.css("td")
        return nil if cells.length < 3

        holder = cells[2].at_css("b") || cells[2]
        text = holder.text.strip
        text.empty? ? nil : text
      end

      # Returns the definition cell's inner HTML (which may include
      # MathML), or nil for missing rows and spacer-only cells.
      def extract_definition(row)
        return nil unless row

        cells = row.css("td")
        return nil if cells.length < 3

        body = cells[2].inner_html.strip
        return nil if body.empty? || body.match?(/\A<img.*ecblank/)

        body
      end

      # Find the definition row following a language row.
      # Skips separator rows (empty, <hr>, or spacer images) and stops at
      # the next language row.
      def find_definition_row(rows, start_idx)
        candidate = rows[start_idx]
        return nil if candidate.nil?
        return nil if extract_lang(candidate) || separator?(candidate)

        cells = candidate.css("td")
        return nil if cells.length < 3

        body = cells[2].inner_html.strip
        return nil if body.empty?

        # Rows that are only spacer images don't count (unless they carry
        # <b> content).
        return nil if body.match?(/\A<img.*ecblank/) && !body.include?("<b>")

        candidate
      end

      # A separator row contains an <hr> anywhere, or nothing but spacer
      # cells.
      def separator?(row)
        cells = row.css("td")
        return true if cells.any? { |cell| cell.at_css("hr") }

        cells.all? { |cell| spacer_only?(cell) }
      end

      # True when the cell holds no visible content: empty markup, or only
      # an "ecblank" spacer image with no text.
      def spacer_only?(cell)
        markup = cell.inner_html.strip
        return true if markup.empty?
        return true if markup.match?(/\A<img.*ecblank/)

        cell.at_css("img[src*='ecblank']") && cell.text.strip.empty?
      end
    end
  end
end
@@ -0,0 +1,135 @@
1
# frozen_string_literal: true

module Iev
  # Scrapes IEV term data from Electropedia (electropedia.org).
  #
  # Electropedia is behind AWS WAF which requires JavaScript execution,
  # so a headless browser (via Ferrum/Chrome) is used to handle the challenge.
  #
  # @example
  #   scraper = Iev::Scraper.new
  #   concept = scraper.fetch_concept("103-01-02")
  #   doc = scraper.fetch_page("103-01-02")
  class Scraper
    BASE_URL = "https://www.electropedia.org/iev/iev.nsf/" \
               "display?openform&ievref="

    # Pool of realistic Chrome User-Agent strings with matching platform hints.
    # Rotated per request to reduce fingerprinting by AWS WAF.
    USER_AGENT_PROFILES = [
      {
        user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"macOS"',
        chrome_version: "131",
      },
      {
        user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/130.0.0.0 Safari/537.36",
        platform: '"Windows"',
        chrome_version: "130",
      },
      {
        user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"Linux"',
        chrome_version: "131",
      },
      {
        user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/129.0.0.0 Safari/537.36",
        platform: '"macOS"',
        chrome_version: "129",
      },
      {
        user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                    "AppleWebKit/537.36 (KHTML, like Gecko) " \
                    "Chrome/131.0.0.0 Safari/537.36",
        platform: '"Windows"',
        chrome_version: "131",
      },
    ].freeze

    # @param browser_opts [Hash] extra options merged into Ferrum::Browser.new
    #   (may override the defaults set by fetch_page).
    def initialize(browser_opts: {})
      @browser_opts = browser_opts
    end

    # Fetch the Electropedia page HTML for a given IEV code.
    # Returns a Nokogiri document, or nil when the request is blocked by
    # the WAF, the browser fails, or the ferrum/nokogiri gems are missing.
    def fetch_page(code)
      require "ferrum"
      require "nokogiri"

      url = "#{BASE_URL}#{code}"
      browser = Ferrum::Browser.new(
        headless: "new",
        timeout: 30,
        window_size: [1366, 768],
        browser_options: {
          "disable-blink-features" => "AutomationControlled",
        },
        **@browser_opts,
      )

      browser.headers.set(random_headers)
      browser.go_to(url)
      browser.network.wait_for_idle(timeout: 15)
      html = browser.body

      # Check if we got a real page or a WAF block
      if html.include?("403 ERROR") || html.include?("Request blocked")
        warn "IEV Scraper: AWS WAF blocked request for #{code}"
        return nil
      end

      Nokogiri::HTML(html)
    rescue LoadError => e
      # Fix: LoadError must be rescued BEFORE the Ferrum clause. When the
      # ferrum gem is absent, evaluating `Ferrum::Error` in the rescue
      # clause below would itself raise NameError and mask the real cause.
      warn "IEV Scraper: missing dependency for #{code}: #{e.message}"
      nil
    rescue Ferrum::Error, Ferrum::BrowserError => e
      warn "IEV Scraper error for #{code}: #{e.message}"
      nil
    ensure
      # `browser` is nil here if an error occurred before it was assigned.
      browser&.quit
    end

    # Fetch and parse concept data for an IEV code.
    # Returns a hash with concept data or nil if not found.
    def fetch_concept(code)
      doc = fetch_page(code)
      return nil unless doc

      PageParser.new(doc, code).parse
    end

    private

    # Builds a randomized, internally consistent header set: the
    # Sec-Ch-Ua / Sec-Ch-Ua-Platform client hints always match the
    # sampled User-Agent profile.
    def random_headers
      profile = USER_AGENT_PROFILES.sample
      sec_ch_ua = "\"Google Chrome\";v=\"#{profile[:chrome_version]}\", " \
                  "\"Chromium\";v=\"#{profile[:chrome_version]}\", " \
                  "\"Not_A Brand\";v=\"24\""

      {
        "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
                    "image/avif,image/webp,image/apng,*/*;q=0.8," \
                    "application/signed-exchange;v=b3;q=0.7",
        "Accept-Language" => "en-GB,en-US;q=0.9,en;q=0.8",
        "Cache-Control" => "no-cache",
        "Pragma" => "no-cache",
        "Sec-Ch-Ua" => sec_ch_ua,
        "Sec-Ch-Ua-Mobile" => "?0",
        "Sec-Ch-Ua-Platform" => profile[:platform],
        "Sec-Fetch-Dest" => "document",
        "Sec-Fetch-Mode" => "navigate",
        "Sec-Fetch-Site" => "cross-site",
        "Sec-Fetch-User" => "?1",
        "Upgrade-Insecure-Requests" => "1",
        "User-Agent" => profile[:user_agent],
      }
    end
  end
end
134
+
135
+ require_relative "scraper/page_parser"
@@ -11,7 +11,6 @@ module Iev
11
11
  # @example
12
12
  # SourceParser.new(cell_data_string).parsed_sources
13
13
  class SourceParser
14
- include Cli::Ui
15
14
  include Utilities
16
15
  using DataConversions
17
16
 
@@ -71,20 +70,25 @@ module Iev
71
70
  end
72
71
 
73
72
  def extract_single_source(raw_ref)
74
- relation_type = extract_source_relationship(raw_ref)
73
+ relationship = extract_source_relationship(raw_ref)
75
74
  clean_ref = normalize_ref_string(raw_ref)
76
75
  source_ref = extract_source_ref(clean_ref)
77
76
  clause = extract_source_clause(clean_ref)
78
77
 
79
- {
80
- "ref" => source_ref,
81
- "clause" => clause,
82
- "link" => obtain_source_link(source_ref),
83
- "relationship" => relation_type,
84
- "original" => Iev::Converter.mathml_to_asciimath(
78
+ origin = Glossarist::Citation.new(
79
+ ref: source_ref,
80
+ locality: build_locality(clause),
81
+ link: obtain_source_link(source_ref),
82
+ original: Iev::Converter.mathml_to_asciimath(
85
83
  parse_anchor_tag(raw_ref, @term_domain),
86
84
  ),
87
- }.compact
85
+ )
86
+
87
+ Glossarist::ConceptSource.new(
88
+ status: relationship[:status],
89
+ origin: origin,
90
+ modification: relationship[:modification],
91
+ )
88
92
  rescue ::RelatonBib::RequestError => e
89
93
  warn e.message
90
94
  end
@@ -208,7 +212,6 @@ module Iev
208
212
  /Constitution de l’Union internationale des télécommunications (UIT)/
209
213
  "International Telecommunication Union (ITU) Constitution (Ed. 2015)"
210
214
  else
211
- debug :sources, "Failed to parse source: '#{str}'"
212
215
  str
213
216
  end
214
217
  end
@@ -320,26 +323,36 @@ module Iev
320
323
 
321
324
  case str
322
325
  when /^MOD ([\d\-])/
323
- {
324
- "type" => type.to_s,
325
- }
326
+ { status: type.to_s }
326
327
  when /(modified|modifié|modifiée|modifiés|MOD)\s*[–-]?\s+(.+)\Z/
327
328
  {
328
- "type" => type.to_s,
329
- "modification" => Iev::Converter.mathml_to_asciimath(
329
+ status: type.to_s,
330
+ modification: Iev::Converter.mathml_to_asciimath(
330
331
  parse_anchor_tag(::Regexp.last_match(2), @term_domain),
331
332
  ).strip,
332
333
  }
333
334
  else
334
- {
335
- "type" => type.to_s,
336
- }
335
+ { status: type.to_s }
337
336
  end
338
337
  end
339
338
 
339
+ def build_locality(clause)
340
+ return nil unless clause
341
+
342
+ Glossarist::Locality.new(
343
+ type: "clause",
344
+ reference_from: clause,
345
+ )
346
+ end
347
+
340
348
  # Uses Relaton to obtain link for given source ref.
341
349
  def obtain_source_link(ref)
350
+ return nil unless defined?(RelatonDb)
351
+
342
352
  RelatonDb.instance.fetch(ref)&.url
353
+ rescue ::RelatonBib::RequestError => e
354
+ warn e.message
355
+ nil
343
356
  end
344
357
  end
345
358
  end
@@ -9,8 +9,8 @@ module Iev
9
9
  #
10
10
  # @example
11
11
  # SupersessionParser.new(cell_data_string).supersessions
12
+ # # => [Glossarist::RelatedConcept, ...]
12
13
  class SupersessionParser
13
- include Cli::Ui
14
14
  using DataConversions
15
15
 
16
16
  attr_reader :raw_str, :src_str, :supersessions
@@ -52,18 +52,14 @@ module Iev
52
52
  end
53
53
 
54
54
  def relation_from_match(match_data)
55
- {
56
- "type" => "supersedes",
57
- "ref" => iev_ref_from_match(match_data),
58
- }
59
- end
60
-
61
- def iev_ref_from_match(match_data)
62
- {
63
- "source" => "IEV",
64
- "id" => match_data[:ref],
65
- "version" => match_data[:version],
66
- }
55
+ Glossarist::RelatedConcept.new(
56
+ type: "supersedes",
57
+ ref: Glossarist::Citation.new(
58
+ source: "IEV",
59
+ id: match_data[:ref],
60
+ version: match_data[:version],
61
+ ),
62
+ )
67
63
  end
68
64
  end
69
65
  end
@@ -13,7 +13,6 @@ module Iev
13
13
  # parser.plurality # returns grammatical plurality
14
14
  # parser.part_of_speech # returns part of speech
15
15
  class TermAttrsParser
16
- include Cli::Ui
17
16
  using DataConversions
18
17
 
19
18
  attr_reader :raw_str, :src_str, :gender, :geographical_area,
@@ -44,6 +43,19 @@ module Iev
44
43
  "<ATTRIBUTES: #{src_str}>".freeze
45
44
  end
46
45
 
46
+ # Constructs a Glossarist::Designation::GrammarInfo from the parsed
47
+ # gender, plurality, and part_of_speech attributes.
48
+ # Returns nil if none of these attributes were parsed.
49
+ def to_grammar_info
50
+ return nil unless gender || plurality || part_of_speech
51
+
52
+ Glossarist::Designation::GrammarInfo.new(
53
+ gender: gender ? [gender] : nil,
54
+ number: plurality ? [plurality] : nil,
55
+ part_of_speech: part_of_speech,
56
+ )
57
+ end
58
+
47
59
  private
48
60
 
49
61
  def parse
@@ -58,10 +70,7 @@ module Iev
58
70
 
59
71
  return unless /\p{Word}/.match?(curr_str)
60
72
 
61
- debug(
62
- :term_attributes,
63
- "Term attributes could not be parsed completely: '#{src_str}'",
64
- )
73
+ # Term attributes could not be parsed completely
65
74
  end
66
75
 
67
76
  def extract_gender(str)
@@ -130,11 +139,16 @@ module Iev
130
139
  \b
131
140
  /x
132
141
 
133
- @prefix = true if remove_from_string(str, prefix_rx)
142
+ removed = remove_from_string(str, prefix_rx)
143
+ @prefix = removed if removed
134
144
  end
135
145
 
136
146
  def decode_attrs_string(str)
137
- str.decode_html || ""
147
+ decoded = str.decode_html || ""
148
+ # Strip common HTML inline tags that appear in TERMATTRIBUTE data
149
+ # and would interfere with usage_info angle-bracket parsing.
150
+ # Only strip known HTML tags, not usage_info like <telecommunications>.
151
+ decoded.gsub(/<\/?(?:sup|sub|i|b|em|strong|span|small)>/, "")
138
152
  end
139
153
 
140
154
  def remove_from_string(string, regexp)