RubyGems - iev - Versions diffs - 0.3.0 → 0.3.3 - Mend

iev 0.3.0 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

checksums.yaml +4 -4
data/.github/workflows/rake.yml +3 -20
data/.github/workflows/release.yml +25 -0
data/.gitignore +4 -0
data/.rubocop.yml +0 -2
data/README.adoc +4 -4
data/exe/iev-glossarist +21 -0
data/iev.gemspec +12 -3
data/lib/iev/cli/command.rb +109 -0
data/lib/iev/cli/command_helper.rb +83 -0
data/lib/iev/cli/ui.rb +70 -0
data/lib/iev/cli.rb +22 -0
data/lib/iev/converter/mathml_to_asciimath.rb +197 -0
data/lib/iev/converter.rb +9 -0
data/lib/iev/data_conversions.rb +39 -0
data/lib/iev/db.rb +3 -3
data/lib/iev/db_cache.rb +2 -2
data/lib/iev/db_writer.rb +81 -0
data/lib/iev/iso_639_2.yaml +4075 -0
data/lib/iev/iso_639_code.rb +47 -0
data/lib/iev/profiler.rb +69 -0
data/lib/iev/relaton_db.rb +63 -0
data/lib/iev/source_parser.rb +350 -0
data/lib/iev/supersession_parser.rb +70 -0
data/lib/iev/term_attrs_parser.rb +143 -0
data/lib/iev/term_builder.rb +313 -0
data/lib/iev/utilities.rb +58 -0
data/lib/iev/version.rb +2 -2
data/lib/iev.rb +24 -2
metadata +153 -10

data/lib/iev/term_attrs_parser.rb ADDED Viewed

@@ -0,0 +1,143 @@
+# frozen_string_literal: true
+# (c) Copyright 2020 Ribose Inc.
+#
+module IEV
+  # Parses information from the spreadsheet's TERMATTRIBUTE column and alike.
+  #
+  # @example
+  #   parser = TermAttrsParser.new(cell_data_string)
+  #   parser.gender # returns grammatical gender
+  #   parser.plurality # returns grammatical plurality
+  #   parser.part_of_speech # returns part of speech
+  class TermAttrsParser
+    include CLI::UI
+    using DataConversions
+    attr_reader :raw_str, :src_str
+    attr_reader :gender, :geographical_area, :part_of_speech, :plurality,
+      :prefix, :usage_info
+    PARTS_OF_SPEECH = {
+      "adj" => "adj",
+      "noun" => "noun",
+      "verb" => "verb",
+      "名詞" => "noun",
+      "動詞" => "verb",
+      "形容詞" => "adj",
+      "형용사" => "adj",
+      "Adjektiv" => "adj",
+    }.freeze
+    PREFIX_KEYWORDS = %w[
+      Präfix prefix préfixe 接尾語 접두사 przedrostek prefixo 词头
+    ].freeze
+    def initialize(attr_str)
+      @raw_str = attr_str.dup.freeze
+      @src_str = decode_attrs_string(raw_str).freeze
+      parse
+    end
+    def inspect
+      "<ATTRIBUTES: #{src_str}>".freeze
+    end
+    private
+    def parse
+      curr_str = src_str.dup
+      extract_gender(curr_str)
+      extract_plurality(curr_str)
+      extract_geographical_area(curr_str)
+      extract_part_of_speech(curr_str)
+      extract_usage_info(curr_str)
+      extract_prefix(curr_str)
+      if /\p{Word}/ =~ curr_str
+        debug(
+          :term_attributes,
+          "Term attributes could not be parsed completely: '#{src_str}'",
+        )
+      end
+    end
+    def extract_gender(str)
+      gender_rx = /\b[mfn]\b/
+      @gender = remove_from_string(str, gender_rx)
+    end
+    # Must happen after #extract_gender
+    def extract_plurality(str)
+      plural_rx = /\bpl\b/
+      if remove_from_string(str, plural_rx)
+        @plurality = "plural"
+      elsif !gender.nil?
+        # TODO Really needed?
+        @plurality = "singular"
+      end
+    end
+    # TODO this is likely buggy
+    def extract_geographical_area(str)
+      ga_rx = /\b[A-Z]{2}$/
+      @geographical_area = remove_from_string(str, ga_rx)
+    end
+    def extract_part_of_speech(str)
+      pos_rx = %r{
+        \b
+        #{Regexp.union(PARTS_OF_SPEECH.keys)}
+        \b
+      }x.freeze
+      removed = remove_from_string(str, pos_rx)
+      @part_of_speech = PARTS_OF_SPEECH[removed] || removed
+    end
+    def extract_usage_info(str)
+      info_rx = %r{
+        # regular ASCII less and greater than signs
+        < (?<inner>.*?) >
+        |
+        # ＜ and ＞, i.e. full-width less and greater than signs
+        # which are used instead of ASCII signs in some CJK terms
+        \uFF1C (?<inner>.*?) \uFF1E
+      }x.freeze
+      remove_from_string(str, info_rx) do |md|
+        @usage_info = md[:inner].strip
+      end
+    end
+    def extract_prefix(str)
+      prefix_rx = %r{
+        \b
+        #{Regexp.union(PREFIX_KEYWORDS)}
+        \b
+      }x.freeze
+      @prefix = true if remove_from_string(str, prefix_rx)
+    end
+    def decode_attrs_string(str)
+      str.decode_html || ""
+    end
+    def remove_from_string(string, regexp)
+      string.sub!(regexp, "")
+      if $~ && block_given?
+        yield $~
+      else
+        $& # removed substring or nil
+      end
+    end
+  end
+end

data/lib/iev/term_builder.rb ADDED Viewed

@@ -0,0 +1,313 @@
+# frozen_string_literal: true
+# (c) Copyright 2020 Ribose Inc.
+#
+require "pp"
+module IEV
+  class TermBuilder
+    include CLI::UI
+    include Utilities
+    using DataConversions
+    def initialize(data)
+      @data = data
+    end
+    def build
+      build_term_object
+    end
+    def self.build_from(data)
+      new(data).build
+    end
+    attr_reader :data
+    def find_value_for(key)
+      data.fetch(key.to_sym, nil)&.sanitize
+    end
+    def flesh_date(incomplete_date)
+      return incomplete_date if incomplete_date.nil? || incomplete_date.empty?
+      year, month, day = incomplete_date.split("-")
+      month ||= "01"
+      day ||= "01"
+      DateTime.parse("#{year}-#{month}-#{day}").to_s
+    end
+    def build_term_object
+      set_ui_tag "#{term_id} (#{term_language})"
+      progress "Processing term #{term_id} (#{term_language})..."
+      split_definition
+      Glossarist::LocalizedConcept.new(term_hash)
+    end
+    def term_hash
+      dates = nil
+      if flesh_date(find_value_for("PUBLICATIONDATE"))
+        dates = [
+          {
+            type: :accepted,
+            date: flesh_date(find_value_for("PUBLICATIONDATE")),
+          },
+          {
+            type: :amended,
+            date: flesh_date(find_value_for("PUBLICATIONDATE")),
+          },
+        ]
+      end
+      {
+        id: term_id,
+        entry_status: extract_entry_status,
+        classification: extract_classification,
+        dates: dates,
+        review_date: flesh_date(find_value_for("PUBLICATIONDATE")),
+        review_decision_date: flesh_date(find_value_for("PUBLICATIONDATE")),
+        review_decision_event: "published",
+        terms: extract_terms,
+        notes: extract_notes,
+        examples: extract_examples,
+        definition: [{ "content" => extract_definition_value }],
+        sources: extract_authoritative_source,
+        language_code: term_language,
+        related: extract_superseded_concepts,
+      }.compact
+    end
+    def term_id
+      @term_id ||= find_value_for("IEVREF")
+    end
+    def term_domain
+      @term_domain ||= term_id.slice(0, 3)
+    end
+    def term_language
+      @term_language ||= find_value_for("LANGUAGE").to_three_char_code
+    end
+    # Splits unified definition (from the spreadsheet) into separate
+    # definition, examples, and notes strings (for YAMLs).
+    #
+    # Sets +@definition+, +@examples+ and +@notes+ variables.
+    def split_definition
+      slicer_rx = %r{
+        \s*
+        (?:<p>\s*)?
+        (
+          (?<example>
+            # English example
+            \bEXAMPLE\b |
+            ^\bExamples\s+are\b: |
+            ^\bExamples\b: |
+            ^\bExample\b: |
+            # French examples
+            \bEXEMPLE\b |
+            ^\bExemples\b:
+          )
+          |
+          (?<note>
+            Note\s*\d+\sto\sentry: |
+            Note&nbsp;\d+\sto\sentry: |
+            Note\s*\d+\sto\sthe\sentry: |
+            Note\sto\sentry\s*\d+: |
+            Note\s*\d+?\sà\sl['’]article: |
+            <NOTE\/?>?\s*\d?\s+.*?– |
+            NOTE(?:\s+-)? |
+            Note\s+\d+\s– |
+            Note&nbsp;\d+\s
+          )
+        )
+        \s*
+      }x
+      @examples = []
+      @notes = []
+      definition_arr = [] # here array for consistent interface
+      next_part_arr = definition_arr
+      remaining_str = find_value_for("DEFINITION")
+      while md = remaining_str&.match(slicer_rx)
+        next_part = md.pre_match
+        next_part.sub!(/^\[:Ex(a|e)mple\]/, "Ex\\1mple")
+        next_part_arr.push(next_part)
+        next_part_arr = md[:example] ? @examples : @notes
+        # 112-03-17
+        # supplements the name of a quantity, especially for a component in a
+        # system, to indicate the quotient of that quantity by the total
+        # volume
+        # <NOTE – Examples: amount-of-substance volume concentration of
+        # component B (or concentration of B, in particular, ion
+        # concentration), molecular concentration of B, electron concentration
+        # (or electron density).
+        #
+        # In the above case the `Example` is part of the note but the regex
+        # above will capture it as an example and will add an empty `Note`
+        # and put the rest in an `Example`. So In this case we will replace
+        # the `Example` with `[:Example]` and revert it in the next iteration
+        # so it will not be caught by the regex.
+        remaining_str = md.post_match
+        remaining_str.sub!(/^Ex(a|e)mple/, "[:Ex\\1mple]") if md[:note]
+      end
+      remaining_str&.sub!(/^\[:Ex(a|e)mple\]/, "Ex\\1mple")
+      next_part_arr.push(remaining_str)
+      @definition = definition_arr.first
+      @definition = nil if @definition&.empty?
+    end
+    def extract_terms
+      [
+        extract_primary_designation,
+        *extract_synonymous_designations,
+        extract_international_symbol_designation,
+      ].compact
+    end
+    def extract_primary_designation
+      raw_term = find_value_for("TERM")
+      raw_term = "NA" if raw_term == "....."
+      build_expression_designation(
+        raw_term,
+        attribute_data: find_value_for("TERMATTRIBUTE"),
+        status: "preferred",
+      )
+    end
+    def extract_synonymous_designations
+      retval = (1..3).map do |num|
+        designations = find_value_for("SYNONYM#{num}") || ""
+        # Some synonyms have more than one entry
+        designations.split(/<[pbr]+>/).map do |raw_term|
+          build_expression_designation(
+            raw_term,
+            attribute_data: find_value_for("SYNONYM#{num}ATTRIBUTE"),
+            status: find_value_for("SYNONYM#{num}STATUS")&.downcase,
+          )
+        end
+      end
+      retval.flatten.compact
+    end
+    def extract_international_symbol_designation
+      raw_term = find_value_for("SYMBOLE")
+      raw_term && build_symbol_designation(raw_term)
+    end
+    def extract_definition_value
+      if @definition
+        IEV::Converter.mathml_to_asciimath(
+          replace_newlines(parse_anchor_tag(@definition, term_domain)),
+        ).strip
+      end
+    end
+    def extract_examples
+      @examples.map do |str|
+        IEV::Converter.mathml_to_asciimath(
+          replace_newlines(parse_anchor_tag(str, term_domain)),
+        ).strip
+      end
+    end
+    def extract_notes
+      @notes.map do |str|
+        IEV::Converter.mathml_to_asciimath(
+          replace_newlines(parse_anchor_tag(str, term_domain)),
+        ).strip
+      end
+    end
+    def extract_entry_status
+      case find_value_for("STATUS").downcase
+      when "standard" then "valid"
+      else nil
+      end
+    end
+    def extract_classification
+      classification_val = find_value_for("SYNONYM1STATUS")
+      case classification_val
+      when ""
+        "admitted"
+      when "认可的", "допустимый", "admitido"
+        "admitted"
+      when "首选的", "suositettava", "suositeltava", "рекомендуемый", "preferente"
+        "preferred"
+      else
+        classification_val
+      end
+    end
+    def extract_authoritative_source
+      source_val = find_value_for("SOURCE")
+      return nil if source_val.nil?
+      SourceParser.new(source_val, term_domain)
+        .parsed_sources
+        .compact
+        .map do |source|
+        source.merge({ "type" => "authoritative" })
+      end
+    end
+    def extract_superseded_concepts
+      replaces_val = find_value_for("REPLACES")
+      return nil if replaces_val.nil?
+      SupersessionParser.new(replaces_val).supersessions
+    end
+    private
+    def build_expression_designation(raw_term, attribute_data:, status:)
+      term = IEV::Converter.mathml_to_asciimath(
+        parse_anchor_tag(raw_term, term_domain),
+      )
+      term_attributes = TermAttrsParser.new(attribute_data.to_s)
+      statuses = {
+        "obsoleto" => "deprecated",
+        "напуштен" => "deprecated",
+      }
+      {
+        "type" => "expression",
+        "prefix" => term_attributes.prefix,
+        "normative_status" => statuses[status] || status,
+        "usage_info" => term_attributes.usage_info,
+        "designation" => term,
+        "part_of_speech" => term_attributes.part_of_speech,
+        "geographical_area" => term_attributes.geographical_area,
+        "gender" => term_attributes.gender,
+        "plurality" => term_attributes.plurality,
+      }.compact
+    end
+    def build_symbol_designation(raw_term)
+      term = IEV::Converter.mathml_to_asciimath(
+        parse_anchor_tag(raw_term, term_domain),
+      )
+      {
+        "type" => "symbol",
+        "designation" => term,
+        "international" => true,
+      }.compact
+    end
+  end
+end

data/lib/iev/utilities.rb ADDED Viewed

@@ -0,0 +1,58 @@
+# frozen_string_literal: true
+module IEV
+  module Utilities
+    SIMG_PATH_REGEX = "<simg .*\\/\\$file\\/([\\d\\-\\w\.]+)>"
+    FIGURE_ONE_REGEX =
+      "<p><b>\\s*Figure\\s+(\\d)\\s+[–-]\\s+(.+)\\s*<\\/b>(<\\/p>)?"
+    FIGURE_TWO_REGEX = "#{FIGURE_ONE_REGEX}\\s*#{FIGURE_ONE_REGEX}"
+    IMAGE_PATH_PREFIX = "image::/assets/images/parts"
+    def parse_anchor_tag(text, term_domain)
+      if text
+        # Convert IEV term references
+        # Convert href links
+        # Need to take care of this pattern:
+        #  `inverse de la <a href="IEV103-06-01">période<a>`
+        text.gsub(
+          /<a href="?(IEV)\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)<\/?a>/,
+          '{{\3, \1:\2}}',
+        ).gsub(
+          /<a href="?\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)<\/?a>/,
+          '{{\3, IEV:\2}}',
+        ).gsub(
+          # To handle <a> tags without ending tag like
+          #  `Voir <a href=IEV103-05-21>IEV 103-05-21`
+          #  for concept '702-03-11' in `fr`
+          /<a href="?(IEV)?\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)$/,
+          '{{\3, IEV:\2}}',
+        ).gsub(
+          /<a href="?([^<>]*?)"?>(.*?)<\/a>/,
+          '\1[\2]',
+        ).gsub(
+          Regexp.new([SIMG_PATH_REGEX, "\\s*", FIGURE_TWO_REGEX].join),
+          "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3; \\6]",
+        ).gsub(
+          Regexp.new([SIMG_PATH_REGEX, "\\s*", FIGURE_ONE_REGEX].join),
+          "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3]",
+        ).gsub(
+          /<img\s+([^<>]+?)\s*>/,
+          "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[]",
+        ).gsub(
+          /<br>/,
+          "\n",
+        ).gsub(
+          /<b>(.*?)<\/b>/,
+          "*\\1*",
+        )
+      end
+    end
+    def replace_newlines(input)
+      input.gsub('\n', "\n\n")
+        .gsub(/<[pbr]+>/, "\n\n")
+        .gsub(/\s*\n[\n\s]+/, "\n\n")
+        .strip
+    end
+  end
+end

data/lib/iev/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
-module Iev
-  VERSION = "0.3.0".freeze
+module IEV
+  VERSION = "0.3.3".freeze
 end

data/lib/iev.rb CHANGED Viewed

@@ -3,7 +3,27 @@ require "iev/db"
 require "open-uri"
 require "nokogiri"
-module Iev
+require "benchmark"
+require "creek"
+require "unitsml"
+require "plurimath"
+require "glossarist"
+require "relaton"
+require "relaton_bib"
+require "sequel"
+require "thor"
+require "yaml"
+require "zeitwerk"
+loader = Zeitwerk::Loader.for_gem
+loader.inflector.inflect(
+  "cli" => "CLI",
+  "iev" => "IEV",
+  "ui" => "UI",
+)
+loader.setup
+module IEV
   #
   # Scrape Electropedia for term.
   #
@@ -23,6 +43,8 @@ module Iev
     a = doc&.at(xpath)&.children&.to_xml
     a&.sub(%r{<br/>.*$}, "")
       &.sub(%r{, &lt;.*$}, "")
-      &.gsub(%r{<[^>]*>}, "")&.strip
+      &.gsub(%r{<[^<>]*>}, "")&.strip
   end
 end
+require "iev/cli"