RubyGems - iev - Versions diffs - 0.3.1 → 0.3.3 - Mend

iev 0.3.1 → 0.3.3

Files changed (30) hide show

checksums.yaml +4 -4
data/.github/workflows/rake.yml +1 -1
data/.github/workflows/release.yml +25 -0
data/.gitignore +4 -0
data/.rubocop.yml +0 -2
data/README.adoc +4 -4
data/exe/iev-glossarist +21 -0
data/iev.gemspec +12 -3
data/lib/iev/cli/command.rb +109 -0
data/lib/iev/cli/command_helper.rb +83 -0
data/lib/iev/cli/ui.rb +70 -0
data/lib/iev/cli.rb +22 -0
data/lib/iev/converter/mathml_to_asciimath.rb +197 -0
data/lib/iev/converter.rb +9 -0
data/lib/iev/data_conversions.rb +39 -0
data/lib/iev/db.rb +3 -3
data/lib/iev/db_cache.rb +2 -2
data/lib/iev/db_writer.rb +81 -0
data/lib/iev/iso_639_2.yaml +4075 -0
data/lib/iev/iso_639_code.rb +47 -0
data/lib/iev/profiler.rb +69 -0
data/lib/iev/relaton_db.rb +63 -0
data/lib/iev/source_parser.rb +350 -0
data/lib/iev/supersession_parser.rb +70 -0
data/lib/iev/term_attrs_parser.rb +143 -0
data/lib/iev/term_builder.rb +313 -0
data/lib/iev/utilities.rb +58 -0
data/lib/iev/version.rb +2 -2
data/lib/iev.rb +24 -2
metadata +153 -10

data/lib/iev/term_attrs_parser.rb ADDED Viewed

@@ -0,0 +1,143 @@
+# frozen_string_literal: true
+# (c) Copyright 2020 Ribose Inc.
+#
+module IEV
+  # Parses information from the spreadsheet's TERMATTRIBUTE column and alike.
+  #
+  # @example
+  #   parser = TermAttrsParser.new(cell_data_string)
+  #   parser.gender # returns grammatical gender
+  #   parser.plurality # returns grammatical plurality
+  #   parser.part_of_speech # returns part of speech
+  class TermAttrsParser
+    include CLI::UI
+    using DataConversions
+    attr_reader :raw_str, :src_str
+    attr_reader :gender, :geographical_area, :part_of_speech, :plurality,
+      :prefix, :usage_info
+    PARTS_OF_SPEECH = {
+      "adj" => "adj",
+      "noun" => "noun",
+      "verb" => "verb",
+      "名詞" => "noun",
+      "動詞" => "verb",
+      "形容詞" => "adj",
+      "형용사" => "adj",
+      "Adjektiv" => "adj",
+    }.freeze
+    PREFIX_KEYWORDS = %w[
+      Präfix prefix préfixe 接尾語 접두사 przedrostek prefixo 词头
+    ].freeze
+    # Keeps a frozen copy of the raw attribute string, decodes it into
+    # +src_str+, and parses it straight away so the readers are populated.
+    #
+    # @param attr_str [String] raw contents of an attribute cell; it is
+    #   duplicated before freezing, so the caller's object is not frozen
+    def initialize(attr_str)
+      @raw_str = attr_str.dup.freeze
+      @src_str = decode_attrs_string(raw_str).freeze
+      parse
+    end
+    # Short debugging representation showing the decoded attribute string.
+    #
+    # @return [String] frozen string of the form "<ATTRIBUTES: ...>"
+    def inspect
+      "<ATTRIBUTES: #{src_str}>".freeze
+    end
+    private
+    def parse
+      curr_str = src_str.dup
+      extract_gender(curr_str)
+      extract_plurality(curr_str)
+      extract_geographical_area(curr_str)
+      extract_part_of_speech(curr_str)
+      extract_usage_info(curr_str)
+      extract_prefix(curr_str)
+      if /\p{Word}/ =~ curr_str
+        debug(
+          :term_attributes,
+          "Term attributes could not be parsed completely: '#{src_str}'",
+        )
+      end
+    end
+    def extract_gender(str)
+      gender_rx = /\b[mfn]\b/
+      @gender = remove_from_string(str, gender_rx)
+    end
+    # Must happen after #extract_gender
+    def extract_plurality(str)
+      plural_rx = /\bpl\b/
+      if remove_from_string(str, plural_rx)
+        @plurality = "plural"
+      elsif !gender.nil?
+        # TODO Really needed?
+        @plurality = "singular"
+      end
+    end
+    # TODO this is likely buggy
+    def extract_geographical_area(str)
+      ga_rx = /\b[A-Z]{2}$/
+      @geographical_area = remove_from_string(str, ga_rx)
+    end
+    def extract_part_of_speech(str)
+      pos_rx = %r{
+        \b
+        #{Regexp.union(PARTS_OF_SPEECH.keys)}
+        \b
+      }x.freeze
+      removed = remove_from_string(str, pos_rx)
+      @part_of_speech = PARTS_OF_SPEECH[removed] || removed
+    end
+    def extract_usage_info(str)
+      info_rx = %r{
+        # regular ASCII less and greater than signs
+        < (?<inner>.*?) >
+        |
+        # ＜ and ＞, i.e. full-width less and greater than signs
+        # which are used instead of ASCII signs in some CJK terms
+        \uFF1C (?<inner>.*?) \uFF1E
+      }x.freeze
+      remove_from_string(str, info_rx) do |md|
+        @usage_info = md[:inner].strip
+      end
+    end
+    def extract_prefix(str)
+      prefix_rx = %r{
+        \b
+        #{Regexp.union(PREFIX_KEYWORDS)}
+        \b
+      }x.freeze
+      @prefix = true if remove_from_string(str, prefix_rx)
+    end
+    # Decodes +str+ with +decode_html+ (presumably supplied by the
+    # DataConversions refinement -- confirm), falling back to an empty string
+    # when that returns +nil+.
+    def decode_attrs_string(str)
+      str.decode_html || ""
+    end
+    def remove_from_string(string, regexp)
+      string.sub!(regexp, "")
+      if $~ && block_given?
+        yield $~
+      else
+        $& # removed substring or nil
+      end
+    end
+  end
+end

data/lib/iev/term_builder.rb ADDED Viewed

@@ -0,0 +1,313 @@
+# frozen_string_literal: true
+# (c) Copyright 2020 Ribose Inc.
+#
+require "pp"
+module IEV
+  class TermBuilder
+    include CLI::UI
+    include Utilities
+    using DataConversions
+    # @param data [Hash] source values for one term, looked up by symbol key
+    #   in #find_value_for (presumably one spreadsheet row -- confirm)
+    def initialize(data)
+      @data = data
+    end
+    # Builds the term object for +data+; delegates to #build_term_object.
+    def build
+      build_term_object
+    end
+    # Convenience shortcut for +new(data).build+.
+    def self.build_from(data)
+      new(data).build
+    end
+    attr_reader :data
+    # Looks up +key+ (converted to a symbol) in +data+.
+    #
+    # @return the value after +sanitize+ has been called on it, or +nil+ when
+    #   the key is absent or its value is +nil+ (+sanitize+ is presumably
+    #   supplied by the DataConversions refinement -- confirm)
+    def find_value_for(key)
+      data.fetch(key.to_sym, nil)&.sanitize
+    end
+    def flesh_date(incomplete_date)
+      return incomplete_date if incomplete_date.nil? || incomplete_date.empty?
+      year, month, day = incomplete_date.split("-")
+      month ||= "01"
+      day ||= "01"
+      DateTime.parse("#{year}-#{month}-#{day}").to_s
+    end
+    # Tags the UI with the current term, reports progress, and builds the
+    # localized concept.
+    #
+    # #split_definition has to run before #term_hash because the latter reads
+    # the +@definition+, +@examples+ and +@notes+ variables it sets.
+    #
+    # @return [Glossarist::LocalizedConcept]
+    def build_term_object
+      set_ui_tag "#{term_id} (#{term_language})"
+      progress "Processing term #{term_id} (#{term_language})..."
+      split_definition
+      Glossarist::LocalizedConcept.new(term_hash)
+    end
+    def term_hash
+      dates = nil
+      if flesh_date(find_value_for("PUBLICATIONDATE"))
+        dates = [
+          {
+            type: :accepted,
+            date: flesh_date(find_value_for("PUBLICATIONDATE")),
+          },
+          {
+            type: :amended,
+            date: flesh_date(find_value_for("PUBLICATIONDATE")),
+          },
+        ]
+      end
+      {
+        id: term_id,
+        entry_status: extract_entry_status,
+        classification: extract_classification,
+        dates: dates,
+        review_date: flesh_date(find_value_for("PUBLICATIONDATE")),
+        review_decision_date: flesh_date(find_value_for("PUBLICATIONDATE")),
+        review_decision_event: "published",
+        terms: extract_terms,
+        notes: extract_notes,
+        examples: extract_examples,
+        definition: [{ "content" => extract_definition_value }],
+        sources: extract_authoritative_source,
+        language_code: term_language,
+        related: extract_superseded_concepts,
+      }.compact
+    end
+    # Memoized IEVREF value of the term.
+    def term_id
+      @term_id ||= find_value_for("IEVREF")
+    end
+    # Memoized first three characters of #term_id.
+    def term_domain
+      @term_domain ||= term_id.slice(0, 3)
+    end
+    # Memoized LANGUAGE value converted with +to_three_char_code+ (presumably
+    # supplied by the DataConversions refinement -- confirm).
+    # NOTE(review): raises NoMethodError when LANGUAGE is missing, because
+    # #find_value_for then returns +nil+.
+    def term_language
+      @term_language ||= find_value_for("LANGUAGE").to_three_char_code
+    end
+    # Splits unified definition (from the spreadsheet) into separate
+    # definition, examples, and notes strings (for YAMLs).
+    #
+    # The DEFINITION value is cut at every example or note marker matched by
+    # +slicer_rx+.  The text before the first marker is the definition; each
+    # later piece is filed under the kind of marker that precedes it.
+    #
+    # Sets +@definition+, +@examples+ and +@notes+ variables.  +@definition+
+    # is +nil+ when the value is missing or no text precedes the first marker.
+    def split_definition
+      slicer_rx = %r{
+        \s*
+        (?:<p>\s*)?
+        (
+          (?<example>
+            # English example
+            \bEXAMPLE\b |
+            ^\bExamples\s+are\b: |
+            ^\bExamples\b: |
+            ^\bExample\b: |
+            # French examples
+            \bEXEMPLE\b |
+            ^\bExemples\b:
+          )
+          |
+          (?<note>
+            Note\s*\d+\sto\sentry: |
+            Note&nbsp;\d+\sto\sentry: |
+            Note\s*\d+\sto\sthe\sentry: |
+            Note\sto\sentry\s*\d+: |
+            Note\s*\d+?\sà\sl['’]article: |
+            <NOTE\/?>?\s*\d?\s+.*?– |
+            NOTE(?:\s+-)? |
+            Note\s+\d+\s– |
+            Note&nbsp;\d+\s
+          )
+        )
+        \s*
+      }x
+      @examples = []
+      @notes = []
+      definition_arr = [] # here array for consistent interface
+      # Receives the text that precedes the next marker; starts out as the
+      # definition and is switched after every match.
+      next_part_arr = definition_arr
+      remaining_str = find_value_for("DEFINITION")
+      while md = remaining_str&.match(slicer_rx)
+        next_part = md.pre_match
+        # Undo the masking applied at the end of the previous iteration.
+        next_part.sub!(/^\[:Ex(a|e)mple\]/, "Ex\\1mple")
+        next_part_arr.push(next_part)
+        # The text following this marker belongs to the marker's kind.
+        next_part_arr = md[:example] ? @examples : @notes
+        # 112-03-17
+        # supplements the name of a quantity, especially for a component in a
+        # system, to indicate the quotient of that quantity by the total
+        # volume
+        # <NOTE – Examples: amount-of-substance volume concentration of
+        # component B (or concentration of B, in particular, ion
+        # concentration), molecular concentration of B, electron concentration
+        # (or electron density).
+        #
+        # In the above case the `Example` is part of the note but the regex
+        # above will capture it as an example and will add an empty `Note`
+        # and put the rest in an `Example`. So In this case we will replace
+        # the `Example` with `[:Example]` and revert it in the next iteration
+        # so it will not be caught by the regex.
+        remaining_str = md.post_match
+        remaining_str.sub!(/^Ex(a|e)mple/, "[:Ex\\1mple]") if md[:note]
+      end
+      # Whatever is left after the last marker is the final piece; unmask it
+      # as well before filing it.
+      remaining_str&.sub!(/^\[:Ex(a|e)mple\]/, "Ex\\1mple")
+      next_part_arr.push(remaining_str)
+      @definition = definition_arr.first
+      @definition = nil if @definition&.empty?
+    end
+    # Collects all designations of the term: the primary one first, then the
+    # synonyms, then the international symbol.  +nil+ entries are dropped.
+    #
+    # @return [Array<Hash>]
+    def extract_terms
+      [
+        extract_primary_designation,
+        *extract_synonymous_designations,
+        extract_international_symbol_designation,
+      ].compact
+    end
+    # Builds the "preferred" expression designation from the TERM and
+    # TERMATTRIBUTE values.  A TERM of "....." is replaced with "NA".
+    #
+    # @return [Hash]
+    def extract_primary_designation
+      raw_term = find_value_for("TERM")
+      raw_term = "NA" if raw_term == "....."
+      build_expression_designation(
+        raw_term,
+        attribute_data: find_value_for("TERMATTRIBUTE"),
+        status: "preferred",
+      )
+    end
+    def extract_synonymous_designations
+      retval = (1..3).map do |num|
+        designations = find_value_for("SYNONYM#{num}") || ""
+        # Some synonyms have more than one entry
+        designations.split(/<[pbr]+>/).map do |raw_term|
+          build_expression_designation(
+            raw_term,
+            attribute_data: find_value_for("SYNONYM#{num}ATTRIBUTE"),
+            status: find_value_for("SYNONYM#{num}STATUS")&.downcase,
+          )
+        end
+      end
+      retval.flatten.compact
+    end
+    # Builds a symbol designation from the SYMBOLE value.
+    #
+    # @return [Hash, nil] +nil+ when the value is absent
+    def extract_international_symbol_designation
+      raw_term = find_value_for("SYMBOLE")
+      raw_term && build_symbol_designation(raw_term)
+    end
+    def extract_definition_value
+      if @definition
+        IEV::Converter.mathml_to_asciimath(
+          replace_newlines(parse_anchor_tag(@definition, term_domain)),
+        ).strip
+      end
+    end
+    def extract_examples
+      @examples.map do |str|
+        IEV::Converter.mathml_to_asciimath(
+          replace_newlines(parse_anchor_tag(str, term_domain)),
+        ).strip
+      end
+    end
+    def extract_notes
+      @notes.map do |str|
+        IEV::Converter.mathml_to_asciimath(
+          replace_newlines(parse_anchor_tag(str, term_domain)),
+        ).strip
+      end
+    end
+    def extract_entry_status
+      case find_value_for("STATUS").downcase
+      when "standard" then "valid"
+      else nil
+      end
+    end
+    def extract_classification
+      classification_val = find_value_for("SYNONYM1STATUS")
+      case classification_val
+      when ""
+        "admitted"
+      when "认可的", "допустимый", "admitido"
+        "admitted"
+      when "首选的", "suositettava", "suositeltava", "рекомендуемый", "preferente"
+        "preferred"
+      else
+        classification_val
+      end
+    end
+    def extract_authoritative_source
+      source_val = find_value_for("SOURCE")
+      return nil if source_val.nil?
+      SourceParser.new(source_val, term_domain)
+        .parsed_sources
+        .compact
+        .map do |source|
+        source.merge({ "type" => "authoritative" })
+      end
+    end
+    # Parses the REPLACES value with SupersessionParser.
+    #
+    # @return the parser's +supersessions+, or +nil+ when there is no
+    #   REPLACES value
+    def extract_superseded_concepts
+      replaces_val = find_value_for("REPLACES")
+      return nil if replaces_val.nil?
+      SupersessionParser.new(replaces_val).supersessions
+    end
+    private
+    def build_expression_designation(raw_term, attribute_data:, status:)
+      term = IEV::Converter.mathml_to_asciimath(
+        parse_anchor_tag(raw_term, term_domain),
+      )
+      term_attributes = TermAttrsParser.new(attribute_data.to_s)
+      statuses = {
+        "obsoleto" => "deprecated",
+        "напуштен" => "deprecated",
+      }
+      {
+        "type" => "expression",
+        "prefix" => term_attributes.prefix,
+        "normative_status" => statuses[status] || status,
+        "usage_info" => term_attributes.usage_info,
+        "designation" => term,
+        "part_of_speech" => term_attributes.part_of_speech,
+        "geographical_area" => term_attributes.geographical_area,
+        "gender" => term_attributes.gender,
+        "plurality" => term_attributes.plurality,
+      }.compact
+    end
+    def build_symbol_designation(raw_term)
+      term = IEV::Converter.mathml_to_asciimath(
+        parse_anchor_tag(raw_term, term_domain),
+      )
+      {
+        "type" => "symbol",
+        "designation" => term,
+        "international" => true,
+      }.compact
+    end
+  end
+end

data/lib/iev/utilities.rb ADDED Viewed

@@ -0,0 +1,58 @@
+# frozen_string_literal: true
+module IEV
+  module Utilities
+    SIMG_PATH_REGEX = "<simg .*\\/\\$file\\/([\\d\\-\\w\.]+)>"
+    FIGURE_ONE_REGEX =
+      "<p><b>\\s*Figure\\s+(\\d)\\s+[–-]\\s+(.+)\\s*<\\/b>(<\\/p>)?"
+    FIGURE_TWO_REGEX = "#{FIGURE_ONE_REGEX}\\s*#{FIGURE_ONE_REGEX}"
+    IMAGE_PATH_PREFIX = "image::/assets/images/parts"
+    def parse_anchor_tag(text, term_domain)
+      if text
+        # Convert IEV term references
+        # Convert href links
+        # Need to take care of this pattern:
+        #  `inverse de la <a href="IEV103-06-01">période<a>`
+        text.gsub(
+          /<a href="?(IEV)\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)<\/?a>/,
+          '{{\3, \1:\2}}',
+        ).gsub(
+          /<a href="?\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)<\/?a>/,
+          '{{\3, IEV:\2}}',
+        ).gsub(
+          # To handle <a> tags without ending tag like
+          #  `Voir <a href=IEV103-05-21>IEV 103-05-21`
+          #  for concept '702-03-11' in `fr`
+          /<a href="?(IEV)?\s*(\d\d\d-\d\d-\d\d\d?)"?>(.*?)$/,
+          '{{\3, IEV:\2}}',
+        ).gsub(
+          /<a href="?([^<>]*?)"?>(.*?)<\/a>/,
+          '\1[\2]',
+        ).gsub(
+          Regexp.new([SIMG_PATH_REGEX, "\\s*", FIGURE_TWO_REGEX].join),
+          "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3; \\6]",
+        ).gsub(
+          Regexp.new([SIMG_PATH_REGEX, "\\s*", FIGURE_ONE_REGEX].join),
+          "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[Figure \\2 - \\3]",
+        ).gsub(
+          /<img\s+([^<>]+?)\s*>/,
+          "#{IMAGE_PATH_PREFIX}/#{term_domain}/\\1[]",
+        ).gsub(
+          /<br>/,
+          "\n",
+        ).gsub(
+          /<b>(.*?)<\/b>/,
+          "*\\1*",
+        )
+      end
+    end
+    def replace_newlines(input)
+      input.gsub('\n', "\n\n")
+        .gsub(/<[pbr]+>/, "\n\n")
+        .gsub(/\s*\n[\n\s]+/, "\n\n")
+        .strip
+    end
+  end
+end

data/lib/iev/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
-module Iev
-  VERSION = "0.3.1".freeze
+module IEV
+  VERSION = "0.3.3".freeze
 end

data/lib/iev.rb CHANGED Viewed

@@ -3,7 +3,27 @@ require "iev/db"
 require "open-uri"
 require "nokogiri"
-module Iev
+require "benchmark"
+require "creek"
+require "unitsml"
+require "plurimath"
+require "glossarist"
+require "relaton"
+require "relaton_bib"
+require "sequel"
+require "thor"
+require "yaml"
+require "zeitwerk"
+loader = Zeitwerk::Loader.for_gem
+loader.inflector.inflect(
+  "cli" => "CLI",
+  "iev" => "IEV",
+  "ui" => "UI",
+)
+loader.setup
+module IEV
   #
   # Scrape Electropedia for term.
   #
@@ -23,6 +43,8 @@ module Iev
     a = doc&.at(xpath)&.children&.to_xml
     a&.sub(%r{<br/>.*$}, "")
       &.sub(%r{, &lt;.*$}, "")
-      &.gsub(%r{<[^>]*>}, "")&.strip
+      &.gsub(%r{<[^<>]*>}, "")&.strip
   end
 end
+require "iev/cli"