dphil 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +6 -0
- data/LICENSE +201 -0
- data/README.md +54 -0
- data/Rakefile +11 -0
- data/dphil.gemspec +49 -0
- data/exe/dphil +10 -0
- data/lib/dphil.rb +53 -0
- data/lib/dphil/cache.rb +15 -0
- data/lib/dphil/change_list.rb +6 -0
- data/lib/dphil/character.rb +236 -0
- data/lib/dphil/character_matrix.rb +102 -0
- data/lib/dphil/cli.rb +26 -0
- data/lib/dphil/cli_commands/csv2ld.rb +71 -0
- data/lib/dphil/cli_commands/csv2nex.rb +37 -0
- data/lib/dphil/constants.rb +128 -0
- data/lib/dphil/converter.rb +58 -0
- data/lib/dphil/converters/csv2nex.rb +83 -0
- data/lib/dphil/ld_data_set.rb +25 -0
- data/lib/dphil/ld_output.rb +29 -0
- data/lib/dphil/lemma.rb +44 -0
- data/lib/dphil/lemma_list.rb +179 -0
- data/lib/dphil/log_formatter.rb +39 -0
- data/lib/dphil/logger.rb +27 -0
- data/lib/dphil/metrical_data.rb +78 -0
- data/lib/dphil/newick.rb +52 -0
- data/lib/dphil/paup.rb +34 -0
- data/lib/dphil/refinements.rb +8 -0
- data/lib/dphil/refinements/natural_sort.rb +52 -0
- data/lib/dphil/script_string.rb +124 -0
- data/lib/dphil/syllables.rb +43 -0
- data/lib/dphil/syllables/syllable.rb +45 -0
- data/lib/dphil/tei_xml.rb +142 -0
- data/lib/dphil/transliterate.rb +131 -0
- data/lib/dphil/tree.rb +142 -0
- data/lib/dphil/tree_node.rb +67 -0
- data/lib/dphil/verse.rb +25 -0
- data/lib/dphil/verse_analysis.rb +509 -0
- data/lib/dphil/verse_analysis_new.rb +816 -0
- data/lib/dphil/version.rb +30 -0
- data/vendor/default_commands.paup +18 -0
- data/vendor/metrical_data.yml +4035 -0
- metadata +409 -0
data/lib/dphil/syllables/syllable.rb
ADDED
@@ -0,0 +1,45 @@
+# frozen_string_literal: true
+
+module Dphil
+  class Syllables
+    using ::Ragabash::Refinements
+    class Syllable
+      attr_reader :source, :weight, :parent, :index, :source_script
+
+      def initialize(source, weight, **opts)
+        @source = source.to_str.safe_copy.freeze
+        @weight = weight.to_str.safe_copy.freeze
+        @parent = opts[:parent]
+        @index = opts[:index]&.to_i
+        @source_script = opts[:source_script] || (@parent&.source_script)
+        @slp1 = @source_script == :slp1 ? @source : opts[:slp1]&.to_str&.safe_copy.freeze
+      end
+
+      def inspect
+        "[#{index}]#{source.inspect}(#{weight})"
+      end
+
+      def to_s
+        @source.dup
+      end
+
+      def prev
+        return unless @parent && @index && @index.positive?
+        @parent[@index - 1]
+      end
+
+      def next
+        return unless @parent && @index && @index < @parent.length
+        @parent[@index + 1]
+      end
+
+      def simple_weight
+        @simple_weight ||= weight.upcase.freeze
+      end
+
+      def slp1
+        @slp1 ||= Transliterate.t(@source, @source_script, :slp1).freeze
+      end
+    end
+  end
+end
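The `Syllable` class above is a small value object: it freezes its source text and weight, remembers its position in a parent `Syllables` list, and derives `simple_weight` and an SLP1 form lazily. As a rough usage sketch (not taken from the gem's tests or documentation; the sample string and weight value are invented), a syllable can also be constructed directly:

```ruby
require "dphil"

# Hypothetical standalone syllable; in normal use Dphil::Syllables builds
# these while segmenting a longer string and supplies parent/index itself.
syl = Dphil::Syllables::Syllable.new("ga", "l", index: 0, source_script: :slp1)

syl.to_s          # => "ga"
syl.weight        # => "l"  (whatever weight string was passed in)
syl.simple_weight # => "L"  (weight upcased and memoized)
syl.slp1          # => "ga" (source is already SLP1, so it is reused as-is)
syl.prev          # => nil  (no parent list, so there is no neighbour to return)
```

Without a `parent`, `prev` and `next` simply return `nil`; when the syllable belongs to a `Dphil::Syllables` collection they index into that collection instead.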
data/lib/dphil/tei_xml.rb
ADDED
@@ -0,0 +1,142 @@
+# frozen_string_literal: true
+
+module Dphil
+  class TeiXML
+    using ::Ragabash::Refinements
+    # Public: Initialize a TeiXML object
+    #
+    def initialize(source)
+      source = %(<TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0"></TEI>) if source.strip.empty?
+      @raw_xml = source
+    end
+
+    # Return or re-parse xml
+    def xml
+      @xml ||= begin
+        xml = Nokogiri::XML(@raw_xml) { |config| config.strict.noent }
+        xml.encoding = "UTF-8"
+        xml.remove_namespaces!
+        xml_normalize!(xml)
+      rescue Nokogiri::XML::SyntaxError => e
+        raise "TEIDocument (source: #{@raw_xml}) caught exception: #{e}"
+      end
+    end
+
+    def to_xml
+      xml.to_xml
+    end
+
+    alias to_s to_xml
+
+    def empty?
+      xml.xpath("//text()[normalize-space()]").empty?
+    end
+
+    # Public: Return a portion of the document as a new document
+    #
+    # expr - a CSS selector or XPath expression
+    #
+    # Returns a new document.
+    def crop(expr)
+      segment = xml.search(expr)
+      pb = page_of(segment)
+      lb = line_of(segment)
+
+      source = <<~EOS
+        <TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0">
+        <pre>#{pb&.to_xml}#{lb&.to_xml}</pre>
+        #{segment.to_xml}
+        <post></post>
+        </TEI>
+      EOS
+      self.class.new(source)
+    end
+
+    def crop_each(expr)
+      xml.search(expr).map do |segment|
+        pb = page_of(segment)
+        lb = line_of(segment)
+
+        source = <<~EOS
+          <TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0">
+          <pre>#{pb&.to_xml}#{lb&.to_xml}</pre>
+          #{segment.to_xml}
+          <post></post>
+          </TEI>
+        EOS
+        self.class.new(source)
+      end
+    end
+
+    # Public: Remove elements from the document based on CSS selector.
+    #
+    # expr - a CSS selector or XPath expression
+    #
+    # Returns a new document.
+    def reject(expr)
+      source = xml.dup
+      source.search(expr).each do |node|
+        node.replace(node.search("pb, lb"))
+      end
+      self.class.new(source.to_xml)
+    end
+
+    # Public: Substitute elements from the document based on CSS selector with
+    # ID-based token text-nodes.
+    #
+    # expr - a CSS selector or XPath expression
+    # subst_text - an optional text identifier
+    #
+    # Returns a new document.
+    def subst(expr, subst_text = nil)
+      source = parsed_xml.dup
+      subst_text = subst_text.to_s.gsub(/\s+/, "_") unless subst_text.nil?
+
+      source.search(expr).each do |node|
+        set = Nokogiri::XML::NodeSet.new(source)
+        escaped_text = ":#{node.attribute('id').to_s.gsub(/\s+/, '_')}"
+        text_content = "#{subst_text || node.name}#{escaped_text}"
+        set << Nokogiri::XML::Text.new(" {{#{text_content}}} ", source)
+        node.replace(set + node.search("pb, lb"))
+      end
+      self.class.new(source.to_xml)
+    end
+
+    private
+
+    # Get nearest prior <pb/> node.
+    #
+    # id - node in document to start search from.
+    #
+    # Returns an XML node.
+    def page_of(node)
+      node.xpath("preceding::*[name() = 'pb'][1]")
+    end
+
+    # Get nearest prior <lb/> node with everything in between.
+    #
+    # node - node in document to start search from.
+    #
+    # Returns an XML node.
+    def line_of(node)
+      node.xpath("preceding::*[name() = 'lb'][1]")
+    end
+
+    # Normalize (mostly) whitespace in the XML.
+    def xml_normalize!(doc)
+      doc.search("//text()").each do |text_node|
+        text_node.content = text_node.content.gsub(%r{\s+[\s\.\-\\\/\_]*}, " ")
+      end
+
+      # Remove empty modification tags.
+      doc.search(
+        "//add[not(node())]|" \
+        "//del[not(node())]|" \
+        "//mod[not(node())]|" \
+        "//unclear[not(node())]|" \
+        "//g[not(node())]"
+      ).remove
+      doc
+    end
+  end
+end
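The `TeiXML` wrapper parses its source with Nokogiri, strips namespaces, and normalizes whitespace up front; `crop` and `crop_each` carve out matched elements while carrying along the nearest preceding `<pb/>` and `<lb/>` milestones, and `reject` removes matched elements but keeps any page or line breaks inside them. A hedged sketch of typical calls (the TEI fragment below is invented for the example):

```ruby
require "dphil"

doc = Dphil::TeiXML.new(<<~XML)
  <TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0">
    <pb n="1r"/><lb n="1"/>
    <lg xml:id="v1"><l>dharmakṣetre kurukṣetre</l></lg>
  </TEI>
XML

doc.empty?                 # => false
verse = doc.crop("lg")     # new TeiXML wrapping the <lg>, prefixed by the prior <pb/>/<lb/>
no_lines = doc.reject("l") # new TeiXML with <l> elements dropped, their <pb>/<lb> (if any) kept
```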
data/lib/dphil/transliterate.rb
ADDED
@@ -0,0 +1,131 @@
+# frozen_string_literal: true
+
+require "sanscript"
+
+module Dphil
+  # Transliteration module for basic romanization formats.
+  module Transliterate
+    using ::Ragabash::Refinements
+
+    @default_script = nil
+
+    module_function
+
+    def default_script
+      @default_script
+    end
+
+    def default_script=(scr)
+      scr = scr.to_sym
+      if script_supported?(scr)
+        @default_script = scr
+      else
+        warn "Script unsupported [:#{scr}]"
+      end
+    end
+
+    def transliterate(str, first, second = nil)
+      Sanscript.transliterate(str, first, second, default_script: default_script)
+    rescue RuntimeError => e
+      Dphil.logger.error "Transliteration Error: #{e}"
+      return str
+    end
+
+    def script_supported?(script)
+      Sanscript::Transliterate.scheme_names.include?(script)
+    end
+
+    def to_ascii(str)
+      process_string(str) do |out|
+        out.unicode_normalize!(:nfd)
+        out.gsub!(/[^\u0000-\u007F]+/, "")
+        out
+      end
+    end
+
+    def iast_kh(str)
+      transliterate(str, :iast, :kh)
+    end
+
+    def kh_iast(str)
+      transliterate(str, :kh, :iast)
+    end
+
+    def iast_slp1(str)
+      transliterate(str, :iast, :slp1)
+    end
+
+    def slp1_iast(str)
+      transliterate(str, :slp1, :iast)
+    end
+
+    def detect(str)
+      Sanscript::Detect.detect_scheme(str)
+    end
+
+    def normalize_slp1(st)
+      out = st.dup
+      out.gsub!(Constants::TRANS_CTRL_WORD) do |match|
+        control_content = match[Constants::TRANS_CTRL_WORD_CONTENT, 1]
+        next match if control_content&.match(Constants::TRANS_CTRL_WORD_PROCESSED)
+        "{###{Digest::SHA1.hexdigest(control_content).rjust(40, '0')}##}"
+      end
+
+      process_string!(out) do |token|
+        token.tr!("b", "v")
+        token.gsub!(/['‘]\b/, "") # Avagraha
+        token.gsub!(/\B[NYRnm]/, "M") # Medial and final nasals
+        token.gsub!(/\B[Hrs]\b/, "") # Final visarga/r/s
+        token.gsub!(%r{[\.\-\_\\\/]}, "") # Punctuation
+        token
+      end
+    end
+
+    def normalize_iast(word)
+      out = iast_slp1(word)
+      normalize_slp1(out)
+    end
+
+    def unicode_downcase!(str, ignore_control = false)
+      return UNICODE_DOWNCASE_PROC.call(str) if ignore_control
+      process_string!(str, &UNICODE_DOWNCASE_PROC)
+    end
+
+    def unicode_downcase(st, ignore_control = false)
+      unicode_downcase!(st.dup, ignore_control)
+    end
+
+    UNICODE_DOWNCASE_PROC = lambda do |str|
+      str.unicode_normalize!(:nfd)
+      str.downcase!
+      str.unicode_normalize!(:nfc)
+      str
+    end
+
+    private_constant :UNICODE_DOWNCASE_PROC
+
+    class << self
+      alias t transliterate
+
+      private
+
+      def process_string!(str, ignore_control = false, &_block)
+        str = str.to_str
+        return yield str if ignore_control
+
+        scan = str.scan(Constants::TRANS_CTRL_WORD)
+        return yield str if scan.empty?
+        return str if scan.first == str
+
+        str.gsub!(Constants::TRANS_CTRL_WORD, "\u0026\u0026")
+        str = yield str
+        str.gsub!("\u0026\u0026") { scan.shift }
+        str
+      end
+
+      def process_string(str, ignore_control = false, &block)
+        process_string!(str.dup, ignore_control, &block)
+      end
+    end
+  end
+end
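`Transliterate` is a thin, module-function wrapper around the `sanscript` gem plus the SLP1 normalization used elsewhere in dphil. A short illustrative sketch (the sample strings and the expected return values are mine, not from the gem's documentation):

```ruby
require "dphil"

Dphil::Transliterate.default_script = :iast

Dphil::Transliterate.iast_slp1("dharmakṣetre") # => "Darmakzetre"
Dphil::Transliterate.slp1_iast("DarmakzetrE")  # => "dharmakṣetrai"
Dphil::Transliterate.to_ascii("kurukṣetre")    # => "kuruksetre" (diacritics dropped after NFD)
Dphil::Transliterate.detect("kurukṣetre")      # likely :iast, via Sanscript::Detect
Dphil::Transliterate.t("kṣetre", :iast, :slp1) # `t` is the class-level alias for #transliterate
```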
data/lib/dphil/tree.rb
ADDED
@@ -0,0 +1,142 @@
+# frozen_string_literal: true
+
+module Dphil
+  #
+  # Phylogenetic Tree generated from parsing PAUP output.
+  #
+  # Immutable.
+  #
+  class Tree
+    include LDOutput
+    attr_reader :id, :nodes, :stats, :tree
+
+    def initialize(id = nil, lengths = nil, stats = nil, **opts)
+      @id = (opts[:id] || id).to_i
+      if lengths.respond_to?(:to_str) && stats.respond_to?(:to_str)
+        @nodes = nodes_from_lengths(parse_paup_lengths(lengths))
+        @stats = parse_paup_stats(stats)
+      elsif (opts.keys & %i[nodes stats]).length == 2
+        @nodes = parse_json_nodes(opts[:nodes])
+        @stats = parse_json_stats(opts[:stats])
+      end
+      @tree = tree_from_nodes(nodes)
+      IceNine.deep_freeze(self)
+    end
+
+    def to_h
+      {
+        id: id,
+        root_id: tree.id,
+        nodes: nodes,
+        stats: stats,
+      }
+    end
+
+    def as_json(options = nil)
+      to_h.as_json(options)
+    end
+
+    def root
+      nodes[tree.id]
+    end
+
+    def get_node(id)
+      nodes[id]
+    end
+
+    def get_parent(node)
+      nodes[node.parent]
+    end
+
+    def get_children(node)
+      node.children&.map { |id| nodes[id] }
+    end
+
+    def tree_length
+      stats[:length]
+    end
+
+    def ci
+      stats[:ci]
+    end
+
+    private
+
+    PAUP_TREE_STATS = {
+      "Tree length" => :length,
+      "Consistency index (CI)" => :ci,
+      "Homoplasy index (HI)" => :hi,
+      "CI excluding uninformative characters" => :ci_ex,
+      "HI excluding uninformative characters" => :hi_ex,
+      "Retention index (RI)" => :ri,
+      "Rescaled consistency index (RC)" => :rc,
+    }.freeze
+
+    private_constant :PAUP_TREE_STATS
+
+    def parse_paup_lengths(lengths)
+      lengths.to_s&.split("\n")&.map { |l| l.strip.split(/\s{3,}/) }
+    end
+
+    def parse_paup_stats(stats)
+      stats.to_s&.split("\n")&.each_with_object({}) do |l, acc|
+        key, val = l.split(" = ")
+        acc[PAUP_TREE_STATS[key]] = (val["."] ? val.to_f : val.to_i)
+      end
+    end
+
+    def parse_json_nodes(json_nodes)
+      json_nodes.each_with_object({}) do |(id, node), acc|
+        acc[id.to_s.to_i] = TreeNode.new(node)
+      end
+    end
+
+    def parse_json_stats(json_stats)
+      missing_keys = (PAUP_TREE_STATS.values - json_stats.keys)
+      raise ArgumentError, "Missing `stats` keys: #{missing_keys}" unless missing_keys.empty?
+      json_stats.each_with_object({}) do |(k, v), acc|
+        raise ArgumentError, "Stat `#{k}` is not a Numeric" unless v.is_a?(Numeric) || v.nil?
+        acc[k] = v
+      end
+    end
+
+    def nodes_from_lengths(lengths)
+      lengths.each_with_object({}) do |arr, hash|
+        name, id = arr[0].match(/^(.*?)\s?\(?([0-9]{1,4})\)?$/).captures
+        id = id.to_i
+        parent = arr[1].to_i
+        node = TreeNode.new(
+          id: id,
+          name: (name.present? ? name : "##{id}"),
+          length: arr[2].to_i,
+          parent: parent
+        )
+        hash[id] ||= TreeNode.new
+        hash[id].merge!(node)
+
+        next if parent.zero?
+        hash[parent] ||= TreeNode.new(
+          id: parent,
+          name: (parent.to_i.zero? ? parent : "##{parent}"),
+          length: 0,
+          parent: 0
+        )
+        hash[parent].children ||= []
+        hash[parent].children << id
+      end
+    end
+
+    def tree_from_nodes(nodes)
+      root = nodes.select { |_, node| node.parent.zero? }&.first&.last
+      return {} if root.blank?
+      append_children(nodes, root)
+    end
+
+    def append_children(nodes, node)
+      new_node = TreeNode.new(node.to_h)
+      return new_node unless new_node.children.present?
+      new_node.children = new_node.children.map { |id| append_children(nodes, nodes[id]) }
+      new_node
+    end
+  end
+end
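`Tree` can be built either from PAUP output (two text blocks) or from already-parsed `nodes`/`stats` options. A hypothetical sketch of the PAUP path follows: the branch-length table uses columns separated by three or more spaces, as `parse_paup_lengths` expects, and the score block uses the `"key = value"` lines mapped by `PAUP_TREE_STATS`. All taxa and numbers are invented, and the child bookkeeping assumes `TreeNode#merge!` keeps previously assigned children, as its use in `nodes_from_lengths` implies.

```ruby
require "dphil"

# Invented stand-in for a PAUP branch-length table: "name (id)   parent   length".
lengths = <<~LENGTHS
  MS_A (1)      4      12
  MS_B (2)      4      9
  Node_4 (4)    0      3
LENGTHS

# Invented stand-in for the PAUP tree-score block.
stats = <<~STATS
  Tree length = 24
  Consistency index (CI) = 0.8750
  Homoplasy index (HI) = 0.1250
  CI excluding uninformative characters = 0.8000
  HI excluding uninformative characters = 0.2000
  Retention index (RI) = 0.9000
  Rescaled consistency index (RC) = 0.7875
STATS

tree = Dphil::Tree.new(1, lengths, stats)

tree.tree_length                        # => 24
tree.ci                                 # => 0.875
tree.root.id                            # => 4, the only node whose parent is 0
tree.get_children(tree.root)&.map(&:id) # expected: [1, 2]
```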