dphil 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +6 -0
- data/LICENSE +201 -0
- data/README.md +54 -0
- data/Rakefile +11 -0
- data/dphil.gemspec +49 -0
- data/exe/dphil +10 -0
- data/lib/dphil.rb +53 -0
- data/lib/dphil/cache.rb +15 -0
- data/lib/dphil/change_list.rb +6 -0
- data/lib/dphil/character.rb +236 -0
- data/lib/dphil/character_matrix.rb +102 -0
- data/lib/dphil/cli.rb +26 -0
- data/lib/dphil/cli_commands/csv2ld.rb +71 -0
- data/lib/dphil/cli_commands/csv2nex.rb +37 -0
- data/lib/dphil/constants.rb +128 -0
- data/lib/dphil/converter.rb +58 -0
- data/lib/dphil/converters/csv2nex.rb +83 -0
- data/lib/dphil/ld_data_set.rb +25 -0
- data/lib/dphil/ld_output.rb +29 -0
- data/lib/dphil/lemma.rb +44 -0
- data/lib/dphil/lemma_list.rb +179 -0
- data/lib/dphil/log_formatter.rb +39 -0
- data/lib/dphil/logger.rb +27 -0
- data/lib/dphil/metrical_data.rb +78 -0
- data/lib/dphil/newick.rb +52 -0
- data/lib/dphil/paup.rb +34 -0
- data/lib/dphil/refinements.rb +8 -0
- data/lib/dphil/refinements/natural_sort.rb +52 -0
- data/lib/dphil/script_string.rb +124 -0
- data/lib/dphil/syllables.rb +43 -0
- data/lib/dphil/syllables/syllable.rb +45 -0
- data/lib/dphil/tei_xml.rb +142 -0
- data/lib/dphil/transliterate.rb +131 -0
- data/lib/dphil/tree.rb +142 -0
- data/lib/dphil/tree_node.rb +67 -0
- data/lib/dphil/verse.rb +25 -0
- data/lib/dphil/verse_analysis.rb +509 -0
- data/lib/dphil/verse_analysis_new.rb +816 -0
- data/lib/dphil/version.rb +30 -0
- data/vendor/default_commands.paup +18 -0
- data/vendor/metrical_data.yml +4035 -0
- metadata +409 -0
# frozen_string_literal: true

# CLI command definition for `dphil csv2nex`.
# NOTE(review): the desc/long_desc/arg/command/switch/flag vocabulary looks
# like the GLI command-suite DSL — confirm against the gemspec dependencies.
Dphil::CLI.module_eval do
  desc "Convert a CSV-format collation file into a NEXUS file"
  long_desc <<~EOS
    Convert a CSV-format collation file into a NEXUS file for use with PAUP.
    This expects each column of the CSV to represent data for a single taxon,
    and the first row to contain the names of the taxa.
  EOS

  arg :csv_file

  command :csv2nex do |c|
    c.desc "Transpose rows/columns in CSV"
    c.switch :t, :transpose, negatable: false

    c.desc "Include custom PAUP commands from a file in PAUP block of NEXUS output"
    c.flag :d, :paup_data, arg_name: "file"

    c.desc "Write NEXUS output to file instead of STDOUT"
    c.flag :o, :outfile, arg_name: "file"

    # args[0] is the csv_file path; copts are this command's parsed options.
    c.action do |_, copts, args|
      nexus_output = Dphil::Csv2NexConverter.new(args[0], copts).convert

      if copts[:outfile].nil?
        # No --outfile given: print the NEXUS document to STDOUT.
        puts nexus_output
      else
        abs_outfile = Pathname.new(copts[:outfile]).expand_path
        rel_outfile = abs_outfile.relative_path_from(Pathname.getwd)
        # File.write returns the number of bytes written, which is
        # interpolated into the status message.
        puts "#{File.write(abs_outfile, nexus_output)} bytes written to #{rel_outfile}"
        puts "You can process this file using PAUP with the command\n" \
             "`paup4 [options] #{rel_outfile}`"
      end
    end
  end
end
# frozen_string_literal: true

require "set"

module Dphil
  # Gem-wide constants: a DEBUG flag, regular expressions for SLP1-encoded
  # Sanskrit syllables, transliteration control-word patterns, and the
  # JSON-LD contexts/types used by Dphil::LDOutput.
  module Constants
    using ::Ragabash::Refinements

    # True when running under a Rails environment whose name starts with
    # "dev", or when ENV["RUBY_ENV"] starts with "dev"; false otherwise.
    DEBUG = if defined?(::Rails) && ::Rails.env[/^dev/]
              true
            elsif !ENV["RUBY_ENV"].nil? && ENV["RUBY_ENV"][/^dev/]
              true
            else
              false
            end

    # Regular expressions for SLP1 syllables
    # (the begin/end wrapper is purely visual grouping; the locals below are
    # ordinary method-less module-body locals used to build the regexes)
    begin
      vow = "aAiIuUfFxXeEoO"                    # SLP1 vowels
      con = "kKgGNcCjJYwWqQRtTdDnpPbBmyrlvzSsh" # SLP1 consonants
      add = "MH"                                # SLP1 anusvāra/visarga marks

      # One SLP1 syllable: optional avagraha, leading consonants, a vowel,
      # then trailing consonants/marks not followed by another vowel.
      R_SYL = /[']?[#{con}]*[\s]*[#{vow}][#{con}#{add}]*(?![#{vow}])\s*/
      # "Heavy" syllable markers: long vowels/diphthongs, or final M/H.
      R_GSYL = /[AIUFXeEoO]|[MH]$/
      # Consonant cluster at end of string.
      R_CCONF = /[#{con}]{2}$/
      # Consonant cluster anywhere.
      R_CCON = /[#{con}]{2}/
    end

    # Control words embedded in text as {#...#}; the PROCESSED form is a
    # 40-hex-digit digest placeholder (#<sha1>#).
    TRANS_CTRL_WORD = /\{#.*?#\}/
    TRANS_CTRL_WORD_CONTENT = /\{#(.*?)#\}/
    TRANS_CTRL_WORD_PROCESSED = /#[a-f0-9]{40}#/

    # Linked Data types and contexts
    begin
      # Shared vocabulary prefixes used by all the contexts below.
      ld_context_global = {
        "@version" => 1.1,
        "oa" => "http://www.w3.org/ns/oa#",
        "dc" => "http://purl.org/dc/elements/1.1/",
        "xsd" => "http://www.w3.org/2001/XMLSchema#",
        "ubcs" => "http://ld.ubcsanskrit.ca/api#",
        "id" => { "@id" => "dc:identifier" },
      }

      # Context for a single phylogenetic character; both camelCase and
      # snake_case keys are mapped to the same IRIs.
      ld_context_character = {
        "states" => { "@id" => "ubcs:charStateBySymbol", "@container" => "@index" },
        "symbols" => { "@id" => "ubcs:charSymbolByState", "@container" => "@index" },
        "stateTotals" => { "@id" => "ubcs:charStateTotalsByState", "@container" => "@index" },
        "state_totals" => { "@id" => "ubcs:charStateTotalsByState", "@container" => "@index" },
        "taxaStates" => { "@id" => "ubcs:charStateByTaxon", "@container" => "@index" },
        "taxa_states" => { "@id" => "ubcs:charStateByTaxon", "@container" => "@index" },
        "statesTaxa" => { "@id" => "ubcs:taxonByCharState", "@container" => "@index" },
        "states_taxa" => { "@id" => "ubcs:taxonByCharState", "@container" => "@index" },
        "isInformative" => { "@id" => "ubcs:charStateIsInformative" },
        "is_informative" => { "@id" => "ubcs:charStateIsInformative" },
        "isConstant" => { "@id" => "ubcs:charStateIsConstant" },
        "is_constant" => { "@id" => "ubcs:charStateIsConstant" },
      }

      # Context for a character matrix (taxa names + characters).
      ld_context_matrix = {
        "taxaNames" => { "@id" => "dc:identifier", "@container" => "@index" },
        "taxa_names" => { "@id" => "dc:identifier", "@container" => "@index" },
        "characters" => {
          "@id" => "ubcs:phyloCharacter",
          "@container" => "@index",
          "@context" => ld_context_character,
        },
      }

      # Context for a single node of a phylogenetic tree.
      ld_context_tree_node = {
        "name" => { "@id" => "ubcs:treeNodeName" },
        "length" => { "@id" => "ubcs:branchLength" },
        "parent" => { "@id" => "ubcs:treeNodeParent" },
        "children" => { "@id" => "ubcs:treeNodeChildren" },
      }

      # Context for a whole tree (root id, nodes, consistency statistics).
      ld_context_tree = {
        "rootId" => { "@id" => "ubcs:treeRootId" },
        "root_id" => { "@id" => "ubcs:treeRootId" },
        "nodes" => {
          "@id" => "ubcs:treeNode",
          "@container" => "@index",
          "@context" => ld_context_tree_node,
        },
        "stats" => {
          "@id" => "ubcs:treeStats",
          "@context" => {
            "ci" => { "@id" => "ubcs:treeCI" },
            "ciEx" => { "@id" => "ubcs:treeCIEx" },
            "ci_ex" => { "@id" => "ubcs:treeCIEx" },
            "hi" => { "@id" => "ubcs:treeHI" },
            "hiEx" => { "@id" => "ubcs:treeHIEx" },
            "hi_ex" => { "@id" => "ubcs:treeHIEx" },
            # NOTE(review): "treeLengh" looks like a typo for "treeLength",
            # but it is a published vocabulary IRI — confirm with the
            # ld.ubcsanskrit.ca vocabulary before changing it.
            "length" => { "@id" => "ubcs:treeLengh" },
            "rc" => { "@id" => "ubcs:treeRC" },
            "ri" => { "@id" => "ubcs:treeRI" },
          },
        },
      }

      # Context for a full data set (one matrix plus indexed trees).
      ld_context_dataset = {
        "matrix" => {
          "@id" => "ubcs:characterMatrix",
          "@context" => ld_context_matrix,
        },
        "trees" => {
          "@id" => "ubcs:tree",
          "@container" => "@index",
          "@context" => ld_context_tree,
        },
      }

      # @type IRI for each serializable Dphil class, keyed by class name.
      LD_TYPES = {
        "Dphil::Character" => "ubcs:phyloCharacter",
        "Dphil::CharacterMatrix" => "ubcs:characterMatrix",
        "Dphil::TreeNode" => "ubcs:treeNode",
        "Dphil::Tree" => "ubcs:tree",
        "Dphil::LDDataSet" => "ubcs:dataSet",
      }.deep_freeze

      # Full @context for each serializable Dphil class (global prefixes
      # merged with the class-specific context), keyed by class name.
      LD_CONTEXTS = {
        "Dphil::Character" => ld_context_global.merge(ld_context_character),
        "Dphil::CharacterMatrix" => ld_context_global.merge(ld_context_matrix),
        "Dphil::TreeNode" => ld_context_global.merge(ld_context_tree_node),
        "Dphil::Tree" => ld_context_global.merge(ld_context_tree),
        "Dphil::LDDataSet" => ld_context_global.merge(ld_context_dataset),
      }.deep_freeze
    end
  end
end
# frozen_string_literal: true

module Dphil
  #
  # Base module for file converters (CSV, NEXUS, CollateX, etc.)
  #
  # Provides private helpers for loading input files and for building the
  # weighted token tables used when emitting NEXUS character matrices.
  module Converter
    private

    # Load a file.
    #
    # @param infile [String] path to the file
    # @return [String] the full file contents
    # @raise [IOError] if the file does not exist
    def load_file(infile)
      raise IOError, "File #{infile} not found." unless File.exist?(infile)
      File.read(infile)
    end

    # Load a CSV file.
    #
    # @param infile [String] path to the CSV file
    # @param mode [String] IO mode/encoding string (e.g. "r:bom|utf-8")
    # @return [Array<Array>] parsed CSV rows
    # @raise [IOError] if the file does not exist
    #
    # FIX: previously called CSV.read(infile, mode). On the csv gem >= 3.0
    # CSV.read accepts only keyword options, so passing a mode *string*
    # positionally raises ArgumentError. CSV.open still takes a positional
    # mode argument on every csv version, so open + read is used instead.
    def load_csv(infile, mode = "r")
      raise IOError, "File #{infile} not found." unless File.exist?(infile)
      CSV.open(infile, mode, &:read)
    end

    # Return a hash of array sorted/weighted by number of identical entries.
    #
    # @param array [Array] values to count
    # @return [Hash] value => count, ordered by descending count and then by
    #   first-occurrence order for ties
    def weighted_uniq(array)
      weighted_hash = array.each_with_object({}) do |v, acc|
        acc[v] ||= 0
        acc[v] += 1
      end
      # n preserves hash insertion order (first occurrence) as a tiebreaker
      # when two values have the same count.
      n = 0
      (weighted_hash.sort_by do |x|
        n += 1
        [-x[1], n]
      end).to_h
    end

    # Sanitize a character string to basic KH/ASCII.
    #
    # Transliterates to Kyoto-Harvard (detecting the source scheme, falling
    # back to IAST), then replaces whitespace with "_" and apostrophes with
    # backticks.
    #
    # @param str [#to_s] raw character-state text
    # @return [String] sanitized text
    def sanitize_char(str)
      str = str.to_s
      src = Sanscript.detect(str) || :iast
      str = Sanscript.transliterate(str, src, :kh)
      str.gsub!(/\s/, "_")
      str.tr!("'", "`")
      str.strip!
      str
    end

    # Tokenize the values of a character.
    #
    # @param characters [Array] raw character states for one CSV row
    # @return [Hash] sanitized state => [single-letter NEXUS symbol, count],
    #   with symbols assigned in descending frequency order
    def tokenize(characters)
      char_set = weighted_uniq(characters.map { |c| sanitize_char(c) }.reject(&:empty?))
      char_set.each_with_object({}).with_index do |(char, acc), i|
        acc[char[0]] = [ALPHABET[i], char[1]]
      end
    end

    # NEX Token Alphabet (A-Z then a-z; max 52 distinct states per character).
    ALPHABET = IceNine.deep_freeze(("A".."Z").to_a + ("a".."z").to_a)
    private_constant :ALPHABET
  end
end
# frozen_string_literal: true

module Dphil
  #
  # CSV to NEXUS file converter class
  #
  # Reads a collation CSV (one column per taxon, first row = taxon names)
  # and emits a PAUP-ready NEXUS document via #convert.
  class Csv2NexConverter
    include Dphil::Converter

    # @param csv_file [String] path to the collation CSV
    # @param opts [Hash] :transpose (swap rows/columns), :paup_data (path to
    #   a custom PAUP command file; defaults to the bundled vendor file)
    def initialize(csv_file, opts = {})
      opts = opts.to_h

      # Load csv file
      @csv = load_csv(csv_file, "r:bom|utf-8")
      @csv = @csv.transpose if opts[:transpose]

      # Load paup file
      if opts[:paup_data].nil?
        opts[:paup_data] = File.join(GEM_ROOT, "vendor", "default_commands.paup")
      end
      @paup = load_file(opts[:paup_data])
      # Ensure a trailing newline, then indent the whole block to sit inside
      # the "BEGIN PAUP;" section of the heredoc below.
      # NOTE(review): blank?/indent! are not core String methods — presumably
      # supplied by a gem-wide refinement/patch; confirm where they come from.
      @paup << "\n" unless @paup.blank? || @paup[-1] == "\n"
      @paup.indent!(2)
      @paup.freeze
    end

    # Perform the conversion and return a string result
    #
    # @return [String] a complete NEXUS document (TAXA, CHARACTERS,
    #   ASSUMPTIONS, and PAUP blocks)
    def convert
      # Setup taxa information and orientation
      taxa_count = @csv.first.count
      character_count = @csv.count - 1
      # Taxon names reduced to identifier-safe characters.
      taxa_labels = @csv.first.map { |name| name.to_s.strip.scrub.gsub(/[^A-Za-z0-9]/, "_") }

      # Generate labels and matrix
      character_labels = []
      character_matrix = taxa_labels.map { |t| [t] }
      (1..character_count).each do |r|
        row = @csv[r]
        # token_hash: sanitized state => [symbol, count] (see Converter#tokenize)
        token_hash = tokenize(row)
        character_label = (token_hash.map do |k, _|
          "'#{sanitize_char(k)}'"
        end).join(" ")
        character_labels << %(#{r} /#{character_label})
        # "-" marks a gap for states that sanitized to empty.
        row.each_with_index do |charstate, i|
          token = token_hash[sanitize_char(charstate)]
          character_matrix[i] << (token.nil? ? "-" : token[0])
        end
      end
      # Each matrix row becomes "taxon_name SYMBOLS..."
      character_matrix.map! do |arr|
        "#{arr.shift} #{arr.join('')}"
      end

      # Return NEXUS output
      <<~NEXUS_EOF
        #NEXUS

        BEGIN TAXA;
          TITLE Manuscripts;
          DIMENSIONS NTAX=#{taxa_count};
          TAXLABELS #{taxa_labels.join(' ')};
        END;

        BEGIN CHARACTERS;
          TITLE Variant_Matrix;
          DIMENSIONS NCHAR=#{character_count};
          FORMAT DATATYPE = STANDARD RESPECTCASE GAP = - MISSING = ? SYMBOLS = "#{ALPHABET.join(' ')}";
          CHARSTATELABELS #{character_labels.join(', ')};
          MATRIX
          #{character_matrix.join("\n  ")}
          ;
        END;

        BEGIN ASSUMPTIONS;
          OPTIONS DEFTYPE = UNORD;
        END;

        BEGIN PAUP;
        #{@paup}END;
      NEXUS_EOF
    end
  end
end
# frozen_string_literal: true

module Dphil
  # A simple aggregate of one character matrix plus its associated trees,
  # serializable as Linked Data via the LDOutput mixin.
  class LDDataSet
    include Dphil::LDOutput

    attr_reader :matrix, :trees

    # @param matrix [Object] the character matrix
    # @param trees [Object] the trees belonging to the matrix
    def initialize(matrix:, trees:)
      @matrix = matrix
      @trees = trees
    end

    # Plain-Hash view of the data set.
    def to_h
      { matrix: @matrix, trees: @trees }
    end

    # JSON-ready representation; delegates to the Hash's #as_json.
    def as_json(options = nil)
      to_h.as_json(options)
    end
  end
end
# frozen_string_literal: true

module Dphil
  #
  # Mixin module for Linked Data output
  #
  # Requires that a class implements +#as_json+
  #
  module LDOutput
    using Dphil::Refinements::NaturalSort

    # Outputs a Linked Data Hash
    #
    # @param options [Hash] :context and :ld_type override the defaults
    #   looked up by class name in Constants; :compact => false returns the
    #   expanded JSON-LD form; remaining options are forwarded to #as_json.
    # @return [Hash, Array] compacted JSON-LD hash (or expanded form)
    def as_jsonld(**options)
      # :context/:ld_type are consumed here (deleted) so #as_json only sees
      # the remaining options.
      ld = {
        "@context" => options.delete(:context) || Constants::LD_CONTEXTS[self.class.name],
        "@type" => options.delete(:ld_type) || Constants::LD_TYPES[self.class.name],
      }.merge!(as_json(options))

      ld_expanded = JSON::LD::API.expand(ld)
      return ld_expanded if options[:compact] == false

      ld_compact = JSON::LD::API.compact(ld_expanded, ld["@context"])
      # Keep "@context" as the first key, then natural-sort the rest.
      { "@context" => ld_compact.delete("@context") }.merge!(ld_compact.natural_sort_keys)
    end

    # Linked Data serialized to a JSON string.
    #
    # FIX: previously called as_jsonld(options), passing the options Hash
    # positionally to a **options method — that relies on implicit
    # hash-to-keyword conversion and raises ArgumentError under Ruby 3's
    # keyword-argument separation. Double-splat forwards them correctly.
    def to_jsonld(**options)
      as_jsonld(**options).to_json(options)
    end
  end
end
# frozen_string_literal: true

module Dphil
  # Public: A storage object for words and groups of words from TEI XML data.
  # Also contains information about the source/location of the words.
  # Immutable.
  class Lemma
    using ::Ragabash::Refinements

    # Public: Raw XML source, extracted text, and page/facsimile/line/index
    # location data for the lemma.
    attr_reader :source, :text, :page, :facs, :line, :index

    # Public: Initialize a lemma object.
    #
    # source - XML data to initialize the lemma from
    # index  - ordinal position of the lemma within its list
    #
    # Aborts the process (after printing to STDERR) when the XML fragment
    # fails strict parsing.
    def initialize(source = "", index = nil)
      @source = source.strip
      @index = index

      doc = Nokogiri::XML("<lemma>#{source}</lemma>") { |config| config.strict.noent }
      doc.encoding = "UTF-8"

      # Strip hyphenation artifacts from the visible text.
      @text = doc.text.strip.gsub(/\-+\s*\-*/, "")

      page_breaks = doc.css("pb")
      @page = page_breaks.map { |el| el.attr("n") }.join(",")
      @facs = page_breaks.map { |el| el.attr("facs") }.join(",")
      @line = doc.css("lb").map { |el| el.attr("n") }.join(",")
    rescue Nokogiri::XML::SyntaxError => e
      $stderr.puts "Error in Lemma.new(`#{source}`, ...): #{e}"
      abort
    end

    # Human-readable "(index|page:line) text" form.
    def to_s
      "(#{index}|#{page}:#{line}) #{text}"
    end

    # Symbol form of the string representation.
    def to_sym
      "<Lemma>#{self}".to_sym
    end

    # Lemmata are equal when their raw sources are equal.
    def ==(other)
      other.is_a?(Dphil::Lemma) && source == other.source
    end
  end
end
# frozen_string_literal: true

require "nokogiri"

module Dphil
  # An object containing a list of lemmata generated through SAX parsing of an
  # XML document.
  # Immutable.
  class LemmaList < ::Nokogiri::XML::SAX::Document
    using ::Ragabash::Refinements
    include Enumerable

    attr_reader :name

    # Parses +source+ (TEI-style XML) immediately; the SAX callbacks below
    # accumulate Lemma objects into @members. All parser state ivars are
    # removed again in #end_document.
    def initialize(source)
      @members = []
      source = source.to_s.strip
      return if source.empty?
      # Structural tags that never open a lemma span.
      @lemma_ignore_start_tags = Set.new(%w[TEI text body pre post div])
      @lemma_ignore_end_tags = @lemma_ignore_start_tags + Set.new(%w[pb lb])
      @index = 0
      @open_elements = []
      @current_pb = []
      @current_lb = []
      @current_chars = ""
      @current_lemma = []
      @inside_hyphen = false
      @empty_element = true

      @parser = Nokogiri::XML::SAX::Parser.new(self)
      @parser.parse(source)
    end

    # Enumerable support: yields each Lemma.
    def each(&block)
      @members.each(&block)
    end

    # Returns all members, or the first +limit+ when a numeric limit is given.
    def members(limit = nil)
      return @members[0, limit] if limit.is_a? Numeric
      @members
    end

    def [](*args)
      @members[*args]
    end

    # 1-based member access (indexes below 1 are clamped with a warning).
    def get(index)
      raise "Non-numeric index passed to Lemma.get" unless index.is_a? Numeric
      if index < 1
        warn "Minimum index of Lemma.get() is 1"
        index = 1
      end
      @members[index - 1]
    end

    def size
      @members.size
    end

    # One lemma text per line.
    def to_s
      @members.map(&:text).join("\n")
    end

    # CollateX-style token hashes (t/n/i/p/f/l) for every lemma.
    def cx_tokens
      @members.map do |lemma|
        out = {
          t: lemma.text,
          n: Transliterate.normalize_iast(lemma.text),
          i: lemma.index,
          p: lemma.page,
          f: lemma.facs,
          l: lemma.line,
        }
        warn "Token empty: #{out}" if out[:t].empty?
        out
      end
    end

    private

    # SAX: element open. pb/lb become self-closing markers tracked per-lemma;
    # all other (non-ignored) elements are pushed onto @open_elements so
    # unterminated tags can be closed in #finalize.
    def start_element(name, attrs = [])
      return if @lemma_ignore_start_tags.include?(name)

      if %w[pb lb].include?(name)
        el = gen_xmlel(name, attrs, true)
        if @current_lemma.empty?
          instance_variable_set("@current_#{name}", [el])
        else
          instance_variable_get("@current_#{name}") << el
        end
      else
        el = gen_xmlel(name, attrs)
        @open_elements << gen_xmlel(name, attrs)
      end

      @empty_element = true
      @current_lemma << el unless el.empty?
    end

    # SAX: element close. An element with no intervening characters/children
    # is rewritten in place as self-closing.
    def end_element(name)
      return if @lemma_ignore_end_tags.include?(name)

      if @empty_element
        @current_lemma[-1] = @current_lemma[-1].gsub(%r{/*>\z}, "/>")
        @empty_element = false
      else
        @current_lemma << "</#{name}>"
      end
      @open_elements.pop
    end

    # SAX: text content. Splits on whitespace (keeping separators), tracks
    # hyphenation across breaks, and finalizes the current lemma at each
    # whitespace boundary that is not inside a hyphenated word.
    def characters(string)
      @empty_element = false
      string.split(/(\s)/).reject(&:empty?).each do |lemma|
        @current_chars += lemma.strip

        if lemma.match?(/\-$/)
          @inside_hyphen = true
        elsif lemma.match?(/^\-?[^\s]/)
          @inside_hyphen = false
        end

        if lemma.match(/^\s+$/) && !@inside_hyphen
          finalize
          next
        end

        text = lemma.strip
        @current_lemma << text unless text.empty?
      end
    end

    # SAX: end of document. Flushes the last lemma, then drops all parser
    # state so only @members survives ("immutable" result).
    def end_document
      finalize
      (instance_variables - [:@members]).each do |var|
        remove_instance_variable(var)
      end
    end

    # Builds an XML tag string from a SAX name/attrs pair.
    # NOTE(review): gsub('"', '"') is a no-op — almost certainly the
    # replacement was '&quot;' and got entity-unescaped when this source was
    # extracted; confirm against the original gem before relying on attribute
    # quoting here.
    def gen_xmlel(name, attrs, self_closing = false)
      attr_list = attrs.reduce("") do |result, attr|
        %(#{result} #{attr[0]}="#{attr[1].gsub('"', '"')}")
      end
      self_closing ? "<#{name}#{attr_list}/>" : "<#{name}#{attr_list}>"
    end

    # Derives the closing tag for a generated opening tag.
    def gen_xmlclose(el)
      el.gsub(/^<([^\s\>]+).*/, '</\\1>')
    end

    # Creates and stores a Lemma from the accumulated fragments, unless the
    # collected characters are only whitespace/punctuation.
    def append_lemma
      return unless @current_chars.match?(/[^\s\-\.\|]+/) # if not .empty?
      new_lemma_source = @current_lemma.join("")
      new_lemma = Lemma.new(new_lemma_source, @index)
      @index += 1
      @members << new_lemma
    end

    # Closes out the lemma in progress: prepends the current pb/lb markers,
    # closes any still-open elements, appends the lemma, and primes the next
    # lemma with re-opened copies of those elements.
    def finalize
      return if @current_lemma.empty?
      @current_lemma.unshift(@current_lb.first) unless @current_lemma[0] == @current_lb.first
      @current_lemma.unshift(@current_pb.first) unless @current_lemma[0] == @current_pb.first

      # Make sure missing open or close tags are inserted
      unless @open_elements.empty?
        @current_lemma.concat(@open_elements.reverse.map { |e| gen_xmlclose(e) })
        prime_next = @open_elements.dup
      end

      append_lemma

      @current_pb = [@current_pb.last]
      @current_lb = [@current_lb.last]
      @current_chars = ""
      @current_lemma = prime_next || []
      @inside_hyphen = false
    end
  end
end