dphil 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +6 -0
- data/LICENSE +201 -0
- data/README.md +54 -0
- data/Rakefile +11 -0
- data/dphil.gemspec +49 -0
- data/exe/dphil +10 -0
- data/lib/dphil.rb +53 -0
- data/lib/dphil/cache.rb +15 -0
- data/lib/dphil/change_list.rb +6 -0
- data/lib/dphil/character.rb +236 -0
- data/lib/dphil/character_matrix.rb +102 -0
- data/lib/dphil/cli.rb +26 -0
- data/lib/dphil/cli_commands/csv2ld.rb +71 -0
- data/lib/dphil/cli_commands/csv2nex.rb +37 -0
- data/lib/dphil/constants.rb +128 -0
- data/lib/dphil/converter.rb +58 -0
- data/lib/dphil/converters/csv2nex.rb +83 -0
- data/lib/dphil/ld_data_set.rb +25 -0
- data/lib/dphil/ld_output.rb +29 -0
- data/lib/dphil/lemma.rb +44 -0
- data/lib/dphil/lemma_list.rb +179 -0
- data/lib/dphil/log_formatter.rb +39 -0
- data/lib/dphil/logger.rb +27 -0
- data/lib/dphil/metrical_data.rb +78 -0
- data/lib/dphil/newick.rb +52 -0
- data/lib/dphil/paup.rb +34 -0
- data/lib/dphil/refinements.rb +8 -0
- data/lib/dphil/refinements/natural_sort.rb +52 -0
- data/lib/dphil/script_string.rb +124 -0
- data/lib/dphil/syllables.rb +43 -0
- data/lib/dphil/syllables/syllable.rb +45 -0
- data/lib/dphil/tei_xml.rb +142 -0
- data/lib/dphil/transliterate.rb +131 -0
- data/lib/dphil/tree.rb +142 -0
- data/lib/dphil/tree_node.rb +67 -0
- data/lib/dphil/verse.rb +25 -0
- data/lib/dphil/verse_analysis.rb +509 -0
- data/lib/dphil/verse_analysis_new.rb +816 -0
- data/lib/dphil/version.rb +30 -0
- data/vendor/default_commands.paup +18 -0
- data/vendor/metrical_data.yml +4035 -0
- metadata +409 -0
# frozen_string_literal: true

# Registers the `csv2nex` subcommand on the Dphil CLI (GLI-style DSL).
Dphil::CLI.module_eval do
  desc "Convert a CSV-format collation file into a NEXUS file"
  long_desc <<~EOS
    Convert a CSV-format collation file into a NEXUS file for use with PAUP.
    This expects each column of the CSV to represent data for a single taxon,
    and the first row to contain the names of the taxa.
  EOS

  arg :csv_file

  command :csv2nex do |c|
    c.desc "Transpose rows/columns in CSV"
    c.switch :t, :transpose, negatable: false

    c.desc "Include custom PAUP commands from a file in PAUP block of NEXUS output"
    c.flag :d, :paup_data, arg_name: "file"

    c.desc "Write NEXUS output to file instead of STDOUT"
    c.flag :o, :outfile, arg_name: "file"

    c.action do |_, copts, args|
      nexus = Dphil::Csv2NexConverter.new(args[0], copts).convert

      if copts[:outfile].nil?
        # No output file given: print the NEXUS document to STDOUT.
        puts nexus
      else
        out_path = Pathname.new(copts[:outfile]).expand_path
        rel_outfile = out_path.relative_path_from(Pathname.getwd)
        puts "#{File.write(out_path, nexus)} bytes written to #{rel_outfile}"
        puts "You can process this file using PAUP with the command\n" \
             "`paup4 [options] #{rel_outfile}`"
      end
    end
  end
end
# frozen_string_literal: true

require "set"

module Dphil
  # Gem-wide constants: debug flag, regular expressions for SLP1-encoded
  # Sanskrit syllables, and the JSON-LD @context/@type tables used by the
  # Linked Data output classes.
  module Constants
    using ::Ragabash::Refinements

    # True in a Rails "dev*" environment or when RUBY_ENV starts with "dev".
    DEBUG = begin
      rails_dev = defined?(::Rails) && ::Rails.env[/^dev/]
      ruby_dev = !ENV["RUBY_ENV"].nil? && ENV["RUBY_ENV"][/^dev/]
      (rails_dev || ruby_dev) ? true : false
    end

    # Regular expressions for SLP1 syllables
    begin
      vowels = "aAiIuUfFxXeEoO"
      consonants = "kKgGNcCjJYwWqQRtTdDnpPbBmyrlvzSsh"
      finals = "MH" # presumably anusvāra/visarga markers — confirm

      # One syllable: optional avagraha, onset consonants, a vowel, then
      # trailing consonants/finals not followed by another vowel.
      R_SYL = /[']?[#{consonants}]*[\s]*[#{vowels}][#{consonants}#{finals}]*(?![#{vowels}])\s*/
      # Heavy-syllable test: long vowel/diphthong, or final M/H.
      R_GSYL = /[AIUFXeEoO]|[MH]$/
      # Consonant cluster at end of string / anywhere in string.
      R_CCONF = /[#{consonants}]{2}$/
      R_CCON = /[#{consonants}]{2}/
    end

    # Markers for control words ({#...#}) used during transliteration.
    TRANS_CTRL_WORD = /\{#.*?#\}/
    TRANS_CTRL_WORD_CONTENT = /\{#(.*?)#\}/
    TRANS_CTRL_WORD_PROCESSED = /#[a-f0-9]{40}#/

    # Linked Data types and contexts
    begin
      ctx_global = {
        "@version" => 1.1,
        "oa" => "http://www.w3.org/ns/oa#",
        "dc" => "http://purl.org/dc/elements/1.1/",
        "xsd" => "http://www.w3.org/2001/XMLSchema#",
        "ubcs" => "http://ld.ubcsanskrit.ca/api#",
        "id" => { "@id" => "dc:identifier" },
      }

      # Both camelCase and snake_case keys map onto the same terms so either
      # Ruby- or JSON-style payloads compact identically.
      ctx_character = {
        "states" => { "@id" => "ubcs:charStateBySymbol", "@container" => "@index" },
        "symbols" => { "@id" => "ubcs:charSymbolByState", "@container" => "@index" },
        "stateTotals" => { "@id" => "ubcs:charStateTotalsByState", "@container" => "@index" },
        "state_totals" => { "@id" => "ubcs:charStateTotalsByState", "@container" => "@index" },
        "taxaStates" => { "@id" => "ubcs:charStateByTaxon", "@container" => "@index" },
        "taxa_states" => { "@id" => "ubcs:charStateByTaxon", "@container" => "@index" },
        "statesTaxa" => { "@id" => "ubcs:taxonByCharState", "@container" => "@index" },
        "states_taxa" => { "@id" => "ubcs:taxonByCharState", "@container" => "@index" },
        "isInformative" => { "@id" => "ubcs:charStateIsInformative" },
        "is_informative" => { "@id" => "ubcs:charStateIsInformative" },
        "isConstant" => { "@id" => "ubcs:charStateIsConstant" },
        "is_constant" => { "@id" => "ubcs:charStateIsConstant" },
      }

      ctx_matrix = {
        "taxaNames" => { "@id" => "dc:identifier", "@container" => "@index" },
        "taxa_names" => { "@id" => "dc:identifier", "@container" => "@index" },
        "characters" => {
          "@id" => "ubcs:phyloCharacter",
          "@container" => "@index",
          "@context" => ctx_character,
        },
      }

      ctx_tree_node = {
        "name" => { "@id" => "ubcs:treeNodeName" },
        "length" => { "@id" => "ubcs:branchLength" },
        "parent" => { "@id" => "ubcs:treeNodeParent" },
        "children" => { "@id" => "ubcs:treeNodeChildren" },
      }

      ctx_tree = {
        "rootId" => { "@id" => "ubcs:treeRootId" },
        "root_id" => { "@id" => "ubcs:treeRootId" },
        "nodes" => {
          "@id" => "ubcs:treeNode",
          "@container" => "@index",
          "@context" => ctx_tree_node,
        },
        "stats" => {
          "@id" => "ubcs:treeStats",
          "@context" => {
            "ci" => { "@id" => "ubcs:treeCI" },
            "ciEx" => { "@id" => "ubcs:treeCIEx" },
            "ci_ex" => { "@id" => "ubcs:treeCIEx" },
            "hi" => { "@id" => "ubcs:treeHI" },
            "hiEx" => { "@id" => "ubcs:treeHIEx" },
            "hi_ex" => { "@id" => "ubcs:treeHIEx" },
            # NOTE(review): "ubcs:treeLengh" looks misspelled ("treeLength"),
            # but it is a published vocabulary term — confirm upstream before
            # changing it.
            "length" => { "@id" => "ubcs:treeLengh" },
            "rc" => { "@id" => "ubcs:treeRC" },
            "ri" => { "@id" => "ubcs:treeRI" },
          },
        },
      }

      ctx_dataset = {
        "matrix" => {
          "@id" => "ubcs:characterMatrix",
          "@context" => ctx_matrix,
        },
        "trees" => {
          "@id" => "ubcs:tree",
          "@container" => "@index",
          "@context" => ctx_tree,
        },
      }

      # JSON-LD @type per serializable class.
      LD_TYPES = {
        "Dphil::Character" => "ubcs:phyloCharacter",
        "Dphil::CharacterMatrix" => "ubcs:characterMatrix",
        "Dphil::TreeNode" => "ubcs:treeNode",
        "Dphil::Tree" => "ubcs:tree",
        "Dphil::LDDataSet" => "ubcs:dataSet",
      }.deep_freeze

      # JSON-LD @context per serializable class (global terms merged in).
      LD_CONTEXTS = {
        "Dphil::Character" => ctx_global.merge(ctx_character),
        "Dphil::CharacterMatrix" => ctx_global.merge(ctx_matrix),
        "Dphil::TreeNode" => ctx_global.merge(ctx_tree_node),
        "Dphil::Tree" => ctx_global.merge(ctx_tree),
        "Dphil::LDDataSet" => ctx_global.merge(ctx_dataset),
      }.deep_freeze
    end
  end
end
# frozen_string_literal: true

module Dphil
  #
  # Base module for file converters (CSV, NEXUS, CollateX, etc.)
  #
  module Converter
    private

    # Read +infile+ and return its contents.
    # Raises IOError when the file does not exist.
    def load_file(infile)
      raise IOError, "File #{infile} not found." unless File.exist?(infile)
      File.read(infile)
    end

    # Parse +infile+ as CSV and return the rows.
    # +mode+ is the IO mode string (e.g. "r:bom|utf-8").
    def load_csv(infile, mode = "r")
      raise IOError, "File #{infile} not found." unless File.exist?(infile)
      CSV.read(infile, mode)
    end

    # Return a hash of value => count, ordered by descending count and then
    # by first-seen order for ties.
    def weighted_uniq(array)
      counts = array.each_with_object(Hash.new(0)) { |value, acc| acc[value] += 1 }
      counts.sort_by.with_index { |(_, count), order| [-count, order] }.to_h
    end

    # Sanitize a character string to basic KH/ASCII
    def sanitize_char(str)
      str = str.to_s
      src = Sanscript.detect(str) || :iast
      str = Sanscript.transliterate(str, src, :kh)
      str.gsub!(/\s/, "_")
      str.tr!("'", "`")
      str.strip!
      str
    end

    # Tokenize the values of a character: map each distinct sanitized state
    # to [symbol, count], most frequent states first.
    # NOTE(review): with more than 52 distinct states ALPHABET[i] is nil —
    # presumably inputs never exceed that; confirm.
    def tokenize(characters)
      char_set = weighted_uniq(characters.map { |c| sanitize_char(c) }.reject(&:empty?))
      char_set.each_with_object({}).with_index do |(char, acc), i|
        acc[char[0]] = [ALPHABET[i], char[1]]
      end
    end

    # NEX Token Alphabet (A-Z then a-z), frozen.
    ALPHABET = IceNine.deep_freeze(("A".."Z").to_a + ("a".."z").to_a)
    private_constant :ALPHABET
  end
end
# frozen_string_literal: true

module Dphil
  #
  # CSV to NEXUS file converter class
  #
  class Csv2NexConverter
    include Dphil::Converter

    # @param csv_file [String] path to the CSV collation file
    # @param opts [Hash] :transpose toggles row/column orientation,
    #   :paup_data names a custom PAUP command file
    def initialize(csv_file, opts = {})
      opts = opts.to_h

      # Load csv file (BOM-aware), optionally transposed.
      @csv = load_csv(csv_file, "r:bom|utf-8")
      @csv = @csv.transpose if opts[:transpose]

      # Load PAUP command block, falling back to the bundled defaults.
      opts[:paup_data] = File.join(GEM_ROOT, "vendor", "default_commands.paup") if opts[:paup_data].nil?
      @paup = load_file(opts[:paup_data])
      @paup << "\n" unless @paup.blank? || @paup[-1] == "\n"
      @paup.indent!(2)
      @paup.freeze
    end

    # Perform the conversion and return the NEXUS document as a String.
    def convert
      # Taxa come from the header row; every subsequent row is a character.
      taxa_count = @csv.first.count
      character_count = @csv.count - 1
      taxa_labels = @csv.first.map { |name| name.to_s.strip.scrub.gsub(/[^A-Za-z0-9]/, "_") }

      # Build CHARSTATELABELS and the taxon-by-character matrix in one pass.
      character_labels = []
      character_matrix = taxa_labels.map { |label| [label] }
      (1..character_count).each do |r|
        row = @csv[r]
        token_hash = tokenize(row)
        state_names = token_hash.map { |state, _| "'#{sanitize_char(state)}'" }.join(" ")
        character_labels << %(#{r} /#{state_names})
        row.each_with_index do |charstate, i|
          token = token_hash[sanitize_char(charstate)]
          # Missing/empty states become the gap symbol "-".
          character_matrix[i] << (token.nil? ? "-" : token[0])
        end
      end
      character_matrix.map! { |arr| "#{arr.shift} #{arr.join('')}" }

      # Return NEXUS output
      <<~NEXUS_EOF
        #NEXUS

        BEGIN TAXA;
        TITLE Manuscripts;
        DIMENSIONS NTAX=#{taxa_count};
        TAXLABELS #{taxa_labels.join(' ')};
        END;

        BEGIN CHARACTERS;
        TITLE Variant_Matrix;
        DIMENSIONS NCHAR=#{character_count};
        FORMAT DATATYPE = STANDARD RESPECTCASE GAP = - MISSING = ? SYMBOLS = "#{ALPHABET.join(' ')}";
        CHARSTATELABELS #{character_labels.join(', ')};
        MATRIX
        #{character_matrix.join("\n ")}
        ;

        END;

        BEGIN ASSUMPTIONS;
        OPTIONS DEFTYPE = UNORD;
        END;

        BEGIN PAUP;
        #{@paup}END;
      NEXUS_EOF
    end
  end
end
# frozen_string_literal: true

module Dphil
  # Aggregate Linked Data container pairing a character matrix with a set
  # of trees, serializable through the LDOutput mixin.
  class LDDataSet
    include Dphil::LDOutput

    attr_reader :matrix, :trees

    def initialize(matrix:, trees:)
      @matrix = matrix
      @trees = trees
    end

    # @return [Hash] symbol-keyed hash form of the dataset
    def to_h
      { matrix: matrix, trees: trees }
    end

    # Delegates JSON serialization to the hash form.
    def as_json(options = nil)
      to_h.as_json(options)
    end
  end
end
# frozen_string_literal: true

module Dphil
  #
  # Mixin module for Linked Data output
  #
  # Requires that a class implements +#as_json+
  #
  module LDOutput
    using Dphil::Refinements::NaturalSort

    # Outputs a Linked Data Hash.
    #
    # @param options [Hash] :context / :ld_type override the defaults from
    #   Constants::LD_CONTEXTS / Constants::LD_TYPES; :compact => false
    #   returns the expanded JSON-LD form instead of the compacted one.
    #   Remaining options are forwarded to #as_json.
    # @return [Hash] compacted (default) or expanded JSON-LD
    def as_jsonld(**options)
      ld = {
        "@context" => options.delete(:context) || Constants::LD_CONTEXTS[self.class.name],
        "@type" => options.delete(:ld_type) || Constants::LD_TYPES[self.class.name],
      }.merge!(as_json(options))

      ld_expanded = JSON::LD::API.expand(ld)
      return ld_expanded if options[:compact] == false

      ld_compact = JSON::LD::API.compact(ld_expanded, ld["@context"])
      # Keep "@context" first; remaining keys in natural sort order.
      { "@context" => ld_compact.delete("@context") }.merge!(ld_compact.natural_sort_keys)
    end

    # Outputs Linked Data serialized as JSON.
    def to_jsonld(**options)
      # FIX: forward keywords explicitly — under Ruby 3's keyword-argument
      # separation, passing the Hash positionally to a **kwargs method
      # raises ArgumentError (it silently converted on Ruby 2).
      as_jsonld(**options).to_json(options)
    end
  end
end
# frozen_string_literal: true

module Dphil
  # Public: A storage object for words and groups of words from TEI XML data.
  # Also contains information about the source/location of the words.
  # Immutable.
  class Lemma
    using ::Ragabash::Refinements

    # Public: Raw source XML, plain text, page/facsimile/line locators,
    # and ordinal index of the lemma.
    attr_reader :source, :text, :page, :facs, :line, :index

    # Public: Initialize a lemma object.
    #
    # source - XML data to initialize the lemma from
    # index  - ordinal position within the parent list (optional)
    def initialize(source = "", index = nil)
      @source = source.strip
      @index = index

      doc = Nokogiri::XML("<lemma>#{source}</lemma>") { |config| config.strict.noent }
      doc.encoding = "UTF-8"

      # Plain text with hyphenation artifacts removed.
      @text = doc.text.strip.gsub(/\-+\s*\-*/, "")
      page_breaks = doc.css("pb")
      @page = page_breaks.map { |el| el.attr("n") }.join(",")
      @facs = page_breaks.map { |el| el.attr("facs") }.join(",")
      @line = doc.css("lb").map { |el| el.attr("n") }.join(",")
    rescue Nokogiri::XML::SyntaxError => e
      $stderr.puts "Error in Lemma.new(`#{source}`, ...): #{e}"
      abort
    end

    def to_s
      "(#{index}|#{page}:#{line}) #{text}"
    end

    def to_sym
      "<Lemma>#{self}".to_sym
    end

    # Lemmata are equal when their raw source strings match.
    def ==(other)
      return false unless other.is_a?(Dphil::Lemma)
      source == other.source
    end
  end
end
# frozen_string_literal: true

require "nokogiri"

module Dphil
  # An object containing a list of lemmata generated through SAX parsing of an
  # XML document.
  # Immutable.
  class LemmaList < ::Nokogiri::XML::SAX::Document
    using ::Ragabash::Refinements
    include Enumerable

    attr_reader :name

    # Parses +source+ (a TEI XML string) immediately; resulting Lemma
    # objects are exposed through the Enumerable interface.
    def initialize(source)
      @members = []
      source = source.to_s.strip
      return if source.empty?
      # Structural wrappers whose tags are not part of any lemma.
      @lemma_ignore_start_tags = Set.new(%w[TEI text body pre post div])
      @lemma_ignore_end_tags = @lemma_ignore_start_tags + Set.new(%w[pb lb])
      @index = 0
      @open_elements = []
      @current_pb = []
      @current_lb = []
      @current_chars = ""
      @current_lemma = []
      @inside_hyphen = false
      @empty_element = true

      @parser = Nokogiri::XML::SAX::Parser.new(self)
      @parser.parse(source)
    end

    def each(&block)
      @members.each(&block)
    end

    # Returns all members, or the first +limit+ when given a number.
    def members(limit = nil)
      return @members[0, limit] if limit.is_a? Numeric
      @members
    end

    def [](*args)
      @members[*args]
    end

    # 1-based accessor; indices below 1 are clamped with a warning.
    def get(index)
      raise "Non-numeric index passed to Lemma.get" unless index.is_a? Numeric
      if index < 1
        warn "Minimum index of Lemma.get() is 1"
        index = 1
      end
      @members[index - 1]
    end

    def size
      @members.size
    end

    def to_s
      @members.map(&:text).join("\n")
    end

    # CollateX-style token hashes for each lemma.
    def cx_tokens
      @members.map do |lemma|
        out = {
          t: lemma.text,
          n: Transliterate.normalize_iast(lemma.text),
          i: lemma.index,
          p: lemma.page,
          f: lemma.facs,
          l: lemma.line,
        }
        warn "Token empty: #{out}" if out[:t].empty?
        out
      end
    end

    private

    # SAX callback: open tag. <pb>/<lb> milestones are tracked in their own
    # lists; other elements also go on a stack so unbalanced tags can be
    # repaired when a lemma is finalized.
    def start_element(name, attrs = [])
      return if @lemma_ignore_start_tags.include?(name)

      el = if %w[pb lb].include?(name)
             milestone = gen_xmlel(name, attrs, true)
             if @current_lemma.empty?
               instance_variable_set("@current_#{name}", [milestone])
             else
               instance_variable_get("@current_#{name}") << milestone
             end
             milestone
           else
             # FIX: build the element string once and reuse it (the original
             # called gen_xmlel twice, allocating a redundant duplicate).
             opened = gen_xmlel(name, attrs)
             @open_elements << opened
             opened
           end

      @empty_element = true
      @current_lemma << el unless el.empty?
    end

    # SAX callback: close tag. Collapses empty elements into self-closing
    # form; otherwise appends a close tag and pops the open-element stack.
    def end_element(name)
      return if @lemma_ignore_end_tags.include?(name)

      if @empty_element
        # NOTE(review): assumes the matching open tag was just appended to
        # @current_lemma — confirm inputs always nest ignored tags outermost.
        @current_lemma[-1] = @current_lemma[-1].gsub(%r{/*>\z}, "/>")
        @empty_element = false
      else
        @current_lemma << "</#{name}>"
      end
      @open_elements.pop
    end

    # SAX callback: character data. Splits on whitespace (keeping separators)
    # and finalizes the current lemma at word boundaries, unless a trailing
    # hyphen indicates the word continues across a break.
    def characters(string)
      @empty_element = false
      string.split(/(\s)/).reject(&:empty?).each do |chunk|
        @current_chars += chunk.strip

        if chunk.match?(/\-$/)
          @inside_hyphen = true
        elsif chunk.match?(/^\-?[^\s]/)
          @inside_hyphen = false
        end

        if chunk.match?(/^\s+$/) && !@inside_hyphen
          finalize
          next
        end

        text = chunk.strip
        @current_lemma << text unless text.empty?
      end
    end

    # SAX callback: document finished. Flush the last lemma and drop all
    # parser state except the member list.
    def end_document
      finalize
      (instance_variables - [:@members]).each do |var|
        remove_instance_variable(var)
      end
    end

    # Serialize an open (or self-closing) tag from SAX attribute pairs.
    def gen_xmlel(name, attrs, self_closing = false)
      attr_list = attrs.reduce("") do |result, attr|
        # FIX: escape double quotes in attribute values — the previous
        # gsub('"', '"') was a no-op, so values containing a quote produced
        # malformed XML (likely an &quot; lost in transit).
        %(#{result} #{attr[0]}="#{attr[1].gsub('"', '&quot;')}")
      end
      self_closing ? "<#{name}#{attr_list}/>" : "<#{name}#{attr_list}>"
    end

    # Derive the matching close tag from an open-tag string.
    def gen_xmlclose(el)
      el.gsub(/^<([^\s\>]+).*/, '</\\1>')
    end

    # Append the accumulated source as a new Lemma, unless it contains no
    # lemma-worthy characters (only whitespace/hyphens/daṇḍa punctuation).
    def append_lemma
      return unless @current_chars.match?(/[^\s\-\.\|]+/)
      new_lemma = Lemma.new(@current_lemma.join(""), @index)
      @index += 1
      @members << new_lemma
    end

    # Close out the current lemma: prepend active <pb>/<lb> milestones,
    # balance still-open tags, then reset state (re-opening unclosed tags
    # so the next lemma remains well-formed).
    def finalize
      return if @current_lemma.empty?
      @current_lemma.unshift(@current_lb.first) unless @current_lemma[0] == @current_lb.first
      @current_lemma.unshift(@current_pb.first) unless @current_lemma[0] == @current_pb.first

      # Make sure missing open or close tags are inserted
      unless @open_elements.empty?
        @current_lemma.concat(@open_elements.reverse.map { |e| gen_xmlclose(e) })
        prime_next = @open_elements.dup
      end

      append_lemma

      @current_pb = [@current_pb.last]
      @current_lb = [@current_lb.last]
      @current_chars = ""
      @current_lemma = prime_next || []
      @inside_hyphen = false
    end
  end
end