dphil 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +6 -0
- data/LICENSE +201 -0
- data/README.md +54 -0
- data/Rakefile +11 -0
- data/dphil.gemspec +49 -0
- data/exe/dphil +10 -0
- data/lib/dphil.rb +53 -0
- data/lib/dphil/cache.rb +15 -0
- data/lib/dphil/change_list.rb +6 -0
- data/lib/dphil/character.rb +236 -0
- data/lib/dphil/character_matrix.rb +102 -0
- data/lib/dphil/cli.rb +26 -0
- data/lib/dphil/cli_commands/csv2ld.rb +71 -0
- data/lib/dphil/cli_commands/csv2nex.rb +37 -0
- data/lib/dphil/constants.rb +128 -0
- data/lib/dphil/converter.rb +58 -0
- data/lib/dphil/converters/csv2nex.rb +83 -0
- data/lib/dphil/ld_data_set.rb +25 -0
- data/lib/dphil/ld_output.rb +29 -0
- data/lib/dphil/lemma.rb +44 -0
- data/lib/dphil/lemma_list.rb +179 -0
- data/lib/dphil/log_formatter.rb +39 -0
- data/lib/dphil/logger.rb +27 -0
- data/lib/dphil/metrical_data.rb +78 -0
- data/lib/dphil/newick.rb +52 -0
- data/lib/dphil/paup.rb +34 -0
- data/lib/dphil/refinements.rb +8 -0
- data/lib/dphil/refinements/natural_sort.rb +52 -0
- data/lib/dphil/script_string.rb +124 -0
- data/lib/dphil/syllables.rb +43 -0
- data/lib/dphil/syllables/syllable.rb +45 -0
- data/lib/dphil/tei_xml.rb +142 -0
- data/lib/dphil/transliterate.rb +131 -0
- data/lib/dphil/tree.rb +142 -0
- data/lib/dphil/tree_node.rb +67 -0
- data/lib/dphil/verse.rb +25 -0
- data/lib/dphil/verse_analysis.rb +509 -0
- data/lib/dphil/verse_analysis_new.rb +816 -0
- data/lib/dphil/version.rb +30 -0
- data/vendor/default_commands.paup +18 -0
- data/vendor/metrical_data.yml +4035 -0
- metadata +409 -0
@@ -0,0 +1,39 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "awesome_print"
|
4
|
+
|
5
|
+
module Dphil
|
6
|
+
# Logger formatter that wraps the log header in ANSI escape codes selected by
# severity and prints the message itself on the following line.
class LogFormatter < ::Logger::Formatter
  using ::Ragabash::Refinements

  # ANSI escape sequences keyed by style name; :none resets the terminal.
  COLOR_MAP = {
    none: "\e[0m",
    bold: "\e[1m",
    red: "\e[31m",
    yellow: "\e[33m",
    green: "\e[32m",
    cyan: "\e[36m",
  }.freeze

  # Style applied to each severity label ("PROGNAME" is a pseudo-severity
  # used for the program-name tag).
  SEVERITY_MAP = {
    "ERROR" => :red,
    "FATAL" => :red,
    "WARN" => :yellow,
    "INFO" => :green,
    "DEBUG" => :cyan,
    "PROGNAME" => :bold,
  }.freeze
  private_constant :COLOR_MAP, :SEVERITY_MAP

  # Wraps +string+ in the escape code mapped to +severity+ followed by the
  # reset code. Unknown severities get the reset code on both sides.
  #
  # @param severity [String] key into SEVERITY_MAP
  # @param string [String] text to decorate
  # @return [String] a new, unfrozen string (callers append to it)
  def colorize(severity, string)
    style = SEVERITY_MAP.fetch(severity, :none)
    opening = COLOR_MAP[style]
    closing = COLOR_MAP[:none]
    String.new("#{opening}#{string}#{closing}")
  end

  # Formats one log entry: a coloured header (timestamp, gem version,
  # severity, optional program name), a newline, then the message.
  # Messages that do not respond to +to_str+ are rendered through +ai+.
  #
  # @return [String]
  def call(severity, timestamp, progname, msg)
    stamp = timestamp.strftime('%Y-%m-%d %H:%M:%S %Z')
    header = colorize(severity, "[#{stamp}][v#{VERSION}] [#{severity}] ")
    header << colorize("PROGNAME", "[#{progname}]") unless progname.nil?
    body = msg.respond_to?(:to_str) ? msg : msg.ai(indent: -2)
    "#{header}\n#{body}\n"
  end
end
|
39
|
+
end
|
data/lib/dphil/logger.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_support/logger"
|
4
|
+
|
5
|
+
require "dphil/log_formatter"
|
6
|
+
|
7
|
+
# Namespace module definition
|
8
|
+
module Dphil
|
9
|
+
module_function
|
10
|
+
|
11
|
+
# Returns the memoized logger.
#
# When Rails and Rails.logger are defined, that logger is used as-is.
# Otherwise a file logger writing to "dphil.log" under GEM_ROOT is built
# with LogFormatter; when Constants::DEBUG is truthy its output is also
# broadcast to STDERR using the same formatter.
def logger
  @logger ||=
    if defined?(::Rails) && defined?(::Rails.logger)
      ::Rails.logger
    else
      log_path = File.join(GEM_ROOT, "dphil.log")
      primary = ActiveSupport::Logger.new(log_path)
      primary.formatter = LogFormatter.new
      if Constants::DEBUG
        stderr_logger = ActiveSupport::Logger.new(STDERR)
        stderr_logger.formatter = primary.formatter
        primary.extend(ActiveSupport::Logger.broadcast(stderr_logger))
      end
      primary
    end
end
|
27
|
+
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "psych"
|
4
|
+
require "hashie"
|
5
|
+
|
6
|
+
module Dphil
|
7
|
+
#
|
8
|
+
# Metrical Data structure imported and parsed from "metrical_data" module at:
|
9
|
+
# https://github.com/shreevatsa/sanskrit
|
10
|
+
#
|
11
|
+
module MetricalData
  using ::Ragabash::Refinements

  class << self
    attr_reader :version, :meters, :patterns, :regexes, :all
  end

  # Loads vendor/metrical_data.yml and populates the module-level readers
  # (+version+, +meters+, +patterns+, +regexes+, +all+) with frozen data.
  #
  # This method remains public: the original file issued a bare
  # `private_class_method` with no arguments, which hides nothing, so
  # callers may already rely on being able to invoke it.
  #
  # @return [Module] self
  def self.load_data!
    yml_data = load_vendor_yaml(File.join(GEM_ROOT, "vendor", "metrical_data.yml"))

    @version = yml_data["commit"].deep_freeze

    # Hash of meters with names as keys and patterns as values
    @meters = IHash.new(downcase_names(yml_data["meters"]))

    # Hash of meters with patterns for keys and names/padas as values
    @patterns = IHashM.new(index_by_type(yml_data["patterns"]))

    # Hash of meters with regular expressions for keys and names/padas as
    # values. A leading "^" and trailing "$" are stripped from each source.
    regexes_h = index_by_type(yml_data["regexes"]) do |pattern|
      Regexp.new(pattern.source.gsub(/^\^|\$$/, ""))
    end
    @regexes = IHashM.new(regexes_h)

    @all = IHashM.new(version: version,
                      meters: meters,
                      patterns: patterns,
                      regexes: regexes)
    self
  end

  # Reads the bundled YAML file. The "regexes" section is consumed via
  # `pattern.source`, i.e. it holds Regexp objects, which Psych 4's default
  # (safe) +load_file+ refuses to instantiate. The file ships with the gem
  # and is therefore trusted, so the unsafe loader is used when available.
  def self.load_vendor_yaml(path)
    return Psych.unsafe_load_file(path) if Psych.respond_to?(:unsafe_load_file)
    Psych.load_file(path)
  end

  # Returns a new Hash whose keys are the Unicode-downcased names of +named+.
  def self.downcase_names(named)
    named.each_with_object({}) do |(name, value), acc|
      acc[Transliterate.unicode_downcase(name)] = value
    end
  end

  # Builds { type_sym => { pattern => { downcased_name => value } } } from one
  # section of the YAML data. Within each type the entries are ordered by
  # descending length of the key's string form. When a block is given it
  # receives every pattern and its result is used as the key instead.
  def self.index_by_type(section, &transform)
    section.each_with_object({}) do |(type, patterns), type_h|
      by_pattern = patterns.each_with_object({}) do |(pattern, meters), pattern_h|
        key = transform ? transform.call(pattern) : pattern
        pattern_h[key] = downcase_names(meters)
      end
      type_h[type.to_sym] = by_pattern.sort_by { |(k, _)| k.to_s.length }.reverse.to_h
    end
  end

  private_class_method :load_vendor_yaml, :downcase_names, :index_by_type

  # Immutable Hash
  class IHash < ::Hash
    include Hashie::Extensions::MergeInitializer

    # Accepts the same arguments as the MergeInitializer constructor, then
    # deep-freezes the resulting hash.
    def initialize(*)
      super
      deep_freeze
    end
  end

  # Immutable Hash with method access (for :full, :half, :pada hashes)
  class IHashM < IHash
    include Hashie::Extensions::MethodAccess
  end

  # Load the data when we load the module
  # (but keep it in a method for cleanliness)
  load_data!
end
|
78
|
+
end
|
data/lib/dphil/newick.rb
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "bio"
|
4
|
+
|
5
|
+
module Dphil
|
6
|
+
module NewickTree
  module_function

  # Builds a Dphil::Tree from the "tree MajRule = [&R]..." line of a NEXUS
  # file.
  #
  # Every non-root node receives an integer id: the key under which its name
  # appears in +taxa_map+ when there is one, otherwise the next unused id
  # counting up from the largest key in +taxa_map+ (or from 1).
  #
  # @param filename [String] path of the NEXUS file to read
  # @param tree_id [Object, nil] passed through to Dphil::Tree.new
  # @param taxa_map [Hash, nil] id => taxon name
  # @return [Dphil::Tree] tree whose stats are all nil
  def tree_from_nex(filename, tree_id: nil, taxa_map: nil) # rubocop:disable MethodLength
    newick = File.read(filename).to_s[/^\s*tree MajRule = \[&R\](.*)$/, 1]
    tree = Bio::Newick.new(newick).tree

    next_id = (taxa_map&.keys&.max || 0) + 1
    nodes_by_id = {}
    tree.nodes.each do |node|
      next if node == tree.root
      id = taxa_map&.key(node.name)
      if id.nil?
        id = next_id
        next_id += 1
      end
      nodes_by_id[id] = node
    end

    tree_nodes = nodes_by_id.map do |id, node|
      parent = tree.parent(node)
      entry = {
        id: id,
        name: node.name || "##{id}",
        # A parent without an id (the root) is recorded as 0.
        parent: nodes_by_id.key(parent) || 0,
        length: tree.get_edge(node, parent)&.distance,
        children: tree.children(node).map { |child| nodes_by_id.key(child) },
      }
      [id, entry]
    end.to_h

    # No statistics are available from this source; every slot stays nil.
    stats = %i[length ci hi ci_ex hi_ex ri rc].map { |stat| [stat, nil] }.to_h

    Dphil::Tree.new(tree_id, nodes: tree_nodes, stats: stats)
  end
end
|
52
|
+
end
|
data/lib/dphil/paup.rb
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Dphil
|
4
|
+
#
|
5
|
+
# PAUP* Log Processor
|
6
|
+
#
|
7
|
+
module PAUP
  # Captures (1) the table between the two dashed rules that follow
  # "Branch lengths and linkages" and (2) the paragraph that starts at
  # "Tree length =" and runs to the next blank line.
  BRANCH_REGEXP = /^Branch lengths and linkages.*?\n\-{40,}\n(.*?)\n\-{40,}\n^Sum.*?(^Tree length =.*?)\n\n/m
  # Captures the text between the dashed rule that follows
  # "Character change lists:" and the next blank line.
  CHGLIST_REGEXP = /^Character change lists:.*?\n\-{40,}\n(.*?)\n\n/m

  # Splits a PAUP* log into its preamble and one entry per "Tree N:" section.
  #
  # @param infile [String] path to the log file
  # @return [Hash, Array, nil]
  #   - nil (after printing a message to STDERR) when the file is missing;
  #   - an empty Array when the file is empty;
  #   - otherwise a Hash with :preamble plus, for each tree number (Integer
  #     key), a Hash that may contain :lengths, :stats and :changes depending
  #     on which sections were found in that tree's text.
  def self.parse_trees(infile)
    infile = File.expand_path(infile)
    return STDERR.puts("File #{infile} not found.") unless File.exist?(infile)

    # The capture group makes split keep the tree numbers:
    # [preamble, "1", text, "2", text, ...]
    data = File.read(infile).to_s.split(/^Tree ([0-9]+)\:$/)
    return data if data.empty?

    hash = { preamble: data.shift.strip }

    trees = {}
    data.each_slice(2) do |k, v|
      next trees[:remainder] = k if v.nil?
      branches = v.match(BRANCH_REGEXP)&.captures
      changes = v.match(CHGLIST_REGEXP)&.captures

      entry = {}
      entry[:lengths], entry[:stats] = branches unless branches.nil?
      # Guard on the change list itself: it can be present without a branch
      # table, and a branch table can be present without a change list.
      entry[:changes] = changes.first unless changes.nil?
      trees[k.to_i] = entry
    end

    hash.merge(trees)
  end
end
|
34
|
+
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Dphil
|
4
|
+
module Refinements
|
5
|
+
module NaturalSort
  refine Hash do
    # Returns a copy of the hash (nested hashes included) with keys in
    # natural order, e.g. "a9" before "a10". Keys that cannot be compared
    # chunk-wise fall back to a plain <=>.
    def natural_sort_keys
      sort_by_key(true) do |a, b|
        NaturalSort.grouped_compare(a, b) || a <=> b
      end
    end

    # Returns a new Hash with the keys sorted (using the block as comparator
    # when one is given). With +recursive+ set, nested Hash values are
    # sorted with the same comparator.
    def sort_by_key(recursive = false, &block)
      keys.sort(&block).each_with_object({}) do |key, acc|
        value = self[key]
        value = value.sort_by_key(true, &block) if recursive && value.is_a?(Hash)
        acc[key] = value
      end
    end
  end

  class << self
    # Splits a string into word-like chunks (capture 1) and digit runs
    # (capture 2).
    CMP_REGEX = /((?:@{1,2}|[\$\:])?\p{L}+(?:[^\p{L}\d\s]*))|(\d+)/
    private_constant :CMP_REGEX

    # Compares two strings chunk by chunk: digit runs compare numerically,
    # other chunks compare as strings, and a number sorts before text when
    # the two kinds meet at the same position.
    #
    # @return [Integer, nil] -1, 0 or 1; nil when either argument cannot be
    #   scanned (nil, not string-like) or yields no chunks, so the caller can
    #   fall back to another comparison.
    def grouped_compare(a, b) # rubocop:disable CyclomaticComplexity
      return unless a.respond_to?(:scan) && b.respond_to?(:scan)
      a_chunks = a.scan(CMP_REGEX)
      b_chunks = b.scan(CMP_REGEX)
      return if a_chunks.empty? || b_chunks.empty?

      ret = nil
      [a_chunks.size, b_chunks.size].max.times do |index|
        # Running out of chunks first means sorting first.
        a_cmp = coerce_chunk(a_chunks[index]) || (return -1)
        b_cmp = coerce_chunk(b_chunks[index]) || (return 1)
        # <=> is nil only for an Integer/String pair; the Integer goes first.
        ret = (a_cmp <=> b_cmp) || (a_cmp.is_a?(Integer) ? -1 : 1)
        return ret unless ret.zero?
      end
      ret
    end

    private

    # Turns one scan result into its comparable value: the text chunk, or
    # the digit run as a base-10 Integer (so "08" is 8 rather than an
    # invalid octal literal). Returns nil for a missing chunk.
    def coerce_chunk(chunk)
      return if chunk.nil?
      return chunk[0] unless chunk[0].nil?
      Integer(chunk[1], 10)
    end
  end
end
|
51
|
+
end
|
52
|
+
end
|
@@ -0,0 +1,124 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "forwardable"
|
4
|
+
|
5
|
+
module Dphil
|
6
|
+
# A String paired with the script (writing system) it is written in.
# Comparison, conversion and size queries are delegated to the wrapped
# string; the methods below mirror String's API but return ScriptString
# instances carrying the same script.
class ScriptString
  using ::Ragabash::Refinements
  extend Forwardable
  def_delegators :@string, :<=>, :==, :===, :to_s, :to_str, :empty?, :length
  attr_reader :string

  # @param str [#to_str] source text. A frozen string is duplicated first;
  #   an unfrozen one is transcoded to UTF-8 in place (encode!).
  # @param script [Symbol, String, Array, nil] script of +str+; detected
  #   with Transliterate.detect when nil
  # @raise [RuntimeError] when +str+ does not respond to +to_str+
  def initialize(str, script = nil)
    raise "Source must be a String" unless str.respond_to?(:to_str)
    str = str.to_str
    str = str.dup if str.frozen?
    @string = str.encode!(Encoding::UTF_8)
    self.script = script || self.script
  end

  # The script of the wrapped string, detected lazily when not yet set.
  def script
    @script ||= Transliterate.detect(@string)
  end

  # Stores +script+ as a Symbol, or as an Array of Symbols when it responds
  # to +flat_map+.
  # NOTE(review): depends on `try`, which is not defined in this file.
  def script=(script)
    @script = script.try(:flat_map, &:to_sym) || script.to_sym
  end

  # Returns a new ScriptString transliterated into +target+. When the script
  # is an Array, its first element is replaced by +target+ (duplicates
  # removed) and that Array becomes the new instance's script.
  def transliterate(target)
    target = target.to_sym
    string = Transliterate.transliterate(@string, from: @script, to: target)
    if @script.is_a?(Array)
      new_target = @script.dup
      new_target[0] = target
      new_target.uniq!
      target = new_target
    end
    self.class.new(string, target)
  end

  # In-place variant of #transliterate.
  #
  # @return [String] the transliterated underlying string (not self)
  def transliterate!(target)
    target = target.to_sym
    @string = Transliterate.transliterate(@string, from: @script, to: target)
    if @script.is_a?(Array)
      @script[0] = target
      @script.uniq!
    end
    @string
  end

  # String methods implemented to return ScriptString instances wherever possible

  def downcase
    self.class.new(Transliterate.unicode_downcase(@string), @script)
  end

  # @return [ScriptString, nil] self, or nil when the helper returned nil
  def downcase!
    ret_val = Transliterate.unicode_downcase!(@string)
    self unless ret_val.nil?
  end

  def inspect
    "#{@string.inspect}:#{script}"
  end

  # Like String#gsub. A block takes precedence over +rep_hash+ (which may be
  # any replacement String#gsub accepts).
  #
  # The block is captured explicitly: the previous `&Proc.new` form raises
  # on Ruby 3.0+, where Proc.new without a block is no longer allowed.
  #
  # @return [ScriptString, Enumerator] an Enumerator when neither a block
  #   nor a replacement is given
  def gsub(pattern, rep_hash = nil, &block)
    ret_val = if block
                @string.gsub(pattern, &block)
              elsif !rep_hash.nil?
                @string.gsub(pattern, rep_hash)
              else
                @string.gsub(pattern)
              end
    return ret_val if ret_val.is_a?(Enumerator)
    self.class.new(ret_val, @script)
  end

  # Like String#gsub!, modifying the wrapped string.
  #
  # @return [ScriptString, Enumerator, nil] self when a substitution was
  #   made, nil when none was
  def gsub!(pattern, rep_hash = nil, &block)
    ret_val = if block
                @string.gsub!(pattern, &block)
              elsif !rep_hash.nil?
                @string.gsub!(pattern, rep_hash)
              else
                @string.gsub!(pattern)
              end
    return ret_val if ret_val.is_a?(Enumerator)
    self unless ret_val.nil?
  end

  # Like String#scan. Without a block, every match (or every capture group
  # of every match) is wrapped in a ScriptString; with a block, self is
  # returned.
  def scan(pattern, &block)
    ret_val = if block
                @string.scan(pattern, &block)
              else
                @string.scan(pattern)
              end
    return self if ret_val == @string
    ret_val.map do |match|
      next self.class.new(match, @script) if match.is_a?(String)
      match.map do |group|
        self.class.new(group, @script)
      end
    end
  end

  # Like String#slice.
  #
  # @return [ScriptString, nil] nil when the requested portion does not
  #   exist, as String#slice does
  def slice(a, b = nil)
    piece = b.nil? ? @string.slice(a) : @string.slice(a, b)
    piece && self.class.new(piece, @script)
  end
  alias [] slice

  # Like String#slice!: removes the portion from the wrapped string.
  #
  # @return [ScriptString, nil] the removed portion, or nil when nothing
  #   matched
  def slice!(a, b = nil)
    piece = b.nil? ? @string.slice!(a) : @string.slice!(a, b)
    piece && self.class.new(piece, @script)
  end

  def strip
    self.class.new(@string.strip, @script)
  end

  # @return [ScriptString, nil] self, or nil when nothing was stripped
  def strip!
    ret_val = @string.strip!
    self unless ret_val.nil?
  end
end
|
124
|
+
end
|
@@ -0,0 +1,43 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "forwardable"
|
4
|
+
|
5
|
+
require "dphil/syllables/syllable"
|
6
|
+
|
7
|
+
module Dphil
|
8
|
+
# Enumerable collection of the syllables of a source text, together with the
# weight string computed for them. Element access and iteration are
# delegated to the frozen syllable list.
class Syllables
  using ::Ragabash::Refinements
  include Enumerable
  extend Forwardable
  def_delegators :@syllables, :[], :each, :first, :last, :length

  attr_reader :source, :source_script, :weights, :syllables

  # @param source [#to_str] text to split into syllables
  # @param source_script [Symbol, nil] script of +source+; when nil it is
  #   detected, falling back to Transliterate.default_script
  def initialize(source, source_script: nil)
    @source = source.to_str.safe_copy.freeze
    @source_script = source_script || Transliterate.detect(@source) || Transliterate.default_script

    # Syllabification and weighting are done on the SLP1 form of the text.
    slp1_syllables = VerseAnalysis.syllables(@source, from: @source_script, to: :slp1)
    @weights = VerseAnalysis.syllables_weights(slp1_syllables, from: :slp1, contextual: true).freeze

    @syllables = (slp1_syllables.each_with_index.map do |slp1, position|
      text = @source_script == :slp1 ? slp1 : Transliterate.t(slp1, :slp1, @source_script)
      Syllables::Syllable.new(text, @weights[position], parent: self, index: position, slp1: slp1)
    end).freeze
  end

  def inspect
    %(<Syllables "#{@source}":#{@source_script} (#{@weights}) (#{@syllables.count}) => #{@syllables.inspect}>)
  end

  # Returns each syllable's +source+ transliterated from :slp1 into the
  # source script.
  # NOTE(review): the first argument given to Syllable.new above is already
  # in the source script; if Syllable#source returns it unchanged, this
  # transliterates a second time — confirm against Syllables::Syllable.
  def to_a
    @syllables.map do |syllable|
      Transliterate.t(syllable.source, :slp1, @source_script)
    end
  end

  # @return [String] an unfrozen copy of the source text
  def to_s
    @source.dup
  end

  # Upper-cased copy of the weights, computed once and frozen.
  def simple_weights
    return @simple_weights if @simple_weights
    @simple_weights = @weights.upcase.freeze
  end
end
|
43
|
+
end
|