dphil 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +6 -0
- data/LICENSE +201 -0
- data/README.md +54 -0
- data/Rakefile +11 -0
- data/dphil.gemspec +49 -0
- data/exe/dphil +10 -0
- data/lib/dphil.rb +53 -0
- data/lib/dphil/cache.rb +15 -0
- data/lib/dphil/change_list.rb +6 -0
- data/lib/dphil/character.rb +236 -0
- data/lib/dphil/character_matrix.rb +102 -0
- data/lib/dphil/cli.rb +26 -0
- data/lib/dphil/cli_commands/csv2ld.rb +71 -0
- data/lib/dphil/cli_commands/csv2nex.rb +37 -0
- data/lib/dphil/constants.rb +128 -0
- data/lib/dphil/converter.rb +58 -0
- data/lib/dphil/converters/csv2nex.rb +83 -0
- data/lib/dphil/ld_data_set.rb +25 -0
- data/lib/dphil/ld_output.rb +29 -0
- data/lib/dphil/lemma.rb +44 -0
- data/lib/dphil/lemma_list.rb +179 -0
- data/lib/dphil/log_formatter.rb +39 -0
- data/lib/dphil/logger.rb +27 -0
- data/lib/dphil/metrical_data.rb +78 -0
- data/lib/dphil/newick.rb +52 -0
- data/lib/dphil/paup.rb +34 -0
- data/lib/dphil/refinements.rb +8 -0
- data/lib/dphil/refinements/natural_sort.rb +52 -0
- data/lib/dphil/script_string.rb +124 -0
- data/lib/dphil/syllables.rb +43 -0
- data/lib/dphil/syllables/syllable.rb +45 -0
- data/lib/dphil/tei_xml.rb +142 -0
- data/lib/dphil/transliterate.rb +131 -0
- data/lib/dphil/tree.rb +142 -0
- data/lib/dphil/tree_node.rb +67 -0
- data/lib/dphil/verse.rb +25 -0
- data/lib/dphil/verse_analysis.rb +509 -0
- data/lib/dphil/verse_analysis_new.rb +816 -0
- data/lib/dphil/version.rb +30 -0
- data/vendor/default_commands.paup +18 -0
- data/vendor/metrical_data.yml +4035 -0
- metadata +409 -0
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Dphil
  class Syllables
    using ::Ragabash::Refinements

    # A single syllable: its source text, its weight marker and, optionally,
    # its position inside a parent collection.
    #
    # NOTE(review): `safe_copy` is not defined in this file; presumably it is
    # supplied by Ragabash::Refinements - confirm.
    class Syllable
      attr_reader :source, :weight, :parent, :index, :source_script

      # Public: Initialize a Syllable.
      #
      # source - String-like text of the syllable (coerced with #to_str).
      # weight - String-like weight marker (coerced with #to_str).
      # opts   - Optional settings:
      #          :parent        - collection this syllable belongs to.
      #          :index         - position within the parent (coerced with #to_i).
      #          :source_script - script of `source`; defaults to the parent's
      #                           #source_script when a parent is given.
      #          :slp1          - pre-computed SLP1 form of `source`.
      def initialize(source, weight, **opts)
        @source = source.to_str.safe_copy.freeze
        @weight = weight.to_str.safe_copy.freeze
        @parent = opts[:parent]
        @index = opts[:index]&.to_i
        @source_script = opts[:source_script] || @parent&.source_script
        # When the source already is SLP1 it doubles as the SLP1 form;
        # otherwise use the supplied form (nil means "compute lazily in #slp1").
        @slp1 = @source_script == :slp1 ? @source : opts[:slp1]&.to_str&.safe_copy.freeze
      end

      # Returns a compact debugging representation: "[index]"source"(weight)".
      def inspect
        "[#{index}]#{source.inspect}(#{weight})"
      end

      # Returns an unfrozen copy of the source text.
      def to_s
        @source.dup
      end

      # Returns the preceding element of the parent, or nil when there is no
      # parent, no index, or this is the first element.
      def prev
        return unless @parent && @index && @index.positive?
        @parent[@index - 1]
      end

      # Returns the following element of the parent, or nil when there is no
      # parent, no index, or this is the last element.
      def next
        return unless @parent && @index
        # The last valid index is length - 1, so only look ahead when at
        # least one more element exists instead of indexing past the end.
        return unless @index < @parent.length - 1
        @parent[@index + 1]
      end

      # Returns the weight upcased (memoized and frozen).
      def simple_weight
        @simple_weight ||= weight.upcase.freeze
      end

      # Returns the SLP1 form of the source, transliterating from
      # `source_script` on first use when it was not supplied up front.
      def slp1
        @slp1 ||= Transliterate.t(@source, @source_script, :slp1).freeze
      end
    end
  end
end
|
@@ -0,0 +1,142 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Dphil
  # Wrapper around a TEI XML source string that is parsed lazily with Nokogiri
  # and offers non-destructive cropping, rejecting and substituting of parts
  # of the document (each operation returns a new TeiXML).
  class TeiXML
    using ::Ragabash::Refinements

    # Public: Initialize a TeiXML object
    #
    # source - String of XML. A blank string is replaced by an empty <TEI/>
    #          document.
    def initialize(source)
      source = %(<TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0"></TEI>) if source.strip.empty?
      @raw_xml = source
    end

    # Public: Return the parsed document, parsing (and memoizing) on first use.
    # Namespaces are removed and whitespace is normalized after parsing.
    #
    # NOTE(review): `noent` turns on entity substitution, which is risky when
    # the XML comes from an untrusted source (XXE). Left unchanged here.
    #
    # Raises RuntimeError when the source is not well-formed XML.
    def xml
      @xml ||= begin
        doc = Nokogiri::XML(@raw_xml) { |config| config.strict.noent }
        doc.encoding = "UTF-8"
        doc.remove_namespaces!
        xml_normalize!(doc)
      rescue Nokogiri::XML::SyntaxError => e
        raise "TEIDocument (source: #{@raw_xml}) caught exception: #{e}"
      end
    end

    # Public: Serialize the parsed (normalized) document.
    def to_xml
      xml.to_xml
    end

    alias to_s to_xml

    # Public: True when the document has no non-whitespace text nodes.
    def empty?
      xml.xpath("//text()[normalize-space()]").empty?
    end

    # Public: Return a portion of the document as a new document
    #
    # expr - a CSS selector or XPath expression
    #
    # Returns a new document containing every match of expr.
    def crop(expr)
      wrap_segment(xml.search(expr))
    end

    # Public: Like #crop, but build one new document per matched node.
    #
    # expr - a CSS selector or XPath expression
    #
    # Returns an Array of new documents.
    def crop_each(expr)
      xml.search(expr).map { |segment| wrap_segment(segment) }
    end

    # Public: Remove elements from the document based on CSS selector.
    # Any <pb/> and <lb/> descendants of a removed element are kept in place.
    #
    # expr - a CSS selector or XPath expression
    #
    # Returns a new document.
    def reject(expr)
      source = xml.dup
      source.search(expr).each do |node|
        node.replace(node.search("pb, lb"))
      end
      self.class.new(source.to_xml)
    end

    # Public: Substitute elements from the document based on CSS selector with
    # ID-based token text-nodes of the form " {{name:id}} ". Any <pb/> and
    # <lb/> descendants of a substituted element are kept after the token.
    #
    # expr       - a CSS selector or XPath expression
    # subst_text - an optional text identifier (whitespace runs become "_");
    #              the element name is used when omitted.
    #
    # Returns a new document.
    def subst(expr, subst_text = nil)
      # Work on a copy of the parsed document, exactly as #reject does.
      source = xml.dup
      subst_text = subst_text.to_s.gsub(/\s+/, "_") unless subst_text.nil?

      source.search(expr).each do |node|
        set = Nokogiri::XML::NodeSet.new(source)
        escaped_text = ":#{node.attribute('id').to_s.gsub(/\s+/, '_')}"
        text_content = "#{subst_text || node.name}#{escaped_text}"
        set << Nokogiri::XML::Text.new(" {{#{text_content}}} ", source)
        node.replace(set + node.search("pb, lb"))
      end
      self.class.new(source.to_xml)
    end

    private

    # Build a new document from a matched segment, carrying over the nearest
    # preceding <pb/> and <lb/> inside a leading <pre> element.
    #
    # segment - a node or node set taken from the parsed document.
    #
    # Returns a new document.
    def wrap_segment(segment)
      pb = page_of(segment)
      lb = line_of(segment)

      source = <<~EOS
        <TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0">
          <pre>#{pb&.to_xml}#{lb&.to_xml}</pre>
          #{segment.to_xml}
          <post></post>
        </TEI>
      EOS
      self.class.new(source)
    end

    # Get nearest prior <pb/> node.
    #
    # node - node in document to start search from.
    #
    # Returns the result of the XPath query (empty when there is none).
    def page_of(node)
      node.xpath("preceding::*[name() = 'pb'][1]")
    end

    # Get nearest prior <lb/> node.
    #
    # node - node in document to start search from.
    #
    # Returns the result of the XPath query (empty when there is none).
    def line_of(node)
      node.xpath("preceding::*[name() = 'lb'][1]")
    end

    # Normalize (mostly) whitespace in the XML.
    #
    # doc - parsed document; modified in place.
    #
    # Returns doc.
    def xml_normalize!(doc)
      # Collapse each whitespace run, together with any whitespace, ".", "-",
      # "\", "/" or "_" characters directly following it, into one space.
      doc.search("//text()").each do |text_node|
        text_node.content = text_node.content.gsub(%r{\s+[\s\.\-\\\/\_]*}, " ")
      end

      # Remove empty modification tags.
      doc.search(
        "//add[not(node())]|" \
        "//del[not(node())]|" \
        "//mod[not(node())]|" \
        "//unclear[not(node())]|" \
        "//g[not(node())]"
      ).remove
      doc
    end
  end
end
|
@@ -0,0 +1,131 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "sanscript"
|
4
|
+
|
5
|
+
module Dphil
  # Transliteration module for basic romanization formats.
  #
  # Most helpers leave "control words" (substrings matching
  # Constants::TRANS_CTRL_WORD) untouched while transforming the rest of the
  # string; see .process_string! below.
  module Transliterate
    using ::Ragabash::Refinements

    @default_script = nil

    module_function

    # Returns the currently configured default script (nil until one is set).
    def default_script
      @default_script
    end

    # Set the default script. Unsupported scripts are not stored; a warning
    # is printed instead.
    def default_script=(scr)
      candidate = scr.to_sym
      return warn("Script unsupported [:#{candidate}]") unless script_supported?(candidate)

      @default_script = candidate
    end

    # Transliterate str with Sanscript, passing along the default script.
    # A RuntimeError is logged and the input returned unchanged.
    def transliterate(str, first, second = nil)
      Sanscript.transliterate(str, first, second, default_script: default_script)
    rescue RuntimeError => e
      Dphil.logger.error "Transliteration Error: #{e}"
      str
    end

    # True when Sanscript lists the given script among its scheme names.
    def script_supported?(script)
      Sanscript::Transliterate.scheme_names.include?(script)
    end

    # Returns a copy of str decomposed to NFD with every non-ASCII character
    # removed (control words are preserved).
    def to_ascii(str)
      process_string(str) do |text|
        text.unicode_normalize(:nfd).gsub(/[^\u0000-\u007F]+/, "")
      end
    end

    # IAST -> KH.
    def iast_kh(str)
      transliterate(str, :iast, :kh)
    end

    # KH -> IAST.
    def kh_iast(str)
      transliterate(str, :kh, :iast)
    end

    # IAST -> SLP1.
    def iast_slp1(str)
      transliterate(str, :iast, :slp1)
    end

    # SLP1 -> IAST.
    def slp1_iast(str)
      transliterate(str, :slp1, :iast)
    end

    # Ask Sanscript to detect the scheme of str.
    def detect(str)
      Sanscript::Detect.detect_scheme(str)
    end

    # Returns a normalized copy of an SLP1 string.
    #
    # Control words that are not yet in processed form are first replaced by
    # a "{##<sha1 hex>##}" token; then the remaining text is simplified.
    def normalize_slp1(st)
      working = st.dup
      working.gsub!(Constants::TRANS_CTRL_WORD) do |control_word|
        content = control_word[Constants::TRANS_CTRL_WORD_CONTENT, 1]
        if content&.match(Constants::TRANS_CTRL_WORD_PROCESSED)
          control_word
        else
          "{###{Digest::SHA1.hexdigest(content).rjust(40, '0')}##}"
        end
      end

      process_string!(working) do |token|
        token.tr("b", "v")
             .gsub(/['‘]\b/, "")             # Avagraha
             .gsub(/\B[NYRnm]/, "M")         # Medial and final nasals
             .gsub(/\B[Hrs]\b/, "")          # Final visarga/r/s
             .gsub(%r{[\.\-\_\\\/]}, "")     # Punctuation
      end
    end

    # Convert an IAST word to SLP1 and normalize it.
    def normalize_iast(word)
      normalize_slp1(iast_slp1(word))
    end

    # Downcase str in place in a Unicode-aware way. Control words are left
    # alone unless ignore_control is true.
    def unicode_downcase!(str, ignore_control = false)
      if ignore_control
        UNICODE_DOWNCASE_PROC.call(str)
      else
        process_string!(str, &UNICODE_DOWNCASE_PROC)
      end
    end

    # Non-destructive variant of #unicode_downcase!.
    def unicode_downcase(st, ignore_control = false)
      unicode_downcase!(st.dup, ignore_control)
    end

    # Decompose, downcase, then recompose; mutates and returns its argument.
    UNICODE_DOWNCASE_PROC = lambda do |text|
      text.unicode_normalize!(:nfd)
      text.downcase!
      text.unicode_normalize!(:nfc)
      text
    end

    private_constant :UNICODE_DOWNCASE_PROC

    class << self
      alias t transliterate

      private

      # Yield str for processing while shielding control words from the block.
      #
      # Control words are swapped for a placeholder before yielding and put
      # back, in their original order, into whatever the block returns.
      # With ignore_control (or when no control word is present) the string
      # is yielded as-is; a string that consists of a single control word is
      # returned without yielding.
      def process_string!(str, ignore_control = false, &_block)
        text = str.to_str
        return yield text if ignore_control

        control_words = text.scan(Constants::TRANS_CTRL_WORD)
        return yield text if control_words.empty?
        return text if control_words.first == text

        text.gsub!(Constants::TRANS_CTRL_WORD, "\u0026\u0026")
        processed = yield text
        processed.gsub!("\u0026\u0026") { control_words.shift }
        processed
      end

      # Same as process_string! but operates on a copy of str.
      def process_string(str, ignore_control = false, &block)
        process_string!(str.dup, ignore_control, &block)
      end
    end
  end
end
|
data/lib/dphil/tree.rb
ADDED
@@ -0,0 +1,142 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Dphil
  #
  # Phylogenetic Tree generated from parsing PAUP output.
  #
  # Immutable.
  #
  class Tree
    include LDOutput

    # Maps the stat labels found in PAUP output to the keys used in #stats.
    PAUP_TREE_STATS = {
      "Tree length" => :length,
      "Consistency index (CI)" => :ci,
      "Homoplasy index (HI)" => :hi,
      "CI excluding uninformative characters" => :ci_ex,
      "HI excluding uninformative characters" => :hi_ex,
      "Retention index (RI)" => :ri,
      "Rescaled consistency index (RC)" => :rc,
    }.freeze

    private_constant :PAUP_TREE_STATS

    attr_reader :id, :nodes, :stats, :tree

    # Build a tree either from PAUP text output or from already-parsed data.
    #
    # id      - tree identifier (coerced with #to_i); opts[:id] wins if given.
    # lengths - String of PAUP branch-length lines (PAUP form).
    # stats   - String of PAUP "label = value" lines (PAUP form).
    # opts    - :id, and for the parsed form both :nodes (Hash of id => node
    #           attributes) and :stats (Hash of stat key => Numeric/nil).
    #
    # Raises ArgumentError when neither input form is supplied.
    def initialize(id = nil, lengths = nil, stats = nil, **opts)
      @id = (opts[:id] || id).to_i
      if lengths.respond_to?(:to_str) && stats.respond_to?(:to_str)
        @nodes = nodes_from_lengths(parse_paup_lengths(lengths))
        @stats = parse_paup_stats(stats)
      elsif (opts.keys & %i[nodes stats]).length == 2
        @nodes = parse_json_nodes(opts[:nodes])
        @stats = parse_json_stats(opts[:stats])
      else
        # Without nodes there is nothing to build; fail with a clear message
        # instead of a NoMethodError further down.
        raise ArgumentError, "Tree requires PAUP `lengths` and `stats` strings " \
                             "or `nodes:` and `stats:` options"
      end
      @tree = tree_from_nodes(nodes)
      IceNine.deep_freeze(self)
    end

    # Returns the tree as a plain Hash (id, root id, nodes and stats).
    def to_h
      {
        id: id,
        root_id: tree.id,
        nodes: nodes,
        stats: stats,
      }
    end

    def as_json(options = nil)
      to_h.as_json(options)
    end

    # Returns the root node as stored in the flat node table.
    def root
      nodes[tree.id]
    end

    # Returns the node with the given id, or nil.
    def get_node(id)
      nodes[id]
    end

    # Returns the node registered under node.parent, or nil.
    def get_parent(node)
      nodes[node.parent]
    end

    # Returns the child nodes of node, or nil when it has no children list.
    def get_children(node)
      node.children&.map { |id| nodes[id] }
    end

    def tree_length
      stats[:length]
    end

    def ci
      stats[:ci]
    end

    private

    # Split PAUP branch-length text into rows of cells (cells are separated
    # by runs of three or more whitespace characters).
    def parse_paup_lengths(lengths)
      lengths.to_s.split("\n").map { |line| line.strip.split(/\s{3,}/) }
    end

    # Parse PAUP "label = value" lines into a Hash keyed by the symbols in
    # PAUP_TREE_STATS. Values containing "." become Floats, others Integers.
    # Lines without " = " and lines with an unrecognized label are skipped.
    def parse_paup_stats(stats)
      stats.to_s.split("\n").each_with_object({}) do |line, acc|
        label, value = line.split(" = ")
        stat = PAUP_TREE_STATS[label&.strip]
        next if stat.nil? || value.nil?
        acc[stat] = value["."] ? value.to_f : value.to_i
      end
    end

    # Build the id => TreeNode table from parsed node data; ids are
    # normalized to Integers.
    def parse_json_nodes(json_nodes)
      json_nodes.each_with_object({}) do |(id, node), acc|
        acc[id.to_s.to_i] = TreeNode.new(node)
      end
    end

    # Validate and copy parsed stats.
    #
    # Raises ArgumentError when a known stat key is missing or a value is
    # neither Numeric nor nil.
    def parse_json_stats(json_stats)
      missing_keys = (PAUP_TREE_STATS.values - json_stats.keys)
      raise ArgumentError, "Missing `stats` keys: #{missing_keys}" unless missing_keys.empty?
      json_stats.each_with_object({}) do |(k, v), acc|
        raise ArgumentError, "Stat `#{k}` is not a Numeric" unless v.is_a?(Numeric) || v.nil?
        acc[k] = v
      end
    end

    # Build the id => TreeNode table from rows of [label, parent, length].
    #
    # The label cell is "name (id)" or a bare id. Parents that have not been
    # seen yet get a placeholder node so that children can be attached; the
    # placeholder is filled in if the parent's own row appears later.
    # Rows whose label cell cannot be parsed (e.g. blank lines) are skipped.
    def nodes_from_lengths(lengths)
      lengths.each_with_object({}) do |row, hash|
        match = row[0]&.match(/^(.*?)\s?\(?([0-9]{1,4})\)?$/)
        next unless match

        name, id = match.captures
        id = id.to_i
        parent = row[1].to_i
        node = TreeNode.new(
          id: id,
          name: (name.present? ? name : "##{id}"),
          length: row[2].to_i,
          parent: parent
        )
        hash[id] ||= TreeNode.new
        hash[id].merge!(node)

        # A parent of 0 marks a node without a parent.
        next if parent.zero?
        hash[parent] ||= TreeNode.new(
          id: parent,
          name: "##{parent}",
          length: 0,
          parent: 0
        )
        hash[parent].children ||= []
        hash[parent].children << id
      end
    end

    # Build the nested tree starting at the first node whose parent is 0.
    # Returns {} when no such node exists.
    def tree_from_nodes(nodes)
      root = nodes.each_value.find { |node| node.parent.zero? }
      return {} if root.blank?
      append_children(nodes, root)
    end

    # Return a copy of node whose children ids are replaced, recursively, by
    # copies of the child nodes themselves.
    def append_children(nodes, node)
      new_node = TreeNode.new(node.to_h)
      return new_node unless new_node.children.present?
      new_node.children = new_node.children.map { |id| append_children(nodes, nodes[id]) }
      new_node
    end
  end
end
|