dphil 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +6 -0
- data/LICENSE +201 -0
- data/README.md +54 -0
- data/Rakefile +11 -0
- data/dphil.gemspec +49 -0
- data/exe/dphil +10 -0
- data/lib/dphil.rb +53 -0
- data/lib/dphil/cache.rb +15 -0
- data/lib/dphil/change_list.rb +6 -0
- data/lib/dphil/character.rb +236 -0
- data/lib/dphil/character_matrix.rb +102 -0
- data/lib/dphil/cli.rb +26 -0
- data/lib/dphil/cli_commands/csv2ld.rb +71 -0
- data/lib/dphil/cli_commands/csv2nex.rb +37 -0
- data/lib/dphil/constants.rb +128 -0
- data/lib/dphil/converter.rb +58 -0
- data/lib/dphil/converters/csv2nex.rb +83 -0
- data/lib/dphil/ld_data_set.rb +25 -0
- data/lib/dphil/ld_output.rb +29 -0
- data/lib/dphil/lemma.rb +44 -0
- data/lib/dphil/lemma_list.rb +179 -0
- data/lib/dphil/log_formatter.rb +39 -0
- data/lib/dphil/logger.rb +27 -0
- data/lib/dphil/metrical_data.rb +78 -0
- data/lib/dphil/newick.rb +52 -0
- data/lib/dphil/paup.rb +34 -0
- data/lib/dphil/refinements.rb +8 -0
- data/lib/dphil/refinements/natural_sort.rb +52 -0
- data/lib/dphil/script_string.rb +124 -0
- data/lib/dphil/syllables.rb +43 -0
- data/lib/dphil/syllables/syllable.rb +45 -0
- data/lib/dphil/tei_xml.rb +142 -0
- data/lib/dphil/transliterate.rb +131 -0
- data/lib/dphil/tree.rb +142 -0
- data/lib/dphil/tree_node.rb +67 -0
- data/lib/dphil/verse.rb +25 -0
- data/lib/dphil/verse_analysis.rb +509 -0
- data/lib/dphil/verse_analysis_new.rb +816 -0
- data/lib/dphil/version.rb +30 -0
- data/vendor/default_commands.paup +18 -0
- data/vendor/metrical_data.yml +4035 -0
- metadata +409 -0
data/lib/dphil/syllables/syllable.rb
ADDED
@@ -0,0 +1,45 @@
+# frozen_string_literal: true
+
+module Dphil
+  class Syllables
+    using ::Ragabash::Refinements
+    class Syllable
+      attr_reader :source, :weight, :parent, :index, :source_script
+
+      def initialize(source, weight, **opts)
+        @source = source.to_str.safe_copy.freeze
+        @weight = weight.to_str.safe_copy.freeze
+        @parent = opts[:parent]
+        @index = opts[:index]&.to_i
+        @source_script = opts[:source_script] || (@parent&.source_script)
+        @slp1 = @source_script == :slp1 ? @source : opts[:slp1]&.to_str&.safe_copy.freeze
+      end
+
+      def inspect
+        "[#{index}]#{source.inspect}(#{weight})"
+      end
+
+      def to_s
+        @source.dup
+      end
+
+      def prev
+        return unless @parent && @index && @index.positive?
+        @parent[@index - 1]
+      end
+
+      def next
+        return unless @parent && @index && @index < @parent.length
+        @parent[@index + 1]
+      end
+
+      def simple_weight
+        @simple_weight ||= weight.upcase.freeze
+      end
+
+      def slp1
+        @slp1 ||= Transliterate.t(@source, @source_script, :slp1).freeze
+      end
+    end
+  end
+end
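The `Syllable` class above is a small value object: it freezes its source text and weight, remembers its position in a parent `Syllables` list, and derives `simple_weight` and an SLP1 form lazily. As a rough usage sketch (not taken from the gem's tests or documentation; the sample string and weight value are invented), a syllable can also be constructed directly:

```ruby
require "dphil"

# Hypothetical standalone syllable; in normal use Dphil::Syllables builds
# these while segmenting a longer string and supplies parent/index itself.
syl = Dphil::Syllables::Syllable.new("ga", "l", index: 0, source_script: :slp1)

syl.to_s          # => "ga"
syl.weight        # => "l"  (whatever weight string was passed in)
syl.simple_weight # => "L"  (weight upcased and memoized)
syl.slp1          # => "ga" (source is already SLP1, so it is reused as-is)
syl.prev          # => nil  (no parent list, so there is no neighbour to return)
```

Without a `parent`, `prev` and `next` simply return `nil`; when the syllable belongs to a `Dphil::Syllables` collection they index into that collection instead.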
data/lib/dphil/tei_xml.rb
ADDED
@@ -0,0 +1,142 @@
+# frozen_string_literal: true
+
+module Dphil
+  class TeiXML
+    using ::Ragabash::Refinements
+    # Public: Initialize a TeiXML object
+    #
+    def initialize(source)
+      source = %(<TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0"></TEI>) if source.strip.empty?
+      @raw_xml = source
+    end
+
+    # Return or re-parse xml
+    def xml
+      @xml ||= begin
+        xml = Nokogiri::XML(@raw_xml) { |config| config.strict.noent }
+        xml.encoding = "UTF-8"
+        xml.remove_namespaces!
+        xml_normalize!(xml)
+      rescue Nokogiri::XML::SyntaxError => e
+        raise "TEIDocument (source: #{@raw_xml}) caught exception: #{e}"
+      end
+    end
+
+    def to_xml
+      xml.to_xml
+    end
+
+    alias to_s to_xml
+
+    def empty?
+      xml.xpath("//text()[normalize-space()]").empty?
+    end
+
+    # Public: Return a portion of the document as a new document
+    #
+    # expr - a CSS selector or XPath expression
+    #
+    # Returns a new document.
+    def crop(expr)
+      segment = xml.search(expr)
+      pb = page_of(segment)
+      lb = line_of(segment)
+
+      source = <<~EOS
+        <TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0">
+        <pre>#{pb&.to_xml}#{lb&.to_xml}</pre>
+        #{segment.to_xml}
+        <post></post>
+        </TEI>
+      EOS
+      self.class.new(source)
+    end
+
+    def crop_each(expr)
+      xml.search(expr).map do |segment|
+        pb = page_of(segment)
+        lb = line_of(segment)
+
+        source = <<~EOS
+          <TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0">
+          <pre>#{pb&.to_xml}#{lb&.to_xml}</pre>
+          #{segment.to_xml}
+          <post></post>
+          </TEI>
+        EOS
+        self.class.new(source)
+      end
+    end
+
+    # Public: Remove elements from the document based on CSS selector.
+    #
+    # expr - a CSS selector or XPath expression
+    #
+    # Returns a new document.
+    def reject(expr)
+      source = xml.dup
+      source.search(expr).each do |node|
+        node.replace(node.search("pb, lb"))
+      end
+      self.class.new(source.to_xml)
+    end
+
+    # Public: Substitute elements from the document based on CSS selector with
+    # ID-based token text-nodes.
+    #
+    # expr - a CSS selector or XPath expression
+    # subst_text - an optional text identifier
+    #
+    # Returns a new document.
+    def subst(expr, subst_text = nil)
+      source = parsed_xml.dup
+      subst_text = subst_text.to_s.gsub(/\s+/, "_") unless subst_text.nil?
+
+      source.search(expr).each do |node|
+        set = Nokogiri::XML::NodeSet.new(source)
+        escaped_text = ":#{node.attribute('id').to_s.gsub(/\s+/, '_')}"
+        text_content = "#{subst_text || node.name}#{escaped_text}"
+        set << Nokogiri::XML::Text.new(" {{#{text_content}}} ", source)
+        node.replace(set + node.search("pb, lb"))
+      end
+      self.class.new(source.to_xml)
+    end
+
+    private
+
+    # Get nearest prior <pb/> node.
+    #
+    # id - node in document to start search from.
+    #
+    # Returns an XML node.
+    def page_of(node)
+      node.xpath("preceding::*[name() = 'pb'][1]")
+    end
+
+    # Get nearest prior <lb/> node with everything in between.
+    #
+    # node - node in document to start search from.
+    #
+    # Returns an XML node.
+    def line_of(node)
+      node.xpath("preceding::*[name() = 'lb'][1]")
+    end
+
+    # Normalize (mostly) whitespace in the XML.
+    def xml_normalize!(doc)
+      doc.search("//text()").each do |text_node|
+        text_node.content = text_node.content.gsub(%r{\s+[\s\.\-\\\/\_]*}, " ")
+      end
+
+      # Remove empty modification tags.
+      doc.search(
+        "//add[not(node())]|" \
+        "//del[not(node())]|" \
+        "//mod[not(node())]|" \
+        "//unclear[not(node())]|" \
+        "//g[not(node())]"
+      ).remove
+      doc
+    end
+  end
+end
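The `TeiXML` wrapper parses its source with Nokogiri, strips namespaces, and normalizes whitespace up front; `crop` and `crop_each` carve out matched elements while carrying along the nearest preceding `<pb/>` and `<lb/>` milestones, and `reject` removes matched elements but keeps any page or line breaks inside them. A hedged sketch of typical calls (the TEI fragment below is invented for the example):

```ruby
require "dphil"

doc = Dphil::TeiXML.new(<<~XML)
  <TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0">
    <pb n="1r"/><lb n="1"/>
    <lg xml:id="v1"><l>dharmakṣetre kurukṣetre</l></lg>
  </TEI>
XML

doc.empty?                 # => false
verse = doc.crop("lg")     # new TeiXML wrapping the <lg>, prefixed by the prior <pb/>/<lb/>
no_lines = doc.reject("l") # new TeiXML with <l> elements dropped, their <pb>/<lb> (if any) kept
```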
data/lib/dphil/transliterate.rb
ADDED
@@ -0,0 +1,131 @@
+# frozen_string_literal: true
+
+require "sanscript"
+
+module Dphil
+  # Transliteration module for basic romanization formats.
+  module Transliterate
+    using ::Ragabash::Refinements
+
+    @default_script = nil
+
+    module_function
+
+    def default_script
+      @default_script
+    end
+
+    def default_script=(scr)
+      scr = scr.to_sym
+      if script_supported?(scr)
+        @default_script = scr
+      else
+        warn "Script unsupported [:#{scr}]"
+      end
+    end
+
+    def transliterate(str, first, second = nil)
+      Sanscript.transliterate(str, first, second, default_script: default_script)
+    rescue RuntimeError => e
+      Dphil.logger.error "Transliteration Error: #{e}"
+      return str
+    end
+
+    def script_supported?(script)
+      Sanscript::Transliterate.scheme_names.include?(script)
+    end
+
+    def to_ascii(str)
+      process_string(str) do |out|
+        out.unicode_normalize!(:nfd)
+        out.gsub!(/[^\u0000-\u007F]+/, "")
+        out
+      end
+    end
+
+    def iast_kh(str)
+      transliterate(str, :iast, :kh)
+    end
+
+    def kh_iast(str)
+      transliterate(str, :kh, :iast)
+    end
+
+    def iast_slp1(str)
+      transliterate(str, :iast, :slp1)
+    end
+
+    def slp1_iast(str)
+      transliterate(str, :slp1, :iast)
+    end
+
+    def detect(str)
+      Sanscript::Detect.detect_scheme(str)
+    end
+
+    def normalize_slp1(st)
+      out = st.dup
+      out.gsub!(Constants::TRANS_CTRL_WORD) do |match|
+        control_content = match[Constants::TRANS_CTRL_WORD_CONTENT, 1]
+        next match if control_content&.match(Constants::TRANS_CTRL_WORD_PROCESSED)
+        "{###{Digest::SHA1.hexdigest(control_content).rjust(40, '0')}##}"
+      end
+
+      process_string!(out) do |token|
+        token.tr!("b", "v")
+        token.gsub!(/['‘]\b/, "") # Avagraha
+        token.gsub!(/\B[NYRnm]/, "M") # Medial and final nasals
+        token.gsub!(/\B[Hrs]\b/, "") # Final visarga/r/s
+        token.gsub!(%r{[\.\-\_\\\/]}, "") # Punctuation
+        token
+      end
+    end
+
+    def normalize_iast(word)
+      out = iast_slp1(word)
+      normalize_slp1(out)
+    end
+
+    def unicode_downcase!(str, ignore_control = false)
+      return UNICODE_DOWNCASE_PROC.call(str) if ignore_control
+      process_string!(str, &UNICODE_DOWNCASE_PROC)
+    end
+
+    def unicode_downcase(st, ignore_control = false)
+      unicode_downcase!(st.dup, ignore_control)
+    end
+
+    UNICODE_DOWNCASE_PROC = lambda do |str|
+      str.unicode_normalize!(:nfd)
+      str.downcase!
+      str.unicode_normalize!(:nfc)
+      str
+    end
+
+    private_constant :UNICODE_DOWNCASE_PROC
+
+    class << self
+      alias t transliterate
+
+      private
+
+      def process_string!(str, ignore_control = false, &_block)
+        str = str.to_str
+        return yield str if ignore_control
+
+        scan = str.scan(Constants::TRANS_CTRL_WORD)
+        return yield str if scan.empty?
+        return str if scan.first == str
+
+        str.gsub!(Constants::TRANS_CTRL_WORD, "\u0026\u0026")
+        str = yield str
+        str.gsub!("\u0026\u0026") { scan.shift }
+        str
+      end
+
+      def process_string(str, ignore_control = false, &block)
+        process_string!(str.dup, ignore_control, &block)
+      end
+    end
+  end
+end
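`Transliterate` is a thin, module-function wrapper around the `sanscript` gem plus the SLP1 normalization used elsewhere in dphil. A short illustrative sketch (the sample strings and the expected return values are mine, not from the gem's documentation):

```ruby
require "dphil"

Dphil::Transliterate.default_script = :iast

Dphil::Transliterate.iast_slp1("dharmakṣetre") # => "Darmakzetre"
Dphil::Transliterate.slp1_iast("DarmakzetrE")  # => "dharmakṣetrai"
Dphil::Transliterate.to_ascii("kurukṣetre")    # => "kuruksetre" (diacritics dropped after NFD)
Dphil::Transliterate.detect("kurukṣetre")      # likely :iast, via Sanscript::Detect
Dphil::Transliterate.t("kṣetre", :iast, :slp1) # `t` is the class-level alias for #transliterate
```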
data/lib/dphil/tree.rb
ADDED
@@ -0,0 +1,142 @@
+# frozen_string_literal: true
+
+module Dphil
+  #
+  # Phylogenetic Tree generated from parsing PAUP output.
+  #
+  # Immutable.
+  #
+  class Tree
+    include LDOutput
+    attr_reader :id, :nodes, :stats, :tree
+
+    def initialize(id = nil, lengths = nil, stats = nil, **opts)
+      @id = (opts[:id] || id).to_i
+      if lengths.respond_to?(:to_str) && stats.respond_to?(:to_str)
+        @nodes = nodes_from_lengths(parse_paup_lengths(lengths))
+        @stats = parse_paup_stats(stats)
+      elsif (opts.keys & %i[nodes stats]).length == 2
+        @nodes = parse_json_nodes(opts[:nodes])
+        @stats = parse_json_stats(opts[:stats])
+      end
+      @tree = tree_from_nodes(nodes)
+      IceNine.deep_freeze(self)
+    end
+
+    def to_h
+      {
+        id: id,
+        root_id: tree.id,
+        nodes: nodes,
+        stats: stats,
+      }
+    end
+
+    def as_json(options = nil)
+      to_h.as_json(options)
+    end
+
+    def root
+      nodes[tree.id]
+    end
+
+    def get_node(id)
+      nodes[id]
+    end
+
+    def get_parent(node)
+      nodes[node.parent]
+    end
+
+    def get_children(node)
+      node.children&.map { |id| nodes[id] }
+    end
+
+    def tree_length
+      stats[:length]
+    end
+
+    def ci
+      stats[:ci]
+    end
+
+    private
+
+    PAUP_TREE_STATS = {
+      "Tree length" => :length,
+      "Consistency index (CI)" => :ci,
+      "Homoplasy index (HI)" => :hi,
+      "CI excluding uninformative characters" => :ci_ex,
+      "HI excluding uninformative characters" => :hi_ex,
+      "Retention index (RI)" => :ri,
+      "Rescaled consistency index (RC)" => :rc,
+    }.freeze
+
+    private_constant :PAUP_TREE_STATS
+
+    def parse_paup_lengths(lengths)
+      lengths.to_s&.split("\n")&.map { |l| l.strip.split(/\s{3,}/) }
+    end
+
+    def parse_paup_stats(stats)
+      stats.to_s&.split("\n")&.each_with_object({}) do |l, acc|
+        key, val = l.split(" = ")
+        acc[PAUP_TREE_STATS[key]] = (val["."] ? val.to_f : val.to_i)
+      end
+    end
+
+    def parse_json_nodes(json_nodes)
+      json_nodes.each_with_object({}) do |(id, node), acc|
+        acc[id.to_s.to_i] = TreeNode.new(node)
+      end
+    end
+
+    def parse_json_stats(json_stats)
+      missing_keys = (PAUP_TREE_STATS.values - json_stats.keys)
+      raise ArgumentError, "Missing `stats` keys: #{missing_keys}" unless missing_keys.empty?
+      json_stats.each_with_object({}) do |(k, v), acc|
+        raise ArgumentError, "Stat `#{k}` is not a Numeric" unless v.is_a?(Numeric) || v.nil?
+        acc[k] = v
+      end
+    end
+
+    def nodes_from_lengths(lengths)
+      lengths.each_with_object({}) do |arr, hash|
+        name, id = arr[0].match(/^(.*?)\s?\(?([0-9]{1,4})\)?$/).captures
+        id = id.to_i
+        parent = arr[1].to_i
+        node = TreeNode.new(
+          id: id,
+          name: (name.present? ? name : "##{id}"),
+          length: arr[2].to_i,
+          parent: parent
+        )
+        hash[id] ||= TreeNode.new
+        hash[id].merge!(node)
+
+        next if parent.zero?
+        hash[parent] ||= TreeNode.new(
+          id: parent,
+          name: (parent.to_i.zero? ? parent : "##{parent}"),
+          length: 0,
+          parent: 0
+        )
+        hash[parent].children ||= []
+        hash[parent].children << id
+      end
+    end
+
+    def tree_from_nodes(nodes)
+      root = nodes.select { |_, node| node.parent.zero? }&.first&.last
+      return {} if root.blank?
+      append_children(nodes, root)
+    end
+
+    def append_children(nodes, node)
+      new_node = TreeNode.new(node.to_h)
+      return new_node unless new_node.children.present?
+      new_node.children = new_node.children.map { |id| append_children(nodes, nodes[id]) }
+      new_node
+    end
+  end
+end
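`Tree` can be built either from PAUP output (two text blocks) or from already-parsed `nodes`/`stats` options. A hypothetical sketch of the PAUP path follows: the branch-length table uses columns separated by three or more spaces, as `parse_paup_lengths` expects, and the score block uses the `"key = value"` lines mapped by `PAUP_TREE_STATS`. All taxa and numbers are invented, and the child bookkeeping assumes `TreeNode#merge!` keeps previously assigned children, as its use in `nodes_from_lengths` implies.

```ruby
require "dphil"

# Invented stand-in for a PAUP branch-length table: "name (id)   parent   length".
lengths = <<~LENGTHS
  MS_A (1)      4      12
  MS_B (2)      4      9
  Node_4 (4)    0      3
LENGTHS

# Invented stand-in for the PAUP tree-score block.
stats = <<~STATS
  Tree length = 24
  Consistency index (CI) = 0.8750
  Homoplasy index (HI) = 0.1250
  CI excluding uninformative characters = 0.8000
  HI excluding uninformative characters = 0.2000
  Retention index (RI) = 0.9000
  Rescaled consistency index (RC) = 0.7875
STATS

tree = Dphil::Tree.new(1, lengths, stats)

tree.tree_length                        # => 24
tree.ci                                 # => 0.875
tree.root.id                            # => 4, the only node whose parent is 0
tree.get_children(tree.root)&.map(&:id) # expected: [1, 2]
```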