dphil 0.1.4

Files changed (44)
  1. checksums.yaml +7 -0
  2. data/CODE_OF_CONDUCT.md +49 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE +201 -0
  5. data/README.md +54 -0
  6. data/Rakefile +11 -0
  7. data/dphil.gemspec +49 -0
  8. data/exe/dphil +10 -0
  9. data/lib/dphil.rb +53 -0
  10. data/lib/dphil/cache.rb +15 -0
  11. data/lib/dphil/change_list.rb +6 -0
  12. data/lib/dphil/character.rb +236 -0
  13. data/lib/dphil/character_matrix.rb +102 -0
  14. data/lib/dphil/cli.rb +26 -0
  15. data/lib/dphil/cli_commands/csv2ld.rb +71 -0
  16. data/lib/dphil/cli_commands/csv2nex.rb +37 -0
  17. data/lib/dphil/constants.rb +128 -0
  18. data/lib/dphil/converter.rb +58 -0
  19. data/lib/dphil/converters/csv2nex.rb +83 -0
  20. data/lib/dphil/ld_data_set.rb +25 -0
  21. data/lib/dphil/ld_output.rb +29 -0
  22. data/lib/dphil/lemma.rb +44 -0
  23. data/lib/dphil/lemma_list.rb +179 -0
  24. data/lib/dphil/log_formatter.rb +39 -0
  25. data/lib/dphil/logger.rb +27 -0
  26. data/lib/dphil/metrical_data.rb +78 -0
  27. data/lib/dphil/newick.rb +52 -0
  28. data/lib/dphil/paup.rb +34 -0
  29. data/lib/dphil/refinements.rb +8 -0
  30. data/lib/dphil/refinements/natural_sort.rb +52 -0
  31. data/lib/dphil/script_string.rb +124 -0
  32. data/lib/dphil/syllables.rb +43 -0
  33. data/lib/dphil/syllables/syllable.rb +45 -0
  34. data/lib/dphil/tei_xml.rb +142 -0
  35. data/lib/dphil/transliterate.rb +131 -0
  36. data/lib/dphil/tree.rb +142 -0
  37. data/lib/dphil/tree_node.rb +67 -0
  38. data/lib/dphil/verse.rb +25 -0
  39. data/lib/dphil/verse_analysis.rb +509 -0
  40. data/lib/dphil/verse_analysis_new.rb +816 -0
  41. data/lib/dphil/version.rb +30 -0
  42. data/vendor/default_commands.paup +18 -0
  43. data/vendor/metrical_data.yml +4035 -0
  44. metadata +409 -0
data/lib/dphil/syllables/syllable.rb
@@ -0,0 +1,45 @@
+ # frozen_string_literal: true
+
+ module Dphil
+   class Syllables
+     using ::Ragabash::Refinements
+     class Syllable
+       attr_reader :source, :weight, :parent, :index, :source_script
+
+       def initialize(source, weight, **opts)
+         @source = source.to_str.safe_copy.freeze
+         @weight = weight.to_str.safe_copy.freeze
+         @parent = opts[:parent]
+         @index = opts[:index]&.to_i
+         @source_script = opts[:source_script] || (@parent&.source_script)
+         @slp1 = @source_script == :slp1 ? @source : opts[:slp1]&.to_str&.safe_copy.freeze
+       end
+
+       def inspect
+         "[#{index}]#{source.inspect}(#{weight})"
+       end
+
+       def to_s
+         @source.dup
+       end
+
+       def prev
+         return unless @parent && @index && @index.positive?
+         @parent[@index - 1]
+       end
+
+       def next
+         return unless @parent && @index && @index < @parent.length
+         @parent[@index + 1]
+       end
+
+       def simple_weight
+         @simple_weight ||= weight.upcase.freeze
+       end
+
+       def slp1
+         @slp1 ||= Transliterate.t(@source, @source_script, :slp1).freeze
+       end
+     end
+   end
+ end
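
A minimal usage sketch of the Syllable class above (illustrative, not taken from the gem's docs or tests; the "g" weight string and the :iast script tag are assumptions):

  syllable = Dphil::Syllables::Syllable.new("dhar", "g", index: 0, source_script: :iast)
  syllable.inspect        # => [0]"dhar"(g)
  syllable.simple_weight  # => "G" (memoized, upcased weight)
  syllable.slp1           # => "Dar" (lazily transliterated via Transliterate.t)
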
data/lib/dphil/tei_xml.rb
@@ -0,0 +1,142 @@
+ # frozen_string_literal: true
+
+ module Dphil
+   class TeiXML
+     using ::Ragabash::Refinements
+     # Public: Initialize a TeiXML object
+     #
+     def initialize(source)
+       source = %(<TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0"></TEI>) if source.strip.empty?
+       @raw_xml = source
+     end
+
+     # Return or re-parse xml
+     def xml
+       @xml ||= begin
+         xml = Nokogiri::XML(@raw_xml) { |config| config.strict.noent }
+         xml.encoding = "UTF-8"
+         xml.remove_namespaces!
+         xml_normalize!(xml)
+       rescue Nokogiri::XML::SyntaxError => e
+         raise "TEIDocument (source: #{@raw_xml}) caught exception: #{e}"
+       end
+     end
+
+     def to_xml
+       xml.to_xml
+     end
+
+     alias to_s to_xml
+
+     def empty?
+       xml.xpath("//text()[normalize-space()]").empty?
+     end
+
+     # Public: Return a portion of the document as a new document
+     #
+     # expr - a CSS selector or XPath expression
+     #
+     # Returns a new document.
+     def crop(expr)
+       segment = xml.search(expr)
+       pb = page_of(segment)
+       lb = line_of(segment)
+
+       source = <<~EOS
+         <TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0">
+         <pre>#{pb&.to_xml}#{lb&.to_xml}</pre>
+         #{segment.to_xml}
+         <post></post>
+         </TEI>
+       EOS
+       self.class.new(source)
+     end
+
+     def crop_each(expr)
+       xml.search(expr).map do |segment|
+         pb = page_of(segment)
+         lb = line_of(segment)
+
+         source = <<~EOS
+           <TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0">
+           <pre>#{pb&.to_xml}#{lb&.to_xml}</pre>
+           #{segment.to_xml}
+           <post></post>
+           </TEI>
+         EOS
+         self.class.new(source)
+       end
+     end
+
+     # Public: Remove elements from the document based on CSS selector.
+     #
+     # expr - a CSS selector or XPath expression
+     #
+     # Returns a new document.
+     def reject(expr)
+       source = xml.dup
+       source.search(expr).each do |node|
+         node.replace(node.search("pb, lb"))
+       end
+       self.class.new(source.to_xml)
+     end
+
+     # Public: Substitute elements from the document based on CSS selector with
+     # ID-based token text-nodes.
+     #
+     # expr - a CSS selector or XPath expression
+     # subst_text - an optional text identifier
+     #
+     # Returns a new document.
+     def subst(expr, subst_text = nil)
+       source = xml.dup
+       subst_text = subst_text.to_s.gsub(/\s+/, "_") unless subst_text.nil?
+
+       source.search(expr).each do |node|
+         set = Nokogiri::XML::NodeSet.new(source)
+         escaped_text = ":#{node.attribute('id').to_s.gsub(/\s+/, '_')}"
+         text_content = "#{subst_text || node.name}#{escaped_text}"
+         set << Nokogiri::XML::Text.new(" {{#{text_content}}} ", source)
+         node.replace(set + node.search("pb, lb"))
+       end
+       self.class.new(source.to_xml)
+     end
+
+     private
+
+     # Get nearest prior <pb/> node.
+     #
+     # node - node in document to start search from.
+     #
+     # Returns an XML node.
+     def page_of(node)
+       node.xpath("preceding::*[name() = 'pb'][1]")
+     end
+
+     # Get nearest prior <lb/> node with everything in between.
+     #
+     # node - node in document to start search from.
+     #
+     # Returns an XML node.
+     def line_of(node)
+       node.xpath("preceding::*[name() = 'lb'][1]")
+     end
+
+     # Normalize (mostly) whitespace in the XML.
+     def xml_normalize!(doc)
+       doc.search("//text()").each do |text_node|
+         text_node.content = text_node.content.gsub(%r{\s+[\s\.\-\\\/\_]*}, " ")
+       end
+
+       # Remove empty modification tags.
+       doc.search(
+         "//add[not(node())]|" \
+         "//del[not(node())]|" \
+         "//mod[not(node())]|" \
+         "//unclear[not(node())]|" \
+         "//g[not(node())]"
+       ).remove
+       doc
+     end
+   end
+ end
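
A rough sketch of how this wrapper might be driven (the file name and selectors are invented for illustration; they are not from the gem's documentation):

  doc = Dphil::TeiXML.new(File.read("manuscript.xml"))  # hypothetical TEI file
  verse  = doc.crop("lg")             # keep only <lg> elements, plus the nearest preceding <pb/>/<lb/>
  clean  = verse.reject("note")       # drop editorial <note> elements, keeping any <pb/>/<lb/> inside them
  tokens = clean.subst("gap", "gap")  # replace <gap/> elements with " {{gap:ID}} " text tokens
  puts tokens.to_xml
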
data/lib/dphil/transliterate.rb
@@ -0,0 +1,131 @@
+ # frozen_string_literal: true
+
+ require "sanscript"
+
+ module Dphil
+   # Transliteration module for basic romanization formats.
+   module Transliterate
+     using ::Ragabash::Refinements
+
+     @default_script = nil
+
+     module_function
+
+     def default_script
+       @default_script
+     end
+
+     def default_script=(scr)
+       scr = scr.to_sym
+       if script_supported?(scr)
+         @default_script = scr
+       else
+         warn "Script unsupported [:#{scr}]"
+       end
+     end
+
+     def transliterate(str, first, second = nil)
+       Sanscript.transliterate(str, first, second, default_script: default_script)
+     rescue RuntimeError => e
+       Dphil.logger.error "Transliteration Error: #{e}"
+       return str
+     end
+
+     def script_supported?(script)
+       Sanscript::Transliterate.scheme_names.include?(script)
+     end
+
+     def to_ascii(str)
+       process_string(str) do |out|
+         out.unicode_normalize!(:nfd)
+         out.gsub!(/[^\u0000-\u007F]+/, "")
+         out
+       end
+     end
+
+     def iast_kh(str)
+       transliterate(str, :iast, :kh)
+     end
+
+     def kh_iast(str)
+       transliterate(str, :kh, :iast)
+     end
+
+     def iast_slp1(str)
+       transliterate(str, :iast, :slp1)
+     end
+
+     def slp1_iast(str)
+       transliterate(str, :slp1, :iast)
+     end
+
+     def detect(str)
+       Sanscript::Detect.detect_scheme(str)
+     end
+
+     def normalize_slp1(st)
+       out = st.dup
+       out.gsub!(Constants::TRANS_CTRL_WORD) do |match|
+         control_content = match[Constants::TRANS_CTRL_WORD_CONTENT, 1]
+         next match if control_content&.match(Constants::TRANS_CTRL_WORD_PROCESSED)
+         "{###{Digest::SHA1.hexdigest(control_content).rjust(40, '0')}##}"
+       end
+
+       process_string!(out) do |token|
+         token.tr!("b", "v")
+         token.gsub!(/['‘]\b/, "") # Avagraha
+         token.gsub!(/\B[NYRnm]/, "M") # Medial and final nasals
+         token.gsub!(/\B[Hrs]\b/, "") # Final visarga/r/s
+         token.gsub!(%r{[\.\-\_\\\/]}, "") # Punctuation
+         token
+       end
+     end
+
+     def normalize_iast(word)
+       out = iast_slp1(word)
+       normalize_slp1(out)
+     end
+
+     def unicode_downcase!(str, ignore_control = false)
+       return UNICODE_DOWNCASE_PROC.call(str) if ignore_control
+       process_string!(str, &UNICODE_DOWNCASE_PROC)
+     end
+
+     def unicode_downcase(st, ignore_control = false)
+       unicode_downcase!(st.dup, ignore_control)
+     end
+
+     UNICODE_DOWNCASE_PROC = lambda do |str|
+       str.unicode_normalize!(:nfd)
+       str.downcase!
+       str.unicode_normalize!(:nfc)
+       str
+     end
+
+     private_constant :UNICODE_DOWNCASE_PROC
+
+     class << self
+       alias t transliterate
+
+       private
+
+       def process_string!(str, ignore_control = false, &_block)
+         str = str.to_str
+         return yield str if ignore_control
+
+         scan = str.scan(Constants::TRANS_CTRL_WORD)
+         return yield str if scan.empty?
+         return str if scan.first == str
+
+         str.gsub!(Constants::TRANS_CTRL_WORD, "\u0026\u0026")
+         str = yield str
+         str.gsub!("\u0026\u0026") { scan.shift }
+         str
+       end
+
+       def process_string(str, ignore_control = false, &block)
+         process_string!(str.dup, ignore_control, &block)
+       end
+     end
+   end
+ end
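
A few hedged examples of the public helpers above (the IAST input is illustrative; the transliteration itself is delegated to the sanscript gem):

  Dphil::Transliterate.default_script = :iast
  Dphil::Transliterate.iast_slp1("dharmakṣetre")       # => "Darmakzetre"
  Dphil::Transliterate.detect("dharmakṣetre")          # => :iast (scheme detection via Sanscript::Detect)
  Dphil::Transliterate.t("Darmakzetre", :slp1, :iast)  # `t` is the alias for `transliterate`
  Dphil::Transliterate.normalize_iast("dharmakṣetre")  # SLP1 form with nasals, final visarga/r/s, and b/v levelled for comparison
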
data/lib/dphil/tree.rb
@@ -0,0 +1,142 @@
+ # frozen_string_literal: true
+
+ module Dphil
+   #
+   # Phylogenetic Tree generated from parsing PAUP output.
+   #
+   # Immutable.
+   #
+   class Tree
+     include LDOutput
+     attr_reader :id, :nodes, :stats, :tree
+
+     def initialize(id = nil, lengths = nil, stats = nil, **opts)
+       @id = (opts[:id] || id).to_i
+       if lengths.respond_to?(:to_str) && stats.respond_to?(:to_str)
+         @nodes = nodes_from_lengths(parse_paup_lengths(lengths))
+         @stats = parse_paup_stats(stats)
+       elsif (opts.keys & %i[nodes stats]).length == 2
+         @nodes = parse_json_nodes(opts[:nodes])
+         @stats = parse_json_stats(opts[:stats])
+       end
+       @tree = tree_from_nodes(nodes)
+       IceNine.deep_freeze(self)
+     end
+
+     def to_h
+       {
+         id: id,
+         root_id: tree.id,
+         nodes: nodes,
+         stats: stats,
+       }
+     end
+
+     def as_json(options = nil)
+       to_h.as_json(options)
+     end
+
+     def root
+       nodes[tree.id]
+     end
+
+     def get_node(id)
+       nodes[id]
+     end
+
+     def get_parent(node)
+       nodes[node.parent]
+     end
+
+     def get_children(node)
+       node.children&.map { |id| nodes[id] }
+     end
+
+     def tree_length
+       stats[:length]
+     end
+
+     def ci
+       stats[:ci]
+     end
+
+     private
+
+     PAUP_TREE_STATS = {
+       "Tree length" => :length,
+       "Consistency index (CI)" => :ci,
+       "Homoplasy index (HI)" => :hi,
+       "CI excluding uninformative characters" => :ci_ex,
+       "HI excluding uninformative characters" => :hi_ex,
+       "Retention index (RI)" => :ri,
+       "Rescaled consistency index (RC)" => :rc,
+     }.freeze
+
+     private_constant :PAUP_TREE_STATS
+
+     def parse_paup_lengths(lengths)
+       lengths.to_s&.split("\n")&.map { |l| l.strip.split(/\s{3,}/) }
+     end
+
+     def parse_paup_stats(stats)
+       stats.to_s&.split("\n")&.each_with_object({}) do |l, acc|
+         key, val = l.split(" = ")
+         acc[PAUP_TREE_STATS[key]] = (val["."] ? val.to_f : val.to_i)
+       end
+     end
+
+     def parse_json_nodes(json_nodes)
+       json_nodes.each_with_object({}) do |(id, node), acc|
+         acc[id.to_s.to_i] = TreeNode.new(node)
+       end
+     end
+
+     def parse_json_stats(json_stats)
+       missing_keys = (PAUP_TREE_STATS.values - json_stats.keys)
+       raise ArgumentError, "Missing `stats` keys: #{missing_keys}" unless missing_keys.empty?
+       json_stats.each_with_object({}) do |(k, v), acc|
+         raise ArgumentError, "Stat `#{k}` is not a Numeric" unless v.is_a?(Numeric) || v.nil?
+         acc[k] = v
+       end
+     end
+
+     def nodes_from_lengths(lengths)
+       lengths.each_with_object({}) do |arr, hash|
+         name, id = arr[0].match(/^(.*?)\s?\(?([0-9]{1,4})\)?$/).captures
+         id = id.to_i
+         parent = arr[1].to_i
+         node = TreeNode.new(
+           id: id,
+           name: (name.present? ? name : "##{id}"),
+           length: arr[2].to_i,
+           parent: parent
+         )
+         hash[id] ||= TreeNode.new
+         hash[id].merge!(node)
+
+         next if parent.zero?
+         hash[parent] ||= TreeNode.new(
+           id: parent,
+           name: (parent.to_i.zero? ? parent : "##{parent}"),
+           length: 0,
+           parent: 0
+         )
+         hash[parent].children ||= []
+         hash[parent].children << id
+       end
+     end
+
+     def tree_from_nodes(nodes)
+       root = nodes.select { |_, node| node.parent.zero? }&.first&.last
+       return {} if root.blank?
+       append_children(nodes, root)
+     end
+
+     def append_children(nodes, node)
+       new_node = TreeNode.new(node.to_h)
+       return new_node unless new_node.children.present?
+       new_node.children = new_node.children.map { |id| append_children(nodes, nodes[id]) }
+       new_node
+     end
+   end
+ end
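
To make the parsing paths concrete, a hedged construction sketch; the branch table and statistics are fabricated to fit the parsers above (columns separated by three or more spaces, stats as "key = value" lines), not verbatim PAUP* output:

  lengths = <<~LEN
    MS_A (1)      6      12
    MS_B (2)      6      7
    MS_C (3)      7      5
    MS_D (4)      7      9
    6             7      3
  LEN

  stats = <<~STATS
    Tree length = 36
    Consistency index (CI) = 0.8000
    Homoplasy index (HI) = 0.2000
    CI excluding uninformative characters = 0.7500
    HI excluding uninformative characters = 0.2500
    Retention index (RI) = 0.6667
    Rescaled consistency index (RC) = 0.5333
  STATS

  tree = Dphil::Tree.new(1, lengths, stats)
  tree.tree_length    # => 36
  tree.ci             # => 0.8
  tree.root.children  # => [3, 4, 6] (ids of the children of the implied root node #7)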