dphil 0.1.4

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. checksums.yaml +7 -0
  2. data/CODE_OF_CONDUCT.md +49 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE +201 -0
  5. data/README.md +54 -0
  6. data/Rakefile +11 -0
  7. data/dphil.gemspec +49 -0
  8. data/exe/dphil +10 -0
  9. data/lib/dphil.rb +53 -0
  10. data/lib/dphil/cache.rb +15 -0
  11. data/lib/dphil/change_list.rb +6 -0
  12. data/lib/dphil/character.rb +236 -0
  13. data/lib/dphil/character_matrix.rb +102 -0
  14. data/lib/dphil/cli.rb +26 -0
  15. data/lib/dphil/cli_commands/csv2ld.rb +71 -0
  16. data/lib/dphil/cli_commands/csv2nex.rb +37 -0
  17. data/lib/dphil/constants.rb +128 -0
  18. data/lib/dphil/converter.rb +58 -0
  19. data/lib/dphil/converters/csv2nex.rb +83 -0
  20. data/lib/dphil/ld_data_set.rb +25 -0
  21. data/lib/dphil/ld_output.rb +29 -0
  22. data/lib/dphil/lemma.rb +44 -0
  23. data/lib/dphil/lemma_list.rb +179 -0
  24. data/lib/dphil/log_formatter.rb +39 -0
  25. data/lib/dphil/logger.rb +27 -0
  26. data/lib/dphil/metrical_data.rb +78 -0
  27. data/lib/dphil/newick.rb +52 -0
  28. data/lib/dphil/paup.rb +34 -0
  29. data/lib/dphil/refinements.rb +8 -0
  30. data/lib/dphil/refinements/natural_sort.rb +52 -0
  31. data/lib/dphil/script_string.rb +124 -0
  32. data/lib/dphil/syllables.rb +43 -0
  33. data/lib/dphil/syllables/syllable.rb +45 -0
  34. data/lib/dphil/tei_xml.rb +142 -0
  35. data/lib/dphil/transliterate.rb +131 -0
  36. data/lib/dphil/tree.rb +142 -0
  37. data/lib/dphil/tree_node.rb +67 -0
  38. data/lib/dphil/verse.rb +25 -0
  39. data/lib/dphil/verse_analysis.rb +509 -0
  40. data/lib/dphil/verse_analysis_new.rb +816 -0
  41. data/lib/dphil/version.rb +30 -0
  42. data/vendor/default_commands.paup +18 -0
  43. data/vendor/metrical_data.yml +4035 -0
  44. metadata +409 -0
data/lib/dphil/syllables/syllable.rb
@@ -0,0 +1,45 @@
+ # frozen_string_literal: true
+
+ module Dphil
+   class Syllables
+     using ::Ragabash::Refinements
+     class Syllable
+       attr_reader :source, :weight, :parent, :index, :source_script
+
+       def initialize(source, weight, **opts)
+         @source = source.to_str.safe_copy.freeze
+         @weight = weight.to_str.safe_copy.freeze
+         @parent = opts[:parent]
+         @index = opts[:index]&.to_i
+         @source_script = opts[:source_script] || (@parent&.source_script)
+         @slp1 = @source_script == :slp1 ? @source : opts[:slp1]&.to_str&.safe_copy.freeze
+       end
+
+       def inspect
+         "[#{index}]#{source.inspect}(#{weight})"
+       end
+
+       def to_s
+         @source.dup
+       end
+
+       def prev
+         return unless @parent && @index && @index.positive?
+         @parent[@index - 1]
+       end
+
+       def next
+         return unless @parent && @index && @index < @parent.length
+         @parent[@index + 1]
+       end
+
+       def simple_weight
+         @simple_weight ||= weight.upcase.freeze
+       end
+
+       def slp1
+         @slp1 ||= Transliterate.t(@source, @source_script, :slp1).freeze
+       end
+     end
+   end
+ end
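
A minimal usage sketch of the Syllable class above; the sample string "ca", the weight "l", and the keyword values are illustrative assumptions, not taken from the gem:

    # Hypothetical standalone construction (normally a Dphil::Syllables list builds these).
    syl = Dphil::Syllables::Syllable.new("ca", "l", index: 0, source_script: :iast)
    syl.to_s          # => "ca"
    syl.simple_weight # => "L" (upcased copy of the weight)
    syl.inspect       # => "[0]\"ca\"(l)"
    syl.slp1          # transliterates the source into SLP1 on first use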
data/lib/dphil/tei_xml.rb
@@ -0,0 +1,142 @@
+ # frozen_string_literal: true
+
+ module Dphil
+   class TeiXML
+     using ::Ragabash::Refinements
+     # Public: Initialize a TeiXML object
+     #
+     def initialize(source)
+       source = %(<TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0"></TEI>) if source.strip.empty?
+       @raw_xml = source
+     end
+
+     # Return or re-parse xml
+     def xml
+       @xml ||= begin
+         xml = Nokogiri::XML(@raw_xml) { |config| config.strict.noent }
+         xml.encoding = "UTF-8"
+         xml.remove_namespaces!
+         xml_normalize!(xml)
+       rescue Nokogiri::XML::SyntaxError => e
+         raise "TEIDocument (source: #{@raw_xml}) caught exception: #{e}"
+       end
+     end
+
+     def to_xml
+       xml.to_xml
+     end
+
+     alias to_s to_xml
+
+     def empty?
+       xml.xpath("//text()[normalize-space()]").empty?
+     end
+
+     # Public: Return a portion of the document as a new document
+     #
+     # expr - a CSS selector or XPath expression
+     #
+     # Returns a new document.
+     def crop(expr)
+       segment = xml.search(expr)
+       pb = page_of(segment)
+       lb = line_of(segment)
+
+       source = <<~EOS
+         <TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0">
+         <pre>#{pb&.to_xml}#{lb&.to_xml}</pre>
+         #{segment.to_xml}
+         <post></post>
+         </TEI>
+       EOS
+       self.class.new(source)
+     end
+
+     def crop_each(expr)
+       xml.search(expr).map do |segment|
+         pb = page_of(segment)
+         lb = line_of(segment)
+
+         source = <<~EOS
+           <TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0">
+           <pre>#{pb&.to_xml}#{lb&.to_xml}</pre>
+           #{segment.to_xml}
+           <post></post>
+           </TEI>
+         EOS
+         self.class.new(source)
+       end
+     end
+
+     # Public: Remove elements from the document based on CSS selector.
+     #
+     # expr - a CSS selector or XPath expression
+     #
+     # Returns a new document.
+     def reject(expr)
+       source = xml.dup
+       source.search(expr).each do |node|
+         node.replace(node.search("pb, lb"))
+       end
+       self.class.new(source.to_xml)
+     end
+
+     # Public: Substitute elements from the document based on CSS selector with
+     # ID-based token text-nodes.
+     #
+     # expr - a CSS selector or XPath expression
+     # subst_text - an optional text identifier
+     #
+     # Returns a new document.
+     def subst(expr, subst_text = nil)
+       source = xml.dup
+       subst_text = subst_text.to_s.gsub(/\s+/, "_") unless subst_text.nil?
+
+       source.search(expr).each do |node|
+         set = Nokogiri::XML::NodeSet.new(source)
+         escaped_text = ":#{node.attribute('id').to_s.gsub(/\s+/, '_')}"
+         text_content = "#{subst_text || node.name}#{escaped_text}"
+         set << Nokogiri::XML::Text.new(" {{#{text_content}}} ", source)
+         node.replace(set + node.search("pb, lb"))
+       end
+       self.class.new(source.to_xml)
+     end
+
+     private
+
+     # Get nearest prior <pb/> node.
+     #
+     # node - node in document to start search from.
+     #
+     # Returns an XML node.
+     def page_of(node)
+       node.xpath("preceding::*[name() = 'pb'][1]")
+     end
+
+     # Get nearest prior <lb/> node with everything in between.
+     #
+     # node - node in document to start search from.
+     #
+     # Returns an XML node.
+     def line_of(node)
+       node.xpath("preceding::*[name() = 'lb'][1]")
+     end
+
+     # Normalize (mostly) whitespace in the XML.
+     def xml_normalize!(doc)
+       doc.search("//text()").each do |text_node|
+         text_node.content = text_node.content.gsub(%r{\s+[\s\.\-\\\/\_]*}, " ")
+       end
+
+       # Remove empty modification tags.
+       doc.search(
+         "//add[not(node())]|" \
+         "//del[not(node())]|" \
+         "//mod[not(node())]|" \
+         "//unclear[not(node())]|" \
+         "//g[not(node())]"
+       ).remove
+       doc
+     end
+   end
+ end
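
A hedged sketch of how the TeiXML API above might be used; the TEI fragment and the selectors are invented for illustration:

    tei = Dphil::TeiXML.new(<<~XML)
      <TEI version="5.0" xmlns="http://www.tei-c.org/ns/1.0">
        <pb n="1"/><lb n="1"/>
        <lg xml:id="v1">verse text</lg>
        <note>editorial remark</note>
      </TEI>
    XML

    verse  = tei.crop("lg")      # new document: the <lg> plus the preceding <pb>/<lb>
    clean  = tei.reject("note")  # new document with <note> removed (nested pb/lb retained)
    tokens = tei.subst("note")   # <note> replaced by a " {{note:ID}} " text token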
data/lib/dphil/transliterate.rb
@@ -0,0 +1,131 @@
+ # frozen_string_literal: true
+
+ require "sanscript"
+
+ module Dphil
+   # Transliteration module for basic romanization formats.
+   module Transliterate
+     using ::Ragabash::Refinements
+
+     @default_script = nil
+
+     module_function
+
+     def default_script
+       @default_script
+     end
+
+     def default_script=(scr)
+       scr = scr.to_sym
+       if script_supported?(scr)
+         @default_script = scr
+       else
+         warn "Script unsupported [:#{scr}]"
+       end
+     end
+
+     def transliterate(str, first, second = nil)
+       Sanscript.transliterate(str, first, second, default_script: default_script)
+     rescue RuntimeError => e
+       Dphil.logger.error "Transliteration Error: #{e}"
+       return str
+     end
+
+     def script_supported?(script)
+       Sanscript::Transliterate.scheme_names.include?(script)
+     end
+
+     def to_ascii(str)
+       process_string(str) do |out|
+         out.unicode_normalize!(:nfd)
+         out.gsub!(/[^\u0000-\u007F]+/, "")
+         out
+       end
+     end
+
+     def iast_kh(str)
+       transliterate(str, :iast, :kh)
+     end
+
+     def kh_iast(str)
+       transliterate(str, :kh, :iast)
+     end
+
+     def iast_slp1(str)
+       transliterate(str, :iast, :slp1)
+     end
+
+     def slp1_iast(str)
+       transliterate(str, :slp1, :iast)
+     end
+
+     def detect(str)
+       Sanscript::Detect.detect_scheme(str)
+     end
+
+     def normalize_slp1(st)
+       out = st.dup
+       out.gsub!(Constants::TRANS_CTRL_WORD) do |match|
+         control_content = match[Constants::TRANS_CTRL_WORD_CONTENT, 1]
+         next match if control_content&.match(Constants::TRANS_CTRL_WORD_PROCESSED)
+         "{###{Digest::SHA1.hexdigest(control_content).rjust(40, '0')}##}"
+       end
+
+       process_string!(out) do |token|
+         token.tr!("b", "v")
+         token.gsub!(/['‘]\b/, "") # Avagraha
+         token.gsub!(/\B[NYRnm]/, "M") # Medial and final nasals
+         token.gsub!(/\B[Hrs]\b/, "") # Final visarga/r/s
+         token.gsub!(%r{[\.\-\_\\\/]}, "") # Punctuation
+         token
+       end
+     end
+
+     def normalize_iast(word)
+       out = iast_slp1(word)
+       normalize_slp1(out)
+     end
+
+     def unicode_downcase!(str, ignore_control = false)
+       return UNICODE_DOWNCASE_PROC.call(str) if ignore_control
+       process_string!(str, &UNICODE_DOWNCASE_PROC)
+     end
+
+     def unicode_downcase(st, ignore_control = false)
+       unicode_downcase!(st.dup, ignore_control)
+     end
+
+     UNICODE_DOWNCASE_PROC = lambda do |str|
+       str.unicode_normalize!(:nfd)
+       str.downcase!
+       str.unicode_normalize!(:nfc)
+       str
+     end
+
+     private_constant :UNICODE_DOWNCASE_PROC
+
+     class << self
+       alias t transliterate
+
+       private
+
+       def process_string!(str, ignore_control = false, &_block)
+         str = str.to_str
+         return yield str if ignore_control
+
+         scan = str.scan(Constants::TRANS_CTRL_WORD)
+         return yield str if scan.empty?
+         return str if scan.first == str
+
+         str.gsub!(Constants::TRANS_CTRL_WORD, "\u0026\u0026")
+         str = yield str
+         str.gsub!("\u0026\u0026") { scan.shift }
+         str
+       end
+
+       def process_string(str, ignore_control = false, &block)
+         process_string!(str.dup, ignore_control, &block)
+       end
+     end
+   end
+ end
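
A brief usage sketch of the Transliterate module above; the sample word is illustrative and the exact detected scheme and normalized output depend on Constants and the sanscript gem:

    Dphil::Transliterate.default_script = :iast
    Dphil::Transliterate.detect("dharmakṣetre")          # scheme symbol via Sanscript::Detect (expected :iast)
    Dphil::Transliterate.iast_slp1("dharmakṣetre")       # => "Darmakzetre"
    Dphil::Transliterate.t("Darmakzetre", :slp1, :iast)  # `t` is an alias for .transliterate
    Dphil::Transliterate.normalize_iast("dharmakṣetre")  # lossy normalized SLP1 form for comparison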
data/lib/dphil/tree.rb
@@ -0,0 +1,142 @@
+ # frozen_string_literal: true
+
+ module Dphil
+   #
+   # Phylogenetic Tree generated from parsing PAUP output.
+   #
+   # Immutable.
+   #
+   class Tree
+     include LDOutput
+     attr_reader :id, :nodes, :stats, :tree
+
+     def initialize(id = nil, lengths = nil, stats = nil, **opts)
+       @id = (opts[:id] || id).to_i
+       if lengths.respond_to?(:to_str) && stats.respond_to?(:to_str)
+         @nodes = nodes_from_lengths(parse_paup_lengths(lengths))
+         @stats = parse_paup_stats(stats)
+       elsif (opts.keys & %i[nodes stats]).length == 2
+         @nodes = parse_json_nodes(opts[:nodes])
+         @stats = parse_json_stats(opts[:stats])
+       end
+       @tree = tree_from_nodes(nodes)
+       IceNine.deep_freeze(self)
+     end
+
+     def to_h
+       {
+         id: id,
+         root_id: tree.id,
+         nodes: nodes,
+         stats: stats,
+       }
+     end
+
+     def as_json(options = nil)
+       to_h.as_json(options)
+     end
+
+     def root
+       nodes[tree.id]
+     end
+
+     def get_node(id)
+       nodes[id]
+     end
+
+     def get_parent(node)
+       nodes[node.parent]
+     end
+
+     def get_children(node)
+       node.children&.map { |id| nodes[id] }
+     end
+
+     def tree_length
+       stats[:length]
+     end
+
+     def ci
+       stats[:ci]
+     end
+
+     private
+
+     PAUP_TREE_STATS = {
+       "Tree length" => :length,
+       "Consistency index (CI)" => :ci,
+       "Homoplasy index (HI)" => :hi,
+       "CI excluding uninformative characters" => :ci_ex,
+       "HI excluding uninformative characters" => :hi_ex,
+       "Retention index (RI)" => :ri,
+       "Rescaled consistency index (RC)" => :rc,
+     }.freeze
+
+     private_constant :PAUP_TREE_STATS
+
+     def parse_paup_lengths(lengths)
+       lengths.to_s&.split("\n")&.map { |l| l.strip.split(/\s{3,}/) }
+     end
+
+     def parse_paup_stats(stats)
+       stats.to_s&.split("\n")&.each_with_object({}) do |l, acc|
+         key, val = l.split(" = ")
+         acc[PAUP_TREE_STATS[key]] = (val["."] ? val.to_f : val.to_i)
+       end
+     end
+
+     def parse_json_nodes(json_nodes)
+       json_nodes.each_with_object({}) do |(id, node), acc|
+         acc[id.to_s.to_i] = TreeNode.new(node)
+       end
+     end
+
+     def parse_json_stats(json_stats)
+       missing_keys = (PAUP_TREE_STATS.values - json_stats.keys)
+       raise ArgumentError, "Missing `stats` keys: #{missing_keys}" unless missing_keys.empty?
+       json_stats.each_with_object({}) do |(k, v), acc|
+         raise ArgumentError, "Stat `#{k}` is not a Numeric" unless v.is_a?(Numeric) || v.nil?
+         acc[k] = v
+       end
+     end
+
+     def nodes_from_lengths(lengths)
+       lengths.each_with_object({}) do |arr, hash|
+         name, id = arr[0].match(/^(.*?)\s?\(?([0-9]{1,4})\)?$/).captures
+         id = id.to_i
+         parent = arr[1].to_i
+         node = TreeNode.new(
+           id: id,
+           name: (name.present? ? name : "##{id}"),
+           length: arr[2].to_i,
+           parent: parent
+         )
+         hash[id] ||= TreeNode.new
+         hash[id].merge!(node)
+
+         next if parent.zero?
+         hash[parent] ||= TreeNode.new(
+           id: parent,
+           name: (parent.to_i.zero? ? parent : "##{parent}"),
+           length: 0,
+           parent: 0
+         )
+         hash[parent].children ||= []
+         hash[parent].children << id
+       end
+     end
+
+     def tree_from_nodes(nodes)
+       root = nodes.select { |_, node| node.parent.zero? }&.first&.last
+       return {} if root.blank?
+       append_children(nodes, root)
+     end
+
+     def append_children(nodes, node)
+       new_node = TreeNode.new(node.to_h)
+       return new_node unless new_node.children.present?
+       new_node.children = new_node.children.map { |id| append_children(nodes, nodes[id]) }
+       new_node
+     end
+   end
+ end
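
A hedged sketch of building a Tree from PAUP-style branch-length and statistics text; both strings below are fabricated to fit the parsing code above, not copied from real PAUP output:

    lengths = <<~LEN
      MS_A (1)      3      2
      MS_B (2)      3      5
      root (3)      0      0
    LEN

    stats = <<~STATS
      Tree length = 7
      Consistency index (CI) = 0.857
      Homoplasy index (HI) = 0.143
      CI excluding uninformative characters = 0.857
      HI excluding uninformative characters = 0.143
      Retention index (RI) = 0.9
      Rescaled consistency index (RC) = 0.771
    STATS

    tree = Dphil::Tree.new(1, lengths, stats)
    tree.tree_length # => 7
    tree.ci          # => 0.857
    tree.root        # root TreeNode (the node whose parent is 0)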