dphil 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +6 -0
- data/LICENSE +201 -0
- data/README.md +54 -0
- data/Rakefile +11 -0
- data/dphil.gemspec +49 -0
- data/exe/dphil +10 -0
- data/lib/dphil.rb +53 -0
- data/lib/dphil/cache.rb +15 -0
- data/lib/dphil/change_list.rb +6 -0
- data/lib/dphil/character.rb +236 -0
- data/lib/dphil/character_matrix.rb +102 -0
- data/lib/dphil/cli.rb +26 -0
- data/lib/dphil/cli_commands/csv2ld.rb +71 -0
- data/lib/dphil/cli_commands/csv2nex.rb +37 -0
- data/lib/dphil/constants.rb +128 -0
- data/lib/dphil/converter.rb +58 -0
- data/lib/dphil/converters/csv2nex.rb +83 -0
- data/lib/dphil/ld_data_set.rb +25 -0
- data/lib/dphil/ld_output.rb +29 -0
- data/lib/dphil/lemma.rb +44 -0
- data/lib/dphil/lemma_list.rb +179 -0
- data/lib/dphil/log_formatter.rb +39 -0
- data/lib/dphil/logger.rb +27 -0
- data/lib/dphil/metrical_data.rb +78 -0
- data/lib/dphil/newick.rb +52 -0
- data/lib/dphil/paup.rb +34 -0
- data/lib/dphil/refinements.rb +8 -0
- data/lib/dphil/refinements/natural_sort.rb +52 -0
- data/lib/dphil/script_string.rb +124 -0
- data/lib/dphil/syllables.rb +43 -0
- data/lib/dphil/syllables/syllable.rb +45 -0
- data/lib/dphil/tei_xml.rb +142 -0
- data/lib/dphil/transliterate.rb +131 -0
- data/lib/dphil/tree.rb +142 -0
- data/lib/dphil/tree_node.rb +67 -0
- data/lib/dphil/verse.rb +25 -0
- data/lib/dphil/verse_analysis.rb +509 -0
- data/lib/dphil/verse_analysis_new.rb +816 -0
- data/lib/dphil/version.rb +30 -0
- data/vendor/default_commands.paup +18 -0
- data/vendor/metrical_data.yml +4035 -0
- metadata +409 -0
data/lib/dphil/cache.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_support/cache"
|
4
|
+
require "active_support/notifications"
|
5
|
+
|
6
|
+
module Dphil
  module_function

  # Fetch a value from the library-wide cache, computing and storing it on a miss
  # when a block is supplied.
  #
  # The backing store is chosen on first use: +Rails.cache+ when it is defined,
  # otherwise an in-process +ActiveSupport::Cache::MemoryStore+.
  #
  # @param key [#to_s] logical name of the cached value
  # @param params [Object, nil] optional parameters that distinguish variants of
  #   +key+; a SHA1 digest of +params.to_s+ is appended to the cache key
  # @yield computes the value on a cache miss (forwarded to the store's +fetch+)
  # @return [Object, nil] whatever the store's +fetch+ returns for the full key
  def cache(key, params = nil, &block)
    @cache ||= defined?(::Rails.cache) ? ::Rails.cache : ActiveSupport::Cache::MemoryStore.new(size: 16_384)

    # Keys are namespaced by gem version so entries written by another version
    # of the library are never reused.
    key_parts = ["Dphil-#{Dphil::VERSION}:cache:#{key}"]
    key_parts << Digest::SHA1.base64digest(params.to_s) unless params.nil?

    # The block is captured explicitly: a bare `Proc.new` without a block no
    # longer captures the caller's block on Ruby 3.0+ and raises instead.
    # Passing `&nil` is the same as passing no block at all.
    @cache.fetch(key_parts.join(":"), &block)
  end
end
|
@@ -0,0 +1,236 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Dphil
  #
  # Phylogenetic character for storing states and symbols.
  #
  # A character maps taxon IDs to normalized text-states and assigns every
  # distinct text-state a one-letter symbol, the most frequent state first.
  #
  # Immutable: the data built by the constructor is frozen, and every derived
  # view is computed lazily and frozen before it is returned.
  #
  class Character
    include Dphil::LDOutput

    # Symbols handed out to states in order of assignment ("A".."Z", "a".."z");
    # its size is the maximum number of distinct states a character may have.
    SYMBOL_ARRAY = IceNine.deep_freeze([*"A".."Z", *"a".."z"])
    private_constant :SYMBOL_ARRAY

    # Instantiates a new Character
    # @overload initialize(id = nil, states = nil)
    #   @param id [Integer] a character ID
    #   @param states [Hash<Integer, String>] taxa and text-states +{ taxon_id => text_state }+
    # @overload initialize(**opts = {})
    #   @param [Hash] opts options or keyword values
    #   @option opts [Integer] :id a character ID
    #   @option opts [Hash<Integer, String>] :states taxa and text-states +{ taxon_id => text_state }+
    # @raise [ArgumentError] if there are more distinct states than available symbols
    def initialize(id = nil, states = nil, **opts)
      # Keyword values win over positional ones; a missing ID stays nil.
      @id = (opts[:id] || id)&.to_s.to_i
      @taxa_states = (opts[:states] || states)
                     .to_h.each_with_object({}) do |(taxon, state), acc|
        # Taxa with no reading are left out of the character entirely.
        next if state.blank?
        taxon = taxon.to_s if taxon.is_a?(Symbol)
        acc[taxon.to_i] = normalize_text(state)
      end

      unique_states = weighted_uniq(@taxa_states.values)
      if unique_states.size > SYMBOL_ARRAY.size
        raise ArgumentError,
              "Too many states (found #{unique_states.size}, " \
              "max #{SYMBOL_ARRAY.size})"
      end

      # Symbols follow the frequency ordering produced by #weighted_uniq.
      @states = {}
      @state_totals = unique_states
      unique_states.each_key.with_index do |state, index|
        @states[SYMBOL_ARRAY[index]] = state
      end
      instance_variables.each { |ivar| instance_variable_get(ivar).freeze }
    end

    # @!attribute [r] id
    # @return [Integer] character ID
    attr_reader :id

    # @!attribute [r] taxa
    # @return [Set<Integer>] taxon IDs
    def taxa
      @taxa ||= Set.new(taxa_states.keys).freeze
    end

    # @!attribute [r] states
    # @return [Hash<String, String>] text-states by symbol
    attr_reader :states

    # @!attribute [r] symbols
    # @return [Hash<String, String>] symbols by text-state
    def symbols
      @symbols ||= states.invert.freeze
    end

    # @!attribute [r] state_list
    # @return [Array<String>] text-states
    def state_list
      @state_list ||= states.values.freeze
    end

    # @!attribute [r] symbol_list
    # @return [Array<String>] symbols
    def symbol_list
      @symbol_list ||= states.keys.freeze
    end

    # @!attribute [r] state_totals
    # @return [Hash<String, Integer>] character state totals by text-state
    attr_reader :state_totals

    # @!attribute [r] symbol_totals
    # @return [Hash<String, Integer>] character state totals by symbol
    def symbol_totals
      @symbol_totals ||= state_totals.transform_keys { |state| symbols[state] }.freeze
    end

    # @!attribute [r] taxa_states
    # @return [Hash<Integer, String>] text-states by taxon ID
    attr_reader :taxa_states

    # @!attribute [r] taxa_symbols
    # @return [Hash<Integer, String>] symbols by taxon ID
    def taxa_symbols
      @taxa_symbols ||= taxa_states.transform_values { |state| symbols[state] }.freeze
    end

    # @!attribute [r] states_taxa
    # @return [Hash<String, Array<Integer>>] taxa IDs by text-state
    def states_taxa
      @states_taxa ||= begin
        # Seed the hash from +states+ so keys keep the symbol (frequency) order,
        # then distribute the taxa in a single pass.
        grouped = states.each_value.each_with_object({}) { |state, acc| acc[state] = [] }
        taxa_states.each { |taxon, state| grouped[state] << taxon }
        grouped.freeze
      end
    end

    # @!attribute [r] symbols_taxa
    # @return [Hash<String, Array<Integer>>] taxa IDs by symbol
    def symbols_taxa
      @symbols_taxa ||= states_taxa.transform_keys { |state| symbols[state] }.freeze
    end

    # Get state from symbol
    # @param symbol [String] a symbol
    # @return [String, nil] the associated text-state, or Nil if not found
    def get_state(symbol)
      states[normalize_text(symbol)]
    end

    # Get symbol from state
    # @param state [String] a text-state
    # @return [String, nil] the associated symbol, or Nil if not found
    def get_symbol(state)
      symbols[normalize_text(state)]
    end

    # Get taxa from state
    # @param state [String] a text-state
    # @return [Array<Integer>, nil] the associated taxa IDs, or Nil if not found
    def get_taxa_state(state)
      states_taxa[normalize_text(state)]
    end

    # Get taxa from symbol
    # @param symbol [String] a symbol
    # @return [Array<Integer>, nil] the associated taxa IDs, or Nil if not found
    def get_taxa_symbol(symbol)
      symbols_taxa[normalize_text(symbol)]
    end

    # Get state from taxon
    # @param taxon_id [Integer] a taxon ID
    # @return [String, nil] the associated text-state, or Nil if not found
    def get_state_taxon(taxon_id)
      taxa_states[taxon_id.to_i]
    end

    # Get symbol from taxon
    # @param taxon_id [Integer] a taxon ID
    # @return [String, nil] the associated symbol, or Nil if not found
    def get_symbol_taxon(taxon_id)
      taxa_symbols[taxon_id.to_i]
    end

    # Check if character is parsimony-informative
    # (At least 2 variants occurring in at least 2 places)
    # @return [Boolean] whether the character provides useful information
    def informative?
      # `||=` would recompute a false result on every call, so test for the
      # instance variable itself.
      return @informative if defined?(@informative)
      @informative = states.size > 1 && states_taxa.count { |_, v| v.size > 1 } > 1
    end

    # Check if the character is invariant
    # @return [Boolean] whether the character is constant (invariant)
    def constant?
      return @constant if defined?(@constant)
      @constant = states.size <= 1
    end

    # @return [Hash] the character's data and derived flags, keyed by symbol name
    def to_h
      {
        id: id,
        states: states,
        symbols: symbols,
        state_totals: state_totals,
        taxa_states: taxa_states,
        states_taxa: states_taxa,
        is_informative: informative?,
        is_constant: constant?,
      }
    end

    # @param options [Hash, nil] passed through to +Hash#as_json+
    # @return [Hash] JSON-compatible representation of {#to_h}
    def as_json(options = nil)
      to_h.as_json(options)
    end

    # Pretty-print the object
    # (used by Pry in particular)
    def pretty_print(q)
      q.object_group(self) do
        q.breakable
        q.group(1) do
          q.text "@id=#{id}"
          q.breakable
          q.group(1, "{", "}") do
            q.seplist(states) do |symbol, state|
              q.text "#{state.inspect}(#{symbol})=#{states_taxa[state]}"
            end
          end
        end
      end
    end

    # @return [String] a string representation of the object.
    def inspect
      pretty_inspect.chomp
    end
    alias to_s inspect

    private

    # @param text [String] an arbitrary string of text
    # @return [String, nil] a Unicode-normalized, stripped, frozen copy
    #   (nil when +text+ is nil)
    def normalize_text(text)
      return if text.nil?
      text = UNF::Normalizer.normalize(text.to_s, :nfc)
      text.strip!
      text.freeze
    end

    # Find all unique elements in an array and stably sort them by frequency.
    # @param array [Array]
    # @return [Hash] keys are unique input array elements, values are frequency
    def weighted_uniq(array)
      counts = array.each_with_object({}) do |value, acc|
        acc[value] = acc.fetch(value, 0) + 1
      end

      # Most frequent first; the first-seen position breaks ties so the sort
      # is stable.
      counts.sort_by.with_index { |(_, count), index| [-count, index] }.to_h
    end
  end
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Dphil
  #
  # A matrix of character states across taxa.
  #
  # Built from a collation table whose first row names the taxa and whose
  # remaining rows each hold one character's text-states, one cell per taxon.
  #
  class CharacterMatrix
    include LDOutput

    # Instantiate a new CharacterMatrix from a UTF-8 CSV file
    # @param infile [String, Pathname] path of the CSV file to read
    # @param transpose [Boolean] transpose the table 90° (headers in first column)
    # @return [CharacterMatrix]
    def self.from_csv(infile, transpose: false)
      # CSV.open takes the file mode as a positional argument, whereas newer
      # releases of the csv library define CSV.read with keyword options only.
      # The "bom|utf-8" mode strips a leading byte-order mark if present.
      csv = CSV.open(infile, "r:bom|utf-8", &:read)
      csv = csv.transpose if transpose
      new(csv)
    end

    # Instantiate a new CharacterMatrix
    # @param table [Array<Array<String>>] collation table (headers in first row)
    # @raise [ArgumentError] if the table has no header row
    def initialize(table)
      rows = table.to_a
      header = rows.first
      raise ArgumentError, "Collation table must contain a header row" if header.nil?

      # Taxon IDs are the 1-based column positions of the header row.
      @taxa_names = header.each_with_index.each_with_object({}) do |(name, index), acc|
        acc[index + 1] = normalize_text(name)
      end
      @taxa_ids = @taxa_names.invert

      # Pair cells with IDs taken from the columns themselves: inverting the
      # names drops duplicate headers, which would shift every later cell onto
      # the wrong taxon.
      taxa_arr = @taxa_names.keys
      @characters = (1...rows.length).each_with_object({}) do |char_num, acc|
        char_states = taxa_arr.zip(rows[char_num]).to_h
        acc[char_num] = Dphil::Character.new(id: char_num, states: char_states)
      end

      instance_variables.each { |ivar| instance_variable_get(ivar).freeze }
    end

    # @!attribute [r] taxa_names
    # @return [Hash<Integer, String>] taxa names by ID
    attr_reader :taxa_names

    # @!attribute [r] taxa_ids
    # @return [Hash<String, Integer>] taxa IDs by names
    attr_reader :taxa_ids

    # @!attribute [r] characters
    # @return [Hash<Integer, Character>] characters by character ID
    attr_reader :characters

    # @!attribute [r] stats
    # @return [Hash] the character statistics for the matrix
    #   (+:total+, +:constant+, +:uninformative+, +:informative+ counts)
    def stats
      @stats ||= begin
        hash = {
          total: characters.count,
          constant: 0,
          uninformative: 0,
          informative: 0,
        }
        # Each character lands in exactly one bucket; constancy is checked first.
        characters.each_value do |char|
          bucket = if char.constant?
                     :constant
                   elsif char.informative?
                     :informative
                   else
                     :uninformative
                   end
          hash[bucket] += 1
        end
        hash
      end.freeze
    end

    # Get character by ID
    # @param char_id [Integer] a character ID
    # @return [Character, nil] the associated Character, or Nil if not found.
    def get_character(char_id)
      characters[char_id.to_i]
    end

    # @return [Hash] taxa names and characters, keyed by symbol name
    def to_h
      {
        taxa_names: taxa_names,
        characters: characters,
      }
    end

    # @param options [Hash, nil] passed through to +Hash#as_json+
    # @return [Hash] JSON-compatible representation of {#to_h}
    def as_json(options = nil)
      to_h.as_json(options)
    end

    private

    # @param text [String] an arbitrary string of text
    # @return [String, nil] a Unicode-normalized, stripped, frozen copy
    #   (nil when +text+ is nil)
    def normalize_text(text)
      return if text.nil?
      text = UNF::Normalizer.normalize(text.to_s, :nfc)
      text.strip!
      text.freeze
    end
  end
end
|
data/lib/dphil/cli.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "dphil"
|
4
|
+
require "gli"
|
5
|
+
|
6
|
+
module Dphil
  #
  # Command-line front end for the library, built on GLI.
  #
  # This module is not loaded automatically with the rest of the gem.
  #
  module CLI
    extend GLI::App

    # Program-level metadata and argument handling
    program_desc("UBC Sanskrit digital philology CLI tool")
    version(Dphil::VERSION)
    subcommand_option_handling(:normal)
    arguments(:strict)

    # Global switch
    desc("Be verbose in output")
    switch(:verbose, negatable: false)

    # Pull in the individual command definitions
    commands_from("dphil/cli_commands")
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Defines the `csv2ld` CLI command: converts a CSV collation into NEXUS, runs
# PAUP on it inside a temporary directory, and assembles the character matrix
# and resulting trees into a JSON-LD dataset.
Dphil::CLI.module_eval do
  desc "Convert a CSV-format collation file into a JSON-LD dataset"
  long_desc <<~EOS
    Convert a CSV-format collation file into a JSON-LD dataset, generating trees
    using PAUP as part of the process.
    This expects each column of the CSV to represent data for a single taxon,
    and the first row to contain the names of the taxa.
  EOS

  arg :csv_file

  command :csv2ld do |c|
    c.desc "Transpose rows/columns in CSV"
    c.switch :t, :transpose, negatable: false

    c.desc "Specify the location of the PAUP executable"
    c.flag :p, :paup_cmd, arg_name: "file", default_value: "paup4"

    c.desc "Include custom PAUP commands from a file in PAUP block of NEXUS output"
    c.flag :d, :paup_data, arg_name: "file"

    c.desc "Write JSON-LD output to file instead of STDOUT"
    c.flag :o, :outfile, arg_name: "file"

    c.action do |_, copts, args|
      # Check that PAUP command exists
      paup_cmd = `command -v #{Shellwords.shellescape(copts[:paup_cmd])}`.strip
      raise "PAUP command `#{copts[:paup_cmd]}` could not be found." if paup_cmd.empty?

      # Resolve the CSV input to an absolute path before changing directory
      csv_file = Pathname.new(args[0]).realpath

      # Both block forms return the value of their block, so the generated
      # JSON is handed back as a local instead of being stored on `self`.
      dataset_ld = Dir.mktmpdir("dphil-csv2ld") do |dir|
        Dir.chdir(dir) do
          # Run Csv2Nex conversion
          File.write("csv2ld.nex", Dphil::Csv2NexConverter.new(csv_file, copts).convert)

          # Run PAUP; the resolved path is escaped so that locations containing
          # spaces or shell metacharacters still run. Output is captured and
          # discarded, keeping STDOUT clean for the JSON-LD.
          `#{Shellwords.shellescape(paup_cmd)} -n csv2ld.nex`

          # Compile JSON-LD Dataset
          # NOTE(review): "paup.log" and "con.tree" are presumably written by
          # the PAUP commands embedded in the NEXUS file -- confirm.
          matrix = Dphil::CharacterMatrix.from_csv(csv_file, transpose: copts[:transpose])
          paup_trees = Dphil::PAUP.parse_trees("paup.log")
          trees = paup_trees.each_with_object({}) do |(k, v), acc|
            # Only integer-keyed entries are turned into trees.
            next unless k.is_a?(Integer)
            acc[k] = Dphil::Tree.new(k, v[:lengths], v[:stats])
          end

          # The consensus tree is stored under key 0; taxa names are matched
          # with hyphens and underscores replaced by spaces.
          cons_tree = Dphil::NewickTree.tree_from_nex(
            "con.tree",
            taxa_map: matrix.taxa_names.transform_values { |v| v.tr("_-", " ") }
          )
          trees[0] = cons_tree

          dataset = Dphil::LDDataSet.new(matrix: matrix, trees: trees)
          JSON.pretty_generate(dataset.as_jsonld)
        end
      end

      if copts[:outfile].nil?
        puts dataset_ld
      else
        # Write to the same expanded path that the message reports.
        abs_outfile = Pathname.new(copts[:outfile]).expand_path
        rel_outfile = abs_outfile.relative_path_from(Pathname.getwd)
        puts "#{File.write(abs_outfile, dataset_ld)} bytes written to #{rel_outfile}"
      end
    end
  end
end
|