dphil 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +6 -0
- data/LICENSE +201 -0
- data/README.md +54 -0
- data/Rakefile +11 -0
- data/dphil.gemspec +49 -0
- data/exe/dphil +10 -0
- data/lib/dphil.rb +53 -0
- data/lib/dphil/cache.rb +15 -0
- data/lib/dphil/change_list.rb +6 -0
- data/lib/dphil/character.rb +236 -0
- data/lib/dphil/character_matrix.rb +102 -0
- data/lib/dphil/cli.rb +26 -0
- data/lib/dphil/cli_commands/csv2ld.rb +71 -0
- data/lib/dphil/cli_commands/csv2nex.rb +37 -0
- data/lib/dphil/constants.rb +128 -0
- data/lib/dphil/converter.rb +58 -0
- data/lib/dphil/converters/csv2nex.rb +83 -0
- data/lib/dphil/ld_data_set.rb +25 -0
- data/lib/dphil/ld_output.rb +29 -0
- data/lib/dphil/lemma.rb +44 -0
- data/lib/dphil/lemma_list.rb +179 -0
- data/lib/dphil/log_formatter.rb +39 -0
- data/lib/dphil/logger.rb +27 -0
- data/lib/dphil/metrical_data.rb +78 -0
- data/lib/dphil/newick.rb +52 -0
- data/lib/dphil/paup.rb +34 -0
- data/lib/dphil/refinements.rb +8 -0
- data/lib/dphil/refinements/natural_sort.rb +52 -0
- data/lib/dphil/script_string.rb +124 -0
- data/lib/dphil/syllables.rb +43 -0
- data/lib/dphil/syllables/syllable.rb +45 -0
- data/lib/dphil/tei_xml.rb +142 -0
- data/lib/dphil/transliterate.rb +131 -0
- data/lib/dphil/tree.rb +142 -0
- data/lib/dphil/tree_node.rb +67 -0
- data/lib/dphil/verse.rb +25 -0
- data/lib/dphil/verse_analysis.rb +509 -0
- data/lib/dphil/verse_analysis_new.rb +816 -0
- data/lib/dphil/version.rb +30 -0
- data/vendor/default_commands.paup +18 -0
- data/vendor/metrical_data.yml +4035 -0
- metadata +409 -0
data/lib/dphil/cache.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "active_support/cache"
|
4
|
+
require "active_support/notifications"
|
5
|
+
|
6
|
+
module Dphil
  module_function

  # Fetch a value from the shared Dphil cache, optionally computing it on a miss.
  #
  # The backing store is +Rails.cache+ when that is defined; otherwise an
  # +ActiveSupport::Cache::MemoryStore+ is created on first use and reused.
  # Every key is prefixed with the gem version.
  #
  # @param key [#to_s] base cache key
  # @param params [Object, nil] optional parameters; when given, a SHA-1 digest
  #   of +params.to_s+ is appended to the key
  # @yield block forwarded to the store's +fetch+ (used to compute a missing value)
  # @return [Object, nil] whatever the store's +fetch+ returns
  def cache(key, params = nil, &block)
    @cache ||= defined?(::Rails.cache) ? ::Rails.cache : ActiveSupport::Cache::MemoryStore.new(size: 16_384)
    full_key = String.new("Dphil-#{Dphil::VERSION}:cache:#{key}")
    full_key << ":#{Digest::SHA1.base64digest(params.to_s)}" unless params.nil?
    # The block is captured explicitly: a bare `Proc.new` inside a method is
    # rejected by Ruby 3.0+. Forwarding a nil block equals passing no block.
    @cache.fetch(full_key, &block)
  end
end
|
@@ -0,0 +1,236 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Dphil
  #
  # Phylogenetic character for storing states and symbols.
  #
  # Immutable.
  #
  class Character
    include Dphil::LDOutput

    # Instantiates a new Character
    # @overload initialize(id = nil, states = nil)
    #   @param id [Integer] a character ID
    #   @param states [Hash<Integer, String>] taxa and text-states +{ taxon_id => text_state }+
    # @overload initialize(**opts = {})
    #   @param [Hash] opts options or keyword values
    #   @option opts [Integer] :id a character ID
    #   @option opts [Hash<Integer, String>] :states taxa and text-states +{ taxon_id => text_state }+
    # @raise [ArgumentError] if there are more distinct states than available symbols
    def initialize(id = nil, states = nil, **opts)
      @id = (opts[:id] || id)&.to_s.to_i

      # Blank states are skipped; taxon keys are coerced to Integer and
      # state text is normalized.
      raw_states = (opts[:states] || states).to_h
      @taxa_states = raw_states.each_with_object({}) do |(taxon, state), acc|
        next if state.blank?
        taxon = taxon.to_s if taxon.is_a?(Symbol)
        acc[taxon.to_i] = normalize_text(state)
      end

      unique_states = weighted_uniq(@taxa_states.values)
      if unique_states.size > SYMBOL_ARRAY.size
        raise ArgumentError,
              "Too many states (found #{unique_states.size}, " \
              "max #{SYMBOL_ARRAY.size})"
      end

      # Symbols are handed out in order, so the most frequent state gets "A".
      @states = {}
      @state_totals = unique_states
      unique_states.each_key.with_index do |state, index|
        @states[SYMBOL_ARRAY[index]] = state
      end

      # Freezes the values held in the instance variables, not the object
      # itself, so the lazily memoized readers below can still assign.
      instance_variables.each { |ivar| instance_variable_get(ivar).freeze }
    end

    # @!attribute [r] id
    # @return [Integer] character ID
    attr_reader :id

    # @!attribute [r] taxa
    # @return [Set<Integer>] taxon IDs
    def taxa
      @taxa ||= Set.new(taxa_states.keys).freeze
    end

    # @!attribute [r] states
    # @return [Hash<String, String>] text-states by symbol
    attr_reader :states

    # @!attribute [r] symbols
    # @return [Hash<String, String>] symbols by text-state
    def symbols
      @symbols ||= states.invert.freeze
    end

    # @!attribute [r] state_list
    # @return [Array<String>] text-states
    def state_list
      @state_list ||= states.values.freeze
    end

    # @!attribute [r] symbol_list
    # @return [Array<String>] symbols
    def symbol_list
      @symbol_list ||= states.keys.freeze
    end

    # @!attribute [r] state_totals
    # @return [Hash<String, Integer>] character state totals by text-state
    attr_reader :state_totals

    # @!attribute [r] symbol_totals
    # @return [Hash<String, Integer>] character state totals by symbol
    def symbol_totals
      @symbol_totals ||= state_totals.transform_keys { |state| symbols[state] }.freeze
    end

    # @!attribute [r] taxa_states
    # @return [Hash<Integer, String>] text-states by taxon ID
    attr_reader :taxa_states

    # @!attribute [r] taxa_symbols
    # @return [Hash<Integer, String>] symbols by taxon ID
    def taxa_symbols
      @taxa_symbols ||= taxa_states.transform_values { |state| symbols[state] }.freeze
    end

    # @!attribute [r] states_taxa
    # @return [Hash<String, Array<Integer>>] taxa IDs by text-state
    def states_taxa
      @states_taxa ||= begin
        # Seed one bucket per state (keeps the order of +states+), then fill
        # the buckets in a single pass over the taxa.
        grouped = states.each_value.each_with_object({}) { |state, acc| acc[state] = [] }
        taxa_states.each { |taxon, state| grouped[state] << taxon }
        grouped
      end.freeze
    end

    # @!attribute [r] symbols_taxa
    # @return [Hash<String, Array<Integer>>] taxa IDs by symbol
    def symbols_taxa
      @symbols_taxa ||= states_taxa.transform_keys { |state| symbols[state] }.freeze
    end

    # Get state from symbol
    # @param symbol [String] a symbol
    # @return [String, nil] the associated text-state, or Nil if not found
    def get_state(symbol)
      states[normalize_text(symbol)]
    end

    # Get symbol from state
    # @param state [String] a text-state
    # @return [String, nil] the associated symbol, or Nil if not found
    def get_symbol(state)
      symbols[normalize_text(state)]
    end

    # Get taxa from state
    # @param state [String] a text-state
    # @return [Array<Integer>, nil] the associated taxa IDs, or Nil if not found
    def get_taxa_state(state)
      states_taxa[normalize_text(state)]
    end

    # Get taxa from symbol
    # @param symbol [String] a symbol
    # @return [Array<Integer>, nil] the associated taxa IDs, or Nil if not found
    def get_taxa_symbol(symbol)
      symbols_taxa[normalize_text(symbol)]
    end

    # Get state from taxon
    # @param taxon_id [Integer] a taxon ID
    # @return [String, nil] the associated text-state, or Nil if not found
    def get_state_taxon(taxon_id)
      taxa_states[taxon_id.to_i]
    end

    # Get symbol from taxon
    # @param taxon_id [Integer] a taxon ID
    # @return [String, nil] the associated symbol, or Nil if not found
    def get_symbol_taxon(taxon_id)
      taxa_symbols[taxon_id.to_i]
    end

    # Check if character is parsimony-informative
    # (At least 2 variants occurring in at least 2 places)
    # @return [Boolean] whether the character provides useful information
    def informative?
      # `||=` would recompute every time the answer is false.
      return @informative if defined?(@informative)
      @informative = states.size > 1 && states_taxa.count { |_, v| v.size > 1 } > 1
    end

    # Check if the character is invariant
    # @return [Boolean] whether the character is constant (invariant)
    def constant?
      # `||=` would recompute every time the answer is false.
      return @constant if defined?(@constant)
      @constant = states.size <= 1
    end

    # @return [Hash] the character's data keyed by attribute name
    def to_h
      {
        id: id,
        states: states,
        symbols: symbols,
        state_totals: state_totals,
        taxa_states: taxa_states,
        states_taxa: states_taxa,
        is_informative: informative?,
        is_constant: constant?,
      }
    end

    # @param options [Hash, nil] passed through to +Hash#as_json+
    # @return [Object] JSON-ready representation of {#to_h}
    def as_json(options = nil)
      to_h.as_json(options)
    end

    # Pretty-print the object
    # (used by Pry in particular)
    def pretty_print(q)
      q.object_group(self) do
        q.breakable
        q.group(1) do
          q.text "@id=#{id}"
          q.breakable
          q.group(1, "{", "}") do
            q.seplist(states) do |symbol, state|
              q.text "#{state.inspect}(#{symbol})=#{states_taxa[state]}"
            end
          end
        end
      end
    end

    # @return [String] a string representation of the object.
    def inspect
      pretty_inspect.chomp
    end
    alias to_s inspect

    private

    # @param text [String] an arbitrary string of text
    # @return [String, nil] a Unicode-normalized (NFC), stripped, frozen copy;
    #   nil when +text+ is nil
    def normalize_text(text)
      return if text.nil?
      text = UNF::Normalizer.normalize(text.to_s, :nfc)
      text.strip!
      text.freeze
    end

    # Find all unique elements in an array and stably sort them by frequency.
    # @param array [Array]
    # @return [Hash] keys are unique input array elements, values are frequency
    def weighted_uniq(array)
      counts = array.each_with_object(Hash.new(0)) { |value, acc| acc[value] += 1 }
      # Most frequent first; the first-seen index breaks ties so the sort is stable.
      counts.each_with_index
            .sort_by { |(_, count), index| [-count, index] }
            .map(&:first)
            .to_h
    end

    SYMBOL_ARRAY = IceNine.deep_freeze([*"A".."Z", *"a".."z"])
    private_constant :SYMBOL_ARRAY
  end
end
|
@@ -0,0 +1,102 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Dphil
  #
  # A matrix of character states across taxa.
  #
  class CharacterMatrix
    include LDOutput

    # Instantiate a new CharacterMatrix from a UTF-8 CSV file
    # @param infile [String, Pathname] path of the CSV file to read
    # @param transpose [Boolean] transpose the table 90° (headers in first column)
    # @return [CharacterMatrix]
    def self.from_csv(infile, transpose: false)
      # CSV.open takes the mode positionally in both old and current csv
      # releases, whereas current CSV.read only accepts keyword options.
      csv = CSV.open(infile, "r:bom|utf-8", &:read)
      csv = csv.transpose if transpose
      new(csv)
    end

    # Instantiate a new CharacterMatrix
    # @param table [Array<Array<String>>] collation table (headers in first row)
    def initialize(table)
      header = table.to_a.first || [] # an empty table yields an empty matrix
      @taxa_names = header.each_with_index.each_with_object({}) do |(name, index), acc|
        acc[index + 1] = normalize_text(name)
      end
      # If a name repeats, the inverted map keeps only the last ID for it.
      @taxa_ids = @taxa_names.invert

      # Column IDs come from the positional map, not the inverted one, so
      # duplicate or blank header names cannot drop or shift columns.
      taxon_ids = @taxa_names.keys
      @characters = (1...table.length).each_with_object({}) do |char_num, acc|
        char_states = taxon_ids.zip(table[char_num]).to_h
        acc[char_num] = Dphil::Character.new(id: char_num, states: char_states)
      end

      # Freezes the values held in the instance variables, not the object
      # itself, so #stats can still memoize.
      instance_variables.each { |ivar| instance_variable_get(ivar).freeze }
    end

    # @!attribute [r] taxa_names
    # @return [Hash<Integer, String>] taxa names by ID
    attr_reader :taxa_names

    # @!attribute [r] taxa_ids
    # @return [Hash<String, Integer>] taxa IDs by names
    attr_reader :taxa_ids

    # @!attribute [r] characters
    # @return [Hash<Integer, Character>] characters by character ID
    attr_reader :characters

    # @!attribute [r] stats
    # @return [Hash] the character statistics for the matrix
    #   (+:total+, +:constant+, +:uninformative+, +:informative+)
    def stats
      @stats ||= begin
        hash = {
          total: characters.count,
          constant: 0,
          uninformative: 0,
          informative: 0,
        }
        characters.each_value do |char|
          if char.constant?
            hash[:constant] += 1
          elsif char.informative?
            hash[:informative] += 1
          else
            hash[:uninformative] += 1
          end
        end
        hash
      end.freeze
    end

    # Get character by ID
    # @param char_id [Integer] a character ID
    # @return [Character, nil] the associated Character, or Nil if not found.
    def get_character(char_id)
      characters[char_id.to_i]
    end

    # @return [Hash] taxa names and characters
    def to_h
      {
        taxa_names: taxa_names,
        characters: characters,
      }
    end

    # @param options [Hash, nil] passed through to +Hash#as_json+
    # @return [Object] JSON-ready representation of {#to_h}
    def as_json(options = nil)
      to_h.as_json(options)
    end

    private

    # @param text [String] an arbitrary string of text
    # @return [String, nil] a Unicode-normalized (NFC), stripped, frozen copy;
    #   nil when +text+ is nil
    def normalize_text(text)
      return if text.nil?
      text = UNF::Normalizer.normalize(text.to_s, :nfc)
      text.strip!
      text.freeze
    end
  end
end
|
data/lib/dphil/cli.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "dphil"
|
4
|
+
require "gli"
|
5
|
+
|
6
|
+
module Dphil
  #
  # GLI-based CLI interface for the library.
  #
  # Not loaded automatically with the rest of the gem.
  #
  # The declarations below are GLI DSL calls and are order-sensitive:
  # a +desc+ applies to the declaration that follows it.
  #
  module CLI
    # Makes GLI's application DSL callable directly on this module.
    extend GLI::App

    program_desc "UBC Sanskrit digital philology CLI tool"
    version Dphil::VERSION
    subcommand_option_handling :normal
    arguments :strict

    # Global switch, declared before the commands are loaded.
    desc "Be verbose in output"
    switch :verbose, negatable: false

    # Load individual CLI commands
    commands_from "dphil/cli_commands"
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Registers the +csv2ld+ command on the Dphil CLI: converts a CSV collation
# file to NEXUS, runs PAUP on it in a temporary directory, and emits the
# combined matrix and trees as pretty-printed JSON-LD.
Dphil::CLI.module_eval do
  desc "Convert a CSV-format collation file into a JSON-LD dataset"
  long_desc <<~EOS
    Convert a CSV-format collation file into a JSON-LD dataset, generating trees
    using PAUP as part of the process.
    This expects each column of the CSV to represent data for a single taxon,
    and the first row to contain the names of the taxa.
  EOS

  arg :csv_file

  command :csv2ld do |c|
    c.desc "Transpose rows/columns in CSV"
    c.switch :t, :transpose, negatable: false

    c.desc "Specify the location of the PAUP executable"
    c.flag :p, :paup_cmd, arg_name: "file", default_value: "paup4"

    c.desc "Include custom PAUP commands from a file in PAUP block of NEXUS output"
    c.flag :d, :paup_data, arg_name: "file"

    c.desc "Write JSON-LD output to file instead of STDOUT"
    c.flag :o, :outfile, arg_name: "file"

    c.action do |_, copts, args|
      # Check that PAUP command exists
      paup_cmd = `command -v #{Shellwords.shellescape(copts[:paup_cmd])}`.strip
      raise "PAUP command `#{copts[:paup_cmd]}` could not be found." if paup_cmd.empty?

      # Set absolute path of CSV input (resolved before changing directory below)
      csv_file = Pathname.new(args[0]).realpath

      # Both block forms return the block's value, so the generated JSON is
      # handed back out of the temporary directory as a plain local.
      dataset_ld = Dir.mktmpdir("dphil-csv2ld") do |dir|
        Dir.chdir(dir) do
          # Run Csv2Nex conversion
          File.write("csv2ld.nex", Dphil::Csv2NexConverter.new(csv_file, copts).convert)

          # Run PAUP; the resolved path is escaped so that spaces or shell
          # metacharacters in it cannot break the command line.
          `#{Shellwords.shellescape(paup_cmd)} -n csv2ld.nex`

          # Compile JSON-LD Dataset
          matrix = Dphil::CharacterMatrix.from_csv(csv_file, transpose: copts[:transpose])
          paup_trees = Dphil::PAUP.parse_trees("paup.log")
          trees = paup_trees.each_with_object({}) do |(k, v), acc|
            next unless k.is_a?(Integer)
            acc[k] = Dphil::Tree.new(k, v[:lengths], v[:stats])
          end

          # The tree read from "con.tree" is stored under key 0.
          cons_tree = Dphil::NewickTree.tree_from_nex(
            "con.tree",
            taxa_map: matrix.taxa_names.transform_values { |v| v.tr("-_", " ") }
          )
          trees[0] = cons_tree

          dataset = Dphil::LDDataSet.new(matrix: matrix, trees: trees)
          JSON.pretty_generate(dataset.as_jsonld)
        end
      end

      if copts[:outfile].nil?
        puts dataset_ld
      else
        abs_outfile = Pathname.new(copts[:outfile]).expand_path
        rel_outfile = abs_outfile.relative_path_from(Pathname.getwd)
        bytes_written = File.write(copts[:outfile], dataset_ld)
        puts "#{bytes_written} bytes written to #{rel_outfile}"
      end
    end
  end
end
|