dphil 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/CODE_OF_CONDUCT.md +49 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE +201 -0
  5. data/README.md +54 -0
  6. data/Rakefile +11 -0
  7. data/dphil.gemspec +49 -0
  8. data/exe/dphil +10 -0
  9. data/lib/dphil.rb +53 -0
  10. data/lib/dphil/cache.rb +15 -0
  11. data/lib/dphil/change_list.rb +6 -0
  12. data/lib/dphil/character.rb +236 -0
  13. data/lib/dphil/character_matrix.rb +102 -0
  14. data/lib/dphil/cli.rb +26 -0
  15. data/lib/dphil/cli_commands/csv2ld.rb +71 -0
  16. data/lib/dphil/cli_commands/csv2nex.rb +37 -0
  17. data/lib/dphil/constants.rb +128 -0
  18. data/lib/dphil/converter.rb +58 -0
  19. data/lib/dphil/converters/csv2nex.rb +83 -0
  20. data/lib/dphil/ld_data_set.rb +25 -0
  21. data/lib/dphil/ld_output.rb +29 -0
  22. data/lib/dphil/lemma.rb +44 -0
  23. data/lib/dphil/lemma_list.rb +179 -0
  24. data/lib/dphil/log_formatter.rb +39 -0
  25. data/lib/dphil/logger.rb +27 -0
  26. data/lib/dphil/metrical_data.rb +78 -0
  27. data/lib/dphil/newick.rb +52 -0
  28. data/lib/dphil/paup.rb +34 -0
  29. data/lib/dphil/refinements.rb +8 -0
  30. data/lib/dphil/refinements/natural_sort.rb +52 -0
  31. data/lib/dphil/script_string.rb +124 -0
  32. data/lib/dphil/syllables.rb +43 -0
  33. data/lib/dphil/syllables/syllable.rb +45 -0
  34. data/lib/dphil/tei_xml.rb +142 -0
  35. data/lib/dphil/transliterate.rb +131 -0
  36. data/lib/dphil/tree.rb +142 -0
  37. data/lib/dphil/tree_node.rb +67 -0
  38. data/lib/dphil/verse.rb +25 -0
  39. data/lib/dphil/verse_analysis.rb +509 -0
  40. data/lib/dphil/verse_analysis_new.rb +816 -0
  41. data/lib/dphil/version.rb +30 -0
  42. data/vendor/default_commands.paup +18 -0
  43. data/vendor/metrical_data.yml +4035 -0
  44. metadata +409 -0
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_support/cache"
4
+ require "active_support/notifications"
5
+
6
module Dphil
  module_function

  # Read a value from the library cache, optionally computing and storing it
  # on a miss.
  #
  # The backing store is +Rails.cache+ when that is defined; otherwise a
  # process-local +ActiveSupport::Cache::MemoryStore+ created with
  # +size: 16_384+. The store is memoized on first use.
  #
  # The effective key is namespaced with the gem version and, when +params+
  # is given, suffixed with the Base64 SHA1 digest of +params.to_s+.
  #
  # @param key [#to_s] base cache key
  # @param params [Object, nil] optional discriminator mixed into the key
  # @yield block forwarded to the store's +fetch+ (when given)
  # @return [Object, nil] whatever the store's +fetch+ returns
  def cache(key, params = nil, &block)
    @cache ||= defined?(::Rails.cache) ? ::Rails.cache : ActiveSupport::Cache::MemoryStore.new(size: 16_384)
    full_key = String.new("Dphil-#{Dphil::VERSION}:cache:#{key}")
    full_key << ":#{Digest::SHA1.base64digest(params.to_s)}" unless params.nil?
    # Forward the block explicitly: a bare `Proc.new` no longer captures the
    # method's block on Ruby 3.0+. Passing a nil block equals passing none.
    @cache.fetch(full_key, &block)
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module Dphil
  # Placeholder class: it declares no methods or state of its own.
  class ChangeList
  end
end
@@ -0,0 +1,236 @@
1
+ # frozen_string_literal: true
2
+
3
module Dphil
  #
  # Phylogenetic character for storing states and symbols.
  #
  # Every distinct text-state is assigned a one-letter symbol ("A".."Z", then
  # "a".."z"); the most frequent state receives the first symbol, with ties
  # kept in order of first appearance.
  #
  # Immutable: the data built by the constructor is frozen. Derived views are
  # computed lazily and frozen when first requested.
  #
  class Character
    include Dphil::LDOutput

    # Pool of symbols available for states; its size caps the number of
    # distinct states a single character may have.
    SYMBOL_ARRAY = IceNine.deep_freeze([*"A".."Z", *"a".."z"])
    private_constant :SYMBOL_ARRAY

    # Instantiates a new Character
    # @overload initialize(id = nil, states = nil)
    #   @param id [Integer] a character ID
    #   @param states [Hash<Integer, String>] taxa and text-states +{ taxon_id => text_state }+
    # @overload initialize(**opts = {})
    #   @param [Hash] opts options or keyword values
    #   @option opts [Integer] :id a character ID
    #   @option opts [Hash<Integer, String>] :states taxa and text-states +{ taxon_id => text_state }+
    # @raise [ArgumentError] if there are more distinct states than available symbols
    def initialize(id = nil, states = nil, **opts)
      # A missing ID ends up as 0, because nil.to_s.to_i == 0.
      @id = (opts[:id] || id).to_s.to_i

      # Blank states are dropped; taxon keys are coerced to Integer.
      @taxa_states = (opts[:states] || states).to_h.each_with_object({}) do |(taxon, state), acc|
        next if state.blank?
        taxon = taxon.to_s if taxon.is_a?(Symbol)
        acc[taxon.to_i] = normalize_text(state)
      end

      unique_states = weighted_uniq(@taxa_states.values)
      if unique_states.size > SYMBOL_ARRAY.size
        raise ArgumentError, "Too many states (found #{unique_states.size}, max #{SYMBOL_ARRAY.size})"
      end

      @state_totals = unique_states
      # Pair symbols with states in frequency order.
      @states = SYMBOL_ARRAY.first(unique_states.size).zip(unique_states.keys).to_h

      instance_variables.each { |ivar| instance_variable_get(ivar).freeze }
    end

    # @!attribute [r] id
    # @return [Integer] character ID
    attr_reader :id

    # @!attribute [r] taxa
    # @return [Set<Integer>] taxon IDs
    def taxa
      @taxa ||= Set.new(taxa_states.keys).freeze
    end

    # @!attribute [r] states
    # @return [Hash<String, String>] text-states by symbol
    attr_reader :states

    # @!attribute [r] symbols
    # @return [Hash<String, String>] symbols by text-state
    def symbols
      @symbols ||= states.invert.freeze
    end

    # @!attribute [r] state_list
    # @return [Array<String>] text-states
    def state_list
      @state_list ||= states.values.freeze
    end

    # @!attribute [r] symbol_list
    # @return [Array<String>] symbols
    def symbol_list
      @symbol_list ||= states.keys.freeze
    end

    # @!attribute [r] state_totals
    # @return [Hash<String, Integer>] character state totals by text-state
    attr_reader :state_totals

    # @!attribute [r] symbol_totals
    # @return [Hash<String, Integer>] character state totals by symbol
    def symbol_totals
      @symbol_totals ||= state_totals.transform_keys { |state| symbols[state] }.freeze
    end

    # @!attribute [r] taxa_states
    # @return [Hash<Integer, String>] text-states by taxon ID
    attr_reader :taxa_states

    # @!attribute [r] taxa_symbols
    # @return [Hash<Integer, String>] symbols by taxon ID
    def taxa_symbols
      @taxa_symbols ||= taxa_states.transform_values { |state| symbols[state] }.freeze
    end

    # @!attribute [r] states_taxa
    # @return [Hash<String, Array<Integer>>] taxa IDs by text-state
    def states_taxa
      @states_taxa ||= begin
        # Seed the keys in symbol order, then fill them in one pass over the
        # taxa so IDs keep their original insertion order.
        grouped = states.each_value.each_with_object({}) { |state, acc| acc[state] = [] }
        taxa_states.each { |taxon, state| grouped[state] << taxon }
        grouped.freeze
      end
    end

    # @!attribute [r] symbols_taxa
    # @return [Hash<String, Array<Integer>>] taxa IDs by symbol
    def symbols_taxa
      @symbols_taxa ||= states_taxa.transform_keys { |state| symbols[state] }.freeze
    end

    # Get state from symbol
    # @param symbol [String] a symbol
    # @return [String, nil] the associated text-state, or Nil if not found
    def get_state(symbol)
      states[normalize_text(symbol)]
    end

    # Get symbol from state
    # @param state [String] a text-state
    # @return [String, nil] the associated symbol, or Nil if not found
    def get_symbol(state)
      symbols[normalize_text(state)]
    end

    # Get taxa from state
    # @param state [String] a text-state
    # @return [Array<Integer>, nil] the associated taxa IDs, or Nil if not found
    def get_taxa_state(state)
      states_taxa[normalize_text(state)]
    end

    # Get taxa from symbol
    # @param symbol [String] a symbol
    # @return [Array<Integer>, nil] the associated taxa IDs, or Nil if not found
    def get_taxa_symbol(symbol)
      symbols_taxa[normalize_text(symbol)]
    end

    # Get state from taxon
    # @param taxon_id [Integer] a taxon ID
    # @return [String, nil] the associated text-state, or Nil if not found
    def get_state_taxon(taxon_id)
      taxa_states[taxon_id.to_i]
    end

    # Get symbol from taxon
    # @param taxon_id [Integer] a taxon ID
    # @return [String, nil] the associated symbol, or Nil if not found
    def get_symbol_taxon(taxon_id)
      taxa_symbols[taxon_id.to_i]
    end

    # Check if character is parsimony-informative
    # (At least 2 variants occurring in at least 2 places)
    # @return [Boolean] whether the character provides useful information
    def informative?
      # `||=` would recompute every time the answer is false.
      return @informative if defined?(@informative)
      @informative = states.size > 1 && states_taxa.count { |_, ids| ids.size > 1 } > 1
    end

    # Check if the character is invariant
    # @return [Boolean] whether the character is constant (invariant)
    def constant?
      return @constant if defined?(@constant)
      @constant = states.size <= 1
    end

    # @return [Hash] the character's data and derived flags keyed by Symbol
    def to_h
      {
        id: id,
        states: states,
        symbols: symbols,
        state_totals: state_totals,
        taxa_states: taxa_states,
        states_taxa: states_taxa,
        is_informative: informative?,
        is_constant: constant?,
      }
    end

    # @param options [Hash, nil] passed through to +Hash#as_json+
    # @return [Object] the result of calling +as_json+ on {#to_h}
    def as_json(options = nil)
      to_h.as_json(options)
    end

    # Pretty-print the object
    # (used by Pry in particular)
    def pretty_print(q)
      q.object_group(self) do
        q.breakable
        q.group(1) do
          q.text "@id=#{id}"
          q.breakable
          q.group(1, "{", "}") do
            q.seplist(states) do |symbol, state|
              q.text "#{state.inspect}(#{symbol})=#{states_taxa[state]}"
            end
          end
        end
      end
    end

    # @return [String] a string representation of the object.
    def inspect
      pretty_inspect.chomp
    end
    alias to_s inspect

    private

    # @param text [String] an arbitrary string of text
    # @return [String, nil] a Unicode-normalized (NFC), stripped, frozen copy;
    #   nil when +text+ is nil
    def normalize_text(text)
      return if text.nil?
      UNF::Normalizer.normalize(text.to_s, :nfc).strip.freeze
    end

    # Find all unique elements in an array and stably sort them by frequency.
    # @param array [Array]
    # @return [Hash] keys are unique input array elements, values are frequency
    def weighted_uniq(array)
      counts = array.each_with_object(Hash.new(0)) { |value, acc| acc[value] += 1 }
      # The index is the tie-breaker that keeps first-seen order among
      # equally frequent elements.
      counts.sort_by.with_index { |(_, count), index| [-count, index] }.to_h
    end
  end
end
@@ -0,0 +1,102 @@
1
+ # frozen_string_literal: true
2
+
3
module Dphil
  #
  # A matrix of character states across taxa.
  #
  # The first row of the table names the taxa (one per column); each later
  # row becomes one Character whose ID is the row's index in the table.
  #
  class CharacterMatrix
    include LDOutput

    # Instantiate a new CharacterMatrix from a UTF-8 CSV file
    # @param infile [String, Pathname] path of the CSV file to read
    # @param transpose [Boolean] transpose the table 90° (headers in first column)
    # @return [CharacterMatrix]
    def self.from_csv(infile, transpose: false)
      # Read with a BOM-aware mode ourselves and parse the string: handing
      # the mode to CSV.read as a second positional argument is rejected by
      # csv releases whose signature is CSV.read(path, **options).
      csv = CSV.parse(File.read(infile, mode: "r:bom|utf-8"))
      csv = csv.transpose if transpose
      new(csv)
    end

    # Instantiate a new CharacterMatrix
    # @param table [Array<Array<String>>] collation table (headers in first row)
    def initialize(table)
      rows = table.to_a
      header = rows.first || []

      # Taxon IDs are 1-based column positions.
      @taxa_names = header.each_with_index.each_with_object({}) do |(name, index), acc|
        acc[index + 1] = normalize_text(name)
      end
      @taxa_ids = @taxa_names.invert

      # Pair cells with column IDs taken from @taxa_names: the inverted hash
      # loses entries when two headers normalize to the same name, which
      # would shift every state to the wrong taxon.
      taxa_arr = @taxa_names.keys
      @characters = (1...rows.length).each_with_object({}) do |char_num, acc|
        char_states = taxa_arr.zip(rows[char_num] || []).to_h
        acc[char_num] = Dphil::Character.new(id: char_num, states: char_states)
      end

      instance_variables.each { |ivar| instance_variable_get(ivar).freeze }
    end

    # @!attribute [r] taxa_names
    # @return [Hash<Integer, String>] taxa names by ID
    attr_reader :taxa_names

    # @!attribute [r] taxa_ids
    # @return [Hash<String, Integer>] taxa IDs by names
    attr_reader :taxa_ids

    # @!attribute [r] characters
    # @return [Hash<Integer, Character>] characters by character ID
    attr_reader :characters

    # @!attribute [r] stats
    # @return [Hash] the character statistics for the matrix
    #   (+:total+, +:constant+, +:uninformative+, +:informative+)
    def stats
      @stats ||= begin
        counts = { total: characters.count, constant: 0, uninformative: 0, informative: 0 }
        characters.each_value do |char|
          # Constant takes precedence over the informative check.
          bucket =
            if char.constant? then :constant
            elsif char.informative? then :informative
            else :uninformative
            end
          counts[bucket] += 1
        end
        counts.freeze
      end
    end

    # Get character by ID
    # @param char_id [Integer] a character ID
    # @return [Character, nil] the associated Character, or Nil if not found.
    def get_character(char_id)
      characters[char_id.to_i]
    end

    # @return [Hash] taxa names and characters keyed by Symbol
    def to_h
      {
        taxa_names: taxa_names,
        characters: characters,
      }
    end

    # @param options [Hash, nil] passed through to +Hash#as_json+
    # @return [Object] the result of calling +as_json+ on {#to_h}
    def as_json(options = nil)
      to_h.as_json(options)
    end

    private

    # @param text [String] an arbitrary string of text
    # @return [String, nil] a Unicode-normalized (NFC), stripped, frozen copy;
    #   nil when +text+ is nil
    def normalize_text(text)
      return if text.nil?
      UNF::Normalizer.normalize(text.to_s, :nfc).strip.freeze
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "dphil"
4
+ require "gli"
5
+
6
module Dphil
  #
  # GLI-based CLI interface for the library.
  #
  # Not loaded automatically with the rest of the gem.
  #
  module CLI
    extend GLI::App

    # Program-level metadata and argument handling.
    program_desc "UBC Sanskrit digital philology CLI tool"
    version Dphil::VERSION
    subcommand_option_handling :normal
    arguments :strict

    # Global switch, declared without a negated form.
    desc "Be verbose in output"
    switch :verbose, negatable: false

    # Load individual CLI commands
    commands_from "dphil/cli_commands"
  end
end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
Dphil::CLI.module_eval do
  desc "Convert a CSV-format collation file into a JSON-LD dataset"
  long_desc <<~EOS
    Convert a CSV-format collation file into a JSON-LD dataset, generating trees
    using PAUP as part of the process.
    This expects each column of the CSV to represent data for a single taxon,
    and the first row to contain the names of the taxa.
  EOS

  arg :csv_file

  command :csv2ld do |c|
    c.desc "Transpose rows/columns in CSV"
    c.switch :t, :transpose, negatable: false

    c.desc "Specify the location of the PAUP executable"
    c.flag :p, :paup_cmd, arg_name: "file", default_value: "paup4"

    c.desc "Include custom PAUP commands from a file in PAUP block of NEXUS output"
    c.flag :d, :paup_data, arg_name: "file"

    c.desc "Write JSON-LD output to file instead of STDOUT"
    c.flag :o, :outfile, arg_name: "file"

    c.action do |_, copts, args|
      # Check that PAUP command exists
      paup_cmd = `command -v #{Shellwords.shellescape(copts[:paup_cmd])}`.strip
      raise "PAUP command `#{copts[:paup_cmd]}` could not be found." if paup_cmd.empty?

      # Set absolute path of CSV input (resolved before changing directory)
      csv_file = Pathname.new(args[0]).realpath

      # All intermediate files live in a throwaway directory; the block's
      # value (the JSON-LD string) is returned through mktmpdir/chdir.
      # NOTE(review): copts is handed to the converter after the chdir, so a
      # relative --paup_data path may resolve against the temp dir — confirm.
      dataset_ld = Dir.mktmpdir("dphil-csv2ld") do |dir|
        Dir.chdir(dir) do
          # Run Csv2Nex conversion
          File.write("csv2ld.nex", Dphil::Csv2NexConverter.new(csv_file, copts).convert)

          # Run PAUP; the resolved path is escaped so that directories
          # containing spaces or shell metacharacters still work.
          `#{Shellwords.shellescape(paup_cmd)} -n csv2ld.nex`

          # Compile JSON-LD Dataset
          matrix = Dphil::CharacterMatrix.from_csv(csv_file, transpose: copts[:transpose])
          paup_trees = Dphil::PAUP.parse_trees("paup.log")
          # Only entries with Integer keys are turned into trees.
          trees = paup_trees.each_with_object({}) do |(k, v), acc|
            next unless k.is_a?(Integer)
            acc[k] = Dphil::Tree.new(k, v[:lengths], v[:stats])
          end

          # Taxa names are looked up with "-" and "_" replaced by spaces.
          cons_tree = Dphil::NewickTree.tree_from_nex(
            "con.tree",
            taxa_map: matrix.taxa_names.transform_values { |v| v.tr("-_", " ") }
          )
          trees[0] = cons_tree

          dataset = Dphil::LDDataSet.new(matrix: matrix, trees: trees)
          JSON.pretty_generate(dataset.as_jsonld)
        end
      end

      if copts[:outfile].nil?
        puts dataset_ld
      else
        abs_outfile = Pathname.new(copts[:outfile]).expand_path
        rel_outfile = abs_outfile.relative_path_from(Pathname.getwd)
        puts "#{File.write(copts[:outfile], dataset_ld)} bytes written to #{rel_outfile}"
      end
    end
  end
end