dphil 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/CODE_OF_CONDUCT.md +49 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE +201 -0
  5. data/README.md +54 -0
  6. data/Rakefile +11 -0
  7. data/dphil.gemspec +49 -0
  8. data/exe/dphil +10 -0
  9. data/lib/dphil.rb +53 -0
  10. data/lib/dphil/cache.rb +15 -0
  11. data/lib/dphil/change_list.rb +6 -0
  12. data/lib/dphil/character.rb +236 -0
  13. data/lib/dphil/character_matrix.rb +102 -0
  14. data/lib/dphil/cli.rb +26 -0
  15. data/lib/dphil/cli_commands/csv2ld.rb +71 -0
  16. data/lib/dphil/cli_commands/csv2nex.rb +37 -0
  17. data/lib/dphil/constants.rb +128 -0
  18. data/lib/dphil/converter.rb +58 -0
  19. data/lib/dphil/converters/csv2nex.rb +83 -0
  20. data/lib/dphil/ld_data_set.rb +25 -0
  21. data/lib/dphil/ld_output.rb +29 -0
  22. data/lib/dphil/lemma.rb +44 -0
  23. data/lib/dphil/lemma_list.rb +179 -0
  24. data/lib/dphil/log_formatter.rb +39 -0
  25. data/lib/dphil/logger.rb +27 -0
  26. data/lib/dphil/metrical_data.rb +78 -0
  27. data/lib/dphil/newick.rb +52 -0
  28. data/lib/dphil/paup.rb +34 -0
  29. data/lib/dphil/refinements.rb +8 -0
  30. data/lib/dphil/refinements/natural_sort.rb +52 -0
  31. data/lib/dphil/script_string.rb +124 -0
  32. data/lib/dphil/syllables.rb +43 -0
  33. data/lib/dphil/syllables/syllable.rb +45 -0
  34. data/lib/dphil/tei_xml.rb +142 -0
  35. data/lib/dphil/transliterate.rb +131 -0
  36. data/lib/dphil/tree.rb +142 -0
  37. data/lib/dphil/tree_node.rb +67 -0
  38. data/lib/dphil/verse.rb +25 -0
  39. data/lib/dphil/verse_analysis.rb +509 -0
  40. data/lib/dphil/verse_analysis_new.rb +816 -0
  41. data/lib/dphil/version.rb +30 -0
  42. data/vendor/default_commands.paup +18 -0
  43. data/vendor/metrical_data.yml +4035 -0
  44. metadata +409 -0
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "active_support/cache"
4
+ require "active_support/notifications"
5
+
6
module Dphil
  module_function

  # Fetch a value from the library-wide cache, optionally computing and
  # storing it when a block is supplied.
  #
  # The backing store is +Rails.cache+ when that is defined; otherwise an
  # +ActiveSupport::Cache::MemoryStore+ is created on first use and reused.
  # Keys are prefixed with the gem version, so entries written under one
  # version are never looked up under another.
  #
  # @param key [#to_s] the cache key (interpolated into the namespaced key)
  # @param params [Object, nil] optional parameters distinguishing entries
  #   that share the same +key+; a SHA1 digest of +params.to_s+ is appended
  #   to the key when this is not +nil+
  # @yield passed straight through to the store's +fetch+
  # @return [Object] whatever the store's +fetch+ returns for the full key
  def cache(key, params = nil, &block)
    @cache ||= defined?(::Rails.cache) ? ::Rails.cache : ActiveSupport::Cache::MemoryStore.new(size: 16_384)
    full_key = String.new("Dphil-#{Dphil::VERSION}:cache:#{key}")
    full_key << ":#{Digest::SHA1.base64digest(params.to_s)}" unless params.nil?
    # Forward the caller's block explicitly. Capturing it with a bare
    # `Proc.new` is deprecated in Ruby 2.7 and raises on Ruby 3.0+.
    # Passing a nil block is the same as passing no block at all.
    @cache.fetch(full_key, &block)
  end
end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
module Dphil
  # Placeholder class: it currently declares no methods or state of its own.
  class ChangeList; end
end
@@ -0,0 +1,236 @@
1
+ # frozen_string_literal: true
2
+
3
module Dphil
  #
  # Phylogenetic character for storing states and symbols.
  #
  # Each distinct text-state is assigned a one-letter symbol (A-Z, then a-z)
  # in descending order of frequency. The values held by the instance are
  # frozen at the end of construction.
  #
  class Character
    include Dphil::LDOutput

    # Symbols available for labelling states, in assignment order.
    SYMBOL_ARRAY = IceNine.deep_freeze([*"A".."Z", *"a".."z"])
    private_constant :SYMBOL_ARRAY

    # Instantiates a new Character
    # @overload initialize(id = nil, states = nil)
    #   @param id [Integer] a character ID
    #   @param states [Hash<Integer, String>] taxa and text-states +{ taxon_id => text_state }+
    # @overload initialize(**opts = {})
    #   @param [Hash] opts options or keyword values
    #   @option opts [Integer] :id a character ID
    #   @option opts [Hash<Integer, String>] :states taxa and text-states +{ taxon_id => text_state }+
    # @raise [ArgumentError] if there are more distinct states than symbols
    def initialize(id = nil, states = nil, **opts)
      # A missing ID becomes 0 (nil.to_s is "", and "".to_i is 0).
      @id = (opts[:id] || id).to_s.to_i
      @taxa_states = (opts[:states] || states)
                     .to_h.each_with_object({}) do |(taxon, state), acc|
        next if state.blank?
        taxon = taxon.to_s if taxon.is_a?(Symbol)
        acc[taxon.to_i] = normalize_text(state)
      end

      unique_states = weighted_uniq(@taxa_states.values)
      if unique_states.size > SYMBOL_ARRAY.size
        raise ArgumentError,
              "Too many states (found #{unique_states.size}, " \
              "max #{SYMBOL_ARRAY.size})"
      end

      @states = {}
      @state_totals = unique_states
      unique_states.each_key.with_index do |state, index|
        @states[SYMBOL_ARRAY[index]] = state
      end
      instance_variables.each { |ivar| instance_variable_get(ivar).freeze }
    end

    # @!attribute [r] id
    #   @return [Integer] character ID
    attr_reader :id

    # @!attribute [r] taxa
    #   @return [Set<Integer>] taxon IDs
    def taxa
      @taxa ||= Set.new(taxa_states.keys).freeze
    end

    # @!attribute [r] states
    #   @return [Hash<String, String>] text-states by symbol
    attr_reader :states

    # @!attribute [r] symbols
    #   @return [Hash<String, String>] symbols by text-state
    def symbols
      @symbols ||= states.invert.freeze
    end

    # @!attribute [r] state_list
    #   @return [Array<String>] text-states
    def state_list
      @state_list ||= states.values.freeze
    end

    # @!attribute [r] symbol_list
    #   @return [Array<String>] symbols
    def symbol_list
      @symbol_list ||= states.keys.freeze
    end

    # @!attribute [r] state_totals
    #   @return [Hash<String, Integer>] character state totals by text-state
    attr_reader :state_totals

    # @!attribute [r] symbol_totals
    #   @return [Hash<String, Integer>] character state totals by symbol
    def symbol_totals
      @symbol_totals ||= state_totals.transform_keys { |state| symbols[state] }.freeze
    end

    # @!attribute [r] taxa_states
    #   @return [Hash<Integer, String>] text-states by taxon ID
    attr_reader :taxa_states

    # @!attribute [r] taxa_symbols
    #   @return [Hash<Integer, String>] symbols by taxon ID
    def taxa_symbols
      @taxa_symbols ||= taxa_states.transform_values { |state| symbols[state] }.freeze
    end

    # @!attribute [r] states_taxa
    #   @return [Hash<String, Array<Integer>>] taxa IDs by text-state
    def states_taxa
      @states_taxa ||= begin
        grouped = states.each_value.each_with_object({}) do |state, acc|
          acc[state] = taxa_states.select { |_, taxon_state| state == taxon_state }.keys
        end
        grouped.freeze
      end
    end

    # @!attribute [r] symbols_taxa
    #   @return [Hash<String, Array<Integer>>] taxa IDs by symbol
    def symbols_taxa
      @symbols_taxa ||= states_taxa.transform_keys { |state| symbols[state] }.freeze
    end

    # Get state from symbol
    # @param symbol [String] a symbol
    # @return [String, nil] the associated text-state, or Nil if not found
    def get_state(symbol)
      states[normalize_text(symbol)]
    end

    # Get symbol from state
    # @param state [String] a text-state
    # @return [String, nil] the associated symbol, or Nil if not found
    def get_symbol(state)
      symbols[normalize_text(state)]
    end

    # Get taxa from state
    # @param state [String] a text-state
    # @return [Array<Integer>, nil] the associated taxa IDs, or Nil if not found
    def get_taxa_state(state)
      states_taxa[normalize_text(state)]
    end

    # Get taxa from symbol
    # @param symbol [String] a symbol
    # @return [Array<Integer>, nil] the associated taxa IDs, or Nil if not found
    def get_taxa_symbol(symbol)
      symbols_taxa[normalize_text(symbol)]
    end

    # Get state from taxon
    # @param taxon_id [Integer] a taxon ID
    # @return [String, nil] the associated text-state, or Nil if not found
    def get_state_taxon(taxon_id)
      taxa_states[taxon_id.to_i]
    end

    # Get symbol from taxon
    # @param taxon_id [Integer] a taxon ID
    # @return [String, nil] the associated symbol, or Nil if not found
    def get_symbol_taxon(taxon_id)
      taxa_symbols[taxon_id.to_i]
    end

    # Check if character is parsimony-informative
    # (At least 2 variants occurring in at least 2 places)
    # @return [Boolean] whether the character provides useful information
    def informative?
      # `||=` would never cache a false result, so test for the ivar instead.
      return @informative if defined?(@informative)
      @informative = states.size > 1 && states_taxa.count { |_, ids| ids.size > 1 } > 1
    end

    # Check if the character is invariant
    # @return [Boolean] whether the character is constant (invariant)
    def constant?
      # `||=` would never cache a false result, so test for the ivar instead.
      return @constant if defined?(@constant)
      @constant = states.size <= 1
    end

    # @return [Hash] the character's data keyed by attribute name
    def to_h
      {
        id: id,
        states: states,
        symbols: symbols,
        state_totals: state_totals,
        taxa_states: taxa_states,
        states_taxa: states_taxa,
        is_informative: informative?,
        is_constant: constant?,
      }
    end

    # @param options [Hash, nil] passed through to +Hash#as_json+
    # @return [Object] the result of calling +as_json+ on {#to_h}
    def as_json(options = nil)
      to_h.as_json(options)
    end

    # Pretty-print the object
    # (used by Pry in particular)
    def pretty_print(q)
      q.object_group(self) do
        q.breakable
        q.group(1) do
          q.text "@id=#{id}"
          q.breakable
          q.group(1, "{", "}") do
            q.seplist(states) do |symbol, state|
              q.text "#{state.inspect}(#{symbol})=#{states_taxa[state]}"
            end
          end
        end
      end
    end

    # @return [String] a string representation of the object.
    def inspect
      pretty_inspect.chomp
    end
    alias to_s inspect

    private

    # @param text [String] an arbitrary string of text
    # @return [String, nil] a Unicode-normalized (NFC), stripped, frozen copy,
    #   or Nil when +text+ is Nil
    def normalize_text(text)
      return if text.nil?
      text = UNF::Normalizer.normalize(text.to_s, :nfc)
      text.strip!
      text.freeze
    end

    # Find all unique elements in an array and stably sort them by frequency.
    # @param array [Array]
    # @return [Hash] keys are unique input array elements (most frequent
    #   first, ties in order of first appearance), values are frequency
    def weighted_uniq(array)
      counts = array.each_with_object(Hash.new(0)) { |value, acc| acc[value] += 1 }
      # The running index breaks ties, keeping the sort stable.
      counts.sort_by.with_index { |(_, count), index| [-count, index] }.to_h
    end
  end
end
@@ -0,0 +1,102 @@
1
+ # frozen_string_literal: true
2
+
3
module Dphil
  #
  # A matrix of character states across taxa.
  #
  # Built from a collation table whose first row names the taxa and whose
  # remaining rows each describe one character. Taxa are numbered from 1 in
  # column order; characters are numbered from 1 in row order.
  #
  class CharacterMatrix
    include LDOutput

    # Instantiate a new CharacterMatrix from a UTF-8 CSV file
    # @param infile [String, Pathname] path of the CSV file to read
    # @param transpose [Boolean] transpose the table 90° (headers in first column)
    # @return [CharacterMatrix]
    def self.from_csv(infile, transpose: false)
      # Read the text ourselves (dropping any UTF-8 BOM) and hand it to
      # CSV.parse. Passing the open mode to CSV.read as a positional argument
      # is rejected by newer csv releases, which accept keyword options only.
      csv = CSV.parse(File.read(infile, mode: "r:bom|utf-8"))
      csv = csv.transpose if transpose
      new(csv)
    end

    # Instantiate a new CharacterMatrix
    # @param table [Array<Array<String>>] collation table (headers in first row)
    def initialize(table)
      rows = table.to_a

      # Array() lets an empty table produce an empty matrix instead of
      # failing on a missing header row.
      @taxa_names = Array(rows.first).each_with_index.each_with_object({}) do |(name, index), acc|
        acc[index + 1] = normalize_text(name)
      end
      @taxa_ids = @taxa_names.invert

      # Pair columns with taxon IDs taken from @taxa_names, one per column.
      # The inverted hash drops duplicate (or blank) names, which would
      # shift later columns onto the wrong taxa.
      taxon_ids = @taxa_names.keys
      @characters = (1...rows.length).each_with_object({}) do |char_num, acc|
        char_states = taxon_ids.zip(rows[char_num]).to_h
        acc[char_num] = Dphil::Character.new(id: char_num, states: char_states)
      end

      instance_variables.each { |ivar| instance_variable_get(ivar).freeze }
    end

    # @!attribute [r] taxa_names
    #   @return [Hash<Integer, String>] taxa names by ID
    attr_reader :taxa_names

    # @!attribute [r] taxa_ids
    #   @return [Hash<String, Integer>] taxa IDs by names
    attr_reader :taxa_ids

    # @!attribute [r] characters
    #   @return [Hash<Integer, Character>] characters by character ID
    attr_reader :characters

    # @!attribute [r] stats
    #   @return [Hash] the character statistics for the matrix: +:total+, and
    #     the number of +:constant+, +:uninformative+ and +:informative+
    #     characters
    def stats
      @stats ||= begin
        counts = {
          total: characters.count,
          constant: 0,
          uninformative: 0,
          informative: 0,
        }
        characters.each_value do |char|
          # Constant takes precedence; a character is counted exactly once.
          bucket =
            if char.constant?
              :constant
            elsif char.informative?
              :informative
            else
              :uninformative
            end
          counts[bucket] += 1
        end
        counts.freeze
      end
    end

    # Get character by ID
    # @param char_id [Integer] a character ID
    # @return [Character, nil] the associated Character, or Nil if not found.
    def get_character(char_id)
      characters[char_id.to_i]
    end

    # @return [Hash] the taxa names and characters of the matrix
    def to_h
      {
        taxa_names: taxa_names,
        characters: characters,
      }
    end

    # @param options [Hash, nil] passed through to +Hash#as_json+
    # @return [Object] the result of calling +as_json+ on {#to_h}
    def as_json(options = nil)
      to_h.as_json(options)
    end

    private

    # @param text [String] an arbitrary string of text
    # @return [String, nil] a Unicode-normalized (NFC), stripped, frozen copy,
    #   or Nil when +text+ is Nil
    def normalize_text(text)
      return if text.nil?
      text = UNF::Normalizer.normalize(text.to_s, :nfc)
      text.strip!
      text.freeze
    end
  end
end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "dphil"
4
+ require "gli"
5
+
6
module Dphil
  #
  # Command-line interface for the library, declared with the GLI DSL.
  #
  # The rest of the gem does not load this automatically.
  #
  module CLI
    extend GLI::App

    # Program-wide metadata and argument handling
    program_desc("UBC Sanskrit digital philology CLI tool")
    version(Dphil::VERSION)
    subcommand_option_handling(:normal)
    arguments(:strict)

    # Global --verbose switch (no --no-verbose counterpart)
    desc("Be verbose in output")
    switch(:verbose, negatable: false)

    # Pull in the individual command definitions
    commands_from("dphil/cli_commands")
  end
end
@@ -0,0 +1,71 @@
1
+ # frozen_string_literal: true
2
+
3
# Defines the `csv2ld` CLI command: converts a CSV collation into NEXUS, runs
# PAUP on it in a temporary directory, and emits the combined matrix and
# trees as a JSON-LD document.
Dphil::CLI.module_eval do
  desc "Convert a CSV-format collation file into a JSON-LD dataset"
  long_desc <<~EOS
    Convert a CSV-format collation file into a JSON-LD dataset, generating trees
    using PAUP as part of the process.
    This expects each column of the CSV to represent data for a single taxon,
    and the first row to contain the names of the taxa.
  EOS

  arg :csv_file

  command :csv2ld do |c|
    c.desc "Transpose rows/columns in CSV"
    c.switch :t, :transpose, negatable: false

    c.desc "Specify the location of the PAUP executable"
    c.flag :p, :paup_cmd, arg_name: "file", default_value: "paup4"

    c.desc "Include custom PAUP commands from a file in PAUP block of NEXUS output"
    c.flag :d, :paup_data, arg_name: "file"

    c.desc "Write JSON-LD output to file instead of STDOUT"
    c.flag :o, :outfile, arg_name: "file"

    c.action do |_, copts, args|
      # Check that PAUP command exists
      paup_cmd = `command -v #{Shellwords.shellescape(copts[:paup_cmd])}`.strip
      raise "PAUP command `#{copts[:paup_cmd]}` could not be found." if paup_cmd.empty?

      # A relative path would stop resolving once we chdir into the temporary
      # directory below, so pin it to the current directory now. Bare command
      # names are left for PATH lookup.
      paup_cmd = File.expand_path(paup_cmd) if paup_cmd.include?("/")

      # Set absolute path of CSV input
      csv_file = Pathname.new(args[0]).realpath

      # Both Dir.mktmpdir and Dir.chdir return their block's value, so the
      # generated document is passed out as a plain local.
      dataset_ld = Dir.mktmpdir("dphil-csv2ld") do |dir|
        Dir.chdir(dir) do
          # Run Csv2Nex conversion
          File.write("csv2ld.nex", Dphil::Csv2NexConverter.new(csv_file, copts).convert)

          # Run PAUP without a shell so paths containing spaces or shell
          # metacharacters are passed through intact. Its standard output is
          # read and discarded so it cannot mix with the JSON-LD on STDOUT.
          IO.popen([paup_cmd, "-n", "csv2ld.nex"], &:read)

          # Compile JSON-LD Dataset
          matrix = Dphil::CharacterMatrix.from_csv(csv_file, transpose: copts[:transpose])
          paup_trees = Dphil::PAUP.parse_trees("paup.log")
          # Only integer-keyed entries of the parsed log are turned into trees.
          trees = paup_trees.each_with_object({}) do |(k, v), acc|
            next unless k.is_a?(Integer)
            acc[k] = Dphil::Tree.new(k, v[:lengths], v[:stats])
          end

          # The tree read from con.tree is stored under key 0. Hyphens and
          # underscores in taxa names become spaces for the taxa map.
          cons_tree = Dphil::NewickTree.tree_from_nex(
            "con.tree",
            taxa_map: matrix.taxa_names.transform_values { |v| v.tr("-_", " ") }
          )
          trees[0] = cons_tree

          dataset = Dphil::LDDataSet.new(matrix: matrix, trees: trees)
          JSON.pretty_generate(dataset.as_jsonld)
        end
      end

      if copts[:outfile].nil?
        puts dataset_ld
      else
        abs_outfile = Pathname.new(copts[:outfile]).expand_path
        rel_outfile = abs_outfile.relative_path_from(Pathname.getwd)
        puts "#{File.write(copts[:outfile], dataset_ld)} bytes written to #{rel_outfile}"
      end
    end
  end
end