dphil 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/CODE_OF_CONDUCT.md +49 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE +201 -0
  5. data/README.md +54 -0
  6. data/Rakefile +11 -0
  7. data/dphil.gemspec +49 -0
  8. data/exe/dphil +10 -0
  9. data/lib/dphil.rb +53 -0
  10. data/lib/dphil/cache.rb +15 -0
  11. data/lib/dphil/change_list.rb +6 -0
  12. data/lib/dphil/character.rb +236 -0
  13. data/lib/dphil/character_matrix.rb +102 -0
  14. data/lib/dphil/cli.rb +26 -0
  15. data/lib/dphil/cli_commands/csv2ld.rb +71 -0
  16. data/lib/dphil/cli_commands/csv2nex.rb +37 -0
  17. data/lib/dphil/constants.rb +128 -0
  18. data/lib/dphil/converter.rb +58 -0
  19. data/lib/dphil/converters/csv2nex.rb +83 -0
  20. data/lib/dphil/ld_data_set.rb +25 -0
  21. data/lib/dphil/ld_output.rb +29 -0
  22. data/lib/dphil/lemma.rb +44 -0
  23. data/lib/dphil/lemma_list.rb +179 -0
  24. data/lib/dphil/log_formatter.rb +39 -0
  25. data/lib/dphil/logger.rb +27 -0
  26. data/lib/dphil/metrical_data.rb +78 -0
  27. data/lib/dphil/newick.rb +52 -0
  28. data/lib/dphil/paup.rb +34 -0
  29. data/lib/dphil/refinements.rb +8 -0
  30. data/lib/dphil/refinements/natural_sort.rb +52 -0
  31. data/lib/dphil/script_string.rb +124 -0
  32. data/lib/dphil/syllables.rb +43 -0
  33. data/lib/dphil/syllables/syllable.rb +45 -0
  34. data/lib/dphil/tei_xml.rb +142 -0
  35. data/lib/dphil/transliterate.rb +131 -0
  36. data/lib/dphil/tree.rb +142 -0
  37. data/lib/dphil/tree_node.rb +67 -0
  38. data/lib/dphil/verse.rb +25 -0
  39. data/lib/dphil/verse_analysis.rb +509 -0
  40. data/lib/dphil/verse_analysis_new.rb +816 -0
  41. data/lib/dphil/version.rb +30 -0
  42. data/vendor/default_commands.paup +18 -0
  43. data/vendor/metrical_data.yml +4035 -0
  44. metadata +409 -0
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Dphil
4
#
# Node in a phylogenetic tree.
#
# Attribute values are normalized by the setters: +id+ and +length+ through
# +to_i+, +name+ through +to_s+. +parent+ and +children+ hold either Integer
# values or TreeNode instances.
#
class TreeNode
  include LDOutput
  attr_reader :id, :name, :length, :parent, :children

  # @param opts [Hash] initial values, read from the keys :id, :name,
  #   :length, :parent and :children; each one goes through its setter
  # @raise [ArgumentError] if :parent or :children fail setter validation
  def initialize(opts = {})
    self.id = opts[:id]
    self.name = opts[:name]
    self.length = opts[:length]
    self.parent = opts[:parent]
    self.children = opts[:children]
  end

  # Stores the id coerced with +to_i+ (so +nil+ becomes 0).
  def id=(id)
    @id = id.to_i
  end

  # Stores the name coerced with +to_s+ (so +nil+ becomes "").
  def name=(name)
    @name = name.to_s
  end

  # Stores the length coerced with +to_i+ (so +nil+ becomes 0).
  def length=(length)
    @length = length.to_i
  end

  # @param parent [Integer, TreeNode, nil]
  # @raise [ArgumentError] for any other kind of value
  def parent=(parent)
    unless parent.nil? || parent.is_a?(Integer) || parent.is_a?(TreeNode)
      raise ArgumentError, "Parent must be Integer, Node, or Nil"
    end
    @parent = parent
  end

  # @param children [Array<Integer, TreeNode>, Integer, TreeNode, nil]
  #   wrapped with +Array()+, so +nil+ becomes an empty list and a single
  #   value becomes a one-element list
  # @raise [ArgumentError] if any element is neither Integer nor TreeNode
  def children=(children)
    children = Array(children)
    unless children.all? { |e| e.is_a?(Integer) || e.is_a?(TreeNode) }
      # The message used to say "Parent", which pointed at the wrong attribute.
      raise ArgumentError, "Children must be Integer or Node"
    end
    @children = children
  end

  # @return [Hash] the five attributes keyed by Symbol
  def to_h
    {
      id: id,
      name: name,
      length: length,
      parent: parent,
      children: children,
    }
  end

  # Delegates to +Hash#as_json+ on the result of #to_h.
  # NOTE(review): Hash#as_json is not core Ruby; presumably supplied by a
  # dependency loaded elsewhere in the gem -- confirm.
  def as_json(options = nil)
    to_h.as_json(options)
  end

  # Copies every attribute of +node+ onto this node through the matching
  # public setter; keys without a setter are ignored.
  #
  # @param node [#to_h] another TreeNode or a Hash of attributes
  def merge!(node)
    node.to_h.each do |key, value|
      setter = "#{key}=".to_sym
      send(setter, value) if respond_to?(setter)
    end
  end
end
67
+ end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Dphil
4
# Immutable value object wrapping one verse string together with the result
# of VerseAnalysis.identify for it.
class Verse
  using ::Ragabash::Refinements
  attr_reader :ms, :id, :verse, :syllables, :weights, :identify

  # @param verse [#to_str] the verse text
  # @param ms [Object, nil] stored as given (after +safe_copy+)
  # @param id [Object, nil] stored as given (after +safe_copy+)
  def initialize(verse, ms: nil, id: nil)
    @verse = verse.to_str.safe_copy
    @ms = ms.safe_copy
    @id = id.safe_copy
    @identify = VerseAnalysis.identify(@verse)
    # The readers for these two were declared and serialized but never
    # assigned, so they were always nil. The identify result already carries
    # both values under these keys.
    @syllables = @identify[:syllables]
    @weights = @identify[:weights]
    deep_freeze
  end

  # Serializes the verse and its analysis.
  #
  # Accepts the optional generator arguments that JSON passes down, so both
  # +verse.to_json+ and +JSON.generate(verse)+ work; previously the single
  # argument was mandatory and a bare +to_json+ raised ArgumentError.
  #
  # @return [String] JSON text
  def to_json(*args)
    { ms: ms,
      id: id,
      verse: verse,
      syllables: syllables,
      weights: weights,
      identify: identify }.to_json(*args)
  end
end
25
+ end
@@ -0,0 +1,509 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "amatch"
5
+
6
+ module Dphil
7
+ module VerseAnalysis
8
+ using ::Ragabash::Refinements
9
+
10
+ module_function
11
+
12
+ include Amatch
13
+
14
# Splits a verse string into syllables.
#
# Danda/punctuation characters (| . , \) and digits are removed and runs of
# whitespace collapsed before the text is transliterated with
# Transliterate.iast_slp1, scanned with Constants::R_SYL, and each match is
# transliterated back with Transliterate.slp1_iast. The result is obtained
# through Dphil.cache, keyed on the input string.
#
# @param str [#to_str]
# @return [Array<String>] syllables as produced by Transliterate.slp1_iast
def syllables(str)
  source = str.to_str
  Dphil.cache("VerseAnalysis.syllables", source) do
    cleaned = source.gsub(/[\|\.\,\\0-9]+/, "")
    cleaned = cleaned.gsub(/\s+/, " ").strip
    Transliterate.iast_slp1(cleaned)
                 .scan(Constants::R_SYL)
                 .map { |slp1_syllable| Transliterate.slp1_iast(slp1_syllable) }
  end
end
23
+
24
# Maps a list of syllables to a weight string with one character per
# syllable: "G" (guru) or "L".
#
# A syllable is "G" when it matches Constants::R_GSYL, or when its last
# character joined with the first character of the following syllable
# matches Constants::R_CCON (look-ahead across the syllable boundary).
# Apostrophes and surrounding whitespace are ignored. The result is obtained
# through Dphil.cache, keyed on the syllable list.
#
# @param syllables [#to_ary] syllables, converted with
#   Transliterate.iast_slp1 before weighing
# @return [String] e.g. "LGGL"
def syllables_weights(syllables)
  syllable_list = syllables.to_ary
  Dphil.cache("VerseAnalysis.syllables_weights", syllable_list) do
    cleaned = syllable_list.map do |syllable|
      Transliterate.iast_slp1(syllable).delete("'").strip
    end

    weights = cleaned.each_with_index.map do |current, index|
      following = cleaned[index + 1]
      boundary = "#{current[-1]}#{following&.slice(0)}"
      heavy = current =~ Constants::R_GSYL || boundary =~ Constants::R_CCON
      heavy ? "G" : "L"
    end
    weights.join("")
  end
end
44
+
45
# Convenience wrapper: weight string for a whole verse, i.e.
# syllables_weights applied to syllables(str). Cached through Dphil.cache
# under its own key.
#
# @param str [String] verse text
# @return [String] weight string
def verse_weight(str)
  Dphil.cache("VerseAnalysis.verse_weight", str) { syllables_weights(syllables(str)) }
end
50
+
51
+ #
52
+ #
53
+ # WIP BEGIN
54
+ #
55
+ #
56
+
57
# Searches MetricalData for meters whose pattern matches the weight string
# exactly.
#
# Both the :patterns and the :regexes tables of MetricalData.all are
# consulted for the given size. String patterns must equal the weight
# string; Regexp patterns must match it. For a Regexp with capture groups
# the reported pattern is the Array of captures, otherwise the matched text.
#
# FIXME: Only considers full weight string, so only useful for full patterns
#
# @param weight_string [String]
# @param size [Symbol] key into each table (:full, :half or :pada are used
#   by callers in this module)
# @param padas [Array<Integer>] copied verbatim into each candidate
# @return [Array<Hash>] candidates with keys :meter, :type, :size, :pattern,
#   :scope and :padas
def meter_search_exact(weight_string, size = :full, padas = [1, 2, 3, 4])
  %i[patterns regexes].flat_map do |type|
    MetricalData.all[type][size].each_with_object([]) do |(pattern, meter), found|
      meter_name, meter_scope = meter.first
      matched =
        case pattern
        when String
          next unless weight_string == pattern
          pattern
        when Regexp
          hit = pattern.match(weight_string)
          next if hit.nil?
          hit.length > 1 ? hit.captures : hit.to_s
        end

      found << {
        meter: meter_name,
        type: type.to_s,
        size: size.to_s,
        pattern: matched,
        scope: meter_scope,
        padas: padas,
      }
    end
  end
end
88
+
89
# Searches MetricalData for meters whose pattern is within a small edit
# distance of the weight string.
#
# String patterns are compared directly with Levenshtein. Regexp patterns
# without alternation are flattened to their source text (anchors and
# parentheses removed, "." treated as "L" for the distance), aligned with
# corrected_string, and a concrete pattern is rebuilt from that alignment;
# a substitution that lands on a "." wildcard is not counted as an edit.
# Candidates whose length differs by more than 5 or whose edit distance
# exceeds 5 are skipped.
#
# @param weight_string [String]
# @param size [Symbol] key into each MetricalData table
# @return [Array<Hash>] candidates with keys :meter, :type, :size, :pattern
#   and :edit_distance
def meter_search_fuzzy(weight_string, size = :full)
  candidates = []
  syllable_count = weight_string.length
  length_variance = 5
  edit_tolerance = 5
  str = Levenshtein.new(weight_string)

  %i[patterns regexes].each do |type|
    matches = MetricalData.all[type][size].each_with_object([]) do |(pattern, meter), acc|
      meter_name = meter.keys.first
      case pattern
      when String
        next if (pattern.length - syllable_count).abs > length_variance
        edit_distance = str.match(pattern)
        next if edit_distance > edit_tolerance
        pattern_string = pattern
      when Regexp # FIXME: approximate matching in case of regexes
        next if pattern.source["|"]
        # Must be the non-bang gsub: gsub! returns nil when the source has
        # none of these characters, which made the next line raise.
        template = pattern.source.gsub(/[\^\$\(\)]/, "")
        next if (template.length - syllable_count).abs > length_variance
        comparable = template.tr(".", "L")

        edit_distance = str.match(comparable)
        alignment = corrected_string(weight_string, comparable).join("")

        rebuilt = []
        x1 = 0 # position in template
        xw = 0 # position in weight_string
        template = template.lstrip
        alignment.each_char do |op|
          case op
          when "L", "G" # match: both sides advance
            x1 += 1
            rebuilt << op
            xw += 1
          when "l" # pattern has an extra L
            x1 += 1
            rebuilt << "L"
          when "g" # pattern has an extra G
            x1 += 1
            rebuilt << "G"
          when "d" # weight string has an extra syllable
            xw += 1
          else
            if op == "f" && template[x1] == "."
              # Wildcard position: keep the actual weight, not an edit.
              x1 += 1
              edit_distance -= 1
              rebuilt << weight_string[xw]
              xw += 1
            else
              rebuilt << template[x1]
              x1 += 1
              xw += 1
            end
          end
        end
        pattern_string = rebuilt.join("")
        next if edit_distance > edit_tolerance
      end
      acc << {
        meter: meter_name,
        type: type,
        size: size,
        pattern: pattern_string,
        edit_distance: edit_distance,
      }
    end
    candidates.concat(matches)
  end
  candidates
end
159
+
160
# Splits a weight string into two halves (padas 1+2 and padas 3+4) using
# the pada lengths that metercount reports for the meter.
#
# Callers use the result with safe navigation, so nil is returned when the
# split cannot be made: the meter has no metercount entry, or the entry does
# not hold four pada lengths followed by the total.
#
# @param weight_string [String]
# @param meter [Object] key into metercount
# @return [Array<String>, nil] the two halves, or nil when unavailable
def weight_try_half(weight_string, meter)
  length = metercount[meter]
  return unless length && length.size >= 5

  first_half = length[0] + length[1]
  second_half = length[2] + length[3]
  [
    weight_string.slice(0, first_half),
    weight_string.slice(first_half, second_half),
  ]
end
169
+
170
# Splits a weight string into four padas using the pada lengths that
# metercount reports for the meter.
#
# Callers use the result with safe navigation, so nil is returned when the
# split cannot be made: the meter has no metercount entry, or the entry does
# not hold four pada lengths followed by the total.
#
# @param weight_string [String]
# @param meter [Object] key into metercount
# @return [Array<String>, nil] the four pada weight strings, or nil
def weight_try_pada(weight_string, meter)
  length = metercount[meter]
  return unless length && length.size >= 5

  offset = 0
  length.first(4).map do |pada_length|
    pada = weight_string.slice(offset, pada_length)
    offset += pada_length
    pada
  end
end
181
+
182
# Collects meter candidates for a list of syllables.
#
# An exact search on the full weight string is tried first. When it finds
# something, the weight string is additionally split into halves and padas
# (using the first meter found) and each piece is searched exactly as well.
# When it finds nothing, a fuzzy search is run instead and only the
# candidate with the lowest edit distance is kept per meter.
#
# @param syllables [Array<String>]
# @return [Hash] :status ("exact match" or "fuzzy match"), :syllables (a
#   copy of the input), :weights, and :meters (meter name => candidates)
def analyze_syllables(syllables)
  v_syllables = syllables.dup
  v_weight = syllables_weights(v_syllables)

  meter_candidates = {}
  collect = ->(found) { (meter_candidates[found[:meter]] ||= []) << found }

  meter_search_exact(v_weight).each(&collect)

  status =
    if meter_candidates.empty?
      meter_search_fuzzy(v_weight).each do |found|
        best = meter_candidates[found[:meter]]
        next unless best.nil? || found[:edit_distance] < best.first[:edit_distance]
        meter_candidates[found[:meter]] = [found]
      end
      "fuzzy match"
    else
      halves = weight_try_half(v_weight, meter_candidates.keys.first)
      halves&.each_with_index do |half_weight, index|
        half_padas = index.zero? ? [1, 2] : [3, 4]
        meter_search_exact(half_weight, :half, half_padas).each(&collect)
      end

      quarters = weight_try_pada(v_weight, meter_candidates.keys.first)
      quarters&.each_with_index do |pada_weight, index|
        meter_search_exact(pada_weight, :pada, [index + 1]).each(&collect)
      end
      "exact match"
    end

  {
    status: status,
    syllables: v_syllables,
    weights: v_weight,
    meters: meter_candidates,
  }
end
242
+
243
# Identifies the closest meter for a verse and returns its padas, plus
# corrections in the case of an approximate match.
#
# Steps:
# 1. Compute syllables and weights of the input.
# 2. Collect meter candidates with analyze_syllables.
# 3. Exact match: take the first candidate meter and split the syllables
#    into padas by that meter's pada lengths (left empty when metercount has
#    no usable entry for it).
#    Fuzzy match: take the candidate with the lowest edit distance and build
#    corrections with fuzzy_correction.
# 4. Return input data, status and meter information.
#
# @param verse_string [String]
# @return [Hash] :verse, :syllables, :weights, :status and :meter. For an
#   exact match :meter is the meter name and :padas is added; otherwise
#   :meter is a one-element Array holding a Hash with :name, :size,
#   :defectiveness and :corrections.
def identify(verse_string)
  v_syllables = syllables(verse_string)
  v_weight = syllables_weights(v_syllables)
  m = analyze_syllables(v_syllables)

  meter_candidates = m[:meters]
  meter = nil
  defect_percentage = nil
  correct = []
  v_padas = []

  if meter_candidates.empty?
    m[:status] = "Verse highly defective , Can't find meter"
  elsif m[:status] == "exact match"
    meter = meter_candidates.keys.first
    len = metercount[meter]
    # metercount skips some regex-only meters; without four pada lengths
    # (plus the total) there is nothing reliable to split by.
    if len && len.size >= 5
      offset = 0
      len.first(4).each do |pada_length|
        # Non-mutating equivalent of repeatedly slicing from the front.
        v_padas << m[:syllables].drop(offset).first(pada_length).join("")
        offset += pada_length
      end
    end
  else
    # First candidate with the lowest edit distance wins on ties.
    meter, best = meter_candidates.min_by { |_, val| val[0][:edit_distance].to_i }
    distance = best[0][:edit_distance]
    pattern = best[0][:pattern].split("")

    defect_percentage = Rational(distance, best[0][:pattern].length)
    n = fuzzy_correction(m[:weights], meter, pattern, m[:syllables])
    correct = n[:correct_weights]
    v_padas = n[:correct_padas]
  end

  v_corrections = {
    weights: correct.join(""),
    padas: v_padas,
  }

  v_meters = {
    name: meter,
    size: "full/half/pada",
    defectiveness: defect_percentage,
    corrections: [v_corrections],
  }

  result = {
    verse: verse_string,
    syllables: v_syllables,
    weights: v_weight,
    status: m[:status],
    meter: [v_meters],
  }

  if result[:status] == "exact match"
    result[:meter] = v_meters[:name]
    result[:padas] = v_padas
  end

  result
end
326
+
327
# Aligns a weight string with a pattern using Levenshtein edit distance and
# returns the edit script that turns the weights into the pattern.
#
# Each element of the result is one alignment step, in order:
#   "L"/"G"  the weight matches the pattern at this position
#   "l"/"g"  the pattern has an extra L / non-L here (insertion)
#   "f"      the weight differs from the pattern here (substitution)
#   "d"      the weight string has an extra syllable here (deletion)
#
# Neither argument is modified; the pattern may be frozen.
#
# @param weights [String] actual weight string
# @param pattern [String, Array<String>] expected pattern, as a string or
#   as a list of single characters
# @return [Array<String>] the edit script
def corrected_string(weights, pattern)
  # Work on private copies with a leading sentinel so that row/column 0 of
  # the table stand for the empty prefix.
  actual = [" "] + weights.split("")
  target = [" "] + (pattern.is_a?(String) ? pattern.split("") : pattern.to_a)

  table = Array.new(actual.length) { Array.new(target.length) }
  actual.each_index { |i| table[i][0] = i }
  target.each_index { |j| table[0][j] = j }

  (1...actual.length).each do |i|
    (1...target.length).each do |j|
      table[i][j] =
        if actual[i] == target[j]
          table[i - 1][j - 1]
        else
          [table[i - 1][j], table[i - 1][j - 1], table[i][j - 1]].min + 1
        end
    end
  end

  correct = []
  i = actual.length - 1
  j = target.length - 1
  while i > 0 || j > 0
    if i > 0 && j > 0 && actual[i] == target[j]
      correct.unshift(actual[i])
      i -= 1
      j -= 1
      next
    end

    step =
      if i.zero?
        # Weights exhausted: only insertions remain. Looking at table[i - 1]
        # here would wrap around to the last row.
        :insert
      elsif j.zero?
        # Pattern exhausted: only deletions remain.
        :delete
      else
        cheapest = [table[i - 1][j], table[i - 1][j - 1], table[i][j - 1]].min
        if cheapest == table[i][j - 1]
          :insert
        elsif cheapest == table[i - 1][j - 1]
          :substitute
        else
          :delete
        end
      end

    case step
    when :insert
      correct.unshift(target[j] == "L" ? "l" : "g")
      j -= 1
    when :substitute
      correct.unshift("f")
      i -= 1
      j -= 1
    when :delete
      correct.unshift("d")
      i -= 1
    end
  end
  correct
end
381
+
382
# Renders the alignment between a verse and a meter pattern as annotated
# padas.
#
# The edit script from corrected_string is consumed one step per pattern
# position, pada by pada (pada lengths come from metercount). Syllables that
# match are emitted as they are; syllables marked "d" or "f" are wrapped in
# parentheses; missing syllables are shown as "(g)" or "(l)", with " { "
# opening a run of them and " } " closing it at the next matching syllable
# of the opposite weight. Handling of "d"/"f" is marked by the original
# author as still to be figured out.
#
# @param weights [String] actual weight string
# @param meter [Object] key into metercount
# @param pattern [String, Array<String>] pattern passed to corrected_string
# @param syllables [Array<String>] syllables of the verse
# @return [Hash] :correct_weights (the edit script) and :correct_padas
#   (four annotated pada strings)
def fuzzy_correction(weights, meter, pattern, syllables)
  correct = corrected_string(weights, pattern)

  step_index = 0
  syllable_index = 0
  # 0: no run open, 1: run opened by "(l)", 2: run opened by "(g)"
  open_run = 0

  v_padas = metercount[meter].first(4).map do |pada_length|
    pieces = []
    pada_length.times do
      step = correct[step_index]
      case step
      when "d", "f"
        pieces << ("(" + syllables[syllable_index] + ")")
        syllable_index += 1
      when "g"
        if open_run.zero?
          pieces << " { (g)"
          open_run = 2
        else
          pieces << "(g)"
        end
      when "l"
        if open_run.zero?
          pieces << " { (l)"
          open_run = 1
        else
          pieces << "(l)"
        end
      else
        closes_run = (open_run == 2 && step == "L") || (open_run == 1 && step == "G")
        if closes_run
          pieces << " } " + syllables[syllable_index]
          open_run = 0
        else
          pieces << syllables[syllable_index]
        end
        syllable_index += 1
      end
      step_index += 1
    end
    pieces.join("")
  end

  {
    correct_weights: correct,
    correct_padas: v_padas,
  }
end
448
+
449
# Returns a hash of meter names to syllable counts: one entry per pada
# followed by the total. Built once and memoized.
#
# Meters listed in MetricalData.meters are counted from their pada strings.
# Meters that only appear in MetricalData.regexes.full are counted from the
# regexp source instead: one count per parenthesized group, and the total
# from the source with anchors and parentheses removed. Regexps containing
# alternation or no groups are skipped.
#
# @return [Hash{Object => Array<Integer>}] sorted by meter name and
#   deep-frozen
def metercount
  @metercount ||= begin
    counts = {}

    MetricalData.meters.each do |meter_name, pada_arr|
      pada_lengths = pada_arr.map(&:length)
      counts[meter_name] = pada_lengths << pada_lengths.reduce(:+)
    end

    MetricalData.regexes.full.each do |regexp, meter|
      meter_name = meter.keys.first
      next if counts.key?(meter_name)

      source = regexp.source
      next if source.include?("|") || !source.include?("(")

      group_lengths = source.scan(/\(([^()]*)\)/).flatten.map(&:length)
      bare_source = source.gsub(/[\^\$\(\)]/, "")
      counts[meter_name] = group_lengths << bare_source.length
    end

    counts.sort.to_h.deep_freeze
  end
end
471
+
472
# Work in progress: computes the Levenshtein distance between the joined
# pada patterns of every pair of meters in MetricalData.meters and, per
# meter, gathers the distances that are at most 5. The gathered distances
# are currently discarded and the method always returns nil.
#
# @return [nil]
def closeness
  MetricalData.meters.each_key do |name|
    matcher = Levenshtein.new(MetricalData.meters[name].join(""))
    near = []
    MetricalData.meters.each_key do |other|
      distance = matcher.match(MetricalData.meters[other].join(""))
      near << distance if distance <= 5
    end
    # `near` is dropped here; nothing consumes it yet.
  end
  nil
end
486
+
487
# Work in progress: looks for the split point near the middle of a verse at
# which the two halves' weight strings are most similar (lowest Levenshtein
# distance), then prints the syllables of each half on its own line.
#
# Split points within 3 syllables of the middle are tried. The range is
# clamped to the verse so that short verses no longer produce out-of-range
# slices (which are nil and cannot be compared).
#
# @param verse [String]
# @return [nil] output goes to stdout
def find_mid(verse)
  w = verse_weight(verse)
  c = w.length
  min = c
  v = 0

  middle = c / 2
  first_split = [middle - 3, 0].max
  last_split = [middle + 3, c].min

  (first_split..last_split).each do |val|
    edit = Levenshtein.new(w.slice(0, val)).match(w.slice(val, c - val))
    next unless edit < min
    min = edit
    v = val
  end

  verse_syllables = syllables(verse)
  puts verse_syllables.slice(0, v).join("")
  puts verse_syllables.slice(v, c - v).join("")
end
503
+ #
504
+ #
505
+ # WIP END
506
+ #
507
+ #
508
+ end
509
+ end