dphil 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/CODE_OF_CONDUCT.md +49 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE +201 -0
  5. data/README.md +54 -0
  6. data/Rakefile +11 -0
  7. data/dphil.gemspec +49 -0
  8. data/exe/dphil +10 -0
  9. data/lib/dphil.rb +53 -0
  10. data/lib/dphil/cache.rb +15 -0
  11. data/lib/dphil/change_list.rb +6 -0
  12. data/lib/dphil/character.rb +236 -0
  13. data/lib/dphil/character_matrix.rb +102 -0
  14. data/lib/dphil/cli.rb +26 -0
  15. data/lib/dphil/cli_commands/csv2ld.rb +71 -0
  16. data/lib/dphil/cli_commands/csv2nex.rb +37 -0
  17. data/lib/dphil/constants.rb +128 -0
  18. data/lib/dphil/converter.rb +58 -0
  19. data/lib/dphil/converters/csv2nex.rb +83 -0
  20. data/lib/dphil/ld_data_set.rb +25 -0
  21. data/lib/dphil/ld_output.rb +29 -0
  22. data/lib/dphil/lemma.rb +44 -0
  23. data/lib/dphil/lemma_list.rb +179 -0
  24. data/lib/dphil/log_formatter.rb +39 -0
  25. data/lib/dphil/logger.rb +27 -0
  26. data/lib/dphil/metrical_data.rb +78 -0
  27. data/lib/dphil/newick.rb +52 -0
  28. data/lib/dphil/paup.rb +34 -0
  29. data/lib/dphil/refinements.rb +8 -0
  30. data/lib/dphil/refinements/natural_sort.rb +52 -0
  31. data/lib/dphil/script_string.rb +124 -0
  32. data/lib/dphil/syllables.rb +43 -0
  33. data/lib/dphil/syllables/syllable.rb +45 -0
  34. data/lib/dphil/tei_xml.rb +142 -0
  35. data/lib/dphil/transliterate.rb +131 -0
  36. data/lib/dphil/tree.rb +142 -0
  37. data/lib/dphil/tree_node.rb +67 -0
  38. data/lib/dphil/verse.rb +25 -0
  39. data/lib/dphil/verse_analysis.rb +509 -0
  40. data/lib/dphil/verse_analysis_new.rb +816 -0
  41. data/lib/dphil/version.rb +30 -0
  42. data/vendor/default_commands.paup +18 -0
  43. data/vendor/metrical_data.yml +4035 -0
  44. metadata +409 -0
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Dphil
4
#
# Node in a phylogenetic tree.
#
# Stores an integer id, a name, an integer length, an optional parent and a
# list of children. Parent and children are accepted either as Integers or
# as TreeNode instances. All attributes are assigned through their setters,
# so coercion and validation apply equally at construction time and later.
#
class TreeNode
  include LDOutput
  attr_reader :id, :name, :length, :parent, :children

  # @param opts [Hash] may contain :id, :name, :length, :parent, :children;
  #   missing keys are passed to the setters as nil
  # @raise [ArgumentError] if :parent or :children hold unsupported values
  def initialize(opts = {})
    self.id = opts[:id]
    self.name = opts[:name]
    self.length = opts[:length]
    self.parent = opts[:parent]
    self.children = opts[:children]
  end

  # Stores the id coerced with #to_i (nil becomes 0).
  def id=(id)
    @id = id.to_i
  end

  # Stores the name coerced with #to_s (nil becomes "").
  def name=(name)
    @name = name.to_s
  end

  # Stores the length coerced with #to_i (fractional parts are dropped).
  def length=(length)
    @length = length.to_i
  end

  # @param parent [Integer, TreeNode, nil]
  # @raise [ArgumentError] for any other type
  def parent=(parent)
    unless parent.nil? || parent.is_a?(Integer) || parent.is_a?(TreeNode)
      raise ArgumentError, "Parent must be Integer, Node, or Nil"
    end
    @parent = parent
  end

  # Accepts a single child, an array of children, or nil (stored as []).
  #
  # @param children [Array<Integer, TreeNode>, Integer, TreeNode, nil]
  # @raise [ArgumentError] if any element is neither Integer nor TreeNode
  def children=(children)
    children = Array(children)
    unless children.all? { |e| e.is_a?(Integer) || e.is_a?(TreeNode) }
      # The message used to say "Parent", copied from #parent=.
      raise ArgumentError, "Children must be Integer or Node"
    end
    @children = children
  end

  # @return [Hash] the five attributes keyed by symbol
  def to_h
    {
      id: id,
      name: name,
      length: length,
      parent: parent,
      children: children,
    }
  end

  # Delegates to Hash#as_json on #to_h.
  # NOTE(review): Hash#as_json is not core Ruby; presumably supplied by
  # ActiveSupport or a project dependency -- confirm.
  def as_json(options = nil)
    to_h.as_json(options)
  end

  # Copies every attribute of +node+ onto this node through the setters,
  # skipping keys this object has no writer for.
  #
  # @param node [#to_h] source of the attribute values
  # @return [Hash] the hash produced by node.to_h
  def merge!(node)
    node.to_h.each do |k, v|
      writer = :"#{k}="
      send(writer, v) if respond_to?(writer)
    end
  end
end
67
+ end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Dphil
4
# A verse string with optional manuscript (+ms+) and +id+ labels, plus the
# result of VerseAnalysis.identify for that string.
#
# The constructor finishes with +deep_freeze+; +safe_copy+ and +deep_freeze+
# are presumably provided by Ragabash::Refinements -- confirm.
#
# NOTE(review): +syllables+ and +weights+ have readers and are emitted by
# #to_json, but nothing in this class assigns them, so they are always nil.
# The same data is available under identify[:syllables] / identify[:weights].
class Verse
  using ::Ragabash::Refinements
  attr_reader :ms, :id, :verse, :syllables, :weights, :identify

  # @param verse [#to_str] the verse text
  # @param ms [Object, nil] manuscript label
  # @param id [Object, nil] verse identifier
  def initialize(verse, ms: nil, id: nil)
    @verse = verse.to_str.safe_copy
    @ms = ms.safe_copy
    @id = id.safe_copy
    @identify = VerseAnalysis.identify(@verse)
    deep_freeze
  end

  # Serialises the verse as a JSON object.
  #
  # Accepts the same optional arguments as Hash#to_json, so both
  # +verse.to_json+ and generator-driven calls that pass a state/options
  # argument work (the former used to raise ArgumentError).
  #
  # @return [String] JSON text
  def to_json(*args)
    {
      ms: ms,
      id: id,
      verse: verse,
      syllables: syllables,
      weights: weights,
      identify: identify,
    }.to_json(*args)
  end
end
25
+ end
@@ -0,0 +1,509 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "amatch"
5
+
6
+ module Dphil
7
+ module VerseAnalysis
8
+ using ::Ragabash::Refinements
9
+
10
+ module_function
11
+
12
+ include Amatch
13
+
14
# Splits a verse string into syllables.
#
# The work is wrapped in Dphil.cache, called with a fixed label and the
# input string. Inside, the characters | . , \ and digits are removed,
# whitespace runs are collapsed to one space, and the text is passed through
# Transliterate.iast_slp1, scanned with Constants::R_SYL, and each hit is
# passed back through Transliterate.slp1_iast.
#
# @param str [#to_str] verse text
# @return [Array<String>] whatever the cached block yields (the mapped scan)
def syllables(str)
  str = str.to_str
  Dphil.cache("VerseAnalysis.syllables", str) do
    cleaned = str.gsub(/[\|\.\,\\0-9]+/, "").gsub(/\s+/, " ").strip
    slp1_text = Transliterate.iast_slp1(cleaned)
    slp1_text.scan(Constants::R_SYL).map do |syl|
      Transliterate.slp1_iast(syl)
    end
  end
end
23
+
24
# Builds the weight string for a list of syllables: one "G" (guru) or "L"
# per syllable, concatenated in order.
#
# A syllable is "G" when, after Transliterate.iast_slp1 and removal of
# apostrophes/surrounding whitespace, it matches Constants::R_GSYL, or when
# its last character joined with the first character of the following
# syllable matches Constants::R_CCON (look-ahead across the boundary).
# Everything else is "L". The computation runs inside Dphil.cache.
#
# @param syllables [#to_ary] syllable strings
# @return [String] weight string as produced by the cached block
def syllables_weights(syllables)
  syllables = syllables.to_ary
  Dphil.cache("VerseAnalysis.syllables_weights", syllables) do
    slp1 = syllables.map { |syl| Transliterate.iast_slp1(syl) }
    marks = slp1.each_index.map do |idx|
      current = slp1[idx].delete("'").strip
      following = slp1[idx + 1]&.delete("'")&.strip
      boundary = "#{current[-1]}#{following&.slice(0)}"
      heavy = current =~ Constants::R_GSYL || boundary =~ Constants::R_CCON
      heavy ? "G" : "L"
    end
    marks.join("")
  end
end
44
+
45
# Convenience wrapper: weight string of a verse, i.e. syllables_weights
# applied to syllables(str), wrapped in its own Dphil.cache call.
#
# @param str [String] verse text
# @return [String] weight string
def verse_weight(str)
  Dphil.cache("VerseAnalysis.verse_weight", str) { syllables_weights(syllables(str)) }
end
50
+
51
+ #
52
+ #
53
+ # WIP BEGIN
54
+ #
55
+ #
56
+
57
# Searches MetricalData for entries that match +weight_string+ exactly.
#
# Both the :patterns and :regexes tables of MetricalData.all are consulted
# for the requested +size+. String keys must equal the weight string; Regexp
# keys must match it, in which case the reported pattern is the captures
# array (when the regex has groups) or the matched text.
#
# FIXME: Only considers full weight string, so only useful for full patterns
#
# @param weight_string [String] weights to look up
# @param size [Symbol] table to search (:full by default)
# @param padas [Array<Integer>] copied verbatim into every result
# @return [Array<Hash>] hashes with :meter, :type, :size, :pattern, :scope, :padas
def meter_search_exact(weight_string, size = :full, padas = [1, 2, 3, 4])
  %i[patterns regexes].flat_map do |type|
    MetricalData.all[type][size].each_with_object([]) do |(pattern, meter), found|
      name = meter.keys.first
      scope = meter.values.first
      matched = nil
      if pattern.is_a?(String)
        next unless weight_string == pattern
        matched = pattern
      elsif pattern.is_a?(Regexp)
        hit = pattern.match(weight_string)
        next if hit.nil?
        matched = hit.length > 1 ? hit.captures : hit.to_s
      end
      found << {
        meter: name,
        type: type.to_s,
        size: size.to_s,
        pattern: matched,
        scope: scope,
        padas: padas,
      }
    end
  end
end
88
+
89
# Searches MetricalData for entries within a small edit distance of
# +weight_string+.
#
# String patterns qualify when their length differs from the input by at
# most 5 and their Levenshtein distance is at most 5. Regexp patterns are
# only considered when they contain no alternation ("|"): anchors and
# parentheses are stripped, "." is treated as "L" for the distance, and the
# edit script from corrected_string is replayed to build a concrete pattern
# string. A substitution that lands on a "." wildcard is not counted as an
# edit and takes the input's own weight.
#
# NOTE(review): unlike meter_search_exact, :type and :size are returned as
# Symbols here, not Strings; left as is because callers may rely on it.
#
# @param weight_string [String] weights to look up
# @param size [Symbol] table to search (:full by default)
# @return [Array<Hash>] hashes with :meter, :type, :size, :pattern, :edit_distance
def meter_search_fuzzy(weight_string, size = :full)
  syllable_count = weight_string.length
  length_variance = 5
  edit_tolerance = 5
  matcher = Levenshtein.new(weight_string)

  %i[patterns regexes].flat_map do |type|
    MetricalData.all[type][size].each_with_object([]) do |(pattern, meter), acc|
      meter_name = meter.keys.first
      case pattern
      when String
        next unless (pattern.length - syllable_count).abs <= length_variance
        edit_distance = matcher.match(pattern)
        next if edit_distance > edit_tolerance
        pattern_string = pattern
      when Regexp # FIXME: approximate matching in case of regexes
        next if pattern.source["|"]
        # Non-bang gsub: gsub! returns nil when nothing is replaced, which
        # used to crash on regexes without anchors or groups.
        source = pattern.source.gsub(/[\^\$\(\)]/, "")
        next if (source.length - syllable_count).abs > length_variance
        literal = source.tr(".", "L")

        edit_distance = matcher.match(literal)
        ops = corrected_string(weight_string, literal).join("")

        wildcard = source.lstrip
        pattern_pos = 0 # cursor into the wildcard pattern
        weight_pos = 0  # cursor into weight_string
        pieces = []
        ops.each_char do |op|
          case op
          when "L", "G" # weights agree
            pieces << op
            pattern_pos += 1
            weight_pos += 1
          when "l" # pattern has an L the input lacks
            pieces << "L"
            pattern_pos += 1
          when "g" # pattern has a G the input lacks
            pieces << "G"
            pattern_pos += 1
          when "d" # input has an extra syllable
            weight_pos += 1
          else
            if op == "f" && wildcard[pattern_pos] == "."
              # Wildcard position: any weight is acceptable, so undo the edit.
              edit_distance -= 1
              pieces << weight_string[weight_pos]
            else
              pieces << wildcard[pattern_pos]
            end
            pattern_pos += 1
            weight_pos += 1
          end
        end
        pattern_string = pieces.join("")
        next if edit_distance > edit_tolerance
      end
      acc << {
        meter: meter_name,
        type: type,
        size: size,
        pattern: pattern_string,
        edit_distance: edit_distance,
      }
    end
  end
end
159
+
160
# Splits a full weight string into two halves (padas 1+2 and padas 3+4)
# using the per-pada syllable counts that metercount holds for +meter+.
#
# @param weight_string [String] weights of the whole verse
# @param meter [String] meter name used as key into metercount
# @return [Array<String, nil>, nil] the two halves, or nil when metercount
#   has no entry with four pada lengths for +meter+ (callers already guard
#   the result with &.)
def weight_try_half(weight_string, meter)
  lengths = metercount[meter]
  # Entries are [pada1, pada2, pada3, pada4, total]; anything shorter cannot
  # be split into halves and previously raised on nil.
  return if lengths.nil? || lengths.size < 5

  first_half = lengths[0] + lengths[1]
  [
    weight_string.slice(0, first_half),
    weight_string.slice(first_half, lengths[2] + lengths[3]),
  ]
end
169
+
170
# Splits a full weight string into four padas using the per-pada syllable
# counts that metercount holds for +meter+.
#
# @param weight_string [String] weights of the whole verse
# @param meter [String] meter name used as key into metercount
# @return [Array<String, nil>, nil] four slices (a slice is nil when the
#   string is too short to reach it), or nil when metercount has no entry
#   with four pada lengths for +meter+ (callers already guard with &.)
def weight_try_pada(weight_string, meter)
  lengths = metercount[meter]
  # Entries are [pada1, pada2, pada3, pada4, total]; anything shorter cannot
  # be split into four padas and previously raised on nil.
  return if lengths.nil? || lengths.size < 5

  offset = 0
  lengths.first(4).map do |pada_length|
    pada = weight_string.slice(offset, pada_length)
    offset += pada_length
    pada
  end
end
181
+
182
# Collects meter candidates for a list of syllables.
#
# First tries meter_search_exact on the full weight string. If that finds
# something, the weight string is additionally split into halves and padas
# (using the first candidate's meter) and each part is searched exactly as
# well. If nothing matches exactly, meter_search_fuzzy is used and only the
# candidate with the smallest edit distance is kept per meter.
#
# @param syllables [Array<String>] syllables of the verse
# @return [Hash] :status ("exact match" or "fuzzy match"), :syllables
#   (a dup of the input), :weights, and :meters (meter name => candidates)
def analyze_syllables(syllables)
  v_syllables = syllables.dup
  v_weight = syllables_weights(v_syllables)
  meter_candidates = {}

  meter_search_exact(v_weight).each do |hit|
    (meter_candidates[hit[:meter]] ||= []) << hit
  end

  if meter_candidates.empty?
    meter_search_fuzzy(v_weight).each do |hit|
      kept = meter_candidates[hit[:meter]]
      if kept.nil?
        meter_candidates[hit[:meter]] = [hit]
      elsif hit[:edit_distance] < kept[0][:edit_distance]
        # A closer match replaces whatever was stored for this meter.
        kept.clear
        kept << hit
      end
    end
    status = "fuzzy match"
  else
    halves = weight_try_half(v_weight, meter_candidates.keys.first)
    halves&.each_with_index do |half, index|
      padas = index == 0 ? [1, 2] : [3, 4]
      meter_search_exact(half, :half, padas).each do |hit|
        (meter_candidates[hit[:meter]] ||= []) << hit
      end
    end

    quarters = weight_try_pada(v_weight, meter_candidates.keys.first)
    quarters&.each_with_index do |quarter, index|
      meter_search_exact(quarter, :pada, [index + 1]).each do |hit|
        (meter_candidates[hit[:meter]] ||= []) << hit
      end
    end

    status = "exact match"
  end

  {
    status: status,
    syllables: v_syllables,
    weights: v_weight,
    meters: meter_candidates,
  }
end
242
+
243
# Identifies the closest meter for a verse and returns its padas, plus
# corrections when only an approximate match was found.
#
# Steps:
# 1. Derive syllables and weights from the input.
# 2. Ask analyze_syllables for meter candidates.
# 3. Exact match: split the syllables into padas by the meter's pada
#    lengths from metercount. Fuzzy match: take the candidate with the
#    smallest edit distance (the first one wins ties) and let
#    fuzzy_correction mark up the differences.
# 4. Return input data, status and meter information.
#
# NOTE(review): the "no candidates" status text contains the typo "neter";
# it is left untouched because consumers may compare on the status string.
#
# @param verse_string [String] verse text
# @return [Hash] :verse, :syllables, :weights, :status and :meter. For an
#   exact match :meter is the meter name and :padas is added; otherwise
#   :meter is a one-element array describing the best candidate.
def identify(verse_string)
  v_syllables = syllables(verse_string)
  v_weight = syllables_weights(v_syllables)
  m = analyze_syllables(v_syllables)
  meter_candidates = m[:meters]

  meter = nil
  defect_percentage = nil
  correct = []
  v_padas = []

  if meter_candidates.empty?
    m[:status] = "Verse highly defective , Can't find neter"
  elsif m[:status] == "exact match"
    meter = meter_candidates.keys.first
    pada_lengths = metercount[meter]
    # metercount skips some regex-only meters; without pada lengths the
    # verse cannot be split, so padas stay empty instead of raising on nil.
    if pada_lengths && pada_lengths.size >= 5
      v_padas = pada_lengths.first(4).map do |pada_length|
        m[:syllables].slice!(0, pada_length).join("")
      end
    end
  else
    best_distance = 100.0
    pattern = []
    meter_candidates.each do |name, hits|
      # Strict comparison: on equal distances the earlier meter is kept.
      next unless hits[0][:edit_distance].to_i < best_distance
      best_distance = hits[0][:edit_distance]
      meter = name
      pattern = hits[0][:pattern].split("")
    end

    defect_percentage = Rational(best_distance, meter_candidates[meter][0][:pattern].length)
    fix = fuzzy_correction(m[:weights], meter, pattern, m[:syllables])
    correct = fix[:correct_weights]
    v_padas = fix[:correct_padas]
  end

  v_meters = {
    name: meter,
    size: "full/half/pada",
    defectiveness: defect_percentage,
    corrections: [{ weights: correct.join(""), padas: v_padas }],
  }

  result = {
    verse: verse_string,
    syllables: v_syllables,
    weights: v_weight,
    status: m[:status],
    meter: [v_meters],
  }

  if result[:status] == "exact match"
    result[:meter] = v_meters[:name]
    result[:padas] = v_padas
  end

  result
end
326
+
327
# Computes an edit script that turns +weights+ into +pattern+, using a
# Levenshtein table followed by a traceback.
#
# One marker is emitted per step, in left-to-right order:
#   "L"/"G"  weights agree at this position
#   "l"/"g"  the pattern has an L / non-L that the input lacks (insertion)
#   "f"      input and pattern differ (substitution)
#   "d"      the input has an extra weight (deletion)
# When several edits cost the same, insertion is preferred over
# substitution, and substitution over deletion.
#
# Neither argument is modified (the method used to prepend " " to the
# caller's pattern). The traceback also no longer reads table cells through
# negative indices once it reaches the first row or column; that wraparound
# produced wrong scripts, e.g. "LLG" vs "G" gave f,f,G instead of d,d,G.
#
# @param weights [String] actual weight string
# @param pattern [String, Array<String>] expected weights
# @return [Array<String>] edit script markers
def corrected_string(weights, pattern)
  # Index 0 is a sentinel so table rows/columns line up with prefixes.
  actual = [" ", *weights.chars]
  target = [" ", *(pattern.is_a?(String) ? pattern.chars : pattern.to_a)]

  table = Array.new(actual.length) { Array.new(target.length) }
  actual.each_index { |i| table[i][0] = i }
  target.each_index { |j| table[0][j] = j }

  (1...actual.length).each do |i|
    (1...target.length).each do |j|
      table[i][j] =
        if actual[i] == target[j]
          table[i - 1][j - 1]
        else
          [table[i - 1][j], table[i - 1][j - 1], table[i][j - 1]].min + 1
        end
    end
  end

  reversed = []
  i = actual.length - 1
  j = target.length - 1
  while i > 0 || j > 0
    if i > 0 && j > 0 && actual[i] == target[j]
      reversed << actual[i]
      i -= 1
      j -= 1
    elsif i.zero?
      # Input exhausted: everything left in the pattern is an insertion.
      reversed << (target[j] == "L" ? "l" : "g")
      j -= 1
    elsif j.zero?
      # Pattern exhausted: everything left in the input is a deletion.
      reversed << "d"
      i -= 1
    else
      cheapest = [table[i - 1][j], table[i - 1][j - 1], table[i][j - 1]].min
      if cheapest == table[i][j - 1]
        reversed << (target[j] == "L" ? "l" : "g")
        j -= 1
      elsif cheapest == table[i - 1][j - 1]
        reversed << "f" # to mark substitution in string
        i -= 1
        j -= 1
      else
        reversed << "d" # to mark deletion from string
        i -= 1
      end
    end
  end
  reversed.reverse
end
381
+
382
# Renders the edit script between +weights+ and +pattern+ as annotated
# padas.
#
# The script comes from corrected_string and is consumed one marker per
# pattern position, pada by pada (first four lengths in metercount[meter]):
#   "d"/"f"  the current syllable is wrapped in parentheses
#   "g"/"l"  a missing syllable is shown as "(g)"/"(l)"; the first one of a
#            run is prefixed with " { "
#   other    the syllable is emitted unchanged; after an open "{" run the
#            first syllable whose marker is the opposite weight ("L" after a
#            "g" run, "G" after an "l" run) is prefixed with " } "
#
# An unknown meter yields no padas instead of raising, and syllables that
# run out are rendered as empty strings.
#
# @param weights [String] actual weight string
# @param meter [String] meter name used as key into metercount
# @param pattern [String, Array<String>] expected weights
# @param syllables [Array<String>] syllables of the verse
# @return [Hash] :correct_weights (the edit script) and :correct_padas
def fuzzy_correction(weights, meter, pattern, syllables)
  correct = corrected_string(weights, pattern)
  lengths = metercount[meter]
  return { correct_weights: correct, correct_padas: [] } if lengths.nil?

  op_index = 0
  syl_index = 0
  open_group = 0 # 0 = no open "{", 1 = opened by "l", 2 = opened by "g"

  v_padas = lengths.first(4).map do |pada_length|
    pieces = []
    pada_length.times do
      op = correct[op_index]
      case op
      when "d", "f" # still to figure out
        pieces << "(#{syllables[syl_index]})"
        syl_index += 1
      when "g", "l"
        if open_group.zero?
          pieces << " { (#{op})"
          open_group = op == "g" ? 2 : 1
        else
          pieces << "(#{op})"
        end
      else
        closes_group = (open_group == 2 && op == "L") || (open_group == 1 && op == "G")
        if closes_group
          pieces << " } #{syllables[syl_index]}"
          open_group = 0
        else
          pieces << syllables[syl_index]
        end
        syl_index += 1
      end
      op_index += 1
    end
    pieces.join("")
  end

  {
    correct_weights: correct,
    correct_padas: v_padas,
  }
end
448
+
449
# Returns a hash of meter name => [syllables per pada..., total syllables].
#
# Entries come from MetricalData.meters first (one length per pada, plus
# their sum). Meters that only appear in MetricalData.regexes.full are added
# when their regex has capture groups and no alternation: one length per
# group, followed by the length of the source with ^ $ ( ) removed.
# The result is sorted by name, deep-frozen, and memoised in @metercount.
#
# @return [Hash{String => Array<Integer>}]
def metercount
  @metercount ||= begin
    counts = MetricalData.meters.each_with_object({}) do |(meter_name, pada_arr), acc|
      lengths = pada_arr.map(&:length)
      acc[meter_name] = lengths << lengths.reduce(&:+)
    end

    MetricalData.regexes.full.each do |regex, meter|
      meter_name = meter.keys.first
      next if counts.key?(meter_name)
      source = regex.source
      next if source.include?("|") || !source.include?("(")
      group_lengths = source.scan(/\(([^()]*)\)/).flatten.map(&:length)
      counts[meter_name] = group_lengths << source.gsub(/[\^\$\(\)]/, "").length
    end

    counts.sort.to_h.deep_freeze
  end
end
471
+
472
# Compares every meter in MetricalData.meters with every meter (itself
# included) by Levenshtein distance of their joined pada patterns.
#
# NOTE(review): distances of 5 or less are collected per meter but the
# collection is thrown away after each outer iteration, so the method has no
# observable result. WIP code, kept as is.
#
# @return [nil]
def closeness
  meters = MetricalData.meters
  meters.keys.each do |name|
    matcher = Levenshtein.new(meters[name].join(""))
    near = []
    meters.keys.each do |other|
      distance = matcher.match(meters[other].join(""))
      near << distance if distance <= 5
    end
  end
  nil
end
486
+
487
# Guesses where a verse splits into two similar halves and prints both.
#
# Every split point within 3 syllables of the middle is tried; the one whose
# two sides have the smallest Levenshtein distance between their weight
# strings wins (the earliest wins ties; the distance must beat the verse
# length, otherwise the split stays at 0). Split points are clamped to
# 0..length, so verses shorter than six syllables no longer hand nil to the
# matcher.
#
# Prints the syllables before and after the split to stdout, one line each.
#
# @param verse [String] verse text
# @return [nil]
def find_mid(verse)
  w = verse_weight(verse)
  c = w.length
  min = c
  v = 0
  lowest = [(c / 2) - 3, 0].max
  highest = [(c / 2) + 3, c].min
  (lowest..highest).each do |val|
    matcher = Levenshtein.new(w.slice(0, val))
    edit = matcher.match(w.slice(val, c - val))
    if edit < min
      min = edit
      v = val
    end
  end
  syls = syllables(verse)
  puts syls.slice(0, v).join("")
  puts syls.slice(v, c - v).join("")
end
503
+ #
504
+ #
505
+ # WIP END
506
+ #
507
+ #
508
+ end
509
+ end