dphil 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/CODE_OF_CONDUCT.md +49 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE +201 -0
  5. data/README.md +54 -0
  6. data/Rakefile +11 -0
  7. data/dphil.gemspec +49 -0
  8. data/exe/dphil +10 -0
  9. data/lib/dphil.rb +53 -0
  10. data/lib/dphil/cache.rb +15 -0
  11. data/lib/dphil/change_list.rb +6 -0
  12. data/lib/dphil/character.rb +236 -0
  13. data/lib/dphil/character_matrix.rb +102 -0
  14. data/lib/dphil/cli.rb +26 -0
  15. data/lib/dphil/cli_commands/csv2ld.rb +71 -0
  16. data/lib/dphil/cli_commands/csv2nex.rb +37 -0
  17. data/lib/dphil/constants.rb +128 -0
  18. data/lib/dphil/converter.rb +58 -0
  19. data/lib/dphil/converters/csv2nex.rb +83 -0
  20. data/lib/dphil/ld_data_set.rb +25 -0
  21. data/lib/dphil/ld_output.rb +29 -0
  22. data/lib/dphil/lemma.rb +44 -0
  23. data/lib/dphil/lemma_list.rb +179 -0
  24. data/lib/dphil/log_formatter.rb +39 -0
  25. data/lib/dphil/logger.rb +27 -0
  26. data/lib/dphil/metrical_data.rb +78 -0
  27. data/lib/dphil/newick.rb +52 -0
  28. data/lib/dphil/paup.rb +34 -0
  29. data/lib/dphil/refinements.rb +8 -0
  30. data/lib/dphil/refinements/natural_sort.rb +52 -0
  31. data/lib/dphil/script_string.rb +124 -0
  32. data/lib/dphil/syllables.rb +43 -0
  33. data/lib/dphil/syllables/syllable.rb +45 -0
  34. data/lib/dphil/tei_xml.rb +142 -0
  35. data/lib/dphil/transliterate.rb +131 -0
  36. data/lib/dphil/tree.rb +142 -0
  37. data/lib/dphil/tree_node.rb +67 -0
  38. data/lib/dphil/verse.rb +25 -0
  39. data/lib/dphil/verse_analysis.rb +509 -0
  40. data/lib/dphil/verse_analysis_new.rb +816 -0
  41. data/lib/dphil/version.rb +30 -0
  42. data/vendor/default_commands.paup +18 -0
  43. data/vendor/metrical_data.yml +4035 -0
  44. metadata +409 -0
@@ -0,0 +1,816 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "amatch"
4
+
5
+ module Dphil
6
+ module VerseAnalysis
7
+ using ::Ragabash::Refinements
8
+
9
+ module_function
10
+
11
# Splits a verse string into its individual syllables.
#
# Pipe, period, comma, backslash and digit characters are stripped and runs
# of whitespace are collapsed before the text is transliterated to SLP1 and
# scanned with Constants::R_SYL.
#
# @param verse_string [String] the raw text of the verse.
# @param from [Symbol] source transliteration scheme (detected by default)
# @param to [Symbol] output transliteration scheme (defaults to source)
# @return [Array] the syllables, transliterated into the `to` scheme.
def syllables(verse_string, from: nil, to: nil)
  cleaned = verse_string.to_str.gsub(/[\|\.\,\\0-9]+/, "").gsub(/\s+/, " ").strip
  from ||= Transliterate.detect(cleaned) || Transliterate.default_script
  to ||= from

  slp1_text = Transliterate.transliterate(cleaned, from, :slp1)
  result = slp1_text.scan(Constants::R_SYL)
  return result if to == :slp1

  # Syllable boundaries are found in SLP1; convert each piece back afterwards.
  result.map! { |syl| Transliterate.transliterate(syl, :slp1, to) }
end
26
+
27
# Computes the L/G weight of every syllable in a list.
#
# Each syllable (apostrophes removed, whitespace stripped) is classified as:
#   "G" - it matches Constants::R_GSYL,
#   "g" - it matches Constants::R_CCONF, or its last character together with
#         the first character of the following syllable matches
#         Constants::R_CCON (weight that depends on context),
#   "L" - otherwise.
#
# @param syllables [Array] a set of syllables
# @param from [Symbol] transliteration scheme of the syllables (detected by default)
# @param contextual [Boolean] when false the contextual "g" marks are upcased to "G"
# @return [String] the weight string of the syllables of the verse
def syllables_weights(syllables, from: nil, contextual: false)
  from ||= Transliterate.detect(syllables.join("")) || Transliterate.default_script
  syllables = syllables.to_ary.map { |syl| Transliterate.transliterate(syl, from, :slp1) } if from != :slp1

  marks = Array.new(syllables.length) do |idx|
    current = syllables[idx].delete("'").strip
    following = syllables[idx + 1]&.delete("'")&.strip
    # Last character of this syllable joined to the first of the next one.
    boundary = "#{current[-1]}#{following&.slice(0)}"

    if current.match?(Constants::R_GSYL)
      "G"
    elsif current.match?(Constants::R_CCONF) || boundary.match?(Constants::R_CCON)
      "g"
    else
      "L"
    end
  end

  joined = marks.join("")
  contextual ? joined : joined.upcase
end
53
+
54
# Convenience wrapper: syllabifies a verse and returns its weight string.
#
# @param verse_string [String] the raw text of the verse
# @param contextual [Boolean] passed through to {#syllables_weights}
# @return [String] the weight string of the verse.
def verse_weights(verse_string, contextual: false)
  verse_syllables = syllables(verse_string)
  syllables_weights(verse_syllables, contextual: contextual)
end
61
+
62
# Runs the full analysis of a verse and reports the best meter candidate.
#
# @param verse_string [String] the raw text of the verse
# @return [Hash] the verse, its syllables and contextual weights, plus the
#   :status, :meter and :padas of the first candidate returned by
#   {#identify_meter_manager} (all three are nil when there is no candidate).
def identify(verse_string)
  verse_syllables = syllables(verse_string)
  weights = syllables_weights(verse_syllables, contextual: true)
  best = identify_meter_manager(verse_string).first

  status = meter = padas = nil
  if best
    status = best[:info]
    # The meter name is moved out of the info hash into its own key.
    meter = status.delete(:meter)
    padas = best[:corrected_padas]
  end

  {
    verse: verse_string,
    syllables: verse_syllables,
    weights: weights,
    status: status,
    meter: meter,
    padas: padas,
  }
end
80
+
81
# Coordinates metrical identification for a verse string.
#
# For each assumed verse length (4 padas down to 1) the weight string is
# searched for known patterns, the hits are compacted per meter and handed to
# {#fuzzy_manager}. When nothing is found at all, a purely fuzzy analysis
# assuming a full verse is used instead. Candidates are ordered by descending
# :heuristic and the top two are returned via {#get_best_matches}.
#
# @param verse_string [String] a verse string
# @return [Array] candidate meters and information about their matches
def identify_meter_manager(verse_string)
  verse_syllables = syllables(verse_string)
  weight_string = syllables_weights(verse_syllables)

  candidates = []
  4.downto(1) do |guess_size|
    # TODO: Pre-process or somehow change this so that search is aware of
    # how weight_string may or may not break across padas
    # (i.e. whitespace in syllables)
    search_results = meter_search_partial(weight_string, guess_size)
    next if search_results.empty?

    # Filter down to most-complete matches for each meter.
    # NOTE(review): an earlier `sort_by { match_percent }.to_h` here discarded
    # its result and therefore never affected ordering; it has been removed.
    # Confirm whether a real ordering by match_percent was intended.
    meter_results = compact_meter_results(search_results.group_by { |result| result[:meter_name] })

    candidates.concat(fuzzy_manager(meter_results, guess_size, weight_string))
  end

  candidates.concat(fuzzy_analysis(weight_string, 4, 0)) if candidates.empty?

  candidates.sort_by! { |value| value[:heuristic] }
  candidates.reverse!
  get_best_matches(candidates, verse_syllables, 2)
end
117
+
118
# Packages the leading candidates together with their corrected padas.
#
# @param candidates [Array<Hash>] candidates, best first; each is expected to
#   carry :meter and :correct_weights
# @param syllables [Array] the syllables of the verse
# @param number [Integer] maximum number of candidates to return
# @return [Array<Hash>] hashes with :info (the candidate itself) and
#   :corrected_padas (the result of {#fuzzy_correction})
def get_best_matches(candidates, syllables, number)
  candidates.take([number, 0].max).map do |candidate|
    {
      info: candidate,
      corrected_padas: fuzzy_correction(candidate[:meter], candidate[:correct_weights], syllables),
    }
  end
end
137
+
138
# Searches for meter candidates for a given weight string and search size.
#
# Pattern sizes tried: full, half and pada when guess_size is 4; half and pada
# when it is at least 2; pada only otherwise. For every size both the literal
# patterns and the regexes in MetricalData are consulted.
#
# @param weight_string [String] a weight string
# @param guess_size [Integer] the number of padas to match against
# @return [Array] candidate meters and associated match data
def meter_search_partial(weight_string, guess_size)
  size_groups = %i[pada]
  size_groups = %i[half pada] if guess_size >= 2
  size_groups = %i[full half pada] if guess_size == 4

  size_groups.product(%i[patterns regexes]).each_with_object([]) do |(pattern_size, pattern_type), found|
    MetricalData.all[pattern_type][pattern_size].each do |pattern, meter|
      next unless useful_comparison?(weight_string, pattern, pattern_size, guess_size)

      matches = find_pattern(weight_string, pattern)
      next if matches.empty?

      # Fraction of the weight string covered by the matched ranges.
      matched_length = matches.reduce(0.0) { |total, range| total + range.size }
      found << {
        meter_name: meter.each_key.first,
        type: pattern_type,
        size: pattern_size,
        scope: meter.each_value.first,
        pattern: pattern,
        matches: matches,
        coverage: matched_length / weight_string.length,
        guess_size: guess_size,
      }
    end
  end
end
173
+
174
# Decides whether matching a pattern against a weight string is worthwhile,
# judged by how far the weight string's length is from the length expected for
# `guess_size` padas of that pattern.
#
# @param weight_string [String] a weight string
# @param pattern [String, Regexp] a match pattern
# @param pattern_size [Symbol] the size of a match pattern
# @param guess_size [Integer] the number of padas being searched for
# @param tolerance [Numeric] the tolerance percentage of length difference
#
# @return [Boolean] true if comparison has good chance of being useful
def useful_comparison?(weight_string, pattern, pattern_size, guess_size, tolerance = 0.2)
  pattern = clean_regexp_pattern(pattern) if pattern.is_a?(Regexp)
  multiplier = pattern_size_multiplier(pattern_size)

  # multiplier * pattern.length approximates a full verse; scale to guess_size/4.
  expected_length = guess_size * multiplier * pattern.length / 4
  allowed = tolerance * multiplier * pattern.length
  (weight_string.length - expected_length).abs <= allowed
end
191
+
192
# Finds all non-overlapping occurrences of a match pattern in a weight string,
# scanning left to right.
#
# Empty String patterns and zero-width Regexp matches are ignored; they carry
# no syllables and would otherwise keep the scan from advancing.
#
# @param weight_string [String] a weight string
# @param pattern [String, Regexp] a match pattern
# @return [Array<Range>] index ranges (inclusive) of the matches within the
#   weight string; empty for any other pattern type
def find_pattern(weight_string, pattern)
  indexes = []
  cursor = 0

  case pattern
  when String
    return indexes if pattern.empty?

    while (start = weight_string.index(pattern, cursor))
      finish = start + pattern.length - 1
      indexes << (start..finish)
      cursor = finish + 1
    end
  when Regexp
    while cursor <= weight_string.length && (match = pattern.match(weight_string, cursor))
      start = match.begin(0)
      if match[0].empty?
        # Zero-width match: step past it so the scan always makes progress.
        cursor = start + 1
        next
      end

      finish = start + match[0].length - 1
      indexes << (start..finish)
      cursor = finish + 1
    end
  end

  indexes
end
217
+
218
# Returns the source of a Regexp with grouping, anchor and alternation
# characters -- ( ) ^ $ | -- removed.
#
# @param regexp [Regexp] a regular expression
# @return [String] a clean string of the pattern
def clean_regexp_pattern(regexp)
  regexp.source.gsub(/[\(\)\^\$\|]+/, "")
end
227
+
228
# Returns how many patterns of the given size make up a whole verse.
#
# @param pattern_size [Symbol] a pattern size symbol (:full, :half or :pada)
# @return [Integer, nil] a multiplier, or nil for an unknown size
def pattern_size_multiplier(pattern_size)
  { full: 1, half: 2, pada: 4 }[pattern_size]
end
242
+
243
# Checks that a range does not overlap any range in an array of ranges.
#
# Two ranges overlap when either contains an endpoint of the other; this also
# catches a new range that completely surrounds an existing one.
#
# @param indexes [Array<Range>] ranges already taken
# @param range [Range] the range to test
# @return [Boolean] true when `range` overlaps none of `indexes`
def check_non_overlapping?(indexes, range)
  indexes.none? do |taken|
    taken.cover?(range.begin) || taken.cover?(range.end) || range.cover?(taken.begin)
  end
end
256
+
257
# Filters out redundant meter results by compiling together the most
# complete possible set of non-overlapping matches for each meter.
#
# Every accepted match adds to the meter's :match_percent according to the
# size of the pattern that produced it (full 100, half 50, pada 25).
#
# @param meter_results [Hash] a hash of search results grouped by meter
# @return [Hash] a compacted hash of search results grouped by meter; each
#   value is a one-element array holding :pattern_type, :matches (sorted by
#   start index) and :match_percent
def compact_meter_results(meter_results)
  size_scores = { full: 100, half: 50, pada: 25 }

  meter_results.each_with_object({}) do |(meter_name, results), compacted|
    kept = []
    score = 0

    results.each do |result|
      result[:matches].each do |range|
        next unless check_non_overlapping?(kept, range)

        kept << range
        score += size_scores.fetch(result[:size], 0)
      end
    end

    compacted[meter_name] = [
      {
        pattern_type: results[0][:type],
        matches: kept.sort_by { |a| a.to_s.split("..").first.to_i },
        match_percent: score,
      },
    ]
  end
end
291
+
292
# Extends the exact-match results for each meter with fuzzy corrections of the
# parts of the weight string that did not match.
#
# The match percentage is rescaled against the assumed verse length and then
# drives the strategy:
#   100     - everything matched; unmatched positions are marked "d" and no
#             further meters are examined.
#   60..75  - the largest unmatched stretch is corrected against a pada pattern.
#   30..50  - up to two unmatched stretches are corrected (a single half-verse
#             pattern is used instead when the stretch is long enough).
#   other   - results of {#fuzzy_analysis} for the same meter are added and the
#             whole working string is replaced by "x" marks.
#
# @param meter_results [Hash] compacted results from {#compact_meter_results}
# @param guess_size [Integer] the number of padas assumed to be present
# @param weight_string [String] the weight string of the verse
# @return [Array<Hash>] candidate descriptions including :meter,
#   :match_indexes, :correct_weights, :edit_count and :heuristic
def fuzzy_manager(meter_results, guess_size, weight_string)
  original_weights = weight_string.dup
  extended_result = []
  exact_found = false

  meter_results.each do |meter_name, results|
    break if exact_found

    summary = results[0]
    working = original_weights.dup
    indexes = summary[:matches]
    quality = (summary[:match_percent] * 100) / (guess_size * 25)

    # to deal with problematic case when p=25 and guess=2, should be only one correction
    quality += 25 if quality == 50 && guess_size == 2

    case quality
    when 100
      working = remove_extra_syllables(working, indexes)
      exact_found = true
    when 60..75
      gap = get_unmatched_range(indexes, working.length - 1)
      portion = working.slice!(gap.begin, (gap.end - gap.begin + 1))
      pattern = get_specific_pattern(meter_name, :pada, summary[:pattern_type], portion)
      correct = corrected_string(portion, pattern)
      working.insert(gap.begin, correct)
      indexes = update_index_array(indexes, gap, correct.length - portion.length)
      working = remove_extra_syllables(working, indexes)
    when 30..50
      2.times do
        gap = get_unmatched_range(indexes, working.length - 1)
        portion = working.slice!(gap.begin, (gap.end - gap.begin + 1))
        # A stretch clearly longer than one pada is treated as a half verse.
        half = (gap.end - gap.begin + 1) > ((metercount[meter_name][4] / 4) + 3)
        size = half ? :half : :pada
        pattern = get_specific_pattern(meter_name, size, summary[:pattern_type], portion)
        correct = corrected_string(portion, pattern)
        working.insert(gap.begin, correct)
        indexes = update_index_array(indexes, gap, correct.length - portion.length)
        # A half-verse correction already accounts for both missing padas.
        break if half
      end
      working = remove_extra_syllables(working, indexes)
    else
      fuzzy_analysis(original_weights, guess_size, summary[:match_percent]).each do |fuzzy|
        extended_result << fuzzy if fuzzy[:meter] == meter_name
      end
      working = "x" * working.length
    end

    status = get_pada_status(meter_name, working, indexes, guess_size)
    pada_weights = get_weight_by_pada(status, working)
    # Lower-case characters in the working string mark edits.
    edit_count = working.scan(/[a-z]/).length

    extended_result << {
      len_assumption: guess_size.to_s + "/4",
      meter: meter_name,
      type: summary[:pattern_type],
      match_indexes: status,
      percent_match: summary[:match_percent],
      edit_count: edit_count,
      correct_weights: pada_weights,
      heuristic: (2 * summary[:match_percent]) + (100 - (edit_count * 100 / weight_string.length)),
    }
  end

  extended_result
end
369
+
370
# Splits the matched index ranges of a corrected weight string into padas and
# labels each pada.
#
# Padas 1..guess_size are carved, in order, out of the given ranges using the
# pada lengths from {#metercount}. A pada whose slice holds no lower-case edit
# marks is "exact"; otherwise it is "fuzzy" and its extent is found by counting
# characters other than "d". Padas after guess_size (up to 4) are "missing".
#
# @param meter [Object] key of the meter in {#metercount}
# @param correct_weights [String] corrected weight string
# @param indexes [Array<Range>] ranges of the weight string to distribute
# @param guess_size [Integer] the number of padas assumed to be present
# @return [Array<Hash>] hashes with :pada_number, :pada_range and :pada_status
def get_pada_status(meter, correct_weights, indexes, guess_size)
  pada_lengths = metercount[meter]
  weights = correct_weights.dup
  pending = indexes.dup

  status = []
  range = nil

  # TO DO : identify which padas are actually missing
  (1..guess_size).each do |pada_number|
    break if range.nil? && pending.empty?

    # Continue with the leftover of the previous range, or take the next one.
    range ||= pending.slice!(0, 1)[0]
    expected = pada_lengths[pada_number - 1]
    span = range.end - range.begin + 1

    if weights.slice(range.begin, span).scan(/[a-z]/).empty?
      pada_state = "exact"
      if span == expected
        pada_range = range
        range = nil
      else
        pada_range = (range.begin..(range.begin + expected - 1))
        range = ((range.begin + expected)..range.end)
      end
    else
      pada_state = "fuzzy"
      counted = 1
      cursor = range.begin
      # Advance until `expected` characters other than "d" have been passed.
      while counted <= expected
        counted += 1 if weights[cursor] != "d"
        cursor += 1
      end
      if cursor > range.end
        pada_range = range
        range = nil
      else
        pada_range = (range.begin..(cursor - 1))
        range = (cursor..range.end)
      end
    end

    status << {
      pada_number: pada_number,
      pada_range: pada_range,
      pada_status: pada_state,
    }
  end

  ((guess_size + 1)..4).each do |pada_number|
    status << {
      pada_number: pada_number,
      pada_range: nil,
      pada_status: "missing",
    }
  end

  status
end
438
+
439
# Cuts a corrected weight string into one piece per pada.
#
# Each non-missing pada receives everything from the end of the previous pada
# up to and including the end of its own :pada_range; missing padas receive
# an empty string.
#
# @param status [Array<Hash>] pada descriptions with :pada_status and :pada_range
# @param corrected_weights [String] corrected weight string
# @return [Array<String>] one weight string per entry in `status`
def get_weight_by_pada(status, corrected_weights)
  offset = 0
  status.map do |pada|
    next "" if pada[:pada_status] == "missing"

    last = pada[:pada_range].end
    chunk = corrected_weights.slice(offset, last - offset + 1)
    offset = last + 1
    chunk
  end
end
457
+
458
# Finds the meters whose pattern is closest to the whole weight string by edit
# distance and describes each as a candidate.
#
# Only candidates sharing the smallest edit distance are kept (that distance
# is capped at 100, so nothing is returned if every candidate is further away).
# The fuzzy search is run once and its results reused.
#
# @param weight_string [String] the weight string of the verse
# @param guess_size [Integer] the number of padas assumed to be present
# @param per_match [Numeric] exact-match percentage to record in the result
# @return [Array<Hash>] candidate descriptions including :meter,
#   :match_indexes, :correct_weights, :edit_count and :heuristic
def fuzzy_analysis(weight_string, guess_size, per_match)
  matches = meter_search_fuzzy(weight_string.dup, guess_size)
  fewest_edits = matches.map { |value| value[:edit_distance] }.push(100).min

  matches.select { |value| value[:edit_distance] == fewest_edits }.map do |value|
    corrected = corrected_string(weight_string, value[:pattern])
    # The whole corrected string is treated as one matched range.
    status = get_pada_status(value[:meter], corrected, [(0..(corrected.length - 1))], guess_size)
    pada_weights = get_weight_by_pada(status, corrected)

    {
      len_assumption: guess_size.to_s + "/4",
      meter: value[:meter],
      type: value[:type],
      match_indexes: status,
      percent_match: per_match,
      edit_count: value[:edit_distance],
      correct_weights: pada_weights,
      heuristic: (2 * per_match) + (100 - (value[:edit_distance] * 100 / weight_string.length)),
    }
  end
end
490
+
491
# Looks up the concrete weight pattern of a meter for a pada or half verse.
#
# For :patterns the first pada (or the first two padas joined) of the meter is
# returned as a fresh string. For :regexes the first regex registered for the
# meter at that size is resolved against the weight string with
# {#closest_pattern_to_regex}.
#
# NOTE(review): when no regex is registered for the meter, the value of the
# `each` call (the collection itself) is returned rather than a String;
# callers appear to expect a String -- confirm.
#
# @param meter_name [Object] the meter to look up
# @param size [Symbol] :pada, or anything else for a half verse (for :patterns)
# @param type [Symbol] :patterns or :regexes
# @param weight_string [String] the weights the pattern will be compared with
# @return [String, nil] the pattern; nil for an unknown type
def get_specific_pattern(meter_name, size, type, weight_string)
  case type
  when :patterns
    padas = MetricalData.meters[meter_name]
    return padas[0].dup if size == :pada

    padas[0].dup + padas[1].dup
  when :regexes
    MetricalData.all[type][size].each do |regex, meter|
      next unless meter_name == meter.keys.first

      source = regex.source.gsub(/[\^\$\(\)]/, "")
      return closest_pattern_to_regex(weight_string, source)[:pattern]
    end
  end
end
512
+
513
# Marks every position of a weight string that lies outside all of the given
# index ranges with "d".
#
# @param weights [String] a weight string (left unmodified)
# @param indexes [Array<Range>] ranges of positions to keep
# @return [String] a copy of `weights` with uncovered positions set to "d"
def remove_extra_syllables(weights, indexes)
  marked = weights.dup
  marked.length.times do |pos|
    covered = indexes.any? { |range| pos >= range.begin && pos <= range.end }
    marked[pos] = "d" unless covered
  end
  marked
end
528
+
529
# Finds the largest stretch of positions not covered by the matched ranges.
#
# The stretch before the first range and the one after the last range are
# compared first (the trailing one wins ties); a gap between consecutive
# ranges replaces that choice only when it is strictly larger.
#
# @param indexes [Array<Range>] matched ranges in ascending order (must not be empty)
# @param last [Integer] index of the last position in the weight string
# @return [Range] the largest unmatched range
def get_unmatched_range(indexes, last)
  leading = indexes[0].begin
  trailing = last - indexes[-1].end
  widest = leading > trailing ? (0..(leading - 1)) : ((indexes[-1].end + 1)..last)

  previous_end = 0
  indexes.each do |span|
    gap = span.begin - previous_end - 1
    widest = ((previous_end + 1)..(span.begin - 1)) if gap > (widest.end - widest.begin + 1)
    previous_end = span.end
  end

  widest
end
544
+
545
# Adds a corrected range to the list of matched ranges and shifts everything
# after it by the change in length caused by the correction.
#
# The caller's array is left untouched; a new array is returned.
#
# @param indexes [Array<Range>] matched ranges
# @param max [Range] the range that was corrected
# @param diff [Integer] length of the correction minus length of the original
# @return [Array<Range>] ranges sorted by start; ranges starting before `max`
#   are unchanged, a range starting where `max` starts becomes `max` with its
#   end moved by `diff`, and later ranges are shifted by `diff`
def update_index_array(indexes, max, diff)
  sorted = (indexes + [max]).sort_by(&:begin)

  sorted.map do |range|
    if range.begin < max.begin
      range
    elsif range.begin == max.begin
      (max.begin..(max.end + diff))
    else
      ((range.begin + diff)..(range.end + diff))
    end
  end
end
564
+
565
# Aligns a weight string with a pattern by edit distance and returns the
# weight string annotated with the edits needed to reach the pattern.
#
# Characters of the result:
#   the original character - position agrees with the pattern
#   "f" - substitution (the weight differs from the pattern)
#   "d" - deletion (an extra syllable in the weight string)
#   "l" / "g" - insertion of a missing syllable; "l" when the pattern has
#               "L" at that position, "g" otherwise
#
# Neither argument is modified, so frozen strings are accepted.
#
# @param weights [String] the actual weight string
# @param pattern [String] the pattern to align with
# @return [String] the annotated string; `weights` itself when the pattern
#   is empty
def corrected_string(weights, pattern)
  return weights if pattern.empty?

  actual = weights.chars
  target = pattern.chars
  rows = actual.length
  cols = target.length

  # table[i][j] = edit distance between the first i weights and the first j
  # pattern characters.
  table = Array.new(rows + 1) { Array.new(cols + 1, 0) }
  (0..rows).each { |i| table[i][0] = i }
  (0..cols).each { |j| table[0][j] = j }

  (1..rows).each do |i|
    (1..cols).each do |j|
      table[i][j] =
        if actual[i - 1] == target[j - 1]
          table[i - 1][j - 1]
        else
          [table[i - 1][j], table[i - 1][j - 1], table[i][j - 1]].min + 1
        end
    end
  end

  # Walk back from the bottom-right corner, collecting marks in reverse.
  marks = []
  i = rows
  j = cols
  while i > 0 || j > 0
    if i > 0 && j > 0 && actual[i - 1] == target[j - 1]
      marks << actual[i - 1]
      i -= 1
      j -= 1
    elsif i.zero?
      # Weights exhausted: the rest of the pattern must be inserted.
      marks << (target[j - 1] == "L" ? "l" : "g")
      j -= 1
    elsif j.zero?
      # Pattern exhausted: the remaining weights are extra.
      marks << "d"
      i -= 1
    else
      best = [table[i - 1][j], table[i - 1][j - 1], table[i][j - 1]].min
      if best == table[i][j - 1]
        marks << (target[j - 1] == "L" ? "l" : "g")
        j -= 1
      elsif best == table[i - 1][j - 1]
        marks << "f" # to mark substitution in string
        i -= 1
        j -= 1
      else
        marks << "d" # to mark deletion from string
        i -= 1
      end
    end
  end

  marks.reverse.join("")
end
626
+
627
# Memoized table of pada lengths per meter.
#
# For every meter in MetricalData.meters the value is the list of its pada
# lengths followed by their total. Meters known only through a full-verse
# regex are added when that regex has groups and no alternation: the value is
# then the length of each group's source followed by the length of the regex
# source without anchors and parentheses.
#
# @return [Hash] meter name => Array of Integer lengths, sorted by meter name
#   and deep-frozen
def metercount
  @metercount ||= begin
    counts = MetricalData.meters.each_with_object({}) do |(meter_name, pada_arr), acc|
      lengths = pada_arr.map(&:length)
      acc[meter_name] = lengths << lengths.reduce(&:+)
    end

    MetricalData.regexes.full.each do |regex, meters|
      meter_name = meters.keys.first
      next if counts.key?(meter_name)

      source = regex.source
      next if source["|"] || source["("].nil?

      group_lengths = source.scan(/\(([^()]*)\)/).flatten.map(&:length)
      counts[meter_name] = group_lengths << source.gsub(/[\^\$\(\)]/, "").length
    end

    counts.sort.to_h.deep_freeze
  end
end
652
+
653
# Renders the syllables of a verse pada by pada, annotated with the edits
# recorded in the corrected weight strings.
#
# Per character of a corrected weight string:
#   "d"      - the next syllable is wrapped in [ ] (to be deleted)
#   "f"      - the next syllable is wrapped in ( ) (wrong weight)
#   "g", "l" - a missing syllable; the first one opens a " { " group, and
#              "(g)" / "(l)" is emitted without consuming a syllable
#   other    - the next syllable is emitted as is; an open group is closed
#              with " } " first when the weight is "L" after a "g" opening or
#              "G" after an "l" opening
# The open-group state carries over from one pada to the next.
#
# @param _meter [Object] unused
# @param corrected_weights [Array<String>] corrected weight string per pada
# @param syllables [Array<String>] the syllables of the verse
# @return [Array<String>] one annotated string per pada
def fuzzy_correction(_meter, corrected_weights, syllables)
  cursor = 0 # index of the next unconsumed syllable
  open_group = 0 # 0: none open, 1: opened by "l", 2: opened by "g"

  corrected_weights.map do |pada|
    pieces = []
    pada.each_char do |mark|
      case mark
      when "d"
        pieces << ("[" + syllables[cursor] + "]")
        cursor += 1
      when "f"
        pieces << ("(" + syllables[cursor] + ")")
        cursor += 1
      when "g"
        if open_group.zero?
          pieces << " { (g)"
          open_group = 2
        else
          pieces << "(g)"
        end
      when "l"
        if open_group.zero?
          pieces << " { (l)"
          open_group = 1
        else
          pieces << "(l)"
        end
      else
        closes = (open_group == 2 && mark == "L") || (open_group == 1 && mark == "G")
        if closes
          pieces << " } " + syllables[cursor]
          open_group = 0
        else
          pieces << syllables[cursor]
        end
        cursor += 1
      end
    end
    pieces.join("")
  end
end
716
+
717
# Searches all full-verse patterns and regexes for meters that are close to
# the weight string by Levenshtein distance.
#
# Each pattern is first cut down to the assumed number of padas. A candidate
# is kept when its length is within 20% of the weight string's length and the
# edit distance is at most 15% of the pattern length. Regexes containing an
# alternation are skipped.
#
# @param weight_string [String] the weight string of the verse
# @param guess_size [Integer] the number of padas assumed to be present
# @return [Array<Hash>] hashes with :meter, :type, :guess_size, :pattern and
#   :edit_distance
def meter_search_fuzzy(weight_string, guess_size)
  syllable_count = weight_string.length
  length_variance = 0.2
  edit_tolerance = 0.15
  matcher = Amatch::Levenshtein.new(weight_string)

  %i[patterns regexes].flat_map do |type|
    MetricalData.all[type][:full].each_with_object([]) do |(p, meter), acc|
      meter_name = meter.keys.first

      case p
      when String
        remaining = p.dup
        lengths = metercount[meter_name]
        # Take the first guess_size padas off the front of the full pattern.
        pattern = (0...guess_size).map { |i| remaining.slice!(0, lengths[i]) }.join("")

        next if (pattern.length - syllable_count).abs > length_variance * pattern.length

        edit_distance = matcher.match(pattern)
        next if edit_distance > edit_tolerance * pattern.length

        pattern_string = pattern
      when Regexp
        next if p.source["|"]

        source = p.source.gsub(/[\^\$\(\)]/, "")
        pattern = source.slice(0...(guess_size * source.length / 4))
        next if (pattern.length - syllable_count).abs > length_variance * pattern.length

        closest = closest_pattern_to_regex(weight_string, pattern)
        pattern_string = closest[:pattern]
        edit_distance = closest[:edit_distance]
        next if edit_distance > edit_tolerance * pattern.length
      end

      acc << {
        meter: meter_name,
        type: type,
        guess_size: guess_size,
        pattern: pattern_string,
        edit_distance: edit_distance,
      }
    end
  end
end
767
+
768
# Resolves a regex-style pattern (where "." stands for either weight) to the
# concrete pattern closest to the weight string.
#
# The wildcards are first read as "L" to obtain an alignment and an edit
# distance. Walking the alignment, a substitution that falls on a wildcard is
# not counted as an edit: the weight string's own character is used there and
# the edit distance is reduced by one.
#
# @param weight_string [String] the actual weight string
# @param pattern [String] pattern source made of "L", "G" and "."
# @return [Hash] :pattern (the resolved String) and :edit_distance
def closest_pattern_to_regex(weight_string, pattern)
  concrete = pattern.tr(".", "L")
  edit_distance = Amatch::Levenshtein.new(weight_string).match(concrete)
  corrected = corrected_string(weight_string, concrete)

  pattern = pattern.lstrip
  pattern_pos = 0 # position in pattern
  weight_pos = 0 # position in weight string
  resolved = []

  corrected.each_char do |mark|
    case mark
    when "L", "G"
      resolved << mark
      pattern_pos += 1
      weight_pos += 1
    when "l"
      resolved << "L"
      pattern_pos += 1
    when "g"
      resolved << "G"
      pattern_pos += 1
    when "d"
      weight_pos += 1
    else
      if mark == "f" && pattern[pattern_pos] == "."
        # Wildcard position: accept the actual weight, it is not an edit.
        edit_distance -= 1
        resolved << weight_string[weight_pos]
      else
        resolved << pattern[pattern_pos]
      end
      pattern_pos += 1
      weight_pos += 1
    end
  end

  {
    pattern: resolved.join(""),
    edit_distance: edit_distance,
  }
end
815
+ end
816
+ end