dphil 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/CODE_OF_CONDUCT.md +49 -0
  3. data/Gemfile +6 -0
  4. data/LICENSE +201 -0
  5. data/README.md +54 -0
  6. data/Rakefile +11 -0
  7. data/dphil.gemspec +49 -0
  8. data/exe/dphil +10 -0
  9. data/lib/dphil.rb +53 -0
  10. data/lib/dphil/cache.rb +15 -0
  11. data/lib/dphil/change_list.rb +6 -0
  12. data/lib/dphil/character.rb +236 -0
  13. data/lib/dphil/character_matrix.rb +102 -0
  14. data/lib/dphil/cli.rb +26 -0
  15. data/lib/dphil/cli_commands/csv2ld.rb +71 -0
  16. data/lib/dphil/cli_commands/csv2nex.rb +37 -0
  17. data/lib/dphil/constants.rb +128 -0
  18. data/lib/dphil/converter.rb +58 -0
  19. data/lib/dphil/converters/csv2nex.rb +83 -0
  20. data/lib/dphil/ld_data_set.rb +25 -0
  21. data/lib/dphil/ld_output.rb +29 -0
  22. data/lib/dphil/lemma.rb +44 -0
  23. data/lib/dphil/lemma_list.rb +179 -0
  24. data/lib/dphil/log_formatter.rb +39 -0
  25. data/lib/dphil/logger.rb +27 -0
  26. data/lib/dphil/metrical_data.rb +78 -0
  27. data/lib/dphil/newick.rb +52 -0
  28. data/lib/dphil/paup.rb +34 -0
  29. data/lib/dphil/refinements.rb +8 -0
  30. data/lib/dphil/refinements/natural_sort.rb +52 -0
  31. data/lib/dphil/script_string.rb +124 -0
  32. data/lib/dphil/syllables.rb +43 -0
  33. data/lib/dphil/syllables/syllable.rb +45 -0
  34. data/lib/dphil/tei_xml.rb +142 -0
  35. data/lib/dphil/transliterate.rb +131 -0
  36. data/lib/dphil/tree.rb +142 -0
  37. data/lib/dphil/tree_node.rb +67 -0
  38. data/lib/dphil/verse.rb +25 -0
  39. data/lib/dphil/verse_analysis.rb +509 -0
  40. data/lib/dphil/verse_analysis_new.rb +816 -0
  41. data/lib/dphil/version.rb +30 -0
  42. data/vendor/default_commands.paup +18 -0
  43. data/vendor/metrical_data.yml +4035 -0
  44. metadata +409 -0
@@ -0,0 +1,816 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "amatch"
4
+
5
+ module Dphil
6
+ module VerseAnalysis
7
+ using ::Ragabash::Refinements
8
+
9
+ module_function
10
+
11
# Splits a verse string into its syllables.
#
# Pipes, periods, commas, backslashes and digits are stripped and runs of
# whitespace are collapsed before the text is transliterated to SLP1 and
# scanned with Constants::R_SYL.
#
# @param verse_string [String] the raw text of the verse.
# @param from [Symbol] source transliteration scheme (detected when nil,
#   falling back to Transliterate.default_script)
# @param to [Symbol] output transliteration scheme (defaults to `from`)
# @return [Array] the syllables, expressed in the `to` scheme.
def syllables(verse_string, from: nil, to: nil)
  cleaned = verse_string.to_str.gsub(/[\|\.\,\\0-9]+/, "")
  cleaned = cleaned.gsub(/\s+/, " ").strip
  from ||= Transliterate.detect(cleaned) || Transliterate.default_script
  to ||= from

  slp1_text = Transliterate.transliterate(cleaned, from, :slp1)
  parts = slp1_text.scan(Constants::R_SYL)
  return parts if to == :slp1

  parts.map! { |syl| Transliterate.transliterate(syl, :slp1, to) }
end
26
+
27
# Maps a list of syllables onto a string of metrical weights.
#
# "G" marks a syllable matching Constants::R_GSYL, "g" a syllable that is
# heavy only by context (it matches Constants::R_CCONF, or its last character
# plus the first character of the next syllable matches Constants::R_CCON),
# and "L" everything else. Apostrophes and surrounding whitespace are ignored.
#
# @param syllables [Array] a set of syllables
# @param from [Symbol] transliteration scheme of the syllables (detected when nil)
# @param contextual [Boolean] keep the lowercase "g" markers when true;
#   otherwise the result is upcased
# @return [String] the weight string of the syllables of the verse
def syllables_weights(syllables, from: nil, contextual: false)
  from ||= Transliterate.detect(syllables.join("")) || Transliterate.default_script
  syllables = syllables.to_ary.map { |syl| Transliterate.transliterate(syl, from, :slp1) } if from != :slp1

  marks = Array.new(syllables.length) do |idx|
    current = syllables[idx].delete("'").strip
    following = syllables[idx + 1]&.delete("'")&.strip
    # Last character of this syllable joined with the first of the next one.
    boundary = "#{current[-1]}#{following&.slice(0)}"

    if current.match?(Constants::R_GSYL)
      "G"
    elsif current.match?(Constants::R_CCONF) || boundary.match?(Constants::R_CCON)
      "g"
    else
      "L"
    end
  end

  weights = marks.join("")
  contextual ? weights : weights.upcase
end
53
+
54
# Shortcut that syllabifies a verse and returns its weight string.
#
# @param verse_string [String] the raw text of the verse
# @param contextual [Boolean] passed through to `syllables_weights`
# @return [String] the weight string of the verse.
def verse_weights(verse_string, contextual: false)
  verse_syllables = syllables(verse_string)
  syllables_weights(verse_syllables, contextual: contextual)
end
61
+
62
# Analyses a verse and reports its syllables, weights and best meter match.
#
# The best candidate from `identify_meter_manager` supplies :status, :meter
# and :padas; all three are nil when no candidate is found. The :meter entry
# is removed from the candidate's info hash before that hash is returned as
# :status.
#
# @param verse_string [String] the raw text of the verse
# @return [Hash] :verse, :syllables, :weights, :status, :meter and :padas
def identify(verse_string)
  v_syllables = syllables(verse_string)
  v_weight = syllables_weights(v_syllables, contextual: true)
  candidates = identify_meter_manager(verse_string)

  top = candidates.first unless candidates.empty?
  status = top[:info] if top
  meter = status.delete(:meter) if top
  padas = top[:corrected_padas] if top

  {
    verse: verse_string,
    syllables: v_syllables,
    weights: v_weight,
    status: status,
    meter: meter,
    padas: padas,
  }
end
80
+
81
# Coordinates metrical identification for a verse string.
#
# Exact/regex searches are tried for 4, 3, 2 and then 1 assumed padas; the
# hits for each guess are compacted per meter and handed to `fuzzy_manager`.
# When none of the guesses yields a candidate, a purely fuzzy search over
# the whole verse is used instead. Candidates are ranked by descending
# :heuristic and the best two are returned with their corrected padas.
#
# @param verse_string [String] a verse string
# @return [Array] candidate meters and information about their matches
def identify_meter_manager(verse_string)
  verse_syllables = syllables(verse_string)
  weight_string = syllables_weights(verse_syllables)

  candidates = []
  4.downto(1) do |guess_size|
    # TODO: Pre-process or somehow change this so that search is aware of
    # how weight_string may or may not break across padas
    # (i.e. whitespace in syllables)
    search_results = meter_search_partial(weight_string, guess_size)
    next if search_results.empty?

    # Filter down to most-complete matches for each meter.
    # NOTE: an earlier `sort_by { ... }.to_h` on this hash discarded its
    # result, so the meters have always been processed in grouping order;
    # that order is kept here.
    grouped = search_results.group_by { |result| result[:meter_name] }
    meter_results = compact_meter_results(grouped)

    candidates.concat(fuzzy_manager(meter_results, guess_size, weight_string))
  end

  candidates.concat(fuzzy_analysis(weight_string, 4, 0)) if candidates.empty?

  # Highest heuristic first.
  candidates.sort_by! { |value| value[:heuristic] }
  candidates.reverse!
  get_best_matches(candidates, verse_syllables, 2)
end
117
+
118
# Takes the leading candidates and pairs each with its corrected padas.
#
# @param candidates [Array<Hash>] candidates, best first; each provides
#   :meter and :correct_weights
# @param syllables [Array] syllables of the verse
# @param number [Integer] maximum number of candidates to keep
# @return [Array<Hash>] entries with :info (the candidate itself) and
#   :corrected_padas (the result of `fuzzy_correction`)
def get_best_matches(candidates, syllables, number)
  ranked = candidates.each.with_index(1)
  leading = ranked.take_while { |_candidate, rank| rank <= number }
  leading.map do |candidate, _rank|
    {
      info: candidate,
      corrected_padas: fuzzy_correction(candidate[:meter], candidate[:correct_weights], syllables),
    }
  end
end
137
+
138
# Searches for meter candidates for a given weight string and search size.
#
# Full-verse patterns are only consulted when four padas are assumed, half
# patterns when at least two are assumed, and pada patterns always. For each
# size both the :patterns and the :regexes tables of MetricalData are tried.
#
# @param weight_string [String] a weight string
# @param guess_size [Integer] the number of padas to match against
# @return [Array] candidate meters and associated match data
def meter_search_partial(weight_string, guess_size)
  sizes = %i[pada]
  sizes.unshift(:half) if guess_size >= 2
  sizes.unshift(:full) if guess_size == 4

  sizes.product(%i[patterns regexes]).each_with_object([]) do |(pattern_size, pattern_type), found|
    MetricalData.all[pattern_type][pattern_size].each do |pattern, meter|
      next unless useful_comparison?(weight_string, pattern, pattern_size, guess_size)

      matches = find_pattern(weight_string, pattern)
      next if matches.empty?

      covered = matches.map(&:size).sum
      found << {
        meter_name: meter.each_key.first,
        type: pattern_type,
        size: pattern_size,
        scope: meter.each_value.first,
        pattern: pattern,
        matches: matches,
        coverage: covered.to_f / weight_string.length,
        guess_size: guess_size,
      }
    end
  end
end
173
+
174
# Decides whether matching a pattern against a weight string is worthwhile,
# judging only by how far the weight string's length is from the length the
# pattern implies for the assumed number of padas.
#
# @param weight_string [String] a weight string
# @param pattern [String, Regexp] a match pattern
# @param pattern_size [Symbol] the size of a match pattern
# @param guess_size [Integer] the number of padas being searched for
# @param tolerance [Numeric] the tolerance percentage of length difference
#
# @return [Boolean] true if comparison has good chance of being useful
def useful_comparison?(weight_string, pattern, pattern_size, guess_size, tolerance = 0.2)
  pattern = clean_regexp_pattern(pattern) if pattern.is_a?(Regexp)
  multiplier = pattern_size_multiplier(pattern_size)

  # Length a verse of `guess_size` padas would have under this pattern.
  expected_length = guess_size * multiplier * pattern.length / 4
  allowed = tolerance * multiplier * pattern.length

  (weight_string.length - expected_length).abs <= allowed
end
191
+
192
# Finds all non-overlapping occurrences of a match pattern in a weight
# string, scanning left to right.
#
# Empty String patterns and zero-length Regexp matches are ignored; they
# would otherwise never advance the scan position.
#
# @param weight_string [String] a weight string
# @param pattern [String, Regexp] a match pattern
# @return [Array] array of index-ranges of pattern matches within weight
#   string (empty for any other pattern type)
def find_pattern(weight_string, pattern)
  indexes = []
  cursor = 0

  case pattern
  when String
    return indexes if pattern.empty?

    while (start = weight_string.index(pattern, cursor))
      cursor = start + pattern.length
      indexes << (start..(cursor - 1))
    end
  when Regexp
    while (match = pattern.match(weight_string, cursor))
      start = match.begin(0)
      length = match[0].length
      if length.zero?
        # Step over the empty match so the loop always makes progress.
        cursor = start + 1
        next
      end
      cursor = start + length
      indexes << (start..(cursor - 1))
    end
  end

  indexes
end
217
+
218
# Gives the source of a Regexp with grouping, anchor and alternation
# characters (parentheses, "^", "$" and "|") removed.
#
# @param regexp [Regexp] a regular expression
# @return [String] a clean string of the pattern
def clean_regexp_pattern(regexp)
  regexp.source.gsub(/[\(\)\^\$\|]+/, "")
end
227
+
228
# Looks up how many patterns of the given size make up a full verse.
#
# @param pattern_size [Symbol] a pattern size symbol (:full, :half or :pada)
# @return [Integer, nil] a multiplier, or nil for an unknown size
def pattern_size_multiplier(pattern_size)
  { full: 1, half: 2, pada: 4 }[pattern_size]
end
242
+
243
# Checks that a range shares no position with any range in an array.
#
# Two ranges overlap when each one begins no later than the other ends, so
# a range that fully encloses an existing one is detected as well (testing
# only the new range's endpoints misses that case).
#
# @param indexes [Array<Range>] ranges already accepted
# @param range [Range] the range to test
# @return [Boolean] true when `range` overlaps none of `indexes`
def check_non_overlapping?(indexes, range)
  indexes.none? do |existing|
    existing.begin <= range.end && range.begin <= existing.end
  end
end
256
+
257
# Collapses the search results of each meter into a single summary made of
# the non-overlapping match ranges, taken in result order.
#
# Every accepted range adds to the meter's :match_percent according to the
# size of the pattern that produced it: 100 for :full, 50 for :half and 25
# for :pada.
#
# @param meter_results [Hash] a hash of search results grouped by meter
# @return [Hash] a compacted hash of search results grouped by meter; each
#   value is a one-element Array holding :pattern_type, :matches (sorted by
#   start position) and :match_percent
def compact_meter_results(meter_results)
  size_scores = { full: 100, half: 50, pada: 25 }

  meter_results.each_with_object({}) do |(meter_name, results), compacted|
    accepted = []
    score = 0

    results.each do |result|
      result[:matches].each do |match_range|
        next unless check_non_overlapping?(accepted, match_range)

        accepted << match_range
        score += size_scores.fetch(result[:size], 0)
      end
    end

    summary = {
      pattern_type: results[0][:type],
      matches: accepted.sort_by(&:begin),
      match_percent: score,
    }
    compacted[meter_name] = [summary]
  end
end
291
+
292
+ # Builds a scored candidate entry for each compacted meter result, fuzzy-correcting the unmatched stretch(es) of the weight string when the exact matches cover only part of it.
293
+ # @param meter_results [Hash] meter name => one-element Array whose Hash carries :pattern_type, :matches and :match_percent (the shape compact_meter_results builds)
294
+ # @param guess_size [Integer] number of padas assumed to be present; @param weight_string [String] weight string of the verse (only copies of it are edited)
295
+ # @return [Array<Hash>] entries with :len_assumption, :meter, :type, :match_indexes, :percent_match, :edit_count, :correct_weights and :heuristic, plus any fuzzy_analysis entries for the same meter
296
+ def fuzzy_manager(meter_results, guess_size, weight_string)
297
+ w1 = weight_string.dup
298
+ extended_result = []
299
+ e = 0 # set to 1 once a 100% match has been handled, which stops the loop before the next meter
300
+
301
+ meter_results.each do |key, val|
302
+ break if e == 1
303
+ wc = w1.dup
304
+ indexes = val[0][:matches]
305
+ q = (val[0][:match_percent] * 100) / (guess_size * 25) # match_percent as a percentage of the guess_size * 25 expected for this guess
306
+
307
+ q += 25 if q == 50 && guess_size == 2 # to deal with problematic case when p=25 and guess=2, should be only one correction
308
+
309
+ case q
310
+ when 100
311
+ wc = remove_extra_syllables(wc, indexes)
312
+ e = 1
313
+ when 60..75
314
+ max = get_unmatched_range(indexes, wc.length - 1)
315
+ portion = wc.slice!(max.begin, (max.end - max.begin + 1)) # cut the chosen unmatched stretch out of the copy
316
+ pattern = get_specific_pattern(key, :pada, val[0][:pattern_type], portion)
317
+ correct = corrected_string(portion, pattern)
318
+ wc.insert(max.begin, correct)
319
+ indexes = update_index_array(indexes, max, correct.length - portion.length)
320
+ wc = remove_extra_syllables(wc, indexes)
321
+
322
+ when 30..50
323
+ flag = 0 # set to 1 when a half pattern was used, so the second pass is skipped
324
+ 2.times do
325
+ next if flag == 1
326
+ max = get_unmatched_range(indexes, wc.length - 1)
327
+ portion = wc.slice!(max.begin, (max.end - max.begin + 1))
328
+ if (max.end - max.begin + 1) > ((metercount[key][4] / 4) + 3)
329
+ pattern = get_specific_pattern(key, :half, val[0][:pattern_type], portion)
330
+ flag = 1
331
+ else
332
+ pattern = get_specific_pattern(key, :pada, val[0][:pattern_type], portion)
333
+ end
334
+ correct = corrected_string(portion, pattern)
335
+ wc.insert(max.begin, correct)
336
+ indexes = update_index_array(indexes, max, correct.length - portion.length)
337
+ end
338
+ wc = remove_extra_syllables(wc, indexes)
339
+ else
340
+ best_fuzzy = fuzzy_analysis(w1, guess_size, val[0][:match_percent])
341
+ best_fuzzy.each do |v|
342
+ if v[:meter] == key
343
+ extended_result << v
344
+ next
345
+ end
346
+ end
347
+ (0...wc.length).each do |k|
348
+ wc[k] = "x" # all-"x" copy: every position then counts as an edit in edit_count/heuristic below
349
+ end
350
+ end
351
+
352
+ status = get_pada_status(key, wc, indexes, guess_size)
353
+ pada_weights = get_weight_by_pada(status, wc)
354
+
355
+ acc = {
356
+ len_assumption: guess_size.to_s + "/4",
357
+ meter: key,
358
+ type: val[0][:pattern_type],
359
+ match_indexes: status,
360
+ percent_match: val[0][:match_percent],
361
+ edit_count: wc.scan(/[a-z]/).length,
362
+ correct_weights: pada_weights,
363
+ heuristic: (2 * val[0][:match_percent]) + ((100 - (wc.scan(/[a-z]/).length * 100 / weight_string.length))),
364
+ }
365
+ extended_result << acc
366
+ end
367
+ extended_result
368
+ end
369
+
370
+ # Splits the index ranges of a corrected weight string into per-pada ranges and labels each pada "exact", "fuzzy" or "missing".
371
+ # @param meter [Object] key into metercount; @param correct_weights [String] corrected weight string in which lowercase letters mark edits
372
+ # @param indexes [Array<Range>] ranges to split, consumed in order (a copy is used); @param guess_size [Integer] number of padas assumed to be present
373
+ # @return [Array<Hash>] entries with :pada_number, :pada_range and :pada_status; padas after guess_size (up to 4) are appended as "missing" with a nil range
374
+ def get_pada_status(meter, correct_weights, indexes, guess_size)
375
+ len = metercount[meter] # pada lengths for the meter, followed by their total
376
+ cw = correct_weights.dup
377
+ index = indexes.dup
378
+
379
+ status = []
380
+ range = nil # remainder of the index range still to be split across the following padas
381
+
382
+ pn = -1
383
+ pr = nil
384
+ ps = ""
385
+
386
+ # TO DO : identify which padas are actually missing
387
+ (1..guess_size).each do |i|
388
+ break if range.nil? && index.empty?
389
+ pn = i
390
+ range = index.slice!(0, 1)[0] if range.nil?
391
+
392
+ if cw.slice(range.begin, range.end - range.begin + 1).scan(/[a-z]/).empty?
393
+ ps = "exact"
394
+ if (range.end - range.begin + 1) == len[i - 1]
395
+ pr = range
396
+ range = nil
397
+ else
398
+ pr = (range.begin..(range.begin + len[i - 1] - 1))
399
+ range = ((range.begin + len[i - 1])..range.end)
400
+ end
401
+ else
402
+ ps = "fuzzy"
403
+ temp = 1
404
+ rng = range.begin
405
+ while temp <= len[i - 1]
406
+ temp += 1 if cw[rng] != "d" # positions marked "d" do not count towards the pada length
407
+ rng += 1
408
+ end
409
+ if rng > range.end
410
+ pr = range
411
+ range = nil
412
+ else
413
+ pr = (range.begin..(rng - 1))
414
+ range = (rng..range.end)
415
+ end
416
+ end
417
+ acc = {
418
+ pada_number: pn,
419
+ pada_range: pr,
420
+ pada_status: ps,
421
+ }
422
+ status << acc
423
+ end
424
+
425
+ ((guess_size + 1)..4).each do |j|
426
+ pn = j
427
+ pr = nil
428
+ ps = "missing"
429
+ acc = {
430
+ pada_number: pn,
431
+ pada_range: pr,
432
+ pada_status: ps,
433
+ }
434
+ status << acc
435
+ end
436
+ status
437
+ end
438
+
439
# Cuts a corrected weight string into one piece per pada.
#
# Each non-missing pada takes everything from the end of the previous pada
# up to the end of its own :pada_range; missing padas yield an empty string.
#
# @param status [Array<Hash>] pada entries carrying :pada_status and :pada_range
# @param corrected_weights [String] the corrected weight string
# @return [Array<String>] the weights of each pada, in order
def get_weight_by_pada(status, corrected_weights)
  weights = corrected_weights.dup
  offset = 0

  status.map do |pada|
    next "" if pada[:pada_status] == "missing"

    last = pada[:pada_range].end
    piece = weights.slice(offset, last - offset + 1)
    offset = last + 1
    piece
  end
end
457
+
458
# Finds the meters closest to a weight string by edit distance and describes
# how the weight string would have to be corrected to fit each of them.
#
# Only the candidates sharing the smallest edit distance are kept (distances
# above 100 are never selected). The fuzzy search is run once and its result
# reused for both finding the minimum and building the entries.
#
# @param weight_string [String] weight string of the verse
# @param guess_size [Integer] number of padas assumed to be present
# @param per_match [Numeric] exact-match percentage to report and to weigh
#   into the heuristic
# @return [Array<Hash>] entries with :len_assumption, :meter, :type,
#   :match_indexes, :percent_match, :edit_count, :correct_weights and
#   :heuristic
def fuzzy_analysis(weight_string, guess_size, per_match)
  matches = meter_search_fuzzy(weight_string.dup, guess_size)
  fewest_edits = matches.map { |match| match[:edit_distance] }.push(100).min

  matches.select { |match| match[:edit_distance] == fewest_edits }.map do |match|
    corrected = corrected_string(weight_string, match[:pattern])
    status = get_pada_status(match[:meter], corrected, [(0..(corrected.length - 1))], guess_size)

    {
      len_assumption: "#{guess_size}/4",
      meter: match[:meter],
      type: match[:type],
      match_indexes: status,
      percent_match: per_match,
      edit_count: match[:edit_distance],
      correct_weights: get_weight_by_pada(status, corrected),
      heuristic: (2 * per_match) + (100 - (match[:edit_distance] * 100 / weight_string.length)),
    }
  end
end
490
+
491
# Fetches the concrete weight pattern of a meter for a pada or a half verse.
#
# For :patterns the first pada (or the first two padas joined, for any size
# other than :pada) is copied from MetricalData.meters. For :regexes the
# first regex registered for the meter at that size is stripped of anchors
# and parentheses and resolved against the weight string with
# `closest_pattern_to_regex`.
#
# @param meter_name [Object] name of the meter
# @param size [Symbol] :pada or :half
# @param type [Symbol] :patterns or :regexes
# @param weight_string [String] the stretch of weights the pattern is for
# @return [String, Object, nil] the pattern; nil for any other type. When no
#   regex is registered for the meter, the value of the `each` call over the
#   regex table is returned unchanged.
def get_specific_pattern(meter_name, size, type, weight_string)
  if type == :patterns
    padas = MetricalData.meters[meter_name]
    size == :pada ? padas[0].dup : padas[0].dup + padas[1].dup
  elsif type == :regexes
    MetricalData.all[type][size].each do |regex, meter|
      next unless meter_name == meter.keys.first

      stripped = regex.source.gsub(/[\^\$\(\)]/, "")
      return closest_pattern_to_regex(weight_string, stripped)[:pattern]
    end
  end
end
512
+
513
# Marks every position not covered by one of the given ranges with "d".
#
# @param weights [String] a weight string (left untouched)
# @param indexes [Array<Range>] ranges of positions to keep
# @return [String] a copy of the weights with all other positions set to "d"
def remove_extra_syllables(weights, indexes)
  marked = weights.dup
  marked.length.times do |pos|
    kept = indexes.any? { |span| pos >= span.begin && pos <= span.end }
    marked[pos] = "d" unless kept
  end
  marked
end
528
+
529
# Locates the widest stretch of positions not covered by the given ranges.
#
# The stretch before the first range and the one after the last range are
# compared first (the trailing one wins ties); a gap between ranges replaces
# the current choice only when it is strictly wider.
#
# @param indexes [Array<Range>] matched ranges, in ascending order, non-empty
# @param last [Integer] the last valid position
# @return [Range] the widest unmatched stretch
def get_unmatched_range(indexes, last)
  leading = indexes[0].begin
  trailing = last - indexes[-1].end

  widest = if leading > trailing
             0..(leading - 1)
           else
             (indexes[-1].end + 1)..last
           end

  previous_end = 0
  indexes.each do |matched|
    gap = matched.begin - previous_end - 1
    widest = (previous_end + 1)..(matched.begin - 1) if gap > (widest.end - widest.begin + 1)
    previous_end = matched.end
  end
  widest
end
544
+
545
# Adds a corrected range to a list of ranges and shifts every later range by
# the change in length the correction caused.
#
# The caller's array is left untouched; a new array is returned.
#
# @param indexes [Array<Range>] existing ranges
# @param max [Range] the range that was corrected
# @param diff [Integer] how much longer (or shorter, if negative) the
#   corrected stretch is than the original one
# @return [Array<Range>] ranges sorted by start: those starting before `max`
#   unchanged, `max` with its end moved by `diff`, later ones shifted by `diff`
def update_index_array(indexes, max, diff)
  (indexes + [max]).sort_by(&:begin).map do |range|
    if range.begin < max.begin
      range
    elsif range.begin == max.begin
      max.begin..(max.end + diff)
    else
      (range.begin + diff)..(range.end + diff)
    end
  end
end
564
+
565
# Aligns a weight string with a pattern and returns the edit script that
# turns the weights into the pattern.
#
# The alignment is a standard edit-distance table, traced back from the end.
# In the result an uppercase letter is a weight that already agrees with the
# pattern, "f" a weight that has to be substituted, "d" a weight that has to
# be deleted, and "l"/"g" a light/heavy syllable that has to be inserted.
#
# Neither argument is modified, so frozen strings are accepted.
#
# @param weights [String] the actual weight string
# @param pattern [String] the pattern to align against
# @return [String] the edit script; `weights` itself when the pattern is empty
def corrected_string(weights, pattern)
  return weights if pattern.empty?

  # Index 0 of both sequences is padding, so table[i][j] compares the first
  # i weights with the first j pattern characters.
  actual = [" "] + weights.chars
  target = [" "] + pattern.chars
  rows = actual.length
  cols = target.length

  table = Array.new(rows) { Array.new(cols) }
  rows.times { |i| table[i][0] = i }
  cols.times { |j| table[0][j] = j }

  (1...rows).each do |i|
    (1...cols).each do |j|
      table[i][j] =
        if actual[i] == target[j]
          table[i - 1][j - 1]
        else
          [table[i - 1][j], table[i - 1][j - 1], table[i][j - 1]].min + 1
        end
    end
  end

  correct = []
  i = rows - 1
  j = cols - 1
  while i > 0 || j > 0
    if i > 0 && j > 0 && actual[i] == target[j]
      correct.unshift(actual[i])
      i -= 1
      j -= 1
      next
    end

    step =
      if i.zero?
        :insert # upper boundary: only pattern characters are left
      elsif j.zero?
        :delete # left boundary: only surplus weights are left
      else
        # Ties are resolved as insertion, then substitution, then deletion.
        cheapest = [table[i - 1][j], table[i - 1][j - 1], table[i][j - 1]].min
        if cheapest == table[i][j - 1]
          :insert
        elsif cheapest == table[i - 1][j - 1]
          :substitute
        else
          :delete
        end
      end

    case step
    when :insert
      correct.unshift(target[j] == "L" ? "l" : "g")
      j -= 1
    when :substitute
      correct.unshift("f") # to mark substitution in string
      i -= 1
      j -= 1
    when :delete
      correct.unshift("d") # to mark deletion from string
      i -= 1
    end
  end
  correct.join("")
end
626
+
627
# Memoized table of pada lengths per meter.
#
# For every meter in MetricalData.meters the value lists the length of each
# pada followed by the sum of those lengths. Meters known only through a
# full-verse regex are added from the regex source when it contains groups
# and no alternation: the length of each innermost group, followed by the
# length of the source without anchors and parentheses.
#
# @return [Hash] meter name => Array of lengths, sorted by key and deep-frozen
def metercount
  @metercount ||= begin
    counts = {}

    MetricalData.meters.each do |meter_name, pada_arr|
      lengths = pada_arr.map(&:length)
      counts[meter_name] = lengths << lengths.reduce(&:+)
    end

    MetricalData.regexes.full.each do |regex, meter|
      meter_name = meter.keys.first
      next if counts.key?(meter_name)

      source = regex.source
      next if source.include?("|") || !source.include?("(")

      group_lengths = source.scan(/\(([^()]*)\)/).flatten.map(&:length)
      total = source.gsub(/[\^\$\(\)]/, "").length
      counts[meter_name] = group_lengths << total
    end

    counts.sort.to_h.deep_freeze
  end
end
652
+
653
# Renders the syllables of a verse pada by pada, annotated with the edits
# recorded in the corrected weights.
#
# Deleted syllables ("d") are wrapped in square brackets and substituted
# ones ("f") in parentheses. Inserted weights ("g"/"l") consume no syllable:
# the first one opens a " { " group and prints "(g)" or "(l)", later ones
# only print their marker. A group opened by "g" is closed with " } " in
# front of the next syllable whose weight is "L"; a group opened by "l" is
# closed at the next "G". The open/closed state carries over between padas.
#
# @param _meter [Object] unused
# @param corrected_weights [Array<String>] edit scripts, one per pada
# @param syllables [Array<String>] the syllables of the verse
# @return [Array<String>] one annotated string per pada
def fuzzy_correction(_meter, corrected_weights, syllables)
  cursor = 0 # index of the next syllable to print
  group = 0  # 0: no open group, 1: opened by "l", 2: opened by "g"

  corrected_weights.map do |pada_weights|
    pieces = []
    pada_weights.each_char do |mark|
      case mark
      when "d"
        pieces << ("[" + syllables[cursor] + "]")
        cursor += 1
      when "f"
        pieces << ("(" + syllables[cursor] + ")")
        cursor += 1
      when "g", "l"
        if group.zero?
          pieces << " { (#{mark})"
          group = mark == "g" ? 2 : 1
        else
          pieces << "(#{mark})"
        end
      else
        closes_group = (group == 2 && mark == "L") || (group == 1 && mark == "G")
        if closes_group
          pieces << " } " + syllables[cursor]
          group = 0
        else
          pieces << syllables[cursor]
        end
        cursor += 1
      end
    end
    pieces.join("")
  end
end
716
+
717
+ # Searches the full-verse patterns and regexes for meters whose first guess_size padas lie within a small edit distance of the weight string.
718
+ # A candidate is skipped when its length differs from the weight string by more than 20% of the pattern length or when its edit distance exceeds 15% of it; regexes containing "|" are skipped too.
719
+ # @param weight_string [String] weight string to compare; @param guess_size [Integer] number of padas (out of 4) the weight string is assumed to cover
720
+ # @return [Array<Hash>] candidates with :meter, :type, :guess_size, :pattern and :edit_distance
721
+ def meter_search_fuzzy(weight_string, guess_size)
722
+ candidates = []
723
+ syllable_count = weight_string.length
724
+ length_variance = 0.2
725
+ edit_tolerance = 0.15
726
+ str = Amatch::Levenshtein.new(weight_string)
727
+
728
+ %i[patterns regexes].each do |type|
729
+ matches = MetricalData.all[type][:full].each_with_object([]) do |(p, meter), acc|
730
+ meter_name = meter.keys.first
731
+ case p
732
+ when String
733
+ pattern = ""
734
+ p2 = p.dup
735
+ l = metercount[meter_name]
736
+ (0...guess_size).each do |i|
737
+ pattern += p2.slice!(0, l[i]) # take the next pada's worth of characters off the front of the copy
738
+ end
739
+
740
+ next unless (pattern.length - syllable_count).abs <= length_variance * pattern.length
741
+ edit_distance = str.match(pattern)
742
+ next if edit_distance > edit_tolerance * pattern.length
743
+ pattern_string = pattern
744
+ when Regexp
745
+ next if p.source["|"]
746
+ p = p.source.gsub(/[\^\$\(\)]/, "")
747
+ pattern = p.slice(0...(guess_size * p.length / 4)) # leading guess_size quarters of the stripped source
748
+ next if (pattern.length - syllable_count).abs > length_variance * pattern.length
749
+ result = closest_pattern_to_regex(weight_string, pattern)
750
+ pattern_string = result[:pattern]
751
+ edit_distance = result[:edit_distance]
752
+
753
+ next if edit_distance > edit_tolerance * pattern.length
754
+ end
755
+ acc << {
756
+ meter: meter_name,
757
+ type: type,
758
+ guess_size: guess_size,
759
+ pattern: pattern_string,
760
+ edit_distance: edit_distance,
761
+ }
762
+ end
763
+ candidates.concat(matches)
764
+ end
765
+ candidates
766
+ end
767
+
768
+ # Resolves a pattern containing "." wildcards into a concrete weight pattern, letting a wildcard take the weight string's own value wherever the alignment substitutes at that position.
769
+ # The pattern is first aligned with every "." read as "L" (via corrected_string); substitutions that land on a wildcard are then removed from the edit distance.
770
+ # @param weight_string [String] the actual weights; @param pattern [String] pattern text in which "." is a wildcard (callers in this file pass regex sources stripped of anchors and parentheses)
771
+ # @return [Hash] :pattern (the resolved pattern string) and :edit_distance (the Amatch::Levenshtein result less one per wildcard substitution)
772
+ def closest_pattern_to_regex(weight_string, pattern)
773
+ pattern2 = pattern.tr(".", "L")
774
+ str = Amatch::Levenshtein.new(weight_string)
775
+
776
+ edit_distance = str.match(pattern2)
777
+ c = corrected_string(weight_string, pattern2)
778
+ # c = c.join("")
779
+
780
+ pattern_string = []
781
+ x1 = 0 # for pattern
782
+ xw = 0 # for weight string
783
+ pattern = pattern.lstrip
784
+ (0...c.length).each do |i|
785
+ if c[i] == "L" || c[i] == "G"
786
+ x1 += 1
787
+ pattern_string << c[i]
788
+ xw += 1
789
+ elsif c[i] == "l"
790
+ x1 += 1
791
+ pattern_string << "L"
792
+ elsif c[i] == "g"
793
+ x1 += 1
794
+ pattern_string << "G"
795
+ elsif c[i] == "d"
796
+ xw += 1
797
+ elsif c[i] == "f" && pattern[x1] == "."
798
+ x1 += 1
799
+ edit_distance -= 1
800
+ pattern_string << weight_string[xw]
801
+ xw += 1
802
+ else
803
+ pattern_string << pattern[x1]
804
+ x1 += 1
805
+ xw += 1
806
+ end
807
+ end
808
+ pattern_string = pattern_string.join("")
809
+ acc = {
810
+ pattern: pattern_string,
811
+ edit_distance: edit_distance,
812
+ }
813
+ acc
814
+ end
815
+ end
816
+ end