dphil 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +6 -0
- data/LICENSE +201 -0
- data/README.md +54 -0
- data/Rakefile +11 -0
- data/dphil.gemspec +49 -0
- data/exe/dphil +10 -0
- data/lib/dphil.rb +53 -0
- data/lib/dphil/cache.rb +15 -0
- data/lib/dphil/change_list.rb +6 -0
- data/lib/dphil/character.rb +236 -0
- data/lib/dphil/character_matrix.rb +102 -0
- data/lib/dphil/cli.rb +26 -0
- data/lib/dphil/cli_commands/csv2ld.rb +71 -0
- data/lib/dphil/cli_commands/csv2nex.rb +37 -0
- data/lib/dphil/constants.rb +128 -0
- data/lib/dphil/converter.rb +58 -0
- data/lib/dphil/converters/csv2nex.rb +83 -0
- data/lib/dphil/ld_data_set.rb +25 -0
- data/lib/dphil/ld_output.rb +29 -0
- data/lib/dphil/lemma.rb +44 -0
- data/lib/dphil/lemma_list.rb +179 -0
- data/lib/dphil/log_formatter.rb +39 -0
- data/lib/dphil/logger.rb +27 -0
- data/lib/dphil/metrical_data.rb +78 -0
- data/lib/dphil/newick.rb +52 -0
- data/lib/dphil/paup.rb +34 -0
- data/lib/dphil/refinements.rb +8 -0
- data/lib/dphil/refinements/natural_sort.rb +52 -0
- data/lib/dphil/script_string.rb +124 -0
- data/lib/dphil/syllables.rb +43 -0
- data/lib/dphil/syllables/syllable.rb +45 -0
- data/lib/dphil/tei_xml.rb +142 -0
- data/lib/dphil/transliterate.rb +131 -0
- data/lib/dphil/tree.rb +142 -0
- data/lib/dphil/tree_node.rb +67 -0
- data/lib/dphil/verse.rb +25 -0
- data/lib/dphil/verse_analysis.rb +509 -0
- data/lib/dphil/verse_analysis_new.rb +816 -0
- data/lib/dphil/version.rb +30 -0
- data/vendor/default_commands.paup +18 -0
- data/vendor/metrical_data.yml +4035 -0
- metadata +409 -0
@@ -0,0 +1,816 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "amatch"
|
4
|
+
|
5
|
+
module Dphil
|
6
|
+
module VerseAnalysis
|
7
|
+
using ::Ragabash::Refinements
|
8
|
+
|
9
|
+
module_function
|
10
|
+
|
11
|
+
# Splits a verse string into individual syllables.
#
# Pipes, periods, commas, backslashes and digits are removed and runs of
# whitespace are collapsed to a single space before the text is
# transliterated to SLP1 and scanned with Constants::R_SYL.
#
# @param verse_string [String] the raw text of the verse.
# @param from [Symbol] source transliteration scheme (detected when nil,
#   falling back to Transliterate.default_script)
# @param to [Symbol] output transliteration scheme (defaults to the source)
# @return [Array] the syllables; they stay in SLP1 only when `to` is :slp1,
#   otherwise each one is transliterated to `to`.
def syllables(verse_string, from: nil, to: nil)
  cleaned = verse_string.to_str.gsub(/[\|\.\,\\0-9]+/, "").gsub(/\s+/, " ").strip
  source_script = from || Transliterate.detect(cleaned) || Transliterate.default_script
  target_script = to || source_script

  slp1_syllables = Transliterate.transliterate(cleaned, source_script, :slp1).scan(Constants::R_SYL)
  return slp1_syllables if target_script == :slp1

  slp1_syllables.map { |syl| Transliterate.transliterate(syl, :slp1, target_script) }
end
|
26
|
+
|
27
|
+
# Converts a list of syllables into a string of metrical weights.
#
# Each syllable yields one character:
#   "G" when it matches Constants::R_GSYL,
#   "g" when it matches Constants::R_CCONF, or when its last character joined
#       with the first character of the following syllable matches
#       Constants::R_CCON (i.e. heavy only because of its context),
#   "L" otherwise.
# Apostrophes and surrounding whitespace are ignored when classifying.
#
# @param syllables [Array] a set of syllables
# @param from [Symbol] transliteration scheme of the syllables (detected when
#   nil, falling back to Transliterate.default_script)
# @param contextual [Boolean] keep the lowercase "g" markers when true;
#   otherwise the whole string is upcased
# @return [String] the weight string of the syllables of the verse
def syllables_weights(syllables, from: nil, contextual: false)
  from ||= Transliterate.detect(syllables.join("")) || Transliterate.default_script
  slp1 = if from == :slp1
           syllables
         else
           syllables.to_ary.map { |syl| Transliterate.transliterate(syl, from, :slp1) }
         end

  marks = Array.new(slp1.length) do |pos|
    current = slp1[pos].delete("'").strip
    upcoming = slp1[pos + 1]&.delete("'")&.strip

    if current.match?(Constants::R_GSYL)
      "G"
    elsif current.match?(Constants::R_CCONF) ||
          "#{current[-1]}#{upcoming&.slice(0)}".match?(Constants::R_CCON)
      "g"
    else
      "L"
    end
  end

  joined = marks.join("")
  contextual ? joined : joined.upcase
end
|
53
|
+
|
54
|
+
# Shortcut that syllabifies a verse and returns its weight string in one call.
#
# @param verse_string [String] the raw text of the verse
# @param contextual [Boolean] forwarded to {#syllables_weights}
# @return [String] the weight string of the verse.
def verse_weights(verse_string, contextual: false)
  verse_syllables = syllables(verse_string)
  syllables_weights(verse_syllables, contextual: contextual)
end
|
61
|
+
|
62
|
+
# Runs the full analysis of a verse and bundles the results in one hash.
#
# The first entry returned by {#identify_meter_manager} (if any) supplies
# :status, :meter and :padas; they stay nil when no candidate was found.
# Note that :meter is removed from the candidate's :info hash (which becomes
# :status) via Hash#delete, i.e. that hash is modified in place.
#
# @param verse_string [String] the raw text of the verse
# @return [Hash] keys :verse, :syllables, :weights, :status, :meter, :padas
def identify(verse_string)
  verse_syllables = syllables(verse_string)
  report = {
    verse: verse_string,
    syllables: verse_syllables,
    weights: syllables_weights(verse_syllables, contextual: true),
    status: nil,
    meter: nil,
    padas: nil,
  }

  best = identify_meter_manager(verse_string).first
  return report unless best

  report[:status] = best[:info]
  report[:meter] = report[:status].delete(:meter)
  report[:padas] = best[:corrected_padas]
  report
end
|
80
|
+
|
81
|
+
# Coordinates metrical identification for a verse string.
#
# For each assumed verse length (4 padas down to 1) the exact/regex search is
# run, its hits are grouped per meter, compacted, and handed to
# {#fuzzy_manager}. When none of the four passes yields a candidate, a purely
# fuzzy search over the full verse is used instead. Candidates are ordered by
# descending :heuristic and the best two are returned.
#
# @param verse_string [String] a verse string
# @return [Array] candidate meters and information about their matches
def identify_meter_manager(verse_string)
  verse_syllables = syllables(verse_string)
  weight_string = syllables_weights(verse_syllables)

  candidates = []
  4.downto(1) do |guess_size|
    # TODO: Pre-process or somehow change this so that search is aware of
    # how weight_string may or may not break across padas
    # (i.e. whitespace in syllables)
    search_results = meter_search_partial(weight_string, guess_size)
    next if search_results.empty?

    grouped = search_results.group_by { |result| result[:meter_name] }

    # Filter down to most-complete matches for each meter.
    # NOTE(review): the previous code also called `sort_by { ... }.to_h` here
    # but discarded the result, so meters were never actually reordered by
    # :match_percent. The no-op was removed; the order handed to fuzzy_manager
    # is unchanged. Confirm whether a real sort was intended.
    compacted = compact_meter_results(grouped)

    candidates.concat(fuzzy_manager(compacted, guess_size, weight_string))
  end

  candidates.concat(fuzzy_analysis(weight_string, 4, 0)) if candidates.empty?

  # Ascending sort followed by reverse keeps the original tie ordering.
  candidates.sort_by! { |candidate| candidate[:heuristic] }
  candidates.reverse!
  get_best_matches(candidates, verse_syllables, 2)
end
|
117
|
+
|
118
|
+
# Wraps the leading candidates together with their corrected padas.
#
# @param candidates [Array<Hash>] candidates, already ordered best-first; each
#   must provide :meter and :correct_weights
# @param syllables [Array] the syllables of the verse
# @param number [Integer] maximum number of candidates to keep
# @return [Array<Hash>] hashes with :info (the candidate itself) and
#   :corrected_padas (result of {#fuzzy_correction})
def get_best_matches(candidates, syllables, number)
  wanted = [number, 0].max # a non-positive limit yields no results
  candidates.first(wanted).map do |candidate|
    {
      info: candidate,
      corrected_padas: fuzzy_correction(candidate[:meter], candidate[:correct_weights], syllables),
    }
  end
end
|
137
|
+
|
138
|
+
# Searches for meter candidates for a given weight string and search size.
#
# Pattern sizes tried depend on the guess: full/half/pada for 4 padas,
# half/pada for 2 or 3, pada only otherwise. For every size both the literal
# patterns and the regexes from MetricalData.all are tested, skipping entries
# that {#useful_comparison?} rejects or that do not occur in the string.
#
# @param weight_string [String] a weight string
# @param guess_size [Integer] the number of padas to match against
# @return [Array] candidate meters and associated match data
def meter_search_partial(weight_string, guess_size)
  sizes =
    if guess_size == 4
      %i[full half pada]
    elsif guess_size >= 2
      %i[half pada]
    else
      %i[pada]
    end

  sizes.each_with_object([]) do |pattern_size, found|
    %i[patterns regexes].each do |pattern_type|
      MetricalData.all[pattern_type][pattern_size].each do |pattern, meter|
        next unless useful_comparison?(weight_string, pattern, pattern_size, guess_size)

        matches = find_pattern(weight_string, pattern)
        next if matches.empty?

        matched_length = matches.reduce(0.0) { |total, span| total + span.size }
        found << {
          meter_name: meter.each_key.first,
          type: pattern_type,
          size: pattern_size,
          scope: meter.each_value.first,
          pattern: pattern,
          matches: matches,
          coverage: matched_length / weight_string.length,
          guess_size: guess_size,
        }
      end
    end
  end
end
|
173
|
+
|
174
|
+
# Decides whether testing a pattern against a weight string is worthwhile,
# based purely on how far the string length is from the length expected for
# the guessed number of padas.
#
# @param weight_string [String] a weight string
# @param pattern [String, Regexp] a match pattern (a Regexp is first reduced
#   with {#clean_regexp_pattern})
# @param pattern_size [Symbol] the size of a match pattern
# @param guess_size [Integer] the number of padas being searched for
# @param tolerance [Numeric] the tolerance percentage of length difference
#
# @return [Boolean] true if comparison has good chance of being useful
def useful_comparison?(weight_string, pattern, pattern_size, guess_size, tolerance = 0.2)
  pattern = clean_regexp_pattern(pattern) if pattern.is_a?(Regexp)
  multiplier = pattern_size_multiplier(pattern_size)

  # multiplier * pattern.length approximates a full verse; a quarter of that
  # per guessed pada (integer division).
  expected_length = guess_size * multiplier * pattern.length / 4
  allowed_gap = tolerance * multiplier * pattern.length

  (weight_string.length - expected_length).abs <= allowed_gap
end
|
191
|
+
|
192
|
+
# Finds all non-overlapping occurrences of a match pattern in a weight
# string, scanning left to right.
#
# Zero-width matches (an empty String pattern, or a Regexp that can match the
# empty string) are skipped: they cover no syllable, and recording them would
# keep the scan from ever advancing.
#
# @param weight_string [String] a weight string
# @param pattern [String, Regexp] a match pattern; any other type yields []
# @return [Array] array of index-ranges of pattern matches within weight string
def find_pattern(weight_string, pattern)
  ranges = []
  cursor = 0

  while cursor <= weight_string.length
    hit =
      case pattern
      when String
        at = weight_string.index(pattern, cursor)
        at && [at, pattern.length]
      when Regexp
        found = pattern.match(weight_string, cursor)
        found && [found.begin(0), found[0].length]
      end
    break unless hit

    start, width = hit
    if width.zero?
      # Step past the empty match so the loop is guaranteed to terminate.
      cursor = start + 1
      next
    end

    ranges << (start..(start + width - 1))
    cursor = start + width
  end

  ranges
end
|
217
|
+
|
218
|
+
# Returns the source of a Regexp with grouping, anchor and alternation
# characters -- ( ) ^ $ | -- removed.
#
# @param regexp [Regexp] a regular expression
# @return [String] a clean string of the pattern
def clean_regexp_pattern(regexp)
  regexp.source.gsub(/[\(\)\^\$\|]+/, "")
end
|
227
|
+
|
228
|
+
# Maps a pattern size symbol to its multiplier (:full => 1, :half => 2,
# :pada => 4).
#
# @param pattern_size [Symbol] a pattern size symbol
# @return [Integer, nil] a multiplier, or nil for an unrecognised symbol
def pattern_size_multiplier(pattern_size)
  { full: 1, half: 2, pada: 4 }[pattern_size]
end
|
242
|
+
|
243
|
+
# Checks that a range does not overlap any range in an array of ranges.
#
# Two inclusive ranges overlap when each one starts no later than the other
# ends. This also catches the case where +range+ completely contains an
# existing range, which an endpoint-only test misses.
#
# @param indexes [Array<Range>] Array of ranges already accepted
# @param range [Range] the candidate range
# @return [Boolean] true when +range+ shares no position with any of +indexes+
def check_non_overlapping?(indexes, range)
  indexes.none? do |taken|
    taken.begin <= range.end && range.begin <= taken.end
  end
end
|
256
|
+
|
257
|
+
# Collapses the search results of each meter into a single summary entry.
#
# Match ranges are accepted in the order given, skipping any that
# {#check_non_overlapping?} rejects against the ranges already accepted.
# Every accepted range adds to :match_percent according to the size of the
# pattern that produced it (:full 100, :half 50, :pada 25).
#
# @param meter_results [Hash] a hash of search results grouped by meter
# @return [Hash] a compacted hash of search results grouped by meter; each
#   value is a one-element array holding :pattern_type (taken from the
#   meter's first result), :matches (sorted by start) and :match_percent
def compact_meter_results(meter_results)
  points = { full: 100, half: 50, pada: 25 }

  meter_results.each_with_object({}) do |(meter_name, results), compacted|
    accepted = []
    percent = 0

    results.each do |result|
      result[:matches].each do |span|
        next unless check_non_overlapping?(accepted, span)

        accepted << span
        percent += points.fetch(result[:size], 0)
      end
    end

    compacted[meter_name] = [
      {
        pattern_type: results[0][:type],
        matches: accepted.sort_by(&:begin),
        match_percent: percent,
      },
    ]
  end
end
|
291
|
+
|
292
|
+
# Turns compacted per-meter search results into scored candidates, repairing
# the unmatched parts of the weight string where the exact match was partial.
#
# For each meter a quality figure is derived from :match_percent relative to
# the guessed number of padas:
#   100     -- everything outside the matched ranges is marked "d"; no
#              further meters are processed after this one.
#   60..75  -- the largest unmatched span is aligned against a pada pattern.
#   30..50  -- up to two unmatched spans are aligned; a span longer than a
#              quarter of the meter's total length plus 3 is aligned against
#              a half-verse pattern, after which no second span is processed.
#   else    -- {#fuzzy_analysis} results for the same meter are added and the
#              working string is replaced by "x" markers.
# Lowercase letters in the working string count as edits.
#
# NOTE: the array stored under :matches of the processed entry is passed to
# {#update_index_array}, which appends to it in place.
#
# @param meter_results [Hash] output of {#compact_meter_results}
# @param guess_size [Integer] the number of padas assumed (1..4)
# @param weight_string [String] the weight string of the verse
# @return [Array<Hash>] candidates with :len_assumption, :meter, :type,
#   :match_indexes, :percent_match, :edit_count, :correct_weights, :heuristic
def fuzzy_manager(meter_results, guess_size, weight_string)
  original = weight_string.dup
  results = []
  exact_found = false

  meter_results.each do |meter, entries|
    break if exact_found

    top = entries[0]
    working = original.dup
    spans = top[:matches]
    quality = (top[:match_percent] * 100) / (guess_size * 25)

    # to deal with problematic case when p=25 and guess=2, should be only one correction
    quality += 25 if quality == 50 && guess_size == 2

    # Aligns the largest unmatched span against a pattern of this meter and
    # splices the annotated result back in. Returns true when a half-verse
    # pattern was used.
    repair_gap = lambda do |allow_half|
      gap = get_unmatched_range(spans, working.length - 1)
      gap_size = gap.end - gap.begin + 1
      portion = working.slice!(gap.begin, gap_size)
      use_half = allow_half && gap_size > ((metercount[meter][4] / 4) + 3)
      pattern = get_specific_pattern(meter, use_half ? :half : :pada, top[:pattern_type], portion)
      fixed = corrected_string(portion, pattern)
      working.insert(gap.begin, fixed)
      spans = update_index_array(spans, gap, fixed.length - portion.length)
      use_half
    end

    case quality
    when 100
      working = remove_extra_syllables(working, spans)
      exact_found = true
    when 60..75
      repair_gap.call(false)
      working = remove_extra_syllables(working, spans)
    when 30..50
      # Second repair only when the first one did not consume a half verse.
      repair_gap.call(true) unless repair_gap.call(true)
      working = remove_extra_syllables(working, spans)
    else
      fuzzy_analysis(original, guess_size, top[:match_percent]).each do |candidate|
        results << candidate if candidate[:meter] == meter
      end
      working = "x" * working.length
    end

    status = get_pada_status(meter, working, spans, guess_size)
    pada_weights = get_weight_by_pada(status, working)
    edit_count = working.scan(/[a-z]/).length

    results << {
      len_assumption: guess_size.to_s + "/4",
      meter: meter,
      type: top[:pattern_type],
      match_indexes: status,
      percent_match: top[:match_percent],
      edit_count: edit_count,
      correct_weights: pada_weights,
      heuristic: (2 * top[:match_percent]) + (100 - (edit_count * 100 / weight_string.length)),
    }
  end

  results
end
|
369
|
+
|
370
|
+
# Splits the matched ranges of a corrected weight string into padas and
# labels each pada.
#
# Ranges are consumed left to right. A range whose slice contains no
# lowercase letter is "exact" and is cut after the pada's nominal length
# (from {#metercount}); otherwise it is "fuzzy" and is cut after as many
# characters as are needed to collect that many non-"d" marks. Padas beyond
# +guess_size+ (up to 4) are reported as "missing" with a nil range.
#
# @param meter [String] meter name, used to look up pada lengths
# @param correct_weights [String] the corrected/annotated weight string
# @param indexes [Array<Range>] matched ranges within +correct_weights+
#   (presumably sorted by start -- verify against callers)
# @param guess_size [Integer] the number of padas assumed
# @return [Array<Hash>] entries with :pada_number, :pada_range, :pada_status
def get_pada_status(meter, correct_weights, indexes, guess_size)
  pada_lengths = metercount[meter]
  weights = correct_weights.dup
  pending = indexes.dup
  report = []
  span = nil

  # TO DO : identify which padas are actually missing
  1.upto(guess_size) do |pada|
    break if span.nil? && pending.empty?

    span ||= pending.shift
    span_length = span.end - span.begin + 1
    untouched = weights.slice(span.begin, span_length).scan(/[a-z]/).empty?
    wanted = pada_lengths[pada - 1]

    if untouched
      state = "exact"
      if span_length == wanted
        covered = span
        span = nil
      else
        covered = (span.begin..(span.begin + wanted - 1))
        span = ((span.begin + wanted)..span.end)
      end
    else
      state = "fuzzy"
      stop = span.begin
      kept = 0
      # Deleted ("d") positions do not count towards the pada length.
      while kept < wanted
        kept += 1 if weights[stop] != "d"
        stop += 1
      end
      if stop > span.end
        covered = span
        span = nil
      else
        covered = (span.begin..(stop - 1))
        span = (stop..span.end)
      end
    end

    report << { pada_number: pada, pada_range: covered, pada_status: state }
  end

  (guess_size + 1).upto(4) do |pada|
    report << { pada_number: pada, pada_range: nil, pada_status: "missing" }
  end

  report
end
|
438
|
+
|
439
|
+
# Cuts a corrected weight string into one piece per pada.
#
# Each non-missing pada receives everything from the end of the previous
# pada up to the end of its own :pada_range, so characters lying between two
# ranges are attached to the following pada. Missing padas yield "".
#
# @param status [Array<Hash>] output of {#get_pada_status}
# @param corrected_weights [String] the corrected/annotated weight string
# @return [Array<String>] one weight string per status entry
def get_weight_by_pada(status, corrected_weights)
  cursor = 0
  status.map do |pada|
    if pada[:pada_status] == "missing"
      ""
    else
      last = pada[:pada_range].end
      piece = corrected_weights.slice(cursor, last - cursor + 1)
      cursor = last + 1
      piece
    end
  end
end
|
457
|
+
|
458
|
+
# Builds candidates purely from fuzzy (edit-distance) matching of the whole
# weight string.
#
# Only the matches sharing the smallest edit distance are kept (distances of
# 100 or more never beat the initial bound of 100). The fuzzy search is run
# once and its results reused for both the minimum and the selection; it was
# previously executed twice with identical arguments.
#
# @param weight_string [String] the weight string of the verse
# @param guess_size [Integer] the number of padas assumed
# @param per_match [Numeric] exact-match percentage to report and to weigh
#   into the heuristic
# @return [Array<Hash>] candidates with :len_assumption, :meter, :type,
#   :match_indexes, :percent_match, :edit_count, :correct_weights, :heuristic
def fuzzy_analysis(weight_string, guess_size, per_match)
  fuzzy_matches = meter_search_fuzzy(weight_string.dup, guess_size)
  fewest_edits = fuzzy_matches.map { |match| match[:edit_distance] }.push(100).min

  closest = fuzzy_matches.select { |match| match[:edit_distance] == fewest_edits }
  closest.map do |match|
    corrected = corrected_string(weight_string, match[:pattern])
    status = get_pada_status(match[:meter], corrected, [(0..(corrected.length - 1))], guess_size)

    {
      len_assumption: guess_size.to_s + "/4",
      meter: match[:meter],
      type: match[:type],
      match_indexes: status,
      percent_match: per_match,
      edit_count: match[:edit_distance],
      correct_weights: get_weight_by_pada(status, corrected),
      heuristic: (2 * per_match) + (100 - (match[:edit_distance] * 100 / weight_string.length)),
    }
  end
end
|
490
|
+
|
491
|
+
# Fetches the concrete weight pattern of a meter to align a fragment against.
#
# For :patterns the first pada of the meter is returned (size :pada), or the
# first two padas concatenated (any other size). For :regexes the first
# regex registered for the meter at the given size is stripped of ^ $ ( )
# and resolved against +weight_string+ via {#closest_pattern_to_regex}.
#
# NOTE(review): on the :regexes path, when no entry belongs to +meter_name+
# the value of the `each` call (the iterated collection) is returned rather
# than a String; for any other +type+ the result is nil. Callers should be
# checked against both cases.
#
# @param meter_name [String] the meter to look up
# @param size [Symbol] :pada or :half
# @param type [Symbol] :patterns or :regexes
# @param weight_string [String] fragment used to resolve regex wildcards
# @return [String] a mutable pattern string (see the note above for
#   exceptions)
def get_specific_pattern(meter_name, size, type, weight_string)
  case type
  when :patterns
    padas = MetricalData.meters[meter_name]
    return padas[0].dup if size == :pada

    padas[0].dup + padas[1].dup
  when :regexes
    MetricalData.all[type][size].each do |regexp, meter|
      next unless meter_name == meter.keys.first

      stripped = regexp.source.gsub(/[\^\$\(\)]/, "")
      return closest_pattern_to_regex(weight_string, stripped)[:pattern]
    end
  end
end
|
512
|
+
|
513
|
+
# Marks every position that lies outside all of the given ranges with "d".
#
# @param weights [String] a weight string
# @param indexes [Array<Range>] inclusive ranges of positions to keep
# @return [String] a new string of the same length with uncovered positions
#   replaced by "d"
def remove_extra_syllables(weights, indexes)
  marked = weights.each_char.with_index.map do |mark, pos|
    covered = indexes.any? { |span| pos >= span.begin && pos <= span.end }
    covered ? mark : "d"
  end
  marked.join
end
|
528
|
+
|
529
|
+
# Returns the widest span of positions not covered by the given ranges.
#
# Starts from the gap before the first range or the gap after the last one
# (the trailing gap wins ties), then switches to a gap between consecutive
# ranges only when that gap is strictly wider.
#
# @param indexes [Array<Range>] matched ranges (must be non-empty; presumably
#   sorted by start -- verify against callers)
# @param last [Integer] index of the last position of the weight string
# @return [Range] the selected unmatched span
def get_unmatched_range(indexes, last)
  leading = indexes[0].begin
  trailing = last - indexes[-1].end
  widest = leading > trailing ? (0..(leading - 1)) : ((indexes[-1].end + 1)..last)

  previous_end = 0
  indexes.each do |span|
    gap = span.begin - previous_end - 1
    widest = ((previous_end + 1)..(span.begin - 1)) if gap > (widest.end - widest.begin + 1)
    previous_end = span.end
  end

  widest
end
|
544
|
+
|
545
|
+
# Inserts a corrected span into the list of ranges and shifts everything
# after it by the change in length.
#
# NOTE: +max+ is appended to the +indexes+ array passed in (the caller's
# array is modified); the adjusted ranges are returned in a new array.
#
# @param indexes [Array<Range>] current ranges
# @param max [Range] the span that was just corrected
# @param diff [Integer] length of the corrected text minus the original
# @return [Array<Range>] ranges sorted by start; the range starting where
#   +max+ starts is resized by +diff+, later ranges are moved by +diff+
def update_index_array(indexes, max, diff)
  indexes << max

  indexes.sort_by(&:begin).map do |span|
    if span.begin < max.begin
      span
    elsif span.begin == max.begin
      (max.begin..(max.end + diff))
    else
      ((span.begin + diff)..(span.end + diff))
    end
  end
end
|
564
|
+
|
565
|
+
# Aligns a weight string with a target pattern (Levenshtein alignment) and
# returns the weights annotated with the edits needed to reach the pattern.
#
# Marks in the result:
#   "L"/"G" -- position already agrees with the pattern
#   "f"     -- substitution (the syllable has the wrong weight)
#   "d"     -- deletion (extra syllable in the weights)
#   "l"/"g" -- insertion (a light/heavy syllable missing from the weights)
#
# Neither argument is modified, so frozen patterns are accepted. Once the
# backtrace has consumed the whole pattern, the remaining leading weights
# are always marked as deletions.
#
# @param weights [String] the actual weight string
# @param pattern [String] the target weight pattern
# @return [String] the annotated weight string; +weights+ itself when
#   +pattern+ is empty
def corrected_string(weights, pattern)
  return weights if pattern.empty?

  actual = weights.chars
  target = pattern.chars
  rows = actual.length
  cols = target.length

  # table[i][j] = edit distance between the first i weights and the first j
  # pattern characters.
  table = Array.new(rows + 1) { Array.new(cols + 1, 0) }
  (0..rows).each { |i| table[i][0] = i }
  (0..cols).each { |j| table[0][j] = j }

  (1..rows).each do |i|
    (1..cols).each do |j|
      table[i][j] =
        if actual[i - 1] == target[j - 1]
          table[i - 1][j - 1]
        else
          [table[i - 1][j], table[i - 1][j - 1], table[i][j - 1]].min + 1
        end
    end
  end

  # Walk back from the bottom-right corner, collecting marks in reverse.
  marks = []
  i = rows
  j = cols
  while i.positive? || j.positive?
    if i.positive? && j.positive? && actual[i - 1] == target[j - 1]
      marks << actual[i - 1]
      i -= 1
      j -= 1
      next
    end

    step =
      if j.zero?
        :delete # pattern exhausted: only extra weights remain
      elsif i.zero?
        :insert # weights exhausted: only missing syllables remain
      else
        cheapest = [table[i - 1][j], table[i - 1][j - 1], table[i][j - 1]].min
        # Tie-break order: insertion, then substitution, then deletion.
        if cheapest == table[i][j - 1]
          :insert
        elsif cheapest == table[i - 1][j - 1]
          :substitute
        else
          :delete
        end
      end

    case step
    when :insert
      marks << (target[j - 1] == "L" ? "l" : "g")
      j -= 1
    when :substitute
      marks << "f" # to mark substitution in string
      i -= 1
      j -= 1
    when :delete
      marks << "d" # to mark deletion from string
      i -= 1
    end
  end

  marks.reverse.join("")
end
|
626
|
+
|
627
|
+
# Memoized table of pada lengths per meter.
#
# For every meter in MetricalData.meters the value is the length of each
# pada followed by their sum. Meters that only appear in
# MetricalData.regexes.full are added when their regex has capture groups
# and no alternation: the length of each group's source followed by the
# length of the source stripped of ^ $ ( ). The result is sorted by meter
# name and deep-frozen.
#
# @return [Hash] meter name => array of lengths, the last element being the
#   total
def metercount
  @metercount ||= begin
    counts = {}

    MetricalData.meters.each do |meter_name, padas|
      lengths = padas.map(&:length)
      counts[meter_name] = lengths << lengths.reduce(&:+)
    end

    MetricalData.regexes.full.each do |regexp, meters|
      meter_name = meters.keys.first
      next if counts.key?(meter_name)

      source = regexp.source
      next if source.include?("|") || !source.include?("(")

      group_lengths = source.scan(/\(([^()]*)\)/).flatten.map(&:length)
      counts[meter_name] = group_lengths << source.gsub(/[\^\$\(\)]/, "").length
    end

    counts.sort.to_h.deep_freeze
  end
end
|
652
|
+
|
653
|
+
# Renders the syllables of a verse pada by pada, annotated with the edits
# recorded in the corrected weight strings.
#
# Per mark in each corrected weight string:
#   "d"  -- the syllable is wrapped in [ ] (to be deleted)
#   "f"  -- the syllable is wrapped in ( ) (wrong weight)
#   "g"/"l" -- no syllable is consumed; "(g)"/"(l)" is emitted, preceded by
#           " { " when no insertion group is currently open
#   other -- the syllable is emitted as is; an open group is closed with
#           " } " first when the mark is "L" after a "g"-opened group or "G"
#           after an "l"-opened group
# The open-group state carries over from one pada to the next.
#
# @param _meter [Object] unused
# @param corrected_weights [Array<String>] annotated weights, one per pada
# @param syllables [Array<String>] the syllables of the verse
# @return [Array<String>] one annotated string per pada
def fuzzy_correction(_meter, corrected_weights, syllables)
  cursor = 0  # position in syllables
  pending = 0 # 0: no open group, 1: opened by "(l)", 2: opened by "(g)"

  corrected_weights.map do |pada_marks|
    pieces = []

    pada_marks.each_char do |mark|
      case mark
      when "d"
        pieces << ("[" + syllables[cursor] + "]")
        cursor += 1
      when "f"
        pieces << ("(" + syllables[cursor] + ")")
        cursor += 1
      when "g", "l"
        if pending.zero?
          pieces << " { (#{mark})"
          pending = mark == "g" ? 2 : 1
        else
          pieces << "(#{mark})"
        end
      else
        closes_group = (pending == 2 && mark == "L") || (pending == 1 && mark == "G")
        if closes_group
          pieces << (" } " + syllables[cursor])
          pending = 0
        else
          pieces << syllables[cursor]
        end
        cursor += 1
      end
    end

    pieces.join("")
  end
end
|
716
|
+
|
717
|
+
# Searches all full-verse patterns and regexes for meters whose first
# +guess_size+ padas are within a small edit distance of the weight string.
#
# A pattern is considered only when its length differs from the weight
# string by at most 20% of the pattern length, and kept only when the edit
# distance is at most 15% of the pattern length. Regexes containing an
# alternation are skipped; the others are cut to guess_size/4 of their
# stripped source and resolved with {#closest_pattern_to_regex}.
#
# @param weight_string [String] the weight string of the verse
# @param guess_size [Integer] the number of padas assumed
# @return [Array<Hash>] matches with :meter, :type, :guess_size, :pattern,
#   :edit_distance
def meter_search_fuzzy(weight_string, guess_size)
  syllable_count = weight_string.length
  length_variance = 0.2
  edit_tolerance = 0.15
  matcher = Amatch::Levenshtein.new(weight_string)

  %i[patterns regexes].flat_map do |type|
    MetricalData.all[type][:full].each_with_object([]) do |(entry, meter), found|
      meter_name = meter.keys.first

      case entry
      when String
        # Keep only the leading padas that the guess covers.
        remaining = entry.dup
        pada_lengths = metercount[meter_name]
        pattern = ""
        guess_size.times { |i| pattern += remaining.slice!(0, pada_lengths[i]) }

        next if (pattern.length - syllable_count).abs > length_variance * pattern.length

        edit_distance = matcher.match(pattern)
        next if edit_distance > edit_tolerance * pattern.length

        pattern_string = pattern
      when Regexp
        next if entry.source["|"]

        stripped = entry.source.gsub(/[\^\$\(\)]/, "")
        pattern = stripped.slice(0...(guess_size * stripped.length / 4))
        next if (pattern.length - syllable_count).abs > length_variance * pattern.length

        resolved = closest_pattern_to_regex(weight_string, pattern)
        pattern_string = resolved[:pattern]
        edit_distance = resolved[:edit_distance]
        next if edit_distance > edit_tolerance * pattern.length
      end

      found << {
        meter: meter_name,
        type: type,
        guess_size: guess_size,
        pattern: pattern_string,
        edit_distance: edit_distance,
      }
    end
  end
end
|
767
|
+
|
768
|
+
# Resolves a regex-style pattern containing "." wildcards into the concrete
# weight pattern closest to the given weight string.
#
# The wildcards are first treated as "L" to obtain an edit distance
# (Amatch::Levenshtein) and an alignment ({#corrected_string}). Walking the
# alignment, a substitution that falls on a wildcard takes the weight
# actually present in the string and is not counted as an edit.
#
# @param weight_string [String] the actual weight string
# @param pattern [String] pattern source made of L, G and "."
# @return [Hash] :pattern (the concrete pattern) and :edit_distance
def closest_pattern_to_regex(weight_string, pattern)
  concrete_guess = pattern.tr(".", "L")
  edit_distance = Amatch::Levenshtein.new(weight_string).match(concrete_guess)
  alignment = corrected_string(weight_string, concrete_guess)

  resolved = []
  pattern_pos = 0 # for pattern
  weight_pos = 0  # for weight string
  pattern = pattern.lstrip

  alignment.each_char do |mark|
    case mark
    when "L", "G"
      resolved << mark
      pattern_pos += 1
      weight_pos += 1
    when "l", "g"
      # Insertion: the pattern has a syllable the weights lack.
      resolved << mark.upcase
      pattern_pos += 1
    when "d"
      weight_pos += 1
    else
      if mark == "f" && pattern[pattern_pos] == "."
        # A wildcard accepts whatever weight is actually there.
        edit_distance -= 1
        resolved << weight_string[weight_pos]
      else
        resolved << pattern[pattern_pos]
      end
      pattern_pos += 1
      weight_pos += 1
    end
  end

  { pattern: resolved.join(""), edit_distance: edit_distance }
end
|
815
|
+
end
|
816
|
+
end
|