dphil 0.1.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CODE_OF_CONDUCT.md +49 -0
- data/Gemfile +6 -0
- data/LICENSE +201 -0
- data/README.md +54 -0
- data/Rakefile +11 -0
- data/dphil.gemspec +49 -0
- data/exe/dphil +10 -0
- data/lib/dphil.rb +53 -0
- data/lib/dphil/cache.rb +15 -0
- data/lib/dphil/change_list.rb +6 -0
- data/lib/dphil/character.rb +236 -0
- data/lib/dphil/character_matrix.rb +102 -0
- data/lib/dphil/cli.rb +26 -0
- data/lib/dphil/cli_commands/csv2ld.rb +71 -0
- data/lib/dphil/cli_commands/csv2nex.rb +37 -0
- data/lib/dphil/constants.rb +128 -0
- data/lib/dphil/converter.rb +58 -0
- data/lib/dphil/converters/csv2nex.rb +83 -0
- data/lib/dphil/ld_data_set.rb +25 -0
- data/lib/dphil/ld_output.rb +29 -0
- data/lib/dphil/lemma.rb +44 -0
- data/lib/dphil/lemma_list.rb +179 -0
- data/lib/dphil/log_formatter.rb +39 -0
- data/lib/dphil/logger.rb +27 -0
- data/lib/dphil/metrical_data.rb +78 -0
- data/lib/dphil/newick.rb +52 -0
- data/lib/dphil/paup.rb +34 -0
- data/lib/dphil/refinements.rb +8 -0
- data/lib/dphil/refinements/natural_sort.rb +52 -0
- data/lib/dphil/script_string.rb +124 -0
- data/lib/dphil/syllables.rb +43 -0
- data/lib/dphil/syllables/syllable.rb +45 -0
- data/lib/dphil/tei_xml.rb +142 -0
- data/lib/dphil/transliterate.rb +131 -0
- data/lib/dphil/tree.rb +142 -0
- data/lib/dphil/tree_node.rb +67 -0
- data/lib/dphil/verse.rb +25 -0
- data/lib/dphil/verse_analysis.rb +509 -0
- data/lib/dphil/verse_analysis_new.rb +816 -0
- data/lib/dphil/version.rb +30 -0
- data/vendor/default_commands.paup +18 -0
- data/vendor/metrical_data.yml +4035 -0
- metadata +409 -0
@@ -0,0 +1,816 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "amatch"
|
4
|
+
|
5
|
+
module Dphil
|
6
|
+
module VerseAnalysis
|
7
|
+
using ::Ragabash::Refinements
|
8
|
+
|
9
|
+
module_function
|
10
|
+
|
11
|
+
# Splits a verse string into its syllables.
#
# Pipe, period, comma, backslash and ASCII digit characters are removed and
# runs of whitespace are collapsed before the text is transliterated to SLP1
# and scanned with Constants::R_SYL.
#
# @param verse_string [String] the raw text of the verse.
# @param from [Symbol] source transliteration scheme (detected when nil,
#   falling back to Transliterate.default_script)
# @param to [Symbol] output transliteration scheme (defaults to the source)
# @return [Array] the syllables, transliterated into the +to+ scheme.
def syllables(verse_string, from: nil, to: nil)
  cleaned = verse_string.to_str.gsub(/[\|\.\,\\0-9]+/, "").gsub(/\s+/, " ").strip
  from ||= Transliterate.detect(cleaned) || Transliterate.default_script
  to ||= from

  slp1_text = Transliterate.transliterate(cleaned, from, :slp1)
  parts = slp1_text.scan(Constants::R_SYL)
  return parts if to == :slp1

  # The scan ran on SLP1 text; convert each syllable to the requested scheme.
  parts.map! { |syl| Transliterate.transliterate(syl, :slp1, to) }
end
|
26
|
+
|
27
|
+
# Converts a list of syllables into a string of metrical weights.
#
# Each syllable yields one character: "G" when it matches Constants::R_GSYL,
# "g" when it matches Constants::R_CCONF or when its last character plus the
# first character of the following syllable matches Constants::R_CCON, and
# "L" otherwise. Apostrophes and surrounding whitespace are ignored.
#
# @param syllables [Array] a set of syllables
# @param from [Symbol] transliteration scheme of the syllables (detected when nil)
# @param contextual [Boolean] keep the lowercase "g" marks when true;
#   otherwise the whole string is upcased
# @return [String] the weight string of the syllables of the verse
def syllables_weights(syllables, from: nil, contextual: false)
  from ||= Transliterate.detect(syllables.join("")) || Transliterate.default_script
  syllables = syllables.to_ary.map { |syl| Transliterate.transliterate(syl, from, :slp1) } if from != :slp1

  weights = syllables.each_with_index.map do |syl, position|
    current = syl.delete("'").strip
    following = syllables[position + 1]&.delete("'")&.strip

    if current.match?(Constants::R_GSYL)
      # Guru if current syllable contains a long vowel, or ends in a ṃ/ḥ
      "G"
    elsif current.match?(Constants::R_CCONF) ||
          "#{current[-1]}#{following&.slice(0)}".match?(Constants::R_CCON)
      # Contextually Guru if ending in a cluster, or if syllable-final and
      # next syllable-initial characters make a consonant cluster.
      "g"
    else
      "L"
    end
  end.join("")

  contextual ? weights : weights.upcase
end
|
53
|
+
|
54
|
+
# Convenience method: syllabifies a verse and returns its weight string.
#
# @param verse_string [String] the raw text of the verse
# @param contextual [Boolean] passed through to {#syllables_weights}
# @return [String] the weight string of the verse.
def verse_weights(verse_string, contextual: false)
  verse_syllables = syllables(verse_string)
  syllables_weights(verse_syllables, contextual: contextual)
end
|
61
|
+
|
62
|
+
# Analyses a verse and reports its syllables, contextual weights and the
# best meter candidate returned by {#identify_meter_manager}.
#
# @param verse_string [String] the raw text of the verse
# @return [Hash] with keys :verse, :syllables, :weights, :status, :meter and
#   :padas; the last three are nil when no candidate was found. :meter is
#   removed from the candidate's info hash before that hash is returned as
#   :status.
def identify(verse_string)
  v_syllables = syllables(verse_string)
  v_weight = syllables_weights(v_syllables, contextual: true)

  status = meter = padas = nil
  best = identify_meter_manager(verse_string).first
  if best
    status = best[:info]
    meter = status.delete(:meter)
    padas = best[:corrected_padas]
  end

  {
    verse: verse_string,
    syllables: v_syllables,
    weights: v_weight,
    status: status,
    meter: meter,
    padas: padas,
  }
end
|
80
|
+
|
81
|
+
# Coordinates metrical identification for a verse string.
#
# For each assumed verse length (4 padas down to 1) exact pattern matches are
# collected, compacted per meter and passed to {#fuzzy_manager}. When nothing
# is found at any length, a purely fuzzy search over the full verse is used
# instead. The two candidates with the highest :heuristic are returned.
#
# @param verse_string [String] a verse string
# @return [Array] candidate meters and information about their matches
def identify_meter_manager(verse_string)
  verse_syllables = syllables(verse_string)
  weight_string = syllables_weights(verse_syllables)

  candidates = []
  4.downto(1) do |guess_size|
    # TODO: Pre-process or somehow change this so that search is aware of
    # how weight_string may or may not break across padas
    # (i.e. whitespace in syllables)
    search_results = meter_search_partial(weight_string, guess_size)
    next if search_results.empty?

    grouped = search_results.group_by { |result| result[:meter_name] }

    # Filter down to most-complete matches for each meter.
    # NOTE: an earlier `sort_by { ... }.to_h` on this hash discarded its
    # result and therefore never affected ordering; it has been removed.
    compacted = compact_meter_results(grouped)

    candidates.concat(fuzzy_manager(compacted, guess_size, weight_string))
  end

  candidates.concat(fuzzy_analysis(weight_string, 4, 0)) if candidates.empty?

  # Highest heuristic first.
  candidates.sort_by! { |candidate| candidate[:heuristic] }
  candidates.reverse!
  get_best_matches(candidates, verse_syllables, 2)
end
|
117
|
+
|
118
|
+
# Wraps the leading candidates together with their corrected pada text.
#
# @param candidates [Array<Hash>] candidates, best first; each must provide
#   :meter and :correct_weights
# @param syllables [Array] the syllables of the verse
# @param number [Integer] maximum number of candidates to return
# @return [Array<Hash>] hashes with :info (the candidate) and
#   :corrected_padas (the result of {#fuzzy_correction})
def get_best_matches(candidates, syllables, number)
  limit = [number, 0].max
  candidates.first(limit).map do |candidate|
    {
      info: candidate,
      corrected_padas: fuzzy_correction(candidate[:meter], candidate[:correct_weights], syllables),
    }
  end
end
|
137
|
+
|
138
|
+
# Searches for meter candidates for a given weight string and search size.
#
# Full-verse patterns are only tried when four padas are assumed, half-verse
# patterns when at least two are assumed, and single-pada patterns always.
# Both literal patterns and regexes from MetricalData are consulted.
#
# @param weight_string [String] a weight string
# @param guess_size [Integer] the number of padas to match against
# @return [Array] candidate meters and associated match data
def meter_search_partial(weight_string, guess_size)
  size_groups =
    if guess_size == 4 then %i[full half pada]
    elsif guess_size >= 2 then %i[half pada]
    else %i[pada]
    end

  size_groups.product(%i[patterns regexes]).each_with_object([]) do |(pattern_size, pattern_type), found|
    MetricalData.all[pattern_type][pattern_size].each do |pattern, meter|
      next unless useful_comparison?(weight_string, pattern, pattern_size, guess_size)

      # Match pattern against weight_string by `.find_pattern`
      matches = find_pattern(weight_string, pattern)
      next if matches.empty?

      matched_length = matches.reduce(0.0) { |total, range| total + range.size }
      found << {
        meter_name: meter.each_key.first,
        type: pattern_type,
        size: pattern_size,
        scope: meter.each_value.first,
        pattern: pattern,
        matches: matches,
        coverage: matched_length / weight_string.length,
        guess_size: guess_size,
      }
    end
  end
end
|
173
|
+
|
174
|
+
# Determines whether matching a pattern against a weight string is likely to
# be useful, judged purely by length: the weight string must be within
# +tolerance+ of the length expected for +guess_size+ padas of the pattern.
#
# @param weight_string [String] a weight string
# @param pattern [String, Regexp] a match pattern
# @param pattern_size [Symbol] the size of a match pattern
# @param guess_size [Integer] the number of padas being searched for
# @param tolerance [Numeric] the tolerance percentage of length difference
#
# @return [Boolean] true if comparison has good chance of being useful
def useful_comparison?(weight_string, pattern, pattern_size, guess_size, tolerance = 0.2)
  pattern = clean_regexp_pattern(pattern) if pattern.is_a?(Regexp)
  multiplier = pattern_size_multiplier(pattern_size)

  expected_length = guess_size * multiplier * pattern.length / 4
  allowed_difference = tolerance * multiplier * pattern.length
  (weight_string.length - expected_length).abs <= allowed_difference
end
|
191
|
+
|
192
|
+
# Finds all non-overlapping occurrences of a match pattern in a weight
# string, scanning left to right.
#
# Empty String patterns and zero-width Regexp matches are ignored; they carry
# no positional information and would otherwise never advance the scan.
#
# @param weight_string [String] a weight string
# @param pattern [String, Regexp] a match pattern
# @return [Array<Range>] inclusive index ranges of the matches within the
#   weight string (empty for any other pattern type)
def find_pattern(weight_string, pattern)
  indexes = []
  pos = 0

  case pattern
  when String
    return indexes if pattern.empty?

    while (start = weight_string.index(pattern, pos))
      pos = start + pattern.length
      indexes << (start..(pos - 1))
    end
  when Regexp
    while pos <= weight_string.length && (match = pattern.match(weight_string, pos))
      if match[0].empty?
        # Zero-width match: step past it so the scan always makes progress.
        pos = match.begin(0) + 1
        next
      end
      indexes << (match.begin(0)..(match.end(0) - 1))
      pos = match.end(0)
    end
  end

  indexes
end
|
217
|
+
|
218
|
+
# Returns the source of a Regexp with grouping, anchor and alternation
# characters -- ( ) ^ $ | -- removed.
#
# @param regexp [Regexp] a regular expression
# @return [String] a clean string of the pattern
def clean_regexp_pattern(regexp)
  # "^" is not first in the set, so String#delete treats it literally.
  regexp.source.delete("()^$|")
end
|
227
|
+
|
228
|
+
# Returns how many patterns of the given size make up a full verse.
#
# @param pattern_size [Symbol] a pattern size symbol (:full, :half or :pada)
# @return [Integer, nil] the multiplier, or nil for any other value
def pattern_size_multiplier(pattern_size)
  { full: 1, half: 2, pada: 4 }[pattern_size]
end
|
242
|
+
|
243
|
+
# Checks that a range does not overlap any range in an array of ranges.
#
# Two ranges overlap when they share at least one index. This includes the
# case where +range+ completely contains an existing range, which an
# endpoint-only test would miss.
#
# @param indexes [Array<Range>] existing inclusive index ranges
# @param range [Range] the inclusive index range to test
# @return [Boolean] true when +range+ overlaps none of +indexes+
def check_non_overlapping?(indexes, range)
  indexes.none? do |existing|
    existing.begin <= range.end && range.begin <= existing.end
  end
end
|
256
|
+
|
257
|
+
# Filters out redundant meter results by compiling together the most
# complete possible set of non-overlapping matches for each meter.
#
# Results are visited in the order given; a match range is kept only when
# {#check_non_overlapping?} accepts it against the ranges already kept. Each
# kept range adds 100, 50 or 25 to the meter's :match_percent for a :full,
# :half or :pada pattern respectively.
#
# @param meter_results [Hash] a hash of search results grouped by meter
# @return [Hash] meter name => one-element Array holding a Hash with
#   :pattern_type, :matches (sorted by start index) and :match_percent
def compact_meter_results(meter_results)
  size_scores = { full: 100, half: 50, pada: 25 }

  meter_results.each_with_object({}) do |(meter_name, results), compacted|
    kept = []
    score = 0

    results.each do |result|
      result[:matches].each do |range|
        next unless check_non_overlapping?(kept, range)

        kept << range
        score += size_scores.fetch(result[:size], 0)
      end
    end

    compacted[meter_name] = [
      {
        pattern_type: results[0][:type],
        matches: kept.sort_by(&:begin),
        match_percent: score,
      },
    ]
  end
end
|
291
|
+
|
292
|
+
# Extends compacted exact-match results with fuzzy corrections of the parts
# of the weight string that were not matched.
#
# For each meter a quality score is derived from :match_percent relative to
# the assumed number of padas:
# * 100     - only unmatched syllables are marked for deletion ("d"); no
#             further meters are processed afterwards.
# * 60..75  - the widest unmatched gap is corrected against one pada pattern.
# * 30..50  - up to two gaps are corrected (a single half-verse correction
#             replaces the pair when the first gap is long enough).
# * other   - results of {#fuzzy_analysis} for the same meter are added and
#             the working weights are replaced by "x" marks.
#
# @param meter_results [Hash] output of {#compact_meter_results}
# @param guess_size [Integer] the number of padas assumed to be present
# @param weight_string [String] the weight string of the verse
# @return [Array<Hash>] candidate descriptions including :heuristic
def fuzzy_manager(meter_results, guess_size, weight_string)
  base_weights = weight_string.dup
  extended_result = []
  perfect_match_seen = false

  meter_results.each do |meter_name, results|
    break if perfect_match_seen

    summary = results[0]
    wc = base_weights.dup
    indexes = summary[:matches]
    quality = (summary[:match_percent] * 100) / (guess_size * 25)

    # to deal with problematic case when p=25 and guess=2, should be only one correction
    quality += 25 if quality == 50 && guess_size == 2

    case quality
    when 100
      wc = remove_extra_syllables(wc, indexes)
      perfect_match_seen = true
    when 60..75
      gap = get_unmatched_range(indexes, wc.length - 1)
      portion = wc.slice!(gap.begin, gap.end - gap.begin + 1)
      pattern = get_specific_pattern(meter_name, :pada, summary[:pattern_type], portion)
      correct = corrected_string(portion, pattern)
      wc.insert(gap.begin, correct)
      indexes = update_index_array(indexes, gap, correct.length - portion.length)
      wc = remove_extra_syllables(wc, indexes)
    when 30..50
      half_used = false
      2.times do
        next if half_used

        gap = get_unmatched_range(indexes, wc.length - 1)
        portion = wc.slice!(gap.begin, gap.end - gap.begin + 1)

        # A gap clearly longer than a quarter of the verse is treated as a
        # half verse, and then no second correction is attempted.
        pattern_size = :pada
        if (gap.end - gap.begin + 1) > ((metercount[meter_name][4] / 4) + 3)
          pattern_size = :half
          half_used = true
        end

        pattern = get_specific_pattern(meter_name, pattern_size, summary[:pattern_type], portion)
        correct = corrected_string(portion, pattern)
        wc.insert(gap.begin, correct)
        indexes = update_index_array(indexes, gap, correct.length - portion.length)
      end
      wc = remove_extra_syllables(wc, indexes)
    else
      fuzzy_analysis(base_weights, guess_size, summary[:match_percent]).each do |fuzzy|
        extended_result << fuzzy if fuzzy[:meter] == meter_name
      end
      wc = "x" * wc.length
    end

    status = get_pada_status(meter_name, wc, indexes, guess_size)
    pada_weights = get_weight_by_pada(status, wc)
    edit_count = wc.scan(/[a-z]/).length

    extended_result << {
      len_assumption: "#{guess_size}/4",
      meter: meter_name,
      type: summary[:pattern_type],
      match_indexes: status,
      percent_match: summary[:match_percent],
      edit_count: edit_count,
      correct_weights: pada_weights,
      heuristic: (2 * summary[:match_percent]) + (100 - (edit_count * 100 / weight_string.length)),
    }
  end

  extended_result
end
|
369
|
+
|
370
|
+
# Splits the matched index ranges of a corrected weight string into padas
# and labels each pada.
#
# A pada is "exact" when its range contains no lowercase edit marks and
# "fuzzy" otherwise; in the fuzzy case deletion marks ("d") do not count
# towards the pada's expected length. Padas beyond +guess_size+ are reported
# as "missing" with a nil range.
#
# @param meter [String] meter name, used to look up pada lengths in {#metercount}
# @param correct_weights [String] weight string with edit marks
# @param indexes [Array<Range>] matched index ranges, in order
# @param guess_size [Integer] the number of padas assumed to be present
# @return [Array<Hash>] hashes with :pada_number, :pada_range and :pada_status
def get_pada_status(meter, correct_weights, indexes, guess_size)
  pada_lengths = metercount[meter]
  weights = correct_weights.dup
  pending = indexes.dup

  status = []
  range = nil

  # TO DO : identify which padas are actually missing
  1.upto(guess_size) do |pada_number|
    break if range.nil? && pending.empty?

    range ||= pending.shift
    expected = pada_lengths[pada_number - 1]
    span = range.end - range.begin + 1

    if weights.slice(range.begin, span).scan(/[a-z]/).empty?
      pada_status = "exact"
      # nil means the pada consumes the whole remaining range.
      split_at = span == expected ? nil : range.begin + expected
    else
      pada_status = "fuzzy"
      cursor = range.begin
      consumed = 0
      while consumed < expected
        consumed += 1 unless weights[cursor] == "d"
        cursor += 1
      end
      split_at = cursor > range.end ? nil : cursor
    end

    if split_at.nil?
      pada_range = range
      range = nil
    else
      pada_range = (range.begin..(split_at - 1))
      range = (split_at..range.end)
    end

    status << { pada_number: pada_number, pada_range: pada_range, pada_status: pada_status }
  end

  ((guess_size + 1)..4).each do |pada_number|
    status << { pada_number: pada_number, pada_range: nil, pada_status: "missing" }
  end

  status
end
|
438
|
+
|
439
|
+
# Cuts a corrected weight string into one piece per pada.
#
# Each non-missing pada takes everything from the end of the previous pada
# up to and including the end of its own :pada_range; missing padas yield an
# empty string.
#
# @param status [Array<Hash>] pada descriptions with :pada_status and :pada_range
# @param corrected_weights [String] weight string with edit marks
# @return [Array<String>] the weights of each pada, in order
def get_weight_by_pada(status, corrected_weights)
  offset = 0
  status.map do |pada|
    next "" if pada[:pada_status] == "missing"

    last_index = pada[:pada_range].end
    piece = corrected_weights.slice(offset, last_index - offset + 1)
    offset = last_index + 1
    piece
  end
end
|
457
|
+
|
458
|
+
# Finds the meters whose full pattern is closest (by edit distance) to the
# weight string and describes the corrections needed for each.
#
# Only candidates sharing the smallest edit distance are reported, and only
# when that distance is at most 100. The fuzzy search is performed once and
# its results reused for both steps.
#
# @param weight_string [String] the weight string of the verse
# @param guess_size [Integer] the number of padas assumed to be present
# @param per_match [Numeric] exact-match percentage to record and to weigh
#   into the heuristic
# @return [Array<Hash>] candidate descriptions including :heuristic
def fuzzy_analysis(weight_string, guess_size, per_match)
  fuzzy_matches = meter_search_fuzzy(weight_string.dup, guess_size)

  fewest_edits = fuzzy_matches.reduce(100) do |fewest, match|
    match[:edit_distance] < fewest ? match[:edit_distance] : fewest
  end

  fuzzy_matches.each_with_object([]) do |match, best|
    next unless match[:edit_distance] == fewest_edits

    corrected = corrected_string(weight_string, match[:pattern])
    status = get_pada_status(match[:meter], corrected, [(0..(corrected.length - 1))], guess_size)

    best << {
      len_assumption: "#{guess_size}/4",
      meter: match[:meter],
      type: match[:type],
      match_indexes: status,
      percent_match: per_match,
      edit_count: match[:edit_distance],
      correct_weights: get_weight_by_pada(status, corrected),
      heuristic: (2 * per_match) + (100 - (match[:edit_distance] * 100 / weight_string.length)),
    }
  end
end
|
490
|
+
|
491
|
+
# Returns a concrete weight pattern of a meter to correct a portion against.
#
# For :patterns the first pada (or the first two padas joined, for any size
# other than :pada) from MetricalData.meters is returned as a new String.
# For :regexes the first regex registered for the meter at the given size is
# resolved against +weight_string+ via {#closest_pattern_to_regex}.
#
# @param meter_name [String] the name of the meter
# @param size [Symbol] :pada or :half
# @param type [Symbol] :patterns or :regexes
# @param weight_string [String] the portion of weights to be corrected
# @return [String, nil] the pattern; an empty String when no regex is
#   registered for the meter at that size (so callers leave the portion
#   unchanged); nil for any other +type+
def get_specific_pattern(meter_name, size, type, weight_string)
  case type
  when :patterns
    padas = MetricalData.meters[meter_name]
    size == :pada ? padas[0].dup : padas[0] + padas[1]
  when :regexes
    regexp, = MetricalData.all[type][size].find { |_regexp, meter| meter.keys.first == meter_name }
    # Without this guard the enumerated collection itself would be returned.
    return "" if regexp.nil?

    source = regexp.source.gsub(/[\^\$\(\)]/, "")
    closest_pattern_to_regex(weight_string, source)[:pattern]
  end
end
|
512
|
+
|
513
|
+
# Marks every position of a weight string that lies outside all matched
# ranges as a deletion ("d").
#
# @param weights [String] a weight string
# @param indexes [Array<Range>] inclusive index ranges to keep
# @return [String] a new string of the same length
def remove_extra_syllables(weights, indexes)
  marked = weights.each_char.with_index.map do |weight, position|
    covered = indexes.any? { |range| position >= range.begin && position <= range.end }
    covered ? weight : "d"
  end
  marked.join("")
end
|
528
|
+
|
529
|
+
# Finds the widest stretch of indexes not covered by the matched ranges.
#
# The stretch before the first range and the stretch after the last range
# are compared first (the trailing one wins ties); a stretch between ranges
# replaces the current choice only when it is strictly wider.
#
# @param indexes [Array<Range>] non-empty, ordered inclusive index ranges
# @param last [Integer] the last valid index of the weight string
# @return [Range] the chosen unmatched range (may be empty, e.g. 12..11)
def get_unmatched_range(indexes, last)
  leading_width = indexes[0].begin
  trailing_width = last - indexes[-1].end
  widest =
    if leading_width > trailing_width
      (0..(leading_width - 1))
    else
      ((indexes[-1].end + 1)..last)
    end

  previous_end = 0
  indexes.each do |matched|
    gap_width = matched.begin - previous_end - 1
    widest = ((previous_end + 1)..(matched.begin - 1)) if gap_width > (widest.end - widest.begin + 1)
    previous_end = matched.end
  end

  widest
end
|
544
|
+
|
545
|
+
# Adds a corrected range to a list of matched ranges and shifts the ranges
# after it to account for the change in length caused by the correction.
#
# The caller's array is left untouched; a new array is returned.
#
# @param indexes [Array<Range>] matched inclusive index ranges
# @param max [Range] the range that was corrected
# @param diff [Integer] corrected length minus original length of that range
# @return [Array<Range>] ranges sorted by start index: those starting before
#   +max+ unchanged, any starting at +max.begin+ replaced by +max+ resized by
#   +diff+, and those starting after it shifted by +diff+
def update_index_array(indexes, max, diff)
  (indexes + [max]).sort_by(&:begin).map do |range|
    if range.begin < max.begin
      range
    elsif range.begin == max.begin
      (max.begin..(max.end + diff))
    else
      ((range.begin + diff)..(range.end + diff))
    end
  end
end
|
564
|
+
|
565
|
+
# Aligns a weight string with a pattern by minimum edit distance and returns
# the weight string annotated with the edits needed to reach the pattern.
#
# Characters of the result:
# * an unchanged weight where weights and pattern agree,
# * "f" where a weight must be substituted,
# * "d" where a weight must be deleted,
# * "l" / "g" where a weight must be inserted ("l" when the pattern has "L"
#   at that position, "g" otherwise).
#
# When several edits are equally cheap, insertion is preferred over
# substitution, and substitution over deletion. Neither argument is modified.
#
# @param weights [String] the actual weight string
# @param pattern [String] the target pattern
# @return [String] the annotated string, or +weights+ itself when the
#   pattern is empty
def corrected_string(weights, pattern)
  return weights if pattern.empty?

  actual = weights.chars
  target = pattern.chars
  rows = actual.length
  cols = target.length

  # table[i][j] is the edit distance between the first i weights and the
  # first j pattern characters.
  table = Array.new(rows + 1) { Array.new(cols + 1, 0) }
  (0..rows).each { |i| table[i][0] = i }
  (0..cols).each { |j| table[0][j] = j }

  (1..rows).each do |i|
    (1..cols).each do |j|
      table[i][j] =
        if actual[i - 1] == target[j - 1]
          table[i - 1][j - 1]
        else
          [table[i - 1][j], table[i - 1][j - 1], table[i][j - 1]].min + 1
        end
    end
  end

  # Walk back from the bottom-right corner, collecting marks in reverse.
  marks = []
  i = rows
  j = cols
  while i > 0 || j > 0
    if i > 0 && j > 0 && actual[i - 1] == target[j - 1]
      marks << actual[i - 1]
      i -= 1
      j -= 1
    elsif i.zero?
      # Weights exhausted: everything left in the pattern is an insertion.
      marks << (target[j - 1] == "L" ? "l" : "g")
      j -= 1
    elsif j.zero?
      # Pattern exhausted: everything left in the weights is a deletion.
      marks << "d"
      i -= 1
    else
      cheapest = [table[i - 1][j], table[i - 1][j - 1], table[i][j - 1]].min
      if cheapest == table[i][j - 1]
        marks << (target[j - 1] == "L" ? "l" : "g")
        j -= 1
      elsif cheapest == table[i - 1][j - 1]
        marks << "f" # to mark substitution in string
        i -= 1
        j -= 1
      else
        marks << "d" # to mark deletion from string
        i -= 1
      end
    end
  end

  marks.reverse.join("")
end
|
626
|
+
|
627
|
+
# Returns (and memoizes) the pada lengths of every known meter.
#
# For meters in MetricalData.meters the value lists the length of each pada
# followed by their total. Meters known only through a full-verse regex are
# added when that regex has capture groups and no alternation: the value
# lists the length of each group's source followed by the length of the
# whole source with ^ $ ( ) removed.
#
# @return [Hash] meter name => Array of Integer lengths, sorted by key and
#   deep-frozen
def metercount
  @metercount ||= begin
    meter_data = MetricalData.meters.each_with_object({}) do |(meter_name, pada_arr), counts|
      lengths = pada_arr.map(&:length)
      counts[meter_name] = lengths << lengths.reduce(&:+)
    end

    MetricalData.regexes.full.each do |regexp, meter|
      meter_name = meter.keys.first
      next if meter_data.key?(meter_name)

      source = regexp.source
      next if source["|"] || source["("].nil?

      group_lengths = source.scan(/\(([^()]*)\)/).flatten.map(&:length)
      stripped = source.gsub(/[\^\$\(\)]/, "")
      meter_data[meter_name] = group_lengths << stripped.length
    end

    meter_data.sort.to_h.deep_freeze
  end
end
|
652
|
+
|
653
|
+
# Renders the syllables of each pada with markup describing the corrections
# encoded in the pada's corrected weight string.
#
# * "d" (deletion)     - the syllable is wrapped in square brackets.
# * "f" (substitution) - the syllable is wrapped in parentheses.
# * "g" / "l" (insertion) - "(g)" / "(l)" is emitted without consuming a
#   syllable; the first one of a run is preceded by " { ".
# * anything else - the syllable is emitted as is. After an inserted "g" the
#   next "L" weight, and after an inserted "l" the next "G" weight, is
#   preceded by " } ", which ends the run.
#
# Syllables are consumed in order across padas, and an open insertion run
# carries over from one pada to the next.
#
# @param _meter [Object] unused
# @param corrected_weights [Array<String>] corrected weights, one per pada
# @param syllables [Array<String>] the syllables of the verse
# @return [Array<String>] the annotated text of each pada
def fuzzy_correction(_meter, corrected_weights, syllables)
  cursor = 0     # index of the next unconsumed syllable
  open_run = 0   # 0: no insertion run open, 1: opened by "l", 2: opened by "g"

  corrected_weights.map do |pada_weights|
    pieces = []

    pada_weights.each_char do |mark|
      case mark
      when "d"
        pieces << ("[" + syllables[cursor] + "]")
        cursor += 1
      when "f"
        pieces << ("(" + syllables[cursor] + ")")
        cursor += 1
      when "g", "l"
        if open_run.zero?
          pieces << " { (#{mark})"
          open_run = mark == "g" ? 2 : 1
        else
          pieces << "(#{mark})"
        end
      else
        closes_run = (open_run == 2 && mark == "L") || (open_run == 1 && mark == "G")
        if closes_run
          pieces << (" } " + syllables[cursor])
          open_run = 0
        else
          pieces << syllables[cursor]
        end
        cursor += 1
      end
    end

    pieces.join("")
  end
end
|
716
|
+
|
717
|
+
# Searches all full-verse patterns and regexes for meters whose first
# +guess_size+ padas are close to the weight string.
#
# A candidate is kept when its length differs from the weight string by at
# most 20% of the pattern length and its edit distance is at most 15% of the
# pattern length. Regexes containing alternation are skipped.
#
# @param weight_string [String] the weight string of the verse
# @param guess_size [Integer] the number of padas assumed to be present
# @return [Array<Hash>] hashes with :meter, :type, :guess_size, :pattern
#   and :edit_distance
def meter_search_fuzzy(weight_string, guess_size)
  syllable_count = weight_string.length
  length_variance = 0.2
  edit_tolerance = 0.15
  matcher = Amatch::Levenshtein.new(weight_string)

  %i[patterns regexes].flat_map do |type|
    MetricalData.all[type][:full].each_with_object([]) do |(raw_pattern, meter), found|
      meter_name = meter.keys.first

      case raw_pattern
      when String
        # Take the first guess_size padas off the front of the full pattern.
        pada_lengths = metercount[meter_name]
        remaining = raw_pattern.dup
        pattern = (0...guess_size).map { |i| remaining.slice!(0, pada_lengths[i]) }.join("")

        next unless (pattern.length - syllable_count).abs <= length_variance * pattern.length

        edit_distance = matcher.match(pattern)
        next if edit_distance > edit_tolerance * pattern.length

        pattern_string = pattern
      when Regexp
        next if raw_pattern.source["|"]

        source = raw_pattern.source.gsub(/[\^\$\(\)]/, "")
        pattern = source.slice(0...(guess_size * source.length / 4))
        next if (pattern.length - syllable_count).abs > length_variance * pattern.length

        closest = closest_pattern_to_regex(weight_string, pattern)
        pattern_string = closest[:pattern]
        edit_distance = closest[:edit_distance]
        next if edit_distance > edit_tolerance * pattern.length
      end

      found << {
        meter: meter_name,
        type: type,
        guess_size: guess_size,
        pattern: pattern_string,
        edit_distance: edit_distance,
      }
    end
  end
end
|
767
|
+
|
768
|
+
# Resolves a regex-style pattern (weights plus "." wildcards) into the
# concrete pattern closest to a weight string.
#
# The pattern is first compared with every "." read as "L"; the alignment
# from {#corrected_string} is then replayed so that a substitution landing on
# a "." takes the actual weight instead and is removed from the edit
# distance.
#
# @param weight_string [String] the weight string to compare against
# @param pattern [String] pattern source consisting of weights and "."
# @return [Hash] :pattern (the resolved String) and :edit_distance
def closest_pattern_to_regex(weight_string, pattern)
  concrete = pattern.tr(".", "L")
  edit_distance = Amatch::Levenshtein.new(weight_string).match(concrete)
  alignment = corrected_string(weight_string, concrete)

  pattern = pattern.lstrip
  pattern_pos = 0 # position in pattern
  weight_pos = 0  # position in weight_string
  resolved = []

  alignment.each_char do |mark|
    case mark
    when "L", "G"
      resolved << mark
      pattern_pos += 1
      weight_pos += 1
    when "l"
      resolved << "L"
      pattern_pos += 1
    when "g"
      resolved << "G"
      pattern_pos += 1
    when "d"
      weight_pos += 1
    else
      if mark == "f" && pattern[pattern_pos] == "."
        # A wildcard accepts whatever weight is actually there.
        edit_distance -= 1
        resolved << weight_string[weight_pos]
      else
        resolved << pattern[pattern_pos]
      end
      pattern_pos += 1
      weight_pos += 1
    end
  end

  { pattern: resolved.join(""), edit_distance: edit_distance }
end
|
815
|
+
end
|
816
|
+
end
|