skrutable 2.5.3__tar.gz → 2.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {skrutable-2.5.3 → skrutable-2.6.0}/PKG-INFO +1 -1
- skrutable-2.6.0/src/skrutable/__init__.py +1 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/config.json +7 -6
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/meter_identification.py +463 -72
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/meter_patterns.py +20 -8
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/scansion.py +13 -14
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable.egg-info/PKG-INFO +1 -1
- skrutable-2.5.3/src/skrutable/__init__.py +0 -1
- {skrutable-2.5.3 → skrutable-2.6.0}/LICENSE.md +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/README.md +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/setup.cfg +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/setup.py +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/config.py +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/generate_scheme_vectors.py +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/impossible_bigrams.json +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/manual.md +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/phonemes.py +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/run_examples.py +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/scheme_detection.py +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/scheme_maps.py +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/scheme_vectors.json +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/scheme_vectors_mbh.py +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/splitting.py +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/transliteration.py +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/utils.py +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable/virAma_avoidance.py +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable.egg-info/SOURCES.txt +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable.egg-info/dependency_links.txt +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable.egg-info/requires.txt +0 -0
- {skrutable-2.5.3 → skrutable-2.6.0}/src/skrutable.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "2.6.0"
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
"avoid_virama_indic_scripts" : true,
|
|
5
5
|
"avoid_virama_non_indic_scripts" : false,
|
|
6
6
|
"scansion_syllable_separator" : " ",
|
|
7
|
-
"additional_pAda_separators" : ["\t", ";", ",", "
|
|
7
|
+
"additional_pAda_separators" : ["\t", ";", ",", "/", "|", "।"],
|
|
8
8
|
"default_resplit_option" : "resplit_lite",
|
|
9
9
|
"default_resplit_keep_midpoint" : false,
|
|
10
10
|
"disable_non_trizwuB_upajAti" : true,
|
|
@@ -21,16 +21,17 @@
|
|
|
21
21
|
"samavṛtta, imperfect (3)" : 6,
|
|
22
22
|
"samavṛtta, imperfect (2)" : 5,
|
|
23
23
|
"samavṛtta, quarter, perfect" : 8,
|
|
24
|
-
"ardhasamavṛtta, perfect" :
|
|
25
|
-
"ardhasamavṛtta,
|
|
24
|
+
"ardhasamavṛtta, perfect" : 9,
|
|
25
|
+
"ardhasamavṛtta, imperfect" : 7,
|
|
26
26
|
"viṣamavṛtta, perfect" : 9,
|
|
27
|
-
"
|
|
27
|
+
"viṣamavṛtta, imperfect" : 7,
|
|
28
|
+
"upajāti, perfect" : 8,
|
|
28
29
|
"upajāti, imperfect" : 6,
|
|
29
|
-
"upajāti, non-triṣṭubh, perfect" :
|
|
30
|
+
"upajāti, non-triṣṭubh, perfect" : 5,
|
|
30
31
|
"upajāti, triṣṭubh-jagatī-saṃkara, perfect" : 4,
|
|
31
32
|
"upajāti, non-triṣṭubh, imperfect" : 3,
|
|
32
33
|
"jāti, perfect" : 9,
|
|
33
|
-
"jāti, imperfect" :
|
|
34
|
+
"jāti, imperfect" : 6,
|
|
34
35
|
"jāti, likely" : 3,
|
|
35
36
|
"none found" : 1
|
|
36
37
|
},
|
|
@@ -3,6 +3,7 @@ from skrutable import meter_patterns
|
|
|
3
3
|
from skrutable.config import load_config_dict_from_json_file
|
|
4
4
|
from skrutable.utils import _DEBUG_TIMING, _section_totals, timed
|
|
5
5
|
import re
|
|
6
|
+
import time as _time
|
|
6
7
|
from copy import copy
|
|
7
8
|
from dataclasses import dataclass
|
|
8
9
|
from typing import Optional
|
|
@@ -59,10 +60,12 @@ def flush_profiling_report(write_file=False):
|
|
|
59
60
|
return
|
|
60
61
|
import sys, os
|
|
61
62
|
scan_keys = ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana')
|
|
62
|
-
type_keys = ('anuzwuB', 'samavftta', 'upajAti', '
|
|
63
|
+
type_keys = ('anuzwuB', 'samavftta_etc', 'samavftta', 'upajAti', 'ardhasamavftta_perfect', 'vizamavftta', 'jAti', 'lev_samavftta', 'lev_ardha', 'lev_vizama')
|
|
63
64
|
type_abbrev = {
|
|
64
|
-
'anuzwuB': 'anuṣṭ', 'samavftta': 'samav', 'upajAti': 'upajāti',
|
|
65
|
-
'
|
|
65
|
+
'anuzwuB': 'anuṣṭ', 'samavftta_etc': 'vftta↑', 'samavftta': 'samav', 'upajAti': 'upajāti',
|
|
66
|
+
'ardhasamavftta_perfect': 'ardha✓', 'vizamavftta': 'vizama',
|
|
67
|
+
'jAti': 'jāti',
|
|
68
|
+
'lev_samavftta': 'lev✗sama', 'lev_ardha': 'lev✗ardh', 'lev_vizama': 'lev✗visa',
|
|
66
69
|
}
|
|
67
70
|
scan_abbrev = {'scan_clean': 'clean', 'scan_translit': 'transl', 'scan_syllabify': 'syl', 'scan_weights': 'wts', 'scan_morae_gana': 'mor+g'}
|
|
68
71
|
cat_order = ['anuṣṭubh', 'samavṛtta', 'upajāti', 'ardhasamavṛtta', 'viṣamavṛtta', 'jāti', 'na kiṃcid adhyavasitam']
|
|
@@ -151,6 +154,68 @@ class Diagnostic:
|
|
|
151
154
|
return self.imperfect_label_sanskrit is not None and not self.length_error()
|
|
152
155
|
|
|
153
156
|
|
|
157
|
+
# Edit-distance limits for imperfect ardhasamavṛtta / viṣamavṛtta matching.
# Each is used both as the per-pāda length-difference filter when candidates
# are stashed and as the ceiling on the summed per-pāda distance when the
# stash is evaluated (see evaluate_ardhasamavftta and is_vizamavftta).
ARDHASAMAVFTTA_EDIT_DISTANCE_THRESHOLD = 2
VIZAMAVFTTA_EDIT_DISTANCE_THRESHOLD = 2

# Precompute vizamavṛtta canonical weight strings once (avoid per-call gaṇa→weight conversion)
# Inverse of meter_patterns.gaRas_by_weights (value → key).
# NOTE(review): presumably weight string → gaṇa abbreviation, so this maps
# abbreviation → weights; confirm against meter_patterns. Duplicate values in
# the source dict would collapse to whichever key is iterated last.
_gaRa_to_weights_map = {v: k for k, v in meter_patterns.gaRas_by_weights.items()}
|
|
162
|
+
def _gaRa_str_to_weights(s):
    """Expand a gaṇa-abbreviation string into a weight string.

    Every character of ``s`` is looked up in ``_gaRa_to_weights_map``;
    characters without an entry are passed through unchanged. The pieces
    are concatenated in order and returned as a single string.
    """
    expanded = []
    for symbol in s:
        expanded.append(_gaRa_to_weights_map.get(symbol, symbol))
    return ''.join(expanded)
|
|
164
|
+
# One (gaRas, weights, label) triple per entry of
# meter_patterns.vizamavftta_by_4_tuple, built once at import time:
#   gaRas   - the dict key, iterated element by element
#             (NOTE(review): presumably a 4-tuple of per-pāda gaṇa strings — confirm)
#   weights - list with each element of gaRas run through _gaRa_str_to_weights
#   label   - the dict value (meter label)
# Kept as a comprehension so no loop variables leak into the module namespace.
_vizamavftta_precomputed = [
    (gaRas, [_gaRa_str_to_weights(g) for g in gaRas], label)
    for gaRas, label in meter_patterns.vizamavftta_by_4_tuple.items()
]
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _levenshtein_align(observed, canonical, threshold=None):
    """Compare an observed weight string with a canonical one.

    The final (anceps) position is dropped from both strings before the
    comparison, so it contributes neither to the distance nor to the
    reported problem positions.

    Args:
        observed: weight string of the scanned pāda.
        canonical: weight string of the expected pattern.
        threshold: largest distance still of interest. When the two strings
            differ in length, the DP stops as soon as the minimum of a row
            exceeds this value. ``None`` (the default) resolves to
            ARDHASAMAVFTTA_EDIT_DISTANCE_THRESHOLD, the value this function
            previously hard-coded; callers that work against a different
            limit (e.g. VIZAMAVFTTA_EDIT_DISTANCE_THRESHOLD) can pass it.
            The threshold is not consulted when both strings have the same
            length.

    Returns:
        A ``(distance, problem_indices)`` tuple.

        - Same length: distance is the number of mismatches and
          problem_indices lists the mismatched 0-based positions.
        - Different length, alignment found: distance is the Levenshtein
          distance and problem_indices holds exactly one entry, taken from
          the first insertion/deletion met while tracing back from the end
          of both strings:
            * extra observed syllable -> ``[index]`` (0-based, observed);
            * missing canonical syllable -> ``[-(gap_pos + 1)]`` where
              gap_pos is the 0-based canonical position. The offset keeps
              gap position 0 distinguishable from index 0; the consumer
              recovers the column as ``abs(v) - 1`` for any negative v.
        - Early exit: ``(row_minimum, [])``. The distance returned is only a
          lower bound, but it is guaranteed to be above ``threshold``.
    """
    obs = observed[:-1]  # exclude final anceps
    can = canonical[:-1]

    # Equal length: a position-by-position comparison is all that is needed.
    if len(obs) == len(can):
        bad = [i for i, (o, c) in enumerate(zip(obs, can)) if o != c]
        return len(bad), bad

    if threshold is None:
        # Looked up at call time so the historical default is preserved.
        threshold = ARDHASAMAVFTTA_EDIT_DISTANCE_THRESHOLD

    # Standard Levenshtein DP; the full table is kept for the traceback.
    m, n = len(obs), len(can)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j

    for i in range(1, m + 1):
        row_min = n  # minimum over columns 1..n of this row
        for j in range(1, n + 1):
            cost = 0 if obs[i - 1] == can[j - 1] else 1
            cell = min(dp[i - 1][j] + 1, dp[i][j - 1] + 1, dp[i - 1][j - 1] + cost)
            dp[i][j] = cell
            if cell < row_min:
                row_min = cell
        if row_min > threshold:
            # No alignment can come back under the limit; give up early.
            return row_min, []

    dist = dp[m][n]

    # Trace back from the end, following diagonal (match/substitution) steps
    # until the first insertion or deletion, which is the reported site.
    i, j = m, n
    while i > 0 or j > 0:
        if i > 0 and j > 0:
            cost = 0 if obs[i - 1] == can[j - 1] else 1
            if dp[i][j] == dp[i - 1][j - 1] + cost:
                i -= 1
                j -= 1
                continue
        if i > 0 and dp[i][j] == dp[i - 1][j] + 1:
            # deletion from observed: extra syllable at observed position i-1
            return dist, [i - 1]
        # insertion into observed: canonical syllable at position j-1 is
        # missing; encoded as -(gap_pos + 1) == -j
        return dist, [-j]

    return dist, []
|
|
217
|
+
|
|
218
|
+
|
|
154
219
|
def _decompose_into_mAtragaNas(weights_str, gana_6_morae, gana_8_morae):
|
|
155
220
|
"""
|
|
156
221
|
Decomposes an ardha (half-verse) weight string into mātrā-gaṇas.
|
|
@@ -308,6 +373,9 @@ class VerseTester(object):
|
|
|
308
373
|
self.resplit_keep_midpoint = default_resplit_keep_midpoint # bool
|
|
309
374
|
self.identification_attempt_count = 0
|
|
310
375
|
self._anuzwuB_half_cache = {} # cleared per wiggle_identify run
|
|
376
|
+
self._ardha_stash = [] # accumulated across wiggle candidates
|
|
377
|
+
self._vizama_stash = [] # accumulated across wiggle candidates
|
|
378
|
+
self._samavftta_has_length_error = False # set during evaluate_samavftta perfect_only pass
|
|
311
379
|
|
|
312
380
|
def combine_results(self, Vrs, new_label, new_score, new_is_perfect=False):
|
|
313
381
|
old_label = Vrs.meter_label or ''
|
|
@@ -563,7 +631,7 @@ class VerseTester(object):
|
|
|
563
631
|
self.pAdasamatva_count = max_match
|
|
564
632
|
|
|
565
633
|
|
|
566
|
-
def evaluate_samavftta(self, Vrs):
|
|
634
|
+
def evaluate_samavftta(self, Vrs, perfect_only=False):
|
|
567
635
|
# sufficient pAdasamatva already assured, now just evaluate
|
|
568
636
|
|
|
569
637
|
wbp = Vrs.syllable_weights.split('\n') # weights by pāda
|
|
@@ -613,20 +681,34 @@ class VerseTester(object):
|
|
|
613
681
|
if "ajñātasamavṛtta" in meter_label:
|
|
614
682
|
score -= 2
|
|
615
683
|
|
|
616
|
-
# Build per-pāda diagnostic: length errors, then
|
|
684
|
+
# Build per-pāda diagnostic: length errors (Levenshtein), then pattern errors.
|
|
685
|
+
# In perfect_only mode, skip Levenshtein — just register the result and return.
|
|
617
686
|
problem_syllables = {}
|
|
618
687
|
per_pada_sanskrit = {}
|
|
619
688
|
per_pada_english = {}
|
|
620
689
|
canonical = w_to_id # includes final anceps
|
|
690
|
+
has_length_error = any(len(w) != len(canonical) for w in wbp[:4])
|
|
691
|
+
|
|
692
|
+
if perfect_only and has_length_error:
|
|
693
|
+
# Defer length-error annotation to the imperfect pass; register result now.
|
|
694
|
+
self._samavftta_has_length_error = True
|
|
695
|
+
old_score = Vrs.identification_score
|
|
696
|
+
self.combine_results(Vrs, new_label=meter_label, new_score=score)
|
|
697
|
+
if score >= old_score:
|
|
698
|
+
Vrs.diagnostic = Diagnostic(perfect_id_label=meter_label)
|
|
699
|
+
return
|
|
700
|
+
|
|
621
701
|
for pada_num, w in enumerate(wbp[:4], start=1):
|
|
622
702
|
if w == canonical:
|
|
623
703
|
pass # no entry → perfect for this pada
|
|
624
704
|
elif len(w) > len(canonical):
|
|
625
|
-
|
|
705
|
+
_, prob_indices = _levenshtein_align(w, canonical)
|
|
706
|
+
problem_syllables[pada_num] = prob_indices
|
|
626
707
|
per_pada_sanskrit[pada_num] = 'adhikākṣarā'
|
|
627
708
|
per_pada_english[pada_num] = 'hypermetric'
|
|
628
709
|
elif len(w) < len(canonical):
|
|
629
|
-
|
|
710
|
+
_, prob_indices = _levenshtein_align(w, canonical)
|
|
711
|
+
problem_syllables[pada_num] = prob_indices
|
|
630
712
|
per_pada_sanskrit[pada_num] = 'ūnākṣarā'
|
|
631
713
|
per_pada_english[pada_num] = 'hypometric'
|
|
632
714
|
else:
|
|
@@ -665,48 +747,119 @@ class VerseTester(object):
|
|
|
665
747
|
|
|
666
748
|
# score arbitration: may tie with pre-existing result (e.g., upajāti)
|
|
667
749
|
old_score = Vrs.identification_score
|
|
668
|
-
self.combine_results(Vrs, new_label=meter_label, new_score=score, new_is_perfect=imperfect_note is None)
|
|
750
|
+
self.combine_results(Vrs, new_label=meter_label, new_score=score, new_is_perfect=imperfect_note is None and not has_any_error)
|
|
669
751
|
if score >= old_score:
|
|
670
752
|
Vrs.diagnostic = diagnostic
|
|
671
753
|
|
|
672
754
|
|
|
755
|
+
def evaluate_ardhasamavftta(self, Vrs, perfect_only=False):
|
|
756
|
+
# bail early if even a perfect result can't beat what's already recorded
|
|
757
|
+
if meter_scores["ardhasamavṛtta, perfect"] <= Vrs.identification_score:
|
|
758
|
+
return
|
|
673
759
|
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
760
|
+
wbp = Vrs.syllable_weights.split('\n') # weights by pāda
|
|
761
|
+
tsyl = Vrs.text_syllabified
|
|
762
|
+
gaRa = Vrs.gaRa_abbreviations
|
|
763
|
+
morae = Vrs.morae_per_line
|
|
764
|
+
|
|
765
|
+
if perfect_only:
|
|
766
|
+
# Search all patterns; stash imperfect candidates (no Levenshtein yet), commit perfect immediately.
|
|
767
|
+
for (odd_canonical, even_canonical), meter_label in \
|
|
768
|
+
meter_patterns.ardhasamavftta_by_odd_even_weights.items():
|
|
769
|
+
# length pre-filter
|
|
770
|
+
if any(
|
|
771
|
+
len(w) != (len(odd_canonical) if pada_num in (1, 3) else len(even_canonical))
|
|
772
|
+
for pada_num, w in enumerate(wbp[:4], start=1)
|
|
773
|
+
):
|
|
774
|
+
# stash if within threshold (no Levenshtein — just length check)
|
|
775
|
+
if all(
|
|
776
|
+
abs(len(w) - (len(odd_canonical) if pada_num in (1, 3) else len(even_canonical)))
|
|
777
|
+
<= ARDHASAMAVFTTA_EDIT_DISTANCE_THRESHOLD
|
|
778
|
+
for pada_num, w in enumerate(wbp[:4], start=1)
|
|
779
|
+
):
|
|
780
|
+
self._ardha_stash.append((wbp, meter_label, odd_canonical, even_canonical, tsyl, gaRa, morae))
|
|
781
|
+
continue
|
|
678
782
|
|
|
679
|
-
|
|
783
|
+
# exact length: direct string comparison for perfect match (no Levenshtein needed)
|
|
784
|
+
if all(
|
|
785
|
+
w == (odd_canonical if pada_num in (1, 3) else even_canonical)
|
|
786
|
+
for pada_num, w in enumerate(wbp[:4], start=1)
|
|
787
|
+
):
|
|
788
|
+
score = meter_scores["ardhasamavṛtta, perfect"]
|
|
789
|
+
old_score = Vrs.identification_score
|
|
790
|
+
self.combine_results(Vrs, new_label=meter_label, new_score=score, new_is_perfect=True)
|
|
791
|
+
if score >= old_score:
|
|
792
|
+
Vrs.diagnostic = Diagnostic(perfect_id_label=meter_label)
|
|
793
|
+
self._ardha_stash = [] # perfect found; no need for imperfect pass
|
|
794
|
+
return
|
|
795
|
+
# same length but not perfect — stash without distance computation
|
|
796
|
+
self._ardha_stash.append((wbp, meter_label, odd_canonical, even_canonical, tsyl, gaRa, morae))
|
|
797
|
+
return
|
|
680
798
|
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
799
|
+
# Imperfect pass: consume the stash built during perfect_only pass.
|
|
800
|
+
# If no stash (e.g. called directly without a prior perfect_only pass), build it now.
|
|
801
|
+
if not self._ardha_stash:
|
|
802
|
+
self.evaluate_ardhasamavftta(Vrs, perfect_only=True)
|
|
803
|
+
if not self._ardha_stash:
|
|
804
|
+
return
|
|
684
805
|
|
|
685
|
-
#
|
|
686
|
-
|
|
687
|
-
|
|
806
|
+
# Run full Levenshtein on every stash entry to find minimum total distance.
|
|
807
|
+
best_total_dist = None
|
|
808
|
+
best_entry = None
|
|
809
|
+
for _stash_wbp, _label, _odd_can, _even_can, _stash_tsyl, _stash_gaRa, _stash_morae in self._ardha_stash:
|
|
810
|
+
total_dist = sum(
|
|
811
|
+
_levenshtein_align(w, _odd_can if pada_num in (1, 3) else _even_can)[0]
|
|
812
|
+
for pada_num, w in enumerate(_stash_wbp[:4], start=1)
|
|
813
|
+
)
|
|
814
|
+
if total_dist <= ARDHASAMAVFTTA_EDIT_DISTANCE_THRESHOLD:
|
|
815
|
+
if best_total_dist is None or total_dist < best_total_dist:
|
|
816
|
+
best_total_dist = total_dist
|
|
817
|
+
best_entry = (_stash_wbp, _label, _odd_can, _even_can, _stash_tsyl, _stash_gaRa, _stash_morae)
|
|
688
818
|
|
|
689
|
-
|
|
690
|
-
|
|
819
|
+
if best_entry is None:
|
|
820
|
+
return
|
|
691
821
|
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
822
|
+
best_stash_wbp, best_label, best_odd_canonical, best_even_canonical, *_ = best_entry
|
|
823
|
+
score = meter_scores["ardhasamavṛtta, imperfect"] - (best_total_dist - 1)
|
|
824
|
+
if score <= 0:
|
|
825
|
+
return
|
|
696
826
|
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
827
|
+
problem_syllables = {}
|
|
828
|
+
per_pada_sanskrit = {}
|
|
829
|
+
per_pada_english = {}
|
|
830
|
+
for pada_num, w in enumerate(best_stash_wbp[:4], start=1):
|
|
831
|
+
canonical = best_odd_canonical if pada_num in (1, 3) else best_even_canonical
|
|
832
|
+
dist, prob_indices = _levenshtein_align(w, canonical)
|
|
833
|
+
if dist == 0:
|
|
834
|
+
continue
|
|
835
|
+
problem_syllables[pada_num] = prob_indices
|
|
836
|
+
meter_name = best_label.split(' = ')[0]
|
|
837
|
+
if len(w) > len(canonical):
|
|
838
|
+
per_pada_sanskrit[pada_num] = 'adhikākṣarā'
|
|
839
|
+
per_pada_english[pada_num] = 'hypermetric'
|
|
840
|
+
elif len(w) < len(canonical):
|
|
841
|
+
per_pada_sanskrit[pada_num] = 'ūnākṣarā'
|
|
842
|
+
per_pada_english[pada_num] = 'hypometric'
|
|
843
|
+
else:
|
|
844
|
+
per_pada_sanskrit[pada_num] = 'vikṛtavṛtta'
|
|
845
|
+
per_pada_english[pada_num] = f'does not match expected gaṇa pattern for {meter_name}'
|
|
701
846
|
|
|
847
|
+
sa_vals = list(per_pada_sanskrit.items())
|
|
848
|
+
if len(sa_vals) == 1:
|
|
849
|
+
suffix = f"asamīcīnā, pāda {sa_vals[0][0]}: {sa_vals[0][1]}"
|
|
702
850
|
else:
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
Vrs.identification_score = meter_scores["ardhasamavṛtta, perfect, unknown"]
|
|
706
|
-
Vrs.is_perfect = True # "perfect, unknown" means pattern unknown, not imperfect
|
|
851
|
+
suffix = 'asamīcīnā, ' + '; '.join(f"pāda {p}: {v}" for p, v in sa_vals)
|
|
852
|
+
imperfect_label = best_label + f" ({suffix})"
|
|
707
853
|
|
|
708
|
-
|
|
709
|
-
Vrs
|
|
854
|
+
old_score = Vrs.identification_score
|
|
855
|
+
self.combine_results(Vrs, new_label=imperfect_label, new_score=score)
|
|
856
|
+
if score >= old_score:
|
|
857
|
+
Vrs.diagnostic = Diagnostic(
|
|
858
|
+
perfect_id_label=imperfect_label,
|
|
859
|
+
imperfect_label_sanskrit=per_pada_sanskrit or None,
|
|
860
|
+
imperfect_label_english=per_pada_english or None,
|
|
861
|
+
problem_syllables=problem_syllables or None,
|
|
862
|
+
)
|
|
710
863
|
|
|
711
864
|
|
|
712
865
|
def evaluate_upajAti(self, Vrs):
|
|
@@ -804,6 +957,12 @@ class VerseTester(object):
|
|
|
804
957
|
else:
|
|
805
958
|
score = meter_scores["none found"]
|
|
806
959
|
|
|
960
|
+
# Extra penalties for especially weak upajāti results.
|
|
961
|
+
if len(wbp_lens) == 2:
|
|
962
|
+
score -= 1 # two pādas excluded instead of one
|
|
963
|
+
if all(lbl.startswith('ajñātam') for lbl in meter_labels):
|
|
964
|
+
score -= 1
|
|
965
|
+
|
|
807
966
|
imperfect_note = None
|
|
808
967
|
overall_meter_label = "upajāti %s: %s" % (
|
|
809
968
|
family,
|
|
@@ -858,31 +1017,108 @@ class VerseTester(object):
|
|
|
858
1017
|
|
|
859
1018
|
# score arbitration: may tie with pre-existing result (e.g., samavṛtta)
|
|
860
1019
|
old_score = Vrs.identification_score
|
|
861
|
-
|
|
862
|
-
meter_scores["upajāti, triṣṭubh-jagatī-saṃkara, perfect"],
|
|
863
|
-
meter_scores["upajāti, non-triṣṭubh, perfect"])
|
|
864
|
-
and 'ajñātam' not in overall_meter_label)
|
|
865
|
-
self.combine_results(Vrs, overall_meter_label, score, new_is_perfect=is_perf)
|
|
1020
|
+
self.combine_results(Vrs, overall_meter_label, score, new_is_perfect=imperfect_note is None and not per_pada_english)
|
|
866
1021
|
if score >= old_score:
|
|
867
1022
|
Vrs.diagnostic = diagnostic
|
|
868
1023
|
|
|
869
1024
|
|
|
870
|
-
def is_vizamavftta(self, Vrs):
|
|
1025
|
+
def is_vizamavftta(self, Vrs, perfect_only=False):
|
|
1026
|
+
# bail early if even a perfect result can't beat what's already recorded
|
|
1027
|
+
if meter_scores["viṣamavṛtta, perfect"] <= Vrs.identification_score:
|
|
1028
|
+
return False
|
|
871
1029
|
|
|
1030
|
+
wbp = Vrs.syllable_weights.split('\n')
|
|
1031
|
+
if len(wbp) < 4: return False
|
|
872
1032
|
gs_to_id = Vrs.gaRa_abbreviations.split('\n')
|
|
873
1033
|
if len(gs_to_id) < 4: return False
|
|
1034
|
+
tsyl = Vrs.text_syllabified
|
|
1035
|
+
gaRa = Vrs.gaRa_abbreviations
|
|
1036
|
+
morae = Vrs.morae_per_line
|
|
1037
|
+
|
|
1038
|
+
if perfect_only:
|
|
1039
|
+
for canonicals, canonical_weights, meter_label in _vizamavftta_precomputed:
|
|
1040
|
+
|
|
1041
|
+
# Perfect match via gaṇa abbreviations
|
|
1042
|
+
if all(gs_to_id[i] == canonicals[i] for i in range(4)):
|
|
1043
|
+
Vrs.identification_score = meter_scores["viṣamavṛtta, perfect"]
|
|
1044
|
+
Vrs.meter_label = meter_label
|
|
1045
|
+
Vrs.diagnostic = Diagnostic(perfect_id_label=meter_label)
|
|
1046
|
+
self._vizama_stash = []
|
|
1047
|
+
return True
|
|
1048
|
+
|
|
1049
|
+
# Not perfect — stash if weight lengths are within threshold
|
|
1050
|
+
if all(
|
|
1051
|
+
abs(len(wbp[i]) - len(canonical_weights[i])) <= VIZAMAVFTTA_EDIT_DISTANCE_THRESHOLD
|
|
1052
|
+
for i in range(4)
|
|
1053
|
+
):
|
|
1054
|
+
self._vizama_stash.append((wbp, meter_label, canonical_weights, tsyl, gaRa, morae))
|
|
1055
|
+
return False
|
|
874
1056
|
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
Vrs.diagnostic = Diagnostic(perfect_id_label=Vrs.meter_label)
|
|
881
|
-
return True
|
|
1057
|
+
# Imperfect pass: consume the stash.
|
|
1058
|
+
if not self._vizama_stash:
|
|
1059
|
+
self.is_vizamavftta(Vrs, perfect_only=True)
|
|
1060
|
+
if not self._vizama_stash:
|
|
1061
|
+
return False
|
|
882
1062
|
|
|
883
|
-
|
|
1063
|
+
best_total_dist = None
|
|
1064
|
+
best_entry = None
|
|
1065
|
+
for _wbp, _label, _canonical_weights, _tsyl, _gaRa, _morae in self._vizama_stash:
|
|
1066
|
+
total_dist = sum(
|
|
1067
|
+
_levenshtein_align(_wbp[i], _canonical_weights[i])[0]
|
|
1068
|
+
for i in range(4)
|
|
1069
|
+
)
|
|
1070
|
+
if total_dist <= VIZAMAVFTTA_EDIT_DISTANCE_THRESHOLD:
|
|
1071
|
+
if best_total_dist is None or total_dist < best_total_dist:
|
|
1072
|
+
best_total_dist = total_dist
|
|
1073
|
+
best_entry = (_wbp, _label, _canonical_weights, _tsyl, _gaRa, _morae)
|
|
1074
|
+
|
|
1075
|
+
if best_entry is None:
|
|
1076
|
+
return False
|
|
1077
|
+
|
|
1078
|
+
best_wbp, best_label, best_canonical_weights, *_ = best_entry
|
|
1079
|
+
score = meter_scores["viṣamavṛtta, imperfect"] - (best_total_dist - 1)
|
|
1080
|
+
if score <= 0:
|
|
884
1081
|
return False
|
|
885
1082
|
|
|
1083
|
+
problem_syllables = {}
|
|
1084
|
+
per_pada_sanskrit = {}
|
|
1085
|
+
per_pada_english = {}
|
|
1086
|
+
for i, w in enumerate(best_wbp[:4]):
|
|
1087
|
+
canonical = best_canonical_weights[i]
|
|
1088
|
+
dist, prob_indices = _levenshtein_align(w, canonical)
|
|
1089
|
+
if dist == 0:
|
|
1090
|
+
continue
|
|
1091
|
+
pada_num = i + 1
|
|
1092
|
+
problem_syllables[pada_num] = prob_indices
|
|
1093
|
+
meter_name = best_label.split(' = ')[0]
|
|
1094
|
+
if len(w) > len(canonical):
|
|
1095
|
+
per_pada_sanskrit[pada_num] = 'adhikākṣarā'
|
|
1096
|
+
per_pada_english[pada_num] = 'hypermetric'
|
|
1097
|
+
elif len(w) < len(canonical):
|
|
1098
|
+
per_pada_sanskrit[pada_num] = 'ūnākṣarā'
|
|
1099
|
+
per_pada_english[pada_num] = 'hypometric'
|
|
1100
|
+
else:
|
|
1101
|
+
per_pada_sanskrit[pada_num] = 'vikṛtavṛtta'
|
|
1102
|
+
per_pada_english[pada_num] = f'does not match expected gaṇa pattern for {meter_name}'
|
|
1103
|
+
|
|
1104
|
+
sa_vals = list(per_pada_sanskrit.items())
|
|
1105
|
+
if len(sa_vals) == 1:
|
|
1106
|
+
suffix = f"asamīcīnā, pāda {sa_vals[0][0]}: {sa_vals[0][1]}"
|
|
1107
|
+
else:
|
|
1108
|
+
suffix = 'asamīcīnā, ' + '; '.join(f"pāda {p}: {v}" for p, v in sa_vals)
|
|
1109
|
+
imperfect_label = best_label + f" ({suffix})"
|
|
1110
|
+
|
|
1111
|
+
old_score = Vrs.identification_score
|
|
1112
|
+
self.combine_results(Vrs, new_label=imperfect_label, new_score=score)
|
|
1113
|
+
if score >= old_score:
|
|
1114
|
+
Vrs.diagnostic = Diagnostic(
|
|
1115
|
+
perfect_id_label=imperfect_label,
|
|
1116
|
+
imperfect_label_sanskrit=per_pada_sanskrit or None,
|
|
1117
|
+
imperfect_label_english=per_pada_english or None,
|
|
1118
|
+
problem_syllables=problem_syllables or None,
|
|
1119
|
+
)
|
|
1120
|
+
return True
|
|
1121
|
+
|
|
886
1122
|
def test_as_samavftta_etc(self, Vrs):
|
|
887
1123
|
|
|
888
1124
|
wbp = Vrs.syllable_weights.split('\n') # weights by pāda
|
|
@@ -907,23 +1143,14 @@ class VerseTester(object):
|
|
|
907
1143
|
timed('samavftta')(self.evaluate_samavftta)(Vrs)
|
|
908
1144
|
return 1 # max score already reached
|
|
909
1145
|
|
|
910
|
-
# test perfect ardhasamavftta
|
|
911
|
-
if ( self.pAdasamatva_count == 2
|
|
912
|
-
and wbp[0][:-1] == wbp[2][:-1]
|
|
913
|
-
and wbp[1][:-1] == wbp[3][:-1] # exclude final anceps
|
|
914
1146
|
|
|
915
|
-
and wbp_lens.count(11) != 4 # bc triṣṭubh upajāti so common
|
|
916
|
-
):
|
|
917
|
-
# will give id_score == 8
|
|
918
|
-
timed('ardhasamavftta')(self.evaluate_ardhasamavftta)(Vrs)
|
|
919
|
-
# max score not necessarily yet reached, don't return
|
|
920
1147
|
|
|
921
1148
|
# test perfect single pāda of samavṛtta
|
|
922
1149
|
if ( self.pAdasamatva_count == 0 and self.resplit_option == "single_pAda"):
|
|
923
1150
|
timed('samavftta')(self.evaluate_samavftta)(Vrs)
|
|
924
1151
|
|
|
925
|
-
# test perfect viṣamavṛtta
|
|
926
|
-
if self.pAdasamatva_count == 0 and timed('vizamavftta')(self.is_vizamavftta)(Vrs):
|
|
1152
|
+
# test perfect viṣamavṛtta (Levenshtein for imperfect deferred to imperfect pass)
|
|
1153
|
+
if self.pAdasamatva_count == 0 and timed('vizamavftta')(self.is_vizamavftta)(Vrs, perfect_only=True):
|
|
927
1154
|
# will give id_score == 9
|
|
928
1155
|
# label and score already set in is_vizamavftta if test was successful
|
|
929
1156
|
return 1 # max score already reached
|
|
@@ -938,15 +1165,12 @@ class VerseTester(object):
|
|
|
938
1165
|
if Vrs.identification_score == 8: return 1 # best score compared to below
|
|
939
1166
|
# otherwise, max score not necessarily yet reached, don't return
|
|
940
1167
|
|
|
941
|
-
# test imperfect samavftta
|
|
1168
|
+
# test imperfect samavftta (Levenshtein for length errors deferred to imperfect pass)
|
|
942
1169
|
if self.pAdasamatva_count in [2, 3]:
|
|
943
1170
|
# will give id_score in [7, 6], may tie with above
|
|
944
|
-
timed('samavftta')(self.evaluate_samavftta)(Vrs)
|
|
1171
|
+
timed('samavftta')(self.evaluate_samavftta)(Vrs, perfect_only=True)
|
|
945
1172
|
# max score not necessarily yet reached, don't return
|
|
946
1173
|
|
|
947
|
-
# test imperfect ardhasamavftta? seems hard
|
|
948
|
-
# involves looking specifically for corresponding type...
|
|
949
|
-
|
|
950
1174
|
# test imperfect upajāti
|
|
951
1175
|
if (
|
|
952
1176
|
len( list(set(wbp_lens)) ) in [2, 3] or
|
|
@@ -1291,25 +1515,47 @@ class VerseTester(object):
|
|
|
1291
1515
|
"""
|
|
1292
1516
|
|
|
1293
1517
|
self.identification_attempt_count += 1
|
|
1518
|
+
self._samavftta_has_length_error = False
|
|
1294
1519
|
|
|
1295
1520
|
# anuzwuB
|
|
1296
1521
|
success_anuzwuB = timed('anuzwuB')(self.test_as_anuzwuB)(Vrs)
|
|
1297
1522
|
if success_anuzwuB and Vrs.identification_score == meter_scores["max score"]:
|
|
1298
1523
|
return 1
|
|
1299
1524
|
|
|
1300
|
-
# samavftta, upajAti, vizamavftta
|
|
1301
|
-
|
|
1525
|
+
# samavftta, upajAti, vizamavftta
|
|
1526
|
+
_inner_keys = ('samavftta', 'upajAti', 'vizamavftta')
|
|
1527
|
+
_pre_inner = {k: _section_totals.get(k, 0.0) for k in _inner_keys} if _DEBUG_TIMING else None
|
|
1528
|
+
success_samavftta_etc = timed('samavftta_etc')(self.test_as_samavftta_etc)(Vrs)
|
|
1529
|
+
if _DEBUG_TIMING:
|
|
1530
|
+
inner_delta = sum(_section_totals.get(k, 0.0) - _pre_inner[k] for k in _inner_keys)
|
|
1531
|
+
_section_totals['samavftta_etc'] -= inner_delta
|
|
1302
1532
|
if success_samavftta_etc and Vrs.identification_score >= 8:
|
|
1303
1533
|
return 1
|
|
1304
1534
|
# i.e., if upajāti or anything imperfect, also continue on to check jāti
|
|
1305
1535
|
|
|
1306
|
-
#
|
|
1307
|
-
#
|
|
1536
|
+
# ardhasamavftta perfect-only pass (exact string match only; imperfect candidates are stashed for the deferred Levenshtein pass)
|
|
1537
|
+
# Odd pādas 10–12, even pādas 11–13; with threshold 2, viable range is 9–15.
|
|
1538
|
+
# Lower bound 9 excludes anuṣṭubh (8 syllables) which is the most common false candidate.
|
|
1539
|
+
wbp_lens_ardha = [len(line) for line in Vrs.syllable_weights.split('\n')]
|
|
1540
|
+
_ardha_viable = (
|
|
1541
|
+
wbp_lens_ardha.count(11) != 4 and # exclude 4×11 triṣṭubh upajāti
|
|
1542
|
+
all(9 <= l <= 15 for l in wbp_lens_ardha[:4])
|
|
1543
|
+
)
|
|
1544
|
+
if _ardha_viable:
|
|
1545
|
+
timed('ardhasamavftta_perfect')(self.evaluate_ardhasamavftta)(Vrs, perfect_only=True)
|
|
1546
|
+
if _ardha_viable and Vrs.identification_score >= meter_scores["ardhasamavṛtta, perfect"]:
|
|
1547
|
+
return 1
|
|
1308
1548
|
|
|
1309
1549
|
# jāti
|
|
1310
1550
|
success_jAti = timed('jAti')(self.test_as_jAti)(Vrs)
|
|
1551
|
+
if success_jAti and Vrs.identification_score >= meter_scores["max score"]:
|
|
1552
|
+
return 1
|
|
1311
1553
|
|
|
1312
|
-
|
|
1554
|
+
# imperfect pass: deferred Levenshtein annotation for samavftta length errors.
|
|
1555
|
+
if self._samavftta_has_length_error:
|
|
1556
|
+
timed('lev_samavftta')(self.evaluate_samavftta)(Vrs)
|
|
1557
|
+
|
|
1558
|
+
if success_anuzwuB or success_samavftta_etc or success_jAti or Vrs.identification_score >= meter_scores["ardhasamavṛtta, perfect"]:
|
|
1313
1559
|
return 1
|
|
1314
1560
|
else:
|
|
1315
1561
|
return 0
|
|
@@ -1368,6 +1614,8 @@ class MeterIdentifier(object):
|
|
|
1368
1614
|
"""Returns a list for MeterIdentifier.Verses_found"""
|
|
1369
1615
|
|
|
1370
1616
|
self._anuzwuB_half_cache = {}
|
|
1617
|
+
VrsTster._ardha_stash = []
|
|
1618
|
+
VrsTster._vizama_stash = []
|
|
1371
1619
|
pos_iterators = {}
|
|
1372
1620
|
for k in ['ab', 'bc', 'cd']:
|
|
1373
1621
|
if (
|
|
@@ -1481,7 +1729,8 @@ class MeterIdentifier(object):
|
|
|
1481
1729
|
|
|
1482
1730
|
if _DEBUG_TIMING:
|
|
1483
1731
|
_pre_keys = ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana',
|
|
1484
|
-
'anuzwuB', 'samavftta', 'upajAti', '
|
|
1732
|
+
'anuzwuB', 'samavftta', 'upajAti', 'vizamavftta',
|
|
1733
|
+
'ardhasamavftta_perfect', 'jAti', 'lev_samavftta', 'lev_ardha', 'lev_vizama', 'samavftta_etc')
|
|
1485
1734
|
_pre = {k: _section_totals.get(k, 0.0) for k in _pre_keys}
|
|
1486
1735
|
|
|
1487
1736
|
# gets back mostly populated Verse object
|
|
@@ -1493,7 +1742,15 @@ class MeterIdentifier(object):
|
|
|
1493
1742
|
|
|
1494
1743
|
if resplit_option in ['none', 'single_pAda'] or V.text_cleaned == '':
|
|
1495
1744
|
# No resplitting: test the verse exactly as scanned.
|
|
1745
|
+
VT._ardha_stash = []
|
|
1746
|
+
VT._vizama_stash = []
|
|
1496
1747
|
success = VT.attempt_identification(V)
|
|
1748
|
+
# Post-identification: deferred imperfect ardhasamavṛtta pass over stash.
|
|
1749
|
+
if VT._ardha_stash and meter_scores["ardhasamavṛtta, imperfect"] > V.identification_score:
|
|
1750
|
+
timed('lev_ardha')(VT.evaluate_ardhasamavftta)(V)
|
|
1751
|
+
# Post-identification: deferred imperfect viṣamavṛtta pass over stash.
|
|
1752
|
+
if VT._vizama_stash and meter_scores["viṣamavṛtta, imperfect"] > V.identification_score:
|
|
1753
|
+
timed('lev_vizama')(VT.is_vizamavftta)(V)
|
|
1497
1754
|
|
|
1498
1755
|
elif resplit_option in ['resplit_max', 'resplit_lite']:
|
|
1499
1756
|
|
|
@@ -1557,6 +1814,139 @@ class MeterIdentifier(object):
|
|
|
1557
1814
|
pAda_brs, quarter_len
|
|
1558
1815
|
)
|
|
1559
1816
|
|
|
1817
|
+
# Post-wiggle: deferred imperfect ardhasamavṛtta pass over accumulated stash.
|
|
1818
|
+
_lev_ardha_t0 = _time.perf_counter() if _DEBUG_TIMING else None
|
|
1819
|
+
ardha_stash = VT._ardha_stash
|
|
1820
|
+
if ardha_stash:
|
|
1821
|
+
best_current_score = (
|
|
1822
|
+
max(v.identification_score for v in self.Verses_found)
|
|
1823
|
+
if self.Verses_found else 0
|
|
1824
|
+
)
|
|
1825
|
+
if meter_scores["ardhasamavṛtta, imperfect"] > best_current_score:
|
|
1826
|
+
best_total_dist = None
|
|
1827
|
+
best_entry = None
|
|
1828
|
+
for _stash_wbp, _label, _odd_can, _even_can, _stash_tsyl, _stash_gaRa, _stash_morae in ardha_stash:
|
|
1829
|
+
total_dist = sum(
|
|
1830
|
+
_levenshtein_align(w, _odd_can if pada_num in (1, 3) else _even_can)[0]
|
|
1831
|
+
for pada_num, w in enumerate(_stash_wbp[:4], start=1)
|
|
1832
|
+
)
|
|
1833
|
+
if total_dist <= ARDHASAMAVFTTA_EDIT_DISTANCE_THRESHOLD:
|
|
1834
|
+
if best_total_dist is None or total_dist < best_total_dist:
|
|
1835
|
+
best_total_dist = total_dist
|
|
1836
|
+
best_entry = (_stash_wbp, _label, _odd_can, _even_can, _stash_tsyl, _stash_gaRa, _stash_morae)
|
|
1837
|
+
if best_entry is not None:
|
|
1838
|
+
ardha_score = meter_scores["ardhasamavṛtta, imperfect"] - (best_total_dist - 1)
|
|
1839
|
+
if ardha_score > best_current_score:
|
|
1840
|
+
best_stash_wbp, best_label, best_odd_can, best_even_can, best_stash_tsyl, best_stash_gaRa, best_stash_morae = best_entry
|
|
1841
|
+
problem_syllables = {}
|
|
1842
|
+
per_pada_sanskrit = {}
|
|
1843
|
+
per_pada_english = {}
|
|
1844
|
+
for pada_num, w in enumerate(best_stash_wbp[:4], start=1):
|
|
1845
|
+
canonical = best_odd_can if pada_num in (1, 3) else best_even_can
|
|
1846
|
+
dist, prob_indices = _levenshtein_align(w, canonical)
|
|
1847
|
+
if dist == 0:
|
|
1848
|
+
continue
|
|
1849
|
+
problem_syllables[pada_num] = prob_indices
|
|
1850
|
+
meter_name = best_label.split(' = ')[0]
|
|
1851
|
+
if len(w) > len(canonical):
|
|
1852
|
+
per_pada_sanskrit[pada_num] = 'adhikākṣarā'
|
|
1853
|
+
per_pada_english[pada_num] = 'hypermetric'
|
|
1854
|
+
elif len(w) < len(canonical):
|
|
1855
|
+
per_pada_sanskrit[pada_num] = 'ūnākṣarā'
|
|
1856
|
+
per_pada_english[pada_num] = 'hypometric'
|
|
1857
|
+
else:
|
|
1858
|
+
per_pada_sanskrit[pada_num] = 'vikṛtavṛtta'
|
|
1859
|
+
per_pada_english[pada_num] = f'does not match expected gaṇa pattern for {meter_name}'
|
|
1860
|
+
sa_vals = list(per_pada_sanskrit.items())
|
|
1861
|
+
if len(sa_vals) == 1:
|
|
1862
|
+
suffix = f"asamīcīnā, pāda {sa_vals[0][0]}: {sa_vals[0][1]}"
|
|
1863
|
+
else:
|
|
1864
|
+
suffix = 'asamīcīnā, ' + '; '.join(f"pāda {p}: {v}" for p, v in sa_vals)
|
|
1865
|
+
imperfect_label = best_label + f" ({suffix})"
|
|
1866
|
+
ardha_Vrs = copy(self.Verses_found[0]) if self.Verses_found else copy(V)
|
|
1867
|
+
ardha_Vrs.text_syllabified = best_stash_tsyl
|
|
1868
|
+
ardha_Vrs.syllable_weights = '\n'.join(best_stash_wbp)
|
|
1869
|
+
ardha_Vrs.gaRa_abbreviations = best_stash_gaRa
|
|
1870
|
+
ardha_Vrs.morae_per_line = best_stash_morae
|
|
1871
|
+
ardha_Vrs.meter_label = imperfect_label
|
|
1872
|
+
ardha_Vrs.identification_score = ardha_score
|
|
1873
|
+
ardha_Vrs.diagnostic = Diagnostic(
|
|
1874
|
+
perfect_id_label=imperfect_label,
|
|
1875
|
+
imperfect_label_sanskrit=per_pada_sanskrit or None,
|
|
1876
|
+
imperfect_label_english=per_pada_english or None,
|
|
1877
|
+
problem_syllables=problem_syllables or None,
|
|
1878
|
+
)
|
|
1879
|
+
self.Verses_found.append(ardha_Vrs)
|
|
1880
|
+
if _DEBUG_TIMING:
|
|
1881
|
+
_section_totals['lev_ardha'] = _section_totals.get('lev_ardha', 0.0) + _time.perf_counter() - _lev_ardha_t0
|
|
1882
|
+
|
|
1883
|
+
# Post-wiggle: deferred imperfect viṣamavṛtta pass over accumulated stash.
|
|
1884
|
+
_lev_vizama_t0 = _time.perf_counter() if _DEBUG_TIMING else None
|
|
1885
|
+
vizama_stash = VT._vizama_stash
|
|
1886
|
+
if vizama_stash:
|
|
1887
|
+
best_current_score = (
|
|
1888
|
+
max(v.identification_score for v in self.Verses_found)
|
|
1889
|
+
if self.Verses_found else 0
|
|
1890
|
+
)
|
|
1891
|
+
if meter_scores["viṣamavṛtta, imperfect"] > best_current_score:
|
|
1892
|
+
best_total_dist = None
|
|
1893
|
+
best_entry = None
|
|
1894
|
+
for _wbp, _label, _canonicals, _tsyl, _gaRa, _morae in vizama_stash:
|
|
1895
|
+
total_dist = sum(
|
|
1896
|
+
_levenshtein_align(_wbp[i], _canonicals[i])[0]
|
|
1897
|
+
for i in range(4)
|
|
1898
|
+
)
|
|
1899
|
+
if total_dist <= VIZAMAVFTTA_EDIT_DISTANCE_THRESHOLD:
|
|
1900
|
+
if best_total_dist is None or total_dist < best_total_dist:
|
|
1901
|
+
best_total_dist = total_dist
|
|
1902
|
+
best_entry = (_wbp, _label, _canonicals, _tsyl, _gaRa, _morae)
|
|
1903
|
+
if best_entry is not None:
|
|
1904
|
+
vizama_score = meter_scores["viṣamavṛtta, imperfect"] - (best_total_dist - 1)
|
|
1905
|
+
if vizama_score > best_current_score:
|
|
1906
|
+
best_wbp, best_label, best_canonicals, best_tsyl, best_gaRa, best_morae = best_entry
|
|
1907
|
+
problem_syllables = {}
|
|
1908
|
+
per_pada_sanskrit = {}
|
|
1909
|
+
per_pada_english = {}
|
|
1910
|
+
for i, w in enumerate(best_wbp[:4]):
|
|
1911
|
+
canonical = best_canonicals[i]
|
|
1912
|
+
dist, prob_indices = _levenshtein_align(w, canonical)
|
|
1913
|
+
if dist == 0:
|
|
1914
|
+
continue
|
|
1915
|
+
pada_num = i + 1
|
|
1916
|
+
problem_syllables[pada_num] = prob_indices
|
|
1917
|
+
meter_name = best_label.split(' = ')[0]
|
|
1918
|
+
if len(w) > len(canonical):
|
|
1919
|
+
per_pada_sanskrit[pada_num] = 'adhikākṣarā'
|
|
1920
|
+
per_pada_english[pada_num] = 'hypermetric'
|
|
1921
|
+
elif len(w) < len(canonical):
|
|
1922
|
+
per_pada_sanskrit[pada_num] = 'ūnākṣarā'
|
|
1923
|
+
per_pada_english[pada_num] = 'hypometric'
|
|
1924
|
+
else:
|
|
1925
|
+
per_pada_sanskrit[pada_num] = 'vikṛtavṛtta'
|
|
1926
|
+
per_pada_english[pada_num] = f'does not match expected gaṇa pattern for {meter_name}'
|
|
1927
|
+
sa_vals = list(per_pada_sanskrit.items())
|
|
1928
|
+
if len(sa_vals) == 1:
|
|
1929
|
+
suffix = f"asamīcīnā, pāda {sa_vals[0][0]}: {sa_vals[0][1]}"
|
|
1930
|
+
else:
|
|
1931
|
+
suffix = 'asamīcīnā, ' + '; '.join(f"pāda {p}: {v}" for p, v in sa_vals)
|
|
1932
|
+
imperfect_label = best_label + f" ({suffix})"
|
|
1933
|
+
vizama_Vrs = copy(self.Verses_found[0]) if self.Verses_found else copy(V)
|
|
1934
|
+
vizama_Vrs.text_syllabified = best_tsyl
|
|
1935
|
+
vizama_Vrs.syllable_weights = '\n'.join(best_wbp)
|
|
1936
|
+
vizama_Vrs.gaRa_abbreviations = best_gaRa
|
|
1937
|
+
vizama_Vrs.morae_per_line = best_morae
|
|
1938
|
+
vizama_Vrs.meter_label = imperfect_label
|
|
1939
|
+
vizama_Vrs.identification_score = vizama_score
|
|
1940
|
+
vizama_Vrs.diagnostic = Diagnostic(
|
|
1941
|
+
perfect_id_label=imperfect_label,
|
|
1942
|
+
imperfect_label_sanskrit=per_pada_sanskrit or None,
|
|
1943
|
+
imperfect_label_english=per_pada_english or None,
|
|
1944
|
+
problem_syllables=problem_syllables or None,
|
|
1945
|
+
)
|
|
1946
|
+
self.Verses_found.append(vizama_Vrs)
|
|
1947
|
+
if _DEBUG_TIMING:
|
|
1948
|
+
_section_totals['lev_vizama'] = _section_totals.get('lev_vizama', 0.0) + _time.perf_counter() - _lev_vizama_t0
|
|
1949
|
+
|
|
1560
1950
|
# Pick the candidate with the highest identification score.
|
|
1561
1951
|
if len(self.Verses_found) > 0:
|
|
1562
1952
|
self.Verses_found.sort(key=lambda x: x.identification_score, reverse=True)
|
|
@@ -1569,7 +1959,8 @@ class MeterIdentifier(object):
|
|
|
1569
1959
|
|
|
1570
1960
|
if _DEBUG_TIMING:
|
|
1571
1961
|
all_keys = ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana',
|
|
1572
|
-
'anuzwuB', 'samavftta', 'upajAti', '
|
|
1962
|
+
'anuzwuB', 'samavftta', 'upajAti', 'vizamavftta',
|
|
1963
|
+
'ardhasamavftta_perfect', 'jAti', 'lev_samavftta', 'lev_ardha', 'lev_vizama', 'samavftta_etc')
|
|
1573
1964
|
verse_times = {k: _section_totals.get(k, 0.0) - _pre[k] for k in all_keys}
|
|
1574
1965
|
verse_times['scan'] = sum(verse_times[k] for k in ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana'))
|
|
1575
1966
|
cat = _meter_label_to_category(V.meter_label)
|
|
@@ -11,6 +11,16 @@ gaRas_by_weights = {
|
|
|
11
11
|
'llg' : 's', # anapest / antidactylus
|
|
12
12
|
}
|
|
13
13
|
|
|
14
|
+
# Reverse lookup of gaRas_by_weights: every value stored there becomes a key
# here, mapped back to the key it was stored under (e.g. 's' -> 'llg').
weights_by_gaRa = {v: k for k, v in gaRas_by_weights.items()}
|
|
15
|
+
|
|
16
|
+
def expand_gaRa_pattern(gaRa_str):
    """Translate a plain gaṇa abbreviation string into an l/g weight string.

    The input is read one character at a time, so it must be a literal
    abbreviation string rather than a regular expression.  A character that
    is a key of ``weights_by_gaRa`` is replaced by the weight sequence stored
    for it; any other character (such as a closing ``l`` or ``g`` marking the
    final anceps) is copied to the output unchanged.
    """
    return ''.join(
        weights_by_gaRa.get(symbol, symbol)
        for symbol in gaRa_str
    )
|
|
23
|
+
|
|
14
24
|
"""
|
|
15
25
|
Sources:
|
|
16
26
|
Apte, V.S. (1890). Practical Sanskrit-English Dictionary, "Appendix A: Sanskrit Prosody".
|
|
@@ -336,14 +346,16 @@ all_known_samavRttas = []
|
|
|
336
346
|
for k in samavfttas_by_family_and_gaRa.keys(): # for each family
|
|
337
347
|
all_known_samavRttas = all_known_samavRttas + list(samavfttas_by_family_and_gaRa[k].values())
|
|
338
348
|
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
('
|
|
343
|
-
('
|
|
344
|
-
('
|
|
345
|
-
('
|
|
346
|
-
('
|
|
349
|
+
# Lookup table for ardhasamavṛtta meters.
#
# Key:   (odd_canonical_lg, even_canonical_lg) -- the l/g weight strings for
#        pādas 1,3 and pādas 2,4 respectively, written with a heavy final
#        anceps.
# Value: the meter label, "<name> = [<syllable count>: <gaṇas>] 1,3 + [...] 2,4",
#        so the gaṇa abbreviations are carried inside the label itself.
ardhasamavftta_by_odd_even_weights = {
    # aka vaitālīya
    ('llllllglglg', 'llllgllglglg'):
        'aparavaktra = [11: nnrlg] 1,3 + [12: njjr] 2,4',

    ('llgllgllglg', 'gllgllgllgg'):
        'upacitra = [11: ssslg] 1,3 + [11: BBBgg] 2,4',

    # aka aupacchandasika
    ('llllllglglgg', 'llllgllglglgg'):
        'puṣpitāgrā = [12: nnry] 1,3 + [13: njjrg] 2,4',

    # aka vaitālīya, sundarī
    ('llgllglglg', 'llggllglglg'):
        'viyoginī = [10: ssjg] 1,3 + [11: sBrlg] 2,4',

    ('llgllgllgg', 'gllgllgllgg'):
        'vegavatī = [10: sssg] 1,3 + [11: BBBgg] 2,4',

    ('llgllgllglg', 'lllgllgllglg'):
        'hariṇaplutā = [11: ssslg] 1,3 + [12: nBBr] 2,4',

    # aka mālābhāriṇī
    ('llgllglglgg', 'llggllglglgg'):
        'aupacchandasika = [11: ssjgg] 1,3 + [12: sBry] 2,4',
}
|
|
348
360
|
|
|
349
361
|
vizamavftta_by_4_tuple = {
|
|
@@ -157,22 +157,21 @@ class Scanner(object):
|
|
|
157
157
|
Returns result as string.
|
|
158
158
|
"""
|
|
159
159
|
|
|
160
|
-
#
|
|
161
|
-
|
|
162
|
-
for
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
cntnts = re.sub(regex, '\n', cntnts)
|
|
167
|
-
# also remove buffer-initial and -final newlines
|
|
168
|
-
regex = re.compile(r"(\A\s*)|(\s*\Z)")
|
|
169
|
-
cntnts = re.sub(regex, '', cntnts)
|
|
160
|
+
# filter out disallowed characters (numbers, irrelevant punctuation, etc.)
|
|
161
|
+
# pāda separator chars are preserved so they can be converted to \n below
|
|
162
|
+
pAda_sep_chars = set(c for sep in additional_pAda_separators for c in sep)
|
|
163
|
+
for c in list(set(cntnts)):
|
|
164
|
+
if c not in phonemes.character_set[scheme_in] and c not in pAda_sep_chars:
|
|
165
|
+
cntnts = cntnts.replace(c, '')
|
|
170
166
|
|
|
171
|
-
#
|
|
167
|
+
# replace all pāda separator strings with newline
|
|
168
|
+
for sep in additional_pAda_separators:
|
|
169
|
+
cntnts = cntnts.replace(sep, '\n')
|
|
172
170
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
171
|
+
# strip horizontal whitespace around newlines, dedupe, strip leading/trailing
|
|
172
|
+
cntnts = re.sub(r'[ \t]*\n[ \t]*', '\n', cntnts)
|
|
173
|
+
cntnts = re.sub(r'\n+', '\n', cntnts)
|
|
174
|
+
cntnts = cntnts.strip()
|
|
176
175
|
|
|
177
176
|
return cntnts
|
|
178
177
|
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "2.5.3"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|