skrutable 2.7.0__tar.gz → 2.8.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {skrutable-2.7.0 → skrutable-2.8.0}/PKG-INFO +1 -1
  2. skrutable-2.8.0/src/skrutable/__init__.py +1 -0
  3. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/config.json +8 -6
  4. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/meter_identification.py +436 -273
  5. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/meter_patterns.py +2 -2
  6. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/scansion.py +1 -0
  7. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable.egg-info/PKG-INFO +1 -1
  8. skrutable-2.7.0/src/skrutable/__init__.py +0 -1
  9. {skrutable-2.7.0 → skrutable-2.8.0}/LICENSE.md +0 -0
  10. {skrutable-2.7.0 → skrutable-2.8.0}/README.md +0 -0
  11. {skrutable-2.7.0 → skrutable-2.8.0}/setup.cfg +0 -0
  12. {skrutable-2.7.0 → skrutable-2.8.0}/setup.py +0 -0
  13. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/config.py +0 -0
  14. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/generate_scheme_vectors.py +0 -0
  15. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/impossible_bigrams.json +0 -0
  16. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/manual.md +0 -0
  17. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/phonemes.py +0 -0
  18. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/run_examples.py +0 -0
  19. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/scheme_detection.py +0 -0
  20. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/scheme_maps.py +0 -0
  21. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/scheme_vectors.json +0 -0
  22. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/scheme_vectors_mbh.py +0 -0
  23. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/splitting.py +0 -0
  24. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/transliteration.py +0 -0
  25. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/utils.py +0 -0
  26. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable/virAma_avoidance.py +0 -0
  27. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable.egg-info/SOURCES.txt +0 -0
  28. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable.egg-info/dependency_links.txt +0 -0
  29. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable.egg-info/requires.txt +0 -0
  30. {skrutable-2.7.0 → skrutable-2.8.0}/src/skrutable.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skrutable
3
- Version: 2.7.0
3
+ Version: 2.8.0
4
4
  Summary: skrutable library for working with Sanskrit text
5
5
  Home-page: https://github.com/tylergneill/skrutable
6
6
  Author: Tyler Neill
@@ -0,0 +1 @@
1
+ __version__ = "2.8.0"
@@ -7,7 +7,7 @@
7
7
  "additional_pAda_separators" : ["\t", ";", ",", "/", "|", "।"],
8
8
  "default_resplit_option" : "resplit_lite",
9
9
  "default_resplit_keep_midpoint" : false,
10
- "disable_non_trizwuB_upajAti" : true,
10
+ "allow_only_trizwuB_and_jagatI_upajAti" : true,
11
11
  "meter_scores" : {
12
12
  "max score" : 9,
13
13
  "anuṣṭubh, full, both halves perfect)" : 9,
@@ -30,13 +30,15 @@
30
30
  "viṣamavṛtta, perfect" : 9,
31
31
  "viṣamavṛtta, imperfect" : 7,
32
32
  "upajāti, perfect" : 8,
33
- "upajāti, imperfect" : 6,
34
- "upajāti, non-triṣṭubh, perfect" : 5,
35
- "upajāti, triṣṭubh-jagatī-saṃkara, perfect" : 4,
36
- "upajāti, non-triṣṭubh, imperfect" : 3,
33
+ "upajāti, penalty, jagati" : 1,
34
+ "upajāti, penalty, per missing pāda" : 2,
35
+ "upajāti, penalty, per ajñātam pāda" : 2,
37
36
  "jāti, perfect" : 9,
38
37
  "jāti, imperfect" : 6,
39
- "jāti, likely" : 3,
38
+ "jāti, likely" : 4,
39
+ "jāti, penalty, per mora-mismatched pāda" : 1,
40
+ "samavṛtta, penalty, ajñātasamavṛtta" : 2,
41
+ "levenshtein distance penalty" : 1,
40
42
  "none found" : 1
41
43
  },
42
44
  "preserve_punctuation_default" : true,
@@ -4,6 +4,7 @@ from skrutable.config import load_config_dict_from_json_file
4
4
  from skrutable.utils import _DEBUG_TIMING, _section_totals, timed
5
5
  import re
6
6
  import time as _time
7
+ from functools import lru_cache
7
8
  from copy import copy
8
9
  from concurrent.futures import ProcessPoolExecutor
9
10
  from dataclasses import dataclass
@@ -17,11 +18,25 @@ config = load_config_dict_from_json_file()
17
18
  scansion_syllable_separator = config["scansion_syllable_separator"] # e.g. " "
18
19
  default_resplit_option = config["default_resplit_option"] # e.g. "none"
19
20
  default_resplit_keep_midpoint = config["default_resplit_keep_midpoint"] # e.g. True
20
- disable_non_trizwuB_upajAti = config["disable_non_trizwuB_upajAti"] # e.g. True
21
+ allow_only_trizwuB_and_jagatI_upajAti = config["allow_only_trizwuB_and_jagatI_upajAti"] # e.g. True
21
22
  meter_scores = config["meter_scores"] # dict
22
23
 
23
24
  _category_totals = {} # { category: { section: float seconds } }, single source of truth
24
25
 
26
+ # Profiling categories and labels
27
+ _SCAN_ABBREV = {
28
+ 'scan_clean': 'clean', 'scan_translit': 'transl', 'scan_syllabify': 'syl',
29
+ 'scan_weights': 'wts', 'scan_morae_gana': 'mor+g',
30
+ }
31
+ _ID_CASCADE_ABBREV = {
32
+ 'anuzwuB': 'anuṣṭ', 'ardhatraya': 'anuṣṭ3', 'samavftta_etc': 'vftta↑', 'samavftta': 'samav', 'upajAti': 'upajāti',
33
+ 'ardhasamavftta_perfect': 'ardha✓', 'vizamavftta': 'vizama',
34
+ 'jAti': 'jāti',
35
+ 'lev_samavftta': 'lev✗sama', 'lev_upajAti': 'lev✗upaj', 'lev_ardha': 'lev✗ardh', 'lev_vizama': 'lev✗visa',
36
+ }
37
+ _SCAN_KEYS = tuple(_SCAN_ABBREV)
38
+ _ID_CASCADE_KEYS = tuple(_ID_CASCADE_ABBREV)
39
+ _TIMING_KEYS = _SCAN_KEYS + _ID_CASCADE_KEYS
25
40
 
26
41
  _ARDHASAMAVRTTA_NAMES = [
27
42
  'aparavaktra', 'upacitra', 'puṣpitāgrā', 'viyoginī', 'vegavatī',
@@ -65,18 +80,9 @@ def flush_profiling_report(write_file=False, wall_clock_secs=None, parallel_work
65
80
  if not _DEBUG_TIMING or not _category_totals:
66
81
  return
67
82
  import sys, os
68
- scan_keys = ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana')
69
- type_keys = ('anuzwuB', 'ardhatraya', 'samavftta_etc', 'samavftta', 'upajAti', 'ardhasamavftta_perfect', 'vizamavftta', 'jAti', 'lev_samavftta', 'lev_ardha', 'lev_vizama')
70
- type_abbrev = {
71
- 'anuzwuB': 'anuṣṭ', 'ardhatraya': 'anuṣṭ3', 'samavftta_etc': 'vftta↑', 'samavftta': 'samav', 'upajAti': 'upajāti',
72
- 'ardhasamavftta_perfect': 'ardha✓', 'vizamavftta': 'vizama',
73
- 'jAti': 'jāti',
74
- 'lev_samavftta': 'lev✗sama', 'lev_ardha': 'lev✗ardh', 'lev_vizama': 'lev✗visa',
75
- }
76
- scan_abbrev = {'scan_clean': 'clean', 'scan_translit': 'transl', 'scan_syllabify': 'syl', 'scan_weights': 'wts', 'scan_morae_gana': 'mor+g'}
77
83
  cat_order = ['anuṣṭubh', 'samavṛtta', 'upajāti', 'ardhasamavṛtta', 'viṣamavṛtta', 'jāti', 'na kiṃcid adhyavasitam']
78
- hdr_scan_abbrevs = [scan_abbrev[k] for k in scan_keys]
79
- hdr_type_abbrevs = [type_abbrev[k] for k in type_keys]
84
+ hdr_scan_abbrevs = list(_SCAN_ABBREV.values())
85
+ hdr_type_abbrevs = list(_ID_CASCADE_ABBREV.values())
80
86
  val_w = len('0.00s')
81
87
  col_cat_w = max(len(c) for c in cat_order + ['category']) + 2
82
88
  sub_w = max(len('scan∑'), len('types∑'), len('total'), val_w) + 2
@@ -106,10 +112,10 @@ def flush_profiling_report(write_file=False, wall_clock_secs=None, parallel_work
106
112
  bucket = _category_totals.get(cat)
107
113
  if not bucket:
108
114
  continue
109
- cat_scan = sum(bucket.get(k, 0.0) for k in scan_keys)
110
- cat_types = sum(bucket.get(k, 0.0) for k in type_keys)
111
- scan_vals = [f'{bucket.get(k, 0.0):.2f}s' for k in scan_keys]
112
- type_vals = [f'{bucket.get(k, 0.0):.2f}s' for k in type_keys]
115
+ cat_scan = sum(bucket.get(k, 0.0) for k in _SCAN_KEYS)
116
+ cat_types = sum(bucket.get(k, 0.0) for k in _ID_CASCADE_KEYS)
117
+ scan_vals = [f'{bucket.get(k, 0.0):.2f}s' for k in _SCAN_KEYS]
118
+ type_vals = [f'{bucket.get(k, 0.0):.2f}s' for k in _ID_CASCADE_KEYS]
113
119
  n_perf = bucket.get('_perfect_count', 0)
114
120
  n_impf = bucket.get('_count', 0) - n_perf
115
121
  total_perfect += n_perf
@@ -121,10 +127,10 @@ def flush_profiling_report(write_file=False, wall_clock_secs=None, parallel_work
121
127
  + f'{cat_types:.2f}s'.rjust(sub_w)
122
128
  + ' ' + fmt_row(scan_vals, type_vals))
123
129
  lines.append(sep)
124
- total_scan = sum(sum(_category_totals.get(c, {}).get(k, 0.0) for c in cat_order) for k in scan_keys)
125
- total_types = sum(sum(_category_totals.get(c, {}).get(k, 0.0) for c in cat_order) for k in type_keys)
126
- total_scan_vals = [f'{sum(_category_totals.get(c, {}).get(k, 0.0) for c in cat_order):.2f}s' for k in scan_keys]
127
- total_type_vals = [f'{sum(_category_totals.get(c, {}).get(k, 0.0) for c in cat_order):.2f}s' for k in type_keys]
130
+ total_scan = sum(sum(_category_totals.get(c, {}).get(k, 0.0) for c in cat_order) for k in _SCAN_KEYS)
131
+ total_types = sum(sum(_category_totals.get(c, {}).get(k, 0.0) for c in cat_order) for k in _ID_CASCADE_KEYS)
132
+ total_scan_vals = [f'{sum(_category_totals.get(c, {}).get(k, 0.0) for c in cat_order):.2f}s' for k in _SCAN_KEYS]
133
+ total_type_vals = [f'{sum(_category_totals.get(c, {}).get(k, 0.0) for c in cat_order):.2f}s' for k in _ID_CASCADE_KEYS]
128
134
  lines.append(' ' + 'TOTAL'.ljust(col_cat_w)
129
135
  + str(total_perfect).rjust(count_w) + str(total_imperfect).rjust(count_w)
130
136
  + f'{total_scan + total_types:.2f}s'.rjust(sub_w)
@@ -156,6 +162,7 @@ class Diagnostic:
156
162
  problem_syllables: Optional[dict] = None # keyed by pada (1–4 or 'odd'/'even'); None if perfect
157
163
  notable_syllables: Optional[dict] = None # keyed by pada (1–4 or 'odd'/'even'); green-highlighted "interesting/ok" syllables
158
164
  notable_label: Optional[dict] = None # keyed by pada (1–4 or 'odd'/'even'); label for the notable feature (same string for skt/eng)
165
+ canonical_gana: Optional[dict] = None # keyed by pada (1–4); canonical gaṇa char string for Levenshtein-attributed length-deviant pādas
159
166
 
160
167
  def perfect(self):
161
168
  return self.perfect_id_label is not None
@@ -182,6 +189,19 @@ _vizamavftta_precomputed = [
182
189
  for gaRas, label in meter_patterns.vizamavftta_by_4_tuple.items()
183
190
  ]
184
191
 
192
+ # Precomputed upajāti candidate patterns by length, for future deferred Levenshtein use:
193
+ # (canonical_gaRa_str, canonical_weights_str, meter_name, gaRa_regex_str)
194
+ _upajAti_patterns_by_length = {}
195
+ for _L, _patterns in meter_patterns.samavfttas_by_family_and_gaRa.items():
196
+ if not _patterns:
197
+ continue
198
+ _entries = []
199
+ for _gaRa_pattern, _meter_name in _patterns.items():
200
+ _canonical_gaRa = meter_patterns.choose_heavy_gaRa_pattern(_gaRa_pattern)
201
+ _canonical_weights = _gaRa_str_to_weights(_canonical_gaRa)
202
+ _entries.append((_canonical_gaRa, _canonical_weights, _meter_name, _gaRa_pattern))
203
+ _upajAti_patterns_by_length[_L] = _entries
204
+
185
205
 
186
206
  def _levenshtein_align(observed, canonical):
187
207
  """Return (distance, problem_indices) comparing observed lg string to canonical,
@@ -232,6 +252,12 @@ def _levenshtein_align(observed, canonical):
232
252
  return dist, []
233
253
 
234
254
 
255
+ @lru_cache(maxsize=None)
256
+ def _levenshtein_align_cached(observed, canonical):
257
+ dist, prob = _levenshtein_align(observed, canonical)
258
+ return dist, tuple(prob)
259
+
260
+
235
261
  def _decompose_into_mAtragaNas(weights_str, gana_6_morae, gana_8_morae):
236
262
  """
237
263
  Decomposes an ardha (half-verse) weight string into mātrā-gaṇas.
@@ -378,8 +404,8 @@ class VerseTester(object):
378
404
  Most methods take a populated scansion.Verse object as an argument;
379
405
  test_as_anuzwuB_half() is an exception.
380
406
 
381
- Primary method attempt_identification returns scansion.Verse object
382
- with populated meter_label attribute if identification was successful.
407
+ Primary method attempt_identification embeds results in the Verse object
408
+ and returns 1 if identified, 0 if not.
383
409
  """
384
410
 
385
411
  def __init__(self):
@@ -392,8 +418,9 @@ class VerseTester(object):
392
418
  self._ardha_stash = [] # accumulated across wiggle candidates
393
419
  self._vizama_stash = [] # accumulated across wiggle candidates
394
420
  self._samavftta_has_length_error = False # set during evaluate_samavftta perfect_only pass
421
+ self._upajAti_needs_lev = False # set during evaluate_upajAti forward pass
395
422
 
396
- def combine_results(self, Vrs, new_label, new_score, new_is_perfect=False):
423
+ def combine_results(self, Vrs, new_label, new_score, new_is_perfect=False, new_diagnostic=None):
397
424
  old_label = Vrs.meter_label or ''
398
425
  old_score = Vrs.identification_score
399
426
 
@@ -408,9 +435,18 @@ class VerseTester(object):
408
435
  Vrs.meter_label = new_label
409
436
  Vrs.identification_score = new_score
410
437
  Vrs.is_perfect = new_is_perfect
438
+ Vrs.alternatives = []
411
439
 
412
440
  elif new_score == old_score:
413
441
  # tie, concatenate as old + new
442
+ if Vrs.meter_label is None:
443
+ Vrs.meter_label = new_label
444
+ Vrs.is_perfect = new_is_perfect
445
+ else:
446
+ # stash the first alternative before appending the second
447
+ if not Vrs.alternatives:
448
+ Vrs.alternatives = [{'meter_label': old_label, 'diagnostic': Vrs.diagnostic}]
449
+ Vrs.alternatives.append({'meter_label': new_label, 'diagnostic': new_diagnostic})
414
450
  Vrs.meter_label += " atha vā " + new_label
415
451
  # do not change score
416
452
 
@@ -632,9 +668,9 @@ class VerseTester(object):
632
668
 
633
669
  def count_pAdasamatva(self, Vrs):
634
670
  """
635
- Accepts four-part (newline-separated) string of light/heavy (l/g) pattern.
671
+ Accepts Verse object with four-part (newline-separated) syllable_weights.
636
672
  Since testing for samavṛtta, ignores final anceps syllable in each part.
637
- Returns integer 0,2,3,4 indicating size of best matching group.
673
+ Sets self.pAdasamatva_count to 0, 2, 3, or 4 (size of best matching group).
638
674
  """
639
675
 
640
676
  self.pAdasamatva_count = 0
@@ -665,7 +701,7 @@ class VerseTester(object):
665
701
 
666
702
  # get index of most frequent pāda type
667
703
  wbp_sans_final = [ w[:-1] for w in wbp ] # omit final anceps from consideration
668
- most_freq_pAda = max( set(wbp_sans_final), key=wbp_sans_final.count )
704
+ most_freq_pAda = max( sorted(set(wbp_sans_final)), key=wbp_sans_final.count )
669
705
  i = wbp_sans_final.index(most_freq_pAda)
670
706
 
671
707
  w_to_id = wbp[i] # weights to id, including final anceps
@@ -702,9 +738,10 @@ class VerseTester(object):
702
738
  meter_label += " (%s)" % imperfect_note
703
739
  score = meter_scores["samavṛtta, quarter, perfect"]
704
740
 
705
- # experimental penalty, can later incorporate into config meter_scores
706
741
  if "ajñātasamavṛtta" in meter_label:
707
- score -= 2
742
+ score -= meter_scores["samavṛtta, penalty, ajñātasamavṛtta"]
743
+
744
+ bare_meter_label = meter_label # forward-pass label before per-pāda length notes
708
745
 
709
746
  # Build per-pāda diagnostic: length errors (Levenshtein), then pattern errors.
710
747
  # In perfect_only mode, skip Levenshtein — just register the result and return.
@@ -718,9 +755,10 @@ class VerseTester(object):
718
755
  # Defer length-error annotation to the imperfect pass; register result now.
719
756
  self._samavftta_has_length_error = True
720
757
  old_score = Vrs.identification_score
721
- self.combine_results(Vrs, new_label=meter_label, new_score=score)
758
+ _diag = Diagnostic(perfect_id_label=meter_label)
759
+ self.combine_results(Vrs, new_label=meter_label, new_score=score, new_diagnostic=_diag)
722
760
  if score >= old_score:
723
- Vrs.diagnostic = Diagnostic(perfect_id_label=meter_label)
761
+ Vrs.diagnostic = _diag
724
762
  return
725
763
 
726
764
  for pada_num, w in enumerate(wbp[:4], start=1):
@@ -770,9 +808,14 @@ class VerseTester(object):
770
808
 
771
809
  # score arbitration: may tie with pre-existing result (e.g., upajāti)
772
810
  old_score = Vrs.identification_score
773
- self.combine_results(Vrs, new_label=meter_label, new_score=score, new_is_perfect=not imperfect_note and not has_any_error)
774
- if score >= old_score:
811
+ if self._samavftta_has_length_error and Vrs.meter_label == bare_meter_label:
812
+ # Replace the forward-pass placeholder with the fully-annotated label.
813
+ Vrs.meter_label = meter_label
775
814
  Vrs.diagnostic = diagnostic
815
+ else:
816
+ self.combine_results(Vrs, new_label=meter_label, new_score=score, new_is_perfect=not imperfect_note and not has_any_error, new_diagnostic=diagnostic)
817
+ if score >= old_score:
818
+ Vrs.diagnostic = diagnostic
776
819
 
777
820
 
778
821
  def evaluate_ardhasamavftta(self, Vrs, perfect_only=False):
@@ -810,9 +853,10 @@ class VerseTester(object):
810
853
  ):
811
854
  score = meter_scores["ardhasamavṛtta, perfect"]
812
855
  old_score = Vrs.identification_score
813
- self.combine_results(Vrs, new_label=meter_label, new_score=score, new_is_perfect=True)
856
+ _diag = Diagnostic(perfect_id_label=meter_label)
857
+ self.combine_results(Vrs, new_label=meter_label, new_score=score, new_is_perfect=True, new_diagnostic=_diag)
814
858
  if score >= old_score:
815
- Vrs.diagnostic = Diagnostic(perfect_id_label=meter_label)
859
+ Vrs.diagnostic = _diag
816
860
  self._ardha_stash = [] # perfect found; no need for imperfect pass
817
861
  return
818
862
  # same length but not perfect — stash without distance computation
@@ -875,137 +919,229 @@ class VerseTester(object):
875
919
  imperfect_label = best_label + f" ({suffix})"
876
920
 
877
921
  old_score = Vrs.identification_score
878
- self.combine_results(Vrs, new_label=imperfect_label, new_score=score)
922
+ _diag = Diagnostic(
923
+ perfect_id_label=imperfect_label,
924
+ imperfect_label_sanskrit=per_pada_sanskrit or None,
925
+ imperfect_label_english=per_pada_english or None,
926
+ problem_syllables=problem_syllables or None,
927
+ )
928
+ self.combine_results(Vrs, new_label=imperfect_label, new_score=score, new_diagnostic=_diag)
879
929
  if score >= old_score:
880
- Vrs.diagnostic = Diagnostic(
881
- perfect_id_label=imperfect_label,
882
- imperfect_label_sanskrit=per_pada_sanskrit or None,
883
- imperfect_label_english=per_pada_english or None,
884
- problem_syllables=problem_syllables or None,
885
- )
930
+ Vrs.diagnostic = _diag
931
+
886
932
 
933
+ def _upajAti_match_pada_exact(self, pada_len, gaRa_str):
934
+ """Exact regex attribution for one upajāti pāda against its own length's patterns.
887
935
 
888
- def evaluate_upajAti(self, Vrs):
936
+ Returns (meter_label, is_ajnata) where meter_label is the formatted label string
937
+ and is_ajnata is True if no pattern matched.
938
+ """
939
+ for gaRa_pattern in meter_patterns.samavfttas_by_family_and_gaRa[pada_len].keys():
940
+ if re.match(re.compile(gaRa_pattern), gaRa_str):
941
+ meter_label = meter_patterns.samavfttas_by_family_and_gaRa[pada_len][gaRa_pattern]
942
+ meter_label += ' [%d: %s]' % (
943
+ pada_len,
944
+ meter_patterns.choose_heavy_gaRa_pattern(gaRa_pattern)
945
+ )
946
+ return meter_label, False
947
+ meter_label = 'ajñātam [%d: %s]' % (pada_len, gaRa_str)
948
+ return meter_label, True
949
+
950
+ def _synthesize_upajAti_label(self, meter_labels, wbp_lens, unique_sorted_lens, family_lengths):
951
+ """Build (overall_meter_label, family, notable_label_dict) from per-pāda meter_labels.
952
+
953
+ overall_meter_label format: "upajāti triṣṭubh: upendravajrā 1,3; vātormī 2; indravajrā 4"
954
+ — subtypes sorted by pāda count desc, then first-occurrence asc; no syllable/gaṇa info.
955
+ notable_label_dict: {pada_num (1-based): bare_name} for all non-ajñātam pādas.
956
+ """
957
+ # Extract bare subtype name (strip " [len: gaṇas]" suffix).
958
+ def _bare_name(lbl):
959
+ return lbl.split(' [')[0]
960
+
961
+ # Build notable_label_dict and group pāda numbers by bare name.
962
+ notable_label_dict = {}
963
+ name_to_padas = {} # bare_name → [1-based pada nums], in order
964
+ for i, lbl in enumerate(meter_labels):
965
+ pada_num = i + 1
966
+ name = _bare_name(lbl)
967
+ if not name.startswith('ajñātam'):
968
+ notable_label_dict[pada_num] = name
969
+ name_to_padas.setdefault(name, []).append(pada_num)
970
+
971
+ # Sort groups: count desc, then first occurrence asc.
972
+ sorted_groups = sorted(
973
+ name_to_padas.items(),
974
+ key=lambda kv: (-len(kv[1]), kv[1][0])
975
+ )
976
+ combined_parts = [
977
+ '%s %s' % (name, ','.join(str(p) for p in padas))
978
+ for name, padas in sorted_groups
979
+ ]
980
+ combined_meter_labels = '; '.join(combined_parts)
981
+
982
+ # Pick family name from family_lengths: prefer 11, then 12, then smallest.
983
+ family_len = 11 if 11 in family_lengths else (12 if 12 in family_lengths else min(family_lengths))
984
+ family = meter_patterns.samavftta_family_names[family_len] if family_len < 27 else 'daṇḍaka'
985
+ if unique_sorted_lens == [11, 12]:
986
+ family = 'triṣṭubh + jagatī'
987
+
988
+ overall_meter_label = 'upajāti %s: %s' % (family, combined_meter_labels)
989
+ return overall_meter_label, family, notable_label_dict
990
+
991
+ def _upajAti_levenshtein_attribute_pada(self, pada_weights, family_lengths):
992
+ """Deferred-pass Levenshtein attribution for one upajāti pāda.
993
+
994
+ Tries every known pattern of a family-context length within
995
+ ARDHASAMAVFTTA_EDIT_DISTANCE_THRESHOLD. Returns
996
+ (meter_name, canonical_gaRa, canonical_weights, problem_indices, distance)
997
+ or None if no pattern is within threshold.
998
+ """
999
+ pada_len = len(pada_weights)
1000
+ best = None # (distance, meter_name, canonical_gaRa, canonical_weights, problem_indices)
1001
+ all_at_best = []
1002
+ for L_candidate in family_lengths:
1003
+ if abs(pada_len - L_candidate) > ARDHASAMAVFTTA_EDIT_DISTANCE_THRESHOLD:
1004
+ continue
1005
+ for canonical_gaRa, canonical_weights, meter_name, _regex_str in _upajAti_patterns_by_length.get(L_candidate, []):
1006
+ dist, prob_indices = _levenshtein_align_cached(pada_weights, canonical_weights)
1007
+ if dist > ARDHASAMAVFTTA_EDIT_DISTANCE_THRESHOLD:
1008
+ continue
1009
+ entry = (dist, meter_name, canonical_gaRa, canonical_weights, prob_indices)
1010
+ if best is None or dist < best[0]:
1011
+ best = entry
1012
+ all_at_best = [entry]
1013
+ elif dist == best[0]:
1014
+ all_at_best.append(entry)
1015
+ if best is None:
1016
+ return None
1017
+ # special case: indravajrā/upendravajrā are equidistant by design; record jointly
1018
+ names_at_best = {e[1] for e in all_at_best}
1019
+ if names_at_best == {'indravajrā', 'upendravajrā'}:
1020
+ indra = next(e for e in all_at_best if e[1] == 'indravajrā')
1021
+ upendra = next(e for e in all_at_best if e[1] == 'upendravajrā')
1022
+ joint_name = 'indravajrā / upendravajrā'
1023
+ joint_canonical = '%s / %s' % (indra[2], upendra[2])
1024
+ return (joint_name, joint_canonical, indra[3], indra[4], best[0])
1025
+ return (best[1], best[2], best[3], best[4], best[0])
1026
+
1027
+ def evaluate_upajAti(self, Vrs, perfect_only=True):
889
1028
  # sufficient length similarity already assured, now just evaluate
890
1029
 
891
1030
  wbp = Vrs.syllable_weights.split('\n') # weights by pāda
892
- wbp_lens_orig = [ len(line) for line in wbp ]
893
- wbp_lens = list(wbp_lens_orig)
894
- gs_to_id = Vrs.gaRa_abbreviations.split('\n')
1031
+ wbp_lens_orig = [len(line) for line in wbp]
1032
+ n_pAdas = min(len(wbp), 4)
1033
+ wbp = wbp[:4]
1034
+ wbp_lens = wbp_lens_orig[:4]
1035
+ gs_to_id = Vrs.gaRa_abbreviations.split('\n')[:4]
1036
+ missing_pAdas = max(0, 4 - n_pAdas)
1037
+
1038
+ unique_sorted_lens = sorted(set(wbp_lens))
895
1039
 
896
- # special exception for triṣṭubh-jagatī mix
1040
+ # Determine family lengths from most-frequent pāda length.
897
1041
  # see Karashima 2016 "The Triṣṭubh-Jagatī Verses in the Saddharmapuṇḍarīka"
898
- unique_sorted_lens = list(set(wbp_lens))
899
- unique_sorted_lens.sort()
900
-
901
- # track which original pada indices (0-based) are excluded
902
- excluded_indices = []
903
-
904
- if unique_sorted_lens != [11, 12]:
905
- # For non-triṣṭubh-jagatī mixes: drop pādas of non-majority length so
906
- # the identifier works on the largest consistent set.
907
- most_freq_pAda_len = max( set(wbp_lens), key=wbp_lens.count )
908
- to_exclude = []
909
- for i, weights in enumerate(wbp):
910
- if len(weights) != most_freq_pAda_len:
911
- to_exclude.append(i)
912
- excluded_indices = list(to_exclude)
913
- for i in reversed(to_exclude): # delete in descending index order, avoid index errors
914
- del wbp[i]
915
- del wbp_lens[i]
916
- del gs_to_id[i]
1042
+ most_freq_pAda_len = max(sorted(set(wbp_lens)), key=wbp_lens.count)
1043
+ if allow_only_trizwuB_and_jagatI_upajAti and most_freq_pAda_len not in (11, 12):
1044
+ return
1045
+ # family_lengths: the set of lengths to match against. Always includes 11
1046
+ # and/or 12 if present; length-deviant pādas go ajñātam → Lev rescue candidate.
1047
+ family_lengths = set()
1048
+ if 11 in wbp_lens:
1049
+ family_lengths.add(11)
1050
+ if 12 in wbp_lens:
1051
+ family_lengths.add(12)
1052
+ if not family_lengths:
1053
+ family_lengths = {most_freq_pAda_len}
917
1054
 
918
1055
  # Calculate maximum achievable score before doing any pattern work,
919
1056
  # and bail early if we can't beat the current best.
920
1057
  potential_score = meter_scores["upajāti, perfect"]
921
- if 11 not in wbp_lens: # no triṣṭubh (could be mixed with jagatī)
922
- potential_score -= 1
923
- if (
924
- len(wbp_lens) != 4 and
925
- unique_sorted_lens != [11, 12]
926
- ): # not perfect, less than 4 being analyzed
927
- potential_score -= 2
928
- if ( potential_score < Vrs.identification_score
929
- # not going to beat pre-existing result (e.g. 7 from imperfect samavftta)
930
- ) or ( disable_non_trizwuB_upajAti
931
- and potential_score < meter_scores["upajāti, imperfect"]
932
- ):
1058
+ if 11 not in wbp_lens:
1059
+ potential_score -= meter_scores["upajāti, penalty, jagati"]
1060
+ potential_score -= missing_pAdas * meter_scores["upajāti, penalty, per missing pāda"]
1061
+ if potential_score < Vrs.identification_score:
1062
+ # not going to beat pre-existing result (e.g. 7 from imperfect samavṛtta)
933
1063
  return
934
1064
 
935
- # Identify each remaining pāda individually and collect labels.
1065
+ # Identify each pāda individually. Exact match is restricted to family lengths;
1066
+ # length-deviant pādas go straight to ajñātam and become Lev rescue candidates.
936
1067
  meter_labels = []
1068
+ any_ajnata = False
1069
+ any_exact = False
1070
+ vikrta_count = 0
1071
+ vikrta_info = {} # pada_index (0-based) → (orig_len, canonical_len, problem_indices)
937
1072
  for i, g_to_id in enumerate(gs_to_id):
938
-
939
- for gaRa_pattern in meter_patterns.samavfttas_by_family_and_gaRa[wbp_lens[i]].keys():
940
-
941
- regex = re.compile(gaRa_pattern)
942
-
943
- if re.match(regex, g_to_id):
944
-
945
- meter_label = meter_patterns.samavfttas_by_family_and_gaRa[wbp_lens[i]][gaRa_pattern]
946
- meter_label += ' [%d: %s]' % (
947
- wbp_lens[i],
948
- meter_patterns.choose_heavy_gaRa_pattern(gaRa_pattern)
949
- )
950
- break
951
-
1073
+ if wbp_lens[i] in family_lengths:
1074
+ meter_label, is_ajnata = self._upajAti_match_pada_exact(wbp_lens[i], g_to_id)
952
1075
  else:
953
- meter_label = "ajñātam" # i.e., might need to add to meter_patterns
954
- meter_label += ' [%d: %s]' % ( wbp_lens[i], g_to_id )
955
-
1076
+ meter_label = 'ajñātam [%d: %s]' % (wbp_lens[i], g_to_id)
1077
+ is_ajnata = True
1078
+ if is_ajnata:
1079
+ any_ajnata = True
1080
+ if not perfect_only:
1081
+ lev_result = self._upajAti_levenshtein_attribute_pada(wbp[i], family_lengths)
1082
+ if lev_result is not None:
1083
+ meter_name, canonical_gaRa, canonical_weights, problem_indices, dist = lev_result
1084
+ meter_label = '%s [%d: %s]' % (meter_name, len(canonical_weights), canonical_gaRa)
1085
+ vikrta_count += 1
1086
+ vikrta_info[i] = (wbp_lens[i], len(canonical_weights), problem_indices, canonical_gaRa, dist)
1087
+ else:
1088
+ any_exact = True
956
1089
  meter_labels.append(meter_label)
957
1090
 
958
- unique_meter_labels = sorted(set(meter_labels)) # de-dupe, stable order
959
- combined_meter_labels = ', '.join(unique_meter_labels)
960
-
961
- # Assign score based on how complete and homogeneous the match is.
962
- family = meter_patterns.samavftta_family_names[wbp_lens[0]] if wbp_lens[0] < 27 else 'daṇḍaka'
963
- if (family == "triṣṭubh" and
964
- unique_meter_labels == ['indravajrā [11: ttjgg]', 'upendravajrā [11: jtjgg]']
965
- ):
966
- family = '' # clearer not to specify in this case
967
-
968
- if len(wbp_lens) == 4 and unique_sorted_lens == [11]: # triṣṭubh
969
- score = meter_scores["upajāti, perfect"]
970
- elif unique_sorted_lens == [11, 12]:
971
- score = meter_scores["upajāti, triṣṭubh-jagatī-saṃkara, perfect"]
972
- family = "triṣṭubh-jagatī-saṃkara?" # overwrite
973
- elif len(wbp_lens) == 4 and 11 not in unique_sorted_lens:
974
- score = meter_scores["upajāti, non-triṣṭubh, perfect"]
975
- elif len(wbp_lens) in [2,3] and wbp_lens.count(11) == len(wbp_lens): # triṣṭubh
976
- score = meter_scores["upajāti, imperfect"]
977
- elif len(wbp_lens) in [2,3] and 11 not in wbp_lens:
978
- score = meter_scores["upajāti, non-triṣṭubh, imperfect"]
979
- else:
980
- score = meter_scores["none found"]
1091
+ # forward pass: flag for deferred Levenshtein if any pāda is ajñātam
1092
+ # but only if at least one matched exactly (verse is plausibly upajāti)
1093
+ if perfect_only and any_ajnata and any_exact:
1094
+ self._upajAti_needs_lev = True
981
1095
 
982
- # Extra penalties for especially weak upajāti results.
983
- if len(wbp_lens) == 2:
984
- score -= 1 # two pādas excluded instead of one
985
- if all(lbl.startswith('ajñātam') for lbl in meter_labels):
986
- score -= 1
1096
+ overall_meter_label, family, notable_label_dict = self._synthesize_upajAti_label(
1097
+ meter_labels, wbp_lens, unique_sorted_lens, family_lengths
1098
+ )
1099
+
1100
+ score = meter_scores["upajāti, perfect"]
1101
+ if 11 not in wbp_lens:
1102
+ score -= meter_scores["upajāti, penalty, jagati"]
1103
+ score -= missing_pAdas * meter_scores["upajāti, penalty, per missing pāda"]
1104
+ ajnatam_count = sum(1 for lbl in meter_labels if lbl.startswith('ajñātam'))
1105
+ # vikṛta-rescued pādas carry the same penalty as ajñātam until calibration
1106
+ # introduces a dedicated vikṛtavṛtta penalty (Step 2)
1107
+ score -= (ajnatam_count + vikrta_count) * meter_scores["upajāti, penalty, per ajñātam pāda"]
987
1108
 
988
1109
  imperfect_note = len(wbp_lens) != 4 and unique_sorted_lens != [11, 12]
989
- overall_meter_label = "upajāti %s: %s" % (
990
- family,
991
- combined_meter_labels
992
- )
993
1110
 
994
- # Build diagnostic: excluded pādas are flagged as hyper/hypometric relative
995
- # to the majority length; included pādas contribute no error entry.
996
- most_freq_len = wbp_lens[0] if wbp_lens else None
1111
+ # Build diagnostic from per-pāda attribution results.
997
1112
  problem_syllables = {}
998
1113
  per_pada_sanskrit = {}
999
1114
  per_pada_english = {}
1115
+ canonical_gana = {}
1000
1116
  for pada_num in range(1, 5):
1001
- orig_len = wbp_lens_orig[pada_num - 1] if pada_num - 1 < len(wbp_lens_orig) else None
1002
- if pada_num - 1 in excluded_indices:
1003
- syls = list(range(orig_len)) if orig_len is not None else []
1117
+ i = pada_num - 1
1118
+ lbl = meter_labels[i] if i < len(meter_labels) else None
1119
+ if lbl and lbl.startswith('ajñātam'):
1120
+ orig_len = wbp_lens[i]
1121
+ syls = list(range(orig_len))
1004
1122
  problem_syllables[pada_num] = syls
1005
- if orig_len is not None and most_freq_len is not None:
1006
- hyper = orig_len > most_freq_len
1123
+ hyper = orig_len > most_freq_pAda_len
1124
+ per_pada_sanskrit[pada_num] = 'adhikākṣarā' if hyper else 'ūnākṣarā'
1125
+ per_pada_english[pada_num] = 'hypermetric' if hyper else 'hypometric'
1126
+ elif i in vikrta_info:
1127
+ orig_len, canonical_len, problem_indices, vikrta_canonical_gaRa, vikrta_dist = vikrta_info[i]
1128
+ if orig_len != canonical_len:
1129
+ # length-deviant vikṛta: flag as hyper/hypometric
1130
+ hyper = orig_len > canonical_len
1007
1131
  per_pada_sanskrit[pada_num] = 'adhikākṣarā' if hyper else 'ūnākṣarā'
1008
1132
  per_pada_english[pada_num] = 'hypermetric' if hyper else 'hypometric'
1133
+ # Only pinpoint the gap when dist==1; higher distances mean additional weight
1134
+ # mismatches that make the gap position unreliable.
1135
+ if vikrta_dist == 1 and problem_indices:
1136
+ problem_syllables[pada_num] = list(problem_indices)
1137
+ canonical_gana[pada_num] = vikrta_canonical_gaRa
1138
+ else:
1139
+ problem_syllables[pada_num] = list(range(orig_len))
1140
+ elif problem_indices:
1141
+ # same-length vikṛta: flag the specific mismatched positions
1142
+ per_pada_sanskrit[pada_num] = 'vikṛtavṛtta'
1143
+ per_pada_english[pada_num] = 'vikrtavrtta'
1144
+ problem_syllables[pada_num] = list(problem_indices)
1009
1145
 
1010
1146
  # Append per-pāda imperfect notes to label.
1011
1147
  length_notes = [f"pāda {p} {v}" for p, v in per_pada_sanskrit.items()]
@@ -1013,27 +1149,42 @@ class VerseTester(object):
1013
1149
  overall_meter_label += " (%s)" % "; ".join(length_notes)
1014
1150
 
1015
1151
  if not per_pada_english and not imperfect_note:
1016
- diagnostic = Diagnostic(perfect_id_label=overall_meter_label)
1152
+ diagnostic = Diagnostic(
1153
+ perfect_id_label=overall_meter_label,
1154
+ notable_label=notable_label_dict or None,
1155
+ )
1017
1156
  elif not imperfect_note:
1018
1157
  diagnostic = Diagnostic(
1019
1158
  perfect_id_label=overall_meter_label,
1020
1159
  imperfect_label_sanskrit=per_pada_sanskrit or None,
1021
1160
  imperfect_label_english=per_pada_english or None,
1022
1161
  problem_syllables=problem_syllables or None,
1162
+ notable_label=notable_label_dict or None,
1163
+ canonical_gana=canonical_gana or None,
1023
1164
  )
1024
1165
  else:
1025
1166
  diagnostic = Diagnostic(
1026
1167
  imperfect_label_sanskrit=per_pada_sanskrit or None,
1027
1168
  imperfect_label_english=per_pada_english or None,
1028
1169
  problem_syllables=problem_syllables or None,
1170
+ notable_label=notable_label_dict or None,
1171
+ canonical_gana=canonical_gana or None,
1029
1172
  )
1030
1173
 
1031
- # score arbitration: may tie with pre-existing result (e.g., samavṛtta)
1174
+ # score arbitration: may tie with pre-existing result (e.g., samavṛtta).
1175
+ # Deferred pass overwrites the forward-pass placeholder directly (same
1176
+ # identification refined, not a new competitor).
1032
1177
  old_score = Vrs.identification_score
1033
1178
  is_perfect = not imperfect_note and not per_pada_english
1034
- self.combine_results(Vrs, overall_meter_label, score, new_is_perfect=is_perfect)
1035
- if score >= old_score:
1179
+ if not perfect_only and Vrs.meter_label is not None and Vrs.meter_label.startswith('upajāti'):
1180
+ Vrs.meter_label = overall_meter_label
1181
+ Vrs.identification_score = score
1182
+ Vrs.is_perfect = is_perfect
1036
1183
  Vrs.diagnostic = diagnostic
1184
+ else:
1185
+ self.combine_results(Vrs, overall_meter_label, score, new_is_perfect=is_perfect, new_diagnostic=diagnostic)
1186
+ if score >= old_score:
1187
+ Vrs.diagnostic = diagnostic
1037
1188
 
1038
1189
 
1039
1190
  def is_vizamavftta(self, Vrs, perfect_only=False):
@@ -1123,82 +1274,17 @@ class VerseTester(object):
1123
1274
  imperfect_label = best_label + f" ({suffix})"
1124
1275
 
1125
1276
  old_score = Vrs.identification_score
1126
- self.combine_results(Vrs, new_label=imperfect_label, new_score=score)
1277
+ _diag = Diagnostic(
1278
+ perfect_id_label=imperfect_label,
1279
+ imperfect_label_sanskrit=per_pada_sanskrit or None,
1280
+ imperfect_label_english=per_pada_english or None,
1281
+ problem_syllables=problem_syllables or None,
1282
+ )
1283
+ self.combine_results(Vrs, new_label=imperfect_label, new_score=score, new_diagnostic=_diag)
1127
1284
  if score >= old_score:
1128
- Vrs.diagnostic = Diagnostic(
1129
- perfect_id_label=imperfect_label,
1130
- imperfect_label_sanskrit=per_pada_sanskrit or None,
1131
- imperfect_label_english=per_pada_english or None,
1132
- problem_syllables=problem_syllables or None,
1133
- )
1285
+ Vrs.diagnostic = _diag
1134
1286
  return True
1135
1287
 
1136
- def test_as_samavftta_etc(self, Vrs):
1137
-
1138
- wbp = Vrs.syllable_weights.split('\n') # weights by pāda
1139
- wbp_lens = [ len(line) for line in wbp ]
1140
-
1141
- # make sure either full four pādas or one and single-pāda mode
1142
- if len(wbp) >= 4 or (
1143
- len(wbp) == 1 and self.resplit_option == "single_pAda"
1144
- ):
1145
- pass
1146
- else:
1147
- return 0
1148
-
1149
- self.count_pAdasamatva(Vrs) # [0,2,3,4]
1150
-
1151
- # test in following order to prioritize left-right presentation of ties
1152
- # ties managed in self.combine_results()
1153
-
1154
- # test perfect samavṛtta
1155
- if self.pAdasamatva_count == 4:
1156
- # definitely checks out, id_score == 9
1157
- timed('samavftta')(self.evaluate_samavftta)(Vrs)
1158
- return 1 # max score already reached
1159
-
1160
-
1161
-
1162
- # test perfect single pāda of samavṛtta
1163
- if ( self.pAdasamatva_count == 0 and self.resplit_option == "single_pAda"):
1164
- timed('samavftta')(self.evaluate_samavftta)(Vrs)
1165
-
1166
- # test perfect viṣamavṛtta (Levenshtein for imperfect deferred to imperfect pass)
1167
- if self.pAdasamatva_count == 0 and timed('vizamavftta')(self.is_vizamavftta)(Vrs, perfect_only=True):
1168
- # will give id_score == 9
1169
- # label and score already set in is_vizamavftta if test was successful
1170
- return 1 # max score already reached
1171
-
1172
- # test perfect upajāti
1173
-
1174
- unique_sorted_lens = list(set(wbp_lens))
1175
- unique_sorted_lens.sort()
1176
- if len(unique_sorted_lens) == 1: # all same length
1177
- # will give id_score in [8, 7], may tie with above
1178
- timed('upajAti')(self.evaluate_upajAti)(Vrs)
1179
- if Vrs.identification_score == 8: return 1 # best score compared to below
1180
- # otherwise, max score not necessarily yet reached, don't return
1181
-
1182
- # test imperfect samavftta (Levenshtein for length errors deferred to imperfect pass)
1183
- if self.pAdasamatva_count in [2, 3]:
1184
- # will give id_score in [7, 6], may tie with above
1185
- timed('samavftta')(self.evaluate_samavftta)(Vrs, perfect_only=True)
1186
- # max score not necessarily yet reached, don't return
1187
-
1188
- # test imperfect upajāti
1189
- if (
1190
- len( list(set(wbp_lens)) ) in [2, 3] or
1191
- unique_sorted_lens == [11, 12]
1192
- ): # either not all same length or triṣṭubh-jagatī mix
1193
- # will give id_score in [6, 5, 4], may tie with above
1194
- timed('upajAti')(self.evaluate_upajAti)(Vrs)
1195
-
1196
- # return success
1197
- if Vrs.meter_label != None:
1198
- return 1
1199
- else:
1200
- return 0
1201
-
1202
1288
  def test_as_jAti(self, Vrs):
1203
1289
  """
1204
1290
  Determines whether verse is of jāti (mātrāvṛtta) type.
@@ -1239,32 +1325,80 @@ class VerseTester(object):
1239
1325
  close1 = abs(eff1 - std_ardha[0]) <= 1
1240
1326
  close2 = abs(eff2 - std_ardha[1]) <= 1
1241
1327
  if close1 and close2:
1242
- jati_label = jAti_name + " (%s)" % quarter_label
1328
+ jati_label = jAti_name
1243
1329
  likely_score = meter_scores["jāti, likely"]
1244
1330
  if likely_score > Vrs.identification_score:
1245
1331
  per_pada_sanskrit = {}
1246
1332
  per_pada_english = {}
1247
- # Attribute ardha-level mora error to the ardha-final (even) pāda.
1333
+ # Attribute ardha-level mora error to the even pāda key, but label by ardha.
1248
1334
  ardha_morae_pairs = [
1249
- (m1, std_ardha[0], 2),
1250
- (m2, std_ardha[1], 4),
1335
+ (m1, std_ardha[0], 1, ardha1_w, 2),
1336
+ (m2, std_ardha[1], 2, ardha2_w, 4),
1251
1337
  ]
1252
- for actual, expected, even_pada in ardha_morae_pairs:
1253
- hyper = actual > expected
1254
- per_pada_sanskrit[even_pada] = 'adhikamātrā' if hyper else 'ūnamātrā'
1255
- per_pada_english[even_pada] = f"ardha mora count off from expected {expected}"
1338
+ for actual, expected, ardha_num, ardha_w, even_pada in ardha_morae_pairs:
1339
+ anceps_ok = actual == expected - 1 and ardha_w[-1:] == 'l'
1340
+ if actual != expected and not anceps_ok:
1341
+ hyper = actual > expected
1342
+ per_pada_sanskrit[even_pada] = f"ardha {ardha_num}: " + ('adhikamātrā' if hyper else 'ūnamātrā') + f", {expected}→{actual}"
1343
+ per_pada_english[even_pada] = f"ardha {ardha_num} mora count off from expected {expected}"
1256
1344
  # Build meter_label suffix from the per-ardha directions.
1257
- sa_vals = list(per_pada_sanskrit.values())
1258
- if len(set(sa_vals)) == 1:
1259
- suffix = sa_vals[0]
1345
+ ardha_labels = [
1346
+ (ardha_num, per_pada_sanskrit[even_pada])
1347
+ for ardha_num, even_pada in [(1, 2), (2, 4)]
1348
+ if even_pada in per_pada_sanskrit
1349
+ ]
1350
+ if not ardha_labels:
1351
+ suffix = 'asamīcīnā'
1352
+ else:
1353
+ suffix = '; '.join(v for _, v in ardha_labels)
1354
+ # Decompose all ardhas for gaṇa abbreviations and problem syllable pinpointing.
1355
+ g8_morae = 4 if jAti_name == 'āryāgīti' else 2
1356
+ ardha1_ganas = _decompose_into_mAtragaNas(ardha1_w, g6_ardha1, g8_morae)
1357
+ ardha2_ganas = _decompose_into_mAtragaNas(ardha2_w, g6_ardha2, g8_morae)
1358
+ names = meter_patterns.mAtragaNa_names
1359
+ def _ganas_to_abbrevs(ganas):
1360
+ return ' '.join(names.get(g, g) for g in ganas)
1361
+ def _split_ardha_ganas(ganas, pada_a_syl_count):
1362
+ cur = 0
1363
+ for i, g in enumerate(ganas):
1364
+ if cur >= pada_a_syl_count:
1365
+ return _ganas_to_abbrevs(ganas[:i]), _ganas_to_abbrevs(ganas[i:])
1366
+ cur += len(g)
1367
+ return _ganas_to_abbrevs(ganas), ''
1368
+ if len(w_p) >= 4:
1369
+ p1a, p1b = _split_ardha_ganas(ardha1_ganas, len(w_p[0]))
1370
+ p2a, p2b = _split_ardha_ganas(ardha2_ganas, len(w_p[2]))
1371
+ mAtragaNa_abbrevs = '\n'.join([p1a, p1b, p2a, p2b])
1260
1372
  else:
1261
- suffix = '; '.join(f"ardha {i+1}: {v}" for i, v in enumerate(sa_vals))
1373
+ mAtragaNa_abbrevs = '\n'.join([_ganas_to_abbrevs(ardha1_ganas), _ganas_to_abbrevs(ardha2_ganas)])
1374
+ problem_syllables = {}
1375
+ for actual, expected, ardha_num, ardha_w, even_pada in ardha_morae_pairs:
1376
+ anceps_ok = actual == expected - 1 and ardha_w[-1:] == 'l'
1377
+ if actual == expected or anceps_ok:
1378
+ continue
1379
+ g6 = g6_ardha1 if ardha_num == 1 else g6_ardha2
1380
+ ganas = ardha1_ganas if ardha_num == 1 else ardha2_ganas
1381
+ err = _validate_jAti_gaNas(ganas, g6, jAti_name, ardha_num)
1382
+ if err:
1383
+ _, bad_syls = err
1384
+ # map ardha-level offsets to pāda-level
1385
+ pada_a = ardha_num * 2 - 1
1386
+ pada_b = ardha_num * 2
1387
+ pada_a_len = len(w_p[pada_a - 1]) if len(w_p) >= 4 else 0
1388
+ a_syls = [i for i in bad_syls if i < pada_a_len]
1389
+ b_syls = [i - pada_a_len for i in bad_syls if i >= pada_a_len]
1390
+ if a_syls: problem_syllables[pada_a] = a_syls
1391
+ if b_syls: problem_syllables[pada_b] = b_syls
1392
+ if not a_syls and not b_syls:
1393
+ problem_syllables[pada_b] = bad_syls
1262
1394
  Vrs.meter_label = jati_label + f" ({suffix})"
1263
1395
  Vrs.identification_score = likely_score
1264
1396
  Vrs.is_perfect = False
1397
+ Vrs.mAtragaNa_abbreviations = mAtragaNa_abbrevs
1265
1398
  Vrs.diagnostic = Diagnostic(
1266
1399
  imperfect_label_sanskrit=per_pada_sanskrit or None,
1267
1400
  imperfect_label_english=per_pada_english or None,
1401
+ problem_syllables=problem_syllables or None,
1268
1402
  )
1269
1403
  continue
1270
1404
 
@@ -1423,7 +1557,7 @@ class VerseTester(object):
1423
1557
  parts = [s for s in [ardha1_str, ardha2_str] if s]
1424
1558
  imperfect_label_sa = '; '.join(parts) if parts else _gana_error_sanskrit((err1 or err2)[0])
1425
1559
 
1426
- jati_label = jAti_name + " (%s)" % quarter_label
1560
+ jati_label = jAti_name
1427
1561
  jati_score = meter_scores["jāti, imperfect"]
1428
1562
  # penalise pāda mora mismatches so that resplit attempts with better
1429
1563
  # pāda alignment score higher and win arbitration in combine_results
@@ -1433,7 +1567,7 @@ class VerseTester(object):
1433
1567
  anceps_ok = (is_ardha_final and actual == expected - 1
1434
1568
  and w_p[pi] and w_p[pi][-1] == 'l')
1435
1569
  if actual != expected and not anceps_ok:
1436
- jati_score -= 1
1570
+ jati_score -= meter_scores["jāti, penalty, per mora-mismatched pāda"]
1437
1571
  if jati_score >= Vrs.identification_score:
1438
1572
  Vrs.meter_label = jati_label + f" ({imperfect_label_sa})"
1439
1573
  Vrs.identification_score = jati_score
@@ -1447,7 +1581,7 @@ class VerseTester(object):
1447
1581
  return 1
1448
1582
 
1449
1583
  # Gaṇa rules passed — check whether pāda-level morae also match.
1450
- jati_label = jAti_name + " (%s)" % quarter_label
1584
+ jati_label = jAti_name
1451
1585
  def quarters_ok(actual, expected, weights):
1452
1586
  if len(actual) < 4 or len(weights) < 4:
1453
1587
  return False
@@ -1504,45 +1638,90 @@ class VerseTester(object):
1504
1638
  Runs through various possible meter types with respective identification_scores:
1505
1639
  zloka
1506
1640
  9 two zloka halves, both perfect
1507
- 8 two zloka halves, one perfect and one imperfect
1508
- (not currently supported: two zloka halves, both imperfect)
1641
+ 7 two zloka halves, one perfect and one imperfect
1642
+ 5 two zloka halves, both imperfect
1509
1643
  9 one zloka half, perfect
1510
- (not currently supported: one zloka half, imperfect)
1511
1644
  samavftta, upajAti, vizamavftta, ardhasamavftta
1512
- 9 vizamavftta perfect (trivial, in progress)
1645
+ 9 vizamavftta perfect
1513
1646
  (currently not supported: 5 vizamavftta imperfect)
1514
- (currently not supported but planned: 9 ardhasamavftta perfect)
1515
- (currently not supported: 5 ardhasamavftta imperfect)
1647
+ 9 ardhasamavftta perfect
1648
+ 7 ardhasamavftta imperfect
1516
1649
  9 samavftta perfect
1517
- 8 upajAti perfect trizwuB
1650
+ 8 upajAti perfect (4 pAdas, triṣṭubh/jagatī/mix)
1518
1651
  7 samavftta imperfect (2-3 lines match)
1519
- 7 upajAti perfect non-trizwuB
1520
- 6 upajAti imperfect trizwuB
1521
- 5 upajAti imperfect non-trizwuB
1652
+ 6 upajAti imperfect (2-3 pAdas)
1522
1653
  jAti
1523
- 8 jAti perfect
1524
- (currently not supported but planned: 5 jAti imperfect)
1654
+ 9 jAti perfect
1655
+ 6 jAti imperfect
1656
+ 4 jAti likely (±1 mora)
1525
1657
 
1526
1658
  Embeds identification results as Verse.meter_label and Verse.identification_score.
1527
- Returns string corresponding to Verse.meter_label. - currently
1528
- Returns int result 1 if successul and 0 if not. - planned
1529
1659
  """
1530
1660
 
1531
1661
  self.identification_attempt_count += 1
1532
1662
  self._samavftta_has_length_error = False
1663
+ self._upajAti_needs_lev = False
1533
1664
 
1534
1665
  # anuzwuB
1535
1666
  success_anuzwuB = timed('anuzwuB')(self.test_as_anuzwuB)(Vrs)
1536
1667
  if success_anuzwuB and Vrs.identification_score == meter_scores["max score"]:
1537
1668
  return 1
1538
1669
 
1539
- # samavftta, upajAti, vizamavftta
1540
- _inner_keys = ('samavftta', 'upajAti', 'vizamavftta')
1541
- _pre_inner = {k: _section_totals.get(k, 0.0) for k in _inner_keys} if _DEBUG_TIMING else None
1542
- success_samavftta_etc = timed('samavftta_etc')(self.test_as_samavftta_etc)(Vrs)
1670
+ # samavṛtta / upajāti / viṣamavṛtta. The `samavftta_etc` bucket captures
1671
+ # dispatcher overhead (count_pAdasamatva + gate evaluation) by bracketing
1672
+ # the whole block and subtracting the inner timed buckets.
1673
+ _etc_t0 = _time.perf_counter() if _DEBUG_TIMING else None
1674
+ _etc_inner_keys = ('samavftta', 'upajAti', 'vizamavftta')
1675
+ _pre_etc_inner = (
1676
+ {k: _section_totals.get(k, 0.0) for k in _etc_inner_keys}
1677
+ if _DEBUG_TIMING else None
1678
+ )
1679
+ wbp_lens = [len(line) for line in Vrs.syllable_weights.split('\n')]
1680
+ success_samavftta_etc = 0
1681
+ if len(wbp_lens) >= 4 or (len(wbp_lens) == 1 and self.resplit_option == "single_pAda"):
1682
+ self.count_pAdasamatva(Vrs) # populates self.pAdasamatva_count in [0,2,3,4]
1683
+
1684
+ # perfect samavṛtta
1685
+ if self.pAdasamatva_count == 4:
1686
+ timed('samavftta')(self.evaluate_samavftta)(Vrs)
1687
+ success_samavftta_etc = 1
1688
+ else:
1689
+ # single-pāda samavṛtta (perfect)
1690
+ if self.pAdasamatva_count == 0 and self.resplit_option == "single_pAda":
1691
+ timed('samavftta')(self.evaluate_samavftta)(Vrs)
1692
+
1693
+ # perfect viṣamavṛtta (Levenshtein for imperfect deferred below)
1694
+ if self.pAdasamatva_count == 0 and timed('vizamavftta')(self.is_vizamavftta)(Vrs, perfect_only=True):
1695
+ success_samavftta_etc = 1
1696
+
1697
+ # perfect upajāti: all pādas same length
1698
+ unique_sorted_lens = sorted(set(wbp_lens[:4]))
1699
+ if len(unique_sorted_lens) == 1:
1700
+ timed('upajAti')(self.evaluate_upajAti)(Vrs)
1701
+ if Vrs.identification_score == 8:
1702
+ success_samavftta_etc = 1
1703
+
1704
+ # imperfect samavṛtta (Levenshtein for length errors deferred below)
1705
+ if self.pAdasamatva_count in [2, 3]:
1706
+ timed('samavftta')(self.evaluate_samavftta)(Vrs, perfect_only=True)
1707
+
1708
+ # imperfect upajāti: mixed lengths — after samavṛtta so its score
1709
+ # can trigger the potential_score bail inside evaluate_upajAti
1710
+ if len(unique_sorted_lens) in [2, 3] or unique_sorted_lens == [11, 12]:
1711
+ timed('upajAti')(self.evaluate_upajAti)(Vrs)
1712
+
1713
+ if Vrs.meter_label is not None:
1714
+ success_samavftta_etc = 1
1715
+
1543
1716
  if _DEBUG_TIMING:
1544
- inner_delta = sum(_section_totals.get(k, 0.0) - _pre_inner[k] for k in _inner_keys)
1545
- _section_totals['samavftta_etc'] -= inner_delta
1717
+ _etc_elapsed = _time.perf_counter() - _etc_t0
1718
+ _etc_inner_delta = sum(
1719
+ _section_totals.get(k, 0.0) - _pre_etc_inner[k] for k in _etc_inner_keys
1720
+ )
1721
+ _section_totals['samavftta_etc'] = (
1722
+ _section_totals.get('samavftta_etc', 0.0) + _etc_elapsed - _etc_inner_delta
1723
+ )
1724
+
1546
1725
  if success_samavftta_etc and Vrs.identification_score >= 8:
1547
1726
  return 1
1548
1727
  # i.e., if upajāti or anything imperfect, also continue on to check jāti
@@ -1568,6 +1747,8 @@ class VerseTester(object):
1568
1747
  # imperfect pass: deferred Levenshtein annotation for samavftta length errors.
1569
1748
  if self._samavftta_has_length_error:
1570
1749
  timed('lev_samavftta')(self.evaluate_samavftta)(Vrs)
1750
+ if self._upajAti_needs_lev:
1751
+ timed('lev_upajAti')(self.evaluate_upajAti)(Vrs, perfect_only=False)
1571
1752
 
1572
1753
  if success_anuzwuB or success_samavftta_etc or success_jAti or Vrs.identification_score >= meter_scores["ardhasamavṛtta, perfect"]:
1573
1754
  return 1
@@ -1782,12 +1963,9 @@ class MeterIdentifier(object):
1782
1963
  n_breaks = n_pAdas - 1
1783
1964
  total = len(syllable_list)
1784
1965
 
1785
- # Seed each break: prefer user-provided positions, fall back to canonical.
1786
1966
  canonical_seeds = [pada_len * (i + 1) for i in range(n_breaks)]
1787
1967
  seeds = list(user_seeds) if user_seeds else canonical_seeds
1788
1968
 
1789
- # For each break, build the list of candidate positions:
1790
- # either locked to seed (keep_midpoint) or all positions in [seed-tol, seed+tol].
1791
1969
  def candidates(break_idx):
1792
1970
  seed = seeds[break_idx]
1793
1971
  if break_idx in keep_mid_breaks:
@@ -1826,9 +2004,7 @@ class MeterIdentifier(object):
1826
2004
  seg_len = pos - prev
1827
2005
  if not (pada_len - tol <= seg_len <= pada_len + tol):
1828
2006
  continue
1829
- # check remaining syllables can form valid pādas
1830
2007
  remaining = total - pos
1831
- remaining_breaks = n_breaks - break_idx - 1
1832
2008
  remaining_pAdas = n_pAdas - break_idx - 1
1833
2009
  min_remaining = remaining_pAdas * (pada_len - tol)
1834
2010
  max_remaining = remaining_pAdas * (pada_len + tol)
@@ -1848,18 +2024,14 @@ class MeterIdentifier(object):
1848
2024
  pada_len = 8
1849
2025
  n_breaks = 5
1850
2026
 
1851
- # Derive user seeds from punctuation/newlines when available,
1852
- # mirroring the seeding logic in wiggle_identify.
1853
2027
  user_seeds = None
1854
2028
  if len(newline_indices) == n_breaks:
1855
2029
  if resplit_option in ('none', 'resplit_lite'):
1856
- # all breaks provided — seed all five from user positions
1857
2030
  user_seeds = [
1858
2031
  text_syllabified[:newline_indices[i]].count(scansion_syllable_separator)
1859
2032
  for i in range(n_breaks)
1860
2033
  ]
1861
2034
  elif resplit_option == 'resplit_max' and VrsTster.resplit_keep_midpoint:
1862
- # seed bc (idx 1) and de (idx 3) from user positions, wiggle the rest
1863
2035
  canonical = [pada_len * (i + 1) for i in range(n_breaks)]
1864
2036
  canonical[1] = text_syllabified[:newline_indices[1]].count(scansion_syllable_separator)
1865
2037
  canonical[3] = text_syllabified[:newline_indices[3]].count(scansion_syllable_separator)
@@ -1936,10 +2108,7 @@ class MeterIdentifier(object):
1936
2108
  self.Scanner = S = Sc()
1937
2109
 
1938
2110
  if _DEBUG_TIMING:
1939
- _pre_keys = ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana',
1940
- 'anuzwuB', 'ardhatraya', 'samavftta', 'upajAti', 'vizamavftta',
1941
- 'ardhasamavftta_perfect', 'jAti', 'lev_samavftta', 'lev_ardha', 'lev_vizama', 'samavftta_etc')
1942
- _pre = {k: _section_totals.get(k, 0.0) for k in _pre_keys}
2111
+ _pre = {k: _section_totals.get(k, 0.0) for k in _TIMING_KEYS}
1943
2112
 
1944
2113
  # gets back mostly populated Verse object
1945
2114
  V = S.scan(rw_str, from_scheme=from_scheme)
@@ -2058,7 +2227,7 @@ class MeterIdentifier(object):
2058
2227
  best_total_dist = total_dist
2059
2228
  best_entry = (_stash_wbp, _label, _odd_can, _even_can, _stash_tsyl, _stash_gaRa, _stash_morae)
2060
2229
  if best_entry is not None:
2061
- ardha_score = meter_scores["ardhasamavṛtta, imperfect"] - (best_total_dist - 1)
2230
+ ardha_score = meter_scores["ardhasamavṛtta, imperfect"] - (best_total_dist - meter_scores["levenshtein distance penalty"])
2062
2231
  if ardha_score > best_current_score:
2063
2232
  best_stash_wbp, best_label, best_odd_can, best_even_can, best_stash_tsyl, best_stash_gaRa, best_stash_morae = best_entry
2064
2233
  problem_syllables = {}
@@ -2124,7 +2293,7 @@ class MeterIdentifier(object):
2124
2293
  best_total_dist = total_dist
2125
2294
  best_entry = (_wbp, _label, _canonicals, _tsyl, _gaRa, _morae)
2126
2295
  if best_entry is not None:
2127
- vizama_score = meter_scores["viṣamavṛtta, imperfect"] - (best_total_dist - 1)
2296
+ vizama_score = meter_scores["viṣamavṛtta, imperfect"] - (best_total_dist - meter_scores["levenshtein distance penalty"])
2128
2297
  if vizama_score > best_current_score:
2129
2298
  best_wbp, best_label, best_canonicals, best_tsyl, best_gaRa, best_morae = best_entry
2130
2299
  problem_syllables = {}
@@ -2181,11 +2350,8 @@ class MeterIdentifier(object):
2181
2350
  V.identification_score = meter_scores["none found"]
2182
2351
 
2183
2352
  if _DEBUG_TIMING:
2184
- all_keys = ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana',
2185
- 'anuzwuB', 'ardhatraya', 'samavftta', 'upajAti', 'vizamavftta',
2186
- 'ardhasamavftta_perfect', 'jAti', 'lev_samavftta', 'lev_ardha', 'lev_vizama', 'samavftta_etc')
2187
- verse_times = {k: _section_totals.get(k, 0.0) - _pre[k] for k in all_keys}
2188
- verse_times['scan'] = sum(verse_times[k] for k in ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana'))
2353
+ verse_times = {k: _section_totals.get(k, 0.0) - _pre[k] for k in _TIMING_KEYS}
2354
+ verse_times['scan'] = sum(verse_times[k] for k in _SCAN_KEYS)
2189
2355
  cat = _meter_label_to_category(V.meter_label)
2190
2356
  bucket = _category_totals.setdefault(cat, {})
2191
2357
  for k, v in verse_times.items():
@@ -2241,11 +2407,8 @@ def _identify_meter_worker(args):
2241
2407
  import skrutable.meter_identification as _mi
2242
2408
  _mi._DEBUG_TIMING = True
2243
2409
  MI = MeterIdentifier()
2244
- all_keys = ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana',
2245
- 'anuzwuB', 'ardhatraya', 'samavftta', 'upajAti', 'vizamavftta',
2246
- 'ardhasamavftta_perfect', 'jAti', 'lev_samavftta', 'lev_ardha', 'lev_vizama', 'samavftta_etc')
2247
2410
  if debug_timing:
2248
- pre = {k: _section_totals.get(k, 0.0) for k in all_keys}
2411
+ pre = {k: _section_totals.get(k, 0.0) for k in _TIMING_KEYS}
2249
2412
  pre_wiggle = _section_totals.get('wiggle_count', 0)
2250
2413
  V = MI.identify_meter(
2251
2414
  rw_str,
@@ -2254,8 +2417,8 @@ def _identify_meter_worker(args):
2254
2417
  from_scheme=from_scheme,
2255
2418
  )
2256
2419
  if debug_timing:
2257
- verse_times = {k: _section_totals.get(k, 0.0) - pre[k] for k in all_keys}
2258
- verse_times['scan'] = sum(verse_times[k] for k in ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana'))
2420
+ verse_times = {k: _section_totals.get(k, 0.0) - pre[k] for k in _TIMING_KEYS}
2421
+ verse_times['scan'] = sum(verse_times[k] for k in _SCAN_KEYS)
2259
2422
  verse_times['wiggle_count'] = _section_totals.get('wiggle_count', 0) - pre_wiggle
2260
2423
  cat = _meter_label_to_category(V.meter_label)
2261
2424
  return V, verse_times, cat
@@ -313,8 +313,8 @@ samavfttas_by_family_and_gaRa = {
313
313
  22: {
314
314
  'mmtnnns(g|l)' : 'haṃsī', # also mmggnnnngg
315
315
  'tByjsrn(g|l)' : 'aśvadhāṭī',
316
- 'Brnrnrn(g|l)' : 'madraka'
317
- },
316
+ 'Brnrnrn(g|l)' : 'madraka',
317
+ },
318
318
 
319
319
  23: {
320
320
  'njBjBjBl(g|l)' : 'adritanayā',
@@ -40,6 +40,7 @@ class Verse(object):
40
40
  self.meter_label = None # string
41
41
  self.identification_score = 0 # int
42
42
  self.diagnostic = None # Diagnostic or dict of Diagnostics, set by meter_identification
43
+ self.alternatives = [] # list of {'meter_label': str, 'diagnostic': ...} for atha-vā ties
43
44
 
44
45
  def summarize(self,
45
46
  show_weights=True, show_morae=True, show_gaRas=True, # part_A
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skrutable
3
- Version: 2.7.0
3
+ Version: 2.8.0
4
4
  Summary: skrutable library for working with Sanskrit text
5
5
  Home-page: https://github.com/tylergneill/skrutable
6
6
  Author: Tyler Neill
@@ -1 +0,0 @@
1
- __version__ = "2.7.0"
File without changes
File without changes
File without changes
File without changes