skrutable 2.5.4__tar.gz → 2.6.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {skrutable-2.5.4 → skrutable-2.6.1}/PKG-INFO +1 -1
  2. skrutable-2.6.1/src/skrutable/__init__.py +1 -0
  3. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/config.json +7 -6
  4. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/meter_identification.py +457 -71
  5. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/meter_patterns.py +20 -8
  6. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/phonemes.py +6 -0
  7. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/scansion.py +42 -44
  8. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable.egg-info/PKG-INFO +1 -1
  9. skrutable-2.5.4/src/skrutable/__init__.py +0 -1
  10. {skrutable-2.5.4 → skrutable-2.6.1}/LICENSE.md +0 -0
  11. {skrutable-2.5.4 → skrutable-2.6.1}/README.md +0 -0
  12. {skrutable-2.5.4 → skrutable-2.6.1}/setup.cfg +0 -0
  13. {skrutable-2.5.4 → skrutable-2.6.1}/setup.py +0 -0
  14. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/config.py +0 -0
  15. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/generate_scheme_vectors.py +0 -0
  16. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/impossible_bigrams.json +0 -0
  17. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/manual.md +0 -0
  18. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/run_examples.py +0 -0
  19. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/scheme_detection.py +0 -0
  20. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/scheme_maps.py +0 -0
  21. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/scheme_vectors.json +0 -0
  22. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/scheme_vectors_mbh.py +0 -0
  23. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/splitting.py +0 -0
  24. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/transliteration.py +0 -0
  25. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/utils.py +0 -0
  26. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable/virAma_avoidance.py +0 -0
  27. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable.egg-info/SOURCES.txt +0 -0
  28. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable.egg-info/dependency_links.txt +0 -0
  29. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable.egg-info/requires.txt +0 -0
  30. {skrutable-2.5.4 → skrutable-2.6.1}/src/skrutable.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skrutable
3
- Version: 2.5.4
3
+ Version: 2.6.1
4
4
  Summary: skrutable library for working with Sanskrit text
5
5
  Home-page: https://github.com/tylergneill/skrutable
6
6
  Author: Tyler Neill
@@ -0,0 +1 @@
1
+ __version__ = "2.6.1"
@@ -4,7 +4,7 @@
4
4
  "avoid_virama_indic_scripts" : true,
5
5
  "avoid_virama_non_indic_scripts" : false,
6
6
  "scansion_syllable_separator" : " ",
7
- "additional_pAda_separators" : ["\t", ";", ",", " / ", " | ", " "],
7
+ "additional_pAda_separators" : ["\t", ";", ",", "/", "|", "।"],
8
8
  "default_resplit_option" : "resplit_lite",
9
9
  "default_resplit_keep_midpoint" : false,
10
10
  "disable_non_trizwuB_upajAti" : true,
@@ -21,16 +21,17 @@
21
21
  "samavṛtta, imperfect (3)" : 6,
22
22
  "samavṛtta, imperfect (2)" : 5,
23
23
  "samavṛtta, quarter, perfect" : 8,
24
- "ardhasamavṛtta, perfect" : 8,
25
- "ardhasamavṛtta, perfect, unknown" : 6,
24
+ "ardhasamavṛtta, perfect" : 9,
25
+ "ardhasamavṛtta, imperfect" : 7,
26
26
  "viṣamavṛtta, perfect" : 9,
27
- "upajāti, perfect" : 7,
27
+ "viṣamavṛtta, imperfect" : 7,
28
+ "upajāti, perfect" : 8,
28
29
  "upajāti, imperfect" : 6,
29
- "upajāti, non-triṣṭubh, perfect" : 4.5,
30
+ "upajāti, non-triṣṭubh, perfect" : 5,
30
31
  "upajāti, triṣṭubh-jagatī-saṃkara, perfect" : 4,
31
32
  "upajāti, non-triṣṭubh, imperfect" : 3,
32
33
  "jāti, perfect" : 9,
33
- "jāti, imperfect" : 5,
34
+ "jāti, imperfect" : 6,
34
35
  "jāti, likely" : 3,
35
36
  "none found" : 1
36
37
  },
@@ -3,6 +3,7 @@ from skrutable import meter_patterns
3
3
  from skrutable.config import load_config_dict_from_json_file
4
4
  from skrutable.utils import _DEBUG_TIMING, _section_totals, timed
5
5
  import re
6
+ import time as _time
6
7
  from copy import copy
7
8
  from dataclasses import dataclass
8
9
  from typing import Optional
@@ -59,10 +60,12 @@ def flush_profiling_report(write_file=False):
59
60
  return
60
61
  import sys, os
61
62
  scan_keys = ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana')
62
- type_keys = ('anuzwuB', 'samavftta_etc', 'samavftta', 'upajAti', 'ardhasamavftta', 'vizamavftta', 'jAti')
63
+ type_keys = ('anuzwuB', 'samavftta_etc', 'samavftta', 'upajAti', 'ardhasamavftta_perfect', 'vizamavftta', 'jAti', 'lev_samavftta', 'lev_ardha', 'lev_vizama')
63
64
  type_abbrev = {
64
65
  'anuzwuB': 'anuṣṭ', 'samavftta_etc': 'vftta↑', 'samavftta': 'samav', 'upajAti': 'upajāti',
65
- 'ardhasamavftta': 'ardha', 'vizamavftta': 'vizama', 'jAti': 'jāti',
66
+ 'ardhasamavftta_perfect': 'ardha', 'vizamavftta': 'vizama',
67
+ 'jAti': 'jāti',
68
+ 'lev_samavftta': 'lev✗sama', 'lev_ardha': 'lev✗ardh', 'lev_vizama': 'lev✗visa',
66
69
  }
67
70
  scan_abbrev = {'scan_clean': 'clean', 'scan_translit': 'transl', 'scan_syllabify': 'syl', 'scan_weights': 'wts', 'scan_morae_gana': 'mor+g'}
68
71
  cat_order = ['anuṣṭubh', 'samavṛtta', 'upajāti', 'ardhasamavṛtta', 'viṣamavṛtta', 'jāti', 'na kiṃcid adhyavasitam']
@@ -151,6 +154,68 @@ class Diagnostic:
151
154
  return self.imperfect_label_sanskrit is not None and not self.length_error()
152
155
 
153
156
 
157
+ ARDHASAMAVFTTA_EDIT_DISTANCE_THRESHOLD = 2
158
+ VIZAMAVFTTA_EDIT_DISTANCE_THRESHOLD = 2
159
+
160
+ # Precompute vizamavṛtta canonical weight strings once (avoid per-call gaṇa→weight conversion)
161
+ _gaRa_to_weights_map = {v: k for k, v in meter_patterns.gaRas_by_weights.items()}
162
+ def _gaRa_str_to_weights(s):
163
+ return ''.join(_gaRa_to_weights_map.get(ch, ch) for ch in s)
164
+ _vizamavftta_precomputed = [
165
+ (gaRas, [_gaRa_str_to_weights(g) for g in gaRas], label)
166
+ for gaRas, label in meter_patterns.vizamavftta_by_4_tuple.items()
167
+ ]
168
+
169
+
170
+ def _levenshtein_align(observed, canonical):
171
+ """Return (distance, problem_indices) comparing observed lg string to canonical,
172
+ excluding the final (anceps) position from both distance and problem reporting.
173
+
174
+ - Same length, mismatches: problem_indices = list of mismatched 0-based positions.
175
+ - Hypermetric (len > canonical): problem_indices = [index of extra syllable] (positive).
176
+ - Hypometric (len < canonical): problem_indices = [-(gap_pos + 1)] where gap_pos is
177
+ the 0-based canonical position of the missing syllable. The -(k+1) encoding keeps
178
+ gap position 0 distinct from a positive index (never 0). Frontend recovers the
179
+ canonical column as abs(v) - 1 for any negative v.
180
+ """
181
+ obs = observed[:-1] # exclude final anceps
182
+ can = canonical[:-1]
183
+
184
+ if len(obs) == len(can):
185
+ bad = [i for i in range(len(obs)) if obs[i] != can[i]]
186
+ return len(bad), bad
187
+
188
+ # Standard Levenshtein DP with early exit once threshold is exceeded
189
+ m, n = len(obs), len(can)
190
+ dp = [[0] * (n + 1) for _ in range(m + 1)]
191
+ for i in range(m + 1): dp[i][0] = i
192
+ for j in range(n + 1): dp[0][j] = j
193
+ for i in range(1, m + 1):
194
+ row_min = n # track row minimum for early exit
195
+ for j in range(1, n + 1):
196
+ cost = 0 if obs[i-1] == can[j-1] else 1
197
+ dp[i][j] = min(dp[i-1][j] + 1, dp[i][j-1] + 1, dp[i-1][j-1] + cost)
198
+ if dp[i][j] < row_min: row_min = dp[i][j]
199
+ if row_min > ARDHASAMAVFTTA_EDIT_DISTANCE_THRESHOLD:
200
+ return row_min, []
201
+
202
+ dist = dp[m][n]
203
+
204
+ # Traceback to find the operation site
205
+ i, j = m, n
206
+ while i > 0 or j > 0:
207
+ if i > 0 and j > 0 and dp[i][j] == dp[i-1][j-1] + (0 if obs[i-1] == can[j-1] else 1):
208
+ i -= 1; j -= 1
209
+ elif i > 0 and dp[i][j] == dp[i-1][j] + 1:
210
+ # deletion from observed — observed has an extra syllable at position i-1
211
+ return dist, [i - 1]
212
+ else:
213
+ # insertion into observed — observed is missing canonical syllable at position j-1
214
+ return dist, [-(j - 1 + 1)] # encode as -(gap_pos + 1), always negative
215
+
216
+ return dist, []
217
+
218
+
154
219
  def _decompose_into_mAtragaNas(weights_str, gana_6_morae, gana_8_morae):
155
220
  """
156
221
  Decomposes an ardha (half-verse) weight string into mātrā-gaṇas.
@@ -308,6 +373,9 @@ class VerseTester(object):
308
373
  self.resplit_keep_midpoint = default_resplit_keep_midpoint # bool
309
374
  self.identification_attempt_count = 0
310
375
  self._anuzwuB_half_cache = {} # cleared per wiggle_identify run
376
+ self._ardha_stash = [] # accumulated across wiggle candidates
377
+ self._vizama_stash = [] # accumulated across wiggle candidates
378
+ self._samavftta_has_length_error = False # set during evaluate_samavftta perfect_only pass
311
379
 
312
380
  def combine_results(self, Vrs, new_label, new_score, new_is_perfect=False):
313
381
  old_label = Vrs.meter_label or ''
@@ -563,7 +631,7 @@ class VerseTester(object):
563
631
  self.pAdasamatva_count = max_match
564
632
 
565
633
 
566
- def evaluate_samavftta(self, Vrs):
634
+ def evaluate_samavftta(self, Vrs, perfect_only=False):
567
635
  # sufficient pAdasamatva already assured, now just evaluate
568
636
 
569
637
  wbp = Vrs.syllable_weights.split('\n') # weights by pāda
@@ -613,20 +681,34 @@ class VerseTester(object):
613
681
  if "ajñātasamavṛtta" in meter_label:
614
682
  score -= 2
615
683
 
616
- # Build per-pāda diagnostic: length errors, then position-level pattern errors.
684
+ # Build per-pāda diagnostic: length errors (Levenshtein), then pattern errors.
685
+ # In perfect_only mode, skip Levenshtein — just register the result and return.
617
686
  problem_syllables = {}
618
687
  per_pada_sanskrit = {}
619
688
  per_pada_english = {}
620
689
  canonical = w_to_id # includes final anceps
690
+ has_length_error = any(len(w) != len(canonical) for w in wbp[:4])
691
+
692
+ if perfect_only and has_length_error:
693
+ # Defer length-error annotation to the imperfect pass; register result now.
694
+ self._samavftta_has_length_error = True
695
+ old_score = Vrs.identification_score
696
+ self.combine_results(Vrs, new_label=meter_label, new_score=score)
697
+ if score >= old_score:
698
+ Vrs.diagnostic = Diagnostic(perfect_id_label=meter_label)
699
+ return
700
+
621
701
  for pada_num, w in enumerate(wbp[:4], start=1):
622
702
  if w == canonical:
623
703
  pass # no entry → perfect for this pada
624
704
  elif len(w) > len(canonical):
625
- problem_syllables[pada_num] = list(range(len(w)))
705
+ _, prob_indices = _levenshtein_align(w, canonical)
706
+ problem_syllables[pada_num] = prob_indices
626
707
  per_pada_sanskrit[pada_num] = 'adhikākṣarā'
627
708
  per_pada_english[pada_num] = 'hypermetric'
628
709
  elif len(w) < len(canonical):
629
- problem_syllables[pada_num] = list(range(len(w)))
710
+ _, prob_indices = _levenshtein_align(w, canonical)
711
+ problem_syllables[pada_num] = prob_indices
630
712
  per_pada_sanskrit[pada_num] = 'ūnākṣarā'
631
713
  per_pada_english[pada_num] = 'hypometric'
632
714
  else:
@@ -665,48 +747,119 @@ class VerseTester(object):
665
747
 
666
748
  # score arbitration: may tie with pre-existing result (e.g., upajāti)
667
749
  old_score = Vrs.identification_score
668
- self.combine_results(Vrs, new_label=meter_label, new_score=score, new_is_perfect=imperfect_note is None)
750
+ self.combine_results(Vrs, new_label=meter_label, new_score=score, new_is_perfect=imperfect_note is None and not has_any_error)
669
751
  if score >= old_score:
670
752
  Vrs.diagnostic = diagnostic
671
753
 
672
754
 
755
+ def evaluate_ardhasamavftta(self, Vrs, perfect_only=False):
756
+ # bail early if even a perfect result can't beat what's already recorded
757
+ if meter_scores["ardhasamavṛtta, perfect"] <= Vrs.identification_score:
758
+ return
673
759
 
674
- def evaluate_ardhasamavftta(self, Vrs):
675
- # sufficient pAdasamatva already assured, now just evaluate
676
- Vrs.identification_score = meter_scores["ardhasamavṛtta, perfect"]
677
- Vrs.is_perfect = True
760
+ wbp = Vrs.syllable_weights.split('\n') # weights by pāda
761
+ tsyl = Vrs.text_syllabified
762
+ gaRa = Vrs.gaRa_abbreviations
763
+ morae = Vrs.morae_per_line
764
+
765
+ if perfect_only:
766
+ # Search all patterns; stash imperfect candidates (no Levenshtein yet), commit perfect immediately.
767
+ for (odd_canonical, even_canonical), meter_label in \
768
+ meter_patterns.ardhasamavftta_by_odd_even_weights.items():
769
+ # length pre-filter
770
+ if any(
771
+ len(w) != (len(odd_canonical) if pada_num in (1, 3) else len(even_canonical))
772
+ for pada_num, w in enumerate(wbp[:4], start=1)
773
+ ):
774
+ # stash if within threshold (no Levenshtein — just length check)
775
+ if all(
776
+ abs(len(w) - (len(odd_canonical) if pada_num in (1, 3) else len(even_canonical)))
777
+ <= ARDHASAMAVFTTA_EDIT_DISTANCE_THRESHOLD
778
+ for pada_num, w in enumerate(wbp[:4], start=1)
779
+ ):
780
+ self._ardha_stash.append((wbp, meter_label, odd_canonical, even_canonical, tsyl, gaRa, morae))
781
+ continue
678
782
 
679
- wbp = Vrs.syllable_weights.split('\n') # weights by pāda
783
+ # exact length: direct string comparison for perfect match (no Levenshtein needed)
784
+ if all(
785
+ w == (odd_canonical if pada_num in (1, 3) else even_canonical)
786
+ for pada_num, w in enumerate(wbp[:4], start=1)
787
+ ):
788
+ score = meter_scores["ardhasamavṛtta, perfect"]
789
+ old_score = Vrs.identification_score
790
+ self.combine_results(Vrs, new_label=meter_label, new_score=score, new_is_perfect=True)
791
+ if score >= old_score:
792
+ Vrs.diagnostic = Diagnostic(perfect_id_label=meter_label)
793
+ self._ardha_stash = [] # perfect found; no need for imperfect pass
794
+ return
795
+ # same length but not perfect — stash without distance computation
796
+ self._ardha_stash.append((wbp, meter_label, odd_canonical, even_canonical, tsyl, gaRa, morae))
797
+ return
680
798
 
681
- gs_to_id = Vrs.gaRa_abbreviations.split('\n') # gaRa abbreviation to id
682
- odd_g_to_id = gs_to_id[0]
683
- even_g_to_id = gs_to_id[1]
799
+ # Imperfect pass: consume the stash built during perfect_only pass.
800
+ # If no stash (e.g. called directly without a prior perfect_only pass), build it now.
801
+ if not self._ardha_stash:
802
+ self.evaluate_ardhasamavftta(Vrs, perfect_only=True)
803
+ if not self._ardha_stash:
804
+ return
684
805
 
685
- # look for match among regexes with same length
686
- iterator = meter_patterns.ardhasamavftta_by_odd_even_regex_tuple.keys()
687
- for (odd_gaRa_pattern, even_gaRa_pattern) in iterator:
806
+ # Run full Levenshtein on every stash entry to find minimum total distance.
807
+ best_total_dist = None
808
+ best_entry = None
809
+ for _stash_wbp, _label, _odd_can, _even_can, _stash_tsyl, _stash_gaRa, _stash_morae in self._ardha_stash:
810
+ total_dist = sum(
811
+ _levenshtein_align(w, _odd_can if pada_num in (1, 3) else _even_can)[0]
812
+ for pada_num, w in enumerate(_stash_wbp[:4], start=1)
813
+ )
814
+ if total_dist <= ARDHASAMAVFTTA_EDIT_DISTANCE_THRESHOLD:
815
+ if best_total_dist is None or total_dist < best_total_dist:
816
+ best_total_dist = total_dist
817
+ best_entry = (_stash_wbp, _label, _odd_can, _even_can, _stash_tsyl, _stash_gaRa, _stash_morae)
688
818
 
689
- regex_odd = re.compile(odd_gaRa_pattern)
690
- regex_even = re.compile(even_gaRa_pattern)
819
+ if best_entry is None:
820
+ return
691
821
 
692
- if (
693
- re.match(regex_odd, odd_g_to_id) and
694
- re.match(regex_even, even_g_to_id)
695
- ):
822
+ best_stash_wbp, best_label, best_odd_canonical, best_even_canonical, *_ = best_entry
823
+ score = meter_scores["ardhasamavṛtta, imperfect"] - (best_total_dist - 1)
824
+ if score <= 0:
825
+ return
696
826
 
697
- meter_label = meter_patterns.ardhasamavftta_by_odd_even_regex_tuple[
698
- (odd_gaRa_pattern, even_gaRa_pattern)
699
- ]
700
- break
827
+ problem_syllables = {}
828
+ per_pada_sanskrit = {}
829
+ per_pada_english = {}
830
+ for pada_num, w in enumerate(best_stash_wbp[:4], start=1):
831
+ canonical = best_odd_canonical if pada_num in (1, 3) else best_even_canonical
832
+ dist, prob_indices = _levenshtein_align(w, canonical)
833
+ if dist == 0:
834
+ continue
835
+ problem_syllables[pada_num] = prob_indices
836
+ meter_name = best_label.split(' = ')[0]
837
+ if len(w) > len(canonical):
838
+ per_pada_sanskrit[pada_num] = 'adhikākṣarā'
839
+ per_pada_english[pada_num] = 'hypermetric'
840
+ elif len(w) < len(canonical):
841
+ per_pada_sanskrit[pada_num] = 'ūnākṣarā'
842
+ per_pada_english[pada_num] = 'hypometric'
843
+ else:
844
+ per_pada_sanskrit[pada_num] = 'vikṛtavṛtta'
845
+ per_pada_english[pada_num] = f'does not match expected gaṇa pattern for {meter_name}'
701
846
 
847
+ sa_vals = list(per_pada_sanskrit.items())
848
+ if len(sa_vals) == 1:
849
+ suffix = f"asamīcīnā, pāda {sa_vals[0][0]}: {sa_vals[0][1]}"
702
850
  else:
703
- meter_label = "ajñātārdhasamavṛtta" # i.e., might need to add to meter_patterns
704
- meter_label += ' [%s, %s]' % (odd_g_to_id, even_g_to_id)
705
- Vrs.identification_score = meter_scores["ardhasamavṛtta, perfect, unknown"]
706
- Vrs.is_perfect = True # "perfect, unknown" means pattern unknown, not imperfect
851
+ suffix = 'asamīcīnā, ' + '; '.join(f"pāda {p}: {v}" for p, v in sa_vals)
852
+ imperfect_label = best_label + f" ({suffix})"
707
853
 
708
- Vrs.meter_label = meter_label
709
- Vrs.diagnostic = Diagnostic(perfect_id_label=meter_label)
854
+ old_score = Vrs.identification_score
855
+ self.combine_results(Vrs, new_label=imperfect_label, new_score=score)
856
+ if score >= old_score:
857
+ Vrs.diagnostic = Diagnostic(
858
+ perfect_id_label=imperfect_label,
859
+ imperfect_label_sanskrit=per_pada_sanskrit or None,
860
+ imperfect_label_english=per_pada_english or None,
861
+ problem_syllables=problem_syllables or None,
862
+ )
710
863
 
711
864
 
712
865
  def evaluate_upajAti(self, Vrs):
@@ -804,6 +957,12 @@ class VerseTester(object):
804
957
  else:
805
958
  score = meter_scores["none found"]
806
959
 
960
+ # Extra penalties for especially weak upajāti results.
961
+ if len(wbp_lens) == 2:
962
+ score -= 1 # two pādas excluded instead of one
963
+ if all(lbl.startswith('ajñātam') for lbl in meter_labels):
964
+ score -= 1
965
+
807
966
  imperfect_note = None
808
967
  overall_meter_label = "upajāti %s: %s" % (
809
968
  family,
@@ -858,31 +1017,108 @@ class VerseTester(object):
858
1017
 
859
1018
  # score arbitration: may tie with pre-existing result (e.g., samavṛtta)
860
1019
  old_score = Vrs.identification_score
861
- is_perf = (score in (meter_scores["upajāti, perfect"],
862
- meter_scores["upajāti, triṣṭubh-jagatī-saṃkara, perfect"],
863
- meter_scores["upajāti, non-triṣṭubh, perfect"])
864
- and 'ajñātam' not in overall_meter_label)
865
- self.combine_results(Vrs, overall_meter_label, score, new_is_perfect=is_perf)
1020
+ self.combine_results(Vrs, overall_meter_label, score, new_is_perfect=imperfect_note is None and not per_pada_english)
866
1021
  if score >= old_score:
867
1022
  Vrs.diagnostic = diagnostic
868
1023
 
869
1024
 
870
- def is_vizamavftta(self, Vrs):
1025
+ def is_vizamavftta(self, Vrs, perfect_only=False):
1026
+ # bail early if even a perfect result can't beat what's already recorded
1027
+ if meter_scores["viṣamavṛtta, perfect"] <= Vrs.identification_score:
1028
+ return False
871
1029
 
1030
+ wbp = Vrs.syllable_weights.split('\n')
1031
+ if len(wbp) < 4: return False
872
1032
  gs_to_id = Vrs.gaRa_abbreviations.split('\n')
873
1033
  if len(gs_to_id) < 4: return False
1034
+ tsyl = Vrs.text_syllabified
1035
+ gaRa = Vrs.gaRa_abbreviations
1036
+ morae = Vrs.morae_per_line
1037
+
1038
+ if perfect_only:
1039
+ for canonicals, canonical_weights, meter_label in _vizamavftta_precomputed:
1040
+
1041
+ # Perfect match via gaṇa abbreviations
1042
+ if all(gs_to_id[i] == canonicals[i] for i in range(4)):
1043
+ Vrs.identification_score = meter_scores["viṣamavṛtta, perfect"]
1044
+ Vrs.meter_label = meter_label
1045
+ Vrs.diagnostic = Diagnostic(perfect_id_label=meter_label)
1046
+ self._vizama_stash = []
1047
+ return True
1048
+
1049
+ # Not perfect — stash if weight lengths are within threshold
1050
+ if all(
1051
+ abs(len(wbp[i]) - len(canonical_weights[i])) <= VIZAMAVFTTA_EDIT_DISTANCE_THRESHOLD
1052
+ for i in range(4)
1053
+ ):
1054
+ self._vizama_stash.append((wbp, meter_label, canonical_weights, tsyl, gaRa, morae))
1055
+ return False
874
1056
 
875
- for (a, b, c, d) in meter_patterns.vizamavftta_by_4_tuple:
876
- if (gs_to_id[0],gs_to_id[1],gs_to_id[2],gs_to_id[3]) == (a, b, c, d):
877
- Vrs.identification_score = meter_scores["viṣamavṛtta, perfect"]
878
- Vrs.is_perfect = True
879
- Vrs.meter_label = meter_patterns.vizamavftta_by_4_tuple[(a, b, c, d)]
880
- Vrs.diagnostic = Diagnostic(perfect_id_label=Vrs.meter_label)
881
- return True
1057
+ # Imperfect pass: consume the stash.
1058
+ if not self._vizama_stash:
1059
+ self.is_vizamavftta(Vrs, perfect_only=True)
1060
+ if not self._vizama_stash:
1061
+ return False
882
1062
 
883
- else:
1063
+ best_total_dist = None
1064
+ best_entry = None
1065
+ for _wbp, _label, _canonical_weights, _tsyl, _gaRa, _morae in self._vizama_stash:
1066
+ total_dist = sum(
1067
+ _levenshtein_align(_wbp[i], _canonical_weights[i])[0]
1068
+ for i in range(4)
1069
+ )
1070
+ if total_dist <= VIZAMAVFTTA_EDIT_DISTANCE_THRESHOLD:
1071
+ if best_total_dist is None or total_dist < best_total_dist:
1072
+ best_total_dist = total_dist
1073
+ best_entry = (_wbp, _label, _canonical_weights, _tsyl, _gaRa, _morae)
1074
+
1075
+ if best_entry is None:
884
1076
  return False
885
1077
 
1078
+ best_wbp, best_label, best_canonical_weights, *_ = best_entry
1079
+ score = meter_scores["viṣamavṛtta, imperfect"] - (best_total_dist - 1)
1080
+ if score <= 0:
1081
+ return False
1082
+
1083
+ problem_syllables = {}
1084
+ per_pada_sanskrit = {}
1085
+ per_pada_english = {}
1086
+ for i, w in enumerate(best_wbp[:4]):
1087
+ canonical = best_canonical_weights[i]
1088
+ dist, prob_indices = _levenshtein_align(w, canonical)
1089
+ if dist == 0:
1090
+ continue
1091
+ pada_num = i + 1
1092
+ problem_syllables[pada_num] = prob_indices
1093
+ meter_name = best_label.split(' = ')[0]
1094
+ if len(w) > len(canonical):
1095
+ per_pada_sanskrit[pada_num] = 'adhikākṣarā'
1096
+ per_pada_english[pada_num] = 'hypermetric'
1097
+ elif len(w) < len(canonical):
1098
+ per_pada_sanskrit[pada_num] = 'ūnākṣarā'
1099
+ per_pada_english[pada_num] = 'hypometric'
1100
+ else:
1101
+ per_pada_sanskrit[pada_num] = 'vikṛtavṛtta'
1102
+ per_pada_english[pada_num] = f'does not match expected gaṇa pattern for {meter_name}'
1103
+
1104
+ sa_vals = list(per_pada_sanskrit.items())
1105
+ if len(sa_vals) == 1:
1106
+ suffix = f"asamīcīnā, pāda {sa_vals[0][0]}: {sa_vals[0][1]}"
1107
+ else:
1108
+ suffix = 'asamīcīnā, ' + '; '.join(f"pāda {p}: {v}" for p, v in sa_vals)
1109
+ imperfect_label = best_label + f" ({suffix})"
1110
+
1111
+ old_score = Vrs.identification_score
1112
+ self.combine_results(Vrs, new_label=imperfect_label, new_score=score)
1113
+ if score >= old_score:
1114
+ Vrs.diagnostic = Diagnostic(
1115
+ perfect_id_label=imperfect_label,
1116
+ imperfect_label_sanskrit=per_pada_sanskrit or None,
1117
+ imperfect_label_english=per_pada_english or None,
1118
+ problem_syllables=problem_syllables or None,
1119
+ )
1120
+ return True
1121
+
886
1122
  def test_as_samavftta_etc(self, Vrs):
887
1123
 
888
1124
  wbp = Vrs.syllable_weights.split('\n') # weights by pāda
@@ -907,23 +1143,14 @@ class VerseTester(object):
907
1143
  timed('samavftta')(self.evaluate_samavftta)(Vrs)
908
1144
  return 1 # max score already reached
909
1145
 
910
- # test perfect ardhasamavftta
911
- if ( self.pAdasamatva_count == 2
912
- and wbp[0][:-1] == wbp[2][:-1]
913
- and wbp[1][:-1] == wbp[3][:-1] # exclude final anceps
914
1146
 
915
- and wbp_lens.count(11) != 4 # bc triṣṭubh upajāti so common
916
- ):
917
- # will give id_score == 8
918
- timed('ardhasamavftta')(self.evaluate_ardhasamavftta)(Vrs)
919
- # max score not necessarily yet reached, don't return
920
1147
 
921
1148
  # test perfect single pāda of samavṛtta
922
1149
  if ( self.pAdasamatva_count == 0 and self.resplit_option == "single_pAda"):
923
1150
  timed('samavftta')(self.evaluate_samavftta)(Vrs)
924
1151
 
925
- # test perfect viṣamavṛtta
926
- if self.pAdasamatva_count == 0 and timed('vizamavftta')(self.is_vizamavftta)(Vrs):
1152
+ # test perfect viṣamavṛtta (Levenshtein for imperfect deferred to imperfect pass)
1153
+ if self.pAdasamatva_count == 0 and timed('vizamavftta')(self.is_vizamavftta)(Vrs, perfect_only=True):
927
1154
  # will give id_score == 9
928
1155
  # label and score already set in is_vizamavftta if test was successful
929
1156
  return 1 # max score already reached
@@ -938,15 +1165,12 @@ class VerseTester(object):
938
1165
  if Vrs.identification_score == 8: return 1 # best score compared to below
939
1166
  # otherwise, max score not necessarily yet reached, don't return
940
1167
 
941
- # test imperfect samavftta
1168
+ # test imperfect samavftta (Levenshtein for length errors deferred to imperfect pass)
942
1169
  if self.pAdasamatva_count in [2, 3]:
943
1170
  # will give id_score in [7, 6], may tie with above
944
- timed('samavftta')(self.evaluate_samavftta)(Vrs)
1171
+ timed('samavftta')(self.evaluate_samavftta)(Vrs, perfect_only=True)
945
1172
  # max score not necessarily yet reached, don't return
946
1173
 
947
- # test imperfect ardhasamavftta? seems hard
948
- # involves looking specifically for corresponding type...
949
-
950
1174
  # test imperfect upajāti
951
1175
  if (
952
1176
  len( list(set(wbp_lens)) ) in [2, 3] or
@@ -1291,14 +1515,15 @@ class VerseTester(object):
1291
1515
  """
1292
1516
 
1293
1517
  self.identification_attempt_count += 1
1518
+ self._samavftta_has_length_error = False
1294
1519
 
1295
1520
  # anuzwuB
1296
1521
  success_anuzwuB = timed('anuzwuB')(self.test_as_anuzwuB)(Vrs)
1297
1522
  if success_anuzwuB and Vrs.identification_score == meter_scores["max score"]:
1298
1523
  return 1
1299
1524
 
1300
- # samavftta, upajAti, vizamavftta, ardhasamavftta
1301
- _inner_keys = ('samavftta', 'upajAti', 'ardhasamavftta', 'vizamavftta')
1525
+ # samavftta, upajAti, vizamavftta
1526
+ _inner_keys = ('samavftta', 'upajAti', 'vizamavftta')
1302
1527
  _pre_inner = {k: _section_totals.get(k, 0.0) for k in _inner_keys} if _DEBUG_TIMING else None
1303
1528
  success_samavftta_etc = timed('samavftta_etc')(self.test_as_samavftta_etc)(Vrs)
1304
1529
  if _DEBUG_TIMING:
@@ -1308,13 +1533,29 @@ class VerseTester(object):
1308
1533
  return 1
1309
1534
  # i.e., if upajāti or anything imperfect, also continue on to check jāti
1310
1535
 
1311
- # problem: how to change above handling for rare case
1312
- # where ardhasamavftta is also jAti?
1536
+ # ardhasamavftta perfect-only pass (Levenshtein but bails on imperfect)
1537
+ # Odd pādas 10–12, even pādas 11–13; with threshold 2, viable range is 9–15.
1538
+ # Lower bound 9 excludes anuṣṭubh (8 syllables) which is the most common false candidate.
1539
+ wbp_lens_ardha = [len(line) for line in Vrs.syllable_weights.split('\n')]
1540
+ _ardha_viable = (
1541
+ wbp_lens_ardha.count(11) != 4 and # exclude 4×11 triṣṭubh upajāti
1542
+ all(9 <= l <= 15 for l in wbp_lens_ardha[:4])
1543
+ )
1544
+ if _ardha_viable:
1545
+ timed('ardhasamavftta_perfect')(self.evaluate_ardhasamavftta)(Vrs, perfect_only=True)
1546
+ if _ardha_viable and Vrs.identification_score >= meter_scores["ardhasamavṛtta, perfect"]:
1547
+ return 1
1313
1548
 
1314
1549
  # jāti
1315
1550
  success_jAti = timed('jAti')(self.test_as_jAti)(Vrs)
1551
+ if success_jAti and Vrs.identification_score >= meter_scores["max score"]:
1552
+ return 1
1316
1553
 
1317
- if success_anuzwuB or success_samavftta_etc or success_jAti:
1554
+ # imperfect pass: deferred Levenshtein annotation for samavftta length errors.
1555
+ if self._samavftta_has_length_error:
1556
+ timed('lev_samavftta')(self.evaluate_samavftta)(Vrs)
1557
+
1558
+ if success_anuzwuB or success_samavftta_etc or success_jAti or Vrs.identification_score >= meter_scores["ardhasamavṛtta, perfect"]:
1318
1559
  return 1
1319
1560
  else:
1320
1561
  return 0
@@ -1373,6 +1614,8 @@ class MeterIdentifier(object):
1373
1614
  """Returns a list for MeterIdentifier.Verses_found"""
1374
1615
 
1375
1616
  self._anuzwuB_half_cache = {}
1617
+ VrsTster._ardha_stash = []
1618
+ VrsTster._vizama_stash = []
1376
1619
  pos_iterators = {}
1377
1620
  for k in ['ab', 'bc', 'cd']:
1378
1621
  if (
@@ -1486,7 +1729,8 @@ class MeterIdentifier(object):
1486
1729
 
1487
1730
  if _DEBUG_TIMING:
1488
1731
  _pre_keys = ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana',
1489
- 'anuzwuB', 'samavftta', 'upajAti', 'ardhasamavftta', 'vizamavftta', 'jAti', 'samavftta_etc')
1732
+ 'anuzwuB', 'samavftta', 'upajAti', 'vizamavftta',
1733
+ 'ardhasamavftta_perfect', 'jAti', 'lev_samavftta', 'lev_ardha', 'lev_vizama', 'samavftta_etc')
1490
1734
  _pre = {k: _section_totals.get(k, 0.0) for k in _pre_keys}
1491
1735
 
1492
1736
  # gets back mostly populated Verse object
@@ -1498,7 +1742,15 @@ class MeterIdentifier(object):
1498
1742
 
1499
1743
  if resplit_option in ['none', 'single_pAda'] or V.text_cleaned == '':
1500
1744
  # No resplitting: test the verse exactly as scanned.
1745
+ VT._ardha_stash = []
1746
+ VT._vizama_stash = []
1501
1747
  success = VT.attempt_identification(V)
1748
+ # Post-identification: deferred imperfect ardhasamavṛtta pass over stash.
1749
+ if VT._ardha_stash and meter_scores["ardhasamavṛtta, imperfect"] > V.identification_score:
1750
+ timed('lev_ardha')(VT.evaluate_ardhasamavftta)(V)
1751
+ # Post-identification: deferred imperfect viṣamavṛtta pass over stash.
1752
+ if VT._vizama_stash and meter_scores["viṣamavṛtta, imperfect"] > V.identification_score:
1753
+ timed('lev_vizama')(VT.is_vizamavftta)(V)
1502
1754
 
1503
1755
  elif resplit_option in ['resplit_max', 'resplit_lite']:
1504
1756
 
@@ -1562,6 +1814,139 @@ class MeterIdentifier(object):
1562
1814
  pAda_brs, quarter_len
1563
1815
  )
1564
1816
 
1817
+ # Post-wiggle: deferred imperfect ardhasamavṛtta pass over accumulated stash.
1818
+ _lev_ardha_t0 = _time.perf_counter() if _DEBUG_TIMING else None
1819
+ ardha_stash = VT._ardha_stash
1820
+ if ardha_stash:
1821
+ best_current_score = (
1822
+ max(v.identification_score for v in self.Verses_found)
1823
+ if self.Verses_found else 0
1824
+ )
1825
+ if meter_scores["ardhasamavṛtta, imperfect"] > best_current_score:
1826
+ best_total_dist = None
1827
+ best_entry = None
1828
+ for _stash_wbp, _label, _odd_can, _even_can, _stash_tsyl, _stash_gaRa, _stash_morae in ardha_stash:
1829
+ total_dist = sum(
1830
+ _levenshtein_align(w, _odd_can if pada_num in (1, 3) else _even_can)[0]
1831
+ for pada_num, w in enumerate(_stash_wbp[:4], start=1)
1832
+ )
1833
+ if total_dist <= ARDHASAMAVFTTA_EDIT_DISTANCE_THRESHOLD:
1834
+ if best_total_dist is None or total_dist < best_total_dist:
1835
+ best_total_dist = total_dist
1836
+ best_entry = (_stash_wbp, _label, _odd_can, _even_can, _stash_tsyl, _stash_gaRa, _stash_morae)
1837
+ if best_entry is not None:
1838
+ ardha_score = meter_scores["ardhasamavṛtta, imperfect"] - (best_total_dist - 1)
1839
+ if ardha_score > best_current_score:
1840
+ best_stash_wbp, best_label, best_odd_can, best_even_can, best_stash_tsyl, best_stash_gaRa, best_stash_morae = best_entry
1841
+ problem_syllables = {}
1842
+ per_pada_sanskrit = {}
1843
+ per_pada_english = {}
1844
+ for pada_num, w in enumerate(best_stash_wbp[:4], start=1):
1845
+ canonical = best_odd_can if pada_num in (1, 3) else best_even_can
1846
+ dist, prob_indices = _levenshtein_align(w, canonical)
1847
+ if dist == 0:
1848
+ continue
1849
+ problem_syllables[pada_num] = prob_indices
1850
+ meter_name = best_label.split(' = ')[0]
1851
+ if len(w) > len(canonical):
1852
+ per_pada_sanskrit[pada_num] = 'adhikākṣarā'
1853
+ per_pada_english[pada_num] = 'hypermetric'
1854
+ elif len(w) < len(canonical):
1855
+ per_pada_sanskrit[pada_num] = 'ūnākṣarā'
1856
+ per_pada_english[pada_num] = 'hypometric'
1857
+ else:
1858
+ per_pada_sanskrit[pada_num] = 'vikṛtavṛtta'
1859
+ per_pada_english[pada_num] = f'does not match expected gaṇa pattern for {meter_name}'
1860
+ sa_vals = list(per_pada_sanskrit.items())
1861
+ if len(sa_vals) == 1:
1862
+ suffix = f"asamīcīnā, pāda {sa_vals[0][0]}: {sa_vals[0][1]}"
1863
+ else:
1864
+ suffix = 'asamīcīnā, ' + '; '.join(f"pāda {p}: {v}" for p, v in sa_vals)
1865
+ imperfect_label = best_label + f" ({suffix})"
1866
+ ardha_Vrs = copy(self.Verses_found[0]) if self.Verses_found else copy(V)
1867
+ ardha_Vrs.text_syllabified = best_stash_tsyl
1868
+ ardha_Vrs.syllable_weights = '\n'.join(best_stash_wbp)
1869
+ ardha_Vrs.gaRa_abbreviations = best_stash_gaRa
1870
+ ardha_Vrs.morae_per_line = best_stash_morae
1871
+ ardha_Vrs.meter_label = imperfect_label
1872
+ ardha_Vrs.identification_score = ardha_score
1873
+ ardha_Vrs.diagnostic = Diagnostic(
1874
+ perfect_id_label=imperfect_label,
1875
+ imperfect_label_sanskrit=per_pada_sanskrit or None,
1876
+ imperfect_label_english=per_pada_english or None,
1877
+ problem_syllables=problem_syllables or None,
1878
+ )
1879
+ self.Verses_found.append(ardha_Vrs)
1880
+ if _DEBUG_TIMING:
1881
+ _section_totals['lev_ardha'] = _section_totals.get('lev_ardha', 0.0) + _time.perf_counter() - _lev_ardha_t0
1882
+
1883
+ # Post-wiggle: deferred imperfect viṣamavṛtta pass over accumulated stash.
1884
+ _lev_vizama_t0 = _time.perf_counter() if _DEBUG_TIMING else None
1885
+ vizama_stash = VT._vizama_stash
1886
+ if vizama_stash:
1887
+ best_current_score = (
1888
+ max(v.identification_score for v in self.Verses_found)
1889
+ if self.Verses_found else 0
1890
+ )
1891
+ if meter_scores["viṣamavṛtta, imperfect"] > best_current_score:
1892
+ best_total_dist = None
1893
+ best_entry = None
1894
+ for _wbp, _label, _canonicals, _tsyl, _gaRa, _morae in vizama_stash:
1895
+ total_dist = sum(
1896
+ _levenshtein_align(_wbp[i], _canonicals[i])[0]
1897
+ for i in range(4)
1898
+ )
1899
+ if total_dist <= VIZAMAVFTTA_EDIT_DISTANCE_THRESHOLD:
1900
+ if best_total_dist is None or total_dist < best_total_dist:
1901
+ best_total_dist = total_dist
1902
+ best_entry = (_wbp, _label, _canonicals, _tsyl, _gaRa, _morae)
1903
+ if best_entry is not None:
1904
+ vizama_score = meter_scores["viṣamavṛtta, imperfect"] - (best_total_dist - 1)
1905
+ if vizama_score > best_current_score:
1906
+ best_wbp, best_label, best_canonicals, best_tsyl, best_gaRa, best_morae = best_entry
1907
+ problem_syllables = {}
1908
+ per_pada_sanskrit = {}
1909
+ per_pada_english = {}
1910
+ for i, w in enumerate(best_wbp[:4]):
1911
+ canonical = best_canonicals[i]
1912
+ dist, prob_indices = _levenshtein_align(w, canonical)
1913
+ if dist == 0:
1914
+ continue
1915
+ pada_num = i + 1
1916
+ problem_syllables[pada_num] = prob_indices
1917
+ meter_name = best_label.split(' = ')[0]
1918
+ if len(w) > len(canonical):
1919
+ per_pada_sanskrit[pada_num] = 'adhikākṣarā'
1920
+ per_pada_english[pada_num] = 'hypermetric'
1921
+ elif len(w) < len(canonical):
1922
+ per_pada_sanskrit[pada_num] = 'ūnākṣarā'
1923
+ per_pada_english[pada_num] = 'hypometric'
1924
+ else:
1925
+ per_pada_sanskrit[pada_num] = 'vikṛtavṛtta'
1926
+ per_pada_english[pada_num] = f'does not match expected gaṇa pattern for {meter_name}'
1927
+ sa_vals = list(per_pada_sanskrit.items())
1928
+ if len(sa_vals) == 1:
1929
+ suffix = f"asamīcīnā, pāda {sa_vals[0][0]}: {sa_vals[0][1]}"
1930
+ else:
1931
+ suffix = 'asamīcīnā, ' + '; '.join(f"pāda {p}: {v}" for p, v in sa_vals)
1932
+ imperfect_label = best_label + f" ({suffix})"
1933
+ vizama_Vrs = copy(self.Verses_found[0]) if self.Verses_found else copy(V)
1934
+ vizama_Vrs.text_syllabified = best_tsyl
1935
+ vizama_Vrs.syllable_weights = '\n'.join(best_wbp)
1936
+ vizama_Vrs.gaRa_abbreviations = best_gaRa
1937
+ vizama_Vrs.morae_per_line = best_morae
1938
+ vizama_Vrs.meter_label = imperfect_label
1939
+ vizama_Vrs.identification_score = vizama_score
1940
+ vizama_Vrs.diagnostic = Diagnostic(
1941
+ perfect_id_label=imperfect_label,
1942
+ imperfect_label_sanskrit=per_pada_sanskrit or None,
1943
+ imperfect_label_english=per_pada_english or None,
1944
+ problem_syllables=problem_syllables or None,
1945
+ )
1946
+ self.Verses_found.append(vizama_Vrs)
1947
+ if _DEBUG_TIMING:
1948
+ _section_totals['lev_vizama'] = _section_totals.get('lev_vizama', 0.0) + _time.perf_counter() - _lev_vizama_t0
1949
+
1565
1950
  # Pick the candidate with the highest identification score.
1566
1951
  if len(self.Verses_found) > 0:
1567
1952
  self.Verses_found.sort(key=lambda x: x.identification_score, reverse=True)
@@ -1574,7 +1959,8 @@ class MeterIdentifier(object):
1574
1959
 
1575
1960
  if _DEBUG_TIMING:
1576
1961
  all_keys = ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana',
1577
- 'anuzwuB', 'samavftta', 'upajAti', 'ardhasamavftta', 'vizamavftta', 'jAti', 'samavftta_etc')
1962
+ 'anuzwuB', 'samavftta', 'upajAti', 'vizamavftta',
1963
+ 'ardhasamavftta_perfect', 'jAti', 'lev_samavftta', 'lev_ardha', 'lev_vizama', 'samavftta_etc')
1578
1964
  verse_times = {k: _section_totals.get(k, 0.0) - _pre[k] for k in all_keys}
1579
1965
  verse_times['scan'] = sum(verse_times[k] for k in ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana'))
1580
1966
  cat = _meter_label_to_category(V.meter_label)
@@ -11,6 +11,16 @@ gaRas_by_weights = {
11
11
  'llg' : 's', # anapest / antidactylus
12
12
  }
13
13
 
14
+ weights_by_gaRa = {v: k for k, v in gaRas_by_weights.items()}
15
+
16
+ def expand_gaRa_pattern(gaRa_str):
17
+ """Expand canonical gaṇa abbreviation string (no regex) to lg weight string.
18
+ Single trailing g/l (final anceps) passes through unchanged."""
19
+ result = []
20
+ for ch in gaRa_str:
21
+ result.append(weights_by_gaRa.get(ch, ch))
22
+ return ''.join(result)
23
+
14
24
  """
15
25
  Sources:
16
26
  Apte, V.S. (1890). Practical Sanskrit-English Dictionary, "Appendix A: Sanskrit Prosody".
@@ -336,14 +346,16 @@ all_known_samavRttas = []
336
346
  for k in samavfttas_by_family_and_gaRa.keys(): # for each family
337
347
  all_known_samavRttas = all_known_samavRttas + list(samavfttas_by_family_and_gaRa[k].values())
338
348
 
339
- ardhasamavftta_by_odd_even_regex_tuple = {
340
- ('nnrl(g|l)', 'njj(r|B)') : 'aparavaktra = [11: nnrlg] 1,3 + [12: njjr] 2,4', # aka vaitālīya
341
- ('sssl(g|l)', 'BBBg(g|l)') : 'upacitra = [11: ssslg] 1,3 + [11: BBBgg] 2,4',
342
- ('nnr(y|j)', 'njjr(g|l)') : 'puṣpitāgrā = [12: nnry] 1,3 + [12: njjrg] 2,4', # aka aupacchandasika
343
- ('ssj(g|l)', 'sBrl(g|l)') : 'viyoginī = [10: ssjg] 1,3 + [11: sBrlg] 2,4', # aka vaitālīya, sundarī
344
- ('sss(g|l)', 'BBBg(g|l)') : 'vegavatī = [10: sssg] 1,3 + [11: BBBgg] 2,4',
345
- ('sssl(g|l)', 'nBB(r|B)') : 'hariṇaplutā = [11: ssjgg] 1,3 + [12: nBBr] 2,4',
346
- ('ssjg(g|l)', 'sBr(y|j)') : 'aupacchandasika = [11: ssjgg] 1,3 + [12: sBry] 2,4', # aka mālābhāriṇī
349
+ ardhasamavftta_by_odd_even_weights = {
350
+ # Keys are (odd_canonical_lg, even_canonical_lg) with heavy final anceps.
351
+ # Gaṇa abbreviations shown in comments and embedded in meter label strings.
352
+ ('llllllglglg', 'llllgllglglg'): 'aparavaktra = [11: nnrlg] 1,3 + [12: njjr] 2,4', # aka vaitālīya
353
+ ('llgllgllglg', 'gllgllgllgg'): 'upacitra = [11: ssslg] 1,3 + [11: BBBgg] 2,4',
354
+ ('llllllglglgg', 'llllgllglglgg'): 'puṣpitāgrā = [12: nnry] 1,3 + [13: njjrg] 2,4', # aka aupacchandasika
355
+ ('llgllglglg', 'llggllglglg'): 'viyoginī = [10: ssjg] 1,3 + [11: sBrlg] 2,4', # aka vaitālīya, sundarī
356
+ ('llgllgllgg', 'gllgllgllgg'): 'vegavatī = [10: sssg] 1,3 + [11: BBBgg] 2,4',
357
+ ('llgllgllglg', 'lllgllgllglg'): 'hariṇaplutā = [11: ssslg] 1,3 + [12: nBBr] 2,4',
358
+ ('llgllglglgg', 'llggllglglgg'): 'aupacchandasika = [11: ssjgg] 1,3 + [12: sBry] 2,4', # aka mālābhāriṇī
347
359
  }
348
360
 
349
361
  vizamavftta_by_4_tuple = {
@@ -33,6 +33,10 @@ For transliteration, 'consonant' means 'needs virāma if non-vowel follows' (no
33
33
  SLP_consonants_for_scansion = SLP_consonants
34
34
  """For scansion, 'consonant' means 'contributes to heaviness of previous vowel' (yes M H)"""
35
35
 
36
+ SLP_vowels_set = set(SLP_vowels)
37
+ SLP_long_vowels_set = set(SLP_long_vowels)
38
+ SLP_consonants_for_scansion_set = set(SLP_consonants_for_scansion)
39
+
36
40
  DEV_consonants = ['क', 'ख', 'ग', 'घ', 'ङ','च', 'छ', 'ज', 'झ', 'ञ',
37
41
  'ट', 'ठ', 'ड', 'ढ', 'ण','त', 'थ', 'द', 'ध', 'न','प', 'फ', 'ब', 'भ', 'म',
38
42
  'य', 'र', 'ल', 'व','श', 'ष', 'स', 'ह']
@@ -111,3 +115,5 @@ to_add = [' ', '\t', '\n']
111
115
  for k in character_set.keys():
112
116
  for c in to_add:
113
117
  character_set[k].append(c)
118
+
119
+ character_set_lookup = {k: set(v) for k, v in character_set.items()}
@@ -6,6 +6,9 @@ from skrutable.config import load_config_dict_from_json_file
6
6
  from skrutable.utils import timed
7
7
  import re
8
8
 
9
+ _re_ws_around_newline = re.compile(r'[ \t]*\n[ \t]*')
10
+ _re_multi_newline = re.compile(r'\n+')
11
+
9
12
  # load config variables
10
13
  config = load_config_dict_from_json_file()
11
14
  scansion_syllable_separator = config["scansion_syllable_separator"] # e.g. " "
@@ -157,22 +160,21 @@ class Scanner(object):
157
160
  Returns result as string.
158
161
  """
159
162
 
160
- # manage additional newlines
161
-
162
- for chr in additional_pAda_separators:
163
- cntnts = cntnts.replace(chr, '\n')
164
- # also dedupe, also allowing for carriage returns introduced in HTML form input
165
- regex = re.compile(r"(\n\r?){2,}")
166
- cntnts = re.sub(regex, '\n', cntnts)
167
- # also remove buffer-initial and -final newlines
168
- regex = re.compile(r"(\A\s*)|(\s*\Z)")
169
- cntnts = re.sub(regex, '', cntnts)
163
+ # filter out disallowed characters (numbers, irrelevant punctuation, etc.)
164
+ # pāda separator chars are preserved so they can be converted to \n below
165
+ pAda_sep_chars = set(c for sep in additional_pAda_separators for c in sep)
166
+ for c in set(cntnts):
167
+ if c not in phonemes.character_set_lookup[scheme_in] and c not in pAda_sep_chars:
168
+ cntnts = cntnts.replace(c, '')
170
169
 
171
- # filter out disallowed characters
170
+ # replace all pāda separator strings with newline
171
+ for sep in additional_pAda_separators:
172
+ cntnts = cntnts.replace(sep, '\n')
172
173
 
173
- for c in list(set(cntnts)):
174
- if c not in phonemes.character_set[scheme_in]:
175
- cntnts = cntnts.replace(c,'')
174
+ # strip horizontal whitespace around newlines, dedupe, strip leading/trailing
175
+ cntnts = _re_ws_around_newline.sub('\n', cntnts)
176
+ cntnts = _re_multi_newline.sub('\n', cntnts)
177
+ cntnts = cntnts.strip()
176
178
 
177
179
  return cntnts
178
180
 
@@ -208,15 +210,15 @@ class Scanner(object):
208
210
  # place scansion_syllable_separator after vowels
209
211
  for letter in line:
210
212
 
211
- # exception: do treat M and H as explicit syllable coda
212
- if letter in ['M', 'H']:
213
- if line_syllables[-1] == scansion_syllable_separator:
213
+ if letter in ('M', 'H'):
214
+ # M and H are explicit syllable codas: strip any trailing separator, append, re-add separator
215
+ if line_syllables and line_syllables[-1] == scansion_syllable_separator:
214
216
  line_syllables = line_syllables[:-1]
215
-
216
- line_syllables += letter
217
-
218
- if letter in phonemes.SLP_vowels + ['M', 'H']:
219
- line_syllables += scansion_syllable_separator
217
+ line_syllables += letter + scansion_syllable_separator
218
+ elif letter in phonemes.SLP_vowels_set:
219
+ line_syllables += letter + scansion_syllable_separator
220
+ else:
221
+ line_syllables += letter
220
222
 
221
223
  # e.g. 'ya.dA.ya.dA.hi.Da.rma.sya.glA.ni.rBa.va.ti.BA.ra.ta.'
222
224
  # BUT e.g. 'a.Byu.tTA.na.ma.Da.rma.sya.ta.dA.tmA.na.Msf.jA.mya.ha.m'
@@ -257,8 +259,6 @@ class Scanner(object):
257
259
 
258
260
  for line in text_lines:
259
261
 
260
- line_weights = ''
261
-
262
262
  syllables = line.split(scansion_syllable_separator)
263
263
 
264
264
  try:
@@ -266,34 +266,37 @@ class Scanner(object):
266
266
  syllables.pop(-1) # in case of final separator(s)
267
267
  except IndexError: pass
268
268
 
269
+ line_weights_chars = []
270
+
269
271
  for n, syllable in enumerate(syllables):
270
272
 
271
273
  if (
272
274
  # heavy by nature
273
- syllable[-1] in phonemes.SLP_long_vowels + ['M', 'H']
275
+ syllable[-1] in phonemes.SLP_long_vowels_set or syllable[-1] in ('M', 'H')
274
276
 
275
277
  or
276
278
 
277
279
  # heavy by position:
278
280
  # consonant closes syllable or next syllable begins with a cluster
279
- syllable[-1] in (phonemes.SLP_consonants_for_scansion)
281
+ syllable[-1] in phonemes.SLP_consonants_for_scansion_set
280
282
  or
281
283
  n <= (len(syllables)-2)
282
284
  and len(syllables[n+1]) > 1
283
- and syllables[n+1][0] in (phonemes.SLP_consonants_for_scansion)
284
- and syllables[n+1][1] in (phonemes.SLP_consonants_for_scansion)
285
+ and syllables[n+1][0] in phonemes.SLP_consonants_for_scansion_set
286
+ and syllables[n+1][1] in phonemes.SLP_consonants_for_scansion_set
287
+
285
288
 
286
289
  ):
287
290
 
288
- line_weights += 'g'
289
- # line_weights += 'g_'
291
+ line_weights_chars.append('g')
292
+ # line_weights_chars.append('g_')
290
293
  # insofar as two 'l's can equal one 'g', could use this alternative for better visual alignment
291
294
 
292
295
  else:
293
296
 
294
- line_weights += 'l'
297
+ line_weights_chars.append('l')
295
298
 
296
- weights_by_line.append(line_weights)
299
+ weights_by_line.append(''.join(line_weights_chars))
297
300
 
298
301
  syllable_weights = '\n'.join(weights_by_line) # restore newlines
299
302
  return syllable_weights
@@ -326,21 +329,16 @@ class Scanner(object):
326
329
  Returns string of 'gaRa'-trisyllable abbreviation, e.g. 'nml'.
327
330
  """
328
331
 
329
- for c in list(set(syl_wts)):
330
- if c not in ['l','g']:
332
+ for c in set(syl_wts):
333
+ if c not in {'l', 'g'}:
331
334
  return None
332
335
 
333
- weights_of_curr_gaRa = ''
334
- overall_abbreviation = ''
335
-
336
- for single_weight in syl_wts:
337
- weights_of_curr_gaRa += single_weight
338
- if len(weights_of_curr_gaRa) == 3:
339
- overall_abbreviation += meter_patterns.gaRas_by_weights[weights_of_curr_gaRa]
340
- weights_of_curr_gaRa = ''
341
-
336
+ n = len(syl_wts) // 3 * 3
337
+ overall_abbreviation = ''.join(
338
+ meter_patterns.gaRas_by_weights[syl_wts[i:i+3]] for i in range(0, n, 3)
339
+ )
342
340
  # leftover lights and heavies (l/g)
343
- overall_abbreviation += weights_of_curr_gaRa
341
+ overall_abbreviation += syl_wts[n:]
344
342
 
345
343
  return overall_abbreviation
346
344
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skrutable
3
- Version: 2.5.4
3
+ Version: 2.6.1
4
4
  Summary: skrutable library for working with Sanskrit text
5
5
  Home-page: https://github.com/tylergneill/skrutable
6
6
  Author: Tyler Neill
@@ -1 +0,0 @@
1
- __version__ = "2.5.4"
File without changes
File without changes
File without changes
File without changes