skrutable 2.6.0__tar.gz → 2.6.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {skrutable-2.6.0 → skrutable-2.6.2}/PKG-INFO +1 -1
  2. skrutable-2.6.2/src/skrutable/__init__.py +1 -0
  3. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/meter_identification.py +79 -1
  4. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/phonemes.py +6 -0
  5. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/scansion.py +33 -34
  6. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable.egg-info/PKG-INFO +1 -1
  7. skrutable-2.6.0/src/skrutable/__init__.py +0 -1
  8. {skrutable-2.6.0 → skrutable-2.6.2}/LICENSE.md +0 -0
  9. {skrutable-2.6.0 → skrutable-2.6.2}/README.md +0 -0
  10. {skrutable-2.6.0 → skrutable-2.6.2}/setup.cfg +0 -0
  11. {skrutable-2.6.0 → skrutable-2.6.2}/setup.py +0 -0
  12. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/config.json +0 -0
  13. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/config.py +0 -0
  14. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/generate_scheme_vectors.py +0 -0
  15. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/impossible_bigrams.json +0 -0
  16. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/manual.md +0 -0
  17. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/meter_patterns.py +0 -0
  18. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/run_examples.py +0 -0
  19. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/scheme_detection.py +0 -0
  20. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/scheme_maps.py +0 -0
  21. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/scheme_vectors.json +0 -0
  22. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/scheme_vectors_mbh.py +0 -0
  23. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/splitting.py +0 -0
  24. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/transliteration.py +0 -0
  25. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/utils.py +0 -0
  26. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/virAma_avoidance.py +0 -0
  27. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable.egg-info/SOURCES.txt +0 -0
  28. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable.egg-info/dependency_links.txt +0 -0
  29. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable.egg-info/requires.txt +0 -0
  30. {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skrutable
3
- Version: 2.6.0
3
+ Version: 2.6.2
4
4
  Summary: skrutable library for working with Sanskrit text
5
5
  Home-page: https://github.com/tylergneill/skrutable
6
6
  Author: Tyler Neill
@@ -0,0 +1 @@
1
+ __version__ = "2.6.2"
@@ -5,9 +5,13 @@ from skrutable.utils import _DEBUG_TIMING, _section_totals, timed
5
5
  import re
6
6
  import time as _time
7
7
  from copy import copy
8
+ from concurrent.futures import ProcessPoolExecutor
8
9
  from dataclasses import dataclass
9
10
  from typing import Optional
10
11
 
12
+ BATCH_MAX_WORKERS = 5
13
+ BATCH_PARALLEL_THRESHOLD = 100
14
+
11
15
  # load config variables
12
16
  config = load_config_dict_from_json_file()
13
17
  scansion_syllable_separator = config["scansion_syllable_separator"] # e.g. " "
@@ -50,10 +54,12 @@ def _verse_is_perfect(V):
50
54
  return getattr(V, 'is_perfect', False)
51
55
 
52
56
 
53
- def flush_profiling_report(write_file=False):
57
+ def flush_profiling_report(write_file=False, wall_clock_secs=None, parallel_workers=None):
54
58
  """Print the accumulated profiling table to stderr, then reset all counters.
55
59
 
56
60
  Pass write_file=True to also write the table to profiling_debug.txt alongside the library source.
61
+ Pass wall_clock_secs to append a timing footer line.
62
+ Pass parallel_workers (int) to show worker count and parallelization speedup; omit or pass None for serial runs.
57
63
  Safe to call even when _DEBUG_TIMING is False (no-op).
58
64
  """
59
65
  if not _DEBUG_TIMING or not _category_totals:
@@ -124,6 +130,13 @@ def flush_profiling_report(write_file=False):
124
130
  + f'{total_scan:.2f}s'.rjust(sub_w)
125
131
  + f'{total_types:.2f}s'.rjust(sub_w)
126
132
  + ' ' + fmt_row(total_scan_vals, total_type_vals))
133
+ if wall_clock_secs is not None:
134
+ table_total = total_scan + total_types
135
+ if parallel_workers is not None:
136
+ speedup = table_total / wall_clock_secs if wall_clock_secs > 0 else float('inf')
137
+ lines.append(f'\n table total (CPU across {parallel_workers} workers, inflated by overhead): {table_total:.2f}s | wall-clock: {wall_clock_secs:.2f}s | apparent parallelization speedup: {speedup:.2f}x')
138
+ else:
139
+ lines.append(f'\n table total: {table_total:.2f}s | wall-clock: {wall_clock_secs:.2f}s')
127
140
  block = '\n'.join(lines) + '\n'
128
141
  if write_file:
129
142
  timing_path = os.path.join(os.path.dirname(__file__), 'profiling_debug.txt')
@@ -1972,3 +1985,68 @@ class MeterIdentifier(object):
1972
1985
  bucket['_perfect_count'] = bucket.get('_perfect_count', 0) + 1
1973
1986
 
1974
1987
  return V
1988
+
1989
+ def identify_meter_batch(self, rw_strs,
1990
+ resplit_option=default_resplit_option,
1991
+ resplit_keep_midpoint=default_resplit_keep_midpoint,
1992
+ from_scheme=None):
1993
+ """
1994
+ Parallel version of identify_meter() for a list of raw strings.
1995
+
1996
+ Spawns up to BATCH_MAX_WORKERS worker processes, one task per verse.
1997
+ Returns a list of Verse objects in the same order as the input.
1998
+ When _DEBUG_TIMING is on, merges per-verse timing dicts back into
1999
+ the main process's _category_totals so flush_profiling_report() works.
2000
+ Falls back to serial processing for small batches below BATCH_PARALLEL_THRESHOLD.
2001
+ """
2002
+ if len(rw_strs) < BATCH_PARALLEL_THRESHOLD:
2003
+ return [self.identify_meter(s, resplit_option=resplit_option,
2004
+ resplit_keep_midpoint=resplit_keep_midpoint, from_scheme=from_scheme)
2005
+ for s in rw_strs]
2006
+
2007
+ args = [(s, resplit_option, resplit_keep_midpoint, from_scheme, _DEBUG_TIMING) for s in rw_strs]
2008
+ with ProcessPoolExecutor(max_workers=BATCH_MAX_WORKERS) as executor:
2009
+ results = list(executor.map(_identify_meter_worker, args))
2010
+
2011
+ if _DEBUG_TIMING:
2012
+ for V, verse_times, cat in results:
2013
+ _section_totals['wiggle_count'] = _section_totals.get('wiggle_count', 0) + verse_times.pop('wiggle_count', 0)
2014
+ bucket = _category_totals.setdefault(cat, {})
2015
+ for k, v in verse_times.items():
2016
+ bucket[k] = bucket.get(k, 0.0) + v
2017
+ bucket['_count'] = bucket.get('_count', 0) + 1
2018
+ if _verse_is_perfect(V):
2019
+ bucket['_perfect_count'] = bucket.get('_perfect_count', 0) + 1
2020
+ return [V for V, _, _ in results]
2021
+
2022
+ return results
2023
+
2024
+
2025
+ def _identify_meter_worker(args):
2026
+ """Module-level worker function (must be picklable). One verse per call."""
2027
+ rw_str, resplit_option, resplit_keep_midpoint, from_scheme, debug_timing = args
2028
+ if debug_timing:
2029
+ import skrutable.utils as _utils
2030
+ _utils._DEBUG_TIMING = True
2031
+ import skrutable.meter_identification as _mi
2032
+ _mi._DEBUG_TIMING = True
2033
+ MI = MeterIdentifier()
2034
+ all_keys = ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana',
2035
+ 'anuzwuB', 'samavftta', 'upajAti', 'vizamavftta',
2036
+ 'ardhasamavftta_perfect', 'jAti', 'lev_samavftta', 'lev_ardha', 'lev_vizama', 'samavftta_etc')
2037
+ if debug_timing:
2038
+ pre = {k: _section_totals.get(k, 0.0) for k in all_keys}
2039
+ pre_wiggle = _section_totals.get('wiggle_count', 0)
2040
+ V = MI.identify_meter(
2041
+ rw_str,
2042
+ resplit_option=resplit_option,
2043
+ resplit_keep_midpoint=resplit_keep_midpoint,
2044
+ from_scheme=from_scheme,
2045
+ )
2046
+ if debug_timing:
2047
+ verse_times = {k: _section_totals.get(k, 0.0) - pre[k] for k in all_keys}
2048
+ verse_times['scan'] = sum(verse_times[k] for k in ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana'))
2049
+ verse_times['wiggle_count'] = _section_totals.get('wiggle_count', 0) - pre_wiggle
2050
+ cat = _meter_label_to_category(V.meter_label)
2051
+ return V, verse_times, cat
2052
+ return V
@@ -33,6 +33,10 @@ For transliteration, 'consonant' means 'needs virāma if non-vowel follows' (no
33
33
  SLP_consonants_for_scansion = SLP_consonants
34
34
  """For scansion, 'consonant' means 'contributes to heaviness of previous vowel' (yes M H)"""
35
35
 
36
+ SLP_vowels_set = set(SLP_vowels)
37
+ SLP_long_vowels_set = set(SLP_long_vowels)
38
+ SLP_consonants_for_scansion_set = set(SLP_consonants_for_scansion)
39
+
36
40
  DEV_consonants = ['क', 'ख', 'ग', 'घ', 'ङ','च', 'छ', 'ज', 'झ', 'ञ',
37
41
  'ट', 'ठ', 'ड', 'ढ', 'ण','त', 'थ', 'द', 'ध', 'न','प', 'फ', 'ब', 'भ', 'म',
38
42
  'य', 'र', 'ल', 'व','श', 'ष', 'स', 'ह']
@@ -111,3 +115,5 @@ to_add = [' ', '\t', '\n']
111
115
  for k in character_set.keys():
112
116
  for c in to_add:
113
117
  character_set[k].append(c)
118
+
119
+ character_set_lookup = {k: set(v) for k, v in character_set.items()}
@@ -6,6 +6,9 @@ from skrutable.config import load_config_dict_from_json_file
6
6
  from skrutable.utils import timed
7
7
  import re
8
8
 
9
+ _re_ws_around_newline = re.compile(r'[ \t]*\n[ \t]*')
10
+ _re_multi_newline = re.compile(r'\n+')
11
+
9
12
  # load config variables
10
13
  config = load_config_dict_from_json_file()
11
14
  scansion_syllable_separator = config["scansion_syllable_separator"] # e.g. " "
@@ -160,8 +163,8 @@ class Scanner(object):
160
163
  # filter out disallowed characters (numbers, irrelevant punctuation, etc.)
161
164
  # pāda separator chars are preserved so they can be converted to \n below
162
165
  pAda_sep_chars = set(c for sep in additional_pAda_separators for c in sep)
163
- for c in list(set(cntnts)):
164
- if c not in phonemes.character_set[scheme_in] and c not in pAda_sep_chars:
166
+ for c in set(cntnts):
167
+ if c not in phonemes.character_set_lookup[scheme_in] and c not in pAda_sep_chars:
165
168
  cntnts = cntnts.replace(c, '')
166
169
 
167
170
  # replace all pāda separator strings with newline
@@ -169,8 +172,8 @@ class Scanner(object):
169
172
  cntnts = cntnts.replace(sep, '\n')
170
173
 
171
174
  # strip horizontal whitespace around newlines, dedupe, strip leading/trailing
172
- cntnts = re.sub(r'[ \t]*\n[ \t]*', '\n', cntnts)
173
- cntnts = re.sub(r'\n+', '\n', cntnts)
175
+ cntnts = _re_ws_around_newline.sub('\n', cntnts)
176
+ cntnts = _re_multi_newline.sub('\n', cntnts)
174
177
  cntnts = cntnts.strip()
175
178
 
176
179
  return cntnts
@@ -207,15 +210,15 @@ class Scanner(object):
207
210
  # place scansion_syllable_separator after vowels
208
211
  for letter in line:
209
212
 
210
- # exception: do treat M and H as explicit syllable coda
211
- if letter in ['M', 'H']:
212
- if line_syllables[-1] == scansion_syllable_separator:
213
+ if letter in ('M', 'H'):
214
+ # M and H are explicit syllable codas: strip any trailing separator, append, re-add separator
215
+ if line_syllables and line_syllables[-1] == scansion_syllable_separator:
213
216
  line_syllables = line_syllables[:-1]
214
-
215
- line_syllables += letter
216
-
217
- if letter in phonemes.SLP_vowels + ['M', 'H']:
218
- line_syllables += scansion_syllable_separator
217
+ line_syllables += letter + scansion_syllable_separator
218
+ elif letter in phonemes.SLP_vowels_set:
219
+ line_syllables += letter + scansion_syllable_separator
220
+ else:
221
+ line_syllables += letter
219
222
 
220
223
  # e.g. 'ya.dA.ya.dA.hi.Da.rma.sya.glA.ni.rBa.va.ti.BA.ra.ta.'
221
224
  # BUT e.g. 'a.Byu.tTA.na.ma.Da.rma.sya.ta.dA.tmA.na.Msf.jA.mya.ha.m'
@@ -256,8 +259,6 @@ class Scanner(object):
256
259
 
257
260
  for line in text_lines:
258
261
 
259
- line_weights = ''
260
-
261
262
  syllables = line.split(scansion_syllable_separator)
262
263
 
263
264
  try:
@@ -265,34 +266,37 @@ class Scanner(object):
265
266
  syllables.pop(-1) # in case of final separator(s)
266
267
  except IndexError: pass
267
268
 
269
+ line_weights_chars = []
270
+
268
271
  for n, syllable in enumerate(syllables):
269
272
 
270
273
  if (
271
274
  # heavy by nature
272
- syllable[-1] in phonemes.SLP_long_vowels + ['M', 'H']
275
+ syllable[-1] in phonemes.SLP_long_vowels_set or syllable[-1] in ('M', 'H')
273
276
 
274
277
  or
275
278
 
276
279
  # heavy by position:
277
280
  # consonant closes syllable or next syllable begins with a cluster
278
- syllable[-1] in (phonemes.SLP_consonants_for_scansion)
281
+ syllable[-1] in phonemes.SLP_consonants_for_scansion_set
279
282
  or
280
283
  n <= (len(syllables)-2)
281
284
  and len(syllables[n+1]) > 1
282
- and syllables[n+1][0] in (phonemes.SLP_consonants_for_scansion)
283
- and syllables[n+1][1] in (phonemes.SLP_consonants_for_scansion)
285
+ and syllables[n+1][0] in phonemes.SLP_consonants_for_scansion_set
286
+ and syllables[n+1][1] in phonemes.SLP_consonants_for_scansion_set
287
+
284
288
 
285
289
  ):
286
290
 
287
- line_weights += 'g'
288
- # line_weights += 'g_'
291
+ line_weights_chars.append('g')
292
+ # line_weights_chars.append('g_')
289
293
  # insofar as two 'l's can equal one 'g', could use this alternative for better visual alignment
290
294
 
291
295
  else:
292
296
 
293
- line_weights += 'l'
297
+ line_weights_chars.append('l')
294
298
 
295
- weights_by_line.append(line_weights)
299
+ weights_by_line.append(''.join(line_weights_chars))
296
300
 
297
301
  syllable_weights = '\n'.join(weights_by_line) # restore newlines
298
302
  return syllable_weights
@@ -325,21 +329,16 @@ class Scanner(object):
325
329
  Returns string of 'gaRa'-trisyllable abbreviation, e.g. 'nml'.
326
330
  """
327
331
 
328
- for c in list(set(syl_wts)):
329
- if c not in ['l','g']:
332
+ for c in set(syl_wts):
333
+ if c not in {'l', 'g'}:
330
334
  return None
331
335
 
332
- weights_of_curr_gaRa = ''
333
- overall_abbreviation = ''
334
-
335
- for single_weight in syl_wts:
336
- weights_of_curr_gaRa += single_weight
337
- if len(weights_of_curr_gaRa) == 3:
338
- overall_abbreviation += meter_patterns.gaRas_by_weights[weights_of_curr_gaRa]
339
- weights_of_curr_gaRa = ''
340
-
336
+ n = len(syl_wts) // 3 * 3
337
+ overall_abbreviation = ''.join(
338
+ meter_patterns.gaRas_by_weights[syl_wts[i:i+3]] for i in range(0, n, 3)
339
+ )
341
340
  # leftover lights and heavies (l/g)
342
- overall_abbreviation += weights_of_curr_gaRa
341
+ overall_abbreviation += syl_wts[n:]
343
342
 
344
343
  return overall_abbreviation
345
344
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skrutable
3
- Version: 2.6.0
3
+ Version: 2.6.2
4
4
  Summary: skrutable library for working with Sanskrit text
5
5
  Home-page: https://github.com/tylergneill/skrutable
6
6
  Author: Tyler Neill
@@ -1 +0,0 @@
1
- __version__ = "2.6.0"
File without changes
File without changes
File without changes
File without changes