skrutable 2.6.0__tar.gz → 2.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {skrutable-2.6.0 → skrutable-2.6.2}/PKG-INFO +1 -1
- skrutable-2.6.2/src/skrutable/__init__.py +1 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/meter_identification.py +79 -1
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/phonemes.py +6 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/scansion.py +33 -34
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable.egg-info/PKG-INFO +1 -1
- skrutable-2.6.0/src/skrutable/__init__.py +0 -1
- {skrutable-2.6.0 → skrutable-2.6.2}/LICENSE.md +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/README.md +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/setup.cfg +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/setup.py +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/config.json +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/config.py +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/generate_scheme_vectors.py +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/impossible_bigrams.json +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/manual.md +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/meter_patterns.py +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/run_examples.py +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/scheme_detection.py +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/scheme_maps.py +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/scheme_vectors.json +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/scheme_vectors_mbh.py +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/splitting.py +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/transliteration.py +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/utils.py +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable/virAma_avoidance.py +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable.egg-info/SOURCES.txt +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable.egg-info/dependency_links.txt +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable.egg-info/requires.txt +0 -0
- {skrutable-2.6.0 → skrutable-2.6.2}/src/skrutable.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "2.6.2"
|
|
@@ -5,9 +5,13 @@ from skrutable.utils import _DEBUG_TIMING, _section_totals, timed
|
|
|
5
5
|
import re
|
|
6
6
|
import time as _time
|
|
7
7
|
from copy import copy
|
|
8
|
+
from concurrent.futures import ProcessPoolExecutor
|
|
8
9
|
from dataclasses import dataclass
|
|
9
10
|
from typing import Optional
|
|
10
11
|
|
|
12
|
+
BATCH_MAX_WORKERS = 5
|
|
13
|
+
BATCH_PARALLEL_THRESHOLD = 100
|
|
14
|
+
|
|
11
15
|
# load config variables
|
|
12
16
|
config = load_config_dict_from_json_file()
|
|
13
17
|
scansion_syllable_separator = config["scansion_syllable_separator"] # e.g. " "
|
|
@@ -50,10 +54,12 @@ def _verse_is_perfect(V):
|
|
|
50
54
|
return getattr(V, 'is_perfect', False)
|
|
51
55
|
|
|
52
56
|
|
|
53
|
-
def flush_profiling_report(write_file=False):
|
|
57
|
+
def flush_profiling_report(write_file=False, wall_clock_secs=None, parallel_workers=None):
|
|
54
58
|
"""Print the accumulated profiling table to stderr, then reset all counters.
|
|
55
59
|
|
|
56
60
|
Pass write_file=True to also write the table to profiling_debug.txt alongside the library source.
|
|
61
|
+
Pass wall_clock_secs to append a timing footer line.
|
|
62
|
+
Pass parallel_workers (int) to show worker count and parallelization speedup; omit or pass None for serial runs.
|
|
57
63
|
Safe to call even when _DEBUG_TIMING is False (no-op).
|
|
58
64
|
"""
|
|
59
65
|
if not _DEBUG_TIMING or not _category_totals:
|
|
@@ -124,6 +130,13 @@ def flush_profiling_report(write_file=False):
|
|
|
124
130
|
+ f'{total_scan:.2f}s'.rjust(sub_w)
|
|
125
131
|
+ f'{total_types:.2f}s'.rjust(sub_w)
|
|
126
132
|
+ ' ' + fmt_row(total_scan_vals, total_type_vals))
|
|
133
|
+
if wall_clock_secs is not None:
|
|
134
|
+
table_total = total_scan + total_types
|
|
135
|
+
if parallel_workers is not None:
|
|
136
|
+
speedup = table_total / wall_clock_secs if wall_clock_secs > 0 else float('inf')
|
|
137
|
+
lines.append(f'\n table total (CPU across {parallel_workers} workers, inflated by overhead): {table_total:.2f}s | wall-clock: {wall_clock_secs:.2f}s | apparent parallelization speedup: {speedup:.2f}x')
|
|
138
|
+
else:
|
|
139
|
+
lines.append(f'\n table total: {table_total:.2f}s | wall-clock: {wall_clock_secs:.2f}s')
|
|
127
140
|
block = '\n'.join(lines) + '\n'
|
|
128
141
|
if write_file:
|
|
129
142
|
timing_path = os.path.join(os.path.dirname(__file__), 'profiling_debug.txt')
|
|
@@ -1972,3 +1985,68 @@ class MeterIdentifier(object):
|
|
|
1972
1985
|
bucket['_perfect_count'] = bucket.get('_perfect_count', 0) + 1
|
|
1973
1986
|
|
|
1974
1987
|
return V
|
|
1988
|
+
|
|
1989
|
+
def identify_meter_batch(self, rw_strs,
|
|
1990
|
+
resplit_option=default_resplit_option,
|
|
1991
|
+
resplit_keep_midpoint=default_resplit_keep_midpoint,
|
|
1992
|
+
from_scheme=None):
|
|
1993
|
+
"""
|
|
1994
|
+
Parallel version of identify_meter() for a list of raw strings.
|
|
1995
|
+
|
|
1996
|
+
Spawns up to BATCH_MAX_WORKERS worker processes, one task per verse.
|
|
1997
|
+
Returns a list of Verse objects in the same order as the input.
|
|
1998
|
+
When _DEBUG_TIMING is on, merges per-verse timing dicts back into
|
|
1999
|
+
the main process's _category_totals so flush_profiling_report() works.
|
|
2000
|
+
Falls back to serial processing for small batches below BATCH_PARALLEL_THRESHOLD.
|
|
2001
|
+
"""
|
|
2002
|
+
if len(rw_strs) < BATCH_PARALLEL_THRESHOLD:
|
|
2003
|
+
return [self.identify_meter(s, resplit_option=resplit_option,
|
|
2004
|
+
resplit_keep_midpoint=resplit_keep_midpoint, from_scheme=from_scheme)
|
|
2005
|
+
for s in rw_strs]
|
|
2006
|
+
|
|
2007
|
+
args = [(s, resplit_option, resplit_keep_midpoint, from_scheme, _DEBUG_TIMING) for s in rw_strs]
|
|
2008
|
+
with ProcessPoolExecutor(max_workers=BATCH_MAX_WORKERS) as executor:
|
|
2009
|
+
results = list(executor.map(_identify_meter_worker, args))
|
|
2010
|
+
|
|
2011
|
+
if _DEBUG_TIMING:
|
|
2012
|
+
for V, verse_times, cat in results:
|
|
2013
|
+
_section_totals['wiggle_count'] = _section_totals.get('wiggle_count', 0) + verse_times.pop('wiggle_count', 0)
|
|
2014
|
+
bucket = _category_totals.setdefault(cat, {})
|
|
2015
|
+
for k, v in verse_times.items():
|
|
2016
|
+
bucket[k] = bucket.get(k, 0.0) + v
|
|
2017
|
+
bucket['_count'] = bucket.get('_count', 0) + 1
|
|
2018
|
+
if _verse_is_perfect(V):
|
|
2019
|
+
bucket['_perfect_count'] = bucket.get('_perfect_count', 0) + 1
|
|
2020
|
+
return [V for V, _, _ in results]
|
|
2021
|
+
|
|
2022
|
+
return results
|
|
2023
|
+
|
|
2024
|
+
|
|
2025
|
+
def _identify_meter_worker(args):
|
|
2026
|
+
"""Module-level worker function (must be picklable). One verse per call."""
|
|
2027
|
+
rw_str, resplit_option, resplit_keep_midpoint, from_scheme, debug_timing = args
|
|
2028
|
+
if debug_timing:
|
|
2029
|
+
import skrutable.utils as _utils
|
|
2030
|
+
_utils._DEBUG_TIMING = True
|
|
2031
|
+
import skrutable.meter_identification as _mi
|
|
2032
|
+
_mi._DEBUG_TIMING = True
|
|
2033
|
+
MI = MeterIdentifier()
|
|
2034
|
+
all_keys = ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana',
|
|
2035
|
+
'anuzwuB', 'samavftta', 'upajAti', 'vizamavftta',
|
|
2036
|
+
'ardhasamavftta_perfect', 'jAti', 'lev_samavftta', 'lev_ardha', 'lev_vizama', 'samavftta_etc')
|
|
2037
|
+
if debug_timing:
|
|
2038
|
+
pre = {k: _section_totals.get(k, 0.0) for k in all_keys}
|
|
2039
|
+
pre_wiggle = _section_totals.get('wiggle_count', 0)
|
|
2040
|
+
V = MI.identify_meter(
|
|
2041
|
+
rw_str,
|
|
2042
|
+
resplit_option=resplit_option,
|
|
2043
|
+
resplit_keep_midpoint=resplit_keep_midpoint,
|
|
2044
|
+
from_scheme=from_scheme,
|
|
2045
|
+
)
|
|
2046
|
+
if debug_timing:
|
|
2047
|
+
verse_times = {k: _section_totals.get(k, 0.0) - pre[k] for k in all_keys}
|
|
2048
|
+
verse_times['scan'] = sum(verse_times[k] for k in ('scan_clean', 'scan_translit', 'scan_syllabify', 'scan_weights', 'scan_morae_gana'))
|
|
2049
|
+
verse_times['wiggle_count'] = _section_totals.get('wiggle_count', 0) - pre_wiggle
|
|
2050
|
+
cat = _meter_label_to_category(V.meter_label)
|
|
2051
|
+
return V, verse_times, cat
|
|
2052
|
+
return V
|
|
@@ -33,6 +33,10 @@ For transliteration, 'consonant' means 'needs virāma if non-vowel follows' (no
|
|
|
33
33
|
SLP_consonants_for_scansion = SLP_consonants
|
|
34
34
|
"""For scansion, 'consonant' means 'contributes to heaviness of previous vowel' (yes M H)"""
|
|
35
35
|
|
|
36
|
+
SLP_vowels_set = set(SLP_vowels)
|
|
37
|
+
SLP_long_vowels_set = set(SLP_long_vowels)
|
|
38
|
+
SLP_consonants_for_scansion_set = set(SLP_consonants_for_scansion)
|
|
39
|
+
|
|
36
40
|
DEV_consonants = ['क', 'ख', 'ग', 'घ', 'ङ','च', 'छ', 'ज', 'झ', 'ञ',
|
|
37
41
|
'ट', 'ठ', 'ड', 'ढ', 'ण','त', 'थ', 'द', 'ध', 'न','प', 'फ', 'ब', 'भ', 'म',
|
|
38
42
|
'य', 'र', 'ल', 'व','श', 'ष', 'स', 'ह']
|
|
@@ -111,3 +115,5 @@ to_add = [' ', '\t', '\n']
|
|
|
111
115
|
for k in character_set.keys():
|
|
112
116
|
for c in to_add:
|
|
113
117
|
character_set[k].append(c)
|
|
118
|
+
|
|
119
|
+
character_set_lookup = {k: set(v) for k, v in character_set.items()}
|
|
@@ -6,6 +6,9 @@ from skrutable.config import load_config_dict_from_json_file
|
|
|
6
6
|
from skrutable.utils import timed
|
|
7
7
|
import re
|
|
8
8
|
|
|
9
|
+
_re_ws_around_newline = re.compile(r'[ \t]*\n[ \t]*')
|
|
10
|
+
_re_multi_newline = re.compile(r'\n+')
|
|
11
|
+
|
|
9
12
|
# load config variables
|
|
10
13
|
config = load_config_dict_from_json_file()
|
|
11
14
|
scansion_syllable_separator = config["scansion_syllable_separator"] # e.g. " "
|
|
@@ -160,8 +163,8 @@ class Scanner(object):
|
|
|
160
163
|
# filter out disallowed characters (numbers, irrelevant punctuation, etc.)
|
|
161
164
|
# pāda separator chars are preserved so they can be converted to \n below
|
|
162
165
|
pAda_sep_chars = set(c for sep in additional_pAda_separators for c in sep)
|
|
163
|
-
for c in
|
|
164
|
-
if c not in phonemes.
|
|
166
|
+
for c in set(cntnts):
|
|
167
|
+
if c not in phonemes.character_set_lookup[scheme_in] and c not in pAda_sep_chars:
|
|
165
168
|
cntnts = cntnts.replace(c, '')
|
|
166
169
|
|
|
167
170
|
# replace all pāda separator strings with newline
|
|
@@ -169,8 +172,8 @@ class Scanner(object):
|
|
|
169
172
|
cntnts = cntnts.replace(sep, '\n')
|
|
170
173
|
|
|
171
174
|
# strip horizontal whitespace around newlines, dedupe, strip leading/trailing
|
|
172
|
-
cntnts =
|
|
173
|
-
cntnts =
|
|
175
|
+
cntnts = _re_ws_around_newline.sub('\n', cntnts)
|
|
176
|
+
cntnts = _re_multi_newline.sub('\n', cntnts)
|
|
174
177
|
cntnts = cntnts.strip()
|
|
175
178
|
|
|
176
179
|
return cntnts
|
|
@@ -207,15 +210,15 @@ class Scanner(object):
|
|
|
207
210
|
# place scansion_syllable_separator after vowels
|
|
208
211
|
for letter in line:
|
|
209
212
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
if line_syllables[-1] == scansion_syllable_separator:
|
|
213
|
+
if letter in ('M', 'H'):
|
|
214
|
+
# M and H are explicit syllable codas: strip any trailing separator, append, re-add separator
|
|
215
|
+
if line_syllables and line_syllables[-1] == scansion_syllable_separator:
|
|
213
216
|
line_syllables = line_syllables[:-1]
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
line_syllables +=
|
|
217
|
+
line_syllables += letter + scansion_syllable_separator
|
|
218
|
+
elif letter in phonemes.SLP_vowels_set:
|
|
219
|
+
line_syllables += letter + scansion_syllable_separator
|
|
220
|
+
else:
|
|
221
|
+
line_syllables += letter
|
|
219
222
|
|
|
220
223
|
# e.g. 'ya.dA.ya.dA.hi.Da.rma.sya.glA.ni.rBa.va.ti.BA.ra.ta.'
|
|
221
224
|
# BUT e.g. 'a.Byu.tTA.na.ma.Da.rma.sya.ta.dA.tmA.na.Msf.jA.mya.ha.m'
|
|
@@ -256,8 +259,6 @@ class Scanner(object):
|
|
|
256
259
|
|
|
257
260
|
for line in text_lines:
|
|
258
261
|
|
|
259
|
-
line_weights = ''
|
|
260
|
-
|
|
261
262
|
syllables = line.split(scansion_syllable_separator)
|
|
262
263
|
|
|
263
264
|
try:
|
|
@@ -265,34 +266,37 @@ class Scanner(object):
|
|
|
265
266
|
syllables.pop(-1) # in case of final separator(s)
|
|
266
267
|
except IndexError: pass
|
|
267
268
|
|
|
269
|
+
line_weights_chars = []
|
|
270
|
+
|
|
268
271
|
for n, syllable in enumerate(syllables):
|
|
269
272
|
|
|
270
273
|
if (
|
|
271
274
|
# heavy by nature
|
|
272
|
-
syllable[-1] in phonemes.
|
|
275
|
+
syllable[-1] in phonemes.SLP_long_vowels_set or syllable[-1] in ('M', 'H')
|
|
273
276
|
|
|
274
277
|
or
|
|
275
278
|
|
|
276
279
|
# heavy by position:
|
|
277
280
|
# consonant closes syllable or next syllable begins with a cluster
|
|
278
|
-
syllable[-1] in
|
|
281
|
+
syllable[-1] in phonemes.SLP_consonants_for_scansion_set
|
|
279
282
|
or
|
|
280
283
|
n <= (len(syllables)-2)
|
|
281
284
|
and len(syllables[n+1]) > 1
|
|
282
|
-
and syllables[n+1][0] in
|
|
283
|
-
and syllables[n+1][1] in
|
|
285
|
+
and syllables[n+1][0] in phonemes.SLP_consonants_for_scansion_set
|
|
286
|
+
and syllables[n+1][1] in phonemes.SLP_consonants_for_scansion_set
|
|
287
|
+
|
|
284
288
|
|
|
285
289
|
):
|
|
286
290
|
|
|
287
|
-
|
|
288
|
-
#
|
|
291
|
+
line_weights_chars.append('g')
|
|
292
|
+
# line_weights_chars.append('g_')
|
|
289
293
|
# insofar as two 'l's can equal one 'g', could use this alternative for better visual alignment
|
|
290
294
|
|
|
291
295
|
else:
|
|
292
296
|
|
|
293
|
-
|
|
297
|
+
line_weights_chars.append('l')
|
|
294
298
|
|
|
295
|
-
weights_by_line.append(
|
|
299
|
+
weights_by_line.append(''.join(line_weights_chars))
|
|
296
300
|
|
|
297
301
|
syllable_weights = '\n'.join(weights_by_line) # restore newlines
|
|
298
302
|
return syllable_weights
|
|
@@ -325,21 +329,16 @@ class Scanner(object):
|
|
|
325
329
|
Returns string of 'gaRa'-trisyllable abbreviation, e.g. 'nml'.
|
|
326
330
|
"""
|
|
327
331
|
|
|
328
|
-
for c in
|
|
329
|
-
if c not in
|
|
332
|
+
for c in set(syl_wts):
|
|
333
|
+
if c not in {'l', 'g'}:
|
|
330
334
|
return None
|
|
331
335
|
|
|
332
|
-
|
|
333
|
-
overall_abbreviation = ''
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
weights_of_curr_gaRa += single_weight
|
|
337
|
-
if len(weights_of_curr_gaRa) == 3:
|
|
338
|
-
overall_abbreviation += meter_patterns.gaRas_by_weights[weights_of_curr_gaRa]
|
|
339
|
-
weights_of_curr_gaRa = ''
|
|
340
|
-
|
|
336
|
+
n = len(syl_wts) // 3 * 3
|
|
337
|
+
overall_abbreviation = ''.join(
|
|
338
|
+
meter_patterns.gaRas_by_weights[syl_wts[i:i+3]] for i in range(0, n, 3)
|
|
339
|
+
)
|
|
341
340
|
# leftover lights and heavies (l/g)
|
|
342
|
-
overall_abbreviation +=
|
|
341
|
+
overall_abbreviation += syl_wts[n:]
|
|
343
342
|
|
|
344
343
|
return overall_abbreviation
|
|
345
344
|
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "2.6.0"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|