glitchlings 0.3.0__cp312-cp312-win_amd64.whl → 0.4.0__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.

@@ -2,121 +2,47 @@ import random
 import re
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Literal, cast
-
-try:  # pragma: no cover - exercised in environments with NLTK installed
-    import nltk  # type: ignore[import]
-except ModuleNotFoundError as exc:  # pragma: no cover - triggered when NLTK missing
-    nltk = None  # type: ignore[assignment]
-    find = None  # type: ignore[assignment]
-    _NLTK_IMPORT_ERROR = exc
-else:  # pragma: no cover - executed when NLTK is available
-    from nltk.corpus.reader import WordNetCorpusReader as _WordNetCorpusReader  # type: ignore[import]
-    from nltk.data import find as _nltk_find  # type: ignore[import]
-
-    find = _nltk_find
-    _NLTK_IMPORT_ERROR = None
-
-if TYPE_CHECKING:  # pragma: no cover - typing aid only
-    from nltk.corpus.reader import WordNetCorpusReader  # type: ignore[import]
-else:  # Use ``Any`` at runtime to avoid hard dependency when NLTK missing
-    WordNetCorpusReader = Any
-
-if nltk is not None:  # pragma: no cover - guarded by import success
-    try:
-        from nltk.corpus import wordnet as _WORDNET_MODULE  # type: ignore[import]
-    except ModuleNotFoundError:  # pragma: no cover - only hit on namespace packages
-        _WORDNET_MODULE = None
-    else:
-        WordNetCorpusReader = _WordNetCorpusReader  # type: ignore[assignment]
-else:
-    _WORDNET_MODULE = None
+from typing import Any, Literal, cast
 
-from .core import AttackWave, Glitchling
-from ._rate import resolve_rate
-
-_WORDNET_HANDLE: WordNetCorpusReader | Any | None = _WORDNET_MODULE
-
-_wordnet_ready = False
+from glitchlings.lexicon import Lexicon, get_default_lexicon
 
+try:  # pragma: no cover - optional WordNet dependency
+    from glitchlings.lexicon.wordnet import (
+        WordNetLexicon,
+        dependencies_available as _lexicon_dependencies_available,
+        ensure_wordnet as _lexicon_ensure_wordnet,
+    )
+except Exception:  # pragma: no cover - triggered when nltk unavailable
+    WordNetLexicon = None  # type: ignore[assignment]
 
-def _require_nltk() -> None:
-    """Ensure the NLTK dependency is present before continuing."""
+    def _lexicon_dependencies_available() -> bool:
+        return False
 
-    if nltk is None or find is None:
-        message = (
-            "The NLTK package is required for the jargoyle glitchling; install "
-            "the 'wordnet' extra via `pip install glitchlings[wordnet]`."
+    def _lexicon_ensure_wordnet() -> None:
+        raise RuntimeError(
+            "The WordNet backend is no longer bundled by default. Install NLTK "
+            "and download its WordNet corpus manually if you need legacy synonyms."
         )
-        if '_NLTK_IMPORT_ERROR' in globals() and _NLTK_IMPORT_ERROR is not None:
-            raise RuntimeError(message) from _NLTK_IMPORT_ERROR
-        raise RuntimeError(message)
-
-
-def dependencies_available() -> bool:
-    """Return ``True`` when the runtime NLTK dependency is present."""
-
-    return nltk is not None and find is not None
-
-
-def _load_wordnet_reader() -> WordNetCorpusReader:
-    """Return a WordNet corpus reader from the downloaded corpus files."""
 
-    _require_nltk()
-
-    try:
-        root = find("corpora/wordnet")
-    except LookupError:
-        try:
-            zip_root = find("corpora/wordnet.zip")
-        except LookupError as exc:
-            raise RuntimeError(
-                "The NLTK WordNet corpus is not installed; run `nltk.download('wordnet')`."
-            ) from exc
-        root = zip_root.join("wordnet/")
-
-    return WordNetCorpusReader(root, None)
-
-
-def _wordnet(force_refresh: bool = False) -> WordNetCorpusReader | Any:
-    """Retrieve the active WordNet handle, rebuilding it on demand."""
-
-    global _WORDNET_HANDLE
-
-    if force_refresh:
-        _WORDNET_HANDLE = _WORDNET_MODULE
-
-    if _WORDNET_HANDLE is not None:
-        return _WORDNET_HANDLE
-
-    _WORDNET_HANDLE = _load_wordnet_reader()
-    return _WORDNET_HANDLE
 
+from ._rate import resolve_rate
+from .core import AttackWave, Glitchling
 
-def ensure_wordnet() -> None:
-    """Ensure the WordNet corpus is available before use."""
+ensure_wordnet = _lexicon_ensure_wordnet
 
-    global _wordnet_ready
-    if _wordnet_ready:
-        return
 
-    _require_nltk()
+def dependencies_available() -> bool:
+    """Return ``True`` when a synonym backend is accessible."""
 
-    resource = _wordnet()
+    if _lexicon_dependencies_available():
+        return True
 
     try:
-        resource.ensure_loaded()
-    except LookupError:
-        nltk.download("wordnet", quiet=True)
-        try:
-            resource = _wordnet(force_refresh=True)
-            resource.ensure_loaded()
-        except LookupError as exc:  # pragma: no cover - only triggered when download fails
-            raise RuntimeError(
-                "Unable to load NLTK WordNet corpus for the jargoyle glitchling."
-            ) from exc
-
-    _wordnet_ready = True
+        # Fall back to the configured default lexicon (typically the bundled vector cache).
+        get_default_lexicon(seed=None)
+    except Exception:
+        return False
+    return True
 
 
 # Backwards compatibility for callers relying on the previous private helper name.
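In 0.4.0 the module no longer imports NLTK directly: synonym lookup goes through `glitchlings.lexicon`, and the WordNet backend is only wired in if `glitchlings.lexicon.wordnet` imports cleanly. A minimal sketch of how a caller can probe for a usable backend using only the public names visible in this hunk (it simply mirrors the new `dependencies_available()` fallback; nothing here is an additional API):

```python
from glitchlings.lexicon import get_default_lexicon


def synonym_backend_available() -> bool:
    # Mirrors the rewritten dependencies_available(): try to build the
    # configured default lexicon (typically the packaged vector cache) and
    # report failure instead of raising.
    try:
        get_default_lexicon(seed=None)
    except Exception:
        return False
    return True
```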
@@ -140,7 +66,9 @@ def _split_token(token: str) -> tuple[str, str, str]:
     return prefix, core, suffix
 
 
-def _normalize_parts_of_speech(part_of_speech: PartOfSpeechInput) -> NormalizedPartsOfSpeech:
+def _normalize_parts_of_speech(
+    part_of_speech: PartOfSpeechInput,
+) -> NormalizedPartsOfSpeech:
     """Coerce user input into a tuple of valid WordNet POS tags."""
 
     if isinstance(part_of_speech, str):
@@ -173,41 +101,8 @@ class CandidateInfo:
     prefix: str
     core_word: str
     suffix: str
-    parts_of_speech: NormalizedPartsOfSpeech
-
-
-def _collect_synonyms(
-    word: str, parts_of_speech: NormalizedPartsOfSpeech
-) -> list[str]:
-    """Gather deterministic synonym candidates for the supplied word."""
-
-    normalized_word = word.lower()
-    wordnet = _wordnet()
-    synonyms: set[str] = set()
-    for pos_tag in parts_of_speech:
-        synsets = wordnet.synsets(word, pos=pos_tag)
-        if not synsets:
-            continue
-
-        for synset in synsets:
-            lemmas_list = [lemma.name() for lemma in cast(Any, synset).lemmas()]
-            if not lemmas_list:
-                continue
-
-            filtered = []
-            for lemma_str in lemmas_list:
-                cleaned = lemma_str.replace("_", " ")
-                if cleaned.lower() != normalized_word:
-                    filtered.append(cleaned)
-
-            if filtered:
-                synonyms.update(filtered)
-                break
-
-        if synonyms:
-            break
-
-    return sorted(synonyms)
+    part_of_speech: str | None
+    synonyms: list[str]
 
 
 def substitute_random_synonyms(
@@ -218,22 +113,27 @@ def substitute_random_synonyms(
     rng: random.Random | None = None,
     *,
     replacement_rate: float | None = None,
+    lexicon: Lexicon | None = None,
 ) -> str:
-    """Replace words with random WordNet synonyms.
+    """Replace words with random lexicon-driven synonyms.
 
     Parameters
     - text: Input text.
-    - rate: Max proportion of candidate words to replace (default 0.1).
+    - rate: Max proportion of candidate words to replace (default 0.01).
     - part_of_speech: WordNet POS tag(s) to target. Accepts "n", "v", "a", "r",
-      any iterable of those tags, or "any" to include all four.
+      any iterable of those tags, or "any" to include all four. Backends that do
+      not differentiate parts of speech simply ignore the setting.
     - rng: Optional RNG instance used for deterministic sampling.
    - seed: Optional seed if `rng` not provided.
+    - lexicon: Optional :class:`~glitchlings.lexicon.Lexicon` implementation to
+      supply synonyms. Defaults to the configured lexicon priority, typically the
+      packaged vector cache.
 
     Determinism
     - Candidates collected in left-to-right order; no set() reordering.
     - Replacement positions chosen via rng.sample.
-    - Synonyms sorted before rng.choice to fix ordering.
-    - For each POS, the first synset containing alternate lemmas is used for stability.
+    - Synonyms sourced through the lexicon; the default backend derives
+      deterministic subsets per word and part-of-speech using the active seed.
     """
     effective_rate = resolve_rate(
         rate=rate,
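Custom backends can be passed through the new `lexicon` argument. A hedged, duck-typed sketch follows; the real `Lexicon` base class may require more members than the four this hunk exercises (`seed`, `reseed`, `supports_pos`, `get_synonyms`), so treat it purely as an illustration:

```python
class TinyLexicon:
    """Toy synonym backend: fixed table, no part-of-speech awareness."""

    def __init__(self, seed: int | None = None) -> None:
        self.seed = seed
        self._table = {"quick": ["rapid", "speedy"], "dog": ["hound"]}

    def reseed(self, seed: int | None) -> None:
        self.seed = seed

    def supports_pos(self, pos: str | None) -> bool:
        return pos is None  # POS-agnostic lookups only

    def get_synonyms(self, word: str, pos: str | None = None) -> list[str]:
        return list(self._table.get(word.lower(), []))


# substitute_random_synonyms("The quick dog", rate=1.0, seed=7, lexicon=TinyLexicon())
# would then deterministically swap "quick" and "dog" using the table above.
```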
@@ -242,68 +142,106 @@ def substitute_random_synonyms(
         legacy_name="replacement_rate",
     )
 
-    ensure_wordnet()
-    wordnet = _wordnet()
-
     active_rng: random.Random
     if rng is not None:
         active_rng = rng
     else:
         active_rng = random.Random(seed)
 
-    target_pos = _normalize_parts_of_speech(part_of_speech)
+    active_lexicon: Lexicon
+    restore_lexicon_seed = False
+    original_lexicon_seed: int | None = None
 
-    # Split but keep whitespace separators so we can rebuild easily
-    tokens = re.split(r"(\s+)", text)
+    if lexicon is None:
+        active_lexicon = get_default_lexicon(seed=seed)
+    else:
+        active_lexicon = lexicon
+        if seed is not None:
+            original_lexicon_seed = active_lexicon.seed
+            if original_lexicon_seed != seed:
+                active_lexicon.reseed(seed)
+                restore_lexicon_seed = True
 
-    # Collect indices of candidate tokens (even positions 0,2,.. are words given our split design)
-    candidate_indices: list[int] = []
-    candidate_metadata: dict[int, CandidateInfo] = {}
-    for idx, tok in enumerate(tokens):
-        if idx % 2 == 0 and tok and not tok.isspace():
-            prefix, core_word, suffix = _split_token(tok)
-            if not core_word:
+    try:
+        target_pos = _normalize_parts_of_speech(part_of_speech)
+
+        # Split but keep whitespace separators so we can rebuild easily
+        tokens = re.split(r"(\s+)", text)
+
+        # Collect indices of candidate tokens (even positions 0,2,.. are words given our split design)
+        candidate_indices: list[int] = []
+        candidate_metadata: dict[int, CandidateInfo] = {}
+        for idx, tok in enumerate(tokens):
+            if idx % 2 == 0 and tok and not tok.isspace():
+                prefix, core_word, suffix = _split_token(tok)
+                if not core_word:
+                    continue
+
+                chosen_pos: str | None = None
+                synonyms: list[str] = []
+
+                for pos in target_pos:
+                    if not active_lexicon.supports_pos(pos):
+                        continue
+                    synonyms = active_lexicon.get_synonyms(core_word, pos=pos)
+                    if synonyms:
+                        chosen_pos = pos
+                        break
+
+                if not synonyms and active_lexicon.supports_pos(None):
+                    synonyms = active_lexicon.get_synonyms(core_word, pos=None)
+
+                if synonyms:
+                    candidate_indices.append(idx)
+                    candidate_metadata[idx] = CandidateInfo(
+                        prefix=prefix,
+                        core_word=core_word,
+                        suffix=suffix,
+                        part_of_speech=chosen_pos,
+                        synonyms=synonyms,
+                    )
+
+        if not candidate_indices:
+            return text
+
+        clamped_rate = max(0.0, effective_rate)
+        if clamped_rate == 0.0:
+            return text
+
+        population = len(candidate_indices)
+        effective_fraction = min(clamped_rate, 1.0)
+        expected_replacements = population * effective_fraction
+        max_replacements = int(expected_replacements)
+        remainder = expected_replacements - max_replacements
+        if remainder > 0.0 and active_rng.random() < remainder:
+            max_replacements += 1
+        if clamped_rate >= 1.0:
+            max_replacements = population
+        max_replacements = min(population, max_replacements)
+        if max_replacements <= 0:
+            return text
+
+        # Choose which positions to replace deterministically via rng.sample
+        replace_positions = active_rng.sample(candidate_indices, k=max_replacements)
+        # Process in ascending order to avoid affecting later indices
+        replace_positions.sort()
+
+        for pos in replace_positions:
+            metadata = candidate_metadata[pos]
+            if not metadata.synonyms:
                 continue
 
-            available_pos: NormalizedPartsOfSpeech = tuple(
-                pos for pos in target_pos if wordnet.synsets(core_word, pos=pos)
-            )
-            if available_pos:
-                candidate_indices.append(idx)
-                candidate_metadata[idx] = CandidateInfo(
-                    prefix=prefix,
-                    core_word=core_word,
-                    suffix=suffix,
-                    parts_of_speech=available_pos,
-                )
-
-    if not candidate_indices:
-        return text
-
-    clamped_rate = max(0.0, effective_rate)
-    max_replacements = int(len(candidate_indices) * clamped_rate)
-    if max_replacements <= 0:
-        return text
-
-    # Choose which positions to replace deterministically via rng.sample
-    replace_positions = active_rng.sample(candidate_indices, k=max_replacements)
-    # Process in ascending order to avoid affecting later indices
-    replace_positions.sort()
+            replacement = active_rng.choice(metadata.synonyms)
+            tokens[pos] = f"{metadata.prefix}{replacement}{metadata.suffix}"
 
-    for pos in replace_positions:
-        metadata = candidate_metadata[pos]
-        synonyms = _collect_synonyms(metadata.core_word, metadata.parts_of_speech)
-        if not synonyms:
-            continue
-
-        replacement = active_rng.choice(synonyms)
-        tokens[pos] = f"{metadata.prefix}{replacement}{metadata.suffix}"
-
-    return "".join(tokens)
+        return "".join(tokens)
+    finally:
+        if restore_lexicon_seed:
+            active_lexicon.reseed(original_lexicon_seed)
 
 
 class Jargoyle(Glitchling):
-    """Glitchling that swaps words with random WordNet synonyms."""
+    """Glitchling that swaps words with lexicon-driven synonyms."""
 
     def __init__(
         self,
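The replacement quota now uses stochastic rounding instead of a plain floor: the fractional part of `population * rate` becomes the probability of one extra replacement, so small rates on short texts can still fire. A worked sketch of that arithmetic, mirroring the added lines above:

```python
import random


def replacement_quota(population: int, rate: float, rng: random.Random) -> int:
    # Floor the expected count, then add one replacement with probability
    # equal to the fractional remainder; a rate of 1.0 or more covers everything.
    fraction = min(max(0.0, rate), 1.0)
    expected = population * fraction
    quota = int(expected)
    remainder = expected - quota
    if remainder > 0.0 and rng.random() < remainder:
        quota += 1
    if fraction >= 1.0:
        quota = population
    return min(population, quota)


# e.g. 7 candidates at rate=0.1 -> expected 0.7, so the quota is 1 about 70% of
# runs and 0 otherwise; the old code floored this to 0 and never replaced anything.
```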
@@ -312,22 +250,74 @@ class Jargoyle(Glitchling):
         replacement_rate: float | None = None,
         part_of_speech: PartOfSpeechInput = "n",
         seed: int | None = None,
+        lexicon: Lexicon | None = None,
     ) -> None:
         self._param_aliases = {"replacement_rate": "rate"}
+        self._owns_lexicon = lexicon is None
+        self._external_lexicon_original_seed = (
+            lexicon.seed if isinstance(lexicon, Lexicon) else None
+        )
+        self._initializing = True
         effective_rate = resolve_rate(
             rate=rate,
             legacy_value=replacement_rate,
-            default=0.1,
+            default=0.01,
             legacy_name="replacement_rate",
         )
-        super().__init__(
-            name="Jargoyle",
-            corruption_function=substitute_random_synonyms,
-            scope=AttackWave.WORD,
-            seed=seed,
-            rate=effective_rate,
-            part_of_speech=part_of_speech,
-        )
+        prepared_lexicon = lexicon or get_default_lexicon(seed=seed)
+        if lexicon is not None and seed is not None:
+            prepared_lexicon.reseed(seed)
+        try:
+            super().__init__(
+                name="Jargoyle",
+                corruption_function=substitute_random_synonyms,
+                scope=AttackWave.WORD,
+                seed=seed,
+                rate=effective_rate,
+                part_of_speech=part_of_speech,
+                lexicon=prepared_lexicon,
+            )
+        finally:
+            self._initializing = False
+
+    def set_param(self, key: str, value: Any) -> None:
+        super().set_param(key, value)
+
+        aliases = getattr(self, "_param_aliases", {})
+        canonical = aliases.get(key, key)
+
+        if canonical == "seed":
+            current_lexicon = getattr(self, "lexicon", None)
+            if isinstance(current_lexicon, Lexicon):
+                if getattr(self, "_owns_lexicon", False):
+                    current_lexicon.reseed(self.seed)
+                else:
+                    if self.seed is not None:
+                        current_lexicon.reseed(self.seed)
+                    else:
+                        if hasattr(self, "_external_lexicon_original_seed"):
+                            original_seed = getattr(
+                                self, "_external_lexicon_original_seed", None
+                            )
+                            current_lexicon.reseed(original_seed)
+        elif canonical == "lexicon" and isinstance(value, Lexicon):
+            if getattr(self, "_initializing", False):
+                if getattr(self, "_owns_lexicon", False):
+                    if self.seed is not None:
+                        value.reseed(self.seed)
+                else:
+                    if getattr(self, "_external_lexicon_original_seed", None) is None:
+                        self._external_lexicon_original_seed = value.seed
+                    if self.seed is not None:
+                        value.reseed(self.seed)
+                return
+
+            self._owns_lexicon = False
+            self._external_lexicon_original_seed = value.seed
+            if self.seed is not None:
+                value.reseed(self.seed)
+            elif value.seed != self._external_lexicon_original_seed:
+                value.reseed(self._external_lexicon_original_seed)
 
 
 jargoyle = Jargoyle()
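A hedged construction sketch; the import location of `Jargoyle` is assumed (the diff only shows the class and the module-level `jargoyle` instance, not the package layout), and only constructor arguments visible in this hunk are used:

```python
# Assumed import path -- not shown in this diff.
from glitchlings import Jargoyle

# Jargoyle owns its lexicon here: the default lexicon is built with seed=99
# and kept in sync whenever set_param("seed", ...) changes the glitchling seed.
j = Jargoyle(rate=0.05, part_of_speech="n", seed=99)

# With an externally supplied lexicon, Jargoyle reseeds it to the glitchling's
# seed while in use; clearing the seed later restores the lexicon's own seed.
# j = Jargoyle(seed=99, lexicon=my_lexicon)
```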
@@ -3,10 +3,11 @@ import random
 from typing import Any
 
 from ._rate import resolve_rate
+from ._sampling import weighted_sample_without_replacement
 from ._text_utils import (
+    WordToken,
+    collect_word_tokens,
     split_preserving_whitespace,
-    split_token_edges,
-    token_core_length,
 )
 from .core import AttackWave, Glitchling
 
@@ -19,41 +20,6 @@ except ImportError:  # pragma: no cover - compiled extension not present
     _redact_words_rust = None
 
 
-def _weighted_sample_without_replacement(
-    population: list[int],
-    weights: list[float],
-    *,
-    k: int,
-    rng: random.Random,
-) -> list[int]:
-    """Select `k` unique indices according to the given weights."""
-
-    selections: list[int] = []
-    items = list(zip(population, weights))
-    if k <= 0 or not items:
-        return selections
-    if k > len(items):
-        raise ValueError("Sample larger than population or is negative")
-
-    for _ in range(k):
-        total_weight = sum(weight for _, weight in items)
-        if total_weight <= 0:
-            chosen_index = rng.randrange(len(items))
-        else:
-            threshold = rng.random() * total_weight
-            cumulative = 0.0
-            chosen_index = len(items) - 1
-            for idx, (_, weight) in enumerate(items):
-                cumulative += weight
-                if cumulative >= threshold:
-                    chosen_index = idx
-                    break
-        value, _ = items.pop(chosen_index)
-        selections.append(value)
-
-    return selections
-
-
 def _python_redact_words(
     text: str,
     *,
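The deleted `_weighted_sample_without_replacement` helper now lives in a shared `_sampling` module (see the new import above). Judging by the removed body, it performs sequential weighted sampling without replacement; a usage sketch under that assumption, with the import path inferred from the relative `from ._sampling import ...`:

```python
import random

from glitchlings._sampling import weighted_sample_without_replacement  # assumed path

rng = random.Random(0)
# Indices weighted unevenly: heavier entries are drawn earlier more often.
picked = weighted_sample_without_replacement(
    [0, 1, 2, 3],
    [1.0, 4.0, 2.0, 3.0],
    k=2,
    rng=rng,
)
# `picked` holds two distinct indices. Per the removed body, an all-zero weight
# vector falls back to uniform draws, and k > len(population) raises ValueError.
```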
@@ -74,39 +40,45 @@ def _python_redact_words(
     - unweighted: When True, sample words uniformly instead of by length.
     """
     tokens = split_preserving_whitespace(text)
-    word_indices = [i for i, token in enumerate(tokens) if i % 2 == 0 and token.strip()]
-    if not word_indices:
+    word_tokens = collect_word_tokens(tokens)
+    if not word_tokens:
         raise ValueError(
             "Cannot redact words because the input text contains no redactable words."
         )
-    weights: list[float] = []
-    for index in word_indices:
-        word = tokens[index]
-        length = token_core_length(word)
-        weights.append(1.0 if unweighted else float(length))
-    raw_quota = len(word_indices) * rate
+
+    population = [token.index for token in word_tokens]
+    weights = [
+        1.0 if unweighted else float(token.core_length) for token in word_tokens
+    ]
+
+    clamped_rate = max(0.0, min(rate, 1.0))
+    raw_quota = len(population) * clamped_rate
     num_to_redact = int(raw_quota)
-    if rate > 0:
+    if clamped_rate > 0.0:
         num_to_redact = max(1, num_to_redact)
-    if num_to_redact > len(word_indices):
-        raise ValueError("Sample larger than population or is negative")
-    indices_to_redact = _weighted_sample_without_replacement(
-        word_indices,
+    num_to_redact = min(num_to_redact, len(population))
+    if num_to_redact <= 0:
+        return "".join(tokens)
+
+    indices_to_redact = weighted_sample_without_replacement(
+        population,
         weights,
         k=num_to_redact,
         rng=rng,
     )
     indices_to_redact.sort()
 
+    token_by_index: dict[int, WordToken] = {token.index: token for token in word_tokens}
+
     for i in indices_to_redact:
         if i >= len(tokens):
             break
 
-        word = tokens[i]
-        if not word or word.isspace():
+        token = token_by_index.get(i)
+        if token is None:
             continue
 
-        prefix, core, suffix = split_token_edges(word)
+        prefix, core, suffix = token.prefix, token.core, token.suffix
         tokens[i] = f"{prefix}{replacement_char * len(core)}{suffix}"
 
     text = "".join(tokens)
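The redaction quota rules also changed: the rate is clamped to [0, 1], an over-large quota is capped instead of raising, and a zero quota now returns the text unchanged. A small worked example of the new arithmetic, using plain numbers rather than the package API:

```python
WORDS = 5  # number of redactable words in some text


def redaction_quota(rate: float) -> int:
    clamped = max(0.0, min(rate, 1.0))
    quota = int(WORDS * clamped)
    if clamped > 0.0:
        quota = max(1, quota)   # any positive rate redacts at least one word
    return min(quota, WORDS)    # never more words than exist (old code raised)


assert redaction_quota(0.0) == 0
assert redaction_quota(0.1) == 1   # 0.5 floors to 0, then bumped to the 1-word minimum
assert redaction_quota(2.0) == 5   # rates above 1.0 are clamped, not an error
```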
@@ -144,7 +116,7 @@ def redact_words(
     if rng is None:
         rng = random.Random(seed)
 
-    clamped_rate = max(0.0, effective_rate)
+    clamped_rate = max(0.0, min(effective_rate, 1.0))
     unweighted_flag = bool(unweighted)
 
     use_rust = _redact_words_rust is not None and isinstance(merge_adjacent, bool)
@@ -2,11 +2,7 @@ import random
 from typing import Any
 
 from ._rate import resolve_rate
-from ._text_utils import (
-    split_preserving_whitespace,
-    split_token_edges,
-    token_core_length,
-)
+from ._text_utils import WordToken, collect_word_tokens, split_preserving_whitespace
 from .core import AttackWave, Glitchling
 
 try:
@@ -35,29 +31,23 @@ def _python_reduplicate_words(
     - Deterministic when run with a fixed seed or via Gaggle.
     """
     tokens = split_preserving_whitespace(text)
+    word_tokens = collect_word_tokens(tokens)
 
-    candidate_weights: list[tuple[int, float]] = []
-    for i in range(0, len(tokens), 2):
-        word = tokens[i]
-        if not word or word.isspace():
-            continue
-
-        length = token_core_length(word)
-        weight = 1.0 if unweighted else 1.0 / length
-        candidate_weights.append((i, weight))
+    weighted_tokens: list[tuple[int, float, WordToken]] = []
+    for token in word_tokens:
+        weight = 1.0 if unweighted else 1.0 / float(token.core_length)
+        weighted_tokens.append((token.index, weight, token))
 
-    if not candidate_weights:
+    if not weighted_tokens:
         return "".join(tokens)
 
     effective_rate = max(rate, 0.0)
     if effective_rate <= 0.0:
         return "".join(tokens)
 
-    mean_weight = sum(weight for _, weight in candidate_weights) / len(
-        candidate_weights
-    )
+    mean_weight = sum(weight for _, weight, _ in weighted_tokens) / len(weighted_tokens)
 
-    for index, weight in candidate_weights:
+    for index, weight, token in weighted_tokens:
         if effective_rate >= 1.0:
             probability = 1.0
         else:
@@ -68,8 +58,7 @@ def _python_reduplicate_words(
         if rng.random() >= probability:
             continue
 
-        word = tokens[index]
-        prefix, core, suffix = split_token_edges(word)
+        prefix, core, suffix = token.prefix, token.core, token.suffix
         tokens[index] = f"{prefix}{core} {core}{suffix}"
     return "".join(tokens)
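Reduplication keeps its inverse-length weighting, now read from the shared `WordToken` records: short words carry larger weights than long ones unless `unweighted` is set, and the per-token probability (computed outside this hunk from `effective_rate`, `weight`, and `mean_weight`) is driven by those weights. A tiny sketch of the weight arithmetic only:

```python
# Inverse-length weights as computed in _python_reduplicate_words above.
# core_length is assumed to be the token length without surrounding punctuation,
# which for these bare words equals len(core).
cores = ["a", "of", "reduplication"]
weights = [1.0 / len(core) for core in cores]   # [1.0, 0.5, ~0.077]
mean_weight = sum(weights) / len(weights)       # ~0.526

# "a" carries roughly 13x the weight of "reduplication", so short function
# words are the ones most likely to be doubled at a given rate.
```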