indic-tokenizer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indic_tokenizer/__init__.py +5 -0
- indic_tokenizer/__main__.py +8 -0
- indic_tokenizer/bpe_tokenizer.py +375 -0
- indic_tokenizer/bpe_trainer.py +196 -0
- indic_tokenizer/constants.py +103 -0
- indic_tokenizer/data_loader.py +258 -0
- indic_tokenizer/preprocessor.py +131 -0
- indic_tokenizer/vocab_builder.py +119 -0
- indic_tokenizer-0.1.0.dist-info/METADATA +888 -0
- indic_tokenizer-0.1.0.dist-info/RECORD +13 -0
- indic_tokenizer-0.1.0.dist-info/WHEEL +5 -0
- indic_tokenizer-0.1.0.dist-info/licenses/LICENSE +201 -0
- indic_tokenizer-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
"""
|
|
2
|
+
IndicBPETokenizer — main public class for BPE-based Indic tokenization.
|
|
3
|
+
|
|
4
|
+
Space / newline convention (Claude Sonnet style)
|
|
5
|
+
--------------------------------------------------
|
|
6
|
+
Spaces and newlines are kept as *literal* characters inside tokens.
|
|
7
|
+
A token like " नमस्ते" contains a real leading space — no Ġ/Ċ substitution.
|
|
8
|
+
Decoding is therefore a trivial string join with no post-processing.
|
|
9
|
+
|
|
10
|
+
Responsibilities
|
|
11
|
+
----------------
|
|
12
|
+
train() — learn BPE merges from a corpus (small / medium datasets)
|
|
13
|
+
train_from_file() — convenience wrapper that auto-loads any supported format
|
|
14
|
+
encode() — text → List[int] (with optional special-token pass-through)
|
|
15
|
+
decode() — List[int] → text (plain string join)
|
|
16
|
+
save() — persist vocab + merges to two JSON files
|
|
17
|
+
load() — restore from those JSON files
|
|
18
|
+
|
|
19
|
+
For corpora > ~1 GB use ChunkedBPETrainer (chunked_trainer.py) instead of
|
|
20
|
+
train(), which stores the full token ID sequence in RAM.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import json
|
|
24
|
+
import re
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Dict, List, Optional, Set, Tuple, Union
|
|
27
|
+
|
|
28
|
+
from .bpe_trainer import BPETrainer, MergesType, PairType
|
|
29
|
+
from .constants import SPECIAL_TOKENS
|
|
30
|
+
from .data_loader import IndicDataLoader
|
|
31
|
+
from .preprocessor import pretokenize
|
|
32
|
+
from .vocab_builder import VocabBuilder, InverseVocabType, VocabType
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class IndicBPETokenizer:
    """
    Byte-Pair Encoding tokenizer for Indian language text.

    Supports all major Indic scripts: Devanagari, Bengali, Gurmukhi,
    Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam, Sinhala, Tibetan,
    as well as ASCII and mixed-script text.

    State
    -----
    vocab           id → token string
    inverse_vocab   token string → id
    bpe_merges      (left_id, right_id) → merged_id.  The trainer assigns
                    merged ids in increasing order, so a smaller merged_id
                    identifies an earlier-learned merge.

    Usage
    -----
    >>> tok = IndicBPETokenizer()
    >>> tok.train(text, vocab_size=16_000)
    >>> ids = tok.encode("नमस्ते दुनिया")
    >>> tok.decode(ids)
    'नमस्ते दुनिया'
    >>> tok.save("vocab.json", "merges.json")

    >>> tok2 = IndicBPETokenizer()
    >>> tok2.load("vocab.json", "merges.json")
    """

    def __init__(self) -> None:
        self.vocab: VocabType = {}
        self.inverse_vocab: InverseVocabType = {}
        self.bpe_merges: MergesType = {}

        self._vocab_builder = VocabBuilder()
        self._trainer = BPETrainer()

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def vocab_size(self) -> int:
        """Current number of tokens in the vocabulary."""
        return len(self.vocab)

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------

    def train(
        self,
        text: str,
        vocab_size: int,
        special_tokens: Optional[List[str]] = None,
        verbose: bool = True,
    ) -> None:
        """
        Train BPE from *text* using sequence-based BPE.

        The vocabulary is rebuilt from scratch (base vocab, then characters
        found in *text*, then special tokens) before merges are learned, so
        any previously trained or loaded state is replaced.

        Args:
            text: Raw training corpus (any Indic language or mixed).
            vocab_size: Target vocabulary size.
            special_tokens: Tokens to register instead of SPECIAL_TOKENS.
            verbose: Print progress information and show a tqdm bar when
                tqdm is installed.  With ``verbose=False`` nothing is
                printed and no progress bar is created.

        Note:
            The full token-ID sequence for *text* is held in memory while
            training.
        """
        if special_tokens is None:
            special_tokens = list(SPECIAL_TOKENS)

        # Seed vocab: base characters + corpus characters + specials
        self.vocab, self.inverse_vocab = self._vocab_builder.build_base_vocab()
        self._vocab_builder.extend_from_text(text, self.vocab, self.inverse_vocab)
        self._vocab_builder.add_special_tokens(
            self.vocab, self.inverse_vocab, special_tokens
        )

        if verbose:
            print(f"Base vocab: {self.vocab_size:,} tokens | Target: {vocab_size:,}")
            if len(text) > 50_000_000:
                print(
                    f"WARNING: corpus is {len(text):,} chars. Sequence-based BPE "
                    "stores the full token ID list in RAM. Consider ChunkedBPETrainer "
                    "for large corpora (chunked_trainer.py)."
                )

        # Only wrap the corpus in a progress bar when the caller asked for
        # output; tqdm stays an optional dependency.
        char_iter = text
        if verbose:
            try:
                from tqdm import tqdm  # type: ignore[import]
            except ImportError:
                pass
            else:
                char_iter = tqdm(  # type: ignore[assignment]
                    text,
                    desc="Encoding corpus",
                    unit=" chars",
                    unit_scale=True,
                    total=len(text),
                )

        # Encode every character → its token ID
        char_to_id = self.inverse_vocab
        token_ids: List[int] = [char_to_id[c] for c in char_iter]

        self.bpe_merges = self._trainer.train(
            token_ids,
            self.vocab,
            self.inverse_vocab,
            target_vocab_size=vocab_size,
            verbose=verbose,
        )

        if verbose:
            print(f"Training complete. Final vocab size: {self.vocab_size:,}")

    def train_from_file(
        self,
        path: Union[str, Path],
        vocab_size: int,
        text_column: str = "text",
        special_tokens: Optional[List[str]] = None,
        max_samples: Optional[int] = None,
        verbose: bool = True,
    ) -> None:
        """
        Load a corpus from *path* with IndicDataLoader, then call train().

        Args:
            path: Path to the corpus file.
            vocab_size: Target vocabulary size.
            text_column: Column / key for text in tabular / JSON files.
            special_tokens: Override the default special tokens.
            max_samples: Cap on rows / documents loaded (None → all).
            verbose: Show training progress.
        """
        loader = IndicDataLoader(text_column=text_column, max_samples=max_samples)
        text = loader.load(path)
        self.train(
            text,
            vocab_size=vocab_size,
            special_tokens=special_tokens,
            verbose=verbose,
        )

    # ------------------------------------------------------------------
    # Encoding
    # ------------------------------------------------------------------

    def encode(
        self,
        text: str,
        allowed_special: Optional[Set[str]] = None,
    ) -> List[int]:
        """
        Encode *text* into a list of token IDs.

        Args:
            text: Input string (any Indic script, ASCII, special tokens, mixed).
            allowed_special: Set of special tokens to recognise and pass through
                as single IDs.  Pass set() to disable.
                Defaults to all tokens in SPECIAL_TOKENS.

        Returns:
            List of integer token IDs.

        Raises:
            ValueError: If an allowed special token occurs in *text* but is
                missing from the vocabulary, or if a character is unknown
                and no ``<|unk|>`` token is defined.
        """
        if allowed_special is None:
            allowed_special = set(SPECIAL_TOKENS)

        if not allowed_special:
            return self._encode_ordinary(text)

        token_ids: List[int] = []
        for segment, is_special in self._split_on_special_tokens(
            text, allowed_special
        ):
            if not is_special:
                token_ids.extend(self._encode_ordinary(segment))
                continue
            tid = self.inverse_vocab.get(segment)
            if tid is None:
                raise ValueError(
                    f"Special token {segment!r} not found in vocabulary."
                )
            token_ids.append(tid)
        return token_ids

    def _encode_ordinary(self, text: str) -> List[int]:
        """Encode plain text (no special-token detection)."""
        token_ids: List[int] = []
        for chunk in pretokenize(text):
            # A chunk that is already a whole vocabulary entry needs no merging.
            tid = self.inverse_vocab.get(chunk)
            if tid is not None:
                token_ids.append(tid)
            else:
                token_ids.extend(self._apply_bpe(chunk))
        return token_ids

    # ------------------------------------------------------------------
    # Decoding
    # ------------------------------------------------------------------

    def decode(self, token_ids: List[int]) -> str:
        """
        Decode a list of token IDs back into a string.

        Token strings are concatenated as-is; no marker substitution or
        other post-processing is applied.

        Args:
            token_ids: Sequence of integer token IDs.

        Returns:
            The concatenated token strings.

        Raises:
            ValueError: If any token ID is not in the vocabulary.
        """
        parts: List[str] = []
        for tid in token_ids:
            token = self.vocab.get(tid)
            if token is None:
                raise ValueError(f"Token ID {tid} not found in vocabulary.")
            parts.append(token)
        return "".join(parts)

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def save(self, vocab_path: str, merges_path: str) -> None:
        """
        Save vocabulary and BPE merges to JSON files.

        vocab.json format:  {"<int_id>": "<token_string>", ...}
        merges.json format: [{"pair": [id1, id2], "merged_id": id3}, ...]

        Merges are written in the iteration order of ``self.bpe_merges``.

        Args:
            vocab_path: Destination path for vocabulary JSON.
            merges_path: Destination path for merges JSON.
        """
        with open(vocab_path, "w", encoding="utf-8") as fh:
            json.dump(
                {str(tid): tok for tid, tok in self.vocab.items()},
                fh,
                ensure_ascii=False,
                indent=2,
            )

        merges_list = [
            {"pair": list(pair), "merged_id": mid}
            for pair, mid in self.bpe_merges.items()
        ]
        with open(merges_path, "w", encoding="utf-8") as fh:
            json.dump(merges_list, fh, ensure_ascii=False, indent=2)

    def load(self, vocab_path: str, merges_path: str) -> None:
        """
        Load vocabulary and BPE merges from JSON files produced by save().

        Replaces ``vocab``, ``inverse_vocab`` and ``bpe_merges``.  If two ids
        share the same token string, the later entry in the file wins in
        ``inverse_vocab``.

        Args:
            vocab_path: Path to vocabulary JSON file.
            merges_path: Path to merges JSON file.
        """
        with open(vocab_path, "r", encoding="utf-8") as fh:
            raw = json.load(fh)
        # JSON object keys are always strings; convert them back to int ids.
        self.vocab = {int(k): v for k, v in raw.items()}
        self.inverse_vocab = {v: int(k) for k, v in raw.items()}

        with open(merges_path, "r", encoding="utf-8") as fh:
            merges_list = json.load(fh)
        self.bpe_merges = {
            (entry["pair"][0], entry["pair"][1]): entry["merged_id"]
            for entry in merges_list
        }

    # ------------------------------------------------------------------
    # Vocabulary helpers
    # ------------------------------------------------------------------

    def token_to_id(self, token: str) -> Optional[int]:
        """Return the vocabulary ID for a token string, or None."""
        return self.inverse_vocab.get(token)

    def id_to_token(self, tid: int) -> Optional[str]:
        """Return the token string for a vocabulary ID, or None."""
        return self.vocab.get(tid)

    def special_token_id(self, token: str) -> Optional[int]:
        """
        Return the vocabulary ID of a special token, or None.

        This is a plain vocabulary lookup; it does not check that *token*
        is actually a registered special token.
        """
        return self.inverse_vocab.get(token)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _apply_bpe(self, chunk: str) -> List[int]:
        """
        Apply BPE merges to a pre-tokenised chunk.

        The chunk is first split into character IDs.  Then, repeatedly, the
        earliest-learned merge present anywhere in the sequence is applied
        to all of its non-overlapping occurrences (left to right), until no
        adjacent pair has a known merge.  Replaying merges in learned order
        reproduces the segmentation the trainer produced.

        Merge priority is the merged id: the trainer hands out ids in
        increasing order, so the smallest merged id is the earliest merge.

        Args:
            chunk: Text fragment to encode.

        Returns:
            Token IDs for the chunk (empty list for an empty chunk).

        Raises:
            ValueError: If a character is not in the vocabulary and no
                ``<|unk|>`` token is defined.
        """
        unk_id = self.inverse_vocab.get("<|unk|>")

        ids: List[int] = []
        for char in chunk:
            tid = self.inverse_vocab.get(char)
            if tid is None:
                if unk_id is None:
                    raise ValueError(
                        f"Character {char!r} (U+{ord(char):04X}) not in vocab "
                        "and no <|unk|> fallback is defined."
                    )
                tid = unk_id
            ids.append(tid)

        merges = self.bpe_merges
        while len(ids) > 1:
            # Pick the highest-priority (lowest merged id) pair present.
            best_pair: Optional[Tuple[int, int]] = None
            best_id: Optional[int] = None
            for pair in zip(ids, ids[1:]):
                merged_id = merges.get(pair)
                if merged_id is not None and (best_id is None or merged_id < best_id):
                    best_pair, best_id = pair, merged_id
            if best_pair is None or best_id is None:
                break

            # Replace every non-overlapping occurrence of that pair.
            merged: List[int] = []
            i = 0
            last = len(ids) - 1
            while i <= last:
                if i < last and (ids[i], ids[i + 1]) == best_pair:
                    merged.append(best_id)
                    i += 2
                else:
                    merged.append(ids[i])
                    i += 1
            ids = merged

        return ids

    def _split_on_special_tokens(
        self,
        text: str,
        allowed_special: Set[str],
    ) -> List[Tuple[str, bool]]:
        """
        Split *text* into (segment, is_special) pairs using regex.

        Empty segments are dropped.  With no usable special tokens the whole
        text is returned as a single ordinary segment.
        """
        # An empty alternative would make the pattern match between every
        # character, so ignore empty strings and bail out if nothing is left.
        specials = [tok for tok in allowed_special if tok]
        if not specials:
            return [(text, False)] if text else []

        # Longest-first so a longer token wins over any shorter token that
        # is a prefix of it.
        specials.sort(key=len, reverse=True)
        pattern = "(" + "|".join(re.escape(tok) for tok in specials) + ")"

        # The capturing group makes re.split keep the matched specials.
        return [
            (part, part in allowed_special)
            for part in re.split(pattern, text)
            if part
        ]
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Core BPE training algorithm — CPU-optimised with numpy.
|
|
3
|
+
|
|
4
|
+
BPETrainer is stateless: it receives mutable vocab / inverse_vocab dicts,
|
|
5
|
+
modifies them in-place, and returns only the bpe_merges dictionary.
|
|
6
|
+
|
|
7
|
+
Optimisations over the naive Python implementation
|
|
8
|
+
---------------------------------------------------
|
|
9
|
+
_find_most_frequent_pair
|
|
10
|
+
Uses numpy to encode every adjacent pair as a single int64, then
|
|
11
|
+
np.unique (C-level sort + scan) to count them. For a 10 M-token
|
|
12
|
+
sequence this is ~10× faster than Python's Counter(zip(...)).
|
|
13
|
+
|
|
14
|
+
_replace_pair
|
|
15
|
+
Finds all match positions with a numpy boolean mask in one vectorised
|
|
16
|
+
pass, deduplicates overlapping matches with a tiny Python loop
|
|
17
|
+
(typically k << N matches), then builds the output array in numpy —
|
|
18
|
+
no Python-level element-by-element loop.
|
|
19
|
+
|
|
20
|
+
Fallback
|
|
21
|
+
If numpy is not installed both methods fall back to pure-Python
|
|
22
|
+
implementations so the module is always importable.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from collections import Counter, deque
|
|
26
|
+
from typing import Dict, List, Optional, Tuple
|
|
27
|
+
|
|
28
|
+
# Type aliases exposed for callers
PairType = Tuple[int, int]
VocabType = Dict[int, str]
InverseVocabType = Dict[str, int]
MergesType = Dict[PairType, int]

# numpy is optional: every helper below has a pure-Python fallback.
try:
    import numpy as np
    _NUMPY_AVAILABLE = True
except ImportError:
    _NUMPY_AVAILABLE = False


class BPETrainer:
    """Stateless BPE training engine with optional numpy acceleration."""

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def train(
        self,
        token_ids: List[int],
        vocab: VocabType,
        inverse_vocab: InverseVocabType,
        target_vocab_size: int,
        verbose: bool = True,
    ) -> MergesType:
        """
        Run BPE training until *target_vocab_size* is reached.

        Mutates *vocab* and *inverse_vocab* in-place with merged tokens.
        Training stops early when no adjacent pair is left to merge.

        Args:
            token_ids: Sequence of initial token IDs from the corpus.
            vocab: Mutable id → token mapping (extended in-place).
            inverse_vocab: Mutable token → id mapping (extended in-place).
            target_vocab_size: Desired final vocabulary size.
            verbose: Show a tqdm progress bar (or a step log every 100
                merges if tqdm is missing).  When False, nothing is
                printed and no progress bar is created.

        Returns:
            bpe_merges: dict mapping (id1, id2) → merged_id in merge order.
        """
        bpe_merges: MergesType = {}

        # Progress reporting is opt-in via *verbose*; tqdm stays optional.
        progress = None
        if verbose:
            try:
                from tqdm import tqdm  # type: ignore[import]
            except ImportError:
                pass
            else:
                progress = tqdm(
                    # Clamp: the vocab may already be at or past the target.
                    total=max(0, target_vocab_size - len(vocab)),
                    desc="BPE merges",
                    unit="merge",
                )

        step = 0
        try:
            while len(vocab) < target_vocab_size:
                pair = self._find_most_frequent_pair(token_ids)
                if pair is None:
                    break  # no more pairs to merge

                # New ids continue from the current vocab size.
                # NOTE(review): assumes existing ids are 0..len(vocab)-1 — confirm
                # against the vocab builder.
                new_id = len(vocab)
                token_ids = self._replace_pair(token_ids, pair, new_id)

                merged_token = vocab[pair[0]] + vocab[pair[1]]
                vocab[new_id] = merged_token
                inverse_vocab[merged_token] = new_id
                bpe_merges[pair] = new_id

                step += 1
                if progress is not None:
                    progress.update(1)
                elif verbose and step % 100 == 0:
                    print(
                        f"  step {step:5d} | vocab {len(vocab):6d} "
                        f"| merged: {merged_token!r}"
                    )
        finally:
            # Always release the bar, even if a merge step raised.
            if progress is not None:
                progress.close()

        return bpe_merges

    # ------------------------------------------------------------------
    # Private helpers — numpy paths with pure-Python fallbacks
    # ------------------------------------------------------------------

    @staticmethod
    def _find_most_frequent_pair(
        token_ids: List[int],
    ) -> Optional[PairType]:
        """
        Return the most frequent adjacent pair in *token_ids*.

        Ties are broken towards the smallest (left, right) pair on both
        code paths, so the result does not depend on whether numpy is
        installed.

        numpy path  — counts pairs with np.unique on a single int64 key.
        Python path — counts pairs with collections.Counter.

        Returns:
            The winning pair as a tuple of Python ints, or None when the
            sequence has fewer than two tokens.
        """
        if len(token_ids) < 2:
            return None

        if _NUMPY_AVAILABLE:
            arr = np.asarray(token_ids, dtype=np.int64)
            base = int(arr.max()) + 1
            # Encode pair (a, b) as a * base + b to get a single int64 key;
            # key order then matches tuple order for non-negative ids.
            keys = arr[:-1] * base + arr[1:]
            unique_keys, counts = np.unique(keys, return_counts=True)
            # np.unique sorts the keys and argmax returns the first maximum,
            # i.e. the smallest pair among the most frequent ones.
            best = int(unique_keys[int(np.argmax(counts))])
            return divmod(best, base)

        # Pure-Python fallback, with the same tie-breaking rule.
        pair_counts = Counter(zip(token_ids, token_ids[1:]))
        return min(pair_counts, key=lambda p: (-pair_counts[p], p))

    @staticmethod
    def _replace_pair(
        token_ids: List[int],
        pair: PairType,
        new_id: int,
    ) -> List[int]:
        """
        Replace every non-overlapping occurrence of *pair* with *new_id*.

        Occurrences are consumed greedily from the left, so with
        ``pair == (a, a)`` the run ``a a a`` becomes ``new_id a``.

        Args:
            token_ids: Current token sequence.
            pair: Adjacent (left, right) ids to merge.
            new_id: Id written in place of each matched pair.

        Returns:
            The rewritten sequence.  On the numpy path the input list
            itself is returned when the pair does not occur.
        """
        a, b = pair

        if _NUMPY_AVAILABLE:
            arr = np.asarray(token_ids, dtype=np.int32)
            match_pos = np.where((arr[:-1] == a) & (arr[1:] == b))[0]

            if len(match_pos) == 0:
                return token_ids

            if a == b:
                # Matches can only overlap when both halves are the same id
                # (runs like "a a a"): keep a match only if it does not start
                # on the second element of the previously kept match.
                kept: List[int] = []
                for pos in match_pos.tolist():
                    if not kept or pos > kept[-1] + 1:
                        kept.append(pos)
                matches = np.asarray(kept, dtype=np.int64)
            else:
                # With a != b two matches can never be adjacent, so every
                # position is valid and no Python-level loop is needed.
                matches = match_pos.astype(np.int64)

            # Remove the second element of each matched pair
            keep = np.ones(len(arr), dtype=bool)
            keep[matches + 1] = False
            out = arr[keep].copy()

            # Replace first elements with new_id; each earlier deletion shifts
            # later positions one slot to the left.
            adjusted = matches - np.arange(len(matches), dtype=np.int64)
            out[adjusted] = new_id

            return out.tolist()

        # Pure-Python fallback: single left-to-right scan.
        result: List[int] = []
        i = 0
        n = len(token_ids)
        while i < n:
            if i + 1 < n and token_ids[i] == a and token_ids[i + 1] == b:
                result.append(new_id)
                i += 2
            else:
                result.append(token_ids[i])
                i += 1
        return result
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Constants for the Indic Tokenizer.
|
|
3
|
+
|
|
4
|
+
Covers all major Indian scripts via Unicode block ranges and provides the
|
|
5
|
+
full set of modern LLM special tokens (Claude / ChatML / Llama-3 style).
|
|
6
|
+
|
|
7
|
+
Space / newline handling
|
|
8
|
+
------------------------
|
|
9
|
+
Spaces and newlines are kept as *literal* characters inside tokens (Claude
|
|
10
|
+
Sonnet style). Decoding is therefore a plain string join with no marker
|
|
11
|
+
substitution (no GPT-2 Ġ/Ċ encoding needed).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from typing import Dict, FrozenSet, List, Tuple
|
|
15
|
+
|
|
16
|
+
# ---------------------------------------------------------------------------
|
|
17
|
+
# Indic script Unicode ranges (start, end) — both endpoints inclusive
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
# Script name → (first code point, last code point), both inclusive.
INDIC_UNICODE_RANGES: Dict[str, Tuple[int, int]] = dict(
    devanagari=(0x0900, 0x097F),      # Hindi, Sanskrit, Marathi, Nepali
    bengali=(0x0980, 0x09FF),         # Bengali, Assamese
    gurmukhi=(0x0A00, 0x0A7F),        # Punjabi
    gujarati=(0x0A80, 0x0AFF),        # Gujarati
    oriya=(0x0B00, 0x0B7F),           # Odia
    tamil=(0x0B80, 0x0BFF),           # Tamil
    telugu=(0x0C00, 0x0C7F),          # Telugu
    kannada=(0x0C80, 0x0CFF),         # Kannada
    malayalam=(0x0D00, 0x0D7F),       # Malayalam
    sinhala=(0x0D80, 0x0DFF),         # Sinhala
    tibetan=(0x0F00, 0x0FFF),         # Tibetan
    devanagari_ext=(0xA8E0, 0xA8FF),  # Devanagari Extended
    vedic_ext=(0x1CD0, 0x1CFF),       # Vedic Extensions
)
|
|
34
|
+
|
|
35
|
+
# Regex character-class fragment spanning the main South Asian block
# (U+0900–U+0DFF, i.e. Devanagari through Sinhala). Intended to be embedded
# inside brackets, e.g. f"[{INDIC_REGEX_RANGE}]+", as a compact alternative
# to listing every range. Written with escapes so both endpoints survive any
# editor or renderer that drops the unassigned code point U+0DFF.
INDIC_REGEX_RANGE: str = "\u0900-\u0DFF"
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
# Indic numeral (digit) Unicode ranges — each script has its own digit block
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
# Digit block name → (code point of digit zero, code point of digit nine),
# both inclusive.
INDIC_NUMERAL_RANGES: Dict[str, Tuple[int, int]] = dict(
    devanagari_digits=(0x0966, 0x096F),
    bengali_digits=(0x09E6, 0x09EF),
    gurmukhi_digits=(0x0A66, 0x0A6F),
    gujarati_digits=(0x0AE6, 0x0AEF),
    oriya_digits=(0x0B66, 0x0B6F),
    tamil_digits=(0x0BE6, 0x0BEF),
    telugu_digits=(0x0C66, 0x0C6F),
    kannada_digits=(0x0CE6, 0x0CEF),
    malayalam_digits=(0x0D66, 0x0D6F),
)
|
|
53
|
+
|
|
54
|
+
# ---------------------------------------------------------------------------
|
|
55
|
+
# Special tokens (Claude / ChatML / Llama-3 convention)
|
|
56
|
+
# ---------------------------------------------------------------------------
|
|
57
|
+
# Ordered list of special-token strings, grouped by purpose. The order is
# preserved exactly because callers may rely on list position.
SPECIAL_TOKENS: List[str] = (
    # Text boundary: document / sequence delimiters, padding, unknown
    [
        "<|endoftext|>",
        "<|startoftext|>",
        "<|bos|>",
        "<|eos|>",
        "<|pad|>",
        "<|unk|>",
    ]
    # Chat-template markers
    + [
        "<|im_start|>",
        "<|im_end|>",
        "<|system|>",
        "<|user|>",
        "<|assistant|>",
        "<|human|>",
    ]
    # Header-style markers
    + ["<|eot_id|>", "<|start_header_id|>", "<|end_header_id|>"]
    # Fill-in-the-middle (infilling)
    + ["<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>"]
    # Tool / function calling
    + ["<|tool_use|>", "<|tool_result|>", "<|tool_call|>"]
    # Thinking span delimiters
    + ["<|thinking|>", "<|/thinking|>"]
    # Citation / retrieval
    + ["<|citation|>"]
    # General purpose / classification
    + ["<|cls|>", "<|sep|>", "<|mask|>"]
)

# Immutable set view of the same tokens for fast membership tests.
DEFAULT_ALLOWED_SPECIAL: FrozenSet[str] = frozenset(SPECIAL_TOKENS)
|