indic-tokenizer 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ from .bpe_tokenizer import IndicBPETokenizer
2
+ from .preprocessor import IndicTextPreprocessor
3
+ from .constants import SPECIAL_TOKENS, INDIC_UNICODE_RANGES
4
+
5
# Names re-exported as the package's public API.
__all__ = [
    "IndicBPETokenizer",
    "IndicTextPreprocessor",
    "SPECIAL_TOKENS",
    "INDIC_UNICODE_RANGES",
]
@@ -0,0 +1,8 @@
1
"""
Entry point for ``python -m indic_tokenizer``.

Prints a short banner and a usage hint, then exits.
"""

# Imported so that running the module also exercises the package import path.
from .bpe_tokenizer import IndicBPETokenizer

_BANNER_LINES = (
    "indic_tokenizer — BPE tokenizer for Indic scripts",
    "Usage: from indic_tokenizer import IndicBPETokenizer",
)

print(*_BANNER_LINES, sep="\n")
@@ -0,0 +1,375 @@
1
+ """
2
+ IndicBPETokenizer — main public class for BPE-based Indic tokenization.
3
+
4
+ Space / newline convention (Claude Sonnet style)
5
+ --------------------------------------------------
6
+ Spaces and newlines are kept as *literal* characters inside tokens.
7
+ A token like " नमस्ते" contains a real leading space — no Ġ/Ċ substitution.
8
+ Decoding is therefore a trivial string join with no post-processing.
9
+
10
+ Responsibilities
11
+ ----------------
12
+ train() — learn BPE merges from a corpus (small / medium datasets)
13
+ train_from_file() — convenience wrapper that auto-loads any supported format
14
+ encode() — text → List[int] (with optional special-token pass-through)
15
+ decode() — List[int] → text (plain string join)
16
+ save() — persist vocab + merges to two JSON files
17
+ load() — restore from those JSON files
18
+
19
+ For corpora > ~1 GB use ChunkedBPETrainer (chunked_trainer.py) instead of
20
+ train(), which stores the full token ID sequence in RAM.
21
+ """
22
+
23
+ import json
24
+ import re
25
+ from pathlib import Path
26
+ from typing import Dict, List, Optional, Set, Tuple, Union
27
+
28
+ from .bpe_trainer import BPETrainer, MergesType, PairType
29
+ from .constants import SPECIAL_TOKENS
30
+ from .data_loader import IndicDataLoader
31
+ from .preprocessor import pretokenize
32
+ from .vocab_builder import VocabBuilder, InverseVocabType, VocabType
33
+
34
+
35
class IndicBPETokenizer:
    """
    Byte-Pair Encoding tokenizer for Indian language text.

    Supports all major Indic scripts: Devanagari, Bengali, Gurmukhi,
    Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam, Sinhala, Tibetan,
    as well as ASCII and mixed-script text.

    Attributes:
        vocab:         id → token string.
        inverse_vocab: token string → id.
        bpe_merges:    (id1, id2) → merged id, in the order merges were learned.

    Usage
    -----
    >>> tok = IndicBPETokenizer()
    >>> tok.train(text, vocab_size=16_000)
    >>> ids = tok.encode("नमस्ते दुनिया")
    >>> tok.decode(ids)
    'नमस्ते दुनिया'
    >>> tok.save("vocab.json", "merges.json")

    >>> tok2 = IndicBPETokenizer()
    >>> tok2.load("vocab.json", "merges.json")
    """

    def __init__(self) -> None:
        self.vocab: VocabType = {}
        self.inverse_vocab: InverseVocabType = {}
        self.bpe_merges: MergesType = {}

        self._vocab_builder = VocabBuilder()
        self._trainer = BPETrainer()

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------

    @property
    def vocab_size(self) -> int:
        """Current number of tokens in the vocabulary."""
        return len(self.vocab)

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------

    def train(
        self,
        text: str,
        vocab_size: int,
        special_tokens: Optional[List[str]] = None,
        verbose: bool = True,
    ) -> None:
        """
        Train BPE from *text* using sequence-based BPE.

        Replaces ``vocab``, ``inverse_vocab`` and ``bpe_merges`` with the
        newly trained state.

        Args:
            text: Raw UTF-8 training corpus (any Indic language or mixed).
            vocab_size: Target vocabulary size.
            special_tokens: Override the default special tokens
                (defaults to a copy of SPECIAL_TOKENS).
            verbose: Print status lines and show progress bars. When False,
                nothing is printed and no tqdm bar is created.

        Note:
            For corpora larger than ~1 GB, use ChunkedBPETrainer instead —
            this method keeps the full token ID sequence in RAM.
        """
        if special_tokens is None:
            special_tokens = list(SPECIAL_TOKENS)

        # Seed vocab: ASCII + all Indic script characters + corpus chars + specials
        self.vocab, self.inverse_vocab = self._vocab_builder.build_base_vocab()
        self._vocab_builder.extend_from_text(text, self.vocab, self.inverse_vocab)
        self._vocab_builder.add_special_tokens(
            self.vocab, self.inverse_vocab, special_tokens
        )

        if verbose:
            print(f"Base vocab: {self.vocab_size:,} tokens | Target: {vocab_size:,}")
            if len(text) > 50_000_000:
                print(
                    f"WARNING: corpus is {len(text):,} chars. Sequence-based BPE "
                    "stores the full token ID list in RAM. Consider ChunkedBPETrainer "
                    "for large corpora (chunked_trainer.py)."
                )

        # Encode every character → its token ID. The progress bar is optional:
        # it is only shown when requested *and* tqdm is installed.
        char_iter = text
        if verbose:
            try:
                from tqdm import tqdm  # type: ignore[import]
            except ImportError:
                pass
            else:
                char_iter = tqdm(  # type: ignore[assignment]
                    text,
                    desc="Encoding corpus",
                    unit=" chars",
                    unit_scale=True,
                    total=len(text),
                )

        inverse_vocab = self.inverse_vocab
        token_ids: List[int] = [inverse_vocab[c] for c in char_iter]

        self.bpe_merges = self._trainer.train(
            token_ids,
            self.vocab,
            self.inverse_vocab,
            target_vocab_size=vocab_size,
            verbose=verbose,
        )

        if verbose:
            print(f"Training complete. Final vocab size: {self.vocab_size:,}")

    def train_from_file(
        self,
        path: Union[str, Path],
        vocab_size: int,
        text_column: str = "text",
        special_tokens: Optional[List[str]] = None,
        max_samples: Optional[int] = None,
        verbose: bool = True,
    ) -> None:
        """
        Load a corpus from *path* then call train().

        Supports .parquet, .csv, .txt, .json, .jsonl.

        Args:
            path: Path to the corpus file.
            vocab_size: Target vocabulary size.
            text_column: Column / key for text in tabular / JSON files.
            special_tokens: Override the default special tokens.
            max_samples: Cap on rows / documents loaded (None → all).
            verbose: Show training progress.
        """
        loader = IndicDataLoader(text_column=text_column, max_samples=max_samples)
        text = loader.load(path)
        self.train(
            text,
            vocab_size=vocab_size,
            special_tokens=special_tokens,
            verbose=verbose,
        )

    # ------------------------------------------------------------------
    # Encoding
    # ------------------------------------------------------------------

    def encode(
        self,
        text: str,
        allowed_special: Optional[Set[str]] = None,
    ) -> List[int]:
        """
        Encode *text* into a list of token IDs.

        Args:
            text: Input string (any Indic script, ASCII, special tokens, mixed).
            allowed_special: Set of special tokens to recognise and pass through
                as single IDs. Pass set() to disable.
                Defaults to all tokens in SPECIAL_TOKENS.

        Returns:
            List of integer token IDs.

        Raises:
            ValueError: If an allowed special token occurs in *text* but is
                missing from the vocabulary.
        """
        if allowed_special is None:
            allowed_special = set(SPECIAL_TOKENS)

        # No special tokens to look for: the whole input is ordinary text.
        if not allowed_special:
            return self._encode_ordinary(text)

        token_ids: List[int] = []
        for segment, is_special in self._split_on_special_tokens(
            text, allowed_special
        ):
            if not is_special:
                token_ids.extend(self._encode_ordinary(segment))
                continue
            tid = self.inverse_vocab.get(segment)
            if tid is None:
                raise ValueError(
                    f"Special token {segment!r} not found in vocabulary."
                )
            token_ids.append(tid)
        return token_ids

    def _encode_ordinary(self, text: str) -> List[int]:
        """Encode plain text (no special-token detection)."""
        token_ids: List[int] = []
        for chunk in pretokenize(text):
            # Fast path: the whole chunk is already a vocabulary entry.
            tid = self.inverse_vocab.get(chunk)
            if tid is not None:
                token_ids.append(tid)
            else:
                token_ids.extend(self._apply_bpe(chunk))
        return token_ids

    # ------------------------------------------------------------------
    # Decoding
    # ------------------------------------------------------------------

    def decode(self, token_ids: List[int]) -> str:
        """
        Decode a list of token IDs back into a string.

        Because tokens contain literal spaces and newlines, decoding is a
        plain concatenation — no marker substitution required.

        Args:
            token_ids: Sequence of integer token IDs.

        Returns:
            Decoded string.

        Raises:
            ValueError: If any token ID is not in the vocabulary.
        """
        parts: List[str] = []
        for tid in token_ids:
            token = self.vocab.get(tid)
            if token is None:
                raise ValueError(f"Token ID {tid} not found in vocabulary.")
            parts.append(token)
        return "".join(parts)

    # ------------------------------------------------------------------
    # Persistence
    # ------------------------------------------------------------------

    def save(
        self,
        vocab_path: Union[str, Path],
        merges_path: Union[str, Path],
    ) -> None:
        """
        Save vocabulary and BPE merges to JSON files.

        vocab.json format:  {"<int_id>": "<token_string>", ...}
        merges.json format: [{"pair": [id1, id2], "merged_id": id3}, ...]

        Merges are written in the dictionary's iteration order.

        Args:
            vocab_path:  Destination path for vocabulary JSON.
            merges_path: Destination path for merges JSON.
        """
        with open(vocab_path, "w", encoding="utf-8") as fh:
            json.dump(
                {str(tid): tok for tid, tok in self.vocab.items()},
                fh,
                ensure_ascii=False,
                indent=2,
            )

        merges_list = [
            {"pair": list(pair), "merged_id": mid}
            for pair, mid in self.bpe_merges.items()
        ]
        with open(merges_path, "w", encoding="utf-8") as fh:
            json.dump(merges_list, fh, ensure_ascii=False, indent=2)

    def load(
        self,
        vocab_path: Union[str, Path],
        merges_path: Union[str, Path],
    ) -> None:
        """
        Load vocabulary and BPE merges from JSON files produced by save().

        Replaces ``vocab``, ``inverse_vocab`` and ``bpe_merges``.

        Args:
            vocab_path:  Path to vocabulary JSON file.
            merges_path: Path to merges JSON file.
        """
        with open(vocab_path, "r", encoding="utf-8") as fh:
            raw = json.load(fh)
        # JSON object keys are always strings; convert them back to int IDs.
        self.vocab = {int(k): v for k, v in raw.items()}
        self.inverse_vocab = {v: int(k) for k, v in raw.items()}

        with open(merges_path, "r", encoding="utf-8") as fh:
            merges_list = json.load(fh)
        self.bpe_merges = {
            (entry["pair"][0], entry["pair"][1]): entry["merged_id"]
            for entry in merges_list
        }

    # ------------------------------------------------------------------
    # Vocabulary helpers
    # ------------------------------------------------------------------

    def token_to_id(self, token: str) -> Optional[int]:
        """Return the vocabulary ID for a token string, or None."""
        return self.inverse_vocab.get(token)

    def id_to_token(self, tid: int) -> Optional[str]:
        """Return the token string for a vocabulary ID, or None."""
        return self.vocab.get(tid)

    def special_token_id(self, token: str) -> Optional[int]:
        """Return the vocabulary ID of a special token, or None."""
        return self.token_to_id(token)

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _apply_bpe(self, chunk: str) -> List[int]:
        """
        Apply BPE merges to a pre-tokenised chunk.

        The chunk is first split into character IDs (unknown characters map
        to ``<|unk|>``). Then, repeatedly, the earliest-learned merge that
        applies anywhere in the sequence is applied to all of its
        non-overlapping occurrences (left to right), until no merge applies.

        Raises:
            ValueError: If a character is not in the vocabulary and no
                ``<|unk|>`` token is defined.
        """
        unk_id = self.inverse_vocab.get("<|unk|>")

        ids: List[int] = []
        for char in chunk:
            tid = self.inverse_vocab.get(char)
            if tid is None:
                if unk_id is None:
                    raise ValueError(
                        f"Character {char!r} (U+{ord(char):04X}) not in vocab "
                        "and no <|unk|> fallback is defined."
                    )
                tid = unk_id
            ids.append(tid)

        merges = self.bpe_merges
        while len(ids) > 1:
            # Merge priority = merged ID. BPETrainer.train assigns merged IDs
            # as len(vocab) at merge time, so a smaller merged ID means the
            # merge was learned earlier.
            # NOTE(review): assumes other producers of bpe_merges also assign
            # increasing IDs — confirm for ChunkedBPETrainer.
            best_pair: Optional[PairType] = None
            best_id: Optional[int] = None
            for pair in zip(ids, ids[1:]):
                merged_id = merges.get(pair)
                if merged_id is not None and (best_id is None or merged_id < best_id):
                    best_pair, best_id = pair, merged_id

            if best_pair is None or best_id is None:
                break  # no learned merge applies any more

            merged: List[int] = []
            i = 0
            last = len(ids) - 1
            while i <= last:
                if i < last and (ids[i], ids[i + 1]) == best_pair:
                    merged.append(best_id)
                    i += 2
                else:
                    merged.append(ids[i])
                    i += 1
            ids = merged

        return ids

    def _split_on_special_tokens(
        self,
        text: str,
        allowed_special: Set[str],
    ) -> List[Tuple[str, bool]]:
        """
        Split *text* into (segment, is_special) pairs using regex.

        Empty segments are dropped. With an empty *allowed_special* the whole
        text is returned as one ordinary segment.
        """
        if not allowed_special:
            # An empty alternation would match at every position and split
            # the text into single characters.
            return [(text, False)] if text else []

        # Sort longest-first so a longer token wins over any shorter token
        # that is a prefix of it.
        alternatives = sorted(allowed_special, key=len, reverse=True)
        pattern = "(" + "|".join(re.escape(tok) for tok in alternatives) + ")"
        return [
            (part, part in allowed_special)
            for part in re.split(pattern, text)
            if part
        ]
@@ -0,0 +1,196 @@
1
+ """
2
+ Core BPE training algorithm — CPU-optimised with numpy.
3
+
4
+ BPETrainer is stateless: it receives mutable vocab / inverse_vocab dicts,
5
+ modifies them in-place, and returns only the bpe_merges dictionary.
6
+
7
+ Optimisations over the naive Python implementation
8
+ ---------------------------------------------------
9
+ _find_most_frequent_pair
10
+ Uses numpy to encode every adjacent pair as a single int64, then
11
+ np.unique (C-level sort + scan) to count them. For a 10 M-token
12
+ sequence this is ~10× faster than Python's Counter(zip(...)).
13
+
14
+ _replace_pair
15
+ Finds all match positions with a numpy boolean mask in one vectorised
16
+ pass, deduplicates overlapping matches with a tiny Python loop
17
+ (typically k << N matches), then builds the output array in numpy —
18
+ no Python-level element-by-element loop.
19
+
20
+ Fallback
21
+ If numpy is not installed both methods fall back to pure-Python
22
+ implementations so the module is always importable.
23
+ """
24
+
25
+ from collections import Counter, deque
26
+ from typing import Dict, List, Optional, Tuple
27
+
28
# Type aliases exposed for callers
PairType = Tuple[int, int]
VocabType = Dict[int, str]
InverseVocabType = Dict[str, int]
MergesType = Dict[PairType, int]

# numpy is optional: every method below has a pure-Python fallback.
try:
    import numpy as np
    _NUMPY_AVAILABLE = True
except ImportError:
    _NUMPY_AVAILABLE = False


class BPETrainer:
    """Stateless BPE training engine (numpy-accelerated when numpy is installed)."""

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def train(
        self,
        token_ids: List[int],
        vocab: VocabType,
        inverse_vocab: InverseVocabType,
        target_vocab_size: int,
        verbose: bool = True,
    ) -> MergesType:
        """
        Run BPE training until *target_vocab_size* is reached.

        Mutates *vocab* and *inverse_vocab* in-place with merged tokens.
        Stops early if the sequence has no adjacent pair left to merge.

        Args:
            token_ids:         Sequence of initial token IDs from the corpus.
            vocab:             Mutable id → token mapping (extended in-place).
            inverse_vocab:     Mutable token → id mapping (extended in-place).
            target_vocab_size: Desired final vocabulary size.
            verbose:           Show tqdm progress bar (or a step log every
                               100 merges if tqdm is missing). When False,
                               no progress output is produced.

        Returns:
            bpe_merges: dict mapping (id1, id2) → merged_id in merge order.
        """
        bpe_merges: MergesType = {}

        # Only build a progress bar when the caller asked for output.
        progress = None
        if verbose:
            try:
                from tqdm import tqdm  # type: ignore[import]
            except ImportError:
                pass
            else:
                progress = tqdm(
                    total=max(target_vocab_size - len(vocab), 0),
                    desc="BPE merges",
                    unit="merge",
                )

        step = 0
        try:
            while len(vocab) < target_vocab_size:
                pair = self._find_most_frequent_pair(token_ids)
                if pair is None:
                    break  # no more pairs to merge

                # New IDs are allocated densely, so merged IDs grow with
                # merge order.
                new_id = len(vocab)
                token_ids = self._replace_pair(token_ids, pair, new_id)

                merged_token = vocab[pair[0]] + vocab[pair[1]]
                vocab[new_id] = merged_token
                inverse_vocab[merged_token] = new_id
                bpe_merges[pair] = new_id

                step += 1
                if progress is not None:
                    progress.update(1)
                elif verbose and step % 100 == 0:
                    print(
                        f" step {step:5d} | vocab {len(vocab):6d} "
                        f"| merged: {merged_token!r}"
                    )
        finally:
            # Close the bar even if a step raises, so the terminal is left clean.
            if progress is not None:
                progress.close()

        return bpe_merges

    # ------------------------------------------------------------------
    # Private helpers — numpy paths with pure-Python fallbacks
    # ------------------------------------------------------------------

    @staticmethod
    def _find_most_frequent_pair(
        token_ids: List[int],
    ) -> Optional[PairType]:
        """
        Return the most frequent adjacent pair in *token_ids*.

        Returns None when the sequence has fewer than two tokens. Ties are
        broken the same way on both paths: the smallest (a, b) pair wins,
        so training results do not depend on whether numpy is installed.

        numpy path  — O(N log N) via C-level sort in np.unique.
        Python path — O(N) via Counter (larger constant due to dict hashing).
        """
        if len(token_ids) < 2:
            return None

        if _NUMPY_AVAILABLE:
            arr = np.asarray(token_ids, dtype=np.int64)
            max_id = int(arr.max()) + 1
            # Encode pair (a, b) as a * max_id + b to get a single int64 key.
            # Because b < max_id, key order equals lexicographic (a, b) order.
            flat = arr[:-1] * max_id + arr[1:]
            unique_pairs, counts = np.unique(flat, return_counts=True)
            # np.unique returns sorted keys and argmax returns the first
            # maximum, so ties resolve to the smallest pair.
            best_idx = int(np.argmax(counts))
            best = int(unique_pairs[best_idx])
            return (best // max_id, best % max_id)

        # Pure-Python fallback
        pairs = Counter(zip(token_ids, token_ids[1:]))
        if not pairs:
            return None
        # Highest count first; among equal counts the smallest pair, matching
        # the numpy path above.
        return min(pairs, key=lambda p: (-pairs[p], p))

    @staticmethod
    def _replace_pair(
        token_ids: List[int],
        pair: PairType,
        new_id: int,
    ) -> List[int]:
        """
        Replace every non-overlapping occurrence of *pair* with *new_id*.

        Occurrences are matched left to right. When nothing matches, the
        numpy path returns the input list object itself.

        numpy path  — vectorised boolean mask + one array copy, O(N).
        Python path — deque-based left-to-right scan, O(N).
        """
        a, b = pair

        if _NUMPY_AVAILABLE:
            arr = np.asarray(token_ids, dtype=np.int32)
            match_pos = np.where((arr[:-1] == a) & (arr[1:] == b))[0]

            if len(match_pos) == 0:
                return token_ids

            # Greedy dedup: skip positions adjacent to a previous match.
            # Matches can only overlap when a == b (e.g. "aaa").
            valid: List[int] = [int(match_pos[0])]
            for pos in match_pos[1:]:
                if int(pos) > valid[-1] + 1:
                    valid.append(int(pos))
            matches = np.asarray(valid, dtype=np.int64)

            # Remove the second element of each matched pair
            keep = np.ones(len(arr), dtype=bool)
            keep[matches + 1] = False
            out = arr[keep].copy()

            # Replace first elements with new_id; the k-th match has shifted
            # left by k because of the k deletions before it.
            adjusted = matches - np.arange(len(matches), dtype=np.int64)
            out[adjusted] = new_id

            return out.tolist()

        # Pure-Python fallback (deque, O(N) Python ops)
        dq = deque(token_ids)
        result: List[int] = []
        while dq:
            current = dq.popleft()
            if dq and (current, dq[0]) == pair:
                result.append(new_id)
                dq.popleft()
            else:
                result.append(current)
        return result
@@ -0,0 +1,103 @@
1
+ """
2
+ Constants for the Indic Tokenizer.
3
+
4
+ Covers all major Indian scripts via Unicode block ranges and provides the
5
+ full set of modern LLM special tokens (Claude / ChatML / Llama-3 style).
6
+
7
+ Space / newline handling
8
+ ------------------------
9
+ Spaces and newlines are kept as *literal* characters inside tokens (Claude
10
+ Sonnet style). Decoding is therefore a plain string join with no marker
11
+ substitution (no GPT-2 Ġ/Ċ encoding needed).
12
+ """
13
+
14
+ from typing import Dict, FrozenSet, List, Tuple
15
+
16
# ---------------------------------------------------------------------------
# Indic script Unicode ranges: script name → (first, last) code point,
# both endpoints inclusive
# ---------------------------------------------------------------------------
INDIC_UNICODE_RANGES: Dict[str, Tuple[int, int]] = dict(
    devanagari=(0x0900, 0x097F),      # Hindi, Sanskrit, Marathi, Nepali
    bengali=(0x0980, 0x09FF),         # Bengali, Assamese
    gurmukhi=(0x0A00, 0x0A7F),        # Punjabi
    gujarati=(0x0A80, 0x0AFF),        # Gujarati
    oriya=(0x0B00, 0x0B7F),           # Odia
    tamil=(0x0B80, 0x0BFF),           # Tamil
    telugu=(0x0C00, 0x0C7F),          # Telugu
    kannada=(0x0C80, 0x0CFF),         # Kannada
    malayalam=(0x0D00, 0x0D7F),       # Malayalam
    sinhala=(0x0D80, 0x0DFF),         # Sinhala
    tibetan=(0x0F00, 0x0FFF),         # Tibetan
    devanagari_ext=(0xA8E0, 0xA8FF),  # Devanagari Extended
    vedic_ext=(0x1CD0, 0x1CFF),       # Vedic Extensions
)
34
+
35
# Regex character class spanning the main South Asian block (U+0900–U+0DFF).
# Used in preprocessor patterns as a compact alternative to listing every range.
# This is the *inside* of a character class (no surrounding brackets).
# NOTE: the Tibetan, Devanagari Extended and Vedic Extensions entries of
# INDIC_UNICODE_RANGES start above U+0DFF and are therefore not covered here.
INDIC_REGEX_RANGE: str = r"ऀ-෿"
38
+
39
# ---------------------------------------------------------------------------
# Indic numeral (digit) Unicode ranges — each script has its own digit block.
# Every block holds ten consecutive code points (digit zero through nine),
# so each range is derived from the code point of its digit zero.
# ---------------------------------------------------------------------------
INDIC_NUMERAL_RANGES: Dict[str, Tuple[int, int]] = {
    f"{script}_digits": (zero, zero + 9)
    for script, zero in (
        ("devanagari", 0x0966),
        ("bengali", 0x09E6),
        ("gurmukhi", 0x0A66),
        ("gujarati", 0x0AE6),
        ("oriya", 0x0B66),
        ("tamil", 0x0BE6),
        ("telugu", 0x0C66),
        ("kannada", 0x0CE6),
        ("malayalam", 0x0D66),
    )
}
53
+
54
# ---------------------------------------------------------------------------
# Special tokens (Claude / ChatML / Llama-3 convention)
#
# Tokens are declared per category and concatenated in a fixed order; the
# position of a token in SPECIAL_TOKENS is part of the public value.
# ---------------------------------------------------------------------------

# Text boundary (bos/eos = beginning/end of sequence, pad = padding,
# unk = unknown token)
_TEXT_BOUNDARY_TOKENS = (
    "<|endoftext|>",
    "<|startoftext|>",
    "<|bos|>",
    "<|eos|>",
    "<|pad|>",
    "<|unk|>",
)

# Chat-template (ChatML / Claude style)
_CHAT_TEMPLATE_TOKENS = (
    "<|im_start|>",
    "<|im_end|>",
    "<|system|>",
    "<|user|>",
    "<|assistant|>",
    "<|human|>",
)

# Llama-3 / header tokens
_HEADER_TOKENS = (
    "<|eot_id|>",
    "<|start_header_id|>",
    "<|end_header_id|>",
)

# Fill-in-the-middle (FIM / infilling)
_FIM_TOKENS = (
    "<|fim_prefix|>",
    "<|fim_middle|>",
    "<|fim_suffix|>",
)

# Tool / function-calling (Claude API style)
_TOOL_TOKENS = (
    "<|tool_use|>",
    "<|tool_result|>",
    "<|tool_call|>",
)

# Extended thinking (Claude)
_THINKING_TOKENS = (
    "<|thinking|>",
    "<|/thinking|>",
)

# Citation / retrieval
_CITATION_TOKENS = ("<|citation|>",)

# General purpose / classification
_CLASSIFICATION_TOKENS = (
    "<|cls|>",
    "<|sep|>",
    "<|mask|>",
)

SPECIAL_TOKENS: List[str] = [
    *_TEXT_BOUNDARY_TOKENS,
    *_CHAT_TEMPLATE_TOKENS,
    *_HEADER_TOKENS,
    *_FIM_TOKENS,
    *_TOOL_TOKENS,
    *_THINKING_TOKENS,
    *_CITATION_TOKENS,
    *_CLASSIFICATION_TOKENS,
]

# Frozen set for fast membership tests during encoding
DEFAULT_ALLOWED_SPECIAL: FrozenSet[str] = frozenset(SPECIAL_TOKENS)