cute-tokenizer 0.1.0__py3-none-any.whl

cute_tokenizer/__init__.py ADDED
@@ -0,0 +1,35 @@
+ """CUTE — Compact Unicode Token Encoding.
+
+ Public API:
+     build_cute — train a CUTE tokenizer from a corpus directory.
+     CUTEConfig — all knobs for the build pipeline.
+     CUTETokenizerFast — HuggingFace-compatible inference wrapper.
+     PUAMapping — word ↔ PUA character mapping.
+ """
+
+ from __future__ import annotations
+
+ import os as _os
+
+ # Silence the "None of PyTorch, TensorFlow, or Flax have been found" warning
+ # that `transformers` emits at import. CUTE only needs the tokenizer layer, not
+ # the model layer, so this warning is irrelevant. Must be set BEFORE the first
+ # `transformers` import — putting it here covers both library and CLI paths.
+ _os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
+ _os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")
+
+ from ._version import __version__
+ from .config import CUTEConfig
+ from .pua import PUAMapping
+ from .tokenizer import CUTETokenizerFast
+ from .trainer import build_cute, load_mapping, save_mapping
+
+ __all__ = [
+     "CUTEConfig",
+     "CUTETokenizerFast",
+     "PUAMapping",
+     "__version__",
+     "build_cute",
+     "load_mapping",
+     "save_mapping",
+ ]
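
A minimal usage sketch of this public surface, mirroring how `cli.py` below drives it (paths are placeholders; the output file names are taken from the CLI's `roundtrip-check` loader):

    from pathlib import Path
    from cute_tokenizer import CUTEConfig, CUTETokenizerFast, build_cute

    manifest_path = build_cute(
        corpus_dir=Path("./corpus"),
        output_dir=Path("./out"),
        config=CUTEConfig(),
    )

    tok = CUTETokenizerFast(
        tokenizer_file=Path("./out/tokenizer.json"),
        cute_mapping_file=Path("./out/cute_mapping.json"),
    )
    ids = tok("def add(a, b):\n    return a + b\n", add_special_tokens=False).input_ids
    text = tok.decode(ids, skip_special_tokens=True)  # byte-equal round trip is the design goal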
cute_tokenizer/_version.py ADDED
@@ -0,0 +1 @@
+ __version__ = "0.1.0"
cute_tokenizer/cli.py ADDED
@@ -0,0 +1,130 @@
+ """Command-line interface for CUTE.
+
+     cute build --corpus ./corpus --output ./output [--config configs/default.toml]
+     cute roundtrip-check --tokenizer ./output --corpus ./holdout
+     cute info --tokenizer ./output
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import os
+ import sys
+ from pathlib import Path
+
+ # Silence the noisy "None of PyTorch, TensorFlow >= 2.0, or Flax have been found"
+ # warning from `transformers`. We only use the tokenizer, not the model layer,
+ # so this warning is irrelevant. Set BEFORE any transformers import below.
+ os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
+ os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")
+
+ from ._version import __version__
+ from .config import CUTEConfig
+ from .corpus import iter_corpus_files
+ from .manifest import BuildManifest
+ from .tokenizer import CUTETokenizerFast
+ from .trainer import build_cute
+
+
+ def _cmd_build(args: argparse.Namespace) -> int:
+     config = _load_config(Path(args.config)) if args.config else CUTEConfig()
+     manifest_path = build_cute(
+         corpus_dir=Path(args.corpus),
+         output_dir=Path(args.output),
+         config=config,
+     )
+     print(f"Build complete. Manifest: {manifest_path}")
+     return 0
+
+
+ def _cmd_roundtrip_check(args: argparse.Namespace) -> int:
+     tok_dir = Path(args.tokenizer)
+     tok = CUTETokenizerFast(
+         tokenizer_file=tok_dir / "tokenizer.json",
+         cute_mapping_file=tok_dir / "cute_mapping.json",
+     )
+
+     corpus_dir = Path(args.corpus)
+     files_checked = files_failed = 0
+     for path in iter_corpus_files(
+         corpus_dir,
+         extensions=(".py", ".js", ".ts", ".java", ".c", ".cpp", ".rs", ".go", ".rb", ".php"),
+     ):
+         try:
+             text = path.read_text(encoding="utf-8")
+         except (UnicodeDecodeError, OSError):
+             continue
+         files_checked += 1
+         ids = tok(text, add_special_tokens=False).input_ids
+         decoded = tok.decode(ids, skip_special_tokens=True)
+         if decoded != text:
+             files_failed += 1
+             print(f"FAIL {path.relative_to(corpus_dir)}")
+             if args.verbose:
+                 _show_diff(text, decoded)
+         if files_checked >= args.max_files:
+             break
+
+     print(f"Round-trip check: {files_checked - files_failed}/{files_checked} OK")
+     return 0 if files_failed == 0 else 1
+
+
+ def _cmd_info(args: argparse.Namespace) -> int:
+     tok_dir = Path(args.tokenizer)
+     manifest = BuildManifest.read(tok_dir / "build_manifest.json")
+     print(json.dumps(manifest.to_dict(), indent=2))
+     return 0
+
+
+ def _show_diff(expected: str, got: str) -> None:
+     """Print a brief diff for round-trip failures."""
+     for i, (e, g) in enumerate(zip(expected, got, strict=False)):
+         if e != g:
+             ctx = max(0, i - 20)
+             print(f"  first diff at offset {i}:")
+             print(f"  expected: ...{expected[ctx : i + 20]!r}")
+             print(f"  got     : ...{got[ctx : i + 20]!r}")
+             return
+     if len(expected) != len(got):
+         print(f"  length differs: expected={len(expected)}, got={len(got)}")
+
+
+ def _load_config(path: Path) -> CUTEConfig:
+     """Load a config from TOML. Lazy import — `tomllib` is stdlib in 3.11+."""
+     if sys.version_info >= (3, 11):
+         import tomllib
+     else:
+         import tomli as tomllib  # type: ignore[no-redef]
+     data = tomllib.loads(path.read_text(encoding="utf-8"))
+     return CUTEConfig(**data)
+
+
+ def main(argv: list[str] | None = None) -> int:
+     parser = argparse.ArgumentParser(prog="cute", description="CUTE tokenizer builder")
+     parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
+     sub = parser.add_subparsers(dest="cmd", required=True)
+
+     p_build = sub.add_parser("build", help="Train a CUTE tokenizer from a corpus")
+     p_build.add_argument("--corpus", required=True, help="Corpus directory")
+     p_build.add_argument("--output", required=True, help="Output directory")
+     p_build.add_argument("--config", help="Optional TOML config path")
+     p_build.set_defaults(func=_cmd_build)
+
+     p_rt = sub.add_parser("roundtrip-check", help="Verify byte-equal round-trip")
+     p_rt.add_argument("--tokenizer", required=True, help="Trained tokenizer dir")
+     p_rt.add_argument("--corpus", required=True, help="Held-out corpus to check")
+     p_rt.add_argument("--max-files", type=int, default=10_000)
+     p_rt.add_argument("--verbose", action="store_true")
+     p_rt.set_defaults(func=_cmd_roundtrip_check)
+
+     p_info = sub.add_parser("info", help="Print build manifest")
+     p_info.add_argument("--tokenizer", required=True)
+     p_info.set_defaults(func=_cmd_info)
+
+     args = parser.parse_args(argv)
+     return int(args.func(args))
+
+
+ if __name__ == "__main__":
+     raise SystemExit(main())
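
Since `_load_config` hands the parsed table straight to `CUTEConfig(**data)`, a config file is just top-level keys mirroring the `CUTEConfig` fields defined in `config.py` below. A sketch of the equivalent parse (keys and values are illustrative; note that TOML arrays arrive as Python lists, not the tuples the dataclass fields declare):

    import tomllib  # stdlib on 3.11+; the CLI falls back to `tomli` on older interpreters

    from cute_tokenizer import CUTEConfig

    data = tomllib.loads(
        'vocab_size = 80000\n'
        'coverage_target = 0.9\n'
        'enable_license_filter = true\n'
        'license_allowlist = ["MIT", "Apache-2.0"]\n'
    )
    config = CUTEConfig(**data)  # the same call _load_config makes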
cute_tokenizer/config.py ADDED
@@ -0,0 +1,96 @@
+ """Configuration for the CUTE tokenizer build pipeline."""
+
+ from __future__ import annotations
+
+ from dataclasses import asdict, dataclass
+ from typing import Any
+
+ DEFAULT_SPECIAL_TOKENS: tuple[str, ...] = (
+     "<pad>",
+     "<s>",
+     "</s>",
+     "<unk>",
+     "<|endoftext|>",
+     "<|fim_prefix|>",
+     "<|fim_middle|>",
+     "<|fim_suffix|>",
+     "<|file_sep|>",
+     "<|repo_name|>",
+ )
+
+ DEFAULT_CODE_EXTENSIONS: tuple[str, ...] = (
+     ".py",
+     ".js",
+     ".ts",
+     ".tsx",
+     ".jsx",
+     ".java",
+     ".c",
+     ".cpp",
+     ".h",
+     ".hpp",
+     ".cs",
+     ".rs",
+     ".go",
+     ".rb",
+     ".php",
+     ".swift",
+     ".kt",
+     ".scala",
+     ".sh",
+     ".sql",
+     ".html",
+     ".css",
+     ".scss",
+     ".json",
+     ".yaml",
+     ".yml",
+     ".toml",
+     ".md",
+ )
+
+
+ @dataclass(frozen=True)
+ class CUTEConfig:
+     """All knobs for a CUTE build, in one place.
+
+     Frozen so hashing/comparison is well-defined and the manifest serializer
+     can dump a stable representation.
+     """
+
+     vocab_size: int = 80_000
+     coverage_target: float = 0.90
+     max_token_len: int = 50
+     boost_weight: float = 0.3
+     min_bpe_budget: int = 8_000
+     min_frequency: int = 2
+     seed: int = 42
+     extensions: tuple[str, ...] = DEFAULT_CODE_EXTENSIONS
+     special_tokens: tuple[str, ...] = DEFAULT_SPECIAL_TOKENS
+     workers: int = 0  # 0 means os.cpu_count()
+     shard_size_bytes: int = 64 * 1024 * 1024  # 64 MiB per shard
+     license_allowlist: tuple[str, ...] = (
+         "MIT",
+         "Apache-2.0",
+         "BSD-3-Clause",
+         "BSD-2-Clause",
+         "ISC",
+         "Apache 2.0",
+         "Apache License 2.0",
+     )
+     enable_secret_scrub: bool = True
+     enable_license_filter: bool = False  # off by default; opt-in
+
+     def __post_init__(self) -> None:
+         if not 0.0 < self.coverage_target < 1.0:
+             raise ValueError(f"coverage_target must be in (0,1), got {self.coverage_target}")
+         if self.vocab_size < 1024:
+             raise ValueError(f"vocab_size too small: {self.vocab_size}")
+         if self.max_token_len < 1:
+             raise ValueError(f"max_token_len must be positive: {self.max_token_len}")
+
+     def to_dict(self) -> dict[str, Any]:
+         return asdict(self)
+
+
+ __all__ = ["DEFAULT_CODE_EXTENSIONS", "DEFAULT_SPECIAL_TOKENS", "CUTEConfig"]
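
How the frozen config behaves in practice (illustrative session):

    >>> from cute_tokenizer import CUTEConfig
    >>> CUTEConfig(vocab_size=32_000).to_dict()["vocab_size"]
    32000
    >>> CUTEConfig(coverage_target=1.5)
    Traceback (most recent call last):
      ...
    ValueError: coverage_target must be in (0,1), got 1.5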
cute_tokenizer/corpus.py ADDED
@@ -0,0 +1,305 @@
+ """Corpus pipeline: stream files, dedupe by content hash, scrub secrets, shard.
+
+ The output is a sequence of deterministic shards on disk that downstream
+ phases (frequency counting, BPE training) can iterate efficiently.
+ """
+
+ from __future__ import annotations
+
+ import gzip
+ import hashlib
+ from collections.abc import Iterable, Iterator
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ import orjson
+ import regex as re
+
+ from .pua import find_pua_codepoints
+
+ # ---------------------------------------------------------------------------
+ # Secret scrubbing
+ # ---------------------------------------------------------------------------
+
+ # Each pattern is conservative — false positives drop a file, which is fine
+ # at corpus scale. False negatives are far more dangerous (secret in vocab).
+ SECRET_PATTERNS: tuple[tuple[str, re.Pattern[str]], ...] = (
+     ("aws_access_key", re.compile(r"AKIA[0-9A-Z]{16}")),
+     ("openai_api_key", re.compile(r"sk-(?:proj-)?[A-Za-z0-9_-]{20,}")),
+     ("anthropic_api_key", re.compile(r"sk-ant-[A-Za-z0-9_\-]{50,}")),
+     ("github_pat", re.compile(r"ghp_[A-Za-z0-9]{36}")),
+     ("github_oauth", re.compile(r"gho_[A-Za-z0-9]{36}")),
+     ("github_app", re.compile(r"(ghu|ghs)_[A-Za-z0-9]{36}")),
+     ("google_api", re.compile(r"AIza[0-9A-Za-z_\-]{35}")),
+     ("slack_token", re.compile(r"xox[baprs]-[A-Za-z0-9-]{10,}")),
+     ("private_key_pem", re.compile(r"-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----")),
+     ("jwt", re.compile(r"eyJ[A-Za-z0-9_\-]{10,}\.eyJ[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{10,}")),
+ )
+
+
+ def has_secret(text: str) -> str | None:
+     """Return the name of the first matching secret pattern, or None."""
+     for name, pat in SECRET_PATTERNS:
+         if pat.search(text):
+             return name
+     return None
+
+
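A concrete sense of the detector (the key below is a synthetic string shaped to match the `aws_access_key` pattern, not a real credential):

    assert has_secret("key = 'AKIA" + "A" * 16 + "'") == "aws_access_key"
    assert has_secret("print('hello world')") is None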
+ # ---------------------------------------------------------------------------
+ # License filter
+ # ---------------------------------------------------------------------------
+
+ # SPDX header detector. Matches lines like:
+ #   # SPDX-License-Identifier: MIT
+ #   // SPDX-License-Identifier: Apache-2.0
+ # Operates on the first 4 KiB of each file so we don't scan large blobs.
+ _SPDX_REGEX = re.compile(
+     r"SPDX-License-Identifier\s*:\s*([A-Za-z0-9.\-+ ]+)",
+     re.IGNORECASE,
+ )
+ # Heuristic: explicit "All rights reserved" / "Proprietary" / "Confidential"
+ # in the file head. We refuse files matching these unless an SPDX header
+ # explicitly grants a permissive license.
+ _PROPRIETARY_REGEX = re.compile(
+     r"\b(All Rights Reserved|Proprietary and Confidential|UNLICENSED|License: Proprietary)\b",
+     re.IGNORECASE,
+ )
+ _LICENSE_HEAD_BYTES = 4096
+
+
+ def detect_license(text: str) -> str | None:
+     """Best-effort license detection from a file's head.
+
+     Returns the SPDX identifier if found, otherwise None. Does NOT make a
+     keep/drop decision — that's `is_license_allowed`'s job.
+     """
+     head = text[:_LICENSE_HEAD_BYTES]
+     m = _SPDX_REGEX.search(head)
+     if m:
+         return m.group(1).strip()
+     return None
+
+
+ def is_license_allowed(text: str, allowlist: Iterable[str]) -> bool:
+     """Decide whether a file's license header (if any) permits inclusion.
+
+     Logic:
+       1. If an SPDX header is present and matches the allowlist → allow.
+       2. If an SPDX header is present and does NOT match → reject.
+       3. If no SPDX header but a 'proprietary' marker is in the head → reject.
+       4. Otherwise (no headers, no markers) → allow. The corpus owner is
+          responsible for not feeding obviously copyrighted material; the
+          filter is a safety net, not a legal review.
+     """
+     spdx = detect_license(text)
+     if spdx is not None:
+         allow_set = {entry.strip().lower() for entry in allowlist}
+         return spdx.lower() in allow_set
+
+     head = text[:_LICENSE_HEAD_BYTES]
+     return not _PROPRIETARY_REGEX.search(head)
+
+
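The four branches are easiest to see on tiny synthetic file heads (illustrative):

    allow = ("MIT", "Apache-2.0")
    assert is_license_allowed("# SPDX-License-Identifier: MIT\n", allow)          # 1: allowlisted SPDX
    assert not is_license_allowed("# SPDX-License-Identifier: GPL-3.0\n", allow)  # 2: SPDX not allowlisted
    assert not is_license_allowed("/* All Rights Reserved */\n", allow)           # 3: proprietary marker
    assert is_license_allowed("def f():\n    pass\n", allow)                      # 4: no header, no marker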
+ # ---------------------------------------------------------------------------
+ # Records
+ # ---------------------------------------------------------------------------
+
+
+ @dataclass(frozen=True)
+ class CorpusRecord:
+     """One file's content + metadata."""
+
+     path: str  # path relative to the corpus root
+     text: str
+     sha256: str
+
+     def to_json(self) -> bytes:
+         return orjson.dumps({"path": self.path, "text": self.text, "sha256": self.sha256})
+
+     @classmethod
+     def from_json(cls, line: bytes) -> CorpusRecord:
+         d = orjson.loads(line)
+         return cls(path=d["path"], text=d["text"], sha256=d["sha256"])
+
+
+ @dataclass(frozen=True)
+ class IngestStats:
+     """Aggregated stats from one ingest pass."""
+
+     files_seen: int
+     files_kept: int
+     files_dropped_dedup: int
+     files_dropped_secret: int
+     files_dropped_license: int
+     files_dropped_decode: int
+     files_dropped_size: int
+     bytes_kept: int
+     pua_codepoints_in_corpus: frozenset[int]
+
+
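`to_json` and `from_json` are exact inverses, which the shard writer and readers below rely on; a quick illustrative round trip (`_hash_text` is defined in the ingestion section that follows):

    rec = CorpusRecord(path="pkg/a.py", text="x = 1\n", sha256=_hash_text("x = 1\n"))
    assert CorpusRecord.from_json(rec.to_json()) == rec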
+ # ---------------------------------------------------------------------------
+ # Ingestion
+ # ---------------------------------------------------------------------------
+
+
+ def _hash_text(text: str) -> str:
+     return hashlib.sha256(text.encode("utf-8")).hexdigest()
+
+
+ def iter_corpus_files(
+     corpus_dir: Path,
+     extensions: Iterable[str],
+     max_bytes: int = 5_000_000,
+ ) -> Iterator[Path]:
+     """Yield candidate files under `corpus_dir`, deterministically ordered.
+
+     Sorted by relative path so iteration order is reproducible across runs.
+     """
+     ext_set = {e.lower() for e in extensions}
+     candidates = [p for p in corpus_dir.rglob("*") if p.is_file() and p.suffix.lower() in ext_set]
+     candidates.sort(key=lambda p: str(p.relative_to(corpus_dir)).replace("\\", "/"))
+     for p in candidates:
+         try:
+             if p.stat().st_size > max_bytes:
+                 continue
+         except OSError:
+             continue
+         yield p
+
+
+ def ingest_corpus(
+     corpus_dir: Path,
+     out_dir: Path,
+     extensions: Iterable[str],
+     shard_size_bytes: int = 64 * 1024 * 1024,
+     enable_secret_scrub: bool = True,
+     enable_license_filter: bool = False,
+     license_allowlist: Iterable[str] = (),
+     max_file_bytes: int = 5_000_000,
+ ) -> IngestStats:
+     """Read corpus files, dedupe + scrub, write line-delimited gzipped shards.
+
+     Output layout:
+         out_dir/shards/shard_00000.jsonl.gz
+         out_dir/shards/shard_00001.jsonl.gz
+         ...
+
+     Each line of each shard is a CorpusRecord.to_json() blob.
+     """
+     out_dir.mkdir(parents=True, exist_ok=True)
+     shards_dir = out_dir / "shards"
+     shards_dir.mkdir(exist_ok=True)
+
+     seen_hashes: set[str] = set()
+     files_seen = files_kept = 0
+     drop_dedup = drop_secret = drop_license = drop_decode = drop_size = 0
+     bytes_kept = 0
+     pua_codepoints: set[int] = set()
+     license_allowlist_t = tuple(license_allowlist)
+
+     shard_idx = 0
+     shard_path = shards_dir / f"shard_{shard_idx:05d}.jsonl.gz"
+     shard_fh: gzip.GzipFile | None = gzip.open(shard_path, "wb")  # noqa: SIM115 (rolling handle, closed in finally)
+     bytes_in_shard = 0
+
+     try:
+         for path in iter_corpus_files(corpus_dir, extensions, max_bytes=max_file_bytes):
+             files_seen += 1
+             try:
+                 text = path.read_text(encoding="utf-8", errors="strict")
+             except (UnicodeDecodeError, OSError):
+                 drop_decode += 1
+                 continue
+
+             if not text:
+                 drop_size += 1
+                 continue
+
+             sha = _hash_text(text)
+             if sha in seen_hashes:
+                 drop_dedup += 1
+                 continue
+
+             if enable_secret_scrub and has_secret(text):
+                 drop_secret += 1
+                 continue
+
+             if enable_license_filter and not is_license_allowed(text, license_allowlist_t):
+                 drop_license += 1
+                 continue
+
+             seen_hashes.add(sha)
+             pua_codepoints.update(find_pua_codepoints(text))
+
+             rec = CorpusRecord(
+                 path=str(path.relative_to(corpus_dir)).replace("\\", "/"),
+                 text=text,
+                 sha256=sha,
+             )
+             line = rec.to_json() + b"\n"
+
+             assert shard_fh is not None
+             if bytes_in_shard + len(line) > shard_size_bytes and bytes_in_shard > 0:
+                 shard_fh.close()
+                 shard_idx += 1
+                 shard_path = shards_dir / f"shard_{shard_idx:05d}.jsonl.gz"
+                 shard_fh = gzip.open(shard_path, "wb")  # noqa: SIM115
+                 bytes_in_shard = 0
+
+             shard_fh.write(line)
+             bytes_in_shard += len(line)
+             bytes_kept += len(text.encode("utf-8"))
+             files_kept += 1
+     finally:
+         if shard_fh is not None:
+             shard_fh.close()
+
+     return IngestStats(
+         files_seen=files_seen,
+         files_kept=files_kept,
+         files_dropped_dedup=drop_dedup,
+         files_dropped_secret=drop_secret,
+         files_dropped_license=drop_license,
+         files_dropped_decode=drop_decode,
+         files_dropped_size=drop_size,
+         bytes_kept=bytes_kept,
+         pua_codepoints_in_corpus=frozenset(pua_codepoints),
+     )
+
+
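Putting the ingest phase together, a minimal sketch of how a downstream phase would drive it (paths and extensions are placeholders):

    from pathlib import Path

    stats = ingest_corpus(
        corpus_dir=Path("./corpus"),
        out_dir=Path("./work"),
        extensions=(".py", ".rs"),
    )
    print(f"kept {stats.files_kept}/{stats.files_seen} files ({stats.bytes_kept} bytes)")

    # Stream the shards back, e.g. for frequency counting or BPE training.
    for text in iter_shard_texts(Path("./work") / "shards"):
        ...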
+ def iter_shards(shards_dir: Path) -> Iterator[Path]:
+     """Yield shard paths in deterministic order."""
+     shards = sorted(shards_dir.glob("shard_*.jsonl.gz"))
+     yield from shards
+
+
+ def read_shard(shard_path: Path) -> Iterator[CorpusRecord]:
+     """Stream records from one shard."""
+     with gzip.open(shard_path, "rb") as fh:
+         for line in fh:
+             line = line.strip()
+             if not line:
+                 continue
+             yield CorpusRecord.from_json(line)
+
+
+ def iter_shard_texts(shards_dir: Path) -> Iterator[str]:
+     """Stream text payloads from all shards in order."""
+     for shard in iter_shards(shards_dir):
+         for rec in read_shard(shard):
+             yield rec.text
+
+
+ __all__ = [
+     "SECRET_PATTERNS",
+     "CorpusRecord",
+     "IngestStats",
+     "detect_license",
+     "has_secret",
+     "ingest_corpus",
+     "is_license_allowed",
+     "iter_corpus_files",
+     "iter_shard_texts",
+     "iter_shards",
+     "read_shard",
+ ]
@@ -0,0 +1,37 @@
+ """PUA-aware decoding.
+
+ After the underlying ByteLevel BPE decoder reconstructs a string, we must
+ substitute every PUA character back to its original word. This is a single
+ linear scan with a dict lookup per character — O(n).
+ """
+
+ from __future__ import annotations
+
+ from .pua import PUAMapping, is_pua_char
+
+
+ def reverse_pua_substitute(text: str, mapping: PUAMapping) -> str:
+     """Replace every PUA character in `text` with its original mapped word.
+
+     Characters not in the mapping are passed through unchanged. This is
+     safe even if the input contains PUA chars that weren't in the mapping
+     (they survive the round-trip as themselves).
+     """
+     pua_to_word = mapping.pua_to_word
+     if not pua_to_word:
+         return text
+
+     # Fast path: if no PUA chars present, return as-is.
+     if not any(is_pua_char(c) for c in text):
+         return text
+
+     out: list[str] = []
+     for ch in text:
+         if is_pua_char(ch):
+             out.append(pua_to_word.get(ch, ch))
+         else:
+             out.append(ch)
+     return "".join(out)
+
+
+ __all__ = ["reverse_pua_substitute"]
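
A sketch of the substitution using a minimal stand-in for `PUAMapping` (its real constructor lives in `pua.py`, which is not part of this diff; only the `pua_to_word` attribute used above is assumed, and U+E000/U+E001 sit in the BMP Private Use Area):

    from types import SimpleNamespace

    # Hypothetical mapping object: just the attribute reverse_pua_substitute reads.
    mapping = SimpleNamespace(pua_to_word={"\uE000": "return", "\uE001": "import"})
    encoded = "\uE001 os\n\uE000 0\n"
    assert reverse_pua_substitute(encoded, mapping) == "import os\nreturn 0\n"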