cute_tokenizer-0.1.0-py3-none-any.whl
This diff shows the content of a publicly released package version as it appears in its public registry, and is provided for informational purposes only.
- cute_tokenizer/__init__.py +35 -0
- cute_tokenizer/_version.py +1 -0
- cute_tokenizer/cli.py +130 -0
- cute_tokenizer/config.py +96 -0
- cute_tokenizer/corpus.py +305 -0
- cute_tokenizer/decode.py +37 -0
- cute_tokenizer/frequency.py +116 -0
- cute_tokenizer/manifest.py +145 -0
- cute_tokenizer/patterns.py +102 -0
- cute_tokenizer/pretokenizer.py +171 -0
- cute_tokenizer/pua.py +156 -0
- cute_tokenizer/selection.py +103 -0
- cute_tokenizer/tokenizer.py +181 -0
- cute_tokenizer/trainer.py +266 -0
- cute_tokenizer-0.1.0.dist-info/METADATA +258 -0
- cute_tokenizer-0.1.0.dist-info/RECORD +19 -0
- cute_tokenizer-0.1.0.dist-info/WHEEL +4 -0
- cute_tokenizer-0.1.0.dist-info/entry_points.txt +2 -0
- cute_tokenizer-0.1.0.dist-info/licenses/LICENSE +21 -0
cute_tokenizer/__init__.py
ADDED
@@ -0,0 +1,35 @@
"""CUTE — Compact Unicode Token Encoding.

Public API:
    build_cute — train a CUTE tokenizer from a corpus directory.
    CUTEConfig — all knobs for the build pipeline.
    CUTETokenizerFast — HuggingFace-compatible inference wrapper.
    PUAMapping — word ↔ PUA character mapping.
"""

from __future__ import annotations

import os as _os

# Silence the "None of PyTorch, TensorFlow, or Flax have been found" warning
# that `transformers` emits at import. CUTE only needs the tokenizer layer, not
# the model layer, so this warning is irrelevant. Must be set BEFORE the first
# `transformers` import — putting it here covers both library and CLI paths.
_os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
_os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")

from ._version import __version__
from .config import CUTEConfig
from .pua import PUAMapping
from .tokenizer import CUTETokenizerFast
from .trainer import build_cute, load_mapping, save_mapping

__all__ = [
    "CUTEConfig",
    "CUTETokenizerFast",
    "PUAMapping",
    "__version__",
    "build_cute",
    "load_mapping",
    "save_mapping",
]
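
For orientation, a minimal end-to-end sketch of this public API; the directory paths are hypothetical, and the constructor and call signatures are taken from `cli.py` below:

from pathlib import Path

from cute_tokenizer import CUTEConfig, CUTETokenizerFast, build_cute

# Hypothetical locations; any directory tree of source files can serve as a corpus.
out = Path("./output")
build_cute(corpus_dir=Path("./corpus"), output_dir=out, config=CUTEConfig())

tok = CUTETokenizerFast(
    tokenizer_file=out / "tokenizer.json",
    cute_mapping_file=out / "cute_mapping.json",
)
ids = tok("def main():\n    pass\n", add_special_tokens=False).input_ids
text = tok.decode(ids, skip_special_tokens=True)
assert text == "def main():\n    pass\n"  # byte-equal round-trip is the design goal
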
cute_tokenizer/_version.py
ADDED
@@ -0,0 +1 @@
__version__ = "0.1.0"

cute_tokenizer/cli.py
ADDED
@@ -0,0 +1,130 @@
"""Command-line interface for CUTE.

cute build --corpus ./corpus --output ./output [--config configs/default.toml]
cute roundtrip-check --tokenizer ./output --corpus ./holdout
cute info --tokenizer ./output
"""

from __future__ import annotations

import argparse
import json
import os
import sys
from pathlib import Path

# Silence the noisy "None of PyTorch, TensorFlow >= 2.0, or Flax have been found"
# warning from `transformers`. We only use the tokenizer, not the model layer,
# so this warning is irrelevant. Set BEFORE any transformers import below.
os.environ.setdefault("TRANSFORMERS_VERBOSITY", "error")
os.environ.setdefault("TRANSFORMERS_NO_ADVISORY_WARNINGS", "1")

from ._version import __version__
from .config import CUTEConfig
from .corpus import iter_corpus_files
from .manifest import BuildManifest
from .tokenizer import CUTETokenizerFast
from .trainer import build_cute


def _cmd_build(args: argparse.Namespace) -> int:
    config = _load_config(Path(args.config)) if args.config else CUTEConfig()
    manifest_path = build_cute(
        corpus_dir=Path(args.corpus),
        output_dir=Path(args.output),
        config=config,
    )
    print(f"Build complete. Manifest: {manifest_path}")
    return 0


def _cmd_roundtrip_check(args: argparse.Namespace) -> int:
    tok_dir = Path(args.tokenizer)
    tok = CUTETokenizerFast(
        tokenizer_file=tok_dir / "tokenizer.json",
        cute_mapping_file=tok_dir / "cute_mapping.json",
    )

    corpus_dir = Path(args.corpus)
    files_checked = files_failed = 0
    for path in iter_corpus_files(
        corpus_dir,
        extensions=(".py", ".js", ".ts", ".java", ".c", ".cpp", ".rs", ".go", ".rb", ".php"),
    ):
        try:
            text = path.read_text(encoding="utf-8")
        except (UnicodeDecodeError, OSError):
            continue
        files_checked += 1
        ids = tok(text, add_special_tokens=False).input_ids
        decoded = tok.decode(ids, skip_special_tokens=True)
        if decoded != text:
            files_failed += 1
            print(f"FAIL {path.relative_to(corpus_dir)}")
            if args.verbose:
                _show_diff(text, decoded)
        if files_checked >= args.max_files:
            break

    print(f"Round-trip check: {files_checked - files_failed}/{files_checked} OK")
    return 0 if files_failed == 0 else 1


def _cmd_info(args: argparse.Namespace) -> int:
    tok_dir = Path(args.tokenizer)
    manifest = BuildManifest.read(tok_dir / "build_manifest.json")
    print(json.dumps(manifest.to_dict(), indent=2))
    return 0


def _show_diff(expected: str, got: str) -> None:
    """Print a brief diff for round-trip failures."""
    for i, (e, g) in enumerate(zip(expected, got, strict=False)):
        if e != g:
            ctx = max(0, i - 20)
            print(f"  first diff at offset {i}:")
            print(f"  expected: ...{expected[ctx : i + 20]!r}")
            print(f"  got     : ...{got[ctx : i + 20]!r}")
            return
    if len(expected) != len(got):
        print(f"  length differs: expected={len(expected)}, got={len(got)}")


def _load_config(path: Path) -> CUTEConfig:
    """Load a config from TOML. Lazy import — `tomllib` is stdlib in 3.11+."""
    if sys.version_info >= (3, 11):
        import tomllib
    else:
        import tomli as tomllib  # type: ignore[no-redef]
    data = tomllib.loads(path.read_text(encoding="utf-8"))
    return CUTEConfig(**data)


def main(argv: list[str] | None = None) -> int:
    parser = argparse.ArgumentParser(prog="cute", description="CUTE tokenizer builder")
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    sub = parser.add_subparsers(dest="cmd", required=True)

    p_build = sub.add_parser("build", help="Train a CUTE tokenizer from a corpus")
    p_build.add_argument("--corpus", required=True, help="Corpus directory")
    p_build.add_argument("--output", required=True, help="Output directory")
    p_build.add_argument("--config", help="Optional TOML config path")
    p_build.set_defaults(func=_cmd_build)

    p_rt = sub.add_parser("roundtrip-check", help="Verify byte-equal round-trip")
    p_rt.add_argument("--tokenizer", required=True, help="Trained tokenizer dir")
    p_rt.add_argument("--corpus", required=True, help="Held-out corpus to check")
    p_rt.add_argument("--max-files", type=int, default=10_000)
    p_rt.add_argument("--verbose", action="store_true")
    p_rt.set_defaults(func=_cmd_roundtrip_check)

    p_info = sub.add_parser("info", help="Print build manifest")
    p_info.add_argument("--tokenizer", required=True)
    p_info.set_defaults(func=_cmd_info)

    args = parser.parse_args(argv)
    return int(args.func(args))


if __name__ == "__main__":
    raise SystemExit(main())
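
Because `main` takes an explicit `argv` list, the same commands are scriptable from Python without a subprocess; a short sketch with illustrative paths:

from cute_tokenizer.cli import main

# Equivalent to: cute build --corpus ./corpus --output ./output
code = main(["build", "--corpus", "./corpus", "--output", "./output"])
assert code == 0

# Returns 1 if any file fails the byte-equal round-trip check.
main(["roundtrip-check", "--tokenizer", "./output", "--corpus", "./holdout", "--verbose"])
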
cute_tokenizer/config.py
ADDED
@@ -0,0 +1,96 @@
"""Configuration for the CUTE tokenizer build pipeline."""

from __future__ import annotations

from dataclasses import asdict, dataclass
from typing import Any

DEFAULT_SPECIAL_TOKENS: tuple[str, ...] = (
    "<pad>",
    "<s>",
    "</s>",
    "<unk>",
    "<|endoftext|>",
    "<|fim_prefix|>",
    "<|fim_middle|>",
    "<|fim_suffix|>",
    "<|file_sep|>",
    "<|repo_name|>",
)

DEFAULT_CODE_EXTENSIONS: tuple[str, ...] = (
    ".py",
    ".js",
    ".ts",
    ".tsx",
    ".jsx",
    ".java",
    ".c",
    ".cpp",
    ".h",
    ".hpp",
    ".cs",
    ".rs",
    ".go",
    ".rb",
    ".php",
    ".swift",
    ".kt",
    ".scala",
    ".sh",
    ".sql",
    ".html",
    ".css",
    ".scss",
    ".json",
    ".yaml",
    ".yml",
    ".toml",
    ".md",
)


@dataclass(frozen=True)
class CUTEConfig:
    """All knobs for a CUTE build, in one place.

    Frozen so hashing/comparison is well-defined and the manifest serializer
    can dump a stable representation.
    """

    vocab_size: int = 80_000
    coverage_target: float = 0.90
    max_token_len: int = 50
    boost_weight: float = 0.3
    min_bpe_budget: int = 8_000
    min_frequency: int = 2
    seed: int = 42
    extensions: tuple[str, ...] = DEFAULT_CODE_EXTENSIONS
    special_tokens: tuple[str, ...] = DEFAULT_SPECIAL_TOKENS
    workers: int = 0  # 0 means os.cpu_count()
    shard_size_bytes: int = 64 * 1024 * 1024  # 64 MiB per shard
    license_allowlist: tuple[str, ...] = (
        "MIT",
        "Apache-2.0",
        "BSD-3-Clause",
        "BSD-2-Clause",
        "ISC",
        "Apache 2.0",
        "Apache License 2.0",
    )
    enable_secret_scrub: bool = True
    enable_license_filter: bool = False  # off by default; opt-in

    def __post_init__(self) -> None:
        if not 0.0 < self.coverage_target < 1.0:
            raise ValueError(f"coverage_target must be in (0,1), got {self.coverage_target}")
        if self.vocab_size < 1024:
            raise ValueError(f"vocab_size too small: {self.vocab_size}")
        if self.max_token_len < 1:
            raise ValueError(f"max_token_len must be positive: {self.max_token_len}")

    def to_dict(self) -> dict[str, Any]:
        return asdict(self)


__all__ = ["DEFAULT_CODE_EXTENSIONS", "DEFAULT_SPECIAL_TOKENS", "CUTEConfig"]
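
Because `CUTEConfig` is frozen and validated in `__post_init__`, bad knobs fail at construction time rather than mid-build; a brief sketch:

from cute_tokenizer.config import CUTEConfig

cfg = CUTEConfig(vocab_size=48_000, coverage_target=0.85)  # other knobs keep their defaults
assert cfg.to_dict()["vocab_size"] == 48_000  # stable dict form, as dumped into the manifest

try:
    CUTEConfig(coverage_target=1.5)  # outside (0, 1)
except ValueError as exc:
    print(exc)  # coverage_target must be in (0,1), got 1.5
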
cute_tokenizer/corpus.py
ADDED
@@ -0,0 +1,305 @@
"""Corpus pipeline: stream files, dedupe by content hash, scrub secrets, shard.

The output is a sequence of deterministic shards on disk that downstream
phases (frequency counting, BPE training) can iterate efficiently.
"""

from __future__ import annotations

import gzip
import hashlib
from collections.abc import Iterable, Iterator
from dataclasses import dataclass
from pathlib import Path

import orjson
import regex as re

from .pua import find_pua_codepoints

# ---------------------------------------------------------------------------
# Secret scrubbing
# ---------------------------------------------------------------------------

# Each pattern is conservative — false positives drop a file, which is fine
# at corpus scale. False negatives are far more dangerous (secret in vocab).
SECRET_PATTERNS: tuple[tuple[str, re.Pattern[str]], ...] = (
    ("aws_access_key", re.compile(r"AKIA[0-9A-Z]{16}")),
    ("openai_api_key", re.compile(r"sk-(?:proj-)?[A-Za-z0-9_-]{20,}")),
    ("anthropic_api_key", re.compile(r"sk-ant-[A-Za-z0-9_\-]{50,}")),
    ("github_pat", re.compile(r"ghp_[A-Za-z0-9]{36}")),
    ("github_oauth", re.compile(r"gho_[A-Za-z0-9]{36}")),
    ("github_app", re.compile(r"(ghu|ghs)_[A-Za-z0-9]{36}")),
    ("google_api", re.compile(r"AIza[0-9A-Za-z_\-]{35}")),
    ("slack_token", re.compile(r"xox[baprs]-[A-Za-z0-9-]{10,}")),
    ("private_key_pem", re.compile(r"-----BEGIN (?:RSA |EC |DSA |OPENSSH )?PRIVATE KEY-----")),
    ("jwt", re.compile(r"eyJ[A-Za-z0-9_\-]{10,}\.eyJ[A-Za-z0-9_\-]{10,}\.[A-Za-z0-9_\-]{10,}")),
)


def has_secret(text: str) -> str | None:
    """Return the name of the first matching secret pattern, or None."""
    for name, pat in SECRET_PATTERNS:
        if pat.search(text):
            return name
    return None


# ---------------------------------------------------------------------------
# License filter
# ---------------------------------------------------------------------------

# SPDX header detector. Matches lines like:
#   # SPDX-License-Identifier: MIT
#   // SPDX-License-Identifier: Apache-2.0
# Operates on the first 4 KiB of each file so we don't scan large blobs.
_SPDX_REGEX = re.compile(
    r"SPDX-License-Identifier\s*:\s*([A-Za-z0-9.\-+ ]+)",
    re.IGNORECASE,
)
# Heuristic: explicit "All rights reserved" / "Proprietary" / "Confidential"
# in the file head. We refuse files matching these unless an SPDX header
# explicitly grants a permissive license.
_PROPRIETARY_REGEX = re.compile(
    r"\b(All Rights Reserved|Proprietary and Confidential|UNLICENSED|License: Proprietary)\b",
    re.IGNORECASE,
)
_LICENSE_HEAD_BYTES = 4096


def detect_license(text: str) -> str | None:
    """Best-effort license detection from a file's head.

    Returns the SPDX identifier if found, otherwise None. Does NOT make a
    keep/drop decision — that's `is_license_allowed`'s job.
    """
    head = text[:_LICENSE_HEAD_BYTES]
    m = _SPDX_REGEX.search(head)
    if m:
        return m.group(1).strip()
    return None


def is_license_allowed(text: str, allowlist: Iterable[str]) -> bool:
    """Decide whether a file's license header (if any) permits inclusion.

    Logic:
      1. If an SPDX header is present and matches the allowlist → allow.
      2. If an SPDX header is present and does NOT match → reject.
      3. If no SPDX header but a 'proprietary' marker is in the head → reject.
      4. Otherwise (no headers, no markers) → allow. The corpus owner is
         responsible for not feeding obviously copyrighted material; the
         filter is a safety net, not a legal review.
    """
    spdx = detect_license(text)
    if spdx is not None:
        allow_set = {entry.strip().lower() for entry in allowlist}
        return spdx.lower() in allow_set

    head = text[:_LICENSE_HEAD_BYTES]
    return not _PROPRIETARY_REGEX.search(head)


# ---------------------------------------------------------------------------
# Records
# ---------------------------------------------------------------------------


@dataclass(frozen=True)
class CorpusRecord:
    """One file's content + metadata."""

    path: str  # path relative to the corpus root
    text: str
    sha256: str

    def to_json(self) -> bytes:
        return orjson.dumps({"path": self.path, "text": self.text, "sha256": self.sha256})

    @classmethod
    def from_json(cls, line: bytes) -> CorpusRecord:
        d = orjson.loads(line)
        return cls(path=d["path"], text=d["text"], sha256=d["sha256"])


@dataclass(frozen=True)
class IngestStats:
    """Aggregated stats from one ingest pass."""

    files_seen: int
    files_kept: int
    files_dropped_dedup: int
    files_dropped_secret: int
    files_dropped_license: int
    files_dropped_decode: int
    files_dropped_size: int
    bytes_kept: int
    pua_codepoints_in_corpus: frozenset[int]


# ---------------------------------------------------------------------------
# Ingestion
# ---------------------------------------------------------------------------


def _hash_text(text: str) -> str:
    return hashlib.sha256(text.encode("utf-8")).hexdigest()


def iter_corpus_files(
    corpus_dir: Path,
    extensions: Iterable[str],
    max_bytes: int = 5_000_000,
) -> Iterator[Path]:
    """Yield candidate files under `corpus_dir`, deterministically ordered.

    Sorted by relative path so iteration order is reproducible across runs.
    """
    ext_set = {e.lower() for e in extensions}
    candidates = [p for p in corpus_dir.rglob("*") if p.is_file() and p.suffix.lower() in ext_set]
    candidates.sort(key=lambda p: str(p.relative_to(corpus_dir)).replace("\\", "/"))
    for p in candidates:
        try:
            if p.stat().st_size > max_bytes:
                continue
        except OSError:
            continue
        yield p


def ingest_corpus(
    corpus_dir: Path,
    out_dir: Path,
    extensions: Iterable[str],
    shard_size_bytes: int = 64 * 1024 * 1024,
    enable_secret_scrub: bool = True,
    enable_license_filter: bool = False,
    license_allowlist: Iterable[str] = (),
    max_file_bytes: int = 5_000_000,
) -> IngestStats:
    """Read corpus files, dedupe + scrub, write line-delimited gzipped shards.

    Output layout:
        out_dir/shards/shard_00000.jsonl.gz
        out_dir/shards/shard_00001.jsonl.gz
        ...

    Each line of each shard is a CorpusRecord.to_json() blob.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    shards_dir = out_dir / "shards"
    shards_dir.mkdir(exist_ok=True)

    seen_hashes: set[str] = set()
    files_seen = files_kept = 0
    drop_dedup = drop_secret = drop_license = drop_decode = drop_size = 0
    bytes_kept = 0
    pua_codepoints: set[int] = set()
    license_allowlist_t = tuple(license_allowlist)

    shard_idx = 0
    shard_path = shards_dir / f"shard_{shard_idx:05d}.jsonl.gz"
    shard_fh: gzip.GzipFile | None = gzip.open(shard_path, "wb")  # noqa: SIM115 (rolling handle, closed in finally)
    bytes_in_shard = 0

    try:
        for path in iter_corpus_files(corpus_dir, extensions, max_bytes=max_file_bytes):
            files_seen += 1
            try:
                text = path.read_text(encoding="utf-8", errors="strict")
            except (UnicodeDecodeError, OSError):
                drop_decode += 1
                continue

            if not text:
                drop_size += 1
                continue

            sha = _hash_text(text)
            if sha in seen_hashes:
                drop_dedup += 1
                continue

            if enable_secret_scrub and has_secret(text):
                drop_secret += 1
                continue

            if enable_license_filter and not is_license_allowed(text, license_allowlist_t):
                drop_license += 1
                continue

            seen_hashes.add(sha)
            pua_codepoints.update(find_pua_codepoints(text))

            rec = CorpusRecord(
                path=str(path.relative_to(corpus_dir)).replace("\\", "/"),
                text=text,
                sha256=sha,
            )
            line = rec.to_json() + b"\n"

            assert shard_fh is not None
            if bytes_in_shard + len(line) > shard_size_bytes and bytes_in_shard > 0:
                shard_fh.close()
                shard_idx += 1
                shard_path = shards_dir / f"shard_{shard_idx:05d}.jsonl.gz"
                shard_fh = gzip.open(shard_path, "wb")  # noqa: SIM115
                bytes_in_shard = 0

            shard_fh.write(line)
            bytes_in_shard += len(line)
            bytes_kept += len(text.encode("utf-8"))
            files_kept += 1
    finally:
        if shard_fh is not None:
            shard_fh.close()

    return IngestStats(
        files_seen=files_seen,
        files_kept=files_kept,
        files_dropped_dedup=drop_dedup,
        files_dropped_secret=drop_secret,
        files_dropped_license=drop_license,
        files_dropped_decode=drop_decode,
        files_dropped_size=drop_size,
        bytes_kept=bytes_kept,
        pua_codepoints_in_corpus=frozenset(pua_codepoints),
    )


def iter_shards(shards_dir: Path) -> Iterator[Path]:
    """Yield shard paths in deterministic order."""
    shards = sorted(shards_dir.glob("shard_*.jsonl.gz"))
    yield from shards


def read_shard(shard_path: Path) -> Iterator[CorpusRecord]:
    """Stream records from one shard."""
    with gzip.open(shard_path, "rb") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            yield CorpusRecord.from_json(line)


def iter_shard_texts(shards_dir: Path) -> Iterator[str]:
    """Stream text payloads from all shards in order."""
    for shard in iter_shards(shards_dir):
        for rec in read_shard(shard):
            yield rec.text


__all__ = [
    "SECRET_PATTERNS",
    "CorpusRecord",
    "IngestStats",
    "detect_license",
    "has_secret",
    "ingest_corpus",
    "is_license_allowed",
    "iter_corpus_files",
    "iter_shard_texts",
    "iter_shards",
    "read_shard",
]
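
A sketch of one ingest pass followed by streaming the shards back out; the input and work directories are hypothetical:

from pathlib import Path

from cute_tokenizer.corpus import ingest_corpus, iter_shard_texts

stats = ingest_corpus(
    corpus_dir=Path("./corpus"),  # hypothetical input tree
    out_dir=Path("./work"),
    extensions=(".py", ".rs"),
)
print(f"kept {stats.files_kept}/{stats.files_seen} files ({stats.bytes_kept} bytes)")

# Downstream phases (frequency counting, BPE training) iterate the shards like this:
for text in iter_shard_texts(Path("./work") / "shards"):
    pass
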
cute_tokenizer/decode.py
ADDED
@@ -0,0 +1,37 @@
"""PUA-aware decoding.

After the underlying ByteLevel BPE decoder reconstructs a string, we must
substitute every PUA character back to its original word. This is a single
linear scan with a dict lookup per character — O(n).
"""

from __future__ import annotations

from .pua import PUAMapping, is_pua_char


def reverse_pua_substitute(text: str, mapping: PUAMapping) -> str:
    """Replace every PUA character in `text` with its original mapped word.

    Characters not in the mapping are passed through unchanged. This is
    safe even if the input contains PUA chars that weren't in the mapping
    (they survive the round-trip as themselves).
    """
    pua_to_word = mapping.pua_to_word
    if not pua_to_word:
        return text

    # Fast path: if no PUA chars present, return as-is.
    if not any(is_pua_char(c) for c in text):
        return text

    out: list[str] = []
    for ch in text:
        if is_pua_char(ch):
            out.append(pua_to_word.get(ch, ch))
        else:
            out.append(ch)
    return "".join(out)


__all__ = ["reverse_pua_substitute"]
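
At runtime `reverse_pua_substitute` only reads `mapping.pua_to_word`, so for illustration any object carrying that dict will do (`PUAMapping`'s real constructor lives in `pua.py`, which is not shown in this excerpt); U+E000 is the first BMP private-use codepoint, so `is_pua_char` should accept it:

from types import SimpleNamespace

from cute_tokenizer.decode import reverse_pua_substitute

# Stand-in for a PUAMapping with a single word mapped to a PUA character.
mapping = SimpleNamespace(pua_to_word={"\ue000": "return"})
assert reverse_pua_substitute("\ue000 x", mapping) == "return x"
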