tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/__init__.py ADDED
@@ -0,0 +1,77 @@
+ """
+ TokMor (Core)
+ =============
+
+ Small, deterministic multilingual preprocessing:
+ - tokenization / segmentation with offsets
+ - rule-based morphology (best-effort; language-specific analyzers + fallbacks)
+
+ This package does not run ML models at inference time.
+ """
+
+ # Offline-only runtime enforcement (hard block any online/remote opt-in flags)
+ from .offline import enforce_offline as _enforce_offline
+ _enforce_offline()
+
+ __version__ = "1.2.9"
+
+ from .base import BaseTokenizer, TokenizerResult
+ from .factory import (
+     detect_language,
+     get_morphological_analyzer,
+     get_tokenizer,
+     morphology_available,
+     morph_supported_languages,
+     supported_languages,
+     tokenize,
+ )
+
+ # NER-oriented public APIs (recommended)
+ from .api import ( # noqa: F401
+     unified_tokenize,
+     ner_preprocess,
+     sentiment_hint,
+     sentiment_lexicon,
+     languages as preprocess_languages,
+     normalize as preprocess_normalize,
+     normalize_sns as preprocess_normalize_sns,
+     function_word_tag,
+     is_function_word,
+ )
+
+ # Legacy POS-free preprocessing helpers (compat tooling)
+ from .legacy_api import ( # noqa: F401
+     tokenize as legacy_tokenize,
+     segment as legacy_segment,
+     route as legacy_route,
+ )
+
+ __all__ = [
+     # Core
+     'BaseTokenizer',
+     'TokenizerResult',
+     # Factory functions
+     'get_tokenizer',
+     'tokenize',
+     'detect_language',
+     'get_morphological_analyzer',
+     'supported_languages',
+     'morphology_available',
+     'morph_supported_languages',
+     # Public NER-oriented API
+     'unified_tokenize',
+     'ner_preprocess',
+     'function_word_tag',
+     'is_function_word',
+     # Domain hints (optional)
+     'sentiment_hint',
+     'sentiment_lexicon',
+     # Preprocess helpers
+     'preprocess_languages',
+     'preprocess_normalize',
+     'preprocess_normalize_sns',
+     # Legacy POS-free preprocessing (opt-in)
+     'legacy_tokenize',
+     'legacy_segment',
+     'legacy_route',
+ ]
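
For orientation, a minimal usage sketch against the exports above (nothing here is captured output from this release; the expected values follow the function_word_tag docstring shown later in tokmor/api.py):

    import tokmor

    # Importing the package runs _enforce_offline(), so no network access is attempted.
    print(tokmor.__version__)                      # "1.2.9"
    print(tokmor.is_function_word("en", "the"))    # True, per the api.py docstring
    print(tokmor.function_word_tag("en", "the"))   # 'DET', per the api.py docstring example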
tokmor/api.py ADDED
@@ -0,0 +1,194 @@
+ """
+ TokMor public API (NER-oriented preprocessing)
+ =============================================
+
+ Primary APIs:
+ - unified_tokenize(): offsets + best-effort POS/particle hints for downstream NER systems
+ - ner_preprocess(): SNS-aware, function-word hard-block, surface tokens for NER input
+
+ Legacy POS-free preprocessing helpers (tokenize/segment/route) were moved to `tokmor.legacy_api`.
+ """
+
+ from __future__ import annotations
+
+ from typing import Any, Dict, List, Literal, Optional, Union
+
+ from . import __version__ as _TOKMOR_VERSION
+ from .factory import detect_language
+ from .inventory import build_language_inventory
+ from .preprocess import normalize_text
+ from .schema import SCHEMA_VERSION
+
+
+ OutputFormat = Literal["tokens", "tokens_with_offsets"]
+ SegmentToken = Dict[str, Any]
+
+ def ner_preprocess(
+     text: str,
+     lang: str = "auto",
+     *,
+     sns: bool = True,
+     morphology: Optional[bool] = None,
+     include_token_hints: bool = False,
+     include_function_word_hints: bool = False,
+     drop_function_words: bool = True,
+     include_pos4_hints: bool = False,
+     use_surfaces: bool = True,
+ ) -> Dict[str, Any]:
+     """
+     Convenience helper for NER pipelines.
+
+     Returns:
+     - `ner_tokens`: tokens suitable for NER input (discourse markers/punct removed)
+     - `sns_entities`: SNS discourse markers as separate NER-style entities
+     - `ner_surfaces`: contiguous merged surfaces (helpful for morpheme-split languages)
+     """
+     if lang == "auto":
+         text_norm = normalize_text(text, sns=bool(sns))
+         lang = detect_language(text_norm)
+     from .ner_prep import ner_preprocess as _ner_preprocess
+     return _ner_preprocess(
+         text,
+         lang=lang,
+         sns=sns,
+         morphology=morphology,
+         include_token_hints=include_token_hints,
+         include_function_word_hints=include_function_word_hints,
+         drop_function_words=drop_function_words,
+         include_pos4_hints=include_pos4_hints,
+         use_surfaces=use_surfaces,
+     )
+
+
+ def unified_tokenize(
+     text: str,
+     lang: str = "auto",
+     *,
+     sns: bool = True,
+     morphology: Optional[bool] = None,
+     include_sns_tags: bool = False,
+     include_pos4: bool = True,
+ ) -> Dict[str, Any]:
+     """
+     Tokenize with offsets plus best-effort POS/particle hints (for downstream NER systems).
+
+     Returns tokens with:
+     - text/start/end
+     - pos (language-specific tag if available)
+     - pos_conf (deterministic heuristic, not ML confidence)
+     - is_particle (ko/ja/zh best-effort)
+     - pos4 (optional): N/V/ADJ/ADV/UNK
+     - sns (optional): discourse marker classification
+     """
+     from .unified_tokens import unified_tokenize as _unified_tokenize
+
+     return _unified_tokenize(
+         text,
+         lang=lang,
+         sns=sns,
+         morphology=morphology,
+         include_sns_tags=include_sns_tags,
+         include_pos4=include_pos4,
+     )
+
+
+ def languages() -> Dict[str, Any]:
+     """
+     Return an inventory of supported languages and capabilities.
+     """
+     return build_language_inventory()
+
+
+ def normalize(text: str) -> str:
+     """
+     Normalize raw text (conservative; keeps meaning).
+     """
+     return normalize_text(text)
+
+
+ def normalize_sns(text: str) -> str:
+     """
+     Normalize raw text for SNS / user-generated content.
+     This is still deterministic and conservative, but includes helpful tweaks
+     like fullwidth ASCII normalization for @/#/URLs.
+     """
+     return normalize_text(text, sns=True)
+
+
+ def function_word_tag(lang: str, word: str) -> Optional[str]:
+     """
+     Check if a word is a function word and return its POS tag.
+
+     Uses lang_configs (358 languages) with morphology analyzer fallback.
+
+     Args:
+         lang: Language code (e.g., 'en', 'ko', 'jv')
+         word: Word to check
+
+     Returns:
+         POS tag (DET, PRON, AUX, ADP, CCONJ, ADV, PART) or None if not a function word.
+
+     Example:
+         >>> function_word_tag('en', 'the')
+         'DET'
+         >>> function_word_tag('jv', 'iku')
+         'DET'
+         >>> function_word_tag('en', 'running')
+         None
+     """
+     from .ner_prep import function_word_tag as _function_word_tag
+     return _function_word_tag(lang, word)
+
+
+ def is_function_word(lang: str, word: str) -> bool:
+     """
+     Check if a word is a function word.
+
+     Args:
+         lang: Language code (e.g., 'en', 'ko', 'jv')
+         word: Word to check
+
+     Returns:
+         True if word is a function word, False otherwise.
+     """
+     return function_word_tag(lang, word) is not None
+
+
+ def sentiment_lexicon(lang: str = "auto") -> Optional[Dict[str, Any]]:
+     """
+     Load the built-in small sentiment lexicon (currently ko/en seed).
+
+     Returns None if unavailable for the requested language.
+     """
+     from .domain.sentiment import load_sentiment_lexicon
+
+     if lang == "auto":
+         # Best effort: infer from empty text is meaningless; caller should pass lang in that case.
+         return None
+
+     lex = load_sentiment_lexicon(lang)
+     if lex is None:
+         return None
+     return {
+         "lang": lex.lang,
+         "pos": sorted(lex.pos),
+         "neg": sorted(lex.neg),
+         "negators": sorted(lex.negators),
+         "intensifiers": sorted(lex.intensifiers),
+         "diminishers": sorted(lex.diminishers),
+     }
+
+
+ def sentiment_hint(
+     text: str,
+     lang: str = "auto",
+     *,
+     sns: bool = True,
+ ) -> Dict[str, Any]:
+     """
+     Best-effort sentiment hint for quick experiments (ko/en seed).
+     """
+     from .domain.sentiment import sentiment_hint as _sentiment_hint
+     return _sentiment_hint(text, lang=lang, sns=bool(sns))
+
+
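
A hedged sketch of the two primary entry points, using only keyword arguments declared in the signatures above; the sample text is illustrative and the commented expectations come from the docstrings, not from running this release:

    from tokmor.api import ner_preprocess, unified_tokenize

    text = "@user Seoul is great!!"

    # Offsets plus best-effort POS/particle hints for downstream NER systems.
    uni = unified_tokenize(text, lang="auto", sns=True, include_pos4=True)

    # NER-oriented preprocessing: function words dropped, SNS markers split out as entities.
    prep = ner_preprocess(text, lang="auto", sns=True, drop_function_words=True)

    # Per the docstrings, prep is expected to carry ner_tokens / sns_entities / ner_surfaces.
    print(sorted(uni.keys()))
    print(sorted(prep.keys()))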
tokmor/assets.py ADDED
@@ -0,0 +1,365 @@
+ """
+ TokMor core asset management (offline-first)
+ ===========================================
+
+ This module manages ONLY core assets required for tokenization+morphology quality:
+ - lemma_dict/ (optional but improves lemmatization)
+ - seg_lexicon/ (optional, zh/SEA segmentation improvements)
+
+ It supports:
+ - pack_status(): discover what is available via tokmor.resources routing
+ - build_snapshot(): create an offline data directory (TOKMOR_DATA_DIR layout) with manifest + SHA256SUMS
+ - build_bundle(): tar.gz a snapshot
+ - verify_bundle(): verify tar + manifest + SHA256SUMS integrity
+
+ No downloads. No network access. Pure filesystem operations.
+ """
+
+ from __future__ import annotations
+
+ import hashlib
+ import io
+ import json
+ import os
+ import tarfile
+ import tempfile
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
+
+ from . import __version__ as _TOKMOR_VERSION
+ from .schema import SCHEMA_VERSION
+ from . import resources
+
+
+ MANIFEST_NAME = "TOKMOR_MANIFEST.json"
+ SHA256SUMS_NAME = "SHA256SUMS"
+
+
+ def _sha256_file(p: Path) -> str:
+     h = hashlib.sha256()
+     with p.open("rb") as f:
+         for chunk in iter(lambda: f.read(1024 * 1024), b""):
+             h.update(chunk)
+     return h.hexdigest()
+
+
+ def _iter_files(root: Path) -> Iterable[Path]:
+     for p in sorted(root.rglob("*")):
+         if p.is_file():
+             yield p
+
+
+ def _copy_file(src: Path, dst: Path) -> None:
+     dst.parent.mkdir(parents=True, exist_ok=True)
+     # copy bytes (avoid shutil to keep dependencies minimal and behavior explicit)
+     dst.write_bytes(src.read_bytes())
+
+
+ def _atomic_write_text(path: Path, text: str) -> None:
+     path.parent.mkdir(parents=True, exist_ok=True)
+     with tempfile.NamedTemporaryFile("w", delete=False, dir=str(path.parent), encoding="utf-8") as tf:
+         tf.write(text)
+         tmp = Path(tf.name)
+     tmp.replace(path)
+
+
+ def _atomic_write_bytes(path: Path, data: bytes) -> None:
+     path.parent.mkdir(parents=True, exist_ok=True)
+     with tempfile.NamedTemporaryFile("wb", delete=False, dir=str(path.parent)) as tf:
+         tf.write(data)
+         tmp = Path(tf.name)
+     tmp.replace(path)
+
+
+ def pack_status() -> Dict[str, Any]:
+     """
+     Report discoverable core assets under current routing configuration.
+     """
+     ld = resources.lemma_dict_dir()
+     sd = resources.seg_lexicon_dir()
+     zh_extra = resources.resolve_extra_dict_path("zh")
+
+     def _count(glob_pat: str, d: Path) -> int:
+         try:
+             if not d.exists():
+                 return 0
+             return len(list(d.glob(glob_pat)))
+         except Exception:
+             return 0
+
+     return {
+         "schema_version": int(SCHEMA_VERSION),
+         "tokmor_version": str(_TOKMOR_VERSION),
+         "env": {
+             "TOKMOR_DATA_DIR": os.getenv("TOKMOR_DATA_DIR"),
+             "TOKMOR_LEMMA_DICT_DIR": os.getenv("TOKMOR_LEMMA_DICT_DIR"),
+             "TOKMOR_DISABLE_LEMMA_PACK": os.getenv("TOKMOR_DISABLE_LEMMA_PACK"),
+         },
+         "paths": {
+             "data_dir": str(resources.data_dir()),
+             "lemma_dict_dir": str(ld),
+             "seg_lexicon_dir": str(sd),
+             "zh_extra_dict": str(zh_extra) if zh_extra else None,
+         },
+         "counts": {
+             "lemma_sqlite": _count("*.sqlite", ld),
+             "lemma_db": _count("*.db", ld),
+             "lemma_pkl": _count("*.pkl", ld),
+             "seg_wordfreq_pkl": _count("*_wordfreq.pkl", sd),
+             "seg_wordlist_any": _count("*_wordlist.*", sd),
+             "extra_dict_json": _count("*_extra_dict.json", sd),
+         },
+     }
+
+
+ @dataclass
+ class SnapshotSpec:
+     out_dir: Path
+     include_lemma: bool = True
+     include_seg_lexicon: bool = True
+     langs: Optional[List[str]] = None # if set, copy only matching language files
+
+
+ def _lang_filter_ok(rel_name: str, langs: Optional[List[str]]) -> bool:
+     if not langs:
+         return True
+     # Accept:
+     # - "{lang}.sqlite" in lemma_dict/
+     # - "{lang}_wordfreq.pkl", "{lang}_wordlist.*", "{lang}_extra_dict.json" in seg_lexicon/
+     base = Path(rel_name).name
+     for l in langs:
+         l2 = (l or "").lower().replace("_", "-")
+         if not l2:
+             continue
+         if base.startswith(l2 + "_") or base.startswith(l2 + "."):
+             return True
+     return False
+
+
+ def build_snapshot(spec: SnapshotSpec) -> Dict[str, Any]:
+     """
+     Build an offline snapshot directory with manifest + SHA256SUMS.
+
+     Snapshot layout (subset of TOKMOR_DATA_DIR):
+         out_dir/
+             lemma_dict/...
+             seg_lexicon/...
+             TOKMOR_MANIFEST.json
+             SHA256SUMS
+     """
+     out_dir = Path(spec.out_dir).expanduser().resolve()
+     tmp_root = out_dir.parent / (out_dir.name + ".tmp")
+     if tmp_root.exists():
+         # best-effort cleanup
+         for p in reversed(list(tmp_root.rglob("*"))):
+             try:
+                 if p.is_file():
+                     p.unlink()
+                 elif p.is_dir():
+                     p.rmdir()
+             except Exception:
+                 pass
+         try:
+             tmp_root.rmdir()
+         except Exception:
+             pass
+
+     tmp_root.mkdir(parents=True, exist_ok=True)
+
+     copied: List[str] = []
+     if spec.include_lemma:
+         src = resources.lemma_dict_dir()
+         dst = tmp_root / "lemma_dict"
+         if src.exists():
+             for f in src.iterdir():
+                 if not f.is_file():
+                     continue
+                 if f.suffix.lower() not in {".sqlite", ".db", ".sqlite3", ".pkl"}:
+                     continue
+                 rel = f"lemma_dict/{f.name}"
+                 if not _lang_filter_ok(rel, spec.langs):
+                     continue
+                 _copy_file(f, dst / f.name)
+                 copied.append(rel)
+
+     if spec.include_seg_lexicon:
+         src = resources.seg_lexicon_dir()
+         dst = tmp_root / "seg_lexicon"
+         if src.exists():
+             for f in src.iterdir():
+                 if not f.is_file():
+                     continue
+                 # keep only known safe lexicon assets (no giant corpora)
+                 name = f.name
+                 if not (
+                     name.endswith("_wordfreq.pkl")
+                     or name.endswith("_seg_lexicon.pkl")
+                     or name.endswith("_wordlist.pkl")
+                     or name.endswith("_wordlist.txt")
+                     or name.endswith("_extra_dict.json")
+                     or name.endswith("_extra_lexicon.json")
+                 ):
+                     continue
+                 rel = f"seg_lexicon/{name}"
+                 if not _lang_filter_ok(rel, spec.langs):
+                     continue
+                 _copy_file(f, dst / name)
+                 copied.append(rel)
+
+     # manifest + SHA256SUMS
+     checksums: Dict[str, str] = {}
+     for f in _iter_files(tmp_root):
+         rel = str(f.relative_to(tmp_root)).replace("\\", "/")
+         checksums[rel] = _sha256_file(f)
+
+     manifest: Dict[str, Any] = {
+         "schema_version": int(SCHEMA_VERSION),
+         "tokmor_version": str(_TOKMOR_VERSION),
+         "type": "tokmor_core_snapshot",
+         "assets": {
+             "include_lemma": bool(spec.include_lemma),
+             "include_seg_lexicon": bool(spec.include_seg_lexicon),
+             "langs": spec.langs,
+         },
+         "files": [
+             {"path": rel, "sha256": sha}
+             for rel, sha in sorted(checksums.items(), key=lambda x: x[0])
+         ],
+     }
+
+     _atomic_write_text(tmp_root / MANIFEST_NAME, json.dumps(manifest, ensure_ascii=False, indent=2) + "\n")
+     # recompute checksums including manifest
+     checksums[str(MANIFEST_NAME)] = _sha256_file(tmp_root / MANIFEST_NAME)
+
+     sha_lines = [f"{checksums[p]} {p}" for p in sorted(checksums.keys())]
+     _atomic_write_text(tmp_root / SHA256SUMS_NAME, "\n".join(sha_lines) + "\n")
+
+     # move into place
+     if out_dir.exists():
+         # replace old snapshot atomically-ish (rename to backup then replace)
+         backup = out_dir.parent / (out_dir.name + ".bak")
+         if backup.exists():
+             # delete previous backup
+             for p in reversed(list(backup.rglob("*"))):
+                 try:
+                     if p.is_file():
+                         p.unlink()
+                     elif p.is_dir():
+                         p.rmdir()
+                 except Exception:
+                     pass
+             try:
+                 backup.rmdir()
+             except Exception:
+                 pass
+         out_dir.replace(backup)
+
+     tmp_root.replace(out_dir)
+
+     return {
+         "ok": True,
+         "schema_version": int(SCHEMA_VERSION),
+         "tokmor_version": str(_TOKMOR_VERSION),
+         "out_dir": str(out_dir),
+         "file_count": len(list(_iter_files(out_dir))),
+         "copied_count": len(copied),
+     }
+
+
+ def build_bundle(*, snapshot_dir: Path, out_tgz: Path) -> Dict[str, Any]:
+     """
+     Create a tar.gz from a snapshot directory.
+     """
+     snapshot_dir = Path(snapshot_dir).expanduser().resolve()
+     out_tgz = Path(out_tgz).expanduser().resolve()
+     if not snapshot_dir.exists():
+         raise FileNotFoundError(f"snapshot_dir does not exist: {snapshot_dir}")
+
+     # write tgz
+     out_tgz.parent.mkdir(parents=True, exist_ok=True)
+     with tarfile.open(out_tgz, "w:gz") as tf:
+         tf.add(snapshot_dir, arcname=snapshot_dir.name)
+
+     sha = _sha256_file(out_tgz)
+     _atomic_write_text(out_tgz.with_suffix(out_tgz.suffix + ".sha256"), sha + "\n")
+     return {
+         "ok": True,
+         "schema_version": int(SCHEMA_VERSION),
+         "tokmor_version": str(_TOKMOR_VERSION),
+         "bundle": str(out_tgz),
+         "sha256": sha,
+     }
+
+
+ def verify_bundle(*, bundle_tgz: Path, expected_sha256: Optional[str] = None) -> Dict[str, Any]:
+     """
+     Verify tar.gz bundle integrity:
+     - optional sha256 match
+     - contains manifest and SHA256SUMS
+     - internal files match SHA256SUMS
+     """
+     bundle_tgz = Path(bundle_tgz).expanduser().resolve()
+     if not bundle_tgz.exists():
+         raise FileNotFoundError(f"bundle not found: {bundle_tgz}")
+
+     actual = _sha256_file(bundle_tgz)
+     if expected_sha256 and (expected_sha256.strip().lower() != actual.lower()):
+         return {"ok": False, "error": "sha256_mismatch", "expected": expected_sha256, "actual": actual}
+
+     # Extract to temp dir (disk-safe; deletes on exit)
+     with tempfile.TemporaryDirectory(prefix="tokmor_bundle_verify_") as td:
+         td_path = Path(td)
+         with tarfile.open(bundle_tgz, "r:gz") as tf:
+             tf.extractall(td_path)
+
+         # assume single root directory
+         roots = [p for p in td_path.iterdir() if p.is_dir()]
+         if not roots:
+             return {"ok": False, "error": "no_root_dir"}
+         root = roots[0]
+
+         man = root / MANIFEST_NAME
+         sums = root / SHA256SUMS_NAME
+         if not man.exists():
+             return {"ok": False, "error": "missing_manifest"}
+         if not sums.exists():
+             return {"ok": False, "error": "missing_sha256sums"}
+
+         # parse SHA256SUMS
+         expected: Dict[str, str] = {}
+         for line in sums.read_text(encoding="utf-8", errors="ignore").splitlines():
+             line = line.strip()
+             if not line:
+                 continue
+             # "sha path"
+             parts = line.split()
+             if len(parts) < 2:
+                 continue
+             sha = parts[0].strip()
+             path = parts[-1].strip()
+             expected[path] = sha
+
+         # verify
+         bad: List[Dict[str, str]] = []
+         for path, sha in expected.items():
+             p = root / path
+             if not p.exists() or not p.is_file():
+                 bad.append({"path": path, "error": "missing"})
+                 continue
+             got = _sha256_file(p)
+             if got.lower() != sha.lower():
+                 bad.append({"path": path, "error": "sha256_mismatch", "expected": sha, "actual": got})
+
+         return {
+             "ok": len(bad) == 0,
+             "schema_version": int(SCHEMA_VERSION),
+             "tokmor_version": str(_TOKMOR_VERSION),
+             "bundle": str(bundle_tgz),
+             "bundle_sha256": actual,
+             "errors": bad,
+         }
+
+
+
+
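
To close out the assets module, a sketch of the offline snapshot/bundle round trip using only the public helpers defined above; the paths are placeholders and the `langs` filter is optional per `SnapshotSpec`:

    from pathlib import Path
    from tokmor.assets import SnapshotSpec, build_bundle, build_snapshot, pack_status, verify_bundle

    # What lemma_dict/ and seg_lexicon/ assets are currently discoverable.
    print(pack_status())

    # Snapshot -> tar.gz bundle -> integrity check, all offline.
    snap = build_snapshot(SnapshotSpec(out_dir=Path("/tmp/tokmor_snapshot"), langs=["ko", "en"]))
    bundle = build_bundle(snapshot_dir=Path(snap["out_dir"]), out_tgz=Path("/tmp/tokmor_core.tgz"))
    report = verify_bundle(bundle_tgz=Path("/tmp/tokmor_core.tgz"), expected_sha256=bundle["sha256"])
    print(report["ok"], report["errors"])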