tokmor-1.2.9-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/__init__.py
ADDED
@@ -0,0 +1,77 @@
"""
TokMor (Core)
=============

Small, deterministic multilingual preprocessing:
- tokenization / segmentation with offsets
- rule-based morphology (best-effort; language-specific analyzers + fallbacks)

This package does not run ML models at inference time.
"""

# Offline-only runtime enforcement (hard block any online/remote opt-in flags)
from .offline import enforce_offline as _enforce_offline
_enforce_offline()

__version__ = "1.2.9"

from .base import BaseTokenizer, TokenizerResult
from .factory import (
    detect_language,
    get_morphological_analyzer,
    get_tokenizer,
    morphology_available,
    morph_supported_languages,
    supported_languages,
    tokenize,
)

# NER-oriented public APIs (recommended)
from .api import (  # noqa: F401
    unified_tokenize,
    ner_preprocess,
    sentiment_hint,
    sentiment_lexicon,
    languages as preprocess_languages,
    normalize as preprocess_normalize,
    normalize_sns as preprocess_normalize_sns,
    function_word_tag,
    is_function_word,
)

# Legacy POS-free preprocessing helpers (compat tooling)
from .legacy_api import (  # noqa: F401
    tokenize as legacy_tokenize,
    segment as legacy_segment,
    route as legacy_route,
)

__all__ = [
    # Core
    'BaseTokenizer',
    'TokenizerResult',
    # Factory functions
    'get_tokenizer',
    'tokenize',
    'detect_language',
    'get_morphological_analyzer',
    'supported_languages',
    'morphology_available',
    'morph_supported_languages',
    # Public NER-oriented API
    'unified_tokenize',
    'ner_preprocess',
    'function_word_tag',
    'is_function_word',
    # Domain hints (optional)
    'sentiment_hint',
    'sentiment_lexicon',
    # Preprocess helpers
    'preprocess_languages',
    'preprocess_normalize',
    'preprocess_normalize_sns',
    # Legacy POS-free preprocessing (opt-in)
    'legacy_tokenize',
    'legacy_segment',
    'legacy_route',
]
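
For orientation, a minimal usage sketch of the exports re-exported above; only the names and signatures come from this file and tokmor/api.py, and the commented return shapes are assumptions:

# Sketch of the package-level API surface shown above (not part of the wheel).
import tokmor

lang = tokmor.detect_language("The quick brown fox jumps.")       # e.g. "en" (assumed return form)
result = tokmor.unified_tokenize("The quick brown fox jumps.", lang=lang)
# Per tokmor/api.py, each token carries text/start/end, pos, pos_conf, is_particle, pos4.

print(tokmor.function_word_tag("en", "the"))       # 'DET' (doctest example from tokmor/api.py)
print(tokmor.is_function_word("en", "running"))    # False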
tokmor/api.py
ADDED
@@ -0,0 +1,194 @@
"""
TokMor public API (NER-oriented preprocessing)
=============================================

Primary APIs:
- unified_tokenize(): offsets + best-effort POS/particle hints for downstream NER systems
- ner_preprocess(): SNS-aware, function-word hard-block, surface tokens for NER input

Legacy POS-free preprocessing helpers (tokenize/segment/route) were moved to `tokmor.legacy_api`.
"""

from __future__ import annotations

from typing import Any, Dict, List, Literal, Optional, Union

from . import __version__ as _TOKMOR_VERSION
from .factory import detect_language
from .inventory import build_language_inventory
from .preprocess import normalize_text
from .schema import SCHEMA_VERSION


OutputFormat = Literal["tokens", "tokens_with_offsets"]
SegmentToken = Dict[str, Any]

def ner_preprocess(
    text: str,
    lang: str = "auto",
    *,
    sns: bool = True,
    morphology: Optional[bool] = None,
    include_token_hints: bool = False,
    include_function_word_hints: bool = False,
    drop_function_words: bool = True,
    include_pos4_hints: bool = False,
    use_surfaces: bool = True,
) -> Dict[str, Any]:
    """
    Convenience helper for NER pipelines.

    Returns:
    - `ner_tokens`: tokens suitable for NER input (discourse markers/punct removed)
    - `sns_entities`: SNS discourse markers as separate NER-style entities
    - `ner_surfaces`: contiguous merged surfaces (helpful for morpheme-split languages)
    """
    if lang == "auto":
        text_norm = normalize_text(text, sns=bool(sns))
        lang = detect_language(text_norm)
    from .ner_prep import ner_preprocess as _ner_preprocess
    return _ner_preprocess(
        text,
        lang=lang,
        sns=sns,
        morphology=morphology,
        include_token_hints=include_token_hints,
        include_function_word_hints=include_function_word_hints,
        drop_function_words=drop_function_words,
        include_pos4_hints=include_pos4_hints,
        use_surfaces=use_surfaces,
    )


def unified_tokenize(
    text: str,
    lang: str = "auto",
    *,
    sns: bool = True,
    morphology: Optional[bool] = None,
    include_sns_tags: bool = False,
    include_pos4: bool = True,
) -> Dict[str, Any]:
    """
    Tokenize with offsets plus best-effort POS/particle hints (for downstream NER systems).

    Returns tokens with:
    - text/start/end
    - pos (language-specific tag if available)
    - pos_conf (deterministic heuristic, not ML confidence)
    - is_particle (ko/ja/zh best-effort)
    - pos4 (optional): N/V/ADJ/ADV/UNK
    - sns (optional): discourse marker classification
    """
    from .unified_tokens import unified_tokenize as _unified_tokenize

    return _unified_tokenize(
        text,
        lang=lang,
        sns=sns,
        morphology=morphology,
        include_sns_tags=include_sns_tags,
        include_pos4=include_pos4,
    )


def languages() -> Dict[str, Any]:
    """
    Return an inventory of supported languages and capabilities.
    """
    return build_language_inventory()


def normalize(text: str) -> str:
    """
    Normalize raw text (conservative; keeps meaning).
    """
    return normalize_text(text)


def normalize_sns(text: str) -> str:
    """
    Normalize raw text for SNS / user-generated content.
    This is still deterministic and conservative, but includes helpful tweaks
    like fullwidth ASCII normalization for @/#/URLs.
    """
    return normalize_text(text, sns=True)


def function_word_tag(lang: str, word: str) -> Optional[str]:
    """
    Check if a word is a function word and return its POS tag.

    Uses lang_configs (358 languages) with morphology analyzer fallback.

    Args:
        lang: Language code (e.g., 'en', 'ko', 'jv')
        word: Word to check

    Returns:
        POS tag (DET, PRON, AUX, ADP, CCONJ, ADV, PART) or None if not a function word.

    Example:
        >>> function_word_tag('en', 'the')
        'DET'
        >>> function_word_tag('jv', 'iku')
        'DET'
        >>> function_word_tag('en', 'running')
        None
    """
    from .ner_prep import function_word_tag as _function_word_tag
    return _function_word_tag(lang, word)


def is_function_word(lang: str, word: str) -> bool:
    """
    Check if a word is a function word.

    Args:
        lang: Language code (e.g., 'en', 'ko', 'jv')
        word: Word to check

    Returns:
        True if word is a function word, False otherwise.
    """
    return function_word_tag(lang, word) is not None


def sentiment_lexicon(lang: str = "auto") -> Optional[Dict[str, Any]]:
    """
    Load the built-in small sentiment lexicon (currently ko/en seed).

    Returns None if unavailable for the requested language.
    """
    from .domain.sentiment import load_sentiment_lexicon

    if lang == "auto":
        # Best effort: infer from empty text is meaningless; caller should pass lang in that case.
        return None

    lex = load_sentiment_lexicon(lang)
    if lex is None:
        return None
    return {
        "lang": lex.lang,
        "pos": sorted(lex.pos),
        "neg": sorted(lex.neg),
        "negators": sorted(lex.negators),
        "intensifiers": sorted(lex.intensifiers),
        "diminishers": sorted(lex.diminishers),
    }


def sentiment_hint(
    text: str,
    lang: str = "auto",
    *,
    sns: bool = True,
) -> Dict[str, Any]:
    """
    Best-effort sentiment hint for quick experiments (ko/en seed).
    """
    from .domain.sentiment import sentiment_hint as _sentiment_hint
    return _sentiment_hint(text, lang=lang, sns=bool(sns))
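
A short sketch of how these helpers compose in an NER preprocessing step; the key names in the comments are taken from the docstrings above, and the sample sentence is arbitrary:

# Sketch assuming the signatures defined above in tokmor/api.py.
from tokmor.api import ner_preprocess, sentiment_lexicon

out = ner_preprocess("Visiting Seoul tomorrow!! #travel", lang="auto", sns=True)
# Per the docstring, `out` includes "ner_tokens", "sns_entities", and "ner_surfaces".
ner_tokens = out.get("ner_tokens", [])

lex = sentiment_lexicon("en")
if lex is not None:
    # Keys mirror the dict built above: lang, pos, neg, negators, intensifiers, diminishers.
    print(lex["lang"], len(lex["pos"]), len(lex["neg"]))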
tokmor/assets.py
ADDED
@@ -0,0 +1,365 @@
"""
TokMor core asset management (offline-first)
===========================================

This module manages ONLY core assets required for tokenization+morphology quality:
- lemma_dict/  (optional but improves lemmatization)
- seg_lexicon/ (optional, zh/SEA segmentation improvements)

It supports:
- pack_status(): discover what is available via tokmor.resources routing
- build_snapshot(): create an offline data directory (TOKMOR_DATA_DIR layout) with manifest + SHA256SUMS
- build_bundle(): tar.gz a snapshot
- verify_bundle(): verify tar + manifest + SHA256SUMS integrity

No downloads. No network access. Pure filesystem operations.
"""

from __future__ import annotations

import hashlib
import io
import json
import os
import tarfile
import tempfile
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

from . import __version__ as _TOKMOR_VERSION
from .schema import SCHEMA_VERSION
from . import resources


MANIFEST_NAME = "TOKMOR_MANIFEST.json"
SHA256SUMS_NAME = "SHA256SUMS"


def _sha256_file(p: Path) -> str:
    h = hashlib.sha256()
    with p.open("rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            h.update(chunk)
    return h.hexdigest()


def _iter_files(root: Path) -> Iterable[Path]:
    for p in sorted(root.rglob("*")):
        if p.is_file():
            yield p


def _copy_file(src: Path, dst: Path) -> None:
    dst.parent.mkdir(parents=True, exist_ok=True)
    # copy bytes (avoid shutil to keep dependencies minimal and behavior explicit)
    dst.write_bytes(src.read_bytes())


def _atomic_write_text(path: Path, text: str) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with tempfile.NamedTemporaryFile("w", delete=False, dir=str(path.parent), encoding="utf-8") as tf:
        tf.write(text)
        tmp = Path(tf.name)
    tmp.replace(path)


def _atomic_write_bytes(path: Path, data: bytes) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with tempfile.NamedTemporaryFile("wb", delete=False, dir=str(path.parent)) as tf:
        tf.write(data)
        tmp = Path(tf.name)
    tmp.replace(path)


def pack_status() -> Dict[str, Any]:
    """
    Report discoverable core assets under current routing configuration.
    """
    ld = resources.lemma_dict_dir()
    sd = resources.seg_lexicon_dir()
    zh_extra = resources.resolve_extra_dict_path("zh")

    def _count(glob_pat: str, d: Path) -> int:
        try:
            if not d.exists():
                return 0
            return len(list(d.glob(glob_pat)))
        except Exception:
            return 0

    return {
        "schema_version": int(SCHEMA_VERSION),
        "tokmor_version": str(_TOKMOR_VERSION),
        "env": {
            "TOKMOR_DATA_DIR": os.getenv("TOKMOR_DATA_DIR"),
            "TOKMOR_LEMMA_DICT_DIR": os.getenv("TOKMOR_LEMMA_DICT_DIR"),
            "TOKMOR_DISABLE_LEMMA_PACK": os.getenv("TOKMOR_DISABLE_LEMMA_PACK"),
        },
        "paths": {
            "data_dir": str(resources.data_dir()),
            "lemma_dict_dir": str(ld),
            "seg_lexicon_dir": str(sd),
            "zh_extra_dict": str(zh_extra) if zh_extra else None,
        },
        "counts": {
            "lemma_sqlite": _count("*.sqlite", ld),
            "lemma_db": _count("*.db", ld),
            "lemma_pkl": _count("*.pkl", ld),
            "seg_wordfreq_pkl": _count("*_wordfreq.pkl", sd),
            "seg_wordlist_any": _count("*_wordlist.*", sd),
            "extra_dict_json": _count("*_extra_dict.json", sd),
        },
    }


@dataclass
class SnapshotSpec:
    out_dir: Path
    include_lemma: bool = True
    include_seg_lexicon: bool = True
    langs: Optional[List[str]] = None  # if set, copy only matching language files


def _lang_filter_ok(rel_name: str, langs: Optional[List[str]]) -> bool:
    if not langs:
        return True
    # Accept:
    # - "{lang}.sqlite" in lemma_dict/
    # - "{lang}_wordfreq.pkl", "{lang}_wordlist.*", "{lang}_extra_dict.json" in seg_lexicon/
    base = Path(rel_name).name
    for l in langs:
        l2 = (l or "").lower().replace("_", "-")
        if not l2:
            continue
        if base.startswith(l2 + "_") or base.startswith(l2 + "."):
            return True
    return False


def build_snapshot(spec: SnapshotSpec) -> Dict[str, Any]:
    """
    Build an offline snapshot directory with manifest + SHA256SUMS.

    Snapshot layout (subset of TOKMOR_DATA_DIR):
      out_dir/
        lemma_dict/...
        seg_lexicon/...
        TOKMOR_MANIFEST.json
        SHA256SUMS
    """
    out_dir = Path(spec.out_dir).expanduser().resolve()
    tmp_root = out_dir.parent / (out_dir.name + ".tmp")
    if tmp_root.exists():
        # best-effort cleanup
        for p in reversed(list(tmp_root.rglob("*"))):
            try:
                if p.is_file():
                    p.unlink()
                elif p.is_dir():
                    p.rmdir()
            except Exception:
                pass
        try:
            tmp_root.rmdir()
        except Exception:
            pass

    tmp_root.mkdir(parents=True, exist_ok=True)

    copied: List[str] = []
    if spec.include_lemma:
        src = resources.lemma_dict_dir()
        dst = tmp_root / "lemma_dict"
        if src.exists():
            for f in src.iterdir():
                if not f.is_file():
                    continue
                if f.suffix.lower() not in {".sqlite", ".db", ".sqlite3", ".pkl"}:
                    continue
                rel = f"lemma_dict/{f.name}"
                if not _lang_filter_ok(rel, spec.langs):
                    continue
                _copy_file(f, dst / f.name)
                copied.append(rel)

    if spec.include_seg_lexicon:
        src = resources.seg_lexicon_dir()
        dst = tmp_root / "seg_lexicon"
        if src.exists():
            for f in src.iterdir():
                if not f.is_file():
                    continue
                # keep only known safe lexicon assets (no giant corpora)
                name = f.name
                if not (
                    name.endswith("_wordfreq.pkl")
                    or name.endswith("_seg_lexicon.pkl")
                    or name.endswith("_wordlist.pkl")
                    or name.endswith("_wordlist.txt")
                    or name.endswith("_extra_dict.json")
                    or name.endswith("_extra_lexicon.json")
                ):
                    continue
                rel = f"seg_lexicon/{name}"
                if not _lang_filter_ok(rel, spec.langs):
                    continue
                _copy_file(f, dst / name)
                copied.append(rel)

    # manifest + SHA256SUMS
    checksums: Dict[str, str] = {}
    for f in _iter_files(tmp_root):
        rel = str(f.relative_to(tmp_root)).replace("\\", "/")
        checksums[rel] = _sha256_file(f)

    manifest: Dict[str, Any] = {
        "schema_version": int(SCHEMA_VERSION),
        "tokmor_version": str(_TOKMOR_VERSION),
        "type": "tokmor_core_snapshot",
        "assets": {
            "include_lemma": bool(spec.include_lemma),
            "include_seg_lexicon": bool(spec.include_seg_lexicon),
            "langs": spec.langs,
        },
        "files": [
            {"path": rel, "sha256": sha}
            for rel, sha in sorted(checksums.items(), key=lambda x: x[0])
        ],
    }

    _atomic_write_text(tmp_root / MANIFEST_NAME, json.dumps(manifest, ensure_ascii=False, indent=2) + "\n")
    # recompute checksums including manifest
    checksums[str(MANIFEST_NAME)] = _sha256_file(tmp_root / MANIFEST_NAME)

    sha_lines = [f"{checksums[p]} {p}" for p in sorted(checksums.keys())]
    _atomic_write_text(tmp_root / SHA256SUMS_NAME, "\n".join(sha_lines) + "\n")

    # move into place
    if out_dir.exists():
        # replace old snapshot atomically-ish (rename to backup then replace)
        backup = out_dir.parent / (out_dir.name + ".bak")
        if backup.exists():
            # delete previous backup
            for p in reversed(list(backup.rglob("*"))):
                try:
                    if p.is_file():
                        p.unlink()
                    elif p.is_dir():
                        p.rmdir()
                except Exception:
                    pass
            try:
                backup.rmdir()
            except Exception:
                pass
        out_dir.replace(backup)

    tmp_root.replace(out_dir)

    return {
        "ok": True,
        "schema_version": int(SCHEMA_VERSION),
        "tokmor_version": str(_TOKMOR_VERSION),
        "out_dir": str(out_dir),
        "file_count": len(list(_iter_files(out_dir))),
        "copied_count": len(copied),
    }


def build_bundle(*, snapshot_dir: Path, out_tgz: Path) -> Dict[str, Any]:
    """
    Create a tar.gz from a snapshot directory.
    """
    snapshot_dir = Path(snapshot_dir).expanduser().resolve()
    out_tgz = Path(out_tgz).expanduser().resolve()
    if not snapshot_dir.exists():
        raise FileNotFoundError(f"snapshot_dir does not exist: {snapshot_dir}")

    # write tgz
    out_tgz.parent.mkdir(parents=True, exist_ok=True)
    with tarfile.open(out_tgz, "w:gz") as tf:
        tf.add(snapshot_dir, arcname=snapshot_dir.name)

    sha = _sha256_file(out_tgz)
    _atomic_write_text(out_tgz.with_suffix(out_tgz.suffix + ".sha256"), sha + "\n")
    return {
        "ok": True,
        "schema_version": int(SCHEMA_VERSION),
        "tokmor_version": str(_TOKMOR_VERSION),
        "bundle": str(out_tgz),
        "sha256": sha,
    }


def verify_bundle(*, bundle_tgz: Path, expected_sha256: Optional[str] = None) -> Dict[str, Any]:
    """
    Verify tar.gz bundle integrity:
    - optional sha256 match
    - contains manifest and SHA256SUMS
    - internal files match SHA256SUMS
    """
    bundle_tgz = Path(bundle_tgz).expanduser().resolve()
    if not bundle_tgz.exists():
        raise FileNotFoundError(f"bundle not found: {bundle_tgz}")

    actual = _sha256_file(bundle_tgz)
    if expected_sha256 and (expected_sha256.strip().lower() != actual.lower()):
        return {"ok": False, "error": "sha256_mismatch", "expected": expected_sha256, "actual": actual}

    # Extract to temp dir (disk-safe; deletes on exit)
    with tempfile.TemporaryDirectory(prefix="tokmor_bundle_verify_") as td:
        td_path = Path(td)
        with tarfile.open(bundle_tgz, "r:gz") as tf:
            tf.extractall(td_path)

        # assume single root directory
        roots = [p for p in td_path.iterdir() if p.is_dir()]
        if not roots:
            return {"ok": False, "error": "no_root_dir"}
        root = roots[0]

        man = root / MANIFEST_NAME
        sums = root / SHA256SUMS_NAME
        if not man.exists():
            return {"ok": False, "error": "missing_manifest"}
        if not sums.exists():
            return {"ok": False, "error": "missing_sha256sums"}

        # parse SHA256SUMS
        expected: Dict[str, str] = {}
        for line in sums.read_text(encoding="utf-8", errors="ignore").splitlines():
            line = line.strip()
            if not line:
                continue
            # "sha path"
            parts = line.split()
            if len(parts) < 2:
                continue
            sha = parts[0].strip()
            path = parts[-1].strip()
            expected[path] = sha

        # verify
        bad: List[Dict[str, str]] = []
        for path, sha in expected.items():
            p = root / path
            if not p.exists() or not p.is_file():
                bad.append({"path": path, "error": "missing"})
                continue
            got = _sha256_file(p)
            if got.lower() != sha.lower():
                bad.append({"path": path, "error": "sha256_mismatch", "expected": sha, "actual": got})

    return {
        "ok": len(bad) == 0,
        "schema_version": int(SCHEMA_VERSION),
        "tokmor_version": str(_TOKMOR_VERSION),
        "bundle": str(bundle_tgz),
        "bundle_sha256": actual,
        "errors": bad,
    }
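
A snapshot/bundle round trip using the functions above; the paths and language filter are placeholders, and the sequence is a sketch rather than a packaged CLI:

# Sketch of the snapshot -> bundle -> verify flow defined in tokmor/assets.py.
from pathlib import Path
from tokmor.assets import SnapshotSpec, build_snapshot, build_bundle, verify_bundle

spec = SnapshotSpec(out_dir=Path("./tokmor_data"), langs=["ko", "zh"])   # hypothetical paths/langs
snapshot = build_snapshot(spec)            # writes lemma_dict/, seg_lexicon/, manifest, SHA256SUMS
bundle = build_bundle(snapshot_dir=Path("./tokmor_data"), out_tgz=Path("./tokmor_data.tgz"))
report = verify_bundle(bundle_tgz=Path("./tokmor_data.tgz"), expected_sha256=bundle["sha256"])
assert report["ok"], report["errors"]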