tokmor 1.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/inventory.py ADDED
@@ -0,0 +1,51 @@
+ """
+ Language inventory (POS-free view)
+ ==================================
+
+ We want an honest, practical view for end users:
+ - tokenization works broadly (specialized tokenizers vs fallback)
+ - morphology output is always available (at least identity fallback)
+ - optional lemma dictionaries may be provided via external assets (TOKMOR_DATA_DIR)
+ """
+
+ from __future__ import annotations
+
+ from typing import Any, Dict, List
+
+ from .factory import TOKENIZER_MAP, supported_languages
+ from .morphology.unified import unified_supported_languages
+ from .resources import resolve_lemma_dict_path
+
+
+ def build_language_inventory() -> Dict[str, Any]:
+     toks = supported_languages()
+     morph = unified_supported_languages()
+
+     has_lemma: List[str] = []
+     for lang in toks:
+         if resolve_lemma_dict_path(lang) is not None:
+             has_lemma.append(lang)
+
+     specialized = sorted({k.lower() for k in TOKENIZER_MAP.keys()})
+
+     return {
+         "counts": {
+             "tokenize_languages": len(toks),
+             "morph_languages": len(morph),
+             "lemma_dict_languages": len(has_lemma),
+             "specialized_tokenizers": len(specialized),
+         },
+         "capabilities": {
+             # Deterministic SNS discourse-marker hints via segment(..., sns=True, include_sns_tags=True)
+             "sns_discourse_markers": True,
+         },
+         "tokenize_supported": toks,
+         "morph_supported": morph,
+         "lemma_dict_supported": sorted(set(has_lemma)),
+         "specialized_tokenizers": specialized,
+     }
+
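For reference, this is the payload that the legacy languages() wrapper in tokmor/legacy_api.py (below) returns. A minimal usage sketch, assuming the wheel is installed; the printed values are illustrative, not taken from this release:

    from tokmor.inventory import build_language_inventory

    inv = build_language_inventory()
    print(inv["counts"])                  # e.g. {"tokenize_languages": ..., "morph_languages": ..., ...}
    print(inv["capabilities"])            # {"sns_discourse_markers": True}
    print(inv["specialized_tokenizers"])  # lower-cased keys of TOKENIZER_MAP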
tokmor/legacy_api.py ADDED
@@ -0,0 +1,143 @@
+ """
+ Legacy preprocessing API (kept for internal tooling)
+ ====================================================
+
+ This module contains the older POS-free preprocessing functions:
+ - tokenize()
+ - segment()
+ - route()
+
+ New integrations should prefer:
+ - tokmor.api.unified_tokenize()
+ - tokmor.api.ner_preprocess()
+ """
+
+ from __future__ import annotations
+
+ from dataclasses import asdict
+ from typing import Any, Dict, List, Literal, Optional, Union
+
+ from . import __version__ as _TOKMOR_VERSION
+ from .factory import detect_language, get_tokenizer
+ from .inventory import build_language_inventory
+ from .morphology.unified import get_unified_analyzer
+ from .preprocess import normalize_text
+ from .lookup_keys import suffixing_latin_keys
+ from .routing import route as _route
+ from .schema import SCHEMA_VERSION
+
+ OutputFormat = Literal["tokens", "tokens_with_offsets"]
+ SegmentToken = Dict[str, Any]
+
+
+ def languages() -> Dict[str, Any]:
+     return build_language_inventory()
+
+
+ def normalize(text: str) -> str:
+     return normalize_text(text)
+
+
+ def normalize_sns(text: str) -> str:
+     return normalize_text(text, sns=True)
+
+
+ def tokenize(
+     text: str,
+     lang: str = "auto",
+     *,
+     sns: bool = False,
+     morphology: Optional[bool] = None,
+     zh_join_dates: Optional[bool] = None,
+     output: OutputFormat = "tokens",
+ ) -> Union[List[str], List[Dict[str, Any]]]:
+     text_norm = normalize_text(text, sns=bool(sns))
+     if lang == "auto":
+         lang = detect_language(text_norm)
+
+     if morphology is None:
+         if lang in {"zh", "ja"}:
+             morphology = True
+         elif lang == "ko":
+             morphology = True
+
+     tok = get_tokenizer(lang, use_morphology=morphology, zh_join_dates=zh_join_dates if lang.startswith("zh") else None)
+     res = tok.tokenize(text_norm)
+     if output == "tokens":
+         return res.texts()
+     if output == "tokens_with_offsets":
+         return [asdict(t) for t in res.tokens]
+     raise ValueError(f"unknown output={output}")
+
+
+ def segment(
+     text: str,
+     lang: str = "auto",
+     *,
+     sns: bool = False,
+     morphology: Optional[bool] = None,
+     zh_join_dates: Optional[bool] = None,
+     include_morphemes: bool = False,
+     include_keys: bool = False,
+     include_sns_tags: bool = False,
+ ) -> Dict[str, Any]:
+     text_norm = normalize_text(text, sns=bool(sns))
+     if lang == "auto":
+         lang = detect_language(text_norm)
+
+     if morphology is None:
+         if lang in {"zh", "ja"}:
+             morphology = True
+         elif lang == "ko":
+             morphology = True
+
+     tok = get_tokenizer(lang, use_morphology=morphology, zh_join_dates=zh_join_dates if lang.startswith("zh") else None)
+     res = tok.tokenize(text_norm)
+
+     routing = _route(text_norm, lang=lang)
+     stype = str(routing.get("structure", {}).get("type") or "")
+
+     out_tokens: List[SegmentToken] = []
+     for t in res.tokens:
+         d: SegmentToken = {"text": t.text, "start": t.start, "end": t.end}
+         if include_keys:
+             if stype == "suffixing_latin":
+                 d["keys"] = suffixing_latin_keys(t.text, lang=lang)
+             else:
+                 d["keys"] = [t.text]
+         if include_sns_tags:
+             from .sns_tags import classify_sns_token
+
+             d["sns"] = classify_sns_token(t.text, lang=lang)
+         out_tokens.append(d)
+
+     return {
+         "schema_version": int(SCHEMA_VERSION),
+         "tokmor_version": str(_TOKMOR_VERSION),
+         "lang": lang,
+         "morphology_used": bool(getattr(res, "morphology_used", False)),
+         "token_count": len(out_tokens),
+         "tokens": out_tokens,
+         "morphemes": (
+             [
+                 {"form": r.word, "pos": r.pos, "features": r.features}
+                 for r in get_unified_analyzer(lang).analyze(text_norm)
+             ]
+             if include_morphemes
+             else None
+         ),
+     }
+
+
+ def route(text: str, lang: str = "auto") -> Dict[str, Any]:
+     text_norm = normalize_text(text)
+     if lang == "auto":
+         lang = detect_language(text_norm)
+     payload = _route(text_norm, lang=lang)
+     return {
+         "schema_version": int(SCHEMA_VERSION),
+         "tokmor_version": str(_TOKMOR_VERSION),
+         "lang": lang,
+         **payload,
+     }
+
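A minimal sketch of calling the legacy API above, assuming the wheel is installed. The sample sentences and printed fields are illustrative; the "structure" key in the route() result is assumed from the way segment() reads the routing payload:

    from tokmor.legacy_api import tokenize, segment, route

    # Plain token list; lang="tr" skips auto-detection.
    print(tokenize("Ankara'da hava çok güzel.", lang="tr"))

    # If Turkish routes as suffixing_latin, include_keys adds apostrophe/suffix-stripped lookup keys.
    seg = segment("Ankara'da hava çok güzel.", lang="tr",
                  include_keys=True, include_sns_tags=True)
    print(seg["lang"], seg["token_count"])
    print(seg["tokens"][0])   # {"text": ..., "start": ..., "end": ..., "keys": [...], "sns": ...}

    info = route("こんにちは世界", lang="auto")
    print(info["lang"], info.get("structure"))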
tokmor/lemma_store.py ADDED
@@ -0,0 +1,102 @@
+ """
+ Lemma Store (stdlib-only)
+ =========================
+
+ Goal:
+ - very fast lemma lookup for huge lexicons (multi-million+ entries)
+ - no external dependencies
+
+ Supported backends:
+ - Pickle dict: small lexicons (fast but memory heavy)
+ - SQLite (sqlite3): large lexicons (fast, low memory)
+ """
+
+ from __future__ import annotations
+
+ import pickle
+ import sqlite3
+ from dataclasses import dataclass
+ from functools import lru_cache
+ from pathlib import Path
+ from typing import Dict, Optional, Tuple
+
+
+ class BaseLemmaStore:
+     def get(self, key: str) -> Optional[str]:
+         raise NotImplementedError
+
+
+ @dataclass
+ class PickleLemmaStore(BaseLemmaStore):
+     data: Dict[str, str]
+
+     @classmethod
+     def load(cls, path: Path) -> "PickleLemmaStore":
+         with open(path, "rb") as f:
+             d = pickle.load(f)
+         if not isinstance(d, dict):
+             raise ValueError("Pickle lemma store must be a dict[str,str]")
+         return cls(data=d)
+
+     def get(self, key: str) -> Optional[str]:
+         return self.data.get(key)
+
+
+ class SqliteLemmaStore(BaseLemmaStore):
+     """
+     SQLite schema:
+         CREATE TABLE lemma (k TEXT PRIMARY KEY, v TEXT NOT NULL);
+     """
+
+     def __init__(self, path: Path):
+         self.path = Path(path)
+         # check_same_thread=False: allow reuse across threads if needed
+         self._conn = sqlite3.connect(str(self.path), check_same_thread=False)
+         self._conn.row_factory = None
+         # fast read-only settings (safe even for a read-write db)
+         try:
+             self._conn.execute("PRAGMA journal_mode=OFF;")
+             self._conn.execute("PRAGMA synchronous=OFF;")
+             self._conn.execute("PRAGMA temp_store=MEMORY;")
+             self._conn.execute("PRAGMA cache_size=-20000;")  # ~20MB
+         except Exception:
+             pass
+         self._stmt = self._conn.cursor()
+
+     def close(self) -> None:
+         try:
+             self._stmt.close()
+         except Exception:
+             pass
+         try:
+             self._conn.close()
+         except Exception:
+             pass
+
+     @lru_cache(maxsize=200_000)
+     def get(self, key: str) -> Optional[str]:
+         try:
+             self._stmt.execute("SELECT v FROM lemma WHERE k=? LIMIT 1", (key,))
+             row = self._stmt.fetchone()
+             if not row:
+                 return None
+             return row[0]
+         except Exception:
+             return None
+
+
+ def load_lemma_store(path: Path) -> BaseLemmaStore:
+     p = Path(path)
+     suf = p.suffix.lower()
+     if suf in (".sqlite", ".db", ".sqlite3"):
+         return SqliteLemmaStore(p)
+     # default: pickle
+     return PickleLemmaStore.load(p)
+
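The schema quoted in the SqliteLemmaStore docstring is enough to build a compatible store offline. A minimal sketch; the database path and sample rows are hypothetical:

    import sqlite3
    from pathlib import Path

    from tokmor.lemma_store import load_lemma_store

    db = Path("en_lemmas.sqlite")  # hypothetical file
    conn = sqlite3.connect(db)
    conn.execute("CREATE TABLE IF NOT EXISTS lemma (k TEXT PRIMARY KEY, v TEXT NOT NULL)")
    conn.executemany("INSERT OR REPLACE INTO lemma VALUES (?, ?)",
                     [("running", "run"), ("mice", "mouse")])  # sample rows
    conn.commit()
    conn.close()

    store = load_lemma_store(db)  # .sqlite/.db/.sqlite3 suffix selects SqliteLemmaStore
    print(store.get("mice"))      # -> "mouse"
    print(store.get("missing"))   # -> None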
tokmor/lookup_keys.py ADDED
@@ -0,0 +1,145 @@
+ from __future__ import annotations
+
+ from typing import Iterable, List, Set
+
+
+ def _uniq(xs: Iterable[str]) -> List[str]:
+     out: List[str] = []
+     seen: Set[str] = set()
+     for x in xs:
+         if not x:
+             continue
+         if x in seen:
+             continue
+         seen.add(x)
+         out.append(x)
+     return out
+
+
+ def _strip_apostrophe(token: str) -> str:
+     # Turkish (and some other Latin-script languages) use an apostrophe to separate a proper noun from its suffix:
+     # Ankara'da, Türkiye'nin, Ali'ye ...
+     if "'" not in token:
+         return token
+     head = token.split("'", 1)[0]
+     if len(head) >= 2:
+         return head
+     return token
+
+
+ def _strip_suffix_any(token: str, suffixes: List[str]) -> str:
+     for suf in suffixes:
+         if token.endswith(suf) and len(token) - len(suf) >= 3:
+             return token[: -len(suf)]
+     return token
+
+
+ def suffixing_latin_keys(token: str, *, lang: str) -> List[str]:
+     """
+     Conservative lookup keys for suffixing Latin-script languages.
+
+     Goal: help NER/gazetteer/PMI lookups by removing *very common* clitic/possessive/case-marker suffixes.
+     Non-goals:
+     - full morphological analysis
+     - lemma recovery (e.g., Finnish consonant gradation)
+     """
+     t0 = token or ""
+     if not t0:
+         return []
+
+     # Only attempt on word-like tokens (keep digits/hyphenated words as-is).
+     # If it's noisy (contains spaces) or too short, keep identity only.
+     if (" " in t0) or (len(t0) < 3):
+         return [t0]
+
+     lang = (lang or "").lower()
+
+     # Start with the surface form.
+     keys: List[str] = [t0]
+
+     # Apostrophe split helps a lot for Turkish proper nouns.
+     ta = _strip_apostrophe(t0)
+     if ta != t0:
+         keys.append(ta)
+
+     # Language-specific conservative strips.
+     # Ordering matters: clitics/possessives first, then heavier case markers.
+     if lang == "fi":
+         # Finnish clitics (very common in text; safe to strip)
+         clitics = ["kin", "kaan", "kään", "han", "hän", "pa", "pä", "ko", "kö"]
+         # Possessives
+         possess = ["nsä", "nsa", "mme", "nne", "ni", "si"]
+         # Case-ish endings (conservative: prefer longer, avoid 1-letter endings)
+         cases = [
+             "ssa", "ssä", "sta", "stä", "lla", "llä", "lta", "ltä", "lle",
+             "na", "nä", "ksi",
+             "tta", "ttä",
+         ]
+
+         t = ta
+         for _ in range(2):
+             t1 = _strip_suffix_any(t, clitics)
+             t2 = _strip_suffix_any(t1, possess)
+             t3 = _strip_suffix_any(t2, cases)
+             if t3 == t:
+                 break
+             keys.append(t3)
+             t = t3
+
+     elif lang == "tr":
+         # Turkish suffix stacks are complex; keep it conservative:
+         # - if the apostrophe split happened, that is already a big win for proper nouns.
+         # - also strip a few extremely common locative/ablative/plural markers when present without an apostrophe.
+         suffixes = [
+             "daki", "deki",
+             "dan", "den", "tan", "ten",
+             "da", "de", "ta", "te",
+             "lar", "ler",
+         ]
+         t = ta
+         for _ in range(2):
+             t2 = _strip_suffix_any(t, suffixes)
+             if t2 == t:
+                 break
+             keys.append(t2)
+             t = t2
+
+     elif lang == "hu":
+         # Hungarian: very conservative subset (there are many case endings).
+         suffixes = [
+             "ban", "ben",                              # in
+             "ból", "ből", "rol", "ről", "tól", "től",  # from
+             "nak", "nek",                              # dative
+             "val", "vel",                              # with
+         ]
+         t = ta
+         for _ in range(2):
+             t2 = _strip_suffix_any(t, suffixes)
+             if t2 == t:
+                 break
+             keys.append(t2)
+             t = t2
+
+     elif lang == "et":
+         # Estonian: conservative subset. The inessive is a bare -s (often alongside -sse/-st),
+         # but single-letter suffixes are risky, so they are filtered out below.
+         suffixes = [
+             "s",
+             "sse", "st", "lt", "le", "l",
+             "ga",  # comitative
+         ]
+         # Avoid stripping single-letter suffixes (like "s", "l"); keep only len >= 2.
+         suffixes = [s for s in suffixes if len(s) >= 2]
+         t = ta
+         for _ in range(2):
+             t2 = _strip_suffix_any(t, suffixes)
+             if t2 == t:
+                 break
+             keys.append(t2)
+             t = t2
+
+     return _uniq(keys)
+
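The stripping rules above are deterministic, so the keys for a given token follow directly from the suffix lists. A small sketch, with outputs traced from the code above:

    from tokmor.lookup_keys import suffixing_latin_keys

    print(suffixing_latin_keys("Ankara'da", lang="tr"))    # ["Ankara'da", "Ankara"] (apostrophe split)
    print(suffixing_latin_keys("Helsingissä", lang="fi"))  # ["Helsingissä", "Helsingi"] (inessive -ssä stripped)
    print(suffixing_latin_keys("Budapesten", lang="hu"))   # ["Budapesten"] (-en is not in the conservative list)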
tokmor/models/domain/sentiment/en.json ADDED
@@ -0,0 +1,54 @@
+ {
+   "version": 1,
+   "lang": "en",
+   "pos": [
+     "good",
+     "great",
+     "awesome",
+     "amazing",
+     "excellent",
+     "fantastic",
+     "nice",
+     "love",
+     "loved",
+     "lovely",
+     "like",
+     "happy",
+     "happiness",
+     "best"
+   ],
+   "neg": [
+     "bad",
+     "terrible",
+     "awful",
+     "horrible",
+     "worst",
+     "hate",
+     "hated",
+     "sad",
+     "angry",
+     "disgusting",
+     "sucks"
+   ],
+   "negators": [
+     "not",
+     "no",
+     "never",
+     "n't"
+   ],
+   "intensifiers": [
+     "very",
+     "really",
+     "so",
+     "super",
+     "extremely"
+   ],
+   "diminishers": [
+     "slightly",
+     "somewhat",
+     "kinda",
+     "kind of",
+     "sort of"
+   ]
+ }
+
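The sentiment assets are plain JSON lists, so they can be inspected or reused outside tokmor. A toy scoring sketch over the fields above; the scoring logic is illustrative and is not how tokmor/domain/sentiment.py works:

    import json
    from pathlib import Path

    # Hypothetical path, relative to an installed tokmor package.
    lex = json.loads(Path("tokmor/models/domain/sentiment/en.json").read_text(encoding="utf-8"))

    def toy_score(tokens):
        # Illustrative: +1 per "pos" word, -1 per "neg" word, sign flipped after a negator.
        score, flip = 0, False
        for tok in (t.lower() for t in tokens):
            if tok in lex["negators"]:
                flip = True
                continue
            delta = (tok in lex["pos"]) - (tok in lex["neg"])
            score += -delta if flip else delta
            flip = False
        return score

    print(toy_score(["not", "bad"]))       # 1
    print(toy_score(["really", "awful"]))  # -1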
tokmor/models/domain/sentiment/ko.json ADDED
@@ -0,0 +1,52 @@
+ {
+   "version": 1,
+   "lang": "ko",
+   "pos": [
+     "좋다",
+     "좋아",
+     "좋음",
+     "좋아요",
+     "최고",
+     "훌륭",
+     "멋지다",
+     "멋져",
+     "사랑",
+     "행복",
+     "기쁘다",
+     "만족"
+   ],
+   "neg": [
+     "나쁘다",
+     "나빠",
+     "싫다",
+     "싫어",
+     "별로",
+     "최악",
+     "짜증",
+     "화나다",
+     "슬프다",
+     "혐오",
+     "불만"
+   ],
+   "negators": [
+     "안",
+     "못",
+     "없다",
+     "없어",
+     "아니",
+     "아니다"
+   ],
+   "intensifiers": [
+     "너무",
+     "진짜",
+     "완전",
+     "엄청",
+     "개"
+   ],
+   "diminishers": [
+     "좀",
+     "약간",
+     "조금"
+   ]
+ }
+
tokmor/models/seg_lexicon/zh_extra_dict.json ADDED
@@ -0,0 +1,35 @@
+ {
+   "蒙特州": "ns",
+   "七名区": "ns",
+   "曼德省": "ns",
+   "举办国": "ns",
+   "苏格兰银行": "nrt",
+   "刑事法院": "nrt",
+   "平奖委员会": "nrt",
+   "就读大学": "nrt",
+   "建筑公司": "nrt",
+   "汽车协会": "nrt",
+   "上调大学": "nrt",
+   "联邦大学": "nrt",
+   "尔奖委员会": "nrt",
+   "富国银行": "nrt",
+   "儿童协会": "nrt",
+   "北京法院": "nrt",
+   "电脑公司": "nrt",
+   "格兰大学": "nrt",
+   "反革命集团": "nrt",
+   "警长协会": "nrt",
+   "宪法委员会": "nrt",
+   "事达公司": "nrt",
+   "花旗集团": "nrt",
+   "大型银行": "nrt",
+   "西方公司": "nrt",
+   "斯坦福大学": "nrt",
+   "美联银行": "nrt",
+   "英国银行": "nrt",
+   "精英集团": "nrt",
+   "犯罪集团": "nrt",
+   "工业集团": "nrt",
+   "调查公司": "nrt",
+   "记者委员会": "nrt"
+ }
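The keys are supplemental multi-word Chinese entries and the values are short POS-like tags (judging from the entries, "ns" marks place-like names and "nrt" organization-like names); how tokmor merges this file into its segmentation lexicon is internal. A minimal inspection sketch with a hypothetical path:

    import json
    from pathlib import Path

    extra = json.loads(Path("tokmor/models/seg_lexicon/zh_extra_dict.json").read_text(encoding="utf-8"))

    # Group the supplemental entries by tag to see what kinds of words are being added.
    by_tag = {}
    for word, tag in extra.items():
        by_tag.setdefault(tag, []).append(word)

    for tag, words in sorted(by_tag.items()):
        print(tag, len(words), words[:3])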