tokmor-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. tokmor/__init__.py +77 -0
  2. tokmor/api.py +194 -0
  3. tokmor/assets.py +365 -0
  4. tokmor/base.py +238 -0
  5. tokmor/brahmic.py +516 -0
  6. tokmor/cjk.py +497 -0
  7. tokmor/domain/__init__.py +11 -0
  8. tokmor/domain/sentiment.py +198 -0
  9. tokmor/factory.py +394 -0
  10. tokmor/indic.py +289 -0
  11. tokmor/inventory.py +51 -0
  12. tokmor/legacy_api.py +143 -0
  13. tokmor/lemma_store.py +102 -0
  14. tokmor/lookup_keys.py +145 -0
  15. tokmor/models/domain/sentiment/en.json +54 -0
  16. tokmor/models/domain/sentiment/ko.json +52 -0
  17. tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
  18. tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
  19. tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
  20. tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
  21. tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
  22. tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
  23. tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
  24. tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
  25. tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
  26. tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
  27. tokmor/morphology/__init__.py +395 -0
  28. tokmor/morphology/advanced_base.py +472 -0
  29. tokmor/morphology/arabic_advanced.py +247 -0
  30. tokmor/morphology/chinese.py +736 -0
  31. tokmor/morphology/chinese_advanced.py +425 -0
  32. tokmor/morphology/english.py +315 -0
  33. tokmor/morphology/english_advanced.py +560 -0
  34. tokmor/morphology/french_advanced.py +237 -0
  35. tokmor/morphology/german_advanced.py +343 -0
  36. tokmor/morphology/hindi_advanced.py +258 -0
  37. tokmor/morphology/japanese.py +417 -0
  38. tokmor/morphology/japanese_advanced.py +589 -0
  39. tokmor/morphology/korean.py +534 -0
  40. tokmor/morphology/korean_advanced.py +603 -0
  41. tokmor/morphology/russian_advanced.py +217 -0
  42. tokmor/morphology/spanish_advanced.py +226 -0
  43. tokmor/morphology/templates/__init__.py +32 -0
  44. tokmor/morphology/templates/arabic_script_template.py +162 -0
  45. tokmor/morphology/templates/brahmic_template.py +181 -0
  46. tokmor/morphology/templates/cyrillic_template.py +168 -0
  47. tokmor/morphology/templates/latin_template.py +235 -0
  48. tokmor/morphology/templates/other_scripts_template.py +475 -0
  49. tokmor/morphology/thai_native.py +274 -0
  50. tokmor/morphology/tier2.py +477 -0
  51. tokmor/morphology/tier3.py +449 -0
  52. tokmor/morphology/tier4.py +410 -0
  53. tokmor/morphology/unified.py +855 -0
  54. tokmor/morphology/universal_fallback.py +398 -0
  55. tokmor/ner_prep.py +747 -0
  56. tokmor/offline.py +89 -0
  57. tokmor/preprocess.py +80 -0
  58. tokmor/resources.py +288 -0
  59. tokmor/routing.py +147 -0
  60. tokmor/rtl.py +309 -0
  61. tokmor/schema.py +17 -0
  62. tokmor/sns_tags.py +281 -0
  63. tokmor/space_based.py +272 -0
  64. tokmor/token_quality.py +1185 -0
  65. tokmor/unified_tokens.py +228 -0
  66. tokmor-1.2.9.dist-info/METADATA +103 -0
  67. tokmor-1.2.9.dist-info/RECORD +70 -0
  68. tokmor-1.2.9.dist-info/WHEEL +5 -0
  69. tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
  70. tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/rtl.py ADDED
@@ -0,0 +1,309 @@
+ """
+ RTL Tokenizer
+ =============
+
+ Tokenizer for right-to-left languages:
+ Arabic, Hebrew, Persian, Urdu.
+ """
+
+ import re
+ import unicodedata
+ from typing import List
+ from .base import BaseTokenizer, Token, TokenizerResult, MorphologicalAnalyzer
+
+
+ class RTLTokenizer(BaseTokenizer):
+     """
+     RTL language tokenizer
+
+     Arabic, Hebrew, Persian, Urdu, etc.
+     """
+
+     SUPPORTED_LANGUAGES = {'ar', 'he', 'fa', 'ur', 'yi', 'ps'}
+
+     # Unicode ranges
+     ARABIC = '\u0600-\u06ff'
+     ARABIC_SUPPLEMENT = '\u0750-\u077f'
+     ARABIC_EXT_A = '\u08a0-\u08ff'
+     ARABIC_PRESENTATION_A = '\ufb50-\ufdff'
+     ARABIC_PRESENTATION_B = '\ufe70-\ufeff'
+     HEBREW = '\u0590-\u05ff'
+
+     def __init__(self, lang: str, use_morphology: bool = False):
+         super().__init__(lang, use_morphology)
+         self._setup_patterns()
+
+     def _setup_patterns(self):
+         """Set up language-specific patterns."""
+         if self.lang in ('he', 'yi'):  # Hebrew and Yiddish use Hebrew script
+             self._script_pattern = re.compile(f'[{self.HEBREW}]+')
+         else:  # ar, fa, ur, ps
+             self._script_pattern = re.compile(
+                 f'[{self.ARABIC}{self.ARABIC_SUPPLEMENT}'
+                 f'{self.ARABIC_EXT_A}{self.ARABIC_PRESENTATION_A}'
+                 f'{self.ARABIC_PRESENTATION_B}]+'
+             )
+
+         self._latin_pattern = re.compile(r'[a-zA-Z0-9]+')
+
+     def _init_morphology(self):
+         """Initialize the morphological analyzer."""
+         if self.lang == 'ar':
+             self._morphology_analyzer = ArabicMorphologyAnalyzer()
+         elif self.lang == 'he':
+             self._morphology_analyzer = HebrewMorphologyAnalyzer()
+
+     def tokenize(self, text: str) -> TokenizerResult:
+         """RTL tokenization."""
+         text = self.clean_text(text)
+         if not text:
+             return TokenizerResult(tokens=[], text=text, lang=self.lang)
+
+         # When morphological analysis is enabled
+         if self.use_morphology and self._morphology_analyzer:
+             if self._morphology_analyzer.is_available():
+                 tokens = self._morphology_analyzer.analyze(text)
+                 return TokenizerResult(
+                     tokens=tokens,
+                     text=text,
+                     lang=self.lang,
+                     morphology_used=True
+                 )
+
+         tokens: List[Token] = []
+
+         # RTL-script tokens
+         for match in self._script_pattern.finditer(text):
+             tokens.append(Token(
+                 text=match.group(),
+                 start=match.start(),
+                 end=match.end(),
+             ))
+
+         # Latin letters / digits
+         for match in self._latin_pattern.finditer(text):
+             overlaps = any(
+                 t.start <= match.start() < t.end or t.start < match.end() <= t.end
+                 for t in tokens
+             )
+             if not overlaps:
+                 tokens.append(Token(
+                     text=match.group(),
+                     start=match.start(),
+                     end=match.end(),
+                 ))
+
+         tokens.sort(key=lambda t: t.start)
+
+         # Preserve any remaining non-space spans (emoji, punctuation, symbols, etc.).
+         # Previously, RTL tokenization dropped these completely, which is harmful for SNS and general preprocessing.
+         if tokens:
+             out2: List[Token] = []
+             i = 0
+             j = 0
+             n = len(text)
+             toks = tokens
+             while i < n:
+                 ch = text[i]
+                 if ch.isspace():
+                     i += 1
+                     continue
+                 # Advance token pointer to the first token that could overlap/appear after i
+                 while j < len(toks) and toks[j].end <= i:
+                     j += 1
+                 if j < len(toks) and toks[j].start <= i < toks[j].end:
+                     # Inside an existing token
+                     out2.append(toks[j])
+                     i = toks[j].end
+                     continue
+                 # Uncovered non-space segment until whitespace or next token start
+                 next_start = toks[j].start if j < len(toks) else n
+                 k = i
+                 while k < n and (not text[k].isspace()) and k < next_start:
+                     k += 1
+                 if k > i:
+                     out2.append(Token(text=text[i:k], start=i, end=k))
+                 i = k
+             # De-duplicate by (start, end, text) and sort
+             seen = set()
+             dedup: List[Token] = []
+             for t in out2:
+                 key = (t.start, t.end, t.text)
+                 if key in seen:
+                     continue
+                 seen.add(key)
+                 dedup.append(t)
+             tokens = sorted(dedup, key=lambda t: t.start)
+
+         # Drop standalone combining marks (diacritics) that appear as separate tokens in noisy corpora.
+         # Example: "سويا ً" where "ً" is a combining mark separated by whitespace.
+         def _is_mark_only(s: str) -> bool:
+             return bool(s) and all(unicodedata.category(ch) in {"Mn", "Mc", "Me"} for ch in s)
+
+         # Also strip *leading* combining marks that sometimes appear due to corpus spacing noise:
+         # e.g., "دون َوقوع" -> token "َوقوع" should become "وقوع".
+         cleaned: List[Token] = []
+         for t in tokens:
+             s = t.text
+             i = 0
+             while i < len(s) and unicodedata.category(s[i]) in {"Mn", "Mc", "Me"}:
+                 i += 1
+             if i > 0:
+                 s2 = s[i:]
+                 if not s2:
+                     continue
+                 cleaned.append(Token(text=s2, start=t.start + i, end=t.end, lemma=t.lemma, pos=t.pos, features=t.features))
+             else:
+                 cleaned.append(t)
+
+         tokens = [t for t in cleaned if not _is_mark_only(t.text)]
+
+         # Safety: never return empty tokens for non-empty input
+         if not tokens:
+             for m in re.finditer(r"\S+", text):
+                 tokens.append(Token(text=m.group(), start=m.start(), end=m.end()))
+
+         return TokenizerResult(
+             tokens=tokens,
+             text=text,
+             lang=self.lang,
+             morphology_used=False
+         )
+
+
+ class ArabicMorphologyAnalyzer(MorphologicalAnalyzer):
+     """
+     Arabic morphological analyzer
+
+     Backends:
+     - camel_tools
+     - pyarabic
+     """
+
+     def __init__(self, backend: str = 'auto'):
+         self.backend = backend
+         self._analyzer = None
+         self._backend_name = None
+         self._init_analyzer()
+
+     def _init_analyzer(self):
+         """Initialize the analyzer."""
+         # Try camel_tools
+         if self.backend in ('auto', 'camel'):
+             try:
+                 from camel_tools.morphology.analyzer import Analyzer
+                 from camel_tools.morphology.database import MorphologyDB
+                 db = MorphologyDB.builtin_db()
+                 self._analyzer = Analyzer(db)
+                 self._backend_name = 'camel'
+                 return
+             except Exception:  # ImportError or missing/broken morphology database
+                 pass
+
+         # Try pyarabic
+         if self.backend in ('auto', 'pyarabic'):
+             try:
+                 import pyarabic.araby as araby
+                 self._analyzer = araby
+                 self._backend_name = 'pyarabic'
+                 return
+             except ImportError:
+                 pass
+
+     def is_available(self) -> bool:
+         return self._analyzer is not None
+
+     def analyze(self, text: str) -> List[Token]:
+         """Morphological analysis."""
+         if not self._analyzer:
+             return []
+
+         tokens = []
+
+         if self._backend_name == 'pyarabic':
+             # pyarabic only supports word segmentation
+             words = self._analyzer.tokenize(text)
+             offset = 0
+             for word in words:
+                 idx = text.find(word, offset)
+                 if idx >= 0:
+                     tokens.append(Token(
+                         text=word,
+                         start=idx,
+                         end=idx + len(word),
+                         lemma=word,
+                     ))
+                     offset = idx + len(word)
+
+         return tokens
+
+
+ class HebrewMorphologyAnalyzer(MorphologicalAnalyzer):
+     """
+     Hebrew morphological analyzer
+
+     Backends:
+     - hebrew_tokenizer
+     """
+
+     def __init__(self, backend: str = 'auto'):
+         self.backend = backend
+         self._analyzer = None
+         self._backend_name = None
+         self._init_analyzer()
+
+     def _init_analyzer(self):
+         """Initialize the analyzer."""
+         try:
+             from hebrew_tokenizer import tokenize as heb_tokenize
+             self._analyzer = heb_tokenize
+             self._backend_name = 'hebrew_tokenizer'
+         except ImportError:
+             pass
+
+     def is_available(self) -> bool:
+         return self._analyzer is not None
+
+     def analyze(self, text: str) -> List[Token]:
+         """Morphological analysis."""
+         if not self._analyzer:
+             return []
+
+         tokens = []
+         try:
+             for token_type, token_text, _, start, end in self._analyzer(text):
+                 tokens.append(Token(
+                     text=token_text,
+                     start=start,
+                     end=end,
+                     pos=token_type,
+                 ))
+         except Exception:
+             pass
+
+         return tokens
+
+
+ # Language-specific subclasses
+ class ArabicTokenizer(RTLTokenizer):
+     """Arabic-specific tokenizer"""
+     SUPPORTED_LANGUAGES = {'ar'}
+
+     def __init__(self, use_morphology: bool = False):
+         super().__init__('ar', use_morphology)
+
+
+ class HebrewTokenizer(RTLTokenizer):
+     """Hebrew-specific tokenizer"""
+     SUPPORTED_LANGUAGES = {'he'}
+
+     def __init__(self, use_morphology: bool = False):
+         super().__init__('he', use_morphology)
+
+
+ class PersianTokenizer(RTLTokenizer):
+     """Persian-specific tokenizer"""
+     SUPPORTED_LANGUAGES = {'fa'}
+
+     def __init__(self, use_morphology: bool = False):
+         super().__init__('fa', use_morphology)
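
For orientation, a minimal usage sketch of the tokenizer above (not part of the package diff; the tokens/text/start/end attribute names follow the Token and TokenizerResult classes referenced from tokmor.base):

    from tokmor.rtl import ArabicTokenizer

    tok = ArabicTokenizer()                        # pure-regex path; optional morphology backends not required
    result = tok.tokenize("مرحبا World 123 😂!!")
    for t in result.tokens:
        print(t.text, t.start, t.end)              # Arabic spans, Latin/digit runs, and leftover spans such as "😂!!"
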
tokmor/schema.py ADDED
@@ -0,0 +1,17 @@
+ """
+ TokMor output schema versioning
+ ===============================
+
+ We keep a small, explicit schema_version to avoid breaking downstream clients
+ when adding/changing fields in JSON outputs.
+ """
+
+ from __future__ import annotations
+
+ # Increment only on breaking output changes (field removal/rename/type change).
+ SCHEMA_VERSION: int = 1
+
+
+
+
+
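
A hedged sketch of how a downstream client might use SCHEMA_VERSION to guard against breaking output changes (hypothetical client code; the payload shape and the "schema_version" field name are taken from the docstring above, not a documented API):

    from tokmor.schema import SCHEMA_VERSION

    def accept_payload(payload: dict) -> bool:
        # Reject JSON outputs produced under a newer (possibly breaking) schema
        # than the one this client was written against.
        return payload.get("schema_version", 1) <= SCHEMA_VERSION
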
tokmor/sns_tags.py ADDED
@@ -0,0 +1,281 @@
+ from __future__ import annotations
+
+ import re
+ from typing import Any, Dict, Optional
+
+
+ _RX_LETTERS: Dict[str, re.Pattern] = {
+     "latin": re.compile(r"[A-Za-z]"),
+     "cyrillic": re.compile(r"[\u0400-\u04FF]"),
+     "arabic": re.compile(r"[\u0600-\u06FF]"),
+     "hebrew": re.compile(r"[\u0590-\u05FF]"),
+     "devanagari": re.compile(r"[\u0900-\u097F]"),
+     "thai": re.compile(r"[\u0E00-\u0E7F]"),
+ }
+
+ _VOWELS: Dict[str, set[str]] = {
+     # Keep minimal & conservative; this is *not* linguistic correctness, just a keysmash heuristic.
+     "latin": set("aeiouy"),
+     "cyrillic": set("аеёиоуыэюя"),
+     # Arabic/Hebrew: abjads; skip vowel heuristics (too many false positives).
+     "devanagari": set("अआइईउऊएऐओऔऋॠ"),
+     "thai": set("ะาิีึืุูเแโใไำ"),
+ }
+
+
+ def _script_of_token(t: str) -> Optional[str]:
+     if not t:
+         return None
+     best = None
+     best_cnt = 0
+     for name, rx in _RX_LETTERS.items():
+         cnt = len(rx.findall(t))
+         if cnt > best_cnt:
+             best = name
+             best_cnt = cnt
+     if not best or best_cnt <= 0:
+         return None
+     # Require majority; otherwise treat as mixed and don't attempt keysmash detection.
+     if best_cnt / max(1, len(t)) < 0.7:
+         return None
+     return best
+
+
+ def _looks_like_keysmash_generic(token: str) -> bool:
+     """
+     Conservative heuristic for "keyboard smash / garble" tokens across major scripts.
+
+     Goal: produce a neutral DISCOURSE_MARKER/OTHER hint, NOT language understanding.
+     """
+     t = token or ""
+     if len(t) < 8 or len(t) > 40:
+         return False
+     if not t.isalpha():
+         return False
+
+     script = _script_of_token(t)
+     if not script:
+         return False
+
+     # Random-looking: high unique-char ratio (e.g., asdfghjkl, фывапролдж).
+     uniq = len(set(t.lower()))
+     uniq_ratio = uniq / len(t)
+     if uniq_ratio < 0.6:
+         return False
+
+     # Avoid tagging real-ish words: keep script-specific conservatism.
+     vowels = _VOWELS.get(script)
+     # Cyrillic keysmash commonly contains vowels; don't use vowel ratio there.
+     if script == "cyrillic":
+         vowels = None
+     if script == "latin":
+         # Latin false positives are costly (real words). Be *very* strict:
+         # Only treat as keysmash when vowel count is essentially zero.
+         v = sum(1 for ch in t.lower() if ch in _VOWELS["latin"])
+         if v != 0:
+             return False
+         if len(t) < 8 or len(t) > 20:
+             return False
+     elif vowels:
+         v = sum(1 for ch in t.lower() if ch in vowels)
+         # Devanagari/Thai: keysmash tends to be vowel-poor.
+         if v / len(t) > 0.28:
+             return False
+     else:
+         # No safe vowel heuristic here (abjads; Cyrillic is tightened again below).
+         # Allow slightly shorter tokens, but require higher randomness.
+         if len(t) < 9:
+             return False
+         if uniq_ratio < 0.75:
+             return False
+
+     # Low bigram repetition -> more "random typing" than a real word.
+     bigrams = [t[i : i + 2].lower() for i in range(len(t) - 1)]
+     bg_ratio = len(set(bigrams)) / max(1, len(bigrams))
+     if bg_ratio < 0.78:
+         return False
+
+     # Extra safety tightening for scripts where false positives are riskier.
+     if script in {"cyrillic"}:
+         # Cyrillic "keysmash" often includes vowels; rely more on randomness + length.
+         if len(t) < 9 or uniq_ratio < 0.7 or bg_ratio < 0.82:
+             return False
+     if script in {"arabic", "hebrew"}:
+         # Abjads: allow slightly shorter, but only when it's extremely "random looking".
+         if len(t) < 8:
+             return False
+         if uniq_ratio < 0.85 or bg_ratio < 0.9:
+             return False
+
+     return True
+
+
+ def classify_sns_token(token: str, *, lang: str) -> Optional[Dict[str, Any]]:
+     """
+     Classify SNS discourse markers (NOT POS tagging).
+
+     Output is a small, deterministic hint object:
+         {"class": "DISCOURSE_MARKER", "subtype": "...", "intensity": int}
+
+     This is intentionally minimal and language-agnostic where possible.
+     """
+     if not token:
+         return None
+
+     t = token
+     ll = (lang or "").lower().replace("_", "-")
+     tl = t.lower()
+
+     # Punctuation/ellipsis-only runs (SNS intensity / stance markers)
+     # Keep conservative: require 2+ chars and only specific punctuation.
+     if len(t) >= 2:
+         if re.fullmatch(r"[!!]{2,}", t):
+             return {"class": "DISCOURSE_MARKER", "subtype": "EMPHASIS", "intensity": len(t)}
+         if re.fullmatch(r"[??]{2,}", t):
+             return {"class": "DISCOURSE_MARKER", "subtype": "SURPRISE", "intensity": len(t)}
+         if re.fullmatch(r"[!?!?]{2,}", t) and ("!" in t or "!" in t) and ("?" in t or "?" in t):
+             return {"class": "DISCOURSE_MARKER", "subtype": "SURPRISE", "intensity": len(t)}
+         if re.fullmatch(r"(?:\.{3,}|…{2,})", t):
+             return {"class": "DISCOURSE_MARKER", "subtype": "HESITATION", "intensity": len(t)}
+         if re.fullmatch(r"[~~]{2,}", t):
+             return {"class": "DISCOURSE_MARKER", "subtype": "SOFTENING", "intensity": len(t)}
+
+     # Emoji laughter / sadness / anger / affection (very common)
+     if any(ch in t for ch in ("😂", "🤣")):
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}
+     if any(ch in t for ch in ("😭", "😢", "🥲")):
+         return {"class": "DISCOURSE_MARKER", "subtype": "SADNESS", "intensity": 1}
+     if any(ch in t for ch in ("😡", "🤬")):
+         return {"class": "DISCOURSE_MARKER", "subtype": "ANGER", "intensity": 1}
+     if any(ch in t for ch in ("❤️", "❤", "💕", "💖", "😍")):
+         return {"class": "DISCOURSE_MARKER", "subtype": "AFFECTION", "intensity": 1}
+
+     # Global keysmash / garble (conservative). Must come early so downstream can ignore it.
+     if _looks_like_keysmash_generic(t):
+         return {"class": "DISCOURSE_MARKER", "subtype": "OTHER", "intensity": len(t)}
+
+     # Hangul/Jamo-based markers (often appear in mixed-language SNS too, so allow globally)
+     if re.fullmatch(r"[ㅋㅎ]{2,}", t):
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
+     # Korean laughter syllable repetition (conservative)
+     if re.fullmatch(r"(?:하){2,}", t) or re.fullmatch(r"(?:헤){2,}", t):
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
+     if re.fullmatch(r"[ㅠㅜ]{2,}", t):
+         return {"class": "DISCOURSE_MARKER", "subtype": "SADNESS", "intensity": len(t)}
+     if re.fullmatch(r"ㄷ{2,}", t):
+         return {"class": "DISCOURSE_MARKER", "subtype": "SURPRISE", "intensity": len(t)}
+     if re.fullmatch(r"ㅇ{2,}", t):
+         return {"class": "DISCOURSE_MARKER", "subtype": "AFFIRM", "intensity": len(t)}
+     # Hangul Jamo "keysmash"/garbling (e.g., ㅣ마ㅓㅣ넣ㄹ아이고) – treat as discourse noise.
+     # This is NOT spell correction; it's a neutral hint to help downstream ignore unusable tokens.
+     if len(t) >= 4:
+         jamo = re.findall(r"[\u3131-\u3163]", t)  # ㄱ-ㅎ, ㅏ-ㅣ
+         if len(jamo) >= 3:
+             # Require vowel jamo so pure consonant runs (already handled above) are not re-tagged.
+             has_vowel = any("\u314f" <= ch <= "\u3163" for ch in jamo)  # ㅏ..ㅣ
+             if has_vowel:
+                 ratio = len(jamo) / max(1, len(t))
+                 if ratio >= 0.3:
+                     return {"class": "DISCOURSE_MARKER", "subtype": "OTHER", "intensity": len(t)}
+     if ll == "ko":
+         if tl in {"ㄹㅇ"}:
+             return {"class": "DISCOURSE_MARKER", "subtype": "EMPHASIS", "intensity": 1}
+         if tl in {"ㅇㅋ", "오케이"}:
+             return {"class": "DISCOURSE_MARKER", "subtype": "AFFIRM", "intensity": 1}
+         # common swear abbreviations (keep minimal)
+         if tl in {"ㅅㅂ", "ㅆㅂ", "ㅈㄴ"}:
+             return {"class": "DISCOURSE_MARKER", "subtype": "SWEAR", "intensity": 1}
+
+     # English-ish / global roman markers
+     if tl in {"lol", "lmao", "rofl"}:
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}
+     if tl in {"haha", "hahaha", "hehe", "hehehe"}:
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
+     if tl in {"xd", "x-d", "x_d"}:
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}
+     if tl in {"wtf", "omg"}:
+         return {"class": "DISCOURSE_MARKER", "subtype": "SWEAR" if tl == "wtf" else "SURPRISE", "intensity": 1}
+     if tl in {"ok", "okay", "k", "kk", "yes", "yep", "yeah"}:
+         return {"class": "DISCOURSE_MARKER", "subtype": "AFFIRM", "intensity": 1}
+     if tl in {"nope", "nah"}:
+         return {"class": "DISCOURSE_MARKER", "subtype": "NEGATION", "intensity": 1}
+
+     # Simple ASCII emoticons (conservative)
+     if tl in {":)", ":-)", ":d", ":-d", ";)", ";-)", ":(", ":-(", ":'(", ":'-(", "t_t", ";_;"}:
+         subtype = "LAUGHTER" if "d" in tl else "SADNESS" if "(" in tl or "_" in tl or ";" in tl else "OTHER"
+         if tl in {";)", ";-)"}:
+             subtype = "SOFTENING"
+         return {"class": "DISCOURSE_MARKER", "subtype": subtype, "intensity": 1}
+
+     # Chinese/Japanese common laughter markers
+     if tl in {"www"}:
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
+     if ll.startswith("ja"):
+         # Katakana laughter (ハハハ...)
+         if re.fullmatch(r"ハ{2,}", t):
+             return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
+     if ll.startswith("zh"):
+         # Very conservative: only pure repetition tokens commonly used as laughter in Chinese.
+         # (Avoid semantic words; do not attempt to classify content tokens.)
+         if re.fullmatch(r"[哈呵嘿嘻]{2,}", t):
+             return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
+         if any(ch in t for ch in ("笑",)):
+             # very conservative: only when the token is exactly "笑", "(笑)", or "(笑)"
+             if t in {"笑", "(笑)", "(笑)"}:
+                 return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}
+
+     # Chinese numeric slang (conservative)
+     if tl in {"666", "2333"}:
+         return {"class": "DISCOURSE_MARKER", "subtype": "PRAISE" if tl == "666" else "LAUGHTER", "intensity": len(t)}
+     if tl in {"233", "23333"}:
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
+
+     # Thai: "55555" (ha-ha-ha) is extremely common SNS laughter
+     if ll == "th" and re.fullmatch(r"5{3,}", t):
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
+
+     # Arabic-script laughter: ههههه / هاهاها (conservative)
+     if re.fullmatch(r"[ه]{2,}", t):
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
+     # Arabic/Persian internet laughter: خخخ...
+     if re.fullmatch(r"[خ]{2,}", t):
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
+     if re.fullmatch(r"(?:ها){2,}", t):
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t) // 2}
+     # Arabic-script "lol" transliteration: لول
+     if t == "لول":
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}
+
+     # Cyrillic laughter (conservative): ха+ / ахаха
+     if re.fullmatch(r"[а-яё]+", tl) and (re.fullmatch(r"(?:ха){2,}", tl) or re.fullmatch(r"(?:ах){2,}а?", tl)):
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": max(1, len(tl) // 2)}
+     # Cyrillic "lol" transliteration: лол
+     if tl == "лол":
+         return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}
+
+     # Latin-script regional laughter (conservative)
+     if re.fullmatch(r"[a-z]+", tl):
+         if re.fullmatch(r"(?:ha){2,}h?", tl) or re.fullmatch(r"(?:he){2,}e?", tl):
+             return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": max(1, len(tl) // 2)}
+         if re.fullmatch(r"(?:ja){2,}a?", tl) or re.fullmatch(r"(?:je){2,}e?", tl):
+             return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": max(1, len(tl) // 2)}
+         if re.fullmatch(r"k{4,}", tl):  # pt-BR "kkkkk"
+             return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(tl)}
+         if re.fullmatch(r"(?:wk){2,}", tl) or re.fullmatch(r"(?:wkwk){1,}", tl):  # id "wkwk"
+             return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": max(1, len(tl) // 2)}
+         if tl in {"mdr", "ptdr"}:  # fr
+             return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}
+         if tl in {"rsrs"}:  # pt-BR
+             return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}
+
+     # Elongated Latin words (e.g., "noooo", "soooo") are usually emphasis/stance markers.
+     # Keep conservative: require >=4 chars and a 3+ repetition of the same letter, so common
+     # real words with double letters (e.g., "coffee") are not mis-tagged.
+     # NOTE: must come AFTER laughter patterns (e.g., "kkkkk" is laughter in pt-BR).
+     if len(tl) >= 4 and re.search(r"([a-z])\1{2,}", tl):
+         return {"class": "DISCOURSE_MARKER", "subtype": "EMPHASIS", "intensity": 1}
+
+     return None
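
A short usage sketch for classify_sns_token (not part of the diff; the expected returns are read off the rules above):

    from tokmor.sns_tags import classify_sns_token

    print(classify_sns_token("ㅋㅋㅋㅋ", lang="ko"))   # {'class': 'DISCOURSE_MARKER', 'subtype': 'LAUGHTER', 'intensity': 4}
    print(classify_sns_token("55555", lang="th"))      # LAUGHTER (Thai "hahaha")
    print(classify_sns_token("noooo", lang="en"))      # EMPHASIS (letter elongation)
    print(classify_sns_token("hello", lang="en"))      # None: ordinary tokens are not tagged
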