tokmor 1.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokmor/__init__.py +77 -0
- tokmor/api.py +194 -0
- tokmor/assets.py +365 -0
- tokmor/base.py +238 -0
- tokmor/brahmic.py +516 -0
- tokmor/cjk.py +497 -0
- tokmor/domain/__init__.py +11 -0
- tokmor/domain/sentiment.py +198 -0
- tokmor/factory.py +394 -0
- tokmor/indic.py +289 -0
- tokmor/inventory.py +51 -0
- tokmor/legacy_api.py +143 -0
- tokmor/lemma_store.py +102 -0
- tokmor/lookup_keys.py +145 -0
- tokmor/models/domain/sentiment/en.json +54 -0
- tokmor/models/domain/sentiment/ko.json +52 -0
- tokmor/models/seg_lexicon/km_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/km_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/lo_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/my_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordfreq.pkl +0 -0
- tokmor/models/seg_lexicon/th_wordlist.pkl +0 -0
- tokmor/models/seg_lexicon/zh_extra_dict.json +35 -0
- tokmor/models/seg_lexicon/zh_wordfreq.pkl +0 -0
- tokmor/morphology/__init__.py +395 -0
- tokmor/morphology/advanced_base.py +472 -0
- tokmor/morphology/arabic_advanced.py +247 -0
- tokmor/morphology/chinese.py +736 -0
- tokmor/morphology/chinese_advanced.py +425 -0
- tokmor/morphology/english.py +315 -0
- tokmor/morphology/english_advanced.py +560 -0
- tokmor/morphology/french_advanced.py +237 -0
- tokmor/morphology/german_advanced.py +343 -0
- tokmor/morphology/hindi_advanced.py +258 -0
- tokmor/morphology/japanese.py +417 -0
- tokmor/morphology/japanese_advanced.py +589 -0
- tokmor/morphology/korean.py +534 -0
- tokmor/morphology/korean_advanced.py +603 -0
- tokmor/morphology/russian_advanced.py +217 -0
- tokmor/morphology/spanish_advanced.py +226 -0
- tokmor/morphology/templates/__init__.py +32 -0
- tokmor/morphology/templates/arabic_script_template.py +162 -0
- tokmor/morphology/templates/brahmic_template.py +181 -0
- tokmor/morphology/templates/cyrillic_template.py +168 -0
- tokmor/morphology/templates/latin_template.py +235 -0
- tokmor/morphology/templates/other_scripts_template.py +475 -0
- tokmor/morphology/thai_native.py +274 -0
- tokmor/morphology/tier2.py +477 -0
- tokmor/morphology/tier3.py +449 -0
- tokmor/morphology/tier4.py +410 -0
- tokmor/morphology/unified.py +855 -0
- tokmor/morphology/universal_fallback.py +398 -0
- tokmor/ner_prep.py +747 -0
- tokmor/offline.py +89 -0
- tokmor/preprocess.py +80 -0
- tokmor/resources.py +288 -0
- tokmor/routing.py +147 -0
- tokmor/rtl.py +309 -0
- tokmor/schema.py +17 -0
- tokmor/sns_tags.py +281 -0
- tokmor/space_based.py +272 -0
- tokmor/token_quality.py +1185 -0
- tokmor/unified_tokens.py +228 -0
- tokmor-1.2.9.dist-info/METADATA +103 -0
- tokmor-1.2.9.dist-info/RECORD +70 -0
- tokmor-1.2.9.dist-info/WHEEL +5 -0
- tokmor-1.2.9.dist-info/licenses/LICENSE +22 -0
- tokmor-1.2.9.dist-info/top_level.txt +1 -0
tokmor/rtl.py
ADDED
@@ -0,0 +1,309 @@

"""
RTL Tokenizer
=============

Tokenizer for right-to-left languages:
Arabic, Hebrew, Persian, Urdu.
"""

import re
import unicodedata
from typing import List
from .base import BaseTokenizer, Token, TokenizerResult, MorphologicalAnalyzer


class RTLTokenizer(BaseTokenizer):
    """
    RTL language tokenizer.

    Arabic, Hebrew, Persian, Urdu, and related languages.
    """

    SUPPORTED_LANGUAGES = {'ar', 'he', 'fa', 'ur', 'yi', 'ps'}

    # Unicode ranges
    ARABIC = '\u0600-\u06ff'
    ARABIC_SUPPLEMENT = '\u0750-\u077f'
    ARABIC_EXT_A = '\u08a0-\u08ff'
    ARABIC_PRESENTATION_A = '\ufb50-\ufdff'
    ARABIC_PRESENTATION_B = '\ufe70-\ufeff'
    HEBREW = '\u0590-\u05ff'

    def __init__(self, lang: str, use_morphology: bool = False):
        super().__init__(lang, use_morphology)
        self._setup_patterns()

    def _setup_patterns(self):
        """Set up language-specific script patterns."""
        if self.lang in ('he', 'yi'):  # Hebrew and Yiddish use Hebrew script
            self._script_pattern = re.compile(f'[{self.HEBREW}]+')
        else:  # ar, fa, ur, ps
            self._script_pattern = re.compile(
                f'[{self.ARABIC}{self.ARABIC_SUPPLEMENT}'
                f'{self.ARABIC_EXT_A}{self.ARABIC_PRESENTATION_A}'
                f'{self.ARABIC_PRESENTATION_B}]+'
            )

        self._latin_pattern = re.compile(r'[a-zA-Z0-9]+')

    def _init_morphology(self):
        """Initialize the morphological analyzer."""
        if self.lang == 'ar':
            self._morphology_analyzer = ArabicMorphologyAnalyzer()
        elif self.lang == 'he':
            self._morphology_analyzer = HebrewMorphologyAnalyzer()

    def tokenize(self, text: str) -> TokenizerResult:
        """RTL tokenization."""
        text = self.clean_text(text)
        if not text:
            return TokenizerResult(tokens=[], text=text, lang=self.lang)

        # If morphological analysis is enabled and available, use it.
        if self.use_morphology and self._morphology_analyzer:
            if self._morphology_analyzer.is_available():
                tokens = self._morphology_analyzer.analyze(text)
                return TokenizerResult(
                    tokens=tokens,
                    text=text,
                    lang=self.lang,
                    morphology_used=True
                )

        tokens: List[Token] = []

        # RTL-script tokens
        for match in self._script_pattern.finditer(text):
            tokens.append(Token(
                text=match.group(),
                start=match.start(),
                end=match.end(),
            ))

        # Latin letters / digits
        for match in self._latin_pattern.finditer(text):
            overlaps = any(
                t.start <= match.start() < t.end or t.start < match.end() <= t.end
                for t in tokens
            )
            if not overlaps:
                tokens.append(Token(
                    text=match.group(),
                    start=match.start(),
                    end=match.end(),
                ))

        tokens.sort(key=lambda t: t.start)

        # Preserve any remaining non-space spans (emoji, punctuation, symbols, etc.).
        # Previously, RTL tokenization dropped these completely, which is harmful for SNS and general preprocessing.
        if tokens:
            out2: List[Token] = []
            i = 0
            j = 0
            n = len(text)
            toks = tokens
            while i < n:
                ch = text[i]
                if ch.isspace():
                    i += 1
                    continue
                # Advance token pointer to the first token that could overlap/appear after i
                while j < len(toks) and toks[j].end <= i:
                    j += 1
                if j < len(toks) and toks[j].start <= i < toks[j].end:
                    # Inside an existing token
                    out2.append(toks[j])
                    i = toks[j].end
                    continue
                # Uncovered non-space segment until whitespace or next token start
                next_start = toks[j].start if j < len(toks) else n
                k = i
                while k < n and (not text[k].isspace()) and k < next_start:
                    k += 1
                if k > i:
                    out2.append(Token(text=text[i:k], start=i, end=k))
                i = k
            # De-duplicate by (start, end, text) and sort
            seen = set()
            dedup: List[Token] = []
            for t in out2:
                key = (t.start, t.end, t.text)
                if key in seen:
                    continue
                seen.add(key)
                dedup.append(t)
            tokens = sorted(dedup, key=lambda t: t.start)

        # Drop standalone combining marks (diacritics) that appear as separate tokens in noisy corpora.
        # Example: "سويا ً" where "ً" is a combining mark separated by whitespace.
        def _is_mark_only(s: str) -> bool:
            return bool(s) and all(unicodedata.category(ch) in {"Mn", "Mc", "Me"} for ch in s)

        # Also strip *leading* combining marks that sometimes appear due to corpus spacing noise:
        # e.g., "دون َوقوع" -> token "َوقوع" should become "وقوع".
        cleaned: List[Token] = []
        for t in tokens:
            s = t.text
            i = 0
            while i < len(s) and unicodedata.category(s[i]) in {"Mn", "Mc", "Me"}:
                i += 1
            if i > 0:
                s2 = s[i:]
                if not s2:
                    continue
                cleaned.append(Token(text=s2, start=t.start + i, end=t.end, lemma=t.lemma, pos=t.pos, features=t.features))
            else:
                cleaned.append(t)

        tokens = [t for t in cleaned if not _is_mark_only(t.text)]

        # Safety: never return empty tokens for non-empty input
        if not tokens:
            for m in re.finditer(r"\S+", text):
                tokens.append(Token(text=m.group(), start=m.start(), end=m.end()))

        return TokenizerResult(
            tokens=tokens,
            text=text,
            lang=self.lang,
            morphology_used=False
        )


class ArabicMorphologyAnalyzer(MorphologicalAnalyzer):
    """
    Arabic morphological analyzer.

    Backends:
    - camel_tools
    - pyarabic
    """

    def __init__(self, backend: str = 'auto'):
        self.backend = backend
        self._analyzer = None
        self._backend_name = None
        self._init_analyzer()

    def _init_analyzer(self):
        """Initialize the analyzer."""
        # Try camel_tools
        if self.backend in ('auto', 'camel'):
            try:
                from camel_tools.morphology.analyzer import Analyzer
                from camel_tools.morphology.database import MorphologyDB
                db = MorphologyDB.builtin_db()
                self._analyzer = Analyzer(db)
                self._backend_name = 'camel'
                return
            except Exception:  # ImportError or missing/broken builtin DB
                pass

        # Try pyarabic
        if self.backend in ('auto', 'pyarabic'):
            try:
                import pyarabic.araby as araby
                self._analyzer = araby
                self._backend_name = 'pyarabic'
                return
            except ImportError:
                pass

    def is_available(self) -> bool:
        return self._analyzer is not None

    def analyze(self, text: str) -> List[Token]:
        """Morphological analysis."""
        if not self._analyzer:
            return []

        tokens = []

        if self._backend_name == 'pyarabic':
            # pyarabic only supports word splitting
            words = self._analyzer.tokenize(text)
            offset = 0
            for word in words:
                idx = text.find(word, offset)
                if idx >= 0:
                    tokens.append(Token(
                        text=word,
                        start=idx,
                        end=idx + len(word),
                        lemma=word,
                    ))
                    offset = idx + len(word)

        return tokens


class HebrewMorphologyAnalyzer(MorphologicalAnalyzer):
    """
    Hebrew morphological analyzer.

    Backends:
    - hebrew_tokenizer
    """

    def __init__(self, backend: str = 'auto'):
        self.backend = backend
        self._analyzer = None
        self._backend_name = None
        self._init_analyzer()

    def _init_analyzer(self):
        """Initialize the analyzer."""
        try:
            from hebrew_tokenizer import tokenize as heb_tokenize
            self._analyzer = heb_tokenize
            self._backend_name = 'hebrew_tokenizer'
        except ImportError:
            pass

    def is_available(self) -> bool:
        return self._analyzer is not None

    def analyze(self, text: str) -> List[Token]:
        """Morphological analysis."""
        if not self._analyzer:
            return []

        tokens = []
        try:
            for token_type, token_text, _, start, end in self._analyzer(text):
                tokens.append(Token(
                    text=token_text,
                    start=start,
                    end=end,
                    pos=token_type,
                ))
        except Exception:
            pass

        return tokens


# Language-specific convenience classes
class ArabicTokenizer(RTLTokenizer):
    """Arabic-specific tokenizer."""
    SUPPORTED_LANGUAGES = {'ar'}

    def __init__(self, use_morphology: bool = False):
        super().__init__('ar', use_morphology)


class HebrewTokenizer(RTLTokenizer):
    """Hebrew-specific tokenizer."""
    SUPPORTED_LANGUAGES = {'he'}

    def __init__(self, use_morphology: bool = False):
        super().__init__('he', use_morphology)


class PersianTokenizer(RTLTokenizer):
    """Persian-specific tokenizer."""
    SUPPORTED_LANGUAGES = {'fa'}

    def __init__(self, use_morphology: bool = False):
        super().__init__('fa', use_morphology)
tokmor/schema.py
ADDED
@@ -0,0 +1,17 @@

"""
TokMor output schema versioning
===============================

We keep a small, explicit schema_version to avoid breaking downstream clients
when adding/changing fields in JSON outputs.
"""

from __future__ import annotations

# Increment only on breaking output changes (field removal/rename/type change).
SCHEMA_VERSION: int = 1
tokmor/sns_tags.py
ADDED
@@ -0,0 +1,281 @@

from __future__ import annotations

import re
from typing import Any, Dict, Optional


_RX_LETTERS: Dict[str, re.Pattern] = {
    "latin": re.compile(r"[A-Za-z]"),
    "cyrillic": re.compile(r"[\u0400-\u04FF]"),
    "arabic": re.compile(r"[\u0600-\u06FF]"),
    "hebrew": re.compile(r"[\u0590-\u05FF]"),
    "devanagari": re.compile(r"[\u0900-\u097F]"),
    "thai": re.compile(r"[\u0E00-\u0E7F]"),
}

_VOWELS: Dict[str, set[str]] = {
    # Keep minimal & conservative; this is *not* linguistic correctness, just a keysmash heuristic.
    "latin": set("aeiouy"),
    "cyrillic": set("аеёиоуыэюя"),
    # Arabic/Hebrew: abjads; skip vowel heuristics (too many false positives).
    "devanagari": set("अआइईउऊएऐओऔऋॠ"),
    "thai": set("ะาิีึืุูเแโใไำ"),
}


def _script_of_token(t: str) -> Optional[str]:
    if not t:
        return None
    best = None
    best_cnt = 0
    for name, rx in _RX_LETTERS.items():
        cnt = len(rx.findall(t))
        if cnt > best_cnt:
            best = name
            best_cnt = cnt
    if not best or best_cnt <= 0:
        return None
    # Require majority; otherwise treat as mixed and don't attempt keysmash detection.
    if best_cnt / max(1, len(t)) < 0.7:
        return None
    return best


def _looks_like_keysmash_generic(token: str) -> bool:
    """
    Conservative heuristic for "keyboard smash / garble" tokens across major scripts.

    Goal: produce a neutral DISCOURSE_MARKER/OTHER hint, NOT language understanding.
    """
    t = token or ""
    if len(t) < 8 or len(t) > 40:
        return False
    if not t.isalpha():
        return False

    script = _script_of_token(t)
    if not script:
        return False

    # Random-looking: high unique-char ratio (e.g., asdfghjkl, фывапролдж).
    uniq = len(set(t.lower()))
    uniq_ratio = uniq / len(t)
    if uniq_ratio < 0.6:
        return False

    # Avoid tagging real-ish words: keep script-specific conservatism.
    vowels = _VOWELS.get(script)
    # Cyrillic keysmash commonly contains vowels; don't use vowel ratio there.
    if script == "cyrillic":
        vowels = None
    if script == "latin":
        # Latin false positives are costly (real words). Be *very* strict:
        # Only treat as keysmash when vowel count is essentially zero.
        v = sum(1 for ch in t.lower() if ch in _VOWELS["latin"])
        if v != 0:
            return False
        if len(t) < 8 or len(t) > 20:
            return False
    elif vowels:
        v = sum(1 for ch in t.lower() if ch in vowels)
        # Devanagari/Thai: keysmash tends to be vowel-poor.
        if v / len(t) > 0.28:
            return False
    else:
        # For abjads, we don't have a safe vowel heuristic; keep stricter thresholds.
        # Allow slightly shorter tokens, but require higher randomness.
        if len(t) < 9:
            return False
        if uniq_ratio < 0.75:
            return False

    # Low bigram repetition -> more "random typing" than a real word.
    bigrams = [t[i : i + 2].lower() for i in range(len(t) - 1)]
    bg_ratio = len(set(bigrams)) / max(1, len(bigrams))
    if bg_ratio < 0.78:
        return False

    # Extra safety tightening for scripts where false positives are riskier.
    if script in {"cyrillic"}:
        # Cyrillic "keysmash" often includes vowels; rely more on randomness + length.
        if len(t) < 9 or uniq_ratio < 0.7 or bg_ratio < 0.82:
            return False
    if script in {"arabic", "hebrew"}:
        # Abjads: allow slightly shorter, but only when it's extremely "random looking".
        if len(t) < 8:
            return False
        if uniq_ratio < 0.85 or bg_ratio < 0.9:
            return False

    return True


def classify_sns_token(token: str, *, lang: str) -> Optional[Dict[str, Any]]:
    """
    Classify SNS discourse markers (NOT POS tagging).

    Output is a small, deterministic hint object:
        {"class": "DISCOURSE_MARKER", "subtype": "...", "intensity": int}

    This is intentionally minimal and language-agnostic where possible.
    """
    if not token:
        return None

    t = token
    ll = (lang or "").lower().replace("_", "-")
    tl = t.lower()

    # Punctuation/ellipsis-only runs (SNS intensity / stance markers)
    # Keep conservative: require 2+ chars and only specific punctuation.
    if len(t) >= 2:
        if re.fullmatch(r"[!！]{2,}", t):
            return {"class": "DISCOURSE_MARKER", "subtype": "EMPHASIS", "intensity": len(t)}
        if re.fullmatch(r"[?？]{2,}", t):
            return {"class": "DISCOURSE_MARKER", "subtype": "SURPRISE", "intensity": len(t)}
        if re.fullmatch(r"[!?！？]{2,}", t) and ("!" in t or "！" in t) and ("?" in t or "？" in t):
            return {"class": "DISCOURSE_MARKER", "subtype": "SURPRISE", "intensity": len(t)}
        if re.fullmatch(r"(?:\.{3,}|…{2,})", t):
            return {"class": "DISCOURSE_MARKER", "subtype": "HESITATION", "intensity": len(t)}
        if re.fullmatch(r"[~～]{2,}", t):
            return {"class": "DISCOURSE_MARKER", "subtype": "SOFTENING", "intensity": len(t)}

    # Emoji sadness / laughter (very common)
    if any(ch in t for ch in ("😂", "🤣")):
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}
    if any(ch in t for ch in ("😭", "😢", "🥲")):
        return {"class": "DISCOURSE_MARKER", "subtype": "SADNESS", "intensity": 1}
    if any(ch in t for ch in ("😡", "🤬")):
        return {"class": "DISCOURSE_MARKER", "subtype": "ANGER", "intensity": 1}
    if any(ch in t for ch in ("❤️", "❤", "💕", "💖", "😍")):
        return {"class": "DISCOURSE_MARKER", "subtype": "AFFECTION", "intensity": 1}

    # Global keysmash / garble (conservative). Must come early so downstream can ignore it.
    if _looks_like_keysmash_generic(t):
        return {"class": "DISCOURSE_MARKER", "subtype": "OTHER", "intensity": len(t)}

    # Hangul/Jamo-based markers (often appear in mixed-language SNS too, so allow globally)
    if re.fullmatch(r"[ㅋㅎ]{2,}", t):
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
    # Korean laughter syllable repetition (conservative)
    if re.fullmatch(r"(?:하){2,}", t) or re.fullmatch(r"(?:헤){2,}", t):
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
    if re.fullmatch(r"[ㅠㅜ]{2,}", t):
        return {"class": "DISCOURSE_MARKER", "subtype": "SADNESS", "intensity": len(t)}
    if re.fullmatch(r"ㄷ{2,}", t):
        return {"class": "DISCOURSE_MARKER", "subtype": "SURPRISE", "intensity": len(t)}
    if re.fullmatch(r"ㅇ{2,}", t):
        return {"class": "DISCOURSE_MARKER", "subtype": "AFFIRM", "intensity": len(t)}
    # Hangul Jamo "keysmash"/garbling (e.g., ㅣ마ㅓㅣ넣ㄹ아이고) – treat as discourse noise.
    # This is NOT spell correction; it's a neutral hint to help downstream ignore unusable tokens.
    if len(t) >= 4:
        jamo = re.findall(r"[\u3131-\u3163]", t)  # ㄱ-ㅎ, ㅏ-ㅣ
        if len(jamo) >= 3:
            # require presence of vowel jamo to avoid tagging pure consonant runs already handled above
            has_vowel = any("\u314f" <= ch <= "\u3163" for ch in jamo)  # ㅏ..ㅣ
            if has_vowel:
                ratio = len(jamo) / max(1, len(t))
                if ratio >= 0.3:
                    return {"class": "DISCOURSE_MARKER", "subtype": "OTHER", "intensity": len(t)}
    if ll == "ko":
        if tl in {"ㄹㅇ"}:
            return {"class": "DISCOURSE_MARKER", "subtype": "EMPHASIS", "intensity": 1}
        if tl in {"ㅇㅋ", "오케이"}:
            return {"class": "DISCOURSE_MARKER", "subtype": "AFFIRM", "intensity": 1}
        # common swear abbreviations (keep minimal)
        if tl in {"ㅅㅂ", "ㅆㅂ", "ㅈㄴ"}:
            return {"class": "DISCOURSE_MARKER", "subtype": "SWEAR", "intensity": 1}

    # English-ish / global roman markers
    if tl in {"lol", "lmao", "rofl"}:
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}
    if tl in {"haha", "hahaha", "hehe", "hehehe"}:
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
    if tl in {"xd", "x-d", "x_d"}:
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}
    if tl in {"wtf", "omg"}:
        return {"class": "DISCOURSE_MARKER", "subtype": "SWEAR" if tl == "wtf" else "SURPRISE", "intensity": 1}
    if tl in {"ok", "okay", "k", "kk", "yes", "yep", "yeah"}:
        return {"class": "DISCOURSE_MARKER", "subtype": "AFFIRM", "intensity": 1}
    if tl in {"nope", "nah"}:
        return {"class": "DISCOURSE_MARKER", "subtype": "NEGATION", "intensity": 1}

    # Simple ASCII emoticons (conservative)
    if tl in {":)", ":-)", ":d", ":-d", ";)", ";-)", ":(", ":-(", ":'(", ":'-(", "t_t", ";_;"}:
        subtype = "LAUGHTER" if "d" in tl else "SADNESS" if "(" in tl or "_" in tl or ";" in tl else "OTHER"
        if tl in {";)", ";-)"}:
            subtype = "SOFTENING"
        return {"class": "DISCOURSE_MARKER", "subtype": subtype, "intensity": 1}

    # Chinese/Japanese common laughter markers
    if tl in {"www"}:
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
    if ll.startswith("ja"):
        # Katakana laughter (ハハハ...)
        if re.fullmatch(r"ハ{2,}", t):
            return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
    if ll.startswith("zh"):
        # Very conservative: only pure repetition tokens commonly used as laughter in Chinese.
        # (Avoid semantic words; do not attempt to classify content tokens.)
        if re.fullmatch(r"[哈呵嘿嘻]{2,}", t):
            return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
        if any(ch in t for ch in ("笑",)):
            # very conservative: only when token is exactly "(笑)" or "笑"
            if t in {"笑", "(笑)", "（笑）"}:
                return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}

    # Chinese numeric slang (conservative)
    if tl in {"666", "2333"}:
        return {"class": "DISCOURSE_MARKER", "subtype": "PRAISE" if tl == "666" else "LAUGHTER", "intensity": len(t)}
    if tl in {"233", "23333"}:
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}

    # Thai: "55555" (ha-ha-ha) is extremely common SNS laughter
    if ll == "th" and re.fullmatch(r"5{3,}", t):
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}

    # Arabic-script laughter: ههههه / هاهاها (conservative)
    if re.fullmatch(r"[ه]{2,}", t):
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
    # Arabic/Persian internet laughter: خخخ...
    if re.fullmatch(r"[خ]{2,}", t):
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t)}
    if re.fullmatch(r"(?:ها){2,}", t):
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(t) // 2}
    # Arabic-script "lol" transliteration: لول
    if t == "لول":
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}

    # Cyrillic laughter (conservative): ха+ / ахаха
    tl_cyr = tl
    if re.fullmatch(r"[а-яё]+", tl_cyr) and (re.fullmatch(r"(?:ха){2,}", tl_cyr) or re.fullmatch(r"(?:ах){2,}а?", tl_cyr)):
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": max(1, len(tl_cyr) // 2)}
    # Cyrillic "lol" transliteration: лол
    if tl_cyr == "лол":
        return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}

    # Latin-script regional laughter (conservative)
    if re.fullmatch(r"[a-z]+", tl):
        if re.fullmatch(r"(?:ha){2,}h?", tl) or re.fullmatch(r"(?:he){2,}e?", tl):
            return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": max(1, len(tl) // 2)}
        if re.fullmatch(r"(?:ja){2,}a?", tl) or re.fullmatch(r"(?:je){2,}e?", tl):
            return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": max(1, len(tl) // 2)}
        if re.fullmatch(r"k{4,}", tl):  # pt-BR "kkkkk"
            return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": len(tl)}
        if re.fullmatch(r"(?:wk){2,}", tl) or re.fullmatch(r"(?:wkwk){1,}", tl):  # id "wkwk"
            return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": max(1, len(tl) // 2)}
        if tl in {"mdr", "ptdr"}:  # fr
            return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}
        if tl in {"rsrs"}:  # pt-BR
            return {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 1}

    # Elongated latin words (e.g., "noooo", "soooo") are usually emphasis/stance markers.
    # Keep conservative: require >=4 chars and a 3+ repetition of the same letter.
    # NOTE: must come AFTER laughter patterns (e.g., "kkkkk" is laughter in pt-BR).
    if len(tl) >= 4 and re.search(r"([a-z])\1{2,}", tl):
        # Avoid mis-tagging common real words like "coffee" (double letters only).
        return {"class": "DISCOURSE_MARKER", "subtype": "EMPHASIS", "intensity": 1}

    return None
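Illustrative calls against classify_sns_token, based only on the rules in the listing above (expected outputs are inferred from the code, not from package documentation).

    from tokmor.sns_tags import classify_sns_token

    print(classify_sns_token("ㅋㅋㅋㅋ", lang="ko"))   # {"class": "DISCOURSE_MARKER", "subtype": "LAUGHTER", "intensity": 4}
    print(classify_sns_token("55555", lang="th"))    # LAUGHTER (Thai numeric "hahaha")
    print(classify_sns_token("noooo", lang="en"))    # EMPHASIS (elongated Latin word)
    print(classify_sns_token("hello", lang="en"))    # None: ordinary word, no discourse hint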