polystring 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,104 @@
1
+ from __future__ import annotations
2
+
3
+ from polystring._models import Token
4
+ from polystring._pipeline.stage1_preprocess import RawToken
5
+
6
# Unicode code-point range -> language code inferred from the script alone.
# Ordered by frequency in social-media mixed text (original author's note).
# The ranges do not overlap, so the order influences lookup speed only.
_BLOCK_MAP: list[tuple[range, str]] = [
    # Perso-Arabic
    (range(0x0600, 0x06FF + 1), "ar"),  # will be refined later if needed
    (range(0x0750, 0x077F + 1), "ar"),
    (range(0xFB50, 0xFDFF + 1), "ar"),
    # Arabic Presentation Forms-B. The range deliberately stops at U+FEFC:
    # U+FEFD/U+FEFE are unassigned and U+FEFF is the byte-order mark
    # (zero-width no-break space). Including U+FEFF would classify any
    # Latin token carrying a stray BOM as Arabic.
    (range(0xFE70, 0xFEFC + 1), "ar"),
    # Devanagari
    (range(0x0900, 0x097F + 1), "hi"),
    # Bengali
    (range(0x0980, 0x09FF + 1), "bn"),
    # Gurmukhi (Punjabi)
    (range(0x0A00, 0x0A7F + 1), "pa"),
    # Gujarati
    (range(0x0A80, 0x0AFF + 1), "gu"),
    # Tamil
    (range(0x0B80, 0x0BFF + 1), "ta"),
    # Telugu
    (range(0x0C00, 0x0C7F + 1), "te"),
    # Kannada
    (range(0x0C80, 0x0CFF + 1), "kn"),
    # Malayalam
    (range(0x0D00, 0x0D7F + 1), "ml"),
    # Sinhala
    (range(0x0D80, 0x0DFF + 1), "si"),
    # Thai
    (range(0x0E00, 0x0E7F + 1), "th"),
    # Georgian
    (range(0x10A0, 0x10FF + 1), "ka"),
    # Hangul (Korean)
    (range(0xAC00, 0xD7AF + 1), "ko"),
    (range(0x1100, 0x11FF + 1), "ko"),
    # CJK Unified
    (range(0x4E00, 0x9FFF + 1), "zh"),
    (range(0x3400, 0x4DBF + 1), "zh"),
    (range(0x20000, 0x2A6DF + 1), "zh"),
    # Hiragana / Katakana -> Japanese
    (range(0x3040, 0x309F + 1), "ja"),
    (range(0x30A0, 0x30FF + 1), "ja"),
    # Cyrillic
    (range(0x0400, 0x04FF + 1), "ru"),
    (range(0x0500, 0x052F + 1), "ru"),
    # Greek
    (range(0x0370, 0x03FF + 1), "el"),
    # Hebrew
    (range(0x0590, 0x05FF + 1), "he"),
    # Armenian
    (range(0x0530, 0x058F + 1), "hy"),
    # Ethiopic
    (range(0x1200, 0x137F + 1), "am"),
]
58
+
59
+
60
def _script_of(char: str) -> str | None:
    """Map one character to the language code of its Unicode block.

    The character's code point is looked up in ``_BLOCK_MAP``; the language
    of the first range containing it is returned. Characters outside every
    listed range (Latin letters, digits, punctuation, ...) yield ``None``.
    """
    code_point = ord(char)
    return next(
        (language for span, language in _BLOCK_MAP if code_point in span),
        None,
    )
67
+
68
+
69
def _dominant_script(text: str) -> str | None:
    """Return the dominant non-Latin script language for a token, or None.

    Every character is mapped through ``_script_of`` and the language with
    the most characters wins; ties go to the language seen first. ``None`` is
    returned when no character belongs to a mapped block.
    """
    counts: dict[str, int] = {}
    for ch in text:
        lang = _script_of(ch)
        if lang:
            counts[lang] = counts.get(lang, 0) + 1
    if not counts:
        return None

    # Han characters are shared between Chinese and Japanese, whereas kana is
    # used only by Japanese. A token that mixes both is therefore Japanese
    # even when the kanji outnumber the kana, so fold the Han count into "ja"
    # instead of letting "zh" win the vote.
    if "ja" in counts and "zh" in counts:
        counts["ja"] += counts.pop("zh")

    return max(counts, key=lambda k: counts[k])
79
+
80
+
81
def run(tokens: list[RawToken]) -> tuple[list[Token], list[RawToken]]:
    """Stage 2: split tokens into script-classified and still-undecided ones.

    A token whose text contains characters from a mapped non-Latin block is
    turned into a ``Token`` right away, labelled with the script-inferred
    language and a fixed confidence of 0.99. Every other token is forwarded
    unchanged for later classification.

    Returns:
        ``(classified_tokens, latin_tokens)``, each in input order.
    """
    script_tokens: list[Token] = []
    passthrough: list[RawToken] = []

    for raw in tokens:
        script_lang = _dominant_script(raw.text)
        if script_lang is None:
            # No mapped script character found: leave it for the next stage.
            passthrough.append(raw)
            continue
        script_tokens.append(
            Token(
                text=raw.text,
                language=script_lang,
                token_type="text",
                confidence=0.99,
                start=raw.start,
                end=raw.end,
            )
        )

    return script_tokens, passthrough
@@ -0,0 +1,176 @@
1
+ from __future__ import annotations
2
+
3
+ from polystring._detector import lingua_top2
4
+ from polystring._models import Token
5
+ from polystring._ngram import NGRAM_LANGUAGES
6
+ from polystring._ngram import score as ngram_score
7
+ from polystring._pipeline.stage1_preprocess import RawToken
8
+ from polystring.lexicons import lexicon_lookup
9
+
10
# Unordered language pairs treated as too close to separate when the
# detector's top two candidates are within a small confidence gap. Each
# family is listed as a complete triangle so the result does not depend on
# which two members the detector happens to return.
NEAR_IDENTICAL_PAIRS: frozenset[frozenset[str]] = frozenset({
    # Romance
    frozenset({"es", "pt"}),
    frozenset({"es", "it"}),
    frozenset({"pt", "it"}),
    # Scandinavian
    frozenset({"nb", "da"}),
    frozenset({"nb", "sv"}),
    frozenset({"da", "sv"}),
    # Indonesian / Malay
    frozenset({"id", "ms"}),
    # Bosnian / Croatian / Serbian
    frozenset({"hr", "sr"}),
    frozenset({"bs", "hr"}),
    frozenset({"bs", "sr"}),  # previously missing from the triangle
})
21
+
22
# Lead the best lingua candidate must have over the runner-up before a
# near-identical language pair is considered resolved.
_CONFIDENCE_GAP = 0.15
# Default threshold below which a classification is reported as "und".
_MIN_CONFIDENCE = 0.70
# Context width for lingua; _window_text takes _WINDOW // 2 tokens per side.
_WINDOW = 4

# Languages for which lingua adds noise rather than signal. For these the
# n-gram model is the primary classifier and lingua is skipped entirely.
_LINGUA_SKIP = NGRAM_LANGUAGES  # {"ur-Latn", "tl", "sw"}
29
+
30
+
31
def _window_text(tokens: list[RawToken], idx: int) -> str:
    """Build the context string for the token at position *idx*.

    Takes up to ``_WINDOW // 2`` tokens on each side of *idx* (clipped at the
    list boundaries), plus the token itself, and joins their text with single
    spaces.
    """
    radius = _WINDOW // 2
    first = max(0, idx - radius)
    stop = min(len(tokens), idx + radius + 1)
    pieces = [neighbour.text for neighbour in tokens[first:stop]]
    return " ".join(pieces)
36
+
37
+
38
def _is_near_identical(lang1: str, lang2: str) -> bool:
    """Tell whether the two codes form a pair in ``NEAR_IDENTICAL_PAIRS``.

    The check is order-independent because the pair is compared as a set.
    """
    pair = frozenset((lang1, lang2))
    return pair in NEAR_IDENTICAL_PAIRS
40
+
41
+
42
def run(
    latin_tokens: list[RawToken],
    languages_hint: frozenset[str] | None = None,
    min_confidence: float = _MIN_CONFIDENCE,
) -> list[Token]:
    """Stage 3: classify the tokens that stage 2 could not label by script.

    Each token goes through up to three classifiers, stopping at the first
    one that produces an answer:

    1. ``lexicon_lookup`` - a hit is accepted as-is; the special language
       ``"amb"`` is emitted with confidence 0.0 for later resolution.
    2. ``ngram_score`` - accepted when its confidence reaches
       *min_confidence*.
    3. ``lingua_top2`` on a window of surrounding tokens - a near-identical
       top pair within ``_CONFIDENCE_GAP`` yields ``"und"`` with both
       candidates recorded; a result below *min_confidence* yields ``"und"``.

    Args:
        latin_tokens: Tokens to classify, in text order.
        languages_hint: Optional set of languages the caller expects. It is
            forwarded to lingua and restricts which n-gram languages may be
            returned.
        min_confidence: Threshold for accepting n-gram and lingua results.

    Returns:
        One ``Token`` per input token, in the same order.
    """
    result: list[Token] = []

    def emit(
        rt: RawToken,
        language: str,
        confidence: float,
        candidates: list[str] | None = None,
        mark_ne: bool = False,
    ) -> None:
        # Build a "text" token for *rt* and append it to the result list.
        # ``ambiguous_candidates`` is only passed when given, so the Token
        # default is kept for every other call.
        extra = {} if candidates is None else {"ambiguous_candidates": candidates}
        tok = Token(
            text=rt.text,
            language=language,
            token_type="text",
            confidence=confidence,
            start=rt.start,
            end=rt.end,
            **extra,
        )
        if mark_ne:
            _maybe_mark_ne(tok, rt)
        result.append(tok)

    # Determine which n-gram languages are in scope given the caller's hint.
    # ``None`` passed to the scorer means "no restriction", so it cannot also
    # stand for "the hint excludes every n-gram language". A separate flag
    # covers that case and skips the n-gram step, which previously could
    # return a language the caller had ruled out.
    if languages_hint is None:
        ngram_candidates: frozenset[str] | None = None
        use_ngram = True
    else:
        ngram_candidates = languages_hint & NGRAM_LANGUAGES
        use_ngram = bool(ngram_candidates)

    # Lingua is skipped when the hint is limited to languages it handles
    # poorly. This depends only on the hint, so compute it once.
    skip_lingua = languages_hint is not None and languages_hint.issubset(
        _LINGUA_SKIP
    )

    for idx, rt in enumerate(latin_tokens):
        # --------------------------------------------------------------
        # Step 1: lexicon lookup (fastest path, highest precision)
        # --------------------------------------------------------------
        lex = lexicon_lookup(rt.text)
        if lex is not None:
            lang, conf = lex
            if lang == "amb":
                emit(rt, "amb", 0.0, candidates=[])
            else:
                emit(rt, lang, conf, mark_ne=True)
            continue

        # --------------------------------------------------------------
        # Step 2: n-gram model (covers ur-Latn, tl, sw)
        # --------------------------------------------------------------
        if use_ngram:
            ng = ngram_score(rt.text, ngram_candidates)
            if ng is not None:
                lang, conf = ng
                if conf >= min_confidence:
                    emit(rt, lang, conf, mark_ne=True)
                    continue

        # --------------------------------------------------------------
        # Step 3: lingua (for all other Latin-script languages)
        # --------------------------------------------------------------
        if skip_lingua:
            emit(rt, "und", 0.0)
            continue

        window = _window_text(latin_tokens, idx)
        top2 = lingua_top2(window, languages_hint)
        lang, conf = (top2[0][0], top2[0][1]) if top2 else ("und", 0.0)

        if len(top2) >= 2:
            l1, c1 = top2[0]
            l2, c2 = top2[1]
            if _is_near_identical(l1, l2) and (c1 - c2) < _CONFIDENCE_GAP:
                # Too close to call: record both candidates for stage 4.
                emit(rt, "und", 0.0, candidates=[l1, l2])
                continue

        if conf < min_confidence or lang == "und":
            # Keep the raw confidence so later stages can see how weak it was.
            emit(rt, "und", conf)
        else:
            emit(rt, lang, conf, mark_ne=True)

    return result
172
+
173
+
174
def _maybe_mark_ne(tok: Token, rt: RawToken) -> None:
    """Flag *tok* as a named-entity candidate when its raw token was one.

    The token type becomes ``"ne-candidate"`` only if ``rt.is_ne_candidate``
    is truthy and the token has a real language, i.e. neither ``"und"`` nor
    ``"amb"``. Otherwise *tok* is left untouched.
    """
    if not rt.is_ne_candidate:
        return
    if tok.language in ("und", "amb"):
        return
    tok.token_type = "ne-candidate"
@@ -0,0 +1,108 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import Counter
4
+
5
+ from polystring._models import Token
6
+
7
# Number of tokens inspected on each side when voting on a context language.
_CONTEXT_WINDOW = 3
# Upper bound on the confidence of any label assigned from context.
_UND_MAX_CONFIDENCE = 0.75

# Token types that carry no language and never take part in context votes.
_NON_LINGUISTIC = {
    "emoji",
    "hashtag",
    "mention",
    "num",
    "url",
}
11
+
12
+
13
def _confirmed_lang(tok: Token) -> str | None:
    """Return the token's language when it is a definite linguistic label.

    ``None`` is returned for non-linguistic token types and for the
    placeholder languages ``"und"``, ``"amb"`` and ``"ne"``.
    """
    unusable = tok.token_type in _NON_LINGUISTIC or tok.language in (
        "und",
        "amb",
        "ne",
    )
    return None if unusable else tok.language
20
+
21
+
22
def _context_majority(tokens: list[Token], idx: int) -> str | None:
    """Return the most common confirmed language around position *idx*.

    Looks at up to ``_CONTEXT_WINDOW`` tokens on each side, excluding the
    token at *idx* itself, and counts only languages accepted by
    ``_confirmed_lang``. Returns ``None`` when no neighbour has a confirmed
    language. On a tie the language encountered first wins.
    """
    first = max(0, idx - _CONTEXT_WINDOW)
    stop = min(len(tokens), idx + _CONTEXT_WINDOW + 1)
    neighbours = (tokens[pos] for pos in range(first, stop) if pos != idx)
    votes = Counter(lang for lang in map(_confirmed_lang, neighbours) if lang)
    if not votes:
        return None
    winner, _count = votes.most_common(1)[0]
    return winner
38
+
39
+
40
def run(tokens: list[Token]) -> list[Token]:
    """Stage 4: context-driven correction pass.

    The passes run in a fixed order and each one sees the changes made by the
    previous ones. Tokens are mutated in place and the same list is returned.
    """

    # 4a. Plain "und" tokens (no recorded candidates) take the language of
    #     their confident neighbours, with the confidence capped.
    for pos, token in enumerate(tokens):
        if token.language != "und" or token.ambiguous_candidates:
            continue
        neighbour_lang = _context_majority(tokens, pos)
        if not neighbour_lang:
            continue
        token.language = neighbour_lang
        token.confidence = min(
            _UND_MAX_CONFIDENCE, token.confidence or _UND_MAX_CONFIDENCE
        )

    # 4b. A single token sandwiched between two neighbours that agree on a
    #     different language is absorbed (NE candidates are left for 4e).
    last = len(tokens) - 1
    for pos, token in enumerate(tokens):
        if token.token_type in _NON_LINGUISTIC or token.token_type == "ne-candidate":
            continue
        own = _confirmed_lang(token)
        if own is None:
            continue
        before = _confirmed_lang(tokens[pos - 1]) if pos > 0 else None
        after = _confirmed_lang(tokens[pos + 1]) if pos < last else None
        if before and before == after and before != own:
            token.language = before
            token.confidence = min(_UND_MAX_CONFIDENCE, token.confidence)

    # 4c. Near-identical pairs are resolved with the sentence-level prior:
    #     the most common confirmed language across all tokens.
    from polystring._pipeline.stage3_classify import _is_near_identical
    confirmed = [lang for lang in map(_confirmed_lang, tokens) if lang]
    sentence_prior: str | None = (
        Counter(confirmed).most_common(1)[0][0] if confirmed else None
    )

    for token in tokens:
        if token.language != "und" or len(token.ambiguous_candidates) != 2:
            continue
        first, second = token.ambiguous_candidates
        if _is_near_identical(first, second) and sentence_prior in (first, second):
            token.language = sentence_prior  # type: ignore[assignment]
            token.confidence = _UND_MAX_CONFIDENCE
            token.ambiguous_candidates = []

    # 4d. "amb" conflict words follow their context, or fall back to "und".
    for pos, token in enumerate(tokens):
        if token.language != "amb":
            continue
        neighbour_lang = _context_majority(tokens, pos)
        if neighbour_lang:
            token.language = neighbour_lang
            token.confidence = _UND_MAX_CONFIDENCE
        else:
            token.language = "und"

    # 4e. NE candidates: a label that disagrees with the context is treated
    #     as a proper noun; otherwise the token reverts to ordinary text.
    for pos, token in enumerate(tokens):
        if token.token_type != "ne-candidate":
            continue
        neighbour_lang = _context_majority(tokens, pos)
        if neighbour_lang and neighbour_lang != token.language:
            token.language = "ne"
            token.token_type = "ne"
            token.confidence = 0.0
        else:
            token.token_type = "text"

    # 4f. Remaining "und" kept as-is (honest output)

    return tokens
@@ -0,0 +1,138 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import Counter
4
+ from typing import Literal
5
+
6
+ from polystring._models import PolyStringResult, Span, Token
7
+ from polystring._pipeline.stage1_preprocess import SpecialToken
8
+
9
# Token types that carry no language; they are ignored when computing the
# dominant language and when flagging foreign spans.
_NON_LINGUISTIC = {
    "emoji",
    "hashtag",
    "mention",
    "num",
    "url",
}
10
+
11
+
12
def _tokens_to_spans(tokens: list[Token]) -> list[Span]:
    """Merge runs of adjacent same-language tokens into spans.

    Consecutive tokens are merged when they share language and token type and
    the next token starts at most one character after the current span ends.
    A span's confidence is the mean of its tokens' confidences.

    Args:
        tokens: Tokens in text order.

    Returns:
        The merged spans, in order; an empty list for empty input.
    """
    if not tokens:
        return []

    spans: list[Span] = []

    head = tokens[0]
    text = head.text
    start = head.start
    end = head.end
    conf_sum = head.confidence
    conf_count = 1
    cands = list(head.ambiguous_candidates)

    def close_span() -> None:
        # Emit the span accumulated so far (reads the current loop state).
        spans.append(Span(
            text=text,
            language=head.language,
            token_type=head.token_type,
            confidence=conf_sum / conf_count,
            start=start,
            end=end,
            ambiguous_candidates=cands,
        ))

    for tok in tokens[1:]:
        mergeable = (
            tok.language == head.language
            and tok.token_type == head.token_type
            and tok.start <= end + 1
        )
        if mergeable:
            # Insert a separator only when there really is a one-character
            # gap. Tokens that touch (tok.start <= end) are joined directly,
            # so the span text does not grow longer than end - start.
            separator = " " if tok.start > end else ""
            text = text + separator + tok.text
            end = tok.end
            conf_sum += tok.confidence
            conf_count += 1
            # Keep candidates from every merged token, not just the first.
            for cand in tok.ambiguous_candidates:
                if cand not in cands:
                    cands.append(cand)
        else:
            close_span()
            head = tok
            text = tok.text
            start = tok.start
            end = tok.end
            conf_sum = tok.confidence
            conf_count = 1
            cands = list(tok.ambiguous_candidates)

    close_span()
    return spans
63
+
64
+
65
def _insert_special_tokens(
    spans: list[Span], specials: list[SpecialToken]
) -> list[Span]:
    """Combine language spans with special tokens, ordered by start offset.

    Each special token becomes a span whose language and token type are both
    the special's ``token_type`` and whose confidence is 0.0. A new list is
    returned; the sort is stable, so on equal start offsets language spans
    stay ahead of special spans.
    """
    combined = list(spans)
    for special in specials:
        combined.append(
            Span(
                text=special.text,
                language=special.token_type,
                token_type=special.token_type,
                confidence=0.0,
                start=special.start,
                end=special.end,
            )
        )
    return sorted(combined, key=lambda span: span.start)
82
+
83
+
84
def _compute_dominant(spans: list[Span]) -> str:
    """Return the language covering the most characters, or ``"und"``.

    Coverage is the sum of ``end - start`` over the spans of each language.
    Non-linguistic spans and the placeholder languages ``"und"``, ``"ne"``
    and ``"amb"`` are ignored. ``"amb"`` is excluded so the dominant language
    is always one the result can also report in its set of languages.
    """
    coverage: Counter[str] = Counter()
    for span in spans:
        if span.token_type in _NON_LINGUISTIC:
            continue
        if span.language in ("und", "ne", "amb"):
            continue
        coverage[span.language] += span.end - span.start
    if not coverage:
        return "und"
    return coverage.most_common(1)[0][0]
93
+
94
+
95
def _mark_foreign(spans: list[Span], dominant: str) -> None:
    """Set ``is_foreign`` on every span that carries a real language.

    A span is foreign when its language differs from *dominant*.
    Non-linguistic spans and the placeholder languages ``"und"``, ``"ne"``
    and ``"amb"`` are left untouched: an unresolved ``"amb"`` span has no
    language yet and must not be reported as foreign.
    """
    for span in spans:
        if span.token_type in _NON_LINGUISTIC:
            continue
        if span.language in ("und", "ne", "amb"):
            continue
        span.is_foreign = span.language != dominant
100
+
101
+
102
def _overall_confidence(spans: list[Span]) -> float:
    """Return the mean confidence of the spans that carry a real language.

    Only ``"text"`` spans are counted, and the placeholder languages
    ``"und"``, ``"ne"`` and ``"amb"`` are skipped. ``"amb"`` is excluded
    because such spans have no language yet, and counting their confidence
    would pull the average down. Returns 0.0 when nothing qualifies.
    """
    scores = [
        span.confidence
        for span in spans
        if span.token_type == "text" and span.language not in ("und", "ne", "amb")
    ]
    if not scores:
        return 0.0
    return sum(scores) / len(scores)
109
+
110
+
111
def run(
    tokens: list[Token],
    specials: list[SpecialToken],
    original_text: str,
    granularity: Literal["span", "token"] = "span",
) -> PolyStringResult:
    """Stage 5: assemble the final result from classified tokens.

    Tokens are merged into spans, special tokens are inserted by position,
    the dominant language is computed and foreign spans are flagged.

    Args:
        tokens: Classified tokens in text order.
        specials: Special tokens to insert among the spans.
        original_text: The input text, stored unchanged on the result.
        granularity: With ``"token"`` the token list is attached to the
            result; otherwise ``tokens`` is ``None``.

    Returns:
        A ``PolyStringResult``. ``languages`` holds the languages of
        ``"text"`` spans other than ``"und"``, ``"ne"`` and ``"amb"``, and
        ``is_mixed`` is true when there is more than one.
    """
    spans = _insert_special_tokens(_tokens_to_spans(tokens), specials)

    dominant = _compute_dominant(spans)
    _mark_foreign(spans, dominant)

    languages: set[str] = set()
    for span in spans:
        if span.token_type == "text" and span.language not in ("und", "ne", "amb"):
            languages.add(span.language)

    return PolyStringResult(
        text=original_text,
        spans=spans,
        tokens=tokens if granularity == "token" else None,
        languages=languages,
        dominant_language=dominant,
        is_mixed=len(languages) > 1,
        confidence=_overall_confidence(spans),
    )