polystring 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
polystring/__init__.py ADDED
@@ -0,0 +1,29 @@
1
+ """polystring — span-level language detection for mixed-language text."""
2
+ from __future__ import annotations
3
+
4
+ from polystring._analyzer import analyze
5
+ from polystring._exceptions import (
6
+ InputTooShortError,
7
+ PolyStringError,
8
+ UnsupportedLanguageError,
9
+ )
10
+ from polystring._models import PolyStringResult, Span, Token
11
+
12
+ __version__ = "0.1.0"
13
+ __all__ = [
14
+ "__version__",
15
+ "analyze",
16
+ "supported_languages",
17
+ "Span",
18
+ "Token",
19
+ "PolyStringResult",
20
+ "PolyStringError",
21
+ "UnsupportedLanguageError",
22
+ "InputTooShortError",
23
+ ]
24
+
25
+
26
def supported_languages() -> list[str]:
    """Return the ISO 639-1 codes supported for detection, sorted ascending."""
    # Resolved at call time; the set lives in the analyzer module.
    from polystring._analyzer import _LINGUA_SUPPORTED as known_codes

    ordered = list(known_codes)
    ordered.sort()
    return ordered
@@ -0,0 +1,133 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Literal
4
+
5
+ from polystring._exceptions import InputTooShortError, UnsupportedLanguageError
6
+ from polystring._models import PolyStringResult
7
+ from polystring._pipeline import (
8
+ stage1_preprocess,
9
+ stage2_script,
10
+ stage3_classify,
11
+ stage4_context,
12
+ stage5_merge,
13
+ )
14
+ from polystring.lexicons import add_custom_lexicon
15
+
16
+ # Languages supported by lingua's ISO 639-1 codes (subset used for validation)
17
+ # We rely on lingua raising its own error if a code is truly unknown;
18
+ # this set is used only for fast pre-validation of the hint list.
19
+ _LINGUA_SUPPORTED: frozenset[str] = frozenset({
20
+ "af", "sq", "ar", "hy", "az", "eu", "be", "bn", "bs", "bg", "ca", "zh",
21
+ "hr", "cs", "da", "nl", "en", "eo", "et", "fi", "fr", "lg", "ka", "de",
22
+ "el", "gu", "he", "hi", "hu", "is", "id", "ga", "it", "ja", "kn", "kk",
23
+ "ko", "la", "lv", "lt", "mk", "ms", "mi", "mr", "mn", "ne", "nb", "nn",
24
+ "fa", "pl", "pt", "pa", "ro", "ru", "sr", "sn", "sk", "sl", "so", "st",
25
+ "es", "sw", "sv", "tl", "ta", "te", "th", "ts", "tn", "tr", "uk", "ur",
26
+ "vi", "cy", "xh", "yo", "zu",
27
+ })
28
+
29
+
30
def _classify_latin_with_lexicons(latin_tokens):
    """Classify Latin-script tokens from the lexicons only (no lingua model).

    A lexicon hit tagged ``"amb"`` and a lexicon miss both become ``"und"``
    with confidence 0.0; any other hit keeps the lexicon's language and
    confidence.
    """
    from polystring._models import Token
    from polystring.lexicons import lexicon_lookup

    classified: list[Token] = []
    for raw in latin_tokens:
        hit = lexicon_lookup(raw.text)
        if hit:
            hit_lang, hit_conf = hit
            is_ambiguous = hit_lang == "amb"
            language = "und" if is_ambiguous else hit_lang
            confidence = 0.0 if is_ambiguous else hit_conf
        else:
            language, confidence = "und", 0.0
        classified.append(Token(
            text=raw.text,
            language=language,
            token_type="text",
            confidence=confidence,
            start=raw.start,
            end=raw.end,
        ))
    return classified


def analyze(
    text: str,
    *,
    languages: list[str] | None = None,
    granularity: Literal["span", "token"] = "span",
    min_confidence: float = 0.70,
    low_accuracy_mode: bool = False,
    normalize: bool = True,
    custom_lexicon: dict[str, list[str]] | None = None,
) -> PolyStringResult:
    """Detect the language of each span in mixed-language text.

    Runs the five pipeline stages in order: preprocess, script detection,
    Latin-token classification, context pass, merge.

    Parameters
    ----------
    text:
        Input text to analyse.
    languages:
        Optional ISO 639-1 codes used as a detection hint. Every code is
        checked against ``_LINGUA_SUPPORTED`` before any work is done.
    granularity:
        ``"span"`` (default) or ``"token"``; forwarded to the merge stage.
    min_confidence:
        Confidence threshold forwarded to the stage-3 classifier. It is not
        consulted when ``low_accuracy_mode`` is on.
    low_accuracy_mode:
        Skip the stage-3 classifier and tag Latin-script tokens from the
        lexicons only.
    normalize:
        Forwarded to stage 1; set ``False`` to skip normalisation.
    custom_lexicon:
        ``{lang_code: [word, ...]}`` entries handed to ``add_custom_lexicon``
        before analysis starts.

    Raises
    ------
    TypeError
        If ``text`` is not a ``str``.
    UnsupportedLanguageError
        If a code in ``languages`` is not in ``_LINGUA_SUPPORTED``.
    InputTooShortError
        If stage 1 yields fewer than two linguistic tokens.
    """
    if not isinstance(text, str):
        raise TypeError(f"text must be str, got {type(text).__name__}")

    # ``None`` and an empty list both mean "no restriction".
    for code in languages or ():
        if code not in _LINGUA_SUPPORTED:
            raise UnsupportedLanguageError(code)

    if custom_lexicon:
        add_custom_lexicon(custom_lexicon)

    # Hashable form of the hint, as expected by the stage-3 classifier.
    languages_key = frozenset(languages) if languages else None

    stage1 = stage1_preprocess.run(text, normalize=normalize)
    if len(stage1.linguistic_tokens) < 2:
        raise InputTooShortError(
            "Input has fewer than 2 tokens after special token removal. "
            "Cannot perform reliable language detection."
        )

    script_tokens, latin_tokens = stage2_script.run(stage1.linguistic_tokens)

    if low_accuracy_mode:
        latin_classified = _classify_latin_with_lexicons(latin_tokens)
    else:
        latin_classified = stage3_classify.run(
            latin_tokens,
            languages_hint=languages_key,
            min_confidence=min_confidence,
        )

    # Restore reading order after the script / Latin split.
    ordered_tokens = sorted(
        script_tokens + latin_classified,
        key=lambda tok: tok.start,
    )
    ordered_tokens = stage4_context.run(ordered_tokens)

    return stage5_merge.run(
        ordered_tokens,
        stage1.special_tokens,
        stage1.normalized_text,
        granularity=granularity,
    )
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ import functools
4
+ from typing import TYPE_CHECKING
5
+
6
+ if TYPE_CHECKING:
7
+ from lingua import LanguageDetector
8
+
9
+ _detector: LanguageDetector | None = None
10
+ _detector_languages: frozenset[str] | None = None
11
+
12
+
13
def _build_detector(languages: list[str] | None = None) -> LanguageDetector:
    """Build a lingua detector, optionally restricted to ``languages``.

    Parameters
    ----------
    languages:
        ISO 639-1 codes to restrict detection to. Codes that cannot be
        resolved to a lingua language are skipped, and duplicates are
        collapsed.

    Returns
    -------
    A detector with preloaded language models. It is restricted to the
    resolved languages only when at least two distinct ones remain; lingua's
    ``from_languages`` rejects fewer than two, so in every other case
    (no hint, nothing resolvable, a single language) the detector covers all
    languages.
    """
    from lingua import Language, LanguageDetectorBuilder

    lang_objs = []
    if languages:
        from lingua import IsoCode639_1

        for code in languages:
            try:
                iso = IsoCode639_1[code.upper()]
                lang = Language.from_iso_code_639_1(iso)
            except Exception:
                # Deliberate best-effort: an unresolvable code is dropped
                # rather than failing the whole build.
                continue
            # Membership test uses ==, so this does not rely on hashing.
            if lang not in lang_objs:
                lang_objs.append(lang)

    if len(lang_objs) >= 2:
        builder = LanguageDetectorBuilder.from_languages(*lang_objs)
    else:
        builder = LanguageDetectorBuilder.from_all_languages()

    return builder.with_preloaded_language_models().build()
34
+
35
+
36
def get_detector(languages: list[str] | None = None) -> LanguageDetector:
    """Return the shared detector, rebuilding it when the language set changes.

    Only one detector is kept at a time. When the requested restriction
    differs from the one the current detector was built for, a new detector
    is built and the ``lingua_top2`` cache is cleared.
    """
    global _detector, _detector_languages

    wanted = frozenset(languages) if languages else None
    if _detector is not None and _detector_languages == wanted:
        return _detector

    _detector = _build_detector(languages)
    _detector_languages = wanted
    # Results cached for the previous detector are discarded.
    lingua_top2.cache_clear()
    return _detector
45
+
46
+
47
@functools.lru_cache(maxsize=4096)
def lingua_top2(
    text: str, languages_key: frozenset[str] | None = None
) -> list[tuple[str, float]]:
    """Return up to two ``(iso_639_1_code, confidence)`` pairs for ``text``.

    The pairs are the first two entries of the detector's confidence values,
    with lower-cased codes. Results are memoised per ``(text, languages_key)``;
    the same list object is handed back on a cache hit, so callers should
    treat it as read-only.
    """
    restriction = list(languages_key) if languages_key else None
    ranked = get_detector(restriction).compute_language_confidence_values(text)
    return [
        (entry.language.iso_code_639_1.name.lower(), entry.value)
        for entry in ranked[:2]
    ]
58
+
59
+
60
def lingua_confidence_for(text: str, lang_code: str) -> float:
    """Return lingua's confidence that `text` is in `lang_code`.

    Parameters
    ----------
    text:
        Text to score.
    lang_code:
        ISO 639-1 code of the language of interest (case-insensitive).

    Returns
    -------
    The confidence value lingua reports for that language, or ``0.0`` when
    the code cannot be resolved or the language is absent from the results.
    """
    from lingua import IsoCode639_1, Language

    # Resolve the code first: an unknown code returns early without
    # touching the shared detector.
    try:
        iso = IsoCode639_1[lang_code.upper()]
        lang = Language.from_iso_code_639_1(iso)
    except Exception:
        # Best-effort contract: an unresolvable code scores 0.0.
        return 0.0

    # NOTE(review): this asks for the unrestricted detector, which replaces a
    # restricted one held by ``get_detector`` and clears its cache — confirm
    # that is intended when ``analyze`` runs with a language hint.
    detector = get_detector()
    for cv in detector.compute_language_confidence_values(text):
        if cv.language == lang:
            return cv.value
    return 0.0
@@ -0,0 +1,17 @@
1
class PolyStringError(Exception):
    """Base class for the errors defined by polystring."""


class UnsupportedLanguageError(PolyStringError):
    """Error for a language code that is not supported.

    Attributes
    ----------
    code:
        The rejected language code, exactly as it was passed in.
    """

    def __init__(self, code: str) -> None:
        # Keep the raw code so callers can inspect it without parsing the message.
        self.code = code
        super().__init__(
            f"'{code}' is not a supported language code. "
            f"Call polystring.supported_languages() for the full list."
        )

    def __reduce__(self):
        # The default exception reduction rebuilds via cls(*self.args), and
        # args holds the formatted message, which would then be wrapped in
        # the template a second time. Rebuild from the raw code instead.
        return (type(self), (self.code,))


class InputTooShortError(PolyStringError):
    """Error for input that has too few tokens to analyse."""

    def __init__(
        self,
        message: str = "Input too short: need at least 2 tokens.",
    ) -> None:
        super().__init__(message)
polystring/_models.py ADDED
@@ -0,0 +1,106 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import TYPE_CHECKING, Any
5
+
6
+ if TYPE_CHECKING:
7
+ import pandas as pd
8
+
9
+
10
@dataclass
class Token:
    """A single token together with its language classification."""

    text: str
    language: str
    token_type: str
    confidence: float
    # Offsets of the token in the analysed text.
    start: int
    end: int
    ambiguous_candidates: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict, in declaration order."""
        names = (
            "text",
            "language",
            "token_type",
            "confidence",
            "start",
            "end",
            "ambiguous_candidates",
        )
        return {name: getattr(self, name) for name in names}
30
+
31
+
32
@dataclass
class Span:
    """A stretch of text carrying one language label."""

    text: str
    language: str
    token_type: str
    confidence: float
    # Offsets of the span in the analysed text.
    start: int
    end: int
    is_foreign: bool = False
    ambiguous_candidates: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict, in declaration order."""
        names = (
            "text",
            "language",
            "token_type",
            "confidence",
            "start",
            "end",
            "is_foreign",
            "ambiguous_candidates",
        )
        return {name: getattr(self, name) for name in names}
54
+
55
+
56
@dataclass
class PolyStringResult:
    """Result of analysing one piece of text."""

    text: str
    spans: list[Span]
    tokens: list[Token] | None
    languages: set[str]
    dominant_language: str
    is_mixed: bool
    confidence: float

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view of the result.

        ``languages`` is emitted as a sorted list so the output is the same
        on every run (iteration order of a ``set`` of strings is not).
        """
        # NOTE(review): ``tokens`` is not part of the dict even when it is
        # populated — confirm whether that omission is intentional.
        return {
            "text": self.text,
            "spans": [s.to_dict() for s in self.spans],
            "languages": sorted(self.languages),
            "dominant_language": self.dominant_language,
            "is_mixed": self.is_mixed,
            "confidence": self.confidence,
        }

    def to_dataframe(self) -> pd.DataFrame:
        """Return the spans as a DataFrame, one row per span.

        Raises
        ------
        ImportError
            If pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError as e:
            raise ImportError(
                "pandas is required: pip install polystring[pandas]"
            ) from e
        return pd.DataFrame([s.to_dict() for s in self.spans])

    def highlight(self) -> str:
        """Return the spans as ANSI-coloured ``[lang]text`` chunks joined by spaces.

        Each language gets the next palette colour the first time it is
        seen; the palette wraps around when it runs out.
        """
        palette = [
            "\033[91m", "\033[92m", "\033[93m", "\033[94m",
            "\033[95m", "\033[96m", "\033[97m",
        ]
        reset = "\033[0m"
        lang_colour: dict[str, str] = {}
        parts: list[str] = []
        for span in self.spans:
            colour = lang_colour.get(span.language)
            if colour is None:
                # Number of languages seen so far doubles as the palette index.
                colour = palette[len(lang_colour) % len(palette)]
                lang_colour[span.language] = colour
            parts.append(f"{colour}[{span.language}]{span.text}{reset}")
        return " ".join(parts)

    def linguistic_spans(self) -> list[Span]:
        """Return the spans whose ``token_type`` is not a special-token type."""
        non_linguistic = {"url", "mention", "hashtag", "emoji", "num"}
        return [s for s in self.spans if s.token_type not in non_linguistic]
polystring/_ngram.py ADDED
@@ -0,0 +1,144 @@
1
+ """Character n-gram language scorer for low-resource romanised languages.
2
+
3
+ Loaded lazily, once, on the first call to ``score`` or ``available_languages``, from pre-built JSON profiles in polystring/data/.
4
+ Used in stage 3 between lexicon lookup and lingua for languages where lingua
5
+ has insufficient training data (ur-Latn, tl, sw).
6
+
7
+ Architecture: discriminative hit-count scoring.
8
+
9
+ Each profile contains only *discriminative* n-grams: n-grams that appear
10
+ significantly more often in that language than in all competitor languages
11
+ (other target languages + English background), as determined at build time
12
+ by a log-prob margin threshold.
13
+
14
+ At inference time we count how many of a token's n-grams match each
15
+ language's discriminative profile. The language with the most hits wins,
16
+ provided it leads the runner-up by at least _MIN_GAP_HITS and has at least
17
+ _MIN_HITS total. Ties are broken by the average log-prob of matched n-grams.
18
+
19
+ This avoids the cross-contamination problem that affects plain LLR scoring:
20
+ because the profiles are pre-filtered to exclude n-grams shared across
21
+ ur-Latn/tl/sw, an Urdu word cannot "accidentally" accumulate Tagalog hits.
22
+ """
23
+ from __future__ import annotations
24
+
25
+ import json
26
+ import re
27
+ from pathlib import Path
28
+
29
+ _DATA_DIR = Path(__file__).parent / "data"
30
+
31
+ NGRAM_LANGUAGES: frozenset[str] = frozenset({"ur-Latn", "tl", "sw"})
32
+
33
+ # Scoring thresholds (tuned empirically on build corpora test words)
34
+ _MIN_HITS = 2 # winner must have at least this many discriminative n-gram hits
35
+ _MIN_GAP_HITS = 1 # winner must lead runner-up by at least this many hits
36
+ _MIN_TOKEN_LEN = 4 # tokens shorter than this are not scored (too noisy)
37
+
38
+ _CLEAN = re.compile(r"[^a-z'\-]")
39
+
40
+ _MODELS: dict[str, dict[str, dict[str, float]]] = {}
41
+ _NGRAM_SIZES: dict[str, list[int]] = {}
42
+ _LOADED = False
43
+
44
+
45
def _load() -> None:
    """Read the n-gram profile files into ``_MODELS`` / ``_NGRAM_SIZES`` once.

    Languages whose profile file is missing are skipped. Later calls are
    no-ops.
    """
    global _LOADED
    if not _LOADED:
        for lang in NGRAM_LANGUAGES:
            # File names use "_" where the language tag has "-".
            profile_path = _DATA_DIR / (lang.replace("-", "_") + "_ngram.json")
            if profile_path.exists():
                data = json.loads(profile_path.read_text(encoding="utf-8"))
                _MODELS[lang] = data["profile"]
                _NGRAM_SIZES[lang] = data["ngram_sizes"]
        _LOADED = True
59
+
60
+
61
+ def _hit_score(
62
+ cleaned: str, profile: dict[str, dict[str, float]], sizes: list[int]
63
+ ) -> tuple[int, float]:
64
+ """Count discriminative n-gram hits and sum their log-probs.
65
+
66
+ Returns (hit_count, avg_log_prob_of_hits) where avg is 0 when hit_count=0.
67
+ """
68
+ hit_count = 0
69
+ lp_sum = 0.0
70
+
71
+ for n in sizes:
72
+ table = profile[str(n)]
73
+ padded = f"{'_' * (n - 1)}{cleaned}{'_' * (n - 1)}"
74
+ for i in range(len(padded) - n + 1):
75
+ ng = padded[i:i + n]
76
+ v = table.get(ng)
77
+ if v is not None:
78
+ hit_count += 1
79
+ lp_sum += v
80
+
81
+ avg_lp = lp_sum / hit_count if hit_count > 0 else 0.0
82
+ return hit_count, avg_lp
83
+
84
+
85
def score(
    token: str, candidates: frozenset[str] | None = None
) -> tuple[str, float] | None:
    """Pick a language for ``token`` from character n-gram hit counts.

    The token is lower-cased, stripped of everything but ``a-z``, ``'`` and
    ``-``, and scored against each loaded profile with ``_hit_score``. The
    language with the most hits wins if it reaches ``_MIN_HITS`` and leads
    the runner-up by at least ``_MIN_GAP_HITS``.

    Parameters
    ----------
    token:
        Raw token text; cleaned internally.
    candidates:
        If given, only languages in this set that also have a loaded model
        are scored.

    Returns
    -------
    ``(lang, confidence)`` with confidence clamped to [0.60, 0.95], or
    ``None`` when no model is loaded, the cleaned token is shorter than
    ``_MIN_TOKEN_LEN``, no candidate has a model, or no language wins
    convincingly.
    """
    _load()
    if not _MODELS:
        return None

    cleaned = _CLEAN.sub("", token.lower()).strip("-'")
    if len(cleaned) < _MIN_TOKEN_LEN:
        return None

    pool = set(_MODELS.keys())
    if candidates is not None:
        pool.intersection_update(candidates)
    if not pool:
        return None

    # Rank by hit count first, average value of the hits second (both descending).
    ranked = sorted(
        (
            (*_hit_score(cleaned, _MODELS[lang], _NGRAM_SIZES[lang]), lang)
            for lang in pool
        ),
        key=lambda entry: (entry[0], entry[1]),
        reverse=True,
    )
    top_hits, _top_avg, top_lang = ranked[0]

    if top_hits < _MIN_HITS:
        return None

    has_rival = len(ranked) > 1
    if has_rival and (top_hits - ranked[1][0]) < _MIN_GAP_HITS:
        return None

    # 0.03 per hit on top of a 0.60 base (2 hits -> 0.66, 10 hits -> 0.90),
    # clamped to [0.60, 0.95].
    confidence = max(0.60, min(0.95, 0.60 + top_hits * 0.03))
    return top_lang, confidence
139
+
140
+
141
def available_languages() -> frozenset[str]:
    """Return the languages whose profile was loaded into ``_MODELS``."""
    _load()
    loaded = frozenset(_MODELS)
    return loaded
File without changes
@@ -0,0 +1,134 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import unicodedata
5
+ from dataclasses import dataclass
6
+
7
+ import regex as _regex
8
+
9
# One compiled pattern per special-token type.
_URL_PATTERN = re.compile(r"https?://\S+|www\.\S+")
_MENTION_PATTERN = re.compile(r"@\w+")
_HASHTAG_PATTERN = re.compile(r"#\w+")
# Runs of characters from the listed emoji / symbol code-point ranges.
_EMOJI_PATTERN = _regex.compile(
    "[\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002702-\U000027B0"
    "\U0001F900-\U0001F9FF"
    "☀-⛿"
    "✀-➿]+"
)
# Digits with an optional letter suffix, or letters followed by digits.
_NUM_PATTERN = re.compile(r"\b\d+[a-zA-Z]*\b|\b[a-zA-Z]*\d+\b")

# (token_type, pattern) pairs. Order matters: ``_extract_special_tokens``
# tries them in this order and earlier matches take precedence.
_SPECIAL_PATTERNS: list[tuple[str, re.Pattern]] = [
    ("url", _URL_PATTERN),
    ("mention", _MENTION_PATTERN),
    ("hashtag", _HASHTAG_PATTERN),
    ("emoji", _EMOJI_PATTERN),
    ("num", _NUM_PATTERN),
]
25
+
26
+
27
@dataclass
class SpecialToken:
    """A non-linguistic token pulled out of the text before tokenisation."""

    text: str
    # One of: url | mention | hashtag | emoji | num
    token_type: str
    # Offsets of the match in the text it was extracted from.
    start: int
    end: int
    # Not set in this module; presumably filled in by a later stage for
    # hashtags — TODO confirm.
    hashtag_lang: str | None = None
    hashtag_confidence: float = 0.0
35
+
36
+
37
@dataclass
class RawToken:
    """A tokenised piece of text that has not been classified yet."""

    text: str
    # Offsets of the token in the text it was tokenised from.
    start: int
    end: int
    # Result of ``_is_ne_candidate`` for this token.
    is_ne_candidate: bool = False
43
+
44
+
45
@dataclass
class Stage1Result:
    """Everything stage 1 hands on to the rest of the pipeline."""

    linguistic_tokens: list[RawToken]
    special_tokens: list[SpecialToken]
    # Text the token offsets refer to: NFC-normalised input, or the input
    # as given when normalisation was switched off.
    normalized_text: str
50
+
51
+
52
+ def _nfc(text: str) -> str:
53
+ return unicodedata.normalize("NFC", text)
54
+
55
+
56
def _extract_special_tokens(text: str) -> tuple[list[SpecialToken], str]:
    """Pull special tokens out of ``text`` and blank them with spaces.

    Patterns are tried in ``_SPECIAL_PATTERNS`` order. A match whose start
    lies inside an already recorded special token is ignored.

    Returns ``(special_tokens, masked_text)``: the tokens sorted by start
    offset, and a copy of ``text`` of the same length in which every
    recorded match is replaced by spaces, so character offsets stay valid.
    """
    found: list[SpecialToken] = []
    buffer = list(text)

    for kind, matcher in _SPECIAL_PATTERNS:
        for match in matcher.finditer(text):
            begin, stop = match.span()
            # NOTE(review): only the match start is tested, so a match that
            # begins before an existing token and runs into it is still
            # recorded — confirm whether partial overlaps are acceptable.
            if any(prev.start <= begin < prev.end for prev in found):
                continue
            found.append(SpecialToken(
                text=match.group(),
                token_type=kind,
                start=begin,
                end=stop,
            ))
            buffer[begin:stop] = " " * (stop - begin)

    found.sort(key=lambda tok: tok.start)
    return found, "".join(buffer)
83
+
84
+
85
+ def _tokenize(masked_text: str) -> list[tuple[str, int, int]]:
86
+ """Split masked text into (token, start, end) by whitespace and punctuation."""
87
+ tokens: list[tuple[str, int, int]] = []
88
+ for m in re.finditer(r"\S+", masked_text):
89
+ token_text = m.group()
90
+ stripped = token_text.strip(".,!?;:\"'()[]{}")
91
+ if not stripped:
92
+ continue
93
+ offset = token_text.index(stripped[0]) if stripped else 0
94
+ end = m.start() + offset + len(stripped)
95
+ tokens.append((stripped, m.start() + offset, end))
96
+ return tokens
97
+
98
+
99
+ def _is_ne_candidate(token: str, idx: int, tokens: list[tuple[str, int, int]]) -> bool:
100
+ """True if a mid-sentence capitalised token that may be a named entity."""
101
+ if idx == 0:
102
+ return False
103
+ if not token[0].isupper():
104
+ return False
105
+ if idx > 0:
106
+ prev = tokens[idx - 1][0]
107
+ if prev.endswith((".", "!", "?")):
108
+ return False
109
+ return True
110
+
111
+
112
def run(
    text: str,
    normalize: bool = True,
) -> Stage1Result:
    """Stage 1: NFC normalise, extract special tokens, tokenise, tag NE candidates.

    Parameters
    ----------
    text:
        Raw input text.
    normalize:
        Apply NFC normalisation first. When ``False`` the text is used as
        given.

    Returns
    -------
    A ``Stage1Result`` holding the linguistic tokens, the special tokens,
    and the text all offsets refer to.
    """
    normalized = _nfc(text) if normalize else text
    specials, masked = _extract_special_tokens(normalized)

    raw_tokens_raw = _tokenize(masked)
    linguistic_tokens: list[RawToken] = []
    for idx, (tok, start, end) in enumerate(raw_tokens_raw):
        if not tok.strip():
            continue
        is_ne = _is_ne_candidate(tok, idx, raw_tokens_raw)
        if is_ne and idx > 0:
            # ``_tokenize`` trims trailing ".", "!" and "?" from token text,
            # so the previous token never ends with one. Look at the
            # characters between the two tokens instead: sentence-ending
            # punctuation there means this capital merely starts a sentence.
            prev_end = raw_tokens_raw[idx - 1][2]
            gap = masked[prev_end:start]
            if any(mark in gap for mark in ".!?"):
                is_ne = False
        linguistic_tokens.append(
            RawToken(text=tok, start=start, end=end, is_ne_candidate=is_ne)
        )

    return Stage1Result(
        linguistic_tokens=linguistic_tokens,
        special_tokens=specials,
        normalized_text=normalized,
    )