polystring 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polystring/__init__.py +29 -0
- polystring/_analyzer.py +133 -0
- polystring/_detector.py +74 -0
- polystring/_exceptions.py +17 -0
- polystring/_models.py +106 -0
- polystring/_ngram.py +144 -0
- polystring/_pipeline/__init__.py +0 -0
- polystring/_pipeline/stage1_preprocess.py +134 -0
- polystring/_pipeline/stage2_script.py +104 -0
- polystring/_pipeline/stage3_classify.py +176 -0
- polystring/_pipeline/stage4_context.py +108 -0
- polystring/_pipeline/stage5_merge.py +138 -0
- polystring/data/_background_ngram.json +1 -0
- polystring/data/sw_ngram.json +1 -0
- polystring/data/tl_ngram.json +1 -0
- polystring/data/ur_Latn_ngram.json +1 -0
- polystring/lexicons/__init__.py +116 -0
- polystring/lexicons/french.py +113 -0
- polystring/lexicons/german.py +111 -0
- polystring/lexicons/italian.py +113 -0
- polystring/lexicons/portuguese.py +117 -0
- polystring/lexicons/roman_urdu.py +130 -0
- polystring/lexicons/spanish.py +111 -0
- polystring/lexicons/swahili.py +89 -0
- polystring/lexicons/tagalog.py +100 -0
- polystring/lexicons/turkish.py +87 -0
- polystring/py.typed +0 -0
- polystring-0.1.0.dist-info/METADATA +257 -0
- polystring-0.1.0.dist-info/RECORD +31 -0
- polystring-0.1.0.dist-info/WHEEL +4 -0
- polystring-0.1.0.dist-info/licenses/LICENSE +21 -0
polystring/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""polystring — span-level language detection for mixed-language text."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from polystring._analyzer import analyze
|
|
5
|
+
from polystring._exceptions import (
|
|
6
|
+
InputTooShortError,
|
|
7
|
+
PolyStringError,
|
|
8
|
+
UnsupportedLanguageError,
|
|
9
|
+
)
|
|
10
|
+
from polystring._models import PolyStringResult, Span, Token
|
|
11
|
+
|
|
12
|
+
__version__ = "0.1.0"
|
|
13
|
+
__all__ = [
|
|
14
|
+
"__version__",
|
|
15
|
+
"analyze",
|
|
16
|
+
"supported_languages",
|
|
17
|
+
"Span",
|
|
18
|
+
"Token",
|
|
19
|
+
"PolyStringResult",
|
|
20
|
+
"PolyStringError",
|
|
21
|
+
"UnsupportedLanguageError",
|
|
22
|
+
"InputTooShortError",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def supported_languages() -> list[str]:
    """Return the language codes accepted as detection hints, sorted.

    The codes are read from ``polystring._analyzer._LINGUA_SUPPORTED``, the
    same set ``analyze`` uses to validate its ``languages`` argument.
    """
    # Imported inside the function, exactly as the original did.
    from polystring._analyzer import _LINGUA_SUPPORTED

    codes = list(_LINGUA_SUPPORTED)
    codes.sort()
    return codes
|
polystring/_analyzer.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Literal
|
|
4
|
+
|
|
5
|
+
from polystring._exceptions import InputTooShortError, UnsupportedLanguageError
|
|
6
|
+
from polystring._models import PolyStringResult
|
|
7
|
+
from polystring._pipeline import (
|
|
8
|
+
stage1_preprocess,
|
|
9
|
+
stage2_script,
|
|
10
|
+
stage3_classify,
|
|
11
|
+
stage4_context,
|
|
12
|
+
stage5_merge,
|
|
13
|
+
)
|
|
14
|
+
from polystring.lexicons import add_custom_lexicon
|
|
15
|
+
|
|
16
|
+
# Languages supported by lingua's ISO 639-1 codes (subset used for validation)
|
|
17
|
+
# We rely on lingua raising its own error if a code is truly unknown;
|
|
18
|
+
# this set is used only for fast pre-validation of the hint list.
|
|
19
|
+
_LINGUA_SUPPORTED: frozenset[str] = frozenset({
|
|
20
|
+
"af", "sq", "ar", "hy", "az", "eu", "be", "bn", "bs", "bg", "ca", "zh",
|
|
21
|
+
"hr", "cs", "da", "nl", "en", "eo", "et", "fi", "fr", "lg", "ka", "de",
|
|
22
|
+
"el", "gu", "he", "hi", "hu", "is", "id", "ga", "it", "ja", "kn", "kk",
|
|
23
|
+
"ko", "la", "lv", "lt", "mk", "ms", "mi", "mr", "mn", "ne", "nb", "nn",
|
|
24
|
+
"fa", "pl", "pt", "pa", "ro", "ru", "sr", "sn", "sk", "sl", "so", "st",
|
|
25
|
+
"es", "sw", "sv", "tl", "ta", "te", "th", "ts", "tn", "tr", "uk", "ur",
|
|
26
|
+
"vi", "cy", "xh", "yo", "zu",
|
|
27
|
+
})
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _classify_latin_by_lexicon(latin_tokens: list) -> list:
    """Tag Latin-script tokens using lexicon lookups only (no lingua model).

    A token whose lookup misses, or whose lookup reports the ambiguity marker
    ``"amb"``, is emitted with language ``"und"`` and confidence 0.0.
    """
    # Lazy imports: only needed on the low-accuracy path.
    from polystring._models import Token
    from polystring.lexicons import lexicon_lookup

    classified = []
    for rt in latin_tokens:
        hit = lexicon_lookup(rt.text)
        if hit:
            lang, conf = hit
            if lang == "amb":
                lang, conf = "und", 0.0
        else:
            lang, conf = "und", 0.0
        classified.append(Token(
            text=rt.text,
            language=lang,
            token_type="text",
            confidence=conf,
            start=rt.start,
            end=rt.end,
        ))
    return classified


def analyze(
    text: str,
    *,
    languages: list[str] | None = None,
    granularity: Literal["span", "token"] = "span",
    min_confidence: float = 0.70,
    low_accuracy_mode: bool = False,
    normalize: bool = True,
    custom_lexicon: dict[str, list[str]] | None = None,
) -> PolyStringResult:
    """Detect languages of each span in mixed-language text.

    Parameters
    ----------
    text:
        Input text to analyse.
    languages:
        Restrict detection to these ISO 639-1 codes. Speeds up detection and
        reduces false positives on known language sets. Every code must be in
        ``_LINGUA_SUPPORTED``. The hint is only forwarded to the stage 3
        classifier; the low-accuracy path does not use it.
    granularity:
        "span" (default) merges adjacent same-language tokens into spans.
        "token" also populates result.tokens with per-token data.
    min_confidence:
        Tokens below this threshold are tagged "und". Default 0.70.
        Only forwarded to the stage 3 classifier; the low-accuracy path
        keeps whatever confidence the lexicon lookup reports.
    low_accuracy_mode:
        Skip the lingua model entirely; use only lexicons and script detection.
        Much faster but lower recall.
    normalize:
        Run NFC normalisation before analysis. Set False to skip.
    custom_lexicon:
        Additional {lang_code: [word, ...]} entries merged into the lexicons
        before analysis.
        NOTE(review): this is handed to ``add_custom_lexicon`` and presumably
        persists for later calls as well -- confirm against the lexicons module.

    Returns
    -------
    PolyStringResult
        The value produced by the stage 5 merge step.

    Raises
    ------
    TypeError
        If ``text`` is not a ``str``.
    UnsupportedLanguageError
        If any entry of ``languages`` is not in ``_LINGUA_SUPPORTED``.
    InputTooShortError
        If fewer than two linguistic tokens remain after stage 1.
    """
    if not isinstance(text, str):
        raise TypeError(f"text must be str, got {type(text).__name__}")

    if languages:
        for code in languages:
            if code not in _LINGUA_SUPPORTED:
                raise UnsupportedLanguageError(code)

    if custom_lexicon:
        add_custom_lexicon(custom_lexicon)

    # A frozenset is hashable, so the hint can be used as a cache key downstream.
    languages_key = frozenset(languages) if languages else None

    stage1 = stage1_preprocess.run(
        text,
        normalize=normalize,
    )

    if len(stage1.linguistic_tokens) < 2:
        raise InputTooShortError(
            "Input has fewer than 2 tokens after special token removal. "
            "Cannot perform reliable language detection."
        )

    script_tokens, latin_tokens = stage2_script.run(stage1.linguistic_tokens)

    if low_accuracy_mode:
        latin_classified = _classify_latin_by_lexicon(latin_tokens)
    else:
        latin_classified = stage3_classify.run(
            latin_tokens,
            languages_hint=languages_key,
            min_confidence=min_confidence,
        )

    # Restore reading order before the context and merge stages.
    all_tokens = sorted(
        script_tokens + latin_classified,
        key=lambda t: t.start,
    )

    all_tokens = stage4_context.run(all_tokens)

    return stage5_merge.run(
        all_tokens,
        stage1.special_tokens,
        stage1.normalized_text,
        granularity=granularity,
    )
|
polystring/_detector.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
from typing import TYPE_CHECKING
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from lingua import LanguageDetector
|
|
8
|
+
|
|
9
|
+
_detector: LanguageDetector | None = None
|
|
10
|
+
_detector_languages: frozenset[str] | None = None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _build_detector(languages: list[str] | None = None) -> LanguageDetector:
    """Build a lingua detector, optionally restricted to ``languages``.

    Parameters
    ----------
    languages:
        ISO 639-1 codes (any case). Codes that cannot be resolved to a lingua
        ``Language`` are skipped with a logged warning. If none resolve, or
        if ``languages`` is empty/None, the detector covers all languages.

    Returns
    -------
    LanguageDetector
        A detector built with preloaded language models.
    """
    import logging

    from lingua import Language, LanguageDetectorBuilder

    lang_objs = []
    if languages:
        from lingua import IsoCode639_1

        for code in languages:
            try:
                iso = IsoCode639_1[code.upper()]
                lang_objs.append(Language.from_iso_code_639_1(iso))
            except Exception as exc:
                # Deliberately best-effort: an unresolvable hint must not
                # abort detection, but it should no longer vanish silently.
                logging.getLogger(__name__).warning(
                    "Ignoring language hint %r: %s", code, exc
                )

    # NOTE(review): lingua's from_languages() is documented to reject fewer
    # than two languages -- confirm how a single resolved hint should behave.
    if lang_objs:
        builder = LanguageDetectorBuilder.from_languages(*lang_objs)
    else:
        builder = LanguageDetectorBuilder.from_all_languages()

    return builder.with_preloaded_language_models().build()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def get_detector(languages: list[str] | None = None) -> LanguageDetector:
    """Return the module-level detector, rebuilding it when the hint changes.

    Only one detector is kept. When the requested language set differs from
    the one the current detector was built for, a new detector replaces it
    and the ``lingua_top2`` result cache is cleared.
    """
    global _detector, _detector_languages

    wanted = None if not languages else frozenset(languages)

    # Fast path: the existing detector was built for the same language set.
    if _detector is not None and _detector_languages == wanted:
        return _detector

    _detector = _build_detector(languages)
    _detector_languages = wanted
    lingua_top2.cache_clear()
    return _detector
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@functools.lru_cache(maxsize=4096)
def lingua_top2(
    text: str, languages_key: frozenset[str] | None = None
) -> list[tuple[str, float]]:
    """Return the first two ``(iso_code, confidence)`` pairs lingua reports.

    Results are memoised on ``(text, languages_key)``. The returned list is
    the cached object itself, so callers should treat it as read-only.
    """
    restriction = list(languages_key) if languages_key else None
    detector = get_detector(restriction)
    ranked = detector.compute_language_confidence_values(text)
    return [
        (cv.language.iso_code_639_1.name.lower(), cv.value)
        for cv in ranked[:2]
    ]
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def lingua_confidence_for(text: str, lang_code: str) -> float:
    """Return lingua's confidence that `text` is in `lang_code`.

    Parameters
    ----------
    text:
        Text to score.
    lang_code:
        ISO 639-1 code (any case).

    Returns
    -------
    float
        The confidence lingua reports for that language, or 0.0 when the code
        cannot be resolved or the language is absent from lingua's results.
    """
    from lingua import IsoCode639_1, Language

    # Resolve the code first: an unknown code must not trigger a detector
    # (re)build, which also clears the lingua_top2 cache.
    try:
        iso = IsoCode639_1[lang_code.upper()]
        lang = Language.from_iso_code_639_1(iso)
    except Exception:
        return 0.0

    # No hint is passed, so this always uses the all-languages detector.
    detector = get_detector()
    for cv in detector.compute_language_confidence_values(text):
        if cv.language == lang:
            return cv.value
    return 0.0
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
class PolyStringError(Exception):
    """Base class for every error raised by polystring."""


class UnsupportedLanguageError(PolyStringError):
    """Raised when a requested language code is not supported.

    Attributes
    ----------
    code:
        The offending language code, exactly as passed in.
    """

    def __init__(self, code: str) -> None:
        self.code = code
        super().__init__(
            f"'{code}' is not a supported language code. "
            "Call polystring.supported_languages() for the full list."
        )

    def __reduce__(self):
        # Exceptions are rebuilt from ``args`` by default, which here holds the
        # formatted message; rebuilding from it would wrap the message a
        # second time. Rebuild from the original code instead.
        return (type(self), (self.code,))


class InputTooShortError(PolyStringError):
    """Raised when the input has too few tokens for reliable detection."""

    def __init__(
        self,
        message: str = "Input too short: need at least 2 tokens.",
    ) -> None:
        super().__init__(message)
|
polystring/_models.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import TYPE_CHECKING, Any
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class Token:
    """A single classified token."""

    text: str
    language: str  # language code; "und" is used for undetermined tokens
    token_type: str  # e.g. "text"
    confidence: float
    start: int  # offsets copied from the upstream token -- presumably
    end: int    # indices into the analysed text; confirm against the pipeline
    ambiguous_candidates: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the token as a plain dict.

        ``ambiguous_candidates`` is copied so that mutating the returned dict
        cannot alter this token.
        """
        return {
            "text": self.text,
            "language": self.language,
            "token_type": self.token_type,
            "confidence": self.confidence,
            "start": self.start,
            "end": self.end,
            "ambiguous_candidates": list(self.ambiguous_candidates),
        }
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class Span:
    """A run of text reported with a single language label."""

    text: str
    language: str
    token_type: str
    confidence: float
    start: int
    end: int
    is_foreign: bool = False
    ambiguous_candidates: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the span as a plain dict.

        ``ambiguous_candidates`` is copied so that mutating the returned dict
        cannot alter this span.
        """
        return {
            "text": self.text,
            "language": self.language,
            "token_type": self.token_type,
            "confidence": self.confidence,
            "start": self.start,
            "end": self.end,
            "is_foreign": self.is_foreign,
            "ambiguous_candidates": list(self.ambiguous_candidates),
        }
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
@dataclass
class PolyStringResult:
    """Result of analysing one piece of text."""

    text: str
    spans: list[Span]
    tokens: list[Token] | None  # None when per-token data was not requested
    languages: set[str]
    dominant_language: str
    is_mixed: bool
    confidence: float

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dict.

        ``languages`` is emitted sorted so the output is deterministic.
        A ``"tokens"`` entry is included only when ``self.tokens`` is set.
        """
        out: dict[str, Any] = {
            "text": self.text,
            "spans": [s.to_dict() for s in self.spans],
            "languages": sorted(self.languages),
            "dominant_language": self.dominant_language,
            "is_mixed": self.is_mixed,
            "confidence": self.confidence,
        }
        if self.tokens is not None:
            out["tokens"] = [t.to_dict() for t in self.tokens]
        return out

    def to_dataframe(self) -> pd.DataFrame:
        """Return one DataFrame row per span.

        Raises
        ------
        ImportError
            If pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError as e:
            raise ImportError(
                "pandas is required: pip install polystring[pandas]"
            ) from e
        return pd.DataFrame([s.to_dict() for s in self.spans])

    def highlight(self) -> str:
        """Return the spans joined by spaces, each ANSI-coloured and tagged.

        Colours are assigned per language in order of first appearance and
        wrap around once the palette is exhausted.
        """
        palette = [
            "\033[91m", "\033[92m", "\033[93m", "\033[94m",
            "\033[95m", "\033[96m", "\033[97m",
        ]
        reset = "\033[0m"
        lang_colour: dict[str, str] = {}
        parts: list[str] = []
        for span in self.spans:
            if span.language not in lang_colour:
                # len(lang_colour) is the number of languages seen so far.
                lang_colour[span.language] = palette[len(lang_colour) % len(palette)]
            parts.append(
                f"{lang_colour[span.language]}[{span.language}]{span.text}{reset}"
            )
        return " ".join(parts)

    def linguistic_spans(self) -> list[Span]:
        """Return the spans whose type is not url/mention/hashtag/emoji/num."""
        non_linguistic = {"url", "mention", "hashtag", "emoji", "num"}
        return [s for s in self.spans if s.token_type not in non_linguistic]
|
polystring/_ngram.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Character n-gram language scorer for low-resource romanised languages.
|
|
2
|
+
|
|
3
|
+
Profiles are loaded lazily, once, on the first call to score() or available_languages(), from pre-built JSON files in polystring/data/.
|
|
4
|
+
Used in stage 3 between lexicon lookup and lingua for languages where lingua
|
|
5
|
+
has insufficient training data (ur-Latn, tl, sw).
|
|
6
|
+
|
|
7
|
+
Architecture: discriminative hit-count scoring.
|
|
8
|
+
|
|
9
|
+
Each profile contains only *discriminative* n-grams: n-grams that appear
|
|
10
|
+
significantly more often in that language than in all competitor languages
|
|
11
|
+
(other target languages + English background), as determined at build time
|
|
12
|
+
by a log-prob margin threshold.
|
|
13
|
+
|
|
14
|
+
At inference time we count how many of a token's n-grams match each
|
|
15
|
+
language's discriminative profile. The language with the most hits wins,
|
|
16
|
+
provided it leads the runner-up by at least _MIN_GAP_HITS and has at least
|
|
17
|
+
_MIN_HITS total. Ties are broken by the average log-prob of matched n-grams.
|
|
18
|
+
|
|
19
|
+
This avoids the cross-contamination problem that affects plain LLR scoring:
|
|
20
|
+
because the profiles are pre-filtered to exclude n-grams shared across
|
|
21
|
+
ur-Latn/tl/sw, an Urdu word cannot "accidentally" accumulate Tagalog hits.
|
|
22
|
+
"""
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import json
|
|
26
|
+
import re
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
|
|
29
|
+
_DATA_DIR = Path(__file__).parent / "data"
|
|
30
|
+
|
|
31
|
+
NGRAM_LANGUAGES: frozenset[str] = frozenset({"ur-Latn", "tl", "sw"})
|
|
32
|
+
|
|
33
|
+
# Scoring thresholds (tuned empirically on build corpora test words)
|
|
34
|
+
_MIN_HITS = 2 # winner must have at least this many discriminative n-gram hits
|
|
35
|
+
_MIN_GAP_HITS = 1 # winner must lead runner-up by at least this many hits
|
|
36
|
+
_MIN_TOKEN_LEN = 4 # tokens shorter than this are not scored (too noisy)
|
|
37
|
+
|
|
38
|
+
_CLEAN = re.compile(r"[^a-z'\-]")
|
|
39
|
+
|
|
40
|
+
_MODELS: dict[str, dict[str, dict[str, float]]] = {}
|
|
41
|
+
_NGRAM_SIZES: dict[str, list[int]] = {}
|
|
42
|
+
_LOADED = False
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _load() -> None:
    """Populate ``_MODELS`` and ``_NGRAM_SIZES`` from the JSON profile files.

    Runs at most once per process; later calls return immediately. Languages
    whose profile file is missing are skipped. A language is registered only
    after both of its payload keys have been read, so a malformed payload can
    no longer leave ``_MODELS`` holding an entry that ``_NGRAM_SIZES`` lacks.
    """
    global _LOADED
    if _LOADED:
        return

    for lang in NGRAM_LANGUAGES:
        # "ur-Latn" is stored on disk as "ur_Latn_ngram.json".
        fname = _DATA_DIR / f"{lang.replace('-', '_')}_ngram.json"
        if not fname.exists():
            continue
        payload = json.loads(fname.read_text(encoding="utf-8"))
        profile = payload["profile"]
        sizes = payload["ngram_sizes"]
        _MODELS[lang] = profile
        _NGRAM_SIZES[lang] = sizes

    _LOADED = True
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _hit_score(
|
|
62
|
+
cleaned: str, profile: dict[str, dict[str, float]], sizes: list[int]
|
|
63
|
+
) -> tuple[int, float]:
|
|
64
|
+
"""Count discriminative n-gram hits and sum their log-probs.
|
|
65
|
+
|
|
66
|
+
Returns (hit_count, avg_log_prob_of_hits) where avg is 0 when hit_count=0.
|
|
67
|
+
"""
|
|
68
|
+
hit_count = 0
|
|
69
|
+
lp_sum = 0.0
|
|
70
|
+
|
|
71
|
+
for n in sizes:
|
|
72
|
+
table = profile[str(n)]
|
|
73
|
+
padded = f"{'_' * (n - 1)}{cleaned}{'_' * (n - 1)}"
|
|
74
|
+
for i in range(len(padded) - n + 1):
|
|
75
|
+
ng = padded[i:i + n]
|
|
76
|
+
v = table.get(ng)
|
|
77
|
+
if v is not None:
|
|
78
|
+
hit_count += 1
|
|
79
|
+
lp_sum += v
|
|
80
|
+
|
|
81
|
+
avg_lp = lp_sum / hit_count if hit_count > 0 else 0.0
|
|
82
|
+
return hit_count, avg_lp
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def score(
    token: str, candidates: frozenset[str] | None = None
) -> tuple[str, float] | None:
    """Pick the n-gram language whose profile matches ``token`` best.

    Every eligible language is scored with ``_hit_score``; the one with the
    most hits wins, subject to the ``_MIN_HITS`` and ``_MIN_GAP_HITS``
    thresholds.

    Parameters
    ----------
    token:
        Raw token text. It is lower-cased, stripped of everything except
        ``a-z``, apostrophe and hyphen, and trimmed of leading/trailing
        apostrophes and hyphens before scoring.
    candidates:
        When given, only languages in this set that also have a loaded model
        are scored.

    Returns
    -------
    (lang, confidence) with confidence clamped to [0.60, 0.95], or None when
    no model is loaded, the cleaned token is shorter than ``_MIN_TOKEN_LEN``,
    no candidate has a model, or no language wins by the required margin.
    """
    _load()
    if not _MODELS:
        return None

    cleaned = _CLEAN.sub("", token.lower()).strip("-'")
    if len(cleaned) < _MIN_TOKEN_LEN:
        return None

    eligible = set(_MODELS.keys())
    if candidates is not None:
        eligible &= candidates
    if not eligible:
        return None

    ranked: list[tuple[int, float, str]] = []
    for lang in eligible:
        hits, avg_lp = _hit_score(cleaned, _MODELS[lang], _NGRAM_SIZES[lang])
        ranked.append((hits, avg_lp, lang))

    # Most hits first; equal hit counts are ordered by average log-prob.
    ranked.sort(key=lambda entry: (entry[0], entry[1]), reverse=True)
    top_hits, _, top_lang = ranked[0]

    if top_hits < _MIN_HITS:
        return None

    has_rival = len(ranked) > 1
    if has_rival and (top_hits - ranked[1][0]) < _MIN_GAP_HITS:
        return None

    # Linear map from hits to confidence: 2 hits -> 0.66, 10 hits -> 0.90,
    # capped at 0.95 (reached from 12 hits).
    confidence = max(0.60, min(0.95, 0.60 + top_hits * 0.03))
    return top_lang, confidence
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def available_languages() -> frozenset[str]:
    """Return the languages whose n-gram profile was found and loaded."""
    _load()
    loaded = frozenset(_MODELS)
    return loaded
|
|
File without changes
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
import unicodedata
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
import regex as _regex
|
|
8
|
+
|
|
9
|
+
_SPECIAL_PATTERNS: list[tuple[str, re.Pattern]] = [
|
|
10
|
+
("url", re.compile(r"https?://\S+|www\.\S+")),
|
|
11
|
+
("mention", re.compile(r"@\w+")),
|
|
12
|
+
("hashtag", re.compile(r"#\w+")),
|
|
13
|
+
("emoji", _regex.compile(
|
|
14
|
+
"[\U0001F600-\U0001F64F"
|
|
15
|
+
"\U0001F300-\U0001F5FF"
|
|
16
|
+
"\U0001F680-\U0001F6FF"
|
|
17
|
+
"\U0001F1E0-\U0001F1FF"
|
|
18
|
+
"\U00002702-\U000027B0"
|
|
19
|
+
"\U0001F900-\U0001F9FF"
|
|
20
|
+
"☀-⛿"
|
|
21
|
+
"✀-➿]+"
|
|
22
|
+
)),
|
|
23
|
+
("num", re.compile(r"\b\d+[a-zA-Z]*\b|\b[a-zA-Z]*\d+\b")),
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class SpecialToken:
    """A non-linguistic token (URL, mention, hashtag, emoji or number)."""

    text: str
    token_type: str  # one of: url, mention, hashtag, emoji, num
    start: int  # match start offset in the scanned text
    end: int    # match end offset (exclusive)
    # Not set by the extraction step; presumably filled in by a later
    # stage for hashtags -- TODO confirm.
    hashtag_lang: str | None = None
    hashtag_confidence: float = 0.0
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class RawToken:
    """A linguistic token produced by stage 1, before any classification."""

    text: str
    start: int  # start offset of the token
    end: int    # end offset (exclusive)
    is_ne_candidate: bool = False  # result of the named-entity heuristic
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class Stage1Result:
    """Everything stage 1 hands to the rest of the pipeline."""

    linguistic_tokens: list[RawToken]
    special_tokens: list[SpecialToken]
    # The text that token offsets refer to: the NFC-normalised input, or the
    # input unchanged when normalisation was skipped.
    normalized_text: str
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _nfc(text: str) -> str:
|
|
53
|
+
return unicodedata.normalize("NFC", text)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _extract_special_tokens(text: str) -> tuple[list[SpecialToken], str]:
    """Extract special tokens and blank them out of the text.

    Patterns are tried in the order listed in ``_SPECIAL_PATTERNS``. A match
    is skipped when any character it covers already belongs to a previously
    accepted special token, so accepted tokens never overlap.

    Returns (special_tokens, masked_text). ``special_tokens`` is sorted by
    start offset. ``masked_text`` has the same length as ``text``, with every
    character of an accepted token replaced by a space, so character offsets
    stay valid and downstream tokenisation still splits correctly.
    """
    specials: list[SpecialToken] = []
    chars = list(text)
    # covered[i] is 1 once character i belongs to an accepted special token.
    covered = bytearray(len(chars))

    for tok_type, pattern in _SPECIAL_PATTERNS:
        for m in pattern.finditer(text):
            start, end = m.span()
            # Check the whole span, not just its first character: a match can
            # begin on a free character and still run into an earlier token.
            if any(covered[start:end]):
                continue
            specials.append(SpecialToken(
                text=m.group(),
                token_type=tok_type,
                start=start,
                end=end,
            ))
            width = end - start
            covered[start:end] = b"\x01" * width
            chars[start:end] = " " * width

    specials.sort(key=lambda s: s.start)
    return specials, "".join(chars)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _tokenize(masked_text: str) -> list[tuple[str, int, int]]:
|
|
86
|
+
"""Split masked text into (token, start, end) by whitespace and punctuation."""
|
|
87
|
+
tokens: list[tuple[str, int, int]] = []
|
|
88
|
+
for m in re.finditer(r"\S+", masked_text):
|
|
89
|
+
token_text = m.group()
|
|
90
|
+
stripped = token_text.strip(".,!?;:\"'()[]{}")
|
|
91
|
+
if not stripped:
|
|
92
|
+
continue
|
|
93
|
+
offset = token_text.index(stripped[0]) if stripped else 0
|
|
94
|
+
end = m.start() + offset + len(stripped)
|
|
95
|
+
tokens.append((stripped, m.start() + offset, end))
|
|
96
|
+
return tokens
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _is_ne_candidate(token: str, idx: int, tokens: list[tuple[str, int, int]]) -> bool:
|
|
100
|
+
"""True if a mid-sentence capitalised token that may be a named entity."""
|
|
101
|
+
if idx == 0:
|
|
102
|
+
return False
|
|
103
|
+
if not token[0].isupper():
|
|
104
|
+
return False
|
|
105
|
+
if idx > 0:
|
|
106
|
+
prev = tokens[idx - 1][0]
|
|
107
|
+
if prev.endswith((".", "!", "?")):
|
|
108
|
+
return False
|
|
109
|
+
return True
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def run(
|
|
113
|
+
text: str,
|
|
114
|
+
normalize: bool = True,
|
|
115
|
+
) -> Stage1Result:
|
|
116
|
+
"""Stage 1: extract special tokens, NFC normalise, tokenise, tag NE candidates."""
|
|
117
|
+
normalized = _nfc(text) if normalize else text
|
|
118
|
+
specials, masked = _extract_special_tokens(normalized)
|
|
119
|
+
|
|
120
|
+
raw_tokens_raw = _tokenize(masked)
|
|
121
|
+
linguistic_tokens: list[RawToken] = []
|
|
122
|
+
for idx, (tok, start, end) in enumerate(raw_tokens_raw):
|
|
123
|
+
if not tok.strip():
|
|
124
|
+
continue
|
|
125
|
+
is_ne = _is_ne_candidate(tok, idx, raw_tokens_raw)
|
|
126
|
+
linguistic_tokens.append(
|
|
127
|
+
RawToken(text=tok, start=start, end=end, is_ne_candidate=is_ne)
|
|
128
|
+
)
|
|
129
|
+
|
|
130
|
+
return Stage1Result(
|
|
131
|
+
linguistic_tokens=linguistic_tokens,
|
|
132
|
+
special_tokens=specials,
|
|
133
|
+
normalized_text=normalized,
|
|
134
|
+
)
|