PyPI - polystring - Versions diffs - 0.1.0__py3-none-any.whl - Mend

polystring 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

polystring/__init__.py +29 -0
polystring/_analyzer.py +133 -0
polystring/_detector.py +74 -0
polystring/_exceptions.py +17 -0
polystring/_models.py +106 -0
polystring/_ngram.py +144 -0
polystring/_pipeline/__init__.py +0 -0
polystring/_pipeline/stage1_preprocess.py +134 -0
polystring/_pipeline/stage2_script.py +104 -0
polystring/_pipeline/stage3_classify.py +176 -0
polystring/_pipeline/stage4_context.py +108 -0
polystring/_pipeline/stage5_merge.py +138 -0
polystring/data/_background_ngram.json +1 -0
polystring/data/sw_ngram.json +1 -0
polystring/data/tl_ngram.json +1 -0
polystring/data/ur_Latn_ngram.json +1 -0
polystring/lexicons/__init__.py +116 -0
polystring/lexicons/french.py +113 -0
polystring/lexicons/german.py +111 -0
polystring/lexicons/italian.py +113 -0
polystring/lexicons/portuguese.py +117 -0
polystring/lexicons/roman_urdu.py +130 -0
polystring/lexicons/spanish.py +111 -0
polystring/lexicons/swahili.py +89 -0
polystring/lexicons/tagalog.py +100 -0
polystring/lexicons/turkish.py +87 -0
polystring/py.typed +0 -0
polystring-0.1.0.dist-info/METADATA +257 -0
polystring-0.1.0.dist-info/RECORD +31 -0
polystring-0.1.0.dist-info/WHEEL +4 -0
polystring-0.1.0.dist-info/licenses/LICENSE +21 -0

polystring/_pipeline/stage2_script.py ADDED Viewed

@@ -0,0 +1,104 @@
+from __future__ import annotations
+from polystring._models import Token
+from polystring._pipeline.stage1_preprocess import RawToken
+# Unicode block -> language code (or comma-separated candidates)
+# Ordered by frequency in social-media mixed text
+_BLOCK_MAP: list[tuple[range, str]] = [
+    # Perso-Arabic
+    (range(0x0600, 0x06FF + 1), "ar"),   # will be refined later if needed
+    (range(0x0750, 0x077F + 1), "ar"),
+    (range(0xFB50, 0xFDFF + 1), "ar"),
+    (range(0xFE70, 0xFEFF + 1), "ar"),
+    # Devanagari
+    (range(0x0900, 0x097F + 1), "hi"),
+    # Bengali
+    (range(0x0980, 0x09FF + 1), "bn"),
+    # Gurmukhi (Punjabi)
+    (range(0x0A00, 0x0A7F + 1), "pa"),
+    # Gujarati
+    (range(0x0A80, 0x0AFF + 1), "gu"),
+    # Tamil
+    (range(0x0B80, 0x0BFF + 1), "ta"),
+    # Telugu
+    (range(0x0C00, 0x0C7F + 1), "te"),
+    # Kannada
+    (range(0x0C80, 0x0CFF + 1), "kn"),
+    # Malayalam
+    (range(0x0D00, 0x0D7F + 1), "ml"),
+    # Sinhala
+    (range(0x0D80, 0x0DFF + 1), "si"),
+    # Thai
+    (range(0x0E00, 0x0E7F + 1), "th"),
+    # Georgian
+    (range(0x10A0, 0x10FF + 1), "ka"),
+    # Hangul (Korean)
+    (range(0xAC00, 0xD7AF + 1), "ko"),
+    (range(0x1100, 0x11FF + 1), "ko"),
+    # CJK Unified
+    (range(0x4E00, 0x9FFF + 1), "zh"),
+    (range(0x3400, 0x4DBF + 1), "zh"),
+    (range(0x20000, 0x2A6DF + 1), "zh"),
+    # Hiragana / Katakana -> Japanese
+    (range(0x3040, 0x309F + 1), "ja"),
+    (range(0x30A0, 0x30FF + 1), "ja"),
+    # Cyrillic
+    (range(0x0400, 0x04FF + 1), "ru"),
+    (range(0x0500, 0x052F + 1), "ru"),
+    # Greek
+    (range(0x0370, 0x03FF + 1), "el"),
+    # Hebrew
+    (range(0x0590, 0x05FF + 1), "he"),
+    # Armenian
+    (range(0x0530, 0x058F + 1), "hy"),
+    # Ethiopic
+    (range(0x1200, 0x137F + 1), "am"),
+]
+def _script_of(char: str) -> str | None:
+    """Return language hint if char is in a non-Latin, non-ASCII block."""
+    cp = ord(char)
+    for block, lang in _BLOCK_MAP:
+        if cp in block:
+            return lang
+    return None
+def _dominant_script(text: str) -> str | None:
+    """Return the dominant non-Latin script language for a token, or None."""
+    counts: dict[str, int] = {}
+    for ch in text:
+        lang = _script_of(ch)
+        if lang:
+            counts[lang] = counts.get(lang, 0) + 1
+    if not counts:
+        return None
+    return max(counts, key=lambda k: counts[k])
+def run(tokens: list[RawToken]) -> tuple[list[Token], list[RawToken]]:
+    """Stage 2: classify non-Latin tokens immediately; pass Latin ones forward.
+    Returns (classified_tokens, latin_tokens).
+    Classified tokens have language set to the script-inferred code.
+    """
+    classified: list[Token] = []
+    latin: list[RawToken] = []
+    for rt in tokens:
+        lang = _dominant_script(rt.text)
+        if lang is not None:
+            classified.append(Token(
+                text=rt.text,
+                language=lang,
+                token_type="text",
+                confidence=0.99,
+                start=rt.start,
+                end=rt.end,
+            ))
+        else:
+            latin.append(rt)
+    return classified, latin

polystring/_pipeline/stage3_classify.py ADDED Viewed

@@ -0,0 +1,176 @@
+from __future__ import annotations
+from polystring._detector import lingua_top2
+from polystring._models import Token
+from polystring._ngram import NGRAM_LANGUAGES
+from polystring._ngram import score as ngram_score
+from polystring._pipeline.stage1_preprocess import RawToken
+from polystring.lexicons import lexicon_lookup
+NEAR_IDENTICAL_PAIRS: frozenset[frozenset[str]] = frozenset({
+    frozenset({"es", "pt"}),
+    frozenset({"es", "it"}),
+    frozenset({"pt", "it"}),
+    frozenset({"nb", "da"}),
+    frozenset({"nb", "sv"}),
+    frozenset({"da", "sv"}),
+    frozenset({"id", "ms"}),
+    frozenset({"hr", "sr"}),
+    frozenset({"bs", "hr"}),
+})
+_CONFIDENCE_GAP = 0.15
+_MIN_CONFIDENCE = 0.70
+_WINDOW = 4
+# Languages for which lingua adds noise rather than signal.  For these we use
+# the n-gram model as primary classifier and skip lingua entirely.
+_LINGUA_SKIP = NGRAM_LANGUAGES   # {"ur-Latn", "tl", "sw"}
+def _window_text(tokens: list[RawToken], idx: int) -> str:
+    half = _WINDOW // 2
+    start = max(0, idx - half)
+    end = min(len(tokens), idx + half + 1)
+    return " ".join(t.text for t in tokens[start:end])
+def _is_near_identical(lang1: str, lang2: str) -> bool:
+    return frozenset({lang1, lang2}) in NEAR_IDENTICAL_PAIRS
+def run(
+    latin_tokens: list[RawToken],
+    languages_hint: frozenset[str] | None = None,
+    min_confidence: float = _MIN_CONFIDENCE,
+) -> list[Token]:
+    result: list[Token] = []
+    # Determine which n-gram languages are in scope given the caller's hint.
+    # If the caller restricted to e.g. ["es", "en"], n-gram languages not in
+    # that set are excluded from scoring.
+    if languages_hint is not None:
+        ngram_candidates: frozenset[str] | None = languages_hint & NGRAM_LANGUAGES
+        # If the hint contains no n-gram languages, pass None so ngram_score
+        # knows to skip rather than returning wrong results.
+        if not ngram_candidates:
+            ngram_candidates = None
+    else:
+        ngram_candidates = None   # no restriction → scorer uses all loaded models
+    for idx, rt in enumerate(latin_tokens):
+        # ------------------------------------------------------------------
+        # Step 1: lexicon lookup (fastest path, highest precision)
+        # ------------------------------------------------------------------
+        lex = lexicon_lookup(rt.text)
+        if lex is not None:
+            lang, conf = lex
+            if lang == "amb":
+                result.append(Token(
+                    text=rt.text,
+                    language="amb",
+                    token_type="text",
+                    confidence=0.0,
+                    start=rt.start,
+                    end=rt.end,
+                    ambiguous_candidates=[],
+                ))
+                continue
+            tok = Token(
+                text=rt.text,
+                language=lang,
+                token_type="text",
+                confidence=conf,
+                start=rt.start,
+                end=rt.end,
+            )
+            _maybe_mark_ne(tok, rt)
+            result.append(tok)
+            continue
+        # ------------------------------------------------------------------
+        # Step 2: n-gram model (covers ur-Latn, tl, sw)
+        # ------------------------------------------------------------------
+        ng = ngram_score(rt.text, ngram_candidates)
+        if ng is not None:
+            lang, conf = ng
+            if conf >= min_confidence:
+                tok = Token(
+                    text=rt.text,
+                    language=lang,
+                    token_type="text",
+                    confidence=conf,
+                    start=rt.start,
+                    end=rt.end,
+                )
+                _maybe_mark_ne(tok, rt)
+                result.append(tok)
+                continue
+        # ------------------------------------------------------------------
+        # Step 3: lingua (for all other Latin-script languages)
+        # Skip lingua entirely for tokens whose only candidate n-gram
+        # languages are in _LINGUA_SKIP — lingua will misclassify them.
+        # ------------------------------------------------------------------
+        skip_lingua = False
+        if languages_hint is not None and languages_hint.issubset(_LINGUA_SKIP):
+            skip_lingua = True
+        if skip_lingua:
+            result.append(Token(
+                text=rt.text,
+                language="und",
+                token_type="text",
+                confidence=0.0,
+                start=rt.start,
+                end=rt.end,
+            ))
+            continue
+        window = _window_text(latin_tokens, idx)
+        top2 = lingua_top2(window, languages_hint)
+        lang, conf = (top2[0][0], top2[0][1]) if top2 else ("und", 0.0)
+        if len(top2) >= 2:
+            l1, c1 = top2[0]
+            l2, c2 = top2[1]
+            if _is_near_identical(l1, l2) and (c1 - c2) < _CONFIDENCE_GAP:
+                result.append(Token(
+                    text=rt.text,
+                    language="und",
+                    token_type="text",
+                    confidence=0.0,
+                    start=rt.start,
+                    end=rt.end,
+                    ambiguous_candidates=[l1, l2],
+                ))
+                continue
+        if conf < min_confidence or lang == "und":
+            result.append(Token(
+                text=rt.text,
+                language="und",
+                token_type="text",
+                confidence=conf,
+                start=rt.start,
+                end=rt.end,
+            ))
+            continue
+        tok = Token(
+            text=rt.text,
+            language=lang,
+            token_type="text",
+            confidence=conf,
+            start=rt.start,
+            end=rt.end,
+        )
+        _maybe_mark_ne(tok, rt)
+        result.append(tok)
+    return result
+def _maybe_mark_ne(tok: Token, rt: RawToken) -> None:
+    if rt.is_ne_candidate and tok.language not in ("und", "amb"):
+        tok.token_type = "ne-candidate"

polystring/_pipeline/stage4_context.py ADDED Viewed

@@ -0,0 +1,108 @@
+from __future__ import annotations
+from collections import Counter
+from polystring._models import Token
+# Number of tokens inspected on each side when voting on a context language.
+_CONTEXT_WINDOW = 3
+# Cap applied to the confidence of any language assigned from context.
+_UND_MAX_CONFIDENCE = 0.75
+# Token types that are never treated as carrying a confirmed language.
+_NON_LINGUISTIC = {"url", "mention", "hashtag", "emoji", "num"}
+def _confirmed_lang(tok: Token) -> str | None:
+    """Return the token's language if it is a definite linguistic assignment."""
+    if tok.token_type in _NON_LINGUISTIC:
+        return None
+    if tok.language in ("und", "amb", "ne"):
+        return None
+    return tok.language
+def _context_majority(tokens: list[Token], idx: int) -> str | None:
+    """Return the majority confirmed language in a +-CONTEXT_WINDOW radius."""
+    start = max(0, idx - _CONTEXT_WINDOW)
+    end = min(len(tokens), idx + _CONTEXT_WINDOW + 1)
+    langs: list[str] = []
+    for i in range(start, end):
+        if i == idx:
+            continue
+        lang = _confirmed_lang(tokens[i])
+        if lang:
+            langs.append(lang)
+    if not langs:
+        return None
+    counts = Counter(langs)
+    top = counts.most_common(1)[0]
+    return top[0]
+def run(tokens: list[Token]) -> list[Token]:
+    """Stage 4: context-driven correction pass. Mutates tokens in-place."""
+    # 4a. "und" inherits from confident neighbours
+    for idx, tok in enumerate(tokens):
+        if tok.language == "und" and not tok.ambiguous_candidates:
+            majority = _context_majority(tokens, idx)
+            if majority:
+                tok.language = majority
+                tok.confidence = min(
+                    _UND_MAX_CONFIDENCE, tok.confidence or _UND_MAX_CONFIDENCE
+                )
+    # 4b. Single-token language islands absorbed (skip NE candidates — handled in 4e)
+    for idx, tok in enumerate(tokens):
+        if tok.token_type in _NON_LINGUISTIC or tok.token_type == "ne-candidate":
+            continue
+        lang = _confirmed_lang(tok)
+        if lang is None:
+            continue
+        left = _confirmed_lang(tokens[idx - 1]) if idx > 0 else None
+        right = _confirmed_lang(tokens[idx + 1]) if idx < len(tokens) - 1 else None
+        if left and right and left == right and left != lang:
+            tok.language = left
+            tok.confidence = min(_UND_MAX_CONFIDENCE, tok.confidence)
+    # 4c. Near-identical pair resolution via sentence-level prior
+    from polystring._pipeline.stage3_classify import _is_near_identical
+    sentence_langs = [_confirmed_lang(t) for t in tokens if _confirmed_lang(t)]
+    sentence_prior: str | None = None
+    if sentence_langs:
+        sentence_prior = Counter(sentence_langs).most_common(1)[0][0]
+    for tok in tokens:
+        if tok.language == "und" and len(tok.ambiguous_candidates) == 2:
+            l1, l2 = tok.ambiguous_candidates
+            if _is_near_identical(l1, l2) and sentence_prior in (l1, l2):
+                tok.language = sentence_prior  # type: ignore[assignment]
+                tok.confidence = _UND_MAX_CONFIDENCE
+                tok.ambiguous_candidates = []
+    # 4d. "amb" conflict word resolution
+    for idx, tok in enumerate(tokens):
+        if tok.language != "amb":
+            continue
+        majority = _context_majority(tokens, idx)
+        if majority:
+            tok.language = majority
+            tok.confidence = _UND_MAX_CONFIDENCE
+        else:
+            tok.language = "und"
+    # 4e. NE candidate resolution
+    for idx, tok in enumerate(tokens):
+        if tok.token_type != "ne-candidate":
+            continue
+        majority = _context_majority(tokens, idx)
+        if majority and majority != tok.language:
+            # Lingua's assignment conflicts with surrounding context -> proper noun
+            tok.language = "ne"
+            tok.token_type = "ne"
+            tok.confidence = 0.0
+        else:
+            # Consistent with context — keep as text token
+            tok.token_type = "text"
+    # 4f. Remaining "und" kept as-is (honest output)
+    return tokens

polystring/_pipeline/stage5_merge.py ADDED Viewed

@@ -0,0 +1,138 @@
+from __future__ import annotations
+from collections import Counter
+from typing import Literal
+from polystring._models import PolyStringResult, Span, Token
+from polystring._pipeline.stage1_preprocess import SpecialToken
+# Token types skipped when computing the dominant language and foreign flags.
+_NON_LINGUISTIC = {"url", "mention", "hashtag", "emoji", "num"}
+def _tokens_to_spans(tokens: list[Token]) -> list[Span]:
+    if not tokens:
+        return []
+    spans: list[Span] = []
+    cur = tokens[0]
+    merged_text = cur.text
+    merged_start = cur.start
+    merged_end = cur.end
+    conf_sum = cur.confidence
+    conf_count = 1
+    merged_cands = list(cur.ambiguous_candidates)
+    for tok in tokens[1:]:
+        same_lang = tok.language == cur.language
+        same_type = tok.token_type == cur.token_type
+        contiguous = tok.start <= merged_end + 1
+        if same_lang and same_type and contiguous:
+            merged_text = merged_text + " " + tok.text
+            merged_end = tok.end
+            conf_sum += tok.confidence
+            conf_count += 1
+        else:
+            spans.append(Span(
+                text=merged_text,
+                language=cur.language,
+                token_type=cur.token_type,
+                confidence=conf_sum / conf_count,
+                start=merged_start,
+                end=merged_end,
+                ambiguous_candidates=merged_cands,
+            ))
+            cur = tok
+            merged_text = tok.text
+            merged_start = tok.start
+            merged_end = tok.end
+            conf_sum = tok.confidence
+            conf_count = 1
+            merged_cands = list(tok.ambiguous_candidates)
+    spans.append(Span(
+        text=merged_text,
+        language=cur.language,
+        token_type=cur.token_type,
+        confidence=conf_sum / conf_count,
+        start=merged_start,
+        end=merged_end,
+        ambiguous_candidates=merged_cands,
+    ))
+    return spans
+def _insert_special_tokens(
+    spans: list[Span], specials: list[SpecialToken]
+) -> list[Span]:
+    special_spans = [
+        Span(
+            text=st.text,
+            language=st.token_type,
+            token_type=st.token_type,
+            confidence=0.0,
+            start=st.start,
+            end=st.end,
+        )
+        for st in specials
+    ]
+    all_spans = spans + special_spans
+    all_spans.sort(key=lambda s: s.start)
+    return all_spans
+def _compute_dominant(spans: list[Span]) -> str:
+    coverage: Counter[str] = Counter()
+    for span in spans:
+        if span.token_type in _NON_LINGUISTIC or span.language in ("und", "ne"):
+            continue
+        coverage[span.language] += span.end - span.start
+    if not coverage:
+        return "und"
+    return coverage.most_common(1)[0][0]
+def _mark_foreign(spans: list[Span], dominant: str) -> None:
+    for span in spans:
+        not_linguistic = span.token_type not in _NON_LINGUISTIC
+        if not_linguistic and span.language not in ("und", "ne"):
+            span.is_foreign = span.language != dominant
+def _overall_confidence(spans: list[Span]) -> float:
+    linguistic = [
+        s for s in spans if s.token_type == "text" and s.language not in ("und", "ne")
+    ]
+    if not linguistic:
+        return 0.0
+    return sum(s.confidence for s in linguistic) / len(linguistic)
+def run(
+    tokens: list[Token],
+    specials: list[SpecialToken],
+    original_text: str,
+    granularity: Literal["span", "token"] = "span",
+) -> PolyStringResult:
+    spans = _tokens_to_spans(tokens)
+    spans = _insert_special_tokens(spans, specials)
+    dominant = _compute_dominant(spans)
+    _mark_foreign(spans, dominant)
+    languages: set[str] = {
+        s.language for s in spans
+        if s.token_type == "text" and s.language not in ("und", "ne", "amb")
+    }
+    is_mixed = len(languages) > 1
+    confidence = _overall_confidence(spans)
+    return PolyStringResult(
+        text=original_text,
+        spans=spans,
+        tokens=tokens if granularity == "token" else None,
+        languages=languages,
+        dominant_language=dominant,
+        is_mixed=is_mixed,
+        confidence=confidence,
+    )