typeseg 0.2.2__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {typeseg-0.2.2 → typeseg-0.2.3}/PKG-INFO +1 -1
- {typeseg-0.2.2 → typeseg-0.2.3}/pyproject.toml +1 -1
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_postprocess.py +19 -13
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg.egg-info/PKG-INFO +1 -1
- {typeseg-0.2.2 → typeseg-0.2.3}/LICENSE +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/README.md +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/setup.cfg +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/tests/test_cupy_parity.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/tests/test_distribution.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/tests/test_postprocess_perf.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/__init__.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/__main__.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_cli.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_color.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_cupy_backend.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_mamba_kernel.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_numpy_backend.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_onnx_backend.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_options.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_runtime.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_segmentation.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_tokenize.py +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/data/mamba_al.npz +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/data/mamba_al.onnx +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/data/manifest.json +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/data/unet_al.npz +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/data/unet_al.onnx +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg.egg-info/SOURCES.txt +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg.egg-info/dependency_links.txt +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg.egg-info/entry_points.txt +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg.egg-info/requires.txt +0 -0
- {typeseg-0.2.2 → typeseg-0.2.3}/typeseg.egg-info/top_level.txt +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "typeseg"
|
|
7
|
-
version = "0.2.2"
|
|
7
|
+
version = "0.2.3"
|
|
8
8
|
description = "Fine-grained, character-level content-type segmentation for textual inputs (U-Net + Mamba)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "Apache-2.0"
|
|
@@ -64,7 +64,7 @@ def confidence_gate(char_probs: np.ndarray, labels: List[int], threshold: float,
|
|
|
64
64
|
if threshold <= 0.0 or char_probs.size == 0:
|
|
65
65
|
return labels
|
|
66
66
|
maxp = char_probs.max(axis=1)
|
|
67
|
-
return
|
|
67
|
+
return np.where(maxp < threshold, other_index, np.asarray(labels, dtype=np.intp)).tolist()
|
|
68
68
|
|
|
69
69
|
|
|
70
70
|
def normalize_short_runs(labels: List[int], char_probs: np.ndarray, min_run_chars: int,
|
|
@@ -452,21 +452,27 @@ def paired_delimiter_fill(text: str, labels: List[int], char_probs: np.ndarray,
|
|
|
452
452
|
def build_segments(text: str, labels: List[int], char_probs: np.ndarray,
|
|
453
453
|
label_names: List[str], other_index: int, other_label: str) -> Tuple[List[Segment], List[str], List[float]]:
|
|
454
454
|
n = len(text)
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
455
|
+
# Per-char confidence, vectorised. `other` (no probability column) scores
|
|
456
|
+
# 1 - max(known); every other label scores its own column. The old per-char
|
|
457
|
+
# `char_probs[i].max()` Python loop did n separate numpy reductions and was the
|
|
458
|
+
# bottleneck of the whole pipeline -- this is the same result in two reductions.
|
|
459
|
+
if char_probs.size:
|
|
460
|
+
lab_arr = np.asarray(labels, dtype=np.intp)
|
|
461
|
+
is_other = lab_arr == other_index
|
|
462
|
+
known_max = char_probs.max(axis=1) # (n,)
|
|
463
|
+
safe_lab = np.where(is_other, 0, lab_arr) # avoid OOB at other_index
|
|
464
|
+
picked = char_probs[np.arange(n), safe_lab] # (n,) chosen-label prob
|
|
465
|
+
conf_arr = np.where(is_other, 1.0 - known_max, picked).astype(np.float64)
|
|
466
|
+
else:
|
|
467
|
+
conf_arr = np.zeros(n, dtype=np.float64)
|
|
468
|
+
char_conf: List[float] = conf_arr.tolist()
|
|
469
|
+
char_label_names: List[str] = [
|
|
470
|
+
other_label if lab == other_index else label_names[lab] for lab in labels
|
|
471
|
+
]
|
|
466
472
|
|
|
467
473
|
segments: List[Segment] = []
|
|
468
474
|
for s, e, lab in _runs(labels):
|
|
469
475
|
name = other_label if lab == other_index else label_names[lab]
|
|
470
|
-
conf = float(
|
|
476
|
+
conf = float(conf_arr[s:e].mean()) if e > s else 0.0
|
|
471
477
|
segments.append(Segment(start=s, end=e, label=name, confidence=conf, text=text[s:e]))
|
|
472
478
|
return segments, char_label_names, char_conf
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|