typeseg 0.2.2__tar.gz → 0.2.3__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {typeseg-0.2.2 → typeseg-0.2.3}/PKG-INFO +1 -1
  2. {typeseg-0.2.2 → typeseg-0.2.3}/pyproject.toml +1 -1
  3. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_postprocess.py +19 -13
  4. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg.egg-info/PKG-INFO +1 -1
  5. {typeseg-0.2.2 → typeseg-0.2.3}/LICENSE +0 -0
  6. {typeseg-0.2.2 → typeseg-0.2.3}/README.md +0 -0
  7. {typeseg-0.2.2 → typeseg-0.2.3}/setup.cfg +0 -0
  8. {typeseg-0.2.2 → typeseg-0.2.3}/tests/test_cupy_parity.py +0 -0
  9. {typeseg-0.2.2 → typeseg-0.2.3}/tests/test_distribution.py +0 -0
  10. {typeseg-0.2.2 → typeseg-0.2.3}/tests/test_postprocess_perf.py +0 -0
  11. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/__init__.py +0 -0
  12. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/__main__.py +0 -0
  13. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_cli.py +0 -0
  14. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_color.py +0 -0
  15. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_cupy_backend.py +0 -0
  16. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_mamba_kernel.py +0 -0
  17. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_numpy_backend.py +0 -0
  18. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_onnx_backend.py +0 -0
  19. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_options.py +0 -0
  20. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_runtime.py +0 -0
  21. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_segmentation.py +0 -0
  22. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/_tokenize.py +0 -0
  23. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/data/mamba_al.npz +0 -0
  24. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/data/mamba_al.onnx +0 -0
  25. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/data/manifest.json +0 -0
  26. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/data/unet_al.npz +0 -0
  27. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg/data/unet_al.onnx +0 -0
  28. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg.egg-info/SOURCES.txt +0 -0
  29. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg.egg-info/dependency_links.txt +0 -0
  30. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg.egg-info/entry_points.txt +0 -0
  31. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg.egg-info/requires.txt +0 -0
  32. {typeseg-0.2.2 → typeseg-0.2.3}/typeseg.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: typeseg
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Fine-grained, character-level content-type segmentation for textual inputs (U-Net + Mamba).
5
5
  Author: Martin Dallinger
6
6
  License-Expression: Apache-2.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "typeseg"
7
- version = "0.2.2"
7
+ version = "0.2.3"
8
8
  description = "Fine-grained, character-level content-type segmentation for textual inputs (U-Net + Mamba)."
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
@@ -64,7 +64,7 @@ def confidence_gate(char_probs: np.ndarray, labels: List[int], threshold: float,
64
64
  if threshold <= 0.0 or char_probs.size == 0:
65
65
  return labels
66
66
  maxp = char_probs.max(axis=1)
67
- return [other_index if maxp[i] < threshold else lab for i, lab in enumerate(labels)]
67
+ return np.where(maxp < threshold, other_index, np.asarray(labels, dtype=np.intp)).tolist()
68
68
 
69
69
 
70
70
  def normalize_short_runs(labels: List[int], char_probs: np.ndarray, min_run_chars: int,
@@ -452,21 +452,27 @@ def paired_delimiter_fill(text: str, labels: List[int], char_probs: np.ndarray,
452
452
  def build_segments(text: str, labels: List[int], char_probs: np.ndarray,
453
453
  label_names: List[str], other_index: int, other_label: str) -> Tuple[List[Segment], List[str], List[float]]:
454
454
  n = len(text)
455
- char_conf: List[float] = []
456
- char_label_names: List[str] = []
457
- for i in range(n):
458
- lab = labels[i]
459
- if char_probs.size:
460
- known_max = float(char_probs[i].max())
461
- conf = (1.0 - known_max) if lab == other_index else float(char_probs[i, lab])
462
- else:
463
- conf = 0.0
464
- char_conf.append(conf)
465
- char_label_names.append(other_label if lab == other_index else label_names[lab])
455
+ # Per-char confidence, vectorised. `other` (no probability column) scores
456
+ # 1 - max(known); every other label scores its own column. The old per-char
457
+ # `char_probs[i].max()` Python loop did n separate numpy reductions and was the
458
+ # bottleneck of the whole pipeline -- this is the same result in two reductions.
459
+ if char_probs.size:
460
+ lab_arr = np.asarray(labels, dtype=np.intp)
461
+ is_other = lab_arr == other_index
462
+ known_max = char_probs.max(axis=1) # (n,)
463
+ safe_lab = np.where(is_other, 0, lab_arr) # avoid OOB at other_index
464
+ picked = char_probs[np.arange(n), safe_lab] # (n,) chosen-label prob
465
+ conf_arr = np.where(is_other, 1.0 - known_max, picked).astype(np.float64)
466
+ else:
467
+ conf_arr = np.zeros(n, dtype=np.float64)
468
+ char_conf: List[float] = conf_arr.tolist()
469
+ char_label_names: List[str] = [
470
+ other_label if lab == other_index else label_names[lab] for lab in labels
471
+ ]
466
472
 
467
473
  segments: List[Segment] = []
468
474
  for s, e, lab in _runs(labels):
469
475
  name = other_label if lab == other_index else label_names[lab]
470
- conf = float(np.mean(char_conf[s:e])) if e > s else 0.0
476
+ conf = float(conf_arr[s:e].mean()) if e > s else 0.0
471
477
  segments.append(Segment(start=s, end=e, label=name, confidence=conf, text=text[s:e]))
472
478
  return segments, char_label_names, char_conf
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: typeseg
3
- Version: 0.2.2
3
+ Version: 0.2.3
4
4
  Summary: Fine-grained, character-level content-type segmentation for textual inputs (U-Net + Mamba).
5
5
  Author: Martin Dallinger
6
6
  License-Expression: Apache-2.0
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes