shipwright-kit 0.7.0__tar.gz → 0.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {shipwright_kit-0.7.0/shipwright_kit.egg-info → shipwright_kit-0.8.1}/PKG-INFO +7 -8
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/README.md +6 -7
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/pyproject.toml +1 -1
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/__init__.py +1 -1
- shipwright_kit-0.8.1/shipwright_kit/eval/__init__.py +18 -0
- shipwright_kit-0.8.1/shipwright_kit/eval/harness.py +129 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1/shipwright_kit.egg-info}/PKG-INFO +7 -8
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/tests/test_template_wiring.py +8 -2
- shipwright_kit-0.7.0/shipwright_kit/eval/__init__.py +0 -7
- shipwright_kit-0.7.0/shipwright_kit/eval/harness.py +0 -50
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/LICENSE +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/setup.cfg +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/cli.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/config.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/design/__init__.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/design/banner.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/design/console.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/design/glyphs.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/design/output.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/design/palette.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/design/tiers.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/eval/corpus.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/eval/metrics.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/py.typed +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/security/__init__.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/security/eval.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/security/injection.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit/security/theme.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit.egg-info/SOURCES.txt +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit.egg-info/dependency_links.txt +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit.egg-info/entry_points.txt +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit.egg-info/requires.txt +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/shipwright_kit.egg-info/top_level.txt +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/tests/test_cli.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/tests/test_config.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/tests/test_packaging.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/tests/test_packs_entrypoint.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/tests/test_release_config.py +0 -0
- {shipwright_kit-0.7.0 → shipwright_kit-0.8.1}/tests/test_tooling.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: shipwright-kit
|
|
3
|
-
Version: 0.7.0
|
|
3
|
+
Version: 0.8.1
|
|
4
4
|
Summary: Shipwright — AI-agent dev framework + import-light design/eval/security library
|
|
5
5
|
Author: Christian Huhn
|
|
6
6
|
License-Expression: MIT
|
|
@@ -51,19 +51,18 @@ The library is consumed today by two real tools: **barb** and **sift** both impo
|
|
|
51
51
|
|
|
52
52
|
## Install
|
|
53
53
|
|
|
54
|
-
The
|
|
55
|
-
|
|
56
|
-
**`shipwright_kit`**.
|
|
54
|
+
The bare name `shipwright` belongs to an unrelated project on PyPI, so the
|
|
55
|
+
published distribution is **`shipwright-kit`** and the import name is
|
|
56
|
+
**`shipwright_kit`**.
|
|
57
57
|
|
|
58
58
|
```bash
|
|
59
|
-
uv pip install "
|
|
59
|
+
uv pip install "shipwright-kit>=0.7,<0.8"
|
|
60
60
|
# then: import shipwright_kit
|
|
61
61
|
```
|
|
62
62
|
|
|
63
63
|
> [!NOTE]
|
|
64
|
-
>
|
|
65
|
-
>
|
|
66
|
-
> shipwright` from PyPI — that is a different, unrelated package.
|
|
64
|
+
> Do **not** `pip install shipwright` from PyPI — that is a different, unrelated
|
|
65
|
+
> package. The correct dist name is `shipwright-kit`.
|
|
67
66
|
|
|
68
67
|
The security pack needs no extra — it ships with the base install and registers
|
|
69
68
|
through the `shipwright_kit.packs` entry point.
|
|
@@ -21,19 +21,18 @@ The library is consumed today by two real tools: **barb** and **sift** both impo
|
|
|
21
21
|
|
|
22
22
|
## Install
|
|
23
23
|
|
|
24
|
-
The
|
|
25
|
-
|
|
26
|
-
**`shipwright_kit`**.
|
|
24
|
+
The bare name `shipwright` belongs to an unrelated project on PyPI, so the
|
|
25
|
+
published distribution is **`shipwright-kit`** and the import name is
|
|
26
|
+
**`shipwright_kit`**.
|
|
27
27
|
|
|
28
28
|
```bash
|
|
29
|
-
uv pip install "
|
|
29
|
+
uv pip install "shipwright-kit>=0.7,<0.8"
|
|
30
30
|
# then: import shipwright_kit
|
|
31
31
|
```
|
|
32
32
|
|
|
33
33
|
> [!NOTE]
|
|
34
|
-
>
|
|
35
|
-
>
|
|
36
|
-
> shipwright` from PyPI — that is a different, unrelated package.
|
|
34
|
+
> Do **not** `pip install shipwright` from PyPI — that is a different, unrelated
|
|
35
|
+
> package. The correct dist name is `shipwright-kit`.
|
|
37
36
|
|
|
38
37
|
The security pack needs no extra — it ships with the base install and registers
|
|
39
38
|
through the `shipwright_kit.packs` entry point.
|
|
@@ -7,7 +7,7 @@ build-backend = "setuptools.build_meta"
|
|
|
7
7
|
# PyPI distribution name. The bare `shipwright` is taken on PyPI (unrelated 6si
|
|
8
8
|
# tool), so the dist is `shipwright-kit`; the IMPORT name is `shipwright_kit`.
|
|
9
9
|
name = "shipwright-kit"
|
|
10
|
-
version = "0.7.0"
|
|
10
|
+
version = "0.8.1"
|
|
11
11
|
description = "Shipwright — AI-agent dev framework + import-light design/eval/security library"
|
|
12
12
|
readme = "README.md"
|
|
13
13
|
requires-python = ">=3.11"
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Generic classification eval harness: corpus, metrics, evaluate + gate."""
|
|
2
|
+
|
|
3
|
+
from .corpus import Sample, load_corpus
|
|
4
|
+
from .harness import CorpusDisagreement, CorpusVerifyReport, EvalGateError, evaluate, gate, verify_corpus
|
|
5
|
+
from .metrics import EVAL_SCHEMA_VERSION, EvalResult
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"Sample",
|
|
9
|
+
"load_corpus",
|
|
10
|
+
"EvalResult",
|
|
11
|
+
"EVAL_SCHEMA_VERSION",
|
|
12
|
+
"EvalGateError",
|
|
13
|
+
"evaluate",
|
|
14
|
+
"gate",
|
|
15
|
+
"CorpusDisagreement",
|
|
16
|
+
"CorpusVerifyReport",
|
|
17
|
+
"verify_corpus",
|
|
18
|
+
]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Run a predict function over a corpus and gate the result. Count-and-skip on a
|
|
2
|
+
predict-time exception (faithful to barb — a bad row must not abort the run)."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from collections.abc import Callable
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
|
|
9
|
+
from .corpus import Sample
|
|
10
|
+
from .metrics import EvalResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class EvalGateError(AssertionError):
|
|
14
|
+
"""Raised when an eval result is below the required thresholds."""
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
|
|
18
|
+
class CorpusDisagreement:
|
|
19
|
+
"""A single row where the predictor's output disagrees with the human label."""
|
|
20
|
+
|
|
21
|
+
value: str # the sample input
|
|
22
|
+
label: str # the human-assigned label
|
|
23
|
+
predicted: str # what the predictor returned
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(frozen=True)
|
|
27
|
+
class CorpusVerifyReport:
|
|
28
|
+
"""Result of :func:`verify_corpus`. Stdlib-only, no rich/pyfiglet import."""
|
|
29
|
+
|
|
30
|
+
disagreements: list[CorpusDisagreement]
|
|
31
|
+
total: int
|
|
32
|
+
disagreement_count: int
|
|
33
|
+
|
|
34
|
+
@property
|
|
35
|
+
def clean(self) -> bool:
|
|
36
|
+
"""True when every row agrees — safe to proceed to floor-setting."""
|
|
37
|
+
return self.disagreement_count == 0
|
|
38
|
+
|
|
39
|
+
def summary(self) -> str:
|
|
40
|
+
"""Single-line human-readable summary, suitable for stderr."""
|
|
41
|
+
if self.clean:
|
|
42
|
+
return f"corpus-verify: OK — {self.total} rows, 0 disagreements"
|
|
43
|
+
return f"corpus-verify: FAIL — {self.disagreement_count}/{self.total} rows disagree (label vs prediction)"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def verify_corpus(
|
|
47
|
+
corpus: list[Sample],
|
|
48
|
+
predictor: Callable[[str], str],
|
|
49
|
+
*,
|
|
50
|
+
eq: Callable[[str, str], bool] | None = None,
|
|
51
|
+
) -> CorpusVerifyReport:
|
|
52
|
+
"""Run *predictor* over every labeled row and report label-vs-prediction disagreements.
|
|
53
|
+
|
|
54
|
+
Use this **before** setting a precision/recall floor to catch mislabeled or
|
|
55
|
+
dishonest corpus rows — a predictor that is believed correct is compared
|
|
56
|
+
directly to the human label; rows that differ are flagged.
|
|
57
|
+
|
|
58
|
+
Parameters
|
|
59
|
+
----------
|
|
60
|
+
corpus:
|
|
61
|
+
List of :class:`~shipwright_kit.eval.Sample` objects (input + label pairs).
|
|
62
|
+
predictor:
|
|
63
|
+
Callable that maps an input string to a prediction string. Must be the
|
|
64
|
+
same callable you intend to gate — usually the production classifier.
|
|
65
|
+
eq:
|
|
66
|
+
Optional equality function ``(label, predicted) -> bool``. Defaults to
|
|
67
|
+
plain string equality (``label == predicted``). Supply a custom function
|
|
68
|
+
when the label space differs from the prediction space (e.g. case-folding,
|
|
69
|
+
synonyms, or a mapping dict).
|
|
70
|
+
|
|
71
|
+
Returns
|
|
72
|
+
-------
|
|
73
|
+
CorpusVerifyReport
|
|
74
|
+
Structured report with the full list of disagreements plus a summary count.
|
|
75
|
+
Predictor exceptions on a row are treated as a disagreement (predicted value
|
|
76
|
+
is set to ``"<error>"``).
|
|
77
|
+
"""
|
|
78
|
+
_eq: Callable[[str, str], bool] = eq if eq is not None else (lambda a, b: a == b)
|
|
79
|
+
disagreements: list[CorpusDisagreement] = []
|
|
80
|
+
for sample in corpus:
|
|
81
|
+
try:
|
|
82
|
+
pred = predictor(sample.input)
|
|
83
|
+
except Exception:
|
|
84
|
+
disagreements.append(CorpusDisagreement(sample.input, sample.label, "<error>"))
|
|
85
|
+
continue
|
|
86
|
+
if not _eq(sample.label, pred):
|
|
87
|
+
disagreements.append(CorpusDisagreement(sample.input, sample.label, pred))
|
|
88
|
+
return CorpusVerifyReport(
|
|
89
|
+
disagreements=disagreements,
|
|
90
|
+
total=len(corpus),
|
|
91
|
+
disagreement_count=len(disagreements),
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def evaluate(
|
|
96
|
+
predict_fn: Callable[[str], str],
|
|
97
|
+
corpus: list[Sample],
|
|
98
|
+
*,
|
|
99
|
+
positive_pred: Callable[[str], bool],
|
|
100
|
+
positive_expected: Callable[[str], bool] | None = None,
|
|
101
|
+
) -> EvalResult:
|
|
102
|
+
binarize_expected = positive_expected or positive_pred # default = same-space (Phase B)
|
|
103
|
+
tp = fp = tn = fn = errors = 0
|
|
104
|
+
for sample in corpus:
|
|
105
|
+
try:
|
|
106
|
+
pred = predict_fn(sample.input)
|
|
107
|
+
except Exception: # count-and-skip, surfaced via errors
|
|
108
|
+
errors += 1
|
|
109
|
+
continue
|
|
110
|
+
exp = binarize_expected(sample.label)
|
|
111
|
+
got = positive_pred(pred)
|
|
112
|
+
if exp and got:
|
|
113
|
+
tp += 1
|
|
114
|
+
elif got and not exp:
|
|
115
|
+
fp += 1
|
|
116
|
+
elif exp and not got:
|
|
117
|
+
fn += 1
|
|
118
|
+
else:
|
|
119
|
+
tn += 1
|
|
120
|
+
return EvalResult(tp, fp, tn, fn, errors)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def gate(result: EvalResult, *, min_precision: float, min_recall: float) -> None:
|
|
124
|
+
if result.precision < min_precision:
|
|
125
|
+
raise EvalGateError(f"precision {result.precision:.3f} < {min_precision}")
|
|
126
|
+
if result.recall < min_recall:
|
|
127
|
+
raise EvalGateError(f"recall {result.recall:.3f} < {min_recall}")
|
|
128
|
+
if (result.tp + result.fn) > 0 and result.recall == 0.0:
|
|
129
|
+
raise EvalGateError("zero recall with positives present")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: shipwright-kit
|
|
3
|
-
Version: 0.7.0
|
|
3
|
+
Version: 0.8.1
|
|
4
4
|
Summary: Shipwright — AI-agent dev framework + import-light design/eval/security library
|
|
5
5
|
Author: Christian Huhn
|
|
6
6
|
License-Expression: MIT
|
|
@@ -51,19 +51,18 @@ The library is consumed today by two real tools: **barb** and **sift** both impo
|
|
|
51
51
|
|
|
52
52
|
## Install
|
|
53
53
|
|
|
54
|
-
The
|
|
55
|
-
|
|
56
|
-
**`shipwright_kit`**.
|
|
54
|
+
The bare name `shipwright` belongs to an unrelated project on PyPI, so the
|
|
55
|
+
published distribution is **`shipwright-kit`** and the import name is
|
|
56
|
+
**`shipwright_kit`**.
|
|
57
57
|
|
|
58
58
|
```bash
|
|
59
|
-
uv pip install "
|
|
59
|
+
uv pip install "shipwright-kit>=0.7,<0.8"
|
|
60
60
|
# then: import shipwright_kit
|
|
61
61
|
```
|
|
62
62
|
|
|
63
63
|
> [!NOTE]
|
|
64
|
-
>
|
|
65
|
-
>
|
|
66
|
-
> shipwright` from PyPI — that is a different, unrelated package.
|
|
64
|
+
> Do **not** `pip install shipwright` from PyPI — that is a different, unrelated
|
|
65
|
+
> package. The correct dist name is `shipwright-kit`.
|
|
67
66
|
|
|
68
67
|
The security pack needs no extra — it ships with the base install and registers
|
|
69
68
|
through the `shipwright_kit.packs` entry point.
|
|
@@ -56,7 +56,10 @@ def test_security_preset_installs_security_extra(tmp_path):
|
|
|
56
56
|
text = (proj / "pyproject.toml").read_text()
|
|
57
57
|
req = _shipwright_req(text)
|
|
58
58
|
assert req.extras == set() # security pack ships with base (entry-point); no [security] extra exists
|
|
59
|
-
|
|
59
|
+
# W2: PyPI range pin (>=0.7,<0.8) — url is None, specifier encodes the range
|
|
60
|
+
assert req.url is None
|
|
61
|
+
assert ">=0.7" in str(req.specifier)
|
|
62
|
+
assert "<0.8" in str(req.specifier)
|
|
60
63
|
assert 'preset = "security"' in text
|
|
61
64
|
banner = proj / "acme" / "banner.py"
|
|
62
65
|
assert banner.exists()
|
|
@@ -68,5 +71,8 @@ def test_none_preset_core_only(tmp_path):
|
|
|
68
71
|
text = (proj / "pyproject.toml").read_text()
|
|
69
72
|
req = _shipwright_req(text)
|
|
70
73
|
assert req.extras == set() # no security extra
|
|
71
|
-
|
|
74
|
+
# W2: PyPI range pin (>=0.7,<0.8) — url is None, specifier encodes the range
|
|
75
|
+
assert req.url is None
|
|
76
|
+
assert ">=0.7" in str(req.specifier)
|
|
77
|
+
assert "<0.8" in str(req.specifier)
|
|
72
78
|
assert 'preset = "none"' in text
|
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
"""Generic classification eval harness: corpus, metrics, evaluate + gate."""
|
|
2
|
-
|
|
3
|
-
from .corpus import Sample, load_corpus
|
|
4
|
-
from .harness import EvalGateError, evaluate, gate
|
|
5
|
-
from .metrics import EVAL_SCHEMA_VERSION, EvalResult
|
|
6
|
-
|
|
7
|
-
__all__ = ["Sample", "load_corpus", "EvalResult", "EVAL_SCHEMA_VERSION", "EvalGateError", "evaluate", "gate"]
|
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
"""Run a predict function over a corpus and gate the result. Count-and-skip on a
|
|
2
|
-
predict-time exception (faithful to barb — a bad row must not abort the run)."""
|
|
3
|
-
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
from collections.abc import Callable
|
|
7
|
-
|
|
8
|
-
from .corpus import Sample
|
|
9
|
-
from .metrics import EvalResult
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class EvalGateError(AssertionError):
|
|
13
|
-
"""Raised when an eval result is below the required thresholds."""
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
def evaluate(
|
|
17
|
-
predict_fn: Callable[[str], str],
|
|
18
|
-
corpus: list[Sample],
|
|
19
|
-
*,
|
|
20
|
-
positive_pred: Callable[[str], bool],
|
|
21
|
-
positive_expected: Callable[[str], bool] | None = None,
|
|
22
|
-
) -> EvalResult:
|
|
23
|
-
binarize_expected = positive_expected or positive_pred # default = same-space (Phase B)
|
|
24
|
-
tp = fp = tn = fn = errors = 0
|
|
25
|
-
for sample in corpus:
|
|
26
|
-
try:
|
|
27
|
-
pred = predict_fn(sample.input)
|
|
28
|
-
except Exception: # count-and-skip, surfaced via errors
|
|
29
|
-
errors += 1
|
|
30
|
-
continue
|
|
31
|
-
exp = binarize_expected(sample.label)
|
|
32
|
-
got = positive_pred(pred)
|
|
33
|
-
if exp and got:
|
|
34
|
-
tp += 1
|
|
35
|
-
elif got and not exp:
|
|
36
|
-
fp += 1
|
|
37
|
-
elif exp and not got:
|
|
38
|
-
fn += 1
|
|
39
|
-
else:
|
|
40
|
-
tn += 1
|
|
41
|
-
return EvalResult(tp, fp, tn, fn, errors)
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
def gate(result: EvalResult, *, min_precision: float, min_recall: float) -> None:
|
|
45
|
-
if result.precision < min_precision:
|
|
46
|
-
raise EvalGateError(f"precision {result.precision:.3f} < {min_precision}")
|
|
47
|
-
if result.recall < min_recall:
|
|
48
|
-
raise EvalGateError(f"recall {result.recall:.3f} < {min_recall}")
|
|
49
|
-
if (result.tp + result.fn) > 0 and result.recall == 0.0:
|
|
50
|
-
raise EvalGateError("zero recall with positives present")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|