petri-braintrust 0.2.0__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- petri_braintrust-0.2.0/.github/workflows/ci.yml +52 -0
- petri_braintrust-0.2.0/.github/workflows/publish.yml +53 -0
- petri_braintrust-0.2.0/.gitignore +30 -0
- petri_braintrust-0.2.0/CHANGELOG.md +60 -0
- petri_braintrust-0.2.0/LICENSE +21 -0
- petri_braintrust-0.2.0/PKG-INFO +508 -0
- petri_braintrust-0.2.0/README.md +478 -0
- petri_braintrust-0.2.0/petri_braintrust/__init__.py +49 -0
- petri_braintrust-0.2.0/petri_braintrust/cli.py +459 -0
- petri_braintrust-0.2.0/petri_braintrust/exporter.py +747 -0
- petri_braintrust-0.2.0/petri_braintrust/facets/sycophancy_pattern.md +106 -0
- petri_braintrust-0.2.0/petri_braintrust/mapping.py +375 -0
- petri_braintrust-0.2.0/petri_braintrust/py.typed +0 -0
- petri_braintrust-0.2.0/petri_braintrust/runner.py +346 -0
- petri_braintrust-0.2.0/petri_braintrust/seeds/longhorizon/longhorizon_ops.md +107 -0
- petri_braintrust-0.2.0/petri_braintrust/studies/__init__.py +48 -0
- petri_braintrust-0.2.0/petri_braintrust/studies/analysis.py +254 -0
- petri_braintrust-0.2.0/petri_braintrust/studies/base.py +255 -0
- petri_braintrust-0.2.0/petri_braintrust/studies/long_horizon.py +88 -0
- petri_braintrust-0.2.0/petri_braintrust/studies/observer_effect.py +238 -0
- petri_braintrust-0.2.0/petri_braintrust/topics.py +227 -0
- petri_braintrust-0.2.0/pyproject.toml +62 -0
- petri_braintrust-0.2.0/run_local.py +41 -0
- petri_braintrust-0.2.0/scripts/longhorizon_mvp.sh +148 -0
- petri_braintrust-0.2.0/tests/__init__.py +0 -0
- petri_braintrust-0.2.0/tests/test_exporter.py +268 -0
- petri_braintrust-0.2.0/tests/test_mapping.py +342 -0
- petri_braintrust-0.2.0/tests/test_runner.py +105 -0
- petri_braintrust-0.2.0/tests/test_studies.py +480 -0
- petri_braintrust-0.2.0/tests/test_topics.py +136 -0
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
lint:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
steps:
|
|
12
|
+
- uses: actions/checkout@v4
|
|
13
|
+
|
|
14
|
+
- uses: actions/setup-python@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.12"
|
|
17
|
+
|
|
18
|
+
- name: Install ruff
|
|
19
|
+
run: pip install "ruff>=0.5"
|
|
20
|
+
|
|
21
|
+
- name: Lint
|
|
22
|
+
run: ruff check petri_braintrust tests
|
|
23
|
+
|
|
24
|
+
test:
|
|
25
|
+
runs-on: ubuntu-latest
|
|
26
|
+
strategy:
|
|
27
|
+
fail-fast: false
|
|
28
|
+
matrix:
|
|
29
|
+
python-version: ["3.12", "3.13"]
|
|
30
|
+
|
|
31
|
+
steps:
|
|
32
|
+
- uses: actions/checkout@v4
|
|
33
|
+
|
|
34
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
35
|
+
uses: actions/setup-python@v5
|
|
36
|
+
with:
|
|
37
|
+
python-version: ${{ matrix.python-version }}
|
|
38
|
+
cache: pip
|
|
39
|
+
|
|
40
|
+
- name: Install package + dev extras
|
|
41
|
+
run: pip install -e ".[dev]"
|
|
42
|
+
|
|
43
|
+
- name: Run pytest
|
|
44
|
+
run: pytest -v
|
|
45
|
+
|
|
46
|
+
- name: Smoke-test CLI commands
|
|
47
|
+
run: |
|
|
48
|
+
petri-bt --help
|
|
49
|
+
petri-bt presets
|
|
50
|
+
petri-bt study list
|
|
51
|
+
petri-bt study show observer-effect
|
|
52
|
+
petri-bt topics-init --project smoke --dry-run
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
# Publishes on GitHub release. Uses PyPI trusted publishing (OIDC) — no API
|
|
4
|
+
# token to store. One-time setup on pypi.org: project → Publishing → add
|
|
5
|
+
# GitHub publisher (owner: Paul-UK, repo: petri-braintrust,
|
|
6
|
+
# workflow: publish.yml, environment: pypi).
|
|
7
|
+
|
|
8
|
+
on:
|
|
9
|
+
release:
|
|
10
|
+
types: [published]
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
build:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.12"
|
|
21
|
+
|
|
22
|
+
- name: Build sdist and wheel
|
|
23
|
+
run: |
|
|
24
|
+
python -m pip install build
|
|
25
|
+
python -m build
|
|
26
|
+
|
|
27
|
+
- name: Check version matches the release tag
|
|
28
|
+
run: |
|
|
29
|
+
PKG_VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])")
|
|
30
|
+
TAG="${GITHUB_REF_NAME#v}"
|
|
31
|
+
if [ "$PKG_VERSION" != "$TAG" ]; then
|
|
32
|
+
echo "pyproject version ($PKG_VERSION) != release tag ($TAG)" >&2
|
|
33
|
+
exit 1
|
|
34
|
+
fi
|
|
35
|
+
|
|
36
|
+
- uses: actions/upload-artifact@v4
|
|
37
|
+
with:
|
|
38
|
+
name: dist
|
|
39
|
+
path: dist/
|
|
40
|
+
|
|
41
|
+
publish:
|
|
42
|
+
needs: build
|
|
43
|
+
runs-on: ubuntu-latest
|
|
44
|
+
environment: pypi
|
|
45
|
+
permissions:
|
|
46
|
+
id-token: write # required for PyPI trusted publishing
|
|
47
|
+
steps:
|
|
48
|
+
- uses: actions/download-artifact@v4
|
|
49
|
+
with:
|
|
50
|
+
name: dist
|
|
51
|
+
path: dist/
|
|
52
|
+
|
|
53
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
build/
|
|
7
|
+
dist/
|
|
8
|
+
.eggs/
|
|
9
|
+
|
|
10
|
+
# Inspect eval outputs
|
|
11
|
+
logs/
|
|
12
|
+
*.eval
|
|
13
|
+
|
|
14
|
+
# Virtualenvs
|
|
15
|
+
.venv/
|
|
16
|
+
.venv-test/
|
|
17
|
+
venv/
|
|
18
|
+
|
|
19
|
+
# Editors / OS
|
|
20
|
+
.DS_Store
|
|
21
|
+
.vscode/
|
|
22
|
+
.idea/
|
|
23
|
+
|
|
24
|
+
# Claude Code session state
|
|
25
|
+
.claude/
|
|
26
|
+
|
|
27
|
+
# Article drafts (LinkedIn posts, results write-ups) — repo is for code +
|
|
28
|
+
# commands; published artifacts live elsewhere. Headline numbers from any
|
|
29
|
+
# pilot run live in the README and PR descriptions, not here.
|
|
30
|
+
article/
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to petri-braintrust are documented here. The format
|
|
4
|
+
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/); versions
|
|
5
|
+
follow [SemVer](https://semver.org/) (pre-1.0: minor bumps may be breaking).
|
|
6
|
+
|
|
7
|
+
## [0.2.0] — 2026-06-10
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
- **Studies harness** (`petri_braintrust.studies`): reproducible
|
|
11
|
+
paired-condition research runs — one seed set, one framing manipulation,
|
|
12
|
+
per-condition Braintrust experiments that pair row-by-row in the comparison
|
|
13
|
+
view, plus a local paired report (per-dimension deltas, awareness
|
|
14
|
+
correlations).
|
|
15
|
+
- **Observer-effect study**: ten oversight-neutral agentic-temptation seeds
|
|
16
|
+
run under observed vs unobserved framings; the paired delta is the
|
|
17
|
+
oversight gap. `petri-bt study run observer-effect`.
|
|
18
|
+
- `petri-bt study list / show / run` CLI.
|
|
19
|
+
- Exporter hooks: `seed_transform` (strip condition overlays so arms pair in
|
|
20
|
+
BT diffs) and `extra_metadata` (study/condition labels on every record).
|
|
21
|
+
- Braintrust **Topics** integration: `petri-bt export --mode logs` /
|
|
22
|
+
`export_log_to_logs` send samples to project logs for clustering;
|
|
23
|
+
`petri-bt topics-init` enables a Petri-scoped Topics automation.
|
|
24
|
+
- **Long-horizon study codified**: the ad-hoc 10-vs-100-turn eval is now a
|
|
25
|
+
registered StudySpec (`petri-bt study run long-horizon`) — same pinned
|
|
26
|
+
probe, arms differing only in turn budget.
|
|
27
|
+
- `petri-bt study compare <baseline.eval> <treatment.eval> --study <slug>`:
|
|
28
|
+
re-analyze existing arm logs without re-running (rescues a study whose
|
|
29
|
+
second arm failed after the first burned tokens).
|
|
30
|
+
- Study reports now include **per-seed deltas** and an exact two-sided
|
|
31
|
+
**sign test** per focus dimension, plus provenance stamps
|
|
32
|
+
(`petri_braintrust_version`, `study_spec_fingerprint`) in both the report
|
|
33
|
+
and Braintrust metadata.
|
|
34
|
+
- **Model-agnostic role defaults**: `PETRI_BT_TARGET` / `PETRI_BT_AUDITOR` /
|
|
35
|
+
`PETRI_BT_JUDGE` env vars (precedence: explicit arg > env > preset). The
|
|
36
|
+
all-local warning now recognizes vLLM/SGLang/HF/llama-cpp prefixes, not
|
|
37
|
+
just Ollama.
|
|
38
|
+
- `--verbose` / `-v` global CLI flag — informational logging (export
|
|
39
|
+
progress, resolved roles) is silent otherwise.
|
|
40
|
+
- `py.typed` marker — the package now ships its type information.
|
|
41
|
+
- CI: ruff lint job and broader CLI smoke tests; PyPI trusted-publishing
|
|
42
|
+
workflow on GitHub release.
|
|
43
|
+
|
|
44
|
+
### Changed
|
|
45
|
+
- **Dimension-direction map rebuilt against inspect-petri v3's real
|
|
46
|
+
dimension set**, derived at runtime from `JudgeDimension.palette` when
|
|
47
|
+
inspect-petri is installed (static snapshot as fallback). Neutral
|
|
48
|
+
descriptor dimensions (`fun_or_funny`, `high_stakes_setting`,
|
|
49
|
+
`unprompted_pos_affect`) no longer get meaningless `__inverted` values.
|
|
50
|
+
Stale guessed names (`deception`, `power_seeking`, …) were dropped.
|
|
51
|
+
- **"Did not apply" judge scores (raw 0) are now omitted** from Braintrust
|
|
52
|
+
scores instead of being clipped to 0.0, keeping per-dimension means honest.
|
|
53
|
+
Skipped dimensions are listed under
|
|
54
|
+
`metadata.judge.<scorer>.not_applicable_dimensions`.
|
|
55
|
+
|
|
56
|
+
## [0.1.0] — 2026-05
|
|
57
|
+
|
|
58
|
+
Initial release: run Inspect Petri audits with mixed local (Ollama) / cloud
|
|
59
|
+
model roles and export `.eval` logs to Braintrust experiments with
|
|
60
|
+
per-dimension scores, per-call trace waterfalls, and judge metadata.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Paul Tancre
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|