frugon 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- frugon-0.1.0/.coveragerc-strict +9 -0
- frugon-0.1.0/.gitattributes +19 -0
- frugon-0.1.0/.github/RELEASE_NOTES_v0.1.0.md +77 -0
- frugon-0.1.0/.github/workflows/ci.yml +42 -0
- frugon-0.1.0/.github/workflows/pricing-sync.yml +70 -0
- frugon-0.1.0/.github/workflows/quality-sync.yml +226 -0
- frugon-0.1.0/.github/workflows/release.yml +112 -0
- frugon-0.1.0/.gitignore +45 -0
- frugon-0.1.0/CONTRIBUTING.md +116 -0
- frugon-0.1.0/LICENSE +21 -0
- frugon-0.1.0/PKG-INFO +226 -0
- frugon-0.1.0/README.md +187 -0
- frugon-0.1.0/ROADMAP.md +200 -0
- frugon-0.1.0/assets/demo.gif +0 -0
- frugon-0.1.0/pyproject.toml +152 -0
- frugon-0.1.0/scripts/gen_sample_logs.py +529 -0
- frugon-0.1.0/src/frugon/__init__.py +10 -0
- frugon-0.1.0/src/frugon/_progress.py +324 -0
- frugon-0.1.0/src/frugon/_store.py +185 -0
- frugon-0.1.0/src/frugon/capture.py +503 -0
- frugon-0.1.0/src/frugon/cli.py +1489 -0
- frugon-0.1.0/src/frugon/cost.py +1474 -0
- frugon-0.1.0/src/frugon/data/pricing.json +121 -0
- frugon-0.1.0/src/frugon/data/quality.json +368 -0
- frugon-0.1.0/src/frugon/data/sample_logs.jsonl.gz +0 -0
- frugon-0.1.0/src/frugon/measure.py +1579 -0
- frugon-0.1.0/src/frugon/model_id.py +203 -0
- frugon-0.1.0/src/frugon/pricing.py +571 -0
- frugon-0.1.0/src/frugon/quality.py +833 -0
- frugon-0.1.0/src/frugon/report.py +7840 -0
- frugon-0.1.0/src/frugon/routing.py +312 -0
- frugon-0.1.0/tests/__init__.py +1 -0
- frugon-0.1.0/tests/conftest.py +132 -0
- frugon-0.1.0/tests/test_anthropic_bare_name_pricing.py +541 -0
- frugon-0.1.0/tests/test_candidate_caption.py +266 -0
- frugon-0.1.0/tests/test_candidate_headline_block_agreement.py +354 -0
- frugon-0.1.0/tests/test_candidate_pool.py +62 -0
- frugon-0.1.0/tests/test_candidates_html_styling.py +202 -0
- frugon-0.1.0/tests/test_capture.py +1643 -0
- frugon-0.1.0/tests/test_capture_feedback.py +462 -0
- frugon-0.1.0/tests/test_ci_config.py +103 -0
- frugon-0.1.0/tests/test_cli.py +241 -0
- frugon-0.1.0/tests/test_cli_concurrency_flag.py +176 -0
- frugon-0.1.0/tests/test_cli_key_hint.py +64 -0
- frugon-0.1.0/tests/test_cli_measure_estimate.py +388 -0
- frugon-0.1.0/tests/test_cli_report_formats.py +319 -0
- frugon-0.1.0/tests/test_cli_report_measure_order.py +203 -0
- frugon-0.1.0/tests/test_cli_smoke.py +944 -0
- frugon-0.1.0/tests/test_cli_utf8.py +91 -0
- frugon-0.1.0/tests/test_cost.py +2455 -0
- frugon-0.1.0/tests/test_escalation_ladder.py +192 -0
- frugon-0.1.0/tests/test_escalation_rendering.py +228 -0
- frugon-0.1.0/tests/test_flagship_coverage.py +96 -0
- frugon-0.1.0/tests/test_judge_fallback.py +270 -0
- frugon-0.1.0/tests/test_measure.py +1555 -0
- frugon-0.1.0/tests/test_measure_concurrency.py +739 -0
- frugon-0.1.0/tests/test_measure_cost.py +436 -0
- frugon-0.1.0/tests/test_measure_judge_methodology.py +214 -0
- frugon-0.1.0/tests/test_measure_precheck.py +458 -0
- frugon-0.1.0/tests/test_measure_sampling_error.py +342 -0
- frugon-0.1.0/tests/test_measure_synthesis.py +417 -0
- frugon-0.1.0/tests/test_measure_unknown_model.py +277 -0
- frugon-0.1.0/tests/test_measure_verbose.py +282 -0
- frugon-0.1.0/tests/test_model_id.py +358 -0
- frugon-0.1.0/tests/test_model_id_drift.py +87 -0
- frugon-0.1.0/tests/test_models.py +159 -0
- frugon-0.1.0/tests/test_preview_flags.py +297 -0
- frugon-0.1.0/tests/test_pricing.py +551 -0
- frugon-0.1.0/tests/test_pricing_update.py +242 -0
- frugon-0.1.0/tests/test_pricing_userdir.py +363 -0
- frugon-0.1.0/tests/test_privacy.py +163 -0
- frugon-0.1.0/tests/test_progress.py +376 -0
- frugon-0.1.0/tests/test_quality.py +2468 -0
- frugon-0.1.0/tests/test_quality_aware_recommendation.py +350 -0
- frugon-0.1.0/tests/test_readme.py +63 -0
- frugon-0.1.0/tests/test_report.py +719 -0
- frugon-0.1.0/tests/test_report_dangling_refs.py +361 -0
- frugon-0.1.0/tests/test_report_html.py +478 -0
- frugon-0.1.0/tests/test_report_measure_aware_copy.py +325 -0
- frugon-0.1.0/tests/test_report_measure_section.py +547 -0
- frugon-0.1.0/tests/test_report_parity.py +433 -0
- frugon-0.1.0/tests/test_report_quality_tier.py +354 -0
- frugon-0.1.0/tests/test_report_split.py +1222 -0
- frugon-0.1.0/tests/test_report_split_reconciliation.py +440 -0
- frugon-0.1.0/tests/test_report_tier_drop_wording.py +788 -0
- frugon-0.1.0/tests/test_report_unrated_family.py +270 -0
- frugon-0.1.0/tests/test_report_unrated_family_measure_aware.py +272 -0
- frugon-0.1.0/tests/test_report_v2.py +1023 -0
- frugon-0.1.0/tests/test_report_verdict_status_colour.py +190 -0
- frugon-0.1.0/tests/test_report_wholesale.py +390 -0
- frugon-0.1.0/tests/test_report_wholesale_swap_plan.py +510 -0
- frugon-0.1.0/tests/test_routing.py +494 -0
- frugon-0.1.0/tests/test_routing_share_and_provenance.py +219 -0
- frugon-0.1.0/tests/test_sample_data.py +256 -0
- frugon-0.1.0/tests/test_store.py +518 -0
- frugon-0.1.0/uv.lock +2705 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
# Line-ending normalization — the LF invariant.
|
|
2
|
+
#
|
|
3
|
+
# Frugon writes capture.jsonl and the Markdown/HTML reports with an explicit
|
|
4
|
+
# newline="\n" so artifacts are byte-identical on Windows, macOS, and Linux.
|
|
5
|
+
# This file extends that guarantee to the repository itself: every text file is
|
|
6
|
+
# stored LF-only and checked out LF-only, so a contributor on a misconfigured
|
|
7
|
+
# clone (core.autocrlf=true) cannot reintroduce CRLF drift into tracked sources,
|
|
8
|
+
# fixtures, or golden test data.
|
|
9
|
+
* text=auto eol=lf
|
|
10
|
+
|
|
11
|
+
# Binary assets — never normalize or diff as text.
|
|
12
|
+
*.png binary
|
|
13
|
+
*.jpg binary
|
|
14
|
+
*.jpeg binary
|
|
15
|
+
*.gif binary
|
|
16
|
+
*.ico binary
|
|
17
|
+
*.gz binary
|
|
18
|
+
*.woff binary
|
|
19
|
+
*.woff2 binary
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# frugon v0.1.0
|
|
2
|
+
|
|
3
|
+
> **Your data never leaves your machine. Your keys go straight to your own providers. Nothing reaches us.**
|
|
4
|
+
|
|
5
|
+
Free, local, open-source LLM cost analyzer. Point it at your real call logs and see — on your machine — how much you'd save by switching or routing models.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
# One-shot (no install)
|
|
11
|
+
uvx frugon analyze ./logs.jsonl
|
|
12
|
+
|
|
13
|
+
# Permanent install
|
|
14
|
+
pipx install frugon
|
|
15
|
+
frugon analyze --demo # bundled sample log, see it work in 5 seconds
|
|
16
|
+
frugon analyze ./logs.jsonl # your real logs
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Cross-platform: macOS, Linux, Windows · Python 3.10 / 3.11 / 3.12 / 3.13.
|
|
20
|
+
|
|
21
|
+
## What's in this release
|
|
22
|
+
|
|
23
|
+
**`frugon analyze`** — read OpenAI-compatible JSONL logs and produce a cost analysis. Counts tokens with [tokencost], prices with [LiteLLM's registry], picks a cheaper-than-baseline candidate model, and tells you the dollar saving. Cross-platform, fully local, no LLM calls, no network. Honest savings anchored to [RouteLLM/LMSYS] quality bands — we never inflate the number.
|
|
24
|
+
|
|
25
|
+
**`frugon capture`** — passive OpenAI-compatible logger. Point your app's base URL at `http://127.0.0.1:8787` for a day; it records every call to a local JSONL file in the canonical shape and forwards the request unchanged to the real upstream. No data goes anywhere but your local file and your existing upstream.
|
|
26
|
+
|
|
27
|
+
**`frugon pricing update`** — refresh the bundled pricing table from the [LiteLLM model_prices_and_context_window.json] registry. Atomic write, JSON shape validation, weekly GitHub Actions sync.
|
|
28
|
+
|
|
29
|
+
**`frugon models [QUERY]`** — list the model names frugon can price (the exact names `--candidates` accepts), optionally filtered by a case-insensitive substring. Pure local read — no network.
|
|
30
|
+
|
|
31
|
+
**`frugon quality update`** — refresh the bundled quality-tier table from the LMArena leaderboard so the Strong / Capable tier labels stay current. Tiers are self-recalibrating percentile bands: Elite (top 10%), Strong (10–30%), Capable (30–60%), Efficient (bottom 40%).
|
|
32
|
+
|
|
33
|
+
**`frugon analyze --measure`** *(optional `[measure]` extra)* — sample real prompts through candidate models using **your own** API keys. Calls go straight to your providers (OpenAI / Anthropic / etc.) — never to us. Two tiers: side-by-side diffs (human judge) or LLM-as-judge win/loss/tie tallies.
|
|
34
|
+
|
|
35
|
+
**`frugon analyze --report file.html|file.md`** — shareable single-page report. Self-contained HTML with inline CSS (deep indigo + cyan + silver), or clean Markdown. The viral surface someone shows their boss.
|
|
36
|
+
|
|
37
|
+
## Realistic savings
|
|
38
|
+
|
|
39
|
+
Anchored to [RouteLLM] / [LMSYS] research bands:
|
|
40
|
+
|
|
41
|
+
| Traffic mix | Typical saving |
|
|
42
|
+
|---|---|
|
|
43
|
+
| General mixed traffic | 30–50% |
|
|
44
|
+
| Easy / repetitive (MT-Bench) | up to ~85% |
|
|
45
|
+
| Hard tasks (MMLU) | ~30% |
|
|
46
|
+
|
|
47
|
+
**Your actual number comes from your logs.** Frugon shows what the math says for your data.
|
|
48
|
+
|
|
49
|
+
## Privacy guarantees (tested as code, not promised in prose)
|
|
50
|
+
|
|
51
|
+
- **Cost analysis is fully local.** No LLM, no network, no telemetry.
|
|
52
|
+
- **`capture` never sends data anywhere but your configured upstream.** Asserted at the socket layer in tests — any future regression that introduces a side-channel HTTP client breaks CI.
|
|
53
|
+
- **`--measure` calls only the user's own providers with the user's own keys.** Keys are never logged, never persisted, never sent anywhere but the provider. Asserted by a defense-in-depth fixture patching `socket.socket` / `socket.create_connection` / `socket.getaddrinfo`.
|
|
54
|
+
- **The CLI collects nothing.** Open source — anyone can verify.
|
|
55
|
+
|
|
56
|
+
## Quality
|
|
57
|
+
|
|
58
|
+
- 1,960 tests, 95% overall coverage, 97% on the cost-math triad (`cost.py` / `pricing.py` / `routing.py`).
|
|
59
|
+
- CI green on 3 OS × 4 Python (ubuntu / macos / windows × 3.10 / 3.11 / 3.12 / 3.13).
|
|
60
|
+
- Every change is code-reviewed before merge; cost-math changes get an extra dedicated review pass.
|
|
61
|
+
- ruff + mypy clean.
|
|
62
|
+
|
|
63
|
+
## Keep the savings
|
|
64
|
+
|
|
65
|
+
This release is the diagnosis. Want it to keep routing automatically and hold the savings? → **https://frugon.rodiun.io**
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
Built by [Rodiun]. MIT licensed.
|
|
70
|
+
|
|
71
|
+
[tokencost]: https://github.com/AgentOps-AI/tokencost
|
|
72
|
+
[LiteLLM's registry]: https://github.com/BerriAI/litellm
|
|
73
|
+
[LiteLLM model_prices_and_context_window.json]: https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
|
|
74
|
+
[RouteLLM/LMSYS]: https://github.com/lm-sys/RouteLLM
|
|
75
|
+
[RouteLLM]: https://github.com/lm-sys/RouteLLM
|
|
76
|
+
[LMSYS]: https://lmsys.org/
|
|
77
|
+
[Rodiun]: https://rodiun.io
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: ["main"]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: ["main"]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
name: "Python ${{ matrix.python-version }} / ${{ matrix.os }}"
|
|
12
|
+
runs-on: "${{ matrix.os }}"
|
|
13
|
+
strategy:
|
|
14
|
+
fail-fast: false
|
|
15
|
+
matrix:
|
|
16
|
+
os: [ubuntu-latest, macos-latest, windows-latest]
|
|
17
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
18
|
+
|
|
19
|
+
steps:
|
|
20
|
+
- name: Checkout
|
|
21
|
+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
|
22
|
+
|
|
23
|
+
- name: Set up uv
|
|
24
|
+
uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2: bump by verifying refs/tags/v5.x.y before changing.
|
|
25
|
+
with:
|
|
26
|
+
enable-cache: true
|
|
27
|
+
python-version: "${{ matrix.python-version }}"
|
|
28
|
+
|
|
29
|
+
- name: Install project with dev dependencies
|
|
30
|
+
run: uv sync --extra dev --extra measure --frozen
|
|
31
|
+
|
|
32
|
+
- name: Lint — ruff check
|
|
33
|
+
run: uv run ruff check .
|
|
34
|
+
|
|
35
|
+
- name: Type-check — mypy
|
|
36
|
+
run: uv run mypy src
|
|
37
|
+
|
|
38
|
+
- name: Test — pytest
|
|
39
|
+
run: uv run pytest
|
|
40
|
+
|
|
41
|
+
- name: Test — strict cost coverage
|
|
42
|
+
run: uv run pytest --cov-config=.coveragerc-strict --cov-fail-under=90
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
name: Pricing sync
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
schedule:
|
|
5
|
+
# Every Monday at 06:00 UTC
|
|
6
|
+
- cron: "0 6 * * 1"
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: write
|
|
11
|
+
pull-requests: write
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
sync:
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- name: Checkout
|
|
19
|
+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
|
20
|
+
|
|
21
|
+
- name: Set up Python
|
|
22
|
+
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
|
23
|
+
with:
|
|
24
|
+
python-version: "3.12"
|
|
25
|
+
|
|
26
|
+
- name: Install uv
|
|
27
|
+
uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2
|
|
28
|
+
|
|
29
|
+
- name: Install project
|
|
30
|
+
run: uv sync
|
|
31
|
+
|
|
32
|
+
- name: Update pricing table
|
|
33
|
+
run: |
|
|
34
|
+
uv run python - <<'EOF'
|
|
35
|
+
from frugon.pricing import fetch_and_update_pricing, _LITELLM_REGISTRY_URL
|
|
36
|
+
from datetime import date
|
|
37
|
+
from pathlib import Path
|
|
38
|
+
result = fetch_and_update_pricing(
|
|
39
|
+
_LITELLM_REGISTRY_URL,
|
|
40
|
+
Path("src/frugon/data/pricing.json"),
|
|
41
|
+
date.today().isoformat(),
|
|
42
|
+
)
|
|
43
|
+
print(f"Synced {result['models_synced']} models")
|
|
44
|
+
EOF
|
|
45
|
+
|
|
46
|
+
- name: Check for changes
|
|
47
|
+
id: diff
|
|
48
|
+
run: |
|
|
49
|
+
if git diff --quiet src/frugon/data/pricing.json; then
|
|
50
|
+
echo "changed=false" >> "$GITHUB_OUTPUT"
|
|
51
|
+
else
|
|
52
|
+
echo "changed=true" >> "$GITHUB_OUTPUT"
|
|
53
|
+
fi
|
|
54
|
+
|
|
55
|
+
- name: Open PR if pricing.json changed
|
|
56
|
+
if: steps.diff.outputs.changed == 'true'
|
|
57
|
+
uses: peter-evans/create-pull-request@84ae59a2cdc2258d6fa0732dd66352dddae2a412 # v7.0.9
|
|
58
|
+
with:
|
|
59
|
+
commit-message: "chore(pricing): sync LiteLLM pricing table"
|
|
60
|
+
branch: chore/pricing-sync
|
|
61
|
+
delete-branch: true
|
|
62
|
+
title: "chore(pricing): weekly pricing table sync"
|
|
63
|
+
body: |
|
|
64
|
+
Automated weekly sync of `src/frugon/data/pricing.json` from the
|
|
65
|
+
[LiteLLM registry](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).
|
|
66
|
+
|
|
67
|
+
Review the diff to confirm no unexpected price changes before merging.
|
|
68
|
+
labels: |
|
|
69
|
+
pricing
|
|
70
|
+
automated
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
name: Quality sync
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
schedule:
|
|
5
|
+
# Every Monday at 07:00 UTC (offset from pricing-sync at 06:00)
|
|
6
|
+
- cron: "0 7 * * 1"
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: write
|
|
11
|
+
pull-requests: write
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
sync:
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- name: Checkout
|
|
19
|
+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
|
20
|
+
|
|
21
|
+
- name: Set up Python
|
|
22
|
+
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
|
23
|
+
with:
|
|
24
|
+
python-version: "3.12"
|
|
25
|
+
|
|
26
|
+
- name: Install uv
|
|
27
|
+
uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2
|
|
28
|
+
|
|
29
|
+
- name: Install project
|
|
30
|
+
run: uv sync
|
|
31
|
+
|
|
32
|
+
- name: Capture old quality.json (pre-fetch baseline)
|
|
33
|
+
id: old_seed
|
|
34
|
+
run: |
|
|
35
|
+
# Export the committed version of the seed before the fetch overwrites it.
|
|
36
|
+
# If git show fails (seed never existed), /tmp/quality_old.json is left empty and
|
|
37
|
+
# classify_quality_update receives old=None (first-run path → MINOR).
|
|
38
|
+
if git show HEAD:src/frugon/data/quality.json > /tmp/quality_old.json 2>/dev/null; then
|
|
39
|
+
echo "has_old=true" >> "$GITHUB_OUTPUT"
|
|
40
|
+
else
|
|
41
|
+
echo "has_old=false" >> "$GITHUB_OUTPUT"
|
|
42
|
+
fi
|
|
43
|
+
|
|
44
|
+
- name: Fetch new quality tiers
|
|
45
|
+
id: fetch
|
|
46
|
+
run: |
|
|
47
|
+
uv run python - <<'EOF'
|
|
48
|
+
from frugon.quality import fetch_and_update_quality, _HF_BASE_URL
|
|
49
|
+
from datetime import date
|
|
50
|
+
from pathlib import Path
|
|
51
|
+
result = fetch_and_update_quality(
|
|
52
|
+
_HF_BASE_URL,
|
|
53
|
+
Path("src/frugon/data/quality.json"),
|
|
54
|
+
date.today().isoformat(),
|
|
55
|
+
)
|
|
56
|
+
print(f"Synced {result['models_synced']} models")
|
|
57
|
+
EOF
|
|
58
|
+
|
|
59
|
+
- name: Check for changes
|
|
60
|
+
id: diff
|
|
61
|
+
run: |
|
|
62
|
+
if git diff --quiet src/frugon/data/quality.json; then
|
|
63
|
+
echo "changed=false" >> "$GITHUB_OUTPUT"
|
|
64
|
+
else
|
|
65
|
+
echo "changed=true" >> "$GITHUB_OUTPUT"
|
|
66
|
+
fi
|
|
67
|
+
|
|
68
|
+
- name: Validate and classify update
|
|
69
|
+
id: classify
|
|
70
|
+
if: steps.diff.outputs.changed == 'true'
|
|
71
|
+
run: |
|
|
72
|
+
uv run python - <<'EOF'
|
|
73
|
+
import json
|
|
74
|
+
import os
|
|
75
|
+
import sys
|
|
76
|
+
from pathlib import Path
|
|
77
|
+
|
|
78
|
+
from frugon.quality import (
|
|
79
|
+
VERDICT_INVALID,
|
|
80
|
+
VERDICT_MAJOR,
|
|
81
|
+
VERDICT_MINOR,
|
|
82
|
+
classify_quality_update,
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
# Load the newly fetched seed.
|
|
86
|
+
new_path = Path("src/frugon/data/quality.json")
|
|
87
|
+
with new_path.open(encoding="utf-8") as fh:
|
|
88
|
+
new_data: dict[str, object] = json.load(fh)
|
|
89
|
+
|
|
90
|
+
# Load the old seed (may be absent on first run).
|
|
91
|
+
old_data: dict[str, object] | None = None
|
|
92
|
+
old_path = Path("/tmp/quality_old.json")
|
|
93
|
+
if old_path.exists():
|
|
94
|
+
try:
|
|
95
|
+
with old_path.open(encoding="utf-8") as fh:
|
|
96
|
+
old_data = json.load(fh)
|
|
97
|
+
except json.JSONDecodeError:
|
|
98
|
+
old_data = None # treat corrupt baseline as missing
|
|
99
|
+
|
|
100
|
+
verdict, reason = classify_quality_update(new_data, old_data)
|
|
101
|
+
|
|
102
|
+
# Emit outputs for downstream steps.
|
|
103
|
+
# Use a randomized heredoc delimiter so that fetched content in
|
|
104
|
+
# *reason* cannot escape through the delimiter boundary.
|
|
105
|
+
import secrets
|
|
106
|
+
github_output = os.environ.get("GITHUB_OUTPUT", "")
|
|
107
|
+
if github_output:
|
|
108
|
+
with open(github_output, "a", encoding="utf-8") as fh:
|
|
109
|
+
fh.write(f"verdict={verdict}\n")
|
|
110
|
+
# Reason may contain newlines — use a multiline delimiter.
|
|
111
|
+
# The delimiter is randomized so fetched model names in the
|
|
112
|
+
# reason string cannot terminate it prematurely.
|
|
113
|
+
# Invariant: classify_quality_update reasons must never embed
|
|
114
|
+
# raw fetched model names (enforced in quality.py).
|
|
115
|
+
delimiter = f"EOF_REASON_{secrets.token_hex(8)}"
|
|
116
|
+
fh.write(f"reason<<{delimiter}\n{reason}\n{delimiter}\n")
|
|
117
|
+
|
|
118
|
+
# Emit to step summary for visibility in the Actions UI.
|
|
119
|
+
step_summary = os.environ.get("GITHUB_STEP_SUMMARY", "")
|
|
120
|
+
if step_summary:
|
|
121
|
+
with open(step_summary, "a", encoding="utf-8") as fh:
|
|
122
|
+
emoji = {"INVALID": "🔴", "MAJOR": "🟡", "MINOR": "🟢"}.get(verdict, "⚪")
|
|
123
|
+
fh.write(f"## Quality sync verdict: {emoji} {verdict}\n\n")
|
|
124
|
+
fh.write(f"**Reason:** {reason}\n")
|
|
125
|
+
|
|
126
|
+
print(f"Verdict: {verdict}")
|
|
127
|
+
print(f"Reason: {reason}")
|
|
128
|
+
|
|
129
|
+
# INVALID — fail this step so CI turns red; the next workflow step discards the new seed.
|
|
130
|
+
if verdict == VERDICT_INVALID:
|
|
131
|
+
print("ERROR: Fetch produced invalid data — discarding and failing job.", file=sys.stderr)
|
|
132
|
+
sys.exit(1)
|
|
133
|
+
EOF
|
|
134
|
+
|
|
135
|
+
- name: Discard seed on INVALID verdict
|
|
136
|
+
if: steps.diff.outputs.changed == 'true' && failure() && steps.classify.outcome == 'failure'
|
|
137
|
+
run: |
|
|
138
|
+
git checkout -- src/frugon/data/quality.json
|
|
139
|
+
echo "Seed discarded — invalid fetch data was NOT committed." >> "$GITHUB_STEP_SUMMARY"
|
|
140
|
+
|
|
141
|
+
- name: Open PR — MINOR (CI-gated auto-merge enabled)
|
|
142
|
+
if: steps.diff.outputs.changed == 'true' && steps.classify.outputs.verdict == 'MINOR'
|
|
143
|
+
id: pr_minor
|
|
144
|
+
uses: peter-evans/create-pull-request@84ae59a2cdc2258d6fa0732dd66352dddae2a412 # v7.0.9
|
|
145
|
+
with:
|
|
146
|
+
commit-message: "chore(quality): sync LMArena quality tiers"
|
|
147
|
+
branch: chore/quality-sync
|
|
148
|
+
delete-branch: true
|
|
149
|
+
title: "chore(quality): weekly quality tier sync [auto-merge]"
|
|
150
|
+
body: |
|
|
151
|
+
Automated weekly sync of `src/frugon/data/quality.json` from the
|
|
152
|
+
[LMArena leaderboard dataset](https://huggingface.co/datasets/lmarena-ai/leaderboard-dataset)
|
|
153
|
+
(CC-BY-4.0).
|
|
154
|
+
|
|
155
|
+
**Verdict: MINOR** — change classified as within expected weekly drift bounds.
|
|
156
|
+
Auto-merge is enabled; this PR will merge automatically after the CI matrix passes.
|
|
157
|
+
|
|
158
|
+
**Reason:** ${{ steps.classify.outputs.reason }}
|
|
159
|
+
labels: |
|
|
160
|
+
quality
|
|
161
|
+
automated
|
|
162
|
+
|
|
163
|
+
- name: In-job validation gate (MINOR path)
|
|
164
|
+
# Run the full quality-data safety gate in-process before enabling
|
|
165
|
+
# auto-merge. This is belt-and-braces: gh pr merge --auto only waits
|
|
166
|
+
# for branch-protection-required checks, and PRs created by the
|
|
167
|
+
# default GITHUB_TOKEN do not trigger ci.yml on the same repo.
|
|
168
|
+
# Running linting + type-checking + the quality/classify tests here
|
|
169
|
+
# gives a synchronous correctness gate before the merge is enabled.
|
|
170
|
+
#
|
|
171
|
+
# Operational prerequisites: for auto-merge to be truly CI-gated,
|
|
172
|
+
# the repository's `main` branch protection must also require the
|
|
173
|
+
# `CI` matrix status checks. That is server-side configuration, not
|
|
174
|
+
# in-repo config — see docs/quality-tiering.md §Operational prerequisites.
|
|
175
|
+
if: steps.diff.outputs.changed == 'true' && steps.classify.outputs.verdict == 'MINOR' && steps.pr_minor.outputs.pull-request-number != ''
|
|
176
|
+
run: |
|
|
177
|
+
uv run ruff check .
|
|
178
|
+
uv run mypy src
|
|
179
|
+
uv run pytest tests/test_quality.py tests/test_model_id_drift.py -q
|
|
180
|
+
|
|
181
|
+
- name: Enable auto-merge on MINOR PR
|
|
182
|
+
if: steps.diff.outputs.changed == 'true' && steps.classify.outputs.verdict == 'MINOR' && steps.pr_minor.outputs.pull-request-number != ''
|
|
183
|
+
env:
|
|
184
|
+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
185
|
+
run: |
|
|
186
|
+
gh pr merge "${{ steps.pr_minor.outputs.pull-request-number }}" \
|
|
187
|
+
--auto \
|
|
188
|
+
--squash \
|
|
189
|
+
--repo "${{ github.repository }}"
|
|
190
|
+
|
|
191
|
+
- name: Open PR — MAJOR (human review required, auto-merge NOT enabled)
|
|
192
|
+
if: steps.diff.outputs.changed == 'true' && steps.classify.outputs.verdict == 'MAJOR'
|
|
193
|
+
uses: peter-evans/create-pull-request@84ae59a2cdc2258d6fa0732dd66352dddae2a412 # v7.0.9
|
|
194
|
+
with:
|
|
195
|
+
commit-message: "chore(quality): sync LMArena quality tiers"
|
|
196
|
+
branch: chore/quality-sync
|
|
197
|
+
delete-branch: true
|
|
198
|
+
title: "chore(quality): weekly quality tier sync [needs-review]"
|
|
199
|
+
body: |
|
|
200
|
+
Automated weekly sync of `src/frugon/data/quality.json` from the
|
|
201
|
+
[LMArena leaderboard dataset](https://huggingface.co/datasets/lmarena-ai/leaderboard-dataset)
|
|
202
|
+
(CC-BY-4.0).
|
|
203
|
+
|
|
204
|
+
**Verdict: MAJOR** — the change exceeds expected weekly drift thresholds.
|
|
205
|
+
**Auto-merge is NOT enabled.** A human must review and merge this PR.
|
|
206
|
+
|
|
207
|
+
This may indicate:
|
|
208
|
+
- A leaderboard rebaseline (scoring mechanism changed)
|
|
209
|
+
- A significant model count shift (new cohort added / old cohort retired)
|
|
210
|
+
- A distribution-wide tier rescore
|
|
211
|
+
|
|
212
|
+
**Reason:** ${{ steps.classify.outputs.reason }}
|
|
213
|
+
|
|
214
|
+
Review the diff carefully before merging. If the change looks correct,
|
|
215
|
+
merge manually. If it looks wrong, close the PR and investigate.
|
|
216
|
+
labels: |
|
|
217
|
+
quality
|
|
218
|
+
automated
|
|
219
|
+
needs-review
|
|
220
|
+
|
|
221
|
+
- name: No changes — skip PR
|
|
222
|
+
if: steps.diff.outputs.changed == 'false'
|
|
223
|
+
run: |
|
|
224
|
+
echo "## Quality sync: no changes" >> "$GITHUB_STEP_SUMMARY"
|
|
225
|
+
echo "quality.json is already up to date — no PR opened." >> "$GITHUB_STEP_SUMMARY"
|
|
226
|
+
echo "quality.json unchanged — no PR opened."
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*.*.*"
|
|
7
|
+
|
|
8
|
+
# Workflow-level write access for the release-notes job to create GitHub Releases.
|
|
9
|
+
# The publish job overrides with id-token: write only (Trusted Publishing).
|
|
10
|
+
permissions:
|
|
11
|
+
contents: write
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
test:
|
|
15
|
+
name: Test before release
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
|
20
|
+
|
|
21
|
+
- name: Set up Python
|
|
22
|
+
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
|
23
|
+
with:
|
|
24
|
+
python-version: "3.12"
|
|
25
|
+
|
|
26
|
+
- name: Install uv
|
|
27
|
+
uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2
|
|
28
|
+
|
|
29
|
+
- name: Install project with dev dependencies
|
|
30
|
+
run: uv sync --extra dev --extra measure --frozen
|
|
31
|
+
|
|
32
|
+
- name: Lint — ruff check
|
|
33
|
+
run: uv run ruff check .
|
|
34
|
+
|
|
35
|
+
- name: Type-check — mypy
|
|
36
|
+
run: uv run mypy src
|
|
37
|
+
|
|
38
|
+
- name: Test — pytest
|
|
39
|
+
run: uv run pytest
|
|
40
|
+
|
|
41
|
+
- name: Test — strict cost coverage
|
|
42
|
+
run: uv run pytest --cov-config=.coveragerc-strict --cov-fail-under=90
|
|
43
|
+
|
|
44
|
+
build:
|
|
45
|
+
name: Build sdist and wheel
|
|
46
|
+
runs-on: ubuntu-latest
|
|
47
|
+
|
|
48
|
+
steps:
|
|
49
|
+
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
|
50
|
+
|
|
51
|
+
- name: Set up Python
|
|
52
|
+
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
|
53
|
+
with:
|
|
54
|
+
python-version: "3.12"
|
|
55
|
+
|
|
56
|
+
- name: Install uv
|
|
57
|
+
uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2
|
|
58
|
+
|
|
59
|
+
# Align the package version with the pushed git tag so the published
|
|
60
|
+
# wheel/sdist matches the release. Strips a leading "v" from the tag
|
|
61
|
+
# (v1.2.3 -> 1.2.3) and rewrites the version line in pyproject.toml.
|
|
62
|
+
- name: Set version from tag
|
|
63
|
+
run: |
|
|
64
|
+
VERSION="${GITHUB_REF_NAME#v}"
|
|
65
|
+
echo "Setting pyproject version to ${VERSION} (from tag ${GITHUB_REF_NAME})"
|
|
66
|
+
sed -i -E "s/^version = \".*\"/version = \"${VERSION}\"/" pyproject.toml
|
|
67
|
+
grep -E '^version = ' pyproject.toml
|
|
68
|
+
|
|
69
|
+
- name: Build
|
|
70
|
+
run: uv build
|
|
71
|
+
|
|
72
|
+
- name: Upload dist artifacts
|
|
73
|
+
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
|
|
74
|
+
with:
|
|
75
|
+
name: dist
|
|
76
|
+
path: dist/
|
|
77
|
+
|
|
78
|
+
publish:
|
|
79
|
+
name: Publish to PyPI
|
|
80
|
+
needs: [test, build]
|
|
81
|
+
runs-on: ubuntu-latest
|
|
82
|
+
environment: pypi
|
|
83
|
+
permissions:
|
|
84
|
+
id-token: write # required for Trusted Publishing — no API token used
|
|
85
|
+
|
|
86
|
+
steps:
|
|
87
|
+
- name: Download dist artifacts
|
|
88
|
+
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
|
|
89
|
+
with:
|
|
90
|
+
name: dist
|
|
91
|
+
path: dist/
|
|
92
|
+
|
|
93
|
+
- name: Publish
|
|
94
|
+
uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
|
|
95
|
+
|
|
96
|
+
release-notes:
|
|
97
|
+
name: Create GitHub Release
|
|
98
|
+
needs: [build, publish]
|
|
99
|
+
runs-on: ubuntu-latest
|
|
100
|
+
|
|
101
|
+
steps:
|
|
102
|
+
- name: Download dist artifacts
|
|
103
|
+
uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
|
|
104
|
+
with:
|
|
105
|
+
name: dist
|
|
106
|
+
path: dist/
|
|
107
|
+
|
|
108
|
+
- name: Create GitHub Release
|
|
109
|
+
uses: softprops/action-gh-release@3bb12739c298aeb8a4eeaf626c5b8d85266b0e65 # v2.6.2
|
|
110
|
+
with:
|
|
111
|
+
files: dist/*
|
|
112
|
+
generate_release_notes: true
|
frugon-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Internal files that must never enter this public repo.
|
|
2
|
+
/CLAUDE.md
|
|
3
|
+
/docs/
|
|
4
|
+
|
|
5
|
+
# Python
|
|
6
|
+
__pycache__/
|
|
7
|
+
*.py[cod]
|
|
8
|
+
*.egg-info/
|
|
9
|
+
.eggs/
|
|
10
|
+
build/
|
|
11
|
+
dist/
|
|
12
|
+
*.egg
|
|
13
|
+
|
|
14
|
+
# Environments
|
|
15
|
+
.venv/
|
|
16
|
+
venv/
|
|
17
|
+
.env
|
|
18
|
+
.python-version
|
|
19
|
+
|
|
20
|
+
# Test / coverage
|
|
21
|
+
.pytest_cache/
|
|
22
|
+
.coverage
|
|
23
|
+
coverage.xml
|
|
24
|
+
htmlcov/
|
|
25
|
+
.mypy_cache/
|
|
26
|
+
.ruff_cache/
|
|
27
|
+
.tox/
|
|
28
|
+
|
|
29
|
+
# Captured logs / user data — never commit a user's traffic
|
|
30
|
+
*.capture.jsonl
|
|
31
|
+
captures/
|
|
32
|
+
/report.html
|
|
33
|
+
/report.md
|
|
34
|
+
|
|
35
|
+
# Local agent config — never ship
|
|
36
|
+
.claude/
|
|
37
|
+
|
|
38
|
+
# OS / editor
|
|
39
|
+
.DS_Store
|
|
40
|
+
Thumbs.db
|
|
41
|
+
.idea/
|
|
42
|
+
.vscode/
|
|
43
|
+
.demo.env
|
|
44
|
+
DOGFOOD_FINDINGS.md
|
|
45
|
+
DEMO_SCRIPT.md
|