frugon 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. frugon-0.1.0/.coveragerc-strict +9 -0
  2. frugon-0.1.0/.gitattributes +19 -0
  3. frugon-0.1.0/.github/RELEASE_NOTES_v0.1.0.md +77 -0
  4. frugon-0.1.0/.github/workflows/ci.yml +42 -0
  5. frugon-0.1.0/.github/workflows/pricing-sync.yml +70 -0
  6. frugon-0.1.0/.github/workflows/quality-sync.yml +226 -0
  7. frugon-0.1.0/.github/workflows/release.yml +112 -0
  8. frugon-0.1.0/.gitignore +45 -0
  9. frugon-0.1.0/CONTRIBUTING.md +116 -0
  10. frugon-0.1.0/LICENSE +21 -0
  11. frugon-0.1.0/PKG-INFO +226 -0
  12. frugon-0.1.0/README.md +187 -0
  13. frugon-0.1.0/ROADMAP.md +200 -0
  14. frugon-0.1.0/assets/demo.gif +0 -0
  15. frugon-0.1.0/pyproject.toml +152 -0
  16. frugon-0.1.0/scripts/gen_sample_logs.py +529 -0
  17. frugon-0.1.0/src/frugon/__init__.py +10 -0
  18. frugon-0.1.0/src/frugon/_progress.py +324 -0
  19. frugon-0.1.0/src/frugon/_store.py +185 -0
  20. frugon-0.1.0/src/frugon/capture.py +503 -0
  21. frugon-0.1.0/src/frugon/cli.py +1489 -0
  22. frugon-0.1.0/src/frugon/cost.py +1474 -0
  23. frugon-0.1.0/src/frugon/data/pricing.json +121 -0
  24. frugon-0.1.0/src/frugon/data/quality.json +368 -0
  25. frugon-0.1.0/src/frugon/data/sample_logs.jsonl.gz +0 -0
  26. frugon-0.1.0/src/frugon/measure.py +1579 -0
  27. frugon-0.1.0/src/frugon/model_id.py +203 -0
  28. frugon-0.1.0/src/frugon/pricing.py +571 -0
  29. frugon-0.1.0/src/frugon/quality.py +833 -0
  30. frugon-0.1.0/src/frugon/report.py +7840 -0
  31. frugon-0.1.0/src/frugon/routing.py +312 -0
  32. frugon-0.1.0/tests/__init__.py +1 -0
  33. frugon-0.1.0/tests/conftest.py +132 -0
  34. frugon-0.1.0/tests/test_anthropic_bare_name_pricing.py +541 -0
  35. frugon-0.1.0/tests/test_candidate_caption.py +266 -0
  36. frugon-0.1.0/tests/test_candidate_headline_block_agreement.py +354 -0
  37. frugon-0.1.0/tests/test_candidate_pool.py +62 -0
  38. frugon-0.1.0/tests/test_candidates_html_styling.py +202 -0
  39. frugon-0.1.0/tests/test_capture.py +1643 -0
  40. frugon-0.1.0/tests/test_capture_feedback.py +462 -0
  41. frugon-0.1.0/tests/test_ci_config.py +103 -0
  42. frugon-0.1.0/tests/test_cli.py +241 -0
  43. frugon-0.1.0/tests/test_cli_concurrency_flag.py +176 -0
  44. frugon-0.1.0/tests/test_cli_key_hint.py +64 -0
  45. frugon-0.1.0/tests/test_cli_measure_estimate.py +388 -0
  46. frugon-0.1.0/tests/test_cli_report_formats.py +319 -0
  47. frugon-0.1.0/tests/test_cli_report_measure_order.py +203 -0
  48. frugon-0.1.0/tests/test_cli_smoke.py +944 -0
  49. frugon-0.1.0/tests/test_cli_utf8.py +91 -0
  50. frugon-0.1.0/tests/test_cost.py +2455 -0
  51. frugon-0.1.0/tests/test_escalation_ladder.py +192 -0
  52. frugon-0.1.0/tests/test_escalation_rendering.py +228 -0
  53. frugon-0.1.0/tests/test_flagship_coverage.py +96 -0
  54. frugon-0.1.0/tests/test_judge_fallback.py +270 -0
  55. frugon-0.1.0/tests/test_measure.py +1555 -0
  56. frugon-0.1.0/tests/test_measure_concurrency.py +739 -0
  57. frugon-0.1.0/tests/test_measure_cost.py +436 -0
  58. frugon-0.1.0/tests/test_measure_judge_methodology.py +214 -0
  59. frugon-0.1.0/tests/test_measure_precheck.py +458 -0
  60. frugon-0.1.0/tests/test_measure_sampling_error.py +342 -0
  61. frugon-0.1.0/tests/test_measure_synthesis.py +417 -0
  62. frugon-0.1.0/tests/test_measure_unknown_model.py +277 -0
  63. frugon-0.1.0/tests/test_measure_verbose.py +282 -0
  64. frugon-0.1.0/tests/test_model_id.py +358 -0
  65. frugon-0.1.0/tests/test_model_id_drift.py +87 -0
  66. frugon-0.1.0/tests/test_models.py +159 -0
  67. frugon-0.1.0/tests/test_preview_flags.py +297 -0
  68. frugon-0.1.0/tests/test_pricing.py +551 -0
  69. frugon-0.1.0/tests/test_pricing_update.py +242 -0
  70. frugon-0.1.0/tests/test_pricing_userdir.py +363 -0
  71. frugon-0.1.0/tests/test_privacy.py +163 -0
  72. frugon-0.1.0/tests/test_progress.py +376 -0
  73. frugon-0.1.0/tests/test_quality.py +2468 -0
  74. frugon-0.1.0/tests/test_quality_aware_recommendation.py +350 -0
  75. frugon-0.1.0/tests/test_readme.py +63 -0
  76. frugon-0.1.0/tests/test_report.py +719 -0
  77. frugon-0.1.0/tests/test_report_dangling_refs.py +361 -0
  78. frugon-0.1.0/tests/test_report_html.py +478 -0
  79. frugon-0.1.0/tests/test_report_measure_aware_copy.py +325 -0
  80. frugon-0.1.0/tests/test_report_measure_section.py +547 -0
  81. frugon-0.1.0/tests/test_report_parity.py +433 -0
  82. frugon-0.1.0/tests/test_report_quality_tier.py +354 -0
  83. frugon-0.1.0/tests/test_report_split.py +1222 -0
  84. frugon-0.1.0/tests/test_report_split_reconciliation.py +440 -0
  85. frugon-0.1.0/tests/test_report_tier_drop_wording.py +788 -0
  86. frugon-0.1.0/tests/test_report_unrated_family.py +270 -0
  87. frugon-0.1.0/tests/test_report_unrated_family_measure_aware.py +272 -0
  88. frugon-0.1.0/tests/test_report_v2.py +1023 -0
  89. frugon-0.1.0/tests/test_report_verdict_status_colour.py +190 -0
  90. frugon-0.1.0/tests/test_report_wholesale.py +390 -0
  91. frugon-0.1.0/tests/test_report_wholesale_swap_plan.py +510 -0
  92. frugon-0.1.0/tests/test_routing.py +494 -0
  93. frugon-0.1.0/tests/test_routing_share_and_provenance.py +219 -0
  94. frugon-0.1.0/tests/test_sample_data.py +256 -0
  95. frugon-0.1.0/tests/test_store.py +518 -0
  96. frugon-0.1.0/uv.lock +2705 -0
@@ -0,0 +1,9 @@
1
+ [run]
2
+ source = src/frugon
3
+
4
+ [report]
5
+ include =
6
+ src/frugon/cost.py
7
+ src/frugon/pricing.py
8
+ src/frugon/routing.py
9
+ fail_under = 90
@@ -0,0 +1,19 @@
1
+ # Line-ending normalization — the LF invariant.
2
+ #
3
+ # Frugon writes capture.jsonl and the Markdown/HTML reports with an explicit
4
+ # newline="\n" so artifacts are byte-identical on Windows, macOS, and Linux.
5
+ # This file extends that guarantee to the repository itself: every text file is
6
+ # stored LF-only and checked out LF-only, so a contributor on a misconfigured
7
+ # clone (core.autocrlf=true) cannot reintroduce CRLF drift into tracked sources,
8
+ # fixtures, or golden test data.
9
+ * text=auto eol=lf
10
+
11
+ # Binary assets — never normalize or diff as text.
12
+ *.png binary
13
+ *.jpg binary
14
+ *.jpeg binary
15
+ *.gif binary
16
+ *.ico binary
17
+ *.gz binary
18
+ *.woff binary
19
+ *.woff2 binary
@@ -0,0 +1,77 @@
1
+ # frugon v0.1.0
2
+
3
+ > **Your data never leaves your machine. Your keys go straight to your own providers. Nothing reaches us.**
4
+
5
+ Free, local, open-source LLM cost analyzer. Point it at your real call logs and see — on your machine — how much you'd save by switching or routing models.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ # One-shot (no install)
11
+ uvx frugon analyze ./logs.jsonl
12
+
13
+ # Permanent install
14
+ pipx install frugon
15
+ frugon analyze --demo # bundled sample log, see it work in 5 seconds
16
+ frugon analyze ./logs.jsonl # your real logs
17
+ ```
18
+
19
+ Cross-platform: macOS, Linux, Windows · Python 3.10 / 3.11 / 3.12 / 3.13.
20
+
21
+ ## What's in this release
22
+
23
+ **`frugon analyze`** — read OpenAI-compatible JSONL logs and produce a cost analysis. Counts tokens with [tokencost], prices with [LiteLLM's registry], picks a cheaper-than-baseline candidate model, and tells you the dollar saving. Cross-platform, fully local, no LLM calls, no network. Honest savings anchored to [RouteLLM/LMSYS] quality bands — we never inflate the number.
24
+
25
+ **`frugon capture`** — passive OpenAI-compatible logger. Point your app's base URL at `http://127.0.0.1:8787` for a day; it records every call to a local JSONL file in the canonical shape and forwards the request unchanged to the real upstream. No data goes anywhere but your local file and your existing upstream.
26
+
27
+ **`frugon pricing update`** — refresh the bundled pricing table from the [LiteLLM model_prices_and_context_window.json] registry. Atomic write, JSON shape validation, weekly GitHub Actions sync.
28
+
29
+ **`frugon models [QUERY]`** — list the model names frugon can price (the exact names `--candidates` accepts), optionally filtered by a case-insensitive substring. Pure local read — no network.
30
+
31
+ **`frugon quality update`** — refresh the bundled quality-tier table from the LMArena leaderboard so the Strong / Capable tier labels stay current. Tiers are self-recalibrating percentile bands: Elite (top 10%), Strong (10–30%), Capable (30–60%), Efficient (bottom 40%).
32
+
33
+ **`frugon analyze --measure`** *(optional `[measure]` extra)* — sample real prompts through candidate models using **your own** API keys. Calls go straight to your providers (OpenAI / Anthropic / etc.) — never to us. Two tiers: side-by-side diffs (human judge) or LLM-as-judge win/loss/tie tallies.
34
+
35
+ **`frugon analyze --report file.html|file.md`** — shareable single-page report. Self-contained HTML with inline CSS (deep indigo + cyan + silver), or clean Markdown. The viral surface someone shows their boss.
36
+
37
+ ## Realistic savings
38
+
39
+ Anchored to [RouteLLM] / [LMSYS] research bands:
40
+
41
+ | Traffic mix | Typical saving |
42
+ |---|---|
43
+ | General mixed traffic | 30–50% |
44
+ | Easy / repetitive (MT-Bench) | up to ~85% |
45
+ | Hard tasks (MMLU) | ~30% |
46
+
47
+ **Your actual number comes from your logs.** Frugon shows what the math says for your data.
48
+
49
+ ## Privacy guarantees (tested as code, not promised in prose)
50
+
51
+ - **Cost analysis is fully local.** No LLM, no network, no telemetry.
52
+ - **`capture` never sends data anywhere but your configured upstream.** Asserted at the socket layer in tests — any future regression that introduces a side-channel HTTP client breaks CI.
53
+ - **`--measure` calls only the user's own providers with the user's own keys.** Keys are never logged, never persisted, never sent anywhere but the provider. Asserted by a defense-in-depth fixture patching `socket.socket` / `socket.create_connection` / `socket.getaddrinfo`.
54
+ - **The CLI collects nothing.** Open source — anyone can verify.
55
+
56
+ ## Quality
57
+
58
+ - 1,960 tests, 95% overall coverage, 97% on the cost-math triad (`cost.py` / `pricing.py` / `routing.py`).
59
+ - CI green on 3 OS × 4 Python (ubuntu / macos / windows × 3.10 / 3.11 / 3.12 / 3.13).
60
+ - Every change is code-reviewed before merge; cost-math changes get an extra dedicated review pass.
61
+ - ruff + mypy clean.
62
+
63
+ ## Keep the savings
64
+
65
+ This release is the diagnosis. Want it to keep routing automatically and hold the savings? → **https://frugon.rodiun.io**
66
+
67
+ ---
68
+
69
+ Built by [Rodiun]. MIT licensed.
70
+
71
+ [tokencost]: https://github.com/AgentOps-AI/tokencost
72
+ [LiteLLM's registry]: https://github.com/BerriAI/litellm
73
+ [LiteLLM model_prices_and_context_window.json]: https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json
74
+ [RouteLLM/LMSYS]: https://github.com/lm-sys/RouteLLM
75
+ [RouteLLM]: https://github.com/lm-sys/RouteLLM
76
+ [LMSYS]: https://lmsys.org/
77
+ [Rodiun]: https://rodiun.io
@@ -0,0 +1,42 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: ["main"]
6
+ pull_request:
7
+ branches: ["main"]
8
+
9
+ jobs:
10
+ test:
11
+ name: "Python ${{ matrix.python-version }} / ${{ matrix.os }}"
12
+ runs-on: "${{ matrix.os }}"
13
+ strategy:
14
+ fail-fast: false
15
+ matrix:
16
+ os: [ubuntu-latest, macos-latest, windows-latest]
17
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
18
+
19
+ steps:
20
+ - name: Checkout
21
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
22
+
23
+ - name: Set up uv
24
+ uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2: bump by verifying refs/tags/v5.x.y before changing.
25
+ with:
26
+ enable-cache: true
27
+ python-version: "${{ matrix.python-version }}"
28
+
29
+ - name: Install project with dev dependencies
30
+ run: uv sync --extra dev --extra measure --frozen
31
+
32
+ - name: Lint — ruff check
33
+ run: uv run ruff check .
34
+
35
+ - name: Type-check — mypy
36
+ run: uv run mypy src
37
+
38
+ - name: Test — pytest
39
+ run: uv run pytest
40
+
41
+ - name: Test — strict cost coverage
42
+ run: uv run pytest --cov-config=.coveragerc-strict --cov-fail-under=90
@@ -0,0 +1,70 @@
1
+ name: Pricing sync
2
+
3
+ on:
4
+ schedule:
5
+ # Every Monday at 06:00 UTC
6
+ - cron: "0 6 * * 1"
7
+ workflow_dispatch:
8
+
9
+ permissions:
10
+ contents: write
11
+ pull-requests: write
12
+
13
+ jobs:
14
+ sync:
15
+ runs-on: ubuntu-latest
16
+
17
+ steps:
18
+ - name: Checkout
19
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
20
+
21
+ - name: Set up Python
22
+ uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
23
+ with:
24
+ python-version: "3.12"
25
+
26
+ - name: Install uv
27
+ uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2
28
+
29
+ - name: Install project
30
+ run: uv sync
31
+
32
+ - name: Update pricing table
33
+ run: |
34
+ uv run python - <<'EOF'
35
+ from frugon.pricing import fetch_and_update_pricing, _LITELLM_REGISTRY_URL
36
+ from datetime import date
37
+ from pathlib import Path
38
+ result = fetch_and_update_pricing(
39
+ _LITELLM_REGISTRY_URL,
40
+ Path("src/frugon/data/pricing.json"),
41
+ date.today().isoformat(),
42
+ )
43
+ print(f"Synced {result['models_synced']} models")
44
+ EOF
45
+
46
+ - name: Check for changes
47
+ id: diff
48
+ run: |
49
+ if git diff --quiet src/frugon/data/pricing.json; then
50
+ echo "changed=false" >> "$GITHUB_OUTPUT"
51
+ else
52
+ echo "changed=true" >> "$GITHUB_OUTPUT"
53
+ fi
54
+
55
+ - name: Open PR if pricing.json changed
56
+ if: steps.diff.outputs.changed == 'true'
57
+ uses: peter-evans/create-pull-request@84ae59a2cdc2258d6fa0732dd66352dddae2a412 # v7.0.9
58
+ with:
59
+ commit-message: "chore(pricing): sync LiteLLM pricing table"
60
+ branch: chore/pricing-sync
61
+ delete-branch: true
62
+ title: "chore(pricing): weekly pricing table sync"
63
+ body: |
64
+ Automated weekly sync of `src/frugon/data/pricing.json` from the
65
+ [LiteLLM registry](https://github.com/BerriAI/litellm/blob/main/model_prices_and_context_window.json).
66
+
67
+ Review the diff to confirm no unexpected price changes before merging.
68
+ labels: |
69
+ pricing
70
+ automated
@@ -0,0 +1,226 @@
1
+ name: Quality sync
2
+
3
+ on:
4
+ schedule:
5
+ # Every Monday at 07:00 UTC (offset from pricing-sync at 06:00)
6
+ - cron: "0 7 * * 1"
7
+ workflow_dispatch:
8
+
9
+ permissions:
10
+ contents: write
11
+ pull-requests: write
12
+
13
+ jobs:
14
+ sync:
15
+ runs-on: ubuntu-latest
16
+
17
+ steps:
18
+ - name: Checkout
19
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
20
+
21
+ - name: Set up Python
22
+ uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
23
+ with:
24
+ python-version: "3.12"
25
+
26
+ - name: Install uv
27
+ uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2
28
+
29
+ - name: Install project
30
+ run: uv sync
31
+
32
+ - name: Capture old quality.json (pre-fetch baseline)
33
+ id: old_seed
34
+ run: |
35
+ # Export the committed version of the seed before the fetch overwrites it.
36
+ # If git show fails (seed never existed), /tmp/quality_old.json is left empty; the
37
+ # classify step treats it as a missing baseline, so classify_quality_update receives old=None (first-run path → MINOR).
38
+ if git show HEAD:src/frugon/data/quality.json > /tmp/quality_old.json 2>/dev/null; then
39
+ echo "has_old=true" >> "$GITHUB_OUTPUT"
40
+ else
41
+ echo "has_old=false" >> "$GITHUB_OUTPUT"
42
+ fi
43
+
44
+ - name: Fetch new quality tiers
45
+ id: fetch
46
+ run: |
47
+ uv run python - <<'EOF'
48
+ from frugon.quality import fetch_and_update_quality, _HF_BASE_URL
49
+ from datetime import date
50
+ from pathlib import Path
51
+ result = fetch_and_update_quality(
52
+ _HF_BASE_URL,
53
+ Path("src/frugon/data/quality.json"),
54
+ date.today().isoformat(),
55
+ )
56
+ print(f"Synced {result['models_synced']} models")
57
+ EOF
58
+
59
+ - name: Check for changes
60
+ id: diff
61
+ run: |
62
+ if git diff --quiet src/frugon/data/quality.json; then
63
+ echo "changed=false" >> "$GITHUB_OUTPUT"
64
+ else
65
+ echo "changed=true" >> "$GITHUB_OUTPUT"
66
+ fi
67
+
68
+ - name: Validate and classify update
69
+ id: classify
70
+ if: steps.diff.outputs.changed == 'true'
71
+ run: |
72
+ uv run python - <<'EOF'
73
+ import json
74
+ import os
75
+ import sys
76
+ from pathlib import Path
77
+
78
+ from frugon.quality import (
79
+ VERDICT_INVALID,
80
+ VERDICT_MAJOR,
81
+ VERDICT_MINOR,
82
+ classify_quality_update,
83
+ )
84
+
85
+ # Load the newly fetched seed.
86
+ new_path = Path("src/frugon/data/quality.json")
87
+ with new_path.open(encoding="utf-8") as fh:
88
+ new_data: dict[str, object] = json.load(fh)
89
+
90
+ # Load the old seed (may be absent on first run).
91
+ old_data: dict[str, object] | None = None
92
+ old_path = Path("/tmp/quality_old.json")
93
+ if old_path.exists():
94
+ try:
95
+ with old_path.open(encoding="utf-8") as fh:
96
+ old_data = json.load(fh)
97
+ except json.JSONDecodeError:
98
+ old_data = None # treat corrupt baseline as missing
99
+
100
+ verdict, reason = classify_quality_update(new_data, old_data)
101
+
102
+ # Emit outputs for downstream steps.
103
+ # Use a randomized heredoc delimiter so that fetched content in
104
+ # *reason* cannot escape through the delimiter boundary.
105
+ import secrets
106
+ github_output = os.environ.get("GITHUB_OUTPUT", "")
107
+ if github_output:
108
+ with open(github_output, "a", encoding="utf-8") as fh:
109
+ fh.write(f"verdict={verdict}\n")
110
+ # Reason may contain newlines — use a multiline delimiter.
111
+ # The delimiter is randomized so fetched model names in the
112
+ # reason string cannot terminate it prematurely.
113
+ # Invariant: classify_quality_update reasons must never embed
114
+ # raw fetched model names (enforced in quality.py).
115
+ delimiter = f"EOF_REASON_{secrets.token_hex(8)}"
116
+ fh.write(f"reason<<{delimiter}\n{reason}\n{delimiter}\n")
117
+
118
+ # Emit to step summary for visibility in the Actions UI.
119
+ step_summary = os.environ.get("GITHUB_STEP_SUMMARY", "")
120
+ if step_summary:
121
+ with open(step_summary, "a", encoding="utf-8") as fh:
122
+ emoji = {"INVALID": "🔴", "MAJOR": "🟡", "MINOR": "🟢"}.get(verdict, "⚪")
123
+ fh.write(f"## Quality sync verdict: {emoji} {verdict}\n\n")
124
+ fh.write(f"**Reason:** {reason}\n")
125
+
126
+ print(f"Verdict: {verdict}")
127
+ print(f"Reason: {reason}")
128
+
129
+ # INVALID — discard the new seed and fail so CI turns red.
130
+ if verdict == VERDICT_INVALID:
131
+ print("ERROR: Fetch produced invalid data — discarding and failing job.", file=sys.stderr)
132
+ sys.exit(1)
133
+ EOF
134
+
135
+ - name: Discard seed on INVALID verdict
136
+ if: steps.diff.outputs.changed == 'true' && failure() && steps.classify.outcome == 'failure'
137
+ run: |
138
+ git checkout -- src/frugon/data/quality.json
139
+ echo "Seed discarded — invalid fetch data was NOT committed." >> "$GITHUB_STEP_SUMMARY"
140
+
141
+ - name: Open PR — MINOR (CI-gated auto-merge enabled)
142
+ if: steps.diff.outputs.changed == 'true' && steps.classify.outputs.verdict == 'MINOR'
143
+ id: pr_minor
144
+ uses: peter-evans/create-pull-request@84ae59a2cdc2258d6fa0732dd66352dddae2a412 # v7.0.9
145
+ with:
146
+ commit-message: "chore(quality): sync LMArena quality tiers"
147
+ branch: chore/quality-sync
148
+ delete-branch: true
149
+ title: "chore(quality): weekly quality tier sync [auto-merge]"
150
+ body: |
151
+ Automated weekly sync of `src/frugon/data/quality.json` from the
152
+ [LMArena leaderboard dataset](https://huggingface.co/datasets/lmarena-ai/leaderboard-dataset)
153
+ (CC-BY-4.0).
154
+
155
+ **Verdict: MINOR** — change classified as within expected weekly drift bounds.
156
+ Auto-merge is enabled; this PR will merge automatically after the CI matrix passes.
157
+
158
+ **Reason:** ${{ steps.classify.outputs.reason }}
159
+ labels: |
160
+ quality
161
+ automated
162
+
163
+ - name: In-job validation gate (MINOR path)
164
+ # Run the full quality-data safety gate inside this job before enabling
165
+ # auto-merge. This is belt-and-braces: gh pr merge --auto only waits
166
+ # for branch-protection-required checks, and PRs created by the
167
+ # default GITHUB_TOKEN do not trigger ci.yml on the same repo.
168
+ # Running linting + type-checking + the quality/classify tests here
169
+ # gives a synchronous correctness gate before the merge is enabled.
170
+ #
171
+ # Operational prerequisites: for auto-merge to be truly CI-gated,
172
+ # the repository's `main` branch protection must also require the
173
+ # `CI` matrix status checks. That is server-side configuration, not
174
+ # in-repo config — see docs/quality-tiering.md §Operational prerequisites.
175
+ if: steps.diff.outputs.changed == 'true' && steps.classify.outputs.verdict == 'MINOR' && steps.pr_minor.outputs.pull-request-number != ''
176
+ run: |
177
+ uv run ruff check .
178
+ uv run mypy src
179
+ uv run pytest tests/test_quality.py tests/test_model_id_drift.py -q
180
+
181
+ - name: Enable auto-merge on MINOR PR
182
+ if: steps.diff.outputs.changed == 'true' && steps.classify.outputs.verdict == 'MINOR' && steps.pr_minor.outputs.pull-request-number != ''
183
+ env:
184
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
185
+ run: |
186
+ gh pr merge "${{ steps.pr_minor.outputs.pull-request-number }}" \
187
+ --auto \
188
+ --squash \
189
+ --repo "${{ github.repository }}"
190
+
191
+ - name: Open PR — MAJOR (human review required, auto-merge NOT enabled)
192
+ if: steps.diff.outputs.changed == 'true' && steps.classify.outputs.verdict == 'MAJOR'
193
+ uses: peter-evans/create-pull-request@84ae59a2cdc2258d6fa0732dd66352dddae2a412 # v7.0.9
194
+ with:
195
+ commit-message: "chore(quality): sync LMArena quality tiers"
196
+ branch: chore/quality-sync
197
+ delete-branch: true
198
+ title: "chore(quality): weekly quality tier sync [needs-review]"
199
+ body: |
200
+ Automated weekly sync of `src/frugon/data/quality.json` from the
201
+ [LMArena leaderboard dataset](https://huggingface.co/datasets/lmarena-ai/leaderboard-dataset)
202
+ (CC-BY-4.0).
203
+
204
+ **Verdict: MAJOR** — the change exceeds expected weekly drift thresholds.
205
+ **Auto-merge is NOT enabled.** A human must review and merge this PR.
206
+
207
+ This may indicate:
208
+ - A leaderboard rebaseline (scoring mechanism changed)
209
+ - A significant model count shift (new cohort added / old cohort retired)
210
+ - A distribution-wide tier rescore
211
+
212
+ **Reason:** ${{ steps.classify.outputs.reason }}
213
+
214
+ Review the diff carefully before merging. If the change looks correct,
215
+ merge manually. If it looks wrong, close the PR and investigate.
216
+ labels: |
217
+ quality
218
+ automated
219
+ needs-review
220
+
221
+ - name: No changes — skip PR
222
+ if: steps.diff.outputs.changed == 'false'
223
+ run: |
224
+ echo "## Quality sync: no changes" >> "$GITHUB_STEP_SUMMARY"
225
+ echo "quality.json is already up to date — no PR opened." >> "$GITHUB_STEP_SUMMARY"
226
+ echo "quality.json unchanged — no PR opened."
@@ -0,0 +1,112 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*.*.*"
7
+
8
+ # Workflow-level write access so the release-notes job can create GitHub Releases.
9
+ # The publish job overrides this with only `id-token: write` (Trusted Publishing).
10
+ permissions:
11
+ contents: write
12
+
13
+ jobs:
14
+ test:
15
+ name: Test before release
16
+ runs-on: ubuntu-latest
17
+
18
+ steps:
19
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
20
+
21
+ - name: Set up Python
22
+ uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
23
+ with:
24
+ python-version: "3.12"
25
+
26
+ - name: Install uv
27
+ uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2
28
+
29
+ - name: Install project with dev dependencies
30
+ run: uv sync --extra dev --extra measure --frozen
31
+
32
+ - name: Lint — ruff check
33
+ run: uv run ruff check .
34
+
35
+ - name: Type-check — mypy
36
+ run: uv run mypy src
37
+
38
+ - name: Test — pytest
39
+ run: uv run pytest
40
+
41
+ - name: Test — strict cost coverage
42
+ run: uv run pytest --cov-config=.coveragerc-strict --cov-fail-under=90
43
+
44
+ build:
45
+ name: Build sdist and wheel
46
+ runs-on: ubuntu-latest
47
+
48
+ steps:
49
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
50
+
51
+ - name: Set up Python
52
+ uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
53
+ with:
54
+ python-version: "3.12"
55
+
56
+ - name: Install uv
57
+ uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2
58
+
59
+ # Align the package version with the pushed git tag so the published
60
+ # wheel/sdist matches the release. Strips a leading "v" from the tag
61
+ # (v1.2.3 -> 1.2.3) and rewrites the version line in pyproject.toml.
62
+ - name: Set version from tag
63
+ run: |
64
+ VERSION="${GITHUB_REF_NAME#v}"
65
+ echo "Setting pyproject version to ${VERSION} (from tag ${GITHUB_REF_NAME})"
66
+ sed -i -E "s/^version = \".*\"/version = \"${VERSION}\"/" pyproject.toml
67
+ grep -E '^version = ' pyproject.toml
68
+
69
+ - name: Build
70
+ run: uv build
71
+
72
+ - name: Upload dist artifacts
73
+ uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
74
+ with:
75
+ name: dist
76
+ path: dist/
77
+
78
+ publish:
79
+ name: Publish to PyPI
80
+ needs: [test, build]
81
+ runs-on: ubuntu-latest
82
+ environment: pypi
83
+ permissions:
84
+ id-token: write # required for Trusted Publishing — no API token used
85
+
86
+ steps:
87
+ - name: Download dist artifacts
88
+ uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
89
+ with:
90
+ name: dist
91
+ path: dist/
92
+
93
+ - name: Publish
94
+ uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
95
+
96
+ release-notes:
97
+ name: Create GitHub Release
98
+ needs: [build, publish]
99
+ runs-on: ubuntu-latest
100
+
101
+ steps:
102
+ - name: Download dist artifacts
103
+ uses: actions/download-artifact@d3f86a106a0bac45b974a628896c90dbdf5c8093 # v4.3.0
104
+ with:
105
+ name: dist
106
+ path: dist/
107
+
108
+ - name: Create GitHub Release
109
+ uses: softprops/action-gh-release@3bb12739c298aeb8a4eeaf626c5b8d85266b0e65 # v2.6.2
110
+ with:
111
+ files: dist/*
112
+ generate_release_notes: true
@@ -0,0 +1,45 @@
1
+ # Internal files that must never enter this public repo.
2
+ /CLAUDE.md
3
+ /docs/
4
+
5
+ # Python
6
+ __pycache__/
7
+ *.py[cod]
8
+ *.egg-info/
9
+ .eggs/
10
+ build/
11
+ dist/
12
+ *.egg
13
+
14
+ # Environments
15
+ .venv/
16
+ venv/
17
+ .env
18
+ .python-version
19
+
20
+ # Test / coverage
21
+ .pytest_cache/
22
+ .coverage
23
+ coverage.xml
24
+ htmlcov/
25
+ .mypy_cache/
26
+ .ruff_cache/
27
+ .tox/
28
+
29
+ # Captured logs / user data — never commit a user's traffic
30
+ *.capture.jsonl
31
+ captures/
32
+ /report.html
33
+ /report.md
34
+
35
+ # Local agent config — never ship
36
+ .claude/
37
+
38
+ # OS / editor
39
+ .DS_Store
40
+ Thumbs.db
41
+ .idea/
42
+ .vscode/
43
+ .demo.env
44
+ DOGFOOD_FINDINGS.md
45
+ DEMO_SCRIPT.md