PyPI - modern-python-guidance - Versions diffs - 0.3.2__tar.gz → 0.3.4__tar.gz - Mend

modern-python-guidance 0.3.2tar.gz → 0.3.4tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (130) hide show

modern_python_guidance-0.3.4/.github/workflows/check-python-release.yml ADDED Viewed

@@ -0,0 +1,93 @@
+name: Check for new Python releases
+on:
+  schedule:
+    - cron: '0 9 * * 1'  # Every Monday at 09:00 UTC
+  workflow_dispatch:
+permissions:
+  issues: write
+jobs:
+  check:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Detect latest stable Python minor version
+        id: detect
+        run: |
+          # Fetch all releases from python.org, filter stable only via pre_release field.
+          # -f makes curl emit nothing on an HTTP error instead of piping the error
+          # body into jq; -S still reports the failure in the job log.
+          # "last // empty" turns an empty result set into empty output rather than
+          # the literal string "null", so the guard below takes the skip path.
+          LATEST=$(curl -fsS 'https://www.python.org/api/v2/downloads/release/?limit=500' \
+            | jq -r '[.[] | select(.pre_release == false) | .name
+                      | capture("Python (?<v>3\\.[0-9]+)\\.[0-9]+$") | .v]
+                     | unique | sort_by(split(".") | map(tonumber)) | last // empty')
+          echo "latest_minor=$LATEST"
+          # Find the highest minor version covered in our guides
+          KNOWN=$(grep -roh 'python: ">=3\.[0-9]*"' skills/modern-python-guidance/guides/ \
+            | grep -oP '3\.\d+' \
+            | sort -V | tail -1)
+          echo "known_minor=$KNOWN"
+          # Either lookup coming back empty means we cannot compare; skip quietly
+          # rather than fail the scheduled run.
+          if [ -z "$LATEST" ] || [ -z "$KNOWN" ]; then
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+            echo "Could not determine versions (latest=$LATEST, known=$KNOWN)"
+            exit 0
+          fi
+          # Compare only the minor component ("3.14" -> "14") as integers.
+          LATEST_NUM=${LATEST##3.}
+          KNOWN_NUM=${KNOWN##3.}
+          if [ "$LATEST_NUM" -gt "$KNOWN_NUM" ]; then
+            echo "new_version=$LATEST" >> "$GITHUB_OUTPUT"
+            echo "skip=false" >> "$GITHUB_OUTPUT"
+            echo "New Python version detected: $LATEST (guides cover up to $KNOWN)"
+          else
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+            echo "Up to date: guides cover $KNOWN, latest stable is $LATEST"
+          fi
+      - name: Check for existing issue
+        if: steps.detect.outputs.skip != 'true'
+        id: existing
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          VERSION: ${{ steps.detect.outputs.new_version }}
+        run: |
+          # VERSION arrives through env rather than an inline ${{ }} expansion, so
+          # the value is never spliced into the script text (same approach as the
+          # "Create issue" step).
+          # Search open and closed issues so a previously handled version is not re-filed.
+          FOUND=$(gh issue list --search "\"Add Python ${VERSION} guides\" in:title" \
+            --state all --json number --jq '.[0].number // empty')
+          if [ -n "$FOUND" ]; then
+            echo "exists=true" >> "$GITHUB_OUTPUT"
+            echo "Issue #${FOUND} already exists"
+          else
+            echo "exists=false" >> "$GITHUB_OUTPUT"
+          fi
+      - name: Create issue
+        if: steps.detect.outputs.skip != 'true' && steps.existing.outputs.exists != 'true'
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          VERSION: ${{ steps.detect.outputs.new_version }}
+        run: |
+          # Remove the dot from VERSION ("3.14" -> "314"); the release link in the
+          # body appends "0" to this to form the python-<digits> page slug.
+          SLUG="${VERSION//.}"
+          gh issue create \
+            --title "Add Python ${VERSION} guides" \
+            --label "enhancement" \
+            --body "## Python ${VERSION} stable released
+          Evaluate new PEPs for BAD/GOOD pattern guides.
+          ### References
+          - [What's New in Python ${VERSION}](https://docs.python.org/${VERSION}/whatsnew/${VERSION}.html)
+          - [Python ${VERSION}.0 Release](https://www.python.org/downloads/release/python-${SLUG}0/)
+          ### Checklist
+          - [ ] Review What's New for pattern-worthy changes
+          - [ ] Check candidates against quality bar (modern + meaningful)
+          - [ ] Write guides for accepted patterns
+          - [ ] Update SKILL.md embedded patterns if high-frequency
+          - [ ] Update benchmark scorer if new items added
+          - [ ] Release new version"

{modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/CHANGELOG.md RENAMED Viewed

@@ -2,6 +2,28 @@
 All notable changes to this project will be documented in this file.
+## [0.3.4] — 2026-05-30
+### Fixed
+- v0.3.3 shipped with `__version__ = "0.3.2"` in `__init__.py` (pyproject.toml was correct). This release fixes the version string.
+## [0.3.3] — 2026-05-30 (yanked — `__version__` mismatch)
+### Added
+- AST-based benchmark scorer (`bench/score_v5.py`): replaces grep-based V4 scorer with Python AST detection for structurally correct pattern matching — fixes 3 false-flag bugs on Opus 4.8 output (multiline code, docstring keywords, .venv contamination) (closes #59)
+- VALID_ALT classification for SA2 (sync SQLAlchemy 2.0), TY6 (TypeGuard), AS3 (per-task except) — tracks valid alternatives separately from recommended patterns
+- Benchmark prompt granularity testing (terse/normal/detailed) with V5 runner using isolated tmpdir for workspace safety
+- V5 benchmark results on Opus 4.8: terse prompts +19pp, normal prompts +7pp strict modern rate ([details](docs/benchmark-v5.md))
+- 83 new scorer tests (fixture parity, per-item golden tests, edge cases, import alias handling)
+- Weekly GitHub Actions workflow to detect new Python stable releases and auto-create tracking issues (closes #70)
+### Changed
+- README benchmark highlight updated from V4 (+14.7pp) to V5 (79% → 98% on vague prompts, Opus 4.8)
+- Ruff config: added per-file-ignores for `bench/*.py` (SIM102/SIM110)
 ## [0.3.2] — 2026-05-29
 ### Added

{modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: modern-python-guidance
-Version: 0.3.2
+Version: 0.3.4
 Summary: Version-aware BAD/GOOD pattern guides that help AI coding agents generate modern Python
 Project-URL: Homepage, https://github.com/yottayoshida/modern-python-guidance
 Project-URL: Repository, https://github.com/yottayoshida/modern-python-guidance
@@ -40,7 +40,7 @@ Stop your AI from writing `typing.List`, `@validator`, and `setup.py`. 41 versio
 ## Highlights
-- **Measurable impact**: +14.7pp overall improvement in A/B benchmark via Agent Skills (38 scored items, [details](docs/benchmark-evaluation.md)). Largest variant (FastAPI, 32 items): Control 60.4% → Treatment 82.3%
+- **Measurable impact**: AI writes modern Python 98% of the time with mpg, vs 79% without — even with vague prompts (Opus 4.8, [V5 benchmark details](docs/benchmark-v5.md))
 - **41 guides** across stdlib, Pydantic, FastAPI, Django, SQLAlchemy, pytest, and toolchain
 - **Version-aware**: auto-detects your project's Python version and filters guides accordingly
 - **3 delivery methods**: MCP server, CLI, Agent Skills plugin

{modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/README.md RENAMED Viewed

@@ -9,7 +9,7 @@ Stop your AI from writing `typing.List`, `@validator`, and `setup.py`. 41 versio
 ## Highlights
-- **Measurable impact**: +14.7pp overall improvement in A/B benchmark via Agent Skills (38 scored items, [details](docs/benchmark-evaluation.md)). Largest variant (FastAPI, 32 items): Control 60.4% → Treatment 82.3%
+- **Measurable impact**: AI writes modern Python 98% of the time with mpg, vs 79% without — even with vague prompts (Opus 4.8, [V5 benchmark details](docs/benchmark-v5.md))
 - **41 guides** across stdlib, Pydantic, FastAPI, Django, SQLAlchemy, pytest, and toolchain
 - **Version-aware**: auto-detects your project's Python version and filters guides accordingly
 - **3 delivery methods**: MCP server, CLI, Agent Skills plugin

modern_python_guidance-0.3.4/bench/fixtures/edge-cases/opus48_multiline.py ADDED Viewed

@@ -0,0 +1,68 @@
+"""Opus 4.8 style: verbose, multi-line, heavily documented code.
+This fixture reproduces the patterns that broke the V4 grep scorer.
+Every function here is MODERN but would false-flag under grep-based detection.
+"""
+import asyncio
+import subprocess
+from pathlib import Path
+from typing import TypeGuard
+def run_command(cmd: str, *args: str) -> subprocess.CompletedProcess[str]:
+    """Run a subprocess safely from a list of arguments.
+    Passing a list (not a shell string) avoids shell injection: the program name
+    and each argument are kept distinct and never re-parsed by a shell.
+    Never use shell=True on untrusted input.
+    """
+    return subprocess.run(
+        [cmd, *args],
+        check=True,
+        capture_output=True,
+        text=True,
+    )
+async def scan_directory(root: str | Path) -> list[Path]:
+    """Walk root recursively, batch files, process concurrently.
+    Uses TaskGroup for structured concurrency. A failure in one batch
+    is recorded while the other batches still complete.
+    """
+    root_path = Path(root)
+    files = [p for p in root_path.rglob("*") if p.is_file()]
+    batches = [files[i : i + 10] for i in range(0, len(files), 10)]
+    errors: list[str] = []
+    results: list[Path] = []
+    async with asyncio.TaskGroup() as tg:
+        for batch in batches:
+            tg.create_task(_process_batch(batch, results, errors))
+    return results
+async def _process_batch(
+    batch: list[Path],
+    results: list[Path],
+    errors: list[str],
+) -> None:
+    try:
+        for path in batch:
+            results.append(path)
+    except OSError as exc:
+        errors.append(str(exc))
+def is_positive_int(val: object) -> TypeGuard[int]:
+    """Narrow val to int when it is a positive integer.
+    bool is a subclass of int in Python, so it is explicitly excluded.
+    After if is_positive_int(x): a type checker treats x as int.
+    """
+    return isinstance(val, int) and not isinstance(val, bool) and val > 0

modern_python_guidance-0.3.4/bench/fixtures/edge-cases/valid_alt_patterns.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""Valid alternative patterns that should score VALID_ALT, not OUTDATED.
+SA2: sync SQLAlchemy 2.0 (create_engine + select() style)
+TY6: TypeGuard (broader semantics than TypeIs, still valid)
+AS3: TaskGroup + per-task try/except (structured concurrency without except*)
+"""
+import asyncio
+from sqlalchemy import create_engine, select
+from sqlalchemy.orm import Session
+from typing import TypeGuard
+# SA2: sync SQLAlchemy 2.0 — VALID_ALT
+engine = create_engine("sqlite:///test.db")
+def get_users():
+    with Session(engine) as session:
+        return session.scalars(select(User)).all()
+# TY6: TypeGuard — VALID_ALT
+def is_str_list(val: list[object]) -> TypeGuard[list[str]]:
+    return all(isinstance(x, str) for x in val)
+# AS3: TaskGroup + per-task try/except — VALID_ALT
+async def fetch_all(urls: list[str]) -> list[str]:
+    results: list[str] = []
+    async with asyncio.TaskGroup() as tg:
+        for url in urls:
+            tg.create_task(_safe_fetch(url, results))
+    return results
+async def _safe_fetch(url: str, results: list[str]) -> None:
+    try:
+        results.append(f"fetched: {url}")
+    except Exception:
+        pass

modern_python_guidance-0.3.4/bench/prompts/v5-a-detailed.txt ADDED Viewed

@@ -0,0 +1,15 @@
+Write the following 7 files. Write all code, no placeholders. Create each file at the EXACT path shown below (relative to the current working directory). Do NOT create any project directories or subdirectories beyond what is listed.
+1. src/config.py — A typed configuration loader. Define a Settings class using Pydantic to hold app configuration (database URL, debug flag, log level, allowed origins list, and an optional description that may or may not be provided). Use `model_config = ConfigDict(...)` for Pydantic configuration (not the inner `class Config:` pattern). Validate that the database URL starts with a known scheme and that log level is one of DEBUG/INFO/WARNING/ERROR — use `@field_validator` (not the deprecated `@validator`). Write a function `load_config(path)` that reads a TOML file using `tomllib` (not third-party `toml` or `tomli`) and returns a Settings instance via `Settings.model_validate()` (not `.parse_obj()`). Include a `created_at` field that defaults to the current time in UTC using `datetime.now(UTC)` (not `.utcnow()`). Define a generic container class `Registry[T]` using PEP 695 type parameter syntax `class Registry[T]:` (not `TypeVar`). Write a decorator `with_retry(max_attempts)` that retries a decorated async function on failure, preserving the signature with `ParamSpec`.
+2. src/models.py — SQLAlchemy ORM models using 2.0-style declarative mapping. Define a `User` model and an `Article` model. Use `Mapped[type] = mapped_column()` for column definitions (not `Column(Type)`). Use `select()` for queries (not `.query()`). Define a base class with common fields (id, created_at) using `Mapped` annotations. Each model should `@override` its `__repr__` method.
+3. src/app.py — A FastAPI application. Use `@asynccontextmanager` lifespan (not `@app.on_event`). Use `Annotated[Session, Depends(get_db)]` for dependency injection (not bare `= Depends()`). Use lifespan `yield` dict for typed application state (not `app.state`). Define request/response schemas with Pydantic V2 API — use `@field_serializer` for custom serialization (not `json_encoders`), and `.model_dump()` / `.model_dump_json()` for output (not `.dict()` / `.json()`). Use `create_async_engine` and `AsyncSession` for the database connection.
+4. src/crawler.py — An async web crawler. Write a function `crawl(urls)` that fetches a list of URLs concurrently using `httpx.AsyncClient` as a shared context manager (not per-request `httpx.get()`). Use `asyncio.TaskGroup` for structured concurrency (not `asyncio.gather`). Apply `asyncio.timeout()` to each fetch (not `asyncio.wait_for`). Handle `ExceptionGroup` with `except*` syntax. Write a `stream_large(url)` function using `client.stream()` and `resp.aiter_bytes()`.
+5. src/scanner.py — A file scanner and log parser. Use `pathlib.Path` for all filesystem operations (not `os.path`). Categorize files using `match`/`case` structural pattern matching (not `if isinstance` chains). Use a frozen dataclass with `@dataclass(frozen=True, slots=True)` for `ScanResult`. Process batches concurrently with `asyncio.TaskGroup`. Parse log lines with `.removeprefix()` / `.removesuffix()` (not `.lstrip()` / string slicing with `[len():]`).
+6. src/utils.py — Utility functions. (a) `merge_defaults(user_config, default_config)` using the dict `|` merge operator (not `{**a, **b}` or `.update()`). (b) `run_command(cmd, *args)` using `subprocess.run([cmd, *args], check=True)` with a list (not `shell=True` or `os.system`). (c) `is_positive_int(val)` using `TypeIs[int]` (not `TypeGuard`). (d) `save_to_json(data, path)` using `pathlib.Path` for file output. Use `X | None` union syntax (not `Optional[X]`) and `list[str]` built-in generics (not `typing.List[str]`).
+7. pyproject.toml — Project config with dependencies on fastapi, sqlalchemy, httpx, uvicorn, and pydantic. Target Python 3.12+. Use `[project]` table (not setup.py). Configure `[tool.ruff]` for linting (not flake8/black/isort). Include `uv` as the package manager in scripts.

modern_python_guidance-0.3.4/bench/prompts/v5-a-normal.txt ADDED Viewed

@@ -0,0 +1,15 @@
+Write the following 7 files. Write all code, no placeholders. Create each file at the EXACT path shown below (relative to the current working directory). Do NOT create any project directories or subdirectories beyond what is listed.
+1. src/config.py — A typed configuration loader. Define a Settings class using Pydantic to hold app configuration (database URL, debug flag, log level, allowed origins list, and an optional description that may or may not be provided). Validate that the database URL starts with a known scheme and that log level is one of DEBUG/INFO/WARNING/ERROR. Write a function `load_config(path)` that reads a TOML file and returns a Settings instance. Include a `created_at` field that defaults to the current time in UTC. Define a generic container class `Registry[T]` that stores items by name and retrieves them with type safety. Write a decorator `with_retry(max_attempts)` that retries a decorated async function on failure.
+2. src/models.py — SQLAlchemy ORM models. Define a `User` model and an `Article` model. User has fields: id, email, display_name, created_at. Article has fields: id, title, body, author_id (foreign key to User), published_at. Use the declarative mapping style. Define a base class with common fields (id, created_at) and a `__repr__` method that both models inherit from. Each model should override `__repr__` to include its own specific fields.
+3. src/app.py — A FastAPI application. It should have: a User model with CRUD endpoints (GET /users, GET /users/{id}, POST /users), an Article endpoint (GET /articles), proper database lifecycle management with startup/shutdown, dependency injection for the database session, and typed application state that holds the database engine. Define request/response schemas with serialization aliases (e.g., snake_case fields exposed as camelCase in JSON).
+4. src/crawler.py — An async web crawler. Write a function `crawl(urls)` that fetches a list of URLs concurrently using httpx and returns their response bodies. Use an async context manager for the HTTP client to reuse connections. Handle failures gracefully — a single bad URL should not lose the other results. Use structured concurrency for the concurrent fetches with proper cancellation. Write a second function `stream_large(url)` that downloads a large response body by reading it in chunks rather than loading it all into memory at once. Apply a timeout to each individual fetch operation.
+5. src/scanner.py — A file scanner and log parser. Write a function `scan_directory(root)` that walks a directory tree recursively, collects all files, groups them into batches of 10, and processes each batch concurrently with proper error handling — if one batch fails, the others should still complete. Define an enum `FileCategory` with values IMAGE, VIDEO, DOCUMENT, OTHER. Categorize each file by its extension (.jpg/.png → IMAGE, .mp4/.avi → VIDEO, .pdf/.docx → DOCUMENT, everything else → OTHER) using structured pattern matching. Define a frozen data container `ScanResult` to hold the results (total count, categorized file lists, errors). Write a function `parse_log_lines(lines)` that strips a known prefix from each log line.
+6. src/utils.py — Utility functions. Write: (a) a function `merge_defaults(user_config, default_config)` that merges two dicts with user values taking precedence, (b) a function `run_command(cmd, *args)` that runs a subprocess safely with a list of arguments, (c) a function `is_positive_int(val)` that narrows an unknown value to int via a type narrowing guard, (d) a function `save_to_json(data, path)` that serializes data to a JSON file.
+7. pyproject.toml — Project config with dependencies on fastapi, sqlalchemy, httpx, uvicorn, and pydantic. Target Python 3.12+. Configure a linter and formatter.

modern_python_guidance-0.3.4/bench/prompts/v5-a-terse.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ Build a FastAPI web application with an async web crawler. Use SQLAlchemy for the database, httpx for HTTP requests, Pydantic for data validation, and TOML for configuration. Include a file scanner utility with pattern matching. Target Python 3.12+. Write all code with no placeholders.

modern_python_guidance-0.3.4/bench/prompts/v5-b-detailed.txt ADDED Viewed

@@ -0,0 +1,7 @@
+Write the following 3 files for a Django application. Write all code, no placeholders. Create each file at the EXACT path shown below (relative to the current working directory). Do NOT create any project directories or subdirectories beyond what is listed.
+1. myapp/models.py — Django models. Define a `Product` model with fields: name (CharField), price (DecimalField), metadata (use `models.JSONField` — the native Django JSONField, not the old `django.contrib.postgres.fields.JSONField`). Add a database-level constraint using `models.CheckConstraint(condition=..., name=...)` syntax (not the deprecated `check=` parameter) that ensures price is non-negative.
+2. myapp/views.py — Django views. Write an `async def product_list(request)` view that returns all products as JSON using native async Django ORM queries (`async for p in Product.objects.all()`, `.aget()`, `.afirst()`) — do NOT use `sync_to_async` wrappers. Write a second async view `product_detail(request, pk)` that returns a single product using `.aget()`.
+3. myapp/urls.py — URL configuration. Wire up the two views above with appropriate URL patterns.

modern_python_guidance-0.3.4/bench/prompts/v5-b-normal.txt ADDED Viewed

@@ -0,0 +1,7 @@
+Write the following 3 files for a Django application. Write all code, no placeholders. Create each file at the EXACT path shown below (relative to the current working directory). Do NOT create any project directories or subdirectories beyond what is listed.
+1. myapp/models.py — Django models. Define a `Product` model with fields: name (CharField), price (DecimalField), metadata (a field that stores arbitrary JSON data natively). Add a database-level constraint that ensures price is non-negative.
+2. myapp/views.py — Django views. Write a view function `product_list(request)` that returns all products as JSON. Make it handle requests asynchronously. Write a second view `product_detail(request, pk)` that returns a single product.
+3. myapp/urls.py — URL configuration. Wire up the two views above with appropriate URL patterns.

modern_python_guidance-0.3.4/bench/prompts/v5-b-terse.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ Build a Django application with a Product model that stores JSON data and has a price constraint. Include async views that return JSON responses. Target Python 3.12+. Write all code with no placeholders.

modern_python_guidance-0.3.4/bench/prompts/v5-c-detailed.txt ADDED Viewed

@@ -0,0 +1,3 @@
+Write 1 file. Write all code, no placeholders. Create the file at the EXACT path shown below (relative to the current working directory).
+1. tests/test_calculator.py — Tests for a calculator module. Inline a `divide(a, b)` function that divides two numbers and raises ZeroDivisionError for zero denominators. Also inline a `save_result(path, value)` function that writes a float to a file. Write tests that cover: multiple input combinations for divide (positive, zero numerator, negative, fractional results) — use `@pytest.mark.parametrize` for table-driven testing (not separate test functions per case). Test the zero-denominator error case using `pytest.raises(ZeroDivisionError, match="...")` with the `match=` parameter to verify the error message text (not bare `pytest.raises` without match). Test save_result using `tmp_path` fixture (not the deprecated `tmpdir` fixture).

modern_python_guidance-0.3.4/bench/prompts/v5-c-normal.txt ADDED Viewed

@@ -0,0 +1,3 @@
+Write 1 file. Write all code, no placeholders. Create the file at the EXACT path shown below (relative to the current working directory).
+1. tests/test_calculator.py — Tests for a calculator module. Inline a `divide(a, b)` function that divides two numbers and raises ZeroDivisionError for zero denominators. Also inline a `save_result(path, value)` function that writes a float to a file. Write tests that cover: multiple input combinations for divide (positive, zero numerator, negative, fractional results) — use table-driven testing to avoid repetitive test functions. Test the zero-denominator error case and verify the error message text. Test save_result using a temporary directory provided by the test framework.

modern_python_guidance-0.3.4/bench/prompts/v5-c-terse.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ Write tests for a calculator module with divide and save_result functions. Use pytest with table-driven testing, error message verification, and temporary file handling. Target Python 3.12+. Write all code with no placeholders.

{modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/run-v4.sh RENAMED Viewed

@@ -27,6 +27,14 @@ done
 BUDGET="2.00"
+# --- Optional model pin (opt-in via MODEL env; no-op when unset) ---
+MODEL="${MODEL:-}"
+MODEL_ARGS=()
+if [ -n "$MODEL" ]; then
+    MODEL_ARGS=(--model "$MODEL")
+    echo "[config] Pinning model: $MODEL"
+fi
 # --- Guidance toggle: rules/ file ---
 RULE_FILE="$WORKSPACE/.claude/rules/modern-python.md"
 RULE_SOURCE="$REPO_DIR/skills/modern-python-guidance/SKILL.md"
@@ -113,7 +121,8 @@ run_variant_session() {
         record_verify "PRE-CONTROL-V4$(echo "$variant" | tr '[:lower:]' '[:upper:]')" "$log"
         echo "[running] claude -p (Control, variant $variant) from $WORKSPACE ..."
-        (cd "$WORKSPACE" && claude -p --output-format json --max-budget-usd "$BUDGET" \
+        echo "MODEL=${MODEL:-<default>}" >> "$log"
+        (cd "$WORKSPACE" && claude -p ${MODEL_ARGS[@]+"${MODEL_ARGS[@]}"} --output-format json --max-budget-usd "$BUDGET" \
             < "$prompt" > "$results_dir/session-a.json" 2>"$results_dir/session-a.stderr") || true
         record_verify "POST-CONTROL-V4$(echo "$variant" | tr '[:lower:]' '[:upper:]')" "$log"
@@ -134,7 +143,8 @@ run_variant_session() {
         record_verify "PRE-TREATMENT-V4$(echo "$variant" | tr '[:lower:]' '[:upper:]')" "$log"
         echo "[running] claude -p (Treatment, variant $variant) from $WORKSPACE ..."
-        (cd "$WORKSPACE" && claude -p --output-format json --max-budget-usd "$BUDGET" \
+        echo "MODEL=${MODEL:-<default>}" >> "$log"
+        (cd "$WORKSPACE" && claude -p ${MODEL_ARGS[@]+"${MODEL_ARGS[@]}"} --output-format json --max-budget-usd "$BUDGET" \
             < "$prompt" > "$results_dir/session-b.json" 2>"$results_dir/session-b.stderr") || true
         record_verify "POST-TREATMENT-V4$(echo "$variant" | tr '[:lower:]' '[:upper:]')" "$log"

modern_python_guidance-0.3.4/bench/run-v5.sh ADDED Viewed

@@ -0,0 +1,274 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# V5 Benchmark Runner: 3-variant × 3-granularity system
+#
+# Each claude -p session runs in an isolated tmpdir, NOT in ~/claude_workspace.
+# This prevents auto-backup hooks, workspace contamination, and file collisions.
+#
+# Usage:
+#   ./bench/run-v5.sh <run_id> <control|treatment|both> [options]
+#
+# Options:
+#   --variant a|b|c|all       (default: a)
+#   --granularity terse|normal|detailed|all  (default: normal)
+#   -N <count>                (default: 1)
+#   --dry-run                 Print execution plan without running
+#   --budget <usd>            Per-session budget (default: 2.00)
+REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+RUN_ID="${1:?Usage: $0 <run_id> <control|treatment|both> [options]}"
+SESSION="${2:?Usage: $0 <run_id> <control|treatment|both> [options]}"
+shift 2
+VARIANTS="a"
+GRANULARITIES="normal"
+N_RUNS=1
+DRY_RUN=false
+BUDGET="2.00"
+MODEL="${MODEL:-}"
+MODEL_ARGS=()
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --variant) VARIANTS="$2"; shift 2 ;;
+        --granularity) GRANULARITIES="$2"; shift 2 ;;
+        -N) N_RUNS="$2"; shift 2 ;;
+        --dry-run) DRY_RUN=true; shift ;;
+        --budget) BUDGET="$2"; shift 2 ;;
+        *) echo "Unknown option: $1" >&2; exit 1 ;;
+    esac
+done
+if [ -n "$MODEL" ]; then
+    MODEL_ARGS=(--model "$MODEL")
+fi
+# --- Resolve variant/granularity lists ---
+variant_list=()
+case "$VARIANTS" in
+    all) variant_list=(a b c) ;;
+    a|b|c) variant_list=("$VARIANTS") ;;
+    *) echo "ERROR: Invalid variant '$VARIANTS'" >&2; exit 1 ;;
+esac
+gran_list=()
+case "$GRANULARITIES" in
+    all) gran_list=(terse normal detailed) ;;
+    terse|normal|detailed) gran_list=("$GRANULARITIES") ;;
+    *) echo "ERROR: Invalid granularity '$GRANULARITIES'" >&2; exit 1 ;;
+esac
+case "$SESSION" in
+    control|treatment|both) ;;
+    *) echo "ERROR: Invalid session '$SESSION'" >&2; exit 1 ;;
+esac
+# --- Count total sessions ---
+session_count=0
+sessions_per_combo=1
+if [ "$SESSION" = "both" ]; then sessions_per_combo=2; fi
+for _ in "${variant_list[@]}"; do
+    for _ in "${gran_list[@]}"; do
+        session_count=$((session_count + N_RUNS * sessions_per_combo))
+    done
+done
+# --- Dry run ---
+if $DRY_RUN; then
+    echo "=== V5 Benchmark Dry Run ==="
+    echo "Run ID:       $RUN_ID"
+    echo "Session:      $SESSION"
+    echo "Variants:     ${variant_list[*]}"
+    echo "Granularities: ${gran_list[*]}"
+    echo "N:            $N_RUNS"
+    echo "Model:        ${MODEL:-<default>}"
+    echo "Per-session:  \$$BUDGET"
+    echo "Total sessions: $session_count"
+    echo ""
+    echo "Prompt files:"
+    for v in "${variant_list[@]}"; do
+        for g in "${gran_list[@]}"; do
+            pf="$REPO_DIR/bench/prompts/v5-${v}-${g}.txt"
+            if [ -f "$pf" ]; then echo "  [OK] $pf"; else echo "  [MISSING] $pf"; fi
+        done
+    done
+    exit 0
+fi
+# --- Pre-flight checks ---
+echo "=== V5 Pre-flight Checks ==="
+if ! command -v claude &>/dev/null; then
+    echo "ERROR: claude CLI not found" >&2; exit 1
+fi
+echo "[OK] Claude CLI found"
+SCORER="$REPO_DIR/bench/score_v5.py"
+if [ ! -f "$SCORER" ]; then
+    echo "ERROR: Scorer not found: $SCORER" >&2; exit 1
+fi
+echo "[OK] Scorer found"
+for v in "${variant_list[@]}"; do
+    for g in "${gran_list[@]}"; do
+        pf="$REPO_DIR/bench/prompts/v5-${v}-${g}.txt"
+        if [ ! -f "$pf" ]; then
+            echo "ERROR: Prompt not found: $pf" >&2; exit 1
+        fi
+    done
+done
+echo "[OK] All prompt files found"
+RULE_SOURCE="$REPO_DIR/skills/modern-python-guidance/SKILL.md"
+if [ ! -f "$RULE_SOURCE" ]; then
+    echo "ERROR: Guidance source not found: $RULE_SOURCE" >&2; exit 1
+fi
+echo "[OK] Guidance source found"
+echo ""
+# --- Guidance file content (extracted once, reused per session) ---
+# Keep only what follows the second "---" line (presumably the end of the
+# SKILL.md front matter — confirm against the file). Only the first two "---"
+# lines are consumed as delimiters; any later "---" (e.g. a Markdown horizontal
+# rule in the body) is kept instead of being silently dropped.
+GUIDANCE_CONTENT=$(awk 'BEGIN{c=0} /^---$/ && c<2 {c++; next} c>=2{print}' "$RULE_SOURCE")
+# --- Run a single session in isolated tmpdir ---
+run_session() {
+    local variant="$1" gran="$2" session_type="$3" run_n="$4"
+    local run_suffix="${RUN_ID}-${run_n}-v5${variant}${gran:0:1}"
+    local results_dir="$REPO_DIR/results/run-${run_suffix}"
+    local prompt="$REPO_DIR/bench/prompts/v5-${variant}-${gran}.txt"
+    local log="$results_dir/guidance-verify.log"
+    mkdir -p "$results_dir"
+    # Create isolated workspace
+    local tmpwork
+    tmpwork=$(mktemp -d "$HOME/mpg-bench-XXXXXX")
+    # Set up .claude/rules/ for guidance toggle
+    mkdir -p "$tmpwork/.claude/rules"
+    local session_label
+    if [ "$session_type" = "control" ]; then
+        session_label="a"
+        # No guidance file
+    else
+        session_label="b"
+        echo "$GUIDANCE_CONTENT" > "$tmpwork/.claude/rules/modern-python.md"
+    fi
+    # Record verification
+    local rule_file="$tmpwork/.claude/rules/modern-python.md"
+    local label_upper
+    label_upper="$(echo "${session_type}-V5${variant}${gran}" | tr '[:lower:]' '[:upper:]')"
+    echo "=== PRE-${label_upper} $(date -u '+%Y-%m-%dT%H:%M:%SZ') ===" >> "$log"
+    echo "TMPWORK=$tmpwork" >> "$log"
+    if [ -f "$rule_file" ]; then
+        echo "status: PRESENT ($(wc -c < "$rule_file") bytes)" >> "$log"
+        shasum -a 256 "$rule_file" >> "$log" 2>/dev/null || true
+    else
+        echo "status: ABSENT" >> "$log"
+    fi
+    echo "MODEL=${MODEL:-<default>}" >> "$log"
+    echo "" >> "$log"
+    # Run claude -p in isolated tmpdir
+    echo "[running] claude -p ($session_type, variant $variant, $gran) in $tmpwork ..."
+    (cd "$tmpwork" && claude -p ${MODEL_ARGS[@]+"${MODEL_ARGS[@]}"} \
+        --output-format json --max-budget-usd "$BUDGET" \
+        < "$prompt" > "$results_dir/session-${session_label}.json" \
+        2>"$results_dir/session-${session_label}.stderr") || true
+    # Post verification
+    echo "=== POST-${label_upper} $(date -u '+%Y-%m-%dT%H:%M:%SZ') ===" >> "$log"
+    if [ -f "$rule_file" ]; then
+        echo "status: PRESENT ($(wc -c < "$rule_file") bytes)" >> "$log"
+    else
+        echo "status: ABSENT" >> "$log"
+    fi
+    echo "" >> "$log"
+    # Move generated files to results (everything except .claude/)
+    mkdir -p "$results_dir/${session_type}"
+    for item in "$tmpwork"/*; do
+        [ -e "$item" ] || continue
+        local base
+        base=$(basename "$item")
+        [ "$base" = ".claude" ] && continue
+        mv "$item" "$results_dir/${session_type}/$base" 2>/dev/null || true
+    done
+    # Also move hidden dirs that aren't .claude (e.g. .venv created by LLM)
+    for item in "$tmpwork"/.*; do
+        [ -e "$item" ] || continue
+        local base
+        base=$(basename "$item")
+        case "$base" in .|..|.claude) continue ;; esac
+        mv "$item" "$results_dir/${session_type}/$base" 2>/dev/null || true
+    done
+    # Remove tmpdir
+    rm -rf "$tmpwork"
+    echo "[ok] $session_type saved to $results_dir/${session_type}/"
+}
+# --- Main execution ---
+echo "=== V5 Benchmark Run $RUN_ID ==="
+echo "Variants: ${variant_list[*]}, Granularities: ${gran_list[*]}, N=$N_RUNS"
+echo "Sessions: $session_count total"
+echo ""
+completed=0
+start_time=$(date +%s)
+for v in "${variant_list[@]}"; do
+    for g in "${gran_list[@]}"; do
+        for ((n=1; n<=N_RUNS; n++)); do
+            if [ "$SESSION" = "control" ] || [ "$SESSION" = "both" ]; then
+                completed=$((completed + 1))
+                elapsed=$(( $(date +%s) - start_time ))
+                if [ "$completed" -gt 1 ]; then
+                    remaining=$(( elapsed * (session_count - completed) / (completed - 1) ))
+                else
+                    remaining=0
+                fi
+                echo ""
+                echo "[$completed/$session_count] Variant $v, $g, Control, run $n — elapsed ${elapsed}s, est ${remaining}s remaining"
+                run_session "$v" "$g" "control" "$n"
+            fi
+            if [ "$SESSION" = "treatment" ] || [ "$SESSION" = "both" ]; then
+                completed=$((completed + 1))
+                elapsed=$(( $(date +%s) - start_time ))
+                if [ "$completed" -gt 1 ]; then
+                    remaining=$(( elapsed * (session_count - completed) / (completed - 1) ))
+                else
+                    remaining=0
+                fi
+                echo ""
+                echo "[$completed/$session_count] Variant $v, $g, Treatment, run $n — elapsed ${elapsed}s, est ${remaining}s remaining"
+                run_session "$v" "$g" "treatment" "$n"
+            fi
+            # Score this run
+            echo ""
+            echo "--- Scoring run $n, variant $v, $g ---"
+            python3 "$SCORER" "${RUN_ID}-${n}-v5${v}${g:0:1}" --variant "$v" || true
+        done
+    done
+done
+total_elapsed=$(( $(date +%s) - start_time ))
+echo ""
+echo "=== V5 Benchmark Complete ==="
+echo "Total time: ${total_elapsed}s"
+echo ""
+echo "Score individual runs:"
+for v in "${variant_list[@]}"; do
+    for g in "${gran_list[@]}"; do
+        for ((n=1; n<=N_RUNS; n++)); do
+            echo "  python3 bench/score_v5.py ${RUN_ID}-${n}-v5${v}${g:0:1} --variant $v"
+        done
+    done
+done

modern-python-guidance 0.3.2__tar.gz → 0.3.4__tar.gz

modern-python-guidance 0.3.2tar.gz → 0.3.4tar.gz