modern-python-guidance 0.3.2__tar.gz → 0.3.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. modern_python_guidance-0.3.4/.github/workflows/check-python-release.yml +93 -0
  2. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/CHANGELOG.md +22 -0
  3. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/PKG-INFO +2 -2
  4. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/README.md +1 -1
  5. modern_python_guidance-0.3.4/bench/fixtures/edge-cases/opus48_multiline.py +68 -0
  6. modern_python_guidance-0.3.4/bench/fixtures/edge-cases/valid_alt_patterns.py +41 -0
  7. modern_python_guidance-0.3.4/bench/prompts/v5-a-detailed.txt +15 -0
  8. modern_python_guidance-0.3.4/bench/prompts/v5-a-normal.txt +15 -0
  9. modern_python_guidance-0.3.4/bench/prompts/v5-a-terse.txt +1 -0
  10. modern_python_guidance-0.3.4/bench/prompts/v5-b-detailed.txt +7 -0
  11. modern_python_guidance-0.3.4/bench/prompts/v5-b-normal.txt +7 -0
  12. modern_python_guidance-0.3.4/bench/prompts/v5-b-terse.txt +1 -0
  13. modern_python_guidance-0.3.4/bench/prompts/v5-c-detailed.txt +3 -0
  14. modern_python_guidance-0.3.4/bench/prompts/v5-c-normal.txt +3 -0
  15. modern_python_guidance-0.3.4/bench/prompts/v5-c-terse.txt +1 -0
  16. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/run-v4.sh +12 -2
  17. modern_python_guidance-0.3.4/bench/run-v5.sh +274 -0
  18. modern_python_guidance-0.3.4/bench/score_v5.py +1504 -0
  19. modern_python_guidance-0.3.4/docs/benchmark-v5.md +107 -0
  20. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/pyproject.toml +4 -1
  21. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/src/modern_python_guidance/__init__.py +1 -1
  22. modern_python_guidance-0.3.4/tests/test_scorer_v5.py +992 -0
  23. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/.github/workflows/ci.yml +0 -0
  24. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/.github/workflows/publish.yml +0 -0
  25. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/.gitignore +0 -0
  26. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/CONTRIBUTING.md +0 -0
  27. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/LICENSE +0 -0
  28. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/LICENSE-MIT +0 -0
  29. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/SECURITY.md +0 -0
  30. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-modern/pyproject.toml +0 -0
  31. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-modern/src/app.py +0 -0
  32. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-modern/src/config.py +0 -0
  33. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-modern/src/crawler.py +0 -0
  34. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-modern/src/models.py +0 -0
  35. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-modern/src/scanner.py +0 -0
  36. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-modern/src/utils.py +0 -0
  37. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-outdated/pyproject.toml +0 -0
  38. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-outdated/setup.py +0 -0
  39. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-outdated/src/app.py +0 -0
  40. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-outdated/src/config.py +0 -0
  41. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-outdated/src/crawler.py +0 -0
  42. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-outdated/src/models.py +0 -0
  43. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-outdated/src/scanner.py +0 -0
  44. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-a-outdated/src/utils.py +0 -0
  45. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-b-modern/myapp/models.py +0 -0
  46. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-b-modern/myapp/views.py +0 -0
  47. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-b-outdated/myapp/models.py +0 -0
  48. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-b-outdated/myapp/views.py +0 -0
  49. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-c-modern/tests/test_calculator.py +0 -0
  50. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/fixtures/variant-c-outdated/tests/test_calculator.py +0 -0
  51. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/mcp-config.json +0 -0
  52. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/prompt-v2.txt +0 -0
  53. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/prompt-v3-mcp.txt +0 -0
  54. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/prompt-v3.txt +0 -0
  55. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/prompt-v4-a.txt +0 -0
  56. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/prompt-v4-b.txt +0 -0
  57. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/prompt-v4-c.txt +0 -0
  58. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/prompt.txt +0 -0
  59. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/run-mcp.sh +0 -0
  60. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/run.sh +0 -0
  61. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/score-v2.sh +0 -0
  62. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/score-v3.sh +0 -0
  63. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/score-v4.sh +0 -0
  64. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/score.sh +0 -0
  65. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/bench/test-scorer.sh +0 -0
  66. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/docs/benchmark-evaluation.md +0 -0
  67. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/docs/benchmark-procedure.md +0 -0
  68. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/docs/design.md +0 -0
  69. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/SKILL.md +0 -0
  70. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/async/async-timeout-context.md +0 -0
  71. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/async/exception-groups.md +0 -0
  72. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/async/taskgroup-over-gather.md +0 -0
  73. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/data-structures/dataclass-modern.md +0 -0
  74. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/data-structures/dict-merge-operator.md +0 -0
  75. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/data-structures/match-case-patterns.md +0 -0
  76. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/django/django-async-views.md +0 -0
  77. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/django/django-check-constraints.md +0 -0
  78. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/django/django-json-field.md +0 -0
  79. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/fastapi/fastapi-annotated-depends.md +0 -0
  80. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/fastapi/fastapi-lifespan.md +0 -0
  81. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/fastapi/fastapi-typed-state.md +0 -0
  82. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/httpx/httpx-async-client-reuse.md +0 -0
  83. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/httpx/httpx-streaming.md +0 -0
  84. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/pydantic/pydantic-v2-config.md +0 -0
  85. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/pydantic/pydantic-v2-model-api.md +0 -0
  86. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/pydantic/pydantic-v2-serialization.md +0 -0
  87. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/pydantic/pydantic-v2-validators.md +0 -0
  88. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/pytest/pytest-parametrize.md +0 -0
  89. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/pytest/pytest-raises-match.md +0 -0
  90. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/pytest/pytest-tmp-path.md +0 -0
  91. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/sqlalchemy/sqlalchemy-2-style.md +0 -0
  92. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/sqlalchemy/sqlalchemy-async-session.md +0 -0
  93. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/sqlalchemy/sqlalchemy-mapped-column.md +0 -0
  94. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/stdlib/datetime-utc.md +0 -0
  95. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/stdlib/pathlib-over-os-path.md +0 -0
  96. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/stdlib/removeprefix-removesuffix.md +0 -0
  97. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/stdlib/template-strings.md +0 -0
  98. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/stdlib/tomllib-builtin.md +0 -0
  99. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/toolchain/no-pickle.md +0 -0
  100. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/toolchain/pyproject-toml-over-setup.md +0 -0
  101. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/toolchain/ruff-over-flake8.md +0 -0
  102. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/toolchain/safe-subprocess.md +0 -0
  103. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/toolchain/uv-over-pip.md +0 -0
  104. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/typing/deferred-annotations.md +0 -0
  105. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/typing/override-decorator.md +0 -0
  106. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/typing/paramspec-decorators.md +0 -0
  107. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/typing/type-parameter-syntax.md +0 -0
  108. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/typing/typeis-vs-typeguard.md +0 -0
  109. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/typing/union-syntax.md +0 -0
  110. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/skills/modern-python-guidance/guides/typing/use-builtin-generics.md +0 -0
  111. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/src/modern_python_guidance/__main__.py +0 -0
  112. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/src/modern_python_guidance/cli.py +0 -0
  113. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/src/modern_python_guidance/compat.py +0 -0
  114. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/src/modern_python_guidance/frontmatter.py +0 -0
  115. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/src/modern_python_guidance/guide_index.py +0 -0
  116. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/src/modern_python_guidance/mcp_server.py +0 -0
  117. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/src/modern_python_guidance/retrieve.py +0 -0
  118. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/src/modern_python_guidance/search.py +0 -0
  119. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/src/modern_python_guidance/setup_cmd.py +0 -0
  120. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/src/modern_python_guidance/uninstall_cmd.py +0 -0
  121. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/src/modern_python_guidance/version_detect.py +0 -0
  122. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/tests/test_cli_integration.py +0 -0
  123. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/tests/test_frontmatter.py +0 -0
  124. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/tests/test_mcp_server.py +0 -0
  125. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/tests/test_retrieve.py +0 -0
  126. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/tests/test_search.py +0 -0
  127. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/tests/test_setup.py +0 -0
  128. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/tests/test_skill_sync.py +0 -0
  129. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/tests/test_uninstall.py +0 -0
  130. {modern_python_guidance-0.3.2 → modern_python_guidance-0.3.4}/tests/test_version_detect.py +0 -0
@@ -0,0 +1,93 @@
1
+ name: Check for new Python releases
2
+
3
+ on:
4
+ schedule:
5
+ - cron: '0 9 * * 1' # Every Monday at 09:00 UTC
6
+ workflow_dispatch:
7
+
8
+ permissions:
9
+ issues: write
10
+
11
+ jobs:
12
+ check:
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+
17
+ - name: Detect latest stable Python minor version
18
+ id: detect
19
+ run: |
20
+ # Fetch all releases from python.org, filter stable only via pre_release field
21
+ LATEST=$(curl -s 'https://www.python.org/api/v2/downloads/release/?limit=500' \
22
+ | jq -r '[.[] | select(.pre_release == false) | .name
23
+ | capture("Python (?<v>3\\.[0-9]+)\\.[0-9]+$") | .v]
24
+ | unique | sort_by(split(".") | map(tonumber)) | last')
25
+
26
+ echo "latest_minor=$LATEST"
27
+
28
+ # Find the highest minor version covered in our guides
29
+ KNOWN=$(grep -roh 'python: ">=3\.[0-9]*"' skills/modern-python-guidance/guides/ \
30
+ | grep -oP '3\.\d+' \
31
+ | sort -V | tail -1)
32
+
33
+ echo "known_minor=$KNOWN"
34
+
35
+ if [ -z "$LATEST" ] || [ -z "$KNOWN" ]; then
36
+ echo "skip=true" >> "$GITHUB_OUTPUT"
37
+ echo "Could not determine versions (latest=$LATEST, known=$KNOWN)"
38
+ exit 0
39
+ fi
40
+
41
+ LATEST_NUM=${LATEST##3.}
42
+ KNOWN_NUM=${KNOWN##3.}
43
+
44
+ if [ "$LATEST_NUM" -gt "$KNOWN_NUM" ]; then
45
+ echo "new_version=$LATEST" >> "$GITHUB_OUTPUT"
46
+ echo "skip=false" >> "$GITHUB_OUTPUT"
47
+ echo "New Python version detected: $LATEST (guides cover up to $KNOWN)"
48
+ else
49
+ echo "skip=true" >> "$GITHUB_OUTPUT"
50
+ echo "Up to date: guides cover $KNOWN, latest stable is $LATEST"
51
+ fi
52
+
53
+ - name: Check for existing issue
54
+ if: steps.detect.outputs.skip != 'true'
55
+ id: existing
56
+ env:
57
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
58
+ run: |
59
+ VERSION="${{ steps.detect.outputs.new_version }}"
60
+ FOUND=$(gh issue list --search "\"Add Python ${VERSION} guides\" in:title" \
61
+ --state all --json number --jq '.[0].number // empty')
62
+ if [ -n "$FOUND" ]; then
63
+ echo "exists=true" >> "$GITHUB_OUTPUT"
64
+ echo "Issue #${FOUND} already exists"
65
+ else
66
+ echo "exists=false" >> "$GITHUB_OUTPUT"
67
+ fi
68
+
69
+ - name: Create issue
70
+ if: steps.detect.outputs.skip != 'true' && steps.existing.outputs.exists != 'true'
71
+ env:
72
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
73
+ VERSION: ${{ steps.detect.outputs.new_version }}
74
+ run: |
75
+ SLUG="${VERSION//.}"
76
+ gh issue create \
77
+ --title "Add Python ${VERSION} guides" \
78
+ --label "enhancement" \
79
+ --body "## Python ${VERSION} stable released
80
+
81
+ Evaluate new PEPs for BAD/GOOD pattern guides.
82
+
83
+ ### References
84
+ - [What's New in Python ${VERSION}](https://docs.python.org/${VERSION}/whatsnew/${VERSION}.html)
85
+ - [Python ${VERSION}.0 Release](https://www.python.org/downloads/release/python-${SLUG}0/)
86
+
87
+ ### Checklist
88
+ - [ ] Review What's New for pattern-worthy changes
89
+ - [ ] Check candidates against quality bar (modern + meaningful)
90
+ - [ ] Write guides for accepted patterns
91
+ - [ ] Update SKILL.md embedded patterns if high-frequency
92
+ - [ ] Update benchmark scorer if new items added
93
+ - [ ] Release new version"
@@ -2,6 +2,28 @@
2
2
 
3
3
  All notable changes to this project will be documented in this file.
4
4
 
5
+ ## [0.3.4] — 2026-05-30
6
+
7
+ ### Fixed
8
+
9
+ - v0.3.3 shipped with `__version__ = "0.3.2"` in `__init__.py` (pyproject.toml was correct). This release fixes the version string.
10
+
11
+ ## [0.3.3] — 2026-05-30 (yanked — `__version__` mismatch)
12
+
13
+ ### Added
14
+
15
+ - AST-based benchmark scorer (`bench/score_v5.py`): replaces grep-based V4 scorer with Python AST detection for structurally correct pattern matching — fixes 3 false-flag bugs on Opus 4.8 output (multiline code, docstring keywords, .venv contamination) (closes #59)
16
+ - VALID_ALT classification for SA2 (sync SQLAlchemy 2.0), TY6 (TypeGuard), AS3 (per-task except) — tracks valid alternatives separately from recommended patterns
17
+ - Benchmark prompt granularity testing (terse/normal/detailed) with V5 runner using isolated tmpdir for workspace safety
18
+ - V5 benchmark results on Opus 4.8: strict modern rate improved by +19pp with terse prompts and by +7pp with normal prompts ([details](docs/benchmark-v5.md))
19
+ - 83 new scorer tests (fixture parity, per-item golden tests, edge cases, import alias handling)
20
+ - Weekly GitHub Actions workflow to detect new Python stable releases and auto-create tracking issues (closes #70)
21
+
22
+ ### Changed
23
+
24
+ - README benchmark highlight updated from V4 (+14.7pp) to V5 (79% → 98% on vague prompts, Opus 4.8)
25
+ - Ruff config: added per-file-ignores for `bench/*.py` (SIM102/SIM110)
26
+
5
27
  ## [0.3.2] — 2026-05-29
6
28
 
7
29
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: modern-python-guidance
3
- Version: 0.3.2
3
+ Version: 0.3.4
4
4
  Summary: Version-aware BAD/GOOD pattern guides that help AI coding agents generate modern Python
5
5
  Project-URL: Homepage, https://github.com/yottayoshida/modern-python-guidance
6
6
  Project-URL: Repository, https://github.com/yottayoshida/modern-python-guidance
@@ -40,7 +40,7 @@ Stop your AI from writing `typing.List`, `@validator`, and `setup.py`. 41 versio
40
40
 
41
41
  ## Highlights
42
42
 
43
- - **Measurable impact**: +14.7pp overall improvement in A/B benchmark via Agent Skills (38 scored items, [details](docs/benchmark-evaluation.md)). Largest variant (FastAPI, 32 items): Control 60.4% → Treatment 82.3%
43
+ - **Measurable impact**: AI writes modern Python 98% of the time with mpg, vs 79% without — even with vague prompts (Opus 4.8, [V5 benchmark details](docs/benchmark-v5.md))
44
44
  - **41 guides** across stdlib, Pydantic, FastAPI, Django, SQLAlchemy, pytest, and toolchain
45
45
  - **Version-aware**: auto-detects your project's Python version and filters guides accordingly
46
46
  - **3 delivery methods**: MCP server, CLI, Agent Skills plugin
@@ -9,7 +9,7 @@ Stop your AI from writing `typing.List`, `@validator`, and `setup.py`. 41 versio
9
9
 
10
10
  ## Highlights
11
11
 
12
- - **Measurable impact**: +14.7pp overall improvement in A/B benchmark via Agent Skills (38 scored items, [details](docs/benchmark-evaluation.md)). Largest variant (FastAPI, 32 items): Control 60.4% → Treatment 82.3%
12
+ - **Measurable impact**: AI writes modern Python 98% of the time with mpg, vs 79% without — even with vague prompts (Opus 4.8, [V5 benchmark details](docs/benchmark-v5.md))
13
13
  - **41 guides** across stdlib, Pydantic, FastAPI, Django, SQLAlchemy, pytest, and toolchain
14
14
  - **Version-aware**: auto-detects your project's Python version and filters guides accordingly
15
15
  - **3 delivery methods**: MCP server, CLI, Agent Skills plugin
@@ -0,0 +1,68 @@
1
+ """Opus 4.8 style: verbose, multi-line, heavily documented code.
2
+
3
+ This fixture reproduces the patterns that broke the V4 grep scorer.
4
+ Every function here is MODERN but would false-flag under grep-based detection.
5
+ """
6
+ import asyncio
7
+ import subprocess
8
+ from pathlib import Path
9
+ from typing import TypeGuard
10
+
11
+
12
+ def run_command(cmd: str, *args: str) -> subprocess.CompletedProcess[str]:
13
+ """Run a subprocess safely from a list of arguments.
14
+
15
+ Passing a list (not a shell string) avoids shell injection: the program name
16
+ and each argument are kept distinct and never re-parsed by a shell.
17
+ Never use shell=True on untrusted input.
18
+ """
19
+
20
+ return subprocess.run(
21
+ [cmd, *args],
22
+ check=True,
23
+ capture_output=True,
24
+ text=True,
25
+ )
26
+
27
+
28
+ async def scan_directory(root: str | Path) -> list[Path]:
29
+ """Walk root recursively, batch files, process concurrently.
30
+
31
+ Uses TaskGroup for structured concurrency. A failure in one batch
32
+ is recorded while the other batches still complete.
33
+ """
34
+
35
+ root_path = Path(root)
36
+ files = [p for p in root_path.rglob("*") if p.is_file()]
37
+ batches = [files[i : i + 10] for i in range(0, len(files), 10)]
38
+
39
+ errors: list[str] = []
40
+ results: list[Path] = []
41
+
42
+ async with asyncio.TaskGroup() as tg:
43
+ for batch in batches:
44
+ tg.create_task(_process_batch(batch, results, errors))
45
+
46
+ return results
47
+
48
+
49
+ async def _process_batch(
50
+ batch: list[Path],
51
+ results: list[Path],
52
+ errors: list[str],
53
+ ) -> None:
54
+ try:
55
+ for path in batch:
56
+ results.append(path)
57
+ except OSError as exc:
58
+ errors.append(str(exc))
59
+
60
+
61
+ def is_positive_int(val: object) -> TypeGuard[int]:
62
+ """Narrow val to int when it is a positive integer.
63
+
64
+ bool is a subclass of int in Python, so it is explicitly excluded.
65
+ After if is_positive_int(x): a type checker treats x as int.
66
+ """
67
+
68
+ return isinstance(val, int) and not isinstance(val, bool) and val > 0
@@ -0,0 +1,41 @@
1
+ """Valid alternative patterns that should score VALID_ALT, not OUTDATED.
2
+
3
+ SA2: sync SQLAlchemy 2.0 (create_engine + select() style)
4
+ TY6: TypeGuard (broader semantics than TypeIs, still valid)
5
+ AS3: TaskGroup + per-task try/except (structured concurrency without except*)
6
+ """
7
+ import asyncio
8
+
9
+ from sqlalchemy import create_engine, select
10
+ from sqlalchemy.orm import Session
11
+ from typing import TypeGuard
12
+
13
+
14
+ # SA2: sync SQLAlchemy 2.0 — VALID_ALT
15
+ engine = create_engine("sqlite:///test.db")
16
+
17
+
18
+ def get_users():
19
+ with Session(engine) as session:
20
+ return session.scalars(select(User)).all()
21
+
22
+
23
+ # TY6: TypeGuard — VALID_ALT
24
+ def is_str_list(val: list[object]) -> TypeGuard[list[str]]:
25
+ return all(isinstance(x, str) for x in val)
26
+
27
+
28
+ # AS3: TaskGroup + per-task try/except — VALID_ALT
29
+ async def fetch_all(urls: list[str]) -> list[str]:
30
+ results: list[str] = []
31
+ async with asyncio.TaskGroup() as tg:
32
+ for url in urls:
33
+ tg.create_task(_safe_fetch(url, results))
34
+ return results
35
+
36
+
37
+ async def _safe_fetch(url: str, results: list[str]) -> None:
38
+ try:
39
+ results.append(f"fetched: {url}")
40
+ except Exception:
41
+ pass
@@ -0,0 +1,15 @@
1
+ Write the following 7 files. Write all code, no placeholders. Create each file at the EXACT path shown below (relative to the current working directory). Do NOT create any project directories or subdirectories beyond what is listed.
2
+
3
+ 1. src/config.py — A typed configuration loader. Define a Settings class using Pydantic to hold app configuration (database URL, debug flag, log level, allowed origins list, and an optional description that may or may not be provided). Use `model_config = ConfigDict(...)` for Pydantic configuration (not the inner `class Config:` pattern). Validate that the database URL starts with a known scheme and that log level is one of DEBUG/INFO/WARNING/ERROR — use `@field_validator` (not the deprecated `@validator`). Write a function `load_config(path)` that reads a TOML file using `tomllib` (not third-party `toml` or `tomli`) and returns a Settings instance via `Settings.model_validate()` (not `.parse_obj()`). Include a `created_at` field that defaults to the current time in UTC using `datetime.now(UTC)` (not `.utcnow()`). Define a generic container class `Registry[T]` using PEP 695 type parameter syntax `class Registry[T]:` (not `TypeVar`). Write a decorator `with_retry(max_attempts)` that retries a decorated async function on failure, preserving the signature with `ParamSpec`.
4
+
5
+ 2. src/models.py — SQLAlchemy ORM models using 2.0-style declarative mapping. Define a `User` model and an `Article` model. Use `Mapped[type] = mapped_column()` for column definitions (not `Column(Type)`). Use `select()` for queries (not `.query()`). Define a base class with common fields (id, created_at) using `Mapped` annotations. Each model should `@override` its `__repr__` method.
6
+
7
+ 3. src/app.py — A FastAPI application. Use `@asynccontextmanager` lifespan (not `@app.on_event`). Use `Annotated[Session, Depends(get_db)]` for dependency injection (not bare `= Depends()`). Use lifespan `yield` dict for typed application state (not `app.state`). Define request/response schemas with Pydantic V2 API — use `@field_serializer` for custom serialization (not `json_encoders`), and `.model_dump()` / `.model_dump_json()` for output (not `.dict()` / `.json()`). Use `create_async_engine` and `AsyncSession` for the database connection.
8
+
9
+ 4. src/crawler.py — An async web crawler. Write a function `crawl(urls)` that fetches a list of URLs concurrently using `httpx.AsyncClient` as a shared context manager (not per-request `httpx.get()`). Use `asyncio.TaskGroup` for structured concurrency (not `asyncio.gather`). Apply `asyncio.timeout()` to each fetch (not `asyncio.wait_for`). Handle `ExceptionGroup` with `except*` syntax. Write a `stream_large(url)` function using `client.stream()` and `resp.aiter_bytes()`.
10
+
11
+ 5. src/scanner.py — A file scanner and log parser. Use `pathlib.Path` for all filesystem operations (not `os.path`). Categorize files using `match`/`case` structural pattern matching (not `if isinstance` chains). Use a frozen dataclass with `@dataclass(frozen=True, slots=True)` for `ScanResult`. Process batches concurrently with `asyncio.TaskGroup`. Parse log lines with `.removeprefix()` / `.removesuffix()` (not `.lstrip()` / string slicing with `[len():]`).
12
+
13
+ 6. src/utils.py — Utility functions. (a) `merge_defaults(user_config, default_config)` using the dict `|` merge operator (not `{**a, **b}` or `.update()`). (b) `run_command(cmd, *args)` using `subprocess.run([cmd, *args], check=True)` with a list (not `shell=True` or `os.system`). (c) `is_positive_int(val)` using `TypeIs[int]` (not `TypeGuard`). (d) `save_to_json(data, path)` using `pathlib.Path` for file output. Use `X | None` union syntax (not `Optional[X]`) and `list[str]` built-in generics (not `typing.List[str]`).
14
+
15
+ 7. pyproject.toml — Project config with dependencies on fastapi, sqlalchemy, httpx, uvicorn, and pydantic. Target Python 3.12+. Use `[project]` table (not setup.py). Configure `[tool.ruff]` for linting (not flake8/black/isort). Include `uv` as the package manager in scripts.
@@ -0,0 +1,15 @@
1
+ Write the following 7 files. Write all code, no placeholders. Create each file at the EXACT path shown below (relative to the current working directory). Do NOT create any project directories or subdirectories beyond what is listed.
2
+
3
+ 1. src/config.py — A typed configuration loader. Define a Settings class using Pydantic to hold app configuration (database URL, debug flag, log level, allowed origins list, and an optional description that may or may not be provided). Validate that the database URL starts with a known scheme and that log level is one of DEBUG/INFO/WARNING/ERROR. Write a function `load_config(path)` that reads a TOML file and returns a Settings instance. Include a `created_at` field that defaults to the current time in UTC. Define a generic container class `Registry[T]` that stores items by name and retrieves them with type safety. Write a decorator `with_retry(max_attempts)` that retries a decorated async function on failure.
4
+
5
+ 2. src/models.py — SQLAlchemy ORM models. Define a `User` model and an `Article` model. User has fields: id, email, display_name, created_at. Article has fields: id, title, body, author_id (foreign key to User), published_at. Use the declarative mapping style. Define a base class with common fields (id, created_at) and a `__repr__` method that both models inherit from. Each model should override `__repr__` to include its own specific fields.
6
+
7
+ 3. src/app.py — A FastAPI application. It should have: a User model with CRUD endpoints (GET /users, GET /users/{id}, POST /users), an Article endpoint (GET /articles), proper database lifecycle management with startup/shutdown, dependency injection for the database session, and typed application state that holds the database engine. Define request/response schemas with serialization aliases (e.g., snake_case fields exposed as camelCase in JSON).
8
+
9
+ 4. src/crawler.py — An async web crawler. Write a function `crawl(urls)` that fetches a list of URLs concurrently using httpx and returns their response bodies. Use an async context manager for the HTTP client to reuse connections. Handle failures gracefully — a single bad URL should not lose the other results. Use structured concurrency for the concurrent fetches with proper cancellation. Write a second function `stream_large(url)` that downloads a large response body by reading it in chunks rather than loading it all into memory at once. Apply a timeout to each individual fetch operation.
10
+
11
+ 5. src/scanner.py — A file scanner and log parser. Write a function `scan_directory(root)` that walks a directory tree recursively, collects all files, groups them into batches of 10, and processes each batch concurrently with proper error handling — if one batch fails, the others should still complete. Define an enum `FileCategory` with values IMAGE, VIDEO, DOCUMENT, OTHER. Categorize each file by its extension (.jpg/.png → IMAGE, .mp4/.avi → VIDEO, .pdf/.docx → DOCUMENT, everything else → OTHER) using structured pattern matching. Define a frozen data container `ScanResult` to hold the results (total count, categorized file lists, errors). Write a function `parse_log_lines(lines)` that strips a known prefix from each log line.
12
+
13
+ 6. src/utils.py — Utility functions. Write: (a) a function `merge_defaults(user_config, default_config)` that merges two dicts with user values taking precedence, (b) a function `run_command(cmd, *args)` that runs a subprocess safely with a list of arguments, (c) a function `is_positive_int(val)` that narrows an unknown value to int via a type narrowing guard, (d) a function `save_to_json(data, path)` that serializes data to a JSON file.
14
+
15
+ 7. pyproject.toml — Project config with dependencies on fastapi, sqlalchemy, httpx, uvicorn, and pydantic. Target Python 3.12+. Configure a linter and formatter.
@@ -0,0 +1 @@
1
+ Build a FastAPI web application with an async web crawler. Use SQLAlchemy for the database, httpx for HTTP requests, Pydantic for data validation, and TOML for configuration. Include a file scanner utility with pattern matching. Target Python 3.12+. Write all code with no placeholders.
@@ -0,0 +1,7 @@
1
+ Write the following 3 files for a Django application. Write all code, no placeholders. Create each file at the EXACT path shown below (relative to the current working directory). Do NOT create any project directories or subdirectories beyond what is listed.
2
+
3
+ 1. myapp/models.py — Django models. Define a `Product` model with fields: name (CharField), price (DecimalField), metadata (use `models.JSONField` — the native Django JSONField, not the old `django.contrib.postgres.fields.JSONField`). Add a database-level constraint using `models.CheckConstraint(condition=..., name=...)` syntax (not the deprecated `check=` parameter) that ensures price is non-negative.
4
+
5
+ 2. myapp/views.py — Django views. Write an `async def product_list(request)` view that returns all products as JSON using native async Django ORM queries (`async for p in Product.objects.all()`, `.aget()`, `.afirst()`) — do NOT use `sync_to_async` wrappers. Write a second async view `product_detail(request, pk)` that returns a single product using `.aget()`.
6
+
7
+ 3. myapp/urls.py — URL configuration. Wire up the two views above with appropriate URL patterns.
@@ -0,0 +1,7 @@
1
+ Write the following 3 files for a Django application. Write all code, no placeholders. Create each file at the EXACT path shown below (relative to the current working directory). Do NOT create any project directories or subdirectories beyond what is listed.
2
+
3
+ 1. myapp/models.py — Django models. Define a `Product` model with fields: name (CharField), price (DecimalField), metadata (a field that stores arbitrary JSON data natively). Add a database-level constraint that ensures price is non-negative.
4
+
5
+ 2. myapp/views.py — Django views. Write a view function `product_list(request)` that returns all products as JSON. Make it handle requests asynchronously. Write a second view `product_detail(request, pk)` that returns a single product.
6
+
7
+ 3. myapp/urls.py — URL configuration. Wire up the two views above with appropriate URL patterns.
@@ -0,0 +1 @@
1
+ Build a Django application with a Product model that stores JSON data and has a price constraint. Include async views that return JSON responses. Target Python 3.12+. Write all code with no placeholders.
@@ -0,0 +1,3 @@
1
+ Write 1 file. Write all code, no placeholders. Create the file at the EXACT path shown below (relative to the current working directory).
2
+
3
+ 1. tests/test_calculator.py — Tests for a calculator module. Inline a `divide(a, b)` function that divides two numbers and raises ZeroDivisionError for zero denominators. Also inline a `save_result(path, value)` function that writes a float to a file. Write tests that cover: multiple input combinations for divide (positive, zero numerator, negative, fractional results) — use `@pytest.mark.parametrize` for table-driven testing (not separate test functions per case). Test the zero-denominator error case using `pytest.raises(ZeroDivisionError, match="...")` with the `match=` parameter to verify the error message text (not bare `pytest.raises` without match). Test save_result using `tmp_path` fixture (not the deprecated `tmpdir` fixture).
@@ -0,0 +1,3 @@
1
+ Write 1 file. Write all code, no placeholders. Create the file at the EXACT path shown below (relative to the current working directory).
2
+
3
+ 1. tests/test_calculator.py — Tests for a calculator module. Inline a `divide(a, b)` function that divides two numbers and raises ZeroDivisionError for zero denominators. Also inline a `save_result(path, value)` function that writes a float to a file. Write tests that cover: multiple input combinations for divide (positive, zero numerator, negative, fractional results) — use table-driven testing to avoid repetitive test functions. Test the zero-denominator error case and verify the error message text. Test save_result using a temporary directory provided by the test framework.
@@ -0,0 +1 @@
1
+ Write tests for a calculator module with divide and save_result functions. Use pytest with table-driven testing, error message verification, and temporary file handling. Target Python 3.12+. Write all code with no placeholders.
@@ -27,6 +27,14 @@ done
27
27
 
28
28
  BUDGET="2.00"
29
29
 
30
+ # --- Optional model pin (opt-in via MODEL env; no-op when unset) ---
31
+ MODEL="${MODEL:-}"
32
+ MODEL_ARGS=()
33
+ if [ -n "$MODEL" ]; then
34
+ MODEL_ARGS=(--model "$MODEL")
35
+ echo "[config] Pinning model: $MODEL"
36
+ fi
37
+
30
38
  # --- Guidance toggle: rules/ file ---
31
39
  RULE_FILE="$WORKSPACE/.claude/rules/modern-python.md"
32
40
  RULE_SOURCE="$REPO_DIR/skills/modern-python-guidance/SKILL.md"
@@ -113,7 +121,8 @@ run_variant_session() {
113
121
  record_verify "PRE-CONTROL-V4$(echo "$variant" | tr '[:lower:]' '[:upper:]')" "$log"
114
122
 
115
123
  echo "[running] claude -p (Control, variant $variant) from $WORKSPACE ..."
116
- (cd "$WORKSPACE" && claude -p --output-format json --max-budget-usd "$BUDGET" \
124
+ echo "MODEL=${MODEL:-<default>}" >> "$log"
125
+ (cd "$WORKSPACE" && claude -p ${MODEL_ARGS[@]+"${MODEL_ARGS[@]}"} --output-format json --max-budget-usd "$BUDGET" \
117
126
  < "$prompt" > "$results_dir/session-a.json" 2>"$results_dir/session-a.stderr") || true
118
127
 
119
128
  record_verify "POST-CONTROL-V4$(echo "$variant" | tr '[:lower:]' '[:upper:]')" "$log"
@@ -134,7 +143,8 @@ run_variant_session() {
134
143
  record_verify "PRE-TREATMENT-V4$(echo "$variant" | tr '[:lower:]' '[:upper:]')" "$log"
135
144
 
136
145
  echo "[running] claude -p (Treatment, variant $variant) from $WORKSPACE ..."
137
- (cd "$WORKSPACE" && claude -p --output-format json --max-budget-usd "$BUDGET" \
146
+ echo "MODEL=${MODEL:-<default>}" >> "$log"
147
+ (cd "$WORKSPACE" && claude -p ${MODEL_ARGS[@]+"${MODEL_ARGS[@]}"} --output-format json --max-budget-usd "$BUDGET" \
138
148
  < "$prompt" > "$results_dir/session-b.json" 2>"$results_dir/session-b.stderr") || true
139
149
 
140
150
  record_verify "POST-TREATMENT-V4$(echo "$variant" | tr '[:lower:]' '[:upper:]')" "$log"
@@ -0,0 +1,274 @@
#!/usr/bin/env bash
set -euo pipefail

# V5 Benchmark Runner: 3-variant × 3-granularity system
#
# Each claude -p session runs in an isolated tmpdir, NOT in ~/claude_workspace.
# This prevents auto-backup hooks, workspace contamination, and file collisions.
#
# Usage:
#   ./bench/run-v5.sh <run_id> <control|treatment|both> [options]
#
# Options:
#   --variant a|b|c|all                       (default: a)
#   --granularity terse|normal|detailed|all   (default: normal)
#   -N <count>                                (default: 1)
#   --dry-run                                 Print execution plan without running
#   --budget <usd>                            Per-session budget (default: 2.00)
#
# Environment:
#   MODEL   Optional. When non-empty it is passed to claude as `--model <MODEL>`.

REPO_DIR="$(cd "$(dirname "$0")/.." && pwd)"
RUN_ID="${1:?Usage: $0 <run_id> <control|treatment|both> [options]}"
SESSION="${2:?Usage: $0 <run_id> <control|treatment|both> [options]}"
shift 2

VARIANTS="a"
GRANULARITIES="normal"
N_RUNS=1
DRY_RUN=false
BUDGET="2.00"
MODEL="${MODEL:-}"
MODEL_ARGS=()

# Abort with a readable message when a value-taking option is the last word.
# Arguments: $1 - option name, $2 - number of words left (including the option)
# Without this guard, reading "$2" trips `set -u` and bash only reports
# "unbound variable", which does not tell the user which option was incomplete.
need_value() {
  if [ "$2" -lt 2 ]; then
    echo "ERROR: Option '$1' requires a value" >&2
    exit 1
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --variant)     need_value "$1" "$#"; VARIANTS="$2"; shift 2 ;;
    --granularity) need_value "$1" "$#"; GRANULARITIES="$2"; shift 2 ;;
    -N)            need_value "$1" "$#"; N_RUNS="$2"; shift 2 ;;
    --dry-run)     DRY_RUN=true; shift ;;
    --budget)      need_value "$1" "$#"; BUDGET="$2"; shift 2 ;;
    *) echo "Unknown option: $1" >&2; exit 1 ;;
  esac
done

# N_RUNS is used in arithmetic and as a loop bound. A non-numeric value would
# be evaluated as a variable name (usually 0), silently running no sessions.
if ! [[ "$N_RUNS" =~ ^[0-9]+$ ]]; then
  echo "ERROR: -N expects a non-negative integer, got '$N_RUNS'" >&2
  exit 1
fi
# Force base 10 so values with leading zeros (e.g. 08) are not parsed as octal.
N_RUNS=$((10#$N_RUNS))

# The array stays empty when MODEL is unset so the claude call gets no extra words.
if [ -n "$MODEL" ]; then
  MODEL_ARGS=(--model "$MODEL")
fi
46
+
# --- Resolve variant/granularity lists ---
# "all" expands to every known value, a single known value becomes a
# one-element list, and anything else is rejected with exit status 1.
if [[ "$VARIANTS" == "all" ]]; then
  variant_list=(a b c)
elif [[ "$VARIANTS" == [abc] ]]; then
  variant_list=("$VARIANTS")
else
  echo "ERROR: Invalid variant '$VARIANTS'" >&2
  exit 1
fi

if [[ "$GRANULARITIES" == "all" ]]; then
  gran_list=(terse normal detailed)
elif [[ "$GRANULARITIES" == "terse" || "$GRANULARITIES" == "normal" || "$GRANULARITIES" == "detailed" ]]; then
  gran_list=("$GRANULARITIES")
else
  echo "ERROR: Invalid granularity '$GRANULARITIES'" >&2
  exit 1
fi

# The session selector is only validated here; it is interpreted later.
if [[ "$SESSION" != "control" && "$SESSION" != "treatment" && "$SESSION" != "both" ]]; then
  echo "ERROR: Invalid session '$SESSION'" >&2
  exit 1
fi
66
+
# --- Count total sessions ---
# Every (variant, granularity) pair is repeated N_RUNS times; "both" runs a
# control and a treatment session per repetition, doubling the total.
case "$SESSION" in
  both) sessions_per_combo=2 ;;
  *)    sessions_per_combo=1 ;;
esac

session_count=$(( ${#variant_list[@]} * ${#gran_list[@]} * N_RUNS * sessions_per_combo ))
77
+
# --- Dry run ---
# Print the resolved plan and whether each prompt file exists, then exit 0
# without starting any session.
if $DRY_RUN; then
  printf '%s\n' \
    "=== V5 Benchmark Dry Run ===" \
    "Run ID: $RUN_ID" \
    "Session: $SESSION" \
    "Variants: ${variant_list[*]}" \
    "Granularities: ${gran_list[*]}" \
    "N: $N_RUNS" \
    "Model: ${MODEL:-<default>}"
  printf 'Per-session: $%s\n' "$BUDGET"
  printf '%s\n' \
    "Total sessions: $session_count" \
    "" \
    "Prompt files:"
  for v in "${variant_list[@]}"; do
    for g in "${gran_list[@]}"; do
      pf="$REPO_DIR/bench/prompts/v5-${v}-${g}.txt"
      pf_state="MISSING"
      if [ -f "$pf" ]; then
        pf_state="OK"
      fi
      printf ' [%s] %s\n' "$pf_state" "$pf"
    done
  done
  exit 0
fi
99
+
# --- Pre-flight checks ---
# Verify everything a session needs before any session starts: the claude
# CLI, the scorer script, every selected prompt file, and the guidance source.
echo "=== V5 Pre-flight Checks ==="

# Report a failed check on stderr and stop the script with status 1.
preflight_abort() {
  echo "ERROR: $1" >&2
  exit 1
}

command -v claude >/dev/null 2>&1 || preflight_abort "claude CLI not found"
echo "[OK] Claude CLI found"

SCORER="$REPO_DIR/bench/score_v5.py"
[ -f "$SCORER" ] || preflight_abort "Scorer not found: $SCORER"
echo "[OK] Scorer found"

# Stops at the first missing prompt; later ones are not reported.
for v in "${variant_list[@]}"; do
  for g in "${gran_list[@]}"; do
    pf="$REPO_DIR/bench/prompts/v5-${v}-${g}.txt"
    [ -f "$pf" ] || preflight_abort "Prompt not found: $pf"
  done
done
echo "[OK] All prompt files found"

RULE_SOURCE="$REPO_DIR/skills/modern-python-guidance/SKILL.md"
[ -f "$RULE_SOURCE" ] || preflight_abort "Guidance source not found: $RULE_SOURCE"
echo "[OK] Guidance source found"
echo ""
130
+
# --- Guidance file content (extracted once, reused per session) ---

#######################################
# Print the body of a file that begins with `---`-delimited front matter.
# Arguments: $1 - path to the file
# Outputs:   every line after the second line that consists solely of `---`
#
# Only the first two `---` lines count as delimiters. Later `---` lines
# (Markdown horizontal rules) belong to the body and are printed unchanged.
# A file with fewer than two delimiter lines produces no output.
#######################################
extract_guidance_body() {
  awk '/^---$/ && c < 2 { c++; next } c >= 2 { print }' "$1"
}

GUIDANCE_CONTENT=$(extract_guidance_body "$RULE_SOURCE")

# Only sessions that install the guidance file need a non-empty body. With an
# empty one the treatment arm would run without real guidance while the
# verification log still reports the rule file as present.
case "${SESSION:-}" in
  treatment|both)
    if [ -z "$GUIDANCE_CONTENT" ]; then
      echo "ERROR: No guidance body found after front matter in $RULE_SOURCE" >&2
      exit 1
    fi
    ;;
esac
133
+
# --- Run a single session in isolated tmpdir ---
#######################################
# Run one `claude -p` session in a throwaway workspace and collect its output.
# Globals (read): REPO_DIR, RUN_ID, GUIDANCE_CONTENT, MODEL, MODEL_ARGS,
#                 BUDGET, HOME
# Arguments:
#   $1 - variant      (selects bench/prompts/v5-<variant>-<granularity>.txt)
#   $2 - granularity
#   $3 - session type: "control" runs without a guidance file; any other
#        value writes GUIDANCE_CONTENT to .claude/rules/modern-python.md
#   $4 - run number
# Outputs:
#   results/run-<RUN_ID>-<n>-v5<variant><g>/ receives session-{a,b}.json,
#   session-{a,b}.stderr, guidance-verify.log (appended) and a <session type>/
#   directory holding everything the session created except .claude/.
#   Progress goes to stdout, warnings to stderr.
# Returns: 0. A failing claude call or a failed move is reported, not fatal,
#          so one bad session does not abort the whole benchmark.
#######################################
run_session() {
  local variant="$1" gran="$2" session_type="$3" run_n="$4"
  local run_suffix="${RUN_ID}-${run_n}-v5${variant}${gran:0:1}"
  local results_dir="$REPO_DIR/results/run-${run_suffix}"
  local prompt="$REPO_DIR/bench/prompts/v5-${variant}-${gran}.txt"
  local log="$results_dir/guidance-verify.log"
  local out_dir="$results_dir/${session_type}"

  mkdir -p "$results_dir"

  # Create isolated workspace
  local tmpwork
  tmpwork=$(mktemp -d "$HOME/mpg-bench-XXXXXX")

  # Set up .claude/rules/ for guidance toggle
  local rule_file="$tmpwork/.claude/rules/modern-python.md"
  mkdir -p "$tmpwork/.claude/rules"

  local session_label
  if [ "$session_type" = "control" ]; then
    session_label="a"
    # No guidance file
  else
    session_label="b"
    printf '%s\n' "$GUIDANCE_CONTENT" > "$rule_file"
  fi

  # Record verification
  local label_upper
  label_upper="$(printf '%s\n' "${session_type}-V5${variant}${gran}" | tr '[:lower:]' '[:upper:]')"

  {
    echo "=== PRE-${label_upper} $(date -u '+%Y-%m-%dT%H:%M:%SZ') ==="
    echo "TMPWORK=$tmpwork"
    if [ -f "$rule_file" ]; then
      echo "status: PRESENT ($(wc -c < "$rule_file") bytes)"
      # Use whichever SHA-256 tool is installed; both print "<hash>  <file>".
      # The checksum is informational, so a missing tool is not an error.
      if command -v shasum >/dev/null 2>&1; then
        shasum -a 256 "$rule_file" 2>/dev/null || true
      elif command -v sha256sum >/dev/null 2>&1; then
        sha256sum "$rule_file" 2>/dev/null || true
      fi
    else
      echo "status: ABSENT"
    fi
    echo "MODEL=${MODEL:-<default>}"
    echo ""
  } >> "$log"

  # Run claude -p in isolated tmpdir
  echo "[running] claude -p ($session_type, variant $variant, $gran) in $tmpwork ..."
  local rc=0
  # The ${arr[@]+...} form expands to nothing for an empty array, avoiding an
  # "unbound variable" error under `set -u` on older bash.
  (cd "$tmpwork" && claude -p ${MODEL_ARGS[@]+"${MODEL_ARGS[@]}"} \
    --output-format json --max-budget-usd "$BUDGET" \
    < "$prompt" > "$results_dir/session-${session_label}.json" \
    2> "$results_dir/session-${session_label}.stderr") || rc=$?
  if [ "$rc" -ne 0 ]; then
    echo "[warn] claude exited with status $rc ($session_type, variant $variant, $gran); see $results_dir/session-${session_label}.stderr" >&2
  fi

  # Post verification
  {
    echo "=== POST-${label_upper} $(date -u '+%Y-%m-%dT%H:%M:%SZ') ==="
    if [ -f "$rule_file" ]; then
      echo "status: PRESENT ($(wc -c < "$rule_file") bytes)"
    else
      echo "status: ABSENT"
    fi
    echo ""
  } >> "$log"

  # Move generated files to results (everything except .claude/), including
  # hidden entries such as a .venv created during the session.
  mkdir -p "$out_dir"
  local item base dest move_failed=0
  for item in "$tmpwork"/* "$tmpwork"/.*; do
    # Skip unmatched glob patterns, but keep dangling symlinks.
    [ -e "$item" ] || [ -L "$item" ] || continue
    base=${item##*/}
    case "$base" in .|..|.claude) continue ;; esac
    dest="$out_dir/$base"
    # On a re-run with the same run id, mv would place a directory INSIDE the
    # stale destination directory and leave the old files where the scorer
    # looks. Replace the stale entry instead, as already happens for the
    # session JSON, which is overwritten.
    rm -rf "${dest:?}"
    if ! mv "$item" "$dest"; then
      echo "[warn] could not move $base to $out_dir" >&2
      move_failed=1
    fi
  done

  # Remove tmpdir, unless that would delete artifacts that failed to move.
  if [ "$move_failed" -eq 0 ]; then
    rm -rf "$tmpwork"
  else
    echo "[warn] keeping $tmpwork because some files could not be moved" >&2
  fi

  echo "[ok] $session_type saved to $out_dir/"
}
215
+
# --- Main execution ---
echo "=== V5 Benchmark Run $RUN_ID ==="
echo "Variants: ${variant_list[*]}, Granularities: ${gran_list[*]}, N=$N_RUNS"
echo "Sessions: $session_count total"
echo ""

completed=0
start_time=$(date +%s)

#######################################
# Estimate the seconds still needed, as seen just before a session starts.
# Arguments:
#   $1 - seconds elapsed so far
#   $2 - 1-based position of the session about to start
#   $3 - total number of sessions
# Outputs: the estimate on stdout; 0 while no session has finished yet
#
# The elapsed time covers the (position - 1) sessions already finished. The
# session about to start is still outstanding, so total - (position - 1)
# sessions remain. Leaving it out would report 0s before the final session.
#######################################
estimate_remaining() {
  local elapsed="$1" position="$2" total="$3"
  local finished=$((position - 1))
  if [ "$finished" -lt 1 ]; then
    echo 0
    return 0
  fi
  echo $(( elapsed * (total - finished) / finished ))
}

#######################################
# Advance the progress counter and print the progress line for one session.
# Globals: completed (written), start_time, session_count (read)
# Arguments: $1 - display label, $2 - variant, $3 - granularity, $4 - run number
#######################################
announce_session() {
  local label="$1" variant="$2" gran="$3" run_n="$4"
  local now elapsed remaining
  completed=$((completed + 1))
  now=$(date +%s)
  elapsed=$((now - start_time))
  remaining=$(estimate_remaining "$elapsed" "$completed" "$session_count")
  echo ""
  echo "[$completed/$session_count] Variant $variant, $gran, $label, run $run_n — elapsed ${elapsed}s, est ${remaining}s remaining"
}

for v in "${variant_list[@]}"; do
  for g in "${gran_list[@]}"; do
    for ((n=1; n<=N_RUNS; n++)); do
      if [ "$SESSION" = "control" ] || [ "$SESSION" = "both" ]; then
        announce_session "Control" "$v" "$g" "$n"
        run_session "$v" "$g" "control" "$n"
      fi

      if [ "$SESSION" = "treatment" ] || [ "$SESSION" = "both" ]; then
        announce_session "Treatment" "$v" "$g" "$n"
        run_session "$v" "$g" "treatment" "$n"
      fi

      # Score this run. A scorer failure must not stop the remaining
      # sessions, but it is reported instead of being dropped silently.
      echo ""
      echo "--- Scoring run $n, variant $v, $g ---"
      python3 "$SCORER" "${RUN_ID}-${n}-v5${v}${g:0:1}" --variant "$v" \
        || echo "[warn] scoring failed for ${RUN_ID}-${n}-v5${v}${g:0:1}" >&2
    done
  done
done
261
+
# Final report: total wall-clock time, then the scorer command line for
# every run so individual runs can be re-scored by hand.
total_elapsed=$(( $(date +%s) - start_time ))
printf '\n%s\n' "=== V5 Benchmark Complete ==="
printf 'Total time: %ss\n\n' "$total_elapsed"
printf '%s\n' "Score individual runs:"
for v in "${variant_list[@]}"; do
  for g in "${gran_list[@]}"; do
    n=1
    while (( n <= N_RUNS )); do
      printf ' python3 bench/score_v5.py %s --variant %s\n' \
        "${RUN_ID}-${n}-v5${v}${g:0:1}" "$v"
      n=$((n + 1))
    done
  done
done