llm-code-validator 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llm_code_validator-0.1.0/LICENSE +21 -0
- llm_code_validator-0.1.0/PKG-INFO +220 -0
- llm_code_validator-0.1.0/README.md +209 -0
- llm_code_validator-0.1.0/llm_code_validator/__init__.py +3 -0
- llm_code_validator-0.1.0/llm_code_validator/benchmark.py +141 -0
- llm_code_validator-0.1.0/llm_code_validator/cli.py +105 -0
- llm_code_validator-0.1.0/llm_code_validator/core.py +359 -0
- llm_code_validator-0.1.0/llm_code_validator/diagnostics.py +61 -0
- llm_code_validator-0.1.0/llm_code_validator/fixes.py +66 -0
- llm_code_validator-0.1.0/llm_code_validator/formatting.py +43 -0
- llm_code_validator-0.1.0/llm_code_validator/library_signatures.json +842 -0
- llm_code_validator-0.1.0/llm_code_validator/signatures.py +163 -0
- llm_code_validator-0.1.0/llm_code_validator/versioning.py +153 -0
- llm_code_validator-0.1.0/llm_code_validator.egg-info/PKG-INFO +220 -0
- llm_code_validator-0.1.0/llm_code_validator.egg-info/SOURCES.txt +28 -0
- llm_code_validator-0.1.0/llm_code_validator.egg-info/dependency_links.txt +1 -0
- llm_code_validator-0.1.0/llm_code_validator.egg-info/entry_points.txt +2 -0
- llm_code_validator-0.1.0/llm_code_validator.egg-info/requires.txt +3 -0
- llm_code_validator-0.1.0/llm_code_validator.egg-info/top_level.txt +1 -0
- llm_code_validator-0.1.0/pyproject.toml +26 -0
- llm_code_validator-0.1.0/setup.cfg +4 -0
- llm_code_validator-0.1.0/tests/test_benchmark.py +35 -0
- llm_code_validator-0.1.0/tests/test_cli.py +142 -0
- llm_code_validator-0.1.0/tests/test_core.py +220 -0
- llm_code_validator-0.1.0/tests/test_external_repo_evaluation.py +31 -0
- llm_code_validator-0.1.0/tests/test_fixes.py +119 -0
- llm_code_validator-0.1.0/tests/test_formatting.py +42 -0
- llm_code_validator-0.1.0/tests/test_signatures.py +86 -0
- llm_code_validator-0.1.0/tests/test_versioning.py +79 -0
- llm_code_validator-0.1.0/tests/test_workflows.py +23 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Felix Mathew
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llm-code-validator
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CLI guardrail for catching stale Python APIs before runtime.
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Provides-Extra: dev
|
|
9
|
+
Requires-Dist: pytest>=8; extra == "dev"
|
|
10
|
+
Dynamic: license-file
|
|
11
|
+
|
|
12
|
+
# llm-code-validator
|
|
13
|
+
|
|
14
|
+
Python CLI for detecting stale or version-incompatible third-party API usage in Python source code.
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
Install from the repository:
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
git clone https://github.com/mathew-felix/llm-code-validator
|
|
22
|
+
cd llm-code-validator
|
|
23
|
+
pip install -e .
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Install with test dependencies:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install -e ".[dev]"
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
After the package is published to PyPI:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install llm-code-validator
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
Check one file:
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
llm-code-validator check file.py
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Check a directory:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
llm-code-validator check src/
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Check standard input:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
llm-code-validator check - < snippet.py
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
Check staged Git files:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
llm-code-validator check --staged
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Use an explicit dependency file:
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
llm-code-validator check --requirements requirements.txt src/
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Show low-confidence diagnostics:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
llm-code-validator check --show-low-confidence src/
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Exit codes:
|
|
77
|
+
|
|
78
|
+
- `0`: no diagnostics
|
|
79
|
+
- `1`: diagnostics found
|
|
80
|
+
- `2`: tool error
|
|
81
|
+
|
|
82
|
+
## Output
|
|
83
|
+
|
|
84
|
+
Text output:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
llm-code-validator check src/
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
JSON output:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
llm-code-validator check src/ --format json
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
GitHub Actions annotation output:
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
llm-code-validator check src/ --format github
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Fixes
|
|
103
|
+
|
|
104
|
+
Preview fixes:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
llm-code-validator fix file.py
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Apply safe fixes:
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
llm-code-validator fix file.py --write
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Only rules marked `safe_fix` are written. Rules marked `suggested_fix` or `no_fix` are reported but not changed.
|
|
117
|
+
|
|
118
|
+
Current rule safety counts:
|
|
119
|
+
|
|
120
|
+
- `safe_fix`: 15 rules
|
|
121
|
+
- `suggested_fix`: 51 rules
|
|
122
|
+
- `no_fix`: 2 rules
|
|
123
|
+
|
|
124
|
+
## Signature Database
|
|
125
|
+
|
|
126
|
+
Validate the rule database:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
llm-code-validator validate-signatures
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
The source rule database is:
|
|
133
|
+
|
|
134
|
+
```text
|
|
135
|
+
data/library_signatures.json
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
The packaged rule database is:
|
|
139
|
+
|
|
140
|
+
```text
|
|
141
|
+
llm_code_validator/library_signatures.json
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
Current rule count:
|
|
145
|
+
|
|
146
|
+
- 68 API-drift rules
|
|
147
|
+
|
|
148
|
+
## Benchmarks
|
|
149
|
+
|
|
150
|
+
Run the CLI benchmark dataset:
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
python -m llm_code_validator.benchmark --dataset validation_dataset/cli_benchmark_cases.json
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Run the AI-stack benchmark dataset:
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
python -m llm_code_validator.benchmark --dataset validation_dataset/ai_stack_benchmark_cases.json
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
Current saved benchmark results:
|
|
163
|
+
|
|
164
|
+
- CLI dataset: precision `1.0`, recall `1.0`, p50 `0.243ms`, p95 `6.199ms`
|
|
165
|
+
- AI-stack dataset: precision `1.0`, recall `1.0`, p50 `0.444ms`, p95 `4.939ms`
|
|
166
|
+
|
|
167
|
+
## Pre-Commit
|
|
168
|
+
|
|
169
|
+
`.pre-commit-hooks.yaml` is included:
|
|
170
|
+
|
|
171
|
+
```yaml
|
|
172
|
+
repos:
|
|
173
|
+
- repo: https://github.com/mathew-felix/llm-code-validator
|
|
174
|
+
rev: v0.1.0
|
|
175
|
+
hooks:
|
|
176
|
+
- id: llm-code-validator
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
## GitHub Actions
|
|
180
|
+
|
|
181
|
+
Example workflow:
|
|
182
|
+
|
|
183
|
+
```yaml
|
|
184
|
+
name: API Drift Check
|
|
185
|
+
|
|
186
|
+
on:
|
|
187
|
+
pull_request:
|
|
188
|
+
|
|
189
|
+
jobs:
|
|
190
|
+
api-drift:
|
|
191
|
+
runs-on: ubuntu-latest
|
|
192
|
+
steps:
|
|
193
|
+
- uses: actions/checkout@v4
|
|
194
|
+
- uses: actions/setup-python@v5
|
|
195
|
+
with:
|
|
196
|
+
python-version: "3.11"
|
|
197
|
+
- run: pip install llm-code-validator
|
|
198
|
+
- run: llm-code-validator check . --format github
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## Testing
|
|
202
|
+
|
|
203
|
+
Run the test suite:
|
|
204
|
+
|
|
205
|
+
```bash
|
|
206
|
+
pytest -q
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
Current local result:
|
|
210
|
+
|
|
211
|
+
```text
|
|
212
|
+
72 passed
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## Documentation
|
|
216
|
+
|
|
217
|
+
- `docs/demo.md`: example check and fix workflow
|
|
218
|
+
- `docs/rules.md`: rule database notes
|
|
219
|
+
- `docs/release.md`: package release steps
|
|
220
|
+
- `PROJECT_REPORT.md`: project report
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
# llm-code-validator
|
|
2
|
+
|
|
3
|
+
Python CLI for detecting stale or version-incompatible third-party API usage in Python source code.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
Install from the repository:
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
git clone https://github.com/mathew-felix/llm-code-validator
|
|
11
|
+
cd llm-code-validator
|
|
12
|
+
pip install -e .
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Install with test dependencies:
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install -e ".[dev]"
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
After the package is published to PyPI:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install llm-code-validator
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
## Usage
|
|
28
|
+
|
|
29
|
+
Check one file:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
llm-code-validator check file.py
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Check a directory:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
llm-code-validator check src/
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Check standard input:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
llm-code-validator check - < snippet.py
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Check staged Git files:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
llm-code-validator check --staged
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Use an explicit dependency file:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
llm-code-validator check --requirements requirements.txt src/
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Show low-confidence diagnostics:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
llm-code-validator check --show-low-confidence src/
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Exit codes:
|
|
66
|
+
|
|
67
|
+
- `0`: no diagnostics
|
|
68
|
+
- `1`: diagnostics found
|
|
69
|
+
- `2`: tool error
|
|
70
|
+
|
|
71
|
+
## Output
|
|
72
|
+
|
|
73
|
+
Text output:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
llm-code-validator check src/
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
JSON output:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
llm-code-validator check src/ --format json
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
GitHub Actions annotation output:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
llm-code-validator check src/ --format github
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
## Fixes
|
|
92
|
+
|
|
93
|
+
Preview fixes:
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
llm-code-validator fix file.py
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Apply safe fixes:
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
llm-code-validator fix file.py --write
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Only rules marked `safe_fix` are written. Rules marked `suggested_fix` or `no_fix` are reported but not changed.
|
|
106
|
+
|
|
107
|
+
Current rule safety counts:
|
|
108
|
+
|
|
109
|
+
- `safe_fix`: 15 rules
|
|
110
|
+
- `suggested_fix`: 51 rules
|
|
111
|
+
- `no_fix`: 2 rules
|
|
112
|
+
|
|
113
|
+
## Signature Database
|
|
114
|
+
|
|
115
|
+
Validate the rule database:
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
llm-code-validator validate-signatures
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
The source rule database is:
|
|
122
|
+
|
|
123
|
+
```text
|
|
124
|
+
data/library_signatures.json
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
The packaged rule database is:
|
|
128
|
+
|
|
129
|
+
```text
|
|
130
|
+
llm_code_validator/library_signatures.json
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Current rule count:
|
|
134
|
+
|
|
135
|
+
- 68 API-drift rules
|
|
136
|
+
|
|
137
|
+
## Benchmarks
|
|
138
|
+
|
|
139
|
+
Run the CLI benchmark dataset:
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
python -m llm_code_validator.benchmark --dataset validation_dataset/cli_benchmark_cases.json
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Run the AI-stack benchmark dataset:
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
python -m llm_code_validator.benchmark --dataset validation_dataset/ai_stack_benchmark_cases.json
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Current saved benchmark results:
|
|
152
|
+
|
|
153
|
+
- CLI dataset: precision `1.0`, recall `1.0`, p50 `0.243ms`, p95 `6.199ms`
|
|
154
|
+
- AI-stack dataset: precision `1.0`, recall `1.0`, p50 `0.444ms`, p95 `4.939ms`
|
|
155
|
+
|
|
156
|
+
## Pre-Commit
|
|
157
|
+
|
|
158
|
+
`.pre-commit-hooks.yaml` is included:
|
|
159
|
+
|
|
160
|
+
```yaml
|
|
161
|
+
repos:
|
|
162
|
+
- repo: https://github.com/mathew-felix/llm-code-validator
|
|
163
|
+
rev: v0.1.0
|
|
164
|
+
hooks:
|
|
165
|
+
- id: llm-code-validator
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## GitHub Actions
|
|
169
|
+
|
|
170
|
+
Example workflow:
|
|
171
|
+
|
|
172
|
+
```yaml
|
|
173
|
+
name: API Drift Check
|
|
174
|
+
|
|
175
|
+
on:
|
|
176
|
+
pull_request:
|
|
177
|
+
|
|
178
|
+
jobs:
|
|
179
|
+
api-drift:
|
|
180
|
+
runs-on: ubuntu-latest
|
|
181
|
+
steps:
|
|
182
|
+
- uses: actions/checkout@v4
|
|
183
|
+
- uses: actions/setup-python@v5
|
|
184
|
+
with:
|
|
185
|
+
python-version: "3.11"
|
|
186
|
+
- run: pip install llm-code-validator
|
|
187
|
+
- run: llm-code-validator check . --format github
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
## Testing
|
|
191
|
+
|
|
192
|
+
Run the test suite:
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
pytest -q
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
Current local result:
|
|
199
|
+
|
|
200
|
+
```text
|
|
201
|
+
72 passed
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
## Documentation
|
|
205
|
+
|
|
206
|
+
- `docs/demo.md`: example check and fix workflow
|
|
207
|
+
- `docs/rules.md`: rule database notes
|
|
208
|
+
- `docs/release.md`: package release steps
|
|
209
|
+
- `PROJECT_REPORT.md`: project report
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import platform
|
|
6
|
+
import statistics
|
|
7
|
+
import time
|
|
8
|
+
import tracemalloc
|
|
9
|
+
from datetime import date
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from .core import check_file, check_source, iter_python_files
|
|
13
|
+
from .versioning import build_version_context
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def run_benchmark(paths: list[str]) -> dict[str, object]:
    """Time unlabeled checks over *paths* and return a benchmark payload.

    Runs ``check_file`` on every Python file found under *paths*, recording
    per-file wall-clock timings and peak traced memory.  The returned mapping
    mirrors the labeled-benchmark payload, but the accuracy fields
    (``precision``/``recall``/``false_positives``/``false_negatives``) are
    ``None`` because ad-hoc paths carry no ground-truth labels.
    """
    files = iter_python_files(paths)
    version_context = build_version_context(paths)
    timings: list[float] = []
    diagnostics = 0
    tracemalloc.start()
    # try/finally guarantees tracing is stopped even if a file check raises;
    # otherwise tracemalloc would keep running in the caller's process.
    try:
        start = time.perf_counter()
        for path in files:
            file_start = time.perf_counter()
            result = check_file(path, version_context)
            timings.append(time.perf_counter() - file_start)
            diagnostics += len(result.diagnostics)
        total = time.perf_counter() - start
        _, peak = tracemalloc.get_traced_memory()
    finally:
        tracemalloc.stop()

    p50 = statistics.median(timings) if timings else 0.0
    # statistics.quantiles(n=20)[18] is the p95 estimate; it is only
    # meaningful with ~20+ samples, so fall back to max() for small runs.
    p95 = statistics.quantiles(timings, n=20)[18] if len(timings) >= 20 else (max(timings) if timings else 0.0)
    files_per_second = len(files) / total if total else 0.0
    return {
        "files": len(files),
        "diagnostics": diagnostics,
        "total_seconds": total,
        "p50_ms": p50 * 1000,
        "p95_ms": p95 * 1000,
        "files_per_second": files_per_second,
        "peak_ram_mb": peak / (1024 * 1024),
        "hardware": platform.machine(),
        "os": platform.platform(),
        "python_version": platform.python_version(),
        # No labels for ad-hoc paths, so accuracy metrics are unknown.
        "precision": None,
        "recall": None,
        "false_positives": None,
        "false_negatives": None,
    }
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def run_labeled_benchmark(dataset_path: str | Path) -> dict[str, object]:
    """Run a labeled benchmark dataset and return an accuracy/timing payload.

    The dataset is a JSON list of cases; each case has an ``id``, ``code``,
    an optional ``path``, and ``expected_diagnostics`` (a list of
    ``{"library": ..., "symbol": ...}`` entries).  Diagnostics are matched
    against expectations per case by ``(library, symbol)`` pairs.
    """
    dataset_file = Path(dataset_path)
    cases = json.loads(dataset_file.read_text(encoding="utf-8"))
    timings: list[float] = []
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    false_positive_examples: list[dict[str, str]] = []
    false_negative_examples: list[dict[str, str]] = []
    total_expected = 0
    total_diagnostics = 0

    tracemalloc.start()
    # try/finally guarantees tracing is stopped even if a case raises;
    # otherwise tracemalloc would keep running in the caller's process.
    try:
        start = time.perf_counter()
        for case in cases:
            case_start = time.perf_counter()
            result = check_source(case["code"], case.get("path") or f"{case['id']}.py")
            timings.append(time.perf_counter() - case_start)
            expected = {(item["library"], item["symbol"]) for item in case.get("expected_diagnostics", [])}
            actual = {(diagnostic.library, diagnostic.symbol) for diagnostic in result.diagnostics}
            total_expected += len(expected)
            total_diagnostics += len(actual)
            true_positives += len(expected & actual)
            case_false_positives = actual - expected
            case_false_negatives = expected - actual
            false_positives += len(case_false_positives)
            false_negatives += len(case_false_negatives)
            # Sorted so example ordering is deterministic across runs.
            for library, symbol in sorted(case_false_positives):
                false_positive_examples.append(
                    {"case_id": case["id"], "library": library, "symbol": symbol, "reason": "unexpected diagnostic"}
                )
            for library, symbol in sorted(case_false_negatives):
                false_negative_examples.append(
                    {"case_id": case["id"], "library": library, "symbol": symbol, "reason": "missing rule or extraction gap"}
                )
        total = time.perf_counter() - start
        _, peak = tracemalloc.get_traced_memory()
    finally:
        tracemalloc.stop()

    # Degenerate denominators (no positives at all) count as perfect scores.
    precision = true_positives / (true_positives + false_positives) if true_positives + false_positives else 1.0
    recall = true_positives / (true_positives + false_negatives) if true_positives + false_negatives else 1.0
    p50 = statistics.median(timings) if timings else 0.0
    # p95 via quantiles only with ~20+ samples; fall back to max() otherwise.
    p95 = statistics.quantiles(timings, n=20)[18] if len(timings) >= 20 else (max(timings) if timings else 0.0)
    return {
        "dataset": str(dataset_file),
        "benchmark_date": date.today().isoformat(),
        "cases": len(cases),
        "files": len(cases),
        "diagnostics": total_diagnostics,
        "expected_diagnostics": total_expected,
        "true_positives": true_positives,
        "false_positives": false_positives,
        "false_negatives": false_negatives,
        "false_positive_examples": false_positive_examples,
        "false_negative_examples": false_negative_examples,
        "precision": precision,
        "recall": recall,
        "total_seconds": total,
        "p50_ms": p50 * 1000,
        "p95_ms": p95 * 1000,
        "files_per_second": len(cases) / total if total else 0.0,
        "peak_ram_mb": peak / (1024 * 1024),
        "hardware": platform.machine(),
        "os": platform.platform(),
        "python_version": platform.python_version(),
    }
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def main(argv: list[str] | None = None) -> int:
    """Module entry point: run a benchmark and emit its JSON payload.

    With ``--dataset`` a labeled benchmark is run; otherwise positional
    paths are benchmarked unlabeled.  The payload is always printed to
    stdout and additionally written to ``--output`` when given.
    """
    arg_parser = argparse.ArgumentParser(prog="python -m llm_code_validator.benchmark")
    arg_parser.add_argument("paths", nargs="*")
    arg_parser.add_argument("--dataset", help="Run a labeled benchmark dataset JSON file.")
    arg_parser.add_argument("--output", help="Write JSON benchmark output to a file.")
    options = arg_parser.parse_args(argv)

    if options.dataset:
        payload = run_labeled_benchmark(options.dataset)
    elif options.paths:
        payload = run_benchmark(options.paths)
    else:
        # parse.error raises SystemExit(2), so payload is always bound below.
        arg_parser.error("provide one or more paths or --dataset")

    rendered = json.dumps(payload, indent=2, sort_keys=True)
    if options.output:
        Path(options.output).write_text(rendered + "\n", encoding="utf-8")
    print(rendered)
    return 0
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
if __name__ == "__main__":
    # Support direct execution via `python -m llm_code_validator.benchmark`.
    raise SystemExit(main())
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
from .core import CheckResult, check_paths, check_stdin, staged_python_files
|
|
7
|
+
from .fixes import fix_file
|
|
8
|
+
from .formatting import format_github, format_json, format_text
|
|
9
|
+
from .signatures import validate_signature_database
|
|
10
|
+
from .versioning import build_version_context
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def check_staged(
    requirements: str | None = None,
    python_version: str | None = None,
    show_low_confidence: bool = False,
) -> CheckResult:
    """Check the Python files currently staged in git.

    ``show_low_confidence`` is new (default ``False``, backward compatible)
    so this helper matches the options the ``check --staged`` CLI path
    forwards to ``check_paths``.
    """
    files = staged_python_files()
    if not files:
        # Nothing staged: skip version-context work (and any requirements
        # file read) entirely, matching the original empty-list branch.
        return check_paths([])
    return check_paths(
        files,
        requirements=requirements,
        python_version=python_version,
        show_low_confidence=show_low_confidence,
    )
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the top-level CLI parser with its three subcommands."""
    root = argparse.ArgumentParser(prog="llm-code-validator")
    commands = root.add_subparsers(dest="command", required=True)

    check_cmd = commands.add_parser("check", help="Check Python files for known API drift.")
    check_cmd.add_argument("paths", nargs="*", help="Files or directories to scan. Use '-' for stdin.")
    check_cmd.add_argument("--staged", action="store_true", help="Check staged Python files from git.")
    check_cmd.add_argument("--format", choices=["text", "json", "github"], default="text")
    check_cmd.add_argument("--requirements", help="Requirements file used for version assumptions.")
    check_cmd.add_argument("--python-version", help="Target Python version label for result context.")
    check_cmd.add_argument("--show-low-confidence", action="store_true", help="Show lower-confidence diagnostics.")

    fix_cmd = commands.add_parser("fix", help="Preview or apply deterministic safe fixes.")
    fix_cmd.add_argument("paths", nargs="+", help="Python files to fix.")
    fix_cmd.add_argument("--write", action="store_true", help="Write safe fixes to disk.")
    fix_cmd.add_argument("--requirements", help="Requirements file used for version assumptions.")
    fix_cmd.add_argument("--python-version", help="Target Python version label for result context.")

    validate_cmd = commands.add_parser("validate-signatures", help="Validate the signature database.")
    validate_cmd.add_argument("--path", help="Path to library_signatures.json.")
    validate_cmd.add_argument(
        "--require-official-evidence",
        action="store_true",
        help="Require diagnostic rules to use source_url or release_note instead of generic notes.",
    )
    return root
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _render(result: CheckResult, output_format: str) -> str:
    """Render *result* in the requested format; unknown formats fall back to text."""
    renderers = {"json": format_json, "github": format_github}
    return renderers.get(output_format, format_text)(result)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def main(argv: list[str] | None = None) -> int:
    """CLI entry point.

    Exit codes: ``0`` on success, ``1`` when diagnostics / skipped fixes /
    database errors are found, ``2`` on tool error.
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    try:
        if args.command == "check":
            return _run_check(parser, args)
        if args.command == "fix":
            return _run_fix(args)
        if args.command == "validate-signatures":
            return _run_validate(args)
    except Exception as exc:
        # Top-level boundary: any tool failure maps to exit code 2.
        # Note: parser.error() raises SystemExit, which is NOT an Exception
        # subclass and therefore still propagates, as before.
        print(f"llm-code-validator: {exc}", file=sys.stderr)
        return 2

    # Unreachable while subparsers are required; kept as a safety net.
    return 2


def _run_check(parser: argparse.ArgumentParser, args: argparse.Namespace) -> int:
    """Handle `check`; returns 1 when diagnostics were found, else 0."""
    if args.staged:
        result = check_paths(
            staged_python_files(),
            requirements=args.requirements,
            python_version=args.python_version,
            show_low_confidence=args.show_low_confidence,
        )
    elif args.paths == ["-"]:
        # Stdin mode only when '-' is the sole path argument.
        result = check_stdin(args.requirements, args.python_version, args.show_low_confidence)
    elif args.paths:
        result = check_paths(args.paths, args.requirements, args.python_version, args.show_low_confidence)
    else:
        parser.error("check requires a path, '-', or --staged")
    output = _render(result, args.format)
    if output:
        print(output)
    return 1 if result.diagnostics else 0


def _run_fix(args: argparse.Namespace) -> int:
    """Handle `fix`; returns 1 when any fix had to be skipped, else 0."""
    version_context = build_version_context(args.paths, args.requirements, args.python_version)
    exit_code = 0
    for path in args.paths:
        result = fix_file(path, write=args.write, version_context=version_context)
        for preview in result.previews:
            print(preview)
        for skipped in result.skipped:
            print(skipped)
        if result.skipped:
            exit_code = 1
    return exit_code


def _run_validate(args: argparse.Namespace) -> int:
    """Handle `validate-signatures`; returns 1 when the database is invalid."""
    errors = validate_signature_database(args.path, args.require_official_evidence)
    if errors:
        for error in errors:
            print(error, file=sys.stderr)
        return 1
    print("OK: signature database is valid")
    return 0
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
if __name__ == "__main__":
    # Allow running the CLI module directly in addition to the entry point.
    raise SystemExit(main())
|