assignment-codeval 0.0.8__tar.gz → 0.0.10__tar.gz
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/PKG-INFO +102 -9
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/README.md +97 -3
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/pyproject.toml +5 -6
- assignment_codeval-0.0.10/src/assignment_codeval/ai_benchmark.py +528 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/cli.py +2 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/evaluate.py +65 -10
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval.egg-info/PKG-INFO +102 -9
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval.egg-info/SOURCES.txt +1 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval.egg-info/requires.txt +4 -5
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/setup.cfg +0 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/__init__.py +0 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/canvas_utils.py +0 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/commons.py +0 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/convertMD2Html.py +0 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/create_assignment.py +0 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/file_utils.py +0 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/github_connect.py +0 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/submissions.py +0 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval.egg-info/dependency_links.txt +0 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval.egg-info/entry_points.txt +0 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval.egg-info/top_level.txt +0 -0
- {assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/tests/test_codeval.py +0 -0
{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/PKG-INFO
RENAMED

@@ -1,20 +1,19 @@
 Metadata-Version: 2.4
 Name: assignment-codeval
-Version: 0.0.8
+Version: 0.0.10
 Summary: CodEval for evaluating programming assignments
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 Requires-Dist: canvasapi==3.3.0
-Requires-Dist: certifi==2021.10.8
-Requires-Dist: charset-normalizer==2.0.9
 Requires-Dist: click==8.2.1
 Requires-Dist: configparser==5.2.0
-Requires-Dist: idna==3.3
 Requires-Dist: pytz==2021.3
-Requires-Dist: requests
-Requires-Dist: urllib3==1.26.7
+Requires-Dist: requests>=2.28.0
 Requires-Dist: pymongo==4.3.3
 Requires-Dist: markdown==3.4.1
+Requires-Dist: anthropic>=0.39.0
+Requires-Dist: openai>=1.0.0
+Requires-Dist: google-generativeai>=0.8.0
 Provides-Extra: test
 Requires-Dist: pytest>=7.0; extra == "test"
 
@@ -72,9 +71,9 @@ Tags used in a spec file (\<course name>.codeval)
 | CMD/TCMD | Run Command | Will be followed by a command to run. The TCMD will cause the evaluation to fail if the command exits with an error. |
 | CMP | Compare | Will be followed by two files to compare. |
 | T/HT | Test Case | Will be followed by the command to run to test the submission. |
-| I/IF | Supply Input | Specifies the input for a test case. |
-| O/OF | Check Output | Specifies the expected output for a test case. |
-| E | Check Error | Specifies the expected error output for a test case. |
+| I/IB/IF | Supply Input | Specifies the input for a test case. I adds a newline, IB does not add a newline, IF reads from a file. |
+| O/OB/OF | Check Output | Specifies the expected output for a test case. O adds a newline, OB does not add a newline, OF reads from a file. |
+| E/EB | Check Error | Specifies the expected error output for a test case. E adds a newline, EB does not. |
 | TO | Timeout | Specifies the time limit in seconds for a test case to run. Defaults to 20 seconds. |
 | X | Exit Code | Specifies the expected exit code for a test case. Defaults to zero. |
 | SS | Start Server | Command containing timeout (wait until server starts), kill timeout (wait to kill the server), and the command to start a server |
@@ -247,3 +246,97 @@ Refer to a sample spec file [here](SQL/samples/ASSIGNMENT:CREATE.codeval)
 C cc -o bigbag --std=gnu11 bigbag.c
 
 
+## 4. Test Assignments with AI Models
+
+Test programming assignments against multiple AI models (Claude, GPT, Gemini) to benchmark their performance.
+
+### Installation
+
+Install the AI provider packages you want to use:
+
+```bash
+# Install all AI providers
+pip install assignment-codeval[ai]
+
+# Or install specific providers
+pip install anthropic            # For Claude models
+pip install openai               # For GPT models
+pip install google-generativeai  # For Gemini models
+```
+
+### codeval.ini contents (optional)
+```
+[AI]
+anthropic_key=sk-ant-...
+openai_key=sk-...
+google_key=...
+```
+
+API keys can also be provided via:
+- Environment variables: `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GOOGLE_API_KEY`
+- Command line options: `--anthropic-key`, `--openai-key`, `--google-key`
+
+### Command to run
+```bash
+assignment-codeval test-with-ai <codeval_file> [OPTIONS]
+```
+
+### Options
+| Option | Description |
+|--------|-------------|
+| `-o, --output-dir` | Directory to store solutions and results (default: `ai_test_results`) |
+| `-n, --attempts` | Number of attempts per model (default: 1) |
+| `-m, --models` | Specific models to test (can be used multiple times) |
+| `-p, --providers` | Only test models from specific providers: `anthropic`, `openai`, `google` |
+| `--anthropic-key` | Anthropic API key |
+| `--openai-key` | OpenAI API key |
+| `--google-key` | Google API key |
+
+### Examples
+```bash
+# Test with all Anthropic models
+assignment-codeval test-with-ai my_assignment.codeval -p anthropic
+
+# Test with specific model, 3 attempts each
+assignment-codeval test-with-ai my_assignment.codeval -m "Claude Sonnet 4" -n 3
+
+# Test with all providers (requires all API keys)
+assignment-codeval test-with-ai my_assignment.codeval -n 2
+
+# Pass API key directly
+assignment-codeval test-with-ai my_assignment.codeval --anthropic-key sk-ant-xxx -p anthropic
+```
+
+### Supported Models
+
+| Provider | Models |
+|----------|--------|
+| Anthropic | Claude Sonnet 4, Claude Opus 4 |
+| OpenAI | GPT-4o, GPT-4o Mini, o1, o3-mini |
+| Google | Gemini 2.0 Flash, Gemini 1.5 Pro |
+
+Note: You can add additional models using `-m "model-id"`. Check each provider's documentation for available model IDs.
+
+### Output Structure
+```
+ai_test_results/
+├── prompt.txt                 # The prompt sent to AI models
+├── results.json               # Summary of all results
+├── Claude_Sonnet_4/
+│   └── attempt_1/
+│       ├── raw_response.txt   # Raw AI response
+│       ├── solution.c         # Extracted code
+│       └── <codeval files>    # Copied for evaluation
+├── GPT-4o/
+│   └── attempt_1/
+│       └── ...
+└── ...
+```
+
+### Notes
+- The command extracts the assignment description from the codeval file (between `CRT_HW START` and `CRT_HW END` tags)
+- Support files from `support_files/` directory are automatically copied for evaluation
+- Results include pass/fail status, response time, and any errors
+- Use multiple attempts (`-n`) to account for AI response variability
+
+
{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/README.md
RENAMED

@@ -52,9 +52,9 @@ Tags used in a spec file (\<course name>.codeval)
 | CMD/TCMD | Run Command | Will be followed by a command to run. The TCMD will cause the evaluation to fail if the command exits with an error. |
 | CMP | Compare | Will be followed by two files to compare. |
 | T/HT | Test Case | Will be followed by the command to run to test the submission. |
-| I/IF | Supply Input | Specifies the input for a test case. |
-| O/OF | Check Output | Specifies the expected output for a test case. |
-| E | Check Error | Specifies the expected error output for a test case. |
+| I/IB/IF | Supply Input | Specifies the input for a test case. I adds a newline, IB does not add a newline, IF reads from a file. |
+| O/OB/OF | Check Output | Specifies the expected output for a test case. O adds a newline, OB does not add a newline, OF reads from a file. |
+| E/EB | Check Error | Specifies the expected error output for a test case. E adds a newline, EB does not. |
 | TO | Timeout | Specifies the time limit in seconds for a test case to run. Defaults to 20 seconds. |
 | X | Exit Code | Specifies the expected exit code for a test case. Defaults to zero. |
 | SS | Start Server | Command containing timeout (wait until server starts), kill timeout (wait to kill the server), and the command to start a server |
@@ -227,3 +227,97 @@ Refer to a sample spec file [here](SQL/samples/ASSIGNMENT:CREATE.codeval)
 C cc -o bigbag --std=gnu11 bigbag.c
 
 
+## 4. Test Assignments with AI Models
+
+Test programming assignments against multiple AI models (Claude, GPT, Gemini) to benchmark their performance.
+
+### Installation
+
+Install the AI provider packages you want to use:
+
+```bash
+# Install all AI providers
+pip install assignment-codeval[ai]
+
+# Or install specific providers
+pip install anthropic            # For Claude models
+pip install openai               # For GPT models
+pip install google-generativeai  # For Gemini models
+```
+
+### codeval.ini contents (optional)
+```
+[AI]
+anthropic_key=sk-ant-...
+openai_key=sk-...
+google_key=...
+```
+
+API keys can also be provided via:
+- Environment variables: `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GOOGLE_API_KEY`
+- Command line options: `--anthropic-key`, `--openai-key`, `--google-key`
+
+### Command to run
+```bash
+assignment-codeval test-with-ai <codeval_file> [OPTIONS]
+```
+
+### Options
+| Option | Description |
+|--------|-------------|
+| `-o, --output-dir` | Directory to store solutions and results (default: `ai_test_results`) |
+| `-n, --attempts` | Number of attempts per model (default: 1) |
+| `-m, --models` | Specific models to test (can be used multiple times) |
+| `-p, --providers` | Only test models from specific providers: `anthropic`, `openai`, `google` |
+| `--anthropic-key` | Anthropic API key |
+| `--openai-key` | OpenAI API key |
+| `--google-key` | Google API key |
+
+### Examples
+```bash
+# Test with all Anthropic models
+assignment-codeval test-with-ai my_assignment.codeval -p anthropic
+
+# Test with specific model, 3 attempts each
+assignment-codeval test-with-ai my_assignment.codeval -m "Claude Sonnet 4" -n 3
+
+# Test with all providers (requires all API keys)
+assignment-codeval test-with-ai my_assignment.codeval -n 2
+
+# Pass API key directly
+assignment-codeval test-with-ai my_assignment.codeval --anthropic-key sk-ant-xxx -p anthropic
+```
+
+### Supported Models
+
+| Provider | Models |
+|----------|--------|
+| Anthropic | Claude Sonnet 4, Claude Opus 4 |
+| OpenAI | GPT-4o, GPT-4o Mini, o1, o3-mini |
+| Google | Gemini 2.0 Flash, Gemini 1.5 Pro |
+
+Note: You can add additional models using `-m "model-id"`. Check each provider's documentation for available model IDs.
+
+### Output Structure
+```
+ai_test_results/
+├── prompt.txt                 # The prompt sent to AI models
+├── results.json               # Summary of all results
+├── Claude_Sonnet_4/
+│   └── attempt_1/
+│       ├── raw_response.txt   # Raw AI response
+│       ├── solution.c         # Extracted code
+│       └── <codeval files>    # Copied for evaluation
+├── GPT-4o/
+│   └── attempt_1/
+│       └── ...
+└── ...
+```
+
+### Notes
+- The command extracts the assignment description from the codeval file (between `CRT_HW START` and `CRT_HW END` tags)
+- Support files from `support_files/` directory are automatically copied for evaluation
+- Results include pass/fail status, response time, and any errors
+- Use multiple attempts (`-n`) to account for AI response variability
+
+
{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/pyproject.toml
RENAMED

@@ -4,22 +4,21 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "assignment-codeval"
-version = "0.0.8"
+version = "0.0.10"
 description = "CodEval for evaluating programming assignments"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
     "canvasapi==3.3.0",
-    "certifi==2021.10.8",
-    "charset-normalizer==2.0.9",
     "click==8.2.1",
     "configparser==5.2.0",
-    "idna==3.3",
     "pytz==2021.3",
-    "requests",
-    "urllib3==1.26.7",
+    "requests>=2.28.0",
     "pymongo==4.3.3",
     "markdown==3.4.1",
+    "anthropic>=0.39.0",
+    "openai>=1.0.0",
+    "google-generativeai>=0.8.0",
 ]
 
 [project.optional-dependencies]
assignment_codeval-0.0.10/src/assignment_codeval/ai_benchmark.py

@@ -0,0 +1,528 @@
+#!/usr/bin/env python3
+"""
+AI Benchmark Module for CodEval
+
+Sends programming assignments to various AI models, collects their solutions,
+and evaluates them using the existing CodEval framework.
+
+Supported providers:
+- Anthropic (Claude models)
+- OpenAI (GPT models)
+- Google (Gemini models)
+"""
+
+import os
+import re
+import json
+import time
+import subprocess
+from pathlib import Path
+from dataclasses import dataclass
+from typing import Optional
+from configparser import ConfigParser
+
+import click
+
+from .commons import info, warn, error, debug
+
+
+@dataclass
+class AIModel:
+    """Represents an AI model configuration."""
+    provider: str  # anthropic, openai, google
+    model_id: str
+    display_name: str
+
+
+# Default models to benchmark
+DEFAULT_MODELS = [
+    # Anthropic models
+    AIModel("anthropic", "claude-sonnet-4-20250514", "Claude Sonnet 4"),
+    AIModel("anthropic", "claude-opus-4-20250514", "Claude Opus 4"),
+    # OpenAI models
+    AIModel("openai", "gpt-4o", "GPT-4o"),
+    AIModel("openai", "gpt-4o-mini", "GPT-4o Mini"),
+    AIModel("openai", "o1", "o1"),
+    AIModel("openai", "o3-mini", "o3-mini"),
+    # Google models
+    AIModel("google", "gemini-2.0-flash", "Gemini 2.0 Flash"),
+    AIModel("google", "gemini-1.5-pro-latest", "Gemini 1.5 Pro"),
+]
+
+
+def load_ai_config() -> dict:
+    """Load AI API keys from config file."""
+    config_path = Path.home() / ".config" / "codeval.ini"
+    config = ConfigParser()
+
+    if config_path.exists():
+        config.read(config_path)
+
+    return {
+        "anthropic_key": config.get("AI", "anthropic_key", fallback=os.environ.get("ANTHROPIC_API_KEY")),
+        "openai_key": config.get("AI", "openai_key", fallback=os.environ.get("OPENAI_API_KEY")),
+        "google_key": config.get("AI", "google_key", fallback=os.environ.get("GOOGLE_API_KEY")),
+    }
+
+
+def extract_assignment_from_codeval(codeval_path: str) -> tuple[str, str, str]:
+    """
+    Extract assignment description, compile command, and language from a codeval file.
+
+    Returns:
+        (description, compile_command, language)
+    """
+    with open(codeval_path, "r", encoding="utf-8") as f:
+        content = f.read()
+
+    # Extract content between CRT_HW START and CRT_HW END
+    match = re.search(r"CRT_HW START \S+\n(.*?)CRT_HW END", content, re.DOTALL)
+    if match:
+        description = match.group(1).strip()
+    else:
+        # If no CRT_HW markers, use everything before first tag
+        lines = []
+        for line in content.split("\n"):
+            if re.match(r"^[A-Z]+\s", line):
+                break
+            lines.append(line)
+        description = "\n".join(lines).strip()
+
+    # Extract compile command
+    compile_match = re.search(r"^C\s+(.+)$", content, re.MULTILINE)
+    compile_cmd = compile_match.group(1) if compile_match else ""
+
+    # Detect language from compile command
+    language = "unknown"
+    if "gcc" in compile_cmd or "cc " in compile_cmd:
+        language = "c"
+    elif "g++" in compile_cmd:
+        language = "cpp"
+    elif "python" in compile_cmd:
+        language = "python"
+    elif "javac" in compile_cmd:
+        language = "java"
+    elif "rustc" in compile_cmd or "cargo" in compile_cmd:
+        language = "rust"
+    elif "go " in compile_cmd:
+        language = "go"
+
+    return description, compile_cmd, language
+
+
+def extract_source_filename(compile_cmd: str) -> str:
+    """Extract the source filename from a compile command."""
+    # Look for common source file extensions
+    match = re.search(r"(\S+\.(c|cpp|cc|py|java|rs|go))", compile_cmd)
+    if match:
+        return match.group(1)
+    return "solution.c"
+
+
+def build_prompt(description: str, language: str, filename: str) -> str:
+    """Build the prompt to send to AI models."""
+    lang_hints = {
+        "c": "Write the solution in C. Use standard C libraries only.",
+        "cpp": "Write the solution in C++. Use standard C++ libraries only.",
+        "python": "Write the solution in Python 3.",
+        "java": "Write the solution in Java.",
+        "rust": "Write the solution in Rust.",
+        "go": "Write the solution in Go.",
+    }
+
+    hint = lang_hints.get(language, "")
+
+    return f"""You are solving a programming assignment. {hint}
+
+IMPORTANT: Output ONLY the code. No explanations, no markdown code blocks, no comments about the solution. Just the raw source code that can be directly saved to a file and compiled/run.
+
+The solution should be saved as: {filename}
+
+Here is the assignment:
+
+{description}
+
+Remember: Output ONLY the code, nothing else."""
+
+
+def extract_code_from_response(response: str, language: str) -> str:
+    """Extract code from AI response, handling markdown blocks if present."""
+    # Try to extract from markdown code block
+    patterns = [
+        r"```(?:c|cpp|python|java|rust|go)?\n(.*?)```",
+        r"```\n(.*?)```",
+    ]
+
+    for pattern in patterns:
+        match = re.search(pattern, response, re.DOTALL)
+        if match:
+            return match.group(1).strip()
+
+    # If no code blocks, assume the whole response is code
+    # But strip any leading/trailing explanation
+    lines = response.strip().split("\n")
+
+    # Remove lines that look like explanations
+    code_lines = []
+    in_code = False
+    for line in lines:
+        # Detect start of code
+        if not in_code:
+            if line.startswith("#include") or line.startswith("import ") or \
+               line.startswith("def ") or line.startswith("int ") or \
+               line.startswith("void ") or line.startswith("public ") or \
+               line.startswith("package ") or line.startswith("use ") or \
+               line.startswith("fn ") or line.startswith("func "):
+                in_code = True
+
+        if in_code:
+            code_lines.append(line)
+
+    if code_lines:
+        return "\n".join(code_lines)
+
+    return response.strip()
+
+
+def call_anthropic(model_id: str, prompt: str, api_key: str) -> Optional[str]:
+    """Call Anthropic API."""
+    try:
+        import anthropic
+    except ImportError:
+        error("anthropic package not installed. Run: pip install anthropic")
+        return None
+
+    try:
+        client = anthropic.Anthropic(api_key=api_key)
+
+        # Adjust max_tokens based on model capabilities
+        max_tokens = 4096  # Safe default for older models
+        if "claude-3-5" in model_id or "claude-sonnet-4" in model_id or "claude-opus-4" in model_id:
+            max_tokens = 8192
+
+        message = client.messages.create(
+            model=model_id,
+            max_tokens=max_tokens,
+            messages=[{"role": "user", "content": prompt}]
+        )
+
+        return message.content[0].text
+    except Exception as e:
+        error(f"Anthropic API error: {e}")
+        return None
+
+
+def call_openai(model_id: str, prompt: str, api_key: str) -> Optional[str]:
+    """Call OpenAI API."""
+    try:
+        import openai
+    except ImportError:
+        error("openai package not installed. Run: pip install openai")
+        return None
+
+    try:
+        client = openai.OpenAI(api_key=api_key)
+
+        response = client.chat.completions.create(
+            model=model_id,
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=8192,
+        )
+
+        return response.choices[0].message.content
+    except Exception as e:
+        error(f"OpenAI API error: {e}")
+        return None
+
+
+def call_google(model_id: str, prompt: str, api_key: str) -> Optional[str]:
+    """Call Google Gemini API."""
+    try:
+        import google.generativeai as genai
+    except ImportError:
+        error("google-generativeai package not installed. Run: pip install google-generativeai")
+        return None
+
+    try:
+        genai.configure(api_key=api_key)
+
+        model = genai.GenerativeModel(model_id)
+        response = model.generate_content(prompt)
+
+        return response.text
+    except Exception as e:
+        error(f"Google API error: {e}")
+        return None
+
+
+def call_model(model: AIModel, prompt: str, config: dict) -> Optional[str]:
+    """Call the appropriate API based on provider."""
+    if model.provider == "anthropic":
+        if not config["anthropic_key"]:
+            warn(f"No Anthropic API key configured, skipping {model.display_name}")
+            return None
+        return call_anthropic(model.model_id, prompt, config["anthropic_key"])
+
+    elif model.provider == "openai":
+        if not config["openai_key"]:
+            warn(f"No OpenAI API key configured, skipping {model.display_name}")
+            return None
+        return call_openai(model.model_id, prompt, config["openai_key"])
+
+    elif model.provider == "google":
+        if not config["google_key"]:
+            warn(f"No Google API key configured, skipping {model.display_name}")
+            return None
+        return call_google(model.model_id, prompt, config["google_key"])
+
+    else:
+        error(f"Unknown provider: {model.provider}")
+        return None
+
+
+def run_benchmark(
+    codeval_path: str,
+    output_dir: str,
+    models: Optional[list[AIModel]] = None,
+    attempts: int = 1,
+    config: Optional[dict] = None,
+) -> dict:
+    """
+    Run benchmark on a codeval assignment with multiple AI models.
+
+    Args:
+        codeval_path: Path to the .codeval file
+        output_dir: Directory to store solutions and results
+        models: List of models to test (defaults to DEFAULT_MODELS)
+        attempts: Number of attempts per model
+        config: Optional config dict with API keys
+
+    Returns:
+        Dictionary with results for each model
+    """
+    if models is None:
+        models = DEFAULT_MODELS
+
+    if config is None:
+        config = load_ai_config()
+
+    # Convert to absolute path to avoid issues with relative paths
+    codeval_path = str(Path(codeval_path).resolve())
+
+    # Extract assignment info
+    description, compile_cmd, language = extract_assignment_from_codeval(codeval_path)
+    source_file = extract_source_filename(compile_cmd)
+
+    info(f"Assignment: {Path(codeval_path).stem}")
+    info(f"Language: {language}")
+    info(f"Source file: {source_file}")
+
+    # Build prompt
+    prompt = build_prompt(description, language, source_file)
+
+    # Create output directory
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    # Save prompt for reference
+    (output_path / "prompt.txt").write_text(prompt)
+
+    results = {}
+
+    for model in models:
+        model_dir = output_path / model.display_name.replace(" ", "_")
+        model_dir.mkdir(exist_ok=True)
+
+        model_results = {
+            "attempts": [],
+            "best_score": 0,
+            "passed": False,
+        }
+
+        for attempt in range(attempts):
+            attempt_dir = model_dir / f"attempt_{attempt + 1}"
+            attempt_dir.mkdir(exist_ok=True)
+
+            info(f"\n{'='*60}")
+            info(f"Model: {model.display_name} (Attempt {attempt + 1}/{attempts})")
+            info(f"{'='*60}")
+
+            # Call the model
+            start_time = time.time()
+            response = call_model(model, prompt, config)
+            elapsed = time.time() - start_time
+
+            if response is None:
+                model_results["attempts"].append({
+                    "success": False,
+                    "error": "API call failed",
+                    "time": elapsed,
+                })
+                continue
+
+            # Save raw response
+            (attempt_dir / "raw_response.txt").write_text(response)
+
+            # Extract code
+            code = extract_code_from_response(response, language)
+            source_path = attempt_dir / source_file
+            source_path.write_text(code)
+
+            info(f"Response received in {elapsed:.2f}s")
+            info(f"Code saved to {source_path}")
+
+            # Copy codeval file to attempt directory, stripping Z tags (not supported locally)
+            import shutil
+            try:
+                with open(codeval_path, "r", encoding="utf-8") as f:
+                    codeval_content = f.read()
+                # Remove Z tag lines (zip file downloads only work on Canvas)
+                codeval_lines = [line for line in codeval_content.split("\n") if not line.startswith("Z ")]
+                (attempt_dir / Path(codeval_path).name).write_text("\n".join(codeval_lines))
+
+                # Copy support files if they exist
+                codeval_dir = Path(codeval_path).parent
+                support_dir = codeval_dir / "support_files"
+                if support_dir.exists():
+                    for support_file in support_dir.iterdir():
+                        shutil.copy(support_file, attempt_dir / support_file.name)
+            except Exception as e:
+                error(f"Failed to copy codeval/support files: {e}")
+                model_results["attempts"].append({
+                    "success": False,
+                    "error": f"File copy failed: {e}",
+                    "time": elapsed,
+                })
+                continue
+
+            # Run evaluation using subprocess
+            info("Running evaluation...")
+            try:
+                result = subprocess.run(
+                    ["assignment-codeval", "run-evaluation", Path(codeval_path).name],
+                    cwd=attempt_dir,
+                    capture_output=True,
+                    text=True,
+                    timeout=120,
+                )
+
+                # Save evaluation output
+                (attempt_dir / "evaluation_output.txt").write_text(
+                    f"=== STDOUT ===\n{result.stdout}\n\n=== STDERR ===\n{result.stderr}"
+                )
+
+                eval_passed = result.returncode == 0
+
+                model_results["attempts"].append({
+                    "success": True,
+                    "passed": eval_passed,
+                    "time": elapsed,
+                })
+
+                if eval_passed:
+                    model_results["passed"] = True
+                    info(f"✓ {model.display_name} PASSED")
+                else:
+                    info(f"✗ {model.display_name} FAILED")
+                    # Show brief failure info
+                    if "FAILED" in result.stdout:
+                        for line in result.stdout.split("\n"):
+                            if "FAILED" in line or "Command ran" in line:
+                                info(f"  {line.strip()}")
+
+            except subprocess.TimeoutExpired:
+                model_results["attempts"].append({
+                    "success": False,
+                    "error": "Evaluation timed out",
+                    "time": elapsed,
+                })
+                error("Evaluation timed out")
+            except Exception as e:
+                model_results["attempts"].append({
+                    "success": False,
+                    "error": str(e),
+                    "time": elapsed,
+                })
+                error(f"Evaluation error: {e}")
+
+        results[model.display_name] = model_results
+
+    # Save results summary
+    (output_path / "results.json").write_text(json.dumps(results, indent=2))
+
+    # Print summary
+    print("\n" + "="*60)
+    print("BENCHMARK RESULTS SUMMARY")
+    print("="*60)
+
+    for model_name, result in results.items():
+        status = "✓ PASS" if result["passed"] else "✗ FAIL"
+        attempts_info = f"{sum(1 for a in result['attempts'] if a.get('passed', False))}/{len(result['attempts'])}"
+        print(f"{model_name:30} {status:10} ({attempts_info} attempts passed)")
+
+    return results
+
+
+@click.command("test-with-ai")
+@click.argument("codeval_file", type=click.Path(exists=True))
+@click.option("--output-dir", "-o", default="ai_test_results",
+              help="Directory to store solutions and results")
+@click.option("--attempts", "-n", default=1, type=int,
+              help="Number of attempts per model")
+@click.option("--models", "-m", multiple=True,
+              help="Specific models to test (can be used multiple times)")
+@click.option("--providers", "-p", multiple=True,
+              type=click.Choice(["anthropic", "openai", "google"]),
+              help="Only test models from these providers")
+@click.option("--anthropic-key", envvar="ANTHROPIC_API_KEY",
+              help="Anthropic API key (or set ANTHROPIC_API_KEY env var)")
+@click.option("--openai-key", envvar="OPENAI_API_KEY",
+              help="OpenAI API key (or set OPENAI_API_KEY env var)")
+@click.option("--google-key", envvar="GOOGLE_API_KEY",
+              help="Google API key (or set GOOGLE_API_KEY env var)")
+def benchmark_ai_command(codeval_file, output_dir, attempts, models, providers,
+                         anthropic_key, openai_key, google_key):
+    """
+    Test AI models on a programming assignment.
+
+    Sends the assignment to multiple AI models, collects their solutions,
+    and evaluates them using the codeval framework.
+
+    API keys can be provided via:
+    - Command line options (--anthropic-key, --openai-key, --google-key)
+    - Environment variables (ANTHROPIC_API_KEY, OPENAI_API_KEY, GOOGLE_API_KEY)
+    - Config file (~/.config/codeval.ini in [AI] section)
+
+    Example:
+        assignment-codeval test-with-ai my_assignment.codeval -n 3 -p anthropic
+    """
+    # Build config from provided keys
+    config = load_ai_config()
+    if anthropic_key:
+        config["anthropic_key"] = anthropic_key
+    if openai_key:
+        config["openai_key"] = openai_key
+    if google_key:
+        config["google_key"] = google_key
+
+    # Filter models if specific ones requested
+    test_models = DEFAULT_MODELS
+
+    if models:
+        test_models = [m for m in DEFAULT_MODELS if m.model_id in models or m.display_name in models]
+
+    if providers:
+        test_models = [m for m in test_models if m.provider in providers]
+
+    if not test_models:
+        error("No models selected for testing")
+        return
+
+    info(f"Testing {len(test_models)} models with {attempts} attempt(s) each")
+
+    run_benchmark(codeval_file, output_dir, test_models, attempts, config)
+
+
+def get_benchmark_command():
+    """Return the Click command for CLI registration."""
+    return benchmark_ai_command
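Since `run_benchmark` is a plain function, the benchmark can also be driven without the CLI. A minimal sketch, assuming the package and the `anthropic` SDK are installed, an Anthropic key is reachable by `load_ai_config()`, and `hw1.codeval` is a hypothetical spec file:

```python
# Sketch only: drives run_benchmark directly instead of `test-with-ai`.
# "hw1.codeval" is a hypothetical spec file, not part of the package.
from assignment_codeval.ai_benchmark import (
    DEFAULT_MODELS,
    load_ai_config,
    run_benchmark,
)

# Keep only the Anthropic entries from the default model list.
claude_models = [m for m in DEFAULT_MODELS if m.provider == "anthropic"]

# load_ai_config() reads ~/.config/codeval.ini and falls back to the
# ANTHROPIC_API_KEY / OPENAI_API_KEY / GOOGLE_API_KEY environment variables.
results = run_benchmark(
    "hw1.codeval",
    output_dir="ai_test_results",
    models=claude_models,
    attempts=2,
    config=load_ai_config(),
)

# results mirrors results.json: display name -> attempt records and pass/fail.
for name, summary in results.items():
    print(name, "PASS" if summary["passed"] else "FAIL")
```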
{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/cli.py
RENAMED

@@ -1,5 +1,6 @@
 import click
 
+from assignment_codeval.ai_benchmark import get_benchmark_command
 from assignment_codeval.create_assignment import create_assignment
 from assignment_codeval.evaluate import run_evaluation
 from assignment_codeval.github_connect import github_setup_repo
@@ -16,6 +17,7 @@ cli.add_command(upload_submission_comments)
 cli.add_command(github_setup_repo)
 cli.add_command(evaluate_submissions)
 cli.add_command(create_assignment)
+cli.add_command(get_benchmark_command())
 
 if __name__ == "__main__":
     cli()
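The registration follows the standard Click pattern: `get_benchmark_command()` returns the decorated command object, and `cli.add_command` mounts it under the name given in `@click.command("test-with-ai")`. A stripped-down sketch of that pattern (a hypothetical standalone group, not the project's actual CLI):

```python
# Hypothetical, self-contained illustration of the registration pattern above.
import click

@click.group()
def cli():
    """Top-level command group."""

@click.command("greet")
def greet_command():
    """Placeholder command body."""
    click.echo("hello")

cli.add_command(greet_command)  # exposed as: <program> greet

if __name__ == "__main__":
    cli()
```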
{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/evaluate.py
RENAMED

@@ -366,7 +366,7 @@ def test_case_hidden(test_case_command):
 
 
 def supply_input(inputs):
-    """Specifies the input for a test case.
+    """Specifies the input for a test case (adds newline at end).
 
     Arguments:
         *inputs: inputs to be used for test case
@@ -374,8 +374,21 @@ def supply_input(inputs):
     Returns:
         None
     """
-    with open("fileinput", "
-        outfile.write(inputs)
+    with open("fileinput", "ab") as outfile:
+        outfile.write((inputs + "\n").encode("utf-8"))
+
+
+def supply_input_bare(inputs):
+    """Specifies the input for a test case without adding a newline.
+
+    Arguments:
+        *inputs: inputs to be used for test case
+
+    Returns:
+        None
+    """
+    with open("fileinput", "ab") as outfile:
+        outfile.write(inputs.encode("utf-8"))
 
 
 def supply_input_file(input_file):
@@ -387,15 +400,15 @@ def supply_input_file(input_file):
     Returns:
         None
     """
-    with open(input_file, "
-
+    with open(input_file, "rb") as infile:
+        input_data = infile.read()
 
-    with open("fileinput", "
-        outfile.
+    with open("fileinput", "ab") as outfile:
+        outfile.write(input_data)
 
 
 def check_output(outputs):
-    """Specifies the expected output for a test case.
+    """Specifies the expected output for a test case (adds newline at end).
 
     Arguments:
         *outputs: outputs to be used for test case
@@ -408,6 +421,20 @@ def check_output(outputs):
         outfile.write(outputs + "\n")
 
 
+def check_output_bare(outputs):
+    """Specifies the expected output for a test case without adding a newline.
+
+    Arguments:
+        *outputs: outputs to be used for test case
+
+    Returns:
+        None
+    """
+
+    with open("expectedoutput", "a") as outfile:
+        outfile.write(outputs)
+
+
 def check_output_file(output_file):
     """Specifies the expected output for a test case read from a file.
 
@@ -425,7 +452,7 @@ def check_output_file(output_file):
 
 
 def check_error(error_output):
-    """Specifies the expected error output for a test case.
+    """Specifies the expected error output for a test case (adds newline at end).
 
     Arguments:
         error_output: expected error output for a test case
@@ -437,6 +464,19 @@ def check_error(error_output):
         outfile.write(error_output + "\n")
 
 
+def check_error_bare(error_output):
+    """Specifies the expected error output for a test case without adding a newline.
+
+    Arguments:
+        error_output: expected error output for a test case
+
+    Returns:
+        None
+    """
+    with open("expectederror", "a") as outfile:
+        outfile.write(error_output)
+
+
 def hint(hints):
     """Hint
 
@@ -536,10 +576,13 @@ tag_func_map = {
     "T": test_case,
     "HT": test_case_hidden,
     "I": supply_input,
+    "IB": supply_input_bare,
     "IF": supply_input_file,
     "O": check_output,
+    "OB": check_output_bare,
     "OF": check_output_file,
     "E": check_error,
+    "EB": check_error_bare,
     "HINT": hint,
     "TO": timeout,
     "X": exit_code,
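`tag_func_map` is the dispatch table that connects spec-file tags to these handlers, so adding `IB`/`OB`/`EB` here is what makes the new tags usable. A simplified view of the lookup (the real `parse_tags` validates tags with regexes and reports errors first):

```python
# Simplified dispatch: how a spec line reaches its handler via tag_func_map.
from assignment_codeval.evaluate import tag_func_map

line = "IB 5"            # hypothetical spec line using the new bare-input tag
tag, args = line.split(" ", 1)
tag_func_map[tag](args)  # -> supply_input_bare("5"); no newline is appended
```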
@@ -584,6 +627,8 @@ def parse_tags(tags: list[str]):
     tag_only_pattern = r"([A-Z_]+)\s*$"
 
     valid_tags = set(tag_func_map.keys())
+    # Tags to silently ignore (used by other tools but not by run-evaluation)
+    ignored_tags = {"CTO", "Z", "RUN"}
 
     # Track if we're inside a CRT_HW block (content to ignore)
     in_crt_hw_block = False
@@ -609,6 +654,9 @@ def parse_tags(tags: list[str]):
         # Check for tag without arguments
         if tag_only_match and not tag_match:
             tag = tag_only_match.group(1)
+            # Skip ignored tags
+            if tag in ignored_tags:
+                continue
             if tag in valid_tags:
                 print(f"Error on line {line_num}: Tag '{tag}' requires arguments")
                 print(f"  {line_num}: {tag_line.rstrip()}")
@@ -626,6 +674,9 @@ def parse_tags(tags: list[str]):
             potential_tag = re.match(r"([A-Z]+)", tag_line)
             if potential_tag:
                 tag = potential_tag.group(1)
+                # Skip ignored tags
+                if tag in ignored_tags:
+                    continue
                 if tag not in valid_tags and len(tag) <= 4:
                     print(f"Error on line {line_num}: Unknown tag '{tag}'")
                     print(f"  {line_num}: {tag_line.rstrip()}")
@@ -636,6 +687,10 @@ def parse_tags(tags: list[str]):
         tag = tag_match.group(1)
         args = tag_match.group(2)
 
+        # Skip ignored tags
+        if tag in ignored_tags:
+            continue
+
         # Check for unknown tag
         if tag not in valid_tags:
             print(f"Error on line {line_num}: Unknown tag '{tag}'")
@@ -715,7 +770,7 @@ def check_test():
     print(f"Test case {test_case_count} of {test_case_total}")
     passed = True
 
-    with open("fileinput", "
+    with open("fileinput", "rb") as fileinput, open(
         "youroutput", "w"
     ) as youroutput, open("yourerror", "w") as yourerror:
         test_exec = subprocess.Popen(
{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval.egg-info/PKG-INFO
RENAMED

@@ -1,20 +1,19 @@
 Metadata-Version: 2.4
 Name: assignment-codeval
-Version: 0.0.8
+Version: 0.0.10
 Summary: CodEval for evaluating programming assignments
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown
 Requires-Dist: canvasapi==3.3.0
-Requires-Dist: certifi==2021.10.8
-Requires-Dist: charset-normalizer==2.0.9
 Requires-Dist: click==8.2.1
 Requires-Dist: configparser==5.2.0
-Requires-Dist: idna==3.3
 Requires-Dist: pytz==2021.3
-Requires-Dist: requests
-Requires-Dist: urllib3==1.26.7
+Requires-Dist: requests>=2.28.0
 Requires-Dist: pymongo==4.3.3
 Requires-Dist: markdown==3.4.1
+Requires-Dist: anthropic>=0.39.0
+Requires-Dist: openai>=1.0.0
+Requires-Dist: google-generativeai>=0.8.0
 Provides-Extra: test
 Requires-Dist: pytest>=7.0; extra == "test"
 
@@ -72,9 +71,9 @@ Tags used in a spec file (\<course name>.codeval)
 | CMD/TCMD | Run Command | Will be followed by a command to run. The TCMD will cause the evaluation to fail if the command exits with an error. |
 | CMP | Compare | Will be followed by two files to compare. |
 | T/HT | Test Case | Will be followed by the command to run to test the submission. |
-| I/IF | Supply Input | Specifies the input for a test case. |
-| O/OF | Check Output | Specifies the expected output for a test case. |
-| E | Check Error | Specifies the expected error output for a test case. |
+| I/IB/IF | Supply Input | Specifies the input for a test case. I adds a newline, IB does not add a newline, IF reads from a file. |
+| O/OB/OF | Check Output | Specifies the expected output for a test case. O adds a newline, OB does not add a newline, OF reads from a file. |
+| E/EB | Check Error | Specifies the expected error output for a test case. E adds a newline, EB does not. |
 | TO | Timeout | Specifies the time limit in seconds for a test case to run. Defaults to 20 seconds. |
 | X | Exit Code | Specifies the expected exit code for a test case. Defaults to zero. |
 | SS | Start Server | Command containing timeout (wait until server starts), kill timeout (wait to kill the server), and the command to start a server |
@@ -247,3 +246,97 @@ Refer to a sample spec file [here](SQL/samples/ASSIGNMENT:CREATE.codeval)
 C cc -o bigbag --std=gnu11 bigbag.c
 
 
+## 4. Test Assignments with AI Models
+
+Test programming assignments against multiple AI models (Claude, GPT, Gemini) to benchmark their performance.
+
+### Installation
+
+Install the AI provider packages you want to use:
+
+```bash
+# Install all AI providers
+pip install assignment-codeval[ai]
+
+# Or install specific providers
+pip install anthropic            # For Claude models
+pip install openai               # For GPT models
+pip install google-generativeai  # For Gemini models
+```
+
+### codeval.ini contents (optional)
+```
+[AI]
+anthropic_key=sk-ant-...
+openai_key=sk-...
+google_key=...
+```
+
+API keys can also be provided via:
+- Environment variables: `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GOOGLE_API_KEY`
+- Command line options: `--anthropic-key`, `--openai-key`, `--google-key`
+
+### Command to run
+```bash
+assignment-codeval test-with-ai <codeval_file> [OPTIONS]
+```
+
+### Options
+| Option | Description |
+|--------|-------------|
+| `-o, --output-dir` | Directory to store solutions and results (default: `ai_test_results`) |
+| `-n, --attempts` | Number of attempts per model (default: 1) |
+| `-m, --models` | Specific models to test (can be used multiple times) |
+| `-p, --providers` | Only test models from specific providers: `anthropic`, `openai`, `google` |
+| `--anthropic-key` | Anthropic API key |
+| `--openai-key` | OpenAI API key |
+| `--google-key` | Google API key |
+
+### Examples
+```bash
+# Test with all Anthropic models
+assignment-codeval test-with-ai my_assignment.codeval -p anthropic
+
+# Test with specific model, 3 attempts each
+assignment-codeval test-with-ai my_assignment.codeval -m "Claude Sonnet 4" -n 3
+
+# Test with all providers (requires all API keys)
+assignment-codeval test-with-ai my_assignment.codeval -n 2
+
+# Pass API key directly
+assignment-codeval test-with-ai my_assignment.codeval --anthropic-key sk-ant-xxx -p anthropic
+```
+
+### Supported Models
+
+| Provider | Models |
+|----------|--------|
+| Anthropic | Claude Sonnet 4, Claude Opus 4 |
+| OpenAI | GPT-4o, GPT-4o Mini, o1, o3-mini |
+| Google | Gemini 2.0 Flash, Gemini 1.5 Pro |
+
+Note: You can add additional models using `-m "model-id"`. Check each provider's documentation for available model IDs.
+
+### Output Structure
+```
+ai_test_results/
+├── prompt.txt                 # The prompt sent to AI models
+├── results.json               # Summary of all results
+├── Claude_Sonnet_4/
+│   └── attempt_1/
+│       ├── raw_response.txt   # Raw AI response
+│       ├── solution.c         # Extracted code
+│       └── <codeval files>    # Copied for evaluation
+├── GPT-4o/
+│   └── attempt_1/
+│       └── ...
+└── ...
+```
+
+### Notes
+- The command extracts the assignment description from the codeval file (between `CRT_HW START` and `CRT_HW END` tags)
+- Support files from `support_files/` directory are automatically copied for evaluation
+- Results include pass/fail status, response time, and any errors
+- Use multiple attempts (`-n`) to account for AI response variability
+
+
{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval.egg-info/requires.txt
RENAMED

@@ -1,14 +1,13 @@
 canvasapi==3.3.0
-certifi==2021.10.8
-charset-normalizer==2.0.9
 click==8.2.1
 configparser==5.2.0
-idna==3.3
 pytz==2021.3
-requests
-urllib3==1.26.7
+requests>=2.28.0
 pymongo==4.3.3
 markdown==3.4.1
+anthropic>=0.39.0
+openai>=1.0.0
+google-generativeai>=0.8.0
 
 [test]
 pytest>=7.0
{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/setup.cfg
RENAMED
File without changes

{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/__init__.py
RENAMED
File without changes

{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/canvas_utils.py
RENAMED
File without changes

{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/commons.py
RENAMED
File without changes

{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/convertMD2Html.py
RENAMED
File without changes

{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/create_assignment.py
RENAMED
File without changes

{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/file_utils.py
RENAMED
File without changes

{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/github_connect.py
RENAMED
File without changes

{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval/submissions.py
RENAMED
File without changes

{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval.egg-info/dependency_links.txt
RENAMED
File without changes

{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval.egg-info/entry_points.txt
RENAMED
File without changes

{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/src/assignment_codeval.egg-info/top_level.txt
RENAMED
File without changes

{assignment_codeval-0.0.8 → assignment_codeval-0.0.10}/tests/test_codeval.py
RENAMED
File without changes