bioguider-0.2.52-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bioguider/__init__.py +0 -0
- bioguider/agents/__init__.py +0 -0
- bioguider/agents/agent_task.py +92 -0
- bioguider/agents/agent_tools.py +176 -0
- bioguider/agents/agent_utils.py +504 -0
- bioguider/agents/collection_execute_step.py +182 -0
- bioguider/agents/collection_observe_step.py +125 -0
- bioguider/agents/collection_plan_step.py +156 -0
- bioguider/agents/collection_task.py +184 -0
- bioguider/agents/collection_task_utils.py +142 -0
- bioguider/agents/common_agent.py +137 -0
- bioguider/agents/common_agent_2step.py +215 -0
- bioguider/agents/common_conversation.py +61 -0
- bioguider/agents/common_step.py +85 -0
- bioguider/agents/consistency_collection_step.py +102 -0
- bioguider/agents/consistency_evaluation_task.py +57 -0
- bioguider/agents/consistency_evaluation_task_utils.py +14 -0
- bioguider/agents/consistency_observe_step.py +110 -0
- bioguider/agents/consistency_query_step.py +77 -0
- bioguider/agents/dockergeneration_execute_step.py +186 -0
- bioguider/agents/dockergeneration_observe_step.py +154 -0
- bioguider/agents/dockergeneration_plan_step.py +158 -0
- bioguider/agents/dockergeneration_task.py +158 -0
- bioguider/agents/dockergeneration_task_utils.py +220 -0
- bioguider/agents/evaluation_installation_task.py +270 -0
- bioguider/agents/evaluation_readme_task.py +767 -0
- bioguider/agents/evaluation_submission_requirements_task.py +172 -0
- bioguider/agents/evaluation_task.py +206 -0
- bioguider/agents/evaluation_tutorial_task.py +169 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
- bioguider/agents/evaluation_userguide_prompts.py +179 -0
- bioguider/agents/evaluation_userguide_task.py +154 -0
- bioguider/agents/evaluation_utils.py +127 -0
- bioguider/agents/identification_execute_step.py +181 -0
- bioguider/agents/identification_observe_step.py +104 -0
- bioguider/agents/identification_plan_step.py +140 -0
- bioguider/agents/identification_task.py +270 -0
- bioguider/agents/identification_task_utils.py +22 -0
- bioguider/agents/peo_common_step.py +64 -0
- bioguider/agents/prompt_utils.py +253 -0
- bioguider/agents/python_ast_repl_tool.py +69 -0
- bioguider/agents/rag_collection_task.py +130 -0
- bioguider/conversation.py +67 -0
- bioguider/database/code_structure_db.py +500 -0
- bioguider/database/summarized_file_db.py +146 -0
- bioguider/generation/__init__.py +39 -0
- bioguider/generation/benchmark_metrics.py +610 -0
- bioguider/generation/change_planner.py +189 -0
- bioguider/generation/document_renderer.py +157 -0
- bioguider/generation/llm_cleaner.py +67 -0
- bioguider/generation/llm_content_generator.py +1128 -0
- bioguider/generation/llm_injector.py +809 -0
- bioguider/generation/models.py +85 -0
- bioguider/generation/output_manager.py +74 -0
- bioguider/generation/repo_reader.py +37 -0
- bioguider/generation/report_loader.py +166 -0
- bioguider/generation/style_analyzer.py +36 -0
- bioguider/generation/suggestion_extractor.py +436 -0
- bioguider/generation/test_metrics.py +189 -0
- bioguider/managers/benchmark_manager.py +785 -0
- bioguider/managers/evaluation_manager.py +215 -0
- bioguider/managers/generation_manager.py +686 -0
- bioguider/managers/generation_test_manager.py +107 -0
- bioguider/managers/generation_test_manager_v2.py +525 -0
- bioguider/rag/__init__.py +0 -0
- bioguider/rag/config.py +117 -0
- bioguider/rag/data_pipeline.py +651 -0
- bioguider/rag/embedder.py +24 -0
- bioguider/rag/rag.py +138 -0
- bioguider/settings.py +103 -0
- bioguider/utils/code_structure_builder.py +59 -0
- bioguider/utils/constants.py +135 -0
- bioguider/utils/default.gitignore +140 -0
- bioguider/utils/file_utils.py +215 -0
- bioguider/utils/gitignore_checker.py +175 -0
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/pyphen_utils.py +73 -0
- bioguider/utils/python_file_handler.py +65 -0
- bioguider/utils/r_file_handler.py +551 -0
- bioguider/utils/utils.py +163 -0
- bioguider-0.2.52.dist-info/LICENSE +21 -0
- bioguider-0.2.52.dist-info/METADATA +51 -0
- bioguider-0.2.52.dist-info/RECORD +84 -0
- bioguider-0.2.52.dist-info/WHEEL +4 -0
bioguider/managers/benchmark_manager.py
@@ -0,0 +1,785 @@
+"""
+Benchmark Manager for comprehensive error injection testing.
+
+Provides:
+- Stress testing across multiple error count levels (10, 20, 40, 60, 100)
+- Multi-process parallel execution for files and stress levels
+- Multi-model comparison support (BioGuider + external models)
+- CSV/JSON export of results
+"""
+from __future__ import annotations
+
+import os
+import json
+import csv
+import shutil
+from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
+from dataclasses import dataclass, field
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Any, Callable, Tuple
+
+from langchain_openai.chat_models.base import BaseChatOpenAI
+
+from bioguider.generation.llm_injector import LLMErrorInjector
+from bioguider.generation.benchmark_metrics import (
+    BenchmarkResult,
+    BenchmarkEvaluator,
+    evaluate_benchmark,
+)
+from bioguider.managers.generation_manager import DocumentationGenerationManager
+from bioguider.agents.agent_utils import read_file, write_file
+
+
+# Default stress test levels
+DEFAULT_STRESS_LEVELS = [10, 20, 40, 60, 100]
+
+# Supported external models for comparison
+SUPPORTED_MODELS = ["bioguider", "gpt-5.1", "claude-sonnet", "gemini"]
+
+
+@dataclass
+class StressTestResult:
+    """Result of a single stress test level."""
+    error_count: int
+    benchmark_result: BenchmarkResult
+    output_dir: str
+    duration_seconds: float = 0.0
+
+
+@dataclass
+class ModelComparisonResult:
+    """Comparison results across multiple models."""
+    models: List[str]
+    error_count: int
+    results: Dict[str, BenchmarkResult] = field(default_factory=dict)
+
+
+class BenchmarkManager:
+    """
+    Manages comprehensive benchmark testing for error injection.
+
+    Features:
+    - Stress testing with configurable error levels
+    - Multi-process parallel execution
+    - Multi-model comparison support
+    - Comprehensive result export (JSON, CSV, Markdown)
+    """
+
+    def __init__(
+        self,
+        llm: BaseChatOpenAI,
+        step_callback: Optional[Callable] = None,
+        max_workers: int = 4
+    ):
+        self.llm = llm
+        self.step_callback = step_callback
+        self.max_workers = max_workers
+
+    def print_step(self, name: str, output: str = ""):
+        """Output step progress."""
+        if self.step_callback:
+            self.step_callback(step_name=name, step_output=output)
+        else:
+            print(f"[{name}] {output}")
+
+    # =========================================================================
+    # FILE SELECTION
+    # =========================================================================
+
+    def _select_target_files(self, baseline_repo_path: str) -> Dict[str, List[str]]:
+        """
+        Select target files for error injection across multiple categories.
+        """
+        targets = {
+            "readme": [],
+            "tutorial": [],
+            "userguide": [],
+            "installation": []
+        }
+
+        # README files
+        readme_path = os.path.join(baseline_repo_path, "README.md")
+        if os.path.exists(readme_path):
+            targets["readme"].append(readme_path)
+
+        # Tutorial files (RMarkdown vignettes)
+        vignettes_dir = os.path.join(baseline_repo_path, "vignettes")
+        if os.path.isdir(vignettes_dir):
+            for f in sorted(os.listdir(vignettes_dir)):
+                if f.endswith('.Rmd') and not f.startswith('.'):
+                    targets["tutorial"].append(os.path.join(vignettes_dir, f))
+
+        # Installation files
+        for pattern in ["install", "INSTALL", "installation"]:
+            for ext in [".md", ".Rmd", ".rst"]:
+                fpath = os.path.join(baseline_repo_path, pattern + ext)
+                if os.path.exists(fpath):
+                    targets["installation"].append(fpath)
+
+        # Userguide files
+        docs_dir = os.path.join(baseline_repo_path, "docs")
+        if os.path.isdir(docs_dir):
+            for f in sorted(os.listdir(docs_dir)):
+                if f.endswith('.md') and not f.startswith('.'):
+                    targets["userguide"].append(os.path.join(docs_dir, f))
+
+        return targets
+
+    def _extract_project_terms(self, repo_path: str) -> List[str]:
+        """Extract function names and key terms from the codebase."""
+        import re
+        from collections import Counter
+
+        terms = Counter()
+
+        for root, _, files in os.walk(repo_path):
+            if ".git" in root or "__pycache__" in root:
+                continue
+
+            for file in files:
+                fpath = os.path.join(root, file)
+                try:
+                    content = read_file(fpath)
+                    if not content:
+                        continue
+
+                    if file.endswith(".py"):
+                        funcs = re.findall(r"def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", content)
+                        terms.update(funcs)
+                        classes = re.findall(r"class\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*[:\(]", content)
+                        terms.update(classes)
+                    elif file.endswith(".R"):
+                        funcs = re.findall(r"([a-zA-Z_.][a-zA-Z0-9_.]*)\s*<-\s*function", content)
+                        terms.update(funcs)
+                except Exception:
+                    continue
+
+        filtered = [t for t, _ in terms.most_common(50)
+                    if len(t) > 4 and t not in {"init", "self", "setup", "test", "main"}]
+        return filtered[:20]
+
+    # =========================================================================
+    # ERROR INJECTION
+    # =========================================================================
+
+    def _inject_errors_into_file(
+        self,
+        fpath: str,
+        category: str,
+        tmp_repo_path: str,
+        min_per_category: int,
+        project_terms: List[str],
+        injector: LLMErrorInjector
+    ) -> Optional[Dict[str, Any]]:
+        """Inject errors into a single file."""
+        if not os.path.exists(fpath):
+            return None
+
+        baseline_content = read_file(fpath) or ""
+        if not baseline_content.strip():
+            return None
+
+        try:
+            corrupted, manifest = injector.inject(
+                baseline_content,
+                min_per_category=min_per_category,
+                project_terms=project_terms
+            )
+
+            # Determine relative path
+            rel_path = os.path.relpath(fpath, os.path.dirname(os.path.dirname(fpath)))
+            if rel_path.startswith("../"):
+                rel_path = os.path.basename(fpath)
+
+            corrupted_path = os.path.join(tmp_repo_path, rel_path)
+            os.makedirs(os.path.dirname(corrupted_path), exist_ok=True)
+            write_file(corrupted_path, corrupted)
+
+            # Add file path to each error
+            for error in manifest.get("errors", []):
+                error["file_path"] = rel_path
+
+            return {
+                "rel_path": rel_path,
+                "category": category,
+                "original_path": fpath,
+                "corrupted_path": corrupted_path,
+                "manifest": manifest,
+                "baseline_content": baseline_content,
+                "corrupted_content": corrupted,
+            }
+        except Exception as e:
+            self.print_step(f"InjectionError:{os.path.basename(fpath)}", str(e))
+            return None
+
+    def _inject_errors_parallel(
+        self,
+        target_files: Dict[str, List[str]],
+        tmp_repo_path: str,
+        min_per_category: int
+    ) -> Dict[str, Dict]:
+        """Inject errors into multiple files in parallel."""
+        injector = LLMErrorInjector(self.llm)
+        project_terms = self._extract_project_terms(tmp_repo_path)
+        self.print_step("ExtractTerms", f"Found {len(project_terms)} project terms")
+
+        all_manifests = {}
+
+        # Flatten file list with categories
+        files_with_cats = []
+        for category, file_list in target_files.items():
+            for fpath in file_list:
+                files_with_cats.append((fpath, category))
+
+        self.print_step("InjectErrors", f"Injecting into {len(files_with_cats)} files with {min_per_category} errors/category")
+
+        # Use ThreadPoolExecutor since LLM calls are I/O bound
+        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+            futures = {}
+            for fpath, category in files_with_cats:
+                future = executor.submit(
+                    self._inject_errors_into_file,
+                    fpath, category, tmp_repo_path, min_per_category, project_terms, injector
+                )
+                futures[future] = (fpath, category)
+
+            for future in as_completed(futures):
+                fpath, category = futures[future]
+                try:
+                    result = future.result()
+                    if result:
+                        all_manifests[result["rel_path"]] = result
+                        self.print_step(
+                            f"Injected:{os.path.basename(fpath)}",
+                            f"{len(result['manifest'].get('errors', []))} errors"
+                        )
+                except Exception as e:
+                    self.print_step(f"InjectionFailed:{os.path.basename(fpath)}", str(e))
+
+        return all_manifests
+
+    # =========================================================================
+    # STRESS TESTING
+    # =========================================================================
+
+    def run_stress_test(
+        self,
+        report_path: str,
+        baseline_repo_path: str,
+        output_base_path: str,
+        stress_levels: List[int] = None,
+        max_files_per_category: int = 10,
+        detect_semantic_fp: bool = True,
+        limit_generation_files: bool = True
+    ) -> Dict[int, StressTestResult]:
+        """
+        Run stress tests across multiple error count levels.
+
+        Args:
+            report_path: Path to evaluation report JSON
+            baseline_repo_path: Path to baseline repository
+            output_base_path: Base path for output directories
+            stress_levels: List of error counts to test (default: [10, 20, 40, 60, 100])
+            max_files_per_category: Max files to process per category
+            detect_semantic_fp: Whether to run semantic FP detection
+
+        Returns:
+            Dict mapping error_count to StressTestResult
+        """
+        import time
+
+        if stress_levels is None:
+            stress_levels = DEFAULT_STRESS_LEVELS
+
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        benchmark_dir = os.path.join(output_base_path, f"benchmark_{timestamp}")
+        os.makedirs(benchmark_dir, exist_ok=True)
+
+        self.print_step("StressTestStart", f"Testing levels: {stress_levels}")
+
+        # Select target files once
+        target_files = self._select_target_files(baseline_repo_path)
+
+        # Limit files per category
+        for cat in target_files:
+            if len(target_files[cat]) > max_files_per_category:
+                target_files[cat] = target_files[cat][:max_files_per_category]
+
+        total_files = sum(len(v) for v in target_files.values())
+        self.print_step("FilesSelected", f"{total_files} files across {len(target_files)} categories")
+
+        results: Dict[int, StressTestResult] = {}
+
+        for level in stress_levels:
+            start_time = time.time()
+            self.print_step(f"StressLevel:{level}", f"Starting with {level} errors per category")
+
+            # Create level-specific directory
+            level_dir = os.path.join(benchmark_dir, f"level_{level}")
+            tmp_repo_path = os.path.join(level_dir, "tmp_repo")
+
+            # Copy baseline repo
+            if os.path.exists(tmp_repo_path):
+                shutil.rmtree(tmp_repo_path)
+            shutil.copytree(baseline_repo_path, tmp_repo_path,
+                            symlinks=False, ignore=shutil.ignore_patterns('.git'))
+
+            # Inject errors
+            all_manifests = self._inject_errors_parallel(target_files, tmp_repo_path, level)
+
+            total_errors = sum(len(info["manifest"].get("errors", [])) for info in all_manifests.values())
+            self.print_step("InjectionComplete", f"{total_errors} errors in {len(all_manifests)} files")
+
+            # Save injection manifest
+            self._save_manifest(all_manifests, level_dir)
+
+            # Run BioGuider to fix - only process injected files to save time
+            injected_files = list(all_manifests.keys()) if limit_generation_files else None
+            num_injected = len(injected_files) if injected_files else 0
+
+            # Always use max_files as a hard limit when limiting
+            max_files_limit = num_injected if (limit_generation_files and num_injected > 0) else None
+
+            if limit_generation_files:
+                self.print_step("RunGeneration", f"Processing ONLY {num_injected} injected files (max_files={max_files_limit})")
+            else:
+                self.print_step("RunGeneration", "Processing ALL files...")
+
+            gen = DocumentationGenerationManager(self.llm, self.step_callback)
+            out_dir = gen.run(
+                report_path=report_path,
+                repo_path=tmp_repo_path,
+                target_files=injected_files,  # Filter by file path (primary)
+                max_files=max_files_limit  # Hard limit (backup guarantee)
+            )
+
+            # Evaluate results
+            self.print_step("EvaluateFixes", "Computing benchmark metrics...")
+            benchmark_result = evaluate_benchmark(
+                all_manifests, out_dir, self.llm, detect_semantic_fp
+            )
+
+            duration = time.time() - start_time
+
+            results[level] = StressTestResult(
+                error_count=level,
+                benchmark_result=benchmark_result,
+                output_dir=level_dir,
+                duration_seconds=duration,
+            )
+
+            # Save level results
+            self._save_level_results(results[level], level_dir)
+
+            self.print_step(
+                f"LevelComplete:{level}",
+                f"F1={benchmark_result.f1_score:.3f}, FixRate={benchmark_result.fix_rate:.3f}"
+            )
+
+        # Save aggregate stress test results
+        self._save_stress_test_results(results, benchmark_dir)
+
+        self.print_step("StressTestComplete", f"Results saved to {benchmark_dir}")
+        return results
+
+    # =========================================================================
+    # MULTI-MODEL COMPARISON
+    # =========================================================================
+
+    def prepare_model_comparison(
+        self,
+        report_path: str,
+        baseline_repo_path: str,
+        output_base_path: str,
+        error_count: int = 20,
+        max_files_per_category: int = 10
+    ) -> str:
+        """
+        Prepare corrupted files for multi-model comparison.
+
+        This generates corrupted files that can be manually run through
+        Cursor with different models (GPT-5.1, Claude Sonnet, Gemini).
+
+        Args:
+            report_path: Path to evaluation report
+            baseline_repo_path: Path to baseline repository
+            output_base_path: Base output path
+            error_count: Number of errors to inject per category
+            max_files_per_category: Max files per category
+
+        Returns:
+            Path to the prepared benchmark directory
+        """
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        benchmark_dir = os.path.join(output_base_path, f"model_comparison_{timestamp}")
+        os.makedirs(benchmark_dir, exist_ok=True)
+
+        self.print_step("PrepareComparison", f"Preparing files for model comparison")
+
+        # Select and limit files
+        target_files = self._select_target_files(baseline_repo_path)
+        for cat in target_files:
+            if len(target_files[cat]) > max_files_per_category:
+                target_files[cat] = target_files[cat][:max_files_per_category]
+
+        # Create tmp repo for injection
+        tmp_repo_path = os.path.join(benchmark_dir, "corrupted")
+        if os.path.exists(tmp_repo_path):
+            shutil.rmtree(tmp_repo_path)
+        shutil.copytree(baseline_repo_path, tmp_repo_path,
+                        symlinks=False, ignore=shutil.ignore_patterns('.git'))
+
+        # Inject errors
+        all_manifests = self._inject_errors_parallel(target_files, tmp_repo_path, error_count)
+
+        # Save manifest
+        self._save_manifest(all_manifests, benchmark_dir)
+
+        # Save original files for reference
+        originals_dir = os.path.join(benchmark_dir, "originals")
+        os.makedirs(originals_dir, exist_ok=True)
+        for rel_path, info in all_manifests.items():
+            orig_save_path = os.path.join(originals_dir, rel_path)
+            os.makedirs(os.path.dirname(orig_save_path), exist_ok=True)
+            write_file(orig_save_path, info["baseline_content"])
+
+        # Create directories for each model's fixed output
+        for model in SUPPORTED_MODELS:
+            model_dir = os.path.join(benchmark_dir, f"fixed_{model}")
+            os.makedirs(model_dir, exist_ok=True)
+
+        # Generate instructions file
+        self._generate_comparison_instructions(benchmark_dir, all_manifests)
+
+        self.print_step("ComparisonPrepared", f"Files ready in {benchmark_dir}")
+        return benchmark_dir
+
+    def evaluate_model_comparison(
+        self,
+        benchmark_dir: str,
+        models: List[str] = None,
+        detect_semantic_fp: bool = True
+    ) -> ModelComparisonResult:
+        """
+        Evaluate and compare results from multiple models.
+
+        Args:
+            benchmark_dir: Path to benchmark directory with fixed files
+            models: List of model names to evaluate
+            detect_semantic_fp: Whether to run semantic FP detection
+
+        Returns:
+            ModelComparisonResult with comparison data
+        """
+        if models is None:
+            models = SUPPORTED_MODELS
+
+        # Load manifest
+        manifest_path = os.path.join(benchmark_dir, "BENCHMARK_MANIFEST.json")
+        with open(manifest_path, 'r') as f:
+            manifest_data = json.load(f)
+
+        # Reconstruct manifests dict
+        all_manifests = {}
+        originals_dir = os.path.join(benchmark_dir, "originals")
+        corrupted_dir = os.path.join(benchmark_dir, "corrupted")
+
+        for rel_path, file_info in manifest_data["files"].items():
+            orig_content = read_file(os.path.join(originals_dir, rel_path)) or ""
+            corr_content = read_file(os.path.join(corrupted_dir, rel_path)) or ""
+
+            all_manifests[rel_path] = {
+                "category": file_info["category"],
+                "manifest": {"errors": file_info["errors"]},
+                "baseline_content": orig_content,
+                "corrupted_content": corr_content,
+            }
+
+        total_errors = manifest_data.get("total_errors", 0)
+
+        result = ModelComparisonResult(
+            models=models,
+            error_count=total_errors,
+        )
+
+        for model in models:
+            model_fixed_dir = os.path.join(benchmark_dir, f"fixed_{model}")
+
+            if not os.path.exists(model_fixed_dir):
+                self.print_step(f"SkipModel:{model}", "No fixed files found")
+                continue
+
+            # Check if there are any files in the directory
+            has_files = any(os.path.isfile(os.path.join(model_fixed_dir, f))
+                            for f in os.listdir(model_fixed_dir))
+            if not has_files:
+                self.print_step(f"SkipModel:{model}", "Directory empty")
+                continue
+
+            self.print_step(f"EvaluateModel:{model}", "Computing metrics...")
+
+            benchmark_result = evaluate_benchmark(
+                all_manifests, model_fixed_dir, self.llm, detect_semantic_fp
+            )
+
+            result.results[model] = benchmark_result
+
+            self.print_step(
+                f"ModelEvaluated:{model}",
+                f"F1={benchmark_result.f1_score:.3f}, FixRate={benchmark_result.fix_rate:.3f}"
+            )
+
+        # Save comparison results
+        self._save_comparison_results(result, benchmark_dir)
+
+        return result
+
+    # =========================================================================
+    # RESULT EXPORT
+    # =========================================================================
+
+    def _save_manifest(self, all_manifests: Dict[str, Dict], output_dir: str):
+        """Save injection manifest to JSON."""
+        all_errors = []
+        files_info = {}
+
+        for rel_path, info in all_manifests.items():
+            file_errors = info["manifest"].get("errors", [])
+            files_info[rel_path] = {
+                "category": info["category"],
+                "error_count": len(file_errors),
+                "errors": file_errors,
+            }
+            all_errors.extend(file_errors)
+
+        manifest = {
+            "total_files": len(all_manifests),
+            "total_errors": len(all_errors),
+            "files": files_info,
+        }
+
+        manifest_path = os.path.join(output_dir, "BENCHMARK_MANIFEST.json")
+        with open(manifest_path, 'w') as f:
+            json.dump(manifest, f, indent=2)
+
+    def _save_level_results(self, result: StressTestResult, output_dir: str):
+        """Save results for a single stress level."""
+        results_path = os.path.join(output_dir, "BENCHMARK_RESULTS.json")
+        with open(results_path, 'w') as f:
+            json.dump({
+                "error_count": result.error_count,
+                "duration_seconds": result.duration_seconds,
+                **result.benchmark_result.to_dict()
+            }, f, indent=2)
+
+    def _save_stress_test_results(
+        self,
+        results: Dict[int, StressTestResult],
+        output_dir: str
+    ):
+        """Save aggregate stress test results as JSON and CSV."""
+        # JSON format
+        stress_results = []
+        for level, result in sorted(results.items()):
+            stress_results.append({
+                "error_count": level,
+                "duration_seconds": result.duration_seconds,
+                **result.benchmark_result.to_dict()
+            })
+
+        json_path = os.path.join(output_dir, "STRESS_TEST_RESULTS.json")
+        with open(json_path, 'w') as f:
+            json.dump({"stress_results": stress_results}, f, indent=2)
+
+        # CSV format
+        csv_path = os.path.join(output_dir, "STRESS_TEST_TABLE.csv")
+        with open(csv_path, 'w', newline='') as f:
+            writer = csv.writer(f)
+            writer.writerow([
+                "error_count", "true_positives", "false_negatives", "false_positives",
+                "precision", "recall", "f1_score", "fix_rate", "duration_seconds"
+            ])
+            for level, result in sorted(results.items()):
+                br = result.benchmark_result
+                writer.writerow([
+                    level, br.true_positives, br.false_negatives, br.false_positives,
+                    round(br.precision, 4), round(br.recall, 4), round(br.f1_score, 4),
+                    round(br.fix_rate, 4), round(result.duration_seconds, 2)
+                ])
+
+        # Markdown report
+        self._generate_stress_test_report(results, output_dir)
+
+    def _generate_stress_test_report(
+        self,
+        results: Dict[int, StressTestResult],
+        output_dir: str
+    ):
+        """Generate markdown report for stress test."""
+        lines = [
+            "# Stress Test Results\n",
+            f"\n**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
+            "\n---\n",
+            "\n## Summary Table\n",
+            "\n| Errors | TP | FN | FP | Precision | Recall | F1 | Fix Rate |\n",
+            "|--------|----|----|-----|-----------|--------|-----|----------|\n",
+        ]
+
+        for level, result in sorted(results.items()):
+            br = result.benchmark_result
+            lines.append(
+                f"| {level} | {br.true_positives} | {br.false_negatives} | "
+                f"{br.false_positives} | {br.precision:.3f} | {br.recall:.3f} | "
+                f"{br.f1_score:.3f} | {br.fix_rate:.3f} |\n"
+            )
+
+        lines.append("\n---\n")
+        lines.append("\n## Key Findings\n")
+
+        # Find performance drop-off point
+        prev_f1 = 1.0
+        drop_point = None
+        for level, result in sorted(results.items()):
+            if result.benchmark_result.f1_score < prev_f1 * 0.8:  # 20% drop
+                drop_point = level
+                break
+            prev_f1 = result.benchmark_result.f1_score
+
+        if drop_point:
+            lines.append(f"\n- **Performance drop-off**: Significant decline observed at {drop_point} errors\n")
+        else:
+            lines.append("\n- **Performance**: Stable across all tested error levels\n")
+
+        # Best/worst performance
+        best_level = max(results.keys(), key=lambda k: results[k].benchmark_result.f1_score)
+        worst_level = min(results.keys(), key=lambda k: results[k].benchmark_result.f1_score)
+
+        lines.append(f"- **Best F1 Score**: {results[best_level].benchmark_result.f1_score:.3f} at {best_level} errors\n")
+        lines.append(f"- **Worst F1 Score**: {results[worst_level].benchmark_result.f1_score:.3f} at {worst_level} errors\n")
+
+        report_path = os.path.join(output_dir, "STRESS_TEST_REPORT.md")
+        with open(report_path, 'w') as f:
+            f.writelines(lines)
+
+    def _save_comparison_results(
+        self,
+        result: ModelComparisonResult,
+        output_dir: str
+    ):
+        """Save model comparison results as JSON and CSV."""
+        # JSON format
+        comparison_data = {
+            "models": result.models,
+            "error_count": result.error_count,
+            "results": {
+                model: br.to_dict()
+                for model, br in result.results.items()
+            }
+        }
+
+        json_path = os.path.join(output_dir, "MODEL_COMPARISON_RESULTS.json")
+        with open(json_path, 'w') as f:
+            json.dump(comparison_data, f, indent=2)
+
+        # CSV format
+        csv_path = os.path.join(output_dir, "MODEL_COMPARISON_TABLE.csv")
+        with open(csv_path, 'w', newline='') as f:
+            writer = csv.writer(f)
+            writer.writerow([
+                "model", "true_positives", "false_negatives", "false_positives",
+                "precision", "recall", "f1_score", "fix_rate"
+            ])
+            for model, br in result.results.items():
+                writer.writerow([
+                    model, br.true_positives, br.false_negatives, br.false_positives,
+                    round(br.precision, 4), round(br.recall, 4),
+                    round(br.f1_score, 4), round(br.fix_rate, 4)
+                ])
+
+        # Markdown report
+        self._generate_comparison_report(result, output_dir)
+
+    def _generate_comparison_report(
+        self,
+        result: ModelComparisonResult,
+        output_dir: str
+    ):
+        """Generate markdown report for model comparison."""
+        lines = [
+            "# Model Comparison Results\n",
+            f"\n**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
+            f"**Error Count**: {result.error_count}\n",
+            "\n---\n",
+            "\n## Comparison Table\n",
+            "\n| Model | TP | FN | FP | Precision | Recall | F1 | Fix Rate |\n",
+            "|-------|----|----|-----|-----------|--------|-----|----------|\n",
+        ]
+
+        for model, br in result.results.items():
+            lines.append(
+                f"| {model} | {br.true_positives} | {br.false_negatives} | "
+                f"{br.false_positives} | {br.precision:.3f} | {br.recall:.3f} | "
+                f"{br.f1_score:.3f} | {br.fix_rate:.3f} |\n"
+            )
+
+        lines.append("\n---\n")
+        lines.append("\n## Rankings\n")
+
+        # Rank by F1 score
+        ranked = sorted(result.results.items(), key=lambda x: x[1].f1_score, reverse=True)
+        lines.append("\n### By F1 Score\n")
+        for i, (model, br) in enumerate(ranked, 1):
+            lines.append(f"{i}. **{model}**: {br.f1_score:.3f}\n")
+
+        report_path = os.path.join(output_dir, "MODEL_COMPARISON_REPORT.md")
+        with open(report_path, 'w') as f:
+            f.writelines(lines)
+
+    def _generate_comparison_instructions(
+        self,
+        output_dir: str,
+        all_manifests: Dict[str, Dict]
+    ):
+        """Generate instructions for running model comparison."""
+        files_list = list(all_manifests.keys())
+
+        lines = [
+            "# Model Comparison Instructions\n",
+            f"\n**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
+            "\n---\n",
+            "\n## Overview\n",
+            f"\nThis benchmark contains {len(files_list)} corrupted files for testing.\n",
+            "\n## Files to Process\n",
+        ]
+
+        for rel_path in files_list:
+            lines.append(f"- `corrupted/{rel_path}`\n")
+
+        lines.append("\n---\n")
+        lines.append("\n## Instructions for Each Model\n")
+
+        for model in SUPPORTED_MODELS:
+            if model == "bioguider":
+                lines.append(f"\n### {model}\n")
+                lines.append("Run automatically via the benchmark evaluation.\n")
+            else:
+                lines.append(f"\n### {model}\n")
+                lines.append("1. Open each file in `corrupted/` with Cursor\n")
+                lines.append(f"2. Use {model} as the AI model\n")
+                lines.append("3. Prompt: 'Fix all errors, typos, broken links, and formatting issues in this file'\n")
+                lines.append(f"4. Save fixed files to `fixed_{model}/` maintaining directory structure\n")
+
+        lines.append("\n---\n")
+        lines.append("\n## After Fixing\n")
+        lines.append("\nRun evaluation:\n")
+        lines.append("```python\n")
+        lines.append("from bioguider.managers.benchmark_manager import BenchmarkManager\n")
+        lines.append("mgr = BenchmarkManager(llm, callback)\n")
+        lines.append(f'result = mgr.evaluate_model_comparison("{output_dir}")\n')
+        lines.append("```\n")
+
+        instructions_path = os.path.join(output_dir, "INSTRUCTIONS.md")
+        with open(instructions_path, 'w') as f:
+            f.writelines(lines)
+
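For orientation, the sketch below strings together the two workflows this module exposes, using only the constructor and method signatures visible in the diff above. The `ChatOpenAI` handle, the model name, and every path are placeholder assumptions for illustration, not values shipped with the package.

```python
from langchain_openai import ChatOpenAI

from bioguider.managers.benchmark_manager import BenchmarkManager

# Placeholder LLM handle and paths (assumptions) -- substitute your own.
llm = ChatOpenAI(model="gpt-4o")
mgr = BenchmarkManager(llm, max_workers=4)

# Workflow 1: stress test. Inject N errors per category at each level, let
# BioGuider regenerate the injected files, then score precision/recall/F1.
stress_results = mgr.run_stress_test(
    report_path="outputs/evaluation_report.json",
    baseline_repo_path="repos/example_pkg",
    output_base_path="outputs/benchmarks",
    stress_levels=[10, 20, 40],
)
for level, res in sorted(stress_results.items()):
    print(level, res.benchmark_result.f1_score, res.output_dir)

# Workflow 2: multi-model comparison. Prepare corrupted/ and fixed_<model>/
# directories, fix the files externally per the generated INSTRUCTIONS.md,
# then evaluate whichever fixed_<model>/ directories are populated.
benchmark_dir = mgr.prepare_model_comparison(
    report_path="outputs/evaluation_report.json",
    baseline_repo_path="repos/example_pkg",
    output_base_path="outputs/benchmarks",
    error_count=20,
)
comparison = mgr.evaluate_model_comparison(benchmark_dir)
for model, br in comparison.results.items():
    print(model, br.f1_score, br.fix_rate)
```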