bioguider-0.2.52-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. bioguider/__init__.py +0 -0
  2. bioguider/agents/__init__.py +0 -0
  3. bioguider/agents/agent_task.py +92 -0
  4. bioguider/agents/agent_tools.py +176 -0
  5. bioguider/agents/agent_utils.py +504 -0
  6. bioguider/agents/collection_execute_step.py +182 -0
  7. bioguider/agents/collection_observe_step.py +125 -0
  8. bioguider/agents/collection_plan_step.py +156 -0
  9. bioguider/agents/collection_task.py +184 -0
  10. bioguider/agents/collection_task_utils.py +142 -0
  11. bioguider/agents/common_agent.py +137 -0
  12. bioguider/agents/common_agent_2step.py +215 -0
  13. bioguider/agents/common_conversation.py +61 -0
  14. bioguider/agents/common_step.py +85 -0
  15. bioguider/agents/consistency_collection_step.py +102 -0
  16. bioguider/agents/consistency_evaluation_task.py +57 -0
  17. bioguider/agents/consistency_evaluation_task_utils.py +14 -0
  18. bioguider/agents/consistency_observe_step.py +110 -0
  19. bioguider/agents/consistency_query_step.py +77 -0
  20. bioguider/agents/dockergeneration_execute_step.py +186 -0
  21. bioguider/agents/dockergeneration_observe_step.py +154 -0
  22. bioguider/agents/dockergeneration_plan_step.py +158 -0
  23. bioguider/agents/dockergeneration_task.py +158 -0
  24. bioguider/agents/dockergeneration_task_utils.py +220 -0
  25. bioguider/agents/evaluation_installation_task.py +270 -0
  26. bioguider/agents/evaluation_readme_task.py +767 -0
  27. bioguider/agents/evaluation_submission_requirements_task.py +172 -0
  28. bioguider/agents/evaluation_task.py +206 -0
  29. bioguider/agents/evaluation_tutorial_task.py +169 -0
  30. bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
  31. bioguider/agents/evaluation_userguide_prompts.py +179 -0
  32. bioguider/agents/evaluation_userguide_task.py +154 -0
  33. bioguider/agents/evaluation_utils.py +127 -0
  34. bioguider/agents/identification_execute_step.py +181 -0
  35. bioguider/agents/identification_observe_step.py +104 -0
  36. bioguider/agents/identification_plan_step.py +140 -0
  37. bioguider/agents/identification_task.py +270 -0
  38. bioguider/agents/identification_task_utils.py +22 -0
  39. bioguider/agents/peo_common_step.py +64 -0
  40. bioguider/agents/prompt_utils.py +253 -0
  41. bioguider/agents/python_ast_repl_tool.py +69 -0
  42. bioguider/agents/rag_collection_task.py +130 -0
  43. bioguider/conversation.py +67 -0
  44. bioguider/database/code_structure_db.py +500 -0
  45. bioguider/database/summarized_file_db.py +146 -0
  46. bioguider/generation/__init__.py +39 -0
  47. bioguider/generation/benchmark_metrics.py +610 -0
  48. bioguider/generation/change_planner.py +189 -0
  49. bioguider/generation/document_renderer.py +157 -0
  50. bioguider/generation/llm_cleaner.py +67 -0
  51. bioguider/generation/llm_content_generator.py +1128 -0
  52. bioguider/generation/llm_injector.py +809 -0
  53. bioguider/generation/models.py +85 -0
  54. bioguider/generation/output_manager.py +74 -0
  55. bioguider/generation/repo_reader.py +37 -0
  56. bioguider/generation/report_loader.py +166 -0
  57. bioguider/generation/style_analyzer.py +36 -0
  58. bioguider/generation/suggestion_extractor.py +436 -0
  59. bioguider/generation/test_metrics.py +189 -0
  60. bioguider/managers/benchmark_manager.py +785 -0
  61. bioguider/managers/evaluation_manager.py +215 -0
  62. bioguider/managers/generation_manager.py +686 -0
  63. bioguider/managers/generation_test_manager.py +107 -0
  64. bioguider/managers/generation_test_manager_v2.py +525 -0
  65. bioguider/rag/__init__.py +0 -0
  66. bioguider/rag/config.py +117 -0
  67. bioguider/rag/data_pipeline.py +651 -0
  68. bioguider/rag/embedder.py +24 -0
  69. bioguider/rag/rag.py +138 -0
  70. bioguider/settings.py +103 -0
  71. bioguider/utils/code_structure_builder.py +59 -0
  72. bioguider/utils/constants.py +135 -0
  73. bioguider/utils/default.gitignore +140 -0
  74. bioguider/utils/file_utils.py +215 -0
  75. bioguider/utils/gitignore_checker.py +175 -0
  76. bioguider/utils/notebook_utils.py +117 -0
  77. bioguider/utils/pyphen_utils.py +73 -0
  78. bioguider/utils/python_file_handler.py +65 -0
  79. bioguider/utils/r_file_handler.py +551 -0
  80. bioguider/utils/utils.py +163 -0
  81. bioguider-0.2.52.dist-info/LICENSE +21 -0
  82. bioguider-0.2.52.dist-info/METADATA +51 -0
  83. bioguider-0.2.52.dist-info/RECORD +84 -0
  84. bioguider-0.2.52.dist-info/WHEEL +4 -0
bioguider/managers/benchmark_manager.py
@@ -0,0 +1,785 @@
+ """
+ Benchmark Manager for comprehensive error injection testing.
+
+ Provides:
+ - Stress testing across multiple error count levels (10, 20, 40, 60, 100)
+ - Multi-process parallel execution for files and stress levels
+ - Multi-model comparison support (BioGuider + external models)
+ - CSV/JSON export of results
+ """
+ from __future__ import annotations
+
+ import os
+ import json
+ import csv
+ import shutil
+ from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
+ from dataclasses import dataclass, field
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Dict, List, Optional, Any, Callable, Tuple
+
+ from langchain_openai.chat_models.base import BaseChatOpenAI
+
+ from bioguider.generation.llm_injector import LLMErrorInjector
+ from bioguider.generation.benchmark_metrics import (
+     BenchmarkResult,
+     BenchmarkEvaluator,
+     evaluate_benchmark,
+ )
+ from bioguider.managers.generation_manager import DocumentationGenerationManager
+ from bioguider.agents.agent_utils import read_file, write_file
+
+
+ # Default stress test levels
+ DEFAULT_STRESS_LEVELS = [10, 20, 40, 60, 100]
+
+ # Supported external models for comparison
+ SUPPORTED_MODELS = ["bioguider", "gpt-5.1", "claude-sonnet", "gemini"]
+
+
+ @dataclass
+ class StressTestResult:
+     """Result of a single stress test level."""
+     error_count: int
+     benchmark_result: BenchmarkResult
+     output_dir: str
+     duration_seconds: float = 0.0
+
+
+ @dataclass
+ class ModelComparisonResult:
+     """Comparison results across multiple models."""
+     models: List[str]
+     error_count: int
+     results: Dict[str, BenchmarkResult] = field(default_factory=dict)
+
+
+ class BenchmarkManager:
+     """
+     Manages comprehensive benchmark testing for error injection.
+
+     Features:
+     - Stress testing with configurable error levels
+     - Multi-process parallel execution
+     - Multi-model comparison support
+     - Comprehensive result export (JSON, CSV, Markdown)
+     """
+
+     def __init__(
+         self,
+         llm: BaseChatOpenAI,
+         step_callback: Optional[Callable] = None,
+         max_workers: int = 4
+     ):
+         self.llm = llm
+         self.step_callback = step_callback
+         self.max_workers = max_workers
+
+     def print_step(self, name: str, output: str = ""):
+         """Output step progress."""
+         if self.step_callback:
+             self.step_callback(step_name=name, step_output=output)
+         else:
+             print(f"[{name}] {output}")
+
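For orientation, a minimal construction sketch for the class defined above; the concrete chat model and the logging callback are placeholder assumptions, not values prescribed by the package.

```python
# Minimal sketch: build a BenchmarkManager with an OpenAI-compatible chat model.
# The model name and the callback below are assumptions for illustration only.
from langchain_openai import ChatOpenAI

from bioguider.managers.benchmark_manager import BenchmarkManager

def log_step(step_name: str, step_output: str = ""):
    # print_step forwards step_name/step_output keyword arguments to the callback.
    print(f"[{step_name}] {step_output}")

llm = ChatOpenAI(model="gpt-4o")  # any BaseChatOpenAI instance should work
mgr = BenchmarkManager(llm, step_callback=log_step, max_workers=4)
```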
+     # =========================================================================
+     # FILE SELECTION
+     # =========================================================================
+
+     def _select_target_files(self, baseline_repo_path: str) -> Dict[str, List[str]]:
+         """
+         Select target files for error injection across multiple categories.
+         """
+         targets = {
+             "readme": [],
+             "tutorial": [],
+             "userguide": [],
+             "installation": []
+         }
+
+         # README files
+         readme_path = os.path.join(baseline_repo_path, "README.md")
+         if os.path.exists(readme_path):
+             targets["readme"].append(readme_path)
+
+         # Tutorial files (RMarkdown vignettes)
+         vignettes_dir = os.path.join(baseline_repo_path, "vignettes")
+         if os.path.isdir(vignettes_dir):
+             for f in sorted(os.listdir(vignettes_dir)):
+                 if f.endswith('.Rmd') and not f.startswith('.'):
+                     targets["tutorial"].append(os.path.join(vignettes_dir, f))
+
+         # Installation files
+         for pattern in ["install", "INSTALL", "installation"]:
+             for ext in [".md", ".Rmd", ".rst"]:
+                 fpath = os.path.join(baseline_repo_path, pattern + ext)
+                 if os.path.exists(fpath):
+                     targets["installation"].append(fpath)
+
+         # Userguide files
+         docs_dir = os.path.join(baseline_repo_path, "docs")
+         if os.path.isdir(docs_dir):
+             for f in sorted(os.listdir(docs_dir)):
+                 if f.endswith('.md') and not f.startswith('.'):
+                     targets["userguide"].append(os.path.join(docs_dir, f))
+
+         return targets
+
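For reference, the selection above always returns these four keys; the repository path and file names below are illustrative assumptions for a typical R package layout.

```python
# Illustrative return value of _select_target_files for a hypothetical repo.
targets = {
    "readme": ["/tmp/baseline/README.md"],
    "tutorial": ["/tmp/baseline/vignettes/intro.Rmd"],
    "userguide": ["/tmp/baseline/docs/usage.md"],
    "installation": ["/tmp/baseline/INSTALL.md"],
}
```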
+     def _extract_project_terms(self, repo_path: str) -> List[str]:
+         """Extract function names and key terms from the codebase."""
+         import re
+         from collections import Counter
+
+         terms = Counter()
+
+         for root, _, files in os.walk(repo_path):
+             if ".git" in root or "__pycache__" in root:
+                 continue
+
+             for file in files:
+                 fpath = os.path.join(root, file)
+                 try:
+                     content = read_file(fpath)
+                     if not content:
+                         continue
+
+                     if file.endswith(".py"):
+                         funcs = re.findall(r"def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", content)
+                         terms.update(funcs)
+                         classes = re.findall(r"class\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*[:\(]", content)
+                         terms.update(classes)
+                     elif file.endswith(".R"):
+                         funcs = re.findall(r"([a-zA-Z_.][a-zA-Z0-9_.]*)\s*<-\s*function", content)
+                         terms.update(funcs)
+                 except Exception:
+                     continue
+
+         filtered = [t for t, _ in terms.most_common(50)
+                     if len(t) > 4 and t not in {"init", "self", "setup", "test", "main"}]
+         return filtered[:20]
+
+     # =========================================================================
+     # ERROR INJECTION
+     # =========================================================================
+
+     def _inject_errors_into_file(
+         self,
+         fpath: str,
+         category: str,
+         tmp_repo_path: str,
+         min_per_category: int,
+         project_terms: List[str],
+         injector: LLMErrorInjector
+     ) -> Optional[Dict[str, Any]]:
+         """Inject errors into a single file."""
+         if not os.path.exists(fpath):
+             return None
+
+         baseline_content = read_file(fpath) or ""
+         if not baseline_content.strip():
+             return None
+
+         try:
+             corrupted, manifest = injector.inject(
+                 baseline_content,
+                 min_per_category=min_per_category,
+                 project_terms=project_terms
+             )
+
+             # Determine relative path
+             rel_path = os.path.relpath(fpath, os.path.dirname(os.path.dirname(fpath)))
+             if rel_path.startswith("../"):
+                 rel_path = os.path.basename(fpath)
+
+             corrupted_path = os.path.join(tmp_repo_path, rel_path)
+             os.makedirs(os.path.dirname(corrupted_path), exist_ok=True)
+             write_file(corrupted_path, corrupted)
+
+             # Add file path to each error
+             for error in manifest.get("errors", []):
+                 error["file_path"] = rel_path
+
+             return {
+                 "rel_path": rel_path,
+                 "category": category,
+                 "original_path": fpath,
+                 "corrupted_path": corrupted_path,
+                 "manifest": manifest,
+                 "baseline_content": baseline_content,
+                 "corrupted_content": corrupted,
+             }
+         except Exception as e:
+             self.print_step(f"InjectionError:{os.path.basename(fpath)}", str(e))
+             return None
+
+     def _inject_errors_parallel(
+         self,
+         target_files: Dict[str, List[str]],
+         tmp_repo_path: str,
+         min_per_category: int
+     ) -> Dict[str, Dict]:
+         """Inject errors into multiple files in parallel."""
+         injector = LLMErrorInjector(self.llm)
+         project_terms = self._extract_project_terms(tmp_repo_path)
+         self.print_step("ExtractTerms", f"Found {len(project_terms)} project terms")
+
+         all_manifests = {}
+
+         # Flatten file list with categories
+         files_with_cats = []
+         for category, file_list in target_files.items():
+             for fpath in file_list:
+                 files_with_cats.append((fpath, category))
+
+         self.print_step("InjectErrors", f"Injecting into {len(files_with_cats)} files with {min_per_category} errors/category")
+
+         # Use ThreadPoolExecutor since LLM calls are I/O bound
+         with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
+             futures = {}
+             for fpath, category in files_with_cats:
+                 future = executor.submit(
+                     self._inject_errors_into_file,
+                     fpath, category, tmp_repo_path, min_per_category, project_terms, injector
+                 )
+                 futures[future] = (fpath, category)
+
+             for future in as_completed(futures):
+                 fpath, category = futures[future]
+                 try:
+                     result = future.result()
+                     if result:
+                         all_manifests[result["rel_path"]] = result
+                         self.print_step(
+                             f"Injected:{os.path.basename(fpath)}",
+                             f"{len(result['manifest'].get('errors', []))} errors"
+                         )
+                 except Exception as e:
+                     self.print_step(f"InjectionFailed:{os.path.basename(fpath)}", str(e))
+
+         return all_manifests
+
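The parallel injection above returns a dict keyed by relative path. One entry looks roughly like the sketch below; the paths are assumptions, and the error payload carries whatever fields LLMErrorInjector.inject reports beyond the file_path added here.

```python
# Sketch of one entry in the dict returned by _inject_errors_parallel.
# Paths and the error payload are illustrative assumptions.
all_manifests = {
    "vignettes/intro.Rmd": {
        "rel_path": "vignettes/intro.Rmd",
        "category": "tutorial",
        "original_path": "/tmp/baseline/vignettes/intro.Rmd",
        "corrupted_path": "/tmp/level_10/tmp_repo/vignettes/intro.Rmd",
        "manifest": {"errors": [{"file_path": "vignettes/intro.Rmd"}]},
        "baseline_content": "<original file text>",
        "corrupted_content": "<text with injected errors>",
    },
}
```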
+     # =========================================================================
+     # STRESS TESTING
+     # =========================================================================
+
+     def run_stress_test(
+         self,
+         report_path: str,
+         baseline_repo_path: str,
+         output_base_path: str,
+         stress_levels: List[int] = None,
+         max_files_per_category: int = 10,
+         detect_semantic_fp: bool = True,
+         limit_generation_files: bool = True
+     ) -> Dict[int, StressTestResult]:
+         """
+         Run stress tests across multiple error count levels.
+
+         Args:
+             report_path: Path to evaluation report JSON
+             baseline_repo_path: Path to baseline repository
+             output_base_path: Base path for output directories
+             stress_levels: List of error counts to test (default: [10, 20, 40, 60, 100])
+             max_files_per_category: Max files to process per category
+             detect_semantic_fp: Whether to run semantic FP detection
+
+         Returns:
+             Dict mapping error_count to StressTestResult
+         """
+         import time
+
+         if stress_levels is None:
+             stress_levels = DEFAULT_STRESS_LEVELS
+
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         benchmark_dir = os.path.join(output_base_path, f"benchmark_{timestamp}")
+         os.makedirs(benchmark_dir, exist_ok=True)
+
+         self.print_step("StressTestStart", f"Testing levels: {stress_levels}")
+
+         # Select target files once
+         target_files = self._select_target_files(baseline_repo_path)
+
+         # Limit files per category
+         for cat in target_files:
+             if len(target_files[cat]) > max_files_per_category:
+                 target_files[cat] = target_files[cat][:max_files_per_category]
+
+         total_files = sum(len(v) for v in target_files.values())
+         self.print_step("FilesSelected", f"{total_files} files across {len(target_files)} categories")
+
+         results: Dict[int, StressTestResult] = {}
+
+         for level in stress_levels:
+             start_time = time.time()
+             self.print_step(f"StressLevel:{level}", f"Starting with {level} errors per category")
+
+             # Create level-specific directory
+             level_dir = os.path.join(benchmark_dir, f"level_{level}")
+             tmp_repo_path = os.path.join(level_dir, "tmp_repo")
+
+             # Copy baseline repo
+             if os.path.exists(tmp_repo_path):
+                 shutil.rmtree(tmp_repo_path)
+             shutil.copytree(baseline_repo_path, tmp_repo_path,
+                             symlinks=False, ignore=shutil.ignore_patterns('.git'))
+
+             # Inject errors
+             all_manifests = self._inject_errors_parallel(target_files, tmp_repo_path, level)
+
+             total_errors = sum(len(info["manifest"].get("errors", [])) for info in all_manifests.values())
+             self.print_step("InjectionComplete", f"{total_errors} errors in {len(all_manifests)} files")
+
+             # Save injection manifest
+             self._save_manifest(all_manifests, level_dir)
+
+             # Run BioGuider to fix - only process injected files to save time
+             injected_files = list(all_manifests.keys()) if limit_generation_files else None
+             num_injected = len(injected_files) if injected_files else 0
+
+             # Always use max_files as a hard limit when limiting
+             max_files_limit = num_injected if (limit_generation_files and num_injected > 0) else None
+
+             if limit_generation_files:
+                 self.print_step("RunGeneration", f"Processing ONLY {num_injected} injected files (max_files={max_files_limit})")
+             else:
+                 self.print_step("RunGeneration", "Processing ALL files...")
+
+             gen = DocumentationGenerationManager(self.llm, self.step_callback)
+             out_dir = gen.run(
+                 report_path=report_path,
+                 repo_path=tmp_repo_path,
+                 target_files=injected_files,  # Filter by file path (primary)
+                 max_files=max_files_limit  # Hard limit (backup guarantee)
+             )
+
+             # Evaluate results
+             self.print_step("EvaluateFixes", "Computing benchmark metrics...")
+             benchmark_result = evaluate_benchmark(
+                 all_manifests, out_dir, self.llm, detect_semantic_fp
+             )
+
+             duration = time.time() - start_time
+
+             results[level] = StressTestResult(
+                 error_count=level,
+                 benchmark_result=benchmark_result,
+                 output_dir=level_dir,
+                 duration_seconds=duration,
+             )
+
+             # Save level results
+             self._save_level_results(results[level], level_dir)
+
+             self.print_step(
+                 f"LevelComplete:{level}",
+                 f"F1={benchmark_result.f1_score:.3f}, FixRate={benchmark_result.fix_rate:.3f}"
+             )
+
+         # Save aggregate stress test results
+         self._save_stress_test_results(results, benchmark_dir)
+
+         self.print_step("StressTestComplete", f"Results saved to {benchmark_dir}")
+         return results
+
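A usage sketch for run_stress_test, assuming the mgr instance from the earlier sketch; the paths and the reduced set of stress levels are placeholder assumptions.

```python
# Run a stress test at three error levels and print the headline metrics.
results = mgr.run_stress_test(
    report_path="outputs/evaluation_report.json",
    baseline_repo_path="/tmp/baseline",
    output_base_path="outputs/benchmarks",
    stress_levels=[10, 20, 40],
)
for level, res in sorted(results.items()):
    br = res.benchmark_result
    print(f"{level} errors: F1={br.f1_score:.3f}, fix_rate={br.fix_rate:.3f}")
```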
+     # =========================================================================
+     # MULTI-MODEL COMPARISON
+     # =========================================================================
+
+     def prepare_model_comparison(
+         self,
+         report_path: str,
+         baseline_repo_path: str,
+         output_base_path: str,
+         error_count: int = 20,
+         max_files_per_category: int = 10
+     ) -> str:
+         """
+         Prepare corrupted files for multi-model comparison.
+
+         This generates corrupted files that can be manually run through
+         Cursor with different models (GPT-5.1, Claude Sonnet, Gemini).
+
+         Args:
+             report_path: Path to evaluation report
+             baseline_repo_path: Path to baseline repository
+             output_base_path: Base output path
+             error_count: Number of errors to inject per category
+             max_files_per_category: Max files per category
+
+         Returns:
+             Path to the prepared benchmark directory
+         """
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         benchmark_dir = os.path.join(output_base_path, f"model_comparison_{timestamp}")
+         os.makedirs(benchmark_dir, exist_ok=True)
+
+         self.print_step("PrepareComparison", f"Preparing files for model comparison")
+
+         # Select and limit files
+         target_files = self._select_target_files(baseline_repo_path)
+         for cat in target_files:
+             if len(target_files[cat]) > max_files_per_category:
+                 target_files[cat] = target_files[cat][:max_files_per_category]
+
+         # Create tmp repo for injection
+         tmp_repo_path = os.path.join(benchmark_dir, "corrupted")
+         if os.path.exists(tmp_repo_path):
+             shutil.rmtree(tmp_repo_path)
+         shutil.copytree(baseline_repo_path, tmp_repo_path,
+                         symlinks=False, ignore=shutil.ignore_patterns('.git'))
+
+         # Inject errors
+         all_manifests = self._inject_errors_parallel(target_files, tmp_repo_path, error_count)
+
+         # Save manifest
+         self._save_manifest(all_manifests, benchmark_dir)
+
+         # Save original files for reference
+         originals_dir = os.path.join(benchmark_dir, "originals")
+         os.makedirs(originals_dir, exist_ok=True)
+         for rel_path, info in all_manifests.items():
+             orig_save_path = os.path.join(originals_dir, rel_path)
+             os.makedirs(os.path.dirname(orig_save_path), exist_ok=True)
+             write_file(orig_save_path, info["baseline_content"])
+
+         # Create directories for each model's fixed output
+         for model in SUPPORTED_MODELS:
+             model_dir = os.path.join(benchmark_dir, f"fixed_{model}")
+             os.makedirs(model_dir, exist_ok=True)
+
+         # Generate instructions file
+         self._generate_comparison_instructions(benchmark_dir, all_manifests)
+
+         self.print_step("ComparisonPrepared", f"Files ready in {benchmark_dir}")
+         return benchmark_dir
+
+     def evaluate_model_comparison(
+         self,
+         benchmark_dir: str,
+         models: List[str] = None,
+         detect_semantic_fp: bool = True
+     ) -> ModelComparisonResult:
+         """
+         Evaluate and compare results from multiple models.
+
+         Args:
+             benchmark_dir: Path to benchmark directory with fixed files
+             models: List of model names to evaluate
+             detect_semantic_fp: Whether to run semantic FP detection
+
+         Returns:
+             ModelComparisonResult with comparison data
+         """
+         if models is None:
+             models = SUPPORTED_MODELS
+
+         # Load manifest
+         manifest_path = os.path.join(benchmark_dir, "BENCHMARK_MANIFEST.json")
+         with open(manifest_path, 'r') as f:
+             manifest_data = json.load(f)
+
+         # Reconstruct manifests dict
+         all_manifests = {}
+         originals_dir = os.path.join(benchmark_dir, "originals")
+         corrupted_dir = os.path.join(benchmark_dir, "corrupted")
+
+         for rel_path, file_info in manifest_data["files"].items():
+             orig_content = read_file(os.path.join(originals_dir, rel_path)) or ""
+             corr_content = read_file(os.path.join(corrupted_dir, rel_path)) or ""
+
+             all_manifests[rel_path] = {
+                 "category": file_info["category"],
+                 "manifest": {"errors": file_info["errors"]},
+                 "baseline_content": orig_content,
+                 "corrupted_content": corr_content,
+             }
+
+         total_errors = manifest_data.get("total_errors", 0)
+
+         result = ModelComparisonResult(
+             models=models,
+             error_count=total_errors,
+         )
+
+         for model in models:
+             model_fixed_dir = os.path.join(benchmark_dir, f"fixed_{model}")
+
+             if not os.path.exists(model_fixed_dir):
+                 self.print_step(f"SkipModel:{model}", "No fixed files found")
+                 continue
+
+             # Check if there are any files in the directory
+             has_files = any(os.path.isfile(os.path.join(model_fixed_dir, f))
+                             for f in os.listdir(model_fixed_dir))
+             if not has_files:
+                 self.print_step(f"SkipModel:{model}", "Directory empty")
+                 continue
+
+             self.print_step(f"EvaluateModel:{model}", "Computing metrics...")
+
+             benchmark_result = evaluate_benchmark(
+                 all_manifests, model_fixed_dir, self.llm, detect_semantic_fp
+             )
+
+             result.results[model] = benchmark_result
+
+             self.print_step(
+                 f"ModelEvaluated:{model}",
+                 f"F1={benchmark_result.f1_score:.3f}, FixRate={benchmark_result.fix_rate:.3f}"
+             )
+
+         # Save comparison results
+         self._save_comparison_results(result, benchmark_dir)
+
+         return result
+
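The two methods above form a prepare/evaluate pair around a manual fixing step; a sketch of that workflow, again assuming the mgr instance and paths from the earlier sketches.

```python
# Phase 1: inject errors and lay out corrupted/, originals/, fixed_<model>/ dirs.
benchmark_dir = mgr.prepare_model_comparison(
    report_path="outputs/evaluation_report.json",
    baseline_repo_path="/tmp/baseline",
    output_base_path="outputs/benchmarks",
    error_count=20,
)

# Phase 2 (manual): fix the files under corrupted/ with each external model and
# save them into the matching fixed_<model>/ directory, per INSTRUCTIONS.md.

# Phase 3: score every model that has fixed files.
comparison = mgr.evaluate_model_comparison(benchmark_dir)
for model, br in comparison.results.items():
    print(f"{model}: F1={br.f1_score:.3f}, fix_rate={br.fix_rate:.3f}")
```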
+     # =========================================================================
+     # RESULT EXPORT
+     # =========================================================================
+
+     def _save_manifest(self, all_manifests: Dict[str, Dict], output_dir: str):
+         """Save injection manifest to JSON."""
+         all_errors = []
+         files_info = {}
+
+         for rel_path, info in all_manifests.items():
+             file_errors = info["manifest"].get("errors", [])
+             files_info[rel_path] = {
+                 "category": info["category"],
+                 "error_count": len(file_errors),
+                 "errors": file_errors,
+             }
+             all_errors.extend(file_errors)
+
+         manifest = {
+             "total_files": len(all_manifests),
+             "total_errors": len(all_errors),
+             "files": files_info,
+         }
+
+         manifest_path = os.path.join(output_dir, "BENCHMARK_MANIFEST.json")
+         with open(manifest_path, 'w') as f:
+             json.dump(manifest, f, indent=2)
+
+     def _save_level_results(self, result: StressTestResult, output_dir: str):
+         """Save results for a single stress level."""
+         results_path = os.path.join(output_dir, "BENCHMARK_RESULTS.json")
+         with open(results_path, 'w') as f:
+             json.dump({
+                 "error_count": result.error_count,
+                 "duration_seconds": result.duration_seconds,
+                 **result.benchmark_result.to_dict()
+             }, f, indent=2)
+
+     def _save_stress_test_results(
+         self,
+         results: Dict[int, StressTestResult],
+         output_dir: str
+     ):
+         """Save aggregate stress test results as JSON and CSV."""
+         # JSON format
+         stress_results = []
+         for level, result in sorted(results.items()):
+             stress_results.append({
+                 "error_count": level,
+                 "duration_seconds": result.duration_seconds,
+                 **result.benchmark_result.to_dict()
+             })
+
+         json_path = os.path.join(output_dir, "STRESS_TEST_RESULTS.json")
+         with open(json_path, 'w') as f:
+             json.dump({"stress_results": stress_results}, f, indent=2)
+
+         # CSV format
+         csv_path = os.path.join(output_dir, "STRESS_TEST_TABLE.csv")
+         with open(csv_path, 'w', newline='') as f:
+             writer = csv.writer(f)
+             writer.writerow([
+                 "error_count", "true_positives", "false_negatives", "false_positives",
+                 "precision", "recall", "f1_score", "fix_rate", "duration_seconds"
+             ])
+             for level, result in sorted(results.items()):
+                 br = result.benchmark_result
+                 writer.writerow([
+                     level, br.true_positives, br.false_negatives, br.false_positives,
+                     round(br.precision, 4), round(br.recall, 4), round(br.f1_score, 4),
+                     round(br.fix_rate, 4), round(result.duration_seconds, 2)
+                 ])
+
+         # Markdown report
+         self._generate_stress_test_report(results, output_dir)
+
+     def _generate_stress_test_report(
+         self,
+         results: Dict[int, StressTestResult],
+         output_dir: str
+     ):
+         """Generate markdown report for stress test."""
+         lines = [
+             "# Stress Test Results\n",
+             f"\n**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
+             "\n---\n",
+             "\n## Summary Table\n",
+             "\n| Errors | TP | FN | FP | Precision | Recall | F1 | Fix Rate |\n",
+             "|--------|----|----|-----|-----------|--------|-----|----------|\n",
+         ]
+
+         for level, result in sorted(results.items()):
+             br = result.benchmark_result
+             lines.append(
+                 f"| {level} | {br.true_positives} | {br.false_negatives} | "
+                 f"{br.false_positives} | {br.precision:.3f} | {br.recall:.3f} | "
+                 f"{br.f1_score:.3f} | {br.fix_rate:.3f} |\n"
+             )
+
+         lines.append("\n---\n")
+         lines.append("\n## Key Findings\n")
+
+         # Find performance drop-off point
+         prev_f1 = 1.0
+         drop_point = None
+         for level, result in sorted(results.items()):
+             if result.benchmark_result.f1_score < prev_f1 * 0.8:  # 20% drop
+                 drop_point = level
+                 break
+             prev_f1 = result.benchmark_result.f1_score
+
+         if drop_point:
+             lines.append(f"\n- **Performance drop-off**: Significant decline observed at {drop_point} errors\n")
+         else:
+             lines.append("\n- **Performance**: Stable across all tested error levels\n")
+
+         # Best/worst performance
+         best_level = max(results.keys(), key=lambda k: results[k].benchmark_result.f1_score)
+         worst_level = min(results.keys(), key=lambda k: results[k].benchmark_result.f1_score)
+
+         lines.append(f"- **Best F1 Score**: {results[best_level].benchmark_result.f1_score:.3f} at {best_level} errors\n")
+         lines.append(f"- **Worst F1 Score**: {results[worst_level].benchmark_result.f1_score:.3f} at {worst_level} errors\n")
+
+         report_path = os.path.join(output_dir, "STRESS_TEST_REPORT.md")
+         with open(report_path, 'w') as f:
+             f.writelines(lines)
+
+     def _save_comparison_results(
+         self,
+         result: ModelComparisonResult,
+         output_dir: str
+     ):
+         """Save model comparison results as JSON and CSV."""
+         # JSON format
+         comparison_data = {
+             "models": result.models,
+             "error_count": result.error_count,
+             "results": {
+                 model: br.to_dict()
+                 for model, br in result.results.items()
+             }
+         }
+
+         json_path = os.path.join(output_dir, "MODEL_COMPARISON_RESULTS.json")
+         with open(json_path, 'w') as f:
+             json.dump(comparison_data, f, indent=2)
+
+         # CSV format
+         csv_path = os.path.join(output_dir, "MODEL_COMPARISON_TABLE.csv")
+         with open(csv_path, 'w', newline='') as f:
+             writer = csv.writer(f)
+             writer.writerow([
+                 "model", "true_positives", "false_negatives", "false_positives",
+                 "precision", "recall", "f1_score", "fix_rate"
+             ])
+             for model, br in result.results.items():
+                 writer.writerow([
+                     model, br.true_positives, br.false_negatives, br.false_positives,
+                     round(br.precision, 4), round(br.recall, 4),
+                     round(br.f1_score, 4), round(br.fix_rate, 4)
+                 ])
+
+         # Markdown report
+         self._generate_comparison_report(result, output_dir)
+
+     def _generate_comparison_report(
+         self,
+         result: ModelComparisonResult,
+         output_dir: str
+     ):
+         """Generate markdown report for model comparison."""
+         lines = [
+             "# Model Comparison Results\n",
+             f"\n**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
+             f"**Error Count**: {result.error_count}\n",
+             "\n---\n",
+             "\n## Comparison Table\n",
+             "\n| Model | TP | FN | FP | Precision | Recall | F1 | Fix Rate |\n",
+             "|-------|----|----|-----|-----------|--------|-----|----------|\n",
+         ]
+
+         for model, br in result.results.items():
+             lines.append(
+                 f"| {model} | {br.true_positives} | {br.false_negatives} | "
+                 f"{br.false_positives} | {br.precision:.3f} | {br.recall:.3f} | "
+                 f"{br.f1_score:.3f} | {br.fix_rate:.3f} |\n"
+             )
+
+         lines.append("\n---\n")
+         lines.append("\n## Rankings\n")
+
+         # Rank by F1 score
+         ranked = sorted(result.results.items(), key=lambda x: x[1].f1_score, reverse=True)
+         lines.append("\n### By F1 Score\n")
+         for i, (model, br) in enumerate(ranked, 1):
+             lines.append(f"{i}. **{model}**: {br.f1_score:.3f}\n")
+
+         report_path = os.path.join(output_dir, "MODEL_COMPARISON_REPORT.md")
+         with open(report_path, 'w') as f:
+             f.writelines(lines)
+
+     def _generate_comparison_instructions(
+         self,
+         output_dir: str,
+         all_manifests: Dict[str, Dict]
+     ):
+         """Generate instructions for running model comparison."""
+         files_list = list(all_manifests.keys())
+
+         lines = [
+             "# Model Comparison Instructions\n",
+             f"\n**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
+             "\n---\n",
+             "\n## Overview\n",
+             f"\nThis benchmark contains {len(files_list)} corrupted files for testing.\n",
+             "\n## Files to Process\n",
+         ]
+
+         for rel_path in files_list:
+             lines.append(f"- `corrupted/{rel_path}`\n")
+
+         lines.append("\n---\n")
+         lines.append("\n## Instructions for Each Model\n")
+
+         for model in SUPPORTED_MODELS:
+             if model == "bioguider":
+                 lines.append(f"\n### {model}\n")
+                 lines.append("Run automatically via the benchmark evaluation.\n")
+             else:
+                 lines.append(f"\n### {model}\n")
+                 lines.append("1. Open each file in `corrupted/` with Cursor\n")
+                 lines.append(f"2. Use {model} as the AI model\n")
+                 lines.append("3. Prompt: 'Fix all errors, typos, broken links, and formatting issues in this file'\n")
+                 lines.append(f"4. Save fixed files to `fixed_{model}/` maintaining directory structure\n")
+
+         lines.append("\n---\n")
+         lines.append("\n## After Fixing\n")
+         lines.append("\nRun evaluation:\n")
+         lines.append("```python\n")
+         lines.append("from bioguider.managers.benchmark_manager import BenchmarkManager\n")
+         lines.append("mgr = BenchmarkManager(llm, callback)\n")
+         lines.append(f'result = mgr.evaluate_model_comparison("{output_dir}")\n')
+         lines.append("```\n")
+
+         instructions_path = os.path.join(output_dir, "INSTRUCTIONS.md")
+         with open(instructions_path, 'w') as f:
+             f.writelines(lines)
+
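As a closing note, the aggregate exports written above can be consumed directly; a minimal sketch of reading the stress-test CSV, assuming a benchmark directory produced by run_stress_test (the timestamped directory name is an assumption).

```python
# Load STRESS_TEST_TABLE.csv and print headline metrics per error level.
import csv

table = "outputs/benchmarks/benchmark_20250101_000000/STRESS_TEST_TABLE.csv"
with open(table, newline="") as f:
    for row in csv.DictReader(f):
        print(row["error_count"], row["f1_score"], row["fix_rate"])
```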