bioguider-0.2.52-py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (84)
  1. bioguider/__init__.py +0 -0
  2. bioguider/agents/__init__.py +0 -0
  3. bioguider/agents/agent_task.py +92 -0
  4. bioguider/agents/agent_tools.py +176 -0
  5. bioguider/agents/agent_utils.py +504 -0
  6. bioguider/agents/collection_execute_step.py +182 -0
  7. bioguider/agents/collection_observe_step.py +125 -0
  8. bioguider/agents/collection_plan_step.py +156 -0
  9. bioguider/agents/collection_task.py +184 -0
  10. bioguider/agents/collection_task_utils.py +142 -0
  11. bioguider/agents/common_agent.py +137 -0
  12. bioguider/agents/common_agent_2step.py +215 -0
  13. bioguider/agents/common_conversation.py +61 -0
  14. bioguider/agents/common_step.py +85 -0
  15. bioguider/agents/consistency_collection_step.py +102 -0
  16. bioguider/agents/consistency_evaluation_task.py +57 -0
  17. bioguider/agents/consistency_evaluation_task_utils.py +14 -0
  18. bioguider/agents/consistency_observe_step.py +110 -0
  19. bioguider/agents/consistency_query_step.py +77 -0
  20. bioguider/agents/dockergeneration_execute_step.py +186 -0
  21. bioguider/agents/dockergeneration_observe_step.py +154 -0
  22. bioguider/agents/dockergeneration_plan_step.py +158 -0
  23. bioguider/agents/dockergeneration_task.py +158 -0
  24. bioguider/agents/dockergeneration_task_utils.py +220 -0
  25. bioguider/agents/evaluation_installation_task.py +270 -0
  26. bioguider/agents/evaluation_readme_task.py +767 -0
  27. bioguider/agents/evaluation_submission_requirements_task.py +172 -0
  28. bioguider/agents/evaluation_task.py +206 -0
  29. bioguider/agents/evaluation_tutorial_task.py +169 -0
  30. bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
  31. bioguider/agents/evaluation_userguide_prompts.py +179 -0
  32. bioguider/agents/evaluation_userguide_task.py +154 -0
  33. bioguider/agents/evaluation_utils.py +127 -0
  34. bioguider/agents/identification_execute_step.py +181 -0
  35. bioguider/agents/identification_observe_step.py +104 -0
  36. bioguider/agents/identification_plan_step.py +140 -0
  37. bioguider/agents/identification_task.py +270 -0
  38. bioguider/agents/identification_task_utils.py +22 -0
  39. bioguider/agents/peo_common_step.py +64 -0
  40. bioguider/agents/prompt_utils.py +253 -0
  41. bioguider/agents/python_ast_repl_tool.py +69 -0
  42. bioguider/agents/rag_collection_task.py +130 -0
  43. bioguider/conversation.py +67 -0
  44. bioguider/database/code_structure_db.py +500 -0
  45. bioguider/database/summarized_file_db.py +146 -0
  46. bioguider/generation/__init__.py +39 -0
  47. bioguider/generation/benchmark_metrics.py +610 -0
  48. bioguider/generation/change_planner.py +189 -0
  49. bioguider/generation/document_renderer.py +157 -0
  50. bioguider/generation/llm_cleaner.py +67 -0
  51. bioguider/generation/llm_content_generator.py +1128 -0
  52. bioguider/generation/llm_injector.py +809 -0
  53. bioguider/generation/models.py +85 -0
  54. bioguider/generation/output_manager.py +74 -0
  55. bioguider/generation/repo_reader.py +37 -0
  56. bioguider/generation/report_loader.py +166 -0
  57. bioguider/generation/style_analyzer.py +36 -0
  58. bioguider/generation/suggestion_extractor.py +436 -0
  59. bioguider/generation/test_metrics.py +189 -0
  60. bioguider/managers/benchmark_manager.py +785 -0
  61. bioguider/managers/evaluation_manager.py +215 -0
  62. bioguider/managers/generation_manager.py +686 -0
  63. bioguider/managers/generation_test_manager.py +107 -0
  64. bioguider/managers/generation_test_manager_v2.py +525 -0
  65. bioguider/rag/__init__.py +0 -0
  66. bioguider/rag/config.py +117 -0
  67. bioguider/rag/data_pipeline.py +651 -0
  68. bioguider/rag/embedder.py +24 -0
  69. bioguider/rag/rag.py +138 -0
  70. bioguider/settings.py +103 -0
  71. bioguider/utils/code_structure_builder.py +59 -0
  72. bioguider/utils/constants.py +135 -0
  73. bioguider/utils/default.gitignore +140 -0
  74. bioguider/utils/file_utils.py +215 -0
  75. bioguider/utils/gitignore_checker.py +175 -0
  76. bioguider/utils/notebook_utils.py +117 -0
  77. bioguider/utils/pyphen_utils.py +73 -0
  78. bioguider/utils/python_file_handler.py +65 -0
  79. bioguider/utils/r_file_handler.py +551 -0
  80. bioguider/utils/utils.py +163 -0
  81. bioguider-0.2.52.dist-info/LICENSE +21 -0
  82. bioguider-0.2.52.dist-info/METADATA +51 -0
  83. bioguider-0.2.52.dist-info/RECORD +84 -0
  84. bioguider-0.2.52.dist-info/WHEEL +4 -0
bioguider/generation/benchmark_metrics.py
@@ -0,0 +1,610 @@
+ """
+ Benchmark metrics for comprehensive error injection evaluation.
+
+ Provides F-score calculation with semantic False Positive detection via LLM.
+ """
+ from __future__ import annotations
+
+ import json
+ import re
+ from dataclasses import dataclass, field
+ from difflib import SequenceMatcher, unified_diff
+ from typing import Dict, Any, List, Tuple, Optional
+
+ from langchain_openai.chat_models.base import BaseChatOpenAI
+ from bioguider.agents.common_conversation import CommonConversation
+
+
+ @dataclass
+ class ErrorMetrics:
+     """Metrics for a single error evaluation."""
+     error_id: str
+     category: str
+     file_path: str
+     is_fixed: bool  # TP if True, FN if False
+     original_snippet: str
+     mutated_snippet: str
+     status: str  # "fixed_to_baseline", "fixed_to_valid", "unchanged"
+
+
+ @dataclass
+ class FalsePositive:
+     """Represents a detected false positive (harmful unintended change)."""
+     file_path: str
+     change_description: str
+     severity: str  # "harmful", "neutral", "beneficial"
+     original_text: str
+     changed_text: str
+
+
+ @dataclass
+ class BenchmarkResult:
+     """Complete benchmark result for a single run."""
+     error_count: int
+     file_count: int
+
+     # Core metrics
+     true_positives: int = 0   # Errors correctly fixed
+     false_negatives: int = 0  # Errors NOT fixed
+     false_positives: int = 0  # Harmful unintended changes
+     true_negatives: int = 0   # Non-errors correctly unchanged
+
+     # Derived metrics (computed)
+     precision: float = 0.0
+     recall: float = 0.0
+     f1_score: float = 0.0
+     fix_rate: float = 0.0
+
+     # Detailed breakdowns
+     per_category: Dict[str, Dict[str, int]] = field(default_factory=dict)
+     per_file: Dict[str, Dict[str, int]] = field(default_factory=dict)
+     error_details: List[ErrorMetrics] = field(default_factory=list)
+     fp_details: List[FalsePositive] = field(default_factory=list)
+
+     def compute_derived_metrics(self):
+         """Compute precision, recall, F1 from TP/FP/FN."""
+         # Precision = TP / (TP + FP)
+         if self.true_positives + self.false_positives > 0:
+             self.precision = self.true_positives / (self.true_positives + self.false_positives)
+         else:
+             self.precision = 0.0
+
+         # Recall = TP / (TP + FN)
+         if self.true_positives + self.false_negatives > 0:
+             self.recall = self.true_positives / (self.true_positives + self.false_negatives)
+         else:
+             self.recall = 0.0
+
+         # F1 = 2 * (precision * recall) / (precision + recall)
+         if self.precision + self.recall > 0:
+             self.f1_score = 2 * (self.precision * self.recall) / (self.precision + self.recall)
+         else:
+             self.f1_score = 0.0
+
+         # Fix rate = TP / (TP + FN)
+         total_errors = self.true_positives + self.false_negatives
+         if total_errors > 0:
+             self.fix_rate = self.true_positives / total_errors
+         else:
+             self.fix_rate = 0.0
+
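+     # Worked example (hypothetical counts, not taken from a real run):
+     # with TP=8, FP=2, FN=2 the formulas above give
+     #     precision = 8 / (8 + 2) = 0.8
+     #     recall    = 8 / (8 + 2) = 0.8
+     #     f1_score  = 2 * 0.8 * 0.8 / (0.8 + 0.8) = 0.8
+     # and fix_rate equals recall, since both are TP / (TP + FN).
+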
+     def to_dict(self) -> Dict[str, Any]:
+         """Convert to dictionary for JSON serialization."""
+         return {
+             "error_count": self.error_count,
+             "file_count": self.file_count,
+             "true_positives": self.true_positives,
+             "false_negatives": self.false_negatives,
+             "false_positives": self.false_positives,
+             "true_negatives": self.true_negatives,
+             "precision": round(self.precision, 4),
+             "recall": round(self.recall, 4),
+             "f1_score": round(self.f1_score, 4),
+             "fix_rate": round(self.fix_rate, 4),
+             "per_category": self.per_category,
+             "per_file": self.per_file,
+             "error_details": [
+                 {
+                     "error_id": e.error_id,
+                     "category": e.category,
+                     "file_path": e.file_path,
+                     "is_fixed": e.is_fixed,
+                     "status": e.status,
+                 }
+                 for e in self.error_details
+             ],
+             "fp_details": [
+                 {
+                     "file_path": fp.file_path,
+                     "change_description": fp.change_description,
+                     "severity": fp.severity,
+                 }
+                 for fp in self.fp_details
+             ],
+         }
+
+
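+ # Serialization sketch (assumes a populated BenchmarkResult named `result`):
+ #     import json
+ #     result.compute_derived_metrics()
+ #     print(json.dumps(result.to_dict(), indent=2))
+ # Note that to_dict() keeps only identifying fields per error; the original and
+ # mutated snippets are dropped from the serialized form.
+
+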
+ SEMANTIC_FP_PROMPT = """
+ You are analyzing changes made to a documentation file to detect potentially harmful modifications.
+
+ CONTEXT:
+ - A document was intentionally corrupted with specific errors (listed below)
+ - An AI system attempted to fix these errors
+ - We need to check if the AI made any UNINTENDED harmful changes beyond fixing the known errors
+
+ INJECTED ERRORS (these changes ARE expected and should be fixed):
+ {injected_errors}
+
+ DIFF OF CHANGES (unified diff format):
+ ```
+ {diff}
+ ```
+
+ TASK:
+ Analyze the diff and identify any changes that are NOT related to fixing the injected errors.
+ For each unrelated change, classify it as:
+ 1. "harmful" - Incorrect changes that introduce new errors or break functionality
+ 2. "neutral" - Style/formatting changes that don't affect correctness
+ 3. "beneficial" - Improvements beyond the required fixes (still acceptable)
+
+ OUTPUT (JSON only):
+ {{
+   "unintended_changes": [
+     {{
+       "description": "brief description of the change",
+       "severity": "harmful|neutral|beneficial",
+       "original_text": "what was there before",
+       "changed_text": "what it was changed to",
+       "reasoning": "why this classification"
+     }}
+   ],
+   "summary": {{
+     "harmful_count": <int>,
+     "neutral_count": <int>,
+     "beneficial_count": <int>
+   }}
+ }}
+
+ If no unintended changes are found, return:
+ {{
+   "unintended_changes": [],
+   "summary": {{"harmful_count": 0, "neutral_count": 0, "beneficial_count": 0}}
+ }}
+ """
+
+
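+ # The doubled braces ({{ and }}) above survive str.format() as literal braces,
+ # so only {injected_errors} and {diff} are substituted. A minimal check
+ # (hypothetical): SEMANTIC_FP_PROMPT.format(injected_errors="None", diff="")
+ # returns a prompt whose JSON skeleton keeps single braces.
+
+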
+ class SemanticFPDetector:
+     """Detects false positives using LLM semantic analysis."""
+
+     def __init__(self, llm: BaseChatOpenAI):
+         self.llm = llm
+
+     def detect_false_positives(
+         self,
+         baseline: str,
+         revised: str,
+         injected_errors: List[Dict[str, Any]],
+         file_path: str
+     ) -> List[FalsePositive]:
+         """
+         Detect harmful unintended changes (false positives) in the revised content.
+
+         Args:
+             baseline: Original correct content
+             revised: Content after AI fixes
+             injected_errors: List of errors that were intentionally injected
+             file_path: Path to the file being analyzed
+
+         Returns:
+             List of detected false positives (harmful changes)
+         """
+         # Generate unified diff
+         baseline_lines = baseline.splitlines(keepends=True)
+         revised_lines = revised.splitlines(keepends=True)
+         diff_lines = list(unified_diff(
+             baseline_lines,
+             revised_lines,
+             fromfile="baseline",
+             tofile="revised",
+             lineterm=""
+         ))
+         diff_text = "".join(diff_lines)
+
+         if not diff_text.strip():
+             # No changes at all
+             return []
+
+         # Format injected errors for the prompt
+         error_descriptions = []
+         for err in injected_errors:
+             error_descriptions.append(
+                 f"- Category: {err.get('category', 'unknown')}\n"
+                 f"  Original: {err.get('original_snippet', 'N/A')[:100]}\n"
+                 f"  Mutated: {err.get('mutated_snippet', 'N/A')[:100]}"
+             )
+         errors_text = "\n".join(error_descriptions) if error_descriptions else "None"
+
+         # Build prompt
+         prompt = SEMANTIC_FP_PROMPT.format(
+             injected_errors=errors_text,
+             diff=diff_text[:8000]  # Limit diff size
+         )
+
+         try:
+             conv = CommonConversation(self.llm)
+             output, _ = conv.generate(
+                 system_prompt=prompt,
+                 instruction_prompt="Analyze the changes and return the JSON."
+             )
+
+             # Parse response
+             result = self._parse_json_output(output)
+
+             # Extract harmful changes as false positives
+             false_positives = []
+             for change in result.get("unintended_changes", []):
+                 if change.get("severity") == "harmful":
+                     false_positives.append(FalsePositive(
+                         file_path=file_path,
+                         change_description=change.get("description", "Unknown change"),
+                         severity="harmful",
+                         original_text=change.get("original_text", ""),
+                         changed_text=change.get("changed_text", ""),
+                     ))
+
+             return false_positives
+
+         except Exception as e:
+             print(f"Warning: Semantic FP detection failed for {file_path}: {e}")
+             return []
+
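+     # Hypothetical usage sketch (assumes an initialized BaseChatOpenAI `llm`):
+     #     detector = SemanticFPDetector(llm)
+     #     fps = detector.detect_false_positives(
+     #         baseline="Install with `pip install foo`.",
+     #         revised="Install with `pip install bar`.",
+     #         injected_errors=[],
+     #         file_path="README.md",
+     #     )
+     # Only "harmful" changes are returned; "neutral" and "beneficial" ones are
+     # dropped.
+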
+     def _parse_json_output(self, output: str) -> Dict[str, Any]:
+         """Parse JSON from LLM output with fallback strategies."""
+         # Strategy 1: Direct parse
+         try:
+             return json.loads(output)
+         except json.JSONDecodeError:
+             pass
+
+         # Strategy 2: Extract JSON block
+         json_pattern = r'```(?:json)?\s*(\{.*?\})\s*```'
+         match = re.search(json_pattern, output, re.DOTALL)
+         if match:
+             try:
+                 return json.loads(match.group(1))
+             except json.JSONDecodeError:
+                 pass
+
+         # Strategy 3: Find first complete JSON object
+         start = output.find("{")
+         if start != -1:
+             brace_count = 0
+             end = start
+             for i, char in enumerate(output[start:], start):
+                 if char == "{":
+                     brace_count += 1
+                 elif char == "}":
+                     brace_count -= 1
+                     if brace_count == 0:
+                         end = i
+                         break
+
+             if brace_count == 0:
+                 try:
+                     return json.loads(output[start:end+1])
+                 except json.JSONDecodeError:
+                     pass
+
+         # Fallback
+         return {"unintended_changes": [], "summary": {"harmful_count": 0, "neutral_count": 0, "beneficial_count": 0}}
+
+
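+ # Parsing sketch (hypothetical LLM output): given
+ #     'Here is my analysis:\n```json\n{"unintended_changes": []}\n```'
+ # Strategy 1 fails on the leading prose, Strategy 2 extracts the fenced object,
+ # and Strategy 3 would find the same object even without the fences. If all
+ # three fail, the empty fallback keeps the evaluator from crashing.
+
+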
+ class BenchmarkEvaluator:
+     """Evaluates benchmark results with F-score metrics."""
+
+     def __init__(self, llm: Optional[BaseChatOpenAI] = None):
+         self.llm = llm
+         self.fp_detector = SemanticFPDetector(llm) if llm else None
+
+     def evaluate_single_file(
+         self,
+         baseline: str,
+         corrupted: str,
+         revised: str,
+         injection_manifest: Dict[str, Any],
+         file_path: str,
+         file_category: str,
+         detect_semantic_fp: bool = True
+     ) -> Tuple[List[ErrorMetrics], List[FalsePositive]]:
+         """
+         Evaluate fixes for a single file.
+
+         Returns:
+             Tuple of (error_metrics, false_positives)
+         """
+         error_metrics = []
+
+         for err in injection_manifest.get("errors", []):
+             error_id = err.get("id", "unknown")
+             category = err.get("category", "unknown")
+             orig = err.get("original_snippet", "")
+             mut = err.get("mutated_snippet", "")
+
+             # Determine if error was fixed
+             is_fixed, status = self._check_error_fixed(
+                 category, orig, mut, baseline, corrupted, revised
+             )
+
+             error_metrics.append(ErrorMetrics(
+                 error_id=error_id,
+                 category=category,
+                 file_path=file_path,
+                 is_fixed=is_fixed,
+                 original_snippet=orig,
+                 mutated_snippet=mut,
+                 status=status,
+             ))
+
+         # Detect false positives if LLM available and enabled
+         false_positives = []
+         if detect_semantic_fp and self.fp_detector:
+             false_positives = self.fp_detector.detect_false_positives(
+                 baseline, revised, injection_manifest.get("errors", []), file_path
+             )
+
+         return error_metrics, false_positives
+
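+     # Manifest shape assumed above (a hypothetical minimal example):
+     #     {"errors": [{"id": "e1", "category": "typo",
+     #                  "original_snippet": "analysis", "mutated_snippet": "anaylsis"}]}
+     # Each entry becomes one ErrorMetrics record (TP if fixed, FN if not).
+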
+     def _check_error_fixed(
+         self,
+         category: str,
+         orig: str,
+         mut: str,
+         baseline: str,
+         corrupted: str,
+         revised: str
+     ) -> Tuple[bool, str]:
+         """
+         Check if a specific error was fixed.
+
+         Returns:
+             Tuple of (is_fixed, status)
+         """
+         # Logic adapted from test_metrics.py
+         if category == "typo":
+             if orig and orig in revised:
+                 return True, "fixed_to_baseline"
+             elif mut and mut in revised:
+                 return False, "unchanged"
+             else:
+                 return True, "fixed_to_valid"
+
+         elif category == "link":
+             wellformed = re.search(r"\[[^\]]+\]\([^\s)]+\)", revised) is not None
+             return wellformed, "fixed_to_valid" if wellformed else "unchanged"
+
+         elif category == "duplicate":
+             dup_before = corrupted.count(mut) if mut else 0
+             dup_after = revised.count(mut) if mut else 0
+             is_fixed = dup_after < dup_before
+             return is_fixed, "fixed_to_valid" if is_fixed else "unchanged"
+
+         elif category == "markdown_structure":
+             issues_before = self._count_markdown_issues(corrupted)
+             issues_after = self._count_markdown_issues(revised)
+             is_fixed = issues_after < issues_before
+             return is_fixed, "fixed_to_valid" if is_fixed else "unchanged"
+
+         elif category in ("bio_term", "function"):
+             if orig and orig in revised:
+                 return True, "fixed_to_baseline"
+             elif mut and mut in revised:
+                 return False, "unchanged"
+             else:
+                 return True, "fixed_to_valid"
+
+         elif category == "list_structure":
+             mal_before = len(re.findall(r"^[-*]\S", corrupted, flags=re.M))
+             mal_after = len(re.findall(r"^[-*]\S", revised, flags=re.M))
+             is_fixed = mal_after < mal_before
+             return is_fixed, "fixed_to_valid" if is_fixed else "unchanged"
+
+         elif category == "image_syntax":
+             bad_before = len(re.findall(r"!\[[^\]]*\]\s+\(", corrupted))
+             bad_after = len(re.findall(r"!\[[^\]]*\]\s+\(", revised))
+             is_fixed = bad_after < bad_before
+             return is_fixed, "fixed_to_valid" if is_fixed else "unchanged"
+
+         elif category == "section_title":
+             canonical_titles = {
+                 "## What is it?", "## What can it do?", "## Requirements",
+                 "## Install", "## Quick example", "## Learn more", "## License & Contact",
+             }
+             if mut and mut not in revised and any(t in revised for t in canonical_titles):
+                 return True, "fixed_to_valid"
+             return False, "unchanged"
+
+         elif category == "inline_code":
+             raw = mut.strip('`') if mut else ""
+             rewrapped = f"`{raw}`" if raw else ""
+             if raw and rewrapped and rewrapped in revised and mut not in revised:
+                 return True, "fixed_to_valid"
+             return False, "unchanged"
+
+         elif category in ("emphasis", "code_lang_tag"):
+             is_fixed = bool(mut) and mut not in revised
+             return is_fixed, "fixed_to_valid" if is_fixed else "unchanged"
+
+         elif category in ("number", "boolean", "param_name", "comment_typo", "species_name", "gene_case"):
+             # For these categories: fixed if original restored OR mutated removed
+             if orig and orig in revised:
+                 return True, "fixed_to_baseline"
+             elif mut and mut in revised:
+                 return False, "unchanged"
+             else:
+                 # Neither found = content rewritten = consider fixed
+                 return True, "fixed_to_valid"
+
+         elif category == "table_alignment":
+             var_before = self._table_variance(corrupted)
+             var_after = self._table_variance(revised)
+             is_fixed = var_after < var_before
+             return is_fixed, "fixed_to_valid" if is_fixed else "unchanged"
+
+         # Biology-specific and CLI/CONFIG categories
+         # (note: "param_name" is unreachable here; it is already handled above)
+         elif category in {
+             "gene_symbol_case", "species_swap", "ref_genome_mismatch", "modality_confusion",
+             "normalization_error", "umi_vs_read", "batch_effect", "qc_threshold", "file_format",
+             "strandedness", "coordinates", "units_scale", "sample_type", "contamination",
+             "param_name", "default_value", "path_hint"
+         }:
+             is_fixed = bool(mut) and mut not in revised
+             return is_fixed, "fixed_to_valid" if is_fixed else "unchanged"
+
+         # Default: unknown category counts as not fixed
+         return False, "unchanged"
+
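+     # Snippet-check sketch (hypothetical "typo" error, orig="alignment",
+     # mut="alignmnet"):
+     #     revised contains "alignment"  -> (True, "fixed_to_baseline")
+     #     revised contains "alignmnet"  -> (False, "unchanged")
+     #     revised contains neither      -> (True, "fixed_to_valid")
+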
+     def _count_markdown_issues(self, text: str) -> int:
+         """Count markdown structural issues."""
+         issues = 0
+         issues += text.count("[![") - text.count("](")
+         issues += text.count("[ ")
+         issues += len(re.findall(r"^#[^#\s]", text, flags=re.M))
+         return max(0, issues)
+
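+     # Heuristic sketch: for the hypothetical text "#Intro\n[ broken](x)" the
+     # three terms contribute (0 - 1) + 1 + 1 = 1 issue; max(0, ...) keeps the
+     # total from going negative when "](" outnumbers "[![".
+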
+     def _table_variance(self, text: str) -> int:
+         """Calculate table alignment variance."""
+         # Group consecutive lines containing '|' into tables; iterating over all
+         # lines (not a pre-filtered list) lets blank lines terminate a group
+         groups: List[List[str]] = []
+         cur: List[str] = []
+         for ln in text.splitlines():
+             if '|' in ln:
+                 cur.append(ln)
+             else:
+                 if len(cur) >= 2:
+                     groups.append(cur)
+                 cur = []
+         if len(cur) >= 2:
+             groups.append(cur)
+         vari = 0
+         for g in groups:
+             counts = [ln.count('|') for ln in g]
+             vari += (max(counts) - min(counts))
+         return vari
+
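+     # Worked example (hypothetical two-row table): the lines
+     #     "| a | b |"   -> 3 pipes
+     #     "| a |"       -> 2 pipes
+     # form one group with variance 3 - 2 = 1; a consistently aligned table
+     # contributes 0.
+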
+     def aggregate_results(
+         self,
+         all_error_metrics: List[ErrorMetrics],
+         all_false_positives: List[FalsePositive],
+         error_count: int,
+         file_count: int
+     ) -> BenchmarkResult:
+         """
+         Aggregate metrics from all files into a single BenchmarkResult.
+         """
+         result = BenchmarkResult(
+             error_count=error_count,
+             file_count=file_count,
+         )
+
+         # Count TP/FN from error metrics
+         for em in all_error_metrics:
+             if em.is_fixed:
+                 result.true_positives += 1
+             else:
+                 result.false_negatives += 1
+
+             # Per-category breakdown
+             cat = em.category
+             if cat not in result.per_category:
+                 result.per_category[cat] = {"tp": 0, "fn": 0}
+             if em.is_fixed:
+                 result.per_category[cat]["tp"] += 1
+             else:
+                 result.per_category[cat]["fn"] += 1
+
+             # Per-file breakdown
+             path = em.file_path
+             if path not in result.per_file:
+                 result.per_file[path] = {"tp": 0, "fn": 0, "fp": 0}
+             if em.is_fixed:
+                 result.per_file[path]["tp"] += 1
+             else:
+                 result.per_file[path]["fn"] += 1
+
+             result.error_details.append(em)
+
+         # Count FP from semantic detection
+         result.false_positives = len(all_false_positives)
+         result.fp_details = all_false_positives
+
+         for fp in all_false_positives:
+             if fp.file_path not in result.per_file:
+                 result.per_file[fp.file_path] = {"tp": 0, "fn": 0, "fp": 0}
+             result.per_file[fp.file_path]["fp"] += 1
+
+         # Compute derived metrics
+         result.compute_derived_metrics()
+
+         return result
+
+
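+ # Aggregation sketch (hypothetical single-file run): two ErrorMetrics with
+ # is_fixed True/False plus one harmful FalsePositive yield TP=1, FN=1, FP=1,
+ # so compute_derived_metrics() gives precision = 1/2, recall = 1/2, and
+ # f1_score = 0.5.
+
+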
+ def evaluate_benchmark(
+     manifests: Dict[str, Dict[str, Any]],
+     output_dir: str,
+     llm: Optional[BaseChatOpenAI] = None,
+     detect_semantic_fp: bool = True
+ ) -> BenchmarkResult:
+     """
+     Evaluate a complete benchmark run.
+
+     Args:
+         manifests: Dict mapping file paths to their injection info
+         output_dir: Directory containing the fixed files
+         llm: LLM for semantic FP detection (optional)
+         detect_semantic_fp: Whether to run semantic FP detection
+
+     Returns:
+         BenchmarkResult with all metrics
+     """
+     import os
+     from bioguider.agents.agent_utils import read_file
+
+     evaluator = BenchmarkEvaluator(llm)
+
+     all_error_metrics: List[ErrorMetrics] = []
+     all_false_positives: List[FalsePositive] = []
+     total_errors = 0
+
+     for rel_path, info in manifests.items():
+         # Read fixed version
+         fixed_path = os.path.join(output_dir, rel_path)
+         if not os.path.exists(fixed_path):
+             fixed_content = info["baseline_content"]
+         else:
+             fixed_content = read_file(fixed_path) or info["baseline_content"]
+
+         # Evaluate this file
+         error_metrics, false_positives = evaluator.evaluate_single_file(
+             baseline=info["baseline_content"],
+             corrupted=info["corrupted_content"],
+             revised=fixed_content,
+             injection_manifest=info["manifest"],
+             file_path=rel_path,
+             file_category=info["category"],
+             detect_semantic_fp=detect_semantic_fp,
+         )
+
+         all_error_metrics.extend(error_metrics)
+         all_false_positives.extend(false_positives)
+         total_errors += len(info["manifest"].get("errors", []))
+
+     # Aggregate results
+     result = evaluator.aggregate_results(
+         all_error_metrics,
+         all_false_positives,
+         error_count=total_errors,
+         file_count=len(manifests),
+     )
+
+     return result
+
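+ # End-to-end sketch (hypothetical paths and manifest; pass llm=None to skip
+ # semantic FP detection):
+ #     manifests = {
+ #         "README.md": {
+ #             "baseline_content": "...original text...",
+ #             "corrupted_content": "...text with injected errors...",
+ #             "manifest": {"errors": [...]},
+ #             "category": "readme",
+ #         },
+ #     }
+ #     result = evaluate_benchmark(manifests, output_dir="out/fixed", llm=None)
+ #     print(result.to_dict()["f1_score"])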