bioguider 0.2.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bioguider/__init__.py +0 -0
- bioguider/agents/__init__.py +0 -0
- bioguider/agents/agent_task.py +92 -0
- bioguider/agents/agent_tools.py +176 -0
- bioguider/agents/agent_utils.py +504 -0
- bioguider/agents/collection_execute_step.py +182 -0
- bioguider/agents/collection_observe_step.py +125 -0
- bioguider/agents/collection_plan_step.py +156 -0
- bioguider/agents/collection_task.py +184 -0
- bioguider/agents/collection_task_utils.py +142 -0
- bioguider/agents/common_agent.py +137 -0
- bioguider/agents/common_agent_2step.py +215 -0
- bioguider/agents/common_conversation.py +61 -0
- bioguider/agents/common_step.py +85 -0
- bioguider/agents/consistency_collection_step.py +102 -0
- bioguider/agents/consistency_evaluation_task.py +57 -0
- bioguider/agents/consistency_evaluation_task_utils.py +14 -0
- bioguider/agents/consistency_observe_step.py +110 -0
- bioguider/agents/consistency_query_step.py +77 -0
- bioguider/agents/dockergeneration_execute_step.py +186 -0
- bioguider/agents/dockergeneration_observe_step.py +154 -0
- bioguider/agents/dockergeneration_plan_step.py +158 -0
- bioguider/agents/dockergeneration_task.py +158 -0
- bioguider/agents/dockergeneration_task_utils.py +220 -0
- bioguider/agents/evaluation_installation_task.py +270 -0
- bioguider/agents/evaluation_readme_task.py +767 -0
- bioguider/agents/evaluation_submission_requirements_task.py +172 -0
- bioguider/agents/evaluation_task.py +206 -0
- bioguider/agents/evaluation_tutorial_task.py +169 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
- bioguider/agents/evaluation_userguide_prompts.py +179 -0
- bioguider/agents/evaluation_userguide_task.py +154 -0
- bioguider/agents/evaluation_utils.py +127 -0
- bioguider/agents/identification_execute_step.py +181 -0
- bioguider/agents/identification_observe_step.py +104 -0
- bioguider/agents/identification_plan_step.py +140 -0
- bioguider/agents/identification_task.py +270 -0
- bioguider/agents/identification_task_utils.py +22 -0
- bioguider/agents/peo_common_step.py +64 -0
- bioguider/agents/prompt_utils.py +253 -0
- bioguider/agents/python_ast_repl_tool.py +69 -0
- bioguider/agents/rag_collection_task.py +130 -0
- bioguider/conversation.py +67 -0
- bioguider/database/code_structure_db.py +500 -0
- bioguider/database/summarized_file_db.py +146 -0
- bioguider/generation/__init__.py +39 -0
- bioguider/generation/benchmark_metrics.py +610 -0
- bioguider/generation/change_planner.py +189 -0
- bioguider/generation/document_renderer.py +157 -0
- bioguider/generation/llm_cleaner.py +67 -0
- bioguider/generation/llm_content_generator.py +1128 -0
- bioguider/generation/llm_injector.py +809 -0
- bioguider/generation/models.py +85 -0
- bioguider/generation/output_manager.py +74 -0
- bioguider/generation/repo_reader.py +37 -0
- bioguider/generation/report_loader.py +166 -0
- bioguider/generation/style_analyzer.py +36 -0
- bioguider/generation/suggestion_extractor.py +436 -0
- bioguider/generation/test_metrics.py +189 -0
- bioguider/managers/benchmark_manager.py +785 -0
- bioguider/managers/evaluation_manager.py +215 -0
- bioguider/managers/generation_manager.py +686 -0
- bioguider/managers/generation_test_manager.py +107 -0
- bioguider/managers/generation_test_manager_v2.py +525 -0
- bioguider/rag/__init__.py +0 -0
- bioguider/rag/config.py +117 -0
- bioguider/rag/data_pipeline.py +651 -0
- bioguider/rag/embedder.py +24 -0
- bioguider/rag/rag.py +138 -0
- bioguider/settings.py +103 -0
- bioguider/utils/code_structure_builder.py +59 -0
- bioguider/utils/constants.py +135 -0
- bioguider/utils/default.gitignore +140 -0
- bioguider/utils/file_utils.py +215 -0
- bioguider/utils/gitignore_checker.py +175 -0
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/pyphen_utils.py +73 -0
- bioguider/utils/python_file_handler.py +65 -0
- bioguider/utils/r_file_handler.py +551 -0
- bioguider/utils/utils.py +163 -0
- bioguider-0.2.52.dist-info/LICENSE +21 -0
- bioguider-0.2.52.dist-info/METADATA +51 -0
- bioguider-0.2.52.dist-info/RECORD +84 -0
- bioguider-0.2.52.dist-info/WHEEL +4 -0
bioguider/generation/benchmark_metrics.py
@@ -0,0 +1,610 @@
"""
Benchmark metrics for comprehensive error injection evaluation.

Provides F-score calculation with semantic False Positive detection via LLM.
"""
from __future__ import annotations

import json
import re
from dataclasses import dataclass, field
from difflib import SequenceMatcher, unified_diff
from typing import Dict, Any, List, Tuple, Optional

from langchain_openai.chat_models.base import BaseChatOpenAI
from bioguider.agents.common_conversation import CommonConversation


@dataclass
class ErrorMetrics:
    """Metrics for a single error evaluation."""
    error_id: str
    category: str
    file_path: str
    is_fixed: bool  # TP if True, FN if False
    original_snippet: str
    mutated_snippet: str
    status: str  # "fixed_to_baseline", "fixed_to_valid", "unchanged"


@dataclass
class FalsePositive:
    """Represents a detected false positive (harmful unintended change)."""
    file_path: str
    change_description: str
    severity: str  # "harmful", "neutral", "beneficial"
    original_text: str
    changed_text: str


@dataclass
class BenchmarkResult:
    """Complete benchmark result for a single run."""
    error_count: int
    file_count: int

    # Core metrics
    true_positives: int = 0   # Errors correctly fixed
    false_negatives: int = 0  # Errors NOT fixed
    false_positives: int = 0  # Harmful unintended changes
    true_negatives: int = 0   # Non-errors correctly unchanged

    # Derived metrics (computed)
    precision: float = 0.0
    recall: float = 0.0
    f1_score: float = 0.0
    fix_rate: float = 0.0

    # Detailed breakdowns
    per_category: Dict[str, Dict[str, int]] = field(default_factory=dict)
    per_file: Dict[str, Dict[str, int]] = field(default_factory=dict)
    error_details: List[ErrorMetrics] = field(default_factory=list)
    fp_details: List[FalsePositive] = field(default_factory=list)

    def compute_derived_metrics(self):
        """Compute precision, recall, F1 from TP/FP/FN."""
        # Precision = TP / (TP + FP)
        if self.true_positives + self.false_positives > 0:
            self.precision = self.true_positives / (self.true_positives + self.false_positives)
        else:
            self.precision = 0.0

        # Recall = TP / (TP + FN)
        if self.true_positives + self.false_negatives > 0:
            self.recall = self.true_positives / (self.true_positives + self.false_negatives)
        else:
            self.recall = 0.0

        # F1 = 2 * (precision * recall) / (precision + recall)
        if self.precision + self.recall > 0:
            self.f1_score = 2 * (self.precision * self.recall) / (self.precision + self.recall)
        else:
            self.f1_score = 0.0

        # Fix rate = TP / (TP + FN)
        total_errors = self.true_positives + self.false_negatives
        if total_errors > 0:
            self.fix_rate = self.true_positives / total_errors
        else:
            self.fix_rate = 0.0
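
    # Worked example (editorial illustration, not in the published source):
    # with true_positives=8, false_negatives=2, false_positives=1,
    # compute_derived_metrics() yields
    #   precision = 8 / (8 + 1) ≈ 0.8889
    #   recall    = 8 / (8 + 2) =  0.8
    #   f1_score  = 2 * (0.8889 * 0.8) / (0.8889 + 0.8) ≈ 0.8421
    #   fix_rate  = 8 / (8 + 2) = 0.8 (same denominator as recall)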
    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for JSON serialization."""
        return {
            "error_count": self.error_count,
            "file_count": self.file_count,
            "true_positives": self.true_positives,
            "false_negatives": self.false_negatives,
            "false_positives": self.false_positives,
            "true_negatives": self.true_negatives,
            "precision": round(self.precision, 4),
            "recall": round(self.recall, 4),
            "f1_score": round(self.f1_score, 4),
            "fix_rate": round(self.fix_rate, 4),
            "per_category": self.per_category,
            "per_file": self.per_file,
            "error_details": [
                {
                    "error_id": e.error_id,
                    "category": e.category,
                    "file_path": e.file_path,
                    "is_fixed": e.is_fixed,
                    "status": e.status,
                }
                for e in self.error_details
            ],
            "fp_details": [
                {
                    "file_path": fp.file_path,
                    "change_description": fp.change_description,
                    "severity": fp.severity,
                }
                for fp in self.fp_details
            ],
        }


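# Minimal usage sketch (editorial example, not part of the published module):
# populate the counters by hand, derive the metrics, then serialize.
def _demo_benchmark_result() -> Dict[str, Any]:
    result = BenchmarkResult(error_count=10, file_count=2)
    result.true_positives = 8
    result.false_negatives = 2
    result.false_positives = 1
    result.compute_derived_metrics()
    return result.to_dict()  # {"precision": 0.8889, "recall": 0.8, "f1_score": 0.8421, ...}

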
SEMANTIC_FP_PROMPT = """
You are analyzing changes made to a documentation file to detect potentially harmful modifications.

CONTEXT:
- A document was intentionally corrupted with specific errors (listed below)
- An AI system attempted to fix these errors
- We need to check if the AI made any UNINTENDED harmful changes beyond fixing the known errors

INJECTED ERRORS (these changes ARE expected and should be fixed):
{injected_errors}

DIFF OF CHANGES (unified diff format):
```
{diff}
```

TASK:
Analyze the diff and identify any changes that are NOT related to fixing the injected errors.
For each unrelated change, classify it as:
1. "harmful" - Incorrect changes that introduce new errors or break functionality
2. "neutral" - Style/formatting changes that don't affect correctness
3. "beneficial" - Improvements beyond the required fixes (still acceptable)

OUTPUT (JSON only):
{{
  "unintended_changes": [
    {{
      "description": "brief description of the change",
      "severity": "harmful|neutral|beneficial",
      "original_text": "what was there before",
      "changed_text": "what it was changed to",
      "reasoning": "why this classification"
    }}
  ],
  "summary": {{
    "harmful_count": <int>,
    "neutral_count": <int>,
    "beneficial_count": <int>
  }}
}}

If no unintended changes found, return:
{{
  "unintended_changes": [],
  "summary": {{"harmful_count": 0, "neutral_count": 0, "beneficial_count": 0}}
}}
"""


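# Editorial note: the doubled braces ({{ ... }}) in the template above are
# literal braces escaped for str.format(); only {injected_errors} and {diff}
# are substituted when the prompt is built, e.g.:
#   SEMANTIC_FP_PROMPT.format(injected_errors="- Category: typo ...",
#                             diff="--- baseline ...")

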
class SemanticFPDetector:
    """Detects false positives using LLM semantic analysis."""

    def __init__(self, llm: BaseChatOpenAI):
        self.llm = llm

    def detect_false_positives(
        self,
        baseline: str,
        revised: str,
        injected_errors: List[Dict[str, Any]],
        file_path: str
    ) -> List[FalsePositive]:
        """
        Detect harmful unintended changes (false positives) in the revised content.

        Args:
            baseline: Original correct content
            revised: Content after AI fixes
            injected_errors: List of errors that were intentionally injected
            file_path: Path to the file being analyzed

        Returns:
            List of detected false positives (harmful changes)
        """
        # Generate unified diff. The inputs keep their line endings, so the
        # default lineterm keeps the ---/+++/@@ headers on their own lines
        # when the pieces are joined with "".
        baseline_lines = baseline.splitlines(keepends=True)
        revised_lines = revised.splitlines(keepends=True)
        diff_lines = list(unified_diff(
            baseline_lines,
            revised_lines,
            fromfile="baseline",
            tofile="revised",
        ))
        diff_text = "".join(diff_lines)

        if not diff_text.strip():
            # No changes at all
            return []

        # Format injected errors for the prompt
        error_descriptions = []
        for err in injected_errors:
            error_descriptions.append(
                f"- Category: {err.get('category', 'unknown')}\n"
                f"  Original: {err.get('original_snippet', 'N/A')[:100]}\n"
                f"  Mutated: {err.get('mutated_snippet', 'N/A')[:100]}"
            )
        errors_text = "\n".join(error_descriptions) if error_descriptions else "None"

        # Build prompt
        prompt = SEMANTIC_FP_PROMPT.format(
            injected_errors=errors_text,
            diff=diff_text[:8000]  # Limit diff size
        )

        try:
            conv = CommonConversation(self.llm)
            output, _ = conv.generate(
                system_prompt=prompt,
                instruction_prompt="Analyze the changes and return the JSON."
            )

            # Parse response
            result = self._parse_json_output(output)

            # Extract harmful changes as false positives
            false_positives = []
            for change in result.get("unintended_changes", []):
                if change.get("severity") == "harmful":
                    false_positives.append(FalsePositive(
                        file_path=file_path,
                        change_description=change.get("description", "Unknown change"),
                        severity="harmful",
                        original_text=change.get("original_text", ""),
                        changed_text=change.get("changed_text", ""),
                    ))

            return false_positives

        except Exception as e:
            print(f"Warning: Semantic FP detection failed for {file_path}: {e}")
            return []

    def _parse_json_output(self, output: str) -> Dict[str, Any]:
        """Parse JSON from LLM output with fallback strategies."""
        # Strategy 1: Direct parse
        try:
            return json.loads(output)
        except json.JSONDecodeError:
            pass

        # Strategy 2: Extract JSON block
        json_pattern = r'```(?:json)?\s*(\{.*?\})\s*```'
        match = re.search(json_pattern, output, re.DOTALL)
        if match:
            try:
                return json.loads(match.group(1))
            except json.JSONDecodeError:
                pass

        # Strategy 3: Find first complete JSON object
        start = output.find("{")
        if start != -1:
            brace_count = 0
            end = start
            for i, char in enumerate(output[start:], start):
                if char == "{":
                    brace_count += 1
                elif char == "}":
                    brace_count -= 1
                    if brace_count == 0:
                        end = i
                        break

            if brace_count == 0:
                try:
                    return json.loads(output[start:end+1])
                except json.JSONDecodeError:
                    pass

        # Fallback
        return {"unintended_changes": [], "summary": {"harmful_count": 0, "neutral_count": 0, "beneficial_count": 0}}


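# Editorial sketch of the parser's fenced-block fallback (Strategy 2). The
# parser itself never touches the LLM, so None is passed here for brevity.
def _demo_parse_fallback() -> Dict[str, Any]:
    detector = SemanticFPDetector(llm=None)  # type: ignore[arg-type]
    output = 'Sure, here is the JSON:\n```json\n{"unintended_changes": [], '
    output += '"summary": {"harmful_count": 0, "neutral_count": 0, "beneficial_count": 0}}\n```'
    return detector._parse_json_output(output)  # recovered via the regex strategy

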
class BenchmarkEvaluator:
    """Evaluates benchmark results with F-score metrics."""

    def __init__(self, llm: Optional[BaseChatOpenAI] = None):
        self.llm = llm
        self.fp_detector = SemanticFPDetector(llm) if llm else None

    def evaluate_single_file(
        self,
        baseline: str,
        corrupted: str,
        revised: str,
        injection_manifest: Dict[str, Any],
        file_path: str,
        file_category: str,
        detect_semantic_fp: bool = True
    ) -> Tuple[List[ErrorMetrics], List[FalsePositive]]:
        """
        Evaluate fixes for a single file.

        Returns:
            Tuple of (error_metrics, false_positives)
        """
        error_metrics = []

        for err in injection_manifest.get("errors", []):
            error_id = err.get("id", "unknown")
            category = err.get("category", "unknown")
            orig = err.get("original_snippet", "")
            mut = err.get("mutated_snippet", "")

            # Determine if error was fixed
            is_fixed, status = self._check_error_fixed(
                category, orig, mut, baseline, corrupted, revised
            )

            error_metrics.append(ErrorMetrics(
                error_id=error_id,
                category=category,
                file_path=file_path,
                is_fixed=is_fixed,
                original_snippet=orig,
                mutated_snippet=mut,
                status=status,
            ))

        # Detect false positives if LLM available and enabled
        false_positives = []
        if detect_semantic_fp and self.fp_detector:
            false_positives = self.fp_detector.detect_false_positives(
                baseline, revised, injection_manifest.get("errors", []), file_path
            )

        return error_metrics, false_positives

    def _check_error_fixed(
        self,
        category: str,
        orig: str,
        mut: str,
        baseline: str,
        corrupted: str,
        revised: str
    ) -> Tuple[bool, str]:
        """
        Check if a specific error was fixed.

        Returns:
            Tuple of (is_fixed, status)
        """
        # Logic adapted from test_metrics.py
        if category == "typo":
            if orig and orig in revised:
                return True, "fixed_to_baseline"
            elif mut and mut in revised:
                return False, "unchanged"
            else:
                return True, "fixed_to_valid"

        elif category == "link":
            wellformed = re.search(r"\[[^\]]+\]\([^\s)]+\)", revised) is not None
            return wellformed, "fixed_to_valid" if wellformed else "unchanged"
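
        # Editorial note: the "link" rule above is deliberately coarse; any
        # single well-formed link in the revised text counts the error as
        # fixed, e.g. revised = "See [docs](https://example.org)." matches
        # even if the originally broken link was left untouched.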
        elif category == "duplicate":
            dup_before = corrupted.count(mut) if mut else 0
            dup_after = revised.count(mut) if mut else 0
            is_fixed = dup_after < dup_before
            return is_fixed, "fixed_to_valid" if is_fixed else "unchanged"

        elif category == "markdown_structure":
            issues_before = self._count_markdown_issues(corrupted)
            issues_after = self._count_markdown_issues(revised)
            is_fixed = issues_after < issues_before
            return is_fixed, "fixed_to_valid" if is_fixed else "unchanged"

        elif category in ("bio_term", "function"):
            if orig and orig in revised:
                return True, "fixed_to_baseline"
            elif mut and mut in revised:
                return False, "unchanged"
            else:
                return True, "fixed_to_valid"

        elif category == "list_structure":
            mal_before = len(re.findall(r"^[-*]\S", corrupted, flags=re.M))
            mal_after = len(re.findall(r"^[-*]\S", revised, flags=re.M))
            is_fixed = mal_after < mal_before
            return is_fixed, "fixed_to_valid" if is_fixed else "unchanged"

        elif category == "image_syntax":
            bad_before = len(re.findall(r"!\[[^\]]*\]\s+\(", corrupted))
            bad_after = len(re.findall(r"!\[[^\]]*\]\s+\(", revised))
            is_fixed = bad_after < bad_before
            return is_fixed, "fixed_to_valid" if is_fixed else "unchanged"

        elif category == "section_title":
            canonical_titles = {
                "## What is it?", "## What can it do?", "## Requirements",
                "## Install", "## Quick example", "## Learn more", "## License & Contact",
            }
            if mut and mut not in revised and any(t in revised for t in canonical_titles):
                return True, "fixed_to_valid"
            return False, "unchanged"

        elif category == "inline_code":
            raw = mut.strip('`') if mut else ""
            rewrapped = f"`{raw}`" if raw else ""
            if raw and rewrapped and rewrapped in revised and mut not in revised:
                return True, "fixed_to_valid"
            return False, "unchanged"

        elif category in ("emphasis", "code_lang_tag"):
            # bool(...) so an empty mutated snippet yields False, not ""
            is_fixed = bool(mut and mut not in revised)
            return is_fixed, "fixed_to_valid" if is_fixed else "unchanged"

        elif category in ("number", "boolean", "param_name", "comment_typo", "species_name", "gene_case"):
            # For these categories: fixed if original restored OR mutated removed
            if orig and orig in revised:
                return True, "fixed_to_baseline"
            elif mut and mut in revised:
                return False, "unchanged"
            else:
                # Neither found = content rewritten = consider fixed
                return True, "fixed_to_valid"

        elif category == "table_alignment":
            var_before = self._table_variance(corrupted)
            var_after = self._table_variance(revised)
            is_fixed = var_after < var_before
            return is_fixed, "fixed_to_valid" if is_fixed else "unchanged"

        # Biology-specific and CLI/CONFIG categories
        elif category in {
            "gene_symbol_case", "species_swap", "ref_genome_mismatch", "modality_confusion",
            "normalization_error", "umi_vs_read", "batch_effect", "qc_threshold", "file_format",
            "strandedness", "coordinates", "units_scale", "sample_type", "contamination",
            "param_name", "default_value", "path_hint"  # "param_name" is unreachable here; the tuple branch above matches first
        }:
            is_fixed = bool(mut and mut not in revised)
            return is_fixed, "fixed_to_valid" if is_fixed else "unchanged"

        # Default
        return False, "unchanged"

    def _count_markdown_issues(self, text: str) -> int:
        """Count markdown structural issues."""
        issues = 0
        issues += text.count("[](")  # assumed pattern (empty link text); the original literal is truncated in the source diff
        issues += text.count("[ ")
        issues += len(re.findall(r"^#[^#\s]", text, flags=re.M))
        return max(0, issues)

    def _table_variance(self, text: str) -> int:
        """Calculate table alignment variance."""
        # Group contiguous table rows; iterating every line lets blank and
        # non-table lines actually split the groups.
        groups: List[List[str]] = []
        cur: List[str] = []
        for ln in text.splitlines():
            if '|' in ln:
                cur.append(ln)
            else:
                if len(cur) >= 2:
                    groups.append(cur)
                cur = []
        if len(cur) >= 2:
            groups.append(cur)
        vari = 0
        for g in groups:
            counts = [ln.count('|') for ln in g]
            vari += (max(counts) - min(counts))
        return vari
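
    # Editorial example: for the group ["| a | b |", "| a |"] the pipe counts
    # are [3, 2], so it contributes 3 - 2 = 1 to the variance; a consistently
    # aligned table contributes 0.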
    def aggregate_results(
        self,
        all_error_metrics: List[ErrorMetrics],
        all_false_positives: List[FalsePositive],
        error_count: int,
        file_count: int
    ) -> BenchmarkResult:
        """
        Aggregate metrics from all files into a single BenchmarkResult.
        """
        result = BenchmarkResult(
            error_count=error_count,
            file_count=file_count,
        )

        # Count TP/FN from error metrics
        for em in all_error_metrics:
            if em.is_fixed:
                result.true_positives += 1
            else:
                result.false_negatives += 1

            # Per-category breakdown
            cat = em.category
            if cat not in result.per_category:
                result.per_category[cat] = {"tp": 0, "fn": 0}
            if em.is_fixed:
                result.per_category[cat]["tp"] += 1
            else:
                result.per_category[cat]["fn"] += 1

            # Per-file breakdown
            fp = em.file_path
            if fp not in result.per_file:
                result.per_file[fp] = {"tp": 0, "fn": 0, "fp": 0}
            if em.is_fixed:
                result.per_file[fp]["tp"] += 1
            else:
                result.per_file[fp]["fn"] += 1

            result.error_details.append(em)

        # Count FP from semantic detection
        result.false_positives = len(all_false_positives)
        result.fp_details = all_false_positives

        for fp in all_false_positives:
            if fp.file_path not in result.per_file:
                result.per_file[fp.file_path] = {"tp": 0, "fn": 0, "fp": 0}
            result.per_file[fp.file_path]["fp"] += 1

        # Compute derived metrics
        result.compute_derived_metrics()

        return result


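# Editorial sketch: scoring one file without an LLM (semantic FP detection is
# skipped when fp_detector is None). All fixture strings are invented.
def _demo_evaluate_single_file() -> Tuple[List[ErrorMetrics], List[FalsePositive]]:
    evaluator = BenchmarkEvaluator(llm=None)
    manifest = {"errors": [{
        "id": "e1", "category": "typo",
        "original_snippet": "analysis", "mutated_snippet": "anaylsis",
    }]}
    return evaluator.evaluate_single_file(
        baseline="Run the analysis step.",
        corrupted="Run the anaylsis step.",
        revised="Run the analysis step.",
        injection_manifest=manifest,
        file_path="README.md",
        file_category="readme",
    )  # -> one ErrorMetrics with is_fixed=True, status="fixed_to_baseline"

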
def evaluate_benchmark(
    manifests: Dict[str, Dict[str, Any]],
    output_dir: str,
    llm: Optional[BaseChatOpenAI] = None,
    detect_semantic_fp: bool = True
) -> BenchmarkResult:
    """
    Evaluate a complete benchmark run.

    Args:
        manifests: Dict mapping file paths to their injection info
        output_dir: Directory containing the fixed files
        llm: LLM for semantic FP detection (optional)
        detect_semantic_fp: Whether to run semantic FP detection

    Returns:
        BenchmarkResult with all metrics
    """
    import os
    from bioguider.agents.agent_utils import read_file

    evaluator = BenchmarkEvaluator(llm)

    all_error_metrics: List[ErrorMetrics] = []
    all_false_positives: List[FalsePositive] = []
    total_errors = 0

    for rel_path, info in manifests.items():
        # Read fixed version
        fixed_path = os.path.join(output_dir, rel_path)
        if not os.path.exists(fixed_path):
            fixed_content = info["baseline_content"]
        else:
            fixed_content = read_file(fixed_path) or info["baseline_content"]

        # Evaluate this file
        error_metrics, false_positives = evaluator.evaluate_single_file(
            baseline=info["baseline_content"],
            corrupted=info["corrupted_content"],
            revised=fixed_content,
            injection_manifest=info["manifest"],
            file_path=rel_path,
            file_category=info["category"],
            detect_semantic_fp=detect_semantic_fp,
        )

        all_error_metrics.extend(error_metrics)
        all_false_positives.extend(false_positives)
        total_errors += len(info["manifest"].get("errors", []))

    # Aggregate results
    result = evaluator.aggregate_results(
        all_error_metrics,
        all_false_positives,
        error_count=total_errors,
        file_count=len(manifests),
    )

    return result
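
# Editorial sketch of the `manifests` shape expected by evaluate_benchmark
# (keys inferred from the function body above; paths and contents invented):
#
#   manifests = {
#       "README.md": {
#           "baseline_content": "...original text...",
#           "corrupted_content": "...text with injected errors...",
#           "category": "readme",
#           "manifest": {"errors": [{"id": "e1", "category": "typo",
#                                    "original_snippet": "...",
#                                    "mutated_snippet": "..."}]},
#       },
#   }
#   result = evaluate_benchmark(manifests, output_dir="outputs/fixed",
#                               llm=None, detect_semantic_fp=False)
#   print(result.f1_score, result.fix_rate)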