bioguider-0.2.52-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bioguider/__init__.py +0 -0
- bioguider/agents/__init__.py +0 -0
- bioguider/agents/agent_task.py +92 -0
- bioguider/agents/agent_tools.py +176 -0
- bioguider/agents/agent_utils.py +504 -0
- bioguider/agents/collection_execute_step.py +182 -0
- bioguider/agents/collection_observe_step.py +125 -0
- bioguider/agents/collection_plan_step.py +156 -0
- bioguider/agents/collection_task.py +184 -0
- bioguider/agents/collection_task_utils.py +142 -0
- bioguider/agents/common_agent.py +137 -0
- bioguider/agents/common_agent_2step.py +215 -0
- bioguider/agents/common_conversation.py +61 -0
- bioguider/agents/common_step.py +85 -0
- bioguider/agents/consistency_collection_step.py +102 -0
- bioguider/agents/consistency_evaluation_task.py +57 -0
- bioguider/agents/consistency_evaluation_task_utils.py +14 -0
- bioguider/agents/consistency_observe_step.py +110 -0
- bioguider/agents/consistency_query_step.py +77 -0
- bioguider/agents/dockergeneration_execute_step.py +186 -0
- bioguider/agents/dockergeneration_observe_step.py +154 -0
- bioguider/agents/dockergeneration_plan_step.py +158 -0
- bioguider/agents/dockergeneration_task.py +158 -0
- bioguider/agents/dockergeneration_task_utils.py +220 -0
- bioguider/agents/evaluation_installation_task.py +270 -0
- bioguider/agents/evaluation_readme_task.py +767 -0
- bioguider/agents/evaluation_submission_requirements_task.py +172 -0
- bioguider/agents/evaluation_task.py +206 -0
- bioguider/agents/evaluation_tutorial_task.py +169 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
- bioguider/agents/evaluation_userguide_prompts.py +179 -0
- bioguider/agents/evaluation_userguide_task.py +154 -0
- bioguider/agents/evaluation_utils.py +127 -0
- bioguider/agents/identification_execute_step.py +181 -0
- bioguider/agents/identification_observe_step.py +104 -0
- bioguider/agents/identification_plan_step.py +140 -0
- bioguider/agents/identification_task.py +270 -0
- bioguider/agents/identification_task_utils.py +22 -0
- bioguider/agents/peo_common_step.py +64 -0
- bioguider/agents/prompt_utils.py +253 -0
- bioguider/agents/python_ast_repl_tool.py +69 -0
- bioguider/agents/rag_collection_task.py +130 -0
- bioguider/conversation.py +67 -0
- bioguider/database/code_structure_db.py +500 -0
- bioguider/database/summarized_file_db.py +146 -0
- bioguider/generation/__init__.py +39 -0
- bioguider/generation/benchmark_metrics.py +610 -0
- bioguider/generation/change_planner.py +189 -0
- bioguider/generation/document_renderer.py +157 -0
- bioguider/generation/llm_cleaner.py +67 -0
- bioguider/generation/llm_content_generator.py +1128 -0
- bioguider/generation/llm_injector.py +809 -0
- bioguider/generation/models.py +85 -0
- bioguider/generation/output_manager.py +74 -0
- bioguider/generation/repo_reader.py +37 -0
- bioguider/generation/report_loader.py +166 -0
- bioguider/generation/style_analyzer.py +36 -0
- bioguider/generation/suggestion_extractor.py +436 -0
- bioguider/generation/test_metrics.py +189 -0
- bioguider/managers/benchmark_manager.py +785 -0
- bioguider/managers/evaluation_manager.py +215 -0
- bioguider/managers/generation_manager.py +686 -0
- bioguider/managers/generation_test_manager.py +107 -0
- bioguider/managers/generation_test_manager_v2.py +525 -0
- bioguider/rag/__init__.py +0 -0
- bioguider/rag/config.py +117 -0
- bioguider/rag/data_pipeline.py +651 -0
- bioguider/rag/embedder.py +24 -0
- bioguider/rag/rag.py +138 -0
- bioguider/settings.py +103 -0
- bioguider/utils/code_structure_builder.py +59 -0
- bioguider/utils/constants.py +135 -0
- bioguider/utils/default.gitignore +140 -0
- bioguider/utils/file_utils.py +215 -0
- bioguider/utils/gitignore_checker.py +175 -0
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/pyphen_utils.py +73 -0
- bioguider/utils/python_file_handler.py +65 -0
- bioguider/utils/r_file_handler.py +551 -0
- bioguider/utils/utils.py +163 -0
- bioguider-0.2.52.dist-info/LICENSE +21 -0
- bioguider-0.2.52.dist-info/METADATA +51 -0
- bioguider-0.2.52.dist-info/RECORD +84 -0
- bioguider-0.2.52.dist-info/WHEEL +4 -0
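Wheels are ordinary zip archives, so the manifest above can be reproduced locally. A minimal sketch, assuming the wheel has been downloaded from the registry under its standard filename:

import zipfile

# List the files packaged in the bioguider 0.2.52 wheel (a wheel is a zip archive).
with zipfile.ZipFile("bioguider-0.2.52-py3-none-any.whl") as whl:
    for name in whl.namelist():
        print(name)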
bioguider/managers/generation_test_manager.py
@@ -0,0 +1,107 @@

from __future__ import annotations

import os
import json
from typing import Tuple

from bioguider.generation.llm_injector import LLMErrorInjector
from bioguider.generation.test_metrics import evaluate_fixes
from bioguider.managers.generation_manager import DocumentationGenerationManager
from bioguider.agents.agent_utils import read_file, write_file


class GenerationTestManager:
    def __init__(self, llm, step_callback):
        self.llm = llm
        self.step_output = step_callback

    def print_step(self, name: str, out: str | None = None):
        if self.step_output:
            self.step_output(step_name=name, step_output=out)

    def run_quant_test(self, report_path: str, baseline_repo_path: str, tmp_repo_path: str, min_per_category: int = 3) -> str:
        self.print_step("QuantTest:LoadBaseline", baseline_repo_path)
        baseline_readme_path = os.path.join(baseline_repo_path, "README.md")
        baseline = read_file(baseline_readme_path) or ""

        self.print_step("QuantTest:Inject")
        injector = LLMErrorInjector(self.llm)
        corrupted, inj_manifest = injector.inject(baseline, min_per_category=min_per_category)

        # write corrupted into tmp repo path
        os.makedirs(tmp_repo_path, exist_ok=True)
        corrupted_readme_path = os.path.join(tmp_repo_path, "README.md")
        write_file(corrupted_readme_path, corrupted)
        inj_path = os.path.join(tmp_repo_path, "INJECTION_MANIFEST.json")
        with open(inj_path, "w", encoding="utf-8") as fobj:
            json.dump(inj_manifest, fobj, indent=2)

        self.print_step("QuantTest:Generate")
        gen = DocumentationGenerationManager(self.llm, self.step_output)
        out_dir = gen.run(report_path=report_path, repo_path=tmp_repo_path)

        # read revised
        revised_readme_path = os.path.join(out_dir, "README.md")
        revised = read_file(revised_readme_path) or ""

        self.print_step("QuantTest:Evaluate")
        results = evaluate_fixes(baseline, corrupted, revised, inj_manifest)
        # write results
        with open(os.path.join(out_dir, "GEN_TEST_RESULTS.json"), "w", encoding="utf-8") as fobj:
            json.dump(results, fobj, indent=2)
        # slides-like markdown report
        totals = results.get("summary", {}).get("totals", {})
        success_rate = results.get("summary", {}).get("success_rate", 0.0)
        lines = ["# 🔬 Quantifiable Testing Results\n",
                 "\n## BioGuider Error Correction Performance Analysis\n",
                 "\n---\n",
                 "\n## 📊 Slide 1: Testing Results Overview\n",
                 "\n### 🎯 Totals\n",
                 f"- Total Errors: {totals.get('total_errors', 0)}\n",
                 f"- Fixed to Baseline: {totals.get('fixed_to_baseline', 0)}\n",
                 f"- Fixed to Valid: {totals.get('fixed_to_valid', 0)}\n",
                 f"- Unchanged: {totals.get('unchanged', 0)}\n",
                 f"- Success Rate: {success_rate}%\n",
                 "\n### 📂 Per-Category Metrics\n"]
        for cat, m in results["per_category"].items():
            lines.append(f"- {cat}: total={m.get('total',0)}, fixed_to_baseline={m.get('fixed_to_baseline',0)}, fixed_to_valid={m.get('fixed_to_valid',0)}, unchanged={m.get('unchanged',0)}")
        # Per-file change counts (simple heuristic from manifest artifacts)
        try:
            manifest_path = os.path.join(out_dir, "manifest.json")
            with open(manifest_path, "r", encoding="utf-8") as mf:
                mani = json.load(mf)
            lines.append("\n### 🗂️ Per-File Changes\n")
            for art in mani.get("artifacts", []):
                rel = art.get("dest_rel_path")
                stats = art.get("diff_stats", {})
                added = stats.get("added_lines", 0)
                status = "Revised" if added and added > 0 else "Copied"
                lines.append(f"- {rel}: {status}, added_lines={added}")
        except Exception:
            pass
        lines.append("\n---\n\n## 📝 Notes\n")
        lines.append("- README versions saved: README.original.md, README.corrupted.md, README.md (fixed).\n")
        with open(os.path.join(out_dir, "GEN_TEST_REPORT.md"), "w", encoding="utf-8") as fobj:
            fobj.write("\n".join(lines))
        # Save versioned files into output dir
        write_file(os.path.join(out_dir, "README.original.md"), baseline)
        write_file(os.path.join(out_dir, "README.corrupted.md"), corrupted)
        # Copy injection manifest
        try:
            with open(inj_path, "r", encoding="utf-8") as fin:
                with open(os.path.join(out_dir, "INJECTION_MANIFEST.json"), "w", encoding="utf-8") as fout:
                    fout.write(fin.read())
        except Exception:
            pass
        self.print_step("QuantTest:Done", out_dir)
        return out_dir

    def run_quant_suite(self, report_path: str, baseline_repo_path: str, base_tmp_repo_path: str, levels: dict[str, int]) -> dict:
        results = {}
        for level, min_cnt in levels.items():
            tmp_repo_path = f"{base_tmp_repo_path}_{level}"
            out_dir = self.run_quant_test(report_path, baseline_repo_path, tmp_repo_path, min_per_category=min_cnt)
            results[level] = out_dir
        return results
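For orientation, a hedged driver for the class above. The llm object and all paths are assumptions for illustration; the packaged code only requires that llm be accepted by LLMErrorInjector and DocumentationGenerationManager, and that the baseline repo contain a README.md:

from bioguider.managers.generation_test_manager import GenerationTestManager

def log_step(step_name: str, step_output: str | None = None):
    # Simple console sink matching the keyword-argument callback signature above.
    print(f"[{step_name}] {step_output or ''}")

def run_readme_test(llm) -> str:
    # Single README-only run: inject >= 3 errors per category, regenerate, evaluate.
    mgr = GenerationTestManager(llm, log_step)
    return mgr.run_quant_test(
        report_path="reports/evaluation_report.json",  # hypothetical path
        baseline_repo_path="repos/example_pkg",        # must contain README.md
        tmp_repo_path="tmp/example_pkg_corrupted",
        min_per_category=3,
    )

The returned directory then holds GEN_TEST_RESULTS.json, GEN_TEST_REPORT.md, the injection manifest, and the original/corrupted/fixed README versions.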
bioguider/managers/generation_test_manager_v2.py
@@ -0,0 +1,525 @@

from __future__ import annotations

import os
import json
import shutil
from typing import Dict, List, Tuple
from pathlib import Path

from bioguider.generation.llm_injector import LLMErrorInjector
from bioguider.managers.generation_manager import DocumentationGenerationManager
from bioguider.agents.agent_utils import read_file, write_file


class GenerationTestManagerV2:
    """
    Enhanced version that:
    1. Injects errors into ALL files in multiple categories (README, ALL tutorials, ALL userguides, ALL installation docs)
    2. Tracks errors comprehensively across all files with detailed per-file and per-category statistics
    3. Provides detailed statistics on injected, detected, and fixed errors
    4. Simplifies reporting (fixed vs unchanged, no confusing dual-status)
    5. Saves corrupted, original, and fixed versions for each file for full audit trail
    """

    def __init__(self, llm, step_callback):
        self.llm = llm
        self.step_output = step_callback

    def print_step(self, name: str, out: str | None = None):
        if self.step_output:
            self.step_output(step_name=name, step_output=out)

    def _select_target_files(self, baseline_repo_path: str) -> Dict[str, List[str]]:
        """
        Select target files for error injection across multiple categories.

        Returns:
            Dict mapping category names to file paths
        """
        targets = {
            "readme": [],
            "tutorial": [],
            "userguide": [],
            "installation": []
        }

        # README files
        readme_path = os.path.join(baseline_repo_path, "README.md")
        if os.path.exists(readme_path):
            targets["readme"].append(readme_path)

        # Tutorial files (RMarkdown vignettes) - ALL FILES
        vignettes_dir = os.path.join(baseline_repo_path, "vignettes")
        if os.path.isdir(vignettes_dir):
            tutorial_files = []
            for f in os.listdir(vignettes_dir):
                if f.endswith('.Rmd') and not f.startswith('.'):
                    tutorial_files.append(os.path.join(vignettes_dir, f))
            # Inject into ALL tutorial files
            targets["tutorial"] = sorted(tutorial_files)

        # Installation files
        install_files = []
        for pattern in ["install", "INSTALL", "installation"]:
            for ext in [".md", ".Rmd", ".rst"]:
                fpath = os.path.join(baseline_repo_path, pattern + ext)
                if os.path.exists(fpath):
                    install_files.append(fpath)

        # Also check vignettes for installation guides
        if os.path.isdir(vignettes_dir):
            for f in os.listdir(vignettes_dir):
                if "install" in f.lower() and (f.endswith('.Rmd') or f.endswith('.md')):
                    fpath = os.path.join(vignettes_dir, f)
                    if fpath not in install_files:  # Avoid duplicates
                        install_files.append(fpath)

        targets["installation"] = install_files  # ALL installation docs

        # Userguide files - ALL FILES
        docs_dir = os.path.join(baseline_repo_path, "docs")
        if os.path.isdir(docs_dir):
            userguide_files = []
            for f in os.listdir(docs_dir):
                if f.endswith('.md') and not f.startswith('.'):
                    userguide_files.append(os.path.join(docs_dir, f))
            targets["userguide"] = userguide_files  # ALL userguide files

        return targets

    def _extract_project_terms(self, repo_path: str) -> List[str]:
        """
        Extract function names and key terms from the codebase to use as injection targets.
        """
        import re
        from collections import Counter

        terms = Counter()

        # Walk through the repo
        for root, _, files in os.walk(repo_path):
            if ".git" in root or "__pycache__" in root:
                continue

            for file in files:
                fpath = os.path.join(root, file)
                try:
                    content = read_file(fpath)
                    if not content:
                        continue

                    if file.endswith(".py"):
                        # Python function definitions
                        funcs = re.findall(r"def\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*\(", content)
                        terms.update(funcs)
                        # Python class definitions
                        classes = re.findall(r"class\s+([a-zA-Z_][a-zA-Z0-9_]*)\s*[:\(]", content)
                        terms.update(classes)

                    elif file.endswith(".R"):
                        # R function definitions
                        funcs = re.findall(r"([a-zA-Z_.][a-zA-Z0-9_.]*)\s*<-\s*function", content)
                        terms.update(funcs)

                except Exception:
                    continue

        # Filter out common/short terms
        filtered_terms = [t for t, _ in terms.most_common(50) if len(t) > 4 and t not in ["init", "self", "setup", "test", "main"]]
        return filtered_terms[:20]

    def _inject_errors_into_files(
        self,
        target_files: Dict[str, List[str]],
        tmp_repo_path: str,
        min_per_category: int
    ) -> Dict[str, Dict]:
        """
        Inject errors into selected files.

        Returns:
            Dict mapping file paths to injection manifests
        """
        injector = LLMErrorInjector(self.llm)
        all_manifests = {}

        # Extract project terms once
        project_terms = self._extract_project_terms(tmp_repo_path)
        self.print_step("ExtractTerms", f"Found {len(project_terms)} project terms: {', '.join(project_terms[:5])}...")

        for category, file_list in target_files.items():
            self.print_step(f"InjectErrors:{category.title()}", f"Injecting {min_per_category} errors per file into {len(file_list)} files")

            for fpath in file_list:
                if not os.path.exists(fpath):
                    continue

                baseline_content = read_file(fpath) or ""
                if not baseline_content.strip():
                    continue

                try:
                    # Inject errors
                    corrupted, manifest = injector.inject(
                        baseline_content,
                        min_per_category=min_per_category,
                        project_terms=project_terms
                    )

                    # Save corrupted version to tmp repo
                    rel_path = os.path.relpath(fpath, os.path.dirname(os.path.dirname(fpath)))
                    if rel_path.startswith("../"):
                        rel_path = os.path.basename(fpath)

                    corrupted_path = os.path.join(tmp_repo_path, rel_path)
                    os.makedirs(os.path.dirname(corrupted_path), exist_ok=True)
                    write_file(corrupted_path, corrupted)

                    # Track manifest - add file_path to each error for tracking
                    errors_with_file = []
                    for error in manifest.get("errors", []):
                        error_with_file = error.copy()
                        error_with_file["file_path"] = rel_path
                        errors_with_file.append(error_with_file)

                    manifest_with_file = manifest.copy()
                    manifest_with_file["errors"] = errors_with_file

                    all_manifests[rel_path] = {
                        "category": category,
                        "original_path": fpath,
                        "corrupted_path": corrupted_path,
                        "manifest": manifest_with_file,
                        "baseline_content": baseline_content,
                        "corrupted_content": corrupted
                    }

                    self.print_step(
                        f"Injected:{os.path.basename(fpath)}",
                        f"{len(manifest.get('errors', []))} errors"
                    )
                except Exception as e:
                    self.print_step(f"InjectionError:{os.path.basename(fpath)}", str(e))
                    continue

        return all_manifests

    def _evaluate_all_fixes(
        self,
        all_manifests: Dict[str, Dict],
        output_dir: str
    ) -> Dict:
        """
        Evaluate fixes across all injected files.

        Returns comprehensive statistics.
        """
        from bioguider.generation.test_metrics import evaluate_fixes

        all_results = {
            "per_file": {},
            "aggregate": {
                "total_files_injected": len(all_manifests),
                "total_errors_injected": 0,
                "total_errors_fixed": 0,
                "total_errors_unchanged": 0,
                "by_category": {},
                "by_file_type": {}
            },
            "detailed_errors": []
        }

        for rel_path, info in all_manifests.items():
            # Read the fixed version
            fixed_path = os.path.join(output_dir, rel_path)
            if not os.path.exists(fixed_path):
                # File wasn't processed - copy original
                fixed_content = info["baseline_content"]
            else:
                fixed_content = read_file(fixed_path) or info["baseline_content"]

            # Evaluate fixes for this file
            results = evaluate_fixes(
                info["baseline_content"],
                info["corrupted_content"],
                fixed_content,
                info["manifest"]
            )

            # Store per-file results
            all_results["per_file"][rel_path] = {
                "category": info["category"],
                "results": results
            }

            # Aggregate statistics
            totals = results.get("summary", {}).get("totals", {})
            file_total_errors = totals.get("total_errors", 0)
            file_fixed = totals.get("fixed_to_baseline", 0) + totals.get("fixed_to_valid", 0)
            file_unchanged = totals.get("unchanged", 0)

            all_results["aggregate"]["total_errors_injected"] += file_total_errors
            all_results["aggregate"]["total_errors_fixed"] += file_fixed
            all_results["aggregate"]["total_errors_unchanged"] += file_unchanged

            # By file type
            file_cat = info["category"]
            if file_cat not in all_results["aggregate"]["by_file_type"]:
                all_results["aggregate"]["by_file_type"][file_cat] = {
                    "files": 0,
                    "errors_injected": 0,
                    "errors_fixed": 0,
                    "errors_unchanged": 0
                }

            all_results["aggregate"]["by_file_type"][file_cat]["files"] += 1
            all_results["aggregate"]["by_file_type"][file_cat]["errors_injected"] += file_total_errors
            all_results["aggregate"]["by_file_type"][file_cat]["errors_fixed"] += file_fixed
            all_results["aggregate"]["by_file_type"][file_cat]["errors_unchanged"] += file_unchanged

            # By error category
            for err_cat, metrics in results.get("per_category", {}).items():
                if err_cat not in all_results["aggregate"]["by_category"]:
                    all_results["aggregate"]["by_category"][err_cat] = {
                        "total": 0,
                        "fixed": 0,
                        "unchanged": 0
                    }

                all_results["aggregate"]["by_category"][err_cat]["total"] += metrics.get("total", 0)
                all_results["aggregate"]["by_category"][err_cat]["fixed"] += (
                    metrics.get("fixed_to_baseline", 0) + metrics.get("fixed_to_valid", 0)
                )
                all_results["aggregate"]["by_category"][err_cat]["unchanged"] += metrics.get("unchanged", 0)

            # Collect detailed errors
            for err in results.get("per_error", []):
                err_detail = {
                    "file": rel_path,
                    "file_category": file_cat,
                    **err
                }
                # Simplify status
                if err["status"] in ("fixed_to_baseline", "fixed_to_valid"):
                    err_detail["status"] = "fixed"
                all_results["detailed_errors"].append(err_detail)

        # Calculate aggregate success rate
        total_errors = all_results["aggregate"]["total_errors_injected"]
        fixed_errors = all_results["aggregate"]["total_errors_fixed"]
        all_results["aggregate"]["success_rate"] = (
            round((fixed_errors / total_errors * 100.0), 2) if total_errors > 0 else 0.0
        )

        return all_results

    def _generate_comprehensive_report(
        self,
        results: Dict,
        output_dir: str,
        level: str
    ):
        """Generate a comprehensive markdown report"""
        agg = results["aggregate"]

        lines = [
            "# 🔬 BioGuider Quantifiable Testing Results\n",
            f"**Test Level**: {level.upper()}\n",
            "\n---\n",
            "\n## 📊 Executive Summary\n",
            f"\n### Overall Performance\n",
            f"- **Success Rate**: {agg['success_rate']}%\n",
            f"- **Total Files Tested**: {agg['total_files_injected']}\n",
            f"- **Total Errors Injected**: {agg['total_errors_injected']}\n",
            f"- **Errors Fixed**: {agg['total_errors_fixed']} ({round(agg['total_errors_fixed']/agg['total_errors_injected']*100, 1) if agg['total_errors_injected'] > 0 else 0}%)\n",
            f"- **Errors Unchanged**: {agg['total_errors_unchanged']} ({round(agg['total_errors_unchanged']/agg['total_errors_injected']*100, 1) if agg['total_errors_injected'] > 0 else 0}%)\n",
            "\n---\n",
            "\n## 📂 Performance by File Type\n",
        ]

        for file_type, metrics in sorted(agg["by_file_type"].items()):
            fix_rate = (metrics["errors_fixed"] / metrics["errors_injected"] * 100) if metrics["errors_injected"] > 0 else 0
            lines.append(f"\n### {file_type.title()}\n")
            lines.append(f"- Files Tested: {metrics['files']}\n")
            lines.append(f"- Errors Injected: {metrics['errors_injected']}\n")
            lines.append(f"- Errors Fixed: {metrics['errors_fixed']} ({fix_rate:.1f}%)\n")
            lines.append(f"- Errors Unchanged: {metrics['errors_unchanged']}\n")

        lines.append("\n---\n")
        lines.append("\n## 🏷️ Performance by Error Category\n")
        lines.append("\n| Category | Total | Fixed | Unchanged | Fix Rate |\n")
        lines.append("|----------|-------|-------|-----------|----------|\n")

        for err_cat, metrics in sorted(agg["by_category"].items(), key=lambda x: -x[1]["total"]):
            fix_rate = (metrics["fixed"] / metrics["total"] * 100) if metrics["total"] > 0 else 0
            lines.append(
                f"| {err_cat} | {metrics['total']} | {metrics['fixed']} | "
                f"{metrics['unchanged']} | {fix_rate:.1f}% |\n"
            )

        lines.append("\n---\n")
        lines.append("\n## 📝 Detailed Error Breakdown\n")

        # Group by file
        by_file = {}
        for err in results["detailed_errors"]:
            fpath = err["file"]
            if fpath not in by_file:
                by_file[fpath] = []
            by_file[fpath].append(err)

        for fpath, errors in sorted(by_file.items()):
            fixed_count = sum(1 for e in errors if e["status"] == "fixed")
            total_count = len(errors)
            lines.append(f"\n### `{fpath}`\n")
            lines.append(f"- **Total Errors**: {total_count}\n")
            lines.append(f"- **Fixed**: {fixed_count}\n")
            lines.append(f"- **Unchanged**: {total_count - fixed_count}\n")
            lines.append("\n| ID | Category | Status |\n")
            lines.append("|--------|----------|--------|\n")
            for err in errors:
                lines.append(f"| {err['id']} | {err['category']} | {err['status']} |\n")

        lines.append("\n---\n")
        lines.append("\n## 💡 Notes\n")
        lines.append("- Original, corrupted, and fixed versions saved for each file\n")
        lines.append("- Detailed injection manifests available in `INJECTION_MANIFEST.json`\n")
        lines.append("- Complete results data in `GEN_TEST_RESULTS.json`\n")

        with open(os.path.join(output_dir, "GEN_TEST_REPORT.md"), "w", encoding="utf-8") as f:
            f.write("".join(lines))

    def run_quant_test(
        self,
        report_path: str,
        baseline_repo_path: str,
        tmp_repo_path: str,
        min_per_category: int = 3
    ) -> str:
        """
        Run quantifiable testing with multi-file error injection.
        """
        # 1. Select target files across categories
        self.print_step("SelectFiles", "Identifying target files...")
        target_files = self._select_target_files(baseline_repo_path)

        total_targets = sum(len(files) for files in target_files.values())
        self.print_step("TargetsSelected", f"{total_targets} files selected across {len(target_files)} categories")

        # 2. Copy baseline to tmp (for unmodified files)
        if os.path.exists(tmp_repo_path):
            shutil.rmtree(tmp_repo_path)
        shutil.copytree(baseline_repo_path, tmp_repo_path, symlinks=False, ignore=shutil.ignore_patterns('.git'))

        # 3. Inject errors into selected files
        self.print_step("InjectErrors", f"Injecting {min_per_category} errors per category...")
        all_manifests = self._inject_errors_into_files(target_files, tmp_repo_path, min_per_category)

        total_errors = sum(len(info["manifest"].get("errors", [])) for info in all_manifests.values())
        self.print_step("InjectionComplete", f"{total_errors} errors injected across {len(all_manifests)} files")

        # Save combined injection manifest with proper structure
        # Flatten all errors with file information for easy tracking
        all_errors_flat = []
        files_info = {}
        for rel_path, info in all_manifests.items():
            file_errors = info["manifest"].get("errors", [])
            files_info[rel_path] = {
                "category": info["category"],
                "original_path": info["original_path"],
                "corrupted_path": info["corrupted_path"],
                "error_count": len(file_errors),
                "errors": file_errors
            }
            all_errors_flat.extend(file_errors)

        combined_manifest = {
            "total_files": len(all_manifests),
            "total_errors": total_errors,
            "files": files_info,
            "errors": all_errors_flat  # Flat list for easy evaluation
        }
        inj_path = os.path.join(tmp_repo_path, "INJECTION_MANIFEST.json")
        with open(inj_path, "w", encoding="utf-8") as f:
            json.dump(combined_manifest, f, indent=2)

        # 4. Run generation/fixing
        self.print_step("RunGeneration", "Running BioGuider to fix errors...")
        gen = DocumentationGenerationManager(self.llm, self.step_output)
        out_dir = gen.run(report_path=report_path, repo_path=tmp_repo_path)

        # 5. Evaluate fixes
        self.print_step("EvaluateFixes", "Evaluating error corrections...")
        results = self._evaluate_all_fixes(all_manifests, out_dir)

        # 6. Save results
        with open(os.path.join(out_dir, "GEN_TEST_RESULTS.json"), "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)

        # Copy injection manifest to output
        shutil.copy(inj_path, os.path.join(out_dir, "INJECTION_MANIFEST.json"))

        # 7. Generate report
        level = "custom"
        if min_per_category <= 3:
            level = "low"
        elif min_per_category <= 7:
            level = "mid"
        else:
            level = "high"

        self._generate_comprehensive_report(results, out_dir, level)

        # 8. Save versioned baseline files (original and corrupted versions)
        for rel_path, info in all_manifests.items():
            base_name = os.path.basename(rel_path)
            base_dir = os.path.dirname(rel_path)

            # Extract file extension properly
            if '.' in base_name:
                name_parts = base_name.rsplit('.', 1)
                base_name_no_ext = name_parts[0]
                ext = '.' + name_parts[1]
            else:
                base_name_no_ext = base_name
                ext = ''

            # Create original and corrupted filenames
            orig_name = f"{base_name_no_ext}.original{ext}"
            corr_name = f"{base_name_no_ext}.corrupted{ext}"

            # Determine save directory - preserve directory structure
            if base_name == "README.md":
                # Special handling for README - save at root level
                save_dir = out_dir
            else:
                # Save in same directory structure as original
                save_dir = os.path.join(out_dir, base_dir) if base_dir else out_dir

            os.makedirs(save_dir, exist_ok=True)

            # Save original and corrupted versions
            write_file(os.path.join(save_dir, orig_name), info["baseline_content"])
            write_file(os.path.join(save_dir, corr_name), info["corrupted_content"])

        self.print_step("TestComplete", f"Results saved to {out_dir}")
        return out_dir

    def run_quant_suite(
        self,
        report_path: str,
        baseline_repo_path: str,
        base_tmp_repo_path: str,
        levels: dict[str, int]
    ) -> dict:
        """
        Run test suite across multiple levels.
        """
        results = {}
        for level, min_cnt in levels.items():
            self.print_step(f"RunLevel:{level.upper()}", f"Running with {min_cnt} errors per file")
            tmp_repo_path = f"{base_tmp_repo_path}_{level}"
            out_dir = self.run_quant_test(report_path, baseline_repo_path, tmp_repo_path, min_per_category=min_cnt)
            results[level] = out_dir
        return results
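A matching hedged driver for the V2 suite. The level names and per-file counts below mirror the low/mid/high thresholds hard-coded in run_quant_test (<= 3 is low, <= 7 is mid, above that high); the llm object and paths are again placeholders:

from bioguider.managers.generation_test_manager_v2 import GenerationTestManagerV2

def log_step(step_name: str, step_output: str | None = None):
    print(f"[{step_name}] {step_output or ''}")

def run_suite(llm) -> dict:
    # One corrupted copy of the repo is created per level at <base>_low, <base>_mid, <base>_high.
    mgr = GenerationTestManagerV2(llm, log_step)
    return mgr.run_quant_suite(
        report_path="reports/evaluation_report.json",  # hypothetical path
        baseline_repo_path="repos/example_pkg",
        base_tmp_repo_path="tmp/example_pkg",
        levels={"low": 3, "mid": 7, "high": 10},
    )

The returned dict maps each level name to its output directory, each containing GEN_TEST_RESULTS.json, GEN_TEST_REPORT.md, and per-file .original/.corrupted copies.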