bioguider 0.2.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bioguider/__init__.py +0 -0
- bioguider/agents/__init__.py +0 -0
- bioguider/agents/agent_task.py +92 -0
- bioguider/agents/agent_tools.py +176 -0
- bioguider/agents/agent_utils.py +504 -0
- bioguider/agents/collection_execute_step.py +182 -0
- bioguider/agents/collection_observe_step.py +125 -0
- bioguider/agents/collection_plan_step.py +156 -0
- bioguider/agents/collection_task.py +184 -0
- bioguider/agents/collection_task_utils.py +142 -0
- bioguider/agents/common_agent.py +137 -0
- bioguider/agents/common_agent_2step.py +215 -0
- bioguider/agents/common_conversation.py +61 -0
- bioguider/agents/common_step.py +85 -0
- bioguider/agents/consistency_collection_step.py +102 -0
- bioguider/agents/consistency_evaluation_task.py +57 -0
- bioguider/agents/consistency_evaluation_task_utils.py +14 -0
- bioguider/agents/consistency_observe_step.py +110 -0
- bioguider/agents/consistency_query_step.py +77 -0
- bioguider/agents/dockergeneration_execute_step.py +186 -0
- bioguider/agents/dockergeneration_observe_step.py +154 -0
- bioguider/agents/dockergeneration_plan_step.py +158 -0
- bioguider/agents/dockergeneration_task.py +158 -0
- bioguider/agents/dockergeneration_task_utils.py +220 -0
- bioguider/agents/evaluation_installation_task.py +270 -0
- bioguider/agents/evaluation_readme_task.py +767 -0
- bioguider/agents/evaluation_submission_requirements_task.py +172 -0
- bioguider/agents/evaluation_task.py +206 -0
- bioguider/agents/evaluation_tutorial_task.py +169 -0
- bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
- bioguider/agents/evaluation_userguide_prompts.py +179 -0
- bioguider/agents/evaluation_userguide_task.py +154 -0
- bioguider/agents/evaluation_utils.py +127 -0
- bioguider/agents/identification_execute_step.py +181 -0
- bioguider/agents/identification_observe_step.py +104 -0
- bioguider/agents/identification_plan_step.py +140 -0
- bioguider/agents/identification_task.py +270 -0
- bioguider/agents/identification_task_utils.py +22 -0
- bioguider/agents/peo_common_step.py +64 -0
- bioguider/agents/prompt_utils.py +253 -0
- bioguider/agents/python_ast_repl_tool.py +69 -0
- bioguider/agents/rag_collection_task.py +130 -0
- bioguider/conversation.py +67 -0
- bioguider/database/code_structure_db.py +500 -0
- bioguider/database/summarized_file_db.py +146 -0
- bioguider/generation/__init__.py +39 -0
- bioguider/generation/benchmark_metrics.py +610 -0
- bioguider/generation/change_planner.py +189 -0
- bioguider/generation/document_renderer.py +157 -0
- bioguider/generation/llm_cleaner.py +67 -0
- bioguider/generation/llm_content_generator.py +1128 -0
- bioguider/generation/llm_injector.py +809 -0
- bioguider/generation/models.py +85 -0
- bioguider/generation/output_manager.py +74 -0
- bioguider/generation/repo_reader.py +37 -0
- bioguider/generation/report_loader.py +166 -0
- bioguider/generation/style_analyzer.py +36 -0
- bioguider/generation/suggestion_extractor.py +436 -0
- bioguider/generation/test_metrics.py +189 -0
- bioguider/managers/benchmark_manager.py +785 -0
- bioguider/managers/evaluation_manager.py +215 -0
- bioguider/managers/generation_manager.py +686 -0
- bioguider/managers/generation_test_manager.py +107 -0
- bioguider/managers/generation_test_manager_v2.py +525 -0
- bioguider/rag/__init__.py +0 -0
- bioguider/rag/config.py +117 -0
- bioguider/rag/data_pipeline.py +651 -0
- bioguider/rag/embedder.py +24 -0
- bioguider/rag/rag.py +138 -0
- bioguider/settings.py +103 -0
- bioguider/utils/code_structure_builder.py +59 -0
- bioguider/utils/constants.py +135 -0
- bioguider/utils/default.gitignore +140 -0
- bioguider/utils/file_utils.py +215 -0
- bioguider/utils/gitignore_checker.py +175 -0
- bioguider/utils/notebook_utils.py +117 -0
- bioguider/utils/pyphen_utils.py +73 -0
- bioguider/utils/python_file_handler.py +65 -0
- bioguider/utils/r_file_handler.py +551 -0
- bioguider/utils/utils.py +163 -0
- bioguider-0.2.52.dist-info/LICENSE +21 -0
- bioguider-0.2.52.dist-info/METADATA +51 -0
- bioguider-0.2.52.dist-info/RECORD +84 -0
- bioguider-0.2.52.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import List, Dict
|
|
4
|
+
|
|
5
|
+
from .models import SuggestionItem, StyleProfile, DocumentPlan, PlannedEdit
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class ChangePlanner:
    """Builds a ``DocumentPlan`` of ``PlannedEdit`` entries from suggestions.

    NOTE(review): this listing was recovered from a rendering that dropped
    leading whitespace. The nesting below is reconstructed from the control
    flow (in particular, the trailing ``full_replace`` append is assumed to
    sit in the per-target loop body, after the ``if/elif`` chain) -- confirm
    against the packaged file.
    """

    def build_plan(
        self,
        repo_path: str,
        style: StyleProfile,
        suggestions: List[SuggestionItem],
        available_files: Dict[str, str],
    ) -> DocumentPlan:
        """Plan edits for every (suggestion, target file) pair.

        Args:
            repo_path: Passed through unchanged to the returned plan.
            style: Its ``heading_style`` is used to build section headers; the
                object itself is stored on the plan as ``style_profile``.
            suggestions: Each item is read for ``target_files``, ``action``,
                ``anchor_hint``, ``content_guidance``, ``source`` and ``id``.
            available_files: Only consulted in a membership test whose branch
                is a no-op, so it currently has no effect on the plan.

        Returns:
            A ``DocumentPlan`` whose ``planned_edits`` are grouped per file.
            For any file that has at least one ``full_replace`` edit, only its
            ``full_replace`` edits are kept.
        """
        planned: List[PlannedEdit] = []
        # (target file, lower-cased header) pairs already planned; used to
        # skip a second section with the same header in the same file.
        seen_headers: set[tuple[str, str]] = set()

        def section_header(title: str) -> str:
            # use heading level 2 for inserts to be safe
            h = style.heading_style or "#"
            return f"{h*2} {title}\n\n"

        for s in suggestions:
            for target in s.target_files:
                if target not in available_files:
                    # allow planning; renderer will skip if missing
                    pass

                # NOTE: every ``continue`` below skips the rest of this loop
                # body, including the full_replace append at the bottom.
                if s.action == "add_dependencies_section":
                    # Use LLM generation instead of template
                    header_key = (target, (s.anchor_hint or "Dependencies").strip().lower())
                    if header_key in seen_headers:
                        continue
                    planned.append(PlannedEdit(
                        file_path=target,
                        edit_type="append_section",
                        anchor={"type": "header", "value": s.anchor_hint or "Dependencies"},
                        content_template="",  # Will be generated by LLM
                        rationale=s.source.get("evidence", ""),
                        suggestion_id=s.id,
                    ))
                    seen_headers.add(header_key)
                elif s.action == "add_system_requirements_section":
                    # Use LLM generation instead of template
                    header_key = (target, (s.anchor_hint or "System Requirements").strip().lower())
                    if header_key in seen_headers:
                        continue
                    planned.append(PlannedEdit(
                        file_path=target,
                        edit_type="append_section",
                        anchor={"type": "header", "value": s.anchor_hint or "System Requirements"},
                        content_template="",  # Will be generated by LLM
                        rationale=s.source.get("evidence", ""),
                        suggestion_id=s.id,
                    ))
                    seen_headers.add(header_key)
                elif s.action == "mention_license_section":
                    # NOTE(review): the template text names the MIT License
                    # unconditionally; nothing here checks the repository's
                    # actual license.
                    content = section_header("License") + "This project is released under the MIT License. See LICENSE for details.\n"
                    header_key = (target, (s.anchor_hint or "License").strip().lower())
                    if header_key in seen_headers:
                        continue
                    planned.append(PlannedEdit(
                        file_path=target,
                        edit_type="append_section",
                        anchor={"type": "header", "value": s.anchor_hint or "License"},
                        content_template=content,
                        rationale=s.source.get("evidence", ""),
                        suggestion_id=s.id,
                    ))
                    seen_headers.add(header_key)
                elif s.action == "normalize_headings_structure":
                    # Minimal placeholder: avoid heavy rewrites
                    # Plan a no-op or a small note; actual normalization could be added later
                    continue
                elif s.action == "add_usage_section":
                    content = section_header("Usage") + "- Brief example of typical workflow.\n"
                    # Fixed key: anchor_hint is not consulted for this action.
                    header_key = (target, "usage")
                    if header_key in seen_headers:
                        continue
                    planned.append(PlannedEdit(
                        file_path=target,
                        edit_type="append_section",
                        anchor={"type": "header", "value": "Usage"},
                        content_template=content,
                        rationale=s.source.get("evidence", ""),
                        suggestion_id=s.id,
                    ))
                    seen_headers.add(header_key)
                elif s.action == "replace_intro":
                    # Replace intro block (between H1 and first H2) with a clean Overview section
                    # Use empty content_template so LLM can generate content based on guidance
                    header_key = (target, "overview")
                    if header_key in seen_headers:
                        continue
                    planned.append(PlannedEdit(
                        file_path=target,
                        edit_type="replace_intro_block",
                        anchor={"type": "header", "value": "Overview"},
                        content_template="",  # Will be filled by LLM generation
                        rationale=s.source.get("evidence", ""),
                        suggestion_id=s.id,
                    ))
                    seen_headers.add(header_key)
                elif s.action == "clarify_mandatory_vs_optional":
                    # Use specific guidance from evaluation report instead of generic template
                    guidance = s.content_guidance or "Specify compatibility details for dependencies across operating systems and architectures."
                    content = section_header("Dependencies") + f"- {guidance}\n"
                    # Shares the "dependencies" key with add_dependencies_section
                    # when that action falls back to its default header, so
                    # whichever comes first for a file wins.
                    header_key = (target, "dependencies")
                    if header_key in seen_headers:
                        continue
                    planned.append(PlannedEdit(
                        file_path=target,
                        edit_type="append_section",
                        anchor={"type": "header", "value": "Dependencies"},
                        content_template=content,
                        rationale=s.source.get("evidence", ""),
                        suggestion_id=s.id,
                    ))
                    seen_headers.add(header_key)
                elif s.action == "add_hardware_requirements":
                    # Use LLM generation instead of template
                    header_key = (target, (s.anchor_hint or "Hardware Requirements").strip().lower())
                    if header_key in seen_headers:
                        continue
                    planned.append(PlannedEdit(
                        file_path=target,
                        edit_type="append_section",
                        anchor={"type": "header", "value": s.anchor_hint or "Hardware Requirements"},
                        content_template="",  # Will be generated by LLM
                        rationale=s.source.get("evidence", ""),
                        suggestion_id=s.id,
                    ))
                    seen_headers.add(header_key)
                elif s.action == "improve_clarity_and_error_handling":
                    # Handle targeted improvements to user guides
                    # (no seen_headers de-duplication for this action)
                    planned.append(PlannedEdit(
                        file_path=target,
                        edit_type="append_section",
                        anchor={"type": "header", "value": s.anchor_hint or "Introduction"},
                        content_template="",  # Will be filled by LLM generation
                        rationale=s.source.get("evidence", ""),
                        suggestion_id=s.id,
                    ))
                elif s.action == "improve_consistency":
                    # Handle consistency improvements
                    # (no seen_headers de-duplication for this action)
                    planned.append(PlannedEdit(
                        file_path=target,
                        edit_type="append_section",
                        anchor={"type": "header", "value": s.anchor_hint or "Examples"},
                        content_template="",  # Will be filled by LLM generation
                        rationale=s.source.get("evidence", ""),
                        suggestion_id=s.id,
                    ))
                elif s.action == "improve_tutorial_quality":
                    # Handle tutorial quality improvements
                    # (no seen_headers de-duplication for this action)
                    planned.append(PlannedEdit(
                        file_path=target,
                        edit_type="append_section",
                        anchor={"type": "header", "value": s.anchor_hint or "Setup"},
                        content_template="",  # Will be filled by LLM generation
                        rationale=s.source.get("evidence", ""),
                        suggestion_id=s.id,
                    ))
                # All actions now use full_replace mode
                # Reached for every pair that did not ``continue`` above,
                # including actions not matched by any branch.
                planned.append(PlannedEdit(
                    file_path=target,
                    edit_type="full_replace",
                    anchor={"type": "document", "value": "full_document"},
                    content_template="",  # Will be filled by LLM generation
                    rationale=s.source.get("evidence", ""),
                    suggestion_id=s.id,
                ))

        # If a file is planned for full_replace, suppress other edits for that file to avoid redundancy
        by_file: Dict[str, List[PlannedEdit]] = {}
        for e in planned:
            by_file.setdefault(e.file_path, []).append(e)
        filtered: List[PlannedEdit] = []
        for fpath, edits in by_file.items():
            has_full = any(e.edit_type == "full_replace" for e in edits)
            if has_full:
                filtered.extend([e for e in edits if e.edit_type == "full_replace"])
            else:
                filtered.extend(edits)

        return DocumentPlan(repo_path=repo_path, style_profile=style, planned_edits=filtered)
|
|
188
|
+
|
|
189
|
+
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Tuple
|
|
4
|
+
|
|
5
|
+
from .models import PlannedEdit
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DocumentRenderer:
    """Applies one ``PlannedEdit`` to the text of a document."""

    def apply_edit(self, original: str, edit: PlannedEdit) -> Tuple[str, dict]:
        """Return the edited text and a small stats dict.

        Supported ``edit.edit_type`` values:

        * ``append_section`` -- append ``content_template`` at the end, unless
          the template's first line is a header already contained in the text.
        * ``replace_intro_block`` -- replace everything before the first
          ``## `` line (or the whole text when there is none).
        * ``insert_after_header`` / ``rmarkdown_integration`` -- splice the
          template into the section named by ``edit.anchor["value"]``, after
          that section's first fenced code block, or before the next ``#``/
          ``##`` header, or at the end of the text. Falls back to appending
          when no anchor is given or the header is not found.
        * ``full_replace`` -- the template becomes the whole document.

        Any other edit type leaves the text unchanged.

        Args:
            original: Current document text.
            edit: Object exposing ``edit_type``, ``content_template`` and
                (for the anchored types) an ``anchor`` mapping.

        Returns:
            ``(new_text, {"added_lines": n})`` where ``n`` counts the template
            lines that were written (0 when nothing changed).
        """
        kind = edit.edit_type
        template = edit.content_template

        if kind == "append_section":
            # Avoid duplicate header if the same header already exists.
            # This is a substring test against the whole document.
            if template.lstrip().startswith("#"):
                header_line = template.strip().splitlines()[0].strip()
                if header_line and header_line in original:
                    return original, {"added_lines": 0}
            return self._append(original, template)

        if kind == "replace_intro_block":
            lines = original.splitlines()
            first_h2 = next(
                (i for i, ln in enumerate(lines) if ln.strip().startswith("## ")),
                None,
            )
            if first_h2 is None:
                # No H2 header found; replace entire content
                new_text = template
            else:
                new_text = template.rstrip() + "\n\n" + "\n".join(lines[first_h2:])
            return new_text, {"added_lines": len(template.splitlines())}

        if kind in ("insert_after_header", "rmarkdown_integration"):
            header_value = edit.anchor.get("value", "")
            if not header_value:
                # No header specified, append at end
                return self._append(original, template)
            lines = original.splitlines()
            insert_idx = self._find_insertion_index(lines, header_value)
            if insert_idx is None:
                # Header not found, append at end
                return self._append(original, template)
            if kind == "insert_after_header":
                new_lines = self._demote_headers_plain(template.splitlines())
            else:
                new_lines = self._demote_headers_rmarkdown(template.splitlines())
            # A single blank line separates the existing text from the insert.
            merged = lines[:insert_idx] + [""] + new_lines + lines[insert_idx:]
            return "\n".join(merged), {"added_lines": len(new_lines)}

        if kind == "full_replace":
            # Replace entire document content
            return template, {"added_lines": len(template.splitlines())}

        # Other edit types (replace_block) can be added as needed
        return original, {"added_lines": 0}

    @staticmethod
    def _append(content: str, template: str) -> Tuple[str, dict]:
        """Append ``template`` to ``content`` separated by a blank line."""
        sep = "" if content.endswith("\n\n") else "\n\n"
        return f"{content}{sep}{template}", {"added_lines": len(template.splitlines())}

    @staticmethod
    def _find_insertion_index(lines: list[str], header_value: str) -> int | None:
        """Locate where to splice new content into the section ``header_value``.

        Returns ``None`` when no header line contains ``header_value``
        (case-insensitive). Lines inside fenced code blocks are ignored while
        searching for the header, so a ``# comment`` in a code chunk is not
        mistaken for a heading.
        """
        needle = header_value.lower()
        in_fence = False
        idx = None
        for i, line in enumerate(lines):
            stripped = line.strip()
            if stripped.startswith("```"):
                in_fence = not in_fence
                continue
            if not in_fence and stripped.startswith("#") and needle in line.lower():
                idx = i + 1
                break
        if idx is None:
            return None

        total = len(lines)
        while idx < total:
            stripped = lines[idx].strip()
            if stripped.startswith("#") and not stripped.startswith("###"):
                # Next major section: insert before it.
                return idx
            if stripped.startswith("```"):
                # This is the OPENING fence; walk to the matching closing
                # fence so the insert lands after the block, not inside it.
                idx += 1
                while idx < total and not lines[idx].strip().startswith("```"):
                    idx += 1
                # Unterminated fence: fall back to the end of the document.
                return min(idx + 1, total)
            idx += 1
        return idx

    @staticmethod
    def _demote_headers_plain(template_lines: list[str]) -> list[str]:
        """Turn short ``## `` headers into ``**Note:**`` lines (insert_after_header)."""
        result = []
        for line in template_lines:
            stripped = line.strip()
            if stripped.startswith("## ") and len(stripped) < 50:
                # Convert major headers to minor explanations
                result.append(f"\n**Note:** {stripped[3:].strip().lower()}")
            else:
                result.append(line)
        return result

    @staticmethod
    def _demote_headers_rmarkdown(template_lines: list[str]) -> list[str]:
        """Turn ``## ``/``# `` headers into contextual notes (rmarkdown_integration)."""
        result = []
        for line in template_lines:
            stripped = line.strip()
            if stripped.startswith("## "):
                result.append(f"\n**Note:** For this tutorial, {stripped[3:].strip().lower()}")
            elif stripped.startswith("# "):
                result.append(f"\n**Important:** {stripped[2:].strip().lower()}")
            else:
                result.append(line)
        return result
|
|
156
|
+
|
|
157
|
+
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from langchain_openai.chat_models.base import BaseChatOpenAI
|
|
4
|
+
|
|
5
|
+
from bioguider.agents.common_conversation import CommonConversation
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# System prompt for the clean-up pass. "{doc}" is the only placeholder; the
# text also contains a literal "{r}", so it must not be passed to str.format.
CLEANUP_PROMPT = """
You are "BioGuider," a precise editor for biomedical/bioinformatics documentation.

TASK
Given a documentation file (README, RMarkdown, or other), produce a corrected version that:
- Fixes typos, grammar, capitalization, and spacing
- Corrects malformed markdown (headers, lists, links, code fences)
- Repairs or normalizes link formatting; keep URLs absolute if present
- Removes duplicated sections or repeated content; consolidate if needed
- Preserves technical accuracy and biomedical domain terminology (do not invent features)
- Keeps tone neutral and professional; avoid marketing language
- Preserves all valid information; do not delete content unless it is a duplicate or malformed
- For RMarkdown files (.Rmd): Preserve YAML frontmatter, R code chunks, and existing structure exactly

CRITICAL REQUIREMENTS:
- Do NOT wrap the entire document in markdown code fences (```markdown). Return pure content only.
- If the document starts with ```markdown and ends with ```, remove these fences completely.
- Do NOT modify YAML frontmatter in RMarkdown files
- Do NOT modify R code chunks (```{r} blocks) in RMarkdown files
- Do NOT change the overall structure or organization of the document

ABSOLUTELY FORBIDDEN - REMOVE THESE COMPLETELY:
- Any summary sections, concluding statements, or notes at the end of documents
- Phrases like "Happy analyzing!", "Ensure all dependencies are up-to-date", "This concludes", "For more information"
- Any text that appears to be AI-generated summaries or conclusions
- Sentences starting with "Note:", "Remember:", "Important:", "Tip:", "Warning:" at the end
- Any text after the last substantive content section
- Phrases like "Happy coding!", "Good luck!", "Enjoy!", "Have fun!"
- Any concluding remarks, final thoughts, or wrap-up statements
- Text that sounds like AI-generated advice or encouragement

DOCUMENT ENDING RULES:
- The document must end naturally with the last substantive content section
- Do NOT add any concluding statements, summaries, or notes
- If the original document had a natural ending, preserve it exactly
- If AI-added content appears at the end, remove it completely

INPUT
<<DOCUMENT>>
{doc}
<</DOCUMENT>>

OUTPUT
Return ONLY the revised content (no commentary, no explanations, no code fences).
"""


class LLMCleaner:
    """Runs a documentation file through the LLM clean-up prompt."""

    # Only this many leading characters of the document are sent to the model.
    # NOTE(review): longer documents are cut off here, so the returned text
    # can only cover the truncated prefix -- confirm callers account for this.
    MAX_DOC_CHARS = 30000

    def __init__(self, llm: BaseChatOpenAI):
        """Store the chat model used for clean-up requests."""
        self.llm = llm

    @staticmethod
    def _build_system_prompt(content: str) -> str:
        """Insert the (truncated) document into ``CLEANUP_PROMPT``.

        Plain substitution is used instead of ``str.format``: the prompt text
        contains a literal ``{r}``, which ``format`` would treat as a missing
        field and reject with ``KeyError``.
        """
        return CLEANUP_PROMPT.replace("{doc}", content[:LLMCleaner.MAX_DOC_CHARS])

    def clean_readme(self, content: str) -> tuple[str, dict]:
        """Ask the model for a corrected version of ``content``.

        Args:
            content: Raw documentation text.

        Returns:
            ``(cleaned_text, token_usage)`` where ``cleaned_text`` is the
            model output with surrounding whitespace stripped and
            ``token_usage`` is whatever ``CommonConversation.generate``
            reports.
        """
        conv = CommonConversation(self.llm)
        output, token_usage = conv.generate(
            system_prompt=self._build_system_prompt(content),
            instruction_prompt="Provide the corrected documentation content only.",
        )
        return output.strip(), token_usage
|
|
66
|
+
|
|
67
|
+
|