bioguider 0.2.19__py3-none-any.whl → 0.2.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of bioguider might be problematic.
- bioguider/agents/agent_utils.py +18 -10
- bioguider/agents/collection_execute_step.py +1 -1
- bioguider/agents/collection_observe_step.py +7 -2
- bioguider/agents/collection_task_utils.py +1 -0
- bioguider/agents/common_conversation.py +20 -2
- bioguider/agents/consistency_collection_step.py +100 -0
- bioguider/agents/consistency_evaluation_task.py +56 -0
- bioguider/agents/consistency_evaluation_task_utils.py +13 -0
- bioguider/agents/consistency_observe_step.py +107 -0
- bioguider/agents/consistency_query_step.py +74 -0
- bioguider/agents/evaluation_task.py +2 -2
- bioguider/agents/evaluation_userguide_prompts.py +162 -0
- bioguider/agents/evaluation_userguide_task.py +131 -0
- bioguider/agents/prompt_utils.py +15 -8
- bioguider/database/code_structure_db.py +489 -0
- bioguider/generation/__init__.py +39 -0
- bioguider/generation/change_planner.py +140 -0
- bioguider/generation/document_renderer.py +47 -0
- bioguider/generation/llm_cleaner.py +43 -0
- bioguider/generation/llm_content_generator.py +69 -0
- bioguider/generation/llm_injector.py +270 -0
- bioguider/generation/models.py +77 -0
- bioguider/generation/output_manager.py +54 -0
- bioguider/generation/repo_reader.py +37 -0
- bioguider/generation/report_loader.py +151 -0
- bioguider/generation/style_analyzer.py +36 -0
- bioguider/generation/suggestion_extractor.py +136 -0
- bioguider/generation/test_metrics.py +104 -0
- bioguider/managers/evaluation_manager.py +24 -0
- bioguider/managers/generation_manager.py +160 -0
- bioguider/managers/generation_test_manager.py +74 -0
- bioguider/utils/code_structure_builder.py +47 -0
- bioguider/utils/constants.py +12 -12
- bioguider/utils/python_file_handler.py +65 -0
- bioguider/utils/r_file_handler.py +368 -0
- bioguider/utils/utils.py +34 -1
- {bioguider-0.2.19.dist-info → bioguider-0.2.21.dist-info}/METADATA +1 -1
- bioguider-0.2.21.dist-info/RECORD +77 -0
- bioguider-0.2.19.dist-info/RECORD +0 -51
- {bioguider-0.2.19.dist-info → bioguider-0.2.21.dist-info}/LICENSE +0 -0
- {bioguider-0.2.19.dist-info → bioguider-0.2.21.dist-info}/WHEEL +0 -0
bioguider/generation/document_renderer.py (new file)

@@ -0,0 +1,47 @@

````python
from __future__ import annotations

from typing import Tuple

from .models import PlannedEdit


class DocumentRenderer:
    def apply_edit(self, original: str, edit: PlannedEdit) -> Tuple[str, dict]:
        content = original
        added = 0

        if edit.edit_type == "append_section":
            # Avoid duplicate header if the same header already exists
            header_line = None
            if edit.content_template.lstrip().startswith("#"):
                header_line = edit.content_template.strip().splitlines()[0].strip()
            if header_line and header_line in content:
                return content, {"added_lines": 0}
            # Append with two leading newlines if needed
            sep = "\n\n" if not content.endswith("\n\n") else ""
            content = f"{content}{sep}{edit.content_template}"
            added = len(edit.content_template.splitlines())

        elif edit.edit_type == "replace_intro_block":
            # Replace content from start to first level-2 header (##) with new intro
            lines = content.splitlines()
            end_idx = None
            for i, ln in enumerate(lines):
                if ln.strip().startswith("## "):
                    end_idx = i
                    break
            if end_idx is None:
                # No H2 header found; replace entire content
                new_content = edit.content_template
            else:
                head = lines[:0]
                tail = lines[end_idx:]
                new_content = edit.content_template.rstrip() + "\n\n" + "\n".join(tail)
            added = len(edit.content_template.splitlines())
            content = new_content

        # Other edit types (insert_after_header, replace_block) can be added as needed

        return content, {"added_lines": added}
````
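A minimal sketch of how the new renderer might be called; the README string and the `PlannedEdit` values below are illustrative placeholders, not content from the package.

````python
from bioguider.generation.document_renderer import DocumentRenderer
from bioguider.generation.models import PlannedEdit

# Hypothetical README and edit; apply_edit currently handles the
# "append_section" and "replace_intro_block" edit types shown above.
readme = "# MyTool\n\nShort intro.\n\n## Install\npip install mytool\n"
edit = PlannedEdit(
    file_path="README.md",
    edit_type="append_section",
    content_template="## License\nReleased under the MIT license; see LICENSE.",
    rationale="license section missing",
)

renderer = DocumentRenderer()
new_readme, stats = renderer.apply_edit(readme, edit)
print(stats)  # {"added_lines": 2} for the two-line template above
````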
bioguider/generation/llm_cleaner.py (new file)

@@ -0,0 +1,43 @@

````python
from __future__ import annotations

from langchain_openai.chat_models.base import BaseChatOpenAI

from bioguider.agents.common_conversation import CommonConversation


CLEANUP_PROMPT = """
You are “BioGuider,” a precise editor for biomedical/bioinformatics documentation.

TASK
Given a full README markdown, produce a corrected version that:
- Fixes typos, grammar, capitalization, and spacing
- Corrects malformed markdown (headers, lists, links, code fences)
- Repairs or normalizes link formatting; keep URLs absolute if present
- Removes duplicated sections or repeated content; consolidate if needed
- Preserves technical accuracy and biomedical domain terminology (do not invent features)
- Keeps tone neutral and professional; avoid marketing language
- Preserves all valid information; do not delete content unless it is a duplicate or malformed

INPUT
<<README>>
{readme}
<</README>>

OUTPUT
Return ONLY the revised markdown (no commentary, no explanations).
"""


class LLMCleaner:
    def __init__(self, llm: BaseChatOpenAI):
        self.llm = llm

    def clean_readme(self, content: str) -> tuple[str, dict]:
        conv = CommonConversation(self.llm)
        output, token_usage = conv.generate(
            system_prompt=CLEANUP_PROMPT.format(readme=content[:30000]),
            instruction_prompt="Provide the corrected README markdown only.",
        )
        return output.strip(), token_usage
````
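A usage sketch for `LLMCleaner`, assuming an OpenAI-backed chat model from `langchain_openai`; the model name and README path are placeholders.

````python
from langchain_openai import ChatOpenAI

from bioguider.generation.llm_cleaner import LLMCleaner

# Placeholder model name; assumes OPENAI_API_KEY is set in the environment.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
cleaner = LLMCleaner(llm)

with open("README.md", encoding="utf-8") as fh:
    raw_readme = fh.read()

# clean_readme truncates input to 30,000 characters before prompting.
cleaned_markdown, token_usage = cleaner.clean_readme(raw_readme)
print(token_usage)
````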
bioguider/generation/llm_content_generator.py (new file)

@@ -0,0 +1,69 @@

````python
from __future__ import annotations

from typing import Dict
from langchain_openai.chat_models.base import BaseChatOpenAI

from bioguider.agents.common_conversation import CommonConversation
from .models import StyleProfile, SuggestionItem


LLM_SECTION_PROMPT = """
You are “BioGuider,” a concise documentation generator for biomedical/bioinformatics software.

GOAL
Write or refine a single documentation section named "{section}". Produce minimal, style-consistent content that addresses only this section.

INPUTS (use only what is provided; never invent)
- suggestion_category: {suggestion_category}
- anchor_title: {anchor_title}
- guidance: {guidance}
- evidence_from_evaluation: {evidence}
- repo_context_excerpt (analyze tone/formatting; do not paraphrase it blindly): <<{context}>>

STYLE & CONSTRAINTS
- Preserve the existing tone and style markers: {tone_markers}
- Use heading style "{heading_style}" and list style "{list_style}"; link style "{link_style}".
- Neutral, professional tone; avoid marketing claims.
- Omit details you cannot substantiate from inputs/context; do not invent.
- Prefer bullets; keep it short and skimmable.
- Biomedical examples must avoid PHI; assume de-identified data.
- Output must be plain markdown for this section only, with no commentary and no backticks.
- Avoid duplication: if similar content exists in the repo context, rewrite succinctly instead of repeating.

SECTION GUIDELINES
- Dependencies: short bullet list; clearly separate Mandatory and Optional if applicable; avoid version numbers unless present in context.
- System Requirements: runtime versions and supported OS; add hardware notes only if guidance provides specifics.
- Hardware Requirements: brief bullets with RAM/CPU only if guidance includes numbers.
- License: one sentence referencing the license and pointing to the LICENSE file.
- Install (clarify dependencies): bullets under Mandatory and Optional.
- If the section does not fit the above, produce a concise, accurate subsection aligned with the repo’s style.

OUTPUT FORMAT
- Return only the section markdown (no code fences).
- Start with a level-2 header: "## {anchor_title}" unless the content already starts with a header.
"""


class LLMContentGenerator:
    def __init__(self, llm: BaseChatOpenAI):
        self.llm = llm

    def generate_section(self, suggestion: SuggestionItem, style: StyleProfile, context: str = "") -> tuple[str, dict]:
        conv = CommonConversation(self.llm)
        section_name = suggestion.anchor_hint or suggestion.category.split(".")[-1].replace("_", " ").title()
        system_prompt = LLM_SECTION_PROMPT.format(
            tone_markers=", ".join(style.tone_markers or []),
            heading_style=style.heading_style,
            list_style=style.list_style,
            link_style=style.link_style,
            section=section_name,
            anchor_title=section_name,
            suggestion_category=suggestion.category,
            evidence=(suggestion.source.get("evidence", "") if suggestion.source else ""),
            context=context[:2500],
            guidance=(suggestion.content_guidance or "").strip(),
        )
        content, token_usage = conv.generate(system_prompt=system_prompt, instruction_prompt="Write the section content now.")
        return content.strip(), token_usage
````
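A sketch of driving `LLMContentGenerator` with the `SuggestionItem` and `StyleProfile` models added in this release; the suggestion fields, context string, and model name are illustrative.

````python
from langchain_openai import ChatOpenAI

from bioguider.generation.llm_content_generator import LLMContentGenerator
from bioguider.generation.models import StyleProfile, SuggestionItem

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)  # placeholder model name
generator = LLMContentGenerator(llm)

# Illustrative suggestion: request a "Dependencies" section from evaluator evidence.
suggestion = SuggestionItem(
    id="s1",
    category="readme.dependencies",
    action="add_section",
    anchor_hint="Dependencies",
    content_guidance="List Python >= 3.10, pandas, and scanpy as mandatory dependencies.",
    source={"evidence": "README lacks a dependencies section."},
)
style = StyleProfile()  # defaults: "#" headings, "-" lists, inline links

section_markdown, token_usage = generator.generate_section(suggestion, style, context="# MyTool ...")
print(section_markdown)
````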
bioguider/generation/llm_injector.py (new file)

@@ -0,0 +1,270 @@

````python
from __future__ import annotations

import json
from typing import Tuple, Dict, Any, List, Set
import re
from difflib import SequenceMatcher

from langchain_openai.chat_models.base import BaseChatOpenAI
from bioguider.agents.common_conversation import CommonConversation
from bioguider.utils.utils import escape_braces


INJECTION_PROMPT = """
You are “BioGuider-Intro,” generating a deliberately flawed **INTRODUCTION** file
(“README-lite”) to test an auto-fixer. Start from the provided clean INTRO doc that follows the
BioGuider Intro structure (What is it? / What can it do? / Requirements / Install / Quick example /
Learn more / License & Contact). Produce a corrupted version with small, realistic defects.

GOAL
Introduce subtle but meaningful issues while keeping the document recognizably the same.

ERROR CATEGORIES (inject all)
- typo: spelling/grammar/punctuation mistakes
- link: malformed URL, wrong domain, or stray spaces in URL
- duplicate: duplicate a short line/section fragment
- bio_term: slightly wrong domain term (e.g., “single sell” for “single cell”); do not invent new science
- function: misspell a known function/API name **from the input README-lite only**
- markdown_structure: break a header level, list indentation, or code fence (one-off)

CONSTRAINTS
- Keep edits minimal and local; **≥85% token overlap** with input.
- **Preserve section ORDER and TITLES** from the Intro spec:
  1) # <project_name>
     _<tagline>_
  2) What is it?
  3) What can it do?
  4) Requirements
  5) Install
  6) Quick example
  7) Learn more
  8) License & Contact
- Do **not** add or remove top-level sections. Subtle line-level corruption only.
- Maintain a **concise length** (≤ {max_words} words).
- Do **not** alter the protected keywords (exact casing/spelling): {keywords}
- Keep at least **{min_per_category} errors per category** listed above.
- If the input contains runnable code, keep it mostly intact but introduce **one** realistic break
  (e.g., missing quote/paren or wrong function name) without adding new libraries.
- Keep at least one **valid** URL so the fixer can compare.
- Do not change the project identity, domain, or language.
- Do not include markers, explanations, or commentary in the corrupted markdown.

INPUT INTRO (clean README-lite)
<<INTRO>>
{readme}
<</INTRO>>

OUTPUT (JSON only):
{{
  "corrupted_markdown": "<the entire corrupted INTRO as markdown>",
  "errors": [
    {{
      "id": "e1",
      "category": "typo|link|duplicate|bio_term|function|markdown_structure",
      "rationale": "why this mutation is realistic",
      "original_snippet": "<verbatim snippet from input>",
      "mutated_snippet": "<verbatim mutated text>"
    }}
    // include one entry per individual mutation you applied
  ]
}}
"""


class LLMErrorInjector:
    def __init__(self, llm: BaseChatOpenAI):
        self.llm = llm

    def inject(self, readme_text: str, min_per_category: int = 3, preserve_keywords: list[str] | None = None, max_words: int = 450) -> Tuple[str, Dict[str, Any]]:
        conv = CommonConversation(self.llm)
        preserve_keywords = preserve_keywords or self._extract_preserve_keywords(readme_text)
        system_prompt = escape_braces(INJECTION_PROMPT).format(
            readme=readme_text[:30000],
            min_per_category=min_per_category,
            keywords=", ".join(preserve_keywords) if preserve_keywords else "",
            max_words=max_words,
        )
        output, _ = conv.generate(system_prompt=system_prompt, instruction_prompt="Return the JSON now.")
        try:
            data = json.loads(output)
        except Exception:
            # try to locate JSON block
            start = output.find("{")
            end = output.rfind("}")
            data = json.loads(output[start:end+1]) if start != -1 and end != -1 else {"corrupted_markdown": readme_text, "errors": []}
        corrupted = data.get("corrupted_markdown", readme_text)
        # Validate output stays within original context; fallback to deterministic if invalid
        if not self._validate_corrupted(readme_text, corrupted, preserve_keywords):
            corrupted, data = self._deterministic_inject(readme_text)
        # Supplement to satisfy minimum per-category counts using deterministic local edits
        corrupted, data = self._supplement_errors(readme_text, corrupted, data, min_per_category)
        manifest = {
            "errors": data.get("errors", []),
        }
        return corrupted, manifest

    def _extract_preserve_keywords(self, text: str) -> List[str]:
        # Extract capitalized terms, domain hyphenations, and hostnames in links
        kws: Set[str] = set()
        for m in re.finditer(r"\b[A-Z][A-Za-z0-9\-/]{2,}(?:\s[A-Z][A-Za-z0-9\-/]{2,})*\b", text):
            term = m.group(0)
            if len(term) <= 40:
                kws.add(term)
        for m in re.finditer(r"\b[\w]+-[\w]+\b", text):
            if any(ch.isalpha() for ch in m.group(0)):
                kws.add(m.group(0))
        for m in re.finditer(r"https?://([^/\s)]+)", text):
            kws.add(m.group(1))
        # Keep a small set to avoid over-constraining
        out = list(kws)[:20]
        return out

    def _validate_corrupted(self, baseline: str, corrupted: str, preserve_keywords: List[str]) -> bool:
        # Similarity threshold
        ratio = SequenceMatcher(None, baseline, corrupted).ratio()
        if ratio < 0.7:
            return False
        # Preserve keywords
        for k in preserve_keywords:
            if k and k not in corrupted:
                return False
        # No new top-level sections
        base_h2 = set([ln.strip() for ln in baseline.splitlines() if ln.strip().startswith("## ")])
        corr_h2 = set([ln.strip() for ln in corrupted.splitlines() if ln.strip().startswith("## ")])
        if not corr_h2.issubset(base_h2.union({"## Overview", "## Hardware Requirements", "## License", "## Usage", "## Dependencies", "## System Requirements"})):
            return False
        # New token ratio
        btoks = set(re.findall(r"[A-Za-z0-9_\-]+", baseline.lower()))
        ctoks = set(re.findall(r"[A-Za-z0-9_\-]+", corrupted.lower()))
        new_ratio = len(ctoks - btoks) / max(1, len(ctoks))
        if new_ratio > 0.25:
            return False
        return True

    def _deterministic_inject(self, baseline: str) -> Tuple[str, Dict[str, Any]]:
        errors: List[Dict[str, Any]] = []
        text = baseline
        # typo
        if "successfully" in text:
            text = text.replace("successfully", "succesfully", 1)
            errors.append({"id": "e_typo_1", "category": "typo", "original_snippet": "successfully", "mutated_snippet": "succesfully", "rationale": "common misspelling"})
        elif "installation" in text:
            text = text.replace("installation", "instalation", 1)
            errors.append({"id": "e_typo_1", "category": "typo", "original_snippet": "installation", "mutated_snippet": "instalation", "rationale": "common misspelling"})
        # link
        m = re.search(r"\]\(https?://[^)]+\)", text)
        if m:
            broken = m.group(0).replace("https://", "https//")
            text = text.replace(m.group(0), broken, 1)
            errors.append({"id": "e_link_1", "category": "link", "original_snippet": m.group(0), "mutated_snippet": broken, "rationale": "missing colon in scheme"})
        # duplicate a small section (next header and paragraph)
        lines = text.splitlines()
        dup_idx = next((i for i, ln in enumerate(lines) if ln.strip().startswith("## ")), None)
        if dup_idx is not None:
            block = lines[dup_idx: min(len(lines), dup_idx+5)]
            text = "\n".join(lines + ["", *block])
            errors.append({"id": "e_dup_1", "category": "duplicate", "original_snippet": "\n".join(block), "mutated_snippet": "\n".join(block), "rationale": "duplicated section"})
        # markdown structure: break a header
        if "\n# " in text:
            text = text.replace("\n# ", "\n#", 1)
            errors.append({"id": "e_md_1", "category": "markdown_structure", "original_snippet": "\n# ", "mutated_snippet": "\n#", "rationale": "missing space in header"})
        return text, {"errors": errors}

    def _supplement_errors(self, baseline: str, corrupted: str, data: Dict[str, Any], min_per_category: int) -> Tuple[str, Dict[str, Any]]:
        errors: List[Dict[str, Any]] = data.get("errors", []) or []
        cat_counts: Dict[str, int] = {}
        for e in errors:
            cat = e.get("category", "")
            cat_counts[cat] = cat_counts.get(cat, 0) + 1

        def need(cat: str) -> int:
            return max(0, min_per_category - cat_counts.get(cat, 0))

        # typo supplements
        for _ in range(need("typo")):
            m = re.search(r"\b(installation|successfully|analysis|documentation|maintained|example|requirements|license|tutorials)\b", corrupted, flags=re.I)
            if not m:
                m = re.search(r"\b[A-Za-z]{6,}\b", corrupted)
            if not m:
                break
            orig = m.group(0)
            mut = orig[:-1] if len(orig) > 3 else orig + "e"
            corrupted = corrupted.replace(orig, mut, 1)
            errors.append({"id": f"e_typo_sup_{len(errors)}", "category": "typo", "original_snippet": orig, "mutated_snippet": mut, "rationale": "minor misspelling"})

        # link supplements
        for _ in range(need("link")):
            m = re.search(r"\[[^\]]+\]\(https?://[^)]+\)", corrupted)
            if not m:
                break
            orig = m.group(0)
            mut = orig.replace("https://", "https//", 1)
            if mut == orig:
                mut = orig.replace("http://", "http//", 1)
            corrupted = corrupted.replace(orig, mut, 1)
            errors.append({"id": f"e_link_sup_{len(errors)}", "category": "link", "original_snippet": orig, "mutated_snippet": mut, "rationale": "scheme colon removed"})

        # duplicate supplements
        for _ in range(need("duplicate")):
            lines = corrupted.splitlines()
            idx = next((i for i, ln in enumerate(lines) if ln.strip().startswith("- ") or ln.strip().startswith("## ")), None)
            if idx is None:
                break
            frag = lines[idx]
            lines = lines[:idx+1] + [frag] + lines[idx+1:]
            corrupted = "\n".join(lines)
            errors.append({"id": f"e_dup_sup_{len(errors)}", "category": "duplicate", "original_snippet": frag, "mutated_snippet": frag, "rationale": "line duplicated"})

        # bio_term supplements
        bio_swaps = [(r"single cell", "single sell"), (r"genomics", "genomis"), (r"spatial", "spacial")]
        for _ in range(need("bio_term")):
            made = False
            for pat, rep in bio_swaps:
                m = re.search(pat, corrupted, flags=re.I)
                if m:
                    orig = m.group(0)
                    mut = rep if orig.islower() else rep.title()
                    corrupted = corrupted.replace(orig, mut, 1)
                    errors.append({"id": f"e_bio_sup_{len(errors)}", "category": "bio_term", "original_snippet": orig, "mutated_snippet": mut, "rationale": "common domain typo"})
                    made = True
                    break
            if not made:
                break

        # function supplements
        for _ in range(need("function")):
            m = re.search(r"\b([A-Za-z_][A-Za-z0-9_]*)\(", corrupted)
            if not m:
                break
            fname = m.group(1)
            if len(fname) > 3:
                mut = fname[:-1]
            else:
                mut = fname + "x"
            orig = fname + "("
            mutated = mut + "("
            corrupted = corrupted.replace(orig, mutated, 1)
            errors.append({"id": f"e_func_sup_{len(errors)}", "category": "function", "original_snippet": orig, "mutated_snippet": mutated, "rationale": "misspelled API name"})

        # markdown_structure supplements
        for _ in range(need("markdown_structure")):
            m = re.search(r"^## \s*", corrupted, flags=re.M)
            if m:
                orig = m.group(0)
                mut = orig.replace("## ", "##", 1)
                corrupted = corrupted.replace(orig, mut, 1)
                errors.append({"id": f"e_md_sup_{len(errors)}", "category": "markdown_structure", "original_snippet": orig, "mutated_snippet": mut, "rationale": "removed header space"})
            else:
                fence = re.search(r"```[A-Za-z]*\n[\s\S]*?```", corrupted)
                if fence:
                    block = fence.group(0)
                    mut = block.rstrip("`")  # drop a backtick
                    corrupted = corrupted.replace(block, mut, 1)
                    errors.append({"id": f"e_md_sup_{len(errors)}", "category": "markdown_structure", "original_snippet": block[:10], "mutated_snippet": mut[:10], "rationale": "broken code fence"})
                else:
                    break

        data["errors"] = errors
        return corrupted, data
````
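The injector's round trip can be sketched as below; the model name and input file are placeholders, and `min_per_category=2` simply overrides the default of 3 shown in `inject`.

````python
from langchain_openai import ChatOpenAI

from bioguider.generation.llm_injector import LLMErrorInjector

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)  # placeholder model name
injector = LLMErrorInjector(llm)

with open("README.md", encoding="utf-8") as fh:
    clean_intro = fh.read()

# Returns the corrupted markdown plus a manifest listing every injected error;
# invalid LLM output falls back to the deterministic injection path.
corrupted, manifest = injector.inject(clean_intro, min_per_category=2)
for err in manifest["errors"]:
    print(err["category"], "->", err.get("mutated_snippet", ""))
````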
bioguider/generation/models.py (new file)

@@ -0,0 +1,77 @@

````python
from __future__ import annotations

from typing import Optional, List, Dict, Any
from pydantic import BaseModel, Field


class EvaluationReport(BaseModel):
    timestamp: Optional[str] = None
    repo_url: Optional[str] = None

    installation_evaluation: Optional[Dict[str, Any]] = None
    installation_files: Optional[List[str]] = None

    readme_evaluation: Optional[Dict[str, Any]] = None
    readme_files: Optional[List[str]] = None

    submission_requirements_evaluation: Optional[Dict[str, Any]] = None
    submission_requirements_files: Optional[List[str]] = None


class SuggestionItem(BaseModel):
    id: str
    category: str
    severity: str = Field(default="should_fix")
    source: Dict[str, str] = Field(default_factory=dict)
    target_files: List[str] = Field(default_factory=list)
    action: str
    anchor_hint: Optional[str] = None
    content_guidance: Optional[str] = None


class StyleProfile(BaseModel):
    heading_style: str = Field(default="#")
    list_style: str = Field(default="-")
    code_fence_style: str = Field(default="```")
    tone_markers: List[str] = Field(default_factory=list)
    link_style: str = Field(default="inline")


class PlannedEdit(BaseModel):
    file_path: str
    edit_type: str
    anchor: Dict[str, str] = Field(default_factory=dict)
    content_template: str
    rationale: str
    minimal_diff: bool = Field(default=True)
    suggestion_id: Optional[str] = None


class DocumentPlan(BaseModel):
    repo_path: str
    style_profile: StyleProfile
    planned_edits: List[PlannedEdit] = Field(default_factory=list)


class OutputArtifact(BaseModel):
    dest_rel_path: str
    original_rel_path: str
    change_summary: str
    diff_stats: Dict[str, int] = Field(default_factory=dict)


class GenerationManifest(BaseModel):
    repo_url: Optional[str] = None
    report_path: Optional[str] = None
    output_dir: Optional[str] = None
    suggestions: List[SuggestionItem] = Field(default_factory=list)
    planned_edits: List[PlannedEdit] = Field(default_factory=list)
    artifacts: List[OutputArtifact] = Field(default_factory=list)
    skipped: List[str] = Field(default_factory=list)

class GenerationReport(BaseModel):
    repo_url: Optional[str] = None
    output_dir: Optional[str] = None
    sections: List[Dict[str, Any]] = Field(default_factory=list)
````
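Since these are Pydantic models, a plan assembled from them serializes directly, which is how the manifest is written by the output manager below; the paths and edit values in this sketch are illustrative.

````python
from bioguider.generation.models import DocumentPlan, PlannedEdit, StyleProfile

plan = DocumentPlan(
    repo_path="/tmp/mytool",  # illustrative path
    style_profile=StyleProfile(),
    planned_edits=[
        PlannedEdit(
            file_path="README.md",
            edit_type="append_section",
            content_template="## License\nMIT; see LICENSE.",
            rationale="license section missing",
        )
    ],
)

# Pydantic v2 serialization, matching the model_dump() call used in OutputManager.
print(plan.model_dump_json(indent=2))
````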
bioguider/generation/output_manager.py (new file)

@@ -0,0 +1,54 @@

````python
from __future__ import annotations

import os
import json
from datetime import datetime
from typing import Dict, List, Tuple

from .models import OutputArtifact, GenerationManifest, PlannedEdit


class OutputManager:
    def __init__(self, base_outputs_dir: str = "outputs"):
        self.base_outputs_dir = base_outputs_dir

    def prepare_output_dir(self, repo_url_or_name: str) -> str:
        repo_name = self._extract_repo_name(repo_url_or_name)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_dir = os.path.join(self.base_outputs_dir, f"{repo_name}", timestamp)
        os.makedirs(out_dir, exist_ok=True)
        return out_dir

    def _extract_repo_name(self, url_or_name: str) -> str:
        name = url_or_name.rstrip("/")
        if "/" in name:
            name = name.split("/")[-1]
        name = name.replace(".git", "")
        return name

    def write_files(self, output_dir: str, files: Dict[str, str], diff_stats_by_file: Dict[str, dict] | None = None) -> List[OutputArtifact]:
        artifacts: List[OutputArtifact] = []
        for rel_path, content in files.items():
            dest = os.path.join(output_dir, rel_path)
            os.makedirs(os.path.dirname(dest), exist_ok=True)
            with open(dest, "w", encoding="utf-8") as fobj:
                fobj.write(content)
            artifacts.append(OutputArtifact(
                dest_rel_path=rel_path,
                original_rel_path=rel_path,
                change_summary="revised document",
                diff_stats=(diff_stats_by_file or {}).get(rel_path, {})
            ))
        return artifacts

    def write_manifest(
        self,
        output_dir: str,
        manifest: GenerationManifest,
    ) -> str:
        dest = os.path.join(output_dir, "manifest.json")
        with open(dest, "w", encoding="utf-8") as fobj:
            json.dump(manifest.model_dump(), fobj, indent=2)
        return dest
````
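A sketch of the `OutputManager` flow; the repository URL and file contents are placeholders.

````python
from bioguider.generation.models import GenerationManifest
from bioguider.generation.output_manager import OutputManager

manager = OutputManager(base_outputs_dir="outputs")

# Creates outputs/<repo_name>/<timestamp>/ and returns the path.
out_dir = manager.prepare_output_dir("https://github.com/example/mytool.git")

artifacts = manager.write_files(out_dir, {"README.md": "# MyTool\n\nRevised intro.\n"})
manifest = GenerationManifest(output_dir=out_dir, artifacts=artifacts)
print(manager.write_manifest(out_dir, manifest))  # .../manifest.json
````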
bioguider/generation/repo_reader.py (new file)

@@ -0,0 +1,37 @@

````python
from __future__ import annotations

import os
from typing import Dict, Optional, List, Tuple


class RepoReader:
    def __init__(self, repo_path: str, gitignore_path: Optional[str] = None):
        self.repo_path = repo_path
        self.gitignore_path = gitignore_path

    def read_files(self, rel_paths: List[str]) -> Tuple[Dict[str, str], List[str]]:
        contents: Dict[str, str] = {}
        missing: List[str] = []
        for rel in rel_paths:
            abs_path = os.path.join(self.repo_path, rel)
            if not os.path.isfile(abs_path):
                missing.append(rel)
                continue
            try:
                with open(abs_path, "r", encoding="utf-8") as fobj:
                    contents[rel] = fobj.read()
            except Exception:
                missing.append(rel)
        return contents, missing

    def read_default_targets(self) -> Tuple[Dict[str, str], List[str]]:
        # Common targets we may need to modify
        candidates = [
            "README.md",
            "README.rst",
            "vignettes/install.Rmd",
            "vignettes/install_v5.Rmd",
        ]
        return self.read_files(candidates)
````
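A short sketch of the reader; the repository path is a placeholder.

````python
from bioguider.generation.repo_reader import RepoReader

reader = RepoReader("/path/to/cloned/repo")  # illustrative path

# Reads README.md, README.rst, and the two vignette files when present.
contents, missing = reader.read_default_targets()
print("found:", list(contents))
print("missing:", missing)
````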