bioguider 0.2.19__py3-none-any.whl → 0.2.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of bioguider might be problematic. Click here for more details.

Files changed (35) hide show
  1. bioguider/agents/agent_utils.py +5 -3
  2. bioguider/agents/collection_execute_step.py +1 -1
  3. bioguider/agents/common_conversation.py +20 -2
  4. bioguider/agents/consistency_collection_execute_step.py +152 -0
  5. bioguider/agents/consistency_collection_observe_step.py +128 -0
  6. bioguider/agents/consistency_collection_plan_step.py +128 -0
  7. bioguider/agents/consistency_collection_task.py +109 -0
  8. bioguider/agents/consistency_collection_task_utils.py +137 -0
  9. bioguider/agents/evaluation_task.py +2 -2
  10. bioguider/agents/evaluation_userguide_prompts.py +162 -0
  11. bioguider/agents/evaluation_userguide_task.py +164 -0
  12. bioguider/agents/prompt_utils.py +11 -8
  13. bioguider/database/code_structure_db.py +489 -0
  14. bioguider/generation/__init__.py +39 -0
  15. bioguider/generation/change_planner.py +140 -0
  16. bioguider/generation/document_renderer.py +47 -0
  17. bioguider/generation/llm_cleaner.py +43 -0
  18. bioguider/generation/llm_content_generator.py +69 -0
  19. bioguider/generation/llm_injector.py +270 -0
  20. bioguider/generation/models.py +77 -0
  21. bioguider/generation/output_manager.py +54 -0
  22. bioguider/generation/repo_reader.py +37 -0
  23. bioguider/generation/report_loader.py +151 -0
  24. bioguider/generation/style_analyzer.py +36 -0
  25. bioguider/generation/suggestion_extractor.py +136 -0
  26. bioguider/generation/test_metrics.py +104 -0
  27. bioguider/managers/evaluation_manager.py +24 -0
  28. bioguider/managers/generation_manager.py +160 -0
  29. bioguider/managers/generation_test_manager.py +74 -0
  30. bioguider/utils/code_structure_builder.py +42 -0
  31. bioguider/utils/file_handler.py +65 -0
  32. {bioguider-0.2.19.dist-info → bioguider-0.2.20.dist-info}/METADATA +1 -1
  33. {bioguider-0.2.19.dist-info → bioguider-0.2.20.dist-info}/RECORD +35 -10
  34. {bioguider-0.2.19.dist-info → bioguider-0.2.20.dist-info}/LICENSE +0 -0
  35. {bioguider-0.2.19.dist-info → bioguider-0.2.20.dist-info}/WHEEL +0 -0
@@ -0,0 +1,47 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Tuple
4
+
5
+ from .models import PlannedEdit
6
+
7
+
8
class DocumentRenderer:
    """Apply a single planned edit to a document's text.

    Only the ``append_section`` and ``replace_intro_block`` edit types are
    handled; any other ``edit_type`` leaves the text untouched.
    """

    def apply_edit(self, original: str, edit: PlannedEdit) -> Tuple[str, dict]:
        """Return the edited text and a stats dict ``{"added_lines": int}``.

        Args:
            original: Current document text.
            edit: Planned edit; only ``edit_type`` and ``content_template``
                are read here.

        Returns:
            ``(new_text, {"added_lines": n})`` where ``n`` is the number of
            lines in ``content_template`` when the edit was applied, else 0.
        """
        content = original
        added = 0
        template = edit.content_template

        if edit.edit_type == "append_section":
            # Avoid a duplicate header. Compare against whole (stripped) lines:
            # a plain substring test would treat "## Install" as already
            # present whenever "### Installation" appears in the document.
            header_line = None
            if template.lstrip().startswith("#"):
                header_line = template.strip().splitlines()[0].strip()
            if header_line:
                existing_lines = {ln.strip() for ln in content.splitlines()}
                if header_line in existing_lines:
                    return content, {"added_lines": 0}
            content = f"{content}{self._separator(content)}{template}"
            added = len(template.splitlines())

        elif edit.edit_type == "replace_intro_block":
            # Replace everything before the first level-2 header (##) with the
            # new intro; with no H2 present the whole document is replaced.
            lines = content.splitlines()
            end_idx = next(
                (i for i, ln in enumerate(lines) if ln.strip().startswith("## ")),
                None,
            )
            if end_idx is None:
                new_content = template
            else:
                tail = lines[end_idx:]
                new_content = template.rstrip() + "\n\n" + "\n".join(tail)
            added = len(template.splitlines())
            content = new_content

        # Other edit types (insert_after_header, replace_block) can be added as needed

        return content, {"added_lines": added}

    @staticmethod
    def _separator(content: str) -> str:
        """Return the text needed to leave exactly one blank line after ``content``."""
        if not content or content.endswith("\n\n"):
            return ""
        return "\n" if content.endswith("\n") else "\n\n"
46
+
47
+
@@ -0,0 +1,43 @@
1
+ from __future__ import annotations
2
+
3
+ from langchain_openai.chat_models.base import BaseChatOpenAI
4
+
5
+ from bioguider.agents.common_conversation import CommonConversation
6
+
7
+
8
# System-prompt template used to ask the model for a corrected README.
# It is filled with str.format(); `{readme}` is its only placeholder, so any
# literal brace added to this text later must be doubled.
CLEANUP_PROMPT = """
You are “BioGuider,” a precise editor for biomedical/bioinformatics documentation.

TASK
Given a full README markdown, produce a corrected version that:
- Fixes typos, grammar, capitalization, and spacing
- Corrects malformed markdown (headers, lists, links, code fences)
- Repairs or normalizes link formatting; keep URLs absolute if present
- Removes duplicated sections or repeated content; consolidate if needed
- Preserves technical accuracy and biomedical domain terminology (do not invent features)
- Keeps tone neutral and professional; avoid marketing language
- Preserves all valid information; do not delete content unless it is a duplicate or malformed

INPUT
<<README>>
{readme}
<</README>>

OUTPUT
Return ONLY the revised markdown (no commentary, no explanations).
"""
29
+
30
+
31
class LLMCleaner:
    """Ask the LLM for a typo/markdown-corrected version of a README."""

    # Maximum number of README characters placed into the prompt.
    MAX_README_CHARS = 30000

    def __init__(self, llm: BaseChatOpenAI):
        self.llm = llm

    def clean_readme(self, content: str) -> tuple[str, dict]:
        """Return ``(cleaned_markdown, token_usage)`` for ``content``.

        Only the first ``MAX_README_CHARS`` characters are sent to the model.
        Anything beyond that is re-attached unchanged after the cleaned part,
        so an over-long README is no longer silently cut off in the result.
        """
        head, tail = self._split_for_prompt(content)
        conv = CommonConversation(self.llm)
        output, token_usage = conv.generate(
            system_prompt=CLEANUP_PROMPT.format(readme=head),
            instruction_prompt="Provide the corrected README markdown only.",
        )
        cleaned = output.strip()
        if tail:
            # The tail was never shown to the model; keep it verbatim rather
            # than dropping valid content from the document.
            cleaned = f"{cleaned}\n{tail.rstrip()}"
        return cleaned, token_usage

    def _split_for_prompt(self, content: str) -> tuple[str, str]:
        """Split ``content`` into (part sent to the model, untouched remainder).

        The cut is moved back to the last line break before the limit when
        one exists, so a line is not split between the two parts.
        """
        limit = self.MAX_README_CHARS
        if len(content) <= limit:
            return content, ""
        cut = content.rfind("\n", 0, limit)
        cut = cut + 1 if cut > 0 else limit
        return content[:cut], content[cut:]
42
+
43
+
@@ -0,0 +1,69 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict
4
+ from langchain_openai.chat_models.base import BaseChatOpenAI
5
+
6
+ from bioguider.agents.common_conversation import CommonConversation
7
+ from .models import StyleProfile, SuggestionItem
8
+
9
+
10
# System-prompt template for generating one documentation section.
# Filled with str.format(); placeholders: section, suggestion_category,
# anchor_title, guidance, evidence, context, tone_markers, heading_style,
# list_style and link_style. Literal braces added here must be doubled.
LLM_SECTION_PROMPT = """
You are “BioGuider,” a concise documentation generator for biomedical/bioinformatics software.

GOAL
Write or refine a single documentation section named "{section}". Produce minimal, style-consistent content that addresses only this section.

INPUTS (use only what is provided; never invent)
- suggestion_category: {suggestion_category}
- anchor_title: {anchor_title}
- guidance: {guidance}
- evidence_from_evaluation: {evidence}
- repo_context_excerpt (analyze tone/formatting; do not paraphrase it blindly): <<{context}>>

STYLE & CONSTRAINTS
- Preserve the existing tone and style markers: {tone_markers}
- Use heading style "{heading_style}" and list style "{list_style}"; link style "{link_style}".
- Neutral, professional tone; avoid marketing claims.
- Omit details you cannot substantiate from inputs/context; do not invent.
- Prefer bullets; keep it short and skimmable.
- Biomedical examples must avoid PHI; assume de-identified data.
- Output must be plain markdown for this section only, with no commentary and no backticks.
- Avoid duplication: if similar content exists in the repo context, rewrite succinctly instead of repeating.

SECTION GUIDELINES
- Dependencies: short bullet list; clearly separate Mandatory and Optional if applicable; avoid version numbers unless present in context.
- System Requirements: runtime versions and supported OS; add hardware notes only if guidance provides specifics.
- Hardware Requirements: brief bullets with RAM/CPU only if guidance includes numbers.
- License: one sentence referencing the license and pointing to the LICENSE file.
- Install (clarify dependencies): bullets under Mandatory and Optional.
- If the section does not fit the above, produce a concise, accurate subsection aligned with the repo’s style.

OUTPUT FORMAT
- Return only the section markdown (no code fences).
- Start with a level-2 header: "## {anchor_title}" unless the content already starts with a header.
"""
45
+
46
+
47
class LLMContentGenerator:
    """Generate the markdown for one documentation section via the LLM."""

    def __init__(self, llm: BaseChatOpenAI):
        self.llm = llm

    def generate_section(self, suggestion: SuggestionItem, style: StyleProfile, context: str = "") -> tuple[str, dict]:
        """Return ``(section_markdown, token_usage)`` for ``suggestion``.

        The section title is the suggestion's ``anchor_hint`` when set;
        otherwise it is derived from the last dotted component of
        ``category`` (underscores become spaces, title-cased). At most the
        first 2500 characters of ``context`` are placed in the prompt.
        """
        conversation = CommonConversation(self.llm)

        if suggestion.anchor_hint:
            title = suggestion.anchor_hint
        else:
            last_component = suggestion.category.split(".")[-1]
            title = last_component.replace("_", " ").title()

        if suggestion.source:
            evidence_text = suggestion.source.get("evidence", "")
        else:
            evidence_text = ""

        prompt_fields = {
            "tone_markers": ", ".join(style.tone_markers or []),
            "heading_style": style.heading_style,
            "list_style": style.list_style,
            "link_style": style.link_style,
            "section": title,
            "anchor_title": title,
            "suggestion_category": suggestion.category,
            "evidence": evidence_text,
            "context": context[:2500],
            "guidance": (suggestion.content_guidance or "").strip(),
        }
        generated, usage = conversation.generate(
            system_prompt=LLM_SECTION_PROMPT.format(**prompt_fields),
            instruction_prompt="Write the section content now.",
        )
        return generated.strip(), usage
68
+
69
+
@@ -0,0 +1,270 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from typing import Tuple, Dict, Any, List, Set
5
+ import re
6
+ from difflib import SequenceMatcher
7
+
8
+ from langchain_openai.chat_models.base import BaseChatOpenAI
9
+ from bioguider.agents.common_conversation import CommonConversation
10
+ from bioguider.utils.utils import escape_braces
11
+
12
+
13
# System-prompt template asking the model to corrupt a clean intro README and
# report each mutation as JSON. Placeholders: max_words, keywords,
# min_per_category and readme. The doubled braces in the OUTPUT section are
# escaped literal JSON braces for str.format().
INJECTION_PROMPT = """
You are “BioGuider-Intro,” generating a deliberately flawed **INTRODUCTION** file
(“README-lite”) to test an auto-fixer. Start from the provided clean INTRO doc that follows the
BioGuider Intro structure (What is it? / What can it do? / Requirements / Install / Quick example /
Learn more / License & Contact). Produce a corrupted version with small, realistic defects.

GOAL
Introduce subtle but meaningful issues while keeping the document recognizably the same.

ERROR CATEGORIES (inject all)
- typo: spelling/grammar/punctuation mistakes
- link: malformed URL, wrong domain, or stray spaces in URL
- duplicate: duplicate a short line/section fragment
- bio_term: slightly wrong domain term (e.g., “single sell” for “single cell”); do not invent new science
- function: misspell a known function/API name **from the input README-lite only**
- markdown_structure: break a header level, list indentation, or code fence (one-off)

CONSTRAINTS
- Keep edits minimal and local; **≥85% token overlap** with input.
- **Preserve section ORDER and TITLES** from the Intro spec:
1) # <project_name>
_<tagline>_
2) What is it?
3) What can it do?
4) Requirements
5) Install
6) Quick example
7) Learn more
8) License & Contact
- Do **not** add or remove top-level sections. Subtle line-level corruption only.
- Maintain a **concise length** (≤ {max_words} words).
- Do **not** alter the protected keywords (exact casing/spelling): {keywords}
- Keep at least **{min_per_category} errors per category** listed above.
- If the input contains runnable code, keep it mostly intact but introduce **one** realistic break
(e.g., missing quote/paren or wrong function name) without adding new libraries.
- Keep at least one **valid** URL so the fixer can compare.
- Do not change the project identity, domain, or language.
- Do not include markers, explanations, or commentary in the corrupted markdown.

INPUT INTRO (clean README-lite)
<<INTRO>>
{readme}
<</INTRO>>

OUTPUT (JSON only):
{{
"corrupted_markdown": "<the entire corrupted INTRO as markdown>",
"errors": [
{{
"id": "e1",
"category": "typo|link|duplicate|bio_term|function|markdown_structure",
"rationale": "why this mutation is realistic",
"original_snippet": "<verbatim snippet from input>",
"mutated_snippet": "<verbatim mutated text>"
}}
// include one entry per individual mutation you applied
]
}}
"""
72
+
73
+
74
class LLMErrorInjector:
    """Produce a deliberately corrupted README plus a manifest of the injected errors.

    The LLM is asked for the corruption first. Its answer is validated against
    the original text; if it fails validation a small deterministic corruption
    is used instead. Local regex-based edits then top up every category to the
    requested minimum count.
    """

    _ALLOWED_EXTRA_H2 = frozenset({
        "## Overview", "## Hardware Requirements", "## License",
        "## Usage", "## Dependencies", "## System Requirements",
    })
    _KNOWN_WORD_RE = re.compile(
        r"\b(installation|successfully|analysis|documentation|maintained|example|requirements|license|tutorials)\b",
        flags=re.I,
    )
    _LONG_WORD_RE = re.compile(r"\b[A-Za-z]{6,}\b")
    _MD_LINK_RE = re.compile(r"\[[^\]]+\]\(https?://[^)]+\)")
    _CALL_RE = re.compile(r"\b([A-Za-z_][A-Za-z0-9_]*)\(")
    _H2_PREFIX_RE = re.compile(r"^## \s*", flags=re.M)
    _FENCE_RE = re.compile(r"```[A-Za-z]*\n[\s\S]*?```")

    def __init__(self, llm: BaseChatOpenAI):
        self.llm = llm

    def inject(self, readme_text: str, min_per_category: int = 3, preserve_keywords: list[str] | None = None, max_words: int = 450) -> Tuple[str, Dict[str, Any]]:
        """Return ``(corrupted_markdown, {"errors": [...]})`` for ``readme_text``.

        Args:
            readme_text: Clean README text; at most 30000 characters are sent.
            min_per_category: Minimum number of errors wanted per category.
            preserve_keywords: Terms that must survive unchanged; extracted
                from the text when omitted or empty.
            max_words: Word budget stated in the prompt.
        """
        conv = CommonConversation(self.llm)
        preserve_keywords = preserve_keywords or self._extract_preserve_keywords(readme_text)
        # NOTE(review): escape_braces is applied to the template *before*
        # format(); depending on what that helper escapes, the placeholders
        # may not be substituted. Verify against bioguider.utils.utils.
        system_prompt = escape_braces(INJECTION_PROMPT).format(
            readme=readme_text[:30000],
            min_per_category=min_per_category,
            keywords=", ".join(preserve_keywords) if preserve_keywords else "",
            max_words=max_words,
        )
        output, _ = conv.generate(system_prompt=system_prompt, instruction_prompt="Return the JSON now.")
        data = self._parse_response(output, readme_text)
        corrupted = data.get("corrupted_markdown", readme_text)
        # Validate output stays within original context; fallback to deterministic if invalid
        if not isinstance(corrupted, str) or not self._validate_corrupted(readme_text, corrupted, preserve_keywords):
            corrupted, data = self._deterministic_inject(readme_text)
        # Supplement to satisfy minimum per-category counts using deterministic local edits
        corrupted, data = self._supplement_errors(readme_text, corrupted, data, min_per_category)
        manifest = {
            "errors": data.get("errors", []),
        }
        return corrupted, manifest

    @staticmethod
    def _parse_response(output: Any, fallback_markdown: str) -> Dict[str, Any]:
        """Parse the model reply into a dict, never raising.

        Tries the whole reply, then the outermost ``{...}`` span. If neither
        yields a JSON object, returns a payload that carries the unmodified
        README and no errors.
        """
        fallback: Dict[str, Any] = {"corrupted_markdown": fallback_markdown, "errors": []}
        if not isinstance(output, str):
            return fallback
        candidates = [output]
        start, end = output.find("{"), output.rfind("}")
        if start != -1 and end > start:
            candidates.append(output[start:end + 1])
        for candidate in candidates:
            try:
                parsed = json.loads(candidate)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if isinstance(parsed, dict):
                return parsed
        return fallback

    @staticmethod
    def _splice(text: str, start: int, end: int, replacement: str) -> str:
        """Replace ``text[start:end]`` with ``replacement``."""
        return text[:start] + replacement + text[end:]

    def _extract_preserve_keywords(self, text: str) -> List[str]:
        """Collect up to 20 terms the corruption must not alter.

        Sources, in priority order: capitalized terms (at most 40 characters),
        hyphenated words containing a letter, and hostnames of http(s) URLs.
        First-seen order is kept so the selection is reproducible between runs
        (slicing a ``set`` is not).
        """
        seen: Dict[str, None] = {}
        for m in re.finditer(r"\b[A-Z][A-Za-z0-9\-/]{2,}(?:\s[A-Z][A-Za-z0-9\-/]{2,})*\b", text):
            term = m.group(0)
            if len(term) <= 40:
                seen.setdefault(term)
        for m in re.finditer(r"\b[\w]+-[\w]+\b", text):
            if any(ch.isalpha() for ch in m.group(0)):
                seen.setdefault(m.group(0))
        for m in re.finditer(r"https?://([^/\s)]+)", text):
            seen.setdefault(m.group(1))
        # Keep a small set to avoid over-constraining
        return list(seen)[:20]

    @staticmethod
    def _h2_headers(text: str) -> Set[str]:
        """Return the stripped level-2 header lines of ``text``."""
        return {ln.strip() for ln in text.splitlines() if ln.strip().startswith("## ")}

    def _validate_corrupted(self, baseline: str, corrupted: str, preserve_keywords: List[str]) -> bool:
        """Check that ``corrupted`` is still recognizably ``baseline``.

        Fails when character similarity drops below 0.7, a protected keyword
        is missing, an H2 header appears that is neither in the baseline nor
        in the small allow-list, or more than 25% of distinct tokens are new.
        """
        if SequenceMatcher(None, baseline, corrupted).ratio() < 0.7:
            return False
        if any(k and k not in corrupted for k in preserve_keywords):
            return False
        allowed_h2 = self._h2_headers(baseline) | self._ALLOWED_EXTRA_H2
        if not self._h2_headers(corrupted) <= allowed_h2:
            return False
        token_pattern = r"[A-Za-z0-9_\-]+"
        btoks = set(re.findall(token_pattern, baseline.lower()))
        ctoks = set(re.findall(token_pattern, corrupted.lower()))
        new_ratio = len(ctoks - btoks) / max(1, len(ctoks))
        return new_ratio <= 0.25

    def _deterministic_inject(self, baseline: str) -> Tuple[str, Dict[str, Any]]:
        """Apply a fixed set of corruptions without the LLM.

        Returns the corrupted text and ``{"errors": [...]}`` describing the
        mutations that were actually applied.
        """
        errors: List[Dict[str, Any]] = []
        text = baseline
        # typo: first applicable misspelling only
        for good, bad in (("successfully", "succesfully"), ("installation", "instalation")):
            if good in text:
                text = text.replace(good, bad, 1)
                errors.append({"id": "e_typo_1", "category": "typo", "original_snippet": good, "mutated_snippet": bad, "rationale": "common misspelling"})
                break
        # link: drop the colon from the scheme (handles http as well as https,
        # so an http link is no longer recorded as mutated while unchanged)
        m = re.search(r"\]\(https?://[^)]+\)", text)
        if m:
            original_link = m.group(0)
            broken = re.sub(r"(https?)://", r"\1//", original_link, count=1)
            text = self._splice(text, m.start(), m.end(), broken)
            errors.append({"id": "e_link_1", "category": "link", "original_snippet": original_link, "mutated_snippet": broken, "rationale": "missing colon in scheme"})
        # duplicate a small section (first H2 header plus up to four following lines)
        lines = text.splitlines()
        dup_idx = next((i for i, ln in enumerate(lines) if ln.strip().startswith("## ")), None)
        if dup_idx is not None:
            block = lines[dup_idx: min(len(lines), dup_idx + 5)]
            text = "\n".join(lines + ["", *block])
            errors.append({"id": "e_dup_1", "category": "duplicate", "original_snippet": "\n".join(block), "mutated_snippet": "\n".join(block), "rationale": "duplicated section"})
        # markdown structure: break a header
        if "\n# " in text:
            text = text.replace("\n# ", "\n#", 1)
            errors.append({"id": "e_md_1", "category": "markdown_structure", "original_snippet": "\n# ", "mutated_snippet": "\n#", "rationale": "missing space in header"})
        return text, {"errors": errors}

    def _supplement_errors(self, baseline: str, corrupted: str, data: Dict[str, Any], min_per_category: int) -> Tuple[str, Dict[str, Any]]:
        """Top up each error category to ``min_per_category`` with local edits.

        Every mutation is applied at the exact position the regex matched, so
        the recorded snippets always describe the text that was changed. Where
        possible, repeated supplements move on to the next candidate instead
        of re-mutating the same spot. ``baseline`` is accepted for interface
        compatibility and is not read.
        """
        errors = data.get("errors", []) or []
        if not isinstance(errors, list):
            errors = []
        cat_counts: Dict[str, int] = {}
        for e in errors:
            if isinstance(e, dict):
                cat = e.get("category", "")
                if isinstance(cat, str):
                    cat_counts[cat] = cat_counts.get(cat, 0) + 1

        def need(cat: str) -> int:
            return max(0, min_per_category - cat_counts.get(cat, 0))

        def record(prefix: str, category: str, original: str, mutated: str, rationale: str) -> None:
            errors.append({"id": f"{prefix}_{len(errors)}", "category": category, "original_snippet": original, "mutated_snippet": mutated, "rationale": rationale})

        # typo supplements: prefer well-known words, else any word of 6+ letters
        generic_pos = 0
        for _ in range(need("typo")):
            m = self._KNOWN_WORD_RE.search(corrupted)
            from_generic = m is None
            if from_generic:
                m = self._LONG_WORD_RE.search(corrupted, generic_pos) or self._LONG_WORD_RE.search(corrupted)
            if not m:
                break
            orig = m.group(0)
            mut = orig[:-1] if len(orig) > 3 else orig + "e"
            corrupted = self._splice(corrupted, m.start(), m.end(), mut)
            if from_generic:
                generic_pos = m.start() + len(mut)
            elif m.start() < generic_pos:
                generic_pos += len(mut) - len(orig)
            record("e_typo_sup", "typo", orig, mut, "minor misspelling")

        # link supplements
        for _ in range(need("link")):
            m = self._MD_LINK_RE.search(corrupted)
            if not m:
                break
            orig = m.group(0)
            mut = orig.replace("https://", "https//", 1)
            if mut == orig:
                mut = orig.replace("http://", "http//", 1)
            corrupted = self._splice(corrupted, m.start(), m.end(), mut)
            record("e_link_sup", "link", orig, mut, "scheme colon removed")

        # duplicate supplements: duplicate a bullet or H2 line in place
        dup_from = 0
        for _ in range(need("duplicate")):
            lines = corrupted.splitlines()
            candidates = [i for i, ln in enumerate(lines) if ln.strip().startswith(("- ", "## "))]
            if not candidates:
                break
            # Prefer a line after the previous duplication; wrap around if none.
            idx = next((i for i in candidates if i >= dup_from), candidates[0])
            frag = lines[idx]
            lines.insert(idx + 1, frag)
            corrupted = "\n".join(lines)
            dup_from = idx + 2
            record("e_dup_sup", "duplicate", frag, frag, "line duplicated")

        # bio_term supplements
        bio_swaps = [(r"single cell", "single sell"), (r"genomics", "genomis"), (r"spatial", "spacial")]
        for _ in range(need("bio_term")):
            for pat, rep in bio_swaps:
                m = re.search(pat, corrupted, flags=re.I)
                if m:
                    orig = m.group(0)
                    mut = rep if orig.islower() else rep.title()
                    corrupted = self._splice(corrupted, m.start(), m.end(), mut)
                    record("e_bio_sup", "bio_term", orig, mut, "common domain typo")
                    break
            else:
                # none of the swap terms occur any more
                break

        # function supplements: misspell the identifier directly before "("
        func_pos = 0
        for _ in range(need("function")):
            m = self._CALL_RE.search(corrupted, func_pos) or self._CALL_RE.search(corrupted)
            if not m:
                break
            fname = m.group(1)
            mut = fname[:-1] if len(fname) > 3 else fname + "x"
            corrupted = self._splice(corrupted, m.start(1), m.end(1), mut)
            func_pos = m.start(1) + len(mut) + 1
            record("e_func_sup", "function", fname + "(", mut + "(", "misspelled API name")

        # markdown_structure supplements
        for _ in range(need("markdown_structure")):
            m = self._H2_PREFIX_RE.search(corrupted)
            if m:
                orig = m.group(0)
                mut = orig.replace("## ", "##", 1)
                corrupted = self._splice(corrupted, m.start(), m.end(), mut)
                record("e_md_sup", "markdown_structure", orig, mut, "removed header space")
                continue
            fence = self._FENCE_RE.search(corrupted)
            if not fence:
                break
            block = fence.group(0)
            mut = block.rstrip("`")  # strips every trailing backtick, i.e. the whole closing fence
            corrupted = self._splice(corrupted, fence.start(), fence.end(), mut)
            record("e_md_sup", "markdown_structure", block[:10], mut[:10], "broken code fence")

        data["errors"] = errors
        return corrupted, data
269
+
270
+
@@ -0,0 +1,77 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Optional, List, Dict, Any
4
+ from pydantic import BaseModel, Field
5
+
6
+
7
# Container for an evaluation report. Every field is optional and defaults
# to None. The three sections (installation, readme, submission requirements)
# each pair a free-form evaluation dict with a list of file paths.
# NOTE(review): the key schema of the *_evaluation dicts is not defined in
# this module — confirm against the code that produces the report.
class EvaluationReport(BaseModel):
    timestamp: Optional[str] = None
    repo_url: Optional[str] = None

    installation_evaluation: Optional[Dict[str, Any]] = None
    installation_files: Optional[List[str]] = None

    readme_evaluation: Optional[Dict[str, Any]] = None
    readme_files: Optional[List[str]] = None

    submission_requirements_evaluation: Optional[Dict[str, Any]] = None
    submission_requirements_files: Optional[List[str]] = None
19
+
20
+
21
# One improvement suggestion. `id`, `category` and `action` are required;
# the remaining fields have defaults.
class SuggestionItem(BaseModel):
    id: str
    # Dotted category name (e.g. split on "." by consumers) — presumably
    # "<area>.<topic>"; verify against the extractor that creates these.
    category: str
    severity: str = Field(default="should_fix")
    # String-to-string provenance mapping; defaults to an empty dict.
    source: Dict[str, str] = Field(default_factory=dict)
    target_files: List[str] = Field(default_factory=list)
    action: str
    anchor_hint: Optional[str] = None
    content_guidance: Optional[str] = None
30
+
31
+
32
# Markdown style conventions of a document. All fields have defaults:
# "#" headings, "-" list bullets, "```" code fences, "inline" links and
# an empty list of tone markers.
class StyleProfile(BaseModel):
    heading_style: str = Field(default="#")
    list_style: str = Field(default="-")
    code_fence_style: str = Field(default="```")
    tone_markers: List[str] = Field(default_factory=list)
    link_style: str = Field(default="inline")
38
+
39
+
40
# A single planned change to one file. `file_path`, `edit_type`,
# `content_template` and `rationale` are required.
class PlannedEdit(BaseModel):
    file_path: str
    # Free-form string naming the kind of edit; the set of accepted values
    # is not constrained by this model.
    edit_type: str
    # String-to-string anchor description; defaults to an empty dict.
    anchor: Dict[str, str] = Field(default_factory=dict)
    content_template: str
    rationale: str
    minimal_diff: bool = Field(default=True)
    # Optional link back to a suggestion id — presumably SuggestionItem.id;
    # TODO confirm with the planner.
    suggestion_id: Optional[str] = None
48
+
49
+
50
# Plan for one repository: its path, the style profile to follow and the
# list of planned edits (empty by default).
class DocumentPlan(BaseModel):
    repo_path: str
    style_profile: StyleProfile
    planned_edits: List[PlannedEdit] = Field(default_factory=list)
54
+
55
+
56
# Record of one written output file: destination and original relative
# paths, a short change summary and integer diff statistics (empty dict
# by default).
class OutputArtifact(BaseModel):
    dest_rel_path: str
    original_rel_path: str
    change_summary: str
    diff_stats: Dict[str, int] = Field(default_factory=dict)
61
+
62
+
63
# Manifest of a generation run. The three location fields are optional;
# the four list fields default to empty lists.
class GenerationManifest(BaseModel):
    repo_url: Optional[str] = None
    report_path: Optional[str] = None
    output_dir: Optional[str] = None
    suggestions: List[SuggestionItem] = Field(default_factory=list)
    planned_edits: List[PlannedEdit] = Field(default_factory=list)
    artifacts: List[OutputArtifact] = Field(default_factory=list)
    # Plain strings identifying skipped items — presumably file paths or
    # suggestion ids; verify against the manager that fills this in.
    skipped: List[str] = Field(default_factory=list)
71
+
72
# Summary report of a generation run: optional repo URL and output
# directory plus a list of free-form section dicts (empty by default).
class GenerationReport(BaseModel):
    repo_url: Optional[str] = None
    output_dir: Optional[str] = None
    sections: List[Dict[str, Any]] = Field(default_factory=list)
76
+
77
+
@@ -0,0 +1,54 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import json
5
+ from datetime import datetime
6
+ from typing import Dict, List, Tuple
7
+
8
+ from .models import OutputArtifact, GenerationManifest, PlannedEdit
9
+
10
+
11
class OutputManager:
    """Write generated documents and the run manifest to an output directory."""

    def __init__(self, base_outputs_dir: str = "outputs"):
        self.base_outputs_dir = base_outputs_dir

    def prepare_output_dir(self, repo_url_or_name: str) -> str:
        """Create and return ``<base>/<repo_name>/<YYYYmmdd_HHMMSS>``."""
        repo_name = self._extract_repo_name(repo_url_or_name)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        out_dir = os.path.join(self.base_outputs_dir, repo_name, timestamp)
        os.makedirs(out_dir, exist_ok=True)
        return out_dir

    def _extract_repo_name(self, url_or_name: str) -> str:
        """Return the last path component of a URL/name without a ``.git`` suffix."""
        name = url_or_name.rstrip("/")
        if "/" in name:
            name = name.split("/")[-1]
        # Strip only a trailing ".git"; removing every occurrence would
        # mangle names such as "my.github.io".
        if name.endswith(".git"):
            name = name[: -len(".git")]
        return name

    def write_files(self, output_dir: str, files: Dict[str, str], diff_stats_by_file: Dict[str, dict] | None = None) -> List[OutputArtifact]:
        """Write each ``rel_path -> content`` pair under ``output_dir``.

        Parent directories are created as needed and files are written as
        UTF-8. Returns one ``OutputArtifact`` per file, carrying the matching
        entry of ``diff_stats_by_file`` (or an empty dict).
        """
        stats_lookup = diff_stats_by_file or {}
        artifacts: List[OutputArtifact] = []
        for rel_path, content in files.items():
            dest = os.path.join(output_dir, rel_path)
            parent = os.path.dirname(dest)
            if parent:
                # os.makedirs("") raises, which happens when both
                # output_dir and the directory part of rel_path are empty.
                os.makedirs(parent, exist_ok=True)
            with open(dest, "w", encoding="utf-8") as fobj:
                fobj.write(content)
            artifacts.append(OutputArtifact(
                dest_rel_path=rel_path,
                original_rel_path=rel_path,
                change_summary="revised document",
                diff_stats=stats_lookup.get(rel_path, {}),
            ))
        return artifacts

    def write_manifest(
        self,
        output_dir: str,
        manifest: GenerationManifest,
    ) -> str:
        """Dump ``manifest`` as ``manifest.json`` in ``output_dir``; return its path."""
        dest = os.path.join(output_dir, "manifest.json")
        with open(dest, "w", encoding="utf-8") as fobj:
            json.dump(manifest.model_dump(), fobj, indent=2)
        return dest
53
+
54
+
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from typing import Dict, Optional, List, Tuple
5
+
6
+
7
class RepoReader:
    """Read text files from a repository checkout by relative path."""

    def __init__(self, repo_path: str, gitignore_path: Optional[str] = None):
        self.repo_path = repo_path
        # Stored for callers; not consulted by the methods of this class.
        self.gitignore_path = gitignore_path

    def read_files(self, rel_paths: List[str]) -> Tuple[Dict[str, str], List[str]]:
        """Read ``rel_paths`` relative to ``repo_path`` as UTF-8 text.

        Returns:
            ``(contents, missing)``: ``contents`` maps each readable relative
            path to its text; ``missing`` lists, in input order, paths that
            are not regular files or could not be read or decoded.
        """
        contents: Dict[str, str] = {}
        missing: List[str] = []
        for rel in rel_paths:
            abs_path = os.path.join(self.repo_path, rel)
            if not os.path.isfile(abs_path):
                missing.append(rel)
                continue
            try:
                with open(abs_path, "r", encoding="utf-8") as fobj:
                    contents[rel] = fobj.read()
            except (OSError, UnicodeDecodeError):
                # Unreadable or non-UTF-8 files are reported as missing;
                # unrelated programming errors are no longer swallowed.
                missing.append(rel)
        return contents, missing

    def read_default_targets(self) -> Tuple[Dict[str, str], List[str]]:
        """Read the common documentation files this package may modify."""
        candidates = [
            "README.md",
            "README.rst",
            "vignettes/install.Rmd",
            "vignettes/install_v5.Rmd",
        ]
        return self.read_files(candidates)
36
+
37
+