bioguider 0.2.52__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. bioguider/__init__.py +0 -0
  2. bioguider/agents/__init__.py +0 -0
  3. bioguider/agents/agent_task.py +92 -0
  4. bioguider/agents/agent_tools.py +176 -0
  5. bioguider/agents/agent_utils.py +504 -0
  6. bioguider/agents/collection_execute_step.py +182 -0
  7. bioguider/agents/collection_observe_step.py +125 -0
  8. bioguider/agents/collection_plan_step.py +156 -0
  9. bioguider/agents/collection_task.py +184 -0
  10. bioguider/agents/collection_task_utils.py +142 -0
  11. bioguider/agents/common_agent.py +137 -0
  12. bioguider/agents/common_agent_2step.py +215 -0
  13. bioguider/agents/common_conversation.py +61 -0
  14. bioguider/agents/common_step.py +85 -0
  15. bioguider/agents/consistency_collection_step.py +102 -0
  16. bioguider/agents/consistency_evaluation_task.py +57 -0
  17. bioguider/agents/consistency_evaluation_task_utils.py +14 -0
  18. bioguider/agents/consistency_observe_step.py +110 -0
  19. bioguider/agents/consistency_query_step.py +77 -0
  20. bioguider/agents/dockergeneration_execute_step.py +186 -0
  21. bioguider/agents/dockergeneration_observe_step.py +154 -0
  22. bioguider/agents/dockergeneration_plan_step.py +158 -0
  23. bioguider/agents/dockergeneration_task.py +158 -0
  24. bioguider/agents/dockergeneration_task_utils.py +220 -0
  25. bioguider/agents/evaluation_installation_task.py +270 -0
  26. bioguider/agents/evaluation_readme_task.py +767 -0
  27. bioguider/agents/evaluation_submission_requirements_task.py +172 -0
  28. bioguider/agents/evaluation_task.py +206 -0
  29. bioguider/agents/evaluation_tutorial_task.py +169 -0
  30. bioguider/agents/evaluation_tutorial_task_prompts.py +187 -0
  31. bioguider/agents/evaluation_userguide_prompts.py +179 -0
  32. bioguider/agents/evaluation_userguide_task.py +154 -0
  33. bioguider/agents/evaluation_utils.py +127 -0
  34. bioguider/agents/identification_execute_step.py +181 -0
  35. bioguider/agents/identification_observe_step.py +104 -0
  36. bioguider/agents/identification_plan_step.py +140 -0
  37. bioguider/agents/identification_task.py +270 -0
  38. bioguider/agents/identification_task_utils.py +22 -0
  39. bioguider/agents/peo_common_step.py +64 -0
  40. bioguider/agents/prompt_utils.py +253 -0
  41. bioguider/agents/python_ast_repl_tool.py +69 -0
  42. bioguider/agents/rag_collection_task.py +130 -0
  43. bioguider/conversation.py +67 -0
  44. bioguider/database/code_structure_db.py +500 -0
  45. bioguider/database/summarized_file_db.py +146 -0
  46. bioguider/generation/__init__.py +39 -0
  47. bioguider/generation/benchmark_metrics.py +610 -0
  48. bioguider/generation/change_planner.py +189 -0
  49. bioguider/generation/document_renderer.py +157 -0
  50. bioguider/generation/llm_cleaner.py +67 -0
  51. bioguider/generation/llm_content_generator.py +1128 -0
  52. bioguider/generation/llm_injector.py +809 -0
  53. bioguider/generation/models.py +85 -0
  54. bioguider/generation/output_manager.py +74 -0
  55. bioguider/generation/repo_reader.py +37 -0
  56. bioguider/generation/report_loader.py +166 -0
  57. bioguider/generation/style_analyzer.py +36 -0
  58. bioguider/generation/suggestion_extractor.py +436 -0
  59. bioguider/generation/test_metrics.py +189 -0
  60. bioguider/managers/benchmark_manager.py +785 -0
  61. bioguider/managers/evaluation_manager.py +215 -0
  62. bioguider/managers/generation_manager.py +686 -0
  63. bioguider/managers/generation_test_manager.py +107 -0
  64. bioguider/managers/generation_test_manager_v2.py +525 -0
  65. bioguider/rag/__init__.py +0 -0
  66. bioguider/rag/config.py +117 -0
  67. bioguider/rag/data_pipeline.py +651 -0
  68. bioguider/rag/embedder.py +24 -0
  69. bioguider/rag/rag.py +138 -0
  70. bioguider/settings.py +103 -0
  71. bioguider/utils/code_structure_builder.py +59 -0
  72. bioguider/utils/constants.py +135 -0
  73. bioguider/utils/default.gitignore +140 -0
  74. bioguider/utils/file_utils.py +215 -0
  75. bioguider/utils/gitignore_checker.py +175 -0
  76. bioguider/utils/notebook_utils.py +117 -0
  77. bioguider/utils/pyphen_utils.py +73 -0
  78. bioguider/utils/python_file_handler.py +65 -0
  79. bioguider/utils/r_file_handler.py +551 -0
  80. bioguider/utils/utils.py +163 -0
  81. bioguider-0.2.52.dist-info/LICENSE +21 -0
  82. bioguider-0.2.52.dist-info/METADATA +51 -0
  83. bioguider-0.2.52.dist-info/RECORD +84 -0
  84. bioguider-0.2.52.dist-info/WHEEL +4 -0
@@ -0,0 +1,189 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import List, Dict
4
+
5
+ from .models import SuggestionItem, StyleProfile, DocumentPlan, PlannedEdit
6
+
7
+
8
class ChangePlanner:
    """Translate evaluation suggestions into a ``DocumentPlan`` of file edits."""

    def build_plan(
        self,
        repo_path: str,
        style: StyleProfile,
        suggestions: List[SuggestionItem],
        available_files: Dict[str, str],
    ) -> DocumentPlan:
        """Plan edits for every (suggestion, target file) pair.

        For each pair, a recognised action contributes one targeted edit,
        and then a ``full_replace`` edit is planned for the target. Before
        returning, any file that has at least one ``full_replace`` edit keeps
        only its ``full_replace`` edits.

        Section-style actions are de-duplicated per ``(file, heading)``:
        a repeated pair is skipped entirely, including its ``full_replace``.
        ``normalize_headings_structure`` plans nothing.

        Args:
            repo_path: Stored on the returned plan.
            style: Its ``heading_style`` (default ``"#"``) is doubled to
                build the heading of templated sections.
            suggestions: Items providing ``action``, ``target_files``,
                ``anchor_hint``, ``content_guidance``, ``source`` and ``id``.
            available_files: Accepted for interface compatibility; targets
                are planned whether or not they appear here.

        Returns:
            A ``DocumentPlan`` holding the filtered edits.
        """

        def h2(title: str) -> str:
            # Headings for inserted sections are always level 2.
            marker = style.heading_style or "#"
            return f"{marker*2} {title}\n\n"

        def license_template(_suggestion) -> str:
            return h2("License") + "This project is released under the MIT License. See LICENSE for details.\n"

        def usage_template(_suggestion) -> str:
            return h2("Usage") + "- Brief example of typical workflow.\n"

        def dependencies_template(suggestion) -> str:
            guidance = suggestion.content_guidance or "Specify compatibility details for dependencies across operating systems and architectures."
            return h2("Dependencies") + f"- {guidance}\n"

        def make_edit(suggestion, target, edit_type, anchor_type, anchor_value, template):
            return PlannedEdit(
                file_path=target,
                edit_type=edit_type,
                anchor={"type": anchor_type, "value": anchor_value},
                content_template=template,
                rationale=suggestion.source.get("evidence", ""),
                suggestion_id=suggestion.id,
            )

        # action -> (default heading, honour anchor_hint?, edit type, template factory)
        # A factory of None means the template is left empty for the LLM to fill.
        once_per_heading = {
            "add_dependencies_section": ("Dependencies", True, "append_section", None),
            "add_system_requirements_section": ("System Requirements", True, "append_section", None),
            "mention_license_section": ("License", True, "append_section", license_template),
            "add_usage_section": ("Usage", False, "append_section", usage_template),
            "replace_intro": ("Overview", False, "replace_intro_block", None),
            "clarify_mandatory_vs_optional": ("Dependencies", False, "append_section", dependencies_template),
            "add_hardware_requirements": ("Hardware Requirements", True, "append_section", None),
        }
        # action -> default heading; these are never de-duplicated.
        always_planned = {
            "improve_clarity_and_error_handling": "Introduction",
            "improve_consistency": "Examples",
            "improve_tutorial_quality": "Setup",
        }

        planned: List[PlannedEdit] = []
        seen_headers: set[tuple[str, str]] = set()

        for suggestion in suggestions:
            for target in suggestion.target_files:
                action = suggestion.action

                if action == "normalize_headings_structure":
                    # Placeholder action: nothing is planned for it.
                    continue

                if action in once_per_heading:
                    default_heading, use_hint, edit_type, template_for = once_per_heading[action]
                    if use_hint:
                        heading = suggestion.anchor_hint or default_heading
                    else:
                        heading = default_heading
                    header_key = (target, heading.strip().lower())
                    if header_key in seen_headers:
                        continue
                    template = template_for(suggestion) if template_for is not None else ""
                    planned.append(
                        make_edit(suggestion, target, edit_type, "header", heading, template)
                    )
                    seen_headers.add(header_key)
                elif action in always_planned:
                    heading = suggestion.anchor_hint or always_planned[action]
                    planned.append(
                        make_edit(suggestion, target, "append_section", "header", heading, "")
                    )

                # Every pair that reaches this point also gets a full rewrite.
                planned.append(
                    make_edit(suggestion, target, "full_replace", "document", "full_document", "")
                )

        # A full rewrite makes the other edits for the same file redundant.
        edits_by_file: Dict[str, List[PlannedEdit]] = {}
        for item in planned:
            edits_by_file.setdefault(item.file_path, []).append(item)

        kept: List[PlannedEdit] = []
        for file_edits in edits_by_file.values():
            rewrites = [item for item in file_edits if item.edit_type == "full_replace"]
            kept.extend(rewrites or file_edits)

        return DocumentPlan(repo_path=repo_path, style_profile=style, planned_edits=kept)
188
+
189
+
@@ -0,0 +1,157 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Tuple
4
+
5
+ from .models import PlannedEdit
6
+
7
+
8
class DocumentRenderer:
    """Apply one planned edit to a document's text."""

    def apply_edit(self, original: str, edit: PlannedEdit) -> Tuple[str, dict]:
        """Return the edited text and a stats dict for a single edit.

        Supported ``edit.edit_type`` values:

        * ``append_section`` - append the template unless its leading
          heading line already exists as a line of the document.
        * ``replace_intro_block`` - replace everything before the first
          ``## `` line with the template (the whole text if there is none).
        * ``insert_after_header`` / ``rmarkdown_integration`` - insert the
          template inside the section whose heading contains
          ``edit.anchor["value"]``: after the section's first fenced code
          block, or before the next level-1/2 heading if there is no
          fence. Falls back to appending when the heading is missing.
        * ``full_replace`` - the template becomes the whole document.

        Any other edit type leaves the text unchanged.

        Line-based edits rebuild the text with ``"\\n".join`` over
        ``splitlines()``, so a trailing newline is not retained.

        Args:
            original: Current document text.
            edit: Object providing ``edit_type``, ``content_template`` and,
                for the insert types, ``anchor`` (a mapping).

        Returns:
            ``(new_text, {"added_lines": n})``.
        """
        template = edit.content_template
        kind = edit.edit_type

        if kind == "append_section":
            if self._leading_header_exists(original, template):
                return original, {"added_lines": 0}
            return self._append(original, template)

        if kind == "replace_intro_block":
            lines = original.splitlines()
            first_h2 = next(
                (i for i, ln in enumerate(lines) if ln.strip().startswith("## ")),
                None,
            )
            if first_h2 is None:
                # No H2 header found; replace entire content.
                new_content = template
            else:
                new_content = template.rstrip() + "\n\n" + "\n".join(lines[first_h2:])
            return new_content, {"added_lines": len(template.splitlines())}

        if kind in ("insert_after_header", "rmarkdown_integration"):
            header_value = edit.anchor.get("value", "")
            lines = original.splitlines()
            insert_idx = (
                self._find_insertion_index(lines, header_value) if header_value else None
            )
            if insert_idx is None:
                # No header given, or header not found: append at the end.
                return self._append(original, template)
            inserted = self._demote_headings(
                template.splitlines(), rmarkdown=(kind == "rmarkdown_integration")
            )
            new_lines = lines[:insert_idx] + [""] + inserted + lines[insert_idx:]
            return "\n".join(new_lines), {"added_lines": len(inserted)}

        if kind == "full_replace":
            return template, {"added_lines": len(template.splitlines())}

        # Other edit types (replace_block) can be added as needed.
        return original, {"added_lines": 0}

    @staticmethod
    def _append(content: str, template: str) -> Tuple[str, dict]:
        """Append ``template`` to ``content``, separated by a blank line."""
        sep = "" if content.endswith("\n\n") else "\n\n"
        return f"{content}{sep}{template}", {"added_lines": len(template.splitlines())}

    @staticmethod
    def _leading_header_exists(content: str, template: str) -> bool:
        """Tell whether the template's leading heading is already a line of ``content``."""
        if not template.lstrip().startswith("#"):
            return False
        header_line = template.strip().splitlines()[0].strip()
        # Compare whole lines: a substring test would treat "## Usage" as
        # present when the document only has "### Usage" or "## Usage Examples".
        return any(line.strip() == header_line for line in content.splitlines())

    @staticmethod
    def _find_insertion_index(lines: list, header_value: str):
        """Return the index at which to insert within the anchored section, or None."""
        needle = header_value.lower()
        total = len(lines)
        for i, line in enumerate(lines):
            if not (line.strip().startswith("#") and needle in line.lower()):
                continue
            idx = i + 1
            while idx < total:
                stripped = lines[idx].strip()
                if stripped.startswith("```"):
                    # This is the opening fence. Walk to the closing fence so
                    # the insertion lands after the block, not inside it.
                    idx += 1
                    while idx < total and not lines[idx].strip().startswith("```"):
                        idx += 1
                    return min(idx + 1, total)
                if stripped.startswith("#") and not stripped.startswith("###"):
                    # Next level-1/2 heading: insert before it.
                    break
                idx += 1
            # Only the first matching heading is considered.
            return idx
        return None

    @staticmethod
    def _demote_headings(template_lines: list, rmarkdown: bool) -> list:
        """Turn headings in inserted text into inline notes, leaving fenced code intact."""
        result = []
        in_fence = False
        for line in template_lines:
            stripped = line.strip()
            if stripped.startswith("```"):
                in_fence = not in_fence
                result.append(line)
            elif in_fence:
                # Lines such as "# comment" inside code are not headings.
                result.append(line)
            elif rmarkdown and stripped.startswith("## "):
                result.append(f"\n**Note:** For this tutorial, {stripped[3:].strip().lower()}")
            elif rmarkdown and stripped.startswith("# "):
                result.append(f"\n**Important:** {stripped[2:].strip().lower()}")
            elif not rmarkdown and stripped.startswith("## ") and len(stripped) < 50:
                result.append(f"\n**Note:** {stripped[3:].strip().lower()}")
            else:
                result.append(line)
        return result
156
+
157
+
@@ -0,0 +1,67 @@
1
+ from __future__ import annotations
2
+
3
+ from langchain_openai.chat_models.base import BaseChatOpenAI
4
+
5
+ from bioguider.agents.common_conversation import CommonConversation
6
+
7
+
8
# System prompt for the cleanup pass. The only placeholder is the literal
# token "{doc}". The text also contains "{r}" (an R chunk fence), so it must
# NOT be passed through str.format().
CLEANUP_PROMPT = """
You are "BioGuider," a precise editor for biomedical/bioinformatics documentation.

TASK
Given a documentation file (README, RMarkdown, or other), produce a corrected version that:
- Fixes typos, grammar, capitalization, and spacing
- Corrects malformed markdown (headers, lists, links, code fences)
- Repairs or normalizes link formatting; keep URLs absolute if present
- Removes duplicated sections or repeated content; consolidate if needed
- Preserves technical accuracy and biomedical domain terminology (do not invent features)
- Keeps tone neutral and professional; avoid marketing language
- Preserves all valid information; do not delete content unless it is a duplicate or malformed
- For RMarkdown files (.Rmd): Preserve YAML frontmatter, R code chunks, and existing structure exactly

CRITICAL REQUIREMENTS:
- Do NOT wrap the entire document in markdown code fences (```markdown). Return pure content only.
- If the document starts with ```markdown and ends with ```, remove these fences completely.
- Do NOT modify YAML frontmatter in RMarkdown files
- Do NOT modify R code chunks (```{r} blocks) in RMarkdown files
- Do NOT change the overall structure or organization of the document

ABSOLUTELY FORBIDDEN - REMOVE THESE COMPLETELY:
- Any summary sections, concluding statements, or notes at the end of documents
- Phrases like "Happy analyzing!", "Ensure all dependencies are up-to-date", "This concludes", "For more information"
- Any text that appears to be AI-generated summaries or conclusions
- Sentences starting with "Note:", "Remember:", "Important:", "Tip:", "Warning:" at the end
- Any text after the last substantive content section
- Phrases like "Happy coding!", "Good luck!", "Enjoy!", "Have fun!"
- Any concluding remarks, final thoughts, or wrap-up statements
- Text that sounds like AI-generated advice or encouragement

DOCUMENT ENDING RULES:
- The document must end naturally with the last substantive content section
- Do NOT add any concluding statements, summaries, or notes
- If the original document had a natural ending, preserve it exactly
- If AI-added content appears at the end, remove it completely

INPUT
<<DOCUMENT>>
{doc}
<</DOCUMENT>>

OUTPUT
Return ONLY the revised content (no commentary, no explanations, no code fences).
"""


class LLMCleaner:
    """Run an LLM proofreading pass over a documentation file."""

    # Only this many leading characters of the document are sent to the model.
    MAX_DOC_CHARS = 30000

    def __init__(self, llm: BaseChatOpenAI):
        self.llm = llm

    @staticmethod
    def _build_prompt(content: str) -> str:
        """Embed ``content`` (truncated to ``MAX_DOC_CHARS``) into ``CLEANUP_PROMPT``.

        Plain token replacement is used instead of ``str.format`` because
        the prompt contains the literal text "{r}", which ``format`` would
        treat as a replacement field and fail on with ``KeyError: 'r'``.
        """
        return CLEANUP_PROMPT.replace("{doc}", content[: LLMCleaner.MAX_DOC_CHARS])

    def clean_readme(self, content: str) -> tuple[str, dict]:
        """Return the model's corrected version of ``content`` and the token usage.

        Args:
            content: Document text; only the first ``MAX_DOC_CHARS``
                characters are included in the prompt.

        Returns:
            ``(cleaned_text, token_usage)`` where ``cleaned_text`` is the
            stripped model output and ``token_usage`` is passed through
            unchanged from ``CommonConversation.generate``.
        """
        conv = CommonConversation(self.llm)
        output, token_usage = conv.generate(
            system_prompt=self._build_prompt(content),
            instruction_prompt="Provide the corrected documentation content only.",
        )
        return output.strip(), token_usage
66
+
67
+