bioguider 0.2.32__py3-none-any.whl → 0.2.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of bioguider might be problematic.

@@ -67,8 +67,17 @@ def get_llm(
      api_version: str=None,
      azure_deployment: str=None,
      temperature: float = 0.0,
-     max_tokens: int = 4096,
+     max_tokens: int = 16384,  # Set high by default - enough for any document type
  ):
+     """
+     Create an LLM instance with appropriate parameters based on model type and API version.
+
+     Handles parameter compatibility across different models and API versions:
+     - DeepSeek models: use the max_tokens parameter
+     - GPT models (newer API versions): use the max_completion_tokens parameter
+     - GPT-5 and o-series models: don't support a custom temperature (the default is used)
+     """
+
      if model_name.startswith("deepseek"):
          chat = ChatDeepSeek(
              api_key=api_key,
@@ -77,23 +86,38 @@ def get_llm(
              max_tokens=max_tokens,
          )
      elif model_name.startswith("gpt"):
-         chat = AzureChatOpenAI(
-             api_key=api_key,
-             azure_endpoint=azure_endpoint,
-             api_version=api_version,
-             azure_deployment=azure_deployment,
-             model=model_name,
-             temperature=temperature,
-             max_tokens=max_tokens,
-         )
+         # Base parameters common to all GPT models
+         llm_params = {
+             "api_key": api_key,
+             "azure_endpoint": azure_endpoint,
+             "api_version": api_version,
+             "azure_deployment": azure_deployment,
+             "model": model_name,
+         }
+
+         # Determine the token-limit parameter name from the API version:
+         # newer APIs (2024-08+) use max_completion_tokens instead of max_tokens
+         use_completion_tokens = api_version and api_version >= "2024-08-01-preview"
+         token_param = "max_completion_tokens" if use_completion_tokens else "max_tokens"
+         llm_params[token_param] = max_tokens
+
+         # Handle the temperature parameter based on model capabilities:
+         # GPT-5 and o-series models don't support custom temperature values
+         supports_temperature = not any(restricted in model_name for restricted in ["gpt-5", "o1", "o3"])
+         if supports_temperature:
+             llm_params["temperature"] = temperature
+
+         chat = AzureChatOpenAI(**llm_params)
      else:
-         raise ValueError("Invalid model name")
-     # validate chat
+         raise ValueError(f"Unsupported model type: {model_name}")
+
+     # Validate the LLM instance with a simple test call
      try:
          chat.invoke("Hi")
      except Exception as e:
-         print(e)
+         logger.error(f"Failed to initialize LLM {model_name}: {e}")
          return None
+
      return chat

  def pretty_print(message, printout = True):
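
One subtlety reviewers may miss: the new gating compares API versions with a plain string comparison (`api_version >= "2024-08-01-preview"`). That is safe here only because Azure OpenAI API versions begin with a zero-padded ISO date, so lexicographic order matches chronological order. A minimal sketch of the two selection rules in isolation (the helper names are ours, for illustration only):

```python
def pick_token_param(api_version: str | None) -> str:
    # Azure API versions start with a zero-padded ISO date (YYYY-MM-DD...),
    # so plain string comparison tracks chronological order.
    if api_version and api_version >= "2024-08-01-preview":
        return "max_completion_tokens"
    return "max_tokens"


def supports_custom_temperature(model_name: str) -> bool:
    # Mirrors the diff: gpt-5 and o-series deployments reject a custom temperature.
    return not any(tag in model_name for tag in ["gpt-5", "o1", "o3"])


assert pick_token_param("2024-10-21") == "max_completion_tokens"
assert pick_token_param("2023-05-15") == "max_tokens"
assert pick_token_param(None) == "max_tokens"
assert supports_custom_temperature("gpt-4o")
assert not supports_custom_temperature("gpt-5-chat")
```

Note that the substring test is deliberately broad: any deployment name containing "gpt-5", "o1", or "o3" is treated as temperature-restricted.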
@@ -2,6 +2,7 @@ from __future__ import annotations
 
  from typing import Dict
  import json
+ import re
  from langchain_openai.chat_models.base import BaseChatOpenAI
 
  from bioguider.agents.common_conversation import CommonConversation
@@ -25,13 +26,14 @@ CRITICAL REQUIREMENTS
  - Follow the guidance EXACTLY as provided: {guidance}
  - Address the specific suggestions from the evaluation report precisely
  - Do not deviate from the guidance or add unrelated content
- - If guidance mentions specific packages, requirements, or details, include them exactly
- - For RMarkdown files (.Rmd), preserve the original structure including YAML frontmatter, code chunks, and existing headers
+ - If guidance mentions specific packages, requirements, or details, include them ONLY if they are explicitly stated - never invent or estimate
+ - Preserve the original file structure including frontmatter, code blocks, and existing headers
  - NEVER generate generic placeholder content like "Clear 2–3 sentence summary" or "brief description"
+ - NEVER invent technical specifications (hardware requirements, version numbers, performance metrics) unless explicitly provided in guidance or context
  - ABSOLUTELY FORBIDDEN: Do NOT add summary sections, notes, conclusions, or any text at the end of documents
  - ABSOLUTELY FORBIDDEN: Do NOT wrap content in markdown code fences (```markdown). Return pure content only.
  - ABSOLUTELY FORBIDDEN: Do NOT add phrases like "Happy analyzing!", "Ensure all dependencies are up-to-date", or any concluding statements
- - ALWAYS use the specific guidance provided above to create concrete, actionable content
+ - ALWAYS use the specific guidance provided above to create concrete, actionable content based on evidence
 
  STYLE & CONSTRAINTS
  - Fix obvious errors in the content.
@@ -47,22 +49,22 @@ STYLE & CONSTRAINTS
  - When targeting README content, do not rewrite the document title or header area; generate only the requested section body to be inserted below existing headers/badges.
 
  SECTION GUIDELINES (follow guidance exactly)
- - Dependencies: Include specific packages mentioned in guidance (e.g., "ggplot2", "dplyr", etc.)
- - System Requirements: Include R version requirements and platform-specific instructions as mentioned in guidance
- - Hardware Requirements: Include RAM/CPU recommendations as specified in guidance
+ - Dependencies: Include ONLY specific packages explicitly mentioned in guidance or found in repo context. Never invent package names or versions.
+ - System Requirements: Include ONLY language/runtime version requirements explicitly stated in guidance or found in repo context. Never invent version numbers.
+ - Hardware Requirements: Include ONLY specific RAM/CPU recommendations explicitly stated in guidance or found in repo context. NEVER estimate or invent hardware specifications - omit this section if not substantiated.
  - License: one sentence referencing the license and pointing to the LICENSE file.
- - Install (clarify dependencies): Include compatibility details across operating systems and architectures as mentioned in guidance
+ - Install (clarify dependencies): Include compatibility details ONLY if explicitly mentioned in guidance or found in repo context.
  - Tutorial improvements: Add specific examples, error handling, and reproducibility notes as mentioned in guidance
  - User guide improvements: Enhance clarity, add missing information, and improve error handling as mentioned in guidance
- - Conservative injection: For tutorial files (.Rmd), make minimal, targeted additions that preserve the original structure and flow. Add brief notes, small subsections, or contextual comments that enhance existing content without disrupting the tutorial's narrative.
- - RMarkdown integration: When inserting content into existing RMarkdown tutorials, integrate naturally into the flow rather than creating standalone sections. Add brief explanatory text, code comments, or small subsections that enhance the existing content.
- - RMarkdown format compliance: For .Rmd files, ensure content follows RMarkdown conventions:
-   * Use proper R code chunks with ```{{r chunk_name}} and ``` when adding code examples
+ - Conservative injection: For tutorial files, make minimal, targeted additions that preserve the original structure and flow. Add brief notes, small subsections, or contextual comments that enhance existing content without disrupting the tutorial's narrative.
+ - Natural integration: When inserting content into existing tutorials or guides, integrate naturally into the flow rather than creating standalone sections. Add brief explanatory text, code comments, or small subsections that enhance the existing content.
+ - Format compliance: Preserve the existing file format conventions (e.g., YAML frontmatter, code blocks, headers):
+   * For code examples, use the appropriate code fence syntax for the language (e.g., ```r, ```python, ```bash)
    * Maintain the tutorial's existing tone and context - content should feel like a natural continuation
    * Avoid creating new major sections unless absolutely necessary
-   * Use inline R code with `{{r code_here}}` when appropriate
    * Keep explanations concise and contextual to the tutorial's purpose
- - Context awareness: Content should feel like a natural part of the existing tutorial, not a standalone addition. Reference the tutorial's specific context, datasets, and examples.
+ - Context awareness: Content should feel like a natural part of the existing document, not a standalone addition. Reference the document's specific context, datasets, and examples when available.
+ - Biological accuracy: For biomedical/bioinformatics content, ensure technical accuracy. If unsure about biological or computational details, keep descriptions general rather than inventing specifics.
  - If the section does not fit the above, produce content that directly addresses the guidance provided.
 
  OUTPUT FORMAT
@@ -74,40 +76,72 @@ OUTPUT FORMAT
  """
 
  LLM_FULLDOC_PROMPT = """
- You are "BioGuider," a documentation rewriter.
+ You are "BioGuider," a documentation rewriter with enhanced capabilities for complex documents.
 
  GOAL
- Rewrite a complete target document using only the provided evaluation report signals and the repository context excerpts. Output a full, ready-to-publish markdown file that is more complete and directly usable.
+ Rewrite a complete target document using only the provided evaluation report signals and the repository context excerpts. Output a full, ready-to-publish markdown file that is more complete and directly usable. You now have increased token capacity to handle complex documents comprehensively.
 
  INPUTS (authoritative)
  - evaluation_report (structured JSON excerpts): <<{evaluation_report}>>
  - target_file: {target_file}
  - repo_context_excerpt (do not copy blindly; use only to keep style/tone): <<{context}>>
 
+ CRITICAL: SINGLE DOCUMENT WITH MULTIPLE IMPROVEMENTS
+ This file requires improvements from {total_suggestions} separate evaluation suggestions. You must:
+ 1. **Read ALL {total_suggestions} suggestions** in the evaluation_report before writing
+ 2. **Integrate ALL suggestions into ONE cohesive document** - do NOT create {total_suggestions} separate versions
+ 3. **Weave improvements together naturally** - related suggestions should enhance the same sections
+ 4. **Write the document ONCE** with all improvements incorporated throughout
+
+ INTEGRATION STRATEGY
+ - Identify which suggestions target similar topics (e.g., setup, reproducibility, performance)
+ - Group related improvements and apply them to the same document sections
+ - For tutorial files: Enhance existing sections with all relevant suggestions, don't create duplicate sections
+ - For documentation files: Merge suggestions into existing structure, avoid redundant sections
+ - Result: ONE enhanced document that addresses all {total_suggestions} suggestions simultaneously
+
+ CAPACITY AND SCOPE
+ - You have enhanced token capacity to handle complex documents comprehensively
+ - Tutorial documents: Enhanced capacity for step-by-step content, code examples, and comprehensive explanations
+ - Complex documents: Increased capacity for multiple sections, detailed explanations, and extensive content
+ - Comprehensive documents: Full capacity for complete documentation with all necessary sections
+
  STRICT CONSTRAINTS
- - Base the content solely on the evaluation report. Do not invent features, data, or claims not supported by it.
+ - Base the content solely on the evaluation report and repo context. Do not invent features, data, or claims not supported by these sources.
+ - CRITICAL: NEVER invent technical specifications including:
+   * Hardware requirements (RAM, CPU, disk space) unless explicitly stated in guidance/context
+   * Version numbers for dependencies unless explicitly stated in guidance/context
+   * Performance metrics, benchmarks, or timing estimates
+   * Biological/computational parameters or thresholds without evidence
+   * Installation commands or package names not found in the repo context
  - Prefer completeness and usability: produce the full file content, not just minimal "added" snippets.
  - Preserve top-of-file badges/logos if they exist in the original; keep title and header area intact unless the report requires changes.
  - CRITICAL: Preserve the original document structure, sections, and flow. Only enhance existing content and add missing information.
- - For tutorial files (.Rmd), maintain all original sections (Docker, installation methods, etc.) while improving clarity and adding missing details.
+ - For tutorial files, maintain all original sections while improving clarity and adding missing details based on evaluation suggestions.
  - Fix obvious errors; improve structure and readability per report suggestions.
  - Include ONLY sections specifically requested by the evaluation report - do not add unnecessary sections.
  - Avoid redundancy: do not duplicate information across multiple sections.
  - ABSOLUTELY FORBIDDEN: Do NOT add summary sections, notes, conclusions, or any text at the end of documents
- - ABSOLUTELY FORBIDDEN: Do NOT wrap the entire document inside markdown code fences (```markdown). Do NOT start with ```markdown or end with ```. Return pure markdown content suitable for copy/paste.
+ - ABSOLUTELY FORBIDDEN: Do NOT wrap the entire document inside markdown code fences (```markdown). Do NOT start with ```markdown or end with ```. Return pure content suitable for copy/paste.
  - ABSOLUTELY FORBIDDEN: Do NOT add phrases like "Happy analyzing!" or any concluding statements
  - Keep links well-formed; keep neutral, professional tone; concise, skimmable formatting.
- - For RMarkdown files (.Rmd), preserve YAML frontmatter exactly and do not wrap content in code fences.
+ - Preserve file-specific formatting (e.g., YAML frontmatter, code fence syntax) and do not wrap content in extra code fences.
+
+ COMPLETENESS REQUIREMENTS
+ - Generate complete, comprehensive content that addresses all evaluation suggestions
+ - For complex documents, ensure all sections are fully developed and detailed
+ - For tutorial documents, include complete step-by-step instructions with examples
+ - Use the increased token capacity to provide thorough, useful documentation
 
  OUTPUT
  - Return only the full markdown content for {target_file}. No commentary, no fences.
  """
 
  LLM_README_COMPREHENSIVE_PROMPT = """
- You are "BioGuider," a comprehensive documentation rewriter specializing in README files.
+ You are "BioGuider," a comprehensive documentation rewriter specializing in README files with enhanced capacity for complex documentation.
 
  GOAL
- Create a complete, professional README.md that addresses all evaluation suggestions comprehensively. This is the main project documentation that users will see first.
+ Create a complete, professional README.md that addresses all evaluation suggestions comprehensively. This is the main project documentation that users will see first. You now have increased token capacity to create thorough, comprehensive documentation.
 
  INPUTS (authoritative)
  - evaluation_report (structured JSON excerpts): <<{evaluation_report}>>
@@ -124,12 +158,26 @@ COMPREHENSIVE README REQUIREMENTS
  - Make it copy-paste ready for users
  - Use professional, clear language suitable for biomedical researchers
 
+ ENHANCED CAPACITY FEATURES
+ - You have increased token capacity to create comprehensive documentation
+ - Include detailed explanations, multiple examples, and thorough coverage
+ - Provide extensive installation instructions with platform-specific details
+ - Add comprehensive usage examples with different scenarios
+ - Include detailed API documentation if applicable
+ - Provide troubleshooting guides with common issues and solutions
+
  STRICT CONSTRAINTS
  - Base the content solely on the evaluation report. Do not invent features, data, or claims not supported by it.
  - ABSOLUTELY FORBIDDEN: Do NOT wrap the entire document inside markdown code fences (```markdown). Return pure markdown content.
  - ABSOLUTELY FORBIDDEN: Do NOT add summary sections, notes, conclusions, or any text at the end of documents
  - Keep links well-formed; use neutral, professional tone; concise, skimmable formatting.
 
+ COMPLETENESS REQUIREMENTS
+ - Generate complete, comprehensive content that addresses all evaluation suggestions
+ - Ensure all sections are fully developed and detailed
+ - Use the increased token capacity to provide thorough, useful documentation
+ - Include all necessary information for users to successfully use the software
+
  OUTPUT
  - Return only the full README.md content. No commentary, no fences.
  """
@@ -139,6 +187,322 @@ class LLMContentGenerator:
      def __init__(self, llm: BaseChatOpenAI):
          self.llm = llm
 
+     def _detect_truncation(self, content: str, target_file: str) -> bool:
+         """
+         Detect if content appears to be truncated, based on common patterns.
+         Universal detection for all file types.
+
+         Args:
+             content: Generated content to check
+             target_file: Target file path for context
+
+         Returns:
+             True if content appears truncated, False otherwise
+         """
+         if not content or len(content.strip()) < 100:
+             return True
+
+         # 1. Check for very short content (applies to all files)
+         # Only flag as truncated if content is very short (< 1500 chars)
+         if len(content) < 1500:
+             return True
+
+         # 2. Check for incomplete code blocks (any language):
+         # an odd number of code fences suggests truncation
+         code_fence_count = content.count('```')
+         if code_fence_count > 0 and code_fence_count % 2 != 0:
+             return True
+
+         # 3. Check for specific language code blocks
+         if target_file.endswith('.Rmd'):
+             # R chunks should be complete
+             r_chunks_open = re.findall(r'```\{r[^}]*\}', content)
+             if r_chunks_open and not content.rstrip().endswith('```'):
+                 # Has R chunks but doesn't end with a closing fence
+                 return True
+
+         if target_file.endswith(('.py', '.js', '.ts', '.java', '.cpp', '.c')):
+             # Check for incomplete class/function definitions
+             lines = content.split('\n')
+             last_lines = [line.strip() for line in lines[-5:] if line.strip()]
+             if last_lines:
+                 last_line = last_lines[-1]
+                 if (last_line.endswith(':') or
+                         last_line.endswith('{') or
+                         last_line.endswith('(') or
+                         'def ' in last_line or
+                         'class ' in last_line or
+                         'function ' in last_line):
+                     return True
+
+         # 4. Check for incomplete markdown sections (applies to all markdown-like files)
+         if any(target_file.endswith(ext) for ext in ['.md', '.Rmd', '.rst', '.txt']):
+             lines = content.split('\n')
+             last_non_empty_line = None
+             for line in reversed(lines):
+                 if line.strip():
+                     last_non_empty_line = line.strip()
+                     break
+
+             if last_non_empty_line:
+                 # Check if the last line looks incomplete
+                 incomplete_endings = [
+                     '##',    # Header without content
+                     '###',   # Header without content
+                     '####',  # Header without content
+                     '-',     # List item
+                     '*',     # List item or emphasis
+                     ':',     # Definition or label
+                     '|',     # Table row
+                 ]
+
+                 for ending in incomplete_endings:
+                     if last_non_empty_line.endswith(ending):
+                         return True
+
+             # Check if the content ends with incomplete patterns
+             content_end = content[-300:].strip().lower()
+             incomplete_patterns = [
+                 '## ',        # Section header without content
+                 '### ',       # Subsection without content
+                 '#### ',      # Sub-subsection without content
+                 '```{',       # Incomplete code chunk
+                 '```r',       # Incomplete R chunk
+                 '```python',  # Incomplete Python chunk
+             ]
+
+             for pattern in incomplete_patterns:
+                 if content_end.endswith(pattern.lower()):
+                     return True
+
+         return False
+
+     def _appears_complete(self, content: str, target_file: str) -> bool:
+         """
+         Check if content appears to be complete based on structure and patterns.
+         Universal completion check for all file types.
+
+         Args:
+             content: Generated content to check
+             target_file: Target file path for context
+
+         Returns:
+             True if content appears complete, False if it needs continuation
+         """
+         if not content or len(content.strip()) < 100:
+             return False
+
+         # 1. Check for balanced code blocks (applies to all files)
+         code_block_count = content.count('```')
+         if code_block_count > 0 and code_block_count % 2 != 0:
+             # Unbalanced code blocks suggest incomplete content
+             return False
+
+         # 2. File-type-specific checks
+
+         # RMarkdown files
+         if target_file.endswith('.Rmd'):
+             # Check for proper YAML frontmatter
+             if not content.startswith('---'):
+                 return False
+
+             # Check for conclusion patterns
+             conclusion_patterns = [
+                 'sessionInfo()',
+                 'session.info()',
+                 '## Conclusion',
+                 '## Summary',
+                 '## Session Info',
+                 '</details>',
+                 'knitr::knit(',
+             ]
+
+             content_lower = content.lower()
+             has_conclusion = any(pattern.lower() in content_lower for pattern in conclusion_patterns)
+
+             # If we have a conclusion and balanced code blocks, likely complete
+             if has_conclusion and code_block_count > 0:
+                 return True
+
+         # Markdown files
+         if target_file.endswith('.md'):
+             # Check for conclusion sections
+             conclusion_patterns = [
+                 '## Conclusion',
+                 '## Summary',
+                 '## Next Steps',
+                 '## Further Reading',
+                 '## References',
+                 '## License',
+             ]
+
+             content_lower = content.lower()
+             has_conclusion = any(pattern.lower() in content_lower for pattern in conclusion_patterns)
+
+             if has_conclusion and len(content) > 2000:
+                 return True
+
+         # Python files
+         if target_file.endswith('.py'):
+             # Check for balanced brackets/parentheses
+             if content.count('(') != content.count(')'):
+                 return False
+             if content.count('[') != content.count(']'):
+                 return False
+             if content.count('{') != content.count('}'):
+                 return False
+
+             # Check for a complete structure (reasonable length + proper ending)
+             lines = [line for line in content.split('\n') if line.strip()]
+             if len(lines) > 20:  # Has reasonable content
+                 last_line = lines[-1].strip()
+                 # Should not end with incomplete statements
+                 if not (last_line.endswith(':') or
+                         last_line.endswith('\\') or
+                         last_line.endswith(',')):
+                     return True
+
+         # JavaScript/TypeScript files
+         if target_file.endswith(('.js', '.ts', '.jsx', '.tsx')):
+             # Check for balanced brackets
+             if content.count('{') != content.count('}'):
+                 return False
+             if content.count('(') != content.count(')'):
+                 return False
+
+             lines = [line for line in content.split('\n') if line.strip()]
+             if len(lines) > 20:
+                 last_line = lines[-1].strip()
+                 # Complete if it ends with proper syntax
+                 if (last_line.endswith('}') or
+                         last_line.endswith(';') or
+                         last_line.endswith('*/') or
+                         last_line.startswith('//')):
+                     return True
+
+         # 3. Generic checks for all file types
+         if len(content) > 3000:  # Reasonable length
+             # Check if it ends with complete sentences/sections
+             lines = content.split('\n')
+             last_lines = [line.strip() for line in lines[-10:] if line.strip()]
+
+             if last_lines:
+                 last_line = last_lines[-1]
+                 # Complete if it ends with proper punctuation or closing tags
+                 complete_endings = [
+                     '.',           # Sentence
+                     '```',         # Code block
+                     '---',         # Section divider
+                     '</details>',  # HTML details
+                     '}',           # Closing brace
+                     ';',           # Statement end
+                     '*/',          # Comment end
+                 ]
+
+                 if any(last_line.endswith(ending) for ending in complete_endings):
+                     return True
+
+         return False
+
+     def _generate_continuation(self, target_file: str, evaluation_report: dict,
+                                context: str, existing_content: str) -> tuple[str, dict]:
+         """
+         Generate continuation content from where the previous generation left off.
+
+         Args:
+             target_file: Target file path
+             evaluation_report: Evaluation report data
+             context: Repository context
+             existing_content: Previously generated content
+
+         Returns:
+             Tuple of (continuation_content, token_usage)
+         """
+         # Create an LLM for continuation (uses 16k tokens by default)
+         from bioguider.agents.agent_utils import get_llm
+         import os
+
+         llm = get_llm(
+             api_key=os.environ.get("OPENAI_API_KEY"),
+             model_name=os.environ.get("OPENAI_MODEL", "gpt-4o"),
+             azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
+             api_version=os.environ.get("OPENAI_API_VERSION"),
+             azure_deployment=os.environ.get("OPENAI_DEPLOYMENT_NAME"),
+         )
+
+         conv = CommonConversation(llm)
+
+         # Calculate total suggestions for the prompt
+         total_suggestions = 1
+         if isinstance(evaluation_report, dict):
+             if "total_suggestions" in evaluation_report:
+                 total_suggestions = evaluation_report["total_suggestions"]
+             elif "suggestions" in evaluation_report and isinstance(evaluation_report["suggestions"], list):
+                 total_suggestions = len(evaluation_report["suggestions"])
+
+         continuation_prompt = f"""
+         You are "BioGuider," continuing a documentation generation task with enhanced capacity for complex documents.
+
+         GOAL
+         Continue generating the document "{target_file}" from where the previous generation left off.
+         The previous content was truncated and needs to be completed. You now have increased token
+         capacity to handle complex documents comprehensively.
+
+         PREVIOUS CONTENT (do not repeat this):
+         ```
+         {existing_content[-1000:]}
+         ```
+
+         TASK
+         Continue the document naturally from the last complete section. Maintain the same style,
+         structure, and flow as the previous content. Complete all remaining sections that should
+         be in this document.
+
+         CAPACITY AND SCOPE
+         - You have enhanced token capacity to handle complex documents comprehensively
+         - Tutorial documents: Enhanced capacity for step-by-step content, code examples, and comprehensive explanations
+         - Complex documents: Increased capacity for multiple sections, detailed explanations, and extensive content
+         - Comprehensive documents: Full capacity for complete documentation with all necessary sections
+
+         INPUTS
+         - evaluation_report (contains {total_suggestions} suggestions to integrate): {json.dumps(evaluation_report)[:4000]}
+         - context: {context[:2000]}
+
+         REMINDER: SINGLE DOCUMENT APPROACH
+         - The evaluation report contains {total_suggestions} SEPARATE suggestions
+         - These should be integrated into ONE cohesive continuation
+         - Do NOT create {total_suggestions} separate sections for each suggestion
+         - Group related suggestions (e.g., setup, reproducibility, performance) and integrate them naturally
+
+         REQUIREMENTS
+         - Continue seamlessly from the previous content
+         - Maintain the same tone and style
+         - Complete all sections that should be in this document
+         - Preserve file-specific formatting (e.g., YAML frontmatter, code block syntax appropriate to the language)
+         - Do not repeat content already generated
+         - Return only the continuation content, not the full document
+         - Use the increased token capacity to provide thorough, complete content
+         - NEVER invent technical specifications (hardware, versions, performance) unless explicitly in evaluation report or context
+         - ABSOLUTELY FORBIDDEN: Do NOT wrap content in markdown code fences (```markdown). Return pure content only.
+         - ABSOLUTELY FORBIDDEN: Do NOT add summary sections, notes, conclusions, or any text at the end of documents
+
+         COMPLETENESS REQUIREMENTS
+         - Generate complete, comprehensive content that addresses all remaining evaluation suggestions
+         - For complex documents, ensure all sections are fully developed and detailed
+         - For tutorial documents, include complete step-by-step instructions with examples
+         - Use the increased token capacity to provide thorough, useful documentation
+
+         OUTPUT
+         Return only the continuation content that should be appended to the existing content.
+         """
+
+         content, token_usage = conv.generate(
+             system_prompt=continuation_prompt,
+             instruction_prompt="Continue the document from where it left off."
+         )
+         return content.strip(), token_usage
+
      def generate_section(self, suggestion: SuggestionItem, style: StyleProfile, context: str = "") -> tuple[str, dict]:
          conv = CommonConversation(self.llm)
          section_name = suggestion.anchor_hint or suggestion.category.split(".")[-1].replace("_", " ").title()
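
Taken together, `_detect_truncation` and `_appears_complete` act as a two-sided gate for the continuation loop introduced later in this diff: another round is requested only while truncation is suspected, the draft does not already look complete, and the retry budget is not exhausted. A rough sketch of that decision, assuming `gen` is an `LLMContentGenerator` instance (the helper name and signature are ours, for illustration):

```python
def needs_continuation(gen, draft: str, target_file: str,
                       rounds_used: int, max_rounds: int = 3) -> bool:
    # Stop once the continuation budget is exhausted.
    if rounds_used >= max_rounds:
        return False
    # No suspected truncation means no continuation.
    if not gen._detect_truncation(draft, target_file):
        return False
    # Even when truncation is suspected, a draft that already looks complete
    # (balanced fences, conclusion section, ...) is left alone.
    return not gen._appears_complete(draft, target_file)
```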
@@ -158,7 +522,43 @@ class LLMContentGenerator:
          return content.strip(), token_usage
 
      def generate_full_document(self, target_file: str, evaluation_report: dict, context: str = "") -> tuple[str, dict]:
-         conv = CommonConversation(self.llm)
+         # Create LLM (uses 16k tokens by default - enough for any document)
+         from bioguider.agents.agent_utils import get_llm
+         import os
+         import json
+         from datetime import datetime
+
+         # Get LLM with the default 16k token limit
+         llm = get_llm(
+             api_key=os.environ.get("OPENAI_API_KEY"),
+             model_name=os.environ.get("OPENAI_MODEL", "gpt-4o"),
+             azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
+             api_version=os.environ.get("OPENAI_API_VERSION"),
+             azure_deployment=os.environ.get("OPENAI_DEPLOYMENT_NAME"),
+         )
+
+         conv = CommonConversation(llm)
+
+         # Debug: save generation settings and context
+         debug_info = {
+             "target_file": target_file,
+             "timestamp": datetime.now().isoformat(),
+             "evaluation_report": evaluation_report,
+             "context_length": len(context),
+             "llm_settings": {
+                 "model_name": os.environ.get("OPENAI_MODEL", "gpt-4o"),
+                 "azure_deployment": os.environ.get("OPENAI_DEPLOYMENT_NAME"),
+                 "max_tokens": getattr(llm, 'max_tokens', 16384)
+             }
+         }
+
+         # Save debug info to file
+         debug_dir = "outputs/debug_generation"
+         os.makedirs(debug_dir, exist_ok=True)
+         safe_filename = target_file.replace("/", "_").replace(".", "_")
+         debug_file = os.path.join(debug_dir, f"{safe_filename}_debug.json")
+         with open(debug_file, 'w', encoding='utf-8') as f:
+             json.dump(debug_info, f, indent=2, ensure_ascii=False)
 
          # Use the comprehensive README prompt for README.md files
          if target_file.endswith("README.md"):
@@ -168,13 +568,204 @@ class LLMContentGenerator:
              context=context[:4000],
          )
      else:
+         # Calculate total suggestions for the prompt
+         total_suggestions = 1
+         if isinstance(evaluation_report, dict):
+             if "total_suggestions" in evaluation_report:
+                 total_suggestions = evaluation_report["total_suggestions"]
+             elif "suggestions" in evaluation_report and isinstance(evaluation_report["suggestions"], list):
+                 total_suggestions = len(evaluation_report["suggestions"])
+
          system_prompt = LLM_FULLDOC_PROMPT.format(
              target_file=target_file,
              evaluation_report=json.dumps(evaluation_report)[:6000],
              context=context[:4000],
+             total_suggestions=total_suggestions,
          )
 
+     # Save the initial prompt for debugging
+     prompt_file = os.path.join(debug_dir, f"{safe_filename}_prompt.txt")
+     with open(prompt_file, 'w', encoding='utf-8') as f:
+         f.write("=== SYSTEM PROMPT ===\n")
+         f.write(system_prompt)
+         f.write("\n\n=== INSTRUCTION PROMPT ===\n")
+         f.write("Write the full document now.")
+         f.write("\n\n=== EVALUATION REPORT ===\n")
+         f.write(json.dumps(evaluation_report, indent=2))
+         f.write("\n\n=== CONTEXT ===\n")
+         f.write(context[:2000] + "..." if len(context) > 2000 else context)
+
+     # Initial generation
      content, token_usage = conv.generate(system_prompt=system_prompt, instruction_prompt="Write the full document now.")
-     return content.strip(), token_usage
+     content = content.strip()
+
+     # Save the initial generation for debugging
+     generation_file = os.path.join(debug_dir, f"{safe_filename}_generation_0.txt")
+     with open(generation_file, 'w', encoding='utf-8') as f:
+         f.write("=== INITIAL GENERATION ===\n")
+         f.write(f"Tokens: {token_usage}\n")
+         f.write(f"Length: {len(content)} characters\n")
+         f.write(f"Truncation detected: {self._detect_truncation(content, target_file)}\n")
+         f.write("\n=== CONTENT ===\n")
+         f.write(content)
+
+     # Check for truncation and continue if needed
+     max_continuations = 3  # Limit to prevent infinite loops
+     continuation_count = 0
+
+     while (self._detect_truncation(content, target_file) and
+            continuation_count < max_continuations):
+
+         # Additional check: if the content appears complete, don't continue
+         if self._appears_complete(content, target_file):
+             break
+         continuation_count += 1
+
+         # Save the continuation prompt for debugging
+         continuation_prompt_file = os.path.join(debug_dir, f"{safe_filename}_continuation_{continuation_count}_prompt.txt")
+         continuation_prompt = f"""
+         You are "BioGuider," continuing a documentation generation task with enhanced capacity for complex documents.
+
+         GOAL
+         Continue generating the document "{target_file}" from where the previous generation left off.
+         The previous content was truncated and needs to be completed. You now have increased token
+         capacity to handle complex documents comprehensively.
+
+         PREVIOUS CONTENT (do not repeat this):
+         ```
+         {content[-1000:]}
+         ```
+
+         TASK
+         Continue the document naturally from the last complete section. Maintain the same style,
+         structure, and flow as the previous content. Complete all remaining sections that should
+         be in this document.
+
+         CRITICAL REQUIREMENTS:
+         - Do NOT repeat any content already generated above
+         - Do NOT duplicate sections, headers, or code blocks that already exist
+         - Generate ONLY new, unique content that continues from where the previous content ended
+         - If the previous content appears complete, add complementary sections that enhance the document
+         - Focus on adding missing sections, examples, or explanations that weren't covered
+
+         CAPACITY AND SCOPE
+         - You have enhanced token capacity to handle complex documents comprehensively
+         - Tutorial documents: Enhanced capacity for step-by-step content, code examples, and comprehensive explanations
+         - Complex documents: Increased capacity for multiple sections, detailed explanations, and extensive content
+         - Comprehensive documents: Full capacity for complete documentation with all necessary sections
+
+         INPUTS
+         - evaluation_report (contains {total_suggestions} suggestions to integrate): {json.dumps(evaluation_report)[:4000]}
+         - context: {context[:2000]}
+
+         REMINDER: SINGLE DOCUMENT APPROACH
+         - The evaluation report contains {total_suggestions} SEPARATE suggestions
+         - These should be integrated into ONE cohesive continuation
+         - Do NOT create {total_suggestions} separate sections for each suggestion
+         - Group related suggestions (e.g., setup, reproducibility, performance) and integrate them naturally
+
+         REQUIREMENTS
+         - Continue seamlessly from the previous content
+         - Maintain the same tone and style
+         - Complete all sections that should be in this document
+         - Preserve file-specific formatting (e.g., YAML frontmatter, code block syntax appropriate to the language)
+         - Do not repeat content already generated
+         - Return only the continuation content, not the full document
+         - Use the increased token capacity to provide thorough, complete content
+         - NEVER invent technical specifications (hardware, versions, performance) unless explicitly in evaluation report or context
+         - ABSOLUTELY FORBIDDEN: Do NOT wrap content in markdown code fences (```markdown). Return pure content only.
+         - ABSOLUTELY FORBIDDEN: Do NOT add summary sections, notes, conclusions, or any text at the end of documents
+
+         COMPLETENESS REQUIREMENTS
+         - Generate complete, comprehensive content that addresses all remaining evaluation suggestions
+         - For complex documents, ensure all sections are fully developed and detailed
+         - For tutorial documents, include complete step-by-step instructions with examples
+         - Use the increased token capacity to provide thorough, useful documentation
+
+         OUTPUT
+         - Return only the continuation content. No commentary, no fences.
+         """
+
+         with open(continuation_prompt_file, 'w', encoding='utf-8') as f:
+             f.write(continuation_prompt)
+
+         # Generate the continuation
+         continuation_content, continuation_usage = self._generate_continuation(
+             target_file=target_file,
+             evaluation_report=evaluation_report,
+             context=context,
+             existing_content=content
+         )
+
+         # Save the continuation generation for debugging
+         continuation_file = os.path.join(debug_dir, f"{safe_filename}_continuation_{continuation_count}.txt")
+         with open(continuation_file, 'w', encoding='utf-8') as f:
+             f.write(f"=== CONTINUATION {continuation_count} ===\n")
+             f.write(f"Tokens: {continuation_usage}\n")
+             f.write(f"Length: {len(continuation_content)} characters\n")
+             f.write(f"Truncation detected: {self._detect_truncation(continuation_content, target_file)}\n")
+             f.write("\n=== CONTENT ===\n")
+             f.write(continuation_content)
+
+         # Merge the continuation with the existing content
+         if continuation_content:
+             content += "\n\n" + continuation_content
+             # Update token usage
+             token_usage = {
+                 "total_tokens": token_usage.get("total_tokens", 0) + continuation_usage.get("total_tokens", 0),
+                 "prompt_tokens": token_usage.get("prompt_tokens", 0) + continuation_usage.get("prompt_tokens", 0),
+                 "completion_tokens": token_usage.get("completion_tokens", 0) + continuation_usage.get("completion_tokens", 0),
+             }
+
+             # Save the merged content for debugging
+             merged_file = os.path.join(debug_dir, f"{safe_filename}_merged_{continuation_count}.txt")
+             with open(merged_file, 'w', encoding='utf-8') as f:
+                 f.write(f"=== MERGED CONTENT AFTER CONTINUATION {continuation_count} ===\n")
+                 f.write(f"Total length: {len(content)} characters\n")
+                 f.write(f"Truncation detected: {self._detect_truncation(content, target_file)}\n")
+                 f.write("\n=== CONTENT ===\n")
+                 f.write(content)
+         else:
+             # If the continuation is empty, break to avoid an infinite loop
+             break
+
+     # Clean up any markdown code fences that might have been added
+     content = self._clean_markdown_fences(content)
+
+     # Save the final cleaned content for debugging
+     final_file = os.path.join(debug_dir, f"{safe_filename}_final.txt")
+     with open(final_file, 'w', encoding='utf-8') as f:
+         f.write("=== FINAL CLEANED CONTENT ===\n")
+         f.write(f"Total tokens: {token_usage}\n")
+         f.write(f"Final length: {len(content)} characters\n")
+         f.write(f"Continuations used: {continuation_count}\n")
+         f.write("\n=== CONTENT ===\n")
+         f.write(content)
+
+     return content, token_usage
+
+ def _clean_markdown_fences(self, content: str) -> str:
+     """
+     Remove markdown code fences that shouldn't be in the final content.
+     """
+     # Remove ```markdown at the beginning
+     if content.startswith('```markdown\n'):
+         content = content[12:]  # Remove the leading ```markdown\n
+
+     # Remove ``` at the end
+     if content.endswith('\n```'):
+         content = content[:-4]  # Remove the trailing \n```
+     elif content.endswith('```'):
+         content = content[:-3]  # Remove the trailing ```
+
+     # Remove any standalone ```markdown lines
+     lines = content.split('\n')
+     cleaned_lines = []
+     for line in lines:
+         if line.strip() == '```markdown':
+             continue
+         cleaned_lines.append(line)
+
+     return '\n'.join(cleaned_lines)
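
A quick illustration of what the fence cleaner strips, on hypothetical model output (`gen` again stands for an `LLMContentGenerator` instance). The leading ```markdown line and the trailing fence are removed, while the ordinary language fence inside the document is kept:

```python
FENCE = "`" * 3  # three backticks, built here so this example renders cleanly

raw = FENCE + "markdown\n# Title\n\n" + FENCE + "r\nsessionInfo()\n" + FENCE + "\n" + FENCE
cleaned = gen._clean_markdown_fences(raw)
assert cleaned == "# Title\n\n" + FENCE + "r\nsessionInfo()\n" + FENCE
```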
@@ -34,6 +34,7 @@ class DocumentationGenerationManager:
          self.output = OutputManager(base_outputs_dir=output_dir)
          self.llm_gen = LLMContentGenerator(llm)
          self.llm_cleaner = LLMCleaner(llm)
+
 
      def print_step(self, step_name: str | None = None, step_output: str | None = None):
          if self.step_callback is None:
@@ -143,54 +144,175 @@ class DocumentationGenerationManager:
          self.print_step(step_name="ProcessingFile", step_output=f"Processing {fpath} ({processed_files}/{total_files}) - {len(edits)} edits")
 
          original_content = files.get(fpath, "")
+
+         # Group suggestions by file to avoid duplicate generation
+         file_suggestions = []
+         full_replace_edits = []
+         section_edits = []
+
+         for e in edits:
+             suggestion = next((s for s in suggestions if s.id == e.suggestion_id), None) if e.suggestion_id else None
+             if suggestion:
+                 file_suggestions.append(suggestion)
+             if e.edit_type == "full_replace":
+                 full_replace_edits.append(e)
+             else:
+                 section_edits.append(e)
+
+         # Debug: save suggestion grouping info
+         import json
+         import os
+         from datetime import datetime
+
+         debug_dir = "outputs/debug_generation"
+         os.makedirs(debug_dir, exist_ok=True)
+         safe_filename = fpath.replace("/", "_").replace(".", "_")
+
+         grouping_info = {
+             "file_path": fpath,
+             "total_edits": len(edits),
+             "file_suggestions_count": len(file_suggestions),
+             "full_replace_edits_count": len(full_replace_edits),
+             "section_edits_count": len(section_edits),
+             "suggestions": [
+                 {
+                     "id": s.id,
+                     "category": s.category,
+                     "content_guidance": s.content_guidance[:200] + "..." if len(s.content_guidance or "") > 200 else s.content_guidance,
+                     "target_files": s.target_files
+                 } for s in file_suggestions
+             ],
+             "timestamp": datetime.now().isoformat()
+         }
+
+         grouping_file = os.path.join(debug_dir, f"{safe_filename}_grouping.json")
+         with open(grouping_file, 'w', encoding='utf-8') as f:
+             json.dump(grouping_info, f, indent=2, ensure_ascii=False)
+
          content = original_content
          total_stats = {"added_lines": 0}
-         for e in edits:
-             context = original_content
-             if not e.content_template or e.content_template.strip() == "":
-                 # Generate LLM content - use full document generation for full_replace, section generation for others
-                 suggestion = next((s for s in suggestions if s.id == e.suggestion_id), None) if e.suggestion_id else None
-                 if suggestion:
-                     if e.edit_type == "full_replace":
-                         self.print_step(step_name="GeneratingContent", step_output=f"Generating full document for {e.suggestion_id} using LLM...")
+
+         # CRITICAL: Generate content ONCE per file if there are full_replace edits.
+         # All suggestions for this file are merged into a single evaluation report;
+         # this prevents duplicate content generation.
+         if full_replace_edits:
+             self.print_step(
+                 step_name="GeneratingContent",
+                 step_output=f"🔄 Generating full document for {fpath} with {len(file_suggestions)} suggestions using LLM (SINGLE CALL)..."
+             )
+
+             # Merge all suggestions for this file into a single evaluation report.
+             # Number the suggestions clearly so the LLM treats them as separate improvements.
+             suggestions_list = []
+             for idx, s in enumerate(file_suggestions, 1):
+                 suggestions_list.append({
+                     "suggestion_number": idx,
+                     "category": s.category if hasattr(s, 'category') else "general",
+                     "content_guidance": s.content_guidance,
+                     "evidence": s.source.get("evidence", "") if s.source else ""
+                 })
+
+             merged_evaluation_report = {
+                 "total_suggestions": len(file_suggestions),
+                 "integration_instruction": f"Integrate ALL {len(file_suggestions)} suggestions below into ONE cohesive document. Do NOT create {len(file_suggestions)} separate versions.",
+                 "suggestions": suggestions_list
+             }
+
+             # Debug: save the merged evaluation report
+             merged_report_file = os.path.join(debug_dir, f"{safe_filename}_merged_report.json")
+             with open(merged_report_file, 'w', encoding='utf-8') as f:
+                 json.dump(merged_evaluation_report, f, indent=2, ensure_ascii=False)
+
+             # Debug: log that we're about to make a single generation call
+             debug_log_file = os.path.join(debug_dir, f"{safe_filename}_generation_log.txt")
+             with open(debug_log_file, 'a', encoding='utf-8') as f:
+                 f.write(f"\n=== GENERATION CALL at {datetime.now().isoformat()} ===\n")
+                 f.write(f"File: {fpath}\n")
+                 f.write(f"Full replace edits: {len(full_replace_edits)}\n")
+                 f.write(f"Total suggestions: {len(file_suggestions)}\n")
+                 f.write("Merged into single call: YES\n")
+                 f.write(f"Suggestion IDs: {[s.id for s in file_suggestions]}\n\n")
+
+             gen_content, gen_usage = self.llm_gen.generate_full_document(
+                 target_file=fpath,
+                 evaluation_report=merged_evaluation_report,
+                 context=original_content,
+             )
+
+             # Debug: log completion
+             with open(debug_log_file, 'a', encoding='utf-8') as f:
+                 f.write(f"Generation completed at {datetime.now().isoformat()}\n")
+                 f.write(f"Content length: {len(gen_content) if isinstance(gen_content, str) else 0} characters\n")
+                 f.write(f"Tokens used: {gen_usage.get('total_tokens', 0)}\n")
+                 # bool(...) so the log records True/False rather than the whole document
+                 f.write(f"SUCCESS: {isinstance(gen_content, str) and bool(gen_content)}\n\n")
+
+             if isinstance(gen_content, str) and gen_content:
+                 self.print_step(step_name="LLMFullDoc", step_output=f"✓ Generated full document for {fpath} ({gen_usage.get('total_tokens', 0)} tokens)")
+                 # Apply the generated content to all full_replace edits
+                 for e in full_replace_edits:
+                     e.content_template = gen_content
+                 content = gen_content
+             else:
+                 # Fallback: try individual generation, but only for the first edit to avoid duplicates
+                 if full_replace_edits:
+                     e = full_replace_edits[0]  # Only process the first edit
+                     suggestion = next((s for s in suggestions if s.id == e.suggestion_id), None) if e.suggestion_id else None
+                     if suggestion and (not e.content_template or e.content_template.strip() == ""):
+                         self.print_step(step_name="GeneratingContent", step_output=f"Fallback: Generating full document for {e.suggestion_id} using LLM...")
                          gen_content, gen_usage = self.llm_gen.generate_full_document(
                              target_file=e.file_path,
                              evaluation_report={"suggestion": suggestion.content_guidance, "evidence": suggestion.source.get("evidence", "") if suggestion.source else ""},
-                             context=context,
+                             context=original_content,
                          )
                          if isinstance(gen_content, str) and gen_content:
                              self.print_step(step_name="LLMFullDoc", step_output=f"✓ Generated full document for {e.suggestion_id} ({gen_usage.get('total_tokens', 0)} tokens)")
-                     e.content_template = gen_content
-                 else:
-                     self.print_step(step_name="GeneratingContent", step_output=f"Generating section for {e.suggestion_id} using LLM...")
-                     gen_section, gen_usage = self.llm_gen.generate_section(
-                         suggestion=suggestion,
-                         style=plan.style_profile,
-                         context=context,
-                     )
-                     if isinstance(gen_section, str) and gen_section:
-                         self.print_step(step_name="LLMSection", step_output=f" Generated section for {e.suggestion_id} ({gen_usage.get('total_tokens', 0)} tokens)")
-                         # Ensure header present
-                         if gen_section.lstrip().startswith("#"):
-                             e.content_template = gen_section
-                         else:
-                             title = e.anchor.get('value', '').strip() or ''
-                             e.content_template = f"## {title}\n\n{gen_section}" if title else gen_section
-             content, stats = self.renderer.apply_edit(content, e)
-             # After applying full document or section changes, run a general cleaner pass for all text files
-             # to fix markdown/formatting issues without changing meaning.
-             try:
-                 if fpath.endswith((".md", ".rst", ".Rmd", ".Rd")) and content:
-                     self.print_step(step_name="CleaningContent", step_output=f"Cleaning formatting for {fpath}...")
-                     cleaned, _usage = self.llm_cleaner.clean_readme(content)
-                     if isinstance(cleaned, str) and cleaned.strip():
-                         content = cleaned
+                             # Apply the same content to all full_replace edits
+                             for edit in full_replace_edits:
+                                 edit.content_template = gen_content
+                             content = gen_content
+         else:
+             # Handle section edits individually
+             for e in section_edits:
+                 suggestion = next((s for s in suggestions if s.id == e.suggestion_id), None) if e.suggestion_id else None
+                 if suggestion and (not e.content_template or e.content_template.strip() == ""):
+                     self.print_step(step_name="GeneratingContent", step_output=f"Generating section for {e.suggestion_id} using LLM...")
+                     gen_section, gen_usage = self.llm_gen.generate_section(
+                         suggestion=suggestion,
+                         style=plan.style_profile,
+                         context=original_content,
+                     )
+                     if isinstance(gen_section, str) and gen_section:
+                         self.print_step(step_name="LLMSection", step_output=f"✓ Generated section for {e.suggestion_id} ({gen_usage.get('total_tokens', 0)} tokens)")
+                         # Ensure a header is present
+                         if gen_section.lstrip().startswith("#"):
+                             e.content_template = gen_section
+                         else:
+                             title = e.anchor.get('value', '').strip() or ''
+                             e.content_template = f"## {title}\n\n{gen_section}" if title else gen_section
+
+                 content, stats = self.renderer.apply_edit(content, e)
+                 total_stats["added_lines"] = total_stats.get("added_lines", 0) + stats.get("added_lines", 0)
+
+         # Apply remaining edits that weren't full_replace
+         for e in edits:
+             if e.edit_type != "full_replace":
+                 content, stats = self.renderer.apply_edit(content, e)
+                 total_stats["added_lines"] = total_stats.get("added_lines", 0) + stats.get("added_lines", 0)
+
+         # After applying full document or section changes, run a general cleaner pass for all text files
+         # to fix markdown/formatting issues without changing meaning.
+         try:
+             if fpath.endswith((".md", ".rst", ".Rmd", ".Rd")) and content:
+                 self.print_step(step_name="CleaningContent", step_output=f"Cleaning formatting for {fpath}...")
+                 cleaned, _usage = self.llm_cleaner.clean_readme(content)
+                 if isinstance(cleaned, str) and cleaned.strip():
+                     content = cleaned
+
+                 # LLM cleaner now handles markdown fences and unwanted summaries
 
-             # LLM cleaner now handles markdown fences and unwanted summaries
-
-         except Exception:
-             pass
-         total_stats["added_lines"] = total_stats.get("added_lines", 0) + stats.get("added_lines", 0)
+         except Exception:
+             pass
+
          revised[fpath] = content
          diff_stats[fpath] = total_stats
          self.print_step(step_name="RenderedFile", step_output=f"✓ Completed {fpath} - added {total_stats['added_lines']} lines")
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: bioguider
- Version: 0.2.32
+ Version: 0.2.33
  Summary: An AI-Powered package to help biomedical developers to generate clear documentation
  License: MIT
  Author: Cankun Wang
@@ -2,7 +2,7 @@ bioguider/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  bioguider/agents/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  bioguider/agents/agent_task.py,sha256=TL0Zx8zOmiAVslmNbfMPQ38qTQ73QospY6Dwrwf8POg,2890
  bioguider/agents/agent_tools.py,sha256=r21wHV6a-Ic2T0dk4YzA-_d7PodHPM3GzRxJqv-llSw,7286
- bioguider/agents/agent_utils.py,sha256=dA2xbS7ojeNyGBigyCGGCZOZGGUSUpPVfCYgWLqi0ZA,15011
+ bioguider/agents/agent_utils.py,sha256=VL6ui13v0Fo6FGRvrak6Q04q1q7LWuKY6o0JP4fx8Os,16293
  bioguider/agents/collection_execute_step.py,sha256=jE_oSQZI5WDaz0bJjUWoAfqWfVbGUqN--cvITSWCGiI,5614
  bioguider/agents/collection_observe_step.py,sha256=1xOw6N3uIoyh4h4_vcULAc5x5KZ9G-zZo42AhRidyn8,5373
  bioguider/agents/collection_plan_step.py,sha256=Nn0f8AOkEDCDtnhaqE7yCQoi7PVpsHmiUcsIqC0T0dQ,5956
@@ -46,7 +46,7 @@ bioguider/generation/__init__.py,sha256=esV02QgCsY67-HBwSHDbA5AcbKzNRIT3wDwwh6N4
  bioguider/generation/change_planner.py,sha256=0N10jvkfn2J9b598FKOKPQecwmQv68yeuUvMZn81nOI,9715
  bioguider/generation/document_renderer.py,sha256=Md8NMo0CXNIqatWOdKE-_4k02Y3T_BCLmEPLTEiYUCA,7984
  bioguider/generation/llm_cleaner.py,sha256=qFgS5xi7bBO8HAJ9WFNzH3p9AhOsAkYjchKQHuAUWWM,2917
- bioguider/generation/llm_content_generator.py,sha256=UbRURH6RdEWBVVqQi96SlTkNEOt01yyuxr76jR8__GA,10983
+ bioguider/generation/llm_content_generator.py,sha256=DEgk4uAgZrxBTVEN3ZuhL7W-tBfXOyn2X4e9rM1Gfhc,39748
  bioguider/generation/llm_injector.py,sha256=bVxP6Asv2em4MBOB5yFsS14AuaeT7NLKQQMcsEqXjPY,17352
  bioguider/generation/models.py,sha256=MlJOLjPHk8xs-UGW-TGN_M9cevTuxTG4tjm1d1L15go,2699
  bioguider/generation/output_manager.py,sha256=uwLyavND4kXOHlsXB0Berab3y8u6bhaEmQOQLl7wDAM,1963
@@ -56,7 +56,7 @@ bioguider/generation/style_analyzer.py,sha256=Vn9FAK1qJBNLolLC1tz362k4UBaPl107Bl
  bioguider/generation/suggestion_extractor.py,sha256=kkPOYE6FXRtYlogV0GQdBraZZJm08I6Oux5YKGUF1UU,29442
  bioguider/generation/test_metrics.py,sha256=ACXmSZc2L_UkkmC5h2s4tG44MXW1d-hClFwPCD5_BFI,7505
  bioguider/managers/evaluation_manager.py,sha256=7WlshfnqFkk34dDlf50qAINK5sFTaoCFE0f0vGYyRdc,5789
- bioguider/managers/generation_manager.py,sha256=GqjCci2eWHnIcJ-SOn5-hEMi8p3Jk4Q5E4KSObKELPs,31945
+ bioguider/managers/generation_manager.py,sha256=m6hGu9_1HcL1YS0PMoFfXfVgDqsps1ahcj6L9E4jtoo,38636
  bioguider/managers/generation_test_manager.py,sha256=3mOBzQVpsLo_LpSspJcofn3CNtvgagS1DMr9Zuwkzq4,5307
  bioguider/rag/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  bioguider/rag/config.py,sha256=5g4IqTzgyfZfax9Af9CTkXShgItPOt4_9TEMSekCPik,4602
@@ -74,7 +74,7 @@ bioguider/utils/pyphen_utils.py,sha256=cdZc3qphkvMDeL5NiZ8Xou13M_uVNP7ifJ-FwxO-0
  bioguider/utils/python_file_handler.py,sha256=BERiE2RHxpu3gAzv26jr8ZQetkrtnMZOv9SjpQ7WIdg,2650
  bioguider/utils/r_file_handler.py,sha256=y57Y04wjgtFWve0lPg1EOrNNOccPfnNF0z2WnlFMX74,19616
  bioguider/utils/utils.py,sha256=h8OhCjzLpHkb3ndnjRBUOBHD7csbHdEVNXf75SRN8Zc,4413
- bioguider-0.2.32.dist-info/LICENSE,sha256=qzkvZcKwwA5DuSuhXMOm2LcO6BdEr4V7jwFZVL2-jL4,1065
- bioguider-0.2.32.dist-info/METADATA,sha256=OTumO8NRGxVO5JZWKXb7ztoM_S7zh8bAuij2TUrABuA,1962
- bioguider-0.2.32.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
- bioguider-0.2.32.dist-info/RECORD,,
+ bioguider-0.2.33.dist-info/LICENSE,sha256=qzkvZcKwwA5DuSuhXMOm2LcO6BdEr4V7jwFZVL2-jL4,1065
+ bioguider-0.2.33.dist-info/METADATA,sha256=Yyqyvrm_CLNHy1fgDluqbCsePUqA_mYwkzRupFTHbRU,1962
+ bioguider-0.2.33.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+ bioguider-0.2.33.dist-info/RECORD,,