empathy-framework 5.1.1-py3-none-any.whl → 5.2.1-py3-none-any.whl

This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between these versions as they appear in that registry.
Files changed (71)
  1. {empathy_framework-5.1.1.dist-info → empathy_framework-5.2.1.dist-info}/METADATA +52 -3
  2. {empathy_framework-5.1.1.dist-info → empathy_framework-5.2.1.dist-info}/RECORD +69 -28
  3. empathy_os/cli_router.py +9 -0
  4. empathy_os/core_modules/__init__.py +15 -0
  5. empathy_os/mcp/__init__.py +10 -0
  6. empathy_os/mcp/server.py +506 -0
  7. empathy_os/memory/control_panel.py +1 -131
  8. empathy_os/memory/control_panel_support.py +145 -0
  9. empathy_os/memory/encryption.py +159 -0
  10. empathy_os/memory/long_term.py +41 -626
  11. empathy_os/memory/long_term_types.py +99 -0
  12. empathy_os/memory/mixins/__init__.py +25 -0
  13. empathy_os/memory/mixins/backend_init_mixin.py +244 -0
  14. empathy_os/memory/mixins/capabilities_mixin.py +199 -0
  15. empathy_os/memory/mixins/handoff_mixin.py +208 -0
  16. empathy_os/memory/mixins/lifecycle_mixin.py +49 -0
  17. empathy_os/memory/mixins/long_term_mixin.py +352 -0
  18. empathy_os/memory/mixins/promotion_mixin.py +109 -0
  19. empathy_os/memory/mixins/short_term_mixin.py +182 -0
  20. empathy_os/memory/short_term.py +7 -0
  21. empathy_os/memory/simple_storage.py +302 -0
  22. empathy_os/memory/storage_backend.py +167 -0
  23. empathy_os/memory/unified.py +21 -1120
  24. empathy_os/meta_workflows/cli_commands/__init__.py +56 -0
  25. empathy_os/meta_workflows/cli_commands/agent_commands.py +321 -0
  26. empathy_os/meta_workflows/cli_commands/analytics_commands.py +442 -0
  27. empathy_os/meta_workflows/cli_commands/config_commands.py +232 -0
  28. empathy_os/meta_workflows/cli_commands/memory_commands.py +182 -0
  29. empathy_os/meta_workflows/cli_commands/template_commands.py +354 -0
  30. empathy_os/meta_workflows/cli_commands/workflow_commands.py +382 -0
  31. empathy_os/meta_workflows/cli_meta_workflows.py +52 -1802
  32. empathy_os/models/telemetry/__init__.py +71 -0
  33. empathy_os/models/telemetry/analytics.py +594 -0
  34. empathy_os/models/telemetry/backend.py +196 -0
  35. empathy_os/models/telemetry/data_models.py +431 -0
  36. empathy_os/models/telemetry/storage.py +489 -0
  37. empathy_os/orchestration/__init__.py +35 -0
  38. empathy_os/orchestration/execution_strategies.py +481 -0
  39. empathy_os/orchestration/meta_orchestrator.py +488 -1
  40. empathy_os/routing/workflow_registry.py +36 -0
  41. empathy_os/telemetry/cli.py +19 -724
  42. empathy_os/telemetry/commands/__init__.py +14 -0
  43. empathy_os/telemetry/commands/dashboard_commands.py +696 -0
  44. empathy_os/tools.py +183 -0
  45. empathy_os/workflows/__init__.py +5 -0
  46. empathy_os/workflows/autonomous_test_gen.py +860 -161
  47. empathy_os/workflows/base.py +6 -2
  48. empathy_os/workflows/code_review.py +4 -1
  49. empathy_os/workflows/document_gen/__init__.py +25 -0
  50. empathy_os/workflows/document_gen/config.py +30 -0
  51. empathy_os/workflows/document_gen/report_formatter.py +162 -0
  52. empathy_os/workflows/document_gen/workflow.py +1426 -0
  53. empathy_os/workflows/document_gen.py +22 -1598
  54. empathy_os/workflows/security_audit.py +2 -2
  55. empathy_os/workflows/security_audit_phase3.py +7 -4
  56. empathy_os/workflows/seo_optimization.py +633 -0
  57. empathy_os/workflows/test_gen/__init__.py +52 -0
  58. empathy_os/workflows/test_gen/ast_analyzer.py +249 -0
  59. empathy_os/workflows/test_gen/config.py +88 -0
  60. empathy_os/workflows/test_gen/data_models.py +38 -0
  61. empathy_os/workflows/test_gen/report_formatter.py +289 -0
  62. empathy_os/workflows/test_gen/test_templates.py +381 -0
  63. empathy_os/workflows/test_gen/workflow.py +655 -0
  64. empathy_os/workflows/test_gen.py +42 -1905
  65. empathy_os/memory/types 2.py +0 -441
  66. empathy_os/models/telemetry.py +0 -1660
  67. {empathy_framework-5.1.1.dist-info → empathy_framework-5.2.1.dist-info}/WHEEL +0 -0
  68. {empathy_framework-5.1.1.dist-info → empathy_framework-5.2.1.dist-info}/entry_points.txt +0 -0
  69. {empathy_framework-5.1.1.dist-info → empathy_framework-5.2.1.dist-info}/licenses/LICENSE +0 -0
  70. {empathy_framework-5.1.1.dist-info → empathy_framework-5.2.1.dist-info}/licenses/LICENSE_CHANGE_ANNOUNCEMENT.md +0 -0
  71. {empathy_framework-5.1.1.dist-info → empathy_framework-5.2.1.dist-info}/top_level.txt +0 -0
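Entries 49-53 and 57-64 above follow a single refactoring pattern: a monolithic module (document_gen.py, test_gen.py) is split into a package, and the original module path is kept as a thin shim that re-exports the public surface, so 5.1.1 import paths keep working (the telemetry and memory changes follow the same shape). A minimal sketch of the package side of that pattern (the submodule layout is inferred from the file names above, so which symbol lives in which submodule is an assumption, not taken from the package):

    # empathy_os/workflows/document_gen/__init__.py (hypothetical sketch)
    # Re-export the public surface so that
    # `from empathy_os.workflows.document_gen import DocumentGenerationWorkflow`
    # resolves exactly as it did when document_gen was a single module.
    from .config import DOC_GEN_STEPS, TOKEN_COSTS  # assumed home: config.py
    from .report_formatter import format_doc_gen_report
    from .workflow import DocumentGenerationWorkflow

    __all__ = [
        "DocumentGenerationWorkflow",
        "DOC_GEN_STEPS",
        "TOKEN_COSTS",
        "format_doc_gen_report",
    ]

The hunk below shows the other half of the pattern: empathy_os/workflows/document_gen.py shrinking from a 1,605-line implementation to a 29-line re-export shim. A usage sketch of the preserved surface follows the diff.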
empathy_os/workflows/document_gen.py
@@ -1,1605 +1,29 @@
1
- """Document Generation Workflow
1
+ """Document Generation Workflow (Backward Compatible Entry Point).
2
2
 
3
- A cost-optimized, enterprise-safe documentation pipeline:
4
- 1. Haiku: Generate outline from code/specs (cheap, fast)
5
- 2. Sonnet: Write each section (capable, chunked for large projects)
6
- 3. Opus: Final review + consistency polish (premium, chunked if needed)
3
+ This module maintains backward compatibility by re-exporting all public APIs
4
+ from the document_gen package.
7
5
 
8
- Enterprise Features:
9
- - Auto-scaling tokens based on project complexity
10
- - Chunked polish for large documents
11
- - Cost guardrails with configurable max_cost
12
- - Graceful degradation with partial results on errors
6
+ For new code, import from the package directly:
7
+ from empathy_os.workflows.document_gen import DocumentGenerationWorkflow
13
8
 
14
9
  Copyright 2025 Smart-AI-Memory
15
10
  Licensed under Fair Source License 0.9
16
11
  """
17
12
 
18
- import logging
19
- from datetime import datetime
20
- from pathlib import Path
21
- from typing import Any
22
-
23
- from empathy_os.config import _validate_file_path
24
-
25
- from .base import BaseWorkflow, ModelTier
26
- from .step_config import WorkflowStepConfig
27
-
28
- logger = logging.getLogger(__name__)
29
-
30
- # Approximate cost per 1K tokens (USD) - used for cost estimation
31
- # These are estimates and should be updated as pricing changes
32
- TOKEN_COSTS = {
33
- ModelTier.CHEAP: {"input": 0.00025, "output": 0.00125}, # Haiku
34
- ModelTier.CAPABLE: {"input": 0.003, "output": 0.015}, # Sonnet
35
- ModelTier.PREMIUM: {"input": 0.015, "output": 0.075}, # Opus
36
- }
37
-
38
- # Define step configurations for executor-based execution
39
- # Note: max_tokens for polish is dynamically set based on input size
40
- DOC_GEN_STEPS = {
41
- "polish": WorkflowStepConfig(
42
- name="polish",
43
- task_type="final_review", # Premium tier task
44
- tier_hint="premium",
45
- description="Polish and improve documentation for consistency and quality",
46
- max_tokens=20000, # Increased to handle large chunked documents
47
- ),
48
- }
49
-
50
-
51
- class DocumentGenerationWorkflow(BaseWorkflow):
52
- """Multi-tier document generation workflow.
53
-
54
- Uses cheap models for outlining, capable models for content
55
- generation, and premium models for final polish and consistency
56
- review.
57
-
58
- Usage:
59
- workflow = DocumentGenerationWorkflow()
60
- result = await workflow.execute(
61
- source_code="...",
62
- doc_type="api_reference",
63
- audience="developers"
64
- )
65
- """
66
-
67
- name = "doc-gen"
68
- description = "Cost-optimized documentation generation pipeline"
69
- stages = ["outline", "write", "polish"]
70
- tier_map = {
71
- "outline": ModelTier.CHEAP,
72
- "write": ModelTier.CAPABLE,
73
- "polish": ModelTier.PREMIUM,
74
- }
75
-
76
- def __init__(
77
- self,
78
- skip_polish_threshold: int = 1000,
79
- max_sections: int = 10,
80
- max_write_tokens: int | None = None, # Auto-scaled if None
81
- section_focus: list[str] | None = None,
82
- chunked_generation: bool = True,
83
- sections_per_chunk: int = 3,
84
- max_cost: float = 5.0, # Cost guardrail in USD
85
- cost_warning_threshold: float = 0.8, # Warn at 80% of max_cost
86
- graceful_degradation: bool = True, # Return partial results on error
87
- export_path: str | Path | None = None, # Export docs to file (e.g., "docs/generated")
88
- max_display_chars: int = 45000, # Max chars before chunking output
89
- enable_auth_strategy: bool = True, # Enable intelligent auth routing
90
- **kwargs: Any,
91
- ):
92
- """Initialize workflow with enterprise-safe defaults.
93
-
94
- Args:
95
- skip_polish_threshold: Skip premium polish for docs under this
96
- token count (they're already good enough).
97
- max_sections: Maximum number of sections to generate.
98
- max_write_tokens: Maximum tokens for content generation.
99
- If None, auto-scales based on section count (recommended).
100
- section_focus: Optional list of specific sections to generate
101
- (e.g., ["Testing Guide", "API Reference"]).
102
- chunked_generation: If True, generates large docs in chunks to avoid
103
- truncation (default True).
104
- sections_per_chunk: Number of sections to generate per chunk (default 3).
105
- max_cost: Maximum cost in USD before stopping (default $5).
106
- Set to 0 to disable cost limits.
107
- cost_warning_threshold: Percentage of max_cost to trigger warning (default 0.8).
108
- graceful_degradation: If True, return partial results on errors
109
- instead of failing completely (default True).
110
- export_path: Optional directory to export generated docs (e.g., "docs/generated").
111
- If provided, documentation will be saved to a file automatically.
112
- max_display_chars: Maximum characters before splitting output into chunks
113
- for display (default 45000). Helps avoid terminal/UI truncation.
114
- enable_auth_strategy: If True, use intelligent subscription vs API routing
115
- based on module size (default True).
116
-
117
- """
118
- super().__init__(**kwargs)
119
- self.skip_polish_threshold = skip_polish_threshold
120
- self.max_sections = max_sections
121
- self._user_max_write_tokens = max_write_tokens # Store user preference
122
- self.max_write_tokens = max_write_tokens or 16000 # Will be auto-scaled
123
- self.section_focus = section_focus
124
- self.chunked_generation = chunked_generation
125
- self.sections_per_chunk = sections_per_chunk
126
- self.max_cost = max_cost
127
- self.cost_warning_threshold = cost_warning_threshold
128
- self.graceful_degradation = graceful_degradation
129
- self.export_path = Path(export_path) if export_path else None
130
- self.max_display_chars = max_display_chars
131
- self.enable_auth_strategy = enable_auth_strategy
132
- self._total_content_tokens: int = 0
133
- self._accumulated_cost: float = 0.0
134
- self._cost_warning_issued: bool = False
135
- self._partial_results: dict = {}
136
- self._auth_mode_used: str | None = None # Track which auth was recommended
137
-
138
- def _estimate_cost(self, tier: ModelTier, input_tokens: int, output_tokens: int) -> float:
139
- """Estimate cost for a given tier and token counts."""
140
- costs = TOKEN_COSTS.get(tier, TOKEN_COSTS[ModelTier.CAPABLE])
141
- input_cost = (input_tokens / 1000) * costs["input"]
142
- output_cost = (output_tokens / 1000) * costs["output"]
143
- return input_cost + output_cost
144
-
145
- def _track_cost(
146
- self,
147
- tier: ModelTier,
148
- input_tokens: int,
149
- output_tokens: int,
150
- ) -> tuple[float, bool]:
151
- """Track accumulated cost and check against limits.
152
-
153
- Returns:
154
- Tuple of (cost_for_this_call, should_stop)
155
-
156
- """
157
- cost = self._estimate_cost(tier, input_tokens, output_tokens)
158
- self._accumulated_cost += cost
159
-
160
- # Check warning threshold
161
- if (
162
- self.max_cost > 0
163
- and not self._cost_warning_issued
164
- and self._accumulated_cost >= self.max_cost * self.cost_warning_threshold
165
- ):
166
- self._cost_warning_issued = True
167
- logger.warning(
168
- f"Doc-gen cost approaching limit: ${self._accumulated_cost:.2f} "
169
- f"of ${self.max_cost:.2f} ({self.cost_warning_threshold * 100:.0f}% threshold)",
170
- )
171
-
172
- # Check if we should stop
173
- should_stop = self.max_cost > 0 and self._accumulated_cost >= self.max_cost
174
- if should_stop:
175
- logger.warning(
176
- f"Doc-gen cost limit reached: ${self._accumulated_cost:.2f} >= ${self.max_cost:.2f}",
177
- )
178
-
179
- return cost, should_stop
180
-
181
- def _auto_scale_tokens(self, section_count: int) -> int:
182
- """Auto-scale max_write_tokens based on section count.
183
-
184
- Enterprise projects may have 20+ sections requiring more tokens.
185
- """
186
- if self._user_max_write_tokens is not None:
187
- return self._user_max_write_tokens # User override
188
-
189
- # Base: 2000 tokens per section, minimum 16000, maximum 64000
190
- scaled = max(16000, min(64000, section_count * 2000))
191
- logger.info(f"Auto-scaled max_write_tokens to {scaled} for {section_count} sections")
192
- return scaled
193
-
194
- def _export_document(
195
- self,
196
- document: str,
197
- doc_type: str,
198
- report: str | None = None,
199
- ) -> tuple[Path | None, Path | None]:
200
- """Export generated documentation to file.
201
-
202
- Args:
203
- document: The generated documentation content
204
- doc_type: Document type for naming
205
- report: Optional report to save alongside document
206
-
207
- Returns:
208
- Tuple of (doc_path, report_path) or (None, None) if export disabled
209
-
210
- """
211
- if not self.export_path:
212
- return None, None
213
-
214
- # Create export directory
215
- self.export_path.mkdir(parents=True, exist_ok=True)
216
-
217
- # Generate filename with timestamp
218
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
219
- safe_doc_type = doc_type.replace(" ", "_").replace("/", "-").lower()
220
- doc_filename = f"{safe_doc_type}_{timestamp}.md"
221
- report_filename = f"{safe_doc_type}_{timestamp}_report.txt"
222
-
223
- doc_path = self.export_path / doc_filename
224
- report_path = self.export_path / report_filename if report else None
225
-
226
- # Write document
227
- try:
228
- validated_doc_path = _validate_file_path(str(doc_path))
229
- validated_doc_path.write_text(document, encoding="utf-8")
230
- logger.info(f"Documentation exported to: {validated_doc_path}")
231
-
232
- # Write report if provided
233
- if report and report_path:
234
- validated_report_path = _validate_file_path(str(report_path))
235
- validated_report_path.write_text(report, encoding="utf-8")
236
- logger.info(f"Report exported to: {validated_report_path}")
237
-
238
- return validated_doc_path, validated_report_path if report else None
239
- except (OSError, ValueError) as e:
240
- logger.error(f"Failed to export documentation: {e}")
241
- return None, None
242
-
243
- def _chunk_output_for_display(self, content: str, chunk_prefix: str = "PART") -> list[str]:
244
- """Split large output into displayable chunks.
245
-
246
- Args:
247
- content: The content to chunk
248
- chunk_prefix: Prefix for chunk headers
249
-
250
- Returns:
251
- List of content chunks, each under max_display_chars
252
-
253
- """
254
- if len(content) <= self.max_display_chars:
255
- return [content]
256
-
257
- chunks = []
258
- # Try to split on section boundaries (## headers)
259
- import re
260
-
261
- sections = re.split(r"(?=^## )", content, flags=re.MULTILINE)
262
-
263
- current_chunk = ""
264
- chunk_num = 1
265
-
266
- for section in sections:
267
- # If adding this section would exceed limit, save current chunk
268
- if current_chunk and len(current_chunk) + len(section) > self.max_display_chars:
269
- chunks.append(
270
- f"{'=' * 60}\n{chunk_prefix} {chunk_num} of {{total}}\n{'=' * 60}\n\n"
271
- + current_chunk,
272
- )
273
- chunk_num += 1
274
- current_chunk = section
275
- else:
276
- current_chunk += section
277
-
278
- # Add final chunk
279
- if current_chunk:
280
- chunks.append(
281
- f"{'=' * 60}\n{chunk_prefix} {chunk_num} of {{total}}\n{'=' * 60}\n\n"
282
- + current_chunk,
283
- )
284
-
285
- # Update total count in all chunks
286
- total = len(chunks)
287
- chunks = [chunk.format(total=total) for chunk in chunks]
288
-
289
- return chunks
290
-
291
- def should_skip_stage(self, stage_name: str, input_data: Any) -> tuple[bool, str | None]:
292
- """Skip polish for short documents."""
293
- if stage_name == "polish":
294
- if self._total_content_tokens < self.skip_polish_threshold:
295
- self.tier_map["polish"] = ModelTier.CAPABLE
296
- return False, None
297
- return False, None
298
-
299
- async def run_stage(
300
- self,
301
- stage_name: str,
302
- tier: ModelTier,
303
- input_data: Any,
304
- ) -> tuple[Any, int, int]:
305
- """Execute a document generation stage."""
306
- if stage_name == "outline":
307
- return await self._outline(input_data, tier)
308
- if stage_name == "write":
309
- return await self._write(input_data, tier)
310
- if stage_name == "polish":
311
- return await self._polish(input_data, tier)
312
- raise ValueError(f"Unknown stage: {stage_name}")
313
-
314
- async def _outline(self, input_data: dict, tier: ModelTier) -> tuple[dict, int, int]:
315
- """Generate document outline from source."""
316
- from pathlib import Path
317
-
318
- source_code = input_data.get("source_code", "")
319
- target = input_data.get("target", "")
320
- doc_type = input_data.get("doc_type", "general")
321
- audience = input_data.get("audience", "developers")
322
-
323
- # Use target if source_code not provided
324
- content_to_document = source_code or target
325
-
326
- # If target looks like a file path and source_code wasn't provided, read the file
327
- if not source_code and target:
328
- target_path = Path(target)
329
- if target_path.exists() and target_path.is_file():
330
- try:
331
- content_to_document = target_path.read_text(encoding="utf-8")
332
- # Prepend file info for context
333
- content_to_document = f"# File: {target}\n\n{content_to_document}"
334
- except Exception as e:
335
- # If we can't read the file, log and use the path as-is
336
- import logging
337
-
338
- logging.getLogger(__name__).warning(f"Could not read file {target}: {e}")
339
- elif target_path.suffix in (
340
- ".py",
341
- ".js",
342
- ".ts",
343
- ".tsx",
344
- ".java",
345
- ".go",
346
- ".rs",
347
- ".md",
348
- ".txt",
349
- ):
350
- # Looks like a file path but doesn't exist - warn
351
- import logging
352
-
353
- logging.getLogger(__name__).warning(
354
- f"Target appears to be a file path but doesn't exist: {target}",
355
- )
356
-
357
- # === AUTH STRATEGY INTEGRATION ===
358
- # Detect module size and recommend auth mode (first stage only)
359
- if self.enable_auth_strategy:
360
- try:
361
- from empathy_os.models import (
362
- count_lines_of_code,
363
- get_auth_strategy,
364
- get_module_size_category,
365
- )
366
-
367
- # Calculate module size
368
- module_lines = 0
369
- if target and Path(target).exists():
370
- module_lines = count_lines_of_code(target)
371
- elif content_to_document:
372
- # Count from source code content
373
- module_lines = len(
374
- [
375
- line
376
- for line in content_to_document.split("\n")
377
- if line.strip() and not line.strip().startswith("#")
378
- ]
379
- )
380
-
381
- if module_lines > 0:
382
- # Get auth strategy (first-time setup if needed)
383
- strategy = get_auth_strategy()
384
-
385
- # Get recommended auth mode
386
- recommended_mode = strategy.get_recommended_mode(module_lines)
387
- self._auth_mode_used = recommended_mode.value
388
-
389
- # Get size category
390
- size_category = get_module_size_category(module_lines)
391
-
392
- # Log recommendation
393
- logger.info(
394
- f"Module: {target or 'source'} ({module_lines} LOC, {size_category})"
395
- )
396
- logger.info(f"Recommended auth mode: {recommended_mode.value}")
397
-
398
- # Get cost estimate
399
- cost_estimate = strategy.estimate_cost(module_lines, recommended_mode)
400
-
401
- if recommended_mode.value == "subscription":
402
- logger.info(
403
- f"Cost: {cost_estimate['quota_cost']} "
404
- f"(fits in {cost_estimate['fits_in_context']} context)"
405
- )
406
- else: # API
407
- logger.info(
408
- f"Cost: ~${cost_estimate['monetary_cost']:.4f} "
409
- f"(1M context window)"
410
- )
411
-
412
- except Exception as e:
413
- # Don't fail workflow if auth strategy fails
414
- logger.warning(f"Auth strategy detection failed: {e}")
415
-
416
- system = """You are an expert technical writer specializing in API Reference documentation.
417
-
418
- IMPORTANT: This is API REFERENCE documentation, not a tutorial. Focus on documenting EVERY function/class with structured Args/Returns/Raises format.
419
-
420
- Create a detailed, structured outline for API Reference documentation:
421
-
422
- 1. **Logical Section Structure** (emphasize API reference sections):
423
- - Overview/Introduction (brief)
424
- - Quick Start (1 complete example)
425
- - API Reference - Functions (one subsection per function with Args/Returns/Raises)
426
- - API Reference - Classes (one subsection per class with Args/Returns/Raises for methods)
427
- - Usage Examples (showing how to combine multiple functions)
428
- - Additional reference sections as needed
429
-
430
- 2. **For Each Section**:
431
- - Clear purpose and what readers will learn
432
- - Specific topics to cover
433
- - Types of examples to include (with actual code)
434
-
435
- 3. **Key Requirements**:
436
- - Include sections for real, copy-paste ready code examples
437
- - Plan for comprehensive API documentation with all parameters
438
- - Include edge cases and error handling examples
439
- - Add best practices and common patterns
440
-
441
- Format as a numbered list with section titles and detailed descriptions."""
442
-
443
- user_message = f"""Create a comprehensive documentation outline:
444
-
445
- Document Type: {doc_type}
446
- Target Audience: {audience}
447
-
448
- IMPORTANT: This documentation should be production-ready with:
449
- - Real, executable code examples (not placeholders)
450
- - Complete API reference with parameter types and descriptions
451
- - Usage guides showing common patterns
452
- - Edge case handling and error scenarios
453
- - Best practices for the target audience
454
-
455
- Content to document:
456
- {content_to_document[:4000]}
457
-
458
- Generate an outline that covers all these aspects comprehensively."""
459
-
460
- response, input_tokens, output_tokens = await self._call_llm(
461
- tier,
462
- system,
463
- user_message,
464
- max_tokens=1000,
465
- )
466
-
467
- return (
468
- {
469
- "outline": response,
470
- "doc_type": doc_type,
471
- "audience": audience,
472
- "content_to_document": content_to_document,
473
- },
474
- input_tokens,
475
- output_tokens,
476
- )
477
-
478
- def _parse_outline_sections(self, outline: str) -> list[str]:
479
- """Parse top-level section titles from the outline.
480
-
481
- Only matches main sections like "1. Introduction", "2. Setup", etc.
482
- Ignores sub-sections like "2.1 Prerequisites" or nested items.
483
- """
484
- import re
485
-
486
- sections = []
487
- # Match only top-level sections: digit followed by period and space/letter
488
- # e.g., "1. Introduction" but NOT "1.1 Sub-section" or "2.1.3 Deep"
489
- top_level_pattern = re.compile(r"^(\d+)\.\s+([A-Za-z].*)")
490
-
491
- for line in outline.split("\n"):
492
- stripped = line.strip()
493
- match = top_level_pattern.match(stripped)
494
- if match:
495
- # section_num = match.group(1) - not needed, only extracting title
496
- title = match.group(2).strip()
497
- # Remove any trailing description after " - "
498
- if " - " in title:
499
- title = title.split(" - ")[0].strip()
500
- sections.append(title)
501
-
502
- return sections
503
-
504
- async def _write(self, input_data: dict, tier: ModelTier) -> tuple[dict, int, int]:
505
- """Write content based on the outline."""
506
- outline = input_data.get("outline", "")
507
- doc_type = input_data.get("doc_type", "general")
508
- audience = input_data.get("audience", "developers")
509
- content_to_document = input_data.get("content_to_document", "")
510
-
511
- # Parse sections from outline
512
- sections = self._parse_outline_sections(outline)
513
-
514
- # Auto-scale tokens based on section count
515
- self.max_write_tokens = self._auto_scale_tokens(len(sections))
516
-
517
- # Use chunked generation for large outlines (more than sections_per_chunk * 2)
518
- use_chunking = (
519
- self.chunked_generation
520
- and len(sections) > self.sections_per_chunk * 2
521
- and not self.section_focus # Don't chunk if already focused
522
- )
523
-
524
- if use_chunking:
525
- return await self._write_chunked(
526
- sections,
527
- outline,
528
- doc_type,
529
- audience,
530
- content_to_document,
531
- tier,
532
- )
533
-
534
- # Handle section_focus for targeted generation
535
- section_instruction = ""
536
- if self.section_focus:
537
- sections_list = ", ".join(self.section_focus)
538
- section_instruction = f"""
539
- IMPORTANT: Focus ONLY on generating these specific sections:
540
- {sections_list}
541
-
542
- Generate comprehensive, detailed content for each of these sections."""
543
-
544
- system = f"""You are an expert technical writer creating comprehensive developer documentation.
545
-
546
- YOUR TASK HAS TWO CRITICAL PHASES - YOU MUST COMPLETE BOTH:
547
-
548
- ═══════════════════════════════════════════════════════════════
549
- PHASE 1: Write Comprehensive Documentation
550
- ═══════════════════════════════════════════════════════════════
551
-
552
- Write clear, helpful documentation with:
553
- - Overview and introduction explaining what this code does
554
- - Real, executable code examples (NOT placeholders - use actual code from source)
555
- - Usage guides showing how to use the code in real scenarios
556
- - Best practices and common patterns
557
- - Step-by-step instructions where helpful
558
- - Tables, diagrams, and visual aids as appropriate
559
- - Clear explanations appropriate for {audience}
560
-
561
- Do this naturally - write the kind of documentation that helps developers understand and use the code effectively.
562
-
563
- ═══════════════════════════════════════════════════════════════
564
- PHASE 2: Add Structured API Reference Sections (MANDATORY)
565
- ═══════════════════════════════════════════════════════════════
566
-
567
- After writing the comprehensive documentation above, you MUST add structured API reference sections for EVERY function and class method.
568
-
569
- For EACH function/method in the source code, add this EXACT structure:
570
-
571
- ---
572
- ### `function_name()`
573
-
574
- **Function Signature:**
575
- ```python
576
- def function_name(param1: type, param2: type = default) -> return_type
577
- ```
578
-
579
- **Description:**
580
- [Brief description of what the function does - 1-2 sentences]
581
-
582
- **Args:**
583
- - `param1` (`type`): Clear description of this parameter
584
- - `param2` (`type`, optional): Description. Defaults to `default`.
585
-
586
- **Returns:**
587
- - `return_type`: Description of the return value
588
-
589
- **Raises:**
590
- - `ExceptionType`: Description of when and why this exception occurs
591
- - `AnotherException`: Another exception case
592
-
593
- **Example:**
594
- ```python
595
- from module import function_name
596
-
597
- # Show real usage with actual code
598
- result = function_name(actual_value, param2=123)
599
- print(result)
600
- ```
601
- ---
602
-
603
- CRITICAL RULES FOR PHASE 2:
604
- - Include **Args:** header for ALL functions (write "None" if no parameters)
605
- - Include **Returns:** header for ALL functions (write "None" if void/no return)
606
- - Include **Raises:** header for ALL functions (write "None" if no exceptions)
607
- - Use backticks for code: `param_name` (`type`)
608
- - Document EVERY public function and method you see in the source code
609
-
610
- {section_instruction}
611
-
612
- ═══════════════════════════════════════════════════════════════
613
- REMINDER: BOTH PHASES ARE MANDATORY
614
- ═══════════════════════════════════════════════════════════════
615
-
616
- 1. Write comprehensive documentation (Phase 1) - what you do naturally
617
- 2. Add structured API reference sections (Phase 2) - for every function/method
618
-
619
- Do NOT skip Phase 2 after completing Phase 1. Both phases are required for complete documentation."""
620
-
621
- user_message = f"""Write comprehensive, production-ready documentation in TWO PHASES:
622
-
623
- Document Type: {doc_type}
624
- Target Audience: {audience}
625
-
626
- Outline to follow:
627
- {outline}
628
-
629
- Source code to document (extract actual class names, function signatures, parameters):
630
- {content_to_document[:5000]}
631
-
632
- ═══════════════════════════════════════════════════════════════
633
- YOUR TASK:
634
- ═══════════════════════════════════════════════════════════════
635
-
636
- PHASE 1: Write comprehensive documentation
637
- - Use the outline above as your guide
638
- - Include real, executable code examples from the source
639
- - Show usage patterns, best practices, common workflows
640
- - Write clear explanations that help developers understand the code
641
-
642
- PHASE 2: Add structured API reference sections
643
- - For EACH function/method in the source code, add:
644
- - Function signature
645
- - Description
646
- - **Args:** section (every parameter with type and description)
647
- - **Returns:** section (return type and description)
648
- - **Raises:** section (exceptions that can occur)
649
- - Example code snippet
650
-
651
- ═══════════════════════════════════════════════════════════════
652
- IMPORTANT: Complete BOTH phases. Don't stop after Phase 1.
653
- ═══════════════════════════════════════════════════════════════
654
-
655
- Generate the complete documentation now, ensuring both comprehensive content AND structured API reference sections."""
656
-
657
- response, input_tokens, output_tokens = await self._call_llm(
658
- tier,
659
- system,
660
- user_message,
661
- max_tokens=self.max_write_tokens,
662
- )
663
-
664
- self._total_content_tokens = output_tokens
665
-
666
- return (
667
- {
668
- "draft_document": response,
669
- "doc_type": doc_type,
670
- "audience": audience,
671
- "outline": outline,
672
- "chunked": False,
673
- "source_code": content_to_document, # Pass through for API reference generation
674
- },
675
- input_tokens,
676
- output_tokens,
677
- )
678
-
679
- async def _write_chunked(
680
- self,
681
- sections: list[str],
682
- outline: str,
683
- doc_type: str,
684
- audience: str,
685
- content_to_document: str,
686
- tier: ModelTier,
687
- ) -> tuple[dict, int, int]:
688
- """Generate documentation in chunks to avoid truncation.
689
-
690
- Enterprise-safe: includes cost tracking and graceful degradation.
691
- """
692
- all_content: list[str] = []
693
- total_input_tokens: int = 0
694
- total_output_tokens: int = 0
695
- stopped_early: bool = False
696
- error_message: str | None = None
697
-
698
- # Split sections into chunks
699
- chunks = []
700
- for i in range(0, len(sections), self.sections_per_chunk):
701
- chunks.append(sections[i : i + self.sections_per_chunk])
702
-
703
- logger.info(f"Generating documentation in {len(chunks)} chunks")
704
-
705
- for chunk_idx, chunk_sections in enumerate(chunks):
706
- sections_list = ", ".join(chunk_sections)
707
-
708
- # Build context about what came before
709
- previous_context = ""
710
- if chunk_idx > 0 and all_content:
711
- # Include last 500 chars of previous content for continuity
712
- previous_context = f"""
713
- Previous sections already written (for context/continuity):
714
- ...{all_content[-1][-500:]}
715
-
716
- Continue with the next sections, maintaining consistent style and terminology."""
717
-
718
- system = f"""You are an expert technical writer creating comprehensive developer documentation.
719
-
720
- Write ONLY these sections (part {chunk_idx + 1} of {len(chunks)}): {sections_list}
721
-
722
- YOUR TASK FOR THESE SECTIONS (TWO PHASES):
723
-
724
- ═══════════════════════════════════════════════════════════════
725
- PHASE 1: Comprehensive Content
726
- ═══════════════════════════════════════════════════════════════
727
- - Write clear explanations and overviews
728
- - Include real, executable code examples (extract from source)
729
- - Show usage patterns and workflows
730
- - Add best practices and common patterns
731
- - Professional language for {audience}
732
-
733
- ═══════════════════════════════════════════════════════════════
734
- PHASE 2: Structured API Reference
735
- ═══════════════════════════════════════════════════════════════
736
- For EACH function/method in these sections, add:
737
-
738
- ### `function_name()`
739
-
740
- **Function Signature:**
741
- ```python
742
- def function_name(params) -> return_type
743
- ```
744
-
745
- **Description:**
746
- [Brief description]
747
-
748
- **Args:**
749
- - `param` (`type`): Description
750
-
751
- **Returns:**
752
- - `type`: Description
753
-
754
- **Raises:**
755
- - `Exception`: When it occurs
756
-
757
- **Example:**
758
- ```python
759
- # Real usage example
760
- ```
761
-
762
- ═══════════════════════════════════════════════════════════════
763
- Complete BOTH phases for these sections.
764
- ═══════════════════════════════════════════════════════════════"""
765
-
766
- user_message = f"""Write comprehensive documentation for these sections in TWO PHASES:
767
-
768
- Sections to write: {sections_list}
769
-
770
- Document Type: {doc_type}
771
- Target Audience: {audience}
772
-
773
- Source code (extract actual functions/classes from here):
774
- {content_to_document[:3000]}
775
-
776
- Full outline (for context):
777
- {outline}
778
- {previous_context}
779
-
780
- PHASE 1: Write comprehensive content with real code examples
781
- PHASE 2: Add structured API reference sections with **Args:**, **Returns:**, **Raises:**
782
-
783
- Generate complete sections now, ensuring both phases are complete."""
784
-
785
- try:
786
- response, input_tokens, output_tokens = await self._call_llm(
787
- tier,
788
- system,
789
- user_message,
790
- max_tokens=self.max_write_tokens // len(chunks) + 2000,
791
- )
792
-
793
- # Track cost and check limits
794
- _, should_stop = self._track_cost(tier, input_tokens, output_tokens)
795
-
796
- all_content.append(response)
797
- total_input_tokens += input_tokens
798
- total_output_tokens += output_tokens
799
-
800
- logger.info(
801
- f"Chunk {chunk_idx + 1}/{len(chunks)} complete: "
802
- f"{len(response)} chars, {output_tokens} tokens, "
803
- f"cost so far: ${self._accumulated_cost:.2f}",
804
- )
805
-
806
- # Check cost limit
807
- if should_stop:
808
- stopped_early = True
809
- remaining = len(chunks) - chunk_idx - 1
810
- error_message = (
811
- f"Cost limit reached (${self._accumulated_cost:.2f}). "
812
- f"Stopped after {chunk_idx + 1}/{len(chunks)} chunks. "
813
- f"{remaining} chunks not generated."
814
- )
815
- logger.warning(error_message)
816
- break
817
-
818
- except Exception as e:
819
- error_message = f"Error generating chunk {chunk_idx + 1}: {e}"
820
- logger.error(error_message)
821
- if not self.graceful_degradation:
822
- raise
823
- stopped_early = True
824
- break
825
-
826
- # Combine all chunks
827
- combined_document = "\n\n".join(all_content)
828
- self._total_content_tokens = total_output_tokens
829
-
830
- # Store partial results for graceful degradation
831
- self._partial_results = {
832
- "draft_document": combined_document,
833
- "sections_completed": len(all_content),
834
- "sections_total": len(chunks),
835
- }
836
-
837
- result = {
838
- "draft_document": combined_document,
839
- "doc_type": doc_type,
840
- "audience": audience,
841
- "outline": outline,
842
- "chunked": True,
843
- "chunk_count": len(chunks),
844
- "chunks_completed": len(all_content),
845
- "stopped_early": stopped_early,
846
- "accumulated_cost": self._accumulated_cost,
847
- "source_code": content_to_document, # Pass through for API reference generation
848
- }
849
-
850
- if error_message:
851
- result["warning"] = error_message
852
-
853
- return (result, total_input_tokens, total_output_tokens)
854
-
855
- async def _polish(self, input_data: dict, tier: ModelTier) -> tuple[dict, int, int]:
856
- """Final review and consistency polish using LLM.
857
-
858
- Enterprise-safe: chunks large documents to avoid truncation.
859
- Supports XML-enhanced prompts when enabled in workflow config.
860
- """
861
- draft_document = input_data.get("draft_document", "")
862
- doc_type = input_data.get("doc_type", "general")
863
- audience = input_data.get("audience", "developers")
864
-
865
- # Check if document is too large and needs chunked polishing
866
- # Rough estimate: 4 chars per token, 10k tokens threshold for chunking
867
- estimated_tokens = len(draft_document) // 4
868
- needs_chunked_polish = estimated_tokens > 10000
869
-
870
- if needs_chunked_polish:
871
- logger.info(
872
- f"Large document detected (~{estimated_tokens} tokens). "
873
- "Using chunked polish for enterprise safety.",
874
- )
875
- return await self._polish_chunked(input_data, tier)
876
-
877
- # Build input payload for prompt
878
- input_payload = f"""Document Type: {doc_type}
879
- Target Audience: {audience}
880
-
881
- Draft:
882
- {draft_document}"""
883
-
884
- # Check if XML prompts are enabled
885
- if self._is_xml_enabled():
886
- # Use XML-enhanced prompt
887
- user_message = self._render_xml_prompt(
888
- role="senior technical editor",
889
- goal="Polish and improve the documentation for consistency and quality",
890
- instructions=[
891
- "Standardize terminology and formatting",
892
- "Improve clarity and flow",
893
- "Add missing cross-references",
894
- "Fix grammatical issues",
895
- "Identify gaps and add helpful notes",
896
- "Ensure examples are complete and accurate",
897
- ],
898
- constraints=[
899
- "Maintain the original structure and intent",
900
- "Keep content appropriate for the target audience",
901
- "Preserve code examples while improving explanations",
902
- ],
903
- input_type="documentation_draft",
904
- input_payload=input_payload,
905
- extra={
906
- "doc_type": doc_type,
907
- "audience": audience,
908
- },
909
- )
910
- system = None # XML prompt includes all context
911
- else:
912
- # Use legacy plain text prompts
913
- system = """You are a senior technical editor specializing in developer documentation.
914
-
915
- Polish and improve this documentation. The writer was asked to complete TWO PHASES:
916
- - Phase 1: Comprehensive content with real examples
917
- - Phase 2: Structured API reference sections with **Args:**, **Returns:**, **Raises:**
918
-
919
- Your job is to verify BOTH phases are complete and polish to production quality.
920
-
921
- ═══════════════════════════════════════════════════════════════
922
- CRITICAL: Verify Phase 2 Completion
923
- ═══════════════════════════════════════════════════════════════
924
-
925
- 1. **Check for Missing API Reference Sections**:
926
- - Scan the entire document for all functions and methods
927
- - EVERY function MUST have these sections:
928
- - **Args:** (write "None" if no parameters)
929
- - **Returns:** (write "None" if void)
930
- - **Raises:** (write "None" if no exceptions)
931
- - If ANY function is missing these sections, ADD them now
932
- - Format: **Args:**, **Returns:**, **Raises:** (bold headers with colons)
933
-
934
- 2. **Polish API Reference Sections**:
935
- - Verify all parameters have types in backticks: `param` (`type`)
936
- - Ensure return values are clearly described
937
- - Check exception documentation is complete
938
- - Validate code examples in each function section
939
-
940
- 3. **Polish General Content**:
941
- - Verify code examples are complete and runnable
942
- - Ensure proper imports and setup code
943
- - Replace any placeholders with real code
944
- - Standardize terminology throughout
945
- - Fix formatting inconsistencies
946
- - Improve clarity and flow
947
- - Add cross-references between sections
948
-
949
- 4. **Production Readiness**:
950
- - Remove any TODO or placeholder comments
951
- - Ensure professional tone
952
- - Add helpful notes, tips, and warnings
953
- - Verify edge cases are covered
954
-
955
- ═══════════════════════════════════════════════════════════════
956
- Return the complete, polished document. Add a brief "## Polish Notes" section at the end summarizing improvements made."""
957
-
958
- user_message = f"""Polish this documentation to production quality.
959
-
960
- The writer was asked to complete TWO PHASES:
961
- 1. Comprehensive content with real examples
962
- 2. Structured API reference with **Args:**, **Returns:**, **Raises:** for every function
963
-
964
- Verify BOTH phases are complete, then polish:
965
-
966
- {input_payload}
967
-
968
- ═══════════════════════════════════════════════════════════════
969
- YOUR TASKS:
970
- ═══════════════════════════════════════════════════════════════
971
-
972
- 1. SCAN for missing API reference sections
973
- - Find every function/method in the document
974
- - Check if it has **Args:**, **Returns:**, **Raises:** sections
975
- - ADD these sections if missing (use "None" if no parameters/returns/exceptions)
976
-
977
- 2. POLISH existing content
978
- - Verify code examples are complete and runnable
979
- - Ensure terminology is consistent
980
- - Fix formatting issues
981
- - Improve clarity and flow
982
-
983
- 3. VALIDATE production readiness
984
- - Remove TODOs and placeholders
985
- - Add warnings and best practices
986
- - Ensure professional tone
987
-
988
- Return the complete, polished documentation with all API reference sections present."""
989
-
990
- # Calculate polish tokens based on draft size (at least as much as write stage)
991
- polish_max_tokens = max(self.max_write_tokens, 20000)
992
-
993
- # Try executor-based execution first (Phase 3 pattern)
994
- if self._executor is not None or self._api_key:
995
- try:
996
- step = DOC_GEN_STEPS["polish"]
997
- # Override step max_tokens with dynamic value
998
- step.max_tokens = polish_max_tokens
999
- response, input_tokens, output_tokens, cost = await self.run_step_with_executor(
1000
- step=step,
1001
- prompt=user_message,
1002
- system=system,
1003
- )
1004
- except Exception:
1005
- # Fall back to legacy _call_llm if executor fails
1006
- response, input_tokens, output_tokens = await self._call_llm(
1007
- tier,
1008
- system or "",
1009
- user_message,
1010
- max_tokens=polish_max_tokens,
1011
- )
1012
- else:
1013
- # Legacy path for backward compatibility
1014
- response, input_tokens, output_tokens = await self._call_llm(
1015
- tier,
1016
- system or "",
1017
- user_message,
1018
- max_tokens=polish_max_tokens,
1019
- )
1020
-
1021
- # Parse XML response if enforcement is enabled
1022
- parsed_data = self._parse_xml_response(response)
1023
-
1024
- # Add structured API reference sections (Step 4: Post-processing)
1025
- source_code = input_data.get("source_code", "")
1026
- if source_code:
1027
- logger.info("Adding structured API reference sections to polished document...")
1028
- response = await self._add_api_reference_sections(
1029
- narrative_doc=response,
1030
- source_code=source_code,
1031
- tier=ModelTier.CHEAP, # Use cheap tier for structured extraction
1032
- )
1033
- else:
1034
- logger.warning("No source code available for API reference generation")
1035
-
1036
- result = {
1037
- "document": response,
1038
- "doc_type": doc_type,
1039
- "audience": audience,
1040
- "model_tier_used": tier.value,
1041
- "accumulated_cost": self._accumulated_cost, # Track total cost
1042
- "auth_mode_used": self._auth_mode_used, # Track recommended auth mode
1043
- }
1044
-
1045
- # Merge parsed XML data if available
1046
- if parsed_data.get("xml_parsed"):
1047
- result.update(
1048
- {
1049
- "xml_parsed": True,
1050
- "summary": parsed_data.get("summary"),
1051
- "findings": parsed_data.get("findings", []),
1052
- "checklist": parsed_data.get("checklist", []),
1053
- },
1054
- )
1055
-
1056
- # Add formatted report for human readability
1057
- result["formatted_report"] = format_doc_gen_report(result, input_data)
1058
-
1059
- # Export documentation if export_path is configured
1060
- doc_path, report_path = self._export_document(
1061
- document=response,
1062
- doc_type=doc_type,
1063
- report=result["formatted_report"],
1064
- )
1065
- if doc_path:
1066
- result["export_path"] = str(doc_path)
1067
- result["report_path"] = str(report_path) if report_path else None
1068
- logger.info(f"Documentation saved to: {doc_path}")
1069
-
1070
- # Chunk output for display if needed
1071
- output_chunks = self._chunk_output_for_display(
1072
- result["formatted_report"],
1073
- chunk_prefix="DOC OUTPUT",
1074
- )
1075
- if len(output_chunks) > 1:
1076
- result["output_chunks"] = output_chunks
1077
- result["output_chunk_count"] = len(output_chunks)
1078
- logger.info(
1079
- f"Report split into {len(output_chunks)} chunks for display "
1080
- f"(total {len(result['formatted_report'])} chars)",
1081
- )
1082
-
1083
- return (result, input_tokens, output_tokens)
1084
-
1085
- async def _polish_chunked(self, input_data: dict, tier: ModelTier) -> tuple[dict, int, int]:
1086
- """Polish large documents in chunks to avoid truncation.
1087
-
1088
- Splits the document by section headers and polishes each chunk separately,
1089
- then combines the results.
1090
- """
1091
- import re
1092
-
1093
- draft_document = input_data.get("draft_document", "")
1094
- doc_type = input_data.get("doc_type", "general")
1095
- audience = input_data.get("audience", "developers")
1096
-
1097
- # Split document by major section headers (## headers)
1098
- sections = re.split(r"(?=^## )", draft_document, flags=re.MULTILINE)
1099
- sections = [s.strip() for s in sections if s.strip()]
1100
-
1101
- if len(sections) <= 1:
1102
- # If we can't split by sections, split by character count
1103
- chunk_size = 15000 # ~3750 tokens per chunk
1104
- sections = [
1105
- draft_document[i : i + chunk_size]
1106
- for i in range(0, len(draft_document), chunk_size)
1107
- ]
1108
-
1109
- logger.info(f"Polishing document in {len(sections)} chunks")
1110
-
1111
- polished_chunks: list[str] = []
1112
- total_input_tokens: int = 0
1113
- total_output_tokens: int = 0
1114
-
1115
- for chunk_idx, section in enumerate(sections):
1116
- system = """You are a senior technical editor specializing in developer documentation.
1117
-
1118
- Polish this section to production quality. The writer was asked to complete TWO PHASES:
1119
- 1. Comprehensive content with real examples
1120
- 2. Structured API reference with **Args:**, **Returns:**, **Raises:** for every function
1121
-
1122
- Verify both phases are complete in this section:
1123
-
1124
- ═══════════════════════════════════════════════════════════════
1125
- CRITICAL: Check for Missing API Reference Format
1126
- ═══════════════════════════════════════════════════════════════
1127
-
1128
- 1. **Scan for functions/methods in this section**
1129
- - If any function is missing **Args:**, **Returns:**, **Raises:** sections, ADD them
1130
- - Format: **Args:**, **Returns:**, **Raises:** (bold headers with colons)
1131
- - Write "None" if no parameters/returns/exceptions
1132
-
1133
- 2. **Polish API Documentation**:
1134
- - Verify parameters documented with types in backticks
1135
- - Ensure return values and exceptions are clear
1136
- - Validate code examples are complete
1137
-
1138
- 3. **Polish General Content**:
1139
- - Ensure all examples are runnable with proper imports
1140
- - Standardize terminology and formatting
1141
- - Fix grammatical issues
1142
- - Remove TODOs and placeholders
1143
-
1144
- Return ONLY the polished section. Do not add commentary about changes."""
1145
-
1146
- user_message = f"""Polish this section to production quality (part {chunk_idx + 1} of {len(sections)}):
1147
-
1148
- Document Type: {doc_type}
1149
- Target Audience: {audience}
1150
-
1151
- Section to polish:
1152
- {section}
1153
-
1154
- Check if all functions have **Args:**, **Returns:**, **Raises:** sections - add if missing.
1155
- Make all code examples complete and executable."""
1156
-
1157
- try:
1158
- response, input_tokens, output_tokens = await self._call_llm(
1159
- tier,
1160
- system,
1161
- user_message,
1162
- max_tokens=8000,
1163
- )
1164
-
1165
- # Track cost
1166
- _, should_stop = self._track_cost(tier, input_tokens, output_tokens)
1167
-
1168
- polished_chunks.append(response)
1169
- total_input_tokens += input_tokens
1170
- total_output_tokens += output_tokens
1171
-
1172
- logger.info(
1173
- f"Polish chunk {chunk_idx + 1}/{len(sections)} complete, "
1174
- f"cost so far: ${self._accumulated_cost:.2f}",
1175
- )
1176
-
1177
- if should_stop:
1178
- logger.warning(
1179
- f"Cost limit reached during polish. "
1180
- f"Returning {len(polished_chunks)}/{len(sections)} polished chunks.",
1181
- )
1182
- # Add remaining sections unpolished
1183
- polished_chunks.extend(sections[chunk_idx + 1 :])
1184
- break
1185
-
1186
- except Exception as e:
1187
- logger.error(f"Error polishing chunk {chunk_idx + 1}: {e}")
1188
- if self.graceful_degradation:
1189
- # Keep original section on error
1190
- polished_chunks.append(section)
1191
- else:
1192
- raise
1193
-
1194
- # Combine polished chunks
1195
- polished_document = "\n\n".join(polished_chunks)
1196
-
1197
- # Add structured API reference sections (Step 4: Post-processing)
1198
- source_code = input_data.get("source_code", "")
1199
- if source_code:
1200
- logger.info("Adding structured API reference sections to chunked polished document...")
1201
- polished_document = await self._add_api_reference_sections(
1202
- narrative_doc=polished_document,
1203
- source_code=source_code,
1204
- tier=ModelTier.CHEAP, # Use cheap tier for structured extraction
1205
- )
1206
- else:
1207
- logger.warning("No source code available for API reference generation")
1208
-
1209
- result = {
1210
- "document": polished_document,
1211
- "doc_type": doc_type,
1212
- "audience": audience,
1213
- "model_tier_used": tier.value,
1214
- "polish_chunked": True,
1215
- "polish_chunks": len(sections),
1216
- "accumulated_cost": self._accumulated_cost,
1217
- }
1218
-
1219
- # Add formatted report
1220
- result["formatted_report"] = format_doc_gen_report(result, input_data)
1221
-
1222
- # Export documentation if export_path is configured
1223
- doc_path, report_path = self._export_document(
1224
- document=polished_document,
1225
- doc_type=doc_type,
1226
- report=result["formatted_report"],
1227
- )
1228
- if doc_path:
1229
- result["export_path"] = str(doc_path)
1230
- result["report_path"] = str(report_path) if report_path else None
1231
- logger.info(f"Documentation saved to: {doc_path}")
1232
-
1233
- # Chunk output for display if needed
1234
- output_chunks = self._chunk_output_for_display(
1235
- result["formatted_report"],
1236
- chunk_prefix="DOC OUTPUT",
1237
- )
1238
- if len(output_chunks) > 1:
1239
- result["output_chunks"] = output_chunks
1240
- result["output_chunk_count"] = len(output_chunks)
1241
- logger.info(
1242
- f"Report split into {len(output_chunks)} chunks for display "
1243
- f"(total {len(result['formatted_report'])} chars)",
1244
- )
1245
-
1246
- return (result, total_input_tokens, total_output_tokens)
1247
-
1248
- def _extract_functions_from_source(self, source_code: str) -> list[dict]:
1249
- """Extract function information from source code using AST.
1250
-
1251
- Args:
1252
- source_code: Python source code to parse
1253
-
1254
- Returns:
1255
- List of dicts with function information (name, args, returns, docstring)
1256
- """
1257
- import ast
1258
-
1259
- functions = []
1260
-
1261
- try:
1262
- tree = ast.parse(source_code)
1263
- except SyntaxError as e:
1264
- logger.warning(f"Failed to parse source code: {e}")
1265
- return functions
1266
-
1267
- for node in ast.walk(tree):
1268
- # Extract top-level functions and class methods
1269
- if isinstance(node, ast.FunctionDef):
1270
- # Skip private functions (starting with _)
1271
- if node.name.startswith("_"):
1272
- continue
1273
-
1274
- # Extract function signature
1275
- args_list = []
1276
- for arg in node.args.args:
1277
- arg_name = arg.arg
1278
- # Get type annotation if available
1279
- arg_type = ast.unparse(arg.annotation) if arg.annotation else "Any"
1280
- args_list.append({"name": arg_name, "type": arg_type})
1281
-
1282
- # Extract return type
1283
- return_type = ast.unparse(node.returns) if node.returns else "Any"
1284
-
1285
- # Extract docstring
1286
- docstring = ast.get_docstring(node) or ""
1287
-
1288
- functions.append({
1289
- "name": node.name,
1290
- "args": args_list,
1291
- "return_type": return_type,
1292
- "docstring": docstring,
1293
- "lineno": node.lineno,
1294
- })
1295
-
1296
- return functions
1297
-
1298
- async def _generate_api_section_for_function(
1299
- self,
1300
- func_info: dict,
1301
- tier: ModelTier,
1302
- ) -> str:
1303
- """Generate structured API reference section for a single function.
1304
-
1305
- This is a focused prompt that ONLY asks for Args/Returns/Raises format,
1306
- not narrative documentation.
1307
-
1308
- Args:
1309
- func_info: Function information from AST extraction
1310
- tier: Model tier to use for generation
1311
-
1312
- Returns:
1313
- Markdown formatted API reference section
1314
- """
1315
- func_name = func_info["name"]
1316
- args_list = func_info["args"]
1317
- return_type = func_info["return_type"]
1318
- docstring = func_info["docstring"]
1319
-
1320
- # Build function signature
1321
- args_str = ", ".join([f"{arg['name']}: {arg['type']}" for arg in args_list])
1322
- signature = f"def {func_name}({args_str}) -> {return_type}"
1323
-
1324
- system = """You are an API documentation generator. Output ONLY structured API reference sections in the EXACT format specified below.
1325
-
1326
- CRITICAL: Do NOT write explanatory text, questions, or narrative. Output ONLY the formatted section.
1327
-
1328
- REQUIRED FORMAT (copy this structure EXACTLY, replace bracketed content):
1329
-
1330
- ### `function_name()`
1331
-
1332
- **Function Signature:**
1333
- ```python
1334
- def function_name(param: type) -> return_type
1335
- ```
1336
-
1337
- **Description:**
1338
- Brief 1-2 sentence description.
1339
-
1340
- **Args:**
1341
- - `param_name` (`type`): Parameter description
1342
-
1343
- **Returns:**
1344
- - `return_type`: Return value description
1345
-
1346
- **Raises:**
1347
- - `ExceptionType`: When this exception occurs
1348
-
1349
- IMPORTANT:
1350
- - Use "**Args:**" (NOT "Parameters" or "params")
1351
- - Write "None" if no Args/Returns/Raises
1352
- - NO conversational text - just the formatted section"""
1353
-
1354
- user_message = f"""Generate API reference section using EXACT format specified in system prompt.
1355
-
1356
- Function:
1357
- ```python
1358
- {signature}
1359
- ```
1360
-
1361
- Docstring:
1362
- {docstring if docstring else "No docstring"}
1363
-
1364
- Output the formatted section EXACTLY as shown in system prompt. Use **Args:** (not Parameters). NO conversational text."""
1365
-
1366
- try:
1367
- response, input_tokens, output_tokens = await self._call_llm(
1368
- tier,
1369
- system,
1370
- user_message,
1371
- max_tokens=1000, # Small response - just the structured section
1372
- )
1373
-
1374
- # Track cost
1375
- self._track_cost(tier, input_tokens, output_tokens)
1376
-
1377
- return response
1378
-
1379
- except Exception as e:
1380
- logger.error(f"Failed to generate API section for {func_name}: {e}")
1381
- # Return minimal fallback
1382
- return f"""### `{func_name}()`
1383
-
1384
- **Function Signature:**
1385
- ```python
1386
- {signature}
1387
- ```
1388
-
1389
- **Description:**
1390
- {docstring.split('.')[0] if docstring else "No description available."}
1391
-
1392
- **Args:**
1393
- None
1394
-
1395
- **Returns:**
1396
- - `{return_type}`: Return value
1397
-
1398
- **Raises:**
1399
- None
1400
- """
1401
-
1402
- async def _add_api_reference_sections(
1403
- self,
1404
- narrative_doc: str,
1405
- source_code: str,
1406
- tier: ModelTier,
1407
- ) -> str:
1408
- """Add structured API reference sections to narrative documentation.
1409
-
1410
- This is Step 4 of the pipeline: after outline, write, and polish,
1411
- we add structured API reference sections extracted from source code.
1412
-
1413
- Args:
1414
- narrative_doc: The polished narrative documentation
1415
- source_code: Original source code to extract functions from
1416
- tier: Model tier to use for API section generation
1417
-
1418
- Returns:
1419
- Complete documentation with API reference appendix
1420
- """
1421
- logger.info("Adding structured API reference sections...")
1422
-
1423
- # Extract functions from source code
1424
- functions = self._extract_functions_from_source(source_code)
1425
-
1426
- if not functions:
1427
- logger.warning("No public functions found in source code")
1428
- return narrative_doc
1429
-
1430
- logger.info(f"Found {len(functions)} public functions to document")
1431
-
1432
- # Generate API section for each function
1433
- api_sections = []
1434
- for func_info in functions:
1435
- func_name = func_info["name"]
1436
- logger.debug(f"Generating API reference for {func_name}()")
1437
-
1438
- api_section = await self._generate_api_section_for_function(
1439
- func_info, tier
1440
- )
1441
- api_sections.append(api_section)
1442
-
1443
- # Append API reference section to narrative doc
1444
- full_doc = narrative_doc
1445
- full_doc += "\n\n---\n\n"
1446
- full_doc += "## API Reference\n\n"
1447
- full_doc += "Complete structured reference for all public functions:\n\n"
1448
- full_doc += "\n\n".join(api_sections)
1449
-
1450
- logger.info(f"Added {len(api_sections)} API reference sections")
1451
-
1452
- return full_doc
1453
-
1454
-
1455
- def format_doc_gen_report(result: dict, input_data: dict) -> str:
1456
- """Format document generation output as a human-readable report.
1457
-
1458
- Args:
1459
- result: The polish stage result
1460
- input_data: Input data from previous stages
1461
-
1462
- Returns:
1463
- Formatted report string
1464
-
1465
- """
1466
- lines = []
1467
-
1468
- # Header
1469
- doc_type = result.get("doc_type", "general").replace("_", " ").title()
1470
- audience = result.get("audience", "developers").title()
1471
-
1472
- lines.append("=" * 60)
1473
- lines.append("DOCUMENTATION GENERATION REPORT")
1474
- lines.append("=" * 60)
1475
- lines.append("")
1476
- lines.append(f"Document Type: {doc_type}")
1477
- lines.append(f"Target Audience: {audience}")
1478
- lines.append("")
1479
-
1480
- # Outline summary
1481
- outline = input_data.get("outline", "")
1482
- if outline:
1483
- lines.append("-" * 60)
1484
- lines.append("DOCUMENT OUTLINE")
1485
- lines.append("-" * 60)
1486
- # Show just a preview of the outline
1487
- outline_lines = outline.split("\n")[:10]
1488
- lines.extend(outline_lines)
1489
- if len(outline.split("\n")) > 10:
1490
- lines.append("...")
1491
- lines.append("")
1492
-
1493
- # Generated document
1494
- document = result.get("document", "")
1495
- if document:
1496
- lines.append("-" * 60)
1497
- lines.append("GENERATED DOCUMENTATION")
1498
- lines.append("-" * 60)
1499
- lines.append("")
1500
- lines.append(document)
1501
- lines.append("")
1502
-
1503
- # Statistics
1504
- word_count = len(document.split()) if document else 0
1505
- section_count = document.count("##") if document else 0 # Count markdown headers
1506
- was_chunked = input_data.get("chunked", False)
1507
- chunk_count = input_data.get("chunk_count", 0)
1508
- chunks_completed = input_data.get("chunks_completed", chunk_count)
1509
- stopped_early = input_data.get("stopped_early", False)
1510
- accumulated_cost = result.get("accumulated_cost", 0)
1511
- polish_chunked = result.get("polish_chunked", False)
1512
-
1513
- lines.append("-" * 60)
1514
- lines.append("STATISTICS")
1515
- lines.append("-" * 60)
1516
- lines.append(f"Word Count: {word_count}")
1517
- lines.append(f"Section Count: ~{section_count}")
1518
- if was_chunked:
1519
- if stopped_early:
1520
- lines.append(
1521
- f"Generation Mode: Chunked ({chunks_completed}/{chunk_count} chunks completed)",
1522
- )
1523
- else:
1524
- lines.append(f"Generation Mode: Chunked ({chunk_count} chunks)")
1525
- if polish_chunked:
1526
- polish_chunks = result.get("polish_chunks", 0)
1527
- lines.append(f"Polish Mode: Chunked ({polish_chunks} sections)")
1528
- if accumulated_cost > 0:
1529
- lines.append(f"Estimated Cost: ${accumulated_cost:.2f}")
1530
- lines.append("")
1531
-
1532
- # Export info
1533
- export_path = result.get("export_path")
1534
- if export_path:
1535
- lines.append("-" * 60)
1536
- lines.append("FILE EXPORT")
1537
- lines.append("-" * 60)
1538
- lines.append(f"Documentation saved to: {export_path}")
1539
- report_path = result.get("report_path")
1540
- if report_path:
1541
- lines.append(f"Report saved to: {report_path}")
1542
- lines.append("")
1543
- lines.append("Full documentation is available in the exported file.")
1544
- lines.append("")
1545
-
1546
- # Warning notice (cost limit, errors, etc.)
1547
- warning = input_data.get("warning") or result.get("warning")
1548
- if warning or stopped_early:
1549
- lines.append("-" * 60)
1550
- lines.append("⚠️ WARNING")
1551
- lines.append("-" * 60)
1552
- if warning:
1553
- lines.append(warning)
1554
- if stopped_early and not warning:
1555
- lines.append("Generation stopped early due to cost or error limits.")
1556
- lines.append("")
1557
-
1558
- # Truncation detection and scope notice
1559
- truncation_indicators = [
1560
- document.rstrip().endswith("..."),
1561
- document.rstrip().endswith("-"),
1562
- "```" in document and document.count("```") % 2 != 0, # Unclosed code block
1563
- any(
1564
- phrase in document.lower()
1565
- for phrase in ["continued in", "see next section", "to be continued"]
1566
- ),
1567
- ]
1568
-
1569
- # Count planned sections from outline (top-level only)
1570
- import re
1571
-
1572
- planned_sections = 0
1573
- top_level_pattern = re.compile(r"^(\d+)\.\s+([A-Za-z].*)")
1574
- if outline:
1575
- for line in outline.split("\n"):
1576
- stripped = line.strip()
1577
- if top_level_pattern.match(stripped):
1578
- planned_sections += 1
1579
-
1580
- is_truncated = any(truncation_indicators) or (
1581
- planned_sections > 0 and section_count < planned_sections - 1
1582
- )
1583
-
1584
- if is_truncated or planned_sections > section_count + 1:
1585
- lines.append("-" * 60)
1586
- lines.append("SCOPE NOTICE")
1587
- lines.append("-" * 60)
1588
- lines.append("⚠️ DOCUMENTATION MAY BE INCOMPLETE")
1589
- if planned_sections > 0:
1590
- lines.append(f" Planned sections: {planned_sections}")
1591
- lines.append(f" Generated sections: {section_count}")
1592
- lines.append("")
1593
- lines.append("To generate missing sections, re-run with section_focus:")
1594
- lines.append(" workflow = DocumentGenerationWorkflow(")
1595
- lines.append(' section_focus=["Testing Guide", "API Reference"]')
1596
- lines.append(" )")
1597
- lines.append("")
1598
-
1599
- # Footer
1600
- lines.append("=" * 60)
1601
- model_tier = result.get("model_tier_used", "unknown")
1602
- lines.append(f"Generated using {model_tier} tier model")
1603
- lines.append("=" * 60)
1604
-
1605
- return "\n".join(lines)
13
+ # Re-export all public APIs from the package for backward compatibility
14
+ from .document_gen import (
15
+ DOC_GEN_STEPS,
16
+ TOKEN_COSTS,
17
+ DocumentGenerationWorkflow,
18
+ format_doc_gen_report,
19
+ )
20
+
21
+ __all__ = [
22
+ # Workflow
23
+ "DocumentGenerationWorkflow",
24
+ # Configuration
25
+ "DOC_GEN_STEPS",
26
+ "TOKEN_COSTS",
27
+ # Report formatter
28
+ "format_doc_gen_report",
29
+ ]
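
For readers checking that 5.1.1 call sites survive this refactor, here is a minimal usage sketch against the re-exported surface. It assumes the re-exports above behave as declared and that the constructor and execute() signatures from the removed 5.1.1 module carry over unchanged into the new package; the call shape is copied from the removed class docstring.

    import asyncio

    # Same import path in 5.1.1 and 5.2.1; in 5.2.1 it resolves through
    # the shim / package re-exports shown in the diff above.
    from empathy_os.workflows.document_gen import DocumentGenerationWorkflow


    async def main() -> None:
        workflow = DocumentGenerationWorkflow(
            max_cost=5.0,                  # cost guardrail in USD (0 disables it)
            export_path="docs/generated",  # optional on-disk export of the doc
        )
        result = await workflow.execute(
            source_code="def add(a: int, b: int) -> int:\n    return a + b\n",
            doc_type="api_reference",
            audience="developers",
        )
        # The polish stage builds a dict with a "document" key; whether
        # execute() returns that dict directly is an assumption here.
        print(str(result)[:200])


    # Worked example of the removed TOKEN_COSTS model: a CAPABLE-tier (Sonnet)
    # call with 10,000 input and 2,000 output tokens is estimated at
    # (10000 / 1000) * 0.003 + (2000 / 1000) * 0.015 = 0.03 + 0.03 = $0.06.

    asyncio.run(main())

Keeping the old module path alive as a re-export shim makes the refactor invisible to importers; only code that reached into private helpers of the old module (names prefixed with an underscore) would need to change.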