source-kb 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. cli/__init__.py +50 -0
  2. cli/__main__.py +5 -0
  3. cli/commands/__init__.py +1 -0
  4. cli/commands/anchor_fix.py +47 -0
  5. cli/commands/diff_doc.py +52 -0
  6. cli/commands/dispatch.py +77 -0
  7. cli/commands/extract.py +72 -0
  8. cli/commands/file_list.py +74 -0
  9. cli/commands/index.py +84 -0
  10. cli/commands/lock.py +89 -0
  11. cli/commands/merge.py +60 -0
  12. cli/commands/merge_delta.py +19 -0
  13. cli/commands/metadata.py +24 -0
  14. cli/commands/pipeline.py +45 -0
  15. cli/commands/post_merge.py +43 -0
  16. cli/commands/query.py +52 -0
  17. cli/commands/render.py +101 -0
  18. cli/commands/scan_repos.py +46 -0
  19. cli/commands/setup.py +94 -0
  20. cli/commands/split.py +196 -0
  21. cli/commands/stale_files.py +98 -0
  22. cli/commands/validate.py +191 -0
  23. core/__init__.py +32 -0
  24. core/config.py +261 -0
  25. core/docs/__init__.py +7 -0
  26. core/docs/section_updater.py +286 -0
  27. core/docs/shared.py +149 -0
  28. core/git.py +294 -0
  29. core/interfaces.py +249 -0
  30. core/monitor/__init__.py +5 -0
  31. core/monitor/progress.py +83 -0
  32. core/monitor/prompt_store.py +49 -0
  33. core/paths.py +141 -0
  34. core/preset.py +237 -0
  35. core/preset_accessors.py +202 -0
  36. core/preset_classify.py +132 -0
  37. core/preset_hooks.py +129 -0
  38. core/preset_profile.py +89 -0
  39. core/prompt/__init__.py +7 -0
  40. core/prompt/__main__.py +147 -0
  41. core/prompt/content.py +320 -0
  42. core/prompt/context_manager.py +164 -0
  43. core/prompt/renderer.py +236 -0
  44. core/prompt/response_parser.py +274 -0
  45. core/prompt/templates.py +357 -0
  46. core/prompt/validate_parity.py +162 -0
  47. core/prompt/variables.py +339 -0
  48. core/rag/__init__.py +22 -0
  49. core/rag/__main__.py +136 -0
  50. core/rag/bm25_index.py +268 -0
  51. core/rag/chunker.py +273 -0
  52. core/rag/embedder.py +151 -0
  53. core/rag/indexer.py +292 -0
  54. core/rag/loader.py +89 -0
  55. core/rag/retriever.py +82 -0
  56. core/skeleton/__init__.py +11 -0
  57. core/skeleton/__main__.py +934 -0
  58. core/skeleton/anchor_fix.py +250 -0
  59. core/skeleton/classify.py +331 -0
  60. core/skeleton/cmd_anchor_fix.py +43 -0
  61. core/skeleton/cmd_diff_doc.py +44 -0
  62. core/skeleton/cmd_lock.py +87 -0
  63. core/skeleton/cmd_merge_delta.py +41 -0
  64. core/skeleton/community.py +233 -0
  65. core/skeleton/dependency_graph.py +306 -0
  66. core/skeleton/diff_doc.py +248 -0
  67. core/skeleton/dispatch.py +273 -0
  68. core/skeleton/dispatch_render.py +319 -0
  69. core/skeleton/dispatch_source.py +111 -0
  70. core/skeleton/extract.py +218 -0
  71. core/skeleton/extract_methods.py +298 -0
  72. core/skeleton/file_list.py +239 -0
  73. core/skeleton/impact.py +278 -0
  74. core/skeleton/jar_download.py +177 -0
  75. core/skeleton/jar_resolver.py +186 -0
  76. core/skeleton/loader.py +162 -0
  77. core/skeleton/merge.py +278 -0
  78. core/skeleton/merge_delta.py +229 -0
  79. core/skeleton/metadata.py +96 -0
  80. core/skeleton/metadata_builders.py +264 -0
  81. core/skeleton/module_dag.py +330 -0
  82. core/skeleton/parsers/__init__.py +71 -0
  83. core/skeleton/parsers/jqassistant.py +300 -0
  84. core/skeleton/parsers/jqassistant_cypher.py +225 -0
  85. core/skeleton/parsers/regex.py +171 -0
  86. core/skeleton/parsers/treesitter.py +324 -0
  87. core/skeleton/parsers/treesitter_java.py +284 -0
  88. core/skeleton/parsers/treesitter_multi.py +289 -0
  89. core/skeleton/pom_parser.py +299 -0
  90. core/skeleton/post_merge.py +295 -0
  91. core/skeleton/post_merge_llm.py +82 -0
  92. core/skeleton/query.py +195 -0
  93. core/skeleton/shard_context.py +177 -0
  94. core/skeleton/split.py +180 -0
  95. core/skeleton/split_cache.py +107 -0
  96. core/skeleton/split_feedback.py +174 -0
  97. core/skeleton/split_plan.py +219 -0
  98. core/skeleton/split_plan_helpers.py +305 -0
  99. core/skeleton/split_plan_llm.py +274 -0
  100. core/utils.py +135 -0
  101. core/validators/__init__.py +65 -0
  102. core/validators/__main__.py +215 -0
  103. core/validators/consistency.py +203 -0
  104. core/validators/coverage.py +171 -0
  105. core/validators/duplicates.py +76 -0
  106. core/validators/engine.py +224 -0
  107. core/validators/links.py +76 -0
  108. core/validators/sampling.py +169 -0
  109. core/validators/structure.py +144 -0
  110. engine/__init__.py +7 -0
  111. engine/assembler.py +231 -0
  112. engine/confirm.py +65 -0
  113. engine/dedup.py +106 -0
  114. engine/main.py +211 -0
  115. engine/pipeline/__init__.py +163 -0
  116. engine/pipeline/recovery.py +250 -0
  117. engine/pipeline/steps/__init__.py +23 -0
  118. engine/pipeline/steps/audit.py +220 -0
  119. engine/pipeline/steps/audit_apply.py +195 -0
  120. engine/pipeline/steps/audit_helpers.py +155 -0
  121. engine/pipeline/steps/classify_llm.py +236 -0
  122. engine/pipeline/steps/classify_prompt.py +223 -0
  123. engine/pipeline/steps/finalize.py +160 -0
  124. engine/pipeline/steps/generate.py +169 -0
  125. engine/pipeline/steps/generate_batch.py +197 -0
  126. engine/pipeline/steps/generate_recovery.py +170 -0
  127. engine/pipeline/steps/llm_plan_split.py +253 -0
  128. engine/pipeline/steps/lock.py +64 -0
  129. engine/pipeline/steps/preflight.py +237 -0
  130. engine/pipeline/steps/preflight_adjust.py +147 -0
  131. engine/pipeline/steps/pregenerate.py +130 -0
  132. engine/pipeline/steps/quality.py +81 -0
  133. engine/pipeline/steps/skeleton.py +149 -0
  134. engine/pipeline/steps/source.py +163 -0
  135. engine/pipeline/steps/sync.py +117 -0
  136. engine/pipeline/steps/sync_finalize.py +237 -0
  137. engine/pipeline/steps/sync_update.py +341 -0
  138. engine/pipelines.py +91 -0
  139. engine/runner.py +335 -0
  140. engine/strategies/__init__.py +86 -0
  141. engine/strategies/api.py +128 -0
  142. engine/strategies/delegated.py +50 -0
  143. engine/strategies/dryrun.py +25 -0
  144. engine/two_phase.py +143 -0
  145. mcp_server/__init__.py +73 -0
  146. mcp_server/__main__.py +5 -0
  147. mcp_server/tools/__init__.py +1 -0
  148. mcp_server/tools/config.py +63 -0
  149. mcp_server/tools/discovery.py +276 -0
  150. mcp_server/tools/generation.py +184 -0
  151. mcp_server/tools/planning.py +144 -0
  152. mcp_server/tools/source.py +175 -0
  153. mcp_server/tools/validation.py +140 -0
  154. mcp_server/tools/workflow.py +166 -0
  155. mcp_server/workflow_loader.py +204 -0
  156. presets/generic/audit_dimensions.md +132 -0
  157. presets/generic/doc_types.yaml +152 -0
  158. presets/generic/preset.yaml +115 -0
  159. presets/java-spring/audit_dimensions.md +228 -0
  160. presets/java-spring/audit_dimensions.yaml +203 -0
  161. presets/java-spring/doc_types.yaml +269 -0
  162. presets/java-spring/hooks.py +122 -0
  163. presets/java-spring/preset.yaml +341 -0
  164. presets/java-spring/templates/README.md +34 -0
  165. presets/java-spring/templates/audit-system.md +15 -0
  166. presets/java-spring/templates/subagent-aop.md +105 -0
  167. presets/java-spring/templates/subagent-api.md +63 -0
  168. presets/java-spring/templates/subagent-architecture.md +111 -0
  169. presets/java-spring/templates/subagent-async-events.md +107 -0
  170. presets/java-spring/templates/subagent-audit-api-contracts.md +40 -0
  171. presets/java-spring/templates/subagent-audit-architecture.md +38 -0
  172. presets/java-spring/templates/subagent-audit-business.md +40 -0
  173. presets/java-spring/templates/subagent-audit-data-models.md +40 -0
  174. presets/java-spring/templates/subagent-business.md +129 -0
  175. presets/java-spring/templates/subagent-caching.md +75 -0
  176. presets/java-spring/templates/subagent-database-access.md +114 -0
  177. presets/java-spring/templates/subagent-enum.md +75 -0
  178. presets/java-spring/templates/subagent-error-handling.md +91 -0
  179. presets/java-spring/templates/subagent-external-integrations.md +80 -0
  180. presets/java-spring/templates/subagent-index.md +122 -0
  181. presets/java-spring/templates/subagent-messaging.md +97 -0
  182. presets/java-spring/templates/subagent-model.md +88 -0
  183. presets/java-spring/templates/subagent-observability.md +91 -0
  184. presets/java-spring/templates/subagent-scheduled.md +81 -0
  185. presets/java-spring/templates/subagent-security.md +102 -0
  186. presets/java-spring/templates/subagent-structure.md +101 -0
  187. presets/java-spring/templates/subagent-sync-section.md +34 -0
  188. presets/java-spring/templates/subagent-utils.md +73 -0
  189. presets/java-spring/templates/sync-system.md +8 -0
  190. presets/java-spring/workflow-extensions.md +112 -0
  191. skills/__init__.py +1 -0
  192. skills/_shared/README.md +30 -0
  193. skills/_shared/doc-coverage-shared.md +134 -0
  194. skills/_shared/doc-quality-standard.md +1058 -0
  195. skills/_shared/doc-subagent-rules.md +762 -0
  196. skills/_shared/windows-compat.md +89 -0
  197. skills/kb-audit/SKILL.md +52 -0
  198. skills/kb-audit/rules.md +88 -0
  199. skills/kb-audit/steps/step-01-prepare.md +75 -0
  200. skills/kb-audit/steps/step-02-audit.md +96 -0
  201. skills/kb-audit/steps/step-03-verify.md +65 -0
  202. skills/kb-audit/steps/step-04-report.md +64 -0
  203. skills/kb-init/SKILL.md +146 -0
  204. skills/kb-init/rules.md +187 -0
  205. skills/kb-init/steps/step-01-scope.md +62 -0
  206. skills/kb-init/steps/step-02-source.md +410 -0
  207. skills/kb-init/steps/step-03-generate.md +307 -0
  208. skills/kb-init/steps/step-04-quality.md +92 -0
  209. skills/kb-init/steps/step-05-finalize.md +132 -0
  210. skills/kb-init/templates/core/execution-modes.md +29 -0
  211. skills/kb-init/templates/core/output-only.md +4 -0
  212. skills/kb-init/templates/core/readwrite.md +33 -0
  213. skills/kb-search/SKILL.md +138 -0
  214. skills/kb-search/rules.md +64 -0
  215. skills/kb-sync/SKILL.md +43 -0
  216. skills/kb-sync/rules.md +70 -0
  217. skills/kb-sync/scripts/rebuild_module.py +91 -0
  218. skills/kb-sync/scripts/scan_repos.py +687 -0
  219. skills/kb-sync/steps/step-01-detect.md +72 -0
  220. skills/kb-sync/steps/step-02-update.md +71 -0
  221. skills/kb-sync/steps/step-03-verify.md +47 -0
  222. skills/kb-sync/steps/step-04-finalize.md +52 -0
  223. source_kb-0.2.2.dist-info/METADATA +194 -0
  224. source_kb-0.2.2.dist-info/RECORD +228 -0
  225. source_kb-0.2.2.dist-info/WHEEL +5 -0
  226. source_kb-0.2.2.dist-info/entry_points.txt +3 -0
  227. source_kb-0.2.2.dist-info/licenses/LICENSE +21 -0
  228. source_kb-0.2.2.dist-info/top_level.txt +6 -0
@@ -0,0 +1,160 @@
1
+ """Finalization steps — merge, dedup, publish, index, cleanup."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from core.interfaces import Step, StepResult, PipelineContext
8
+ from engine.pipeline import register_step
9
+
10
+
11
@register_step
class MergeShardsStep(Step):
    """Merge split document shards back into one file per doc type.

    The file names listed in the preset's doc-type mapping (minus their
    ``.md`` suffix) are used as shard prefixes.  For every module directory
    and every prefix, the shards found by ``find_shards`` are merged into
    ``<prefix>.md``, the shard files are deleted, and the merged document is
    passed through ``refine_merged_doc``.
    """

    default_name = "merge-shards"

    def __init__(self):
        super().__init__("merge-shards")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Merge the shards of every module selected by ``ctx``.

        Args:
            ctx: Pipeline context; ``kb_config["preset"]`` selects the preset
                (default ``"generic"``).

        Returns:
            A ``StepResult`` with status ``"ok"`` whose message reports how
            many documents were merged.
        """
        from core.preset import load_preset, get_doc_type_mapping
        from core.skeleton.merge import find_shards, merge_shards, refine_merged_doc

        merged_count = 0
        module_dirs = _module_dirs(ctx)
        if not module_dirs:
            # Nothing to merge; skip loading the preset, as before.
            return StepResult(status="ok", message=f"Merged {merged_count} documents")

        # The preset does not depend on the module, so the shard prefixes are
        # resolved once instead of reloading the preset for every module.
        preset_name = ctx.kb_config.get("preset", "generic")
        preset = load_preset(preset_name)
        dt_mapping = get_doc_type_mapping(preset)
        # Sorted so documents are merged in the same order on every run; a
        # set of strings has no stable iteration order across runs.
        prefixes = (
            sorted({fn.removesuffix(".md") for fn in dt_mapping.values()})
            if dt_mapping else []
        )

        for module_dir in module_dirs:
            for prefix in prefixes:
                shards = find_shards(module_dir, prefix)
                if not shards:
                    continue

                merged_file = module_dir / f"{prefix}.md"
                # A merged document above 100 bytes is treated as already
                # done; its shards are left untouched.
                if merged_file.exists() and merged_file.stat().st_size > 100:
                    continue

                content = merge_shards(shards)
                if not content.strip():
                    continue

                merged_file.write_text(content, encoding="utf-8")
                for shard in shards:
                    shard.unlink(missing_ok=True)
                merged_count += 1

                # Post-merge refinement
                result = refine_merged_doc(merged_file)
                if result.changed:
                    result.apply()

        return StepResult(status="ok", message=f"Merged {merged_count} documents")
53
+
54
+
55
@register_step
class DeduplicateStep(Step):
    """Pipeline step that runs LLM-based deduplication over large documents."""

    default_name = "deduplicate"

    def __init__(self):
        super().__init__("deduplicate")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Deduplicate every sufficiently large ``*.md`` file of each module.

        A probe request is sent to the strategy first; when it answers with
        status ``"delegated"`` the step is skipped.  Otherwise each document
        whose size reaches the preset limit ``dedup_min_doc_size_bytes``
        (default 15 KiB) is handed to ``dedup_document`` together with the
        names of all markdown files in the same directory.

        Returns:
            ``"skipped"`` in delegated mode, otherwise ``"ok"`` with a message
            summarising how many documents changed.
        """
        from engine.dedup import dedup_document
        from engine.strategies import create_strategy
        from engine.pipeline.steps.generate import _make_config_obj
        from core.preset import load_preset

        strategy = create_strategy(_make_config_obj(ctx))

        # Probe: delegated strategy means Agent handles dedup
        from core.interfaces import LlmRequest
        probe_reply = strategy.call(LlmRequest(system="probe", user="probe"))
        if probe_reply.status == "delegated":
            return StepResult(status="skipped", message="Delegated mode — dedup handled by Agent")

        preset = load_preset(ctx.kb_config.get("preset", "generic"))
        size_floor = preset.get("limits", {}).get("dedup_min_doc_size_bytes", 15 * 1024)

        changed_docs = 0
        kb_saved = 0.0

        for module_dir in _module_dirs(ctx):
            documents = sorted(module_dir.glob("*.md"))
            names_in_dir = [doc.name for doc in documents]

            for doc in documents:
                if doc.stat().st_size >= size_floor:
                    was_changed, saved = dedup_document(doc, names_in_dir, strategy)
                    if was_changed:
                        changed_docs += 1
                        kb_saved += saved

        if changed_docs != 0:
            return StepResult(status="ok", message=f"Deduped {changed_docs} docs (saved {kb_saved:.1f}KB)")
        return StepResult(status="ok", message="No redundancy found")
102
+
103
+
104
@register_step
class RebuildIndexStep(Step):
    """Placeholder step for rebuilding the vector index of generated docs."""

    default_name = "rebuild-index"

    def __init__(self):
        super().__init__("rebuild-index", checkpoint="cp9")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Report success without doing any work; the rebuild is not wired in yet."""
        # TODO: Import from core.rag when migrated
        placeholder = StepResult(status="ok", message="Index rebuild (placeholder)")
        return placeholder
116
+
117
+
118
@register_step
class CleanProgressStep(Step):
    """Pipeline step that deletes progress files once a run has completed."""

    default_name = "clean-progress"

    def __init__(self):
        super().__init__("clean-progress")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Call ``cleanup_progress`` on each module directory and report the total."""
        from core.monitor.progress import cleanup_progress

        # cleanup_progress returns a count per module; add them up.
        cleaned = sum(cleanup_progress(module_dir) for module_dir in _module_dirs(ctx))
        return StepResult(status="ok", message=f"Cleaned {cleaned} progress files")
134
+
135
+
136
@register_step
class SharedDocsStep(Step):
    """Pipeline step that produces the cross-module shared documents."""

    default_name = "shared-docs"

    def __init__(self):
        super().__init__("shared-docs")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Delegate to ``generate_shared_docs`` and wrap its outcome.

        Returns:
            ``"ok"`` with the generated entries under ``details["generated"]``
            when anything was produced, otherwise ``"skipped"``.
        """
        from core.docs.shared import generate_shared_docs

        generated = generate_shared_docs(ctx.knowledge_dir, ctx.config, ctx.kb_name)
        if generated:
            return StepResult(
                status="ok",
                message=f"Generated {len(generated)} shared docs",
                details={"generated": generated},
            )
        return StepResult(status="skipped", message="No shared docs to generate")
153
+
154
+
155
+ def _module_dirs(ctx: PipelineContext) -> list[Path]:
156
+ if ctx.module:
157
+ d = ctx.knowledge_dir / ctx.module
158
+ return [d] if d.is_dir() else []
159
+ return sorted(d for d in ctx.knowledge_dir.iterdir()
160
+ if d.is_dir() and not d.name.startswith(".") and d.name != "_shared")
@@ -0,0 +1,169 @@
1
+ """Document generation step — the main LLM-powered step.
2
+
3
+ Orchestrates prompt rendering, split planning, batch execution,
4
+ and two-phase generation for large modules.
5
+
6
+ Requirements: 5.1, 5.5
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import time
13
+ from pathlib import Path
14
+
15
+ from core.interfaces import Step, StepResult, PipelineContext
16
+ from core.monitor.progress import write_progress
17
+ from engine.pipeline import register_step
18
+ from engine.runner import BatchAbortError
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ MIN_DOC_SIZE_BYTES = 500
23
+
24
+
25
@register_step
class GenerateDocsStep(Step):
    """Pipeline step that renders prompts and runs them through the LLM runner.

    Work is driven by each module's ``.meta/dispatch-tasks.json`` manifest:
    its tasks are grouped by their ``batch`` key and the groups are executed
    one after another in sorted key order.
    """

    default_name = "generate-docs"

    def __init__(self):
        super().__init__("generate-docs", checkpoint="cp4")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Generate the documents listed in every module's dispatch manifest.

        Tasks whose output file already exists and is larger than
        ``MIN_DOC_SIZE_BYTES`` are skipped, as are tasks without a template.

        Returns:
            ``"ok"`` with the generated paths in ``details["generated"]``,
            ``"skipped"`` when nothing was generated, or ``"failed"`` when
            the runner raises ``BatchAbortError``.
        """
        import json
        from core.preset import load_preset
        from core.prompt.renderer import render_prompt
        from engine.strategies import create_strategy, ConfigProxy
        from engine.assembler import InlinePromptAssembler
        from engine.runner import SubagentTask, run_batch, configure_garbage_patterns, CircuitBreakerConfig

        config = ctx.config
        preset_name = ctx.kb_config.get("preset", "generic")
        preset = load_preset(preset_name)

        # Configure runner from preset limits
        limits = preset.get("limits", {})
        configure_garbage_patterns(limits.get("garbage_patterns"))
        breaker_config = CircuitBreakerConfig.from_config({"limits": limits})

        strategy = create_strategy(ConfigProxy(config))
        assembler = InlinePromptAssembler(preset=preset)

        snippet_file = Path(ctx.project_root).joinpath(
            "skills", "kb-init", "templates", "core", "output-only.md"
        )
        if snippet_file.exists():
            execution_snippet = snippet_file.read_text(encoding="utf-8")
        else:
            execution_snippet = ""

        from core.monitor.prompt_store import should_save_prompts
        save_prompts = should_save_prompts(config)

        module_repos: dict[str, Path] = ctx.state.get("module_repos", {})
        generated: list[str] = []
        concurrency = ctx.config.get("agent", {}).get("max_concurrent_subagents", 5)

        for name in module_repos:
            module_dir = ctx.knowledge_dir / name
            manifest_path = module_dir / ".meta" / "dispatch-tasks.json"

            if not manifest_path.exists():
                logger.warning("No dispatch-tasks.json for %s, skipping", name)
                continue

            manifest = json.loads(manifest_path.read_text(encoding="utf-8"))

            # Group by batch for sequential execution
            grouped: dict[str, list[dict]] = {}
            for entry in manifest.get("tasks", []):
                grouped.setdefault(entry["batch"], []).append(entry)

            for batch_id in sorted(grouped):
                entries = grouped[batch_id]
                pending: list[SubagentTask] = []

                for entry in entries:
                    target = Path(entry["output_file"])
                    # An existing, non-trivial output counts as already generated.
                    if target.exists() and target.stat().st_size > MIN_DOC_SIZE_BYTES:
                        continue

                    template = _find_template(ctx.project_root, preset_name, entry["doc_type"], preset)
                    if not template:
                        continue

                    overrides = {}
                    shard_files = entry.get("shard_file_list")
                    if shard_files:
                        overrides["file_list_override"] = shard_files

                    rendered = render_prompt(
                        template_path=template,
                        config=config,
                        kb_name=ctx.kb_name,
                        module_name=name,
                        doc_type=entry["doc_type"],
                        assembler=assembler,
                        extras=overrides,
                        execution_snippet=execution_snippet,
                        preset=preset,
                    )

                    pending.append(SubagentTask(
                        task_id=f"{name}__{entry['id']}",
                        prompt=rendered,
                        output_path=target,
                        doc_type=entry["doc_type"],
                    ))

                if not pending:
                    continue

                _write_heartbeat(module_dir, [e["doc_type"] for e in entries])
                try:
                    results = run_batch(pending, strategy, concurrency,
                                        save_prompts=save_prompts, breaker_config=breaker_config)
                except BatchAbortError as e:
                    logger.error("Batch %s aborted: %s", batch_id, e.reason)
                    logger.error("Diagnosis: %s", e.diagnosis)
                    return StepResult(
                        status="failed",
                        message=f"Generation aborted: {e.reason}\nDiagnosis: {e.diagnosis}",
                        details={"generated": generated, "abort_reason": e.reason, "diagnosis": e.diagnosis},
                    )

                for outcome in results:
                    written = outcome.output_path
                    if outcome.status == "done" and written and written.exists():
                        generated.append(f"{name}/{written.name}")
                        # Progress is keyed by the last "__"-separated part of the task id.
                        write_progress(module_dir, outcome.task_id.rpartition("__")[2], "DONE")

        if generated:
            return StepResult(status="ok", message=f"Generated {len(generated)} documents",
                              details={"generated": generated})
        return StepResult(status="skipped", message="No documents generated")
137
+
138
+
139
def _make_config_obj(ctx: PipelineContext):
    """Wrap ``ctx.config`` in a ``ConfigProxy``; shared by several pipeline steps."""
    from engine.strategies import ConfigProxy

    proxy = ConfigProxy(ctx.config)
    return proxy
143
+
144
+
145
+ def _find_template(project_root: Path, preset_name: str, doc_type: str, preset: dict = None) -> Path | None:
146
+ """Find the template file for a doc type from preset config.
147
+
148
+ Uses preset search path: env > local > built-in.
149
+ """
150
+ from core.preset import find_preset_template, get_template_path
151
+
152
+ template_name = None
153
+ if preset:
154
+ template_name = get_template_path(preset, doc_type, preset_name)
155
+
156
+ if not template_name:
157
+ return None
158
+
159
+ return find_preset_template(preset_name, template_name)
160
+
161
+
162
+ def _write_heartbeat(module_dir: Path, batch: list[str]) -> None:
163
+ """Write heartbeat file for current batch progress tracking."""
164
+ hb_path = module_dir / ".meta" / "heartbeat.txt"
165
+ hb_path.parent.mkdir(parents=True, exist_ok=True)
166
+ hb_path.write_text(
167
+ f"batch={','.join(batch)} ts={time.time():.0f}\n",
168
+ encoding="utf-8",
169
+ )
@@ -0,0 +1,197 @@
1
+ """Batch execution module for document generation.
2
+
3
+ Handles single-batch execution logic:
4
+ - Shard generation via LlmStrategy
5
+ - Two-phase coordination (call engine/two_phase.py)
6
+ - Concurrency control
7
+
8
+ Requirements: 5.2, 5.3, 5.4
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ from pathlib import Path
15
+
16
+ from core.interfaces import LlmStrategy, PipelineContext
17
+ from core.paths import resolve_file_list
18
+ from core.monitor.progress import write_progress
19
+ from engine.runner import SubagentTask, SubagentResult, run_batch
20
+ from engine.two_phase import run_two_phase
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ MIN_DOC_SIZE_BYTES = 500
25
+
26
+
27
def execute_batch(
    ctx: PipelineContext,
    module_name: str,
    doc_types: list[str],
    strategy: LlmStrategy,
    max_concurrent: int = 5,
) -> list[SubagentResult]:
    """Execute a single batch of doc_types for a module.

    For each doc_type, determines if splitting is needed and dispatches
    either direct generation or two-phase split generation.  A doc type is
    skipped when its output already exists and is larger than
    ``MIN_DOC_SIZE_BYTES``, when it has no file list, or when no template is
    found for it.

    Args:
        ctx: Pipeline context
        module_name: Name of the module being processed
        doc_types: List of doc types in this batch
        strategy: LLM execution strategy
        max_concurrent: Max concurrent tasks

    Returns:
        List of SubagentResults from all tasks in this batch: direct tasks
        first, then the results of the two-phase run.
    """
    from core.preset import load_preset
    from core.skeleton.split import SplitConfig
    from core.prompt.renderer import render_prompt
    from engine.assembler import InlinePromptAssembler
    from engine.pipeline.steps.generate import _find_template

    preset_name = ctx.kb_config.get("preset", "generic")
    preset = load_preset(preset_name)
    split_config = SplitConfig.from_preset(preset, mode="output-only")
    # NOTE(review): the assembler is built without a preset here; confirm
    # that is intended for this code path.
    assembler = InlinePromptAssembler()

    module_dir = ctx.knowledge_dir / module_name

    tasks: list[SubagentTask] = []
    split_tasks: list[SubagentTask] = []

    for doc_type in doc_types:
        doc_path = module_dir / f"{doc_type}.md"
        if doc_path.exists() and doc_path.stat().st_size > MIN_DOC_SIZE_BYTES:
            continue

        if not resolve_file_list(module_dir, doc_type):
            continue

        template_path = _find_template(ctx.project_root, preset_name, doc_type, preset)
        if not template_path:
            continue

        # Compute splits based on file list size
        num_splits = _compute_doc_splits(module_dir, doc_type, split_config)

        prompt = render_prompt(
            template_path=template_path,
            config=ctx.config,
            kb_name=ctx.kb_name,
            module_name=module_name,
            doc_type=doc_type,
            assembler=assembler,
        )

        if num_splits > 1:
            # Create shard tasks for two-phase generation
            shard_tasks = _create_shard_tasks(
                ctx, module_dir, module_name, doc_type,
                prompt, num_splits, template_path, assembler,
            )
            split_tasks.extend(shard_tasks)
        else:
            tasks.append(SubagentTask(
                task_id=f"{module_name}__{doc_type}",
                prompt=prompt,
                output_path=doc_path,
                doc_type=doc_type,
            ))

    results: list[SubagentResult] = []

    # Execute direct tasks
    if tasks:
        logger.info("[batch] Running %d direct tasks for %s", len(tasks), module_name)
        results.extend(run_batch(tasks, strategy, max_concurrent))

    # Execute split tasks via two-phase
    if split_tasks:
        logger.info("[batch] Running %d split tasks (two-phase) for %s",
                    len(split_tasks), module_name)
        results.extend(run_two_phase(split_tasks, strategy, max_concurrent))

    return results
120
+
121
+
122
def _compute_doc_splits(
    module_dir: Path, doc_type: str, split_config: "SplitConfig"
) -> int:
    """Compute number of splits needed for a doc type.

    Args:
        module_dir: Directory of the module being processed.
        doc_type: Doc type whose file list is inspected.
        split_config: Supplies ``max_files_per_shard``; when falsy, a
            threshold of 30 files is used.

    Returns:
        ``1`` when there is no readable file list or its entry count does not
        exceed the threshold, otherwise a shard count between 2 and 4.
    """
    fl_path = resolve_file_list(module_dir, doc_type)
    if not fl_path:
        return 1

    try:
        text = fl_path.read_text(encoding="utf-8")
    except OSError:
        return 1

    # Only non-blank lines name a file; blank separator lines must not push
    # the document over the split threshold.
    file_count = sum(1 for line in text.splitlines() if line.strip())

    # Simple heuristic: split if file count exceeds threshold
    threshold = split_config.max_files_per_shard if split_config else 30
    if not threshold or threshold <= 0:
        # A missing or non-positive limit would break the division below;
        # fall back to the same default used when no config is given.
        threshold = 30
    if file_count <= threshold:
        return 1
    return min(4, max(2, file_count // threshold + 1))
143
+
144
+
145
def _create_shard_tasks(
    ctx: PipelineContext,
    module_dir: Path,
    module_name: str,
    doc_type: str,
    base_prompt: str,
    num_splits: int,
    template_path: Path,
    assembler,
) -> list[SubagentTask]:
    """Create shard tasks for split generation.

    Divides the file list into at most ``num_splits`` shards and creates a
    task per shard.  Each task's prompt is ``base_prompt`` followed by a
    ``## Files assigned to this shard`` section listing the shard's files.

    Args:
        ctx: Pipeline context (not used here; kept for the call signature).
        module_dir: Directory of the module being processed.
        module_name: Module name, used as the task-id prefix.
        doc_type: Doc type being generated.
        base_prompt: Rendered prompt shared by all shards.
        num_splits: Requested number of shards; values below 1 count as 1.
        template_path: Not used here; kept for the call signature.
        assembler: Not used here; kept for the call signature.

    Returns:
        One ``SubagentTask`` per shard, or an empty list when the file list
        is missing, unreadable or empty.
    """
    from core.paths import shard_doc_path

    fl_path = resolve_file_list(module_dir, doc_type)
    if not fl_path:
        return []

    try:
        raw_lines = fl_path.read_text(encoding="utf-8").splitlines()
    except OSError:
        return []

    # Blank lines would otherwise become empty "- " entries in the prompt.
    all_files = [line for line in raw_lines if line.strip()]
    if not all_files:
        return []

    # Divide files into shards.  Ceiling division keeps the number of slices
    # at or below the requested count; floor division can leave a remainder
    # that forms an extra shard.
    wanted = max(1, num_splits)
    shard_size = -(-len(all_files) // wanted)
    shards: list[list[str]] = [
        all_files[start:start + shard_size]
        for start in range(0, len(all_files), shard_size)
    ]

    # Merge last shard if too small
    if len(shards) > 1 and len(shards[-1]) < shard_size // 2:
        tail = shards.pop()
        shards[-1].extend(tail)

    tasks: list[SubagentTask] = []
    for idx, shard_files in enumerate(shards, 1):
        shard_suffix = "\n\n## Files assigned to this shard\n\n" + "\n".join(f"- {f}" for f in shard_files)
        output_path = shard_doc_path(module_dir, doc_type, idx)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        tasks.append(SubagentTask(
            task_id=f"{module_name}__{doc_type}__shard{idx:02d}",
            prompt=base_prompt + shard_suffix,
            output_path=output_path,
            doc_type=doc_type,
        ))

    return tasks
@@ -0,0 +1,170 @@
1
+ """Recovery strategy chain for failed generation steps.
2
+
3
+ Provides a chain-of-responsibility pattern for handling LLM failures:
4
+ 1. SplitRetryStrategy — re-split into smaller shards and retry
5
+ 2. ModelSwitchStrategy — try a different/larger model
6
+ 3. MarkFailedStrategy — give up and mark as failed
7
+
8
+ Requirements: 5.2
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ from abc import ABC, abstractmethod
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+
18
+ from core.interfaces import LlmStrategy, LlmRequest, LlmResponse
19
+ from engine.runner import SubagentTask, SubagentResult, run_single
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
@dataclass
class RecoveryContext:
    """State handed to each recovery strategy while one failed task is handled."""

    # The task whose generation failed.
    task: SubagentTask
    # LLM strategy the recovery strategies re-run work with.
    strategy: LlmStrategy
    # Description of the original failure; echoed in failure results.
    failure_reason: str
    # Counter advanced by RecoveryChain.attempt; strategies compare against it.
    attempt: int = 0
    # Upper bound on attempts (declared here; not read by the strategies in this module).
    max_attempts: int = 3
32
+
33
+
34
class RecoveryStrategy(ABC):
    """Abstract interface every link of the recovery chain implements."""

    @abstractmethod
    def can_handle(self, ctx: RecoveryContext) -> bool:
        """Return True when this strategy applies to the failure in ``ctx``."""

    @abstractmethod
    def recover(self, ctx: RecoveryContext) -> SubagentResult | None:
        """Try to recover the task.

        Returns a result, or ``None`` to hand the failure to the next strategy.
        """
44
+
45
+
46
class SplitRetryStrategy(RecoveryStrategy):
    """Re-split a failed shard into smaller pieces and retry.

    The shard's file list is read back from the ``## Files assigned to this
    shard`` section of the prompt, cut in half, and each half is run as its
    own sub-task.  The contents of the successful halves are concatenated
    and written to the original task's output path.
    """

    def can_handle(self, ctx: RecoveryContext) -> bool:
        """Apply only to large prompts (over 20000 chars) on an early attempt."""
        # Only applicable if the task has a large prompt (likely too much content)
        return len(ctx.task.prompt) > 20000 and ctx.attempt < 2

    def recover(self, ctx: RecoveryContext) -> SubagentResult | None:
        """Retry the task as two half-sized sub-tasks.

        Returns:
            A ``"done"`` result carrying the merged content when at least one
            half succeeded; ``None`` when the prompt has no shard file
            section, lists two files or fewer, or both halves failed.
        """
        logger.info("[recovery:split] %s: splitting prompt", ctx.task.task_id)

        # Split the prompt roughly in half by file list
        marker = "## Files assigned to this shard"
        head, found, file_section = ctx.task.prompt.partition(marker)
        if not found:
            return None

        files: list[str] = []
        for line in file_section.splitlines():
            entry = line.strip()
            if not entry.startswith("-"):
                continue
            # Remove only the leading bullet.  Stripping the character set
            # "- " from both ends would also eat dashes that belong to the
            # file name itself.
            name = entry[1:].strip()
            if name:
                files.append(name)
        if len(files) <= 2:
            return None

        mid = len(files) // 2
        results: list[SubagentResult] = []
        for idx, chunk in enumerate((files[:mid], files[mid:]), 1):
            chunk_suffix = f"\n\n{marker}\n\n" + "\n".join(f"- {f}" for f in chunk)
            sub_task = SubagentTask(
                task_id=f"{ctx.task.task_id}__split{idx}",
                prompt=head + chunk_suffix,
                output_path=ctx.task.output_path.parent / f"{ctx.task.output_path.stem}-part{idx}.md",
                doc_type=ctx.task.doc_type,
            )
            results.append(run_single(sub_task, ctx.strategy))

        # Merge successful results
        successful = [r for r in results if r.status == "done" and r.content]
        merged_content = "".join(r.content + "\n\n" for r in successful)
        if not merged_content:
            return None

        if len(successful) < len(results):
            # The merged document is still reported as done; make the
            # missing half visible in the log.
            logger.warning("[recovery:split] %s: only %d of %d parts succeeded",
                           ctx.task.task_id, len(successful), len(results))

        ctx.task.output_path.parent.mkdir(parents=True, exist_ok=True)
        ctx.task.output_path.write_text(merged_content.strip(), encoding="utf-8")
        return SubagentResult(
            task_id=ctx.task.task_id, status="done",
            output_path=ctx.task.output_path, content=merged_content,
        )
101
+
102
+
103
class ModelSwitchStrategy(RecoveryStrategy):
    """Retry the failed task unchanged, but against a fallback model."""

    def __init__(self, fallback_model: str | None = None):
        self._fallback = fallback_model

    def can_handle(self, ctx: RecoveryContext) -> bool:
        """Applicable when a fallback model is configured and ``attempt`` is below 2."""
        if self._fallback is None:
            return False
        return ctx.attempt < 2

    def recover(self, ctx: RecoveryContext) -> SubagentResult | None:
        """Re-run the task with ``model`` set to the fallback.

        Returns the runner's result when its status is ``"done"``, else ``None``.
        """
        fallback = self._fallback
        if not fallback:
            return None

        failed = ctx.task
        logger.info("[recovery:model-switch] %s: switching to %s",
                    failed.task_id, fallback)

        retry = SubagentTask(
            task_id=failed.task_id,
            prompt=failed.prompt,
            output_path=failed.output_path,
            doc_type=failed.doc_type,
            model=fallback,
        )
        outcome = run_single(retry, ctx.strategy)
        if outcome.status != "done":
            return None
        return outcome
126
+
127
+
128
class MarkFailedStrategy(RecoveryStrategy):
    """Last link of the chain: record the task as failed so the run can go on."""

    def can_handle(self, ctx: RecoveryContext) -> bool:
        """Always applicable; this is the last-resort strategy."""
        return True  # Always applicable as last resort

    def recover(self, ctx: RecoveryContext) -> SubagentResult | None:
        """Log the give-up and return a ``"failed"`` result quoting the original reason."""
        task_id = ctx.task.task_id
        logger.warning("[recovery:mark-failed] %s: giving up after %d attempts",
                       task_id, ctx.attempt)
        error_text = f"All recovery exhausted: {ctx.failure_reason}"
        return SubagentResult(task_id=task_id, status="failed", error=error_text)
141
+
142
+
143
class RecoveryChain:
    """Chain of recovery strategies executed in order."""

    def __init__(self, strategies: list[RecoveryStrategy] | None = None):
        """Use ``strategies`` if given and non-empty, else the default chain.

        The default chain is split-retry, then model-switch (constructed
        without a fallback model), then mark-failed.
        """
        self._strategies = strategies or [
            SplitRetryStrategy(),
            ModelSwitchStrategy(),
            MarkFailedStrategy(),
        ]

    def attempt(
        self, task: SubagentTask, strategy: LlmStrategy, failure_reason: str
    ) -> SubagentResult:
        """Try recovery strategies in order until one succeeds.

        Args:
            task: The task whose generation failed.
            strategy: LLM strategy the recovery strategies run with.
            failure_reason: Description of the original failure.

        Returns:
            The first ``"done"`` result.  If none succeeds, the most recent
            ``"failed"`` result a strategy produced, or a generic failed
            result when no strategy produced one.
        """
        ctx = RecoveryContext(task=task, strategy=strategy, failure_reason=failure_reason)
        last_failure: SubagentResult | None = None

        for recovery in self._strategies:
            # ``ctx.attempt`` counts recoveries already tried.  It must not
            # advance for strategies that are merely consulted: strategies
            # gate on ``attempt < 2``, and counting every visit would lock
            # out the second link of the chain.
            if not recovery.can_handle(ctx):
                continue

            result = recovery.recover(ctx)
            ctx.attempt += 1

            if result is None:
                continue
            if result.status == "done":
                return result
            if result.status == "failed":
                # Keep the strategy's own failure (e.g. from the mark-failed
                # link) instead of discarding it.
                last_failure = result

        if last_failure is not None:
            return last_failure
        return SubagentResult(
            task_id=task.task_id, status="failed",
            error=f"Recovery chain exhausted: {failure_reason}",
        )