source-kb 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +50 -0
- cli/__main__.py +5 -0
- cli/commands/__init__.py +1 -0
- cli/commands/anchor_fix.py +47 -0
- cli/commands/diff_doc.py +52 -0
- cli/commands/dispatch.py +77 -0
- cli/commands/extract.py +72 -0
- cli/commands/file_list.py +74 -0
- cli/commands/index.py +84 -0
- cli/commands/lock.py +89 -0
- cli/commands/merge.py +60 -0
- cli/commands/merge_delta.py +19 -0
- cli/commands/metadata.py +24 -0
- cli/commands/pipeline.py +45 -0
- cli/commands/post_merge.py +43 -0
- cli/commands/query.py +52 -0
- cli/commands/render.py +101 -0
- cli/commands/scan_repos.py +46 -0
- cli/commands/setup.py +94 -0
- cli/commands/split.py +196 -0
- cli/commands/stale_files.py +98 -0
- cli/commands/validate.py +191 -0
- core/__init__.py +32 -0
- core/config.py +261 -0
- core/docs/__init__.py +7 -0
- core/docs/section_updater.py +286 -0
- core/docs/shared.py +149 -0
- core/git.py +294 -0
- core/interfaces.py +249 -0
- core/monitor/__init__.py +5 -0
- core/monitor/progress.py +83 -0
- core/monitor/prompt_store.py +49 -0
- core/paths.py +141 -0
- core/preset.py +237 -0
- core/preset_accessors.py +202 -0
- core/preset_classify.py +132 -0
- core/preset_hooks.py +129 -0
- core/preset_profile.py +89 -0
- core/prompt/__init__.py +7 -0
- core/prompt/__main__.py +147 -0
- core/prompt/content.py +320 -0
- core/prompt/context_manager.py +164 -0
- core/prompt/renderer.py +236 -0
- core/prompt/response_parser.py +274 -0
- core/prompt/templates.py +357 -0
- core/prompt/validate_parity.py +162 -0
- core/prompt/variables.py +339 -0
- core/rag/__init__.py +22 -0
- core/rag/__main__.py +136 -0
- core/rag/bm25_index.py +268 -0
- core/rag/chunker.py +273 -0
- core/rag/embedder.py +151 -0
- core/rag/indexer.py +292 -0
- core/rag/loader.py +89 -0
- core/rag/retriever.py +82 -0
- core/skeleton/__init__.py +11 -0
- core/skeleton/__main__.py +934 -0
- core/skeleton/anchor_fix.py +250 -0
- core/skeleton/classify.py +331 -0
- core/skeleton/cmd_anchor_fix.py +43 -0
- core/skeleton/cmd_diff_doc.py +44 -0
- core/skeleton/cmd_lock.py +87 -0
- core/skeleton/cmd_merge_delta.py +41 -0
- core/skeleton/community.py +233 -0
- core/skeleton/dependency_graph.py +306 -0
- core/skeleton/diff_doc.py +248 -0
- core/skeleton/dispatch.py +273 -0
- core/skeleton/dispatch_render.py +319 -0
- core/skeleton/dispatch_source.py +111 -0
- core/skeleton/extract.py +218 -0
- core/skeleton/extract_methods.py +298 -0
- core/skeleton/file_list.py +239 -0
- core/skeleton/impact.py +278 -0
- core/skeleton/jar_download.py +177 -0
- core/skeleton/jar_resolver.py +186 -0
- core/skeleton/loader.py +162 -0
- core/skeleton/merge.py +278 -0
- core/skeleton/merge_delta.py +229 -0
- core/skeleton/metadata.py +96 -0
- core/skeleton/metadata_builders.py +264 -0
- core/skeleton/module_dag.py +330 -0
- core/skeleton/parsers/__init__.py +71 -0
- core/skeleton/parsers/jqassistant.py +300 -0
- core/skeleton/parsers/jqassistant_cypher.py +225 -0
- core/skeleton/parsers/regex.py +171 -0
- core/skeleton/parsers/treesitter.py +324 -0
- core/skeleton/parsers/treesitter_java.py +284 -0
- core/skeleton/parsers/treesitter_multi.py +289 -0
- core/skeleton/pom_parser.py +299 -0
- core/skeleton/post_merge.py +295 -0
- core/skeleton/post_merge_llm.py +82 -0
- core/skeleton/query.py +195 -0
- core/skeleton/shard_context.py +177 -0
- core/skeleton/split.py +180 -0
- core/skeleton/split_cache.py +107 -0
- core/skeleton/split_feedback.py +174 -0
- core/skeleton/split_plan.py +219 -0
- core/skeleton/split_plan_helpers.py +305 -0
- core/skeleton/split_plan_llm.py +274 -0
- core/utils.py +135 -0
- core/validators/__init__.py +65 -0
- core/validators/__main__.py +215 -0
- core/validators/consistency.py +203 -0
- core/validators/coverage.py +171 -0
- core/validators/duplicates.py +76 -0
- core/validators/engine.py +224 -0
- core/validators/links.py +76 -0
- core/validators/sampling.py +169 -0
- core/validators/structure.py +144 -0
- engine/__init__.py +7 -0
- engine/assembler.py +231 -0
- engine/confirm.py +65 -0
- engine/dedup.py +106 -0
- engine/main.py +211 -0
- engine/pipeline/__init__.py +163 -0
- engine/pipeline/recovery.py +250 -0
- engine/pipeline/steps/__init__.py +23 -0
- engine/pipeline/steps/audit.py +220 -0
- engine/pipeline/steps/audit_apply.py +195 -0
- engine/pipeline/steps/audit_helpers.py +155 -0
- engine/pipeline/steps/classify_llm.py +236 -0
- engine/pipeline/steps/classify_prompt.py +223 -0
- engine/pipeline/steps/finalize.py +160 -0
- engine/pipeline/steps/generate.py +169 -0
- engine/pipeline/steps/generate_batch.py +197 -0
- engine/pipeline/steps/generate_recovery.py +170 -0
- engine/pipeline/steps/llm_plan_split.py +253 -0
- engine/pipeline/steps/lock.py +64 -0
- engine/pipeline/steps/preflight.py +237 -0
- engine/pipeline/steps/preflight_adjust.py +147 -0
- engine/pipeline/steps/pregenerate.py +130 -0
- engine/pipeline/steps/quality.py +81 -0
- engine/pipeline/steps/skeleton.py +149 -0
- engine/pipeline/steps/source.py +163 -0
- engine/pipeline/steps/sync.py +117 -0
- engine/pipeline/steps/sync_finalize.py +237 -0
- engine/pipeline/steps/sync_update.py +341 -0
- engine/pipelines.py +91 -0
- engine/runner.py +335 -0
- engine/strategies/__init__.py +86 -0
- engine/strategies/api.py +128 -0
- engine/strategies/delegated.py +50 -0
- engine/strategies/dryrun.py +25 -0
- engine/two_phase.py +143 -0
- mcp_server/__init__.py +73 -0
- mcp_server/__main__.py +5 -0
- mcp_server/tools/__init__.py +1 -0
- mcp_server/tools/config.py +63 -0
- mcp_server/tools/discovery.py +276 -0
- mcp_server/tools/generation.py +184 -0
- mcp_server/tools/planning.py +144 -0
- mcp_server/tools/source.py +175 -0
- mcp_server/tools/validation.py +140 -0
- mcp_server/tools/workflow.py +166 -0
- mcp_server/workflow_loader.py +204 -0
- presets/generic/audit_dimensions.md +132 -0
- presets/generic/doc_types.yaml +152 -0
- presets/generic/preset.yaml +115 -0
- presets/java-spring/audit_dimensions.md +228 -0
- presets/java-spring/audit_dimensions.yaml +203 -0
- presets/java-spring/doc_types.yaml +269 -0
- presets/java-spring/hooks.py +122 -0
- presets/java-spring/preset.yaml +341 -0
- presets/java-spring/templates/README.md +34 -0
- presets/java-spring/templates/audit-system.md +15 -0
- presets/java-spring/templates/subagent-aop.md +105 -0
- presets/java-spring/templates/subagent-api.md +63 -0
- presets/java-spring/templates/subagent-architecture.md +111 -0
- presets/java-spring/templates/subagent-async-events.md +107 -0
- presets/java-spring/templates/subagent-audit-api-contracts.md +40 -0
- presets/java-spring/templates/subagent-audit-architecture.md +38 -0
- presets/java-spring/templates/subagent-audit-business.md +40 -0
- presets/java-spring/templates/subagent-audit-data-models.md +40 -0
- presets/java-spring/templates/subagent-business.md +129 -0
- presets/java-spring/templates/subagent-caching.md +75 -0
- presets/java-spring/templates/subagent-database-access.md +114 -0
- presets/java-spring/templates/subagent-enum.md +75 -0
- presets/java-spring/templates/subagent-error-handling.md +91 -0
- presets/java-spring/templates/subagent-external-integrations.md +80 -0
- presets/java-spring/templates/subagent-index.md +122 -0
- presets/java-spring/templates/subagent-messaging.md +97 -0
- presets/java-spring/templates/subagent-model.md +88 -0
- presets/java-spring/templates/subagent-observability.md +91 -0
- presets/java-spring/templates/subagent-scheduled.md +81 -0
- presets/java-spring/templates/subagent-security.md +102 -0
- presets/java-spring/templates/subagent-structure.md +101 -0
- presets/java-spring/templates/subagent-sync-section.md +34 -0
- presets/java-spring/templates/subagent-utils.md +73 -0
- presets/java-spring/templates/sync-system.md +8 -0
- presets/java-spring/workflow-extensions.md +112 -0
- skills/__init__.py +1 -0
- skills/_shared/README.md +30 -0
- skills/_shared/doc-coverage-shared.md +134 -0
- skills/_shared/doc-quality-standard.md +1058 -0
- skills/_shared/doc-subagent-rules.md +762 -0
- skills/_shared/windows-compat.md +89 -0
- skills/kb-audit/SKILL.md +52 -0
- skills/kb-audit/rules.md +88 -0
- skills/kb-audit/steps/step-01-prepare.md +75 -0
- skills/kb-audit/steps/step-02-audit.md +96 -0
- skills/kb-audit/steps/step-03-verify.md +65 -0
- skills/kb-audit/steps/step-04-report.md +64 -0
- skills/kb-init/SKILL.md +146 -0
- skills/kb-init/rules.md +187 -0
- skills/kb-init/steps/step-01-scope.md +62 -0
- skills/kb-init/steps/step-02-source.md +410 -0
- skills/kb-init/steps/step-03-generate.md +307 -0
- skills/kb-init/steps/step-04-quality.md +92 -0
- skills/kb-init/steps/step-05-finalize.md +132 -0
- skills/kb-init/templates/core/execution-modes.md +29 -0
- skills/kb-init/templates/core/output-only.md +4 -0
- skills/kb-init/templates/core/readwrite.md +33 -0
- skills/kb-search/SKILL.md +138 -0
- skills/kb-search/rules.md +64 -0
- skills/kb-sync/SKILL.md +43 -0
- skills/kb-sync/rules.md +70 -0
- skills/kb-sync/scripts/rebuild_module.py +91 -0
- skills/kb-sync/scripts/scan_repos.py +687 -0
- skills/kb-sync/steps/step-01-detect.md +72 -0
- skills/kb-sync/steps/step-02-update.md +71 -0
- skills/kb-sync/steps/step-03-verify.md +47 -0
- skills/kb-sync/steps/step-04-finalize.md +52 -0
- source_kb-0.2.2.dist-info/METADATA +194 -0
- source_kb-0.2.2.dist-info/RECORD +228 -0
- source_kb-0.2.2.dist-info/WHEEL +5 -0
- source_kb-0.2.2.dist-info/entry_points.txt +3 -0
- source_kb-0.2.2.dist-info/licenses/LICENSE +21 -0
- source_kb-0.2.2.dist-info/top_level.txt +6 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""Finalization steps — merge, dedup, publish, index, cleanup."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from core.interfaces import Step, StepResult, PipelineContext
|
|
8
|
+
from engine.pipeline import register_step
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@register_step
class MergeShardsStep(Step):
    """Merge split document shards into single files.

    For each module directory in scope and each document prefix known to the
    active preset, the shard files returned by ``find_shards`` are combined by
    ``merge_shards`` into ``<module_dir>/<prefix>.md``. The shards are deleted
    afterwards and the merged file is passed to ``refine_merged_doc``.
    """

    default_name = "merge-shards"

    def __init__(self):
        super().__init__("merge-shards")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Merge shards for every module directory selected by ``ctx``.

        Args:
            ctx: Pipeline context; ``ctx.kb_config["preset"]`` selects the
                preset (default ``"generic"``).

        Returns:
            A ``StepResult`` with status ``"ok"`` whose message reports how
            many merged documents were written.
        """
        from core.preset import load_preset, get_doc_type_mapping
        from core.skeleton.merge import find_shards, merge_shards, refine_merged_doc

        module_dirs = _module_dirs(ctx)
        if not module_dirs:
            # Nothing to scan, so the preset is never loaded in this case.
            return StepResult(status="ok", message="Merged 0 documents")

        # The preset and its doc-type mapping do not depend on the module, so
        # resolve them once instead of once per module directory.
        preset_name = ctx.kb_config.get("preset", "generic")
        preset = load_preset(preset_name)
        dt_mapping = get_doc_type_mapping(preset)
        # Sorted so shards are merged in a reproducible order across runs.
        prefixes = (
            sorted({fn.removesuffix(".md") for fn in dt_mapping.values()})
            if dt_mapping
            else []
        )

        merged_count = 0
        for module_dir in module_dirs:
            for prefix in prefixes:
                shards = find_shards(module_dir, prefix)
                if not shards:
                    continue

                merged_file = module_dir / f"{prefix}.md"
                # An existing merged file larger than 100 bytes is kept as-is;
                # its shards are left on disk untouched.
                if merged_file.exists() and merged_file.stat().st_size > 100:
                    continue

                content = merge_shards(shards)
                if not content.strip():
                    # Nothing usable came out of the shards; keep them.
                    continue

                merged_file.write_text(content, encoding="utf-8")
                for shard in shards:
                    shard.unlink(missing_ok=True)
                merged_count += 1

                # Post-merge refinement runs only on a file that was just written.
                result = refine_merged_doc(merged_file)
                if result.changed:
                    result.apply()

        return StepResult(status="ok", message=f"Merged {merged_count} documents")
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@register_step
class DeduplicateStep(Step):
    """Run LLM-based deduplication over the larger generated documents.

    Only Markdown files whose size reaches the preset's
    ``limits.dedup_min_doc_size_bytes`` (default 15 KiB) are handed to
    ``dedup_document``.
    """

    default_name = "deduplicate"

    def __init__(self):
        super().__init__("deduplicate")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Deduplicate documents in every module directory selected by ``ctx``.

        Returns:
            ``"skipped"`` when the strategy answers the probe with status
            ``"delegated"``; otherwise ``"ok"`` with a summary message.
        """
        from core.interfaces import LlmRequest
        from core.preset import load_preset
        from engine.dedup import dedup_document
        from engine.pipeline.steps.generate import _make_config_obj
        from engine.strategies import create_strategy

        strategy = create_strategy(_make_config_obj(ctx))

        # Probe: delegated strategy means Agent handles dedup.
        # NOTE(review): the probe is a real ``strategy.call``; for other
        # strategies this presumably issues an actual request — confirm cost.
        probe = strategy.call(LlmRequest(system="probe", user="probe"))
        if probe.status == "delegated":
            return StepResult(status="skipped", message="Delegated mode — dedup handled by Agent")

        preset = load_preset(ctx.kb_config.get("preset", "generic"))
        size_floor = preset.get("limits", {}).get("dedup_min_doc_size_bytes", 15 * 1024)

        docs_changed = 0
        amount_saved = 0.0

        for module_dir in _module_dirs(ctx):
            candidates = sorted(module_dir.glob("*.md"))
            # Every file name in the module is passed along with each document.
            sibling_names = [doc.name for doc in candidates]

            for doc in candidates:
                if doc.stat().st_size >= size_floor:
                    changed, saved = dedup_document(doc, sibling_names, strategy)
                    if changed:
                        docs_changed += 1
                        amount_saved += saved

        if docs_changed != 0:
            return StepResult(status="ok", message=f"Deduped {docs_changed} docs (saved {amount_saved:.1f}KB)")
        return StepResult(status="ok", message="No redundancy found")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@register_step
class RebuildIndexStep(Step):
    """Rebuild the vector index for generated documents.

    Currently a placeholder: ``run`` does no indexing and always reports
    success. The step is registered with checkpoint ``"cp9"``.
    """

    default_name = "rebuild-index"

    def __init__(self):
        super().__init__("rebuild-index", checkpoint="cp9")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Return an ``"ok"`` result without touching any index."""
        # TODO: Import from core.rag when migrated
        outcome = StepResult(status="ok", message="Index rebuild (placeholder)")
        return outcome
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@register_step
class CleanProgressStep(Step):
    """Delete progress files once the pipeline has completed successfully."""

    default_name = "clean-progress"

    def __init__(self):
        super().__init__("clean-progress")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Call ``cleanup_progress`` on each module directory and total the counts.

        Returns:
            A ``StepResult`` with status ``"ok"`` reporting the summed value
            returned by ``cleanup_progress`` across all module directories.
        """
        from core.monitor.progress import cleanup_progress

        cleaned = sum(cleanup_progress(module_dir) for module_dir in _module_dirs(ctx))
        return StepResult(status="ok", message=f"Cleaned {cleaned} progress files")
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@register_step
class SharedDocsStep(Step):
    """Produce the documents that are shared across modules."""

    default_name = "shared-docs"

    def __init__(self):
        super().__init__("shared-docs")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Delegate to ``generate_shared_docs`` and report what it produced.

        Returns:
            ``"ok"`` with the generated items under ``details["generated"]``,
            or ``"skipped"`` when ``generate_shared_docs`` returns nothing.
        """
        from core.docs.shared import generate_shared_docs

        generated = generate_shared_docs(ctx.knowledge_dir, ctx.config, ctx.kb_name)
        if generated:
            return StepResult(
                status="ok",
                message=f"Generated {len(generated)} shared docs",
                details={"generated": generated},
            )
        return StepResult(status="skipped", message="No shared docs to generate")
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _module_dirs(ctx: PipelineContext) -> list[Path]:
|
|
156
|
+
if ctx.module:
|
|
157
|
+
d = ctx.knowledge_dir / ctx.module
|
|
158
|
+
return [d] if d.is_dir() else []
|
|
159
|
+
return sorted(d for d in ctx.knowledge_dir.iterdir()
|
|
160
|
+
if d.is_dir() and not d.name.startswith(".") and d.name != "_shared")
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
"""Document generation step — the main LLM-powered step.
|
|
2
|
+
|
|
3
|
+
Orchestrates prompt rendering, split planning, batch execution,
|
|
4
|
+
and two-phase generation for large modules.
|
|
5
|
+
|
|
6
|
+
Requirements: 5.1, 5.5
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import time
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from core.interfaces import Step, StepResult, PipelineContext
|
|
16
|
+
from core.monitor.progress import write_progress
|
|
17
|
+
from engine.pipeline import register_step
|
|
18
|
+
from engine.runner import BatchAbortError
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
MIN_DOC_SIZE_BYTES = 500
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@register_step
class GenerateDocsStep(Step):
    """Generate knowledge base documents via LLM.

    Reads each module's ``.meta/dispatch-tasks.json`` manifest, renders one
    prompt per pending task and runs the tasks batch by batch through the
    configured LLM strategy. Registered with checkpoint ``"cp4"``.
    """

    default_name = "generate-docs"

    def __init__(self):
        super().__init__("generate-docs", checkpoint="cp4")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Render prompts and execute all pending generation tasks.

        Modules are taken from ``ctx.state["module_repos"]``. A task is
        skipped when its output file already exists and is larger than
        ``MIN_DOC_SIZE_BYTES``, or when no template is found for its doc type.

        Returns:
            ``"ok"`` with ``details["generated"]`` (``"<module>/<file>"``
            entries) and ``details["incomplete"]`` (tasks that did not finish
            as ``"done"`` with an output file); ``"skipped"`` when nothing was
            generated; ``"failed"`` when ``run_batch`` raises
            ``BatchAbortError``.
        """
        import json
        from core.monitor.prompt_store import should_save_prompts
        from core.preset import load_preset
        from core.prompt.renderer import render_prompt
        from engine.assembler import InlinePromptAssembler
        from engine.runner import SubagentTask, run_batch, configure_garbage_patterns, CircuitBreakerConfig
        from engine.strategies import create_strategy, ConfigProxy

        config = ctx.config
        preset_name = ctx.kb_config.get("preset", "generic")
        preset = load_preset(preset_name)

        # Configure runner from preset limits
        limits = preset.get("limits", {})
        configure_garbage_patterns(limits.get("garbage_patterns"))
        breaker_config = CircuitBreakerConfig.from_config({"limits": limits})

        strategy = create_strategy(ConfigProxy(config))
        assembler = InlinePromptAssembler(preset=preset)

        # Optional snippet appended to prompts; empty when the file is absent.
        snippet_path = Path(ctx.project_root) / "skills" / "kb-init" / "templates" / "core" / "output-only.md"
        execution_snippet = snippet_path.read_text(encoding="utf-8") if snippet_path.exists() else ""

        save_prompts = should_save_prompts(config)

        module_repos: dict[str, Path] = ctx.state.get("module_repos", {})
        generated: list[str] = []
        # Tasks that ran but did not end as "done" with an output file.
        incomplete: list[str] = []
        max_concurrent = config.get("agent", {}).get("max_concurrent_subagents", 5)

        for name in module_repos:
            module_dir = ctx.knowledge_dir / name
            tasks_file = module_dir / ".meta" / "dispatch-tasks.json"

            if not tasks_file.exists():
                logger.warning("No dispatch-tasks.json for %s, skipping", name)
                continue

            manifest = json.loads(tasks_file.read_text(encoding="utf-8"))
            dispatch_tasks = manifest.get("tasks", [])

            # Group by batch for sequential execution
            batches: dict[str, list[dict]] = {}
            for t in dispatch_tasks:
                batches.setdefault(t["batch"], []).append(t)

            for batch_id in sorted(batches):
                batch_tasks: list[SubagentTask] = []

                for t in batches[batch_id]:
                    output_path = Path(t["output_file"])
                    # Already generated on a previous run: leave it alone.
                    if output_path.exists() and output_path.stat().st_size > MIN_DOC_SIZE_BYTES:
                        continue

                    template_path = _find_template(ctx.project_root, preset_name, t["doc_type"], preset)
                    if not template_path:
                        # Make the skip visible instead of silently dropping the task.
                        logger.warning(
                            "No template for doc type %s (module %s), skipping task %s",
                            t["doc_type"], name, t["id"],
                        )
                        continue

                    extras = {}
                    if t.get("shard_file_list"):
                        extras["file_list_override"] = t["shard_file_list"]

                    prompt = render_prompt(
                        template_path=template_path,
                        config=config,
                        kb_name=ctx.kb_name,
                        module_name=name,
                        doc_type=t["doc_type"],
                        assembler=assembler,
                        extras=extras,
                        execution_snippet=execution_snippet,
                        preset=preset,
                    )

                    batch_tasks.append(SubagentTask(
                        task_id=f"{name}__{t['id']}",
                        prompt=prompt,
                        output_path=output_path,
                        doc_type=t["doc_type"],
                    ))

                if not batch_tasks:
                    continue

                _write_heartbeat(module_dir, [t["doc_type"] for t in batches[batch_id]])
                try:
                    results = run_batch(batch_tasks, strategy, max_concurrent,
                                        save_prompts=save_prompts, breaker_config=breaker_config)
                except BatchAbortError as e:
                    logger.error("Batch %s aborted: %s", batch_id, e.reason)
                    logger.error("Diagnosis: %s", e.diagnosis)
                    return StepResult(
                        status="failed",
                        message=f"Generation aborted: {e.reason}\nDiagnosis: {e.diagnosis}",
                        details={
                            "generated": generated,
                            "incomplete": incomplete,
                            "abort_reason": e.reason,
                            "diagnosis": e.diagnosis,
                        },
                    )

                for r in results:
                    if r.status == "done" and r.output_path and r.output_path.exists():
                        generated.append(f"{name}/{r.output_path.name}")
                        # NOTE(review): only the segment after the last "__" is
                        # used as the progress key, so a task id that itself
                        # contains "__" is truncated — confirm ids never do.
                        write_progress(module_dir, r.task_id.split("__")[-1], "DONE")
                    else:
                        logger.info(
                            "Task %s ended with status %r; no document recorded",
                            r.task_id, r.status,
                        )
                        incomplete.append(f"{r.task_id} ({r.status})")

        if not generated:
            return StepResult(status="skipped", message="No documents generated")
        return StepResult(status="ok", message=f"Generated {len(generated)} documents",
                          details={"generated": generated, "incomplete": incomplete})
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _make_config_obj(ctx: PipelineContext):
    """Wrap ``ctx.config`` in a ``ConfigProxy``.

    Shared helper: other pipeline steps import it to build the object they
    pass to ``create_strategy``.
    """
    from engine.strategies import ConfigProxy

    proxy = ConfigProxy(ctx.config)
    return proxy
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _find_template(project_root: Path, preset_name: str, doc_type: str, preset: dict = None) -> Path | None:
|
|
146
|
+
"""Find the template file for a doc type from preset config.
|
|
147
|
+
|
|
148
|
+
Uses preset search path: env > local > built-in.
|
|
149
|
+
"""
|
|
150
|
+
from core.preset import find_preset_template, get_template_path
|
|
151
|
+
|
|
152
|
+
template_name = None
|
|
153
|
+
if preset:
|
|
154
|
+
template_name = get_template_path(preset, doc_type, preset_name)
|
|
155
|
+
|
|
156
|
+
if not template_name:
|
|
157
|
+
return None
|
|
158
|
+
|
|
159
|
+
return find_preset_template(preset_name, template_name)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _write_heartbeat(module_dir: Path, batch: list[str]) -> None:
|
|
163
|
+
"""Write heartbeat file for current batch progress tracking."""
|
|
164
|
+
hb_path = module_dir / ".meta" / "heartbeat.txt"
|
|
165
|
+
hb_path.parent.mkdir(parents=True, exist_ok=True)
|
|
166
|
+
hb_path.write_text(
|
|
167
|
+
f"batch={','.join(batch)} ts={time.time():.0f}\n",
|
|
168
|
+
encoding="utf-8",
|
|
169
|
+
)
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""Batch execution module for document generation.
|
|
2
|
+
|
|
3
|
+
Handles single-batch execution logic:
|
|
4
|
+
- Shard generation via LlmStrategy
|
|
5
|
+
- Two-phase coordination (via engine/two_phase.py)
|
|
6
|
+
- Concurrency control
|
|
7
|
+
|
|
8
|
+
Requirements: 5.2, 5.3, 5.4
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from core.interfaces import LlmStrategy, PipelineContext
|
|
17
|
+
from core.paths import resolve_file_list
|
|
18
|
+
from core.monitor.progress import write_progress
|
|
19
|
+
from engine.runner import SubagentTask, SubagentResult, run_batch
|
|
20
|
+
from engine.two_phase import run_two_phase
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
MIN_DOC_SIZE_BYTES = 500
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def execute_batch(
    ctx: PipelineContext,
    module_name: str,
    doc_types: list[str],
    strategy: LlmStrategy,
    max_concurrent: int = 5,
) -> list[SubagentResult]:
    """Execute a single batch of doc_types for a module.

    For each doc_type, determines if splitting is needed and dispatches
    either direct generation or two-phase split generation. A doc type is
    skipped when its document already exists and is larger than
    ``MIN_DOC_SIZE_BYTES``, when no file list resolves for it, or when no
    template is found.

    Args:
        ctx: Pipeline context
        module_name: Name of the module being processed
        doc_types: List of doc types in this batch
        strategy: LLM execution strategy
        max_concurrent: Max concurrent tasks

    Returns:
        List of SubagentResults from all tasks in this batch: direct tasks
        first, then split tasks.
    """
    from core.preset import load_preset
    from core.prompt.renderer import render_prompt
    from core.skeleton.split import SplitConfig
    from engine.assembler import InlinePromptAssembler
    from engine.pipeline.steps.generate import _find_template

    preset_name = ctx.kb_config.get("preset", "generic")
    preset = load_preset(preset_name)
    split_config = SplitConfig.from_preset(preset, mode="output-only")
    # NOTE(review): GenerateDocsStep builds its assembler with preset=preset
    # and passes preset/execution_snippet to render_prompt; this path does
    # not — confirm the difference is intended.
    assembler = InlinePromptAssembler()

    module_dir = ctx.knowledge_dir / module_name

    tasks: list[SubagentTask] = []
    split_tasks: list[SubagentTask] = []

    for doc_type in doc_types:
        doc_path = module_dir / f"{doc_type}.md"
        # Already generated on a previous run: leave it alone.
        if doc_path.exists() and doc_path.stat().st_size > MIN_DOC_SIZE_BYTES:
            continue

        if not resolve_file_list(module_dir, doc_type):
            continue

        template_path = _find_template(ctx.project_root, preset_name, doc_type, preset)
        if not template_path:
            continue

        # Compute splits based on file list size
        num_splits = _compute_doc_splits(module_dir, doc_type, split_config)

        prompt = render_prompt(
            template_path=template_path,
            config=ctx.config,
            kb_name=ctx.kb_name,
            module_name=module_name,
            doc_type=doc_type,
            assembler=assembler,
        )

        if num_splits > 1:
            # Create shard tasks for two-phase generation
            shard_tasks = _create_shard_tasks(
                ctx, module_dir, module_name, doc_type,
                prompt, num_splits, template_path, assembler,
            )
            split_tasks.extend(shard_tasks)
        else:
            tasks.append(SubagentTask(
                task_id=f"{module_name}__{doc_type}",
                prompt=prompt,
                output_path=doc_path,
                doc_type=doc_type,
            ))

    results: list[SubagentResult] = []

    # Execute direct tasks
    if tasks:
        logger.info("[batch] Running %d direct tasks for %s", len(tasks), module_name)
        results.extend(run_batch(tasks, strategy, max_concurrent))

    # Execute split tasks via two-phase
    if split_tasks:
        logger.info("[batch] Running %d split tasks (two-phase) for %s",
                    len(split_tasks), module_name)
        results.extend(run_two_phase(split_tasks, strategy, max_concurrent))

    return results
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _compute_doc_splits(
    module_dir: Path, doc_type: str, split_config: "SplitConfig"
) -> int:
    """Compute number of splits needed for a doc type.

    Args:
        module_dir: Module directory whose file list is inspected.
        doc_type: Document type whose file list is resolved.
        split_config: Supplies ``max_files_per_shard``; when falsy, or when
            that value is missing or not positive, a threshold of 30 is used.

    Returns:
        ``1`` when no file list resolves, it cannot be read, or its line
        count does not exceed the threshold; otherwise a value between 2
        and 4.
    """
    fl_path = resolve_file_list(module_dir, doc_type)
    if not fl_path:
        return 1

    try:
        file_count = len(fl_path.read_text(encoding="utf-8").strip().splitlines())
    except (OSError, UnicodeDecodeError):
        # Unreadable or non-UTF-8 file list: fall back to a single shard.
        return 1

    # Simple heuristic: split if file count exceeds threshold
    threshold = split_config.max_files_per_shard if split_config else 30
    if not threshold or threshold < 1:
        # A zero or unset limit would otherwise fail in the division below.
        threshold = 30

    if file_count <= threshold:
        return 1
    return min(4, max(2, file_count // threshold + 1))
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _create_shard_tasks(
    ctx: PipelineContext,
    module_dir: Path,
    module_name: str,
    doc_type: str,
    base_prompt: str,
    num_splits: int,
    template_path: Path,
    assembler,
) -> list[SubagentTask]:
    """Create shard tasks for split generation.

    Divides the file list into at most ``num_splits`` contiguous shards of
    near-equal size and creates one task per shard. Each task's prompt is
    ``base_prompt`` followed by a bullet list of the shard's files.

    Args:
        ctx: Pipeline context (not used here; kept for the call signature).
        module_dir: Module directory holding the file list and shard outputs.
        module_name: Module name, used as the task id prefix.
        doc_type: Document type being generated.
        base_prompt: Rendered prompt shared by all shards.
        num_splits: Requested number of shards; values below 1 are treated
            as 1, and it is capped at the number of listed files.
        template_path: Not used here; kept for the call signature.
        assembler: Not used here; kept for the call signature.

    Returns:
        One ``SubagentTask`` per shard, or an empty list when no file list
        resolves, it cannot be read, or it lists no files.
    """
    from core.paths import shard_doc_path

    fl_path = resolve_file_list(module_dir, doc_type)
    if not fl_path:
        return []

    try:
        raw_lines = fl_path.read_text(encoding="utf-8").strip().splitlines()
    except OSError:
        return []

    # Blank lines would otherwise become empty "- " bullets in the prompt.
    all_files = [line for line in raw_lines if line.strip()]
    if not all_files:
        return []

    # Balanced partition: exactly shard_count shards, sizes differing by at
    # most one. Plain floor-division chunking can yield more shards than
    # requested (10 files / 3 splits -> 3, 3, 3, 1).
    shard_count = max(1, min(num_splits, len(all_files)))
    base_size, remainder = divmod(len(all_files), shard_count)
    shards: list[list[str]] = []
    start = 0
    for position in range(shard_count):
        end = start + base_size + (1 if position < remainder else 0)
        shards.append(all_files[start:end])
        start = end

    tasks: list[SubagentTask] = []
    for idx, shard_files in enumerate(shards, 1):
        # The heading text is parsed back by the recovery code in this
        # package ("## Files assigned to this shard"); keep it verbatim.
        shard_suffix = "\n\n## Files assigned to this shard\n\n" + "\n".join(f"- {f}" for f in shard_files)
        output_path = shard_doc_path(module_dir, doc_type, idx)
        output_path.parent.mkdir(parents=True, exist_ok=True)

        tasks.append(SubagentTask(
            task_id=f"{module_name}__{doc_type}__shard{idx:02d}",
            prompt=base_prompt + shard_suffix,
            output_path=output_path,
            doc_type=doc_type,
        ))

    return tasks
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""Recovery strategy chain for failed generation steps.
|
|
2
|
+
|
|
3
|
+
Provides a chain-of-responsibility pattern for handling LLM failures:
|
|
4
|
+
1. SplitRetryStrategy — re-split into smaller shards and retry
|
|
5
|
+
2. ModelSwitchStrategy — try a different/larger model
|
|
6
|
+
3. MarkFailedStrategy — give up and mark as failed
|
|
7
|
+
|
|
8
|
+
Requirements: 5.2
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
from abc import ABC, abstractmethod
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
from core.interfaces import LlmStrategy, LlmRequest, LlmResponse
|
|
19
|
+
from engine.runner import SubagentTask, SubagentResult, run_single
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class RecoveryContext:
    """State handed to each strategy in the recovery chain."""

    # The task whose generation failed.
    task: SubagentTask
    # The LLM strategy supplied for the recovery attempt.
    strategy: LlmStrategy
    # Description of why the original attempt failed.
    failure_reason: str
    # Number of recovery attempts so far; SplitRetryStrategy only applies
    # while this is below 2.
    attempt: int = 0
    # Upper bound on attempts (presumably enforced by the chain driver,
    # which is not visible in this part of the file).
    max_attempts: int = 3
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class RecoveryStrategy(ABC):
    """Abstract link in the recovery chain.

    Concrete strategies implement two hooks: ``can_handle`` decides whether
    the strategy applies to the current failure, and ``recover`` performs the
    actual retry.
    """

    @abstractmethod
    def can_handle(self, ctx: RecoveryContext) -> bool:
        """Return True when this strategy is applicable to the failure in *ctx*."""

    @abstractmethod
    def recover(self, ctx: RecoveryContext) -> SubagentResult | None:
        """Try to recover the task described by *ctx*.

        Return a result on success, or ``None`` to hand the failure over to
        the next strategy in the chain.
        """
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class SplitRetryStrategy(RecoveryStrategy):
    """Re-split a failed shard into two smaller pieces and retry each one."""

    def can_handle(self, ctx: RecoveryContext) -> bool:
        """Return True for large prompts that have not been retried too often."""
        # Only applicable if the task has a large prompt (likely too much content)
        return len(ctx.task.prompt) > 20000 and ctx.attempt < 2

    def recover(self, ctx: RecoveryContext) -> SubagentResult | None:
        """Halve the shard's file list, run both halves, and merge the output.

        The prompt is expected to end with a ``## Files assigned to this
        shard`` section containing one ``- <file>`` bullet per file. The text
        before that marker is reused verbatim for both sub-tasks.

        Returns:
            A "done" result whose content is the concatenation of every half
            that succeeded (also written to the task's ``output_path``), or
            ``None`` when the prompt has no file section, lists two files or
            fewer, or neither half produced content.
        """
        logger.info("[recovery:split] %s: splitting prompt", ctx.task.task_id)

        # Split the prompt at the file list; everything before it is shared.
        marker = "## Files assigned to this shard"
        base_prompt, found, file_section = ctx.task.prompt.partition(marker)
        if not found:
            return None

        # Remove exactly one leading bullet dash. Stripping the character set
        # "- \n" from both ends would also eat dashes that belong to the file
        # name itself (e.g. "foo-" -> "foo") and leave tab indentation behind.
        files = [
            entry[1:].strip()
            for entry in (line.strip() for line in file_section.strip().splitlines())
            if entry.startswith("-")
        ]
        if len(files) <= 2:
            return None

        mid = len(files) // 2
        halves = [files[:mid], files[mid:]]

        results: list[SubagentResult] = []
        for idx, chunk in enumerate(halves, 1):
            chunk_suffix = f"\n\n{marker}\n\n" + "\n".join(f"- {f}" for f in chunk)
            sub_task = SubagentTask(
                task_id=f"{ctx.task.task_id}__split{idx}",
                prompt=base_prompt + chunk_suffix,
                output_path=ctx.task.output_path.parent / f"{ctx.task.output_path.stem}-part{idx}.md",
                doc_type=ctx.task.doc_type,
            )
            results.append(run_single(sub_task, ctx.strategy))

        # Merge successful results
        succeeded = [r.content for r in results if r.status == "done" and r.content]
        if not succeeded:
            return None

        if len(succeeded) < len(results):
            # The merged document is still reported as done, so make the
            # missing half visible in the logs instead of dropping it silently.
            logger.warning(
                "[recovery:split] %s: only %d of %d sub-shards succeeded; merged output is partial",
                ctx.task.task_id, len(succeeded), len(results),
            )

        merged_content = "".join(f"{content}\n\n" for content in succeeded)
        ctx.task.output_path.parent.mkdir(parents=True, exist_ok=True)
        ctx.task.output_path.write_text(merged_content.strip(), encoding="utf-8")
        return SubagentResult(
            task_id=ctx.task.task_id, status="done",
            output_path=ctx.task.output_path, content=merged_content,
        )
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class ModelSwitchStrategy(RecoveryStrategy):
    """Retry the unchanged task against a fallback model."""

    def __init__(self, fallback_model: str | None = None):
        """Remember the model to switch to; ``None`` disables this strategy."""
        self._fallback = fallback_model

    def can_handle(self, ctx: RecoveryContext) -> bool:
        """Return True when a fallback model is set and fewer than two attempts were made."""
        if self._fallback is None:
            return False
        return ctx.attempt < 2

    def recover(self, ctx: RecoveryContext) -> SubagentResult | None:
        """Re-run the task with the fallback model.

        Returns the new result when its status is "done"; otherwise ``None``
        so the next strategy in the chain can take over.
        """
        fallback = self._fallback
        if not fallback:
            return None

        logger.info("[recovery:model-switch] %s: switching to %s",
                    ctx.task.task_id, fallback)

        # Same id, prompt and destination as the failed task; only the model differs.
        retry_task = SubagentTask(
            task_id=ctx.task.task_id,
            prompt=ctx.task.prompt,
            output_path=ctx.task.output_path,
            doc_type=ctx.task.doc_type,
            model=fallback,
        )
        outcome = run_single(retry_task, ctx.strategy)
        if outcome.status != "done":
            return None
        return outcome
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
class MarkFailedStrategy(RecoveryStrategy):
    """Terminal strategy: record the task as failed so processing can move on."""

    def can_handle(self, ctx: RecoveryContext) -> bool:
        """Always applicable; this is the last resort."""
        return True

    def recover(self, ctx: RecoveryContext) -> SubagentResult | None:
        """Log the give-up and return a "failed" result carrying the original reason."""
        logger.warning(
            "[recovery:mark-failed] %s: giving up after %d attempts",
            ctx.task.task_id,
            ctx.attempt,
        )
        failed = SubagentResult(
            task_id=ctx.task.task_id,
            status="failed",
            error=f"All recovery exhausted: {ctx.failure_reason}",
        )
        return failed
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
class RecoveryChain:
    """Chain of recovery strategies executed in order."""

    def __init__(self, strategies: list[RecoveryStrategy] | None = None):
        """Use *strategies*, or the default split -> model-switch -> mark-failed chain.

        A missing or empty list selects the default chain.
        """
        self._strategies = strategies or [
            SplitRetryStrategy(),
            ModelSwitchStrategy(),
            MarkFailedStrategy(),
        ]

    def attempt(
        self, task: SubagentTask, strategy: LlmStrategy, failure_reason: str
    ) -> SubagentResult:
        """Try recovery strategies in order until one succeeds.

        Args:
            task: The task whose generation failed.
            strategy: LLM strategy handed to the recovery strategies for retries.
            failure_reason: Description of the original failure.

        Returns:
            The first result with status "done". If no strategy succeeds, the
            last non-"done" result a strategy returned (for example the one
            produced by ``MarkFailedStrategy``), or a generic "failed" result
            when no strategy returned anything.
        """
        ctx = RecoveryContext(task=task, strategy=strategy, failure_reason=failure_reason)
        last_failure: SubagentResult | None = None

        for recovery in self._strategies:
            if not recovery.can_handle(ctx):
                continue
            result = recovery.recover(ctx)
            # Count only strategies that actually ran. Bumping the counter for
            # every strategy before can_handle() made the second strategy in
            # any chain see attempt == 2, so strategies capped at
            # ``attempt < 2`` (such as ModelSwitchStrategy) could never run there.
            ctx.attempt += 1
            if result is None:
                continue
            if result.status == "done":
                return result
            # Keep the strategy's own failure report rather than discarding it.
            last_failure = result

        if last_failure is not None:
            return last_failure
        return SubagentResult(
            task_id=task.task_id, status="failed",
            error=f"Recovery chain exhausted: {failure_reason}",
        )
|