source-kb 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +50 -0
- cli/__main__.py +5 -0
- cli/commands/__init__.py +1 -0
- cli/commands/anchor_fix.py +47 -0
- cli/commands/diff_doc.py +52 -0
- cli/commands/dispatch.py +77 -0
- cli/commands/extract.py +72 -0
- cli/commands/file_list.py +74 -0
- cli/commands/index.py +84 -0
- cli/commands/lock.py +89 -0
- cli/commands/merge.py +60 -0
- cli/commands/merge_delta.py +19 -0
- cli/commands/metadata.py +24 -0
- cli/commands/pipeline.py +45 -0
- cli/commands/post_merge.py +43 -0
- cli/commands/query.py +52 -0
- cli/commands/render.py +101 -0
- cli/commands/scan_repos.py +46 -0
- cli/commands/setup.py +94 -0
- cli/commands/split.py +196 -0
- cli/commands/stale_files.py +98 -0
- cli/commands/validate.py +191 -0
- core/__init__.py +32 -0
- core/config.py +261 -0
- core/docs/__init__.py +7 -0
- core/docs/section_updater.py +286 -0
- core/docs/shared.py +149 -0
- core/git.py +294 -0
- core/interfaces.py +249 -0
- core/monitor/__init__.py +5 -0
- core/monitor/progress.py +83 -0
- core/monitor/prompt_store.py +49 -0
- core/paths.py +141 -0
- core/preset.py +237 -0
- core/preset_accessors.py +202 -0
- core/preset_classify.py +132 -0
- core/preset_hooks.py +129 -0
- core/preset_profile.py +89 -0
- core/prompt/__init__.py +7 -0
- core/prompt/__main__.py +147 -0
- core/prompt/content.py +320 -0
- core/prompt/context_manager.py +164 -0
- core/prompt/renderer.py +236 -0
- core/prompt/response_parser.py +274 -0
- core/prompt/templates.py +357 -0
- core/prompt/validate_parity.py +162 -0
- core/prompt/variables.py +339 -0
- core/rag/__init__.py +22 -0
- core/rag/__main__.py +136 -0
- core/rag/bm25_index.py +268 -0
- core/rag/chunker.py +273 -0
- core/rag/embedder.py +151 -0
- core/rag/indexer.py +292 -0
- core/rag/loader.py +89 -0
- core/rag/retriever.py +82 -0
- core/skeleton/__init__.py +11 -0
- core/skeleton/__main__.py +934 -0
- core/skeleton/anchor_fix.py +250 -0
- core/skeleton/classify.py +331 -0
- core/skeleton/cmd_anchor_fix.py +43 -0
- core/skeleton/cmd_diff_doc.py +44 -0
- core/skeleton/cmd_lock.py +87 -0
- core/skeleton/cmd_merge_delta.py +41 -0
- core/skeleton/community.py +233 -0
- core/skeleton/dependency_graph.py +306 -0
- core/skeleton/diff_doc.py +248 -0
- core/skeleton/dispatch.py +273 -0
- core/skeleton/dispatch_render.py +319 -0
- core/skeleton/dispatch_source.py +111 -0
- core/skeleton/extract.py +218 -0
- core/skeleton/extract_methods.py +298 -0
- core/skeleton/file_list.py +239 -0
- core/skeleton/impact.py +278 -0
- core/skeleton/jar_download.py +177 -0
- core/skeleton/jar_resolver.py +186 -0
- core/skeleton/loader.py +162 -0
- core/skeleton/merge.py +278 -0
- core/skeleton/merge_delta.py +229 -0
- core/skeleton/metadata.py +96 -0
- core/skeleton/metadata_builders.py +264 -0
- core/skeleton/module_dag.py +330 -0
- core/skeleton/parsers/__init__.py +71 -0
- core/skeleton/parsers/jqassistant.py +300 -0
- core/skeleton/parsers/jqassistant_cypher.py +225 -0
- core/skeleton/parsers/regex.py +171 -0
- core/skeleton/parsers/treesitter.py +324 -0
- core/skeleton/parsers/treesitter_java.py +284 -0
- core/skeleton/parsers/treesitter_multi.py +289 -0
- core/skeleton/pom_parser.py +299 -0
- core/skeleton/post_merge.py +295 -0
- core/skeleton/post_merge_llm.py +82 -0
- core/skeleton/query.py +195 -0
- core/skeleton/shard_context.py +177 -0
- core/skeleton/split.py +180 -0
- core/skeleton/split_cache.py +107 -0
- core/skeleton/split_feedback.py +174 -0
- core/skeleton/split_plan.py +219 -0
- core/skeleton/split_plan_helpers.py +305 -0
- core/skeleton/split_plan_llm.py +274 -0
- core/utils.py +135 -0
- core/validators/__init__.py +65 -0
- core/validators/__main__.py +215 -0
- core/validators/consistency.py +203 -0
- core/validators/coverage.py +171 -0
- core/validators/duplicates.py +76 -0
- core/validators/engine.py +224 -0
- core/validators/links.py +76 -0
- core/validators/sampling.py +169 -0
- core/validators/structure.py +144 -0
- engine/__init__.py +7 -0
- engine/assembler.py +231 -0
- engine/confirm.py +65 -0
- engine/dedup.py +106 -0
- engine/main.py +211 -0
- engine/pipeline/__init__.py +163 -0
- engine/pipeline/recovery.py +250 -0
- engine/pipeline/steps/__init__.py +23 -0
- engine/pipeline/steps/audit.py +220 -0
- engine/pipeline/steps/audit_apply.py +195 -0
- engine/pipeline/steps/audit_helpers.py +155 -0
- engine/pipeline/steps/classify_llm.py +236 -0
- engine/pipeline/steps/classify_prompt.py +223 -0
- engine/pipeline/steps/finalize.py +160 -0
- engine/pipeline/steps/generate.py +169 -0
- engine/pipeline/steps/generate_batch.py +197 -0
- engine/pipeline/steps/generate_recovery.py +170 -0
- engine/pipeline/steps/llm_plan_split.py +253 -0
- engine/pipeline/steps/lock.py +64 -0
- engine/pipeline/steps/preflight.py +237 -0
- engine/pipeline/steps/preflight_adjust.py +147 -0
- engine/pipeline/steps/pregenerate.py +130 -0
- engine/pipeline/steps/quality.py +81 -0
- engine/pipeline/steps/skeleton.py +149 -0
- engine/pipeline/steps/source.py +163 -0
- engine/pipeline/steps/sync.py +117 -0
- engine/pipeline/steps/sync_finalize.py +237 -0
- engine/pipeline/steps/sync_update.py +341 -0
- engine/pipelines.py +91 -0
- engine/runner.py +335 -0
- engine/strategies/__init__.py +86 -0
- engine/strategies/api.py +128 -0
- engine/strategies/delegated.py +50 -0
- engine/strategies/dryrun.py +25 -0
- engine/two_phase.py +143 -0
- mcp_server/__init__.py +73 -0
- mcp_server/__main__.py +5 -0
- mcp_server/tools/__init__.py +1 -0
- mcp_server/tools/config.py +63 -0
- mcp_server/tools/discovery.py +276 -0
- mcp_server/tools/generation.py +184 -0
- mcp_server/tools/planning.py +144 -0
- mcp_server/tools/source.py +175 -0
- mcp_server/tools/validation.py +140 -0
- mcp_server/tools/workflow.py +166 -0
- mcp_server/workflow_loader.py +204 -0
- presets/generic/audit_dimensions.md +132 -0
- presets/generic/doc_types.yaml +152 -0
- presets/generic/preset.yaml +115 -0
- presets/java-spring/audit_dimensions.md +228 -0
- presets/java-spring/audit_dimensions.yaml +203 -0
- presets/java-spring/doc_types.yaml +269 -0
- presets/java-spring/hooks.py +122 -0
- presets/java-spring/preset.yaml +341 -0
- presets/java-spring/templates/README.md +34 -0
- presets/java-spring/templates/audit-system.md +15 -0
- presets/java-spring/templates/subagent-aop.md +105 -0
- presets/java-spring/templates/subagent-api.md +63 -0
- presets/java-spring/templates/subagent-architecture.md +111 -0
- presets/java-spring/templates/subagent-async-events.md +107 -0
- presets/java-spring/templates/subagent-audit-api-contracts.md +40 -0
- presets/java-spring/templates/subagent-audit-architecture.md +38 -0
- presets/java-spring/templates/subagent-audit-business.md +40 -0
- presets/java-spring/templates/subagent-audit-data-models.md +40 -0
- presets/java-spring/templates/subagent-business.md +129 -0
- presets/java-spring/templates/subagent-caching.md +75 -0
- presets/java-spring/templates/subagent-database-access.md +114 -0
- presets/java-spring/templates/subagent-enum.md +75 -0
- presets/java-spring/templates/subagent-error-handling.md +91 -0
- presets/java-spring/templates/subagent-external-integrations.md +80 -0
- presets/java-spring/templates/subagent-index.md +122 -0
- presets/java-spring/templates/subagent-messaging.md +97 -0
- presets/java-spring/templates/subagent-model.md +88 -0
- presets/java-spring/templates/subagent-observability.md +91 -0
- presets/java-spring/templates/subagent-scheduled.md +81 -0
- presets/java-spring/templates/subagent-security.md +102 -0
- presets/java-spring/templates/subagent-structure.md +101 -0
- presets/java-spring/templates/subagent-sync-section.md +34 -0
- presets/java-spring/templates/subagent-utils.md +73 -0
- presets/java-spring/templates/sync-system.md +8 -0
- presets/java-spring/workflow-extensions.md +112 -0
- skills/__init__.py +1 -0
- skills/_shared/README.md +30 -0
- skills/_shared/doc-coverage-shared.md +134 -0
- skills/_shared/doc-quality-standard.md +1058 -0
- skills/_shared/doc-subagent-rules.md +762 -0
- skills/_shared/windows-compat.md +89 -0
- skills/kb-audit/SKILL.md +52 -0
- skills/kb-audit/rules.md +88 -0
- skills/kb-audit/steps/step-01-prepare.md +75 -0
- skills/kb-audit/steps/step-02-audit.md +96 -0
- skills/kb-audit/steps/step-03-verify.md +65 -0
- skills/kb-audit/steps/step-04-report.md +64 -0
- skills/kb-init/SKILL.md +146 -0
- skills/kb-init/rules.md +187 -0
- skills/kb-init/steps/step-01-scope.md +62 -0
- skills/kb-init/steps/step-02-source.md +410 -0
- skills/kb-init/steps/step-03-generate.md +307 -0
- skills/kb-init/steps/step-04-quality.md +92 -0
- skills/kb-init/steps/step-05-finalize.md +132 -0
- skills/kb-init/templates/core/execution-modes.md +29 -0
- skills/kb-init/templates/core/output-only.md +4 -0
- skills/kb-init/templates/core/readwrite.md +33 -0
- skills/kb-search/SKILL.md +138 -0
- skills/kb-search/rules.md +64 -0
- skills/kb-sync/SKILL.md +43 -0
- skills/kb-sync/rules.md +70 -0
- skills/kb-sync/scripts/rebuild_module.py +91 -0
- skills/kb-sync/scripts/scan_repos.py +687 -0
- skills/kb-sync/steps/step-01-detect.md +72 -0
- skills/kb-sync/steps/step-02-update.md +71 -0
- skills/kb-sync/steps/step-03-verify.md +47 -0
- skills/kb-sync/steps/step-04-finalize.md +52 -0
- source_kb-0.2.2.dist-info/METADATA +194 -0
- source_kb-0.2.2.dist-info/RECORD +228 -0
- source_kb-0.2.2.dist-info/WHEEL +5 -0
- source_kb-0.2.2.dist-info/entry_points.txt +3 -0
- source_kb-0.2.2.dist-info/licenses/LICENSE +21 -0
- source_kb-0.2.2.dist-info/top_level.txt +6 -0
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
"""Checkpoint and recovery module for pipeline execution.
|
|
2
|
+
|
|
3
|
+
Provides:
|
|
4
|
+
- Checkpoint dataclass for recording step completion state
|
|
5
|
+
- save()/load() methods for persistence
|
|
6
|
+
- --resume breakpoint resume logic
|
|
7
|
+
- Corruption detection and rollback
|
|
8
|
+
|
|
9
|
+
Requirements: 19.1, 19.2, 19.3, 19.4
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import logging
|
|
16
|
+
import time
|
|
17
|
+
from dataclasses import dataclass, field, asdict
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
from core.interfaces import LlmStrategy
|
|
22
|
+
from engine.runner import SubagentTask, SubagentResult, run_single
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
CHECKPOINT_FILENAME = ".pipeline-checkpoint.json"
|
|
27
|
+
CHECKPOINT_VERSION = 2
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
# Checkpoint dataclass
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
@dataclass
class StepState:
    """Bookkeeping record for a single step of a pipeline run.

    Instances are stored in ``Checkpoint.steps`` and written to the
    checkpoint file through ``dataclasses.asdict``.
    """

    # Step name; used to look the step up when its status changes.
    name: str
    # One of "pending", "running", "done", "failed", "skipped".
    status: str
    # time.time() stamp of the last transition to "running" (0.0 = never).
    started_at: float = 0.0
    # time.time() stamp of the last transition to "done"/"failed"/"skipped".
    completed_at: float = 0.0
    # Free-form status text supplied alongside the status.
    message: str = ""
    # Extra keyword data attached to the step; a fresh dict per instance.
    details: dict[str, Any] = field(default_factory=dict)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class Checkpoint:
    """Progress record of one pipeline run, persisted so a run can resume."""

    pipeline_name: str
    kb_name: str
    # Format version; compared against CHECKPOINT_VERSION when loading.
    version: int = CHECKPOINT_VERSION
    created_at: float = field(default_factory=time.time)
    updated_at: float = field(default_factory=time.time)
    # Step records in the order they were supplied.
    steps: list[StepState] = field(default_factory=list)
    # Pipeline state captured together with the step statuses.
    state_snapshot: dict[str, Any] = field(default_factory=dict)

    @property
    def completed_steps(self) -> list[str]:
        """Names of steps whose status is "done" or "skipped", in step order."""
        finished: list[str] = []
        for step in self.steps:
            if step.status in ("done", "skipped"):
                finished.append(step.name)
        return finished

    @property
    def last_completed(self) -> str | None:
        """Name of the last entry in ``completed_steps``, or None if empty."""
        names = self.completed_steps
        if not names:
            return None
        return names[-1]

    @property
    def next_step(self) -> str | None:
        """Name of the first step still to be executed, or None.

        A step counts as outstanding while its status is "pending",
        "failed" or "running" (i.e. it was started but never finished).
        """
        outstanding = (
            step.name
            for step in self.steps
            if step.status in ("pending", "failed", "running")
        )
        return next(outstanding, None)

    def mark_step(self, name: str, status: str, message: str = "", **details) -> None:
        """Record a status change for the first step called *name*.

        Args:
            name: Step to update; an unknown name leaves all steps untouched.
            status: New status value.
            message: Text stored on the step (replaces the previous message).
            **details: Merged into the step's ``details`` dict.

        ``updated_at`` is refreshed even when no step matches.
        """
        target = next((step for step in self.steps if step.name == name), None)
        if target is not None:
            target.status = status
            target.message = message
            target.details.update(details)
            if status == "running":
                target.started_at = time.time()
            elif status in ("done", "failed", "skipped"):
                target.completed_at = time.time()
        self.updated_at = time.time()
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
# Save / Load
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
def save_checkpoint(checkpoint: Checkpoint, knowledge_dir: Path) -> Path:
    """Write *checkpoint* to ``<knowledge_dir>/.meta/`` as JSON.

    Args:
        checkpoint: Checkpoint to persist; its ``updated_at`` is refreshed.
        knowledge_dir: Knowledge directory; ``.meta/`` is created if missing.

    Returns:
        Path of the checkpoint file that was written.
    """
    # Stamp first so the serialized payload carries the save time.
    checkpoint.updated_at = time.time()

    meta_dir = knowledge_dir / ".meta"
    meta_dir.mkdir(parents=True, exist_ok=True)
    target = meta_dir / CHECKPOINT_FILENAME

    payload = json.dumps(
        {
            "version": checkpoint.version,
            "pipeline_name": checkpoint.pipeline_name,
            "kb_name": checkpoint.kb_name,
            "created_at": checkpoint.created_at,
            "updated_at": checkpoint.updated_at,
            "steps": [asdict(step) for step in checkpoint.steps],
            "state_snapshot": _sanitize_state(checkpoint.state_snapshot),
        },
        ensure_ascii=False,
        indent=2,
    )

    # Write to a sibling temp file, then rename it over the target so the
    # checkpoint file is swapped in one step rather than rewritten in place.
    staging = target.with_suffix(".tmp")
    staging.write_text(payload, encoding="utf-8")
    staging.replace(target)

    logger.debug("[checkpoint] Saved: %s (%d steps)", target, len(checkpoint.steps))
    return target
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def load_checkpoint(knowledge_dir: Path) -> Checkpoint | None:
    """Load the checkpoint stored under ``<knowledge_dir>/.meta/``.

    Args:
        knowledge_dir: Knowledge directory that holds the ``.meta`` folder.

    Returns:
        The reconstructed Checkpoint, or None when no checkpoint file exists,
        the file cannot be read or decoded, its content is not shaped like a
        checkpoint, or its version differs from CHECKPOINT_VERSION.
    """
    cp_path = knowledge_dir / ".meta" / CHECKPOINT_FILENAME
    if not cp_path.exists():
        return None

    try:
        data = json.loads(cp_path.read_text(encoding="utf-8"))
    except (ValueError, OSError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("[checkpoint] Corrupted checkpoint: %s", e)
        return None

    # Valid JSON that is not an object (e.g. a list or a bare string) is
    # still a corrupted checkpoint; treat it the same as a decode failure.
    if not isinstance(data, dict):
        logger.warning(
            "[checkpoint] Corrupted checkpoint: expected a JSON object, got %s",
            type(data).__name__,
        )
        return None

    # Version check
    if data.get("version", 0) != CHECKPOINT_VERSION:
        logger.warning("[checkpoint] Version mismatch (got %s, expected %s)",
                       data.get("version"), CHECKPOINT_VERSION)
        return None

    try:
        steps = [
            StepState(
                name=s["name"],
                status=s.get("status", "pending"),
                started_at=s.get("started_at", 0),
                completed_at=s.get("completed_at", 0),
                message=s.get("message", ""),
                details=s.get("details", {}),
            )
            for s in data.get("steps", [])
        ]
    except (KeyError, TypeError, AttributeError) as e:
        # A step without "name", a non-object step entry, or "steps": null.
        logger.warning("[checkpoint] Corrupted checkpoint: malformed step entry (%r)", e)
        return None

    return Checkpoint(
        pipeline_name=data.get("pipeline_name", ""),
        kb_name=data.get("kb_name", ""),
        version=data.get("version", CHECKPOINT_VERSION),
        created_at=data.get("created_at", 0),
        updated_at=data.get("updated_at", 0),
        steps=steps,
        state_snapshot=data.get("state_snapshot", {}),
    )
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def create_checkpoint(
    pipeline_name: str, kb_name: str, step_names: list[str]
) -> Checkpoint:
    """Build a fresh checkpoint with one "pending" step per entry of *step_names*."""
    pending: list[StepState] = []
    for step_name in step_names:
        pending.append(StepState(name=step_name, status="pending"))
    return Checkpoint(pipeline_name=pipeline_name, kb_name=kb_name, steps=pending)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def clear_checkpoint(knowledge_dir: Path) -> None:
    """Delete the checkpoint file under ``<knowledge_dir>/.meta/`` if present."""
    target = knowledge_dir / ".meta" / CHECKPOINT_FILENAME
    if not target.exists():
        return
    target.unlink()
    logger.debug("[checkpoint] Cleared: %s", target)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# ---------------------------------------------------------------------------
|
|
190
|
+
# Resume logic
|
|
191
|
+
# ---------------------------------------------------------------------------
|
|
192
|
+
|
|
193
|
+
def get_resume_point(knowledge_dir: Path) -> str | None:
    """Return the step to resume from, based on the stored checkpoint.

    Returns None when no usable checkpoint can be loaded or when the
    checkpoint has no outstanding step.
    """
    checkpoint = load_checkpoint(knowledge_dir)
    return None if checkpoint is None else checkpoint.next_step
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def should_skip_step(checkpoint: Checkpoint | None, step_name: str) -> bool:
    """Tell whether *step_name* is already completed in *checkpoint*.

    Without a checkpoint nothing is skipped, so the result is False.
    """
    return checkpoint is not None and step_name in checkpoint.completed_steps
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
# ---------------------------------------------------------------------------
|
|
209
|
+
# Recovery (LLM failure handling) — see generate_recovery.py for full chain
|
|
210
|
+
# ---------------------------------------------------------------------------
|
|
211
|
+
|
|
212
|
+
def attempt_recovery(
    task: SubagentTask,
    strategy: LlmStrategy,
    failure_reason: str,
    max_retries: int = 2,
) -> SubagentResult:
    """Re-run *task* up to *max_retries* times and return the first success.

    Args:
        task: Task handed to ``run_single`` on every attempt.
        strategy: LLM strategy handed to ``run_single``.
        failure_reason: Original failure text, echoed in the final error.
        max_retries: Maximum number of attempts.

    Returns:
        The first result whose status is "done"; otherwise a new "failed"
        result for the task. For the full recovery chain, use
        generate_recovery.py.
    """
    for _ in range(max_retries):
        outcome = run_single(task, strategy)
        if outcome.status != "done":
            continue
        return outcome

    exhausted = f"Recovery exhausted after {max_retries} retries: {failure_reason}"
    return SubagentResult(task_id=task.task_id, status="failed", error=exhausted)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
# ---------------------------------------------------------------------------
|
|
231
|
+
# Helpers
|
|
232
|
+
# ---------------------------------------------------------------------------
|
|
233
|
+
|
|
234
|
+
def _sanitize_state(state: dict) -> dict:
|
|
235
|
+
"""Remove non-serializable items from state for checkpoint storage."""
|
|
236
|
+
safe: dict[str, Any] = {}
|
|
237
|
+
for key, value in state.items():
|
|
238
|
+
if key.startswith("_"):
|
|
239
|
+
continue # Skip private state (locks, etc.)
|
|
240
|
+
if isinstance(value, Path):
|
|
241
|
+
safe[key] = str(value)
|
|
242
|
+
elif isinstance(value, dict):
|
|
243
|
+
safe[key] = {
|
|
244
|
+
k: str(v) if isinstance(v, Path) else v
|
|
245
|
+
for k, v in value.items()
|
|
246
|
+
if not k.startswith("_")
|
|
247
|
+
}
|
|
248
|
+
elif isinstance(value, (str, int, float, bool, list)):
|
|
249
|
+
safe[key] = value
|
|
250
|
+
return safe
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""CLI pipeline steps — all step implementations.
|
|
2
|
+
|
|
3
|
+
Steps are organized by pipeline phase:
|
|
4
|
+
lock.py — AcquireLock / ReleaseLock
|
|
5
|
+
source.py — FetchSource / BuildSource / ResolveDeps
|
|
6
|
+
skeleton.py — ExtractSkeleton / ExtractFileList / Classify
|
|
7
|
+
generate.py — GenerateDocs (main LLM generation step)
|
|
8
|
+
quality.py — Validate / Sampling / Links / Duplicates
|
|
9
|
+
finalize.py — Merge / Dedup / SharedDocs / Publish / RebuildIndex / Clean
|
|
10
|
+
sync.py — Sync-specific steps
|
|
11
|
+
audit.py — Audit-specific steps
|
|
12
|
+
|
|
13
|
+
Import all step modules to trigger registration.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
# Import all step modules to register them
|
|
17
|
+
from engine.pipeline.steps import lock # noqa: F401
|
|
18
|
+
from engine.pipeline.steps import source # noqa: F401
|
|
19
|
+
from engine.pipeline.steps import skeleton # noqa: F401
|
|
20
|
+
from engine.pipeline.steps import pregenerate # noqa: F401
|
|
21
|
+
from engine.pipeline.steps import generate # noqa: F401
|
|
22
|
+
from engine.pipeline.steps import quality # noqa: F401
|
|
23
|
+
from engine.pipeline.steps import finalize # noqa: F401
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""Audit pipeline steps — document quality audit via LLM.
|
|
2
|
+
|
|
3
|
+
Steps:
|
|
4
|
+
- FetchAndScaleStep: fetch source and assess scale for strategy selection
|
|
5
|
+
- AuditDocsStep: LLM-powered document audit against source
|
|
6
|
+
|
|
7
|
+
Requirements: Req 12, 18, 20, 27
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import logging
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from core.interfaces import Step, StepResult, PipelineContext, LlmRequest
|
|
17
|
+
from engine.pipeline import register_step
|
|
18
|
+
from engine.pipeline.steps.audit_helpers import (
|
|
19
|
+
compute_doc_size, get_audit_order, load_audit_progress,
|
|
20
|
+
update_audit_progress, build_audit_system_prompt, build_audit_user_prompt,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@register_step
class FetchAndScaleStep(Step):
    """Make each module's repository available and pick its audit strategy.

    The strategy is chosen from the size reported by ``compute_doc_size`` for
    the module's knowledge directory, compared against two preset thresholds.
    """

    default_name = "fetch-and-scale"

    def __init__(self):
        super().__init__("fetch-and-scale")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Fill ``ctx.state["module_repos"]`` and ``ctx.state["audit_strategies"]``.

        A module whose processing raises is logged and left without a
        strategy; the step itself still reports "ok".
        """
        from core.git import ensure_repo
        from core.preset import load_preset
        from engine.pipeline.steps.source import _resolve_modules

        modules = _resolve_modules(ctx)

        preset = load_preset(ctx.kb_config.get("preset", "generic"))
        thresholds = preset.get("audit", {}).get("scale_thresholds", {})
        # Thresholds are configured in KB and scaled by 1024 here
        # (presumably to bytes — confirm against compute_doc_size).
        single_agent_limit = thresholds.get("single_agent_kb", 50) * 1024
        skeleton_limit = thresholds.get("skeleton_kb", 80) * 1024

        strategies: dict[str, str] = {}
        for module in modules:
            name = module["name"]
            branch = module.get("branch", "main")
            try:
                checkout = ensure_repo(
                    module.get("url"), module.get("local"),
                    ctx.cache_dir, name, branch,
                )
                ctx.state.setdefault("module_repos", {})[name] = checkout

                doc_size = compute_doc_size(ctx.knowledge_dir / name)
                if doc_size < single_agent_limit:
                    chosen = "single-agent"
                elif doc_size < skeleton_limit:
                    chosen = "single-agent+skeleton"
                else:
                    chosen = "multi-agent"
                strategies[name] = chosen
            except Exception as e:
                # Best effort: one failing module must not abort the others.
                logger.warning("[fetch-and-scale] %s: %s", name, e)
                continue

        ctx.state["audit_strategies"] = strategies
        return StepResult(status="ok", message=f"Scale assessed: {len(strategies)} modules")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@register_step
class AuditDocsStep(Step):
    """LLM-powered document audit with structured findings.

    Processes documents in the order returned by ``get_audit_order``.
    Supports progress persistence for resume and scope filtering.
    """

    default_name = "audit-docs"

    def __init__(self):
        super().__init__("audit-docs", checkpoint="audit-docs")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Audit every in-scope document and collect the parsed findings.

        Reads from ``ctx.state``: "module_repos", "audit_scope" (doc type
        filter), "audit_module" (module filter, falling back to
        ``ctx.module``) and "audit_force" (ignore stored progress).
        Writes ``ctx.state["audit_findings"]`` keyed by "<module>/<doc_type>".

        Returns:
            A StepResult with status "ok" whose details carry
            ``total_audited`` (documents attempted, including failed ones)
            and ``total_issues``; ``all_failed`` is set when documents were
            attempted but none produced findings. With the "delegated"
            backend the result of ``_delegated_mode`` is returned instead.
        """
        from engine.strategies import create_strategy
        from engine.pipeline.steps.generate import _make_config_obj
        from core.prompt.response_parser import parse_audit_response
        from core.preset import load_preset

        config_obj = _make_config_obj(ctx)
        if config_obj.agent_backend == "delegated":
            return self._delegated_mode(ctx)

        strategy = create_strategy(config_obj)
        module_repos: dict[str, Path] = ctx.state.get("module_repos", {})

        preset_name = ctx.kb_config.get("preset", "generic")
        preset = load_preset(preset_name)
        audit_cfg = preset.get("audit", {})
        max_tokens = audit_cfg.get("max_tokens", 8192)
        doc_truncate_chars = audit_cfg.get("doc_truncate_chars", 50000)

        scope_doc_type = ctx.state.get("audit_scope")
        scope_module = ctx.state.get("audit_module") or ctx.module
        force = ctx.state.get("audit_force", False)

        all_findings: dict[str, list] = {}
        total_audited = 0

        # Only the module names are needed; the repo paths are not used here.
        for name in module_repos:
            if scope_module and name != scope_module:
                continue

            module_dir = ctx.knowledge_dir / name
            if not module_dir.is_dir():
                continue

            progress = load_audit_progress(module_dir) if not force else {}
            audit_order = get_audit_order(ctx)

            for doc_type in audit_order:
                if scope_doc_type and doc_type != scope_doc_type:
                    continue

                doc_file = module_dir / f"{doc_type}.md"
                if not doc_file.exists():
                    continue

                if progress.get(doc_type, {}).get("status") == "DONE":
                    logger.info("[audit] %s/%s: already DONE, skipping", name, doc_type)
                    continue

                update_audit_progress(module_dir, doc_type, "IN_PROGRESS")

                system_prompt = build_audit_system_prompt(doc_type)
                user_prompt = build_audit_user_prompt(doc_file, doc_type, module_dir, doc_truncate_chars)

                request = LlmRequest(system=system_prompt, user=user_prompt, max_tokens=max_tokens)
                try:
                    response = strategy.call(request)
                except Exception as e:
                    logger.error("[audit] LLM failed for %s/%s: %s", name, doc_type, e)
                    update_audit_progress(module_dir, doc_type, "FAILED")
                    total_audited += 1
                    continue

                if response.status != "done":
                    logger.warning("[audit] LLM returned status '%s' for %s/%s: %s",
                                   response.status, name, doc_type, response.error[:200] if response.error else "")
                    update_audit_progress(module_dir, doc_type, "FAILED")
                    total_audited += 1
                    continue

                result = parse_audit_response(response.content)
                if not result.success:
                    logger.warning("[audit] %s/%s: parse failed: %s", name, doc_type, result.error)
                    update_audit_progress(module_dir, doc_type, "FAILED")
                    # Count the attempt like the other failure branches so a
                    # run where nothing parses is reported as all_failed
                    # instead of "Audited 0 docs".
                    total_audited += 1
                    continue

                key = f"{name}/{doc_type}"
                all_findings[key] = result.findings
                update_audit_progress(module_dir, doc_type, "DONE", len(result.findings))
                total_audited += 1

        ctx.state["audit_findings"] = all_findings
        total_issues = sum(
            1
            for findings in all_findings.values()
            for f in findings
            if f.status == "fail"
        )

        if total_audited > 0 and not all_findings:
            return StepResult(
                status="ok",
                message=f"Attempted {total_audited} docs, all LLM calls failed (check LLM connection)",
                details={"total_audited": total_audited, "total_issues": 0, "all_failed": True},
            )

        return StepResult(
            status="ok",
            message=f"Audited {total_audited} docs, {total_issues} issues found",
            details={"total_audited": total_audited, "total_issues": total_issues},
        )

    def _delegated_mode(self, ctx: PipelineContext) -> StepResult:
        """Write an audit manifest for Agent dispatch instead of calling the LLM.

        The manifest lists, per module directory that exists, every document
        found on disk in audit order, and is written to
        ``<knowledge_dir>/.audit-manifest.json``.

        Returns:
            A StepResult with status "delegated" naming the manifest path.
        """
        module_repos: dict[str, Path] = ctx.state.get("module_repos", {})
        manifest = {"operation": "kb-audit", "modules": []}

        # NOTE(review): unlike run(), the "audit_scope"/"audit_module" filters
        # and stored progress are not applied here — confirm this is intended.
        for name in module_repos:
            module_dir = ctx.knowledge_dir / name
            if not module_dir.is_dir():
                continue

            docs = []
            for doc_type in get_audit_order(ctx):
                doc_file = module_dir / f"{doc_type}.md"
                if doc_file.exists():
                    docs.append({
                        "doc_type": doc_type,
                        "doc_path": str(doc_file.relative_to(ctx.knowledge_dir)),
                        # NOTE(review): template path is fixed to the
                        # java-spring preset regardless of the configured
                        # preset — verify against the consumers of the manifest.
                        "template": f"presets/java-spring/templates/subagent-audit-{doc_type}.md",
                    })

            if docs:
                manifest["modules"].append({"name": name, "documents": docs})

        manifest_path = ctx.knowledge_dir / ".audit-manifest.json"
        manifest_path.write_text(
            json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        return StepResult(
            status="delegated",
            message=f"Audit manifest written: {manifest_path}",
        )
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""Audit fix and report steps — apply fixes and generate reports.
|
|
2
|
+
|
|
3
|
+
Steps:
|
|
4
|
+
- ApplyFixesStep: apply LLM-generated fixes to documents
|
|
5
|
+
- GenerateReportStep: generate audit summary report
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
from core.interfaces import Step, StepResult, PipelineContext
|
|
15
|
+
from engine.pipeline import register_step
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@register_step
class ApplyFixesStep(Step):
    """Apply LLM-generated fixes to knowledge-base documents.

    Findings are read from ``ctx.state["audit_findings"]``, a mapping whose
    keys have the form ``"<module>/<doc_type>"`` and whose values are lists of
    finding objects exposing ``status``, ``fix`` and ``dimension``.

    Each failing finding that carries a fix is handed to
    ``_apply_single_fix``. That helper replaces (or appends) a section when
    the fix starts with a ``##``/``###`` heading; any other fix is logged as
    deferred and reported back as a failure.
    """

    default_name = "apply-fixes"

    def __init__(self):
        super().__init__("apply-fixes")

    @staticmethod
    def _is_safe_component(component: str) -> bool:
        """Return True when *component* cannot escape the directory it is joined to.

        A component is rejected when it carries an anchor (root or drive), since
        joining it would discard the base directory, or when it contains a
        ``..`` segment. The check is purely lexical, so symlinked module
        directories inside the knowledge base keep working.
        """
        candidate = Path(component)
        return not candidate.anchor and ".." not in candidate.parts

    def run(self, ctx: PipelineContext) -> StepResult:
        """Apply (or preview, in dry-run mode) every pending fix.

        Side effects: may rewrite documents under ``ctx.knowledge_dir``, writes
        a ``.audit-fixes.json`` tracking file, and stores ``fixes_applied`` and
        ``fixes_failed`` in ``ctx.state``.

        Returns:
            A ``StepResult`` with status ``"skipped"`` when there are no
            findings, otherwise ``"ok"`` with applied/skipped/failed counts.
        """
        findings = ctx.state.get("audit_findings", {})
        if not findings:
            return StepResult(status="skipped", message="No findings to apply")

        dry_run = ctx.state.get("dry_run", False)
        applied = 0
        skipped = 0
        failed = 0
        fix_log: list[dict] = []

        for key, finding_list in findings.items():
            parts = key.split("/", 1)
            if len(parts) != 2:
                continue
            module_name, doc_type = parts

            # The key is joined onto knowledge_dir below, so a component such
            # as "../x" or "/abs/path" would point outside the knowledge base.
            # Reject those keys before any path is built from them.
            if not (self._is_safe_component(module_name)
                    and self._is_safe_component(doc_type)):
                logger.warning(
                    "[apply-fixes] Ignoring findings key outside knowledge dir: %s",
                    key,
                )
                continue

            module_dir = ctx.knowledge_dir / module_name
            doc_path = module_dir / f"{doc_type}.md"

            if not doc_path.exists():
                logger.debug("[apply-fixes] Document not found, skipping: %s", doc_path)
                continue

            for finding in finding_list:
                # Only failing findings that actually carry a fix are actionable.
                if finding.status != "fail" or not finding.fix:
                    skipped += 1
                    continue

                if dry_run:
                    # Preview only: record what would change, touch nothing.
                    fix_log.append({
                        "doc": doc_type, "dimension": finding.dimension,
                        "action": "would_apply", "preview": finding.fix[:200],
                    })
                    applied += 1
                    continue

                success = _apply_single_fix(doc_path, finding)
                if success:
                    applied += 1
                    fix_log.append({
                        "doc": doc_type, "dimension": finding.dimension,
                        "status": "applied",
                    })
                else:
                    failed += 1
                    fix_log.append({
                        "doc": doc_type, "dimension": finding.dimension,
                        "status": "failed",
                    })

        if fix_log:
            # The whole log is written once, into the first module directory
            # that exists among the finding keys.
            # NOTE(review): entries carry no module name, so a multi-module run
            # produces one combined log in a single module's directory —
            # confirm this is intended.
            for key in findings:
                parts = key.split("/", 1)
                if len(parts) != 2:
                    continue
                if not self._is_safe_component(parts[0]):
                    continue
                module_dir = ctx.knowledge_dir / parts[0]
                if module_dir.is_dir():
                    tracking_path = module_dir / ".audit-fixes.json"
                    tracking_path.write_text(
                        json.dumps(fix_log, ensure_ascii=False, indent=2),
                        encoding="utf-8",
                    )
                    break

        ctx.state["fixes_applied"] = applied
        ctx.state["fixes_failed"] = failed

        return StepResult(
            status="ok",
            message=f"Applied {applied} fixes, {skipped} skipped, {failed} failed",
            details={"applied": applied, "skipped": skipped, "failed": failed},
        )
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@register_step
class GenerateReportStep(Step):
    """Write the Markdown audit summary and clear per-module progress files."""

    default_name = "generate-report"

    def __init__(self):
        super().__init__("generate-report")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Build ``.audit-report.md`` in the knowledge directory.

        Reads findings, fix counters and scope from ``ctx.state``, lists up to
        ten failing findings per document key, then removes any
        ``.audit-progress.json`` found in the directories named by
        ``ctx.state["module_repos"]``.
        """
        audit_findings = ctx.state.get("audit_findings", {})
        applied_count = ctx.state.get("fixes_applied", 0)
        failed_count = ctx.state.get("fixes_failed", 0)
        destination = ctx.knowledge_dir / ".audit-report.md"

        doc_scope = ctx.state.get("audit_scope", "all")
        module_scope = ctx.state.get("audit_module") or ctx.module or "all"

        # Header block.
        report = [
            "# Audit Report",
            "",
            f"Knowledge base: {ctx.kb_name}",
            f"Scope: {doc_scope}/{module_scope}",
            f"Fixes: {applied_count} applied, {failed_count} failed",
            "",
        ]

        # One section per findings key, in sorted order.
        issue_total = 0
        for doc_key, entries in sorted(audit_findings.items()):
            failing = [entry for entry in entries if entry.status == "fail"]
            passing = [entry for entry in entries if entry.status == "pass"]
            issue_total += len(failing)

            report.extend([
                f"## {doc_key}",
                f"- Pass: {len(passing)}, Fail: {len(failing)}",
                "",
            ])

            if not failing:
                continue

            # Show at most ten failures, then a count of the remainder.
            for entry in failing[:10]:
                suffix = " (fixed)" if entry.fix else ""
                report.append(f" - [{entry.dimension}] {entry.detail}{suffix}")
            overflow = len(failing) - 10
            if overflow > 0:
                report.append(f" - ... and {overflow} more")
            report.append("")

        report.extend(["---", f"Total: {issue_total} issues"])
        destination.write_text("\n".join(report), encoding="utf-8")

        # Drop leftover progress markers for every known module.
        for module_name in ctx.state.get("module_repos", {}):
            stale_progress = ctx.knowledge_dir / module_name / ".audit-progress.json"
            if stale_progress.exists():
                stale_progress.unlink()

        return StepResult(status="ok", message=f"Report: {destination} ({issue_total} issues)")
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
# ---------------------------------------------------------------------------
|
|
165
|
+
# Helpers
|
|
166
|
+
# ---------------------------------------------------------------------------
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _apply_single_fix(doc_path: Path, finding) -> bool:
    """Write one finding's fix into the document at *doc_path*.

    A fix whose first line is a ``## `` or ``### `` heading is treated as a
    section: the section is replaced, or appended when the replacement does not
    succeed. Any other fix is only logged as deferred.

    Returns the result of the section update, or False when the finding has no
    fix, the path check fails, or the fix is not a section.
    """
    from core.docs.section_updater import replace_section, append_section
    from core.utils import validate_path_within_bounds

    fix_text = finding.fix
    if not fix_text:
        return False

    # NOTE(review): the bounds check uses the document's own parent directory
    # as the root — confirm that is the intended boundary.
    if not validate_path_within_bounds(doc_path, doc_path.parent):
        logger.warning("[apply-fix] Path traversal blocked: %s", doc_path)
        return False

    # str.split always yields at least one element, so the unpack is safe.
    heading, *remainder = fix_text.strip().split("\n")

    if not heading.startswith(("## ", "### ")):
        logger.info("[apply-fix] Inline fix for %s/%s (deferred): %s",
                    doc_path.name, finding.dimension, fix_text[:100])
        return False

    body = "\n".join(remainder).strip()
    # Fall back to appending only when the section could not be replaced.
    return replace_section(doc_path, heading, body) or append_section(doc_path, heading, body)
|