source-kb 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. cli/__init__.py +50 -0
  2. cli/__main__.py +5 -0
  3. cli/commands/__init__.py +1 -0
  4. cli/commands/anchor_fix.py +47 -0
  5. cli/commands/diff_doc.py +52 -0
  6. cli/commands/dispatch.py +77 -0
  7. cli/commands/extract.py +72 -0
  8. cli/commands/file_list.py +74 -0
  9. cli/commands/index.py +84 -0
  10. cli/commands/lock.py +89 -0
  11. cli/commands/merge.py +60 -0
  12. cli/commands/merge_delta.py +19 -0
  13. cli/commands/metadata.py +24 -0
  14. cli/commands/pipeline.py +45 -0
  15. cli/commands/post_merge.py +43 -0
  16. cli/commands/query.py +52 -0
  17. cli/commands/render.py +101 -0
  18. cli/commands/scan_repos.py +46 -0
  19. cli/commands/setup.py +94 -0
  20. cli/commands/split.py +196 -0
  21. cli/commands/stale_files.py +98 -0
  22. cli/commands/validate.py +191 -0
  23. core/__init__.py +32 -0
  24. core/config.py +261 -0
  25. core/docs/__init__.py +7 -0
  26. core/docs/section_updater.py +286 -0
  27. core/docs/shared.py +149 -0
  28. core/git.py +294 -0
  29. core/interfaces.py +249 -0
  30. core/monitor/__init__.py +5 -0
  31. core/monitor/progress.py +83 -0
  32. core/monitor/prompt_store.py +49 -0
  33. core/paths.py +141 -0
  34. core/preset.py +237 -0
  35. core/preset_accessors.py +202 -0
  36. core/preset_classify.py +132 -0
  37. core/preset_hooks.py +129 -0
  38. core/preset_profile.py +89 -0
  39. core/prompt/__init__.py +7 -0
  40. core/prompt/__main__.py +147 -0
  41. core/prompt/content.py +320 -0
  42. core/prompt/context_manager.py +164 -0
  43. core/prompt/renderer.py +236 -0
  44. core/prompt/response_parser.py +274 -0
  45. core/prompt/templates.py +357 -0
  46. core/prompt/validate_parity.py +162 -0
  47. core/prompt/variables.py +339 -0
  48. core/rag/__init__.py +22 -0
  49. core/rag/__main__.py +136 -0
  50. core/rag/bm25_index.py +268 -0
  51. core/rag/chunker.py +273 -0
  52. core/rag/embedder.py +151 -0
  53. core/rag/indexer.py +292 -0
  54. core/rag/loader.py +89 -0
  55. core/rag/retriever.py +82 -0
  56. core/skeleton/__init__.py +11 -0
  57. core/skeleton/__main__.py +934 -0
  58. core/skeleton/anchor_fix.py +250 -0
  59. core/skeleton/classify.py +331 -0
  60. core/skeleton/cmd_anchor_fix.py +43 -0
  61. core/skeleton/cmd_diff_doc.py +44 -0
  62. core/skeleton/cmd_lock.py +87 -0
  63. core/skeleton/cmd_merge_delta.py +41 -0
  64. core/skeleton/community.py +233 -0
  65. core/skeleton/dependency_graph.py +306 -0
  66. core/skeleton/diff_doc.py +248 -0
  67. core/skeleton/dispatch.py +273 -0
  68. core/skeleton/dispatch_render.py +319 -0
  69. core/skeleton/dispatch_source.py +111 -0
  70. core/skeleton/extract.py +218 -0
  71. core/skeleton/extract_methods.py +298 -0
  72. core/skeleton/file_list.py +239 -0
  73. core/skeleton/impact.py +278 -0
  74. core/skeleton/jar_download.py +177 -0
  75. core/skeleton/jar_resolver.py +186 -0
  76. core/skeleton/loader.py +162 -0
  77. core/skeleton/merge.py +278 -0
  78. core/skeleton/merge_delta.py +229 -0
  79. core/skeleton/metadata.py +96 -0
  80. core/skeleton/metadata_builders.py +264 -0
  81. core/skeleton/module_dag.py +330 -0
  82. core/skeleton/parsers/__init__.py +71 -0
  83. core/skeleton/parsers/jqassistant.py +300 -0
  84. core/skeleton/parsers/jqassistant_cypher.py +225 -0
  85. core/skeleton/parsers/regex.py +171 -0
  86. core/skeleton/parsers/treesitter.py +324 -0
  87. core/skeleton/parsers/treesitter_java.py +284 -0
  88. core/skeleton/parsers/treesitter_multi.py +289 -0
  89. core/skeleton/pom_parser.py +299 -0
  90. core/skeleton/post_merge.py +295 -0
  91. core/skeleton/post_merge_llm.py +82 -0
  92. core/skeleton/query.py +195 -0
  93. core/skeleton/shard_context.py +177 -0
  94. core/skeleton/split.py +180 -0
  95. core/skeleton/split_cache.py +107 -0
  96. core/skeleton/split_feedback.py +174 -0
  97. core/skeleton/split_plan.py +219 -0
  98. core/skeleton/split_plan_helpers.py +305 -0
  99. core/skeleton/split_plan_llm.py +274 -0
  100. core/utils.py +135 -0
  101. core/validators/__init__.py +65 -0
  102. core/validators/__main__.py +215 -0
  103. core/validators/consistency.py +203 -0
  104. core/validators/coverage.py +171 -0
  105. core/validators/duplicates.py +76 -0
  106. core/validators/engine.py +224 -0
  107. core/validators/links.py +76 -0
  108. core/validators/sampling.py +169 -0
  109. core/validators/structure.py +144 -0
  110. engine/__init__.py +7 -0
  111. engine/assembler.py +231 -0
  112. engine/confirm.py +65 -0
  113. engine/dedup.py +106 -0
  114. engine/main.py +211 -0
  115. engine/pipeline/__init__.py +163 -0
  116. engine/pipeline/recovery.py +250 -0
  117. engine/pipeline/steps/__init__.py +23 -0
  118. engine/pipeline/steps/audit.py +220 -0
  119. engine/pipeline/steps/audit_apply.py +195 -0
  120. engine/pipeline/steps/audit_helpers.py +155 -0
  121. engine/pipeline/steps/classify_llm.py +236 -0
  122. engine/pipeline/steps/classify_prompt.py +223 -0
  123. engine/pipeline/steps/finalize.py +160 -0
  124. engine/pipeline/steps/generate.py +169 -0
  125. engine/pipeline/steps/generate_batch.py +197 -0
  126. engine/pipeline/steps/generate_recovery.py +170 -0
  127. engine/pipeline/steps/llm_plan_split.py +253 -0
  128. engine/pipeline/steps/lock.py +64 -0
  129. engine/pipeline/steps/preflight.py +237 -0
  130. engine/pipeline/steps/preflight_adjust.py +147 -0
  131. engine/pipeline/steps/pregenerate.py +130 -0
  132. engine/pipeline/steps/quality.py +81 -0
  133. engine/pipeline/steps/skeleton.py +149 -0
  134. engine/pipeline/steps/source.py +163 -0
  135. engine/pipeline/steps/sync.py +117 -0
  136. engine/pipeline/steps/sync_finalize.py +237 -0
  137. engine/pipeline/steps/sync_update.py +341 -0
  138. engine/pipelines.py +91 -0
  139. engine/runner.py +335 -0
  140. engine/strategies/__init__.py +86 -0
  141. engine/strategies/api.py +128 -0
  142. engine/strategies/delegated.py +50 -0
  143. engine/strategies/dryrun.py +25 -0
  144. engine/two_phase.py +143 -0
  145. mcp_server/__init__.py +73 -0
  146. mcp_server/__main__.py +5 -0
  147. mcp_server/tools/__init__.py +1 -0
  148. mcp_server/tools/config.py +63 -0
  149. mcp_server/tools/discovery.py +276 -0
  150. mcp_server/tools/generation.py +184 -0
  151. mcp_server/tools/planning.py +144 -0
  152. mcp_server/tools/source.py +175 -0
  153. mcp_server/tools/validation.py +140 -0
  154. mcp_server/tools/workflow.py +166 -0
  155. mcp_server/workflow_loader.py +204 -0
  156. presets/generic/audit_dimensions.md +132 -0
  157. presets/generic/doc_types.yaml +152 -0
  158. presets/generic/preset.yaml +115 -0
  159. presets/java-spring/audit_dimensions.md +228 -0
  160. presets/java-spring/audit_dimensions.yaml +203 -0
  161. presets/java-spring/doc_types.yaml +269 -0
  162. presets/java-spring/hooks.py +122 -0
  163. presets/java-spring/preset.yaml +341 -0
  164. presets/java-spring/templates/README.md +34 -0
  165. presets/java-spring/templates/audit-system.md +15 -0
  166. presets/java-spring/templates/subagent-aop.md +105 -0
  167. presets/java-spring/templates/subagent-api.md +63 -0
  168. presets/java-spring/templates/subagent-architecture.md +111 -0
  169. presets/java-spring/templates/subagent-async-events.md +107 -0
  170. presets/java-spring/templates/subagent-audit-api-contracts.md +40 -0
  171. presets/java-spring/templates/subagent-audit-architecture.md +38 -0
  172. presets/java-spring/templates/subagent-audit-business.md +40 -0
  173. presets/java-spring/templates/subagent-audit-data-models.md +40 -0
  174. presets/java-spring/templates/subagent-business.md +129 -0
  175. presets/java-spring/templates/subagent-caching.md +75 -0
  176. presets/java-spring/templates/subagent-database-access.md +114 -0
  177. presets/java-spring/templates/subagent-enum.md +75 -0
  178. presets/java-spring/templates/subagent-error-handling.md +91 -0
  179. presets/java-spring/templates/subagent-external-integrations.md +80 -0
  180. presets/java-spring/templates/subagent-index.md +122 -0
  181. presets/java-spring/templates/subagent-messaging.md +97 -0
  182. presets/java-spring/templates/subagent-model.md +88 -0
  183. presets/java-spring/templates/subagent-observability.md +91 -0
  184. presets/java-spring/templates/subagent-scheduled.md +81 -0
  185. presets/java-spring/templates/subagent-security.md +102 -0
  186. presets/java-spring/templates/subagent-structure.md +101 -0
  187. presets/java-spring/templates/subagent-sync-section.md +34 -0
  188. presets/java-spring/templates/subagent-utils.md +73 -0
  189. presets/java-spring/templates/sync-system.md +8 -0
  190. presets/java-spring/workflow-extensions.md +112 -0
  191. skills/__init__.py +1 -0
  192. skills/_shared/README.md +30 -0
  193. skills/_shared/doc-coverage-shared.md +134 -0
  194. skills/_shared/doc-quality-standard.md +1058 -0
  195. skills/_shared/doc-subagent-rules.md +762 -0
  196. skills/_shared/windows-compat.md +89 -0
  197. skills/kb-audit/SKILL.md +52 -0
  198. skills/kb-audit/rules.md +88 -0
  199. skills/kb-audit/steps/step-01-prepare.md +75 -0
  200. skills/kb-audit/steps/step-02-audit.md +96 -0
  201. skills/kb-audit/steps/step-03-verify.md +65 -0
  202. skills/kb-audit/steps/step-04-report.md +64 -0
  203. skills/kb-init/SKILL.md +146 -0
  204. skills/kb-init/rules.md +187 -0
  205. skills/kb-init/steps/step-01-scope.md +62 -0
  206. skills/kb-init/steps/step-02-source.md +410 -0
  207. skills/kb-init/steps/step-03-generate.md +307 -0
  208. skills/kb-init/steps/step-04-quality.md +92 -0
  209. skills/kb-init/steps/step-05-finalize.md +132 -0
  210. skills/kb-init/templates/core/execution-modes.md +29 -0
  211. skills/kb-init/templates/core/output-only.md +4 -0
  212. skills/kb-init/templates/core/readwrite.md +33 -0
  213. skills/kb-search/SKILL.md +138 -0
  214. skills/kb-search/rules.md +64 -0
  215. skills/kb-sync/SKILL.md +43 -0
  216. skills/kb-sync/rules.md +70 -0
  217. skills/kb-sync/scripts/rebuild_module.py +91 -0
  218. skills/kb-sync/scripts/scan_repos.py +687 -0
  219. skills/kb-sync/steps/step-01-detect.md +72 -0
  220. skills/kb-sync/steps/step-02-update.md +71 -0
  221. skills/kb-sync/steps/step-03-verify.md +47 -0
  222. skills/kb-sync/steps/step-04-finalize.md +52 -0
  223. source_kb-0.2.2.dist-info/METADATA +194 -0
  224. source_kb-0.2.2.dist-info/RECORD +228 -0
  225. source_kb-0.2.2.dist-info/WHEEL +5 -0
  226. source_kb-0.2.2.dist-info/entry_points.txt +3 -0
  227. source_kb-0.2.2.dist-info/licenses/LICENSE +21 -0
  228. source_kb-0.2.2.dist-info/top_level.txt +6 -0
@@ -0,0 +1,250 @@
1
+ """Checkpoint and recovery module for pipeline execution.
2
+
3
+ Provides:
4
+ - Checkpoint dataclass for recording step completion state
5
+ - save()/load() methods for persistence
6
+ - Resume-from-checkpoint logic for the --resume flag
7
+ - Corruption detection and rollback
8
+
9
+ Requirements: 19.1, 19.2, 19.3, 19.4
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ import time
17
+ from dataclasses import dataclass, field, asdict
18
+ from pathlib import Path
19
+ from typing import Any
20
+
21
+ from core.interfaces import LlmStrategy
22
+ from engine.runner import SubagentTask, SubagentResult, run_single
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ CHECKPOINT_FILENAME = ".pipeline-checkpoint.json"
27
+ CHECKPOINT_VERSION = 2
28
+
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Checkpoint dataclass
32
+ # ---------------------------------------------------------------------------
33
+
34
@dataclass
class StepState:
    """Execution record for one named step of a pipeline run.

    Instances are stored in ``Checkpoint.steps`` and serialized with
    ``dataclasses.asdict`` when the checkpoint is saved.
    """

    # Step identifier; checkpoints look steps up by this name.
    name: str
    # Lifecycle marker: "pending", "running", "done", "failed" or "skipped".
    status: str
    # time.time() stamp set when the step is marked "running" (0.0 = never).
    started_at: float = 0.0
    # time.time() stamp set when the step reaches "done"/"failed"/"skipped".
    completed_at: float = 0.0
    # Free-form human-readable note attached to the latest status change.
    message: str = ""
    # Extra key/value data accumulated across status updates.
    details: dict[str, Any] = field(default_factory=dict)
43
+
44
+
45
@dataclass
class Checkpoint:
    """Persisted progress of one pipeline run, used to resume after a stop.

    Holds the ordered list of step records plus a snapshot of pipeline
    state; ``save_checkpoint`` / ``load_checkpoint`` move it to and from disk.
    """

    pipeline_name: str
    kb_name: str
    # Schema version; load_checkpoint rejects files with a different value.
    version: int = CHECKPOINT_VERSION
    created_at: float = field(default_factory=time.time)
    updated_at: float = field(default_factory=time.time)
    # Step records in execution order.
    steps: list[StepState] = field(default_factory=list)
    state_snapshot: dict[str, Any] = field(default_factory=dict)

    @property
    def completed_steps(self) -> list[str]:
        """Names of steps that need no further work ("done" or "skipped")."""
        finished: list[str] = []
        for step in self.steps:
            if step.status == "done" or step.status == "skipped":
                finished.append(step.name)
        return finished

    @property
    def last_completed(self) -> str | None:
        """Name of the last finished step in list order, or None if none finished."""
        names = self.completed_steps
        if not names:
            return None
        return names[-1]

    @property
    def next_step(self) -> str | None:
        """Name of the first step still to run, or None when nothing is left.

        A step counts as outstanding while it is "pending", "failed" or
        "running" (a run that stopped mid-step leaves it "running").
        """
        outstanding = ("pending", "failed", "running")
        return next(
            (step.name for step in self.steps if step.status in outstanding),
            None,
        )

    def mark_step(self, name: str, status: str, message: str = "", **details) -> None:
        """Record a status change on the first step called *name*.

        Args:
            name: Step to update. An unknown name leaves all steps untouched.
            status: New status string.
            message: Note stored on the step (replaces the previous one).
            **details: Extra values merged into the step's ``details`` dict.
        """
        target = next((step for step in self.steps if step.name == name), None)
        if target is not None:
            target.status = status
            target.message = message
            target.details.update(details)
            if status == "running":
                target.started_at = time.time()
            elif status in ("done", "failed", "skipped"):
                target.completed_at = time.time()
        # The checkpoint timestamp is refreshed even when no step matched.
        self.updated_at = time.time()
88
+
89
+
90
+ # ---------------------------------------------------------------------------
91
+ # Save / Load
92
+ # ---------------------------------------------------------------------------
93
+
94
def save_checkpoint(checkpoint: Checkpoint, knowledge_dir: Path) -> Path:
    """Persist *checkpoint* as JSON under ``<knowledge_dir>/.meta/``.

    The checkpoint's ``updated_at`` is refreshed before writing. The JSON is
    first written to a sibling ``.tmp`` file and then moved over the real
    file, so a reader never sees a half-written checkpoint.

    Args:
        checkpoint: Checkpoint to save (its ``updated_at`` is mutated).
        knowledge_dir: Knowledge directory; ``.meta/`` is created if missing.

    Returns:
        Path of the checkpoint file that was written.
    """
    checkpoint.updated_at = time.time()

    meta_dir = knowledge_dir / ".meta"
    meta_dir.mkdir(parents=True, exist_ok=True)
    target = meta_dir / CHECKPOINT_FILENAME

    # Scalar header fields are copied verbatim, in the on-disk key order.
    payload: dict[str, Any] = {
        attr: getattr(checkpoint, attr)
        for attr in ("version", "pipeline_name", "kb_name", "created_at", "updated_at")
    }
    payload["steps"] = [asdict(step) for step in checkpoint.steps]
    payload["state_snapshot"] = _sanitize_state(checkpoint.state_snapshot)

    serialized = json.dumps(payload, ensure_ascii=False, indent=2)

    # Write to a temp file, then rename over the destination.
    staging = target.with_suffix(".tmp")
    staging.write_text(serialized, encoding="utf-8")
    staging.replace(target)

    logger.debug("[checkpoint] Saved: %s (%d steps)", target, len(checkpoint.steps))
    return target
127
+
128
+
129
def load_checkpoint(knowledge_dir: Path) -> Checkpoint | None:
    """Load the checkpoint stored under ``<knowledge_dir>/.meta/``.

    Args:
        knowledge_dir: Knowledge directory that contains ``.meta/``.

    Returns:
        The reconstructed Checkpoint, or None when no checkpoint file exists,
        the file cannot be read or decoded, its content is not the expected
        JSON shape, or its version differs from ``CHECKPOINT_VERSION``.
    """
    cp_path = knowledge_dir / ".meta" / CHECKPOINT_FILENAME
    if not cp_path.exists():
        return None

    try:
        data = json.loads(cp_path.read_text(encoding="utf-8"))
    except (ValueError, OSError) as e:
        # ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError raised for a file that is not valid UTF-8.
        logger.warning("[checkpoint] Corrupted checkpoint: %s", e)
        return None

    # Valid JSON that is not an object (e.g. a list or a bare string) is
    # just as unusable as unparsable JSON.
    if not isinstance(data, dict):
        logger.warning(
            "[checkpoint] Corrupted checkpoint: expected a JSON object, got %s",
            type(data).__name__,
        )
        return None

    # Version check
    if data.get("version", 0) != CHECKPOINT_VERSION:
        logger.warning("[checkpoint] Version mismatch (got %s, expected %s)",
                       data.get("version"), CHECKPOINT_VERSION)
        return None

    try:
        steps = [
            StepState(
                name=s["name"],
                status=s.get("status", "pending"),
                started_at=s.get("started_at", 0),
                completed_at=s.get("completed_at", 0),
                message=s.get("message", ""),
                details=s.get("details", {}),
            )
            for s in data.get("steps", [])
        ]
    except (KeyError, TypeError, AttributeError) as e:
        # A step without "name", a non-dict step, or a non-iterable "steps".
        logger.warning("[checkpoint] Corrupted checkpoint: malformed step entry: %s", e)
        return None

    return Checkpoint(
        pipeline_name=data.get("pipeline_name", ""),
        kb_name=data.get("kb_name", ""),
        version=data.get("version", CHECKPOINT_VERSION),
        created_at=data.get("created_at", 0),
        updated_at=data.get("updated_at", 0),
        steps=steps,
        state_snapshot=data.get("state_snapshot", {}),
    )
171
+
172
+
173
def create_checkpoint(
    pipeline_name: str, kb_name: str, step_names: list[str]
) -> Checkpoint:
    """Build a fresh checkpoint in which every listed step is "pending".

    Args:
        pipeline_name: Name of the pipeline being run.
        kb_name: Name of the knowledge base the run targets.
        step_names: Step names, in execution order.

    Returns:
        A new, unsaved Checkpoint.
    """
    initial_steps = []
    for step_name in step_names:
        initial_steps.append(StepState(name=step_name, status="pending"))
    return Checkpoint(
        pipeline_name=pipeline_name,
        kb_name=kb_name,
        steps=initial_steps,
    )
179
+
180
+
181
def clear_checkpoint(knowledge_dir: Path) -> None:
    """Delete the checkpoint file, if present, once a run has finished.

    Args:
        knowledge_dir: Knowledge directory that contains ``.meta/``.
    """
    target = knowledge_dir.joinpath(".meta", CHECKPOINT_FILENAME)
    if not target.exists():
        return
    target.unlink()
    logger.debug("[checkpoint] Cleared: %s", target)
187
+
188
+
189
+ # ---------------------------------------------------------------------------
190
+ # Resume logic
191
+ # ---------------------------------------------------------------------------
192
+
193
def get_resume_point(knowledge_dir: Path) -> str | None:
    """Return the step to resume from, based on the saved checkpoint.

    Returns:
        The checkpoint's ``next_step``, or None when no usable checkpoint
        could be loaded (or every step is already finished).
    """
    saved = load_checkpoint(knowledge_dir)
    return saved.next_step if saved is not None else None
199
+
200
+
201
def should_skip_step(checkpoint: Checkpoint | None, step_name: str) -> bool:
    """Tell whether *step_name* was already finished in a previous run.

    Args:
        checkpoint: Checkpoint of the run being resumed, or None for a
            fresh run (in which case nothing is skipped).
        step_name: Step about to be executed.

    Returns:
        True when the step is listed in ``checkpoint.completed_steps``.
    """
    return checkpoint is not None and step_name in checkpoint.completed_steps
206
+
207
+
208
+ # ---------------------------------------------------------------------------
209
+ # Recovery (LLM failure handling) — see generate_recovery.py for full chain
210
+ # ---------------------------------------------------------------------------
211
+
212
def attempt_recovery(
    task: SubagentTask,
    strategy: LlmStrategy,
    failure_reason: str,
    max_retries: int = 2,
) -> SubagentResult:
    """Re-run *task* up to *max_retries* times until one run reports "done".

    For the full recovery chain, use generate_recovery.py.

    Args:
        task: Task to re-run.
        strategy: LLM strategy handed to ``run_single``.
        failure_reason: Description of the original failure; only used in
            the error text of the final failed result.
        max_retries: Maximum number of re-runs. Zero or less means no
            attempt is made.

    Returns:
        The first result whose status is "done", otherwise a new failed
        SubagentResult stating that recovery was exhausted.
    """
    for attempt in range(1, max_retries + 1):
        result = run_single(task, strategy)
        if result.status == "done":
            return result
        # Leave a trace of each unsuccessful retry instead of failing silently.
        logger.warning(
            "[recovery] Retry %d/%d for task %s ended with status '%s'",
            attempt, max_retries, task.task_id, result.status,
        )

    return SubagentResult(
        task_id=task.task_id, status="failed",
        error=f"Recovery exhausted after {max_retries} retries: {failure_reason}",
    )
228
+
229
+
230
+ # ---------------------------------------------------------------------------
231
+ # Helpers
232
+ # ---------------------------------------------------------------------------
233
+
234
+ def _sanitize_state(state: dict) -> dict:
235
+ """Remove non-serializable items from state for checkpoint storage."""
236
+ safe: dict[str, Any] = {}
237
+ for key, value in state.items():
238
+ if key.startswith("_"):
239
+ continue # Skip private state (locks, etc.)
240
+ if isinstance(value, Path):
241
+ safe[key] = str(value)
242
+ elif isinstance(value, dict):
243
+ safe[key] = {
244
+ k: str(v) if isinstance(v, Path) else v
245
+ for k, v in value.items()
246
+ if not k.startswith("_")
247
+ }
248
+ elif isinstance(value, (str, int, float, bool, list)):
249
+ safe[key] = value
250
+ return safe
@@ -0,0 +1,23 @@
1
+ """CLI pipeline steps — all step implementations.
2
+
3
+ Steps are organized by pipeline phase:
4
+ lock.py — AcquireLock / ReleaseLock
5
+ source.py — FetchSource / BuildSource / ResolveDeps
6
+ skeleton.py — ExtractSkeleton / ExtractFileList / Classify
7
+ generate.py — GenerateDocs (main LLM generation step)
8
+ quality.py — Validate / Sampling / Links / Duplicates
9
+ finalize.py — Merge / Dedup / SharedDocs / Publish / RebuildIndex / Clean
10
+ sync.py — Sync-specific steps
11
+ audit.py — Audit-specific steps
12
+
13
+ Import all step modules to trigger registration.
14
+ """
15
+
16
+ # Import all step modules to register them
17
+ from engine.pipeline.steps import lock # noqa: F401
18
+ from engine.pipeline.steps import source # noqa: F401
19
+ from engine.pipeline.steps import skeleton # noqa: F401
20
+ from engine.pipeline.steps import pregenerate # noqa: F401
21
+ from engine.pipeline.steps import generate # noqa: F401
22
+ from engine.pipeline.steps import quality # noqa: F401
23
+ from engine.pipeline.steps import finalize # noqa: F401
@@ -0,0 +1,220 @@
1
+ """Audit pipeline steps — document quality audit via LLM.
2
+
3
+ Steps:
4
+ - FetchAndScaleStep: fetch source and assess scale for strategy selection
5
+ - AuditDocsStep: LLM-powered document audit against source
6
+
7
+ Requirements: Req 12, 18, 20, 27
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import logging
14
+ from pathlib import Path
15
+
16
+ from core.interfaces import Step, StepResult, PipelineContext, LlmRequest
17
+ from engine.pipeline import register_step
18
+ from engine.pipeline.steps.audit_helpers import (
19
+ compute_doc_size, get_audit_order, load_audit_progress,
20
+ update_audit_progress, build_audit_system_prompt, build_audit_user_prompt,
21
+ )
22
+
23
+ logger = logging.getLogger(__name__)
24
+
25
+
26
@register_step
class FetchAndScaleStep(Step):
    """Fetch each module's source and pick an audit strategy by doc size."""

    default_name = "fetch-and-scale"

    def __init__(self):
        super().__init__("fetch-and-scale")

    @staticmethod
    def _select_strategy(doc_size, single_agent_limit, skeleton_limit) -> str:
        """Map a documentation size onto one of the three strategy names."""
        if doc_size < single_agent_limit:
            return "single-agent"
        if doc_size < skeleton_limit:
            return "single-agent+skeleton"
        return "multi-agent"

    def run(self, ctx: PipelineContext) -> StepResult:
        """Resolve modules, ensure their repos, and record audit strategies.

        Writes ``ctx.state["module_repos"][name]`` for every module whose
        repo could be ensured and ``ctx.state["audit_strategies"]`` with the
        chosen strategy per module. A module that raises during fetch or
        sizing is logged and left out; the step itself still returns "ok".
        """
        from core.git import ensure_repo
        from core.preset import load_preset
        from engine.pipeline.steps.source import _resolve_modules

        modules = _resolve_modules(ctx)
        strategies: dict[str, str] = {}

        # Thresholds are configured in KB and scaled by 1024 here
        # (presumably compared against a byte count — confirm compute_doc_size).
        preset = load_preset(ctx.kb_config.get("preset", "generic"))
        thresholds = preset.get("audit", {}).get("scale_thresholds", {})
        single_agent_limit = thresholds.get("single_agent_kb", 50) * 1024
        skeleton_limit = thresholds.get("skeleton_kb", 80) * 1024

        for mod in modules:
            name = mod["name"]
            branch = mod.get("branch", "main")
            try:
                repo_path = ensure_repo(
                    mod.get("url"), mod.get("local"),
                    ctx.cache_dir, name, branch,
                )
                ctx.state.setdefault("module_repos", {})[name] = repo_path

                doc_size = compute_doc_size(ctx.knowledge_dir / name)
                strategies[name] = self._select_strategy(
                    doc_size, single_agent_limit, skeleton_limit
                )
            except Exception as e:
                # Best effort: one broken module must not abort the others.
                logger.warning("[fetch-and-scale] %s: %s", name, e)

        ctx.state["audit_strategies"] = strategies
        return StepResult(status="ok", message=f"Scale assessed: {len(strategies)} modules")
75
+
76
+
77
@register_step
class AuditDocsStep(Step):
    """LLM-powered document audit with structured findings.

    Documents are processed per module in the order returned by
    ``get_audit_order``. Per-document progress is persisted through
    ``update_audit_progress`` so a later run can skip documents already
    marked DONE, and the run can be narrowed to one module / doc type.
    """

    default_name = "audit-docs"

    def __init__(self):
        super().__init__("audit-docs", checkpoint="audit-docs")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Audit every in-scope document and store findings in ``ctx.state``.

        Reads from ``ctx.state``: ``module_repos``, ``audit_scope`` (doc type
        filter), ``audit_module`` (module filter, falling back to
        ``ctx.module``) and ``audit_force`` (ignore saved progress).
        Writes ``ctx.state["audit_findings"]`` keyed by ``"<module>/<doc_type>"``.

        Returns:
            A "delegated" result when the agent backend is delegated;
            otherwise an "ok" result whose details carry ``total_audited``
            (documents attempted, including failed ones) and
            ``total_issues``, plus ``all_failed`` when no document produced
            findings.
        """
        from engine.strategies import create_strategy
        from engine.pipeline.steps.generate import _make_config_obj
        from core.prompt.response_parser import parse_audit_response
        from core.preset import load_preset

        config_obj = _make_config_obj(ctx)
        if config_obj.agent_backend == "delegated":
            return self._delegated_mode(ctx)

        strategy = create_strategy(config_obj)
        module_repos: dict[str, Path] = ctx.state.get("module_repos", {})

        preset_name = ctx.kb_config.get("preset", "generic")
        preset = load_preset(preset_name)
        audit_cfg = preset.get("audit", {})
        max_tokens = audit_cfg.get("max_tokens", 8192)
        doc_truncate_chars = audit_cfg.get("doc_truncate_chars", 50000)

        scope_doc_type = ctx.state.get("audit_scope")
        scope_module = ctx.state.get("audit_module") or ctx.module
        force = ctx.state.get("audit_force", False)

        all_findings: dict[str, list] = {}
        total_audited = 0

        for name in module_repos:
            if scope_module and name != scope_module:
                continue

            module_dir = ctx.knowledge_dir / name
            if not module_dir.is_dir():
                continue

            # With force, saved progress is ignored so everything is re-audited.
            progress = load_audit_progress(module_dir) if not force else {}
            audit_order = get_audit_order(ctx)

            for doc_type in audit_order:
                if scope_doc_type and doc_type != scope_doc_type:
                    continue

                doc_file = module_dir / f"{doc_type}.md"
                if not doc_file.exists():
                    continue

                if progress.get(doc_type, {}).get("status") == "DONE":
                    logger.info("[audit] %s/%s: already DONE, skipping", name, doc_type)
                    continue

                update_audit_progress(module_dir, doc_type, "IN_PROGRESS")

                system_prompt = build_audit_system_prompt(doc_type)
                user_prompt = build_audit_user_prompt(doc_file, doc_type, module_dir, doc_truncate_chars)

                request = LlmRequest(system=system_prompt, user=user_prompt, max_tokens=max_tokens)
                try:
                    response = strategy.call(request)
                except Exception as e:
                    logger.error("[audit] LLM failed for %s/%s: %s", name, doc_type, e)
                    update_audit_progress(module_dir, doc_type, "FAILED")
                    total_audited += 1
                    continue

                if response.status != "done":
                    logger.warning("[audit] LLM returned status '%s' for %s/%s: %s",
                                   response.status, name, doc_type,
                                   response.error[:200] if response.error else "")
                    update_audit_progress(module_dir, doc_type, "FAILED")
                    total_audited += 1
                    continue

                result = parse_audit_response(response.content)
                if not result.success:
                    logger.warning("[audit] %s/%s: parse failed: %s", name, doc_type, result.error)
                    update_audit_progress(module_dir, doc_type, "FAILED")
                    # Count the attempt like the other failure paths, so a run
                    # in which every response is unparsable is reported as
                    # failed rather than as "Audited 0 docs".
                    total_audited += 1
                    continue

                key = f"{name}/{doc_type}"
                all_findings[key] = result.findings
                update_audit_progress(module_dir, doc_type, "DONE", len(result.findings))
                total_audited += 1

        ctx.state["audit_findings"] = all_findings
        total_issues = sum(
            len([f for f in findings if f.status == "fail"])
            for findings in all_findings.values()
        )

        # Documents were attempted but none yielded findings: surface that
        # explicitly instead of reporting a clean audit.
        if total_audited > 0 and not all_findings:
            return StepResult(
                status="ok",
                message=f"Attempted {total_audited} docs, all LLM calls failed (check LLM connection)",
                details={"total_audited": total_audited, "total_issues": 0, "all_failed": True},
            )

        return StepResult(
            status="ok",
            message=f"Audited {total_audited} docs, {total_issues} issues found",
            details={"total_audited": total_audited, "total_issues": total_issues},
        )

    def _delegated_mode(self, ctx: PipelineContext) -> StepResult:
        """Write ``.audit-manifest.json`` listing the documents to audit.

        For every module in ``ctx.state["module_repos"]`` whose knowledge
        directory exists, the manifest lists each existing ``<doc_type>.md``
        with its path relative to the knowledge directory and a template path.

        Returns:
            A "delegated" StepResult naming the manifest file.
        """
        # NOTE(review): unlike run(), the audit_scope / audit_module filters
        # are not applied here — confirm whether that is intended.
        module_repos: dict[str, Path] = ctx.state.get("module_repos", {})
        manifest = {"operation": "kb-audit", "modules": []}

        for name in module_repos:
            module_dir = ctx.knowledge_dir / name
            if not module_dir.is_dir():
                continue

            docs = []
            for doc_type in get_audit_order(ctx):
                doc_file = module_dir / f"{doc_type}.md"
                if doc_file.exists():
                    docs.append({
                        "doc_type": doc_type,
                        "doc_path": str(doc_file.relative_to(ctx.knowledge_dir)),
                        # NOTE(review): the template path is hard-coded to the
                        # java-spring preset regardless of the configured one.
                        "template": f"presets/java-spring/templates/subagent-audit-{doc_type}.md",
                    })

            if docs:
                manifest["modules"].append({"name": name, "documents": docs})

        manifest_path = ctx.knowledge_dir / ".audit-manifest.json"
        manifest_path.write_text(
            json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        return StepResult(
            status="delegated",
            message=f"Audit manifest written: {manifest_path}",
        )
@@ -0,0 +1,195 @@
1
+ """Audit fix and report steps — apply fixes and generate reports.
2
+
3
+ Steps:
4
+ - ApplyFixesStep: apply LLM-generated fixes to documents
5
+ - GenerateReportStep: generate audit summary report
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import logging
12
+ from pathlib import Path
13
+
14
+ from core.interfaces import Step, StepResult, PipelineContext
15
+ from engine.pipeline import register_step
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
@register_step
class ApplyFixesStep(Step):
    """Apply LLM-generated fixes to documents.

    Each failing finding that carries a fix is handed to
    ``_apply_single_fix``; the outcome of every attempt is collected in a
    fix log that is written to ``.audit-fixes.json``.
    """

    default_name = "apply-fixes"

    def __init__(self):
        super().__init__("apply-fixes")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Apply (or, in dry-run mode, preview) the fixes from the audit.

        Reads ``ctx.state["audit_findings"]`` (keyed ``"<module>/<doc_type>"``)
        and ``ctx.state["dry_run"]``. Writes ``ctx.state["fixes_applied"]`` and
        ``ctx.state["fixes_failed"]``.

        Returns:
            "skipped" when there are no findings, otherwise "ok" with
            applied / skipped / failed counts in the details.
        """
        findings = ctx.state.get("audit_findings", {})
        if not findings:
            return StepResult(status="skipped", message="No findings to apply")

        dry_run = ctx.state.get("dry_run", False)
        applied = 0
        skipped = 0
        failed = 0
        fix_log: list[dict] = []

        for key, finding_list in findings.items():
            parts = key.split("/", 1)
            if len(parts) != 2:
                continue
            module_name, doc_type = parts
            module_dir = ctx.knowledge_dir / module_name
            doc_path = module_dir / f"{doc_type}.md"

            if not doc_path.exists():
                continue

            for finding in finding_list:
                # Only failing findings that come with a fix are actionable.
                if finding.status != "fail" or not finding.fix:
                    skipped += 1
                    continue

                # The log file aggregates every module, so each entry names
                # its module to stay attributable.
                if dry_run:
                    fix_log.append({
                        "module": module_name,
                        "doc": doc_type, "dimension": finding.dimension,
                        "action": "would_apply", "preview": finding.fix[:200],
                    })
                    # Dry-run previews are counted as applied.
                    applied += 1
                    continue

                success = _apply_single_fix(doc_path, finding)
                if success:
                    applied += 1
                    fix_log.append({
                        "module": module_name,
                        "doc": doc_type, "dimension": finding.dimension,
                        "status": "applied",
                    })
                else:
                    failed += 1
                    fix_log.append({
                        "module": module_name,
                        "doc": doc_type, "dimension": finding.dimension,
                        "status": "failed",
                    })

        if fix_log:
            # The whole log is written once, into the first module directory
            # (in findings order) that exists.
            for key in findings:
                parts = key.split("/", 1)
                if len(parts) == 2:
                    module_dir = ctx.knowledge_dir / parts[0]
                    if module_dir.is_dir():
                        tracking_path = module_dir / ".audit-fixes.json"
                        tracking_path.write_text(
                            json.dumps(fix_log, ensure_ascii=False, indent=2),
                            encoding="utf-8",
                        )
                        break

        ctx.state["fixes_applied"] = applied
        ctx.state["fixes_failed"] = failed

        return StepResult(
            status="ok",
            message=f"Applied {applied} fixes, {skipped} skipped, {failed} failed",
            details={"applied": applied, "skipped": skipped, "failed": failed},
        )
105
+
106
+
107
@register_step
class GenerateReportStep(Step):
    """Generate audit summary report."""

    default_name = "generate-report"

    def __init__(self):
        super().__init__("generate-report")

    def run(self, ctx: PipelineContext) -> StepResult:
        """Write ``.audit-report.md`` and clear per-module audit progress.

        Reads ``audit_findings``, ``fixes_applied``, ``fixes_failed``,
        ``audit_scope``, ``audit_module`` and ``module_repos`` from
        ``ctx.state``. After the report is written, each module's
        ``.audit-progress.json`` is deleted if present.

        Returns:
            An "ok" StepResult naming the report path and the issue count.
        """
        findings = ctx.state.get("audit_findings", {})
        fixes_applied = ctx.state.get("fixes_applied", 0)
        fixes_failed = ctx.state.get("fixes_failed", 0)
        report_path = ctx.knowledge_dir / ".audit-report.md"

        # `or "all"` also covers a key that is present but None/empty, the
        # same way the module scope is resolved on the next line.
        scope_doc = ctx.state.get("audit_scope") or "all"
        scope_mod = ctx.state.get("audit_module") or ctx.module or "all"

        lines = ["# Audit Report", ""]
        lines.append(f"Knowledge base: {ctx.kb_name}")
        lines.append(f"Scope: {scope_doc}/{scope_mod}")
        lines.append(f"Fixes: {fixes_applied} applied, {fixes_failed} failed")
        lines.append("")

        total_issues = 0
        for key, finding_list in sorted(findings.items()):
            fails = [f for f in finding_list if f.status == "fail"]
            passes = [f for f in finding_list if f.status == "pass"]
            total_issues += len(fails)

            lines.append(f"## {key}")
            lines.append(f"- Pass: {len(passes)}, Fail: {len(fails)}")
            lines.append("")

            if fails:
                # At most ten failures are listed per document.
                for f in fails[:10]:
                    # NOTE(review): "(fixed)" is shown whenever a fix was
                    # proposed, whether or not it was actually applied.
                    fix_indicator = " (fixed)" if f.fix else ""
                    lines.append(f" - [{f.dimension}] {f.detail}{fix_indicator}")
                if len(fails) > 10:
                    lines.append(f" - ... and {len(fails) - 10} more")
                lines.append("")

        lines.append("---")
        lines.append(f"Total: {total_issues} issues")

        report_path.write_text("\n".join(lines), encoding="utf-8")

        # The audit is complete: drop resume markers so the next audit
        # starts from scratch.
        module_repos = ctx.state.get("module_repos", {})
        for name in module_repos:
            module_dir = ctx.knowledge_dir / name
            progress_file = module_dir / ".audit-progress.json"
            if progress_file.exists():
                progress_file.unlink()

        return StepResult(status="ok", message=f"Report: {report_path} ({total_issues} issues)")
162
+
163
+
164
+ # ---------------------------------------------------------------------------
165
+ # Helpers
166
+ # ---------------------------------------------------------------------------
167
+
168
+
169
def _apply_single_fix(doc_path: Path, finding) -> bool:
    """Apply the fix carried by one audit finding to the document.

    A fix whose first line is a level-2 or level-3 markdown heading is
    applied as a section: the section is replaced, or appended when the
    replacement reports failure. Any other fix is treated as an inline fix,
    which is only logged and reported as not applied.

    Args:
        doc_path: Markdown document to modify.
        finding: Audit finding; its ``fix`` and ``dimension`` are read.

    Returns:
        True when a section was replaced or appended, False otherwise.
    """
    from core.docs.section_updater import replace_section, append_section
    from core.utils import validate_path_within_bounds

    fix_content = finding.fix
    if not fix_content:
        return False

    # NOTE(review): the bound is the document's own parent directory, so this
    # check looks trivially satisfied — confirm the intended base directory.
    if not validate_path_within_bounds(doc_path, doc_path.parent):
        logger.warning("[apply-fix] Path traversal blocked: %s", doc_path)
        return False

    heading, *body_lines = fix_content.strip().split("\n")

    if not heading.startswith(("## ", "### ")):
        logger.info("[apply-fix] Inline fix for %s/%s (deferred): %s",
                    doc_path.name, finding.dimension, fix_content[:100])
        return False

    body = "\n".join(body_lines).strip()
    if replace_section(doc_path, heading, body):
        return True
    # Nothing was replaced, so add the section instead.
    return append_section(doc_path, heading, body)