source-kb 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +50 -0
- cli/__main__.py +5 -0
- cli/commands/__init__.py +1 -0
- cli/commands/anchor_fix.py +47 -0
- cli/commands/diff_doc.py +52 -0
- cli/commands/dispatch.py +77 -0
- cli/commands/extract.py +72 -0
- cli/commands/file_list.py +74 -0
- cli/commands/index.py +84 -0
- cli/commands/lock.py +89 -0
- cli/commands/merge.py +60 -0
- cli/commands/merge_delta.py +19 -0
- cli/commands/metadata.py +24 -0
- cli/commands/pipeline.py +45 -0
- cli/commands/post_merge.py +43 -0
- cli/commands/query.py +52 -0
- cli/commands/render.py +101 -0
- cli/commands/scan_repos.py +46 -0
- cli/commands/setup.py +94 -0
- cli/commands/split.py +196 -0
- cli/commands/stale_files.py +98 -0
- cli/commands/validate.py +191 -0
- core/__init__.py +32 -0
- core/config.py +261 -0
- core/docs/__init__.py +7 -0
- core/docs/section_updater.py +286 -0
- core/docs/shared.py +149 -0
- core/git.py +294 -0
- core/interfaces.py +249 -0
- core/monitor/__init__.py +5 -0
- core/monitor/progress.py +83 -0
- core/monitor/prompt_store.py +49 -0
- core/paths.py +141 -0
- core/preset.py +237 -0
- core/preset_accessors.py +202 -0
- core/preset_classify.py +132 -0
- core/preset_hooks.py +129 -0
- core/preset_profile.py +89 -0
- core/prompt/__init__.py +7 -0
- core/prompt/__main__.py +147 -0
- core/prompt/content.py +320 -0
- core/prompt/context_manager.py +164 -0
- core/prompt/renderer.py +236 -0
- core/prompt/response_parser.py +274 -0
- core/prompt/templates.py +357 -0
- core/prompt/validate_parity.py +162 -0
- core/prompt/variables.py +339 -0
- core/rag/__init__.py +22 -0
- core/rag/__main__.py +136 -0
- core/rag/bm25_index.py +268 -0
- core/rag/chunker.py +273 -0
- core/rag/embedder.py +151 -0
- core/rag/indexer.py +292 -0
- core/rag/loader.py +89 -0
- core/rag/retriever.py +82 -0
- core/skeleton/__init__.py +11 -0
- core/skeleton/__main__.py +934 -0
- core/skeleton/anchor_fix.py +250 -0
- core/skeleton/classify.py +331 -0
- core/skeleton/cmd_anchor_fix.py +43 -0
- core/skeleton/cmd_diff_doc.py +44 -0
- core/skeleton/cmd_lock.py +87 -0
- core/skeleton/cmd_merge_delta.py +41 -0
- core/skeleton/community.py +233 -0
- core/skeleton/dependency_graph.py +306 -0
- core/skeleton/diff_doc.py +248 -0
- core/skeleton/dispatch.py +273 -0
- core/skeleton/dispatch_render.py +319 -0
- core/skeleton/dispatch_source.py +111 -0
- core/skeleton/extract.py +218 -0
- core/skeleton/extract_methods.py +298 -0
- core/skeleton/file_list.py +239 -0
- core/skeleton/impact.py +278 -0
- core/skeleton/jar_download.py +177 -0
- core/skeleton/jar_resolver.py +186 -0
- core/skeleton/loader.py +162 -0
- core/skeleton/merge.py +278 -0
- core/skeleton/merge_delta.py +229 -0
- core/skeleton/metadata.py +96 -0
- core/skeleton/metadata_builders.py +264 -0
- core/skeleton/module_dag.py +330 -0
- core/skeleton/parsers/__init__.py +71 -0
- core/skeleton/parsers/jqassistant.py +300 -0
- core/skeleton/parsers/jqassistant_cypher.py +225 -0
- core/skeleton/parsers/regex.py +171 -0
- core/skeleton/parsers/treesitter.py +324 -0
- core/skeleton/parsers/treesitter_java.py +284 -0
- core/skeleton/parsers/treesitter_multi.py +289 -0
- core/skeleton/pom_parser.py +299 -0
- core/skeleton/post_merge.py +295 -0
- core/skeleton/post_merge_llm.py +82 -0
- core/skeleton/query.py +195 -0
- core/skeleton/shard_context.py +177 -0
- core/skeleton/split.py +180 -0
- core/skeleton/split_cache.py +107 -0
- core/skeleton/split_feedback.py +174 -0
- core/skeleton/split_plan.py +219 -0
- core/skeleton/split_plan_helpers.py +305 -0
- core/skeleton/split_plan_llm.py +274 -0
- core/utils.py +135 -0
- core/validators/__init__.py +65 -0
- core/validators/__main__.py +215 -0
- core/validators/consistency.py +203 -0
- core/validators/coverage.py +171 -0
- core/validators/duplicates.py +76 -0
- core/validators/engine.py +224 -0
- core/validators/links.py +76 -0
- core/validators/sampling.py +169 -0
- core/validators/structure.py +144 -0
- engine/__init__.py +7 -0
- engine/assembler.py +231 -0
- engine/confirm.py +65 -0
- engine/dedup.py +106 -0
- engine/main.py +211 -0
- engine/pipeline/__init__.py +163 -0
- engine/pipeline/recovery.py +250 -0
- engine/pipeline/steps/__init__.py +23 -0
- engine/pipeline/steps/audit.py +220 -0
- engine/pipeline/steps/audit_apply.py +195 -0
- engine/pipeline/steps/audit_helpers.py +155 -0
- engine/pipeline/steps/classify_llm.py +236 -0
- engine/pipeline/steps/classify_prompt.py +223 -0
- engine/pipeline/steps/finalize.py +160 -0
- engine/pipeline/steps/generate.py +169 -0
- engine/pipeline/steps/generate_batch.py +197 -0
- engine/pipeline/steps/generate_recovery.py +170 -0
- engine/pipeline/steps/llm_plan_split.py +253 -0
- engine/pipeline/steps/lock.py +64 -0
- engine/pipeline/steps/preflight.py +237 -0
- engine/pipeline/steps/preflight_adjust.py +147 -0
- engine/pipeline/steps/pregenerate.py +130 -0
- engine/pipeline/steps/quality.py +81 -0
- engine/pipeline/steps/skeleton.py +149 -0
- engine/pipeline/steps/source.py +163 -0
- engine/pipeline/steps/sync.py +117 -0
- engine/pipeline/steps/sync_finalize.py +237 -0
- engine/pipeline/steps/sync_update.py +341 -0
- engine/pipelines.py +91 -0
- engine/runner.py +335 -0
- engine/strategies/__init__.py +86 -0
- engine/strategies/api.py +128 -0
- engine/strategies/delegated.py +50 -0
- engine/strategies/dryrun.py +25 -0
- engine/two_phase.py +143 -0
- mcp_server/__init__.py +73 -0
- mcp_server/__main__.py +5 -0
- mcp_server/tools/__init__.py +1 -0
- mcp_server/tools/config.py +63 -0
- mcp_server/tools/discovery.py +276 -0
- mcp_server/tools/generation.py +184 -0
- mcp_server/tools/planning.py +144 -0
- mcp_server/tools/source.py +175 -0
- mcp_server/tools/validation.py +140 -0
- mcp_server/tools/workflow.py +166 -0
- mcp_server/workflow_loader.py +204 -0
- presets/generic/audit_dimensions.md +132 -0
- presets/generic/doc_types.yaml +152 -0
- presets/generic/preset.yaml +115 -0
- presets/java-spring/audit_dimensions.md +228 -0
- presets/java-spring/audit_dimensions.yaml +203 -0
- presets/java-spring/doc_types.yaml +269 -0
- presets/java-spring/hooks.py +122 -0
- presets/java-spring/preset.yaml +341 -0
- presets/java-spring/templates/README.md +34 -0
- presets/java-spring/templates/audit-system.md +15 -0
- presets/java-spring/templates/subagent-aop.md +105 -0
- presets/java-spring/templates/subagent-api.md +63 -0
- presets/java-spring/templates/subagent-architecture.md +111 -0
- presets/java-spring/templates/subagent-async-events.md +107 -0
- presets/java-spring/templates/subagent-audit-api-contracts.md +40 -0
- presets/java-spring/templates/subagent-audit-architecture.md +38 -0
- presets/java-spring/templates/subagent-audit-business.md +40 -0
- presets/java-spring/templates/subagent-audit-data-models.md +40 -0
- presets/java-spring/templates/subagent-business.md +129 -0
- presets/java-spring/templates/subagent-caching.md +75 -0
- presets/java-spring/templates/subagent-database-access.md +114 -0
- presets/java-spring/templates/subagent-enum.md +75 -0
- presets/java-spring/templates/subagent-error-handling.md +91 -0
- presets/java-spring/templates/subagent-external-integrations.md +80 -0
- presets/java-spring/templates/subagent-index.md +122 -0
- presets/java-spring/templates/subagent-messaging.md +97 -0
- presets/java-spring/templates/subagent-model.md +88 -0
- presets/java-spring/templates/subagent-observability.md +91 -0
- presets/java-spring/templates/subagent-scheduled.md +81 -0
- presets/java-spring/templates/subagent-security.md +102 -0
- presets/java-spring/templates/subagent-structure.md +101 -0
- presets/java-spring/templates/subagent-sync-section.md +34 -0
- presets/java-spring/templates/subagent-utils.md +73 -0
- presets/java-spring/templates/sync-system.md +8 -0
- presets/java-spring/workflow-extensions.md +112 -0
- skills/__init__.py +1 -0
- skills/_shared/README.md +30 -0
- skills/_shared/doc-coverage-shared.md +134 -0
- skills/_shared/doc-quality-standard.md +1058 -0
- skills/_shared/doc-subagent-rules.md +762 -0
- skills/_shared/windows-compat.md +89 -0
- skills/kb-audit/SKILL.md +52 -0
- skills/kb-audit/rules.md +88 -0
- skills/kb-audit/steps/step-01-prepare.md +75 -0
- skills/kb-audit/steps/step-02-audit.md +96 -0
- skills/kb-audit/steps/step-03-verify.md +65 -0
- skills/kb-audit/steps/step-04-report.md +64 -0
- skills/kb-init/SKILL.md +146 -0
- skills/kb-init/rules.md +187 -0
- skills/kb-init/steps/step-01-scope.md +62 -0
- skills/kb-init/steps/step-02-source.md +410 -0
- skills/kb-init/steps/step-03-generate.md +307 -0
- skills/kb-init/steps/step-04-quality.md +92 -0
- skills/kb-init/steps/step-05-finalize.md +132 -0
- skills/kb-init/templates/core/execution-modes.md +29 -0
- skills/kb-init/templates/core/output-only.md +4 -0
- skills/kb-init/templates/core/readwrite.md +33 -0
- skills/kb-search/SKILL.md +138 -0
- skills/kb-search/rules.md +64 -0
- skills/kb-sync/SKILL.md +43 -0
- skills/kb-sync/rules.md +70 -0
- skills/kb-sync/scripts/rebuild_module.py +91 -0
- skills/kb-sync/scripts/scan_repos.py +687 -0
- skills/kb-sync/steps/step-01-detect.md +72 -0
- skills/kb-sync/steps/step-02-update.md +71 -0
- skills/kb-sync/steps/step-03-verify.md +47 -0
- skills/kb-sync/steps/step-04-finalize.md +52 -0
- source_kb-0.2.2.dist-info/METADATA +194 -0
- source_kb-0.2.2.dist-info/RECORD +228 -0
- source_kb-0.2.2.dist-info/WHEEL +5 -0
- source_kb-0.2.2.dist-info/entry_points.txt +3 -0
- source_kb-0.2.2.dist-info/licenses/LICENSE +21 -0
- source_kb-0.2.2.dist-info/top_level.txt +6 -0
engine/dedup.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""LLM-based document deduplication — CLI-only post-merge optimization.
|
|
2
|
+
|
|
3
|
+
Uses LLM to intelligently remove cross-document redundancy:
|
|
4
|
+
- Internal dedup: repeated tables/paragraphs within a document
|
|
5
|
+
- Cross-doc dedup: content that belongs in another document → replace with reference
|
|
6
|
+
|
|
7
|
+
Only processes documents exceeding a configurable size threshold.
|
|
8
|
+
Agent mode uses rule-based refinement (core/skeleton/merge.py) instead.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
import re
|
|
15
|
+
import time
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
from core.interfaces import LlmRequest, LlmStrategy
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)

# Size thresholds in kilobytes (intended to be sourced from config in production).
# MIN_SIZE_KB: documents smaller than this are left untouched by dedup_document.
MIN_SIZE_KB = 15
# NOTE(review): CHUNK_THRESHOLD_KB is not referenced in the visible part of this
# module — presumably reserved for chunked processing; confirm before relying on it.
CHUNK_THRESHOLD_KB = 60

# System prompt sent with every dedup request; one entry per prompt line.
SYSTEM_PROMPT = "\n".join([
    "You are a technical documentation editing expert. Your task is to deduplicate and streamline a knowledge base document.",
    "Rules:",
    "1. Internal dedup: if the same content appears multiple times, keep only the most complete version",
    "2. Cross-document dedup: replace content that belongs in another document with a cross-reference `> See [xxx.md](./xxx.md)`",
    "3. Preserve structure: keep heading hierarchy, do not add new content",
    "4. Merge conflicting data: keep the more specific version",
    "Output the optimized full markdown document content directly.",
])
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def dedup_document(
    doc_path: Path,
    sibling_docs: list[str],
    strategy: LlmStrategy,
) -> tuple[bool, float]:
    """Deduplicate a single document using LLM.

    The document is read, sent to the LLM together with the names of its
    sibling documents, and overwritten in place when the response passes the
    size sanity check. All sizes are measured in UTF-8 bytes so that the KB
    thresholds and the reported savings are accurate for multi-byte text.

    Args:
        doc_path: Path to the document to deduplicate
        sibling_docs: Names of other docs in the same module (for cross-doc reference)
        strategy: LLM execution strategy

    Returns:
        (changed: bool, saved_kb: float) — whether content changed and KB saved.
        ``(False, 0.0)`` is returned when the document is below ``MIN_SIZE_KB``,
        the LLM call did not finish with content, or the result was rejected.
    """
    content = doc_path.read_text(encoding="utf-8")
    # Count bytes, not characters: len(str) under-counts multi-byte text (e.g.
    # CJK), which made large documents look smaller than the KB threshold.
    original_size = len(content.encode("utf-8"))

    if original_size < MIN_SIZE_KB * 1024:
        return False, 0.0

    user_prompt = _build_prompt(doc_path.name, content, sibling_docs)

    resp = strategy.call(LlmRequest(
        system=SYSTEM_PROMPT,
        user=user_prompt,
        max_tokens=16384,
        temperature=0.1,
    ))

    if resp.status != "done" or not resp.content:
        return False, 0.0

    new_content = _strip_fence(resp.content)
    new_size = len(new_content.encode("utf-8"))

    # Safety: reject if too short (lost content) or longer (added content)
    if new_size < original_size * 0.3 or new_size >= original_size:
        return False, 0.0

    # Keep the file newline-terminated; _strip_fence() strips trailing whitespace.
    if not new_content.endswith("\n"):
        new_content += "\n"
    doc_path.write_text(new_content, encoding="utf-8")

    saved_kb = (original_size - new_size) / 1024
    logger.info("[dedup] %s: %.1fKB → %.1fKB (saved %.1fKB)",
                doc_path.name, original_size / 1024, new_size / 1024, saved_kb)
    return True, saved_kb
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _build_prompt(doc_name: str, content: str, siblings: list[str]) -> str:
    """Assemble the user prompt for one document.

    The prompt contains the file name, the back-quoted names of all sibling
    documents other than ``doc_name``, the document body inside a markdown
    fence, and the closing instruction.
    """
    other_docs = [f"`{name}`" for name in siblings if name != doc_name]
    sections = [
        f"Filename: `{doc_name}`\n",
        "Sibling documents in same directory: " + ", ".join(other_docs) + "\n\n",
        "```markdown\n" + content + "\n```\n\n",
        "Output the deduplicated and optimized full document content.",
    ]
    return "".join(sections)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _strip_fence(text: str) -> str:
    """Remove markdown code fence wrapper from LLM output.

    A leading ```` ```markdown ````, ```` ```md ```` or bare ```` ``` ```` is
    dropped. A trailing ```` ``` ```` is dropped only when it is unpaired, so a
    document that legitimately ends with a fenced code block keeps its
    closing fence.

    Args:
        text: Raw LLM response.

    Returns:
        The response with surrounding whitespace and the wrapper fence removed.
    """
    content = text.strip()
    # Longest prefixes first so "```markdown" is not consumed as a bare "```".
    for prefix in ("```markdown", "```md", "```"):
        if content.startswith(prefix):
            content = content[len(prefix):].strip()
            break
    # Code blocks inside the document contribute fence markers in pairs. An odd
    # total therefore means the final marker is the wrapper's closing fence;
    # an even total means it closes a real code block and must be kept.
    if content.endswith("```") and content.count("```") % 2 == 1:
        content = content[:-3].strip()
    return content
|
engine/main.py
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""CLI entry point — simplified commands with sensible defaults.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
python -m cli.main init --kb my-kb
|
|
5
|
+
python -m cli.main sync --kb my-kb
|
|
6
|
+
python -m cli.main audit --kb my-kb
|
|
7
|
+
python -m cli.main search --kb my-kb "query"
|
|
8
|
+
python -m cli.main index --kb my-kb
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import logging
|
|
15
|
+
import sys
|
|
16
|
+
|
|
17
|
+
from core import __version__, setup_logging
|
|
18
|
+
|
|
19
|
+
# Force UTF-8 output: this module prints non-ASCII characters ("—", "→").
# Guarded because sys.stdout may be None or a replacement stream that has no
# reconfigure() method; importing the module must not fail in that case.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def cmd_init(args: argparse.Namespace) -> None:
    """Execute the kb-init pipeline.

    With ``--dry-run`` only the planned steps are printed. With ``--resume``
    a checkpoint is loaded from the knowledge directory and handed to the
    pipeline. The process exits with status 1 when the result did not pass.
    """
    _config, ctx = _setup(args)
    from engine.pipelines import build_init_pipeline
    from engine.pipeline.recovery import load_checkpoint, should_skip_step

    init_pipeline = build_init_pipeline()
    if args.dry_run:
        _print_plan(init_pipeline)
        return

    # Resume support: a checkpoint is only looked up when explicitly requested.
    checkpoint = load_checkpoint(ctx.knowledge_dir) if args.resume else None
    if args.resume:
        if checkpoint is not None:
            print(f"Resuming from: {checkpoint.next_step}")
        else:
            print("No checkpoint found — starting from scratch")

    outcome = init_pipeline.execute(ctx, checkpoint=checkpoint)
    _print_result(outcome)
    if outcome.passed:
        return
    sys.exit(1)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def cmd_sync(args: argparse.Namespace) -> None:
    """Execute the kb-sync pipeline.

    ``--dry-run`` is forwarded to the steps through ``ctx.state["dry_run"]``.
    The process exits with status 1 when the result did not pass.
    """
    _config, ctx = _setup(args)
    from engine.pipelines import build_sync_pipeline

    dry_run_requested = getattr(args, "dry_run", False)
    if dry_run_requested:
        ctx.state["dry_run"] = True

    sync_pipeline = build_sync_pipeline()
    outcome = sync_pipeline.execute(ctx)
    _print_result(outcome)
    if outcome.passed:
        return
    sys.exit(1)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def cmd_audit(args: argparse.Namespace) -> None:
    """Execute the kb-audit pipeline.

    The CLI options are forwarded through ``ctx.state``: ``dry_run``,
    ``audit_scope`` and ``audit_force`` are set only when the corresponding
    option was given. The process exits with status 1 when the result did
    not pass.
    """
    _config, ctx = _setup(args)
    from engine.pipelines import build_audit_pipeline

    if getattr(args, "dry_run", False):
        ctx.state["dry_run"] = True
    scope = getattr(args, "scope", None)
    if scope:
        ctx.state["audit_scope"] = scope
    if getattr(args, "force", False):
        ctx.state["audit_force"] = True

    audit_pipeline = build_audit_pipeline()
    outcome = audit_pipeline.execute(ctx)
    _print_result(outcome)
    if outcome.passed:
        return
    sys.exit(1)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def cmd_search(args: argparse.Namespace) -> None:
    """Query the knowledge base and print the ranked hits.

    Each hit is printed as a header line (rank, source, optional section and
    score) followed by the first 200 characters of its text.
    """
    config, _ = _setup(args, need_ctx=False)
    from core.rag.retriever import retrieve

    hits = retrieve(args.query, config, kb_name=args.kb)
    if not hits:
        print(f"No results for '{args.query}'")
        return

    for rank, hit in enumerate(hits, start=1):
        score = hit["score"]
        meta = hit["metadata"]
        source = meta.get("source", "?")
        section = meta.get("section", "")
        # The section is appended only when the hit carries one.
        title = f"[{rank}] {source}" + (f" > {section}" if section else "")
        print(f"{title} (score: {score:.3f})")
        print(f" {hit['text'][:200]}...")
        print()
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def cmd_index(args: argparse.Namespace) -> None:
    """Build or rebuild the vector index for one knowledge base.

    Documents are loaded from the knowledge directory configured for
    ``args.kb``, chunked, and indexed into the configured collection.
    Nothing is built when no documents are found.
    """
    config, _ = _setup(args, need_ctx=False)
    from core.rag.loader import load_documents
    from core.rag.chunker import chunk_documents
    from core.rag.indexer import build_index

    kb_settings = config.get_kb(args.kb)
    knowledge_dir = kb_settings["knowledge_dir"]
    collection_name = kb_settings["collection"]

    documents = load_documents(knowledge_dir)
    if not documents:
        print(f"No documents found in {knowledge_dir}")
        return

    chunks = chunk_documents(documents)
    chunk_count = len(chunks)
    print(f"Loaded {len(documents)} docs, {chunk_count} chunks. Building index...")
    build_index(chunks, collection_name, config, kb_name=args.kb)
    print(f"Index built: {len(chunks)} chunks → collection '{collection_name}'")
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# ---------------------------------------------------------------------------
|
|
126
|
+
# Helpers
|
|
127
|
+
# ---------------------------------------------------------------------------
|
|
128
|
+
|
|
129
|
+
def _setup(args: argparse.Namespace, need_ctx: bool = True):
    """Load the project config and, when requested, build the pipeline context.

    The config path comes from ``args.config`` when given, otherwise from
    ``find_config()``. Returns ``(config, ctx)``; ``ctx`` is ``None`` when
    ``need_ctx`` is false.
    """
    from core.config import load_config, find_config
    from core.interfaces import PipelineContext
    from pathlib import Path

    if args.config:
        config_path = Path(args.config)
    else:
        config_path = find_config()
    config = load_config(config_path)

    if need_ctx:
        # Not every sub-command defines --module, hence the getattr default.
        ctx = PipelineContext.from_config(
            config.raw,
            args.kb,
            module=getattr(args, "module", None),
        )
        return config, ctx
    return config, None
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _print_plan(pipeline) -> None:
    """Print the pipeline's name, step count and one line per step, without running it."""
    print(f"[pipeline] {pipeline.name} — {len(pipeline.steps)} steps:")
    for entry in pipeline.describe():
        suffix = ""
        if entry["checkpoint"]:
            # Only steps that declare a checkpoint get the annotation.
            suffix = f" (checkpoint={entry['checkpoint']})"
        print(f" {entry['name']}{suffix}")
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _print_result(result) -> None:
    """Print the result summary followed by one marker line per step.

    Statuses without a known marker are shown with ``?``.
    """
    status_markers = {"ok": "+", "skipped": "-", "failed": "!", "delegated": "~"}
    print(f"\n{result.summary()}")
    for step_name, step_result in result.step_results:
        if step_result.status in status_markers:
            symbol = status_markers[step_result.status]
        else:
            symbol = "?"
        print(f" [{symbol}] {step_name}: {step_result.message}")
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def main():
    """Parse the command line and dispatch to the matching ``cmd_*`` handler.

    Prints the help text and exits with status 1 when no sub-command is given.
    """
    parser = argparse.ArgumentParser(description="source-kb CLI")
    parser.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    parser.add_argument("--config", help="kb-project.yaml path (auto-detected)")
    subparsers = parser.add_subparsers(dest="command")

    # init
    init_parser = subparsers.add_parser("init", help="Initialize knowledge base")
    init_parser.add_argument("--kb", required=True)
    init_parser.add_argument("--module", default=None)
    init_parser.add_argument("--dry-run", action="store_true")
    init_parser.add_argument("--resume", action="store_true", help="Resume from last checkpoint")

    # sync
    sync_parser = subparsers.add_parser("sync", help="Incremental sync")
    sync_parser.add_argument("--kb", required=True)
    sync_parser.add_argument("--module", default=None)
    sync_parser.add_argument("--dry-run", action="store_true", help="Preview changes without writing")

    # audit
    audit_parser = subparsers.add_parser("audit", help="Document quality audit")
    audit_parser.add_argument("--kb", required=True)
    audit_parser.add_argument("--module", default=None)
    audit_parser.add_argument("--scope", default=None, help="Audit only this doc type (e.g. business-logic)")
    audit_parser.add_argument("--dry-run", action="store_true", help="Preview without applying fixes")
    audit_parser.add_argument("--force", action="store_true", help="Ignore progress, re-audit all")

    # search
    search_parser = subparsers.add_parser("search", help="Search knowledge base")
    search_parser.add_argument("--kb", required=True)
    search_parser.add_argument("query")

    # index
    index_parser = subparsers.add_parser("index", help="Build/rebuild index")
    index_parser.add_argument("--kb", required=True)

    args = parser.parse_args()
    if not args.command:
        parser.print_help()
        sys.exit(1)

    setup_logging()

    # Sub-command name -> handler function.
    handlers = {
        "init": cmd_init,
        "sync": cmd_sync,
        "audit": cmd_audit,
        "search": cmd_search,
        "index": cmd_index,
    }
    handler = handlers[args.command]
    handler(args)


if __name__ == "__main__":
    main()
|
|
engine/pipeline/__init__.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""CLI pipeline framework — Pipeline class, PipelineBuilder, step registry.
|
|
2
|
+
|
|
3
|
+
Orchestrates Step sequences with checkpoint validation, recovery, and rollback.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
from engine.pipeline import Pipeline, PipelineBuilder, register_step
|
|
7
|
+
|
|
8
|
+
pipeline = PipelineBuilder("kb-init").add("fetch-source").add("extract-skeleton").build()
|
|
9
|
+
result = pipeline.execute(ctx)
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import time
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from core.interfaces import Step, StepResult, PipelineContext
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
# Step registry
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
# Maps a step name to the step class registered under it.
_STEP_REGISTRY: dict[str, type[Step]] = {}


def register_step(cls: type[Step]) -> type[Step]:
    """Class decorator that records ``cls`` in the step registry.

    The registry key is the class attribute ``default_name`` when present,
    otherwise the class name. The class itself is returned unchanged.
    """
    if hasattr(cls, "default_name"):
        key = cls.default_name
    else:
        key = cls.__name__
    _STEP_REGISTRY[key] = cls
    return cls


def get_step(name: str) -> Step:
    """Create a new instance of the step registered under ``name``.

    Raises:
        KeyError: if no step is registered under ``name``; the message lists
            the registered names in sorted order.
    """
    step_cls = _STEP_REGISTRY.get(name)
    if step_cls is None:
        available = sorted(_STEP_REGISTRY)
        raise KeyError(f"Step '{name}' not registered. Available: {available}")
    return step_cls()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
# PipelineResult
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
|
|
50
|
+
@dataclass
class PipelineResult:
    """Collected outcome of one pipeline run."""

    # Name of the pipeline that produced this result.
    pipeline_name: str
    # (step name, step result) pairs in the order they were recorded.
    step_results: list[tuple[str, StepResult]] = field(default_factory=list)
    # Duration of the run in seconds.
    elapsed: float = 0.0

    @property
    def passed(self) -> bool:
        """True when every recorded step is ok, skipped or delegated (also when empty)."""
        for _, outcome in self.step_results:
            if not (outcome.ok or outcome.skipped or outcome.delegated):
                return False
        return True

    @property
    def has_delegated(self) -> bool:
        """True when at least one recorded step was delegated."""
        for _, outcome in self.step_results:
            if outcome.delegated:
                return True
        return False

    @property
    def failed_steps(self) -> list[tuple[str, StepResult]]:
        """The (name, result) pairs whose status is ``"failed"``."""
        failures = []
        for step_name, outcome in self.step_results:
            if outcome.status == "failed":
                failures.append((step_name, outcome))
        return failures

    def summary(self) -> str:
        """One-line PASS/FAIL summary with ok/skipped/failed counts and elapsed time."""
        outcomes = [outcome for _, outcome in self.step_results]
        ok_count = len([o for o in outcomes if o.ok])
        skipped_count = len([o for o in outcomes if o.skipped])
        failed_count = len([o for o in outcomes if o.status == "failed"])
        verdict = "PASS" if self.passed else "FAIL"
        return f"{verdict}: {ok_count} ok, {skipped_count} skipped, {failed_count} failed ({self.elapsed:.1f}s)"
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
# Pipeline
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
|
|
81
|
+
class Pipeline:
    """Executes a sequence of Steps with error handling and rollback.

    Steps run in order. A step that fails (or raises) stops the run and
    triggers ``rollback()`` on the failed step and on every step that
    completed before it, in reverse order.
    """

    def __init__(self, name: str, steps: list[Step]):
        self.name = name
        self.steps = steps

    def execute(self, ctx: PipelineContext, checkpoint=None) -> PipelineResult:
        """Run all steps against ``ctx`` and collect their results.

        Args:
            ctx: Context object passed to every ``step.run()`` / ``step.rollback()``.
            checkpoint: Optional checkpoint; steps for which
                ``should_skip_step(checkpoint, step.name)`` is true are
                recorded as skipped instead of being run.

        Returns:
            A PipelineResult with one entry per step that was run or skipped.
            Steps after a failed step are not recorded.
        """
        result = PipelineResult(pipeline_name=self.name)
        # Monotonic clock: the measured duration must not be affected by
        # wall-clock adjustments during the run.
        t0 = time.monotonic()
        executed: list[Step] = []
        short_circuit = False

        # Determine steps to skip if resuming from checkpoint
        skip_steps: set[str] = set()
        if checkpoint is not None:
            from engine.pipeline.recovery import should_skip_step
            skip_steps = {s.name for s in self.steps if should_skip_step(checkpoint, s.name)}

        for step in self.steps:
            if short_circuit:
                result.step_results.append((step.name, StepResult(status="skipped", message="short-circuited")))
                continue

            if step.name in skip_steps:
                result.step_results.append((step.name, StepResult(status="skipped", message="resumed (checkpoint)")))
                continue

            logger.info("[%s] running...", step.name)
            try:
                sr = step.run(ctx)
            except Exception as e:
                # Boundary handler: convert the exception into a failed result,
                # but keep the traceback in the log. str(e) is empty for
                # exceptions raised without arguments, so fall back to the
                # exception type name to keep the message informative.
                logger.exception("[%s] step raised an exception", step.name)
                sr = StepResult(status="failed", message=str(e) or type(e).__name__)

            result.step_results.append((step.name, sr))
            logger.info("[%s] %s", step.name, sr)

            if sr.ok or sr.delegated:
                executed.append(step)
            elif sr.skipped:
                # A skipped step may request that all remaining steps are skipped too.
                if sr.details.get("short_circuit"):
                    short_circuit = True
            else:
                # Failed — rollback (the failed step is included so it can
                # undo any partial work).
                executed.append(step)
                self._rollback(executed, ctx)
                break

        result.elapsed = time.monotonic() - t0
        return result

    @staticmethod
    def _rollback(executed: list[Step], ctx: PipelineContext) -> None:
        """Call ``rollback(ctx)`` on the given steps in reverse order.

        A rollback that raises is logged with its traceback and does not
        prevent the remaining rollbacks from running.
        """
        for step in reversed(executed):
            try:
                step.rollback(ctx)
            except Exception as e:
                logger.exception("[%s] rollback error: %s", step.name, e)

    def describe(self) -> list[dict[str, str]]:
        """Return one ``{"name", "checkpoint"}`` dict per step; a missing checkpoint becomes ``""``."""
        return [{"name": s.name, "checkpoint": s.checkpoint or ""} for s in self.steps]
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ---------------------------------------------------------------------------
|
|
145
|
+
# PipelineBuilder
|
|
146
|
+
# ---------------------------------------------------------------------------
|
|
147
|
+
|
|
148
|
+
class PipelineBuilder:
    """Declarative pipeline composition from registered steps."""

    def __init__(self, name: str):
        self._name = name
        self._steps: list[Step] = []

    def add(self, step: Step | str) -> PipelineBuilder:
        """Add a step (instance or registered name).

        A string is resolved through ``get_step()``, which raises KeyError for
        unknown names. Returns ``self`` so calls can be chained.
        """
        if isinstance(step, str):
            step = get_step(step)
        self._steps.append(step)
        return self

    def build(self) -> Pipeline:
        """Create a Pipeline from the steps added so far.

        The pipeline receives its own copy of the step list, so adding more
        steps to the builder afterwards does not alter an already built pipeline.
        """
        return Pipeline(self._name, list(self._steps))
|