source-kb 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +50 -0
- cli/__main__.py +5 -0
- cli/commands/__init__.py +1 -0
- cli/commands/anchor_fix.py +47 -0
- cli/commands/diff_doc.py +52 -0
- cli/commands/dispatch.py +77 -0
- cli/commands/extract.py +72 -0
- cli/commands/file_list.py +74 -0
- cli/commands/index.py +84 -0
- cli/commands/lock.py +89 -0
- cli/commands/merge.py +60 -0
- cli/commands/merge_delta.py +19 -0
- cli/commands/metadata.py +24 -0
- cli/commands/pipeline.py +45 -0
- cli/commands/post_merge.py +43 -0
- cli/commands/query.py +52 -0
- cli/commands/render.py +101 -0
- cli/commands/scan_repos.py +46 -0
- cli/commands/setup.py +94 -0
- cli/commands/split.py +196 -0
- cli/commands/stale_files.py +98 -0
- cli/commands/validate.py +191 -0
- core/__init__.py +32 -0
- core/config.py +261 -0
- core/docs/__init__.py +7 -0
- core/docs/section_updater.py +286 -0
- core/docs/shared.py +149 -0
- core/git.py +294 -0
- core/interfaces.py +249 -0
- core/monitor/__init__.py +5 -0
- core/monitor/progress.py +83 -0
- core/monitor/prompt_store.py +49 -0
- core/paths.py +141 -0
- core/preset.py +237 -0
- core/preset_accessors.py +202 -0
- core/preset_classify.py +132 -0
- core/preset_hooks.py +129 -0
- core/preset_profile.py +89 -0
- core/prompt/__init__.py +7 -0
- core/prompt/__main__.py +147 -0
- core/prompt/content.py +320 -0
- core/prompt/context_manager.py +164 -0
- core/prompt/renderer.py +236 -0
- core/prompt/response_parser.py +274 -0
- core/prompt/templates.py +357 -0
- core/prompt/validate_parity.py +162 -0
- core/prompt/variables.py +339 -0
- core/rag/__init__.py +22 -0
- core/rag/__main__.py +136 -0
- core/rag/bm25_index.py +268 -0
- core/rag/chunker.py +273 -0
- core/rag/embedder.py +151 -0
- core/rag/indexer.py +292 -0
- core/rag/loader.py +89 -0
- core/rag/retriever.py +82 -0
- core/skeleton/__init__.py +11 -0
- core/skeleton/__main__.py +934 -0
- core/skeleton/anchor_fix.py +250 -0
- core/skeleton/classify.py +331 -0
- core/skeleton/cmd_anchor_fix.py +43 -0
- core/skeleton/cmd_diff_doc.py +44 -0
- core/skeleton/cmd_lock.py +87 -0
- core/skeleton/cmd_merge_delta.py +41 -0
- core/skeleton/community.py +233 -0
- core/skeleton/dependency_graph.py +306 -0
- core/skeleton/diff_doc.py +248 -0
- core/skeleton/dispatch.py +273 -0
- core/skeleton/dispatch_render.py +319 -0
- core/skeleton/dispatch_source.py +111 -0
- core/skeleton/extract.py +218 -0
- core/skeleton/extract_methods.py +298 -0
- core/skeleton/file_list.py +239 -0
- core/skeleton/impact.py +278 -0
- core/skeleton/jar_download.py +177 -0
- core/skeleton/jar_resolver.py +186 -0
- core/skeleton/loader.py +162 -0
- core/skeleton/merge.py +278 -0
- core/skeleton/merge_delta.py +229 -0
- core/skeleton/metadata.py +96 -0
- core/skeleton/metadata_builders.py +264 -0
- core/skeleton/module_dag.py +330 -0
- core/skeleton/parsers/__init__.py +71 -0
- core/skeleton/parsers/jqassistant.py +300 -0
- core/skeleton/parsers/jqassistant_cypher.py +225 -0
- core/skeleton/parsers/regex.py +171 -0
- core/skeleton/parsers/treesitter.py +324 -0
- core/skeleton/parsers/treesitter_java.py +284 -0
- core/skeleton/parsers/treesitter_multi.py +289 -0
- core/skeleton/pom_parser.py +299 -0
- core/skeleton/post_merge.py +295 -0
- core/skeleton/post_merge_llm.py +82 -0
- core/skeleton/query.py +195 -0
- core/skeleton/shard_context.py +177 -0
- core/skeleton/split.py +180 -0
- core/skeleton/split_cache.py +107 -0
- core/skeleton/split_feedback.py +174 -0
- core/skeleton/split_plan.py +219 -0
- core/skeleton/split_plan_helpers.py +305 -0
- core/skeleton/split_plan_llm.py +274 -0
- core/utils.py +135 -0
- core/validators/__init__.py +65 -0
- core/validators/__main__.py +215 -0
- core/validators/consistency.py +203 -0
- core/validators/coverage.py +171 -0
- core/validators/duplicates.py +76 -0
- core/validators/engine.py +224 -0
- core/validators/links.py +76 -0
- core/validators/sampling.py +169 -0
- core/validators/structure.py +144 -0
- engine/__init__.py +7 -0
- engine/assembler.py +231 -0
- engine/confirm.py +65 -0
- engine/dedup.py +106 -0
- engine/main.py +211 -0
- engine/pipeline/__init__.py +163 -0
- engine/pipeline/recovery.py +250 -0
- engine/pipeline/steps/__init__.py +23 -0
- engine/pipeline/steps/audit.py +220 -0
- engine/pipeline/steps/audit_apply.py +195 -0
- engine/pipeline/steps/audit_helpers.py +155 -0
- engine/pipeline/steps/classify_llm.py +236 -0
- engine/pipeline/steps/classify_prompt.py +223 -0
- engine/pipeline/steps/finalize.py +160 -0
- engine/pipeline/steps/generate.py +169 -0
- engine/pipeline/steps/generate_batch.py +197 -0
- engine/pipeline/steps/generate_recovery.py +170 -0
- engine/pipeline/steps/llm_plan_split.py +253 -0
- engine/pipeline/steps/lock.py +64 -0
- engine/pipeline/steps/preflight.py +237 -0
- engine/pipeline/steps/preflight_adjust.py +147 -0
- engine/pipeline/steps/pregenerate.py +130 -0
- engine/pipeline/steps/quality.py +81 -0
- engine/pipeline/steps/skeleton.py +149 -0
- engine/pipeline/steps/source.py +163 -0
- engine/pipeline/steps/sync.py +117 -0
- engine/pipeline/steps/sync_finalize.py +237 -0
- engine/pipeline/steps/sync_update.py +341 -0
- engine/pipelines.py +91 -0
- engine/runner.py +335 -0
- engine/strategies/__init__.py +86 -0
- engine/strategies/api.py +128 -0
- engine/strategies/delegated.py +50 -0
- engine/strategies/dryrun.py +25 -0
- engine/two_phase.py +143 -0
- mcp_server/__init__.py +73 -0
- mcp_server/__main__.py +5 -0
- mcp_server/tools/__init__.py +1 -0
- mcp_server/tools/config.py +63 -0
- mcp_server/tools/discovery.py +276 -0
- mcp_server/tools/generation.py +184 -0
- mcp_server/tools/planning.py +144 -0
- mcp_server/tools/source.py +175 -0
- mcp_server/tools/validation.py +140 -0
- mcp_server/tools/workflow.py +166 -0
- mcp_server/workflow_loader.py +204 -0
- presets/generic/audit_dimensions.md +132 -0
- presets/generic/doc_types.yaml +152 -0
- presets/generic/preset.yaml +115 -0
- presets/java-spring/audit_dimensions.md +228 -0
- presets/java-spring/audit_dimensions.yaml +203 -0
- presets/java-spring/doc_types.yaml +269 -0
- presets/java-spring/hooks.py +122 -0
- presets/java-spring/preset.yaml +341 -0
- presets/java-spring/templates/README.md +34 -0
- presets/java-spring/templates/audit-system.md +15 -0
- presets/java-spring/templates/subagent-aop.md +105 -0
- presets/java-spring/templates/subagent-api.md +63 -0
- presets/java-spring/templates/subagent-architecture.md +111 -0
- presets/java-spring/templates/subagent-async-events.md +107 -0
- presets/java-spring/templates/subagent-audit-api-contracts.md +40 -0
- presets/java-spring/templates/subagent-audit-architecture.md +38 -0
- presets/java-spring/templates/subagent-audit-business.md +40 -0
- presets/java-spring/templates/subagent-audit-data-models.md +40 -0
- presets/java-spring/templates/subagent-business.md +129 -0
- presets/java-spring/templates/subagent-caching.md +75 -0
- presets/java-spring/templates/subagent-database-access.md +114 -0
- presets/java-spring/templates/subagent-enum.md +75 -0
- presets/java-spring/templates/subagent-error-handling.md +91 -0
- presets/java-spring/templates/subagent-external-integrations.md +80 -0
- presets/java-spring/templates/subagent-index.md +122 -0
- presets/java-spring/templates/subagent-messaging.md +97 -0
- presets/java-spring/templates/subagent-model.md +88 -0
- presets/java-spring/templates/subagent-observability.md +91 -0
- presets/java-spring/templates/subagent-scheduled.md +81 -0
- presets/java-spring/templates/subagent-security.md +102 -0
- presets/java-spring/templates/subagent-structure.md +101 -0
- presets/java-spring/templates/subagent-sync-section.md +34 -0
- presets/java-spring/templates/subagent-utils.md +73 -0
- presets/java-spring/templates/sync-system.md +8 -0
- presets/java-spring/workflow-extensions.md +112 -0
- skills/__init__.py +1 -0
- skills/_shared/README.md +30 -0
- skills/_shared/doc-coverage-shared.md +134 -0
- skills/_shared/doc-quality-standard.md +1058 -0
- skills/_shared/doc-subagent-rules.md +762 -0
- skills/_shared/windows-compat.md +89 -0
- skills/kb-audit/SKILL.md +52 -0
- skills/kb-audit/rules.md +88 -0
- skills/kb-audit/steps/step-01-prepare.md +75 -0
- skills/kb-audit/steps/step-02-audit.md +96 -0
- skills/kb-audit/steps/step-03-verify.md +65 -0
- skills/kb-audit/steps/step-04-report.md +64 -0
- skills/kb-init/SKILL.md +146 -0
- skills/kb-init/rules.md +187 -0
- skills/kb-init/steps/step-01-scope.md +62 -0
- skills/kb-init/steps/step-02-source.md +410 -0
- skills/kb-init/steps/step-03-generate.md +307 -0
- skills/kb-init/steps/step-04-quality.md +92 -0
- skills/kb-init/steps/step-05-finalize.md +132 -0
- skills/kb-init/templates/core/execution-modes.md +29 -0
- skills/kb-init/templates/core/output-only.md +4 -0
- skills/kb-init/templates/core/readwrite.md +33 -0
- skills/kb-search/SKILL.md +138 -0
- skills/kb-search/rules.md +64 -0
- skills/kb-sync/SKILL.md +43 -0
- skills/kb-sync/rules.md +70 -0
- skills/kb-sync/scripts/rebuild_module.py +91 -0
- skills/kb-sync/scripts/scan_repos.py +687 -0
- skills/kb-sync/steps/step-01-detect.md +72 -0
- skills/kb-sync/steps/step-02-update.md +71 -0
- skills/kb-sync/steps/step-03-verify.md +47 -0
- skills/kb-sync/steps/step-04-finalize.md +52 -0
- source_kb-0.2.2.dist-info/METADATA +194 -0
- source_kb-0.2.2.dist-info/RECORD +228 -0
- source_kb-0.2.2.dist-info/WHEEL +5 -0
- source_kb-0.2.2.dist-info/entry_points.txt +3 -0
- source_kb-0.2.2.dist-info/licenses/LICENSE +21 -0
- source_kb-0.2.2.dist-info/top_level.txt +6 -0
core/paths.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""Canonical path resolution for knowledge base artifacts.
|
|
2
|
+
|
|
3
|
+
All modules use this single source of truth for file/directory paths.
|
|
4
|
+
No hard-coded path strings elsewhere in the codebase.
|
|
5
|
+
|
|
6
|
+
Directory structure:
|
|
7
|
+
knowledge/{kb}/{module}/
|
|
8
|
+
├── .meta/
|
|
9
|
+
│ ├── skeleton/ # Skeleton files (JSON)
|
|
10
|
+
│ ├── file-lists/ # File classification lists
|
|
11
|
+
│ ├── prompts/ # Rendered sub-agent prompts
|
|
12
|
+
│ └── progress/ # Progress tracking
|
|
13
|
+
└── *.md # Final documents
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
# Name of the per-module directory that holds generated metadata artifacts.
META_DIR = ".meta"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ─── Directory paths ────────────────────────────────────────
|
|
24
|
+
|
|
25
|
+
def meta_dir(module_dir: Path) -> Path:
    """Return the metadata directory (``META_DIR``) located inside *module_dir*."""
    metadata_root = module_dir / META_DIR
    return metadata_root
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def skeleton_dir(module_dir: Path) -> Path:
    """Return the ``skeleton`` directory under the module's metadata directory."""
    metadata_root = meta_dir(module_dir)
    return metadata_root / "skeleton"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def file_list_dir(module_dir: Path) -> Path:
    """Return the ``file-lists`` directory under the module's metadata directory."""
    metadata_root = meta_dir(module_dir)
    return metadata_root / "file-lists"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def prompts_dir(module_dir: Path) -> Path:
    """Return the ``prompts`` directory under the module's metadata directory."""
    metadata_root = meta_dir(module_dir)
    return metadata_root / "prompts"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def progress_dir(module_dir: Path) -> Path:
    """Return the ``progress`` directory under the module's metadata directory."""
    metadata_root = meta_dir(module_dir)
    return metadata_root / "progress"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
# ─── File paths ─────────────────────────────────────────────
|
|
46
|
+
|
|
47
|
+
def skeleton_path(module_dir: Path) -> Path:
    """Return the path of ``skeleton.json`` inside the skeleton directory."""
    parent = skeleton_dir(module_dir)
    return parent / "skeleton.json"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def skeleton_shards_dir(module_dir: Path) -> Path:
    """Return the ``shards`` sub-directory of the skeleton directory."""
    parent = skeleton_dir(module_dir)
    return parent / "shards"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def skeleton_summary_path(module_dir: Path) -> Path:
    """Return the path of ``summary.json`` inside the skeleton directory."""
    parent = skeleton_dir(module_dir)
    return parent / "summary.json"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def skeleton_stats_path(module_dir: Path) -> Path:
    """Return the path of ``stats.json`` inside the skeleton directory."""
    parent = skeleton_dir(module_dir)
    return parent / "stats.json"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def file_list_path(module_dir: Path, doc_type: str) -> Path:
    """Return the file-list path for *doc_type*: ``<file-lists dir>/{doc_type}.txt``."""
    filename = f"{doc_type}.txt"
    return file_list_dir(module_dir) / filename
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def prompt_path(module_dir: Path, doc_type: str) -> Path:
    """Return the prompt path for *doc_type*: ``<prompts dir>/{doc_type}.md``."""
    filename = f"{doc_type}.md"
    return prompts_dir(module_dir) / filename
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def progress_path(module_dir: Path, doc_name: str) -> Path:
    """Return the progress file path for *doc_name* (used verbatim as the file name)."""
    parent = progress_dir(module_dir)
    return parent / doc_name
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def progress_pid_path(module_dir: Path, doc_name: str) -> Path:
    """Return ``<progress dir>/{doc_name}.hb-pid`` for *doc_name*."""
    filename = f"{doc_name}.hb-pid"
    return progress_dir(module_dir) / filename
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
# ─── Shard paths (unified naming for both modes) ───────────
|
|
80
|
+
|
|
81
|
+
def shard_doc_path(module_dir: Path, doc_basename: str, shard_name: str) -> Path:
    """Return the output path of a shard document.

    The file lives directly in *module_dir* and is named
    ``{doc_basename}-{shard_name}.md``.
    """
    filename = f"{doc_basename}-{shard_name}.md"
    return module_dir / filename
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def shard_file_list_path(module_dir: Path, doc_type: str, shard_name: str) -> Path:
    """Return the file-list path of a shard.

    The file is ``{doc_type}-{shard_name}.txt`` inside the file-lists directory.
    """
    filename = f"{doc_type}-{shard_name}.txt"
    return file_list_dir(module_dir) / filename
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
# ─── Resolution (prefer new paths, fallback to legacy) ─────
|
|
92
|
+
|
|
93
|
+
def resolve_skeleton(module_dir: Path) -> Path | None:
    """Locate the skeleton artifact for a module.

    Lookup order:
      1. the skeleton JSON file at the current location,
      2. the skeleton shards directory at the current location,
      3. legacy ``.skeleton.json`` and then ``.skeleton`` directly in *module_dir*.

    Returns the first candidate that exists, or ``None`` when none do.
    """
    single_file = skeleton_path(module_dir)
    if single_file.exists():
        return single_file

    shard_root = skeleton_shards_dir(module_dir)
    if shard_root.is_dir():
        return shard_root

    # Legacy layout kept the skeleton next to the documents.
    legacy_candidates = (module_dir / name for name in (".skeleton.json", ".skeleton"))
    return next((path for path in legacy_candidates if path.exists()), None)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def resolve_skeleton_summary(module_dir: Path) -> Path | None:
    """Locate the skeleton summary file.

    Prefers the current location and falls back to the legacy
    ``.skeleton-summary.json`` in *module_dir*. Returns ``None`` if neither exists.
    """
    candidates = (
        skeleton_summary_path(module_dir),
        module_dir / ".skeleton-summary.json",  # legacy location
    )
    for candidate in candidates:
        if candidate.exists():
            return candidate
    return None
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def resolve_file_list(module_dir: Path, doc_type: str) -> Path | None:
    """Locate the file list for *doc_type*.

    Prefers the current location. Otherwise tries two legacy names in
    *module_dir*: ``.file-list-{doc_type}.txt`` and then the same name built
    from the part of *doc_type* before its first ``-``. Returns ``None`` when
    nothing exists.
    """
    current = file_list_path(module_dir, doc_type)
    if current.exists():
        return current

    # Legacy fallback
    prefix = doc_type.split("-")[0]
    legacy_names = (f".file-list-{doc_type}.txt", f".file-list-{prefix}.txt")
    for legacy_name in legacy_names:
        candidate = module_dir / legacy_name
        if candidate.exists():
            return candidate
    return None
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def resolve_progress(module_dir: Path, doc_name: str) -> Path | None:
    """Return the progress file path for *doc_name* if it exists, else ``None``."""
    candidate = progress_path(module_dir, doc_name)
    if not candidate.exists():
        return None
    return candidate
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def ensure_dir(path: Path) -> Path:
    """Make sure *path* exists as a directory and return it.

    Missing parent directories are created as well; an already existing
    directory is not an error.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path
|
core/preset.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
"""Preset configuration loading and file classification.
|
|
2
|
+
|
|
3
|
+
Loads language presets (java-spring, generic, etc.) and provides rule-based
|
|
4
|
+
file classification. All classification rules come from preset.yaml config.
|
|
5
|
+
|
|
6
|
+
Public API:
|
|
7
|
+
load_preset(name) → dict
|
|
8
|
+
classify_file(preset, file_path, skeleton_entry) → list[str]
|
|
9
|
+
get_doc_filename(preset, doc_type) → str
|
|
10
|
+
get_affected_docs(preset, categories) → set[str]
|
|
11
|
+
get_doc_type_mapping(preset) → dict
|
|
12
|
+
CompiledClassifier(preset) — optimized batch classifier
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import copy
|
|
18
|
+
import fnmatch
|
|
19
|
+
import os
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
import yaml
|
|
24
|
+
|
|
25
|
+
# Presets bundled with the package: the "presets" directory two levels above this file.
_BUILTIN_PRESETS_DIR = Path(__file__).resolve().parent.parent / "presets"
PRESETS_DIR = _BUILTIN_PRESETS_DIR  # backward compat

# Loaded presets keyed by preset name; access is guarded by _preset_lock.
_preset_cache: dict[str, dict] = {}
_preset_lock = __import__("threading").Lock()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Preset search path
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _get_preset_search_dirs() -> list[Path]:
    """Build the ordered list of directories searched for presets.

    Earlier entries win. The order is:
      1. the directory named by the ``SOURCE_KB_PRESETS_DIR`` environment
         variable, when set and an existing directory;
      2. ``./presets`` under the current working directory, when it exists
         and does not resolve to the built-in presets directory;
      3. the built-in presets directory (always included).
    """
    search_dirs: list[Path] = []

    override = os.environ.get("SOURCE_KB_PRESETS_DIR")
    if override:
        override_dir = Path(override)
        if override_dir.is_dir():
            search_dirs.append(override_dir)

    project_local = Path.cwd() / "presets"
    if project_local.is_dir():
        # Skip the local directory when it is the built-in one, to avoid a duplicate entry.
        if project_local.resolve() != _BUILTIN_PRESETS_DIR.resolve():
            search_dirs.append(project_local)

    search_dirs.append(_BUILTIN_PRESETS_DIR)
    return search_dirs
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _find_preset_dir(preset_name: str) -> Path | None:
    """Return the first ``<search dir>/<preset_name>`` that contains ``preset.yaml``.

    Search directories are tried in priority order; ``None`` is returned when
    no directory holds the preset.
    """
    for root in _get_preset_search_dirs():
        preset_root = root / preset_name
        if (preset_root / "preset.yaml").exists():
            return preset_root
    return None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
# Preset loading
|
|
72
|
+
# ---------------------------------------------------------------------------
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def load_preset(preset_name: str) -> dict:
    """Load preset config (with custom_rules + doc_types merge).

    Search order: SOURCE_KB_PRESETS_DIR > ./presets/ > built-in presets.
    Results are cached per preset_name. Returns a deep copy so callers can
    mutate safely.

    Args:
        preset_name: Directory name of the preset (validated before any
            filesystem lookup).

    Returns:
        The merged preset mapping, including a ``_preset_dir`` entry with the
        directory the preset was loaded from.

    Raises:
        ValueError: If *preset_name* is not a valid preset name.
        FileNotFoundError: If no search directory contains the preset.
    """
    with _preset_lock:
        if preset_name in _preset_cache:
            return copy.deepcopy(_preset_cache[preset_name])

    _validate_name(preset_name)
    preset_dir = _find_preset_dir(preset_name)
    if preset_dir is None:
        raise FileNotFoundError(
            f"Preset '{preset_name}' not found. Searched: {[str(d) for d in _get_preset_search_dirs()]}"
        )

    preset_path = preset_dir / "preset.yaml"
    with open(preset_path, encoding="utf-8") as f:
        # An empty YAML document parses to None; treat it as an empty mapping,
        # the same way the optional files below are handled.
        base = yaml.safe_load(f) or {}

    # Merge custom_rules.yaml if exists
    custom_path = preset_dir / "custom_rules.yaml"
    if custom_path.exists():
        with open(custom_path, encoding="utf-8") as f:
            custom = yaml.safe_load(f) or {}
        base = _merge_custom_rules(base, custom)

    # Merge doc_types.yaml if exists (doc_type definitions, split config, limits, parsers)
    doc_types_path = preset_dir / "doc_types.yaml"
    if doc_types_path.exists():
        with open(doc_types_path, encoding="utf-8") as f:
            doc_types_data = yaml.safe_load(f) or {}
        # Top-level keys from doc_types.yaml replace (not deep-merge) those in base.
        for key in ("doc_types", "split", "limits", "parsers"):
            if key in doc_types_data:
                base[key] = doc_types_data[key]

    base["_preset_dir"] = str(preset_dir)

    with _preset_lock:
        _preset_cache[preset_name] = base
    return copy.deepcopy(base)
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def find_preset_template(preset_name: str, template_name: str) -> Path | None:
    """Look up *template_name* in the ``templates/`` directory of a preset.

    The preset directory is located with the same search path as
    ``load_preset`` (env > local > built-in). Returns the template path when
    it exists, otherwise ``None`` (also when the preset itself is not found).
    """
    preset_dir = _find_preset_dir(preset_name)
    if preset_dir is not None:
        template = preset_dir / "templates" / template_name
        if template.exists():
            return template
    return None
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _validate_name(name: str) -> None:
    """Raise ``ValueError`` unless *name* is an acceptable preset name.

    A valid name starts with an ASCII letter or digit and continues with
    ASCII letters, digits, ``_`` or ``-`` only.

    Raises:
        ValueError: If *name* is empty or contains any other character.
    """
    import re

    # fullmatch, not match + "$": "$" also matches just before a trailing
    # newline, which would let a name such as "generic\n" through.
    if not name or re.fullmatch(r"[a-zA-Z0-9][a-zA-Z0-9_-]*", name) is None:
        raise ValueError(f"Invalid preset name: {name!r}")
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _merge_custom_rules(base: dict, custom: dict) -> dict:
    """Merge custom_rules into base preset (append patterns, add categories).

    Works on a deep copy of *base*; neither argument is modified.

    For every category under ``custom["file_classification"]``:
      * keys starting with ``_`` are dropped;
      * if the category is new (or is null in the base), it is added as-is;
      * if the category already exists, only ``<key>_append`` list entries
        are applied: their items are appended to the base list ``<key>``,
        skipping items already present. Other keys are ignored.

    Returns:
        The merged preset mapping.
    """
    merged = copy.deepcopy(base)
    custom_cls = custom.get("file_classification", {})
    if not custom_cls:
        return merged

    # A bare "file_classification:" key in YAML parses to None; treat as empty.
    base_cls = merged.get("file_classification")
    if base_cls is None:
        base_cls = {}
        merged["file_classification"] = base_cls

    for cat, cfg in custom_cls.items():
        if not isinstance(cfg, dict):
            continue
        clean = {k: v for k, v in cfg.items() if not k.startswith("_")}

        existing_cfg = base_cls.get(cat)
        if existing_cfg is None:
            # New category, or a null placeholder in the base preset.
            base_cls[cat] = clean
            continue
        if not isinstance(existing_cfg, dict):
            # Malformed base entry: there is nothing to append to.
            continue

        for key, value in clean.items():
            if not (key.endswith("_append") and isinstance(value, list)):
                continue
            real_key = key.removesuffix("_append")
            existing = existing_cfg.get(real_key, [])
            if not isinstance(existing, list):
                continue
            for item in value:
                if item not in existing:
                    existing.append(item)
            existing_cfg[real_key] = existing
    return merged
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# ---------------------------------------------------------------------------
|
|
168
|
+
# Doc type mapping
|
|
169
|
+
# ---------------------------------------------------------------------------
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def get_doc_filename(preset: dict, doc_type: str, *, strict: bool = False) -> str:
    """Get the output filename for a doc type key.

    Args:
        preset: Loaded preset mapping.
        doc_type: Key to look up under ``preset["doc_types"]``.
        strict: When true, an unknown *doc_type* raises instead of falling
            back to the default name.

    Returns:
        The configured ``filename`` of the doc type, or ``"{doc_type}.md"``
        when the doc type is unknown, has no ``filename``, or its entry is
        not a mapping.

    Raises:
        KeyError: If *strict* is true and *doc_type* is not configured.
    """
    # "doc_types:" with no children parses to None in YAML.
    doc_types = preset.get("doc_types") or {}
    default_name = f"{doc_type}.md"
    if doc_type in doc_types:
        cfg = doc_types[doc_type]
        # Entries may be null or scalars; the other accessors skip such
        # entries, so fall back to the default name here rather than crash.
        if isinstance(cfg, dict):
            return cfg.get("filename", default_name)
        return default_name
    if strict:
        raise KeyError(f"Unknown doc-type '{doc_type}'. Valid: {sorted(doc_types.keys())}")
    return default_name
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def get_doc_type_mapping(preset: dict) -> dict[str, str]:
    """Return full doc-type → filename mapping (built from doc_types config).

    Each doc type maps to its configured ``filename``, or ``"{doc_type}.md"``
    when none is configured or the entry is not a mapping. Returns an empty
    dict when the preset defines no doc types.
    """
    # "doc_types:" with no children parses to None in YAML.
    doc_types = preset.get("doc_types") or {}
    mapping: dict[str, str] = {}
    for dt, cfg in doc_types.items():
        default_name = f"{dt}.md"
        # Null or scalar entries get the default name instead of raising.
        mapping[dt] = cfg.get("filename", default_name) if isinstance(cfg, dict) else default_name
    return mapping
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def get_affected_docs(preset: dict, categories: list[str]) -> set[str]:
    """Collect the documents affected by the given classification categories.

    For each name in *categories*, the ``affects`` list of that category in
    ``preset["file_classification"]`` is added to the result. Unknown
    categories and categories without ``affects`` contribute nothing.
    """
    rules = preset.get("file_classification", {})
    affected: set[str] = set()
    for category in categories:
        category_cfg = rules.get(category, {})
        affected |= set(category_cfg.get("affects", []))
    return affected
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def get_file_classification(preset: dict) -> dict:
    """Return a shallow copy of the preset's ``file_classification`` section.

    An empty dict is returned when the section is absent.
    """
    section = preset.get("file_classification", {})
    return dict(section)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
# ---------------------------------------------------------------------------
|
|
206
|
+
# File classification (delegated to preset_classify.py)
|
|
207
|
+
# ---------------------------------------------------------------------------
|
|
208
|
+
|
|
209
|
+
from core.preset_classify import classify_file # noqa: E402, F401
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
# ---------------------------------------------------------------------------
|
|
213
|
+
# Utility accessors (delegated to preset_accessors.py)
|
|
214
|
+
# ---------------------------------------------------------------------------
|
|
215
|
+
|
|
216
|
+
from core.preset_accessors import ( # noqa: E402, F401
|
|
217
|
+
get_coverage_skip_patterns,
|
|
218
|
+
get_generation_order,
|
|
219
|
+
get_doc_types,
|
|
220
|
+
get_doc_type_config,
|
|
221
|
+
get_template_path,
|
|
222
|
+
get_split_override,
|
|
223
|
+
get_ownership_keywords,
|
|
224
|
+
get_global_view_types,
|
|
225
|
+
get_limits,
|
|
226
|
+
get_search_routing,
|
|
227
|
+
get_dependency_chain,
|
|
228
|
+
get_dedup_rules,
|
|
229
|
+
get_batch_plan,
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
# ---------------------------------------------------------------------------
|
|
234
|
+
# Profile resolution and show-config (delegated to preset_profile.py)
|
|
235
|
+
# ---------------------------------------------------------------------------
|
|
236
|
+
|
|
237
|
+
from core.preset_profile import resolve_profile, show_config # noqa: E402, F401
|
core/preset_accessors.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""Preset utility accessors — query preset config for various subsystems.
|
|
2
|
+
|
|
3
|
+
All functions take a loaded preset dict and return derived configuration.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_coverage_skip_patterns(preset: dict) -> list[str]:
    """Return a new list with the preset's ``coverage_skip_patterns`` (empty if absent)."""
    configured = preset.get("coverage_skip_patterns", [])
    return [*configured]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_generation_order(preset: dict) -> list[list[str]]:
    """Return the batch generation order derived from the doc_types config.

    Doc types are grouped by their ``batch`` value (99 when unset) and the
    groups are returned in ascending batch order; within a group the doc
    types keep their configuration order. Entries that are not mappings are
    skipped.

    When the preset has no doc_types, the preset's ``generation_order`` is
    returned instead, defaulting to ``[["source-tree-analysis"]]``.
    """
    doc_types = preset.get("doc_types", {})
    if not doc_types:
        fallback = preset.get("generation_order", [["source-tree-analysis"]])
        return list(fallback)

    grouped: dict[int, list[str]] = {}
    for name, cfg in doc_types.items():
        if isinstance(cfg, dict):
            grouped.setdefault(cfg.get("batch", 99), []).append(name)

    return [grouped[number] for number in sorted(grouped)]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_doc_types(preset: dict) -> dict[str, dict]:
    """Return a shallow copy of the preset's ``doc_types`` section (empty if absent)."""
    section = preset.get("doc_types", {})
    return dict(section)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_doc_type_config(preset: dict, doc_type: str) -> dict:
    """Get config for a specific doc_type.

    Returns the configured mapping itself (not a copy). Returns an empty dict
    when the doc type is not configured or its entry is not a mapping, so
    callers can always chain ``.get(...)`` on the result.
    """
    # "doc_types:" with no children parses to None in YAML.
    doc_types = preset.get("doc_types") or {}
    config = doc_types.get(doc_type)
    # Null or scalar entries are treated as "no configuration".
    return config if isinstance(config, dict) else {}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def get_template_path(preset: dict, doc_type: str, preset_name: str) -> str | None:
    """Return the ``template`` value configured for *doc_type*.

    The value is a relative template filename such as
    ``'subagent-business.md'``; ``None`` is returned when the doc type has no
    template configured. *preset_name* is accepted but not used here.
    """
    config = get_doc_type_config(preset, doc_type)
    template = config.get("template")
    return template
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_split_override(preset: dict, doc_type: str, mode: str) -> dict:
    """Return the ``split_override`` settings of *doc_type* for *mode*.

    An empty dict is returned when the doc type defines no ``split_override``
    or has no entry for *mode*.
    """
    config = get_doc_type_config(preset, doc_type)
    per_mode = config.get("split_override", {})
    return per_mode.get(mode, {})
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_ownership_keywords(preset: dict) -> dict[str, list[str]]:
    """Map output filenames to their ownership keywords (for duplicate detection).

    Keys are the doc types' filenames (``filename`` or ``"{doc_type}.md"``),
    values are their ``owns_keywords``. Doc types without keywords and entries
    that are not mappings are left out.
    """
    owners: dict[str, list[str]] = {}
    for name, cfg in preset.get("doc_types", {}).items():
        if not isinstance(cfg, dict):
            continue
        keywords = cfg.get("owns_keywords", [])
        if not keywords:
            continue
        owners[cfg.get("filename", f"{name}.md")] = keywords
    return owners
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def get_global_view_types(preset: dict) -> frozenset[str]:
    """Return the doc_type keys whose config sets a truthy ``global_view``.

    When the preset defines no doc_types at all, the fixed set
    ``{"source-tree-analysis", "index"}`` is returned instead.
    """
    doc_types = preset.get("doc_types", {})
    flagged = {
        name
        for name, cfg in doc_types.items()
        if isinstance(cfg, dict) and cfg.get("global_view", False)
    }
    if flagged or doc_types:
        return frozenset(flagged)
    # No doc_types configured: fall back to the built-in pair.
    return frozenset({"source-tree-analysis", "index"})
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def get_limits(preset: dict, project_config: dict | None = None) -> dict[str, int]:
    """Get configurable limits (timeouts, size thresholds, etc.).

    Resolution order: project_config (kb-project.yaml) > preset (doc_types.yaml) > defaults.

    Args:
        preset: Loaded preset mapping; its ``limits`` section overrides the
            built-in defaults.
        project_config: Optional project configuration; its ``limits``
            section overrides both the preset and the defaults.

    Returns:
        A new dict with every default key present, plus any extra keys
        supplied by the preset or project configuration.
    """
    defaults = {
        "min_doc_size_bytes": 500,
        "max_source_inline_bytes": 300000,
        "max_skeleton_inline_bytes": 50000,
        "max_output_tokens": 8192,
        "prior_docs_max_chars": 2000,
        "shard_context_max_chars": 1500,
        "spawn_timeout_default": 900,
        "heartbeat_interval": 30,
        "max_retries": 2,
    }
    # A bare "limits:" key in YAML parses to None; treat it as "no overrides"
    # rather than failing on the dict unpacking below.
    configured = preset.get("limits") or {}
    result = {**defaults, **configured}
    if project_config:
        project_limits = project_config.get("limits") or {}
        result = {**result, **project_limits}
    return result
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def get_search_routing(preset: dict) -> dict[str, list[str]]:
    """Map output filenames to search keywords for question routing.

    Each doc type contributes its filename (``filename`` or
    ``"{doc_type}.md"``) mapped to its ``search_keywords``. When a doc type
    has no keywords, its key with ``-`` replaced by spaces is used as the
    single keyword. Entries that are not mappings are skipped.
    """
    routing: dict[str, list[str]] = {}
    for name, cfg in preset.get("doc_types", {}).items():
        if not isinstance(cfg, dict):
            continue
        target = cfg.get("filename", f"{name}.md")
        routing[target] = cfg.get("search_keywords", []) or [name.replace("-", " ")]
    return routing
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def get_dependency_chain(preset: dict) -> list[str]:
    """Return doc_type keys in dependency-safe generation/update order (topological sort).

    A doc type is emitted only after every doc type listed in its
    ``depends_on`` has been emitted. Among doc types that are ready at the
    same time, the one with the lower ``batch`` (99 when unset) comes first,
    ties broken by key.

    Only entries that are mappings take part; dependencies on unknown or
    non-mapping entries are ignored. Doc types caught in a dependency cycle
    cannot be ordered and are appended at the end in configuration order.
    """
    doc_types = preset.get("doc_types", {})
    if not doc_types:
        return []

    # Restrict to well-formed entries first, so that a dependency on a skipped
    # entry is dropped instead of blocking its dependents forever.
    configs = {name: cfg for name, cfg in doc_types.items() if isinstance(cfg, dict)}

    def rank(name: str) -> tuple:
        return (configs[name].get("batch", 99), name)

    # name -> names it still waits for ("depends_on: null" counts as none).
    pending: dict[str, set[str]] = {
        name: set(cfg.get("depends_on") or []).intersection(configs)
        for name, cfg in configs.items()
    }

    # Reverse edges, so finishing a node only touches its direct dependents.
    dependents: dict[str, list[str]] = {name: [] for name in pending}
    for name, needs in pending.items():
        for dependency in needs:
            dependents[dependency].append(name)

    remaining = {name: len(needs) for name, needs in pending.items()}
    ready = sorted((name for name, count in remaining.items() if count == 0), key=rank)
    ordered: list[str] = []

    while ready:
        node = ready.pop(0)
        ordered.append(node)
        unlocked = False
        for child in dependents[node]:
            remaining[child] -= 1
            if remaining[child] == 0:
                ready.append(child)
                unlocked = True
        if unlocked:
            ready.sort(key=rank)

    # Anything left is part of a cycle (including self-dependencies).
    emitted = set(ordered)
    ordered.extend(name for name in pending if name not in emitted)
    return ordered
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def get_dedup_rules(preset: dict) -> dict[str, list[str]]:
    """Map output filenames to their dedup detection patterns.

    For each doc type the value is its ``dedup_patterns`` followed by its
    ``owns_keywords``. Doc types that define neither, and entries that are
    not mappings, are left out. Keys are filenames (``filename`` or
    ``"{doc_type}.md"``).
    """
    rules: dict[str, list[str]] = {}
    for name, cfg in preset.get("doc_types", {}).items():
        if not isinstance(cfg, dict):
            continue
        explicit = list(cfg.get("dedup_patterns", []))
        owned = cfg.get("owns_keywords", [])
        if not (explicit or owned):
            continue
        rules[cfg.get("filename", f"{name}.md")] = explicit + owned
    return rules
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def get_batch_plan(preset: dict) -> dict[int, list[dict]]:
    """Group doc types by batch number for generation planning.

    Returns a mapping ``batch -> entries`` where every entry describes one
    doc type with the keys ``name``, ``filename``, ``conditional``,
    ``depends_on``, ``global_view`` and ``template``. Doc types without a
    ``batch`` go into batch 99; entries of each batch are sorted by name.
    Doc type entries that are not mappings are skipped.
    """
    plan: dict[int, list[dict]] = {}
    for name, cfg in preset.get("doc_types", {}).items():
        if not isinstance(cfg, dict):
            continue
        info = {
            "name": name,
            "filename": cfg.get("filename", f"{name}.md"),
            "conditional": cfg.get("conditional", False),
            "depends_on": cfg.get("depends_on", []),
            "global_view": cfg.get("global_view", False),
            "template": cfg.get("template"),
        }
        plan.setdefault(cfg.get("batch", 99), []).append(info)

    for entries in plan.values():
        entries.sort(key=lambda entry: entry["name"])
    return plan
|