source-kb 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +50 -0
- cli/__main__.py +5 -0
- cli/commands/__init__.py +1 -0
- cli/commands/anchor_fix.py +47 -0
- cli/commands/diff_doc.py +52 -0
- cli/commands/dispatch.py +77 -0
- cli/commands/extract.py +72 -0
- cli/commands/file_list.py +74 -0
- cli/commands/index.py +84 -0
- cli/commands/lock.py +89 -0
- cli/commands/merge.py +60 -0
- cli/commands/merge_delta.py +19 -0
- cli/commands/metadata.py +24 -0
- cli/commands/pipeline.py +45 -0
- cli/commands/post_merge.py +43 -0
- cli/commands/query.py +52 -0
- cli/commands/render.py +101 -0
- cli/commands/scan_repos.py +46 -0
- cli/commands/setup.py +94 -0
- cli/commands/split.py +196 -0
- cli/commands/stale_files.py +98 -0
- cli/commands/validate.py +191 -0
- core/__init__.py +32 -0
- core/config.py +261 -0
- core/docs/__init__.py +7 -0
- core/docs/section_updater.py +286 -0
- core/docs/shared.py +149 -0
- core/git.py +294 -0
- core/interfaces.py +249 -0
- core/monitor/__init__.py +5 -0
- core/monitor/progress.py +83 -0
- core/monitor/prompt_store.py +49 -0
- core/paths.py +141 -0
- core/preset.py +237 -0
- core/preset_accessors.py +202 -0
- core/preset_classify.py +132 -0
- core/preset_hooks.py +129 -0
- core/preset_profile.py +89 -0
- core/prompt/__init__.py +7 -0
- core/prompt/__main__.py +147 -0
- core/prompt/content.py +320 -0
- core/prompt/context_manager.py +164 -0
- core/prompt/renderer.py +236 -0
- core/prompt/response_parser.py +274 -0
- core/prompt/templates.py +357 -0
- core/prompt/validate_parity.py +162 -0
- core/prompt/variables.py +339 -0
- core/rag/__init__.py +22 -0
- core/rag/__main__.py +136 -0
- core/rag/bm25_index.py +268 -0
- core/rag/chunker.py +273 -0
- core/rag/embedder.py +151 -0
- core/rag/indexer.py +292 -0
- core/rag/loader.py +89 -0
- core/rag/retriever.py +82 -0
- core/skeleton/__init__.py +11 -0
- core/skeleton/__main__.py +934 -0
- core/skeleton/anchor_fix.py +250 -0
- core/skeleton/classify.py +331 -0
- core/skeleton/cmd_anchor_fix.py +43 -0
- core/skeleton/cmd_diff_doc.py +44 -0
- core/skeleton/cmd_lock.py +87 -0
- core/skeleton/cmd_merge_delta.py +41 -0
- core/skeleton/community.py +233 -0
- core/skeleton/dependency_graph.py +306 -0
- core/skeleton/diff_doc.py +248 -0
- core/skeleton/dispatch.py +273 -0
- core/skeleton/dispatch_render.py +319 -0
- core/skeleton/dispatch_source.py +111 -0
- core/skeleton/extract.py +218 -0
- core/skeleton/extract_methods.py +298 -0
- core/skeleton/file_list.py +239 -0
- core/skeleton/impact.py +278 -0
- core/skeleton/jar_download.py +177 -0
- core/skeleton/jar_resolver.py +186 -0
- core/skeleton/loader.py +162 -0
- core/skeleton/merge.py +278 -0
- core/skeleton/merge_delta.py +229 -0
- core/skeleton/metadata.py +96 -0
- core/skeleton/metadata_builders.py +264 -0
- core/skeleton/module_dag.py +330 -0
- core/skeleton/parsers/__init__.py +71 -0
- core/skeleton/parsers/jqassistant.py +300 -0
- core/skeleton/parsers/jqassistant_cypher.py +225 -0
- core/skeleton/parsers/regex.py +171 -0
- core/skeleton/parsers/treesitter.py +324 -0
- core/skeleton/parsers/treesitter_java.py +284 -0
- core/skeleton/parsers/treesitter_multi.py +289 -0
- core/skeleton/pom_parser.py +299 -0
- core/skeleton/post_merge.py +295 -0
- core/skeleton/post_merge_llm.py +82 -0
- core/skeleton/query.py +195 -0
- core/skeleton/shard_context.py +177 -0
- core/skeleton/split.py +180 -0
- core/skeleton/split_cache.py +107 -0
- core/skeleton/split_feedback.py +174 -0
- core/skeleton/split_plan.py +219 -0
- core/skeleton/split_plan_helpers.py +305 -0
- core/skeleton/split_plan_llm.py +274 -0
- core/utils.py +135 -0
- core/validators/__init__.py +65 -0
- core/validators/__main__.py +215 -0
- core/validators/consistency.py +203 -0
- core/validators/coverage.py +171 -0
- core/validators/duplicates.py +76 -0
- core/validators/engine.py +224 -0
- core/validators/links.py +76 -0
- core/validators/sampling.py +169 -0
- core/validators/structure.py +144 -0
- engine/__init__.py +7 -0
- engine/assembler.py +231 -0
- engine/confirm.py +65 -0
- engine/dedup.py +106 -0
- engine/main.py +211 -0
- engine/pipeline/__init__.py +163 -0
- engine/pipeline/recovery.py +250 -0
- engine/pipeline/steps/__init__.py +23 -0
- engine/pipeline/steps/audit.py +220 -0
- engine/pipeline/steps/audit_apply.py +195 -0
- engine/pipeline/steps/audit_helpers.py +155 -0
- engine/pipeline/steps/classify_llm.py +236 -0
- engine/pipeline/steps/classify_prompt.py +223 -0
- engine/pipeline/steps/finalize.py +160 -0
- engine/pipeline/steps/generate.py +169 -0
- engine/pipeline/steps/generate_batch.py +197 -0
- engine/pipeline/steps/generate_recovery.py +170 -0
- engine/pipeline/steps/llm_plan_split.py +253 -0
- engine/pipeline/steps/lock.py +64 -0
- engine/pipeline/steps/preflight.py +237 -0
- engine/pipeline/steps/preflight_adjust.py +147 -0
- engine/pipeline/steps/pregenerate.py +130 -0
- engine/pipeline/steps/quality.py +81 -0
- engine/pipeline/steps/skeleton.py +149 -0
- engine/pipeline/steps/source.py +163 -0
- engine/pipeline/steps/sync.py +117 -0
- engine/pipeline/steps/sync_finalize.py +237 -0
- engine/pipeline/steps/sync_update.py +341 -0
- engine/pipelines.py +91 -0
- engine/runner.py +335 -0
- engine/strategies/__init__.py +86 -0
- engine/strategies/api.py +128 -0
- engine/strategies/delegated.py +50 -0
- engine/strategies/dryrun.py +25 -0
- engine/two_phase.py +143 -0
- mcp_server/__init__.py +73 -0
- mcp_server/__main__.py +5 -0
- mcp_server/tools/__init__.py +1 -0
- mcp_server/tools/config.py +63 -0
- mcp_server/tools/discovery.py +276 -0
- mcp_server/tools/generation.py +184 -0
- mcp_server/tools/planning.py +144 -0
- mcp_server/tools/source.py +175 -0
- mcp_server/tools/validation.py +140 -0
- mcp_server/tools/workflow.py +166 -0
- mcp_server/workflow_loader.py +204 -0
- presets/generic/audit_dimensions.md +132 -0
- presets/generic/doc_types.yaml +152 -0
- presets/generic/preset.yaml +115 -0
- presets/java-spring/audit_dimensions.md +228 -0
- presets/java-spring/audit_dimensions.yaml +203 -0
- presets/java-spring/doc_types.yaml +269 -0
- presets/java-spring/hooks.py +122 -0
- presets/java-spring/preset.yaml +341 -0
- presets/java-spring/templates/README.md +34 -0
- presets/java-spring/templates/audit-system.md +15 -0
- presets/java-spring/templates/subagent-aop.md +105 -0
- presets/java-spring/templates/subagent-api.md +63 -0
- presets/java-spring/templates/subagent-architecture.md +111 -0
- presets/java-spring/templates/subagent-async-events.md +107 -0
- presets/java-spring/templates/subagent-audit-api-contracts.md +40 -0
- presets/java-spring/templates/subagent-audit-architecture.md +38 -0
- presets/java-spring/templates/subagent-audit-business.md +40 -0
- presets/java-spring/templates/subagent-audit-data-models.md +40 -0
- presets/java-spring/templates/subagent-business.md +129 -0
- presets/java-spring/templates/subagent-caching.md +75 -0
- presets/java-spring/templates/subagent-database-access.md +114 -0
- presets/java-spring/templates/subagent-enum.md +75 -0
- presets/java-spring/templates/subagent-error-handling.md +91 -0
- presets/java-spring/templates/subagent-external-integrations.md +80 -0
- presets/java-spring/templates/subagent-index.md +122 -0
- presets/java-spring/templates/subagent-messaging.md +97 -0
- presets/java-spring/templates/subagent-model.md +88 -0
- presets/java-spring/templates/subagent-observability.md +91 -0
- presets/java-spring/templates/subagent-scheduled.md +81 -0
- presets/java-spring/templates/subagent-security.md +102 -0
- presets/java-spring/templates/subagent-structure.md +101 -0
- presets/java-spring/templates/subagent-sync-section.md +34 -0
- presets/java-spring/templates/subagent-utils.md +73 -0
- presets/java-spring/templates/sync-system.md +8 -0
- presets/java-spring/workflow-extensions.md +112 -0
- skills/__init__.py +1 -0
- skills/_shared/README.md +30 -0
- skills/_shared/doc-coverage-shared.md +134 -0
- skills/_shared/doc-quality-standard.md +1058 -0
- skills/_shared/doc-subagent-rules.md +762 -0
- skills/_shared/windows-compat.md +89 -0
- skills/kb-audit/SKILL.md +52 -0
- skills/kb-audit/rules.md +88 -0
- skills/kb-audit/steps/step-01-prepare.md +75 -0
- skills/kb-audit/steps/step-02-audit.md +96 -0
- skills/kb-audit/steps/step-03-verify.md +65 -0
- skills/kb-audit/steps/step-04-report.md +64 -0
- skills/kb-init/SKILL.md +146 -0
- skills/kb-init/rules.md +187 -0
- skills/kb-init/steps/step-01-scope.md +62 -0
- skills/kb-init/steps/step-02-source.md +410 -0
- skills/kb-init/steps/step-03-generate.md +307 -0
- skills/kb-init/steps/step-04-quality.md +92 -0
- skills/kb-init/steps/step-05-finalize.md +132 -0
- skills/kb-init/templates/core/execution-modes.md +29 -0
- skills/kb-init/templates/core/output-only.md +4 -0
- skills/kb-init/templates/core/readwrite.md +33 -0
- skills/kb-search/SKILL.md +138 -0
- skills/kb-search/rules.md +64 -0
- skills/kb-sync/SKILL.md +43 -0
- skills/kb-sync/rules.md +70 -0
- skills/kb-sync/scripts/rebuild_module.py +91 -0
- skills/kb-sync/scripts/scan_repos.py +687 -0
- skills/kb-sync/steps/step-01-detect.md +72 -0
- skills/kb-sync/steps/step-02-update.md +71 -0
- skills/kb-sync/steps/step-03-verify.md +47 -0
- skills/kb-sync/steps/step-04-finalize.md +52 -0
- source_kb-0.2.2.dist-info/METADATA +194 -0
- source_kb-0.2.2.dist-info/RECORD +228 -0
- source_kb-0.2.2.dist-info/WHEEL +5 -0
- source_kb-0.2.2.dist-info/entry_points.txt +3 -0
- source_kb-0.2.2.dist-info/licenses/LICENSE +21 -0
- source_kb-0.2.2.dist-info/top_level.txt +6 -0
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""Split plan LLM strategy — LLM-assisted business-domain grouping.
|
|
2
|
+
|
|
3
|
+
Handles both CLI mode (direct LLM call) and delegated mode (Agent dispatch).
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from core.skeleton.split import SplitConfig, SplitPlan
|
|
14
|
+
from core.skeleton.split_plan_helpers import (
|
|
15
|
+
make_split, derive_name, rebalance_groups,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def try_llm_split(
    file_list: list[dict],
    split_config: SplitConfig,
    doc_type: str,
    module_dir: Path | None,
    *,
    strategy: Any | None = None,
    is_delegated: bool = False,
) -> SplitPlan | None:
    """Try LLM-assisted business-domain grouping.

    In Agent mode (delegated): reuses shard files left by a prior split-apply
    when they exist; otherwise writes a grouping request JSON file for the
    Agent to process using its own LLM capability and returns a pending plan.

    In CLI mode: calls the provided LLM strategy directly, parses the answer
    into groups, attaches files the LLM did not mention to the group with the
    closest package, rebalances, and returns the resulting plan.

    Args:
        file_list: Files to group (dicts read via "name", "package", "lines", "bytes").
        split_config: Split configuration.
        doc_type: Document type being split.
        module_dir: Module directory for cache/shard files. Falsy = skip LLM.
        strategy: Injected LlmStrategy instance (from engine/ caller). None = skip LLM.
        is_delegated: Whether running in delegated mode (Agent handles grouping).

    Returns:
        A SplitPlan, or None when LLM grouping is unavailable or produced no
        usable result (no module_dir, no strategy, failed call, empty response,
        or fewer than two parsed groups).
    """
    if not module_dir:
        return None

    from core.skeleton.split import compute_splits

    max_files = split_config.max_files_per_shard
    total_lines = sum(f.get("lines", 0) for f in file_list)
    total_bytes = sum(f.get("bytes", 0) for f in file_list)
    # Always ask for at least two groups; a single group is not a split.
    n_target = max(2, compute_splits(
        split_config,
        total_bytes=total_bytes,
        total_lines=total_lines,
        file_count=len(file_list),
        doc_type=doc_type,
    ))
    max_lines = split_config.effective_max_lines(doc_type)

    if is_delegated:
        existing = _load_existing_shards(module_dir, doc_type, file_list)
        if existing:
            return existing
        return _emit_grouping_request(file_list, module_dir, doc_type, n_target, max_files, max_lines)

    # CLI mode: need a strategy to call LLM
    if strategy is None:
        return None

    llm_sample_limit = split_config.llm_sample_limit
    sampled_files = file_list if len(file_list) <= llm_sample_limit else file_list[:llm_sample_limit]

    file_summaries = []
    for f in sampled_files:
        name = f.get("name", "")
        pkg = f.get("package", "")
        lines = f.get("lines", 0)
        # Files without a package are listed by bare name, not as ".Name".
        qualified = f"{pkg}.{name}" if pkg else name
        file_summaries.append(f"{qualified} ({lines} lines)")

    count_desc = f"{len(sampled_files)} Java files"
    if len(sampled_files) < len(file_list):
        count_desc += f" (sample of {len(file_list)} total)"
    prompt_user = (
        f"Group the following {count_desc} into {n_target} groups by business domain.\n"
        f"Output format per group: `GroupName: File1, File2, ...`\n"
        f"File list:\n" + "\n".join(file_summaries)
    )

    from core.interfaces import LlmRequest
    try:
        resp = strategy.call(LlmRequest(
            system=(
                "You are a Java project architect skilled at grouping source files by business domain. "
                "Output only the grouping result, no explanation."
            ),
            user=prompt_user,
            max_tokens=2000,
            temperature=0.1,
        ))
    except Exception:
        # Best-effort: the caller falls back to another strategy on None, but
        # the failure is logged so it is not invisible.
        logger.warning(
            "LLM split call failed for doc_type=%s; skipping LLM grouping",
            doc_type,
            exc_info=True,
        )
        return None

    if resp.status != "done" or not resp.content:
        return None

    groups = _parse_llm_groups(resp.content, file_list)
    if not groups or len(groups) <= 1:
        return None

    # Files the LLM omitted (or that were outside the sample) are matched by identity.
    assigned = {id(f) for g in groups for f in g}
    unassigned = [f for f in file_list if id(f) not in assigned]
    if unassigned:
        _assign_by_package(unassigned, groups)

    rebalance_groups(groups, split_config.max_files_per_shard)

    splits = [make_split(derive_name(g, noise_words=split_config.noise_words), g) for g in groups]
    return SplitPlan(splits=splits, strategy="llm")
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _load_existing_shards(
    module_dir: Path,
    doc_type: str,
    file_list: list[dict],
) -> SplitPlan | None:
    """Load existing shard files produced by a prior split-apply.

    Reads ``<module_dir>/.meta/shards/<doc_type>-shard-*.txt`` (one file name
    per line) and, when present, group names from
    ``<module_dir>/.meta/split-requests/<doc_type>-groups.json``. Shard *i* is
    paired with group name *i*; missing or empty names become ``group-<i+1>``.

    Args:
        module_dir: Module directory containing the ``.meta`` folder.
        doc_type: Document type whose shards are loaded.
        file_list: Current files; shard entries are matched by their "name".

    Returns:
        A SplitPlan with strategy "agent-applied", or None when there is no
        shards directory, no shard file, or fewer than two shards.
    """
    import re

    shards_dir = module_dir / ".meta" / "shards"
    if not shards_dir.is_dir():
        return None

    def _shard_order(path: Path) -> tuple[int, int, str]:
        # Numeric suffixes sort by value so "shard-10" follows "shard-2";
        # anything else keeps plain name ordering after the numbered shards.
        numbered = re.search(r"-shard-(\d+)$", path.stem)
        if numbered:
            return (0, int(numbered.group(1)), path.name)
        return (1, 0, path.name)

    shard_files = sorted(shards_dir.glob(f"{doc_type}-shard-*.txt"), key=_shard_order)
    if not shard_files:
        return None

    name_to_file = {f.get("name", ""): f for f in file_list}

    splits: list[dict[str, Any]] = []
    group_names: list[str] = []
    groups_path = module_dir / ".meta" / "split-requests" / f"{doc_type}-groups.json"
    if groups_path.exists():
        try:
            groups_data = json.loads(groups_path.read_text(encoding="utf-8"))
            group_names = [g.get("name", "") for g in groups_data]
        except (OSError, ValueError, AttributeError, TypeError) as exc:
            # Names are cosmetic: fall back to generated ones, but say why.
            logger.warning("Ignoring unreadable group names in %s: %s", groups_path, exc)
            group_names = []

    for idx, shard_file in enumerate(shard_files):
        filenames = [
            ln.strip() for ln in shard_file.read_text(encoding="utf-8").splitlines()
            if ln.strip()
        ]
        # Entries no longer present in file_list are dropped.
        shard_files_info = [name_to_file[fn] for fn in filenames if fn in name_to_file]
        name = (group_names[idx] if idx < len(group_names) else "") or f"group-{idx + 1}"
        splits.append(make_split(name, shard_files_info))

    if not splits or len(splits) <= 1:
        return None

    logger.info("split strategy=existing-shards doc_type=%s splits=%d", doc_type, len(splits))
    return SplitPlan(splits=splits, strategy="agent-applied")
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _emit_grouping_request(
    file_list: list[dict],
    module_dir: Path,
    doc_type: str,
    n_target: int,
    max_files: int,
    max_lines: int,
) -> SplitPlan:
    """Write a grouping request JSON for Agent to process.

    The request is written to
    ``<module_dir>/.meta/split-requests/<doc_type>-grouping-request.json``
    (the directory is created if missing) and lists every file with its name,
    package, line count and byte count, the grouping constraints, and an
    example of the expected answer.

    Returns:
        A SplitPlan with no splits, strategy "agent-pending", the pending flag
        set and the request path recorded as a string.
    """
    request_path = module_dir / ".meta" / "split-requests" / f"{doc_type}-grouping-request.json"
    request_path.parent.mkdir(parents=True, exist_ok=True)

    constraints = {
        # The per-group file limit sent to the Agent never exceeds 80.
        "max_files_per_group": min(max_files, 80),
        "max_lines_per_group": max_lines,
        "max_imbalance_ratio": 3.0,
        "all_files_must_be_assigned": True,
        "no_duplicate_assignment": True,
    }
    expected_shape = {
        "description": "JSON array of groups",
        "example": [
            {"name": "order-lifecycle", "files": ["OrderServiceImpl.java", "OrderProcessor.java"]},
            {"name": "payment", "files": ["PaymentServiceImpl.java"]},
        ],
    }
    payload = {
        "doc_type": doc_type,
        "n_target": n_target,
        "constraints": constraints,
        "files": [
            {
                "name": entry.get("name", ""),
                "package": entry.get("package", ""),
                "lines": entry.get("lines", 0),
                "bytes": entry.get("bytes", 0),
            }
            for entry in file_list
        ],
        "output_format": expected_shape,
    }

    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    request_path.write_text(serialized, encoding="utf-8")
    logger.info("Agent grouping request written to %s", request_path)

    return SplitPlan(
        splits=[],
        strategy="agent-pending",
        pending_agent_grouping=True,
        grouping_request_path=str(request_path),
    )
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def _match_llm_file_name(fname: str, name_to_file: dict[str, dict]) -> dict | None:
    """Resolve one file token from an LLM answer to a file entry, or None."""
    if not fname:
        # Empty token (e.g. trailing comma): never guess a file for it.
        return None
    if fname in name_to_file:
        return name_to_file[fname]

    # Qualified form as shown in the prompt, e.g. "com.acme.OrderService.java".
    for name, entry in name_to_file.items():
        if name and fname.endswith("." + name):
            return entry

    class_name = fname.rsplit(".", 1)[-1]
    if not class_name:
        return None

    # Bare or qualified class name without extension: prefer an exact match...
    for name, entry in name_to_file.items():
        if name == class_name or name == class_name + ".java":
            return entry
    # ...then fall back to the original loose suffix match.
    for name, entry in name_to_file.items():
        if name.endswith(class_name + ".java"):
            return entry
    return None


def _parse_llm_groups(content: str, file_list: list[dict]) -> list[list[dict]] | None:
    """Parse LLM grouping response into file groups.

    Each line of the form ``GroupName: File1, File2, ...`` yields one group;
    lines without a colon are ignored. Anything from "(" onward in a token
    (such as "(12 lines)") is dropped. A file is placed only in the first
    group that names it, and tokens that match no known file are skipped.

    Args:
        content: Raw text returned by the LLM.
        file_list: Known files; tokens are matched against their "name".

    Returns:
        The non-empty groups in response order, or None when fewer than two
        groups were recognised.
    """
    name_to_file = {f.get("name", ""): f for f in file_list}
    groups: list[list[dict]] = []
    taken: set[int] = set()  # id() of entries already placed in a group

    for line in content.strip().splitlines():
        if ":" not in line:
            continue
        _, files_str = line.split(":", 1)
        group: list[dict] = []
        for token in files_str.split(","):
            fname = token.strip().split("(")[0].strip()
            entry = _match_llm_file_name(fname, name_to_file)
            if entry is None or id(entry) in taken:
                continue
            taken.add(id(entry))
            group.append(entry)
        if group:
            groups.append(group)

    return groups if len(groups) > 1 else None
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _assign_by_package(unassigned: list[dict], groups: list[list[dict]]) -> None:
    """Assign unassigned files to the group with the most package overlap.

    Every group is described by the packages of its members plus all of their
    parent packages. A file's score against a group is the number of its own
    packages (itself and its parents) found in that description. The file is
    appended to the highest-scoring group, with ties going to the earliest
    group, and its packages are then added to that group's description so
    later files can match against them. ``groups`` is modified in place.
    """

    def _lineage(package: str) -> list[str]:
        # "a.b.c" -> ["a.b.c", "a.b", "a"]; empty package -> [].
        chain: list[str] = []
        while package:
            chain.append(package)
            package = package.rsplit(".", 1)[0] if "." in package else ""
        return chain

    known_packages: list[set[str]] = [
        {ancestor for member in grp for ancestor in _lineage(member.get("package", ""))}
        for grp in groups
    ]

    for entry in unassigned:
        lineage = _lineage(entry.get("package", ""))

        def _overlap(position: int) -> int:
            seen = known_packages[position]
            return sum(1 for ancestor in lineage if ancestor in seen)

        # max() keeps the first of equally good candidates, i.e. the earliest group.
        target = max(range(len(known_packages)), key=_overlap, default=0)
        groups[target].append(entry)
        known_packages[target].update(lineage)
|
core/utils.py
ADDED
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
"""Shared utility functions — text sanitization, subprocess safety, path validation.
|
|
2
|
+
|
|
3
|
+
Pure utility functions with no LLM calls or CLI I/O.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
from core.utils import sanitize_llm_input, safe_subprocess_run, validate_path_within_bounds
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
import subprocess
|
|
13
|
+
import sys
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
# Path validation
|
|
19
|
+
# ---------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def validate_path_within_bounds(path: Path, base: Path) -> bool:
    """Security check: ensure resolved path doesn't escape base directory.

    Both paths are resolved first, then compared component by component, so a
    sibling directory that merely shares a name prefix with ``base`` (for
    example ``/data-old`` against ``/data``) is not treated as being inside it.
    On Windows the comparison ignores case.

    Args:
        path: Path to validate (may be relative or absolute).
        base: Base directory that path must stay within.

    Returns:
        True if path is within bounds (or equal to base), False otherwise,
        including when either path cannot be resolved.
    """
    try:
        resolved = path.resolve()
        base_resolved = base.resolve()
    except (OSError, ValueError, RuntimeError):
        # RuntimeError: symlink loop on older Python versions. Fail closed.
        return False

    if sys.platform == "win32":
        # Fold case for the comparison, but still compare whole components
        # rather than a raw string prefix.
        resolved = Path(str(resolved).lower())
        base_resolved = Path(str(base_resolved).lower())

    try:
        resolved.relative_to(base_resolved)
    except ValueError:
        return False
    return True
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
# Text sanitization for LLM input
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
# Phrases commonly used in prompt-injection attempts; each match is replaced.
_INJECTION_PATTERNS = [
    re.compile(r"(?i)ignore\s+(all\s+)?previous\s+instructions"),
    re.compile(r"(?i)^system\s*:", re.MULTILINE),
    re.compile(r"(?i)you\s+are\s+now\s+"),
]

# Upper bound on the text kept before the truncation marker is appended.
MAX_LLM_INPUT_CHARS = 200_000


def sanitize_llm_input(text: str) -> str:
    """Clean text for LLM input: strip null bytes, cap the length, filter injections.

    Steps, in order:
    - Removes null bytes
    - Truncates to MAX_LLM_INPUT_CHARS and appends a "[truncated]" line
    - Replaces common prompt injection patterns with "[filtered]"

    Args:
        text: Raw text to sanitize.

    Returns:
        The sanitized text.
    """
    cleaned = text.replace("\0", "")

    # A non-empty remainder past the limit means the text was too long.
    head, overflow = cleaned[:MAX_LLM_INPUT_CHARS], cleaned[MAX_LLM_INPUT_CHARS:]
    if overflow:
        cleaned = head + "\n[truncated]"

    for pattern in _INJECTION_PATTERNS:
        cleaned = pattern.sub("[filtered]", cleaned)
    return cleaned
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# ---------------------------------------------------------------------------
|
|
89
|
+
# Safe subprocess execution
|
|
90
|
+
# ---------------------------------------------------------------------------
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def safe_subprocess_run(
    cmd: list[str | Path],
    *,
    timeout: int = 120,
    capture: bool = True,
    env: dict[str, str] | None = None,
    cwd: Path | str | None = None,
) -> subprocess.CompletedProcess:
    """Run a command as an argument list with a timeout and UTF-8 decoding.

    Path entries in ``cmd`` are resolved to absolute paths; every other entry
    is converted with ``str()``. Output is decoded as UTF-8 with undecodable
    bytes replaced.

    Args:
        cmd: Command and arguments (Path objects are resolved to absolute).
        timeout: Maximum execution time in seconds.
        capture: Whether to capture stdout/stderr.
        env: Optional environment variables override.
        cwd: Optional working directory.

    Returns:
        CompletedProcess result.

    Raises:
        subprocess.TimeoutExpired: If command exceeds timeout.
        FileNotFoundError: If command executable not found.
    """
    argv: list[str] = [
        str(part.resolve()) if isinstance(part, Path) else str(part)
        for part in cmd
    ]
    working_dir = str(cwd) if cwd else None

    return subprocess.run(
        argv,
        capture_output=capture,
        encoding="utf-8",
        errors="replace",
        timeout=timeout,
        env=env,
        cwd=working_dir,
    )
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""core.validators — document quality validators (registry pattern).
|
|
2
|
+
|
|
3
|
+
Each validator implements the Validator ABC and is registered by name.
|
|
4
|
+
Validators are pure functions: they inspect files and return structured results
|
|
5
|
+
without performing any remediation.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
from core.validators import get_all_validators, run_all
|
|
9
|
+
|
|
10
|
+
results = run_all(module_dir, preset_name="java-spring")
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from core.interfaces import Validator, ValidationResult
|
|
19
|
+
|
|
20
|
+
# Validator classes keyed by their ``name`` attribute.
_REGISTRY: dict[str, type[Validator]] = {}


def register_validator(cls: type[Validator]) -> type[Validator]:
    """Decorator to register a validator class.

    Stores ``cls`` in the registry under ``cls.name``, replacing any class
    already registered under that name, and returns ``cls`` unchanged.
    """
    _REGISTRY.update({cls.name: cls})
    return cls
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_all_validators() -> list[Validator]:
    """Return instances of all registered validators.

    Triggers loading of the validator modules, then creates one fresh
    instance per registered class, in registry order.
    """
    _ensure_loaded()
    instances: list[Validator] = []
    for validator_cls in _REGISTRY.values():
        instances.append(validator_cls())
    return instances
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_validator(name: str) -> Validator:
    """Get a specific validator by name.

    Args:
        name: Registered name of the validator.

    Returns:
        A new instance of the validator class registered under ``name``.

    Raises:
        KeyError: If no validator is registered under ``name``; the message
            lists the available names.
    """
    # Load the validator modules first, as get_all_validators() and run_all()
    # do, so a lookup works even when it is the first call into this module.
    _ensure_loaded()
    try:
        validator_cls = _REGISTRY[name]
    except KeyError:
        raise KeyError(
            f"Validator '{name}' not found. Available: {sorted(_REGISTRY.keys())}"
        ) from None
    return validator_cls()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def run_all(module_dir: Path, **kwargs: Any) -> ValidationResult:
    """Run all registered validators and merge results.

    Each validator is called with ``module_dir`` and the given keyword
    arguments. A validator that raises does not stop the run: the exception is
    recorded in the merged result's errors as ``"<validator name>: <message>"``.

    Args:
        module_dir: Directory passed to every validator.
        **kwargs: Extra keyword arguments forwarded to every validator.

    Returns:
        The merged ValidationResult.
    """
    _ensure_loaded()
    merged = ValidationResult()
    for checker in get_all_validators():
        try:
            merged = merged.merge(checker.validate(module_dir, **kwargs))
        except Exception as exc:
            merged.errors.append(f"{checker.name}: {exc}")
    return merged
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _ensure_loaded() -> None:
    """Import all validator modules to trigger registration.

    There is deliberately no "registry is non-empty" shortcut: importing a
    single validator module directly (as the CLI sub-commands do) registers
    only that validator, and such a shortcut would then skip the remaining
    modules. Re-importing an already imported module is cheap, so the imports
    run on every call.
    """
    # Import all validator modules so @register_validator decorators fire
    from core.validators import duplicates  # noqa: F401
    from core.validators import links  # noqa: F401
    from core.validators import sampling  # noqa: F401
    from core.validators import structure  # noqa: F401
    from core.validators import coverage  # noqa: F401
    from core.validators import consistency  # noqa: F401
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""CLI entry points for core/validators tools.
|
|
2
|
+
|
|
3
|
+
Provides sub-commands for Agent mode to call directly:
|
|
4
|
+
python -m core.validators coverage ...
|
|
5
|
+
python -m core.validators consistency ...
|
|
6
|
+
python -m core.validators check-progress ...
|
|
7
|
+
python -m core.validators links ...
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import json
|
|
14
|
+
import sys
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
# Force UTF-8 output for the status glyphs printed below. Skipped when stdout
# has been replaced by an object without reconfigure().
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def cmd_coverage(args: argparse.Namespace) -> None:
    """Run coverage check.

    Validates the docs directory (``--docs-dir``, falling back to
    ``--module-dir``) with CoverageValidator, prints any errors and warnings,
    then prints a one-line JSON summary with the status and both counts.
    Exits with status 1 when neither directory option was given.
    """
    from core.validators.coverage import CoverageValidator

    def _optional_path(raw):
        # Empty or missing option values stay None.
        return Path(raw) if raw else None

    module_dir = _optional_path(args.module_dir)
    skeleton_path = _optional_path(args.skeleton)
    skeleton_dir = _optional_path(args.skeleton_dir)
    docs_dir = _optional_path(args.docs_dir) or module_dir

    if not docs_dir:
        print("Error: --docs-dir or --module-dir required", file=sys.stderr)
        sys.exit(1)

    outcome = CoverageValidator().validate(
        docs_dir,
        skeleton_path=str(skeleton_path) if skeleton_path else None,
        skeleton_dir=str(skeleton_dir) if skeleton_dir else None,
        module_type=args.type or "service",
    )

    if outcome.errors:
        print("ERRORS:")
        for problem in outcome.errors:
            print(f" ❌ {problem}")
    if outcome.warnings:
        print("WARNINGS:")
        for note in outcome.warnings:
            print(f" ⚠️ {note}")

    summary = {
        "status": "ok" if outcome.passed else "fail",
        "errors": len(outcome.errors),
        "warnings": len(outcome.warnings),
    }
    print(json.dumps(summary, ensure_ascii=False))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def cmd_consistency(args: argparse.Namespace) -> None:
    """Run consistency check (progress monitoring + reference validation).

    With ``--cleanup`` only the progress files are cleaned and reported.
    Otherwise the current progress is printed, ConsistencyValidator is run on
    the module directory, and errors and warnings are listed before a one-line
    JSON summary (status, counts, progress).
    """
    from core.validators.consistency import ConsistencyValidator
    from core.monitor.progress import check_progress, cleanup_progress

    module_dir = Path(args.module_dir)

    if args.cleanup:
        cleaned = cleanup_progress(module_dir)
        print(f"Cleaned {cleaned} progress files")
        print(json.dumps({"status": "ok", "cleaned": cleaned}, ensure_ascii=False))
        return

    # Progress check
    progress = check_progress(module_dir)
    if progress:
        print("Progress status:")
        for doc_name, status in progress.items():
            print(f" {doc_name}: {status}")

    # Consistency validation
    validator = ConsistencyValidator()
    source_cache = Path(args.source_cache) if args.source_cache else None
    result = validator.validate(module_dir, source_cache=source_cache)

    # List the errors as the other sub-commands do; they are counted in the
    # JSON summary, so they should also be readable.
    if result.errors:
        print("\nConsistency errors:")
        for e in result.errors:
            print(f" ❌ {e}")
    if result.warnings:
        print("\nConsistency warnings:")
        for w in result.warnings:
            print(f" ⚠️ {w}")

    status = "ok" if result.passed else "fail"
    print(json.dumps({"status": status, "errors": len(result.errors),
                      "warnings": len(result.warnings),
                      "progress": progress or {}}, ensure_ascii=False))
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def cmd_links(args: argparse.Namespace) -> None:
    """Run link/embedding health check.

    Runs LinksValidator on ``--module-dir``, prints errors under a
    "Broken links:" heading, prints warnings, then prints a one-line JSON
    summary. Exits with status 1 when ``--module-dir`` is missing.
    """
    from core.validators.links import LinksValidator

    if not args.module_dir:
        print("Error: --module-dir required", file=sys.stderr)
        sys.exit(1)
    module_dir = Path(args.module_dir)

    outcome = LinksValidator().validate(module_dir)

    if outcome.errors:
        print("Broken links:")
        for problem in outcome.errors:
            print(f" ❌ {problem}")
    # Warnings are listed without a heading.
    for note in outcome.warnings or ():
        print(f" ⚠️ {note}")

    summary = {
        "status": "ok" if outcome.passed else "fail",
        "errors": len(outcome.errors),
        "warnings": len(outcome.warnings),
    }
    print(json.dumps(summary, ensure_ascii=False))
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def cmd_structure(args: argparse.Namespace) -> None:
    """Run structure/output validation.

    Runs ``StructureValidator`` on ``args.module_dir``, passing the optional
    ``args.skeleton_dir`` as a ``Path`` (or ``None`` when it is not given),
    lists errors and warnings, and prints a one-line JSON summary to stdout.
    """
    from core.validators.structure import StructureValidator

    target_dir = Path(args.module_dir)
    skeleton_dir = None
    if args.skeleton_dir:
        skeleton_dir = Path(args.skeleton_dir)

    outcome = StructureValidator().validate(target_dir, skeleton_dir=skeleton_dir)

    if outcome.errors:
        print("Structure errors:")
        for problem in outcome.errors:
            print(f" ❌ {problem}")
    if outcome.warnings:
        for notice in outcome.warnings:
            print(f" ⚠️ {notice}")

    summary = {
        "status": "ok" if outcome.passed else "fail",
        "errors": len(outcome.errors),
        "warnings": len(outcome.warnings),
    }
    print(json.dumps(summary, ensure_ascii=False))
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def cmd_sampling(args: argparse.Namespace) -> None:
    """Run numeric sampling validation (spot-check enum values and field names).

    Runs ``SamplingValidator`` on ``args.module_dir``, lists its warnings,
    and reports a hit rate computed against a fixed budget of 10 samples,
    where every warning counts as one missed sample. The hit rate is clamped
    at 0% so that more warnings than samples cannot produce a negative value.

    Args:
        args: Parsed CLI arguments; reads ``module_dir``.
    """
    from core.validators.sampling import SamplingValidator

    module_dir = Path(args.module_dir)
    validator = SamplingValidator()
    result = validator.validate(module_dir)

    if result.warnings:
        print("Sampling warnings:")
        for w in result.warnings:
            print(f" ⚠️ {w}")

    total = 10  # max 10 samples (5 enum + 5 field)
    missed = len(result.warnings)
    # Clamp at zero: a warning count above the sample budget would otherwise
    # yield a negative hit count and a negative percentage.
    hit_count = max(0, total - missed)
    hit_rate = round(hit_count / total * 100, 1)

    if not result.warnings:
        print(f"Numeric sampling validation passed (hit rate {hit_rate}%)")
    else:
        print(f"Numeric sampling hit rate {hit_rate}% ({missed} items missed)")

    status = "ok" if result.passed else "warn"
    # NOTE(review): unlike the sibling commands, this JSON summary is written
    # to stderr rather than stdout - confirm that this is intentional.
    print(json.dumps({"status": status, "warnings": missed,
                      "hit_rate_pct": hit_rate}, ensure_ascii=False), file=sys.stderr)
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def main():
|
|
167
|
+
parser = argparse.ArgumentParser(prog="python -m core.validators", description="Validation tools")
|
|
168
|
+
sub = parser.add_subparsers(dest="command")
|
|
169
|
+
|
|
170
|
+
# coverage
|
|
171
|
+
p = sub.add_parser("coverage", help="Coverage check")
|
|
172
|
+
p.add_argument("action", nargs="?", default="check", choices=["check"])
|
|
173
|
+
p.add_argument("--skeleton", help="Skeleton JSON path")
|
|
174
|
+
p.add_argument("--skeleton-dir", help="Skeleton shards directory")
|
|
175
|
+
p.add_argument("--module-dir", help="Module directory")
|
|
176
|
+
p.add_argument("--docs-dir", help="Documents directory")
|
|
177
|
+
p.add_argument("--type", default="service", help="Module type")
|
|
178
|
+
|
|
179
|
+
# consistency (replaces check_progress)
|
|
180
|
+
p = sub.add_parser("consistency", help="Consistency and progress check")
|
|
181
|
+
p.add_argument("--module-dir", required=True, help="Module directory")
|
|
182
|
+
p.add_argument("--preset", help="Preset name")
|
|
183
|
+
p.add_argument("--source-cache", help="Source cache path")
|
|
184
|
+
p.add_argument("--cleanup", action="store_true", help="Clean up progress files")
|
|
185
|
+
|
|
186
|
+
# links (replaces embedding_health)
|
|
187
|
+
p = sub.add_parser("links", help="Link and reference health check")
|
|
188
|
+
p.add_argument("--module-dir", help="Module directory")
|
|
189
|
+
p.add_argument("--config", help="kb-project.yaml path")
|
|
190
|
+
|
|
191
|
+
# structure (replaces validate_output)
|
|
192
|
+
p = sub.add_parser("structure", help="Document structure validation")
|
|
193
|
+
p.add_argument("--module-dir", required=True, help="Module directory")
|
|
194
|
+
p.add_argument("--skeleton-dir", help="Skeleton directory for comparison")
|
|
195
|
+
p.add_argument("--kb-dir", help="Knowledge base directory (validate all modules)")
|
|
196
|
+
|
|
197
|
+
# sampling (numeric spot-check)
|
|
198
|
+
p = sub.add_parser("sampling", help="Numeric sampling validation (enum values + field names)")
|
|
199
|
+
p.add_argument("--module-dir", required=True, help="Module directory")
|
|
200
|
+
|
|
201
|
+
args = parser.parse_args()
|
|
202
|
+
if not args.command:
|
|
203
|
+
parser.print_help()
|
|
204
|
+
sys.exit(1)
|
|
205
|
+
|
|
206
|
+
commands = {
|
|
207
|
+
"coverage": cmd_coverage, "consistency": cmd_consistency,
|
|
208
|
+
"links": cmd_links, "structure": cmd_structure,
|
|
209
|
+
"sampling": cmd_sampling,
|
|
210
|
+
}
|
|
211
|
+
commands[args.command](args)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|