source-kb 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. cli/__init__.py +50 -0
  2. cli/__main__.py +5 -0
  3. cli/commands/__init__.py +1 -0
  4. cli/commands/anchor_fix.py +47 -0
  5. cli/commands/diff_doc.py +52 -0
  6. cli/commands/dispatch.py +77 -0
  7. cli/commands/extract.py +72 -0
  8. cli/commands/file_list.py +74 -0
  9. cli/commands/index.py +84 -0
  10. cli/commands/lock.py +89 -0
  11. cli/commands/merge.py +60 -0
  12. cli/commands/merge_delta.py +19 -0
  13. cli/commands/metadata.py +24 -0
  14. cli/commands/pipeline.py +45 -0
  15. cli/commands/post_merge.py +43 -0
  16. cli/commands/query.py +52 -0
  17. cli/commands/render.py +101 -0
  18. cli/commands/scan_repos.py +46 -0
  19. cli/commands/setup.py +94 -0
  20. cli/commands/split.py +196 -0
  21. cli/commands/stale_files.py +98 -0
  22. cli/commands/validate.py +191 -0
  23. core/__init__.py +32 -0
  24. core/config.py +261 -0
  25. core/docs/__init__.py +7 -0
  26. core/docs/section_updater.py +286 -0
  27. core/docs/shared.py +149 -0
  28. core/git.py +294 -0
  29. core/interfaces.py +249 -0
  30. core/monitor/__init__.py +5 -0
  31. core/monitor/progress.py +83 -0
  32. core/monitor/prompt_store.py +49 -0
  33. core/paths.py +141 -0
  34. core/preset.py +237 -0
  35. core/preset_accessors.py +202 -0
  36. core/preset_classify.py +132 -0
  37. core/preset_hooks.py +129 -0
  38. core/preset_profile.py +89 -0
  39. core/prompt/__init__.py +7 -0
  40. core/prompt/__main__.py +147 -0
  41. core/prompt/content.py +320 -0
  42. core/prompt/context_manager.py +164 -0
  43. core/prompt/renderer.py +236 -0
  44. core/prompt/response_parser.py +274 -0
  45. core/prompt/templates.py +357 -0
  46. core/prompt/validate_parity.py +162 -0
  47. core/prompt/variables.py +339 -0
  48. core/rag/__init__.py +22 -0
  49. core/rag/__main__.py +136 -0
  50. core/rag/bm25_index.py +268 -0
  51. core/rag/chunker.py +273 -0
  52. core/rag/embedder.py +151 -0
  53. core/rag/indexer.py +292 -0
  54. core/rag/loader.py +89 -0
  55. core/rag/retriever.py +82 -0
  56. core/skeleton/__init__.py +11 -0
  57. core/skeleton/__main__.py +934 -0
  58. core/skeleton/anchor_fix.py +250 -0
  59. core/skeleton/classify.py +331 -0
  60. core/skeleton/cmd_anchor_fix.py +43 -0
  61. core/skeleton/cmd_diff_doc.py +44 -0
  62. core/skeleton/cmd_lock.py +87 -0
  63. core/skeleton/cmd_merge_delta.py +41 -0
  64. core/skeleton/community.py +233 -0
  65. core/skeleton/dependency_graph.py +306 -0
  66. core/skeleton/diff_doc.py +248 -0
  67. core/skeleton/dispatch.py +273 -0
  68. core/skeleton/dispatch_render.py +319 -0
  69. core/skeleton/dispatch_source.py +111 -0
  70. core/skeleton/extract.py +218 -0
  71. core/skeleton/extract_methods.py +298 -0
  72. core/skeleton/file_list.py +239 -0
  73. core/skeleton/impact.py +278 -0
  74. core/skeleton/jar_download.py +177 -0
  75. core/skeleton/jar_resolver.py +186 -0
  76. core/skeleton/loader.py +162 -0
  77. core/skeleton/merge.py +278 -0
  78. core/skeleton/merge_delta.py +229 -0
  79. core/skeleton/metadata.py +96 -0
  80. core/skeleton/metadata_builders.py +264 -0
  81. core/skeleton/module_dag.py +330 -0
  82. core/skeleton/parsers/__init__.py +71 -0
  83. core/skeleton/parsers/jqassistant.py +300 -0
  84. core/skeleton/parsers/jqassistant_cypher.py +225 -0
  85. core/skeleton/parsers/regex.py +171 -0
  86. core/skeleton/parsers/treesitter.py +324 -0
  87. core/skeleton/parsers/treesitter_java.py +284 -0
  88. core/skeleton/parsers/treesitter_multi.py +289 -0
  89. core/skeleton/pom_parser.py +299 -0
  90. core/skeleton/post_merge.py +295 -0
  91. core/skeleton/post_merge_llm.py +82 -0
  92. core/skeleton/query.py +195 -0
  93. core/skeleton/shard_context.py +177 -0
  94. core/skeleton/split.py +180 -0
  95. core/skeleton/split_cache.py +107 -0
  96. core/skeleton/split_feedback.py +174 -0
  97. core/skeleton/split_plan.py +219 -0
  98. core/skeleton/split_plan_helpers.py +305 -0
  99. core/skeleton/split_plan_llm.py +274 -0
  100. core/utils.py +135 -0
  101. core/validators/__init__.py +65 -0
  102. core/validators/__main__.py +215 -0
  103. core/validators/consistency.py +203 -0
  104. core/validators/coverage.py +171 -0
  105. core/validators/duplicates.py +76 -0
  106. core/validators/engine.py +224 -0
  107. core/validators/links.py +76 -0
  108. core/validators/sampling.py +169 -0
  109. core/validators/structure.py +144 -0
  110. engine/__init__.py +7 -0
  111. engine/assembler.py +231 -0
  112. engine/confirm.py +65 -0
  113. engine/dedup.py +106 -0
  114. engine/main.py +211 -0
  115. engine/pipeline/__init__.py +163 -0
  116. engine/pipeline/recovery.py +250 -0
  117. engine/pipeline/steps/__init__.py +23 -0
  118. engine/pipeline/steps/audit.py +220 -0
  119. engine/pipeline/steps/audit_apply.py +195 -0
  120. engine/pipeline/steps/audit_helpers.py +155 -0
  121. engine/pipeline/steps/classify_llm.py +236 -0
  122. engine/pipeline/steps/classify_prompt.py +223 -0
  123. engine/pipeline/steps/finalize.py +160 -0
  124. engine/pipeline/steps/generate.py +169 -0
  125. engine/pipeline/steps/generate_batch.py +197 -0
  126. engine/pipeline/steps/generate_recovery.py +170 -0
  127. engine/pipeline/steps/llm_plan_split.py +253 -0
  128. engine/pipeline/steps/lock.py +64 -0
  129. engine/pipeline/steps/preflight.py +237 -0
  130. engine/pipeline/steps/preflight_adjust.py +147 -0
  131. engine/pipeline/steps/pregenerate.py +130 -0
  132. engine/pipeline/steps/quality.py +81 -0
  133. engine/pipeline/steps/skeleton.py +149 -0
  134. engine/pipeline/steps/source.py +163 -0
  135. engine/pipeline/steps/sync.py +117 -0
  136. engine/pipeline/steps/sync_finalize.py +237 -0
  137. engine/pipeline/steps/sync_update.py +341 -0
  138. engine/pipelines.py +91 -0
  139. engine/runner.py +335 -0
  140. engine/strategies/__init__.py +86 -0
  141. engine/strategies/api.py +128 -0
  142. engine/strategies/delegated.py +50 -0
  143. engine/strategies/dryrun.py +25 -0
  144. engine/two_phase.py +143 -0
  145. mcp_server/__init__.py +73 -0
  146. mcp_server/__main__.py +5 -0
  147. mcp_server/tools/__init__.py +1 -0
  148. mcp_server/tools/config.py +63 -0
  149. mcp_server/tools/discovery.py +276 -0
  150. mcp_server/tools/generation.py +184 -0
  151. mcp_server/tools/planning.py +144 -0
  152. mcp_server/tools/source.py +175 -0
  153. mcp_server/tools/validation.py +140 -0
  154. mcp_server/tools/workflow.py +166 -0
  155. mcp_server/workflow_loader.py +204 -0
  156. presets/generic/audit_dimensions.md +132 -0
  157. presets/generic/doc_types.yaml +152 -0
  158. presets/generic/preset.yaml +115 -0
  159. presets/java-spring/audit_dimensions.md +228 -0
  160. presets/java-spring/audit_dimensions.yaml +203 -0
  161. presets/java-spring/doc_types.yaml +269 -0
  162. presets/java-spring/hooks.py +122 -0
  163. presets/java-spring/preset.yaml +341 -0
  164. presets/java-spring/templates/README.md +34 -0
  165. presets/java-spring/templates/audit-system.md +15 -0
  166. presets/java-spring/templates/subagent-aop.md +105 -0
  167. presets/java-spring/templates/subagent-api.md +63 -0
  168. presets/java-spring/templates/subagent-architecture.md +111 -0
  169. presets/java-spring/templates/subagent-async-events.md +107 -0
  170. presets/java-spring/templates/subagent-audit-api-contracts.md +40 -0
  171. presets/java-spring/templates/subagent-audit-architecture.md +38 -0
  172. presets/java-spring/templates/subagent-audit-business.md +40 -0
  173. presets/java-spring/templates/subagent-audit-data-models.md +40 -0
  174. presets/java-spring/templates/subagent-business.md +129 -0
  175. presets/java-spring/templates/subagent-caching.md +75 -0
  176. presets/java-spring/templates/subagent-database-access.md +114 -0
  177. presets/java-spring/templates/subagent-enum.md +75 -0
  178. presets/java-spring/templates/subagent-error-handling.md +91 -0
  179. presets/java-spring/templates/subagent-external-integrations.md +80 -0
  180. presets/java-spring/templates/subagent-index.md +122 -0
  181. presets/java-spring/templates/subagent-messaging.md +97 -0
  182. presets/java-spring/templates/subagent-model.md +88 -0
  183. presets/java-spring/templates/subagent-observability.md +91 -0
  184. presets/java-spring/templates/subagent-scheduled.md +81 -0
  185. presets/java-spring/templates/subagent-security.md +102 -0
  186. presets/java-spring/templates/subagent-structure.md +101 -0
  187. presets/java-spring/templates/subagent-sync-section.md +34 -0
  188. presets/java-spring/templates/subagent-utils.md +73 -0
  189. presets/java-spring/templates/sync-system.md +8 -0
  190. presets/java-spring/workflow-extensions.md +112 -0
  191. skills/__init__.py +1 -0
  192. skills/_shared/README.md +30 -0
  193. skills/_shared/doc-coverage-shared.md +134 -0
  194. skills/_shared/doc-quality-standard.md +1058 -0
  195. skills/_shared/doc-subagent-rules.md +762 -0
  196. skills/_shared/windows-compat.md +89 -0
  197. skills/kb-audit/SKILL.md +52 -0
  198. skills/kb-audit/rules.md +88 -0
  199. skills/kb-audit/steps/step-01-prepare.md +75 -0
  200. skills/kb-audit/steps/step-02-audit.md +96 -0
  201. skills/kb-audit/steps/step-03-verify.md +65 -0
  202. skills/kb-audit/steps/step-04-report.md +64 -0
  203. skills/kb-init/SKILL.md +146 -0
  204. skills/kb-init/rules.md +187 -0
  205. skills/kb-init/steps/step-01-scope.md +62 -0
  206. skills/kb-init/steps/step-02-source.md +410 -0
  207. skills/kb-init/steps/step-03-generate.md +307 -0
  208. skills/kb-init/steps/step-04-quality.md +92 -0
  209. skills/kb-init/steps/step-05-finalize.md +132 -0
  210. skills/kb-init/templates/core/execution-modes.md +29 -0
  211. skills/kb-init/templates/core/output-only.md +4 -0
  212. skills/kb-init/templates/core/readwrite.md +33 -0
  213. skills/kb-search/SKILL.md +138 -0
  214. skills/kb-search/rules.md +64 -0
  215. skills/kb-sync/SKILL.md +43 -0
  216. skills/kb-sync/rules.md +70 -0
  217. skills/kb-sync/scripts/rebuild_module.py +91 -0
  218. skills/kb-sync/scripts/scan_repos.py +687 -0
  219. skills/kb-sync/steps/step-01-detect.md +72 -0
  220. skills/kb-sync/steps/step-02-update.md +71 -0
  221. skills/kb-sync/steps/step-03-verify.md +47 -0
  222. skills/kb-sync/steps/step-04-finalize.md +52 -0
  223. source_kb-0.2.2.dist-info/METADATA +194 -0
  224. source_kb-0.2.2.dist-info/RECORD +228 -0
  225. source_kb-0.2.2.dist-info/WHEEL +5 -0
  226. source_kb-0.2.2.dist-info/entry_points.txt +3 -0
  227. source_kb-0.2.2.dist-info/licenses/LICENSE +21 -0
  228. source_kb-0.2.2.dist-info/top_level.txt +6 -0
@@ -0,0 +1,274 @@
1
+ """Split plan LLM strategy — LLM-assisted business-domain grouping.
2
+
3
+ Handles both CLI mode (direct LLM call) and delegated mode (Agent dispatch).
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import json
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+ from core.skeleton.split import SplitConfig, SplitPlan
14
+ from core.skeleton.split_plan_helpers import (
15
+ make_split, derive_name, rebalance_groups,
16
+ )
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
def try_llm_split(
    file_list: list[dict],
    split_config: SplitConfig,
    doc_type: str,
    module_dir: Path | None,
    *,
    strategy: Any | None = None,
    is_delegated: bool = False,
) -> SplitPlan | None:
    """Try LLM-assisted business-domain grouping.

    In delegated (Agent) mode: reuses shard files left by a prior split-apply
    when they exist; otherwise writes a grouping request JSON for the Agent to
    process and returns a pending plan.

    In CLI mode: calls the provided LLM strategy directly, parses the answer
    into groups, assigns files the answer did not cover by package overlap,
    and rebalances the groups.

    Args:
        file_list: Files to group (dicts read via "name", "package", "lines", "bytes").
        split_config: Split configuration.
        doc_type: Document type being split.
        module_dir: Module directory for cache/shard files.
        strategy: Injected LlmStrategy instance (from engine/ caller). None = skip LLM.
        is_delegated: Whether running in delegated mode (Agent handles grouping).

    Returns:
        A SplitPlan, or None when ``module_dir`` is missing, when no strategy is
        given in CLI mode, when the LLM call fails or returns no usable content,
        or when fewer than two groups could be parsed.
    """
    if not module_dir:
        return None

    from core.skeleton.split import compute_splits

    max_files = split_config.max_files_per_shard
    total_lines = sum(f.get("lines", 0) for f in file_list)
    total_bytes = sum(f.get("bytes", 0) for f in file_list)
    # Always ask for at least two groups: a single group is not a split.
    n_target = max(2, compute_splits(
        split_config,
        total_bytes=total_bytes,
        total_lines=total_lines,
        file_count=len(file_list),
        doc_type=doc_type,
    ))
    max_lines = split_config.effective_max_lines(doc_type)

    if is_delegated:
        existing = _load_existing_shards(module_dir, doc_type, file_list)
        if existing:
            return existing
        return _emit_grouping_request(file_list, module_dir, doc_type, n_target, max_files, max_lines)

    # CLI mode: need a strategy to call LLM
    if strategy is None:
        return None

    # Only the first `llm_sample_limit` files are shown to the LLM; the rest are
    # attached to the resulting groups by package overlap further down.
    llm_sample_limit = split_config.llm_sample_limit
    sampled_files = file_list if len(file_list) <= llm_sample_limit else file_list[:llm_sample_limit]

    file_summaries = []
    for f in sampled_files:
        name = f.get("name", "")
        pkg = f.get("package", "")
        lines = f.get("lines", 0)
        # Avoid a leading "." when the file has no package.
        qualified = f"{pkg}.{name}" if pkg else name
        file_summaries.append(f"{qualified} ({lines} lines)")

    count_desc = f"{len(sampled_files)} Java files"
    if len(sampled_files) < len(file_list):
        count_desc += f" (sample of {len(file_list)} total)"
    prompt_user = (
        f"Group the following {count_desc} into {n_target} groups by business domain.\n"
        f"Output format per group: `GroupName: File1, File2, ...`\n"
        f"File list:\n" + "\n".join(file_summaries)
    )

    from core.interfaces import LlmRequest
    try:
        resp = strategy.call(LlmRequest(
            system=(
                "You are a Java project architect skilled at grouping source files by business domain. "
                "Output only the grouping result, no explanation."
            ),
            user=prompt_user,
            max_tokens=2000,
            temperature=0.1,
        ))
    except Exception as exc:
        # Best-effort: the caller treats None as "LLM grouping unavailable",
        # but the failure is logged so it can be diagnosed.
        logger.warning("LLM split call failed for doc_type=%s: %s", doc_type, exc)
        return None

    if resp.status != "done" or not resp.content:
        return None

    groups = _parse_llm_groups(resp.content, file_list)
    if not groups or len(groups) <= 1:
        return None

    # Identity-based bookkeeping: file dicts are not hashable.
    assigned = {id(f) for g in groups for f in g}
    unassigned = [f for f in file_list if id(f) not in assigned]
    if unassigned:
        _assign_by_package(unassigned, groups)

    rebalance_groups(groups, split_config.max_files_per_shard)

    splits = [make_split(derive_name(g, noise_words=split_config.noise_words), g) for g in groups]
    return SplitPlan(splits=splits, strategy="llm")
121
+
122
+
123
def _load_existing_shards(
    module_dir: Path,
    doc_type: str,
    file_list: list[dict],
) -> SplitPlan | None:
    """Load existing shard files produced by a prior split-apply.

    Reads ``<module_dir>/.meta/shards/<doc_type>-shard-*.txt`` (one file name
    per line) and, when present, group names from
    ``<module_dir>/.meta/split-requests/<doc_type>-groups.json``.

    Args:
        module_dir: Module directory holding the ``.meta`` folder.
        doc_type: Document type whose shards are looked up.
        file_list: Current files; shard entries are matched against their "name".

    Returns:
        A SplitPlan with strategy "agent-applied", or None when the shards
        directory is missing or holds fewer than two shards for ``doc_type``.
    """
    shards_dir = module_dir / ".meta" / "shards"
    if not shards_dir.is_dir():
        return None

    def _shard_order(path: Path) -> tuple[int, int, str]:
        # Order by the numeric suffix so "shard-10" does not sort before
        # "shard-2" when indices are not zero-padded; non-numeric suffixes
        # keep plain name order after the numbered shards.
        suffix = path.stem.rsplit("-", 1)[-1]
        if suffix.isdecimal():
            return (0, int(suffix), path.name)
        return (1, 0, path.name)

    shard_files = sorted(shards_dir.glob(f"{doc_type}-shard-*.txt"), key=_shard_order)
    if not shard_files:
        return None

    name_to_file = {f.get("name", ""): f for f in file_list}

    splits: list[dict[str, Any]] = []
    group_names: list[str] = []
    groups_path = module_dir / ".meta" / "split-requests" / f"{doc_type}-groups.json"
    if groups_path.exists():
        try:
            groups_data = json.loads(groups_path.read_text(encoding="utf-8"))
            group_names = [g.get("name", "") for g in groups_data]
        except (OSError, ValueError, TypeError, AttributeError) as exc:
            # Group names are optional; fall back to generated names but say why.
            logger.warning("Ignoring unreadable group names file %s: %s", groups_path, exc)

    for idx, shard_file in enumerate(shard_files):
        filenames = [
            ln.strip() for ln in shard_file.read_text(encoding="utf-8").splitlines()
            if ln.strip()
        ]
        # Entries that no longer exist in file_list are dropped.
        shard_files_info = [name_to_file[fn] for fn in filenames if fn in name_to_file]
        recorded = group_names[idx] if idx < len(group_names) else ""
        name = recorded or f"group-{idx + 1}"
        splits.append(make_split(name, shard_files_info))

    if len(splits) <= 1:
        return None

    logger.info("split strategy=existing-shards doc_type=%s splits=%d", doc_type, len(splits))
    return SplitPlan(splits=splits, strategy="agent-applied")
163
+
164
+
165
def _emit_grouping_request(
    file_list: list[dict],
    module_dir: Path,
    doc_type: str,
    n_target: int,
    max_files: int,
    max_lines: int,
) -> SplitPlan:
    """Persist a grouping request for the Agent and return a pending plan.

    The request is written to
    ``<module_dir>/.meta/split-requests/<doc_type>-grouping-request.json``
    (the directory is created when missing).

    Args:
        file_list: Files to describe in the request.
        module_dir: Module directory holding the ``.meta`` folder.
        doc_type: Document type being split.
        n_target: Number of groups requested.
        max_files: Per-group file limit; the request caps it at 80.
        max_lines: Per-group line limit.

    Returns:
        A SplitPlan with no splits, strategy "agent-pending", and the path of
        the written request.
    """
    out_dir = module_dir / ".meta" / "split-requests"
    out_dir.mkdir(parents=True, exist_ok=True)
    request_path = out_dir / f"{doc_type}-grouping-request.json"

    # Only the fields the Agent needs for grouping are exported.
    described_files = [
        {
            "name": entry.get("name", ""),
            "package": entry.get("package", ""),
            "lines": entry.get("lines", 0),
            "bytes": entry.get("bytes", 0),
        }
        for entry in file_list
    ]

    constraints = {
        "max_files_per_group": min(max_files, 80),
        "max_lines_per_group": max_lines,
        "max_imbalance_ratio": 3.0,
        "all_files_must_be_assigned": True,
        "no_duplicate_assignment": True,
    }

    expected_output = {
        "description": "JSON array of groups",
        "example": [
            {"name": "order-lifecycle", "files": ["OrderServiceImpl.java", "OrderProcessor.java"]},
            {"name": "payment", "files": ["PaymentServiceImpl.java"]},
        ],
    }

    payload = {
        "doc_type": doc_type,
        "n_target": n_target,
        "constraints": constraints,
        "files": described_files,
        "output_format": expected_output,
    }

    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    request_path.write_text(serialized, encoding="utf-8")
    logger.info("Agent grouping request written to %s", request_path)

    return SplitPlan(
        splits=[],
        strategy="agent-pending",
        pending_agent_grouping=True,
        grouping_request_path=str(request_path),
    )
216
+
217
+
218
def _parse_llm_groups(content: str, file_list: list[dict]) -> list[list[dict]] | None:
    """Parse an LLM grouping response into groups of file dicts.

    Each useful line has the form ``GroupName: File1, File2, ...``; lines
    without a colon are ignored and the group name itself is discarded. A
    token is matched against the "name" of the entries in ``file_list``:

    1. exactly;
    2. as a qualified name ending in ``.<name>`` (e.g. ``com.x.Order.java``);
    3. by its last dotted component, against ``<Class>.java`` or ``<Class>``.

    Empty tokens (trailing commas, preamble lines that end in ":") are
    skipped, surrounding backticks are stripped, and each file is placed in
    the first group that names it only.

    Args:
        content: Raw LLM response text.
        file_list: Files that may be referenced by the response.

    Returns:
        The non-empty groups in response order, or None when fewer than two
        groups were recognised.
    """
    name_to_file = {f.get("name", ""): f for f in file_list}
    # Identity-based: file dicts are not hashable.
    claimed: set[int] = set()
    groups: list[list[dict]] = []

    def _resolve(token: str) -> dict | None:
        if token in name_to_file:
            return name_to_file[token]
        # Qualified form "pkg.Name.ext": the known name is the dotted tail.
        for name, f in name_to_file.items():
            if name and token.endswith("." + name):
                return f
        class_name = token.rsplit(".", 1)[-1] if "." in token else token
        if not class_name:
            return None
        for name, f in name_to_file.items():
            if name.endswith(class_name + ".java") or name == class_name:
                return f
        return None

    for line in content.strip().splitlines():
        if ":" not in line:
            continue
        _, files_str = line.split(":", 1)
        group: list[dict] = []
        for raw in files_str.split(","):
            # Drop a trailing "(N lines)" annotation and markdown backticks.
            token = raw.split("(")[0].strip().strip("`").strip()
            if not token:
                continue
            match = _resolve(token)
            if match is None or id(match) in claimed:
                continue
            claimed.add(id(match))
            group.append(match)
        if group:
            groups.append(group)

    return groups if len(groups) > 1 else None
242
+
243
+
244
def _package_prefixes(package: str) -> list[str]:
    """Return ``package`` followed by each of its ancestors ("a.b.c", "a.b", "a")."""
    prefixes: list[str] = []
    while package:
        prefixes.append(package)
        # rpartition yields "" as head when no dot is left, which ends the walk.
        package = package.rpartition(".")[0]
    return prefixes


def _assign_by_package(unassigned: list[dict], groups: list[list[dict]]) -> None:
    """Assign unassigned files to the group with the most package overlap.

    Overlap is the number of the file's package prefixes (the package itself
    and every ancestor) that also occur among the prefixes of the group's
    files. Ties go to the earliest group. ``groups`` is mutated in place, and
    each assignment extends the chosen group's prefix set so later files see it.

    Args:
        unassigned: Files not yet in any group (read via "package").
        groups: Existing groups; must be non-empty when ``unassigned`` is non-empty.
    """
    group_packages: list[set[str]] = [
        {prefix for f in g for prefix in _package_prefixes(f.get("package", ""))}
        for g in groups
    ]

    for f in unassigned:
        prefixes = _package_prefixes(f.get("package", ""))
        best_idx = 0
        best_score = -1
        for i, pkgs in enumerate(group_packages):
            score = sum(1 for prefix in prefixes if prefix in pkgs)
            # Strict ">" keeps the earliest group on ties.
            if score > best_score:
                best_score = score
                best_idx = i
        groups[best_idx].append(f)
        group_packages[best_idx].update(prefixes)
core/utils.py ADDED
@@ -0,0 +1,135 @@
1
+ """Shared utility functions — text sanitization, subprocess safety, path validation.
2
+
3
+ Pure utility functions with no LLM calls or CLI I/O.
4
+
5
+ Usage:
6
+ from core.utils import sanitize_llm_input, safe_subprocess_run, validate_path_within_bounds
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ import subprocess
13
+ import sys
14
+ from pathlib import Path
15
+
16
+
17
+ # ---------------------------------------------------------------------------
18
+ # Path validation
19
+ # ---------------------------------------------------------------------------
20
+
21
+
22
def validate_path_within_bounds(path: Path, base: Path) -> bool:
    """Security check: ensure resolved path doesn't escape base directory.

    Both paths are resolved first, then containment is checked component by
    component. A plain string-prefix comparison is not used because it would
    accept a sibling such as ``/data/base_evil`` for base ``/data/base``.

    Args:
        path: Path to validate (may be relative or absolute).
        base: Base directory that path must stay within.

    Returns:
        True if path is within bounds (or equal to base), False otherwise,
        including when either path cannot be resolved.
    """
    try:
        resolved = path.resolve()
        base_resolved = base.resolve()
    except (OSError, ValueError, RuntimeError):
        # RuntimeError: symlink loops on older Python versions.
        return False

    # relative_to compares whole path components; pathlib documents Windows
    # path comparisons as case-insensitive, so no platform branch is needed.
    try:
        resolved.relative_to(base_resolved)
    except ValueError:
        return False
    return True
46
+
47
+
48
+ # ---------------------------------------------------------------------------
49
+ # Text sanitization for LLM input
50
+ # ---------------------------------------------------------------------------
51
+
52
# Case-insensitive phrases replaced with "[filtered]" before text reaches an LLM.
_INJECTION_PATTERNS = [
    re.compile(r"(?i)ignore\s+(all\s+)?previous\s+instructions"),
    # Anchored per line (MULTILINE) so only a leading "system:" is hit.
    re.compile(r"(?i)^system\s*:", re.MULTILINE),
    re.compile(r"(?i)you\s+are\s+now\s+"),
]

# Longest input kept by sanitize_llm_input before the truncation marker is added.
MAX_LLM_INPUT_CHARS = 200_000


def sanitize_llm_input(text: str) -> str:
    """Clean text before it is sent to an LLM.

    Applied in this order:

    1. null bytes are removed;
    2. text longer than MAX_LLM_INPUT_CHARS is cut to that length and
       "\\n[truncated]" is appended;
    3. every match of the patterns in _INJECTION_PATTERNS becomes "[filtered]".

    Args:
        text: Raw text to sanitize.

    Returns:
        The sanitized text.
    """
    cleaned = text.replace("\0", "")

    if len(cleaned) > MAX_LLM_INPUT_CHARS:
        cleaned = f"{cleaned[:MAX_LLM_INPUT_CHARS]}\n[truncated]"

    for pattern in _INJECTION_PATTERNS:
        cleaned = pattern.sub("[filtered]", cleaned)

    return cleaned
86
+
87
+
88
+ # ---------------------------------------------------------------------------
89
+ # Safe subprocess execution
90
+ # ---------------------------------------------------------------------------
91
+
92
+
93
def safe_subprocess_run(
    cmd: list[str | Path],
    *,
    timeout: int = 120,
    capture: bool = True,
    env: dict[str, str] | None = None,
    cwd: Path | str | None = None,
) -> subprocess.CompletedProcess:
    """Run a command via ``subprocess.run`` with a timeout and UTF-8 decoding.

    ``Path`` items in ``cmd`` are resolved to absolute paths (relative to the
    current process, not ``cwd``); every other item is passed through ``str``.
    Output is decoded as UTF-8 with undecodable bytes replaced.

    Args:
        cmd: Command and arguments (Path objects are resolved to absolute).
        timeout: Maximum execution time in seconds.
        capture: Whether to capture stdout/stderr.
        env: Optional environment variables override.
        cwd: Optional working directory.

    Returns:
        CompletedProcess result.

    Raises:
        subprocess.TimeoutExpired: If command exceeds timeout.
        FileNotFoundError: If command executable not found.
    """
    argv: list[str] = [
        str(part.resolve()) if isinstance(part, Path) else str(part)
        for part in cmd
    ]
    workdir = str(cwd) if cwd else None

    return subprocess.run(
        argv,
        capture_output=capture,
        encoding="utf-8",
        errors="replace",
        timeout=timeout,
        env=env,
        cwd=workdir,
    )
@@ -0,0 +1,65 @@
1
+ """core.validators — document quality validators (registry pattern).
2
+
3
+ Each validator implements the Validator ABC and is registered by name.
4
+ Validators are pure functions: they inspect files and return structured results
5
+ without performing any remediation.
6
+
7
+ Usage:
8
+ from core.validators import get_all_validators, run_all
9
+
10
+ results = run_all(module_dir, preset_name="java-spring")
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from pathlib import Path
16
+ from typing import Any
17
+
18
+ from core.interfaces import Validator, ValidationResult
19
+
20
# Validator classes keyed by their ``name`` attribute; filled by register_validator.
_REGISTRY: dict[str, type[Validator]] = {}


def register_validator(cls: type[Validator]) -> type[Validator]:
    """Class decorator that records ``cls`` in the registry under ``cls.name``.

    A later registration with the same name replaces the earlier one. The
    class is returned unchanged so the decorator is transparent.
    """
    registry_key = cls.name
    _REGISTRY[registry_key] = cls
    return cls
27
+
28
+
29
def get_all_validators() -> list[Validator]:
    """Instantiate every registered validator, in registration order.

    The validator modules are imported first so their classes are registered.
    """
    _ensure_loaded()
    instances: list[Validator] = []
    for validator_cls in _REGISTRY.values():
        instances.append(validator_cls())
    return instances
33
+
34
+
35
def get_validator(name: str) -> Validator:
    """Get a specific validator by name.

    Args:
        name: Registered validator name.

    Returns:
        A new instance of the matching validator class.

    Raises:
        KeyError: If no validator is registered under ``name``.
    """
    # Populate the registry first, like get_all_validators() and run_all() do;
    # otherwise a lookup made before any other call sees an empty registry.
    _ensure_loaded()
    if name not in _REGISTRY:
        raise KeyError(f"Validator '{name}' not found. Available: {sorted(_REGISTRY.keys())}")
    return _REGISTRY[name]()
40
+
41
+
42
def run_all(module_dir: Path, **kwargs: Any) -> ValidationResult:
    """Run every registered validator on ``module_dir`` and merge the results.

    A validator that raises does not stop the run: its exception is recorded
    in the combined result's errors as ``"<validator name>: <exception>"``.

    Args:
        module_dir: Directory handed to each validator.
        **kwargs: Extra keyword arguments forwarded to every ``validate`` call.

    Returns:
        The merged ValidationResult.
    """
    _ensure_loaded()
    merged = ValidationResult()
    for current in get_all_validators():
        try:
            outcome = current.validate(module_dir, **kwargs)
            merged = merged.merge(outcome)
        except Exception as exc:
            merged.errors.append(f"{current.name}: {exc}")
    return merged
53
+
54
+
55
def _ensure_loaded() -> None:
    """Import all validator modules to trigger registration.

    The imports run on every call. A non-empty registry is not used as a
    shortcut, because importing a single validator module directly (e.g.
    ``from core.validators.coverage import ...``) already registers that one
    class and would otherwise prevent the remaining modules from loading.
    Repeated imports are cheap: Python returns already-loaded modules from
    its module cache.
    """
    # Import all validator modules so @register_validator decorators fire
    from core.validators import duplicates  # noqa: F401
    from core.validators import links  # noqa: F401
    from core.validators import sampling  # noqa: F401
    from core.validators import structure  # noqa: F401
    from core.validators import coverage  # noqa: F401
    from core.validators import consistency  # noqa: F401
@@ -0,0 +1,215 @@
1
+ """CLI entry points for core/validators tools.
2
+
3
+ Provides sub-commands for Agent mode to call directly:
4
+ python -m core.validators coverage ...
5
+ python -m core.validators consistency ...
6
+ python -m core.validators check-progress ...
7
+ python -m core.validators links ...
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import json
14
+ import sys
15
+ from pathlib import Path
16
+
17
# Force UTF-8 output so non-ASCII status markers can be printed on consoles
# whose default encoding cannot represent them. Guarded because a replaced
# sys.stdout (for example an io.StringIO) has no reconfigure() method.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
18
+
19
+
20
def cmd_coverage(args: argparse.Namespace) -> None:
    """Run the coverage validator and print its findings plus a JSON summary.

    The documents directory is ``--docs-dir`` when given, otherwise
    ``--module-dir``; the command exits with status 1 when neither is set.
    Errors and warnings are listed first, then a JSON line with the status
    ("ok"/"fail") and both counts is printed.
    """
    from core.validators.coverage import CoverageValidator

    def _optional_path(raw: str | None) -> Path | None:
        # Empty or missing CLI values mean "not provided".
        return Path(raw) if raw else None

    module_dir = _optional_path(args.module_dir)
    skeleton_path = _optional_path(args.skeleton)
    skeleton_dir = _optional_path(args.skeleton_dir)
    docs_dir = Path(args.docs_dir) if args.docs_dir else module_dir

    if docs_dir is None:
        print("Error: --docs-dir or --module-dir required", file=sys.stderr)
        sys.exit(1)

    outcome = CoverageValidator().validate(
        docs_dir,
        skeleton_path=None if skeleton_path is None else str(skeleton_path),
        skeleton_dir=None if skeleton_dir is None else str(skeleton_dir),
        module_type=args.type or "service",
    )

    if outcome.errors:
        print("ERRORS:")
        for error in outcome.errors:
            print(f" ❌ {error}")
    if outcome.warnings:
        print("WARNINGS:")
        for warning in outcome.warnings:
            print(f" ⚠️ {warning}")

    summary = {
        "status": "ok" if outcome.passed else "fail",
        "errors": len(outcome.errors),
        "warnings": len(outcome.warnings),
    }
    print(json.dumps(summary, ensure_ascii=False))
53
+
54
+
55
def cmd_consistency(args: argparse.Namespace) -> None:
    """Run consistency check (progress monitoring + reference validation).

    With ``--cleanup`` only the progress files are cleaned and a JSON line
    with the number of cleaned files is printed. Otherwise the progress
    status is printed, the consistency validator runs, its errors and
    warnings are listed, and a JSON summary (status, counts, progress) is
    printed.
    """
    from core.validators.consistency import ConsistencyValidator
    from core.monitor.progress import check_progress, cleanup_progress

    module_dir = Path(args.module_dir)

    if args.cleanup:
        cleaned = cleanup_progress(module_dir)
        print(f"Cleaned {cleaned} progress files")
        print(json.dumps({"status": "ok", "cleaned": cleaned}, ensure_ascii=False))
        return

    # Progress check
    progress = check_progress(module_dir)
    if progress:
        print("Progress status:")
        for doc_name, status in progress.items():
            print(f" {doc_name}: {status}")

    # Consistency validation
    validator = ConsistencyValidator()
    source_cache = Path(args.source_cache) if args.source_cache else None
    result = validator.validate(module_dir, source_cache=source_cache)

    # List errors as the other sub-commands do, so a "fail" status is never
    # reported without the reasons behind it.
    if result.errors:
        print("\nConsistency errors:")
        for e in result.errors:
            print(f" ❌ {e}")
    if result.warnings:
        print("\nConsistency warnings:")
        for w in result.warnings:
            print(f" ⚠️ {w}")

    status = "ok" if result.passed else "fail"
    print(json.dumps({"status": status, "errors": len(result.errors),
                      "warnings": len(result.warnings),
                      "progress": progress or {}}, ensure_ascii=False))
89
+
90
+
91
def cmd_links(args: argparse.Namespace) -> None:
    """Run the links validator and print broken links plus a JSON summary.

    Exits with status 1 when ``--module-dir`` is not given. Errors are listed
    under a "Broken links:" heading, warnings without a heading, followed by
    a JSON line with the status ("ok"/"fail") and both counts.
    """
    from core.validators.links import LinksValidator

    if not args.module_dir:
        print("Error: --module-dir required", file=sys.stderr)
        sys.exit(1)
    target_dir = Path(args.module_dir)

    outcome = LinksValidator().validate(target_dir)

    if outcome.errors:
        print("Broken links:")
        for broken in outcome.errors:
            print(f" ❌ {broken}")
    for warning in outcome.warnings or ():
        print(f" ⚠️ {warning}")

    summary = {
        "status": "ok" if outcome.passed else "fail",
        "errors": len(outcome.errors),
        "warnings": len(outcome.warnings),
    }
    print(json.dumps(summary, ensure_ascii=False))
114
+
115
+
116
def cmd_structure(args: argparse.Namespace) -> None:
    """Run the structure validator and print its findings plus a JSON summary.

    ``--skeleton-dir`` is forwarded when given. Errors are listed under a
    "Structure errors:" heading, warnings without a heading, followed by a
    JSON line with the status ("ok"/"fail") and both counts.
    """
    from core.validators.structure import StructureValidator

    module_dir = Path(args.module_dir)
    skeleton_dir = None
    if args.skeleton_dir:
        skeleton_dir = Path(args.skeleton_dir)

    outcome = StructureValidator().validate(module_dir, skeleton_dir=skeleton_dir)

    if outcome.errors:
        print("Structure errors:")
        for problem in outcome.errors:
            print(f" ❌ {problem}")
    for warning in outcome.warnings or ():
        print(f" ⚠️ {warning}")

    summary = {
        "status": "ok" if outcome.passed else "fail",
        "errors": len(outcome.errors),
        "warnings": len(outcome.warnings),
    }
    print(json.dumps(summary, ensure_ascii=False))
137
+
138
+
139
def cmd_sampling(args: argparse.Namespace) -> None:
    """Run numeric sampling validation (spot-check enum values and field names).

    Prints the validator's warnings, a hit-rate line, and a JSON summary with
    the status ("ok"/"warn"), the warning count and the hit rate. Each
    warning counts as one missed sample out of a fixed total of 10.
    """
    from core.validators.sampling import SamplingValidator

    module_dir = Path(args.module_dir)
    validator = SamplingValidator()
    result = validator.validate(module_dir)

    if result.warnings:
        print("Sampling warnings:")
        for w in result.warnings:
            print(f" ⚠️ {w}")

    max_samples = 10  # max 10 samples (5 enum + 5 field)
    # Clamp at zero so more than `max_samples` warnings cannot produce a
    # negative hit rate.
    hit_count = max(0, max_samples - len(result.warnings))
    hit_rate = round(hit_count / max_samples * 100, 1)

    if not result.warnings:
        print(f"Numeric sampling validation passed (hit rate {hit_rate}%)")
    else:
        print(f"Numeric sampling hit rate {hit_rate}% ({len(result.warnings)} items missed)")

    status = "ok" if result.passed else "warn"
    # NOTE(review): this summary goes to stderr while the other sub-commands
    # print theirs to stdout — confirm whether callers depend on that.
    print(json.dumps({"status": status, "warnings": len(result.warnings),
                      "hit_rate_pct": hit_rate}, ensure_ascii=False), file=sys.stderr)
164
+
165
+
166
def main():
    """Parse the command line and dispatch to the selected validator command.

    Sub-commands: coverage, consistency, links, structure, sampling. When no
    sub-command is given the help text is printed and the process exits with
    status 1.
    """
    parser = argparse.ArgumentParser(prog="python -m core.validators", description="Validation tools")
    subparsers = parser.add_subparsers(dest="command")

    # coverage
    coverage = subparsers.add_parser("coverage", help="Coverage check")
    coverage.add_argument("action", nargs="?", default="check", choices=["check"])
    coverage.add_argument("--skeleton", help="Skeleton JSON path")
    coverage.add_argument("--skeleton-dir", help="Skeleton shards directory")
    coverage.add_argument("--module-dir", help="Module directory")
    coverage.add_argument("--docs-dir", help="Documents directory")
    coverage.add_argument("--type", default="service", help="Module type")

    # consistency (replaces check_progress)
    consistency = subparsers.add_parser("consistency", help="Consistency and progress check")
    consistency.add_argument("--module-dir", required=True, help="Module directory")
    consistency.add_argument("--preset", help="Preset name")
    consistency.add_argument("--source-cache", help="Source cache path")
    consistency.add_argument("--cleanup", action="store_true", help="Clean up progress files")

    # links (replaces embedding_health)
    links = subparsers.add_parser("links", help="Link and reference health check")
    links.add_argument("--module-dir", help="Module directory")
    links.add_argument("--config", help="kb-project.yaml path")

    # structure (replaces validate_output)
    structure = subparsers.add_parser("structure", help="Document structure validation")
    structure.add_argument("--module-dir", required=True, help="Module directory")
    structure.add_argument("--skeleton-dir", help="Skeleton directory for comparison")
    structure.add_argument("--kb-dir", help="Knowledge base directory (validate all modules)")

    # sampling (numeric spot-check)
    sampling = subparsers.add_parser("sampling", help="Numeric sampling validation (enum values + field names)")
    sampling.add_argument("--module-dir", required=True, help="Module directory")

    parsed = parser.parse_args()
    if not parsed.command:
        parser.print_help()
        sys.exit(1)

    # Built only after a command was chosen; maps sub-command name to handler.
    handlers = {
        "coverage": cmd_coverage,
        "consistency": cmd_consistency,
        "links": cmd_links,
        "structure": cmd_structure,
        "sampling": cmd_sampling,
    }
    handler = handlers[parsed.command]
    handler(parsed)


if __name__ == "__main__":
    main()