source-kb 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. cli/__init__.py +50 -0
  2. cli/__main__.py +5 -0
  3. cli/commands/__init__.py +1 -0
  4. cli/commands/anchor_fix.py +47 -0
  5. cli/commands/diff_doc.py +52 -0
  6. cli/commands/dispatch.py +77 -0
  7. cli/commands/extract.py +72 -0
  8. cli/commands/file_list.py +74 -0
  9. cli/commands/index.py +84 -0
  10. cli/commands/lock.py +89 -0
  11. cli/commands/merge.py +60 -0
  12. cli/commands/merge_delta.py +19 -0
  13. cli/commands/metadata.py +24 -0
  14. cli/commands/pipeline.py +45 -0
  15. cli/commands/post_merge.py +43 -0
  16. cli/commands/query.py +52 -0
  17. cli/commands/render.py +101 -0
  18. cli/commands/scan_repos.py +46 -0
  19. cli/commands/setup.py +94 -0
  20. cli/commands/split.py +196 -0
  21. cli/commands/stale_files.py +98 -0
  22. cli/commands/validate.py +191 -0
  23. core/__init__.py +32 -0
  24. core/config.py +261 -0
  25. core/docs/__init__.py +7 -0
  26. core/docs/section_updater.py +286 -0
  27. core/docs/shared.py +149 -0
  28. core/git.py +294 -0
  29. core/interfaces.py +249 -0
  30. core/monitor/__init__.py +5 -0
  31. core/monitor/progress.py +83 -0
  32. core/monitor/prompt_store.py +49 -0
  33. core/paths.py +141 -0
  34. core/preset.py +237 -0
  35. core/preset_accessors.py +202 -0
  36. core/preset_classify.py +132 -0
  37. core/preset_hooks.py +129 -0
  38. core/preset_profile.py +89 -0
  39. core/prompt/__init__.py +7 -0
  40. core/prompt/__main__.py +147 -0
  41. core/prompt/content.py +320 -0
  42. core/prompt/context_manager.py +164 -0
  43. core/prompt/renderer.py +236 -0
  44. core/prompt/response_parser.py +274 -0
  45. core/prompt/templates.py +357 -0
  46. core/prompt/validate_parity.py +162 -0
  47. core/prompt/variables.py +339 -0
  48. core/rag/__init__.py +22 -0
  49. core/rag/__main__.py +136 -0
  50. core/rag/bm25_index.py +268 -0
  51. core/rag/chunker.py +273 -0
  52. core/rag/embedder.py +151 -0
  53. core/rag/indexer.py +292 -0
  54. core/rag/loader.py +89 -0
  55. core/rag/retriever.py +82 -0
  56. core/skeleton/__init__.py +11 -0
  57. core/skeleton/__main__.py +934 -0
  58. core/skeleton/anchor_fix.py +250 -0
  59. core/skeleton/classify.py +331 -0
  60. core/skeleton/cmd_anchor_fix.py +43 -0
  61. core/skeleton/cmd_diff_doc.py +44 -0
  62. core/skeleton/cmd_lock.py +87 -0
  63. core/skeleton/cmd_merge_delta.py +41 -0
  64. core/skeleton/community.py +233 -0
  65. core/skeleton/dependency_graph.py +306 -0
  66. core/skeleton/diff_doc.py +248 -0
  67. core/skeleton/dispatch.py +273 -0
  68. core/skeleton/dispatch_render.py +319 -0
  69. core/skeleton/dispatch_source.py +111 -0
  70. core/skeleton/extract.py +218 -0
  71. core/skeleton/extract_methods.py +298 -0
  72. core/skeleton/file_list.py +239 -0
  73. core/skeleton/impact.py +278 -0
  74. core/skeleton/jar_download.py +177 -0
  75. core/skeleton/jar_resolver.py +186 -0
  76. core/skeleton/loader.py +162 -0
  77. core/skeleton/merge.py +278 -0
  78. core/skeleton/merge_delta.py +229 -0
  79. core/skeleton/metadata.py +96 -0
  80. core/skeleton/metadata_builders.py +264 -0
  81. core/skeleton/module_dag.py +330 -0
  82. core/skeleton/parsers/__init__.py +71 -0
  83. core/skeleton/parsers/jqassistant.py +300 -0
  84. core/skeleton/parsers/jqassistant_cypher.py +225 -0
  85. core/skeleton/parsers/regex.py +171 -0
  86. core/skeleton/parsers/treesitter.py +324 -0
  87. core/skeleton/parsers/treesitter_java.py +284 -0
  88. core/skeleton/parsers/treesitter_multi.py +289 -0
  89. core/skeleton/pom_parser.py +299 -0
  90. core/skeleton/post_merge.py +295 -0
  91. core/skeleton/post_merge_llm.py +82 -0
  92. core/skeleton/query.py +195 -0
  93. core/skeleton/shard_context.py +177 -0
  94. core/skeleton/split.py +180 -0
  95. core/skeleton/split_cache.py +107 -0
  96. core/skeleton/split_feedback.py +174 -0
  97. core/skeleton/split_plan.py +219 -0
  98. core/skeleton/split_plan_helpers.py +305 -0
  99. core/skeleton/split_plan_llm.py +274 -0
  100. core/utils.py +135 -0
  101. core/validators/__init__.py +65 -0
  102. core/validators/__main__.py +215 -0
  103. core/validators/consistency.py +203 -0
  104. core/validators/coverage.py +171 -0
  105. core/validators/duplicates.py +76 -0
  106. core/validators/engine.py +224 -0
  107. core/validators/links.py +76 -0
  108. core/validators/sampling.py +169 -0
  109. core/validators/structure.py +144 -0
  110. engine/__init__.py +7 -0
  111. engine/assembler.py +231 -0
  112. engine/confirm.py +65 -0
  113. engine/dedup.py +106 -0
  114. engine/main.py +211 -0
  115. engine/pipeline/__init__.py +163 -0
  116. engine/pipeline/recovery.py +250 -0
  117. engine/pipeline/steps/__init__.py +23 -0
  118. engine/pipeline/steps/audit.py +220 -0
  119. engine/pipeline/steps/audit_apply.py +195 -0
  120. engine/pipeline/steps/audit_helpers.py +155 -0
  121. engine/pipeline/steps/classify_llm.py +236 -0
  122. engine/pipeline/steps/classify_prompt.py +223 -0
  123. engine/pipeline/steps/finalize.py +160 -0
  124. engine/pipeline/steps/generate.py +169 -0
  125. engine/pipeline/steps/generate_batch.py +197 -0
  126. engine/pipeline/steps/generate_recovery.py +170 -0
  127. engine/pipeline/steps/llm_plan_split.py +253 -0
  128. engine/pipeline/steps/lock.py +64 -0
  129. engine/pipeline/steps/preflight.py +237 -0
  130. engine/pipeline/steps/preflight_adjust.py +147 -0
  131. engine/pipeline/steps/pregenerate.py +130 -0
  132. engine/pipeline/steps/quality.py +81 -0
  133. engine/pipeline/steps/skeleton.py +149 -0
  134. engine/pipeline/steps/source.py +163 -0
  135. engine/pipeline/steps/sync.py +117 -0
  136. engine/pipeline/steps/sync_finalize.py +237 -0
  137. engine/pipeline/steps/sync_update.py +341 -0
  138. engine/pipelines.py +91 -0
  139. engine/runner.py +335 -0
  140. engine/strategies/__init__.py +86 -0
  141. engine/strategies/api.py +128 -0
  142. engine/strategies/delegated.py +50 -0
  143. engine/strategies/dryrun.py +25 -0
  144. engine/two_phase.py +143 -0
  145. mcp_server/__init__.py +73 -0
  146. mcp_server/__main__.py +5 -0
  147. mcp_server/tools/__init__.py +1 -0
  148. mcp_server/tools/config.py +63 -0
  149. mcp_server/tools/discovery.py +276 -0
  150. mcp_server/tools/generation.py +184 -0
  151. mcp_server/tools/planning.py +144 -0
  152. mcp_server/tools/source.py +175 -0
  153. mcp_server/tools/validation.py +140 -0
  154. mcp_server/tools/workflow.py +166 -0
  155. mcp_server/workflow_loader.py +204 -0
  156. presets/generic/audit_dimensions.md +132 -0
  157. presets/generic/doc_types.yaml +152 -0
  158. presets/generic/preset.yaml +115 -0
  159. presets/java-spring/audit_dimensions.md +228 -0
  160. presets/java-spring/audit_dimensions.yaml +203 -0
  161. presets/java-spring/doc_types.yaml +269 -0
  162. presets/java-spring/hooks.py +122 -0
  163. presets/java-spring/preset.yaml +341 -0
  164. presets/java-spring/templates/README.md +34 -0
  165. presets/java-spring/templates/audit-system.md +15 -0
  166. presets/java-spring/templates/subagent-aop.md +105 -0
  167. presets/java-spring/templates/subagent-api.md +63 -0
  168. presets/java-spring/templates/subagent-architecture.md +111 -0
  169. presets/java-spring/templates/subagent-async-events.md +107 -0
  170. presets/java-spring/templates/subagent-audit-api-contracts.md +40 -0
  171. presets/java-spring/templates/subagent-audit-architecture.md +38 -0
  172. presets/java-spring/templates/subagent-audit-business.md +40 -0
  173. presets/java-spring/templates/subagent-audit-data-models.md +40 -0
  174. presets/java-spring/templates/subagent-business.md +129 -0
  175. presets/java-spring/templates/subagent-caching.md +75 -0
  176. presets/java-spring/templates/subagent-database-access.md +114 -0
  177. presets/java-spring/templates/subagent-enum.md +75 -0
  178. presets/java-spring/templates/subagent-error-handling.md +91 -0
  179. presets/java-spring/templates/subagent-external-integrations.md +80 -0
  180. presets/java-spring/templates/subagent-index.md +122 -0
  181. presets/java-spring/templates/subagent-messaging.md +97 -0
  182. presets/java-spring/templates/subagent-model.md +88 -0
  183. presets/java-spring/templates/subagent-observability.md +91 -0
  184. presets/java-spring/templates/subagent-scheduled.md +81 -0
  185. presets/java-spring/templates/subagent-security.md +102 -0
  186. presets/java-spring/templates/subagent-structure.md +101 -0
  187. presets/java-spring/templates/subagent-sync-section.md +34 -0
  188. presets/java-spring/templates/subagent-utils.md +73 -0
  189. presets/java-spring/templates/sync-system.md +8 -0
  190. presets/java-spring/workflow-extensions.md +112 -0
  191. skills/__init__.py +1 -0
  192. skills/_shared/README.md +30 -0
  193. skills/_shared/doc-coverage-shared.md +134 -0
  194. skills/_shared/doc-quality-standard.md +1058 -0
  195. skills/_shared/doc-subagent-rules.md +762 -0
  196. skills/_shared/windows-compat.md +89 -0
  197. skills/kb-audit/SKILL.md +52 -0
  198. skills/kb-audit/rules.md +88 -0
  199. skills/kb-audit/steps/step-01-prepare.md +75 -0
  200. skills/kb-audit/steps/step-02-audit.md +96 -0
  201. skills/kb-audit/steps/step-03-verify.md +65 -0
  202. skills/kb-audit/steps/step-04-report.md +64 -0
  203. skills/kb-init/SKILL.md +146 -0
  204. skills/kb-init/rules.md +187 -0
  205. skills/kb-init/steps/step-01-scope.md +62 -0
  206. skills/kb-init/steps/step-02-source.md +410 -0
  207. skills/kb-init/steps/step-03-generate.md +307 -0
  208. skills/kb-init/steps/step-04-quality.md +92 -0
  209. skills/kb-init/steps/step-05-finalize.md +132 -0
  210. skills/kb-init/templates/core/execution-modes.md +29 -0
  211. skills/kb-init/templates/core/output-only.md +4 -0
  212. skills/kb-init/templates/core/readwrite.md +33 -0
  213. skills/kb-search/SKILL.md +138 -0
  214. skills/kb-search/rules.md +64 -0
  215. skills/kb-sync/SKILL.md +43 -0
  216. skills/kb-sync/rules.md +70 -0
  217. skills/kb-sync/scripts/rebuild_module.py +91 -0
  218. skills/kb-sync/scripts/scan_repos.py +687 -0
  219. skills/kb-sync/steps/step-01-detect.md +72 -0
  220. skills/kb-sync/steps/step-02-update.md +71 -0
  221. skills/kb-sync/steps/step-03-verify.md +47 -0
  222. skills/kb-sync/steps/step-04-finalize.md +52 -0
  223. source_kb-0.2.2.dist-info/METADATA +194 -0
  224. source_kb-0.2.2.dist-info/RECORD +228 -0
  225. source_kb-0.2.2.dist-info/WHEEL +5 -0
  226. source_kb-0.2.2.dist-info/entry_points.txt +3 -0
  227. source_kb-0.2.2.dist-info/licenses/LICENSE +21 -0
  228. source_kb-0.2.2.dist-info/top_level.txt +6 -0
@@ -0,0 +1,174 @@
1
+ """Split feedback — record and query split execution results for adaptive tuning.
2
+
3
+ Records each split execution's quality metrics and provides historical best
4
+ resolution parameter for future splits. Lightweight JSON persistence.
5
+
6
+ Usage:
7
+ from core.skeleton.split_feedback import SplitRecord, record_split_result, get_best_resolution
8
+
9
+ record_split_result(module_dir, record)
10
+ best = get_best_resolution(module_dir, "business-logic")
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import json
16
+ import logging
17
+ from dataclasses import asdict, dataclass
18
+ from datetime import datetime, timezone
19
+ from pathlib import Path
20
+
21
+ logger = logging.getLogger(__name__)  # module-level logger named after this module
22
+
23
+ FEEDBACK_FILENAME = "split-feedback.json"  # feedback file name; the functions below read/write it under <module_dir>/.meta/
24
+ MAX_RECORDS = 20  # record_split_result keeps only the newest MAX_RECORDS entries
25
+
26
+
27
+ # ---------------------------------------------------------------------------
28
+ # Data classes
29
+ # ---------------------------------------------------------------------------
30
+
31
+
32
@dataclass
class SplitRecord:
    """Metrics captured for one split execution.

    ``doc_type`` and ``strategy`` are required; every other field has a
    default. When ``timestamp`` is left empty it is filled with the current
    UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``.
    """

    doc_type: str
    strategy: str  # "community" | "package" | "simple" | "single"
    resolution: float = 1.0
    n_splits: int = 1
    coverage_score: float = 0.0
    quality_score: float = 0.0
    issues_count: int = 0
    merge_duplicates: int = 0
    timestamp: str = ""

    def __post_init__(self) -> None:
        # Keep a caller-supplied timestamp; only stamp records that lack one.
        if self.timestamp:
            return
        now_utc = datetime.now(timezone.utc)
        self.timestamp = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")

    @property
    def composite_score(self) -> float:
        """Return the blended score, clamped to [0, 1] and rounded to 3 places.

        Coverage and quality each contribute 40%. The remaining 20% is two
        0.1 allowances, each reduced by 0.02 per issue / per merge duplicate
        and never dropping below zero.
        """
        weighted = self.coverage_score * 0.4 + self.quality_score * 0.4
        issue_allowance = 0.1 - min(self.issues_count * 0.02, 0.1)
        duplicate_allowance = 0.1 - min(self.merge_duplicates * 0.02, 0.1)
        raw = weighted + issue_allowance + duplicate_allowance
        clamped = max(0.0, min(1.0, raw))
        return round(clamped, 3)
65
+
66
+
67
+ # ---------------------------------------------------------------------------
68
+ # Public API
69
+ # ---------------------------------------------------------------------------
70
+
71
+
72
def record_split_result(module_dir: Path, record: SplitRecord) -> None:
    """Append a split execution record to ``.meta/split-feedback.json``.

    The existing history is loaded, the new record (plus its computed
    ``composite_score``) is appended, and only the newest ``MAX_RECORDS``
    entries are written back.

    Persistence is best-effort: any ``OSError`` raised while creating the
    ``.meta`` directory or writing the file is logged as a warning and
    swallowed, so a feedback failure never aborts the caller.

    Args:
        module_dir: Module documentation directory
        record: SplitRecord with execution metrics
    """
    meta_dir = module_dir / ".meta"
    feedback_path = meta_dir / FEEDBACK_FILENAME

    records = _load_records(feedback_path)

    # Serialize the record; composite_score is a property, so asdict() omits it.
    entry = asdict(record)
    entry["composite_score"] = record.composite_score
    records.append(entry)

    # Keep only recent records
    records = records[-MAX_RECORDS:]

    try:
        # Directory creation lives inside the try so that an unwritable or
        # conflicting ".meta" path is handled like any other write failure.
        meta_dir.mkdir(parents=True, exist_ok=True)
        feedback_path.write_text(
            json.dumps(records, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
    except OSError as e:
        logger.warning("Failed to write split feedback: %s", e)
100
+
101
+
102
def get_best_resolution(
    module_dir: Path,
    doc_type: str,
    default_resolution: float = 1.0,
) -> dict:
    """Suggest a resolution parameter from the recorded split history.

    The most recent record stored for ``doc_type`` is reused as-is; no
    scoring or ranking of older records takes place. Note that this function
    does not itself check whether that run was successful.

    Args:
        module_dir: Module documentation directory
        doc_type: Document type whose records are considered
        default_resolution: Value returned when there is no matching record,
            or when the newest matching record has no "resolution" key

    Returns:
        {"resolution": float, "records": int, "recommendation": str}
    """
    history = _load_records(module_dir / ".meta" / FEEDBACK_FILENAME)
    matching = [item for item in history if item.get("doc_type") == doc_type]

    if matching:
        newest = matching[-1]
        return {
            "resolution": newest.get("resolution", default_resolution),
            "records": len(matching),
            "recommendation": "use-last-successful",
        }

    return {
        "resolution": default_resolution,
        "records": 0,
        "recommendation": "no-history",
    }
139
+
140
+
141
def get_split_history(
    module_dir: Path,
    doc_type: str | None = None,
) -> list[dict]:
    """Return the stored split records, for debugging and analysis.

    Args:
        module_dir: Module documentation directory
        doc_type: When given (and non-empty), only records with this
            doc_type are returned

    Returns:
        List of historical records in stored order.
    """
    history = _load_records(module_dir / ".meta" / FEEDBACK_FILENAME)
    if not doc_type:
        return history
    return [entry for entry in history if entry.get("doc_type") == doc_type]
159
+
160
+
161
+ # ---------------------------------------------------------------------------
162
+ # Internal
163
+ # ---------------------------------------------------------------------------
164
+
165
+
166
def _load_records(path: Path) -> list[dict]:
    """Load the feedback records stored at ``path``.

    Any problem reading or decoding the file (missing file, I/O error,
    invalid UTF-8, malformed JSON) yields an empty list, as does a JSON
    document whose top level is not a list. List items that are not JSON
    objects are dropped so callers can rely on every record being a dict.

    Args:
        path: Location of the feedback JSON file

    Returns:
        The dict records found in the file, in stored order.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError from a corrupt file.
        return []
    if not isinstance(data, list):
        return []
    return [item for item in data if isinstance(item, dict)]
@@ -0,0 +1,219 @@
1
+ """Split planning — orchestrate split strategy selection.
2
+
3
+ Implements the priority chain for split planning:
4
+ 1. Cache hit (skeleton hash unchanged) — instant
5
+ 2. Community detection (dependency graph, zero LLM cost)
6
+ 3. LLM-assisted grouping (business-domain, optional)
7
+ 4. Package-based grouping (code rules, zero LLM cost)
8
+ 5. Simple split (equal file count, last resort)
9
+
10
+ All thresholds loaded from SplitConfig (yaml-driven), no hardcoded numbers.
11
+
12
+ Usage:
13
+ from core.skeleton.split_plan import plan_splits
14
+
15
+ plan = plan_splits(entries, file_list, split_config, dep_graph)
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import logging
21
+ from pathlib import Path
22
+ from typing import Any
23
+
24
+ from core.skeleton.split import SplitConfig, SplitPlan
25
+ from core.skeleton.split_plan_helpers import (
26
+ make_split, derive_name, balanced_split,
27
+ )
28
+
29
+ logger = logging.getLogger(__name__)  # module-level logger; plan_splits and the cache helpers below report through it
30
+
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Public API
34
+ # ---------------------------------------------------------------------------
35
+
36
+
37
def plan_splits(
    entries: list[dict[str, Any]],
    file_list: list[dict[str, Any]],
    split_config: SplitConfig,
    dep_graph: Any | None = None,
    doc_type: str = "business-logic",
    module_dir: Path | None = None,
    *,
    llm_strategy: Any | None = None,
    is_delegated: bool = False,
) -> SplitPlan:
    """Choose a split plan for ``file_list`` by walking the strategy chain.

    An empty file list yields an "empty" plan, and a file list that already
    fits both the file-count and line limits yields a "single" plan with one
    split named "all". Otherwise the strategies are tried in order and the
    first one that produces a plan wins:
    Cache -> Community -> LLM -> Package -> Simple.

    Every freshly computed plan is handed to the cache helper, except an LLM
    plan whose strategy is "agent-pending".

    Args:
        entries: Skeleton entries, used to build a dependency graph when
            ``dep_graph`` is not supplied.
        file_list: File descriptors; the "lines" key is summed for sizing.
        split_config: Provides the line/file limits for ``doc_type``.
        dep_graph: Optional pre-built dependency graph.
        doc_type: Document type being planned.
        module_dir: Module directory used for cache lookup/storage; the
            cache lookup is skipped when it is not given.
        llm_strategy: Optional LlmStrategy instance for LLM-assisted splitting.
            Injected by engine/ caller. None = skip LLM strategy.
        is_delegated: Whether running in delegated mode (Agent handles grouping).

    Returns:
        The selected SplitPlan.
    """
    if not file_list:
        return SplitPlan(splits=[], strategy="empty")

    line_cap = split_config.effective_max_lines(doc_type)
    file_cap = split_config.max_files_per_shard
    n_files = len(file_list)
    n_lines = sum(item.get("lines", 0) for item in file_list)

    fits_in_one = n_files <= file_cap and n_lines <= line_cap
    if fits_in_one:
        logger.debug("split not needed for doc_type=%s (files=%d, lines=%d)", doc_type, n_files, n_lines)
        whole = make_split("all", file_list)
        return SplitPlan(splits=[whole], strategy="single")

    # Strategy 1: reuse a cached plan (only possible with a module directory)
    if module_dir:
        from_cache = _try_cache(module_dir, doc_type)
        if from_cache:
            logger.info("split strategy=cache doc_type=%s splits=%d", doc_type, len(from_cache.splits))
            return from_cache

    # Strategy 2: community detection on the dependency graph
    by_community = _try_community(file_list, entries, split_config, dep_graph, doc_type)
    if by_community:
        logger.info("split strategy=community doc_type=%s splits=%d", doc_type, len(by_community.splits))
        _save_to_cache(module_dir, doc_type, by_community)
        return by_community
    logger.debug("community detection skipped for doc_type=%s", doc_type)

    # Strategy 3: LLM-assisted grouping (imported lazily, only when reached)
    from core.skeleton.split_plan_llm import try_llm_split
    by_llm = try_llm_split(
        file_list, split_config, doc_type, module_dir,
        strategy=llm_strategy, is_delegated=is_delegated,
    )
    if by_llm:
        logger.info("split strategy=llm doc_type=%s splits=%d", doc_type, len(by_llm.splits))
        # An "agent-pending" plan is returned but deliberately not cached.
        if by_llm.strategy != "agent-pending":
            _save_to_cache(module_dir, doc_type, by_llm)
        return by_llm
    logger.debug("llm split skipped for doc_type=%s", doc_type)

    # Strategy 4: package-based grouping
    from core.skeleton.split_plan_helpers import try_package
    by_package = try_package(file_list, split_config, doc_type)
    if by_package:
        logger.info("split strategy=package doc_type=%s splits=%d", doc_type, len(by_package.splits))
        _save_to_cache(module_dir, doc_type, by_package)
        return by_package
    logger.debug("package split skipped for doc_type=%s", doc_type)

    # Strategy 5: simple split (last resort)
    logger.info("split strategy=simple (last resort) doc_type=%s files=%d", doc_type, n_files)
    fallback = _simple_split(file_list, split_config, doc_type)
    _save_to_cache(module_dir, doc_type, fallback)
    return fallback
112
+
113
+
114
+ # ---------------------------------------------------------------------------
115
+ # Strategy 1: Cache
116
+ # ---------------------------------------------------------------------------
117
+
118
+
119
def _try_cache(module_dir: Path, doc_type: str) -> SplitPlan | None:
    """Return the cached plan for ``doc_type`` as a "cache" SplitPlan, if any.

    The lookup is keyed by the current skeleton hash of ``module_dir``. A
    missing entry, an entry without a "splits" key, or any exception raised
    during the lookup (logged at debug level) results in ``None``.
    """
    try:
        from core.skeleton.split_cache import compute_skeleton_hash, get_cached_plan

        skeleton_hash = compute_skeleton_hash(module_dir)
        payload = get_cached_plan(module_dir, doc_type, skeleton_hash)
        if not payload or "splits" not in payload:
            return None
        return SplitPlan(splits=payload["splits"], strategy="cache")
    except Exception as exc:
        logger.debug("Split cache load failed for %s: %s", doc_type, exc)
        return None
133
+
134
+
135
def _save_to_cache(module_dir: Path | None, doc_type: str, plan: SplitPlan) -> None:
    """Store ``plan`` in the split cache so a later run can reuse it.

    Nothing is stored when there is no module directory or the plan has no
    splits. Any exception raised while saving is logged at debug level and
    otherwise ignored.
    """
    if not (module_dir and plan.splits):
        return
    try:
        from core.skeleton.split_cache import compute_skeleton_hash, save_plan_cache

        skeleton_hash = compute_skeleton_hash(module_dir)
        payload = {"splits": plan.splits}
        save_plan_cache(
            module_dir,
            skeleton_hash,
            doc_type,
            payload,
            num_splits=len(plan.splits),
        )
    except Exception as exc:
        logger.debug("Failed to save split cache for %s: %s", doc_type, exc)
149
+
150
+
151
+ # ---------------------------------------------------------------------------
152
+ # Strategy 2: Community detection
153
+ # ---------------------------------------------------------------------------
154
+
155
+
156
def _try_community(
    file_list: list[dict],
    entries: list[dict],
    split_config: SplitConfig,
    dep_graph: Any | None,
    doc_type: str,
) -> SplitPlan | None:
    """Attempt a split based on communities in the dependency graph.

    Returns ``None`` whenever the strategy does not apply: the graph or
    community modules cannot be imported, no usable graph is available, the
    detection yields fewer than two groups, or it yields more groups than
    ``max(6, total_files // max_files + 2)``.

    When ``dep_graph`` is ``None`` a graph is built from the entries whose
    file basename matches a basename in ``file_list``.
    """
    try:
        from core.skeleton.dependency_graph import DependencyGraph, build_dependency_graph
        from core.skeleton.community import split_by_community
    except ImportError:
        return None

    graph = dep_graph
    if graph is None:
        if not entries:
            return None
        # Match entries to files by basename only.
        wanted = {Path(item.get("name", "")).name for item in file_list}
        matching = [entry for entry in entries if Path(entry.get("file", "")).name in wanted]
        if not matching:
            return None
        graph = build_dependency_graph(matching)

    usable = isinstance(graph, DependencyGraph) and graph.adjacency
    if not usable:
        return None

    line_cap = split_config.effective_max_lines(doc_type)
    file_cap = split_config.max_files_per_shard

    communities = split_by_community(
        file_list,
        graph,
        max_files_per_shard=file_cap,
        max_lines_per_shard=line_cap,
    )
    if not communities or len(communities) < 2:
        return None

    # Reject a result that fragments the files into too many groups.
    ceiling = max(6, len(file_list) // file_cap + 2)
    if len(communities) > ceiling:
        return None

    noise = split_config.noise_words
    splits = []
    for community in communities:
        label = derive_name(community, noise_words=noise)
        splits.append(make_split(label, community))
    return SplitPlan(splits=splits, strategy="community")
201
+
202
+
203
+ # ---------------------------------------------------------------------------
204
+ # Strategy 5: Simple split
205
+ # ---------------------------------------------------------------------------
206
+
207
+
208
def _simple_split(
    file_list: list[dict],
    split_config: SplitConfig,
    doc_type: str,
) -> SplitPlan:
    """Last-resort strategy: balanced chunks named ``group-1``, ``group-2``, ...

    The chunks come from ``balanced_split`` using the line limit for
    ``doc_type`` and the configured files-per-shard limit.
    """
    line_cap = split_config.effective_max_lines(doc_type)
    file_cap = split_config.max_files_per_shard

    splits = []
    for number, chunk in enumerate(balanced_split(file_list, line_cap, file_cap), start=1):
        splits.append(make_split(f"group-{number}", chunk))
    return SplitPlan(splits=splits, strategy="simple")
@@ -0,0 +1,305 @@
1
+ """Split plan helpers — shared utilities for split strategies.
2
+
3
+ Includes:
4
+ - Package-based grouping strategy
5
+ - Name derivation (semantic business-domain naming)
6
+ - Balanced splitting (LPT greedy)
7
+ - Small-split merging
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import math
13
+ import os
14
+ import re
15
+ from collections import Counter, defaultdict
16
+ from typing import Any
17
+
18
+
19
def make_split(name: str, files: list[dict]) -> dict[str, Any]:
    """Build the split descriptor dict for a group of files.

    Args:
        name: Name given to the split.
        files: File descriptors. Each contributes its "rel_path" (falling
            back to "name", then "") to "files", its "lines" (default 0) to
            the line total, and its "package" to "packages".

    Returns:
        Dict with keys "name", "files", "file_count", "lines" and
        "packages". "packages" holds the distinct package values in sorted
        order, so the result is reproducible across runs; a missing or
        ``None`` package is reported as "".
    """
    return {
        "name": name,
        "files": [f.get("rel_path", f.get("name", "")) for f in files],
        "file_count": len(files),
        "lines": sum(f.get("lines", 0) for f in files),
        # Sorted rather than raw set order: set iteration order of strings
        # changes between interpreter runs (hash randomization).
        "packages": sorted({f.get("package") or "" for f in files}),
    }
27
+
28
+
29
def _basename_stem(raw_name: str | None) -> str:
    """Return ``raw_name`` without directories ('/' or '\\') and without its last extension."""
    return (raw_name or "").rsplit("/", 1)[-1].rsplit("\\", 1)[-1].rsplit(".", 1)[0]


def derive_name(
    files: list[dict],
    split_name_suffixes: tuple[str, ...] | None = None,
    noise_words: frozenset[str] | None = None,
) -> str:
    """Derive a semantic business-domain name from file list.

    Strategy priority:
    1. Dominant sub-package just below the common package prefix
       (e.g., service.group -> "group"), if it covers at least half the files
    2. Dominant business keywords from class names (up to two, joined by "-")
    3. Common prefix of the file stems (after removing a role suffix)
    4. Stem of the largest file, or "group" when nothing usable remains

    Args:
        files: File descriptors; reads "package", "name" and "lines".
        split_name_suffixes: Role suffixes stripped from stems in step 3.
            Defaults to a built-in list (ServiceImpl, Service, Handler, ...).
        noise_words: Lower-case words ignored as keywords in step 2.
            Defaults to a built-in set (service, impl, base, ...).

    Returns:
        The derived name; "group" for an empty file list.
    """
    if not files:
        return "group"

    suffixes = split_name_suffixes or (
        "ServiceImpl", "Service", "Handler", "Processor",
        "Manager", "Controller", "Listener", "Client", "Biz", "BizImpl",
    )

    # --- 1. dominant sub-package beyond the shared package prefix ---------
    packages = [f.get("package", "") for f in files if f.get("package")]
    if packages:
        parts_list = [p.split(".") for p in packages]
        min_len = min(len(p) for p in parts_list)
        common_depth = 0
        for i in range(min_len):
            if len({p[i] for p in parts_list}) != 1:
                break
            common_depth = i + 1
        generic_segments = ("impl", "service", "base", "controller", "config", "util", "common")
        diverging = Counter(
            parts[common_depth]
            for parts in parts_list
            if len(parts) > common_depth and parts[common_depth] not in generic_segments
        )
        if diverging:
            top_pkg, top_count = diverging.most_common(1)[0]
            if top_count >= len(files) * 0.5:
                return top_pkg

    # --- 2. dominant CamelCase keywords from the file names ---------------
    noise = noise_words or frozenset({
        "service", "impl", "base", "controller", "manager",
        "handler", "listener", "api", "java", "abstract", "default", "i",
    })
    keywords = Counter()
    for f in files:
        for w in re.findall(r'[A-Z][a-z]+', _basename_stem(f.get("name", ""))):
            w_lower = w.lower()
            if w_lower not in noise and len(w_lower) >= 3:
                keywords[w_lower] += 1

    if keywords:
        top_keywords = keywords.most_common(5)
        # A keyword qualifies when it occurs at least twice and in >= 20% of files.
        threshold = max(2, len(files) * 0.2)
        selected = []
        for k, c in top_keywords:
            if c >= threshold:
                selected.append(k)
            if len(selected) >= 2:
                break
        if selected:
            return "-".join(selected)
        # No keyword is frequent enough: settle for the most common one.
        return top_keywords[0][0]

    # --- 3. common prefix of the stems ------------------------------------
    # Directories are stripped here as well, so a shared path such as
    # "src/main/" can never be mistaken for a common name prefix.
    ordered_suffixes = sorted(suffixes, key=len, reverse=True)  # longest first
    stems: list[str] = []
    for f in files:
        name = _basename_stem(f.get("name", ""))
        for s in ordered_suffixes:
            if name.endswith(s) and len(name) > len(s):
                name = name[:-len(s)]
                break
        stems.append(name)
    prefix = os.path.commonprefix(stems)
    if len(prefix) >= 4:
        return re.sub(r'([a-z])([A-Z])', r'\1-\2', prefix).lower().strip("-")

    # --- 4. stem of the largest file --------------------------------------
    largest = max(files, key=lambda f: f.get("lines", 0))
    # An empty stem would produce an unnamed split, so fall back to "group".
    return _basename_stem(largest.get("name", "")).lower()[:30] or "group"
113
+
114
+
115
def derive_name_with_context(
    files: list[dict],
    pkg_label: str,
    chunk_idx: int,
    total_chunks: int,
    noise_words: frozenset[str] | None = None,
) -> str:
    """Name one chunk of a package group that had to be split further.

    The name is ``pkg_label`` followed by up to two of the most frequent
    CamelCase words found in the chunk's file names. Noise words, the package
    label itself and words shorter than three letters are ignored. When no
    word qualifies the name is ``"<pkg_label>-<chunk_idx>"``.

    Args:
        files: File descriptors of the chunk; only "name" is read.
        pkg_label: Label of the enclosing package group.
        chunk_idx: Number of this chunk, used for the fallback name.
        total_chunks: Accepted but not used by the current implementation.
        noise_words: Words to ignore; defaults to a built-in set.
    """
    default_noise = {
        "service", "impl", "base", "controller", "manager",
        "handler", "listener", "api", "java", "abstract", "default", "i",
    }
    ignored = set(noise_words or default_noise) | {pkg_label.lower()}

    def stem_of(entry: dict) -> str:
        # Drop directories (either separator style) and the last extension.
        return entry.get("name", "").rsplit("/", 1)[-1].rsplit("\\", 1)[-1].rsplit(".", 1)[0]

    tally = Counter(
        token
        for entry in files
        for token in map(str.lower, re.findall(r'[A-Z][a-z]+', stem_of(entry)))
        if token not in ignored and len(token) >= 3
    )

    if not tally:
        return f"{pkg_label}-{chunk_idx}"

    leaders = [word for word, _count in tally.most_common(2)]
    return f"{pkg_label}-" + "-".join(leaders)
144
+
145
+
146
def rebalance_groups(groups: list[list[dict]], max_files: int) -> None:
    """Split oversized groups in place so none holds more than ``max_files``.

    An oversized group keeps its first ``max_files`` items at its original
    position; the remainder is appended to the end of ``groups`` in chunks of
    at most ``max_files`` items.

    Args:
        groups: Groups to rebalance; modified in place.
        max_files: Maximum number of items per group, at least 1.

    Raises:
        ValueError: If ``max_files`` is smaller than 1. Such a limit can
            never be satisfied, and chunking by it would never terminate.
    """
    if max_files < 1:
        raise ValueError(f"max_files must be >= 1, got {max_files}")

    # Only the groups present on entry can be oversized; every chunk appended
    # below already respects the limit.
    for idx in range(len(groups)):
        group = groups[idx]
        if len(group) <= max_files:
            continue
        groups[idx] = group[:max_files]
        groups.extend(
            group[start:start + max_files]
            for start in range(max_files, len(group), max_files)
        )
158
+
159
+
160
def deduplicate_names(splits: list[dict]) -> None:
    """Make all split names unique, in place.

    Every split whose name occurs more than once is renamed to
    ``"<name>-<n>"`` with ``n`` counting up from 1 in list order. A number is
    skipped when the resulting name is already used by another split, so the
    renaming itself can never create a new duplicate. Splits whose name is
    already unique are left untouched.

    Args:
        splits: Split dicts with a "name" key; modified in place.
    """
    name_counts = Counter(s["name"] for s in splits)
    dupes = {name for name, count in name_counts.items() if count > 1}
    if not dupes:
        return

    # Names that stay as they are and therefore must not be reused.
    taken = set(name_counts) - dupes
    counters: dict[str, int] = {}
    for s in splits:
        base = s["name"]
        if base not in dupes:
            continue
        idx = counters.get(base, 0) + 1
        while f"{base}-{idx}" in taken:
            idx += 1
        counters[base] = idx
        unique_name = f"{base}-{idx}"
        taken.add(unique_name)
        s["name"] = unique_name
172
+
173
+
174
def group_by_package(files: list[dict], depth: int = 3) -> dict[str, list[dict]]:
    """Group files by the first ``depth`` segments of their package.

    A package with fewer than ``depth`` segments is used as the key
    unchanged. Files without a package (key missing, ``None`` or empty
    string) are all collected under the key "root".

    Args:
        files: File descriptors; only "package" is read.
        depth: Number of leading package segments that form the group key.

    Returns:
        Mapping of group key to the files in that group, with keys and files
        in first-seen order.
    """
    groups: dict[str, list[dict]] = defaultdict(list)
    for f in files:
        # Treat every "no package" spelling alike so such files share one
        # named group instead of ending up under an empty key.
        package = f.get("package") or "root"
        parts = package.split(".")
        key = ".".join(parts[:depth]) if len(parts) >= depth else package
        groups[key].append(f)
    return dict(groups)
181
+
182
+
183
def balanced_split(files: list[dict], max_lines: int, max_files: int) -> list[list[dict]]:
    """Distribute files over buckets with a greedy largest-first (LPT) pass.

    The bucket count is the larger of ``ceil(len(files) / max_files)`` and
    ``ceil(total_lines / max_lines)``, but never less than 2. Files are
    placed largest first into the bucket with the fewest lines that still has
    room for another file, so no bucket ends up with more than ``max_files``
    files. The line total of a bucket is balanced but not strictly capped.

    Args:
        files: File descriptors; "lines" (default 0) is the size measure.
        max_lines: Target line budget per bucket. A value <= 0 is treated as
            "no line limit".
        max_files: Maximum number of files per bucket. A value <= 0 is
            treated as "no file limit".

    Returns:
        The non-empty buckets, in bucket order.
    """
    total_lines = sum(f.get("lines", 0) for f in files)
    needed_for_count = math.ceil(len(files) / max_files) if max_files > 0 else 0
    needed_for_lines = math.ceil(total_lines / max_lines) if max_lines > 0 else 0
    n = max(needed_for_count, needed_for_lines, 2)

    buckets: list[list[dict]] = [[] for _ in range(n)]
    bucket_lines = [0] * n
    for f in sorted(files, key=lambda x: -x.get("lines", 0)):
        # n >= ceil(len(files) / max_files) guarantees a bucket with room.
        candidates = [
            i for i in range(n)
            if max_files <= 0 or len(buckets[i]) < max_files
        ]
        lightest = min(candidates, key=lambda i: bucket_lines[i])
        buckets[lightest].append(f)
        bucket_lines[lightest] += f.get("lines", 0)
    return [b for b in buckets if b]
194
+
195
+
196
def merge_small(
    splits: list[dict],
    max_lines: int,
    max_files: int,
    merge_ratio: float = 0.25,
    file_list: list[dict] | None = None,
) -> list[dict]:
    """Merge small splits into neighbors when they're far below target size.

    A split is "small" when it is below both thresholds:
    ``max(200, max_lines * merge_ratio)`` lines and
    ``max(5, max_files * merge_ratio)`` files. Small splits are dissolved and
    their files are handed, one at a time, to whichever remaining split
    currently has the fewest lines, provided the line and file limits still
    hold. A file that does not fit becomes its own "misc" split. If every
    split is small, all their files form a single "misc" split.

    Args:
        splits: Split dicts (as built by ``make_split``). Splits that
            receive files are modified in place.
        max_lines: Line limit per split.
        max_files: File limit per split.
        merge_ratio: Fraction of the limits below which a split is small.
        file_list: Optional file descriptors used to look up the real line
            count of each file. Without a match, the split's average line
            count per file is used.

    Returns:
        The resulting list of splits.
    """
    small_line_threshold = max(200, int(max_lines * merge_ratio))
    small_file_threshold = max(5, int(max_files * merge_ratio))

    # Split "files" entries hold rel_path when the file has one and name
    # otherwise (see make_split), so line counts are indexed under both.
    lines_by_path: dict[str, int] = {}
    lines_by_name: dict[str, int] = {}
    for f in file_list or ():
        n_lines = f.get("lines", 0)
        rel_path = f.get("rel_path")
        if rel_path:
            lines_by_path[rel_path] = n_lines
        lines_by_name[f.get("name", "")] = n_lines

    large: list[dict] = []
    small_files: list[dict] = []
    for s in splits:
        if s["lines"] < small_line_threshold and s["file_count"] < small_file_threshold:
            avg_lines = s["lines"] // max(s["file_count"], 1)
            for fname in s["files"]:
                actual_lines = lines_by_path.get(fname, lines_by_name.get(fname, avg_lines))
                small_files.append({"name": fname, "lines": actual_lines, "package": ""})
        else:
            large.append(s)

    if small_files and large:
        for f in small_files:
            lightest = min(large, key=lambda x: x["lines"])
            if lightest["lines"] + f["lines"] <= max_lines and lightest["file_count"] + 1 <= max_files:
                lightest["files"].append(f["name"])
                lightest["file_count"] = len(lightest["files"])
                lightest["lines"] += f["lines"]
            else:
                # Does not fit anywhere light enough: keep it as its own split.
                large.append(make_split("misc", [f]))
    elif small_files:
        large.append(make_split("misc", small_files))
    return large
235
+
236
+
237
def package_label(pkg: str, depth: int) -> str:
    """Turn a dotted package path into a short label.

    Preference order:
    1. The non-generic segments after the first ``depth`` segments, joined
       with "-".
    2. The last segment anywhere in the path that is not generic and is
       longer than three characters.
    3. The final segment of the path.
    """
    generic = frozenset((
        "impl", "service", "base", "controller", "config", "util", "common", "api",
        "model", "entity", "manager", "provider", "client", "listener", "event",
        "promotion", "app", "example",
    ))
    segments = pkg.split(".")

    tail = [seg for seg in segments[depth:] if seg not in generic]
    if tail:
        return "-".join(tail)

    # Nothing useful below `depth`: search the whole path from the right.
    fallback = next(
        (seg for seg in reversed(segments) if seg not in generic and len(seg) > 3),
        None,
    )
    if fallback is not None:
        return fallback
    return segments[-1] if segments else "group"
252
+
253
+
254
def try_package(
    file_list: list[dict],
    split_config: Any,
    doc_type: str,
) -> Any | None:
    """Attempt a split that follows the package structure.

    Files are grouped by package prefix at the shallowest depth from 3 to 9
    that yields more than one group. A group exceeding the file or line limit
    is divided further with ``balanced_split`` (and noted in the plan's
    warnings); afterwards small splits are merged and duplicate names are made
    unique.

    Returns:
        A SplitPlan with strategy "package", or ``None`` when no depth gives
        more than one group or fewer than two splits remain after merging.
    """
    from core.skeleton.split import SplitPlan

    line_cap = split_config.effective_max_lines(doc_type)
    doc_overrides = split_config.per_doc_type_overrides.get(doc_type, {})
    file_cap = doc_overrides.get("max_files_per_shard", split_config.max_files_per_shard)

    # Find the first grouping depth that actually separates the files.
    for depth in range(3, 10):
        grouped = group_by_package(file_list, depth=depth)
        if len(grouped) > 1:
            break
    else:
        return None

    splits: list[dict] = []
    warnings: list[str] = []

    for pkg, members in grouped.items():
        label = package_label(pkg, depth)
        member_lines = sum(item.get("lines", 0) for item in members)
        within_limits = len(members) <= file_cap and member_lines <= line_cap
        if within_limits:
            splits.append(make_split(label, members))
            continue

        chunks = balanced_split(members, line_cap, file_cap)
        for number, chunk in enumerate(chunks, start=1):
            chunk_name = derive_name_with_context(
                chunk, label, number, len(chunks),
                noise_words=split_config.noise_words,
            )
            splits.append(make_split(chunk_name, chunk))
        warnings.append(f"Package {pkg} oversized, split into {len(chunks)} groups")

    splits = merge_small(
        splits, line_cap, file_cap,
        split_config.merge_threshold_ratio, file_list=file_list,
    )

    if len(splits) <= 1:
        return None

    deduplicate_names(splits)
    return SplitPlan(splits=splits, strategy="package", warnings=warnings)