source-kb 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +50 -0
- cli/__main__.py +5 -0
- cli/commands/__init__.py +1 -0
- cli/commands/anchor_fix.py +47 -0
- cli/commands/diff_doc.py +52 -0
- cli/commands/dispatch.py +77 -0
- cli/commands/extract.py +72 -0
- cli/commands/file_list.py +74 -0
- cli/commands/index.py +84 -0
- cli/commands/lock.py +89 -0
- cli/commands/merge.py +60 -0
- cli/commands/merge_delta.py +19 -0
- cli/commands/metadata.py +24 -0
- cli/commands/pipeline.py +45 -0
- cli/commands/post_merge.py +43 -0
- cli/commands/query.py +52 -0
- cli/commands/render.py +101 -0
- cli/commands/scan_repos.py +46 -0
- cli/commands/setup.py +94 -0
- cli/commands/split.py +196 -0
- cli/commands/stale_files.py +98 -0
- cli/commands/validate.py +191 -0
- core/__init__.py +32 -0
- core/config.py +261 -0
- core/docs/__init__.py +7 -0
- core/docs/section_updater.py +286 -0
- core/docs/shared.py +149 -0
- core/git.py +294 -0
- core/interfaces.py +249 -0
- core/monitor/__init__.py +5 -0
- core/monitor/progress.py +83 -0
- core/monitor/prompt_store.py +49 -0
- core/paths.py +141 -0
- core/preset.py +237 -0
- core/preset_accessors.py +202 -0
- core/preset_classify.py +132 -0
- core/preset_hooks.py +129 -0
- core/preset_profile.py +89 -0
- core/prompt/__init__.py +7 -0
- core/prompt/__main__.py +147 -0
- core/prompt/content.py +320 -0
- core/prompt/context_manager.py +164 -0
- core/prompt/renderer.py +236 -0
- core/prompt/response_parser.py +274 -0
- core/prompt/templates.py +357 -0
- core/prompt/validate_parity.py +162 -0
- core/prompt/variables.py +339 -0
- core/rag/__init__.py +22 -0
- core/rag/__main__.py +136 -0
- core/rag/bm25_index.py +268 -0
- core/rag/chunker.py +273 -0
- core/rag/embedder.py +151 -0
- core/rag/indexer.py +292 -0
- core/rag/loader.py +89 -0
- core/rag/retriever.py +82 -0
- core/skeleton/__init__.py +11 -0
- core/skeleton/__main__.py +934 -0
- core/skeleton/anchor_fix.py +250 -0
- core/skeleton/classify.py +331 -0
- core/skeleton/cmd_anchor_fix.py +43 -0
- core/skeleton/cmd_diff_doc.py +44 -0
- core/skeleton/cmd_lock.py +87 -0
- core/skeleton/cmd_merge_delta.py +41 -0
- core/skeleton/community.py +233 -0
- core/skeleton/dependency_graph.py +306 -0
- core/skeleton/diff_doc.py +248 -0
- core/skeleton/dispatch.py +273 -0
- core/skeleton/dispatch_render.py +319 -0
- core/skeleton/dispatch_source.py +111 -0
- core/skeleton/extract.py +218 -0
- core/skeleton/extract_methods.py +298 -0
- core/skeleton/file_list.py +239 -0
- core/skeleton/impact.py +278 -0
- core/skeleton/jar_download.py +177 -0
- core/skeleton/jar_resolver.py +186 -0
- core/skeleton/loader.py +162 -0
- core/skeleton/merge.py +278 -0
- core/skeleton/merge_delta.py +229 -0
- core/skeleton/metadata.py +96 -0
- core/skeleton/metadata_builders.py +264 -0
- core/skeleton/module_dag.py +330 -0
- core/skeleton/parsers/__init__.py +71 -0
- core/skeleton/parsers/jqassistant.py +300 -0
- core/skeleton/parsers/jqassistant_cypher.py +225 -0
- core/skeleton/parsers/regex.py +171 -0
- core/skeleton/parsers/treesitter.py +324 -0
- core/skeleton/parsers/treesitter_java.py +284 -0
- core/skeleton/parsers/treesitter_multi.py +289 -0
- core/skeleton/pom_parser.py +299 -0
- core/skeleton/post_merge.py +295 -0
- core/skeleton/post_merge_llm.py +82 -0
- core/skeleton/query.py +195 -0
- core/skeleton/shard_context.py +177 -0
- core/skeleton/split.py +180 -0
- core/skeleton/split_cache.py +107 -0
- core/skeleton/split_feedback.py +174 -0
- core/skeleton/split_plan.py +219 -0
- core/skeleton/split_plan_helpers.py +305 -0
- core/skeleton/split_plan_llm.py +274 -0
- core/utils.py +135 -0
- core/validators/__init__.py +65 -0
- core/validators/__main__.py +215 -0
- core/validators/consistency.py +203 -0
- core/validators/coverage.py +171 -0
- core/validators/duplicates.py +76 -0
- core/validators/engine.py +224 -0
- core/validators/links.py +76 -0
- core/validators/sampling.py +169 -0
- core/validators/structure.py +144 -0
- engine/__init__.py +7 -0
- engine/assembler.py +231 -0
- engine/confirm.py +65 -0
- engine/dedup.py +106 -0
- engine/main.py +211 -0
- engine/pipeline/__init__.py +163 -0
- engine/pipeline/recovery.py +250 -0
- engine/pipeline/steps/__init__.py +23 -0
- engine/pipeline/steps/audit.py +220 -0
- engine/pipeline/steps/audit_apply.py +195 -0
- engine/pipeline/steps/audit_helpers.py +155 -0
- engine/pipeline/steps/classify_llm.py +236 -0
- engine/pipeline/steps/classify_prompt.py +223 -0
- engine/pipeline/steps/finalize.py +160 -0
- engine/pipeline/steps/generate.py +169 -0
- engine/pipeline/steps/generate_batch.py +197 -0
- engine/pipeline/steps/generate_recovery.py +170 -0
- engine/pipeline/steps/llm_plan_split.py +253 -0
- engine/pipeline/steps/lock.py +64 -0
- engine/pipeline/steps/preflight.py +237 -0
- engine/pipeline/steps/preflight_adjust.py +147 -0
- engine/pipeline/steps/pregenerate.py +130 -0
- engine/pipeline/steps/quality.py +81 -0
- engine/pipeline/steps/skeleton.py +149 -0
- engine/pipeline/steps/source.py +163 -0
- engine/pipeline/steps/sync.py +117 -0
- engine/pipeline/steps/sync_finalize.py +237 -0
- engine/pipeline/steps/sync_update.py +341 -0
- engine/pipelines.py +91 -0
- engine/runner.py +335 -0
- engine/strategies/__init__.py +86 -0
- engine/strategies/api.py +128 -0
- engine/strategies/delegated.py +50 -0
- engine/strategies/dryrun.py +25 -0
- engine/two_phase.py +143 -0
- mcp_server/__init__.py +73 -0
- mcp_server/__main__.py +5 -0
- mcp_server/tools/__init__.py +1 -0
- mcp_server/tools/config.py +63 -0
- mcp_server/tools/discovery.py +276 -0
- mcp_server/tools/generation.py +184 -0
- mcp_server/tools/planning.py +144 -0
- mcp_server/tools/source.py +175 -0
- mcp_server/tools/validation.py +140 -0
- mcp_server/tools/workflow.py +166 -0
- mcp_server/workflow_loader.py +204 -0
- presets/generic/audit_dimensions.md +132 -0
- presets/generic/doc_types.yaml +152 -0
- presets/generic/preset.yaml +115 -0
- presets/java-spring/audit_dimensions.md +228 -0
- presets/java-spring/audit_dimensions.yaml +203 -0
- presets/java-spring/doc_types.yaml +269 -0
- presets/java-spring/hooks.py +122 -0
- presets/java-spring/preset.yaml +341 -0
- presets/java-spring/templates/README.md +34 -0
- presets/java-spring/templates/audit-system.md +15 -0
- presets/java-spring/templates/subagent-aop.md +105 -0
- presets/java-spring/templates/subagent-api.md +63 -0
- presets/java-spring/templates/subagent-architecture.md +111 -0
- presets/java-spring/templates/subagent-async-events.md +107 -0
- presets/java-spring/templates/subagent-audit-api-contracts.md +40 -0
- presets/java-spring/templates/subagent-audit-architecture.md +38 -0
- presets/java-spring/templates/subagent-audit-business.md +40 -0
- presets/java-spring/templates/subagent-audit-data-models.md +40 -0
- presets/java-spring/templates/subagent-business.md +129 -0
- presets/java-spring/templates/subagent-caching.md +75 -0
- presets/java-spring/templates/subagent-database-access.md +114 -0
- presets/java-spring/templates/subagent-enum.md +75 -0
- presets/java-spring/templates/subagent-error-handling.md +91 -0
- presets/java-spring/templates/subagent-external-integrations.md +80 -0
- presets/java-spring/templates/subagent-index.md +122 -0
- presets/java-spring/templates/subagent-messaging.md +97 -0
- presets/java-spring/templates/subagent-model.md +88 -0
- presets/java-spring/templates/subagent-observability.md +91 -0
- presets/java-spring/templates/subagent-scheduled.md +81 -0
- presets/java-spring/templates/subagent-security.md +102 -0
- presets/java-spring/templates/subagent-structure.md +101 -0
- presets/java-spring/templates/subagent-sync-section.md +34 -0
- presets/java-spring/templates/subagent-utils.md +73 -0
- presets/java-spring/templates/sync-system.md +8 -0
- presets/java-spring/workflow-extensions.md +112 -0
- skills/__init__.py +1 -0
- skills/_shared/README.md +30 -0
- skills/_shared/doc-coverage-shared.md +134 -0
- skills/_shared/doc-quality-standard.md +1058 -0
- skills/_shared/doc-subagent-rules.md +762 -0
- skills/_shared/windows-compat.md +89 -0
- skills/kb-audit/SKILL.md +52 -0
- skills/kb-audit/rules.md +88 -0
- skills/kb-audit/steps/step-01-prepare.md +75 -0
- skills/kb-audit/steps/step-02-audit.md +96 -0
- skills/kb-audit/steps/step-03-verify.md +65 -0
- skills/kb-audit/steps/step-04-report.md +64 -0
- skills/kb-init/SKILL.md +146 -0
- skills/kb-init/rules.md +187 -0
- skills/kb-init/steps/step-01-scope.md +62 -0
- skills/kb-init/steps/step-02-source.md +410 -0
- skills/kb-init/steps/step-03-generate.md +307 -0
- skills/kb-init/steps/step-04-quality.md +92 -0
- skills/kb-init/steps/step-05-finalize.md +132 -0
- skills/kb-init/templates/core/execution-modes.md +29 -0
- skills/kb-init/templates/core/output-only.md +4 -0
- skills/kb-init/templates/core/readwrite.md +33 -0
- skills/kb-search/SKILL.md +138 -0
- skills/kb-search/rules.md +64 -0
- skills/kb-sync/SKILL.md +43 -0
- skills/kb-sync/rules.md +70 -0
- skills/kb-sync/scripts/rebuild_module.py +91 -0
- skills/kb-sync/scripts/scan_repos.py +687 -0
- skills/kb-sync/steps/step-01-detect.md +72 -0
- skills/kb-sync/steps/step-02-update.md +71 -0
- skills/kb-sync/steps/step-03-verify.md +47 -0
- skills/kb-sync/steps/step-04-finalize.md +52 -0
- source_kb-0.2.2.dist-info/METADATA +194 -0
- source_kb-0.2.2.dist-info/RECORD +228 -0
- source_kb-0.2.2.dist-info/WHEEL +5 -0
- source_kb-0.2.2.dist-info/entry_points.txt +3 -0
- source_kb-0.2.2.dist-info/licenses/LICENSE +21 -0
- source_kb-0.2.2.dist-info/top_level.txt +6 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Split feedback — record and query split execution results for adaptive tuning.
|
|
2
|
+
|
|
3
|
+
Records each split execution's quality metrics and provides historical best
|
|
4
|
+
resolution parameter for future splits. Lightweight JSON persistence.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
from core.skeleton.split_feedback import SplitRecord, record_split_result, get_best_resolution
|
|
8
|
+
|
|
9
|
+
record_split_result(module_dir, record)
|
|
10
|
+
best = get_best_resolution(module_dir, "business-logic")
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
import logging
|
|
17
|
+
from dataclasses import asdict, dataclass
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
FEEDBACK_FILENAME = "split-feedback.json"
|
|
24
|
+
MAX_RECORDS = 20
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
# Data classes
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class SplitRecord:
    """Metrics captured for one split execution.

    ``timestamp`` is filled with the current UTC time (ISO-8601, ``Z``
    suffix) when it is left empty at construction.
    """

    doc_type: str
    strategy: str  # "community" | "package" | "simple" | "single"
    resolution: float = 1.0
    n_splits: int = 1
    coverage_score: float = 0.0
    quality_score: float = 0.0
    issues_count: int = 0
    merge_duplicates: int = 0
    timestamp: str = ""

    def __post_init__(self):
        # A caller-supplied timestamp is kept untouched.
        if self.timestamp:
            return
        now_utc = datetime.now(timezone.utc)
        self.timestamp = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")

    @property
    def composite_score(self) -> float:
        """Return the blended score, clamped to [0, 1] and rounded to 3 places.

        Coverage and quality each contribute 40%. The remaining 20% is two
        10% allowances that shrink by 0.02 per issue and per merge duplicate
        respectively, bottoming out at zero after five of either.
        """
        issue_allowance = 0.1 - min(self.issues_count * 0.02, 0.1)
        duplicate_allowance = 0.1 - min(self.merge_duplicates * 0.02, 0.1)
        weighted = self.coverage_score * 0.4 + self.quality_score * 0.4
        raw = weighted + issue_allowance + duplicate_allowance
        clamped = max(0.0, min(1.0, raw))
        return round(clamped, 3)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
# Public API
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def record_split_result(module_dir: Path, record: SplitRecord) -> None:
    """Append a split execution record to ``.meta/split-feedback.json``.

    The record is serialized together with its ``composite_score`` and only
    the newest ``MAX_RECORDS`` entries are kept. Persistence is best-effort:
    an ``OSError`` while creating the directory or writing the file is
    logged as a warning and not raised.

    Args:
        module_dir: Module documentation directory
        record: SplitRecord with execution metrics
    """
    meta_dir = module_dir / ".meta"
    feedback_path = meta_dir / FEEDBACK_FILENAME

    records = _load_records(feedback_path)

    # Serialize record
    entry = asdict(record)
    entry["composite_score"] = record.composite_score
    records.append(entry)

    # Keep only recent records
    records = records[-MAX_RECORDS:]

    try:
        # Directory creation is inside the guard so that a read-only or
        # permission-restricted location degrades to a warning as well.
        meta_dir.mkdir(parents=True, exist_ok=True)
        feedback_path.write_text(
            json.dumps(records, ensure_ascii=False, indent=2),
            encoding="utf-8",
        )
    except OSError as e:
        logger.warning("Failed to write split feedback: %s", e)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def get_best_resolution(
    module_dir: Path,
    doc_type: str,
    default_resolution: float = 1.0,
) -> dict:
    """Suggest a resolution parameter from recorded split history.

    The most recent record stored for ``doc_type`` wins; no scoring or
    weighting is applied. NOTE(review): the result is labelled
    "use-last-successful", but no success flag is inspected here — confirm
    that only successful runs are ever recorded.

    Args:
        module_dir: Module documentation directory
        doc_type: Document type to filter by
        default_resolution: Value returned when no matching record exists,
            or when the latest record lacks a "resolution" key

    Returns:
        {"resolution": float, "records": int, "recommendation": str}
    """
    history = _load_records(module_dir / ".meta" / FEEDBACK_FILENAME)
    matching = [rec for rec in history if rec.get("doc_type") == doc_type]

    if matching:
        latest = matching[-1]
        return {
            "resolution": latest.get("resolution", default_resolution),
            "records": len(matching),
            "recommendation": "use-last-successful",
        }

    return {
        "resolution": default_resolution,
        "records": 0,
        "recommendation": "no-history",
    }
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def get_split_history(
    module_dir: Path,
    doc_type: str | None = None,
) -> list[dict]:
    """Return stored split records, for debugging and analysis.

    Args:
        module_dir: Module documentation directory
        doc_type: When given (and non-empty), only records of this
            document type are returned

    Returns:
        List of historical records, in stored order.
    """
    history = _load_records(module_dir / ".meta" / FEEDBACK_FILENAME)
    if not doc_type:
        return history
    return [rec for rec in history if rec.get("doc_type") == doc_type]
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# ---------------------------------------------------------------------------
|
|
162
|
+
# Internal
|
|
163
|
+
# ---------------------------------------------------------------------------
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _load_records(path: Path) -> list[dict]:
    """Load feedback records from *path*, tolerating a missing or corrupt file.

    Args:
        path: Location of the JSON feedback file

    Returns:
        The dict entries of the stored JSON list. An empty list is returned
        when the file is missing, unreadable, not valid UTF-8 / JSON, or
        does not contain a list. Non-dict entries are dropped so callers
        can safely use ``record.get(...)``.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers
        # both json.JSONDecodeError and UnicodeDecodeError.
        return []
    if not isinstance(data, list):
        return []
    return [rec for rec in data if isinstance(rec, dict)]
|
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
"""Split planning — orchestrate split strategy selection.
|
|
2
|
+
|
|
3
|
+
Implements the priority chain for split planning:
|
|
4
|
+
1. Cache hit (skeleton hash unchanged) — instant
|
|
5
|
+
2. Community detection (dependency graph, zero LLM cost)
|
|
6
|
+
3. LLM-assisted grouping (business-domain, optional)
|
|
7
|
+
4. Package-based grouping (code rules, zero LLM cost)
|
|
8
|
+
5. Simple split (equal file count, last resort)
|
|
9
|
+
|
|
10
|
+
All thresholds loaded from SplitConfig (yaml-driven), no hardcoded numbers.
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
from core.skeleton.split_plan import plan_splits
|
|
14
|
+
|
|
15
|
+
plan = plan_splits(entries, file_list, split_config, dep_graph)
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import logging
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
from core.skeleton.split import SplitConfig, SplitPlan
|
|
25
|
+
from core.skeleton.split_plan_helpers import (
|
|
26
|
+
make_split, derive_name, balanced_split,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Public API
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def plan_splits(
    entries: list[dict[str, Any]],
    file_list: list[dict[str, Any]],
    split_config: SplitConfig,
    dep_graph: Any | None = None,
    doc_type: str = "business-logic",
    module_dir: Path | None = None,
    *,
    llm_strategy: Any | None = None,
    is_delegated: bool = False,
) -> SplitPlan:
    """Choose a split plan for ``file_list`` by walking the strategy chain.

    Order: Cache -> Community -> LLM -> Package -> Simple. An empty file
    list yields an "empty" plan, and a file list within the configured file
    and line limits yields a "single" plan holding every file. Plans from
    strategies other than the cache are saved to the cache before being
    returned, except an LLM plan whose strategy is "agent-pending".

    Args:
        entries: Skeleton entries, used to build a dependency graph when
            ``dep_graph`` is not supplied.
        file_list: File descriptors to distribute across shards.
        split_config: Source of the per-shard file and line limits.
        dep_graph: Optional prebuilt dependency graph.
        doc_type: Document type the plan is for.
        module_dir: Module documentation directory; caching is skipped
            when it is not given.
        llm_strategy: Optional LlmStrategy instance for LLM-assisted splitting.
            Injected by engine/ caller. None = skip LLM strategy.
        is_delegated: Whether running in delegated mode (Agent handles grouping).

    Returns:
        The first plan produced by the chain.
    """
    if not file_list:
        return SplitPlan(splits=[], strategy="empty")

    line_budget = split_config.effective_max_lines(doc_type)
    file_budget = split_config.max_files_per_shard
    total_files = len(file_list)
    total_lines = sum(f.get("lines", 0) for f in file_list)

    fits_one_shard = total_files <= file_budget and total_lines <= line_budget
    if fits_one_shard:
        logger.debug("split not needed for doc_type=%s (files=%d, lines=%d)", doc_type, total_files, total_lines)
        whole = make_split("all", file_list)
        return SplitPlan(splits=[whole], strategy="single")

    # Strategy 1: Cache hit
    cached_plan = _try_cache(module_dir, doc_type) if module_dir else None
    if cached_plan:
        logger.info("split strategy=cache doc_type=%s splits=%d", doc_type, len(cached_plan.splits))
        return cached_plan

    # Strategy 2: Community detection
    community_plan = _try_community(file_list, entries, split_config, dep_graph, doc_type)
    if community_plan:
        logger.info("split strategy=community doc_type=%s splits=%d", doc_type, len(community_plan.splits))
        _save_to_cache(module_dir, doc_type, community_plan)
        return community_plan
    logger.debug("community detection skipped for doc_type=%s", doc_type)

    # Strategy 3: LLM-assisted grouping
    from core.skeleton.split_plan_llm import try_llm_split
    llm_plan = try_llm_split(
        file_list, split_config, doc_type, module_dir,
        strategy=llm_strategy, is_delegated=is_delegated,
    )
    if llm_plan:
        logger.info("split strategy=llm doc_type=%s splits=%d", doc_type, len(llm_plan.splits))
        # A pending agent plan is returned to the caller but never cached.
        if llm_plan.strategy != "agent-pending":
            _save_to_cache(module_dir, doc_type, llm_plan)
        return llm_plan
    logger.debug("llm split skipped for doc_type=%s", doc_type)

    # Strategy 4: Package-based grouping
    from core.skeleton.split_plan_helpers import try_package
    package_plan = try_package(file_list, split_config, doc_type)
    if package_plan:
        logger.info("split strategy=package doc_type=%s splits=%d", doc_type, len(package_plan.splits))
        _save_to_cache(module_dir, doc_type, package_plan)
        return package_plan
    logger.debug("package split skipped for doc_type=%s", doc_type)

    # Strategy 5: Simple split (last resort)
    logger.info("split strategy=simple (last resort) doc_type=%s files=%d", doc_type, total_files)
    fallback_plan = _simple_split(file_list, split_config, doc_type)
    _save_to_cache(module_dir, doc_type, fallback_plan)
    return fallback_plan
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# ---------------------------------------------------------------------------
|
|
115
|
+
# Strategy 1: Cache
|
|
116
|
+
# ---------------------------------------------------------------------------
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def _try_cache(module_dir: Path, doc_type: str) -> SplitPlan | None:
    """Return the cached plan for ``doc_type`` if one matches the current hash.

    Any exception raised while importing or querying the cache is logged at
    debug level and treated as a cache miss.
    """
    try:
        from core.skeleton.split_cache import compute_skeleton_hash, get_cached_plan
        skeleton_hash = compute_skeleton_hash(module_dir)
        entry = get_cached_plan(module_dir, doc_type, skeleton_hash)
        if not entry or "splits" not in entry:
            return None
        return SplitPlan(splits=entry["splits"], strategy="cache")
    except Exception as e:
        logger.debug("Split cache load failed for %s: %s", doc_type, e)
        return None
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _save_to_cache(module_dir: Path | None, doc_type: str, plan: SplitPlan) -> None:
    """Store ``plan`` in the split cache under the current skeleton hash.

    Does nothing without a module directory or when the plan has no splits.
    Failures are logged at debug level and otherwise ignored.
    """
    if not (module_dir and plan.splits):
        return
    try:
        from core.skeleton.split_cache import compute_skeleton_hash, save_plan_cache
        payload = {"splits": plan.splits}
        skeleton_hash = compute_skeleton_hash(module_dir)
        save_plan_cache(
            module_dir,
            skeleton_hash,
            doc_type,
            payload,
            num_splits=len(plan.splits),
        )
    except Exception as e:
        logger.debug("Failed to save split cache for %s: %s", doc_type, e)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# ---------------------------------------------------------------------------
|
|
152
|
+
# Strategy 2: Community detection
|
|
153
|
+
# ---------------------------------------------------------------------------
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def _try_community(
    file_list: list[dict],
    entries: list[dict],
    split_config: SplitConfig,
    dep_graph: Any | None,
    doc_type: str,
) -> SplitPlan | None:
    """Build a plan from dependency-graph communities, or return None.

    None is returned when the graph modules cannot be imported, no usable
    graph is available, grouping yields at most one group, or it yields
    more groups than ``max(6, total_files // max_files + 2)``.
    """
    try:
        from core.skeleton.dependency_graph import DependencyGraph, build_dependency_graph
        from core.skeleton.community import split_by_community
    except ImportError:
        return None

    if dep_graph is None:
        # No graph supplied: build one from the entries whose file name
        # matches one of the files being split.
        if not entries:
            return None
        wanted_names = {Path(f.get("name", "")).name for f in file_list}
        matching = [e for e in entries if Path(e.get("file", "")).name in wanted_names]
        if not matching:
            return None
        dep_graph = build_dependency_graph(matching)

    if not (isinstance(dep_graph, DependencyGraph) and dep_graph.adjacency):
        return None

    line_budget = split_config.effective_max_lines(doc_type)
    file_budget = split_config.max_files_per_shard

    groups = split_by_community(
        file_list,
        dep_graph,
        max_files_per_shard=file_budget,
        max_lines_per_shard=line_budget,
    )

    if not groups or len(groups) <= 1:
        return None

    # Reject over-fragmented results so a later strategy can take over.
    group_ceiling = max(6, len(file_list) // file_budget + 2)
    if len(groups) > group_ceiling:
        return None

    splits = []
    for group in groups:
        label = derive_name(group, noise_words=split_config.noise_words)
        splits.append(make_split(label, group))
    return SplitPlan(splits=splits, strategy="community")
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# ---------------------------------------------------------------------------
|
|
204
|
+
# Strategy 5: Simple split
|
|
205
|
+
# ---------------------------------------------------------------------------
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _simple_split(
    file_list: list[dict],
    split_config: SplitConfig,
    doc_type: str,
) -> SplitPlan:
    """Last resort: balanced split into chunks named ``group-1``, ``group-2``, ..."""
    line_budget = split_config.effective_max_lines(doc_type)
    file_budget = split_config.max_files_per_shard

    splits = []
    for number, chunk in enumerate(balanced_split(file_list, line_budget, file_budget), start=1):
        splits.append(make_split(f"group-{number}", chunk))
    return SplitPlan(splits=splits, strategy="simple")
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
"""Split plan helpers — shared utilities for split strategies.
|
|
2
|
+
|
|
3
|
+
Includes:
|
|
4
|
+
- Package-based grouping strategy
|
|
5
|
+
- Name derivation (semantic business-domain naming)
|
|
6
|
+
- Balanced splitting (LPT greedy)
|
|
7
|
+
- Small-split merging
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import math
|
|
13
|
+
import os
|
|
14
|
+
import re
|
|
15
|
+
from collections import Counter, defaultdict
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def make_split(name: str, files: list[dict]) -> dict[str, Any]:
    """Build the split descriptor for a named group of files.

    Args:
        name: Name of the split.
        files: File dicts; the ``rel_path`` (falling back to ``name``),
            ``lines`` and ``package`` keys are read, all optional.

    Returns:
        Dict with ``name``, ``files`` (paths in input order), ``file_count``,
        ``lines`` (summed line counts) and ``packages`` (distinct package
        values, sorted so the result is reproducible across runs).
    """
    distinct_packages = {f.get("package", "") for f in files}
    return {
        "name": name,
        "files": [f.get("rel_path", f.get("name", "")) for f in files],
        "file_count": len(files),
        "lines": sum(f.get("lines", 0) for f in files),
        # Set iteration order varies with string hash randomization, so sort
        # it; key=str keeps an unexpected non-string value from raising.
        "packages": sorted(distinct_packages, key=str),
    }
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _file_stem(file_info: dict) -> str:
    """Return the file's base name without directory or extension."""
    raw_name = file_info.get("name") or ""
    return raw_name.rsplit("/", 1)[-1].rsplit("\\", 1)[-1].rsplit(".", 1)[0]


def derive_name(
    files: list[dict],
    split_name_suffixes: tuple[str, ...] | None = None,
    noise_words: frozenset[str] | None = None,
) -> str:
    """Derive a semantic business-domain name from file list.

    Strategy priority:
    1. Common sub-package beyond base (e.g., service.group -> "group"),
       when one diverging segment covers at least half of the files
    2. Dominant business keywords from class names, combined for specificity
    3. Common prefix of stems (only reached when no keyword was found)
    4. Stem of the file with the most lines, truncated to 30 characters

    Directory components are stripped from file names in every strategy,
    so the result never contains a path separator taken from a name.

    Args:
        files: File dicts; ``name``, ``package`` and ``lines`` are read.
        split_name_suffixes: Class-name suffixes removed before computing
            the common prefix; a built-in list is used when omitted.
        noise_words: Lower-case words ignored as keywords; a built-in set
            is used when omitted.

    Returns:
        The derived name, or "group" when nothing usable can be derived.
    """
    if not files:
        return "group"

    suffixes = split_name_suffixes or (
        "ServiceImpl", "Service", "Handler", "Processor",
        "Manager", "Controller", "Listener", "Client", "Biz", "BizImpl",
    )
    noise = noise_words or frozenset({
        "service", "impl", "base", "controller", "manager",
        "handler", "listener", "api", "java", "abstract", "default", "i",
    })
    stems = [_file_stem(f) for f in files]

    # Strategy 1: first package segment after the shared base.
    generic_segments = ("impl", "service", "base", "controller", "config", "util", "common")
    parts_list = [pkg.split(".") for pkg in (f.get("package") for f in files) if pkg]
    if parts_list:
        common_depth = 0
        for i in range(min(len(parts) for parts in parts_list)):
            if len({parts[i] for parts in parts_list}) != 1:
                break
            common_depth = i + 1
        diverging = Counter(
            parts[common_depth]
            for parts in parts_list
            if len(parts) > common_depth and parts[common_depth] not in generic_segments
        )
        if diverging:
            top_pkg, top_count = diverging.most_common(1)[0]
            if top_count >= len(files) * 0.5:
                return top_pkg

    # Strategy 2: most frequent CamelCase words in the file names.
    keywords = Counter()
    for stem in stems:
        for word in re.findall(r'[A-Z][a-z]+', stem):
            w_lower = word.lower()
            if w_lower not in noise and len(w_lower) >= 3:
                keywords[w_lower] += 1
    if keywords:
        threshold = max(2, len(files) * 0.2)
        top_keywords = keywords.most_common(5)
        # Up to two keywords that are frequent enough; otherwise the single
        # most common one.
        selected = [k for k, c in top_keywords if c >= threshold][:2]
        return "-".join(selected) if selected else top_keywords[0][0]

    # Strategy 3: common prefix of the stems with role suffixes removed.
    # Longest suffix first so "ServiceImpl" is tried before "Service".
    ordered_suffixes = sorted(suffixes, key=len, reverse=True)
    trimmed: list[str] = []
    for stem in stems:
        for s in ordered_suffixes:
            if stem.endswith(s) and len(stem) > len(s):
                stem = stem[:-len(s)]
                break
        trimmed.append(stem)
    prefix = os.path.commonprefix(trimmed)
    if len(prefix) >= 4:
        kebab = re.sub(r'([a-z])([A-Z])', r'\1-\2', prefix).lower().strip("-")
        if kebab:
            return kebab

    # Strategy 4: name after the largest file.
    largest = max(files, key=lambda f: f.get("lines", 0))
    return _file_stem(largest).lower()[:30] or "group"
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def derive_name_with_context(
    files: list[dict],
    pkg_label: str,
    chunk_idx: int,
    total_chunks: int,
    noise_words: frozenset[str] | None = None,
) -> str:
    """Name one chunk of a package group as ``<pkg_label>-<keywords>``.

    The keywords are the two most frequent CamelCase words (lower-cased, at
    least three characters) found in the chunk's file names, ignoring noise
    words and the package label itself. When no keyword qualifies the name
    falls back to ``<pkg_label>-<chunk_idx>``. ``total_chunks`` is accepted
    but not used.
    """
    ignored = set(noise_words or {
        "service", "impl", "base", "controller", "manager",
        "handler", "listener", "api", "java", "abstract", "default", "i",
    })
    ignored.add(pkg_label.lower())

    counts = Counter()
    for info in files:
        tail = info.get("name", "").rsplit("/", 1)[-1].rsplit("\\", 1)[-1]
        stem = tail.rsplit(".", 1)[0]
        counts.update(
            word
            for word in map(str.lower, re.findall(r'[A-Z][a-z]+', stem))
            if len(word) >= 3 and word not in ignored
        )

    if not counts:
        return f"{pkg_label}-{chunk_idx}"

    suffix = "-".join(word for word, _ in counts.most_common(2))
    return f"{pkg_label}-{suffix}"
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def rebalance_groups(groups: list[list[dict]], max_files: int) -> None:
    """Split oversized groups to respect max_files_per_shard constraint.

    ``groups`` is modified in place: each group longer than ``max_files``
    keeps its first ``max_files`` entries, and the remainder is appended to
    the end of ``groups`` in chunks of at most ``max_files``.

    Args:
        groups: Groups of file dicts to rebalance.
        max_files: Maximum number of files allowed per group; must be >= 1.

    Raises:
        ValueError: If ``max_files`` is less than 1. Such a limit can never
            be satisfied and would otherwise make the chunking loop run
            forever.
    """
    if max_files < 1:
        raise ValueError(f"max_files must be >= 1, got {max_files}")

    overflow_chunks: list[list[dict]] = []
    for idx, group in enumerate(groups):
        if len(group) <= max_files:
            continue
        groups[idx] = group[:max_files]
        overflow_chunks.extend(
            group[start:start + max_files]
            for start in range(max_files, len(group), max_files)
        )
    # Overflow chunks are already within the limit, so they go straight to
    # the end without another pass.
    groups.extend(overflow_chunks)
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def deduplicate_names(splits: list[dict]) -> None:
    """Make every split name unique by appending ``-<n>`` to duplicated names.

    Names that occur only once are left untouched.  Every split whose name
    occurs more than once is renamed to ``"<name>-<n>"`` with ``n`` counting
    up from 1 in list order.  A suffix is skipped when the resulting name is
    already in use, so a renamed split can never collide with another split
    (for example ``["a", "a", "a-1"]`` becomes ``["a-2", "a-3", "a-1"]``).

    Args:
        splits: Split records; the ``"name"`` key is read and rewritten in
            place.
    """
    name_counts = Counter(s["name"] for s in splits)
    dupes = {name for name, count in name_counts.items() if count > 1}
    if not dupes:
        return

    # Names that will stay exactly as they are; generated names must avoid
    # them as well as every name generated earlier in this pass.
    taken = {name for name in name_counts if name not in dupes}
    counters: dict[str, int] = {}
    for s in splits:
        base = s["name"]
        if base not in dupes:
            continue
        number = counters.get(base, 0) + 1
        candidate = f"{base}-{number}"
        while candidate in taken:
            number += 1
            candidate = f"{base}-{number}"
        counters[base] = number
        taken.add(candidate)
        s["name"] = candidate
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def group_by_package(files: list[dict], depth: int = 3) -> dict[str, list[dict]]:
    """Group file records by the first ``depth`` segments of their package.

    The ``"package"`` value is split on ``"."``.  When it has at least
    ``depth`` segments the key is the first ``depth`` segments re-joined;
    otherwise the key is the whole package value, or ``"root"`` when the
    record has no ``"package"`` key at all.

    Args:
        files: File records to group.
        depth: Number of leading package segments that form the group key.

    Returns:
        A plain dict mapping group key to the records in that group, with
        keys in order of first appearance.
    """
    grouped: dict[str, list[dict]] = {}
    for entry in files:
        segments = entry.get("package", "").split(".")
        if len(segments) < depth:
            bucket = entry.get("package", "root")
        else:
            bucket = ".".join(segments[:depth])
        grouped.setdefault(bucket, []).append(entry)
    return grouped
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def balanced_split(files: list[dict], max_lines: int, max_files: int) -> list[list[dict]]:
    """Split ``files`` into balanced buckets using greedy LPT assignment.

    The bucket count is the larger of ``ceil(len(files) / max_files)``,
    ``ceil(total_lines / max_lines)`` and 2.  Files are processed from the
    largest ``"lines"`` value to the smallest and each one is placed in the
    bucket with the smallest running line total that still has room for
    another file.  Restricting the choice to buckets below ``max_files``
    keeps files with equal (or missing) line counts from all landing in the
    first bucket and exceeding the per-bucket file limit.

    Args:
        files: File records; ``"lines"`` is read and defaults to 0.
        max_lines: Target maximum number of lines per bucket.
        max_files: Maximum number of files per bucket.

    Returns:
        The non-empty buckets, in bucket order.
    """
    total_lines = sum(f.get("lines", 0) for f in files)
    n = max(math.ceil(len(files) / max_files), math.ceil(total_lines / max_lines), 2)
    buckets: list[list[dict]] = [[] for _ in range(n)]
    bucket_lines = [0] * n
    for f in sorted(files, key=lambda x: -x.get("lines", 0)):
        # n >= ceil(len(files) / max_files) guarantees an open bucket for a
        # positive max_files; the fallback keeps the previous behaviour for
        # limits that can never be met.
        candidates = [i for i in range(n) if len(buckets[i]) < max_files] or range(n)
        lightest = min(candidates, key=lambda i: bucket_lines[i])
        buckets[lightest].append(f)
        bucket_lines[lightest] += f.get("lines", 0)
    return [b for b in buckets if b]
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def merge_small(
|
|
197
|
+
splits: list[dict],
|
|
198
|
+
max_lines: int,
|
|
199
|
+
max_files: int,
|
|
200
|
+
merge_ratio: float = 0.25,
|
|
201
|
+
file_list: list[dict] | None = None,
|
|
202
|
+
) -> list[dict]:
|
|
203
|
+
"""Merge small splits into neighbors when they're far below target size."""
|
|
204
|
+
small_line_threshold = max(200, int(max_lines * merge_ratio))
|
|
205
|
+
small_file_threshold = max(5, int(max_files * merge_ratio))
|
|
206
|
+
|
|
207
|
+
file_lines_lookup: dict[str, int] = {}
|
|
208
|
+
if file_list:
|
|
209
|
+
for f in file_list:
|
|
210
|
+
file_lines_lookup[f.get("name", "")] = f.get("lines", 0)
|
|
211
|
+
|
|
212
|
+
large: list[dict] = []
|
|
213
|
+
small_files: list[dict] = []
|
|
214
|
+
for s in splits:
|
|
215
|
+
if s["lines"] < small_line_threshold and s["file_count"] < small_file_threshold:
|
|
216
|
+
avg_lines = s["lines"] // max(s["file_count"], 1)
|
|
217
|
+
for fname in s["files"]:
|
|
218
|
+
actual_lines = file_lines_lookup.get(fname, avg_lines)
|
|
219
|
+
small_files.append({"name": fname, "lines": actual_lines, "package": ""})
|
|
220
|
+
else:
|
|
221
|
+
large.append(s)
|
|
222
|
+
|
|
223
|
+
if small_files and large:
|
|
224
|
+
for f in small_files:
|
|
225
|
+
lightest = min(large, key=lambda x: x["lines"])
|
|
226
|
+
if lightest["lines"] + f["lines"] <= max_lines and lightest["file_count"] + 1 <= max_files:
|
|
227
|
+
lightest["files"].append(f["name"])
|
|
228
|
+
lightest["file_count"] = len(lightest["files"])
|
|
229
|
+
lightest["lines"] += f["lines"]
|
|
230
|
+
else:
|
|
231
|
+
large.append(make_split("misc", [f]))
|
|
232
|
+
elif small_files:
|
|
233
|
+
large.append(make_split("misc", small_files))
|
|
234
|
+
return large
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def package_label(pkg: str, depth: int) -> str:
    """Extract a meaningful label from a dotted package path.

    Resolution order:

    1. The non-generic segments after the first ``depth`` segments, joined
       with ``-``.
    2. Otherwise the right-most segment that is not generic and is longer
       than three characters.
    3. Otherwise the right-most non-empty segment.
    4. Otherwise ``"group"``.

    Empty segments (from an empty package or stray dots) are ignored, so the
    result is never an empty string and the ``"group"`` fallback is actually
    reachable.

    Args:
        pkg: Dotted package path.
        depth: Number of leading segments treated as the shared prefix.

    Returns:
        The label for the package.
    """
    parts = pkg.split(".")
    generic = {
        "impl", "service", "base", "controller", "config", "util", "common", "api",
        "model", "entity", "manager", "provider", "client", "listener", "event",
        "promotion", "app", "example",
    }
    meaningful = [p for p in parts[depth:] if p and p not in generic]
    if meaningful:
        return "-".join(meaningful)
    for p in reversed(parts):
        if p not in generic and len(p) > 3:
            return p
    # str.split never returns an empty list, so look for the last non-empty
    # segment rather than testing the list itself.
    return next((p for p in reversed(parts) if p), "group")
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def try_package(
    file_list: list[dict],
    split_config: Any,
    doc_type: str,
) -> Any | None:
    """Try to build a split plan by grouping files on their package.

    Package depths 3 through 9 are tried in order and the first depth that
    yields more than one group is used.  Each group becomes one split; a
    group that exceeds the file or line limit is divided with
    ``balanced_split`` and a warning is recorded.  Small splits are then
    merged with ``merge_small`` and names are made unique.

    Args:
        file_list: File records to distribute.
        split_config: Configuration object; this function reads
            ``effective_max_lines(doc_type)``, ``per_doc_type_overrides``,
            ``max_files_per_shard``, ``noise_words`` and
            ``merge_threshold_ratio`` from it.
        doc_type: Document type used to look up limits and overrides.

    Returns:
        A ``SplitPlan`` with strategy ``"package"``, or ``None`` when no
        depth produces more than one group or merging leaves at most one
        split.
    """
    from core.skeleton.split import SplitPlan

    max_lines = split_config.effective_max_lines(doc_type)
    overrides = split_config.per_doc_type_overrides.get(doc_type, {})
    max_files = overrides.get("max_files_per_shard", split_config.max_files_per_shard)

    chosen = None
    chosen_depth = 3
    for depth in range(3, 10):
        candidate = group_by_package(file_list, depth=depth)
        if len(candidate) > 1:
            chosen, chosen_depth = candidate, depth
            break

    if chosen is None or len(chosen) <= 1:
        return None

    splits: list[dict] = []
    warnings: list[str] = []

    for pkg, members in chosen.items():
        label = package_label(pkg, chosen_depth)
        line_total = sum(member.get("lines", 0) for member in members)
        within_limits = len(members) <= max_files and line_total <= max_lines
        if within_limits:
            splits.append(make_split(label, members))
            continue

        chunks = balanced_split(members, max_lines, max_files)
        for position, chunk in enumerate(chunks, start=1):
            chunk_name = derive_name_with_context(
                chunk, label, position, len(chunks),
                noise_words=split_config.noise_words,
            )
            splits.append(make_split(chunk_name, chunk))
        warnings.append(f"Package {pkg} oversized, split into {len(chunks)} groups")

    splits = merge_small(
        splits, max_lines, max_files,
        split_config.merge_threshold_ratio, file_list=file_list,
    )

    if len(splits) <= 1:
        return None

    deduplicate_names(splits)
    return SplitPlan(splits=splits, strategy="package", warnings=warnings)
|