source-kb 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +50 -0
- cli/__main__.py +5 -0
- cli/commands/__init__.py +1 -0
- cli/commands/anchor_fix.py +47 -0
- cli/commands/diff_doc.py +52 -0
- cli/commands/dispatch.py +77 -0
- cli/commands/extract.py +72 -0
- cli/commands/file_list.py +74 -0
- cli/commands/index.py +84 -0
- cli/commands/lock.py +89 -0
- cli/commands/merge.py +60 -0
- cli/commands/merge_delta.py +19 -0
- cli/commands/metadata.py +24 -0
- cli/commands/pipeline.py +45 -0
- cli/commands/post_merge.py +43 -0
- cli/commands/query.py +52 -0
- cli/commands/render.py +101 -0
- cli/commands/scan_repos.py +46 -0
- cli/commands/setup.py +94 -0
- cli/commands/split.py +196 -0
- cli/commands/stale_files.py +98 -0
- cli/commands/validate.py +191 -0
- core/__init__.py +32 -0
- core/config.py +261 -0
- core/docs/__init__.py +7 -0
- core/docs/section_updater.py +286 -0
- core/docs/shared.py +149 -0
- core/git.py +294 -0
- core/interfaces.py +249 -0
- core/monitor/__init__.py +5 -0
- core/monitor/progress.py +83 -0
- core/monitor/prompt_store.py +49 -0
- core/paths.py +141 -0
- core/preset.py +237 -0
- core/preset_accessors.py +202 -0
- core/preset_classify.py +132 -0
- core/preset_hooks.py +129 -0
- core/preset_profile.py +89 -0
- core/prompt/__init__.py +7 -0
- core/prompt/__main__.py +147 -0
- core/prompt/content.py +320 -0
- core/prompt/context_manager.py +164 -0
- core/prompt/renderer.py +236 -0
- core/prompt/response_parser.py +274 -0
- core/prompt/templates.py +357 -0
- core/prompt/validate_parity.py +162 -0
- core/prompt/variables.py +339 -0
- core/rag/__init__.py +22 -0
- core/rag/__main__.py +136 -0
- core/rag/bm25_index.py +268 -0
- core/rag/chunker.py +273 -0
- core/rag/embedder.py +151 -0
- core/rag/indexer.py +292 -0
- core/rag/loader.py +89 -0
- core/rag/retriever.py +82 -0
- core/skeleton/__init__.py +11 -0
- core/skeleton/__main__.py +934 -0
- core/skeleton/anchor_fix.py +250 -0
- core/skeleton/classify.py +331 -0
- core/skeleton/cmd_anchor_fix.py +43 -0
- core/skeleton/cmd_diff_doc.py +44 -0
- core/skeleton/cmd_lock.py +87 -0
- core/skeleton/cmd_merge_delta.py +41 -0
- core/skeleton/community.py +233 -0
- core/skeleton/dependency_graph.py +306 -0
- core/skeleton/diff_doc.py +248 -0
- core/skeleton/dispatch.py +273 -0
- core/skeleton/dispatch_render.py +319 -0
- core/skeleton/dispatch_source.py +111 -0
- core/skeleton/extract.py +218 -0
- core/skeleton/extract_methods.py +298 -0
- core/skeleton/file_list.py +239 -0
- core/skeleton/impact.py +278 -0
- core/skeleton/jar_download.py +177 -0
- core/skeleton/jar_resolver.py +186 -0
- core/skeleton/loader.py +162 -0
- core/skeleton/merge.py +278 -0
- core/skeleton/merge_delta.py +229 -0
- core/skeleton/metadata.py +96 -0
- core/skeleton/metadata_builders.py +264 -0
- core/skeleton/module_dag.py +330 -0
- core/skeleton/parsers/__init__.py +71 -0
- core/skeleton/parsers/jqassistant.py +300 -0
- core/skeleton/parsers/jqassistant_cypher.py +225 -0
- core/skeleton/parsers/regex.py +171 -0
- core/skeleton/parsers/treesitter.py +324 -0
- core/skeleton/parsers/treesitter_java.py +284 -0
- core/skeleton/parsers/treesitter_multi.py +289 -0
- core/skeleton/pom_parser.py +299 -0
- core/skeleton/post_merge.py +295 -0
- core/skeleton/post_merge_llm.py +82 -0
- core/skeleton/query.py +195 -0
- core/skeleton/shard_context.py +177 -0
- core/skeleton/split.py +180 -0
- core/skeleton/split_cache.py +107 -0
- core/skeleton/split_feedback.py +174 -0
- core/skeleton/split_plan.py +219 -0
- core/skeleton/split_plan_helpers.py +305 -0
- core/skeleton/split_plan_llm.py +274 -0
- core/utils.py +135 -0
- core/validators/__init__.py +65 -0
- core/validators/__main__.py +215 -0
- core/validators/consistency.py +203 -0
- core/validators/coverage.py +171 -0
- core/validators/duplicates.py +76 -0
- core/validators/engine.py +224 -0
- core/validators/links.py +76 -0
- core/validators/sampling.py +169 -0
- core/validators/structure.py +144 -0
- engine/__init__.py +7 -0
- engine/assembler.py +231 -0
- engine/confirm.py +65 -0
- engine/dedup.py +106 -0
- engine/main.py +211 -0
- engine/pipeline/__init__.py +163 -0
- engine/pipeline/recovery.py +250 -0
- engine/pipeline/steps/__init__.py +23 -0
- engine/pipeline/steps/audit.py +220 -0
- engine/pipeline/steps/audit_apply.py +195 -0
- engine/pipeline/steps/audit_helpers.py +155 -0
- engine/pipeline/steps/classify_llm.py +236 -0
- engine/pipeline/steps/classify_prompt.py +223 -0
- engine/pipeline/steps/finalize.py +160 -0
- engine/pipeline/steps/generate.py +169 -0
- engine/pipeline/steps/generate_batch.py +197 -0
- engine/pipeline/steps/generate_recovery.py +170 -0
- engine/pipeline/steps/llm_plan_split.py +253 -0
- engine/pipeline/steps/lock.py +64 -0
- engine/pipeline/steps/preflight.py +237 -0
- engine/pipeline/steps/preflight_adjust.py +147 -0
- engine/pipeline/steps/pregenerate.py +130 -0
- engine/pipeline/steps/quality.py +81 -0
- engine/pipeline/steps/skeleton.py +149 -0
- engine/pipeline/steps/source.py +163 -0
- engine/pipeline/steps/sync.py +117 -0
- engine/pipeline/steps/sync_finalize.py +237 -0
- engine/pipeline/steps/sync_update.py +341 -0
- engine/pipelines.py +91 -0
- engine/runner.py +335 -0
- engine/strategies/__init__.py +86 -0
- engine/strategies/api.py +128 -0
- engine/strategies/delegated.py +50 -0
- engine/strategies/dryrun.py +25 -0
- engine/two_phase.py +143 -0
- mcp_server/__init__.py +73 -0
- mcp_server/__main__.py +5 -0
- mcp_server/tools/__init__.py +1 -0
- mcp_server/tools/config.py +63 -0
- mcp_server/tools/discovery.py +276 -0
- mcp_server/tools/generation.py +184 -0
- mcp_server/tools/planning.py +144 -0
- mcp_server/tools/source.py +175 -0
- mcp_server/tools/validation.py +140 -0
- mcp_server/tools/workflow.py +166 -0
- mcp_server/workflow_loader.py +204 -0
- presets/generic/audit_dimensions.md +132 -0
- presets/generic/doc_types.yaml +152 -0
- presets/generic/preset.yaml +115 -0
- presets/java-spring/audit_dimensions.md +228 -0
- presets/java-spring/audit_dimensions.yaml +203 -0
- presets/java-spring/doc_types.yaml +269 -0
- presets/java-spring/hooks.py +122 -0
- presets/java-spring/preset.yaml +341 -0
- presets/java-spring/templates/README.md +34 -0
- presets/java-spring/templates/audit-system.md +15 -0
- presets/java-spring/templates/subagent-aop.md +105 -0
- presets/java-spring/templates/subagent-api.md +63 -0
- presets/java-spring/templates/subagent-architecture.md +111 -0
- presets/java-spring/templates/subagent-async-events.md +107 -0
- presets/java-spring/templates/subagent-audit-api-contracts.md +40 -0
- presets/java-spring/templates/subagent-audit-architecture.md +38 -0
- presets/java-spring/templates/subagent-audit-business.md +40 -0
- presets/java-spring/templates/subagent-audit-data-models.md +40 -0
- presets/java-spring/templates/subagent-business.md +129 -0
- presets/java-spring/templates/subagent-caching.md +75 -0
- presets/java-spring/templates/subagent-database-access.md +114 -0
- presets/java-spring/templates/subagent-enum.md +75 -0
- presets/java-spring/templates/subagent-error-handling.md +91 -0
- presets/java-spring/templates/subagent-external-integrations.md +80 -0
- presets/java-spring/templates/subagent-index.md +122 -0
- presets/java-spring/templates/subagent-messaging.md +97 -0
- presets/java-spring/templates/subagent-model.md +88 -0
- presets/java-spring/templates/subagent-observability.md +91 -0
- presets/java-spring/templates/subagent-scheduled.md +81 -0
- presets/java-spring/templates/subagent-security.md +102 -0
- presets/java-spring/templates/subagent-structure.md +101 -0
- presets/java-spring/templates/subagent-sync-section.md +34 -0
- presets/java-spring/templates/subagent-utils.md +73 -0
- presets/java-spring/templates/sync-system.md +8 -0
- presets/java-spring/workflow-extensions.md +112 -0
- skills/__init__.py +1 -0
- skills/_shared/README.md +30 -0
- skills/_shared/doc-coverage-shared.md +134 -0
- skills/_shared/doc-quality-standard.md +1058 -0
- skills/_shared/doc-subagent-rules.md +762 -0
- skills/_shared/windows-compat.md +89 -0
- skills/kb-audit/SKILL.md +52 -0
- skills/kb-audit/rules.md +88 -0
- skills/kb-audit/steps/step-01-prepare.md +75 -0
- skills/kb-audit/steps/step-02-audit.md +96 -0
- skills/kb-audit/steps/step-03-verify.md +65 -0
- skills/kb-audit/steps/step-04-report.md +64 -0
- skills/kb-init/SKILL.md +146 -0
- skills/kb-init/rules.md +187 -0
- skills/kb-init/steps/step-01-scope.md +62 -0
- skills/kb-init/steps/step-02-source.md +410 -0
- skills/kb-init/steps/step-03-generate.md +307 -0
- skills/kb-init/steps/step-04-quality.md +92 -0
- skills/kb-init/steps/step-05-finalize.md +132 -0
- skills/kb-init/templates/core/execution-modes.md +29 -0
- skills/kb-init/templates/core/output-only.md +4 -0
- skills/kb-init/templates/core/readwrite.md +33 -0
- skills/kb-search/SKILL.md +138 -0
- skills/kb-search/rules.md +64 -0
- skills/kb-sync/SKILL.md +43 -0
- skills/kb-sync/rules.md +70 -0
- skills/kb-sync/scripts/rebuild_module.py +91 -0
- skills/kb-sync/scripts/scan_repos.py +687 -0
- skills/kb-sync/steps/step-01-detect.md +72 -0
- skills/kb-sync/steps/step-02-update.md +71 -0
- skills/kb-sync/steps/step-03-verify.md +47 -0
- skills/kb-sync/steps/step-04-finalize.md +52 -0
- source_kb-0.2.2.dist-info/METADATA +194 -0
- source_kb-0.2.2.dist-info/RECORD +228 -0
- source_kb-0.2.2.dist-info/WHEEL +5 -0
- source_kb-0.2.2.dist-info/entry_points.txt +3 -0
- source_kb-0.2.2.dist-info/licenses/LICENSE +21 -0
- source_kb-0.2.2.dist-info/top_level.txt +6 -0
core/prompt/content.py
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
"""Prompt content helpers — shared utilities for variable computation.
|
|
2
|
+
|
|
3
|
+
Computes template variables: high_methods, generated_docs, sibling_modules,
|
|
4
|
+
prior_docs_context, shard_context, and source content reading.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import logging
|
|
11
|
+
import re
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def compute_high_methods(module_dir: Path) -> str:
    """Extract high-complexity methods from the skeleton summary for prompt injection.

    Reads the JSON file returned by ``resolve_skeleton_summary(module_dir)`` and
    emits one bullet per high-complexity method in the form
    ``- <file stem>.<method> (<line_count> lines)``.

    For each entry, the explicit ``high_complexity_methods`` list is used when
    it is non-empty; otherwise methods whose ``complexity`` equals ``"high"``
    are taken from ``methods``.

    Returns:
        The bullet list (at most 30 bullets), or a parenthesised placeholder
        when the summary is missing, unreadable, not a JSON list, or contains
        no high-complexity methods.
    """
    from core.paths import resolve_skeleton_summary

    summary_file = resolve_skeleton_summary(module_dir)
    if summary_file is None:
        return "(Skeleton unavailable, identify complex methods from source code)"
    try:
        entries = json.loads(summary_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return "(Failed to read skeleton)"
    # A summary that parses but is not a list cannot be iterated as entries.
    if not isinstance(entries, list):
        return "(Failed to read skeleton)"

    high_methods: list[str] = []
    for entry in entries:
        # Skip malformed entries instead of raising AttributeError on .get().
        if not isinstance(entry, dict):
            continue
        file_path = entry.get("file", "")
        classname = Path(file_path).stem if file_path else "Unknown"

        explicit = entry.get("high_complexity_methods") or []
        if explicit:
            for method in explicit:
                if isinstance(method, dict):
                    name = method.get("name", "")
                    line_count = method.get("line_count", "?")
                else:
                    # Plain values (e.g. bare method names) carry no line count.
                    name, line_count = str(method), "?"
                high_methods.append(f"- {classname}.{name} ({line_count} lines)")
            continue

        # Fallback: derive the list from per-method complexity labels.
        for method in entry.get("methods") or []:
            if isinstance(method, dict) and method.get("complexity") == "high":
                high_methods.append(
                    f"- {classname}.{method.get('name', '')} "
                    f"({method.get('line_count', '?')} lines)"
                )

    if not high_methods:
        return "(No high-complexity methods)"
    # Cap the output so the prompt section stays bounded.
    return "\n".join(high_methods[:30])
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def scan_generated_docs(module_dir: Path) -> str:
    """Return a markdown bullet list of the ``*.md`` files in ``module_dir``.

    Files whose name starts with a dot are skipped and the remaining names are
    sorted. When the directory does not exist or holds no matching file, a
    placeholder string is returned instead.
    """
    placeholder = "(No generated documents yet)"
    if not module_dir.is_dir():
        return placeholder

    names = [
        candidate.name
        for candidate in module_dir.glob("*.md")
        if not candidate.name.startswith(".")
    ]
    if not names:
        return placeholder

    names.sort()
    bullets = [f"- [{name}](./{name})" for name in names]
    return "\n".join(bullets)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def compute_sibling_modules(config: dict[str, Any], kb_name: str, current_module: str) -> str:
    """Describe the other modules of the knowledge base ``kb_name``.

    The source layout is read from
    ``config["knowledge_bases"][kb_name]["source"]``:

    * ``structure == "multi-repo"``: siblings come from ``repos`` and are
      annotated with ``description`` (falling back to ``type``).
    * ``structure == "monorepo"``: siblings come from ``modules`` and are
      annotated with ``type``.

    Entries without a name, and the entry named ``current_module``, are left
    out. Returns a headed bullet list, or a placeholder when nothing remains.
    """
    source = config.get("knowledge_bases", {}).get(kb_name, {}).get("source", {})
    layout = source.get("structure")

    if layout == "multi-repo":
        candidates = [
            (repo.get("name", ""), repo.get("description", repo.get("type", "")))
            for repo in source.get("repos", [])
        ]
    elif layout == "monorepo":
        candidates = [
            (mod.get("name", ""), mod.get("type", ""))
            for mod in source.get("modules", [])
        ]
    else:
        candidates = []

    bullets = [
        f"- **{name}** ({detail})"
        for name, detail in candidates
        if name and name != current_module
    ]
    if bullets:
        return "### Sibling modules in the same knowledge base\n\n" + "\n".join(bullets)
    return "(No sibling modules)"
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def build_prior_docs_context(
    module_dir: Path, current_doc_type: str, max_chars: int = 2000,
    preset: dict | None = None,
) -> str:
    """Build context from prior-batch documents for cross-doc-type reference.

    Dependencies come from ``_get_dependencies`` (the preset's ``depends_on``
    field). When no preset is given, or the doc type declares no dependencies,
    an empty string is returned.

    For each dependency the document is looked up under the preset-configured
    filename first, then under the default ``<doc_type>.md`` name, so documents
    written with either naming scheme are found.

    Args:
        module_dir: Directory holding the generated documents.
        current_doc_type: Doc type whose dependencies are summarised.
        max_chars: Budget for the summed section lengths. When left at the
            default (2000) and a preset is given, the preset's
            ``limits.prior_docs_max_chars`` takes precedence.
        preset: Optional preset configuration.

    Returns:
        A headed block of per-document summaries, or ``""`` when there is
        nothing to include.
    """
    # Use the configured limit only when the caller kept the default value.
    if preset and max_chars == 2000:
        limits = preset.get("limits") or {}
        max_chars = limits.get("prior_docs_max_chars", max_chars)

    dep_docs = _get_dependencies(current_doc_type, preset)
    if not dep_docs:
        return ""

    summaries: list[str] = []
    remaining = max_chars
    for dep_type in dep_docs:
        # Prefer the preset filename; fall back to the default name so that
        # documents written without a preset mapping are still picked up.
        candidates = [_doc_type_to_filename(dep_type, preset)]
        default_name = _doc_type_to_filename(dep_type)
        if default_name not in candidates:
            candidates.append(default_name)
        filename = next(
            (name for name in candidates if (module_dir / name).exists()), None
        )
        if filename is None:
            continue

        summary = _extract_doc_summary(module_dir / filename)
        if not summary:
            continue
        section = f"### {filename}\n{summary}"
        # Stop at the first section that no longer fits the budget.
        if len(section) > remaining:
            break
        summaries.append(section)
        remaining -= len(section)

    if not summaries:
        return ""
    return "## Prior document summaries (available for reference)\n\n" + "\n\n".join(summaries)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _get_dependencies(doc_type: str, preset: dict | None) -> list[str]:
    """Return the doc types that ``doc_type`` depends on, per the preset.

    The list is read from the ``depends_on`` field of the doc-type config
    returned by ``core.preset.get_doc_type_config``. A single string value is
    treated as a one-element list. Without a preset, or when no dependencies
    are declared, an empty list is returned and a debug message is logged.
    """
    if preset:
        from core.preset import get_doc_type_config

        # Tolerate a missing/empty config the same way _doc_type_to_filename does.
        cfg = get_doc_type_config(preset, doc_type) or {}
        deps = cfg.get("depends_on") or []
        if isinstance(deps, str):
            # A bare string would otherwise be iterated character by character.
            deps = [deps]
        if deps:
            return list(deps)
    logger.debug("no depends_on for doc_type=%s, skipping prior docs", doc_type)
    return []
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def read_source_content(
    module_dir: Path, doc_type: str, source_cache: Path, max_bytes: int = 300_000,
) -> str:
    """Read source files from the file list, priority-sorted, with byte limits.

    The file list (one path per line; blank lines and ``#`` comments ignored)
    is resolved through ``resolve_file_list(module_dir, doc_type)``. Paths are
    ordered by: number of high-complexity methods (descending), total line
    count (descending), then path (ascending), using the skeleton-derived
    priority map.

    Each file becomes a ``### <name>`` heading followed by a fenced code block.
    A single file is cut at a line boundary once it exceeds ``max_bytes // 6``
    bytes; output stops with an omission note as soon as adding the next block
    would exceed ``max_bytes``. Missing or undecodable files are skipped.

    Returns:
        The concatenated blocks, or ``""`` when there is no usable file list.
    """
    from core.paths import resolve_file_list

    fl_path = resolve_file_list(module_dir, doc_type)
    if fl_path is None or not fl_path.exists():
        return ""
    try:
        listing = fl_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # Every other read in this module is best-effort; keep this one consistent.
        logger.warning("could not read file list %s", fl_path)
        return ""

    file_paths = [
        entry
        for entry in (raw.strip() for raw in listing.splitlines())
        if entry and not entry.startswith("#")
    ]
    if not file_paths:
        return ""

    priority_map = _build_priority_map(module_dir)
    source_cache_str = str(source_cache.resolve())

    def sort_key(fpath: str):
        # fpath is normally module-relative; for backward compatibility also
        # strip the source_cache prefix from old absolute paths.
        rel = fpath
        if fpath.startswith(source_cache_str):
            rel = fpath[len(source_cache_str):].lstrip("/\\")
        info = priority_map.get(rel, {})
        return (-info.get("high_methods", 0), -info.get("total_lines", 0), fpath)

    file_paths.sort(key=sort_key)

    parts: list[str] = []
    total_bytes_used = 0
    included = 0
    max_single_file = max_bytes // 6

    for fpath in file_paths:
        src_file = Path(fpath) if Path(fpath).is_absolute() else source_cache / fpath
        if not src_file.exists():
            continue
        try:
            raw_content = src_file.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue

        if len(raw_content.encode("utf-8")) <= max_single_file:
            content = raw_content
        else:
            # Keep whole lines until the per-file byte budget is exhausted.
            lines = raw_content.splitlines(keepends=True)
            truncated: list[str] = []
            acc = 0
            for line in lines:
                acc += len(line.encode("utf-8"))
                if acc > max_single_file:
                    break
                truncated.append(line)
            content = "".join(truncated)
            content += f"\n// ... [truncated, showing {len(truncated)}/{len(lines)} lines]\n"

        ext = Path(fpath).suffix.lstrip(".") or "text"
        block = f"### {Path(fpath).name}\n```{ext}\n{content}\n```\n"
        block_bytes = len(block.encode("utf-8"))
        if total_bytes_used + block_bytes > max_bytes:
            parts.append(f"\n[truncated — {len(file_paths) - included} files omitted]\n")
            break
        parts.append(block)
        total_bytes_used += block_bytes
        included += 1

    return "\n".join(parts)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _load_json_list(path: Path) -> list:
    """Return the JSON list stored at ``path``, or ``[]`` if unreadable or not a list."""
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        return []
    return data if isinstance(data, list) else []


def _build_priority_map(module_dir: Path) -> dict[str, dict]:
    """Build the file priority map ``{file: {high_methods, total_lines}}``.

    Entries are loaded from the skeleton summary when it yields any; otherwise
    from the resolved skeleton, which may be a directory of ``*.json`` files
    (read in sorted order, so duplicate handling is deterministic) or a single
    JSON file. Unreadable files, non-list payloads and non-dict entries are
    ignored.

    ``high_methods`` is the length of ``high_complexity_methods`` or, when that
    is empty, the number of ``methods`` whose ``complexity`` is ``"high"``.
    ``total_lines`` is the entry's ``line_count`` (0 when absent or null).
    """
    from core.paths import resolve_skeleton, resolve_skeleton_summary

    entries: list = []
    summary_file = resolve_skeleton_summary(module_dir)
    if summary_file is not None:
        entries = _load_json_list(summary_file)

    if not entries:
        resolved = resolve_skeleton(module_dir)
        if resolved is not None:
            if resolved.is_dir():
                for shard_file in sorted(resolved.glob("*.json")):
                    entries.extend(_load_json_list(shard_file))
            else:
                entries = _load_json_list(resolved)

    priority: dict[str, dict] = {}
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        fpath = entry.get("file", "")
        if not fpath:
            continue
        high_count = len(entry.get("high_complexity_methods") or [])
        if not high_count:
            high_count = sum(
                1
                for method in entry.get("methods") or []
                if isinstance(method, dict) and method.get("complexity") == "high"
            )
        # "or 0" keeps the value negatable by the caller's sort key when null.
        priority[fpath] = {
            "high_methods": high_count,
            "total_lines": entry.get("line_count") or 0,
        }
    return priority
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def build_shard_context(
    module_dir: Path, doc_type: str, current_shard: str,
    completed_shards: list[str], max_chars: int = 1500,
    preset: dict | None = None,
) -> str:
    """Build context from completed shards of the same doc type.

    Each completed shard other than ``current_shard`` is looked up as
    ``<doc basename>-<shard>.md`` inside ``module_dir``. The basename is the
    doc-type filename without its ``.md`` suffix; the preset-configured
    filename is tried first, then the default ``<doc_type>`` name.

    Args:
        module_dir: Directory holding the shard documents.
        doc_type: Doc type the shards belong to.
        current_shard: Shard being generated; excluded from the context.
        completed_shards: Names of shards already generated.
        max_chars: Upper bound for the returned text. When left at the default
            (1500) and a preset is given, the preset's
            ``limits.shard_context_max_chars`` takes precedence.
        preset: Optional preset configuration.

    Returns:
        A headed list of per-shard section summaries, truncated with a
        ``[truncated]`` marker when too long, or ``""`` when nothing applies.
    """
    if not completed_shards:
        return ""

    # Use the configured limit only when the caller kept the default value.
    if preset and max_chars == 1500:
        limits = preset.get("limits") or {}
        max_chars = limits.get("shard_context_max_chars", max_chars)

    basenames: list[str] = []
    for filename in (_doc_type_to_filename(doc_type, preset), _doc_type_to_filename(doc_type)):
        # Strip only the trailing extension, not every ".md" occurrence.
        base = filename[:-3] if filename.endswith(".md") else filename
        if base not in basenames:
            basenames.append(base)

    parts: list[str] = []
    for shard_name in completed_shards:
        if shard_name == current_shard:
            continue
        shard_path = next(
            (
                module_dir / f"{base}-{shard_name}.md"
                for base in basenames
                if (module_dir / f"{base}-{shard_name}.md").exists()
            ),
            None,
        )
        if shard_path is None:
            continue
        summary = _extract_shard_summary(shard_path)
        if summary:
            parts.append(f"### {shard_name}\n{summary}")

    if not parts:
        return ""

    text = "## Content already covered by prior shards (avoid repetition)\n\n" + "\n\n".join(parts)
    if len(text) > max_chars:
        # Clamp at 0 so a tiny budget cannot turn into a negative slice.
        text = text[:max(0, max_chars - 20)] + "\n\n[truncated]"
    return text
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
# ---------------------------------------------------------------------------
|
|
271
|
+
# Internal helpers
|
|
272
|
+
# ---------------------------------------------------------------------------
|
|
273
|
+
|
|
274
|
+
_DOC_TYPE_FILENAMES: dict[str, str] = {}
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _doc_type_to_filename(doc_type: str, preset: dict | None = None) -> str:
    """Convert a doc_type key to its document filename.

    When a preset is given and its doc-type config (from
    ``core.preset.get_doc_type_config``) carries a non-empty string
    ``filename``, that value is returned. In every other case, including any
    failure while consulting the preset, the default ``<doc_type>.md`` is
    returned.
    """
    if preset:
        try:
            from core.preset import get_doc_type_config

            dt_config = get_doc_type_config(preset, doc_type)
            filename = dt_config.get("filename") if dt_config else None
        except Exception:
            # Deliberate best-effort: a broken preset lookup must not block
            # prompt building, so fall through to the default name.
            filename = None
        # Only accept a usable filename; callers join it onto a directory path.
        if isinstance(filename, str) and filename:
            return filename
    return f"{doc_type}.md"
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def _extract_doc_summary(path: Path, max_lines: int = 15) -> str:
    """Summarise a document as its title plus a list of second-level headings.

    The title is the first line among the first five that starts with ``"# "``.
    Headings are all lines that, once stripped, start with ``"## "``; at most
    ten are listed, joined with ``" | "``. Returns ``""`` when the file cannot
    be read or contains neither.

    Note: ``max_lines`` is accepted but not used by the implementation.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return ""

    rows = text.splitlines()

    # A line starting with "# " can never start with "## ", so one test suffices.
    title = next((row.strip() for row in rows[:5] if row.startswith("# ")), "")

    section_names: list[str] = []
    for row in rows:
        stripped = row.strip()
        if stripped.startswith("## "):
            section_names.append(stripped.lstrip("#").strip())

    summary: list[str] = [title] if title else []
    if section_names:
        summary.append("Sections: " + " | ".join(section_names[:10]))
    return "\n".join(summary)
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _extract_shard_summary(path: Path) -> str:
    """Summarise a shard document as a list of its second-level headings.

    Lines that start with ``"## "`` after stripping are collected; the first
    eight are listed after ``"Sections: "``, joined with ``" | "``. Returns
    ``""`` when the file cannot be read or has no such heading.
    """
    try:
        text = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        return ""

    titles: list[str] = []
    for row in text.splitlines():
        stripped = row.strip()
        if stripped.startswith("## "):
            titles.append(stripped)

    if not titles:
        return ""
    shown = [title.lstrip("#").strip() for title in titles[:8]]
    return "Sections: " + " | ".join(shown)
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""Prompt context size management.
|
|
2
|
+
|
|
3
|
+
Ensures prompts don't exceed model context limits by applying
|
|
4
|
+
progressive truncation strategies.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
from core.prompt.context_manager import estimate_tokens, manage_context_size
|
|
8
|
+
|
|
9
|
+
system, user = manage_context_size(system, user, max_tokens=128000)
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import re
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def estimate_tokens(text: str) -> int:
    """Approximate the token count of ``text`` without a tokenizer.

    Characters in the range U+4E00..U+9FFF are weighted at 1.5 tokens each
    (rounded down over the total); every other character counts as a quarter
    of a token (integer division over the total). Empty input yields 0.
    """
    if not text:
        return 0

    cjk_count = sum(1 for ch in text if "\u4e00" <= ch <= "\u9fff")
    remainder = len(text) - cjk_count
    return int(cjk_count * 1.5) + remainder // 4
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# Start of the notice line that _truncate_section appends to a cut section.
_TRUNCATION_NOTICE_PREFIX = "... (truncated, "

# (heading keyword, log label, lines kept per section), applied in order.
_SECTION_STRATEGIES = (
    ("skeleton", "skeleton", 100),
    ("source", "source_code", 80),
)


def manage_context_size(
    system: str,
    user: str,
    *,
    max_tokens: int = 128_000,
    threshold_ratio: float = 0.8,
) -> tuple[str, str]:
    """Apply progressive truncation if the prompt exceeds the threshold.

    Truncation priority (applied in order until under the limit):
    1. Truncate over-long sections of the user prompt whose heading mentions
       "skeleton" (this also covers e.g. "skeleton_delta"), one at a time.
    2. Truncate over-long sections whose heading mentions "source", one at a
       time.
    3. Hard-truncate the user prompt to the remaining budget, minus a
       500-token margin. Skipped when the system prompt alone leaves no
       budget.

    The system prompt is never modified. A warning describing what was cut is
    logged whenever the prompt was over the threshold.

    Args:
        system: System prompt
        user: User prompt
        max_tokens: Maximum context tokens
        threshold_ratio: Trigger truncation at this ratio of max

    Returns:
        (system, user) — possibly truncated
    """
    threshold = int(max_tokens * threshold_ratio)
    system_tokens = estimate_tokens(system)
    total = system_tokens + estimate_tokens(user)

    if total <= threshold:
        return system, user

    original_total = total
    truncated_what: list[str] = []

    # Strategies 1 and 2: trim matching sections one by one. Each successful
    # call marks one more section as truncated, so the inner loop terminates.
    for marker, label, max_lines in _SECTION_STRATEGIES:
        while True:
            user, reduced = _truncate_section(user, marker, max_lines=max_lines)
            if not reduced:
                break
            if label not in truncated_what:
                truncated_what.append(label)
            total = system_tokens + estimate_tokens(user)
            if total <= threshold:
                _log_truncation(original_total, total, truncated_what)
                return system, user

    # Strategy 3: hard truncate the user prompt to fit.
    max_user_tokens = threshold - system_tokens - 500  # leave margin
    if max_user_tokens > 0:
        user = _hard_truncate(user, max_user_tokens)
        truncated_what.append("hard_truncate")

    _log_truncation(original_total, system_tokens + estimate_tokens(user), truncated_what)
    return system, user


def _truncate_section(text: str, marker: str, max_lines: int = 100) -> tuple[str, bool]:
    """Truncate the first over-long, not yet truncated section matching ``marker``.

    A section starts at a line that contains ``marker`` (case-insensitive) and
    either starts with ``#`` or ends with ``:``. It runs until the next line
    that starts with ``#``, or that starts with a letter and ends with ``:``,
    or until the end of the text.

    Sections that fit within ``max_lines``, and sections already ending with a
    truncation notice, are skipped and the search continues. Repeated calls
    therefore advance through the matching sections instead of cutting the
    same one again.

    Returns:
        ``(new_text, True)`` when a section was cut to its first ``max_lines``
        lines plus a notice, otherwise ``(text, False)``.
    """
    lines = text.split("\n")
    marker_lower = marker.lower()
    total_lines = len(lines)

    idx = 0
    while idx < total_lines:
        line = lines[idx]
        is_start = marker_lower in line.lower() and (
            line.startswith("#") or line.endswith(":")
        )
        if not is_start:
            idx += 1
            continue

        # Find section end (next heading-like line or end of text).
        end_idx = total_lines
        for j in range(idx + 1, total_lines):
            candidate = lines[j]
            if candidate.startswith("#") or (
                candidate.strip() and candidate[0].isalpha() and candidate.endswith(":")
            ):
                end_idx = j
                break

        section_lines = lines[idx:end_idx]
        already_truncated = section_lines[-1].startswith(_TRUNCATION_NOTICE_PREFIX)
        if len(section_lines) > max_lines and not already_truncated:
            kept = section_lines[:max_lines]
            kept.append(f"\n... (truncated, {len(section_lines) - max_lines} lines removed)")
            return "\n".join(lines[:idx] + kept + lines[end_idx:]), True

        # Resume at the boundary line, which may itself start a matching section.
        idx = end_idx

    return text, False
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _hard_truncate(text: str, max_tokens: int) -> str:
    """Hard truncate text to approximately ``max_tokens``.

    Characters are charged against the budget by type: characters in
    U+4E00..U+9FFF cost 1.5 tokens (matching the module's estimator), every
    other character costs 1/3 token (the "1 token ≈ 3 chars" rule). Text
    without CJK characters is therefore cut at ``max_tokens * 3`` characters,
    while CJK-heavy text is cut earlier.

    Returns:
        ``text`` unchanged when it fits, otherwise the longest fitting prefix
        followed by a truncation notice. A non-positive ``max_tokens`` keeps
        no content from a non-empty text.
    """
    # Work in sixths of a token so both weights are integers: 9/6 and 2/6.
    budget = max(max_tokens, 0) * 6

    # Fast path: fits even if every character carried the CJK cost.
    if len(text) * 9 <= budget:
        return text

    spent = 0
    for idx, ch in enumerate(text):
        spent += 9 if "\u4e00" <= ch <= "\u9fff" else 2
        if spent > budget:
            return text[:idx] + "\n\n... (truncated to fit context window)"
    return text
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _log_truncation(original: int, final: int, what: list[str]) -> None:
    """Emit a warning with the token counts before/after and the list of cuts."""
    removed = ", ".join(what)
    logger.warning(
        "[context_manager] Truncated prompt: %d → %d tokens (removed: %s)",
        original,
        final,
        removed,
    )
|