source-kb 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (228) hide show
  1. cli/__init__.py +50 -0
  2. cli/__main__.py +5 -0
  3. cli/commands/__init__.py +1 -0
  4. cli/commands/anchor_fix.py +47 -0
  5. cli/commands/diff_doc.py +52 -0
  6. cli/commands/dispatch.py +77 -0
  7. cli/commands/extract.py +72 -0
  8. cli/commands/file_list.py +74 -0
  9. cli/commands/index.py +84 -0
  10. cli/commands/lock.py +89 -0
  11. cli/commands/merge.py +60 -0
  12. cli/commands/merge_delta.py +19 -0
  13. cli/commands/metadata.py +24 -0
  14. cli/commands/pipeline.py +45 -0
  15. cli/commands/post_merge.py +43 -0
  16. cli/commands/query.py +52 -0
  17. cli/commands/render.py +101 -0
  18. cli/commands/scan_repos.py +46 -0
  19. cli/commands/setup.py +94 -0
  20. cli/commands/split.py +196 -0
  21. cli/commands/stale_files.py +98 -0
  22. cli/commands/validate.py +191 -0
  23. core/__init__.py +32 -0
  24. core/config.py +261 -0
  25. core/docs/__init__.py +7 -0
  26. core/docs/section_updater.py +286 -0
  27. core/docs/shared.py +149 -0
  28. core/git.py +294 -0
  29. core/interfaces.py +249 -0
  30. core/monitor/__init__.py +5 -0
  31. core/monitor/progress.py +83 -0
  32. core/monitor/prompt_store.py +49 -0
  33. core/paths.py +141 -0
  34. core/preset.py +237 -0
  35. core/preset_accessors.py +202 -0
  36. core/preset_classify.py +132 -0
  37. core/preset_hooks.py +129 -0
  38. core/preset_profile.py +89 -0
  39. core/prompt/__init__.py +7 -0
  40. core/prompt/__main__.py +147 -0
  41. core/prompt/content.py +320 -0
  42. core/prompt/context_manager.py +164 -0
  43. core/prompt/renderer.py +236 -0
  44. core/prompt/response_parser.py +274 -0
  45. core/prompt/templates.py +357 -0
  46. core/prompt/validate_parity.py +162 -0
  47. core/prompt/variables.py +339 -0
  48. core/rag/__init__.py +22 -0
  49. core/rag/__main__.py +136 -0
  50. core/rag/bm25_index.py +268 -0
  51. core/rag/chunker.py +273 -0
  52. core/rag/embedder.py +151 -0
  53. core/rag/indexer.py +292 -0
  54. core/rag/loader.py +89 -0
  55. core/rag/retriever.py +82 -0
  56. core/skeleton/__init__.py +11 -0
  57. core/skeleton/__main__.py +934 -0
  58. core/skeleton/anchor_fix.py +250 -0
  59. core/skeleton/classify.py +331 -0
  60. core/skeleton/cmd_anchor_fix.py +43 -0
  61. core/skeleton/cmd_diff_doc.py +44 -0
  62. core/skeleton/cmd_lock.py +87 -0
  63. core/skeleton/cmd_merge_delta.py +41 -0
  64. core/skeleton/community.py +233 -0
  65. core/skeleton/dependency_graph.py +306 -0
  66. core/skeleton/diff_doc.py +248 -0
  67. core/skeleton/dispatch.py +273 -0
  68. core/skeleton/dispatch_render.py +319 -0
  69. core/skeleton/dispatch_source.py +111 -0
  70. core/skeleton/extract.py +218 -0
  71. core/skeleton/extract_methods.py +298 -0
  72. core/skeleton/file_list.py +239 -0
  73. core/skeleton/impact.py +278 -0
  74. core/skeleton/jar_download.py +177 -0
  75. core/skeleton/jar_resolver.py +186 -0
  76. core/skeleton/loader.py +162 -0
  77. core/skeleton/merge.py +278 -0
  78. core/skeleton/merge_delta.py +229 -0
  79. core/skeleton/metadata.py +96 -0
  80. core/skeleton/metadata_builders.py +264 -0
  81. core/skeleton/module_dag.py +330 -0
  82. core/skeleton/parsers/__init__.py +71 -0
  83. core/skeleton/parsers/jqassistant.py +300 -0
  84. core/skeleton/parsers/jqassistant_cypher.py +225 -0
  85. core/skeleton/parsers/regex.py +171 -0
  86. core/skeleton/parsers/treesitter.py +324 -0
  87. core/skeleton/parsers/treesitter_java.py +284 -0
  88. core/skeleton/parsers/treesitter_multi.py +289 -0
  89. core/skeleton/pom_parser.py +299 -0
  90. core/skeleton/post_merge.py +295 -0
  91. core/skeleton/post_merge_llm.py +82 -0
  92. core/skeleton/query.py +195 -0
  93. core/skeleton/shard_context.py +177 -0
  94. core/skeleton/split.py +180 -0
  95. core/skeleton/split_cache.py +107 -0
  96. core/skeleton/split_feedback.py +174 -0
  97. core/skeleton/split_plan.py +219 -0
  98. core/skeleton/split_plan_helpers.py +305 -0
  99. core/skeleton/split_plan_llm.py +274 -0
  100. core/utils.py +135 -0
  101. core/validators/__init__.py +65 -0
  102. core/validators/__main__.py +215 -0
  103. core/validators/consistency.py +203 -0
  104. core/validators/coverage.py +171 -0
  105. core/validators/duplicates.py +76 -0
  106. core/validators/engine.py +224 -0
  107. core/validators/links.py +76 -0
  108. core/validators/sampling.py +169 -0
  109. core/validators/structure.py +144 -0
  110. engine/__init__.py +7 -0
  111. engine/assembler.py +231 -0
  112. engine/confirm.py +65 -0
  113. engine/dedup.py +106 -0
  114. engine/main.py +211 -0
  115. engine/pipeline/__init__.py +163 -0
  116. engine/pipeline/recovery.py +250 -0
  117. engine/pipeline/steps/__init__.py +23 -0
  118. engine/pipeline/steps/audit.py +220 -0
  119. engine/pipeline/steps/audit_apply.py +195 -0
  120. engine/pipeline/steps/audit_helpers.py +155 -0
  121. engine/pipeline/steps/classify_llm.py +236 -0
  122. engine/pipeline/steps/classify_prompt.py +223 -0
  123. engine/pipeline/steps/finalize.py +160 -0
  124. engine/pipeline/steps/generate.py +169 -0
  125. engine/pipeline/steps/generate_batch.py +197 -0
  126. engine/pipeline/steps/generate_recovery.py +170 -0
  127. engine/pipeline/steps/llm_plan_split.py +253 -0
  128. engine/pipeline/steps/lock.py +64 -0
  129. engine/pipeline/steps/preflight.py +237 -0
  130. engine/pipeline/steps/preflight_adjust.py +147 -0
  131. engine/pipeline/steps/pregenerate.py +130 -0
  132. engine/pipeline/steps/quality.py +81 -0
  133. engine/pipeline/steps/skeleton.py +149 -0
  134. engine/pipeline/steps/source.py +163 -0
  135. engine/pipeline/steps/sync.py +117 -0
  136. engine/pipeline/steps/sync_finalize.py +237 -0
  137. engine/pipeline/steps/sync_update.py +341 -0
  138. engine/pipelines.py +91 -0
  139. engine/runner.py +335 -0
  140. engine/strategies/__init__.py +86 -0
  141. engine/strategies/api.py +128 -0
  142. engine/strategies/delegated.py +50 -0
  143. engine/strategies/dryrun.py +25 -0
  144. engine/two_phase.py +143 -0
  145. mcp_server/__init__.py +73 -0
  146. mcp_server/__main__.py +5 -0
  147. mcp_server/tools/__init__.py +1 -0
  148. mcp_server/tools/config.py +63 -0
  149. mcp_server/tools/discovery.py +276 -0
  150. mcp_server/tools/generation.py +184 -0
  151. mcp_server/tools/planning.py +144 -0
  152. mcp_server/tools/source.py +175 -0
  153. mcp_server/tools/validation.py +140 -0
  154. mcp_server/tools/workflow.py +166 -0
  155. mcp_server/workflow_loader.py +204 -0
  156. presets/generic/audit_dimensions.md +132 -0
  157. presets/generic/doc_types.yaml +152 -0
  158. presets/generic/preset.yaml +115 -0
  159. presets/java-spring/audit_dimensions.md +228 -0
  160. presets/java-spring/audit_dimensions.yaml +203 -0
  161. presets/java-spring/doc_types.yaml +269 -0
  162. presets/java-spring/hooks.py +122 -0
  163. presets/java-spring/preset.yaml +341 -0
  164. presets/java-spring/templates/README.md +34 -0
  165. presets/java-spring/templates/audit-system.md +15 -0
  166. presets/java-spring/templates/subagent-aop.md +105 -0
  167. presets/java-spring/templates/subagent-api.md +63 -0
  168. presets/java-spring/templates/subagent-architecture.md +111 -0
  169. presets/java-spring/templates/subagent-async-events.md +107 -0
  170. presets/java-spring/templates/subagent-audit-api-contracts.md +40 -0
  171. presets/java-spring/templates/subagent-audit-architecture.md +38 -0
  172. presets/java-spring/templates/subagent-audit-business.md +40 -0
  173. presets/java-spring/templates/subagent-audit-data-models.md +40 -0
  174. presets/java-spring/templates/subagent-business.md +129 -0
  175. presets/java-spring/templates/subagent-caching.md +75 -0
  176. presets/java-spring/templates/subagent-database-access.md +114 -0
  177. presets/java-spring/templates/subagent-enum.md +75 -0
  178. presets/java-spring/templates/subagent-error-handling.md +91 -0
  179. presets/java-spring/templates/subagent-external-integrations.md +80 -0
  180. presets/java-spring/templates/subagent-index.md +122 -0
  181. presets/java-spring/templates/subagent-messaging.md +97 -0
  182. presets/java-spring/templates/subagent-model.md +88 -0
  183. presets/java-spring/templates/subagent-observability.md +91 -0
  184. presets/java-spring/templates/subagent-scheduled.md +81 -0
  185. presets/java-spring/templates/subagent-security.md +102 -0
  186. presets/java-spring/templates/subagent-structure.md +101 -0
  187. presets/java-spring/templates/subagent-sync-section.md +34 -0
  188. presets/java-spring/templates/subagent-utils.md +73 -0
  189. presets/java-spring/templates/sync-system.md +8 -0
  190. presets/java-spring/workflow-extensions.md +112 -0
  191. skills/__init__.py +1 -0
  192. skills/_shared/README.md +30 -0
  193. skills/_shared/doc-coverage-shared.md +134 -0
  194. skills/_shared/doc-quality-standard.md +1058 -0
  195. skills/_shared/doc-subagent-rules.md +762 -0
  196. skills/_shared/windows-compat.md +89 -0
  197. skills/kb-audit/SKILL.md +52 -0
  198. skills/kb-audit/rules.md +88 -0
  199. skills/kb-audit/steps/step-01-prepare.md +75 -0
  200. skills/kb-audit/steps/step-02-audit.md +96 -0
  201. skills/kb-audit/steps/step-03-verify.md +65 -0
  202. skills/kb-audit/steps/step-04-report.md +64 -0
  203. skills/kb-init/SKILL.md +146 -0
  204. skills/kb-init/rules.md +187 -0
  205. skills/kb-init/steps/step-01-scope.md +62 -0
  206. skills/kb-init/steps/step-02-source.md +410 -0
  207. skills/kb-init/steps/step-03-generate.md +307 -0
  208. skills/kb-init/steps/step-04-quality.md +92 -0
  209. skills/kb-init/steps/step-05-finalize.md +132 -0
  210. skills/kb-init/templates/core/execution-modes.md +29 -0
  211. skills/kb-init/templates/core/output-only.md +4 -0
  212. skills/kb-init/templates/core/readwrite.md +33 -0
  213. skills/kb-search/SKILL.md +138 -0
  214. skills/kb-search/rules.md +64 -0
  215. skills/kb-sync/SKILL.md +43 -0
  216. skills/kb-sync/rules.md +70 -0
  217. skills/kb-sync/scripts/rebuild_module.py +91 -0
  218. skills/kb-sync/scripts/scan_repos.py +687 -0
  219. skills/kb-sync/steps/step-01-detect.md +72 -0
  220. skills/kb-sync/steps/step-02-update.md +71 -0
  221. skills/kb-sync/steps/step-03-verify.md +47 -0
  222. skills/kb-sync/steps/step-04-finalize.md +52 -0
  223. source_kb-0.2.2.dist-info/METADATA +194 -0
  224. source_kb-0.2.2.dist-info/RECORD +228 -0
  225. source_kb-0.2.2.dist-info/WHEEL +5 -0
  226. source_kb-0.2.2.dist-info/entry_points.txt +3 -0
  227. source_kb-0.2.2.dist-info/licenses/LICENSE +21 -0
  228. source_kb-0.2.2.dist-info/top_level.txt +6 -0
core/prompt/content.py ADDED
@@ -0,0 +1,320 @@
1
+ """Prompt content helpers — shared utilities for variable computation.
2
+
3
+ Computes template variables: high_methods, generated_docs, sibling_modules,
4
+ prior_docs_context, shard_context, and source content reading.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import logging
11
+ import re
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
def compute_high_methods(module_dir: Path) -> str:
    """Extract high-complexity methods from skeleton summary for prompt injection.

    Args:
        module_dir: Module directory handed to ``resolve_skeleton_summary`` to
            locate the skeleton summary JSON file.

    Returns:
        A newline-separated bullet list (at most 30 entries) of the form
        ``- <ClassName>.<method> (<n> lines)``, or a parenthesised placeholder
        when the summary is missing, unreadable, malformed, or lists no
        high-complexity methods.
    """
    from core.paths import resolve_skeleton_summary

    summary_file = resolve_skeleton_summary(module_dir)
    if summary_file is None:
        return "(Skeleton unavailable, identify complex methods from source code)"
    try:
        entries = json.loads(summary_file.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError also covers JSONDecodeError and UnicodeDecodeError.
        return "(Failed to read skeleton)"
    if not isinstance(entries, list):
        # The loop below expects a list of per-file dicts; anything else is
        # treated like an unreadable summary instead of crashing.
        return "(Failed to read skeleton)"

    high_methods: list[str] = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        file_path = entry.get("file", "")
        classname = Path(file_path).stem if file_path else "Unknown"

        # Preferred source: the explicit "high_complexity_methods" list.
        # `or []` tolerates an explicit JSON null.
        flagged = entry.get("high_complexity_methods") or []
        if flagged:
            for m in flagged:
                if isinstance(m, dict):
                    name = m.get("name", "")
                    line_count = m.get("line_count", "?")
                else:
                    name, line_count = str(m), "?"
                high_methods.append(f"- {classname}.{name} ({line_count} lines)")
            continue

        # Fallback: scan the full method list for complexity == "high".
        for m in entry.get("methods") or []:
            if isinstance(m, dict) and m.get("complexity") == "high":
                high_methods.append(
                    f"- {classname}.{m.get('name', '')} ({m.get('line_count', '?')} lines)"
                )

    if not high_methods:
        return "(No high-complexity methods)"
    return "\n".join(high_methods[:30])
44
+
45
+
46
def scan_generated_docs(module_dir: Path) -> str:
    """Return a Markdown bullet list linking every ``*.md`` file in *module_dir*.

    Files whose name starts with ``.`` are ignored and the remaining names are
    sorted. A placeholder string is returned when the directory does not
    exist or holds no matching file.
    """
    placeholder = "(No generated documents yet)"
    if not module_dir.is_dir():
        return placeholder

    names = [
        candidate.name
        for candidate in module_dir.glob("*.md")
        if not candidate.name.startswith(".")
    ]
    if not names:
        return placeholder

    names.sort()
    bullets = [f"- [{name}](./{name})" for name in names]
    return "\n".join(bullets)
54
+
55
+
56
def compute_sibling_modules(config: dict[str, Any], kb_name: str, current_module: str) -> str:
    """Describe the other modules of knowledge base *kb_name* as a Markdown list.

    Reads ``config["knowledge_bases"][kb_name]["source"]``. For a
    ``multi-repo`` structure the ``repos`` entries are listed (labelled with
    their ``description``, falling back to ``type``); for ``monorepo`` the
    ``modules`` entries are listed (labelled with ``type``). Entries without a
    name, and the entry named *current_module*, are left out.

    Returns:
        A headed bullet list, or ``"(No sibling modules)"`` when nothing
        qualifies.
    """
    source = config.get("knowledge_bases", {}).get(kb_name, {}).get("source", {})
    layout = source.get("structure")

    bullets: list[str] = []
    if layout == "multi-repo":
        for repo in source.get("repos", []):
            repo_name = repo.get("name", "")
            if not repo_name or repo_name == current_module:
                continue
            label = repo.get("description", repo.get("type", ""))
            bullets.append(f"- **{repo_name}** ({label})")
    elif layout == "monorepo":
        for module in source.get("modules", []):
            module_name = module.get("name", "")
            if not module_name or module_name == current_module:
                continue
            bullets.append(f"- **{module_name}** ({module.get('type', '')})")

    if bullets:
        return "### Sibling modules in the same knowledge base\n\n" + "\n".join(bullets)
    return "(No sibling modules)"
74
+
75
+
76
def build_prior_docs_context(
    module_dir: Path, current_doc_type: str, max_chars: int = 2000,
    preset: dict | None = None,
) -> str:
    """Build context from prior-batch documents for cross-doc-type reference.

    The doc types to summarise come from ``_get_dependencies`` (the preset's
    ``depends_on`` list). Without a preset, or without dependencies, the
    result is an empty string.

    Args:
        module_dir: Directory containing the already-generated documents.
        current_doc_type: Doc type whose dependencies should be summarised.
        max_chars: Character budget for the summaries. When left at the
            default (2000) and a preset is given, the preset's
            ``limits.prior_docs_max_chars`` takes precedence if present.
        preset: Optional preset configuration.

    Returns:
        A headed Markdown block with one ``### <filename>`` summary per
        dependency document found, or ``""`` when nothing is available.
    """
    # Use configured limit if available (only when the caller kept the default).
    if preset and max_chars == 2000:
        limits = preset.get("limits") or {}  # tolerate an explicit null
        max_chars = limits.get("prior_docs_max_chars", max_chars)

    dep_docs = _get_dependencies(current_doc_type, preset)
    if not dep_docs:
        return ""

    summaries: list[str] = []
    remaining = max_chars
    for dep_type in dep_docs:
        # Forward the preset so a preset-configured filename is honoured;
        # otherwise the "<doc_type>.md" default would be used unconditionally.
        filename = _doc_type_to_filename(dep_type, preset)
        doc_path = module_dir / filename
        if not doc_path.exists():
            continue
        summary = _extract_doc_summary(doc_path)
        if not summary:
            continue
        section = f"### {filename}\n{summary}"
        if len(section) > remaining:
            # Budget exhausted: stop here, keeping earlier dependencies.
            break
        summaries.append(section)
        remaining -= len(section)

    if not summaries:
        return ""
    return "## Prior document summaries (available for reference)\n\n" + "\n\n".join(summaries)
112
+
113
+
114
def _get_dependencies(doc_type: str, preset: dict | None) -> list[str]:
    """Get dependency doc types from preset config.

    Args:
        doc_type: Doc type key to look up.
        preset: Preset configuration, or ``None``.

    Returns:
        A copy of the doc type's ``depends_on`` list, or ``[]`` when there is
        no preset, no configuration for the doc type, or no dependencies.
    """
    if preset:
        from core.preset import get_doc_type_config

        # _doc_type_to_filename() treats this lookup as possibly falsy, so
        # guard the same way here instead of calling .get() on None.
        cfg = get_doc_type_config(preset, doc_type) or {}
        deps = cfg.get("depends_on") or []
        if isinstance(deps, str):
            # A scalar would otherwise be iterated character by character.
            deps = [deps]
        if deps:
            return list(deps)
    logger.debug("no depends_on for doc_type=%s, skipping prior docs", doc_type)
    return []
124
+
125
+
126
def read_source_content(
    module_dir: Path, doc_type: str, source_cache: Path, max_bytes: int = 300_000,
) -> str:
    """Render the source files named in the doc type's file list as Markdown.

    The file list (located via ``resolve_file_list``) is read one path per
    line; blank lines and lines starting with ``#`` are skipped. Paths are
    ordered by the skeleton priority map: more high-complexity methods first,
    then more total lines, then by path string. Each readable file becomes a
    ``### <name>`` heading followed by a fenced code block.

    A single file is cut at line boundaries once it exceeds ``max_bytes // 6``
    UTF-8 bytes. Output stops, with an omission note, at the first block that
    would push the running total above ``max_bytes``.

    Returns:
        The joined blocks, or ``""`` when the file list is missing or empty.
    """
    from core.paths import resolve_file_list

    listing = resolve_file_list(module_dir, doc_type)
    if listing is None or not listing.exists():
        return ""

    file_paths: list[str] = []
    for raw_line in listing.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if entry and not entry.startswith("#"):
            file_paths.append(entry)
    if not file_paths:
        return ""

    priorities = _build_priority_map(module_dir)
    cache_prefix = str(source_cache.resolve())

    def _rank(path_str: str):
        # Entries are normally module-relative; older lists may carry absolute
        # paths under the source cache, so strip that prefix before lookup.
        lookup = path_str
        if path_str.startswith(cache_prefix):
            lookup = path_str[len(cache_prefix):].lstrip("/\\")
        stats = priorities.get(lookup, {})
        return (-stats.get("high_methods", 0), -stats.get("total_lines", 0), path_str)

    ordered = sorted(file_paths, key=_rank)

    per_file_cap = max_bytes // 6
    blocks: list[str] = []
    used = 0
    emitted = 0

    for path_str in ordered:
        candidate = Path(path_str)
        location = candidate if candidate.is_absolute() else source_cache / path_str
        if not location.exists():
            continue
        try:
            text = location.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue

        if len(text.encode("utf-8")) > per_file_cap:
            # Keep whole lines until the per-file byte cap would be crossed.
            all_lines = text.splitlines(keepends=True)
            kept: list[str] = []
            running = 0
            for ln in all_lines:
                running += len(ln.encode("utf-8"))
                if running > per_file_cap:
                    break
                kept.append(ln)
            text = "".join(kept)
            text += f"\n// ... [truncated, showing {len(kept)}/{len(all_lines)} lines]\n"

        fence_lang = candidate.suffix.lstrip(".") or "text"
        rendered = f"### {candidate.name}\n```{fence_lang}\n{text}\n```\n"
        size = len(rendered.encode("utf-8"))
        if used + size > max_bytes:
            blocks.append(f"\n[truncated — {len(ordered) - emitted} files omitted]\n")
            break
        blocks.append(rendered)
        used += size
        emitted += 1

    return "\n".join(blocks)
194
+
195
+
196
def _build_priority_map(module_dir: Path) -> dict[str, dict]:
    """Build file priority map from skeleton: {relative_path: {high_methods, total_lines}}.

    Entries are taken from the skeleton summary when it yields a non-empty
    list; otherwise from the resolved skeleton, which may be a directory of
    ``*.json`` files or a single JSON file. Unreadable or malformed inputs are
    skipped, so the result may be empty.

    Args:
        module_dir: Module directory passed to the ``core.paths`` resolvers.

    Returns:
        Mapping of each entry's ``file`` value to
        ``{"high_methods": int, "total_lines": ...}``.
    """
    from core.paths import resolve_skeleton, resolve_skeleton_summary

    entries: list[dict] = []
    summary_file = resolve_skeleton_summary(module_dir)
    if summary_file is not None:
        try:
            data = json.loads(summary_file.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            data = None
        # Same list check the skeleton fallbacks apply: a non-list payload
        # would break both `.extend` and the per-entry loop below.
        if isinstance(data, list):
            entries = data

    if not entries:
        resolved = resolve_skeleton(module_dir)
        if resolved is not None:
            if resolved.is_dir():
                # Sorted so that, if a file path appears in several JSON files,
                # the entry that wins no longer depends on directory order.
                for f in sorted(resolved.glob("*.json")):
                    try:
                        data = json.loads(f.read_text(encoding="utf-8"))
                    except (OSError, ValueError):
                        continue
                    if isinstance(data, list):
                        entries.extend(data)
            else:
                try:
                    data = json.loads(resolved.read_text(encoding="utf-8"))
                except (OSError, ValueError):
                    data = None
                entries = data if isinstance(data, list) else []

    priority: dict[str, dict] = {}
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        fpath = entry.get("file", "")
        if not fpath:
            continue
        high_count = len(entry.get("high_complexity_methods") or [])
        if not high_count:
            high_count = sum(
                1 for m in entry.get("methods") or []
                if isinstance(m, dict) and m.get("complexity") == "high"
            )
        priority[fpath] = {
            "high_methods": high_count,
            # `or 0` keeps the value negatable by the caller's sort key even
            # when the skeleton stores an explicit null.
            "total_lines": entry.get("line_count") or 0,
        }
    return priority
234
+
235
+
236
+ def build_shard_context(
237
+ module_dir: Path, doc_type: str, current_shard: str,
238
+ completed_shards: list[str], max_chars: int = 1500,
239
+ preset: dict | None = None,
240
+ ) -> str:
241
+ """Build context from completed shards of the same doc type.
242
+
243
+ Note: max_chars default (1500) can be overridden via preset limits.shard_context_max_chars
244
+ """
245
+ if not completed_shards:
246
+ return ""
247
+ # Use configured limit if available
248
+ if preset and max_chars == 1500:
249
+ limits = preset.get("limits", {})
250
+ max_chars = limits.get("shard_context_max_chars", max_chars)
251
+ doc_basename = _doc_type_to_filename(doc_type).replace(".md", "")
252
+ parts: list[str] = []
253
+ for shard_name in completed_shards:
254
+ if shard_name == current_shard:
255
+ continue
256
+ shard_path = module_dir / f"{doc_basename}-{shard_name}.md"
257
+ if not shard_path.exists():
258
+ continue
259
+ summary = _extract_shard_summary(shard_path)
260
+ if summary:
261
+ parts.append(f"### {shard_name}\n{summary}")
262
+ if not parts:
263
+ return ""
264
+ text = "## Content already covered by prior shards (avoid repetition)\n\n" + "\n\n".join(parts)
265
+ if len(text) > max_chars:
266
+ text = text[:max_chars - 20] + "\n\n[truncated]"
267
+ return text
268
+
269
+
270
+ # ---------------------------------------------------------------------------
271
+ # Internal helpers
272
+ # ---------------------------------------------------------------------------
273
+
274
# Module-level doc_type -> filename mapping. It is empty and is not read by any
# function in the visible part of this module; presumably a placeholder for
# filename overrides -- TODO confirm whether other modules populate or use it.
_DOC_TYPE_FILENAMES: dict[str, str] = {}
275
+
276
+
277
+ def _doc_type_to_filename(doc_type: str, preset: dict | None = None) -> str:
278
+ """Convert doc_type key to filename."""
279
+ if preset:
280
+ try:
281
+ from core.preset import get_doc_type_config
282
+ dt_config = get_doc_type_config(preset, doc_type)
283
+ if dt_config and "filename" in dt_config:
284
+ return dt_config["filename"]
285
+ except (ImportError, Exception):
286
+ pass
287
+ return f"{doc_type}.md"
288
+
289
+
290
+ def _extract_doc_summary(path: Path, max_lines: int = 15) -> str:
291
+ """Extract title + heading list from a document."""
292
+ try:
293
+ content = path.read_text(encoding="utf-8")
294
+ except (OSError, UnicodeDecodeError):
295
+ return ""
296
+ lines = content.splitlines()
297
+ title = ""
298
+ for line in lines[:5]:
299
+ if line.startswith("# ") and not line.startswith("## "):
300
+ title = line.strip()
301
+ break
302
+ headings = [l.strip().lstrip("#").strip() for l in lines if l.strip().startswith("## ")]
303
+ parts = []
304
+ if title:
305
+ parts.append(title)
306
+ if headings:
307
+ parts.append("Sections: " + " | ".join(headings[:10]))
308
+ return "\n".join(parts)
309
+
310
+
311
+ def _extract_shard_summary(path: Path) -> str:
312
+ """Extract heading list from a shard document."""
313
+ try:
314
+ content = path.read_text(encoding="utf-8")
315
+ except (OSError, UnicodeDecodeError):
316
+ return ""
317
+ headings = [l.strip() for l in content.splitlines() if l.strip().startswith("## ")]
318
+ if headings:
319
+ return "Sections: " + " | ".join(h.lstrip("#").strip() for h in headings[:8])
320
+ return ""
core/prompt/context_manager.py ADDED
@@ -0,0 +1,164 @@
1
+ """Prompt context size management.
2
+
3
+ Ensures prompts don't exceed model context limits by applying
4
+ progressive truncation strategies.
5
+
6
+ Usage:
7
+ from core.prompt.context_manager import estimate_tokens, manage_context_size
8
+
9
+ system, user = manage_context_size(system, user, max_tokens=128000)
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import logging
15
+ import re
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
def estimate_tokens(text: str) -> int:
    """Approximate the token count of *text* without a tokenizer.

    Characters in the range U+4E00..U+9FFF are weighted at 1.5 tokens each
    (rounded down over the whole count); every other character counts as a
    quarter token (integer division by 4). Empty input yields 0.
    """
    if not text:
        return 0

    # Count the non-CJK characters by deleting the CJK ones, then derive the
    # CJK count from the length difference.
    non_cjk = len(re.sub(r'[\u4e00-\u9fff]', '', text))
    cjk = len(text) - non_cjk

    return int(cjk * 1.5) + non_cjk // 4
38
+
39
+
40
def manage_context_size(
    system: str,
    user: str,
    *,
    max_tokens: int = 128_000,
    threshold_ratio: float = 0.8,
) -> tuple[str, str]:
    """Apply progressive truncation if prompt exceeds threshold.

    Truncation priority (applied in order until under limit):
    1. Truncate the first user-prompt section whose heading mentions
       "skeleton" (this also matches e.g. "skeleton_delta") to 100 lines
    2. Truncate the first section whose heading mentions "source" to 80 lines
    3. Hard-truncate the user prompt to the remaining token budget

    The system prompt is never modified.

    Args:
        system: System prompt
        user: User prompt
        max_tokens: Maximum context tokens
        threshold_ratio: Trigger truncation at this ratio of max

    Returns:
        (system, user) — user possibly truncated
    """
    threshold = int(max_tokens * threshold_ratio)
    system_tokens = estimate_tokens(system)
    total = system_tokens + estimate_tokens(user)

    if total <= threshold:
        return system, user

    original_total = total
    truncated_what: list[str] = []

    # Strategies 1 and 2. Each marker is tried once: _truncate_section only
    # ever targets the first matching section, so repeating a marker would hit
    # the same section again and be logged under a misleading label.
    section_passes = (
        ("skeleton", 100, "skeleton"),
        ("source", 80, "source_code"),
    )
    for marker, max_lines, label in section_passes:
        user, reduced = _truncate_section(user, marker, max_lines=max_lines)
        if not reduced:
            continue
        truncated_what.append(label)
        total = system_tokens + estimate_tokens(user)
        if total <= threshold:
            _log_truncation(original_total, total, truncated_what)
            return system, user

    # Strategy 3: Hard truncate user prompt to fit
    max_user_tokens = threshold - system_tokens - 500  # leave margin
    if max_user_tokens > 0:
        shortened = _hard_truncate(user, max_user_tokens)
        if shortened != user:
            user = shortened
            truncated_what.append("hard_truncate")
    else:
        # Nothing can be cut from the user prompt to make the total fit.
        logger.warning(
            "[context_manager] System prompt (~%d tokens) leaves no room under "
            "the %d-token threshold; user prompt not hard-truncated",
            system_tokens, threshold,
        )

    if truncated_what:
        _log_truncation(original_total, system_tokens + estimate_tokens(user), truncated_what)
    return system, user
114
+
115
+
116
+ def _truncate_section(text: str, marker: str, max_lines: int = 100) -> tuple[str, bool]:
117
+ """Find a section containing marker keyword and truncate it."""
118
+ lines = text.split("\n")
119
+ marker_lower = marker.lower()
120
+
121
+ # Find section start (line containing marker)
122
+ start_idx = None
123
+ for i, line in enumerate(lines):
124
+ if marker_lower in line.lower() and (line.startswith("#") or line.endswith(":")):
125
+ start_idx = i
126
+ break
127
+
128
+ if start_idx is None:
129
+ return text, False
130
+
131
+ # Find section end (next heading or end)
132
+ end_idx = len(lines)
133
+ for j in range(start_idx + 1, len(lines)):
134
+ if lines[j].startswith("#") or (lines[j].strip() and lines[j][0].isalpha() and lines[j].endswith(":")):
135
+ end_idx = j
136
+ break
137
+
138
+ section_lines = lines[start_idx:end_idx]
139
+ if len(section_lines) <= max_lines:
140
+ return text, False
141
+
142
+ # Truncate section
143
+ truncated = section_lines[:max_lines]
144
+ truncated.append(f"\n... (truncated, {len(section_lines) - max_lines} lines removed)")
145
+
146
+ new_lines = lines[:start_idx] + truncated + lines[end_idx:]
147
+ return "\n".join(new_lines), True
148
+
149
+
150
+ def _hard_truncate(text: str, max_tokens: int) -> str:
151
+ """Hard truncate text to approximately max_tokens."""
152
+ # Rough: 1 token ≈ 3 chars for mixed content
153
+ max_chars = max_tokens * 3
154
+ if len(text) <= max_chars:
155
+ return text
156
+ return text[:max_chars] + "\n\n... (truncated to fit context window)"
157
+
158
+
159
def _log_truncation(original: int, final: int, what: list[str]) -> None:
    """Emit a warning with the before/after token estimates and the names in *what*."""
    removed = ", ".join(what)
    logger.warning(
        "[context_manager] Truncated prompt: %d → %d tokens (removed: %s)",
        original,
        final,
        removed,
    )