cognis-engine 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. cognis/__init__.py +35 -0
  2. cognis/assets/logo.png +0 -0
  3. cognis/branding.py +29 -0
  4. cognis/capsule/__init__.py +42 -0
  5. cognis/capsule/composer.py +553 -0
  6. cognis/capsule/models.py +261 -0
  7. cognis/capsule/token_estimator.py +136 -0
  8. cognis/cli/__init__.py +1 -0
  9. cognis/cli/main.py +1831 -0
  10. cognis/config.py +533 -0
  11. cognis/db.py +747 -0
  12. cognis/migrations/001_initial.sql +188 -0
  13. cognis/migrations/__init__.py +7 -0
  14. cognis/models.py +188 -0
  15. cognis/planner.py +433 -0
  16. cognis/py.typed +0 -0
  17. cognis/schemas/__init__.py +1 -0
  18. cognis/schemas/capsule.v1.json +383 -0
  19. cognis_adapters/__init__.py +1 -0
  20. cognis_cli/__init__.py +1 -0
  21. cognis_engine-0.3.0.dist-info/METADATA +492 -0
  22. cognis_engine-0.3.0.dist-info/RECORD +72 -0
  23. cognis_engine-0.3.0.dist-info/WHEEL +4 -0
  24. cognis_engine-0.3.0.dist-info/entry_points.txt +4 -0
  25. cognis_engine-0.3.0.dist-info/licenses/LICENSE +201 -0
  26. cognis_eval/__init__.py +57 -0
  27. cognis_eval/models.py +184 -0
  28. cognis_eval/runner.py +404 -0
  29. cognis_eval/strategy.py +87 -0
  30. cognis_indexd/__init__.py +1 -0
  31. cognis_indexd/main.py +691 -0
  32. cognis_indexer/__init__.py +4 -0
  33. cognis_indexer/embedder.py +508 -0
  34. cognis_indexer/enricher/__init__.py +27 -0
  35. cognis_indexer/enricher/attributes.py +316 -0
  36. cognis_indexer/enricher/enricher.py +199 -0
  37. cognis_indexer/enricher/secrets.py +231 -0
  38. cognis_indexer/parsers/__init__.py +25 -0
  39. cognis_indexer/parsers/_normalize.py +102 -0
  40. cognis_indexer/parsers/base.py +94 -0
  41. cognis_indexer/parsers/go.py +342 -0
  42. cognis_indexer/parsers/python.py +412 -0
  43. cognis_indexer/parsers/typescript.py +469 -0
  44. cognis_indexer/pipeline.py +1028 -0
  45. cognis_indexer/resolver/__init__.py +24 -0
  46. cognis_indexer/resolver/base.py +77 -0
  47. cognis_indexer/resolver/heuristic.py +173 -0
  48. cognis_indexer/resolver/lsp.py +109 -0
  49. cognis_indexer/resolver/pipeline.py +112 -0
  50. cognis_indexer/watcher/__init__.py +29 -0
  51. cognis_indexer/watcher/debounce.py +131 -0
  52. cognis_indexer/watcher/events.py +72 -0
  53. cognis_indexer/watcher/gitignore.py +178 -0
  54. cognis_indexer/watcher/watcher.py +326 -0
  55. cognis_indexer/writer.py +477 -0
  56. cognis_mcpd/__init__.py +16 -0
  57. cognis_mcpd/app/__init__.py +1 -0
  58. cognis_mcpd/audit.py +78 -0
  59. cognis_mcpd/embedder_pool.py +94 -0
  60. cognis_mcpd/errors.py +93 -0
  61. cognis_mcpd/main.py +128 -0
  62. cognis_mcpd/metrics.py +275 -0
  63. cognis_mcpd/result_cache.py +81 -0
  64. cognis_mcpd/server.py +272 -0
  65. cognis_mcpd/tools.py +1837 -0
  66. cognis_retrieval/__init__.py +66 -0
  67. cognis_retrieval/base.py +64 -0
  68. cognis_retrieval/csar.py +551 -0
  69. cognis_retrieval/lexical.py +179 -0
  70. cognis_retrieval/query_rewriter.py +191 -0
  71. cognis_retrieval/semantic.py +232 -0
  72. cognis_retrieval/structural.py +220 -0
cognis/__init__.py ADDED
@@ -0,0 +1,35 @@
1
+ """cognis core namespace.
2
+
3
+ Subpackages land in later tasks:
4
+
5
+ - ``cognis.config`` (task 2.1) — Pydantic config loader.
6
+ - ``cognis.cli`` (task 2.2) — Click-based ``cognis-cli`` entry points.
7
+ - ``cognis.db`` (task 3) — SQLite connection factory + migrations + UCKG CRUD.
8
+ - ``cognis.models`` (task 3) — Pydantic data models for the UCKG schema.
9
+ - ``cognis.planner`` (task 13) — Cognitive Context Planner.
10
+ - ``cognis.capsule`` (task 14) — Capsule composer + JSON Schema.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import os
16
+
17
if os.name == "nt":
    # On Windows a single process tree often mixes pytest workers, MCP
    # threads, numpy and embedding/model libraries.  Pin the BLAS-family
    # thread pools to one thread each, but never override a value the
    # operator has already exported.
    for _var in (
        "OPENBLAS_NUM_THREADS",
        "OMP_NUM_THREADS",
        "MKL_NUM_THREADS",
        "NUMEXPR_NUM_THREADS",
    ):
        if _var not in os.environ:
            os.environ[_var] = "1"
28
+
29
+ from cognis.config import Config
30
+
31
+ __all__ = ["Config", "__version__"]
32
+
33
+ # Single source of truth for runtime version. PEP 621 metadata in pyproject is
34
+ # the canonical tag; this constant tracks it for display in ``cognis-cli health``.
35
+ __version__: str = "0.3.0"
cognis/assets/logo.png ADDED
Binary file
cognis/branding.py ADDED
@@ -0,0 +1,29 @@
1
+ """Shared cognis branding helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from cognis import __version__
9
+
10
+ TAGLINE = "Software Cognition Engine"
11
+
12
+
13
def logo_path() -> Path | None:
    """Locate the logo bundled next to this module.

    Returns:
        The resolved path to ``assets/logo.png`` inside this module's
        directory, or ``None`` when that file is not present.
    """
    package_dir = Path(__file__).resolve().parent
    bundled = package_dir / "assets" / "logo.png"
    if not bundled.is_file():
        return None
    return bundled
17
+
18
+
19
def format_banner(*, prog: str = "cognis") -> str:
    """Build the single-line startup banner used by CLI and daemon entry points.

    Args:
        prog: Program name shown at the start of the banner.

    Returns:
        ``"<prog> v<version> — <tagline>"`` using the package version and the
        module-level ``TAGLINE``.
    """
    return "{} v{} — {}".format(prog, __version__, TAGLINE)
22
+
23
+
24
def echo_banner(*, prog: str = "cognis", file: object | None = None) -> None:
    """Write the startup banner, but only to an interactive terminal.

    Args:
        prog: Program name forwarded to :func:`format_banner`.
        file: Destination stream; ``sys.stderr`` is used when omitted.

    Streams without an ``isatty`` method are treated as non-interactive, so
    nothing is written to them.
    """
    stream = sys.stderr if file is None else file
    is_interactive = getattr(stream, "isatty", lambda: False)
    if not is_interactive():
        return
    print(format_banner(prog=prog), file=stream)  # type: ignore[arg-type]
cognis/capsule/__init__.py ADDED
@@ -0,0 +1,42 @@
1
+ """Capsule composer package — Task 14 of ``.kiro/specs/cognis/tasks.md``.
2
+
3
+ Submodules:
4
+
5
+ - :mod:`cognis.capsule.models` — Pydantic v2 models for :class:`ContextCapsule` v1.
6
+ - :mod:`cognis.capsule.token_estimator` — tiktoken-based token estimation.
7
+ - :mod:`cognis.capsule.composer` — :class:`CapsuleComposer` pipeline.
8
+
9
+ Public re-exports:
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from cognis.capsule.composer import CapsuleComposer, ComposeError
15
+ from cognis.capsule.models import (
16
+ CallChainEdge,
17
+ CapsuleSource,
18
+ CompressedContext,
19
+ ContextCapsule,
20
+ NeighborPattern,
21
+ RelevantSymbol,
22
+ RiskArea,
23
+ RootCauseCandidate,
24
+ RuntimeEvidence,
25
+ )
26
+ from cognis.capsule.token_estimator import estimate_capsule_tokens, estimate_tokens
27
+
28
+ __all__ = [
29
+ "CallChainEdge",
30
+ "CapsuleComposer",
31
+ "CapsuleSource",
32
+ "ComposeError",
33
+ "CompressedContext",
34
+ "ContextCapsule",
35
+ "NeighborPattern",
36
+ "RelevantSymbol",
37
+ "RiskArea",
38
+ "RootCauseCandidate",
39
+ "RuntimeEvidence",
40
+ "estimate_capsule_tokens",
41
+ "estimate_tokens",
42
+ ]
cognis/capsule/composer.py ADDED
@@ -0,0 +1,553 @@
1
+ """Capsule Composer — Task 14.2, 14.4 of ``.kiro/specs/cognis/tasks.md``.
2
+
3
+ Implements the composition pipeline that takes retrieval hits from the three
4
+ MVP layers (lexical, semantic, structural), merges them, hydrates symbol rows
5
+ from the DB, fills per-mode sections, attaches source entries, wraps untrusted
6
+ content, and enforces the token budget.
7
+
8
+ Design reference
9
+ ----------------
10
+ - Context Capsule schema (v1, MVP) and Composition rules — design.md §Data Models.
11
+ - Cognitive Context Planner pipeline — design.md §Components and Interfaces.
12
+ - Error Handling → Untrusted content handling — design.md.
13
+
14
+ Correctness properties
15
+ -----------------------
16
+ CP-8: ``token_estimate ≤ max_tokens`` for any ``max_tokens ∈ [500, 32000]``.
17
+ CP-9: ``sources[]`` non-empty for every populated section.
18
+ CP-11: Same query + same DB state → same capsule (modulo wall-clock fields).
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import json
24
+ import logging
25
+ from typing import TYPE_CHECKING, Any
26
+
27
+ from cognis.capsule.models import (
28
+ CapsuleSource,
29
+ CompressedContext,
30
+ ContextCapsule,
31
+ RelevantSymbol,
32
+ RiskArea,
33
+ RootCauseCandidate,
34
+ )
35
+ from cognis.capsule.token_estimator import estimate_capsule_tokens
36
+ from cognis.db import Database, get_symbol
37
+ from cognis.planner import TaskMode
38
+
39
+ if TYPE_CHECKING:
40
+ from cognis_retrieval.base import Hit
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+ # ---------------------------------------------------------------------------
45
+ # Untrusted content markers (design §Error Handling → Untrusted content)
46
+ # ---------------------------------------------------------------------------
47
+
48
_UNTRUSTED_OPEN = '<<<UNTRUSTED type="{kind}" symbol="{symbol}">>>'
_UNTRUSTED_CLOSE = "<<<END UNTRUSTED>>>"


def _neutralise_markers(text: str) -> str:
    """Defuse marker look-alikes inside untrusted *text*.

    Only the leading ``<<<`` of a marker-like sequence is rewritten (to
    ``&lt;&lt;&lt;``), so the payload stays readable while no longer matching
    the real open/close markers.
    """
    return text.replace("<<<END UNTRUSTED", "&lt;&lt;&lt;END UNTRUSTED").replace(
        "<<<UNTRUSTED", "&lt;&lt;&lt;UNTRUSTED"
    )


def _sanitise_marker_attr(value: str) -> str:
    """Make *value* safe to embed in a double-quoted marker attribute."""
    return (
        value.replace('"', "'")
        .replace("\r", " ")
        .replace("\n", " ")
        .replace(">>>", "> > >")
    )


def _wrap_untrusted(text: str, kind: str, symbol: str) -> str:
    """Wrap *text* with the ``<<<UNTRUSTED ...>>>`` marker pair.

    The payload is attacker-controllable (docstrings, comments, ...), so any
    embedded copy of the open or close marker is neutralised first; otherwise
    the content could terminate the wrapper early and present the remainder
    as trusted.  The ``kind`` and ``symbol`` attribute values are sanitised
    for the same reason.  Input that contains none of these sequences is
    wrapped exactly as before.

    Args:
        text: The raw content to wrap.
        kind: Content kind (e.g. ``"docstring"``, ``"comment"``).
        symbol: Qualified name or symbol_id of the originating symbol.

    Returns:
        The wrapped string ready for inclusion in the capsule.
    """
    open_tag = _UNTRUSTED_OPEN.format(
        kind=_sanitise_marker_attr(kind),
        symbol=_sanitise_marker_attr(symbol),
    )
    return f"{open_tag}\n{_neutralise_markers(text)}\n{_UNTRUSTED_CLOSE}"
65
+
66
+
67
+ # ---------------------------------------------------------------------------
68
+ # ComposeError
69
+ # ---------------------------------------------------------------------------
70
+
71
+
72
class ComposeError(Exception):
    """Fatal violation detected while composing a capsule.

    The single fatal condition today is the "Sources mandatory" composition
    rule: a populated ``root_cause_candidates`` or ``relevant_symbols``
    section with no backing source entry.
    """
79
+
80
+
81
+ # ---------------------------------------------------------------------------
82
+ # Internal helpers
83
+ # ---------------------------------------------------------------------------
84
+
85
+
86
def _dedupe_hits(hits: list[Hit]) -> list[Hit]:
    """Collapse *hits* to one entry per ``symbol_id`` and order them by score.

    For each symbol the highest-scoring hit wins; on an exact score tie the
    hit seen first is kept.  The result is ordered by descending score, with
    ascending ``symbol_id`` as the tie-break so the ordering is deterministic
    (CP-11).

    Args:
        hits: Raw hits from one or more retrieval layers.

    Returns:
        Deduplicated, score-sorted list of hits.
    """
    winners: dict[str, Hit] = {}
    for candidate in hits:
        incumbent = winners.get(candidate.symbol_id)
        if incumbent is not None and not candidate.score > incumbent.score:
            # An equal or lower score never displaces the current winner.
            continue
        winners[candidate.symbol_id] = candidate

    ordered = list(winners.values())
    ordered.sort(key=lambda h: (-h.score, h.symbol_id))
    return ordered
105
+
106
+
107
def _make_source(symbol_id: str) -> CapsuleSource:
    """Build the ``CapsuleSource`` entry (type ``"symbol"``, no URI) for *symbol_id*."""
    source_fields = {"type": "symbol", "id": symbol_id, "uri": None}
    return CapsuleSource(**source_fields)
110
+
111
+
112
def _is_untrusted(untrusted_flags: list[str]) -> bool:
    """Tell whether *untrusted_flags* contains the ``"untrusted_doc"`` flag."""
    return any("untrusted_doc" == flag for flag in untrusted_flags)
115
+
116
+
117
+ # ---------------------------------------------------------------------------
118
+ # CapsuleComposer
119
+ # ---------------------------------------------------------------------------
120
+
121
+
122
class CapsuleComposer:
    """Compose a :class:`~cognis.capsule.models.ContextCapsule` from retrieval hits.

    The composer is **stateless** — all state is passed in via :meth:`compose`.
    This makes it straightforward to test and keeps CP-11 (determinism) easy
    to satisfy: given the same inputs, the pipeline always produces the same
    output.

    Usage
    -----
    .. code-block:: python

        composer = CapsuleComposer()
        capsule = composer.compose(
            task="Why is /login timing out?",
            mode="bugfix",
            confidence=0.85,
            hits=hits,
            max_tokens=8000,
            db=db,
        )
    """

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    def compose(
        self,
        task: str,
        mode: TaskMode,
        confidence: float,
        hits: list[Hit],
        max_tokens: int,
        db: Database,
        include_runtime: bool = False,
    ) -> ContextCapsule:
        """Run the full composition pipeline.

        Pipeline steps (design §Cognitive Context Planner):

        1. Score-merge hits: deduplicate by ``symbol_id``, keep highest score.
        2. Sort by score descending (tie-break by symbol_id for determinism).
        3. Hydrate symbol rows from the DB for the top 100 deduplicated hits.
        4. Fill sections based on *mode*:
           - ``bugfix`` → ``root_cause_candidates`` from structural hits;
             ``relevant_symbols`` from the remaining hits.
           - all other modes → ``relevant_symbols`` only.
        5. Attach sources: every entry in ``root_cause_candidates`` and
           ``relevant_symbols`` gets a backing ``CapsuleSource``.
        6. Untrusted wrapping: symbols with ``"untrusted_doc"`` in
           ``untrusted_flags`` get their snippet/rationale wrapped in
           ``<<<UNTRUSTED ...>>>`` markers; the section id is added to
           ``untrusted_sections``.
        7. Estimate tokens.
        8. Drop sections (not truncate) to fit within *max_tokens* budget.
        9. Final validation: every claim must have a source;
           raise :class:`ComposeError` on violation.

        Args:
            task: The original user task string (stored as ``goal``).
            mode: Task mode from the planner classifier.
            confidence: Planner classifier confidence (0.0-1.0).
            hits: Raw retrieval hits from all layers (may be duplicated
                across layers; composer deduplicates).
            max_tokens: Upper bound targeted for ``token_estimate``.
            db: Database handle for hydrating symbol rows.
            include_runtime: Accepted for interface stability; this method
                does not read it and always emits an empty
                ``runtime_evidence`` section.

        Returns:
            A validated :class:`~cognis.capsule.models.ContextCapsule`.

        Raises:
            ComposeError: If any populated section lacks a source entry.
        """
        # Step 1+2: deduplicate and sort hits.
        deduped = _dedupe_hits(hits)

        # Step 3: hydrate symbol rows.  We limit to the top-100 hits to avoid
        # runaway DB queries; the token budget will further trim sections later.
        top_hits = deduped[:100]
        symbol_rows = self._hydrate_symbols(top_hits, db)

        # Step 4+5+6: fill sections, attach sources, wrap untrusted content.
        # ``sources`` and ``untrusted_sections`` are filled in place by the
        # section helpers below.
        sources: list[CapsuleSource] = []
        untrusted_sections: list[str] = []

        root_cause_candidates: list[RootCauseCandidate] = []
        relevant_symbols: list[RelevantSymbol] = []
        risk_areas: list[RiskArea] = []
        compressed_context: list[CompressedContext] = []

        if mode == "bugfix":
            root_cause_candidates, relevant_symbols = self._fill_bugfix_sections(
                top_hits, symbol_rows, sources, untrusted_sections
            )
        else:
            relevant_symbols = self._fill_generic_sections(
                top_hits, symbol_rows, sources, untrusted_sections
            )

        # Risk areas: symbols with risk_score > 0 (from hydrated rows).
        for hit in top_hits:
            sym = symbol_rows.get(hit.symbol_id)
            if sym is not None and sym.risk_score > 0.0:
                risk_areas.append(
                    RiskArea(
                        symbol_id=hit.symbol_id,
                        reason=f"risk_score={sym.risk_score:.2f}",
                    )
                )

        # Step 7: assemble a draft capsule; the estimate is filled in by
        # ``_enforce_budget``.
        draft = ContextCapsule(
            version="1",
            goal=task,
            task_mode=mode,
            confidence=confidence,
            root_cause_candidates=root_cause_candidates,
            relevant_symbols=relevant_symbols,
            call_chain=[],
            runtime_evidence=[],
            neighbor_patterns=[],
            risk_areas=risk_areas,
            compressed_context=compressed_context,
            token_estimate=0,  # placeholder; computed below
            sources=sources,
            untrusted_sections=sorted(set(untrusted_sections)),
        )

        # Step 8: drop sections to fit within max_tokens budget.
        draft = self._enforce_budget(draft, max_tokens)

        # Step 9: validate sources completeness.
        self._validate_sources(draft)

        return draft

    # ------------------------------------------------------------------
    # Section filling helpers
    # ------------------------------------------------------------------

    def _hydrate_symbols(
        self,
        hits: list[Hit],
        db: Database,
    ) -> dict[str, Any]:
        """Fetch symbol rows from the DB for every hit.

        Hits for which ``get_symbol`` returns ``None`` are skipped — the hit
        will still appear in the capsule but with minimal metadata.

        Returns:
            ``{symbol_id: row}`` mapping (may cover only a subset of *hits*).
        """
        result: dict[str, Any] = {}
        for hit in hits:
            sym = get_symbol(db, hit.symbol_id)
            if sym is not None:
                result[hit.symbol_id] = sym
        return result

    def _build_relevant_symbol(
        self,
        hit: Hit,
        symbol_rows: dict[str, Any],
        untrusted_sections: list[str],
        section_id: str,
    ) -> RelevantSymbol:
        """Build a :class:`RelevantSymbol` for a single hit.

        Applies the untrusted-content wrapping rule: if the symbol has
        ``"untrusted_doc"`` in its ``untrusted_flags``, the snippet is wrapped
        with ``<<<UNTRUSTED>>>`` markers and *section_id* is appended to
        *untrusted_sections*.

        Args:
            hit: The retrieval hit.
            symbol_rows: Hydrated symbol rows from the DB.
            untrusted_sections: Mutable list to which the section id is
                appended if the content is untrusted.
            section_id: The capsule section id (e.g. ``"relevant_symbols"``).

        Returns:
            A :class:`RelevantSymbol` instance.
        """
        sym = symbol_rows.get(hit.symbol_id)
        # A hit without a hydrated row still yields an entry, just without
        # kind/snippet/summary metadata.
        kind = sym.kind if sym is not None else "unknown"

        snippet: str | None = None
        if sym is not None:
            # Prefer the body excerpt; fall back to the docstring.
            raw_snippet = sym.body_excerpt or sym.docstring
            if raw_snippet:
                if _is_untrusted(list(sym.untrusted_flags)):
                    snippet = _wrap_untrusted(raw_snippet, "docstring", sym.qualified_name)
                    if section_id not in untrusted_sections:
                        untrusted_sections.append(section_id)
                else:
                    snippet = raw_snippet

        return RelevantSymbol(
            symbol_id=hit.symbol_id,
            kind=kind,
            snippet=snippet,
            summary=sym.semantic_summary if sym is not None else None,
            score=hit.score,
        )

    def _fill_bugfix_sections(
        self,
        hits: list[Hit],
        symbol_rows: dict[str, Any],
        sources: list[CapsuleSource],
        untrusted_sections: list[str],
    ) -> tuple[list[RootCauseCandidate], list[RelevantSymbol]]:
        """Fill ``root_cause_candidates`` and ``relevant_symbols`` for bugfix mode.

        Bugfix strategy (design §Cognitive Context Planner layer plan table):
        - ``root_cause_candidates``: the first 5 hits whose layer is
          ``"structural"``, in the order given by *hits*.
        - ``relevant_symbols``: the first 20 remaining (non-structural) hits.

        A source entry is appended to *sources* for every emitted entry, and
        *untrusted_sections* is extended in place when content is wrapped.
        """
        structural_hits = [h for h in hits if h.layer == "structural"]
        other_hits = [h for h in hits if h.layer != "structural"]

        root_causes: list[RootCauseCandidate] = []
        relevant: list[RelevantSymbol] = []

        # Root cause candidates from structural hits (up to 5).
        for hit in structural_hits[:5]:
            sym = symbol_rows.get(hit.symbol_id)
            evidence: list[str] = []
            if hit.evidence:
                # sort_keys keeps the serialised evidence deterministic.
                evidence.append(json.dumps(hit.evidence, sort_keys=True))
            rationale = hit.reason or f"structural relevance (score={hit.score:.3f})"

            # Untrusted rationale wrapping.
            if sym is not None and _is_untrusted(list(sym.untrusted_flags)):
                rationale = _wrap_untrusted(rationale, "rationale", sym.qualified_name)
                if "root_cause_candidates" not in untrusted_sections:
                    untrusted_sections.append("root_cause_candidates")

            root_causes.append(
                RootCauseCandidate(
                    symbol_id=hit.symbol_id,
                    rationale=rationale,
                    evidence=evidence,
                )
            )
            sources.append(_make_source(hit.symbol_id))

        # Relevant symbols from non-structural hits (up to 20).
        for hit in other_hits[:20]:
            relevant.append(
                self._build_relevant_symbol(
                    hit, symbol_rows, untrusted_sections, "relevant_symbols"
                )
            )
            sources.append(_make_source(hit.symbol_id))

        return root_causes, relevant

    def _fill_generic_sections(
        self,
        hits: list[Hit],
        symbol_rows: dict[str, Any],
        sources: list[CapsuleSource],
        untrusted_sections: list[str],
    ) -> list[RelevantSymbol]:
        """Fill ``relevant_symbols`` for all non-bugfix modes.

        Takes up to 25 hits, builds :class:`RelevantSymbol` entries and
        appends one source per entry to *sources*.
        """
        relevant: list[RelevantSymbol] = []
        for hit in hits[:25]:
            relevant.append(
                self._build_relevant_symbol(
                    hit, symbol_rows, untrusted_sections, "relevant_symbols"
                )
            )
            sources.append(_make_source(hit.symbol_id))
        return relevant

    # ------------------------------------------------------------------
    # Budget enforcement (CP-8)
    # ------------------------------------------------------------------

    def _enforce_budget(self, capsule: ContextCapsule, max_tokens: int) -> ContextCapsule:
        """Drop sections until ``token_estimate ≤ max_tokens`` (CP-8).

        The design mandates "drop sections (not truncate)" so we remove entire
        sections in priority order (least important first) until the estimate
        fits.

        Section drop priority (lowest value = dropped first):

        1. ``neighbor_patterns``
        2. ``compressed_context``
        3. ``risk_areas``
        4. ``runtime_evidence``
        5. ``relevant_symbols`` (trimmed from the end, not entirely dropped)
        6. ``root_cause_candidates`` (trimmed from the end)

        After trimming, ``sources`` is reduced to the (deduplicated) entries
        backing surviving symbols, ``untrusted_sections`` is reduced to
        sections that still have content, and ``token_estimate`` is computed
        for the capsule that is actually returned.  If the estimate still
        exceeds *max_tokens* once everything trimmable is gone, the capsule
        is returned anyway and a warning is logged.

        Args:
            capsule: Draft capsule (``token_estimate`` may be 0 placeholder).
            max_tokens: Upper bound for the estimate.

        Returns:
            A new capsule instance with ``token_estimate`` set and sections
            potentially trimmed to fit.
        """
        # Section drop order: cheapest to lose first.
        drop_order = (
            "neighbor_patterns",
            "compressed_context",
            "risk_areas",
            "runtime_evidence",
        )

        # Build a mutable dict so we can iteratively drop sections.
        fields: dict[str, Any] = capsule.model_dump(by_alias=True)

        def _current_estimate(f: dict[str, Any]) -> int:
            # Estimate with a zeroed ``token_estimate`` so the figure does not
            # depend on its own previous value.
            payload = {k: v for k, v in f.items() if k != "token_estimate"}
            return estimate_capsule_tokens(ContextCapsule(**payload, token_estimate=0))

        estimate = _current_estimate(fields)

        # Phase 1: drop whole sections in priority order.
        for section in drop_order:
            if estimate <= max_tokens:
                break
            if fields.get(section):
                fields[section] = []
                estimate = _current_estimate(fields)

        # Phase 2: trim relevant_symbols from the end (one-at-a-time).
        while estimate > max_tokens and fields.get("relevant_symbols"):
            fields["relevant_symbols"] = fields["relevant_symbols"][:-1]
            estimate = _current_estimate(fields)

        # Phase 3: trim root_cause_candidates from the end.
        while estimate > max_tokens and fields.get("root_cause_candidates"):
            fields["root_cause_candidates"] = fields["root_cause_candidates"][:-1]
            estimate = _current_estimate(fields)

        # Collect the symbol ids that still appear in a claim section.
        remaining_symbol_ids: set[str] = set()
        for section in ("relevant_symbols", "root_cause_candidates"):
            for entry in fields.get(section, []):
                sid: str | None = (
                    entry.get("symbol_id") if isinstance(entry, dict) else entry.symbol_id
                )
                if sid is not None:
                    remaining_symbol_ids.add(sid)

        # Filter sources to only those backing remaining symbols (first
        # occurrence wins so the order stays deterministic).
        filtered_sources = []
        seen_source_ids: set[str] = set()
        for src in fields.get("sources", []):
            src_id: str | None = src.get("id") if isinstance(src, dict) else src.id
            if (
                src_id is not None
                and src_id in remaining_symbol_ids
                and src_id not in seen_source_ids
            ):
                filtered_sources.append(src)
                seen_source_ids.add(src_id)
        fields["sources"] = filtered_sources

        # A section that was trimmed to nothing can no longer hold untrusted
        # content, so stop advertising it.  Sections that are only partially
        # trimmed stay listed (conservative: the flag is per section).
        if "untrusted_sections" in fields:
            fields["untrusted_sections"] = [
                name for name in (fields["untrusted_sections"] or []) if fields.get(name)
            ]

        # Report the estimate for the capsule being returned, i.e. after the
        # sources / untrusted_sections clean-up above.
        estimate = _current_estimate(fields)
        if estimate > max_tokens:
            logger.warning(
                "capsule token estimate %d exceeds budget %d after dropping all "
                "trimmable sections",
                estimate,
                max_tokens,
            )
        fields["token_estimate"] = estimate

        # Reconstruct the capsule with corrected fields.
        return ContextCapsule(**fields)

    # ------------------------------------------------------------------
    # Source validation (CP-9 / "Sources mandatory")
    # ------------------------------------------------------------------

    def _validate_sources(self, capsule: ContextCapsule) -> None:
        """Verify every claim in the capsule is backed by a ``sources[]`` entry.

        Design: "Sources mandatory. Every claim has a sources[] entry. Compose
        fails if violation detected."

        Args:
            capsule: The composed capsule to validate.

        Raises:
            ComposeError: If ``root_cause_candidates`` or ``relevant_symbols``
                are populated but ``sources`` is empty, or if any entry in
                those sections has a ``symbol_id`` with no matching source id.
        """
        has_claims = bool(capsule.root_cause_candidates or capsule.relevant_symbols)
        if has_claims and not capsule.sources:
            raise ComposeError(
                "Capsule has populated sections (root_cause_candidates / relevant_symbols) "
                "but sources[] is empty. Composition rule 1 requires every claim to have "
                "at least one source entry."
            )

        # Per-symbol source check: every symbol_id in claims must have a source.
        sourced_ids = {s.id for s in capsule.sources}
        for rcc in capsule.root_cause_candidates:
            if rcc.symbol_id not in sourced_ids:
                raise ComposeError(
                    f"root_cause_candidate symbol '{rcc.symbol_id}' has no source entry. "
                    "Compose fails: sources mandatory per design composition rule 1."
                )
        for rs in capsule.relevant_symbols:
            if rs.symbol_id not in sourced_ids:
                raise ComposeError(
                    f"relevant_symbol '{rs.symbol_id}' has no source entry. "
                    "Compose fails: sources mandatory per design composition rule 1."
                )
548
+
549
+
550
+ __all__ = [
551
+ "CapsuleComposer",
552
+ "ComposeError",
553
+ ]