cognis-engine 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cognis/__init__.py +35 -0
- cognis/assets/logo.png +0 -0
- cognis/branding.py +29 -0
- cognis/capsule/__init__.py +42 -0
- cognis/capsule/composer.py +553 -0
- cognis/capsule/models.py +261 -0
- cognis/capsule/token_estimator.py +136 -0
- cognis/cli/__init__.py +1 -0
- cognis/cli/main.py +1831 -0
- cognis/config.py +533 -0
- cognis/db.py +747 -0
- cognis/migrations/001_initial.sql +188 -0
- cognis/migrations/__init__.py +7 -0
- cognis/models.py +188 -0
- cognis/planner.py +433 -0
- cognis/py.typed +0 -0
- cognis/schemas/__init__.py +1 -0
- cognis/schemas/capsule.v1.json +383 -0
- cognis_adapters/__init__.py +1 -0
- cognis_cli/__init__.py +1 -0
- cognis_engine-0.3.0.dist-info/METADATA +492 -0
- cognis_engine-0.3.0.dist-info/RECORD +72 -0
- cognis_engine-0.3.0.dist-info/WHEEL +4 -0
- cognis_engine-0.3.0.dist-info/entry_points.txt +4 -0
- cognis_engine-0.3.0.dist-info/licenses/LICENSE +201 -0
- cognis_eval/__init__.py +57 -0
- cognis_eval/models.py +184 -0
- cognis_eval/runner.py +404 -0
- cognis_eval/strategy.py +87 -0
- cognis_indexd/__init__.py +1 -0
- cognis_indexd/main.py +691 -0
- cognis_indexer/__init__.py +4 -0
- cognis_indexer/embedder.py +508 -0
- cognis_indexer/enricher/__init__.py +27 -0
- cognis_indexer/enricher/attributes.py +316 -0
- cognis_indexer/enricher/enricher.py +199 -0
- cognis_indexer/enricher/secrets.py +231 -0
- cognis_indexer/parsers/__init__.py +25 -0
- cognis_indexer/parsers/_normalize.py +102 -0
- cognis_indexer/parsers/base.py +94 -0
- cognis_indexer/parsers/go.py +342 -0
- cognis_indexer/parsers/python.py +412 -0
- cognis_indexer/parsers/typescript.py +469 -0
- cognis_indexer/pipeline.py +1028 -0
- cognis_indexer/resolver/__init__.py +24 -0
- cognis_indexer/resolver/base.py +77 -0
- cognis_indexer/resolver/heuristic.py +173 -0
- cognis_indexer/resolver/lsp.py +109 -0
- cognis_indexer/resolver/pipeline.py +112 -0
- cognis_indexer/watcher/__init__.py +29 -0
- cognis_indexer/watcher/debounce.py +131 -0
- cognis_indexer/watcher/events.py +72 -0
- cognis_indexer/watcher/gitignore.py +178 -0
- cognis_indexer/watcher/watcher.py +326 -0
- cognis_indexer/writer.py +477 -0
- cognis_mcpd/__init__.py +16 -0
- cognis_mcpd/app/__init__.py +1 -0
- cognis_mcpd/audit.py +78 -0
- cognis_mcpd/embedder_pool.py +94 -0
- cognis_mcpd/errors.py +93 -0
- cognis_mcpd/main.py +128 -0
- cognis_mcpd/metrics.py +275 -0
- cognis_mcpd/result_cache.py +81 -0
- cognis_mcpd/server.py +272 -0
- cognis_mcpd/tools.py +1837 -0
- cognis_retrieval/__init__.py +66 -0
- cognis_retrieval/base.py +64 -0
- cognis_retrieval/csar.py +551 -0
- cognis_retrieval/lexical.py +179 -0
- cognis_retrieval/query_rewriter.py +191 -0
- cognis_retrieval/semantic.py +232 -0
- cognis_retrieval/structural.py +220 -0
cognis/__init__.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""cognis core namespace.
|
|
2
|
+
|
|
3
|
+
Subpackages land in later tasks:
|
|
4
|
+
|
|
5
|
+
- ``cognis.config`` (task 2.1) — Pydantic config loader.
|
|
6
|
+
- ``cognis.cli`` (task 2.2) — Click-based ``cognis-cli`` entry points.
|
|
7
|
+
- ``cognis.db`` (task 3) — SQLite connection factory + migrations + UCKG CRUD.
|
|
8
|
+
- ``cognis.models`` (task 3) — Pydantic data models for the UCKG schema.
|
|
9
|
+
- ``cognis.planner`` (task 13) — Cognitive Context Planner.
|
|
10
|
+
- ``cognis.capsule`` (task 14) — Capsule composer + JSON Schema.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
|
|
17
|
+
if os.name == "nt":
    # On Windows a single process tree often hosts pytest workers, MCP
    # threads, numpy and embedding/model libraries at the same time.  Pin
    # every BLAS/OpenMP thread pool to one thread by default; a value the
    # operator has already exported is never overwritten.
    for _var in (
        "OPENBLAS_NUM_THREADS",
        "OMP_NUM_THREADS",
        "MKL_NUM_THREADS",
        "NUMEXPR_NUM_THREADS",
    ):
        if _var not in os.environ:
            os.environ[_var] = "1"
|
|
28
|
+
|
|
29
|
+
from cognis.config import Config
|
|
30
|
+
|
|
31
|
+
__all__ = ["Config", "__version__"]
|
|
32
|
+
|
|
33
|
+
# Runtime copy of the package version.  The PEP 621 metadata in pyproject is the
# canonical value; this constant mirrors it for display in ``cognis-cli health``.
|
|
35
|
+
__version__: str = "0.3.0"
|
cognis/assets/logo.png
ADDED
|
Binary file
|
cognis/branding.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Shared cognis branding helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from cognis import __version__
|
|
9
|
+
|
|
10
|
+
TAGLINE = "Software Cognition Engine"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def logo_path() -> Path | None:
    """Locate the logo image that ships next to this module.

    Returns:
        The resolved path of ``assets/logo.png`` inside this module's
        directory, or ``None`` when no regular file exists at that location.
    """
    assets_dir = Path(__file__).resolve().parent / "assets"
    logo = assets_dir / "logo.png"
    if not logo.is_file():
        return None
    return logo
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def format_banner(*, prog: str = "cognis") -> str:
    """Build the single-line startup banner used by CLI and daemon entry points.

    Args:
        prog: Program name shown at the start of the banner.

    Returns:
        ``"<prog> v<version> — <tagline>"`` built from the package
        ``__version__`` and the module-level ``TAGLINE``.
    """
    return "{} v{} — {}".format(prog, __version__, TAGLINE)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def echo_banner(*, prog: str = "cognis", file: object | None = None) -> None:
    """Write the startup banner, but only to an interactive terminal.

    Args:
        prog: Program name forwarded to :func:`format_banner`.
        file: Destination stream; ``sys.stderr`` is used when ``None``.

    The banner is skipped when the stream has no ``isatty`` attribute or when
    ``isatty()`` returns a falsy value.
    """
    stream = sys.stderr if file is None else file
    try:
        is_interactive = stream.isatty
    except AttributeError:
        # Objects without ``isatty`` are treated as non-interactive.
        return
    if not is_interactive():
        return
    print(format_banner(prog=prog), file=stream)  # type: ignore[arg-type]
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Capsule composer package — Task 14 of ``.kiro/specs/cognis/tasks.md``.
|
|
2
|
+
|
|
3
|
+
Submodules:
|
|
4
|
+
|
|
5
|
+
- :mod:`cognis.capsule.models` — Pydantic v2 models for :class:`ContextCapsule` v1.
|
|
6
|
+
- :mod:`cognis.capsule.token_estimator` — tiktoken-based token estimation.
|
|
7
|
+
- :mod:`cognis.capsule.composer` — :class:`CapsuleComposer` pipeline.
|
|
8
|
+
|
|
9
|
+
Public re-exports:
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from cognis.capsule.composer import CapsuleComposer, ComposeError
|
|
15
|
+
from cognis.capsule.models import (
|
|
16
|
+
CallChainEdge,
|
|
17
|
+
CapsuleSource,
|
|
18
|
+
CompressedContext,
|
|
19
|
+
ContextCapsule,
|
|
20
|
+
NeighborPattern,
|
|
21
|
+
RelevantSymbol,
|
|
22
|
+
RiskArea,
|
|
23
|
+
RootCauseCandidate,
|
|
24
|
+
RuntimeEvidence,
|
|
25
|
+
)
|
|
26
|
+
from cognis.capsule.token_estimator import estimate_capsule_tokens, estimate_tokens
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
"CallChainEdge",
|
|
30
|
+
"CapsuleComposer",
|
|
31
|
+
"CapsuleSource",
|
|
32
|
+
"ComposeError",
|
|
33
|
+
"CompressedContext",
|
|
34
|
+
"ContextCapsule",
|
|
35
|
+
"NeighborPattern",
|
|
36
|
+
"RelevantSymbol",
|
|
37
|
+
"RiskArea",
|
|
38
|
+
"RootCauseCandidate",
|
|
39
|
+
"RuntimeEvidence",
|
|
40
|
+
"estimate_capsule_tokens",
|
|
41
|
+
"estimate_tokens",
|
|
42
|
+
]
|
|
@@ -0,0 +1,553 @@
|
|
|
1
|
+
"""Capsule Composer — Task 14.2, 14.4 of ``.kiro/specs/cognis/tasks.md``.
|
|
2
|
+
|
|
3
|
+
Implements the composition pipeline that takes retrieval hits from the three
|
|
4
|
+
MVP layers (lexical, semantic, structural), merges them, hydrates symbol rows
|
|
5
|
+
from the DB, fills per-mode sections, attaches source entries, wraps untrusted
|
|
6
|
+
content, and enforces the token budget.
|
|
7
|
+
|
|
8
|
+
Design reference
|
|
9
|
+
----------------
|
|
10
|
+
- Context Capsule schema (v1, MVP) and Composition rules — design.md §Data Models.
|
|
11
|
+
- Cognitive Context Planner pipeline — design.md §Components and Interfaces.
|
|
12
|
+
- Error Handling → Untrusted content handling — design.md.
|
|
13
|
+
|
|
14
|
+
Correctness properties
|
|
15
|
+
-----------------------
|
|
16
|
+
CP-8: ``token_estimate ≤ max_tokens`` for any ``max_tokens ∈ [500, 32000]``.
|
|
17
|
+
CP-9: ``sources[]`` non-empty for every populated section.
|
|
18
|
+
CP-11: Same query + same DB state → same capsule (modulo wall-clock fields).
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import json
|
|
24
|
+
import logging
|
|
25
|
+
from typing import TYPE_CHECKING, Any
|
|
26
|
+
|
|
27
|
+
from cognis.capsule.models import (
|
|
28
|
+
CapsuleSource,
|
|
29
|
+
CompressedContext,
|
|
30
|
+
ContextCapsule,
|
|
31
|
+
RelevantSymbol,
|
|
32
|
+
RiskArea,
|
|
33
|
+
RootCauseCandidate,
|
|
34
|
+
)
|
|
35
|
+
from cognis.capsule.token_estimator import estimate_capsule_tokens
|
|
36
|
+
from cognis.db import Database, get_symbol
|
|
37
|
+
from cognis.planner import TaskMode
|
|
38
|
+
|
|
39
|
+
if TYPE_CHECKING:
|
|
40
|
+
from cognis_retrieval.base import Hit
|
|
41
|
+
|
|
42
|
+
logger = logging.getLogger(__name__)
|
|
43
|
+
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
# Untrusted content markers (design §Error Handling → Untrusted content)
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
|
|
48
|
+
_UNTRUSTED_OPEN = '<<<UNTRUSTED type="{kind}" symbol="{symbol}">>>'
|
|
49
|
+
_UNTRUSTED_CLOSE = "<<<END UNTRUSTED>>>"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _wrap_untrusted(text: str, kind: str, symbol: str) -> str:
|
|
53
|
+
"""Wrap *text* with the ``<<<UNTRUSTED ...>>>`` marker pair.
|
|
54
|
+
|
|
55
|
+
Args:
|
|
56
|
+
text: The raw content to wrap.
|
|
57
|
+
kind: Content kind (e.g. ``"docstring"``, ``"comment"``).
|
|
58
|
+
symbol: Qualified name or symbol_id of the originating symbol.
|
|
59
|
+
|
|
60
|
+
Returns:
|
|
61
|
+
The wrapped string ready for inclusion in the capsule.
|
|
62
|
+
"""
|
|
63
|
+
open_tag = _UNTRUSTED_OPEN.format(kind=kind, symbol=symbol)
|
|
64
|
+
return f"{open_tag}\n{text}\n{_UNTRUSTED_CLOSE}"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
# ComposeError
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class ComposeError(Exception):
    """Signal a fatal rule violation found while composing a capsule.

    At present there is a single fatal condition: ``root_cause_candidates``
    or ``relevant_symbols`` holds entries that no ``sources`` entry backs,
    which breaks the design's "Sources mandatory" composition rule.
    """
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# Internal helpers
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _dedupe_hits(hits: list[Hit]) -> list[Hit]:
|
|
87
|
+
"""Deduplicate hits by ``symbol_id``, keeping the highest score per symbol.
|
|
88
|
+
|
|
89
|
+
The output is sorted by descending score (deterministic tie-breaking by
|
|
90
|
+
symbol_id ensures CP-11 determinism).
|
|
91
|
+
|
|
92
|
+
Args:
|
|
93
|
+
hits: Raw hits from one or more retrieval layers.
|
|
94
|
+
|
|
95
|
+
Returns:
|
|
96
|
+
Deduplicated, score-sorted list of hits.
|
|
97
|
+
"""
|
|
98
|
+
best: dict[str, Hit] = {}
|
|
99
|
+
for hit in hits:
|
|
100
|
+
existing = best.get(hit.symbol_id)
|
|
101
|
+
if existing is None or hit.score > existing.score:
|
|
102
|
+
best[hit.symbol_id] = hit
|
|
103
|
+
# Stable sort: descending score, then ascending symbol_id for tie-break.
|
|
104
|
+
return sorted(best.values(), key=lambda h: (-h.score, h.symbol_id))
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _make_source(symbol_id: str) -> CapsuleSource:
    """Build the ``CapsuleSource`` that backs *symbol_id*.

    The entry always has type ``"symbol"``, carries *symbol_id* as its id and
    has no URI.
    """
    source_fields = {"type": "symbol", "id": symbol_id, "uri": None}
    return CapsuleSource(**source_fields)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _is_untrusted(untrusted_flags: list[str]) -> bool:
|
|
113
|
+
"""Return True if the symbol's flags indicate untrusted document content."""
|
|
114
|
+
return "untrusted_doc" in untrusted_flags
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# ---------------------------------------------------------------------------
|
|
118
|
+
# CapsuleComposer
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
class CapsuleComposer:
    """Build a :class:`~cognis.capsule.models.ContextCapsule` from retrieval hits.

    The composer holds no state of its own; everything it needs is passed to
    :meth:`compose`.  Given the same hits (in the same order) and the same
    database rows it therefore follows the same path every time, which is what
    CP-11 (determinism) relies on.

    Usage
    -----
    .. code-block:: python

        composer = CapsuleComposer()
        capsule = composer.compose(
            task="Why is /login timing out?",
            mode="bugfix",
            confidence=0.85,
            hits=hits,
            max_tokens=8000,
            db=db,
        )
    """

    # ------------------------------------------------------------------
    # Public interface
    # ------------------------------------------------------------------

    def compose(
        self,
        task: str,
        mode: TaskMode,
        confidence: float,
        hits: list[Hit],
        max_tokens: int,
        db: Database,
        include_runtime: bool = False,
    ) -> ContextCapsule:
        """Run the full composition pipeline.

        Steps:

        1. Deduplicate *hits* by ``symbol_id`` and rank them by score
           (``symbol_id`` breaks ties).
        2. Keep the 100 best hits and load their symbol rows from *db*.
        3. Fill the sections for *mode*: ``"bugfix"`` produces
           ``root_cause_candidates`` (structural hits) plus
           ``relevant_symbols`` (all other hits); every other mode produces
           ``relevant_symbols`` only.  Each entry gets a ``CapsuleSource`` and
           content of symbols flagged ``"untrusted_doc"`` is wrapped in
           ``<<<UNTRUSTED ...>>>`` markers.
        4. Add a ``RiskArea`` for every hydrated hit whose ``risk_score`` is
           positive.
        5. Drop or trim sections until the token estimate fits *max_tokens*.
        6. Check that every remaining claim has a source.

        Args:
            task: The original user task string (stored as ``goal``).
            mode: Task mode from the planner classifier.
            confidence: Planner classifier confidence.
            hits: Raw retrieval hits from all layers; duplicates across
                layers are allowed.
            max_tokens: Upper bound for ``token_estimate``.
            db: Database handle used to hydrate symbol rows.
            include_runtime: Accepted for the ``runtime_evidence`` section,
                which is always emitted empty here; the flag is not read.

        Returns:
            The composed :class:`~cognis.capsule.models.ContextCapsule`.

        Raises:
            ComposeError: If a populated section lacks a source entry.
        """
        # Cap at 100 hits so hydration cannot issue an unbounded number of
        # DB lookups; the token budget trims further later on.
        top_hits = _dedupe_hits(hits)[:100]
        symbol_rows = self._hydrate_symbols(top_hits, db)

        # The fill helpers append to these two lists in place.
        sources: list[CapsuleSource] = []
        untrusted_sections: list[str] = []

        root_cause_candidates: list[RootCauseCandidate] = []
        relevant_symbols: list[RelevantSymbol]
        compressed_context: list[CompressedContext] = []

        if mode == "bugfix":
            root_cause_candidates, relevant_symbols = self._fill_bugfix_sections(
                top_hits, symbol_rows, sources, untrusted_sections
            )
        else:
            relevant_symbols = self._fill_generic_sections(
                top_hits, symbol_rows, sources, untrusted_sections
            )

        # Risk areas come from hydrated rows only; hits without a row are skipped.
        risk_areas: list[RiskArea] = []
        for hit in top_hits:
            row = symbol_rows.get(hit.symbol_id)
            if row is None or not row.risk_score > 0.0:
                continue
            risk_areas.append(
                RiskArea(
                    symbol_id=hit.symbol_id,
                    reason=f"risk_score={row.risk_score:.2f}",
                )
            )

        draft = ContextCapsule(
            version="1",
            goal=task,
            task_mode=mode,
            confidence=confidence,
            root_cause_candidates=root_cause_candidates,
            relevant_symbols=relevant_symbols,
            call_chain=[],
            runtime_evidence=[],
            neighbor_patterns=[],
            risk_areas=risk_areas,
            compressed_context=compressed_context,
            token_estimate=0,  # placeholder; _enforce_budget sets the real value
            sources=sources,
            untrusted_sections=sorted(set(untrusted_sections)),
        )

        fitted = self._enforce_budget(draft, max_tokens)
        self._validate_sources(fitted)
        return fitted

    # ------------------------------------------------------------------
    # Section filling helpers
    # ------------------------------------------------------------------

    def _hydrate_symbols(
        self,
        hits: list[Hit],
        db: Database,
    ) -> dict[str, Any]:
        """Look up the symbol row of every hit.

        One ``get_symbol`` call is made per hit.  Hits for which it returns
        ``None`` are left out of the mapping; callers treat such hits as
        symbols without metadata.

        Args:
            hits: Hits whose ``symbol_id`` should be looked up.
            db: Database handle passed through to ``get_symbol``.

        Returns:
            Mapping from ``symbol_id`` to the row returned by ``get_symbol``;
            it may cover only part of *hits*.
        """
        rows: dict[str, Any] = {}
        for hit in hits:
            row = get_symbol(db, hit.symbol_id)
            if row is None:
                continue
            rows[hit.symbol_id] = row
        return rows

    def _build_relevant_symbol(
        self,
        hit: Hit,
        symbol_rows: dict[str, Any],
        untrusted_sections: list[str],
        section_id: str,
    ) -> RelevantSymbol:
        """Turn one hit into a :class:`RelevantSymbol`.

        The snippet is the row's ``body_excerpt``, falling back to its
        ``docstring``.  When the row carries the ``"untrusted_doc"`` flag the
        snippet is wrapped in ``<<<UNTRUSTED ...>>>`` markers and *section_id*
        is recorded in *untrusted_sections* (once).

        Args:
            hit: The retrieval hit.
            symbol_rows: Hydrated symbol rows keyed by ``symbol_id``.
            untrusted_sections: List that receives *section_id* when the
                snippet is untrusted; mutated in place.
            section_id: The capsule section id (e.g. ``"relevant_symbols"``).

        Returns:
            The entry for *hit*.  Without a hydrated row the kind is
            ``"unknown"`` and both snippet and summary are ``None``.
        """
        row = symbol_rows.get(hit.symbol_id)
        if row is None:
            return RelevantSymbol(
                symbol_id=hit.symbol_id,
                kind="unknown",
                snippet=None,
                summary=None,
                score=hit.score,
            )

        snippet: str | None = None
        raw_snippet = row.body_excerpt or row.docstring
        if raw_snippet:
            if _is_untrusted(list(row.untrusted_flags)):
                # NOTE(review): the wrapper kind is "docstring" even when the
                # snippet came from body_excerpt - confirm this is intended.
                snippet = _wrap_untrusted(raw_snippet, "docstring", row.qualified_name)
                if section_id not in untrusted_sections:
                    untrusted_sections.append(section_id)
            else:
                snippet = raw_snippet

        return RelevantSymbol(
            symbol_id=hit.symbol_id,
            kind=row.kind,
            snippet=snippet,
            summary=row.semantic_summary,
            score=hit.score,
        )

    def _fill_bugfix_sections(
        self,
        hits: list[Hit],
        symbol_rows: dict[str, Any],
        sources: list[CapsuleSource],
        untrusted_sections: list[str],
    ) -> tuple[list[RootCauseCandidate], list[RelevantSymbol]]:
        """Fill ``root_cause_candidates`` and ``relevant_symbols`` for bugfix mode.

        * ``root_cause_candidates``: the first 5 hits whose layer is
          ``"structural"``, in the order given.
        * ``relevant_symbols``: the first 20 hits of any other layer.

        A source entry is appended to *sources* for every entry produced.

        Args:
            hits: Ranked, deduplicated hits.
            symbol_rows: Hydrated symbol rows keyed by ``symbol_id``.
            sources: Receives one ``CapsuleSource`` per entry; mutated in place.
            untrusted_sections: Receives the id of each section that ends up
                containing wrapped content; mutated in place.

        Returns:
            ``(root_cause_candidates, relevant_symbols)``.
        """
        structural_hits: list[Hit] = []
        other_hits: list[Hit] = []
        for hit in hits:
            bucket = structural_hits if hit.layer == "structural" else other_hits
            bucket.append(hit)

        root_causes: list[RootCauseCandidate] = []
        for hit in structural_hits[:5]:
            # Evidence is serialised with sorted keys so the text is stable.
            evidence = [json.dumps(hit.evidence, sort_keys=True)] if hit.evidence else []
            rationale = hit.reason or f"structural relevance (score={hit.score:.3f})"

            row = symbol_rows.get(hit.symbol_id)
            if row is not None and _is_untrusted(list(row.untrusted_flags)):
                rationale = _wrap_untrusted(rationale, "rationale", row.qualified_name)
                if "root_cause_candidates" not in untrusted_sections:
                    untrusted_sections.append("root_cause_candidates")

            root_causes.append(
                RootCauseCandidate(
                    symbol_id=hit.symbol_id,
                    rationale=rationale,
                    evidence=evidence,
                )
            )
            sources.append(_make_source(hit.symbol_id))

        relevant: list[RelevantSymbol] = []
        for hit in other_hits[:20]:
            relevant.append(
                self._build_relevant_symbol(
                    hit, symbol_rows, untrusted_sections, "relevant_symbols"
                )
            )
            sources.append(_make_source(hit.symbol_id))

        return root_causes, relevant

    def _fill_generic_sections(
        self,
        hits: list[Hit],
        symbol_rows: dict[str, Any],
        sources: list[CapsuleSource],
        untrusted_sections: list[str],
    ) -> list[RelevantSymbol]:
        """Fill ``relevant_symbols`` for every mode other than bugfix.

        The first 25 hits each become a :class:`RelevantSymbol`, and a source
        entry for each is appended to *sources*.

        Args:
            hits: Ranked, deduplicated hits.
            symbol_rows: Hydrated symbol rows keyed by ``symbol_id``.
            sources: Receives one ``CapsuleSource`` per entry; mutated in place.
            untrusted_sections: Receives ``"relevant_symbols"`` when any
                snippet is wrapped; mutated in place.

        Returns:
            The ``relevant_symbols`` entries in hit order.
        """
        relevant: list[RelevantSymbol] = []
        for hit in hits[:25]:
            relevant.append(
                self._build_relevant_symbol(
                    hit, symbol_rows, untrusted_sections, "relevant_symbols"
                )
            )
            sources.append(_make_source(hit.symbol_id))
        return relevant

    # ------------------------------------------------------------------
    # Budget enforcement (CP-8)
    # ------------------------------------------------------------------

    def _enforce_budget(self, capsule: ContextCapsule, max_tokens: int) -> ContextCapsule:
        """Shrink *capsule* until its token estimate fits *max_tokens* (CP-8).

        Reduction order, stopping as soon as the estimate fits:

        1. Empty ``neighbor_patterns``, ``compressed_context``, ``risk_areas``
           and ``runtime_evidence``, in that order, one whole section at a time.
        2. Remove ``relevant_symbols`` entries from the end, one at a time.
        3. Remove ``root_cause_candidates`` entries from the end, one at a time.

        Every estimate is taken on the capsule as it would actually be
        returned, i.e. with ``sources`` already reduced to the symbols still
        present.  Otherwise the sources of entries that were already removed
        would keep inflating the estimate.

        Afterwards ``sources`` is reduced to one entry per surviving symbol,
        and section names whose section became empty are removed from
        ``untrusted_sections``.  If the estimate still exceeds *max_tokens*
        once nothing is left to remove, a warning is logged and the capsule is
        returned anyway.

        Args:
            capsule: Draft capsule; its ``token_estimate`` is ignored.
            max_tokens: Upper bound for the token estimate.

        Returns:
            A new capsule with ``token_estimate`` set and sections possibly
            emptied or shortened.
        """
        drop_order = (
            "neighbor_patterns",
            "compressed_context",
            "risk_areas",
            "runtime_evidence",
        )
        trim_order = ("relevant_symbols", "root_cause_candidates")

        # Work on a plain dict so sections can be replaced step by step.
        fields: dict[str, Any] = capsule.model_dump(by_alias=True)

        def _read(entry: Any, name: str) -> Any:
            # Entries are dicts after model_dump; model instances are tolerated too.
            return entry.get(name) if isinstance(entry, dict) else getattr(entry, name)

        def _live_sources(state: dict[str, Any]) -> list[Any]:
            # Sources for symbols still listed in a claim section,
            # first occurrence only, original order preserved.
            wanted = {
                _read(entry, "symbol_id")
                for section in trim_order
                for entry in state.get(section) or []
            }
            kept: list[Any] = []
            seen: set[str] = set()
            for src in state.get("sources") or []:
                src_id = _read(src, "id")
                if src_id is None or src_id not in wanted or src_id in seen:
                    continue
                kept.append(src)
                seen.add(src_id)
            return kept

        def _estimate(state: dict[str, Any]) -> int:
            probe = {**state, "sources": _live_sources(state), "token_estimate": 0}
            return estimate_capsule_tokens(ContextCapsule(**probe))

        estimate = _estimate(fields)

        # Phase 1: empty whole low-priority sections.
        for section in drop_order:
            if estimate <= max_tokens:
                break
            if fields.get(section):
                fields[section] = []
                estimate = _estimate(fields)

        # Phases 2 and 3: shorten the claim sections from the end.
        for section in trim_order:
            while estimate > max_tokens and fields.get(section):
                fields[section] = fields[section][:-1]
                estimate = _estimate(fields)

        fields["sources"] = _live_sources(fields)

        # A section that is now empty can no longer contain untrusted content.
        fields["untrusted_sections"] = [
            name
            for name in fields.get("untrusted_sections") or []
            if name not in fields or fields[name]
        ]

        if estimate > max_tokens:
            logger.warning(
                "capsule token estimate %d exceeds max_tokens %d after all "
                "droppable sections were removed",
                estimate,
                max_tokens,
            )

        fields["token_estimate"] = estimate
        return ContextCapsule(**fields)

    # ------------------------------------------------------------------
    # Source validation (CP-9 / "Sources mandatory")
    # ------------------------------------------------------------------

    def _validate_sources(self, capsule: ContextCapsule) -> None:
        """Check that every claim in *capsule* is backed by a source entry.

        Two checks run in order: first that ``sources`` is not empty when
        ``root_cause_candidates`` or ``relevant_symbols`` has entries, then
        that the ``symbol_id`` of every such entry equals the ``id`` of some
        source.

        Args:
            capsule: The composed capsule to validate.

        Raises:
            ComposeError: On the first violation found.
        """
        claims_present = bool(capsule.root_cause_candidates or capsule.relevant_symbols)
        if claims_present and not capsule.sources:
            raise ComposeError(
                "Capsule has populated sections (root_cause_candidates / relevant_symbols) "
                "but sources[] is empty. Composition rule 1 requires every claim to have "
                "at least one source entry."
            )

        sourced_ids = {source.id for source in capsule.sources}

        for candidate in capsule.root_cause_candidates:
            if candidate.symbol_id in sourced_ids:
                continue
            raise ComposeError(
                f"root_cause_candidate symbol '{candidate.symbol_id}' has no source entry. "
                "Compose fails: sources mandatory per design composition rule 1."
            )

        for symbol in capsule.relevant_symbols:
            if symbol.symbol_id in sourced_ids:
                continue
            raise ComposeError(
                f"relevant_symbol '{symbol.symbol_id}' has no source entry. "
                "Compose fails: sources mandatory per design composition rule 1."
            )
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
__all__ = [
|
|
551
|
+
"CapsuleComposer",
|
|
552
|
+
"ComposeError",
|
|
553
|
+
]
|