crprotocol 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crp/__init__.py +126 -0
- crp/__main__.py +8 -0
- crp/_typing.py +27 -0
- crp/_version.py +5 -0
- crp/adapters.py +31 -0
- crp/advanced/__init__.py +40 -0
- crp/advanced/auto_ingest.py +400 -0
- crp/advanced/cqs.py +235 -0
- crp/advanced/cross_window.py +477 -0
- crp/advanced/curator.py +265 -0
- crp/advanced/feedback.py +146 -0
- crp/advanced/hierarchical.py +211 -0
- crp/advanced/meta_learning.py +401 -0
- crp/advanced/parallel.py +98 -0
- crp/advanced/review_cycle.py +329 -0
- crp/advanced/scale_mode.py +129 -0
- crp/advanced/source_grounding.py +207 -0
- crp/ckf/__init__.py +35 -0
- crp/ckf/community.py +377 -0
- crp/ckf/fabric.py +445 -0
- crp/ckf/gc.py +175 -0
- crp/ckf/graph_walk.py +87 -0
- crp/ckf/merge.py +133 -0
- crp/ckf/pattern_query.py +122 -0
- crp/ckf/pubsub.py +128 -0
- crp/ckf/semantic.py +207 -0
- crp/cli/__init__.py +7 -0
- crp/cli/main.py +329 -0
- crp/cli/sidecar.py +929 -0
- crp/cli/startup.py +272 -0
- crp/continuation/__init__.py +103 -0
- crp/continuation/completion.py +348 -0
- crp/continuation/degradation.py +157 -0
- crp/continuation/document_map.py +160 -0
- crp/continuation/flow.py +109 -0
- crp/continuation/gap.py +419 -0
- crp/continuation/manager.py +484 -0
- crp/continuation/quality_monitor.py +179 -0
- crp/continuation/stitch.py +419 -0
- crp/continuation/trigger.py +142 -0
- crp/continuation/voice.py +157 -0
- crp/core/__init__.py +69 -0
- crp/core/batch.py +77 -0
- crp/core/circuit_breaker.py +116 -0
- crp/core/config.py +377 -0
- crp/core/context_tools.py +540 -0
- crp/core/dispatch_router.py +3977 -0
- crp/core/errors.py +128 -0
- crp/core/extraction_facade.py +384 -0
- crp/core/facilitator.py +713 -0
- crp/core/idempotency.py +215 -0
- crp/core/orchestrator.py +1435 -0
- crp/core/relay_strategies.py +613 -0
- crp/core/security_manager.py +140 -0
- crp/core/session.py +134 -0
- crp/core/task_intent.py +36 -0
- crp/core/window.py +363 -0
- crp/envelope/__init__.py +30 -0
- crp/envelope/builder.py +288 -0
- crp/envelope/decomposer.py +236 -0
- crp/envelope/formatter.py +168 -0
- crp/envelope/packer.py +211 -0
- crp/envelope/reranker.py +209 -0
- crp/envelope/scoring.py +310 -0
- crp/extraction/__init__.py +45 -0
- crp/extraction/complexity.py +96 -0
- crp/extraction/contradiction.py +132 -0
- crp/extraction/pipeline.py +360 -0
- crp/extraction/quality_gate.py +237 -0
- crp/extraction/stage1_regex.py +173 -0
- crp/extraction/stage2_statistical.py +244 -0
- crp/extraction/stage3_gliner.py +210 -0
- crp/extraction/stage4_uie.py +183 -0
- crp/extraction/stage5_discourse.py +175 -0
- crp/extraction/stage6_llm.py +178 -0
- crp/extraction/structured_output.py +219 -0
- crp/extraction/types.py +299 -0
- crp/license_guard.py +722 -0
- crp/observability/__init__.py +30 -0
- crp/observability/audit.py +118 -0
- crp/observability/events.py +233 -0
- crp/observability/metrics.py +264 -0
- crp/observability/quality.py +135 -0
- crp/observability/structured_logging.py +81 -0
- crp/observability/telemetry.py +117 -0
- crp/provenance/__init__.py +314 -0
- crp/provenance/_embeddings.py +97 -0
- crp/provenance/_types.py +378 -0
- crp/provenance/attribution_scorer.py +252 -0
- crp/provenance/claim_detector.py +229 -0
- crp/provenance/contradiction_detector.py +243 -0
- crp/provenance/distortion_detector.py +397 -0
- crp/provenance/entailment_verifier.py +358 -0
- crp/provenance/fabrication_detector.py +203 -0
- crp/provenance/hallucination_scorer.py +320 -0
- crp/provenance/omission_analyzer.py +106 -0
- crp/provenance/provenance_chain.py +205 -0
- crp/provenance/report_generator.py +440 -0
- crp/providers/__init__.py +43 -0
- crp/providers/anthropic.py +270 -0
- crp/providers/base.py +135 -0
- crp/providers/custom.py +63 -0
- crp/providers/diagnostic.py +251 -0
- crp/providers/llamacpp.py +224 -0
- crp/providers/manager.py +139 -0
- crp/providers/ollama.py +243 -0
- crp/providers/openai.py +628 -0
- crp/providers/tokenizers.py +48 -0
- crp/py.typed +0 -0
- crp/resources/__init__.py +53 -0
- crp/resources/adaptive_allocator.py +525 -0
- crp/resources/cost_model.py +388 -0
- crp/resources/overhead_manager.py +217 -0
- crp/resources/resource_manager.py +262 -0
- crp/schemas/__init__.py +20 -0
- crp/schemas/cost-estimate.json +33 -0
- crp/schemas/crp-error.json +43 -0
- crp/schemas/envelope-preview.json +40 -0
- crp/schemas/persisted-state-header.json +27 -0
- crp/schemas/quality-report.json +94 -0
- crp/schemas/session-handle.json +33 -0
- crp/schemas/session-status.json +57 -0
- crp/schemas/stream-event.json +18 -0
- crp/schemas/task-intent.json +42 -0
- crp/security/__init__.py +93 -0
- crp/security/audit_trail.py +392 -0
- crp/security/binding.py +192 -0
- crp/security/compliance.py +813 -0
- crp/security/consent.py +593 -0
- crp/security/embedding_defense.py +161 -0
- crp/security/encryption.py +202 -0
- crp/security/injection.py +335 -0
- crp/security/integrity.py +267 -0
- crp/security/privacy.py +662 -0
- crp/security/quarantine.py +249 -0
- crp/security/rbac.py +221 -0
- crp/security/validation.py +164 -0
- crp/state/__init__.py +31 -0
- crp/state/cold_storage.py +258 -0
- crp/state/compaction.py +263 -0
- crp/state/critical_state.py +104 -0
- crp/state/event_log.py +313 -0
- crp/state/fact.py +189 -0
- crp/state/serialization.py +189 -0
- crp/state/session_cleanup.py +77 -0
- crp/state/snapshot.py +290 -0
- crp/state/warm_store.py +346 -0
- crprotocol-2.0.0.dist-info/METADATA +1295 -0
- crprotocol-2.0.0.dist-info/RECORD +153 -0
- crprotocol-2.0.0.dist-info/WHEEL +4 -0
- crprotocol-2.0.0.dist-info/entry_points.txt +2 -0
- crprotocol-2.0.0.dist-info/licenses/LICENSE.md +170 -0
- crprotocol-2.0.0.dist-info/licenses/NOTICE +18 -0
crp/ckf/fabric.py
ADDED
|
@@ -0,0 +1,445 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Contextual Knowledge Fabric — unified 4-mode retrieval interface (§3.8).
|
|
4
|
+
|
|
5
|
+
The CKF is the top-level interface for fact storage, retrieval, community
|
|
6
|
+
detection, pub/sub events, and cross-session persistence.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
import time
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from crp.extraction.types import Fact, FactEdge, RelationType
|
|
17
|
+
from crp.state.cold_storage import persist_to_cold, restore_from_cold
|
|
18
|
+
from crp.state.event_log import FactEventLog
|
|
19
|
+
from crp.state.warm_store import WarmStateStore, WarmStoreConfig
|
|
20
|
+
|
|
21
|
+
from .community import Community, CommunityDetector, CommunityResult
|
|
22
|
+
from .gc import GarbageCollector, GCResult
|
|
23
|
+
from .graph_walk import GraphWalkResult, graph_walk
|
|
24
|
+
from .merge import MergeResult, multi_mode_merge
|
|
25
|
+
from .pattern_query import PatternQueryResult, pattern_query
|
|
26
|
+
from .pubsub import CKFEvent, CKFEventType, EventCallback, PubSubEventBus
|
|
27
|
+
from .semantic import HNSWIndex, _check_hnswlib, semantic_fallback
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Config
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class CKFConfig:
    """Tunable settings for the Contextual Knowledge Fabric."""

    # Capacity handed to the warm store (WarmStoreConfig.max_facts).
    max_facts: int = 10_000
    # Below this many stored facts no HNSW index is built.
    hnsw_threshold: int = 1000
    # NOTE(review): not read anywhere in this module — presumably a default
    # location for persist()/restore(); confirm against callers.
    persist_path: str = ""
    # Garbage-collector budget and the ratios of it that trigger / end a pass.
    gc_budget_bytes: int = 500 * 1024**2
    gc_trigger_ratio: float = 0.80
    gc_target_ratio: float = 0.70
    # Gates both automatic and lazy community detection.
    community_detect_enabled: bool = True
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
# Health report
|
|
52
|
+
# ---------------------------------------------------------------------------
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class CKFHealth:
    """Point-in-time health figures reported for monitoring."""

    # Sizes of the fact store, the fact graph's edge list, the cached
    # community result and the event log.
    fact_count: int = 0
    edge_count: int = 0
    community_count: int = 0
    event_count: int = 0
    # Number of facts currently tombstoned by the garbage collector.
    tombstoned_count: int = 0
    # Rough size estimate of the fact store, in bytes.
    estimated_bytes: int = 0
    # Whether an HNSW index is currently built.
    hnsw_active: bool = False
    # Whether Leiden community detection is available.
    leiden_available: bool = False
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
# ContextualKnowledgeFabric
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class ContextualKnowledgeFabric:
    """Unified interface for fact storage and 4-mode retrieval (§3.8).

    Methods (per spec 4F.1a):
    - store(facts) / retrieve(query, modes, budget)
    - query(pattern) / persist(path) / restore(path)
    - fact_count() / health()
    - temporal_query(window_range)
    - graph_walk(seeds, hops) / community_summary(topic)
    - subscribe(event, callback)
    """

    # Minimum number of seconds between automatic community detection runs.
    _COMMUNITY_COOLDOWN_SECONDS = 30.0

    def __init__(self, config: CKFConfig | None = None) -> None:
        """Create the fabric together with its stores and subsystems.

        Parameters
        ----------
        config : fabric settings; a default ``CKFConfig`` is used when omitted.
        """
        self._config = config or CKFConfig()

        # Core stores
        self._warm = WarmStateStore(
            WarmStoreConfig(max_facts=self._config.max_facts)
        )
        self._event_log = FactEventLog()

        # CKF subsystems
        self._bus = PubSubEventBus()
        self._community = CommunityDetector()
        self._gc = GarbageCollector(
            budget_bytes=self._config.gc_budget_bytes,
            trigger_ratio=self._config.gc_trigger_ratio,
            target_ratio=self._config.gc_target_ratio,
        )

        # HNSW index (built lazily when needed)
        self._hnsw: HNSWIndex | None = None
        self._hnsw_dirty: bool = True

        # Community cache
        self._community_result: CommunityResult | None = None

        # Cooldown for community detection to avoid excessive re-computation.
        # time.monotonic() has an undefined reference point, so "never ran"
        # is encoded as -inf: a 0.0 start would wrongly report an active
        # cooldown whenever the monotonic clock reads less than the cooldown.
        self._last_community_detect: float = float("-inf")
        self._community_cooldown = self._COMMUNITY_COOLDOWN_SECONDS

    # ====================================================================
    # Store
    # ====================================================================

    @staticmethod
    def _crossed_detection_threshold(prev_count: int, fact_count: int) -> bool:
        """Return True when growing from *prev_count* to *fact_count* facts
        first reaches 20 facts or crosses a multiple of 50."""
        return (prev_count < 20 <= fact_count) or (
            fact_count >= 20 and fact_count // 50 > prev_count // 50
        )

    def store(self, facts: list[Fact], window_id: str = "") -> None:
        """Ingest facts into the warm store and emit events.

        Facts without a ``source_window_id`` are stamped with *window_id*
        (the passed objects are mutated). Every fact the warm store reports
        as added is recorded in the event log and announced on the bus as
        ``FACT_CREATED``. The HNSW index is marked for rebuild.

        Parameters
        ----------
        facts : facts to ingest
        window_id : window the facts originate from; may be empty
        """
        # Set source_window_id on facts if not already set
        if window_id:
            for fact in facts:
                if not fact.source_window_id:
                    fact.source_window_id = window_id

        added = self._warm.add_facts(facts)
        for sf in added:
            self._event_log.record_fact_created(sf.fact, window_id)
            self._bus.publish(
                CKFEvent(CKFEventType.FACT_CREATED, {"fact_id": sf.id, "window_id": window_id})
            )
        self._hnsw_dirty = True

        # Auto-trigger community detection when fact count crosses thresholds
        # (every 50 new facts or when first reaching 20 facts). This ensures
        # community mode is usable during retrieval without explicit calls.
        if not (self._config.community_detect_enabled and added):
            return
        fact_count = len(self._warm._facts)
        prev_count = fact_count - len(added)
        if not self._crossed_detection_threshold(prev_count, fact_count):
            return
        if time.monotonic() - self._last_community_detect < self._community_cooldown:
            logger.debug("Community detection skipped (cooldown active)")
            return
        try:
            self.detect_communities()
            self._last_community_detect = time.monotonic()
        except Exception:
            # Best effort: ingestion must not fail because detection did.
            logger.warning("Auto community detection failed", exc_info=True)

    def store_edges(self, edges: list[FactEdge]) -> None:
        """Add edges to the fact graph, logging and publishing each one."""
        for edge in edges:
            self._warm._graph.add_edge(edge)
            self._event_log.record_edge_added(edge, "")
            self._bus.publish(
                CKFEvent(
                    CKFEventType.EDGE_ADDED,
                    {"source_id": edge.source_id, "target_id": edge.target_id},
                )
            )
        self._hnsw_dirty = True

    # ====================================================================
    # Retrieve — 4-mode merge
    # ====================================================================

    def retrieve(
        self,
        query_embedding: list[float] | None = None,
        seed_ids: set[str] | None = None,
        entity_type: str | None = None,
        relationship_type: str | RelationType | None = None,
        topic: str | None = None,
        modes: list[str] | None = None,
        budget: int = 200,
    ) -> MergeResult:
        """Retrieve facts using up to 4 modes, merged and ranked.

        Parameters
        ----------
        query_embedding : for semantic mode
        seed_ids : for graph_walk mode
        entity_type / relationship_type : for pattern mode
        topic : for community mode
        modes : subset of ["graph_walk", "pattern", "semantic", "community"]
            defaults to all applicable modes
        budget : max facts to return

        A mode only runs when it is both listed in *modes* and its input is
        provided. Candidates without an id or with fewer than 3 non-blank
        characters of text are dropped before merging.
        """
        if modes is None:
            # relationship_type alone is enough to enable pattern mode.
            modes = self._infer_modes(
                query_embedding,
                seed_ids,
                entity_type,
                topic,
                relationship_type=relationship_type,
            )

        graph = self._warm._graph
        mode_results: dict[str, list[tuple[Fact, float]]] = {}

        # Mode 1: Graph walk — score decays with hop distance.
        if "graph_walk" in modes and seed_ids:
            gw = graph_walk(graph, seed_ids, max_hops=2, max_results=budget)
            mode_results["graph_walk"] = [
                (f, 1.0 / (1.0 + gw.distances.get(f.id, 3))) for f in gw.facts
            ]

        # Mode 2: Pattern query — scored by confidence (0.5 when unset/zero).
        if "pattern" in modes and (entity_type or relationship_type):
            pq = pattern_query(graph, entity_type, relationship_type, max_results=budget)
            mode_results["pattern"] = [(f, f.confidence or 0.5) for f in pq.facts]

        # Mode 3: Semantic fallback
        if "semantic" in modes and query_embedding:
            self._ensure_hnsw()
            sem = semantic_fallback(
                query_embedding,
                self._warm._facts,
                top_k=budget,
                hnsw_index=self._hnsw,
            )
            mode_results["semantic"] = [
                (sf.fact, sem.scores.get(sf.id, 0.0)) for sf in sem.facts
            ]

        # Mode 4: Community summary
        if "community" in modes and topic:
            comm_facts: list[tuple[Fact, float]] = []
            for comm in self.community_summary(topic):
                for fid in comm.fact_ids:
                    fact = graph.nodes.get(fid)
                    if fact:
                        comm_facts.append((fact, fact.confidence or 0.5))
            mode_results["community"] = comm_facts[:budget]

        # Merge
        # Validate facts from all modes before merging (§audit G6)
        # NOTE(review): GarbageCollector documents tombstoned facts as
        # excluded from retrieval, but no tombstone check happens here —
        # confirm whether that filtering is expected at this point.
        for mode_name in list(mode_results):
            mode_results[mode_name] = [
                (f, s) for f, s in mode_results[mode_name]
                if f.id and f.text and len(f.text.strip()) >= 3
            ]

        fact_to_comm = (
            self._community_result.fact_to_community
            if self._community_result
            else None
        )
        return multi_mode_merge(
            mode_results,
            fact_to_community=fact_to_comm,
            max_results=budget,
        )

    # ====================================================================
    # Query (pattern shorthand)
    # ====================================================================

    def query(
        self,
        entity_type: str | None = None,
        relationship_type: str | RelationType | None = None,
        min_confidence: float = 0.0,
        max_results: int = 200,
    ) -> PatternQueryResult:
        """Convenience: pattern query on the fact graph."""
        return pattern_query(
            self._warm._graph, entity_type, relationship_type,
            min_confidence=min_confidence, max_results=max_results,
        )

    # ====================================================================
    # Graph walk
    # ====================================================================

    def graph_walk(
        self,
        seed_ids: set[str],
        max_hops: int = 2,
        max_results: int = 200,
    ) -> GraphWalkResult:
        """BFS traversal from seed facts."""
        # Inside a method the bare name resolves to the module-level
        # graph_walk function, not to this method.
        return graph_walk(self._warm._graph, seed_ids, max_hops, max_results)

    # ====================================================================
    # Community
    # ====================================================================

    def community_summary(self, topic: str) -> list[Community]:
        """Return communities matching *topic*.

        Runs community detection first when no result is cached and
        detection is enabled in the config.
        """
        if self._community_result is None and self._config.community_detect_enabled:
            self.detect_communities()
        return self._community.community_summary(self._warm._graph, topic)

    def detect_communities(self) -> CommunityResult:
        """Force community detection run.

        Caches the result and publishes ``COMMUNITY_UPDATED`` with the
        number of communities found.
        """
        self._community_result = self._community.detect(self._warm._graph)
        self._bus.publish(
            CKFEvent(CKFEventType.COMMUNITY_UPDATED, {"community_count": len(self._community_result.communities)})
        )
        return self._community_result

    # ====================================================================
    # Temporal query
    # ====================================================================

    def temporal_query(
        self,
        start_window: str,
        end_window: str,
    ) -> list[str]:
        """Return fact IDs active between two windows."""
        return self._event_log.facts_between(start_window, end_window)  # type: ignore[return-value]

    # ====================================================================
    # Persistence
    # ====================================================================

    def persist(self, path: str | Path) -> None:
        """Persist full state to cold storage, including community IDs."""
        community_map = (
            dict(self._community_result.fact_to_community)
            if self._community_result
            else {}
        )
        persist_to_cold(
            self._warm, self._event_log, str(path),
            community_map=community_map,
        )

    def restore(self, path: str | Path) -> list[str]:
        """Restore state from cold storage. Returns warnings.

        The HNSW index is marked for rebuild and the cached community
        result is cleared.
        """
        # NOTE(review): the restored community map is not applied; with the
        # cache cleared, communities are recomputed on the next
        # community_summary()/detect_communities() call. Confirm intended.
        _header, warnings, _community_map = restore_from_cold(
            self._warm, self._event_log, str(path)
        )
        self._hnsw_dirty = True
        self._community_result = None
        return warnings

    # ====================================================================
    # Subscribe (pub/sub)
    # ====================================================================

    def subscribe(self, event_type: CKFEventType, callback: EventCallback) -> None:
        """Register a callback for CKF events."""
        self._bus.subscribe(event_type, callback)

    # ====================================================================
    # Introspection
    # ====================================================================

    def fact_count(self) -> int:
        """Return the number of facts currently held in the warm store."""
        return len(self._warm._facts)

    def health(self) -> CKFHealth:
        """Return a health snapshot."""
        from .community import _check_leiden

        communities = (
            len(self._community_result.communities)
            if self._community_result
            else 0
        )
        return CKFHealth(
            fact_count=len(self._warm._facts),
            edge_count=len(self._warm._graph.edges),
            community_count=communities,
            event_count=len(self._event_log._events),
            tombstoned_count=self._gc.tombstone_count(),
            estimated_bytes=GarbageCollector.estimate_store_bytes(self._warm._facts),
            hnsw_active=self._hnsw is not None,
            leiden_available=_check_leiden(),
        )

    # ====================================================================
    # GC
    # ====================================================================

    def run_gc(self, current_window: int = 0) -> GCResult:
        """Run cross-session garbage collection.

        Parameters
        ----------
        current_window : window index forwarded to the collector

        Returns the collector's result for this pass.
        """
        estimated = GarbageCollector.estimate_store_bytes(self._warm._facts)
        result = self._gc.run(self._warm._facts, estimated, current_window)
        if result.purged:
            # Purged facts were deleted from the store; an index built
            # earlier would still reference them, so force a rebuild.
            self._hnsw_dirty = True
        return result

    def should_gc(self) -> bool:
        """Check if GC should be triggered."""
        estimated = GarbageCollector.estimate_store_bytes(self._warm._facts)
        return self._gc.should_gc(estimated)

    # ====================================================================
    # Internal helpers
    # ====================================================================

    def _infer_modes(
        self,
        query_embedding: list[float] | None,
        seed_ids: set[str] | None,
        entity_type: str | None,
        topic: str | None,
        relationship_type: str | RelationType | None = None,
    ) -> list[str]:
        """Auto-detect which modes to activate based on provided inputs.

        Pattern mode is enabled by either *entity_type* or
        *relationship_type*, matching the condition ``retrieve`` uses to run
        it. Returns an empty list when no input matches any mode.
        """
        modes: list[str] = []
        if seed_ids:
            modes.append("graph_walk")
        if entity_type or relationship_type:
            modes.append("pattern")
        if query_embedding:
            modes.append("semantic")
        if topic:
            modes.append("community")
        return modes

    def _ensure_hnsw(self) -> None:
        """Build/rebuild HNSW index if needed and hnswlib is available.

        Leaves ``self._hnsw`` as ``None`` when hnswlib is missing, the store
        holds fewer facts than ``hnsw_threshold``, no fact has a non-empty
        embedding, or building fails. Always clears the dirty flag.
        """
        if not self._hnsw_dirty:
            return

        facts = self._warm._facts

        # Determine embedding dimension from first available embedding
        dim = 0
        if _check_hnswlib() is not None and len(facts) >= self._config.hnsw_threshold:
            for sf in facts.values():
                if sf.has_embedding() and sf.embedding:
                    dim = len(sf.embedding)
                    break

        if dim == 0:
            self._hnsw = None
            self._hnsw_dirty = False
            return

        try:
            idx = HNSWIndex(dim=dim, max_elements=len(facts) + 1000)
            for sf in facts.values():
                emb = sf.embedding
                # Empty embeddings are treated as absent, as in the
                # dimension detection above.
                if emb is not None and len(emb) > 0:
                    idx.add(sf.id, emb)
            self._hnsw = idx
        except Exception:  # noqa: BLE001
            # Best effort: keep the traceback so the failure is diagnosable.
            logger.warning("Failed to build HNSW index, using brute-force", exc_info=True)
            self._hnsw = None
        self._hnsw_dirty = False
|
crp/ckf/gc.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""CKF cross-session garbage collection (§3.8).
|
|
4
|
+
|
|
5
|
+
gc_score formula determines fact retention priority.
|
|
6
|
+
Tombstone → purge lifecycle. Budget 500 MB, trigger 80%, target 70%.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
|
|
14
|
+
from crp.state.fact import StateFact
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
# Config
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
# Default store budget: 500 MB.
BUDGET_BYTES = 500 * 1024**2
# GC is triggered once usage reaches 80% of the budget ...
TRIGGER_RATIO = 0.80
# ... and aims to bring usage down to 70% of the budget.
TARGET_RATIO = 0.70
# Number of windows a tombstoned fact is retained before it is purged.
TOMBSTONE_AGE_WINDOWS = 50
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# ---------------------------------------------------------------------------
|
|
29
|
+
# GC score formula
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def gc_score(fact: StateFact, current_window: int = 0) -> float:
    """Score how much *fact* deserves to be retained by the GC.

    A larger value means the fact is more worth keeping. The score is a
    weighted blend of four signals:

        gc_score = 0.3 * confidence + 0.3 * freshness + 0.2 * usage + 0.2 * connectivity

    - confidence: ``fact.confidence`` capped at 1.0 (unset/zero counts as 0.0)
    - freshness: ``1 / (1 + 0.1 * age_in_windows)`` — 1.0 for a brand-new fact
    - usage: ``seen_count / 20`` capped at 1.0
    - connectivity: ``len(graph_edges) / 10`` capped at 1.0

    *current_window* is accepted for interface compatibility; the age is
    taken from ``fact.age_in_windows`` and the argument is not read.
    """
    w_confidence, w_freshness, w_usage, w_connectivity = 0.3, 0.3, 0.2, 0.2

    confidence = min(1.0, fact.confidence or 0.0)

    # Hyperbolic decay with age, measured in windows.
    windows_old = fact.age_in_windows
    freshness = 1.0 / (1.0 + windows_old * 0.1)

    # Both counters saturate: 20 sightings / 10 edges already score 1.0.
    usage = min(1.0, fact.seen_count / 20.0)
    edge_total = len(fact.graph_edges)
    connectivity = min(1.0, edge_total / 10.0)

    return (
        w_confidence * confidence
        + w_freshness * freshness
        + w_usage * usage
        + w_connectivity * connectivity
    )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# ---------------------------------------------------------------------------
|
|
61
|
+
# GC result
|
|
62
|
+
# ---------------------------------------------------------------------------
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass
class GCResult:
    """Outcome of a single garbage-collection pass."""

    # IDs newly tombstoned / permanently removed during this pass.
    tombstoned: list[str] = field(default_factory=list)
    purged: list[str] = field(default_factory=list)
    # Estimated bytes released by the purged facts.
    bytes_freed_estimate: int = 0
    # Size of the fact store before and after the pass.
    facts_before: int = 0
    facts_after: int = 0
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
# ---------------------------------------------------------------------------
|
|
77
|
+
# GarbageCollector
|
|
78
|
+
# ---------------------------------------------------------------------------
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class GarbageCollector:
    """Cross-session GC for the CKF fact store.

    Lifecycle: active → tombstoned → purged.
    Tombstoned facts are excluded from retrieval but retained for
    TOMBSTONE_AGE_WINDOWS before final purge.
    """

    def __init__(
        self,
        budget_bytes: int = BUDGET_BYTES,
        trigger_ratio: float = TRIGGER_RATIO,
        target_ratio: float = TARGET_RATIO,
    ) -> None:
        """Configure the collector.

        Parameters
        ----------
        budget_bytes : total byte budget for the fact store
        trigger_ratio : fraction of the budget at which ``should_gc`` fires
        target_ratio : fraction of the budget a pass tries to get down to
        """
        self._budget = budget_bytes
        self._trigger = trigger_ratio
        self._target = target_ratio
        self._tombstones: dict[str, int] = {}  # fact_id → window when tombstoned

    def should_gc(self, estimated_bytes: int) -> bool:
        """Return True if GC should run (estimated usage ≥ trigger)."""
        return estimated_bytes >= int(self._budget * self._trigger)

    def run(
        self,
        facts: dict[str, StateFact],
        estimated_bytes: int,
        current_window: int = 0,
    ) -> GCResult:
        """Execute a GC pass.

        1. Purge old tombstones (aged out).
        2. If still over target, tombstone lowest-scoring active facts.

        Parameters
        ----------
        facts : the fact store; purged entries are deleted from it in place
        estimated_bytes : estimated size of *facts* as passed in, i.e.
            including entries that are tombstoned but not yet purged
        current_window : current window index, used to age tombstones and
            to stamp new ones

        Returns a ``GCResult`` listing what was tombstoned and purged.
        """
        result = GCResult(facts_before=len(facts))
        target_bytes = int(self._budget * self._target)

        # --- Phase 1: Purge old tombstones ---
        expired = [
            fid
            for fid, tombstone_window in self._tombstones.items()
            if current_window - tombstone_window >= TOMBSTONE_AGE_WINDOWS
        ]
        for fid in expired:
            purged_fact = facts.pop(fid, None)
            if purged_fact is not None:
                result.bytes_freed_estimate += self._estimate_fact_bytes(purged_fact)
                result.purged.append(fid)
            # Drop the tombstone even if the fact already left the store.
            self._tombstones.pop(fid, None)

        # Recalculate
        estimated_bytes -= result.bytes_freed_estimate

        # Facts tombstoned by earlier passes are still present in ``facts``
        # and therefore still counted in ``estimated_bytes``. Phase 2 treats
        # tombstoning as freeing those bytes, so they must be discounted
        # here as well; otherwise every pass before the purge age would
        # tombstone a further batch of live facts for the same excess.
        estimated_bytes -= sum(
            self._estimate_fact_bytes(facts[fid])
            for fid in self._tombstones
            if fid in facts
        )

        # --- Phase 2: Tombstone low-value facts if still over target ---
        if estimated_bytes > target_bytes:
            # Score all active (non-tombstoned) facts.
            # NOTE(review): superseded facts are never candidates here, so
            # this pass never tombstones them — confirm that is intended.
            active: list[tuple[float, str]] = [
                (gc_score(sf, current_window), fid)
                for fid, sf in facts.items()
                if fid not in self._tombstones and not sf.is_superseded
            ]

            # Sort ascending — lowest score first (candidates for tombstoning)
            active.sort()

            for _score, fid in active:
                if estimated_bytes <= target_bytes:
                    break
                self._tombstones[fid] = current_window
                result.tombstoned.append(fid)
                estimated_bytes -= self._estimate_fact_bytes(facts[fid])

        result.facts_after = len(facts)
        return result

    def is_tombstoned(self, fact_id: str) -> bool:
        """Return True if *fact_id* is currently tombstoned."""
        return fact_id in self._tombstones

    def tombstone_count(self) -> int:
        """Return the number of currently tombstoned facts."""
        return len(self._tombstones)

    @staticmethod
    def _estimate_fact_bytes(sf: StateFact) -> int:
        """Rough byte estimate for a StateFact."""
        base = len(sf.text.encode("utf-8")) + 200  # overhead
        if sf.has_embedding():
            base += len(sf.embedding or []) * 4  # float32
        return base

    @staticmethod
    def estimate_store_bytes(facts: dict[str, StateFact]) -> int:
        """Estimate total bytes for the fact store."""
        return sum(GarbageCollector._estimate_fact_bytes(sf) for sf in facts.values())
|
crp/ckf/graph_walk.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""CKF Mode 1: Graph walk — BFS traversal from seed facts (§3.8).
|
|
4
|
+
|
|
5
|
+
``graph_walk(seed_facts, max_hops=2)`` returns a ranked list of facts
|
|
6
|
+
reachable within *max_hops* of the seed set, ordered by proximity.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from collections import deque
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
|
|
14
|
+
from crp.extraction.types import Fact, FactGraph
|
|
15
|
+
|
|
16
|
+
# ---------------------------------------------------------------------------
|
|
17
|
+
# Result type
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass
class GraphWalkResult:
    """Outcome of a graph walk query."""

    # Returned facts, ordered by hop distance and then confidence.
    facts: list[Fact] = field(default_factory=list)
    # Hop distance from the seed set for each returned fact ID.
    distances: dict[str, int] = field(default_factory=dict)
    # Number of nodes reached by the walk, before truncation.
    visited_count: int = 0
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ---------------------------------------------------------------------------
|
|
31
|
+
# BFS graph walk
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def graph_walk(
    graph: FactGraph,
    seed_ids: set[str],
    max_hops: int = 2,
    max_results: int = 200,
) -> GraphWalkResult:
    """Breadth-first walk over *graph* starting at *seed_ids*.

    Edges are followed in both directions, at most *max_hops* steps away
    from the seeds. Seeds that exist in the graph are part of the result at
    distance 0. The result lists at most *max_results* facts, nearest
    first and, within a distance, highest confidence first;
    ``visited_count`` counts every node reached, including those cut off.
    An empty seed set or an empty graph yields an empty result.
    """
    if not seed_ids or not graph.nodes:
        return GraphWalkResult()

    # Undirected adjacency map so neighbours are a dictionary lookup away.
    neighbours_of: dict[str, set[str]] = {}
    for edge in graph.edges:
        src, dst = edge.source_id, edge.target_id
        neighbours_of.setdefault(src, set()).add(dst)
        neighbours_of.setdefault(dst, set()).add(src)

    # Seeds unknown to the graph are ignored.
    hops: dict[str, int] = {sid: 0 for sid in seed_ids if sid in graph.nodes}
    frontier: deque[str] = deque(hops)

    while frontier:
        current = frontier.popleft()
        depth = hops[current]
        if depth >= max_hops:
            # Nodes on the outermost ring are kept but not expanded.
            continue
        for other in neighbours_of.get(current, ()):
            if other in hops or other not in graph.nodes:
                continue
            hops[other] = depth + 1
            frontier.append(other)

    # Negated confidence makes the ascending sort put confident facts first.
    order: list[tuple[int, float, str]] = []
    for fid, depth in hops.items():
        node = graph.nodes.get(fid)
        if node:
            order.append((depth, -(node.confidence or 0.0), fid))
    top = sorted(order)[:max_results]

    return GraphWalkResult(
        facts=[graph.nodes[fid] for _, _, fid in top],
        distances={fid: hops[fid] for _, _, fid in top},
        visited_count=len(hops),
    )
|