crprotocol 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. crp/__init__.py +126 -0
  2. crp/__main__.py +8 -0
  3. crp/_typing.py +27 -0
  4. crp/_version.py +5 -0
  5. crp/adapters.py +31 -0
  6. crp/advanced/__init__.py +40 -0
  7. crp/advanced/auto_ingest.py +400 -0
  8. crp/advanced/cqs.py +235 -0
  9. crp/advanced/cross_window.py +477 -0
  10. crp/advanced/curator.py +265 -0
  11. crp/advanced/feedback.py +146 -0
  12. crp/advanced/hierarchical.py +211 -0
  13. crp/advanced/meta_learning.py +401 -0
  14. crp/advanced/parallel.py +98 -0
  15. crp/advanced/review_cycle.py +329 -0
  16. crp/advanced/scale_mode.py +129 -0
  17. crp/advanced/source_grounding.py +207 -0
  18. crp/ckf/__init__.py +35 -0
  19. crp/ckf/community.py +377 -0
  20. crp/ckf/fabric.py +445 -0
  21. crp/ckf/gc.py +175 -0
  22. crp/ckf/graph_walk.py +87 -0
  23. crp/ckf/merge.py +133 -0
  24. crp/ckf/pattern_query.py +122 -0
  25. crp/ckf/pubsub.py +128 -0
  26. crp/ckf/semantic.py +207 -0
  27. crp/cli/__init__.py +7 -0
  28. crp/cli/main.py +329 -0
  29. crp/cli/sidecar.py +929 -0
  30. crp/cli/startup.py +272 -0
  31. crp/continuation/__init__.py +103 -0
  32. crp/continuation/completion.py +348 -0
  33. crp/continuation/degradation.py +157 -0
  34. crp/continuation/document_map.py +160 -0
  35. crp/continuation/flow.py +109 -0
  36. crp/continuation/gap.py +419 -0
  37. crp/continuation/manager.py +484 -0
  38. crp/continuation/quality_monitor.py +179 -0
  39. crp/continuation/stitch.py +419 -0
  40. crp/continuation/trigger.py +142 -0
  41. crp/continuation/voice.py +157 -0
  42. crp/core/__init__.py +69 -0
  43. crp/core/batch.py +77 -0
  44. crp/core/circuit_breaker.py +116 -0
  45. crp/core/config.py +377 -0
  46. crp/core/context_tools.py +540 -0
  47. crp/core/dispatch_router.py +3977 -0
  48. crp/core/errors.py +128 -0
  49. crp/core/extraction_facade.py +384 -0
  50. crp/core/facilitator.py +713 -0
  51. crp/core/idempotency.py +215 -0
  52. crp/core/orchestrator.py +1435 -0
  53. crp/core/relay_strategies.py +613 -0
  54. crp/core/security_manager.py +140 -0
  55. crp/core/session.py +134 -0
  56. crp/core/task_intent.py +36 -0
  57. crp/core/window.py +363 -0
  58. crp/envelope/__init__.py +30 -0
  59. crp/envelope/builder.py +288 -0
  60. crp/envelope/decomposer.py +236 -0
  61. crp/envelope/formatter.py +168 -0
  62. crp/envelope/packer.py +211 -0
  63. crp/envelope/reranker.py +209 -0
  64. crp/envelope/scoring.py +310 -0
  65. crp/extraction/__init__.py +45 -0
  66. crp/extraction/complexity.py +96 -0
  67. crp/extraction/contradiction.py +132 -0
  68. crp/extraction/pipeline.py +360 -0
  69. crp/extraction/quality_gate.py +237 -0
  70. crp/extraction/stage1_regex.py +173 -0
  71. crp/extraction/stage2_statistical.py +244 -0
  72. crp/extraction/stage3_gliner.py +210 -0
  73. crp/extraction/stage4_uie.py +183 -0
  74. crp/extraction/stage5_discourse.py +175 -0
  75. crp/extraction/stage6_llm.py +178 -0
  76. crp/extraction/structured_output.py +219 -0
  77. crp/extraction/types.py +299 -0
  78. crp/license_guard.py +722 -0
  79. crp/observability/__init__.py +30 -0
  80. crp/observability/audit.py +118 -0
  81. crp/observability/events.py +233 -0
  82. crp/observability/metrics.py +264 -0
  83. crp/observability/quality.py +135 -0
  84. crp/observability/structured_logging.py +81 -0
  85. crp/observability/telemetry.py +117 -0
  86. crp/provenance/__init__.py +314 -0
  87. crp/provenance/_embeddings.py +97 -0
  88. crp/provenance/_types.py +378 -0
  89. crp/provenance/attribution_scorer.py +252 -0
  90. crp/provenance/claim_detector.py +229 -0
  91. crp/provenance/contradiction_detector.py +243 -0
  92. crp/provenance/distortion_detector.py +397 -0
  93. crp/provenance/entailment_verifier.py +358 -0
  94. crp/provenance/fabrication_detector.py +203 -0
  95. crp/provenance/hallucination_scorer.py +320 -0
  96. crp/provenance/omission_analyzer.py +106 -0
  97. crp/provenance/provenance_chain.py +205 -0
  98. crp/provenance/report_generator.py +440 -0
  99. crp/providers/__init__.py +43 -0
  100. crp/providers/anthropic.py +270 -0
  101. crp/providers/base.py +135 -0
  102. crp/providers/custom.py +63 -0
  103. crp/providers/diagnostic.py +251 -0
  104. crp/providers/llamacpp.py +224 -0
  105. crp/providers/manager.py +139 -0
  106. crp/providers/ollama.py +243 -0
  107. crp/providers/openai.py +628 -0
  108. crp/providers/tokenizers.py +48 -0
  109. crp/py.typed +0 -0
  110. crp/resources/__init__.py +53 -0
  111. crp/resources/adaptive_allocator.py +525 -0
  112. crp/resources/cost_model.py +388 -0
  113. crp/resources/overhead_manager.py +217 -0
  114. crp/resources/resource_manager.py +262 -0
  115. crp/schemas/__init__.py +20 -0
  116. crp/schemas/cost-estimate.json +33 -0
  117. crp/schemas/crp-error.json +43 -0
  118. crp/schemas/envelope-preview.json +40 -0
  119. crp/schemas/persisted-state-header.json +27 -0
  120. crp/schemas/quality-report.json +94 -0
  121. crp/schemas/session-handle.json +33 -0
  122. crp/schemas/session-status.json +57 -0
  123. crp/schemas/stream-event.json +18 -0
  124. crp/schemas/task-intent.json +42 -0
  125. crp/security/__init__.py +93 -0
  126. crp/security/audit_trail.py +392 -0
  127. crp/security/binding.py +192 -0
  128. crp/security/compliance.py +813 -0
  129. crp/security/consent.py +593 -0
  130. crp/security/embedding_defense.py +161 -0
  131. crp/security/encryption.py +202 -0
  132. crp/security/injection.py +335 -0
  133. crp/security/integrity.py +267 -0
  134. crp/security/privacy.py +662 -0
  135. crp/security/quarantine.py +249 -0
  136. crp/security/rbac.py +221 -0
  137. crp/security/validation.py +164 -0
  138. crp/state/__init__.py +31 -0
  139. crp/state/cold_storage.py +258 -0
  140. crp/state/compaction.py +263 -0
  141. crp/state/critical_state.py +104 -0
  142. crp/state/event_log.py +313 -0
  143. crp/state/fact.py +189 -0
  144. crp/state/serialization.py +189 -0
  145. crp/state/session_cleanup.py +77 -0
  146. crp/state/snapshot.py +290 -0
  147. crp/state/warm_store.py +346 -0
  148. crprotocol-2.0.0.dist-info/METADATA +1295 -0
  149. crprotocol-2.0.0.dist-info/RECORD +153 -0
  150. crprotocol-2.0.0.dist-info/WHEEL +4 -0
  151. crprotocol-2.0.0.dist-info/entry_points.txt +2 -0
  152. crprotocol-2.0.0.dist-info/licenses/LICENSE.md +170 -0
  153. crprotocol-2.0.0.dist-info/licenses/NOTICE +18 -0
crp/ckf/fabric.py ADDED
@@ -0,0 +1,445 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Contextual Knowledge Fabric — unified 4-mode retrieval interface (§3.8).
4
+
5
+ The CKF is the top-level interface for fact storage, retrieval, community
6
+ detection, pub/sub events, and cross-session persistence.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ import time
13
+ from dataclasses import dataclass
14
+ from pathlib import Path
15
+
16
+ from crp.extraction.types import Fact, FactEdge, RelationType
17
+ from crp.state.cold_storage import persist_to_cold, restore_from_cold
18
+ from crp.state.event_log import FactEventLog
19
+ from crp.state.warm_store import WarmStateStore, WarmStoreConfig
20
+
21
+ from .community import Community, CommunityDetector, CommunityResult
22
+ from .gc import GarbageCollector, GCResult
23
+ from .graph_walk import GraphWalkResult, graph_walk
24
+ from .merge import MergeResult, multi_mode_merge
25
+ from .pattern_query import PatternQueryResult, pattern_query
26
+ from .pubsub import CKFEvent, CKFEventType, EventCallback, PubSubEventBus
27
+ from .semantic import HNSWIndex, _check_hnswlib, semantic_fallback
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Config
34
+ # ---------------------------------------------------------------------------
35
+
36
+
37
@dataclass
class CKFConfig:
    """Tunable settings for the Contextual Knowledge Fabric.

    All fields have defaults, so ``CKFConfig()`` yields a usable
    configuration.
    """

    # Capacity handed to the warm store (``WarmStoreConfig.max_facts``).
    max_facts: int = 10_000
    # Minimum number of stored facts before an HNSW index is built;
    # below this the semantic mode runs without an index.
    hnsw_threshold: int = 1000
    # Persistence location; empty string means "not configured".
    # NOTE(review): not read anywhere in this module — confirm the consumer.
    persist_path: str = ""
    # Garbage-collection byte budget (500 MiB by default).
    gc_budget_bytes: int = 500 * 1024 * 1024
    # Fraction of the budget at which GC should start.
    gc_trigger_ratio: float = 0.80
    # Fraction of the budget GC tries to get back down to.
    gc_target_ratio: float = 0.70
    # Master switch for community detection (automatic and on-demand).
    community_detect_enabled: bool = True
48
+
49
+
50
+ # ---------------------------------------------------------------------------
51
+ # Health report
52
+ # ---------------------------------------------------------------------------
53
+
54
+
55
@dataclass
class CKFHealth:
    """Point-in-time counters describing the state of a fabric instance.

    Every field defaults to an "empty" value so a report can be built
    incrementally or compared against ``CKFHealth()``.
    """

    # Number of facts currently held in the warm store.
    fact_count: int = 0
    # Number of edges in the fact graph.
    edge_count: int = 0
    # Number of communities in the cached detection result (0 if none).
    community_count: int = 0
    # Number of entries in the fact event log.
    event_count: int = 0
    # Number of facts currently tombstoned by the garbage collector.
    tombstoned_count: int = 0
    # Rough byte estimate of the fact store.
    estimated_bytes: int = 0
    # True when an HNSW index is currently built.
    hnsw_active: bool = False
    # True when the Leiden community backend reports itself as available.
    leiden_available: bool = False
67
+
68
+
69
+ # ---------------------------------------------------------------------------
70
+ # ContextualKnowledgeFabric
71
+ # ---------------------------------------------------------------------------
72
+
73
+
74
class ContextualKnowledgeFabric:
    """Unified interface for fact storage and 4-mode retrieval (§3.8).

    Methods (per spec 4F.1a):
    - store(facts) / retrieve(query, modes, budget)
    - query(pattern) / persist(path) / restore(path)
    - fact_count() / health()
    - temporal_query(window_range)
    - graph_walk(seeds, hops) / community_summary(topic)
    - subscribe(event, callback)
    """

    # Minimum number of seconds between two automatic community-detection
    # runs triggered from ``store``.
    _COMMUNITY_COOLDOWN_SECONDS = 30.0

    def __init__(self, config: CKFConfig | None = None) -> None:
        """Create the fabric and its subsystems.

        Parameters
        ----------
        config : settings to use; ``CKFConfig()`` defaults when omitted.
        """
        self._config = config or CKFConfig()

        # Core stores
        self._warm = WarmStateStore(
            WarmStoreConfig(max_facts=self._config.max_facts)
        )
        self._event_log = FactEventLog()

        # CKF subsystems
        self._bus = PubSubEventBus()
        self._community = CommunityDetector()
        self._gc = GarbageCollector(
            budget_bytes=self._config.gc_budget_bytes,
            trigger_ratio=self._config.gc_trigger_ratio,
            target_ratio=self._config.gc_target_ratio,
        )

        # HNSW index (built lazily when needed)
        self._hnsw: HNSWIndex | None = None
        self._hnsw_dirty: bool = True

        # Community cache
        self._community_result: CommunityResult | None = None

        # Cooldown for community detection to avoid excessive re-computation.
        # The reference point of time.monotonic() is undefined, so start at
        # -inf: the very first detection must never be mistaken for "too soon".
        self._last_community_detect: float = float("-inf")
        self._community_cooldown = self._COMMUNITY_COOLDOWN_SECONDS

    # ====================================================================
    # Store
    # ====================================================================

    def store(self, facts: list[Fact], window_id: str = "") -> None:
        """Ingest facts into the warm store and emit events.

        Facts without a ``source_window_id`` are stamped with *window_id*
        (when one is given). For every fact the warm store reports as added,
        a creation event is logged and a ``FACT_CREATED`` event is published.
        The HNSW index is marked stale, and community detection may be
        triggered automatically (see ``_auto_detect_communities``).
        """
        if window_id:
            for fact in facts:
                if not fact.source_window_id:
                    fact.source_window_id = window_id

        added = self._warm.add_facts(facts)
        for sf in added:
            self._event_log.record_fact_created(sf.fact, window_id)
            self._bus.publish(
                CKFEvent(
                    CKFEventType.FACT_CREATED,
                    {"fact_id": sf.id, "window_id": window_id},
                )
            )
        self._hnsw_dirty = True

        if self._config.community_detect_enabled and added:
            self._auto_detect_communities(len(added))

    def _auto_detect_communities(self, added_count: int) -> None:
        """Run community detection when the fact count crosses a threshold.

        Detection fires when the store first reaches 20 facts, and afterwards
        each time the count crosses a multiple of 50. This keeps community
        mode usable during retrieval without explicit calls. Runs are
        rate-limited by the cooldown, and failures are logged rather than
        propagated so that ingestion never fails because of detection.
        """
        total = len(self._warm._facts)
        before = total - added_count
        reached_minimum = before < 20 <= total
        crossed_step = total >= 20 and total // 50 > before // 50
        if not (reached_minimum or crossed_step):
            return

        if time.monotonic() - self._last_community_detect < self._community_cooldown:
            logger.debug("Community detection skipped (cooldown active)")
            return

        try:
            self.detect_communities()
            self._last_community_detect = time.monotonic()
        except Exception:
            logger.warning("Auto community detection failed", exc_info=True)

    def store_edges(self, edges: list[FactEdge]) -> None:
        """Add edges to the fact graph, logging and publishing each one."""
        for edge in edges:
            self._warm._graph.add_edge(edge)
            self._event_log.record_edge_added(edge, "")
            self._bus.publish(
                CKFEvent(
                    CKFEventType.EDGE_ADDED,
                    {"source_id": edge.source_id, "target_id": edge.target_id},
                )
            )
        self._hnsw_dirty = True

    # ====================================================================
    # Retrieve — 4-mode merge
    # ====================================================================

    def retrieve(
        self,
        query_embedding: list[float] | None = None,
        seed_ids: set[str] | None = None,
        entity_type: str | None = None,
        relationship_type: str | RelationType | None = None,
        topic: str | None = None,
        modes: list[str] | None = None,
        budget: int = 200,
    ) -> MergeResult:
        """Retrieve facts using up to 4 modes, merged and ranked.

        Parameters
        ----------
        query_embedding : for semantic mode
        seed_ids : for graph_walk mode
        entity_type / relationship_type : for pattern mode
        topic : for community mode
        modes : subset of ["graph_walk", "pattern", "semantic", "community"]
            defaults to all modes whose inputs were supplied
        budget : max facts to return

        A mode only runs when it is both listed in *modes* and its input is
        present. Facts with an empty id, fewer than 3 non-blank characters of
        text, or a GC tombstone are dropped before merging.
        """
        if modes is None:
            modes = self._infer_modes(
                query_embedding,
                seed_ids,
                entity_type,
                topic,
                relationship_type=relationship_type,
            )

        graph = self._warm._graph
        mode_results: dict[str, list[tuple[Fact, float]]] = {}

        # Mode 1: Graph walk — score decays with hop distance.
        if "graph_walk" in modes and seed_ids:
            gw = graph_walk(graph, seed_ids, max_hops=2, max_results=budget)
            mode_results["graph_walk"] = [
                (f, 1.0 / (1.0 + gw.distances.get(f.id, 3))) for f in gw.facts
            ]

        # Mode 2: Pattern query — scored by fact confidence.
        if "pattern" in modes and (entity_type or relationship_type):
            pq = pattern_query(graph, entity_type, relationship_type, max_results=budget)
            mode_results["pattern"] = [(f, f.confidence or 0.5) for f in pq.facts]

        # Mode 3: Semantic fallback
        if "semantic" in modes and query_embedding:
            self._ensure_hnsw()
            sem = semantic_fallback(
                query_embedding,
                self._warm._facts,
                top_k=budget,
                hnsw_index=self._hnsw,
            )
            mode_results["semantic"] = [
                (sf.fact, sem.scores.get(sf.id, 0.0)) for sf in sem.facts
            ]

        # Mode 4: Community summary — members of every matching community.
        if "community" in modes and topic:
            comm_facts: list[tuple[Fact, float]] = []
            for comm in self.community_summary(topic):
                for fid in comm.fact_ids:
                    fact = graph.nodes.get(fid)
                    if fact:
                        comm_facts.append((fact, fact.confidence or 0.5))
            mode_results["community"] = comm_facts[:budget]

        # Validate facts from all modes before merging (§audit G6) and drop
        # facts the garbage collector has tombstoned: those stay in the store
        # until purged but must not be served.
        is_tombstoned = self._gc.is_tombstoned
        for mode_name in list(mode_results):
            mode_results[mode_name] = [
                (f, s)
                for f, s in mode_results[mode_name]
                if f.id
                and f.text
                and len(f.text.strip()) >= 3
                and not is_tombstoned(f.id)
            ]

        fact_to_comm = (
            self._community_result.fact_to_community
            if self._community_result
            else None
        )
        return multi_mode_merge(
            mode_results,
            fact_to_community=fact_to_comm,
            max_results=budget,
        )

    # ====================================================================
    # Query (pattern shorthand)
    # ====================================================================

    def query(
        self,
        entity_type: str | None = None,
        relationship_type: str | RelationType | None = None,
        min_confidence: float = 0.0,
        max_results: int = 200,
    ) -> PatternQueryResult:
        """Convenience: pattern query on the fact graph."""
        return pattern_query(
            self._warm._graph,
            entity_type,
            relationship_type,
            min_confidence=min_confidence,
            max_results=max_results,
        )

    # ====================================================================
    # Graph walk
    # ====================================================================

    def graph_walk(
        self,
        seed_ids: set[str],
        max_hops: int = 2,
        max_results: int = 200,
    ) -> GraphWalkResult:
        """BFS traversal from seed facts."""
        # Inside a method body the bare name resolves to the module-level
        # graph_walk function, not to this method.
        return graph_walk(self._warm._graph, seed_ids, max_hops, max_results)

    # ====================================================================
    # Community
    # ====================================================================

    def community_summary(self, topic: str) -> list[Community]:
        """Return communities matching *topic*.

        Runs detection first when no result is cached and detection is
        enabled in the config.
        """
        if self._community_result is None and self._config.community_detect_enabled:
            self.detect_communities()
        return self._community.community_summary(self._warm._graph, topic)

    def detect_communities(self) -> CommunityResult:
        """Force a community detection run, cache and announce the result."""
        result = self._community.detect(self._warm._graph)
        self._community_result = result
        self._bus.publish(
            CKFEvent(
                CKFEventType.COMMUNITY_UPDATED,
                {"community_count": len(result.communities)},
            )
        )
        return result

    # ====================================================================
    # Temporal query
    # ====================================================================

    def temporal_query(
        self,
        start_window: str,
        end_window: str,
    ) -> list[str]:
        """Return fact IDs active between two windows."""
        return self._event_log.facts_between(start_window, end_window)  # type: ignore[return-value]

    # ====================================================================
    # Persistence
    # ====================================================================

    def persist(self, path: str | Path) -> None:
        """Persist full state to cold storage, including community IDs."""
        community_map = (
            dict(self._community_result.fact_to_community)
            if self._community_result
            else {}
        )
        persist_to_cold(
            self._warm,
            self._event_log,
            str(path),
            community_map=community_map,
        )

    def restore(self, path: str | Path) -> list[str]:
        """Restore state from cold storage. Returns warnings.

        The HNSW index is marked stale and the cached community result is
        cleared, so both are recomputed on next use.
        """
        # NOTE(review): the persisted community map is read back but not
        # re-applied; communities are re-detected lazily instead. Confirm
        # this is intended.
        _header, warnings, _community_map = restore_from_cold(
            self._warm, self._event_log, str(path)
        )
        self._hnsw_dirty = True
        self._community_result = None
        return warnings

    # ====================================================================
    # Subscribe (pub/sub)
    # ====================================================================

    def subscribe(self, event_type: CKFEventType, callback: EventCallback) -> None:
        """Register a callback for CKF events."""
        self._bus.subscribe(event_type, callback)

    # ====================================================================
    # Introspection
    # ====================================================================

    def fact_count(self) -> int:
        """Return the number of facts in the warm store."""
        return len(self._warm._facts)

    def health(self) -> CKFHealth:
        """Return a health snapshot."""
        from .community import _check_leiden

        communities = (
            len(self._community_result.communities)
            if self._community_result
            else 0
        )
        return CKFHealth(
            fact_count=len(self._warm._facts),
            edge_count=len(self._warm._graph.edges),
            community_count=communities,
            event_count=len(self._event_log._events),
            tombstoned_count=self._gc.tombstone_count(),
            estimated_bytes=GarbageCollector.estimate_store_bytes(self._warm._facts),
            hnsw_active=self._hnsw is not None,
            leiden_available=_check_leiden(),
        )

    # ====================================================================
    # GC
    # ====================================================================

    def run_gc(self, current_window: int = 0) -> GCResult:
        """Run cross-session garbage collection."""
        estimated = GarbageCollector.estimate_store_bytes(self._warm._facts)
        return self._gc.run(self._warm._facts, estimated, current_window)

    def should_gc(self) -> bool:
        """Check if GC should be triggered."""
        estimated = GarbageCollector.estimate_store_bytes(self._warm._facts)
        return self._gc.should_gc(estimated)

    # ====================================================================
    # Internal helpers
    # ====================================================================

    def _infer_modes(
        self,
        query_embedding: list[float] | None,
        seed_ids: set[str] | None,
        entity_type: str | None,
        topic: str | None,
        relationship_type: str | RelationType | None = None,
    ) -> list[str]:
        """Auto-detect which modes to activate based on provided inputs.

        Pattern mode is selected when either *entity_type* or
        *relationship_type* is given, mirroring the condition under which
        ``retrieve`` runs it. Returns an empty list when no input matches
        any mode.
        """
        candidates = (
            ("graph_walk", seed_ids),
            ("pattern", entity_type or relationship_type),
            ("semantic", query_embedding),
            ("community", topic),
        )
        return [mode for mode, given in candidates if given]

    def _ensure_hnsw(self) -> None:
        """Build/rebuild HNSW index if needed and hnswlib is available."""
        if not self._hnsw_dirty:
            return
        self._hnsw = self._build_hnsw()
        self._hnsw_dirty = False

    def _build_hnsw(self) -> HNSWIndex | None:
        """Return a fresh index over all embedded facts, or None.

        None is returned when hnswlib is unavailable, the store holds fewer
        facts than ``hnsw_threshold``, no fact carries an embedding, or the
        build raises.
        """
        facts = self._warm._facts
        if _check_hnswlib() is None:
            return None
        if len(facts) < self._config.hnsw_threshold:
            return None

        # Determine embedding dimension from first available embedding
        dim = 0
        for sf in facts.values():
            if sf.has_embedding() and sf.embedding:
                dim = len(sf.embedding)
                break
        if dim == 0:
            return None

        try:
            # Headroom of 1000 elements over the current fact count.
            idx = HNSWIndex(dim=dim, max_elements=len(facts) + 1000)
            for sf in facts.values():
                emb = sf.embedding
                if emb is not None:
                    idx.add(sf.id, emb)
        except Exception:  # noqa: BLE001
            logger.warning(
                "Failed to build HNSW index, using brute-force", exc_info=True
            )
            return None
        return idx
crp/ckf/gc.py ADDED
@@ -0,0 +1,175 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """CKF cross-session garbage collection (§3.8).
4
+
5
+ gc_score formula determines fact retention priority.
6
+ Tombstone → purge lifecycle. Budget 500 MB, trigger 80%, target 70%.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ from dataclasses import dataclass, field
13
+
14
+ from crp.state.fact import StateFact
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # Config
20
+ # ---------------------------------------------------------------------------
21
+
22
BUDGET_BYTES = 500 * 1024 * 1024  # 500 MB
TRIGGER_RATIO = 0.80  # Trigger GC at 80% of budget
TARGET_RATIO = 0.70  # Reduce to 70% of budget
TOMBSTONE_AGE_WINDOWS = 50  # Windows before tombstone → purge


# ---------------------------------------------------------------------------
# GC score formula
# ---------------------------------------------------------------------------


def gc_score(fact: StateFact, current_window: int = 0) -> float:
    """Compute GC retention score for a fact.

    Higher score = more worth keeping. Components (each capped at 1.0):
    - Confidence: raw confidence value (missing confidence counts as 0)
    - Freshness: ``1 / (1 + 0.1 * age)``; 1.0 at age 0, decaying with age
    - Usage: ``seen_count`` normalised against 20
    - Graph connectivity: number of ``graph_edges`` normalised against 10

    Formula:
        gc_score = 0.3 * confidence + 0.3 * freshness + 0.2 * usage + 0.2 * connectivity

    *current_window* is accepted for interface compatibility; the age is
    taken from ``fact.age_in_windows``.
    """
    confidence = min(1.0, fact.confidence or 0.0)
    freshness = 1.0 / (1.0 + fact.age_in_windows * 0.1)
    usage = min(1.0, fact.seen_count / 20.0)
    connectivity = min(1.0, len(fact.graph_edges) / 10.0)
    return 0.3 * confidence + 0.3 * freshness + 0.2 * usage + 0.2 * connectivity


# ---------------------------------------------------------------------------
# GC result
# ---------------------------------------------------------------------------


@dataclass
class GCResult:
    """Result of a GC pass."""

    # IDs newly tombstoned during this pass, lowest score first.
    tombstoned: list[str] = field(default_factory=list)
    # IDs whose tombstone aged out during this pass.
    purged: list[str] = field(default_factory=list)
    # Estimated bytes released by the facts deleted during this pass.
    bytes_freed_estimate: int = 0
    # Size of the fact dict before / after the pass.
    facts_before: int = 0
    facts_after: int = 0


# ---------------------------------------------------------------------------
# GarbageCollector
# ---------------------------------------------------------------------------


class GarbageCollector:
    """Cross-session GC for the CKF fact store.

    Lifecycle: active → tombstoned → purged.
    Tombstoned facts are excluded from retrieval but retained for
    TOMBSTONE_AGE_WINDOWS before final purge.
    """

    def __init__(
        self,
        budget_bytes: int = BUDGET_BYTES,
        trigger_ratio: float = TRIGGER_RATIO,
        target_ratio: float = TARGET_RATIO,
    ) -> None:
        """Configure the byte budget and the trigger / target fractions."""
        self._budget = budget_bytes
        self._trigger = trigger_ratio
        self._target = target_ratio
        self._tombstones: dict[str, int] = {}  # fact_id → window when tombstoned

    def should_gc(self, estimated_bytes: int) -> bool:
        """Return True if GC should run (estimated usage ≥ trigger)."""
        return estimated_bytes >= int(self._budget * self._trigger)

    def run(
        self,
        facts: dict[str, StateFact],
        estimated_bytes: int,
        current_window: int = 0,
    ) -> GCResult:
        """Execute a GC pass.

        1. Purge old tombstones (aged out); the facts are deleted from
           *facts* in place.
        2. If still over target, tombstone lowest-scoring active facts.

        Parameters
        ----------
        facts : the live fact dict; mutated by phase 1
        estimated_bytes : caller's byte estimate for *facts* as passed in
        current_window : current window number, used for tombstone ageing

        Returns a ``GCResult`` describing what was tombstoned and purged.
        """
        result = GCResult(facts_before=len(facts))
        target_bytes = int(self._budget * self._target)

        # --- Phase 1: Purge old tombstones ---
        aged_out = [
            fid
            for fid, tombstoned_at in self._tombstones.items()
            if current_window - tombstoned_at >= TOMBSTONE_AGE_WINDOWS
        ]
        for fid in aged_out:
            if fid in facts:
                result.bytes_freed_estimate += self._estimate_fact_bytes(facts[fid])
                del facts[fid]
            result.purged.append(fid)
            self._tombstones.pop(fid, None)

        # Recalculate
        estimated_bytes -= result.bytes_freed_estimate

        # Facts tombstoned on an earlier pass are still in the store, and so
        # still in the caller's estimate, until they age out. Their bytes are
        # already scheduled for release; without this discount every pass
        # before the purge would tombstone a further batch of active facts.
        estimated_bytes -= sum(
            self._estimate_fact_bytes(facts[fid])
            for fid in self._tombstones
            if fid in facts
        )

        # --- Phase 2: Tombstone low-value facts if still over target ---
        if estimated_bytes > target_bytes:
            # Score all active (non-tombstoned, non-superseded) facts;
            # ascending sort puts the lowest-scoring candidates first.
            candidates = sorted(
                (gc_score(sf, current_window), fid)
                for fid, sf in facts.items()
                if fid not in self._tombstones and not sf.is_superseded
            )
            for _score, fid in candidates:
                if estimated_bytes <= target_bytes:
                    break
                self._tombstones[fid] = current_window
                result.tombstoned.append(fid)
                estimated_bytes -= self._estimate_fact_bytes(facts[fid])

        result.facts_after = len(facts)
        return result

    def is_tombstoned(self, fact_id: str) -> bool:
        """Return True if *fact_id* is currently tombstoned."""
        return fact_id in self._tombstones

    def tombstone_count(self) -> int:
        """Return the number of currently tombstoned facts."""
        return len(self._tombstones)

    @staticmethod
    def _estimate_fact_bytes(sf: StateFact) -> int:
        """Rough byte estimate for a StateFact."""
        size = len(sf.text.encode("utf-8")) + 200  # overhead
        if sf.has_embedding():
            size += len(sf.embedding or []) * 4  # float32
        return size

    @staticmethod
    def estimate_store_bytes(facts: dict[str, StateFact]) -> int:
        """Estimate total bytes for the fact store."""
        return sum(
            GarbageCollector._estimate_fact_bytes(sf) for sf in facts.values()
        )
crp/ckf/graph_walk.py ADDED
@@ -0,0 +1,87 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """CKF Mode 1: Graph walk — BFS traversal from seed facts (§3.8).
4
+
5
+ ``graph_walk(graph, seed_ids, max_hops=2)`` returns a ranked list of facts
6
+ reachable within *max_hops* of the seed set, ordered by proximity.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from collections import deque
12
+ from dataclasses import dataclass, field
13
+
14
+ from crp.extraction.types import Fact, FactGraph
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Result type
18
+ # ---------------------------------------------------------------------------
19
+
20
+
21
@dataclass
class GraphWalkResult:
    """Outcome of a graph walk."""

    # Facts in ranked order (nearest first, then most confident).
    facts: list[Fact] = field(default_factory=list)
    # Hop distance from the seed set for every returned fact id.
    distances: dict[str, int] = field(default_factory=dict)
    # Number of nodes reached by the walk, before truncation to max_results.
    visited_count: int = 0


# ---------------------------------------------------------------------------
# BFS graph walk
# ---------------------------------------------------------------------------


def graph_walk(
    graph: FactGraph,
    seed_ids: set[str],
    max_hops: int = 2,
    max_results: int = 200,
) -> GraphWalkResult:
    """Breadth-first walk outward from *seed_ids*, at most *max_hops* deep.

    Edges are followed in both directions. Seeds that exist in the graph are
    part of the result at distance 0; unknown seed ids and edge endpoints
    that are not graph nodes are ignored. The result lists at most
    *max_results* facts ordered by hop distance, then by descending
    confidence, then by fact id.
    """
    nodes = graph.nodes
    if not seed_ids or not nodes:
        return GraphWalkResult()

    # Undirected adjacency map so each neighbour lookup is a dict access.
    links: dict[str, set[str]] = {}
    for e in graph.edges:
        src, dst = e.source_id, e.target_id
        links.setdefault(src, set()).add(dst)
        links.setdefault(dst, set()).add(src)

    # `hops` doubles as the visited set; the queue only needs the ids.
    hops: dict[str, int] = {}
    frontier: deque[str] = deque()
    for seed in seed_ids:
        if seed in nodes:
            hops[seed] = 0
            frontier.append(seed)

    while frontier:
        current = frontier.popleft()
        depth = hops[current]
        if depth >= max_hops:
            continue
        for nxt in links.get(current, set()):
            if nxt in hops or nxt not in nodes:
                continue
            hops[nxt] = depth + 1
            frontier.append(nxt)

    # Negated confidence makes the ascending sort put confident facts first.
    keyed: list[tuple[int, float, str]] = []
    for fid, depth in hops.items():
        fact = nodes.get(fid)
        if not fact:
            continue
        keyed.append((depth, -(fact.confidence or 0.0), fid))
    keyed.sort()

    top_ids = [entry[2] for entry in keyed[:max_results]]
    return GraphWalkResult(
        facts=[nodes[fid] for fid in top_ids],
        distances={fid: hops[fid] for fid in top_ids},
        visited_count=len(hops),
    )