crprotocol 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. crp/__init__.py +126 -0
  2. crp/__main__.py +8 -0
  3. crp/_typing.py +27 -0
  4. crp/_version.py +5 -0
  5. crp/adapters.py +31 -0
  6. crp/advanced/__init__.py +40 -0
  7. crp/advanced/auto_ingest.py +400 -0
  8. crp/advanced/cqs.py +235 -0
  9. crp/advanced/cross_window.py +477 -0
  10. crp/advanced/curator.py +265 -0
  11. crp/advanced/feedback.py +146 -0
  12. crp/advanced/hierarchical.py +211 -0
  13. crp/advanced/meta_learning.py +401 -0
  14. crp/advanced/parallel.py +98 -0
  15. crp/advanced/review_cycle.py +329 -0
  16. crp/advanced/scale_mode.py +129 -0
  17. crp/advanced/source_grounding.py +207 -0
  18. crp/ckf/__init__.py +35 -0
  19. crp/ckf/community.py +377 -0
  20. crp/ckf/fabric.py +445 -0
  21. crp/ckf/gc.py +175 -0
  22. crp/ckf/graph_walk.py +87 -0
  23. crp/ckf/merge.py +133 -0
  24. crp/ckf/pattern_query.py +122 -0
  25. crp/ckf/pubsub.py +128 -0
  26. crp/ckf/semantic.py +207 -0
  27. crp/cli/__init__.py +7 -0
  28. crp/cli/main.py +329 -0
  29. crp/cli/sidecar.py +929 -0
  30. crp/cli/startup.py +272 -0
  31. crp/continuation/__init__.py +103 -0
  32. crp/continuation/completion.py +348 -0
  33. crp/continuation/degradation.py +157 -0
  34. crp/continuation/document_map.py +160 -0
  35. crp/continuation/flow.py +109 -0
  36. crp/continuation/gap.py +419 -0
  37. crp/continuation/manager.py +484 -0
  38. crp/continuation/quality_monitor.py +179 -0
  39. crp/continuation/stitch.py +419 -0
  40. crp/continuation/trigger.py +142 -0
  41. crp/continuation/voice.py +157 -0
  42. crp/core/__init__.py +69 -0
  43. crp/core/batch.py +77 -0
  44. crp/core/circuit_breaker.py +116 -0
  45. crp/core/config.py +377 -0
  46. crp/core/context_tools.py +540 -0
  47. crp/core/dispatch_router.py +3977 -0
  48. crp/core/errors.py +128 -0
  49. crp/core/extraction_facade.py +384 -0
  50. crp/core/facilitator.py +713 -0
  51. crp/core/idempotency.py +215 -0
  52. crp/core/orchestrator.py +1435 -0
  53. crp/core/relay_strategies.py +613 -0
  54. crp/core/security_manager.py +140 -0
  55. crp/core/session.py +134 -0
  56. crp/core/task_intent.py +36 -0
  57. crp/core/window.py +363 -0
  58. crp/envelope/__init__.py +30 -0
  59. crp/envelope/builder.py +288 -0
  60. crp/envelope/decomposer.py +236 -0
  61. crp/envelope/formatter.py +168 -0
  62. crp/envelope/packer.py +211 -0
  63. crp/envelope/reranker.py +209 -0
  64. crp/envelope/scoring.py +310 -0
  65. crp/extraction/__init__.py +45 -0
  66. crp/extraction/complexity.py +96 -0
  67. crp/extraction/contradiction.py +132 -0
  68. crp/extraction/pipeline.py +360 -0
  69. crp/extraction/quality_gate.py +237 -0
  70. crp/extraction/stage1_regex.py +173 -0
  71. crp/extraction/stage2_statistical.py +244 -0
  72. crp/extraction/stage3_gliner.py +210 -0
  73. crp/extraction/stage4_uie.py +183 -0
  74. crp/extraction/stage5_discourse.py +175 -0
  75. crp/extraction/stage6_llm.py +178 -0
  76. crp/extraction/structured_output.py +219 -0
  77. crp/extraction/types.py +299 -0
  78. crp/license_guard.py +722 -0
  79. crp/observability/__init__.py +30 -0
  80. crp/observability/audit.py +118 -0
  81. crp/observability/events.py +233 -0
  82. crp/observability/metrics.py +264 -0
  83. crp/observability/quality.py +135 -0
  84. crp/observability/structured_logging.py +81 -0
  85. crp/observability/telemetry.py +117 -0
  86. crp/provenance/__init__.py +314 -0
  87. crp/provenance/_embeddings.py +97 -0
  88. crp/provenance/_types.py +378 -0
  89. crp/provenance/attribution_scorer.py +252 -0
  90. crp/provenance/claim_detector.py +229 -0
  91. crp/provenance/contradiction_detector.py +243 -0
  92. crp/provenance/distortion_detector.py +397 -0
  93. crp/provenance/entailment_verifier.py +358 -0
  94. crp/provenance/fabrication_detector.py +203 -0
  95. crp/provenance/hallucination_scorer.py +320 -0
  96. crp/provenance/omission_analyzer.py +106 -0
  97. crp/provenance/provenance_chain.py +205 -0
  98. crp/provenance/report_generator.py +440 -0
  99. crp/providers/__init__.py +43 -0
  100. crp/providers/anthropic.py +270 -0
  101. crp/providers/base.py +135 -0
  102. crp/providers/custom.py +63 -0
  103. crp/providers/diagnostic.py +251 -0
  104. crp/providers/llamacpp.py +224 -0
  105. crp/providers/manager.py +139 -0
  106. crp/providers/ollama.py +243 -0
  107. crp/providers/openai.py +628 -0
  108. crp/providers/tokenizers.py +48 -0
  109. crp/py.typed +0 -0
  110. crp/resources/__init__.py +53 -0
  111. crp/resources/adaptive_allocator.py +525 -0
  112. crp/resources/cost_model.py +388 -0
  113. crp/resources/overhead_manager.py +217 -0
  114. crp/resources/resource_manager.py +262 -0
  115. crp/schemas/__init__.py +20 -0
  116. crp/schemas/cost-estimate.json +33 -0
  117. crp/schemas/crp-error.json +43 -0
  118. crp/schemas/envelope-preview.json +40 -0
  119. crp/schemas/persisted-state-header.json +27 -0
  120. crp/schemas/quality-report.json +94 -0
  121. crp/schemas/session-handle.json +33 -0
  122. crp/schemas/session-status.json +57 -0
  123. crp/schemas/stream-event.json +18 -0
  124. crp/schemas/task-intent.json +42 -0
  125. crp/security/__init__.py +93 -0
  126. crp/security/audit_trail.py +392 -0
  127. crp/security/binding.py +192 -0
  128. crp/security/compliance.py +813 -0
  129. crp/security/consent.py +593 -0
  130. crp/security/embedding_defense.py +161 -0
  131. crp/security/encryption.py +202 -0
  132. crp/security/injection.py +335 -0
  133. crp/security/integrity.py +267 -0
  134. crp/security/privacy.py +662 -0
  135. crp/security/quarantine.py +249 -0
  136. crp/security/rbac.py +221 -0
  137. crp/security/validation.py +164 -0
  138. crp/state/__init__.py +31 -0
  139. crp/state/cold_storage.py +258 -0
  140. crp/state/compaction.py +263 -0
  141. crp/state/critical_state.py +104 -0
  142. crp/state/event_log.py +313 -0
  143. crp/state/fact.py +189 -0
  144. crp/state/serialization.py +189 -0
  145. crp/state/session_cleanup.py +77 -0
  146. crp/state/snapshot.py +290 -0
  147. crp/state/warm_store.py +346 -0
  148. crprotocol-2.0.0.dist-info/METADATA +1295 -0
  149. crprotocol-2.0.0.dist-info/RECORD +153 -0
  150. crprotocol-2.0.0.dist-info/WHEEL +4 -0
  151. crprotocol-2.0.0.dist-info/entry_points.txt +2 -0
  152. crprotocol-2.0.0.dist-info/licenses/LICENSE.md +170 -0
  153. crprotocol-2.0.0.dist-info/licenses/NOTICE +18 -0
@@ -0,0 +1,329 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Review cycle management — active LLM review patterns (§14).
4
+
5
+ Three interaction patterns:
6
+ 1. Pre-generation planning (predict chain > 5 windows)
7
+ 2. Checkpoint review (periodic, Tier 3 models only)
8
+ 3. Post-generation self-assessment (quality scoring + targeted re-gen)
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import re
14
+ from collections.abc import Callable
15
+ from dataclasses import dataclass, field
16
+ from typing import Any
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # Data types
20
+ # ---------------------------------------------------------------------------
21
+
22
+
23
@dataclass
class ReviewGuidance:
    """Structured result of a single checkpoint review.

    A freshly constructed instance represents "nothing to report":
    ``on_track`` is ``True``, every list is empty and ``raw_output`` is
    the empty string. Each instance owns its own list objects.
    """

    # Whether the review considers the work to still be on track.
    on_track: bool = True
    # Review lines that mention contradictions.
    contradictions: list[str] = field(default_factory=list)
    # Review lines about what to prioritise or do next.
    priorities: list[str] = field(default_factory=list)
    # Review lines that mention gaps or missing material.
    new_gaps: list[str] = field(default_factory=list)
    # Unparsed review text as returned by the model.
    raw_output: str = ""
32
+
33
+
34
@dataclass
class AssessmentResult:
    """Outcome of the post-generation self-assessment step."""

    # Quality score on a 0-10 scale.
    score: float = 0.0
    # Human-readable descriptions of the problems that were found.
    issues: list[str] = field(default_factory=list)
    # True when the output should go through a correction pass.
    needs_correction: bool = False
    # Count of corrections applied so far; defaults to zero.
    corrections_applied: int = 0
    # Unparsed assessment text; empty when no model output is attached.
    raw_output: str = ""
43
+
44
+
45
@dataclass
class PlannedSection:
    """A single section entry of a generation plan."""

    # Heading of the section.
    title: str = ""
    # Bullet points the section is expected to cover.
    key_points: list[str] = field(default_factory=list)
    # Other sections this one depends on.
    dependencies: list[str] = field(default_factory=list)
    # Token estimate for the section; zero when no estimate was made.
    estimated_tokens: int = 0
53
+
54
+
55
@dataclass
class DocumentPlan:
    """Complete generation plan produced by pre-generation planning."""

    # Planned sections, in the order they were parsed.
    sections: list[PlannedSection] = field(default_factory=list)
    # Sum of token estimates; zero when no estimate was made.
    total_estimated_tokens: int = 0
    # Number of windows the generation is expected to span.
    estimated_windows: int = 0
62
+
63
+
64
+ # ---------------------------------------------------------------------------
65
+ # ReviewCycleManager
66
+ # ---------------------------------------------------------------------------
67
+
68
+
69
class ReviewCycleManager:
    """Active LLM review cycles — planning, checkpoint, assessment.

    All model access goes through ``dispatch_fn``, which is called with two
    strings (the prompt and a second string that is either empty or the
    text to be assessed) and must return a tuple whose first element is
    the model's text output. Every model call is best-effort: a failure
    never propagates to the caller.
    """

    # Patterns are compiled once per class rather than once per parsed line.
    # "1. " / "1) " at the start of a line opens a new plan section.
    _SECTION_START_RE = re.compile(r"\d+[\.\)]\s")
    _SECTION_PREFIX_RE = re.compile(r"^\d+[\.\)]\s*")
    # "SCORE: 7/10" or "score: 7.5 / 10".
    _SCORE_RE = re.compile(r"SCORE:\s*(\d+(?:\.\d+)?)\s*/\s*10", re.IGNORECASE)
    # Numbered issue item; the lookahead keeps decimals such as "7.5 ..."
    # from being mistaken for item number 7.
    _ISSUE_PREFIX_RE = re.compile(r"^\d+[\.\)](?!\d)\s*")

    def __init__(
        self,
        dispatch_fn: Callable[[str, str], tuple[str, Any]] | None = None,
        model_review_capability: int = 1,
        correction_mode: str = "flag",
        max_correction_windows: int = 3,
    ) -> None:
        """Configure the manager.

        Args:
            dispatch_fn: Callable used for every model call; ``None``
                disables all LLM-backed behaviour.
            model_review_capability: Capability level of the model; levels
                below 3 skip checkpoint reviews and use heuristic scoring.
            correction_mode: ``needs_correction`` is only ever set when
                this equals ``"correct"``.
            max_correction_windows: Upper bound on the number of issues
                handled by :meth:`targeted_regeneration`.
        """
        self._dispatch_fn = dispatch_fn
        self._model_capability = model_review_capability
        self._correction_mode = correction_mode
        self._max_corrections = max_correction_windows

    # ------------------------------------------------------------------
    # 1. Pre-generation planning
    # ------------------------------------------------------------------

    def pre_generation_plan(
        self,
        task_intent: str,
        predicted_chain_length: int = 0,
    ) -> DocumentPlan | None:
        """Generate a document plan when the chain exceeds 5 windows.

        Args:
            task_intent: Description of the task to outline.
            predicted_chain_length: Predicted number of windows.

        Returns:
            The parsed plan, or ``None`` when the chain is 5 windows or
            shorter, no ``dispatch_fn`` is configured, or the model call
            raised.
        """
        if predicted_chain_length <= 5:
            return None
        if not self._dispatch_fn:
            return None

        prompt = (
            "Create an outline for the following task. For each section provide:\n"
            "- Section title\n"
            "- 2-3 key points to cover\n"
            "- Dependencies on other sections\n\n"
            f"Task: {task_intent}\n"
            f"Estimated length: {predicted_chain_length} windows\n\n"
            "Format: numbered sections."
        )

        try:
            output, _ = self._dispatch_fn(prompt, "")
        except Exception:
            # Planning is optional; a failed call simply means "no plan".
            return None

        return self._parse_plan(output, predicted_chain_length)

    def _parse_plan(self, output: str, windows: int) -> DocumentPlan:
        """Parse LLM outline text into a :class:`DocumentPlan`.

        A line starting with ``N.`` or ``N)`` followed by whitespace opens
        a section; lines starting with ``-`` or ``•`` become key points of
        the current section. All other lines are ignored.
        """
        sections: list[PlannedSection] = []
        current_title = ""
        current_points: list[str] = []

        for raw_line in output.split("\n"):
            line = raw_line.strip()
            if not line:
                continue
            if self._SECTION_START_RE.match(line):
                # Flush the previous section before starting the next one.
                if current_title:
                    sections.append(PlannedSection(
                        title=current_title,
                        key_points=current_points,
                    ))
                current_title = self._SECTION_PREFIX_RE.sub("", line)
                current_points = []
            elif line.startswith(("-", "•")):
                current_points.append(line.lstrip("-•").strip())

        if current_title:
            sections.append(PlannedSection(
                title=current_title,
                key_points=current_points,
            ))

        return DocumentPlan(
            sections=sections,
            estimated_windows=windows,
        )

    # ------------------------------------------------------------------
    # 2. Checkpoint review
    # ------------------------------------------------------------------

    def checkpoint_review(
        self,
        window_index: int,
        review_interval: int = 20,
        task_intent: str = "",
        top_facts: list[str] | None = None,
        gap_summary: str = "",
    ) -> ReviewGuidance | None:
        """Run a periodic review at checkpoint windows.

        Args:
            window_index: Index of the current window.
            review_interval: Review every this many windows. Values of
                zero or less disable checkpoint reviews.
            task_intent: Description of the task.
            top_facts: Key facts so far; only the first 30 are sent.
            gap_summary: Summary of the known gaps.

        Returns:
            Parsed guidance, or ``None`` when the model capability is
            below 3, the interval is not positive, ``window_index`` is not
            a multiple of the interval, no ``dispatch_fn`` is configured,
            or the model call raised.
        """
        if self._model_capability < 3:
            return None
        # A zero interval would raise ZeroDivisionError in the modulo below
        # and a negative one would match every window; treat both as
        # "reviews disabled".
        if review_interval <= 0:
            return None
        if window_index % review_interval != 0:
            return None
        if not self._dispatch_fn:
            return None

        facts_section = ""
        if top_facts:
            facts_section = "\n".join(f"- {f}" for f in top_facts[:30])

        prompt = (
            "Review checkpoint. Assess the following:\n"
            "1. Are we on track for the task?\n"
            "2. Any contradictions in the findings?\n"
            "3. What should be prioritized next?\n"
            "4. Any new gaps identified?\n\n"
            f"Task: {task_intent}\n\n"
            f"Key facts so far:\n{facts_section}\n\n"
            f"Gap summary: {gap_summary}\n\n"
            "Be concise."
        )

        try:
            output, _ = self._dispatch_fn(prompt, "")
        except Exception:
            # Reviews are advisory; a failed call means "no guidance".
            return None

        return self._parse_review(output)

    def _parse_review(self, output: str) -> ReviewGuidance:
        """Parse checkpoint review text with keyword matching.

        Matching is by case-insensitive substring, so a single line can be
        filed under several categories.
        """
        guidance = ReviewGuidance(raw_output=output)
        for line in output.split("\n"):
            stripped = line.strip()
            line_lower = stripped.lower()
            if "not on track" in line_lower or "off track" in line_lower:
                guidance.on_track = False
            if "contradict" in line_lower:
                guidance.contradictions.append(stripped)
            if "priorit" in line_lower or "next" in line_lower:
                guidance.priorities.append(stripped)
            if "gap" in line_lower or "missing" in line_lower:
                guidance.new_gaps.append(stripped)
        return guidance

    # ------------------------------------------------------------------
    # 3. Post-generation self-assessment
    # ------------------------------------------------------------------

    def post_generation_assessment(
        self,
        accumulated_output: str,
        task_intent: str,
    ) -> AssessmentResult:
        """Score output quality and flag issues.

        Uses heuristic scoring when the model capability is below 3, no
        ``dispatch_fn`` is configured, or the model call raises; otherwise
        asks the model to score the first 5000 characters of the output.

        Args:
            accumulated_output: The generated text to assess.
            task_intent: Description of the task the text should fulfil.

        Returns:
            The assessment; this method never returns ``None``.
        """
        if self._model_capability < 3 or not self._dispatch_fn:
            return self._heuristic_assessment(accumulated_output, task_intent)

        prompt = (
            "Score the following output on a scale of 1-10 for completeness, "
            "accuracy, and coherence. Start your response with 'SCORE: X/10'. "
            "Then list any issues as numbered items.\n\n"
            f"Task: {task_intent}"
        )

        try:
            output, _ = self._dispatch_fn(prompt, accumulated_output[:5000])
        except Exception:
            # Fall back to the offline heuristic rather than failing.
            return self._heuristic_assessment(accumulated_output, task_intent)

        return self._parse_assessment(output)

    def _parse_assessment(self, output: str) -> AssessmentResult:
        """Parse LLM assessment text into an :class:`AssessmentResult`.

        The score comes from the first ``SCORE: X/10`` occurrence, clamped
        to the 0-10 range, and defaults to 5.0 when absent. Lines numbered
        ``N.`` or ``N)`` become issues.
        """
        result = AssessmentResult(raw_output=output)

        score_match = self._SCORE_RE.search(output)
        if score_match:
            # Models occasionally answer "11/10"; keep the documented range.
            result.score = min(10.0, max(0.0, float(score_match.group(1))))
        else:
            result.score = 5.0  # Default if can't parse

        for raw_line in output.split("\n"):
            line = raw_line.strip()
            if not self._ISSUE_PREFIX_RE.match(line):
                continue
            issue = self._ISSUE_PREFIX_RE.sub("", line)
            # A bare "3." carries no information to act on.
            if issue:
                result.issues.append(issue)

        result.needs_correction = (
            result.score < 6
            and self._correction_mode == "correct"
        )

        return result

    def _heuristic_assessment(
        self, output: str, task_intent: str,
    ) -> AssessmentResult:
        """Score quality without an LLM, from length and keyword overlap.

        Starts at 5.0; fewer than 50 words costs 2 points and more than
        200 earns 1; task-word coverage below 0.3 costs 1 point and above
        0.7 earns 1. The result is clamped to the range 1-10.
        """
        score = 5.0
        issues: list[str] = []

        # Length heuristic
        words = len(output.split())
        if words < 50:
            score -= 2
            issues.append("Output is very short")
        elif words > 200:
            score += 1

        # Share of distinct task words that also occur in the output.
        task_words = set(task_intent.lower().split())
        output_words = set(output.lower().split())
        coverage = len(task_words & output_words) / max(len(task_words), 1)
        if coverage < 0.3:
            score -= 1
            issues.append("Low task keyword coverage")
        elif coverage > 0.7:
            score += 1

        score = max(1.0, min(10.0, score))

        return AssessmentResult(
            score=score,
            issues=issues,
            needs_correction=score < 6 and self._correction_mode == "correct",
        )

    # ------------------------------------------------------------------
    # Targeted re-generation
    # ------------------------------------------------------------------

    def targeted_regeneration(
        self,
        issues: list[str],
        task_intent: str,
    ) -> list[str]:
        """Re-generate a targeted fix for each issue.

        Args:
            issues: Issue descriptions, handled in order.
            task_intent: Description of the original task.

        Returns:
            One entry per handled issue (at most ``max_correction_windows``
            of them): the model output, or a ``[correction failed for: …]``
            placeholder when that call raised. Empty when no ``dispatch_fn``
            is configured.
        """
        if not self._dispatch_fn:
            return []

        # A negative cap used as a slice bound would drop issues from the
        # end of the list instead of disabling corrections.
        limit = max(0, self._max_corrections)

        corrections: list[str] = []
        for issue in issues[:limit]:
            prompt = (
                f"Fix this specific issue in the output: {issue}\n"
                f"Original task: {task_intent}\n"
                "Provide only the corrected section."
            )
            try:
                output, _ = self._dispatch_fn(prompt, "")
                corrections.append(output)
            except Exception:
                # Keep positions aligned with the issues that were handled.
                corrections.append(f"[correction failed for: {issue}]")

        return corrections
@@ -0,0 +1,129 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Scale-mode selector — auto-configure session by quality tier (§8.3, §15).
4
+
5
+ Classifies input into quality tiers S/A/B/C/D based on token ratio,
6
+ then configures processing mode, validation tiers, review cycles, etc.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from dataclasses import dataclass
12
+ from enum import IntEnum
13
+
14
+ # ---------------------------------------------------------------------------
15
+ # Quality tiers (§10.2)
16
+ # ---------------------------------------------------------------------------
17
+
18
+
19
class QualityTier(IntEnum):
    """Ordered quality tiers, from S (smallest input) to D (largest).

    Members compare as integers, so ``tier >= QualityTier.B`` style checks
    work. In the bounds below, ``C`` is the context-window size in tokens.
    """

    S = 0  # input of at most C
    A = 1  # between C and 10C
    B = 2  # between 10C and 100C
    C = 3  # between 100C and 1000C
    D = 4  # more than 1000C
27
+
28
+
29
def classify_quality_tier(
    estimated_tokens: int, context_window: int,
) -> QualityTier:
    """Map the token-to-context ratio onto a quality tier.

    Args:
        estimated_tokens: Estimated size of the input in tokens.
        context_window: Context-window size in tokens.

    Returns:
        ``S`` for a ratio up to 1 (or a non-positive context window),
        ``A`` up to 10, ``B`` up to 100, ``C`` up to 1000, otherwise ``D``.
    """
    if context_window > 0:
        ratio = estimated_tokens / context_window
        # Inclusive upper bounds, checked from the smallest tier upwards.
        bounds = (
            (1, QualityTier.S),
            (10, QualityTier.A),
            (100, QualityTier.B),
            (1000, QualityTier.C),
        )
        for upper, tier in bounds:
            if ratio <= upper:
                return tier
        return QualityTier.D
    return QualityTier.S
45
+
46
+
47
+ # ---------------------------------------------------------------------------
48
+ # Processing mode
49
+ # ---------------------------------------------------------------------------
50
+
51
+
52
def select_processing_mode(
    estimated_tokens: int, context_window: int,
) -> str:
    """Pick the processing mode from the number of windows required.

    Args:
        estimated_tokens: Estimated size of the input in tokens.
        context_window: Context-window size in tokens.

    Returns:
        ``"SERIAL"`` for up to 10 windows (or a non-positive context
        window), ``"SERIAL_WITH_REGROUNDING"`` up to 100,
        ``"HIERARCHICAL"`` up to 1000, otherwise
        ``"HIERARCHICAL_MULTI_LEVEL"``.
    """
    if context_window > 0:
        needed = estimated_tokens / context_window
        # Inclusive upper bounds on the window count, smallest first.
        ladder = (
            (10, "SERIAL"),
            (100, "SERIAL_WITH_REGROUNDING"),
            (1000, "HIERARCHICAL"),
        )
        for ceiling, mode in ladder:
            if needed <= ceiling:
                return mode
        return "HIERARCHICAL_MULTI_LEVEL"
    return "SERIAL"
66
+
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # Session configuration
70
+ # ---------------------------------------------------------------------------
71
+
72
+
73
@dataclass
class SessionConfig:
    """Session parameters chosen automatically for an input.

    The defaults describe the smallest case: tier S, plain serial
    processing, every optional feature switched off.
    """

    # Size class of the input.
    quality_tier: QualityTier = QualityTier.S
    # Name of the processing strategy.
    processing_mode: str = "SERIAL"
    # Feature switch for CQS.
    cqs_enabled: bool = False
    # Number of validation tiers to run.
    validation_tiers: int = 1
    # Feature switch for LLM review cycles.
    review_cycles_enabled: bool = False
    # Feature switch for a planning window.
    planning_window: bool = False
    # Feature switch for hierarchical processing.
    hierarchical: bool = False
    # Feature switch for re-grounding.
    re_grounding: bool = False
    # Capability level of the model, stored as given.
    model_review_capability: int = 1
86
+
87
+
88
class ScaleModeSelector:
    """Derive a session configuration from input size and model capability."""

    def __init__(
        self,
        context_window: int = 128_000,
    ) -> None:
        """Remember the context-window size (in tokens) used for sizing."""
        self._context_window = context_window

    def configure_session(
        self,
        estimated_tokens: int,
        model_capability: int = 1,
    ) -> SessionConfig:
        """Build the session configuration for an input.

        Args:
            estimated_tokens: Total estimated input tokens.
            model_capability: Assessed model capability (1, 2, or 3).

        Returns:
            SessionConfig with every parameter filled in.
        """
        window = self._context_window
        tier = classify_quality_tier(estimated_tokens, window)
        mode = select_processing_mode(estimated_tokens, window)

        reaches_a = tier >= QualityTier.A
        reaches_b = tier >= QualityTier.B
        reaches_c = tier >= QualityTier.C

        # Larger inputs allow more validation tiers, but never more than
        # the model's capability level.
        if reaches_c:
            tier_cap = 3
        elif reaches_b:
            tier_cap = 2
        else:
            tier_cap = 1

        return SessionConfig(
            quality_tier=tier,
            processing_mode=mode,
            cqs_enabled=reaches_a,
            validation_tiers=min(model_capability, tier_cap),
            review_cycles_enabled=reaches_b and model_capability >= 3,
            planning_window=reaches_b,
            hierarchical=reaches_c,
            re_grounding=reaches_b,
            model_review_capability=model_capability,
        )
@@ -0,0 +1,207 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Source grounding — store/retrieve verbatim source passages (§17).
4
+
5
+ Stores passages for facts with confidence ≥ 0.8. Integrates passages
6
+ into envelopes with tier-based budget allocation.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from collections.abc import Callable
12
+ from dataclasses import dataclass, field
13
+ from typing import Any
14
+
15
+ # ---------------------------------------------------------------------------
16
+ # Constants (§17)
17
+ # ---------------------------------------------------------------------------
18
+
19
# Minimum fact confidence at which a source passage is stored.
HIGH_CONFIDENCE_THRESHOLD = 0.8

# Envelope budget split per quality tier: (fact share, source share).
BUDGET_BY_TIER: dict[str, tuple[float, float]] = {
    "S": (1.0, 0.0),    # No envelope needed
    "A": (0.90, 0.10),  # Low drift risk
    "B": (0.70, 0.30),  # Highest drift risk
    "C": (0.70, 0.30),  # Hierarchy adds abstraction
    "D": (0.75, 0.25),  # Space premium
}

# Minimum fact score at which its source passages are considered.
HIGH_RELEVANCE_THRESHOLD = 0.7
31
+
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Data types
35
+ # ---------------------------------------------------------------------------
36
+
37
+
38
@dataclass
class SourcePassage:
    """Verbatim passage from the original input linked to facts."""

    # Identifier used as the storage key for the passage.
    passage_id: str = ""
    # The verbatim passage text.
    text: str = ""
    # Window the passage was taken from.
    source_window: int = 0
    # Token offsets delimiting the passage.
    token_offset_start: int = 0
    token_offset_end: int = 0
    # Identifiers of the facts this passage backs up.
    linked_fact_ids: list[str] = field(default_factory=list)
    # Token length of ``text``; zero until it has been counted.
    token_count: int = 0
    # Relevance score of the passage.
    relevance_score: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict snapshot suitable for persistence.

        The fact-id list is copied, so later changes to this passage do
        not leak into the snapshot.
        """
        return {
            "passage_id": self.passage_id,
            "text": self.text,
            "source_window": self.source_window,
            "token_offset_start": self.token_offset_start,
            "token_offset_end": self.token_offset_end,
            "linked_fact_ids": list(self.linked_fact_ids),
            "token_count": self.token_count,
            "relevance_score": self.relevance_score,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> SourcePassage:
        """Rebuild a passage from a dict produced by :meth:`to_dict`.

        Missing keys fall back to the field defaults. The fact-id list is
        copied (mirroring :meth:`to_dict`) so the new passage never shares
        a list with ``data``; an explicit ``None`` is read as "no links".
        """
        return cls(
            passage_id=data.get("passage_id", ""),
            text=data.get("text", ""),
            source_window=data.get("source_window", 0),
            token_offset_start=data.get("token_offset_start", 0),
            token_offset_end=data.get("token_offset_end", 0),
            linked_fact_ids=list(data.get("linked_fact_ids") or []),
            token_count=data.get("token_count", 0),
            relevance_score=data.get("relevance_score", 0.0),
        )
75
+
76
+
77
+ # ---------------------------------------------------------------------------
78
+ # SourceGroundingEngine
79
+ # ---------------------------------------------------------------------------
80
+
81
+
82
class SourceGroundingEngine:
    """Store and retrieve verbatim source passages for high-confidence facts."""

    def __init__(
        self,
        count_tokens: Callable[[str], int] | None = None,
    ) -> None:
        """Create an empty engine.

        Args:
            count_tokens: Token counter for a piece of text. When omitted,
                a rough estimate of one token per four characters is used.
        """
        self._count_tokens = count_tokens or (lambda t: len(t) // 4)
        self._passages: dict[str, SourcePassage] = {}  # passage_id → passage
        self._fact_to_passages: dict[str, list[str]] = {}  # fact_id → [passage_ids]

    @property
    def passage_count(self) -> int:
        """Number of passages currently stored."""
        return len(self._passages)

    def store_passage(
        self,
        passage: SourcePassage,
        fact_confidence: float = 0.0,
    ) -> bool:
        """Store a passage if the linked fact has confidence ≥ threshold.

        The passage's ``token_count`` is recomputed from its text. Storing
        a passage id that already exists replaces the stored passage and
        leaves each fact with a single index entry for it.

        Args:
            passage: The passage to store; mutated to set ``token_count``.
            fact_confidence: Confidence of the fact the passage supports.

        Returns:
            True if stored, False if below threshold.
        """
        if fact_confidence < HIGH_CONFIDENCE_THRESHOLD:
            return False

        passage.token_count = self._count_tokens(passage.text)
        self._passages[passage.passage_id] = passage
        for fid in passage.linked_fact_ids:
            linked = self._fact_to_passages.setdefault(fid, [])
            # Without this check a re-stored passage would be returned (and
            # charged against the budget) once per store call.
            if passage.passage_id not in linked:
                linked.append(passage.passage_id)
        return True

    def get_passages_for_fact(self, fact_id: str) -> list[SourcePassage]:
        """Retrieve all stored source passages linked to a fact.

        Index entries that point at unknown passage ids are skipped.
        """
        pids = self._fact_to_passages.get(fact_id, [])
        return [self._passages[pid] for pid in pids if pid in self._passages]

    def build_source_grounded_envelope(
        self,
        scored_facts: list[dict[str, Any]],
        budget_tokens: int,
        quality_tier: str = "B",
    ) -> tuple[list[dict[str, Any]], list[SourcePassage]]:
        """Build envelope with source passages allocated by tier.

        Args:
            scored_facts: Sorted list of {"id", "text", "score", ...}
            budget_tokens: Total envelope budget in tokens.
            quality_tier: S/A/B/C/D; unknown tiers use a 70/30 split.

        Returns:
            (packed_facts, included_passages). Each passage appears at
            most once, even when it is linked to several packed facts.
        """
        fact_pct, source_pct = BUDGET_BY_TIER.get(quality_tier, (0.70, 0.30))
        fact_budget = int(budget_tokens * fact_pct)
        source_budget = int(budget_tokens * source_pct)

        # Phase 1: pack facts in the given order until one does not fit.
        packed_facts: list[dict[str, Any]] = []
        tokens_used = 0
        for fact in scored_facts:
            t = self._count_tokens(fact.get("text", ""))
            if tokens_used + t > fact_budget:
                break
            packed_facts.append(fact)
            tokens_used += t

        # Phase 2: attach source passages to high-relevance packed facts.
        included_passages: list[SourcePassage] = []
        included_ids: set[str] = set()
        source_tokens_used = 0

        for fact in packed_facts:
            if fact.get("score", 0.0) < HIGH_RELEVANCE_THRESHOLD:
                continue

            for passage in self.get_passages_for_fact(fact.get("id", "")):
                # A passage shared by several facts is already in the
                # envelope; repeating it would only waste source budget.
                if passage.passage_id in included_ids:
                    continue
                if source_tokens_used + passage.token_count > source_budget:
                    # Stop with this fact's passages; later facts still
                    # get a chance with whatever budget remains.
                    break
                included_passages.append(passage)
                included_ids.add(passage.passage_id)
                source_tokens_used += passage.token_count

        return packed_facts, included_passages

    def format_envelope_section(
        self,
        fact: dict[str, Any],
        passages: list[SourcePassage],
    ) -> str:
        """Format a fact with its source passages for envelope inclusion.

        Format:
        - {fact text} — Window N
          ↳ [SOURCE: Window N, tokens X-Y]
            "{verbatim original text}"
        """
        lines = [f"- {fact.get('text', '')} — Window {fact.get('window', '?')}"]
        for p in passages:
            lines.append(
                f"  ↳ [SOURCE: Window {p.source_window}, "
                f"tokens {p.token_offset_start}-{p.token_offset_end}]"
            )
            lines.append(f'    "{p.text}"')
        return "\n".join(lines)

    def to_dict(self) -> dict[str, Any]:
        """Serialize for persistence.

        The returned structure shares no lists with the engine, so it can
        be modified freely by the caller.
        """
        return {
            "passages": {pid: p.to_dict() for pid, p in self._passages.items()},
            "fact_to_passages": {
                fid: list(pids) for fid, pids in self._fact_to_passages.items()
            },
        }

    @classmethod
    def from_dict(
        cls,
        data: dict[str, Any],
        count_tokens: Callable[[str], int] | None = None,
    ) -> SourceGroundingEngine:
        """Restore from serialized state.

        Args:
            data: Structure produced by :meth:`to_dict`; missing keys are
                treated as empty.
            count_tokens: Token counter for the restored engine.

        Returns:
            A new engine whose index is independent of ``data``.
        """
        engine = cls(count_tokens=count_tokens)
        for pid, pdata in data.get("passages", {}).items():
            engine._passages[pid] = SourcePassage.from_dict(pdata)
        # Copy the mapping and its lists; adopting them directly would let
        # later store_passage calls mutate the caller's data.
        engine._fact_to_passages = {
            fid: list(pids)
            for fid, pids in data.get("fact_to_passages", {}).items()
        }
        return engine
crp/ckf/__init__.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Contextual Knowledge Fabric — 4-mode retrieval, community detection, pub/sub."""
4
+
5
+ from .community import Community, CommunityDetector, CommunityResult
6
+ from .fabric import CKFConfig, CKFHealth, ContextualKnowledgeFabric
7
+ from .gc import GarbageCollector, GCResult
8
+ from .graph_walk import GraphWalkResult, graph_walk
9
+ from .merge import MergedFact, MergeResult, multi_mode_merge
10
+ from .pattern_query import PatternQueryResult, pattern_query
11
+ from .pubsub import CKFEvent, CKFEventType, PubSubEventBus
12
+ from .semantic import SemanticResult, semantic_fallback
13
+
14
+ __all__ = [
15
+ "CKFConfig",
16
+ "CKFEvent",
17
+ "CKFEventType",
18
+ "CKFHealth",
19
+ "Community",
20
+ "CommunityDetector",
21
+ "CommunityResult",
22
+ "ContextualKnowledgeFabric",
23
+ "GCResult",
24
+ "GarbageCollector",
25
+ "GraphWalkResult",
26
+ "MergeResult",
27
+ "MergedFact",
28
+ "PatternQueryResult",
29
+ "PubSubEventBus",
30
+ "SemanticResult",
31
+ "graph_walk",
32
+ "multi_mode_merge",
33
+ "pattern_query",
34
+ "semantic_fallback",
35
+ ]