crprotocol 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crp/__init__.py +126 -0
- crp/__main__.py +8 -0
- crp/_typing.py +27 -0
- crp/_version.py +5 -0
- crp/adapters.py +31 -0
- crp/advanced/__init__.py +40 -0
- crp/advanced/auto_ingest.py +400 -0
- crp/advanced/cqs.py +235 -0
- crp/advanced/cross_window.py +477 -0
- crp/advanced/curator.py +265 -0
- crp/advanced/feedback.py +146 -0
- crp/advanced/hierarchical.py +211 -0
- crp/advanced/meta_learning.py +401 -0
- crp/advanced/parallel.py +98 -0
- crp/advanced/review_cycle.py +329 -0
- crp/advanced/scale_mode.py +129 -0
- crp/advanced/source_grounding.py +207 -0
- crp/ckf/__init__.py +35 -0
- crp/ckf/community.py +377 -0
- crp/ckf/fabric.py +445 -0
- crp/ckf/gc.py +175 -0
- crp/ckf/graph_walk.py +87 -0
- crp/ckf/merge.py +133 -0
- crp/ckf/pattern_query.py +122 -0
- crp/ckf/pubsub.py +128 -0
- crp/ckf/semantic.py +207 -0
- crp/cli/__init__.py +7 -0
- crp/cli/main.py +329 -0
- crp/cli/sidecar.py +929 -0
- crp/cli/startup.py +272 -0
- crp/continuation/__init__.py +103 -0
- crp/continuation/completion.py +348 -0
- crp/continuation/degradation.py +157 -0
- crp/continuation/document_map.py +160 -0
- crp/continuation/flow.py +109 -0
- crp/continuation/gap.py +419 -0
- crp/continuation/manager.py +484 -0
- crp/continuation/quality_monitor.py +179 -0
- crp/continuation/stitch.py +419 -0
- crp/continuation/trigger.py +142 -0
- crp/continuation/voice.py +157 -0
- crp/core/__init__.py +69 -0
- crp/core/batch.py +77 -0
- crp/core/circuit_breaker.py +116 -0
- crp/core/config.py +377 -0
- crp/core/context_tools.py +540 -0
- crp/core/dispatch_router.py +3977 -0
- crp/core/errors.py +128 -0
- crp/core/extraction_facade.py +384 -0
- crp/core/facilitator.py +713 -0
- crp/core/idempotency.py +215 -0
- crp/core/orchestrator.py +1435 -0
- crp/core/relay_strategies.py +613 -0
- crp/core/security_manager.py +140 -0
- crp/core/session.py +134 -0
- crp/core/task_intent.py +36 -0
- crp/core/window.py +363 -0
- crp/envelope/__init__.py +30 -0
- crp/envelope/builder.py +288 -0
- crp/envelope/decomposer.py +236 -0
- crp/envelope/formatter.py +168 -0
- crp/envelope/packer.py +211 -0
- crp/envelope/reranker.py +209 -0
- crp/envelope/scoring.py +310 -0
- crp/extraction/__init__.py +45 -0
- crp/extraction/complexity.py +96 -0
- crp/extraction/contradiction.py +132 -0
- crp/extraction/pipeline.py +360 -0
- crp/extraction/quality_gate.py +237 -0
- crp/extraction/stage1_regex.py +173 -0
- crp/extraction/stage2_statistical.py +244 -0
- crp/extraction/stage3_gliner.py +210 -0
- crp/extraction/stage4_uie.py +183 -0
- crp/extraction/stage5_discourse.py +175 -0
- crp/extraction/stage6_llm.py +178 -0
- crp/extraction/structured_output.py +219 -0
- crp/extraction/types.py +299 -0
- crp/license_guard.py +722 -0
- crp/observability/__init__.py +30 -0
- crp/observability/audit.py +118 -0
- crp/observability/events.py +233 -0
- crp/observability/metrics.py +264 -0
- crp/observability/quality.py +135 -0
- crp/observability/structured_logging.py +81 -0
- crp/observability/telemetry.py +117 -0
- crp/provenance/__init__.py +314 -0
- crp/provenance/_embeddings.py +97 -0
- crp/provenance/_types.py +378 -0
- crp/provenance/attribution_scorer.py +252 -0
- crp/provenance/claim_detector.py +229 -0
- crp/provenance/contradiction_detector.py +243 -0
- crp/provenance/distortion_detector.py +397 -0
- crp/provenance/entailment_verifier.py +358 -0
- crp/provenance/fabrication_detector.py +203 -0
- crp/provenance/hallucination_scorer.py +320 -0
- crp/provenance/omission_analyzer.py +106 -0
- crp/provenance/provenance_chain.py +205 -0
- crp/provenance/report_generator.py +440 -0
- crp/providers/__init__.py +43 -0
- crp/providers/anthropic.py +270 -0
- crp/providers/base.py +135 -0
- crp/providers/custom.py +63 -0
- crp/providers/diagnostic.py +251 -0
- crp/providers/llamacpp.py +224 -0
- crp/providers/manager.py +139 -0
- crp/providers/ollama.py +243 -0
- crp/providers/openai.py +628 -0
- crp/providers/tokenizers.py +48 -0
- crp/py.typed +0 -0
- crp/resources/__init__.py +53 -0
- crp/resources/adaptive_allocator.py +525 -0
- crp/resources/cost_model.py +388 -0
- crp/resources/overhead_manager.py +217 -0
- crp/resources/resource_manager.py +262 -0
- crp/schemas/__init__.py +20 -0
- crp/schemas/cost-estimate.json +33 -0
- crp/schemas/crp-error.json +43 -0
- crp/schemas/envelope-preview.json +40 -0
- crp/schemas/persisted-state-header.json +27 -0
- crp/schemas/quality-report.json +94 -0
- crp/schemas/session-handle.json +33 -0
- crp/schemas/session-status.json +57 -0
- crp/schemas/stream-event.json +18 -0
- crp/schemas/task-intent.json +42 -0
- crp/security/__init__.py +93 -0
- crp/security/audit_trail.py +392 -0
- crp/security/binding.py +192 -0
- crp/security/compliance.py +813 -0
- crp/security/consent.py +593 -0
- crp/security/embedding_defense.py +161 -0
- crp/security/encryption.py +202 -0
- crp/security/injection.py +335 -0
- crp/security/integrity.py +267 -0
- crp/security/privacy.py +662 -0
- crp/security/quarantine.py +249 -0
- crp/security/rbac.py +221 -0
- crp/security/validation.py +164 -0
- crp/state/__init__.py +31 -0
- crp/state/cold_storage.py +258 -0
- crp/state/compaction.py +263 -0
- crp/state/critical_state.py +104 -0
- crp/state/event_log.py +313 -0
- crp/state/fact.py +189 -0
- crp/state/serialization.py +189 -0
- crp/state/session_cleanup.py +77 -0
- crp/state/snapshot.py +290 -0
- crp/state/warm_store.py +346 -0
- crprotocol-2.0.0.dist-info/METADATA +1295 -0
- crprotocol-2.0.0.dist-info/RECORD +153 -0
- crprotocol-2.0.0.dist-info/WHEEL +4 -0
- crprotocol-2.0.0.dist-info/entry_points.txt +2 -0
- crprotocol-2.0.0.dist-info/licenses/LICENSE.md +170 -0
- crprotocol-2.0.0.dist-info/licenses/NOTICE +18 -0
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Review cycle management — active LLM review patterns (§14).
|
|
4
|
+
|
|
5
|
+
Three interaction patterns:
|
|
6
|
+
1. Pre-generation planning (predict chain > 5 windows)
|
|
7
|
+
2. Checkpoint review (periodic, Tier 3 models only)
|
|
8
|
+
3. Post-generation self-assessment (quality scoring + targeted re-gen)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
# Data types
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class ReviewGuidance:
    """Structured result of one checkpoint review.

    Every field has a default, so an empty instance represents a review
    that raised no concerns.
    """

    # True unless the review text reported the work as off track.
    on_track: bool = True
    # Review lines that mention a contradiction.
    contradictions: list[str] = field(default_factory=list)
    # Review lines that talk about priorities / what to do next.
    priorities: list[str] = field(default_factory=list)
    # Review lines that mention gaps or missing material.
    new_gaps: list[str] = field(default_factory=list)
    # Unparsed text of the review as returned by the model.
    raw_output: str = ""
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class AssessmentResult:
    """Outcome of the post-generation self-assessment step."""

    # Quality score on a 0-10 scale.
    score: float = 0.0  # 0-10
    # Human-readable descriptions of the problems that were found.
    issues: list[str] = field(default_factory=list)
    # Whether a correction pass should be run for this output.
    needs_correction: bool = False
    # Number of corrections that have been applied so far.
    corrections_applied: int = 0
    # Unparsed assessment text (empty when no model output is available).
    raw_output: str = ""
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class PlannedSection:
    """A single section entry of a generation plan."""

    # Heading of the section.
    title: str = ""
    # Points the section is expected to cover.
    key_points: list[str] = field(default_factory=list)
    # Other sections this one depends on.
    dependencies: list[str] = field(default_factory=list)
    # Estimated size of the section in tokens.
    estimated_tokens: int = 0
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class DocumentPlan:
    """Complete generation plan produced by pre-generation planning."""

    # Planned sections, in the order they were parsed.
    sections: list[PlannedSection] = field(default_factory=list)
    # Estimated token total for the whole document.
    total_estimated_tokens: int = 0
    # Number of windows the generation is expected to span.
    estimated_windows: int = 0
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
# ReviewCycleManager
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class ReviewCycleManager:
    """Active LLM review cycles — planning, checkpoint, assessment.

    The manager wraps an optional ``dispatch_fn`` that is called with two
    strings (a prompt and an accompanying text) and returns a tuple whose
    first element is the model's text output. Every LLM-backed step
    degrades gracefully when the callable is missing or raises.
    """

    def __init__(
        self,
        dispatch_fn: Callable[[str, str], tuple[str, Any]] | None = None,
        model_review_capability: int = 1,
        correction_mode: str = "flag",
        max_correction_windows: int = 3,
    ) -> None:
        """Store the dispatch callable and review settings.

        Args:
            dispatch_fn: Callable used to reach the model; ``None`` disables
                every LLM-backed step.
            model_review_capability: Capability level of the model. Values
                below 3 disable checkpoint reviews and LLM self-assessment.
            correction_mode: Only the value ``"correct"`` lets an assessment
                set ``needs_correction``.
            max_correction_windows: Upper bound on the number of issues
                handled by ``targeted_regeneration``.
        """
        self._dispatch_fn = dispatch_fn
        self._model_capability = model_review_capability
        self._correction_mode = correction_mode
        self._max_corrections = max_correction_windows

    # ------------------------------------------------------------------
    # 1. Pre-generation planning
    # ------------------------------------------------------------------

    def pre_generation_plan(
        self,
        task_intent: str,
        predicted_chain_length: int = 0,
    ) -> DocumentPlan | None:
        """Generate a document plan when the chain exceeds 5 windows.

        Args:
            task_intent: Description of the task to outline.
            predicted_chain_length: Predicted number of windows.

        Returns:
            The parsed plan, or ``None`` when the chain is 5 windows or
            shorter, no ``dispatch_fn`` is configured, or the dispatch
            call raises.
        """
        if predicted_chain_length <= 5:
            return None
        if not self._dispatch_fn:
            return None

        prompt = (
            "Create an outline for the following task. For each section provide:\n"
            "- Section title\n"
            "- 2-3 key points to cover\n"
            "- Dependencies on other sections\n\n"
            f"Task: {task_intent}\n"
            f"Estimated length: {predicted_chain_length} windows\n\n"
            "Format: numbered sections."
        )

        try:
            output, _ = self._dispatch_fn(prompt, "")
        except Exception:
            # Planning is best-effort: a failed dispatch means "no plan".
            return None

        return self._parse_plan(output, predicted_chain_length)

    def _parse_plan(self, output: str, windows: int) -> DocumentPlan:
        """Parse LLM output into a DocumentPlan.

        Lines starting with ``N.`` or ``N)`` open a new section; lines
        starting with ``-`` or ``•`` become key points of the current
        section. Only titles and key points are extracted here.
        """
        sections: list[PlannedSection] = []
        current_title = ""
        current_points: list[str] = []

        for line in output.split("\n"):
            line = line.strip()
            if not line:
                continue
            # New section: starts with number
            if re.match(r"\d+[\.\)]\s", line):
                if current_title:
                    sections.append(PlannedSection(
                        title=current_title,
                        key_points=current_points,
                    ))
                current_title = re.sub(r"^\d+[\.\)]\s*", "", line)
                current_points = []
            elif line.startswith("-") or line.startswith("•"):
                current_points.append(line.lstrip("-•").strip())

        # Flush the section that was still open when the output ended.
        if current_title:
            sections.append(PlannedSection(
                title=current_title,
                key_points=current_points,
            ))

        return DocumentPlan(
            sections=sections,
            estimated_windows=windows,
        )

    # ------------------------------------------------------------------
    # 2. Checkpoint review
    # ------------------------------------------------------------------

    def checkpoint_review(
        self,
        window_index: int,
        review_interval: int = 20,
        task_intent: str = "",
        top_facts: list[str] | None = None,
        gap_summary: str = "",
    ) -> ReviewGuidance | None:
        """Periodic review at checkpoint windows.

        Gate: model_capability < 3 → None
        Gate: review_interval <= 0 → None (no valid checkpoint schedule)
        Gate: window_index not at interval → None

        Args:
            window_index: Index of the current window.
            review_interval: Review every this many windows.
            task_intent: Description of the task under review.
            top_facts: Facts to show the model; at most the first 30 are used.
            gap_summary: Summary of known gaps to include in the prompt.

        Returns:
            Parsed guidance, or ``None`` when a gate applies, no
            ``dispatch_fn`` is configured, or the dispatch call raises.
        """
        if self._model_capability < 3:
            return None
        # A non-positive interval cannot define checkpoints; guarding here
        # also avoids a ZeroDivisionError in the modulo test below.
        if review_interval <= 0:
            return None
        if window_index % review_interval != 0:
            return None
        if not self._dispatch_fn:
            return None

        facts_section = ""
        if top_facts:
            facts_section = "\n".join(f"- {f}" for f in top_facts[:30])

        prompt = (
            "Review checkpoint. Assess the following:\n"
            "1. Are we on track for the task?\n"
            "2. Any contradictions in the findings?\n"
            "3. What should be prioritized next?\n"
            "4. Any new gaps identified?\n\n"
            f"Task: {task_intent}\n\n"
            f"Key facts so far:\n{facts_section}\n\n"
            f"Gap summary: {gap_summary}\n\n"
            "Be concise."
        )

        try:
            output, _ = self._dispatch_fn(prompt, "")
        except Exception:
            # Reviews are advisory: a failed dispatch means "no guidance".
            return None

        return self._parse_review(output)

    def _parse_review(self, output: str) -> ReviewGuidance:
        """Parse checkpoint review output with keyword matching.

        Each line is tested independently against every keyword group, so
        one line may be recorded in several lists.
        """
        guidance = ReviewGuidance(raw_output=output)
        lines = output.split("\n")
        for line in lines:
            line_lower = line.lower().strip()
            if "not on track" in line_lower or "off track" in line_lower:
                guidance.on_track = False
            if "contradict" in line_lower:
                guidance.contradictions.append(line.strip())
            if "priorit" in line_lower or "next" in line_lower:
                guidance.priorities.append(line.strip())
            if "gap" in line_lower or "missing" in line_lower:
                guidance.new_gaps.append(line.strip())
        return guidance

    # ------------------------------------------------------------------
    # 3. Post-generation self-assessment
    # ------------------------------------------------------------------

    def post_generation_assessment(
        self,
        accumulated_output: str,
        task_intent: str,
    ) -> AssessmentResult:
        """Score output quality and flag issues.

        Weak model → basic heuristic scoring.
        Strong model → full LLM self-assessment.

        The heuristic path is also used when no ``dispatch_fn`` is
        configured or the dispatch call raises. Only the first 5000
        characters of ``accumulated_output`` are sent to the model.
        """
        if self._model_capability < 3 or not self._dispatch_fn:
            return self._heuristic_assessment(accumulated_output, task_intent)

        prompt = (
            "Score the following output on a scale of 1-10 for completeness, "
            "accuracy, and coherence. Start your response with 'SCORE: X/10'. "
            "Then list any issues as numbered items.\n\n"
            f"Task: {task_intent}"
        )

        try:
            output, _ = self._dispatch_fn(prompt, accumulated_output[:5000])
        except Exception:
            return self._heuristic_assessment(accumulated_output, task_intent)

        return self._parse_assessment(output)

    def _parse_assessment(self, output: str) -> AssessmentResult:
        """Parse LLM assessment output.

        The score is read from a ``SCORE: X/10`` marker and clamped to the
        0-10 scale; 5.0 is used when no marker is found. Lines starting
        with ``N.`` are collected as issues.
        """
        result = AssessmentResult(raw_output=output)

        # Extract score
        score_match = re.search(r"SCORE:\s*(\d+(?:\.\d+)?)\s*/\s*10", output, re.IGNORECASE)
        if score_match:
            # Clamp so a reply such as "SCORE: 15/10" cannot leave the scale.
            result.score = max(0.0, min(10.0, float(score_match.group(1))))
        else:
            result.score = 5.0  # Default if can't parse

        # Extract issues
        for line in output.split("\n"):
            line = line.strip()
            if re.match(r"\d+\.", line):
                result.issues.append(re.sub(r"^\d+\.\s*", "", line))

        result.needs_correction = (
            result.score < 6
            and self._correction_mode == "correct"
        )

        return result

    def _heuristic_assessment(
        self, output: str, task_intent: str,
    ) -> AssessmentResult:
        """Basic quality scoring without LLM.

        Starts from 5.0, adjusts for output length and for how many of the
        task's words appear in the output, then clamps to 1.0-10.0.
        """
        score = 5.0
        issues: list[str] = []

        # Length heuristic
        words = len(output.split())
        if words < 50:
            score -= 2
            issues.append("Output is very short")
        elif words > 200:
            score += 1

        # Check for task keyword coverage. Skipped when the task has no
        # words: there is nothing to cover, so no penalty should apply.
        task_words = set(task_intent.lower().split())
        if task_words:
            output_words = set(output.lower().split())
            coverage = len(task_words & output_words) / len(task_words)
            if coverage < 0.3:
                score -= 1
                issues.append("Low task keyword coverage")
            elif coverage > 0.7:
                score += 1

        score = max(1.0, min(10.0, score))

        return AssessmentResult(
            score=score,
            issues=issues,
            needs_correction=score < 6 and self._correction_mode == "correct",
        )

    # ------------------------------------------------------------------
    # Targeted re-generation
    # ------------------------------------------------------------------

    def targeted_regeneration(
        self,
        issues: list[str],
        task_intent: str,
    ) -> list[str]:
        """Re-generate targeted fixes for each issue (capped at max_corrections).

        Returns:
            One entry per processed issue: the model output, or a
            ``[correction failed for: ...]`` placeholder when the dispatch
            call raises. Empty when no ``dispatch_fn`` is configured.
        """
        if not self._dispatch_fn:
            return []

        # A negative cap would turn the slice into "all but the last k"
        # issues, so treat it as zero.
        limit = max(0, self._max_corrections)

        corrections: list[str] = []
        for issue in issues[:limit]:
            prompt = (
                f"Fix this specific issue in the output: {issue}\n"
                f"Original task: {task_intent}\n"
                "Provide only the corrected section."
            )
            try:
                output, _ = self._dispatch_fn(prompt, "")
                corrections.append(output)
            except Exception:
                corrections.append(f"[correction failed for: {issue}]")

        return corrections
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Scale-mode selector — auto-configure session by quality tier (§8.3, §15).
|
|
4
|
+
|
|
5
|
+
Classifies input into quality tiers S/A/B/C/D based on token ratio,
|
|
6
|
+
then configures processing mode, validation tiers, review cycles, etc.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from dataclasses import dataclass
|
|
12
|
+
from enum import IntEnum
|
|
13
|
+
|
|
14
|
+
# ---------------------------------------------------------------------------
|
|
15
|
+
# Quality tiers (§10.2)
|
|
16
|
+
# ---------------------------------------------------------------------------
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class QualityTier(IntEnum):
    """Ordered quality tiers, from S (single window) to D (>1000 windows at 128K ctx).

    In the member comments ``C`` stands for one context window of tokens.
    Members are integers, so tiers can be compared with ``<`` and ``>=``.
    """

    S = 0  # input of at most C
    A = 1  # between C and 10C
    B = 2  # between 10C and 100C
    C = 3  # between 100C and 1000C
    D = 4  # more than 1000C
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def classify_quality_tier(
    estimated_tokens: int, context_window: int,
) -> QualityTier:
    """Map the token-to-context ratio onto a quality tier.

    A non-positive ``context_window`` yields tier S. Otherwise the ratio
    ``estimated_tokens / context_window`` is compared against the
    inclusive ceilings 1, 10, 100 and 1000; anything above is tier D.
    """
    if context_window <= 0:
        return QualityTier.S

    windows_needed = estimated_tokens / context_window
    # (inclusive ceiling, tier) pairs in ascending order; first match wins.
    ladder = (
        (1, QualityTier.S),
        (10, QualityTier.A),
        (100, QualityTier.B),
        (1000, QualityTier.C),
    )
    for ceiling, tier in ladder:
        if windows_needed <= ceiling:
            return tier
    return QualityTier.D
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# ---------------------------------------------------------------------------
|
|
48
|
+
# Processing mode
|
|
49
|
+
# ---------------------------------------------------------------------------
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def select_processing_mode(
    estimated_tokens: int, context_window: int,
) -> str:
    """Pick the processing mode from the number of windows needed.

    A non-positive ``context_window`` yields ``"SERIAL"``. Otherwise the
    ratio ``estimated_tokens / context_window`` is compared against the
    inclusive ceilings 10, 100 and 1000.
    """
    if context_window <= 0:
        return "SERIAL"

    window_ratio = estimated_tokens / context_window
    # (inclusive ceiling, mode) pairs in ascending order; first match wins.
    mode_ceilings = (
        (10, "SERIAL"),
        (100, "SERIAL_WITH_REGROUNDING"),
        (1000, "HIERARCHICAL"),
    )
    return next(
        (mode for ceiling, mode in mode_ceilings if window_ratio <= ceiling),
        "HIERARCHICAL_MULTI_LEVEL",
    )
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# ---------------------------------------------------------------------------
|
|
69
|
+
# Session configuration
|
|
70
|
+
# ---------------------------------------------------------------------------
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclass
class SessionConfig:
    """Session parameters chosen automatically for a given input size.

    The defaults describe the smallest configuration: tier S, serial
    processing, every optional feature switched off.
    """

    # Quality tier the input was classified into.
    quality_tier: QualityTier = QualityTier.S
    # Name of the processing mode, e.g. "SERIAL".
    processing_mode: str = "SERIAL"
    # Whether CQS is enabled for the session.
    cqs_enabled: bool = False
    # Number of validation tiers to run.
    validation_tiers: int = 1
    # Whether review cycles are enabled.
    review_cycles_enabled: bool = False
    # Whether a planning window is used.
    planning_window: bool = False
    # Whether hierarchical processing is used.
    hierarchical: bool = False
    # Whether re-grounding is enabled.
    re_grounding: bool = False
    # Review capability level of the model.
    model_review_capability: int = 1
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class ScaleModeSelector:
    """Derive a session configuration from input size and model capability."""

    def __init__(
        self,
        context_window: int = 128_000,
    ) -> None:
        """Remember the context window size (in tokens) used for scaling."""
        self._context_window = context_window

    def configure_session(
        self,
        estimated_tokens: int,
        model_capability: int = 1,
    ) -> SessionConfig:
        """Build the session configuration for an input of the given size.

        Args:
            estimated_tokens: Total estimated input tokens.
            model_capability: Assessed model capability (1, 2, or 3).

        Returns:
            SessionConfig with all parameters set.
        """
        window = self._context_window
        tier = classify_quality_tier(estimated_tokens, window)

        # Tier gates reused by several settings below.
        at_least_a = tier >= QualityTier.A
        at_least_b = tier >= QualityTier.B
        at_least_c = tier >= QualityTier.C

        # Validation depth the tier asks for; capped by the model below.
        if at_least_c:
            tier_validation = 3
        elif at_least_b:
            tier_validation = 2
        else:
            tier_validation = 1

        return SessionConfig(
            quality_tier=tier,
            processing_mode=select_processing_mode(estimated_tokens, window),
            cqs_enabled=at_least_a,
            validation_tiers=min(model_capability, tier_validation),
            review_cycles_enabled=at_least_b and model_capability >= 3,
            planning_window=at_least_b,
            hierarchical=at_least_c,
            re_grounding=at_least_b,
            model_review_capability=model_capability,
        )
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Source grounding — store/retrieve verbatim source passages (§17).
|
|
4
|
+
|
|
5
|
+
Stores passages for facts with confidence ≥ 0.8. Integrates passages
|
|
6
|
+
into envelopes with tier-based budget allocation.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from collections.abc import Callable
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
# ---------------------------------------------------------------------------
|
|
16
|
+
# Constants (§17)
|
|
17
|
+
# ---------------------------------------------------------------------------
|
|
18
|
+
|
|
19
|
+
# Minimum fact confidence required before a source passage is stored.
HIGH_CONFIDENCE_THRESHOLD = 0.8

# Envelope budget split per quality tier: tier -> (fact_pct, source_pct).
BUDGET_BY_TIER: dict[str, tuple[float, float]] = {
    "S": (1.0, 0.0),    # No envelope needed
    "A": (0.90, 0.10),  # Low drift risk
    "B": (0.70, 0.30),  # Highest drift risk
    "C": (0.70, 0.30),  # Hierarchy adds abstraction
    "D": (0.75, 0.25),  # Space premium
}

# Minimum fact score required before its source passages are included.
HIGH_RELEVANCE_THRESHOLD = 0.7
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
# Data types
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class SourcePassage:
    """Verbatim passage from the original input linked to facts."""

    # Identifier under which the passage is stored and looked up.
    passage_id: str = ""
    # Verbatim passage text.
    text: str = ""
    # Window the passage was taken from.
    source_window: int = 0
    # Token offsets of the passage within its source window.
    token_offset_start: int = 0
    token_offset_end: int = 0
    # Identifiers of the facts this passage supports.
    linked_fact_ids: list[str] = field(default_factory=list)
    # Token length of ``text``.
    token_count: int = 0
    # Relevance score associated with the passage.
    relevance_score: float = 0.0

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict snapshot; the fact-id list is copied."""
        return {
            "passage_id": self.passage_id,
            "text": self.text,
            "source_window": self.source_window,
            "token_offset_start": self.token_offset_start,
            "token_offset_end": self.token_offset_end,
            "linked_fact_ids": list(self.linked_fact_ids),
            "token_count": self.token_count,
            "relevance_score": self.relevance_score,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> SourcePassage:
        """Build a passage from a dict such as the one ``to_dict`` returns.

        Missing keys fall back to the field defaults. The fact-id list is
        copied so the new passage never shares a list with ``data``; a
        missing or ``None`` value becomes an empty list.
        """
        return cls(
            passage_id=data.get("passage_id", ""),
            text=data.get("text", ""),
            source_window=data.get("source_window", 0),
            token_offset_start=data.get("token_offset_start", 0),
            token_offset_end=data.get("token_offset_end", 0),
            linked_fact_ids=list(data.get("linked_fact_ids") or []),
            token_count=data.get("token_count", 0),
            relevance_score=data.get("relevance_score", 0.0),
        )
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ---------------------------------------------------------------------------
|
|
78
|
+
# SourceGroundingEngine
|
|
79
|
+
# ---------------------------------------------------------------------------
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class SourceGroundingEngine:
    """Store and retrieve verbatim source passages for high-confidence facts.

    Passages are kept in a dict keyed by passage id, with a second index
    from fact id to the ids of the passages linked to that fact.
    """

    def __init__(
        self,
        count_tokens: Callable[[str], int] | None = None,
    ) -> None:
        """Create an empty engine.

        Args:
            count_tokens: Token counter for passage and fact text. When
                omitted, a rough estimate of ``len(text) // 4`` is used.
        """
        self._count_tokens = count_tokens or (lambda t: len(t) // 4)
        self._passages: dict[str, SourcePassage] = {}  # passage_id → passage
        self._fact_to_passages: dict[str, list[str]] = {}  # fact_id → [passage_ids]

    @property
    def passage_count(self) -> int:
        """Number of passages currently stored."""
        return len(self._passages)

    def store_passage(
        self,
        passage: SourcePassage,
        fact_confidence: float = 0.0,
    ) -> bool:
        """Store a passage if linked fact has confidence ≥ threshold.

        The passage's ``token_count`` is recomputed from its text. Storing
        a passage id again replaces the stored passage without adding a
        second entry to the fact index.

        Returns True if stored, False if below threshold.
        """
        if fact_confidence < HIGH_CONFIDENCE_THRESHOLD:
            return False

        passage.token_count = self._count_tokens(passage.text)
        self._passages[passage.passage_id] = passage
        for fid in passage.linked_fact_ids:
            linked = self._fact_to_passages.setdefault(fid, [])
            # Keep the index duplicate-free so a re-stored passage is not
            # returned (and budgeted) twice for the same fact.
            if passage.passage_id not in linked:
                linked.append(passage.passage_id)
        return True

    def get_passages_for_fact(self, fact_id: str) -> list[SourcePassage]:
        """Retrieve all source passages linked to a fact.

        Indexed ids without a stored passage are skipped.
        """
        pids = self._fact_to_passages.get(fact_id, [])
        return [self._passages[pid] for pid in pids if pid in self._passages]

    def build_source_grounded_envelope(
        self,
        scored_facts: list[dict[str, Any]],
        budget_tokens: int,
        quality_tier: str = "B",
    ) -> tuple[list[dict[str, Any]], list[SourcePassage]]:
        """Build envelope with source passages allocated by tier.

        Args:
            scored_facts: Sorted list of {"id", "text", "score", ...}
            budget_tokens: Total envelope budget in tokens.
            quality_tier: S/A/B/C/D; unknown tiers use a 70/30 split.

        Returns:
            (packed_facts, included_passages). Each passage is included at
            most once, even when it is linked to several packed facts.
        """
        fact_pct, source_pct = BUDGET_BY_TIER.get(quality_tier, (0.70, 0.30))
        fact_budget = int(budget_tokens * fact_pct)
        source_budget = int(budget_tokens * source_pct)

        # Phase 1: Pack facts within fact budget
        packed_facts: list[dict[str, Any]] = []
        tokens_used = 0
        for fact in scored_facts:
            t = self._count_tokens(fact.get("text", ""))
            if tokens_used + t > fact_budget:
                break
            packed_facts.append(fact)
            tokens_used += t

        # Phase 2: Allocate source passages for high-relevance packed facts
        included_passages: list[SourcePassage] = []
        included_ids: set[str] = set()
        source_tokens_used = 0

        for fact in packed_facts:
            fid = fact.get("id", "")
            score = fact.get("score", 0.0)
            if score < HIGH_RELEVANCE_THRESHOLD:
                continue

            for passage in self.get_passages_for_fact(fid):
                # A passage shared by several facts must not be emitted or
                # charged against the source budget more than once.
                if passage.passage_id in included_ids:
                    continue
                if source_tokens_used + passage.token_count > source_budget:
                    break
                included_passages.append(passage)
                included_ids.add(passage.passage_id)
                source_tokens_used += passage.token_count

        return packed_facts, included_passages

    def format_envelope_section(
        self,
        fact: dict[str, Any],
        passages: list[SourcePassage],
    ) -> str:
        """Format a fact with its source passages for envelope inclusion.

        Format:
            - {fact text} — Window N
            ↳ [SOURCE: Window N, tokens X-Y]
            "{verbatim original text}"
        """
        # NOTE(review): leading whitespace inside the literals below is
        # reproduced exactly as it appears in SOURCE — confirm against the
        # packaged file.
        lines = [f"- {fact.get('text', '')} — Window {fact.get('window', '?')}"]
        for p in passages:
            lines.append(
                f" ↳ [SOURCE: Window {p.source_window}, "
                f"tokens {p.token_offset_start}-{p.token_offset_end}]"
            )
            lines.append(f' "{p.text}"')
        return "\n".join(lines)

    def to_dict(self) -> dict[str, Any]:
        """Serialize for persistence.

        The returned structure shares no mutable containers with the
        engine, so later changes to either side do not leak across.
        """
        return {
            "passages": {pid: p.to_dict() for pid, p in self._passages.items()},
            "fact_to_passages": {
                fid: list(pids) for fid, pids in self._fact_to_passages.items()
            },
        }

    @classmethod
    def from_dict(
        cls,
        data: dict[str, Any],
        count_tokens: Callable[[str], int] | None = None,
    ) -> SourceGroundingEngine:
        """Restore from serialized state.

        The fact index is copied so the engine never aliases ``data``.
        """
        engine = cls(count_tokens=count_tokens)
        for pid, pdata in data.get("passages", {}).items():
            engine._passages[pid] = SourcePassage.from_dict(pdata)
        engine._fact_to_passages = {
            fid: list(pids)
            for fid, pids in data.get("fact_to_passages", {}).items()
        }
        return engine
|
crp/ckf/__init__.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Contextual Knowledge Fabric — 4-mode retrieval, community detection, pub/sub."""
|
|
4
|
+
|
|
5
|
+
from .community import Community, CommunityDetector, CommunityResult
|
|
6
|
+
from .fabric import CKFConfig, CKFHealth, ContextualKnowledgeFabric
|
|
7
|
+
from .gc import GarbageCollector, GCResult
|
|
8
|
+
from .graph_walk import GraphWalkResult, graph_walk
|
|
9
|
+
from .merge import MergedFact, MergeResult, multi_mode_merge
|
|
10
|
+
from .pattern_query import PatternQueryResult, pattern_query
|
|
11
|
+
from .pubsub import CKFEvent, CKFEventType, PubSubEventBus
|
|
12
|
+
from .semantic import SemanticResult, semantic_fallback
|
|
13
|
+
|
|
14
|
+
# Names exported by ``from crp.ckf import *``; every entry is one of the
# names imported at the top of this module.
__all__ = [
    "CKFConfig", "CKFEvent", "CKFEventType", "CKFHealth",
    "Community", "CommunityDetector", "CommunityResult",
    "ContextualKnowledgeFabric",
    "GCResult", "GarbageCollector", "GraphWalkResult",
    "MergeResult", "MergedFact",
    "PatternQueryResult", "PubSubEventBus", "SemanticResult",
    "graph_walk", "multi_mode_merge", "pattern_query", "semantic_fallback",
]
|