crprotocol 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crp/__init__.py +126 -0
- crp/__main__.py +8 -0
- crp/_typing.py +27 -0
- crp/_version.py +5 -0
- crp/adapters.py +31 -0
- crp/advanced/__init__.py +40 -0
- crp/advanced/auto_ingest.py +400 -0
- crp/advanced/cqs.py +235 -0
- crp/advanced/cross_window.py +477 -0
- crp/advanced/curator.py +265 -0
- crp/advanced/feedback.py +146 -0
- crp/advanced/hierarchical.py +211 -0
- crp/advanced/meta_learning.py +401 -0
- crp/advanced/parallel.py +98 -0
- crp/advanced/review_cycle.py +329 -0
- crp/advanced/scale_mode.py +129 -0
- crp/advanced/source_grounding.py +207 -0
- crp/ckf/__init__.py +35 -0
- crp/ckf/community.py +377 -0
- crp/ckf/fabric.py +445 -0
- crp/ckf/gc.py +175 -0
- crp/ckf/graph_walk.py +87 -0
- crp/ckf/merge.py +133 -0
- crp/ckf/pattern_query.py +122 -0
- crp/ckf/pubsub.py +128 -0
- crp/ckf/semantic.py +207 -0
- crp/cli/__init__.py +7 -0
- crp/cli/main.py +329 -0
- crp/cli/sidecar.py +929 -0
- crp/cli/startup.py +272 -0
- crp/continuation/__init__.py +103 -0
- crp/continuation/completion.py +348 -0
- crp/continuation/degradation.py +157 -0
- crp/continuation/document_map.py +160 -0
- crp/continuation/flow.py +109 -0
- crp/continuation/gap.py +419 -0
- crp/continuation/manager.py +484 -0
- crp/continuation/quality_monitor.py +179 -0
- crp/continuation/stitch.py +419 -0
- crp/continuation/trigger.py +142 -0
- crp/continuation/voice.py +157 -0
- crp/core/__init__.py +69 -0
- crp/core/batch.py +77 -0
- crp/core/circuit_breaker.py +116 -0
- crp/core/config.py +377 -0
- crp/core/context_tools.py +540 -0
- crp/core/dispatch_router.py +3977 -0
- crp/core/errors.py +128 -0
- crp/core/extraction_facade.py +384 -0
- crp/core/facilitator.py +713 -0
- crp/core/idempotency.py +215 -0
- crp/core/orchestrator.py +1435 -0
- crp/core/relay_strategies.py +613 -0
- crp/core/security_manager.py +140 -0
- crp/core/session.py +134 -0
- crp/core/task_intent.py +36 -0
- crp/core/window.py +363 -0
- crp/envelope/__init__.py +30 -0
- crp/envelope/builder.py +288 -0
- crp/envelope/decomposer.py +236 -0
- crp/envelope/formatter.py +168 -0
- crp/envelope/packer.py +211 -0
- crp/envelope/reranker.py +209 -0
- crp/envelope/scoring.py +310 -0
- crp/extraction/__init__.py +45 -0
- crp/extraction/complexity.py +96 -0
- crp/extraction/contradiction.py +132 -0
- crp/extraction/pipeline.py +360 -0
- crp/extraction/quality_gate.py +237 -0
- crp/extraction/stage1_regex.py +173 -0
- crp/extraction/stage2_statistical.py +244 -0
- crp/extraction/stage3_gliner.py +210 -0
- crp/extraction/stage4_uie.py +183 -0
- crp/extraction/stage5_discourse.py +175 -0
- crp/extraction/stage6_llm.py +178 -0
- crp/extraction/structured_output.py +219 -0
- crp/extraction/types.py +299 -0
- crp/license_guard.py +722 -0
- crp/observability/__init__.py +30 -0
- crp/observability/audit.py +118 -0
- crp/observability/events.py +233 -0
- crp/observability/metrics.py +264 -0
- crp/observability/quality.py +135 -0
- crp/observability/structured_logging.py +81 -0
- crp/observability/telemetry.py +117 -0
- crp/provenance/__init__.py +314 -0
- crp/provenance/_embeddings.py +97 -0
- crp/provenance/_types.py +378 -0
- crp/provenance/attribution_scorer.py +252 -0
- crp/provenance/claim_detector.py +229 -0
- crp/provenance/contradiction_detector.py +243 -0
- crp/provenance/distortion_detector.py +397 -0
- crp/provenance/entailment_verifier.py +358 -0
- crp/provenance/fabrication_detector.py +203 -0
- crp/provenance/hallucination_scorer.py +320 -0
- crp/provenance/omission_analyzer.py +106 -0
- crp/provenance/provenance_chain.py +205 -0
- crp/provenance/report_generator.py +440 -0
- crp/providers/__init__.py +43 -0
- crp/providers/anthropic.py +270 -0
- crp/providers/base.py +135 -0
- crp/providers/custom.py +63 -0
- crp/providers/diagnostic.py +251 -0
- crp/providers/llamacpp.py +224 -0
- crp/providers/manager.py +139 -0
- crp/providers/ollama.py +243 -0
- crp/providers/openai.py +628 -0
- crp/providers/tokenizers.py +48 -0
- crp/py.typed +0 -0
- crp/resources/__init__.py +53 -0
- crp/resources/adaptive_allocator.py +525 -0
- crp/resources/cost_model.py +388 -0
- crp/resources/overhead_manager.py +217 -0
- crp/resources/resource_manager.py +262 -0
- crp/schemas/__init__.py +20 -0
- crp/schemas/cost-estimate.json +33 -0
- crp/schemas/crp-error.json +43 -0
- crp/schemas/envelope-preview.json +40 -0
- crp/schemas/persisted-state-header.json +27 -0
- crp/schemas/quality-report.json +94 -0
- crp/schemas/session-handle.json +33 -0
- crp/schemas/session-status.json +57 -0
- crp/schemas/stream-event.json +18 -0
- crp/schemas/task-intent.json +42 -0
- crp/security/__init__.py +93 -0
- crp/security/audit_trail.py +392 -0
- crp/security/binding.py +192 -0
- crp/security/compliance.py +813 -0
- crp/security/consent.py +593 -0
- crp/security/embedding_defense.py +161 -0
- crp/security/encryption.py +202 -0
- crp/security/injection.py +335 -0
- crp/security/integrity.py +267 -0
- crp/security/privacy.py +662 -0
- crp/security/quarantine.py +249 -0
- crp/security/rbac.py +221 -0
- crp/security/validation.py +164 -0
- crp/state/__init__.py +31 -0
- crp/state/cold_storage.py +258 -0
- crp/state/compaction.py +263 -0
- crp/state/critical_state.py +104 -0
- crp/state/event_log.py +313 -0
- crp/state/fact.py +189 -0
- crp/state/serialization.py +189 -0
- crp/state/session_cleanup.py +77 -0
- crp/state/snapshot.py +290 -0
- crp/state/warm_store.py +346 -0
- crprotocol-2.0.0.dist-info/METADATA +1295 -0
- crprotocol-2.0.0.dist-info/RECORD +153 -0
- crprotocol-2.0.0.dist-info/WHEEL +4 -0
- crprotocol-2.0.0.dist-info/entry_points.txt +2 -0
- crprotocol-2.0.0.dist-info/licenses/LICENSE.md +170 -0
- crprotocol-2.0.0.dist-info/licenses/NOTICE +18 -0
crp/advanced/curator.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""LLM context curation — progressive understanding synthesis (§18).
|
|
4
|
+
|
|
5
|
+
Periodically dispatches curation windows to build an evolving synthesis
|
|
6
|
+
of findings, relationships, and gaps. Injected into envelopes as
|
|
7
|
+
Section 1.5 between CRITICAL STATE and DISCOVERIES.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import time
|
|
13
|
+
import uuid
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
# Constants
|
|
20
|
+
# ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
# Per-tier curation settings, keyed by quality tier.
# Each value is a pair: (curation interval, max_tokens).
TIER_CONFIG: dict[str, tuple[int, int]] = {
    "A": (5, 500),
    "B": (5, 800),
    "C": (10, 1000),
    "D": (20, 1500),
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
# Data types
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass
class LLMSynthesis:
    """A single curated synthesis produced from an LLM review of facts.

    Instances round-trip through :meth:`to_dict` / :meth:`from_dict` as
    plain dictionaries keyed by field name.
    """

    # Unique identifier; a fresh UUID4 string unless supplied.
    synthesis_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    # Raw synthesis text.
    text: str = ""
    # Window index this synthesis was recorded for.
    window_index: int = 0
    # ``synthesis_id`` of the synthesis this one replaces, if any.
    supersedes: str | None = None
    # Revision counter; starts at 1.
    evolution_count: int = 1
    # Structured sections; ``None`` when nothing was recorded for them.
    critical_findings: list[str] | None = None
    key_relationships: list[str] | None = None
    gaps: list[str] | None = None
    confidence: float = 1.0
    # Creation time as returned by ``time.time()``.
    created_at: float = field(default_factory=time.time)

    def to_dict(self) -> dict[str, Any]:
        """Return every field as a plain dict, in declaration order.

        List-valued fields are returned by reference, not copied.
        """
        exported = (
            "synthesis_id",
            "text",
            "window_index",
            "supersedes",
            "evolution_count",
            "critical_findings",
            "key_relationships",
            "gaps",
            "confidence",
            "created_at",
        )
        return {name: getattr(self, name) for name in exported}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> LLMSynthesis:
        """Build a synthesis from a dict such as :meth:`to_dict` produces.

        Missing keys fall back to the values below. Note that a missing
        ``created_at`` becomes ``0.0`` (not the current time) and a missing
        ``synthesis_id`` gets a newly generated UUID4 string.
        """
        fallbacks: dict[str, Any] = {
            "text": "",
            "window_index": 0,
            "supersedes": None,
            "evolution_count": 1,
            "critical_findings": None,
            "key_relationships": None,
            "gaps": None,
            "confidence": 1.0,
            "created_at": 0.0,
        }
        kwargs = {name: data.get(name, default) for name, default in fallbacks.items()}
        if "synthesis_id" in data:
            kwargs["synthesis_id"] = data["synthesis_id"]
        else:
            kwargs["synthesis_id"] = str(uuid.uuid4())
        return cls(**kwargs)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass
class CurationConfig:
    """Settings that control LLM curation."""

    # Master switch for curation.
    enabled: bool = True
    # Curation interval, counted in windows.
    curation_interval: int = 5
    # Upper bound on synthesis size, in tokens.
    max_synthesis_tokens: int = 1500
    # Whether later curations revise the previous synthesis.
    progressive: bool = True
    # Quality tier name (the tier table in this module uses "A".."D").
    quality_tier: str = "B"
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ---------------------------------------------------------------------------
|
|
93
|
+
# LLMContextCurator
|
|
94
|
+
# ---------------------------------------------------------------------------
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class LLMContextCurator:
    """LLM-driven context curation with progressive understanding.

    The curator sends a prompt to ``dispatch_fn`` and parses the reply into
    an :class:`LLMSynthesis`. The first run is an initial curation; later
    runs revise the previous synthesis when ``config.progressive`` is set.
    Every synthesis is kept in an in-memory history.
    """

    # (keyword, section) pairs used to recognise section headings in LLM
    # output. When several keywords occur on one line, the one that appears
    # first in the line decides the section.
    _SECTION_KEYWORDS = (
        ("finding", "findings"),
        ("critical", "findings"),
        ("relationship", "relationships"),
        ("gap", "gaps"),
        ("missing", "gaps"),
        ("assessment", "assessment"),
    )

    def __init__(
        self,
        dispatch_fn: Callable[[str, str], tuple[str, Any]] | None = None,
        config: CurationConfig | None = None,
    ) -> None:
        """Create a curator.

        Args:
            dispatch_fn: Callable taking two strings and returning a tuple
                whose first element is the reply text. When ``None``,
                :meth:`curate` returns ``None`` without doing anything.
            config: Curation settings; defaults to ``CurationConfig()``.
        """
        self._dispatch_fn = dispatch_fn
        self.config = config or CurationConfig()
        self._current_synthesis: LLMSynthesis | None = None
        self._synthesis_history: list[LLMSynthesis] = []

    @property
    def current_synthesis(self) -> LLMSynthesis | None:
        """Most recent synthesis, or ``None`` before the first curation."""
        return self._current_synthesis

    @property
    def evolution_count(self) -> int:
        """Evolution count of the current synthesis, or 0 if there is none."""
        if self._current_synthesis:
            return self._current_synthesis.evolution_count
        return 0

    def should_curate(self, window_index: int) -> bool:
        """Check if curation should run at this window.

        The interval comes from ``TIER_CONFIG`` for the configured quality
        tier; ``config.curation_interval`` is used only when the tier is not
        in that table. Returns ``False`` when curation is disabled, when the
        interval is not positive, or for window 0.
        """
        if not self.config.enabled:
            return False
        interval, _ = TIER_CONFIG.get(
            self.config.quality_tier,
            (self.config.curation_interval, self.config.max_synthesis_tokens),
        )
        # A zero interval would raise ZeroDivisionError in the modulo below;
        # treat any non-positive interval as "never curate".
        if interval <= 0:
            return False
        return window_index > 0 and window_index % interval == 0

    def curate(
        self,
        window_index: int,
        top_facts: list[str],
        recent_output_summary: str = "",
    ) -> LLMSynthesis | None:
        """Run curation (initial or progressive).

        Returns the new synthesis, or ``None`` if no dispatch function was
        provided.
        """
        if not self._dispatch_fn:
            return None

        if self.config.progressive and self._current_synthesis:
            return self._progressive_curation(
                window_index, top_facts, recent_output_summary,
            )
        return self._initial_curation(window_index, top_facts, recent_output_summary)

    def _store(self, synthesis: LLMSynthesis) -> LLMSynthesis:
        """Make ``synthesis`` current, append it to the history, return it."""
        self._current_synthesis = synthesis
        self._synthesis_history.append(synthesis)
        return synthesis

    def _initial_curation(
        self,
        window_index: int,
        top_facts: list[str],
        recent_output_summary: str,
    ) -> LLMSynthesis:
        """First curation — no prior synthesis to build on.

        At most the first 40 facts are sent, as the second dispatch
        argument; the recent output is truncated to 1000 characters.
        """
        facts_text = "\n".join(f"- {f}" for f in top_facts[:40])
        prompt = (
            "Analyze the extracted facts and provide:\n"
            "1. 5 most critical findings\n"
            "2. 3 key relationships between findings\n"
            "3. Current assessment\n"
            "4. What's missing / gaps\n\n"
            f"Recent output:\n{recent_output_summary[:1000]}\n\n"
            "Be concise."
        )

        output, _ = self._dispatch_fn(prompt, facts_text)  # type: ignore[misc]
        return self._store(self._parse_synthesis(output, window_index))

    def _progressive_curation(
        self,
        window_index: int,
        top_facts: list[str],
        recent_output_summary: str,
    ) -> LLMSynthesis:
        """Progressive curation — revise the previous synthesis.

        The previous text (first 1500 characters), up to 40 facts and the
        recent output (first 500 characters) are all embedded in the prompt;
        the second dispatch argument is an empty string.
        """
        prev = self._current_synthesis
        facts_text = "\n".join(f"- {f}" for f in top_facts[:40])
        prompt = (
            "Revise your previous synthesis based on new facts.\n"
            f"Previous synthesis:\n{prev.text[:1500] if prev else ''}\n\n"
            "Update:\n"
            "1. Revised critical findings\n"
            "2. Updated relationships\n"
            "3. Updated assessment\n"
            "4. New gaps identified\n\n"
            f"New facts since last synthesis:\n{facts_text}\n\n"
            f"Recent output:\n{recent_output_summary[:500]}\n\n"
            "Be concise."
        )

        output, _ = self._dispatch_fn(prompt, "")  # type: ignore[misc]
        synthesis = self._parse_synthesis(output, window_index)
        synthesis.supersedes = prev.synthesis_id if prev else None
        synthesis.evolution_count = (prev.evolution_count + 1) if prev else 1
        return self._store(synthesis)

    @classmethod
    def _detect_section(cls, line_lower: str) -> str:
        """Return the section named by a lower-cased heading, or ``""``.

        The keyword that occurs earliest in the line wins, so
        "key relationships between findings" maps to ``relationships``.
        """
        best_pos = len(line_lower)
        best_section = ""
        for keyword, section in cls._SECTION_KEYWORDS:
            pos = line_lower.find(keyword)
            if 0 <= pos < best_pos:
                best_pos, best_section = pos, section
        return best_section

    def _parse_synthesis(self, output: str, window_index: int) -> LLMSynthesis:
        """Parse curation output into a structured synthesis.

        Non-bullet lines are scanned for section keywords and switch the
        current section. Lines starting with ``-`` or ``•`` are items of the
        current section. A bullet only acts as a heading when its text ends
        with a colon and contains a section keyword; otherwise a keyword
        inside an item (e.g. "- Critical data is missing") must not move the
        item into a different section. Empty items are dropped.
        """
        buckets: dict[str, list[str]] = {
            "findings": [],
            "relationships": [],
            "gaps": [],
        }

        section = ""
        for raw_line in output.split("\n"):
            stripped = raw_line.strip()
            if not stripped:
                continue

            is_bullet = stripped.startswith(("-", "•"))
            item = stripped.lstrip("-•").strip() if is_bullet else ""

            # Ignore trailing markdown emphasis when looking for the colon.
            heading_like = not is_bullet or item.rstrip("*_ ").endswith(":")
            if heading_like:
                detected = self._detect_section(stripped.lower())
                if detected:
                    section = detected
                    if is_bullet:
                        # The bullet was a heading, not content.
                        continue

            # Items under "assessment" or before any heading are not kept.
            if is_bullet and item and section in buckets:
                buckets[section].append(item)

        return LLMSynthesis(
            text=output,
            window_index=window_index,
            critical_findings=buckets["findings"] or None,
            key_relationships=buckets["relationships"] or None,
            gaps=buckets["gaps"] or None,
        )

    def format_for_envelope(self) -> str:
        """Format current synthesis for envelope injection (Section 1.5).

        Returns an empty string when there is no synthesis yet. Otherwise
        returns a header line followed by one line per non-empty section.
        """
        if not self._current_synthesis:
            return ""
        s = self._current_synthesis
        parts = [
            f"[LLM_SYNTHESIS (Window {s.window_index}, evolution {s.evolution_count})]",
        ]
        if s.critical_findings:
            parts.append("CRITICAL FINDINGS: " + "; ".join(s.critical_findings))
        if s.key_relationships:
            parts.append("KEY RELATIONSHIPS: " + "; ".join(s.key_relationships))
        if s.gaps:
            parts.append("GAPS: " + "; ".join(s.gaps))
        return "\n".join(parts)

    def to_dict(self) -> dict[str, Any]:
        """Return current synthesis, full history and config as plain dicts."""
        return {
            "current": self._current_synthesis.to_dict() if self._current_synthesis else None,
            "history": [s.to_dict() for s in self._synthesis_history],
            "config": {
                "enabled": self.config.enabled,
                "curation_interval": self.config.curation_interval,
                "max_synthesis_tokens": self.config.max_synthesis_tokens,
                "progressive": self.config.progressive,
                "quality_tier": self.config.quality_tier,
            },
        }
|
crp/advanced/feedback.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Human-in-the-loop feedback — fact override, confidence adjustment (§18, MAY)."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import time
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class FeedbackEntry:
    """Record of one human feedback action on a fact."""

    # Identifier of this feedback record.
    feedback_id: str = ""
    # Identifier of the fact the feedback applies to.
    fact_id: str = ""
    # One of "override", "boost", "penalize", "reject".
    action: str = ""
    original_text: str = ""
    # Replacement text for an override; ``None`` otherwise.
    corrected_text: str | None = None
    # Confidence change carried by this action.
    confidence_delta: float = 0.0
    reason: str = ""
    # Creation time as returned by ``time.time()``.
    timestamp: float = field(default_factory=time.time)
    applied: bool = False
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class FeedbackLoop:
    """Human-in-the-loop corrections for facts in warm state.

    Every action is logged as a :class:`FeedbackEntry`. Confidence changes
    accumulate per fact. A rejection is tracked separately from the additive
    adjustments so that a later boost or penalty cannot undo it.
    """

    def __init__(self) -> None:
        self._entries: list[FeedbackEntry] = []
        self._fact_adjustments: dict[str, float] = {}  # fact_id → cumulative delta
        # Facts that were explicitly rejected; their confidence stays 0.
        self._rejected: set[str] = set()

    @property
    def entry_count(self) -> int:
        """Number of feedback entries recorded so far."""
        return len(self._entries)

    def _record(
        self,
        fact_id: str,
        action: str,
        *,
        corrected_text: str | None = None,
        confidence_delta: float = 0.0,
        reason: str = "",
        applied: bool = True,
    ) -> FeedbackEntry:
        """Create, log and return a feedback entry with the next ``fb-N`` id."""
        entry = FeedbackEntry(
            feedback_id=f"fb-{len(self._entries)}",
            fact_id=fact_id,
            action=action,
            corrected_text=corrected_text,
            confidence_delta=confidence_delta,
            reason=reason,
            applied=applied,
        )
        self._entries.append(entry)
        return entry

    def _shift(self, fact_id: str, delta: float) -> bool:
        """Add ``delta`` to the fact's cumulative adjustment.

        Returns ``False`` and changes nothing when the fact was rejected.
        """
        if fact_id in self._rejected:
            return False
        self._fact_adjustments[fact_id] = (
            self._fact_adjustments.get(fact_id, 0.0) + delta
        )
        return True

    def override_fact(
        self,
        fact_id: str,
        corrected_text: str,
        reason: str = "",
    ) -> FeedbackEntry:
        """Record a human-provided replacement text for a fact.

        Only the entry is recorded here; no confidence adjustment is made.
        """
        return self._record(
            fact_id, "override", corrected_text=corrected_text, reason=reason,
        )

    def boost_confidence(
        self,
        fact_id: str,
        delta: float = 0.1,
        reason: str = "",
    ) -> FeedbackEntry:
        """Increase fact confidence based on human validation.

        On a rejected fact the entry is still logged, with
        ``applied=False``, and the adjustment is left untouched.
        """
        applied = self._shift(fact_id, delta)
        return self._record(
            fact_id, "boost", confidence_delta=delta, reason=reason, applied=applied,
        )

    def penalize_confidence(
        self,
        fact_id: str,
        delta: float = -0.2,
        reason: str = "",
    ) -> FeedbackEntry:
        """Decrease fact confidence based on human rejection.

        ``delta`` is added as given, so it is expected to be negative. On a
        rejected fact the entry is logged with ``applied=False``.
        """
        applied = self._shift(fact_id, delta)
        return self._record(
            fact_id, "penalize", confidence_delta=delta, reason=reason, applied=applied,
        )

    def reject_fact(
        self,
        fact_id: str,
        reason: str = "",
    ) -> FeedbackEntry:
        """Mark fact as rejected (confidence → 0).

        The stored adjustment is pinned to -1.0 and the fact is remembered
        as rejected, so later boosts or penalties do not change the outcome.
        """
        self._rejected.add(fact_id)
        self._fact_adjustments[fact_id] = -1.0  # Signal full rejection
        return self._record(
            fact_id, "reject", confidence_delta=-1.0, reason=reason,
        )

    def get_adjusted_confidence(
        self,
        fact_id: str,
        base_confidence: float,
    ) -> float:
        """Get confidence after applying all feedback adjustments.

        Returns 0.0 for rejected facts and for cumulative adjustments of
        -1.0 or lower; otherwise ``base_confidence + adjustment`` clamped to
        the range [0.0, 1.0].
        """
        if fact_id in self._rejected:
            return 0.0
        delta = self._fact_adjustments.get(fact_id, 0.0)
        if delta <= -1.0:
            return 0.0
        return max(0.0, min(1.0, base_confidence + delta))

    def get_entries_for_fact(self, fact_id: str) -> list[FeedbackEntry]:
        """Return all logged entries for ``fact_id`` in insertion order."""
        return [e for e in self._entries if e.fact_id == fact_id]

    def to_dict(self) -> dict[str, Any]:
        """Return entries and cumulative adjustments as plain dicts.

        ``original_text`` is not part of the exported entry dicts.
        """
        return {
            "entries": [
                {
                    "feedback_id": e.feedback_id,
                    "fact_id": e.fact_id,
                    "action": e.action,
                    "corrected_text": e.corrected_text,
                    "confidence_delta": e.confidence_delta,
                    "reason": e.reason,
                    "timestamp": e.timestamp,
                    "applied": e.applied,
                }
                for e in self._entries
            ],
            "adjustments": dict(self._fact_adjustments),
        }
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Hierarchical processing — map-reduce-validate for Tier C/D inputs (§4.5, §11).
|
|
4
|
+
|
|
5
|
+
Splits massive inputs into segments, processes each independently,
|
|
6
|
+
reduces iteratively, and validates cross-window consistency.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import math
|
|
12
|
+
from collections.abc import Callable
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
# ---------------------------------------------------------------------------
|
|
17
|
+
# Constants
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
|
|
20
|
+
# Default segment size as a multiple of the context window:
# segment_size = 100 × context_window.
DEFAULT_SEGMENT_SIZE_MULTIPLIER = 100
# Default fan-in for hierarchical processing.
DEFAULT_FAN_IN = 50
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
# Data types
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class HierarchicalPlan:
    """Describes how an oversized input will be processed hierarchically."""

    # Token count of the whole input.
    total_tokens: int = 0
    # Number of segments the input is divided into.
    segment_count: int = 0
    # Size of one segment.
    segment_size: int = 0
    # Fan-in used when merging.
    fan_in: int = DEFAULT_FAN_IN
    # Number of hierarchy levels.
    hierarchy_levels: int = 1
    # Estimated degradation for the plan.
    estimated_degradation: float = 0.0
    # Name of the processing mode.
    processing_mode: str = "hierarchical"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class HierarchicalConfig:
    """Optional overrides for hierarchical processing."""

    # Explicit segment size; ``None`` means no override.
    segment_size: int | None = None
    # Explicit fan-in; ``None`` means no override.
    fan_in: int | None = None
    # Context window size.
    context_window: int = 128_000
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class SegmentResult:
    """Result of processing a single segment."""

    # Zero-based index of the segment.
    segment_index: int = 0
    # Text produced for the segment.
    synthesis: str = ""
    # Count of facts extracted from the segment.
    facts_extracted: int = 0
    # Token count recorded for the segment.
    token_count: int = 0
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
# Degradation model
|
|
63
|
+
# ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def chain_degradation(levels: int, per_level: float = 0.03) -> float:
    """Return the cumulative degradation after ``levels`` hierarchy levels.

    Each level retains a fraction ``1 - per_level``; the result is what has
    been lost after compounding that over all levels:

        d_chain(L) = 1 - (1 - per_level)^L
    """
    retained = (1.0 - per_level) ** levels
    return 1.0 - retained
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def effective_context(
    context_window: int, levels: int, per_level: float = 0.03,
) -> float:
    """Return the context capacity left after hierarchical degradation.

    The context window is scaled by the fraction that survives the chain
    degradation over ``levels`` levels:

        EffCtx_hier(N) = C × (1 - d_chain(⌈log_k(N)⌉))
    """
    loss = chain_degradation(levels, per_level)
    surviving_fraction = 1.0 - loss
    return context_window * surviving_fraction
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
# HierarchicalProcessor
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class HierarchicalProcessor:
    """Map-reduce-validate pattern for oversized inputs.

    Input is cut into segments, each segment is summarised through
    ``dispatch_fn`` (map), and the summaries are merged in batches until few
    enough remain (reduce). Without a dispatch function both phases fall
    back to plain truncation.
    """

    def __init__(
        self,
        dispatch_fn: Callable[[str, str], tuple[str, Any]] | None = None,
        count_tokens: Callable[[str], int] | None = None,
        context_window: int = 128_000,
    ) -> None:
        """Create a processor.

        Args:
            dispatch_fn: Callable taking ``(prompt, content)`` and returning
                a tuple whose first element is the output text.
            count_tokens: Token counter; defaults to ``len(text) // 4``.
            context_window: Context window size used for planning.
        """
        self._dispatch_fn = dispatch_fn
        self._count_tokens = count_tokens or (lambda t: len(t) // 4)
        self._context_window = context_window

    @staticmethod
    def _levels_needed(segment_count: int, fan_in: int) -> int:
        """Return ⌈log_k(n)⌉ with k and n both floored at 2, minimum 1.

        Integer arithmetic is used instead of ``ceil(log(n) / log(k))``
        because the float quotient can land just above an exact integer
        (for example n = k**3) and then round up one level too many.
        """
        width = max(fan_in, 2)
        target = max(segment_count, 2)
        levels = 1
        capacity = width
        while capacity < target:
            capacity *= width
            levels += 1
        return levels

    def plan(
        self, total_tokens: int, config: HierarchicalConfig | None = None,
    ) -> HierarchicalPlan:
        """Create a hierarchical processing plan.

        The segment size is ``config.segment_size`` when set, otherwise
        ``DEFAULT_SEGMENT_SIZE_MULTIPLIER`` times the processor's context
        window. ``config.context_window`` is not consulted here. The mode
        becomes ``"hierarchical_multi_level"`` when ``total_tokens`` exceeds
        1000 context windows.
        """
        cfg = config or HierarchicalConfig()
        seg_size = cfg.segment_size or (DEFAULT_SEGMENT_SIZE_MULTIPLIER * self._context_window)
        fan_in = cfg.fan_in or DEFAULT_FAN_IN

        # A non-positive segment size cannot split anything; plan a single
        # segment instead of dividing by zero.
        if seg_size > 0:
            segment_count = max(1, math.ceil(total_tokens / seg_size))
        else:
            segment_count = 1
        levels = self._levels_needed(segment_count, fan_in)
        degradation = chain_degradation(levels)

        mode = "hierarchical"
        if total_tokens > 1000 * self._context_window:
            mode = "hierarchical_multi_level"

        return HierarchicalPlan(
            total_tokens=total_tokens,
            segment_count=segment_count,
            segment_size=seg_size,
            fan_in=fan_in,
            hierarchy_levels=levels,
            estimated_degradation=degradation,
            processing_mode=mode,
        )

    def map_phase(
        self,
        segments: list[str],
        task_intent: str,
    ) -> list[SegmentResult]:
        """MAP: Process each segment independently.

        Without a dispatch function the first 500 characters of the segment
        stand in for the summary. ``facts_extracted`` is always recorded
        as 0 here.
        """
        results: list[SegmentResult] = []
        total = len(segments)
        for i, segment in enumerate(segments):
            if self._dispatch_fn:
                prompt = (
                    "Summarize and extract ALL key facts from the following "
                    f"segment ({i + 1}/{total}) for: {task_intent}"
                )
                output, _ = self._dispatch_fn(prompt, segment)
            else:
                # Fallback: take first 500 chars as summary
                output = segment[:500]

            results.append(SegmentResult(
                segment_index=i,
                synthesis=output,
                facts_extracted=0,
                token_count=self._count_tokens(output),
            ))
        return results

    def reduce_phase(
        self,
        syntheses: list[str],
        task_intent: str,
        fan_in: int = DEFAULT_FAN_IN,
    ) -> list[str]:
        """REDUCE: Iteratively merge syntheses until ≤ fan_in remain.

        Batches hold at least two syntheses even when ``fan_in`` is below 2,
        since a batch of one can never shrink the list. The stopping target
        is floored at 1. For ``fan_in >= 2`` this is exactly "batches of
        ``fan_in`` until at most ``fan_in`` remain". Without a dispatch
        function each merged batch is truncated to 1000 characters.
        """
        batch_width = max(fan_in, 2)
        target = max(fan_in, 1)

        current = syntheses
        while len(current) > target:
            next_level: list[str] = []
            for start in range(0, len(current), batch_width):
                batch = current[start:start + batch_width]
                joined = "\n\n---\n\n".join(batch)
                if self._dispatch_fn:
                    prompt = (
                        f"Synthesize these {len(batch)} segment summaries "
                        f"into a coherent overview for: {task_intent}"
                    )
                    output, _ = self._dispatch_fn(prompt, joined)
                else:
                    output = joined[:1000]
                next_level.append(output)
            current = next_level
        return current

    def hierarchical_dispatch(
        self,
        task_intent: str,
        large_input: str,
        config: HierarchicalConfig | None = None,
    ) -> tuple[list[str], HierarchicalPlan]:
        """Full map-reduce-validate pipeline for oversized input.

        The input is cut by character position into ``plan.segment_count``
        equal slices; the last slice absorbs the remainder.

        Returns (final_syntheses, plan).
        """
        total_tokens = self._count_tokens(large_input)
        plan = self.plan(total_tokens, config)

        # Segment the input
        seg_char_size = len(large_input) // max(plan.segment_count, 1)
        last = plan.segment_count - 1
        segments: list[str] = []
        for i in range(plan.segment_count):
            start = i * seg_char_size
            end = start + seg_char_size if i < last else len(large_input)
            segments.append(large_input[start:end])

        # MAP
        map_results = self.map_phase(segments, task_intent)

        # REDUCE
        syntheses = [r.synthesis for r in map_results]
        reduced = self.reduce_phase(syntheses, task_intent, plan.fan_in)

        return reduced, plan
|