crprotocol 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. crp/__init__.py +126 -0
  2. crp/__main__.py +8 -0
  3. crp/_typing.py +27 -0
  4. crp/_version.py +5 -0
  5. crp/adapters.py +31 -0
  6. crp/advanced/__init__.py +40 -0
  7. crp/advanced/auto_ingest.py +400 -0
  8. crp/advanced/cqs.py +235 -0
  9. crp/advanced/cross_window.py +477 -0
  10. crp/advanced/curator.py +265 -0
  11. crp/advanced/feedback.py +146 -0
  12. crp/advanced/hierarchical.py +211 -0
  13. crp/advanced/meta_learning.py +401 -0
  14. crp/advanced/parallel.py +98 -0
  15. crp/advanced/review_cycle.py +329 -0
  16. crp/advanced/scale_mode.py +129 -0
  17. crp/advanced/source_grounding.py +207 -0
  18. crp/ckf/__init__.py +35 -0
  19. crp/ckf/community.py +377 -0
  20. crp/ckf/fabric.py +445 -0
  21. crp/ckf/gc.py +175 -0
  22. crp/ckf/graph_walk.py +87 -0
  23. crp/ckf/merge.py +133 -0
  24. crp/ckf/pattern_query.py +122 -0
  25. crp/ckf/pubsub.py +128 -0
  26. crp/ckf/semantic.py +207 -0
  27. crp/cli/__init__.py +7 -0
  28. crp/cli/main.py +329 -0
  29. crp/cli/sidecar.py +929 -0
  30. crp/cli/startup.py +272 -0
  31. crp/continuation/__init__.py +103 -0
  32. crp/continuation/completion.py +348 -0
  33. crp/continuation/degradation.py +157 -0
  34. crp/continuation/document_map.py +160 -0
  35. crp/continuation/flow.py +109 -0
  36. crp/continuation/gap.py +419 -0
  37. crp/continuation/manager.py +484 -0
  38. crp/continuation/quality_monitor.py +179 -0
  39. crp/continuation/stitch.py +419 -0
  40. crp/continuation/trigger.py +142 -0
  41. crp/continuation/voice.py +157 -0
  42. crp/core/__init__.py +69 -0
  43. crp/core/batch.py +77 -0
  44. crp/core/circuit_breaker.py +116 -0
  45. crp/core/config.py +377 -0
  46. crp/core/context_tools.py +540 -0
  47. crp/core/dispatch_router.py +3977 -0
  48. crp/core/errors.py +128 -0
  49. crp/core/extraction_facade.py +384 -0
  50. crp/core/facilitator.py +713 -0
  51. crp/core/idempotency.py +215 -0
  52. crp/core/orchestrator.py +1435 -0
  53. crp/core/relay_strategies.py +613 -0
  54. crp/core/security_manager.py +140 -0
  55. crp/core/session.py +134 -0
  56. crp/core/task_intent.py +36 -0
  57. crp/core/window.py +363 -0
  58. crp/envelope/__init__.py +30 -0
  59. crp/envelope/builder.py +288 -0
  60. crp/envelope/decomposer.py +236 -0
  61. crp/envelope/formatter.py +168 -0
  62. crp/envelope/packer.py +211 -0
  63. crp/envelope/reranker.py +209 -0
  64. crp/envelope/scoring.py +310 -0
  65. crp/extraction/__init__.py +45 -0
  66. crp/extraction/complexity.py +96 -0
  67. crp/extraction/contradiction.py +132 -0
  68. crp/extraction/pipeline.py +360 -0
  69. crp/extraction/quality_gate.py +237 -0
  70. crp/extraction/stage1_regex.py +173 -0
  71. crp/extraction/stage2_statistical.py +244 -0
  72. crp/extraction/stage3_gliner.py +210 -0
  73. crp/extraction/stage4_uie.py +183 -0
  74. crp/extraction/stage5_discourse.py +175 -0
  75. crp/extraction/stage6_llm.py +178 -0
  76. crp/extraction/structured_output.py +219 -0
  77. crp/extraction/types.py +299 -0
  78. crp/license_guard.py +722 -0
  79. crp/observability/__init__.py +30 -0
  80. crp/observability/audit.py +118 -0
  81. crp/observability/events.py +233 -0
  82. crp/observability/metrics.py +264 -0
  83. crp/observability/quality.py +135 -0
  84. crp/observability/structured_logging.py +81 -0
  85. crp/observability/telemetry.py +117 -0
  86. crp/provenance/__init__.py +314 -0
  87. crp/provenance/_embeddings.py +97 -0
  88. crp/provenance/_types.py +378 -0
  89. crp/provenance/attribution_scorer.py +252 -0
  90. crp/provenance/claim_detector.py +229 -0
  91. crp/provenance/contradiction_detector.py +243 -0
  92. crp/provenance/distortion_detector.py +397 -0
  93. crp/provenance/entailment_verifier.py +358 -0
  94. crp/provenance/fabrication_detector.py +203 -0
  95. crp/provenance/hallucination_scorer.py +320 -0
  96. crp/provenance/omission_analyzer.py +106 -0
  97. crp/provenance/provenance_chain.py +205 -0
  98. crp/provenance/report_generator.py +440 -0
  99. crp/providers/__init__.py +43 -0
  100. crp/providers/anthropic.py +270 -0
  101. crp/providers/base.py +135 -0
  102. crp/providers/custom.py +63 -0
  103. crp/providers/diagnostic.py +251 -0
  104. crp/providers/llamacpp.py +224 -0
  105. crp/providers/manager.py +139 -0
  106. crp/providers/ollama.py +243 -0
  107. crp/providers/openai.py +628 -0
  108. crp/providers/tokenizers.py +48 -0
  109. crp/py.typed +0 -0
  110. crp/resources/__init__.py +53 -0
  111. crp/resources/adaptive_allocator.py +525 -0
  112. crp/resources/cost_model.py +388 -0
  113. crp/resources/overhead_manager.py +217 -0
  114. crp/resources/resource_manager.py +262 -0
  115. crp/schemas/__init__.py +20 -0
  116. crp/schemas/cost-estimate.json +33 -0
  117. crp/schemas/crp-error.json +43 -0
  118. crp/schemas/envelope-preview.json +40 -0
  119. crp/schemas/persisted-state-header.json +27 -0
  120. crp/schemas/quality-report.json +94 -0
  121. crp/schemas/session-handle.json +33 -0
  122. crp/schemas/session-status.json +57 -0
  123. crp/schemas/stream-event.json +18 -0
  124. crp/schemas/task-intent.json +42 -0
  125. crp/security/__init__.py +93 -0
  126. crp/security/audit_trail.py +392 -0
  127. crp/security/binding.py +192 -0
  128. crp/security/compliance.py +813 -0
  129. crp/security/consent.py +593 -0
  130. crp/security/embedding_defense.py +161 -0
  131. crp/security/encryption.py +202 -0
  132. crp/security/injection.py +335 -0
  133. crp/security/integrity.py +267 -0
  134. crp/security/privacy.py +662 -0
  135. crp/security/quarantine.py +249 -0
  136. crp/security/rbac.py +221 -0
  137. crp/security/validation.py +164 -0
  138. crp/state/__init__.py +31 -0
  139. crp/state/cold_storage.py +258 -0
  140. crp/state/compaction.py +263 -0
  141. crp/state/critical_state.py +104 -0
  142. crp/state/event_log.py +313 -0
  143. crp/state/fact.py +189 -0
  144. crp/state/serialization.py +189 -0
  145. crp/state/session_cleanup.py +77 -0
  146. crp/state/snapshot.py +290 -0
  147. crp/state/warm_store.py +346 -0
  148. crprotocol-2.0.0.dist-info/METADATA +1295 -0
  149. crprotocol-2.0.0.dist-info/RECORD +153 -0
  150. crprotocol-2.0.0.dist-info/WHEEL +4 -0
  151. crprotocol-2.0.0.dist-info/entry_points.txt +2 -0
  152. crprotocol-2.0.0.dist-info/licenses/LICENSE.md +170 -0
  153. crprotocol-2.0.0.dist-info/licenses/NOTICE +18 -0
@@ -0,0 +1,265 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """LLM context curation — progressive understanding synthesis (§18).
4
+
5
+ Periodically dispatches curation windows to build an evolving synthesis
6
+ of findings, relationships, and gaps. Injected into envelopes as
7
+ Section 1.5 between CRITICAL STATE and DISCOVERIES.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import time
13
+ import uuid
14
+ from collections.abc import Callable
15
+ from dataclasses import dataclass, field
16
+ from typing import Any
17
+
18
+ # ---------------------------------------------------------------------------
19
+ # Constants
20
+ # ---------------------------------------------------------------------------
21
+
22
# Per-tier curation settings, keyed by quality tier.
# Each value is an (interval, max_tokens) pair.
TIER_CONFIG: dict[str, tuple[int, int]] = dict(
    A=(5, 500),
    B=(5, 800),
    C=(10, 1000),
    D=(20, 1500),
)
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Data types
33
+ # ---------------------------------------------------------------------------
34
+
35
+
36
@dataclass
class LLMSynthesis:
    """A single curated synthesis produced from an LLM review of facts.

    Instances are plain records: ``to_dict`` and ``from_dict`` convert to
    and from a flat dictionary keyed by field name.
    """

    # Unique identifier; a fresh UUID4 string unless one is supplied.
    synthesis_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    # Raw text of the synthesis.
    text: str = ""
    # Window index at which this synthesis was produced.
    window_index: int = 0
    # ``synthesis_id`` of the synthesis this one replaces, if any.
    supersedes: str | None = None
    # Revision counter; starts at 1.
    evolution_count: int = 1
    # Parsed sections; ``None`` when nothing was recorded for a section.
    critical_findings: list[str] | None = None
    key_relationships: list[str] | None = None
    gaps: list[str] | None = None
    confidence: float = 1.0
    # Creation time as returned by ``time.time()``.
    created_at: float = field(default_factory=time.time)

    def to_dict(self) -> dict[str, Any]:
        """Return all fields as a dict, in declaration order.

        List-valued fields are returned by reference, not copied.
        """
        exported = (
            "synthesis_id",
            "text",
            "window_index",
            "supersedes",
            "evolution_count",
            "critical_findings",
            "key_relationships",
            "gaps",
            "confidence",
            "created_at",
        )
        return {name: getattr(self, name) for name in exported}

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> LLMSynthesis:
        """Build a synthesis from ``data``, filling in missing keys.

        A missing ``synthesis_id`` gets a new UUID4 string. A missing
        ``created_at`` becomes ``0.0`` (not the current time).
        """
        fallbacks: dict[str, Any] = {
            "synthesis_id": str(uuid.uuid4()),
            "text": "",
            "window_index": 0,
            "supersedes": None,
            "evolution_count": 1,
            "critical_findings": None,
            "key_relationships": None,
            "gaps": None,
            "confidence": 1.0,
            "created_at": 0.0,
        }
        kwargs = {name: data.get(name, default) for name, default in fallbacks.items()}
        return cls(**kwargs)
79
+
80
+
81
@dataclass
class CurationConfig:
    """Settings that control LLM curation."""

    # Master switch for curation.
    enabled: bool = True
    # Number of windows between curation runs.
    curation_interval: int = 5
    # Token budget for a synthesis.
    max_synthesis_tokens: int = 1500
    # Whether later curations build on the previous synthesis.
    progressive: bool = True
    # Quality tier label, e.g. "A" through "D".
    quality_tier: str = "B"
90
+
91
+
92
+ # ---------------------------------------------------------------------------
93
+ # LLMContextCurator
94
+ # ---------------------------------------------------------------------------
95
+
96
+
97
class LLMContextCurator:
    """LLM-driven context curation with progressive understanding.

    Keeps the most recent synthesis plus the history of every synthesis
    produced so far. The first curation builds a synthesis from scratch;
    later ones (when ``config.progressive`` is set) ask the LLM to revise
    the previous synthesis.

    Args:
        dispatch_fn: Callable invoked as ``dispatch_fn(prompt, context)``.
            Only the first element of the returned tuple (the output
            text) is used. When ``None``, :meth:`curate` returns ``None``.
        config: Curation settings; a default ``CurationConfig()`` is used
            when omitted.
    """

    # Maximum number of facts included in a single curation request.
    _MAX_FACTS = 40

    def __init__(
        self,
        dispatch_fn: Callable[[str, str], tuple[str, Any]] | None = None,
        config: CurationConfig | None = None,
    ) -> None:
        self._dispatch_fn = dispatch_fn
        self.config = config or CurationConfig()
        self._current_synthesis: LLMSynthesis | None = None
        self._synthesis_history: list[LLMSynthesis] = []

    @property
    def current_synthesis(self) -> LLMSynthesis | None:
        """The latest synthesis, or ``None`` before the first curation."""
        return self._current_synthesis

    @property
    def evolution_count(self) -> int:
        """Evolution count of the latest synthesis (0 when there is none)."""
        if self._current_synthesis:
            return self._current_synthesis.evolution_count
        return 0

    def should_curate(self, window_index: int) -> bool:
        """Check if curation should run at this window.

        The interval comes from ``TIER_CONFIG`` for the configured quality
        tier; ``config.curation_interval`` is used only when the tier is
        not listed there. Curation is due on every positive window index
        that is a multiple of the interval.

        Returns:
            ``False`` when curation is disabled, when the interval is not
            positive, or when ``window_index`` is not a positive multiple
            of the interval; ``True`` otherwise.
        """
        if not self.config.enabled:
            return False
        interval, _ = TIER_CONFIG.get(
            self.config.quality_tier,
            (self.config.curation_interval, self.config.max_synthesis_tokens),
        )
        if interval <= 0:
            # A non-positive interval cannot describe a schedule, and a
            # zero interval would raise ZeroDivisionError in the modulo.
            return False
        return window_index > 0 and window_index % interval == 0

    def curate(
        self,
        window_index: int,
        top_facts: list[str],
        recent_output_summary: str = "",
    ) -> LLMSynthesis | None:
        """Run curation (initial or progressive).

        Args:
            window_index: Window at which the curation is taking place.
            top_facts: Fact strings; only the first 40 are sent.
            recent_output_summary: Optional summary of recent output,
                truncated before being placed in the prompt.

        Returns:
            The new synthesis, or ``None`` if no dispatch function is set.
        """
        if not self._dispatch_fn:
            return None

        if self.config.progressive and self._current_synthesis:
            return self._progressive_curation(
                window_index, top_facts, recent_output_summary,
            )
        return self._initial_curation(window_index, top_facts, recent_output_summary)

    @classmethod
    def _format_facts(cls, top_facts: list[str]) -> str:
        """Render the leading facts as a dash-bulleted list."""
        return "\n".join(f"- {fact}" for fact in top_facts[:cls._MAX_FACTS])

    def _record(self, synthesis: LLMSynthesis) -> LLMSynthesis:
        """Make ``synthesis`` current and append it to the history."""
        self._current_synthesis = synthesis
        self._synthesis_history.append(synthesis)
        return synthesis

    def _initial_curation(
        self,
        window_index: int,
        top_facts: list[str],
        recent_output_summary: str,
    ) -> LLMSynthesis:
        """First curation — no prior synthesis to build on.

        The facts are passed as the second argument of the dispatch call
        rather than being embedded in the prompt.
        """
        facts_text = self._format_facts(top_facts)
        prompt = (
            "Analyze the extracted facts and provide:\n"
            "1. 5 most critical findings\n"
            "2. 3 key relationships between findings\n"
            "3. Current assessment\n"
            "4. What's missing / gaps\n\n"
            f"Recent output:\n{recent_output_summary[:1000]}\n\n"
            "Be concise."
        )

        output, _ = self._dispatch_fn(prompt, facts_text)  # type: ignore[misc]
        return self._record(self._parse_synthesis(output, window_index))

    def _progressive_curation(
        self,
        window_index: int,
        top_facts: list[str],
        recent_output_summary: str,
    ) -> LLMSynthesis:
        """Progressive curation — revise the previous synthesis.

        The previous synthesis text and the new facts are embedded in the
        prompt; the dispatch call receives an empty second argument. The
        result links back to the previous synthesis via ``supersedes`` and
        increments its evolution count.
        """
        prev = self._current_synthesis
        facts_text = self._format_facts(top_facts)
        prompt = (
            "Revise your previous synthesis based on new facts.\n"
            f"Previous synthesis:\n{prev.text[:1500] if prev else ''}\n\n"
            "Update:\n"
            "1. Revised critical findings\n"
            "2. Updated relationships\n"
            "3. Updated assessment\n"
            "4. New gaps identified\n\n"
            f"New facts since last synthesis:\n{facts_text}\n\n"
            f"Recent output:\n{recent_output_summary[:500]}\n\n"
            "Be concise."
        )

        output, _ = self._dispatch_fn(prompt, "")  # type: ignore[misc]
        synthesis = self._parse_synthesis(output, window_index)
        synthesis.supersedes = prev.synthesis_id if prev else None
        synthesis.evolution_count = (prev.evolution_count + 1) if prev else 1
        return self._record(synthesis)

    def _parse_synthesis(self, output: str, window_index: int) -> LLMSynthesis:
        """Parse curation output into a structured synthesis.

        Lines are scanned in order. A line containing a section keyword
        selects the current section, and every line starting with ``-`` or
        ``•`` is filed under the section in effect. Bullets under
        "assessment", or before any section is seen, are not collected.
        The full output is always kept as ``text``.
        """
        findings: list[str] = []
        relationships: list[str] = []
        gaps: list[str] = []
        buckets = {
            "findings": findings,
            "relationships": relationships,
            "gaps": gaps,
        }

        section = ""
        for raw_line in output.split("\n"):
            lowered = raw_line.lower().strip()
            # NOTE(review): keywords are matched on bullet lines too, so a
            # bullet whose text mentions e.g. "missing" switches the section
            # before it is filed — confirm this is intended.
            if "finding" in lowered or "critical" in lowered:
                section = "findings"
            elif "relationship" in lowered:
                section = "relationships"
            elif "gap" in lowered or "missing" in lowered:
                section = "gaps"
            elif "assessment" in lowered:
                section = "assessment"

            stripped = raw_line.strip()
            if stripped.startswith(("-", "•")):
                bucket = buckets.get(section)
                if bucket is not None:
                    bucket.append(stripped.lstrip("-•").strip())

        return LLMSynthesis(
            text=output,
            window_index=window_index,
            critical_findings=findings or None,
            key_relationships=relationships or None,
            gaps=gaps or None,
        )

    def format_for_envelope(self) -> str:
        """Format current synthesis for envelope injection (Section 1.5).

        Returns an empty string when there is no synthesis. Otherwise it
        returns a header line followed by one line per non-empty section,
        with items joined by ``"; "``.
        """
        if not self._current_synthesis:
            return ""
        s = self._current_synthesis
        parts = [
            f"[LLM_SYNTHESIS (Window {s.window_index}, evolution {s.evolution_count})]",
        ]
        if s.critical_findings:
            parts.append("CRITICAL FINDINGS: " + "; ".join(s.critical_findings))
        if s.key_relationships:
            parts.append("KEY RELATIONSHIPS: " + "; ".join(s.key_relationships))
        if s.gaps:
            parts.append("GAPS: " + "; ".join(s.gaps))
        return "\n".join(parts)

    def to_dict(self) -> dict[str, Any]:
        """Serialize the current synthesis, the history, and the config."""
        return {
            "current": self._current_synthesis.to_dict() if self._current_synthesis else None,
            "history": [s.to_dict() for s in self._synthesis_history],
            "config": {
                "enabled": self.config.enabled,
                "curation_interval": self.config.curation_interval,
                "max_synthesis_tokens": self.config.max_synthesis_tokens,
                "progressive": self.config.progressive,
                "quality_tier": self.config.quality_tier,
            },
        }
@@ -0,0 +1,146 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Human-in-the-loop feedback — fact override, confidence adjustment (§18, MAY)."""
4
+
5
+ from __future__ import annotations
6
+
7
+ import time
8
+ from dataclasses import dataclass, field
9
+ from typing import Any
10
+
11
+
12
@dataclass
class FeedbackEntry:
    """Record of one human feedback action on a fact."""

    # Identifier of this feedback record.
    feedback_id: str = ""
    # Identifier of the fact the feedback targets.
    fact_id: str = ""
    # One of "override", "boost", "penalize" or "reject".
    action: str = ""
    original_text: str = ""
    # Replacement text for an override; ``None`` otherwise.
    corrected_text: str | None = None
    # Confidence change carried by this action.
    confidence_delta: float = 0.0
    # Free-text justification.
    reason: str = ""
    # Creation time as returned by ``time.time()``.
    timestamp: float = field(default_factory=time.time)
    applied: bool = False
25
+
26
+
27
class FeedbackLoop:
    """Collects human corrections and confidence adjustments for facts.

    Every action is appended to an in-memory log. Confidence actions also
    update a per-fact cumulative delta that ``get_adjusted_confidence``
    applies on top of a caller-supplied base confidence.
    """

    def __init__(self) -> None:
        self._entries: list[FeedbackEntry] = []
        # Maps fact_id to its cumulative confidence delta.
        self._fact_adjustments: dict[str, float] = {}

    @property
    def entry_count(self) -> int:
        """Number of feedback entries logged so far."""
        return len(self._entries)

    def _log(
        self,
        fact_id: str,
        action: str,
        reason: str,
        corrected_text: str | None = None,
        confidence_delta: float = 0.0,
    ) -> FeedbackEntry:
        """Create an applied entry with the next ``fb-N`` id and log it."""
        logged = FeedbackEntry(
            feedback_id=f"fb-{len(self._entries)}",
            fact_id=fact_id,
            action=action,
            corrected_text=corrected_text,
            confidence_delta=confidence_delta,
            reason=reason,
            applied=True,
        )
        self._entries.append(logged)
        return logged

    def _shift(self, fact_id: str, delta: float) -> None:
        """Add ``delta`` to the cumulative adjustment for ``fact_id``."""
        running = self._fact_adjustments.get(fact_id, 0.0)
        self._fact_adjustments[fact_id] = running + delta

    def override_fact(
        self,
        fact_id: str,
        corrected_text: str,
        reason: str = "",
    ) -> FeedbackEntry:
        """Log a human-provided replacement text for a fact.

        Only the log entry is created; no confidence adjustment is made.
        """
        return self._log(fact_id, "override", reason, corrected_text=corrected_text)

    def boost_confidence(
        self,
        fact_id: str,
        delta: float = 0.1,
        reason: str = "",
    ) -> FeedbackEntry:
        """Add ``delta`` to the fact's cumulative adjustment and log it."""
        self._shift(fact_id, delta)
        return self._log(fact_id, "boost", reason, confidence_delta=delta)

    def penalize_confidence(
        self,
        fact_id: str,
        delta: float = -0.2,
        reason: str = "",
    ) -> FeedbackEntry:
        """Add the (by default negative) ``delta`` to the fact's adjustment."""
        self._shift(fact_id, delta)
        return self._log(fact_id, "penalize", reason, confidence_delta=delta)

    def reject_fact(
        self,
        fact_id: str,
        reason: str = "",
    ) -> FeedbackEntry:
        """Mark a fact as rejected by setting its adjustment to -1.0.

        Any earlier cumulative adjustment is overwritten.
        """
        # NOTE(review): later boosts add to this -1.0, which lifts the fact
        # out of the "rejected" range — confirm whether that is intended.
        self._fact_adjustments[fact_id] = -1.0
        return self._log(fact_id, "reject", reason, confidence_delta=-1.0)

    def get_adjusted_confidence(
        self,
        fact_id: str,
        base_confidence: float,
    ) -> float:
        """Return ``base_confidence`` with the fact's adjustment applied.

        A cumulative adjustment of -1.0 or lower yields 0.0. Otherwise the
        sum is clamped to the range [0.0, 1.0].
        """
        adjustment = self._fact_adjustments.get(fact_id, 0.0)
        if adjustment <= -1.0:
            return 0.0
        capped = min(1.0, base_confidence + adjustment)
        return max(0.0, capped)

    def get_entries_for_fact(self, fact_id: str) -> list[FeedbackEntry]:
        """Return the logged entries for ``fact_id``, in log order."""
        return [entry for entry in self._entries if entry.fact_id == fact_id]

    def to_dict(self) -> dict[str, Any]:
        """Serialize the entry log and a copy of the adjustment table.

        ``original_text`` is not included in the serialized entries.
        """
        exported = (
            "feedback_id",
            "fact_id",
            "action",
            "corrected_text",
            "confidence_delta",
            "reason",
            "timestamp",
            "applied",
        )
        return {
            "entries": [
                {name: getattr(entry, name) for name in exported}
                for entry in self._entries
            ],
            "adjustments": dict(self._fact_adjustments),
        }
@@ -0,0 +1,211 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Hierarchical processing — map-reduce-validate for Tier C/D inputs (§4.5, §11).
4
+
5
+ Splits massive inputs into segments, processes each independently,
6
+ reduces iteratively, and validates cross-window consistency.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import math
12
+ from collections.abc import Callable
13
+ from dataclasses import dataclass
14
+ from typing import Any
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Constants
18
+ # ---------------------------------------------------------------------------
19
+
20
# Default segment size as a multiple of the context window:
# segment_size = 100 × context_window.
DEFAULT_SEGMENT_SIZE_MULTIPLIER = 100
# Default fan-in for the reduce step.
DEFAULT_FAN_IN = 50
22
+
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Data types
26
+ # ---------------------------------------------------------------------------
27
+
28
+
29
@dataclass
class HierarchicalPlan:
    """Describes how an oversized input will be processed hierarchically."""

    # Token count of the whole input.
    total_tokens: int = 0
    # Number of segments the input is split into.
    segment_count: int = 0
    # Size of one segment, in tokens.
    segment_size: int = 0
    # Fan-in used for the reduce step.
    fan_in: int = DEFAULT_FAN_IN
    # Number of hierarchy levels.
    hierarchy_levels: int = 1
    # Estimated degradation accumulated across the hierarchy levels.
    estimated_degradation: float = 0.0
    # Name of the processing mode, e.g. "hierarchical".
    processing_mode: str = "hierarchical"
40
+
41
+
42
@dataclass
class HierarchicalConfig:
    """Optional overrides for hierarchical processing."""

    # Segment size override; ``None`` leaves the choice to the consumer.
    segment_size: int | None = None
    # Fan-in override; ``None`` leaves the choice to the consumer.
    fan_in: int | None = None
    # Context window size, in tokens.
    context_window: int = 128_000
49
+
50
+
51
@dataclass
class SegmentResult:
    """Result of processing a single input segment."""

    # Zero-based position of the segment within the input.
    segment_index: int = 0
    # Text produced for the segment.
    synthesis: str = ""
    # Number of facts extracted from the segment.
    facts_extracted: int = 0
    # Token count associated with the result.
    token_count: int = 0
59
+
60
+
61
+ # ---------------------------------------------------------------------------
62
+ # Degradation model
63
+ # ---------------------------------------------------------------------------
64
+
65
+
66
def chain_degradation(levels: int, per_level: float = 0.03) -> float:
    """Return the cumulative degradation after ``levels`` hierarchy levels.

    Each level keeps a fraction ``1 - per_level`` of the quality, so the
    total loss is ``d_chain(L) = 1 - (1 - per_level)^L``.
    """
    kept_per_level = 1.0 - per_level
    kept_overall = kept_per_level ** levels
    return 1.0 - kept_overall
72
+
73
+
74
def effective_context(
    context_window: int, levels: int, per_level: float = 0.03,
) -> float:
    """Return the context capacity left after hierarchical degradation.

    The window is scaled by the fraction that survives ``levels`` levels:
    ``EffCtx_hier = C × (1 - d_chain(levels))``.
    """
    surviving_fraction = 1.0 - chain_degradation(levels, per_level)
    return context_window * surviving_fraction
83
+
84
+
85
+ # ---------------------------------------------------------------------------
86
+ # HierarchicalProcessor
87
+ # ---------------------------------------------------------------------------
88
+
89
+
90
class HierarchicalProcessor:
    """Map-reduce pattern for oversized inputs.

    Args:
        dispatch_fn: Callable invoked as ``dispatch_fn(prompt, context)``.
            Only the first element of the returned tuple (the output
            text) is used. When ``None``, truncation fallbacks are used.
        count_tokens: Token counter; defaults to ``len(text) // 4``.
        context_window: Context window size, in tokens, used for planning.
    """

    def __init__(
        self,
        dispatch_fn: Callable[[str, str], tuple[str, Any]] | None = None,
        count_tokens: Callable[[str], int] | None = None,
        context_window: int = 128_000,
    ) -> None:
        self._dispatch_fn = dispatch_fn
        self._count_tokens = count_tokens or (lambda t: len(t) // 4)
        self._context_window = context_window

    @staticmethod
    def _hierarchy_levels(segment_count: int, fan_in: int) -> int:
        """Return ``max(1, ⌈log_k(N)⌉)`` using exact integer arithmetic.

        ``k`` and ``N`` are each clamped to at least 2. Repeated
        multiplication is used because ``ceil(log(N) / log(k))`` overshoots
        on exact powers: ``log(125) / log(5)`` evaluates to slightly more
        than 3.
        """
        base = max(fan_in, 2)
        needed = max(segment_count, 2)
        levels = 1
        capacity = base
        while capacity < needed:
            capacity *= base
            levels += 1
        return levels

    def plan(
        self, total_tokens: int, config: HierarchicalConfig | None = None,
    ) -> HierarchicalPlan:
        """Create a hierarchical processing plan.

        The segment size defaults to ``DEFAULT_SEGMENT_SIZE_MULTIPLIER``
        times this processor's context window, and the fan-in defaults to
        ``DEFAULT_FAN_IN``. A falsy override (``None`` or 0) selects the
        default. The mode becomes ``"hierarchical_multi_level"`` when the
        input exceeds 1000 context windows.

        Args:
            total_tokens: Token count of the whole input.
            config: Optional segment-size and fan-in overrides.

        Returns:
            The plan, including segment count, hierarchy levels and the
            estimated degradation for that many levels.
        """
        cfg = config or HierarchicalConfig()
        seg_size = cfg.segment_size or (
            DEFAULT_SEGMENT_SIZE_MULTIPLIER * self._context_window
        )
        fan_in = cfg.fan_in or DEFAULT_FAN_IN

        segment_count = max(1, math.ceil(total_tokens / seg_size))
        levels = self._hierarchy_levels(segment_count, fan_in)
        degradation = chain_degradation(levels)

        mode = "hierarchical"
        if total_tokens > 1000 * self._context_window:
            mode = "hierarchical_multi_level"

        return HierarchicalPlan(
            total_tokens=total_tokens,
            segment_count=segment_count,
            segment_size=seg_size,
            fan_in=fan_in,
            hierarchy_levels=levels,
            estimated_degradation=degradation,
            processing_mode=mode,
        )

    def map_phase(
        self,
        segments: list[str],
        task_intent: str,
    ) -> list[SegmentResult]:
        """MAP: Process each segment independently.

        With a dispatch function, each segment is sent with a summarize
        and extract prompt. Without one, the first 500 characters of the
        segment stand in for the summary. ``facts_extracted`` is always 0.

        Returns:
            One ``SegmentResult`` per segment, in input order.
        """
        results: list[SegmentResult] = []
        total = len(segments)
        for i, segment in enumerate(segments):
            if self._dispatch_fn:
                prompt = (
                    "Summarize and extract ALL key facts from the following "
                    f"segment ({i + 1}/{total}) for: {task_intent}"
                )
                output, _ = self._dispatch_fn(prompt, segment)
            else:
                # Fallback: take first 500 chars as summary
                output = segment[:500]

            results.append(SegmentResult(
                segment_index=i,
                synthesis=output,
                facts_extracted=0,
                token_count=self._count_tokens(output),
            ))
        return results

    def reduce_phase(
        self,
        syntheses: list[str],
        task_intent: str,
        fan_in: int = DEFAULT_FAN_IN,
    ) -> list[str]:
        """REDUCE: Iteratively merge syntheses until ≤ fan_in remain.

        Each pass splits the current list into batches and merges every
        batch into one synthesis. Without a dispatch function, a merge is
        the first 1000 characters of the joined batch.

        Batches hold at least two items even when ``fan_in`` is smaller,
        since a batch of one can never shrink the list. For ``fan_in``
        below 1 the reduction stops at a single synthesis.

        Returns:
            The reduced list. The input list itself is returned when no
            reduction is needed.
        """
        batch_size = max(fan_in, 2)
        target = max(fan_in, 1)

        current = syntheses
        while len(current) > target:
            next_level: list[str] = []
            for start in range(0, len(current), batch_size):
                batch = current[start:start + batch_size]
                joined = "\n\n---\n\n".join(batch)
                if self._dispatch_fn:
                    prompt = (
                        f"Synthesize these {len(batch)} segment summaries "
                        f"into a coherent overview for: {task_intent}"
                    )
                    output, _ = self._dispatch_fn(prompt, joined)
                else:
                    output = joined[:1000]
                next_level.append(output)
            current = next_level
        return current

    def hierarchical_dispatch(
        self,
        task_intent: str,
        large_input: str,
        config: HierarchicalConfig | None = None,
    ) -> tuple[list[str], HierarchicalPlan]:
        """Run the map and reduce phases over an oversized input.

        The input is cut into ``plan.segment_count`` character slices of
        equal length, with the last slice absorbing any remainder. No
        separate validation step is performed here.

        Returns:
            ``(final_syntheses, plan)``.
        """
        total_tokens = self._count_tokens(large_input)
        plan = self.plan(total_tokens, config)

        # Segment the input
        seg_char_size = len(large_input) // max(plan.segment_count, 1)
        last_index = plan.segment_count - 1
        segments: list[str] = []
        for i in range(plan.segment_count):
            start = i * seg_char_size
            end = start + seg_char_size if i < last_index else len(large_input)
            segments.append(large_input[start:end])

        # MAP
        map_results = self.map_phase(segments, task_intent)

        # REDUCE
        syntheses = [r.synthesis for r in map_results]
        reduced = self.reduce_phase(syntheses, task_intent, plan.fan_in)

        return reduced, plan