crprotocol 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crp/__init__.py +126 -0
- crp/__main__.py +8 -0
- crp/_typing.py +27 -0
- crp/_version.py +5 -0
- crp/adapters.py +31 -0
- crp/advanced/__init__.py +40 -0
- crp/advanced/auto_ingest.py +400 -0
- crp/advanced/cqs.py +235 -0
- crp/advanced/cross_window.py +477 -0
- crp/advanced/curator.py +265 -0
- crp/advanced/feedback.py +146 -0
- crp/advanced/hierarchical.py +211 -0
- crp/advanced/meta_learning.py +401 -0
- crp/advanced/parallel.py +98 -0
- crp/advanced/review_cycle.py +329 -0
- crp/advanced/scale_mode.py +129 -0
- crp/advanced/source_grounding.py +207 -0
- crp/ckf/__init__.py +35 -0
- crp/ckf/community.py +377 -0
- crp/ckf/fabric.py +445 -0
- crp/ckf/gc.py +175 -0
- crp/ckf/graph_walk.py +87 -0
- crp/ckf/merge.py +133 -0
- crp/ckf/pattern_query.py +122 -0
- crp/ckf/pubsub.py +128 -0
- crp/ckf/semantic.py +207 -0
- crp/cli/__init__.py +7 -0
- crp/cli/main.py +329 -0
- crp/cli/sidecar.py +929 -0
- crp/cli/startup.py +272 -0
- crp/continuation/__init__.py +103 -0
- crp/continuation/completion.py +348 -0
- crp/continuation/degradation.py +157 -0
- crp/continuation/document_map.py +160 -0
- crp/continuation/flow.py +109 -0
- crp/continuation/gap.py +419 -0
- crp/continuation/manager.py +484 -0
- crp/continuation/quality_monitor.py +179 -0
- crp/continuation/stitch.py +419 -0
- crp/continuation/trigger.py +142 -0
- crp/continuation/voice.py +157 -0
- crp/core/__init__.py +69 -0
- crp/core/batch.py +77 -0
- crp/core/circuit_breaker.py +116 -0
- crp/core/config.py +377 -0
- crp/core/context_tools.py +540 -0
- crp/core/dispatch_router.py +3977 -0
- crp/core/errors.py +128 -0
- crp/core/extraction_facade.py +384 -0
- crp/core/facilitator.py +713 -0
- crp/core/idempotency.py +215 -0
- crp/core/orchestrator.py +1435 -0
- crp/core/relay_strategies.py +613 -0
- crp/core/security_manager.py +140 -0
- crp/core/session.py +134 -0
- crp/core/task_intent.py +36 -0
- crp/core/window.py +363 -0
- crp/envelope/__init__.py +30 -0
- crp/envelope/builder.py +288 -0
- crp/envelope/decomposer.py +236 -0
- crp/envelope/formatter.py +168 -0
- crp/envelope/packer.py +211 -0
- crp/envelope/reranker.py +209 -0
- crp/envelope/scoring.py +310 -0
- crp/extraction/__init__.py +45 -0
- crp/extraction/complexity.py +96 -0
- crp/extraction/contradiction.py +132 -0
- crp/extraction/pipeline.py +360 -0
- crp/extraction/quality_gate.py +237 -0
- crp/extraction/stage1_regex.py +173 -0
- crp/extraction/stage2_statistical.py +244 -0
- crp/extraction/stage3_gliner.py +210 -0
- crp/extraction/stage4_uie.py +183 -0
- crp/extraction/stage5_discourse.py +175 -0
- crp/extraction/stage6_llm.py +178 -0
- crp/extraction/structured_output.py +219 -0
- crp/extraction/types.py +299 -0
- crp/license_guard.py +722 -0
- crp/observability/__init__.py +30 -0
- crp/observability/audit.py +118 -0
- crp/observability/events.py +233 -0
- crp/observability/metrics.py +264 -0
- crp/observability/quality.py +135 -0
- crp/observability/structured_logging.py +81 -0
- crp/observability/telemetry.py +117 -0
- crp/provenance/__init__.py +314 -0
- crp/provenance/_embeddings.py +97 -0
- crp/provenance/_types.py +378 -0
- crp/provenance/attribution_scorer.py +252 -0
- crp/provenance/claim_detector.py +229 -0
- crp/provenance/contradiction_detector.py +243 -0
- crp/provenance/distortion_detector.py +397 -0
- crp/provenance/entailment_verifier.py +358 -0
- crp/provenance/fabrication_detector.py +203 -0
- crp/provenance/hallucination_scorer.py +320 -0
- crp/provenance/omission_analyzer.py +106 -0
- crp/provenance/provenance_chain.py +205 -0
- crp/provenance/report_generator.py +440 -0
- crp/providers/__init__.py +43 -0
- crp/providers/anthropic.py +270 -0
- crp/providers/base.py +135 -0
- crp/providers/custom.py +63 -0
- crp/providers/diagnostic.py +251 -0
- crp/providers/llamacpp.py +224 -0
- crp/providers/manager.py +139 -0
- crp/providers/ollama.py +243 -0
- crp/providers/openai.py +628 -0
- crp/providers/tokenizers.py +48 -0
- crp/py.typed +0 -0
- crp/resources/__init__.py +53 -0
- crp/resources/adaptive_allocator.py +525 -0
- crp/resources/cost_model.py +388 -0
- crp/resources/overhead_manager.py +217 -0
- crp/resources/resource_manager.py +262 -0
- crp/schemas/__init__.py +20 -0
- crp/schemas/cost-estimate.json +33 -0
- crp/schemas/crp-error.json +43 -0
- crp/schemas/envelope-preview.json +40 -0
- crp/schemas/persisted-state-header.json +27 -0
- crp/schemas/quality-report.json +94 -0
- crp/schemas/session-handle.json +33 -0
- crp/schemas/session-status.json +57 -0
- crp/schemas/stream-event.json +18 -0
- crp/schemas/task-intent.json +42 -0
- crp/security/__init__.py +93 -0
- crp/security/audit_trail.py +392 -0
- crp/security/binding.py +192 -0
- crp/security/compliance.py +813 -0
- crp/security/consent.py +593 -0
- crp/security/embedding_defense.py +161 -0
- crp/security/encryption.py +202 -0
- crp/security/injection.py +335 -0
- crp/security/integrity.py +267 -0
- crp/security/privacy.py +662 -0
- crp/security/quarantine.py +249 -0
- crp/security/rbac.py +221 -0
- crp/security/validation.py +164 -0
- crp/state/__init__.py +31 -0
- crp/state/cold_storage.py +258 -0
- crp/state/compaction.py +263 -0
- crp/state/critical_state.py +104 -0
- crp/state/event_log.py +313 -0
- crp/state/fact.py +189 -0
- crp/state/serialization.py +189 -0
- crp/state/session_cleanup.py +77 -0
- crp/state/snapshot.py +290 -0
- crp/state/warm_store.py +346 -0
- crprotocol-2.0.0.dist-info/METADATA +1295 -0
- crprotocol-2.0.0.dist-info/RECORD +153 -0
- crprotocol-2.0.0.dist-info/WHEEL +4 -0
- crprotocol-2.0.0.dist-info/entry_points.txt +2 -0
- crprotocol-2.0.0.dist-info/licenses/LICENSE.md +170 -0
- crprotocol-2.0.0.dist-info/licenses/NOTICE +18 -0
|
crp/providers/diagnostic.py
ADDED
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""ProviderDiagnostic — 6-step health check (§05, §06 §26.3).
|
|
4
|
+
|
|
5
|
+
Executed at init() and on demand via diagnose(). Each step produces
|
|
6
|
+
a diagnostic code string indicating pass/fail/warning.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import time
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from enum import Enum
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from crp.providers.base import LLMProvider
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DiagnosticCode(str, Enum):
    """Result codes produced by the provider diagnostic steps.

    Members are grouped by the step that emits them.  The enum mixes in
    ``str``, so each member compares equal to its plain string value.
    """

    # -- step 1: endpoint ------------------------------------------------
    ENDPOINT_REACHABLE = "endpoint_reachable"
    ENDPOINT_UNREACHABLE = "endpoint_unreachable"
    ENDPOINT_TIMEOUT = "endpoint_timeout"

    # -- step 2: auth ----------------------------------------------------
    AUTH_VALID = "auth_valid"
    AUTH_INVALID = "auth_invalid"
    AUTH_EXPIRED = "auth_expired"

    # -- step 3: capabilities --------------------------------------------
    CAPABILITIES_DETERMINED = "capabilities_determined"
    CAPABILITIES_UNAVAILABLE = "capabilities_unavailable"

    # -- step 4: tokenizer -----------------------------------------------
    TOKENIZER_ACCURATE = "tokenizer_accurate"
    TOKENIZER_MISMATCH = "tokenizer_mismatch"
    TOKENIZER_UNAVAILABLE = "tokenizer_unavailable"

    # -- step 5: inference -----------------------------------------------
    INFERENCE_WORKING = "inference_working"
    INFERENCE_SLOW = "inference_slow"
    RATE_LIMITED = "rate_limited"

    # -- step 6: context window ------------------------------------------
    CONTEXT_WINDOW_VALID = "context_window_valid"
    CONTEXT_WINDOW_MISMATCH = "context_window_mismatch"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class DiagnosticResult:
    """Outcome of one diagnostic step."""

    # 1-based position of the step in the diagnostic sequence.
    step: int
    # Short step label, e.g. "endpoint" or "tokenizer".
    name: str
    # Result code the step settled on.
    code: DiagnosticCode
    # Time the step took, in milliseconds.
    latency_ms: float = 0.0
    # Free-form step data (measurements, or an "error" message on failure).
    details: dict[str, Any] = field(default_factory=dict)

    @property
    def passed(self) -> bool:
        """Return True when ``code`` is one of the module's passing codes."""
        is_passing = self.code in _PASSING_CODES
        return is_passing
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# The single success code of each of the six steps.  Membership in this set is
# what ``DiagnosticResult.passed`` tests, so every other code counts as not passed.
_PASSING_CODES = frozenset(
    (
        DiagnosticCode.ENDPOINT_REACHABLE,
        DiagnosticCode.AUTH_VALID,
        DiagnosticCode.CAPABILITIES_DETERMINED,
        DiagnosticCode.TOKENIZER_ACCURATE,
        DiagnosticCode.INFERENCE_WORKING,
        DiagnosticCode.CONTEXT_WINDOW_VALID,
    )
)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@dataclass
class DiagnosticReport:
    """Aggregated results of a full diagnostic run."""

    # Name of the provider the report describes.
    provider_name: str
    # One entry per executed step, in execution order.
    results: list[DiagnosticResult] = field(default_factory=list)
    # Wall-clock duration of the whole run, in milliseconds.
    total_latency_ms: float = 0.0

    @property
    def all_passed(self) -> bool:
        """Return True when no recorded step failed (True for an empty report)."""
        for result in self.results:
            if not result.passed:
                return False
        return True

    @property
    def summary(self) -> dict[str, str]:
        """Map each step name to the string value of its result code."""
        return dict((result.name, result.code.value) for result in self.results)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class ProviderDiagnostic:
    """Execute the 6-step diagnostic sequence on an LLM provider.

    The steps run in a fixed order: endpoint, auth, capabilities, tokenizer,
    inference, context window.  Each step turns the exceptions it encounters
    into a ``DiagnosticResult`` instead of raising, so a failing step does not
    stop the following steps from running.
    """

    # Sample texts for tokenizer validation (step 4), in increasing length.
    _SAMPLE_SHORT = "Hello world"
    _SAMPLE_MEDIUM = "The quick brown fox jumps over the lazy dog. " * 10
    _SAMPLE_LONG = "CRP context relay protocol " * 200

    # Step 5: a generation that takes longer than this is reported as slow.
    _SLOW_INFERENCE_MS = 30_000

    # Step 5: lower-case fragments that identify a rate-limit error message.
    # A bare "rate" is deliberately not used: it also matches words such as
    # "generate" or "accurate" and would mislabel unrelated failures.
    _RATE_LIMIT_MARKERS = (
        "rate limit",
        "rate-limit",
        "rate_limit",
        "ratelimit",
        "rate exceeded",
        "too many requests",
        "429",
    )

    # Step 6: filler used to build a probe text of a predictable token count.
    _FILL_WORD = "word "
    # Step 6: fraction of the context window the probe text should occupy.
    _FILL_FRACTION = 0.8
    # Step 6: common rule-of-thumb text density used to size the probe text.
    _CHARS_PER_TOKEN = 4

    def diagnose(self, provider: LLMProvider) -> DiagnosticReport:
        """Run all 6 diagnostic steps and return a report.

        Args:
            provider: The provider to probe.

        Returns:
            A report holding one result per step, in execution order, plus
            the total wall-clock time of the run in milliseconds.
        """
        report = DiagnosticReport(provider_name=provider.model_name)
        start = time.monotonic()

        steps = (
            self._step1_endpoint,
            self._step2_auth,
            self._step3_capabilities,
            self._step4_tokenizer,
            self._step5_inference,
            self._step6_context_window,
        )
        for run_step in steps:
            report.results.append(run_step(provider))

        report.total_latency_ms = self._elapsed_ms(start)
        return report

    # ------------------------------------------------------------------
    # Individual diagnostic steps
    # ------------------------------------------------------------------

    @staticmethod
    def _elapsed_ms(t0: float) -> float:
        """Return the milliseconds elapsed since the monotonic timestamp *t0*."""
        return (time.monotonic() - t0) * 1000

    def _step1_endpoint(self, provider: LLMProvider) -> DiagnosticResult:
        """Step 1: endpoint availability — verify provider is reachable.

        The probe is a call to ``provider.context_window_size()``; a
        ``TimeoutError`` maps to ENDPOINT_TIMEOUT and any other exception to
        ENDPOINT_UNREACHABLE (with the message stored in ``details``).
        """
        t0 = time.monotonic()
        try:
            # For CustomProvider this will always pass; cloud providers
            # will override with actual HTTP ping in their adapters.
            provider.context_window_size()
        except TimeoutError:
            return DiagnosticResult(
                step=1,
                name="endpoint",
                code=DiagnosticCode.ENDPOINT_TIMEOUT,
                latency_ms=self._elapsed_ms(t0),
            )
        except Exception as exc:
            return DiagnosticResult(
                step=1,
                name="endpoint",
                code=DiagnosticCode.ENDPOINT_UNREACHABLE,
                latency_ms=self._elapsed_ms(t0),
                details={"error": str(exc)},
            )
        return DiagnosticResult(
            step=1,
            name="endpoint",
            code=DiagnosticCode.ENDPOINT_REACHABLE,
            latency_ms=self._elapsed_ms(t0),
        )

    def _step2_auth(self, provider: LLMProvider) -> DiagnosticResult:
        """Step 2: authentication — verify API key/token accepted.

        Currently a placeholder that always reports AUTH_VALID without
        touching *provider*.
        """
        # Phase 1: CustomProvider has no auth; cloud adapters will override.
        return DiagnosticResult(step=2, name="auth", code=DiagnosticCode.AUTH_VALID)

    def _step3_capabilities(self, provider: LLMProvider) -> DiagnosticResult:
        """Step 3: capability probing — query context windows, token limits.

        On success ``details`` carries ``context_window`` and ``max_output``.
        """
        t0 = time.monotonic()
        try:
            ctx = provider.context_window_size()
            return DiagnosticResult(
                step=3,
                name="capabilities",
                code=DiagnosticCode.CAPABILITIES_DETERMINED,
                latency_ms=self._elapsed_ms(t0),
                details={"context_window": ctx, "max_output": provider.max_output_tokens},
            )
        except Exception as exc:
            return DiagnosticResult(
                step=3,
                name="capabilities",
                code=DiagnosticCode.CAPABILITIES_UNAVAILABLE,
                latency_ms=self._elapsed_ms(t0),
                details={"error": str(exc)},
            )

    def _step4_tokenizer(self, provider: LLMProvider) -> DiagnosticResult:
        """Step 4: tokenizer validation — test on 3 sample texts.

        The tokenizer is considered accurate when the three sample counts are
        all positive and strictly increase with the sample length.
        """
        t0 = time.monotonic()
        try:
            counts = [
                provider.count_tokens(sample)
                for sample in (self._SAMPLE_SHORT, self._SAMPLE_MEDIUM, self._SAMPLE_LONG)
            ]
            # Sanity: all counts should be positive and roughly proportional
            if all(c > 0 for c in counts) and counts[2] > counts[1] > counts[0]:
                code = DiagnosticCode.TOKENIZER_ACCURATE
            else:
                code = DiagnosticCode.TOKENIZER_MISMATCH
            return DiagnosticResult(
                step=4,
                name="tokenizer",
                code=code,
                latency_ms=self._elapsed_ms(t0),
                details={"sample_counts": counts},
            )
        except Exception as exc:
            return DiagnosticResult(
                step=4,
                name="tokenizer",
                code=DiagnosticCode.TOKENIZER_UNAVAILABLE,
                latency_ms=self._elapsed_ms(t0),
                details={"error": str(exc)},
            )

    def _step5_inference(self, provider: LLMProvider) -> DiagnosticResult:
        """Step 5: inference test — minimal generation request.

        A completed request is INFERENCE_WORKING, or INFERENCE_SLOW when it
        took longer than ``_SLOW_INFERENCE_MS``.  An exception whose message
        contains a rate-limit marker is RATE_LIMITED; any other exception is
        still reported as INFERENCE_WORKING (see the note below).
        """
        t0 = time.monotonic()
        try:
            output, finish_reason = provider.generate_chat(
                [{"role": "user", "content": "count: 1"}]
            )
            latency = self._elapsed_ms(t0)
            if latency > self._SLOW_INFERENCE_MS:
                code = DiagnosticCode.INFERENCE_SLOW
            else:
                code = DiagnosticCode.INFERENCE_WORKING
            # NOTE(review): finish_reason is recorded but not inspected, so a
            # provider that signals failure by returning ("", "error") instead
            # of raising is still reported as working — confirm intent.
            return DiagnosticResult(
                step=5,
                name="inference",
                code=code,
                latency_ms=latency,
                details={"output_length": len(output), "finish_reason": finish_reason},
            )
        except Exception as exc:
            err_str = str(exc).lower()
            if any(marker in err_str for marker in self._RATE_LIMIT_MARKERS):
                code = DiagnosticCode.RATE_LIMITED
            else:
                code = DiagnosticCode.INFERENCE_WORKING  # Can't test, assume working
            return DiagnosticResult(
                step=5,
                name="inference",
                code=code,
                latency_ms=self._elapsed_ms(t0),
                details={"error": str(exc)},
            )

    def _step6_context_window(self, provider: LLMProvider) -> DiagnosticResult:
        """Step 6: context window test — verify reported size is accurate.

        Builds a probe text sized to roughly ``_FILL_FRACTION`` of the
        reported window and checks that ``count_tokens`` places it between
        50% and 100% of that window.  Any exception is reported as
        CONTEXT_WINDOW_MISMATCH with the message in ``details``.
        """
        t0 = time.monotonic()
        try:
            ctx = provider.context_window_size()
            # Size the probe in characters: ctx * fraction tokens at
            # _CHARS_PER_TOKEN characters each, expressed in filler words.
            # A 4-chars-per-token estimator then lands on ~80% of the window
            # and a one-token-per-word tokenizer on ~64%; both are inside the
            # accepted band below.
            chars_wanted = ctx * self._FILL_FRACTION * self._CHARS_PER_TOKEN
            repeats = max(0, int(chars_wanted / len(self._FILL_WORD)))
            test_text = self._FILL_WORD * repeats
            token_count = provider.count_tokens(test_text)
            ratio = token_count / ctx if ctx > 0 else 0
            # Token count should be roughly 80% of context; accept 50%-100%.
            if 0.5 < ratio < 1.0:
                code = DiagnosticCode.CONTEXT_WINDOW_VALID
            else:
                code = DiagnosticCode.CONTEXT_WINDOW_MISMATCH
            return DiagnosticResult(
                step=6,
                name="context_window",
                code=code,
                latency_ms=self._elapsed_ms(t0),
                details={"context_window": ctx, "test_tokens": token_count, "ratio": ratio},
            )
        except Exception as exc:
            return DiagnosticResult(
                step=6,
                name="context_window",
                code=DiagnosticCode.CONTEXT_WINDOW_MISMATCH,
                latency_ms=self._elapsed_ms(t0),
                details={"error": str(exc)},
            )
|
|
crp/providers/llamacpp.py
ADDED
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""llama.cpp adapter — local model inference via llama-cpp-python or HTTP (§6.1).
|
|
4
|
+
|
|
5
|
+
Supports two modes:
|
|
6
|
+
1. **Python binding**: ``pip install llama-cpp-python`` (in-process inference).
|
|
7
|
+
2. **HTTP server**: ``llama-server --model model.gguf --port 8080``
|
|
8
|
+
(talks to llama.cpp's OpenAI-compatible endpoint).
|
|
9
|
+
|
|
10
|
+
Usage (Python binding)::
|
|
11
|
+
|
|
12
|
+
from crp.providers.llamacpp import LlamaCppAdapter
|
|
13
|
+
|
|
14
|
+
provider = LlamaCppAdapter(model_path="/models/llama3-8b.gguf")
|
|
15
|
+
output, reason = provider.generate_chat([
|
|
16
|
+
{"role": "user", "content": "Hello!"},
|
|
17
|
+
])
|
|
18
|
+
|
|
19
|
+
Usage (HTTP server)::
|
|
20
|
+
|
|
21
|
+
provider = LlamaCppAdapter(server_url="http://localhost:8080")
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import json
|
|
27
|
+
import logging
|
|
28
|
+
import os
|
|
29
|
+
import random
|
|
30
|
+
import time
|
|
31
|
+
import urllib.request
|
|
32
|
+
import urllib.error
|
|
33
|
+
from typing import Any
|
|
34
|
+
|
|
35
|
+
from crp.providers.base import LLMProvider
|
|
36
|
+
|
|
37
|
+
logger = logging.getLogger("crp.providers.llamacpp")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class LlamaCppAdapter(LLMProvider):
    """llama.cpp adapter — local inference or HTTP server.

    Generation failures are reported through the return value
    ``("", "error")`` rather than by raising.

    Args:
        model_path: Path to a GGUF model file (Python binding mode).
        server_url: Base URL for llama.cpp's HTTP server (e.g. "http://localhost:8080").
            If provided, *model_path* is ignored.
        context_size: Context window size in tokens (default: 4096).
        max_tokens: Max output tokens per generation (default: 2048).
        n_gpu_layers: GPU layers for Python binding (default: -1 = all).
        n_threads: CPU threads for Python binding (default: os.cpu_count()).

    Raises:
        ValueError: If neither *model_path* nor *server_url* is given.
    """

    # HTTP mode: total number of attempts per request (first try included).
    _HTTP_MAX_ATTEMPTS = 4
    # HTTP mode: back-off before retry k is base * 2**k seconds plus jitter.
    _HTTP_BASE_DELAY_S = 2.0
    # HTTP mode: per-request timeout in seconds.
    _HTTP_TIMEOUT_S = 120
    # HTTP mode: 4xx statuses worth retrying (request timeout, rate limit).
    _RETRYABLE_CLIENT_STATUSES = frozenset({408, 429})

    def __init__(
        self,
        *,
        model_path: str | None = None,
        server_url: str | None = None,
        context_size: int = 4_096,
        max_tokens: int = 2_048,
        n_gpu_layers: int = -1,
        n_threads: int | None = None,
    ) -> None:
        if not model_path and not server_url:
            raise ValueError(
                "LlamaCppAdapter requires either model_path (Python binding) "
                "or server_url (HTTP mode)."
            )

        self._context_size = context_size
        self._max_tokens = max_tokens
        self._server_url = server_url
        self._llama = None  # Lazy-loaded Python binding
        self._model_path = model_path
        self._n_gpu_layers = n_gpu_layers
        self._n_threads = n_threads or (os.cpu_count() or 4)

        if server_url:
            logger.debug("LlamaCppAdapter: HTTP mode → %s", server_url)
        else:
            logger.debug("LlamaCppAdapter: Python binding → %s", model_path)

    # -- lazy init --------------------------------------------------------

    def _ensure_model(self) -> Any:
        """Lazy-load the llama-cpp-python model on first use.

        Returns:
            The cached ``Llama`` instance, or ``None`` in HTTP mode.

        Raises:
            ImportError: If binding mode is used without llama-cpp-python.
        """
        if self._llama is not None:
            return self._llama

        if self._server_url:
            return None  # HTTP mode — no local model

        try:
            from llama_cpp import Llama
        except ImportError:
            raise ImportError(
                "Python binding mode requires 'llama-cpp-python'. "
                "Install with: pip install llama-cpp-python"
            ) from None

        self._llama = Llama(  # type: ignore[assignment]
            model_path=self._model_path,  # type: ignore[arg-type]
            n_ctx=self._context_size,
            n_gpu_layers=self._n_gpu_layers,
            n_threads=self._n_threads,
            verbose=False,
        )
        return self._llama

    # -- LLMProvider interface --------------------------------------------

    def generate_chat(
        self, messages: list[dict[str, str]], **kwargs: Any
    ) -> tuple[str, str]:
        """Generate via Python binding or HTTP server.

        Args:
            messages: Chat messages as ``{"role": ..., "content": ...}`` dicts.
            **kwargs: Extra generation options forwarded to the backend;
                ``max_tokens`` overrides the adapter default.

        Returns:
            ``(text, finish_reason)`` where finish_reason is ``"stop"``,
            ``"length"`` or ``"error"``.
        """
        max_tokens = kwargs.pop("max_tokens", self._max_tokens)

        if self._server_url:
            return self._generate_http(messages, max_tokens, **kwargs)
        return self._generate_binding(messages, max_tokens, **kwargs)

    def count_tokens(self, text: str) -> int:
        """Count tokens via llama.cpp's tokenizer or heuristic fallback.

        HTTP mode always uses the heuristic (about one token per four
        characters, at least 1); binding mode falls back to it only when the
        tokenizer call fails.
        """
        if self._server_url:
            # HTTP mode: heuristic (no local tokenizer available)
            return self._estimate_tokens(text)

        model = self._ensure_model()
        try:
            return len(model.tokenize(text.encode("utf-8")))
        except Exception:
            # Best effort: keep returning a usable estimate, but leave a trace.
            logger.debug("llama.cpp tokenize failed; using heuristic count", exc_info=True)
            return self._estimate_tokens(text)

    def context_window_size(self) -> int:
        """Return the configured context window size in tokens."""
        return self._context_size

    @property
    def max_output_tokens(self) -> int | None:
        """Return the configured default output-token limit."""
        return self._max_tokens

    @property
    def model_name(self) -> str:
        """Return the model file's base name, or ``llamacpp@<server_url>``."""
        # NOTE(review): when both model_path and server_url are given the
        # adapter runs in HTTP mode but is still named after the file — confirm.
        if self._model_path:
            return os.path.basename(self._model_path)
        return f"llamacpp@{self._server_url}"

    # -- private ----------------------------------------------------------

    @staticmethod
    def _estimate_tokens(text: str) -> int:
        """Estimate a token count as one token per four characters (min 1)."""
        return max(1, len(text) // 4)

    @staticmethod
    def _normalize_finish_reason(reason: Any) -> str:
        """Collapse a backend finish reason to ``"length"`` or ``"stop"``."""
        return "length" if reason == "length" else "stop"

    @classmethod
    def _is_retryable_status(cls, status: int | None) -> bool:
        """Return True for HTTP statuses that a later attempt may cure."""
        if status is None:
            return True
        return status >= 500 or status in cls._RETRYABLE_CLIENT_STATUSES

    def _generate_binding(
        self, messages: list[dict[str, str]], max_tokens: int, **kwargs: Any
    ) -> tuple[str, str]:
        """Generate using llama-cpp-python's create_chat_completion.

        Any exception from the completion call or from reading its result is
        logged and reported as ``("", "error")``.
        """
        model = self._ensure_model()
        try:
            result = model.create_chat_completion(
                messages=messages,
                max_tokens=max_tokens,
                **kwargs,
            )
            choice = result["choices"][0]
            text = choice["message"]["content"] or ""
            reason = self._normalize_finish_reason(choice.get("finish_reason", "stop"))
            return (text, reason)
        except Exception as exc:
            logger.error("llama.cpp binding error: %s", exc)
            return ("", "error")

    def _generate_http(
        self, messages: list[dict[str, str]], max_tokens: int, **kwargs: Any
    ) -> tuple[str, str]:
        """Generate via llama.cpp's OpenAI-compatible HTTP endpoint with retry (§audit H5).

        Connection-level failures and retryable HTTP statuses (5xx, 408, 429)
        are retried with exponential back-off and jitter.  Other HTTP errors
        and malformed responses fail immediately, because repeating the same
        request cannot succeed.  Every failure path returns ``("", "error")``.
        """
        url = f"{self._server_url.rstrip('/')}/v1/chat/completions"  # type: ignore[union-attr]
        payload = json.dumps({
            "messages": messages,
            "max_tokens": max_tokens,
            **kwargs,
        }).encode("utf-8")

        max_attempts = self._HTTP_MAX_ATTEMPTS
        last_exc: Exception | None = None

        for attempt in range(max_attempts):
            req = urllib.request.Request(
                url,
                data=payload,
                headers={
                    "Content-Type": "application/json",
                    "Connection": "keep-alive",
                },
                method="POST",
            )

            try:
                with urllib.request.urlopen(req, timeout=self._HTTP_TIMEOUT_S) as resp:
                    data = json.loads(resp.read())
                    choice = data["choices"][0]
                    text = choice["message"]["content"] or ""
                    reason = self._normalize_finish_reason(
                        choice.get("finish_reason", "stop")
                    )
                    return (text, reason)
            except urllib.error.HTTPError as exc:
                # Must precede the OSError clause: HTTPError is an OSError too.
                if not self._is_retryable_status(exc.code):
                    logger.error("llama.cpp HTTP error (non-retryable): %s", exc)
                    return ("", "error")
                last_exc = exc
            except OSError as exc:
                # Covers URLError, ConnectionError and TimeoutError as well.
                last_exc = exc
            except Exception as exc:
                logger.error("llama.cpp HTTP error (non-retryable): %s", exc)
                return ("", "error")

            if attempt < max_attempts - 1:
                delay = self._HTTP_BASE_DELAY_S * (2 ** attempt) + random.uniform(0, 0.5)
                logger.warning(
                    "llama.cpp HTTP request failed (attempt %d/%d), retrying in %.1fs: %s",
                    attempt + 1, max_attempts, delay, last_exc,
                )
                time.sleep(delay)

        logger.error("llama.cpp HTTP error after %d retries: %s", max_attempts, last_exc)
        return ("", "error")
|
crp/providers/manager.py
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""LLMProviderManager — multi-provider routing with fallback chain (§05).
|
|
4
|
+
|
|
5
|
+
Supports:
|
|
6
|
+
- Primary provider selection
|
|
7
|
+
- Fallback chain: if primary fails, try registered providers in order
|
|
8
|
+
- Provider registration and retrieval by name
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
from collections.abc import Generator
|
|
15
|
+
|
|
16
|
+
from crp.core.errors import ProviderError
|
|
17
|
+
from crp.providers.base import LLMProvider
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger("crp.providers.manager")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class LLMProviderManager:
    """Routes requests to one or more LLM providers with fallback.

    Providers are keyed by their ``model_name``.  The provider passed to the
    constructor is the primary; providers added with :meth:`register` are
    tried after it, in registration order.

    Usage::

        mgr = LLMProviderManager(primary_provider)
        mgr.register(fallback_provider)

        # Generate with automatic fallback
        output, reason, used = mgr.generate_with_fallback(messages)
    """

    def __init__(self, provider: LLMProvider) -> None:
        self._primary = provider
        self._providers: dict[str, LLMProvider] = {provider.model_name: provider}
        self._fallback_order: list[str] = []  # model names in fallback priority

    @property
    def primary(self) -> LLMProvider:
        """Return the provider given to the constructor."""
        return self._primary

    @property
    def provider_count(self) -> int:
        """Return the number of distinct registered provider names."""
        return len(self._providers)

    @property
    def provider_names(self) -> list[str]:
        """Return the registered provider names in registration order."""
        return list(self._providers)

    def register(self, provider: LLMProvider) -> None:
        """Register an additional provider for fallback routing.

        Registering a name again replaces the stored provider but keeps its
        original position in the fallback order, so no provider is attempted
        twice during a single fallback pass.
        """
        name = provider.model_name
        self._providers[name] = provider
        # NOTE(review): re-registering under the primary's name replaces the
        # routed instance while ``primary`` keeps returning the constructor
        # argument — confirm whether that split is intended.
        if name != self._primary.model_name and name not in self._fallback_order:
            self._fallback_order.append(name)
        logger.info("Provider registered: %s (total: %d)", name, len(self._providers))

    def get(self, name: str | None = None) -> LLMProvider:
        """Get a provider by name, or the primary if *name* is None.

        Raises:
            ProviderError: If *name* is not registered.
        """
        if name is None:
            return self._primary
        provider = self._providers.get(name)
        if provider is None:
            raise ProviderError(
                f"Provider '{name}' not registered",
                available=list(self._providers.keys()),
            )
        return provider

    def _provider_chain(self) -> list[str]:
        """Return provider names in attempt order: primary first, then fallbacks."""
        return [self._primary.model_name, *self._fallback_order]

    def generate_with_fallback(
        self,
        messages: list[dict[str, str]],
        **kwargs: object,
    ) -> tuple[str, str, str]:
        """Generate using primary, fall back to registered providers on failure.

        A provider counts as failed only when ``generate_chat`` raises.

        Returns:
            (output_text, finish_reason, provider_name) — the provider that
            successfully generated is identified in provider_name.

        Raises:
            ProviderError: If ALL providers fail.
        """
        # NOTE(review): a provider that reports failure via its return value
        # (e.g. finish_reason "error") is treated as a success here and does
        # not trigger fallback — confirm this is the intended contract.
        providers_to_try = self._provider_chain()
        primary_name = self._primary.model_name
        last_error: Exception | None = None

        for name in providers_to_try:
            provider = self._providers.get(name)
            if provider is None:
                continue
            try:
                output, finish_reason = provider.generate_chat(messages, **kwargs)
            except Exception as exc:
                logger.warning("Provider '%s' failed: %s", name, exc)
                last_error = exc
                continue
            if name != primary_name:
                logger.warning(
                    "Primary provider failed, fell back to '%s'", name,
                )
            return output, finish_reason, name

        raise ProviderError(
            f"All {len(providers_to_try)} providers failed",
            available=list(self._providers.keys()),
        ) from last_error

    def generate_with_tools_fallback(
        self,
        messages: list[dict[str, object]],
        tools: list[dict[str, object]],
        **kwargs: object,
    ) -> tuple[str, str, list[dict[str, object]] | None, dict[str, object] | None, str]:
        """Generate with tools using primary, fall back on failure.

        Providers whose ``supports_tools()`` is false are skipped.

        Returns:
            (output_text, finish_reason, tool_calls, raw_message, provider_name)

        Raises:
            ProviderError: If no tool-capable provider produced a result.
        """
        primary_name = self._primary.model_name
        last_error: Exception | None = None

        for name in self._provider_chain():
            provider = self._providers.get(name)
            if provider is None or not provider.supports_tools():
                continue
            try:
                result = provider.generate_chat_with_tools(messages, tools, **kwargs)
                if name != primary_name:
                    logger.warning("Tools fallback to '%s'", name)
                return (*result, name)
            except Exception as exc:
                logger.warning("Provider '%s' tools failed: %s", name, exc)
                last_error = exc
                continue

        raise ProviderError(
            "All providers failed for tool-mediated dispatch",
            available=list(self._providers.keys()),
        ) from last_error
|