crprotocol 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crp/__init__.py +126 -0
- crp/__main__.py +8 -0
- crp/_typing.py +27 -0
- crp/_version.py +5 -0
- crp/adapters.py +31 -0
- crp/advanced/__init__.py +40 -0
- crp/advanced/auto_ingest.py +400 -0
- crp/advanced/cqs.py +235 -0
- crp/advanced/cross_window.py +477 -0
- crp/advanced/curator.py +265 -0
- crp/advanced/feedback.py +146 -0
- crp/advanced/hierarchical.py +211 -0
- crp/advanced/meta_learning.py +401 -0
- crp/advanced/parallel.py +98 -0
- crp/advanced/review_cycle.py +329 -0
- crp/advanced/scale_mode.py +129 -0
- crp/advanced/source_grounding.py +207 -0
- crp/ckf/__init__.py +35 -0
- crp/ckf/community.py +377 -0
- crp/ckf/fabric.py +445 -0
- crp/ckf/gc.py +175 -0
- crp/ckf/graph_walk.py +87 -0
- crp/ckf/merge.py +133 -0
- crp/ckf/pattern_query.py +122 -0
- crp/ckf/pubsub.py +128 -0
- crp/ckf/semantic.py +207 -0
- crp/cli/__init__.py +7 -0
- crp/cli/main.py +329 -0
- crp/cli/sidecar.py +929 -0
- crp/cli/startup.py +272 -0
- crp/continuation/__init__.py +103 -0
- crp/continuation/completion.py +348 -0
- crp/continuation/degradation.py +157 -0
- crp/continuation/document_map.py +160 -0
- crp/continuation/flow.py +109 -0
- crp/continuation/gap.py +419 -0
- crp/continuation/manager.py +484 -0
- crp/continuation/quality_monitor.py +179 -0
- crp/continuation/stitch.py +419 -0
- crp/continuation/trigger.py +142 -0
- crp/continuation/voice.py +157 -0
- crp/core/__init__.py +69 -0
- crp/core/batch.py +77 -0
- crp/core/circuit_breaker.py +116 -0
- crp/core/config.py +377 -0
- crp/core/context_tools.py +540 -0
- crp/core/dispatch_router.py +3977 -0
- crp/core/errors.py +128 -0
- crp/core/extraction_facade.py +384 -0
- crp/core/facilitator.py +713 -0
- crp/core/idempotency.py +215 -0
- crp/core/orchestrator.py +1435 -0
- crp/core/relay_strategies.py +613 -0
- crp/core/security_manager.py +140 -0
- crp/core/session.py +134 -0
- crp/core/task_intent.py +36 -0
- crp/core/window.py +363 -0
- crp/envelope/__init__.py +30 -0
- crp/envelope/builder.py +288 -0
- crp/envelope/decomposer.py +236 -0
- crp/envelope/formatter.py +168 -0
- crp/envelope/packer.py +211 -0
- crp/envelope/reranker.py +209 -0
- crp/envelope/scoring.py +310 -0
- crp/extraction/__init__.py +45 -0
- crp/extraction/complexity.py +96 -0
- crp/extraction/contradiction.py +132 -0
- crp/extraction/pipeline.py +360 -0
- crp/extraction/quality_gate.py +237 -0
- crp/extraction/stage1_regex.py +173 -0
- crp/extraction/stage2_statistical.py +244 -0
- crp/extraction/stage3_gliner.py +210 -0
- crp/extraction/stage4_uie.py +183 -0
- crp/extraction/stage5_discourse.py +175 -0
- crp/extraction/stage6_llm.py +178 -0
- crp/extraction/structured_output.py +219 -0
- crp/extraction/types.py +299 -0
- crp/license_guard.py +722 -0
- crp/observability/__init__.py +30 -0
- crp/observability/audit.py +118 -0
- crp/observability/events.py +233 -0
- crp/observability/metrics.py +264 -0
- crp/observability/quality.py +135 -0
- crp/observability/structured_logging.py +81 -0
- crp/observability/telemetry.py +117 -0
- crp/provenance/__init__.py +314 -0
- crp/provenance/_embeddings.py +97 -0
- crp/provenance/_types.py +378 -0
- crp/provenance/attribution_scorer.py +252 -0
- crp/provenance/claim_detector.py +229 -0
- crp/provenance/contradiction_detector.py +243 -0
- crp/provenance/distortion_detector.py +397 -0
- crp/provenance/entailment_verifier.py +358 -0
- crp/provenance/fabrication_detector.py +203 -0
- crp/provenance/hallucination_scorer.py +320 -0
- crp/provenance/omission_analyzer.py +106 -0
- crp/provenance/provenance_chain.py +205 -0
- crp/provenance/report_generator.py +440 -0
- crp/providers/__init__.py +43 -0
- crp/providers/anthropic.py +270 -0
- crp/providers/base.py +135 -0
- crp/providers/custom.py +63 -0
- crp/providers/diagnostic.py +251 -0
- crp/providers/llamacpp.py +224 -0
- crp/providers/manager.py +139 -0
- crp/providers/ollama.py +243 -0
- crp/providers/openai.py +628 -0
- crp/providers/tokenizers.py +48 -0
- crp/py.typed +0 -0
- crp/resources/__init__.py +53 -0
- crp/resources/adaptive_allocator.py +525 -0
- crp/resources/cost_model.py +388 -0
- crp/resources/overhead_manager.py +217 -0
- crp/resources/resource_manager.py +262 -0
- crp/schemas/__init__.py +20 -0
- crp/schemas/cost-estimate.json +33 -0
- crp/schemas/crp-error.json +43 -0
- crp/schemas/envelope-preview.json +40 -0
- crp/schemas/persisted-state-header.json +27 -0
- crp/schemas/quality-report.json +94 -0
- crp/schemas/session-handle.json +33 -0
- crp/schemas/session-status.json +57 -0
- crp/schemas/stream-event.json +18 -0
- crp/schemas/task-intent.json +42 -0
- crp/security/__init__.py +93 -0
- crp/security/audit_trail.py +392 -0
- crp/security/binding.py +192 -0
- crp/security/compliance.py +813 -0
- crp/security/consent.py +593 -0
- crp/security/embedding_defense.py +161 -0
- crp/security/encryption.py +202 -0
- crp/security/injection.py +335 -0
- crp/security/integrity.py +267 -0
- crp/security/privacy.py +662 -0
- crp/security/quarantine.py +249 -0
- crp/security/rbac.py +221 -0
- crp/security/validation.py +164 -0
- crp/state/__init__.py +31 -0
- crp/state/cold_storage.py +258 -0
- crp/state/compaction.py +263 -0
- crp/state/critical_state.py +104 -0
- crp/state/event_log.py +313 -0
- crp/state/fact.py +189 -0
- crp/state/serialization.py +189 -0
- crp/state/session_cleanup.py +77 -0
- crp/state/snapshot.py +290 -0
- crp/state/warm_store.py +346 -0
- crprotocol-2.0.0.dist-info/METADATA +1295 -0
- crprotocol-2.0.0.dist-info/RECORD +153 -0
- crprotocol-2.0.0.dist-info/WHEEL +4 -0
- crprotocol-2.0.0.dist-info/entry_points.txt +2 -0
- crprotocol-2.0.0.dist-info/licenses/LICENSE.md +170 -0
- crprotocol-2.0.0.dist-info/licenses/NOTICE +18 -0
crp/providers/openai.py
ADDED
|
@@ -0,0 +1,628 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""OpenAI adapter — GPT-4o, GPT-4, GPT-3.5-turbo, o1/o3 families (§6.1).
|
|
4
|
+
|
|
5
|
+
Requires ``openai>=1.0`` (``pip install crprotocol[full]``).
|
|
6
|
+
|
|
7
|
+
Usage::
|
|
8
|
+
|
|
9
|
+
from crp.providers.openai import OpenAIAdapter
|
|
10
|
+
|
|
11
|
+
provider = OpenAIAdapter(model="gpt-4o")
|
|
12
|
+
output, reason = provider.generate_chat([
|
|
13
|
+
{"role": "system", "content": "You are helpful."},
|
|
14
|
+
{"role": "user", "content": "Hello!"},
|
|
15
|
+
])
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
import logging
|
|
22
|
+
import os
|
|
23
|
+
import random
|
|
24
|
+
import time
|
|
25
|
+
import urllib.request
|
|
26
|
+
import urllib.error
|
|
27
|
+
from typing import Any
|
|
28
|
+
|
|
29
|
+
from crp.providers.base import LLMProvider
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger("crp.providers.openai")
|
|
32
|
+
|
|
33
|
+
# Model → context window (tokens). Updated as of 2025-Q2.
|
|
34
|
+
# Primary table: exact OpenAI model names.
|
|
35
|
+
_MODEL_CONTEXT: dict[str, int] = {
|
|
36
|
+
"gpt-4o": 128_000,
|
|
37
|
+
"gpt-4o-mini": 128_000,
|
|
38
|
+
"gpt-4-turbo": 128_000,
|
|
39
|
+
"gpt-4": 8_192,
|
|
40
|
+
"gpt-3.5-turbo": 16_385,
|
|
41
|
+
"o1": 200_000,
|
|
42
|
+
"o1-mini": 128_000,
|
|
43
|
+
"o1-preview": 128_000,
|
|
44
|
+
"o3": 200_000,
|
|
45
|
+
"o3-mini": 200_000,
|
|
46
|
+
"o4-mini": 200_000,
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
# Model → max output tokens (hard cap set by OpenAI).
|
|
50
|
+
_MODEL_MAX_OUTPUT: dict[str, int] = {
|
|
51
|
+
"gpt-4o": 16_384,
|
|
52
|
+
"gpt-4o-mini": 16_384,
|
|
53
|
+
"gpt-4-turbo": 4_096,
|
|
54
|
+
"gpt-4": 8_192,
|
|
55
|
+
"gpt-3.5-turbo": 4_096,
|
|
56
|
+
"o1": 100_000,
|
|
57
|
+
"o1-mini": 65_536,
|
|
58
|
+
"o1-preview": 32_768,
|
|
59
|
+
"o3": 100_000,
|
|
60
|
+
"o3-mini": 100_000,
|
|
61
|
+
"o4-mini": 100_000,
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
# ── Model family table (prefix-matched) ─────────────────────────────
|
|
65
|
+
# Used when the model name is NOT in the primary OpenAI table.
|
|
66
|
+
# This covers open-source models served via OpenAI-compatible APIs
|
|
67
|
+
# (LM Studio, vLLM, llama.cpp server, Ollama OpenAI compat, TGI, etc.).
|
|
68
|
+
# Within each family, more specific (longer) prefixes are listed before shorter ones, because lookup returns the first prefix that matches.
|
|
69
|
+
_MODEL_FAMILY_CONTEXT: list[tuple[str, int, int]] = [
|
|
70
|
+
# (prefix, context_window, max_output_tokens)
|
|
71
|
+
# Qwen family
|
|
72
|
+
("qwen3", 40_960, 8_192),
|
|
73
|
+
("qwen2.5", 128_000, 8_192),
|
|
74
|
+
("qwen2", 128_000, 8_192),
|
|
75
|
+
("qwen", 32_768, 4_096),
|
|
76
|
+
# LLaMA family
|
|
77
|
+
("llama-3.3", 128_000, 4_096),
|
|
78
|
+
("llama-3.2", 128_000, 4_096),
|
|
79
|
+
("llama-3.1", 128_000, 4_096),
|
|
80
|
+
("llama3.3", 128_000, 4_096),
|
|
81
|
+
("llama3.2", 128_000, 4_096),
|
|
82
|
+
("llama3.1", 128_000, 4_096),
|
|
83
|
+
("llama-3", 8_192, 4_096),
|
|
84
|
+
("llama3", 8_192, 4_096),
|
|
85
|
+
("llama-2", 4_096, 4_096),
|
|
86
|
+
("llama2", 4_096, 4_096),
|
|
87
|
+
("codellama", 16_384, 4_096),
|
|
88
|
+
# Gemma family
|
|
89
|
+
("gemma-3-27b", 128_000, 8_192),
|
|
90
|
+
("gemma-3", 32_768, 4_096), # Smaller gemma-3 variants
|
|
91
|
+
("gemma3", 128_000, 8_192),
|
|
92
|
+
("gemma-2", 8_192, 4_096),
|
|
93
|
+
("gemma2", 8_192, 4_096),
|
|
94
|
+
("gemma", 8_192, 4_096),
|
|
95
|
+
# Mistral family
|
|
96
|
+
("mistral-large", 128_000, 8_192),
|
|
97
|
+
("mistral-medium", 32_768, 4_096),
|
|
98
|
+
("mistral-small", 32_768, 4_096),
|
|
99
|
+
("mixtral", 32_768, 4_096),
|
|
100
|
+
("mistral", 32_768, 4_096),
|
|
101
|
+
# Phi family
|
|
102
|
+
("phi-4", 16_384, 4_096),
|
|
103
|
+
("phi4", 16_384, 4_096),
|
|
104
|
+
("phi-3", 128_000, 4_096),
|
|
105
|
+
("phi3", 128_000, 4_096),
|
|
106
|
+
# DeepSeek family
|
|
107
|
+
("deepseek-r1", 128_000, 16_384),
|
|
108
|
+
("deepseek-v3", 128_000, 16_384),
|
|
109
|
+
("deepseek-v2", 128_000, 8_192),
|
|
110
|
+
("deepseek-coder", 128_000, 8_192),
|
|
111
|
+
("deepseek", 128_000, 8_192),
|
|
112
|
+
# Command-R family
|
|
113
|
+
("command-r-plus", 128_000, 4_096),
|
|
114
|
+
("command-r", 128_000, 4_096),
|
|
115
|
+
# RWKV family
|
|
116
|
+
("rwkv", 100_000, 4_096),
|
|
117
|
+
# Yi family
|
|
118
|
+
("yi-", 200_000, 4_096),
|
|
119
|
+
# InternLM family
|
|
120
|
+
("internlm", 256_000, 8_192),
|
|
121
|
+
# Anthropic (when proxied through OpenAI-compat)
|
|
122
|
+
("claude-3.5", 200_000, 8_192),
|
|
123
|
+
("claude-3", 200_000, 4_096),
|
|
124
|
+
("claude", 200_000, 4_096),
|
|
125
|
+
# Youtu (WASA native)
|
|
126
|
+
("youtu", 128_000, 16_384),
|
|
127
|
+
]
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _resolve_model_capabilities(
    model: str,
    base_url: str | None = None,
) -> tuple[int, int]:
    """Work out the context window and max output tokens for *model*.

    Resolution order (first hit wins):
        1. Exact, case-sensitive key in ``_MODEL_CONTEXT``
        2. First entry of ``_MODEL_FAMILY_CONTEXT`` whose prefix matches the
           lower-cased model name
        3. Server-side probe via ``_probe_server_model_info`` (only when
           *base_url* is truthy)
        4. Conservative fallback of ``(8_192, 4_096)``

    Returns:
        ``(context_window, max_output_tokens)``.
    """
    # Step 1: exact match (OpenAI models). A model missing from the output
    # table still resolves, with a 4_096 output cap.
    exact_ctx = _MODEL_CONTEXT.get(model)
    if exact_ctx is not None:
        return (exact_ctx, _MODEL_MAX_OUTPUT.get(model, 4_096))

    # Step 2: family prefix match; table order decides which prefix wins.
    name = model.lower()
    family = next(
        (row for row in _MODEL_FAMILY_CONTEXT if name.startswith(row[0])),
        None,
    )
    if family is not None:
        prefix, ctx, max_out = family
        logger.info(
            "Model '%s' matched family '%s': ctx=%d, max_out=%d",
            model, prefix, ctx, max_out,
        )
        return (ctx, max_out)

    # Step 3: ask the server itself (non-OpenAI servers may expose metadata).
    probed = _probe_server_model_info(model, base_url) if base_url else None
    if probed:
        return probed

    # Step 4: deliberately small defaults; assuming 128K would be dangerous
    # for small models.
    logger.warning(
        "Model '%s' not in any known table. Using conservative defaults "
        "(ctx=8192, max_out=4096). Override with context_size= parameter "
        "or add model to _MODEL_FAMILY_CONTEXT.",
        model,
    )
    return (8_192, 4_096)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def _probe_server_model_info(
    model: str,
    base_url: str,
) -> tuple[int, int] | None:
    """Probe the server for model metadata (best-effort, never raises).

    Tries, in order:
        1. ``GET {root}/v1/models/{model}`` and reads ``max_model_len``,
           ``context_length`` or ``max_context_length`` from the JSON body.
        2. ``POST {root}/api/show`` (Ollama-compatible) and reads the context
           length from the ``model_info`` object.

    ``root`` is *base_url* with any trailing ``/`` and a trailing ``/v1``
    removed, so both ``http://host:1234`` and ``http://host:1234/v1`` resolve
    to the same endpoints.

    Args:
        model: Model identifier as sent to the server.
        base_url: Server base URL; only ``http``/``https`` are probed.

    Returns:
        ``(context_window, max_output_tokens)`` or ``None`` when neither
        probe yields a positive integer context length. Each request uses a
        5 second timeout.
    """
    from urllib.parse import quote, urlsplit

    def _positive_int(value: object) -> int | None:
        # bool is an int subclass; reject it so ``True`` is never a length.
        if isinstance(value, int) and not isinstance(value, bool) and value > 0:
            return value
        return None

    def _first_positive_int(mapping: dict, keys) -> int | None:
        for key in keys:
            found = _positive_int(mapping.get(key))
            if found is not None:
                return found
        return None

    root = base_url.rstrip("/")
    # OpenAI-style base URLs conventionally already end in "/v1"; strip it so
    # the paths below do not become "/v1/v1/models/..." or "/v1/api/show".
    if root.endswith("/v1"):
        root = root[: -len("/v1")]

    # urlopen also accepts file:// and other schemes; only talk HTTP(S).
    if urlsplit(root).scheme not in ("http", "https"):
        logger.debug(
            "Skipping server probe for '%s': base_url scheme is not http(s)",
            model,
        )
        return None

    # Attempt 1: /v1/models/{model} (vLLM, TGI expose max_model_len here)
    try:
        req = urllib.request.Request(
            # Quote the name so spaces/'?'/'#' cannot corrupt the URL; '/' and
            # ':' stay literal so common ids (org/model, name:tag) are unchanged.
            f"{root}/v1/models/{quote(model, safe='/:')}",
            headers={"Accept": "application/json"},
            method="GET",
        )
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        if isinstance(data, dict):
            ctx = _first_positive_int(
                data, ("max_model_len", "context_length", "max_context_length")
            )
            if ctx is not None:
                # Trust the server's output cap only when it is a positive
                # int; otherwise derive one from the context size.
                max_out = _positive_int(data.get("max_output_tokens")) or min(
                    ctx // 4, 16_384
                )
                logger.info(
                    "Server probe found model '%s': ctx=%d, max_out=%d",
                    model, ctx, max_out,
                )
                return (ctx, max_out)
    except Exception:
        # Deliberately broad: probing is optional and must never break
        # adapter construction.
        logger.debug(
            "Server probe /v1/models/%s failed (expected for non-vLLM servers)",
            model,
            exc_info=True,
        )

    # Attempt 2: Ollama-compatible /api/show
    try:
        req = urllib.request.Request(
            f"{root}/api/show",
            # "name" is what the original request sent; "model" is added
            # because current Ollama docs name the field "model".
            # NOTE(review): confirm both fields are accepted together.
            data=json.dumps({"name": model, "model": model}).encode("utf-8"),
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=5) as resp:
            data = json.loads(resp.read())
        info = data.get("model_info") if isinstance(data, dict) else None
        if isinstance(info, dict):
            ctx = _first_positive_int(info, ("context_length", "num_ctx"))
            if ctx is None:
                # Ollama namespaces the key by architecture, e.g.
                # "llama.context_length".
                namespaced = [
                    key
                    for key in info
                    if isinstance(key, str) and key.endswith(".context_length")
                ]
                ctx = _first_positive_int(info, namespaced)
            if ctx is not None:
                max_out = min(ctx // 4, 16_384)
                logger.info(
                    "Ollama probe found model '%s': ctx=%d, max_out=%d",
                    model, ctx, max_out,
                )
                return (ctx, max_out)
    except Exception:
        logger.debug(
            "Ollama probe /api/show failed for '%s' (expected for non-Ollama servers)",
            model,
            exc_info=True,
        )

    return None
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _require_openai():
|
|
236
|
+
"""Import openai with a friendly error."""
|
|
237
|
+
try:
|
|
238
|
+
import openai
|
|
239
|
+
return openai
|
|
240
|
+
except ImportError:
|
|
241
|
+
raise ImportError(
|
|
242
|
+
"OpenAI adapter requires the 'openai' package. "
|
|
243
|
+
"Install with: pip install crprotocol[full]"
|
|
244
|
+
) from None
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _require_tiktoken():
|
|
248
|
+
"""Import tiktoken with a friendly error."""
|
|
249
|
+
try:
|
|
250
|
+
import tiktoken
|
|
251
|
+
return tiktoken
|
|
252
|
+
except ImportError:
|
|
253
|
+
return None # Fall back to heuristic
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
class OpenAIAdapter(LLMProvider):
    """OpenAI chat completions adapter.

    Works with the OpenAI API and any OpenAI-compatible server (LM Studio,
    vLLM, llama.cpp server, Ollama OpenAI compat, TGI, etc.).

    Model capabilities are resolved by ``_resolve_model_capabilities``:
        1. Exact match against known OpenAI models
        2. Prefix match against known open-source model families
        3. Server-side probing (for vLLM, Ollama-compat endpoints)
        4. Conservative fallback (8K context) — safe for unknown models

    Args:
        model: Model name (e.g. "gpt-4o", "qwen3-4b", "llama3.1").
        api_key: API key. Defaults to ``OPENAI_API_KEY`` env var.
        base_url: Override API base URL (for LM Studio, vLLM, etc.).
        context_size: Override auto-discovered context window (tokens).
        max_tokens: Override auto-discovered max output tokens per request.
        timeout: HTTP timeout in seconds (default: 120).

    Raises:
        ImportError: If the ``openai`` package is not installed.
        ValueError: If no API key is passed and ``OPENAI_API_KEY`` is unset.
    """

    def __init__(
        self,
        model: str = "gpt-4o",
        *,
        api_key: str | None = None,
        base_url: str | None = None,
        context_size: int | None = None,
        max_tokens: int | None = None,
        timeout: float = 120.0,
    ) -> None:
        openai = _require_openai()

        self._model = model

        # Reasoning/thinking content from the last generate_chat() /
        # generate_chat_with_tools() call; None if no reasoning was present
        # or the call failed.
        self.last_reasoning_content: str | None = None

        # Check credentials first: capability resolution below may perform
        # network probes, which is wasted work if construction fails anyway.
        key = api_key or os.environ.get("OPENAI_API_KEY")
        if not key:
            raise ValueError(
                "No API key provided. Pass api_key= or set OPENAI_API_KEY."
            )

        # ── Model capabilities ──────────────────────────────────
        # User-explicit overrides always win; when both are supplied there
        # is nothing left to discover, so skip resolution entirely.
        if context_size and max_tokens:
            self._context_size = context_size
            self._max_tokens = max_tokens
        else:
            resolved_ctx, resolved_max = _resolve_model_capabilities(model, base_url)
            self._context_size = context_size or resolved_ctx
            self._max_tokens = max_tokens or resolved_max

        # Build the client
        kwargs: dict[str, Any] = {"api_key": key, "timeout": timeout}
        if base_url:
            kwargs["base_url"] = base_url
        self._client = openai.OpenAI(**kwargs)

        # Tokenizer (optional — tiktoken for accurate counts)
        tiktoken = _require_tiktoken()
        self._encoding = None
        if tiktoken:
            try:
                try:
                    self._encoding = tiktoken.encoding_for_model(model)
                except KeyError:
                    # Model unknown to tiktoken: use a generic encoding.
                    self._encoding = tiktoken.get_encoding("cl100k_base")
            except Exception:
                # The tokenizer is optional, so a failure to load an encoding
                # must not abort construction; count_tokens() then uses the
                # chars/4 heuristic. NOTE(review): presumably triggered when
                # tiktoken cannot fetch its vocabulary data — confirm.
                self._encoding = None
                logger.warning(
                    "tiktoken encoding unavailable for model=%s; "
                    "falling back to heuristic token counts",
                    model,
                    exc_info=True,
                )

        logger.info(
            "OpenAIAdapter initialized: model=%s, ctx=%d, max_out=%d (auto-discovered=%s)",
            model, self._context_size, self._max_tokens,
            "no" if context_size or max_tokens else "yes",
        )

    # -- LLMProvider interface --------------------------------------------

    # Retry config: up to 4 attempts in total, exponential backoff with jitter
    _MAX_RETRIES = 4
    _BASE_DELAY = 2.0  # seconds (generous for local inference servers)

    # HTTP statuses treated as transient: request timeout, rate limit,
    # server errors and gateway errors.
    _RETRYABLE_STATUS_CODES = frozenset({408, 429, 500, 502, 503, 504})

    # Model-name prefixes that reject ``max_tokens`` and must be sent
    # ``max_completion_tokens`` instead (OpenAI o-series).
    _COMPLETION_TOKEN_PREFIXES = ("o1", "o3", "o4")

    @staticmethod
    def _is_retryable(exc: Exception) -> bool:
        """Check if an exception is transient and worth retrying."""
        # An HTTP status, when present, is authoritative. A ``status_code``
        # attribute that is None carries no information, so fall through to
        # the type/message checks instead of refusing to retry.
        status = getattr(exc, "status_code", None)
        if status is not None:
            return status in OpenAIAdapter._RETRYABLE_STATUS_CODES
        # openai.RateLimitError, openai.APIConnectionError, etc. — matched by
        # class name so this method needs no import of the openai package.
        if type(exc).__name__ in (
            "RateLimitError", "APIConnectionError", "APITimeoutError",
            "InternalServerError", "APIStatusError",
        ):
            return True
        # Connection-level transients (includes Channel Error from LM Studio)
        if isinstance(exc, (ConnectionError, TimeoutError, OSError)):
            return True
        # Catch-all: retry anything with "channel", "connection", "reset" or
        # "closed" in the message
        exc_msg = str(exc).lower()
        return any(kw in exc_msg for kw in ("channel", "connection", "reset", "closed"))

    def _request_params(
        self,
        messages: list[dict[str, Any]],
        kwargs: dict[str, Any],
        **fixed: Any,
    ) -> dict[str, Any]:
        """Build the keyword arguments for ``chat.completions.create``.

        Precedence (later wins): adapter defaults, *fixed* entries supplied
        by the calling method, then the caller's remaining *kwargs*.
        ``max_tokens`` is popped from *kwargs* and sent under the parameter
        name the model accepts.
        """
        limit = kwargs.pop("max_tokens", self._max_tokens)
        params: dict[str, Any] = {"model": self._model, "messages": messages}
        if self._model.lower().startswith(self._COMPLETION_TOKEN_PREFIXES):
            # o-series models reject ``max_tokens`` outright.
            params["max_completion_tokens"] = limit
        else:
            params["max_tokens"] = limit
        params.update(fixed)
        params.update(kwargs)
        return params

    def generate_chat(
        self, messages: list[dict[str, str]], **kwargs: Any
    ) -> tuple[str, str]:
        """Call OpenAI chat completions API with retry on transient failures.

        Handles "thinking" models (Qwen3, DeepSeek-R1, o1, etc.) that
        split output into reasoning_content + content fields. Only the
        final content is returned; the reasoning is kept on
        ``last_reasoning_content`` for inspection.

        Args:
            messages: Chat messages in OpenAI format.
            **kwargs: Extra request parameters; ``max_tokens`` overrides the
                adapter default for this call.

        Returns:
            ``(output_text, finish_reason)`` where finish_reason is
            ``"stop"``, ``"length"`` or ``"error"``. Errors are reported via
            ``("", "error")`` rather than raised.
        """
        params = self._request_params(messages, kwargs)

        # Reset up front so a failed call never exposes reasoning left over
        # from an earlier, unrelated call.
        self.last_reasoning_content = None

        last_exc: Exception | None = None
        for attempt in range(self._MAX_RETRIES):
            try:
                response = self._client.chat.completions.create(**params)
                choice = response.choices[0]
                text = choice.message.content or ""
                reason = choice.finish_reason or "stop"

                # ── Handle thinking models ──────────────────────
                # Models like Qwen3, DeepSeek-R1, o1 put reasoning in a
                # separate field.
                reasoning = getattr(choice.message, "reasoning_content", None)
                self.last_reasoning_content = reasoning or None
                if not text and reasoning:
                    # Model exhausted its budget on reasoning — no final
                    # content produced. Report finish_reason="length" so the
                    # continuation engine knows the budget was exhausted
                    # (not that the model finished). Reasoning is preserved
                    # for inspection but NOT used as output.
                    reason = "length"
                    logger.info(
                        "Thinking model: all tokens spent on reasoning "
                        "(%d chars), no content produced. Returning "
                        "finish_reason=length so continuation proceeds.",
                        len(reasoning),
                    )
                elif text and reasoning:
                    # Both present — the clean content is returned and the
                    # reasoning is not part of the output.
                    logger.debug(
                        "Thinking model: reasoning=%d chars, content=%d chars",
                        len(reasoning), len(text),
                    )

                # Map OpenAI finish reasons to CRP convention: everything
                # except "length" (content_filter, tool_calls, end_turn, ...)
                # collapses to "stop".
                if reason != "length":
                    reason = "stop"

                return (text, reason)
            except Exception as exc:
                last_exc = exc
                if attempt < self._MAX_RETRIES - 1 and self._is_retryable(exc):
                    delay = self._BASE_DELAY * (2 ** attempt) + random.uniform(0, 0.5)
                    logger.warning(
                        "OpenAI transient error (attempt %d/%d), retrying in %.1fs: %s",
                        attempt + 1, self._MAX_RETRIES, delay, type(exc).__name__,
                    )
                    time.sleep(delay)
                else:
                    logger.error("OpenAI API error: %s", type(exc).__name__)
                    return ("", "error")

        # Only reachable when _MAX_RETRIES <= 0 (the loop body never runs).
        logger.error("OpenAI API failed after %d retries: %s", self._MAX_RETRIES, type(last_exc).__name__)
        return ("", "error")

    def count_tokens(self, text: str) -> int:
        """Count tokens using tiktoken (exact) or fallback heuristic.

        Without an encoding the estimate is ``len(text) // 4`` with a
        minimum of 1.
        """
        if self._encoding is not None:
            # disallowed_special=() makes tiktoken encode special-token
            # literals (e.g. "<|endoftext|>") as ordinary text instead of
            # raising ValueError on arbitrary user/document content.
            return len(self._encoding.encode(text, disallowed_special=()))
        return max(1, len(text) // 4)

    def context_window_size(self) -> int:
        """Return the context window (tokens) in effect for this adapter."""
        return self._context_size

    @property
    def max_output_tokens(self) -> int | None:
        """Default per-request output token limit."""
        return self._max_tokens

    @property
    def model_name(self) -> str:
        """Model name as passed to the constructor."""
        return self._model

    # Thinking model prefixes — models that produce reasoning_content
    _THINKING_PREFIXES = ("qwen3", "deepseek-r1", "o1", "o3", "o4")

    @property
    def is_thinking_model(self) -> bool:
        """Detect if the current model is a thinking/reasoning model."""
        return self._model.lower().startswith(self._THINKING_PREFIXES)

    def cost_per_1k_tokens(self) -> tuple[float, float]:
        """OpenAI pricing per 1K tokens (USD) — updated 2025-Q2.

        Returns:
            A pair of per-1K-token prices for the exact model name, or
            ``(0.0, 0.0)`` when the model is not in the table.
            NOTE(review): presumably ``(input, output)`` — confirm against
            the ``LLMProvider`` contract.
        """
        pricing = {
            "gpt-4o": (0.0025, 0.010),
            "gpt-4o-mini": (0.00015, 0.0006),
            "gpt-4-turbo": (0.010, 0.030),
            "gpt-4": (0.030, 0.060),
            "gpt-3.5-turbo": (0.0005, 0.0015),
            "o1": (0.015, 0.060),
            "o1-mini": (0.003, 0.012),
            "o3": (0.015, 0.060),
            "o3-mini": (0.0011, 0.0044),
            "o4-mini": (0.0011, 0.0044),
        }
        return pricing.get(self._model, (0.0, 0.0))

    # ── Tool-mediated dispatch (§20) ──────────────────────────────────

    def supports_tools(self) -> bool:
        """OpenAI and compatible servers support function/tool calling."""
        return True

    def generate_chat_with_tools(
        self,
        messages: list[dict[str, object]],
        tools: list[dict[str, object]],
        **kwargs: object,
    ) -> tuple[str, str, list[dict[str, object]] | None, dict[str, object] | None]:
        """Generate with OpenAI tool/function calling.

        Args:
            messages: Conversation so far, in OpenAI format.
            tools: Tool definitions forwarded as the ``tools`` parameter.
            **kwargs: Extra request parameters; ``max_tokens`` and
                ``tool_choice`` (default ``"auto"``) are recognised.

        Returns:
            ``(text, finish_reason, tool_calls, raw_assistant_message)``.
            When the model wants to call tools, finish_reason="tool_calls",
            the tool_calls list contains structured call requests (arguments
            parsed from JSON, or ``{"raw": ...}`` when unparseable), and
            raw_assistant_message is the full message dict for appending to
            conversation history (required by the OpenAI tool protocol).
            Otherwise the last two items are None. Errors are reported via
            ``("", "error", None, None)`` rather than raised.
        """
        tool_choice = kwargs.pop("tool_choice", "auto")
        params = self._request_params(
            messages, kwargs, tools=tools, tool_choice=tool_choice
        )

        # Reset up front so a failed call never exposes stale reasoning.
        self.last_reasoning_content = None

        last_exc: Exception | None = None
        for attempt in range(self._MAX_RETRIES):
            try:
                response = self._client.chat.completions.create(**params)
                choice = response.choices[0]
                text = choice.message.content or ""
                reason = choice.finish_reason or "stop"

                # Record reasoning for every successful response, including
                # the tool-call path, so the attribute always reflects the
                # most recent call.
                reasoning = getattr(choice.message, "reasoning_content", None)
                self.last_reasoning_content = reasoning or None

                # Extract tool calls if present
                raw_tool_calls = choice.message.tool_calls
                if raw_tool_calls:
                    tool_calls_out: list[dict[str, object]] = []
                    for tc in raw_tool_calls:
                        # Parse arguments (may be JSON string)
                        try:
                            args = json.loads(tc.function.arguments) if isinstance(tc.function.arguments, str) else tc.function.arguments
                        except (json.JSONDecodeError, TypeError):
                            args = {"raw": tc.function.arguments}

                        tool_calls_out.append({
                            "id": tc.id,
                            "type": "function",
                            "function": {
                                "name": tc.function.name,
                                "arguments": args,
                            },
                        })

                    # Build raw assistant message for conversation history;
                    # here arguments stay a JSON string, as the wire format
                    # requires.
                    raw_msg: dict[str, Any] = {
                        "role": "assistant",
                        "content": text or None,
                        "tool_calls": [
                            {
                                "id": tc.id,
                                "type": "function",
                                "function": {
                                    "name": tc.function.name,
                                    "arguments": tc.function.arguments if isinstance(tc.function.arguments, str) else json.dumps(tc.function.arguments),
                                },
                            }
                            for tc in raw_tool_calls
                        ],
                    }

                    logger.info(
                        "Tool calls requested: %d calls [%s]",
                        len(tool_calls_out),
                        ", ".join(tc["function"]["name"] for tc in tool_calls_out),
                    )
                    return (text, "tool_calls", tool_calls_out, raw_msg)

                # No tool calls — normal completion.
                # Thinking model that produced only reasoning: report
                # "length" (same convention as generate_chat).
                if not text and reasoning:
                    reason = "length"

                if reason != "length":
                    reason = "stop"

                return (text, reason, None, None)

            except Exception as exc:
                last_exc = exc
                if attempt < self._MAX_RETRIES - 1 and self._is_retryable(exc):
                    delay = self._BASE_DELAY * (2 ** attempt) + random.uniform(0, 0.5)
                    logger.warning(
                        "OpenAI tool call transient error (attempt %d/%d), retrying in %.1fs: %s",
                        attempt + 1, self._MAX_RETRIES, delay, exc,
                    )
                    time.sleep(delay)
                else:
                    logger.error("OpenAI tool call API error: %s", exc)
                    return ("", "error", None, None)

        # Only reachable when _MAX_RETRIES <= 0 (the loop body never runs).
        logger.error("OpenAI tool call failed after %d retries: %s", self._MAX_RETRIES, last_exc)
        return ("", "error", None, None)

    def generate_chat_stream(
        self,
        messages: list[dict[str, str]],
        **kwargs: object,
    ):
        """Stream token chunks from OpenAI.

        Yields individual content deltas (strings). The generator's return
        value (``StopIteration.value``) is the finish reason: ``"stop"``,
        ``"length"`` or ``"error"``. No retry is attempted; a failure is
        logged and ends the stream with ``"error"``.
        """
        params = self._request_params(messages, kwargs, stream=True)

        finish_reason = "stop"
        stream = None
        try:
            stream = self._client.chat.completions.create(**params)
            for chunk in stream:
                if chunk.choices:
                    delta = chunk.choices[0].delta
                    if delta and delta.content:
                        yield delta.content
                    fr = chunk.choices[0].finish_reason
                    if fr:
                        finish_reason = "length" if fr == "length" else "stop"
        except Exception as exc:
            logger.error("OpenAI streaming error: %s", exc)
            finish_reason = "error"
        finally:
            # Release the HTTP response even when the consumer abandons the
            # generator early or an error interrupts iteration.
            close = getattr(stream, "close", None)
            if callable(close):
                try:
                    close()
                except Exception:
                    logger.debug("Closing OpenAI stream failed", exc_info=True)

        return finish_reason
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Per-provider tokenizer reconciliation (§06 §6.4).
|
|
4
|
+
|
|
5
|
+
Three-layer hierarchy:
|
|
6
|
+
Layer 1: Model-specific tokenizer (best — 100% accuracy)
|
|
7
|
+
Layer 2: Provider API token counting (good — 99%)
|
|
8
|
+
Layer 3: Character-to-token fallback (acceptable — 70-80%)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from crp.providers.base import LLMProvider
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TokenizerRegistry:
    """Resolve token counts per provider, with a heuristic safety net.

    Counting is delegated to ``provider.count_tokens()``, which each adapter
    implements with its own tokenizer. This class only contributes the
    Layer 3 characters-per-token estimate, used when the provider raises.
    """

    _CHARS_PER_TOKEN = 4  # Layer 3 heuristic

    def count_tokens(self, text: str, provider: LLMProvider) -> int:
        """Return the token count of *text* for *provider*.

        Uses ``provider.count_tokens(text)``; if that raises any
        ``Exception`` the chars/4 estimate is returned instead.
        """
        try:
            tokens = provider.count_tokens(text)
        except Exception:
            tokens = self._fallback_count(text)
        return tokens

    def _fallback_count(self, text: str) -> int:
        """Layer 3 estimate: one token per 4 characters, never less than 1."""
        estimate = len(text) // self._CHARS_PER_TOKEN
        return estimate if estimate > 1 else 1

    def validate_roundtrip(self, text: str, provider: LLMProvider) -> bool:
        """Best-effort tokenizer sanity check.

        Counts *text* twice through the provider and reports whether both
        counts agree. Any exception from the provider yields ``False``.
        """
        try:
            first = provider.count_tokens(text)
            second = provider.count_tokens(text)
            return first == second
        except Exception:
            return False
|
crp/py.typed
ADDED
|
File without changes
|