deepseek_harness-0.2.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepseek_harness-0.2.0/PKG-INFO +50 -0
- deepseek_harness-0.2.0/README.md +25 -0
- deepseek_harness-0.2.0/deepseek_harness/__init__.py +41 -0
- deepseek_harness-0.2.0/deepseek_harness/cache.py +198 -0
- deepseek_harness-0.2.0/deepseek_harness/client.py +259 -0
- deepseek_harness-0.2.0/deepseek_harness/exceptions.py +36 -0
- deepseek_harness-0.2.0/deepseek_harness/normalize.py +84 -0
- deepseek_harness-0.2.0/deepseek_harness/reasoning.py +112 -0
- deepseek_harness-0.2.0/deepseek_harness/summarize.py +118 -0
- deepseek_harness-0.2.0/deepseek_harness/tool_calls.py +181 -0
- deepseek_harness-0.2.0/deepseek_harness.egg-info/PKG-INFO +50 -0
- deepseek_harness-0.2.0/deepseek_harness.egg-info/SOURCES.txt +16 -0
- deepseek_harness-0.2.0/deepseek_harness.egg-info/dependency_links.txt +1 -0
- deepseek_harness-0.2.0/deepseek_harness.egg-info/requires.txt +10 -0
- deepseek_harness-0.2.0/deepseek_harness.egg-info/top_level.txt +1 -0
- deepseek_harness-0.2.0/pyproject.toml +41 -0
- deepseek_harness-0.2.0/setup.cfg +4 -0
- deepseek_harness-0.2.0/tests/test_smoke.py +179 -0
deepseek_harness-0.2.0/PKG-INFO

@@ -0,0 +1,50 @@
Metadata-Version: 2.4
Name: deepseek-harness
Version: 0.2.0
Summary: Protocol-aware client for DeepSeek V4-Pro / V4-Flash. Survives the 16 documented quirks; ships the cache discount.
Author: Henry Zhang
License: MIT
Project-URL: Homepage, https://github.com/HenryZ838978/deepseek-harness
Project-URL: Reports, https://github.com/HenryZ838978/deepseek-harness/tree/main/reports
Project-URL: Spec, https://github.com/HenryZ838978/deepseek-harness/tree/main/spec
Keywords: deepseek,llm,openai,agent,harness,mcp
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Topic :: Software Development :: Libraries
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Requires-Python: >=3.9
Description-Content-Type: text/markdown
Requires-Dist: openai>=1.50.0
Requires-Dist: httpx>=0.27.0
Requires-Dist: tiktoken>=0.7.0
Provides-Extra: dev
Requires-Dist: pytest>=8.0.0; extra == "dev"
Requires-Dist: ruff>=0.5.0; extra == "dev"
Provides-Extra: dotenv
Requires-Dist: python-dotenv>=1.0.0; extra == "dotenv"

# `deepseek-harness`

Protocol-aware Python client for **DeepSeek V4-Pro / V4-Flash**.
Survives the [16 documented quirks](https://github.com/HenryZ838978/deepseek-harness/blob/main/reports/REPORT_2026-05-09.md); ships the 50× cache discount.

```bash
pip install deepseek-harness
```

```python
from deepseek_harness import DeepSeekHarness

c = DeepSeekHarness(disable_thinking_by_default=True)
out = c.chat(
    model="deepseek-v4-pro",
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=4096,
)
print(out["message"]["content"])
print(f"cost: ${out['usage']['estimated_cost_usd']:.6f} · cache hit: {out['usage']['cache_hit_rate']:.0%}")
```

The harness wraps `openai.OpenAI` and enforces 10 contract rules by default. See the [main repository](https://github.com/HenryZ838978/deepseek-harness) for the full spec, probe corpus, and three other distribution forms (`dsh` CLI, `@deepseek-harness/mcp` server, Anthropic `SKILL.md`).

License: MIT.
deepseek_harness-0.2.0/README.md

@@ -0,0 +1,25 @@
# `deepseek-harness`

Protocol-aware Python client for **DeepSeek V4-Pro / V4-Flash**.
Survives the [16 documented quirks](https://github.com/HenryZ838978/deepseek-harness/blob/main/reports/REPORT_2026-05-09.md); ships the 50× cache discount.

```bash
pip install deepseek-harness
```

```python
from deepseek_harness import DeepSeekHarness

c = DeepSeekHarness(disable_thinking_by_default=True)
out = c.chat(
    model="deepseek-v4-pro",
    messages=[{"role": "user", "content": "Hello"}],
    max_tokens=4096,
)
print(out["message"]["content"])
print(f"cost: ${out['usage']['estimated_cost_usd']:.6f} · cache hit: {out['usage']['cache_hit_rate']:.0%}")
```

The harness wraps `openai.OpenAI` and enforces 10 contract rules by default. See the [main repository](https://github.com/HenryZ838978/deepseek-harness) for the full spec, probe corpus, and three other distribution forms (`dsh` CLI, `@deepseek-harness/mcp` server, Anthropic `SKILL.md`).

License: MIT.
deepseek_harness-0.2.0/deepseek_harness/__init__.py

@@ -0,0 +1,41 @@
"""deepseek-harness · core — protocol-aware client for DeepSeek V4-Pro / V4-Flash.

Validated by 16 probes documented in `reports/REPORT_2026-05-09.md`.

Public API::

    from deepseek_harness import DeepSeekHarness, normalize_usage, estimate_cache_hit
"""

from .client import DeepSeekHarness
from .cache import estimate_cache_hit, normalize_usage
from .reasoning import ReasoningLifecycle
from .tool_calls import salvage_tool_calls_from_content
from .exceptions import (
    HarnessError,
    ReasoningContentMissingError,
    ToolCallLeakageError,
    StrictModeCorruptionError,
    StreamShapeError,
)

# Backwards-compatible aliases (transitional, kept for one minor release).
DeepSeekClient = DeepSeekHarness
DeepSeekKitError = HarnessError

__all__ = [
    "DeepSeekHarness",
    "DeepSeekClient",
    "ReasoningLifecycle",
    "salvage_tool_calls_from_content",
    "estimate_cache_hit",
    "normalize_usage",
    "HarnessError",
    "DeepSeekKitError",
    "ReasoningContentMissingError",
    "ToolCallLeakageError",
    "StrictModeCorruptionError",
    "StreamShapeError",
]

__version__ = "0.2.0"
deepseek_harness-0.2.0/deepseek_harness/cache.py

@@ -0,0 +1,198 @@
"""Cache-hit field bridging + a local prefix-cache estimator.

Two concrete pains:

1. Field naming mismatch (pi-mono#3880):
   - DeepSeek puts the cache-hit token count in `usage.prompt_cache_hit_tokens`
   - OpenAI puts it in `usage.prompt_tokens_details.cached_tokens`
   - Vanilla OpenAI parsers see a 0% cache hit even when DeepSeek is happily
     charging you the cached price. `normalize_usage()` below back-fills both
     fields.

2. The DeepSeek cache only triggers on a **byte-for-byte prefix match starting
   from token 0**, with a practical minimum prefix of ~1024 tokens.
   `estimate_cache_hit()` is a local pre-flight estimator: feed it the messages
   you are about to send plus the prefix you saw "stick" in the previous
   request, and it tells you the longest common prefix in tokens.

References:
    deepseek-ai/DeepSeek-V3#1261 (V3.2→V4 cache hit rate regression 92%→35%)
    pi-mono#3880 (field mismatch fix)
    DeepSeek docs: https://api-docs.deepseek.com/guides/kv_cache
"""

from __future__ import annotations

from typing import Any

try:
    import tiktoken
except ImportError:  # pragma: no cover
    tiktoken = None


# DeepSeek V4-Flash quoted prices, USD per million tokens.
PRICE_PER_M_INPUT_MISS = 0.14
PRICE_PER_M_INPUT_HIT = 0.0028
PRICE_PER_M_OUTPUT = 0.28


def normalize_usage(usage: dict | Any) -> dict:
    """Return a dict that has BOTH field shapes filled in.

    Accepts the raw `usage` dict from a DeepSeek response (or an OpenAI usage
    object). Output always includes:
      - prompt_cache_hit_tokens (int)
      - prompt_cache_miss_tokens (int)
      - prompt_tokens_details.cached_tokens (int)   # OpenAI shape
      - completion_tokens, prompt_tokens, total_tokens (passthrough)
      - estimated_cost_usd (float, V4-Flash pricing)
    """
    if usage is None:
        return {}
    u = _to_dict(usage)

    prompt_total = int(u.get("prompt_tokens") or 0)
    completion = int(u.get("completion_tokens") or 0)

    # DeepSeek native fields
    hit = u.get("prompt_cache_hit_tokens")
    miss = u.get("prompt_cache_miss_tokens")

    # OpenAI shape
    details = u.get("prompt_tokens_details") or {}
    if isinstance(details, dict):
        cached_oa = details.get("cached_tokens")
    else:
        cached_oa = getattr(details, "cached_tokens", None)

    if hit is None and cached_oa is not None:
        hit = int(cached_oa)
        miss = max(prompt_total - hit, 0)
    elif hit is not None and cached_oa is None:
        cached_oa = int(hit)
    elif hit is None and cached_oa is None:
        hit, miss, cached_oa = 0, prompt_total, 0
    if miss is None:  # hit was reported in some shape but miss never was
        miss = max(prompt_total - int(hit), 0)

    cost = (
        (miss / 1_000_000) * PRICE_PER_M_INPUT_MISS
        + (hit / 1_000_000) * PRICE_PER_M_INPUT_HIT
        + (completion / 1_000_000) * PRICE_PER_M_OUTPUT
    )

    return {
        "prompt_tokens": prompt_total,
        "completion_tokens": completion,
        "total_tokens": int(u.get("total_tokens") or (prompt_total + completion)),
        "prompt_cache_hit_tokens": int(hit),
        "prompt_cache_miss_tokens": int(miss),
        "prompt_tokens_details": {"cached_tokens": int(cached_oa)},
        "estimated_cost_usd": round(cost, 8),
        "cache_hit_rate": round(hit / prompt_total, 4) if prompt_total else 0.0,
    }


def _to_dict(obj: Any) -> dict:
    if isinstance(obj, dict):
        return obj
    if hasattr(obj, "model_dump"):
        return obj.model_dump()
    if hasattr(obj, "to_dict"):
        return obj.to_dict()
    if hasattr(obj, "__dict__"):
        return {k: v for k, v in obj.__dict__.items() if not k.startswith("_")}
    return {}


# ---------------------------------------------------------------------------
# Pre-flight estimator
# ---------------------------------------------------------------------------


def _encode(text: str) -> list[int]:
    if tiktoken is None:
        # crude byte-level fallback so the kit still imports without tiktoken
        return list(text.encode("utf-8"))
    enc = tiktoken.get_encoding("cl100k_base")
    return enc.encode(text)


def estimate_cache_hit(
    new_messages: list[dict],
    previous_prefix_messages: list[dict] | None = None,
    *,
    minimum_prefix_tokens: int = 1024,
    cache_block_size: int = 256,
) -> dict:
    """Estimate how much of `new_messages` will be a cache hit on DeepSeek.

    DeepSeek's cache rule (validated by probe_5 on 2026-05-09):
      - prefix-from-0 match
      - bucketed by ~256-token blocks (cached tokens are always multiples of 256)
      - minimum prefix length to BEGIN caching ≈ 1024 tokens
      - tail edits preserve the head cache; mid-prefix edits invalidate from
        the edit point onwards (NOT the entire prefix)

    We serialise both message lists deterministically (role + content +
    tool_calls + reasoning_content), tokenise with cl100k_base (close enough;
    the DeepSeek tokenizer averages ~3.6 chars/token for English ASCII vs
    cl100k's ~4), and find the longest common token prefix, rounded DOWN to
    the nearest cache_block_size boundary. The estimator does NOT replace the
    server's truth — it is a pre-flight sanity check, e.g. "is my client about
    to invalidate the 99-cent prefix by re-ordering tool messages?".
    """
    new_text = _serialize_messages(new_messages)
    prev_text = _serialize_messages(previous_prefix_messages or [])

    new_tokens = _encode(new_text)
    prev_tokens = _encode(prev_text)

    common = 0
    for a, b in zip(new_tokens, prev_tokens):
        if a != b:
            break
        common += 1

    if common < minimum_prefix_tokens:
        eligible = 0
    else:
        # Server rounds cached tokens DOWN to the nearest cache_block_size (256).
        eligible = (common // cache_block_size) * cache_block_size

    return {
        "common_prefix_tokens": common,
        "eligible_cached_tokens": eligible,
        "new_total_tokens": len(new_tokens),
        "estimated_hit_rate": round(eligible / len(new_tokens), 4) if new_tokens else 0.0,
        "minimum_prefix_threshold": minimum_prefix_tokens,
        "cache_block_size": cache_block_size,
        "explanation": (
            f"common < {minimum_prefix_tokens} → server will NOT cache this request"
            if common < minimum_prefix_tokens
            else f"ok — {eligible} tokens (rounded to {cache_block_size}-block) will be discounted at $0.0028/M"
        ),
    }


def _serialize_messages(messages: list[dict]) -> str:
    """Deterministic flattening used by both the prefix estimator and the
    cache-debug helpers.

    Layout mirrors OpenAI chat-completions JSON ordering: role → content →
    tool_calls → tool_call_id → reasoning_content. ANY field reorder by your
    agent code will bust the prefix.
    """
    parts: list[str] = []
    for msg in messages:
        parts.append(f"<role>{msg.get('role', '')}</role>")
        content = msg.get("content")
        if isinstance(content, list):
            for c in content:
                parts.append(f"<part>{c}</part>")
        elif content:
            parts.append(f"<content>{content}</content>")
        for tc in msg.get("tool_calls") or []:
            fn = tc.get("function", {})
            parts.append(f"<tc>{tc.get('id', '')}|{fn.get('name', '')}|{fn.get('arguments', '')}</tc>")
        if msg.get("tool_call_id"):
            parts.append(f"<tcid>{msg['tool_call_id']}</tcid>")
        if msg.get("reasoning_content"):
            parts.append(f"<rc>{msg['reasoning_content']}</rc>")
    return "\n".join(parts)
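For orientation — this sketch is *not* part of the package diff — here is how the two helpers above might compose in a two-turn agent loop. The message payloads, the `rule-{i}` filler, and the token counts are invented; the signatures and return keys are the ones defined in `cache.py`.

```python
from deepseek_harness import estimate_cache_hit, normalize_usage

# A system prompt assumed long enough (> 1024 tokens) for caching to engage.
long_system = "You are a code-review agent. " + " ".join(f"rule-{i}" for i in range(800))

history = [
    {"role": "system", "content": long_system},
    {"role": "user", "content": "Review patch #1."},
]
# Tail-append only: the shared head should survive as cacheable prefix.
next_turn = history + [
    {"role": "assistant", "content": "LGTM with two nits."},
    {"role": "user", "content": "Review patch #2."},
]

est = estimate_cache_hit(next_turn, previous_prefix_messages=history)
# eligible_cached_tokens = common prefix rounded DOWN to a 256-token block,
# or 0 if the common prefix is under the ~1024-token minimum.
print(est["eligible_cached_tokens"], "-", est["explanation"])

# After the real call, bridge both usage shapes into one canonical dict.
usage = normalize_usage({
    "prompt_tokens": 1600,
    "completion_tokens": 40,
    "prompt_tokens_details": {"cached_tokens": 1280},  # OpenAI shape only
})
assert usage["prompt_cache_hit_tokens"] == 1280  # back-filled DeepSeek shape
assert usage["cache_hit_rate"] == 0.8
```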
deepseek_harness-0.2.0/deepseek_harness/client.py

@@ -0,0 +1,259 @@
"""Drop-in OpenAI-compat client for DeepSeek V4-Pro / V4-Flash with all known
protocol salvages enabled.

Replace::

    from openai import OpenAI
    client = OpenAI(api_key=..., base_url="https://api.deepseek.com")

with::

    from deepseek_harness import DeepSeekHarness
    client = DeepSeekHarness(api_key=..., base_url="https://api.deepseek.com")

The surface is `client.chat(...)` / `client.stream_chat(...)` — thin wrappers
over `chat.completions.create` — and every response goes through:

- `from_deepseek_response` (preserves reasoning_content on the message dict)
- `salvage_tool_calls_from_content` (rescues the ~11% of tool calls that leak into content)
- `normalize_usage` (back-fills both cache-hit field shapes + a cost estimate)

Streaming uses `stream_chat()`, which absorbs reasoning chunks via
`ReasoningLifecycle` and tolerates the empty-final-chunk shape (cline #1594).
"""

from __future__ import annotations

import os
from typing import Any, Iterator

from openai import OpenAI

from .cache import normalize_usage
from .exceptions import StreamShapeError, ToolCallLeakageError
from .normalize import from_deepseek_response, strip_kit_warnings, to_deepseek_history
from .reasoning import ReasoningLifecycle
from .tool_calls import salvage_tool_calls_from_content


class DeepSeekHarness:
    """Thin wrapper around `openai.OpenAI` with DeepSeek V4-specific safety guards.

    Implements the 10 contract rules documented in `spec/` and validated by the
    16 probes in `reports/`. See `__init__.py` for the public re-export.
    """

    def __init__(
        self,
        api_key: str | None = None,
        base_url: str | None = None,
        *,
        salvage_tool_calls: bool = True,
        normalize_cache_fields: bool = True,
        warn_on_missing_reasoning: bool = True,
        disable_thinking_by_default: bool = False,
        raw_dump_path: str | None = None,
    ) -> None:
        """Construct a DeepSeek-aware OpenAI-compatible client.

        Args:
            disable_thinking_by_default: If True, every request gets
                `extra_body={"thinking": {"type": "disabled"}}` UNLESS the
                caller explicitly passes their own `extra_body`. Recommended
                for cost-sensitive deployments because `deepseek-v4-pro`
                defaults to thinking-enabled (probe_0 finding, 2026-05-09:
                ~30 reasoning tokens are billed even on trivial prompts).
        """
        self._oai = OpenAI(
            api_key=api_key or os.getenv("DEEPSEEK_API_KEY"),
            base_url=base_url or os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com"),
        )
        self._salvage = salvage_tool_calls
        self._normalize_cache = normalize_cache_fields
        self._warn = warn_on_missing_reasoning
        self._disable_thinking = disable_thinking_by_default
        self._raw_dump = raw_dump_path

    def _maybe_inject_thinking_off(self, kwargs: dict) -> dict:
        if self._disable_thinking and "extra_body" not in kwargs:
            kwargs = dict(kwargs)
            kwargs["extra_body"] = {"thinking": {"type": "disabled"}}
        return kwargs

    # ------------------------------------------------------------------
    # Non-streaming
    # ------------------------------------------------------------------
    def chat(
        self,
        *,
        model: str,
        messages: list[dict],
        tools: list[dict] | None = None,
        tool_choice: str | dict | None = None,
        **kwargs: Any,
    ) -> dict:
        """Returns a dict with keys:
            message: OpenAI-shaped assistant message (incl. reasoning_content if any)
            usage: normalised usage (both cache-field shapes filled in)
            finish_reason: passthrough
            salvage: None if no tool-call salvage happened, else {pattern, original_content}
            raw: the raw SDK response (for debugging)
        """
        normalized_in = strip_kit_warnings(to_deepseek_history(messages))
        kwargs = self._maybe_inject_thinking_off(kwargs)
        resp = self._oai.chat.completions.create(
            model=model,
            messages=normalized_in,
            tools=tools,
            tool_choice=tool_choice,
            **kwargs,
        )

        msg = from_deepseek_response(resp)
        finish_reason = msg.pop("_dsk_kit_finish_reason", None)
        salvage = None

        if self._salvage and not msg.get("tool_calls"):
            tool_calls, residual, reason = salvage_tool_calls_from_content(
                msg.get("content"), finish_reason
            )
            if tool_calls is not None:
                salvage = {
                    "pattern": reason,
                    "original_content": msg.get("content"),
                }
                msg["tool_calls"] = tool_calls
                msg["content"] = residual

        usage = normalize_usage(getattr(resp, "usage", None)) if self._normalize_cache else None

        return {
            "message": msg,
            "usage": usage,
            "finish_reason": finish_reason,
            "salvage": salvage,
            "raw": resp,
        }

    # ------------------------------------------------------------------
    # Streaming
    # ------------------------------------------------------------------
    def stream_chat(
        self,
        *,
        model: str,
        messages: list[dict],
        tools: list[dict] | None = None,
        tool_choice: str | dict | None = None,
        **kwargs: Any,
    ) -> Iterator[dict]:
        """Yields **structured events** (not raw chunks) so callers don't have
        to re-implement the cline #1594 / hermes #15353 mitigations.

        Event types:
            {"type": "content_delta", "data": str}
            {"type": "reasoning_delta", "data": str}
            {"type": "tool_call_delta", "data": {"index": int, "id": str|None, "name": str|None, "arguments": str}}
            {"type": "done", "message": {...}, "usage": {...}, "finish_reason": str, "salvage": dict|None}
        """
        normalized_in = strip_kit_warnings(to_deepseek_history(messages))
        kwargs = self._maybe_inject_thinking_off(kwargs)
        stream = self._oai.chat.completions.create(
            model=model,
            messages=normalized_in,
            tools=tools,
            tool_choice=tool_choice,
            stream=True,
            stream_options={"include_usage": True},
            **kwargs,
        )

        rl = ReasoningLifecycle()
        content_buf: list[str] = []
        tool_call_acc: dict[int, dict] = {}
        finish_reason: str | None = None
        usage_raw: Any | None = None

        for chunk in stream:
            choices = getattr(chunk, "choices", None) or []
            # DeepSeek emits a final chunk with empty choices but populated
            # usage — cline #1594 was the upstream bug from indexing [0] blindly.
            if not choices:
                if getattr(chunk, "usage", None) is not None:
                    usage_raw = chunk.usage
                continue

            choice = choices[0]
            delta = getattr(choice, "delta", None)
            if delta is None:
                # malformed chunk — neither delta nor finish_reason; skip rather than throw
                continue

            # reasoning_content goes through the lifecycle helper (and is also
            # yielded out for visibility)
            rc = getattr(delta, "reasoning_content", None)
            if rc:
                rl.absorb_chunk(chunk)
                yield {"type": "reasoning_delta", "data": rc}

            content_piece = getattr(delta, "content", None)
            if content_piece:
                content_buf.append(content_piece)
                yield {"type": "content_delta", "data": content_piece}

            for tc in getattr(delta, "tool_calls", None) or []:
                idx = getattr(tc, "index", 0)
                slot = tool_call_acc.setdefault(
                    idx, {"id": None, "name": None, "arguments": ""}
                )
                if getattr(tc, "id", None):
                    slot["id"] = tc.id
                fn = getattr(tc, "function", None)
                if fn is not None:
                    if getattr(fn, "name", None):
                        slot["name"] = fn.name
                    if getattr(fn, "arguments", None):
                        slot["arguments"] += fn.arguments
                yield {
                    "type": "tool_call_delta",
                    "data": {
                        "index": idx,
                        "id": slot["id"],
                        "name": slot["name"],
                        "arguments": slot["arguments"],
                    },
                }

            if getattr(choice, "finish_reason", None):
                finish_reason = choice.finish_reason

        content_str = "".join(content_buf) or None
        message: dict[str, Any] = {"role": "assistant", "content": content_str}
        if tool_call_acc:
            message["tool_calls"] = [
                {
                    "id": v["id"] or f"call_{i}",
                    "type": "function",
                    "function": {"name": v["name"] or "", "arguments": v["arguments"] or "{}"},
                }
                for i, v in sorted(tool_call_acc.items())
            ]
        message = rl.finalize_assistant_message(message)

        salvage = None
        if self._salvage and not message.get("tool_calls"):
            tool_calls, residual, reason = salvage_tool_calls_from_content(
                message.get("content"), finish_reason
            )
            if tool_calls is not None:
                salvage = {"pattern": reason, "original_content": message.get("content")}
                message["tool_calls"] = tool_calls
                message["content"] = residual

        usage = normalize_usage(usage_raw) if (self._normalize_cache and usage_raw) else None

        yield {
            "type": "done",
            "message": message,
            "usage": usage,
            "finish_reason": finish_reason,
            "salvage": salvage,
        }
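Again for orientation (not shipped in this diff): a minimal consumer of the structured events `stream_chat()` yields, per the event types documented in its docstring. The model name and prompt here are placeholders, not values the package prescribes.

```python
from deepseek_harness import DeepSeekHarness

client = DeepSeekHarness(disable_thinking_by_default=True)
final = None
for event in client.stream_chat(
    model="deepseek-v4-flash",  # placeholder model id
    messages=[{"role": "user", "content": "Summarise RFC 9110 in one line."}],
):
    if event["type"] == "content_delta":
        print(event["data"], end="", flush=True)
    elif event["type"] == "reasoning_delta":
        pass  # surfaced for visibility; usually not rendered to end users
    elif event["type"] == "done":
        final = event  # carries message, normalised usage, finish_reason, salvage

print()
if final and final["usage"]:
    print(f"cache hit rate: {final['usage']['cache_hit_rate']:.0%}")
```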
deepseek_harness-0.2.0/deepseek_harness/exceptions.py

@@ -0,0 +1,36 @@
"""Typed errors so callers can branch on the specific quirk that fired."""


class HarnessError(Exception):
    """Base for every protocol quirk we detect."""


class ReasoningContentMissingError(HarnessError):
    """Server returned 400 because the previous turn's reasoning_content was not echoed back.

    Reference: microsoft/agent-framework#5538, NousResearch/hermes-agent#15353.
    Probe: reports/probes/probe_2_reasoning_lifecycle.py (3/3 reproduction on V4-Pro).
    """


class ToolCallLeakageError(HarnessError):
    """finish_reason='stop' but content carried a DSML / JSON tool call payload.

    Reference: deepseek-ai/DeepSeek-V3#1244, NousResearch/hermes-agent#15453.
    Probe: reports/probes/probe_3_tool_call_leakage.py (0/50 on V4 official endpoint).
    """


class StrictModeCorruptionError(HarnessError):
    """beta/strict path produced unparseable JSON (missing quote on first key).

    Reference: deepseek-ai/DeepSeek-V3#1069 (closed as not-planned).
    Probe: reports/probes/probe_4_strict_mode_corruption.py (0/32 on V4 series).
    """


class StreamShapeError(HarnessError):
    """Final SSE chunk lacked `choices`, or `delta` was undefined where indexed.

    Reference: cline/cline#1594.
    """
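One last orientation sketch (not part of the package): branching on the typed errors above. Note that the `client.py` shown in this diff salvages quirks rather than raising by default, so whether a given call actually raises these depends on guard configuration not exercised here; the sketch only illustrates the exception taxonomy.

```python
from deepseek_harness import (
    DeepSeekHarness,
    HarnessError,
    ReasoningContentMissingError,
    StreamShapeError,
)

def run_turn(client: DeepSeekHarness, **kw) -> dict:
    try:
        return client.chat(**kw)
    except ReasoningContentMissingError:
        # Caller must re-send with the previous assistant turn's
        # reasoning_content echoed back; surface it rather than mask it.
        raise
    except StreamShapeError:
        # One possible policy: retry once, since the empty-final-chunk
        # shape (cline #1594) is a transport-level oddity.
        return client.chat(**kw)
    except HarnessError as e:
        # Base class also covers ToolCallLeakageError / StrictModeCorruptionError.
        raise RuntimeError(f"deepseek protocol quirk: {e}") from e
```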