tokenburn 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. tokenburn-0.2.0/PKG-INFO +108 -0
  2. tokenburn-0.2.0/README.md +74 -0
  3. tokenburn-0.2.0/pyproject.toml +40 -0
  4. tokenburn-0.2.0/setup.cfg +4 -0
  5. tokenburn-0.2.0/tokenburn/__init__.py +142 -0
  6. tokenburn-0.2.0/tokenburn/adapters/__init__.py +32 -0
  7. tokenburn-0.2.0/tokenburn/adapters/anthropic.py +226 -0
  8. tokenburn-0.2.0/tokenburn/adapters/gemini.py +222 -0
  9. tokenburn-0.2.0/tokenburn/adapters/openai.py +341 -0
  10. tokenburn-0.2.0/tokenburn/adapters/sse.py +108 -0
  11. tokenburn-0.2.0/tokenburn/classify.py +670 -0
  12. tokenburn-0.2.0/tokenburn/cli.py +383 -0
  13. tokenburn-0.2.0/tokenburn/compress.py +120 -0
  14. tokenburn-0.2.0/tokenburn/config.py +233 -0
  15. tokenburn-0.2.0/tokenburn/context_drivers.py +326 -0
  16. tokenburn-0.2.0/tokenburn/detectors.py +892 -0
  17. tokenburn-0.2.0/tokenburn/doctor.py +399 -0
  18. tokenburn-0.2.0/tokenburn/gain.py +75 -0
  19. tokenburn-0.2.0/tokenburn/logger.py +168 -0
  20. tokenburn-0.2.0/tokenburn/pricing.py +301 -0
  21. tokenburn-0.2.0/tokenburn/proxy/__init__.py +1 -0
  22. tokenburn-0.2.0/tokenburn/proxy/cli.py +214 -0
  23. tokenburn-0.2.0/tokenburn/proxy/daemon.py +75 -0
  24. tokenburn-0.2.0/tokenburn/proxy/extractor.py +261 -0
  25. tokenburn-0.2.0/tokenburn/proxy/server.py +444 -0
  26. tokenburn-0.2.0/tokenburn/proxy/setup.py +154 -0
  27. tokenburn-0.2.0/tokenburn/py.typed +0 -0
  28. tokenburn-0.2.0/tokenburn/report.py +534 -0
  29. tokenburn-0.2.0/tokenburn/setup_wizard.py +194 -0
  30. tokenburn-0.2.0/tokenburn/share.py +41 -0
  31. tokenburn-0.2.0/tokenburn/templates/report.html.j2 +210 -0
  32. tokenburn-0.2.0/tokenburn/tests/__init__.py +0 -0
  33. tokenburn-0.2.0/tokenburn/tests/conftest.py +23 -0
  34. tokenburn-0.2.0/tokenburn/tests/test_adapter_consistency.py +403 -0
  35. tokenburn-0.2.0/tokenburn/tests/test_adapters.py +2268 -0
  36. tokenburn-0.2.0/tokenburn/tests/test_audit_fixes.py +540 -0
  37. tokenburn-0.2.0/tokenburn/tests/test_audit_iter3.py +313 -0
  38. tokenburn-0.2.0/tokenburn/tests/test_classify.py +202 -0
  39. tokenburn-0.2.0/tokenburn/tests/test_cli_categories.py +74 -0
  40. tokenburn-0.2.0/tokenburn/tests/test_compress.py +215 -0
  41. tokenburn-0.2.0/tokenburn/tests/test_config.py +259 -0
  42. tokenburn-0.2.0/tokenburn/tests/test_context_drivers.py +705 -0
  43. tokenburn-0.2.0/tokenburn/tests/test_detectors.py +699 -0
  44. tokenburn-0.2.0/tokenburn/tests/test_doctor.py +342 -0
  45. tokenburn-0.2.0/tokenburn/tests/test_gain.py +110 -0
  46. tokenburn-0.2.0/tokenburn/tests/test_logger.py +168 -0
  47. tokenburn-0.2.0/tokenburn/tests/test_pricing.py +312 -0
  48. tokenburn-0.2.0/tokenburn/tests/test_proxy_cli.py +67 -0
  49. tokenburn-0.2.0/tokenburn/tests/test_proxy_extractor.py +566 -0
  50. tokenburn-0.2.0/tokenburn/tests/test_proxy_server.py +1363 -0
  51. tokenburn-0.2.0/tokenburn/tests/test_proxy_setup.py +90 -0
  52. tokenburn-0.2.0/tokenburn/tests/test_proxy_skip_process_cli.py +79 -0
  53. tokenburn-0.2.0/tokenburn/tests/test_report.py +978 -0
  54. tokenburn-0.2.0/tokenburn/tests/test_reset.py +76 -0
  55. tokenburn-0.2.0/tokenburn/tests/test_setup_wizard.py +114 -0
  56. tokenburn-0.2.0/tokenburn/tests/test_share.py +94 -0
  57. tokenburn-0.2.0/tokenburn/tests/test_wrapper.py +1466 -0
  58. tokenburn-0.2.0/tokenburn/wrapper.py +1308 -0
  59. tokenburn-0.2.0/tokenburn.egg-info/PKG-INFO +108 -0
  60. tokenburn-0.2.0/tokenburn.egg-info/SOURCES.txt +62 -0
  61. tokenburn-0.2.0/tokenburn.egg-info/dependency_links.txt +1 -0
  62. tokenburn-0.2.0/tokenburn.egg-info/entry_points.txt +2 -0
  63. tokenburn-0.2.0/tokenburn.egg-info/requires.txt +25 -0
  64. tokenburn-0.2.0/tokenburn.egg-info/top_level.txt +1 -0
@@ -0,0 +1,108 @@
1
+ Metadata-Version: 2.4
2
+ Name: tokenburn
3
+ Version: 0.2.0
4
+ Summary: Local-first proxy for LLM spend visibility and control.
5
+ License: MIT
6
+ Keywords: llm,proxy,tokens,cost,observability
7
+ Classifier: Development Status :: 4 - Beta
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Topic :: Software Development :: Libraries
11
+ Requires-Python: >=3.9
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: click>=8.0
14
+ Requires-Dist: rich>=12.0
15
+ Provides-Extra: share
16
+ Requires-Dist: jinja2>=3.0; extra == "share"
17
+ Provides-Extra: proxy
18
+ Requires-Dist: starlette>=0.37; extra == "proxy"
19
+ Requires-Dist: uvicorn>=0.29; extra == "proxy"
20
+ Requires-Dist: httpx>=0.25; extra == "proxy"
21
+ Provides-Extra: dev
22
+ Requires-Dist: pytest; extra == "dev"
23
+ Requires-Dist: pytest-cov; extra == "dev"
24
+ Requires-Dist: pytest-asyncio; extra == "dev"
25
+ Requires-Dist: jinja2>=3.0; extra == "dev"
26
+ Requires-Dist: starlette>=0.37; extra == "dev"
27
+ Requires-Dist: uvicorn>=0.29; extra == "dev"
28
+ Requires-Dist: httpx>=0.25; extra == "dev"
29
+ Provides-Extra: all
30
+ Requires-Dist: jinja2>=3.0; extra == "all"
31
+ Requires-Dist: starlette>=0.37; extra == "all"
32
+ Requires-Dist: uvicorn>=0.29; extra == "all"
33
+ Requires-Dist: httpx>=0.25; extra == "all"
34
+
35
+ # TokenBurn
36
+
37
+ > `htop` for your LLM spend — proxy-only.
38
+
39
+ TokenBurn is a local-first HTTP proxy for LLM spend visibility and control.
40
+
41
+ Route OpenAI-, Anthropic-, and Gemini-compatible traffic through a local proxy. TokenBurn logs usage locally, attributes cost by model/provider/program/tag, and turns raw traffic into actionable waste reports.
42
+
43
+ **No hosted backend. No account. No prompt egress by default.**
44
+
45
+ ---
46
+
47
+ ## Install
48
+
49
+ ```bash
50
+ pip install "tokenburn[proxy]"
51
+ tokenburn proxy setup
52
+ tokenburn proxy start --background
53
+ ```
54
+
55
+ After setup, clients that support base URL overrides can route through TokenBurn with no app-specific SDK integration.
56
+
57
+ ---
58
+
59
+ ## What it does
60
+
61
+ - **Proxy-based capture** — intercepts LLM traffic at the HTTP layer
62
+ - **Cross-language** — works with Python, TypeScript, Go, curl, and anything else that can point at a base URL
63
+ - **Cross-provider** — OpenAI, Anthropic, Gemini
64
+ - **Local logs** — normalized JSONL logs under `~/.tokenburn/logs/`
65
+ - **Spend reports** — model, provider, endpoint, program, and tag breakdowns
66
+ - **Waste detection** — highlights expensive patterns worth fixing first
67
+ - **Shareable output** — terminal and exported reports
68
+
69
+ ---
70
+
71
+ ## Core commands
72
+
73
+ ```bash
74
+ tokenburn proxy setup
75
+ tokenburn proxy start --background
76
+ tokenburn proxy status
77
+ tokenburn proxy stop
78
+
79
+ tokenburn report
80
+ tokenburn gain
81
+ tokenburn share --open
82
+ tokenburn doctor
83
+ ```
84
+
85
+ ---
86
+
87
+ ## Product direction
88
+
89
+ TokenBurn is proxy-only.
90
+
91
+ That means the product lives at the proxy boundary rather than inside application runtimes.
92
+
93
+ Integration happens at the network boundary via base-URL overrides, so no per-application SDK instrumentation is required.
94
+
95
+ ---
96
+
97
+ ## Repo docs
98
+
99
+ - `PROXY_TFF.md` — proxy technical design
100
+ - `SPEC.md` — product and positioning
101
+ - `keche.md` — project operating brief
102
+ - `CLAUDE.md` — maintainer workflow notes
103
+
104
+ ---
105
+
106
+ ## License
107
+
108
+ MIT
@@ -0,0 +1,74 @@
1
+ # TokenBurn
2
+
3
+ > `htop` for your LLM spend — proxy-only.
4
+
5
+ TokenBurn is a local-first HTTP proxy for LLM spend visibility and control.
6
+
7
+ Route OpenAI-, Anthropic-, and Gemini-compatible traffic through a local proxy. TokenBurn logs usage locally, attributes cost by model/provider/program/tag, and turns raw traffic into actionable waste reports.
8
+
9
+ **No hosted backend. No account. No prompt egress by default.**
10
+
11
+ ---
12
+
13
+ ## Install
14
+
15
+ ```bash
16
+ pip install "tokenburn[proxy]"
17
+ tokenburn proxy setup
18
+ tokenburn proxy start --background
19
+ ```
20
+
21
+ After setup, clients that support base URL overrides can route through TokenBurn with no app-specific SDK integration.
22
+
23
+ ---
24
+
25
+ ## What it does
26
+
27
+ - **Proxy-based capture** — intercepts LLM traffic at the HTTP layer
28
+ - **Cross-language** — works with Python, TypeScript, Go, curl, and anything else that can point at a base URL
29
+ - **Cross-provider** — OpenAI, Anthropic, Gemini
30
+ - **Local logs** — normalized JSONL logs under `~/.tokenburn/logs/`
31
+ - **Spend reports** — model, provider, endpoint, program, and tag breakdowns
32
+ - **Waste detection** — highlights expensive patterns worth fixing first
33
+ - **Shareable output** — terminal and exported reports
34
+
35
+ ---
36
+
37
+ ## Core commands
38
+
39
+ ```bash
40
+ tokenburn proxy setup
41
+ tokenburn proxy start --background
42
+ tokenburn proxy status
43
+ tokenburn proxy stop
44
+
45
+ tokenburn report
46
+ tokenburn gain
47
+ tokenburn share --open
48
+ tokenburn doctor
49
+ ```
50
+
51
+ ---
52
+
53
+ ## Product direction
54
+
55
+ TokenBurn is proxy-only.
56
+
57
+ That means the product lives at the proxy boundary rather than inside application runtimes.
58
+
59
+ Integration happens at the network boundary via base-URL overrides, so no per-application SDK instrumentation is required.
60
+
61
+ ---
62
+
63
+ ## Repo docs
64
+
65
+ - `PROXY_TFF.md` — proxy technical design
66
+ - `SPEC.md` — product and positioning
67
+ - `keche.md` — project operating brief
68
+ - `CLAUDE.md` — maintainer workflow notes
69
+
70
+ ---
71
+
72
+ ## License
73
+
74
+ MIT
@@ -0,0 +1,40 @@
1
+ [build-system]
2
+ requires = ["setuptools>=64", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "tokenburn"
7
+ version = "0.2.0"
8
+ description = "Local-first proxy for LLM spend visibility and control."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = {text = "MIT"}
12
+ keywords = ["llm", "proxy", "tokens", "cost", "observability"]
13
+ classifiers = [
14
+ "Development Status :: 4 - Beta",
15
+ "Intended Audience :: Developers",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Topic :: Software Development :: Libraries",
18
+ ]
19
+ dependencies = [
20
+ "click>=8.0",
21
+ "rich>=12.0",
22
+ ]
23
+
24
+ [project.optional-dependencies]
25
+ share = ["jinja2>=3.0"]
26
+ proxy = ["starlette>=0.37", "uvicorn>=0.29", "httpx>=0.25"]
27
+ dev = ["pytest", "pytest-cov", "pytest-asyncio", "jinja2>=3.0", "starlette>=0.37", "uvicorn>=0.29", "httpx>=0.25"]
28
+ all = ["jinja2>=3.0", "starlette>=0.37", "uvicorn>=0.29", "httpx>=0.25"]
29
+
30
+ [project.scripts]
31
+ tokenburn = "tokenburn.cli:cli"
32
+
33
+ [tool.setuptools.package-data]
34
+ tokenburn = ["templates/*.j2", "py.typed"]
35
+
36
+ [tool.pytest.ini_options]
37
+ asyncio_mode = "strict"
38
+
39
+ [tool.setuptools.packages.find]
40
+ include = ["tokenburn*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,142 @@
1
+ """TokenBurn - LLM token waste detector SDK."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Optional
6
+
7
+ from tokenburn.classify import classify_entries_async
8
+
9
+ __version__ = "0.1.0"
10
+
11
+
12
def wrap(
    client: Any,
    default_tags: Optional[str] = None,
    log_preview: bool = True,
) -> Any:
    """Instrument an OpenAI or Anthropic client so all LLM calls are logged.

    Both sync and async client objects from either provider are supported.

    Example:
        import openai
        import anthropic
        from tokenburn import wrap

        # Sync
        client = wrap(openai.OpenAI())
        # Use client as normal - all calls are logged

        # Async
        async_client = wrap(openai.AsyncOpenAI())
        response = await async_client.chat.completions.create(...)

        # Anthropic (sync + async both supported)
        anth = wrap(anthropic.Anthropic())
        anth_async = wrap(anthropic.AsyncAnthropic())

    Args:
        client: An OpenAI or Anthropic client instance (sync or async).
        default_tags: Optional tag string forwarded to the underlying wrapper.
        log_preview: Forwarded to the underlying wrapper; presumably controls
            whether content previews are logged — confirm in tokenburn.wrapper.

    Returns:
        The wrapped client, used in place of the original.
    """
    # Imported lazily so `import tokenburn` stays cheap when wrap() is unused.
    from tokenburn.wrapper import wrap as wrapper_impl

    return wrapper_impl(client, default_tags=default_tags, log_preview=log_preview)
41
+
42
+
43
def log_raw(
    provider: str,
    model: str,
    input_tokens: int,
    output_tokens: int,
    cache_read_tokens: int = 0,
    cache_creation_tokens: int = 0,
    max_tokens_set: int | None = None,
    system_prompt_hash: str | None = None,
    tool_count: int = 0,
    tool_schema_tokens: int = 0,
    tags: str | None = None,
    streaming: bool = False,
    duration_ms: int | None = None,
    error: bool = False,
    error_type: str | None = None,
    request_id: str | None = None,
    caller: str | None = None,
) -> None:
    """Manually record an LLM call (for apps that can't use wrap()).

    Best-effort by design: any internal failure is swallowed so that logging
    can never break the calling application.
    """
    try:
        from datetime import datetime, timezone
        from tokenburn.logger import log_entry

        record: dict[str, Any] = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "provider": provider,
            "model": model,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "cache_read_tokens": cache_read_tokens,
            "cache_creation_tokens": cache_creation_tokens,
            "streaming": streaming,
            "error": error,
        }

        # Optional fields, written only when provided. Note the mixed
        # predicates: the tool_* counters use truthiness (zero is omitted),
        # the rest use an explicit None check.
        conditional_fields = (
            ("max_tokens_set", max_tokens_set, max_tokens_set is not None),
            ("system_prompt_hash", system_prompt_hash, system_prompt_hash is not None),
            ("tool_count", tool_count, bool(tool_count)),
            ("tool_schema_tokens", tool_schema_tokens, bool(tool_schema_tokens)),
            ("tags", tags, tags is not None),
            ("duration_ms", duration_ms, duration_ms is not None),
            ("error_type", error_type, error_type is not None),
            ("request_id", request_id, request_id is not None),
        )
        for key, value, present in conditional_fields:
            if present:
                record[key] = value

        if caller is not None:
            # Caller-supplied file path; function/line are unknown here.
            record["call_site"] = {"file": caller, "function": "", "line": 0}
        else:
            from tokenburn.wrapper import _get_call_site

            call_site = _get_call_site()
            if call_site is not None:
                record["call_site"] = call_site

        log_entry(record)
    except Exception:
        # Intentionally silent — see docstring.
        pass
108
+
109
+
110
def compress_history(
    messages: list,
    max_tokens: int = 8000,
    keep_recent: int = 5,
    summarizer_model: str = "gpt-4o-mini",
    api_key: Optional[str] = None,
) -> list:
    """Shrink a chat history to fit a token budget by summarizing old turns.

    System messages are preserved verbatim at the top. The newest
    ``keep_recent`` conversation messages are kept unchanged; older
    conversation messages are collapsed into a single summary system message,
    but only when the history would otherwise exceed the token budget.

    Args:
        messages: Chat messages as dicts with 'role' and 'content' keys.
        max_tokens: Token budget; the input is returned untouched if it fits.
        keep_recent: How many recent conversation messages to keep verbatim.
        summarizer_model: OpenAI model used to produce the summary.
        api_key: OpenAI API key; falls back to the environment when None.

    Returns:
        The compressed message list, or the original list when no compression
        was needed.
    """
    # Deferred import keeps package import light for users who never compress.
    from tokenburn.compress import compress_history as compress_impl

    return compress_impl(
        messages,
        max_tokens=max_tokens,
        keep_recent=keep_recent,
        summarizer_model=summarizer_model,
        api_key=api_key,
    )
@@ -0,0 +1,32 @@
1
+ """LLM provider adapters for usage extraction and streaming handling."""
2
+
3
+ from tokenburn.adapters.openai import (
4
+ UsageResult as OpenAIUsageResult,
5
+ OpenAIEventHandler,
6
+ extract_from_response as openai_extract,
7
+ )
8
+ from tokenburn.adapters.anthropic import (
9
+ AnthropicUsageResult,
10
+ AnthropicEventHandler,
11
+ extract_from_response as anthropic_extract,
12
+ )
13
+ from tokenburn.adapters.gemini import (
14
+ GeminiUsageResult,
15
+ GeminiEventHandler,
16
+ extract_from_response as gemini_extract,
17
+ )
18
+
19
+ __all__ = [
20
+ # Result dataclasses
21
+ "OpenAIUsageResult",
22
+ "AnthropicUsageResult",
23
+ "GeminiUsageResult",
24
+ # Streaming event handlers
25
+ "OpenAIEventHandler",
26
+ "AnthropicEventHandler",
27
+ "GeminiEventHandler",
28
+ # Non-streaming extraction
29
+ "openai_extract",
30
+ "anthropic_extract",
31
+ "gemini_extract",
32
+ ]
@@ -0,0 +1,226 @@
1
+ # tokenburn/adapters/anthropic.py
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from dataclasses import dataclass
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
@dataclass
class AnthropicUsageResult:
    """Normalized usage extracted from an Anthropic response.

    Field notes:
      - thinking_tokens: estimated as thinking-text chars // 4, because
        Anthropic does not report thinking tokens separately.
      - total_tokens: always None — Anthropic does not report it; the field
        exists only for structural parity with the OpenAI result.
    """
    input_tokens: int | None
    output_tokens: int | None
    cache_read_tokens: int
    cache_creation_tokens: int
    thinking_tokens: int | None
    tool_calls_made: int | None
    total_tokens: int | None = None
    raw_usage: dict | None = None
    usage_source: str = "provider_response"
    model_served: str | None = None
    request_id: str | None = None
    response_id: str | None = None  # msg_... message ID from the API response
    stop_reason: str | None = None  # end_turn, tool_use, max_tokens, etc.

    @property
    def usage_status(self) -> str:
        """Return "exact" when both token counts are present, else "missing"."""
        if self.input_tokens is not None and self.output_tokens is not None:
            return "exact"
        return "missing"

    @property
    def computed_total_tokens(self) -> int | None:
        """Compute total from input + output since Anthropic doesn't report it.

        Note: this EXCLUDES cache tokens (Anthropic's input_tokens does not
        include cache_read/cache_creation). Returns None when either component
        is missing rather than falling back to an ambiguous value.
        """
        if self.input_tokens is None or self.output_tokens is None:
            return None
        return self.input_tokens + self.output_tokens

    def apply_to_entry(self, entry: dict) -> None:
        """Write normalized fields into a log entry dict.

        BUGFIX: the request_id backward-compat dual-write previously ran even
        when response_id was None, clobbering any pre-existing
        entry["request_id"] with None. It now fires only when there is an
        actual response_id to fall back to.
        """
        entry["input_tokens"] = self.input_tokens
        entry["output_tokens"] = self.output_tokens
        entry["total_tokens"] = self.computed_total_tokens
        entry["cache_read_tokens"] = self.cache_read_tokens
        entry["cache_creation_tokens"] = self.cache_creation_tokens
        entry["thinking_tokens"] = self.thinking_tokens
        entry["raw_usage"] = self.raw_usage
        entry["usage_source"] = self.usage_source
        entry["usage_status"] = self.usage_status
        if self.model_served:
            entry["model"] = self.model_served
        if self.request_id:
            entry["request_id"] = self.request_id
        if self.response_id:
            entry["response_id"] = self.response_id
            # Dual-write: also set request_id for backwards compatibility
            # with existing log entries and downstream code that reads it.
            if not self.request_id:
                entry["request_id"] = self.response_id
        if self.tool_calls_made is not None:
            entry["tool_calls_made"] = self.tool_calls_made
        if self.stop_reason is not None:
            entry["stop_reason"] = self.stop_reason
73
+
74
def extract_from_response(body: dict) -> AnthropicUsageResult:
    """Extract usage from a non-streaming Anthropic Messages API response body."""
    usage = body.get("usage") or {}
    blocks = [b for b in (body.get("content") or []) if isinstance(b, dict)]

    # Anthropic doesn't report thinking tokens; estimate from text length.
    thinking_chars = sum(
        len(b.get("thinking") or "") for b in blocks if b.get("type") == "thinking"
    )
    tool_use_count = sum(b.get("type") == "tool_use" for b in blocks)

    return AnthropicUsageResult(
        input_tokens=usage.get("input_tokens"),
        output_tokens=usage.get("output_tokens"),
        cache_read_tokens=usage.get("cache_read_input_tokens", 0),
        cache_creation_tokens=usage.get("cache_creation_input_tokens", 0),
        thinking_tokens=thinking_chars // 4 if thinking_chars else None,
        tool_calls_made=tool_use_count,
        raw_usage=usage or None,
        usage_source="provider_response",
        model_served=body.get("model"),
        response_id=body.get("id"),  # msg_... is a response identifier
        stop_reason=body.get("stop_reason"),
    )
103
+
104
+
105
def create_stream_handler() -> AnthropicEventHandler:
    """Factory: create a fresh AnthropicEventHandler for a new stream.

    Each stream needs its own handler because the handler accumulates
    per-stream state and finalizes exactly once.
    """
    # Forward reference in the annotation is safe: the module uses
    # `from __future__ import annotations`, and the class is defined below.
    return AnthropicEventHandler()
108
+
109
+
110
class AnthropicEventHandler:
    """Accumulates usage state from parsed Anthropic Messages API SSE events.

    Fed event dicts by SSEStreamBuffer; call finalize() (or apply_to_entry())
    once the stream ends.

    Events handled:
      - message_start        -> input tokens, cache tokens, model, response_id
      - content_block_start  -> tool_calls_made counter (type == "tool_use")
      - content_block_delta  -> thinking char accumulation ("thinking_delta")
      - message_delta        -> output_tokens, stop_reason
      - error                -> error_type, error_message
    (message_stop, ping, and content_block_stop carry nothing we need.)
    """

    def __init__(self) -> None:
        self._finalized_result: AnthropicUsageResult | None = None

        self.model_served: str | None = None
        self.response_id: str | None = None
        self.input_tokens: int | None = None
        self.output_tokens: int | None = None
        self.cache_read_tokens: int = 0
        self.cache_creation_tokens: int = 0
        self._thinking_chars: int = 0
        self._tool_call_count: int = 0
        self._raw_usage_parts: dict = {}
        self.error_type: str | None = None
        self.error_message: str | None = None
        self.stop_reason: str | None = None

    def handle(self, event: dict) -> None:
        """Route one parsed SSE event dict to its accumulator."""
        if self._finalized_result is not None:
            # Late events after finalize() would diverge from the cached result.
            logger.warning("AnthropicEventHandler.handle() called after finalize() — event dropped")
            return

        dispatch = {
            "message_start": self._handle_message_start,
            "content_block_start": self._handle_content_block_start,
            "content_block_delta": self._handle_content_block_delta,
            "message_delta": self._handle_message_delta,
            "error": self._handle_error,
        }
        accumulator = dispatch.get(event.get("type"))
        if accumulator is not None:
            accumulator(event)

    def finalize(self) -> AnthropicUsageResult:
        """Build (once) and return the accumulated usage result.

        Idempotent: the first call caches the result. tool_calls_made is None
        when input_tokens is None (i.e. no message_start was received).
        """
        if self._finalized_result is None:
            saw_message_start = self.input_tokens is not None
            self._finalized_result = AnthropicUsageResult(
                input_tokens=self.input_tokens,
                output_tokens=self.output_tokens,
                cache_read_tokens=self.cache_read_tokens,
                cache_creation_tokens=self.cache_creation_tokens,
                thinking_tokens=self._thinking_chars // 4 if self._thinking_chars else None,
                tool_calls_made=self._tool_call_count if saw_message_start else None,
                raw_usage=dict(self._raw_usage_parts) or None,
                usage_source="provider_stream_final",
                model_served=self.model_served,
                response_id=self.response_id,
                stop_reason=self.stop_reason,
            )
        return self._finalized_result

    def apply_to_entry(self, entry: dict) -> None:
        """Finalize and write accumulated streaming state into a log entry.

        Does NOT set endpoint_family — that must be set by integration code.
        """
        self.finalize().apply_to_entry(entry)
        if self.error_type is not None:
            entry["error"] = True
            entry["error_type"] = self.error_type

    def _handle_message_start(self, event: dict) -> None:
        # message_start carries the model, message id, and input-side usage.
        message = event.get("message") or {}
        usage = message.get("usage") or {}
        self.model_served = message.get("model")
        self.response_id = message.get("id")
        self.input_tokens = usage.get("input_tokens")
        self.cache_read_tokens = usage.get("cache_read_input_tokens", 0)
        self.cache_creation_tokens = usage.get("cache_creation_input_tokens", 0)
        self._raw_usage_parts["message_start"] = usage

    def _handle_content_block_start(self, event: dict) -> None:
        # Each tool_use content block counts as one tool call.
        if (event.get("content_block") or {}).get("type") == "tool_use":
            self._tool_call_count += 1

    def _handle_content_block_delta(self, event: dict) -> None:
        # Thinking tokens aren't reported; accumulate chars to estimate later.
        delta = event.get("delta") or {}
        if delta.get("type") == "thinking_delta":
            self._thinking_chars += len(delta.get("thinking", ""))

    def _handle_message_delta(self, event: dict) -> None:
        # Note: Anthropic only sends cache tokens in message_start, so no
        # cache extraction here — just output tokens and (maybe) stop_reason.
        usage = event.get("usage") or {}
        if "output_tokens" in usage:
            self.output_tokens = usage["output_tokens"]
        self._raw_usage_parts["message_delta"] = usage
        reason = event.get("delta", {}).get("stop_reason")
        if reason is not None:
            self.stop_reason = reason

    def _handle_error(self, event: dict) -> None:
        payload = event.get("error") or {}
        self.error_type = payload.get("type")
        self.error_message = payload.get("message")