tokenmizer 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. tokenmizer/__init__.py +21 -0
  2. tokenmizer/agents/__init__.py +0 -0
  3. tokenmizer/analytics/__init__.py +0 -0
  4. tokenmizer/analytics/engine.py +188 -0
  5. tokenmizer/api/__init__.py +0 -0
  6. tokenmizer/api/app.py +958 -0
  7. tokenmizer/api/rate_limiter.py +110 -0
  8. tokenmizer/checkpoints/__init__.py +0 -0
  9. tokenmizer/checkpoints/manager.py +383 -0
  10. tokenmizer/cli.py +153 -0
  11. tokenmizer/compression/__init__.py +0 -0
  12. tokenmizer/compression/engine.py +669 -0
  13. tokenmizer/compression/output_trimmer.py +95 -0
  14. tokenmizer/compression/window.py +104 -0
  15. tokenmizer/config/__init__.py +0 -0
  16. tokenmizer/config/settings.py +170 -0
  17. tokenmizer/core/__init__.py +0 -0
  18. tokenmizer/core/dto.py +196 -0
  19. tokenmizer/core/errors.py +35 -0
  20. tokenmizer/core/tokenizer.py +96 -0
  21. tokenmizer/dashboard/__init__.py +0 -0
  22. tokenmizer/dashboard/page.py +267 -0
  23. tokenmizer/filters/__init__.py +0 -0
  24. tokenmizer/filters/file_intelligence.py +960 -0
  25. tokenmizer/graph_memory/__init__.py +0 -0
  26. tokenmizer/graph_memory/decision_tracker.py +225 -0
  27. tokenmizer/graph_memory/graph.py +1287 -0
  28. tokenmizer/graph_memory/helpers.py +121 -0
  29. tokenmizer/graph_memory/hybrid_extractor.py +703 -0
  30. tokenmizer/graph_memory/types.py +134 -0
  31. tokenmizer/graph_memory/validator.py +304 -0
  32. tokenmizer/graph_memory/visualization.py +228 -0
  33. tokenmizer/mcp/__init__.py +0 -0
  34. tokenmizer/mcp/server.py +368 -0
  35. tokenmizer/providers/__init__.py +0 -0
  36. tokenmizer/providers/providers.py +456 -0
  37. tokenmizer/security/__init__.py +0 -0
  38. tokenmizer/security/auth.py +95 -0
  39. tokenmizer/security/middleware.py +138 -0
  40. tokenmizer/security/redaction.py +126 -0
  41. tokenmizer/semantic_cache/__init__.py +0 -0
  42. tokenmizer/semantic_cache/cache.py +383 -0
  43. tokenmizer/state/__init__.py +0 -0
  44. tokenmizer/state/backend.py +137 -0
  45. tokenmizer/storage/__init__.py +56 -0
  46. tokenmizer-0.2.4.dist-info/METADATA +529 -0
  47. tokenmizer-0.2.4.dist-info/RECORD +50 -0
  48. tokenmizer-0.2.4.dist-info/WHEEL +4 -0
  49. tokenmizer-0.2.4.dist-info/entry_points.txt +2 -0
  50. tokenmizer-0.2.4.dist-info/licenses/LICENSE +21 -0
tokenmizer/api/app.py ADDED
@@ -0,0 +1,958 @@
1
+ """
2
+ TokenMizer — main FastAPI application.
3
+
4
+ OpenAI-compatible proxy: POST /v1/chat/completions plus session/graph
5
+ management endpoints. See README API Reference.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import asyncio
10
+ import logging
11
+ import time
12
+ import uuid
13
+ from collections import OrderedDict
14
+ from contextlib import asynccontextmanager
15
+ from typing import Optional
16
+
17
+ from fastapi import Depends, FastAPI, HTTPException, Request
18
+ from fastapi.middleware.cors import CORSMiddleware
19
+ from fastapi.responses import HTMLResponse
20
+ from pydantic import BaseModel
21
+
22
+ from tokenmizer.analytics.engine import AnalyticsEngine
23
+ from tokenmizer.api.rate_limiter import get_rate_limiter
24
+ from tokenmizer.checkpoints.manager import CheckpointManager
25
+ from tokenmizer.compression.engine import CompressionPipeline
26
+ from tokenmizer.compression.output_trimmer import OutputTrimmer
27
+ from tokenmizer.compression.window import SmartMessageWindow, needs_windowing
28
+ from tokenmizer.config.settings import get_settings
29
+ from tokenmizer.core.tokenizer import count_messages_tokens, count_tokens
30
+ from tokenmizer.filters.file_intelligence import FileIntelligence
31
+ from tokenmizer.graph_memory.graph import GraphMemory
32
+ from tokenmizer.providers.providers import build_provider
33
+ from tokenmizer.security.auth import verify_api_key
34
+ from tokenmizer.security.middleware import injection_guard
35
+ from tokenmizer.security.redaction import redact_messages
36
+ from tokenmizer.semantic_cache.cache import SemanticCache
37
+ from tokenmizer.state.backend import get_state_backend
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+ settings = get_settings()
42
+
43
# ── Singletons ────────────────────────────────────────────────────────────────
# Process-wide collaborators, built once at import time from `settings`.

# Main chat provider; stays None until _get_provider() builds it on first use.
_provider = None

# Prompt compressor; the ML path is enabled only for the "llmlingua2" engine.
_compression = CompressionPipeline(
    ratio=settings.compression.ratio,
    enable_ml=settings.compression.engine == "llmlingua2",
)

# Semantic response cache, sized and expired according to the cache settings.
_cache = SemanticCache(
    threshold=settings.cache.similarity_threshold,
    ttl_seconds=settings.cache.ttl_seconds,
    max_size=settings.cache.max_size,
)

# Checkpoints share the storage directory configured for graph checkpoints.
_checkpoint_mgr = CheckpointManager(
    storage_dir=settings.graph_checkpoint.storage_dir,
)
_analytics = AnalyticsEngine()
_state = get_state_backend(settings.state_backend, settings.redis_url)
_output_trimmer = OutputTrimmer()

# Rate limiter parameters are fixed here: rate=60 per 60 seconds, burst=10.
_rate_limiter = get_rate_limiter(rate=60, per_seconds=60, burst=10)
59
+
60
# Bounded LRU for session locks — prevents memory leak on long-running servers.
# Soft cap of 1000 tracked sessions; eviction removes the least-recently-used
# UNHELD locks and never touches a lock that is currently held.
_SESSION_LOCK_MAX = 1000
_session_locks: "OrderedDict[str, asyncio.Lock]" = OrderedDict()


def _get_session_lock(session_id: str) -> asyncio.Lock:
    """
    Get or create the per-session async lock (LRU-bounded).

    An existing lock is marked most-recently-used and returned. Otherwise a
    new lock is registered and, if the registry is over `_SESSION_LOCK_MAX`,
    unheld locks are evicted oldest-first until the registry is back within
    the cap.

    Eviction safety: only UNHELD locks (`lock.locked()` is False) are evicted,
    and never the lock being returned. If every other lock is held, nothing is
    evicted and the registry temporarily exceeds the cap; the surplus is
    reclaimed on a later call once those locks have been released. (Previously
    at most one lock was evicted per call, so the registry never shrank back
    after such an overshoot.)

    Args:
        session_id: Session identifier used as the registry key.

    Returns:
        The `asyncio.Lock` associated with `session_id`.
    """
    existing = _session_locks.get(session_id)
    if existing is not None:
        _session_locks.move_to_end(session_id)
        return existing

    lock = asyncio.Lock()
    _session_locks[session_id] = lock

    if len(_session_locks) > _SESSION_LOCK_MAX:
        # Iterate over a snapshot (oldest first) because we delete as we go.
        for old_id in list(_session_locks):
            if len(_session_locks) <= _SESSION_LOCK_MAX:
                break
            if old_id != session_id and not _session_locks[old_id].locked():
                del _session_locks[old_id]

    return lock
95
+
96
# Message windowing uses the memory settings; graph context budget is fixed.
_smart_window = SmartMessageWindow(
    token_budget=settings.memory.max_tokens_before_summary,
    protect_recent=settings.memory.recent_turns_verbatim,
    graph_context_budget=250,
)
_file_intelligence = FileIntelligence()

# Lazily built by _get_cheap_provider(); None until then (or when unavailable).
_cheap_provider = None


def _get_cheap_provider():
    """
    Return the low-cost provider used for LLM-based graph extraction.

    The provider is built on first successful call and memoised in the
    module-level `_cheap_provider`. The model is picked from the configured
    provider name: claude-haiku-4-5 for anthropic/claude, gpt-4o-mini for
    openai/gpt, deepseek-chat for deepseek.

    Returns:
        A provider instance, or None when the configured provider is not one
        of the above or has no API key (callers then fall back to heuristic
        extraction). A None result is not cached, so the lookup is repeated
        on the next call.
    """
    global _cheap_provider
    if _cheap_provider is not None:
        return _cheap_provider

    from tokenmizer.providers.providers import AnthropicProvider, OpenAIProvider

    provider = settings.provider.lower()
    key = settings.get_api_key_for_provider(provider)

    if not key:
        # Without a key no cheap model can be built, whatever the provider.
        _cheap_provider = None
    elif provider in ("anthropic", "claude"):
        _cheap_provider = AnthropicProvider(key, model="claude-haiku-4-5")
    elif provider in ("openai", "gpt"):
        _cheap_provider = OpenAIProvider(key, model="gpt-4o-mini")
    elif provider == "deepseek":
        from tokenmizer.providers.providers import DeepSeekProvider

        _cheap_provider = DeepSeekProvider(key, model="deepseek-chat")
    else:
        # Unrecognised provider — heuristic extraction only.
        _cheap_provider = None

    return _cheap_provider
132
+
133
+
134
def _get_provider():
    """Return the main chat provider, building it from `settings` on first use."""
    global _provider
    if _provider is not None:
        return _provider
    _provider = build_provider(settings)
    return _provider
139
+
140
+
141
+ # ── Graph helpers (state-backend backed) ─────────────────────────────────────
142
+
143
# In-process cache of GraphMemory objects, keyed by session_id, so a graph is
# not rebuilt on every request.
#
# The cache is LRU-bounded: once it grows past _GRAPH_CACHE_MAX the
# least-recently-used graph is flushed via its _persist() method and dropped
# from memory. The cap is sized for typical self-hosted deployments (one
# process, many sessions over time).
_GRAPH_CACHE_MAX = 200
_graph_cache: "OrderedDict[str, GraphMemory]" = OrderedDict()
# Serialises cache-miss creation in _get_graph_async().
_graph_cache_lock = asyncio.Lock()


def _flush_evicted_graph(evicted_id: str, evicted_graph) -> bool:
    """Persist an evicted graph, retrying once; return True if it was saved."""
    try:
        evicted_graph._persist()
        return True
    except Exception as e:
        logger.warning(
            f"Persist attempt 1 failed for evicted graph {evicted_id}, retrying: {e}"
        )
    # Second and final attempt.
    try:
        evicted_graph._persist()
        return True
    except Exception as e:
        logger.error(
            f"Graph {evicted_id} evicted from cache WITHOUT persisting — "
            f"nodes added since last successful save are LOST: {e}"
        )
    return False


def _graph_cache_touch(session_id: str) -> None:
    """
    Mark `session_id` as most-recently-used and trim the cache to its cap.

    Each evicted graph is flushed with up to two `_persist()` attempts. If
    both fail the graph is still dropped, the failure is logged at `error`
    level and counted through `_analytics.record_silent_failure`.

    Raises:
        KeyError: if `session_id` is not currently in the cache.
    """
    _graph_cache.move_to_end(session_id)
    while len(_graph_cache) > _GRAPH_CACHE_MAX:
        evicted_id, evicted_graph = _graph_cache.popitem(last=False)
        if not _flush_evicted_graph(evicted_id, evicted_graph):
            _analytics.record_silent_failure("graph_eviction")
188
+
189
+
190
async def _get_graph_async(session_id: str) -> GraphMemory:
    """
    Return the cached GraphMemory for `session_id`, creating it on a miss.

    A hit is served without taking the lock. On a miss the cache is checked
    again while holding `_graph_cache_lock`, so two concurrent requests for
    the same new session do not each construct a GraphMemory. Every access
    refreshes the entry's LRU position, which may evict older graphs.
    """
    if session_id not in _graph_cache:
        async with _graph_cache_lock:
            # Another coroutine may have created it while we waited.
            if session_id not in _graph_cache:
                _graph_cache[session_id] = GraphMemory(
                    session_id,
                    storage_dir=settings.graph_checkpoint.storage_dir,
                )
            _graph_cache_touch(session_id)
            return _graph_cache[session_id]

    _graph_cache_touch(session_id)
    return _graph_cache[session_id]
207
+
208
+
209
def _get_context_used(session_id: str) -> int:
    """Return the stored context-token count for the session, or 0 if unset."""
    stored = _state.get(f"ctx:{session_id}")
    return stored if stored else 0
211
+
212
+
213
def _set_context_used(session_id: str, tokens: int) -> None:
    """
    Store the session's running context-token count with a 24-hour TTL.

    The state backend's `set()` result is treated as a success flag. A dropped
    write would under-count context usage and could make the auto-checkpoint
    threshold fire late, so a failure is logged and counted through
    `_analytics.record_silent_failure` rather than ignored.
    """
    if _state.set(f"ctx:{session_id}", tokens, ttl=86400):
        return
    logger.error(f"Failed to persist context usage for session {session_id}")
    _analytics.record_silent_failure("state_backend_set")
224
+
225
+
226
+ # ── Context window sizes ──────────────────────────────────────────────────────
227
+
228
# Keys are matched as case-insensitive SUBSTRINGS of the model name, so the
# newest Claude models (fable-5, opus-4-8, sonnet-5, haiku-4-5) are all covered
# by the "claude" catch-all. Add a specific entry ONLY if a model's window
# differs.
_CONTEXT_WINDOWS = {
    "claude-fable-5": 200_000, "claude-opus-4-8": 200_000,
    "claude-sonnet": 200_000, "claude-opus": 200_000, "claude-haiku": 200_000,
    "claude": 200_000,
    "gpt-4o": 128_000, "gpt-4": 128_000, "gpt-3.5": 16_000,
    "gemini": 1_000_000, "deepseek": 64_000,
}

# Longest key first — so "claude-fable-5" wins over the "claude" catch-all if
# their values ever diverge. Sorted once at import instead of on every call.
_CONTEXT_WINDOW_KEYS = tuple(sorted(_CONTEXT_WINDOWS, key=len, reverse=True))

# Window assumed for models that match no known key.
_DEFAULT_CONTEXT_WINDOW = 128_000


def _context_window(model: str) -> int:
    """
    Return the context-window size, in tokens, for a model name.

    Args:
        model: Model identifier. Matching is case-insensitive; the most
            specific (longest) key contained in the name wins. A None or
            empty value is treated as unknown.

    Returns:
        The matching entry from `_CONTEXT_WINDOWS`, or 128_000 when no key
        matches.
    """
    m = (model or "").lower()
    for k in _CONTEXT_WINDOW_KEYS:
        if k in m:
            return _CONTEXT_WINDOWS[k]
    return _DEFAULT_CONTEXT_WINDOW
248
+
249
+
250
+ # ── Lifespan ─────────────────────────────────────────────────────────────────
251
+
252
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: log on startup, and again after shutdown."""
    logger.info("TokenMizer starting")
    # Control returns here once the application is shutting down.
    yield
    logger.info("TokenMizer stopped")
257
+
258
+
259
+ # ── App ───────────────────────────────────────────────────────────────────────
260
+
261
# The ASGI application; interactive docs are served at /docs and /redoc.
app = FastAPI(
    title="TokenMizer",
    description="Never lose your AI context again.",
    version="0.2.4",
    lifespan=lifespan,
    docs_url="/docs",
    redoc_url="/redoc",
)

# CORS: origins come from settings; only the methods and headers listed here
# are allowed, and credentials are permitted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.cors_origins,
    allow_credentials=True,
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=[
        "Content-Type",
        "Authorization",
        "X-Session-ID",
        "X-API-Key",
    ],
)
277
+
278
+
279
+ # ── Request / Response models ─────────────────────────────────────────────────
280
+
281
class ChatMessage(BaseModel):
    """One OpenAI-style chat message.

    `content` may be a plain string, a list of content blocks (the multimodal
    format), or None. TokenMizer is a text proxy: use `text()` to obtain the
    content normalized to a single string.
    """

    role: str
    content: str | list | None = ""

    def text(self) -> str:
        """Return `content` flattened to text via `_content_to_text`."""
        # Imported lazily, at call time, rather than at module import.
        from tokenmizer.graph_memory.helpers import _content_to_text

        normalized = _content_to_text(self.content)
        return normalized
292
+
293
+
294
class ChatRequest(BaseModel):
    """OpenAI-compatible chat completion request body.

    Unknown fields are accepted (extra='allow') so standard OpenAI clients
    never receive a 422, but only the fields declared here influence the
    call. The sampling parameters (temperature, top_p, stop) are forwarded to
    the provider when set.
    """

    model_config = {"extra": "allow"}

    # Falls back to the configured default model when omitted.
    model: Optional[str] = None
    messages: list[ChatMessage]
    max_tokens: Optional[int] = 4096
    stream: Optional[bool] = False
    # A fresh UUID is generated by the endpoint when no session id is given.
    session_id: Optional[str] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    stop: Optional[str | list[str]] = None
309
+
310
+
311
def _sampling_kwargs(req: "ChatRequest") -> dict:
    """Collect the sampling params the client explicitly set (non-None).

    Returns a dict with any of the keys "temperature", "top_p" and "stop", in
    that order, ready to be splatted into the provider call.
    """
    candidates = (
        ("temperature", req.temperature),
        ("top_p", req.top_p),
        ("stop", req.stop),
    )
    return {name: value for name, value in candidates if value is not None}
321
+
322
+
323
+ # ── chat_completions helpers ──────────────────────────────────────────────────
324
+
325
+
326
async def _check_rate_limit(request: Request) -> None:
    """
    Enforce the per-client rate limit.

    The client is identified by its Authorization header when present,
    otherwise by its IP address ("unknown" if the connection has none).

    Raises:
        HTTPException: 429 with a Retry-After header when the limit is hit.
    """
    fallback_id = request.client.host if request.client else "unknown"
    client_id = request.headers.get("Authorization", fallback_id)

    allowed, retry_after = await _rate_limiter.check(client_id)
    if allowed:
        return

    raise HTTPException(
        status_code=429,
        detail=f"Rate limit exceeded. Retry after {retry_after:.1f}s",
        headers={"Retry-After": str(int(retry_after) + 1)},
    )
337
+
338
+
339
def _apply_compression_layers(
    messages: list[dict],
    settings,
    savings: dict,
) -> list[dict]:
    """
    Layer 0-2: file intelligence, compression, terse injection.

    Args:
        messages: Chat messages as role/content dicts.
        settings: Settings object; `compression.enabled`,
            `terse_output.enabled` and `terse_output.level` are read.
        savings: Mutated in place — receives "file_extraction" and, when
            compression is enabled, "compression".

    Returns:
        The processed message list. The terse prefix is applied to a COPY of
        the first system message: callers commonly pass a shallow copy of
        their raw message list, and mutating the shared dict in place leaked
        the terse prompt into the raw messages (and from there into graph
        extraction, checkpoints and the "original" token count).
    """
    user_query = next(
        (m["content"] for m in reversed(messages) if m.get("role") == "user"), ""
    )

    # Layer 0: File intelligence
    messages, file_saved = _file_intelligence.process_message_files(
        messages, token_budget_per_file=600, query=user_query
    )
    savings["file_extraction"] = file_saved

    # Layer 1: Prompt compression
    if settings.compression.enabled:
        messages, saved = _compression.compress_messages(messages, protect_recent=3)
        savings["compression"] = saved

    # Layer 2: Terse output injection
    if settings.terse_output.enabled:
        terse = _compression.terse_system_prompt(settings.terse_output.level)
        sys_idx = next(
            (i for i, m in enumerate(messages) if m.get("role") == "system"), None
        )
        if sys_idx is None:
            messages = [{"role": "system", "content": terse}] + messages
        else:
            # Copy-on-write: new list, new dict for the one message we change.
            messages = list(messages)
            original = messages[sys_idx]
            messages[sys_idx] = {
                **original,
                "content": terse + "\n\n" + original["content"],
            }

    return messages
376
+
377
+
378
# Strong references to in-flight background extraction tasks. The event loop
# keeps only weak references to tasks, so a fire-and-forget task with no other
# referent may be garbage-collected before it completes.
_background_tasks: "set[asyncio.Task]" = set()


async def _llm_extract_in_background(
    graph, new_msgs, all_msgs, cheap, lock, session_id
) -> None:
    """Run LLM-backed extraction for `new_msgs` while holding the session lock.

    Failures never propagate: they are logged at `warning` and counted via
    `_analytics.record_silent_failure("llm_extraction")`, so a persistently
    failing cheap provider (bad key, outage, quota) is visible.
    """
    async with lock:
        try:
            from tokenmizer.graph_memory.hybrid_extractor import HybridExtractor

            async def _pfn(messages, system="", max_tokens=600):
                r = await cheap.chat(
                    messages=messages, system=system, max_tokens=max_tokens
                )
                return {"text": r.text}

            ext = HybridExtractor(provider_fn=_pfn)
            extracted = await ext.extract(new_msgs)
            graph.extract_from_messages(
                all_msgs, incremental=False, extracted_data=extracted
            )
            logger.debug(f"HybridExtractor complete for {session_id}")
        except Exception as e:
            logger.warning(
                f"Background LLM extraction failed for session "
                f"{session_id} (falling back to heuristic-only on next "
                f"calls, no data lost — just less accurate "
                f"extraction this turn): {e}"
            )
            _analytics.record_silent_failure("llm_extraction")


def _inject_graph_context(messages: list[dict], graph, user_query: str) -> list[dict]:
    """Prefix the first system message with relevant graph nodes, if any.

    Injection happens only when the graph has at least 3 nodes, the query has
    at least 4 words, the graph query returns results and a system message
    exists. The input list and its dicts are left untouched; a new list with
    a new system-message dict is returned when something is injected.
    """
    if len(graph._nodes) < 3 or len(user_query.split()) < 4:
        return messages
    relevant = graph.query(user_query, top_k=8)
    if not relevant:
        return messages

    ctx_parts = [
        f" {n.type.value}: {n.label}"
        + (f" ({n.summary[:50]})" if n.summary else "")
        for n in relevant[:6]
    ]
    ctx_block = "\n".join(ctx_parts)
    sys_idx = next(
        (i for i, m in enumerate(messages) if m.get("role") == "system"), None
    )
    if sys_idx is None:
        return messages

    # Copy-on-write: the system dict may be shared with the raw messages that
    # are about to be checkpointed, so it must not be edited in place.
    messages = list(messages)
    original = messages[sys_idx]
    messages[sys_idx] = {
        **original,
        "content": (
            f"[Relevant session context]\n{ctx_block}\n\n"
            f"{original['content']}"
        ),
    }
    return messages


async def _run_auto_checkpoint(
    session_id: str, graph, raw_messages: list[dict], model: str, context_pct: float
) -> dict:
    """Create an auto-checkpoint when the usage threshold is reached.

    Retries once after 0.1 s (intended to cover transient storage lock
    contention). A final failure does NOT raise — the chat request should
    still succeed — but is logged at `error`, counted in analytics and
    reported in the returned status under "error".

    Returns:
        {"attempted", "succeeded", "checkpoint_id"} plus "error" on failure.
    """
    status = {"attempted": False, "succeeded": False, "checkpoint_id": None}
    if not (context_pct >= settings.graph_checkpoint.trigger_at_percent
            and settings.graph_checkpoint.enabled):
        return status

    status["attempted"] = True
    last_error: Optional[Exception] = None
    for attempt in range(2):
        try:
            ckpt = _checkpoint_mgr.create(
                session_id=session_id,
                messages=raw_messages,
                graph=graph,
                context_pct=context_pct,
                trigger="auto_threshold",
                model=model,
            )
            logger.info(f"Auto-checkpoint {ckpt.checkpoint_id} for {session_id}")
            status["succeeded"] = True
            status["checkpoint_id"] = ckpt.checkpoint_id
            last_error = None
            break
        except Exception as e:
            last_error = e
            if attempt == 0:
                logger.warning(
                    f"Auto-checkpoint attempt 1 failed for {session_id}, retrying once: {e}"
                )
                await asyncio.sleep(0.1)

    if last_error is not None:
        logger.error(
            f"Auto-checkpoint FAILED for {session_id} after retry — "
            f"context was NOT saved at {context_pct:.0%} usage: {last_error}"
        )
        status["error"] = str(last_error)
        _analytics.record_silent_failure("checkpoint")
    return status


async def _update_graph(
    session_id: str,
    graph,
    raw_messages: list[dict],
    messages: list[dict],
    model: str,
    savings: dict,
    user_query: str,
) -> tuple[list[dict], dict]:
    """
    Layer 4: Graph extraction, smart windowing, context injection, checkpoint.

    Args:
        session_id: Session whose graph and context counter are updated.
        graph: The session's graph memory object.
        raw_messages: Unprocessed (redacted) messages; used for extraction
            and checkpoint storage, and never modified here.
        messages: Processed messages that will be sent to the provider.
        model: Model name, used for token counting and window size.
        savings: Mutated in place — receives "windowing".
        user_query: Latest user message text, used to query the graph.

    Returns:
        (updated_messages, checkpoint_status). Use the returned list: graph
        context is injected into a copy, not into `messages` itself.
        checkpoint_status surfaces auto-checkpoint success/failure to the
        caller instead of only logging it.
    """
    context_used = _get_context_used(session_id)
    context_window = _context_window(model)
    input_tokens = count_messages_tokens(messages, model)
    context_pct = (context_used + input_tokens) / context_window

    # Decide BEFORE the heuristic pass which recent messages are new, because
    # the pass itself is expected to mark them as processed.
    cheap = None
    new_msgs: list[dict] = []
    if settings.graph_checkpoint.use_llm_extraction:
        cheap = _get_cheap_provider()
        if cheap is not None:
            new_msgs = [
                m for m in raw_messages[-4:]
                if graph._msg_hash(m) not in graph._processed_hashes
            ]

    # Heuristic extraction always runs synchronously.
    graph.extract_from_messages(raw_messages, incremental=True)

    # LLM extraction runs in the background so it never delays the response.
    if new_msgs:
        task = asyncio.create_task(
            _llm_extract_in_background(
                graph, new_msgs, raw_messages, cheap,
                _get_session_lock(session_id), session_id,
            )
        )
        # Hold a reference until the task finishes (see _background_tasks).
        _background_tasks.add(task)
        task.add_done_callback(_background_tasks.discard)

    # Smart windowing
    if needs_windowing(messages, settings.memory.max_tokens_before_summary, model):
        messages, window_saved = _smart_window.apply(messages, graph, model)
        savings["windowing"] = window_saved
    else:
        savings["windowing"] = 0

    # Context injection — only when graph has enough signal
    messages = _inject_graph_context(messages, graph, user_query)

    # Auto-checkpoint (never raises; outcome is reported in the status dict)
    checkpoint_status = await _run_auto_checkpoint(
        session_id, graph, raw_messages, model, context_pct
    )

    _set_context_used(session_id, context_used + input_tokens)
    return messages, checkpoint_status
539
+
540
+
541
async def _call_provider(
    req,
    messages: list[dict],
    model: str,
    user_content: str,
    session_id: str,
    savings: dict,
) -> tuple[str, int, int, float]:
    """
    Layer 3 + 5: Cache lookup → LLM call → output trim → cache write.

    Args:
        req: The chat request; `stream`, `max_tokens` and the sampling
            params are read from it.
        messages: Messages to send. They are expected to be redacted already
            (redaction happens once at ingestion) and are deliberately NOT
            re-redacted here, so a regression upstream is not papered over.
        model: Model name passed to the provider and used for token counts.
        user_content: Latest user message; the cache key.
        session_id: Session the cache entry is scoped to.
        savings: Mutated in place — receives "cache" on a hit, "output_trim"
            when terse output is enabled.

    Returns:
        (response_text, input_tokens, output_tokens, latency_ms). A cache hit
        reports input_tokens=0 and latency_ms=0.0.

    Raises:
        HTTPException: 501 when `req.stream` is set; 502 when the provider
            call fails (the original exception is chained as the cause).
    """
    # Streaming check first: the proxy can never honour stream=True, so the
    # answer must not depend on whether the cache happens to hold an entry
    # (previously a hit returned a non-streaming body, a miss returned 501).
    if req.stream:
        raise HTTPException(
            status_code=501,
            detail=(
                "Streaming is not yet supported by the TokenMizer proxy. "
                "Set stream=False in your request, or connect directly to "
                "your provider for streaming. True SSE streaming is planned for v0.3."
            ),
        )

    use_cache = settings.cache.enabled and user_content

    # Cache lookup
    if use_cache:
        cached = _cache.get(user_content, session_id=session_id)
        if cached:
            savings["cache"] = count_tokens(user_content, model)
            output_tokens = count_tokens(cached.response, model)
            return cached.response, 0, output_tokens, 0.0

    # LLM call
    provider = _get_provider()
    try:
        resp = await provider.chat(
            messages=messages, model=model,
            max_tokens=req.max_tokens or 4096, stream=False,
            **_sampling_kwargs(req),
        )
    except Exception as e:
        logger.error(f"Provider error: {e}")
        # Chain the cause so the provider's traceback is kept.
        raise HTTPException(status_code=502, detail=f"Provider error: {str(e)}") from e

    response_text = resp.text
    output_tokens = resp.output_tokens
    input_tokens = resp.input_tokens
    latency_ms = resp.latency_ms

    # Output trim
    if settings.terse_output.enabled:
        response_text, output_saved = _output_trimmer.trim(
            response_text, level=settings.terse_output.level
        )
        savings["output_trim"] = output_saved
        # Never report fewer than one completion token.
        output_tokens = max(1, output_tokens - output_saved)

    # Cache write
    if use_cache:
        _cache.set(user_content, response_text,
                   input_tokens=input_tokens, output_tokens=output_tokens,
                   session_id=session_id)

    return response_text, input_tokens, output_tokens, latency_ms
610
+
611
+
612
@app.post("/v1/chat/completions", dependencies=[Depends(verify_api_key), Depends(injection_guard)])
async def chat_completions(req: ChatRequest, request: Request):
    """
    Main proxy endpoint — orchestrates all 6 layers.

    Split into helpers to keep this orchestrator readable:
      _check_rate_limit()         — 429 if over limit
      _apply_compression_layers() — file intelligence, compress, terse inject
      _update_graph()             — graph extraction, windowing, context inject
      _call_provider()            — cache → LLM → output trim → cache write

    Returns an OpenAI-style chat.completion object extended with `session_id`
    and a `tokenmizer` section (cache hit flag, per-layer savings, latency and
    auto-checkpoint status).
    """
    session_id = req.session_id or str(uuid.uuid4())
    model = req.model or settings.default_model
    savings: dict[str, int] = {}

    await _check_rate_limit(request)

    # SECURITY: redact secrets/PII at the earliest possible point, before
    # ANY downstream consumer sees the content: the main provider call, the
    # background graph-extraction LLM call (a separate third-party model),
    # checkpoint storage and the graph itself. Redacting once here makes every
    # downstream path safe by construction.
    raw_messages = [{"role": m.role, "content": m.text()} for m in req.messages]
    raw_messages = redact_messages(raw_messages)

    # Count the original prompt BEFORE any layer runs, and give the layers
    # their own message dicts. A shallow list copy shares the dicts, so an
    # in-place edit (terse prefix, graph context) would leak into
    # raw_messages — and from there into extraction, checkpoints and this
    # count.
    orig_input_tokens = count_messages_tokens(raw_messages, model)
    messages = [dict(m) for m in raw_messages]

    user_query = next(
        (m["content"] for m in reversed(raw_messages) if m.get("role") == "user"), ""
    )
    user_content = user_query

    # Layer 0-2: file intelligence, compression, terse injection
    messages = _apply_compression_layers(messages, settings, savings)
    sent_input_tokens = count_messages_tokens(messages, model)
    savings["routing"] = 0

    # Layer 4: graph update + context injection. Runs BEFORE the provider
    # call: heuristic extraction is synchronous, LLM extraction is scheduled
    # as a background task.
    checkpoint_status: dict = {"attempted": False, "succeeded": False, "checkpoint_id": None}
    if settings.graph_checkpoint.enabled:
        graph = await _get_graph_async(session_id)
        messages, checkpoint_status = await _update_graph(
            session_id, graph, raw_messages, messages, model, savings, user_query
        )

    # Layer 3+5: cache lookup, provider call, output trim, cache write
    response_text, input_tokens_actual, output_tokens, latency_ms = await _call_provider(
        req, messages, model, user_content, session_id, savings
    )
    # _call_provider reports zero input tokens for a cache hit.
    cache_hit = input_tokens_actual == 0 and response_text != ""

    # Analytics
    total_saved = sum(savings.values())
    _analytics.record(
        session_id=session_id,
        provider=settings.provider,
        model=model,
        input_tokens_original=orig_input_tokens,
        input_tokens_sent=sent_input_tokens,
        output_tokens=output_tokens,
        tokens_saved=total_saved,
        latency_ms=latency_ms,
        cache_hit=cache_hit,
        layer_savings=savings,
    )

    return {
        "id": f"chatcmpl-{uuid.uuid4().hex[:12]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": model,
        "session_id": session_id,
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": response_text},
            "finish_reason": "stop",
        }],
        "usage": {
            "prompt_tokens": input_tokens_actual,
            "completion_tokens": output_tokens,
            "total_tokens": input_tokens_actual + output_tokens,
            "original_prompt_tokens": orig_input_tokens,
            "tokens_saved": total_saved,
        },
        "tokenmizer": {
            "cache_hit": cache_hit,
            "savings": savings,
            "total_saved": total_saved,
            "latency_ms": round(latency_ms, 1),
            # Surfaced so a client can detect "my context wasn't saved"
            # instead of finding out only when resume returns nothing.
            "checkpoint": checkpoint_status,
        },
    }
716
+
717
+ # ── Health / Info ─────────────────────────────────────────────────────────────
718
+
719
@app.get("/health")
async def health():
    """Liveness probe: a fixed "ok" status plus the current Unix time."""
    now = time.time()
    return {"status": "ok", "timestamp": now}
722
+
723
+
724
@app.get("/")
async def dashboard():
    """Serve the built-in dashboard page as HTML."""
    # Imported on demand, at request time, rather than at module import.
    from tokenmizer.dashboard.page import DASHBOARD_HTML

    page = HTMLResponse(DASHBOARD_HTML)
    return page
728
+
729
+
730
+ # ── Session / Graph endpoints ─────────────────────────────────────────────────
731
+
732
@app.get("/api/stats", dependencies=[Depends(verify_api_key)])
async def stats(session_id: Optional[str] = None):
    """Return the analytics summary.

    NOTE(review): `session_id` is accepted as a query parameter but is not
    used — the summary is not filtered by session. Confirm whether filtering
    was intended.
    """
    summary = _analytics.summary()
    return summary
735
+
736
+
737
@app.get("/api/cache/stats", dependencies=[Depends(verify_api_key)])
async def cache_stats():
    """
    Return cache statistics, extended with the preference context.

    The result of `_cache.stats()` gains one extra key, "preference_context",
    holding the output of the cache's preference store `to_system_context()`.
    """
    payload = _cache.stats()
    # Reaches into the cache's private preference store so the context is
    # reported alongside the regular statistics.
    preference_context = _cache._preference_store.to_system_context()
    payload["preference_context"] = preference_context
    return payload
743
+
744
+
745
@app.get("/api/graph/{session_id}/history", dependencies=[Depends(verify_api_key)])
async def get_graph_history(session_id: str, at_time: float = 0.0, top_k: int = 12):
    """
    Query a session's graph, optionally as of a specific Unix timestamp.

    at_time=0.0 (default) runs `graph.query("")` against the current state.
    at_time=<unix_ts> runs `graph.query_at_time("")` for that point in time.
    top_k is forwarded to whichever query is used.

    Useful for: debugging decision changes, audit trail, "what did we decide
    at 2pm?" queries.

    Returns a dict with the session id, the requested time (None when the
    default 0.0 was used), the serialized nodes and their count.
    """
    graph = await _get_graph_async(session_id)

    # 0.0 is the sentinel for "no point in time requested".
    if at_time != 0.0:
        matched = graph.query_at_time("", at_time=at_time, top_k=top_k)
    else:
        matched = graph.query("", top_k=top_k)

    serialized = []
    for node in matched:
        serialized.append({
            "id": node.id,
            "label": node.label,
            "type": node.type.value,
            "status": node.status.value,
            "importance": node.importance,
            "valid_from": node.valid_from,
            # Falsy valid_until values are reported as None.
            "valid_until": node.valid_until or None,
        })

    return {
        "session_id": session_id,
        "at_time": at_time if at_time else None,
        "nodes": serialized,
        "count": len(matched),
    }
773
+
774
+
775
@app.get("/api/graph/{session_id}", dependencies=[Depends(verify_api_key)])
async def get_graph(session_id: str):
    """Return the result of `stats()` for the session's graph."""
    session_graph = await _get_graph_async(session_id)
    summary = session_graph.stats()
    return summary
779
+
780
+
781
@app.get("/api/graph/{session_id}/viz", dependencies=[Depends(verify_api_key)])
async def get_graph_viz(session_id: str):
    """
    Return the session's full graph as JSON for visualization.

    The payload is whatever `graph.to_vis_json()` produces; the original
    author describes it as D3-compatible: {nodes: [...], edges: [...], meta: {...}},
    consumed by the dashboard Graph tab and external viz tools.
    """
    session_graph = await _get_graph_async(session_id)
    vis_payload = session_graph.to_vis_json()
    return vis_payload
790
+
791
+
792
@app.get("/api/graph/{session_id}/obsidian", dependencies=[Depends(verify_api_key)])
async def get_graph_obsidian(session_id: str):
    """
    Download the session's graph as an Obsidian Canvas (.canvas) file.

    Save as <any-name>.canvas inside your Obsidian vault and open directly.

    The response body is the JSON-serialized result of
    `graph.to_obsidian_canvas()`, sent as an attachment. The suggested
    filename is built from the first 12 characters of `session_id`, with
    every character outside [A-Za-z0-9._-] replaced by "_".
    """
    import json as _json
    import re as _re

    from fastapi.responses import Response as _Resp

    graph = await _get_graph_async(session_id)
    canvas = graph.to_obsidian_canvas()

    # session_id comes straight from the URL path, so it may contain spaces,
    # quotes, ';', control characters or non-latin-1 text. Any of those would
    # produce a malformed Content-Disposition header (or fail header encoding),
    # so reduce it to a conservative filename token and quote the result.
    safe_id = _re.sub(r"[^A-Za-z0-9._-]", "_", session_id[:12]) or "session"
    filename = f"tokenmizer-{safe_id}.canvas"
    return _Resp(
        content=_json.dumps(canvas, indent=2),
        media_type="application/json",
        headers={"Content-Disposition": f'attachment; filename="{filename}"'},
    )
809
+
810
+
811
@app.get("/api/graph/{session_id}/transitions", dependencies=[Depends(verify_api_key)])
async def get_transitions(session_id: str):
    """
    Full decision transition history — trigger, reason, evidence, confidence_delta.

    Returns a dict with the session id, one serialized entry per transition
    reported by `graph.get_transitions()`, and the number of transitions.
    """
    graph = await _get_graph_async(session_id)
    # Fetch once and reuse: the list and the count are then guaranteed to
    # describe the same snapshot, and the lookup is not repeated.
    transitions = graph.get_transitions()
    return {
        "session_id": session_id,
        "transitions": [
            {
                "id": t.id,
                "from_label": t.from_label,
                "to_label": t.to_label,
                "trigger": t.trigger,
                "reason": t.reason,
                "evidence": t.evidence,
                "confidence_delta": t.confidence_delta,
                "timestamp": t.timestamp,
                "context_line": t.to_context_line(),
            }
            for t in transitions
        ],
        "count": len(transitions),
    }
833
+
834
+
835
+
836
@app.post("/api/checkpoint", dependencies=[Depends(verify_api_key)])
async def create_manual_checkpoint(session_id: str):
    """
    Create a manual checkpoint for a session, snapshotting current graph
    state. Used by `tokenmizer checkpoint <session-id>` (CLI) and the
    `/tokenmizer:checkpoint` Claude Code skill, both of which call
    `POST /api/checkpoint?session_id=...`.

    Design note (from the original author): unlike the auto-checkpoint path
    in chat_completions(), a standalone HTTP call has no conversation
    attached, so `CheckpointManager.create()` is called with `messages=[]`.
    The checkpoint then snapshots whatever is ALREADY in the graph from
    prior chat turns (see tests/unit/test_graph_persistence.py for the
    equivalent pattern).

    Returns a dict with the checkpoint id, session id, number of nodes in
    the graph snapshot, resume token count, standard resume text and trigger.

    Raises:
        HTTPException: re-raised unchanged if one is raised while loading
            the graph or creating the checkpoint; any other failure is
            logged and reported as a 500 with the error message as detail.
    """
    try:
        graph = await _get_graph_async(session_id)
        ckpt = _checkpoint_mgr.create(
            session_id=session_id,
            messages=[],
            graph=graph,
            context_pct=0.0,
            trigger="manual",
        )
        return {
            "checkpoint_id": ckpt.checkpoint_id,
            "session_id": session_id,
            "node_count": len(ckpt.graph_snapshot.get("nodes", [])),
            "resume_tokens": ckpt.resume_tokens,
            "resume_standard": ckpt.resume_standard,
            "trigger": ckpt.trigger,
        }
    except HTTPException:
        # Keep deliberate HTTP errors intact instead of masking them as 500s,
        # matching the other handlers in this module.
        raise
    except Exception as e:
        logger.error(f"Manual checkpoint failed for session {session_id}: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
882
+
883
+
884
@app.get("/api/checkpoints/{session_id}", dependencies=[Depends(verify_api_key)])
async def list_checkpoints(session_id: str):
    """Return the checkpoint manager's listing of checkpoints for the session."""
    listing = _checkpoint_mgr.list_checkpoints(session_id)
    return listing
887
+
888
+
889
@app.post("/api/decision/invalidate", dependencies=[Depends(verify_api_key)])
async def invalidate_decision(session_id: str, decision_label: str, reason: str = ""):
    """
    Mark a decision as INVALIDATED (red) — explicitly wrong or cancelled.
    Use when a decision was made that turned out to be incorrect.
    History is preserved; decision is flagged as a warning in future resumes.

    Matching is a case-insensitive substring test of `decision_label`
    against each DECISION node's label; every matching node is invalidated
    and its summary is replaced with the (truncated) reason.

    Raises:
        HTTPException 400: `decision_label` is empty or whitespace-only.
        HTTPException 404: no decision node matches the label.
        HTTPException 500: any other failure (logged, message as detail).
    """
    label_lower = decision_label.lower().strip()
    if not label_lower:
        # "" is a substring of every label, so an empty label would silently
        # invalidate every decision in the session. Reject it up front.
        raise HTTPException(status_code=400, detail="decision_label must not be empty")

    try:
        from tokenmizer.graph_memory.graph import NodeStatus, NodeType
        graph = await _get_graph_async(session_id)
        # Loop-invariant: the same summary is written to every matching node.
        summary = f"Invalidated: {reason[:100]}" if reason else "Explicitly invalidated"
        found = False
        for node in graph._nodes.values():
            if node.type == NodeType.DECISION and label_lower in node.label.lower():
                node.status = NodeStatus.INVALIDATED
                node.summary = summary
                found = True
        if not found:
            raise HTTPException(
                status_code=404,
                detail=f"No decision matching '{decision_label}' found in session '{session_id}'"
            )
        # Direct node mutation above bypasses add_node's dirty-tracking —
        # force=True is required here or this write is silently skipped
        # (same class of bug the eviction path and prune() were already
        # protected against).
        graph._persist(force=True)
        return {
            "session_id": session_id,
            "invalidated": decision_label,
            "reason": reason,
            "status": "invalidated",
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Invalidate decision failed: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
930
+
931
+
932
@app.get("/api/resume/{session_id}", dependencies=[Depends(verify_api_key)])
async def get_resume(session_id: str, level: str = "standard"):
    """
    Get resume context for a session. level: critical | standard | full

    An unrecognized `level` falls back to "standard". The text is taken
    from the latest checkpoint of the session.

    Returns a dict with the session id, checkpoint id, effective level,
    resume text and its token count.

    Raises:
        HTTPException 404: the session has no checkpoint.
        HTTPException 500: any other failure (logged).
    """
    try:
        # Normalize unknown levels to the default rather than rejecting them.
        known_levels = ("critical", "standard", "full")
        level = level if level in known_levels else "standard"

        latest = _checkpoint_mgr.get_latest(session_id)
        if not latest:
            raise HTTPException(status_code=404, detail="No checkpoint found for session")

        text_by_level = {
            "critical": latest.resume_critical,
            "standard": latest.resume_standard,
            "full": latest.resume_full,
        }
        # level is guaranteed to be one of the keys after normalization.
        resume_text = text_by_level[level]
        token_total = count_tokens(resume_text)
        return {
            "session_id": session_id,
            "checkpoint_id": latest.checkpoint_id,
            "level": level,
            "resume_context": resume_text,
            "token_count": token_total,
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Resume failed for {session_id}: {e}")
        raise HTTPException(status_code=500, detail=f"Resume failed: {str(e)}")