remdb 0.3.14__py3-none-any.whl → 0.3.133__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. rem/agentic/README.md +76 -0
  2. rem/agentic/__init__.py +15 -0
  3. rem/agentic/agents/__init__.py +16 -2
  4. rem/agentic/agents/sse_simulator.py +502 -0
  5. rem/agentic/context.py +51 -27
  6. rem/agentic/llm_provider_models.py +301 -0
  7. rem/agentic/mcp/tool_wrapper.py +112 -17
  8. rem/agentic/otel/setup.py +93 -4
  9. rem/agentic/providers/phoenix.py +302 -109
  10. rem/agentic/providers/pydantic_ai.py +215 -26
  11. rem/agentic/schema.py +361 -21
  12. rem/agentic/tools/rem_tools.py +3 -3
  13. rem/api/README.md +215 -1
  14. rem/api/deps.py +255 -0
  15. rem/api/main.py +132 -40
  16. rem/api/mcp_router/resources.py +1 -1
  17. rem/api/mcp_router/server.py +26 -5
  18. rem/api/mcp_router/tools.py +465 -7
  19. rem/api/routers/admin.py +494 -0
  20. rem/api/routers/auth.py +70 -0
  21. rem/api/routers/chat/completions.py +402 -20
  22. rem/api/routers/chat/models.py +88 -10
  23. rem/api/routers/chat/otel_utils.py +33 -0
  24. rem/api/routers/chat/sse_events.py +542 -0
  25. rem/api/routers/chat/streaming.py +642 -45
  26. rem/api/routers/dev.py +81 -0
  27. rem/api/routers/feedback.py +268 -0
  28. rem/api/routers/messages.py +473 -0
  29. rem/api/routers/models.py +78 -0
  30. rem/api/routers/query.py +360 -0
  31. rem/api/routers/shared_sessions.py +406 -0
  32. rem/auth/middleware.py +126 -27
  33. rem/cli/commands/README.md +237 -64
  34. rem/cli/commands/cluster.py +1808 -0
  35. rem/cli/commands/configure.py +1 -3
  36. rem/cli/commands/db.py +386 -143
  37. rem/cli/commands/experiments.py +418 -27
  38. rem/cli/commands/process.py +14 -8
  39. rem/cli/commands/schema.py +97 -50
  40. rem/cli/main.py +27 -6
  41. rem/config.py +10 -3
  42. rem/models/core/core_model.py +7 -1
  43. rem/models/core/experiment.py +54 -0
  44. rem/models/core/rem_query.py +5 -2
  45. rem/models/entities/__init__.py +21 -0
  46. rem/models/entities/domain_resource.py +38 -0
  47. rem/models/entities/feedback.py +123 -0
  48. rem/models/entities/message.py +30 -1
  49. rem/models/entities/session.py +83 -0
  50. rem/models/entities/shared_session.py +180 -0
  51. rem/registry.py +10 -4
  52. rem/schemas/agents/rem.yaml +7 -3
  53. rem/services/content/service.py +92 -20
  54. rem/services/embeddings/api.py +4 -4
  55. rem/services/embeddings/worker.py +16 -16
  56. rem/services/phoenix/client.py +154 -14
  57. rem/services/postgres/README.md +159 -15
  58. rem/services/postgres/__init__.py +2 -1
  59. rem/services/postgres/diff_service.py +531 -0
  60. rem/services/postgres/pydantic_to_sqlalchemy.py +427 -129
  61. rem/services/postgres/repository.py +132 -0
  62. rem/services/postgres/schema_generator.py +205 -4
  63. rem/services/postgres/service.py +6 -6
  64. rem/services/rem/parser.py +44 -9
  65. rem/services/rem/service.py +36 -2
  66. rem/services/session/compression.py +24 -1
  67. rem/services/session/reload.py +1 -1
  68. rem/settings.py +324 -23
  69. rem/sql/background_indexes.sql +21 -16
  70. rem/sql/migrations/001_install.sql +387 -54
  71. rem/sql/migrations/002_install_models.sql +2320 -393
  72. rem/sql/migrations/003_optional_extensions.sql +326 -0
  73. rem/sql/migrations/004_cache_system.sql +548 -0
  74. rem/utils/__init__.py +18 -0
  75. rem/utils/date_utils.py +2 -2
  76. rem/utils/model_helpers.py +156 -1
  77. rem/utils/schema_loader.py +220 -22
  78. rem/utils/sql_paths.py +146 -0
  79. rem/utils/sql_types.py +3 -1
  80. rem/workers/__init__.py +3 -1
  81. rem/workers/db_listener.py +579 -0
  82. rem/workers/unlogged_maintainer.py +463 -0
  83. {remdb-0.3.14.dist-info → remdb-0.3.133.dist-info}/METADATA +335 -226
  84. {remdb-0.3.14.dist-info → remdb-0.3.133.dist-info}/RECORD +86 -66
  85. {remdb-0.3.14.dist-info → remdb-0.3.133.dist-info}/WHEEL +1 -1
  86. rem/sql/002_install_models.sql +0 -1068
  87. rem/sql/install_models.sql +0 -1051
  88. rem/sql/migrations/003_seed_default_user.sql +0 -48
  89. {remdb-0.3.14.dist-info → remdb-0.3.133.dist-info}/entry_points.txt +0 -0
rem/api/routers/chat/completions.py
@@ -1,13 +1,94 @@
  """
  OpenAI-compatible chat completions router for REM.

- Design Pattern:
- - Headers map to AgentContext (X-User-Id, X-Tenant-Id, X-Session-Id, X-Agent-Schema)
+ Quick Start (Local Development)
+ ===============================
+
+ NOTE: Local dev uses LOCAL databases (Postgres via Docker Compose on port 5050).
+ Do NOT port-forward databases. Only port-forward observability services.
+
+ IMPORTANT: Session IDs MUST be UUIDs. Non-UUID session IDs will cause message
+ storage issues and feedback will not work correctly.
+
+ 1. Port Forwarding (REQUIRED for trace capture and Phoenix sync):
+
+     # Terminal 1: OTEL Collector (HTTP) - sends traces to Phoenix
+     kubectl port-forward -n observability svc/otel-collector-collector 4318:4318
+
+     # Terminal 2: Phoenix UI - view traces at http://localhost:6006
+     kubectl port-forward -n siggy svc/phoenix 6006:6006
+
+ 2. Get Phoenix API Key (REQUIRED for feedback->Phoenix sync):
+
+     export PHOENIX_API_KEY=$(kubectl get secret -n siggy rem-phoenix-api-key \\
+         -o jsonpath='{.data.PHOENIX_API_KEY}' | base64 -d)
+
+ 3. Start API with OTEL and Phoenix enabled:
+
+     cd /path/to/remstack/rem
+     source .venv/bin/activate
+     OTEL__ENABLED=true \\
+     PHOENIX__ENABLED=true \\
+     PHOENIX_API_KEY="$PHOENIX_API_KEY" \\
+     uvicorn rem.api.main:app --host 0.0.0.0 --port 8000 --app-dir src
+
+ 4. Test Chat Request (session_id MUST be a UUID):
+
+     SESSION_ID=$(python3 -c "import uuid; print(uuid.uuid4())")
+     curl -s -N -X POST http://localhost:8000/api/v1/chat/completions \\
+         -H 'Content-Type: application/json' \\
+         -H "X-Session-Id: $SESSION_ID" \\
+         -H 'X-Agent-Schema: rem' \\
+         -d '{"messages": [{"role": "user", "content": "Hello"}], "stream": true}'
+
+     # Note: Use 'rem' agent schema (default) for real LLM responses.
+     # The 'simulator' agent is for testing SSE events without LLM calls.
+
+ 5. Submit Feedback on Response:
+
+     The metadata SSE event contains message_id and trace_id for feedback:
+         event: metadata
+         data: {"message_id": "728882f8-...", "trace_id": "e53c701c...", ...}
+
+     Use session_id (UUID you generated) and message_id to submit feedback:
+
+     curl -X POST http://localhost:8000/api/v1/messages/feedback \\
+         -H 'Content-Type: application/json' \\
+         -H 'X-Tenant-Id: default' \\
+         -d '{
+             "session_id": "<your-uuid-session-id>",
+             "message_id": "<message-id-from-metadata>",
+             "rating": 1,
+             "categories": ["helpful"],
+             "comment": "Good response"
+         }'
+
+     Expected response (201 = synced to Phoenix):
+         {"phoenix_synced": true, "trace_id": "e53c701c...", "span_id": "6432d497..."}
+
+ OTEL Architecture
+ =================
+
+     REM API --[OTLP/HTTP]--> OTEL Collector --[relay]--> Phoenix
+     (port 4318)              (k8s: observability)        (k8s: siggy)
+
+ Environment Variables:
+     OTEL__ENABLED=true        Enable OTEL tracing (required for trace capture)
+     PHOENIX__ENABLED=true     Enable Phoenix integration (required for feedback sync)
+     PHOENIX_API_KEY=<jwt>     Phoenix API key (required for feedback->Phoenix sync)
+     OTEL__COLLECTOR_ENDPOINT  Default: http://localhost:4318
+     OTEL__PROTOCOL            Default: http (use port 4318, not gRPC 4317)
+
+ Design Pattern
+ ==============
+
+ - Headers map to AgentContext (X-User-Id, X-Tenant-Id, X-Session-Id, X-Agent-Schema, X-Is-Eval)
  - ContextBuilder centralizes message construction with user profile + session history
  - Body.model is the LLM model for Pydantic AI
  - X-Agent-Schema header specifies which agent schema to use (defaults to 'rem')
  - Support for streaming (SSE) and non-streaming modes
  - Response format control (text vs json_object)
+ - OpenAI-compatible body fields: metadata, store, reasoning_effort, etc.

  Context Building Flow:
  1. ContextBuilder.build_from_headers() extracts user_id, session_id from headers
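Note: the curl flow in steps 4-5 of the new Quick Start translates directly to Python. A minimal sketch, assuming a local API on port 8000 and httpx as the client (both assumptions, not part of this diff); the message_id placeholder is left elided on purpose:

    import uuid
    import httpx

    session_id = str(uuid.uuid4())  # step 4: session IDs MUST be UUIDs
    chat = httpx.post(
        "http://localhost:8000/api/v1/chat/completions",
        headers={"X-Session-Id": session_id, "X-Agent-Schema": "rem"},
        json={"messages": [{"role": "user", "content": "Hello"}], "stream": False},
        timeout=60,
    )
    chat.raise_for_status()

    # Step 5: when streaming, message_id arrives in the metadata SSE event.
    feedback = httpx.post(
        "http://localhost:8000/api/v1/messages/feedback",
        headers={"X-Tenant-Id": "default"},
        json={
            "session_id": session_id,
            "message_id": "<message-id-from-metadata>",
            "rating": 1,
            "categories": ["helpful"],
            "comment": "Good response",
        },
    )
    print(feedback.status_code)  # 201 indicates the feedback was synced to Phoenix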
@@ -25,9 +106,10 @@ Context Building Flow:
  Headers Mapping
  X-User-Id → AgentContext.user_id
  X-Tenant-Id → AgentContext.tenant_id
- X-Session-Id → AgentContext.session_id
+ X-Session-Id → AgentContext.session_id (use UUID for new sessions)
  X-Model-Name → AgentContext.default_model (overrides body.model)
  X-Agent-Schema → AgentContext.agent_schema_uri (defaults to 'rem')
+ X-Is-Eval → AgentContext.is_eval (sets session mode to EVALUATION)

  Default Agent:
  If X-Agent-Schema header is not provided, the system loads 'rem' schema,
@@ -42,6 +124,7 @@ Example Request:
  POST /api/v1/chat/completions
  X-Tenant-Id: acme-corp
  X-User-Id: user123
+ X-Session-Id: a1b2c3d4-e5f6-7890-abcd-ef1234567890  # UUID
  X-Agent-Schema: rem  # Optional, this is the default

  {
@@ -67,10 +150,12 @@ from loguru import logger
  from ....agentic.context import AgentContext
  from ....agentic.context_builder import ContextBuilder
  from ....agentic.providers.pydantic_ai import create_agent
+ from ....models.entities.session import Session, SessionMode
  from ....services.audio.transcriber import AudioTranscriber
+ from ....services.postgres.repository import Repository
  from ....services.session import SessionMessageStore, reload_session
  from ....settings import settings
- from ....utils.schema_loader import load_agent_schema
+ from ....utils.schema_loader import load_agent_schema, load_agent_schema_async
  from .json_utils import extract_json_resilient
  from .models import (
      ChatCompletionChoice,
@@ -79,14 +164,113 @@ from .models import (
      ChatCompletionUsage,
      ChatMessage,
  )
- from .streaming import stream_openai_response
+ from .streaming import stream_openai_response, stream_openai_response_with_save, stream_simulator_response

- router = APIRouter(prefix="/v1", tags=["chat"])
+ router = APIRouter(prefix="/api/v1", tags=["chat"])

  # Default agent schema file
  DEFAULT_AGENT_SCHEMA = "rem"


+ def get_current_trace_context() -> tuple[str | None, str | None]:
+     """Get trace_id and span_id from current OTEL context.
+
+     Returns:
+         Tuple of (trace_id, span_id) as hex strings, or (None, None) if not available.
+     """
+     try:
+         from opentelemetry import trace
+         span = trace.get_current_span()
+         if span and span.get_span_context().is_valid:
+             ctx = span.get_span_context()
+             trace_id = format(ctx.trace_id, '032x')
+             span_id = format(ctx.span_id, '016x')
+             return trace_id, span_id
+     except Exception:
+         pass
+     return None, None
+
+
+ def get_tracer():
+     """Get the OpenTelemetry tracer for chat completions."""
+     try:
+         from opentelemetry import trace
+         return trace.get_tracer("rem.chat.completions")
+     except Exception:
+         return None
+
+
+ async def ensure_session_with_metadata(
+     session_id: str,
+     user_id: str | None,
+     tenant_id: str,
+     is_eval: bool,
+     request_metadata: dict[str, str] | None,
+     agent_schema: str | None = None,
+ ) -> None:
+     """
+     Ensure session exists and update with metadata/mode.
+
+     If X-Is-Eval header is true, sets session mode to EVALUATION.
+     Merges request metadata with existing session metadata.
+
+     Args:
+         session_id: Session identifier (maps to Session.name)
+         user_id: User identifier
+         tenant_id: Tenant identifier
+         is_eval: Whether this is an evaluation session
+         request_metadata: Metadata from request body to merge
+         agent_schema: Optional agent schema being used
+     """
+     if not settings.postgres.enabled:
+         return
+
+     try:
+         repo = Repository(Session, table_name="sessions")
+
+         # Try to load existing session by name (session_id is the name field)
+         existing_list = await repo.find(
+             filters={"name": session_id, "tenant_id": tenant_id},
+             limit=1,
+         )
+         existing = existing_list[0] if existing_list else None
+
+         if existing:
+             # Merge metadata if provided
+             merged_metadata = existing.metadata or {}
+             if request_metadata:
+                 merged_metadata.update(request_metadata)
+
+             # Update session if eval flag or new metadata
+             needs_update = False
+             if is_eval and existing.mode != SessionMode.EVALUATION:
+                 existing.mode = SessionMode.EVALUATION
+                 needs_update = True
+             if request_metadata:
+                 existing.metadata = merged_metadata
+                 needs_update = True
+
+             if needs_update:
+                 await repo.upsert(existing)
+                 logger.debug(f"Updated session {session_id} (eval={is_eval}, metadata keys={list(merged_metadata.keys())})")
+         else:
+             # Create new session
+             session = Session(
+                 name=session_id,
+                 mode=SessionMode.EVALUATION if is_eval else SessionMode.NORMAL,
+                 user_id=user_id,
+                 tenant_id=tenant_id,
+                 agent_schema_uri=agent_schema,
+                 metadata=request_metadata or {},
+             )
+             await repo.upsert(session)
+             logger.info(f"Created session {session_id} (eval={is_eval})")
+
+     except Exception as e:
+         # Non-critical - log but don't fail the request
+         logger.error(f"Failed to ensure session metadata: {e}", exc_info=True)
+
+
  @router.post("/chat/completions", response_model=None)
  async def chat_completions(body: ChatCompletionRequest, request: Request):
      """
@@ -102,6 +286,17 @@ async def chat_completions(body: ChatCompletionRequest, request: Request):
      | X-Tenant-Id | Tenant identifier (multi-tenancy) | AgentContext.tenant_id | "default" |
      | X-Session-Id | Session/conversation identifier | AgentContext.session_id | None |
      | X-Agent-Schema | Agent schema name | AgentContext.agent_schema_uri | "rem" |
+     | X-Is-Eval | Mark as evaluation session | AgentContext.is_eval | false |
+
+     Additional OpenAI-compatible Body Fields:
+     - metadata: Key-value pairs merged with session metadata (max 16 keys)
+     - store: Whether to store for distillation/evaluation
+     - max_completion_tokens: Max tokens to generate (replaces max_tokens)
+     - seed: Seed for deterministic sampling
+     - top_p: Nucleus sampling probability
+     - logprobs: Return log probabilities
+     - reasoning_effort: low/medium/high for o-series models
+     - service_tier: auto/flex/priority/default

      Example Models:
      - anthropic:claude-sonnet-4-5-20250929 (Claude 4.5 Sonnet)
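Note: combining the headers table with the body fields listed above, a representative non-streaming request body might look like this (all values illustrative):

    body = {
        "model": "anthropic:claude-sonnet-4-5-20250929",
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": False,
        "metadata": {"experiment": "prompt-v2", "variant": "b"},  # merged into session metadata
        "store": True,
        "max_completion_tokens": 512,
        "seed": 42,
        "reasoning_effort": "medium",
        "service_tier": "flex",
    }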
@@ -127,15 +322,137 @@ async def chat_completions(body: ChatCompletionRequest, request: Request):
      - If CHAT__AUTO_INJECT_USER_CONTEXT=true: User profile auto-loaded and injected
      - New messages saved to database with compression for session continuity
      - When Postgres is disabled, session management is skipped
+
+     Evaluation Sessions:
+     - Set X-Is-Eval: true header to mark session as evaluation
+     - Session mode will be set to EVALUATION
+     - Request metadata is merged with session metadata
+     - Useful for A/B testing, model comparison, and feedback collection
      """
      # Load agent schema: use header value from context or default
      # Extract AgentContext first to get schema name
      temp_context = AgentContext.from_headers(dict(request.headers))
      schema_name = temp_context.agent_schema_uri or DEFAULT_AGENT_SCHEMA

+     # Resolve model: use body.model if provided, otherwise settings default
+     if body.model is None:
+         body.model = settings.llm.default_model
+         logger.debug(f"No model specified, using default: {body.model}")
+
+     # Special handling for simulator schema - no LLM, just generates demo SSE events
+     # Check BEFORE loading schema since simulator doesn't need a schema file
+     # Still builds full context and saves messages like a real agent
+     if schema_name == "simulator":
+         logger.info("Using SSE simulator (no LLM)")
+
+         # Build context just like real agents (loads session history, user context)
+         new_messages = [msg.model_dump() for msg in body.messages]
+         context, messages = await ContextBuilder.build_from_headers(
+             headers=dict(request.headers),
+             new_messages=new_messages,
+         )
+
+         # Ensure session exists with metadata and eval mode if applicable
+         if context.session_id:
+             await ensure_session_with_metadata(
+                 session_id=context.session_id,
+                 user_id=context.user_id,
+                 tenant_id=context.tenant_id,
+                 is_eval=context.is_eval,
+                 request_metadata=body.metadata,
+                 agent_schema="simulator",
+             )
+
+         # Get the last user message as prompt
+         prompt = body.messages[-1].content if body.messages else "demo"
+         request_id = f"sim-{uuid.uuid4().hex[:24]}"
+
+         # Generate message IDs upfront for correlation
+         user_message_id = str(uuid.uuid4())
+         assistant_message_id = str(uuid.uuid4())
+
+         # Simulated assistant response content (for persistence)
+         simulated_content = (
+             f"[SSE Simulator Response]\n\n"
+             f"This is a simulated response demonstrating all SSE event types:\n"
+             f"- reasoning events (model thinking)\n"
+             f"- text_delta events (streamed content)\n"
+             f"- progress events (multi-step operations)\n"
+             f"- tool_call events (function invocations)\n"
+             f"- action_request events (UI solicitation)\n"
+             f"- metadata events (confidence, sources, message IDs)\n\n"
+             f"Original prompt: {prompt[:100]}{'...' if len(prompt) > 100 else ''}"
+         )
+
+         # Save messages to database (if session_id and postgres enabled)
+         if settings.postgres.enabled and context.session_id:
+             user_message = {
+                 "id": user_message_id,
+                 "role": "user",
+                 "content": prompt,
+                 "timestamp": datetime.utcnow().isoformat(),
+             }
+             assistant_message = {
+                 "id": assistant_message_id,
+                 "role": "assistant",
+                 "content": simulated_content,
+                 "timestamp": datetime.utcnow().isoformat(),
+             }
+
+             try:
+                 store = SessionMessageStore(user_id=context.user_id or settings.test.effective_user_id)
+                 await store.store_session_messages(
+                     session_id=context.session_id,
+                     messages=[user_message, assistant_message],
+                     user_id=context.user_id,
+                     compress=True,
+                 )
+                 logger.info(f"Saved simulator conversation to session {context.session_id}")
+             except Exception as e:
+                 # Log error but don't fail the request - session storage is non-critical
+                 logger.error(f"Failed to save session messages: {e}", exc_info=True)
+
+         if body.stream:
+             return StreamingResponse(
+                 stream_simulator_response(
+                     prompt=prompt,
+                     model="simulator-v1.0.0",
+                     # Pass message correlation IDs
+                     message_id=assistant_message_id,
+                     in_reply_to=user_message_id,
+                     session_id=context.session_id,
+                 ),
+                 media_type="text/event-stream",
+                 headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
+             )
+         else:
+             # Non-streaming simulator returns simple JSON
+             return ChatCompletionResponse(
+                 id=request_id,
+                 created=int(time.time()),
+                 model="simulator-v1.0.0",
+                 choices=[
+                     ChatCompletionChoice(
+                         index=0,
+                         message=ChatMessage(
+                             role="assistant",
+                             content=simulated_content,
+                         ),
+                         finish_reason="stop",
+                     )
+                 ],
+                 usage=ChatCompletionUsage(prompt_tokens=0, completion_tokens=0, total_tokens=0),
+             )
+
      # Load schema using centralized utility
+     # Enable database fallback to load dynamic agents stored in schemas table
+     # Use async version since we're in an async context (FastAPI endpoint)
+     user_id = temp_context.user_id or settings.test.effective_user_id
      try:
-         agent_schema = load_agent_schema(schema_name)
+         agent_schema = await load_agent_schema_async(
+             schema_name,
+             user_id=user_id,
+         )
      except FileNotFoundError:
          # Fallback to default if specified schema not found
          logger.warning(f"Schema '{schema_name}' not found, falling back to '{DEFAULT_AGENT_SCHEMA}'")
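Note: the simulator branch above can be smoke-tested without any LLM credentials. A minimal sketch, assuming a local API on port 8000 and httpx as the client (neither is part of this diff):

    import uuid
    import httpx

    session_id = str(uuid.uuid4())  # session IDs MUST be UUIDs
    with httpx.stream(
        "POST",
        "http://localhost:8000/api/v1/chat/completions",
        headers={"X-Session-Id": session_id, "X-Agent-Schema": "simulator"},
        json={"messages": [{"role": "user", "content": "demo"}], "stream": True},
    ) as response:
        for line in response.iter_lines():
            if line:
                print(line)  # SSE frames: reasoning, text_delta, progress, tool_call, metadata, ...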
@@ -151,7 +468,7 @@ async def chat_completions(body: ChatCompletionRequest, request: Request):
              detail=f"Agent schema '{schema_name}' not found and default schema unavailable",
          )

-     logger.info(f"Using agent schema: {schema_name}, model: {body.model}")
+     logger.debug(f"Using agent schema: {schema_name}, model: {body.model}")

      # Check for audio input
      is_audio = request.headers.get("x-chat-is-audio", "").lower() == "true"
@@ -196,6 +513,17 @@ async def chat_completions(body: ChatCompletionRequest, request: Request):

      logger.info(f"Built context with {len(messages)} total messages (includes history + user context)")

+     # Ensure session exists with metadata and eval mode if applicable
+     if context.session_id:
+         await ensure_session_with_metadata(
+             session_id=context.session_id,
+             user_id=context.user_id,
+             tenant_id=context.tenant_id,
+             is_eval=context.is_eval,
+             request_metadata=body.metadata,
+             agent_schema=schema_name,
+         )
+
      # Create agent with schema and model override
      agent = await create_agent(
          context=context,
@@ -212,14 +540,60 @@ async def chat_completions(body: ChatCompletionRequest, request: Request):

      # Streaming mode
      if body.stream:
+         # Save user message before streaming starts
+         if settings.postgres.enabled and context.session_id:
+             user_message = {
+                 "role": "user",
+                 "content": body.messages[-1].content if body.messages else "",
+                 "timestamp": datetime.utcnow().isoformat(),
+             }
+             try:
+                 store = SessionMessageStore(user_id=context.user_id or settings.test.effective_user_id)
+                 await store.store_session_messages(
+                     session_id=context.session_id,
+                     messages=[user_message],
+                     user_id=context.user_id,
+                     compress=False,  # User messages are typically short
+                 )
+                 logger.debug(f"Saved user message to session {context.session_id}")
+             except Exception as e:
+                 logger.error(f"Failed to save user message: {e}", exc_info=True)
+
          return StreamingResponse(
-             stream_openai_response(agent, prompt, body.model, request_id),
+             stream_openai_response_with_save(
+                 agent=agent,
+                 prompt=prompt,
+                 model=body.model,
+                 request_id=request_id,
+                 agent_schema=schema_name,
+                 session_id=context.session_id,
+                 user_id=context.user_id,
+             ),
              media_type="text/event-stream",
              headers={"Cache-Control": "no-cache", "Connection": "keep-alive"},
          )

      # Non-streaming mode
-     result = await agent.run(prompt)
+     # Create a parent span to capture trace context for message storage
+     trace_id, span_id = None, None
+     tracer = get_tracer()
+
+     if tracer:
+         with tracer.start_as_current_span(
+             "chat_completion",
+             attributes={
+                 "session.id": context.session_id or "",
+                 "user.id": context.user_id or "",
+                 "model": body.model,
+                 "agent.schema": context.agent_schema_uri or DEFAULT_AGENT_SCHEMA,
+             }
+         ) as span:
+             # Capture trace context from the span we just created
+             trace_id, span_id = get_current_trace_context()
+             result = await agent.run(prompt)
+     else:
+         # No tracer available, run without tracing
+         result = await agent.run(prompt)

      # Determine content format based on response_format request
      if body.response_format and body.response_format.type == "json_object":
@@ -242,25 +616,33 @@ async def chat_completions(body: ChatCompletionRequest, request: Request):
              "role": "user",
              "content": body.messages[-1].content if body.messages else "",
              "timestamp": datetime.utcnow().isoformat(),
+             "trace_id": trace_id,
+             "span_id": span_id,
          }

          assistant_message = {
              "role": "assistant",
              "content": content,
              "timestamp": datetime.utcnow().isoformat(),
+             "trace_id": trace_id,
+             "span_id": span_id,
          }

-         # Store messages with compression
-         store = SessionMessageStore(user_id=context.user_id or settings.test.effective_user_id)
-
-         await store.store_session_messages(
-             session_id=context.session_id,
-             messages=[user_message, assistant_message],
-             user_id=context.user_id,
-             compress=True,
-         )
+         try:
+             # Store messages with compression
+             store = SessionMessageStore(user_id=context.user_id or settings.test.effective_user_id)
+
+             await store.store_session_messages(
+                 session_id=context.session_id,
+                 messages=[user_message, assistant_message],
+                 user_id=context.user_id,
+                 compress=True,
+             )

-         logger.info(f"Saved conversation to session {context.session_id}")
+             logger.info(f"Saved conversation to session {context.session_id}")
+         except Exception as e:
+             # Log error but don't fail the request - session storage is non-critical
+             logger.error(f"Failed to save session messages: {e}", exc_info=True)

      return ChatCompletionResponse(
          id=request_id,
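Note: because trace_id/span_id are now persisted on every stored message, the feedback router can attach a rating to the exact Phoenix span that produced the response. The shape of a stored message after this change (values fabricated for illustration):

    {
        "role": "assistant",
        "content": "...",
        "timestamp": "2025-01-01T00:00:00",
        "trace_id": "e53c701c...",  # 32-char hex when OTEL is enabled, else None
        "span_id": "6432d497...",   # 16-char hex when OTEL is enabled, else None
    }

The remaining hunks are from rem/api/routers/chat/models.py (file 22 in the list above).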
rem/api/routers/chat/models.py
@@ -1,17 +1,43 @@
  """
  OpenAI-compatible API models for chat completions.

- Design Pattern
+ Design Pattern:
  - Full OpenAI compatibility for drop-in replacement
  - Support for streaming (SSE) and non-streaming modes
  - Response format control (text vs json_object)
- - Headers map to AgentContext (X-User-Id, X-Tenant-Id, X-Agent-Schema, etc.)
+ - Headers map to AgentContext for session/context control
+ - Body fields for OpenAI-compatible parameters + metadata
+
+ Headers (context control):
+     X-User-Id       → context.user_id (user identifier)
+     X-Tenant-Id     → context.tenant_id (multi-tenancy, default: "default")
+     X-Session-Id    → context.session_id (conversation continuity)
+     X-Agent-Schema  → context.agent_schema_uri (which agent to use, default: "rem")
+     X-Model-Name    → context.default_model (model override)
+     X-Chat-Is-Audio → triggers audio transcription ("true"/"false")
+     X-Is-Eval       → context.is_eval (marks session as evaluation, sets mode=EVALUATION)
+
+ Body Fields (OpenAI-compatible + extensions):
+     model                 → LLM model (e.g., "openai:gpt-4.1", "anthropic:claude-sonnet-4-5-20250929")
+     messages              → Chat conversation history
+     temperature           → Sampling temperature (0-2)
+     max_tokens            → Max tokens (deprecated, use max_completion_tokens)
+     max_completion_tokens → Max tokens to generate
+     stream                → Enable SSE streaming
+     metadata              → Key-value pairs merged with session metadata (for evals/experiments)
+     store                 → Whether to store for distillation/evaluation
+     seed                  → Deterministic sampling seed
+     top_p                 → Nucleus sampling probability
+     reasoning_effort      → low/medium/high for o-series models
+     service_tier          → auto/flex/priority/default
  """

- from typing import Literal
+ from typing import Any, Literal

  from pydantic import BaseModel, Field

+ from rem.settings import settings
+

  # Request models
  class ChatMessage(BaseModel):
@@ -44,17 +70,26 @@ class ChatCompletionRequest(BaseModel):
      Compatible with OpenAI's /v1/chat/completions endpoint.

      Headers Map to AgentContext:
-     - X-User-Id → context.user_id
-     - X-Tenant-Id → context.tenant_id
-     - X-Session-Id → context.session_id
-     - X-Agent-Schema → context.agent_schema_uri
+         X-User-Id       → context.user_id
+         X-Tenant-Id     → context.tenant_id (default: "default")
+         X-Session-Id    → context.session_id
+         X-Agent-Schema  → context.agent_schema_uri (default: "rem")
+         X-Model-Name    → context.default_model
+         X-Chat-Is-Audio → triggers audio transcription
+         X-Is-Eval       → context.is_eval (sets session mode=EVALUATION)
+
+     Body Fields for Metadata/Evals:
+         metadata → Key-value pairs merged with session metadata
+         store    → Whether to store for distillation/evaluation

      Note: Model is specified in body.model (standard OpenAI field), not headers.
      """

-     model: str = Field(
-         default="anthropic:claude-sonnet-4-5-20250929",
-         description="Model to use (standard OpenAI field)",
+     # TODO: default should come from settings.llm.default_model at request time
+     # Using None and resolving in endpoint to avoid import-time settings evaluation
+     model: str | None = Field(
+         default=None,
+         description="Model to use. Defaults to LLM__DEFAULT_MODEL from settings.",
      )
      messages: list[ChatMessage] = Field(description="Chat conversation history")
      temperature: float | None = Field(default=None, ge=0, le=2)
@@ -69,6 +104,49 @@ class ChatCompletionRequest(BaseModel):
          default=None,
          description="Response format. Set type='json_object' to enable JSON mode.",
      )
+     # Additional OpenAI-compatible fields
+     metadata: dict[str, str] | None = Field(
+         default=None,
+         description="Key-value pairs attached to the request (max 16 keys, 64/512 char limits). "
+         "Merged with session metadata for persistence.",
+     )
+     store: bool | None = Field(
+         default=None,
+         description="Whether to store for distillation/evaluation purposes.",
+     )
+     max_completion_tokens: int | None = Field(
+         default=None,
+         ge=1,
+         description="Max tokens to generate (replaces deprecated max_tokens).",
+     )
+     seed: int | None = Field(
+         default=None,
+         description="Seed for deterministic sampling (best effort).",
+     )
+     top_p: float | None = Field(
+         default=None,
+         ge=0,
+         le=1,
+         description="Nucleus sampling probability. Use temperature OR top_p, not both.",
+     )
+     logprobs: bool | None = Field(
+         default=None,
+         description="Whether to return log probabilities for output tokens.",
+     )
+     top_logprobs: int | None = Field(
+         default=None,
+         ge=0,
+         le=20,
+         description="Number of most likely tokens to return at each position (requires logprobs=true).",
+     )
+     reasoning_effort: Literal["low", "medium", "high"] | None = Field(
+         default=None,
+         description="Reasoning effort for o-series models (low/medium/high).",
+     )
+     service_tier: Literal["auto", "flex", "priority", "default"] | None = Field(
+         default=None,
+         description="Service tier for processing (flex is 50% cheaper but slower).",
+     )


  # Response models
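Note: a quick sketch of the extended request model in use, exercising the new field constraints (all values illustrative):

    req = ChatCompletionRequest(
        messages=[ChatMessage(role="user", content="Hello")],
        metadata={"experiment": "prompt-v2"},  # merged into session metadata
        store=True,
        max_completion_tokens=256,  # ge=1 enforced by Pydantic
        top_p=0.9,                  # ge=0, le=1 enforced by Pydantic
        logprobs=True,
        top_logprobs=5,             # ge=0, le=20; requires logprobs=true
        reasoning_effort="high",    # Literal["low", "medium", "high"]
    )
    assert req.model is None  # resolved to settings.llm.default_model in the endpoint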