lucidicai 2.0.2__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lucidicai/__init__.py +367 -899
- lucidicai/api/__init__.py +1 -0
- lucidicai/api/client.py +218 -0
- lucidicai/api/resources/__init__.py +1 -0
- lucidicai/api/resources/dataset.py +192 -0
- lucidicai/api/resources/event.py +88 -0
- lucidicai/api/resources/session.py +126 -0
- lucidicai/core/__init__.py +1 -0
- lucidicai/core/config.py +223 -0
- lucidicai/core/errors.py +60 -0
- lucidicai/core/types.py +35 -0
- lucidicai/sdk/__init__.py +1 -0
- lucidicai/sdk/context.py +231 -0
- lucidicai/sdk/decorators.py +187 -0
- lucidicai/sdk/error_boundary.py +299 -0
- lucidicai/sdk/event.py +126 -0
- lucidicai/sdk/event_builder.py +304 -0
- lucidicai/sdk/features/__init__.py +1 -0
- lucidicai/sdk/features/dataset.py +605 -0
- lucidicai/sdk/features/feature_flag.py +383 -0
- lucidicai/sdk/init.py +361 -0
- lucidicai/sdk/shutdown_manager.py +302 -0
- lucidicai/telemetry/context_bridge.py +82 -0
- lucidicai/telemetry/context_capture_processor.py +25 -9
- lucidicai/telemetry/litellm_bridge.py +20 -24
- lucidicai/telemetry/lucidic_exporter.py +99 -60
- lucidicai/telemetry/openai_patch.py +295 -0
- lucidicai/telemetry/openai_uninstrument.py +87 -0
- lucidicai/telemetry/telemetry_init.py +16 -1
- lucidicai/telemetry/utils/model_pricing.py +278 -0
- lucidicai/utils/__init__.py +1 -0
- lucidicai/utils/images.py +337 -0
- lucidicai/utils/logger.py +168 -0
- lucidicai/utils/queue.py +393 -0
- {lucidicai-2.0.2.dist-info → lucidicai-2.1.1.dist-info}/METADATA +1 -1
- {lucidicai-2.0.2.dist-info → lucidicai-2.1.1.dist-info}/RECORD +38 -9
- {lucidicai-2.0.2.dist-info → lucidicai-2.1.1.dist-info}/WHEEL +0 -0
- {lucidicai-2.0.2.dist-info → lucidicai-2.1.1.dist-info}/top_level.txt +0 -0
lucidicai/sdk/init.py
ADDED
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
"""SDK initialization module.
|
|
2
|
+
|
|
3
|
+
This module handles SDK initialization, separating concerns from the main __init__.py
|
|
4
|
+
"""
|
|
5
|
+
import uuid
|
|
6
|
+
from typing import List, Optional
|
|
7
|
+
import asyncio
|
|
8
|
+
import threading
|
|
9
|
+
from weakref import WeakKeyDictionary
|
|
10
|
+
|
|
11
|
+
from ..api.client import HttpClient
|
|
12
|
+
from ..api.resources.event import EventResource
|
|
13
|
+
from ..api.resources.session import SessionResource
|
|
14
|
+
from ..api.resources.dataset import DatasetResource
|
|
15
|
+
from ..core.config import SDKConfig, get_config, set_config
|
|
16
|
+
from ..utils.queue import EventQueue
|
|
17
|
+
from ..utils.logger import debug, info, warning, error, truncate_id
|
|
18
|
+
from .context import set_active_session, current_session_id
|
|
19
|
+
from .error_boundary import register_cleanup_handler
|
|
20
|
+
from .shutdown_manager import get_shutdown_manager, SessionState
|
|
21
|
+
from ..telemetry.telemetry_init import instrument_providers
|
|
22
|
+
from opentelemetry.sdk.trace import TracerProvider
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class SDKState:
    """Container for SDK runtime state.

    Holds the HTTP client, event queue, tracer provider and API resource
    instances created by ``init()``, plus the per-task and per-thread
    session bookkeeping used by the session lookup helpers in this module.
    """

    def __init__(self):
        self.http: Optional[HttpClient] = None
        self.event_queue: Optional[EventQueue] = None
        self.session_id: Optional[str] = None
        self.tracer_provider: Optional[TracerProvider] = None
        self.resources = {}
        # Task-local storage for async task isolation. Keys are held weakly,
        # so an entry goes away once its task object is garbage collected.
        self.task_sessions: WeakKeyDictionary = WeakKeyDictionary()
        # Thread-local storage for thread isolation
        self.thread_local = threading.local()

    def reset(self):
        """Shut down owned components and return to the uninitialised state.

        Telemetry is flushed and shut down first (errors there are logged and
        swallowed), then the event queue is shut down and the HTTP client is
        closed. All references are cleared even when the queue shutdown or
        the HTTP close raises; the exception still propagates to the caller.

        Only the *calling* thread's thread-local session is removed.
        """
        try:
            # Shutdown telemetry first to ensure all spans are exported
            if self.tracer_provider:
                try:
                    # Force flush all pending spans with 5 second timeout
                    debug("[SDK] Flushing OpenTelemetry spans...")
                    self.tracer_provider.force_flush(timeout_millis=5000)
                    # Shutdown the tracer provider and all processors
                    self.tracer_provider.shutdown()
                    debug("[SDK] TracerProvider shutdown complete")
                except Exception as e:
                    error(f"[SDK] Error shutting down TracerProvider: {e}")

            if self.event_queue:
                self.event_queue.shutdown()
        finally:
            try:
                # Close the HTTP client even if the queue shutdown failed.
                if self.http:
                    self.http.close()
            finally:
                # Always drop references so a failed shutdown cannot leave
                # half-closed components reachable through this object.
                self.http = None
                self.event_queue = None
                self.session_id = None
                self.tracer_provider = None
                self.resources = {}
                self.task_sessions.clear()
                # Clear thread-local storage for current thread
                if hasattr(self.thread_local, 'session_id'):
                    delattr(self.thread_local, 'session_id')
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# Global SDK state
|
|
70
|
+
# Process-wide SDKState instance shared by the functions in this module;
# clear_state() rebinds this name to a fresh SDKState.
_sdk_state = SDKState()
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def init(
    session_name: Optional[str] = None,
    session_id: Optional[str] = None,
    api_key: Optional[str] = None,
    agent_id: Optional[str] = None,
    task: Optional[str] = None,
    providers: Optional[List[str]] = None,
    production_monitoring: bool = False,
    experiment_id: Optional[str] = None,
    evaluators: Optional[List] = None,
    tags: Optional[List] = None,
    datasetitem_id: Optional[str] = None,
    masking_function: Optional[callable] = None,
    auto_end: bool = True,
    capture_uncaught: bool = True,
) -> str:
    """Initialize the Lucidic SDK.

    Builds and validates the configuration, lazily creates the shared HTTP
    client, API resources and event queue, creates a session through the
    sessions resource, registers it with the shutdown manager and, when
    ``providers`` is given, sets up telemetry.

    Args:
        session_name: Name for the session ('Unnamed Session' when omitted)
        session_id: Custom session ID (optional; a UUID4 is generated otherwise)
        api_key: API key (uses env if not provided)
        agent_id: Agent ID (uses env if not provided)
        task: Task description
        providers: List of telemetry providers to instrument
        production_monitoring: Enable production monitoring
        experiment_id: Experiment ID to associate with session
        evaluators: Evaluators to use
        tags: Session tags
        datasetitem_id: Dataset item ID
        masking_function: Function to mask sensitive data
            (NOTE: accepted but not referenced anywhere in this function)
        auto_end: Automatically end session on exit
        capture_uncaught: Capture uncaught exceptions

    Returns:
        Session ID (the one returned by the backend when it provides one)

    Raises:
        ValueError: If the assembled configuration fails validation
        APIKeyVerificationError: If API credentials are invalid (not raised
            directly here; presumably surfaced by the API layer)
    """
    # Create or update configuration
    config = SDKConfig.from_env(
        api_key=api_key,
        agent_id=agent_id,
        auto_end=auto_end,
        production_monitoring=production_monitoring
    )

    if providers:
        config.telemetry.providers = providers

    config.error_handling.capture_uncaught = capture_uncaught

    # Validate configuration
    errors = config.validate()
    if errors:
        raise ValueError(f"Invalid configuration: {', '.join(errors)}")

    # Set global config
    set_config(config)

    # Initialize HTTP client (reused across init() calls once created)
    if not _sdk_state.http:
        debug("[SDK] Initializing HTTP client")
        _sdk_state.http = HttpClient(config)

    # Initialize resources
    if not _sdk_state.resources:
        _sdk_state.resources = {
            'events': EventResource(_sdk_state.http),
            'sessions': SessionResource(_sdk_state.http),
            'datasets': DatasetResource(_sdk_state.http)
        }

    # Initialize event queue
    if not _sdk_state.event_queue:
        debug("[SDK] Initializing event queue")

        # Create a mock client object for backward compatibility
        # The queue needs a client with make_request method
        class ClientAdapter:
            def make_request(self, endpoint, method, data):
                return _sdk_state.http.request(method, endpoint, json=data)

        _sdk_state.event_queue = EventQueue(ClientAdapter())

        def _flush_event_queue() -> None:
            # Looked up at call time: after clear_state() the global state
            # has no queue, and the handler must then be a no-op rather than
            # fail with AttributeError on None.
            queue = _sdk_state.event_queue
            if queue is not None:
                queue.force_flush()

        # Register cleanup handler
        register_cleanup_handler(_flush_event_queue)
        debug("[SDK] Event queue initialized and cleanup handler registered")

    # Create or retrieve session
    if session_id:
        # Use provided session ID
        real_session_id = session_id
    else:
        # Create new session
        real_session_id = str(uuid.uuid4())

    # Create session via API - only send non-None values
    session_params = {
        'session_id': real_session_id,
        'session_name': session_name or 'Unnamed Session',
        'agent_id': config.agent_id,
    }

    # Only add optional fields if they have (truthy) values
    optional_fields = {
        'task': task,
        'tags': tags,
        'experiment_id': experiment_id,
        'datasetitem_id': datasetitem_id,
        'evaluators': evaluators,
        'production_monitoring': production_monitoring,
    }
    session_params.update(
        {key: value for key, value in optional_fields.items() if value}
    )

    debug(f"[SDK] Creating session with params: {session_params}")
    session_resource = _sdk_state.resources['sessions']
    session_data = session_resource.create_session(session_params)

    # Use the session_id returned by the backend
    real_session_id = session_data.get('session_id', real_session_id)
    _sdk_state.session_id = real_session_id

    info(f"[SDK] Session created: {truncate_id(real_session_id)} (name: {session_name or 'Unnamed Session'})")

    # Set active session in context
    set_active_session(real_session_id)

    # Register session with shutdown manager
    debug(f"[SDK] Registering session with shutdown manager (auto_end={auto_end})")
    shutdown_manager = get_shutdown_manager()
    session_state = SessionState(
        session_id=real_session_id,
        http_client=_sdk_state.resources,  # Pass resources dict which has sessions
        event_queue=_sdk_state.event_queue,
        auto_end=auto_end
    )
    shutdown_manager.register_session(real_session_id, session_state)

    # Initialize telemetry if providers specified
    if providers:
        debug(f"[SDK] Initializing telemetry for providers: {providers}")
        _initialize_telemetry(providers)

    return real_session_id
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _initialize_telemetry(providers: List[str]) -> None:
    """Set up the tracer provider (once) and instrument the given providers.

    On the first call a TracerProvider is created, stored on the SDK state
    and given two span processors: the context-capture processor followed by
    a batch processor wrapping the Lucidic exporter. Every call then
    instruments ``providers`` against the stored tracer provider.

    Args:
        providers: List of provider names
    """
    tracer_provider = _sdk_state.tracer_provider

    if not tracer_provider:
        # Deferred imports: importing these at module level would create a
        # circular dependency.
        from ..telemetry.lucidic_exporter import LucidicSpanExporter
        from ..telemetry.context_capture_processor import ContextCaptureProcessor
        from opentelemetry.sdk.trace.export import BatchSpanProcessor

        tracer_provider = TracerProvider()
        _sdk_state.tracer_provider = tracer_provider

        # The context processor goes in FIRST so context is captured before
        # the export processor sees the span.
        tracer_provider.add_span_processor(ContextCaptureProcessor())

        # Export processor
        tracer_provider.add_span_processor(
            BatchSpanProcessor(LucidicSpanExporter())
        )

    # Instrument providers
    instrument_providers(providers, tracer_provider, {})

    info(f"[Telemetry] Initialized for providers: {providers}")
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def set_task_session(session_id: str) -> None:
    """Bind ``session_id`` to the currently running asyncio task.

    Does nothing when called outside an async context.
    """
    try:
        running_task = asyncio.current_task()
        if not running_task:
            return
        _sdk_state.task_sessions[running_task] = session_id
        debug(f"[SDK] Set task-local session {truncate_id(session_id)} for task {running_task.get_name()}")
    except RuntimeError:
        # asyncio.current_task() raises when no event loop is running;
        # there is no task to bind to, so silently skip.
        return
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def clear_task_session() -> None:
    """Remove the session binding of the currently running asyncio task.

    Does nothing when called outside an async context.
    """
    try:
        running_task = asyncio.current_task()
        if not running_task:
            return
        _sdk_state.task_sessions.pop(running_task, None)
        debug(f"[SDK] Cleared task-local session for task {running_task.get_name()}")
    except RuntimeError:
        # asyncio.current_task() raises when no event loop is running;
        # nothing to clear in that case.
        return
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def set_thread_session(session_id: str) -> None:
    """Store ``session_id`` in thread-local storage for the calling thread.

    The value lives in a ``threading.local`` on the SDK state, so it is
    visible only to the thread that set it and is not inherited from the
    parent thread.
    """
    thread_name = threading.current_thread().name
    _sdk_state.thread_local.session_id = session_id
    debug(f"[SDK] Set thread-local session {truncate_id(session_id)} for thread {thread_name}")
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def clear_thread_session() -> None:
    """Remove the calling thread's session ID from thread-local storage.

    No-op when the calling thread has no session stored.
    """
    local_store = _sdk_state.thread_local
    if not hasattr(local_store, 'session_id'):
        return

    delattr(local_store, 'session_id')
    thread_name = threading.current_thread().name
    debug(f"[SDK] Cleared thread-local session for thread {thread_name}")
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def get_thread_session() -> Optional[str]:
    """Return the calling thread's session ID, or None when none is stored."""
    local_store = _sdk_state.thread_local
    return getattr(local_store, 'session_id', None)
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def is_main_thread() -> bool:
    """Return True when the caller is executing on the main thread."""
    main = threading.main_thread()
    return threading.current_thread() is main
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def get_session_id() -> Optional[str]:
    """Resolve the session ID that applies to the caller.

    Lookup order:
    1. Session bound to the running asyncio task, if any.
    2. On a non-main thread: the thread-local session and nothing else
       (None when unset, so a worker never inherits the main session).
    3. On the main thread: the SDK state session.
    4. On the main thread: the context-variable session as a fallback.
    """
    # Async isolation comes first: a task-bound session wins over the rest.
    try:
        running_task = asyncio.current_task()
        if running_task:
            bound_session = _sdk_state.task_sessions.get(running_task)
            if bound_session:
                debug(f"[SDK] Using task-local session {truncate_id(bound_session)}")
                return bound_session
    except RuntimeError:
        # No running event loop, so there is no task to consult.
        pass

    # Main thread only: fall back to SDK state, then the context variable.
    if is_main_thread():
        return _sdk_state.session_id or current_session_id.get()

    # Worker threads use ONLY thread-local storage - deliberately no fallback,
    # which prevents picking up the parent thread's session.
    thread_session = get_thread_session()
    if thread_session:
        debug(f"[SDK] Using thread-local session {truncate_id(thread_session)}")
    else:
        debug(f"[SDK] Thread {threading.current_thread().name} has no thread-local session")
    return thread_session
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def get_http() -> Optional[HttpClient]:
    """Return the HTTP client held by the SDK state (None until created)."""
    client = _sdk_state.http
    return client
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def get_event_queue() -> Optional[EventQueue]:
    """Return the event queue held by the SDK state (None until created)."""
    queue = _sdk_state.event_queue
    return queue
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
def get_resources() -> dict:
    """Return the dict of API resource instances held by the SDK state.

    The dict is empty until ``init()`` populates it; the same object is
    returned on each call, not a copy.
    """
    resources = _sdk_state.resources
    return resources
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def clear_state() -> None:
    """Reset the current SDK state and replace it with a fresh one (for testing)."""
    global _sdk_state
    debug("[SDK] Clearing SDK state")
    previous_state = _sdk_state
    # Shut down whatever the old state owns before dropping it.
    previous_state.reset()
    _sdk_state = SDKState()
|
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
"""Shutdown manager for graceful cleanup.
|
|
2
|
+
|
|
3
|
+
Coordinates shutdown across all active sessions, ensuring proper cleanup
|
|
4
|
+
on process exit. Inspired by TypeScript SDK's shutdown-manager.ts.
|
|
5
|
+
"""
|
|
6
|
+
import atexit
|
|
7
|
+
import signal
|
|
8
|
+
import sys
|
|
9
|
+
import threading
|
|
10
|
+
import time
|
|
11
|
+
from typing import Dict, Optional, Set, Callable
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
|
|
14
|
+
from ..utils.logger import debug, info, warning, error, truncate_id
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class SessionState:
    """Per-session record tracked by the shutdown manager."""

    # Identifier of the session this record describes.
    session_id: str
    # Despite the name, the shutdown code in this module expects a dict of
    # API resources with a 'sessions' entry here, not a bare HTTP client.
    http_client: Optional[object] = None
    # Event queue associated with the session, if any.
    event_queue: Optional[object] = None
    # Set once shutdown has started ending this session, so it is ended once.
    is_shutting_down: bool = False
    # When False, the session is left alone during process shutdown.
    auto_end: bool = True
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ShutdownManager:
    """Singleton manager for coordinating shutdown across all active sessions.

    Ensures process listeners are only registered once and all sessions
    are properly ended on exit.
    """

    _instance: Optional['ShutdownManager'] = None
    _lock = threading.Lock()

    def __new__(cls):
        # Check, take the lock, then check again so that concurrent first
        # calls still end up sharing a single instance.
        if cls._instance is None:
            with cls._lock:
                if cls._instance is None:
                    cls._instance = super().__new__(cls)
                    cls._instance._initialized = False
        return cls._instance

    def __init__(self):
        # only initialize once (__init__ runs on every ShutdownManager() call)
        if self._initialized:
            return

        self._initialized = True
        self.active_sessions: Dict[str, SessionState] = {}
        self.is_shutting_down = False
        self.shutdown_complete = threading.Event()
        self.listeners_registered = False
        # Signal handlers can only be installed from the main thread, so
        # they are tracked separately and retried until they succeed.
        self._signal_handlers_registered = False
        # Hook that was installed before ours; called after our handling.
        self._previous_excepthook = sys.__excepthook__
        self._session_lock = threading.Lock()

        debug("[ShutdownManager] Initialized")

    def register_session(self, session_id: str, state: SessionState) -> None:
        """Register a new active session.

        Also installs the process-exit listeners on first use.

        Args:
            session_id: Session identifier
            state: Session state information
        """
        with self._session_lock:
            debug(f"[ShutdownManager] Registering session {truncate_id(session_id)}, auto_end={state.auto_end}")
            self.active_sessions[session_id] = state

            # ensure listeners are registered
            self._ensure_listeners_registered()

    def unregister_session(self, session_id: str) -> None:
        """Unregister a session after it ends.

        Unknown session IDs are ignored.

        Args:
            session_id: Session identifier
        """
        with self._session_lock:
            debug(f"[ShutdownManager] Unregistering session {truncate_id(session_id)}")
            self.active_sessions.pop(session_id, None)

    def get_active_session_count(self) -> int:
        """Get count of active sessions."""
        with self._session_lock:
            return len(self.active_sessions)

    def is_session_active(self, session_id: str) -> bool:
        """Check if a session is active.

        Args:
            session_id: Session identifier

        Returns:
            True if session is active
        """
        with self._session_lock:
            return session_id in self.active_sessions

    def _ensure_listeners_registered(self) -> None:
        """Register process exit listeners once.

        The atexit handler and the uncaught-exception hook are installed on
        the first call from any thread. Signal handlers are installed on the
        first call made from the main thread.
        """
        if not self.listeners_registered:
            self.listeners_registered = True
            debug("[ShutdownManager] Registering global shutdown listeners (atexit, SIGINT, SIGTERM, uncaught exceptions)")

            # register atexit handler for normal termination
            atexit.register(self._handle_exit)

            # register uncaught exception handler, remembering the hook that
            # was installed before so it still runs afterwards
            self._previous_excepthook = sys.excepthook
            sys.excepthook = self._exception_handler

        self._try_register_signal_handlers()

    def _try_register_signal_handlers(self) -> None:
        """Install SIGINT/SIGTERM handlers when that is currently possible."""
        if self._signal_handlers_registered:
            return

        # signal.signal() raises ValueError outside the main thread; skip for
        # now and let a later main-thread registration install the handlers.
        if threading.current_thread() is not threading.main_thread():
            debug("[ShutdownManager] Not on main thread, skipping signal handler registration")
            return

        try:
            # register signal handlers for interrupts
            signal.signal(signal.SIGINT, self._signal_handler)
            signal.signal(signal.SIGTERM, self._signal_handler)
        except (ValueError, OSError) as e:
            warning(f"[ShutdownManager] Could not register signal handlers: {e}")
            return

        self._signal_handlers_registered = True

    def _signal_handler(self, signum, frame):
        """Handle shutdown signals."""
        info(f"[ShutdownManager] Received signal {signum}, initiating graceful shutdown")
        self._handle_shutdown(f"signal_{signum}")
        # exit after cleanup
        sys.exit(0)

    def _exception_handler(self, exc_type, exc_value, exc_traceback):
        """Handle uncaught exceptions.

        Logs the exception, tries to record an ``error_traceback`` event,
        runs the shutdown sequence and finally delegates to the exception
        hook that was installed before this one.
        """
        # log the exception
        error(f"[ShutdownManager] Uncaught exception: {exc_type.__name__}: {exc_value}")

        # Create an error event for the uncaught exception
        try:
            from ..sdk.event import create_event
            import traceback

            error_message = f"{exc_type.__name__}: {exc_value}"
            traceback_str = ''.join(traceback.format_exception(exc_type, exc_value, exc_traceback))

            create_event(
                type="error_traceback",
                error=error_message,
                traceback=traceback_str
            )
            debug("[ShutdownManager] Created error_traceback event for uncaught exception")
        except Exception as e:
            # Best effort: event creation must never mask the original error.
            debug(f"[ShutdownManager] Failed to create error_traceback event: {e}")

        # perform shutdown
        self._handle_shutdown("uncaught_exception")

        # call the previously installed handler (the default one unless
        # something else had replaced sys.excepthook before us)
        self._previous_excepthook(exc_type, exc_value, exc_traceback)

    def _handle_exit(self):
        """Handle normal process exit."""
        debug("[ShutdownManager] Normal process exit triggered (atexit)")
        self._handle_shutdown("atexit")

    def _handle_shutdown(self, trigger: str) -> None:
        """Coordinate shutdown of all sessions.

        Runs at most once; later triggers are ignored. The work is done on a
        helper thread and awaited for up to 30 seconds. If a thread cannot be
        started, the work is done inline instead.

        Args:
            trigger: What triggered the shutdown
        """
        if self.is_shutting_down:
            debug(f"[ShutdownManager] Already shutting down, ignoring {trigger}")
            return

        self.is_shutting_down = True

        with self._session_lock:
            session_count = len(self.active_sessions)

        if session_count == 0:
            debug("[ShutdownManager] No active sessions to clean up")
            self.shutdown_complete.set()
            return

        info(f"[ShutdownManager] Shutdown initiated by {trigger}, ending {session_count} active session(s)")

        # perform shutdown in separate thread to avoid deadlocks
        shutdown_thread = threading.Thread(
            target=self._perform_shutdown,
            name="ShutdownThread"
        )
        shutdown_thread.daemon = True
        try:
            shutdown_thread.start()
        except RuntimeError as e:
            # Python 3.12+ refuses to start threads during interpreter
            # shutdown, which is exactly when the atexit handler runs.
            debug(f"[ShutdownManager] Could not start shutdown thread ({e}), shutting down synchronously")
            self._perform_shutdown()
            return

        # wait for shutdown with timeout
        if not self.shutdown_complete.wait(timeout=30):
            warning("[ShutdownManager] Shutdown timeout after 30s")

    def _perform_shutdown(self) -> None:
        """Perform the actual shutdown of all sessions.

        Ends every registered session that has ``auto_end`` set, then flushes
        and shuts down the tracer provider. ``shutdown_complete`` is always
        set on the way out, even after an unexpected error.
        """
        debug("[ShutdownManager] _perform_shutdown thread started")
        try:
            sessions_to_end = []

            with self._session_lock:
                # collect sessions that need ending
                for session_id, state in self.active_sessions.items():
                    if state.auto_end and not state.is_shutting_down:
                        state.is_shutting_down = True
                        sessions_to_end.append((session_id, state))

            debug(f"[ShutdownManager] Found {len(sessions_to_end)} sessions to end")

            # end all sessions; one failure must not stop the others
            for session_id, state in sessions_to_end:
                try:
                    debug(f"[ShutdownManager] Ending session {truncate_id(session_id)}")
                    self._end_session(session_id, state)
                except Exception as e:
                    error(f"[ShutdownManager] Error ending session {truncate_id(session_id)}: {e}")

            # Final telemetry shutdown after all sessions are ended
            try:
                from ..sdk.init import _sdk_state
                if hasattr(_sdk_state, 'tracer_provider') and _sdk_state.tracer_provider:
                    debug("[ShutdownManager] Final OpenTelemetry shutdown")
                    try:
                        # Final flush and shutdown with longer timeout
                        _sdk_state.tracer_provider.force_flush(timeout_millis=5000)
                        _sdk_state.tracer_provider.shutdown()
                        debug("[ShutdownManager] OpenTelemetry shutdown complete")
                    except Exception as e:
                        error(f"[ShutdownManager] Error in final telemetry shutdown: {e}")
            except ImportError:
                pass  # SDK not initialized

            info("[ShutdownManager] Shutdown complete")

        except Exception as e:
            error(f"[ShutdownManager] Unexpected error in _perform_shutdown: {e}")
            import traceback
            error(f"[ShutdownManager] Traceback: {traceback.format_exc()}")
        finally:
            debug("[ShutdownManager] Setting shutdown_complete event")
            self.shutdown_complete.set()

    def _end_session(self, session_id: str, state: SessionState) -> None:
        """End a single session with cleanup.

        Flushes telemetry spans, ends the session through the sessions
        resource (marked unsuccessful, reason "Process shutdown") and
        unregisters it. The event queue is intentionally not flushed here.

        Args:
            session_id: Session identifier
            state: Session state
        """
        # Flush OpenTelemetry spans first (before event queue)
        try:
            # Get the global tracer provider if it exists
            from ..sdk.init import _sdk_state
            if hasattr(_sdk_state, 'tracer_provider') and _sdk_state.tracer_provider:
                debug(f"[ShutdownManager] Flushing OpenTelemetry spans for session {truncate_id(session_id)}")
                try:
                    # Force flush with 3 second timeout
                    _sdk_state.tracer_provider.force_flush(timeout_millis=3000)
                except Exception as e:
                    error(f"[ShutdownManager] Error flushing spans: {e}")
        except ImportError:
            pass  # SDK not initialized

        # Skip event queue flush during shutdown to avoid hanging
        # The queue worker is a daemon thread and will flush on its own
        if state.event_queue:
            debug(f"[ShutdownManager] Skipping event queue flush during shutdown for session {truncate_id(session_id)}")

        # end session via API if http client present
        if state.http_client and session_id:
            try:
                debug(f"[ShutdownManager] Ending session {truncate_id(session_id)} via API")
                debug(f"[ShutdownManager] http_client type: {type(state.http_client)}, keys: {state.http_client.keys() if isinstance(state.http_client, dict) else 'not a dict'}")
                # state.http_client is a resources dict with 'sessions' key
                if isinstance(state.http_client, dict) and 'sessions' in state.http_client:
                    state.http_client['sessions'].end_session(
                        session_id,
                        is_successful=False,
                        session_eval_reason="Process shutdown"
                    )
                    debug(f"[ShutdownManager] Session {truncate_id(session_id)} ended via API")
                else:
                    debug("[ShutdownManager] Cannot end session - http_client not properly configured")
            except Exception as e:
                error(f"[ShutdownManager] Error ending session via API: {e}")

        # unregister the session
        self.unregister_session(session_id)

    def reset(self) -> None:
        """Reset shutdown manager (for testing)."""
        with self._session_lock:
            self.active_sessions.clear()
            self.is_shutting_down = False
            self.shutdown_complete.clear()
            # note: we don't reset listeners_registered as they persist
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
# global singleton instance
|
|
297
|
+
# Created at import time; get_shutdown_manager() returns this instance.
_shutdown_manager = ShutdownManager()
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def get_shutdown_manager() -> ShutdownManager:
    """Return the module-level ShutdownManager singleton."""
    manager = _shutdown_manager
    return manager
|