hud-python 0.2.10__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of hud-python might be problematic. (The original page linked to more details here; the link target was not preserved in this text.)

Files changed (86)
  1. hud/__init__.py +20 -8
  2. hud/adapters/common/adapter.py +14 -3
  3. hud/adapters/common/tests/test_adapter.py +16 -4
  4. hud/datasets.py +188 -0
  5. hud/env/docker_client.py +15 -3
  6. hud/env/environment.py +10 -7
  7. hud/env/local_docker_client.py +29 -7
  8. hud/env/remote_client.py +1 -1
  9. hud/env/remote_docker_client.py +2 -2
  10. hud/exceptions.py +2 -1
  11. hud/gym.py +0 -9
  12. hud/mcp/__init__.py +17 -0
  13. hud/mcp/base.py +631 -0
  14. hud/mcp/claude.py +321 -0
  15. hud/mcp/client.py +312 -0
  16. hud/mcp/langchain.py +250 -0
  17. hud/mcp/openai.py +334 -0
  18. hud/mcp/tests/__init__.py +1 -0
  19. hud/mcp/tests/test_base.py +512 -0
  20. hud/mcp/tests/test_claude.py +294 -0
  21. hud/mcp/tests/test_client.py +324 -0
  22. hud/mcp/tests/test_openai.py +238 -0
  23. hud/settings.py +20 -2
  24. hud/task.py +5 -88
  25. hud/taskset.py +2 -23
  26. hud/telemetry/__init__.py +16 -7
  27. hud/telemetry/_trace.py +246 -72
  28. hud/telemetry/context.py +88 -27
  29. hud/telemetry/exporter.py +171 -11
  30. hud/telemetry/instrumentation/mcp.py +174 -410
  31. hud/telemetry/job.py +141 -0
  32. hud/telemetry/mcp_models.py +13 -74
  33. hud/telemetry/tests/test_context.py +9 -6
  34. hud/telemetry/tests/test_trace.py +120 -78
  35. hud/tools/__init__.py +34 -0
  36. hud/tools/base.py +65 -0
  37. hud/tools/bash.py +137 -0
  38. hud/tools/computer/__init__.py +13 -0
  39. hud/tools/computer/anthropic.py +411 -0
  40. hud/tools/computer/hud.py +315 -0
  41. hud/tools/computer/openai.py +283 -0
  42. hud/tools/edit.py +290 -0
  43. hud/tools/executors/__init__.py +30 -0
  44. hud/tools/executors/base.py +331 -0
  45. hud/tools/executors/pyautogui.py +619 -0
  46. hud/tools/executors/tests/__init__.py +1 -0
  47. hud/tools/executors/tests/test_base_executor.py +338 -0
  48. hud/tools/executors/tests/test_pyautogui_executor.py +165 -0
  49. hud/tools/executors/xdo.py +503 -0
  50. hud/tools/helper/README.md +56 -0
  51. hud/tools/helper/__init__.py +9 -0
  52. hud/tools/helper/mcp_server.py +78 -0
  53. hud/tools/helper/server_initialization.py +115 -0
  54. hud/tools/helper/utils.py +58 -0
  55. hud/tools/playwright_tool.py +379 -0
  56. hud/tools/tests/__init__.py +3 -0
  57. hud/tools/tests/test_bash.py +152 -0
  58. hud/tools/tests/test_computer.py +52 -0
  59. hud/tools/tests/test_computer_actions.py +34 -0
  60. hud/tools/tests/test_edit.py +240 -0
  61. hud/tools/tests/test_init.py +27 -0
  62. hud/tools/tests/test_playwright_tool.py +183 -0
  63. hud/tools/tests/test_tools.py +157 -0
  64. hud/tools/tests/test_utils.py +156 -0
  65. hud/tools/utils.py +50 -0
  66. hud/trajectory.py +5 -1
  67. hud/types.py +10 -1
  68. hud/utils/tests/test_init.py +21 -0
  69. hud/utils/tests/test_version.py +1 -1
  70. hud/version.py +1 -1
  71. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/METADATA +27 -18
  72. hud_python-0.3.1.dist-info/RECORD +119 -0
  73. hud/evaluators/__init__.py +0 -9
  74. hud/evaluators/base.py +0 -32
  75. hud/evaluators/inspect.py +0 -24
  76. hud/evaluators/judge.py +0 -189
  77. hud/evaluators/match.py +0 -156
  78. hud/evaluators/remote.py +0 -65
  79. hud/evaluators/tests/__init__.py +0 -0
  80. hud/evaluators/tests/test_inspect.py +0 -12
  81. hud/evaluators/tests/test_judge.py +0 -231
  82. hud/evaluators/tests/test_match.py +0 -115
  83. hud/evaluators/tests/test_remote.py +0 -98
  84. hud_python-0.2.10.dist-info/RECORD +0 -85
  85. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/WHEEL +0 -0
  86. {hud_python-0.2.10.dist-info → hud_python-0.3.1.dist-info}/licenses/LICENSE +0 -0
hud/telemetry/_trace.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ # ruff: noqa: T201
3
4
  import asyncio
4
5
  import logging
5
6
  import time
@@ -11,61 +12,173 @@ from typing import (
11
12
  Any,
12
13
  ParamSpec,
13
14
  TypeVar,
14
- overload,
15
15
  )
16
16
 
17
- from hud.telemetry import exporter
18
17
  from hud.telemetry.context import (
19
18
  flush_buffer,
20
19
  get_current_task_run_id,
21
20
  is_root_trace,
22
21
  set_current_task_run_id,
23
22
  )
24
- from hud.telemetry.exporter import submit_to_worker_loop
25
23
  from hud.telemetry.instrumentation.registry import registry
26
24
 
27
25
  if TYPE_CHECKING:
28
- from collections.abc import (
29
- Callable,
30
- Coroutine,
31
- Generator,
32
- )
26
+ from collections.abc import Generator
33
27
 
34
- from hud.telemetry.mcp_models import BaseMCPCall
35
28
 
36
29
  logger = logging.getLogger("hud.telemetry")
37
30
  T = TypeVar("T")
31
+ P = ParamSpec("P")
32
+
33
+ # Track whether telemetry has been initialized
34
+ _telemetry_initialized = False
38
35
 
39
36
 
40
37
  def init_telemetry() -> None:
41
38
  """Initialize telemetry instrumentors and ensure worker is started if telemetry is active."""
39
+ global _telemetry_initialized
40
+ if _telemetry_initialized:
41
+ return
42
+
42
43
  registry.install_all()
43
44
  logger.info("Telemetry initialized.")
45
+ _telemetry_initialized = True
46
+
47
+
48
+ def _ensure_telemetry_initialized() -> None:
49
+ """Ensure telemetry is initialized - called lazily by trace functions."""
50
+ from hud.settings import settings
51
+
52
+ if settings.telemetry_enabled and not _telemetry_initialized:
53
+ init_telemetry()
54
+
55
+
56
+ def _detect_agent_model() -> str | None:
57
+ """
58
+ Try to auto-detect agent model from parent frames.
59
+ This is a best-effort approach and may not work in all cases.
60
+ """
61
+ import sys
62
+
63
+ try:
64
+ # Try different frame depths (2-3 typically covers most cases)
65
+ for depth in range(2, 3):
66
+ try:
67
+ frame = sys._getframe(depth)
68
+ # Check local variables for agent objects
69
+ for var_value in frame.f_locals.values():
70
+ # Look for objects with model_name attribute
71
+ if hasattr(var_value, "model_name") and hasattr(var_value, "run"):
72
+ # Likely an agent object
73
+ model_name = getattr(var_value, "model_name", None)
74
+ if model_name:
75
+ logger.debug(
76
+ "Found agent with model_name in frame %d: %s", depth, model_name
77
+ )
78
+ return str(model_name)
79
+
80
+ # Also check self in case we're in a method
81
+ if "self" in frame.f_locals:
82
+ self_obj = frame.f_locals["self"]
83
+ if hasattr(self_obj, "model_name"):
84
+ model_name = getattr(self_obj, "model_name", None)
85
+ if model_name:
86
+ logger.debug(
87
+ "Found agent model_name in self at frame %d: %s", depth, model_name
88
+ )
89
+ return str(model_name)
90
+
91
+ except (ValueError, AttributeError):
92
+ # Frame doesn't exist at this depth or other issues
93
+ continue
94
+
95
+ except Exception as e:
96
+ logger.debug("Agent model detection failed: %s", e)
97
+
98
+ return None
99
+
100
+
101
+ def _print_trace_url(task_run_id: str) -> None:
102
+ """Print the trace URL in a colorful box."""
103
+ url = f"https://app.hud.so/trace/{task_run_id}"
104
+ header = "🚀 See your agent live at:"
105
+
106
+ # ANSI color codes
107
+ DIM = "\033[90m" # Dim/Gray for border (visible on both light and dark terminals)
108
+ GOLD = "\033[33m" # Gold/Yellow for URL
109
+ RESET = "\033[0m"
110
+ BOLD = "\033[1m"
111
+
112
+ # Calculate box width based on the longest line
113
+ box_width = max(len(url), len(header)) + 6
114
+
115
+ # Box drawing characters
116
+ top_border = "╔" + "═" * (box_width - 2) + "╗"
117
+ bottom_border = "╚" + "═" * (box_width - 2) + "╝"
118
+ divider = "╟" + "─" * (box_width - 2) + "╢"
119
+
120
+ # Center the content
121
+ header_padding = (box_width - len(header) - 2) // 2
122
+ url_padding = (box_width - len(url) - 2) // 2
123
+
124
+ # Print the box
125
+ print(f"\n{DIM}{top_border}{RESET}")
126
+ print(
127
+ f"{DIM}║{RESET}{' ' * header_padding}{header}{' ' * (box_width - len(header) - header_padding - 3)}{DIM}║{RESET}" # noqa: E501
128
+ )
129
+ print(f"{DIM}{divider}{RESET}")
130
+ print(
131
+ f"{DIM}║{RESET}{' ' * url_padding}{BOLD}{GOLD}{url}{RESET}{' ' * (box_width - len(url) - url_padding - 2)}{DIM}║{RESET}" # noqa: E501
132
+ )
133
+ print(f"{DIM}{bottom_border}{RESET}\n")
134
+
135
+
136
+ def _print_trace_complete_url(task_run_id: str) -> None:
137
+ """Print the trace completion URL in a simple colorful format."""
138
+ url = f"https://app.hud.so/trace/{task_run_id}"
139
+
140
+ # ANSI color codes
141
+ GREEN = "\033[92m"
142
+ GOLD = "\033[33m"
143
+ RESET = "\033[0m"
144
+ DIM = "\033[2m"
145
+ BOLD = "\033[1m"
146
+
147
+ print(f"\n{GREEN}✓ Trace complete!{RESET} {DIM}View at:{RESET} {BOLD}{GOLD}{url}{RESET}\n")
44
148
 
45
149
 
46
150
  @contextmanager
47
- def trace(
151
+ def trace_open(
48
152
  name: str | None = None,
153
+ agent_model: str | None = None,
154
+ run_id: str | None = None,
49
155
  attributes: dict[str, Any] | None = None,
50
156
  ) -> Generator[str, None, None]:
51
157
  """
52
158
  Context manager for tracing a block of code.
53
- The task_run_id is always generated internally as a UUID.
54
- Telemetry export is handled by a background worker thread.
55
159
 
56
160
  Args:
57
- attributes: Optional dictionary of attributes to associate with this trace
58
161
  name: Optional name for this trace, will be added to attributes.
162
+ attributes: Optional dictionary of attributes to associate with this trace
59
163
 
60
164
  Returns:
61
165
  The generated task run ID (UUID string) used for this trace
62
166
  """
63
- task_run_id = str(uuid.uuid4())
167
+ # Lazy initialization - only initialize telemetry when trace() is actually called
168
+ _ensure_telemetry_initialized()
169
+
170
+ task_run_id = run_id or str(uuid.uuid4())
171
+
172
+ _print_trace_url(task_run_id)
64
173
 
65
174
  local_attributes = attributes.copy() if attributes is not None else {}
66
175
  if name is not None:
67
176
  local_attributes["trace_name"] = name
68
177
 
178
+ # Auto-detect agent if not explicitly provided
179
+ if agent_model is None:
180
+ agent_model = _detect_agent_model()
181
+
69
182
  start_time = time.time()
70
183
  logger.debug("Starting trace %s (Name: %s)", task_run_id, name if name else "Unnamed")
71
184
 
@@ -76,96 +189,157 @@ def trace(
76
189
  is_root = previous_task_id is None
77
190
  is_root_trace.set(is_root)
78
191
 
192
+ # Update status to initializing for root traces
193
+ if is_root:
194
+ from hud.telemetry.exporter import (
195
+ TaskRunStatus,
196
+ submit_to_worker_loop,
197
+ update_task_run_status,
198
+ )
199
+ from hud.telemetry.job import get_current_job_id
200
+
201
+ # Include metadata in the initial status update
202
+ initial_metadata = local_attributes.copy()
203
+ initial_metadata["is_root_trace"] = is_root
204
+ if agent_model:
205
+ initial_metadata["agent_model"] = agent_model
206
+
207
+ # Get job_id if we're in a job context
208
+ job_id = get_current_job_id()
209
+
210
+ coro = update_task_run_status(
211
+ task_run_id, TaskRunStatus.INITIALIZING, metadata=initial_metadata, job_id=job_id
212
+ )
213
+ submit_to_worker_loop(coro)
214
+ logger.debug("Updated task run %s status to INITIALIZING with metadata", task_run_id)
215
+
216
+ error_occurred = False
217
+ error_message = None
218
+
79
219
  try:
80
220
  yield task_run_id
221
+ except Exception as e:
222
+ error_occurred = True
223
+ error_message = str(e)
224
+ raise
81
225
  finally:
82
226
  end_time = time.time()
83
227
  duration = end_time - start_time
228
+ local_attributes["duration_seconds"] = duration
229
+ local_attributes["is_root_trace"] = is_root
84
230
 
85
- mcp_calls: list[BaseMCPCall] = flush_buffer()
231
+ logger.debug("Finishing trace %s after %.2f seconds", task_run_id, duration)
86
232
 
87
- trace_attributes_final = {
88
- **local_attributes,
89
- "start_time": start_time,
90
- "end_time": end_time,
91
- "duration": duration,
92
- "is_root": is_root,
93
- }
233
+ # Update status for root traces
234
+ if is_root:
235
+ from hud.telemetry.exporter import (
236
+ TaskRunStatus,
237
+ submit_to_worker_loop,
238
+ update_task_run_status,
239
+ )
94
240
 
95
- if is_root and mcp_calls:
96
- try:
97
- coro_to_submit = exporter.export_telemetry(
98
- task_run_id=task_run_id,
99
- trace_attributes=trace_attributes_final,
100
- mcp_calls=mcp_calls,
241
+ # Include final metadata with duration
242
+ final_metadata = local_attributes.copy()
243
+
244
+ if error_occurred:
245
+ coro = update_task_run_status(
246
+ task_run_id, TaskRunStatus.ERROR, error_message, metadata=final_metadata
101
247
  )
102
- future = submit_to_worker_loop(coro_to_submit)
103
- if future:
104
- logger.debug(
105
- "Telemetry for trace %s submitted to background worker.", task_run_id
106
- )
107
- else:
108
- logger.warning(
109
- "Failed to submit telemetry for trace %s to"
110
- "background worker (loop not available).",
111
- task_run_id,
112
- )
113
- except Exception as e:
114
- logger.warning("Failed to submit telemetry for trace %s: %s", task_run_id, e)
248
+ logger.debug("Updated task run %s status to ERROR: %s", task_run_id, error_message)
249
+ else:
250
+ coro = update_task_run_status(
251
+ task_run_id, TaskRunStatus.COMPLETED, metadata=final_metadata
252
+ )
253
+ logger.debug("Updated task run %s status to COMPLETED with metadata", task_run_id)
115
254
 
116
- set_current_task_run_id(previous_task_id)
117
- is_root_trace.set(was_root)
255
+ # Wait for the status update to complete
256
+ future = submit_to_worker_loop(coro)
257
+ if future:
258
+ try:
259
+ # Wait up to 5 seconds for the status update
260
+ import concurrent.futures
118
261
 
119
- logger.debug(
120
- "Ended trace %s (Name: %s) with %d MCP call(s)",
121
- task_run_id,
122
- name if name else "Unnamed",
123
- len(mcp_calls),
124
- )
262
+ future.result(timeout=5.0)
263
+ logger.debug("Status update completed successfully")
264
+ except concurrent.futures.TimeoutError:
265
+ logger.warning("Timeout waiting for status update to complete")
266
+ except Exception as e:
267
+ logger.error("Error waiting for status update: %s", e)
125
268
 
126
- logger.info("View trace at https://app.hud.so/jobs/traces/%s", task_run_id)
269
+ # Export any remaining records before flushing
270
+ if is_root:
271
+ from hud.telemetry.context import export_incremental
127
272
 
273
+ export_incremental()
128
274
 
129
- P = ParamSpec("P")
130
- R = TypeVar("R")
275
+ # Always flush the buffer for the current task
276
+ mcp_calls = flush_buffer(export=True)
277
+ logger.debug("Flushed %d MCP calls for trace %s", len(mcp_calls), task_run_id)
278
+
279
+ # Restore previous context
280
+ set_current_task_run_id(previous_task_id)
281
+ is_root_trace.set(was_root)
131
282
 
283
+ # Log at the end
284
+ if is_root:
285
+ _print_trace_complete_url(task_run_id)
132
286
 
133
- def register_trace(
134
- name: str | None = None, attributes: dict[str, Any] | None = None
135
- ) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
287
+
288
+ @contextmanager
289
+ def trace(
290
+ name: str | None = None,
291
+ agent_model: str | None = None,
292
+ attributes: dict[str, Any] | None = None,
293
+ ) -> Generator[str, None, None]:
136
294
  """
137
- Decorator to wrap a synchronous or asynchronous function call
138
- within a hud._telemetry.trace context.
295
+ Synchronous context manager that traces and blocks until telemetry is sent.
296
+
297
+ This is the "worry-free" option when you want to ensure telemetry is
298
+ sent immediately before continuing, rather than relying on background workers.
139
299
 
140
300
  Args:
141
- name: Optional name for the trace.
142
- attributes: Optional dictionary of attributes for the trace.
301
+ name: Optional name for this trace
302
+ attributes: Optional attributes for the trace
303
+
304
+ Returns:
305
+ The generated task run ID (UUID string) used for this trace
143
306
  """
307
+ with trace_open(name=name, agent_model=agent_model, attributes=attributes) as task_run_id:
308
+ yield task_run_id
144
309
 
145
- @overload
146
- def decorator(
147
- func: Callable[P, Coroutine[Any, Any, R]],
148
- ) -> Callable[P, Coroutine[Any, Any, R]]: ...
310
+ # Ensure telemetry is flushed synchronously
311
+ from hud import flush
149
312
 
150
- @overload
151
- def decorator(func: Callable[P, R]) -> Callable[P, R]: ...
313
+ flush()
314
+
315
+
316
+ def trace_decorator(
317
+ name: str | None = None,
318
+ agent_model: str | None = None,
319
+ attributes: dict[str, Any] | None = None,
320
+ ) -> Any:
321
+ """
322
+ Decorator for tracing functions.
323
+
324
+ Can be used on both sync and async functions.
325
+ """
152
326
 
153
- def decorator(func: Callable[P, Any]) -> Callable[P, Any]:
327
+ def decorator(func: Any) -> Any:
154
328
  if asyncio.iscoroutinefunction(func):
155
329
 
156
330
  @wraps(func)
157
- async def async_wrapper(*args: P.args, **kwargs: P.kwargs) -> Any:
158
- effective_name = name if name else func.__name__
159
- with trace(name=effective_name, attributes=attributes):
331
+ async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
332
+ func_name = name or f"{func.__module__}.{func.__name__}"
333
+ with trace_open(name=func_name, agent_model=agent_model, attributes=attributes):
160
334
  return await func(*args, **kwargs)
161
335
 
162
336
  return async_wrapper
163
337
  else:
164
338
 
165
339
  @wraps(func)
166
- def sync_wrapper(*args: P.args, **kwargs: P.kwargs) -> Any:
167
- effective_name = name if name else func.__name__
168
- with trace(name=effective_name, attributes=attributes):
340
+ def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
341
+ func_name = name or f"{func.__module__}.{func.__name__}"
342
+ with trace_open(name=func_name, agent_model=agent_model, attributes=attributes):
169
343
  return func(*args, **kwargs)
170
344
 
171
345
  return sync_wrapper
hud/telemetry/context.py CHANGED
@@ -8,7 +8,6 @@ from typing import Any, TypeVar
8
8
 
9
9
  from hud.telemetry.mcp_models import (
10
10
  BaseMCPCall,
11
- MCPManualTestCall,
12
11
  MCPNotificationCall,
13
12
  MCPRequestCall,
14
13
  MCPResponseCall,
@@ -21,8 +20,12 @@ logger = logging.getLogger("hud.telemetry")
21
20
  current_task_run_id: contextvars.ContextVar[str | None] = contextvars.ContextVar(
22
21
  "current_task_run_id", default=None
23
22
  )
24
- # NEW: Global dictionary for buffering, keyed by task_run_id
23
+ # Global dictionary for buffering, keyed by task_run_id
25
24
  _GLOBAL_MCP_CALL_BUFFERS: defaultdict[str, list[BaseMCPCall]] = defaultdict(list)
25
+ # Track the last exported index for each task_run_id
26
+ _GLOBAL_EXPORT_INDICES: defaultdict[str, int] = defaultdict(int)
27
+ # Track whether we've seen a non-init request for each task_run_id
28
+ _GLOBAL_HAS_NON_INIT_REQUEST: defaultdict[str, bool] = defaultdict(bool)
26
29
  is_root_trace: contextvars.ContextVar[bool] = contextvars.ContextVar("is_root_trace", default=False)
27
30
 
28
31
  # Maximum buffer size before automatic flush
@@ -43,6 +46,7 @@ def set_current_task_run_id(task_run_id: str | None) -> None:
43
46
 
44
47
 
45
48
  def buffer_mcp_call(record: BaseMCPCall | dict[str, Any]) -> None:
49
+ """Buffer an MCP call record for the current trace."""
46
50
  task_run_id = get_current_task_run_id()
47
51
 
48
52
  if not task_run_id:
@@ -51,7 +55,7 @@ def buffer_mcp_call(record: BaseMCPCall | dict[str, Any]) -> None:
51
55
  )
52
56
  return
53
57
 
54
- # Ensure 'record' is a Pydantic model instance from here
58
+ # Ensure 'record' is a Pydantic model instance
55
59
  if isinstance(record, dict):
56
60
  try:
57
61
  record_model = BaseMCPCall.from_dict(record)
@@ -67,6 +71,48 @@ def buffer_mcp_call(record: BaseMCPCall | dict[str, Any]) -> None:
67
71
  flush_buffer(export=True)
68
72
 
69
73
 
74
+ def export_incremental() -> list[BaseMCPCall]:
75
+ """
76
+ Export only new MCP calls since last export without clearing the buffer.
77
+
78
+ Returns:
79
+ The list of newly exported MCP calls
80
+ """
81
+ task_run_id = get_current_task_run_id()
82
+ if not task_run_id or not is_root_trace.get():
83
+ return []
84
+
85
+ buffer = _GLOBAL_MCP_CALL_BUFFERS.get(task_run_id, [])
86
+ last_exported_idx = _GLOBAL_EXPORT_INDICES.get(task_run_id, 0)
87
+
88
+ # Get only the new records since last export
89
+ new_records = buffer[last_exported_idx:]
90
+
91
+ if new_records:
92
+ # Update the export index
93
+ _GLOBAL_EXPORT_INDICES[task_run_id] = len(buffer)
94
+
95
+ # Trigger export
96
+ from hud.telemetry import exporter
97
+ from hud.telemetry.exporter import submit_to_worker_loop
98
+
99
+ # Get current trace attributes if available
100
+ attributes = {"incremental": True}
101
+
102
+ coro = exporter.export_telemetry(
103
+ task_run_id=task_run_id,
104
+ trace_attributes=attributes,
105
+ mcp_calls=new_records.copy(), # Copy to avoid modification during export
106
+ )
107
+ submit_to_worker_loop(coro)
108
+
109
+ logger.debug(
110
+ "Incremental export: %d new MCP calls for trace %s", len(new_records), task_run_id
111
+ )
112
+
113
+ return new_records
114
+
115
+
70
116
  def flush_buffer(export: bool = False) -> list[BaseMCPCall]:
71
117
  """
72
118
  Clear the MCP calls buffer and return its contents.
@@ -82,11 +128,12 @@ def flush_buffer(export: bool = False) -> list[BaseMCPCall]:
82
128
  logger.warning("FLUSH_BUFFER: No current task_run_id. Cannot flush.")
83
129
  return []
84
130
 
85
- buffer_for_task = _GLOBAL_MCP_CALL_BUFFERS.pop(
86
- task_run_id, []
87
- ) # Get and remove the list for this task
88
-
89
- return buffer_for_task # Return the flushed items
131
+ buffer_for_task = _GLOBAL_MCP_CALL_BUFFERS.pop(task_run_id, [])
132
+ # Clean up export index when buffer is flushed
133
+ _GLOBAL_EXPORT_INDICES.pop(task_run_id, None)
134
+ # Clean up non-init request tracking
135
+ _GLOBAL_HAS_NON_INIT_REQUEST.pop(task_run_id, None)
136
+ return buffer_for_task
90
137
 
91
138
 
92
139
  def create_request_record(
@@ -98,6 +145,31 @@ def create_request_record(
98
145
  logger.warning("No active task_run_id, request record will not be created")
99
146
  raise ValueError("No active task_run_id")
100
147
 
148
+ # Check if this is the first non-init request and update status
149
+ if is_root_trace.get() and not _GLOBAL_HAS_NON_INIT_REQUEST[task_run_id]:
150
+ # Common initialization method patterns
151
+ init_methods = {"initialize", "session/new", "init", "setup", "connect"}
152
+ method_lower = method.lower()
153
+
154
+ # Check if this is NOT an initialization method
155
+ if not any(init_pattern in method_lower for init_pattern in init_methods):
156
+ _GLOBAL_HAS_NON_INIT_REQUEST[task_run_id] = True
157
+
158
+ # Update status to running
159
+ from hud.telemetry.exporter import (
160
+ TaskRunStatus,
161
+ submit_to_worker_loop,
162
+ update_task_run_status,
163
+ )
164
+
165
+ coro = update_task_run_status(task_run_id, TaskRunStatus.RUNNING)
166
+ submit_to_worker_loop(coro)
167
+ logger.debug(
168
+ "Updated task run %s status to RUNNING on first non-init request: %s",
169
+ task_run_id,
170
+ method,
171
+ )
172
+
101
173
  record = MCPRequestCall(
102
174
  task_run_id=task_run_id,
103
175
  method=method,
@@ -118,16 +190,23 @@ def create_response_record(
118
190
  logger.warning("No active task_run_id, response record will not be created")
119
191
  raise ValueError("No active task_run_id")
120
192
 
193
+ # Default to COMPLETED status if not provided
194
+ if "status" not in kwargs:
195
+ kwargs["status"] = StatusType.COMPLETED
196
+
121
197
  record = MCPResponseCall(
122
198
  task_run_id=task_run_id,
123
199
  method=method,
124
- status=StatusType.COMPLETED,
125
200
  related_request_id=related_request_id,
126
201
  is_error=is_error,
127
202
  **kwargs,
128
203
  )
129
204
 
130
205
  buffer_mcp_call(record)
206
+
207
+ # Trigger incremental export when we receive a response
208
+ export_incremental()
209
+
131
210
  return record
132
211
 
133
212
 
@@ -149,21 +228,3 @@ def create_notification_record(
149
228
  )
150
229
  buffer_mcp_call(record)
151
230
  return record
152
-
153
-
154
- def create_manual_test_record(**custom_data: Any) -> MCPManualTestCall | None:
155
- """Create and buffer a manual test record"""
156
- task_run_id = get_current_task_run_id()
157
- if not task_run_id:
158
- logger.warning("No active task_run_id, manual test record will not be created")
159
- return None
160
-
161
- record = MCPManualTestCall.create(task_run_id=task_run_id, **custom_data)
162
- buffer_mcp_call(record)
163
- return record
164
-
165
-
166
- def reset_context() -> None:
167
- """Reset all telemetry context variables. Useful for test isolation."""
168
- set_current_task_run_id(None)
169
- is_root_trace.set(False)