agnt5-0.3.2a1-cp310-abi3-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of agnt5 might be problematic.

agnt5/lm.py ADDED
@@ -0,0 +1,1266 @@
1
+ """Language Model interface for AGNT5 SDK.
2
+
3
+ Simplified API inspired by Vercel AI SDK for seamless multi-provider LLM access.
4
+ Uses Rust-backed implementation via PyO3 for performance and reliability.
5
+
6
+ Basic Usage:
7
+ >>> from agnt5 import lm
8
+ >>>
9
+ >>> # Simple generation
10
+ >>> response = await lm.generate(
11
+ ... model="openai/gpt-4o-mini",
12
+ ... prompt="What is love?",
13
+ ... temperature=0.7
14
+ ... )
15
+ >>> print(response.text)
16
+ >>>
17
+ >>> # Streaming
18
+ >>> async for chunk in lm.stream(
19
+ ... model="anthropic/claude-3-5-haiku",
20
+ ... prompt="Write a story"
21
+ ... ):
22
+ ... print(chunk, end="", flush=True)
23
+
24
+ Supported Providers (via model prefix):
25
+ - openai/model-name
26
+ - anthropic/model-name
27
+ - groq/model-name
28
+ - openrouter/provider/model-name
29
+ - azure/model-name
30
+ - bedrock/model-name
31
+ """
32
+
33
+ from __future__ import annotations
34
+
35
+ import json
+ import logging
36
+ from abc import ABC, abstractmethod
37
+ from dataclasses import dataclass, field
38
+ from enum import Enum
39
+ from typing import Any, AsyncIterator, Dict, List, Optional
40
+
41
+ from ._schema_utils import detect_format_type
42
+ from .context import get_current_context
43
+ from .journal import (
44
+ LMCallStartedEvent,
45
+ LMCallCompletedEvent,
46
+ LMCallFailedEvent,
47
+ write_lm_call_started,
48
+ write_lm_call_completed,
49
+ write_lm_call_failed,
50
+ )
51
+
52
+ try:
53
+ from ._core import LanguageModel as RustLanguageModel
54
+ from ._core import LanguageModelConfig as RustLanguageModelConfig
55
+ from ._core import Response as RustResponse
56
+ from ._core import StreamChunk as RustStreamChunk
57
+ from ._core import AsyncStreamHandle as RustAsyncStreamHandle
58
+ from ._core import Usage as RustUsage
59
+ _RUST_AVAILABLE = True
60
+ except ImportError:
61
+ _RUST_AVAILABLE = False
62
+ RustLanguageModel = None
63
+ RustLanguageModelConfig = None
64
+ RustResponse = None
65
+ RustStreamChunk = None
66
+ RustAsyncStreamHandle = None
67
+ RustUsage = None
68
+
69
+
70
+ # Keep Python classes for backward compatibility and convenience
71
+ class MessageRole(str, Enum):
72
+ """Message role in conversation."""
73
+
74
+ SYSTEM = "system"
75
+ USER = "user"
76
+ ASSISTANT = "assistant"
77
+
78
+
79
+ @dataclass
80
+ class Message:
81
+ """Conversation message."""
82
+
83
+ role: MessageRole
84
+ content: str
85
+ tool_calls: Optional[List[Dict[str, Any]]] = None
86
+ tool_call_id: Optional[str] = None
87
+
88
+ @staticmethod
89
+ def system(content: str) -> Message:
90
+ """Create system message."""
91
+ return Message(role=MessageRole.SYSTEM, content=content)
92
+
93
+ @staticmethod
94
+ def user(content: str) -> Message:
95
+ """Create user message."""
96
+ return Message(role=MessageRole.USER, content=content)
97
+
98
+ @staticmethod
99
+ def assistant(
100
+ content: str = "",
101
+ tool_calls: Optional[List[Dict[str, Any]]] = None,
102
+ ) -> Message:
103
+ """Create assistant message, optionally with tool calls."""
104
+ return Message(role=MessageRole.ASSISTANT, content=content, tool_calls=tool_calls)
105
+
106
+ @staticmethod
107
+ def tool_result(tool_call_id: str, content: str) -> Message:
108
+ """Create tool result message.
109
+
110
+ Args:
111
+ tool_call_id: The ID of the tool call this is a response to
112
+ content: The result of the tool execution
113
+ """
114
+ return Message(
115
+ role=MessageRole.USER, # Tool results are sent as user messages in most APIs
116
+ content=content,
117
+ tool_call_id=tool_call_id,
118
+ )
119
+
120
+
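+ # A minimal sketch of composing a tool-call exchange with the helpers above.
+ # The tool_calls dict layout ("id", "name", "arguments") is illustrative; the
+ # exact keys follow whatever the provider returns.
+ #
+ #     msgs = [
+ #         Message.user("What's the weather in Paris?"),
+ #         Message.assistant(tool_calls=[{"id": "call_1", "name": "get_weather",
+ #                                        "arguments": {"city": "Paris"}}]),
+ #         Message.tool_result("call_1", "18°C and sunny"),
+ #     ]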
121
+ @dataclass
122
+ class ToolDefinition:
123
+ """Tool definition for LLM."""
124
+
125
+ name: str
126
+ description: Optional[str] = None
127
+ parameters: Optional[Dict[str, Any]] = None
128
+
129
+
130
+ class ToolChoice(str, Enum):
131
+ """Tool choice mode."""
132
+
133
+ AUTO = "auto"
134
+ NONE = "none"
135
+ REQUIRED = "required"
136
+
137
+
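+ # A minimal sketch of declaring a tool and requesting it in a GenerateRequest
+ # (defined further below); "get_weather" and its JSON-schema parameters are
+ # illustrative.
+ #
+ #     weather_tool = ToolDefinition(
+ #         name="get_weather",
+ #         description="Look up the current weather for a city",
+ #         parameters={
+ #             "type": "object",
+ #             "properties": {"city": {"type": "string"}},
+ #             "required": ["city"],
+ #         },
+ #     )
+ #     request = GenerateRequest(
+ #         model="openai/gpt-4o-mini",
+ #         messages=[Message.user("Weather in Paris?")],
+ #         tools=[weather_tool],
+ #         tool_choice=ToolChoice.AUTO,
+ #     )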
138
+ class BuiltInTool(str, Enum):
139
+ """Built-in tools for OpenAI Responses API.
140
+
141
+ These are platform-provided tools that don't require implementation:
142
+ - WEB_SEARCH: Real-time web search capability
143
+ - CODE_INTERPRETER: Execute Python code in a sandboxed environment
144
+ - FILE_SEARCH: Search through uploaded files
145
+ """
146
+
147
+ WEB_SEARCH = "web_search_preview"
148
+ CODE_INTERPRETER = "code_interpreter"
149
+ FILE_SEARCH = "file_search"
150
+
151
+
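+ # Sketch: the module-level generate() helper below accepts these values via
+ # its built_in_tools parameter (OpenAI Responses API only).
+ #
+ #     response = await generate(
+ #         model="openai/gpt-4o-mini",
+ #         prompt="Summarize today's top Python news.",
+ #         built_in_tools=[BuiltInTool.WEB_SEARCH],
+ #     )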
152
+ class ReasoningEffort(str, Enum):
153
+ """Reasoning effort level for o-series models (o1, o3, etc.).
154
+
155
+ Controls the amount of reasoning/thinking the model performs:
156
+ - MINIMAL: Fast responses with basic reasoning
157
+ - MEDIUM: Balanced reasoning and speed (default)
158
+ - HIGH: Deep reasoning, slower but more thorough
159
+ """
160
+
161
+ MINIMAL = "minimal"
162
+ MEDIUM = "medium"
163
+ HIGH = "high"
164
+
165
+
166
+ class Modality(str, Enum):
167
+ """Output modalities for multimodal models.
168
+
169
+ Specifies the types of content the model can generate:
170
+ - TEXT: Standard text output
171
+ - AUDIO: Audio output (e.g., for text-to-speech models)
172
+ - IMAGE: Image generation (future capability)
173
+ """
174
+
175
+ TEXT = "text"
176
+ AUDIO = "audio"
177
+ IMAGE = "image"
178
+
179
+
180
+ @dataclass
181
+ class ModelConfig:
182
+ """Advanced model configuration for custom endpoints and settings.
183
+
184
+ Use this for advanced scenarios like custom API endpoints, special headers,
185
+ or overriding default timeouts. Most users won't need this - the basic
186
+ model string with temperature/max_tokens is sufficient for common cases.
187
+
188
+ Example:
189
+ >>> from agnt5.lm import ModelConfig
190
+ >>> from agnt5 import Agent
191
+ >>>
192
+ >>> # Custom API endpoint
193
+ >>> config = ModelConfig(
194
+ ... base_url="https://custom-api.example.com",
195
+ ... api_key="custom-key",
196
+ ... timeout=60,
197
+ ... headers={"X-Custom-Header": "value"}
198
+ ... )
199
+ >>>
200
+ >>> agent = Agent(
201
+ ... name="custom_agent",
202
+ ... model="openai/gpt-4o-mini",
203
+ ... instructions="...",
204
+ ... model_config=config
205
+ ... )
206
+ """
207
+ base_url: Optional[str] = None
208
+ api_key: Optional[str] = None
209
+ timeout: Optional[int] = None
210
+ headers: Optional[Dict[str, str]] = None
211
+
212
+
213
+ @dataclass
214
+ class GenerationConfig:
215
+ """LLM generation configuration.
216
+
217
+ Supports both Chat Completions and Responses API parameters.
218
+ """
219
+
220
+ # Standard parameters (both APIs)
221
+ temperature: Optional[float] = None
222
+ max_tokens: Optional[int] = None
223
+ top_p: Optional[float] = None
224
+
225
+ # Responses API specific parameters
226
+ built_in_tools: List[BuiltInTool] = field(default_factory=list)
227
+ reasoning_effort: Optional[ReasoningEffort] = None
228
+ modalities: Optional[List[Modality]] = None
229
+ store: Optional[bool] = None # Enable server-side conversation state
230
+ previous_response_id: Optional[str] = None # Continue previous conversation
231
+
232
+
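+ # A minimal sketch combining the standard sampling parameters with the
+ # Responses API options above.
+ #
+ #     config = GenerationConfig(
+ #         temperature=0.2,
+ #         max_tokens=1024,
+ #         built_in_tools=[BuiltInTool.WEB_SEARCH],
+ #         reasoning_effort=ReasoningEffort.HIGH,
+ #         store=True,
+ #     )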
233
+ @dataclass
234
+ class TokenUsage:
235
+ """Token usage statistics."""
236
+
237
+ prompt_tokens: int
238
+ completion_tokens: int
239
+ total_tokens: int
240
+
241
+
242
+ @dataclass
243
+ class GenerateResponse:
244
+ """Response from LLM generation."""
245
+
246
+ text: str
247
+ usage: Optional[TokenUsage] = None
248
+ finish_reason: Optional[str] = None
249
+ tool_calls: Optional[List[Dict[str, Any]]] = None
250
+ response_id: Optional[str] = None # Response ID for conversation continuation (Responses API)
251
+ _rust_response: Optional[Any] = field(default=None, repr=False)
252
+
253
+ @property
254
+ def structured_output(self) -> Optional[Any]:
255
+ """Parsed structured output (Pydantic model, dataclass, or dict).
256
+
257
+ Returns the parsed object when response_format is specified.
258
+ This is the recommended property name for accessing structured output.
259
+
260
+ Returns:
261
+ Parsed object according to the specified response_format, or None if not available
262
+ """
263
+ if self._rust_response and hasattr(self._rust_response, 'object'):
264
+ return self._rust_response.object
265
+ return None
266
+
267
+ @property
268
+ def parsed(self) -> Optional[Any]:
269
+ """Alias for structured_output (OpenAI SDK compatibility).
270
+
271
+ Returns:
272
+ Same as structured_output
273
+ """
274
+ return self.structured_output
275
+
276
+ @property
277
+ def object(self) -> Optional[Any]:
278
+ """Alias for structured_output.
279
+
280
+ Returns:
281
+ Same as structured_output
282
+ """
283
+ return self.structured_output
284
+
285
+
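+ # Sketch of reading a structured response; MySchema stands in for any Pydantic
+ # model, dataclass, or JSON-schema dict passed as response_format to the
+ # module-level generate() defined below.
+ #
+ #     response = await generate(
+ #         model="openai/gpt-4o-mini",
+ #         prompt="Extract the invoice fields from this text...",
+ #         response_format=MySchema,
+ #     )
+ #     data = response.structured_output  # same object as response.parsed / response.object
+ #     if response.usage:
+ #         print(response.usage.total_tokens)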
286
+ @dataclass
287
+ class GenerateRequest:
288
+ """Request for LLM generation."""
289
+
290
+ model: str
291
+ messages: List[Message] = field(default_factory=list)
292
+ system_prompt: Optional[str] = None
293
+ tools: List[ToolDefinition] = field(default_factory=list)
294
+ tool_choice: Optional[ToolChoice] = None
295
+ config: GenerationConfig = field(default_factory=GenerationConfig)
296
+ response_schema: Optional[str] = None # JSON-encoded schema for structured output
297
+
298
+
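+ # A minimal sketch of building a GenerateRequest directly; the module-level
+ # generate() and stream() helpers below construct one of these for you.
+ #
+ #     request = GenerateRequest(
+ #         model="anthropic/claude-3-5-haiku",
+ #         messages=[Message.user("Summarize this ticket in two sentences.")],
+ #         system_prompt="You are a concise assistant.",
+ #         config=GenerationConfig(temperature=0.3, max_tokens=512),
+ #     )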
299
+ # Abstract base class for language models
300
+ # This exists primarily for testing/mocking purposes
301
+ class LanguageModel(ABC):
302
+ """Abstract base class for language model implementations.
303
+
304
+ This class defines the interface that all language models must implement.
305
+ It's primarily used for testing and mocking, as production code should use
306
+ the module-level generate() and stream() functions instead.
307
+ """
308
+
309
+ @abstractmethod
310
+ async def generate(self, request: GenerateRequest) -> GenerateResponse:
311
+ """Generate completion from LLM.
312
+
313
+ Args:
314
+ request: Generation request with model, messages, and configuration
315
+
316
+ Returns:
317
+ GenerateResponse with text, usage, and optional tool calls
318
+ """
319
+ pass
320
+
321
+ @abstractmethod
322
+ async def stream(self, request: GenerateRequest) -> AsyncIterator["Event"]:
323
+ """Stream completion from LLM as Event objects.
324
+
325
+ Yields typed Event objects for real-time SSE streaming:
326
+ - lm.message.start: Beginning of message content
327
+ - lm.message.delta: Token chunk with incremental text
328
+ - lm.message.stop: End of message content
329
+
330
+ Args:
331
+ request: Generation request with model, messages, and configuration
332
+
333
+ Yields:
334
+ Event objects for streaming
335
+ """
336
+ pass
337
+
338
+
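+ # A minimal test double, as suggested above (sketch; assumes agnt5.events
+ # exposes Event.message_delta with the signature used later in this module).
+ #
+ #     from agnt5.events import Event
+ #
+ #     class MockLM(LanguageModel):
+ #         async def generate(self, request):
+ #             return GenerateResponse(text="stubbed reply")
+ #
+ #         async def stream(self, request):
+ #             yield Event.message_delta(content="stubbed reply", index=0, sequence=0)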
339
+ # Internal wrapper for the Rust-backed implementation
340
+ # Users should use the module-level generate() and stream() functions instead
341
+ class _LanguageModel(LanguageModel):
342
+ """Internal Language Model wrapper using Rust SDK core.
343
+
344
+ This class is for internal use only. Users should use the module-level
345
+ lm.generate() and lm.stream() functions for a simpler interface.
346
+ """
347
+
348
+ def __init__(
349
+ self,
350
+ provider: Optional[str] = None,
351
+ default_model: Optional[str] = None,
352
+ ):
353
+ """Initialize language model.
354
+
355
+ Args:
356
+ provider: Provider name (e.g., 'openai', 'anthropic', 'azure', 'bedrock', 'groq', 'openrouter')
357
+ If None, provider will be auto-detected from model prefix (e.g., 'openai/gpt-4o')
358
+ default_model: Default model to use if not specified in requests
359
+ """
360
+ if not _RUST_AVAILABLE:
361
+ raise ImportError(
362
+ "Rust extension not available. Please rebuild the SDK with: "
363
+ "cd sdk/sdk-python && maturin develop"
364
+ )
365
+
366
+ self._provider = provider
367
+ self._default_model = default_model
368
+
369
+ # Create config object for Rust
370
+ config = RustLanguageModelConfig(
371
+ default_model=default_model,
372
+ default_provider=provider,
373
+ )
374
+
375
+ self._rust_lm = RustLanguageModel(config=config)
376
+
377
+ def _prepare_model_name(self, model: str) -> str:
378
+ """Prepare model name with provider prefix if needed.
379
+
380
+ Args:
381
+ model: Model name (e.g., 'gpt-4o-mini' or 'openai/gpt-4o-mini')
382
+
383
+ Returns:
384
+ Model name with provider prefix (e.g., 'openai/gpt-4o-mini')
385
+ """
386
+ # If model already has a prefix, return as is
387
+ # This handles cases like OpenRouter where models already have their provider prefix
388
+ # (e.g., 'anthropic/claude-3.5-haiku' for OpenRouter)
389
+ if '/' in model:
390
+ return model
391
+
392
+ # If we have a default provider, prefix the model
393
+ if self._provider:
394
+ return f"{self._provider}/{model}"
395
+
396
+ # Otherwise return as is and let Rust handle the error
397
+ return model
398
+
399
+ async def generate(self, request: GenerateRequest) -> GenerateResponse:
400
+ """Generate completion from LLM.
401
+
402
+ Args:
403
+ request: Generation request with model, messages, and configuration
404
+
405
+ Returns:
406
+ GenerateResponse with text, usage, and optional tool calls
407
+
408
+ Note:
409
+ If memoization is enabled on the current context, this method will
410
+ check the journal for cached results before executing and cache
411
+ results after successful execution.
412
+ """
413
+ # Check for memoization before expensive LLM call
414
+ current_ctx = get_current_context()
415
+ step_key = None
416
+ content_hash = None
417
+
418
+ if current_ctx and hasattr(current_ctx, '_memo') and current_ctx._memo:
419
+ # Generate step_key and content_hash for memoization
420
+ memo = current_ctx._memo
421
+ step_key, content_hash = memo.lm_call_key(
422
+ model=request.model,
423
+ messages=request.messages,
424
+ config={
425
+ "temperature": request.config.temperature,
426
+ "max_tokens": request.config.max_tokens,
427
+ }
428
+ )
429
+
430
+ # Check cache first - skip expensive LLM call if cached
431
+ cached = await memo.get_cached_lm_result(step_key, content_hash)
432
+ if cached:
433
+ logging.getLogger("agnt5.lm").debug(f"LLM call {step_key} served from memoization cache")
434
+ return cached
435
+
436
+ # Convert Python request to structured format for Rust
437
+ prompt = self._build_prompt_messages(request)
438
+
439
+ # Prepare model name with provider prefix
440
+ model = self._prepare_model_name(request.model)
441
+
442
+ # Build kwargs for Rust
443
+ kwargs: dict[str, Any] = {
444
+ "model": model,
445
+ }
446
+
447
+ # Always pass provider explicitly if set
448
+ # For gateway providers like OpenRouter, this allows them to handle
449
+ # models with provider prefixes (e.g., openrouter can handle anthropic/claude-3.5-haiku)
450
+ if self._provider:
451
+ kwargs["provider"] = self._provider
452
+
453
+ # Pass system prompt separately if provided
454
+ if request.system_prompt:
455
+ kwargs["system_prompt"] = request.system_prompt
456
+
457
+ if request.config.temperature is not None:
458
+ kwargs["temperature"] = request.config.temperature
459
+ if request.config.max_tokens is not None:
460
+ kwargs["max_tokens"] = request.config.max_tokens
461
+ if request.config.top_p is not None:
462
+ kwargs["top_p"] = request.config.top_p
463
+
464
+ # Pass response schema for structured output if provided
465
+ if request.response_schema is not None:
466
+ kwargs["response_schema_kw"] = request.response_schema
467
+
468
+ # Pass Responses API specific parameters
469
+ if request.config.built_in_tools:
470
+ # Serialize built-in tools to JSON for Rust
471
+ built_in_tools_list = [tool.value for tool in request.config.built_in_tools]
472
+ kwargs["built_in_tools"] = json.dumps(built_in_tools_list)
473
+
474
+ if request.config.reasoning_effort is not None:
475
+ kwargs["reasoning_effort"] = request.config.reasoning_effort.value
476
+
477
+ if request.config.modalities is not None:
478
+ modalities_list = [modality.value for modality in request.config.modalities]
479
+ kwargs["modalities"] = json.dumps(modalities_list)
480
+
481
+ if request.config.store is not None:
482
+ kwargs["store"] = request.config.store
483
+
484
+ if request.config.previous_response_id is not None:
485
+ kwargs["previous_response_id"] = request.config.previous_response_id
486
+
487
+ # Pass tools and tool_choice to Rust
488
+ if request.tools:
489
+ # Serialize tools to JSON for Rust
490
+ tools_list = [
491
+ {
492
+ "name": tool.name,
493
+ "description": tool.description,
494
+ "parameters": tool.parameters,
495
+ }
496
+ for tool in request.tools
497
+ ]
498
+ tools_json = json.dumps(tools_list)
499
+ kwargs["tools"] = tools_json
500
+
501
+ if request.tool_choice:
502
+ # Serialize tool_choice to JSON for Rust
503
+ kwargs["tool_choice"] = json.dumps(request.tool_choice.value)
504
+
505
+ # Pass runtime_context for proper trace linking
506
+ # Try to get from current context if available
507
+ current_ctx = get_current_context()
508
+ if current_ctx and hasattr(current_ctx, '_runtime_context') and current_ctx._runtime_context:
509
+ kwargs["runtime_context"] = current_ctx._runtime_context
510
+
511
+ # Emit checkpoint if called within a workflow context
512
+ from .context import get_workflow_context
513
+ import time
514
+ workflow_ctx = get_workflow_context()
515
+
516
+ # Get trace context for event linkage using AGNT5's tracing system
517
+ trace_id = None
518
+ span_id = None
519
+ try:
520
+ from .tracing import get_current_span_info
521
+ span_info = get_current_span_info()
522
+ if span_info:
523
+ trace_id = span_info.trace_id
524
+ span_id = span_info.span_id
525
+ except Exception as e:
526
+ import logging
527
+ logging.getLogger("agnt5.lm").warning(f"🔍 LM-DEBUG: Failed to get span info: {e}")
528
+
529
+ # Get run_id for journal events - use runtime_context.run_id (base invocation_id)
530
+ # NOT current_ctx.run_id which may have :agent:name suffix
531
+ run_id = None
532
+ if current_ctx and hasattr(current_ctx, '_runtime_context') and current_ctx._runtime_context:
533
+ run_id = current_ctx._runtime_context.run_id
534
+ tenant_id = None # TODO: Get from context when available
535
+
536
+ # Track start time for latency calculation (nanoseconds for precision)
537
+ start_time_ns = time.time_ns()
538
+
539
+ # Write journal event for LLM observability (in addition to checkpoint for streaming)
540
+ if run_id and trace_id and span_id:
541
+ started_event = LMCallStartedEvent(
542
+ model=model,
543
+ provider=self._provider or "unknown",
544
+ temperature=request.config.temperature,
545
+ max_tokens=request.config.max_tokens,
546
+ tools_count=len(request.tools) if request.tools else 0,
547
+ timestamp_ns=start_time_ns,
548
+ )
549
+ await write_lm_call_started(
550
+ run_id=run_id,
551
+ trace_id=trace_id,
552
+ span_id=span_id,
553
+ event=started_event,
554
+ tenant_id=tenant_id,
555
+ )
556
+ # Emit checkpoint for real-time streaming (separate from journal)
557
+ if workflow_ctx:
558
+ event_data = {
559
+ "model": model,
560
+ "provider": self._provider,
561
+ "timestamp": time.time_ns() // 1_000_000,
562
+ }
563
+ if trace_id:
564
+ event_data["trace_id"] = trace_id
565
+ event_data["span_id"] = span_id
566
+ workflow_ctx._send_checkpoint("lm.call.started", event_data)
567
+
568
+ try:
569
+ # Call Rust implementation - it returns a proper Python coroutine now
570
+ # Using pyo3-async-runtimes for truly async HTTP calls without blocking
571
+ rust_response = await self._rust_lm.generate(prompt=prompt, **kwargs)
572
+
573
+ # Convert Rust response to Python
574
+ response = self._convert_response(rust_response)
575
+
576
+ # Calculate latency (in ms for human readability)
577
+ end_time_ns = time.time_ns()
578
+ latency_ms = (end_time_ns - start_time_ns) // 1_000_000
579
+
580
+ # Write journal event for LLM observability
581
+ if run_id and trace_id and span_id:
582
+ input_tokens = response.usage.prompt_tokens if response.usage else 0
583
+ output_tokens = response.usage.completion_tokens if response.usage else 0
584
+ total_tokens = response.usage.total_tokens if response.usage else 0
585
+
586
+ completed_event = LMCallCompletedEvent(
587
+ model=model,
588
+ provider=self._provider or "unknown",
589
+ input_tokens=input_tokens,
590
+ output_tokens=output_tokens,
591
+ total_tokens=total_tokens,
592
+ latency_ms=latency_ms,
593
+ finish_reason=response.finish_reason,
594
+ tool_calls_count=len(response.tool_calls) if response.tool_calls else 0,
595
+ timestamp_ns=end_time_ns,
596
+ )
597
+ await write_lm_call_completed(
598
+ run_id=run_id,
599
+ trace_id=trace_id,
600
+ span_id=span_id,
601
+ event=completed_event,
602
+ tenant_id=tenant_id,
603
+ )
604
+
605
+ # Emit checkpoint for real-time streaming (separate from journal)
606
+ if workflow_ctx:
607
+ event_data = {
608
+ "model": model,
609
+ "provider": self._provider,
610
+ "timestamp": time.time_ns() // 1_000_000,
611
+ }
612
+ if trace_id:
613
+ event_data["trace_id"] = trace_id
614
+ event_data["span_id"] = span_id
615
+
616
+ # Add token usage if available
617
+ if response.usage:
618
+ event_data["input_tokens"] = response.usage.prompt_tokens
619
+ event_data["output_tokens"] = response.usage.completion_tokens
620
+ event_data["total_tokens"] = response.usage.total_tokens
621
+
622
+ workflow_ctx._send_checkpoint("lm.call.completed", event_data)
623
+
624
+ # Cache result for replay if memoization is enabled
625
+ if current_ctx and hasattr(current_ctx, '_memo') and current_ctx._memo and step_key:
626
+ await current_ctx._memo.cache_lm_result(step_key, content_hash, response)
627
+
628
+ return response
629
+ except Exception as e:
630
+ # Calculate latency for failed call (in ms for human readability)
631
+ end_time_ns = time.time_ns()
632
+ latency_ms = (end_time_ns - start_time_ns) // 1_000_000
633
+
634
+ # Write journal event for LLM failure
635
+ if run_id and trace_id and span_id:
636
+ failed_event = LMCallFailedEvent(
637
+ model=model,
638
+ provider=self._provider or "unknown",
639
+ error_code=type(e).__name__,
640
+ error_message=str(e),
641
+ latency_ms=latency_ms,
642
+ timestamp_ns=end_time_ns,
643
+ )
644
+ await write_lm_call_failed(
645
+ run_id=run_id,
646
+ trace_id=trace_id,
647
+ span_id=span_id,
648
+ event=failed_event,
649
+ tenant_id=tenant_id,
650
+ )
651
+
652
+ # Emit checkpoint for real-time streaming (separate from journal)
653
+ if workflow_ctx:
654
+ event_data = {
655
+ "model": model,
656
+ "provider": self._provider,
657
+ "error": str(e),
658
+ "error_type": type(e).__name__,
659
+ "timestamp": time.time_ns() // 1_000_000,
660
+ }
661
+ if trace_id:
662
+ event_data["trace_id"] = trace_id
663
+ event_data["span_id"] = span_id
664
+ workflow_ctx._send_checkpoint("lm.call.failed", event_data)
665
+ raise
666
+
667
+ async def stream(self, request: GenerateRequest) -> AsyncIterator["Event"]:
668
+ """Stream completion from LLM as Event objects for SSE delivery.
669
+
670
+ This method yields typed Event objects suitable for real-time streaming
671
+ via SSE. It emits content block events following the pattern:
672
+ - lm.message.start / lm.thinking.start: Beginning of content block
673
+ - lm.message.delta / lm.thinking.delta: Token chunk with incremental text
674
+ - lm.message.stop / lm.thinking.stop: End of content block
675
+
676
+ Extended thinking models (Claude with extended thinking) emit thinking blocks
677
+ before text blocks, allowing you to see the model's reasoning process.
678
+
679
+ Args:
680
+ request: Generation request with model, messages, and configuration
681
+
682
+ Yields:
683
+ Event objects for streaming
684
+
685
+ Example:
686
+ ```python
687
+ async for event in lm_instance.stream(request):
688
+ if event.event_type == EventType.LM_MESSAGE_DELTA:
689
+ print(event.data.get("content", ""), end="", flush=True)
690
+ elif event.event_type == EventType.LM_THINKING_DELTA:
691
+ # Handle thinking content (optional)
692
+ pass
693
+ ```
694
+ """
695
+ from .events import Event, EventType
696
+ from .context import get_current_context
697
+
698
+ current_ctx = get_current_context()
699
+
700
+ # Convert Python request to structured format for Rust
701
+ prompt = self._build_prompt_messages(request)
702
+
703
+ # Prepare model name with provider prefix
704
+ model = self._prepare_model_name(request.model)
705
+
706
+ # Build kwargs for Rust
707
+ kwargs: dict[str, Any] = {
708
+ "model": model,
709
+ }
710
+
711
+ # Always pass provider explicitly if set
712
+ if self._provider:
713
+ kwargs["provider"] = self._provider
714
+
715
+ # Pass system prompt separately if provided
716
+ if request.system_prompt:
717
+ kwargs["system_prompt"] = request.system_prompt
718
+
719
+ if request.config.temperature is not None:
720
+ kwargs["temperature"] = request.config.temperature
721
+ if request.config.max_tokens is not None:
722
+ kwargs["max_tokens"] = request.config.max_tokens
723
+ if request.config.top_p is not None:
724
+ kwargs["top_p"] = request.config.top_p
725
+
726
+ # Pass Responses API specific parameters
727
+ if request.config.built_in_tools:
728
+ built_in_tools_list = [tool.value for tool in request.config.built_in_tools]
729
+ kwargs["built_in_tools"] = json.dumps(built_in_tools_list)
730
+
731
+ if request.config.reasoning_effort is not None:
732
+ kwargs["reasoning_effort"] = request.config.reasoning_effort.value
733
+
734
+ if request.config.modalities is not None:
735
+ modalities_list = [modality.value for modality in request.config.modalities]
736
+ kwargs["modalities"] = json.dumps(modalities_list)
737
+
738
+ if request.config.store is not None:
739
+ kwargs["store"] = request.config.store
740
+
741
+ if request.config.previous_response_id is not None:
742
+ kwargs["previous_response_id"] = request.config.previous_response_id
743
+
744
+ # Pass tools and tool_choice to Rust
745
+ if request.tools:
746
+ tools_list = [
747
+ {
748
+ "name": tool.name,
749
+ "description": tool.description,
750
+ "parameters": tool.parameters,
751
+ }
752
+ for tool in request.tools
753
+ ]
754
+ kwargs["tools"] = json.dumps(tools_list)
755
+
756
+ if request.tool_choice:
757
+ kwargs["tool_choice"] = json.dumps(request.tool_choice.value)
758
+
759
+ import time
760
+ sequence = 0
761
+ # Track block types by index since content_block_stop doesn't include block_type
762
+ block_types: Dict[int, str] = {}
763
+
764
+ # Get trace context for journal events
765
+ trace_id = None
766
+ span_id = None
767
+ try:
768
+ from .tracing import get_current_span_info
769
+ span_info = get_current_span_info()
770
+ if span_info:
771
+ trace_id = span_info.trace_id
772
+ span_id = span_info.span_id
773
+ except Exception:
774
+ pass
775
+
776
+ # Get run_id for journal events - use runtime_context.run_id (base invocation_id)
777
+ # NOT current_ctx.run_id which may have :agent:name suffix
778
+ run_id = None
779
+ if current_ctx and hasattr(current_ctx, '_runtime_context') and current_ctx._runtime_context:
780
+ run_id = current_ctx._runtime_context.run_id
781
+ tenant_id = None
782
+
783
+ # Track timing (nanoseconds for precision)
784
+ start_time_ns = time.time_ns()
785
+
786
+ # Write lm.call.started journal event
787
+ if run_id and trace_id and span_id:
788
+ started_event = LMCallStartedEvent(
789
+ model=model,
790
+ provider=self._provider or "unknown",
791
+ temperature=request.config.temperature,
792
+ max_tokens=request.config.max_tokens,
793
+ tools_count=len(request.tools) if request.tools else 0,
794
+ timestamp_ns=start_time_ns,
795
+ )
796
+ await write_lm_call_started(
797
+ run_id=run_id,
798
+ trace_id=trace_id,
799
+ span_id=span_id,
800
+ event=started_event,
801
+ tenant_id=tenant_id,
802
+ )
803
+
804
+ try:
805
+ # Use stream_iter for true async streaming - yields chunks as they arrive
806
+ # instead of collecting all chunks first
807
+ async for chunk in self._rust_lm.stream_iter(prompt=prompt, **kwargs):
808
+ chunk_type = chunk.chunk_type
809
+ block_type = chunk.block_type # "text" or "thinking" (None for stop/completed)
810
+ index = chunk.index if chunk.index is not None else 0
811
+
812
+ if chunk_type == "content_block_start":
813
+ # Track block type for this index
814
+ block_types[index] = block_type or "text"
815
+ # Emit start event based on block type
816
+ if block_type == "thinking":
817
+ yield Event.thinking_start(
818
+ index=index,
819
+ sequence=sequence,
820
+ )
821
+ else:
822
+ yield Event.message_start(
823
+ index=index,
824
+ sequence=sequence,
825
+ )
826
+ sequence += 1
827
+
828
+ elif chunk_type == "delta":
829
+ # Emit delta event based on block type
830
+ if block_type == "thinking":
831
+ yield Event.thinking_delta(
832
+ content=chunk.text,
833
+ index=index,
834
+ sequence=sequence,
835
+ )
836
+ else:
837
+ yield Event.message_delta(
838
+ content=chunk.text,
839
+ index=index,
840
+ sequence=sequence,
841
+ )
842
+ sequence += 1
843
+
844
+ elif chunk_type == "content_block_stop":
845
+ # Look up block type from when we saw content_block_start
846
+ tracked_block_type = block_types.get(index, "text")
847
+ # Emit stop event based on tracked block type
848
+ if tracked_block_type == "thinking":
849
+ yield Event.thinking_stop(
850
+ index=index,
851
+ sequence=sequence,
852
+ )
853
+ else:
854
+ yield Event.message_stop(
855
+ index=index,
856
+ sequence=sequence,
857
+ )
858
+ sequence += 1
859
+
860
+ elif chunk_type == "completed":
861
+ # Final response - emit completion event
862
+ completion_data = {
863
+ "text": chunk.text,
864
+ "model": chunk.model,
865
+ "timestamp": time.time_ns() // 1_000_000,
866
+ }
867
+ if chunk.finish_reason:
868
+ completion_data["finish_reason"] = chunk.finish_reason
869
+ if chunk.usage:
870
+ completion_data["usage"] = {
871
+ "prompt_tokens": chunk.usage.prompt_tokens,
872
+ "completion_tokens": chunk.usage.completion_tokens,
873
+ "total_tokens": chunk.usage.total_tokens,
874
+ }
875
+ yield Event(
876
+ event_type=EventType.LM_STREAM_COMPLETED,
877
+ data=completion_data,
878
+ sequence=sequence,
879
+ )
880
+ sequence += 1
881
+
882
+ # Write lm.call.completed journal event
883
+ if run_id and trace_id and span_id:
884
+ end_time_ns = time.time_ns()
885
+ latency_ms = (end_time_ns - start_time_ns) // 1_000_000
886
+ completed_event = LMCallCompletedEvent(
887
+ model=model,
888
+ provider=self._provider or "unknown",
889
+ input_tokens=chunk.usage.prompt_tokens if chunk.usage else 0,
890
+ output_tokens=chunk.usage.completion_tokens if chunk.usage else 0,
891
+ total_tokens=chunk.usage.total_tokens if chunk.usage else 0,
892
+ latency_ms=latency_ms,
893
+ finish_reason=chunk.finish_reason,
894
+ tool_calls_count=0, # TODO: track tool calls in streaming
895
+ timestamp_ns=end_time_ns,
896
+ )
897
+ await write_lm_call_completed(
898
+ run_id=run_id,
899
+ trace_id=trace_id,
900
+ span_id=span_id,
901
+ event=completed_event,
902
+ tenant_id=tenant_id,
903
+ )
904
+
905
+ except Exception as e:
906
+ # Write lm.call.failed journal event
907
+ if run_id and trace_id and span_id:
908
+ end_time_ns = time.time_ns()
909
+ latency_ms = (end_time_ns - start_time_ns) // 1_000_000
910
+ failed_event = LMCallFailedEvent(
911
+ model=model,
912
+ provider=self._provider or "unknown",
913
+ error_code=type(e).__name__,
914
+ error_message=str(e),
915
+ latency_ms=latency_ms,
916
+ timestamp_ns=end_time_ns,
917
+ )
918
+ await write_lm_call_failed(
919
+ run_id=run_id,
920
+ trace_id=trace_id,
921
+ span_id=span_id,
922
+ event=failed_event,
923
+ tenant_id=tenant_id,
924
+ )
925
+
926
+ # Emit error as a failed event (caller can handle)
927
+ yield Event(
928
+ event_type=EventType.LM_STREAM_FAILED,
929
+ data={
930
+ "error": str(e),
931
+ "error_type": type(e).__name__,
932
+ "timestamp": time.time_ns() // 1_000_000,
933
+ },
934
+ sequence=sequence,
935
+ )
936
+ raise
937
+
938
+ def _build_prompt_messages(self, request: GenerateRequest) -> List[Dict[str, Any]]:
939
+ """Build structured message list for Rust.
940
+
941
+ Rust expects a list of dicts with 'role', 'content', and optional fields:
942
+ - tool_calls: List of tool calls for assistant messages
943
+ - tool_call_id: ID of the tool call this message responds to
944
+ System prompt is passed separately via kwargs.
945
+
946
+ Args:
947
+ request: Generation request with messages
948
+
949
+ Returns:
950
+ List of message dicts with role, content, and optional tool fields
951
+ """
952
+ # Convert messages to Rust format (list of dicts with role, content, and optional fields)
953
+ messages = []
954
+ for msg in request.messages:
955
+ msg_dict: Dict[str, Any] = {
956
+ "role": msg.role.value, # "system", "user", or "assistant"
957
+ "content": msg.content
958
+ }
959
+ # Include tool_calls for assistant messages that have them
960
+ if msg.tool_calls:
961
+ msg_dict["tool_calls"] = msg.tool_calls
962
+ # Include tool_call_id for tool result messages
963
+ if msg.tool_call_id:
964
+ msg_dict["tool_call_id"] = msg.tool_call_id
965
+ messages.append(msg_dict)
966
+
967
+ # If no messages and no system prompt, return a default user message
968
+ if not messages and not request.system_prompt:
969
+ messages.append({
970
+ "role": "user",
971
+ "content": ""
972
+ })
973
+
974
+ return messages
975
+
976
+ def _convert_response(self, rust_response: RustResponse) -> GenerateResponse:
977
+ """Convert Rust response to Python response."""
978
+ usage = None
979
+ if rust_response.usage:
980
+ usage = TokenUsage(
981
+ prompt_tokens=rust_response.usage.prompt_tokens,
982
+ completion_tokens=rust_response.usage.completion_tokens,
983
+ total_tokens=rust_response.usage.total_tokens,
984
+ )
985
+
986
+ # Extract tool_calls from Rust response
987
+ tool_calls = None
988
+ if hasattr(rust_response, 'tool_calls') and rust_response.tool_calls:
989
+ tool_calls = rust_response.tool_calls
990
+
991
+ # Extract response_id from Rust response (for Responses API)
992
+ # PyResponse exposes .id which is the response ID for conversation continuation
993
+ response_id = None
994
+ if hasattr(rust_response, 'id') and rust_response.id:
995
+ response_id = rust_response.id
996
+
997
+ return GenerateResponse(
998
+ text=rust_response.content,
999
+ usage=usage,
1000
+ finish_reason=None, # TODO: Add finish_reason to Rust response
1001
+ tool_calls=tool_calls,
1002
+ response_id=response_id,
1003
+ _rust_response=rust_response, # Store for .structured_output access
1004
+ )
1005
+
1006
+
1007
+ # ============================================================================
1008
+ # Simplified API (Recommended)
1009
+ # ============================================================================
1010
+ # This is the recommended simple interface for most use cases
1011
+
1012
+ async def generate(
1013
+ model: str,
1014
+ prompt: Optional[str] = None,
1015
+ messages: Optional[List[Dict[str, str]]] = None,
1016
+ system_prompt: Optional[str] = None,
1017
+ temperature: Optional[float] = None,
1018
+ max_tokens: Optional[int] = None,
1019
+ top_p: Optional[float] = None,
1020
+ response_format: Optional[Any] = None,
1021
+ # Responses API specific parameters
1022
+ built_in_tools: Optional[List[BuiltInTool]] = None,
1023
+ reasoning_effort: Optional[ReasoningEffort] = None,
1024
+ modalities: Optional[List[Modality]] = None,
1025
+ store: Optional[bool] = None,
1026
+ previous_response_id: Optional[str] = None,
1027
+ ) -> GenerateResponse:
1028
+ """Generate text using any LLM provider (simplified API).
1029
+
1030
+ This is the recommended way to use the LLM API. Provider is auto-detected
1031
+ from the model prefix (e.g., 'openai/gpt-4o-mini', 'anthropic/claude-3-5-haiku').
1032
+
1033
+ Args:
1034
+ model: Model identifier with provider prefix (e.g., 'openai/gpt-4o-mini')
1035
+ prompt: Simple text prompt (for single-turn requests)
1036
+ messages: List of message dicts with 'role' and 'content' (for multi-turn)
1037
+ system_prompt: Optional system prompt
1038
+ temperature: Sampling temperature (0.0-2.0)
1039
+ max_tokens: Maximum tokens to generate
1040
+ top_p: Nucleus sampling parameter
1041
+ response_format: Pydantic model, dataclass, or JSON schema dict for structured output
1042
+ built_in_tools: List of built-in tools (OpenAI Responses API only)
1043
+ reasoning_effort: Reasoning effort level for o-series models (OpenAI Responses API only)
1044
+ modalities: Output modalities (text, audio, image) (OpenAI Responses API only)
1045
+ store: Enable server-side conversation state (OpenAI Responses API only)
1046
+ previous_response_id: Continue from previous response (OpenAI Responses API only)
1047
+
1048
+ Returns:
1049
+ GenerateResponse with text, usage, and optional structured output
1050
+
1051
+ Examples:
1052
+ Simple prompt:
1053
+ >>> response = await generate(
1054
+ ... model="openai/gpt-4o-mini",
1055
+ ... prompt="What is love?",
1056
+ ... temperature=0.7
1057
+ ... )
1058
+ >>> print(response.text)
1059
+
1060
+ Structured output with dataclass:
1061
+ >>> from dataclasses import dataclass
1062
+ >>>
1063
+ >>> @dataclass
1064
+ ... class CodeReview:
1065
+ ... issues: list[str]
1066
+ ... suggestions: list[str]
1067
+ ... overall_quality: int
1068
+ >>>
1069
+ >>> response = await generate(
1070
+ ... model="openai/gpt-4o",
1071
+ ... prompt="Analyze this code...",
1072
+ ... response_format=CodeReview
1073
+ ... )
1074
+ >>> review = response.structured_output # Returns dict
1075
+ """
1076
+ # Validate input
1077
+ if not prompt and not messages:
1078
+ raise ValueError("Either 'prompt' or 'messages' must be provided")
1079
+ if prompt and messages:
1080
+ raise ValueError("Provide either 'prompt' or 'messages', not both")
1081
+
1082
+ # Auto-detect provider from model prefix
1083
+ if '/' not in model:
1084
+ raise ValueError(
1085
+ f"Model must include provider prefix (e.g., 'openai/{model}'). "
1086
+ f"Supported providers: openai, anthropic, groq, openrouter, azure, bedrock"
1087
+ )
1088
+
1089
+ provider, model_name = model.split('/', 1)
1090
+
1091
+ # Convert response_format to JSON schema if provided
1092
+ response_schema_json = None
1093
+ if response_format is not None:
1094
+ format_type, json_schema = detect_format_type(response_format)
1095
+ response_schema_json = json.dumps(json_schema)
1096
+
1097
+ # Create language model client
1098
+ lm = _LanguageModel(provider=provider.lower(), default_model=None)
1099
+
1100
+ # Build messages list
1101
+ if prompt:
1102
+ msg_list = [{"role": "user", "content": prompt}]
1103
+ else:
1104
+ msg_list = messages or []
1105
+
1106
+ # Convert to Message objects for internal API
1107
+ message_objects = []
1108
+ for msg in msg_list:
1109
+ role = MessageRole(msg["role"])
1110
+ if role == MessageRole.USER:
1111
+ message_objects.append(Message.user(msg["content"]))
1112
+ elif role == MessageRole.ASSISTANT:
1113
+ message_objects.append(Message.assistant(msg["content"]))
1114
+ elif role == MessageRole.SYSTEM:
1115
+ message_objects.append(Message.system(msg["content"]))
1116
+
1117
+ # Build request with Responses API parameters
1118
+ config = GenerationConfig(
1119
+ temperature=temperature,
1120
+ max_tokens=max_tokens,
1121
+ top_p=top_p,
1122
+ built_in_tools=built_in_tools or [],
1123
+ reasoning_effort=reasoning_effort,
1124
+ modalities=modalities,
1125
+ store=store,
1126
+ previous_response_id=previous_response_id,
1127
+ )
1128
+
1129
+ request = GenerateRequest(
1130
+ model=model,
1131
+ messages=message_objects,
1132
+ system_prompt=system_prompt,
1133
+ config=config,
1134
+ response_schema=response_schema_json,
1135
+ )
1136
+
1137
+ # Checkpoints are emitted by _LanguageModel.generate() internally
1138
+ # to avoid duplication. No need to emit them here.
1139
+
1140
+ # Generate and return
1141
+ result = await lm.generate(request)
1142
+ return result
1143
+
1144
+
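+ # A short multi-turn sketch with a system prompt, using the messages form:
+ #
+ #     response = await generate(
+ #         model="anthropic/claude-3-5-haiku",
+ #         system_prompt="You are a terse assistant.",
+ #         messages=[
+ #             {"role": "user", "content": "Name a prime number."},
+ #             {"role": "assistant", "content": "7"},
+ #             {"role": "user", "content": "And the next one?"},
+ #         ],
+ #         max_tokens=32,
+ #     )
+ #     print(response.text)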
1145
+ async def stream(
1146
+ model: str,
1147
+ prompt: Optional[str] = None,
1148
+ messages: Optional[List[Dict[str, str]]] = None,
1149
+ system_prompt: Optional[str] = None,
1150
+ temperature: Optional[float] = None,
1151
+ max_tokens: Optional[int] = None,
1152
+ top_p: Optional[float] = None,
1153
+ # Responses API specific parameters
1154
+ built_in_tools: Optional[List[BuiltInTool]] = None,
1155
+ reasoning_effort: Optional[ReasoningEffort] = None,
1156
+ modalities: Optional[List[Modality]] = None,
1157
+ store: Optional[bool] = None,
1158
+ previous_response_id: Optional[str] = None,
1159
+ ) -> AsyncIterator["Event"]:
1160
+ """Stream LLM completion as Event objects (simplified API).
1161
+
1162
+ This is the recommended way to use streaming. Provider is auto-detected
1163
+ from the model prefix (e.g., 'openai/gpt-4o-mini', 'anthropic/claude-3-5-haiku').
1164
+
1165
+ Yields Event objects for real-time SSE streaming:
1166
+ - lm.message.start: Beginning of message content
1167
+ - lm.message.delta: Token chunk with incremental text
1168
+ - lm.message.stop: End of message content
1169
+
1170
+ Args:
1171
+ model: Model identifier with provider prefix (e.g., 'openai/gpt-4o-mini')
1172
+ prompt: Simple text prompt (for single-turn requests)
1173
+ messages: List of message dicts with 'role' and 'content' (for multi-turn)
1174
+ system_prompt: Optional system prompt
1175
+ temperature: Sampling temperature (0.0-2.0)
1176
+ max_tokens: Maximum tokens to generate
1177
+ top_p: Nucleus sampling parameter
1178
+ built_in_tools: List of built-in tools (OpenAI Responses API only)
1179
+ reasoning_effort: Reasoning effort level for o-series models (OpenAI Responses API only)
1180
+ modalities: Output modalities (text, audio, image) (OpenAI Responses API only)
1181
+ store: Enable server-side conversation state (OpenAI Responses API only)
1182
+ previous_response_id: Continue from previous response (OpenAI Responses API only)
1183
+
1184
+ Yields:
1185
+ Event objects for streaming
1186
+
1187
+ Examples:
1188
+ Simple streaming:
1189
+ >>> from agnt5.events import EventType
1190
+ >>> async for event in stream(
1191
+ ... model="openai/gpt-4o-mini",
1192
+ ... prompt="Write a story"
1193
+ ... ):
1194
+ ... if event.event_type == EventType.LM_MESSAGE_DELTA:
1195
+ ... print(event.data.get("content", ""), end="", flush=True)
1196
+
1197
+ Streaming conversation:
1198
+ >>> async for event in stream(
1199
+ ... model="groq/llama-3.3-70b-versatile",
1200
+ ... messages=[{"role": "user", "content": "Tell me a joke"}],
1201
+ ... temperature=0.9
1202
+ ... ):
1203
+ ... if event.event_type == EventType.LM_MESSAGE_DELTA:
1204
+ ... print(event.data.get("content", ""), end="")
1205
+ """
1206
+ from .events import Event
1207
+ # Validate input
1208
+ if not prompt and not messages:
1209
+ raise ValueError("Either 'prompt' or 'messages' must be provided")
1210
+ if prompt and messages:
1211
+ raise ValueError("Provide either 'prompt' or 'messages', not both")
1212
+
1213
+ # Auto-detect provider from model prefix
1214
+ if '/' not in model:
1215
+ raise ValueError(
1216
+ f"Model must include provider prefix (e.g., 'openai/{model}'). "
1217
+ f"Supported providers: openai, anthropic, groq, openrouter, azure, bedrock"
1218
+ )
1219
+
1220
+ provider, model_name = model.split('/', 1)
1221
+
1222
+ # Create language model client
1223
+ lm = _LanguageModel(provider=provider.lower(), default_model=None)
1224
+
1225
+ # Build messages list
1226
+ if prompt:
1227
+ msg_list = [{"role": "user", "content": prompt}]
1228
+ else:
1229
+ msg_list = messages or []
1230
+
1231
+ # Convert to Message objects for internal API
1232
+ message_objects = []
1233
+ for msg in msg_list:
1234
+ role = MessageRole(msg["role"])
1235
+ if role == MessageRole.USER:
1236
+ message_objects.append(Message.user(msg["content"]))
1237
+ elif role == MessageRole.ASSISTANT:
1238
+ message_objects.append(Message.assistant(msg["content"]))
1239
+ elif role == MessageRole.SYSTEM:
1240
+ message_objects.append(Message.system(msg["content"]))
1241
+
1242
+ # Build request with Responses API parameters
1243
+ config = GenerationConfig(
1244
+ temperature=temperature,
1245
+ max_tokens=max_tokens,
1246
+ top_p=top_p,
1247
+ built_in_tools=built_in_tools or [],
1248
+ reasoning_effort=reasoning_effort,
1249
+ modalities=modalities,
1250
+ store=store,
1251
+ previous_response_id=previous_response_id,
1252
+ )
1253
+
1254
+ request = GenerateRequest(
1255
+ model=model,
1256
+ messages=message_objects,
1257
+ system_prompt=system_prompt,
1258
+ config=config,
1259
+ )
1260
+
1261
+ # Events are emitted by _LanguageModel.stream() internally
1262
+ # (lm.stream.started/completed/failed with trace linkage)
1263
+
1264
+ # Stream and yield chunks
1265
+ async for chunk in lm.stream(request):
1266
+ yield chunk
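+ # A sketch of consuming the stream above: collect delta text and pick up the
+ # final usage payload from the completed event.
+ #
+ #     from agnt5.events import EventType
+ #
+ #     parts, usage = [], None
+ #     async for event in stream(model="openai/gpt-4o-mini", prompt="Write a haiku"):
+ #         if event.event_type == EventType.LM_MESSAGE_DELTA:
+ #             parts.append(event.data.get("content", ""))
+ #         elif event.event_type == EventType.LM_STREAM_COMPLETED:
+ #             usage = event.data.get("usage")
+ #     print("".join(parts), usage)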