agnt5-0.3.0a8-cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of agnt5 might be problematic.

agnt5/lm.py ADDED
@@ -0,0 +1,1033 @@
1
+ """Language Model interface for AGNT5 SDK.
2
+
3
+ Simplified API inspired by Vercel AI SDK for seamless multi-provider LLM access.
4
+ Uses Rust-backed implementation via PyO3 for performance and reliability.
5
+
6
+ Basic Usage:
7
+ >>> from agnt5 import lm
8
+ >>>
9
+ >>> # Simple generation
10
+ >>> response = await lm.generate(
11
+ ... model="openai/gpt-4o-mini",
12
+ ... prompt="What is love?",
13
+ ... temperature=0.7
14
+ ... )
15
+ >>> print(response.text)
16
+ >>>
17
+ >>> # Streaming
18
+ >>> async for chunk in lm.stream(
19
+ ... model="anthropic/claude-3-5-haiku",
20
+ ... prompt="Write a story"
21
+ ... ):
22
+ ... print(chunk, end="", flush=True)
23
+
24
+ Supported Providers (via model prefix):
25
+ - openai/model-name
26
+ - anthropic/model-name
27
+ - groq/model-name
28
+ - openrouter/provider/model-name
29
+ - azure/model-name
30
+ - bedrock/model-name
31
+ """
32
+
33
+ from __future__ import annotations
34
+
35
+ import json
36
+ from abc import ABC, abstractmethod
37
+ from dataclasses import dataclass, field
38
+ from enum import Enum
39
+ from typing import Any, AsyncIterator, Dict, List, Optional
40
+
41
+ from ._schema_utils import detect_format_type
42
+ from .context import get_current_context
43
+
44
+ try:
45
+ from ._core import LanguageModel as RustLanguageModel
46
+ from ._core import LanguageModelConfig as RustLanguageModelConfig
47
+ from ._core import Response as RustResponse
48
+ from ._core import StreamChunk as RustStreamChunk
49
+ from ._core import AsyncStreamHandle as RustAsyncStreamHandle
50
+ from ._core import Usage as RustUsage
51
+ _RUST_AVAILABLE = True
52
+ except ImportError:
53
+ _RUST_AVAILABLE = False
54
+ RustLanguageModel = None
55
+ RustLanguageModelConfig = None
56
+ RustResponse = None
57
+ RustStreamChunk = None
58
+ RustAsyncStreamHandle = None
59
+ RustUsage = None
60
+
61
+
62
+ # Keep Python classes for backward compatibility and convenience
63
+ class MessageRole(str, Enum):
64
+ """Message role in conversation."""
65
+
66
+ SYSTEM = "system"
67
+ USER = "user"
68
+ ASSISTANT = "assistant"
69
+
70
+
71
+ @dataclass
72
+ class Message:
73
+ """Conversation message."""
74
+
75
+ role: MessageRole
76
+ content: str
77
+
78
+ @staticmethod
79
+ def system(content: str) -> Message:
80
+ """Create system message."""
81
+ return Message(role=MessageRole.SYSTEM, content=content)
82
+
83
+ @staticmethod
84
+ def user(content: str) -> Message:
85
+ """Create user message."""
86
+ return Message(role=MessageRole.USER, content=content)
87
+
88
+ @staticmethod
89
+ def assistant(content: str) -> Message:
90
+ """Create assistant message."""
91
+ return Message(role=MessageRole.ASSISTANT, content=content)
92
+
93
+
94
+ @dataclass
95
+ class ToolDefinition:
96
+ """Tool definition for LLM."""
97
+
98
+ name: str
99
+ description: Optional[str] = None
100
+ parameters: Optional[Dict[str, Any]] = None
101
+
102
+
103
+ class ToolChoice(str, Enum):
104
+ """Tool choice mode."""
105
+
106
+ AUTO = "auto"
107
+ NONE = "none"
108
+ REQUIRED = "required"
109
+
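+ # Illustrative sketch: declaring a callable tool for a GenerateRequest (defined
+ # later in this file); requests typically pair tools with ToolChoice.AUTO. The
+ # name, description, and JSON-Schema parameters below are hypothetical placeholders.
+ def _example_tool_definition() -> ToolDefinition:
+     return ToolDefinition(
+         name="get_weather",
+         description="Look up the current weather for a city.",
+         parameters={
+             "type": "object",
+             "properties": {"city": {"type": "string"}},
+             "required": ["city"],
+         },
+     )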
110
+
111
+ class BuiltInTool(str, Enum):
112
+ """Built-in tools for OpenAI Responses API.
113
+
114
+ These are platform-provided tools that don't require implementation:
115
+ - WEB_SEARCH: Real-time web search capability
116
+ - CODE_INTERPRETER: Execute Python code in a sandboxed environment
117
+ - FILE_SEARCH: Search through uploaded files
118
+ """
119
+
120
+ WEB_SEARCH = "web_search_preview"
121
+ CODE_INTERPRETER = "code_interpreter"
122
+ FILE_SEARCH = "file_search"
123
+
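+ # Illustrative sketch: requesting a built-in tool through the module-level
+ # generate() helper defined later in this module. Assumes OpenAI credentials are
+ # configured in the environment and that the chosen model supports the
+ # Responses API built-in tools listed above.
+ async def _example_built_in_tool() -> str:
+     response = await generate(
+         model="openai/gpt-4o-mini",
+         prompt="Summarize today's top AI headlines.",
+         built_in_tools=[BuiltInTool.WEB_SEARCH],
+     )
+     return response.text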
124
+
125
+ class ReasoningEffort(str, Enum):
126
+ """Reasoning effort level for o-series models (o1, o3, etc.).
127
+
128
+ Controls the amount of reasoning/thinking the model performs:
129
+ - MINIMAL: Fast responses with basic reasoning
130
+ - MEDIUM: Balanced reasoning and speed (default)
131
+ - HIGH: Deep reasoning, slower but more thorough
132
+ """
133
+
134
+ MINIMAL = "minimal"
135
+ MEDIUM = "medium"
136
+ HIGH = "high"
137
+
138
+
139
+ class Modality(str, Enum):
140
+ """Output modalities for multimodal models.
141
+
142
+ Specifies the types of content the model can generate:
143
+ - TEXT: Standard text output
144
+ - AUDIO: Audio output (e.g., for text-to-speech models)
145
+ - IMAGE: Image generation (future capability)
146
+ """
147
+
148
+ TEXT = "text"
149
+ AUDIO = "audio"
150
+ IMAGE = "image"
151
+
152
+
153
+ @dataclass
154
+ class ModelConfig:
155
+ """Advanced model configuration for custom endpoints and settings.
156
+
157
+ Use this for advanced scenarios like custom API endpoints, special headers,
158
+ or overriding default timeouts. Most users won't need this; the basic
159
+ model string with temperature/max_tokens is sufficient for common cases.
160
+
161
+ Example:
162
+ >>> from agnt5.lm import ModelConfig
163
+ >>> from agnt5 import Agent
164
+ >>>
165
+ >>> # Custom API endpoint
166
+ >>> config = ModelConfig(
167
+ ... base_url="https://custom-api.example.com",
168
+ ... api_key="custom-key",
169
+ ... timeout=60,
170
+ ... headers={"X-Custom-Header": "value"}
171
+ ... )
172
+ >>>
173
+ >>> agent = Agent(
174
+ ... name="custom_agent",
175
+ ... model="openai/gpt-4o-mini",
176
+ ... instructions="...",
177
+ ... model_config=config
178
+ ... )
179
+ """
180
+ base_url: Optional[str] = None
181
+ api_key: Optional[str] = None
182
+ timeout: Optional[int] = None
183
+ headers: Optional[Dict[str, str]] = None
184
+
185
+
186
+ @dataclass
187
+ class GenerationConfig:
188
+ """LLM generation configuration.
189
+
190
+ Supports both Chat Completions and Responses API parameters.
191
+ """
192
+
193
+ # Standard parameters (both APIs)
194
+ temperature: Optional[float] = None
195
+ max_tokens: Optional[int] = None
196
+ top_p: Optional[float] = None
197
+
198
+ # Responses API specific parameters
199
+ built_in_tools: List[BuiltInTool] = field(default_factory=list)
200
+ reasoning_effort: Optional[ReasoningEffort] = None
201
+ modalities: Optional[List[Modality]] = None
202
+ store: Optional[bool] = None # Enable server-side conversation state
203
+ previous_response_id: Optional[str] = None # Continue previous conversation
204
+
205
+
206
+ @dataclass
207
+ class TokenUsage:
208
+ """Token usage statistics."""
209
+
210
+ prompt_tokens: int
211
+ completion_tokens: int
212
+ total_tokens: int
213
+
214
+
215
+ @dataclass
216
+ class GenerateResponse:
217
+ """Response from LLM generation."""
218
+
219
+ text: str
220
+ usage: Optional[TokenUsage] = None
221
+ finish_reason: Optional[str] = None
222
+ tool_calls: Optional[List[Dict[str, Any]]] = None
223
+ response_id: Optional[str] = None # Response ID for conversation continuation (Responses API)
224
+ _rust_response: Optional[Any] = field(default=None, repr=False)
225
+
226
+ @property
227
+ def structured_output(self) -> Optional[Any]:
228
+ """Parsed structured output (Pydantic model, dataclass, or dict).
229
+
230
+ Returns the parsed object when response_format is specified.
231
+ This is the recommended property name for accessing structured output.
232
+
233
+ Returns:
234
+ Parsed object according to the specified response_format, or None if not available
235
+ """
236
+ if self._rust_response and hasattr(self._rust_response, 'object'):
237
+ return self._rust_response.object
238
+ return None
239
+
240
+ @property
241
+ def parsed(self) -> Optional[Any]:
242
+ """Alias for structured_output (OpenAI SDK compatibility).
243
+
244
+ Returns:
245
+ Same as structured_output
246
+ """
247
+ return self.structured_output
248
+
249
+ @property
250
+ def object(self) -> Optional[Any]:
251
+ """Alias for structured_output.
252
+
253
+ Returns:
254
+ Same as structured_output
255
+ """
256
+ return self.structured_output
257
+
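+ # Illustrative sketch: requesting structured output and reading it back through
+ # the structured_output / parsed / object aliases above. The Invoice dataclass is
+ # a hypothetical schema; generate() is the module-level helper defined below, and
+ # provider credentials are assumed to be configured in the environment.
+ async def _example_structured_output() -> Optional[Any]:
+     @dataclass
+     class Invoice:
+         vendor: str
+         total: float
+
+     response = await generate(
+         model="openai/gpt-4o",
+         prompt="Extract the vendor and total from: ACME Corp, $42.50",
+         response_format=Invoice,
+     )
+     # structured_output, parsed, and object all return the same parsed value.
+     return response.structured_output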
258
+
259
+ @dataclass
260
+ class GenerateRequest:
261
+ """Request for LLM generation."""
262
+
263
+ model: str
264
+ messages: List[Message] = field(default_factory=list)
265
+ system_prompt: Optional[str] = None
266
+ tools: List[ToolDefinition] = field(default_factory=list)
267
+ tool_choice: Optional[ToolChoice] = None
268
+ config: GenerationConfig = field(default_factory=GenerationConfig)
269
+ response_schema: Optional[str] = None # JSON-encoded schema for structured output
270
+
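+ # Illustrative sketch: composing a multi-turn GenerateRequest with the Message
+ # helpers and GenerationConfig defined above. The model name and message text are
+ # placeholders; most callers should prefer the module-level generate()/stream().
+ def _example_generate_request() -> GenerateRequest:
+     return GenerateRequest(
+         model="anthropic/claude-3-5-haiku",
+         messages=[
+             Message.user("List the supported providers."),
+             Message.assistant("openai, anthropic, groq, openrouter, azure, bedrock."),
+             Message.user("Which ones support streaming?"),
+         ],
+         system_prompt="You are a concise assistant.",
+         config=GenerationConfig(temperature=0.3, max_tokens=256),
+     )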
271
+
272
+ # Abstract base class for language models
273
+ # This exists primarily for testing/mocking purposes
274
+ class LanguageModel(ABC):
275
+ """Abstract base class for language model implementations.
276
+
277
+ This class defines the interface that all language models must implement.
278
+ It's primarily used for testing and mocking, as production code should use
279
+ the module-level generate() and stream() functions instead.
280
+ """
281
+
282
+ @abstractmethod
283
+ async def generate(self, request: GenerateRequest) -> GenerateResponse:
284
+ """Generate completion from LLM.
285
+
286
+ Args:
287
+ request: Generation request with model, messages, and configuration
288
+
289
+ Returns:
290
+ GenerateResponse with text, usage, and optional tool calls
291
+ """
292
+ pass
293
+
294
+ @abstractmethod
295
+ async def stream(self, request: GenerateRequest) -> AsyncIterator["Event"]:
296
+ """Stream completion from LLM as Event objects.
297
+
298
+ Yields typed Event objects for real-time SSE streaming:
299
+ - lm.message.start: Beginning of message content
300
+ - lm.message.delta: Token chunk with incremental text
301
+ - lm.message.stop: End of message content
302
+
303
+ Args:
304
+ request: Generation request with model, messages, and configuration
305
+
306
+ Yields:
307
+ Event objects for streaming
308
+ """
309
+ pass
310
+
311
+
312
+ # Internal wrapper for the Rust-backed implementation
313
+ # Users should use the module-level generate() and stream() functions instead
314
+ class _LanguageModel(LanguageModel):
315
+ """Internal Language Model wrapper using Rust SDK core.
316
+
317
+ This class is for internal use only. Users should use the module-level
318
+ lm.generate() and lm.stream() functions for a simpler interface.
319
+ """
320
+
321
+ def __init__(
322
+ self,
323
+ provider: Optional[str] = None,
324
+ default_model: Optional[str] = None,
325
+ ):
326
+ """Initialize language model.
327
+
328
+ Args:
329
+ provider: Provider name (e.g., 'openai', 'anthropic', 'azure', 'bedrock', 'groq', 'openrouter')
330
+ If None, provider will be auto-detected from model prefix (e.g., 'openai/gpt-4o')
331
+ default_model: Default model to use if not specified in requests
332
+ """
333
+ if not _RUST_AVAILABLE:
334
+ raise ImportError(
335
+ "Rust extension not available. Please rebuild the SDK with: "
336
+ "cd sdk/sdk-python && maturin develop"
337
+ )
338
+
339
+ self._provider = provider
340
+ self._default_model = default_model
341
+
342
+ # Create config object for Rust
343
+ config = RustLanguageModelConfig(
344
+ default_model=default_model,
345
+ default_provider=provider,
346
+ )
347
+
348
+ self._rust_lm = RustLanguageModel(config=config)
349
+
350
+ def _prepare_model_name(self, model: str) -> str:
351
+ """Prepare model name with provider prefix if needed.
352
+
353
+ Args:
354
+ model: Model name (e.g., 'gpt-4o-mini' or 'openai/gpt-4o-mini')
355
+
356
+ Returns:
357
+ Model name with provider prefix (e.g., 'openai/gpt-4o-mini')
358
+ """
359
+ # If model already has a prefix, return as is
360
+ # This handles cases like OpenRouter where models already have their provider prefix
361
+ # (e.g., 'anthropic/claude-3.5-haiku' for OpenRouter)
362
+ if '/' in model:
363
+ return model
364
+
365
+ # If we have a default provider, prefix the model
366
+ if self._provider:
367
+ return f"{self._provider}/{model}"
368
+
369
+ # Otherwise return as is and let Rust handle the error
370
+ return model
371
+
372
+ async def generate(self, request: GenerateRequest) -> GenerateResponse:
373
+ """Generate completion from LLM.
374
+
375
+ Args:
376
+ request: Generation request with model, messages, and configuration
377
+
378
+ Returns:
379
+ GenerateResponse with text, usage, and optional tool calls
380
+ """
381
+ # Convert Python request to structured format for Rust
382
+ prompt = self._build_prompt_messages(request)
383
+
384
+ # Prepare model name with provider prefix
385
+ model = self._prepare_model_name(request.model)
386
+
387
+ # Build kwargs for Rust
388
+ kwargs: dict[str, Any] = {
389
+ "model": model,
390
+ }
391
+
392
+ # Always pass provider explicitly if set
393
+ # For gateway providers like OpenRouter, this allows them to handle
394
+ # models with provider prefixes (e.g., openrouter can handle anthropic/claude-3.5-haiku)
395
+ if self._provider:
396
+ kwargs["provider"] = self._provider
397
+
398
+ # Pass system prompt separately if provided
399
+ if request.system_prompt:
400
+ kwargs["system_prompt"] = request.system_prompt
401
+
402
+ if request.config.temperature is not None:
403
+ kwargs["temperature"] = request.config.temperature
404
+ if request.config.max_tokens is not None:
405
+ kwargs["max_tokens"] = request.config.max_tokens
406
+ if request.config.top_p is not None:
407
+ kwargs["top_p"] = request.config.top_p
408
+
409
+ # Pass response schema for structured output if provided
410
+ if request.response_schema is not None:
411
+ kwargs["response_schema_kw"] = request.response_schema
412
+
413
+ # Pass Responses API specific parameters
414
+ if request.config.built_in_tools:
415
+ # Serialize built-in tools to JSON for Rust
416
+ built_in_tools_list = [tool.value for tool in request.config.built_in_tools]
417
+ kwargs["built_in_tools"] = json.dumps(built_in_tools_list)
418
+
419
+ if request.config.reasoning_effort is not None:
420
+ kwargs["reasoning_effort"] = request.config.reasoning_effort.value
421
+
422
+ if request.config.modalities is not None:
423
+ modalities_list = [modality.value for modality in request.config.modalities]
424
+ kwargs["modalities"] = json.dumps(modalities_list)
425
+
426
+ if request.config.store is not None:
427
+ kwargs["store"] = request.config.store
428
+
429
+ if request.config.previous_response_id is not None:
430
+ kwargs["previous_response_id"] = request.config.previous_response_id
431
+
432
+ # Pass tools and tool_choice to Rust
433
+ if request.tools:
434
+ # Serialize tools to JSON for Rust
435
+ tools_list = [
436
+ {
437
+ "name": tool.name,
438
+ "description": tool.description,
439
+ "parameters": tool.parameters,
440
+ }
441
+ for tool in request.tools
442
+ ]
443
+ tools_json = json.dumps(tools_list)
444
+ kwargs["tools"] = tools_json
445
+
446
+ if request.tool_choice:
447
+ # Serialize tool_choice to JSON for Rust
448
+ kwargs["tool_choice"] = json.dumps(request.tool_choice.value)
449
+
450
+ # Pass runtime_context for proper trace linking
451
+ # Try to get from current context if available
452
+ current_ctx = get_current_context()
453
+ if current_ctx and hasattr(current_ctx, '_runtime_context') and current_ctx._runtime_context:
454
+ kwargs["runtime_context"] = current_ctx._runtime_context
455
+
456
+ # Emit checkpoint if called within a workflow context
457
+ from .context import get_workflow_context
458
+ import time
459
+ workflow_ctx = get_workflow_context()
460
+
461
+ # Get trace context for event linkage
462
+ trace_id = None
463
+ span_id = None
464
+ try:
465
+ from opentelemetry import trace
466
+ span = trace.get_current_span()
467
+ if span.is_recording():
468
+ span_context = span.get_span_context()
469
+ trace_id = format(span_context.trace_id, '032x')
470
+ span_id = format(span_context.span_id, '016x')
471
+ except Exception:
472
+ pass # Tracing not available, continue without
473
+
474
+ # Emit started event (trace_id is optional - emit even without tracing)
475
+ if workflow_ctx:
476
+ event_data = {
477
+ "model": model,
478
+ "provider": self._provider,
479
+ "timestamp": time.time_ns() // 1_000_000,
480
+ }
481
+ if trace_id:
482
+ event_data["trace_id"] = trace_id
483
+ event_data["span_id"] = span_id
484
+ workflow_ctx._send_checkpoint("lm.call.started", event_data)
485
+
486
+ try:
487
+ # Call Rust implementation - it returns a proper Python coroutine now
488
+ # Using pyo3-async-runtimes for truly async HTTP calls without blocking
489
+ rust_response = await self._rust_lm.generate(prompt=prompt, **kwargs)
490
+
491
+ # Convert Rust response to Python
492
+ response = self._convert_response(rust_response)
493
+
494
+ # Emit completion event with token usage and cost
495
+ if workflow_ctx:
496
+ event_data = {
497
+ "model": model,
498
+ "provider": self._provider,
499
+ "timestamp": time.time_ns() // 1_000_000,
500
+ }
501
+ if trace_id:
502
+ event_data["trace_id"] = trace_id
503
+ event_data["span_id"] = span_id
504
+
505
+ # Add token usage if available
506
+ if response.usage:
507
+ event_data["input_tokens"] = response.usage.prompt_tokens
508
+ event_data["output_tokens"] = response.usage.completion_tokens
509
+ event_data["total_tokens"] = response.usage.total_tokens
510
+
511
+ workflow_ctx._send_checkpoint("lm.call.completed", event_data)
512
+
513
+ return response
514
+ except Exception as e:
515
+ # Emit failed event
516
+ if workflow_ctx:
517
+ event_data = {
518
+ "model": model,
519
+ "provider": self._provider,
520
+ "error": str(e),
521
+ "error_type": type(e).__name__,
522
+ "timestamp": time.time_ns() // 1_000_000,
523
+ }
524
+ if trace_id:
525
+ event_data["trace_id"] = trace_id
526
+ event_data["span_id"] = span_id
527
+ workflow_ctx._send_checkpoint("lm.call.failed", event_data)
528
+ raise
529
+
530
+ async def stream(self, request: GenerateRequest) -> AsyncIterator["Event"]:
531
+ """Stream completion from LLM as Event objects for SSE delivery.
532
+
533
+ This method yields typed Event objects suitable for real-time streaming
534
+ via SSE. It emits content block events following the pattern:
535
+ - lm.message.start / lm.thinking.start: Beginning of content block
536
+ - lm.message.delta / lm.thinking.delta: Token chunk with incremental text
537
+ - lm.message.stop / lm.thinking.stop: End of content block
538
+
539
+ Extended thinking models (Claude with extended thinking) emit thinking blocks
540
+ before text blocks, allowing you to see the model's reasoning process.
541
+
542
+ Args:
543
+ request: Generation request with model, messages, and configuration
544
+
545
+ Yields:
546
+ Event objects for streaming
547
+
548
+ Example:
549
+ ```python
550
+ async for event in lm_instance.stream(request):
551
+ if event.event_type == EventType.LM_MESSAGE_DELTA:
552
+ print(event.data.get("content", ""), end="", flush=True)
553
+ elif event.event_type == EventType.LM_THINKING_DELTA:
554
+ # Handle thinking content (optional)
555
+ pass
556
+ ```
557
+ """
558
+ from .events import Event, EventType
559
+
560
+ # Convert Python request to structured format for Rust
561
+ prompt = self._build_prompt_messages(request)
562
+
563
+ # Prepare model name with provider prefix
564
+ model = self._prepare_model_name(request.model)
565
+
566
+ # Build kwargs for Rust
567
+ kwargs: dict[str, Any] = {
568
+ "model": model,
569
+ }
570
+
571
+ # Always pass provider explicitly if set
572
+ if self._provider:
573
+ kwargs["provider"] = self._provider
574
+
575
+ # Pass system prompt separately if provided
576
+ if request.system_prompt:
577
+ kwargs["system_prompt"] = request.system_prompt
578
+
579
+ if request.config.temperature is not None:
580
+ kwargs["temperature"] = request.config.temperature
581
+ if request.config.max_tokens is not None:
582
+ kwargs["max_tokens"] = request.config.max_tokens
583
+ if request.config.top_p is not None:
584
+ kwargs["top_p"] = request.config.top_p
585
+
586
+ # Pass Responses API specific parameters
587
+ if request.config.built_in_tools:
588
+ built_in_tools_list = [tool.value for tool in request.config.built_in_tools]
589
+ kwargs["built_in_tools"] = json.dumps(built_in_tools_list)
590
+
591
+ if request.config.reasoning_effort is not None:
592
+ kwargs["reasoning_effort"] = request.config.reasoning_effort.value
593
+
594
+ if request.config.modalities is not None:
595
+ modalities_list = [modality.value for modality in request.config.modalities]
596
+ kwargs["modalities"] = json.dumps(modalities_list)
597
+
598
+ if request.config.store is not None:
599
+ kwargs["store"] = request.config.store
600
+
601
+ if request.config.previous_response_id is not None:
602
+ kwargs["previous_response_id"] = request.config.previous_response_id
603
+
604
+ # Pass tools and tool_choice to Rust
605
+ if request.tools:
606
+ tools_list = [
607
+ {
608
+ "name": tool.name,
609
+ "description": tool.description,
610
+ "parameters": tool.parameters,
611
+ }
612
+ for tool in request.tools
613
+ ]
614
+ kwargs["tools"] = json.dumps(tools_list)
615
+
616
+ if request.tool_choice:
617
+ kwargs["tool_choice"] = json.dumps(request.tool_choice.value)
618
+
619
+ import time
620
+ sequence = 0
621
+ # Track block types by index since content_block_stop doesn't include block_type
622
+ block_types: Dict[int, str] = {}
623
+
624
+ try:
625
+ # Use stream_iter for true async streaming - yields chunks as they arrive
626
+ # instead of collecting all chunks first
627
+ async for chunk in self._rust_lm.stream_iter(prompt=prompt, **kwargs):
628
+ chunk_type = chunk.chunk_type
629
+ block_type = chunk.block_type # "text" or "thinking" (None for stop/completed)
630
+ index = chunk.index if chunk.index is not None else 0
631
+
632
+ if chunk_type == "content_block_start":
633
+ # Track block type for this index
634
+ block_types[index] = block_type or "text"
635
+ # Emit start event based on block type
636
+ if block_type == "thinking":
637
+ yield Event.thinking_start(
638
+ index=index,
639
+ sequence=sequence,
640
+ )
641
+ else:
642
+ yield Event.message_start(
643
+ index=index,
644
+ sequence=sequence,
645
+ )
646
+ sequence += 1
647
+
648
+ elif chunk_type == "delta":
649
+ # Emit delta event based on block type
650
+ if block_type == "thinking":
651
+ yield Event.thinking_delta(
652
+ content=chunk.text,
653
+ index=index,
654
+ sequence=sequence,
655
+ )
656
+ else:
657
+ yield Event.message_delta(
658
+ content=chunk.text,
659
+ index=index,
660
+ sequence=sequence,
661
+ )
662
+ sequence += 1
663
+
664
+ elif chunk_type == "content_block_stop":
665
+ # Look up block type from when we saw content_block_start
666
+ tracked_block_type = block_types.get(index, "text")
667
+ # Emit stop event based on tracked block type
668
+ if tracked_block_type == "thinking":
669
+ yield Event.thinking_stop(
670
+ index=index,
671
+ sequence=sequence,
672
+ )
673
+ else:
674
+ yield Event.message_stop(
675
+ index=index,
676
+ sequence=sequence,
677
+ )
678
+ sequence += 1
679
+
680
+ elif chunk_type == "completed":
681
+ # Final response - emit completion event
682
+ completion_data = {
683
+ "text": chunk.text,
684
+ "model": chunk.model,
685
+ "timestamp": time.time_ns() // 1_000_000,
686
+ }
687
+ if chunk.finish_reason:
688
+ completion_data["finish_reason"] = chunk.finish_reason
689
+ if chunk.usage:
690
+ completion_data["usage"] = {
691
+ "prompt_tokens": chunk.usage.prompt_tokens,
692
+ "completion_tokens": chunk.usage.completion_tokens,
693
+ "total_tokens": chunk.usage.total_tokens,
694
+ }
695
+ yield Event(
696
+ event_type=EventType.LM_STREAM_COMPLETED,
697
+ data=completion_data,
698
+ sequence=sequence,
699
+ )
700
+ sequence += 1
701
+
702
+ except Exception as e:
703
+ # Emit error as a failed event (caller can handle)
704
+ yield Event(
705
+ event_type=EventType.LM_STREAM_FAILED,
706
+ data={
707
+ "error": str(e),
708
+ "error_type": type(e).__name__,
709
+ "timestamp": time.time_ns() // 1_000_000,
710
+ },
711
+ sequence=sequence,
712
+ )
713
+ raise
714
+
715
+ def _build_prompt_messages(self, request: GenerateRequest) -> List[Dict[str, str]]:
716
+ """Build structured message list for Rust.
717
+
718
+ Rust expects a list of dicts with 'role' and 'content' keys.
719
+ System prompt is passed separately via kwargs.
720
+
721
+ Args:
722
+ request: Generation request with messages
723
+
724
+ Returns:
725
+ List of message dicts with role and content
726
+ """
727
+ # Convert messages to Rust format (list of dicts with role and content)
728
+ messages = []
729
+ for msg in request.messages:
730
+ messages.append({
731
+ "role": msg.role.value, # "system", "user", or "assistant"
732
+ "content": msg.content
733
+ })
734
+
735
+ # If no messages and no system prompt, return a default user message
736
+ if not messages and not request.system_prompt:
737
+ messages.append({
738
+ "role": "user",
739
+ "content": ""
740
+ })
741
+
742
+ return messages
743
+
744
+ def _convert_response(self, rust_response: RustResponse) -> GenerateResponse:
745
+ """Convert Rust response to Python response."""
746
+ usage = None
747
+ if rust_response.usage:
748
+ usage = TokenUsage(
749
+ prompt_tokens=rust_response.usage.prompt_tokens,
750
+ completion_tokens=rust_response.usage.completion_tokens,
751
+ total_tokens=rust_response.usage.total_tokens,
752
+ )
753
+
754
+ # Extract tool_calls from Rust response
755
+ tool_calls = None
756
+ if hasattr(rust_response, 'tool_calls') and rust_response.tool_calls:
757
+ tool_calls = rust_response.tool_calls
758
+
759
+ # Extract response_id from Rust response (for Responses API)
760
+ response_id = None
761
+ if hasattr(rust_response, 'response_id') and rust_response.response_id:
762
+ response_id = rust_response.response_id
763
+
764
+ return GenerateResponse(
765
+ text=rust_response.content,
766
+ usage=usage,
767
+ finish_reason=None, # TODO: Add finish_reason to Rust response
768
+ tool_calls=tool_calls,
769
+ response_id=response_id,
770
+ _rust_response=rust_response, # Store for .structured_output access
771
+ )
772
+
773
+
774
+ # ============================================================================
775
+ # Simplified API (Recommended)
776
+ # ============================================================================
777
+ # This is the recommended simple interface for most use cases
778
+
779
+ async def generate(
780
+ model: str,
781
+ prompt: Optional[str] = None,
782
+ messages: Optional[List[Dict[str, str]]] = None,
783
+ system_prompt: Optional[str] = None,
784
+ temperature: Optional[float] = None,
785
+ max_tokens: Optional[int] = None,
786
+ top_p: Optional[float] = None,
787
+ response_format: Optional[Any] = None,
788
+ # Responses API specific parameters
789
+ built_in_tools: Optional[List[BuiltInTool]] = None,
790
+ reasoning_effort: Optional[ReasoningEffort] = None,
791
+ modalities: Optional[List[Modality]] = None,
792
+ store: Optional[bool] = None,
793
+ previous_response_id: Optional[str] = None,
794
+ ) -> GenerateResponse:
795
+ """Generate text using any LLM provider (simplified API).
796
+
797
+ This is the recommended way to use the LLM API. Provider is auto-detected
798
+ from the model prefix (e.g., 'openai/gpt-4o-mini', 'anthropic/claude-3-5-haiku').
799
+
800
+ Args:
801
+ model: Model identifier with provider prefix (e.g., 'openai/gpt-4o-mini')
802
+ prompt: Simple text prompt (for single-turn requests)
803
+ messages: List of message dicts with 'role' and 'content' (for multi-turn)
804
+ system_prompt: Optional system prompt
805
+ temperature: Sampling temperature (0.0-2.0)
806
+ max_tokens: Maximum tokens to generate
807
+ top_p: Nucleus sampling parameter
808
+ response_format: Pydantic model, dataclass, or JSON schema dict for structured output
809
+ built_in_tools: List of built-in tools (OpenAI Responses API only)
810
+ reasoning_effort: Reasoning effort level for o-series models (OpenAI Responses API only)
811
+ modalities: Output modalities (text, audio, image) (OpenAI Responses API only)
812
+ store: Enable server-side conversation state (OpenAI Responses API only)
813
+ previous_response_id: Continue from previous response (OpenAI Responses API only)
814
+
815
+ Returns:
816
+ GenerateResponse with text, usage, and optional structured output
817
+
818
+ Examples:
819
+ Simple prompt:
820
+ >>> response = await generate(
821
+ ... model="openai/gpt-4o-mini",
822
+ ... prompt="What is love?",
823
+ ... temperature=0.7
824
+ ... )
825
+ >>> print(response.text)
826
+
827
+ Structured output with dataclass:
828
+ >>> from dataclasses import dataclass
829
+ >>>
830
+ >>> @dataclass
831
+ ... class CodeReview:
832
+ ... issues: list[str]
833
+ ... suggestions: list[str]
834
+ ... overall_quality: int
835
+ >>>
836
+ >>> response = await generate(
837
+ ... model="openai/gpt-4o",
838
+ ... prompt="Analyze this code...",
839
+ ... response_format=CodeReview
840
+ ... )
841
+ >>> review = response.structured_output # Returns dict
842
+ """
843
+ # Validate input
844
+ if not prompt and not messages:
845
+ raise ValueError("Either 'prompt' or 'messages' must be provided")
846
+ if prompt and messages:
847
+ raise ValueError("Provide either 'prompt' or 'messages', not both")
848
+
849
+ # Auto-detect provider from model prefix
850
+ if '/' not in model:
851
+ raise ValueError(
852
+ f"Model must include provider prefix (e.g., 'openai/{model}'). "
853
+ f"Supported providers: openai, anthropic, groq, openrouter, azure, bedrock"
854
+ )
855
+
856
+ provider, model_name = model.split('/', 1)
857
+
858
+ # Convert response_format to JSON schema if provided
859
+ response_schema_json = None
860
+ if response_format is not None:
861
+ format_type, json_schema = detect_format_type(response_format)
862
+ response_schema_json = json.dumps(json_schema)
863
+
864
+ # Create language model client
865
+ lm = _LanguageModel(provider=provider.lower(), default_model=None)
866
+
867
+ # Build messages list
868
+ if prompt:
869
+ msg_list = [{"role": "user", "content": prompt}]
870
+ else:
871
+ msg_list = messages or []
872
+
873
+ # Convert to Message objects for internal API
874
+ message_objects = []
875
+ for msg in msg_list:
876
+ role = MessageRole(msg["role"])
877
+ if role == MessageRole.USER:
878
+ message_objects.append(Message.user(msg["content"]))
879
+ elif role == MessageRole.ASSISTANT:
880
+ message_objects.append(Message.assistant(msg["content"]))
881
+ elif role == MessageRole.SYSTEM:
882
+ message_objects.append(Message.system(msg["content"]))
883
+
884
+ # Build request with Responses API parameters
885
+ config = GenerationConfig(
886
+ temperature=temperature,
887
+ max_tokens=max_tokens,
888
+ top_p=top_p,
889
+ built_in_tools=built_in_tools or [],
890
+ reasoning_effort=reasoning_effort,
891
+ modalities=modalities,
892
+ store=store,
893
+ previous_response_id=previous_response_id,
894
+ )
895
+
896
+ request = GenerateRequest(
897
+ model=model,
898
+ messages=message_objects,
899
+ system_prompt=system_prompt,
900
+ config=config,
901
+ response_schema=response_schema_json,
902
+ )
903
+
904
+ # Checkpoints are emitted by _LanguageModel.generate() internally
905
+ # to avoid duplication. No need to emit them here.
906
+
907
+ # Generate and return
908
+ result = await lm.generate(request)
909
+ return result
910
+
911
+
912
+ async def stream(
913
+ model: str,
914
+ prompt: Optional[str] = None,
915
+ messages: Optional[List[Dict[str, str]]] = None,
916
+ system_prompt: Optional[str] = None,
917
+ temperature: Optional[float] = None,
918
+ max_tokens: Optional[int] = None,
919
+ top_p: Optional[float] = None,
920
+ # Responses API specific parameters
921
+ built_in_tools: Optional[List[BuiltInTool]] = None,
922
+ reasoning_effort: Optional[ReasoningEffort] = None,
923
+ modalities: Optional[List[Modality]] = None,
924
+ store: Optional[bool] = None,
925
+ previous_response_id: Optional[str] = None,
926
+ ) -> AsyncIterator["Event"]:
927
+ """Stream LLM completion as Event objects (simplified API).
928
+
929
+ This is the recommended way to use streaming. Provider is auto-detected
930
+ from the model prefix (e.g., 'openai/gpt-4o-mini', 'anthropic/claude-3-5-haiku').
931
+
932
+ Yields Event objects for real-time SSE streaming:
933
+ - lm.message.start: Beginning of message content
934
+ - lm.message.delta: Token chunk with incremental text
935
+ - lm.message.stop: End of message content
+ - lm.thinking.start/delta/stop: Reasoning blocks from extended thinking models
936
+
937
+ Args:
938
+ model: Model identifier with provider prefix (e.g., 'openai/gpt-4o-mini')
939
+ prompt: Simple text prompt (for single-turn requests)
940
+ messages: List of message dicts with 'role' and 'content' (for multi-turn)
941
+ system_prompt: Optional system prompt
942
+ temperature: Sampling temperature (0.0-2.0)
943
+ max_tokens: Maximum tokens to generate
944
+ top_p: Nucleus sampling parameter
945
+ built_in_tools: List of built-in tools (OpenAI Responses API only)
946
+ reasoning_effort: Reasoning effort level for o-series models (OpenAI Responses API only)
947
+ modalities: Output modalities (text, audio, image) (OpenAI Responses API only)
948
+ store: Enable server-side conversation state (OpenAI Responses API only)
949
+ previous_response_id: Continue from previous response (OpenAI Responses API only)
950
+
951
+ Yields:
952
+ Event objects for streaming
953
+
954
+ Examples:
955
+ Simple streaming:
956
+ >>> from agnt5.events import EventType
957
+ >>> async for event in stream(
958
+ ... model="openai/gpt-4o-mini",
959
+ ... prompt="Write a story"
960
+ ... ):
961
+ ... if event.event_type == EventType.LM_MESSAGE_DELTA:
962
+ ... print(event.data.get("content", ""), end="", flush=True)
963
+
964
+ Streaming conversation:
965
+ >>> async for event in stream(
966
+ ... model="groq/llama-3.3-70b-versatile",
967
+ ... messages=[{"role": "user", "content": "Tell me a joke"}],
968
+ ... temperature=0.9
969
+ ... ):
970
+ ... if event.event_type == EventType.LM_MESSAGE_DELTA:
971
+ ... print(event.data.get("content", ""), end="")
972
+ """
973
+ from .events import Event
974
+ # Validate input
975
+ if not prompt and not messages:
976
+ raise ValueError("Either 'prompt' or 'messages' must be provided")
977
+ if prompt and messages:
978
+ raise ValueError("Provide either 'prompt' or 'messages', not both")
979
+
980
+ # Auto-detect provider from model prefix
981
+ if '/' not in model:
982
+ raise ValueError(
983
+ f"Model must include provider prefix (e.g., 'openai/{model}'). "
984
+ f"Supported providers: openai, anthropic, groq, openrouter, azure, bedrock"
985
+ )
986
+
987
+ provider, model_name = model.split('/', 1)
988
+
989
+ # Create language model client
990
+ lm = _LanguageModel(provider=provider.lower(), default_model=None)
991
+
992
+ # Build messages list
993
+ if prompt:
994
+ msg_list = [{"role": "user", "content": prompt}]
995
+ else:
996
+ msg_list = messages or []
997
+
998
+ # Convert to Message objects for internal API
999
+ message_objects = []
1000
+ for msg in msg_list:
1001
+ role = MessageRole(msg["role"])
1002
+ if role == MessageRole.USER:
1003
+ message_objects.append(Message.user(msg["content"]))
1004
+ elif role == MessageRole.ASSISTANT:
1005
+ message_objects.append(Message.assistant(msg["content"]))
1006
+ elif role == MessageRole.SYSTEM:
1007
+ message_objects.append(Message.system(msg["content"]))
1008
+
1009
+ # Build request with Responses API parameters
1010
+ config = GenerationConfig(
1011
+ temperature=temperature,
1012
+ max_tokens=max_tokens,
1013
+ top_p=top_p,
1014
+ built_in_tools=built_in_tools or [],
1015
+ reasoning_effort=reasoning_effort,
1016
+ modalities=modalities,
1017
+ store=store,
1018
+ previous_response_id=previous_response_id,
1019
+ )
1020
+
1021
+ request = GenerateRequest(
1022
+ model=model,
1023
+ messages=message_objects,
1024
+ system_prompt=system_prompt,
1025
+ config=config,
1026
+ )
1027
+
1028
+ # Events are emitted by _LanguageModel.stream() internally
1029
+ # (content block events plus LM_STREAM_COMPLETED / LM_STREAM_FAILED on finish/error)
1030
+
1031
+ # Stream and yield chunks
1032
+ async for chunk in lm.stream(request):
1033
+ yield chunk
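+
+
+ # Illustrative sketch: collecting streamed text from the stream() helper above.
+ # Event type names follow agnt5.events.EventType as referenced elsewhere in this
+ # module; the model name is a placeholder and provider credentials are assumed.
+ async def _example_stream_to_text() -> str:
+     from .events import EventType
+
+     parts: List[str] = []
+     async for event in stream(
+         model="openai/gpt-4o-mini",
+         prompt="Write a two-line poem about type checkers.",
+     ):
+         # Delta events carry incremental text; the final completed event also
+         # carries finish_reason and token usage when the provider reports them.
+         if event.event_type == EventType.LM_MESSAGE_DELTA:
+             parts.append(event.data.get("content", ""))
+     return "".join(parts)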