dm-aioaiagent 0.6.1__tar.gz → 0.7.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dm_aioaiagent-0.6.1 → dm_aioaiagent-0.7.0}/PKG-INFO +142 -20
- dm_aioaiagent-0.7.0/README.md +328 -0
- dm_aioaiagent-0.7.0/dm_aioaiagent/__init__.py +4 -0
- {dm_aioaiagent-0.6.1 → dm_aioaiagent-0.7.0}/dm_aioaiagent/ai_agent.py +204 -24
- {dm_aioaiagent-0.6.1 → dm_aioaiagent-0.7.0}/dm_aioaiagent/async_ai_agent.py +32 -5
- dm_aioaiagent-0.7.0/dm_aioaiagent/input_image.py +73 -0
- dm_aioaiagent-0.7.0/dm_aioaiagent/output_image.py +60 -0
- {dm_aioaiagent-0.6.1 → dm_aioaiagent-0.7.0}/dm_aioaiagent/types.py +1 -15
- {dm_aioaiagent-0.6.1 → dm_aioaiagent-0.7.0}/dm_aioaiagent.egg-info/PKG-INFO +142 -20
- {dm_aioaiagent-0.6.1 → dm_aioaiagent-0.7.0}/dm_aioaiagent.egg-info/SOURCES.txt +2 -1
- {dm_aioaiagent-0.6.1 → dm_aioaiagent-0.7.0}/dm_aioaiagent.egg-info/requires.txt +3 -0
- {dm_aioaiagent-0.6.1 → dm_aioaiagent-0.7.0}/setup.py +2 -1
- dm_aioaiagent-0.6.1/README.md +0 -208
- dm_aioaiagent-0.6.1/dm_aioaiagent/__init__.py +0 -3
- dm_aioaiagent-0.6.1/dm_aioaiagent/openai_image_message_content.py +0 -15
- {dm_aioaiagent-0.6.1 → dm_aioaiagent-0.7.0}/dm_aioaiagent.egg-info/dependency_links.txt +0 -0
- {dm_aioaiagent-0.6.1 → dm_aioaiagent-0.7.0}/dm_aioaiagent.egg-info/top_level.txt +0 -0
- {dm_aioaiagent-0.6.1 → dm_aioaiagent-0.7.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dm-aioaiagent
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.7.0
|
|
4
4
|
Summary: This is my custom aioaiagent client
|
|
5
5
|
Home-page: https://pypi.org/project/dm-aioaiagent
|
|
6
6
|
Author: dimka4621
|
|
@@ -42,6 +42,8 @@ Requires-Dist: langchain-groq<2.0.0,>=1.0.0; extra == "all"
|
|
|
42
42
|
Requires-Dist: langchain-mistralai<2.0.0,>=1.0.0; extra == "all"
|
|
43
43
|
Requires-Dist: langchain-deepseek<2.0.0,>=1.0.0; extra == "all"
|
|
44
44
|
Requires-Dist: langchain-ollama<2.0.0,>=1.0.0; extra == "all"
|
|
45
|
+
Provides-Extra: test
|
|
46
|
+
Requires-Dist: Pillow<12.0.0,>=10.0.0; extra == "test"
|
|
45
47
|
Dynamic: author
|
|
46
48
|
Dynamic: author-email
|
|
47
49
|
Dynamic: classifier
|
|
@@ -234,32 +236,152 @@ if __name__ == "__main__":
|
|
|
234
236
|
asyncio.run(main())
|
|
235
237
|
```
|
|
236
238
|
|
|
237
|
-
###
|
|
239
|
+
### Working with images — input
|
|
240
|
+
|
|
241
|
+
Use the `InputImage` helper to attach an image to a user message in a way that works **across providers** (OpenAI, Anthropic, Gemini). Each factory returns a ready-to-send `HumanMessage` whose `.content` is a list of LangChain v1 standard content blocks.
|
|
238
242
|
|
|
239
243
|
```python
|
|
240
|
-
from dm_aioaiagent import DMAIAgent,
|
|
244
|
+
from dm_aioaiagent import DMAIAgent, InputImage
|
|
241
245
|
|
|
246
|
+
agent = DMAIAgent(agent_name="image_vision", model="gpt-4o-mini")
|
|
242
247
|
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
ai_agent = DMAIAgent(agent_name="image_vision", model="gpt-4o")
|
|
248
|
+
# from a local file (mime type inferred from extension)
|
|
249
|
+
msg_file = InputImage.from_file("photo.png", text="What is in the picture?")
|
|
246
250
|
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
img_content = OpenAIImageMessageContent(image_url="https://your.domain/image",
|
|
250
|
-
text="Hello, what is shown in the photo?")
|
|
251
|
+
# from a remote URL
|
|
252
|
+
msg_url = InputImage.from_url("https://your.domain/image.png", text="Describe it.")
|
|
251
253
|
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
]
|
|
254
|
+
# from raw bytes / base64 (mime_type required)
|
|
255
|
+
with open("photo.png", "rb") as f:
|
|
256
|
+
msg_bytes = InputImage.from_bytes(f.read(), mime_type="image/png", text="Describe.")
|
|
257
|
+
msg_b64 = InputImage.from_base64("aGVsbG8=", mime_type="image/png")
|
|
257
258
|
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
259
|
+
answer = agent.run_messages([msg_file])
|
|
260
|
+
print(answer[-1].content_blocks) # list of standard blocks
|
|
261
|
+
```
|
|
261
262
|
|
|
263
|
+
**Multiple images per turn.** Each factory builds **one** image message. To attach several images to a single user turn, pass several messages:
|
|
262
264
|
|
|
263
|
-
|
|
264
|
-
|
|
265
|
+
```python
|
|
266
|
+
messages = [
|
|
267
|
+
InputImage.from_file("front.png", text="Compare these two views:"),
|
|
268
|
+
InputImage.from_file("back.png"),
|
|
269
|
+
]
|
|
270
|
+
agent.run_messages(messages)
|
|
265
271
|
```
|
|
272
|
+
|
|
273
|
+
> **`from_url` caveats.** Some providers (notably Anthropic and Gemini) may have stricter rules about remote URLs (allowed hosts, public reachability, redirects). When in doubt — read the file yourself and use `from_file` / `from_bytes`.
|
|
274
|
+
|
|
275
|
+
### Image generation and edit
|
|
276
|
+
|
|
277
|
+
The agent can also produce images. The mechanism differs by provider, so two flavours of model are supported: OpenAI chat models that call a built-in `image_generation` tool, and Gemini image-output models.
|
|
278
|
+
|
|
279
|
+
`enable_image_generation` is the **single master switch** for image output across providers — image generation is off by default, and you opt in with one flag. The flag's effect is provider-specific (different APIs underneath), but the semantics are uniform: turn it on → the agent can draw, leave it off → it can't.
|
|
280
|
+
|
|
281
|
+
#### OpenAI — `enable_image_generation=True`
|
|
282
|
+
|
|
283
|
+
Pass the flag to a normal chat-capable OpenAI model (`gpt-5`, `gpt-5-mini`, etc.). Under the hood the agent enables the **Responses API** and binds OpenAI's built-in `image_generation` tool — the model decides on its own when to call it. Plain text turns stay text.
|
|
284
|
+
|
|
285
|
+
```python
|
|
286
|
+
from dm_aioaiagent import DMAIAgent, OutputImage
|
|
287
|
+
|
|
288
|
+
agent = DMAIAgent(model="gpt-5-mini", enable_image_generation=True)
|
|
289
|
+
|
|
290
|
+
agent.run("Draw a small red square on a white background.")
|
|
291
|
+
|
|
292
|
+
# Generated images surface on agent.images
|
|
293
|
+
for i, img in enumerate(agent.images):
|
|
294
|
+
img.save(f"out_{i}.png")
|
|
295
|
+
```
|
|
296
|
+
|
|
297
|
+
The same flag can be combined with regular tools — they coexist. `enable_image_generation=True` is **safe** even when the user only asks for text: the model uses `tool_choice="auto"`.
|
|
298
|
+
|
|
299
|
+
> Older OpenAI models (`gpt-4o`, `gpt-4.1`, etc.) require **organization verification** at platform.openai.com before they will accept the `image_generation` tool. The `gpt-5` family works on a fresh API key without verification.
|
|
300
|
+
|
|
301
|
+
#### Gemini — image-output models + the same flag
|
|
302
|
+
|
|
303
|
+
For Gemini you pick a model whose name contains `image` — e.g. `gemini-2.5-flash-image` (Nano Banana) — **and** turn the flag on. The agent then injects `response_modalities=["IMAGE", "TEXT"]` so the model is allowed to draw.
|
|
304
|
+
|
|
305
|
+
```python
|
|
306
|
+
agent = DMAIAgent(
|
|
307
|
+
model="google_genai:gemini-2.5-flash-image",
|
|
308
|
+
enable_image_generation=True,
|
|
309
|
+
)
|
|
310
|
+
|
|
311
|
+
agent.run("Generate a small red square.")
|
|
312
|
+
agent.images[0].save("out.png")
|
|
313
|
+
```
|
|
314
|
+
|
|
315
|
+
If you pick a Gemini image model but forget the flag, the agent logs a warning (`"... is image-capable but enable_image_generation=False — set the flag to True to let it draw."`) and stays in text-only mode.
|
|
316
|
+
|
|
317
|
+
> **Heads up.** A Gemini image-output model is **not** a general chat model — it tends to draw on every turn, including plain greetings. For mixed workloads use a **two-agent pattern**: a chat agent with the image agent attached as a tool. See [`agent.as_tool()`](#agentas_tool) below.
|
|
318
|
+
|
|
319
|
+
#### Anthropic — vision only
|
|
320
|
+
|
|
321
|
+
Claude **cannot generate** images. If you pass `enable_image_generation=True` to a Claude model, the flag is ignored and a warning is logged. Image input (vision) works as usual.
|
|
322
|
+
|
|
323
|
+
### Working with generated images — `OutputImage`
|
|
324
|
+
|
|
325
|
+
Generated images live in `agent.images` as `OutputImage` instances:
|
|
326
|
+
|
|
327
|
+
```python
|
|
328
|
+
img = agent.images[0]
|
|
329
|
+
img.bytes # raw image bytes
|
|
330
|
+
img.mime_type # e.g. "image/png"
|
|
331
|
+
img.save("out.png")
|
|
332
|
+
img.to_base64()
|
|
333
|
+
```
|
|
334
|
+
|
|
335
|
+
You can also extract images directly from any `AIMessage`:
|
|
336
|
+
|
|
337
|
+
```python
|
|
338
|
+
from dm_aioaiagent import OutputImage
|
|
339
|
+
images = OutputImage.extract_from(response_message) # list[OutputImage]
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
### Image memory modes
|
|
343
|
+
|
|
344
|
+
Images in `agent.memory_messages` (the conversation history sent to the LLM on each turn) and in `agent.images` (the property exposing AI-generated images) follow the `image_memory_mode` constructor argument:
|
|
345
|
+
|
|
346
|
+
| Mode | Memory (history) | `agent.images` |
|
|
347
|
+
|---|---|---|
|
|
348
|
+
| `keep_last` *(default)* | last user-image kept; last AI-image kept; older → `[image]` / `[generated image]` placeholder | last AI-image kept; replaced when a new one arrives |
|
|
349
|
+
| `drop` | every image (user + AI) becomes a placeholder right after the turn | only the AI-image of the **current** turn (then wiped on the next call) |
|
|
350
|
+
| `keep_all` | nothing is stripped — full multimodal history | every AI-image accumulates |
|
|
351
|
+
|
|
352
|
+
```python
|
|
353
|
+
agent = DMAIAgent(model="gpt-4o-mini", image_memory_mode="keep_last")
|
|
354
|
+
agent.run_messages([InputImage.from_file("photo.png", text="Describe.")])
|
|
355
|
+
agent.run("What colour was dominant?") # answers based on the image
|
|
356
|
+
```
|
|
357
|
+
|
|
358
|
+
`agent.clear_memory_messages()` clears both `memory_messages` and `images`.
|
|
359
|
+
|
|
360
|
+
> Only **AI-generated** images populate `agent.images`. Images you upload via `InputImage` go into history per the rules above but are not exposed on the `images` property.
|
|
361
|
+
|
|
362
|
+
### `agent.as_tool()`
|
|
363
|
+
|
|
364
|
+
Wrap any agent as a `StructuredTool` so a *parent* agent can call it like any other tool — the basis for multi-agent composition. Default name is derived from `agent_name` (lowercased, non-alphanumerics replaced with `_`); `description` is required.
|
|
365
|
+
|
|
366
|
+
```python
|
|
367
|
+
from dm_aioaiagent import DMAIAgent
|
|
368
|
+
|
|
369
|
+
# specialised image agent
|
|
370
|
+
image_agent = DMAIAgent(
|
|
371
|
+
agent_name="image_drawer",
|
|
372
|
+
model="google_genai:gemini-2.5-flash-image",
|
|
373
|
+
enable_image_generation=True,
|
|
374
|
+
)
|
|
375
|
+
|
|
376
|
+
# chat agent that delegates drawing to the image agent
|
|
377
|
+
chat_agent = DMAIAgent(
|
|
378
|
+
model="google_genai:gemini-2.5-flash",
|
|
379
|
+
tools=[image_agent.as_tool(description="Generates an image from a text prompt.")],
|
|
380
|
+
)
|
|
381
|
+
|
|
382
|
+
chat_agent.run("Hi! Please draw a small red square.")
|
|
383
|
+
# the chat agent picks the tool, the image agent draws, image lands in image_agent.images
|
|
384
|
+
image_agent.images[0].save("out.png")
|
|
385
|
+
```
|
|
386
|
+
|
|
387
|
+
The async client (`DMAioAIAgent.as_tool`) returns a tool with both `func` and `coroutine` set, so it can be invoked from sync or async parent agents.
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
# DM-aioaiagent
|
|
2
|
+
|
|
3
|
+
## URLs
|
|
4
|
+
|
|
5
|
+
* [PyPI](https://pypi.org/project/dm-aioaiagent)
|
|
6
|
+
* [GitHub](https://github.com/MykhLibs/dm-aioaiagent)
|
|
7
|
+
|
|
8
|
+
### * Package contains both `asynchronous` and `synchronous` clients
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
By default, the package ships with **OpenAI** support. Other providers are optional extras:
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
pip install dm-aioaiagent # OpenAI only
|
|
16
|
+
pip install dm-aioaiagent[anthropic] # + Anthropic
|
|
17
|
+
pip install dm-aioaiagent[anthropic,gemini] # several at once
|
|
18
|
+
pip install dm-aioaiagent[all] # every supported provider
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
Available extras: `anthropic`, `gemini`, `groq`, `mistral`, `deepseek`, `ollama`, `all`.
|
|
22
|
+
|
|
23
|
+
If you call a model from a provider whose package is not installed, `init_chat_model` will raise an `ImportError` with the exact `pip install` command you need.
|
|
24
|
+
|
|
25
|
+
## Providers
|
|
26
|
+
|
|
27
|
+
Provider resolution is delegated to LangChain's [`init_chat_model`](https://python.langchain.com/api_reference/langchain/chat_models/langchain.chat_models.base.init_chat_model.html) — the agent picks the provider automatically by model name prefix when possible. For everything else, use the `"provider:model"` mask.
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
# Auto-detected from model prefix (rules come from LangChain's init_chat_model)
|
|
31
|
+
agent = DMAioAIAgent(model="gpt-4o-mini") # → openai
|
|
32
|
+
agent = DMAioAIAgent(model="claude-3-5-sonnet-latest") # → anthropic
|
|
33
|
+
agent = DMAioAIAgent(model="gemini-2.0-flash") # → google_vertexai (see note below)
|
|
34
|
+
|
|
35
|
+
# Explicit provider via "provider:model" mask
|
|
36
|
+
agent = DMAioAIAgent(model="google_genai:gemini-2.0-flash")
|
|
37
|
+
agent = DMAioAIAgent(model="groq:llama-3.1-70b-versatile")
|
|
38
|
+
agent = DMAioAIAgent(model="mistralai:mistral-large-latest")
|
|
39
|
+
agent = DMAioAIAgent(model="deepseek:deepseek-chat")
|
|
40
|
+
agent = DMAioAIAgent(model="ollama:llama3.1")
|
|
41
|
+
|
|
42
|
+
# OpenAI-compatible gateway (OpenRouter, Together, vLLM, LiteLLM proxy, ...)
|
|
43
|
+
# Works without installing any extra — just point to the OpenAI-compatible URL.
|
|
44
|
+
agent = DMAioAIAgent(
|
|
45
|
+
model="meta-llama/llama-3.1-70b-instruct",
|
|
46
|
+
llm_provider_base_url="https://openrouter.ai/api/v1",
|
|
47
|
+
llm_provider_api_key="sk-or-...",
|
|
48
|
+
)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
> **Note about Gemini.** LangChain's auto-detect maps the `gemini*` prefix to **`google_vertexai`** (Google Cloud Vertex AI, requires a GCP service account). If you have a regular **Google AI Studio** API key (`GOOGLE_API_KEY`), use the `google_genai:` mask explicitly:
|
|
52
|
+
>
|
|
53
|
+
> ```python
|
|
54
|
+
> agent = DMAioAIAgent(model="google_genai:gemini-2.0-flash")
|
|
55
|
+
> ```
|
|
56
|
+
|
|
57
|
+
Supported provider keys for the `"provider:model"` mask (list inherited from LangChain): `openai`, `anthropic`, `azure_openai`, `azure_ai`, `google_vertexai`, `google_genai`, `bedrock`, `bedrock_converse`, `cohere`, `fireworks`, `together`, `mistralai`, `huggingface`, `groq`, `ollama`, `google_anthropic_vertex`, `deepseek`, `ibm`, `nvidia`, `xai`, `perplexity`.
|
|
58
|
+
|
|
59
|
+
### Note about parallel tool calls
|
|
60
|
+
|
|
61
|
+
`parallel_tool_calls` is currently mapped only for **OpenAI** and **Anthropic** (their APIs use different formats). For other providers the parameter is silently ignored — extend per-provider mapping if you need it.
|
|
62
|
+
|
|
63
|
+
## Usage
|
|
64
|
+
|
|
65
|
+
The synchronous counterpart of `DMAioAIAgent` is the `DMAIAgent` client.
|
|
66
|
+
|
|
67
|
+
### Windows Setup
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
import asyncio
|
|
71
|
+
import sys
|
|
72
|
+
|
|
73
|
+
if sys.platform == "win32":
|
|
74
|
+
asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### API Key Setup
|
|
78
|
+
|
|
79
|
+
Each provider reads its API key from a dedicated environment variable, e.g. `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `GOOGLE_API_KEY`, `GROQ_API_KEY`, `MISTRAL_API_KEY`, etc. Alternatively, pass the key explicitly via the `llm_provider_api_key` argument — useful for multi-tenant setups, custom gateways, or runtime key rotation.
|
|
80
|
+
|
|
81
|
+
**Use load_dotenv to load the `.env` file.**
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from dotenv import load_dotenv
|
|
85
|
+
load_dotenv()
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Use agent *with* inner memory and run *single* message
|
|
89
|
+
|
|
90
|
+
By default, the agent uses inner memory to store the conversation history.
|
|
91
|
+
|
|
92
|
+
(You can set the *maximum number of messages kept in memory* with the `max_memory_messages` init argument.)
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
import asyncio
|
|
96
|
+
from dm_aioaiagent import DMAioAIAgent
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
async def main():
|
|
100
|
+
# define a system message
|
|
101
|
+
system_message = "Your custom system message with role, backstory and goal"
|
|
102
|
+
|
|
103
|
+
# (optional) define a list of tools, if you want to use them
|
|
104
|
+
tools = [...]
|
|
105
|
+
|
|
106
|
+
# define an OpenAI model, default is "gpt-4o-mini"
|
|
107
|
+
model_name = "gpt-4o"
|
|
108
|
+
|
|
109
|
+
# create an agent
|
|
110
|
+
ai_agent = DMAioAIAgent(system_message, tools, model=model_name)
|
|
111
|
+
# if you don't want to see the input and output messages from agent
|
|
112
|
+
# you can set `input_output_logging=False` init argument
|
|
113
|
+
|
|
114
|
+
# call an agent
|
|
115
|
+
answer = await ai_agent.run("Hello!")
|
|
116
|
+
|
|
117
|
+
# call an agent
|
|
118
|
+
answer = await ai_agent.run("I want to know the weather in Kyiv")
|
|
119
|
+
|
|
120
|
+
# get full conversation history
|
|
121
|
+
conversation_history = ai_agent.memory_messages
|
|
122
|
+
|
|
123
|
+
# clear conversation history
|
|
124
|
+
ai_agent.clear_memory_messages()
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
if __name__ == "__main__":
|
|
128
|
+
asyncio.run(main())
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Use agent *without* inner memory and run *multiple* messages
|
|
132
|
+
|
|
133
|
+
If you want to control the memory of the agent, you can disable it by setting `is_memory_enabled=False`
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
import asyncio
|
|
137
|
+
from dm_aioaiagent import DMAioAIAgent
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
async def main():
|
|
141
|
+
# define a system message
|
|
142
|
+
system_message = "Your custom system message with role, backstory and goal"
|
|
143
|
+
|
|
144
|
+
# (optional) define a list of tools, if you want to use them
|
|
145
|
+
tools = [...]
|
|
146
|
+
|
|
147
|
+
# define an OpenAI model, default is "gpt-4o-mini"
|
|
148
|
+
model_name = "gpt-4o"
|
|
149
|
+
|
|
150
|
+
# create an agent
|
|
151
|
+
ai_agent = DMAioAIAgent(system_message, tools, model=model_name,
|
|
152
|
+
is_memory_enabled=False)
|
|
153
|
+
# if you don't want to see the input and output messages from agent
|
|
154
|
+
# you can set input_output_logging=False
|
|
155
|
+
|
|
156
|
+
# define the conversation message(s)
|
|
157
|
+
messages = [
|
|
158
|
+
{"role": "user", "content": "Hello!"}
|
|
159
|
+
]
|
|
160
|
+
|
|
161
|
+
# call an agent
|
|
162
|
+
new_messages = await ai_agent.run_messages(messages)
|
|
163
|
+
|
|
164
|
+
# add new_messages to messages
|
|
165
|
+
messages.extend(new_messages)
|
|
166
|
+
|
|
167
|
+
# define the next conversation message
|
|
168
|
+
messages.append(
|
|
169
|
+
{"role": "user", "content": "I want to know the weather in Kyiv"}
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
# call an agent
|
|
173
|
+
new_messages = await ai_agent.run_messages(messages)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
if __name__ == "__main__":
|
|
177
|
+
asyncio.run(main())
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
### Working with images — input
|
|
181
|
+
|
|
182
|
+
Use the `InputImage` helper to attach an image to a user message in a way that works **across providers** (OpenAI, Anthropic, Gemini). Each factory returns a ready-to-send `HumanMessage` whose `.content` is a list of LangChain v1 standard content blocks.
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
from dm_aioaiagent import DMAIAgent, InputImage
|
|
186
|
+
|
|
187
|
+
agent = DMAIAgent(agent_name="image_vision", model="gpt-4o-mini")
|
|
188
|
+
|
|
189
|
+
# from a local file (mime type inferred from extension)
|
|
190
|
+
msg_file = InputImage.from_file("photo.png", text="What is in the picture?")
|
|
191
|
+
|
|
192
|
+
# from a remote URL
|
|
193
|
+
msg_url = InputImage.from_url("https://your.domain/image.png", text="Describe it.")
|
|
194
|
+
|
|
195
|
+
# from raw bytes / base64 (mime_type required)
|
|
196
|
+
with open("photo.png", "rb") as f:
|
|
197
|
+
msg_bytes = InputImage.from_bytes(f.read(), mime_type="image/png", text="Describe.")
|
|
198
|
+
msg_b64 = InputImage.from_base64("aGVsbG8=", mime_type="image/png")
|
|
199
|
+
|
|
200
|
+
answer = agent.run_messages([msg_file])
|
|
201
|
+
print(answer[-1].content_blocks) # list of standard blocks
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
**Multiple images per turn.** Each factory builds **one** image message. To attach several images to a single user turn, pass several messages:
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
messages = [
|
|
208
|
+
InputImage.from_file("front.png", text="Compare these two views:"),
|
|
209
|
+
InputImage.from_file("back.png"),
|
|
210
|
+
]
|
|
211
|
+
agent.run_messages(messages)
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
> **`from_url` caveats.** Some providers (notably Anthropic and Gemini) may have stricter rules about remote URLs (allowed hosts, public reachability, redirects). When in doubt — read the file yourself and use `from_file` / `from_bytes`.
|
|
215
|
+
|
|
216
|
+
### Image generation and edit
|
|
217
|
+
|
|
218
|
+
The agent can also produce images. The mechanism differs by provider, so two flavours of model are supported: OpenAI chat models that call a built-in `image_generation` tool, and Gemini image-output models.
|
|
219
|
+
|
|
220
|
+
`enable_image_generation` is the **single master switch** for image output across providers — image generation is off by default, and you opt in with one flag. The flag's effect is provider-specific (different APIs underneath), but the semantics are uniform: turn it on → the agent can draw, leave it off → it can't.
|
|
221
|
+
|
|
222
|
+
#### OpenAI — `enable_image_generation=True`
|
|
223
|
+
|
|
224
|
+
Pass the flag to a normal chat-capable OpenAI model (`gpt-5`, `gpt-5-mini`, etc.). Under the hood the agent enables the **Responses API** and binds OpenAI's built-in `image_generation` tool — the model decides on its own when to call it. Plain text turns stay text.
|
|
225
|
+
|
|
226
|
+
```python
|
|
227
|
+
from dm_aioaiagent import DMAIAgent, OutputImage
|
|
228
|
+
|
|
229
|
+
agent = DMAIAgent(model="gpt-5-mini", enable_image_generation=True)
|
|
230
|
+
|
|
231
|
+
agent.run("Draw a small red square on a white background.")
|
|
232
|
+
|
|
233
|
+
# Generated images surface on agent.images
|
|
234
|
+
for i, img in enumerate(agent.images):
|
|
235
|
+
img.save(f"out_{i}.png")
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
The same flag can be combined with regular tools — they coexist. `enable_image_generation=True` is **safe** even when the user only asks for text: the model uses `tool_choice="auto"`.
|
|
239
|
+
|
|
240
|
+
> Older OpenAI models (`gpt-4o`, `gpt-4.1`, etc.) require **organization verification** at platform.openai.com before they will accept the `image_generation` tool. The `gpt-5` family works on a fresh API key without verification.
|
|
241
|
+
|
|
242
|
+
#### Gemini — image-output models + the same flag
|
|
243
|
+
|
|
244
|
+
For Gemini you pick a model whose name contains `image` — e.g. `gemini-2.5-flash-image` (Nano Banana) — **and** turn the flag on. The agent then injects `response_modalities=["IMAGE", "TEXT"]` so the model is allowed to draw.
|
|
245
|
+
|
|
246
|
+
```python
|
|
247
|
+
agent = DMAIAgent(
|
|
248
|
+
model="google_genai:gemini-2.5-flash-image",
|
|
249
|
+
enable_image_generation=True,
|
|
250
|
+
)
|
|
251
|
+
|
|
252
|
+
agent.run("Generate a small red square.")
|
|
253
|
+
agent.images[0].save("out.png")
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
If you pick a Gemini image model but forget the flag, the agent logs a warning (`"... is image-capable but enable_image_generation=False — set the flag to True to let it draw."`) and stays in text-only mode.
|
|
257
|
+
|
|
258
|
+
> **Heads up.** A Gemini image-output model is **not** a general chat model — it tends to draw on every turn, including plain greetings. For mixed workloads use a **two-agent pattern**: a chat agent with the image agent attached as a tool. See [`agent.as_tool()`](#agentas_tool) below.
|
|
259
|
+
|
|
260
|
+
#### Anthropic — vision only
|
|
261
|
+
|
|
262
|
+
Claude **cannot generate** images. If you pass `enable_image_generation=True` to a Claude model, the flag is ignored and a warning is logged. Image input (vision) works as usual.
|
|
263
|
+
|
|
264
|
+
### Working with generated images — `OutputImage`
|
|
265
|
+
|
|
266
|
+
Generated images live in `agent.images` as `OutputImage` instances:
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
img = agent.images[0]
|
|
270
|
+
img.bytes # raw image bytes
|
|
271
|
+
img.mime_type # e.g. "image/png"
|
|
272
|
+
img.save("out.png")
|
|
273
|
+
img.to_base64()
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
You can also extract images directly from any `AIMessage`:
|
|
277
|
+
|
|
278
|
+
```python
|
|
279
|
+
from dm_aioaiagent import OutputImage
|
|
280
|
+
images = OutputImage.extract_from(response_message) # list[OutputImage]
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
### Image memory modes
|
|
284
|
+
|
|
285
|
+
Images in `agent.memory_messages` (the conversation history sent to the LLM on each turn) and in `agent.images` (the property exposing AI-generated images) follow the `image_memory_mode` constructor argument:
|
|
286
|
+
|
|
287
|
+
| Mode | Memory (history) | `agent.images` |
|
|
288
|
+
|---|---|---|
|
|
289
|
+
| `keep_last` *(default)* | last user-image kept; last AI-image kept; older → `[image]` / `[generated image]` placeholder | last AI-image kept; replaced when a new one arrives |
|
|
290
|
+
| `drop` | every image (user + AI) becomes a placeholder right after the turn | only the AI-image of the **current** turn (then wiped on the next call) |
|
|
291
|
+
| `keep_all` | nothing is stripped — full multimodal history | every AI-image accumulates |
|
|
292
|
+
|
|
293
|
+
```python
|
|
294
|
+
agent = DMAIAgent(model="gpt-4o-mini", image_memory_mode="keep_last")
|
|
295
|
+
agent.run_messages([InputImage.from_file("photo.png", text="Describe.")])
|
|
296
|
+
agent.run("What colour was dominant?") # answers based on the image
|
|
297
|
+
```
|
|
298
|
+
|
|
299
|
+
`agent.clear_memory_messages()` clears both `memory_messages` and `images`.
|
|
300
|
+
|
|
301
|
+
> Only **AI-generated** images populate `agent.images`. Images you upload via `InputImage` go into history per the rules above but are not exposed on the `images` property.
|
|
302
|
+
|
|
303
|
+
### `agent.as_tool()`
|
|
304
|
+
|
|
305
|
+
Wrap any agent as a `StructuredTool` so a *parent* agent can call it like any other tool — the basis for multi-agent composition. Default name is derived from `agent_name` (lowercased, non-alphanumerics replaced with `_`); `description` is required.
|
|
306
|
+
|
|
307
|
+
```python
|
|
308
|
+
from dm_aioaiagent import DMAIAgent
|
|
309
|
+
|
|
310
|
+
# specialised image agent
|
|
311
|
+
image_agent = DMAIAgent(
|
|
312
|
+
agent_name="image_drawer",
|
|
313
|
+
model="google_genai:gemini-2.5-flash-image",
|
|
314
|
+
enable_image_generation=True,
|
|
315
|
+
)
|
|
316
|
+
|
|
317
|
+
# chat agent that delegates drawing to the image agent
|
|
318
|
+
chat_agent = DMAIAgent(
|
|
319
|
+
model="google_genai:gemini-2.5-flash",
|
|
320
|
+
tools=[image_agent.as_tool(description="Generates an image from a text prompt.")],
|
|
321
|
+
)
|
|
322
|
+
|
|
323
|
+
chat_agent.run("Hi! Please draw a small red square.")
|
|
324
|
+
# the chat agent picks the tool, the image agent draws, image lands in image_agent.images
|
|
325
|
+
image_agent.images[0].save("out.png")
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
The async client (`DMAioAIAgent.as_tool`) returns a tool with both `func` and `coroutine` set, so it can be invoked from sync or async parent agents.
|