flowent 0.1.5 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/backend/pyproject.toml +31 -5
  2. package/backend/src/flowent/agent.py +107 -37
  3. package/backend/src/flowent/compact.py +35 -14
  4. package/backend/src/flowent/llm.py +198 -12
  5. package/backend/src/flowent/main.py +260 -59
  6. package/backend/src/flowent/static/assets/index-CRSV2xu1.css +2 -0
  7. package/backend/src/flowent/static/assets/index-DUYj6rgD.js +82 -0
  8. package/backend/src/flowent/static/index.html +2 -2
  9. package/backend/src/flowent/storage.py +135 -3
  10. package/backend/src/flowent/usage.py +315 -0
  11. package/backend/uv.lock +971 -3
  12. package/dist/frontend/assets/index-CRSV2xu1.css +2 -0
  13. package/dist/frontend/assets/index-DUYj6rgD.js +82 -0
  14. package/dist/frontend/index.html +2 -2
  15. package/package.json +24 -3
  16. package/backend/src/flowent/__pycache__/__init__.cpython-313.pyc +0 -0
  17. package/backend/src/flowent/__pycache__/_version.cpython-313.pyc +0 -0
  18. package/backend/src/flowent/__pycache__/agent.cpython-313.pyc +0 -0
  19. package/backend/src/flowent/__pycache__/approval.cpython-313.pyc +0 -0
  20. package/backend/src/flowent/__pycache__/channels.cpython-313.pyc +0 -0
  21. package/backend/src/flowent/__pycache__/cli.cpython-313.pyc +0 -0
  22. package/backend/src/flowent/__pycache__/compact.cpython-313.pyc +0 -0
  23. package/backend/src/flowent/__pycache__/context.cpython-313.pyc +0 -0
  24. package/backend/src/flowent/__pycache__/llm.cpython-313.pyc +0 -0
  25. package/backend/src/flowent/__pycache__/logging.cpython-313.pyc +0 -0
  26. package/backend/src/flowent/__pycache__/main.cpython-313.pyc +0 -0
  27. package/backend/src/flowent/__pycache__/mcp.cpython-313.pyc +0 -0
  28. package/backend/src/flowent/__pycache__/mcp_import.cpython-313.pyc +0 -0
  29. package/backend/src/flowent/__pycache__/patch.cpython-313.pyc +0 -0
  30. package/backend/src/flowent/__pycache__/paths.cpython-313.pyc +0 -0
  31. package/backend/src/flowent/__pycache__/permissions.cpython-313.pyc +0 -0
  32. package/backend/src/flowent/__pycache__/sandbox.cpython-313.pyc +0 -0
  33. package/backend/src/flowent/__pycache__/skills.cpython-313.pyc +0 -0
  34. package/backend/src/flowent/__pycache__/storage.cpython-313.pyc +0 -0
  35. package/backend/src/flowent/__pycache__/tools.cpython-313.pyc +0 -0
  36. package/backend/src/flowent/static/assets/index-Cl20cARb.css +0 -2
  37. package/backend/src/flowent/static/assets/index-dsDDsEym.js +0 -81
  38. package/backend/tests/__pycache__/conftest.cpython-313-pytest-9.0.3.pyc +0 -0
  39. package/backend/tests/__pycache__/test_agent_tools.cpython-313-pytest-9.0.3.pyc +0 -0
  40. package/backend/tests/__pycache__/test_approval.cpython-313-pytest-9.0.3.pyc +0 -0
  41. package/backend/tests/__pycache__/test_channels.cpython-313-pytest-9.0.3.pyc +0 -0
  42. package/backend/tests/__pycache__/test_health.cpython-313-pytest-9.0.3.pyc +0 -0
  43. package/backend/tests/__pycache__/test_llm_providers.cpython-313-pytest-9.0.3.pyc +0 -0
  44. package/backend/tests/__pycache__/test_logging.cpython-313-pytest-9.0.3.pyc +0 -0
  45. package/backend/tests/__pycache__/test_mcp.cpython-313-pytest-9.0.3.pyc +0 -0
  46. package/backend/tests/__pycache__/test_patch.cpython-313-pytest-9.0.3.pyc +0 -0
  47. package/backend/tests/__pycache__/test_permissions.cpython-313-pytest-9.0.3.pyc +0 -0
  48. package/backend/tests/__pycache__/test_persistence.cpython-313-pytest-9.0.3.pyc +0 -0
  49. package/backend/tests/__pycache__/test_skills.cpython-313-pytest-9.0.3.pyc +0 -0
  50. package/backend/tests/__pycache__/test_startup_requirements.cpython-313-pytest-9.0.3.pyc +0 -0
  51. package/backend/tests/__pycache__/test_workspace_chat.cpython-313-pytest-9.0.3.pyc +0 -0
  52. package/backend/tests/conftest.py +0 -21
  53. package/backend/tests/test_agent_tools.py +0 -988
  54. package/backend/tests/test_approval.py +0 -283
  55. package/backend/tests/test_channels.py +0 -360
  56. package/backend/tests/test_health.py +0 -12
  57. package/backend/tests/test_llm_providers.py +0 -387
  58. package/backend/tests/test_logging.py +0 -212
  59. package/backend/tests/test_mcp.py +0 -788
  60. package/backend/tests/test_patch.py +0 -112
  61. package/backend/tests/test_permissions.py +0 -588
  62. package/backend/tests/test_persistence.py +0 -249
  63. package/backend/tests/test_skills.py +0 -462
  64. package/backend/tests/test_startup_requirements.py +0 -144
  65. package/backend/tests/test_workspace_chat.py +0 -2122
  66. package/dist/frontend/assets/index-Cl20cARb.css +0 -2
  67. package/dist/frontend/assets/index-dsDDsEym.js +0 -81
@@ -1,13 +1,39 @@
1
1
  [project]
2
2
  name = "flowent"
3
- version = "0.1.5"
4
- description = "A workflow orchestration platform for multi-agent collaboration."
3
+ version = "0.2.1"
4
+ description = "A workflow orchestration platform for multi-agent collaboration"
5
5
  readme = "README.md"
6
6
  authors = [
7
7
  { name = "ImFeH2", email = "i@feh2.im" }
8
8
  ]
9
- requires-python = ">=3.12,<3.14"
9
+ requires-python = ">=3.11"
10
10
  license = "Apache-2.0"
11
+ keywords = [
12
+ "agent",
13
+ "agents",
14
+ "ai",
15
+ "ai-agents",
16
+ "assistant",
17
+ "automation",
18
+ "code-generation",
19
+ "llm",
20
+ "mcp",
21
+ "orchestration",
22
+ "sandbox",
23
+ "web-application",
24
+ "workflow",
25
+ ]
26
+ classifiers = [
27
+ "Development Status :: 3 - Alpha",
28
+ "Intended Audience :: Developers",
29
+ "License :: OSI Approved :: Apache Software License",
30
+ "Operating System :: OS Independent",
31
+ "Programming Language :: Python :: 3",
32
+ "Programming Language :: Python :: 3.11",
33
+ "Programming Language :: Python :: 3.12",
34
+ "Programming Language :: Python :: 3.13",
35
+ "Topic :: Software Development",
36
+ ]
11
37
  dependencies = [
12
38
  "fastapi[standard]>=0.136.1",
13
39
  "litellm>=1.84.0",
@@ -37,14 +63,14 @@ requires = ["uv_build>=0.8.14,<0.9.0"]
37
63
  build-backend = "uv_build"
38
64
 
39
65
  [tool.ruff]
40
- target-version = "py312"
66
+ target-version = "py311"
41
67
 
42
68
  [tool.ruff.lint]
43
69
  select = ["E", "W", "F", "I", "UP", "B", "SIM", "N", "RUF"]
44
70
  ignore = ["E501"]
45
71
 
46
72
  [tool.mypy]
47
- python_version = "3.12"
73
+ python_version = "3.11"
48
74
 
49
75
  [tool.pytest.ini_options]
50
76
  testpaths = ["tests"]
@@ -15,6 +15,7 @@ from flowent.llm import (
15
15
  chunk_delta_content,
16
16
  chunk_delta_reasoning,
17
17
  chunk_delta_tool_calls,
18
+ chunk_token_usage,
18
19
  stream_chat_chunks,
19
20
  )
20
21
  from flowent.logging import TRACE_LEVEL
@@ -146,56 +147,108 @@ async def run_agent_stream(
146
147
  while True:
147
148
  round_number += 1
148
149
  logger.debug("Agent round started id=%s round=%s", assistant_id, round_number)
150
+ logger.info(
151
+ "Agent model call started id=%s round=%s conversation_messages=%s",
152
+ assistant_id,
153
+ round_number,
154
+ len(conversation),
155
+ )
149
156
  yield AgentStreamEvent(event="output_start", data={"index": round_number})
150
157
  round_content = ""
151
158
  pending: dict[int, PendingToolCall] = {}
159
+ chunk_count = 0
160
+ content_delta_count = 0
161
+ reasoning_delta_count = 0
162
+ tool_delta_count = 0
152
163
 
153
- async for chunk in stream_chat_chunks(
154
- connection,
155
- conversation,
156
- completion=completion,
157
- tools=[*tool_specs(), *list(extra_tool_specs or [])],
158
- ):
159
- reasoning = chunk_delta_reasoning(chunk)
160
- if reasoning:
161
- final_thinking += reasoning
162
- logger.log(
163
- TRACE_LEVEL,
164
- "Agent stream reasoning id=%s content=%r",
165
- assistant_id,
166
- reasoning,
167
- )
168
- yield AgentStreamEvent(
169
- event="thinking_delta", data={"content": reasoning}
170
- )
171
- content = chunk_delta_content(chunk)
172
- if content:
173
- round_content += content
174
- final_content += content
175
- logger.log(
176
- TRACE_LEVEL,
177
- "Agent stream delta id=%s content=%r",
178
- assistant_id,
179
- content,
180
- )
181
- yield AgentStreamEvent(event="delta", data={"content": content})
182
- for delta in chunk_delta_tool_calls(chunk):
183
- pending.setdefault(delta.index, PendingToolCall()).apply_delta(delta)
164
+ try:
165
+ async for chunk in stream_chat_chunks(
166
+ connection,
167
+ conversation,
168
+ completion=completion,
169
+ tools=[*tool_specs(), *list(extra_tool_specs or [])],
170
+ ):
171
+ chunk_count += 1
172
+ usage = chunk_token_usage(chunk)
173
+ if usage is not None:
174
+ yield AgentStreamEvent(
175
+ event="usage",
176
+ data={"usage": usage.model_dump()},
177
+ )
178
+ reasoning = chunk_delta_reasoning(chunk)
179
+ if reasoning:
180
+ reasoning_delta_count += 1
181
+ final_thinking += reasoning
182
+ logger.log(
183
+ TRACE_LEVEL,
184
+ "Agent stream reasoning id=%s round=%s content=%r",
185
+ assistant_id,
186
+ round_number,
187
+ reasoning,
188
+ )
189
+ yield AgentStreamEvent(
190
+ event="thinking_delta", data={"content": reasoning}
191
+ )
192
+ content = chunk_delta_content(chunk)
193
+ if content:
194
+ content_delta_count += 1
195
+ round_content += content
196
+ final_content += content
197
+ logger.log(
198
+ TRACE_LEVEL,
199
+ "Agent stream delta id=%s round=%s content=%r",
200
+ assistant_id,
201
+ round_number,
202
+ content,
203
+ )
204
+ yield AgentStreamEvent(event="delta", data={"content": content})
205
+ for delta in chunk_delta_tool_calls(chunk):
206
+ tool_delta_count += 1
207
+ pending.setdefault(delta.index, PendingToolCall()).apply_delta(
208
+ delta
209
+ )
210
+ except Exception:
211
+ logger.exception(
212
+ "Agent model call failed id=%s round=%s chunk_count=%s content_deltas=%s reasoning_deltas=%s tool_deltas=%s conversation_messages=%s",
213
+ assistant_id,
214
+ round_number,
215
+ chunk_count,
216
+ content_delta_count,
217
+ reasoning_delta_count,
218
+ tool_delta_count,
219
+ len(conversation),
220
+ )
221
+ raise
184
222
 
185
223
  tool_calls = [pending[index] for index in sorted(pending)]
224
+ logger.info(
225
+ "Agent model call completed id=%s round=%s chunk_count=%s content_deltas=%s reasoning_deltas=%s tool_deltas=%s tool_calls=%s content_length=%s decision=%s",
226
+ assistant_id,
227
+ round_number,
228
+ chunk_count,
229
+ content_delta_count,
230
+ reasoning_delta_count,
231
+ tool_delta_count,
232
+ len(tool_calls),
233
+ len(round_content),
234
+ "run_tools" if tool_calls else "final_response",
235
+ )
186
236
  logger.log(
187
237
  TRACE_LEVEL,
188
- "Agent round tool calls id=%s tool_calls=%r",
238
+ "Agent round tool calls id=%s round=%s tool_calls=%r",
189
239
  assistant_id,
240
+ round_number,
190
241
  tool_calls,
191
242
  )
192
243
  if not tool_calls:
193
244
  if not final_content and not final_thinking:
194
245
  raise RuntimeError(EMPTY_MODEL_RESPONSE_ERROR)
195
246
  logger.info(
196
- "Agent response completed id=%s content_length=%s",
247
+ "Agent response completed id=%s rounds=%s content_length=%s thinking_length=%s decision=final_response",
197
248
  assistant_id,
249
+ round_number,
198
250
  len(final_content),
251
+ len(final_thinking),
199
252
  )
200
253
  logger.log(
201
254
  TRACE_LEVEL,
@@ -301,11 +354,28 @@ async def run_agent_stream(
301
354
  )
302
355
  conversation.append(tool_result_message(tool_call_id, result_content))
303
356
 
357
+ logger.info(
358
+ "Agent continuing after tools id=%s completed_round=%s tool_results=%s conversation_messages=%s decision=continue",
359
+ assistant_id,
360
+ round_number,
361
+ len(tool_calls),
362
+ len(conversation),
363
+ )
364
+
304
365
  if context_compactor is not None:
305
366
  compaction = await context_compactor(conversation)
306
367
  if compaction is not None:
307
- conversation = [dict(message) for message in compaction.conversation]
308
- yield AgentStreamEvent(
309
- event="context_optimized",
310
- data={"message": dict(compaction.message)},
368
+ logger.info(
369
+ "Agent context optimized id=%s round=%s conversation_messages_before=%s conversation_messages_after=%s",
370
+ assistant_id,
371
+ round_number,
372
+ len(conversation),
373
+ len(compaction.conversation),
311
374
  )
375
+ conversation = [dict(message) for message in compaction.conversation]
376
+ compaction_message = dict(compaction.message)
377
+ usage_info = compaction_message.pop("usage_info", None)
378
+ event_data: dict[str, object] = {"message": compaction_message}
379
+ if isinstance(usage_info, dict):
380
+ event_data["usage_info"] = usage_info
381
+ yield AgentStreamEvent(event="context_optimized", data=event_data)
@@ -8,8 +8,9 @@ from flowent.llm import (
8
8
  ChatMessage,
9
9
  CompletionCallable,
10
10
  ProviderConnection,
11
- complete_chat,
11
+ complete_chat_with_usage,
12
12
  )
13
+ from flowent.usage import TokenUsage
13
14
 
14
15
  if TYPE_CHECKING:
15
16
  from flowent.storage import StoredMessage
@@ -44,6 +45,7 @@ class CompactResult:
44
45
  method: CompactMethod
45
46
  replacement_history: list[ChatMessage]
46
47
  summary: str
48
+ summary_usage: TokenUsage | None
47
49
  token_after: int
48
50
  token_before: int
49
51
 
@@ -66,12 +68,12 @@ class LocalSummaryCompactProvider:
66
68
  *,
67
69
  completion: CompletionCallable | None = None,
68
70
  ) -> CompactResult:
69
- summary_message = await complete_chat(
71
+ summary_result = await complete_chat_with_usage(
70
72
  connection,
71
73
  compact_prompt_messages(compact_input.model_history),
72
74
  completion=completion,
73
75
  )
74
- summary = summary_message.content.strip()
76
+ summary = summary_result.message.content.strip()
75
77
  replacement_history = build_replacement_history(
76
78
  summary,
77
79
  compact_input.messages,
@@ -81,6 +83,7 @@ class LocalSummaryCompactProvider:
81
83
  method="local_summary",
82
84
  replacement_history=replacement_history,
83
85
  summary=summary,
86
+ summary_usage=summary_result.usage,
84
87
  token_after=approximate_tokens_for_messages(replacement_history),
85
88
  token_before=approximate_tokens_for_messages(compact_input.model_history),
86
89
  )
@@ -127,15 +130,15 @@ def build_replacement_history(
127
130
  token_budget: int = DEFAULT_RETAINED_MESSAGE_TOKEN_BUDGET,
128
131
  ) -> list[ChatMessage]:
129
132
  return [
130
- ChatMessage(role="user", content=f"{COMPACT_SUMMARY_PREFIX}{summary}"),
131
- *retained_recent_chat_messages(
133
+ *retained_recent_user_messages(
132
134
  recent_messages,
133
135
  token_budget=token_budget,
134
136
  ),
137
+ ChatMessage(role="user", content=f"{COMPACT_SUMMARY_PREFIX}{summary}"),
135
138
  ]
136
139
 
137
140
 
138
- def retained_recent_chat_messages(
141
+ def retained_recent_user_messages(
139
142
  messages: Sequence[StoredMessage],
140
143
  *,
141
144
  token_budget: int = DEFAULT_RETAINED_MESSAGE_TOKEN_BUDGET,
@@ -143,17 +146,22 @@ def retained_recent_chat_messages(
143
146
  retained: list[ChatMessage] = []
144
147
  remaining_tokens = max(token_budget, 0)
145
148
  for message in reversed(messages):
146
- if message.author not in {"user", "assistant"}:
149
+ if message.author != "user":
147
150
  continue
148
151
  token_count = approximate_token_count(message.content)
149
- if retained and token_count > remaining_tokens:
152
+ if token_count > remaining_tokens:
153
+ if remaining_tokens > 0:
154
+ retained.append(
155
+ ChatMessage(
156
+ role="user",
157
+ content=truncate_text_to_token_budget(
158
+ message.content,
159
+ remaining_tokens,
160
+ ),
161
+ )
162
+ )
150
163
  break
151
- if token_count > token_budget:
152
- continue
153
- role: Literal["user", "assistant"] = (
154
- "user" if message.author == "user" else "assistant"
155
- )
156
- retained.append(ChatMessage(role=role, content=message.content))
164
+ retained.append(ChatMessage(role="user", content=message.content))
157
165
  remaining_tokens -= token_count
158
166
  if remaining_tokens <= 0:
159
167
  break
@@ -161,6 +169,19 @@ def retained_recent_chat_messages(
161
169
  return retained
162
170
 
163
171
 
172
+ def truncate_text_to_token_budget(content: str, token_budget: int) -> str:
173
+ if token_budget <= 0 or not content:
174
+ return ""
175
+ character_budget = max(token_budget * 4, 1)
176
+ if len(content) <= character_budget:
177
+ return content
178
+ left_budget = character_budget // 2
179
+ right_budget = character_budget - left_budget
180
+ removed_tokens = approximate_token_count(content[left_budget:-right_budget])
181
+ marker = f"…{removed_tokens} tokens truncated…"
182
+ return f"{content[:left_budget]}{marker}{content[-right_budget:]}"
183
+
184
+
164
185
  def transcript_messages_after(
165
186
  messages: Sequence[StoredMessage],
166
187
  message_id: str | None,
@@ -1,7 +1,10 @@
1
+ import asyncio
1
2
  import logging
3
+ import re
2
4
  from collections.abc import AsyncIterator, Awaitable, Mapping, Sequence
3
5
  from enum import StrEnum
4
- from typing import Any, Literal, Protocol
6
+ from typing import Any, Literal, Protocol, cast
7
+ from urllib.parse import urlsplit, urlunsplit
5
8
 
6
9
  from pydantic import BaseModel, ConfigDict, Field
7
10
 
@@ -10,6 +13,7 @@ from flowent.logging import (
10
13
  configure_litellm_logging,
11
14
  write_llm_request_diagnostic,
12
15
  )
16
+ from flowent.usage import TokenUsage, token_usage_from_response
13
17
 
14
18
 
15
19
  class ProviderFormat(StrEnum):
@@ -55,6 +59,13 @@ class ToolCallDelta(BaseModel):
55
59
  type: str = "function"
56
60
 
57
61
 
62
+ class ChatCompletionResult(BaseModel):
63
+ model_config = ConfigDict(extra="forbid")
64
+
65
+ message: ChatMessage
66
+ usage: TokenUsage | None = None
67
+
68
+
58
69
  class CompletionCallable(Protocol):
59
70
  def __call__(self, **kwargs: Any) -> Awaitable[Any]: ...
60
71
 
@@ -65,6 +76,31 @@ class ModelListCallable(Protocol):
65
76
 
66
77
  logger = logging.getLogger("flowent.llm")
67
78
 
79
+ LLM_RETRY_LIMIT = 5
80
+ LLM_RETRY_BASE_DELAY_SECONDS = 0.5
81
+
82
+
83
+ class LLMStreamError(RuntimeError):
84
+ pass
85
+
86
+
87
+ async def wait_before_llm_retry(attempt_number: int) -> None:
88
+ await asyncio.sleep(LLM_RETRY_BASE_DELAY_SECONDS * attempt_number)
89
+
90
+
91
+ async def request_litellm_completion(
92
+ completion: CompletionCallable,
93
+ request: Mapping[str, Any],
94
+ ) -> Any:
95
+ for attempt_number in range(LLM_RETRY_LIMIT + 1):
96
+ try:
97
+ return await completion(**request)
98
+ except Exception:
99
+ if attempt_number >= LLM_RETRY_LIMIT:
100
+ raise
101
+ await wait_before_llm_retry(attempt_number + 1)
102
+ raise RuntimeError("LLM request failed")
103
+
68
104
 
69
105
  MODEL_PREFIXES: dict[ProviderFormat, str] = {
70
106
  ProviderFormat.OPENAI: "openai",
@@ -72,6 +108,16 @@ MODEL_PREFIXES: dict[ProviderFormat, str] = {
72
108
  ProviderFormat.ANTHROPIC: "anthropic",
73
109
  ProviderFormat.GEMINI: "gemini",
74
110
  }
111
+ _litellm_stream_error_patch_installed = False
112
+
113
+ PROVIDER_API_VERSIONS: dict[ProviderFormat, str] = {
114
+ ProviderFormat.OPENAI: "v1",
115
+ ProviderFormat.OPENAI_RESPONSES: "v1",
116
+ ProviderFormat.ANTHROPIC: "v1",
117
+ ProviderFormat.GEMINI: "v1beta",
118
+ }
119
+
120
+ VERSION_PATH_SEGMENT = re.compile(r"^v\d+(?:[a-z]+)?$", re.IGNORECASE)
75
121
 
76
122
 
77
123
  def provider_model_name(connection: ProviderConnection) -> str:
@@ -82,6 +128,40 @@ def provider_litellm_name(provider: ProviderFormat) -> str:
82
128
  return MODEL_PREFIXES[provider]
83
129
 
84
130
 
131
+ def normalize_provider_base_url(
132
+ provider: ProviderFormat, base_url: str | None
133
+ ) -> str | None:
134
+ if base_url is None:
135
+ return None
136
+ raw_base_url = base_url.strip()
137
+ if not raw_base_url:
138
+ return None
139
+ if raw_base_url.endswith("#"):
140
+ return raw_base_url[:-1].rstrip("/") or None
141
+
142
+ trimmed_base_url = raw_base_url.rstrip("/")
143
+ parsed_base_url = urlsplit(trimmed_base_url)
144
+ path_segments = [segment for segment in parsed_base_url.path.split("/") if segment]
145
+ if any(VERSION_PATH_SEGMENT.fullmatch(segment) for segment in path_segments):
146
+ return trimmed_base_url
147
+
148
+ version = PROVIDER_API_VERSIONS[provider]
149
+ if parsed_base_url.scheme and parsed_base_url.netloc:
150
+ path = parsed_base_url.path.rstrip("/")
151
+ normalized_path = f"{path}/{version}" if path else f"/{version}"
152
+ return urlunsplit(
153
+ (
154
+ parsed_base_url.scheme,
155
+ parsed_base_url.netloc,
156
+ normalized_path,
157
+ parsed_base_url.query,
158
+ parsed_base_url.fragment,
159
+ )
160
+ )
161
+
162
+ return f"{trimmed_base_url}/{version}"
163
+
164
+
85
165
  def normalize_provider_model_name(provider: ProviderFormat, model: str) -> str:
86
166
  prefix = f"{provider_litellm_name(provider)}/"
87
167
  if model.startswith(prefix):
@@ -89,6 +169,71 @@ def normalize_provider_model_name(provider: ProviderFormat, model: str) -> str:
89
169
  return model
90
170
 
91
171
 
172
+ def stream_failure_message(chunk: Any) -> str:
173
+ if isinstance(chunk, BaseModel):
174
+ chunk = chunk.model_dump()
175
+ if not isinstance(chunk, Mapping):
176
+ return ""
177
+
178
+ event_type = getattr(chunk.get("type"), "value", chunk.get("type"))
179
+ event_type = str(event_type or "")
180
+ if event_type == "error":
181
+ error = chunk.get("error", {})
182
+ elif event_type == "response.failed":
183
+ response = chunk.get("response", {})
184
+ error = value_at(response, "error", {})
185
+ else:
186
+ return ""
187
+
188
+ message = value_at(error, "message", "")
189
+ if isinstance(message, str) and message:
190
+ return message
191
+ code = value_at(error, "code", "")
192
+ if isinstance(code, str) and code:
193
+ return code
194
+ return "Upstream request failed"
195
+
196
+
197
+ def raise_for_stream_failure(chunk: Any) -> None:
198
+ message = stream_failure_message(chunk)
199
+ if message:
200
+ raise LLMStreamError(message)
201
+
202
+
203
+ def configure_litellm_stream_error_handling() -> None:
204
+ global _litellm_stream_error_patch_installed
205
+
206
+ if _litellm_stream_error_patch_installed:
207
+ return
208
+ try:
209
+ from litellm.completion_extras.litellm_responses_transformation.transformation import (
210
+ OpenAiResponsesToChatCompletionStreamIterator,
211
+ )
212
+ except Exception:
213
+ return
214
+
215
+ if getattr(
216
+ OpenAiResponsesToChatCompletionStreamIterator,
217
+ "_flowent_stream_error_patch_installed",
218
+ False,
219
+ ):
220
+ _litellm_stream_error_patch_installed = True
221
+ return
222
+
223
+ transformer = cast(Any, OpenAiResponsesToChatCompletionStreamIterator)
224
+ original = transformer.translate_responses_chunk_to_openai_stream
225
+
226
+ def translate_responses_chunk_to_openai_stream(parsed_chunk: Any) -> Any:
227
+ raise_for_stream_failure(parsed_chunk)
228
+ return original(parsed_chunk)
229
+
230
+ transformer.translate_responses_chunk_to_openai_stream = staticmethod(
231
+ translate_responses_chunk_to_openai_stream
232
+ )
233
+ transformer._flowent_stream_error_patch_installed = True
234
+ _litellm_stream_error_patch_installed = True
235
+
236
+
92
237
  def unique_model_names(provider: ProviderFormat, models: Sequence[str]) -> list[str]:
93
238
  seen: set[str] = set()
94
239
  normalized_models: list[str] = []
@@ -115,7 +260,7 @@ def list_provider_models(
115
260
  model_lister = get_valid_models
116
261
 
117
262
  models = model_lister(
118
- api_base=base_url,
263
+ api_base=normalize_provider_base_url(provider, base_url),
119
264
  api_key=secret_reference,
120
265
  check_provider_endpoint=True,
121
266
  custom_llm_provider=provider_litellm_name(provider),
@@ -161,8 +306,12 @@ def build_litellm_request(
161
306
  request["tools"] = list(tools)
162
307
  if stream:
163
308
  request["stream"] = True
164
- if connection.base_url:
165
- request["api_base"] = connection.base_url
309
+ request["stream_options"] = {"include_usage": True}
310
+ normalized_base_url = normalize_provider_base_url(
311
+ connection.provider, connection.base_url
312
+ )
313
+ if normalized_base_url:
314
+ request["api_base"] = normalized_base_url
166
315
  if connection.reasoning_effort != ReasoningEffort.DEFAULT:
167
316
  request["reasoning_effort"] = connection.reasoning_effort.value
168
317
  logger.log(
@@ -170,7 +319,7 @@ def build_litellm_request(
170
319
  "Built LiteLLM request provider=%s model=%s base_url=%s stream=%s tools=%s reasoning_effort=%s messages=%r",
171
320
  connection.provider,
172
321
  connection.model,
173
- connection.base_url or "",
322
+ normalized_base_url or "",
174
323
  stream,
175
324
  bool(tools),
176
325
  connection.reasoning_effort,
@@ -185,7 +334,7 @@ def record_litellm_request_diagnostic(
185
334
  ) -> None:
186
335
  write_llm_request_diagnostic(
187
336
  {
188
- "base_url": connection.base_url,
337
+ "base_url": request.get("api_base"),
189
338
  "litellm_model": request["model"],
190
339
  "messages": request["messages"],
191
340
  "model": connection.model,
@@ -204,6 +353,23 @@ async def complete_chat(
204
353
  completion: CompletionCallable | None = None,
205
354
  tools: Sequence[Mapping[str, Any]] | None = None,
206
355
  ) -> ChatMessage:
356
+ return (
357
+ await complete_chat_with_usage(
358
+ connection,
359
+ messages,
360
+ completion=completion,
361
+ tools=tools,
362
+ )
363
+ ).message
364
+
365
+
366
+ async def complete_chat_with_usage(
367
+ connection: ProviderConnection,
368
+ messages: Sequence[ChatMessage | Mapping[str, Any]],
369
+ *,
370
+ completion: CompletionCallable | None = None,
371
+ tools: Sequence[Mapping[str, Any]] | None = None,
372
+ ) -> ChatCompletionResult:
207
373
  if completion is None:
208
374
  from litellm import acompletion
209
375
 
@@ -217,10 +383,15 @@ async def complete_chat(
217
383
  )
218
384
  request = build_litellm_request(connection, messages, tools=tools)
219
385
  record_litellm_request_diagnostic(connection, request)
220
- response = await completion(**request)
386
+ response = await request_litellm_completion(completion, request)
221
387
  logger.log(TRACE_LEVEL, "LLM completion response=%r", response)
222
388
  choice = response["choices"][0]["message"]
223
- return ChatMessage(role=choice.get("role", "assistant"), content=choice["content"])
389
+ return ChatCompletionResult(
390
+ message=ChatMessage(
391
+ role=choice.get("role", "assistant"), content=choice["content"]
392
+ ),
393
+ usage=token_usage_from_response(response),
394
+ )
224
395
 
225
396
 
226
397
  def value_at(value: Any, key: str, default: Any = None) -> Any:
@@ -306,6 +477,10 @@ def chunk_delta_tool_calls(chunk: Any) -> list[ToolCallDelta]:
306
477
  return tool_call_deltas
307
478
 
308
479
 
480
+ def chunk_token_usage(chunk: Any) -> TokenUsage | None:
481
+ return token_usage_from_response(chunk)
482
+
483
+
309
484
  async def stream_chat_chunks(
310
485
  connection: ProviderConnection,
311
486
  messages: Sequence[ChatMessage | Mapping[str, Any]],
@@ -317,6 +492,7 @@ async def stream_chat_chunks(
317
492
  from litellm import acompletion
318
493
 
319
494
  configure_litellm_logging()
495
+ configure_litellm_stream_error_handling()
320
496
  completion = acompletion
321
497
 
322
498
  logger.debug(
@@ -326,10 +502,20 @@ async def stream_chat_chunks(
326
502
  )
327
503
  request = build_litellm_request(connection, messages, stream=True, tools=tools)
328
504
  record_litellm_request_diagnostic(connection, request)
329
- response = await completion(**request)
330
- async for chunk in response:
331
- logger.log(TRACE_LEVEL, "LLM stream chunk=%r", chunk)
332
- yield chunk
505
+ for attempt_number in range(LLM_RETRY_LIMIT + 1):
506
+ yielded_chunk = False
507
+ try:
508
+ response = await completion(**request)
509
+ async for chunk in response:
510
+ raise_for_stream_failure(chunk)
511
+ logger.log(TRACE_LEVEL, "LLM stream chunk=%r", chunk)
512
+ yielded_chunk = True
513
+ yield chunk
514
+ return
515
+ except Exception:
516
+ if yielded_chunk or attempt_number >= LLM_RETRY_LIMIT:
517
+ raise
518
+ await wait_before_llm_retry(attempt_number + 1)
333
519
 
334
520
 
335
521
  async def stream_chat(