npm - flowent - Versions diffs - 0.1.5 → 0.2.1 - Mend

flowent 0.1.5 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (67)

package/backend/pyproject.toml CHANGED Viewed

@@ -1,13 +1,39 @@
 [project]
 name = "flowent"
-version = "0.1.5"
-description = "A workflow orchestration platform for multi-agent collaboration."
+version = "0.2.1"
+description = "A workflow orchestration platform for multi-agent collaboration"
 readme = "README.md"
 authors = [
     { name = "ImFeH2", email = "i@feh2.im" }
 ]
-requires-python = ">=3.12,<3.14"
+requires-python = ">=3.11"
 license = "Apache-2.0"
+keywords = [
+    "agent",
+    "agents",
+    "ai",
+    "ai-agents",
+    "assistant",
+    "automation",
+    "code-generation",
+    "llm",
+    "mcp",
+    "orchestration",
+    "sandbox",
+    "web-application",
+    "workflow",
+]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: Apache Software License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Software Development",
+]
 dependencies = [
     "fastapi[standard]>=0.136.1",
     "litellm>=1.84.0",
@@ -37,14 +63,14 @@ requires = ["uv_build>=0.8.14,<0.9.0"]
 build-backend = "uv_build"
 [tool.ruff]
-target-version = "py312"
+target-version = "py311"
 [tool.ruff.lint]
 select = ["E", "W", "F", "I", "UP", "B", "SIM", "N", "RUF"]
 ignore = ["E501"]
 [tool.mypy]
-python_version = "3.12"
+python_version = "3.11"
 [tool.pytest.ini_options]
 testpaths = ["tests"]

package/backend/src/flowent/agent.py CHANGED Viewed

@@ -15,6 +15,7 @@ from flowent.llm import (
     chunk_delta_content,
     chunk_delta_reasoning,
     chunk_delta_tool_calls,
+    chunk_token_usage,
     stream_chat_chunks,
 )
 from flowent.logging import TRACE_LEVEL
@@ -146,56 +147,108 @@ async def run_agent_stream(
     while True:
         round_number += 1
         logger.debug("Agent round started id=%s round=%s", assistant_id, round_number)
+        logger.info(
+            "Agent model call started id=%s round=%s conversation_messages=%s",
+            assistant_id,
+            round_number,
+            len(conversation),
+        )
         yield AgentStreamEvent(event="output_start", data={"index": round_number})
         round_content = ""
         pending: dict[int, PendingToolCall] = {}
+        chunk_count = 0
+        content_delta_count = 0
+        reasoning_delta_count = 0
+        tool_delta_count = 0
-        async for chunk in stream_chat_chunks(
-            connection,
-            conversation,
-            completion=completion,
-            tools=[*tool_specs(), *list(extra_tool_specs or [])],
-        ):
-            reasoning = chunk_delta_reasoning(chunk)
-            if reasoning:
-                final_thinking += reasoning
-                logger.log(
-                    TRACE_LEVEL,
-                    "Agent stream reasoning id=%s content=%r",
-                    assistant_id,
-                    reasoning,
-                )
-                yield AgentStreamEvent(
-                    event="thinking_delta", data={"content": reasoning}
-                )
-            content = chunk_delta_content(chunk)
-            if content:
-                round_content += content
-                final_content += content
-                logger.log(
-                    TRACE_LEVEL,
-                    "Agent stream delta id=%s content=%r",
-                    assistant_id,
-                    content,
-                )
-                yield AgentStreamEvent(event="delta", data={"content": content})
-            for delta in chunk_delta_tool_calls(chunk):
-                pending.setdefault(delta.index, PendingToolCall()).apply_delta(delta)
+        try:
+            async for chunk in stream_chat_chunks(
+                connection,
+                conversation,
+                completion=completion,
+                tools=[*tool_specs(), *list(extra_tool_specs or [])],
+            ):
+                chunk_count += 1
+                usage = chunk_token_usage(chunk)
+                if usage is not None:
+                    yield AgentStreamEvent(
+                        event="usage",
+                        data={"usage": usage.model_dump()},
+                    )
+                reasoning = chunk_delta_reasoning(chunk)
+                if reasoning:
+                    reasoning_delta_count += 1
+                    final_thinking += reasoning
+                    logger.log(
+                        TRACE_LEVEL,
+                        "Agent stream reasoning id=%s round=%s content=%r",
+                        assistant_id,
+                        round_number,
+                        reasoning,
+                    )
+                    yield AgentStreamEvent(
+                        event="thinking_delta", data={"content": reasoning}
+                    )
+                content = chunk_delta_content(chunk)
+                if content:
+                    content_delta_count += 1
+                    round_content += content
+                    final_content += content
+                    logger.log(
+                        TRACE_LEVEL,
+                        "Agent stream delta id=%s round=%s content=%r",
+                        assistant_id,
+                        round_number,
+                        content,
+                    )
+                    yield AgentStreamEvent(event="delta", data={"content": content})
+                for delta in chunk_delta_tool_calls(chunk):
+                    tool_delta_count += 1
+                    pending.setdefault(delta.index, PendingToolCall()).apply_delta(
+                        delta
+                    )
+        except Exception:
+            logger.exception(
+                "Agent model call failed id=%s round=%s chunk_count=%s content_deltas=%s reasoning_deltas=%s tool_deltas=%s conversation_messages=%s",
+                assistant_id,
+                round_number,
+                chunk_count,
+                content_delta_count,
+                reasoning_delta_count,
+                tool_delta_count,
+                len(conversation),
+            )
+            raise
         tool_calls = [pending[index] for index in sorted(pending)]
+        logger.info(
+            "Agent model call completed id=%s round=%s chunk_count=%s content_deltas=%s reasoning_deltas=%s tool_deltas=%s tool_calls=%s content_length=%s decision=%s",
+            assistant_id,
+            round_number,
+            chunk_count,
+            content_delta_count,
+            reasoning_delta_count,
+            tool_delta_count,
+            len(tool_calls),
+            len(round_content),
+            "run_tools" if tool_calls else "final_response",
+        )
         logger.log(
             TRACE_LEVEL,
-            "Agent round tool calls id=%s tool_calls=%r",
+            "Agent round tool calls id=%s round=%s tool_calls=%r",
             assistant_id,
+            round_number,
             tool_calls,
         )
         if not tool_calls:
             if not final_content and not final_thinking:
                 raise RuntimeError(EMPTY_MODEL_RESPONSE_ERROR)
             logger.info(
-                "Agent response completed id=%s content_length=%s",
+                "Agent response completed id=%s rounds=%s content_length=%s thinking_length=%s decision=final_response",
                 assistant_id,
+                round_number,
                 len(final_content),
+                len(final_thinking),
             )
             logger.log(
                 TRACE_LEVEL,
@@ -301,11 +354,28 @@ async def run_agent_stream(
                 )
             conversation.append(tool_result_message(tool_call_id, result_content))
+        logger.info(
+            "Agent continuing after tools id=%s completed_round=%s tool_results=%s conversation_messages=%s decision=continue",
+            assistant_id,
+            round_number,
+            len(tool_calls),
+            len(conversation),
+        )
         if context_compactor is not None:
             compaction = await context_compactor(conversation)
             if compaction is not None:
-                conversation = [dict(message) for message in compaction.conversation]
-                yield AgentStreamEvent(
-                    event="context_optimized",
-                    data={"message": dict(compaction.message)},
+                logger.info(
+                    "Agent context optimized id=%s round=%s conversation_messages_before=%s conversation_messages_after=%s",
+                    assistant_id,
+                    round_number,
+                    len(conversation),
+                    len(compaction.conversation),
                 )
+                conversation = [dict(message) for message in compaction.conversation]
+                compaction_message = dict(compaction.message)
+                usage_info = compaction_message.pop("usage_info", None)
+                event_data: dict[str, object] = {"message": compaction_message}
+                if isinstance(usage_info, dict):
+                    event_data["usage_info"] = usage_info
+                yield AgentStreamEvent(event="context_optimized", data=event_data)

package/backend/src/flowent/compact.py CHANGED Viewed

@@ -8,8 +8,9 @@ from flowent.llm import (
     ChatMessage,
     CompletionCallable,
     ProviderConnection,
-    complete_chat,
+    complete_chat_with_usage,
 )
+from flowent.usage import TokenUsage
 if TYPE_CHECKING:
     from flowent.storage import StoredMessage
@@ -44,6 +45,7 @@ class CompactResult:
     method: CompactMethod
     replacement_history: list[ChatMessage]
     summary: str
+    summary_usage: TokenUsage | None
     token_after: int
     token_before: int
@@ -66,12 +68,12 @@ class LocalSummaryCompactProvider:
         *,
         completion: CompletionCallable | None = None,
     ) -> CompactResult:
-        summary_message = await complete_chat(
+        summary_result = await complete_chat_with_usage(
             connection,
             compact_prompt_messages(compact_input.model_history),
             completion=completion,
         )
-        summary = summary_message.content.strip()
+        summary = summary_result.message.content.strip()
         replacement_history = build_replacement_history(
             summary,
             compact_input.messages,
@@ -81,6 +83,7 @@ class LocalSummaryCompactProvider:
             method="local_summary",
             replacement_history=replacement_history,
             summary=summary,
+            summary_usage=summary_result.usage,
             token_after=approximate_tokens_for_messages(replacement_history),
             token_before=approximate_tokens_for_messages(compact_input.model_history),
         )
@@ -127,15 +130,15 @@ def build_replacement_history(
     token_budget: int = DEFAULT_RETAINED_MESSAGE_TOKEN_BUDGET,
 ) -> list[ChatMessage]:
     return [
-        ChatMessage(role="user", content=f"{COMPACT_SUMMARY_PREFIX}{summary}"),
-        *retained_recent_chat_messages(
+        *retained_recent_user_messages(
             recent_messages,
             token_budget=token_budget,
         ),
+        ChatMessage(role="user", content=f"{COMPACT_SUMMARY_PREFIX}{summary}"),
     ]
-def retained_recent_chat_messages(
+def retained_recent_user_messages(
     messages: Sequence[StoredMessage],
     *,
     token_budget: int = DEFAULT_RETAINED_MESSAGE_TOKEN_BUDGET,
@@ -143,17 +146,22 @@ def retained_recent_chat_messages(
     retained: list[ChatMessage] = []
     remaining_tokens = max(token_budget, 0)
     for message in reversed(messages):
-        if message.author not in {"user", "assistant"}:
+        if message.author != "user":
             continue
         token_count = approximate_token_count(message.content)
-        if retained and token_count > remaining_tokens:
+        if token_count > remaining_tokens:
+            if remaining_tokens > 0:
+                retained.append(
+                    ChatMessage(
+                        role="user",
+                        content=truncate_text_to_token_budget(
+                            message.content,
+                            remaining_tokens,
+                        ),
+                    )
+                )
             break
-        if token_count > token_budget:
-            continue
-        role: Literal["user", "assistant"] = (
-            "user" if message.author == "user" else "assistant"
-        )
-        retained.append(ChatMessage(role=role, content=message.content))
+        retained.append(ChatMessage(role="user", content=message.content))
         remaining_tokens -= token_count
         if remaining_tokens <= 0:
             break
@@ -161,6 +169,19 @@ def retained_recent_chat_messages(
     return retained
+def truncate_text_to_token_budget(content: str, token_budget: int) -> str:
+    if token_budget <= 0 or not content:
+        return ""
+    character_budget = max(token_budget * 4, 1)
+    if len(content) <= character_budget:
+        return content
+    left_budget = character_budget // 2
+    right_budget = character_budget - left_budget
+    removed_tokens = approximate_token_count(content[left_budget:-right_budget])
+    marker = f"…{removed_tokens} tokens truncated…"
+    return f"{content[:left_budget]}{marker}{content[-right_budget:]}"
 def transcript_messages_after(
     messages: Sequence[StoredMessage],
     message_id: str | None,

package/backend/src/flowent/llm.py CHANGED Viewed

@@ -1,7 +1,10 @@
+import asyncio
 import logging
+import re
 from collections.abc import AsyncIterator, Awaitable, Mapping, Sequence
 from enum import StrEnum
-from typing import Any, Literal, Protocol
+from typing import Any, Literal, Protocol, cast
+from urllib.parse import urlsplit, urlunsplit
 from pydantic import BaseModel, ConfigDict, Field
@@ -10,6 +13,7 @@ from flowent.logging import (
     configure_litellm_logging,
     write_llm_request_diagnostic,
 )
+from flowent.usage import TokenUsage, token_usage_from_response
 class ProviderFormat(StrEnum):
@@ -55,6 +59,13 @@ class ToolCallDelta(BaseModel):
     type: str = "function"
+class ChatCompletionResult(BaseModel):
+    model_config = ConfigDict(extra="forbid")
+    message: ChatMessage
+    usage: TokenUsage | None = None
 class CompletionCallable(Protocol):
     def __call__(self, **kwargs: Any) -> Awaitable[Any]: ...
@@ -65,6 +76,31 @@ class ModelListCallable(Protocol):
 logger = logging.getLogger("flowent.llm")
+LLM_RETRY_LIMIT = 5
+LLM_RETRY_BASE_DELAY_SECONDS = 0.5
+class LLMStreamError(RuntimeError):
+    pass
+async def wait_before_llm_retry(attempt_number: int) -> None:
+    await asyncio.sleep(LLM_RETRY_BASE_DELAY_SECONDS * attempt_number)
+async def request_litellm_completion(
+    completion: CompletionCallable,
+    request: Mapping[str, Any],
+) -> Any:
+    for attempt_number in range(LLM_RETRY_LIMIT + 1):
+        try:
+            return await completion(**request)
+        except Exception:
+            if attempt_number >= LLM_RETRY_LIMIT:
+                raise
+            await wait_before_llm_retry(attempt_number + 1)
+    raise RuntimeError("LLM request failed")
 MODEL_PREFIXES: dict[ProviderFormat, str] = {
     ProviderFormat.OPENAI: "openai",
@@ -72,6 +108,16 @@ MODEL_PREFIXES: dict[ProviderFormat, str] = {
     ProviderFormat.ANTHROPIC: "anthropic",
     ProviderFormat.GEMINI: "gemini",
 }
+_litellm_stream_error_patch_installed = False
+PROVIDER_API_VERSIONS: dict[ProviderFormat, str] = {
+    ProviderFormat.OPENAI: "v1",
+    ProviderFormat.OPENAI_RESPONSES: "v1",
+    ProviderFormat.ANTHROPIC: "v1",
+    ProviderFormat.GEMINI: "v1beta",
+}
+VERSION_PATH_SEGMENT = re.compile(r"^v\d+(?:[a-z]+)?$", re.IGNORECASE)
 def provider_model_name(connection: ProviderConnection) -> str:
@@ -82,6 +128,40 @@ def provider_litellm_name(provider: ProviderFormat) -> str:
     return MODEL_PREFIXES[provider]
+def normalize_provider_base_url(
+    provider: ProviderFormat, base_url: str | None
+) -> str | None:
+    if base_url is None:
+        return None
+    raw_base_url = base_url.strip()
+    if not raw_base_url:
+        return None
+    if raw_base_url.endswith("#"):
+        return raw_base_url[:-1].rstrip("/") or None
+    trimmed_base_url = raw_base_url.rstrip("/")
+    parsed_base_url = urlsplit(trimmed_base_url)
+    path_segments = [segment for segment in parsed_base_url.path.split("/") if segment]
+    if any(VERSION_PATH_SEGMENT.fullmatch(segment) for segment in path_segments):
+        return trimmed_base_url
+    version = PROVIDER_API_VERSIONS[provider]
+    if parsed_base_url.scheme and parsed_base_url.netloc:
+        path = parsed_base_url.path.rstrip("/")
+        normalized_path = f"{path}/{version}" if path else f"/{version}"
+        return urlunsplit(
+            (
+                parsed_base_url.scheme,
+                parsed_base_url.netloc,
+                normalized_path,
+                parsed_base_url.query,
+                parsed_base_url.fragment,
+            )
+        )
+    return f"{trimmed_base_url}/{version}"
 def normalize_provider_model_name(provider: ProviderFormat, model: str) -> str:
     prefix = f"{provider_litellm_name(provider)}/"
     if model.startswith(prefix):
@@ -89,6 +169,71 @@ def normalize_provider_model_name(provider: ProviderFormat, model: str) -> str:
     return model
+def stream_failure_message(chunk: Any) -> str:
+    if isinstance(chunk, BaseModel):
+        chunk = chunk.model_dump()
+    if not isinstance(chunk, Mapping):
+        return ""
+    event_type = getattr(chunk.get("type"), "value", chunk.get("type"))
+    event_type = str(event_type or "")
+    if event_type == "error":
+        error = chunk.get("error", {})
+    elif event_type == "response.failed":
+        response = chunk.get("response", {})
+        error = value_at(response, "error", {})
+    else:
+        return ""
+    message = value_at(error, "message", "")
+    if isinstance(message, str) and message:
+        return message
+    code = value_at(error, "code", "")
+    if isinstance(code, str) and code:
+        return code
+    return "Upstream request failed"
+def raise_for_stream_failure(chunk: Any) -> None:
+    message = stream_failure_message(chunk)
+    if message:
+        raise LLMStreamError(message)
+def configure_litellm_stream_error_handling() -> None:
+    global _litellm_stream_error_patch_installed
+    if _litellm_stream_error_patch_installed:
+        return
+    try:
+        from litellm.completion_extras.litellm_responses_transformation.transformation import (
+            OpenAiResponsesToChatCompletionStreamIterator,
+        )
+    except Exception:
+        return
+    if getattr(
+        OpenAiResponsesToChatCompletionStreamIterator,
+        "_flowent_stream_error_patch_installed",
+        False,
+    ):
+        _litellm_stream_error_patch_installed = True
+        return
+    transformer = cast(Any, OpenAiResponsesToChatCompletionStreamIterator)
+    original = transformer.translate_responses_chunk_to_openai_stream
+    def translate_responses_chunk_to_openai_stream(parsed_chunk: Any) -> Any:
+        raise_for_stream_failure(parsed_chunk)
+        return original(parsed_chunk)
+    transformer.translate_responses_chunk_to_openai_stream = staticmethod(
+        translate_responses_chunk_to_openai_stream
+    )
+    transformer._flowent_stream_error_patch_installed = True
+    _litellm_stream_error_patch_installed = True
 def unique_model_names(provider: ProviderFormat, models: Sequence[str]) -> list[str]:
     seen: set[str] = set()
     normalized_models: list[str] = []
@@ -115,7 +260,7 @@ def list_provider_models(
         model_lister = get_valid_models
     models = model_lister(
-        api_base=base_url,
+        api_base=normalize_provider_base_url(provider, base_url),
         api_key=secret_reference,
         check_provider_endpoint=True,
         custom_llm_provider=provider_litellm_name(provider),
@@ -161,8 +306,12 @@ def build_litellm_request(
         request["tools"] = list(tools)
     if stream:
         request["stream"] = True
-    if connection.base_url:
-        request["api_base"] = connection.base_url
+        request["stream_options"] = {"include_usage": True}
+    normalized_base_url = normalize_provider_base_url(
+        connection.provider, connection.base_url
+    )
+    if normalized_base_url:
+        request["api_base"] = normalized_base_url
     if connection.reasoning_effort != ReasoningEffort.DEFAULT:
         request["reasoning_effort"] = connection.reasoning_effort.value
     logger.log(
@@ -170,7 +319,7 @@ def build_litellm_request(
         "Built LiteLLM request provider=%s model=%s base_url=%s stream=%s tools=%s reasoning_effort=%s messages=%r",
         connection.provider,
         connection.model,
-        connection.base_url or "",
+        normalized_base_url or "",
         stream,
         bool(tools),
         connection.reasoning_effort,
@@ -185,7 +334,7 @@ def record_litellm_request_diagnostic(
 ) -> None:
     write_llm_request_diagnostic(
         {
-            "base_url": connection.base_url,
+            "base_url": request.get("api_base"),
             "litellm_model": request["model"],
             "messages": request["messages"],
             "model": connection.model,
@@ -204,6 +353,23 @@ async def complete_chat(
     completion: CompletionCallable | None = None,
     tools: Sequence[Mapping[str, Any]] | None = None,
 ) -> ChatMessage:
+    return (
+        await complete_chat_with_usage(
+            connection,
+            messages,
+            completion=completion,
+            tools=tools,
+        )
+    ).message
+async def complete_chat_with_usage(
+    connection: ProviderConnection,
+    messages: Sequence[ChatMessage | Mapping[str, Any]],
+    *,
+    completion: CompletionCallable | None = None,
+    tools: Sequence[Mapping[str, Any]] | None = None,
+) -> ChatCompletionResult:
     if completion is None:
         from litellm import acompletion
@@ -217,10 +383,15 @@ async def complete_chat(
     )
     request = build_litellm_request(connection, messages, tools=tools)
     record_litellm_request_diagnostic(connection, request)
-    response = await completion(**request)
+    response = await request_litellm_completion(completion, request)
     logger.log(TRACE_LEVEL, "LLM completion response=%r", response)
     choice = response["choices"][0]["message"]
-    return ChatMessage(role=choice.get("role", "assistant"), content=choice["content"])
+    return ChatCompletionResult(
+        message=ChatMessage(
+            role=choice.get("role", "assistant"), content=choice["content"]
+        ),
+        usage=token_usage_from_response(response),
+    )
 def value_at(value: Any, key: str, default: Any = None) -> Any:
@@ -306,6 +477,10 @@ def chunk_delta_tool_calls(chunk: Any) -> list[ToolCallDelta]:
     return tool_call_deltas
+def chunk_token_usage(chunk: Any) -> TokenUsage | None:
+    return token_usage_from_response(chunk)
 async def stream_chat_chunks(
     connection: ProviderConnection,
     messages: Sequence[ChatMessage | Mapping[str, Any]],
@@ -317,6 +492,7 @@ async def stream_chat_chunks(
         from litellm import acompletion
         configure_litellm_logging()
+        configure_litellm_stream_error_handling()
         completion = acompletion
     logger.debug(
@@ -326,10 +502,20 @@ async def stream_chat_chunks(
     )
     request = build_litellm_request(connection, messages, stream=True, tools=tools)
     record_litellm_request_diagnostic(connection, request)
-    response = await completion(**request)
-    async for chunk in response:
-        logger.log(TRACE_LEVEL, "LLM stream chunk=%r", chunk)
-        yield chunk
+    for attempt_number in range(LLM_RETRY_LIMIT + 1):
+        yielded_chunk = False
+        try:
+            response = await completion(**request)
+            async for chunk in response:
+                raise_for_stream_failure(chunk)
+                logger.log(TRACE_LEVEL, "LLM stream chunk=%r", chunk)
+                yielded_chunk = True
+                yield chunk
+            return
+        except Exception:
+            if yielded_chunk or attempt_number >= LLM_RETRY_LIMIT:
+                raise
+            await wait_before_llm_retry(attempt_number + 1)
 async def stream_chat(