promptbuilder 0.4.37__tar.gz → 0.4.39__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {promptbuilder-0.4.37/promptbuilder.egg-info → promptbuilder-0.4.39}/PKG-INFO +2 -1
  2. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/agent/agent.py +16 -8
  3. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/llm_client/base_client.py +127 -20
  4. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/llm_client/google_client.py +2 -0
  5. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/llm_client/litellm_client.py +6 -6
  6. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/llm_client/logfire_decorators.py +6 -3
  7. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/llm_client/openai_client.py +56 -2
  8. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/llm_client/utils.py +155 -0
  9. {promptbuilder-0.4.37 → promptbuilder-0.4.39/promptbuilder.egg-info}/PKG-INFO +2 -1
  10. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder.egg-info/SOURCES.txt +0 -1
  11. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder.egg-info/requires.txt +1 -0
  12. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/setup.py +3 -2
  13. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/tests/test_timeout_google.py +2 -2
  14. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/tests/test_timeout_litellm.py +7 -3
  15. promptbuilder-0.4.37/promptbuilder/llm_client/vertex_client.py +0 -403
  16. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/LICENSE +0 -0
  17. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/MANIFEST.in +0 -0
  18. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/Readme.md +0 -0
  19. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/__init__.py +0 -0
  20. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/agent/__init__.py +0 -0
  21. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/agent/context.py +0 -0
  22. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/agent/tool.py +0 -0
  23. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/agent/utils.py +0 -0
  24. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/embeddings.py +0 -0
  25. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/llm_client/__init__.py +0 -0
  26. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/llm_client/aisuite_client.py +0 -0
  27. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/llm_client/anthropic_client.py +0 -0
  28. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/llm_client/bedrock_client.py +0 -0
  29. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/llm_client/config.py +0 -0
  30. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/llm_client/exceptions.py +0 -0
  31. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/llm_client/main.py +0 -0
  32. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/llm_client/types.py +0 -0
  33. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder/prompt_builder.py +0 -0
  34. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder.egg-info/dependency_links.txt +0 -0
  35. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/promptbuilder.egg-info/top_level.txt +0 -0
  36. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/pyproject.toml +0 -0
  37. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/setup.cfg +0 -0
  38. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/tests/test_llm_client.py +0 -0
  39. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/tests/test_llm_client_async.py +0 -0
  40. {promptbuilder-0.4.37 → promptbuilder-0.4.39}/tests/test_timeout_openai.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: promptbuilder
- Version: 0.4.37
+ Version: 0.4.39
  Summary: Library for building prompts for LLMs
  Home-page: https://github.com/kapulkin/promptbuilder
  Author: Kapulkin Stanislav
@@ -21,6 +21,7 @@ Requires-Dist: aioboto3
  Requires-Dist: litellm
  Requires-Dist: httpx
  Requires-Dist: aiohttp
+ Requires-Dist: tiktoken
  Dynamic: author
  Dynamic: author-email
  Dynamic: classifier
@@ -82,21 +82,24 @@ class AgentRouter(Agent[MessageType, ContextType]):
  )
  content = response.candidates[0].content

+ router_tool_contents = []
  for part in content.parts:
  if part.function_call is None:
  if part.text is not None:
- self.context.dialog_history.add_message(Content(parts=[Part(text=part.text, thought=part.thought)], role="model"))
+ router_tool_contents.append(Content(parts=[Part(text=part.text)], role="model"))
  else:
  tr_name = part.function_call.name
- args = part.function_call.args
- if args is None:
- args = {}
+ tr_args = part.function_call.args
+ if tr_args is None:
+ tr_args = {}

  route = self.routes.get(tr_name)
  if route is not None:
+ router_tool_contents = []
+
  self.last_used_tr_name = tr_name
- logger.debug("Route %s called with args: %s", tr_name, args)
- merged_args = {**kwargs, **args}
+ logger.debug("Route %s called with args: %s", tr_name, tr_args)
+ merged_args = {**kwargs, **tr_args}
  result = await route(**merged_args)
  logger.debug("Route %s result: %s", tr_name, result)
  trs_to_exclude = trs_to_exclude | {tr_name}
@@ -108,9 +111,14 @@ class AgentRouter(Agent[MessageType, ContextType]):
  tool = self.tools.get(tr_name)
  if tool is not None:
  self.last_used_tr_name = tr_name
+
+ for rtc in router_tool_contents:
+ self.context.dialog_history.add_message(rtc)
+ router_tool_contents = []
+
  self.context.dialog_history.add_message(content)
- logger.debug("Tool %s called with args: %s", tr_name, args)
- tool_response = await tool(**args)
+ logger.debug("Tool %s called with args: %s", tr_name, tr_args)
+ tool_response = await tool(**tr_args)
  logger.debug("Tool %s response: %s", tr_name, tool_response)
  self.context.dialog_history.add_message(tool_response.candidates[0].content)
  trs_to_exclude = trs_to_exclude | {tr_name}
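In plain terms, the agent change above stops writing the model's free-text parts into the dialog history immediately: they are buffered in router_tool_contents, discarded when a route handler fires, and flushed only right before a regular tool call is recorded. A minimal Python sketch of that buffering pattern (the history list and callback names here are illustrative, not part of the package API):

    from promptbuilder.llm_client.types import Content, Part

    pending: list[Content] = []   # plays the role of router_tool_contents
    history: list[Content] = []   # stand-in for self.context.dialog_history

    def on_text_part(text: str) -> None:
        # Text emitted alongside function calls is buffered, not stored yet.
        pending.append(Content(parts=[Part(text=text)], role="model"))

    def on_route_call() -> None:
        # A matched route discards any buffered text.
        pending.clear()

    def on_tool_call(tool_content: Content) -> None:
        # A tool call flushes the buffered text first, then records the tool turn.
        history.extend(pending)
        pending.clear()
        history.append(tool_content)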
@@ -134,6 +134,7 @@ class BaseLLMClient(ABC, utils.InheritDecoratorsMixin):
  @logfire_decorators.create
  @utils.retry_cls
  @utils.rpm_limit_cls
+ @utils.tpm_limit_cls
  @abstractmethod
  def _create(
  self,
@@ -252,13 +253,12 @@ class BaseLLMClient(ABC, utils.InheritDecoratorsMixin):
  if result_type is None:
  return response.text
  else:
- if result_type == "json":
+ if result_type == "json" and response.parsed is None:
  response.parsed = BaseLLMClient.as_json(response.text)
  return response.parsed

-
  @staticmethod
- def _append_generated_part(messages: list[Content], response: Response) -> Content | None:
+ def _responce_to_text(response: Response):
  assert(response.candidates and response.candidates[0].content), "Response must contain at least one candidate with content."

  text_parts = [
@@ -267,6 +267,7 @@ class BaseLLMClient(ABC, utils.InheritDecoratorsMixin):
  if text_parts is not None and len(text_parts) > 0:
  response_text = "".join(part.text for part in text_parts)
  is_thought = False
+ return response_text, is_thought
  else:
  thought_parts = [
  part for part in response.candidates[0].content.parts if part.text and part.thought
@@ -274,17 +275,28 @@ class BaseLLMClient(ABC, utils.InheritDecoratorsMixin):
  if thought_parts is not None and len(thought_parts) > 0:
  response_text = "".join(part.text for part in thought_parts)
  is_thought = True
+ return response_text, is_thought
  else:
- return None
+ return None, None
+
+ @staticmethod
+ def _append_to_message(message: Content, text: str, is_thought: bool):
+ if message.parts and message.parts[-1].text is not None and message.parts[-1].thought == is_thought:
+ message.parts[-1].text += text
+ else:
+ if not message.parts:
+ message.parts = []
+ message.parts.append(Part(text=text, thought=is_thought))
+
+ @staticmethod
+ def _append_generated_part(messages: list[Content], response: Response) -> Content | None:
+ response_text, is_thought = BaseLLMClient._responce_to_text(response)
+ if response_text is None:
+ return None

  if len(messages) > 0 and messages[-1].role == "model":
  message_to_append = messages[-1]
- if message_to_append.parts and message_to_append.parts[-1].text is not None and message_to_append.parts[-1].thought == is_thought:
- message_to_append.parts[-1].text += response_text
- else:
- if not message_to_append.parts:
- message_to_append.parts = []
- message_to_append.parts.append(Part(text=response_text, thought=is_thought))
+ BaseLLMClient._append_to_message(message_to_append, response_text, is_thought)
  else:
  messages.append(Content(parts=[Part(text=response_text, thought=is_thought)], role="model"))
  return messages[-1]
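The refactor splits the old _append_generated_part into two helpers: _responce_to_text pulls either the plain text parts or, failing that, the thought parts out of a Response, and _append_to_message merges new text into the last Part when its thought flag matches, otherwise appending a fresh Part. A rough illustration of the merging behaviour, assuming the Part/Content models from promptbuilder.llm_client.types and the BaseLLMClient statics introduced above:

    from promptbuilder.llm_client.types import Content, Part

    msg = Content(parts=[Part(text="Hello, ", thought=False)], role="model")

    # Same thought flag as the trailing part: text is concatenated in place.
    BaseLLMClient._append_to_message(msg, "world!", False)
    assert msg.parts[-1].text == "Hello, world!"

    # Different thought flag: a new Part is appended instead of being merged.
    BaseLLMClient._append_to_message(msg, "thinking...", True)
    assert len(msg.parts) == 2 and msg.parts[-1].thought is True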
@@ -296,6 +308,7 @@ class BaseLLMClient(ABC, utils.InheritDecoratorsMixin):
  @logfire_decorators.create_stream
  @utils.retry_cls
  @utils.rpm_limit_cls
+ @utils.tpm_limit_cls
  def _create_stream(
  self,
  messages: list[Content],
@@ -539,6 +552,7 @@ class BaseLLMClientAsync(ABC, utils.InheritDecoratorsMixin):
  @logfire_decorators.create_async
  @utils.retry_cls_async
  @utils.rpm_limit_cls_async
+ @utils.tpm_limit_cls_async
  @abstractmethod
  async def _create(
  self,
@@ -656,13 +670,14 @@ class BaseLLMClientAsync(ABC, utils.InheritDecoratorsMixin):
  if result_type is None:
  return response.text
  else:
- if result_type == "json":
+ if result_type == "json" and response.parsed is None:
  response.parsed = BaseLLMClient.as_json(response.text)
  return response.parsed

  @logfire_decorators.create_stream_async
  @utils.retry_cls_async
  @utils.rpm_limit_cls_async
+ @utils.tpm_limit_cls_async
  async def _create_stream(
  self,
  messages: list[Content],
@@ -819,19 +834,65 @@ class CachedLLMClient(BaseLLMClient):
  self.llm_client = llm_client
  self.cache_dir = cache_dir

- def _create(self, messages: list[Content], **kwargs) -> Response:
- response, messages_dump, cache_path = CachedLLMClient.create_cached(self.llm_client, self.cache_dir, messages, **kwargs)
+ def _create(self, messages: list[Content], system_message: str | None = None, **kwargs) -> Response:
+ response, messages_dump, cache_path = CachedLLMClient.create_cached(self.llm_client, self.cache_dir, messages, system_message, **kwargs)
  if response is not None:
  return response
- response = self.llm_client.create(messages, **kwargs)
+ response = self.llm_client.create(messages, system_message=system_message, **kwargs)
  CachedLLMClient.save_cache(cache_path, self.llm_client.full_model_name, messages_dump, response)
  return response

+
+ def _create_stream(
+ self,
+ messages: list[Content],
+ *,
+ thinking_config: ThinkingConfig | None = None,
+ system_message: str | None = None,
+ max_tokens: int | None = None,
+ ) -> Iterator[Response]:
+ response, messages_dump, cache_path = CachedLLMClient.create_cached(
+ self.llm_client, self.cache_dir, messages,
+ thinking_config=thinking_config,
+ system_message=system_message,
+ max_tokens=max_tokens,
+ )
+ if response is not None:
+ yield response
+ return
+
+ accumulated_content: Content | None = None
+ final_response: Response | None = None
+
+ for response in self.llm_client._create_stream(
+ messages=messages,
+ thinking_config=thinking_config,
+ system_message=system_message,
+ max_tokens=max_tokens,
+ ):
+ # Accumulate content from each response chunk
+ if response.candidates and response.candidates[0].content:
+ response_text, is_thought = BaseLLMClient._responce_to_text(response)
+ if response_text is not None:
+ if accumulated_content is None:
+ accumulated_content = Content(parts=[], role="model")
+ BaseLLMClient._append_to_message(accumulated_content, response_text, is_thought or False)
+ final_response = response
+ yield response
+
+ # Save accumulated response to cache
+ if final_response is not None and accumulated_content is not None and final_response.candidates:
+ cached_response = Response(
+ candidates=[final_response.candidates[0].model_copy(update={"content": accumulated_content})],
+ usage_metadata=final_response.usage_metadata,
+ )
+ CachedLLMClient.save_cache(cache_path, self.llm_client.full_model_name, messages_dump, cached_response)
+
  @staticmethod
- def create_cached(llm_client: BaseLLMClient | BaseLLMClientAsync, cache_dir: str, messages: list[Content], **kwargs) -> tuple[Response | None, list[dict], str]:
+ def create_cached(llm_client: BaseLLMClient | BaseLLMClientAsync, cache_dir: str, messages: list[Content], system_message: str | None = None, **kwargs) -> tuple[Response | None, list[dict], str]:
  messages_dump = [message.model_dump() for message in messages]
  key = hashlib.sha256(
- json.dumps((llm_client.full_model_name, messages_dump)).encode()
+ json.dumps((llm_client.full_model_name, messages_dump, system_message)).encode()
  ).hexdigest()
  cache_path = os.path.join(cache_dir, f"{key}.json")
  if os.path.exists(cache_path):
@@ -855,7 +916,7 @@ class CachedLLMClient(BaseLLMClient):
  @staticmethod
  def save_cache(cache_path: str, full_model_name: str, messages_dump: list[dict], response: Response):
  with open(cache_path, 'wt') as f:
- json.dump({"full_model_name": full_model_name, "request": messages_dump, "response": response.model_dump()}, f, indent=4)
+ json.dump({"full_model_name": full_model_name, "request": messages_dump, "response": Response.model_dump(response, mode="json")}, f, indent=4)


  class CachedLLMClientAsync(BaseLLMClientAsync):
@@ -869,10 +930,56 @@ class CachedLLMClientAsync(BaseLLMClientAsync):
  self.llm_client = llm_client
  self.cache_dir = cache_dir

- async def _create(self, messages: list[Content], **kwargs) -> Response:
- response, messages_dump, cache_path = CachedLLMClient.create_cached(self.llm_client, self.cache_dir, messages, **kwargs)
+ async def _create(self, messages: list[Content], system_message: str | None = None, **kwargs) -> Response:
+ response, messages_dump, cache_path = CachedLLMClient.create_cached(self.llm_client, self.cache_dir, messages, system_message, **kwargs)
  if response is not None:
  return response
- response = await self.llm_client.create(messages, **kwargs)
+ response = await self.llm_client.create(messages, system_message=system_message, **kwargs)
  CachedLLMClient.save_cache(cache_path, self.llm_client.full_model_name, messages_dump, response)
  return response
+
+
+ async def _create_stream(
+ self,
+ messages: list[Content],
+ *,
+ thinking_config: ThinkingConfig | None = None,
+ system_message: str | None = None,
+ max_tokens: int | None = None,
+ ) -> AsyncIterator[Response]:
+ response, messages_dump, cache_path = CachedLLMClient.create_cached(
+ self.llm_client, self.cache_dir, messages,
+ thinking_config=thinking_config,
+ system_message=system_message,
+ max_tokens=max_tokens,
+ )
+ if response is not None:
+ yield response
+ return
+
+ accumulated_content: Content | None = None
+ final_response: Response | None = None
+
+ async for response in self.llm_client._create_stream(
+ messages=messages,
+ thinking_config=thinking_config,
+ system_message=system_message,
+ max_tokens=max_tokens,
+ ):
+ # Accumulate content from each response chunk
+ if response.candidates and response.candidates[0].content:
+ response_text, is_thought = BaseLLMClient._responce_to_text(response)
+ if response_text is not None:
+ if accumulated_content is None:
+ accumulated_content = Content(parts=[], role="model")
+ BaseLLMClient._append_to_message(accumulated_content, response_text, is_thought or False)
+ final_response = response
+ yield response
+
+ # Save accumulated response to cache
+ if final_response is not None and accumulated_content is not None and final_response.candidates:
+ cached_response = Response(
+ candidates=[final_response.candidates[0].model_copy(update={"content": accumulated_content})],
+ usage_metadata=final_response.usage_metadata,
+ )
+ CachedLLMClient.save_cache(cache_path, self.llm_client.full_model_name, messages_dump, cached_response)
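With these overrides, the cached wrappers now also cover streaming: on a cache miss the chunks from the wrapped client are yielded through unchanged while their text is re-assembled via _append_to_message, and the final accumulated Response is written to the cache keyed by model, messages, and system message; on a hit the single cached Response is yielded once. A hedged usage sketch in Python (the constructor keywords and the .text accessor are assumed from this diff, not from documented API):

    from promptbuilder.llm_client.base_client import CachedLLMClient
    from promptbuilder.llm_client.types import Content, Part

    # some_llm_client is any configured BaseLLMClient instance (hypothetical placeholder).
    cached = CachedLLMClient(llm_client=some_llm_client, cache_dir=".llm_cache")  # assumed kwargs

    messages = [Content(parts=[Part(text="Summarize the changelog")], role="user")]
    for chunk in cached.create_stream(messages):
        print(chunk.text, end="")  # first run: live chunks; a later identical run: one cached Response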
@@ -123,6 +123,7 @@ class GoogleLLMClient(BaseLLMClient):
  config=config,
  )
  elif result_type == "json":
+ config.response_mime_type = "application/json"
  response = self.client.models.generate_content(
  model=self.model,
  contents=messages,
@@ -273,6 +274,7 @@ class GoogleLLMClientAsync(BaseLLMClientAsync):
  config.thinking_config = thinking_config

  if result_type is None or result_type == "json":
+ config.response_mime_type = "application/json"
  return await self.client.aio.models.generate_content(
  model=self.model,
  contents=messages,
@@ -241,7 +241,7 @@ class LiteLLMClient(BaseLLMClient):
  finish_reason_val = first_choice.get("finish_reason")
  else:
  finish_reason_val = getattr(first_choice, "finish_reason", None)
- mapped_finish_reason = LiteLLMLLMClient._map_finish_reason(finish_reason_val)
+ mapped_finish_reason = LiteLLMClient._map_finish_reason(finish_reason_val)

  content_parts: list[Part | Any] = list(parts)
  return Response(
@@ -293,7 +293,7 @@ class LiteLLMClient(BaseLLMClient):
  finish_reason_val = first_choice.get("finish_reason")
  else:
  finish_reason_val = getattr(first_choice, "finish_reason", None)
- mapped_finish_reason = LiteLLMLLMClient._map_finish_reason(finish_reason_val)
+ mapped_finish_reason = LiteLLMClient._map_finish_reason(finish_reason_val)

  content_parts2: list[Part | Any] = list(parts)
  return Response(
@@ -460,11 +460,11 @@ class LiteLLMClientAsync(BaseLLMClientAsync):

  @staticmethod
  def make_function_call(tool_call) -> FunctionCall | None:
- return LiteLLMLLMClient.make_function_call(tool_call)
+ return LiteLLMClient.make_function_call(tool_call)

  @staticmethod
  def make_usage_metadata(usage) -> UsageMetadata:
- return LiteLLMLLMClient.make_usage_metadata(usage)
+ return LiteLLMClient.make_usage_metadata(usage)

  async def _create(
  self,
@@ -569,7 +569,7 @@ class LiteLLMClientAsync(BaseLLMClientAsync):
  finish_reason_val = first_choice.get("finish_reason")
  else:
  finish_reason_val = getattr(first_choice, "finish_reason", None)
- mapped_finish_reason = LiteLLMLLMClient._map_finish_reason(finish_reason_val)
+ mapped_finish_reason = LiteLLMClient._map_finish_reason(finish_reason_val)

  content_parts3: list[Part | Any] = list(parts)
  return Response(
@@ -621,7 +621,7 @@ class LiteLLMClientAsync(BaseLLMClientAsync):
  finish_reason_val = first_choice.get("finish_reason")
  else:
  finish_reason_val = getattr(first_choice, "finish_reason", None)
- mapped_finish_reason = LiteLLMLLMClient._map_finish_reason(finish_reason_val)
+ mapped_finish_reason = LiteLLMClient._map_finish_reason(finish_reason_val)

  content_parts4: list[Part | Any] = list(parts)
  return Response(
@@ -46,9 +46,12 @@ def extract_response_data(response: Response) -> dict[str, Any]:
  response_data = {"message": {"role": "assistant"}}
  response_data["message"]["content"] = response.text
  tool_calls = []
- for part in response.candidates[0].content.parts:
- if part.function_call is not None:
- tool_calls.append({"function": {"name": part.function_call.name, "arguments": part.function_call.args}})
+ if response.candidates is not None and len(response.candidates) > 0:
+ content = response.candidates[0].content
+ if content is not None and content.parts is not None:
+ for part in content.parts:
+ if part.function_call is not None:
+ tool_calls.append({"function": {"name": part.function_call.name, "arguments": part.function_call.args}})
  if len(tool_calls) > 0:
  response_data["message"]["tool_calls"] = tool_calls
  return response_data
@@ -205,7 +205,7 @@ class OpenaiLLMClient(BaseLLMClient):
  elif tool_choice_mode == "ANY":
  openai_kwargs["tool_choice"] = "required"

- if result_type is None or result_type == "json":
+ if result_type is None:
  # Forward timeout to OpenAI per-request if provided
  if timeout is not None:
  openai_kwargs["timeout"] = timeout
@@ -222,6 +222,33 @@ class OpenaiLLMClient(BaseLLMClient):
  elif output_item.type == "function_call":
  parts.append(Part(function_call=FunctionCall(args=json.loads(output_item.arguments), name=output_item.name)))

+ return Response(
+ candidates=[Candidate(content=Content(parts=parts, role="model"))],
+ usage_metadata=UsageMetadata(
+ candidates_token_count=response.usage.output_tokens,
+ prompt_token_count=response.usage.input_tokens,
+ total_token_count=response.usage.total_tokens,
+ )
+ )
+ elif result_type == "json":
+ # Forward timeout to OpenAI per-request if provided
+ if timeout is not None:
+ openai_kwargs["timeout"] = timeout
+ response = self.client.responses.create(**openai_kwargs, text={ "format" : { "type": "json_object" } })
+
+ response_text = ""
+ parts: list[Part] = []
+ for output_item in response.output:
+ if output_item.type == "message":
+ for content in output_item.content:
+ parts.append(Part(text=content.text))
+ response_text += content.text
+ elif output_item.type == "reasoning":
+ for summary in output_item.summary:
+ parts.append(Part(text=summary.text, thought=True))
+ elif output_item.type == "function_call":
+ parts.append(Part(function_call=FunctionCall(args=json.loads(output_item.arguments), name=output_item.name)))
+
  return Response(
  candidates=[Candidate(content=Content(parts=parts, role="model"))],
  usage_metadata=UsageMetadata(
@@ -229,6 +256,7 @@ class OpenaiLLMClient(BaseLLMClient):
  prompt_token_count=response.usage.input_tokens,
  total_token_count=response.usage.total_tokens,
  ),
+ parsed=BaseLLMClient.as_json(response_text)
  )
  elif isinstance(result_type, type(BaseModel)):
  if timeout is not None:
@@ -453,7 +481,7 @@ class OpenaiLLMClientAsync(BaseLLMClientAsync):
  elif tool_choice_mode == "ANY":
  openai_kwargs["tool_choice"] = "required"

- if result_type is None or result_type == "json":
+ if result_type is None:
  if timeout is not None:
  openai_kwargs["timeout"] = timeout
  response = await self.client.responses.create(**openai_kwargs)
@@ -476,6 +504,32 @@ class OpenaiLLMClientAsync(BaseLLMClientAsync):
  total_token_count=response.usage.total_tokens,
  ),
  )
+ elif result_type == "json":
+ if timeout is not None:
+ openai_kwargs["timeout"] = timeout
+ response = await self.client.responses.create(**openai_kwargs, text={ "format" : { "type": "json_object" } })
+ parts: list[Part] = []
+ response_text = ""
+ for output_item in response.output:
+ if output_item.type == "message":
+ for content in output_item.content:
+ parts.append(Part(text=content.text))
+ response_text += content.text
+ elif output_item.type == "reasoning":
+ for summary in output_item.summary:
+ parts.append(Part(text=summary.text, thought=True))
+ elif output_item.type == "function_call":
+ parts.append(Part(function_call=FunctionCall(args=json.loads(output_item.arguments), name=output_item.name)))
+
+ return Response(
+ candidates=[Candidate(content=Content(parts=parts, role="model"))],
+ usage_metadata=UsageMetadata(
+ candidates_token_count=response.usage.output_tokens,
+ prompt_token_count=response.usage.input_tokens,
+ total_token_count=response.usage.total_tokens,
+ ),
+ parsed=BaseLLMClient.as_json(response_text)
+ )
  elif isinstance(result_type, type(BaseModel)):
  if timeout is not None:
  openai_kwargs["timeout"] = timeout
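Both OpenAI clients now give result_type == "json" its own branch: the request goes through the Responses API with text={"format": {"type": "json_object"}}, and response.parsed is pre-filled via BaseLLMClient.as_json, which is why the base-class check was relaxed to re-parse only when parsed is still None. A hedged Python sketch of the call shape (the public create wrapper and the .parsed attribute are inferred from this diff, and the client construction is omitted):

    from promptbuilder.llm_client.types import Content, Part

    messages = [Content(parts=[Part(text='Reply with JSON: {"status": "ok"}')], role="user")]
    # openai_client is an OpenaiLLMClient instance (hypothetical here).
    response = openai_client.create(messages, result_type="json")  # assumed public wrapper of _create
    print(response.parsed)  # dict produced by BaseLLMClient.as_json(...), no second parse needed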
@@ -4,10 +4,14 @@ import logging
  import traceback
  from functools import wraps
  from typing import Callable, Awaitable, ParamSpec, TypeVar
+ import tiktoken
  from collections import defaultdict

  from pydantic import BaseModel

+ from promptbuilder.llm_client.types import Content
+
+

  logger = logging.getLogger(__name__)

@@ -48,9 +52,14 @@ class RetryConfig(BaseModel):
  class RpmLimitConfig(BaseModel):
  rpm_limit: int = 0

+ class TpmLimitConfig(BaseModel):
+ tpm_limit: int = 0
+ fast: bool = False
+
  class DecoratorConfigs(BaseModel):
  retry: RetryConfig | None = None
  rpm_limit: RpmLimitConfig | None = None
+ tpm_limit: TpmLimitConfig | None = None


  @inherited_decorator
@@ -181,3 +190,149 @@ def rpm_limit_cls_async(class_method: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
  self._last_request_time = time.time()
  return await class_method(self, *args, **kwargs)
  return wrapper
+
+
+ def _estimate_input_tokens_from_messages(self, messages: list[Content], fast: bool = False) -> int:
+ """Estimate input tokens for a list[Content] using best available method.
+
+ Priority:
+ 1) If provider == "google" and a google.genai client is available, use
+ models.count_tokens for accurate counts.
+ 2) If tiktoken is installed, approximate with a BPE encoding.
+ 3) Fallback heuristic: ~4 characters per token across text parts.
+ """
+ if not messages:
+ return 0
+
+ # Collect text parts for non-Google fallback methods
+ texts: list[str] = []
+ for m in messages:
+ parts = m.parts
+ if not parts:
+ continue
+ for part in parts:
+ text = part.text
+ if text:
+ texts.append(text)
+
+ if not fast:
+ # 1) Google Gemini accurate count via genai API (when provider == google)
+ if self.provider == "google":
+ genai_client = self.client
+ contents_arg = "\n".join(texts)
+ total_tokens = genai_client.models.count_tokens(
+ model=self.model,
+ contents=contents_arg,
+ ).total_tokens
+ return total_tokens
+
+ # 2) tiktoken approximation
+ # cl100k_base is a good default for many chat models
+ enc = tiktoken.get_encoding("cl100k_base")
+ return sum(len(enc.encode(t)) for t in texts)
+
+ else:
+ # 3) Heuristic fallback
+ total_chars = sum(len(t) for t in texts)
+ tokens = total_chars // 4
+ return tokens if tokens > 0 else (1 if total_chars > 0 else 0)
+
+
+ @inherited_decorator
+ def tpm_limit_cls(class_method: Callable[P, T]) -> Callable[P, T]:
+ """
+ Decorator that limits the number of input tokens per minute to the decorated class methods.
+ Decorated methods must have 'self' as its first arg and accept a 'messages' argument
+ either positionally (first arg) or by keyword.
+
+ The decorator estimates tokens from input messages and ensures the total tokens
+ sent within a 60-second window do not exceed the configured TPM limit. If the
+ limit would be exceeded, it waits until the window resets.
+ """
+ @wraps(class_method)
+ def wrapper(self, *args, **kwargs):
+ if not hasattr(self, "_decorator_configs"):
+ self._decorator_configs = DecoratorConfigs()
+ if getattr(self._decorator_configs, "tpm_limit", None) is None:
+ self._decorator_configs.tpm_limit = TpmLimitConfig()
+
+ limit = self._decorator_configs.tpm_limit.tpm_limit
+ if limit <= 0:
+ return class_method(self, *args, **kwargs)
+
+ # Extract messages from either kwargs or positional args
+ messages = kwargs.get("messages") if "messages" in kwargs else (args[0] if len(args) > 0 else None)
+ tokens_needed = _estimate_input_tokens_from_messages(self, messages, self._decorator_configs.tpm_limit.fast)
+
+ # Initialize sliding window state
+ now = time.time()
+ if not hasattr(self, "_tpm_window_start"):
+ self._tpm_window_start = now
+ if not hasattr(self, "_tpm_used_tokens"):
+ self._tpm_used_tokens = 0
+
+ while True:
+ now = time.time()
+ elapsed = now - self._tpm_window_start
+ if elapsed >= 60:
+ # Reset window
+ self._tpm_window_start = now
+ self._tpm_used_tokens = 0
+
+ if self._tpm_used_tokens + tokens_needed <= limit:
+ self._tpm_used_tokens += tokens_needed
+ break
+ # Need to wait until window resets
+ sleep_for = max(0.0, 60 - elapsed)
+ if sleep_for > 0:
+ time.sleep(sleep_for)
+ continue
+ # If sleep_for == 0, loop will reset on next iteration
+
+ return class_method(self, *args, **kwargs)
+ return wrapper
+
+
+ @inherited_decorator
+ def tpm_limit_cls_async(class_method: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]:
+ """
+ Async variant of TPM limiter.
+ """
+ @wraps(class_method)
+ async def wrapper(self, *args, **kwargs):
+ if not hasattr(self, "_decorator_configs"):
+ self._decorator_configs = DecoratorConfigs()
+ if getattr(self._decorator_configs, "tpm_limit", None) is None:
+ self._decorator_configs.tpm_limit = TpmLimitConfig()
+
+ limit = self._decorator_configs.tpm_limit.tpm_limit
+ if limit <= 0:
+ return await class_method(self, *args, **kwargs)
+
+ messages = kwargs.get("messages") if "messages" in kwargs else (args[0] if len(args) > 0 else None)
+ tokens_needed = _estimate_input_tokens_from_messages(self, messages, self._decorator_configs.tpm_limit.fast)
+
+ now = time.time()
+ if not hasattr(self, "_tpm_window_start"):
+ self._tpm_window_start = now
+ if not hasattr(self, "_tpm_used_tokens"):
+ self._tpm_used_tokens = 0
+
+ while True:
+ now = time.time()
+ elapsed = now - self._tpm_window_start
+ if elapsed >= 60:
+ self._tpm_window_start = now
+ self._tpm_used_tokens = 0
+
+ if self._tpm_used_tokens + tokens_needed <= limit:
+ self._tpm_used_tokens += tokens_needed
+ break
+
+ sleep_for = max(0.0, 60 - elapsed)
+ if sleep_for > 0:
+ await asyncio.sleep(sleep_for)
+ continue
+
+ return await class_method(self, *args, **kwargs)
+ return wrapper
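The new tpm_limit_cls / tpm_limit_cls_async decorators sit next to the existing RPM limiter on every _create and _create_stream and throttle on estimated input tokens per 60-second window: an exact count via count_tokens for Google clients, a tiktoken cl100k_base approximation otherwise, or a ~4-characters-per-token heuristic when fast is enabled. A hedged Python sketch of turning it on (the config import path and the client constructor keyword are assumed from how decorator_configs appears elsewhere in this diff; the model name is illustrative):

    from promptbuilder.llm_client.utils import DecoratorConfigs, TpmLimitConfig  # assumed import path
    from promptbuilder.llm_client.google_client import GoogleLLMClient

    configs = DecoratorConfigs(
        tpm_limit=TpmLimitConfig(
            tpm_limit=200_000,  # cap on estimated input tokens per 60-second window
            fast=True,          # skip count_tokens/tiktoken and use the chars-per-token heuristic
        ),
    )
    # Clients are assumed to accept decorator_configs as a keyword, as in the removed Vertex client:
    client = GoogleLLMClient(model="gemini-2.0-flash", decorator_configs=configs)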
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: promptbuilder
- Version: 0.4.37
+ Version: 0.4.39
  Summary: Library for building prompts for LLMs
  Home-page: https://github.com/kapulkin/promptbuilder
  Author: Kapulkin Stanislav
@@ -21,6 +21,7 @@ Requires-Dist: aioboto3
  Requires-Dist: litellm
  Requires-Dist: httpx
  Requires-Dist: aiohttp
+ Requires-Dist: tiktoken
  Dynamic: author
  Dynamic: author-email
  Dynamic: classifier
@@ -30,7 +30,6 @@ promptbuilder/llm_client/main.py
  promptbuilder/llm_client/openai_client.py
  promptbuilder/llm_client/types.py
  promptbuilder/llm_client/utils.py
- promptbuilder/llm_client/vertex_client.py
  tests/test_llm_client.py
  tests/test_llm_client_async.py
  tests/test_timeout_google.py
@@ -8,3 +8,4 @@ aioboto3
  litellm
  httpx
  aiohttp
+ tiktoken
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages

  setup(
  name="promptbuilder",
- version="0.4.37",
+ version="0.4.39",
  packages=find_packages(),
  install_requires=[
  "pydantic",
@@ -14,7 +14,8 @@ setup(
  "aioboto3",
  "litellm",
  "httpx",
- "aiohttp"
+ "aiohttp",
+ "tiktoken"
  ],
  author="Kapulkin Stanislav",
  author_email="kapulkin@gmail.com",
@@ -36,7 +36,7 @@ def test_google_timeout_forwarded_sync(monkeypatch):
  cfg = rec.get("last_config")
  assert cfg is not None
  assert cfg.http_options is not None
- assert int(cfg.http_options.timeout) == 12
+ assert int(cfg.http_options.timeout) == 12000 # Google API expects milliseconds


  class _FakeAioGoogleModels:
@@ -75,4 +75,4 @@ async def test_google_timeout_forwarded_async(monkeypatch):
  cfg = rec.get("last_config_async")
  assert cfg is not None
  assert cfg.http_options is not None
- assert int(cfg.http_options.timeout) == 8
+ assert int(cfg.http_options.timeout) == 8500 # Google API expects milliseconds
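The updated expectations reflect that the Google client now forwards the per-request timeout as the milliseconds that http_options.timeout expects rather than raw seconds; the test bodies are not shown here, but the asserted values are consistent with a conversion like the following sketch (timeout values assumed):

    timeout_seconds = 8.5                               # assumed value passed in the async test
    http_options_timeout = int(timeout_seconds * 1000)  # 8500, matching the assertion above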
@@ -1,7 +1,7 @@
  import pytest
  from pydantic import BaseModel
+ import litellm

- import promptbuilder.llm_client.litellm_client as litellm_mod
  from promptbuilder.llm_client.litellm_client import LiteLLMClient, LiteLLMClientAsync
  from promptbuilder.llm_client.types import Content, Part

@@ -14,8 +14,10 @@ def test_litellm_timeout_forwarded_sync(monkeypatch):
  def __init__(self):
  self.choices = []
  self.usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
+ def get(self, key, default=None):
+ return getattr(self, key, default)
  return R()
- monkeypatch.setattr(litellm_mod, "completion", fake_completion)
+ monkeypatch.setattr(litellm, "completion", fake_completion)

  cli = LiteLLMClient(full_model_name="ollama:llama3.1", api_key=None)
  _ = cli.create([Content(parts=[Part(text="hi")], role="user")], timeout=7.5)
@@ -33,8 +35,10 @@ async def test_litellm_timeout_forwarded_async(monkeypatch):
  def __init__(self):
  self.choices = []
  self.usage = {"prompt_tokens": 0, "completion_tokens": 0, "total_tokens": 0}
+ def get(self, key, default=None):
+ return getattr(self, key, default)
  return R()
- monkeypatch.setattr(litellm_mod, "acompletion", fake_acompletion)
+ monkeypatch.setattr(litellm, "acompletion", fake_acompletion)

  cli = LiteLLMClientAsync(full_model_name="ollama:llama3.1", api_key=None)
  _ = await cli.create([Content(parts=[Part(text="hi")], role="user")], timeout=5.0)
@@ -1,403 +0,0 @@
- import os
- import importlib
- from functools import wraps
- from typing import AsyncIterator, Iterator, Callable, ParamSpec, Awaitable, Any, cast
-
- from pydantic import BaseModel, ConfigDict
- from tenacity import RetryError
-
- from vertexai import init as vertex_init
- from vertexai.generative_models import GenerativeModel
-
- from promptbuilder.llm_client.base_client import BaseLLMClient, BaseLLMClientAsync, ResultType
- from promptbuilder.llm_client.types import (
- Response,
- Content,
- Candidate,
- UsageMetadata,
- Part,
- PartLike,
- ApiKey,
- ThinkingConfig,
- Tool,
- ToolConfig,
- Model,
- CustomApiKey,
- )
- from promptbuilder.llm_client.config import DecoratorConfigs
- from promptbuilder.llm_client.utils import inherited_decorator
- from promptbuilder.llm_client.exceptions import APIError
-
-
- P = ParamSpec("P")
-
-
- class VertexApiKey(BaseModel, CustomApiKey):
- model_config = ConfigDict(frozen=True)
- project: str
- location: str
-
-
- @inherited_decorator
- def _error_handler(func: Callable[P, Response]) -> Callable[P, Response]:
- @wraps(func)
- def wrapper(*args, **kwargs):
- try:
- return func(*args, **kwargs)
- except RetryError as retry_error:
- e = retry_error.last_attempt._exception
- if e is None:
- raise APIError()
- code = getattr(e, "code", None)
- response_json = {
- "status": getattr(e, "status", None),
- "message": str(e),
- }
- response = getattr(e, "response", None)
- raise APIError(code, response_json, response)
- except Exception as e: # noqa: BLE001
- raise APIError(None, {"status": None, "message": str(e)}, None)
- return wrapper
-
-
- def _to_vertex_content(messages: list[Content]):
- gen_mod = importlib.import_module("vertexai.generative_models")
- VPart = getattr(gen_mod, "Part")
- VContent = getattr(gen_mod, "Content")
- v_messages: list[Any] = []
- for m in messages:
- v_parts: list[Any] = []
- if m.parts:
- for p in m.parts:
- if p.text is not None:
- v_parts.append(VPart.from_text(p.text))
- elif p.inline_data is not None and p.inline_data.data is not None:
- v_parts.append(VPart.from_bytes(data=p.inline_data.data, mime_type=p.inline_data.mime_type or "application/octet-stream"))
- v_messages.append(VContent(role=m.role, parts=v_parts))
- return v_messages
-
-
- def _tool_to_vertex(tool: Tool):
- VTool = getattr(importlib.import_module("vertexai.generative_models"), "Tool")
- if not tool.function_declarations:
- return VTool(function_declarations=[])
- fds = []
- for fd in tool.function_declarations:
- fds.append({
- "name": fd.name,
- "description": fd.description,
- "parameters": fd.parameters.model_dump() if fd.parameters is not None else None,
- "response": fd.response.model_dump() if fd.response is not None else None,
- })
- return VTool(function_declarations=fds)
-
-
- def _tool_config_to_vertex(cfg: ToolConfig | None):
- VToolConfig = getattr(importlib.import_module("vertexai.generative_models"), "ToolConfig")
- if cfg is None or cfg.function_calling_config is None:
- return None
- mode = cfg.function_calling_config.mode or "AUTO"
- allowed = cfg.function_calling_config.allowed_function_names
- return VToolConfig(function_calling_config={"mode": mode, "allowedFunctionNames": allowed})
-
-
- def _from_vertex_response(v_resp: Any) -> Response:
- candidates: list[Candidate] = []
- if getattr(v_resp, "candidates", None):
- for c in v_resp.candidates:
- parts: list[Part] = []
- if c.content and getattr(c.content, "parts", None):
- for vp in c.content.parts:
- t = getattr(vp, "text", None)
- if isinstance(t, str):
- parts.append(Part(text=t))
- candidates.append(Candidate(content=Content(parts=cast(list[Part | PartLike], parts), role="model")))
-
- usage = None
- um = getattr(v_resp, "usage_metadata", None)
- if um is not None:
- usage = UsageMetadata(
- cached_content_token_count=getattr(um, "cached_content_token_count", None),
- candidates_token_count=getattr(um, "candidates_token_count", None),
- prompt_token_count=getattr(um, "prompt_token_count", None),
- thoughts_token_count=getattr(um, "thoughts_token_count", None),
- total_token_count=getattr(um, "total_token_count", None),
- )
-
- return Response(candidates=candidates, usage_metadata=usage)
-
-
- class VertexLLMClient(BaseLLMClient):
- PROVIDER: str = "vertexai"
-
- def __init__(
- self,
- model: str,
- api_key: ApiKey | None = None,
- decorator_configs: DecoratorConfigs | None = None,
- default_thinking_config: ThinkingConfig | None = None,
- default_max_tokens: int | None = None,
- project: str | None = None,
- location: str | None = None,
- **kwargs,
- ):
- # Resolve project/location from args or env
- project = project or os.getenv("VERTEXAI_PROJECT") or os.getenv("GOOGLE_CLOUD_PROJECT") or os.getenv("GCLOUD_PROJECT")
- location = location or os.getenv("VERTEXAI_LOCATION") or os.getenv("GOOGLE_CLOUD_REGION") or os.getenv("GOOGLE_CLOUD_LOCATION")
-
- # Allow API Key (string) or ADC (VertexApiKey)
- api_key_str: str | None = None
- if isinstance(api_key, str):
- api_key_str = api_key
- elif api_key is None:
- # Fallback to env vars for API key
- api_key_str = os.getenv("VERTEX_API_KEY") or os.getenv("GOOGLE_API_KEY")
- elif isinstance(api_key, VertexApiKey):
- # ADC path with explicit project/location
- pass
- else:
- # Unexpected CustomApiKey subtype
- raise ValueError("Unsupported api_key type for Vertex: expected str or VertexApiKey")
-
- if not project or not location:
- raise ValueError("To create a vertexai llm client you need to provide project and location via args or env vars VERTEXAI_PROJECT and VERTEXAI_LOCATION")
-
- if not isinstance(api_key, VertexApiKey):
- api_key = VertexApiKey(project=project, location=location)
-
- super().__init__(
- VertexLLMClient.PROVIDER,
- model,
- decorator_configs=decorator_configs,
- default_thinking_config=default_thinking_config,
- default_max_tokens=default_max_tokens,
- )
- self._api_key = api_key
- self._api_key_str = api_key_str
-
- vertex_init(project=self._api_key.project, location=self._api_key.location)
- self._model = GenerativeModel(self.model)
-
- @property
- def api_key(self) -> VertexApiKey:
- return self._api_key
-
- @_error_handler
- def _create(
- self,
- messages: list[Content],
- result_type: ResultType = None,
- *,
- thinking_config: ThinkingConfig | None = None,
- system_message: str | None = None,
- max_tokens: int | None = None,
- timeout: float | None = None,
- tools: list[Tool] | None = None,
- tool_config: ToolConfig = ToolConfig(),
- ) -> Response:
- v_messages = _to_vertex_content(messages)
- GenerationConfig = getattr(importlib.import_module("vertexai.generative_models"), "GenerationConfig")
- gen_cfg = GenerationConfig(max_output_tokens=max_tokens or self.default_max_tokens)
-
- # Handle thinking config
- if thinking_config is None:
- thinking_config = self.default_thinking_config
- if thinking_config is not None:
- # Vertex AI supports thinking via response_logprobs and logprobs parameters
- # but the exact implementation may vary - for now, we'll store it for potential future use
- pass
-
- req_opts: dict[str, Any] | None = {}
- if timeout is not None:
- req_opts["timeout"] = timeout
- if self._api_key_str:
- req_opts["api_key"] = self._api_key_str
- if not req_opts:
- req_opts = None
-
- v_tools = None
- if tools is not None:
- v_tools = [_tool_to_vertex(t) for t in tools]
- v_tool_cfg = _tool_config_to_vertex(tool_config)
-
- v_resp = self._model.generate_content(
- contents=v_messages,
- generation_config=gen_cfg,
- tools=v_tools,
- tool_config=v_tool_cfg,
- system_instruction=system_message,
- request_options=req_opts,
- )
-
- resp = _from_vertex_response(v_resp)
- if result_type == "json" and resp.text is not None:
- resp.parsed = BaseLLMClient.as_json(resp.text)
- elif isinstance(result_type, type(BaseModel)) and resp.text is not None:
- parsed = BaseLLMClient.as_json(resp.text)
- resp.parsed = result_type.model_validate(parsed)
- return resp
-
- def create_stream(
- self,
- messages: list[Content],
- *,
- thinking_config: ThinkingConfig | None = None,
- system_message: str | None = None,
- max_tokens: int | None = None,
- ) -> Iterator[Response]:
- v_messages = _to_vertex_content(messages)
- GenerationConfig = getattr(importlib.import_module("vertexai.generative_models"), "GenerationConfig")
- gen_cfg = GenerationConfig(max_output_tokens=max_tokens or self.default_max_tokens)
-
- # Handle thinking config
- if thinking_config is None:
- thinking_config = self.default_thinking_config
- if thinking_config is not None:
- # Store for potential future use when Vertex AI supports thinking features
- pass
-
- req_opts: dict[str, Any] | None = {}
- if self._api_key_str:
- req_opts["api_key"] = self._api_key_str
- if not req_opts:
- req_opts = None
- stream = self._model.generate_content(
- contents=v_messages,
- generation_config=gen_cfg,
- system_instruction=system_message,
- request_options=req_opts,
- stream=True,
- )
- for ev in stream:
- yield _from_vertex_response(ev)
-
- @staticmethod
- def models_list() -> list[Model]:
- return []
-
-
- @inherited_decorator
- def _error_handler_async(func: Callable[P, Awaitable[Response]]) -> Callable[P, Awaitable[Response]]:
- @wraps(func)
- async def wrapper(*args, **kwargs):
- try:
- return await func(*args, **kwargs)
- except RetryError as retry_error:
- e = retry_error.last_attempt._exception
- if e is None:
- raise APIError()
- code = getattr(e, "code", None)
- response_json = {
- "status": getattr(e, "status", None),
- "message": str(e),
- }
- response = getattr(e, "response", None)
- raise APIError(code, response_json, response)
- except Exception as e: # noqa: BLE001
- raise APIError(None, {"status": None, "message": str(e)}, None)
- return wrapper
-
-
- class VertexLLMClientAsync(BaseLLMClientAsync):
- PROVIDER: str = "vertexai"
-
- def __init__(
- self,
- model: str,
- api_key: ApiKey | None = None,
- decorator_configs: DecoratorConfigs | None = None,
- default_thinking_config: ThinkingConfig | None = None,
- default_max_tokens: int | None = None,
- project: str | None = None,
- location: str | None = None,
- **kwargs,
- ):
- project = project or os.getenv("VERTEXAI_PROJECT") or os.getenv("GOOGLE_CLOUD_PROJECT") or os.getenv("GCLOUD_PROJECT")
- location = location or os.getenv("VERTEXAI_LOCATION") or os.getenv("GOOGLE_CLOUD_REGION") or os.getenv("GOOGLE_CLOUD_LOCATION")
-
- api_key_str: str | None = None
- if isinstance(api_key, str):
- api_key_str = api_key
- elif api_key is None:
- api_key_str = os.getenv("VERTEX_API_KEY") or os.getenv("GOOGLE_API_KEY")
- elif isinstance(api_key, VertexApiKey):
- pass
- else:
- raise ValueError("Unsupported api_key type for Vertex: expected str or VertexApiKey")
-
- if not project or not location:
- raise ValueError("To create a vertexai llm client you need to provide project and location via args or env vars VERTEXAI_PROJECT and VERTEXAI_LOCATION")
-
- if not isinstance(api_key, VertexApiKey):
- api_key = VertexApiKey(project=project, location=location)
-
- super().__init__(
- VertexLLMClientAsync.PROVIDER,
- model,
- decorator_configs=decorator_configs,
- default_thinking_config=default_thinking_config,
- default_max_tokens=default_max_tokens,
- )
- self._api_key = api_key
- self._api_key_str = api_key_str
-
- vertex_init(project=self._api_key.project, location=self._api_key.location)
- self._model = GenerativeModel(self.model)
-
- @property
- def api_key(self) -> VertexApiKey:
- return self._api_key
-
- @_error_handler_async
- async def _create(
- self,
- messages: list[Content],
- result_type: ResultType = None,
- *,
- thinking_config: ThinkingConfig | None = None,
- system_message: str | None = None,
- max_tokens: int | None = None,
- timeout: float | None = None,
- tools: list[Tool] | None = None,
- tool_config: ToolConfig = ToolConfig(),
- ) -> Response:
- # Reuse sync implementation (SDK is sync). For real async, offload to thread.
- client = VertexLLMClient(
- model=self.model,
- api_key=self._api_key,
- decorator_configs=self._decorator_configs,
- default_thinking_config=self.default_thinking_config,
- default_max_tokens=self.default_max_tokens,
- )
- return client._create(
- messages=messages,
- result_type=result_type,
- thinking_config=thinking_config,
- system_message=system_message,
- max_tokens=max_tokens,
- timeout=timeout,
- tools=tools,
- tool_config=tool_config,
- )
-
- async def create_stream(
- self,
- messages: list[Content],
- *,
- thinking_config: ThinkingConfig | None = None,
- system_message: str | None = None,
- max_tokens: int | None = None,
- ) -> AsyncIterator[Response]:
- # Provide a simple wrapper yielding once (non-streaming)
- resp = await self._create(
- messages=messages,
- result_type=None,
- thinking_config=thinking_config,
- system_message=system_message,
- max_tokens=max_tokens,
- )
- yield resp
-
- @staticmethod
- def models_list() -> list[Model]:
- return VertexLLMClient.models_list()