PyPI - MindsDB - Versions diffs - 25.5.4.0__py3-none-any.whl → 25.5.4.2__py3-none-any.whl - Mend

MindsDB 25.5.4.0__py3-none-any.whl → 25.5.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of MindsDB might be problematic. Click here for more details.

Files changed (26)

mindsdb/__about__.py +8 -8
mindsdb/api/a2a/__main__.py +38 -8
mindsdb/api/a2a/run_a2a.py +10 -53
mindsdb/api/a2a/task_manager.py +19 -53
mindsdb/api/executor/command_executor.py +147 -291
mindsdb/api/http/namespaces/config.py +61 -86
mindsdb/integrations/handlers/byom_handler/requirements.txt +1 -2
mindsdb/integrations/handlers/lancedb_handler/requirements.txt +0 -1
mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +37 -20
mindsdb/integrations/libs/llm/config.py +13 -0
mindsdb/integrations/libs/llm/utils.py +37 -65
mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +230 -227
mindsdb/interfaces/agents/constants.py +17 -13
mindsdb/interfaces/agents/langchain_agent.py +93 -94
mindsdb/interfaces/knowledge_base/controller.py +230 -221
mindsdb/utilities/config.py +43 -84
mindsdb/utilities/starters.py +9 -1
{mindsdb-25.5.4.0.dist-info → mindsdb-25.5.4.2.dist-info}/METADATA +268 -266
{mindsdb-25.5.4.0.dist-info → mindsdb-25.5.4.2.dist-info}/RECORD +22 -26
mindsdb/api/a2a/a2a_client.py +0 -439
mindsdb/api/a2a/common/client/__init__.py +0 -4
mindsdb/api/a2a/common/client/card_resolver.py +0 -21
mindsdb/api/a2a/common/client/client.py +0 -86
{mindsdb-25.5.4.0.dist-info → mindsdb-25.5.4.2.dist-info}/WHEEL +0 -0
{mindsdb-25.5.4.0.dist-info → mindsdb-25.5.4.2.dist-info}/licenses/LICENSE +0 -0
{mindsdb-25.5.4.0.dist-info → mindsdb-25.5.4.2.dist-info}/top_level.txt +0 -0

mindsdb/api/http/namespaces/config.py CHANGED Viewed

@@ -20,121 +20,109 @@ from mindsdb.integrations.libs.response import HandlerStatusResponse
 logger = log.getLogger(__name__)
-@ns_conf.route('/')
-@ns_conf.param('name', 'Get config')
+@ns_conf.route("/")
+@ns_conf.param("name", "Get config")
 class GetConfig(Resource):
-    @ns_conf.doc('get_config')
-    @api_endpoint_metrics('GET', '/config')
+    @ns_conf.doc("get_config")
+    @api_endpoint_metrics("GET", "/config")
     def get(self):
         config = Config()
-        resp = {
-            'auth': {
-                'http_auth_enabled': config['auth']['http_auth_enabled']
-            }
-        }
-        for key in ['default_llm', 'default_embedding_model']:
+        resp = {"auth": {"http_auth_enabled": config["auth"]["http_auth_enabled"]}}
+        for key in ["default_llm", "default_embedding_model", "default_reranking_model"]:
             value = config.get(key)
             if value is not None:
                 resp[key] = value
         return resp
-    @ns_conf.doc('put_config')
-    @api_endpoint_metrics('PUT', '/config')
+    @ns_conf.doc("put_config")
+    @api_endpoint_metrics("PUT", "/config")
     def put(self):
         data = request.json
-        allowed_arguments = {'auth', 'default_llm', 'default_embedding_model'}
+        allowed_arguments = {"auth", "default_llm", "default_embedding_model", "default_reranking_model"}
         unknown_arguments = list(set(data.keys()) - allowed_arguments)
         if len(unknown_arguments) > 0:
-            return http_error(
-                HTTPStatus.BAD_REQUEST, 'Wrong arguments',
-                f'Unknown argumens: {unknown_arguments}'
-            )
+            return http_error(HTTPStatus.BAD_REQUEST, "Wrong arguments", f"Unknown argumens: {unknown_arguments}")
-        nested_keys_to_validate = {'auth'}
+        nested_keys_to_validate = {"auth"}
         for key in data.keys():
             if key in nested_keys_to_validate:
-                unknown_arguments = list(
-                    set(data[key].keys()) - set(Config()[key].keys())
-                )
+                unknown_arguments = list(set(data[key].keys()) - set(Config()[key].keys()))
                 if len(unknown_arguments) > 0:
                     return http_error(
-                        HTTPStatus.BAD_REQUEST, 'Wrong arguments',
-                        f'Unknown argumens: {unknown_arguments}'
+                        HTTPStatus.BAD_REQUEST, "Wrong arguments", f"Unknown argumens: {unknown_arguments}"
                     )
         Config().update(data)
-        return '', 200
+        return "", 200
-@ns_conf.route('/integrations')
-@ns_conf.param('name', 'List all database integration')
+@ns_conf.route("/integrations")
+@ns_conf.param("name", "List all database integration")
 class ListIntegration(Resource):
-    @api_endpoint_metrics('GET', '/config/integrations')
+    @api_endpoint_metrics("GET", "/config/integrations")
     def get(self):
-        return {
-            'integrations': [k for k in ca.integration_controller.get_all(show_secrets=False)]
-        }
+        return {"integrations": [k for k in ca.integration_controller.get_all(show_secrets=False)]}
-@ns_conf.route('/all_integrations')
-@ns_conf.param('name', 'List all database integration')
+@ns_conf.route("/all_integrations")
+@ns_conf.param("name", "List all database integration")
 class AllIntegration(Resource):
-    @ns_conf.doc('get_all_integrations')
-    @api_endpoint_metrics('GET', '/config/all_integrations')
+    @ns_conf.doc("get_all_integrations")
+    @api_endpoint_metrics("GET", "/config/all_integrations")
     def get(self):
         integrations = ca.integration_controller.get_all(show_secrets=False)
         return integrations
-@ns_conf.route('/integrations/<name>')
-@ns_conf.param('name', 'Database integration')
+@ns_conf.route("/integrations/<name>")
+@ns_conf.param("name", "Database integration")
 class Integration(Resource):
-    @ns_conf.doc('get_integration')
-    @api_endpoint_metrics('GET', '/config/integrations/integration')
+    @ns_conf.doc("get_integration")
+    @api_endpoint_metrics("GET", "/config/integrations/integration")
     def get(self, name):
         integration = ca.integration_controller.get(name, show_secrets=False)
         if integration is None:
-            return http_error(HTTPStatus.NOT_FOUND, 'Not found', f'Can\'t find integration: {name}')
+            return http_error(HTTPStatus.NOT_FOUND, "Not found", f"Can't find integration: {name}")
         integration = copy.deepcopy(integration)
         return integration
-    @ns_conf.doc('put_integration')
-    @api_endpoint_metrics('PUT', '/config/integrations/integration')
+    @ns_conf.doc("put_integration")
+    @api_endpoint_metrics("PUT", "/config/integrations/integration")
     def put(self, name):
         params = {}
         if request.is_json:
-            params.update((request.json or {}).get('params', {}))
+            params.update((request.json or {}).get("params", {}))
         else:
             params.update(request.form or {})
         if len(params) == 0:
-            return http_error(HTTPStatus.BAD_REQUEST, 'Wrong argument', "type of 'params' must be dict")
+            return http_error(HTTPStatus.BAD_REQUEST, "Wrong argument", "type of 'params' must be dict")
         files = request.files
         temp_dir = None
         if files is not None and len(files) > 0:
-            temp_dir = tempfile.mkdtemp(prefix='integration_files_')
+            temp_dir = tempfile.mkdtemp(prefix="integration_files_")
             for key, file in files.items():
                 temp_dir_path = Path(temp_dir)
                 file_name = Path(file.filename)
                 file_path = temp_dir_path.joinpath(file_name).resolve()
                 if temp_dir_path not in file_path.parents:
-                    raise Exception(f'Can not save file at path: {file_path}')
+                    raise Exception(f"Can not save file at path: {file_path}")
                 file.save(file_path)
                 params[key] = str(file_path)
-        is_test = params.get('test', False)
+        is_test = params.get("test", False)
         # TODO: Move this to new Endpoint
         config = Config()
-        secret_key = config.get('secret_key', 'dummy-key')
+        secret_key = config.get("secret_key", "dummy-key")
         if is_test:
-            del params['test']
-            handler_type = params.pop('type', None)
-            params.pop('publish', None)
+            del params["test"]
+            handler_type = params.pop("type", None)
+            params.pop("publish", None)
             try:
                 handler = ca.integration_controller.create_tmp_handler(name, handler_type, params)
                 status = handler.check_connection()
@@ -145,33 +133,32 @@ class Integration(Resource):
             resp = status.to_json()
-            if status.success and 'code' in params:
-                if hasattr(handler, 'handler_storage'):
+            if status.success and "code" in params:
+                if hasattr(handler, "handler_storage"):
                     # attach storage if exists
                     export = handler.handler_storage.export_files()
                     if export:
                         # encrypt with flask secret key
                         encrypted = encrypt(export, secret_key)
-                        resp['storage'] = encrypted.decode()
+                        resp["storage"] = encrypted.decode()
             return resp, 200
         config = Config()
-        secret_key = config.get('secret_key', 'dummy-key')
+        secret_key = config.get("secret_key", "dummy-key")
         integration = ca.integration_controller.get(name, show_secrets=False)
         if integration is not None:
             return http_error(
-                HTTPStatus.BAD_REQUEST, 'Wrong argument',
-                f"Integration with name '{name}' already exists"
+                HTTPStatus.BAD_REQUEST, "Wrong argument", f"Integration with name '{name}' already exists"
             )
         try:
-            engine = params['type']
+            engine = params["type"]
             if engine is not None:
-                del params['type']
-            params.pop('publish', False)
-            storage = params.pop('storage', None)
+                del params["type"]
+            params.pop("publish", False)
+            storage = params.pop("storage", None)
             ca.integration_controller.add(name, engine, params)
             # copy storage
@@ -185,62 +172,50 @@ class Integration(Resource):
             logger.error(str(e))
             if temp_dir is not None:
                 shutil.rmtree(temp_dir)
-            return http_error(
-                HTTPStatus.INTERNAL_SERVER_ERROR, 'Error',
-                f'Error during config update: {str(e)}'
-            )
+            return http_error(HTTPStatus.INTERNAL_SERVER_ERROR, "Error", f"Error during config update: {str(e)}")
         if temp_dir is not None:
             shutil.rmtree(temp_dir)
         return {}, 200
-    @ns_conf.doc('delete_integration')
-    @api_endpoint_metrics('DELETE', '/config/integrations/integration')
+    @ns_conf.doc("delete_integration")
+    @api_endpoint_metrics("DELETE", "/config/integrations/integration")
     def delete(self, name):
         integration = ca.integration_controller.get(name)
         if integration is None:
             return http_error(
-                HTTPStatus.BAD_REQUEST, 'Integration does not exists',
-                f"Nothing to delete. '{name}' not exists."
+                HTTPStatus.BAD_REQUEST, "Integration does not exists", f"Nothing to delete. '{name}' not exists."
             )
         try:
             ca.integration_controller.delete(name)
         except Exception as e:
             logger.error(str(e))
-            return http_error(
-                HTTPStatus.INTERNAL_SERVER_ERROR, 'Error',
-                f"Error during integration delete: {str(e)}"
-            )
+            return http_error(HTTPStatus.INTERNAL_SERVER_ERROR, "Error", f"Error during integration delete: {str(e)}")
         return "", 200
-    @ns_conf.doc('modify_integration')
-    @api_endpoint_metrics('POST', '/config/integrations/integration')
+    @ns_conf.doc("modify_integration")
+    @api_endpoint_metrics("POST", "/config/integrations/integration")
     def post(self, name):
         params = {}
-        params.update((request.json or {}).get('params', {}))
+        params.update((request.json or {}).get("params", {}))
         params.update(request.form or {})
         if not isinstance(params, dict):
-            return http_error(
-                HTTPStatus.BAD_REQUEST, 'Wrong argument',
-                "type of 'params' must be dict"
-            )
+            return http_error(HTTPStatus.BAD_REQUEST, "Wrong argument", "type of 'params' must be dict")
         integration = ca.integration_controller.get(name)
         if integration is None:
             return http_error(
-                HTTPStatus.BAD_REQUEST, 'Integration does not exists',
-                f"Nothin to modify. '{name}' not exists."
+                HTTPStatus.BAD_REQUEST, "Integration does not exists", f"Nothin to modify. '{name}' not exists."
             )
         try:
-            if 'enabled' in params:
-                params['publish'] = params['enabled']
-                del params['enabled']
+            if "enabled" in params:
+                params["publish"] = params["enabled"]
+                del params["enabled"]
             ca.integration_controller.modify(name, params)
         except Exception as e:
             logger.error(str(e))
             return http_error(
-                HTTPStatus.INTERNAL_SERVER_ERROR, 'Error',
-                f"Error during integration modification: {str(e)}"
+                HTTPStatus.INTERNAL_SERVER_ERROR, "Error", f"Error during integration modification: {str(e)}"
             )
         return "", 200

mindsdb/integrations/handlers/byom_handler/requirements.txt CHANGED Viewed

@@ -1,2 +1 @@
-virtualenv
-pyarrow==19.0.0
+virtualenv

mindsdb/integrations/handlers/lancedb_handler/requirements.txt CHANGED Viewed

@@ -1,3 +1,2 @@
 lancedb~=0.3.1
 lance
-pyarrow~=19.0.0

mindsdb/integrations/handlers/litellm_handler/litellm_handler.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import ast
 from typing import Dict, Optional, List
+from litellm import completion, batch_completion, embedding
 import pandas as pd
 from mindsdb.integrations.libs.base import BaseMLEngine
@@ -8,8 +10,6 @@ from mindsdb.utilities import log
 from mindsdb.integrations.handlers.litellm_handler.settings import CompletionParameters
-from litellm import completion, batch_completion
 logger = log.getLogger(__name__)
@@ -28,10 +28,24 @@ class LiteLLMHandler(BaseMLEngine):
     @staticmethod
     def create_validation(target, args=None, **kwargs):
         if "using" not in args:
-            raise Exception(
-                "Litellm engine requires a USING clause. See settings.py for more info on supported args."
+            raise Exception("Litellm engine requires a USING clause. See settings.py for more info on supported args.")
+    @staticmethod
+    def embeddings(model: str, messages: List[str], args: dict) -> List[list]:
+        response = embedding(model=model, input=messages, **args)
+        return [rec["embedding"] for rec in response.data]
+    @staticmethod
+    async def acompletion(model: str, messages: List[dict], args: dict):
+        if model.startswith("snowflake/") and "snowflake_account_id" in args:
+            args["api_base"] = (
+                f"https://{args['snowflake_account_id']}.snowflakecomputing.com/api/v2/cortex/inference:complete"
             )
+        from litellm import acompletion
+        return await acompletion(model=model, messages=messages, stream=False, **args)
     def create(
         self,
         target: str,
@@ -70,9 +84,9 @@ class LiteLLMHandler(BaseMLEngine):
         self._build_messages(args, df)
         # remove prompt_template from args
-        args.pop('prompt_template', None)
+        args.pop("prompt_template", None)
-        if len(args['messages']) > 1:
+        if len(args["messages"]) > 1:
             # if more than one message, use batch completion
             responses = batch_completion(**args)
             return pd.DataFrame({"result": [response.choices[0].message.content for response in responses]})
@@ -103,36 +117,39 @@ class LiteLLMHandler(BaseMLEngine):
         if "prompt_template" in prompt_kwargs:
             # if prompt_template is passed in predict query, use it
-            logger.info("Using 'prompt_template' passed in SELECT Predict query. "
-                        "Note this will overwrite a 'prompt_template' passed in create MODEL query.")
+            logger.info(
+                "Using 'prompt_template' passed in SELECT Predict query. "
+                "Note this will overwrite a 'prompt_template' passed in create MODEL query."
+            )
-            args['prompt_template'] = prompt_kwargs.pop('prompt_template')
+            args["prompt_template"] = prompt_kwargs.pop("prompt_template")
-        if 'mock_response' in prompt_kwargs:
+        if "mock_response" in prompt_kwargs:
             # used for testing to save on real completion api calls
-            args['mock_response']: str = prompt_kwargs.pop('mock_response')
+            args["mock_response"]: str = prompt_kwargs.pop("mock_response")
-        if 'messages' in prompt_kwargs and len(prompt_kwargs) > 1:
+        if "messages" in prompt_kwargs and len(prompt_kwargs) > 1:
             # if user passes in messages, no other args can be passed in
-            raise Exception(
-                "If 'messages' is passed in SELECT Predict query, no other args can be passed in."
-            )
+            raise Exception("If 'messages' is passed in SELECT Predict query, no other args can be passed in.")
         # if user passes in messages, use those instead
-        if 'messages' in prompt_kwargs:
+        if "messages" in prompt_kwargs:
             logger.info("Using messages passed in SELECT Predict query. 'prompt_template' will be ignored.")
-            args['messages']: List = ast.literal_eval(df['messages'].iloc[0])
+            args["messages"]: List = ast.literal_eval(df["messages"].iloc[0])
         else:
             # if user passes in prompt_template, use that to create messages
             if len(prompt_kwargs) == 1:
-                args['messages'] = self._prompt_to_messages(args['prompt_template'], **prompt_kwargs) \
-                    if args['prompt_template'] else self._prompt_to_messages(df.iloc[0][0])
+                args["messages"] = (
+                    self._prompt_to_messages(args["prompt_template"], **prompt_kwargs)
+                    if args["prompt_template"]
+                    else self._prompt_to_messages(df.iloc[0][0])
+                )
             elif len(prompt_kwargs) > 1:
                 try:
-                    args['messages'] = self._prompt_to_messages(args['prompt_template'], **prompt_kwargs)
+                    args["messages"] = self._prompt_to_messages(args["prompt_template"], **prompt_kwargs)
                 except KeyError as e:
                     raise Exception(
                         f"{e}: Please pass in either a prompt_template on create MODEL or "

mindsdb/integrations/libs/llm/config.py CHANGED Viewed

@@ -114,3 +114,16 @@ class GoogleConfig(BaseLLMConfig):
     top_k: Optional[int] = Field(default=None, description="Number of highest probability tokens to consider")
     max_output_tokens: Optional[int] = Field(default=None, description="Maximum number of tokens to generate")
     google_api_key: Optional[str] = Field(default=None, description="API key for Google Generative AI")
+# See https://api.python.langchain.com/en/latest/llms/langchain_community.llms.writer.Writer.html
+class WriterConfig(BaseLLMConfig):
+    model_name: str = Field(default="palmyra-x5", alias="model_id")
+    temperature: Optional[float] = Field(default=0.7)
+    max_tokens: Optional[int] = Field(default=None)
+    top_p: Optional[float] = Field(default=None)
+    stop: Optional[List[str]] = Field(default=None)
+    best_of: Optional[int] = Field(default=None)
+    writer_api_key: Optional[str] = Field(default=None)
+    writer_org_id: Optional[str] = Field(default=None)
+    base_url: Optional[str] = Field(default=None)

mindsdb/integrations/libs/llm/utils.py CHANGED Viewed

@@ -16,6 +16,7 @@ from mindsdb.integrations.libs.llm.config import (
     OpenAIConfig,
     NvidiaNIMConfig,
     MindsdbConfig,
+    WriterConfig,
 )
 from mindsdb.utilities.config import config
 from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
@@ -41,16 +42,12 @@ DEFAULT_LITELLM_BASE_URL = "https://ai.dev.mindsdb.com"
 DEFAULT_OLLAMA_BASE_URL = "http://localhost:11434"
 DEFAULT_OLLAMA_MODEL = "llama2"
-DEFAULT_NVIDIA_NIM_BASE_URL = (
-    "http://localhost:8000/v1"  # Assumes local port forwarding through ssh
-)
+DEFAULT_NVIDIA_NIM_BASE_URL = "http://localhost:8000/v1"  # Assumes local port forwarding through ssh
 DEFAULT_NVIDIA_NIM_MODEL = "meta/llama-3_1-8b-instruct"
 DEFAULT_VLLM_SERVER_URL = "http://localhost:8000/v1"
-def get_completed_prompts(
-    base_template: str, df: pd.DataFrame, strict=True
-) -> Tuple[List[str], np.ndarray]:
+def get_completed_prompts(base_template: str, df: pd.DataFrame, strict=True) -> Tuple[List[str], np.ndarray]:
     """
     Helper method that produces formatted prompts given a template and data in a Pandas DataFrame.
     It also returns the ID of any empty templates that failed to be filled due to missing data.
@@ -69,9 +66,7 @@ def get_completed_prompts(
     if len(matches) == 0:
         # no placeholders
         if strict:
-            raise AssertionError(
-                "No placeholders found in the prompt, please provide a valid prompt template."
-            )
+            raise AssertionError("No placeholders found in the prompt, please provide a valid prompt template.")
         prompts = [base_template] * len(df)
         return prompts, np.ndarray(0)
@@ -95,12 +90,8 @@ def get_completed_prompts(
     for i in range(len(template)):
         atom = template[i]
         if i < len(columns):
-            col = df[columns[i]].replace(
-                to_replace=[None], value=""
-            )  # add empty quote if data is missing
-            df["__mdb_prompt"] = df["__mdb_prompt"].apply(
-                lambda x: x + atom
-            ) + col.astype("string")
+            col = df[columns[i]].replace(to_replace=[None], value="")  # add empty quote if data is missing
+            df["__mdb_prompt"] = df["__mdb_prompt"].apply(lambda x: x + atom) + col.astype("string")
         else:
             df["__mdb_prompt"] = df["__mdb_prompt"].apply(lambda x: x + atom)
     prompts = list(df["__mdb_prompt"])
@@ -119,8 +110,7 @@ def get_llm_config(provider: str, args: Dict) -> BaseLLMConfig:
     """
     temperature = min(1.0, max(0.0, args.get("temperature", 0.0)))
     if provider == "openai":
-        if any(x in args.get("model_name", "") for x in ['o1', 'o3']):
+        if any(x in args.get("model_name", "") for x in ["o1", "o3"]):
             # for o1 and 03, 'temperature' does not support 0.0 with this model. Only the default (1) value is supported
             temperature = 1
@@ -173,9 +163,7 @@ def get_llm_config(provider: str, args: Dict) -> BaseLLMConfig:
             max_tokens=args.get("max_tokens", DEFAULT_OPENAI_MAX_TOKENS),
             top_p=args.get("top_p", None),
             top_k=args.get("top_k", None),
-            custom_llm_provider=args.get(
-                "custom_llm_provider", DEFAULT_LITELLM_PROVIDER
-            ),
+            custom_llm_provider=args.get("custom_llm_provider", DEFAULT_LITELLM_PROVIDER),
             model_kwargs=model_kwargs,
         )
     if provider == "ollama":
@@ -237,6 +225,18 @@ def get_llm_config(provider: str, args: Dict) -> BaseLLMConfig:
             max_output_tokens=args.get("max_tokens", None),
             google_api_key=args["api_keys"].get("google", None),
         )
+    if provider == "writer":
+        return WriterConfig(
+            model_name=args.get("model_name", "palmyra-x5"),
+            temperature=temperature,
+            max_tokens=args.get("max_tokens", None),
+            top_p=args.get("top_p", None),
+            stop=args.get("stop", None),
+            best_of=args.get("best_of", None),
+            writer_api_key=args["api_keys"].get("writer", None),
+            writer_org_id=args.get("writer_org_id", None),
+            base_url=args.get("base_url", None),
+        )
     raise ValueError(f"Provider {provider} is not supported.")
@@ -290,9 +290,7 @@ def ft_jsonl_validation(
                 )  # noqa
             if messages_col not in batch:
-                raise Exception(
-                    f"{prefix}Each line in the provided data should have a '{messages_col}' key"
-                )
+                raise Exception(f"{prefix}Each line in the provided data should have a '{messages_col}' key")
             messages = batch[messages_col]
             try:
@@ -350,30 +348,22 @@ def ft_chat_format_validation(
     for c in chat:
         if any(k not in valid_keys for k in c.keys()):
-            raise Exception(
-                f"Each message should only have these keys: `{valid_keys}`. Found: `{c.keys()}`"
-            )
+            raise Exception(f"Each message should only have these keys: `{valid_keys}`. Found: `{c.keys()}`")
     roles = [m[role_key] for m in chat]
     contents = [m[content_key] for m in chat]
     if len(roles) != len(contents):
-        raise Exception(
-            f"Each message should contain both `{role_key}` and `{content_key}` fields"
-        )
+        raise Exception(f"Each message should contain both `{role_key}` and `{content_key}` fields")
     if len(roles) == 0:
         raise Exception("Chat should have at least one message")
     if assistant_key not in roles:
-        raise Exception(
-            "Chat should have at least one assistant message"
-        )  # otherwise it is useless for FT
+        raise Exception("Chat should have at least one assistant message")  # otherwise it is useless for FT
     if user_key not in roles:
-        raise Exception(
-            "Chat should have at least one user message"
-        )  # perhaps remove in the future
+        raise Exception("Chat should have at least one user message")  # perhaps remove in the future
     # set default transitions for finite state machine if undefined
     if transitions is None:
@@ -387,20 +377,15 @@ def ft_chat_format_validation(
     # check order is valid via finite state machine
     state = None
     for i, (role, content) in enumerate(zip(roles, contents)):
         prefix = f"message #{i + 1}: "
         # check invalid roles
         if role not in valid_roles:
-            raise Exception(
-                f"{prefix}Invalid role (found `{role}`, expected one of `{valid_roles}`)"
-            )
+            raise Exception(f"{prefix}Invalid role (found `{role}`, expected one of `{valid_roles}`)")
         # check content
         if not isinstance(content, str):
-            raise Exception(
-                f"{prefix}Content should be a string, got type `{type(content)}`"
-            )
+            raise Exception(f"{prefix}Content should be a string, got type `{type(content)}`")
         # check transition
         if role not in transitions[state]:
@@ -464,9 +449,7 @@ def ft_chat_formatter(df: pd.DataFrame) -> List[Dict]:
             df = df.sort_values(["chat_id"], kind="stable")
     elif "message_id" in df.columns:
         if df["message_id"].duplicated().any():
-            raise Exception(
-                "If `message_id` is provided, it must not contain duplicate IDs."
-            )
+            raise Exception("If `message_id` is provided, it must not contain duplicate IDs.")
         df = df.sort_values(["message_id"])
     # 2. build chats
@@ -477,12 +460,8 @@ def ft_chat_formatter(df: pd.DataFrame) -> List[Dict]:
         for _, row in df.iterrows():
             try:
                 chat = json.loads(row["chat_json"])
-                assert list(chat.keys()) == [
-                    "messages"
-                ], "Each chat should have a 'messages' key, and nothing else."
-                ft_chat_format_validation(
-                    chat["messages"]
-                )  # will raise Exception if chat is invalid
+                assert list(chat.keys()) == ["messages"], "Each chat should have a 'messages' key, and nothing else."
+                ft_chat_format_validation(chat["messages"])  # will raise Exception if chat is invalid
                 chats.append(chat)
             except json.JSONDecodeError:
                 pass  # TODO: add logger info here, prompt user to clean dataset carefully
@@ -492,9 +471,7 @@ def ft_chat_formatter(df: pd.DataFrame) -> List[Dict]:
         chat = []
         for i, row in df.iterrows():
             if row["role"] == "system" and len(chat) > 0:
-                ft_chat_format_validation(
-                    chat
-                )  # will raise Exception if chat is invalid
+                ft_chat_format_validation(chat)  # will raise Exception if chat is invalid
                 chats.append({"messages": chat})
                 chat = []
             event = {"role": row["role"], "content": row["content"]}
@@ -529,15 +506,11 @@ def ft_code_formatter(
     # input and setup validation
     assert len(df) > 0, "Input dataframe should not be empty"
     assert "code" in df.columns, "Input dataframe should have a 'code' column"
-    assert chunk_size > 0 and isinstance(
-        chunk_size, int
-    ), "`chunk_size` should be a positive integer"
+    assert chunk_size > 0 and isinstance(chunk_size, int), "`chunk_size` should be a positive integer"
     supported_formats = ["chat", "fim"]
     supported_langs = [e.value for e in Language]
-    assert (
-        language.lower() in supported_langs
-    ), f"Invalid language. Valid choices are: {supported_langs}"
+    assert language.lower() in supported_langs, f"Invalid language. Valid choices are: {supported_langs}"
     # ensure correct encoding
     df["code"] = df["code"].map(lambda x: x.encode("utf8").decode("unicode_escape"))
@@ -574,7 +547,7 @@ def ft_code_formatter(
     roles = []
     contents = []
     for idx in range(0, len(chunks), 3):
-        pre, mid, suf = chunks[idx: idx + 3]
+        pre, mid, suf = chunks[idx : idx + 3]
         interleaved = list(itertools.chain(*zip(templates, (pre, suf, mid))))
         user = "\n".join(interleaved[:-1])
         assistant = "\n".join(interleaved[-1:])
@@ -595,12 +568,11 @@ def ft_cqa_formatter(
     default_instruction="You are a helpful assistant.",
     default_context="",
 ) -> pd.DataFrame:
     # input and setup validation
     assert len(df) > 0, "Input dataframe should not be empty"
-    assert {question_col, answer_col}.issubset(
-        set(df.columns)
-    ), f"Input dataframe must have columns `{question_col}`, and `{answer_col}`"  # noqa
+    assert {question_col, answer_col}.issubset(set(df.columns)), (
+        f"Input dataframe must have columns `{question_col}`, and `{answer_col}`"
+    )  # noqa
     if instruction_col not in df.columns:
         df[instruction_col] = default_instruction

MindsDB 25.5.4.0__py3-none-any.whl → 25.5.4.2__py3-none-any.whl

Potentially problematic release.

MindsDB 25.5.4.0__py3-none-any.whl → 25.5.4.2__py3-none-any.whl