PyPI - MindsDB - Versions diffs - 25.7.3.0__py3-none-any.whl → 25.8.2.0__py3-none-any.whl - Mend

MindsDB 25.7.3.0__py3-none-any.whl → 25.8.2.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.

This version of MindsDB might be problematic. Click here for more details.

Files changed (102) hide show

mindsdb/integrations/handlers/web_handler/web_handler.py CHANGED Viewed

@@ -7,17 +7,11 @@ from mindsdb.utilities.security import validate_urls
 from .urlcrawl_helpers import get_all_websites
 from mindsdb.integrations.libs.api_handler import APIResource, APIHandler
-from mindsdb.integrations.utilities.sql_utils import (FilterCondition, FilterOperator)
+from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
 class CrawlerTable(APIResource):
-    def list(
-            self,
-            conditions: List[FilterCondition] = None,
-            limit: int = None,
-            **kwargs
-    ) -> pd.DataFrame:
+    def list(self, conditions: List[FilterCondition] = None, limit: int = None, **kwargs) -> pd.DataFrame:
         """
         Selects data from the provided websites
@@ -30,27 +24,34 @@ class CrawlerTable(APIResource):
         urls = []
         crawl_depth = None
         per_url_limit = None
+        headers = {}
         for condition in conditions:
-            if condition.column == 'url':
+            if condition.column == "url":
                 if condition.op == FilterOperator.IN:
                     urls = condition.value
                 elif condition.op == FilterOperator.EQUAL:
                     urls = [condition.value]
                 condition.applied = True
-            if condition.column == 'crawl_depth' and condition.op == FilterOperator.EQUAL:
+            if condition.column == "crawl_depth" and condition.op == FilterOperator.EQUAL:
                 crawl_depth = condition.value
                 condition.applied = True
-            if condition.column == 'per_url_limit' and condition.op == FilterOperator.EQUAL:
+            if condition.column == "per_url_limit" and condition.op == FilterOperator.EQUAL:
                 per_url_limit = condition.value
                 condition.applied = True
+            if condition.column.lower() == "user_agent" and condition.op == FilterOperator.EQUAL:
+                headers["User-Agent"] = condition.value
+                condition.applied = True
         if len(urls) == 0:
             raise NotImplementedError(
-                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"')
+                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"'
+            )
-        allowed_urls = config.get('web_crawling_allowed_sites', [])
+        allowed_urls = config.get("web_crawling_allowed_sites", [])
         if allowed_urls and not validate_urls(urls, allowed_urls):
-            raise ValueError(f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}.")
+            raise ValueError(
+                f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}."
+            )
         if limit is None and per_url_limit is None and crawl_depth is None:
             per_url_limit = 1
@@ -58,10 +59,10 @@ class CrawlerTable(APIResource):
             # crawl every url separately
             results = []
             for url in urls:
-                results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth))
+                results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth, headers=headers))
             result = pd.concat(results)
         else:
-            result = get_all_websites(urls, limit, crawl_depth=crawl_depth)
+            result = get_all_websites(urls, limit, crawl_depth=crawl_depth, headers=headers)
         if limit is not None and len(result) > limit:
             result = result[:limit]
@@ -72,11 +73,7 @@ class CrawlerTable(APIResource):
         """
         Returns the columns of the crawler table
         """
-        return [
-            'url',
-            'text_content',
-            'error'
-        ]
+        return ["url", "text_content", "error"]
 class WebHandler(APIHandler):
@@ -87,7 +84,7 @@ class WebHandler(APIHandler):
     def __init__(self, name=None, **kwargs):
         super().__init__(name)
         crawler = CrawlerTable(self)
-        self._register_table('crawler', crawler)
+        self._register_table("crawler", crawler)
     def check_connection(self) -> HandlerStatusResponse:
         """

mindsdb/integrations/libs/llm/config.py CHANGED Viewed

@@ -37,20 +37,6 @@ class AnthropicConfig(BaseLLMConfig):
     anthropic_api_url: Optional[str]
-# See https://api.python.langchain.com/en/latest/chat_models/langchain_community.chat_models.anyscale.ChatAnyscale.html
-# This config does not have to be exclusively used with Langchain.
-class AnyscaleConfig(BaseLLMConfig):
-    model_name: str
-    temperature: Optional[float]
-    max_retries: Optional[int]
-    max_tokens: Optional[int]
-    anyscale_api_base: Optional[str]
-    # Inferred from ANYSCALE_API_KEY if not provided.
-    anyscale_api_key: Optional[str]
-    anyscale_proxy: Optional[str]
-    request_timeout: Optional[float]
 # See https://api.python.langchain.com/en/latest/chat_models/langchain_community.chat_models.litellm.ChatLiteLLM.html
 # This config does not have to be exclusively used with Langchain.
 class LiteLLMConfig(BaseLLMConfig):

mindsdb/integrations/libs/llm/utils.py CHANGED Viewed

@@ -8,7 +8,6 @@ import pandas as pd
 from mindsdb.integrations.libs.llm.config import (
     AnthropicConfig,
-    AnyscaleConfig,
     BaseLLMConfig,
     GoogleConfig,
     LiteLLMConfig,
@@ -30,9 +29,6 @@ DEFAULT_OPENAI_MAX_RETRIES = 3
 DEFAULT_ANTHROPIC_MODEL = "claude-3-haiku-20240307"
-DEFAULT_ANYSCALE_MODEL = "meta-llama/Llama-2-7b-chat-hf"
-DEFAULT_ANYSCALE_BASE_URL = "https://api.endpoints.anyscale.com/v1"
 DEFAULT_GOOGLE_MODEL = "gemini-2.5-pro-preview-03-25"
 DEFAULT_LITELLM_MODEL = "gpt-3.5-turbo"
@@ -135,17 +131,6 @@ def get_llm_config(provider: str, args: Dict) -> BaseLLMConfig:
             anthropic_api_key=args["api_keys"].get("anthropic", None),
             anthropic_api_url=args.get("base_url", None),
         )
-    if provider == "anyscale":
-        return AnyscaleConfig(
-            model_name=args.get("model_name", DEFAULT_ANYSCALE_MODEL),
-            temperature=temperature,
-            max_retries=args.get("max_retries", DEFAULT_OPENAI_MAX_RETRIES),
-            max_tokens=args.get("max_tokens", DEFAULT_OPENAI_MAX_TOKENS),
-            anyscale_api_base=args.get("base_url", DEFAULT_ANYSCALE_BASE_URL),
-            anyscale_api_key=args["api_keys"].get("anyscale", None),
-            anyscale_proxy=args.get("proxy", None),
-            request_timeout=args.get("request_timeout", None),
-        )
     if provider == "litellm":
         model_kwargs = {
             "api_key": args["api_keys"].get("litellm", None),

mindsdb/integrations/libs/vectordatabase_handler.py CHANGED Viewed

@@ -334,12 +334,21 @@ class VectorStoreHandler(BaseHandler):
         if not df_update.empty:
             # get values of existed `created_at` and return them to metadata
-            created_dates = {row[id_col]: row[metadata_col].get("_created_at") for _, row in df_existed.iterrows()}
+            origin_id_col = "_original_doc_id"
+            created_dates, ids = {}, {}
+            for _, row in df_existed.iterrows():
+                chunk_id = row[id_col]
+                created_dates[chunk_id] = row[metadata_col].get("_created_at")
+                ids[chunk_id] = row[metadata_col].get(origin_id_col)
             def keep_created_at(row):
                 val = created_dates.get(row[id_col])
                 if val:
                     row[metadata_col]["_created_at"] = val
+                # keep id column
+                if origin_id_col not in row[metadata_col]:
+                    row[metadata_col][origin_id_col] = ids.get(row[id_col])
                 return row
             df_update.apply(keep_created_at, axis=1)

mindsdb/integrations/utilities/files/file_reader.py CHANGED Viewed

@@ -10,6 +10,7 @@ from typing import List, Generator
 import filetype
 import pandas as pd
 from charset_normalizer import from_bytes
+from mindsdb.interfaces.knowledge_base.preprocessing.text_splitter import TextSplitter
 from mindsdb.utilities import log
@@ -322,40 +323,25 @@ class FileReader(FormatDetector):
     @staticmethod
     def read_txt(file_obj: BytesIO, name: str | None = None, **kwargs) -> pd.DataFrame:
         # the lib is heavy, so import it only when needed
-        from langchain_text_splitters import RecursiveCharacterTextSplitter
         file_obj = decode(file_obj)
-        try:
-            from langchain_core.documents import Document
-        except ImportError:
-            raise FileProcessingError(
-                "To import TXT document please install 'langchain-community':\n    pip install langchain-community"
-            )
         text = file_obj.read()
-        metadata = {"source_file": name, "file_format": "txt"}
-        documents = [Document(page_content=text, metadata=metadata)]
+        text_splitter = TextSplitter(chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP)
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP
-        )
-        docs = text_splitter.split_documents(documents)
-        return pd.DataFrame([{"content": doc.page_content, "metadata": doc.metadata} for doc in docs])
+        docs = text_splitter.split_text(text)
+        return pd.DataFrame([{"content": doc, "metadata": {"source_file": name, "file_format": "txt"}} for doc in docs])
     @staticmethod
     def read_pdf(file_obj: BytesIO, name: str | None = None, **kwargs) -> pd.DataFrame:
         # the libs are heavy, so import it only when needed
         import fitz  # pymupdf
-        from langchain_text_splitters import RecursiveCharacterTextSplitter
         with fitz.open(stream=file_obj.read()) as pdf:  # open pdf
             text = chr(12).join([page.get_text() for page in pdf])
-        text_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP
-        )
+        text_splitter = TextSplitter(chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP)
         split_text = text_splitter.split_text(text)

mindsdb/integrations/utilities/handler_utils.py CHANGED Viewed

@@ -37,54 +37,74 @@ def get_api_key(
     # 1
     if "using" in create_args and f"{api_name.lower()}_api_key" in create_args["using"]:
-        return create_args["using"][f"{api_name.lower()}_api_key"]
+        api_key = create_args["using"][f"{api_name.lower()}_api_key"]
+        if api_key:
+            return api_key
     # 1.5 - Check for generic api_key in using
     if "using" in create_args and "api_key" in create_args["using"]:
-        return create_args["using"]["api_key"]
+        api_key = create_args["using"]["api_key"]
+        if api_key:
+            return api_key
     # 2
     if f"{api_name.lower()}_api_key" in create_args:
-        return create_args[f"{api_name.lower()}_api_key"]
+        api_key = create_args[f"{api_name.lower()}_api_key"]
+        if api_key:
+            return api_key
     # 2.5 - Check for generic api_key
     if "api_key" in create_args:
-        return create_args["api_key"]
+        api_key = create_args["api_key"]
+        if api_key:
+            return api_key
     # 3 - Check in params dictionary if it exists (for agents)
     if "params" in create_args and create_args["params"] is not None:
         if f"{api_name.lower()}_api_key" in create_args["params"]:
-            return create_args["params"][f"{api_name.lower()}_api_key"]
+            api_key = create_args["params"][f"{api_name.lower()}_api_key"]
+            if api_key:
+                return api_key
         # 3.5 - Check for generic api_key in params
         if "api_key" in create_args["params"]:
-            return create_args["params"]["api_key"]
+            api_key = create_args["params"]["api_key"]
+            if api_key:
+                return api_key
     # 4
     if engine_storage is not None:
         connection_args = engine_storage.get_connection_args()
         if f"{api_name.lower()}_api_key" in connection_args:
-            return connection_args[f"{api_name.lower()}_api_key"]
+            api_key = connection_args[f"{api_name.lower()}_api_key"]
+            if api_key:
+                return api_key
         # 4.5 - Check for generic api_key in connection_args
         if "api_key" in connection_args:
-            return connection_args["api_key"]
+            api_key = connection_args["api_key"]
+            if api_key:
+                return api_key
     # 5
     api_key = os.getenv(f"{api_name.lower()}_api_key")
-    if api_key is not None:
+    if api_key:
         return api_key
     api_key = os.getenv(f"{api_name.upper()}_API_KEY")
-    if api_key is not None:
+    if api_key:
         return api_key
     # 6
     config = Config()
     api_cfg = config.get(api_name, {})
     if f"{api_name.lower()}_api_key" in api_cfg:
-        return api_cfg[f"{api_name.lower()}_api_key"]
+        api_key = api_cfg[f"{api_name.lower()}_api_key"]
+        if api_key:
+            return api_key
     # 7
     if "api_keys" in create_args and api_name in create_args["api_keys"]:
-        return create_args["api_keys"][api_name]
+        api_key = create_args["api_keys"][api_name]
+        if api_key:
+            return api_key
     if strict:
         provider_upper = api_name.upper()

mindsdb/integrations/utilities/rag/rerankers/base_reranker.py CHANGED Viewed

@@ -33,7 +33,7 @@ class BaseLLMReranker(BaseModel, ABC):
     client: Optional[AsyncOpenAI | BaseMLEngine] = None
     _semaphore: Optional[asyncio.Semaphore] = None
     max_concurrent_requests: int = 20
-    max_retries: int = 2
+    max_retries: int = 4
     retry_delay: float = 1.0
     request_timeout: float = 20.0  # Timeout for API requests
     early_stop: bool = True  # Whether to enable early stopping

MindsDB 25.7.3.0__py3-none-any.whl → 25.8.2.0__py3-none-any.whl

Potentially problematic release.

MindsDB 25.7.3.0__py3-none-any.whl → 25.8.2.0__py3-none-any.whl