MindsDB 25.2.2.2__py3-none-any.whl → 25.2.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB might be problematic.
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/METADATA +209 -228
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/RECORD +52 -50
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +1 -11
- mindsdb/api/executor/datahub/datanodes/system_tables.py +4 -1
- mindsdb/api/http/initialize.py +8 -5
- mindsdb/api/http/namespaces/agents.py +0 -7
- mindsdb/api/http/namespaces/config.py +0 -48
- mindsdb/api/http/namespaces/databases.py +69 -1
- mindsdb/api/http/namespaces/knowledge_bases.py +1 -1
- mindsdb/api/http/namespaces/util.py +0 -28
- mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/dspy_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/file_handler/file_handler.py +28 -46
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +8 -11
- mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/langchain_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +1 -1
- mindsdb/integrations/handlers/openai_handler/constants.py +3 -1
- mindsdb/integrations/handlers/openai_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/rag_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +33 -8
- mindsdb/integrations/handlers/timegpt_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +3 -2
- mindsdb/integrations/handlers/web_handler/web_handler.py +42 -33
- mindsdb/integrations/handlers/youtube_handler/__init__.py +2 -0
- mindsdb/integrations/handlers/youtube_handler/connection_args.py +32 -0
- mindsdb/integrations/libs/llm/utils.py +5 -0
- mindsdb/integrations/libs/process_cache.py +2 -2
- mindsdb/integrations/utilities/files/file_reader.py +66 -14
- mindsdb/integrations/utilities/rag/chains/local_context_summarizer_chain.py +227 -0
- mindsdb/interfaces/agents/agents_controller.py +3 -3
- mindsdb/interfaces/agents/callback_handlers.py +52 -5
- mindsdb/interfaces/agents/langchain_agent.py +5 -3
- mindsdb/interfaces/database/database.py +1 -1
- mindsdb/interfaces/database/integrations.py +1 -1
- mindsdb/interfaces/file/file_controller.py +140 -11
- mindsdb/interfaces/jobs/scheduler.py +1 -1
- mindsdb/interfaces/knowledge_base/preprocessing/constants.py +2 -2
- mindsdb/interfaces/skills/skills_controller.py +2 -2
- mindsdb/interfaces/skills/sql_agent.py +6 -1
- mindsdb/interfaces/storage/db.py +1 -12
- mindsdb/migrations/versions/2025-02-09_4943359e354a_file_metadata.py +31 -0
- mindsdb/migrations/versions/2025-02-10_6ab9903fc59a_del_log_table.py +33 -0
- mindsdb/utilities/config.py +1 -0
- mindsdb/utilities/log.py +17 -2
- mindsdb/utilities/ml_task_queue/consumer.py +4 -2
- mindsdb/utilities/render/sqlalchemy_render.py +15 -5
- mindsdb/utilities/log_controller.py +0 -39
- mindsdb/utilities/telemetry.py +0 -44
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/LICENSE +0 -0
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/WHEEL +0 -0
- {MindsDB-25.2.2.2.dist-info → MindsDB-25.2.4.0.dist-info}/top_level.txt +0 -0
mindsdb/api/http/namespaces/util.py

@@ -9,12 +9,6 @@ from flask import current_app as ca
 
 from mindsdb.metrics.metrics import api_endpoint_metrics
 from mindsdb.api.http.namespaces.configs.util import ns_conf
-from mindsdb.utilities.telemetry import (
-    enable_telemetry,
-    disable_telemetry,
-    telemetry_file_exists,
-    inject_telemetry_to_static
-)
 from mindsdb.api.http.gui import update_static
 from mindsdb.utilities.fs import clean_unlinked_process_marks
 from mindsdb.api.http.utils import http_error
@@ -98,28 +92,6 @@ class PingNative(Resource):
         return get_active_tasks()
 
 
-@ns_conf.route('/telemetry')
-class Telemetry(Resource):
-    @ns_conf.doc('get_telemetry_status')
-    @api_endpoint_metrics('GET', '/util/telemetry')
-    def get(self):
-        root_storage_path = ca.config_obj['paths']['root']
-        status = "enabled" if telemetry_file_exists(root_storage_path) else "disabled"
-        return {"status": status}
-
-    @ns_conf.doc('set_telemetry')
-    @api_endpoint_metrics('POST', '/util/telemetry')
-    def post(self):
-        data = request.json
-        action = data['action']
-        if str(action).lower() in ["true", "enable", "on"]:
-            enable_telemetry(ca.config_obj['paths']['root'])
-        else:
-            disable_telemetry(ca.config_obj['paths']['root'])
-        inject_telemetry_to_static(ca.config_obj.paths['static'])
-        return '', 200
-
-
 @ns_conf.route('/validate_json_ai')
 class ValidateJsonAI(Resource):
     @api_endpoint_metrics('POST', '/util/validate_json_ai')
mindsdb/integrations/handlers/file_handler/file_handler.py

@@ -1,11 +1,10 @@
 import os
 import shutil
 import tempfile
-from pathlib import Path
 
 import pandas as pd
 from mindsdb_sql_parser import parse_sql
-from mindsdb_sql_parser.ast import CreateTable, DropTables, Insert, Select
+from mindsdb_sql_parser.ast import CreateTable, DropTables, Insert, Select, Identifier
 from mindsdb_sql_parser.ast.base import ASTNode
 
 from mindsdb.api.executor.utilities.sql import query_df
@@ -15,8 +14,6 @@ from mindsdb.integrations.libs.response import HandlerResponse as Response
 from mindsdb.integrations.libs.response import HandlerStatusResponse as StatusResponse
 from mindsdb.utilities import log
 
-from mindsdb.integrations.utilities.files.file_reader import FileReader
-
 
 logger = log.getLogger(__name__)
 
@@ -63,6 +60,18 @@ class FileHandler(DatabaseHandler):
     def check_connection(self) -> StatusResponse:
         return StatusResponse(True)
 
+    def _get_table_page_names(self, table: Identifier):
+        table_name_parts = table.parts
+
+        # Check if it's a multi-part name (e.g., `file_name.sheet_name`)
+        if len(table_name_parts) > 1:
+            table_name = table_name_parts[-2]
+            page_name = table_name_parts[-1]  # Get the sheet name
+        else:
+            table_name = table_name_parts[-1]
+            page_name = None
+        return table_name, page_name
+
     def query(self, query: ASTNode) -> Response:
         if type(query) is DropTables:
             for table_identifier in query.tables:
@@ -84,7 +93,7 @@ class FileHandler(DatabaseHandler):
             )
             return Response(RESPONSE_TYPE.OK)
 
-        if …
+        if isinstance(query, CreateTable):
             # Check if the table already exists or if the table name contains more than one namespace
             existing_files = self.file_controller.get_files_names()
 
@@ -96,13 +105,13 @@ class FileHandler(DatabaseHandler):
 
             table_name = query.name.parts[-1]
             if table_name in existing_files:
-                …
+                if query.is_replace:
+                    self.file_controller.delete_file(table_name)
+                else:
+                    return Response(
+                        RESPONSE_TYPE.ERROR,
+                        error_message=f"Table '{table_name}' already exists",
+                    )
 
             temp_dir_path = tempfile.mkdtemp(prefix="mindsdb_file_")
 
@@ -126,31 +135,19 @@ class FileHandler(DatabaseHandler):
 
             return Response(RESPONSE_TYPE.OK)
 
-        elif …
-            table_name = table_name_parts[-1]
-
-            # Check if it's a multi-part name (e.g., `files.file_name.sheet_name`)
-            if len(table_name_parts) > 1:
-                table_name = table_name_parts[-2]
-                sheet_name = table_name_parts[-1]  # Get the sheet name
-            else:
-                sheet_name = None
-            file_path = self.file_controller.get_file_path(table_name)
+        elif isinstance(query, Select):
+            table_name, page_name = self._get_table_page_names(query.from_table)
 
-            df = self.…
+            df = self.file_controller.get_file_data(table_name, page_name)
 
             # Process the SELECT query
             result_df = query_df(df, query)
             return Response(RESPONSE_TYPE.TABLE, data_frame=result_df)
 
-        elif …
-            table_name = query.table
-            file_path = self.file_controller.get_file_path(table_name)
+        elif isinstance(query, Insert):
+            table_name, page_name = self._get_table_page_names(query.table)
 
-            …
-            df = file_reader.to_df()
+            df = self.file_controller.get_file_data(table_name, page_name)
 
             # Create a new dataframe with the values from the query
             new_df = pd.DataFrame(query.values, columns=[col.name for col in query.columns])
@@ -158,10 +155,7 @@ class FileHandler(DatabaseHandler):
             # Concatenate the new dataframe with the existing one
             df = pd.concat([df, new_df], ignore_index=True)
 
-            format = Path(file_path).suffix.strip(".").lower()
-            write_method = getattr(df, f"to_{format}")
-            write_method(file_path, index=False)
+            self.file_controller.set_file_data(table_name, df, page_name=page_name)
 
             return Response(RESPONSE_TYPE.OK)
@@ -175,18 +169,6 @@ class FileHandler(DatabaseHandler):
         ast = self.parser(query)
         return self.query(ast)
 
-    @staticmethod
-    def handle_source(file_path, **kwargs):
-        file_reader = FileReader(path=file_path)
-
-        df = file_reader.to_df(**kwargs)
-
-        header = df.columns.values.tolist()
-
-        df.columns = [key.strip() for key in header]
-        df = df.applymap(clean_cell)
-        return df
-
     def get_tables(self) -> Response:
         """
         List all files
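Taken together, the file_handler changes route all reads and writes through the FileController (get_file_data/set_file_data) and resolve multi-part table names into a (table, page) pair, so a spreadsheet sheet can be addressed as `file_name.sheet_name`. A minimal standalone sketch of that resolution logic, where resolve_table_and_page is a hypothetical stand-in for the new _get_table_page_names helper (which takes a parser Identifier rather than a plain list):

    def resolve_table_and_page(parts):
        # 'my_file'        -> ('my_file', None)
        # 'my_file.Sheet2' -> ('my_file', 'Sheet2')
        if len(parts) > 1:
            return parts[-2], parts[-1]
        return parts[-1], None

    assert resolve_table_and_page(['my_file']) == ('my_file', None)
    assert resolve_table_and_page(['my_file', 'Sheet2']) == ('my_file', 'Sheet2')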
mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py

@@ -64,6 +64,12 @@ class MockFileController:
     def save_file(self, name, file_path, file_name=None):
         return True
 
+    def get_file_data(self, name, page_name=None):
+        return pandas.DataFrame(test_file_content[1:], columns=test_file_content[0])
+
+    def set_file_data(self, name, df, page_name=None):
+        return True
+
 
 def curr_dir():
     return os.path.dirname(os.path.realpath(__file__))
@@ -296,18 +302,9 @@ def test_handle_source(file_path, expected_columns):
 
     # using different methods to create reader
     for reader in get_reader(file_path):
-        df = reader.…
+        df = reader.get_page_content()
         assert isinstance(df, pandas.DataFrame)
 
-        if reader.get_format() == 'xlsx':
-
-            assert df.columns.tolist() == test_excel_sheet_content[0]
-            assert len(df) == len(test_excel_sheet_content) - 1
-            assert df.values.tolist() == test_excel_sheet_content[1:]
-            sheet_name = test_excel_sheet_content[1][0]
-
-            df = reader.to_df(sheet_name=sheet_name)
-
         assert df.columns.tolist() == expected_columns
 
         # The pdf and txt files have some different content
@@ -336,7 +333,7 @@ def test_tsv():
     assert reader.get_format() == 'csv'
     assert reader.parameters['delimiter'] == '\t'
 
-    df = reader.…
+    df = reader.get_page_content()
     assert len(df.columns) == 2
 
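Judging by the updated tests, FileReader.to_df() is gone and get_page_content() is the way to read a page as a DataFrame. A hedged usage sketch, assuming MindsDB is installed and data.tsv is a hypothetical local tab-separated file:

    from mindsdb.integrations.utilities.files.file_reader import FileReader

    reader = FileReader(path='data.tsv')   # hypothetical local file
    assert reader.get_format() == 'csv'    # per the tests, TSV is detected as csv with a '\t' delimiter
    df = reader.get_page_content()         # returns a pandas DataFrame for the default page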
mindsdb/integrations/handlers/openai_handler/constants.py

@@ -8,7 +8,9 @@ CHAT_MODELS = (
     'gpt-4-32k',
     'gpt-4-1106-preview',
     'gpt-4-0125-preview',
-    'gpt-4o'
+    'gpt-4o',
+    'o3-mini',
+    'o1-mini'
 )
 COMPLETION_MODELS = ('babbage-002', 'davinci-002')
 FINETUNING_MODELS = ('gpt-3.5-turbo', 'babbage-002', 'davinci-002', 'gpt-4')
mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py

@@ -1,9 +1,11 @@
+import io
 import json
 
 import requests
 from typing import Dict, Optional
 
 import pandas as pd
+import pyarrow.parquet as pq
 
 from mindsdb.integrations.libs.base import BaseMLEngine
 
@@ -37,9 +39,17 @@ class RayServeHandler(BaseMLEngine):
         args['target'] = target
         self.model_storage.json_set('args', args)
         try:
-            …
+            if args.get('is_parquet', False):
+                buffer = io.BytesIO()
+                df.to_parquet(buffer)
+                resp = requests.post(args['train_url'],
+                                     files={"df": ("df", buffer.getvalue(), "application/octet-stream")},
+                                     data={"args": json.dumps(args), "target": target},
+                                     )
+            else:
+                resp = requests.post(args['train_url'],
+                                     json={'df': df.to_json(orient='records'), 'target': target, 'args': args},
+                                     headers={'content-type': 'application/json; format=pandas-records'})
         except requests.exceptions.InvalidSchema:
             raise Exception("Error: The URL provided for the training endpoint is invalid.")
 
@@ -59,14 +69,29 @@ class RayServeHandler(BaseMLEngine):
         args = {**(self.model_storage.json_get('args')), **args}  # merge incoming args
         pred_args = args.get('predict_params', {})
         args = {**args, **pred_args}  # merge pred_args
-        …
+        if args.get('is_parquet', False):
+            buffer = io.BytesIO()
+            df.attrs['pred_args'] = pred_args
+            df.to_parquet(buffer)
+            resp = requests.post(args['predict_url'],
+                                 files={"df": ("df", buffer.getvalue(), "application/octet-stream")},
+                                 data={"pred_args": json.dumps(pred_args)},
+                                 )
+        else:
+            resp = requests.post(args['predict_url'],
+                                 json={'df': df.to_json(orient='records'), 'pred_args': pred_args},
+                                 headers={'content-type': 'application/json; format=pandas-records'})
         try:
-            …
+            if args.get('is_parquet', False):
+                buffer = io.BytesIO(resp.content)
+                table = pq.read_table(buffer)
+                response = table.to_pandas()
+            else:
+                response = resp.json()
         except json.JSONDecodeError:
             error = resp.text
+        except Exception:
+            error = 'Could not decode parquet.'
         else:
             if 'prediction' in response:
                 target = args['target']
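Both create() and predict() gain an is_parquet transport alongside the existing JSON one. A minimal round-trip sketch of the serialization the handler now performs on each side of the HTTP call (no Ray Serve endpoint involved; assumes pandas and pyarrow are installed):

    import io

    import pandas as pd
    import pyarrow.parquet as pq

    df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})

    buffer = io.BytesIO()
    df.to_parquet(buffer)        # what create()/predict() send as files={'df': ...}
    payload = buffer.getvalue()

    # predict() decodes a parquet response body the same way:
    result = pq.read_table(io.BytesIO(payload)).to_pandas()
    assert result.equals(df)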
mindsdb/integrations/handlers/timegpt_handler/requirements.txt

@@ -1 +1 @@
-nixtla==0.…
+nixtla==0.6.6
mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py

@@ -220,8 +220,6 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
     if limit is not None:
         if len(reviewed_urls) >= limit:
             return reviewed_urls
-    if crawl_depth == current_depth:
-        return reviewed_urls
 
     if not filters:
         matches_filter = True
@@ -241,6 +239,9 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
             "error": str(error_message),
         }
 
+    if crawl_depth is not None and crawl_depth == current_depth:
+        return reviewed_urls
+
     to_rev_url_list = []
 
     # create a list of new urls to review that don't exist in the already reviewed ones
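The depth check moves below the point where the current page's content (or error) has been recorded, and it now tolerates crawl_depth=None (no depth limit). A toy recursion, not the real helper, illustrating the effect:

    def crawl(url, reviewed_urls, crawl_depth=None, current_depth=0):
        reviewed_urls[url] = f'content of {url}'   # the page itself is recorded first
        if crawl_depth is not None and crawl_depth == current_depth:
            return reviewed_urls                   # stop descending, but keep this page
        for child in (url + '/a', url + '/b'):     # stand-in for links extracted from the page
            if child not in reviewed_urls:
                crawl(child, reviewed_urls, crawl_depth, current_depth + 1)
        return reviewed_urls

    # with crawl_depth=0 the root page itself is now returned instead of an empty result
    assert list(crawl('https://example.com', {}, crawl_depth=0)) == ['https://example.com']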
mindsdb/integrations/handlers/web_handler/web_handler.py

@@ -1,62 +1,71 @@
+from typing import List
+
 import pandas as pd
 from mindsdb.integrations.libs.response import HandlerStatusResponse
-from …
-from mindsdb.integrations.libs.api_handler import APIHandler, APITable
-from mindsdb.utilities.config import Config
-from mindsdb.integrations.utilities.sql_utils import extract_comparison_conditions, project_dataframe
+from mindsdb.utilities.config import config
 from mindsdb.utilities.security import validate_urls
 from .urlcrawl_helpers import get_all_websites
 
+from mindsdb.integrations.libs.api_handler import APIResource, APIHandler
+from mindsdb.integrations.utilities.sql_utils import (FilterCondition, FilterOperator)
 
-class CrawlerTable(APITable):
 
-    …
-        super().__init__(handler)
-        self.config = Config()
+class CrawlerTable(APIResource):
 
-    def …
+    def list(
+        self,
+        conditions: List[FilterCondition] = None,
+        limit: int = None,
+        **kwargs
+    ) -> pd.DataFrame:
         """
         Selects data from the provided websites
 
-        Args:
-            query (ast.Select): Given SQL SELECT query
-
         Returns:
             dataframe: Dataframe containing the crawled data
 
         Raises:
             NotImplementedError: If the query is not supported
         """
-        conditions = extract_comparison_conditions(query.where)
         urls = []
-        …
-        if …
-        if …
-        urls = …
-        …
+        crawl_depth = None
+        per_url_limit = None
+        for condition in conditions:
+            if condition.column == 'url':
+                if condition.op == FilterOperator.IN:
+                    urls = condition.value
+                elif condition.op == FilterOperator.EQUAL:
+                    urls = [condition.value]
+                condition.applied = True
+            if condition.column == 'crawl_depth' and condition.op == FilterOperator.EQUAL:
+                crawl_depth = condition.value
+                condition.applied = True
+            if condition.column == 'per_url_limit' and condition.op == FilterOperator.EQUAL:
+                per_url_limit = condition.value
+                condition.applied = True
 
         if len(urls) == 0:
             raise NotImplementedError(
-                'You must specify what url you want to crawl, for example: SELECT * FROM …
+                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"')
 
-        allowed_urls = …
+        allowed_urls = config.get('web_crawling_allowed_sites', [])
         if allowed_urls and not validate_urls(urls, allowed_urls):
             raise ValueError(f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}.")
 
-        if …
-        …
+        if limit is None and per_url_limit is None and crawl_depth is None:
+            per_url_limit = 1
+        if per_url_limit is not None:
+            # crawl every url separately
+            results = []
+            for url in urls:
+                results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth))
+            result = pd.concat(results)
+        else:
+            result = get_all_websites(urls, limit, crawl_depth=crawl_depth)
+
+        if limit is not None and len(result) > limit:
             result = result[:limit]
-
-        result = project_dataframe(result, query.targets, self.get_columns())
+
         return result
 
     def get_columns(self):
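CrawlerTable now receives already-parsed filter conditions through APIResource.list() instead of walking the SELECT AST itself. A standalone sketch of the shapes involved, using stand-ins for FilterCondition/FilterOperator from mindsdb.integrations.utilities.sql_utils (the real classes may differ in detail):

    from dataclasses import dataclass
    from enum import Enum
    from typing import Any

    class FilterOperator(Enum):
        EQUAL = '='
        IN = 'in'

    @dataclass
    class FilterCondition:
        column: str
        op: FilterOperator
        value: Any
        applied: bool = False   # list() marks the conditions it consumed

    # WHERE url = 'https://example.com' AND crawl_depth = 1 AND per_url_limit = 5
    conditions = [
        FilterCondition('url', FilterOperator.EQUAL, 'https://example.com'),
        FilterCondition('crawl_depth', FilterOperator.EQUAL, 1),
        FilterCondition('per_url_limit', FilterOperator.EQUAL, 5),
    ]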
mindsdb/integrations/handlers/youtube_handler/__init__.py

@@ -5,6 +5,7 @@ from .__about__ import __version__ as version, __description__ as description
 
 try:
     from .youtube_handler import YoutubeHandler as Handler
+    from .connection_args import connection_args
    import_error = None
 except Exception as e:
     Handler = None
@@ -24,4 +25,5 @@ __all__ = [
     "description",
     "import_error",
     "icon_path",
+    "connection_args",
 ]
mindsdb/integrations/handlers/youtube_handler/connection_args.py (new file)

@@ -0,0 +1,32 @@
+from collections import OrderedDict
+
+from mindsdb.integrations.libs.const import HANDLER_CONNECTION_ARG_TYPE as ARG_TYPE
+
+
+connection_args = OrderedDict(
+    youtube_api_token={
+        'type': ARG_TYPE.STR,
+        'description': 'Youtube API Token',
+        'label': 'Youtube API Token',
+    },
+    credentials_url={
+        'type': ARG_TYPE.STR,
+        'description': 'URL to Service Account Keys',
+        'label': 'URL to Service Account Keys',
+    },
+    credentials_file={
+        'type': ARG_TYPE.STR,
+        'description': 'Location of Service Account Keys',
+        'label': 'Path to Service Account Keys',
+    },
+    credentials={
+        'type': ARG_TYPE.PATH,
+        'description': 'Service Account Keys',
+        'label': 'Upload Service Account Keys',
+    },
+    code={
+        'type': ARG_TYPE.STR,
+        'description': 'Code After Authorisation',
+        'label': 'Code After Authorisation',
+    },
+)
mindsdb/integrations/libs/llm/utils.py

@@ -115,6 +115,11 @@ def get_llm_config(provider: str, args: Dict) -> BaseLLMConfig:
     """
     temperature = min(1.0, max(0.0, args.get("temperature", 0.0)))
     if provider == "openai":
+
+        if any(x in args.get("model_name", "") for x in ['o1', 'o3']):
+            # for o1 and o3, 'temperature' does not support 0.0; only the default value (1) is supported
+            temperature = 1
+
         return OpenAIConfig(
             model_name=args.get("model_name", DEFAULT_OPENAI_MODEL),
             temperature=temperature,
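A standalone restatement of the new clamp, assuming only that OpenAI's o1/o3 reasoning models reject non-default temperatures:

    def effective_temperature(model_name: str, requested: float = 0.0) -> float:
        temperature = min(1.0, max(0.0, requested))
        if any(x in model_name for x in ['o1', 'o3']):
            return 1   # these models only accept the API default
        return temperature

    assert effective_temperature('o3-mini') == 1
    assert effective_temperature('gpt-4o', 0.2) == 0.2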
mindsdb/integrations/libs/process_cache.py

@@ -186,7 +186,6 @@ class ProcessCache:
         self._keep_alive = {}
         self._stop_event = threading.Event()
         self.cleaner_thread = None
-        self._start_clean()
 
     def __del__(self):
         self._stop_clean()
@@ -200,7 +199,7 @@ class ProcessCache:
         ):
             return
         self._stop_event.clear()
-        self.cleaner_thread = threading.Thread(target=self._clean)
+        self.cleaner_thread = threading.Thread(target=self._clean, name='ProcessCache.clean')
         self.cleaner_thread.daemon = True
         self.cleaner_thread.start()
@@ -258,6 +257,7 @@ class ProcessCache:
         Returns:
             Future
         """
+        self._start_clean()
         handler_module_path = payload['handler_meta']['module_path']
         integration_id = payload['handler_meta']['integration_id']
         if task_type in (ML_TASK_TYPE.LEARN, ML_TASK_TYPE.FINETUNE):