MindsDB 25.7.4.0__py3-none-any.whl → 25.8.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic. Click here for more details.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +13 -1
- mindsdb/api/a2a/agent.py +6 -16
- mindsdb/api/a2a/common/types.py +3 -4
- mindsdb/api/a2a/task_manager.py +24 -35
- mindsdb/api/a2a/utils.py +63 -0
- mindsdb/api/executor/command_executor.py +9 -15
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +21 -24
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +9 -3
- mindsdb/api/executor/sql_query/steps/subselect_step.py +11 -8
- mindsdb/api/executor/utilities/mysql_to_duckdb_functions.py +264 -0
- mindsdb/api/executor/utilities/sql.py +30 -0
- mindsdb/api/http/initialize.py +2 -1
- mindsdb/api/http/namespaces/agents.py +6 -7
- mindsdb/api/http/namespaces/views.py +56 -72
- mindsdb/integrations/handlers/db2_handler/db2_handler.py +19 -23
- mindsdb/integrations/handlers/gong_handler/__about__.py +2 -0
- mindsdb/integrations/handlers/gong_handler/__init__.py +30 -0
- mindsdb/integrations/handlers/gong_handler/connection_args.py +37 -0
- mindsdb/integrations/handlers/gong_handler/gong_handler.py +164 -0
- mindsdb/integrations/handlers/gong_handler/gong_tables.py +508 -0
- mindsdb/integrations/handlers/gong_handler/icon.svg +25 -0
- mindsdb/integrations/handlers/gong_handler/test_gong_handler.py +125 -0
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +8 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +203 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +360 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -7
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -7
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +1 -2
- mindsdb/integrations/handlers/openai_handler/constants.py +11 -30
- mindsdb/integrations/handlers/openai_handler/helpers.py +27 -34
- mindsdb/integrations/handlers/openai_handler/openai_handler.py +14 -12
- mindsdb/integrations/handlers/salesforce_handler/constants.py +9 -2
- mindsdb/integrations/libs/llm/config.py +0 -14
- mindsdb/integrations/libs/llm/utils.py +0 -15
- mindsdb/integrations/utilities/files/file_reader.py +5 -19
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +1 -1
- mindsdb/interfaces/agents/agents_controller.py +83 -45
- mindsdb/interfaces/agents/constants.py +16 -3
- mindsdb/interfaces/agents/langchain_agent.py +84 -21
- mindsdb/interfaces/database/projects.py +111 -7
- mindsdb/interfaces/knowledge_base/controller.py +7 -1
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +6 -10
- mindsdb/interfaces/knowledge_base/preprocessing/text_splitter.py +73 -0
- mindsdb/interfaces/query_context/context_controller.py +14 -15
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +7 -1
- mindsdb/interfaces/skills/skill_tool.py +7 -1
- mindsdb/interfaces/skills/sql_agent.py +6 -2
- mindsdb/utilities/config.py +2 -0
- mindsdb/utilities/fs.py +60 -17
- {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.3.0.dist-info}/METADATA +277 -262
- {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.3.0.dist-info}/RECORD +57 -56
- mindsdb/integrations/handlers/anyscale_endpoints_handler/__about__.py +0 -9
- mindsdb/integrations/handlers/anyscale_endpoints_handler/__init__.py +0 -20
- mindsdb/integrations/handlers/anyscale_endpoints_handler/anyscale_endpoints_handler.py +0 -290
- mindsdb/integrations/handlers/anyscale_endpoints_handler/creation_args.py +0 -14
- mindsdb/integrations/handlers/anyscale_endpoints_handler/icon.svg +0 -4
- mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -2
- mindsdb/integrations/handlers/anyscale_endpoints_handler/settings.py +0 -51
- mindsdb/integrations/handlers/anyscale_endpoints_handler/tests/test_anyscale_endpoints_handler.py +0 -212
- /mindsdb/integrations/handlers/{anyscale_endpoints_handler/tests/__init__.py → gong_handler/requirements.txt} +0 -0
- {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.3.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.3.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.3.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from typing import List
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class TextSplitter:
    def __init__(
        self,
        chunk_size: int = 1000,
        chunk_overlap: int = 200,
        separators: List[str] = None,
        k_range: float = 0.5,
        k_ratio: float = 1,
    ):
        """
        Split text into chunks that never exceed ``chunk_size``.

        How it works:
        - Take a window of ``chunk_size`` characters and look for a separator
          near the end of the window.
        - The accepted search range for separator number ``num`` is:
              k_range * chunk_size / (num * k_ratio + 1)
          so lower-priority separators are only accepted closer to the window end.
        - If a separator falls outside its range, the next separator is tried.
        - If the chosen separator is whitespace (the cut lands mid-sentence),
          overlapping kicks in: the search is repeated with a relaxed range, and
          if that earlier cut point is within ``chunk_overlap`` characters, the
          next chunk restarts from it so context is carried over.

        :param chunk_size: size of the chunk, which must not be exceeded
        :param chunk_overlap: max distance to move the next-chunk start back when overlapping
        :param separators: list of separators in order of priority
        :param k_range: defines the range to look for the separator
        :param k_ratio: defines how much to shrink the range for the next separator
        """
        if separators is None:
            separators = ["\n\n", "\n", ". ", " ", ""]
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators = separators
        self.k_range = k_range
        self.k_ratio = k_ratio

    def split_text(self, text: str) -> List[str]:
        """Cut ``text`` into a list of chunks, each shorter than ``chunk_size``."""
        chunks: List[str] = []
        remaining = text
        while len(remaining) >= self.chunk_size:
            _, piece, shift = self.get_next_chunk(remaining, self.k_range, self.k_ratio)
            chunks.append(piece)
            remaining = remaining[shift:]
        # whatever is left already fits into a single chunk
        chunks.append(remaining)
        return chunks

    def get_next_chunk(self, text: str, k_range: float, k_ratio: float):
        """
        Find the best cut point in the head of ``text``.

        Returns a tuple ``(separator, chunk, shift)``: ``chunk`` is the text
        before the separator, ``shift`` is the offset at which the next search
        iteration should continue.
        """
        window = text[: self.chunk_size]
        for rank, sep in enumerate(self.separators):
            cut = window.rfind(sep)

            # Distance of the separator from the window end. NOTE(review): a
            # missing separator (rfind == -1) yields chunk_size + 1, which can
            # still pass the check once k_range has been relaxed by recursion.
            tail_distance = self.chunk_size - cut
            if tail_distance < k_range * self.chunk_size / (rank * k_ratio + 1):
                shift = cut + len(sep)
                if sep.strip(" ") == "":
                    # Whitespace cut lands mid-sentence: retry with a wider,
                    # rank-independent range and overlap from the earlier cut.
                    prev_sep, _, prev_shift = self.get_next_chunk(text, k_range * 1.5, 0)
                    if prev_sep.strip(" ") != "" and shift - prev_shift < self.chunk_overlap:
                        # use shift of the previous (stronger) separator
                        shift = prev_shift

                return sep, window[:cut], shift

        raise RuntimeError("Cannot split text")
|
|
@@ -45,7 +45,7 @@ class RunningQuery:
|
|
|
45
45
|
for df in dn.query_stream(query2, fetch_size=self.batch_size):
|
|
46
46
|
max_track_value = self.get_max_track_value(df)
|
|
47
47
|
yield df
|
|
48
|
-
self.set_progress(
|
|
48
|
+
self.set_progress(max_track_value=max_track_value)
|
|
49
49
|
|
|
50
50
|
else:
|
|
51
51
|
while True:
|
|
@@ -59,7 +59,7 @@ class RunningQuery:
|
|
|
59
59
|
|
|
60
60
|
max_track_value = self.get_max_track_value(df)
|
|
61
61
|
yield df
|
|
62
|
-
self.set_progress(
|
|
62
|
+
self.set_progress(max_track_value=max_track_value)
|
|
63
63
|
|
|
64
64
|
def get_partition_query(self, step_num: int, query: Select, stream=False) -> Select:
|
|
65
65
|
"""
|
|
@@ -178,24 +178,23 @@ class RunningQuery:
|
|
|
178
178
|
# stream mode
|
|
179
179
|
return None
|
|
180
180
|
|
|
181
|
-
def set_progress(self,
|
|
181
|
+
def set_progress(self, processed_rows: int = None, max_track_value: int = None):
|
|
182
182
|
"""
|
|
183
183
|
Store progress of the query; it is called after processing of a batch
|
|
184
184
|
"""
|
|
185
185
|
|
|
186
|
-
if
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
self.record.processed_rows = self.record.processed_rows + len(df)
|
|
190
|
-
|
|
191
|
-
cur_value = self.record.context.get("track_value")
|
|
192
|
-
new_value = max_track_value
|
|
193
|
-
if new_value is not None:
|
|
194
|
-
if cur_value is None or new_value > cur_value:
|
|
195
|
-
self.record.context["track_value"] = new_value
|
|
196
|
-
flag_modified(self.record, "context")
|
|
186
|
+
if processed_rows is not None and processed_rows > 0:
|
|
187
|
+
self.record.processed_rows = self.record.processed_rows + processed_rows
|
|
188
|
+
db.session.commit()
|
|
197
189
|
|
|
198
|
-
|
|
190
|
+
if max_track_value is not None:
|
|
191
|
+
cur_value = self.record.context.get("track_value")
|
|
192
|
+
new_value = max_track_value
|
|
193
|
+
if new_value is not None:
|
|
194
|
+
if cur_value is None or new_value > cur_value:
|
|
195
|
+
self.record.context["track_value"] = new_value
|
|
196
|
+
flag_modified(self.record, "context")
|
|
197
|
+
db.session.commit()
|
|
199
198
|
|
|
200
199
|
def on_error(self, error: Exception, step_num: int, steps_data: dict):
|
|
201
200
|
"""
|
|
@@ -15,6 +15,7 @@ from mindsdb.interfaces.skills.custom.text2sql.mindsdb_kb_tools import (
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
class MindsDBSQLToolkit(SQLDatabaseToolkit):
|
|
18
|
+
include_tables_tools: bool = True
|
|
18
19
|
include_knowledge_base_tools: bool = True
|
|
19
20
|
|
|
20
21
|
def get_tools(self, prefix="") -> List[BaseTool]:
|
|
@@ -212,8 +213,13 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
|
|
|
212
213
|
)
|
|
213
214
|
|
|
214
215
|
# Return standard SQL tools and knowledge base tools
|
|
215
|
-
|
|
216
|
+
kb_tools = [
|
|
216
217
|
kb_list_tool,
|
|
217
218
|
kb_info_tool,
|
|
218
219
|
kb_query_tool,
|
|
219
220
|
]
|
|
221
|
+
|
|
222
|
+
if not self.include_tables_tools:
|
|
223
|
+
return kb_tools
|
|
224
|
+
else:
|
|
225
|
+
return sql_tools + kb_tools
|
|
@@ -347,7 +347,13 @@ class SkillToolController:
|
|
|
347
347
|
)
|
|
348
348
|
db = MindsDBSQL.custom_init(sql_agent=sql_agent)
|
|
349
349
|
should_include_kb_tools = include_knowledge_bases is not None and len(include_knowledge_bases) > 0
|
|
350
|
-
|
|
350
|
+
should_include_tables_tools = len(databases_struct) > 0 or len(tables_list) > 0
|
|
351
|
+
toolkit = MindsDBSQLToolkit(
|
|
352
|
+
db=db,
|
|
353
|
+
llm=llm,
|
|
354
|
+
include_tables_tools=should_include_tables_tools,
|
|
355
|
+
include_knowledge_base_tools=should_include_kb_tools,
|
|
356
|
+
)
|
|
351
357
|
return toolkit.get_tools()
|
|
352
358
|
|
|
353
359
|
def _make_retrieval_tools(self, skill: db.Skills, llm, embedding_model):
|
|
@@ -405,6 +405,7 @@ class SQLAgent:
|
|
|
405
405
|
tables_idx[tuple(table.parts)] = table
|
|
406
406
|
|
|
407
407
|
tables = []
|
|
408
|
+
not_found = []
|
|
408
409
|
for table_name in table_names:
|
|
409
410
|
if not table_name.strip():
|
|
410
411
|
continue
|
|
@@ -419,9 +420,12 @@ class SQLAgent:
|
|
|
419
420
|
table_identifier = tables_idx.get(tuple(table_parts))
|
|
420
421
|
|
|
421
422
|
if table_identifier is None:
|
|
422
|
-
|
|
423
|
-
|
|
423
|
+
not_found.append(table_name)
|
|
424
|
+
else:
|
|
425
|
+
tables.append(table_identifier)
|
|
424
426
|
|
|
427
|
+
if not_found:
|
|
428
|
+
raise ValueError(f"Tables: {', '.join(not_found)} not found in the database")
|
|
425
429
|
return tables
|
|
426
430
|
|
|
427
431
|
def get_knowledge_base_info(self, kb_names: Optional[List[str]] = None) -> str:
|
mindsdb/utilities/config.py
CHANGED
|
@@ -599,6 +599,7 @@ class Config:
|
|
|
599
599
|
ml_task_queue_consumer=None,
|
|
600
600
|
agent=None,
|
|
601
601
|
project=None,
|
|
602
|
+
update_gui=False,
|
|
602
603
|
)
|
|
603
604
|
return
|
|
604
605
|
|
|
@@ -635,6 +636,7 @@ class Config:
|
|
|
635
636
|
help="MindsDB agent name to connect to",
|
|
636
637
|
)
|
|
637
638
|
parser.add_argument("--project-name", type=str, default=None, help="MindsDB project name")
|
|
639
|
+
parser.add_argument("--update-gui", action="store_true", default=False, help="Update GUI and exit")
|
|
638
640
|
|
|
639
641
|
self._cmd_args = parser.parse_args()
|
|
640
642
|
|
mindsdb/utilities/fs.py
CHANGED
|
@@ -12,6 +12,10 @@ from mindsdb.utilities import log
|
|
|
12
12
|
logger = log.getLogger(__name__)
|
|
13
13
|
|
|
14
14
|
|
|
15
|
+
def get_tmp_dir() -> Path:
    """Return the mindsdb scratch directory inside the system temp directory."""
    return Path(tempfile.gettempdir()) / "mindsdb"
|
|
17
|
+
|
|
18
|
+
|
|
15
19
|
def _get_process_mark_id(unified: bool = False) -> str:
|
|
16
20
|
"""Creates a text that can be used to identify process+thread
|
|
17
21
|
Args:
|
|
@@ -26,7 +30,7 @@ def _get_process_mark_id(unified: bool = False) -> str:
|
|
|
26
30
|
|
|
27
31
|
|
|
28
32
|
def create_process_mark(folder="learn"):
|
|
29
|
-
p =
|
|
33
|
+
p = get_tmp_dir().joinpath(f"processes/{folder}/")
|
|
30
34
|
p.mkdir(parents=True, exist_ok=True)
|
|
31
35
|
mark = _get_process_mark_id()
|
|
32
36
|
p.joinpath(mark).touch()
|
|
@@ -43,7 +47,7 @@ def set_process_mark(folder: str, mark: str) -> None:
|
|
|
43
47
|
Returns:
|
|
44
48
|
str: process mark
|
|
45
49
|
"""
|
|
46
|
-
p =
|
|
50
|
+
p = get_tmp_dir().joinpath(f"processes/{folder}/")
|
|
47
51
|
p.mkdir(parents=True, exist_ok=True)
|
|
48
52
|
mark = f"{os.getpid()}-{threading.get_native_id()}-{mark}"
|
|
49
53
|
p.joinpath(mark).touch()
|
|
@@ -53,11 +57,7 @@ def set_process_mark(folder: str, mark: str) -> None:
|
|
|
53
57
|
def delete_process_mark(folder: str = "learn", mark: Optional[str] = None):
|
|
54
58
|
if mark is None:
|
|
55
59
|
mark = _get_process_mark_id()
|
|
56
|
-
p = (
|
|
57
|
-
Path(tempfile.gettempdir())
|
|
58
|
-
.joinpath(f"mindsdb/processes/{folder}/")
|
|
59
|
-
.joinpath(mark)
|
|
60
|
-
)
|
|
60
|
+
p = get_tmp_dir().joinpath(f"processes/{folder}/").joinpath(mark)
|
|
61
61
|
if p.exists():
|
|
62
62
|
p.unlink()
|
|
63
63
|
|
|
@@ -65,7 +65,7 @@ def delete_process_mark(folder: str = "learn", mark: Optional[str] = None):
|
|
|
65
65
|
def clean_process_marks():
|
|
66
66
|
"""delete all existing processes marks"""
|
|
67
67
|
logger.debug("Deleting PIDs..")
|
|
68
|
-
p =
|
|
68
|
+
p = get_tmp_dir().joinpath("processes/")
|
|
69
69
|
if p.exists() is False:
|
|
70
70
|
return
|
|
71
71
|
for path in p.iterdir():
|
|
@@ -81,7 +81,7 @@ def get_processes_dir_files_generator() -> Tuple[Path, int, int]:
|
|
|
81
81
|
Yields:
|
|
82
82
|
Tuple[Path, int, int]: file object, process id and thread id
|
|
83
83
|
"""
|
|
84
|
-
p =
|
|
84
|
+
p = get_tmp_dir().joinpath("processes/")
|
|
85
85
|
if p.exists() is False:
|
|
86
86
|
return
|
|
87
87
|
for path in p.iterdir():
|
|
@@ -112,9 +112,7 @@ def clean_unlinked_process_marks() -> List[int]:
|
|
|
112
112
|
try:
|
|
113
113
|
next(t for t in threads if t.id == thread_id)
|
|
114
114
|
except StopIteration:
|
|
115
|
-
logger.warning(
|
|
116
|
-
f"We have mark for process/thread {process_id}/{thread_id} but it does not exists"
|
|
117
|
-
)
|
|
115
|
+
logger.warning(f"We have mark for process/thread {process_id}/{thread_id} but it does not exists")
|
|
118
116
|
deleted_pids.append(process_id)
|
|
119
117
|
file.unlink()
|
|
120
118
|
|
|
@@ -124,14 +122,59 @@ def clean_unlinked_process_marks() -> List[int]:
|
|
|
124
122
|
continue
|
|
125
123
|
|
|
126
124
|
except psutil.NoSuchProcess:
|
|
127
|
-
logger.warning(
|
|
128
|
-
f"We have mark for process/thread {process_id}/{thread_id} but it does not exists"
|
|
129
|
-
)
|
|
125
|
+
logger.warning(f"We have mark for process/thread {process_id}/{thread_id} but it does not exists")
|
|
130
126
|
deleted_pids.append(process_id)
|
|
131
127
|
file.unlink()
|
|
132
128
|
return deleted_pids
|
|
133
129
|
|
|
134
130
|
|
|
131
|
+
def create_pid_file():
    """
    Write the current process id into the mindsdb PID file.

    No-op unless the USE_PIDFILE environment variable equals "1". If a PID
    file left by a previous run points at a still-running process, an
    exception is raised; a stale file (dead process or unparsable pid) is
    removed before the new one is written.
    """

    if os.environ.get("USE_PIDFILE") != "1":
        return

    base_dir = get_tmp_dir()
    base_dir.mkdir(parents=True, exist_ok=True)
    pid_file = base_dir.joinpath("pid")

    if pid_file.exists():
        pid = pid_file.read_text().strip()
        try:
            # If the recorded process is still alive, refuse to start a second instance.
            psutil.Process(int(pid))
            raise Exception(f"Found PID file with existing process: {pid} {pid_file}")
        except (psutil.Error, ValueError):
            # Dead process or garbage in the file: treat the PID file as stale.
            ...

        logger.warning(f"Found existing PID file {pid_file}({pid}), removing")
        pid_file.unlink()

    pid_file.write_text(str(os.getpid()))
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def delete_pid_file():
    """
    Delete the mindsdb PID file, but only when it belongs to this process.

    No-op unless the USE_PIDFILE environment variable equals "1" or when no
    PID file exists. If the stored pid differs from the current process id,
    the file is left in place and a warning is logged.
    """

    if os.environ.get("USE_PIDFILE") != "1":
        return

    pid_file = get_tmp_dir().joinpath("pid")
    if not pid_file.exists():
        return

    stored_pid = pid_file.read_text().strip()
    if stored_pid == str(os.getpid()):
        pid_file.unlink()
    else:
        logger.warning(f"Process id in PID file ({pid_file}) doesn't match mindsdb pid")
|
|
176
|
+
|
|
177
|
+
|
|
135
178
|
def __is_within_directory(directory, target):
|
|
136
179
|
abs_directory = os.path.abspath(directory)
|
|
137
180
|
abs_target = os.path.abspath(target)
|
|
@@ -141,8 +184,8 @@ def __is_within_directory(directory, target):
|
|
|
141
184
|
|
|
142
185
|
def safe_extract(tarfile, path=".", members=None, *, numeric_owner=False):
|
|
143
186
|
# for py >= 3.12
|
|
144
|
-
if hasattr(tarfile,
|
|
145
|
-
tarfile.extractall(path, members=members, numeric_owner=numeric_owner, filter=
|
|
187
|
+
if hasattr(tarfile, "data_filter"):
|
|
188
|
+
tarfile.extractall(path, members=members, numeric_owner=numeric_owner, filter="data")
|
|
146
189
|
return
|
|
147
190
|
|
|
148
191
|
# for py < 3.12
|