PyPI - MindsDB - Versions diffs - 25.7.4.0__py3-none-any.whl → 25.8.3.0__py3-none-any.whl - Mend

MindsDB 25.7.4.0__py3-none-any.whl → 25.8.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of MindsDB might be problematic. Click here for more details.

Files changed (65)

mindsdb/interfaces/agents/agents_controller.py CHANGED Viewed

@@ -145,11 +145,60 @@ class AgentsController:
         return all_agents.all()
+    def _create_default_sql_skill(
+        self,
+        name,
+        project_name,
+        include_tables: List[str] = None,
+        include_knowledge_bases: List[str] = None,
+    ):
+        # Create a default SQL skill
+        skill_name = f"{name}_sql_skill"
+        skill_params = {
+            "type": "sql",
+            "description": f"Auto-generated SQL skill for agent {name}",
+        }
+        # Add restrictions provided
+        if include_tables:
+            skill_params["include_tables"] = include_tables
+        if include_knowledge_bases:
+            skill_params["include_knowledge_bases"] = include_knowledge_bases
+        try:
+            # Check if skill already exists
+            existing_skill = self.skills_controller.get_skill(skill_name, project_name)
+            if existing_skill is None:
+                # Create the skill
+                skill_type = skill_params.pop("type")
+                self.skills_controller.add_skill(
+                    name=skill_name, project_name=project_name, type=skill_type, params=skill_params
+                )
+            else:
+                # Update the skill if parameters have changed
+                params_changed = False
+                # Check if skill parameters need to be updated
+                for param_key, param_value in skill_params.items():
+                    if existing_skill.params.get(param_key) != param_value:
+                        existing_skill.params[param_key] = param_value
+                        params_changed = True
+                # Update the skill if needed
+                if params_changed:
+                    flag_modified(existing_skill, "params")
+                    db.session.commit()
+        except Exception as e:
+            raise ValueError(f"Failed to auto-create or update SQL skill: {str(e)}")
+        return skill_name
     def add_agent(
         self,
         name: str,
         project_name: str = None,
-        model_name: str = None,
+        model_name: Union[str, dict] = None,
         skills: List[Union[str, dict]] = None,
         provider: str = None,
         params: Dict[str, Any] = None,
@@ -256,46 +305,13 @@ class AgentsController:
         # Auto-create SQL skill if no skills are provided but include_tables or include_knowledge_bases params are provided
         if not skills and (include_tables or include_knowledge_bases):
-            # Create a default SQL skill
-            skill_name = f"{name}_sql_skill"
-            skill_params = {
-                "type": "sql",
-                "description": f"Auto-generated SQL skill for agent {name}",
-            }
-            # Add restrictions provided
-            if include_tables:
-                skill_params["include_tables"] = include_tables
-            if include_knowledge_bases:
-                skill_params["include_knowledge_bases"] = include_knowledge_bases
-            try:
-                # Check if skill already exists
-                existing_skill = self.skills_controller.get_skill(skill_name, project_name)
-                if existing_skill is None:
-                    # Create the skill
-                    skill_type = skill_params.pop("type")
-                    self.skills_controller.add_skill(
-                        name=skill_name, project_name=project_name, type=skill_type, params=skill_params
-                    )
-                else:
-                    # Update the skill if parameters have changed
-                    params_changed = False
-                    # Check if skill parameters need to be updated
-                    for param_key, param_value in skill_params.items():
-                        if existing_skill.params.get(param_key) != param_value:
-                            existing_skill.params[param_key] = param_value
-                            params_changed = True
-                    # Update the skill if needed
-                    if params_changed:
-                        flag_modified(existing_skill, "params")
-                        db.session.commit()
-                skills = [skill_name]
-            except Exception as e:
-                raise ValueError(f"Failed to auto-create or update SQL skill: {str(e)}")
+            skill = self._create_default_sql_skill(
+                name,
+                project_name,
+                include_tables=include_tables,
+                include_knowledge_bases=include_knowledge_bases,
+            )
+            skills = [skill]
         agent = db.Agents(
             name=name,
@@ -351,7 +367,7 @@ class AgentsController:
         agent_name: str,
         project_name: str = default_project,
         name: str = None,
-        model_name: str = None,
+        model_name: Union[str, dict] = None,
         skills_to_add: List[Union[str, dict]] = None,
         skills_to_remove: List[str] = None,
         skills_to_rewrite: List[Union[str, dict]] = None,
@@ -365,7 +381,7 @@ class AgentsController:
             agent_name (str): The name of the new agent, or existing agent to update
             project_name (str): The containing project
             name (str): The updated name of the agent
-            model_name (str): The name of the existing ML model the agent will use
+            model_name (str | dict): The name of the existing ML model the agent will use
             skills_to_add (List[Union[str, dict]]): List of skill names to add to the agent, or list of dicts
                  with one of keys is "name", and other is additional parameters for relationship agent<>skill
             skills_to_remove (List[str]): List of skill names to remove from the agent
@@ -394,6 +410,8 @@ class AgentsController:
         existing_agent = self.get_agent(agent_name, project_name=project_name)
         if existing_agent is None:
             raise EntityNotExistsError(f"Agent with name not found: {agent_name}")
+        existing_params = existing_agent.params or {}
         is_demo = (existing_agent.params or {}).get("is_demo", False)
         if is_demo and (
             (name is not None and name != agent_name)
@@ -413,12 +431,34 @@ class AgentsController:
             existing_agent.name = name
         if model_name or provider:
+            if isinstance(model_name, dict):
+                # move into params
+                existing_params["model"] = model_name
+                model_name = None
             # check model and provider
             model, provider = self.check_model_provider(model_name, provider)
             # Update model and provider
             existing_agent.model_name = model_name
             existing_agent.provider = provider
+        if "data" in params:
+            if len(skills_to_add) > 0 or len(skills_to_remove) > 0:
+                raise ValueError(
+                    "'data' parameter cannot be used with 'skills_to_remove' or 'skills_to_add' parameters"
+                )
+            include_knowledge_bases = params["data"].get("knowledge_bases")
+            include_tables = params["data"].get("tables")
+            skill = self._create_default_sql_skill(
+                agent_name,
+                project_name,
+                include_tables=include_tables,
+                include_knowledge_bases=include_knowledge_bases,
+            )
+            skills_to_rewrite = [{"name": skill}]
         # check that all skills exist
         skill_name_to_record_map = {}
         for skill_meta in skills_to_add + skills_to_remove + skills_to_rewrite:
@@ -496,8 +536,6 @@ class AgentsController:
                 db.session.add(association)
         if params is not None:
-            existing_params = existing_agent.params or {}
             if params.get("data", {}).get("tables"):
                 new_table_entries = set(params["data"]["tables"]) - set(
                     existing_params.get("data", {}).get("tables", [])

mindsdb/interfaces/agents/constants.py CHANGED Viewed

@@ -26,7 +26,6 @@ OPEN_AI_CHAT_MODELS = (
 SUPPORTED_PROVIDERS = {
     "openai",
     "anthropic",
-    "anyscale",
     "litellm",
     "ollama",
     "nvidia_nim",
@@ -213,7 +212,14 @@ DEFAULT_TIKTOKEN_MODEL_NAME = os.getenv("DEFAULT_TIKTOKEN_MODEL_NAME", "gpt-4")
 AGENT_CHUNK_POLLING_INTERVAL_SECONDS = os.getenv("AGENT_CHUNK_POLLING_INTERVAL_SECONDS", 1.0)
 DEFAULT_TEXT2SQL_DATABASE = "mindsdb"
 DEFAULT_AGENT_SYSTEM_PROMPT = """
-You are an AI assistant powered by MindsDB. When answering questions, follow these guidelines:
+You are an AI assistant powered by MindsDB. You have access to conversation history and should use it to provide contextual responses. When answering questions, follow these guidelines:
+**CONVERSATION CONTEXT:**
+- You have access to previous messages in this conversation through your memory system
+- When users ask about previous questions, topics, or context, refer to the conversation history
+- Maintain conversational continuity and reference earlier parts of the conversation when relevant
+- When asked to retrieve or list past user questions, examine your conversation memory to identify and list previous user queries
+- You can reference specific past questions by their content or by their position in the conversation (e.g., "your first question", "the question you asked earlier about...")
 1. For factual questions about specific topics, use the knowledge base tools in this sequence:
    - First use kb_list_tool to see available knowledge bases
@@ -231,7 +237,14 @@ For factual questions, ALWAYS use the available tools to look up information rat
 """
-MINDSDB_PREFIX = """You are an AI assistant powered by MindsDB. When answering questions, follow these guidelines:
+MINDSDB_PREFIX = """You are an AI assistant powered by MindsDB. You have access to conversation history and should use it to provide contextual responses. When answering questions, follow these guidelines:
+**CONVERSATION CONTEXT:**
+- You have access to previous messages in this conversation through your memory system
+- When users ask about previous questions, topics, or context, refer to the conversation history
+- Maintain conversational continuity and reference earlier parts of the conversation when relevant
+- When asked to retrieve or list past user questions, examine your conversation memory to identify and list previous user queries
+- You can reference specific past questions by their content or by their position in the conversation (e.g., "your first question", "the question you asked earlier about...")
 1. For questions about database tables and their contents:
    - Use the sql_db_query to query the tables directly

mindsdb/interfaces/agents/langchain_agent.py CHANGED Viewed

@@ -7,15 +7,17 @@ import re
 import threading
 import numpy as np
 import pandas as pd
+import logging
 from langchain.agents import AgentExecutor
 from langchain.agents.initialize import initialize_agent
 from langchain.chains.conversation.memory import ConversationSummaryBufferMemory
-from langchain_community.chat_models import ChatAnyscale, ChatLiteLLM, ChatOllama
+from langchain_community.chat_models import ChatLiteLLM, ChatOllama
 from langchain_writer import ChatWriter
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_core.agents import AgentAction, AgentStep
 from langchain_core.callbacks.base import BaseCallbackHandler
+from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
 from langchain_nvidia_ai_endpoints import ChatNVIDIA
 from langchain_core.messages.base import BaseMessage
@@ -63,7 +65,6 @@ from mindsdb.interfaces.agents.constants import (
 )
 from mindsdb.interfaces.skills.skill_tool import skill_tool, SkillData
 from langchain_anthropic import ChatAnthropic
-from langchain_core.messages import SystemMessage
 from langchain_openai import ChatOpenAI
 from mindsdb.utilities.langfuse import LangfuseClientWrapper
@@ -165,8 +166,6 @@ def create_chat_model(args: Dict):
         except NotImplementedError:
             chat_open_ai.tiktoken_model_name = DEFAULT_TIKTOKEN_MODEL_NAME
         return chat_open_ai
-    if args["provider"] == "anyscale":
-        return ChatAnyscale(**model_kwargs)
     if args["provider"] == "litellm":
         return ChatLiteLLM(**model_kwargs)
     if args["provider"] == "ollama":
@@ -299,6 +298,11 @@ class LangchainAgent:
         if "prompt_template" in args:
             logger.info(f"Using prompt template: {args['prompt_template'][:50]}...")
+        if "model_name" not in args:
+            raise ValueError(
+                "No model name provided for agent. Provide it in the model parameter or in the default model setup."
+            )
         return args
     def get_metadata(self) -> Dict:
@@ -347,15 +351,20 @@ class LangchainAgent:
         args.update(params or {})
         df = pd.DataFrame(messages)
+        logger.info(f"LangchainAgent.get_completion: Received {len(messages)} messages")
+        if logger.isEnabledFor(logging.DEBUG):
+            logger.debug(f"Messages DataFrame shape: {df.shape}")
+            logger.debug(f"Messages DataFrame columns: {df.columns.tolist()}")
+            logger.debug(f"Messages DataFrame content: {df.to_dict('records')}")
         # Back compatibility for old models
         self.provider = args.get("provider", get_llm_provider(args))
         df = df.reset_index(drop=True)
         agent = self.create_agent(df)
-        # Use last message as prompt, remove other questions.
-        user_column = args.get("user_column", USER_COLUMN)
-        df.iloc[:-1, df.columns.get_loc(user_column)] = None
+        # Keep conversation history for context - don't nullify previous messages
+        # Only use the last message as the current prompt, but preserve history for agent memory
         response = self.run_agent(df, agent, args)
         # End the run completion span and update the metadata with tool usage
@@ -376,6 +385,12 @@ class LangchainAgent:
         args = self.args
         df = pd.DataFrame(messages)
+        logger.info(f"LangchainAgent._get_completion_stream: Received {len(messages)} messages")
+        # Check if we have the expected columns for conversation history
+        if "question" in df.columns and "answer" in df.columns:
+            logger.debug("DataFrame has question/answer columns for conversation history")
+        else:
+            logger.warning("DataFrame missing question/answer columns! Available columns: {df.columns.tolist()}")
         self.embedding_model_provider = args.get("embedding_model_provider", get_embedding_model_provider(args))
         # Back compatibility for old models
@@ -383,9 +398,8 @@ class LangchainAgent:
         df = df.reset_index(drop=True)
         agent = self.create_agent(df)
-        # Use last message as prompt, remove other questions.
-        user_column = args.get("user_column", USER_COLUMN)
-        df.iloc[:-1, df.columns.get_loc(user_column)] = None
+        # Keep conversation history for context - don't nullify previous messages
+        # Only use the last message as the current prompt, but preserve history for agent memory
         return self.stream_agent(df, agent, args)
     def create_agent(self, df: pd.DataFrame) -> AgentExecutor:
@@ -405,7 +419,8 @@ class LangchainAgent:
         # Prefer prediction prompt template over original if provided.
         prompt_template = args["prompt_template"]
-        # Set up memory.
+        # Modern LangChain approach: Use memory but populate it correctly
+        # Create memory and populate with conversation history
         memory = ConversationSummaryBufferMemory(
             llm=llm,
             input_key="input",
@@ -414,17 +429,41 @@ class LangchainAgent:
             memory_key="chat_history",
         )
+        # Add system message first
         memory.chat_memory.messages.insert(0, SystemMessage(content=prompt_template))
-        # User - Assistant conversation. All except the last message.
         user_column = args.get("user_column", USER_COLUMN)
         assistant_column = args.get("assistant_column", ASSISTANT_COLUMN)
-        for row in df[:-1].to_dict("records"):
-            question = row[user_column]
-            answer = row[assistant_column]
+        logger.info(f"Processing conversation history: {len(df)} total messages, {len(df[:-1])} history messages")
+        logger.debug(f"User column: {user_column}, Assistant column: {assistant_column}")
+        # Process history messages (all except the last one which is current message)
+        history_df = df[:-1]
+        if len(history_df) == 0:
+            logger.debug("No history rows to process - this is normal for first message")
+        history_count = 0
+        for i, row in enumerate(history_df.to_dict("records")):
+            question = row.get(user_column)
+            answer = row.get(assistant_column)
+            logger.debug(f"Converting history row {i}: question='{question}', answer='{answer}'")
+            # Add messages directly to memory's chat_memory.messages list (modern approach)
             if isinstance(question, str) and len(question) > 0:
-                memory.chat_memory.add_user_message(question)
+                memory.chat_memory.messages.append(HumanMessage(content=question))
+                history_count += 1
+                logger.debug(f"Added HumanMessage to memory: {question}")
             if isinstance(answer, str) and len(answer) > 0:
-                memory.chat_memory.add_ai_message(answer)
+                memory.chat_memory.messages.append(AIMessage(content=answer))
+                history_count += 1
+                logger.debug(f"Added AIMessage to memory: {answer}")
+        logger.info(f"Built conversation history with {history_count} history messages + system message")
+        logger.debug(f"Final memory messages count: {len(memory.chat_memory.messages)}")
+        # Store memory for agent use
+        self._conversation_memory = memory
         agent_type = args.get("agent_type", DEFAULT_AGENT_TYPE)
         agent_executor = initialize_agent(
@@ -564,7 +603,22 @@ AI: {response}"""
                 return {CONTEXT_COLUMN: [], ASSISTANT_COLUMN: ""}
             try:
                 callbacks, context_callback = prepare_callbacks(self, args)
-                result = agent_executor.invoke(prompt, config={"callbacks": callbacks})
+                # Modern LangChain approach: Include conversation history + current message
+                if hasattr(self, "_conversation_messages") and self._conversation_messages:
+                    # Add current user message to conversation history
+                    full_messages = self._conversation_messages + [HumanMessage(content=prompt)]
+                    logger.critical(f"🔍 INVOKING AGENT with {len(full_messages)} messages (including history)")
+                    logger.debug(
+                        f"Full conversation messages: {[type(msg).__name__ + ': ' + msg.content[:100] + '...' for msg in full_messages]}"
+                    )
+                    # For agents, we need to pass the input in the expected format
+                    # The agent expects 'input' key with the current question, but conversation history should be in memory
+                    result = agent_executor.invoke({"input": prompt}, config={"callbacks": callbacks})
+                else:
+                    logger.warning("No conversation messages found - using simple prompt")
+                    result = agent_executor.invoke({"input": prompt}, config={"callbacks": callbacks})
                 captured_context = context_callback.get_contexts()
                 output = result["output"] if isinstance(result, dict) and "output" in result else str(result)
                 return {CONTEXT_COLUMN: captured_context, ASSISTANT_COLUMN: output}
@@ -587,7 +641,14 @@ AI: {response}"""
         agent_timeout_seconds = args.get("timeout", DEFAULT_AGENT_TIMEOUT_SECONDS)
         with ContextThreadPoolExecutor(max_workers=max_workers) as executor:
-            futures = [executor.submit(_invoke_agent_executor_with_prompt, agent, prompt) for prompt in prompts]
+            # Only process the last prompt (current question), not all prompts
+            # The previous prompts are conversation history and should only be used for context
+            if prompts:
+                current_prompt = prompts[-1]  # Last prompt is the current question
+                futures = [executor.submit(_invoke_agent_executor_with_prompt, agent, current_prompt)]
+            else:
+                logger.error("No prompts found to process")
+                futures = []
             try:
                 for future in as_completed(futures, timeout=agent_timeout_seconds):
                     result = future.result()
@@ -688,12 +749,14 @@ AI: {response}"""
         callbacks, context_callback = prepare_callbacks(self, args)
-        yield self.add_chunk_metadata({"type": "start", "prompt": prompts[0]})
+        # Use last prompt (current question) instead of first prompt (history)
+        current_prompt = prompts[-1] if prompts else ""
+        yield self.add_chunk_metadata({"type": "start", "prompt": current_prompt})
         if not hasattr(agent_executor, "stream") or not callable(agent_executor.stream):
             raise AttributeError("The agent_executor does not have a 'stream' method")
-        stream_iterator = self._stream_agent_executor(agent_executor, prompts[0], callbacks)
+        stream_iterator = self._stream_agent_executor(agent_executor, current_prompt, callbacks)
         for chunk in stream_iterator:
             yield chunk

mindsdb/interfaces/database/projects.py CHANGED Viewed

@@ -3,11 +3,12 @@ from copy import deepcopy
 from typing import List, Optional
 from collections import OrderedDict
+import pandas as pd
 import sqlalchemy as sa
 import numpy as np
 from mindsdb_sql_parser.ast.base import ASTNode
-from mindsdb_sql_parser.ast import Select, Star, Constant, Identifier
+from mindsdb_sql_parser.ast import Select, Star, Constant, Identifier, BinaryOperation
 from mindsdb_sql_parser import parse_sql
 from mindsdb.interfaces.storage import db
@@ -109,7 +110,19 @@ class Project:
         """
         ViewController().delete(name, project_name=self.name, strict_case=strict_case)
-    def create_view(self, name: str, query: str):
+    def create_view(self, name: str, query: str, session):
+        ast_query = parse_sql(query)
+        if isinstance(ast_query, Select):
+            # check create view sql
+            ast_query.limit = Constant(1)
+            query_context_controller.set_context(query_context_controller.IGNORE_CONTEXT)
+            try:
+                SQLQuery(ast_query, session=session, database=self.name)
+            finally:
+                query_context_controller.release_context(query_context_controller.IGNORE_CONTEXT)
         ViewController().add(name, query=query, project_name=self.name)
     def update_view(self, name: str, query: str, strict_case: bool = False):
@@ -124,21 +137,112 @@ class Project:
         view_meta["query_ast"] = parse_sql(view_meta["query"])
         return view_meta
-    def query_view(self, query, session):
+    @staticmethod
+    def combine_view_select(view_query: Select, query: Select) -> Select:
+        """
+        Create a combined query from view's query and outer query.
+        """
+        # apply optimizations
+        if query.where is not None:
+            # Get conditions that can be duplicated into view's query
+            # It has to be simple condition with identifier and constant
+            # Also it shouldn't be under the OR condition
+            def get_conditions_to_move(node):
+                if not isinstance(node, BinaryOperation):
+                    return []
+                op = node.op.upper()
+                if op == "AND":
+                    conditions = []
+                    conditions.extend(get_conditions_to_move(node.args[0]))
+                    conditions.extend(get_conditions_to_move(node.args[1]))
+                    return conditions
+                if op == "OR":
+                    return []
+                if isinstance(node.args[0], (Identifier, Constant)) and isinstance(
+                    node.args[1], (Identifier, Constant)
+                ):
+                    return [node]
+            conditions = get_conditions_to_move(query.where)
+            if conditions:
+                # analyse targets
+                # if target element has alias
+                #    if element is not identifier or the name is not equal to alias:
+                #         add alias to black list
+                # white list:
+                #     all targets that are identifiers with no alias or equal to its alias
+                # condition can be moved if
+                #     column is not in black list AND (query has star(*) OR column in white list)
+                has_star = False
+                white_list, black_list = [], []
+                for target in view_query.targets:
+                    if isinstance(target, Star):
+                        has_star = True
+                    if isinstance(target, Identifier):
+                        name = target.parts[-1].lower()
+                        if target.alias is None or target.alias.parts[-1].lower() == name:
+                            white_list.append(name)
+                    elif target.alias is not None:
+                        black_list.append(target.alias.parts[-1].lower())
+                view_where = view_query.where
+                for condition in conditions:
+                    arg1, arg2 = condition.args
+                    if isinstance(arg1, Identifier):
+                        name = arg1.parts[-1].lower()
+                        if name in black_list or not (has_star or name in white_list):
+                            continue
+                    if isinstance(arg2, Identifier):
+                        name = arg2.parts[-1].lower()
+                        if name in black_list or not (has_star or name in white_list):
+                            continue
+                    # condition can be moved into view
+                    condition2 = BinaryOperation(condition.op, [arg1, arg2])
+                    if view_where is None:
+                        view_where = condition2
+                    else:
+                        view_where = BinaryOperation("AND", args=[view_where, condition2])
+                    # disable outer condition
+                    condition.op = "="
+                    condition.args = [Constant(0), Constant(0)]
+                view_query.where = view_where
+        # combine outer query with view's query
+        view_query.parentheses = True
+        query.from_table = view_query
+        return query
+    def query_view(self, query: Select, session) -> pd.DataFrame:
         view_meta = self.get_view_meta(query)
         query_context_controller.set_context("view", view_meta["id"])
+        query_applied = False
         try:
-            sqlquery = SQLQuery(view_meta["query_ast"], session=session)
+            view_query = view_meta["query_ast"]
+            if isinstance(view_query, Select):
+                view_query = self.combine_view_select(view_query, query)
+                query_applied = True
+            sqlquery = SQLQuery(view_query, session=session)
             df = sqlquery.fetched_data.to_df()
         finally:
             query_context_controller.release_context("view", view_meta["id"])
         # remove duplicated columns
         df = df.loc[:, ~df.columns.duplicated()]
-        return query_df(df, query, session=session)
+        if query_applied:
+            return df
+        else:
+            return query_df(df, query, session=session)
     @staticmethod
     def _get_model_data(predictor_record, integraion_record, with_secrets: bool = True):

mindsdb/interfaces/knowledge_base/controller.py CHANGED Viewed

@@ -1139,8 +1139,14 @@ class KnowledgeBaseController:
         else:
             vector_db_name, vector_table_name = storage.parts
+        data_node = self.session.datahub.get(vector_db_name)
+        if data_node:
+            vector_store_handler = data_node.integration_handler
+        else:
+            raise ValueError(
+                f"Unable to find database named {vector_db_name}, please make sure {vector_db_name} is defined"
+            )
         # create table in vectordb before creating KB
-        vector_store_handler = self.session.datahub.get(vector_db_name).integration_handler
         vector_store_handler.create_table(vector_table_name)
         if keyword_search_enabled:
             vector_store_handler.add_full_text_index(vector_table_name, TableField.CONTENT.value)

mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py CHANGED Viewed

@@ -4,8 +4,7 @@ import asyncio
 from typing import List, Dict, Optional, Any
 import pandas as pd
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_core.documents import Document as LangchainDocument
+from mindsdb.interfaces.knowledge_base.preprocessing.text_splitter import TextSplitter
 from mindsdb.integrations.utilities.rag.splitters.file_splitter import (
     FileSplitter,
@@ -22,7 +21,6 @@ from mindsdb.interfaces.knowledge_base.preprocessing.models import (
 )
 from mindsdb.utilities import log
 logger = log.getLogger(__name__)
 _DEFAULT_CONTENT_COLUMN_NAME = "content"
@@ -49,11 +47,10 @@ class DocumentPreprocessor:
         if self.splitter is None:
             raise ValueError("Splitter not configured")
-        # Convert to langchain Document for splitting
-        langchain_doc = LangchainDocument(page_content=doc.content, metadata=doc.metadata or {})
+        metadata = doc.metadata or {}
         # Split and convert back to our Document type
-        split_docs = self.splitter.split_documents([langchain_doc])
-        return [Document(content=split_doc.page_content, metadata=split_doc.metadata) for split_doc in split_docs]
+        split_texts = self.splitter.split_text(doc.content)
+        return [Document(content=text, metadata=metadata) for text in split_texts]
     def _get_source(self) -> str:
         """Get the source identifier for this preprocessor"""
@@ -266,16 +263,15 @@ Please give a short succinct context to situate this chunk within the overall do
 class TextChunkingPreprocessor(DocumentPreprocessor):
-    """Default text chunking preprocessor using RecursiveCharacterTextSplitter"""
+    """Default text chunking preprocessor using TextSplitter"""
     def __init__(self, config: Optional[TextChunkingConfig] = None):
         """Initialize with text chunking configuration"""
         super().__init__()
         self.config = config or TextChunkingConfig()
-        self.splitter = RecursiveCharacterTextSplitter(
+        self.splitter = TextSplitter(
             chunk_size=self.config.chunk_size,
             chunk_overlap=self.config.chunk_overlap,
-            length_function=self.config.length_function,
             separators=self.config.separators,
         )

MindsDB 25.7.4.0__py3-none-any.whl → 25.8.3.0__py3-none-any.whl

Potentially problematic release.

MindsDB 25.7.4.0__py3-none-any.whl → 25.8.3.0__py3-none-any.whl