MindsDB 25.6.3.1__py3-none-any.whl → 25.7.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic; see the registry's release advisory for more details.

Files changed (55):
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/executor/command_executor.py +8 -6
  3. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +72 -44
  4. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +14 -1
  5. mindsdb/api/executor/datahub/datanodes/project_datanode.py +1 -1
  6. mindsdb/api/executor/datahub/datanodes/system_tables.py +314 -1
  7. mindsdb/api/executor/planner/plan_join.py +1 -1
  8. mindsdb/api/executor/planner/query_planner.py +7 -1
  9. mindsdb/api/executor/planner/query_prepare.py +68 -87
  10. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
  11. mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
  12. mindsdb/api/http/namespaces/file.py +49 -24
  13. mindsdb/api/mcp/start.py +45 -31
  14. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
  15. mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
  16. mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
  17. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
  18. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
  19. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
  20. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  21. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
  22. mindsdb/integrations/handlers/ludwig_handler/requirements.txt +1 -1
  23. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +150 -140
  24. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
  25. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +2 -0
  26. mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
  27. mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
  28. mindsdb/integrations/libs/api_handler.py +6 -7
  29. mindsdb/integrations/libs/vectordatabase_handler.py +86 -77
  30. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
  31. mindsdb/interfaces/agents/agents_controller.py +29 -9
  32. mindsdb/interfaces/agents/constants.py +44 -0
  33. mindsdb/interfaces/agents/langchain_agent.py +15 -6
  34. mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
  35. mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
  36. mindsdb/interfaces/data_catalog/data_catalog_reader.py +22 -3
  37. mindsdb/interfaces/knowledge_base/controller.py +121 -102
  38. mindsdb/interfaces/knowledge_base/evaluate.py +19 -7
  39. mindsdb/interfaces/knowledge_base/executor.py +346 -0
  40. mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
  41. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
  42. mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
  43. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +26 -22
  44. mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +40 -28
  45. mindsdb/interfaces/skills/skill_tool.py +91 -88
  46. mindsdb/interfaces/skills/sql_agent.py +181 -130
  47. mindsdb/interfaces/storage/db.py +9 -7
  48. mindsdb/utilities/config.py +12 -1
  49. mindsdb/utilities/exception.py +47 -7
  50. mindsdb/utilities/security.py +54 -11
  51. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/METADATA +239 -251
  52. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/RECORD +55 -54
  53. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/WHEEL +0 -0
  54. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/licenses/LICENSE +0 -0
  55. {mindsdb-25.6.3.1.dist-info → mindsdb-25.7.1.0.dist-info}/top_level.txt +0 -0
@@ -18,45 +18,31 @@ class PreprocessorType(Enum):
18
18
 
19
19
  class BasePreprocessingConfig(BaseModel):
20
20
  """Base configuration for preprocessing"""
21
+
21
22
  chunk_size: int = Field(default=DEFAULT_CHUNK_SIZE, description="Size of document chunks")
22
23
  chunk_overlap: int = Field(default=DEFAULT_CHUNK_OVERLAP, description="Overlap between chunks")
24
+ doc_id_column_name: str = Field(default="_original_doc_id", description="Name of doc_id columns in metadata")
23
25
 
24
26
 
25
27
  class ContextualConfig(BasePreprocessingConfig):
26
28
  """Configuration specific to contextual preprocessing"""
29
+
27
30
  llm_config: LLMConfig = Field(
28
- default_factory=LLMConfig,
29
- description="LLM configuration to use for context generation"
30
- )
31
- context_template: Optional[str] = Field(
32
- default=None,
33
- description="Custom template for context generation"
34
- )
35
- summarize: Optional[bool] = Field(
36
- default=False,
37
- description="Whether to return chunks as summarizations"
31
+ default_factory=LLMConfig, description="LLM configuration to use for context generation"
38
32
  )
33
+ context_template: Optional[str] = Field(default=None, description="Custom template for context generation")
34
+ summarize: Optional[bool] = Field(default=False, description="Whether to return chunks as summarizations")
39
35
 
40
36
 
41
- class TextChunkingConfig(BaseModel):
37
+ class TextChunkingConfig(BasePreprocessingConfig):
42
38
  """Configuration for text chunking preprocessor using Pydantic"""
43
- chunk_size: int = Field(
44
- default=1000,
45
- description="The target size of each text chunk",
46
- gt=0
47
- )
48
- chunk_overlap: int = Field(
49
- default=200,
50
- description="The number of characters to overlap between chunks",
51
- ge=0
52
- )
53
- length_function: Callable = Field(
54
- default=len,
55
- description="Function to measure text length"
56
- )
39
+
40
+ chunk_size: int = Field(default=1000, description="The target size of each text chunk", gt=0)
41
+ chunk_overlap: int = Field(default=200, description="The number of characters to overlap between chunks", ge=0)
42
+ length_function: Callable = Field(default=len, description="Function to measure text length")
57
43
  separators: List[str] = Field(
58
44
  default=["\n\n", "\n", " ", ""],
59
- description="List of separators to use for splitting text, in order of priority"
45
+ description="List of separators to use for splitting text, in order of priority",
60
46
  )
61
47
 
62
48
  class Config:
@@ -65,44 +51,28 @@ class TextChunkingConfig(BaseModel):
65
51
 
66
52
  class JSONChunkingConfig(BasePreprocessingConfig):
67
53
  """Configuration for JSON chunking preprocessor"""
68
- flatten_nested: bool = Field(
69
- default=True,
70
- description="Whether to flatten nested JSON structures"
71
- )
72
- include_metadata: bool = Field(
73
- default=True,
74
- description="Whether to include original metadata in chunks"
75
- )
54
+
55
+ flatten_nested: bool = Field(default=True, description="Whether to flatten nested JSON structures")
56
+ include_metadata: bool = Field(default=True, description="Whether to include original metadata in chunks")
76
57
  chunk_by_object: bool = Field(
77
- default=True,
78
- description="Whether to chunk by top-level objects (True) or create a single document (False)"
79
- )
80
- exclude_fields: List[str] = Field(
81
- default_factory=list,
82
- description="List of fields to exclude from chunking"
58
+ default=True, description="Whether to chunk by top-level objects (True) or create a single document (False)"
83
59
  )
60
+ exclude_fields: List[str] = Field(default_factory=list, description="List of fields to exclude from chunking")
84
61
  include_fields: List[str] = Field(
85
62
  default_factory=list,
86
- description="List of fields to include in chunking (if empty, all fields except excluded ones are included)"
63
+ description="List of fields to include in chunking (if empty, all fields except excluded ones are included)",
87
64
  )
88
65
  metadata_fields: List[str] = Field(
89
66
  default_factory=list,
90
67
  description="List of fields to extract into metadata for filtering "
91
- "(can include nested fields using dot notation). "
92
- "If empty, all primitive fields will be extracted (top-level fields if available, otherwise all primitive fields in the flattened structure)."
68
+ "(can include nested fields using dot notation). "
69
+ "If empty, all primitive fields will be extracted (top-level fields if available, otherwise all primitive fields in the flattened structure).",
93
70
  )
94
71
  extract_all_primitives: bool = Field(
95
- default=False,
96
- description="Whether to extract all primitive values (strings, numbers, booleans) into metadata"
97
- )
98
- nested_delimiter: str = Field(
99
- default=".",
100
- description="Delimiter for flattened nested field names"
101
- )
102
- content_column: str = Field(
103
- default="content",
104
- description="Name of the content column for chunk ID generation"
72
+ default=False, description="Whether to extract all primitive values (strings, numbers, booleans) into metadata"
105
73
  )
74
+ nested_delimiter: str = Field(default=".", description="Delimiter for flattened nested field names")
75
+ content_column: str = Field(default="content", description="Name of the content column for chunk ID generation")
106
76
 
107
77
  class Config:
108
78
  arbitrary_types_allowed = True
@@ -110,25 +80,20 @@ class JSONChunkingConfig(BasePreprocessingConfig):
110
80
 
111
81
  class PreprocessingConfig(BaseModel):
112
82
  """Complete preprocessing configuration"""
113
- type: PreprocessorType = Field(
114
- default=PreprocessorType.TEXT_CHUNKING,
115
- description="Type of preprocessing to apply"
116
- )
83
+
84
+ type: PreprocessorType = Field(default=PreprocessorType.TEXT_CHUNKING, description="Type of preprocessing to apply")
117
85
  contextual_config: Optional[ContextualConfig] = Field(
118
- default=None,
119
- description="Configuration for contextual preprocessing"
86
+ default=None, description="Configuration for contextual preprocessing"
120
87
  )
121
88
  text_chunking_config: Optional[TextChunkingConfig] = Field(
122
- default=None,
123
- description="Configuration for text chunking preprocessing"
89
+ default=None, description="Configuration for text chunking preprocessing"
124
90
  )
125
91
  json_chunking_config: Optional[JSONChunkingConfig] = Field(
126
- default=None,
127
- description="Configuration for JSON chunking preprocessing"
92
+ default=None, description="Configuration for JSON chunking preprocessing"
128
93
  )
129
94
 
130
- @model_validator(mode='after')
131
- def validate_config_presence(self) -> 'PreprocessingConfig':
95
+ @model_validator(mode="after")
96
+ def validate_config_presence(self) -> "PreprocessingConfig":
132
97
  """Ensure the appropriate config is present for the chosen type"""
133
98
  if self.type == PreprocessorType.CONTEXTUAL and not self.contextual_config:
134
99
  self.contextual_config = ContextualConfig()
@@ -137,26 +102,28 @@ class PreprocessingConfig(BaseModel):
137
102
  if self.type == PreprocessorType.JSON_CHUNKING and not self.json_chunking_config:
138
103
  # Import here to avoid circular imports
139
104
  from mindsdb.interfaces.knowledge_base.preprocessing.json_chunker import JSONChunkingConfig
105
+
140
106
  self.json_chunking_config = JSONChunkingConfig()
141
107
  return self
142
108
 
143
109
 
144
110
  class Document(BaseModel):
145
-
146
111
  """Document model with default metadata handling"""
112
+
147
113
  id: Optional[Union[int, str]] = Field(default=None, description="Unique identifier for the document")
148
114
  content: str = Field(description="The document content")
149
115
  embeddings: Optional[List[float]] = Field(default=None, description="Vector embeddings of the content")
150
116
  metadata: Optional[Dict[str, Any]] = Field(default=None, description="Additional document metadata")
151
117
 
152
- @model_validator(mode='after')
153
- def validate_metadata(self) -> 'Document':
118
+ @model_validator(mode="after")
119
+ def validate_metadata(self) -> "Document":
154
120
  """Ensure metadata is present and valid"""
155
121
  if not self.metadata:
156
- self.metadata = {'source': 'default'}
122
+ self.metadata = {"source": "default"}
157
123
  return self
158
124
 
159
125
 
160
126
  class ProcessedChunk(Document):
161
127
  """Processed chunk that aligns with VectorStoreHandler schema"""
128
+
162
129
  pass
@@ -6,6 +6,27 @@ from langchain_core.tools import BaseTool
6
6
  from mindsdb_sql_parser.ast import Describe, Select, Identifier, Constant, Star
7
7
 
8
8
 
9
+ def llm_str_strip(s):
10
+ length = -1
11
+ while length != len(s):
12
+ length = len(s)
13
+
14
+ # remove ```
15
+ if s.startswith("```"):
16
+ s = s[3:]
17
+ if s.endswith("```"):
18
+ s = s[:-3]
19
+
20
+ # remove trailing new lines
21
+ s = s.strip("\n")
22
+
23
+ # remove extra quotes
24
+ for q in ('"', "'", "`"):
25
+ if s.count(q) == 1:
26
+ s = s.strip(q)
27
+ return s
28
+
29
+
9
30
  class KnowledgeBaseListToolInput(BaseModel):
10
31
  tool_input: str = Field("", description="An empty string to list all knowledge bases.")
11
32
 
@@ -56,26 +77,6 @@ class KnowledgeBaseInfoTool(BaseTool):
56
77
  except (json.JSONDecodeError, TypeError):
57
78
  pass
58
79
 
59
- def strip(s):
60
- length = -1
61
- while length != len(s):
62
- length = len(s)
63
-
64
- # remove ```
65
- if s.startswith("```"):
66
- s = s[3:]
67
- if s.endswith("```"):
68
- s = s[:-3]
69
-
70
- # remove trailing new lines
71
- s = s.strip("\n")
72
-
73
- # remove extra quotes
74
- for q in ('"', "'", "`"):
75
- if s.count(q) == 1:
76
- s = s.strip(q)
77
- return s
78
-
79
80
  # Finally, try the original regex pattern for $START$ and $STOP$ markers
80
81
  match = re.search(r"\$START\$(.*?)\$STOP\$", tool_input, re.DOTALL)
81
82
  if not match:
@@ -84,14 +85,14 @@ class KnowledgeBaseInfoTool(BaseTool):
84
85
  return [kb.strip() for kb in tool_input.split(",")]
85
86
  # If it's just a single string without formatting, return it as a single item
86
87
  if tool_input.strip():
87
- return [strip(tool_input)]
88
+ return [llm_str_strip(tool_input)]
88
89
  return []
89
90
 
90
91
  # Extract and clean the knowledge base names
91
92
  kb_names_str = match.group(1).strip()
92
93
  kb_names = re.findall(r"`([^`]+)`", kb_names_str)
93
94
 
94
- kb_names = [strip(n) for n in kb_names]
95
+ kb_names = [llm_str_strip(n) for n in kb_names]
95
96
  return kb_names
96
97
 
97
98
  def _run(self, tool_input: str) -> str:
@@ -105,6 +106,8 @@ class KnowledgeBaseInfoTool(BaseTool):
105
106
 
106
107
  for kb_name in kb_names:
107
108
  try:
109
+ self.db.check_knowledge_base_permission(Identifier(kb_name))
110
+
108
111
  # Get knowledge base schema
109
112
  schema_result = self.db.run_no_throw(str(Describe(kb_name, type="knowledge_base")))
110
113
 
@@ -221,6 +224,7 @@ class KnowledgeBaseQueryTool(BaseTool):
221
224
 
222
225
  try:
223
226
  # Execute the query
227
+ query = llm_str_strip(query)
224
228
  result = self.db.run_no_throw(query)
225
229
 
226
230
  if not result:
@@ -10,25 +10,27 @@ from mindsdb.interfaces.skills.custom.text2sql.mindsdb_sql_tool import MindsDBSQ
10
10
  from mindsdb.interfaces.skills.custom.text2sql.mindsdb_kb_tools import (
11
11
  KnowledgeBaseListTool,
12
12
  KnowledgeBaseInfoTool,
13
- KnowledgeBaseQueryTool
13
+ KnowledgeBaseQueryTool,
14
14
  )
15
15
 
16
16
 
17
17
  class MindsDBSQLToolkit(SQLDatabaseToolkit):
18
+ include_knowledge_base_tools: bool = True
18
19
 
19
- def get_tools(self, prefix='') -> List[BaseTool]:
20
-
20
+ def get_tools(self, prefix="") -> List[BaseTool]:
21
21
  current_date_time = datetime.now().strftime("%Y-%m-%d %H:%M")
22
22
 
23
23
  """Get the tools in the toolkit."""
24
24
  list_sql_database_tool = ListSQLDatabaseTool(
25
- name=f'sql_db_list_tables{prefix}',
25
+ name=f"sql_db_list_tables{prefix}",
26
26
  db=self.db,
27
- description=dedent("""\n
27
+ description=dedent(
28
+ """\n
28
29
  Input is an empty string, output is a comma-separated list of tables in the database. Each table name is escaped using backticks.
29
30
  Each table name in the list may be in one of two formats: database_name.`table_name` or database_name.schema_name.`table_name`.
30
31
  Table names in response to the user must be escaped using backticks.
31
- """)
32
+ """
33
+ ),
32
34
  )
33
35
 
34
36
  info_sql_database_tool_description = (
@@ -45,11 +47,11 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
45
47
  " $START$ table1 table2 table3 $STOP$\n"
46
48
  )
47
49
  info_sql_database_tool = InfoSQLDatabaseTool(
48
- name=f'sql_db_schema{prefix}',
49
- db=self.db, description=info_sql_database_tool_description
50
+ name=f"sql_db_schema{prefix}", db=self.db, description=info_sql_database_tool_description
50
51
  )
51
52
 
52
- query_sql_database_tool_description = dedent(f"""\
53
+ query_sql_database_tool_description = dedent(
54
+ f"""\
53
55
  Input: A detailed and well-structured SQL query. The query must be enclosed between the symbols $START$ and $STOP$.
54
56
  Output: Database result or error message. For errors, rewrite and retry the query. For 'Unknown column' errors, use '{info_sql_database_tool.name}' to check table fields.
55
57
  This system is a highly intelligent and reliable PostgreSQL SQL skill designed to work with databases.
@@ -93,11 +95,11 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
93
95
  - When asked about yourself or your maker, state that you are a Data-Mind, created by MindsDB to help answer data questions.
94
96
  - When asked about your purpose or how you can help, explore the available data sources and then explain that you can answer questions based on the connected data. Provide a few relevant example questions that you could answer for the user about their data.
95
97
  Adhere to these guidelines for all queries and responses. Ask for clarification if needed.
96
- """)
98
+ """
99
+ )
97
100
 
98
101
  query_sql_database_tool = QuerySQLDataBaseTool(
99
- name=f'sql_db_query{prefix}',
100
- db=self.db, description=query_sql_database_tool_description
102
+ name=f"sql_db_query{prefix}", db=self.db, description=query_sql_database_tool_description
101
103
  )
102
104
 
103
105
  mindsdb_sql_parser_tool_description = (
@@ -108,15 +110,24 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
108
110
  f"ALWAYS run this tool before executing a query with {query_sql_database_tool.name}. "
109
111
  )
110
112
  mindsdb_sql_parser_tool = MindsDBSQLParserTool(
111
- name=f'mindsdb_sql_parser_tool{prefix}',
112
- description=mindsdb_sql_parser_tool_description
113
+ name=f"mindsdb_sql_parser_tool{prefix}", description=mindsdb_sql_parser_tool_description
113
114
  )
114
115
 
116
+ sql_tools = [
117
+ query_sql_database_tool,
118
+ info_sql_database_tool,
119
+ list_sql_database_tool,
120
+ mindsdb_sql_parser_tool,
121
+ ]
122
+ if not self.include_knowledge_base_tools:
123
+ return sql_tools
124
+
115
125
  # Knowledge base tools
116
126
  kb_list_tool = KnowledgeBaseListTool(
117
- name=f'kb_list_tool{prefix}',
127
+ name=f"kb_list_tool{prefix}",
118
128
  db=self.db,
119
- description=dedent("""\
129
+ description=dedent(
130
+ """\
120
131
  Lists all available knowledge bases that can be queried.
121
132
  Input: No input required, just call the tool directly.
122
133
  Output: A table of all available knowledge bases with their names and creation dates.
@@ -125,13 +136,15 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
125
136
  Each knowledge base name is escaped using backticks.
126
137
 
127
138
  Example usage: kb_list_tool()
128
- """)
139
+ """
140
+ ),
129
141
  )
130
142
 
131
143
  kb_info_tool = KnowledgeBaseInfoTool(
132
- name=f'kb_info_tool{prefix}',
144
+ name=f"kb_info_tool{prefix}",
133
145
  db=self.db,
134
- description=dedent(f"""\
146
+ description=dedent(
147
+ f"""\
135
148
  Gets detailed information about specific knowledge bases including their structure and metadata fields.
136
149
 
137
150
  Input: A knowledge base name as a simple string.
@@ -143,13 +156,15 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
143
156
  Example usage: kb_info_tool("kb_name")
144
157
 
145
158
  Make sure the knowledge base exists by calling {kb_list_tool.name} first.
146
- """)
159
+ """
160
+ ),
147
161
  )
148
162
 
149
163
  kb_query_tool = KnowledgeBaseQueryTool(
150
- name=f'kb_query_tool{prefix}',
164
+ name=f"kb_query_tool{prefix}",
151
165
  db=self.db,
152
- description=dedent(f"""\
166
+ description=dedent(
167
+ f"""\
153
168
  Queries knowledge bases using SQL syntax to retrieve relevant information.
154
169
 
155
170
  Input: A SQL query string that targets a knowledge base.
@@ -192,15 +207,12 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
192
207
  - Always include a semicolon at the end of your SQL query
193
208
 
194
209
  For factual questions, use this tool to retrieve information rather than relying on the model's knowledge.
195
- """)
210
+ """
211
+ ),
196
212
  )
197
213
 
198
214
  # Return standard SQL tools and knowledge base tools
199
- return [
200
- query_sql_database_tool,
201
- info_sql_database_tool,
202
- list_sql_database_tool,
203
- mindsdb_sql_parser_tool,
215
+ return sql_tools + [
204
216
  kb_list_tool,
205
217
  kb_info_tool,
206
218
  kb_query_tool,