MindsDB 25.6.2.0__py3-none-any.whl → 25.6.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (35)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/a2a/agent.py +25 -4
  3. mindsdb/api/a2a/task_manager.py +68 -6
  4. mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +91 -84
  5. mindsdb/api/executor/datahub/datanodes/project_datanode.py +1 -1
  6. mindsdb/api/executor/utilities/sql.py +18 -19
  7. mindsdb/api/http/namespaces/knowledge_bases.py +132 -154
  8. mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +219 -28
  9. mindsdb/integrations/handlers/lindorm_handler/requirements.txt +1 -1
  10. mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
  11. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +3 -0
  12. mindsdb/integrations/handlers/openai_handler/openai_handler.py +277 -356
  13. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +94 -8
  14. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +19 -1
  15. mindsdb/integrations/libs/api_handler.py +19 -1
  16. mindsdb/integrations/libs/base.py +86 -2
  17. mindsdb/interfaces/agents/agents_controller.py +32 -6
  18. mindsdb/interfaces/agents/constants.py +1 -0
  19. mindsdb/interfaces/agents/mindsdb_database_agent.py +27 -34
  20. mindsdb/interfaces/data_catalog/data_catalog_loader.py +22 -6
  21. mindsdb/interfaces/data_catalog/data_catalog_reader.py +4 -0
  22. mindsdb/interfaces/database/integrations.py +4 -2
  23. mindsdb/interfaces/knowledge_base/controller.py +29 -24
  24. mindsdb/interfaces/knowledge_base/evaluate.py +0 -3
  25. mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py +17 -86
  26. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +28 -3
  27. mindsdb/interfaces/skills/skills_controller.py +0 -23
  28. mindsdb/interfaces/skills/sql_agent.py +9 -5
  29. mindsdb/interfaces/storage/db.py +20 -4
  30. mindsdb/utilities/config.py +5 -1
  31. {mindsdb-25.6.2.0.dist-info → mindsdb-25.6.3.1.dist-info}/METADATA +247 -247
  32. {mindsdb-25.6.2.0.dist-info → mindsdb-25.6.3.1.dist-info}/RECORD +35 -35
  33. {mindsdb-25.6.2.0.dist-info → mindsdb-25.6.3.1.dist-info}/WHEEL +0 -0
  34. {mindsdb-25.6.2.0.dist-info → mindsdb-25.6.3.1.dist-info}/licenses/LICENSE +0 -0
  35. {mindsdb-25.6.2.0.dist-info → mindsdb-25.6.3.1.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/knowledge_base/controller.py

@@ -9,6 +9,7 @@ import numpy as np
 
 from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier, Select, Update, Delete, Star
 from mindsdb_sql_parser.ast.mindsdb import CreatePredictor
+from mindsdb_sql_parser import parse_sql
 
 from mindsdb.integrations.utilities.query_traversal import query_traversal
 
@@ -52,6 +53,13 @@ def get_model_params(model_params: dict, default_config_key: str):
     """
     Get model parameters by combining default config with user provided parameters.
     """
+    # If the default config key is for reranking and the switch to use the default LLM is enabled,
+    # switch to the default LLM model.
+    if default_config_key == "default_reranking_model" and config.get("default_reranking_model").get(
+        "use_default_llm", False
+    ):
+        default_config_key = "default_llm_model"
+
     combined_model_params = copy.deepcopy(config.get(default_config_key, {}))
 
     if model_params:
@@ -97,6 +105,8 @@ def get_reranking_model_from_params(reranking_model_params: dict):
     params_copy["api_key"] = get_api_key(provider, params_copy, strict=False)
     params_copy["model"] = params_copy.pop("model_name", None)
 
+    params_copy.pop("use_default_llm", None)
+
     return BaseLLMReranker(**params_copy)
 
 
@@ -359,23 +369,30 @@ class KnowledgeBaseTable:
 
     def insert_query_result(self, query: str, project_name: str):
         """Process and insert SQL query results"""
-        if not self.document_loader:
-            raise ValueError("Document loader not configured")
+        ast_query = parse_sql(query)
 
-        documents = list(self.document_loader.load_query_result(query, project_name))
-        if documents:
-            self.insert_documents(documents)
+        command_executor = ExecuteCommands(self.session)
+        response = command_executor.execute_command(ast_query, project_name)
+
+        if response.error_code is not None:
+            raise ValueError(f"Error executing query: {response.error_message}")
+
+        if response.data is None:
+            raise ValueError("Query returned no data")
+
+        records = response.data.records
+        df = pd.DataFrame(records)
+
+        self.insert(df)
 
     def insert_rows(self, rows: List[Dict]):
         """Process and insert raw data rows"""
         if not rows:
             return
 
-        documents = [
-            Document(content=row.get("content", ""), id=row.get("id"), metadata=row.get("metadata", {})) for row in rows
-        ]
+        df = pd.DataFrame(rows)
 
-        self.insert_documents(documents)
+        self.insert(df)
 
     def insert_documents(self, documents: List[Document]):
         """Process and insert documents with preprocessing if configured"""
@@ -1201,22 +1218,10 @@ class KnowledgeBaseController:
         project_names = {i.id: i.name for i in project_controller.get_list()}
 
         for record in query:
-            vector_database = record.vector_database
-            embedding_model = record.embedding_model
+            kb = record.as_dict(with_secrets=self.session.show_secrets)
+            kb["project_name"] = project_names[record.project_id]
 
-            data.append(
-                {
-                    "id": record.id,
-                    "name": record.name,
-                    "project_id": record.project_id,
-                    "project_name": project_names[record.project_id],
-                    "embedding_model": embedding_model.name if embedding_model is not None else None,
-                    "vector_database": None if vector_database is None else vector_database.name,
-                    "vector_database_table": record.vector_database_table,
-                    "query_id": record.query_id,
-                    "params": record.params,
-                }
-            )
+            data.append(kb)
 
         return data
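The first two hunks above change `get_model_params` to fall back to the `default_llm_model` config entry when `default_reranking_model.use_default_llm` is set, and to drop the `use_default_llm` flag before the reranker is constructed. A minimal, self-contained sketch of that fallback; the `config` dict, its values, and the final `update` step are illustrative assumptions, not MindsDB internals:

```python
import copy

# Illustrative stand-in for MindsDB's config object; keys and values here are assumptions.
config = {
    "default_llm_model": {"provider": "openai", "model_name": "gpt-4o"},
    "default_reranking_model": {"use_default_llm": True},
}


def get_model_params(model_params: dict, default_config_key: str) -> dict:
    # When reranking should reuse the default LLM, read defaults from the LLM entry instead.
    if default_config_key == "default_reranking_model" and config.get("default_reranking_model", {}).get(
        "use_default_llm", False
    ):
        default_config_key = "default_llm_model"

    combined_model_params = copy.deepcopy(config.get(default_config_key, {}))
    if model_params:
        # Assumption: user-provided parameters override the defaults.
        combined_model_params.update(model_params)
    return combined_model_params


print(get_model_params({"temperature": 0}, "default_reranking_model"))
# {'provider': 'openai', 'model_name': 'gpt-4o', 'temperature': 0}
```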
mindsdb/interfaces/knowledge_base/evaluate.py

@@ -492,8 +492,6 @@ class EvaluateDocID(EvaluateBase):
         total_questions = len(stats)
         total_found = sum([1 for stat in stats if stat["doc_found"]])
 
-        total_accurately_retrieved = sum([1 for stat in stats if stat["doc_found"]])
-
         accurate_in_top_10 = sum([1 for stat in stats if stat["doc_found"] and stat["doc_position"] < 10])
 
         # calculate recall curve by position
@@ -512,7 +510,6 @@
         return {
             "total": total_questions,
             "total_found": total_found,
-            "retrieved_in_top_k": total_accurately_retrieved,
             "retrieved_in_top_10": accurate_in_top_10,
             "cumulative_recall": cumulative_recall,
             "avg_query_time": avg_query_time,
mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py

@@ -2,7 +2,6 @@ import os
 from typing import List, Iterator
 from langchain_core.documents import Document as LangchainDocument
 from langchain_text_splitters import MarkdownHeaderTextSplitter
-import pandas as pd
 
 from mindsdb.interfaces.file.file_controller import FileController
 from mindsdb.integrations.utilities.rag.loaders.file_loader import FileLoader
@@ -20,12 +19,12 @@ class DocumentLoader:
     """Handles loading documents from various sources including SQL queries"""
 
     def __init__(
-            self,
-            file_controller: FileController,
-            file_splitter: FileSplitter,
-            markdown_splitter: MarkdownHeaderTextSplitter,
-            file_loader_class=FileLoader,
-            mysql_proxy=None
+        self,
+        file_controller: FileController,
+        file_splitter: FileSplitter,
+        markdown_splitter: MarkdownHeaderTextSplitter,
+        file_loader_class=FileLoader,
+        mysql_proxy=None,
     ):
         """
         Initialize with required dependencies
@@ -52,8 +51,8 @@ class DocumentLoader:
         for doc in loader.lazy_load():
             # Add file extension to metadata for proper splitting
             extension = os.path.splitext(file_path)[1].lower()
-            doc.metadata['extension'] = extension
-            doc.metadata['source'] = file_name
+            doc.metadata["extension"] = extension
+            doc.metadata["source"] = file_name
 
             # Use FileSplitter to handle the document based on its type
             split_docs = self.file_splitter.split_documents([doc])
@@ -62,34 +61,22 @@ class DocumentLoader:
                 metadata = doc.metadata.copy()
                 metadata.update(split_doc.metadata or {})
 
-                yield Document(
-                    content=split_doc.page_content,
-                    metadata=metadata
-                )
+                yield Document(content=split_doc.page_content, metadata=metadata)
 
     def load_web_pages(
-            self,
-            urls: List[str],
-            crawl_depth: int,
-            limit: int,
-            filters: List[str] = None,
+        self,
+        urls: List[str],
+        crawl_depth: int,
+        limit: int,
+        filters: List[str] = None,
     ) -> Iterator[Document]:
         """Load and split documents from web pages"""
-        websites_df = get_all_websites(
-            urls,
-            crawl_depth=crawl_depth,
-            limit=limit,
-            filters=filters
-        )
+        websites_df = get_all_websites(urls, crawl_depth=crawl_depth, limit=limit, filters=filters)
 
         for _, row in websites_df.iterrows():
             # Create a document with HTML extension for proper splitting
             doc = LangchainDocument(
-                page_content=row['text_content'],
-                metadata={
-                    'extension': '.html',
-                    'url': row['url']
-                }
+                page_content=row["text_content"], metadata={"extension": ".html", "url": row["url"]}
             )
 
             # Use FileSplitter to handle HTML content
@@ -98,60 +85,4 @@ class DocumentLoader:
                 metadata = doc.metadata.copy()
                 metadata.update(split_doc.metadata or {})
 
-                yield Document(
-                    content=split_doc.page_content,
-                    metadata=metadata
-                )
-
-    def load_query_result(self, query: str, project_name: str) -> Iterator[Document]:
-        """
-        Load documents from SQL query results
-
-        Args:
-            query: SQL query to execute
-            project_name: Name of the project context
-
-        Returns:
-            Iterator of Document objects
-
-        Raises:
-            ValueError: If mysql_proxy is not configured or query returns no data
-        """
-        if not self.mysql_proxy:
-            raise ValueError("MySQL proxy not configured")
-
-        if not query:
-            return
-
-        # Set project context and execute query
-        self.mysql_proxy.set_context({'db': project_name})
-        query_result = self.mysql_proxy.process_query(query)
-
-        if query_result.type != 'table':
-            raise ValueError('Query returned no data')
-
-        # Convert query result to DataFrame
-        df = query_result.data.to_df()
-
-        # Process each row into a Document
-        for _, row in df.iterrows():
-            # Extract id, content and metadata
-            content = str(row.get('content', ''))
-            id = row.get('id', None)
-
-            # Convert remaining columns to metadata
-            metadata = {
-                col: str(row[col])
-                for col in df.columns
-                if col != 'content' and not pd.isna(row[col])
-            }
-            metadata['source'] = 'query'
-
-            # Split content using recursive splitter
-            if content:
-
-                yield Document(
-                    id=id,
-                    content=content,
-                    metadata=metadata
-                )
+                yield Document(content=split_doc.page_content, metadata=metadata)
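Beyond black-style reformatting, the whole `load_query_result` method is gone; SQL-query ingestion is now handled by `KnowledgeBaseTable.insert_query_result` (see the controller diff above), which is also why the `pandas` import could be dropped. The loader keeps the same split-then-merge-metadata pattern for files and web pages; a stripped-down sketch of that pattern using plain dicts in place of the real `Document` classes and `FileSplitter`:

```python
# Illustrative only: dicts stand in for LangchainDocument/Document, and the "splitter"
# is a trivial paragraph split instead of MindsDB's FileSplitter.
def split_documents(docs):
    for doc in docs:
        for chunk in doc["page_content"].split("\n\n"):
            yield {"page_content": chunk, "metadata": {"chunk": True}}


def load(doc):
    for split_doc in split_documents([doc]):
        # Copy the source document's metadata and overlay whatever the splitter produced.
        metadata = doc["metadata"].copy()
        metadata.update(split_doc["metadata"] or {})
        yield {"content": split_doc["page_content"], "metadata": metadata}


page = {
    "page_content": "first part\n\nsecond part",
    "metadata": {"extension": ".html", "url": "https://example.com"},
}
for d in load(page):
    print(d)
```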
mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py

@@ -3,6 +3,7 @@ import re
 import json
 from pydantic import BaseModel, Field
 from langchain_core.tools import BaseTool
+from mindsdb_sql_parser.ast import Describe, Select, Identifier, Constant, Star
 
 
 class KnowledgeBaseListToolInput(BaseModel):
@@ -55,6 +56,26 @@ class KnowledgeBaseInfoTool(BaseTool):
         except (json.JSONDecodeError, TypeError):
             pass
 
+        def strip(s):
+            length = -1
+            while length != len(s):
+                length = len(s)
+
+                # remove ```
+                if s.startswith("```"):
+                    s = s[3:]
+                if s.endswith("```"):
+                    s = s[:-3]
+
+                # remove trailing new lines
+                s = s.strip("\n")
+
+                # remove extra quotes
+                for q in ('"', "'", "`"):
+                    if s.count(q) == 1:
+                        s = s.strip(q)
+            return s
+
 
         # Finally, try the original regex pattern for $START$ and $STOP$ markers
         match = re.search(r"\$START\$(.*?)\$STOP\$", tool_input, re.DOTALL)
@@ -63,12 +84,14 @@ class KnowledgeBaseInfoTool(BaseTool):
                 return [kb.strip() for kb in tool_input.split(",")]
             # If it's just a single string without formatting, return it as a single item
             if tool_input.strip():
-                return [tool_input.strip()]
+                return [strip(tool_input)]
             return []
 
         # Extract and clean the knowledge base names
         kb_names_str = match.group(1).strip()
         kb_names = re.findall(r"`([^`]+)`", kb_names_str)
+
+        kb_names = [strip(n) for n in kb_names]
         return kb_names
 
     def _run(self, tool_input: str) -> str:
@@ -83,7 +106,7 @@ class KnowledgeBaseInfoTool(BaseTool):
         for kb_name in kb_names:
             try:
                 # Get knowledge base schema
-                schema_result = self.db.run_no_throw(f"DESCRIBE KNOWLEDGE_BASE `{kb_name}`;")
+                schema_result = self.db.run_no_throw(str(Describe(kb_name, type="knowledge_base")))
 
                 if not schema_result:
                     results.append(f"Knowledge base `{kb_name}` not found or has no schema information.")
@@ -111,7 +134,9 @@ class KnowledgeBaseInfoTool(BaseTool):
                 kb_info += "```\n\n"
 
                 # Get sample data
-                sample_data = self.db.run_no_throw(f"SELECT * FROM `{kb_name}` LIMIT 10;")
+                sample_data = self.db.run_no_throw(
+                    str(Select(targets=[Star()], from_table=Identifier(kb_name), limit=Constant(20)))
+                )
 
                 # Sample data
                 kb_info += "### Sample Data:\n"
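The new `strip` helper repeatedly peels code fences, surrounding newlines, and unbalanced quotes or backticks off LLM-produced knowledge-base names before they are used to build queries. A standalone copy with a few example inputs (the sample strings are illustrative):

````python
def strip(s):
    # Keep stripping until the string stops changing.
    length = -1
    while length != len(s):
        length = len(s)

        # remove ``` fences
        if s.startswith("```"):
            s = s[3:]
        if s.endswith("```"):
            s = s[:-3]

        # remove surrounding newlines
        s = s.strip("\n")

        # remove a single unbalanced quote or backtick
        for q in ('"', "'", "`"):
            if s.count(q) == 1:
                s = s.strip(q)
    return s


print(strip("```\nmy_kb\n```"))  # my_kb
print(strip("`sales_kb"))        # sales_kb
print(strip("'docs_kb'"))        # 'docs_kb' (two quotes are balanced, so it is left intact)
````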
mindsdb/interfaces/skills/skills_controller.py

@@ -6,8 +6,6 @@ from sqlalchemy.orm.attributes import flag_modified
 
 from mindsdb.interfaces.storage import db
 from mindsdb.interfaces.database.projects import ProjectController
-from mindsdb.interfaces.data_catalog.data_catalog_loader import DataCatalogLoader
-from mindsdb.interfaces.skills.skill_tool import SkillType
 from mindsdb.utilities.config import config
 from mindsdb.utilities import log
 
@@ -99,27 +97,6 @@ class SkillsController:
         if skill is not None:
             raise ValueError(f"Skill with name already exists: {name}")
 
-        # Load metadata to data catalog (if enabled) if the skill is Text-to-SQL.
-        if config.get("data_catalog", {}).get("enabled", False):
-            if type == SkillType.TEXT2SQL.value and "include_tables" in params:
-                # TODO: Is it possible to create a skill with complete access to the database with the new agent syntax?
-                # TODO: Handle the case where `ignore_tables` is provided. Is this a valid parameter?
-                # TODO: Knowledge Bases?
-                database_table_map = {}
-                for table in params["include_tables"]:
-                    parts = table.split(".", 1)
-                    database_table_map[parts[0]] = database_table_map.get(parts[0], []) + [parts[1]]
-
-                for database_name, table_names in database_table_map.items():
-                    data_catalog_loader = DataCatalogLoader(database_name=database_name, table_names=table_names)
-                    data_catalog_loader.load_metadata()
-
-            elif type in [SkillType.TEXT2SQL.value, SkillType.TEXT2SQL_LEGACY.value] and "database" in params:
-                data_catalog_loader = DataCatalogLoader(
-                    database_name=params["database"], table_names=params["tables"] if "tables" in params else None
-                )
-                data_catalog_loader.load_metadata()
-
         new_skill = db.Skills(
             name=name,
             project_id=project.id,
mindsdb/interfaces/skills/sql_agent.py

@@ -76,7 +76,7 @@ def split_table_name(table_name: str) -> List[str]:
     result.append(current.strip("`"))
 
     # ensure we split the table name
-    result = [r.split(".") for r in result][0]
+    # result = [r.split(".") for r in result][0]
 
     return result
 
@@ -402,11 +402,15 @@ class SQLAgent:
         """
         if config.get("data_catalog", {}).get("enabled", False):
            database_table_map = {}
-            for name in self.get_usable_table_names():
+            for name in table_names or self.get_usable_table_names():
                name = name.replace("`", "")
 
-                # TODO: Can there be situations where the database name is returned from the above method?
                parts = name.split(".", 1)
+                # TODO: Will there be situations where parts has more than 2 elements? Like a schema?
+                # This is unlikely given that we default to a single schema per database.
+                if len(parts) == 1:
+                    raise ValueError(f"Invalid table name: {name}. Expected format is 'database.table'.")
+
                database_table_map[parts[0]] = database_table_map.get(parts[0], []) + [parts[1]]
 
            data_catalog_str = ""
@@ -430,8 +434,8 @@ class SQLAgent:
            else:
                all_tables.append(Identifier(name))
 
-        # if table_names is not None:
-        #     all_tables = self._resolve_table_names(table_names, all_tables)
+        if table_names is not None:
+            all_tables = self._resolve_table_names(table_names, all_tables)
 
        tables_info = []
        for table in all_tables:
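With this change the data-catalog path iterates over the caller-supplied `table_names` when given and rejects any name that is not qualified as `database.table`. A small standalone sketch of that grouping and validation logic (extracted from the diff; the sample names are illustrative):

```python
def group_tables(table_names):
    # Group fully qualified "database.table" names by database, as the SQLAgent hunk does.
    database_table_map = {}
    for name in table_names:
        name = name.replace("`", "")
        parts = name.split(".", 1)
        if len(parts) == 1:
            raise ValueError(f"Invalid table name: {name}. Expected format is 'database.table'.")
        database_table_map[parts[0]] = database_table_map.get(parts[0], []) + [parts[1]]
    return database_table_map


print(group_tables(["postgres.orders", "postgres.customers", "mysql.users"]))
# {'postgres': ['orders', 'customers'], 'mysql': ['users']}
```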
mindsdb/interfaces/storage/db.py

@@ -1,6 +1,6 @@
 import json
 import datetime
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 import numpy as np
 from sqlalchemy import (
@@ -494,17 +494,33 @@ class KnowledgeBase(Base):
 
     __table_args__ = (UniqueConstraint("name", "project_id", name="unique_knowledge_base_name_project_id"),)
 
-    def as_dict(self) -> Dict:
+    def as_dict(self, with_secrets: Optional[bool] = True) -> Dict:
+        params = self.params.copy()
+        embedding_model = params.pop("embedding_model", None)
+        reranking_model = params.pop("reranking_model", None)
+
+        if not with_secrets:
+            if embedding_model and "api_key" in embedding_model:
+                embedding_model["api_key"] = "******"
+
+            if reranking_model and "api_key" in reranking_model:
+                reranking_model["api_key"] = "******"
+
         return {
             "id": self.id,
             "name": self.name,
             "project_id": self.project_id,
-            "embedding_model": None if self.embedding_model is None else self.embedding_model.name,
             "vector_database": None if self.vector_database is None else self.vector_database.name,
             "vector_database_table": self.vector_database_table,
             "updated_at": self.updated_at,
             "created_at": self.created_at,
-            "params": self.params,
+            "query_id": self.query_id,
+            "embedding_model": embedding_model,
+            "reranking_model": reranking_model,
+            "metadata_columns": params.pop("metadata_columns", None),
+            "content_columns": params.pop("content_columns", None),
+            "id_column": params.pop("id_column", None),
+            "params": params,
         }
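The reworked `as_dict` lifts `embedding_model`, `reranking_model`, and the column settings out of `params`, and masks any `api_key` when called with `with_secrets=False` (as the controller now does via `self.session.show_secrets`). A minimal sketch of the masking behaviour on a plain params dict; the helper name and sample values are illustrative:

```python
def mask_models(params: dict, with_secrets: bool = True) -> dict:
    # Note: like the original, this is a shallow copy, so the nested model dicts are shared.
    params = params.copy()
    embedding_model = params.pop("embedding_model", None)
    reranking_model = params.pop("reranking_model", None)

    if not with_secrets:
        if embedding_model and "api_key" in embedding_model:
            embedding_model["api_key"] = "******"
        if reranking_model and "api_key" in reranking_model:
            reranking_model["api_key"] = "******"

    return {"embedding_model": embedding_model, "reranking_model": reranking_model, "params": params}


kb_params = {
    "embedding_model": {"provider": "openai", "model_name": "text-embedding-3-large", "api_key": "sk-..."},
    "content_columns": ["body"],
}
print(mask_models(kb_params, with_secrets=False))
# {'embedding_model': {'provider': 'openai', 'model_name': 'text-embedding-3-large', 'api_key': '******'},
#  'reranking_model': None, 'params': {'content_columns': ['body']}}
```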
mindsdb/utilities/config.py

@@ -400,7 +400,11 @@ class Config:
             bool: True if config was loaded or updated
         """
 
-        if self.auto_config_path.is_file() and self.auto_config_mtime != self.auto_config_path.stat().st_mtime:
+        if (
+            self.auto_config_path.is_file()
+            and self.auto_config_path.read_text() != ""
+            and self.auto_config_mtime != self.auto_config_path.stat().st_mtime
+        ):
             try:
                 self._auto_config = json.loads(self.auto_config_path.read_text())
             except json.JSONDecodeError as e:
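The reload guard now also requires the auto-config file to be non-empty before comparing mtimes and re-parsing it, so an empty or not-yet-written file is simply skipped. A hedged sketch of the same guard outside the `Config` class; the path and the module-level variables are illustrative:

```python
import json
from pathlib import Path

auto_config_path = Path("auto.json")  # illustrative path
auto_config_mtime = None              # mtime recorded at the previous successful load

# Only parse the auto-config when it exists, is non-empty, and has changed since the last load.
if (
    auto_config_path.is_file()
    and auto_config_path.read_text() != ""
    and auto_config_mtime != auto_config_path.stat().st_mtime
):
    try:
        auto_config = json.loads(auto_config_path.read_text())
        auto_config_mtime = auto_config_path.stat().st_mtime
    except json.JSONDecodeError as e:
        print(f"Could not load auto config: {e}")
```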