MindsDB 25.7.1.0__py3-none-any.whl → 25.7.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (38)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +54 -95
  3. mindsdb/api/a2a/agent.py +30 -206
  4. mindsdb/api/a2a/common/server/server.py +26 -27
  5. mindsdb/api/a2a/task_manager.py +93 -227
  6. mindsdb/api/a2a/utils.py +21 -0
  7. mindsdb/api/executor/command_executor.py +7 -2
  8. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +5 -1
  9. mindsdb/api/executor/utilities/sql.py +97 -21
  10. mindsdb/api/http/namespaces/agents.py +127 -202
  11. mindsdb/api/http/namespaces/config.py +12 -1
  12. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +11 -1
  13. mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
  14. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +94 -1
  15. mindsdb/integrations/handlers/s3_handler/s3_handler.py +72 -70
  16. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +4 -3
  17. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +12 -3
  18. mindsdb/integrations/handlers/slack_handler/slack_tables.py +141 -161
  19. mindsdb/integrations/handlers/youtube_handler/youtube_tables.py +183 -55
  20. mindsdb/integrations/libs/keyword_search_base.py +41 -0
  21. mindsdb/integrations/libs/vectordatabase_handler.py +35 -14
  22. mindsdb/integrations/utilities/sql_utils.py +11 -0
  23. mindsdb/interfaces/agents/agents_controller.py +2 -2
  24. mindsdb/interfaces/data_catalog/data_catalog_loader.py +18 -4
  25. mindsdb/interfaces/database/projects.py +1 -3
  26. mindsdb/interfaces/functions/controller.py +54 -64
  27. mindsdb/interfaces/functions/to_markdown.py +47 -14
  28. mindsdb/interfaces/knowledge_base/controller.py +134 -35
  29. mindsdb/interfaces/knowledge_base/evaluate.py +53 -10
  30. mindsdb/interfaces/knowledge_base/llm_client.py +3 -3
  31. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +21 -13
  32. mindsdb/utilities/config.py +46 -39
  33. mindsdb/utilities/exception.py +11 -0
  34. {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/METADATA +236 -236
  35. {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/RECORD +38 -36
  36. {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/WHEEL +0 -0
  37. {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/licenses/LICENSE +0 -0
  38. {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
1
1
  import json
2
2
  import math
3
+ import re
3
4
  import time
4
5
  from typing import List
5
6
 
@@ -16,15 +17,15 @@ logger = log.getLogger(__name__)
16
17
 
17
18
 
18
19
  GENERATE_QA_SYSTEM_PROMPT = """
19
- Your task is to generate question and answer pairs for a search engine.
20
+ Your task is to generate question and answer pairs for a search engine.
20
21
  The search engine will take your query and return a list of documents.
21
22
  You will be given a text and you need to generate a question that can be answered using the information in the text.
22
23
  Your questions will be used to evaluate the search engine.
23
- Question should always have enough clues to identify the specific text that this question is generated from.
24
+ Question should always have enough clues to identify the specific text that this question is generated from.
24
25
  Never ask questions like "What license number is associated with Amend 6" because Amend 6 could be found in many documents and the question is not specific enough.
25
- Example output 1: {\"query\": \"What processor does the HP 2023 14\" FHD IPS Laptop use?\", \"reference_answer\": \"Ryzen 3 5300U\"}
26
+ Example output 1: {\"query\": \"What processor does the HP 2023 14\" FHD IPS Laptop use?\", \"reference_answer\": \"Ryzen 3 5300U\"}
26
27
  Example output 2: {\"query\": \"What is the name of the river in Paris?\", \"reference_answer\": \"Seine\"}
27
- Don't generate questions like "What is being amended in the application?" because these questions cannot be answered using the text and without knowing which document it refers to.
28
+ Don't generate questions like "What is being amended in the application?" because these questions cannot be answered using the text and without knowing which document it refers to.
28
29
  The question should be answerable without the text, but the answer should be present in the text.
29
30
  Return ONLY a json response. No other text.
30
31
  """
@@ -43,6 +44,39 @@ def calc_entropy(values: List[float]) -> float:
43
44
  return -sum([pk * math.log(pk) for pk in values])
44
45
 
45
46
 
47
def sanitize_json_response(response: str) -> str:
    """Extract the JSON object embedded in an LLM response.

    Markdown code-fence markers (``` or ```json / ```JSON at the start of a
    line, ``` at the end of a line) are removed, then the JSON object that
    begins at the first ``{`` is decoded and its exact source text returned.
    Any prose before the first ``{`` or after the end of the object is dropped.

    Args:
        response: Raw text returned by the LLM.

    Returns:
        The substring of ``response`` holding the JSON object, ready to be
        passed to ``json.loads``.

    Raises:
        ValueError: If the response is empty or whitespace-only, contains no
            ``{``, or the text starting at the first ``{`` is not a valid
            JSON object.
    """
    if not response or not response.strip():
        raise ValueError("Empty response provided.")

    # Remove markdown code block markers if present
    cleaned = re.sub(r"^```(?:json|JSON)?\s*", "", response.strip(), flags=re.MULTILINE)
    cleaned = re.sub(r"\s*```$", "", cleaned, flags=re.MULTILINE)
    cleaned = cleaned.strip()

    start_idx = cleaned.find("{")
    if start_idx == -1:
        raise ValueError("No JSON object found in the response.")

    # raw_decode parses exactly one JSON value beginning at start_idx and reports
    # where it ends, so trailing text is ignored in a single pass. Braces inside
    # string values and nested objects are handled by the JSON parser itself.
    try:
        _, end_idx = json.JSONDecoder().raw_decode(cleaned, start_idx)
    except json.JSONDecodeError:
        raise ValueError("No valid JSON object found in the response.") from None

    # A value that starts with "{" is always decoded to a dict, so no type check
    # is required here.
    return cleaned[start_idx:end_idx]
78
+
79
+
46
80
  class EvaluateBase:
47
81
  DEFAULT_QUESTION_COUNT = 20
48
82
  DEFAULT_SAMPLE_SIZE = 10000
@@ -90,7 +124,7 @@ class EvaluateBase:
90
124
  df = response.data_frame
91
125
 
92
126
  if "content" not in df.columns:
93
- raise ValueError("`content` column isn't found in source data")
127
+ raise ValueError(f"`content` column isn't found in provided sql: {gen_params['from_sql']}")
94
128
 
95
129
  df.rename(columns={"content": "chunk_content"}, inplace=True)
96
130
  else:
@@ -178,6 +212,7 @@ class EvaluateBase:
178
212
  test_data = self.read_from_table(test_table)
179
213
 
180
214
  scores = self.evaluate(test_data)
215
+ scores["id"] = math.floor(time.time()) # unique ID for the evaluation run
181
216
  scores["name"] = self.name
182
217
  scores["created_at"] = dt.datetime.now()
183
218
 
@@ -186,7 +221,7 @@ class EvaluateBase:
186
221
  to_table = params["save_to"]
187
222
  if isinstance(to_table, str):
188
223
  to_table = Identifier(to_table)
189
- self.save_to_table(to_table, scores)
224
+ self.save_to_table(to_table, scores.copy())
190
225
 
191
226
  return scores
192
227
 
@@ -237,9 +272,13 @@ class EvaluateRerank(EvaluateBase):
237
272
  {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
238
273
  {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
239
274
  ]
240
- answer = self.llm_client.completion(messages)
275
+ answer = self.llm_client.completion(messages, json_output=True)
276
+
277
+ # Sanitize the response by removing markdown code block formatting like ```json
278
+ sanitized_answer = sanitize_json_response(answer)
279
+
241
280
  try:
242
- output = json.loads(answer)
281
+ output = json.loads(sanitized_answer)
243
282
  except json.JSONDecodeError:
244
283
  raise ValueError(f"Could not parse response from LLM: {answer}")
245
284
 
@@ -448,9 +487,13 @@ class EvaluateDocID(EvaluateBase):
448
487
  {"role": "system", "content": GENERATE_QA_SYSTEM_PROMPT},
449
488
  {"role": "user", "content": f"\n\nText:\n{text}\n\n"},
450
489
  ]
451
- answer = self.llm_client.completion(messages)
490
+ answer = self.llm_client.completion(messages, json_output=True)
491
+
492
+ # Sanitize the response by removing markdown code block formatting like ```json
493
+ sanitized_answer = sanitize_json_response(answer)
494
+
452
495
  try:
453
- output = json.loads(answer)
496
+ output = json.loads(sanitized_answer)
454
497
  except json.JSONDecodeError:
455
498
  raise ValueError(f"Could not parse response from LLM: {answer}")
456
499
 
@@ -54,12 +54,12 @@ class LLMClient:
54
54
 
55
55
  self.client = module.Handler
56
56
 
57
- def completion(self, messages: List[dict]) -> str:
57
+ def completion(self, messages: List[dict], json_output: bool = False) -> str:
58
58
  """
59
59
  Call LLM completion and get response
60
60
  """
61
61
  params = self.params
62
-
62
+ params["json_output"] = json_output
63
63
  if self.provider in ("azure_openai", "openai"):
64
64
  response = self.client.chat.completions.create(
65
65
  model=params["model_name"],
@@ -69,6 +69,6 @@ class LLMClient:
69
69
  else:
70
70
  kwargs = params.copy()
71
71
  model = kwargs.pop("model_name")
72
-
72
+ kwargs.pop("provider", None)
73
73
  response = self.client.completion(self.provider, model=model, messages=messages, args=kwargs)
74
74
  return response.choices[0].message.content
@@ -1,16 +1,17 @@
1
+ import re
2
+ import html
3
+ import asyncio
1
4
  from typing import List, Dict, Optional, Any
5
+
2
6
  import pandas as pd
3
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter
4
- import asyncio
5
-
8
+ from langchain_core.documents import Document as LangchainDocument
6
9
 
7
10
  from mindsdb.integrations.utilities.rag.splitters.file_splitter import (
8
11
  FileSplitter,
9
12
  FileSplitterConfig,
10
13
  )
11
-
12
14
  from mindsdb.interfaces.agents.langchain_agent import create_chat_model
13
-
14
15
  from mindsdb.interfaces.knowledge_base.preprocessing.models import (
15
16
  PreprocessingConfig,
16
17
  ProcessedChunk,
@@ -21,7 +22,6 @@ from mindsdb.interfaces.knowledge_base.preprocessing.models import (
21
22
  )
22
23
  from mindsdb.utilities import log
23
24
 
24
- from langchain_core.documents import Document as LangchainDocument
25
25
 
26
26
  logger = log.getLogger(__name__)
27
27
 
@@ -123,11 +123,11 @@ class ContextualPreprocessor(DocumentPreprocessor):
123
123
 
124
124
  DEFAULT_CONTEXT_TEMPLATE = """
125
125
  <document>
126
- {{WHOLE_DOCUMENT}}
126
+ {WHOLE_DOCUMENT}
127
127
  </document>
128
128
  Here is the chunk we want to situate within the whole document
129
129
  <chunk>
130
- {{CHUNK_CONTENT}}
130
+ {CHUNK_CONTENT}
131
131
  </chunk>
132
132
  Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else."""
133
133
 
@@ -149,12 +149,20 @@ Please give a short succinct context to situate this chunk within the overall do
149
149
  self.summarize = self.config.summarize
150
150
 
151
151
  def _prepare_prompts(self, chunk_contents: list[str], full_documents: list[str]) -> list[str]:
152
- prompts = [
153
- self.context_template.replace("{{WHOLE_DOCUMENT}}", full_document) for full_document in full_documents
154
- ]
155
- prompts = [
156
- prompt.replace("{{CHUNK_CONTENT}}", chunk_content) for prompt, chunk_content in zip(prompts, chunk_contents)
157
- ]
152
+ def tag_replacer(match):
153
+ tag = match.group(0)
154
+ if tag.lower() not in ["<document>", "</document>", "<chunk>", "</chunk>"]:
155
+ return tag
156
+ return html.escape(tag)
157
+
158
+ tag_pattern = r"</?document>|</?chunk>"
159
+ prompts = []
160
+ for chunk_content, full_document in zip(chunk_contents, full_documents):
161
+ chunk_content = re.sub(tag_pattern, tag_replacer, chunk_content, flags=re.IGNORECASE)
162
+ full_document = re.sub(tag_pattern, tag_replacer, full_document, flags=re.IGNORECASE)
163
+ prompts.append(
164
+ self.DEFAULT_CONTEXT_TEMPLATE.format(WHOLE_DOCUMENT=full_document, CHUNK_CONTENT=chunk_content)
165
+ )
158
166
 
159
167
  return prompts
160
168
 
@@ -28,6 +28,13 @@ def _merge_configs(original_config: dict, override_config: dict) -> dict:
28
28
  return original_config
29
29
 
30
30
 
31
def _overwrite_configs(original_config: dict, override_config: dict) -> dict:
    """Replace top-level keys of ``original_config`` with those of ``override_config``.

    This is a shallow overwrite: each key present in ``override_config`` has its
    value replaced wholesale in ``original_config``, so a nested dictionary
    under such a key is discarded rather than merged. Keys absent from
    ``override_config`` are left untouched.

    Args:
        original_config: Configuration to modify in place.
        override_config: Values that take precedence.

    Returns:
        The same ``original_config`` object, after modification.
    """
    original_config.update(override_config)
    return original_config
36
+
37
+
31
38
  def create_data_dir(path: Path) -> None:
32
39
  """Create a directory and checks that it is writable.
33
40
 
@@ -196,6 +203,15 @@ class Config:
196
203
  "host": "0.0.0.0", # API server binds to all interfaces by default
197
204
  "port": "8000",
198
205
  },
206
+ "a2a": {
207
+ "host": api_host,
208
+ "port": 47338,
209
+ "mindsdb_host": "localhost",
210
+ "mindsdb_port": 47334,
211
+ "agent_name": "my_agent",
212
+ "project_name": "mindsdb",
213
+ "enabled": False,
214
+ },
199
215
  },
200
216
  "cache": {"type": "local"},
201
217
  "ml_task_queue": {"type": "local"},
@@ -209,15 +225,6 @@ class Config:
209
225
  "default_llm": {},
210
226
  "default_embedding_model": {},
211
227
  "default_reranking_model": {},
212
- "a2a": {
213
- "host": "localhost",
214
- "port": 47338,
215
- "mindsdb_host": "localhost",
216
- "mindsdb_port": 47334,
217
- "agent_name": "my_agent",
218
- "project_name": "mindsdb",
219
- "enabled": False,
220
- },
221
228
  "data_catalog": {
222
229
  "enabled": False,
223
230
  },
@@ -243,12 +250,11 @@ class Config:
243
250
  """Collect config values from env vars to self._env_config"""
244
251
  self._env_config = {
245
252
  "logging": {"handlers": {"console": {}, "file": {}}},
246
- "api": {"http": {"server": {}}},
253
+ "api": {"http": {"server": {}}, "a2a": {}},
247
254
  "auth": {},
248
255
  "paths": {},
249
256
  "permanent_storage": {},
250
257
  "ml_task_queue": {},
251
- "a2a": {},
252
258
  }
253
259
 
254
260
  # region storage root path
@@ -390,7 +396,7 @@ class Config:
390
396
  )
391
397
 
392
398
  if a2a_config:
393
- self._env_config["a2a"] = a2a_config
399
+ self._env_config["api"]["a2a"] = a2a_config
394
400
  # endregion
395
401
 
396
402
  def fetch_auto_config(self) -> bool:
@@ -457,47 +463,36 @@ class Config:
457
463
  _merge_configs(new_config, self._env_config)
458
464
 
459
465
  # Apply command-line arguments for A2A
460
- cmd_args_config = {}
466
+ a2a_config = {}
461
467
 
462
468
  # Check for A2A command-line arguments
463
469
  if hasattr(self.cmd_args, "a2a_host") and self.cmd_args.a2a_host is not None:
464
- if "a2a" not in cmd_args_config:
465
- cmd_args_config["a2a"] = {}
466
- cmd_args_config["a2a"]["host"] = self.cmd_args.a2a_host
470
+ a2a_config["host"] = self.cmd_args.a2a_host
467
471
 
468
472
  if hasattr(self.cmd_args, "a2a_port") and self.cmd_args.a2a_port is not None:
469
- if "a2a" not in cmd_args_config:
470
- cmd_args_config["a2a"] = {}
471
- cmd_args_config["a2a"]["port"] = self.cmd_args.a2a_port
473
+ a2a_config["port"] = self.cmd_args.a2a_port
472
474
 
473
475
  if hasattr(self.cmd_args, "mindsdb_host") and self.cmd_args.mindsdb_host is not None:
474
- if "a2a" not in cmd_args_config:
475
- cmd_args_config["a2a"] = {}
476
- cmd_args_config["a2a"]["mindsdb_host"] = self.cmd_args.mindsdb_host
476
+ a2a_config["mindsdb_host"] = self.cmd_args.mindsdb_host
477
477
 
478
478
  if hasattr(self.cmd_args, "mindsdb_port") and self.cmd_args.mindsdb_port is not None:
479
- if "a2a" not in cmd_args_config:
480
- cmd_args_config["a2a"] = {}
481
- cmd_args_config["a2a"]["mindsdb_port"] = self.cmd_args.mindsdb_port
479
+ a2a_config["mindsdb_port"] = self.cmd_args.mindsdb_port
482
480
 
483
481
  if hasattr(self.cmd_args, "agent_name") and self.cmd_args.agent_name is not None:
484
- if "a2a" not in cmd_args_config:
485
- cmd_args_config["a2a"] = {}
486
- cmd_args_config["a2a"]["agent_name"] = self.cmd_args.agent_name
482
+ a2a_config["agent_name"] = self.cmd_args.agent_name
487
483
 
488
484
  if hasattr(self.cmd_args, "project_name") and self.cmd_args.project_name is not None:
489
- if "a2a" not in cmd_args_config:
490
- cmd_args_config["a2a"] = {}
491
- cmd_args_config["a2a"]["project_name"] = self.cmd_args.project_name
485
+ a2a_config["project_name"] = self.cmd_args.project_name
492
486
 
493
487
  # Merge command-line args config with highest priority
494
- if cmd_args_config:
495
- _merge_configs(new_config, cmd_args_config)
488
+ if a2a_config:
489
+ _merge_configs(new_config, {"api": {"a2a": a2a_config}})
496
490
 
497
491
  # Ensure A2A port is never 0, which would prevent the A2A API from starting
498
- if "a2a" in new_config and isinstance(new_config["a2a"], dict):
499
- if "port" in new_config["a2a"] and (new_config["a2a"]["port"] == 0 or new_config["a2a"]["port"] is None):
500
- new_config["a2a"]["port"] = 47338 # Use the default port value
492
+ a2a_config = new_config["api"].get("a2a")
493
+ if a2a_config is not None and isinstance(a2a_config, dict):
494
+ if "port" in a2a_config and (a2a_config["port"] == 0 or a2a_config["port"] is None):
495
+ a2a_config["port"] = 47338 # Use the default port value
501
496
 
502
497
  # region create dirs
503
498
  for key, value in new_config["paths"].items():
@@ -522,11 +517,23 @@ class Config:
522
517
  self.ensure_auto_config_is_relevant()
523
518
  return self._config
524
519
 
525
- def update(self, data: dict) -> None:
526
- """Update calues in `auto` config"""
520
+ def update(self, data: dict, overwrite: bool = False) -> None:
521
+ """
522
+ Update values in `auto` config.
523
+ Args:
524
+ data (dict): data to update in `auto` config.
525
+ overwrite (bool): if True, overwrite existing keys, otherwise merge them.
526
+ - False (default): Merge recursively. Existing nested dictionaries are preserved
527
+ and only the specified keys in `data` are updated.
528
+ - True: Overwrite completely. Existing keys are replaced entirely with values
529
+ from `data`, discarding any nested structure not present in `data`.
530
+ """
527
531
  self.ensure_auto_config_is_relevant()
528
532
 
529
- _merge_configs(self._auto_config, data)
533
+ if overwrite:
534
+ _overwrite_configs(self._auto_config, data)
535
+ else:
536
+ _merge_configs(self._auto_config, data)
530
537
 
531
538
  self.auto_config_path.write_text(json.dumps(self._auto_config, indent=4))
532
539
 
@@ -40,6 +40,7 @@ def format_db_error_message(
40
40
  db_type: str | None = None,
41
41
  db_error_msg: str | None = None,
42
42
  failed_query: str | None = None,
43
+ is_external: bool = True,
43
44
  ) -> str:
44
45
  """Format the error message for the database query.
45
46
 
@@ -48,11 +49,21 @@ def format_db_error_message(
48
49
  db_type (str | None): The type of the database.
49
50
  db_error_msg (str | None): The error message.
50
51
  failed_query (str | None): The failed query.
52
+ is_external (bool): True if error appeared in external database, False if in internal duckdb
51
53
 
52
54
  Returns:
53
55
  str: The formatted error message.
54
56
  """
55
57
  error_message = "Failed to execute external database query during query processing."
58
+ if is_external:
59
+ error_message = (
60
+ "An error occurred while executing a derived query on the external "
61
+ "database during processing of your original SQL query."
62
+ )
63
+ else:
64
+ error_message = (
65
+ "An error occurred while processing an internally generated query derived from your original SQL statement."
66
+ )
56
67
  if db_name is not None or db_type is not None:
57
68
  error_message += "\n\nDatabase Details:"
58
69
  if db_name is not None: