MindsDB 25.7.3.0__py3-none-any.whl → 25.8.2.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (102)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +11 -1
  3. mindsdb/api/a2a/common/server/server.py +16 -6
  4. mindsdb/api/executor/command_executor.py +215 -150
  5. mindsdb/api/executor/datahub/datanodes/project_datanode.py +14 -3
  6. mindsdb/api/executor/planner/plan_join.py +3 -0
  7. mindsdb/api/executor/planner/plan_join_ts.py +117 -100
  8. mindsdb/api/executor/planner/query_planner.py +1 -0
  9. mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +54 -85
  10. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +21 -24
  11. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +9 -3
  12. mindsdb/api/executor/sql_query/steps/subselect_step.py +11 -8
  13. mindsdb/api/executor/utilities/mysql_to_duckdb_functions.py +264 -0
  14. mindsdb/api/executor/utilities/sql.py +30 -0
  15. mindsdb/api/http/initialize.py +18 -44
  16. mindsdb/api/http/namespaces/agents.py +23 -20
  17. mindsdb/api/http/namespaces/chatbots.py +83 -120
  18. mindsdb/api/http/namespaces/file.py +1 -1
  19. mindsdb/api/http/namespaces/jobs.py +38 -60
  20. mindsdb/api/http/namespaces/tree.py +69 -61
  21. mindsdb/api/http/namespaces/views.py +56 -72
  22. mindsdb/api/mcp/start.py +2 -0
  23. mindsdb/api/mysql/mysql_proxy/utilities/dump.py +3 -2
  24. mindsdb/integrations/handlers/autogluon_handler/requirements.txt +1 -1
  25. mindsdb/integrations/handlers/autosklearn_handler/requirements.txt +1 -1
  26. mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +25 -5
  27. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +3 -3
  28. mindsdb/integrations/handlers/db2_handler/db2_handler.py +19 -23
  29. mindsdb/integrations/handlers/flaml_handler/requirements.txt +1 -1
  30. mindsdb/integrations/handlers/gong_handler/__about__.py +2 -0
  31. mindsdb/integrations/handlers/gong_handler/__init__.py +30 -0
  32. mindsdb/integrations/handlers/gong_handler/connection_args.py +37 -0
  33. mindsdb/integrations/handlers/gong_handler/gong_handler.py +164 -0
  34. mindsdb/integrations/handlers/gong_handler/gong_tables.py +508 -0
  35. mindsdb/integrations/handlers/gong_handler/icon.svg +25 -0
  36. mindsdb/integrations/handlers/gong_handler/test_gong_handler.py +125 -0
  37. mindsdb/integrations/handlers/google_calendar_handler/google_calendar_tables.py +82 -73
  38. mindsdb/integrations/handlers/hubspot_handler/requirements.txt +1 -1
  39. mindsdb/integrations/handlers/huggingface_handler/__init__.py +8 -12
  40. mindsdb/integrations/handlers/huggingface_handler/finetune.py +203 -223
  41. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +360 -383
  42. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -7
  43. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -7
  44. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  45. mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +83 -77
  46. mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
  47. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +5 -2
  48. mindsdb/integrations/handlers/litellm_handler/settings.py +2 -1
  49. mindsdb/integrations/handlers/openai_handler/constants.py +11 -30
  50. mindsdb/integrations/handlers/openai_handler/helpers.py +27 -34
  51. mindsdb/integrations/handlers/openai_handler/openai_handler.py +14 -12
  52. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +106 -90
  53. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +41 -39
  54. mindsdb/integrations/handlers/salesforce_handler/constants.py +215 -0
  55. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +141 -80
  56. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +0 -1
  57. mindsdb/integrations/handlers/tpot_handler/requirements.txt +1 -1
  58. mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +32 -17
  59. mindsdb/integrations/handlers/web_handler/web_handler.py +19 -22
  60. mindsdb/integrations/libs/llm/config.py +0 -14
  61. mindsdb/integrations/libs/llm/utils.py +0 -15
  62. mindsdb/integrations/libs/vectordatabase_handler.py +10 -1
  63. mindsdb/integrations/utilities/files/file_reader.py +5 -19
  64. mindsdb/integrations/utilities/handler_utils.py +32 -12
  65. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +1 -1
  66. mindsdb/interfaces/agents/agents_controller.py +246 -149
  67. mindsdb/interfaces/agents/constants.py +0 -1
  68. mindsdb/interfaces/agents/langchain_agent.py +11 -6
  69. mindsdb/interfaces/data_catalog/data_catalog_loader.py +4 -4
  70. mindsdb/interfaces/database/database.py +38 -13
  71. mindsdb/interfaces/database/integrations.py +20 -5
  72. mindsdb/interfaces/database/projects.py +174 -23
  73. mindsdb/interfaces/database/views.py +86 -60
  74. mindsdb/interfaces/jobs/jobs_controller.py +103 -110
  75. mindsdb/interfaces/knowledge_base/controller.py +33 -6
  76. mindsdb/interfaces/knowledge_base/evaluate.py +2 -1
  77. mindsdb/interfaces/knowledge_base/executor.py +24 -0
  78. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +6 -10
  79. mindsdb/interfaces/knowledge_base/preprocessing/text_splitter.py +73 -0
  80. mindsdb/interfaces/query_context/context_controller.py +111 -145
  81. mindsdb/interfaces/skills/skills_controller.py +18 -6
  82. mindsdb/interfaces/storage/db.py +40 -6
  83. mindsdb/interfaces/variables/variables_controller.py +8 -15
  84. mindsdb/utilities/config.py +5 -3
  85. mindsdb/utilities/fs.py +54 -17
  86. mindsdb/utilities/functions.py +72 -60
  87. mindsdb/utilities/log.py +38 -6
  88. mindsdb/utilities/ps.py +7 -7
  89. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/METADATA +282 -268
  90. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/RECORD +94 -92
  91. mindsdb/integrations/handlers/anyscale_endpoints_handler/__about__.py +0 -9
  92. mindsdb/integrations/handlers/anyscale_endpoints_handler/__init__.py +0 -20
  93. mindsdb/integrations/handlers/anyscale_endpoints_handler/anyscale_endpoints_handler.py +0 -290
  94. mindsdb/integrations/handlers/anyscale_endpoints_handler/creation_args.py +0 -14
  95. mindsdb/integrations/handlers/anyscale_endpoints_handler/icon.svg +0 -4
  96. mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -2
  97. mindsdb/integrations/handlers/anyscale_endpoints_handler/settings.py +0 -51
  98. mindsdb/integrations/handlers/anyscale_endpoints_handler/tests/test_anyscale_endpoints_handler.py +0 -212
  99. /mindsdb/integrations/handlers/{anyscale_endpoints_handler/tests/__init__.py → gong_handler/requirements.txt} +0 -0
  100. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/WHEEL +0 -0
  101. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/licenses/LICENSE +0 -0
  102. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/top_level.txt +0 -0
@@ -7,17 +7,11 @@ from mindsdb.utilities.security import validate_urls
7
7
  from .urlcrawl_helpers import get_all_websites
8
8
 
9
9
  from mindsdb.integrations.libs.api_handler import APIResource, APIHandler
10
- from mindsdb.integrations.utilities.sql_utils import (FilterCondition, FilterOperator)
10
+ from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
11
11
 
12
12
 
13
13
  class CrawlerTable(APIResource):
14
-
15
- def list(
16
- self,
17
- conditions: List[FilterCondition] = None,
18
- limit: int = None,
19
- **kwargs
20
- ) -> pd.DataFrame:
14
+ def list(self, conditions: List[FilterCondition] = None, limit: int = None, **kwargs) -> pd.DataFrame:
21
15
  """
22
16
  Selects data from the provided websites
23
17
 
@@ -30,27 +24,34 @@ class CrawlerTable(APIResource):
30
24
  urls = []
31
25
  crawl_depth = None
32
26
  per_url_limit = None
27
+ headers = {}
33
28
  for condition in conditions:
34
- if condition.column == 'url':
29
+ if condition.column == "url":
35
30
  if condition.op == FilterOperator.IN:
36
31
  urls = condition.value
37
32
  elif condition.op == FilterOperator.EQUAL:
38
33
  urls = [condition.value]
39
34
  condition.applied = True
40
- if condition.column == 'crawl_depth' and condition.op == FilterOperator.EQUAL:
35
+ if condition.column == "crawl_depth" and condition.op == FilterOperator.EQUAL:
41
36
  crawl_depth = condition.value
42
37
  condition.applied = True
43
- if condition.column == 'per_url_limit' and condition.op == FilterOperator.EQUAL:
38
+ if condition.column == "per_url_limit" and condition.op == FilterOperator.EQUAL:
44
39
  per_url_limit = condition.value
45
40
  condition.applied = True
41
+ if condition.column.lower() == "user_agent" and condition.op == FilterOperator.EQUAL:
42
+ headers["User-Agent"] = condition.value
43
+ condition.applied = True
46
44
 
47
45
  if len(urls) == 0:
48
46
  raise NotImplementedError(
49
- 'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"')
47
+ 'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"'
48
+ )
50
49
 
51
- allowed_urls = config.get('web_crawling_allowed_sites', [])
50
+ allowed_urls = config.get("web_crawling_allowed_sites", [])
52
51
  if allowed_urls and not validate_urls(urls, allowed_urls):
53
- raise ValueError(f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}.")
52
+ raise ValueError(
53
+ f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}."
54
+ )
54
55
 
55
56
  if limit is None and per_url_limit is None and crawl_depth is None:
56
57
  per_url_limit = 1
@@ -58,10 +59,10 @@ class CrawlerTable(APIResource):
58
59
  # crawl every url separately
59
60
  results = []
60
61
  for url in urls:
61
- results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth))
62
+ results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth, headers=headers))
62
63
  result = pd.concat(results)
63
64
  else:
64
- result = get_all_websites(urls, limit, crawl_depth=crawl_depth)
65
+ result = get_all_websites(urls, limit, crawl_depth=crawl_depth, headers=headers)
65
66
 
66
67
  if limit is not None and len(result) > limit:
67
68
  result = result[:limit]
@@ -72,11 +73,7 @@ class CrawlerTable(APIResource):
72
73
  """
73
74
  Returns the columns of the crawler table
74
75
  """
75
- return [
76
- 'url',
77
- 'text_content',
78
- 'error'
79
- ]
76
+ return ["url", "text_content", "error"]
80
77
 
81
78
 
82
79
  class WebHandler(APIHandler):
@@ -87,7 +84,7 @@ class WebHandler(APIHandler):
87
84
  def __init__(self, name=None, **kwargs):
88
85
  super().__init__(name)
89
86
  crawler = CrawlerTable(self)
90
- self._register_table('crawler', crawler)
87
+ self._register_table("crawler", crawler)
91
88
 
92
89
  def check_connection(self) -> HandlerStatusResponse:
93
90
  """
@@ -37,20 +37,6 @@ class AnthropicConfig(BaseLLMConfig):
37
37
  anthropic_api_url: Optional[str]
38
38
 
39
39
 
40
- # See https://api.python.langchain.com/en/latest/chat_models/langchain_community.chat_models.anyscale.ChatAnyscale.html
41
- # This config does not have to be exclusively used with Langchain.
42
- class AnyscaleConfig(BaseLLMConfig):
43
- model_name: str
44
- temperature: Optional[float]
45
- max_retries: Optional[int]
46
- max_tokens: Optional[int]
47
- anyscale_api_base: Optional[str]
48
- # Inferred from ANYSCALE_API_KEY if not provided.
49
- anyscale_api_key: Optional[str]
50
- anyscale_proxy: Optional[str]
51
- request_timeout: Optional[float]
52
-
53
-
54
40
  # See https://api.python.langchain.com/en/latest/chat_models/langchain_community.chat_models.litellm.ChatLiteLLM.html
55
41
  # This config does not have to be exclusively used with Langchain.
56
42
  class LiteLLMConfig(BaseLLMConfig):
@@ -8,7 +8,6 @@ import pandas as pd
8
8
 
9
9
  from mindsdb.integrations.libs.llm.config import (
10
10
  AnthropicConfig,
11
- AnyscaleConfig,
12
11
  BaseLLMConfig,
13
12
  GoogleConfig,
14
13
  LiteLLMConfig,
@@ -30,9 +29,6 @@ DEFAULT_OPENAI_MAX_RETRIES = 3
30
29
 
31
30
  DEFAULT_ANTHROPIC_MODEL = "claude-3-haiku-20240307"
32
31
 
33
- DEFAULT_ANYSCALE_MODEL = "meta-llama/Llama-2-7b-chat-hf"
34
- DEFAULT_ANYSCALE_BASE_URL = "https://api.endpoints.anyscale.com/v1"
35
-
36
32
  DEFAULT_GOOGLE_MODEL = "gemini-2.5-pro-preview-03-25"
37
33
 
38
34
  DEFAULT_LITELLM_MODEL = "gpt-3.5-turbo"
@@ -135,17 +131,6 @@ def get_llm_config(provider: str, args: Dict) -> BaseLLMConfig:
135
131
  anthropic_api_key=args["api_keys"].get("anthropic", None),
136
132
  anthropic_api_url=args.get("base_url", None),
137
133
  )
138
- if provider == "anyscale":
139
- return AnyscaleConfig(
140
- model_name=args.get("model_name", DEFAULT_ANYSCALE_MODEL),
141
- temperature=temperature,
142
- max_retries=args.get("max_retries", DEFAULT_OPENAI_MAX_RETRIES),
143
- max_tokens=args.get("max_tokens", DEFAULT_OPENAI_MAX_TOKENS),
144
- anyscale_api_base=args.get("base_url", DEFAULT_ANYSCALE_BASE_URL),
145
- anyscale_api_key=args["api_keys"].get("anyscale", None),
146
- anyscale_proxy=args.get("proxy", None),
147
- request_timeout=args.get("request_timeout", None),
148
- )
149
134
  if provider == "litellm":
150
135
  model_kwargs = {
151
136
  "api_key": args["api_keys"].get("litellm", None),
@@ -334,12 +334,21 @@ class VectorStoreHandler(BaseHandler):
334
334
 
335
335
  if not df_update.empty:
336
336
  # get values of existed `created_at` and return them to metadata
337
- created_dates = {row[id_col]: row[metadata_col].get("_created_at") for _, row in df_existed.iterrows()}
337
+ origin_id_col = "_original_doc_id"
338
+
339
+ created_dates, ids = {}, {}
340
+ for _, row in df_existed.iterrows():
341
+ chunk_id = row[id_col]
342
+ created_dates[chunk_id] = row[metadata_col].get("_created_at")
343
+ ids[chunk_id] = row[metadata_col].get(origin_id_col)
338
344
 
339
345
  def keep_created_at(row):
340
346
  val = created_dates.get(row[id_col])
341
347
  if val:
342
348
  row[metadata_col]["_created_at"] = val
349
+ # keep id column
350
+ if origin_id_col not in row[metadata_col]:
351
+ row[metadata_col][origin_id_col] = ids.get(row[id_col])
343
352
  return row
344
353
 
345
354
  df_update.apply(keep_created_at, axis=1)
@@ -10,6 +10,7 @@ from typing import List, Generator
10
10
  import filetype
11
11
  import pandas as pd
12
12
  from charset_normalizer import from_bytes
13
+ from mindsdb.interfaces.knowledge_base.preprocessing.text_splitter import TextSplitter
13
14
 
14
15
  from mindsdb.utilities import log
15
16
 
@@ -322,40 +323,25 @@ class FileReader(FormatDetector):
322
323
  @staticmethod
323
324
  def read_txt(file_obj: BytesIO, name: str | None = None, **kwargs) -> pd.DataFrame:
324
325
  # the lib is heavy, so import it only when needed
325
- from langchain_text_splitters import RecursiveCharacterTextSplitter
326
326
 
327
327
  file_obj = decode(file_obj)
328
328
 
329
- try:
330
- from langchain_core.documents import Document
331
- except ImportError:
332
- raise FileProcessingError(
333
- "To import TXT document please install 'langchain-community':\n pip install langchain-community"
334
- )
335
329
  text = file_obj.read()
336
330
 
337
- metadata = {"source_file": name, "file_format": "txt"}
338
- documents = [Document(page_content=text, metadata=metadata)]
331
+ text_splitter = TextSplitter(chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP)
339
332
 
340
- text_splitter = RecursiveCharacterTextSplitter(
341
- chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP
342
- )
343
-
344
- docs = text_splitter.split_documents(documents)
345
- return pd.DataFrame([{"content": doc.page_content, "metadata": doc.metadata} for doc in docs])
333
+ docs = text_splitter.split_text(text)
334
+ return pd.DataFrame([{"content": doc, "metadata": {"source_file": name, "file_format": "txt"}} for doc in docs])
346
335
 
347
336
  @staticmethod
348
337
  def read_pdf(file_obj: BytesIO, name: str | None = None, **kwargs) -> pd.DataFrame:
349
338
  # the libs are heavy, so import it only when needed
350
339
  import fitz # pymupdf
351
- from langchain_text_splitters import RecursiveCharacterTextSplitter
352
340
 
353
341
  with fitz.open(stream=file_obj.read()) as pdf: # open pdf
354
342
  text = chr(12).join([page.get_text() for page in pdf])
355
343
 
356
- text_splitter = RecursiveCharacterTextSplitter(
357
- chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP
358
- )
344
+ text_splitter = TextSplitter(chunk_size=DEFAULT_CHUNK_SIZE, chunk_overlap=DEFAULT_CHUNK_OVERLAP)
359
345
 
360
346
  split_text = text_splitter.split_text(text)
361
347
 
@@ -37,54 +37,74 @@ def get_api_key(
37
37
 
38
38
  # 1
39
39
  if "using" in create_args and f"{api_name.lower()}_api_key" in create_args["using"]:
40
- return create_args["using"][f"{api_name.lower()}_api_key"]
40
+ api_key = create_args["using"][f"{api_name.lower()}_api_key"]
41
+ if api_key:
42
+ return api_key
41
43
 
42
44
  # 1.5 - Check for generic api_key in using
43
45
  if "using" in create_args and "api_key" in create_args["using"]:
44
- return create_args["using"]["api_key"]
46
+ api_key = create_args["using"]["api_key"]
47
+ if api_key:
48
+ return api_key
45
49
 
46
50
  # 2
47
51
  if f"{api_name.lower()}_api_key" in create_args:
48
- return create_args[f"{api_name.lower()}_api_key"]
52
+ api_key = create_args[f"{api_name.lower()}_api_key"]
53
+ if api_key:
54
+ return api_key
49
55
 
50
56
  # 2.5 - Check for generic api_key
51
57
  if "api_key" in create_args:
52
- return create_args["api_key"]
58
+ api_key = create_args["api_key"]
59
+ if api_key:
60
+ return api_key
53
61
 
54
62
  # 3 - Check in params dictionary if it exists (for agents)
55
63
  if "params" in create_args and create_args["params"] is not None:
56
64
  if f"{api_name.lower()}_api_key" in create_args["params"]:
57
- return create_args["params"][f"{api_name.lower()}_api_key"]
65
+ api_key = create_args["params"][f"{api_name.lower()}_api_key"]
66
+ if api_key:
67
+ return api_key
58
68
  # 3.5 - Check for generic api_key in params
59
69
  if "api_key" in create_args["params"]:
60
- return create_args["params"]["api_key"]
70
+ api_key = create_args["params"]["api_key"]
71
+ if api_key:
72
+ return api_key
61
73
 
62
74
  # 4
63
75
  if engine_storage is not None:
64
76
  connection_args = engine_storage.get_connection_args()
65
77
  if f"{api_name.lower()}_api_key" in connection_args:
66
- return connection_args[f"{api_name.lower()}_api_key"]
78
+ api_key = connection_args[f"{api_name.lower()}_api_key"]
79
+ if api_key:
80
+ return api_key
67
81
  # 4.5 - Check for generic api_key in connection_args
68
82
  if "api_key" in connection_args:
69
- return connection_args["api_key"]
83
+ api_key = connection_args["api_key"]
84
+ if api_key:
85
+ return api_key
70
86
 
71
87
  # 5
72
88
  api_key = os.getenv(f"{api_name.lower()}_api_key")
73
- if api_key is not None:
89
+ if api_key:
74
90
  return api_key
75
91
  api_key = os.getenv(f"{api_name.upper()}_API_KEY")
76
- if api_key is not None:
92
+ if api_key:
77
93
  return api_key
78
94
 
79
95
  # 6
80
96
  config = Config()
81
97
  api_cfg = config.get(api_name, {})
82
98
  if f"{api_name.lower()}_api_key" in api_cfg:
83
- return api_cfg[f"{api_name.lower()}_api_key"]
99
+ api_key = api_cfg[f"{api_name.lower()}_api_key"]
100
+ if api_key:
101
+ return api_key
84
102
 
85
103
  # 7
86
104
  if "api_keys" in create_args and api_name in create_args["api_keys"]:
87
- return create_args["api_keys"][api_name]
105
+ api_key = create_args["api_keys"][api_name]
106
+ if api_key:
107
+ return api_key
88
108
 
89
109
  if strict:
90
110
  provider_upper = api_name.upper()
@@ -33,7 +33,7 @@ class BaseLLMReranker(BaseModel, ABC):
33
33
  client: Optional[AsyncOpenAI | BaseMLEngine] = None
34
34
  _semaphore: Optional[asyncio.Semaphore] = None
35
35
  max_concurrent_requests: int = 20
36
- max_retries: int = 2
36
+ max_retries: int = 4
37
37
  retry_delay: float = 1.0
38
38
  request_timeout: float = 20.0 # Timeout for API requests
39
39
  early_stop: bool = True # Whether to enable early stopping