MindsDB-25.1.2.0-py3-none-any.whl → MindsDB-25.1.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.
Files changed (99)
  1. {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.5.0.dist-info}/METADATA +258 -255
  2. {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.5.0.dist-info}/RECORD +98 -85
  3. {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.5.0.dist-info}/WHEEL +1 -1
  4. mindsdb/__about__.py +1 -1
  5. mindsdb/__main__.py +5 -3
  6. mindsdb/api/executor/__init__.py +0 -1
  7. mindsdb/api/executor/command_executor.py +2 -1
  8. mindsdb/api/executor/data_types/answer.py +1 -1
  9. mindsdb/api/executor/datahub/datanodes/datanode.py +1 -1
  10. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
  11. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +8 -3
  12. mindsdb/api/executor/datahub/datanodes/project_datanode.py +9 -26
  13. mindsdb/api/executor/sql_query/__init__.py +1 -0
  14. mindsdb/api/executor/sql_query/result_set.py +36 -21
  15. mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +1 -1
  16. mindsdb/api/executor/sql_query/steps/join_step.py +4 -4
  17. mindsdb/api/executor/sql_query/steps/map_reduce_step.py +6 -39
  18. mindsdb/api/executor/utilities/sql.py +2 -10
  19. mindsdb/api/http/namespaces/agents.py +3 -1
  20. mindsdb/api/http/namespaces/knowledge_bases.py +3 -3
  21. mindsdb/api/http/namespaces/sql.py +3 -1
  22. mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +2 -1
  23. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +7 -0
  24. mindsdb/api/postgres/postgres_proxy/executor/executor.py +2 -1
  25. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +2 -2
  26. mindsdb/integrations/handlers/chromadb_handler/requirements.txt +1 -1
  27. mindsdb/integrations/handlers/databricks_handler/requirements.txt +1 -1
  28. mindsdb/integrations/handlers/file_handler/file_handler.py +1 -1
  29. mindsdb/integrations/handlers/file_handler/requirements.txt +0 -4
  30. mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +17 -1
  31. mindsdb/integrations/handlers/jira_handler/jira_handler.py +15 -1
  32. mindsdb/integrations/handlers/jira_handler/jira_table.py +52 -31
  33. mindsdb/integrations/handlers/langchain_embedding_handler/fastapi_embeddings.py +82 -0
  34. mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +8 -1
  35. mindsdb/integrations/handlers/langchain_handler/requirements.txt +1 -1
  36. mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_handler.py +1 -1
  37. mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +8 -0
  38. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +49 -12
  39. mindsdb/integrations/handlers/pinecone_handler/pinecone_handler.py +123 -72
  40. mindsdb/integrations/handlers/pinecone_handler/requirements.txt +1 -1
  41. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +12 -6
  42. mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +5 -3
  43. mindsdb/integrations/handlers/slack_handler/slack_handler.py +13 -2
  44. mindsdb/integrations/handlers/slack_handler/slack_tables.py +21 -1
  45. mindsdb/integrations/handlers/web_handler/requirements.txt +0 -1
  46. mindsdb/integrations/libs/ml_handler_process/learn_process.py +2 -2
  47. mindsdb/integrations/utilities/files/__init__.py +0 -0
  48. mindsdb/integrations/utilities/files/file_reader.py +258 -0
  49. mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +2 -1
  50. mindsdb/integrations/utilities/handlers/auth_utilities/microsoft/ms_graph_api_auth_utilities.py +8 -3
  51. mindsdb/integrations/utilities/rag/chains/map_reduce_summarizer_chain.py +5 -9
  52. mindsdb/integrations/utilities/rag/loaders/vector_store_loader/pgvector.py +76 -27
  53. mindsdb/integrations/utilities/rag/loaders/vector_store_loader/vector_store_loader.py +18 -1
  54. mindsdb/integrations/utilities/rag/pipelines/rag.py +84 -20
  55. mindsdb/integrations/utilities/rag/rag_pipeline_builder.py +16 -1
  56. mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +166 -108
  57. mindsdb/integrations/utilities/rag/retrievers/__init__.py +3 -0
  58. mindsdb/integrations/utilities/rag/retrievers/multi_hop_retriever.py +85 -0
  59. mindsdb/integrations/utilities/rag/retrievers/retriever_factory.py +57 -0
  60. mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +117 -48
  61. mindsdb/integrations/utilities/rag/settings.py +190 -17
  62. mindsdb/integrations/utilities/sql_utils.py +1 -1
  63. mindsdb/interfaces/agents/agents_controller.py +18 -8
  64. mindsdb/interfaces/agents/constants.py +1 -0
  65. mindsdb/interfaces/agents/langchain_agent.py +124 -157
  66. mindsdb/interfaces/agents/langfuse_callback_handler.py +4 -37
  67. mindsdb/interfaces/agents/mindsdb_database_agent.py +21 -13
  68. mindsdb/interfaces/chatbot/chatbot_controller.py +7 -11
  69. mindsdb/interfaces/chatbot/chatbot_task.py +16 -5
  70. mindsdb/interfaces/chatbot/memory.py +58 -13
  71. mindsdb/interfaces/database/integrations.py +5 -1
  72. mindsdb/interfaces/database/projects.py +55 -16
  73. mindsdb/interfaces/database/views.py +12 -25
  74. mindsdb/interfaces/knowledge_base/controller.py +38 -9
  75. mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py +7 -26
  76. mindsdb/interfaces/model/functions.py +15 -4
  77. mindsdb/interfaces/model/model_controller.py +4 -7
  78. mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +51 -40
  79. mindsdb/interfaces/skills/retrieval_tool.py +10 -3
  80. mindsdb/interfaces/skills/skill_tool.py +97 -54
  81. mindsdb/interfaces/skills/skills_controller.py +7 -3
  82. mindsdb/interfaces/skills/sql_agent.py +127 -41
  83. mindsdb/interfaces/storage/db.py +1 -1
  84. mindsdb/migrations/versions/2025-01-15_c06c35f7e8e1_project_company.py +88 -0
  85. mindsdb/utilities/cache.py +7 -4
  86. mindsdb/utilities/context.py +11 -1
  87. mindsdb/utilities/langfuse.py +279 -0
  88. mindsdb/utilities/log.py +20 -2
  89. mindsdb/utilities/otel/__init__.py +206 -0
  90. mindsdb/utilities/otel/logger.py +25 -0
  91. mindsdb/utilities/otel/meter.py +19 -0
  92. mindsdb/utilities/otel/metric_handlers/__init__.py +25 -0
  93. mindsdb/utilities/otel/tracer.py +16 -0
  94. mindsdb/utilities/partitioning.py +52 -0
  95. mindsdb/utilities/render/sqlalchemy_render.py +7 -1
  96. mindsdb/utilities/utils.py +34 -0
  97. mindsdb/utilities/otel.py +0 -72
  98. {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.5.0.dist-info}/LICENSE +0 -0
  99. {MindsDB-25.1.2.0.dist-info → MindsDB-25.1.5.0.dist-info}/top_level.txt +0 -0
@@ -10,9 +10,26 @@ from mindsdb_sql_parser import ast
 
 logger = log.getLogger(__name__)
 
+
+def flatten_json(nested_json, parent_key="", separator="."):
+    """
+    Recursively flattens a nested JSON object into a dictionary with dot notation keys.
+    """
+    items = []
+    for k, v in nested_json.items():
+        new_key = f"{parent_key}{separator}{k}" if parent_key else k
+        if isinstance(v, dict):
+            items.extend(flatten_json(v, new_key, separator=separator).items())
+        else:
+            items.append((new_key, v))
+    return dict(items)
+
+
 class JiraProjectsTable(APITable):
     """Jira Projects Table implementation"""
+
     _MAX_API_RESULTS = 100
+
     def select(self, query: ast.Select) -> pd.DataFrame:
         """Pulls data from the Jira "get_all_project_issues" API endpoint
         Parameters
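
The new flatten_json helper replaces pd.json_normalize in call_jira_api below, flattening nested Jira issue payloads into dot-notation keys. A minimal sketch of its behavior; the sample issue dict is illustrative, not a real Jira API response:

    issue = {
        "key": "PROJ-1",
        "fields": {
            "summary": "Fix login bug",
            "status": {"name": "In Progress"},
            "assignee": {"displayName": "Ada Lovelace"},
        },
    }

    flatten_json(issue)
    # {'key': 'PROJ-1',
    #  'fields.summary': 'Fix login bug',
    #  'fields.status.name': 'In Progress',
    #  'fields.assignee.displayName': 'Ada Lovelace'}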
@@ -42,8 +59,8 @@ class JiraProjectsTable(APITable):
 
         for an_order in query.order_by:
             if an_order.field.parts[0] != "key":
-                continue
-            if an_order.field.parts[1] in ["reporter","assignee","status"]:
+                continue
+            if an_order.field.parts[1] in ["reporter", "assignee", "status"]:
                 if issues_kwargs != {}:
                     raise ValueError(
                         "Duplicate order conditions found for reporter,status and assignee"
@@ -61,9 +78,9 @@ class JiraProjectsTable(APITable):
                 raise ValueError(
                     f"Order by unknown column {an_order.field.parts[1]}"
                 )
-        project = self.handler.connection_data['project']
+        project = self.handler.connection_data["project"]
         jira_project_df = self.call_jira_api(project)
-
+
         selected_columns = []
         for target in query.targets:
             if isinstance(target, ast.Star):
@@ -74,7 +91,6 @@ class JiraProjectsTable(APITable):
             else:
                 raise ValueError(f"Unknown query target {type(target)}")
 
-
         if len(jira_project_df) == 0:
             jira_project_df = pd.DataFrame([], columns=selected_columns)
             return jira_project_df
@@ -88,7 +104,7 @@ class JiraProjectsTable(APITable):
                 by=order_by_conditions["columns"],
                 ascending=order_by_conditions["ascending"],
             )
-
+
         if query.limit:
             jira_project_df = jira_project_df.head(total_results)
 
@@ -102,12 +118,12 @@ class JiraProjectsTable(APITable):
             List of columns
         """
         return [
-            'key',
-            'summary',
-            'status',
-            'reporter',
-            'assignee',
-            'priority',
+            "key",
+            "summary",
+            "status",
+            "reporter",
+            "assignee",
+            "priority",
         ]
 
     def call_jira_api(self, project):
@@ -116,36 +132,41 @@ class JiraProjectsTable(APITable):
         max_records = jira.get_project_issues_count(project)
         max_records = 100
         jql_query = self.handler.construct_jql()
-        max_results = self._MAX_API_RESULTS
+        max_results = self._MAX_API_RESULTS
         start_index = 0
         total = 1
         fields = [
-            'key',
-            'fields.summary',
-            'fields.status.name',
-            'fields.reporter.name',
-            'fields.assignee.name',
-            'fields.priority.name',
+            "key",
+            "fields.summary",
+            "fields.status.name",
+            "fields.reporter.displayName",
+            "fields.assignee.displayName",
+            "fields.priority.name",
         ]
 
         all_jira_issues_df = pd.DataFrame(columns=fields)
 
         while start_index <= total:
-            results = self.handler.connect().jql(jql_query,start=start_index, limit=max_results)
-            df = pd.json_normalize(results['issues'])
+            results = self.handler.connect().jql(
+                jql_query, start=start_index, limit=max_results
+            )
+            flattened_data = [flatten_json(item) for item in results["issues"]]
+            df = pd.DataFrame(flattened_data)
             df = df[fields]
             start_index += max_results
-            total = max_records
+            total = results["total"]
             all_jira_issues_df = pd.concat([all_jira_issues_df, df], axis=0)
 
+        all_jira_issues_df = all_jira_issues_df.rename(
+            columns={
+                "key": "key",
+                "fields.summary": "summary",
+                "fields.reporter.displayName": "reporter",
+                "fields.assignee.displayName": "assignee",
+                "fields.priority.name": "priority",
+                "fields.status.name": "status",
+            },
+            errors="ignore",
+        )
 
-        all_jira_issues_df = all_jira_issues_df.rename(columns={
-            'key': 'key',
-            'fields.summary': 'summary',
-            'fields.reporter.name':'reporter',
-            'fields.assignee.name':'assignee',
-            'fields.priority.name':'priority',
-            'fields.status.name':'status'})
-
         return all_jira_issues_df
-
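
The loop now drives pagination off the server-reported total (total = results["total"]) instead of the hard-coded max_records, so it keeps fetching pages of _MAX_API_RESULTS until the issue count is exhausted. A standalone sketch of that pattern, with a stubbed fetch function standing in for self.handler.connect().jql:

    # fetch_page is a stand-in for self.handler.connect().jql; it must return
    # a dict with "issues" (list) and "total" (int), as the Jira API does.
    def fetch_all_issues(fetch_page, page_size=100):
        start, total, rows = 0, 1, []
        while start <= total:
            page = fetch_page(start=start, limit=page_size)
            rows.extend(flatten_json(item) for item in page["issues"])
            start += page_size
            total = page["total"]
        return rows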
@@ -0,0 +1,82 @@
+from typing import Any, List
+from langchain_core.embeddings import Embeddings
+import requests
+
+
+class FastAPIEmbeddings(Embeddings):
+    """An embedding extension that interfaces with FAST API. Useful for custom serving solutions."""
+
+    def __init__(
+        self,
+        api_base: str,
+        model: str,
+        batch_size: int = 32,
+        **kwargs: Any,
+    ):
+        """Initialize the embeddings class.
+
+        Args:
+            api_base: Base URL for the VLLM server
+            model: Model name/path to use for embeddings
+            batch_size: Batch size for generating embeddings
+        """
+        super().__init__()
+        self.api_base = api_base
+        self.model = model
+        self.batch_size = batch_size
+
+        # initialize requests here with the api_base
+
+    def _get_embeddings(self, texts: List[str]) -> List[str]:
+        """Get embeddings for a batch of text chunks.
+
+        Returns:
+            List of embeddings as strings. For sparse vectors, returns strings in format
+            "{key:value,...}/size" where size is the dimension of the vector space.
+        """
+
+        headers = {"accept": "application/json", "Content-Type": "application/json"}
+
+        data = {
+            "input": texts,
+            "model": self.model
+        }
+
+        response = requests.post(self.api_base, headers=headers, json=data)
+
+        response.raise_for_status()
+
+        embeddings = []
+        for response_dict in response.json()["data"]:
+            embedding = response_dict["embedding"]
+            embeddings.append(embedding)
+
+        return embeddings
+
+    def embed_documents(self, texts: List[str]) -> List[str]:
+        """Embed a list of documents using vLLM.
+
+        Args:
+            texts: List of documents to embed
+
+        Returns:
+            List of embeddings as strings, one for each document.
+            For sparse embeddings, returns strings in format "{key:value,...}/size"
+            For dense embeddings, returns JSON strings of float lists
+        """
+
+        return self._get_embeddings(texts)
+
+    def embed_query(self, text: str) -> str:
+        """Embed a single query text using vLLM.
+
+        Args:
+            text: Query text to embed
+
+        Returns:
+            Query embedding as a string.
+            For sparse embeddings, returns string in format "{key:value,...}/size"
+            For dense embeddings, returns JSON string of float list
+        """
+
+        return self._get_embeddings([text])[0]
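
The request body ({"input": ..., "model": ...}) and the response shape ({"data": [{"embedding": ...}]}) follow the OpenAI-compatible embeddings contract that vLLM and similar FastAPI servers expose. A usage sketch; the endpoint URL and model name are illustrative, not values from this release:

    embedder = FastAPIEmbeddings(
        api_base="http://localhost:8000/v1/embeddings",  # illustrative endpoint
        model="BAAI/bge-large-en-v1.5",                  # illustrative model name
    )
    doc_vectors = embedder.embed_documents(["MindsDB federates queries across data sources."])
    query_vector = embedder.embed_query("What does MindsDB do?")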
@@ -10,6 +10,7 @@ from mindsdb.integrations.libs.base import BaseMLEngine
 from mindsdb.utilities import log
 from langchain_core.embeddings import Embeddings
 from mindsdb.integrations.handlers.langchain_embedding_handler.vllm_embeddings import VLLMEmbeddings
+from mindsdb.integrations.handlers.langchain_embedding_handler.fastapi_embeddings import FastAPIEmbeddings
 
 logger = log.getLogger(__name__)
 
@@ -20,7 +21,10 @@ logger = log.getLogger(__name__)
 # This is used for the user to select the embedding model
 EMBEDDING_MODELS = {
     'VLLM': 'VLLMEmbeddings',
-    'vllm': 'VLLMEmbeddings'
+    'vllm': 'VLLMEmbeddings',
+    'FastAPI': 'FastAPIEmbeddings',
+    'fastapi': 'FastAPIEmbeddings'
+
 }
 
 try:
@@ -55,6 +59,9 @@ def get_langchain_class(class_name: str) -> Embeddings:
     if class_name == "VLLMEmbeddings":
         return VLLMEmbeddings
 
+    if class_name == "FastAPIEmbeddings":
+        return FastAPIEmbeddings
+
     # Then try langchain_community.embeddings
     try:
         module = importlib.import_module("langchain_community.embeddings")
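
Together, the registry entry and the early return mean both spellings of the engine name resolve to the new class before the langchain_community fallback is tried. A small sketch of that lookup path:

    # Both registered spellings map to the same class name,
    # which get_langchain_class short-circuits to FastAPIEmbeddings.
    cls = get_langchain_class(EMBEDDING_MODELS["fastapi"])
    assert cls is FastAPIEmbeddings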
@@ -3,6 +3,6 @@ wikipedia==1.4.0
 tiktoken
 anthropic>=0.26.1
 litellm==1.44.8
-chromadb  # Knowledge bases.
+chromadb~=0.6.3  # Knowledge bases.
 -r mindsdb/integrations/handlers/openai_handler/requirements.txt
 -r mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt
@@ -28,7 +28,7 @@ class MSOneDriveHandler(APIHandler):
     """
 
     name = 'one_drive'
-    supported_file_formats = ['csv', 'tsv', 'json', 'parquet']
+    supported_file_formats = ['csv', 'tsv', 'json', 'parquet', 'pdf', 'txt']
 
     def __init__(self, name: Text, connection_data: Dict, **kwargs: Any) -> None:
         """
@@ -9,6 +9,8 @@ from mindsdb.integrations.utilities.sql_utils import (
     SortColumn
 )
 
+from mindsdb.integrations.utilities.files.file_reader import FileReader
+
 
 class ListFilesTable(APIResource):
     """
@@ -97,4 +99,10 @@ class FileTable(APIResource):
         elif file_extension == "parquet":
             df = pd.read_parquet(BytesIO(file_content))
 
+        elif file_extension == "pdf":
+            df = FileReader().read_pdf(BytesIO(file_content))
+
+        elif file_extension == "txt":
+            df = FileReader().read_txt(BytesIO(file_content))
+
         return df
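
The pdf and txt branches delegate to the new shared FileReader utility (added in mindsdb/integrations/utilities/files/file_reader.py) rather than parsing inline. A hedged sketch of calling it directly; only the read_pdf/read_txt calls shown here appear in this diff, and the local file is illustrative:

    from io import BytesIO
    from mindsdb.integrations.utilities.files.file_reader import FileReader

    # Illustrative local file; the OneDrive handler passes downloaded bytes instead.
    with open("report.pdf", "rb") as f:
        df = FileReader().read_pdf(BytesIO(f.read()))
    print(df.head())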
@@ -37,6 +37,11 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         super().__init__(name=name, **kwargs)
         self._is_shared_db = False
         self._is_vector_registered = False
+        # we get these from the connection args on PostgresHandler parent
+        self._is_sparse = self.connection_args.get('is_sparse', False)
+        self._vector_size = self.connection_args.get('vector_size', None)
+        if self._is_sparse and not self._vector_size:
+            raise ValueError("vector_size is required when is_sparse=True")
         self.connect()
 
     def _make_connection_args(self):
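
is_sparse and vector_size arrive through the ordinary pgvector connection parameters inherited from PostgresHandler. A sketch of a connection-args dict that would enable sparse mode; the host and credential values are illustrative:

    connection_args = {
        "host": "localhost",        # illustrative
        "port": 5432,
        "database": "postgres",
        "user": "postgres",
        "password": "secret",       # illustrative
        "is_sparse": True,          # selects the sparsevec column type and <#> operator
        "vector_size": 30522,       # required whenever is_sparse is True
    }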
@@ -190,13 +195,30 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         if filter_conditions:
 
             if embedding_search:
-                # if search vector, return similar rows, apply other filters after if any
                 search_vector = filter_conditions["embeddings"]["value"][0]
                 filter_conditions.pop("embeddings")
-                return f"SELECT {targets} FROM {table_name} ORDER BY embeddings <=> '{search_vector}' {after_from_clause}"
+
+                if self._is_sparse:
+                    # Convert dict to sparse vector if needed
+                    if isinstance(search_vector, dict):
+                        from pgvector.utils import SparseVector
+                        embedding = SparseVector(search_vector, self._vector_size)
+                        search_vector = embedding.to_text()
+                    # Use inner product for sparse vectors
+                    distance_op = "<#>"
+                else:
+                    # Convert list to vector string if needed
+                    if isinstance(search_vector, list):
+                        search_vector = f"[{','.join(str(x) for x in search_vector)}]"
+                    # Use cosine similarity for dense vectors
+                    distance_op = "<=>"
+
+                return f"SELECT {targets} FROM {table_name} ORDER BY embeddings {distance_op} '{search_vector}' ASC {after_from_clause}"
+
             else:
-                # if filter conditions, return filtered rows
+                # if filter conditions, return rows that satisfy the conditions
                 return f"SELECT {targets} FROM {table_name} {after_from_clause}"
+
         else:
             # if no filter conditions, return all rows
             return f"SELECT {targets} FROM {table_name} {after_from_clause}"
@@ -339,14 +361,30 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         full_search_query = f'{semantic_search_cte}{full_text_search_cte}{hybrid_select}'
         return self.raw_query(full_search_query)
 
-    def create_table(self, table_name: str, if_not_exists=True):
-        """
-        Run a create table query on the pgvector database.
-        """
-        table_name = self._check_table(table_name)
-
-        query = f"CREATE TABLE IF NOT EXISTS {table_name} (id text PRIMARY KEY, content text, embeddings vector, metadata jsonb)"
-        self.raw_query(query)
+    def create_table(self, table_name: str):
+        """Create a table with a vector column."""
+        with self.connection.cursor() as cur:
+            # For sparse vectors, use sparsevec type
+            vector_column_type = 'sparsevec' if self._is_sparse else 'vector'
+
+            # Vector size is required for sparse vectors, optional for dense
+            if self._is_sparse and not self._vector_size:
+                raise ValueError("vector_size is required for sparse vectors")
+
+            # Add vector size specification only if provided
+            size_spec = f"({self._vector_size})" if self._vector_size is not None else "()"
+            if vector_column_type == 'vector':
+                size_spec = ''
+
+            cur.execute(f"""
+                CREATE TABLE IF NOT EXISTS {table_name} (
+                    id TEXT PRIMARY KEY,
+                    embeddings {vector_column_type}{size_spec},
+                    content TEXT,
+                    metadata JSONB
+                )
+            """)
+            self.connection.commit()
 
     def insert(
         self, table_name: str, data: pd.DataFrame
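
The DDL therefore differs only in the embeddings column: dense mode emits a bare vector column (any provided vector_size is dropped by the size_spec logic above), while sparse mode emits sparsevec(N). A sketch mirroring that branch for illustration:

    def embeddings_column(is_sparse: bool, vector_size) -> str:
        # Mirrors the size_spec logic in create_table above.
        if is_sparse:
            spec = f"({vector_size})" if vector_size is not None else "()"
            return f"sparsevec{spec}"
        return "vector"

    print(embeddings_column(False, 1536))   # "vector"  (size dropped for dense)
    print(embeddings_column(True, 30522))   # "sparsevec(30522)"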
@@ -444,4 +482,3 @@ class PgVectorHandler(VectorStoreHandler, PostgresHandler):
         """
         table_name = self._check_table(table_name)
         self.raw_query(f"DROP TABLE IF EXISTS {table_name}")
-