MindsDB 25.3.4.2__py3-none-any.whl → 25.4.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic. Click here for more details.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +21 -4
- mindsdb/api/mcp/__init__.py +0 -0
- mindsdb/api/mcp/start.py +152 -0
- mindsdb/integrations/libs/vectordatabase_handler.py +20 -20
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +121 -11
- mindsdb/interfaces/database/projects.py +15 -0
- mindsdb/interfaces/knowledge_base/controller.py +78 -2
- mindsdb/utilities/config.py +8 -0
- mindsdb/utilities/starters.py +7 -0
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.1.0.dist-info}/METADATA +220 -219
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.1.0.dist-info}/RECORD +15 -13
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.1.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.1.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.3.4.2.dist-info → mindsdb-25.4.1.0.dist-info}/top_level.txt +0 -0
mindsdb/__about__.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
__title__ = 'MindsDB'
|
|
2
2
|
__package_name__ = 'mindsdb'
|
|
3
|
-
__version__ = '25.3.4.2'
|
|
3
|
+
__version__ = '25.4.1.0'
|
|
4
4
|
__description__ = "MindsDB's AI SQL Server enables developers to build AI tools that need access to real-time data to perform their tasks"
|
|
5
5
|
__email__ = "jorge@mindsdb.com"
|
|
6
6
|
__author__ = 'MindsDB Inc'
|
mindsdb/__main__.py
CHANGED
|
@@ -24,7 +24,8 @@ from mindsdb.__about__ import __version__ as mindsdb_version
|
|
|
24
24
|
from mindsdb.utilities.config import config
|
|
25
25
|
from mindsdb.utilities.exception import EntityNotExistsError
|
|
26
26
|
from mindsdb.utilities.starters import (
|
|
27
|
-
start_http, start_mysql, start_mongo, start_postgres, start_ml_task_queue, start_scheduler, start_tasks
|
|
27
|
+
start_http, start_mysql, start_mongo, start_postgres, start_ml_task_queue, start_scheduler, start_tasks,
|
|
28
|
+
start_mcp
|
|
28
29
|
)
|
|
29
30
|
from mindsdb.utilities.ps import is_pid_listen_port, get_child_pids
|
|
30
31
|
from mindsdb.utilities.functions import get_versions_where_predictors_become_obsolete
|
|
@@ -57,6 +58,7 @@ class TrunkProcessEnum(Enum):
|
|
|
57
58
|
JOBS = 'jobs'
|
|
58
59
|
TASKS = 'tasks'
|
|
59
60
|
ML_TASK_QUEUE = 'ml_task_queue'
|
|
61
|
+
MCP = 'mcp'
|
|
60
62
|
|
|
61
63
|
@classmethod
|
|
62
64
|
def _missing_(cls, value):
|
|
@@ -221,9 +223,9 @@ if __name__ == '__main__':
|
|
|
221
223
|
ctx.set_default()
|
|
222
224
|
|
|
223
225
|
# ---- CHECK SYSTEM ----
|
|
224
|
-
if not (sys.version_info[0] >= 3 and sys.version_info[1] >=
|
|
226
|
+
if not (sys.version_info[0] >= 3 and sys.version_info[1] >= 10):
|
|
225
227
|
print("""
|
|
226
|
-
MindsDB requires Python >= 3.
|
|
228
|
+
MindsDB requires Python >= 3.10 to run
|
|
227
229
|
|
|
228
230
|
Once you have supported Python version installed you can start mindsdb as follows:
|
|
229
231
|
|
|
@@ -385,6 +387,7 @@ if __name__ == '__main__':
|
|
|
385
387
|
|
|
386
388
|
http_api_config = config['api']['http']
|
|
387
389
|
mysql_api_config = config['api']['mysql']
|
|
390
|
+
mcp_api_config = config['api']['mcp']
|
|
388
391
|
trunc_processes_struct = {
|
|
389
392
|
TrunkProcessEnum.HTTP: TrunkProcessData(
|
|
390
393
|
name=TrunkProcessEnum.HTTP.value,
|
|
@@ -434,11 +437,25 @@ if __name__ == '__main__':
|
|
|
434
437
|
name=TrunkProcessEnum.ML_TASK_QUEUE.value,
|
|
435
438
|
entrypoint=start_ml_task_queue,
|
|
436
439
|
args=(config.cmd_args.verbose,)
|
|
440
|
+
),
|
|
441
|
+
TrunkProcessEnum.MCP: TrunkProcessData(
|
|
442
|
+
name=TrunkProcessEnum.MCP.value,
|
|
443
|
+
entrypoint=start_mcp,
|
|
444
|
+
port=mcp_api_config.get('port', 47337),
|
|
445
|
+
args=(config.cmd_args.verbose,),
|
|
446
|
+
restart_on_failure=mcp_api_config.get('restart_on_failure', False),
|
|
447
|
+
max_restart_count=mcp_api_config.get('max_restart_count', TrunkProcessData.max_restart_count),
|
|
448
|
+
max_restart_interval_seconds=mcp_api_config.get(
|
|
449
|
+
'max_restart_interval_seconds', TrunkProcessData.max_restart_interval_seconds
|
|
450
|
+
)
|
|
437
451
|
)
|
|
438
452
|
}
|
|
439
453
|
|
|
440
454
|
for api_enum in api_arr:
|
|
441
|
-
|
|
455
|
+
if api_enum in trunc_processes_struct:
|
|
456
|
+
trunc_processes_struct[api_enum].need_to_run = True
|
|
457
|
+
else:
|
|
458
|
+
logger.error(f"ERROR: {api_enum} API is not a valid api in config")
|
|
442
459
|
|
|
443
460
|
if config['jobs']['disable'] is False:
|
|
444
461
|
trunc_processes_struct[TrunkProcessEnum.JOBS].need_to_run = True
|
|
File without changes
|
mindsdb/api/mcp/start.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
from contextlib import asynccontextmanager
|
|
2
|
+
from collections.abc import AsyncIterator
|
|
3
|
+
from typing import Optional, Dict, Any
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
from mcp.server.fastmcp import FastMCP
|
|
7
|
+
from mindsdb.api.mysql.mysql_proxy.classes.fake_mysql_proxy import FakeMysqlProxy
|
|
8
|
+
from mindsdb.api.executor.data_types.response_type import RESPONSE_TYPE as SQL_RESPONSE_TYPE
|
|
9
|
+
from mindsdb.utilities import log
|
|
10
|
+
from mindsdb.utilities.config import Config
|
|
11
|
+
from mindsdb.interfaces.storage import db
|
|
12
|
+
|
|
13
|
+
logger = log.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass
class AppContext:
    """Context object yielded by the application lifespan manager."""

    # Storage handle made available while the server is running.
    db: Any
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@asynccontextmanager
async def app_lifespan(server: FastMCP) -> AsyncIterator[AppContext]:
    """Initialise storage on startup and expose it for the server's lifetime.

    Args:
        server: The FastMCP server this lifespan is attached to (not used here).

    Yields:
        AppContext wrapping the ``db`` storage module.
    """
    # Startup: storage must be initialised before any tool runs.
    db.init()
    app_context = AppContext(db=db)
    try:
        yield app_context
    finally:
        # TODO: We need better way to handle this in storage/db.py
        pass
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# MCP server instance; `app_lifespan` runs around the server's lifetime.
mcp = FastMCP("MindsDB", lifespan=app_lifespan, dependencies=["mindsdb"])  # Add any additional dependencies

# SQL issued by the MCP tools
LISTING_QUERY = "SHOW DATABASES"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@mcp.tool()
def query(query: str, context: Optional[Dict] = None) -> Dict[str, Any]:
    """
    Execute a SQL query against MindsDB

    Args:
        query: The SQL query to execute
        context: Optional context parameters for the query

    Returns:
        Dict containing the query results or error information. The "type"
        key holds the response type; table responses additionally carry
        "data" and "column_names", error responses carry "error_code" and
        "error_message".
    """

    if context is None:
        context = {}

    logger.debug(f'Incoming MCP query: {query}')

    mysql_proxy = FakeMysqlProxy()
    mysql_proxy.set_context(context)

    try:
        result = mysql_proxy.process_query(query)

        if result.type == SQL_RESPONSE_TYPE.OK:
            return {"type": SQL_RESPONSE_TYPE.OK}

        if result.type == SQL_RESPONSE_TYPE.TABLE:
            return {
                "type": SQL_RESPONSE_TYPE.TABLE,
                "data": result.data.to_lists(json_types=True),
                "column_names": [
                    (x["alias"] or x["name"]) if "alias" in x else x["name"]
                    for x in result.columns
                ],
            }

        if result.type == SQL_RESPONSE_TYPE.ERROR:
            # Pass the executor's own error through instead of masking it
            # as an unknown response type.
            return {
                "type": SQL_RESPONSE_TYPE.ERROR,
                "error_code": result.error_code,
                "error_message": result.error_message,
            }

        return {
            "type": SQL_RESPONSE_TYPE.ERROR,
            "error_code": 0,
            "error_message": "Unknown response type"
        }

    except Exception as e:
        # Tool calls report failures in the payload rather than raising.
        logger.error(f"Error processing query: {str(e)}")
        return {
            "type": SQL_RESPONSE_TYPE.ERROR,
            "error_code": 0,
            "error_message": str(e)
        }
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@mcp.tool()
def list_databases() -> Dict[str, Any]:
    """
    List the databases in MindsDB by running ``LISTING_QUERY``.

    Returns:
        For a table response, the rows as returned by
        ``result.data.to_lists(json_types=True)``. Otherwise a dict with a
        "type" key ("ok" or "error"); error dicts also carry "error_code"
        and "error_message".
    """

    mysql_proxy = FakeMysqlProxy()

    try:
        result = mysql_proxy.process_query(LISTING_QUERY)

        if result.type == SQL_RESPONSE_TYPE.ERROR:
            return {
                "type": "error",
                "error_code": result.error_code,
                "error_message": result.error_message,
            }

        if result.type == SQL_RESPONSE_TYPE.OK:
            return {"type": "ok"}

        if result.type == SQL_RESPONSE_TYPE.TABLE:
            return result.data.to_lists(json_types=True)

        # Previously an unrecognised response type fell through and the tool
        # returned None; report it explicitly instead.
        return {
            "type": "error",
            "error_code": 0,
            "error_message": "Unknown response type",
        }

    except Exception as e:
        logger.error(f"Error listing databases: {str(e)}")
        return {
            "type": "error",
            "error_code": 0,
            "error_message": str(e),
        }
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def start(*args, **kwargs):
    """Start the MCP server

    Host and port are read from the ``api.mcp`` section of the MindsDB
    config (defaults: 127.0.0.1 and 47337). Positional and keyword arguments
    are accepted but not used by this function.

    Raises:
        Exception: whatever ``mcp.run`` raises is logged and re-raised.
    """
    mcp_section = Config()['api'].get('mcp', {})
    host = mcp_section.get('host', '127.0.0.1')
    # int() because the configured port may be a string.
    port = int(mcp_section.get('port', 47337))

    logger.info(f"Starting MCP server on {host}:{port}")
    mcp.settings.host = host
    mcp.settings.port = port

    try:
        mcp.run(transport="sse")  # Use SSE transport instead of stdio
    except Exception as e:
        logger.error(f"Error starting MCP server: {str(e)}")
        raise


if __name__ == "__main__":
    start()
|
|
@@ -89,7 +89,7 @@ class VectorStoreHandler(BaseHandler):
|
|
|
89
89
|
else:
|
|
90
90
|
return value
|
|
91
91
|
|
|
92
|
-
def
|
|
92
|
+
def extract_conditions(self, where_statement) -> Optional[List[FilterCondition]]:
|
|
93
93
|
conditions = []
|
|
94
94
|
# parse conditions
|
|
95
95
|
if where_statement is not None:
|
|
@@ -110,13 +110,7 @@ class VectorStoreHandler(BaseHandler):
|
|
|
110
110
|
right_hand = node.args[1].value
|
|
111
111
|
elif isinstance(node.args[1], Tuple):
|
|
112
112
|
# Constant could be actually a list i.e. [1.2, 3.2]
|
|
113
|
-
right_hand = [
|
|
114
|
-
ast.literal_eval(item.value)
|
|
115
|
-
if isinstance(item, Constant)
|
|
116
|
-
and not isinstance(item.value, list)
|
|
117
|
-
else item.value
|
|
118
|
-
for item in node.args[1].items
|
|
119
|
-
]
|
|
113
|
+
right_hand = [item.value for item in node.args[1].items]
|
|
120
114
|
else:
|
|
121
115
|
raise Exception(f"Unsupported right hand side: {node.args[1]}")
|
|
122
116
|
conditions.append(
|
|
@@ -125,18 +119,21 @@ class VectorStoreHandler(BaseHandler):
|
|
|
125
119
|
|
|
126
120
|
query_traversal(where_statement, _extract_comparison_conditions)
|
|
127
121
|
|
|
128
|
-
# try to treat conditions that are not in TableField as metadata conditions
|
|
129
|
-
for condition in conditions:
|
|
130
|
-
if not self._is_condition_allowed(condition):
|
|
131
|
-
condition.column = (
|
|
132
|
-
TableField.METADATA.value + "." + condition.column
|
|
133
|
-
)
|
|
134
|
-
|
|
135
122
|
else:
|
|
136
123
|
conditions = None
|
|
137
124
|
|
|
138
125
|
return conditions
|
|
139
126
|
|
|
127
|
+
def _convert_metadata_filters(self, conditions):
    """Rewrite, in place, conditions on non-allowed columns as metadata conditions.

    Args:
        conditions: list of filter conditions, or None (no-op).
    """
    if conditions is not None:
        # try to treat conditions that are not in TableField as metadata conditions
        for item in conditions:
            if self._is_condition_allowed(item):
                continue
            item.column = TableField.METADATA.value + "." + item.column
|
|
136
|
+
|
|
140
137
|
def _is_columns_allowed(self, columns: List[str]) -> bool:
|
|
141
138
|
"""
|
|
142
139
|
Check if columns are allowed.
|
|
@@ -325,14 +322,16 @@ class VectorStoreHandler(BaseHandler):
|
|
|
325
322
|
if not df_insert.empty:
|
|
326
323
|
self.insert(table_name, df_insert)
|
|
327
324
|
|
|
328
|
-
def dispatch_delete(self, query: Delete):
|
|
325
|
+
def dispatch_delete(self, query: Delete, conditions: List[FilterCondition] = None):
|
|
329
326
|
"""
|
|
330
327
|
Dispatch delete query to the appropriate method.
|
|
331
328
|
"""
|
|
332
329
|
# parse key arguments
|
|
333
330
|
table_name = query.table.parts[-1]
|
|
334
|
-
|
|
335
|
-
|
|
331
|
+
if conditions is None:
|
|
332
|
+
where_statement = query.where
|
|
333
|
+
conditions = self.extract_conditions(where_statement)
|
|
334
|
+
self._convert_metadata_filters(conditions)
|
|
336
335
|
|
|
337
336
|
# dispatch delete
|
|
338
337
|
return self.delete(table_name, conditions=conditions)
|
|
@@ -356,9 +355,10 @@ class VectorStoreHandler(BaseHandler):
|
|
|
356
355
|
)
|
|
357
356
|
|
|
358
357
|
# check if columns are allowed
|
|
359
|
-
where_statement = query.where
|
|
360
358
|
if conditions is None:
|
|
361
|
-
|
|
359
|
+
where_statement = query.where
|
|
360
|
+
conditions = self.extract_conditions(where_statement)
|
|
361
|
+
self._convert_metadata_filters(conditions)
|
|
362
362
|
|
|
363
363
|
# get offset and limit
|
|
364
364
|
offset = query.offset.value if query.offset is not None else None
|
|
@@ -127,17 +127,21 @@ class LLMReranker(BaseDocumentCompressor):
|
|
|
127
127
|
ranked_results.append((batch[idx][1], score))
|
|
128
128
|
|
|
129
129
|
# Check if we should stop early
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
130
|
+
try:
|
|
131
|
+
high_scoring_docs = [r for r in ranked_results if r[1] >= self.filtering_threshold]
|
|
132
|
+
can_stop_early = (
|
|
133
|
+
self.early_stop # Early stopping is enabled
|
|
134
|
+
and self.num_docs_to_keep # We have a target number of docs
|
|
135
|
+
and len(high_scoring_docs) >= self.num_docs_to_keep # Found enough good docs
|
|
136
|
+
and score >= self.early_stop_threshold # Current doc is good enough
|
|
137
|
+
)
|
|
138
|
+
|
|
139
|
+
if can_stop_early:
|
|
140
|
+
log.info(f"Early stopping after finding {self.num_docs_to_keep} documents with high confidence")
|
|
141
|
+
return ranked_results
|
|
142
|
+
except Exception as e:
|
|
143
|
+
# Don't let early stopping errors stop the whole process
|
|
144
|
+
log.warning(f"Error in early stopping check: {str(e)}")
|
|
141
145
|
|
|
142
146
|
except Exception as e:
|
|
143
147
|
log.error(f"Batch processing error: {str(e)}")
|
|
@@ -222,3 +226,109 @@ class LLMReranker(BaseDocumentCompressor):
|
|
|
222
226
|
"temperature": self.temperature,
|
|
223
227
|
"remove_irrelevant": self.remove_irrelevant,
|
|
224
228
|
}
|
|
229
|
+
|
|
230
|
+
def get_scores(self, query: str, documents: list[str], disable_events: bool = True):
    """
    Get relevance scores for documents given a query.

    Args:
        query: The query text
        documents: List of document texts to score
        disable_events: Whether to disable event dispatching (default True)

    Returns:
        List of relevance scores (empty when ``documents`` is empty)
    """
    import asyncio

    # Nothing to score. Returning early also avoids calling range() with a
    # zero step below, which raises ValueError.
    if len(documents) == 0:
        return []

    query_document_pairs = [(query, doc) for doc in documents]

    async def search_relevancy_no_events(query, document):
        """Ask the model whether `document` is relevant; no event dispatch."""
        await self._init_client()
        async with self._semaphore:
            for attempt in range(self.max_retries):
                try:
                    response = await self.client.chat.completions.create(
                        model=self.model,
                        messages=[
                            {"role": "system", "content": "Rate the relevance of the document to the query. Respond with 'yes' or 'no'."},
                            {"role": "user", "content": f"Query: {query}\nDocument: {document}\nIs this document relevant?"}
                        ],
                        temperature=self.temperature,
                        n=1,
                        logprobs=True,
                        max_tokens=1
                    )
                    # Extract response and confidence score
                    answer = response.choices[0].message.content
                    logprob = response.choices[0].logprobs.content[0].logprob
                    # No event dispatch here
                    return {"document": document, "answer": answer, "logprob": logprob}
                except Exception as e:
                    if attempt == self.max_retries - 1:
                        log.error(f"Failed after {self.max_retries} attempts: {str(e)}")
                        raise
                    # Exponential backoff with jitter
                    retry_delay = self.retry_delay * (2 ** attempt) + random.uniform(0, 0.1)
                    await asyncio.sleep(retry_delay)

    async def _rank_without_events(query_document_pairs):
        """Score all pairs batch by batch without dispatching events."""
        ranked_results = []
        # Process in larger batches for better throughput
        batch_size = min(self.max_concurrent_requests * 2, len(query_document_pairs))
        for i in range(0, len(query_document_pairs), batch_size):
            batch = query_document_pairs[i:i + batch_size]
            try:
                results = await asyncio.gather(
                    *[search_relevancy_no_events(query=query, document=document) for (query, document) in batch],
                    return_exceptions=True
                )
                for idx, result in enumerate(results):
                    if isinstance(result, Exception):
                        # A failed document gets the lowest score, not an abort.
                        log.error(f"Error processing document {i+idx}: {str(result)}")
                        ranked_results.append((batch[idx][1], 0.0))
                        continue
                    answer = result["answer"]
                    logprob = result["logprob"]
                    prob = math.exp(logprob)
                    # Convert answer to score using the model's confidence
                    if answer.lower().strip() == "yes":
                        score = prob  # If yes, use the model's confidence
                    elif answer.lower().strip() == "no":
                        score = 1 - prob  # If no, invert the confidence
                    else:
                        score = 0.5 * prob  # For unclear answers, reduce confidence
                    ranked_results.append((batch[idx][1], score))
                    # Check if we should stop early
                    try:
                        high_scoring_docs = [r for r in ranked_results if r[1] >= self.filtering_threshold]
                        can_stop_early = (
                            self.early_stop  # Early stopping is enabled
                            and self.num_docs_to_keep  # We have a target number of docs
                            and len(high_scoring_docs) >= self.num_docs_to_keep  # Found enough good docs
                            and score >= self.early_stop_threshold  # Current doc is good enough
                        )
                        if can_stop_early:
                            log.info(f"Early stopping after finding {self.num_docs_to_keep} documents with high confidence")
                            return ranked_results
                    except Exception as e:
                        # Don't let early stopping errors stop the whole process
                        log.warning(f"Error in early stopping check: {str(e)}")
            except Exception as e:
                log.error(f"Batch processing error: {str(e)}")
                continue
        return ranked_results

    # Pick the ranking coroutine: the event-free variant or the original _rank.
    rank = _rank_without_events if disable_events else self._rank

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: create one and drive it here.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        documents_and_scores = loop.run_until_complete(rank(query_document_pairs))
    else:
        # A loop is already running in this thread, so run_until_complete()
        # would raise "This event loop is already running". Run the coroutine
        # on its own loop in a worker thread and wait for the result.
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            documents_and_scores = pool.submit(asyncio.run, rank(query_document_pairs)).result()

    scores = [score for _, score in documents_and_scores]
    return scores
|
|
@@ -296,6 +296,19 @@ class Project:
|
|
|
296
296
|
]
|
|
297
297
|
return data
|
|
298
298
|
|
|
299
|
+
def get_knowledge_bases(self):
    """Return this project's knowledge bases keyed by name.

    Returns:
        dict mapping each knowledge base name to a descriptor with the keys
        'type' (always 'knowledge_base'), 'id' and 'deletable' (always True).
    """
    from mindsdb.api.executor.controllers.session_controller import SessionController

    kb_controller = SessionController().kb_controller

    knowledge_bases = {}
    for kb in kb_controller.list(self.name):
        knowledge_bases[kb['name']] = {
            'type': 'knowledge_base',
            'id': kb['id'],
            'deletable': True
        }
    return knowledge_bases
|
|
311
|
+
|
|
299
312
|
def get_views(self):
|
|
300
313
|
records = (
|
|
301
314
|
db.session.query(db.View).filter_by(
|
|
@@ -353,6 +366,8 @@ class Project:
|
|
|
353
366
|
for agent in agents:
|
|
354
367
|
data[agent['name']] = agent['metadata']
|
|
355
368
|
|
|
369
|
+
data.update(self.get_knowledge_bases())
|
|
370
|
+
|
|
356
371
|
return data
|
|
357
372
|
|
|
358
373
|
def get_columns(self, table_name: str):
|
|
@@ -4,6 +4,7 @@ from typing import Dict, List, Optional
|
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
import hashlib
|
|
7
|
+
import numpy as np
|
|
7
8
|
|
|
8
9
|
from mindsdb_sql_parser.ast import (
|
|
9
10
|
BinaryOperation,
|
|
@@ -37,9 +38,16 @@ from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
|
|
|
37
38
|
|
|
38
39
|
from mindsdb.api.executor.command_executor import ExecuteCommands
|
|
39
40
|
from mindsdb.utilities import log
|
|
41
|
+
from mindsdb.integrations.utilities.rag.rerankers.reranker_compressor import LLMReranker
|
|
40
42
|
|
|
41
43
|
logger = log.getLogger(__name__)
|
|
42
44
|
|
|
45
|
+
KB_TO_VECTORDB_COLUMNS = {
|
|
46
|
+
'id': 'original_row_id',
|
|
47
|
+
'chunk_id': 'id',
|
|
48
|
+
'chunk_content': 'content'
|
|
49
|
+
}
|
|
50
|
+
|
|
43
51
|
|
|
44
52
|
class KnowledgeBaseTable:
|
|
45
53
|
"""
|
|
@@ -103,7 +111,9 @@ class KnowledgeBaseTable:
|
|
|
103
111
|
db_handler = self.get_vector_db()
|
|
104
112
|
logger.debug(f"Using vector db handler: {type(db_handler)}")
|
|
105
113
|
|
|
106
|
-
|
|
114
|
+
conditions = db_handler.extract_conditions(query.where)
|
|
115
|
+
self.addapt_conditions_columns(conditions)
|
|
116
|
+
df = db_handler.dispatch_select(query, conditions)
|
|
107
117
|
|
|
108
118
|
if df is not None:
|
|
109
119
|
|
|
@@ -115,8 +125,72 @@ class KnowledgeBaseTable:
|
|
|
115
125
|
else:
|
|
116
126
|
logger.warning("Query returned no data")
|
|
117
127
|
|
|
128
|
+
rerank_model = self._kb.params.get("rerank_model")
|
|
129
|
+
if rerank_model and df is not None and not df.empty:
|
|
130
|
+
try:
|
|
131
|
+
logger.info(f"Using reranker model: {rerank_model}")
|
|
132
|
+
reranker = LLMReranker(model=rerank_model)
|
|
133
|
+
# convert response from a dataframe to a list of strings
|
|
134
|
+
content_column = df[TableField.CONTENT.value]
|
|
135
|
+
# convert to list
|
|
136
|
+
documents = content_column.tolist()
|
|
137
|
+
# Extract query text from WHERE clause if it exists
|
|
138
|
+
query_text = ""
|
|
139
|
+
if query.where:
|
|
140
|
+
def extract_content(node, **kwargs):
|
|
141
|
+
nonlocal query_text
|
|
142
|
+
is_binary_op = isinstance(node, BinaryOperation)
|
|
143
|
+
is_identifier = isinstance(node.args[0], Identifier)
|
|
144
|
+
is_content = node.args[0].parts[-1].lower() == 'content'
|
|
145
|
+
is_constant = isinstance(node.args[1], Constant)
|
|
146
|
+
if is_binary_op and is_identifier and is_content and is_constant:
|
|
147
|
+
query_text = node.args[1].value
|
|
148
|
+
query_traversal(query.where, extract_content)
|
|
149
|
+
logger.debug(f"Extracted query text: {query_text}")
|
|
150
|
+
# Get scores from reranker
|
|
151
|
+
scores = reranker.get_scores(query_text, documents)
|
|
152
|
+
# Add scores as a new column for filtering
|
|
153
|
+
scores_array = np.array(scores)
|
|
154
|
+
# Add temporary column for sorting
|
|
155
|
+
df['_relevance_score'] = scores
|
|
156
|
+
# Filter by score threshold using numpy array for element-wise comparison
|
|
157
|
+
df = df[scores_array > reranker.filtering_threshold]
|
|
158
|
+
# Sort by relevance (higher score = more relevant)
|
|
159
|
+
df = df.sort_values(by='_relevance_score', ascending=False)
|
|
160
|
+
# Remove temporary column
|
|
161
|
+
# df = df.drop(columns=['_relevance_score'])
|
|
162
|
+
# Apply original limit if it exists
|
|
163
|
+
if query.limit and len(df) > query.limit.value:
|
|
164
|
+
df = df.iloc[:query.limit.value]
|
|
165
|
+
logger.debug(f"Applied reranking with model {rerank_model}")
|
|
166
|
+
except Exception as e:
|
|
167
|
+
logger.error(f"Error during reranking: {str(e)}")
|
|
168
|
+
|
|
169
|
+
df = self.addapt_result_columns(df)
|
|
118
170
|
return df
|
|
119
171
|
|
|
172
|
+
def addapt_conditions_columns(self, conditions):
    """Rename, in place, knowledge-base column names in conditions to vector-db names.

    Args:
        conditions: list of filter conditions, or None (no-op). Columns not
            listed in KB_TO_VECTORDB_COLUMNS are left untouched.
    """
    if conditions is None:
        return
    for item in conditions:
        vector_column = KB_TO_VECTORDB_COLUMNS.get(item.column)
        if vector_column is not None:
            item.column = vector_column
|
|
178
|
+
|
|
179
|
+
def addapt_result_columns(self, df):
    """Rename vector-db result columns to knowledge-base names and add the id column.

    Args:
        df: DataFrame returned by the vector database, or None.

    Returns:
        DataFrame with columns renamed per KB_TO_VECTORDB_COLUMNS and the id
        column (taken from each row's metadata 'original_row_id') placed
        first; None when ``df`` is None.
    """
    # The caller may end up with no DataFrame at all; nothing to adapt then.
    if df is None:
        return df

    renames = {
        vec_col: kb_col
        for kb_col, vec_col in KB_TO_VECTORDB_COLUMNS.items()
        if vec_col in df.columns
    }
    df = df.rename(columns=renames)

    id_column = TableField.ID.value
    # Leave out an existing id column so it is not selected twice below.
    other_columns = [col for col in df.columns if col != id_column]

    # update id, get from metadata; rows whose metadata is not a dict get None
    df[id_column] = df[TableField.METADATA.value].apply(
        lambda m: m.get('original_row_id') if isinstance(m, dict) else None
    )

    # id on first place
    return df[[id_column] + other_columns]
|
|
193
|
+
|
|
120
194
|
def insert_files(self, file_names: List[str]):
|
|
121
195
|
"""Process and insert files"""
|
|
122
196
|
if not self.document_loader:
|
|
@@ -217,7 +291,9 @@ class KnowledgeBaseTable:
|
|
|
217
291
|
|
|
218
292
|
# send to vectordb
|
|
219
293
|
db_handler = self.get_vector_db()
|
|
220
|
-
db_handler.
|
|
294
|
+
conditions = db_handler.extract_conditions(query.where)
|
|
295
|
+
self.addapt_conditions_columns(conditions)
|
|
296
|
+
db_handler.dispatch_delete(query, conditions)
|
|
221
297
|
|
|
222
298
|
def hybrid_search(
|
|
223
299
|
self,
|
mindsdb/utilities/config.py
CHANGED
|
@@ -201,6 +201,14 @@ class Config:
|
|
|
201
201
|
"host": api_host,
|
|
202
202
|
"port": "55432",
|
|
203
203
|
"database": "mindsdb"
|
|
204
|
+
},
|
|
205
|
+
"mcp": {
|
|
206
|
+
"host": api_host,
|
|
207
|
+
"port": "47337",
|
|
208
|
+
"enabled": True,
|
|
209
|
+
"restart_on_failure": True,
|
|
210
|
+
"max_restart_count": 1,
|
|
211
|
+
"max_restart_interval_seconds": 60
|
|
204
212
|
}
|
|
205
213
|
},
|
|
206
214
|
"cache": {
|
mindsdb/utilities/starters.py
CHANGED
|
@@ -31,3 +31,10 @@ def start_ml_task_queue(*args, **kwargs):
|
|
|
31
31
|
def start_scheduler(*args, **kwargs):
|
|
32
32
|
from mindsdb.interfaces.jobs.scheduler import start
|
|
33
33
|
start(*args, **kwargs)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def start_mcp(*args, **kwargs):
    """Start the MCP server"""
    # The MCP module is imported at call time, not when this module loads.
    from mindsdb.api.mcp.start import start as run_mcp_server

    run_mcp_server(*args, **kwargs)
|