MindsDB-25.1.4.0-py3-none-any.whl → MindsDB-25.1.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB might be problematic.
- {MindsDB-25.1.4.0.dist-info → MindsDB-25.1.5.1.dist-info}/METADATA +235 -246
- {MindsDB-25.1.4.0.dist-info → MindsDB-25.1.5.1.dist-info}/RECORD +44 -42
- mindsdb/__about__.py +1 -1
- mindsdb/api/executor/datahub/datanodes/datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +1 -1
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +2 -26
- mindsdb/api/http/namespaces/agents.py +3 -1
- mindsdb/api/http/namespaces/knowledge_bases.py +4 -1
- mindsdb/integrations/handlers/databricks_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/file_handler/requirements.txt +0 -4
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_handler.py +1 -1
- mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +8 -0
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +4 -2
- mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +5 -3
- mindsdb/integrations/handlers/snowflake_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/web_handler/requirements.txt +0 -1
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +1 -1
- mindsdb/integrations/libs/vectordatabase_handler.py +4 -3
- mindsdb/integrations/utilities/files/__init__.py +0 -0
- mindsdb/integrations/utilities/files/file_reader.py +258 -0
- mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +2 -1
- mindsdb/integrations/utilities/handlers/auth_utilities/microsoft/ms_graph_api_auth_utilities.py +8 -3
- mindsdb/integrations/utilities/rag/chains/map_reduce_summarizer_chain.py +5 -9
- mindsdb/integrations/utilities/rag/pipelines/rag.py +1 -3
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +97 -89
- mindsdb/integrations/utilities/rag/settings.py +29 -14
- mindsdb/interfaces/agents/agents_controller.py +15 -3
- mindsdb/interfaces/agents/constants.py +1 -0
- mindsdb/interfaces/agents/langchain_agent.py +15 -10
- mindsdb/interfaces/agents/langfuse_callback_handler.py +4 -0
- mindsdb/interfaces/agents/mindsdb_database_agent.py +14 -0
- mindsdb/interfaces/database/integrations.py +5 -1
- mindsdb/interfaces/database/projects.py +38 -1
- mindsdb/interfaces/knowledge_base/controller.py +26 -11
- mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py +7 -26
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +18 -10
- mindsdb/interfaces/skills/skill_tool.py +12 -6
- mindsdb/interfaces/skills/skills_controller.py +7 -3
- mindsdb/interfaces/skills/sql_agent.py +81 -18
- mindsdb/utilities/langfuse.py +15 -0
- {MindsDB-25.1.4.0.dist-info → MindsDB-25.1.5.1.dist-info}/LICENSE +0 -0
- {MindsDB-25.1.4.0.dist-info → MindsDB-25.1.5.1.dist-info}/WHEEL +0 -0
- {MindsDB-25.1.4.0.dist-info → MindsDB-25.1.5.1.dist-info}/top_level.txt +0 -0
mindsdb/integrations/utilities/rag/settings.py

@@ -3,7 +3,6 @@ from typing import List, Union, Any, Optional, Dict
 
 from langchain_community.vectorstores.chroma import Chroma
 from langchain_community.vectorstores.pgvector import PGVector
-from langchain_community.tools.sql_database.prompt import QUERY_CHECKER as DEFAULT_QUERY_CHECKER_PROMPT_TEMPLATE
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 from langchain_core.language_models import BaseChatModel

@@ -94,6 +93,25 @@ Output only a single better search query and nothing else like in the example.
 Here is the user input: {input}
 '''
 
+DEFAULT_METADATA_FILTERS_PROMPT_TEMPLATE = '''Construct a list of PostgreSQL metadata filters to filter documents in the database based on the user input.
+
+<< INSTRUCTIONS >>
+{format_instructions}
+
+RETURN ONLY THE FINAL JSON. DO NOT EXPLAIN, JUST RETURN THE FINAL JSON.
+
+<< TABLES YOU HAVE ACCESS TO >>
+
+{schema}
+
+<< EXAMPLES >>
+
+{examples}
+
+Here is the user input:
+{input}
+'''
+
 DEFAULT_SQL_PROMPT_TEMPLATE = '''
 Construct a valid {dialect} SQL query to select documents relevant to the user input.
 Source documents are found in the {source_table} table. You may need to join with other tables to get additional document metadata.

@@ -377,6 +395,13 @@ class MetadataSchema(BaseModel):
     columns: List[ColumnSchema] = Field(
         description="List of column schemas describing the metadata columns available for the table"
     )
+    join: str = Field(
+        description="SQL join string to join this table with source documents table",
+        default=''
+    )
+
+    class Config:
+        frozen = True
 
 
 class LLMExample(BaseModel):

@@ -393,19 +418,9 @@ class SQLRetrieverConfig(BaseModel):
         default_factory=LLMConfig,
         description="LLM configuration to use for generating the final SQL query for retrieval"
     )
-
-        default=
-        description="
-        Has 'dialect', 'input', 'embeddings_table', 'source_table', 'embeddings', 'distance_function', 'schema', and 'examples' input variables.
-        """
-    )
-    query_checker_template: str = Field(
-        default=DEFAULT_QUERY_CHECKER_PROMPT_TEMPLATE,
-        description="Prompt template to use for double checking SQL queries before execution. Has 'query' and 'dialect' input variables."
-    )
-    query_retry_template: str = Field(
-        default=DEFAULT_QUERY_RETRY_PROMPT_TEMPLATE,
-        description="Prompt template to rewrite SQL query that failed. Has 'dialect', 'query', and 'error' input variables."
+    metadata_filters_prompt_template: str = Field(
+        default=DEFAULT_METADATA_FILTERS_PROMPT_TEMPLATE,
+        description="Prompt template to generate PostgreSQL metadata filters. Has 'format_instructions', 'schema', 'examples', and 'input' input variables"
     )
     num_retries: int = Field(
        default=DEFAULT_NUM_QUERY_RETRIES,
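
Net effect of the settings.py changes: the free-form SQL prompt, query-checker, and query-retry template fields are gone, and SQLRetrieverConfig instead carries a single metadata-filters template with 'format_instructions', 'schema', 'examples', and 'input' variables. A minimal sketch of rendering such a template; every value below is a hypothetical stand-in, and MindsDB more likely feeds it through a prompt-template class than raw str.format():

    # Hypothetical rendering of the new template; values are illustrative only.
    prompt = DEFAULT_METADATA_FILTERS_PROMPT_TEMPLATE.format(
        format_instructions='Return a JSON list of {"column", "op", "value"} objects.',  # hypothetical
        schema='documents(id, author, created_at)',                                      # hypothetical
        examples='"docs by bob" -> [{"column": "author", "op": "=", "value": "bob"}]',   # hypothetical
        input='documents created after 2024-01-01',                                      # hypothetical
    )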
mindsdb/interfaces/agents/agents_controller.py

@@ -1,5 +1,5 @@
 import datetime
-from typing import Dict, Iterator, List, Union, Tuple
+from typing import Dict, Iterator, List, Union, Tuple, Optional
 
 from langchain_core.tools import BaseTool
 from sqlalchemy.orm.attributes import flag_modified

@@ -70,7 +70,7 @@ class AgentsController:
 
         return model, provider
 
-    def get_agent(self, agent_name: str, project_name: str = 'mindsdb') -> db.Agents:
+    def get_agent(self, agent_name: str, project_name: str = 'mindsdb') -> Optional[db.Agents]:
         '''
         Gets an agent by name.
 

@@ -79,7 +79,7 @@
             project_name (str): The name of the containing project - must exist
 
         Returns:
-            agent (db.Agents): The database agent object
+            agent (Optional[db.Agents]): The database agent object
         '''
 
         project = self.project_controller.get(name=project_name)

@@ -252,6 +252,16 @@
         existing_agent = self.get_agent(agent_name, project_name=project_name)
         if existing_agent is None:
             raise EntityNotExistsError(f'Agent with name not found: {agent_name}')
+        is_demo = (existing_agent.params or {}).get('is_demo', False)
+        if (
+            is_demo and (
+                (name is not None and name != agent_name)
+                or (model_name or provider)
+                or (len(skills_to_add) > 0 or len(skills_to_remove) > 0 or len(skills_to_rewrite) > 0)
+                or (isinstance(params, dict) and len(params) > 1 and 'prompt_template' not in params)
+            )
+        ):
+            raise ValueError("It is forbidden to change properties of the demo object")
 
         if name is not None and name != agent_name:
             # Check to see if updated name already exists

@@ -352,6 +362,8 @@
         agent = self.get_agent(agent_name, project_name)
         if agent is None:
             raise ValueError(f'Agent with name does not exist: {agent_name}')
+        if isinstance(agent.params, dict) and agent.params.get('is_demo') is True:
+            raise ValueError('Unable to delete demo object')
         agent.deleted_at = datetime.datetime.now()
         db.session.commit()
 
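
For readability, the demo-object guard added to update_agent can be restated as a standalone predicate. This is a hedged paraphrase of the condition in the diff, not code shipped in the package:

    def demo_update_forbidden(name, agent_name, model_name, provider,
                              skills_to_add, skills_to_remove, skills_to_rewrite, params) -> bool:
        # Renames, model/provider changes, and skill edits are always rejected
        # for a demo agent; a params dict with more than one key is rejected
        # unless 'prompt_template' is among its keys.
        return (
            (name is not None and name != agent_name)
            or bool(model_name or provider)
            or len(skills_to_add) > 0 or len(skills_to_remove) > 0 or len(skills_to_rewrite) > 0
            or (isinstance(params, dict) and len(params) > 1 and 'prompt_template' not in params)
        )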
mindsdb/interfaces/agents/constants.py

@@ -165,6 +165,7 @@ PROVIDER_TO_MODELS = MappingProxyType(
 
 ASSISTANT_COLUMN = "answer"
 CONTEXT_COLUMN = "context"
+TRACE_ID_COLUMN = "trace_id"
 DEFAULT_AGENT_TIMEOUT_SECONDS = 300
 # These should require no additional arguments.
 DEFAULT_AGENT_TOOLS = []
mindsdb/interfaces/agents/langchain_agent.py

@@ -49,7 +49,7 @@ from .constants import (
     NVIDIA_NIM_CHAT_MODELS,
     USER_COLUMN,
     ASSISTANT_COLUMN,
-    CONTEXT_COLUMN
+    CONTEXT_COLUMN, TRACE_ID_COLUMN
 )
 from mindsdb.interfaces.skills.skill_tool import skill_tool, SkillData
 from langchain_anthropic import ChatAnthropic

@@ -371,9 +371,9 @@ class LangchainAgent:
         for row in df[:-1].to_dict("records"):
             question = row[user_column]
             answer = row[assistant_column]
-            if question:
+            if isinstance(question, str) and len(question) > 0:
                 memory.chat_memory.add_user_message(question)
-            if answer:
+            if isinstance(answer, str) and len(answer) > 0:
                 memory.chat_memory.add_ai_message(answer)
 
         agent_type = args.get("agent_type", DEFAULT_AGENT_TYPE)

@@ -455,9 +455,7 @@
 
         # custom tracer
         if self.mdb_langfuse_callback_handler is None:
-            trace_id =
-            if self.langfuse_client_wrapper.trace is not None:
-                trace_id = args.get("trace_id", self.langfuse_client_wrapper.trace.id)
+            trace_id = self.langfuse_client_wrapper.get_trace_id()
 
             span_id = None
             if self.run_completion_span is not None:

@@ -562,6 +560,7 @@ AI: {response}"""
                 CONTEXT_COLUMN: [
                     json.dumps(ctx) for ctx in contexts
                 ],  # Serialize context to JSON string
+                TRACE_ID_COLUMN: self.langfuse_client_wrapper.get_trace_id()
             }
         )
 

@@ -570,6 +569,12 @@
 
         return pred_df
 
+    def add_chunk_metadata(self, chunk: Dict) -> Dict:
+        logger.debug(f'Adding metadata to chunk: {chunk}')
+        logger.debug(f'Trace ID: {self.langfuse_client_wrapper.get_trace_id()}')
+        chunk["trace_id"] = self.langfuse_client_wrapper.get_trace_id()
+        return chunk
+
     def stream_agent(self, df: pd.DataFrame, agent_executor: AgentExecutor, args: Dict) -> Iterable[Dict]:
         base_template = args.get('prompt_template', args['prompt_template'])
         input_variables = re.findall(r"{{(.*?)}}", base_template)

@@ -579,7 +584,7 @@
 
         callbacks, context_callback = prepare_callbacks(self, args)
 
-        yield {"type": "start", "prompt": prompts[0]}
+        yield self.add_chunk_metadata({"type": "start", "prompt": prompts[0]})
 
         if not hasattr(agent_executor, 'stream') or not callable(agent_executor.stream):
             raise AttributeError("The agent_executor does not have a 'stream' method")

@@ -591,10 +596,10 @@
             raise TypeError("The stream method did not return an iterable")
 
         for chunk in stream_iterator:
-            logger.
+            logger.debug(f'Processing streaming chunk {chunk}')
             processed_chunk = self.process_chunk(chunk)
             logger.info(f'Processed chunk: {processed_chunk}')
-            yield processed_chunk
+            yield self.add_chunk_metadata(processed_chunk)
 
         if return_context:
             # Yield context if required

@@ -604,7 +609,7 @@
 
         if self.log_callback_handler.generated_sql:
             # Yield generated SQL if available
-            yield {"type": "sql", "content": self.log_callback_handler.generated_sql}
+            yield self.add_chunk_metadata({"type": "sql", "content": self.log_callback_handler.generated_sql})
 
         # End the run completion span and update the metadata with tool usage
         self.langfuse_client_wrapper.end_span_stream(span=self.run_completion_span)
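
Taken together, these hunks thread the Langfuse trace id through both output paths: batch predictions gain a trace_id column, and every streamed chunk now passes through add_chunk_metadata. A hedged consumer-side sketch (names from the diff; the surrounding setup is assumed):

    # Each streamed chunk now carries the Langfuse trace id, so a client can
    # correlate stream output with the trace that produced it.
    for chunk in agent.stream_agent(df, agent_executor, args):
        print(chunk.get("trace_id"), chunk.get("type"))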
mindsdb/interfaces/agents/langfuse_callback_handler.py

@@ -66,6 +66,10 @@ class LangfuseCallbackHandler(BaseCallbackHandler):
     ) -> Any:
         """Run when chain starts running."""
         run_uuid = kwargs.get('run_id', uuid4()).hex
+
+        if serialized is None:
+            serialized = {}
+
         chain_span = self.langfuse.span(
             name=f'{serialized.get("name", "chain")}-{run_uuid}',
             trace_id=self.trace_id,
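
Context for the guard: LangChain may invoke on_chain_start with serialized=None, in which case the old serialized.get(...) call raised. With the guard, the span name falls back to 'chain'. Equivalent standalone illustration:

    serialized = serialized if serialized is not None else {}
    span_name = f'{serialized.get("name", "chain")}-{run_uuid}'  # 'chain-<uuid>' when nothing was serialized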
mindsdb/interfaces/agents/mindsdb_database_agent.py

@@ -11,6 +11,17 @@ from mindsdb.interfaces.skills.sql_agent import SQLAgent
 logger = log.getLogger(__name__)
 
 
+def extract_essential(input: str) -> str:
+    """ Sometimes LLM include to input unnecessary data. We can't control stochastic nature of LLM, so we need to
+    'clean' input somehow. LLM prompt contains instruction to enclose input between '$START$' and '$STOP$'.
+    """
+    if '$START$' in input:
+        input = input.partition('$START$')[-1]
+    if '$STOP$' in input:
+        input = input.partition('$STOP$')[0]
+    return input.strip(' ')
+
+
 class MindsDBSQL(SQLDatabase):
     @staticmethod
     def custom_init(

@@ -50,7 +61,10 @@ class MindsDBSQL(SQLDatabase):
         return self._sql_agent.get_usable_table_names()
 
     def get_table_info_no_throw(self, table_names: Optional[List[str]] = None) -> str:
+        for i in range(len(table_names)):
+            table_names[i] = extract_essential(table_names[i])
         return self._sql_agent.get_table_info_safe(table_names)
 
     def run_no_throw(self, command: str, fetch: str = "all") -> str:
+        command = extract_essential(command)
         return self._sql_agent.query_safe(command)
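
The behaviour of extract_essential on typical model output, derived directly from the function above:

    extract_essential("Sure! $START$ SELECT a FROM `db`.`t` LIMIT 3; $STOP$ Hope that helps.")
    # -> 'SELECT a FROM `db`.`t` LIMIT 3;'
    extract_essential("SELECT 1;")
    # -> 'SELECT 1;'  (no delimiters: input passes through, outer spaces stripped)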
mindsdb/interfaces/database/integrations.py

@@ -215,6 +215,8 @@ class IntegrationController:
     def modify(self, name, data):
         self.handlers_cache.delete(name)
         integration_record = self._get_integration_record(name)
+        if isinstance(integration_record.data, dict) and integration_record.data.get('is_demo') is True:
+            raise ValueError("It is forbidden to change properties of the demo object")
         old_data = deepcopy(integration_record.data)
         for k in old_data:
             if k not in data:

@@ -234,9 +236,11 @@
             handler = self.handler_modules[name]
 
             if getattr(handler, 'permanent', False) is True:
-                raise Exception('Unable to drop
+                raise Exception('Unable to drop permanent integration')
 
         integration_record = self._get_integration_record(name)
+        if isinstance(integration_record.data, dict) and integration_record.data.get('is_demo') is True:
+            raise Exception('Unable to drop demo object')
 
         # if this is ml engine
         engine_models = get_model_records(ml_handler_name=name, deleted_at=None)
mindsdb/interfaces/database/projects.py

@@ -7,6 +7,7 @@ import sqlalchemy as sa
 import numpy as np
 
 from mindsdb_sql_parser.ast.base import ASTNode
+from mindsdb_sql_parser.ast import Select, Star, Constant, Identifier
 from mindsdb_sql_parser import parse_sql
 
 from mindsdb.interfaces.storage import db

@@ -16,6 +17,9 @@ from mindsdb.interfaces.database.views import ViewController
 from mindsdb.utilities.context import context as ctx
 from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
 import mindsdb.utilities.profiler as profiler
+from mindsdb.api.executor.sql_query import SQLQuery
+from mindsdb.api.executor.utilities.sql import query_df
+from mindsdb.interfaces.query_context.context_controller import query_context_controller
 
 
 class Project:

@@ -111,7 +115,7 @@
             project_name=self.name
         )
 
-    def
+    def get_view_meta(self, query: ASTNode) -> ASTNode:
         view_name = query.from_table.parts[-1]
         view_meta = ViewController().get(
             name=view_name,

@@ -120,6 +124,30 @@
         view_meta['query_ast'] = parse_sql(view_meta['query'])
         return view_meta
 
+    def query_view(self, query, session):
+
+        view_meta = self.get_view_meta(query)
+
+        query_context_controller.set_context('view', view_meta['id'])
+
+        try:
+            sqlquery = SQLQuery(
+                view_meta['query_ast'],
+                session=session
+            )
+            result = sqlquery.fetch(view='dataframe')
+
+        finally:
+            query_context_controller.release_context('view', view_meta['id'])
+
+        if result['success'] is False:
+            raise Exception(f"Cant execute view query: {view_meta['query_ast']}")
+        df = result['result']
+        # remove duplicated columns
+        df = df.loc[:, ~df.columns.duplicated()]
+
+        return query_df(df, query, session=session)
+
     @staticmethod
     def _get_model_data(predictor_record, integraion_record, with_secrets: bool = True):
         from mindsdb.interfaces.database.integrations import integration_controller

@@ -341,6 +369,15 @@
             columns = predictor_record.to_predict
             if not isinstance(columns, list):
                 columns = [columns]
+            return columns
+        if self.get_view(table_name):
+            query = Select(targets=[Star()], from_table=Identifier(table_name), limit=Constant(1))
+
+            from mindsdb.api.executor.controllers.session_controller import SessionController
+            session = SessionController()
+            session.database = self.name
+            df = self.query_view(query, session)
+            return df.columns
         else:
             # is it agent?
             agent = db.Agents.query.filter_by(
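
The last hunk is the consumer of query_view: to resolve a view's columns, a one-row SELECT is built from parser AST nodes and executed through the project. A compressed sketch of that path, assuming a Project instance and session as in the diff:

    # Build 'SELECT * FROM <view> LIMIT 1' and read the column list off the result.
    query = Select(targets=[Star()], from_table=Identifier('my_view'), limit=Constant(1))  # 'my_view' is hypothetical
    df = project.query_view(query, session)
    columns = list(df.columns)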
mindsdb/interfaces/knowledge_base/controller.py

@@ -26,6 +26,9 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
 )
 from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
 from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
+from mindsdb.integrations.utilities.sql_utils import (
+    extract_comparison_conditions, filter_dataframe, FilterCondition, FilterOperator
+)
 from mindsdb.interfaces.agents.constants import DEFAULT_EMBEDDINGS_MODEL_CLASS
 from mindsdb.interfaces.agents.langchain_agent import create_chat_model, get_llm_provider
 from mindsdb.interfaces.database.projects import ProjectController

@@ -101,18 +104,30 @@ class KnowledgeBaseTable:
         # Get response from vector db
         db_handler = self.get_vector_db()
         logger.debug(f"Using vector db handler: {type(db_handler)}")
-        resp = db_handler.query(query)
 
-
-
-
+        vector_filters, outer_filters = [], []
+        # update vector handlers, mark conditions as applied inside
+        for op, arg1, arg2 in extract_comparison_conditions(query.where):
+            condition = FilterCondition(arg1, FilterOperator(op.upper()), arg2)
+            if arg1 in (TableField.ID.value, TableField.CONTENT.value, TableField.EMBEDDINGS.value):
+                vector_filters.append(condition)
+            else:
+                outer_filters.append([op, arg1, arg2])
+
+        df = db_handler.dispatch_select(query, conditions=vector_filters)
+
+        if df is not None:
+            df = filter_dataframe(df, outer_filters)
+
+        logger.debug(f"Query returned {len(df)} rows")
+        logger.debug(f"Columns in response: {df.columns.tolist()}")
         # Log a sample of IDs to help diagnose issues
-        if not
-        logger.debug(f"Sample of IDs in response: {
+        if not df.empty:
+            logger.debug(f"Sample of IDs in response: {df['id'].head().tolist()}")
         else:
             logger.warning("Query returned no data")
 
-        return
+        return df
 
     def insert_files(self, file_names: List[str]):
         """Process and insert files"""

@@ -713,10 +728,6 @@ class KnowledgeBaseController:
             vector_db_params['vector_size'] = vector_size
             vector_db_name = self._create_persistent_pgvector(vector_db_params)
 
-            # create table in vectordb before creating KB
-            self.session.datahub.get(vector_db_name).integration_handler.create_table(
-                vector_table_name
-            )
         else:
             # create chroma db with same name
             vector_table_name = "default_collection"

@@ -728,6 +739,10 @@
         else:
             vector_db_name, vector_table_name = storage.parts
 
+        # create table in vectordb before creating KB
+        self.session.datahub.get(vector_db_name).integration_handler.create_table(
+            vector_table_name
+        )
         vector_database_id = self.session.integration_controller.get(vector_db_name)['id']
 
         # Store sparse vector settings in params if specified
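
The new select path splits WHERE conditions in two: filters on the vector store's own fields (id, content, embeddings) are pushed down to the handler via dispatch_select, while metadata conditions are applied afterwards with filter_dataframe. A standalone sketch of the split, with plain tuples standing in for FilterCondition:

    VECTOR_FIELDS = {"id", "content", "embeddings"}  # stand-ins for the TableField values

    def split_conditions(conditions):
        vector_filters, outer_filters = [], []
        for op, arg1, arg2 in conditions:
            (vector_filters if arg1 in VECTOR_FIELDS else outer_filters).append((op, arg1, arg2))
        return vector_filters, outer_filters

    # WHERE id = '1' AND author = 'bob'  ->  pushdown on id, post-filter on author
    split_conditions([("=", "id", "1"), ("=", "author", "bob")])
    # -> ([('=', 'id', '1')], [('=', 'author', 'bob')])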
mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py

@@ -1,15 +1,13 @@
 import os
 from typing import List, Iterator
 from langchain_core.documents import Document as LangchainDocument
-from langchain_text_splitters import MarkdownHeaderTextSplitter
+from langchain_text_splitters import MarkdownHeaderTextSplitter
 import pandas as pd
 
 from mindsdb.interfaces.file.file_controller import FileController
 from mindsdb.integrations.utilities.rag.loaders.file_loader import FileLoader
 from mindsdb.integrations.utilities.rag.splitters.file_splitter import (
     FileSplitter,
-    DEFAULT_CHUNK_SIZE,
-    DEFAULT_CHUNK_OVERLAP
 )
 from mindsdb.integrations.handlers.web_handler.urlcrawl_helpers import get_all_websites
 from mindsdb.interfaces.knowledge_base.preprocessing.models import Document

@@ -45,12 +43,6 @@ class DocumentLoader:
         self.file_loader_class = file_loader_class
         self.mysql_proxy = mysql_proxy
 
-        # Initialize text splitter for query results with default settings
-        self.query_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=DEFAULT_CHUNK_SIZE,
-            chunk_overlap=DEFAULT_CHUNK_OVERLAP
-        )
-
     def load_files(self, file_names: List[str]) -> Iterator[Document]:
         """Load and split documents from files"""
         for file_name in file_names:

@@ -143,8 +135,9 @@
 
         # Process each row into a Document
         for _, row in df.iterrows():
-            # Extract content
+            # Extract id, content and metadata
             content = str(row.get('content', ''))
+            id = row.get('id', None)
 
             # Convert remaining columns to metadata
             metadata = {

@@ -156,21 +149,9 @@
 
             # Split content using recursive splitter
             if content:
-
-
+
+                yield Document(
+                    id=id,
+                    content=content,
                     metadata=metadata
                 )
-            # Use FileSplitter with default recursive splitter
-            split_docs = self.file_splitter.split_documents(
-                [doc],
-                default_failover=True
-            )
-
-            for split_doc in split_docs:
-                metadata = doc.metadata.copy()
-                metadata.update(split_doc.metadata or {})
-
-                yield Document(
-                    content=split_doc.page_content,
-                    metadata=metadata
-                )
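
Net effect: rows returned from a query are no longer re-split with FileSplitter; each dataframe row maps to exactly one Document carrying its id, content, and leftover columns as metadata. A hedged sketch of the per-row mapping:

    row = {"id": "a1", "content": "hello world", "author": "bob"}  # hypothetical row
    doc = Document(
        id=row.get("id"),
        content=str(row.get("content", "")),
        metadata={k: v for k, v in row.items() if k not in ("id", "content")},
    )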
mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py

@@ -15,19 +15,25 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
         list_sql_database_tool = ListSQLDatabaseTool(
             name=f'sql_db_list_tables{prefix}',
             db=self.db,
-            description=(
-
-
-
-            )
+            description=dedent("""\n
+                Input is an empty string, output is a comma-separated list of tables in the database. Each table name is escaped using backticks.
+                Each table name in the list may be in one of two formats: database_name.`table_name` or database_name.schema_name.`table_name`.
+                Table names in response to the user must be escaped using backticks.
+            """)
         )
 
         info_sql_database_tool_description = (
-            "Input: A comma-separated list of tables
-
+            "Input: A comma-separated list of tables enclosed between the symbols $START$ and $STOP$. The tables names itself must be escaped using backticks.\n"
+            "Output: Schema and sample rows for those tables. \n"
             "Use this tool to investigate table schemas for needed columns. "
-            "
-            "
+            f"Ensure tables exist by calling {list_sql_database_tool.name} first. "
+            # "The names of tables, schemas, and databases must be escaped using backticks. "
+            # "Always enclose the names of tables, schemas, and databases in backticks. "
+            "Get sample data with 'SELECT * FROM `database`.`table` LIMIT 3' before answering questions. \n"
+            "Example of correct Input:\n $START$ `database`.`table1`, `database`.`table2`, `database`.`table3` $STOP$\n"
+            " $START$ `table1` `table2` `table3` $STOP$\n"
+            "Example of wrong Input:\n $START$ `database.table1`, `database.table2`, `database.table3` $STOP$\n"
+            " $START$ table1 table2 table3 $STOP$\n"
         )
         info_sql_database_tool = InfoSQLDatabaseTool(
             name=f'sql_db_schema{prefix}',

@@ -35,7 +41,7 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
         )
 
         query_sql_database_tool_description = dedent(f"""\
-            Input: A detailed SQL query.
+            Input: A detailed and well-structured SQL query. The query must be enclosed between the symbols $START$ and $STOP$.
             Output: Database result or error message. For errors, rewrite and retry the query. For 'Unknown column' errors, use '{info_sql_database_tool.name}' to check table fields.
             This system is a highly intelligent and reliable PostgreSQL SQL skill designed to work with databases.
             Follow these instructions with utmost precision:

@@ -63,6 +69,8 @@
             SELECT NOW() - INTERVAL 1 YEAR;
             6. Query Best Practices:
             - Always send only one query at a time.
+            - Always enclose the names of tables, schemas, and databases in backticks.
+            - The input SQL query must end with a semicolon.
             - Query only necessary columns, not all.
             - Use only existing column names from correct tables.
             - Use database-specific syntax for date operations.
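
These prompt changes are the other half of the extract_essential convention in mindsdb_database_agent.py: tool input is wrapped in $START$/$STOP$ so stray model chatter can be stripped, and identifiers are backtick-escaped. What a well-formed query-tool input now looks like (hypothetical names):

    tool_input = "$START$ SELECT `name` FROM `mydb`.`customers` LIMIT 3; $STOP$"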
mindsdb/interfaces/skills/skill_tool.py

@@ -126,6 +126,10 @@ class SkillToolController:
 
         command_executor = self.get_command_executor()
 
+        def escape_table_name(name: str) -> str:
+            name = name.strip(' `')
+            return f'`{name}`'
+
         tables_list = []
         for skill in skills:
             database = skill.params['database']

@@ -137,19 +141,22 @@
             else:
                 response = handler.get_tables()
                 # no restrictions
+                columns = [c.lower() for c in response.data_frame.columns]
+                name_idx = columns.index('table_name') if 'table_name' in columns else 0
+
                 if 'table_schema' in response.data_frame.columns:
                     for _, row in response.data_frame.iterrows():
-                        tables_list.append(f"{database}.{row['table_schema']}.{row[
+                        tables_list.append(f"{database}.{row['table_schema']}.{escape_table_name(row[name_idx])}")
                 else:
-                    for
-                        tables_list.append(f"{database}.{
+                    for table_name in response.data_frame.iloc[:, name_idx]:
+                        tables_list.append(f"{database}.{escape_table_name(table_name)}")
                 continue
             for schema_name, tables in restriction_on_tables.items():
                 for table in tables:
                     if schema_name is None:
-                        tables_list.append(f'{database}.{table}')
+                        tables_list.append(f'{database}.{escape_table_name(table)}')
                     else:
-                        tables_list.append(f'{database}.{schema_name}.{table}')
+                        tables_list.append(f'{database}.{schema_name}.{escape_table_name(table)}')
 
         sql_agent = SQLAgent(
             command_executor=command_executor,

@@ -219,7 +226,6 @@
         return build_retrieval_tool(tool, pred_args, skill)
 
     def _get_rag_query_function(self, skill: db.Skills):
-
         session_controller = self.get_command_executor().session
 
         def _answer_question(question: str) -> str:
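
escape_table_name strips any existing backticks and spaces before re-wrapping, so it is idempotent; a quick check straight from the definition above:

    escape_table_name("orders")      # -> '`orders`'
    escape_table_name(" `orders` ")  # -> '`orders`'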
mindsdb/interfaces/skills/skills_controller.py

@@ -1,5 +1,5 @@
 import datetime
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 from sqlalchemy import null
 from sqlalchemy.orm.attributes import flag_modified

@@ -16,7 +16,7 @@ class SkillsController:
         project_controller = ProjectController()
         self.project_controller = project_controller
 
-    def get_skill(self, skill_name: str, project_name: str = 'mindsdb') -> db.Skills:
+    def get_skill(self, skill_name: str, project_name: str = 'mindsdb') -> Optional[db.Skills]:
         '''
         Gets a skill by name. Skills are expected to have unique names.
 

@@ -25,7 +25,7 @@
             project_name (str): The name of the containing project
 
         Returns:
-            skill (db.Skills): The database skill object
+            skill (Optional[db.Skills]): The database skill object
 
         Raises:
             ValueError: If `project_name` does not exist

@@ -136,6 +136,8 @@
         existing_skill = self.get_skill(skill_name, project_name)
         if existing_skill is None:
             raise ValueError(f'Skill with name not found: {skill_name}')
+        if isinstance(existing_skill.params, dict) and existing_skill.params.get('is_demo') is True:
+            raise ValueError("It is forbidden to change properties of the demo object")
 
         if new_name is not None:
             existing_skill.name = new_name

@@ -171,5 +173,7 @@
         skill = self.get_skill(skill_name, project_name)
         if skill is None:
             raise ValueError(f"Skill with name doesn't exist: {skill_name}")
+        if isinstance(skill.params, dict) and skill.params.get('is_demo') is True:
+            raise ValueError("Unable to delete demo object")
         skill.deleted_at = datetime.datetime.now()
         db.session.commit()