MindsDB-25.1.4.0-py3-none-any.whl → MindsDB-25.1.5.1-py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (44)
  1. {MindsDB-25.1.4.0.dist-info → MindsDB-25.1.5.1.dist-info}/METADATA +235 -246
  2. {MindsDB-25.1.4.0.dist-info → MindsDB-25.1.5.1.dist-info}/RECORD +44 -42
  3. mindsdb/__about__.py +1 -1
  4. mindsdb/api/executor/datahub/datanodes/datanode.py +1 -1
  5. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
  6. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +1 -1
  7. mindsdb/api/executor/datahub/datanodes/project_datanode.py +2 -26
  8. mindsdb/api/http/namespaces/agents.py +3 -1
  9. mindsdb/api/http/namespaces/knowledge_bases.py +4 -1
  10. mindsdb/integrations/handlers/databricks_handler/requirements.txt +1 -1
  11. mindsdb/integrations/handlers/file_handler/requirements.txt +0 -4
  12. mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_handler.py +1 -1
  13. mindsdb/integrations/handlers/ms_one_drive_handler/ms_one_drive_tables.py +8 -0
  14. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +4 -2
  15. mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +5 -3
  16. mindsdb/integrations/handlers/snowflake_handler/requirements.txt +1 -1
  17. mindsdb/integrations/handlers/web_handler/requirements.txt +0 -1
  18. mindsdb/integrations/libs/ml_handler_process/learn_process.py +1 -1
  19. mindsdb/integrations/libs/vectordatabase_handler.py +4 -3
  20. mindsdb/integrations/utilities/files/__init__.py +0 -0
  21. mindsdb/integrations/utilities/files/file_reader.py +258 -0
  22. mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +2 -1
  23. mindsdb/integrations/utilities/handlers/auth_utilities/microsoft/ms_graph_api_auth_utilities.py +8 -3
  24. mindsdb/integrations/utilities/rag/chains/map_reduce_summarizer_chain.py +5 -9
  25. mindsdb/integrations/utilities/rag/pipelines/rag.py +1 -3
  26. mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +97 -89
  27. mindsdb/integrations/utilities/rag/settings.py +29 -14
  28. mindsdb/interfaces/agents/agents_controller.py +15 -3
  29. mindsdb/interfaces/agents/constants.py +1 -0
  30. mindsdb/interfaces/agents/langchain_agent.py +15 -10
  31. mindsdb/interfaces/agents/langfuse_callback_handler.py +4 -0
  32. mindsdb/interfaces/agents/mindsdb_database_agent.py +14 -0
  33. mindsdb/interfaces/database/integrations.py +5 -1
  34. mindsdb/interfaces/database/projects.py +38 -1
  35. mindsdb/interfaces/knowledge_base/controller.py +26 -11
  36. mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py +7 -26
  37. mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py +18 -10
  38. mindsdb/interfaces/skills/skill_tool.py +12 -6
  39. mindsdb/interfaces/skills/skills_controller.py +7 -3
  40. mindsdb/interfaces/skills/sql_agent.py +81 -18
  41. mindsdb/utilities/langfuse.py +15 -0
  42. {MindsDB-25.1.4.0.dist-info → MindsDB-25.1.5.1.dist-info}/LICENSE +0 -0
  43. {MindsDB-25.1.4.0.dist-info → MindsDB-25.1.5.1.dist-info}/WHEEL +0 -0
  44. {MindsDB-25.1.4.0.dist-info → MindsDB-25.1.5.1.dist-info}/top_level.txt +0 -0
mindsdb/integrations/utilities/rag/settings.py

@@ -3,7 +3,6 @@ from typing import List, Union, Any, Optional, Dict
 
 from langchain_community.vectorstores.chroma import Chroma
 from langchain_community.vectorstores.pgvector import PGVector
-from langchain_community.tools.sql_database.prompt import QUERY_CHECKER as DEFAULT_QUERY_CHECKER_PROMPT_TEMPLATE
 from langchain_core.documents import Document
 from langchain_core.embeddings import Embeddings
 from langchain_core.language_models import BaseChatModel
@@ -94,6 +93,25 @@ Output only a single better search query and nothing else like in the example.
 Here is the user input: {input}
 '''
 
+DEFAULT_METADATA_FILTERS_PROMPT_TEMPLATE = '''Construct a list of PostgreSQL metadata filters to filter documents in the database based on the user input.
+
+<< INSTRUCTIONS >>
+{format_instructions}
+
+RETURN ONLY THE FINAL JSON. DO NOT EXPLAIN, JUST RETURN THE FINAL JSON.
+
+<< TABLES YOU HAVE ACCESS TO >>
+
+{schema}
+
+<< EXAMPLES >>
+
+{examples}
+
+Here is the user input:
+{input}
+'''
+
 DEFAULT_SQL_PROMPT_TEMPLATE = '''
 Construct a valid {dialect} SQL query to select documents relevant to the user input.
 Source documents are found in the {source_table} table. You may need to join with other tables to get additional document metadata.
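Note: the new template defers all variable substitution to the caller. A minimal sketch of how it might be rendered with langchain's PromptTemplate; the schema and example strings below are made-up placeholders, and the import path is assumed from the files-changed list:

    from langchain_core.prompts import PromptTemplate
    # Assumed module path; the constant is the one added in the hunk above.
    from mindsdb.integrations.utilities.rag.settings import DEFAULT_METADATA_FILTERS_PROMPT_TEMPLATE

    prompt = PromptTemplate.from_template(DEFAULT_METADATA_FILTERS_PROMPT_TEMPLATE)
    rendered = prompt.format(
        format_instructions="Return a JSON list of {column, operator, value} objects.",
        schema="documents(id TEXT, author TEXT, created_at DATE)",   # hypothetical table
        examples="input 'docs by Ann' -> [{'column': 'author', 'operator': '=', 'value': 'Ann'}]",
        input="show me documents written by Ann this year",
    )
    print(rendered)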
@@ -377,6 +395,13 @@ class MetadataSchema(BaseModel):
     columns: List[ColumnSchema] = Field(
         description="List of column schemas describing the metadata columns available for the table"
     )
+    join: str = Field(
+        description="SQL join string to join this table with source documents table",
+        default=''
+    )
+
+    class Config:
+        frozen = True
 
 
 class LLMExample(BaseModel):
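Note: `frozen = True` makes a pydantic model immutable and therefore hashable, so schema objects can be deduplicated in sets or used as cache keys. A standalone illustration, not MindsDB code:

    from pydantic import BaseModel

    class FrozenDemo(BaseModel):
        name: str

        class Config:
            frozen = True

    item = FrozenDemo(name='documents')
    # item.name = 'other'  # raises an error (exact exception depends on the pydantic version)
    unique = {item, FrozenDemo(name='documents')}  # hashable: the set collapses duplicates
    assert len(unique) == 1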
@@ -393,19 +418,9 @@ class SQLRetrieverConfig(BaseModel):
         default_factory=LLMConfig,
         description="LLM configuration to use for generating the final SQL query for retrieval"
     )
-    sql_prompt_template: str = Field(
-        default=DEFAULT_SQL_PROMPT_TEMPLATE,
-        description="""Prompt template to generate the SQL query to execute against the vector database. Currently only pgvector is supported.
-        Has 'dialect', 'input', 'embeddings_table', 'source_table', 'embeddings', 'distance_function', 'schema', and 'examples' input variables.
-        """
-    )
-    query_checker_template: str = Field(
-        default=DEFAULT_QUERY_CHECKER_PROMPT_TEMPLATE,
-        description="Prompt template to use for double checking SQL queries before execution. Has 'query' and 'dialect' input variables."
-    )
-    query_retry_template: str = Field(
-        default=DEFAULT_QUERY_RETRY_PROMPT_TEMPLATE,
-        description="Prompt template to rewrite SQL query that failed. Has 'dialect', 'query', and 'error' input variables."
+    metadata_filters_prompt_template: str = Field(
+        default=DEFAULT_METADATA_FILTERS_PROMPT_TEMPLATE,
+        description="Prompt template to generate PostgreSQL metadata filters. Has 'format_instructions', 'schema', 'examples', and 'input' input variables"
     )
     num_retries: int = Field(
         default=DEFAULT_NUM_QUERY_RETRIES,
mindsdb/interfaces/agents/agents_controller.py

@@ -1,5 +1,5 @@
 import datetime
-from typing import Dict, Iterator, List, Union, Tuple
+from typing import Dict, Iterator, List, Union, Tuple, Optional
 
 from langchain_core.tools import BaseTool
 from sqlalchemy.orm.attributes import flag_modified
@@ -70,7 +70,7 @@ class AgentsController:
 
         return model, provider
 
-    def get_agent(self, agent_name: str, project_name: str = 'mindsdb') -> db.Agents:
+    def get_agent(self, agent_name: str, project_name: str = 'mindsdb') -> Optional[db.Agents]:
         '''
         Gets an agent by name.
 
@@ -79,7 +79,7 @@ class AgentsController:
            project_name (str): The name of the containing project - must exist
 
         Returns:
-            agent (db.Agents): The database agent object
+            agent (Optional[db.Agents]): The database agent object
         '''
 
         project = self.project_controller.get(name=project_name)
@@ -252,6 +252,16 @@
         existing_agent = self.get_agent(agent_name, project_name=project_name)
         if existing_agent is None:
             raise EntityNotExistsError(f'Agent with name not found: {agent_name}')
+        is_demo = (existing_agent.params or {}).get('is_demo', False)
+        if (
+            is_demo and (
+                (name is not None and name != agent_name)
+                or (model_name or provider)
+                or (len(skills_to_add) > 0 or len(skills_to_remove) > 0 or len(skills_to_rewrite) > 0)
+                or (isinstance(params, dict) and len(params) > 1 and 'prompt_template' not in params)
+            )
+        ):
+            raise ValueError("It is forbidden to change properties of the demo object")
 
         if name is not None and name != agent_name:
             # Check to see if updated name already exists
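Note: the guard still permits updating `prompt_template` alone on a demo agent; every other change is rejected. A condensed, hypothetical restatement of the condition (illustration only, not MindsDB code):

    def demo_update_allowed(renamed: bool, model_or_provider_changed: bool,
                            skills_changed: bool, params: dict) -> bool:
        return not (
            renamed
            or model_or_provider_changed
            or skills_changed
            or (isinstance(params, dict) and len(params) > 1 and 'prompt_template' not in params)
        )

    assert demo_update_allowed(False, False, False, {'prompt_template': 'Answer briefly.'})
    assert not demo_update_allowed(True, False, False, {})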
@@ -352,6 +362,8 @@
         agent = self.get_agent(agent_name, project_name)
         if agent is None:
             raise ValueError(f'Agent with name does not exist: {agent_name}')
+        if isinstance(agent.params, dict) and agent.params.get('is_demo') is True:
+            raise ValueError('Unable to delete demo object')
         agent.deleted_at = datetime.datetime.now()
         db.session.commit()
 
mindsdb/interfaces/agents/constants.py

@@ -165,6 +165,7 @@ PROVIDER_TO_MODELS = MappingProxyType(
 
 ASSISTANT_COLUMN = "answer"
 CONTEXT_COLUMN = "context"
+TRACE_ID_COLUMN = "trace_id"
 DEFAULT_AGENT_TIMEOUT_SECONDS = 300
 # These should require no additional arguments.
 DEFAULT_AGENT_TOOLS = []
mindsdb/interfaces/agents/langchain_agent.py

@@ -49,7 +49,7 @@ from .constants import (
     NVIDIA_NIM_CHAT_MODELS,
     USER_COLUMN,
     ASSISTANT_COLUMN,
-    CONTEXT_COLUMN
+    CONTEXT_COLUMN, TRACE_ID_COLUMN
 )
 from mindsdb.interfaces.skills.skill_tool import skill_tool, SkillData
 from langchain_anthropic import ChatAnthropic
@@ -371,9 +371,9 @@ class LangchainAgent:
         for row in df[:-1].to_dict("records"):
             question = row[user_column]
             answer = row[assistant_column]
-            if question:
+            if isinstance(question, str) and len(question) > 0:
                 memory.chat_memory.add_user_message(question)
-            if answer:
+            if isinstance(answer, str) and len(answer) > 0:
                 memory.chat_memory.add_ai_message(answer)
 
         agent_type = args.get("agent_type", DEFAULT_AGENT_TYPE)
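Note: the stricter type checks matter because a missing cell in a pandas DataFrame usually surfaces as float('nan'), which is truthy, so the old `if question:` check let NaN slip into chat memory:

    q = float('nan')               # what an empty DataFrame cell typically becomes
    assert bool(q) is True         # old check: NaN passes through
    assert not isinstance(q, str)  # new check: NaN is skipped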
@@ -455,9 +455,7 @@
 
         # custom tracer
         if self.mdb_langfuse_callback_handler is None:
-            trace_id = None
-            if self.langfuse_client_wrapper.trace is not None:
-                trace_id = args.get("trace_id", self.langfuse_client_wrapper.trace.id)
+            trace_id = self.langfuse_client_wrapper.get_trace_id()
 
             span_id = None
             if self.run_completion_span is not None:
@@ -562,6 +560,7 @@ AI: {response}"""
                 CONTEXT_COLUMN: [
                     json.dumps(ctx) for ctx in contexts
                 ],  # Serialize context to JSON string
+                TRACE_ID_COLUMN: self.langfuse_client_wrapper.get_trace_id()
             }
         )
 
@@ -570,6 +569,12 @@ AI: {response}"""
 
         return pred_df
 
+    def add_chunk_metadata(self, chunk: Dict) -> Dict:
+        logger.debug(f'Adding metadata to chunk: {chunk}')
+        logger.debug(f'Trace ID: {self.langfuse_client_wrapper.get_trace_id()}')
+        chunk["trace_id"] = self.langfuse_client_wrapper.get_trace_id()
+        return chunk
+
     def stream_agent(self, df: pd.DataFrame, agent_executor: AgentExecutor, args: Dict) -> Iterable[Dict]:
         base_template = args.get('prompt_template', args['prompt_template'])
         input_variables = re.findall(r"{{(.*?)}}", base_template)
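Note: with add_chunk_metadata in place, every chunk yielded by stream_agent carries the Langfuse trace id. A hypothetical consumer (the variable names here are assumptions):

    # 'agent' is a LangchainAgent instance; df/agent_executor/args as passed to stream_agent.
    for chunk in agent.stream_agent(df, agent_executor, args):
        trace_id = chunk.get('trace_id')
        if chunk.get('type') == 'sql':
            print(f'[{trace_id}] generated SQL: {chunk["content"]}')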
@@ -579,7 +584,7 @@ AI: {response}"""
 
         callbacks, context_callback = prepare_callbacks(self, args)
 
-        yield {"type": "start", "prompt": prompts[0]}
+        yield self.add_chunk_metadata({"type": "start", "prompt": prompts[0]})
 
         if not hasattr(agent_executor, 'stream') or not callable(agent_executor.stream):
             raise AttributeError("The agent_executor does not have a 'stream' method")
@@ -591,10 +596,10 @@ AI: {response}"""
             raise TypeError("The stream method did not return an iterable")
 
         for chunk in stream_iterator:
-            logger.info(f'Processing streaming chunk {chunk}')
+            logger.debug(f'Processing streaming chunk {chunk}')
             processed_chunk = self.process_chunk(chunk)
             logger.info(f'Processed chunk: {processed_chunk}')
-            yield processed_chunk
+            yield self.add_chunk_metadata(processed_chunk)
 
         if return_context:
             # Yield context if required
@@ -604,7 +609,7 @@ AI: {response}"""
 
         if self.log_callback_handler.generated_sql:
             # Yield generated SQL if available
-            yield {"type": "sql", "content": self.log_callback_handler.generated_sql}
+            yield self.add_chunk_metadata({"type": "sql", "content": self.log_callback_handler.generated_sql})
 
         # End the run completion span and update the metadata with tool usage
         self.langfuse_client_wrapper.end_span_stream(span=self.run_completion_span)
mindsdb/interfaces/agents/langfuse_callback_handler.py

@@ -66,6 +66,10 @@ class LangfuseCallbackHandler(BaseCallbackHandler):
     ) -> Any:
         """Run when chain starts running."""
         run_uuid = kwargs.get('run_id', uuid4()).hex
+
+        if serialized is None:
+            serialized = {}
+
         chain_span = self.langfuse.span(
             name=f'{serialized.get("name", "chain")}-{run_uuid}',
             trace_id=self.trace_id,
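Note: some langchain versions invoke on_chain_start with serialized=None, in which case the `serialized.get("name", "chain")` call just below would raise AttributeError; with the guard, the span name simply falls back to "chain":

    serialized = None          # as passed by some langchain versions
    if serialized is None:
        serialized = {}
    assert serialized.get('name', 'chain') == 'chain'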
mindsdb/interfaces/agents/mindsdb_database_agent.py

@@ -11,6 +11,17 @@ from mindsdb.interfaces.skills.sql_agent import SQLAgent
 logger = log.getLogger(__name__)
 
 
+def extract_essential(input: str) -> str:
+    """ Sometimes LLM include to input unnecessary data. We can't control stochastic nature of LLM, so we need to
+    'clean' input somehow. LLM prompt contains instruction to enclose input between '$START$' and '$STOP$'.
+    """
+    if '$START$' in input:
+        input = input.partition('$START$')[-1]
+    if '$STOP$' in input:
+        input = input.partition('$STOP$')[0]
+    return input.strip(' ')
+
+
 class MindsDBSQL(SQLDatabase):
     @staticmethod
     def custom_init(
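Note: behavior of the new helper on typical model outputs. Text outside the markers, including polite filler, is discarded, and inputs without markers pass through unchanged:

    assert extract_essential('$START$ SELECT 1; $STOP$') == 'SELECT 1;'
    assert extract_essential('Sure! $START$ `db`.`t1`, `db`.`t2` $STOP$ Hope this helps.') == '`db`.`t1`, `db`.`t2`'
    assert extract_essential('no markers at all') == 'no markers at all'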
@@ -50,7 +61,10 @@ class MindsDBSQL(SQLDatabase):
         return self._sql_agent.get_usable_table_names()
 
     def get_table_info_no_throw(self, table_names: Optional[List[str]] = None) -> str:
+        for i in range(len(table_names)):
+            table_names[i] = extract_essential(table_names[i])
         return self._sql_agent.get_table_info_safe(table_names)
 
     def run_no_throw(self, command: str, fetch: str = "all") -> str:
+        command = extract_essential(command)
         return self._sql_agent.query_safe(command)
mindsdb/interfaces/database/integrations.py

@@ -215,6 +215,8 @@ class IntegrationController:
     def modify(self, name, data):
         self.handlers_cache.delete(name)
         integration_record = self._get_integration_record(name)
+        if isinstance(integration_record.data, dict) and integration_record.data.get('is_demo') is True:
+            raise ValueError("It is forbidden to change properties of the demo object")
         old_data = deepcopy(integration_record.data)
         for k in old_data:
             if k not in data:
@@ -234,9 +236,11 @@ class IntegrationController:
         handler = self.handler_modules[name]
 
         if getattr(handler, 'permanent', False) is True:
-            raise Exception('Unable to drop: is permanent integration')
+            raise Exception('Unable to drop permanent integration')
 
         integration_record = self._get_integration_record(name)
+        if isinstance(integration_record.data, dict) and integration_record.data.get('is_demo') is True:
+            raise Exception('Unable to drop demo object')
 
         # if this is ml engine
         engine_models = get_model_records(ml_handler_name=name, deleted_at=None)
mindsdb/interfaces/database/projects.py

@@ -7,6 +7,7 @@ import sqlalchemy as sa
 import numpy as np
 
 from mindsdb_sql_parser.ast.base import ASTNode
+from mindsdb_sql_parser.ast import Select, Star, Constant, Identifier
 from mindsdb_sql_parser import parse_sql
 
 from mindsdb.interfaces.storage import db
@@ -16,6 +17,9 @@ from mindsdb.interfaces.database.views import ViewController
 from mindsdb.utilities.context import context as ctx
 from mindsdb.utilities.exception import EntityExistsError, EntityNotExistsError
 import mindsdb.utilities.profiler as profiler
+from mindsdb.api.executor.sql_query import SQLQuery
+from mindsdb.api.executor.utilities.sql import query_df
+from mindsdb.interfaces.query_context.context_controller import query_context_controller
 
 
 class Project:
@@ -111,7 +115,7 @@ class Project:
             project_name=self.name
         )
 
-    def query_view(self, query: ASTNode) -> ASTNode:
+    def get_view_meta(self, query: ASTNode) -> ASTNode:
         view_name = query.from_table.parts[-1]
         view_meta = ViewController().get(
             name=view_name,
@@ -120,6 +124,30 @@
         view_meta['query_ast'] = parse_sql(view_meta['query'])
         return view_meta
 
+    def query_view(self, query, session):
+
+        view_meta = self.get_view_meta(query)
+
+        query_context_controller.set_context('view', view_meta['id'])
+
+        try:
+            sqlquery = SQLQuery(
+                view_meta['query_ast'],
+                session=session
+            )
+            result = sqlquery.fetch(view='dataframe')
+
+        finally:
+            query_context_controller.release_context('view', view_meta['id'])
+
+        if result['success'] is False:
+            raise Exception(f"Cant execute view query: {view_meta['query_ast']}")
+        df = result['result']
+        # remove duplicated columns
+        df = df.loc[:, ~df.columns.duplicated()]
+
+        return query_df(df, query, session=session)
+
     @staticmethod
     def _get_model_data(predictor_record, integraion_record, with_secrets: bool = True):
         from mindsdb.interfaces.database.integrations import integration_controller
@@ -341,6 +369,15 @@
             columns = predictor_record.to_predict
             if not isinstance(columns, list):
                 columns = [columns]
+            return columns
+        if self.get_view(table_name):
+            query = Select(targets=[Star()], from_table=Identifier(table_name), limit=Constant(1))
+
+            from mindsdb.api.executor.controllers.session_controller import SessionController
+            session = SessionController()
+            session.database = self.name
+            df = self.query_view(query, session)
+            return df.columns
         else:
             # is it agent?
             agent = db.Agents.query.filter_by(
mindsdb/interfaces/knowledge_base/controller.py

@@ -26,6 +26,9 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
 )
 from mindsdb.integrations.utilities.rag.rag_pipeline_builder import RAG
 from mindsdb.integrations.utilities.rag.config_loader import load_rag_config
+from mindsdb.integrations.utilities.sql_utils import (
+    extract_comparison_conditions, filter_dataframe, FilterCondition, FilterOperator
+)
 from mindsdb.interfaces.agents.constants import DEFAULT_EMBEDDINGS_MODEL_CLASS
 from mindsdb.interfaces.agents.langchain_agent import create_chat_model, get_llm_provider
 from mindsdb.interfaces.database.projects import ProjectController
@@ -101,18 +104,30 @@ class KnowledgeBaseTable:
         # Get response from vector db
         db_handler = self.get_vector_db()
         logger.debug(f"Using vector db handler: {type(db_handler)}")
-        resp = db_handler.query(query)
 
-        if resp.data_frame is not None:
-            logger.debug(f"Query returned {len(resp.data_frame)} rows")
-            logger.debug(f"Columns in response: {resp.data_frame.columns.tolist()}")
+        vector_filters, outer_filters = [], []
+        # update vector handlers, mark conditions as applied inside
+        for op, arg1, arg2 in extract_comparison_conditions(query.where):
+            condition = FilterCondition(arg1, FilterOperator(op.upper()), arg2)
+            if arg1 in (TableField.ID.value, TableField.CONTENT.value, TableField.EMBEDDINGS.value):
+                vector_filters.append(condition)
+            else:
+                outer_filters.append([op, arg1, arg2])
+
+        df = db_handler.dispatch_select(query, conditions=vector_filters)
+
+        if df is not None:
+            df = filter_dataframe(df, outer_filters)
+
+            logger.debug(f"Query returned {len(df)} rows")
+            logger.debug(f"Columns in response: {df.columns.tolist()}")
             # Log a sample of IDs to help diagnose issues
-            if not resp.data_frame.empty:
-                logger.debug(f"Sample of IDs in response: {resp.data_frame['id'].head().tolist()}")
+            if not df.empty:
+                logger.debug(f"Sample of IDs in response: {df['id'].head().tolist()}")
         else:
             logger.warning("Query returned no data")
 
-        return resp.data_frame
+        return df
 
     def insert_files(self, file_names: List[str]):
         """Process and insert files"""
@@ -713,10 +728,6 @@ class KnowledgeBaseController:
             vector_db_params['vector_size'] = vector_size
             vector_db_name = self._create_persistent_pgvector(vector_db_params)
 
-            # create table in vectordb before creating KB
-            self.session.datahub.get(vector_db_name).integration_handler.create_table(
-                vector_table_name
-            )
         else:
             # create chroma db with same name
             vector_table_name = "default_collection"
@@ -728,6 +739,10 @@ class KnowledgeBaseController:
 
         else:
             vector_db_name, vector_table_name = storage.parts
+        # create table in vectordb before creating KB
+        self.session.datahub.get(vector_db_name).integration_handler.create_table(
+            vector_table_name
+        )
         vector_database_id = self.session.integration_controller.get(vector_db_name)['id']
 
         # Store sparse vector settings in params if specified
mindsdb/interfaces/knowledge_base/preprocessing/document_loader.py

@@ -1,15 +1,13 @@
 import os
 from typing import List, Iterator
 from langchain_core.documents import Document as LangchainDocument
-from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter
+from langchain_text_splitters import MarkdownHeaderTextSplitter
 import pandas as pd
 
 from mindsdb.interfaces.file.file_controller import FileController
 from mindsdb.integrations.utilities.rag.loaders.file_loader import FileLoader
 from mindsdb.integrations.utilities.rag.splitters.file_splitter import (
     FileSplitter,
-    DEFAULT_CHUNK_SIZE,
-    DEFAULT_CHUNK_OVERLAP
 )
 from mindsdb.integrations.handlers.web_handler.urlcrawl_helpers import get_all_websites
 from mindsdb.interfaces.knowledge_base.preprocessing.models import Document
@@ -45,12 +43,6 @@ class DocumentLoader:
         self.file_loader_class = file_loader_class
         self.mysql_proxy = mysql_proxy
 
-        # Initialize text splitter for query results with default settings
-        self.query_splitter = RecursiveCharacterTextSplitter(
-            chunk_size=DEFAULT_CHUNK_SIZE,
-            chunk_overlap=DEFAULT_CHUNK_OVERLAP
-        )
-
     def load_files(self, file_names: List[str]) -> Iterator[Document]:
         """Load and split documents from files"""
         for file_name in file_names:
@@ -143,8 +135,9 @@ class DocumentLoader:
 
         # Process each row into a Document
         for _, row in df.iterrows():
-            # Extract content and metadata
+            # Extract id, content and metadata
             content = str(row.get('content', ''))
+            id = row.get('id', None)
 
             # Convert remaining columns to metadata
             metadata = {
@@ -156,21 +149,9 @@ class DocumentLoader:
 
             # Split content using recursive splitter
             if content:
-                doc = LangchainDocument(
-                    page_content=content,
+
+                yield Document(
+                    id=id,
+                    content=content,
                     metadata=metadata
                 )
-                # Use FileSplitter with default recursive splitter
-                split_docs = self.file_splitter.split_documents(
-                    [doc],
-                    default_failover=True
-                )
-
-                for split_doc in split_docs:
-                    metadata = doc.metadata.copy()
-                    metadata.update(split_doc.metadata or {})
-
-                    yield Document(
-                        content=split_doc.page_content,
-                        metadata=metadata
-                    )
mindsdb/interfaces/skills/custom/text2sql/mindsdb_sql_toolkit.py

@@ -15,19 +15,25 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
         list_sql_database_tool = ListSQLDatabaseTool(
             name=f'sql_db_list_tables{prefix}',
             db=self.db,
-            description=(
-                "Input is an empty string, output is a comma-separated list of tables in the database. "
-                "Each table name in the list may be in one of two formats: database_name.table_name or "
-                "database_name.schema_name.table_name."
-            )
+            description=dedent("""\n
+                Input is an empty string, output is a comma-separated list of tables in the database. Each table name is escaped using backticks.
+                Each table name in the list may be in one of two formats: database_name.`table_name` or database_name.schema_name.`table_name`.
+                Table names in response to the user must be escaped using backticks.
+                """)
         )
 
         info_sql_database_tool_description = (
-            "Input: A comma-separated list of tables. Output: Schema and sample rows for those tables. "
-            f"Ensure tables exist by calling {list_sql_database_tool.name} first. "
+            "Input: A comma-separated list of tables enclosed between the symbols $START$ and $STOP$. The tables names itself must be escaped using backticks.\n"
+            "Output: Schema and sample rows for those tables. \n"
             "Use this tool to investigate table schemas for needed columns. "
-            "Get sample data with 'SELECT * FROM table LIMIT 3' before answering questions. "
-            "Example Input: table1, table2, table3"
+            f"Ensure tables exist by calling {list_sql_database_tool.name} first. "
+            # "The names of tables, schemas, and databases must be escaped using backticks. "
+            # "Always enclose the names of tables, schemas, and databases in backticks. "
+            "Get sample data with 'SELECT * FROM `database`.`table` LIMIT 3' before answering questions. \n"
+            "Example of correct Input:\n $START$ `database`.`table1`, `database`.`table2`, `database`.`table3` $STOP$\n"
+            "                           $START$ `table1` `table2` `table3` $STOP$\n"
+            "Example of wrong Input:\n $START$ `database.table1`, `database.table2`, `database.table3` $STOP$\n"
+            "                          $START$ table1 table2 table3 $STOP$\n"
         )
         info_sql_database_tool = InfoSQLDatabaseTool(
             name=f'sql_db_schema{prefix}',
@@ -35,7 +41,7 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
         )
 
         query_sql_database_tool_description = dedent(f"""\
-            Input: A detailed SQL query.
+            Input: A detailed and well-structured SQL query. The query must be enclosed between the symbols $START$ and $STOP$.
             Output: Database result or error message. For errors, rewrite and retry the query. For 'Unknown column' errors, use '{info_sql_database_tool.name}' to check table fields.
             This system is a highly intelligent and reliable PostgreSQL SQL skill designed to work with databases.
             Follow these instructions with utmost precision:
@@ -63,6 +69,8 @@ class MindsDBSQLToolkit(SQLDatabaseToolkit):
                SELECT NOW() - INTERVAL 1 YEAR;
             6. Query Best Practices:
                - Always send only one query at a time.
+               - Always enclose the names of tables, schemas, and databases in backticks.
+               - The input SQL query must end with a semicolon.
               - Query only necessary columns, not all.
               - Use only existing column names from correct tables.
               - Use database-specific syntax for date operations.
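Note: these prompt changes pair with the extract_essential helper added in mindsdb_database_agent.py above. The model is told to wrap tool input in $START$/$STOP$, and the wrapper strips any surrounding chatter before the SQL reaches the engine:

    llm_output = 'Here is the query:\n$START$ SELECT `name` FROM `mindsdb`.`models` LIMIT 3; $STOP$'
    assert extract_essential(llm_output) == 'SELECT `name` FROM `mindsdb`.`models` LIMIT 3;'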
mindsdb/interfaces/skills/skill_tool.py

@@ -126,6 +126,10 @@ class SkillToolController:
 
         command_executor = self.get_command_executor()
 
+        def escape_table_name(name: str) -> str:
+            name = name.strip(' `')
+            return f'`{name}`'
+
         tables_list = []
         for skill in skills:
             database = skill.params['database']
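Note: the strip-then-wrap implementation makes the helper idempotent, so names that already arrive backtick-quoted are not double-wrapped:

    def escape_table_name(name: str) -> str:  # as defined in the hunk above
        name = name.strip(' `')
        return f'`{name}`'

    assert escape_table_name('my_table') == '`my_table`'
    assert escape_table_name('`my_table`') == '`my_table`'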
@@ -137,19 +141,22 @@ class SkillToolController:
             else:
                 response = handler.get_tables()
                 # no restrictions
+                columns = [c.lower() for c in response.data_frame.columns]
+                name_idx = columns.index('table_name') if 'table_name' in columns else 0
+
                 if 'table_schema' in response.data_frame.columns:
                     for _, row in response.data_frame.iterrows():
-                        tables_list.append(f"{database}.{row['table_schema']}.{row['table_name']}")
+                        tables_list.append(f"{database}.{row['table_schema']}.{escape_table_name(row[name_idx])}")
                 else:
-                    for _, row in response.data_frame.iterrows():
-                        tables_list.append(f"{database}.{row['table_name']}")
+                    for table_name in response.data_frame.iloc[:, name_idx]:
+                        tables_list.append(f"{database}.{escape_table_name(table_name)}")
                 continue
             for schema_name, tables in restriction_on_tables.items():
                 for table in tables:
                     if schema_name is None:
-                        tables_list.append(f'{database}.{table}')
+                        tables_list.append(f'{database}.{escape_table_name(table)}')
                     else:
-                        tables_list.append(f'{database}.{schema_name}.{table}')
+                        tables_list.append(f'{database}.{schema_name}.{escape_table_name(table)}')
 
         sql_agent = SQLAgent(
             command_executor=command_executor,
@@ -219,7 +226,6 @@ class SkillToolController:
             return build_retrieval_tool(tool, pred_args, skill)
 
     def _get_rag_query_function(self, skill: db.Skills):
-
         session_controller = self.get_command_executor().session
 
         def _answer_question(question: str) -> str:
mindsdb/interfaces/skills/skills_controller.py

@@ -1,5 +1,5 @@
 import datetime
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 from sqlalchemy import null
 from sqlalchemy.orm.attributes import flag_modified
@@ -16,7 +16,7 @@ class SkillsController:
         project_controller = ProjectController()
         self.project_controller = project_controller
 
-    def get_skill(self, skill_name: str, project_name: str = 'mindsdb') -> db.Skills:
+    def get_skill(self, skill_name: str, project_name: str = 'mindsdb') -> Optional[db.Skills]:
         '''
         Gets a skill by name. Skills are expected to have unique names.
 
@@ -25,7 +25,7 @@ class SkillsController:
            project_name (str): The name of the containing project
 
         Returns:
-            skill (db.Skills): The database skill object
+            skill (Optional[db.Skills]): The database skill object
 
         Raises:
            ValueError: If `project_name` does not exist
@@ -136,6 +136,8 @@ class SkillsController:
         existing_skill = self.get_skill(skill_name, project_name)
         if existing_skill is None:
             raise ValueError(f'Skill with name not found: {skill_name}')
+        if isinstance(existing_skill.params, dict) and existing_skill.params.get('is_demo') is True:
+            raise ValueError("It is forbidden to change properties of the demo object")
 
         if new_name is not None:
             existing_skill.name = new_name
@@ -171,5 +173,7 @@ class SkillsController:
         skill = self.get_skill(skill_name, project_name)
         if skill is None:
             raise ValueError(f"Skill with name doesn't exist: {skill_name}")
+        if isinstance(skill.params, dict) and skill.params.get('is_demo') is True:
+            raise ValueError("Unable to delete demo object")
         skill.deleted_at = datetime.datetime.now()
         db.session.commit()