MindsDB 25.2.3.0__py3-none-any.whl → 25.3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (86)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +16 -11
  3. mindsdb/api/executor/command_executor.py +1 -1
  4. mindsdb/api/executor/datahub/datanodes/system_tables.py +10 -2
  5. mindsdb/api/executor/planner/query_planner.py +6 -2
  6. mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -1
  7. mindsdb/api/http/initialize.py +8 -5
  8. mindsdb/api/http/namespaces/agents.py +0 -7
  9. mindsdb/api/http/namespaces/config.py +0 -48
  10. mindsdb/api/http/namespaces/knowledge_bases.py +1 -1
  11. mindsdb/api/http/namespaces/util.py +0 -28
  12. mindsdb/api/mongo/classes/query_sql.py +2 -1
  13. mindsdb/api/mongo/responders/aggregate.py +2 -2
  14. mindsdb/api/mongo/responders/coll_stats.py +3 -2
  15. mindsdb/api/mongo/responders/db_stats.py +2 -1
  16. mindsdb/api/mongo/responders/insert.py +4 -2
  17. mindsdb/api/mysql/mysql_proxy/classes/fake_mysql_proxy/fake_mysql_proxy.py +2 -1
  18. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +5 -4
  19. mindsdb/api/postgres/postgres_proxy/postgres_proxy.py +2 -4
  20. mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -1
  21. mindsdb/integrations/handlers/autosklearn_handler/autosklearn_handler.py +1 -1
  22. mindsdb/integrations/handlers/dspy_handler/requirements.txt +0 -1
  23. mindsdb/integrations/handlers/gmail_handler/connection_args.py +2 -2
  24. mindsdb/integrations/handlers/gmail_handler/gmail_handler.py +19 -66
  25. mindsdb/integrations/handlers/gmail_handler/requirements.txt +0 -1
  26. mindsdb/integrations/handlers/google_calendar_handler/connection_args.py +15 -0
  27. mindsdb/integrations/handlers/google_calendar_handler/google_calendar_handler.py +31 -41
  28. mindsdb/integrations/handlers/google_calendar_handler/requirements.txt +0 -2
  29. mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt +0 -1
  30. mindsdb/integrations/handlers/langchain_handler/requirements.txt +0 -1
  31. mindsdb/integrations/handlers/llama_index_handler/requirements.txt +0 -1
  32. mindsdb/integrations/handlers/openai_handler/constants.py +3 -1
  33. mindsdb/integrations/handlers/openai_handler/requirements.txt +0 -1
  34. mindsdb/integrations/handlers/rag_handler/requirements.txt +0 -1
  35. mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +33 -8
  36. mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +3 -2
  37. mindsdb/integrations/handlers/web_handler/web_handler.py +42 -33
  38. mindsdb/integrations/handlers/youtube_handler/__init__.py +2 -0
  39. mindsdb/integrations/handlers/youtube_handler/connection_args.py +32 -0
  40. mindsdb/integrations/handlers/youtube_handler/youtube_handler.py +2 -38
  41. mindsdb/integrations/libs/llm/utils.py +7 -1
  42. mindsdb/integrations/libs/process_cache.py +2 -2
  43. mindsdb/integrations/utilities/handlers/auth_utilities/google/google_user_oauth_utilities.py +29 -38
  44. mindsdb/integrations/utilities/pydantic_utils.py +208 -0
  45. mindsdb/integrations/utilities/rag/chains/local_context_summarizer_chain.py +227 -0
  46. mindsdb/integrations/utilities/rag/pipelines/rag.py +11 -4
  47. mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +800 -135
  48. mindsdb/integrations/utilities/rag/settings.py +390 -152
  49. mindsdb/integrations/utilities/sql_utils.py +2 -1
  50. mindsdb/interfaces/agents/agents_controller.py +14 -10
  51. mindsdb/interfaces/agents/callback_handlers.py +52 -5
  52. mindsdb/interfaces/agents/langchain_agent.py +5 -3
  53. mindsdb/interfaces/agents/mindsdb_chat_model.py +4 -2
  54. mindsdb/interfaces/chatbot/chatbot_controller.py +9 -8
  55. mindsdb/interfaces/database/database.py +3 -2
  56. mindsdb/interfaces/database/integrations.py +1 -1
  57. mindsdb/interfaces/database/projects.py +28 -2
  58. mindsdb/interfaces/jobs/jobs_controller.py +4 -1
  59. mindsdb/interfaces/jobs/scheduler.py +1 -1
  60. mindsdb/interfaces/knowledge_base/preprocessing/constants.py +2 -2
  61. mindsdb/interfaces/model/model_controller.py +5 -2
  62. mindsdb/interfaces/skills/retrieval_tool.py +128 -39
  63. mindsdb/interfaces/skills/skill_tool.py +7 -7
  64. mindsdb/interfaces/skills/skills_controller.py +10 -6
  65. mindsdb/interfaces/skills/sql_agent.py +6 -1
  66. mindsdb/interfaces/storage/db.py +14 -12
  67. mindsdb/interfaces/storage/json.py +59 -0
  68. mindsdb/interfaces/storage/model_fs.py +85 -3
  69. mindsdb/interfaces/triggers/triggers_controller.py +2 -1
  70. mindsdb/migrations/versions/2022-10-14_43c52d23845a_projects.py +17 -3
  71. mindsdb/migrations/versions/2025-02-10_6ab9903fc59a_del_log_table.py +33 -0
  72. mindsdb/migrations/versions/2025-02-14_4521dafe89ab_added_encrypted_content_to_json_storage.py +29 -0
  73. mindsdb/migrations/versions/2025-02-19_11347c213b36_added_metadata_to_projects.py +41 -0
  74. mindsdb/utilities/config.py +6 -1
  75. mindsdb/utilities/functions.py +11 -0
  76. mindsdb/utilities/log.py +17 -2
  77. mindsdb/utilities/ml_task_queue/consumer.py +4 -2
  78. mindsdb/utilities/render/sqlalchemy_render.py +4 -0
  79. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/METADATA +226 -247
  80. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/RECORD +83 -80
  81. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/WHEEL +1 -1
  82. mindsdb/integrations/handlers/gmail_handler/utils.py +0 -45
  83. mindsdb/utilities/log_controller.py +0 -39
  84. mindsdb/utilities/telemetry.py +0 -44
  85. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/LICENSE +0 -0
  86. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,5 @@
1
1
  from enum import Enum
2
- from typing import List, Union, Any, Optional, Dict
2
+ from typing import List, Union, Any, Optional, Dict, OrderedDict
3
3
 
4
4
  from langchain_community.vectorstores.chroma import Chroma
5
5
  from langchain_community.vectorstores.pgvector import PGVector
@@ -11,7 +11,7 @@ from langchain_core.stores import BaseStore
11
11
  from pydantic import BaseModel, Field, field_validator
12
12
  from langchain_text_splitters import TextSplitter
13
13
 
14
- DEFAULT_COLLECTION_NAME = 'default_collection'
14
+ DEFAULT_COLLECTION_NAME = "default_collection"
15
15
 
16
16
  # Multi retriever specific
17
17
  DEFAULT_ID_KEY = "doc_id"
@@ -38,15 +38,15 @@ Return a JSON list with an entry for each column. Each entry should have
38
38
  {{"name": "column name", "description": "column description", "type": "column data type"}}
39
39
  \n\n{dataframe}\n\nJSON:\n
40
40
  """
41
- DEFAULT_RAG_PROMPT_TEMPLATE = '''You are an assistant for
41
+ DEFAULT_RAG_PROMPT_TEMPLATE = """You are an assistant for
42
42
  question-answering tasks. Use the following pieces of retrieved context
43
43
  to answer the question. If you don't know the answer, just say that you
44
44
  don't know. Use two sentences maximum and keep the answer concise.
45
45
  Question: {question}
46
46
  Context: {context}
47
- Answer:'''
47
+ Answer:"""
48
48
 
49
- DEFAULT_QA_GENERATION_PROMPT_TEMPLATE = '''You are an assistant for
49
+ DEFAULT_QA_GENERATION_PROMPT_TEMPLATE = """You are an assistant for
50
50
  generating sample questions and answers from the given document and metadata. Given
51
51
  a document and its metadata as context, generate a question and answer from that document and its metadata.
52
52
 
@@ -64,25 +64,25 @@ in the specified JSON format no matter what.
64
64
 
65
65
  Document: {document}
66
66
  Metadata: {metadata}
67
- Answer:'''
67
+ Answer:"""
68
68
 
69
- DEFAULT_MAP_PROMPT_TEMPLATE = '''The following is a set of documents
69
+ DEFAULT_MAP_PROMPT_TEMPLATE = """The following is a set of documents
70
70
  {docs}
71
71
  Based on this list of docs, please summarize based on the user input.
72
72
 
73
73
  User input: {input}
74
74
 
75
- Helpful Answer:'''
75
+ Helpful Answer:"""
76
76
 
77
- DEFAULT_REDUCE_PROMPT_TEMPLATE = '''The following is set of summaries:
77
+ DEFAULT_REDUCE_PROMPT_TEMPLATE = """The following is set of summaries:
78
78
  {docs}
79
79
  Take these and distill it into a final, consolidated summary related to the user input.
80
80
 
81
81
  User input: {input}
82
82
 
83
- Helpful Answer:'''
83
+ Helpful Answer:"""
84
84
 
85
- DEFAULT_SEMANTIC_PROMPT_TEMPLATE = '''Provide a better search query for web search engine to answer the given question.
85
+ DEFAULT_SEMANTIC_PROMPT_TEMPLATE = """Provide a better search query for web search engine to answer the given question.
86
86
 
87
87
  << EXAMPLES >>
88
88
  1. Input: "Show me documents containing how to finetune a LLM please"
@@ -91,9 +91,9 @@ Output: "how to finetune a LLM"
91
91
  Output only a single better search query and nothing else like in the example.
92
92
 
93
93
  Here is the user input: {input}
94
- '''
94
+ """
95
95
 
96
- DEFAULT_METADATA_FILTERS_PROMPT_TEMPLATE = '''Construct a list of PostgreSQL metadata filters to filter documents in the database based on the user input.
96
+ DEFAULT_METADATA_FILTERS_PROMPT_TEMPLATE = """Construct a list of PostgreSQL metadata filters to filter documents in the database based on the user input.
97
97
 
98
98
  << INSTRUCTIONS >>
99
99
  {format_instructions}
@@ -110,9 +110,99 @@ RETURN ONLY THE FINAL JSON. DO NOT EXPLAIN, JUST RETURN THE FINAL JSON.
110
110
 
111
111
  Here is the user input:
112
112
  {input}
113
- '''
113
+ """
114
+
115
+ DEFAULT_BOOLEAN_PROMPT_TEMPLATE = """**Task:** Determine Schema Relevance for Database Search Queries
116
+
117
+ As an expert in constructing database search queries, you are provided with database schemas detailing tables, columns, and values. Your task is to assess whether these elements can be used to effectively search the database in relation to a given user query.
118
+
119
+ **Instructions:**
120
+
121
+ - **Evaluate the Schema**:
122
+ - Analyze the tables, columns, and values described.
123
+ - Consider their potential usefulness in retrieving information pertinent to the user query.
124
+
125
+ - **Decision Criteria**:
126
+ - Determine if any part of the schema could assist in forming a relevant search query for the information requested.
127
+
128
+ - **Response**:
129
+ - Reply with a single word: 'yes' if the schema components are useful, otherwise 'no'.
130
+
131
+ **Note:** Provide your answer based solely on the relevance of the described schema to the user query."""
132
+
133
+ DEFAULT_GENERATIVE_SYSTEM_PROMPT = """You are an expert database analyst that can assist in building SQL queries by providing structured output. Follow these format instructions precisely to generate a metadata filter given the provided schema description.
134
+
135
+ ## Format instructions:
136
+ {format_instructions}
137
+ """
138
+
139
+ DEFAULT_VALUE_PROMPT_TEMPLATE = """
140
+ {column_schema}
141
+
142
+ # **Value Schema**
143
+ {header}
144
+
145
+ - The type of the value: {type}
146
+
147
+ ## **Description**
148
+ {description}
149
+
150
+ {value}{comparator}
151
+
152
+ ## **Usage**
153
+ {usage}
154
+
155
+ {examples}
156
+
157
+ ## **Query**
158
+ {query}
159
+
160
+ """
161
+
162
+ DEFAULT_COLUMN_PROMPT_TEMPLATE = """
163
+ {table_schema}
164
+
165
+ # **Column Schema**
166
+ {header}
167
+
168
+ - The column name in the database table: {column}
169
+ - The type of the values in this column: {type}
170
+
171
+ ## **Description**
172
+ {description}
173
+
174
+ ## **Usage**
175
+ {usage}
176
+
177
+ {examples}
114
178
 
115
- DEFAULT_SQL_PROMPT_TEMPLATE = '''
179
+ ## **Query**
180
+ {query}
181
+ """
182
+
183
+ DEFAULT_TABLE_PROMPT_TEMPLATE = """# **Table Schema**
184
+ {header}
185
+
186
+ - The name of this table in the database: {table}
187
+
188
+ ## **Description**
189
+ {description}
190
+
191
+ ## **Usage**
192
+ {usage}
193
+
194
+ ## **Column Descriptions**
195
+ Below are descriptions of each column in this table:
196
+
197
+ {columns}
198
+
199
+ {examples}
200
+
201
+ ## **Query**
202
+ {query}
203
+ """
204
+
205
+ DEFAULT_SQL_PROMPT_TEMPLATE = """
116
206
  Construct a valid {dialect} SQL query to select documents relevant to the user input.
117
207
  Source documents are found in the {source_table} table. You may need to join with other tables to get additional document metadata.
118
208
 
@@ -165,7 +255,7 @@ Output the {dialect} SQL query that is ready to be executed only WITHOUT ANY DEL
165
255
 
166
256
  Here is the user input:
167
257
  {input}
168
- '''
258
+ """
169
259
 
170
260
  DEFAULT_QUESTION_REFORMULATION_TEMPLATE = """Given the original question and the retrieved context,
171
261
  analyze what additional information is needed for a complete, accurate answer.
@@ -234,7 +324,7 @@ If no additional information is needed, output an empty array [].
234
324
 
235
325
  Follow-up Questions:"""
236
326
 
237
- DEFAULT_QUERY_RETRY_PROMPT_TEMPLATE = '''
327
+ DEFAULT_QUERY_RETRY_PROMPT_TEMPLATE = """
238
328
  {query}
239
329
 
240
330
  The {dialect} query above failed with the error message: {error}.
@@ -270,14 +360,19 @@ Rewrite the query so it works.
270
360
  Output the final SQL query only.
271
361
 
272
362
  SQL Query:
273
- '''
363
+ """
274
364
 
275
365
  DEFAULT_NUM_QUERY_RETRIES = 2
276
366
 
277
367
 
278
368
  class LLMConfig(BaseModel):
279
- model_name: str = Field(default=DEFAULT_LLM_MODEL, description='LLM model to use for generation')
280
- provider: str = Field(default=DEFAULT_LLM_MODEL_PROVIDER, description='LLM model provider to use for generation')
369
+ model_name: str = Field(
370
+ default=DEFAULT_LLM_MODEL, description="LLM model to use for generation"
371
+ )
372
+ provider: str = Field(
373
+ default=DEFAULT_LLM_MODEL_PROVIDER,
374
+ description="LLM model provider to use for generation",
375
+ )
281
376
  params: Dict[str, Any] = Field(default_factory=dict)
282
377
 
283
378
 
@@ -285,20 +380,18 @@ class MultiVectorRetrieverMode(Enum):
285
380
  """
286
381
  Enum for MultiVectorRetriever types.
287
382
  """
383
+
288
384
  SPLIT = "split"
289
385
  SUMMARIZE = "summarize"
290
386
  BOTH = "both"
291
387
 
292
388
 
293
389
  class VectorStoreType(Enum):
294
- CHROMA = 'chromadb'
295
- PGVECTOR = 'pgvector'
390
+ CHROMA = "chromadb"
391
+ PGVECTOR = "pgvector"
296
392
 
297
393
 
298
- vector_store_map = {
299
- VectorStoreType.CHROMA: Chroma,
300
- VectorStoreType.PGVECTOR: PGVector
301
- }
394
+ vector_store_map = {VectorStoreType.CHROMA: Chroma, VectorStoreType.PGVECTOR: PGVector}
302
395
 
303
396
 
304
397
  class VectorStoreConfig(BaseModel):
@@ -317,6 +410,7 @@ class VectorStoreConfig(BaseModel):
317
410
 
318
411
  class RetrieverType(str, Enum):
319
412
  """Retriever type for RAG pipeline"""
413
+
320
414
  VECTOR_STORE = "vector_store"
321
415
  AUTO = "auto"
322
416
  MULTI = "multi"
@@ -328,137 +422,298 @@ class SearchType(Enum):
328
422
  """
329
423
  Enum for vector store search types.
330
424
  """
425
+
331
426
  SIMILARITY = "similarity"
332
427
  MMR = "mmr"
333
428
  SIMILARITY_SCORE_THRESHOLD = "similarity_score_threshold"
334
429
 
335
430
 
336
431
  class SearchKwargs(BaseModel):
337
- k: int = Field(
338
- default=DEFAULT_K,
339
- description="Amount of documents to return",
340
- ge=1
341
- )
432
+ k: int = Field(default=DEFAULT_K, description="Amount of documents to return", ge=1)
342
433
  filter: Optional[Dict[str, Any]] = Field(
343
- default=None,
344
- description="Filter by document metadata"
434
+ default=None, description="Filter by document metadata"
345
435
  )
346
436
  # For similarity_score_threshold search type
347
437
  score_threshold: Optional[float] = Field(
348
438
  default=None,
349
439
  description="Minimum relevance threshold for similarity_score_threshold search",
350
440
  ge=0.0,
351
- le=1.0
441
+ le=1.0,
352
442
  )
353
443
  # For MMR search type
354
444
  fetch_k: Optional[int] = Field(
355
- default=None,
356
- description="Amount of documents to pass to MMR algorithm",
357
- ge=1
445
+ default=None, description="Amount of documents to pass to MMR algorithm", ge=1
358
446
  )
359
447
  lambda_mult: Optional[float] = Field(
360
448
  default=None,
361
449
  description="Diversity of results returned by MMR (1=min diversity, 0=max)",
362
450
  ge=0.0,
363
- le=1.0
451
+ le=1.0,
364
452
  )
365
453
 
366
454
  def model_dump(self, *args, **kwargs):
367
455
  # Override model_dump to exclude None values by default
368
- kwargs['exclude_none'] = True
456
+ kwargs["exclude_none"] = True
369
457
  return super().model_dump(*args, **kwargs)
370
458
 
371
459
 
372
- class ColumnSchema(BaseModel):
373
- name: str = Field(
374
- description="Name of the column in the database"
460
+ class LLMExample(BaseModel):
461
+ input: str = Field(description="User input for the example")
462
+ output: str = Field(
463
+ description="What the LLM should generate for this example's input"
464
+ )
465
+
466
+
467
+ class ValueSchema(BaseModel):
468
+ value: Union[
469
+ Union[str, int, float],
470
+ Dict[Union[str, int, float], str],
471
+ List[Union[str, int, float]],
472
+ ] = Field(
473
+ description="One of the following. The value as it exists in the table column. A dict of {table_value: descriptive value, ...}, where table_value is the value in the table. A list of sample values taken from the column."
474
+ )
475
+ comparator: Optional[Union[str, List[str]]] = Field(
476
+ description="The posgtres sql operators used to compare two values. For example: `>`, `<`, `=`, or `%`.",
477
+ default="=",
375
478
  )
376
479
  type: str = Field(
377
- description="Type of the column (e.g. int, string, datetime)"
480
+ description="A valid postgres type for this value. One of: int, string, float, or bool. When numbers appear they should be of type int or float."
481
+ )
482
+ description: str = Field(description="Description of what the value represents.")
483
+ usage: str = Field(description="How and when to use this value for search.")
484
+ example_questions: Optional[List[LLMExample]] = Field(
485
+ default=None, description="Example questions where this value is set."
486
+ )
487
+ filter_threshold: Optional[float] = Field(
488
+ default=0.0,
489
+ description="Minimum relevance threshold to include metadata filters from this column.",
490
+ exclude=True,
378
491
  )
379
- description: str = Field(
380
- description="Description of what the column represents"
492
+ priority: Optional[int] = Field(
493
+ default=0,
494
+ description="Priority level for this column, lower numbers will be processed first.",
381
495
  )
382
- values: Optional[Dict[Any, Any]] = Field(
496
+ relevance: Optional[float] = Field(
383
497
  default=None,
384
- description="Mapping of values the column can be with the description of what the value means"
498
+ description="Relevance computed during search. Should not be set by the end user.",
499
+ exclude=True,
385
500
  )
386
501
 
387
502
 
388
- class MetadataSchema(BaseModel):
503
+ class MetadataConfig(BaseModel):
504
+ """Class to configure metadata for retrieval. Only supports very basic document name lookup at the moment."""
389
505
  table: str = Field(
390
- description="Name of table in the database"
506
+ description="Source table for metadata."
507
+ )
508
+ max_document_context: int = Field(
509
+ # To work well with models with context window of 32768.
510
+ default=16384,
511
+ description="Truncate a document before using as context with an LLM if it exceeds this amount of tokens"
512
+ )
513
+ embeddings_table: str = Field(
514
+ default="embeddings",
515
+ description="Source table for embeddings"
516
+ )
517
+ id_column: str = Field(
518
+ default="Id",
519
+ description="Name of ID column in metadata table"
520
+ )
521
+ name_column: str = Field(
522
+ default="Title",
523
+ description="Name of column containing name or title of document"
391
524
  )
392
- description: str = Field(
393
- description="Description of what the table represents"
525
+ name_column_index: Optional[str] = Field(
526
+ default=None,
527
+ description="Name of GIN index to use when looking up name."
528
+ )
529
+ content_column: str = Field(
530
+ default="content",
531
+ description="Name of column in embeddings table containing chunk content"
532
+ )
533
+ embeddings_metadata_column: str = Field(
534
+ default="metadata",
535
+ description="Name of column in embeddings table containing chunk metadata"
536
+ )
537
+ doc_id_key: str = Field(
538
+ default="original_row_id",
539
+ description="Metadata field that links an embedded chunk back to source document ID"
394
540
  )
395
- columns: List[ColumnSchema] = Field(
396
- description="List of column schemas describing the metadata columns available for the table"
541
+
542
+
543
+ class ColumnSchema(BaseModel):
544
+ column: str = Field(description="Name of the column in the database")
545
+ type: str = Field(description="Type of the column (e.g. int, string, datetime)")
546
+ description: str = Field(description="Description of what the column represents")
547
+ usage: str = Field(description="How and when to use this Table for search.")
548
+ values: Optional[
549
+ Union[
550
+ OrderedDict[Union[str, int, float], ValueSchema],
551
+ Dict[Union[str, int, float], ValueSchema],
552
+ ]
553
+ ] = Field(
554
+ description="One of the following. A dict or ordered dict of {schema_value: ValueSchema, ...}, where schema value is the name given for this value description in the schema."
555
+ )
556
+ example_questions: Optional[List[LLMExample]] = Field(
557
+ default=None, description="Example questions where this table is useful."
558
+ )
559
+ max_filters: Optional[int] = Field(
560
+ default=1, description="Maximum number of filters to generate for this column."
561
+ )
562
+ filter_threshold: Optional[float] = Field(
563
+ default=0.0,
564
+ description="Minimum relevance threshold to include metadata filters from this column.",
565
+ )
566
+ priority: Optional[int] = Field(
567
+ default=1,
568
+ description="Priority level for this column, lower numbers will be processed first.",
569
+ )
570
+ relevance: Optional[float] = Field(
571
+ default=None,
572
+ description="Relevance computed during search. Should not be set by the end user.",
573
+ )
574
+
575
+
576
+ class TableSchema(BaseModel):
577
+ table: str = Field(description="Name of table in the database")
578
+ description: str = Field(description="Description of what the table represents")
579
+ usage: str = Field(description="How and when to use this Table for search.")
580
+ columns: Optional[
581
+ Union[OrderedDict[str, ColumnSchema], Dict[str, ColumnSchema]]
582
+ ] = Field(
583
+ description="Dict or Ordered Dict of {column_name: ColumnSchemas} describing the metadata columns available for the table"
584
+ )
585
+ example_questions: Optional[List[LLMExample]] = Field(
586
+ default=None, description="Example questions where this table is useful."
397
587
  )
398
588
  join: str = Field(
399
589
  description="SQL join string to join this table with source documents table",
400
- default=''
590
+ default="",
591
+ )
592
+ max_filters: Optional[int] = Field(
593
+ default=1, description="Maximum number of filters to generate for this table."
594
+ )
595
+ filter_threshold: Optional[float] = Field(
596
+ default=0.0,
597
+ description="Minimum relevance required to use this table to generate filters.",
598
+ )
599
+ priority: Optional[int] = Field(
600
+ default=1,
601
+ description="Priority level for this table, lower numbers will be processed first.",
602
+ )
603
+ relevance: Optional[float] = Field(
604
+ default=None,
605
+ description="Relevance computed during search. Should not be set by the end user.",
401
606
  )
402
-
403
- class Config:
404
- frozen = True
405
607
 
406
608
 
407
- class LLMExample(BaseModel):
408
- input: str = Field(
409
- description="User input for the example"
609
+ class DatabaseSchema(BaseModel):
610
+ database: str = Field(description="Name of database in the Database")
611
+ description: str = Field(description="Description of what the Database represents")
612
+ usage: str = Field(description="How and when to use this Database for search.")
613
+ tables: Union[OrderedDict[str, TableSchema], Dict[str, TableSchema]] = Field(
614
+ description="Dict of {column_name: ColumnSchemas} describing the metadata columns available for the table"
410
615
  )
411
- output: str = Field(
412
- description="What the LLM should generate for this example's input"
616
+ example_questions: Optional[List[LLMExample]] = Field(
617
+ default=None, description="Example questions where this Database is useful."
618
+ )
619
+ max_filters: Optional[int] = Field(
620
+ default=1,
621
+ description="Maximum number of filters to generate for this Database.",
622
+ )
623
+ filter_threshold: Optional[float] = Field(
624
+ default=0.0,
625
+ description="Minimum relevance required to use this Database to generate filters.",
626
+ )
627
+ priority: Optional[int] = Field(
628
+ default=0,
629
+ description="Priority level for this Database, lower numbers will be processed first.",
630
+ )
631
+ relevance: Optional[float] = Field(
632
+ default=None,
633
+ description="Relevance computed during search. Should not be set by the end user.",
413
634
  )
414
635
 
415
636
 
416
637
  class SQLRetrieverConfig(BaseModel):
417
638
  llm_config: LLMConfig = Field(
418
639
  default_factory=LLMConfig,
419
- description="LLM configuration to use for generating the final SQL query for retrieval"
640
+ description="LLM configuration to use for generating the final SQL query for retrieval",
420
641
  )
421
642
  metadata_filters_prompt_template: str = Field(
422
643
  default=DEFAULT_METADATA_FILTERS_PROMPT_TEMPLATE,
423
- description="Prompt template to generate PostgreSQL metadata filters. Has 'format_instructions', 'schema', 'examples', and 'input' input variables"
644
+ description="Prompt template to generate PostgreSQL metadata filters. Has 'format_instructions', 'schema', 'examples', and 'input' input variables",
424
645
  )
425
646
  num_retries: int = Field(
426
647
  default=DEFAULT_NUM_QUERY_RETRIES,
427
- description="How many times for an LLM to try rewriting a failed SQL query before using the fallback retriever."
648
+ description="How many times for an LLM to try rewriting a failed SQL query before using the fallback retriever.",
428
649
  )
429
650
  rewrite_prompt_template: str = Field(
430
651
  default=DEFAULT_SEMANTIC_PROMPT_TEMPLATE,
431
- description="Prompt template to rewrite user input to be better suited for retrieval. Has 'input' input variable."
652
+ description="Prompt template to rewrite user input to be better suited for retrieval. Has 'input' input variable.",
653
+ )
654
+ table_prompt_template: str = Field(
655
+ default=DEFAULT_TABLE_PROMPT_TEMPLATE,
656
+ description="Prompt template to rewrite user input to be better suited for retrieval. Has 'input' input variable.",
657
+ )
658
+ column_prompt_template: str = Field(
659
+ default=DEFAULT_COLUMN_PROMPT_TEMPLATE,
660
+ description="Prompt template to rewrite user input to be better suited for retrieval. Has 'input' input variable.",
661
+ )
662
+ value_prompt_template: str = Field(
663
+ default=DEFAULT_VALUE_PROMPT_TEMPLATE,
664
+ description="Prompt template to rewrite user input to be better suited for retrieval. Has 'input' input variable.",
665
+ )
666
+ boolean_system_prompt: str = Field(
667
+ default=DEFAULT_BOOLEAN_PROMPT_TEMPLATE,
668
+ description="Prompt template to rewrite user input to be better suited for retrieval. Has 'input' input variable.",
669
+ )
670
+ generative_system_prompt: str = Field(
671
+ default=DEFAULT_GENERATIVE_SYSTEM_PROMPT,
672
+ description="Prompt template to rewrite user input to be better suited for retrieval. Has 'input' input variable.",
432
673
  )
433
674
  source_table: str = Field(
434
675
  description="Name of the source table containing the original documents that were embedded"
435
676
  )
436
- metadata_schemas: Optional[List[MetadataSchema]] = Field(
677
+ source_id_column: str = Field(
678
+ description="Name of the column containing the UUID.", default="Id"
679
+ )
680
+ max_filters: Optional[int] = Field(
681
+ description="Maximum number of filters to generate for sql queries.", default=10
682
+ )
683
+ filter_threshold: Optional[float] = Field(
684
+ description="Minimum relevance required to use this Database to generate filters.",
685
+ default=0.0,
686
+ )
687
+ min_k: Optional[int] = Field(
688
+ description="Minimum number of documents accepted from a generated sql query.",
689
+ default=10,
690
+ )
691
+ database_schema: Optional[DatabaseSchema] = Field(
437
692
  default=None,
438
- description="Optional list of table schemas containing document metadata to potentially join with."
693
+ description="DatabaseSchema describing the database.",
439
694
  )
440
695
  examples: Optional[List[LLMExample]] = Field(
441
696
  default=None,
442
- description="Optional examples of final generated pgvector queries based on user input."
697
+ description="Optional examples of final generated pgvector queries based on user input.",
443
698
  )
444
699
 
445
700
 
446
701
  class SummarizationConfig(BaseModel):
447
702
  llm_config: LLMConfig = Field(
448
703
  default_factory=LLMConfig,
449
- description="LLM configuration to use for summarization"
704
+ description="LLM configuration to use for summarization",
450
705
  )
451
706
  map_prompt_template: str = Field(
452
707
  default=DEFAULT_MAP_PROMPT_TEMPLATE,
453
- description="Prompt for an LLM to summarize a single document"
708
+ description="Prompt for an LLM to summarize a single document",
454
709
  )
455
710
  reduce_prompt_template: str = Field(
456
711
  default=DEFAULT_REDUCE_PROMPT_TEMPLATE,
457
- description="Prompt for an LLM to summarize a set of summaries of documents into one"
712
+ description="Prompt for an LLM to summarize a set of summaries of documents into one",
458
713
  )
459
714
  max_summarization_tokens: int = Field(
460
715
  default=DEFAULT_MAX_SUMMARIZATION_TOKENS,
461
- description="Max number of tokens for summarized documents"
716
+ description="Max number of tokens for summarized documents",
462
717
  )
463
718
 
464
719
 
@@ -476,154 +731,122 @@ class RerankerConfig(BaseModel):
476
731
 
477
732
  class MultiHopRetrieverConfig(BaseModel):
478
733
  """Configuration for multi-hop retrieval"""
734
+
479
735
  base_retriever_type: RetrieverType = Field(
480
736
  default=RetrieverType.VECTOR_STORE,
481
- description="Type of base retriever to use for multi-hop retrieval"
737
+ description="Type of base retriever to use for multi-hop retrieval",
482
738
  )
483
739
  max_hops: int = Field(
484
- default=3,
485
- description="Maximum number of follow-up questions to generate",
486
- ge=1
740
+ default=3, description="Maximum number of follow-up questions to generate", ge=1
487
741
  )
488
742
  reformulation_template: str = Field(
489
743
  default=DEFAULT_QUESTION_REFORMULATION_TEMPLATE,
490
- description="Template for reformulating questions"
744
+ description="Template for reformulating questions",
491
745
  )
492
746
  llm_config: LLMConfig = Field(
493
747
  default_factory=LLMConfig,
494
- description="LLM configuration to use for generating follow-up questions"
748
+ description="LLM configuration to use for generating follow-up questions",
495
749
  )
496
750
 
497
751
 
498
752
  class RAGPipelineModel(BaseModel):
499
753
  documents: Optional[List[Document]] = Field(
500
- default=None,
501
- description="List of documents"
754
+ default=None, description="List of documents"
502
755
  )
503
756
 
504
757
  vector_store_config: VectorStoreConfig = Field(
505
- default_factory=VectorStoreConfig,
506
- description="Vector store configuration"
758
+ default_factory=VectorStoreConfig, description="Vector store configuration"
507
759
  )
508
760
 
509
- llm: Optional[BaseChatModel] = Field(
510
- default=None,
511
- description="Language model"
512
- )
761
+ llm: Optional[BaseChatModel] = Field(default=None, description="Language model")
513
762
  llm_model_name: str = Field(
514
- default=DEFAULT_LLM_MODEL,
515
- description="Language model name"
763
+ default=DEFAULT_LLM_MODEL, description="Language model name"
516
764
  )
517
765
  llm_provider: Optional[str] = Field(
518
- default=None,
519
- description="Language model provider"
766
+ default=None, description="Language model provider"
520
767
  )
521
-
522
768
  vector_store: VectorStore = Field(
523
769
  default_factory=lambda: vector_store_map[VectorStoreConfig().vector_store_type],
524
- description="Vector store"
770
+ description="Vector store",
525
771
  )
526
772
  db_connection_string: Optional[str] = Field(
527
- default=None,
528
- description="Database connection string"
773
+ default=None, description="Database connection string"
529
774
  )
530
- table_name: str = Field(
531
- default=DEFAULT_TEST_TABLE_NAME,
532
- description="Table name"
775
+ metadata_config: Optional[MetadataConfig] = Field(
776
+ default=None,
777
+ description="Configuration for metadata to be used for retrieval"
533
778
  )
779
+ table_name: str = Field(default=DEFAULT_TEST_TABLE_NAME, description="Table name")
534
780
  embedding_model: Optional[Embeddings] = Field(
535
- default=None,
536
- description="Embedding model"
781
+ default=None, description="Embedding model"
537
782
  )
538
783
  rag_prompt_template: str = Field(
539
- default=DEFAULT_RAG_PROMPT_TEMPLATE,
540
- description="RAG prompt template"
784
+ default=DEFAULT_RAG_PROMPT_TEMPLATE, description="RAG prompt template"
541
785
  )
542
786
  retriever_prompt_template: Optional[Union[str, dict]] = Field(
543
- default=None,
544
- description="Retriever prompt template"
787
+ default=None, description="Retriever prompt template"
545
788
  )
546
789
  retriever_type: RetrieverType = Field(
547
- default=RetrieverType.VECTOR_STORE,
548
- description="Retriever type"
790
+ default=RetrieverType.VECTOR_STORE, description="Retriever type"
549
791
  )
550
792
  search_type: SearchType = Field(
551
- default=SearchType.SIMILARITY,
552
- description="Type of search to perform"
793
+ default=SearchType.SIMILARITY, description="Type of search to perform"
553
794
  )
554
795
  search_kwargs: SearchKwargs = Field(
555
796
  default_factory=SearchKwargs,
556
- description="Search configuration for the retriever"
797
+ description="Search configuration for the retriever",
557
798
  )
558
799
  summarization_config: Optional[SummarizationConfig] = Field(
559
800
  default=None,
560
- description="Configuration for summarizing retrieved documents as context"
801
+ description="Configuration for summarizing retrieved documents as context",
561
802
  )
562
803
  # SQL retriever specific.
563
804
  sql_retriever_config: Optional[SQLRetrieverConfig] = Field(
564
805
  default=None,
565
- description="Configuration for retrieving documents by generating SQL to filter by metadata & order by distance function"
806
+ description="Configuration for retrieving documents by generating SQL to filter by metadata & order by distance function",
566
807
  )
567
808
 
568
809
  # Multi retriever specific
569
810
  multi_retriever_mode: MultiVectorRetrieverMode = Field(
570
- default=MultiVectorRetrieverMode.BOTH,
571
- description="Multi retriever mode"
811
+ default=MultiVectorRetrieverMode.BOTH, description="Multi retriever mode"
572
812
  )
573
813
  max_concurrency: int = Field(
574
- default=DEFAULT_MAX_CONCURRENCY,
575
- description="Maximum concurrency"
576
- )
577
- id_key: int = Field(
578
- default=DEFAULT_ID_KEY,
579
- description="ID key"
580
- )
581
- parent_store: Optional[BaseStore] = Field(
582
- default=None,
583
- description="Parent store"
814
+ default=DEFAULT_MAX_CONCURRENCY, description="Maximum concurrency"
584
815
  )
816
+ id_key: int = Field(default=DEFAULT_ID_KEY, description="ID key")
817
+ parent_store: Optional[BaseStore] = Field(default=None, description="Parent store")
585
818
  text_splitter: Optional[TextSplitter] = Field(
586
- default=None,
587
- description="Text splitter"
588
- )
589
- chunk_size: int = Field(
590
- default=DEFAULT_CHUNK_SIZE,
591
- description="Chunk size"
819
+ default=None, description="Text splitter"
592
820
  )
821
+ chunk_size: int = Field(default=DEFAULT_CHUNK_SIZE, description="Chunk size")
593
822
  chunk_overlap: int = Field(
594
- default=DEFAULT_CHUNK_OVERLAP,
595
- description="Chunk overlap"
823
+ default=DEFAULT_CHUNK_OVERLAP, description="Chunk overlap"
596
824
  )
597
825
 
598
826
  # Auto retriever specific
599
827
  auto_retriever_filter_columns: Optional[List[str]] = Field(
600
- default=None,
601
- description="Filter columns"
828
+ default=None, description="Filter columns"
602
829
  )
603
830
  cardinality_threshold: int = Field(
604
- default=DEFAULT_CARDINALITY_THRESHOLD,
605
- description="Cardinality threshold"
831
+ default=DEFAULT_CARDINALITY_THRESHOLD, description="Cardinality threshold"
606
832
  )
607
833
  content_column_name: str = Field(
608
834
  default=DEFAULT_CONTENT_COLUMN_NAME,
609
- description="Content column name (the column we will get embeddings)"
835
+ description="Content column name (the column we will get embeddings)",
610
836
  )
611
837
  dataset_description: str = Field(
612
- default=DEFAULT_DATASET_DESCRIPTION,
613
- description="Description of the dataset"
838
+ default=DEFAULT_DATASET_DESCRIPTION, description="Description of the dataset"
614
839
  )
615
840
  reranker: bool = Field(
616
- default=DEFAULT_RERANKER_FLAG,
617
- description="Whether to use reranker"
841
+ default=DEFAULT_RERANKER_FLAG, description="Whether to use reranker"
618
842
  )
619
843
  reranker_config: RerankerConfig = Field(
620
- default_factory=RerankerConfig,
621
- description="Reranker configuration"
844
+ default_factory=RerankerConfig, description="Reranker configuration"
622
845
  )
623
846
 
624
847
  multi_hop_config: Optional[MultiHopRetrieverConfig] = Field(
625
848
  default=None,
626
- description="Configuration for multi-hop retrieval. Required when retriever_type is MULTI_HOP."
849
+ description="Configuration for multi-hop retrieval. Required when retriever_type is MULTI_HOP.",
627
850
  )
628
851
 
629
852
  @field_validator("multi_hop_config")
@@ -632,7 +855,9 @@ class RAGPipelineModel(BaseModel):
632
855
  """Validate that multi_hop_config is set when using multi-hop retrieval."""
633
856
  values = info.data
634
857
  if values.get("retriever_type") == RetrieverType.MULTI_HOP and v is None:
635
- raise ValueError("multi_hop_config must be set when using multi-hop retrieval")
858
+ raise ValueError(
859
+ "multi_hop_config must be set when using multi-hop retrieval"
860
+ )
636
861
  return v
637
862
 
638
863
  class Config:
@@ -651,10 +876,10 @@ class RAGPipelineModel(BaseModel):
651
876
  def get_field_names(cls):
652
877
  return list(cls.model_fields.keys())
653
878
 
654
- @field_validator('search_kwargs')
879
+ @field_validator("search_kwargs")
655
880
  @classmethod
656
881
  def validate_search_kwargs(cls, v: SearchKwargs, info) -> SearchKwargs:
657
- search_type = info.data.get('search_type', SearchType.SIMILARITY)
882
+ search_type = info.data.get("search_type", SearchType.SIMILARITY)
658
883
 
659
884
  # Validate MMR-specific parameters
660
885
  if search_type == SearchType.MMR:
@@ -663,9 +888,13 @@ class RAGPipelineModel(BaseModel):
663
888
  if v.lambda_mult is not None and (v.lambda_mult < 0 or v.lambda_mult > 1):
664
889
  raise ValueError("lambda_mult must be between 0 and 1")
665
890
  if v.fetch_k is None and v.lambda_mult is not None:
666
- raise ValueError("fetch_k is required when using lambda_mult with MMR search type")
891
+ raise ValueError(
892
+ "fetch_k is required when using lambda_mult with MMR search type"
893
+ )
667
894
  if v.lambda_mult is None and v.fetch_k is not None:
668
- raise ValueError("lambda_mult is required when using fetch_k with MMR search type")
895
+ raise ValueError(
896
+ "lambda_mult is required when using fetch_k with MMR search type"
897
+ )
669
898
  elif search_type != SearchType.MMR:
670
899
  if v.fetch_k is not None:
671
900
  raise ValueError("fetch_k is only valid for MMR search type")
@@ -674,11 +903,20 @@ class RAGPipelineModel(BaseModel):
674
903
 
675
904
  # Validate similarity_score_threshold parameters
676
905
  if search_type == SearchType.SIMILARITY_SCORE_THRESHOLD:
677
- if v.score_threshold is not None and (v.score_threshold < 0 or v.score_threshold > 1):
906
+ if v.score_threshold is not None and (
907
+ v.score_threshold < 0 or v.score_threshold > 1
908
+ ):
678
909
  raise ValueError("score_threshold must be between 0 and 1")
679
910
  if v.score_threshold is None:
680
- raise ValueError("score_threshold is required for similarity_score_threshold search type")
681
- elif search_type != SearchType.SIMILARITY_SCORE_THRESHOLD and v.score_threshold is not None:
682
- raise ValueError("score_threshold is only valid for similarity_score_threshold search type")
911
+ raise ValueError(
912
+ "score_threshold is required for similarity_score_threshold search type"
913
+ )
914
+ elif (
915
+ search_type != SearchType.SIMILARITY_SCORE_THRESHOLD
916
+ and v.score_threshold is not None
917
+ ):
918
+ raise ValueError(
919
+ "score_threshold is only valid for similarity_score_threshold search type"
920
+ )
683
921
 
684
922
  return v