MindsDB 25.2.3.0__py3-none-any.whl → 25.3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (86)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +16 -11
  3. mindsdb/api/executor/command_executor.py +1 -1
  4. mindsdb/api/executor/datahub/datanodes/system_tables.py +10 -2
  5. mindsdb/api/executor/planner/query_planner.py +6 -2
  6. mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -1
  7. mindsdb/api/http/initialize.py +8 -5
  8. mindsdb/api/http/namespaces/agents.py +0 -7
  9. mindsdb/api/http/namespaces/config.py +0 -48
  10. mindsdb/api/http/namespaces/knowledge_bases.py +1 -1
  11. mindsdb/api/http/namespaces/util.py +0 -28
  12. mindsdb/api/mongo/classes/query_sql.py +2 -1
  13. mindsdb/api/mongo/responders/aggregate.py +2 -2
  14. mindsdb/api/mongo/responders/coll_stats.py +3 -2
  15. mindsdb/api/mongo/responders/db_stats.py +2 -1
  16. mindsdb/api/mongo/responders/insert.py +4 -2
  17. mindsdb/api/mysql/mysql_proxy/classes/fake_mysql_proxy/fake_mysql_proxy.py +2 -1
  18. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +5 -4
  19. mindsdb/api/postgres/postgres_proxy/postgres_proxy.py +2 -4
  20. mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -1
  21. mindsdb/integrations/handlers/autosklearn_handler/autosklearn_handler.py +1 -1
  22. mindsdb/integrations/handlers/dspy_handler/requirements.txt +0 -1
  23. mindsdb/integrations/handlers/gmail_handler/connection_args.py +2 -2
  24. mindsdb/integrations/handlers/gmail_handler/gmail_handler.py +19 -66
  25. mindsdb/integrations/handlers/gmail_handler/requirements.txt +0 -1
  26. mindsdb/integrations/handlers/google_calendar_handler/connection_args.py +15 -0
  27. mindsdb/integrations/handlers/google_calendar_handler/google_calendar_handler.py +31 -41
  28. mindsdb/integrations/handlers/google_calendar_handler/requirements.txt +0 -2
  29. mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt +0 -1
  30. mindsdb/integrations/handlers/langchain_handler/requirements.txt +0 -1
  31. mindsdb/integrations/handlers/llama_index_handler/requirements.txt +0 -1
  32. mindsdb/integrations/handlers/openai_handler/constants.py +3 -1
  33. mindsdb/integrations/handlers/openai_handler/requirements.txt +0 -1
  34. mindsdb/integrations/handlers/rag_handler/requirements.txt +0 -1
  35. mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +33 -8
  36. mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +3 -2
  37. mindsdb/integrations/handlers/web_handler/web_handler.py +42 -33
  38. mindsdb/integrations/handlers/youtube_handler/__init__.py +2 -0
  39. mindsdb/integrations/handlers/youtube_handler/connection_args.py +32 -0
  40. mindsdb/integrations/handlers/youtube_handler/youtube_handler.py +2 -38
  41. mindsdb/integrations/libs/llm/utils.py +7 -1
  42. mindsdb/integrations/libs/process_cache.py +2 -2
  43. mindsdb/integrations/utilities/handlers/auth_utilities/google/google_user_oauth_utilities.py +29 -38
  44. mindsdb/integrations/utilities/pydantic_utils.py +208 -0
  45. mindsdb/integrations/utilities/rag/chains/local_context_summarizer_chain.py +227 -0
  46. mindsdb/integrations/utilities/rag/pipelines/rag.py +11 -4
  47. mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +800 -135
  48. mindsdb/integrations/utilities/rag/settings.py +390 -152
  49. mindsdb/integrations/utilities/sql_utils.py +2 -1
  50. mindsdb/interfaces/agents/agents_controller.py +14 -10
  51. mindsdb/interfaces/agents/callback_handlers.py +52 -5
  52. mindsdb/interfaces/agents/langchain_agent.py +5 -3
  53. mindsdb/interfaces/agents/mindsdb_chat_model.py +4 -2
  54. mindsdb/interfaces/chatbot/chatbot_controller.py +9 -8
  55. mindsdb/interfaces/database/database.py +3 -2
  56. mindsdb/interfaces/database/integrations.py +1 -1
  57. mindsdb/interfaces/database/projects.py +28 -2
  58. mindsdb/interfaces/jobs/jobs_controller.py +4 -1
  59. mindsdb/interfaces/jobs/scheduler.py +1 -1
  60. mindsdb/interfaces/knowledge_base/preprocessing/constants.py +2 -2
  61. mindsdb/interfaces/model/model_controller.py +5 -2
  62. mindsdb/interfaces/skills/retrieval_tool.py +128 -39
  63. mindsdb/interfaces/skills/skill_tool.py +7 -7
  64. mindsdb/interfaces/skills/skills_controller.py +10 -6
  65. mindsdb/interfaces/skills/sql_agent.py +6 -1
  66. mindsdb/interfaces/storage/db.py +14 -12
  67. mindsdb/interfaces/storage/json.py +59 -0
  68. mindsdb/interfaces/storage/model_fs.py +85 -3
  69. mindsdb/interfaces/triggers/triggers_controller.py +2 -1
  70. mindsdb/migrations/versions/2022-10-14_43c52d23845a_projects.py +17 -3
  71. mindsdb/migrations/versions/2025-02-10_6ab9903fc59a_del_log_table.py +33 -0
  72. mindsdb/migrations/versions/2025-02-14_4521dafe89ab_added_encrypted_content_to_json_storage.py +29 -0
  73. mindsdb/migrations/versions/2025-02-19_11347c213b36_added_metadata_to_projects.py +41 -0
  74. mindsdb/utilities/config.py +6 -1
  75. mindsdb/utilities/functions.py +11 -0
  76. mindsdb/utilities/log.py +17 -2
  77. mindsdb/utilities/ml_task_queue/consumer.py +4 -2
  78. mindsdb/utilities/render/sqlalchemy_render.py +4 -0
  79. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/METADATA +226 -247
  80. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/RECORD +83 -80
  81. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/WHEEL +1 -1
  82. mindsdb/integrations/handlers/gmail_handler/utils.py +0 -45
  83. mindsdb/utilities/log_controller.py +0 -39
  84. mindsdb/utilities/telemetry.py +0 -44
  85. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/LICENSE +0 -0
  86. {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/libs/process_cache.py

@@ -186,7 +186,6 @@ class ProcessCache:
         self._keep_alive = {}
         self._stop_event = threading.Event()
         self.cleaner_thread = None
-        self._start_clean()

     def __del__(self):
         self._stop_clean()
@@ -200,7 +199,7 @@ class ProcessCache:
         ):
             return
         self._stop_event.clear()
-        self.cleaner_thread = threading.Thread(target=self._clean)
+        self.cleaner_thread = threading.Thread(target=self._clean, name='ProcessCache.clean')
         self.cleaner_thread.daemon = True
         self.cleaner_thread.start()

@@ -258,6 +257,7 @@ class ProcessCache:
         Returns:
             Future
         """
+        self._start_clean()
         handler_module_path = payload['handler_meta']['module_path']
         integration_id = payload['handler_meta']['integration_id']
         if task_type in (ML_TASK_TYPE.LEARN, ML_TASK_TYPE.FINETUNE):
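The hunks above move the `_start_clean()` call out of `__init__` and into the task-submission path, so the (now named) daemon cleaner thread is only spun up the first time the cache is actually used. A minimal sketch of that lazy-start pattern, for illustration only; `TaskCache`, `submit` and `_clean` are hypothetical names, not MindsDB's API:

import threading

class TaskCache:
    """Illustrative sketch of lazy background-thread start; not MindsDB code."""

    def __init__(self):
        self._stop_event = threading.Event()
        self.cleaner_thread = None  # nothing is started at construction time

    def _clean(self):
        # placeholder cleanup loop; wakes up periodically until asked to stop
        while not self._stop_event.wait(timeout=1):
            pass

    def _start_clean(self):
        # no-op if the cleaner is already running
        if self.cleaner_thread is not None and self.cleaner_thread.is_alive():
            return
        self._stop_event.clear()
        self.cleaner_thread = threading.Thread(target=self._clean, name='TaskCache.clean')
        self.cleaner_thread.daemon = True
        self.cleaner_thread.start()

    def submit(self, fn, *args):
        self._start_clean()  # background thread is created on first use, not in __init__
        return fn(*args)


cache = TaskCache()          # no thread yet
cache.submit(print, 'task')  # cleaner thread starts here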
mindsdb/integrations/utilities/handlers/auth_utilities/google/google_user_oauth_utilities.py

@@ -1,9 +1,8 @@
-import os
 import json
+from pathlib import Path
 import requests
 import datetime as dt
 from flask import request
-from shutil import copyfile

 from mindsdb.utilities import log

@@ -29,73 +28,65 @@ class GoogleUserOAuth2Manager:
         creds = None

         if self.credentials_file or self.credentials_url:
-            # get the current directory and checks tokens & creds
-            curr_dir = self.handler_storage.folder_get('config')
+            oauth_user_info = self.handler_storage.encrypted_json_get('oauth_user_info')

-            creds_file = os.path.join(curr_dir, 'creds.json')
-            secret_file = os.path.join(curr_dir, 'secret.json')
-
-            if os.path.isfile(creds_file):
-                creds = Credentials.from_authorized_user_file(creds_file, self.scopes)
+            if oauth_user_info:
+                creds = Credentials.from_authorized_user_info(oauth_user_info, self.scopes)

             if not creds or not creds.valid:
                 logger.debug("Credentials do not exist or are invalid, attempting to authorize again")

-                if self._download_secret_file(secret_file):
-                    # save to storage
-                    self.handler_storage.folder_sync('config')
-                else:
-                    raise ValueError('No valid Gmail Credentials filepath or S3 url found.')
+                oauth_user_info = self._download_oauth_user_info()

                 if creds and creds.expired and creds.refresh_token:
                     creds.refresh(Request())
                     logger.debug("Credentials refreshed successfully")
                 else:
-                    creds = self._execute_google_auth_flow(secret_file, self.scopes, self.code)
+                    creds = self._execute_google_auth_flow(oauth_user_info)
                     logger.debug("New credentials obtained")

-                self._save_credentials_to_file(creds, creds_file)
-                logger.debug(f"saved session credentials to {creds_file}")
-                self.handler_storage.folder_sync('config')
+                self.handler_storage.encrypted_json_set('oauth_user_info', self._convert_credentials_to_dict(creds))
+                logger.debug("Saving credentials to storage")

         return creds

-    def _download_secret_file(self, secret_file):
-        # if credentials_url is set, attempt to download the file
+    def _download_oauth_user_info(self):
+        # if credentials_url is set, attempt to download the contents of the files
         # this will be given preference over credentials_file
         if self.credentials_url:
             response = requests.get(self.credentials_url)
             if response.status_code == 200:
-                with open(secret_file, 'w') as creds:
-                    creds.write(response.text)
-                return True
+                return response.json()
             else:
-                logger.error("Failed to get credentials from S3", response.status_code)
+                logger.error("Failed to get credentials from URL", response.status_code)
+
+        # if credentials_file is set, attempt to read the contents of the file
+        if self.credentials_file:
+            path = Path(self.credentials_file).expanduser()
+            if path.exists():
+                with open(path, 'r') as f:
+                    return json.load(f)
+            else:
+                logger.error("Credentials file does not exist")

-        # if credentials_file is set, attempt to copy the file
-        if self.credentials_file and os.path.isfile(self.credentials_file):
-            copyfile(self.credentials_file, secret_file)
-            return True
-        return False
+        raise ValueError('OAuth2 credentials could not be found')

-    def _execute_google_auth_flow(self, secret_file, scopes, code=None):
-        flow = Flow.from_client_secrets_file(secret_file, scopes)
+    def _execute_google_auth_flow(self, oauth_user_info: dict):
+        flow = Flow.from_client_config(
+            oauth_user_info,
+            scopes=self.scopes
+        )

         flow.redirect_uri = request.headers['ORIGIN'] + '/verify-auth'

-        if code:
-            flow.fetch_token(code=code)
+        if self.code:
+            flow.fetch_token(code=self.code)
             creds = flow.credentials
             return creds
         else:
             auth_url = flow.authorization_url()[0]
             raise AuthException(f'Authorisation required. Please follow the url: {auth_url}', auth_url=auth_url)

-    def _save_credentials_to_file(self, creds, file_path):
-        with open(file_path, 'w') as token:
-            data = self._convert_credentials_to_dict(creds)
-            token.write(json.dumps(data))
-
     def _convert_credentials_to_dict(self, credentials):
         return {
             'token': credentials.token,
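The refactor above replaces the on-disk `creds.json`/`secret.json` files with the handler storage's encrypted JSON interface (`encrypted_json_get` / `encrypted_json_set`), presumably backed by the new encrypted JSON storage introduced in this release (see `mindsdb/interfaces/storage/json.py` and the `added_encrypted_content_to_json_storage` migration in the file list). A rough sketch of the get/set round-trip the new code relies on; the class below is a stand-in for illustration, and the encryption details are assumptions, not MindsDB's actual implementation:

import json
from cryptography.fernet import Fernet

class DemoEncryptedJsonStorage:
    """Stand-in for the handler storage object; illustrates the interface only."""

    def __init__(self, key: bytes):
        self._fernet = Fernet(key)
        self._store = {}

    def encrypted_json_set(self, name: str, data: dict) -> None:
        # serialize and store the dict encrypted under the given key
        self._store[name] = self._fernet.encrypt(json.dumps(data).encode())

    def encrypted_json_get(self, name: str):
        # return the decrypted dict, or None if the key was never written
        blob = self._store.get(name)
        return json.loads(self._fernet.decrypt(blob)) if blob else None


storage = DemoEncryptedJsonStorage(Fernet.generate_key())
storage.encrypted_json_set('oauth_user_info', {'token': '...', 'refresh_token': '...', 'scopes': []})
print(storage.encrypted_json_get('oauth_user_info'))  # falsy result means no stored credentials

This matches how the handler code above treats a missing value: `if oauth_user_info:` falls through to re-running the OAuth flow when nothing has been stored yet.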
mindsdb/integrations/utilities/pydantic_utils.py (new file)

@@ -0,0 +1,208 @@
+import pprint
+
+pydantic_schema_description = """## Understanding Pydantic Schemas for JSON Formatting
+
+Pydantic schemas provide a framework for defining the structure and validation rules for JSON output. Below is an overview of key components commonly found in a Pydantic schema:
+
+### Key Components
+
+Each object in the schema represents a Pydantic model in JSON format. Typical fields in a Pydantic model description include:
+
+- **`anyOf`**:
+  - A list describing possible values for a Pydantic model field.
+
+- **`additionalProperties`**:
+  - Describes the keys of a dictionary. Keys are always of type `string` due to this being a JSON Pydantic schema. The corresponding key types supported by Pydantic are:
+    - `string`: a text string
+    - `integer`: an integer number
+    - `number`: a floating-point number
+
+- **`items`**:
+  - Describes the items contained within an `array` (list).
+
+- **`type`**:
+  - Specifies the Pydantic type assigned to the field, defining the expected data type. Common types include:
+    - `string`: a text string
+    - `integer`: an integer number
+    - `number`: a floating-point number
+    - `array`: a list
+    - `object`: a dictionary
+    - `null`: the python null value None. Indicates the field is optional.
+
+- **`description`**:
+  - Provides a textual narrative explaining the purpose and details of the output JSON field.
+
+- **`title`**:
+  - A Pydantic-generated, human-readable title for the field.
+
+- **`default`**:
+  - The default value for this field if no value is provided by the user.
+
+### Schema
+
+Below is the Pydantic schema:
+
+{schema}
+
+### Examples
+
+Below is an example of well-formed output adhering to this schema.
+
+- Dummy text strings are represented as "lorem ipsum."
+
+{example}
+"""
+
+
+def get_dummy_value(field_value):
+    """A function to return a dummy value of a Pydantic model field."""
+    type_str = field_value["type"]
+    example_dict = {
+        "string": "lorem ipsum",
+        "int": 3,
+        "number": 42.0,
+        "null": None,
+        "object": {"lorem ipsum": "lorem_ipsum"},
+    }
+
+    if type_str in example_dict:
+        return example_dict[type_str]
+    else:
+        return None
+
+
+def get_dummy_array(field_value):
+    """A function to return a dummy array of a Pydantic model field."""
+    items = field_value["items"]
+
+    if "type" in items:
+        if items["type"] == "null":  # skip if null
+            pass
+        elif items["type"] == "array":  # is it an array?
+            array_value = get_dummy_array(items)
+        elif (
+            items["type"] == "object" and "additionalProperties" in items
+        ):  # is it a dict?
+            array_value = get_dummy_dict(items)
+        else:  # it is a regular value!
+            array_value = get_dummy_value(items)
+        return [array_value for _ in range(2)]
+
+    elif "AnyOf" in field_value["items"]:
+        array_value = get_any_of(field_value["items"])  # can be one of many types
+        return [array_value for _ in range(2)]
+
+    else:  # is it a pydantic class?
+        array_value = example_generator(items)
+        return [array_value for _ in range(2)]
+
+
+def get_dummy_dict(field_value):
+    """A function to return a dummy dictionary of a Pydantic model field."""
+    return get_dummy_value(field_value)
+
+
+def get_any_of(field_value):
+    """A function to return the first viable pydantic type of an Any() Pydantic model field."""
+    for any_of in field_value["anyOf"]:
+        if "type" in any_of:
+            if any_of["type"] == "null":  # skip if null
+                continue
+            elif any_of["type"] == "array":  # is it an array?
+                out = get_dummy_array(any_of)
+                return out
+            elif (
+                any_of["type"] == "object" and "additionalProperties" in any_of
+            ):  # is it a dict?
+                out = get_dummy_dict(any_of)
+                return out
+            else:  # it is a regular value!
+                out = get_dummy_value(any_of)
+                return out
+        else:  # is it a pydantic class?
+            out = example_generator(any_of)
+            return out
+
+
+def example_generator(pydantic_json_schema):
+    """dynamically parse a pydantic object and generate an example of it's formatting."""
+
+    example_dict = {}
+    for schema_name, schema in pydantic_json_schema.items():
+
+        for field_name, field_value in schema.items():
+            if "type" in field_value:
+
+                if field_value["type"] == "array":  # is it an array?
+                    example_dict[field_name] = get_dummy_array(field_value)
+
+                elif (
+                    field_value["type"] == "object"
+                    and "additionalProperties" in field_value
+                ):  # is it a dict?
+                    example_dict[field_name] = get_dummy_dict(field_value)
+
+                else:  # it is a regular value!
+                    example_dict[field_name] = get_dummy_value(field_value)
+
+            elif "anyOf" in field_value:
+                example_dict[field_name] = get_any_of(field_value)
+
+            else:  # it is a pydantic class
+                example_dict[field_name] = example_generator(field_value)
+    return example_dict
+
+
+def search_and_replace_refs(schema, defs, ref_skip={}, n=0):
+    """Dynamically substitute subclass references in a Pydantic object schema."""
+    for key, value in schema.items():
+        if key in ref_skip:
+            continue
+        if type(value) is dict:
+            if "$ref" in value:
+                definition_key = value["$ref"].split("/")[-1]
+                if definition_key in ref_skip:
+                    schema[key] = {"type": "null"}
+                else:
+                    schema[key] = {definition_key: defs[definition_key]["properties"]}
+            else:
+                search_and_replace_refs(value, defs, ref_skip, n + 1)
+        elif type(value) is list:
+            for val in value:
+                search_and_replace_refs(val, defs, ref_skip, n + 1)
+
+
+def remove_extraneous_fields(schema, ref_skip):
+    """Remove extraneous fields from object descriptions."""
+    reduced_schema = schema["properties"]
+
+    for ref in ref_skip.keys():
+        if ref in reduced_schema:
+            del reduced_schema[ref]
+
+    for key, value in reduced_schema.items():
+        if "title" in value:
+            del value["title"]
+        if "$defs" in value:
+            del value["$defs"]
+        if "required" in value:
+            del value["required"]
+
+    return reduced_schema
+
+
+def format_for_prompt(pydantic_object, ref_skip={}):
+    """Format a Pydantic object description for prompting an LLM."""
+    schema = {k: v for k, v in pydantic_object.schema().items()}
+
+    search_and_replace_refs(
+        schema=schema["properties"], defs=schema["$defs"], ref_skip=ref_skip, n=0
+    )
+
+    reduced_schema = remove_extraneous_fields(schema, ref_skip)
+
+    reduced_schema = {schema["title"]: reduced_schema}
+
+    out = pprint.pformat(reduced_schema)
+
+    return out, reduced_schema
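As a rough usage sketch of the new module: `format_for_prompt` flattens a Pydantic model's JSON schema (inlining `$ref`s to nested models) into a prompt-friendly dict, `example_generator` builds a dummy instance from that reduced schema, and both can be dropped into `pydantic_schema_description`. The `Attachment`/`Email` models below are made up for illustration, and the sketch assumes a Pydantic version whose `.schema()` output nests definitions under `$defs` (Pydantic v2), since `format_for_prompt` reads `schema["$defs"]` directly:

import pprint
from typing import List

from pydantic import BaseModel

from mindsdb.integrations.utilities.pydantic_utils import (
    example_generator,
    format_for_prompt,
    pydantic_schema_description,
)


class Attachment(BaseModel):
    filename: str
    mime_type: str


class Email(BaseModel):
    subject: str
    attachments: List[Attachment]


# Flatten the JSON schema (inlining the $ref to Attachment) into a prompt-friendly dict.
schema_text, reduced_schema = format_for_prompt(Email)

# Build a dummy instance that follows the reduced schema.
example = example_generator(reduced_schema)
# -> {'subject': 'lorem ipsum',
#     'attachments': [{'filename': 'lorem ipsum', 'mime_type': 'lorem ipsum'},
#                     {'filename': 'lorem ipsum', 'mime_type': 'lorem ipsum'}]}

# Fill the prompt template shipped with the module.
prompt = pydantic_schema_description.format(schema=schema_text, example=pprint.pformat(example))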
mindsdb/integrations/utilities/rag/chains/local_context_summarizer_chain.py (new file)

@@ -0,0 +1,227 @@
+import asyncio
+from collections import namedtuple
+from typing import Any, Dict, List, Optional
+
+from mindsdb.interfaces.agents.langchain_agent import create_chat_model
+from langchain.chains.base import Chain
+from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+from langchain.chains.llm import LLMChain
+from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain, ReduceDocumentsChain
+from langchain_core.callbacks import dispatch_custom_event
+from langchain_core.callbacks.manager import CallbackManagerForChainRun
+from langchain_core.documents import Document
+from langchain_core.prompts import PromptTemplate
+from pandas import DataFrame
+
+from mindsdb.integrations.libs.vectordatabase_handler import VectorStoreHandler
+from mindsdb.integrations.utilities.rag.settings import SummarizationConfig
+from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
+from mindsdb.utilities import log
+
+logger = log.getLogger(__name__)
+
+Summary = namedtuple('Summary', ['source_id', 'content'])
+
+
+def create_map_reduce_documents_chain(summarization_config: SummarizationConfig, input: str) -> ReduceDocumentsChain:
+    """Creates a chain that map-reduces documents into a single consolidated summary."""
+    summarization_llm = create_chat_model({
+        'model_name': summarization_config.llm_config.model_name,
+        'provider': summarization_config.llm_config.provider,
+        **summarization_config.llm_config.params
+    })
+
+    reduce_prompt_template = summarization_config.reduce_prompt_template
+    reduce_prompt = PromptTemplate.from_template(reduce_prompt_template)
+    if 'input' in reduce_prompt.input_variables:
+        reduce_prompt = reduce_prompt.partial(input=input)
+
+    reduce_chain = LLMChain(llm=summarization_llm, prompt=reduce_prompt)
+
+    combine_documents_chain = StuffDocumentsChain(
+        llm_chain=reduce_chain,
+        document_variable_name='docs'
+    )
+
+    return ReduceDocumentsChain(
+        combine_documents_chain=combine_documents_chain,
+        collapse_documents_chain=combine_documents_chain,
+        token_max=summarization_config.max_summarization_tokens
+    )
+
+
+class LocalContextSummarizerChain(Chain):
+    """Summarizes M chunks before and after a given chunk in a document."""
+
+    doc_id_key: str = 'original_row_id'
+    chunk_index_key: str = 'chunk_index'
+
+    vector_store_handler: VectorStoreHandler
+    table_name: str = 'embeddings'
+    content_column_name: str = 'content'
+    metadata_column_name: str = 'metadata'
+
+    summarization_config: SummarizationConfig
+    map_reduce_documents_chain: Optional[ReduceDocumentsChain] = None
+
+    def _select_chunks_from_vector_store(self, doc_id: str) -> DataFrame:
+        condition = FilterCondition(
+            f"{self.metadata_column_name}->>'{self.doc_id_key}'",
+            FilterOperator.EQUAL,
+            doc_id
+        )
+        return self.vector_store_handler.select(
+            self.table_name,
+            columns=[self.content_column_name, self.metadata_column_name],
+            conditions=[condition]
+        )
+
+    async def _get_all_chunks_for_document(self, doc_id: str) -> List[Document]:
+        df = await asyncio.get_event_loop().run_in_executor(
+            None, self._select_chunks_from_vector_store, doc_id
+        )
+        chunks = []
+        for _, row in df.iterrows():
+            metadata = row.get(self.metadata_column_name, {})
+            metadata[self.chunk_index_key] = row.get('chunk_id', 0)
+            chunks.append(Document(page_content=row[self.content_column_name], metadata=metadata))
+
+        return sorted(chunks, key=lambda x: x.metadata.get(self.chunk_index_key, 0))
+
+    async def summarize_local_context(self, doc_id: str, target_chunk_index: int, M: int) -> Summary:
+        """
+        Summarizes M chunks before and after the given chunk.
+
+        Args:
+            doc_id (str): Document ID.
+            target_chunk_index (int): Index of the chunk to summarize around.
+            M (int): Number of chunks before and after to include.
+
+        Returns:
+            Summary: Summary object containing source_id and summary content.
+        """
+        logger.debug(f"Fetching chunks for document {doc_id}")
+        all_chunks = await self._get_all_chunks_for_document(doc_id)
+
+        if not all_chunks:
+            logger.warning(f"No chunks found for document {doc_id}")
+            return Summary(source_id=doc_id, content='')
+
+        # Determine window boundaries
+        start_idx = max(0, target_chunk_index - M)
+        end_idx = min(len(all_chunks), target_chunk_index + M + 1)
+        local_chunks = all_chunks[start_idx:end_idx]
+
+        logger.debug(f"Summarizing chunks {start_idx} to {end_idx - 1} for document {doc_id}")
+
+        if not self.map_reduce_documents_chain:
+            self.map_reduce_documents_chain = create_map_reduce_documents_chain(
+                self.summarization_config, input="Summarize these chunks."
+            )
+
+        summary_result = await self.map_reduce_documents_chain.ainvoke(local_chunks)
+        summary_text = summary_result.get('output_text', '')
+
+        logger.debug(f"Generated summary: {summary_text[:100]}...")
+
+        return Summary(source_id=doc_id, content=summary_text)
+
+    @property
+    def input_keys(self) -> List[str]:
+        return [self.context_key, self.question_key]
+
+    @property
+    def output_keys(self) -> List[str]:
+        return [self.context_key, self.question_key]
+
+    async def _get_source_summary(self, source_id: str, map_reduce_documents_chain: MapReduceDocumentsChain) -> Summary:
+        if not source_id:
+            logger.warning("Received empty source_id, returning empty summary")
+            return Summary(source_id='', content='')
+
+        logger.debug(f"Getting summary for source ID: {source_id}")
+        source_chunks = await self._get_all_chunks_for_document(source_id)
+
+        if not source_chunks:
+            logger.warning(f"No chunks found for source ID: {source_id}")
+            return Summary(source_id=source_id, content='')
+
+        logger.debug(f"Summarizing {len(source_chunks)} chunks for source ID: {source_id}")
+        summary = await map_reduce_documents_chain.ainvoke(source_chunks)
+        content = summary.get('output_text', '')
+        logger.debug(f"Generated summary for source ID {source_id}: {content[:100]}...")
+
+        # Stream summarization update.
+        dispatch_custom_event('summary', {'source_id': source_id, 'content': content})
+
+        return Summary(source_id=source_id, content=content)
+
+    async def _get_source_summaries(self, source_ids: List[str], map_reduce_documents_chain: MapReduceDocumentsChain) -> \
+            List[Summary]:
+        summaries = await asyncio.gather(
+            *[self._get_source_summary(source_id, map_reduce_documents_chain) for source_id in source_ids]
+        )
+        return summaries
+
+    def _call(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[CallbackManagerForChainRun] = None
+    ) -> Dict[str, Any]:
+        # Step 1: Connect to vector store to ensure embeddings are accessible
+        self.vector_store_handler.connect()
+
+        context_chunks: List[Document] = inputs.get(self.context_key, [])
+        logger.debug(f"Found {len(context_chunks)} context chunks.")
+
+        # Step 2: Extract unique document IDs from the provided chunks
+        unique_document_ids = self._get_document_ids_from_chunks(context_chunks)
+        logger.debug(f"Extracted {len(unique_document_ids)} unique document IDs: {unique_document_ids}")
+
+        # Step 3: Initialize the summarization chain if not provided
+        question = inputs.get(self.question_key, '')
+        map_reduce_documents_chain = self.map_reduce_documents_chain or create_map_reduce_documents_chain(
+            self.summarization_config, question
+        )
+
+        # Step 4: Dispatch event to signal summarization start
+        if run_manager:
+            run_manager.on_text("Starting summarization for documents.", verbose=True)
+
+        # Step 5: Process each document ID to summarize chunks with local context
+        for doc_id in unique_document_ids:
+            logger.debug(f"Fetching and summarizing chunks for document ID: {doc_id}")
+
+            # Fetch all chunks for the document
+            chunks = asyncio.get_event_loop().run_until_complete(self._get_all_chunks_for_document(doc_id))
+            if not chunks:
+                logger.warning(f"No chunks found for document ID: {doc_id}")
+                continue
+
+            # Summarize each chunk with M neighboring chunks
+            M = self.neighbor_window
+            for i, chunk in enumerate(chunks):
+                window_chunks = chunks[max(0, i - M): min(len(chunks), i + M + 1)]
+                local_summary = asyncio.get_event_loop().run_until_complete(
+                    map_reduce_documents_chain.ainvoke(window_chunks)
+                )
+                chunk.metadata['summary'] = local_summary.get('output_text', '')
+                logger.debug(f"Chunk {i} summary: {chunk.metadata['summary'][:100]}...")
+
+        # Step 6: Update the original context chunks with the newly generated summaries
+        for chunk in context_chunks:
+            doc_id = str(chunk.metadata.get(self.doc_id_key, ''))
+            matching_chunk = next((c for c in chunks if c.metadata.get(self.doc_id_key) == doc_id and c.metadata.get(
+                'chunk_index') == chunk.metadata.get('chunk_index')), None)
+            if matching_chunk:
+                chunk.metadata['summary'] = matching_chunk.metadata.get('summary', '')
+            else:
+                chunk.metadata['summary'] = ''
+                logger.warning(f"No matching chunk found for doc_id: {doc_id}")
+
+        # Step 7: Signal summarization end
+        if run_manager:
+            run_manager.on_text("Summarization completed.", verbose=True)
+
+        logger.debug(f"Updated {len(context_chunks)} context chunks with summaries.")
+        return inputs
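A rough sketch of driving the new chain directly via `summarize_local_context`, which fetches a document's chunks from the vector store and map-reduces a window of M neighbors around a target chunk. The two helper calls below are hypothetical placeholders (the real `VectorStoreHandler` and `SummarizationConfig` objects come from MindsDB's knowledge-base plumbing), so treat this as an illustration of the interface rather than working setup code:

import asyncio

from mindsdb.integrations.utilities.rag.chains.local_context_summarizer_chain import LocalContextSummarizerChain

# Placeholders: in MindsDB these come from the knowledge base's vector DB and RAG settings.
vector_db = get_vector_store_handler()                # hypothetical helper returning a connected VectorStoreHandler
summarization_config = build_summarization_config()   # hypothetical helper returning a SummarizationConfig

chain = LocalContextSummarizerChain(
    vector_store_handler=vector_db,
    summarization_config=summarization_config,
    table_name='embeddings',            # defaults shown explicitly for clarity
    content_column_name='content',
    metadata_column_name='metadata',
)

# Summarize the 2 chunks before and after chunk 5 of one source document.
summary = asyncio.run(chain.summarize_local_context(doc_id='doc-123', target_chunk_index=5, M=2))
print(summary.source_id, summary.content[:200])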
mindsdb/integrations/utilities/rag/pipelines/rag.py

@@ -294,16 +294,23 @@ class LangChainRAGPipeline:
         retriever = SQLRetriever(
             fallback_retriever=vector_store_retriever,
             vector_store_handler=knowledge_base_table.get_vector_db(),
-            metadata_schemas=retriever_config.metadata_schemas,
-            examples=retriever_config.examples,
+            min_k=retriever_config.min_k,
+            max_filters=retriever_config.max_filters,
+            filter_threshold=retriever_config.filter_threshold,
+            database_schema=retriever_config.database_schema,
             embeddings_model=embeddings,
+            search_kwargs=config.search_kwargs,
             rewrite_prompt_template=retriever_config.rewrite_prompt_template,
-            metadata_filters_prompt_template=retriever_config.metadata_filters_prompt_template,
+            table_prompt_template=retriever_config.table_prompt_template,
+            column_prompt_template=retriever_config.column_prompt_template,
+            value_prompt_template=retriever_config.value_prompt_template,
+            boolean_system_prompt=retriever_config.boolean_system_prompt,
+            generative_system_prompt=retriever_config.generative_system_prompt,
             num_retries=retriever_config.num_retries,
             embeddings_table=knowledge_base_table._kb.vector_database_table,
             source_table=retriever_config.source_table,
+            source_id_column=retriever_config.source_id_column,
             distance_function=distance_function,
-            search_kwargs=config.search_kwargs,
             llm=sql_llm
         )
         return cls(