MindsDB 25.2.3.0__py3-none-any.whl → 25.3.1.0__py3-none-any.whl
This diff shows the changes between these two publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
Potentially problematic release.
This version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +16 -11
- mindsdb/api/executor/command_executor.py +1 -1
- mindsdb/api/executor/datahub/datanodes/system_tables.py +10 -2
- mindsdb/api/executor/planner/query_planner.py +6 -2
- mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -1
- mindsdb/api/http/initialize.py +8 -5
- mindsdb/api/http/namespaces/agents.py +0 -7
- mindsdb/api/http/namespaces/config.py +0 -48
- mindsdb/api/http/namespaces/knowledge_bases.py +1 -1
- mindsdb/api/http/namespaces/util.py +0 -28
- mindsdb/api/mongo/classes/query_sql.py +2 -1
- mindsdb/api/mongo/responders/aggregate.py +2 -2
- mindsdb/api/mongo/responders/coll_stats.py +3 -2
- mindsdb/api/mongo/responders/db_stats.py +2 -1
- mindsdb/api/mongo/responders/insert.py +4 -2
- mindsdb/api/mysql/mysql_proxy/classes/fake_mysql_proxy/fake_mysql_proxy.py +2 -1
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +5 -4
- mindsdb/api/postgres/postgres_proxy/postgres_proxy.py +2 -4
- mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/autosklearn_handler/autosklearn_handler.py +1 -1
- mindsdb/integrations/handlers/dspy_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/gmail_handler/connection_args.py +2 -2
- mindsdb/integrations/handlers/gmail_handler/gmail_handler.py +19 -66
- mindsdb/integrations/handlers/gmail_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/google_calendar_handler/connection_args.py +15 -0
- mindsdb/integrations/handlers/google_calendar_handler/google_calendar_handler.py +31 -41
- mindsdb/integrations/handlers/google_calendar_handler/requirements.txt +0 -2
- mindsdb/integrations/handlers/langchain_embedding_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/langchain_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/openai_handler/constants.py +3 -1
- mindsdb/integrations/handlers/openai_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/rag_handler/requirements.txt +0 -1
- mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +33 -8
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +3 -2
- mindsdb/integrations/handlers/web_handler/web_handler.py +42 -33
- mindsdb/integrations/handlers/youtube_handler/__init__.py +2 -0
- mindsdb/integrations/handlers/youtube_handler/connection_args.py +32 -0
- mindsdb/integrations/handlers/youtube_handler/youtube_handler.py +2 -38
- mindsdb/integrations/libs/llm/utils.py +7 -1
- mindsdb/integrations/libs/process_cache.py +2 -2
- mindsdb/integrations/utilities/handlers/auth_utilities/google/google_user_oauth_utilities.py +29 -38
- mindsdb/integrations/utilities/pydantic_utils.py +208 -0
- mindsdb/integrations/utilities/rag/chains/local_context_summarizer_chain.py +227 -0
- mindsdb/integrations/utilities/rag/pipelines/rag.py +11 -4
- mindsdb/integrations/utilities/rag/retrievers/sql_retriever.py +800 -135
- mindsdb/integrations/utilities/rag/settings.py +390 -152
- mindsdb/integrations/utilities/sql_utils.py +2 -1
- mindsdb/interfaces/agents/agents_controller.py +14 -10
- mindsdb/interfaces/agents/callback_handlers.py +52 -5
- mindsdb/interfaces/agents/langchain_agent.py +5 -3
- mindsdb/interfaces/agents/mindsdb_chat_model.py +4 -2
- mindsdb/interfaces/chatbot/chatbot_controller.py +9 -8
- mindsdb/interfaces/database/database.py +3 -2
- mindsdb/interfaces/database/integrations.py +1 -1
- mindsdb/interfaces/database/projects.py +28 -2
- mindsdb/interfaces/jobs/jobs_controller.py +4 -1
- mindsdb/interfaces/jobs/scheduler.py +1 -1
- mindsdb/interfaces/knowledge_base/preprocessing/constants.py +2 -2
- mindsdb/interfaces/model/model_controller.py +5 -2
- mindsdb/interfaces/skills/retrieval_tool.py +128 -39
- mindsdb/interfaces/skills/skill_tool.py +7 -7
- mindsdb/interfaces/skills/skills_controller.py +10 -6
- mindsdb/interfaces/skills/sql_agent.py +6 -1
- mindsdb/interfaces/storage/db.py +14 -12
- mindsdb/interfaces/storage/json.py +59 -0
- mindsdb/interfaces/storage/model_fs.py +85 -3
- mindsdb/interfaces/triggers/triggers_controller.py +2 -1
- mindsdb/migrations/versions/2022-10-14_43c52d23845a_projects.py +17 -3
- mindsdb/migrations/versions/2025-02-10_6ab9903fc59a_del_log_table.py +33 -0
- mindsdb/migrations/versions/2025-02-14_4521dafe89ab_added_encrypted_content_to_json_storage.py +29 -0
- mindsdb/migrations/versions/2025-02-19_11347c213b36_added_metadata_to_projects.py +41 -0
- mindsdb/utilities/config.py +6 -1
- mindsdb/utilities/functions.py +11 -0
- mindsdb/utilities/log.py +17 -2
- mindsdb/utilities/ml_task_queue/consumer.py +4 -2
- mindsdb/utilities/render/sqlalchemy_render.py +4 -0
- {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/METADATA +226 -247
- {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/RECORD +83 -80
- {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/WHEEL +1 -1
- mindsdb/integrations/handlers/gmail_handler/utils.py +0 -45
- mindsdb/utilities/log_controller.py +0 -39
- mindsdb/utilities/telemetry.py +0 -44
- {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/LICENSE +0 -0
- {MindsDB-25.2.3.0.dist-info → mindsdb-25.3.1.0.dist-info}/top_level.txt +0 -0
@@ -186,7 +186,6 @@ class ProcessCache:
         self._keep_alive = {}
         self._stop_event = threading.Event()
         self.cleaner_thread = None
-        self._start_clean()
 
     def __del__(self):
         self._stop_clean()
@@ -200,7 +199,7 @@ class ProcessCache:
         ):
             return
         self._stop_event.clear()
-        self.cleaner_thread = threading.Thread(target=self._clean)
+        self.cleaner_thread = threading.Thread(target=self._clean, name='ProcessCache.clean')
         self.cleaner_thread.daemon = True
         self.cleaner_thread.start()
 
@@ -258,6 +257,7 @@ class ProcessCache:
         Returns:
             Future
         """
+        self._start_clean()
         handler_module_path = payload['handler_meta']['module_path']
         integration_id = payload['handler_meta']['integration_id']
         if task_type in (ML_TASK_TYPE.LEARN, ML_TASK_TYPE.FINETUNE):
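This release stops starting the ProcessCache cleaner thread in `__init__` and instead starts it lazily on the first task submission, giving the thread an explicit name. A minimal sketch of the same lazy-start pattern; the class and method names below are illustrative, not MindsDB's actual `ProcessCache` API:

```python
import threading
import time


class LazyCleaner:
    """Start a named daemon cleaner thread only when work is first submitted."""

    def __init__(self):
        self._stop_event = threading.Event()
        self.cleaner_thread = None  # not started yet; idle instances spawn no thread

    def _start_clean(self):
        # Idempotent: do nothing if the cleaner is already running.
        if self.cleaner_thread is not None and self.cleaner_thread.is_alive():
            return
        self._stop_event.clear()
        self.cleaner_thread = threading.Thread(target=self._clean, name='LazyCleaner.clean')
        self.cleaner_thread.daemon = True
        self.cleaner_thread.start()

    def _clean(self):
        # Wake up once a second until asked to stop; periodic cleanup would go here.
        while not self._stop_event.wait(timeout=1):
            pass

    def submit(self, task):
        self._start_clean()  # lazily start the background thread on first use
        return task()


if __name__ == '__main__':
    cache = LazyCleaner()
    print(cache.submit(lambda: 42))
    time.sleep(0.1)
```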
mindsdb/integrations/utilities/handlers/auth_utilities/google/google_user_oauth_utilities.py
CHANGED

@@ -1,9 +1,8 @@
-import os
 import json
+from pathlib import Path
 import requests
 import datetime as dt
 from flask import request
-from shutil import copyfile
 
 from mindsdb.utilities import log
 
@@ -29,73 +28,65 @@ class GoogleUserOAuth2Manager:
         creds = None
 
         if self.credentials_file or self.credentials_url:
-
-            curr_dir = self.handler_storage.folder_get('config')
+            oauth_user_info = self.handler_storage.encrypted_json_get('oauth_user_info')
 
-
-
-
-            if os.path.isfile(creds_file):
-                creds = Credentials.from_authorized_user_file(creds_file, self.scopes)
+            if oauth_user_info:
+                creds = Credentials.from_authorized_user_info(oauth_user_info, self.scopes)
 
             if not creds or not creds.valid:
                 logger.debug("Credentials do not exist or are invalid, attempting to authorize again")
 
-
-                    # save to storage
-                    self.handler_storage.folder_sync('config')
-                else:
-                    raise ValueError('No valid Gmail Credentials filepath or S3 url found.')
+                oauth_user_info = self._download_oauth_user_info()
 
                 if creds and creds.expired and creds.refresh_token:
                     creds.refresh(Request())
                     logger.debug("Credentials refreshed successfully")
                 else:
-                    creds = self._execute_google_auth_flow(
+                    creds = self._execute_google_auth_flow(oauth_user_info)
                     logger.debug("New credentials obtained")
 
-                self.
-                logger.debug(
-                self.handler_storage.folder_sync('config')
+                self.handler_storage.encrypted_json_set('oauth_user_info', self._convert_credentials_to_dict(creds))
+                logger.debug("Saving credentials to storage")
 
         return creds
 
-    def
-        # if credentials_url is set, attempt to download the
+    def _download_oauth_user_info(self):
+        # if credentials_url is set, attempt to download the contents of the files
        # this will be given preference over credentials_file
        if self.credentials_url:
            response = requests.get(self.credentials_url)
            if response.status_code == 200:
-
-                creds.write(response.text)
-                return True
+                return response.json()
            else:
-                logger.error("Failed to get credentials from
+                logger.error("Failed to get credentials from URL", response.status_code)
+
+        # if credentials_file is set, attempt to read the contents of the file
+        if self.credentials_file:
+            path = Path(self.credentials_file).expanduser()
+            if path.exists():
+                with open(path, 'r') as f:
+                    return json.load(f)
+            else:
+                logger.error("Credentials file does not exist")
 
-
-        if self.credentials_file and os.path.isfile(self.credentials_file):
-            copyfile(self.credentials_file, secret_file)
-            return True
-        return False
+        raise ValueError('OAuth2 credentials could not be found')
 
-    def _execute_google_auth_flow(self,
-        flow = Flow.
+    def _execute_google_auth_flow(self, oauth_user_info: dict):
+        flow = Flow.from_client_config(
+            oauth_user_info,
+            scopes=self.scopes
+        )
 
        flow.redirect_uri = request.headers['ORIGIN'] + '/verify-auth'
 
-        if code:
-            flow.fetch_token(code=code)
+        if self.code:
+            flow.fetch_token(code=self.code)
            creds = flow.credentials
            return creds
        else:
            auth_url = flow.authorization_url()[0]
            raise AuthException(f'Authorisation required. Please follow the url: {auth_url}', auth_url=auth_url)
 
-    def _save_credentials_to_file(self, creds, file_path):
-        with open(file_path, 'w') as token:
-            data = self._convert_credentials_to_dict(creds)
-            token.write(json.dumps(data))
-
     def _convert_credentials_to_dict(self, credentials):
        return {
            'token': credentials.token,
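The rewritten `GoogleUserOAuth2Manager` now caches authorized-user info as an encrypted JSON blob via `encrypted_json_get`/`encrypted_json_set` instead of syncing credential files into handler storage. For context, a minimal sketch of the underlying google-auth pattern it builds on; the file path and scope below are illustrative and not part of the handler:

```python
import json
from pathlib import Path

from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials

# Example scope; the handler passes whatever scopes it was configured with.
SCOPES = ['https://www.googleapis.com/auth/calendar.readonly']


def load_credentials(info_path: str) -> Credentials:
    # Load authorized-user info as a dict (the handler reads it from encrypted storage,
    # a URL, or a local file; a plain local file is used here purely for illustration).
    info = json.loads(Path(info_path).expanduser().read_text())
    creds = Credentials.from_authorized_user_info(info, SCOPES)
    if creds.expired and creds.refresh_token:
        creds.refresh(Request())  # refresh in place using the stored refresh token
    return creds
```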
mindsdb/integrations/utilities/pydantic_utils.py
ADDED

@@ -0,0 +1,208 @@
+import pprint
+
+pydantic_schema_description = """## Understanding Pydantic Schemas for JSON Formatting
+
+Pydantic schemas provide a framework for defining the structure and validation rules for JSON output. Below is an overview of key components commonly found in a Pydantic schema:
+
+### Key Components
+
+Each object in the schema represents a Pydantic model in JSON format. Typical fields in a Pydantic model description include:
+
+- **`anyOf`**:
+  - A list describing possible values for a Pydantic model field.
+
+- **`additionalProperties`**:
+  - Describes the keys of a dictionary. Keys are always of type `string` due to this being a JSON Pydantic schema. The corresponding key types supported by Pydantic are:
+    - `string`: a text string
+    - `integer`: an integer number
+    - `number`: a floating-point number
+
+- **`items`**:
+  - Describes the items contained within an `array` (list).
+
+- **`type`**:
+  - Specifies the Pydantic type assigned to the field, defining the expected data type. Common types include:
+    - `string`: a text string
+    - `integer`: an integer number
+    - `number`: a floating-point number
+    - `array`: a list
+    - `object`: a dictionary
+    - `null`: the python null value None. Indicates the field is optional.
+
+- **`description`**:
+  - Provides a textual narrative explaining the purpose and details of the output JSON field.
+
+- **`title`**:
+  - A Pydantic-generated, human-readable title for the field.
+
+- **`default`**:
+  - The default value for this field if no value is provided by the user.
+
+### Schema
+
+Below is the Pydantic schema:
+
+{schema}
+
+### Examples
+
+Below is an example of well-formed output adhering to this schema.
+
+- Dummy text strings are represented as "lorem ipsum."
+
+{example}
+"""
+
+
+def get_dummy_value(field_value):
+    """A function to return a dummy value of a Pydantic model field."""
+    type_str = field_value["type"]
+    example_dict = {
+        "string": "lorem ipsum",
+        "int": 3,
+        "number": 42.0,
+        "null": None,
+        "object": {"lorem ipsum": "lorem_ipsum"},
+    }
+
+    if type_str in example_dict:
+        return example_dict[type_str]
+    else:
+        return None
+
+
+def get_dummy_array(field_value):
+    """A function to return a dummy array of a Pydantic model field."""
+    items = field_value["items"]
+
+    if "type" in items:
+        if items["type"] == "null":  # skip if null
+            pass
+        elif items["type"] == "array":  # is it an array?
+            array_value = get_dummy_array(items)
+        elif (
+            items["type"] == "object" and "additionalProperties" in items
+        ):  # is it a dict?
+            array_value = get_dummy_dict(items)
+        else:  # it is a regular value!
+            array_value = get_dummy_value(items)
+        return [array_value for _ in range(2)]
+
+    elif "AnyOf" in field_value["items"]:
+        array_value = get_any_of(field_value["items"])  # can be one of many types
+        return [array_value for _ in range(2)]
+
+    else:  # is it a pydantic class?
+        array_value = example_generator(items)
+        return [array_value for _ in range(2)]
+
+
+def get_dummy_dict(field_value):
+    """A function to return a dummy dictionary of a Pydantic model field."""
+    return get_dummy_value(field_value)
+
+
+def get_any_of(field_value):
+    """A function to return the first viable pydantic type of an Any() Pydantic model field."""
+    for any_of in field_value["anyOf"]:
+        if "type" in any_of:
+            if any_of["type"] == "null":  # skip if null
+                continue
+            elif any_of["type"] == "array":  # is it an array?
+                out = get_dummy_array(any_of)
+                return out
+            elif (
+                any_of["type"] == "object" and "additionalProperties" in any_of
+            ):  # is it a dict?
+                out = get_dummy_dict(any_of)
+                return out
+            else:  # it is a regular value!
+                out = get_dummy_value(any_of)
+                return out
+        else:  # is it a pydantic class?
+            out = example_generator(any_of)
+            return out
+
+
+def example_generator(pydantic_json_schema):
+    """dynamically parse a pydantic object and generate an example of it's formatting."""
+
+    example_dict = {}
+    for schema_name, schema in pydantic_json_schema.items():
+
+        for field_name, field_value in schema.items():
+            if "type" in field_value:
+
+                if field_value["type"] == "array":  # is it an array?
+                    example_dict[field_name] = get_dummy_array(field_value)
+
+                elif (
+                    field_value["type"] == "object"
+                    and "additionalProperties" in field_value
+                ):  # is it a dict?
+                    example_dict[field_name] = get_dummy_dict(field_value)
+
+                else:  # it is a regular value!
+                    example_dict[field_name] = get_dummy_value(field_value)
+
+            elif "anyOf" in field_value:
+                example_dict[field_name] = get_any_of(field_value)
+
+            else:  # it is a pydantic class
+                example_dict[field_name] = example_generator(field_value)
+    return example_dict
+
+
+def search_and_replace_refs(schema, defs, ref_skip={}, n=0):
+    """Dynamically substitute subclass references in a Pydantic object schema."""
+    for key, value in schema.items():
+        if key in ref_skip:
+            continue
+        if type(value) is dict:
+            if "$ref" in value:
+                definition_key = value["$ref"].split("/")[-1]
+                if definition_key in ref_skip:
+                    schema[key] = {"type": "null"}
+                else:
+                    schema[key] = {definition_key: defs[definition_key]["properties"]}
+            else:
+                search_and_replace_refs(value, defs, ref_skip, n + 1)
+        elif type(value) is list:
+            for val in value:
+                search_and_replace_refs(val, defs, ref_skip, n + 1)
+
+
+def remove_extraneous_fields(schema, ref_skip):
+    """Remove extraneous fields from object descriptions."""
+    reduced_schema = schema["properties"]
+
+    for ref in ref_skip.keys():
+        if ref in reduced_schema:
+            del reduced_schema[ref]
+
+    for key, value in reduced_schema.items():
+        if "title" in value:
+            del value["title"]
+        if "$defs" in value:
+            del value["$defs"]
+        if "required" in value:
+            del value["required"]
+
+    return reduced_schema
+
+
+def format_for_prompt(pydantic_object, ref_skip={}):
+    """Format a Pydantic object description for prompting an LLM."""
+    schema = {k: v for k, v in pydantic_object.schema().items()}
+
+    search_and_replace_refs(
+        schema=schema["properties"], defs=schema["$defs"], ref_skip=ref_skip, n=0
+    )
+
+    reduced_schema = remove_extraneous_fields(schema, ref_skip)
+
+    reduced_schema = {schema["title"]: reduced_schema}
+
+    out = pprint.pformat(reduced_schema)
+
+    return out, reduced_schema
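A sketch of how these new `pydantic_utils` helpers could be combined to build a formatting prompt. The `Order`/`LineItem` models are hypothetical, and the sketch assumes Pydantic v2 and a model with at least one nested submodel, since `format_for_prompt` indexes `schema["$defs"]` directly:

```python
from typing import List, Optional

from pydantic import BaseModel, Field

from mindsdb.integrations.utilities.pydantic_utils import (
    example_generator,
    format_for_prompt,
    pydantic_schema_description,
)


class LineItem(BaseModel):
    name: str = Field(description='Product name')
    quantity: int = Field(default=1, description='Units ordered')


class Order(BaseModel):
    order_id: str = Field(description='Unique order identifier')
    items: List[LineItem] = Field(description='Products in the order')
    note: Optional[str] = Field(default=None, description='Free-form note')


# Flatten the JSON schema (substituting the LineItem $ref) and pretty-print it.
schema_text, reduced_schema = format_for_prompt(Order)

# Generate a dummy example ("lorem ipsum" strings) matching the reduced schema.
example = example_generator(reduced_schema)

# Fill the prompt template shipped in pydantic_utils with the schema and example.
prompt = pydantic_schema_description.format(schema=schema_text, example=example)
print(prompt)
```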
mindsdb/integrations/utilities/rag/chains/local_context_summarizer_chain.py
ADDED

@@ -0,0 +1,227 @@
+import asyncio
+from collections import namedtuple
+from typing import Any, Dict, List, Optional
+
+from mindsdb.interfaces.agents.langchain_agent import create_chat_model
+from langchain.chains.base import Chain
+from langchain.chains.combine_documents.stuff import StuffDocumentsChain
+from langchain.chains.llm import LLMChain
+from langchain.chains.combine_documents.map_reduce import MapReduceDocumentsChain, ReduceDocumentsChain
+from langchain_core.callbacks import dispatch_custom_event
+from langchain_core.callbacks.manager import CallbackManagerForChainRun
+from langchain_core.documents import Document
+from langchain_core.prompts import PromptTemplate
+from pandas import DataFrame
+
+from mindsdb.integrations.libs.vectordatabase_handler import VectorStoreHandler
+from mindsdb.integrations.utilities.rag.settings import SummarizationConfig
+from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
+from mindsdb.utilities import log
+
+logger = log.getLogger(__name__)
+
+Summary = namedtuple('Summary', ['source_id', 'content'])
+
+
+def create_map_reduce_documents_chain(summarization_config: SummarizationConfig, input: str) -> ReduceDocumentsChain:
+    """Creates a chain that map-reduces documents into a single consolidated summary."""
+    summarization_llm = create_chat_model({
+        'model_name': summarization_config.llm_config.model_name,
+        'provider': summarization_config.llm_config.provider,
+        **summarization_config.llm_config.params
+    })
+
+    reduce_prompt_template = summarization_config.reduce_prompt_template
+    reduce_prompt = PromptTemplate.from_template(reduce_prompt_template)
+    if 'input' in reduce_prompt.input_variables:
+        reduce_prompt = reduce_prompt.partial(input=input)
+
+    reduce_chain = LLMChain(llm=summarization_llm, prompt=reduce_prompt)
+
+    combine_documents_chain = StuffDocumentsChain(
+        llm_chain=reduce_chain,
+        document_variable_name='docs'
+    )
+
+    return ReduceDocumentsChain(
+        combine_documents_chain=combine_documents_chain,
+        collapse_documents_chain=combine_documents_chain,
+        token_max=summarization_config.max_summarization_tokens
+    )
+
+
+class LocalContextSummarizerChain(Chain):
+    """Summarizes M chunks before and after a given chunk in a document."""
+
+    doc_id_key: str = 'original_row_id'
+    chunk_index_key: str = 'chunk_index'
+
+    vector_store_handler: VectorStoreHandler
+    table_name: str = 'embeddings'
+    content_column_name: str = 'content'
+    metadata_column_name: str = 'metadata'
+
+    summarization_config: SummarizationConfig
+    map_reduce_documents_chain: Optional[ReduceDocumentsChain] = None
+
+    def _select_chunks_from_vector_store(self, doc_id: str) -> DataFrame:
+        condition = FilterCondition(
+            f"{self.metadata_column_name}->>'{self.doc_id_key}'",
+            FilterOperator.EQUAL,
+            doc_id
+        )
+        return self.vector_store_handler.select(
+            self.table_name,
+            columns=[self.content_column_name, self.metadata_column_name],
+            conditions=[condition]
+        )
+
+    async def _get_all_chunks_for_document(self, doc_id: str) -> List[Document]:
+        df = await asyncio.get_event_loop().run_in_executor(
+            None, self._select_chunks_from_vector_store, doc_id
+        )
+        chunks = []
+        for _, row in df.iterrows():
+            metadata = row.get(self.metadata_column_name, {})
+            metadata[self.chunk_index_key] = row.get('chunk_id', 0)
+            chunks.append(Document(page_content=row[self.content_column_name], metadata=metadata))
+
+        return sorted(chunks, key=lambda x: x.metadata.get(self.chunk_index_key, 0))
+
+    async def summarize_local_context(self, doc_id: str, target_chunk_index: int, M: int) -> Summary:
+        """
+        Summarizes M chunks before and after the given chunk.
+
+        Args:
+            doc_id (str): Document ID.
+            target_chunk_index (int): Index of the chunk to summarize around.
+            M (int): Number of chunks before and after to include.
+
+        Returns:
+            Summary: Summary object containing source_id and summary content.
+        """
+        logger.debug(f"Fetching chunks for document {doc_id}")
+        all_chunks = await self._get_all_chunks_for_document(doc_id)
+
+        if not all_chunks:
+            logger.warning(f"No chunks found for document {doc_id}")
+            return Summary(source_id=doc_id, content='')
+
+        # Determine window boundaries
+        start_idx = max(0, target_chunk_index - M)
+        end_idx = min(len(all_chunks), target_chunk_index + M + 1)
+        local_chunks = all_chunks[start_idx:end_idx]
+
+        logger.debug(f"Summarizing chunks {start_idx} to {end_idx - 1} for document {doc_id}")
+
+        if not self.map_reduce_documents_chain:
+            self.map_reduce_documents_chain = create_map_reduce_documents_chain(
+                self.summarization_config, input="Summarize these chunks."
+            )
+
+        summary_result = await self.map_reduce_documents_chain.ainvoke(local_chunks)
+        summary_text = summary_result.get('output_text', '')
+
+        logger.debug(f"Generated summary: {summary_text[:100]}...")
+
+        return Summary(source_id=doc_id, content=summary_text)
+
+    @property
+    def input_keys(self) -> List[str]:
+        return [self.context_key, self.question_key]
+
+    @property
+    def output_keys(self) -> List[str]:
+        return [self.context_key, self.question_key]
+
+    async def _get_source_summary(self, source_id: str, map_reduce_documents_chain: MapReduceDocumentsChain) -> Summary:
+        if not source_id:
+            logger.warning("Received empty source_id, returning empty summary")
+            return Summary(source_id='', content='')
+
+        logger.debug(f"Getting summary for source ID: {source_id}")
+        source_chunks = await self._get_all_chunks_for_document(source_id)
+
+        if not source_chunks:
+            logger.warning(f"No chunks found for source ID: {source_id}")
+            return Summary(source_id=source_id, content='')
+
+        logger.debug(f"Summarizing {len(source_chunks)} chunks for source ID: {source_id}")
+        summary = await map_reduce_documents_chain.ainvoke(source_chunks)
+        content = summary.get('output_text', '')
+        logger.debug(f"Generated summary for source ID {source_id}: {content[:100]}...")
+
+        # Stream summarization update.
+        dispatch_custom_event('summary', {'source_id': source_id, 'content': content})
+
+        return Summary(source_id=source_id, content=content)
+
+    async def _get_source_summaries(self, source_ids: List[str], map_reduce_documents_chain: MapReduceDocumentsChain) -> \
+            List[Summary]:
+        summaries = await asyncio.gather(
+            *[self._get_source_summary(source_id, map_reduce_documents_chain) for source_id in source_ids]
+        )
+        return summaries
+
+    def _call(
+        self,
+        inputs: Dict[str, Any],
+        run_manager: Optional[CallbackManagerForChainRun] = None
+    ) -> Dict[str, Any]:
+        # Step 1: Connect to vector store to ensure embeddings are accessible
+        self.vector_store_handler.connect()
+
+        context_chunks: List[Document] = inputs.get(self.context_key, [])
+        logger.debug(f"Found {len(context_chunks)} context chunks.")
+
+        # Step 2: Extract unique document IDs from the provided chunks
+        unique_document_ids = self._get_document_ids_from_chunks(context_chunks)
+        logger.debug(f"Extracted {len(unique_document_ids)} unique document IDs: {unique_document_ids}")
+
+        # Step 3: Initialize the summarization chain if not provided
+        question = inputs.get(self.question_key, '')
+        map_reduce_documents_chain = self.map_reduce_documents_chain or create_map_reduce_documents_chain(
+            self.summarization_config, question
+        )
+
+        # Step 4: Dispatch event to signal summarization start
+        if run_manager:
+            run_manager.on_text("Starting summarization for documents.", verbose=True)
+
+        # Step 5: Process each document ID to summarize chunks with local context
+        for doc_id in unique_document_ids:
+            logger.debug(f"Fetching and summarizing chunks for document ID: {doc_id}")
+
+            # Fetch all chunks for the document
+            chunks = asyncio.get_event_loop().run_until_complete(self._get_all_chunks_for_document(doc_id))
+            if not chunks:
+                logger.warning(f"No chunks found for document ID: {doc_id}")
+                continue
+
+            # Summarize each chunk with M neighboring chunks
+            M = self.neighbor_window
+            for i, chunk in enumerate(chunks):
+                window_chunks = chunks[max(0, i - M): min(len(chunks), i + M + 1)]
+                local_summary = asyncio.get_event_loop().run_until_complete(
+                    map_reduce_documents_chain.ainvoke(window_chunks)
+                )
+                chunk.metadata['summary'] = local_summary.get('output_text', '')
+                logger.debug(f"Chunk {i} summary: {chunk.metadata['summary'][:100]}...")
+
+        # Step 6: Update the original context chunks with the newly generated summaries
+        for chunk in context_chunks:
+            doc_id = str(chunk.metadata.get(self.doc_id_key, ''))
+            matching_chunk = next((c for c in chunks if c.metadata.get(self.doc_id_key) == doc_id and c.metadata.get(
+                'chunk_index') == chunk.metadata.get('chunk_index')), None)
+            if matching_chunk:
+                chunk.metadata['summary'] = matching_chunk.metadata.get('summary', '')
+            else:
+                chunk.metadata['summary'] = ''
+                logger.warning(f"No matching chunk found for doc_id: {doc_id}")
+
+        # Step 7: Signal summarization end
+        if run_manager:
+            run_manager.on_text("Summarization completed.", verbose=True)
+
+        logger.debug(f"Updated {len(context_chunks)} context chunks with summaries.")
+        return inputs
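The core of `LocalContextSummarizerChain` is the neighbour window it summarizes around each retrieved chunk: M chunks before and after the target, clamped to the document boundaries. A standalone sketch of just that windowing step, using dummy string chunks:

```python
from typing import List


def local_context_window(chunks: List[str], target_chunk_index: int, m: int) -> List[str]:
    # Mirrors the boundary logic in summarize_local_context above.
    start_idx = max(0, target_chunk_index - m)
    end_idx = min(len(chunks), target_chunk_index + m + 1)
    return chunks[start_idx:end_idx]


chunks = [f"chunk-{i}" for i in range(10)]
print(local_context_window(chunks, target_chunk_index=1, m=2))  # chunk-0 .. chunk-3 (clamped at the start)
print(local_context_window(chunks, target_chunk_index=8, m=2))  # chunk-6 .. chunk-9 (clamped at the end)
```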
mindsdb/integrations/utilities/rag/pipelines/rag.py
CHANGED

@@ -294,16 +294,23 @@ class LangChainRAGPipeline:
         retriever = SQLRetriever(
             fallback_retriever=vector_store_retriever,
             vector_store_handler=knowledge_base_table.get_vector_db(),
-
-
+            min_k=retriever_config.min_k,
+            max_filters=retriever_config.max_filters,
+            filter_threshold=retriever_config.filter_threshold,
+            database_schema=retriever_config.database_schema,
             embeddings_model=embeddings,
+            search_kwargs=config.search_kwargs,
             rewrite_prompt_template=retriever_config.rewrite_prompt_template,
-
+            table_prompt_template=retriever_config.table_prompt_template,
+            column_prompt_template=retriever_config.column_prompt_template,
+            value_prompt_template=retriever_config.value_prompt_template,
+            boolean_system_prompt=retriever_config.boolean_system_prompt,
+            generative_system_prompt=retriever_config.generative_system_prompt,
             num_retries=retriever_config.num_retries,
             embeddings_table=knowledge_base_table._kb.vector_database_table,
             source_table=retriever_config.source_table,
+            source_id_column=retriever_config.source_id_column,
             distance_function=distance_function,
-            search_kwargs=config.search_kwargs,
             llm=sql_llm
         )
         return cls(
|