MindsDB 25.7.4.0__py3-none-any.whl → 25.8.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +11 -1
- mindsdb/api/executor/command_executor.py +9 -15
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +21 -24
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +9 -3
- mindsdb/api/executor/sql_query/steps/subselect_step.py +11 -8
- mindsdb/api/executor/utilities/mysql_to_duckdb_functions.py +264 -0
- mindsdb/api/executor/utilities/sql.py +30 -0
- mindsdb/api/http/initialize.py +2 -1
- mindsdb/api/http/namespaces/views.py +56 -72
- mindsdb/integrations/handlers/db2_handler/db2_handler.py +19 -23
- mindsdb/integrations/handlers/gong_handler/__about__.py +2 -0
- mindsdb/integrations/handlers/gong_handler/__init__.py +30 -0
- mindsdb/integrations/handlers/gong_handler/connection_args.py +37 -0
- mindsdb/integrations/handlers/gong_handler/gong_handler.py +164 -0
- mindsdb/integrations/handlers/gong_handler/gong_tables.py +508 -0
- mindsdb/integrations/handlers/gong_handler/icon.svg +25 -0
- mindsdb/integrations/handlers/gong_handler/test_gong_handler.py +125 -0
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +8 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +203 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +360 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -7
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -7
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +1 -2
- mindsdb/integrations/handlers/openai_handler/constants.py +11 -30
- mindsdb/integrations/handlers/openai_handler/helpers.py +27 -34
- mindsdb/integrations/handlers/openai_handler/openai_handler.py +14 -12
- mindsdb/integrations/handlers/salesforce_handler/constants.py +9 -2
- mindsdb/integrations/libs/llm/config.py +0 -14
- mindsdb/integrations/libs/llm/utils.py +0 -15
- mindsdb/integrations/utilities/files/file_reader.py +5 -19
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +1 -1
- mindsdb/interfaces/agents/agents_controller.py +83 -45
- mindsdb/interfaces/agents/constants.py +0 -1
- mindsdb/interfaces/agents/langchain_agent.py +1 -3
- mindsdb/interfaces/database/projects.py +111 -7
- mindsdb/interfaces/knowledge_base/controller.py +7 -1
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +6 -10
- mindsdb/interfaces/knowledge_base/preprocessing/text_splitter.py +73 -0
- mindsdb/interfaces/query_context/context_controller.py +14 -15
- mindsdb/utilities/config.py +2 -0
- mindsdb/utilities/fs.py +54 -17
- {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/METADATA +278 -263
- {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/RECORD +49 -48
- mindsdb/integrations/handlers/anyscale_endpoints_handler/__about__.py +0 -9
- mindsdb/integrations/handlers/anyscale_endpoints_handler/__init__.py +0 -20
- mindsdb/integrations/handlers/anyscale_endpoints_handler/anyscale_endpoints_handler.py +0 -290
- mindsdb/integrations/handlers/anyscale_endpoints_handler/creation_args.py +0 -14
- mindsdb/integrations/handlers/anyscale_endpoints_handler/icon.svg +0 -4
- mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -2
- mindsdb/integrations/handlers/anyscale_endpoints_handler/settings.py +0 -51
- mindsdb/integrations/handlers/anyscale_endpoints_handler/tests/test_anyscale_endpoints_handler.py +0 -212
- /mindsdb/integrations/handlers/{anyscale_endpoints_handler/tests/__init__.py → gong_handler/requirements.txt} +0 -0
- {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/top_level.txt +0 -0

mindsdb/interfaces/agents/agents_controller.py
CHANGED

@@ -145,11 +145,60 @@ class AgentsController:
 
         return all_agents.all()
 
+    def _create_default_sql_skill(
+        self,
+        name,
+        project_name,
+        include_tables: List[str] = None,
+        include_knowledge_bases: List[str] = None,
+    ):
+        # Create a default SQL skill
+        skill_name = f"{name}_sql_skill"
+        skill_params = {
+            "type": "sql",
+            "description": f"Auto-generated SQL skill for agent {name}",
+        }
+
+        # Add restrictions provided
+        if include_tables:
+            skill_params["include_tables"] = include_tables
+        if include_knowledge_bases:
+            skill_params["include_knowledge_bases"] = include_knowledge_bases
+
+        try:
+            # Check if skill already exists
+            existing_skill = self.skills_controller.get_skill(skill_name, project_name)
+            if existing_skill is None:
+                # Create the skill
+                skill_type = skill_params.pop("type")
+                self.skills_controller.add_skill(
+                    name=skill_name, project_name=project_name, type=skill_type, params=skill_params
+                )
+            else:
+                # Update the skill if parameters have changed
+                params_changed = False
+
+                # Check if skill parameters need to be updated
+                for param_key, param_value in skill_params.items():
+                    if existing_skill.params.get(param_key) != param_value:
+                        existing_skill.params[param_key] = param_value
+                        params_changed = True
+
+                # Update the skill if needed
+                if params_changed:
+                    flag_modified(existing_skill, "params")
+                    db.session.commit()
+
+        except Exception as e:
+            raise ValueError(f"Failed to auto-create or update SQL skill: {str(e)}")
+
+        return skill_name
+
     def add_agent(
         self,
         name: str,
         project_name: str = None,
-        model_name: str = None,
+        model_name: Union[str, dict] = None,
         skills: List[Union[str, dict]] = None,
         provider: str = None,
         params: Dict[str, Any] = None,

@@ -256,46 +305,13 @@ class AgentsController:
 
         # Auto-create SQL skill if no skills are provided but include_tables or include_knowledge_bases params are provided
         if not skills and (include_tables or include_knowledge_bases):
-            # Create a default SQL skill
-            skill_name = f"{name}_sql_skill"
-            skill_params = {
-                "type": "sql",
-                "description": f"Auto-generated SQL skill for agent {name}",
-            }
-
-            # Add restrictions provided
-            if include_tables:
-                skill_params["include_tables"] = include_tables
-            if include_knowledge_bases:
-                skill_params["include_knowledge_bases"] = include_knowledge_bases
-
-            try:
-                # Check if skill already exists
-                existing_skill = self.skills_controller.get_skill(skill_name, project_name)
-                if existing_skill is None:
-                    # Create the skill
-                    skill_type = skill_params.pop("type")
-                    self.skills_controller.add_skill(
-                        name=skill_name, project_name=project_name, type=skill_type, params=skill_params
-                    )
-                else:
-                    # Update the skill if parameters have changed
-                    params_changed = False
-
-                    # Check if skill parameters need to be updated
-                    for param_key, param_value in skill_params.items():
-                        if existing_skill.params.get(param_key) != param_value:
-                            existing_skill.params[param_key] = param_value
-                            params_changed = True
-
-                    # Update the skill if needed
-                    if params_changed:
-                        flag_modified(existing_skill, "params")
-                        db.session.commit()
-
-                skills = [skill_name]
-            except Exception as e:
-                raise ValueError(f"Failed to auto-create or update SQL skill: {str(e)}")
+            skill = self._create_default_sql_skill(
+                name,
+                project_name,
+                include_tables=include_tables,
+                include_knowledge_bases=include_knowledge_bases,
+            )
+            skills = [skill]
 
         agent = db.Agents(
             name=name,

@@ -351,7 +367,7 @@
         agent_name: str,
         project_name: str = default_project,
         name: str = None,
-        model_name: str = None,
+        model_name: Union[str, dict] = None,
         skills_to_add: List[Union[str, dict]] = None,
         skills_to_remove: List[str] = None,
         skills_to_rewrite: List[Union[str, dict]] = None,

@@ -365,7 +381,7 @@
             agent_name (str): The name of the new agent, or existing agent to update
             project_name (str): The containing project
             name (str): The updated name of the agent
-            model_name (str): The name of the existing ML model the agent will use
+            model_name (str | dict): The name of the existing ML model the agent will use
             skills_to_add (List[Union[str, dict]]): List of skill names to add to the agent, or list of dicts
                 with one of keys is "name", and other is additional parameters for relationship agent<>skill
             skills_to_remove (List[str]): List of skill names to remove from the agent

@@ -394,6 +410,8 @@
         existing_agent = self.get_agent(agent_name, project_name=project_name)
         if existing_agent is None:
             raise EntityNotExistsError(f"Agent with name not found: {agent_name}")
+        existing_params = existing_agent.params or {}
+
         is_demo = (existing_agent.params or {}).get("is_demo", False)
         if is_demo and (
             (name is not None and name != agent_name)

@@ -413,12 +431,34 @@
            existing_agent.name = name
 
        if model_name or provider:
+            if isinstance(model_name, dict):
+                # move into params
+                existing_params["model"] = model_name
+                model_name = None
+
            # check model and provider
            model, provider = self.check_model_provider(model_name, provider)
            # Update model and provider
            existing_agent.model_name = model_name
            existing_agent.provider = provider
 
+        if "data" in params:
+            if len(skills_to_add) > 0 or len(skills_to_remove) > 0:
+                raise ValueError(
+                    "'data' parameter cannot be used with 'skills_to_remove' or 'skills_to_add' parameters"
+                )
+
+            include_knowledge_bases = params["data"].get("knowledge_bases")
+            include_tables = params["data"].get("tables")
+
+            skill = self._create_default_sql_skill(
+                agent_name,
+                project_name,
+                include_tables=include_tables,
+                include_knowledge_bases=include_knowledge_bases,
+            )
+            skills_to_rewrite = [{"name": skill}]
+
        # check that all skills exist
        skill_name_to_record_map = {}
        for skill_meta in skills_to_add + skills_to_remove + skills_to_rewrite:

@@ -496,8 +536,6 @@
                 db.session.add(association)
 
         if params is not None:
-            existing_params = existing_agent.params or {}
-
             if params.get("data", {}).get("tables"):
                 new_table_entries = set(params["data"]["tables"]) - set(
                     existing_params.get("data", {}).get("tables", [])
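
Note: a minimal sketch of how the auto-created SQL skill above behaves. AgentsController and _create_default_sql_skill are real names from this diff; the agent name, project, tables, and knowledge base are hypothetical, and how include_tables reaches add_agent (direct kwarg vs. params["data"]) is not visible in these hunks.

    from mindsdb.interfaces.agents.agents_controller import AgentsController

    controller = AgentsController()

    # Creates (or updates) a skill named "<agent>_sql_skill" with the given
    # restrictions stored in its params, then returns the skill name.
    skill_name = controller._create_default_sql_skill(
        "sales_agent",                      # hypothetical agent name
        "mindsdb",                          # hypothetical project
        include_tables=["my_db.orders"],
        include_knowledge_bases=["my_kb"],
    )
    # skill_name == "sales_agent_sql_skill"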

mindsdb/interfaces/agents/langchain_agent.py
CHANGED

@@ -11,7 +11,7 @@ import pandas as pd
 from langchain.agents import AgentExecutor
 from langchain.agents.initialize import initialize_agent
 from langchain.chains.conversation.memory import ConversationSummaryBufferMemory
-from langchain_community.chat_models import ChatAnyscale, ChatLiteLLM, ChatOllama
+from langchain_community.chat_models import ChatLiteLLM, ChatOllama
 from langchain_writer import ChatWriter
 from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_core.agents import AgentAction, AgentStep

@@ -165,8 +165,6 @@ def create_chat_model(args: Dict):
        except NotImplementedError:
            chat_open_ai.tiktoken_model_name = DEFAULT_TIKTOKEN_MODEL_NAME
        return chat_open_ai
-    if args["provider"] == "anyscale":
-        return ChatAnyscale(**model_kwargs)
    if args["provider"] == "litellm":
        return ChatLiteLLM(**model_kwargs)
    if args["provider"] == "ollama":

mindsdb/interfaces/database/projects.py
CHANGED

@@ -3,11 +3,12 @@ from copy import deepcopy
 from typing import List, Optional
 from collections import OrderedDict
 
+import pandas as pd
 import sqlalchemy as sa
 import numpy as np
 
 from mindsdb_sql_parser.ast.base import ASTNode
-from mindsdb_sql_parser.ast import Select, Star, Constant, Identifier
+from mindsdb_sql_parser.ast import Select, Star, Constant, Identifier, BinaryOperation
 from mindsdb_sql_parser import parse_sql
 
 from mindsdb.interfaces.storage import db

@@ -109,7 +110,19 @@ class Project:
         """
         ViewController().delete(name, project_name=self.name, strict_case=strict_case)
 
-    def create_view(self, name: str, query: str):
+    def create_view(self, name: str, query: str, session):
+        ast_query = parse_sql(query)
+
+        if isinstance(ast_query, Select):
+            # check create view sql
+            ast_query.limit = Constant(1)
+
+            query_context_controller.set_context(query_context_controller.IGNORE_CONTEXT)
+            try:
+                SQLQuery(ast_query, session=session, database=self.name)
+            finally:
+                query_context_controller.release_context(query_context_controller.IGNORE_CONTEXT)
+
         ViewController().add(name, query=query, project_name=self.name)
 
     def update_view(self, name: str, query: str, strict_case: bool = False):
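
Note: the reworked create_view above dry-runs the view's SELECT with LIMIT 1 before saving it, so a broken definition fails at CREATE VIEW time rather than at first use. A minimal sketch of the same pattern, assuming only the mindsdb_sql_parser names used in the diff (the run callback stands in for SQLQuery):

    from mindsdb_sql_parser import parse_sql
    from mindsdb_sql_parser.ast import Select, Constant

    def validate_view_sql(query: str, run) -> None:
        ast_query = parse_sql(query)
        if isinstance(ast_query, Select):
            ast_query.limit = Constant(1)  # fetch at most one row: a cheap dry run
            run(ast_query)                 # any execution error surfaces here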

@@ -124,21 +137,112 @@ class Project:
         view_meta["query_ast"] = parse_sql(view_meta["query"])
         return view_meta
 
-    def query_view(self, query: Select, session):
+    @staticmethod
+    def combine_view_select(view_query: Select, query: Select) -> Select:
+        """
+        Create a combined query from view's query and outer query.
+        """
+
+        # apply optimizations
+        if query.where is not None:
+            # Get conditions that can be duplicated into view's query
+            # It has to be simple condition with identifier and constant
+            # Also it shouldn't be under the OR condition
+
+            def get_conditions_to_move(node):
+                if not isinstance(node, BinaryOperation):
+                    return []
+                op = node.op.upper()
+                if op == "AND":
+                    conditions = []
+                    conditions.extend(get_conditions_to_move(node.args[0]))
+                    conditions.extend(get_conditions_to_move(node.args[1]))
+                    return conditions
+
+                if op == "OR":
+                    return []
+                if isinstance(node.args[0], (Identifier, Constant)) and isinstance(
+                    node.args[1], (Identifier, Constant)
+                ):
+                    return [node]
+
+            conditions = get_conditions_to_move(query.where)
+
+            if conditions:
+                # analyse targets
+                # if target element has alias
+                #   if element is not identifier or the name is not equal to alias:
+                #     add alias to black list
+                # white list:
+                #   all targets that are identifiers with no alias or equal to its alias
+                # condition can be moved if
+                #   column is not in black list AND (query has star(*) OR column in white list)
+
+                has_star = False
+                white_list, black_list = [], []
+                for target in view_query.targets:
+                    if isinstance(target, Star):
+                        has_star = True
+                    if isinstance(target, Identifier):
+                        name = target.parts[-1].lower()
+                        if target.alias is None or target.alias.parts[-1].lower() == name:
+                            white_list.append(name)
+                        elif target.alias is not None:
+                            black_list.append(target.alias.parts[-1].lower())
+
+                view_where = view_query.where
+                for condition in conditions:
+                    arg1, arg2 = condition.args
+
+                    if isinstance(arg1, Identifier):
+                        name = arg1.parts[-1].lower()
+                        if name in black_list or not (has_star or name in white_list):
+                            continue
+                    if isinstance(arg2, Identifier):
+                        name = arg2.parts[-1].lower()
+                        if name in black_list or not (has_star or name in white_list):
+                            continue
+
+                    # condition can be moved into view
+                    condition2 = BinaryOperation(condition.op, [arg1, arg2])
+                    if view_where is None:
+                        view_where = condition2
+                    else:
+                        view_where = BinaryOperation("AND", args=[view_where, condition2])
+
+                    # disable outer condition
+                    condition.op = "="
+                    condition.args = [Constant(0), Constant(0)]
+
+                view_query.where = view_where
+
+        # combine outer query with view's query
+        view_query.parentheses = True
+        query.from_table = view_query
+        return query
+
+    def query_view(self, query: Select, session) -> pd.DataFrame:
         view_meta = self.get_view_meta(query)
 
         query_context_controller.set_context("view", view_meta["id"])
-
+        query_applied = False
         try:
-            sqlquery = SQLQuery(view_meta["query_ast"], session=session)
+            view_query = view_meta["query_ast"]
+            if isinstance(view_query, Select):
+                view_query = self.combine_view_select(view_query, query)
+                query_applied = True
+
+            sqlquery = SQLQuery(view_query, session=session)
             df = sqlquery.fetched_data.to_df()
         finally:
             query_context_controller.release_context("view", view_meta["id"])
 
         # remove duplicated columns
         df = df.loc[:, ~df.columns.duplicated()]
-
-        return query_df(df, query, session=session)
+        if query_applied:
+            return df
+        else:
+            return query_df(df, query, session=session)
 
     @staticmethod
     def _get_model_data(predictor_record, integraion_record, with_secrets: bool = True):
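
Note: a worked example of the condition pushdown that combine_view_select implements, assuming the mindsdb_sql_parser API used in the diff; the SQL strings in the comments are approximate renderings of the resulting AST.

    from mindsdb_sql_parser import parse_sql
    from mindsdb.interfaces.database.projects import Project

    view_query = parse_sql("SELECT a, b FROM t")
    outer = parse_sql("SELECT * FROM v WHERE a = 1 AND (b = 2 OR c = 3)")

    combined = Project.combine_view_select(view_query, outer)
    # "a = 1" is a simple identifier/constant condition not under an OR, so it
    # is copied into the view's WHERE, and its outer occurrence is neutralized
    # to the tautology "0 = 0". Roughly:
    #   SELECT * FROM (SELECT a, b FROM t WHERE a = 1) WHERE 0 = 0 AND (b = 2 OR c = 3)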

mindsdb/interfaces/knowledge_base/controller.py
CHANGED

@@ -1139,8 +1139,14 @@ class KnowledgeBaseController:
         else:
             vector_db_name, vector_table_name = storage.parts
 
+        data_node = self.session.datahub.get(vector_db_name)
+        if data_node:
+            vector_store_handler = data_node.integration_handler
+        else:
+            raise ValueError(
+                f"Unable to find database named {vector_db_name}, please make sure {vector_db_name} is defined"
+            )
         # create table in vectordb before creating KB
-        vector_store_handler = self.session.datahub.get(vector_db_name).integration_handler
         vector_store_handler.create_table(vector_table_name)
         if keyword_search_enabled:
             vector_store_handler.add_full_text_index(vector_table_name, TableField.CONTENT.value)

mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py
CHANGED

@@ -4,8 +4,7 @@ import asyncio
 from typing import List, Dict, Optional, Any
 
 import pandas as pd
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from langchain_core.documents import Document as LangchainDocument
+from mindsdb.interfaces.knowledge_base.preprocessing.text_splitter import TextSplitter
 
 from mindsdb.integrations.utilities.rag.splitters.file_splitter import (
     FileSplitter,

@@ -22,7 +21,6 @@ from mindsdb.interfaces.knowledge_base.preprocessing.models import (
 )
 from mindsdb.utilities import log
 
-
 logger = log.getLogger(__name__)
 
 _DEFAULT_CONTENT_COLUMN_NAME = "content"

@@ -49,11 +47,10 @@ class DocumentPreprocessor:
         if self.splitter is None:
             raise ValueError("Splitter not configured")
 
-
-        langchain_doc = LangchainDocument(page_content=doc.content, metadata=doc.metadata or {})
+        metadata = doc.metadata or {}
         # Split and convert back to our Document type
-
-        return [Document(content=
+        split_texts = self.splitter.split_text(doc.content)
+        return [Document(content=text, metadata=metadata) for text in split_texts]
 
     def _get_source(self) -> str:
         """Get the source identifier for this preprocessor"""

@@ -266,16 +263,15 @@ Please give a short succinct context to situate this chunk within the overall document
 
 
 class TextChunkingPreprocessor(DocumentPreprocessor):
-    """Default text chunking preprocessor using RecursiveCharacterTextSplitter"""
+    """Default text chunking preprocessor using TextSplitter"""
 
     def __init__(self, config: Optional[TextChunkingConfig] = None):
         """Initialize with text chunking configuration"""
         super().__init__()
         self.config = config or TextChunkingConfig()
-        self.splitter = RecursiveCharacterTextSplitter(
+        self.splitter = TextSplitter(
             chunk_size=self.config.chunk_size,
             chunk_overlap=self.config.chunk_overlap,
-            length_function=self.config.length_function,
             separators=self.config.separators,
         )
 

mindsdb/interfaces/knowledge_base/preprocessing/text_splitter.py
ADDED

@@ -0,0 +1,73 @@
+from typing import List
+
+
+class TextSplitter:
+    def __init__(
+        self,
+        chunk_size: int = 1000,
+        chunk_overlap: int = 200,
+        separators: List[str] = None,
+        k_range: float = 0.5,
+        k_ratio: float = 1,
+    ):
+        """
+        Split text into chunks. The logic:
+        - Get a piece of text with chunk_size and try to find the separator at the end of the piece.
+        - The allowed range to find the separator is defined by k_range and k_ratio using formula:
+            k_range * chunk_size / (num * k_ratio + 1)
+            num - is number of a separator from the list
+        - if the separator is not in the range: switch to the next separator
+        - if the found separator is in the middle of the sentence, use overlapping:
+            - the found text is the current chunk
+            - repeat the search with less strict k_range and k_ratio
+            - the found text will be the beginning of the next chunk
+
+        :param chunk_size: size of the chunk, which must not be exceeded
+        :param separators: list of separators in order of priority
+        :param k_range: defines the range to look for the separator
+        :param k_ratio: defines how much to shrink the range for the next separator
+        """
+        if separators is None:
+            separators = ["\n\n", "\n", ". ", " ", ""]
+        self.chunk_size = chunk_size
+        self.chunk_overlap = chunk_overlap
+        self.separators = separators
+        self.k_range = k_range
+        self.k_ratio = k_ratio
+
+    def split_text(self, text: str) -> List[str]:
+        chunks = []
+
+        while True:
+            if len(text) < self.chunk_size:
+                chunks.append(text)
+                break
+
+            sep, chunk, shift = self.get_next_chunk(text, self.k_range, self.k_ratio)
+            chunks.append(chunk)
+
+            text = text[shift:]
+        return chunks
+
+    def get_next_chunk(self, text: str, k_range: float, k_ratio: float):
+        # returns chunk with separator and shift for the next search iteration
+
+        chunk = text[: self.chunk_size]
+        # positions = []
+        for i, sep in enumerate(self.separators):
+            pos = chunk.rfind(sep)
+
+            vpos = self.chunk_size - pos
+            if vpos < k_range * self.chunk_size / (i * k_ratio + 1):
+                shift = len(sep) + pos
+                if sep.strip(" ") == "":
+                    # overlapping
+                    sep2, _, shift2 = self.get_next_chunk(text, k_range * 1.5, 0)
+                    if sep2.strip(" ") != "":
+                        # use shift of previous separator
+                        if shift - shift2 < self.chunk_overlap:
+                            shift = shift2
+
+                return sep, chunk[:pos], shift
+
+        raise RuntimeError("Cannot split text")
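
Note: a quick usage sketch for the new TextSplitter; the import path is taken from this diff, while the sample text and sizes are arbitrary.

    from mindsdb.interfaces.knowledge_base.preprocessing.text_splitter import TextSplitter

    splitter = TextSplitter(chunk_size=100, chunk_overlap=20)
    text = "MindsDB splits long documents into chunks. " * 10

    for chunk in splitter.split_text(text):
        # chunks break at the highest-priority separator found near the end
        # of each chunk_size window, so every chunk stays within chunk_size
        print(len(chunk), repr(chunk[:40]))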

mindsdb/interfaces/query_context/context_controller.py
CHANGED

@@ -45,7 +45,7 @@ class RunningQuery:
             for df in dn.query_stream(query2, fetch_size=self.batch_size):
                 max_track_value = self.get_max_track_value(df)
                 yield df
-                self.set_progress(
+                self.set_progress(max_track_value=max_track_value)
 
         else:
             while True:

@@ -59,7 +59,7 @@ class RunningQuery:
 
                 max_track_value = self.get_max_track_value(df)
                 yield df
-                self.set_progress(
+                self.set_progress(max_track_value=max_track_value)
 
     def get_partition_query(self, step_num: int, query: Select, stream=False) -> Select:
         """

@@ -178,24 +178,23 @@ class RunningQuery:
             # stream mode
             return None
 
-    def set_progress(self,
+    def set_progress(self, processed_rows: int = None, max_track_value: int = None):
         """
         Store progress of the query, it is called after processing of batch
         """
 
-        if
-
-
-        self.record.processed_rows = self.record.processed_rows + len(df)
-
-        cur_value = self.record.context.get("track_value")
-        new_value = max_track_value
-        if new_value is not None:
-            if cur_value is None or new_value > cur_value:
-                self.record.context["track_value"] = new_value
-                flag_modified(self.record, "context")
+        if processed_rows is not None and processed_rows > 0:
+            self.record.processed_rows = self.record.processed_rows + processed_rows
+            db.session.commit()
 
-
+        if max_track_value is not None:
+            cur_value = self.record.context.get("track_value")
+            new_value = max_track_value
+            if new_value is not None:
+                if cur_value is None or new_value > cur_value:
+                    self.record.context["track_value"] = new_value
+                    flag_modified(self.record, "context")
+                    db.session.commit()
 
     def on_error(self, error: Exception, step_num: int, steps_data: dict):
         """

mindsdb/utilities/config.py
CHANGED

@@ -599,6 +599,7 @@ class Config:
             ml_task_queue_consumer=None,
             agent=None,
             project=None,
+            update_gui=False,
         )
         return
 

@@ -635,6 +636,7 @@ class Config:
             help="MindsDB agent name to connect to",
         )
         parser.add_argument("--project-name", type=str, default=None, help="MindsDB project name")
+        parser.add_argument("--update-gui", action="store_true", default=False, help="Update GUI and exit")
 
         self._cmd_args = parser.parse_args()
 
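
Note: the new --update-gui flag (help text: "Update GUI and exit") suggests the static GUI bundle can now be refreshed without starting the server, presumably via an invocation like python -m mindsdb --update-gui; the handling itself is not shown here, though the __main__.py and fs.py changes in the file list above are consistent with it.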