MindsDB 25.7.4.0__py3-none-any.whl → 25.8.2.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (57)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +11 -1
  3. mindsdb/api/executor/command_executor.py +9 -15
  4. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +21 -24
  5. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +9 -3
  6. mindsdb/api/executor/sql_query/steps/subselect_step.py +11 -8
  7. mindsdb/api/executor/utilities/mysql_to_duckdb_functions.py +264 -0
  8. mindsdb/api/executor/utilities/sql.py +30 -0
  9. mindsdb/api/http/initialize.py +2 -1
  10. mindsdb/api/http/namespaces/views.py +56 -72
  11. mindsdb/integrations/handlers/db2_handler/db2_handler.py +19 -23
  12. mindsdb/integrations/handlers/gong_handler/__about__.py +2 -0
  13. mindsdb/integrations/handlers/gong_handler/__init__.py +30 -0
  14. mindsdb/integrations/handlers/gong_handler/connection_args.py +37 -0
  15. mindsdb/integrations/handlers/gong_handler/gong_handler.py +164 -0
  16. mindsdb/integrations/handlers/gong_handler/gong_tables.py +508 -0
  17. mindsdb/integrations/handlers/gong_handler/icon.svg +25 -0
  18. mindsdb/integrations/handlers/gong_handler/test_gong_handler.py +125 -0
  19. mindsdb/integrations/handlers/huggingface_handler/__init__.py +8 -12
  20. mindsdb/integrations/handlers/huggingface_handler/finetune.py +203 -223
  21. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +360 -383
  22. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -7
  23. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -7
  24. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  25. mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +1 -2
  26. mindsdb/integrations/handlers/openai_handler/constants.py +11 -30
  27. mindsdb/integrations/handlers/openai_handler/helpers.py +27 -34
  28. mindsdb/integrations/handlers/openai_handler/openai_handler.py +14 -12
  29. mindsdb/integrations/handlers/salesforce_handler/constants.py +9 -2
  30. mindsdb/integrations/libs/llm/config.py +0 -14
  31. mindsdb/integrations/libs/llm/utils.py +0 -15
  32. mindsdb/integrations/utilities/files/file_reader.py +5 -19
  33. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +1 -1
  34. mindsdb/interfaces/agents/agents_controller.py +83 -45
  35. mindsdb/interfaces/agents/constants.py +0 -1
  36. mindsdb/interfaces/agents/langchain_agent.py +1 -3
  37. mindsdb/interfaces/database/projects.py +111 -7
  38. mindsdb/interfaces/knowledge_base/controller.py +7 -1
  39. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +6 -10
  40. mindsdb/interfaces/knowledge_base/preprocessing/text_splitter.py +73 -0
  41. mindsdb/interfaces/query_context/context_controller.py +14 -15
  42. mindsdb/utilities/config.py +2 -0
  43. mindsdb/utilities/fs.py +54 -17
  44. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/METADATA +278 -263
  45. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/RECORD +49 -48
  46. mindsdb/integrations/handlers/anyscale_endpoints_handler/__about__.py +0 -9
  47. mindsdb/integrations/handlers/anyscale_endpoints_handler/__init__.py +0 -20
  48. mindsdb/integrations/handlers/anyscale_endpoints_handler/anyscale_endpoints_handler.py +0 -290
  49. mindsdb/integrations/handlers/anyscale_endpoints_handler/creation_args.py +0 -14
  50. mindsdb/integrations/handlers/anyscale_endpoints_handler/icon.svg +0 -4
  51. mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -2
  52. mindsdb/integrations/handlers/anyscale_endpoints_handler/settings.py +0 -51
  53. mindsdb/integrations/handlers/anyscale_endpoints_handler/tests/test_anyscale_endpoints_handler.py +0 -212
  54. mindsdb/integrations/handlers/{anyscale_endpoints_handler/tests/__init__.py → gong_handler/requirements.txt} +0 -0
  55. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/WHEEL +0 -0
  56. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/licenses/LICENSE +0 -0
  57. {mindsdb-25.7.4.0.dist-info → mindsdb-25.8.2.0.dist-info}/top_level.txt +0 -0
@@ -145,11 +145,60 @@ class AgentsController:
145
145
 
146
146
  return all_agents.all()
147
147
 
148
+ def _create_default_sql_skill(
149
+ self,
150
+ name,
151
+ project_name,
152
+ include_tables: List[str] = None,
153
+ include_knowledge_bases: List[str] = None,
154
+ ):
155
+ # Create a default SQL skill
156
+ skill_name = f"{name}_sql_skill"
157
+ skill_params = {
158
+ "type": "sql",
159
+ "description": f"Auto-generated SQL skill for agent {name}",
160
+ }
161
+
162
+ # Add restrictions provided
163
+ if include_tables:
164
+ skill_params["include_tables"] = include_tables
165
+ if include_knowledge_bases:
166
+ skill_params["include_knowledge_bases"] = include_knowledge_bases
167
+
168
+ try:
169
+ # Check if skill already exists
170
+ existing_skill = self.skills_controller.get_skill(skill_name, project_name)
171
+ if existing_skill is None:
172
+ # Create the skill
173
+ skill_type = skill_params.pop("type")
174
+ self.skills_controller.add_skill(
175
+ name=skill_name, project_name=project_name, type=skill_type, params=skill_params
176
+ )
177
+ else:
178
+ # Update the skill if parameters have changed
179
+ params_changed = False
180
+
181
+ # Check if skill parameters need to be updated
182
+ for param_key, param_value in skill_params.items():
183
+ if existing_skill.params.get(param_key) != param_value:
184
+ existing_skill.params[param_key] = param_value
185
+ params_changed = True
186
+
187
+ # Update the skill if needed
188
+ if params_changed:
189
+ flag_modified(existing_skill, "params")
190
+ db.session.commit()
191
+
192
+ except Exception as e:
193
+ raise ValueError(f"Failed to auto-create or update SQL skill: {str(e)}")
194
+
195
+ return skill_name
196
+
148
197
  def add_agent(
149
198
  self,
150
199
  name: str,
151
200
  project_name: str = None,
152
- model_name: str = None,
201
+ model_name: Union[str, dict] = None,
153
202
  skills: List[Union[str, dict]] = None,
154
203
  provider: str = None,
155
204
  params: Dict[str, Any] = None,
@@ -256,46 +305,13 @@ class AgentsController:
256
305
 
257
306
  # Auto-create SQL skill if no skills are provided but include_tables or include_knowledge_bases params are provided
258
307
  if not skills and (include_tables or include_knowledge_bases):
259
- # Create a default SQL skill
260
- skill_name = f"{name}_sql_skill"
261
- skill_params = {
262
- "type": "sql",
263
- "description": f"Auto-generated SQL skill for agent {name}",
264
- }
265
-
266
- # Add restrictions provided
267
- if include_tables:
268
- skill_params["include_tables"] = include_tables
269
- if include_knowledge_bases:
270
- skill_params["include_knowledge_bases"] = include_knowledge_bases
271
-
272
- try:
273
- # Check if skill already exists
274
- existing_skill = self.skills_controller.get_skill(skill_name, project_name)
275
- if existing_skill is None:
276
- # Create the skill
277
- skill_type = skill_params.pop("type")
278
- self.skills_controller.add_skill(
279
- name=skill_name, project_name=project_name, type=skill_type, params=skill_params
280
- )
281
- else:
282
- # Update the skill if parameters have changed
283
- params_changed = False
284
-
285
- # Check if skill parameters need to be updated
286
- for param_key, param_value in skill_params.items():
287
- if existing_skill.params.get(param_key) != param_value:
288
- existing_skill.params[param_key] = param_value
289
- params_changed = True
290
-
291
- # Update the skill if needed
292
- if params_changed:
293
- flag_modified(existing_skill, "params")
294
- db.session.commit()
295
-
296
- skills = [skill_name]
297
- except Exception as e:
298
- raise ValueError(f"Failed to auto-create or update SQL skill: {str(e)}")
308
+ skill = self._create_default_sql_skill(
309
+ name,
310
+ project_name,
311
+ include_tables=include_tables,
312
+ include_knowledge_bases=include_knowledge_bases,
313
+ )
314
+ skills = [skill]
299
315
 
300
316
  agent = db.Agents(
301
317
  name=name,
@@ -351,7 +367,7 @@ class AgentsController:
351
367
  agent_name: str,
352
368
  project_name: str = default_project,
353
369
  name: str = None,
354
- model_name: str = None,
370
+ model_name: Union[str, dict] = None,
355
371
  skills_to_add: List[Union[str, dict]] = None,
356
372
  skills_to_remove: List[str] = None,
357
373
  skills_to_rewrite: List[Union[str, dict]] = None,
@@ -365,7 +381,7 @@ class AgentsController:
365
381
  agent_name (str): The name of the new agent, or existing agent to update
366
382
  project_name (str): The containing project
367
383
  name (str): The updated name of the agent
368
- model_name (str): The name of the existing ML model the agent will use
384
+ model_name (str | dict): The name of the existing ML model the agent will use
369
385
  skills_to_add (List[Union[str, dict]]): List of skill names to add to the agent, or list of dicts
370
386
  with one of keys is "name", and other is additional parameters for relationship agent<>skill
371
387
  skills_to_remove (List[str]): List of skill names to remove from the agent
@@ -394,6 +410,8 @@ class AgentsController:
394
410
  existing_agent = self.get_agent(agent_name, project_name=project_name)
395
411
  if existing_agent is None:
396
412
  raise EntityNotExistsError(f"Agent with name not found: {agent_name}")
413
+ existing_params = existing_agent.params or {}
414
+
397
415
  is_demo = (existing_agent.params or {}).get("is_demo", False)
398
416
  if is_demo and (
399
417
  (name is not None and name != agent_name)
@@ -413,12 +431,34 @@ class AgentsController:
413
431
  existing_agent.name = name
414
432
 
415
433
  if model_name or provider:
434
+ if isinstance(model_name, dict):
435
+ # move into params
436
+ existing_params["model"] = model_name
437
+ model_name = None
438
+
416
439
  # check model and provider
417
440
  model, provider = self.check_model_provider(model_name, provider)
418
441
  # Update model and provider
419
442
  existing_agent.model_name = model_name
420
443
  existing_agent.provider = provider
421
444
 
445
+ if "data" in params:
446
+ if len(skills_to_add) > 0 or len(skills_to_remove) > 0:
447
+ raise ValueError(
448
+ "'data' parameter cannot be used with 'skills_to_remove' or 'skills_to_add' parameters"
449
+ )
450
+
451
+ include_knowledge_bases = params["data"].get("knowledge_bases")
452
+ include_tables = params["data"].get("tables")
453
+
454
+ skill = self._create_default_sql_skill(
455
+ agent_name,
456
+ project_name,
457
+ include_tables=include_tables,
458
+ include_knowledge_bases=include_knowledge_bases,
459
+ )
460
+ skills_to_rewrite = [{"name": skill}]
461
+
422
462
  # check that all skills exist
423
463
  skill_name_to_record_map = {}
424
464
  for skill_meta in skills_to_add + skills_to_remove + skills_to_rewrite:
@@ -496,8 +536,6 @@ class AgentsController:
496
536
  db.session.add(association)
497
537
 
498
538
  if params is not None:
499
- existing_params = existing_agent.params or {}
500
-
501
539
  if params.get("data", {}).get("tables"):
502
540
  new_table_entries = set(params["data"]["tables"]) - set(
503
541
  existing_params.get("data", {}).get("tables", [])
@@ -26,7 +26,6 @@ OPEN_AI_CHAT_MODELS = (
26
26
  SUPPORTED_PROVIDERS = {
27
27
  "openai",
28
28
  "anthropic",
29
- "anyscale",
30
29
  "litellm",
31
30
  "ollama",
32
31
  "nvidia_nim",
@@ -11,7 +11,7 @@ import pandas as pd
11
11
  from langchain.agents import AgentExecutor
12
12
  from langchain.agents.initialize import initialize_agent
13
13
  from langchain.chains.conversation.memory import ConversationSummaryBufferMemory
14
- from langchain_community.chat_models import ChatAnyscale, ChatLiteLLM, ChatOllama
14
+ from langchain_community.chat_models import ChatLiteLLM, ChatOllama
15
15
  from langchain_writer import ChatWriter
16
16
  from langchain_google_genai import ChatGoogleGenerativeAI
17
17
  from langchain_core.agents import AgentAction, AgentStep
@@ -165,8 +165,6 @@ def create_chat_model(args: Dict):
165
165
  except NotImplementedError:
166
166
  chat_open_ai.tiktoken_model_name = DEFAULT_TIKTOKEN_MODEL_NAME
167
167
  return chat_open_ai
168
- if args["provider"] == "anyscale":
169
- return ChatAnyscale(**model_kwargs)
170
168
  if args["provider"] == "litellm":
171
169
  return ChatLiteLLM(**model_kwargs)
172
170
  if args["provider"] == "ollama":
@@ -3,11 +3,12 @@ from copy import deepcopy
3
3
  from typing import List, Optional
4
4
  from collections import OrderedDict
5
5
 
6
+ import pandas as pd
6
7
  import sqlalchemy as sa
7
8
  import numpy as np
8
9
 
9
10
  from mindsdb_sql_parser.ast.base import ASTNode
10
- from mindsdb_sql_parser.ast import Select, Star, Constant, Identifier
11
+ from mindsdb_sql_parser.ast import Select, Star, Constant, Identifier, BinaryOperation
11
12
  from mindsdb_sql_parser import parse_sql
12
13
 
13
14
  from mindsdb.interfaces.storage import db
@@ -109,7 +110,19 @@ class Project:
109
110
  """
110
111
  ViewController().delete(name, project_name=self.name, strict_case=strict_case)
111
112
 
112
- def create_view(self, name: str, query: str):
113
+ def create_view(self, name: str, query: str, session):
114
+ ast_query = parse_sql(query)
115
+
116
+ if isinstance(ast_query, Select):
117
+ # check create view sql
118
+ ast_query.limit = Constant(1)
119
+
120
+ query_context_controller.set_context(query_context_controller.IGNORE_CONTEXT)
121
+ try:
122
+ SQLQuery(ast_query, session=session, database=self.name)
123
+ finally:
124
+ query_context_controller.release_context(query_context_controller.IGNORE_CONTEXT)
125
+
113
126
  ViewController().add(name, query=query, project_name=self.name)
114
127
 
115
128
  def update_view(self, name: str, query: str, strict_case: bool = False):
@@ -124,21 +137,112 @@ class Project:
124
137
  view_meta["query_ast"] = parse_sql(view_meta["query"])
125
138
  return view_meta
126
139
 
127
- def query_view(self, query, session):
140
+ @staticmethod
141
+ def combine_view_select(view_query: Select, query: Select) -> Select:
142
+ """
143
+ Create a combined query from view's query and outer query.
144
+ """
145
+
146
+ # apply optimizations
147
+ if query.where is not None:
148
+ # Get conditions that can be duplicated into view's query
149
+ # It has to be simple condition with identifier and constant
150
+ # Also it shouldn't be under the OR condition
151
+
152
+ def get_conditions_to_move(node):
153
+ if not isinstance(node, BinaryOperation):
154
+ return []
155
+ op = node.op.upper()
156
+ if op == "AND":
157
+ conditions = []
158
+ conditions.extend(get_conditions_to_move(node.args[0]))
159
+ conditions.extend(get_conditions_to_move(node.args[1]))
160
+ return conditions
161
+
162
+ if op == "OR":
163
+ return []
164
+ if isinstance(node.args[0], (Identifier, Constant)) and isinstance(
165
+ node.args[1], (Identifier, Constant)
166
+ ):
167
+ return [node]
168
+
169
+ conditions = get_conditions_to_move(query.where)
170
+
171
+ if conditions:
172
+ # analyse targets
173
+ # if target element has alias
174
+ # if element is not identifier or the name is not equal to alias:
175
+ # add alias to black list
176
+ # white list:
177
+ # all targets that are identifiers with no alias or equal to its alias
178
+ # condition can be moved if
179
+ # column is not in black list AND (query has star(*) OR column in white list)
180
+
181
+ has_star = False
182
+ white_list, black_list = [], []
183
+ for target in view_query.targets:
184
+ if isinstance(target, Star):
185
+ has_star = True
186
+ if isinstance(target, Identifier):
187
+ name = target.parts[-1].lower()
188
+ if target.alias is None or target.alias.parts[-1].lower() == name:
189
+ white_list.append(name)
190
+ elif target.alias is not None:
191
+ black_list.append(target.alias.parts[-1].lower())
192
+
193
+ view_where = view_query.where
194
+ for condition in conditions:
195
+ arg1, arg2 = condition.args
196
+
197
+ if isinstance(arg1, Identifier):
198
+ name = arg1.parts[-1].lower()
199
+ if name in black_list or not (has_star or name in white_list):
200
+ continue
201
+ if isinstance(arg2, Identifier):
202
+ name = arg2.parts[-1].lower()
203
+ if name in black_list or not (has_star or name in white_list):
204
+ continue
205
+
206
+ # condition can be moved into view
207
+ condition2 = BinaryOperation(condition.op, [arg1, arg2])
208
+ if view_where is None:
209
+ view_where = condition2
210
+ else:
211
+ view_where = BinaryOperation("AND", args=[view_where, condition2])
212
+
213
+ # disable outer condition
214
+ condition.op = "="
215
+ condition.args = [Constant(0), Constant(0)]
216
+
217
+ view_query.where = view_where
218
+
219
+ # combine outer query with view's query
220
+ view_query.parentheses = True
221
+ query.from_table = view_query
222
+ return query
223
+
224
+ def query_view(self, query: Select, session) -> pd.DataFrame:
128
225
  view_meta = self.get_view_meta(query)
129
226
 
130
227
  query_context_controller.set_context("view", view_meta["id"])
131
-
228
+ query_applied = False
132
229
  try:
133
- sqlquery = SQLQuery(view_meta["query_ast"], session=session)
230
+ view_query = view_meta["query_ast"]
231
+ if isinstance(view_query, Select):
232
+ view_query = self.combine_view_select(view_query, query)
233
+ query_applied = True
234
+
235
+ sqlquery = SQLQuery(view_query, session=session)
134
236
  df = sqlquery.fetched_data.to_df()
135
237
  finally:
136
238
  query_context_controller.release_context("view", view_meta["id"])
137
239
 
138
240
  # remove duplicated columns
139
241
  df = df.loc[:, ~df.columns.duplicated()]
140
-
141
- return query_df(df, query, session=session)
242
+ if query_applied:
243
+ return df
244
+ else:
245
+ return query_df(df, query, session=session)
142
246
 
143
247
  @staticmethod
144
248
  def _get_model_data(predictor_record, integraion_record, with_secrets: bool = True):
@@ -1139,8 +1139,14 @@ class KnowledgeBaseController:
1139
1139
  else:
1140
1140
  vector_db_name, vector_table_name = storage.parts
1141
1141
 
1142
+ data_node = self.session.datahub.get(vector_db_name)
1143
+ if data_node:
1144
+ vector_store_handler = data_node.integration_handler
1145
+ else:
1146
+ raise ValueError(
1147
+ f"Unable to find database named {vector_db_name}, please make sure {vector_db_name} is defined"
1148
+ )
1142
1149
  # create table in vectordb before creating KB
1143
- vector_store_handler = self.session.datahub.get(vector_db_name).integration_handler
1144
1150
  vector_store_handler.create_table(vector_table_name)
1145
1151
  if keyword_search_enabled:
1146
1152
  vector_store_handler.add_full_text_index(vector_table_name, TableField.CONTENT.value)
@@ -4,8 +4,7 @@ import asyncio
4
4
  from typing import List, Dict, Optional, Any
5
5
 
6
6
  import pandas as pd
7
- from langchain_text_splitters import RecursiveCharacterTextSplitter
8
- from langchain_core.documents import Document as LangchainDocument
7
+ from mindsdb.interfaces.knowledge_base.preprocessing.text_splitter import TextSplitter
9
8
 
10
9
  from mindsdb.integrations.utilities.rag.splitters.file_splitter import (
11
10
  FileSplitter,
@@ -22,7 +21,6 @@ from mindsdb.interfaces.knowledge_base.preprocessing.models import (
22
21
  )
23
22
  from mindsdb.utilities import log
24
23
 
25
-
26
24
  logger = log.getLogger(__name__)
27
25
 
28
26
  _DEFAULT_CONTENT_COLUMN_NAME = "content"
@@ -49,11 +47,10 @@ class DocumentPreprocessor:
49
47
  if self.splitter is None:
50
48
  raise ValueError("Splitter not configured")
51
49
 
52
- # Convert to langchain Document for splitting
53
- langchain_doc = LangchainDocument(page_content=doc.content, metadata=doc.metadata or {})
50
+ metadata = doc.metadata or {}
54
51
  # Split and convert back to our Document type
55
- split_docs = self.splitter.split_documents([langchain_doc])
56
- return [Document(content=split_doc.page_content, metadata=split_doc.metadata) for split_doc in split_docs]
52
+ split_texts = self.splitter.split_text(doc.content)
53
+ return [Document(content=text, metadata=metadata) for text in split_texts]
57
54
 
58
55
  def _get_source(self) -> str:
59
56
  """Get the source identifier for this preprocessor"""
@@ -266,16 +263,15 @@ Please give a short succinct context to situate this chunk within the overall do
266
263
 
267
264
 
268
265
  class TextChunkingPreprocessor(DocumentPreprocessor):
269
- """Default text chunking preprocessor using RecursiveCharacterTextSplitter"""
266
+ """Default text chunking preprocessor using TextSplitter"""
270
267
 
271
268
  def __init__(self, config: Optional[TextChunkingConfig] = None):
272
269
  """Initialize with text chunking configuration"""
273
270
  super().__init__()
274
271
  self.config = config or TextChunkingConfig()
275
- self.splitter = RecursiveCharacterTextSplitter(
272
+ self.splitter = TextSplitter(
276
273
  chunk_size=self.config.chunk_size,
277
274
  chunk_overlap=self.config.chunk_overlap,
278
- length_function=self.config.length_function,
279
275
  separators=self.config.separators,
280
276
  )
281
277
 
@@ -0,0 +1,73 @@
1
+ from typing import List
2
+
3
+
4
+ class TextSplitter:
5
+ def __init__(
6
+ self,
7
+ chunk_size: int = 1000,
8
+ chunk_overlap: int = 200,
9
+ separators: List[str] = None,
10
+ k_range: float = 0.5,
11
+ k_ratio: float = 1,
12
+ ):
13
+ """
14
+ Split text into chunks. The logic:
15
+ - Get a piece of text with chunk_size and try to find the separator at the end of the piece.
16
+ - The allowed range to find the separator is defined by k_range and k_ratio using formula:
17
+ k_range * chunk_size / (num * k_ratio + 1)
18
+ num - is number of a separator from the list
19
+ - if the separator is not in the rage: switch to the next separator
20
+ - if the found separator is in the middle of the sentence, use overlapping:
21
+ - the found text is the current chunk
22
+ - repeat the search with less strict k_range and k_ratio
23
+ - the found text will be the beginning of the next chunk
24
+
25
+ :param chunk_size: size of the chunk, which must not be exceeded
26
+ :param separators: list of separators in order of priority
27
+ :param k_range: defines the range to look for the separator
28
+ :param k_ratio: defines how much to shrink the range for the next separator
29
+ """
30
+ if separators is None:
31
+ separators = ["\n\n", "\n", ". ", " ", ""]
32
+ self.chunk_size = chunk_size
33
+ self.chunk_overlap = chunk_overlap
34
+ self.separators = separators
35
+ self.k_range = k_range
36
+ self.k_ratio = k_ratio
37
+
38
+ def split_text(self, text: str) -> List[str]:
39
+ chunks = []
40
+
41
+ while True:
42
+ if len(text) < self.chunk_size:
43
+ chunks.append(text)
44
+ break
45
+
46
+ sep, chunk, shift = self.get_next_chunk(text, self.k_range, self.k_ratio)
47
+ chunks.append(chunk)
48
+
49
+ text = text[shift:]
50
+ return chunks
51
+
52
+ def get_next_chunk(self, text: str, k_range: float, k_ratio: float):
53
+ # returns chunk with separator and shift for the next search iteration
54
+
55
+ chunk = text[: self.chunk_size]
56
+ # positions = []
57
+ for i, sep in enumerate(self.separators):
58
+ pos = chunk.rfind(sep)
59
+
60
+ vpos = self.chunk_size - pos
61
+ if vpos < k_range * self.chunk_size / (i * k_ratio + 1):
62
+ shift = len(sep) + pos
63
+ if sep.strip(" ") == "":
64
+ # overlapping
65
+ sep2, _, shift2 = self.get_next_chunk(text, k_range * 1.5, 0)
66
+ if sep2.strip(" ") != "":
67
+ # use shift of previous separator
68
+ if shift - shift2 < self.chunk_overlap:
69
+ shift = shift2
70
+
71
+ return sep, chunk[:pos], shift
72
+
73
+ raise RuntimeError("Cannot split text")
@@ -45,7 +45,7 @@ class RunningQuery:
45
45
  for df in dn.query_stream(query2, fetch_size=self.batch_size):
46
46
  max_track_value = self.get_max_track_value(df)
47
47
  yield df
48
- self.set_progress(df, max_track_value)
48
+ self.set_progress(max_track_value=max_track_value)
49
49
 
50
50
  else:
51
51
  while True:
@@ -59,7 +59,7 @@ class RunningQuery:
59
59
 
60
60
  max_track_value = self.get_max_track_value(df)
61
61
  yield df
62
- self.set_progress(df, max_track_value)
62
+ self.set_progress(max_track_value=max_track_value)
63
63
 
64
64
  def get_partition_query(self, step_num: int, query: Select, stream=False) -> Select:
65
65
  """
@@ -178,24 +178,23 @@ class RunningQuery:
178
178
  # stream mode
179
179
  return None
180
180
 
181
- def set_progress(self, df: pd.DataFrame, max_track_value: int):
181
+ def set_progress(self, processed_rows: int = None, max_track_value: int = None):
182
182
  """
183
183
  Store progres of the query, it is called after processing of batch
184
184
  """
185
185
 
186
- if len(df) == 0:
187
- return
188
-
189
- self.record.processed_rows = self.record.processed_rows + len(df)
190
-
191
- cur_value = self.record.context.get("track_value")
192
- new_value = max_track_value
193
- if new_value is not None:
194
- if cur_value is None or new_value > cur_value:
195
- self.record.context["track_value"] = new_value
196
- flag_modified(self.record, "context")
186
+ if processed_rows is not None and processed_rows > 0:
187
+ self.record.processed_rows = self.record.processed_rows + processed_rows
188
+ db.session.commit()
197
189
 
198
- db.session.commit()
190
+ if max_track_value is not None:
191
+ cur_value = self.record.context.get("track_value")
192
+ new_value = max_track_value
193
+ if new_value is not None:
194
+ if cur_value is None or new_value > cur_value:
195
+ self.record.context["track_value"] = new_value
196
+ flag_modified(self.record, "context")
197
+ db.session.commit()
199
198
 
200
199
  def on_error(self, error: Exception, step_num: int, steps_data: dict):
201
200
  """
@@ -599,6 +599,7 @@ class Config:
599
599
  ml_task_queue_consumer=None,
600
600
  agent=None,
601
601
  project=None,
602
+ update_gui=False,
602
603
  )
603
604
  return
604
605
 
@@ -635,6 +636,7 @@ class Config:
635
636
  help="MindsDB agent name to connect to",
636
637
  )
637
638
  parser.add_argument("--project-name", type=str, default=None, help="MindsDB project name")
639
+ parser.add_argument("--update-gui", action="store_true", default=False, help="Update GUI and exit")
638
640
 
639
641
  self._cmd_args = parser.parse_args()
640
642