MindsDB 25.4.1.0__py3-none-any.whl → 25.4.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB has been flagged as possibly problematic.

Files changed (63):
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/executor/command_executor.py +91 -61
  3. mindsdb/api/executor/data_types/answer.py +9 -12
  4. mindsdb/api/executor/datahub/classes/response.py +11 -0
  5. mindsdb/api/executor/datahub/datanodes/datanode.py +4 -4
  6. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +10 -11
  7. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +22 -16
  8. mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +43 -1
  9. mindsdb/api/executor/datahub/datanodes/project_datanode.py +20 -20
  10. mindsdb/api/executor/planner/plan_join.py +2 -2
  11. mindsdb/api/executor/planner/query_plan.py +1 -0
  12. mindsdb/api/executor/planner/query_planner.py +86 -14
  13. mindsdb/api/executor/planner/steps.py +11 -2
  14. mindsdb/api/executor/sql_query/result_set.py +10 -7
  15. mindsdb/api/executor/sql_query/sql_query.py +69 -84
  16. mindsdb/api/executor/sql_query/steps/__init__.py +1 -0
  17. mindsdb/api/executor/sql_query/steps/delete_step.py +2 -3
  18. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +5 -3
  19. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +288 -0
  20. mindsdb/api/executor/sql_query/steps/insert_step.py +2 -2
  21. mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -2
  22. mindsdb/api/executor/sql_query/steps/subselect_step.py +20 -8
  23. mindsdb/api/executor/sql_query/steps/update_step.py +4 -6
  24. mindsdb/api/http/namespaces/sql.py +4 -1
  25. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/ok_packet.py +1 -1
  26. mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +4 -27
  27. mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +1 -0
  28. mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +38 -37
  29. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +23 -13
  30. mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +17 -16
  31. mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +1 -0
  32. mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +1 -1
  33. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +3 -2
  34. mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +4 -4
  35. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +26 -16
  36. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +36 -7
  37. mindsdb/integrations/handlers/redshift_handler/redshift_handler.py +1 -1
  38. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +18 -11
  39. mindsdb/integrations/libs/llm/config.py +11 -1
  40. mindsdb/integrations/libs/llm/utils.py +12 -0
  41. mindsdb/integrations/libs/ml_handler_process/learn_process.py +1 -2
  42. mindsdb/integrations/libs/response.py +9 -4
  43. mindsdb/integrations/libs/vectordatabase_handler.py +17 -5
  44. mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +8 -98
  45. mindsdb/interfaces/agents/constants.py +12 -1
  46. mindsdb/interfaces/agents/langchain_agent.py +6 -0
  47. mindsdb/interfaces/database/log.py +8 -9
  48. mindsdb/interfaces/database/projects.py +1 -5
  49. mindsdb/interfaces/functions/controller.py +59 -17
  50. mindsdb/interfaces/functions/to_markdown.py +194 -0
  51. mindsdb/interfaces/jobs/jobs_controller.py +3 -3
  52. mindsdb/interfaces/knowledge_base/controller.py +223 -97
  53. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +3 -14
  54. mindsdb/interfaces/query_context/context_controller.py +224 -1
  55. mindsdb/interfaces/storage/db.py +23 -0
  56. mindsdb/migrations/versions/2025-03-21_fda503400e43_queries.py +45 -0
  57. mindsdb/utilities/context_executor.py +1 -1
  58. mindsdb/utilities/partitioning.py +35 -20
  59. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/METADATA +227 -224
  60. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/RECORD +63 -59
  61. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/WHEEL +0 -0
  62. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/licenses/LICENSE +0 -0
  63. {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/query_context/context_controller.py

@@ -1,11 +1,17 @@
 from typing import List
+import pickle
+import datetime as dt
 
+from sqlalchemy.orm.attributes import flag_modified
 import pandas as pd
 
+from mindsdb_sql_parser import Select, Star, OrderBy
+
 from mindsdb_sql_parser.ast import (
     Identifier, BinaryOperation, Last, Constant, ASTNode
 )
 from mindsdb.integrations.utilities.query_traversal import query_traversal
+from mindsdb.utilities.cache import get_cache
 
 from mindsdb.interfaces.storage import db
 from mindsdb.utilities.context import context as ctx
@@ -13,6 +19,147 @@ from mindsdb.utilities.context import context as ctx
 from .last_query import LastQuery
 
 
+class RunningQuery:
+    """
+    Query in progress
+    """
+
+    def __init__(self, record: db.Queries):
+        self.record = record
+        self.sql = record.sql
+
+    def get_partition_query(self, step_num: int, query: Select) -> Select:
+        """
+        Generate a query for fetching the next partition.
+        It wraps the query as:
+          select * from ({query})
+          where {track_column} > {previous_value}
+          order by {track_column}
+          limit {batch_size}
+        and fills in track_column, previous_value and batch_size.
+        """
+
+        track_column = self.record.parameters['track_column']
+
+        query = Select(
+            targets=[Star()],
+            from_table=query,
+            order_by=[OrderBy(Identifier(track_column))],
+            limit=Constant(self.batch_size)
+        )
+
+        track_value = self.record.context.get('track_value')
+        # is it a different step?
+        cur_step_num = self.record.context.get('step_num')
+        if cur_step_num is not None and cur_step_num != step_num:
+            # reset track_value
+            track_value = None
+            self.record.context['track_value'] = None
+            self.record.context['step_num'] = step_num
+            flag_modified(self.record, 'context')
+            db.session.commit()
+
+        if track_value is not None:
+            query.where = BinaryOperation(
+                op='>',
+                args=[Identifier(track_column), Constant(track_value)],
+            )
+
+        return query
+
+    def set_params(self, params: dict):
+        """
+        Store parameters of the step which is about to be split into partitions
+        """
+
+        if 'track_column' not in params:
+            raise ValueError('Track column is not defined')
+        if 'batch_size' not in params:
+            params['batch_size'] = 1000
+
+        self.record.parameters = params
+        self.batch_size = self.record.parameters['batch_size']
+        db.session.commit()
+
+    def get_max_track_value(self, df: pd.DataFrame):
+        """
+        Return the max value to use in `set_progress`.
+        This function is called before executing substeps,
+        the `set_progress` function after.
+        """
+
+        track_column = self.record.parameters['track_column']
+        return df[track_column].max()
+
+    def set_progress(self, df: pd.DataFrame, max_track_value: int):
+        """
+        Store progress of the query; called after a batch is processed
+        """
+
+        if len(df) == 0:
+            return
+
+        self.record.processed_rows = self.record.processed_rows + len(df)
+
+        cur_value = self.record.context.get('track_value')
+        new_value = max_track_value
+        if new_value is not None:
+            if cur_value is None or new_value > cur_value:
+                self.record.context['track_value'] = new_value
+                flag_modified(self.record, 'context')
+
+        db.session.commit()
+
+    def on_error(self, error: Exception, step_num: int, steps_data: dict):
+        """
+        Save the error of the query in the database.
+        Also save step data and the current step number to be able to resume the query.
+        """
+        self.record.error = str(error)
+        self.record.context['step_num'] = step_num
+        flag_modified(self.record, 'context')
+
+        # save steps_data
+        cache = get_cache('steps_data')
+        data = pickle.dumps(steps_data, protocol=5)
+        cache.set(str(self.record.id), data)
+
+        db.session.commit()
+
+    def clear_error(self):
+        """
+        Reset the error of the query in the database
+        """
+
+        if self.record.error is not None:
+            self.record.error = None
+            db.session.commit()
+
+    def get_state(self) -> dict:
+        """
+        Return the stored state for resuming the query
+        """
+        cache = get_cache('steps_data')
+        key = self.record.id
+        data = cache.get(key)
+        cache.delete(key)
+
+        steps_data = pickle.loads(data)
+
+        return {
+            'step_num': self.record.context.get('step_num'),
+            'steps_data': steps_data,
+        }
+
+    def finish(self):
+        """
+        Mark the query as finished
+        """
+
+        self.record.finished_at = dt.datetime.now()
+        db.session.commit()
+
+
 class QueryContextController:
     IGNORE_CONTEXT = '<IGNORE>'
 
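The wrapper built by `RunningQuery.get_partition_query` is plain keyset pagination. Below is a minimal sketch of the query shape it produces, assembled with the same `mindsdb_sql_parser` classes this diff imports; the inner query, `track_column`, `track_value` and `batch_size` values are illustrative, not taken from MindsDB defaults.

```python
# Minimal sketch of the partition wrapper get_partition_query builds.
# AST classes are the ones imported in the diff above; values are illustrative.
from mindsdb_sql_parser import parse_sql, Select, Star, OrderBy
from mindsdb_sql_parser.ast import Identifier, BinaryOperation, Constant

inner = parse_sql('SELECT * FROM demo_db.events')  # the step's original query
track_column, track_value, batch_size = 'id', 1000, 1000

partition = Select(
    targets=[Star()],
    from_table=inner,
    order_by=[OrderBy(Identifier(track_column))],
    limit=Constant(batch_size),
)
# applied only when a previous batch has recorded a track_value
partition.where = BinaryOperation(
    op='>',
    args=[Identifier(track_column), Constant(track_value)],
)

print(partition)
# expected shape, roughly:
#   SELECT * FROM (SELECT * FROM demo_db.events)
#   WHERE id > 1000 ORDER BY id LIMIT 1000
```

Ordering by the tracked column and filtering on the last seen value keeps batches disjoint, so a failed query can resume from the stored `track_value` instead of rescanning from the start.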
@@ -156,10 +303,12 @@ class QueryContextController:
         last_values = {}
         for query, info in l_query.get_init_queries():
 
-            data, columns_info = dn.query(
+            response = dn.query(
                 query=query,
                 session=session
             )
+            data = response.data_frame
+            columns_info = response.columns
 
             if len(data) == 0:
                 value = None
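This call site shows the new datanode contract: `dn.query()` now returns a single response object instead of a `(data, columns_info)` tuple. The class itself lives in `mindsdb/api/executor/datahub/classes/response.py` (a new file in this release, see the file list above) and is not shown in this diff; a hypothetical minimal shape, inferred purely from this call site, might look like:

```python
# Hypothetical sketch only: the real class is in
# mindsdb/api/executor/datahub/classes/response.py, which this diff omits.
# The class name and defaults are assumptions; the attribute names
# (data_frame, columns) are taken from the call site above.
from dataclasses import dataclass, field
from typing import List

import pandas as pd


@dataclass
class DataHubResponse:
    data_frame: pd.DataFrame = None
    columns: List[dict] = field(default_factory=list)
```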
@@ -285,5 +434,79 @@ class QueryContextController:
         rec.values = values
         db.session.commit()
 
+    def get_query(self, query_id: int) -> RunningQuery:
+        """
+        Get a running query by id
+        """
+
+        rec = db.Queries.query.filter(
+            db.Queries.id == query_id,
+            db.Queries.company_id == ctx.company_id
+        ).first()
+
+        if rec is None:
+            raise RuntimeError(f'Query not found: {query_id}')
+        return RunningQuery(rec)
+
+    def create_query(self, query: ASTNode) -> RunningQuery:
+        """
+        Create a new running query from an AST query
+        """
+
+        # remove old queries
+        remove_query = db.session.query(db.Queries).filter(
+            db.Queries.company_id == ctx.company_id,
+            db.Queries.finished_at < (dt.datetime.now() - dt.timedelta(days=1))
+        )
+        for rec in remove_query.all():
+            db.session.delete(rec)
+
+        rec = db.Queries(
+            sql=str(query),
+            company_id=ctx.company_id,
+        )
+
+        db.session.add(rec)
+        db.session.commit()
+        return RunningQuery(rec)
+
+    def list_queries(self) -> List[dict]:
+        """
+        Get a list of all running queries with metadata
+        """
+
+        query = db.session.query(db.Queries).filter(
+            db.Queries.company_id == ctx.company_id
+        )
+        return [
+            {
+                'id': record.id,
+                'sql': record.sql,
+                'started_at': record.started_at,
+                'finished_at': record.finished_at,
+                'parameters': record.parameters,
+                'context': record.context,
+                'processed_rows': record.processed_rows,
+                'error': record.error,
+                'updated_at': record.updated_at,
+            }
+            for record in query
+        ]
+
+    def cancel_query(self, query_id: int):
+        """
+        Cancel a running query by id
+        """
+        rec = db.Queries.query.filter(
+            db.Queries.id == query_id,
+            db.Queries.company_id == ctx.company_id
+        ).first()
+        if rec is None:
+            raise RuntimeError(f'Query not found: {query_id}')
+
+        # the query in progress will fail when it tries to update its status
+        db.session.delete(rec)
+        db.session.commit()
+
 
 query_context_controller = QueryContextController()
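Taken together with the `RunningQuery` class above, these methods form a small lifecycle API. A hedged usage sketch follows; in practice the executor drives this internally (e.g. via the new `fetch_dataframe_partition.py` step listed above, which this excerpt does not show), and the SQL here is illustrative.

```python
# Hedged lifecycle sketch; the executor normally orchestrates these calls.
from mindsdb_sql_parser import parse_sql
from mindsdb.interfaces.query_context.context_controller import query_context_controller

query_ast = parse_sql('INSERT INTO my_kb SELECT * FROM demo_db.events')  # illustrative
running = query_context_controller.create_query(query_ast)
running.set_params({'track_column': 'id', 'batch_size': 1000})

# ...the executor repeatedly calls running.get_partition_query(step_num, query),
# processes each batch, then calls running.set_progress(df, max_value)...

for info in query_context_controller.list_queries():
    print(info['id'], info['processed_rows'], info['error'])

running.finish()
# or, from another session:
# query_context_controller.cancel_query(running.record.id)
```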
mindsdb/interfaces/storage/db.py

@@ -523,6 +523,7 @@ class KnowledgeBase(Base):
     embedding_model = relationship(
         "Predictor", foreign_keys=[embedding_model_id], doc="embedding model"
     )
+    query_id = Column(Integer, nullable=True)
 
     created_at = Column(DateTime, default=datetime.datetime.now)
     updated_at = Column(
@@ -564,6 +565,28 @@ class QueryContext(Base):
     created_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now)
 
 
+class Queries(Base):
+    __tablename__ = "queries"
+    id: int = Column(Integer, primary_key=True)
+    company_id: int = Column(Integer, nullable=True)
+
+    sql: str = Column(String, nullable=False)
+    # step_data: JSON = Column(JSON, nullable=True)
+
+    started_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now)
+    finished_at: datetime.datetime = Column(DateTime)
+
+    parameters = Column(JSON, default={})
+    context = Column(JSON, default={})
+    processed_rows = Column(Integer, default=0)
+    error: str = Column(String, nullable=True)
+
+    updated_at: datetime.datetime = Column(
+        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
+    )
+    created_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now)
+
+
 class LLMLog(Base):
     __tablename__ = "llm_log"
     id: int = Column(Integer, primary_key=True)
mindsdb/migrations/versions/2025-03-21_fda503400e43_queries.py (new file)

@@ -0,0 +1,45 @@
+"""queries
+
+Revision ID: fda503400e43
+Revises: 11347c213b36
+Create Date: 2025-03-21 18:50:20.795930
+
+"""
+from alembic import op
+import sqlalchemy as sa
+import mindsdb.interfaces.storage.db  # noqa
+
+
+# revision identifiers, used by Alembic.
+revision = 'fda503400e43'
+down_revision = '11347c213b36'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    op.create_table(
+        'queries',
+        sa.Column('id', sa.Integer(), nullable=False),
+        sa.Column('company_id', sa.Integer(), nullable=True),
+        sa.Column('sql', sa.String(), nullable=False),
+        sa.Column('started_at', sa.DateTime(), nullable=True),
+        sa.Column('finished_at', sa.DateTime(), nullable=True),
+        sa.Column('parameters', sa.JSON(), nullable=True),
+        sa.Column('context', sa.JSON(), nullable=True),
+        sa.Column('processed_rows', sa.Integer(), nullable=True),
+        sa.Column('error', sa.String(), nullable=True),
+        sa.Column('updated_at', sa.DateTime(), nullable=True),
+        sa.Column('created_at', sa.DateTime(), nullable=True),
+        sa.PrimaryKeyConstraint('id')
+    )
+
+    with op.batch_alter_table('knowledge_base', schema=None) as batch_op:
+        batch_op.add_column(sa.Column('query_id', sa.INTEGER(), nullable=True))
+
+
+def downgrade():
+    with op.batch_alter_table('knowledge_base', schema=None) as batch_op:
+        batch_op.drop_column('query_id')
+
+    op.drop_table('queries')
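This is an ordinary Alembic revision, so it is applied with the usual machinery; MindsDB normally runs its migrations automatically at startup. A sketch of applying or reverting it by hand with Alembic's command API, assuming a standard config file location:

```python
# Sketch: applying/reverting this revision via Alembic's command API.
# The config path is an assumption; adjust to where alembic.ini actually lives.
from alembic import command
from alembic.config import Config

cfg = Config('mindsdb/migrations/alembic.ini')  # assumed location
command.upgrade(cfg, 'fda503400e43')    # create `queries`, add knowledge_base.query_id
command.downgrade(cfg, '11347c213b36')  # revert both changes
```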
mindsdb/utilities/context_executor.py

@@ -43,7 +43,7 @@ def execute_in_threads(func, tasks, thread_count=3, queue_size_k=1.5):
     for i in range(queue_size):
         try:
             args = next(tasks)
-            futures.append(executor.submit(func, *args))
+            futures.append(executor.submit(func, args))
         except StopIteration:
             break
 
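This one-character change flips the task contract of `execute_in_threads`: task generators now yield a single argument object rather than an argument list to be unpacked, which matches `split_data_frame` below yielding bare DataFrames instead of one-element lists. A toy illustration of the new contract:

```python
# Toy illustration (not MindsDB code): each task is now passed to `func`
# as one positional argument, so func takes exactly one parameter.
from concurrent.futures import ThreadPoolExecutor


def func(task):
    return sum(task)


tasks = iter([[1, 2], [3, 4, 5]])

with ThreadPoolExecutor(max_workers=2) as executor:
    # no *args unpacking: submit(func, next(tasks)), not submit(func, *next(tasks))
    futures = [executor.submit(func, next(tasks)) for _ in range(2)]
    print([f.result() for f in futures])  # [3, 12]
```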
mindsdb/utilities/partitioning.py

@@ -6,6 +6,35 @@ from mindsdb.utilities.config import Config
 from mindsdb.utilities.context_executor import execute_in_threads
 
 
+def get_max_thread_count() -> int:
+    """
+    Calculate the maximum number of threads allowed for the system.
+    """
+    # workers count
+    is_cloud = Config().is_cloud
+    if is_cloud:
+        max_threads = int(os.getenv('MINDSDB_MAX_PARTITIONING_THREADS', 10))
+    else:
+        max_threads = os.cpu_count() - 3
+
+    if max_threads < 1:
+        max_threads = 1
+
+    return max_threads
+
+
+def split_data_frame(df: pd.DataFrame, partition_size: int) -> Iterable[pd.DataFrame]:
+    """
+    Split a data frame into chunks of partition_size and yield them out
+    """
+    chunk = 0
+    while chunk * partition_size < len(df):
+        # create results with partition
+        df1 = df.iloc[chunk * partition_size: (chunk + 1) * partition_size]
+        chunk += 1
+        yield df1
+
+
 def process_dataframe_in_partitions(df: pd.DataFrame, callback: Callable, partition_size: int) -> Iterable:
     """
     Splits dataframe into partitions and applies callback on each partition
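Both helpers are now importable on their own. A quick behavioral check (assumes a local MindsDB install; the 10-row frame is illustrative):

```python
# Quick check of the two extracted helpers.
import pandas as pd
from mindsdb.utilities.partitioning import get_max_thread_count, split_data_frame

df = pd.DataFrame({'id': range(10)})
print([len(part) for part in split_data_frame(df, partition_size=4)])  # [4, 4, 2]

# cpu_count() - 3 locally (floored at 1); MINDSDB_MAX_PARTITIONING_THREADS on cloud
print(get_max_thread_count())
```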
@@ -17,35 +46,21 @@ def process_dataframe_in_partitions(df: pd.DataFrame, callback: Callable, partition_size: int) -> Iterable:
     """
 
     # tasks
-    def split_data_f(df):
-        chunk = 0
-        while chunk * partition_size < len(df):
-            # create results with partition
-            df1 = df.iloc[chunk * partition_size: (chunk + 1) * partition_size]
-            chunk += 1
-            yield [df1]
 
-    tasks = split_data_f(df)
+    tasks = split_data_frame(df, partition_size)
 
-    # workers count
-    is_cloud = Config().is_cloud
-    if is_cloud:
-        max_threads = int(os.getenv('MINDSDB_MAX_PARTITIONING_THREADS', 10))
-    else:
-        max_threads = os.cpu_count() - 2
+    max_threads = get_max_thread_count()
 
-    # don't exceed chunk_count
     chunk_count = int(len(df) / partition_size)
-    max_threads = min(max_threads, chunk_count)
-
-    if max_threads < 1:
-        max_threads = 1
+    # don't exceed chunk_count
+    if chunk_count > 0:
+        max_threads = min(max_threads, chunk_count)
 
     if max_threads == 1:
         # don't spawn threads
 
         for task in tasks:
-            yield callback(*task)
+            yield callback(task)
 
     else:
         for result in execute_in_threads(callback, tasks, thread_count=max_threads):
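With this refactor, the callback passed to `process_dataframe_in_partitions` receives each partition DataFrame directly (previously it arrived wrapped in a one-element list that was unpacked). A hedged usage sketch under that contract:

```python
# Usage sketch for the refactored entry point; the callback signature
# (one DataFrame in, one result out) follows the new contract.
import pandas as pd
from mindsdb.utilities.partitioning import process_dataframe_in_partitions


def summarize(part: pd.DataFrame) -> pd.DataFrame:
    return pd.DataFrame({'rows': [len(part)], 'max_id': [part['id'].max()]})


df = pd.DataFrame({'id': range(100)})
# chunk_count = int(100 / 60) = 1, so this takes the sequential (no-threads) path
results = list(process_dataframe_in_partitions(df, summarize, partition_size=60))
print(pd.concat(results, ignore_index=True))  # two partitions: 60 rows, then 40
```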