MindsDB 25.4.1.0-py3-none-any.whl → 25.4.2.1-py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/api/executor/command_executor.py +91 -61
- mindsdb/api/executor/data_types/answer.py +9 -12
- mindsdb/api/executor/datahub/classes/response.py +11 -0
- mindsdb/api/executor/datahub/datanodes/datanode.py +4 -4
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +10 -11
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +22 -16
- mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +43 -1
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +20 -20
- mindsdb/api/executor/planner/plan_join.py +2 -2
- mindsdb/api/executor/planner/query_plan.py +1 -0
- mindsdb/api/executor/planner/query_planner.py +86 -14
- mindsdb/api/executor/planner/steps.py +11 -2
- mindsdb/api/executor/sql_query/result_set.py +10 -7
- mindsdb/api/executor/sql_query/sql_query.py +69 -84
- mindsdb/api/executor/sql_query/steps/__init__.py +1 -0
- mindsdb/api/executor/sql_query/steps/delete_step.py +2 -3
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +5 -3
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +288 -0
- mindsdb/api/executor/sql_query/steps/insert_step.py +2 -2
- mindsdb/api/executor/sql_query/steps/prepare_steps.py +2 -2
- mindsdb/api/executor/sql_query/steps/subselect_step.py +20 -8
- mindsdb/api/executor/sql_query/steps/update_step.py +4 -6
- mindsdb/api/http/namespaces/sql.py +4 -1
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/ok_packet.py +1 -1
- mindsdb/api/mysql/mysql_proxy/executor/mysql_executor.py +4 -27
- mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +1 -0
- mindsdb/api/mysql/mysql_proxy/mysql_proxy.py +38 -37
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +23 -13
- mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +17 -16
- mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +1 -0
- mindsdb/integrations/handlers/mssql_handler/mssql_handler.py +1 -1
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +3 -2
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +4 -4
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +26 -16
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +36 -7
- mindsdb/integrations/handlers/redshift_handler/redshift_handler.py +1 -1
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +18 -11
- mindsdb/integrations/libs/llm/config.py +11 -1
- mindsdb/integrations/libs/llm/utils.py +12 -0
- mindsdb/integrations/libs/ml_handler_process/learn_process.py +1 -2
- mindsdb/integrations/libs/response.py +9 -4
- mindsdb/integrations/libs/vectordatabase_handler.py +17 -5
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +8 -98
- mindsdb/interfaces/agents/constants.py +12 -1
- mindsdb/interfaces/agents/langchain_agent.py +6 -0
- mindsdb/interfaces/database/log.py +8 -9
- mindsdb/interfaces/database/projects.py +1 -5
- mindsdb/interfaces/functions/controller.py +59 -17
- mindsdb/interfaces/functions/to_markdown.py +194 -0
- mindsdb/interfaces/jobs/jobs_controller.py +3 -3
- mindsdb/interfaces/knowledge_base/controller.py +223 -97
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +3 -14
- mindsdb/interfaces/query_context/context_controller.py +224 -1
- mindsdb/interfaces/storage/db.py +23 -0
- mindsdb/migrations/versions/2025-03-21_fda503400e43_queries.py +45 -0
- mindsdb/utilities/context_executor.py +1 -1
- mindsdb/utilities/partitioning.py +35 -20
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/METADATA +227 -224
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/RECORD +63 -59
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/WHEEL +0 -0
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.4.1.0.dist-info → mindsdb-25.4.2.1.dist-info}/top_level.txt +0 -0
mindsdb/interfaces/query_context/context_controller.py CHANGED

@@ -1,11 +1,17 @@
 from typing import List
+import pickle
+import datetime as dt

+from sqlalchemy.orm.attributes import flag_modified
 import pandas as pd

+from mindsdb_sql_parser import Select, Star, OrderBy
+
 from mindsdb_sql_parser.ast import (
     Identifier, BinaryOperation, Last, Constant, ASTNode
 )
 from mindsdb.integrations.utilities.query_traversal import query_traversal
+from mindsdb.utilities.cache import get_cache

 from mindsdb.interfaces.storage import db
 from mindsdb.utilities.context import context as ctx

@@ -13,6 +19,147 @@ from mindsdb.utilities.context import context as ctx
 from .last_query import LastQuery


+class RunningQuery:
+    """
+    Query in progres
+    """
+
+    def __init__(self, record: db.Queries):
+        self.record = record
+        self.sql = record.sql
+
+    def get_partition_query(self, step_num: int, query: Select) -> Select:
+        """
+        Generate query for fetching the next partition
+        It wraps query to
+          select * from ({query})
+          where {track_column} > {previous_value}
+          order by track_column
+          limit size {batch_size}
+        And fill track_column, previous_value, batch_size
+        """
+
+        track_column = self.record.parameters['track_column']
+
+        query = Select(
+            targets=[Star()],
+            from_table=query,
+            order_by=[OrderBy(Identifier(track_column))],
+            limit=Constant(self.batch_size)
+        )
+
+        track_value = self.record.context.get('track_value')
+        # is it different step?
+        cur_step_num = self.record.context.get('step_num')
+        if cur_step_num is not None and cur_step_num != step_num:
+            # reset track_value
+            track_value = None
+            self.record.context['track_value'] = None
+            self.record.context['step_num'] = step_num
+            flag_modified(self.record, 'context')
+            db.session.commit()
+
+        if track_value is not None:
+            query.where = BinaryOperation(
+                op='>',
+                args=[Identifier(track_column), Constant(track_value)],
+            )
+
+        return query
+
+    def set_params(self, params: dict):
+        """
+        Store parameters of the step which is about to be split into partitions
+        """
+
+        if 'track_column' not in params:
+            raise ValueError('Track column is not defined')
+        if 'batch_size' not in params:
+            params['batch_size'] = 1000
+
+        self.record.parameters = params
+        self.batch_size = self.record.parameters['batch_size']
+        db.session.commit()
+
+    def get_max_track_value(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        return max value to use in `set_progress`.
+        this function is called before execution substeps,
+        `set_progress` function - after
+        """
+
+        track_column = self.record.parameters['track_column']
+        return df[track_column].max()
+
+    def set_progress(self, df: pd.DataFrame, max_track_value: int):
+        """
+        Store progres of the query, it is called after processing of batch
+        """
+
+        if len(df) == 0:
+            return
+
+        self.record.processed_rows = self.record.processed_rows + len(df)
+
+        cur_value = self.record.context.get('track_value')
+        new_value = max_track_value
+        if new_value is not None:
+            if cur_value is None or new_value > cur_value:
+                self.record.context['track_value'] = new_value
+                flag_modified(self.record, 'context')
+
+        db.session.commit()
+
+    def on_error(self, error: Exception, step_num: int, steps_data: dict):
+        """
+        Saves error of the query in database
+        Also saves step data and current step num to be able to resume query
+        """
+        self.record.error = str(error)
+        self.record.context['step_num'] = step_num
+        flag_modified(self.record, 'context')
+
+        # save steps_data
+        cache = get_cache('steps_data')
+        data = pickle.dumps(steps_data, protocol=5)
+        cache.set(str(self.record.id), data)
+
+        db.session.commit()
+
+    def clear_error(self):
+        """
+        Reset error of the query in database
+        """
+
+        if self.record.error is not None:
+            self.record.error = None
+            db.session.commit()
+
+    def get_state(self) -> dict:
+        """
+        Returns stored state for resuming the query
+        """
+        cache = get_cache('steps_data')
+        key = self.record.id
+        data = cache.get(key)
+        cache.delete(key)
+
+        steps_data = pickle.loads(data)
+
+        return {
+            'step_num': self.record.context.get('step_num'),
+            'steps_data': steps_data,
+        }
+
+    def finish(self):
+        """
+        Mark query as finished
+        """
+
+        self.record.finished_at = dt.datetime.now()
+        db.session.commit()
+
+
 class QueryContextController:
     IGNORE_CONTEXT = '<IGNORE>'


@@ -156,10 +303,12 @@ class QueryContextController:
         last_values = {}
         for query, info in l_query.get_init_queries():

-            data, columns_info = dn.query(
+            response = dn.query(
                 query=query,
                 session=session
             )
+            data = response.data_frame
+            columns_info = response.columns

             if len(data) == 0:
                 value = None

@@ -285,5 +434,79 @@ class QueryContextController:
         rec.values = values
         db.session.commit()

+    def get_query(self, query_id: int) -> RunningQuery:
+        """
+        Get running query by id
+        """
+
+        rec = db.Queries.query.filter(
+            db.Queries.id == query_id,
+            db.Queries.company_id == ctx.company_id
+        ).first()
+
+        if rec is None:
+            raise RuntimeError(f'Query not found: {query_id}')
+        return RunningQuery(rec)
+
+    def create_query(self, query: ASTNode) -> RunningQuery:
+        """
+        Create a new running query from AST query
+        """
+
+        # remove old queries
+        remove_query = db.session.query(db.Queries).filter(
+            db.Queries.company_id == ctx.company_id,
+            db.Queries.finished_at < (dt.datetime.now() - dt.timedelta(days=1))
+        )
+        for rec in remove_query.all():
+            db.session.delete(rec)
+
+        rec = db.Queries(
+            sql=str(query),
+            company_id=ctx.company_id,
+        )
+
+        db.session.add(rec)
+        db.session.commit()
+        return RunningQuery(rec)
+
+    def list_queries(self) -> List[dict]:
+        """
+        Get list of all running queries with metadata
+        """
+
+        query = db.session.query(db.Queries).filter(
+            db.Queries.company_id == ctx.company_id
+        )
+        return [
+            {
+                'id': record.id,
+                'sql': record.sql,
+                'started_at': record.started_at,
+                'finished_at': record.finished_at,
+                'parameters': record.parameters,
+                'context': record.context,
+                'processed_rows': record.processed_rows,
+                'error': record.error,
+                'updated_at': record.updated_at,
+            }
+            for record in query
+        ]
+
+    def cancel_query(self, query_id: int):
+        """
+        Cancels running query by id
+        """
+        rec = db.Queries.query.filter(
+            db.Queries.id == query_id,
+            db.Queries.company_id == ctx.company_id
+        ).first()
+        if rec is None:
+            raise RuntimeError(f'Query not found: {query_id}')
+
+        # the query in progress will fail when it tries to update status
+        db.session.delete(rec)
+        db.session.commit()
+

 query_context_controller = QueryContextController()
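Taken together, the RunningQuery additions give the executor a resumable, batched fetch loop: get_partition_query wraps a step's query with a track-column filter and limit, set_progress persists the high-water mark and row count, and on_error/get_state stash step data so a failed query can be resumed. A minimal sketch of how these pieces compose (this is not MindsDB's own executor code; `fetch` is a hypothetical stand-in for the datanode call that returns a DataFrame for each partition query):

from mindsdb_sql_parser import parse_sql
from mindsdb.interfaces.query_context.context_controller import query_context_controller

def run_partitioned(sql: str, fetch, step_num: int = 1):
    query = parse_sql(sql)
    running = query_context_controller.create_query(query)
    # track_column drives the `where {track_column} > {previous_value}` wrapper
    running.set_params({'track_column': 'id', 'batch_size': 1000})
    try:
        while True:
            part_query = running.get_partition_query(step_num, query)
            df = fetch(part_query)  # hypothetical: execute against the data source
            if len(df) == 0:
                break
            max_value = running.get_max_track_value(df)
            # ... process the batch here ...
            running.set_progress(df, max_value)  # persists track_value / processed_rows
    except Exception as e:
        running.on_error(e, step_num, steps_data={})  # step state is cached for resume
        raise
    running.finish()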
mindsdb/interfaces/storage/db.py CHANGED

@@ -523,6 +523,7 @@ class KnowledgeBase(Base):
     embedding_model = relationship(
         "Predictor", foreign_keys=[embedding_model_id], doc="embedding model"
     )
+    query_id = Column(Integer, nullable=True)

     created_at = Column(DateTime, default=datetime.datetime.now)
     updated_at = Column(

@@ -564,6 +565,28 @@ class QueryContext(Base):
     created_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now)


+class Queries(Base):
+    __tablename__ = "queries"
+    id: int = Column(Integer, primary_key=True)
+    company_id: int = Column(Integer, nullable=True)
+
+    sql: str = Column(String, nullable=False)
+    # step_data: JSON = Column(JSON, nullable=True)
+
+    started_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now)
+    finished_at: datetime.datetime = Column(DateTime)
+
+    parameters = Column(JSON, default={})
+    context = Column(JSON, default={})
+    processed_rows = Column(Integer, default=0)
+    error: str = Column(String, nullable=True)
+
+    updated_at: datetime.datetime = Column(
+        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
+    )
+    created_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now)
+
+
 class LLMLog(Base):
     __tablename__ = "llm_log"
     id: int = Column(Integer, primary_key=True)
mindsdb/migrations/versions/2025-03-21_fda503400e43_queries.py ADDED

@@ -0,0 +1,45 @@
+"""queries
+
+Revision ID: fda503400e43
+Revises: 11347c213b36
+Create Date: 2025-03-21 18:50:20.795930
+
+"""
+from alembic import op
+import sqlalchemy as sa
+import mindsdb.interfaces.storage.db  # noqa
+
+
+# revision identifiers, used by Alembic.
+revision = 'fda503400e43'
+down_revision = '11347c213b36'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    op.create_table(
+        'queries',
+        sa.Column('id', sa.Integer(), nullable=False),
+        sa.Column('company_id', sa.Integer(), nullable=True),
+        sa.Column('sql', sa.String(), nullable=False),
+        sa.Column('started_at', sa.DateTime(), nullable=True),
+        sa.Column('finished_at', sa.DateTime(), nullable=True),
+        sa.Column('parameters', sa.JSON(), nullable=True),
+        sa.Column('context', sa.JSON(), nullable=True),
+        sa.Column('processed_rows', sa.Integer(), nullable=True),
+        sa.Column('error', sa.String(), nullable=True),
+        sa.Column('updated_at', sa.DateTime(), nullable=True),
+        sa.Column('created_at', sa.DateTime(), nullable=True),
+        sa.PrimaryKeyConstraint('id')
+    )
+
+    with op.batch_alter_table('knowledge_base', schema=None) as batch_op:
+        batch_op.add_column(sa.Column('query_id', sa.INTEGER(), nullable=True))
+
+
+def downgrade():
+    with op.batch_alter_table('knowledge_base', schema=None) as batch_op:
+        batch_op.drop_column('query_id')
+
+    op.drop_table('queries')
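This is a standard Alembic revision, so it follows the usual upgrade flow. A hedged sketch of applying it programmatically; the alembic.ini path is an assumption, and MindsDB normally applies pending revisions itself on startup:

from alembic import command
from alembic.config import Config as AlembicConfig

# path to MindsDB's migration environment is an assumption
cfg = AlembicConfig('mindsdb/migrations/alembic.ini')
command.upgrade(cfg, 'head')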
mindsdb/utilities/context_executor.py CHANGED

@@ -43,7 +43,7 @@ def execute_in_threads(func, tasks, thread_count=3, queue_size_k=1.5):
     for i in range(queue_size):
         try:
            args = next(tasks)
-            futures.append(executor.submit(func, *args))
+            futures.append(executor.submit(func, args))
        except StopIteration:
            break

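This one-line change reflects a new task shape: per the partitioning.py diff below, each task is now a single DataFrame chunk rather than an argument list, so it is passed to the callback as one positional argument. A toy illustration of the calling-convention difference:

from concurrent.futures import ThreadPoolExecutor

def callback(df):
    return len(df)

with ThreadPoolExecutor(max_workers=2) as executor:
    # old convention: executor.submit(func, *args) unpacked a task like [df]
    # new convention: executor.submit(func, args) passes the task object as-is
    future = executor.submit(callback, [1, 2, 3])
    assert future.result() == 3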
mindsdb/utilities/partitioning.py CHANGED

@@ -6,6 +6,35 @@ from mindsdb.utilities.config import Config
 from mindsdb.utilities.context_executor import execute_in_threads


+def get_max_thread_count() -> int:
+    """
+    Calculate the maximum number of threads allowed for the system.
+    """
+    # workers count
+    is_cloud = Config().is_cloud
+    if is_cloud:
+        max_threads = int(os.getenv('MINDSDB_MAX_PARTITIONING_THREADS', 10))
+    else:
+        max_threads = os.cpu_count() - 3
+
+    if max_threads < 1:
+        max_threads = 1
+
+    return max_threads
+
+
+def split_data_frame(df: pd.DataFrame, partition_size: int) -> Iterable[pd.DataFrame]:
+    """
+    Split data frame into chunks with partition_size and yield them out
+    """
+    chunk = 0
+    while chunk * partition_size < len(df):
+        # create results with partition
+        df1 = df.iloc[chunk * partition_size: (chunk + 1) * partition_size]
+        chunk += 1
+        yield df1
+
+
 def process_dataframe_in_partitions(df: pd.DataFrame, callback: Callable, partition_size: int) -> Iterable:
     """
     Splits dataframe into partitions and apply callback on each partition

@@ -17,35 +46,21 @@ def process_dataframe_in_partitions(df: pd.DataFrame, callback: Callable, partit
     """

     # tasks
-    def split_data_f(df):
-        chunk = 0
-        while chunk * partition_size < len(df):
-            # create results with partition
-            df1 = df.iloc[chunk * partition_size: (chunk + 1) * partition_size]
-            chunk += 1
-            yield [df1]

-    tasks = split_data_f(df)
+    tasks = split_data_frame(df, partition_size)

-
-    is_cloud = Config().is_cloud
-    if is_cloud:
-        max_threads = int(os.getenv('MINDSDB_MAX_PARTITIONING_THREADS', 10))
-    else:
-        max_threads = os.cpu_count() - 2
+    max_threads = get_max_thread_count()

-    # don't exceed chunk_count
     chunk_count = int(len(df) / partition_size)
-    if chunk_count > 0:
-        max_threads = min(max_threads, chunk_count)
-    if max_threads < 1:
-        max_threads = 1
+    # don't exceed chunk_count
+    if chunk_count > 0:
+        max_threads = min(max_threads, chunk_count)

     if max_threads == 1:
         # don't spawn threads

         for task in tasks:
-            yield callback(*task)
+            yield callback(task)

     else:
         for result in execute_in_threads(callback, tasks, thread_count=max_threads):
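A minimal usage sketch of the refactored helpers, mirroring the behavior shown in the diff (split_data_frame needs only pandas; process_dataframe_in_partitions additionally assumes a configured MindsDB environment, since get_max_thread_count reads Config()):

import pandas as pd
from mindsdb.utilities.partitioning import split_data_frame, process_dataframe_in_partitions

df = pd.DataFrame({'id': range(10)})

# split_data_frame yields plain DataFrame chunks (no longer wrapped in lists)
sizes = [len(chunk) for chunk in split_data_frame(df, partition_size=4)]
assert sizes == [4, 4, 2]

# process_dataframe_in_partitions applies the callback per chunk, threading
# only when get_max_thread_count() and the chunk count allow more than one
results = list(process_dataframe_in_partitions(df, callback=len, partition_size=4))
assert sum(results) == len(df)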