MindsDB 25.4.2.0__py3-none-any.whl → 25.4.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic. Click here for more details.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +30 -7
- mindsdb/api/executor/command_executor.py +29 -0
- mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +3 -2
- mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +43 -1
- mindsdb/api/executor/planner/plan_join.py +1 -1
- mindsdb/api/executor/planner/query_plan.py +1 -0
- mindsdb/api/executor/planner/query_planner.py +86 -14
- mindsdb/api/executor/planner/steps.py +9 -1
- mindsdb/api/executor/sql_query/sql_query.py +37 -6
- mindsdb/api/executor/sql_query/steps/__init__.py +1 -0
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +231 -0
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +2 -1
- mindsdb/integrations/handlers/langchain_embedding_handler/langchain_embedding_handler.py +17 -16
- mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +1 -0
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +7 -11
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +28 -4
- mindsdb/integrations/libs/llm/config.py +11 -1
- mindsdb/integrations/libs/llm/utils.py +12 -0
- mindsdb/integrations/libs/vectordatabase_handler.py +9 -1
- mindsdb/integrations/utilities/rag/rerankers/reranker_compressor.py +1 -1
- mindsdb/interfaces/agents/constants.py +12 -1
- mindsdb/interfaces/agents/langchain_agent.py +6 -0
- mindsdb/interfaces/database/projects.py +7 -1
- mindsdb/interfaces/knowledge_base/controller.py +166 -74
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +43 -62
- mindsdb/interfaces/knowledge_base/utils.py +28 -0
- mindsdb/interfaces/query_context/context_controller.py +221 -0
- mindsdb/interfaces/storage/db.py +23 -0
- mindsdb/migrations/versions/2025-03-21_fda503400e43_queries.py +45 -0
- mindsdb/utilities/auth.py +5 -1
- mindsdb/utilities/cache.py +4 -1
- mindsdb/utilities/context_executor.py +1 -1
- mindsdb/utilities/partitioning.py +35 -20
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/METADATA +221 -219
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/RECORD +39 -36
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.4.2.0.dist-info → mindsdb-25.4.3.0.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,17 @@
|
|
|
1
1
|
from typing import List
|
|
2
|
+
import pickle
|
|
3
|
+
import datetime as dt
|
|
2
4
|
|
|
5
|
+
from sqlalchemy.orm.attributes import flag_modified
|
|
3
6
|
import pandas as pd
|
|
4
7
|
|
|
8
|
+
from mindsdb_sql_parser import Select, Star, OrderBy
|
|
9
|
+
|
|
5
10
|
from mindsdb_sql_parser.ast import (
|
|
6
11
|
Identifier, BinaryOperation, Last, Constant, ASTNode
|
|
7
12
|
)
|
|
8
13
|
from mindsdb.integrations.utilities.query_traversal import query_traversal
|
|
14
|
+
from mindsdb.utilities.cache import get_cache
|
|
9
15
|
|
|
10
16
|
from mindsdb.interfaces.storage import db
|
|
11
17
|
from mindsdb.utilities.context import context as ctx
|
|
@@ -13,6 +19,147 @@ from mindsdb.utilities.context import context as ctx
|
|
|
13
19
|
from .last_query import LastQuery
|
|
14
20
|
|
|
15
21
|
|
|
22
|
+
class RunningQuery:
    """
    Query in progress.

    Wraps a `db.Queries` record and persists the query's partitioning
    parameters, progress, error and resume state through `db.session`.
    """

    def __init__(self, record: db.Queries):
        self.record = record
        self.sql = record.sql
        # Restore the batch size persisted by an earlier `set_params` call so that
        # an instance re-created from a stored record can build partition queries.
        # It stays None until `set_params` has been called for the record.
        self.batch_size = (record.parameters or {}).get('batch_size')

    def get_partition_query(self, step_num: int, query: Select) -> Select:
        """
        Generate query for fetching the next partition.

        It wraps query to
            select * from ({query})
            where {track_column} > {previous_value}
            order by track_column
            limit size {batch_size}
        and fills track_column, previous_value and batch_size.

        The `where` clause is added only when a previous track value is stored.
        If the stored step number differs from `step_num`, the stored track value
        is reset and the new step number is committed.

        :param step_num: number of the step that is being partitioned
        :param query: query to wrap
        :return: wrapping query that selects the next partition
        """

        track_column = self.record.parameters['track_column']

        query = Select(
            targets=[Star()],
            from_table=query,
            order_by=[OrderBy(Identifier(track_column))],
            limit=Constant(self.batch_size)
        )

        track_value = self.record.context.get('track_value')
        # is it different step?
        cur_step_num = self.record.context.get('step_num')
        if cur_step_num is not None and cur_step_num != step_num:
            # reset track_value
            track_value = None
            self.record.context['track_value'] = None
            self.record.context['step_num'] = step_num
            # in-place changes of a JSON column have to be flagged explicitly
            flag_modified(self.record, 'context')
            db.session.commit()

        if track_value is not None:
            query.where = BinaryOperation(
                op='>',
                args=[Identifier(track_column), Constant(track_value)],
            )

        return query

    def set_params(self, params: dict):
        """
        Store parameters of the step which is about to be split into partitions.

        Note: `params` is updated in place, 'batch_size' is set to 1000 if missing.

        :param params: step parameters, 'track_column' is required
        :raises ValueError: if 'track_column' is not in params
        """

        if 'track_column' not in params:
            raise ValueError('Track column is not defined')
        params.setdefault('batch_size', 1000)

        self.record.parameters = params
        self.batch_size = self.record.parameters['batch_size']
        db.session.commit()

    def get_max_track_value(self, df: pd.DataFrame):
        """
        Return max value of the track column to use in `set_progress`.

        This function is called before execution of substeps,
        `set_progress` function - after.

        :param df: dataframe with the fetched partition
        :return: result of `max()` over the track column (a scalar, not a dataframe)
        """

        track_column = self.record.parameters['track_column']
        return df[track_column].max()

    def set_progress(self, df: pd.DataFrame, max_track_value: int):
        """
        Store progress of the query, it is called after processing of batch.

        Does nothing for an empty dataframe. The stored track value is only
        moved forward, never backward.

        :param df: processed partition
        :param max_track_value: value returned by `get_max_track_value` for the partition
        """

        if len(df) == 0:
            return

        # the column is nullable, treat a missing counter as zero
        self.record.processed_rows = (self.record.processed_rows or 0) + len(df)

        cur_value = self.record.context.get('track_value')
        new_value = max_track_value
        if new_value is not None:
            if cur_value is None or new_value > cur_value:
                self.record.context['track_value'] = new_value
                flag_modified(self.record, 'context')

        db.session.commit()

    def on_error(self, error: Exception, step_num: int, steps_data: dict):
        """
        Save error of the query in database.

        Also saves step data (pickled, in the 'steps_data' cache) and current
        step num to be able to resume the query.

        :param error: exception that interrupted the query
        :param step_num: number of the step where the error happened
        :param steps_data: results of the steps, to be restored by `get_state`
        """
        self.record.error = str(error)
        self.record.context['step_num'] = step_num
        flag_modified(self.record, 'context')

        # save steps_data
        cache = get_cache('steps_data')
        data = pickle.dumps(steps_data, protocol=5)
        cache.set(str(self.record.id), data)

        db.session.commit()

    def clear_error(self):
        """
        Reset error of the query in database.
        """

        if self.record.error is not None:
            self.record.error = None
            db.session.commit()

    def get_state(self) -> dict:
        """
        Return stored state for resuming the query.

        The cached steps data is removed from the cache once it is read.

        :return: dict with 'step_num' and 'steps_data'
        :raises RuntimeError: if no steps data is stored for the query
        """
        cache = get_cache('steps_data')
        # `on_error` stores the data under the string form of the id, use the same key here
        key = str(self.record.id)
        data = cache.get(key)
        cache.delete(key)

        if data is None:
            raise RuntimeError(f'Stored state is not found for query: {self.record.id}')

        # NOTE(review): pickle is only safe here as long as the cache storage is trusted;
        # the payload is expected to be the one written by `on_error`.
        steps_data = pickle.loads(data)

        return {
            'step_num': self.record.context.get('step_num'),
            'steps_data': steps_data,
        }

    def finish(self):
        """
        Mark query as finished.
        """

        self.record.finished_at = dt.datetime.now()
        db.session.commit()
|
|
161
|
+
|
|
162
|
+
|
|
16
163
|
class QueryContextController:
|
|
17
164
|
IGNORE_CONTEXT = '<IGNORE>'
|
|
18
165
|
|
|
@@ -287,5 +434,79 @@ class QueryContextController:
|
|
|
287
434
|
rec.values = values
|
|
288
435
|
db.session.commit()
|
|
289
436
|
|
|
437
|
+
def get_query(self, query_id: int) -> RunningQuery:
    """
    Look up a query record by id within the current company and wrap it.

    :param query_id: id of the query record
    :return: RunningQuery built on top of the found record
    :raises RuntimeError: if there is no such record for the current company
    """
    conditions = (
        db.Queries.id == query_id,
        db.Queries.company_id == ctx.company_id,
    )
    record = db.Queries.query.filter(*conditions).first()

    if record is not None:
        return RunningQuery(record)
    raise RuntimeError(f'Query not found: {query_id}')
|
|
450
|
+
|
|
451
|
+
def create_query(self, query: ASTNode) -> RunningQuery:
    """
    Register a new running query for the given AST query.

    Before the new record is added, records of the current company that
    finished more than a day ago are deleted; everything is committed at once.

    :param query: AST of the query, stored as its string representation
    :return: RunningQuery built on top of the new record
    """

    # drop records that finished more than one day ago
    outdated = db.session.query(db.Queries).filter(
        db.Queries.company_id == ctx.company_id,
        db.Queries.finished_at < (dt.datetime.now() - dt.timedelta(days=1))
    ).all()
    for old_record in outdated:
        db.session.delete(old_record)

    new_record = db.Queries(sql=str(query), company_id=ctx.company_id)
    db.session.add(new_record)
    db.session.commit()

    return RunningQuery(new_record)
|
|
472
|
+
|
|
473
|
+
def list_queries(self) -> List[dict]:
    """
    Collect metadata of all query records of the current company.

    :return: one dict per record with the record's columns listed below
    """

    exported_fields = (
        'id',
        'sql',
        'started_at',
        'finished_at',
        'parameters',
        'context',
        'processed_rows',
        'error',
        'updated_at',
    )

    records = db.session.query(db.Queries).filter(
        db.Queries.company_id == ctx.company_id
    )

    result = []
    for record in records:
        result.append({name: getattr(record, name) for name in exported_fields})
    return result
|
|
495
|
+
|
|
496
|
+
def cancel_query(self, query_id: int):
    """
    Cancel a running query by deleting its record.

    :param query_id: id of the query record
    :raises RuntimeError: if there is no such record for the current company
    """
    conditions = (
        db.Queries.id == query_id,
        db.Queries.company_id == ctx.company_id,
    )
    record = db.Queries.query.filter(*conditions).first()
    if record is None:
        raise RuntimeError(f'Query not found: {query_id}')

    # the query in progress will fail when it tries to update status
    db.session.delete(record)
    db.session.commit()
|
|
510
|
+
|
|
290
511
|
|
|
291
512
|
query_context_controller = QueryContextController()
|
mindsdb/interfaces/storage/db.py
CHANGED
|
@@ -523,6 +523,7 @@ class KnowledgeBase(Base):
|
|
|
523
523
|
embedding_model = relationship(
|
|
524
524
|
"Predictor", foreign_keys=[embedding_model_id], doc="embedding model"
|
|
525
525
|
)
|
|
526
|
+
query_id = Column(Integer, nullable=True)
|
|
526
527
|
|
|
527
528
|
created_at = Column(DateTime, default=datetime.datetime.now)
|
|
528
529
|
updated_at = Column(
|
|
@@ -564,6 +565,28 @@ class QueryContext(Base):
|
|
|
564
565
|
created_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now)
|
|
565
566
|
|
|
566
567
|
|
|
568
|
+
class Queries(Base):
    # Persistent record of a query in progress: SQL text, timing,
    # parameters, resume context, processed rows counter and last error.
    __tablename__ = "queries"
    id: int = Column(Integer, primary_key=True)
    company_id: int = Column(Integer, nullable=True)

    sql: str = Column(String, nullable=False)

    started_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now)
    finished_at: datetime.datetime = Column(DateTime)

    # `dict` is passed as a callable so that every new row gets its own empty dict
    # instead of all rows sharing one mutable `{}` default object.
    parameters = Column(JSON, default=dict)
    context = Column(JSON, default=dict)
    processed_rows = Column(Integer, default=0)
    error: str = Column(String, nullable=True)

    updated_at: datetime.datetime = Column(
        DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
    )
    created_at: datetime.datetime = Column(DateTime, default=datetime.datetime.now)
|
|
588
|
+
|
|
589
|
+
|
|
567
590
|
class LLMLog(Base):
|
|
568
591
|
__tablename__ = "llm_log"
|
|
569
592
|
id: int = Column(Integer, primary_key=True)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""queries
|
|
2
|
+
|
|
3
|
+
Revision ID: fda503400e43
|
|
4
|
+
Revises: 11347c213b36
|
|
5
|
+
Create Date: 2025-03-21 18:50:20.795930
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
from alembic import op
|
|
9
|
+
import sqlalchemy as sa
|
|
10
|
+
import mindsdb.interfaces.storage.db # noqa
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# revision identifiers, used by Alembic.
|
|
14
|
+
revision = 'fda503400e43'
|
|
15
|
+
down_revision = '11347c213b36'
|
|
16
|
+
branch_labels = None
|
|
17
|
+
depends_on = None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def upgrade():
    """Create the `queries` table and add `knowledge_base.query_id`."""
    queries_columns = [
        sa.Column('id', sa.Integer(), nullable=False),
        sa.Column('company_id', sa.Integer(), nullable=True),
        sa.Column('sql', sa.String(), nullable=False),
        sa.Column('started_at', sa.DateTime(), nullable=True),
        sa.Column('finished_at', sa.DateTime(), nullable=True),
        sa.Column('parameters', sa.JSON(), nullable=True),
        sa.Column('context', sa.JSON(), nullable=True),
        sa.Column('processed_rows', sa.Integer(), nullable=True),
        sa.Column('error', sa.String(), nullable=True),
        sa.Column('updated_at', sa.DateTime(), nullable=True),
        sa.Column('created_at', sa.DateTime(), nullable=True),
    ]
    op.create_table('queries', *queries_columns, sa.PrimaryKeyConstraint('id'))

    query_id_column = sa.Column('query_id', sa.INTEGER(), nullable=True)
    with op.batch_alter_table('knowledge_base', schema=None) as batch_op:
        batch_op.add_column(query_id_column)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def downgrade():
    """Revert `upgrade`: drop `knowledge_base.query_id`, then the `queries` table."""
    with op.batch_alter_table('knowledge_base', schema=None) as kb_table:
        kb_table.drop_column('query_id')

    op.drop_table('queries')
|
mindsdb/utilities/auth.py
CHANGED
|
@@ -15,9 +15,11 @@ def get_aws_meta_data() -> dict:
|
|
|
15
15
|
'ami-id': None,
|
|
16
16
|
'instance-id': None
|
|
17
17
|
}
|
|
18
|
+
aws_token = requests.put("http://169.254.169.254/latest/api/token", headers={'X-aws-ec2-metadata-token-ttl-seconds': '30'}).text
|
|
18
19
|
for key in aws_meta_data.keys():
|
|
19
20
|
resp = requests.get(
|
|
20
21
|
f'http://169.254.169.254/latest/meta-data/{key}',
|
|
22
|
+
headers={'X-aws-ec2-metadata-token': aws_token},
|
|
21
23
|
timeout=1
|
|
22
24
|
)
|
|
23
25
|
if resp.status_code != 200:
|
|
@@ -35,7 +37,9 @@ def register_oauth_client():
|
|
|
35
37
|
aws_meta_data = get_aws_meta_data()
|
|
36
38
|
|
|
37
39
|
current_aws_meta_data = config.get('aws_meta_data', {})
|
|
38
|
-
oauth_meta = config.get('auth', {}).get('oauth'
|
|
40
|
+
oauth_meta = config.get('auth', {}).get('oauth')
|
|
41
|
+
if oauth_meta is None:
|
|
42
|
+
return
|
|
39
43
|
|
|
40
44
|
public_hostname = aws_meta_data['public-hostname']
|
|
41
45
|
if (
|
mindsdb/utilities/cache.py
CHANGED
|
@@ -56,6 +56,7 @@ import os
|
|
|
56
56
|
import time
|
|
57
57
|
from abc import ABC
|
|
58
58
|
from pathlib import Path
|
|
59
|
+
import re
|
|
59
60
|
import hashlib
|
|
60
61
|
import typing as t
|
|
61
62
|
|
|
@@ -154,7 +155,9 @@ class FileCache(BaseCache):
|
|
|
154
155
|
pass
|
|
155
156
|
|
|
156
157
|
def file_path(self, name):
    """
    Build the path of the cache file for the given key.

    :param name: cache key; non-string keys (e.g. integer ids) are converted to str
    :return: path inside `self.path` with the sanitized key as file name
    """
    # Sanitize the key to avoid table (file) names with backticks and slashes.
    # `re.sub` accepts only str/bytes, so the key is converted to str first.
    sanitized_name = re.sub(r'[^\w\-.]', '_', str(name))
    return self.path / sanitized_name
|
|
158
161
|
|
|
159
162
|
def set_df(self, name, df):
|
|
160
163
|
path = self.file_path(name)
|
|
@@ -43,7 +43,7 @@ def execute_in_threads(func, tasks, thread_count=3, queue_size_k=1.5):
|
|
|
43
43
|
for i in range(queue_size):
|
|
44
44
|
try:
|
|
45
45
|
args = next(tasks)
|
|
46
|
-
futures.append(executor.submit(func,
|
|
46
|
+
futures.append(executor.submit(func, args))
|
|
47
47
|
except StopIteration:
|
|
48
48
|
break
|
|
49
49
|
|
|
@@ -6,6 +6,35 @@ from mindsdb.utilities.config import Config
|
|
|
6
6
|
from mindsdb.utilities.context_executor import execute_in_threads
|
|
7
7
|
|
|
8
8
|
|
|
9
|
+
def get_max_thread_count() -> int:
    """
    Calculate the maximum number of threads allowed for the system.

    In cloud mode the limit is read from the MINDSDB_MAX_PARTITIONING_THREADS
    environment variable (10 if it is not set); otherwise it is the CPU count
    minus 3.

    :return: thread limit, never less than 1
    """
    # workers count
    if Config().is_cloud:
        max_threads = int(os.getenv('MINDSDB_MAX_PARTITIONING_THREADS', 10))
    else:
        # os.cpu_count() returns None when the number of CPUs can't be determined
        max_threads = (os.cpu_count() or 1) - 3

    return max(max_threads, 1)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def split_data_frame(df: pd.DataFrame, partition_size: int) -> Iterable[pd.DataFrame]:
    """
    Split data frame into chunks with partition_size and yield them out.

    The last chunk may be shorter; nothing is yielded for an empty data frame.

    :param df: data frame to split
    :param partition_size: max count of rows in a chunk, has to be positive
    :raises ValueError: if partition_size is less than 1 (raised on first iteration)
    """
    # a non-positive size would never advance through the data frame
    if partition_size < 1:
        raise ValueError(f'partition_size must be a positive integer, got {partition_size}')

    for start in range(0, len(df), partition_size):
        # create results with partition
        yield df.iloc[start: start + partition_size]
|
|
36
|
+
|
|
37
|
+
|
|
9
38
|
def process_dataframe_in_partitions(df: pd.DataFrame, callback: Callable, partition_size: int) -> Iterable:
|
|
10
39
|
"""
|
|
11
40
|
Splits dataframe into partitions and apply callback on each partition
|
|
@@ -17,35 +46,21 @@ def process_dataframe_in_partitions(df: pd.DataFrame, callback: Callable, partit
|
|
|
17
46
|
"""
|
|
18
47
|
|
|
19
48
|
# tasks
|
|
20
|
-
def split_data_f(df):
|
|
21
|
-
chunk = 0
|
|
22
|
-
while chunk * partition_size < len(df):
|
|
23
|
-
# create results with partition
|
|
24
|
-
df1 = df.iloc[chunk * partition_size: (chunk + 1) * partition_size]
|
|
25
|
-
chunk += 1
|
|
26
|
-
yield [df1]
|
|
27
49
|
|
|
28
|
-
tasks =
|
|
50
|
+
tasks = split_data_frame(df, partition_size)
|
|
29
51
|
|
|
30
|
-
|
|
31
|
-
is_cloud = Config().is_cloud
|
|
32
|
-
if is_cloud:
|
|
33
|
-
max_threads = int(os.getenv('MINDSDB_MAX_PARTITIONING_THREADS', 10))
|
|
34
|
-
else:
|
|
35
|
-
max_threads = os.cpu_count() - 2
|
|
52
|
+
max_threads = get_max_thread_count()
|
|
36
53
|
|
|
37
|
-
# don't exceed chunk_count
|
|
38
54
|
chunk_count = int(len(df) / partition_size)
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
max_threads = 1
|
|
55
|
+
# don't exceed chunk_count
|
|
56
|
+
if chunk_count > 0:
|
|
57
|
+
max_threads = min(max_threads, chunk_count)
|
|
43
58
|
|
|
44
59
|
if max_threads == 1:
|
|
45
60
|
# don't spawn threads
|
|
46
61
|
|
|
47
62
|
for task in tasks:
|
|
48
|
-
yield callback(
|
|
63
|
+
yield callback(task)
|
|
49
64
|
|
|
50
65
|
else:
|
|
51
66
|
for result in execute_in_threads(callback, tasks, thread_count=max_threads):
|