MindsDB 25.5.4.1__py3-none-any.whl → 25.6.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/api/a2a/agent.py +28 -25
- mindsdb/api/a2a/common/server/server.py +32 -26
- mindsdb/api/a2a/run_a2a.py +1 -1
- mindsdb/api/executor/command_executor.py +69 -14
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +49 -65
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +29 -48
- mindsdb/api/executor/datahub/datanodes/system_tables.py +35 -61
- mindsdb/api/executor/planner/plan_join.py +67 -77
- mindsdb/api/executor/planner/query_planner.py +176 -155
- mindsdb/api/executor/planner/steps.py +37 -12
- mindsdb/api/executor/sql_query/result_set.py +45 -64
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +14 -18
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +17 -18
- mindsdb/api/executor/sql_query/steps/insert_step.py +13 -33
- mindsdb/api/executor/sql_query/steps/subselect_step.py +43 -35
- mindsdb/api/executor/utilities/sql.py +42 -48
- mindsdb/api/http/namespaces/config.py +1 -1
- mindsdb/api/http/namespaces/file.py +14 -23
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_datum.py +12 -28
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/binary_resultset_row_package.py +59 -50
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/resultset_row_package.py +9 -8
- mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +449 -461
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +87 -36
- mindsdb/integrations/handlers/file_handler/file_handler.py +15 -9
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +43 -24
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +10 -3
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +26 -33
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +74 -51
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +305 -98
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +53 -34
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +136 -6
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +334 -83
- mindsdb/integrations/libs/api_handler.py +261 -57
- mindsdb/integrations/libs/base.py +100 -29
- mindsdb/integrations/utilities/files/file_reader.py +99 -73
- mindsdb/integrations/utilities/handler_utils.py +23 -8
- mindsdb/integrations/utilities/sql_utils.py +35 -40
- mindsdb/interfaces/agents/agents_controller.py +196 -192
- mindsdb/interfaces/agents/constants.py +7 -1
- mindsdb/interfaces/agents/langchain_agent.py +42 -11
- mindsdb/interfaces/agents/mcp_client_agent.py +29 -21
- mindsdb/interfaces/data_catalog/__init__.py +0 -0
- mindsdb/interfaces/data_catalog/base_data_catalog.py +54 -0
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +359 -0
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +34 -0
- mindsdb/interfaces/database/database.py +81 -57
- mindsdb/interfaces/database/integrations.py +220 -234
- mindsdb/interfaces/database/log.py +72 -104
- mindsdb/interfaces/database/projects.py +156 -193
- mindsdb/interfaces/file/file_controller.py +21 -65
- mindsdb/interfaces/knowledge_base/controller.py +63 -10
- mindsdb/interfaces/knowledge_base/evaluate.py +519 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +75 -0
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +83 -43
- mindsdb/interfaces/skills/skills_controller.py +54 -36
- mindsdb/interfaces/skills/sql_agent.py +109 -86
- mindsdb/interfaces/storage/db.py +223 -79
- mindsdb/migrations/versions/2025-05-28_a44643042fe8_added_data_catalog_tables.py +118 -0
- mindsdb/migrations/versions/2025-06-09_608e376c19a7_updated_data_catalog_data_types.py +58 -0
- mindsdb/utilities/config.py +9 -2
- mindsdb/utilities/log.py +35 -26
- mindsdb/utilities/ml_task_queue/task.py +19 -22
- mindsdb/utilities/render/sqlalchemy_render.py +129 -181
- mindsdb/utilities/starters.py +49 -1
- {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/METADATA +268 -268
- {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/RECORD +70 -62
- {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/top_level.txt +0 -0
mindsdb/api/executor/planner/steps.py:

```diff
@@ -9,7 +9,9 @@ class PlanStep:
     @property
     def result(self):
         if self.step_num is None:
-            raise PlanningException(
+            raise PlanningException(
+                f"Can't reference a step with no assigned step number. Tried to reference: {type(self)}"
+            )
         return Result(self.step_num)

     def __eq__(self, other):
@@ -18,7 +20,7 @@ class PlanStep:

         for k in vars(self):
             # skip result comparison
-            if k ==
+            if k == "result_data":
                 continue

             if getattr(self, k) != getattr(other, k):
@@ -28,8 +30,8 @@ class PlanStep:

     def __repr__(self):
         attrs_dict = vars(self)
-        attrs_str =
-        return f
+        attrs_str = ", ".join([f"{k}={str(v)}" for k, v in attrs_dict.items()])
+        return f"{self.__class__.__name__}({attrs_str})"

     def set_result(self, result):
         self.result_data = result
@@ -37,6 +39,7 @@ class PlanStep:

 class ProjectStep(PlanStep):
     """Selects columns from a dataframe"""
+
     def __init__(self, columns, dataframe, ignore_doubles=False, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.columns = columns
@@ -47,6 +50,7 @@ class ProjectStep(PlanStep):
 # TODO remove
 class FilterStep(PlanStep):
     """Filters some dataframe according to a query"""
+
     def __init__(self, dataframe, query, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.dataframe = dataframe
@@ -66,6 +70,7 @@ class GroupByStep(PlanStep):

 class JoinStep(PlanStep):
     """Joins two dataframes, producing a new dataframe"""
+
     def __init__(self, left, right, query, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.left = left
@@ -75,7 +80,8 @@ class JoinStep(PlanStep):

 class UnionStep(PlanStep):
     """Union of two dataframes, producing a new dataframe"""
-
+
+    def __init__(self, left, right, unique, operation="union", *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.left = left
         self.right = right
@@ -95,6 +101,7 @@ class OrderByStep(PlanStep):

 class LimitOffsetStep(PlanStep):
     """Applies limit and offset to a dataframe"""
+
     def __init__(self, dataframe, limit=None, offset=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.dataframe = dataframe
@@ -104,6 +111,7 @@ class LimitOffsetStep(PlanStep):

 class FetchDataframeStep(PlanStep):
     """Fetches a dataframe from external integration"""
+
     def __init__(self, integration, query=None, raw_query=None, params=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.integration = integration
@@ -114,15 +122,28 @@ class FetchDataframeStep(PlanStep):

 class FetchDataframeStepPartition(FetchDataframeStep):
     """Fetches a dataframe from external integration in partitions"""
-
+
+    def __init__(self, steps=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
-
+        if steps is None:
+            steps = []
+        self.steps = steps


 class ApplyPredictorStep(PlanStep):
     """Applies a mindsdb predictor on some dataframe and returns a new dataframe with predictions"""
-
-
+
+    def __init__(
+        self,
+        namespace,
+        predictor,
+        dataframe,
+        params: dict = None,
+        row_dict: dict = None,
+        columns_map: dict = None,
+        *args,
+        **kwargs,
+    ):
         super().__init__(*args, **kwargs)
         self.namespace = namespace
         self.predictor = predictor
@@ -149,6 +170,7 @@ class ApplyTimeseriesPredictorStep(ApplyPredictorStep):

 class ApplyPredictorRowStep(PlanStep):
     """Applies a mindsdb predictor to one row of values and returns a dataframe of one row, the predictor."""
+
     def __init__(self, namespace, predictor, row_dict, params=None, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.namespace = namespace
@@ -159,6 +181,7 @@ class ApplyPredictorRowStep(PlanStep):

 class GetPredictorColumns(PlanStep):
     """Returns an empty dataframe of shape and columns like predictor results."""
+
     def __init__(self, namespace, predictor, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.namespace = namespace
@@ -167,6 +190,7 @@ class GetPredictorColumns(PlanStep):

 class GetTableColumns(PlanStep):
     """Returns an empty dataframe of shape and columns like select from table."""
+
     def __init__(self, namespace, table, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.namespace = namespace
@@ -175,7 +199,8 @@ class GetTableColumns(PlanStep):

 class MapReduceStep(PlanStep):
     """Applies a step for each value in a list, and then reduces results to a single dataframe"""
-
+
+    def __init__(self, values, step, reduce="union", partition=None, *args, **kwargs):
         """
         :param values: input step data
         :param step: step to be applied
@@ -202,8 +227,8 @@ class MultipleSteps(PlanStep):
 class SaveToTable(PlanStep):
     def __init__(self, table, dataframe, is_replace=False, params=None, *args, **kwargs):
         """
-
-
+        Creates table if not exists and fills it with content of dataframe
+        is_replace - to drop table beforehand
         """
         super().__init__(*args, **kwargs)
         self.table = table
```
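Most of the churn in `steps.py` is formatter-driven (double-quoted strings, blank lines after class docstrings), but the `PlanStep.result` hunk also spells out the error raised when a step with no assigned number is referenced, naming the offending step type. A minimal, self-contained sketch of that contract; `PlanningException`, `Result`, and `PlanStep` below are stand-ins rather than the real planner imports:

```python
# Minimal sketch of the PlanStep.result contract from the diff above.
# PlanningException, Result and PlanStep are stand-ins for the real
# planner classes; only the raise-on-missing-step_num behavior is shown.


class PlanningException(Exception):
    pass


class Result:
    def __init__(self, step_num: int):
        self.step_num = step_num


class PlanStep:
    def __init__(self, step_num: int | None = None):
        self.step_num = step_num

    @property
    def result(self) -> Result:
        # step_num is assigned when the step is added to a query plan;
        # referencing an unassigned step is a planning error
        if self.step_num is None:
            raise PlanningException(
                f"Can't reference a step with no assigned step number. Tried to reference: {type(self)}"
            )
        return Result(self.step_num)


try:
    PlanStep().result
except PlanningException as e:
    print(e)
```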
mindsdb/api/executor/sql_query/result_set.py:

```diff
@@ -1,4 +1,6 @@
 import copy
+from array import array
+from typing import Any
 from dataclasses import dataclass, field, MISSING

 import numpy as np
@@ -49,6 +51,12 @@ def get_mysql_data_type_from_series(series: pd.Series, do_infer: bool = False) -
     return MYSQL_DATA_TYPE.TEXT


+def _dump_vector(value: Any) -> Any:
+    if isinstance(value, array):
+        return value.tolist()
+    return value
+
+
 @dataclass(kw_only=True, slots=True)
 class Column:
     name: str = field(default=MISSING)
@@ -70,7 +78,7 @@ class Column:
         table_name = self.table_name if self.table_alias is None else self.table_alias
         name = self.name if self.alias is None else self.alias

-        name = f
+        name = f"{prefix}_{table_name}_{name}"
         return name


@@ -95,7 +103,7 @@ class ResultSet:
         df: pd.DataFrame | None = None,
         affected_rows: int | None = None,
         is_prediction: bool = False,
-        mysql_types: list[MYSQL_DATA_TYPE] | None = None
+        mysql_types: list[MYSQL_DATA_TYPE] | None = None,
     ):
         """
         Args:
@@ -122,9 +130,9 @@ class ResultSet:
         self.mysql_types = mysql_types

     def __repr__(self):
-        col_names =
+        col_names = ", ".join([col.name for col in self._columns])

-        return f
+        return f"{self.__class__.__name__}({self.length()} rows, cols: {col_names})"

     def __len__(self) -> int:
         if self._df is None:
@@ -140,38 +148,30 @@ class ResultSet:

     @classmethod
     def from_df(
-        cls,
-
+        cls,
+        df: pd.DataFrame,
+        database=None,
+        table_name=None,
+        table_alias=None,
+        is_prediction: bool = False,
+        mysql_types: list[MYSQL_DATA_TYPE] | None = None,
     ):
         match mysql_types:
             case None:
                 mysql_types = [None] * len(df.columns)
             case list() if len(mysql_types) != len(df.columns):
-                raise WrongArgumentError(
-                    f'Mysql types length mismatch: {len(mysql_types)} != {len(df.columns)}'
-                )
+                raise WrongArgumentError(f"Mysql types length mismatch: {len(mysql_types)} != {len(df.columns)}")

         columns = [
-            Column(
-
-                table_name=table_name,
-                table_alias=table_alias,
-                database=database,
-                type=mysql_type
-            ) for column_name, mysql_type
-            in zip(df.columns, mysql_types)
+            Column(name=column_name, table_name=table_name, table_alias=table_alias, database=database, type=mysql_type)
+            for column_name, mysql_type in zip(df.columns, mysql_types)
         ]

         rename_df_columns(df)
-        return cls(
-            df=df,
-            columns=columns,
-            is_prediction=is_prediction,
-            mysql_types=mysql_types
-        )
+        return cls(df=df, columns=columns, is_prediction=is_prediction, mysql_types=mysql_types)

     @classmethod
-    def from_df_cols(cls, df: pd.DataFrame, columns_dict: dict[str, Column], strict: bool = True) ->
+    def from_df_cols(cls, df: pd.DataFrame, columns_dict: dict[str, Column], strict: bool = True) -> "ResultSet":
         """Create ResultSet from dataframe and dictionary of columns

         Args:
```
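The reworked `ResultSet.from_df` signature makes the parameters explicit and keeps the `match`-based check that `mysql_types`, when given, must line up one-to-one with the dataframe columns. A hedged sketch of just that validation step, with a stand-in `WrongArgumentError` in place of the real import:

```python
# Sketch of the mysql_types length validation in ResultSet.from_df above,
# with a stand-in exception class in place of WrongArgumentError.
import pandas as pd


class WrongArgumentError(ValueError):
    pass


def check_mysql_types(df: pd.DataFrame, mysql_types: list | None) -> list:
    match mysql_types:
        case None:
            # no types supplied: one None placeholder per dataframe column
            mysql_types = [None] * len(df.columns)
        case list() if len(mysql_types) != len(df.columns):
            raise WrongArgumentError(f"Mysql types length mismatch: {len(mysql_types)} != {len(df.columns)}")
    return mysql_types


df = pd.DataFrame({"a": [1], "b": [2]})
print(check_mysql_types(df, None))            # [None, None]
print(check_mysql_types(df, ["INT", "INT"]))  # ['INT', 'INT']
```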
The remainder of the `result_set.py` diff:

```diff
@@ -185,29 +185,18 @@ class ResultSet:
         Raises:
             ValueError: if a column is not found in columns_dict and strict is True
         """
-        alias_idx = {
-            column.alias: column
-            for column in columns_dict.values()
-            if column.alias is not None
-        }
+        alias_idx = {column.alias: column for column in columns_dict.values() if column.alias is not None}

         columns = []
         for column_name in df.columns:
             if strict and column_name not in columns_dict:
-                raise ValueError(f
-            column = (
-                columns_dict.get(column_name)
-                or alias_idx.get(column_name)
-                or Column(name=column_name)
-            )
+                raise ValueError(f"Column {column_name} not found in columns_dict")
+            column = columns_dict.get(column_name) or alias_idx.get(column_name) or Column(name=column_name)
             columns.append(column)

         rename_df_columns(df)

-        return cls(
-            columns=columns,
-            df=df
-        )
+        return cls(columns=columns, df=df)

     def to_df(self):
         columns_names = self.get_column_names()
@@ -215,7 +204,7 @@ class ResultSet:
         rename_df_columns(df, columns_names)
         return df

-    def to_df_cols(self, prefix: str =
+    def to_df_cols(self, prefix: str = "") -> tuple[pd.DataFrame, dict[str, Column]]:
         # returns dataframe and dict of columns
         # can be restored to ResultSet by from_df_cols method

@@ -235,7 +224,7 @@ class ResultSet:
     def get_tables(self):
         tables_idx = []
         tables = []
-        cols = [
+        cols = ["database", "table_name", "table_alias"]
         for col in self._columns:
             table = (col.database, col.table_name, col.table_alias)
             if table not in tables_idx:
@@ -258,7 +247,7 @@ class ResultSet:
                 col_idx = i
                 break
         if col_idx is None:
-            raise WrongArgumentError(f
+            raise WrongArgumentError(f"Column is not found: {col}")
         return col_idx

     def add_column(self, col, values=None):
@@ -281,10 +270,7 @@ class ResultSet:
         return self._columns

     def get_column_names(self):
-        columns = [
-            col.name if col.alias is None else col.alias
-            for col in self._columns
-        ]
+        columns = [col.name if col.alias is None else col.alias for col in self._columns]
         return columns

     def find_columns(self, alias=None, table_alias=None):
@@ -324,7 +310,7 @@ class ResultSet:

     def add_raw_df(self, df):
         if len(df.columns) != len(self._columns):
-            raise WrongArgumentError(f
+            raise WrongArgumentError(f"Record length mismatch columns length: {len(df.columns)} != {len(self.columns)}")

         rename_df_columns(df)

@@ -340,7 +326,7 @@ class ResultSet:
             convert_floating=True,
             infer_objects=False,
             convert_string=False,
-            convert_boolean=False
+            convert_boolean=False,
         )
         self.add_raw_df(df)

@@ -367,9 +353,9 @@ class ResultSet:
             MYSQL_DATA_TYPE.BOOLEAN: sqlalchemy_types.BOOLEAN,
             MYSQL_DATA_TYPE.FLOAT: sqlalchemy_types.FLOAT,
             MYSQL_DATA_TYPE.DOUBLE: sqlalchemy_types.FLOAT,
-            MYSQL_DATA_TYPE.TIME: sqlalchemy_types.
-            MYSQL_DATA_TYPE.DATE: sqlalchemy_types.
-            MYSQL_DATA_TYPE.DATETIME: sqlalchemy_types.
+            MYSQL_DATA_TYPE.TIME: sqlalchemy_types.Time,
+            MYSQL_DATA_TYPE.DATE: sqlalchemy_types.Date,
+            MYSQL_DATA_TYPE.DATETIME: sqlalchemy_types.DateTime,
             MYSQL_DATA_TYPE.TIMESTAMP: sqlalchemy_types.TIMESTAMP,
         }

@@ -379,7 +365,7 @@ class ResultSet:
         # infer MYSQL_DATA_TYPE if not set
         if isinstance(column_type, MYSQL_DATA_TYPE) is False:
             if column_type is not None:
-                logger.warning(f
+                logger.warning(f"Unexpected column type: {column_type}")
             if self._df is None:
                 column_type = MYSQL_DATA_TYPE.TEXT
             else:
@@ -387,12 +373,7 @@ class ResultSet:

         sqlalchemy_type = type_mapping.get(column_type, sqlalchemy_types.TEXT)

-        columns.append(
-            TableColumn(
-                name=column.alias,
-                type=sqlalchemy_type
-            )
-        )
+        columns.append(TableColumn(name=column.alias, type=sqlalchemy_type))
         return columns

     def to_lists(self, json_types=False):
@@ -410,12 +391,15 @@ class ResultSet:
             for name, dtype in df.dtypes.to_dict().items():
                 if pd.api.types.is_datetime64_any_dtype(dtype):
                     df[name] = df[name].dt.strftime("%Y-%m-%d %H:%M:%S.%f")
-
+            for i, column in enumerate(self.columns):
+                if column.type == MYSQL_DATA_TYPE.VECTOR:
+                    df[i] = df[i].apply(_dump_vector)
+
             df.replace({np.nan: None}, inplace=True)
             return df.to_records(index=False).tolist()

         # slower but keep timestamp type
         df = self._df.replace({np.nan: None})  # TODO rework
-        return df.to_dict(
+        return df.to_dict("split")["data"]

     def get_column_values(self, col_idx):
         # get by column index
@@ -434,14 +418,11 @@ class ResultSet:
         self._df[col_idx] = values

     def add_from_result_set(self, rs):
-
         source_names = rs.get_column_names()

         col_sequence = []
         for name in self.get_column_names():
-            col_sequence.append(
-                source_names.index(name)
-            )
+            col_sequence.append(source_names.index(name))

         raw_df = rs.get_raw_df()[col_sequence]
```
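The other functional change in this file is VECTOR handling in `to_lists()`: the new `_dump_vector` helper unwraps `array.array` cell values into plain Python lists so they serialize cleanly. A standalone sketch of the conversion, assuming only `pandas`:

```python
# Standalone sketch of the VECTOR handling added to to_lists() above:
# array.array cell values become plain lists, anything else passes
# through unchanged.
from array import array
from typing import Any

import pandas as pd


def _dump_vector(value: Any) -> Any:
    if isinstance(value, array):
        return value.tolist()
    return value


df = pd.DataFrame({"embedding": [array("d", [0.1, 0.2]), None]})
print(df["embedding"].apply(_dump_vector).tolist())  # [[0.1, 0.2], None]
```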
mindsdb/api/executor/sql_query/steps/fetch_dataframe.py:

```diff
@@ -28,7 +28,7 @@ def get_table_alias(table_obj, default_db_name):
     elif isinstance(table_obj, Select):
         # it is subquery
         if table_obj.alias is None:
-            name =
+            name = "t"
         else:
             name = table_obj.alias.parts[0]
         name = (default_db_name, name)
@@ -37,10 +37,10 @@ def get_table_alias(table_obj, default_db_name):
         return get_table_alias(table_obj.left, default_db_name)
     else:
         # unknown yet object
-        return default_db_name,
+        return default_db_name, "t", "t"

     if table_obj.alias is not None:
-        name = name + (
+        name = name + (".".join(table_obj.alias.parts),)
     else:
         name = name + (name[1],)
     return name
@@ -57,7 +57,7 @@ def get_fill_param_fnc(steps_data):
             node_prev = callstack[0]
             if isinstance(node_prev, BinaryOperation):
                 # Check case: 'something IN Parameter()'
-                if node_prev.op.lower() ==
+                if node_prev.op.lower() == "in" and node_prev.args[1] is node:
                     is_single_item = False

         if is_single_item and len(items) == 1:
@@ -71,32 +71,28 @@ def get_fill_param_fnc(steps_data):
             rs = steps_data[node.value.step_num]
             items = [Constant(i) for i in rs.get_column_values(col_idx=0)]
             return Tuple(items)
+
     return fill_params


 class FetchDataframeStepCall(BaseStepCall):
-
     bind = FetchDataframeStep

     def call(self, step):
-
         dn = self.session.datahub.get(step.integration)
         query = step.query

         if dn is None:
-            raise UnknownError(f
+            raise UnknownError(f"Unknown integration name: {step.integration}")

         if query is None:
-            table_alias = (self.context.get(
+            table_alias = (self.context.get("database"), "result", "result")

             # fetch raw_query
-            response: DataHubResponse = dn.query(
-                native_query=step.raw_query,
-                session=self.session
-            )
+            response: DataHubResponse = dn.query(native_query=step.raw_query, session=self.session)
             df = response.data_frame
         else:
-            table_alias = get_table_alias(step.query.from_table, self.context.get(
+            table_alias = get_table_alias(step.query.from_table, self.context.get("database"))

             # TODO for information_schema we have 'database' = 'mindsdb'

@@ -106,19 +102,19 @@ class FetchDataframeStepCall(BaseStepCall):

         query, context_callback = query_context_controller.handle_db_context_vars(query, dn, self.session)

-        response: DataHubResponse = dn.query(
-            query=query,
-            session=self.session
-        )
+        response: DataHubResponse = dn.query(query=query, session=self.session)
         df = response.data_frame

         if context_callback:
             context_callback(df, response.columns)

+        # if query registered, set progress
+        if self.sql_query.run_query is not None:
+            self.sql_query.run_query.set_progress(df, None)
         return ResultSet.from_df(
             df,
             table_name=table_alias[1],
             table_alias=table_alias[2],
             database=table_alias[0],
-            mysql_types=response.mysql_types
+            mysql_types=response.mysql_types,
         )
```
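`get_table_alias` resolves every FROM object to a `(database, table_name, alias)` triple, and the fallback for anonymous subqueries and unrecognized nodes is the literal name `"t"`. A simplified sketch of that convention; the `Identifier` dataclass below stands in for the mindsdb_sql AST node, and the real function handles more node types:

```python
# Simplified sketch of the (database, table_name, alias) convention used
# by get_table_alias above. Identifier is a stand-in for the AST node.
from dataclasses import dataclass


@dataclass
class Identifier:
    parts: list
    alias: "Identifier | None" = None


def get_table_alias(table_obj, default_db_name: str) -> tuple:
    if isinstance(table_obj, Identifier):
        parts = table_obj.parts
        name = (default_db_name, parts[-1]) if len(parts) == 1 else (parts[0], parts[-1])
    else:
        # unknown node: anonymous subqueries and the like default to "t"
        return default_db_name, "t", "t"

    if table_obj.alias is not None:
        name = name + (".".join(table_obj.alias.parts),)
    else:
        # no explicit alias: reuse the table name as its own alias
        name = name + (name[1],)
    return name


print(get_table_alias(Identifier(parts=["sales"]), "mindsdb"))
# ('mindsdb', 'sales', 'sales')
print(get_table_alias(Identifier(parts=["pg", "sales"], alias=Identifier(parts=["s"])), "mindsdb"))
# ('pg', 'sales', 's')
```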
mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py:

```diff
@@ -57,21 +57,21 @@ class FetchDataframePartitionCall(BaseStepCall):
         # get query record
         run_query = self.sql_query.run_query
         if run_query is None:
-            raise RuntimeError(
+            raise RuntimeError("Error with partitioning of the query")
         run_query.set_params(step.params)

-        self.table_alias = get_table_alias(step.query.from_table, self.context.get(
+        self.table_alias = get_table_alias(step.query.from_table, self.context.get("database"))
         self.current_step_num = step.step_num
         self.substeps = step.steps

         # ml task queue enabled?
         use_threads, thread_count = False, None
-        if config[
+        if config["ml_task_queue"]["type"] == "redis":
             use_threads = True

         # use threads?
-        if
-            threads = step.params[
+        if "threads" in step.params:
+            threads = step.params["threads"]
             if isinstance(threads, int):
                 thread_count = threads
                 use_threads = True
@@ -81,7 +81,7 @@ class FetchDataframePartitionCall(BaseStepCall):
             # disable even with ml task queue
             use_threads = False

-        on_error = step.params.get(
+        on_error = step.params.get("error", "raise")
         if use_threads:
             return self.fetch_threads(run_query, query, thread_count=thread_count, on_error=on_error)
         else:
@@ -89,7 +89,7 @@ class FetchDataframePartitionCall(BaseStepCall):

     def fetch_iterate(self, run_query: RunningQuery, query: ASTNode, on_error: str = None) -> ResultSet:
         """
-
+        Process batches one by one in circle
         """

         results = []
@@ -99,7 +99,7 @@ class FetchDataframePartitionCall(BaseStepCall):
                 sub_data = self.exec_sub_steps(df)
                 results.append(sub_data)
             except Exception as e:
-                if on_error ==
+                if on_error == "skip":
                     logger.error(e)
                 else:
                     raise e
@@ -131,12 +131,12 @@ class FetchDataframePartitionCall(BaseStepCall):
         - the final result is returned and used outside to concatenate with results of other's batches
         """
         input_data = ResultSet.from_df(
-            df,
-            table_name=self.table_alias[1],
-            table_alias=self.table_alias[2],
-            database=self.table_alias[0]
+            df, table_name=self.table_alias[1], table_alias=self.table_alias[2], database=self.table_alias[0]
         )

+        if len(self.substeps) == 0:
+            return input_data
+
         # execute with modified previous results
         steps_data2 = self.steps_data.copy()
         steps_data2[self.current_step_num] = input_data
@@ -147,8 +147,9 @@ class FetchDataframePartitionCall(BaseStepCall):
             steps_data2[substep.step_num] = sub_data
         return sub_data

-    def fetch_threads(
-
+    def fetch_threads(
+        self, run_query: RunningQuery, query: ASTNode, thread_count: int = None, on_error: str = None
+    ) -> ResultSet:
         """
         Process batches in threads
         - spawn required count of threads
@@ -170,9 +171,7 @@ class FetchDataframePartitionCall(BaseStepCall):
         results = []

         with ContextThreadPoolExecutor(max_workers=thread_count) as executor:
-
             for df in run_query.get_partitions(self.dn, self, query):
-
                 # split into chunks and send to workers
                 futures = []
                 for df2 in split_data_frame(df, partition_size):
@@ -182,13 +181,13 @@ class FetchDataframePartitionCall(BaseStepCall):
                 try:
                     results.append(future.result())
                 except Exception as e:
-                    if on_error ==
+                    if on_error == "skip":
                         logger.error(e)
                     else:
                         executor.shutdown()
                         raise e
                 if self.sql_query.stop_event is not None and self.sql_query.stop_event.is_set():
                     executor.shutdown()
-                    raise RuntimeError(
+                    raise RuntimeError("Query is interrupted")

         return self.concat_results(results)
```