MindsDB 25.5.4.2__py3-none-any.whl → 25.6.2.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as published in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/api/a2a/agent.py +28 -25
- mindsdb/api/a2a/common/server/server.py +32 -26
- mindsdb/api/executor/command_executor.py +69 -14
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +49 -65
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +29 -48
- mindsdb/api/executor/datahub/datanodes/system_tables.py +35 -61
- mindsdb/api/executor/planner/plan_join.py +67 -77
- mindsdb/api/executor/planner/query_planner.py +176 -155
- mindsdb/api/executor/planner/steps.py +37 -12
- mindsdb/api/executor/sql_query/result_set.py +45 -64
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +14 -18
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +17 -18
- mindsdb/api/executor/sql_query/steps/insert_step.py +13 -33
- mindsdb/api/executor/sql_query/steps/subselect_step.py +43 -35
- mindsdb/api/executor/utilities/sql.py +42 -48
- mindsdb/api/http/namespaces/config.py +1 -1
- mindsdb/api/http/namespaces/file.py +14 -23
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_datum.py +12 -28
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/binary_resultset_row_package.py +59 -50
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/resultset_row_package.py +9 -8
- mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +449 -461
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +87 -36
- mindsdb/integrations/handlers/file_handler/file_handler.py +15 -9
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +43 -24
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +10 -3
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +26 -33
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +74 -51
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +305 -98
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +53 -34
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +136 -6
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +334 -83
- mindsdb/integrations/libs/api_handler.py +261 -57
- mindsdb/integrations/libs/base.py +100 -29
- mindsdb/integrations/utilities/files/file_reader.py +99 -73
- mindsdb/integrations/utilities/handler_utils.py +23 -8
- mindsdb/integrations/utilities/sql_utils.py +35 -40
- mindsdb/interfaces/agents/agents_controller.py +196 -192
- mindsdb/interfaces/agents/constants.py +7 -1
- mindsdb/interfaces/agents/langchain_agent.py +42 -11
- mindsdb/interfaces/agents/mcp_client_agent.py +29 -21
- mindsdb/interfaces/data_catalog/__init__.py +0 -0
- mindsdb/interfaces/data_catalog/base_data_catalog.py +54 -0
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +359 -0
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +34 -0
- mindsdb/interfaces/database/database.py +81 -57
- mindsdb/interfaces/database/integrations.py +220 -234
- mindsdb/interfaces/database/log.py +72 -104
- mindsdb/interfaces/database/projects.py +156 -193
- mindsdb/interfaces/file/file_controller.py +21 -65
- mindsdb/interfaces/knowledge_base/controller.py +63 -10
- mindsdb/interfaces/knowledge_base/evaluate.py +519 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +75 -0
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +83 -43
- mindsdb/interfaces/skills/skills_controller.py +54 -36
- mindsdb/interfaces/skills/sql_agent.py +109 -86
- mindsdb/interfaces/storage/db.py +223 -79
- mindsdb/migrations/versions/2025-05-28_a44643042fe8_added_data_catalog_tables.py +118 -0
- mindsdb/migrations/versions/2025-06-09_608e376c19a7_updated_data_catalog_data_types.py +58 -0
- mindsdb/utilities/config.py +9 -2
- mindsdb/utilities/log.py +35 -26
- mindsdb/utilities/ml_task_queue/task.py +19 -22
- mindsdb/utilities/render/sqlalchemy_render.py +129 -181
- mindsdb/utilities/starters.py +40 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/METADATA +253 -253
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/RECORD +69 -61
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py

@@ -6,22 +6,24 @@ from snowflake.sqlalchemy import snowdialect
 from snowflake import connector
 from snowflake.connector.errors import NotSupportedError
 from snowflake.connector.cursor import SnowflakeCursor, ResultMetadata
+from typing import Optional, List

 from mindsdb_sql_parser.ast.base import ASTNode
 from mindsdb_sql_parser.ast import Select, Identifier

 from mindsdb.utilities import log
-from mindsdb.integrations.libs.base import DatabaseHandler
+from mindsdb.integrations.libs.base import MetaDatabaseHandler
 from mindsdb.utilities.render.sqlalchemy_render import SqlalchemyRender
 from mindsdb.integrations.libs.response import (
     HandlerStatusResponse as StatusResponse,
     HandlerResponse as Response,
-    RESPONSE_TYPE
+    RESPONSE_TYPE,
 )
 from mindsdb.api.mysql.mysql_proxy.libs.constants.mysql import MYSQL_DATA_TYPE

 try:
     import pyarrow as pa
+
     memory_pool = pa.default_memory_pool()
 except Exception:
     memory_pool = None
@@ -31,7 +33,7 @@ logger = log.getLogger(__name__)


 def _map_type(internal_type_name: str) -> MYSQL_DATA_TYPE:
-    """
+    """Map Snowflake types to MySQL types.

     Args:
         internal_type_name (str): The name of the Snowflake type to map.
@@ -41,22 +43,22 @@ def _map_type(internal_type_name: str) -> MYSQL_DATA_TYPE:
     """
     internal_type_name = internal_type_name.upper()
     types_map = {
-        (
-        (
-        (
-        (
-        (
-        (
-        (
-        (
-        (
-        (
-        (
-        (
-        (
-        (
-        (
-        (
+        ("NUMBER", "DECIMAL", "DEC", "NUMERIC"): MYSQL_DATA_TYPE.DECIMAL,
+        ("INT , INTEGER , BIGINT , SMALLINT , TINYINT , BYTEINT"): MYSQL_DATA_TYPE.INT,
+        ("FLOAT", "FLOAT4", "FLOAT8"): MYSQL_DATA_TYPE.FLOAT,
+        ("DOUBLE", "DOUBLE PRECISION", "REAL"): MYSQL_DATA_TYPE.DOUBLE,
+        ("VARCHAR"): MYSQL_DATA_TYPE.VARCHAR,
+        ("CHAR", "CHARACTER", "NCHAR"): MYSQL_DATA_TYPE.CHAR,
+        ("STRING", "TEXT", "NVARCHAR"): MYSQL_DATA_TYPE.TEXT,
+        ("NVARCHAR2", "CHAR VARYING", "NCHAR VARYING"): MYSQL_DATA_TYPE.VARCHAR,
+        ("BINARY", "VARBINARY"): MYSQL_DATA_TYPE.BINARY,
+        ("BOOLEAN",): MYSQL_DATA_TYPE.BOOL,
+        ("TIMESTAMP_NTZ", "DATETIME"): MYSQL_DATA_TYPE.DATETIME,
+        ("DATE",): MYSQL_DATA_TYPE.DATE,
+        ("TIME",): MYSQL_DATA_TYPE.TIME,
+        ("TIMESTAMP_LTZ"): MYSQL_DATA_TYPE.DATETIME,
+        ("TIMESTAMP_TZ"): MYSQL_DATA_TYPE.DATETIME,
+        ("VARIANT", "OBJECT", "ARRAY", "MAP", "GEOGRAPHY", "GEOMETRY", "VECTOR"): MYSQL_DATA_TYPE.VARCHAR,
     }

     for db_types_list, mysql_data_type in types_map.items():
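A note on the new types_map above: most keys are tuples of Snowflake type names, but a few (for example ("VARCHAR"), ("TIMESTAMP_LTZ"), and the comma-joined INT entry) are plain strings, because a single parenthesized value is not a tuple in Python. The loop in the context line presumably checks membership with `in`, which resolves both key shapes: tuple membership for tuples and substring search for strings. A minimal, self-contained sketch of that lookup pattern follows; the enum stand-in, the helper name, and the fallback value are illustrative and not taken from the diff.

from enum import Enum


class MySQLType(Enum):  # stand-in for MYSQL_DATA_TYPE; illustrative only
    INT = "int"
    TEXT = "text"
    VARCHAR = "varchar"


types_map = {
    ("STRING", "TEXT", "NVARCHAR"): MySQLType.TEXT,         # tuple key
    ("VARCHAR"): MySQLType.VARCHAR,                          # parentheses only: this key is a plain string
    ("INT , INTEGER , BIGINT , SMALLINT"): MySQLType.INT,    # also a single string, matched by substring
}


def map_type_sketch(name: str) -> MySQLType:
    name = name.upper()
    for db_types, mysql_type in types_map.items():
        # `in` is tuple membership for tuple keys and substring search for string keys.
        if name in db_types:
            return mysql_type
    return MySQLType.VARCHAR  # illustrative default


print(map_type_sketch("text"))     # MySQLType.TEXT
print(map_type_sketch("integer"))  # MySQLType.INT (substring match on the string key)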
@@ -84,26 +86,32 @@ def _make_table_response(result: DataFrame, cursor: SnowflakeCursor) -> Response
     for column in description:
         column_dtype = result[column.name].dtype
         description_column_type = connector.constants.FIELD_ID_TO_NAME.get(column.type_code)
+        if description_column_type in ("OBJECT", "ARRAY"):
+            mysql_types.append(MYSQL_DATA_TYPE.JSON)
+            continue
+        if description_column_type == "VECTOR":
+            mysql_types.append(MYSQL_DATA_TYPE.VECTOR)
+            continue
         if pd_types.is_integer_dtype(column_dtype):
             column_dtype_name = column_dtype.name
-            if column_dtype_name in (
+            if column_dtype_name in ("int8", "Int8"):
                 mysql_types.append(MYSQL_DATA_TYPE.TINYINT)
-            elif column_dtype in (
+            elif column_dtype in ("int16", "Int16"):
                 mysql_types.append(MYSQL_DATA_TYPE.SMALLINT)
-            elif column_dtype in (
+            elif column_dtype in ("int32", "Int32"):
                 mysql_types.append(MYSQL_DATA_TYPE.MEDIUMINT)
-            elif column_dtype in (
+            elif column_dtype in ("int64", "Int64"):
                 mysql_types.append(MYSQL_DATA_TYPE.BIGINT)
             else:
                 mysql_types.append(MYSQL_DATA_TYPE.INT)
             continue
         if pd_types.is_float_dtype(column_dtype):
             column_dtype_name = column_dtype.name
-            if column_dtype_name in (
+            if column_dtype_name in ("float16", "Float16"):  # Float16 does not exists so far
                 mysql_types.append(MYSQL_DATA_TYPE.FLOAT)
-            elif column_dtype_name in (
+            elif column_dtype_name in ("float32", "Float32"):
                 mysql_types.append(MYSQL_DATA_TYPE.FLOAT)
-            elif column_dtype_name in (
+            elif column_dtype_name in ("float64", "Float64"):
                 mysql_types.append(MYSQL_DATA_TYPE.DOUBLE)
             else:
                 mysql_types.append(MYSQL_DATA_TYPE.FLOAT)
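The integer and float branches above check both lower-case and capitalized dtype names because pandas reports NumPy-backed columns as "int64"/"float64" but nullable extension columns (which can hold NULLs coming back from Snowflake) as "Int64"/"Float64". A standalone illustration, separate from the handler code:

import numpy as np
import pandas as pd

numpy_series = pd.Series([1, 2, 3], dtype=np.int64)        # NumPy-backed column
nullable_series = pd.Series([1, None, 3], dtype="Int64")   # nullable extension array

print(numpy_series.dtype.name)     # "int64"
print(nullable_series.dtype.name)  # "Int64"

# Both forms pass the same dtype check used in _make_table_response.
print(pd.api.types.is_integer_dtype(numpy_series.dtype))     # True
print(pd.api.types.is_integer_dtype(nullable_series.dtype))  # True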
@@ -115,35 +123,35 @@ def _make_table_response(result: DataFrame, cursor: SnowflakeCursor) -> Response
             mysql_types.append(MYSQL_DATA_TYPE.DATETIME)
             series = result[column.name]
             # snowflake use pytz.timezone
-            if series.dt.tz is not None and getattr(series.dt.tz,
-                series = series.dt.tz_convert(
+            if series.dt.tz is not None and getattr(series.dt.tz, "zone", "UTC") != "UTC":
+                series = series.dt.tz_convert("UTC")
             result[column.name] = series.dt.tz_localize(None)
             continue

         if pd_types.is_object_dtype(column_dtype):
-            if description_column_type ==
+            if description_column_type == "TEXT":
                 # we can also check column.internal_size, if == 16777216 then it is TEXT, else VARCHAR(internal_size)
                 mysql_types.append(MYSQL_DATA_TYPE.TEXT)
                 continue
-            elif description_column_type ==
+            elif description_column_type == "BINARY":
                 # if column.internal_size == 8388608 then BINARY, else VARBINARY(internal_size)
                 mysql_types.append(MYSQL_DATA_TYPE.BINARY)
                 continue
-            elif description_column_type ==
+            elif description_column_type == "DATE":
                 mysql_types.append(MYSQL_DATA_TYPE.DATE)
                 continue
-            elif description_column_type ==
+            elif description_column_type == "TIME":
                 mysql_types.append(MYSQL_DATA_TYPE.TIME)
                 continue

-        if description_column_type ==
+        if description_column_type == "FIXED":
             if column.scale == 0:
                 mysql_types.append(MYSQL_DATA_TYPE.INT)
             else:
                 # It is NUMBER, DECIMAL or NUMERIC with scale > 0
                 mysql_types.append(MYSQL_DATA_TYPE.FLOAT)
             continue
-        elif description_column_type ==
+        elif description_column_type == "REAL":
             mysql_types.append(MYSQL_DATA_TYPE.FLOAT)
             continue

@@ -154,24 +162,19 @@ def _make_table_response(result: DataFrame, cursor: SnowflakeCursor) -> Response
         columns=[column.name for column in description],
     )

-    return Response(
-        RESPONSE_TYPE.TABLE,
-        data_frame=df,
-        affected_rows=None,
-        mysql_types=mysql_types
-    )
+    return Response(RESPONSE_TYPE.TABLE, data_frame=df, affected_rows=None, mysql_types=mysql_types)


-class SnowflakeHandler(DatabaseHandler):
+class SnowflakeHandler(MetaDatabaseHandler):
     """
     This handler handles connection and execution of the Snowflake statements.
     """

-    name =
+    name = "snowflake"

     def __init__(self, name, **kwargs):
         super().__init__(name)
-        self.connection_data = kwargs.get(
+        self.connection_data = kwargs.get("connection_data")
         self.renderer = SqlalchemyRender(snowdialect.dialect)

         self.is_connected = False
@@ -193,18 +196,18 @@ class SnowflakeHandler(DatabaseHandler):
             return self.connection

         # Mandatory connection parameters
-        if not all(key in self.connection_data for key in [
-            raise ValueError(
+        if not all(key in self.connection_data for key in ["account", "user", "password", "database"]):
+            raise ValueError("Required parameters (account, user, password, database) must be provided.")

         config = {
-
-
-
-
+            "account": self.connection_data.get("account"),
+            "user": self.connection_data.get("user"),
+            "password": self.connection_data.get("password"),
+            "database": self.connection_data.get("database"),
         }

         # Optional connection parameters
-        optional_params = [
+        optional_params = ["schema", "warehouse", "role"]
         for param in optional_params:
             if param in self.connection_data:
                 config[param] = self.connection_data[param]
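For context, the check above requires account, user, password, and database in connection_data and treats schema, warehouse, and role as optional. A hypothetical connection_data payload that would pass validation; every value is a placeholder, not taken from the diff:

connection_data = {
    "account": "abc12345.us-east-1",  # mandatory
    "user": "mindsdb_user",           # mandatory
    "password": "********",           # mandatory
    "database": "ANALYTICS",          # mandatory
    "schema": "PUBLIC",               # optional
    "warehouse": "COMPUTE_WH",        # optional
    "role": "SYSADMIN",               # optional
}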
@@ -215,7 +218,7 @@ class SnowflakeHandler(DatabaseHandler):
             self.is_connected = True
             return self.connection
         except connector.errors.Error as e:
-            logger.error(f
+            logger.error(f"Error connecting to Snowflake, {e}!")
             raise

     def disconnect(self):
@@ -244,10 +247,10 @@ class SnowflakeHandler(DatabaseHandler):

             # Execute a simple query to test the connection
             with connection.cursor() as cur:
-                cur.execute(
+                cur.execute("select 1;")
             response.success = True
         except (connector.errors.Error, ValueError) as e:
-            logger.error(f
+            logger.error(f"Error connecting to Snowflake, {e}!")
             response.error_message = str(e)

         if response.success and need_to_close:
@@ -276,7 +279,6 @@ class SnowflakeHandler(DatabaseHandler):
        try:
            cur.execute(query)
            try:
-
                try:
                    batches_iter = cur.fetch_pandas_batches()
                except ValueError:
@@ -297,64 +299,52 @@ class SnowflakeHandler(DatabaseHandler):
                    if memory_estimation_check_done is False and batches_rowcount > 1000:
                        memory_estimation_check_done = True
                        available_memory_kb = psutil.virtual_memory().available >> 10
-                        batches_size_kb = sum(
+                        batches_size_kb = sum(
+                            [(x.memory_usage(index=True, deep=True).sum() >> 10) for x in batches]
+                        )
                        total_rowcount = cur.rowcount
                        rest_rowcount = total_rowcount - batches_rowcount
                        rest_estimated_size_kb = int((rest_rowcount / batches_rowcount) * batches_size_kb)
                        if (available_memory_kb * 0.9) < rest_estimated_size_kb:
                            logger.error(
-
-                                f
-                                f
-                                f
+                                "Attempt to get too large dataset:\n"
+                                f"batches_rowcount={batches_rowcount}, size_kb={batches_size_kb}\n"
+                                f"total_rowcount={total_rowcount}, estimated_size_kb={rest_estimated_size_kb}\n"
+                                f"available_memory_kb={available_memory_kb}"
                            )
-                            raise MemoryError(
+                            raise MemoryError("Not enought memory")
                # endregion
                if len(batches) > 0:
                    response = _make_table_response(result=pandas.concat(batches, ignore_index=True), cursor=cur)
                else:
-                    response = Response(
-                        RESPONSE_TYPE.TABLE,
-                        DataFrame(
-                            [],
-                            columns=[x[0] for x in cur.description]
-                        )
-                    )
+                    response = Response(RESPONSE_TYPE.TABLE, DataFrame([], columns=[x[0] for x in cur.description]))
            except NotSupportedError:
                # Fallback for CREATE/DELETE/UPDATE. These commands returns table with single column,
                # but it cannot be retrieved as pandas DataFrame.
                result = cur.fetchall()
                match result:
                    case (
-                        [{
-                        | [{
-                        | [{
+                        [{"number of rows inserted": affected_rows}]
+                        | [{"number of rows deleted": affected_rows}]
+                        | [{"number of rows updated": affected_rows, "number of multi-joined rows updated": _}]
                    ):
                        response = Response(RESPONSE_TYPE.OK, affected_rows=affected_rows)
                    case list():
                        response = Response(
-                            RESPONSE_TYPE.TABLE,
-                            DataFrame(
-                                result,
-                                columns=[x[0] for x in cur.description]
-                            )
+                            RESPONSE_TYPE.TABLE, DataFrame(result, columns=[x[0] for x in cur.description])
                        )
                    case _:
                        # Looks like SnowFlake always returns something in response, so this is suspicious
-                        logger.warning(
+                        logger.warning("Snowflake did not return any data in response.")
                        response = Response(RESPONSE_TYPE.OK)
        except Exception as e:
            logger.error(f"Error running query: {query} on {self.connection_data.get('database')}, {e}!")
-            response = Response(
-                RESPONSE_TYPE.ERROR,
-                error_code=0,
-                error_message=str(e)
-            )
+            response = Response(RESPONSE_TYPE.ERROR, error_code=0, error_message=str(e))

        if need_to_close is True:
            self.disconnect()

-        if memory_pool is not None and memory_pool.backend_name ==
+        if memory_pool is not None and memory_pool.backend_name == "jemalloc":
            # This reduce memory consumption, but will slow down next query slightly.
            # Except pool type 'jemalloc': memory consumption do not change significantly
            # and next query processing time may be even lower.
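The memory guard in this hunk extrapolates from the batches fetched so far: it measures their size, scales by the ratio of remaining rows to fetched rows, and aborts if the estimate exceeds 90% of available memory. A worked sketch of that arithmetic with made-up numbers (all values below are illustrative, not measurements):

# Illustrative numbers only; the formula mirrors the hunk above.
batches_rowcount = 2_000          # rows fetched so far
batches_size_kb = 10_000          # measured size of those rows, in KiB
total_rowcount = 1_000_000        # cur.rowcount reported by Snowflake
available_memory_kb = 4_000_000   # psutil.virtual_memory().available >> 10

rest_rowcount = total_rowcount - batches_rowcount
rest_estimated_size_kb = int((rest_rowcount / batches_rowcount) * batches_size_kb)
print(rest_estimated_size_kb)  # 4990000 KiB estimated for the remaining rows

if (available_memory_kb * 0.9) < rest_estimated_size_kb:
    # 3,600,000 < 4,990,000, so the handler would raise MemoryError at this point.
    print("would raise MemoryError")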
@@ -385,7 +375,7 @@ class SnowflakeHandler(DatabaseHandler):
         quoted_columns = []
         if query.targets is not None:
             for column in query.targets:
-                if hasattr(column,
+                if hasattr(column, "alias") and column.alias is not None:
                     if column.alias.is_quoted[-1]:
                         quoted_columns.append(column.alias.parts[-1])
                 elif isinstance(column, Identifier):
@@ -455,3 +445,264 @@ class SnowflakeHandler(DatabaseHandler):
         result.to_columns_table_response(map_type_fn=_map_type)

         return result
+
+    def meta_get_tables(self, table_names: Optional[List[str]] = None) -> Response:
+        """
+        Retrieves metadata information about the tables in the Snowflake database to be stored in the data catalog.
+
+        Args:
+            table_names (list): A list of table names for which to retrieve metadata information.
+
+        Returns:
+            Response: A response object containing the metadata information, formatted as per the `Response` class.
+        """
+        query = """
+            SELECT
+                TABLE_CATALOG,
+                TABLE_SCHEMA,
+                TABLE_NAME,
+                TABLE_TYPE,
+                COMMENT AS TABLE_DESCRIPTION,
+                ROW_COUNT,
+                CREATED,
+                LAST_ALTERED
+            FROM INFORMATION_SCHEMA.TABLES
+            WHERE TABLE_SCHEMA = current_schema()
+            AND TABLE_TYPE IN ('BASE TABLE', 'VIEW')
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names_str = ", ".join([f"'{t.upper()}'" for t in table_names])
+            query += f" AND TABLE_NAME IN ({table_names_str})"
+
+        result = self.native_query(query)
+        result.data_frame["ROW_COUNT"] = result.data_frame["ROW_COUNT"].astype(int)
+
+        return result
+
+    def meta_get_columns(self, table_names: Optional[List[str]] = None) -> Response:
+        """
+        Retrieves column metadata for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve column metadata.
+
+        Returns:
+            Response: A response object containing the column metadata.
+        """
+        query = """
+            SELECT
+                TABLE_NAME,
+                COLUMN_NAME,
+                DATA_TYPE,
+                COMMENT AS COLUMN_DESCRIPTION,
+                COLUMN_DEFAULT,
+                (IS_NULLABLE = 'YES') AS IS_NULLABLE,
+                CHARACTER_MAXIMUM_LENGTH,
+                CHARACTER_OCTET_LENGTH,
+                NUMERIC_PRECISION,
+                NUMERIC_SCALE,
+                DATETIME_PRECISION,
+                CHARACTER_SET_NAME,
+                COLLATION_NAME
+            FROM INFORMATION_SCHEMA.COLUMNS
+            WHERE TABLE_SCHEMA = current_schema()
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names_str = ", ".join([f"'{t.upper()}'" for t in table_names])
+            query += f" AND TABLE_NAME IN ({table_names_str})"
+
+        result = self.native_query(query)
+        return result
+
+    def meta_get_column_statistics(self, table_names: Optional[List[str]] = None) -> Response:
+        """
+        Retrieves basic column statistics: null %, distinct count.
+        Due to Snowflake limitations, this runs per-table not per-column.
+        TODO: Add most_common_values and most_common_frequencies
+        """
+        columns_query = """
+            SELECT TABLE_NAME, COLUMN_NAME
+            FROM INFORMATION_SCHEMA.COLUMNS
+            WHERE TABLE_SCHEMA = current_schema()
+        """
+        if table_names:
+            table_names_str = ", ".join([f"'{t.upper()}'" for t in table_names])
+            columns_query += f" AND TABLE_NAME IN ({table_names_str})"
+
+        columns_result = self.native_query(columns_query)
+        if (
+            columns_result.type == RESPONSE_TYPE.ERROR
+            or columns_result.data_frame is None
+            or columns_result.data_frame.empty
+        ):
+            return Response(RESPONSE_TYPE.ERROR, error_message="No columns found.")
+
+        columns_df = columns_result.data_frame
+        grouped = columns_df.groupby("TABLE_NAME")
+        all_stats = []
+
+        for table_name, group in grouped:
+            select_parts = []
+            for _, row in group.iterrows():
+                col = row["COLUMN_NAME"]
+                # Ensure column names in the query are properly quoted if they contain special characters or are case-sensitive
+                quoted_col = f'"{col}"'
+                select_parts.extend(
+                    [
+                        f'COUNT_IF({quoted_col} IS NULL) AS "nulls_{col}"',
+                        f'APPROX_COUNT_DISTINCT({quoted_col}) AS "distincts_{col}"',
+                        f'MIN({quoted_col}) AS "min_{col}"',
+                        f'MAX({quoted_col}) AS "max_{col}"',
+                    ]
+                )
+
+            quoted_table_name = f'"{table_name}"'
+            stats_query = f"""
+                SELECT COUNT(*) AS "total_rows", {", ".join(select_parts)}
+                FROM {quoted_table_name}
+            """
+            try:
+                stats_res = self.native_query(stats_query)
+                if stats_res.type != RESPONSE_TYPE.TABLE or stats_res.data_frame is None or stats_res.data_frame.empty:
+                    logger.warning(
+                        f"Could not retrieve stats for table {table_name}. Query returned no data or an error: {stats_res.error_message if stats_res.type == RESPONSE_TYPE.ERROR else 'No data'}"
+                    )
+                    # Add placeholder stats if query fails or returns empty
+                    for _, row in group.iterrows():
+                        all_stats.append(
+                            {
+                                "table_name": table_name,
+                                "column_name": row["COLUMN_NAME"],
+                                "null_percentage": None,
+                                "distinct_values_count": None,
+                                "most_common_values": [],
+                                "most_common_frequencies": [],
+                                "minimum_value": None,
+                                "maximum_value": None,
+                            }
+                        )
+                    continue
+
+                stats_data = stats_res.data_frame.iloc[0]
+                total_rows = stats_data.get("total_rows", 0)
+
+                for _, row in group.iterrows():
+                    col = row["COLUMN_NAME"]
+                    # Keys for stats_data should match the aliases in stats_query (e.g., "nulls_COLNAME")
+                    nulls = stats_data.get(f"nulls_{col}", 0)
+                    distincts = stats_data.get(f"distincts_{col}", None)
+                    min_val = stats_data.get(f"min_{col}", None)
+                    max_val = stats_data.get(f"max_{col}", None)
+                    null_pct = (nulls / total_rows) * 100 if total_rows is not None and total_rows > 0 else None
+
+                    all_stats.append(
+                        {
+                            "table_name": table_name,
+                            "column_name": col,
+                            "null_percentage": null_pct,
+                            "distinct_values_count": distincts,
+                            "most_common_values": [],
+                            "most_common_frequencies": [],
+                            "minimum_value": min_val,
+                            "maximum_value": max_val,
+                        }
+                    )
+            except Exception as e:
+                logger.error(f"Exception while fetching statistics for table {table_name}: {e}")
+                for _, row in group.iterrows():
+                    all_stats.append(
+                        {
+                            "table_name": table_name,
+                            "column_name": row["COLUMN_NAME"],
+                            "null_percentage": None,
+                            "distinct_values_count": None,
+                            "most_common_values": [],
+                            "most_common_frequencies": [],
+                            "minimum_value": None,
+                            "maximum_value": None,
+                        }
+                    )
+
+        if not all_stats:
+            return Response(RESPONSE_TYPE.TABLE, data_frame=pandas.DataFrame())
+
+        return Response(RESPONSE_TYPE.TABLE, data_frame=pandas.DataFrame(all_stats))
+
+    def meta_get_primary_keys(self, table_names: Optional[List[str]] = None) -> Response:
+        """
+        Retrieves primary key information for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve primary key information.
+
+        Returns:
+            Response: A response object containing the primary key information.
+        """
+        try:
+            query = """
+                SHOW PRIMARY KEYS IN TABLE;
+            """
+
+            response = self.native_query(query)
+            if response.type == RESPONSE_TYPE.ERROR and response.error_message:
+                logger.error(f"Query error in meta_get_primary_keys: {response.error_message}\nQuery:\n{query}")
+
+            df = response.data_frame
+            if not df.empty:
+                if table_names:
+                    df = df[df["table_name"].isin(table_names)]
+
+                df = df[["table_name", "column_name", "key_sequence", "constraint_name"]]
+                df = df.rename(columns={"key_sequence": "ordinal_position"})
+
+            response.data_frame = df
+
+            return response
+
+        except Exception as e:
+            logger.error(f"Exception in meta_get_primary_keys: {e!r}")
+            return Response(RESPONSE_TYPE.ERROR, error_message=f"Exception querying primary keys: {e!r}")
+
+    def meta_get_foreign_keys(self, table_names: Optional[List[str]] = None) -> Response:
+        """
+        Retrieves foreign key information for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve foreign key information.
+
+        Returns:
+            Response: A response object containing the foreign key information.
+        """
+        try:
+            query = """
+                SHOW IMPORTED KEYS IN TABLE;
+            """
+
+            response = self.native_query(query)
+            if response.type == RESPONSE_TYPE.ERROR and response.error_message:
+                logger.error(f"Query error in meta_get_primary_keys: {response.error_message}\nQuery:\n{query}")
+
+            df = response.data_frame
+            if not df.empty:
+                if table_names:
+                    df = df[df["pk_table_name"].isin(table_names) & df["fk_table_name"].isin(table_names)]
+
+                df = df[["pk_table_name", "pk_column_name", "fk_table_name", "fk_column_name"]]
+                df = df.rename(
+                    columns={
+                        "pk_table_name": "child_table_name",
+                        "pk_column_name": "child_column_name",
+                        "fk_table_name": "parent_table_name",
+                        "fk_column_name": "parent_column_name",
+                    }
+                )
+
+            response.data_frame = df
+
+            return response
+
+        except Exception as e:
+            logger.error(f"Exception in meta_get_primary_keys: {e!r}")
+            return Response(RESPONSE_TYPE.ERROR, error_message=f"Exception querying primary keys: {e!r}")
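To make the per-table statistics query in meta_get_column_statistics above concrete: for a hypothetical table "T" with columns "ID" and "NAME", the same string-building would assemble roughly the SQL printed below. The table and column names are invented for illustration; the aliases mirror the ones the handler later reads back as "nulls_{col}", "distincts_{col}", "min_{col}", and "max_{col}".

# Hypothetical example reproducing the stats_query construction shown above.
select_parts = []
for col in ["ID", "NAME"]:
    quoted_col = f'"{col}"'
    select_parts.extend(
        [
            f'COUNT_IF({quoted_col} IS NULL) AS "nulls_{col}"',
            f'APPROX_COUNT_DISTINCT({quoted_col}) AS "distincts_{col}"',
            f'MIN({quoted_col}) AS "min_{col}"',
            f'MAX({quoted_col}) AS "max_{col}"',
        ]
    )

stats_query = f"""
    SELECT COUNT(*) AS "total_rows", {", ".join(select_parts)}
    FROM "T"
"""
print(stats_query)
# SELECT COUNT(*) AS "total_rows", COUNT_IF("ID" IS NULL) AS "nulls_ID", ... , MAX("NAME") AS "max_NAME"
# FROM "T"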