MindsDB-25.5.4.1-py3-none-any.whl → MindsDB-25.6.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB has been flagged as potentially problematic.

Files changed (70)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/a2a/agent.py +28 -25
  3. mindsdb/api/a2a/common/server/server.py +32 -26
  4. mindsdb/api/a2a/run_a2a.py +1 -1
  5. mindsdb/api/executor/command_executor.py +69 -14
  6. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +49 -65
  7. mindsdb/api/executor/datahub/datanodes/project_datanode.py +29 -48
  8. mindsdb/api/executor/datahub/datanodes/system_tables.py +35 -61
  9. mindsdb/api/executor/planner/plan_join.py +67 -77
  10. mindsdb/api/executor/planner/query_planner.py +176 -155
  11. mindsdb/api/executor/planner/steps.py +37 -12
  12. mindsdb/api/executor/sql_query/result_set.py +45 -64
  13. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +14 -18
  14. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +17 -18
  15. mindsdb/api/executor/sql_query/steps/insert_step.py +13 -33
  16. mindsdb/api/executor/sql_query/steps/subselect_step.py +43 -35
  17. mindsdb/api/executor/utilities/sql.py +42 -48
  18. mindsdb/api/http/namespaces/config.py +1 -1
  19. mindsdb/api/http/namespaces/file.py +14 -23
  20. mindsdb/api/mysql/mysql_proxy/data_types/mysql_datum.py +12 -28
  21. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/binary_resultset_row_package.py +59 -50
  22. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/resultset_row_package.py +9 -8
  23. mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +449 -461
  24. mindsdb/api/mysql/mysql_proxy/utilities/dump.py +87 -36
  25. mindsdb/integrations/handlers/file_handler/file_handler.py +15 -9
  26. mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +43 -24
  27. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +10 -3
  28. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +26 -33
  29. mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +74 -51
  30. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +305 -98
  31. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +53 -34
  32. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +136 -6
  33. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +334 -83
  34. mindsdb/integrations/libs/api_handler.py +261 -57
  35. mindsdb/integrations/libs/base.py +100 -29
  36. mindsdb/integrations/utilities/files/file_reader.py +99 -73
  37. mindsdb/integrations/utilities/handler_utils.py +23 -8
  38. mindsdb/integrations/utilities/sql_utils.py +35 -40
  39. mindsdb/interfaces/agents/agents_controller.py +196 -192
  40. mindsdb/interfaces/agents/constants.py +7 -1
  41. mindsdb/interfaces/agents/langchain_agent.py +42 -11
  42. mindsdb/interfaces/agents/mcp_client_agent.py +29 -21
  43. mindsdb/interfaces/data_catalog/__init__.py +0 -0
  44. mindsdb/interfaces/data_catalog/base_data_catalog.py +54 -0
  45. mindsdb/interfaces/data_catalog/data_catalog_loader.py +359 -0
  46. mindsdb/interfaces/data_catalog/data_catalog_reader.py +34 -0
  47. mindsdb/interfaces/database/database.py +81 -57
  48. mindsdb/interfaces/database/integrations.py +220 -234
  49. mindsdb/interfaces/database/log.py +72 -104
  50. mindsdb/interfaces/database/projects.py +156 -193
  51. mindsdb/interfaces/file/file_controller.py +21 -65
  52. mindsdb/interfaces/knowledge_base/controller.py +63 -10
  53. mindsdb/interfaces/knowledge_base/evaluate.py +519 -0
  54. mindsdb/interfaces/knowledge_base/llm_client.py +75 -0
  55. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +83 -43
  56. mindsdb/interfaces/skills/skills_controller.py +54 -36
  57. mindsdb/interfaces/skills/sql_agent.py +109 -86
  58. mindsdb/interfaces/storage/db.py +223 -79
  59. mindsdb/migrations/versions/2025-05-28_a44643042fe8_added_data_catalog_tables.py +118 -0
  60. mindsdb/migrations/versions/2025-06-09_608e376c19a7_updated_data_catalog_data_types.py +58 -0
  61. mindsdb/utilities/config.py +9 -2
  62. mindsdb/utilities/log.py +35 -26
  63. mindsdb/utilities/ml_task_queue/task.py +19 -22
  64. mindsdb/utilities/render/sqlalchemy_render.py +129 -181
  65. mindsdb/utilities/starters.py +49 -1
  66. {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/METADATA +268 -268
  67. {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/RECORD +70 -62
  68. {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/WHEEL +0 -0
  69. {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/licenses/LICENSE +0 -0
  70. {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py
@@ -6,22 +6,24 @@ from snowflake.sqlalchemy import snowdialect
 from snowflake import connector
 from snowflake.connector.errors import NotSupportedError
 from snowflake.connector.cursor import SnowflakeCursor, ResultMetadata
+from typing import Optional, List
 
 from mindsdb_sql_parser.ast.base import ASTNode
 from mindsdb_sql_parser.ast import Select, Identifier
 
 from mindsdb.utilities import log
-from mindsdb.integrations.libs.base import DatabaseHandler
+from mindsdb.integrations.libs.base import MetaDatabaseHandler
 from mindsdb.utilities.render.sqlalchemy_render import SqlalchemyRender
 from mindsdb.integrations.libs.response import (
     HandlerStatusResponse as StatusResponse,
     HandlerResponse as Response,
-    RESPONSE_TYPE
+    RESPONSE_TYPE,
 )
 from mindsdb.api.mysql.mysql_proxy.libs.constants.mysql import MYSQL_DATA_TYPE
 
 try:
     import pyarrow as pa
+
     memory_pool = pa.default_memory_pool()
 except Exception:
     memory_pool = None
@@ -31,7 +33,7 @@ logger = log.getLogger(__name__)
 
 
 def _map_type(internal_type_name: str) -> MYSQL_DATA_TYPE:
-    """ Map Snowflake types to MySQL types.
+    """Map Snowflake types to MySQL types.
 
     Args:
         internal_type_name (str): The name of the Snowflake type to map.
@@ -41,22 +43,22 @@ def _map_type(internal_type_name: str) -> MYSQL_DATA_TYPE:
     """
     internal_type_name = internal_type_name.upper()
     types_map = {
-        ('NUMBER', 'DECIMAL', 'DEC', 'NUMERIC'): MYSQL_DATA_TYPE.DECIMAL,
-        ('INT , INTEGER , BIGINT , SMALLINT , TINYINT , BYTEINT'): MYSQL_DATA_TYPE.INT,
-        ('FLOAT', 'FLOAT4', 'FLOAT8'): MYSQL_DATA_TYPE.FLOAT,
-        ('DOUBLE', 'DOUBLE PRECISION', 'REAL'): MYSQL_DATA_TYPE.DOUBLE,
-        ('VARCHAR'): MYSQL_DATA_TYPE.VARCHAR,
-        ('CHAR', 'CHARACTER', 'NCHAR'): MYSQL_DATA_TYPE.CHAR,
-        ('STRING', 'TEXT', 'NVARCHAR'): MYSQL_DATA_TYPE.TEXT,
-        ('NVARCHAR2', 'CHAR VARYING', 'NCHAR VARYING'): MYSQL_DATA_TYPE.VARCHAR,
-        ('BINARY', 'VARBINARY'): MYSQL_DATA_TYPE.BINARY,
-        ('BOOLEAN',): MYSQL_DATA_TYPE.BOOL,
-        ('TIMESTAMP_NTZ', 'DATETIME'): MYSQL_DATA_TYPE.DATETIME,
-        ('DATE',): MYSQL_DATA_TYPE.DATE,
-        ('TIME',): MYSQL_DATA_TYPE.TIME,
-        ('TIMESTAMP_LTZ'): MYSQL_DATA_TYPE.DATETIME,
-        ('TIMESTAMP_TZ'): MYSQL_DATA_TYPE.DATETIME,
-        ('VARIANT', 'OBJECT', 'ARRAY', 'MAP', 'GEOGRAPHY', 'GEOMETRY', 'VECTOR'): MYSQL_DATA_TYPE.VARCHAR
+        ("NUMBER", "DECIMAL", "DEC", "NUMERIC"): MYSQL_DATA_TYPE.DECIMAL,
+        ("INT , INTEGER , BIGINT , SMALLINT , TINYINT , BYTEINT"): MYSQL_DATA_TYPE.INT,
+        ("FLOAT", "FLOAT4", "FLOAT8"): MYSQL_DATA_TYPE.FLOAT,
+        ("DOUBLE", "DOUBLE PRECISION", "REAL"): MYSQL_DATA_TYPE.DOUBLE,
+        ("VARCHAR"): MYSQL_DATA_TYPE.VARCHAR,
+        ("CHAR", "CHARACTER", "NCHAR"): MYSQL_DATA_TYPE.CHAR,
+        ("STRING", "TEXT", "NVARCHAR"): MYSQL_DATA_TYPE.TEXT,
+        ("NVARCHAR2", "CHAR VARYING", "NCHAR VARYING"): MYSQL_DATA_TYPE.VARCHAR,
+        ("BINARY", "VARBINARY"): MYSQL_DATA_TYPE.BINARY,
+        ("BOOLEAN",): MYSQL_DATA_TYPE.BOOL,
+        ("TIMESTAMP_NTZ", "DATETIME"): MYSQL_DATA_TYPE.DATETIME,
+        ("DATE",): MYSQL_DATA_TYPE.DATE,
+        ("TIME",): MYSQL_DATA_TYPE.TIME,
+        ("TIMESTAMP_LTZ"): MYSQL_DATA_TYPE.DATETIME,
+        ("TIMESTAMP_TZ"): MYSQL_DATA_TYPE.DATETIME,
+        ("VARIANT", "OBJECT", "ARRAY", "MAP", "GEOGRAPHY", "GEOMETRY", "VECTOR"): MYSQL_DATA_TYPE.VARCHAR,
     }
 
     for db_types_list, mysql_data_type in types_map.items():
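Note on this hunk: the dict keys are matched with Python's `in` operator in the loop that closes the hunk. Keys written as parentheses without a trailing comma, such as `("VARCHAR")` or `("TIMESTAMP_LTZ")`, are plain strings rather than tuples, so for those entries the membership test degrades to a substring check. A minimal standalone sketch of that lookup behaviour (the fallback value is hypothetical; it is not visible in this hunk):

    # Minimal sketch of the membership-based lookup (fallback is hypothetical).
    def lookup(internal_type_name: str, types_map: dict):
        internal_type_name = internal_type_name.upper()
        for db_types_list, mysql_data_type in types_map.items():
            # Tuple keys test exact membership; bare-string keys such as
            # ("VARCHAR") fall back to Python substring matching.
            if internal_type_name in db_types_list:
                return mysql_data_type
        return None  # real fallback not shown in this hunk

    assert "FLOAT4" in ("FLOAT", "FLOAT4", "FLOAT8")  # tuple key: exact member
    assert "TIMESTAMP" in ("TIMESTAMP_LTZ")           # string key: substring match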
@@ -84,26 +86,32 @@ def _make_table_response(result: DataFrame, cursor: SnowflakeCursor) -> Response
     for column in description:
         column_dtype = result[column.name].dtype
         description_column_type = connector.constants.FIELD_ID_TO_NAME.get(column.type_code)
+        if description_column_type in ("OBJECT", "ARRAY"):
+            mysql_types.append(MYSQL_DATA_TYPE.JSON)
+            continue
+        if description_column_type == "VECTOR":
+            mysql_types.append(MYSQL_DATA_TYPE.VECTOR)
+            continue
         if pd_types.is_integer_dtype(column_dtype):
             column_dtype_name = column_dtype.name
-            if column_dtype_name in ('int8', 'Int8'):
+            if column_dtype_name in ("int8", "Int8"):
                 mysql_types.append(MYSQL_DATA_TYPE.TINYINT)
-            elif column_dtype in ('int16', 'Int16'):
+            elif column_dtype in ("int16", "Int16"):
                 mysql_types.append(MYSQL_DATA_TYPE.SMALLINT)
-            elif column_dtype in ('int32', 'Int32'):
+            elif column_dtype in ("int32", "Int32"):
                 mysql_types.append(MYSQL_DATA_TYPE.MEDIUMINT)
-            elif column_dtype in ('int64', 'Int64'):
+            elif column_dtype in ("int64", "Int64"):
                 mysql_types.append(MYSQL_DATA_TYPE.BIGINT)
             else:
                 mysql_types.append(MYSQL_DATA_TYPE.INT)
             continue
         if pd_types.is_float_dtype(column_dtype):
             column_dtype_name = column_dtype.name
-            if column_dtype_name in ('float16', 'Float16'):  # Float16 does not exists so far
+            if column_dtype_name in ("float16", "Float16"):  # Float16 does not exists so far
                 mysql_types.append(MYSQL_DATA_TYPE.FLOAT)
-            elif column_dtype_name in ('float32', 'Float32'):
+            elif column_dtype_name in ("float32", "Float32"):
                 mysql_types.append(MYSQL_DATA_TYPE.FLOAT)
-            elif column_dtype_name in ('float64', 'Float64'):
+            elif column_dtype_name in ("float64", "Float64"):
                 mysql_types.append(MYSQL_DATA_TYPE.DOUBLE)
             else:
                 mysql_types.append(MYSQL_DATA_TYPE.FLOAT)
@@ -115,35 +123,35 @@ def _make_table_response(result: DataFrame, cursor: SnowflakeCursor) -> Response
             mysql_types.append(MYSQL_DATA_TYPE.DATETIME)
             series = result[column.name]
             # snowflake use pytz.timezone
-            if series.dt.tz is not None and getattr(series.dt.tz, 'zone', 'UTC') != 'UTC':
-                series = series.dt.tz_convert('UTC')
+            if series.dt.tz is not None and getattr(series.dt.tz, "zone", "UTC") != "UTC":
+                series = series.dt.tz_convert("UTC")
             result[column.name] = series.dt.tz_localize(None)
             continue
 
         if pd_types.is_object_dtype(column_dtype):
-            if description_column_type == 'TEXT':
+            if description_column_type == "TEXT":
                 # we can also check column.internal_size, if == 16777216 then it is TEXT, else VARCHAR(internal_size)
                 mysql_types.append(MYSQL_DATA_TYPE.TEXT)
                 continue
-            elif description_column_type == 'BINARY':
+            elif description_column_type == "BINARY":
                 # if column.internal_size == 8388608 then BINARY, else VARBINARY(internal_size)
                 mysql_types.append(MYSQL_DATA_TYPE.BINARY)
                 continue
-            elif description_column_type == 'DATE':
+            elif description_column_type == "DATE":
                 mysql_types.append(MYSQL_DATA_TYPE.DATE)
                 continue
-            elif description_column_type == 'TIME':
+            elif description_column_type == "TIME":
                 mysql_types.append(MYSQL_DATA_TYPE.TIME)
                 continue
 
-        if description_column_type == 'FIXED':
+        if description_column_type == "FIXED":
             if column.scale == 0:
                 mysql_types.append(MYSQL_DATA_TYPE.INT)
             else:
                 # It is NUMBER, DECIMAL or NUMERIC with scale > 0
                 mysql_types.append(MYSQL_DATA_TYPE.FLOAT)
             continue
-        elif description_column_type == 'REAL':
+        elif description_column_type == "REAL":
             mysql_types.append(MYSQL_DATA_TYPE.FLOAT)
             continue
 
@@ -154,24 +162,19 @@ def _make_table_response(result: DataFrame, cursor: SnowflakeCursor) -> Response
         columns=[column.name for column in description],
     )
 
-    return Response(
-        RESPONSE_TYPE.TABLE,
-        data_frame=df,
-        affected_rows=None,
-        mysql_types=mysql_types
-    )
+    return Response(RESPONSE_TYPE.TABLE, data_frame=df, affected_rows=None, mysql_types=mysql_types)
 
 
-class SnowflakeHandler(DatabaseHandler):
+class SnowflakeHandler(MetaDatabaseHandler):
     """
     This handler handles connection and execution of the Snowflake statements.
     """
 
-    name = 'snowflake'
+    name = "snowflake"
 
     def __init__(self, name, **kwargs):
         super().__init__(name)
-        self.connection_data = kwargs.get('connection_data')
+        self.connection_data = kwargs.get("connection_data")
         self.renderer = SqlalchemyRender(snowdialect.dialect)
 
         self.is_connected = False
@@ -193,18 +196,18 @@ class SnowflakeHandler(DatabaseHandler):
             return self.connection
 
         # Mandatory connection parameters
-        if not all(key in self.connection_data for key in ['account', 'user', 'password', 'database']):
-            raise ValueError('Required parameters (account, user, password, database) must be provided.')
+        if not all(key in self.connection_data for key in ["account", "user", "password", "database"]):
+            raise ValueError("Required parameters (account, user, password, database) must be provided.")
 
         config = {
-            'account': self.connection_data.get('account'),
-            'user': self.connection_data.get('user'),
-            'password': self.connection_data.get('password'),
-            'database': self.connection_data.get('database')
+            "account": self.connection_data.get("account"),
+            "user": self.connection_data.get("user"),
+            "password": self.connection_data.get("password"),
+            "database": self.connection_data.get("database"),
        }
 
         # Optional connection parameters
-        optional_params = ['schema', 'warehouse', 'role']
+        optional_params = ["schema", "warehouse", "role"]
         for param in optional_params:
             if param in self.connection_data:
                 config[param] = self.connection_data[param]
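To illustrate the parameter handling in this hunk: the four mandatory keys are validated and copied unconditionally, while optional ones are copied only when present. A standalone sketch with placeholder credentials (all values hypothetical):

    # Hypothetical connection_data and the config dict the hunk above would build.
    connection_data = {
        "account": "myorg-myaccount",  # placeholder values
        "user": "alice",
        "password": "secret",
        "database": "ANALYTICS",
        "warehouse": "COMPUTE_WH",  # optional; copied only if present
    }
    config = {k: connection_data.get(k) for k in ("account", "user", "password", "database")}
    for param in ("schema", "warehouse", "role"):
        if param in connection_data:
            config[param] = connection_data[param]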
@@ -215,7 +218,7 @@ class SnowflakeHandler(DatabaseHandler):
             self.is_connected = True
             return self.connection
         except connector.errors.Error as e:
-            logger.error(f'Error connecting to Snowflake, {e}!')
+            logger.error(f"Error connecting to Snowflake, {e}!")
             raise
 
     def disconnect(self):
@@ -244,10 +247,10 @@ class SnowflakeHandler(DatabaseHandler):
 
             # Execute a simple query to test the connection
             with connection.cursor() as cur:
-                cur.execute('select 1;')
+                cur.execute("select 1;")
             response.success = True
         except (connector.errors.Error, ValueError) as e:
-            logger.error(f'Error connecting to Snowflake, {e}!')
+            logger.error(f"Error connecting to Snowflake, {e}!")
             response.error_message = str(e)
 
         if response.success and need_to_close:
@@ -276,7 +279,6 @@ class SnowflakeHandler(DatabaseHandler):
         try:
             cur.execute(query)
             try:
-
                 try:
                     batches_iter = cur.fetch_pandas_batches()
                 except ValueError:
@@ -297,64 +299,52 @@ class SnowflakeHandler(DatabaseHandler):
                     if memory_estimation_check_done is False and batches_rowcount > 1000:
                         memory_estimation_check_done = True
                         available_memory_kb = psutil.virtual_memory().available >> 10
-                        batches_size_kb = sum([(x.memory_usage(index=True, deep=True).sum() >> 10) for x in batches])
+                        batches_size_kb = sum(
+                            [(x.memory_usage(index=True, deep=True).sum() >> 10) for x in batches]
+                        )
                         total_rowcount = cur.rowcount
                         rest_rowcount = total_rowcount - batches_rowcount
                         rest_estimated_size_kb = int((rest_rowcount / batches_rowcount) * batches_size_kb)
                         if (available_memory_kb * 0.9) < rest_estimated_size_kb:
                             logger.error(
-                                'Attempt to get too large dataset:\n'
-                                f'batches_rowcount={batches_rowcount}, size_kb={batches_size_kb}\n'
-                                f'total_rowcount={total_rowcount}, estimated_size_kb={rest_estimated_size_kb}\n'
-                                f'available_memory_kb={available_memory_kb}'
+                                "Attempt to get too large dataset:\n"
+                                f"batches_rowcount={batches_rowcount}, size_kb={batches_size_kb}\n"
+                                f"total_rowcount={total_rowcount}, estimated_size_kb={rest_estimated_size_kb}\n"
+                                f"available_memory_kb={available_memory_kb}"
                             )
-                            raise MemoryError('Not enought memory')
+                            raise MemoryError("Not enought memory")
                 # endregion
                 if len(batches) > 0:
                     response = _make_table_response(result=pandas.concat(batches, ignore_index=True), cursor=cur)
                 else:
-                    response = Response(
-                        RESPONSE_TYPE.TABLE,
-                        DataFrame(
-                            [],
-                            columns=[x[0] for x in cur.description]
-                        )
-                    )
+                    response = Response(RESPONSE_TYPE.TABLE, DataFrame([], columns=[x[0] for x in cur.description]))
             except NotSupportedError:
                 # Fallback for CREATE/DELETE/UPDATE. These commands returns table with single column,
                 # but it cannot be retrieved as pandas DataFrame.
                 result = cur.fetchall()
                 match result:
                     case (
-                        [{'number of rows inserted': affected_rows}]
-                        | [{'number of rows deleted': affected_rows}]
-                        | [{'number of rows updated': affected_rows, 'number of multi-joined rows updated': _}]
+                        [{"number of rows inserted": affected_rows}]
+                        | [{"number of rows deleted": affected_rows}]
+                        | [{"number of rows updated": affected_rows, "number of multi-joined rows updated": _}]
                     ):
                         response = Response(RESPONSE_TYPE.OK, affected_rows=affected_rows)
                     case list():
                         response = Response(
-                            RESPONSE_TYPE.TABLE,
-                            DataFrame(
-                                result,
-                                columns=[x[0] for x in cur.description]
-                            )
+                            RESPONSE_TYPE.TABLE, DataFrame(result, columns=[x[0] for x in cur.description])
                         )
                     case _:
                         # Looks like SnowFlake always returns something in response, so this is suspicious
-                        logger.warning('Snowflake did not return any data in response.')
+                        logger.warning("Snowflake did not return any data in response.")
                         response = Response(RESPONSE_TYPE.OK)
         except Exception as e:
             logger.error(f"Error running query: {query} on {self.connection_data.get('database')}, {e}!")
-            response = Response(
-                RESPONSE_TYPE.ERROR,
-                error_code=0,
-                error_message=str(e)
-            )
+            response = Response(RESPONSE_TYPE.ERROR, error_code=0, error_message=str(e))
 
         if need_to_close is True:
             self.disconnect()
 
-        if memory_pool is not None and memory_pool.backend_name == 'jemalloc':
+        if memory_pool is not None and memory_pool.backend_name == "jemalloc":
             # This reduce memory consumption, but will slow down next query slightly.
             # Except pool type 'jemalloc': memory consumption do not change significantly
             # and next query processing time may be even lower.
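The memory guard in this hunk extrapolates from the first fetched batches: it measures their in-memory size, scales that by the proportion of rows still to be fetched, and aborts when the estimate exceeds 90% of available memory. A worked sketch of the same arithmetic with made-up numbers:

    # Standalone sketch of the extrapolation used by the memory guard (made-up numbers).
    batches_rowcount = 2_000         # rows fetched so far
    batches_size_kb = 50_000         # measured size of those rows, in KiB
    total_rowcount = 100_000         # cur.rowcount: total rows in the result
    available_memory_kb = 1_000_000  # psutil.virtual_memory().available >> 10

    rest_rowcount = total_rowcount - batches_rowcount
    rest_estimated_size_kb = int((rest_rowcount / batches_rowcount) * batches_size_kb)  # 2_450_000

    # Same check as above: abort if the remaining rows are projected to
    # exceed 90% of free memory (2_450_000 > 900_000 here, so it would raise).
    if (available_memory_kb * 0.9) < rest_estimated_size_kb:
        raise MemoryError("estimated result set does not fit in memory")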
@@ -385,7 +375,7 @@ class SnowflakeHandler(DatabaseHandler):
         quoted_columns = []
         if query.targets is not None:
             for column in query.targets:
-                if hasattr(column, 'alias') and column.alias is not None:
+                if hasattr(column, "alias") and column.alias is not None:
                     if column.alias.is_quoted[-1]:
                         quoted_columns.append(column.alias.parts[-1])
                 elif isinstance(column, Identifier):
@@ -455,3 +445,264 @@ class SnowflakeHandler(DatabaseHandler):
             result.to_columns_table_response(map_type_fn=_map_type)
 
         return result
+
+    def meta_get_tables(self, table_names: Optional[List[str]] = None) -> Response:
+        """
+        Retrieves metadata information about the tables in the Snowflake database to be stored in the data catalog.
+
+        Args:
+            table_names (list): A list of table names for which to retrieve metadata information.
+
+        Returns:
+            Response: A response object containing the metadata information, formatted as per the `Response` class.
+        """
+        query = """
+            SELECT
+                TABLE_CATALOG,
+                TABLE_SCHEMA,
+                TABLE_NAME,
+                TABLE_TYPE,
+                COMMENT AS TABLE_DESCRIPTION,
+                ROW_COUNT,
+                CREATED,
+                LAST_ALTERED
+            FROM INFORMATION_SCHEMA.TABLES
+            WHERE TABLE_SCHEMA = current_schema()
+            AND TABLE_TYPE IN ('BASE TABLE', 'VIEW')
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names_str = ", ".join([f"'{t.upper()}'" for t in table_names])
+            query += f" AND TABLE_NAME IN ({table_names_str})"
+
+        result = self.native_query(query)
+        result.data_frame["ROW_COUNT"] = result.data_frame["ROW_COUNT"].astype(int)
+
+        return result
+
+    def meta_get_columns(self, table_names: Optional[List[str]] = None) -> Response:
+        """
+        Retrieves column metadata for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve column metadata.
+
+        Returns:
+            Response: A response object containing the column metadata.
+        """
+        query = """
+            SELECT
+                TABLE_NAME,
+                COLUMN_NAME,
+                DATA_TYPE,
+                COMMENT AS COLUMN_DESCRIPTION,
+                COLUMN_DEFAULT,
+                (IS_NULLABLE = 'YES') AS IS_NULLABLE,
+                CHARACTER_MAXIMUM_LENGTH,
+                CHARACTER_OCTET_LENGTH,
+                NUMERIC_PRECISION,
+                NUMERIC_SCALE,
+                DATETIME_PRECISION,
+                CHARACTER_SET_NAME,
+                COLLATION_NAME
+            FROM INFORMATION_SCHEMA.COLUMNS
+            WHERE TABLE_SCHEMA = current_schema()
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names_str = ", ".join([f"'{t.upper()}'" for t in table_names])
+            query += f" AND TABLE_NAME IN ({table_names_str})"
+
+        result = self.native_query(query)
+        return result
+
+    def meta_get_column_statistics(self, table_names: Optional[List[str]] = None) -> Response:
+        """
+        Retrieves basic column statistics: null %, distinct count.
+        Due to Snowflake limitations, this runs per-table not per-column.
+        TODO: Add most_common_values and most_common_frequencies
+        """
+        columns_query = """
+            SELECT TABLE_NAME, COLUMN_NAME
+            FROM INFORMATION_SCHEMA.COLUMNS
+            WHERE TABLE_SCHEMA = current_schema()
+        """
+        if table_names:
+            table_names_str = ", ".join([f"'{t.upper()}'" for t in table_names])
+            columns_query += f" AND TABLE_NAME IN ({table_names_str})"
+
+        columns_result = self.native_query(columns_query)
+        if (
+            columns_result.type == RESPONSE_TYPE.ERROR
+            or columns_result.data_frame is None
+            or columns_result.data_frame.empty
+        ):
+            return Response(RESPONSE_TYPE.ERROR, error_message="No columns found.")
+
+        columns_df = columns_result.data_frame
+        grouped = columns_df.groupby("TABLE_NAME")
+        all_stats = []
+
+        for table_name, group in grouped:
+            select_parts = []
+            for _, row in group.iterrows():
+                col = row["COLUMN_NAME"]
+                # Ensure column names in the query are properly quoted if they contain special characters or are case-sensitive
+                quoted_col = f'"{col}"'
+                select_parts.extend(
+                    [
+                        f'COUNT_IF({quoted_col} IS NULL) AS "nulls_{col}"',
+                        f'APPROX_COUNT_DISTINCT({quoted_col}) AS "distincts_{col}"',
+                        f'MIN({quoted_col}) AS "min_{col}"',
+                        f'MAX({quoted_col}) AS "max_{col}"',
+                    ]
+                )
+
+            quoted_table_name = f'"{table_name}"'
+            stats_query = f"""
+                SELECT COUNT(*) AS "total_rows", {", ".join(select_parts)}
+                FROM {quoted_table_name}
+            """
+            try:
+                stats_res = self.native_query(stats_query)
+                if stats_res.type != RESPONSE_TYPE.TABLE or stats_res.data_frame is None or stats_res.data_frame.empty:
+                    logger.warning(
+                        f"Could not retrieve stats for table {table_name}. Query returned no data or an error: {stats_res.error_message if stats_res.type == RESPONSE_TYPE.ERROR else 'No data'}"
+                    )
+                    # Add placeholder stats if query fails or returns empty
+                    for _, row in group.iterrows():
+                        all_stats.append(
+                            {
+                                "table_name": table_name,
+                                "column_name": row["COLUMN_NAME"],
+                                "null_percentage": None,
+                                "distinct_values_count": None,
+                                "most_common_values": [],
+                                "most_common_frequencies": [],
+                                "minimum_value": None,
+                                "maximum_value": None,
+                            }
+                        )
+                    continue
+
+                stats_data = stats_res.data_frame.iloc[0]
+                total_rows = stats_data.get("total_rows", 0)
+
+                for _, row in group.iterrows():
+                    col = row["COLUMN_NAME"]
+                    # Keys for stats_data should match the aliases in stats_query (e.g., "nulls_COLNAME")
+                    nulls = stats_data.get(f"nulls_{col}", 0)
+                    distincts = stats_data.get(f"distincts_{col}", None)
+                    min_val = stats_data.get(f"min_{col}", None)
+                    max_val = stats_data.get(f"max_{col}", None)
+                    null_pct = (nulls / total_rows) * 100 if total_rows is not None and total_rows > 0 else None
+
+                    all_stats.append(
+                        {
+                            "table_name": table_name,
+                            "column_name": col,
+                            "null_percentage": null_pct,
+                            "distinct_values_count": distincts,
+                            "most_common_values": [],
+                            "most_common_frequencies": [],
+                            "minimum_value": min_val,
+                            "maximum_value": max_val,
+                        }
+                    )
+            except Exception as e:
+                logger.error(f"Exception while fetching statistics for table {table_name}: {e}")
+                for _, row in group.iterrows():
+                    all_stats.append(
+                        {
+                            "table_name": table_name,
+                            "column_name": row["COLUMN_NAME"],
+                            "null_percentage": None,
+                            "distinct_values_count": None,
+                            "most_common_values": [],
+                            "most_common_frequencies": [],
+                            "minimum_value": None,
+                            "maximum_value": None,
+                        }
+                    )
+
+        if not all_stats:
+            return Response(RESPONSE_TYPE.TABLE, data_frame=pandas.DataFrame())
+
+        return Response(RESPONSE_TYPE.TABLE, data_frame=pandas.DataFrame(all_stats))
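To make the per-table batching concrete: for a hypothetical table ORDERS with columns ID and STATUS (identifiers illustrative), the loop above assembles one aggregate query per table, using the same Snowflake aggregates, roughly:

    # Illustrative stats_query generated for a hypothetical table ORDERS(ID, STATUS).
    stats_query = """
        SELECT COUNT(*) AS "total_rows",
            COUNT_IF("ID" IS NULL) AS "nulls_ID",
            APPROX_COUNT_DISTINCT("ID") AS "distincts_ID",
            MIN("ID") AS "min_ID",
            MAX("ID") AS "max_ID",
            COUNT_IF("STATUS" IS NULL) AS "nulls_STATUS",
            APPROX_COUNT_DISTINCT("STATUS") AS "distincts_STATUS",
            MIN("STATUS") AS "min_STATUS",
            MAX("STATUS") AS "max_STATUS"
        FROM "ORDERS"
    """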
+
+    def meta_get_primary_keys(self, table_names: Optional[List[str]] = None) -> Response:
+        """
+        Retrieves primary key information for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve primary key information.
+
+        Returns:
+            Response: A response object containing the primary key information.
+        """
+        try:
+            query = """
+                SHOW PRIMARY KEYS IN TABLE;
+            """
+
+            response = self.native_query(query)
+            if response.type == RESPONSE_TYPE.ERROR and response.error_message:
+                logger.error(f"Query error in meta_get_primary_keys: {response.error_message}\nQuery:\n{query}")
+
+            df = response.data_frame
+            if not df.empty:
+                if table_names:
+                    df = df[df["table_name"].isin(table_names)]
+
+                df = df[["table_name", "column_name", "key_sequence", "constraint_name"]]
+                df = df.rename(columns={"key_sequence": "ordinal_position"})
+
+                response.data_frame = df
+
+            return response
+
+        except Exception as e:
+            logger.error(f"Exception in meta_get_primary_keys: {e!r}")
+            return Response(RESPONSE_TYPE.ERROR, error_message=f"Exception querying primary keys: {e!r}")
+
+    def meta_get_foreign_keys(self, table_names: Optional[List[str]] = None) -> Response:
+        """
+        Retrieves foreign key information for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve foreign key information.
+
+        Returns:
+            Response: A response object containing the foreign key information.
+        """
+        try:
+            query = """
+                SHOW IMPORTED KEYS IN TABLE;
+            """
+
+            response = self.native_query(query)
+            if response.type == RESPONSE_TYPE.ERROR and response.error_message:
+                logger.error(f"Query error in meta_get_primary_keys: {response.error_message}\nQuery:\n{query}")
+
+            df = response.data_frame
+            if not df.empty:
+                if table_names:
+                    df = df[df["pk_table_name"].isin(table_names) & df["fk_table_name"].isin(table_names)]
+
+                df = df[["pk_table_name", "pk_column_name", "fk_table_name", "fk_column_name"]]
+                df = df.rename(
+                    columns={
+                        "pk_table_name": "child_table_name",
+                        "pk_column_name": "child_column_name",
+                        "fk_table_name": "parent_table_name",
+                        "fk_column_name": "parent_column_name",
+                    }
+                )
+
+                response.data_frame = df
+
+            return response
+
+        except Exception as e:
+            logger.error(f"Exception in meta_get_primary_keys: {e!r}")
+            return Response(RESPONSE_TYPE.ERROR, error_message=f"Exception querying primary keys: {e!r}")
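A standalone sketch of the post-processing above, with a synthetic SHOW IMPORTED KEYS row (table and column names invented): note that the filter keeps a key only when both sides appear in `table_names`, and that the rename maps the pk_* columns to child_* and the fk_* columns to parent_*.

    # Synthetic SHOW IMPORTED KEYS row passed through the same filter and rename.
    import pandas as pd

    df = pd.DataFrame([{
        "pk_table_name": "CUSTOMERS", "pk_column_name": "ID",
        "fk_table_name": "ORDERS", "fk_column_name": "CUSTOMER_ID",
    }])
    table_names = ["CUSTOMERS", "ORDERS"]
    df = df[df["pk_table_name"].isin(table_names) & df["fk_table_name"].isin(table_names)]
    df = df.rename(columns={
        "pk_table_name": "child_table_name", "pk_column_name": "child_column_name",
        "fk_table_name": "parent_table_name", "fk_column_name": "parent_column_name",
    })
    print(df.iloc[0].to_dict())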