MindsDB 25.5.4.2__py3-none-any.whl → 25.6.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release

This version of MindsDB might be problematic.

Files changed (69)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/a2a/agent.py +28 -25
  3. mindsdb/api/a2a/common/server/server.py +32 -26
  4. mindsdb/api/executor/command_executor.py +69 -14
  5. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +49 -65
  6. mindsdb/api/executor/datahub/datanodes/project_datanode.py +29 -48
  7. mindsdb/api/executor/datahub/datanodes/system_tables.py +35 -61
  8. mindsdb/api/executor/planner/plan_join.py +67 -77
  9. mindsdb/api/executor/planner/query_planner.py +176 -155
  10. mindsdb/api/executor/planner/steps.py +37 -12
  11. mindsdb/api/executor/sql_query/result_set.py +45 -64
  12. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +14 -18
  13. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +17 -18
  14. mindsdb/api/executor/sql_query/steps/insert_step.py +13 -33
  15. mindsdb/api/executor/sql_query/steps/subselect_step.py +43 -35
  16. mindsdb/api/executor/utilities/sql.py +42 -48
  17. mindsdb/api/http/namespaces/config.py +1 -1
  18. mindsdb/api/http/namespaces/file.py +14 -23
  19. mindsdb/api/mysql/mysql_proxy/data_types/mysql_datum.py +12 -28
  20. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/binary_resultset_row_package.py +59 -50
  21. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/resultset_row_package.py +9 -8
  22. mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +449 -461
  23. mindsdb/api/mysql/mysql_proxy/utilities/dump.py +87 -36
  24. mindsdb/integrations/handlers/file_handler/file_handler.py +15 -9
  25. mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +43 -24
  26. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +10 -3
  27. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +26 -33
  28. mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +74 -51
  29. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +305 -98
  30. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +53 -34
  31. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +136 -6
  32. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +334 -83
  33. mindsdb/integrations/libs/api_handler.py +261 -57
  34. mindsdb/integrations/libs/base.py +100 -29
  35. mindsdb/integrations/utilities/files/file_reader.py +99 -73
  36. mindsdb/integrations/utilities/handler_utils.py +23 -8
  37. mindsdb/integrations/utilities/sql_utils.py +35 -40
  38. mindsdb/interfaces/agents/agents_controller.py +196 -192
  39. mindsdb/interfaces/agents/constants.py +7 -1
  40. mindsdb/interfaces/agents/langchain_agent.py +42 -11
  41. mindsdb/interfaces/agents/mcp_client_agent.py +29 -21
  42. mindsdb/interfaces/data_catalog/__init__.py +0 -0
  43. mindsdb/interfaces/data_catalog/base_data_catalog.py +54 -0
  44. mindsdb/interfaces/data_catalog/data_catalog_loader.py +359 -0
  45. mindsdb/interfaces/data_catalog/data_catalog_reader.py +34 -0
  46. mindsdb/interfaces/database/database.py +81 -57
  47. mindsdb/interfaces/database/integrations.py +220 -234
  48. mindsdb/interfaces/database/log.py +72 -104
  49. mindsdb/interfaces/database/projects.py +156 -193
  50. mindsdb/interfaces/file/file_controller.py +21 -65
  51. mindsdb/interfaces/knowledge_base/controller.py +63 -10
  52. mindsdb/interfaces/knowledge_base/evaluate.py +519 -0
  53. mindsdb/interfaces/knowledge_base/llm_client.py +75 -0
  54. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +83 -43
  55. mindsdb/interfaces/skills/skills_controller.py +54 -36
  56. mindsdb/interfaces/skills/sql_agent.py +109 -86
  57. mindsdb/interfaces/storage/db.py +223 -79
  58. mindsdb/migrations/versions/2025-05-28_a44643042fe8_added_data_catalog_tables.py +118 -0
  59. mindsdb/migrations/versions/2025-06-09_608e376c19a7_updated_data_catalog_data_types.py +58 -0
  60. mindsdb/utilities/config.py +9 -2
  61. mindsdb/utilities/log.py +35 -26
  62. mindsdb/utilities/ml_task_queue/task.py +19 -22
  63. mindsdb/utilities/render/sqlalchemy_render.py +129 -181
  64. mindsdb/utilities/starters.py +40 -0
  65. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/METADATA +253 -253
  66. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/RECORD +69 -61
  67. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/WHEEL +0 -0
  68. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/licenses/LICENSE +0 -0
  69. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.2.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/postgres_handler/postgres_handler.py

@@ -1,3 +1,5 @@
+ import csv
+ import io
  import time
  import json
  from typing import Optional, Any
@@ -13,12 +15,12 @@ from mindsdb_sql_parser import parse_sql
  from mindsdb.utilities.render.sqlalchemy_render import SqlalchemyRender
  from mindsdb_sql_parser.ast.base import ASTNode

- from mindsdb.integrations.libs.base import DatabaseHandler
+ from mindsdb.integrations.libs.base import MetaDatabaseHandler
  from mindsdb.utilities import log
  from mindsdb.integrations.libs.response import (
      HandlerStatusResponse as StatusResponse,
      HandlerResponse as Response,
-     RESPONSE_TYPE
+     RESPONSE_TYPE,
  )
  import mindsdb.utilities.profiler as profiler
  from mindsdb.api.mysql.mysql_proxy.libs.constants.mysql import MYSQL_DATA_TYPE
@@ -44,20 +46,21 @@ def _map_type(internal_type_name: str | None) -> MYSQL_DATA_TYPE:

      internal_type_name = internal_type_name.lower()
      types_map = {
-         ('smallint', 'smallserial'): MYSQL_DATA_TYPE.SMALLINT,
-         ('integer', 'int', 'serial'): MYSQL_DATA_TYPE.INT,
-         ('bigint', 'bigserial'): MYSQL_DATA_TYPE.BIGINT,
-         ('real', 'float'): MYSQL_DATA_TYPE.FLOAT,
-         ('numeric', 'decimal'): MYSQL_DATA_TYPE.DECIMAL,
-         ('double precision',): MYSQL_DATA_TYPE.DOUBLE,
-         ('character varying', 'varchar'): MYSQL_DATA_TYPE.VARCHAR,
+         ("smallint", "smallserial"): MYSQL_DATA_TYPE.SMALLINT,
+         ("integer", "int", "serial"): MYSQL_DATA_TYPE.INT,
+         ("bigint", "bigserial"): MYSQL_DATA_TYPE.BIGINT,
+         ("real", "float"): MYSQL_DATA_TYPE.FLOAT,
+         ("numeric", "decimal"): MYSQL_DATA_TYPE.DECIMAL,
+         ("double precision",): MYSQL_DATA_TYPE.DOUBLE,
+         ("character varying", "varchar"): MYSQL_DATA_TYPE.VARCHAR,
          # NOTE: if char types were returned as mysql's CHAR, the response would be padded with spaces, so return them as TEXT
-         ('money', 'character', 'char', 'bpchar', 'bpchar', 'text'): MYSQL_DATA_TYPE.TEXT,
-         ('timestamp', 'timestamp without time zone', 'timestamp with time zone'): MYSQL_DATA_TYPE.DATETIME,
-         ('date', ): MYSQL_DATA_TYPE.DATE,
-         ('time', 'time without time zone', 'time with time zone'): MYSQL_DATA_TYPE.TIME,
-         ('boolean',): MYSQL_DATA_TYPE.BOOL,
-         ('bytea',): MYSQL_DATA_TYPE.BINARY,
+         ("money", "character", "char", "bpchar", "bpchar", "text"): MYSQL_DATA_TYPE.TEXT,
+         ("timestamp", "timestamp without time zone", "timestamp with time zone"): MYSQL_DATA_TYPE.DATETIME,
+         ("date",): MYSQL_DATA_TYPE.DATE,
+         ("time", "time without time zone", "time with time zone"): MYSQL_DATA_TYPE.TIME,
+         ("boolean",): MYSQL_DATA_TYPE.BOOL,
+         ("bytea",): MYSQL_DATA_TYPE.BINARY,
+         ("json", "jsonb"): MYSQL_DATA_TYPE.JSON,
      }

      for db_types_list, mysql_data_type in types_map.items():
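
`types_map` above keys each MySQL type on a tuple of Postgres type names, and the loop in the last context line does a simple membership scan over `types_map.items()`. A standalone sketch of that tuple-keyed lookup, with plain strings standing in for the MYSQL_DATA_TYPE enum and the TEXT fallback assumed:

# Sketch of _map_type's tuple-keyed lookup; mapping trimmed for brevity.
types_map = {
    ("smallint", "smallserial"): "SMALLINT",
    ("integer", "int", "serial"): "INT",
    ("json", "jsonb"): "JSON",
}

def map_type(internal_type_name):
    internal_type_name = internal_type_name.lower()
    for db_types_list, mysql_data_type in types_map.items():
        if internal_type_name in db_types_list:
            return mysql_data_type
    return "TEXT"  # assumed fallback for unmapped types

assert map_type("JSONB") == "JSON"
assert map_type("serial") == "INT"
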
@@ -81,10 +84,28 @@ def _make_table_response(result: list[tuple[Any]], cursor: Cursor) -> Response:
      description: list[PGColumn] = cursor.description
      mysql_types: list[MYSQL_DATA_TYPE] = []
      for column in description:
+         if column.type_display == "vector":
+             # 'vector' is the pgvector extension type; matched by name here to avoid importing pgvector
+             # NOTE: data is returned as a numpy array
+             mysql_types.append(MYSQL_DATA_TYPE.VECTOR)
+             continue
          pg_type_info: TypeInfo = pg_types.get(column.type_code)
          if pg_type_info is None:
-             logger.warning(f'Postgres handler: unknown type: {column.type_code}')
-         regtype: str = pg_type_info.regtype if pg_type_info is not None else None
+             # postgres may return 'polymorphic' types, which are not present in pg_types
+             # the list of polymorphic types can be obtained with:
+             # SELECT oid, typname, typcategory FROM pg_type WHERE typcategory = 'P' ORDER BY oid;
+             if column.type_code in (2277, 5078):
+                 # anyarray, anycompatiblearray
+                 regtype = "json"
+             else:
+                 logger.warning(f"Postgres handler: unknown type: {column.type_code}")
+                 mysql_types.append(MYSQL_DATA_TYPE.TEXT)
+                 continue
+         elif pg_type_info.array_oid == column.type_code:
+             # it is an array; handle it as json
+             regtype: str = "json"
+         else:
+             regtype: str = pg_type_info.regtype if pg_type_info is not None else None
          mysql_type = _map_type(regtype)
          mysql_types.append(mysql_type)

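
The new branch resolves a column's `regtype` in a fixed order: pgvector columns short-circuit to VECTOR, OIDs missing from the registry fall back to TEXT unless they are the polymorphic `anyarray`/`anycompatiblearray` OIDs (2277, 5078), and array OIDs are surfaced as JSON. A self-contained sketch of that decision order, using a stand-in for psycopg's TypeInfo registry:

from dataclasses import dataclass

@dataclass
class FakeTypeInfo:
    # stand-in for psycopg's TypeInfo; only the fields this branch reads
    regtype: str
    array_oid: int

# hypothetical registry entries: int4 (oid 23) and its array type int4[] (oid 1007)
registry = {23: FakeTypeInfo("integer", 1007), 1007: FakeTypeInfo("integer", 1007)}

def resolve_regtype(type_code):
    info = registry.get(type_code)
    if info is None:
        # polymorphic anyarray/anycompatiblearray -> json, anything else -> text
        return "json" if type_code in (2277, 5078) else "text"
    if info.array_oid == type_code:
        return "json"  # array columns are serialized as JSON
    return info.regtype

assert resolve_regtype(23) == "integer"   # scalar column
assert resolve_regtype(1007) == "json"    # array column
assert resolve_regtype(2277) == "json"    # polymorphic anyarray
assert resolve_regtype(424242) == "text"  # unknown oid
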
@@ -93,38 +114,37 @@ def _make_table_response(result: list[tuple[Any]], cursor: Cursor) -> Response:
      for i, mysql_type in enumerate(mysql_types):
          expected_dtype = None
          if mysql_type in (
-             MYSQL_DATA_TYPE.SMALLINT, MYSQL_DATA_TYPE.INT, MYSQL_DATA_TYPE.MEDIUMINT,
-             MYSQL_DATA_TYPE.BIGINT, MYSQL_DATA_TYPE.TINYINT
+             MYSQL_DATA_TYPE.SMALLINT,
+             MYSQL_DATA_TYPE.INT,
+             MYSQL_DATA_TYPE.MEDIUMINT,
+             MYSQL_DATA_TYPE.BIGINT,
+             MYSQL_DATA_TYPE.TINYINT,
          ):
-             expected_dtype = 'Int64'
+             expected_dtype = "Int64"
          elif mysql_type in (MYSQL_DATA_TYPE.BOOL, MYSQL_DATA_TYPE.BOOLEAN):
-             expected_dtype = 'boolean'
+             expected_dtype = "boolean"
          serieses.append(pd.Series([row[i] for row in result], dtype=expected_dtype, name=description[i].name))
      df = pd.concat(serieses, axis=1, copy=False)
      # endregion

-     return Response(
-         RESPONSE_TYPE.TABLE,
-         data_frame=df,
-         affected_rows=cursor.rowcount,
-         mysql_types=mysql_types
-     )
+     return Response(RESPONSE_TYPE.TABLE, data_frame=df, affected_rows=cursor.rowcount, mysql_types=mysql_types)


- class PostgresHandler(DatabaseHandler):
+ class PostgresHandler(MetaDatabaseHandler):
      """
      This handler handles connection and execution of the PostgreSQL statements.
      """
-     name = 'postgres'

-     @profiler.profile('init_pg_handler')
+     name = "postgres"
+
+     @profiler.profile("init_pg_handler")
      def __init__(self, name=None, **kwargs):
          super().__init__(name)
          self.parser = parse_sql
-         self.connection_args = kwargs.get('connection_data')
-         self.dialect = 'postgresql'
-         self.database = self.connection_args.get('database')
-         self.renderer = SqlalchemyRender('postgres')
+         self.connection_args = kwargs.get("connection_data")
+         self.dialect = "postgresql"
+         self.database = self.connection_args.get("database")
+         self.renderer = SqlalchemyRender("postgres")

          self.connection = None
          self.is_connected = False
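
The explicit `Int64` and `boolean` dtypes above are pandas extension (nullable) dtypes: they keep SQL NULLs as `pd.NA`, whereas default inference silently promotes an integer column containing NULL to float64. A quick demonstration:

import pandas as pd

rows = [(1,), (None,), (3,)]

inferred = pd.Series([r[0] for r in rows])                 # dtype: float64, NULL -> NaN
nullable = pd.Series([r[0] for r in rows], dtype="Int64")  # dtype: Int64, NULL -> <NA>

print(inferred.dtype, nullable.dtype)  # float64 Int64
print(nullable.tolist())               # [1, <NA>, 3]
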
@@ -136,30 +156,30 @@ class PostgresHandler(DatabaseHandler):

      def _make_connection_args(self):
          config = {
-             'host': self.connection_args.get('host'),
-             'port': self.connection_args.get('port'),
-             'user': self.connection_args.get('user'),
-             'password': self.connection_args.get('password'),
-             'dbname': self.connection_args.get('database')
+             "host": self.connection_args.get("host"),
+             "port": self.connection_args.get("port"),
+             "user": self.connection_args.get("user"),
+             "password": self.connection_args.get("password"),
+             "dbname": self.connection_args.get("database"),
          }

          # https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-PARAMKEYWORDS
-         connection_parameters = self.connection_args.get('connection_parameters')
+         connection_parameters = self.connection_args.get("connection_parameters")
          if isinstance(connection_parameters, dict) is False:
              connection_parameters = {}
-         if 'connect_timeout' not in connection_parameters:
-             connection_parameters['connect_timeout'] = 10
+         if "connect_timeout" not in connection_parameters:
+             connection_parameters["connect_timeout"] = 10
          config.update(connection_parameters)

-         if self.connection_args.get('sslmode'):
-             config['sslmode'] = self.connection_args.get('sslmode')
+         if self.connection_args.get("sslmode"):
+             config["sslmode"] = self.connection_args.get("sslmode")

-         if self.connection_args.get('autocommit'):
-             config['autocommit'] = self.connection_args.get('autocommit')
+         if self.connection_args.get("autocommit"):
+             config["autocommit"] = self.connection_args.get("autocommit")

          # If schema is not provided, set public as the default one
-         if self.connection_args.get('schema'):
-             config['options'] = f'-c search_path={self.connection_args.get("schema")},public'
+         if self.connection_args.get("schema"):
+             config["options"] = f"-c search_path={self.connection_args.get('schema')},public"
          return config

      @profiler.profile()
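
`_make_connection_args` layers three sources: the base keys from `connection_data`, optional libpq `connection_parameters` (with a default `connect_timeout` of 10 seconds), and an `options` string that puts the configured schema ahead of `public` in `search_path`. A sketch of that layering with assumed inputs:

# Assumed connection_data; mirrors the layering in _make_connection_args.
connection_args = {
    "host": "localhost",
    "port": 5432,
    "user": "mindsdb",
    "password": "secret",
    "database": "demo",
    "schema": "sales",
    "connection_parameters": {"application_name": "mindsdb"},
}

config = {
    "host": connection_args.get("host"),
    "port": connection_args.get("port"),
    "user": connection_args.get("user"),
    "password": connection_args.get("password"),
    "dbname": connection_args.get("database"),
}
params = connection_args.get("connection_parameters") or {}
params.setdefault("connect_timeout", 10)
config.update(params)
if connection_args.get("schema"):
    config["options"] = f"-c search_path={connection_args['schema']},public"

print(config["dbname"], config["connect_timeout"], config["options"])
# demo 10 -c search_path=sales,public
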
@@ -182,7 +202,7 @@ class PostgresHandler(DatabaseHandler):
              self.is_connected = True
              return self.connection
          except psycopg.Error as e:
-             logger.error(f'Error connecting to PostgreSQL {self.database}, {e}!')
+             logger.error(f"Error connecting to PostgreSQL {self.database}, {e}!")
              self.is_connected = False
              raise

@@ -209,10 +229,10 @@ class PostgresHandler(DatabaseHandler):
              connection = self.connect()
              with connection.cursor() as cur:
                  # Execute a simple query to test the connection
-                 cur.execute('select 1;')
+                 cur.execute("select 1;")
              response.success = True
          except psycopg.Error as e:
-             logger.error(f'Error connecting to PostgreSQL {self.database}, {e}!')
+             logger.error(f"Error connecting to PostgreSQL {self.database}, {e}!")
              response.error_message = str(e)

          if response.success and need_to_close:
@@ -239,25 +259,25 @@ class PostgresHandler(DatabaseHandler):
              description (list): psycopg cursor description
          """
          types_map = {
-             'int2': 'int16',
-             'int4': 'int32',
-             'int8': 'int64',
-             'numeric': 'float64',
-             'float4': 'float32',
-             'float8': 'float64'
+             "int2": "int16",
+             "int4": "int32",
+             "int8": "int64",
+             "numeric": "float64",
+             "float4": "float32",
+             "float8": "float64",
          }
          columns = df.columns
          df.columns = list(range(len(columns)))
          for column_index, column_name in enumerate(df.columns):
              col = df[column_name]
-             if str(col.dtype) == 'object':
-                 pg_type_info: TypeInfo = pg_types.get(description[column_index].type_code) # type_code is int!?
+             if str(col.dtype) == "object":
+                 pg_type_info: TypeInfo = pg_types.get(description[column_index].type_code)  # type_code is int!?
                  if pg_type_info is not None and pg_type_info.name in types_map:
-                     col = col.fillna(0) # TODO rework
+                     col = col.fillna(0)  # TODO rework
                      try:
                          df[column_name] = col.astype(types_map[pg_type_info.name])
                      except ValueError as e:
-                         logger.error(f'Error casting column {col.name} to {types_map[pg_type_info.name]}: {e}')
+                         logger.error(f"Error casting column {col.name} to {types_map[pg_type_info.name]}: {e}")
          df.columns = columns

      @profiler.profile()
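
`_cast_dtypes` narrows object-typed columns back to numeric dtypes using the pg type name, but note the `fillna(0)` (already flagged `TODO rework`): NULLs are rewritten as zeros before the cast. A small illustration of that side effect:

import pandas as pd

# An object column as psycopg might hand back an int4 column containing NULLs.
col = pd.Series([1, None, 3], dtype="object")

cast = col.fillna(0).astype("int32")  # the handler's approach
print(cast.tolist())                  # [1, 0, 3], the NULL became 0, hence the TODO
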
@@ -287,12 +307,8 @@ class PostgresHandler(DatabaseHandler):
                  response = _make_table_response(result, cur)
                  connection.commit()
          except Exception as e:
-             logger.error(f'Error running query: {query} on {self.database}, {e}!')
-             response = Response(
-                 RESPONSE_TYPE.ERROR,
-                 error_code=0,
-                 error_message=str(e)
-             )
+             logger.error(f"Error running query: {query} on {self.database}, {e}!")
+             response = Response(RESPONSE_TYPE.ERROR, error_code=0, error_message=str(e))
              connection.rollback()

          if need_to_close:
@@ -325,10 +341,7 @@ class PostgresHandler(DatabaseHandler):
                  result = cur.fetchmany(fetch_size)
                  if not result:
                      break
-                 df = DataFrame(
-                     result,
-                     columns=[x.name for x in cur.description]
-                 )
+                 df = DataFrame(result, columns=[x.name for x in cur.description])
                  self._cast_dtypes(df, cur.description)
                  yield df
              connection.commit()
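
This streaming path pulls at most `fetch_size` rows per round trip and yields one DataFrame per batch, so memory stays bounded for large results. A generic sketch of the same pattern with psycopg (DSN and query are assumed placeholders):

import pandas as pd
import psycopg

def stream_query(dsn, query, fetch_size=1000):
    # Yield DataFrames of at most fetch_size rows each.
    with psycopg.connect(dsn) as conn, conn.cursor() as cur:
        cur.execute(query)
        while True:
            rows = cur.fetchmany(fetch_size)
            if not rows:
                break
            yield pd.DataFrame(rows, columns=[c.name for c in cur.description])

# for batch in stream_query("postgresql://user:pass@localhost/demo", "SELECT * FROM big_table"):
#     handle(batch)
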
@@ -349,16 +362,10 @@ class PostgresHandler(DatabaseHandler):

          # copy requires the precise case of column names: get the current names from the table and adapt the input dataframe columns
          if resp.data_frame is not None and not resp.data_frame.empty:
-             db_columns = {
-                 c.lower(): c
-                 for c in resp.data_frame['COLUMN_NAME']
-             }
+             db_columns = {c.lower(): c for c in resp.data_frame["COLUMN_NAME"]}

              # try to get the case of the existing column
-             columns = [
-                 db_columns.get(c.lower(), c)
-                 for c in columns
-             ]
+             columns = [db_columns.get(c.lower(), c) for c in columns]

          columns = [f'"{c}"' for c in columns]
          rowcount = None
@@ -370,7 +377,7 @@ class PostgresHandler(DatabaseHandler):

              connection.commit()
          except Exception as e:
-             logger.error(f'Error running insert to {table_name} on {self.database}, {e}!')
+             logger.error(f"Error running insert to {table_name} on {self.database}, {e}!")
              connection.rollback()
              raise e
          rowcount = cur.rowcount
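
Because COPY needs column names in their exact stored case, the handler indexes the table's real column names by their lowercase form and routes incoming names through that index; names it cannot match pass through untouched. A standalone sketch:

# Column names as actually stored in the table (assumed).
db_columns = {c.lower(): c for c in ["ID", "CustomerName", "created_at"]}

incoming = ["id", "customername", "CREATED_AT", "extra_col"]
resolved = [db_columns.get(c.lower(), c) for c in incoming]
quoted = [f'"{c}"' for c in resolved]

print(quoted)  # ['"ID"', '"CustomerName"', '"created_at"', '"extra_col"']
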
@@ -402,9 +409,9 @@ class PostgresHandler(DatabaseHandler):
          Returns:
              Response: A response object containing the list of tables and views, formatted as per the `Response` class.
          """
-         all_filter = 'and table_schema = current_schema()'
+         all_filter = "and table_schema = current_schema()"
          if all is True:
-             all_filter = ''
+             all_filter = ""
          query = f"""
              SELECT
                  table_schema,
@@ -439,7 +446,7 @@ class PostgresHandler(DatabaseHandler):
          if isinstance(schema_name, str):
              schema_name = f"'{schema_name}'"
          else:
-             schema_name = 'current_schema()'
+             schema_name = "current_schema()"
          query = f"""
              SELECT
                  COLUMN_NAME,
@@ -467,33 +474,33 @@ class PostgresHandler(DatabaseHandler):

      def subscribe(self, stop_event, callback, table_name, columns=None, **kwargs):
          config = self._make_connection_args()
-         config['autocommit'] = True
+         config["autocommit"] = True

          conn = psycopg.connect(connect_timeout=10, **config)

          # create db trigger
-         trigger_name = f'mdb_notify_{table_name}'
+         trigger_name = f"mdb_notify_{table_name}"

-         before, after = '', ''
+         before, after = "", ""

          if columns:
              # check that the columns exist
-             conn.execute(f'select {",".join(columns)} from {table_name} limit 0')
+             conn.execute(f"select {','.join(columns)} from {table_name} limit 0")

              columns = set(columns)
-             trigger_name += '_' + '_'.join(columns)
+             trigger_name += "_" + "_".join(columns)

              news, olds = [], []
              for column in columns:
-                 news.append(f'NEW.{column}')
-                 olds.append(f'OLD.{column}')
+                 news.append(f"NEW.{column}")
+                 olds.append(f"OLD.{column}")

-             before = f'IF ({", ".join(news)}) IS DISTINCT FROM ({", ".join(olds)}) then\n'
-             after = '\nEND IF;'
+             before = f"IF ({', '.join(news)}) IS DISTINCT FROM ({', '.join(olds)}) then\n"
+             after = "\nEND IF;"
          else:
              columns = set()

-         func_code = f'''
+         func_code = f"""
              CREATE OR REPLACE FUNCTION {trigger_name}()
              RETURNS trigger AS $$
              DECLARE
@@ -504,16 +511,16 @@ class PostgresHandler(DatabaseHandler):
                  RETURN NEW;
              END;
              $$ LANGUAGE plpgsql;
-         '''
+         """
          conn.execute(func_code)

          # for after update - new and old have the same values
-         conn.execute(f'''
+         conn.execute(f"""
              CREATE OR REPLACE TRIGGER {trigger_name}
              BEFORE INSERT OR UPDATE ON {table_name}
              FOR EACH ROW
              EXECUTE PROCEDURE {trigger_name}();
-         ''')
+         """)
          conn.commit()

          # start listen
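
The trigger function installed above feeds PostgreSQL's NOTIFY machinery, and the listening side then consumes events on the corresponding channel. A minimal listener sketch with psycopg, assuming the trigger body (elided from this diff) calls `pg_notify` on a channel named after the trigger:

import psycopg

# Assumes a trigger already calls pg_notify('mdb_notify_mytable', ...).
conn = psycopg.connect("postgresql://user:pass@localhost/demo", autocommit=True)
conn.execute("LISTEN mdb_notify_mytable")

for notification in conn.notifies():  # blocks until a notification arrives
    print(notification.channel, notification.payload)
    break  # handle a single event in this sketch

conn.close()
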
@@ -544,8 +551,208 @@ class PostgresHandler(DatabaseHandler):
                  time.sleep(SUBSCRIBE_SLEEP_INTERVAL)

          finally:
-             conn.execute(f'drop TRIGGER {trigger_name} on {table_name}')
-             conn.execute(f'drop FUNCTION {trigger_name}')
+             conn.execute(f"drop TRIGGER {trigger_name} on {table_name}")
+             conn.execute(f"drop FUNCTION {trigger_name}")
              conn.commit()

          conn.close()
+
+     def meta_get_tables(self, table_names: Optional[list] = None) -> Response:
+         """
+         Retrieves metadata information about the tables in the PostgreSQL database to be stored in the data catalog.
+
+         Args:
+             table_names (list): A list of table names for which to retrieve metadata information.
+
+         Returns:
+             Response: A response object containing the metadata information, formatted as per the `Response` class.
+         """
+         query = """
+             SELECT
+                 t.table_name,
+                 t.table_schema,
+                 t.table_type,
+                 obj_description(pgc.oid, 'pg_class') AS table_description,
+                 pgc.reltuples AS row_count
+             FROM information_schema.tables t
+             JOIN pg_catalog.pg_class pgc ON pgc.relname = t.table_name
+             JOIN pg_catalog.pg_namespace pgn ON pgn.oid = pgc.relnamespace
+             WHERE t.table_schema = current_schema()
+             AND t.table_type in ('BASE TABLE', 'VIEW')
+             AND t.table_name NOT LIKE 'pg_%'
+             AND t.table_name NOT LIKE 'sql_%'
+         """
+
+         if table_names is not None and len(table_names) > 0:
+             table_names = [f"'{t}'" for t in table_names]
+             query += f" AND t.table_name IN ({','.join(table_names)})"
+
+         result = self.native_query(query)
+         return result
+
+     def meta_get_columns(self, table_names: Optional[list] = None) -> Response:
+         """
+         Retrieves column metadata for the specified tables (or all tables if no list is provided).
+
+         Args:
+             table_names (list): A list of table names for which to retrieve column metadata.
+
+         Returns:
+             Response: A response object containing the column metadata.
+         """
+         query = """
+             SELECT
+                 c.table_name,
+                 c.column_name,
+                 c.data_type,
+                 col_description(pgc.oid, c.ordinal_position) AS column_description,
+                 c.column_default,
+                 (c.is_nullable = 'YES') AS is_nullable
+             FROM information_schema.columns c
+             JOIN pg_catalog.pg_class pgc ON pgc.relname = c.table_name
+             JOIN pg_catalog.pg_namespace pgn ON pgn.oid = pgc.relnamespace
+             WHERE c.table_schema = current_schema()
+             AND pgc.relkind = 'r' -- Only consider regular tables (avoids indexes, sequences, etc.)
+             AND c.table_name NOT LIKE 'pg_%'
+             AND c.table_name NOT LIKE 'sql_%'
+             AND pgn.nspname = c.table_schema
+         """
+
+         if table_names is not None and len(table_names) > 0:
+             table_names = [f"'{t}'" for t in table_names]
+             query += f" AND c.table_name IN ({','.join(table_names)})"
+
+         result = self.native_query(query)
+         return result
+
+     def meta_get_column_statistics(self, table_names: Optional[list] = None) -> dict:
+         """
+         Retrieves column statistics (e.g., most common values, frequencies, null percentage, and distinct value count)
+         for the specified tables or all tables if no list is provided.
+
+         Args:
+             table_names (list): A list of table names for which to retrieve column statistics.
+
+         Returns:
+             dict: A dictionary containing the column statistics.
+         """
+         query = """
+             SELECT
+                 ps.attname AS column_name,
+                 ps.tablename AS table_name,
+                 ps.most_common_vals AS most_common_values,
+                 ps.most_common_freqs::text AS most_common_frequencies,
+                 ps.null_frac * 100 AS null_percentage,
+                 ps.n_distinct AS distinct_values_count,
+                 ps.histogram_bounds AS histogram_bounds
+             FROM pg_stats ps
+             WHERE ps.schemaname = current_schema()
+             AND ps.tablename NOT LIKE 'pg_%'
+             AND ps.tablename NOT LIKE 'sql_%'
+         """
+
+         if table_names is not None and len(table_names) > 0:
+             table_names = [f"'{t}'" for t in table_names]
+             query += f" AND ps.tablename IN ({','.join(table_names)})"
+
+         result = self.native_query(query)
+         df = result.data_frame
+
+         def parse_pg_array_string(x):
+             try:
+                 return (
+                     [item.strip(" ,") for row in csv.reader(io.StringIO(x.strip("{}"))) for item in row if item.strip()]
+                     if x
+                     else []
+                 )
+             except IndexError:
+                 logger.error(f"Error parsing PostgreSQL array string: {x}")
+                 return []
+
+         # Convert most_common_values and most_common_frequencies from string representation to lists.
+         df["most_common_values"] = df["most_common_values"].apply(lambda x: parse_pg_array_string(x))
+         df["most_common_frequencies"] = df["most_common_frequencies"].apply(lambda x: parse_pg_array_string(x))
+
+         # Get the minimum and maximum values from the histogram bounds.
+         df["minimum_value"] = df["histogram_bounds"].apply(lambda x: parse_pg_array_string(x)[0] if x else None)
+         df["maximum_value"] = df["histogram_bounds"].apply(lambda x: parse_pg_array_string(x)[-1] if x else None)
+
+         # Handle cases where distinct_values_count is negative (indicating an approximation).
+         df["distinct_values_count"] = df["distinct_values_count"].apply(lambda x: x if x >= 0 else None)
+
+         result.data_frame = df.drop(columns=["histogram_bounds"])
+
+         return result
+
+     def meta_get_primary_keys(self, table_names: Optional[list] = None) -> Response:
+         """
+         Retrieves primary key information for the specified tables (or all tables if no list is provided).
+
+         Args:
+             table_names (list): A list of table names for which to retrieve primary key information.
+
+         Returns:
+             Response: A response object containing the primary key information.
+         """
+         query = """
+             SELECT
+                 tc.table_name,
+                 kcu.column_name,
+                 kcu.ordinal_position,
+                 tc.constraint_name
+             FROM
+                 information_schema.table_constraints AS tc
+             JOIN
+                 information_schema.key_column_usage AS kcu
+             ON
+                 tc.constraint_name = kcu.constraint_name
+             WHERE
+                 tc.constraint_type = 'PRIMARY KEY'
+                 AND tc.table_schema = current_schema()
+         """
+
+         if table_names is not None and len(table_names) > 0:
+             table_names = [f"'{t}'" for t in table_names]
+             query += f" AND tc.table_name IN ({','.join(table_names)})"
+
+         result = self.native_query(query)
+         return result
+
+     def meta_get_foreign_keys(self, table_names: Optional[list] = None) -> Response:
+         """
+         Retrieves foreign key information for the specified tables (or all tables if no list is provided).
+
+         Args:
+             table_names (list): A list of table names for which to retrieve foreign key information.
+
+         Returns:
+             Response: A response object containing the foreign key information.
+         """
+         query = """
+             SELECT
+                 ccu.table_name AS parent_table_name,
+                 ccu.column_name AS parent_column_name,
+                 tc.table_name AS child_table_name,
+                 kcu.column_name AS child_column_name,
+                 tc.constraint_name
+             FROM
+                 information_schema.table_constraints AS tc
+             JOIN
+                 information_schema.key_column_usage AS kcu
+             ON
+                 tc.constraint_name = kcu.constraint_name
+             JOIN
+                 information_schema.constraint_column_usage AS ccu
+             ON
+                 ccu.constraint_name = tc.constraint_name
+             WHERE
+                 tc.constraint_type = 'FOREIGN KEY'
+                 AND tc.table_schema = current_schema()
+         """
+
+         if table_names is not None and len(table_names) > 0:
+             table_names = [f"'{t}'" for t in table_names]
+             query += f" AND tc.table_name IN ({','.join(table_names)})"
+
+         result = self.native_query(query)
+         return result
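
The `csv` and `io` imports added at the top of the file exist for `parse_pg_array_string`, which splits pg_stats array literals such as `{1,2,3}` while respecting quoted, comma-containing elements. Its behaviour on a few inputs:

import csv
import io

def parse_pg_array_string(x):
    # Same logic as the helper inside meta_get_column_statistics.
    try:
        return (
            [item.strip(" ,") for row in csv.reader(io.StringIO(x.strip("{}"))) for item in row if item.strip()]
            if x
            else []
        )
    except IndexError:
        return []

print(parse_pg_array_string("{1,2,3}"))    # ['1', '2', '3']
print(parse_pg_array_string('{"a,b",c}'))  # ['a,b', 'c']
print(parse_pg_array_string(None))         # []
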