MindsDB 25.5.4.2__py3-none-any.whl → 25.6.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (76)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/a2a/agent.py +50 -26
  3. mindsdb/api/a2a/common/server/server.py +32 -26
  4. mindsdb/api/a2a/task_manager.py +68 -6
  5. mindsdb/api/executor/command_executor.py +69 -14
  6. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +49 -65
  7. mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +91 -84
  8. mindsdb/api/executor/datahub/datanodes/project_datanode.py +29 -48
  9. mindsdb/api/executor/datahub/datanodes/system_tables.py +35 -61
  10. mindsdb/api/executor/planner/plan_join.py +67 -77
  11. mindsdb/api/executor/planner/query_planner.py +176 -155
  12. mindsdb/api/executor/planner/steps.py +37 -12
  13. mindsdb/api/executor/sql_query/result_set.py +45 -64
  14. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +14 -18
  15. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +17 -18
  16. mindsdb/api/executor/sql_query/steps/insert_step.py +13 -33
  17. mindsdb/api/executor/sql_query/steps/subselect_step.py +43 -35
  18. mindsdb/api/executor/utilities/sql.py +42 -48
  19. mindsdb/api/http/namespaces/config.py +1 -1
  20. mindsdb/api/http/namespaces/file.py +14 -23
  21. mindsdb/api/http/namespaces/knowledge_bases.py +132 -154
  22. mindsdb/api/mysql/mysql_proxy/data_types/mysql_datum.py +12 -28
  23. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/binary_resultset_row_package.py +59 -50
  24. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/resultset_row_package.py +9 -8
  25. mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +449 -461
  26. mindsdb/api/mysql/mysql_proxy/utilities/dump.py +87 -36
  27. mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +219 -28
  28. mindsdb/integrations/handlers/file_handler/file_handler.py +15 -9
  29. mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +43 -24
  30. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +10 -3
  31. mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
  32. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +29 -33
  33. mindsdb/integrations/handlers/openai_handler/openai_handler.py +277 -356
  34. mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +74 -51
  35. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +305 -98
  36. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +145 -40
  37. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +136 -6
  38. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +352 -83
  39. mindsdb/integrations/libs/api_handler.py +279 -57
  40. mindsdb/integrations/libs/base.py +185 -30
  41. mindsdb/integrations/utilities/files/file_reader.py +99 -73
  42. mindsdb/integrations/utilities/handler_utils.py +23 -8
  43. mindsdb/integrations/utilities/sql_utils.py +35 -40
  44. mindsdb/interfaces/agents/agents_controller.py +226 -196
  45. mindsdb/interfaces/agents/constants.py +8 -1
  46. mindsdb/interfaces/agents/langchain_agent.py +42 -11
  47. mindsdb/interfaces/agents/mcp_client_agent.py +29 -21
  48. mindsdb/interfaces/agents/mindsdb_database_agent.py +23 -18
  49. mindsdb/interfaces/data_catalog/__init__.py +0 -0
  50. mindsdb/interfaces/data_catalog/base_data_catalog.py +54 -0
  51. mindsdb/interfaces/data_catalog/data_catalog_loader.py +375 -0
  52. mindsdb/interfaces/data_catalog/data_catalog_reader.py +38 -0
  53. mindsdb/interfaces/database/database.py +81 -57
  54. mindsdb/interfaces/database/integrations.py +222 -234
  55. mindsdb/interfaces/database/log.py +72 -104
  56. mindsdb/interfaces/database/projects.py +156 -193
  57. mindsdb/interfaces/file/file_controller.py +21 -65
  58. mindsdb/interfaces/knowledge_base/controller.py +66 -25
  59. mindsdb/interfaces/knowledge_base/evaluate.py +516 -0
  60. mindsdb/interfaces/knowledge_base/llm_client.py +75 -0
  61. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +83 -43
  62. mindsdb/interfaces/skills/skills_controller.py +31 -36
  63. mindsdb/interfaces/skills/sql_agent.py +113 -86
  64. mindsdb/interfaces/storage/db.py +242 -82
  65. mindsdb/migrations/versions/2025-05-28_a44643042fe8_added_data_catalog_tables.py +118 -0
  66. mindsdb/migrations/versions/2025-06-09_608e376c19a7_updated_data_catalog_data_types.py +58 -0
  67. mindsdb/utilities/config.py +13 -2
  68. mindsdb/utilities/log.py +35 -26
  69. mindsdb/utilities/ml_task_queue/task.py +19 -22
  70. mindsdb/utilities/render/sqlalchemy_render.py +129 -181
  71. mindsdb/utilities/starters.py +40 -0
  72. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/METADATA +257 -257
  73. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/RECORD +76 -68
  74. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/WHEEL +0 -0
  75. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/licenses/LICENSE +0 -0
  76. {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/top_level.txt +0 -0
mindsdb/api/mysql/mysql_proxy/utilities/dump.py
@@ -1,6 +1,7 @@
-import json
+import struct
 import datetime
 from typing import Any
+from array import array

 import numpy as np
 from numpy import dtype as np_dtype
@@ -9,11 +10,19 @@ from pandas.api import types as pd_types

 from mindsdb.api.executor.sql_query.result_set import ResultSet, get_mysql_data_type_from_series, Column
 from mindsdb.api.mysql.mysql_proxy.utilities.lightwood_dtype import dtype as lightwood_dtype
-from mindsdb.api.mysql.mysql_proxy.libs.constants.mysql import MYSQL_DATA_TYPE, DATA_C_TYPE_MAP, CTypeProperties
+from mindsdb.api.mysql.mysql_proxy.libs.constants.mysql import (
+    MYSQL_DATA_TYPE,
+    DATA_C_TYPE_MAP,
+    CTypeProperties,
+    CHARSET_NUMBERS,
+)
 from mindsdb.utilities import log
+from mindsdb.utilities.json_encoder import CustomJSONEncoder

 logger = log.getLogger(__name__)

+json_encoder = CustomJSONEncoder()
+

 def column_to_mysql_column_dict(column: Column, database_name: str | None = None) -> dict[str, str | int]:
     """Convert Column object to dict with column properties.
@@ -52,9 +61,13 @@ def column_to_mysql_column_dict(column: Column, database_name: str | None = None
     # endregion

     if isinstance(column.type, MYSQL_DATA_TYPE) is False:
-        logger.warning(f'Unexpected column type: {column.type}. Use TEXT as fallback.')
+        logger.warning(f"Unexpected column type: {column.type}. Use TEXT as fallback.")
         column.type = MYSQL_DATA_TYPE.TEXT

+    charset = CHARSET_NUMBERS["utf8_unicode_ci"]
+    if column.type in (MYSQL_DATA_TYPE.JSON, MYSQL_DATA_TYPE.VECTOR):
+        charset = CHARSET_NUMBERS["binary"]
+
     type_properties: CTypeProperties = DATA_C_TYPE_MAP[column.type]

     result = {
@@ -66,6 +79,7 @@ def column_to_mysql_column_dict(column: Column, database_name: str | None = None
         "size": type_properties.size,
         "flags": type_properties.flags,
         "type": type_properties.code,
+        "charset": charset,
     }
     return result

@@ -82,7 +96,7 @@ def _dump_bool(var: Any) -> int | None:
     """
     if pd.isna(var):
         return None
-    return '1' if var else '0'
+    return "1" if var else "0"


 def _dump_str(var: Any) -> str | None:
@@ -94,18 +108,19 @@ def _dump_str(var: Any) -> str | None:
     Returns:
         str | None: The string representation of the value or None if the value is None
     """
-    if pd.isna(var):
-        return None
     if isinstance(var, bytes):
         try:
-            return var.decode('utf-8')
+            return var.decode("utf-8")
         except Exception:
             return str(var)[2:-1]
-    if isinstance(var, dict):
+    if isinstance(var, (dict, list)):
         try:
-            return json.dumps(var)
+            return json_encoder.encode(var)
         except Exception:
             return str(var)
+    if isinstance(var, list) is False and pd.isna(var):
+        # pd.isna returns array of bools for list, so we need to check if it is not a list
+        return None
     return str(var)


@@ -142,7 +157,7 @@ def _dump_date(var: datetime.date | str | None) -> str | None:
         return var
     elif pd.isna(var):
         return None
-    logger.warning(f'Unexpected value type for DATE: {type(var)}, {var}')
+    logger.warning(f"Unexpected value type for DATE: {type(var)}, {var}")
     return _dump_str(var)


@@ -157,18 +172,18 @@ def _dump_datetime(var: datetime.datetime | str | None) -> str | None:
         str | None: The string representation of the datetime value or None if the value is None
     """
     if isinstance(var, datetime.date):  # it is also datetime.datetime
-        if hasattr(var, 'tzinfo') and var.tzinfo is not None:
+        if hasattr(var, "tzinfo") and var.tzinfo is not None:
             return var.astimezone(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
         return var.strftime("%Y-%m-%d %H:%M:%S")
     elif isinstance(var, pd.Timestamp):
         if var.tzinfo is not None:
-            return var.tz_convert('UTC').strftime("%Y-%m-%d %H:%M:%S")
+            return var.tz_convert("UTC").strftime("%Y-%m-%d %H:%M:%S")
         return var.strftime("%Y-%m-%d %H:%M:%S")
     elif isinstance(var, str):
         return var
     elif pd.isna(var):
         return None
-    logger.warning(f'Unexpected value type for DATETIME: {type(var)}, {var}')
+    logger.warning(f"Unexpected value type for DATETIME: {type(var)}, {var}")
     return _dump_str(var)


@@ -198,16 +213,34 @@ def _dump_time(var: datetime.time | str | None) -> str | None:
         return var.strftime("%H:%M:%S")
     elif isinstance(var, pd.Timestamp):
         if var.tzinfo is not None:
-            return var.tz_convert('UTC').strftime("%H:%M:%S")
+            return var.tz_convert("UTC").strftime("%H:%M:%S")
         return var.strftime("%H:%M:%S")
     elif isinstance(var, str):
         return var
     elif pd.isna(var):
         return None
-    logger.warning(f'Unexpected value type for TIME: {type(var)}, {var}')
+    logger.warning(f"Unexpected value type for TIME: {type(var)}, {var}")
     return _dump_str(var)


+def _dump_vector(value: Any) -> bytes | None:
+    """Convert array or list of floats to a bytes.
+
+    Args:
+        value (Any): The value to dump
+
+    Returns:
+        bytes | None: The bytes representation of the vector value or None if the value is None
+    """
+    if isinstance(value, (array, list, np.ndarray)):
+        return b"".join([struct.pack("<f", el) for el in value])
+    elif pd.isna(value):
+        return None
+    err_msg = f"Unexpected value type for VECTOR: {type(value)}, {value}"
+    logger.error(err_msg)
+    raise ValueError(err_msg)
+
+
 def _handle_series_as_date(series: pd.Series) -> pd.Series:
     """Convert values in a series to a string representation of a date.
     NOTE: MySQL require exactly %Y-%m-%d for DATE type.
@@ -219,10 +252,10 @@ def _handle_series_as_date(series: pd.Series) -> pd.Series:
         pd.Series: The series with the date values as strings
     """
     if pd_types.is_datetime64_any_dtype(series.dtype):
-        return series.dt.strftime('%Y-%m-%d')
+        return series.dt.strftime("%Y-%m-%d")
     elif pd_types.is_object_dtype(series.dtype):
         return series.apply(_dump_date)
-    logger.info(f'Unexpected dtype: {series.dtype} for column with type DATE')
+    logger.info(f"Unexpected dtype: {series.dtype} for column with type DATE")
     return series.apply(_dump_str)


@@ -237,10 +270,10 @@ def _handle_series_as_datetime(series: pd.Series) -> pd.Series:
         pd.Series: The series with the datetime values as strings
     """
     if pd_types.is_datetime64_any_dtype(series.dtype):
-        return series.dt.strftime('%Y-%m-%d %H:%M:%S')
+        return series.dt.strftime("%Y-%m-%d %H:%M:%S")
     elif pd_types.is_object_dtype(series.dtype):
         return series.apply(_dump_datetime)
-    logger.info(f'Unexpected dtype: {series.dtype} for column with type DATETIME')
+    logger.info(f"Unexpected dtype: {series.dtype} for column with type DATETIME")
     return series.apply(_dump_str)


@@ -255,14 +288,14 @@ def _handle_series_as_time(series: pd.Series) -> pd.Series:
         pd.Series: The series with the time values as strings
     """
     if pd_types.is_timedelta64_ns_dtype(series.dtype):
-        base_time = pd.Timestamp('2000-01-01')
-        series = ((base_time + series).dt.strftime('%H:%M:%S'))
+        base_time = pd.Timestamp("2000-01-01")
+        series = (base_time + series).dt.strftime("%H:%M:%S")
     elif pd_types.is_datetime64_dtype(series.dtype):
-        series = series.dt.strftime('%H:%M:%S')
+        series = series.dt.strftime("%H:%M:%S")
     elif pd_types.is_object_dtype(series.dtype):
         series = series.apply(_dump_time)
     else:
-        logger.info(f'Unexpected dtype: {series.dtype} for column with type TIME')
+        logger.info(f"Unexpected dtype: {series.dtype} for column with type TIME")
         series = series.apply(_dump_str)
     return series

@@ -278,14 +311,29 @@ def _handle_series_as_int(series: pd.Series) -> pd.Series:
         pd.Series: The series with the int values as strings
     """
     if pd_types.is_integer_dtype(series.dtype):
-        if series.dtype == 'Int64':
+        if series.dtype == "Int64":
             # NOTE: 'apply' converts values to python floats
             return series.astype(object).apply(_dump_str)
         return series.apply(_dump_str)
     return series.apply(_dump_int_or_str)


-def dump_result_set_to_mysql(result_set: ResultSet, infer_column_size: bool = False) -> tuple[pd.DataFrame, list[dict[str, str | int]]]:
+def _handle_series_as_vector(series: pd.Series) -> pd.Series:
+    """Convert values in a series to a bytes representation of a vector.
+    NOTE: MySQL's VECTOR type require exactly 4 bytes per float.
+
+    Args:
+        series (pd.Series): The series to handle
+
+    Returns:
+        pd.Series: The series with the vector values as bytes
+    """
+    return series.apply(_dump_vector)
+
+
+def dump_result_set_to_mysql(
+    result_set: ResultSet, infer_column_size: bool = False
+) -> tuple[pd.DataFrame, list[dict[str, str | int]]]:
     """
     Dumps the ResultSet to a format that can be used to send as MySQL response packet.
     NOTE: This method modifies the original DataFrame and columns.
@@ -319,10 +367,16 @@ def dump_result_set_to_mysql(result_set: ResultSet, infer_column_size: bool = Fa
             case MYSQL_DATA_TYPE.TIME:
                 series = _handle_series_as_time(series)
             case (
-                MYSQL_DATA_TYPE.INT | MYSQL_DATA_TYPE.TINYINT | MYSQL_DATA_TYPE.SMALLINT
-                | MYSQL_DATA_TYPE.MEDIUMINT | MYSQL_DATA_TYPE.BIGINT | MYSQL_DATA_TYPE.YEAR
+                MYSQL_DATA_TYPE.INT
+                | MYSQL_DATA_TYPE.TINYINT
+                | MYSQL_DATA_TYPE.SMALLINT
+                | MYSQL_DATA_TYPE.MEDIUMINT
+                | MYSQL_DATA_TYPE.BIGINT
+                | MYSQL_DATA_TYPE.YEAR
             ):
                 series = _handle_series_as_int(series)
+            case MYSQL_DATA_TYPE.VECTOR:
+                series = _handle_series_as_vector(series)
             case _:
                 series = series.apply(_dump_str)

@@ -330,22 +384,19 @@ def dump_result_set_to_mysql(result_set: ResultSet, infer_column_size: bool = Fa
         # we may split this operation for dt and other types for optimisation
         df[i] = series.replace([np.NaN, pd.NA, pd.NaT], None)

-    columns_dicts = [
-        column_to_mysql_column_dict(column)
-        for column in result_set.columns
-    ]
+    columns_dicts = [column_to_mysql_column_dict(column) for column in result_set.columns]

-    if infer_column_size and any(column_info.get('size') is None for column_info in columns_dicts):
+    if infer_column_size and any(column_info.get("size") is None for column_info in columns_dicts):
         if len(df) == 0:
             for column_info in columns_dicts:
-                if column_info['size'] is None:
-                    column_info['size'] = 1
+                if column_info["size"] is None:
+                    column_info["size"] = 1
         else:
             sample = df.head(100)
             for i, column_info in enumerate(columns_dicts):
                 try:
-                    column_info['size'] = sample[sample.columns[i]].astype(str).str.len().max()
+                    column_info["size"] = sample[sample.columns[i]].astype(str).str.len().max()
                 except Exception:
-                    column_info['size'] = 1
+                    column_info["size"] = 1

     return df, columns_dicts
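
Note on the dump.py hunks above: VECTOR columns are now advertised with the binary charset and their values are serialized by _dump_vector as packed little-endian 4-byte floats. A minimal round-trip sketch of that packing, using only the standard library (encode_vector mirrors _dump_vector; the decode helper is purely illustrative and not part of MindsDB):

import struct
from array import array


def encode_vector(values) -> bytes:
    # Same packing as _dump_vector in the diff: 4 bytes per float, little-endian ("<f").
    return b"".join(struct.pack("<f", float(v)) for v in values)


def decode_vector(payload: bytes) -> list:
    # Illustrative inverse: unpack consecutive 4-byte chunks back into Python floats.
    return [struct.unpack("<f", payload[i:i + 4])[0] for i in range(0, len(payload), 4)]


vec = array("f", [0.25, -1.5, 3.0])
blob = encode_vector(vec)
assert len(blob) == 4 * len(vec)
assert decode_vector(blob) == [0.25, -1.5, 3.0]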

mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py
@@ -1,26 +1,28 @@
-from typing import Text, Dict, Any
+from google.cloud.bigquery import Client, QueryJobConfig
 from google.api_core.exceptions import BadRequest
+import pandas as pd
 from sqlalchemy_bigquery.base import BigQueryDialect
-from google.cloud.bigquery import Client, QueryJobConfig
+from typing import Any, Dict, Optional, Text

 from mindsdb.utilities import log
 from mindsdb_sql_parser.ast.base import ASTNode
-from mindsdb.integrations.libs.base import DatabaseHandler
+from mindsdb.integrations.libs.base import MetaDatabaseHandler
 from mindsdb.utilities.render.sqlalchemy_render import SqlalchemyRender
 from mindsdb.integrations.utilities.handlers.auth_utilities.google import GoogleServiceAccountOAuth2Manager
 from mindsdb.integrations.libs.response import (
     HandlerStatusResponse as StatusResponse,
     HandlerResponse as Response,
-    RESPONSE_TYPE
+    RESPONSE_TYPE,
 )

 logger = log.getLogger(__name__)


-class BigQueryHandler(DatabaseHandler):
+class BigQueryHandler(MetaDatabaseHandler):
     """
     This handler handles connection and execution of Google BigQuery statements.
     """
+
     name = "bigquery"

     def __init__(self, name: Text, connection_data: Dict, **kwargs: Any):
@@ -49,19 +51,16 @@ class BigQueryHandler(DatabaseHandler):
             return self.connection

         # Mandatory connection parameters
-        if not all(key in self.connection_data for key in ['project_id', 'dataset']):
-            raise ValueError('Required parameters (project_id, dataset) must be provided.')
+        if not all(key in self.connection_data for key in ["project_id", "dataset"]):
+            raise ValueError("Required parameters (project_id, dataset) must be provided.")

         google_sa_oauth2_manager = GoogleServiceAccountOAuth2Manager(
-            credentials_file=self.connection_data.get('service_account_keys'),
-            credentials_json=self.connection_data.get('service_account_json')
+            credentials_file=self.connection_data.get("service_account_keys"),
+            credentials_json=self.connection_data.get("service_account_json"),
         )
         credentials = google_sa_oauth2_manager.get_oauth2_credentials()

-        client = Client(
-            project=self.connection_data["project_id"],
-            credentials=credentials
-        )
+        client = Client(project=self.connection_data["project_id"], credentials=credentials)
         self.is_connected = True
         self.connection = client
         return self.connection
@@ -86,14 +85,14 @@ class BigQueryHandler(DatabaseHandler):

         try:
             connection = self.connect()
-            connection.query('SELECT 1;')
+            connection.query("SELECT 1;")

             # Check if the dataset exists
-            connection.get_dataset(self.connection_data['dataset'])
+            connection.get_dataset(self.connection_data["dataset"])

             response.success = True
         except (BadRequest, ValueError) as e:
-            logger.error(f'Error connecting to BigQuery {self.connection_data["project_id"]}, {e}!')
+            logger.error(f"Error connecting to BigQuery {self.connection_data['project_id']}, {e}!")
             response.error_message = e

         if response.success is False and self.is_connected is True:
@@ -113,22 +112,18 @@ class BigQueryHandler(DatabaseHandler):
         """
         connection = self.connect()
         try:
-            job_config = QueryJobConfig(default_dataset=f"{self.connection_data['project_id']}.{self.connection_data['dataset']}")
+            job_config = QueryJobConfig(
+                default_dataset=f"{self.connection_data['project_id']}.{self.connection_data['dataset']}"
+            )
             query = connection.query(query, job_config=job_config)
             result = query.to_dataframe()
             if not result.empty:
-                response = Response(
-                    RESPONSE_TYPE.TABLE,
-                    result
-                )
+                response = Response(RESPONSE_TYPE.TABLE, result)
             else:
                 response = Response(RESPONSE_TYPE.OK)
         except Exception as e:
-            logger.error(f'Error running query: {query} on {self.connection_data["project_id"]}!')
-            response = Response(
-                RESPONSE_TYPE.ERROR,
-                error_message=str(e)
-            )
+            logger.error(f"Error running query: {query} on {self.connection_data['project_id']}!")
+            response = Response(RESPONSE_TYPE.ERROR, error_message=str(e))
         return response

     def query(self, query: ASTNode) -> Response:
@@ -154,7 +149,7 @@ class BigQueryHandler(DatabaseHandler):
         """
         query = f"""
             SELECT table_name, table_schema, table_type
-            FROM `{self.connection_data['project_id']}.{self.connection_data['dataset']}.INFORMATION_SCHEMA.TABLES`
+            FROM `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.TABLES`
             WHERE table_type IN ('BASE TABLE', 'VIEW')
         """
         result = self.native_query(query)
@@ -174,8 +169,204 @@ class BigQueryHandler(DatabaseHandler):
         """
         query = f"""
             SELECT column_name AS Field, data_type as Type
-            FROM `{self.connection_data['project_id']}.{self.connection_data['dataset']}.INFORMATION_SCHEMA.COLUMNS`
+            FROM `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.COLUMNS`
             WHERE table_name = '{table_name}'
         """
         result = self.native_query(query)
         return result
+
+    def meta_get_tables(self, table_names: Optional[list] = None) -> Response:
+        """
+        Retrieves table metadata for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve metadata information.
+
+        Returns:
+            Response: A response object containing the metadata information, formatted as per the `Response` class.
+        """
+        query = f"""
+            SELECT
+                t.table_name,
+                t.table_schema,
+                t.table_type,
+                st.row_count
+            FROM
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.TABLES` AS t
+            JOIN
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.__TABLES__` AS st
+            ON
+                t.table_name = st.table_id
+            WHERE
+                t.table_type IN ('BASE TABLE', 'VIEW')
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names = [f"'{t}'" for t in table_names]
+            query += f" AND t.table_name IN ({','.join(table_names)})"
+
+        result = self.native_query(query)
+        return result
+
+    def meta_get_columns(self, table_names: Optional[list] = None) -> Response:
+        """
+        Retrieves column metadata for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve column metadata.
+
+        Returns:
+            Response: A response object containing the column metadata.
+        """
+        query = f"""
+            SELECT
+                table_name,
+                column_name,
+                data_type,
+                column_default,
+                CASE is_nullable
+                    WHEN 'YES' THEN TRUE
+                    ELSE FALSE
+                END AS is_nullable
+            FROM
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.COLUMNS`
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names = [f"'{t}'" for t in table_names]
+            query += f" WHERE table_name IN ({','.join(table_names)})"
+
+        result = self.native_query(query)
+        return result
+
+    def meta_get_column_statistics_for_table(self, table_name: str, columns: list) -> Response:
+        """
+        Retrieves statistics for the specified columns in a table.
+
+        Args:
+            table_name (str): The name of the table.
+            columns (list): A list of column names to retrieve statistics for.
+
+        Returns:
+            Response: A response object containing the column statistics.
+        """
+        # To avoid hitting BigQuery's query size limits, we will chunk the columns into batches.
+        # This is because the queries are combined using UNION ALL, which can lead to very large queries if there are many columns.
+        BATCH_SIZE = 20
+
+        def chunked(lst, n):
+            """
+            Yields successive n-sized chunks from lst.
+            """
+            for i in range(0, len(lst), n):
+                yield lst[i : i + n]
+
+        queries = []
+        for column_batch in chunked(columns, BATCH_SIZE):
+            batch_queries = []
+            for column in column_batch:
+                batch_queries.append(
+                    f"""
+                    SELECT
+                        '{table_name}' AS table_name,
+                        '{column}' AS column_name,
+                        SAFE_DIVIDE(COUNTIF({column} IS NULL), COUNT(*)) * 100 AS null_percentage,
+                        CAST(MIN(`{column}`) AS STRING) AS minimum_value,
+                        CAST(MAX(`{column}`) AS STRING) AS maximum_value,
+                        COUNT(DISTINCT {column}) AS distinct_values_count
+                    FROM
+                        `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.{table_name}`
+                    """
+                )
+
+            query = " UNION ALL ".join(batch_queries)
+            queries.append(query)
+
+        results = []
+        for query in queries:
+            try:
+                result = self.native_query(query)
+                if result.resp_type == RESPONSE_TYPE.TABLE:
+                    results.append(result.data_frame)
+                else:
+                    logger.error(f"Error retrieving column statistics for table {table_name}: {result.error_message}")
+            except Exception as e:
+                logger.error(f"Exception occurred while retrieving column statistics for table {table_name}: {e}")
+
+        if not results:
+            logger.warning(f"No column statistics could be retrieved for table {table_name}.")
+            return Response(
+                RESPONSE_TYPE.ERROR, error_message=f"No column statistics could be retrieved for table {table_name}."
+            )
+        return Response(RESPONSE_TYPE.TABLE, pd.concat(results, ignore_index=True) if results else pd.DataFrame())
+
+    def meta_get_primary_keys(self, table_names: Optional[list] = None) -> Response:
+        """
+        Retrieves primary key information for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve primary key information.
+
+        Returns:
+            Response: A response object containing the primary key information.
+        """
+        query = f"""
+            SELECT
+                tc.table_name,
+                kcu.column_name,
+                kcu.ordinal_position,
+                tc.constraint_name,
+            FROM
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.TABLE_CONSTRAINTS` AS tc
+            JOIN
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.KEY_COLUMN_USAGE` AS kcu
+            ON
+                tc.constraint_name = kcu.constraint_name
+            WHERE
+                tc.constraint_type = 'PRIMARY KEY'
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names = [f"'{t}'" for t in table_names]
+            query += f" AND tc.table_name IN ({','.join(table_names)})"
+
+        result = self.native_query(query)
+        return result
+
+    def meta_get_foreign_keys(self, table_names: Optional[list] = None) -> Response:
+        """
+        Retrieves foreign key information for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve foreign key information.
+
+        Returns:
+            Response: A response object containing the foreign key information.
+        """
+        query = f"""
+            SELECT
+                ccu.table_name AS parent_table_name,
+                ccu.column_name AS parent_column_name,
+                kcu.table_name AS child_table_name,
+                kcu.column_name AS child_column_name,
+                tc.constraint_name
+            FROM
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.TABLE_CONSTRAINTS` AS tc
+            JOIN
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.KEY_COLUMN_USAGE` AS kcu
+            ON
+                tc.constraint_name = kcu.constraint_name
+            JOIN
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE` AS ccu
+            ON
+                tc.constraint_name = ccu.constraint_name
+            WHERE
+                tc.constraint_type = 'FOREIGN KEY'
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names = [f"'{t}'" for t in table_names]
+            query += f" AND tc.table_name IN ({','.join(table_names)})"
+
+        result = self.native_query(query)
+        return result
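
Note on the new BigQuery meta_* methods: meta_get_column_statistics_for_table builds one statistics sub-select per column and joins them with UNION ALL, chunking the columns into batches of 20 so a wide table does not produce an oversized query string. A standalone sketch of just that query-assembly step (the project, dataset, and table names below are placeholders; the real handler sends each assembled string through native_query):

BATCH_SIZE = 20


def chunked(items, n):
    # Yield successive n-sized chunks, mirroring the helper in the handler.
    for i in range(0, len(items), n):
        yield items[i:i + n]


def build_stats_queries(project, dataset, table, columns):
    # One UNION ALL query per batch of columns; each sub-select computes min/max for one column.
    queries = []
    for batch in chunked(columns, BATCH_SIZE):
        parts = [
            f"SELECT '{table}' AS table_name, '{col}' AS column_name, "
            f"CAST(MIN(`{col}`) AS STRING) AS minimum_value, "
            f"CAST(MAX(`{col}`) AS STRING) AS maximum_value "
            f"FROM `{project}.{dataset}.{table}`"
            for col in batch
        ]
        queries.append(" UNION ALL ".join(parts))
    return queries


# 45 columns -> 3 batched queries (20 + 20 + 5 sub-selects).
queries = build_stats_queries("my-project", "my_dataset", "orders", [f"col_{i}" for i in range(45)])
assert len(queries) == 3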

mindsdb/integrations/handlers/file_handler/file_handler.py
@@ -75,10 +75,7 @@ class FileHandler(DatabaseHandler):
     def query(self, query: ASTNode) -> Response:
         if type(query) is DropTables:
             for table_identifier in query.tables:
-                if (
-                    len(table_identifier.parts) == 2
-                    and table_identifier.parts[0] != self.name
-                ):
+                if len(table_identifier.parts) == 2 and table_identifier.parts[0] != self.name:
                     return Response(
                         RESPONSE_TYPE.ERROR,
                         error_message=f"Can't delete table from database '{table_identifier.parts[0]}'",
@@ -136,9 +133,20 @@ class FileHandler(DatabaseHandler):
             return Response(RESPONSE_TYPE.OK)

         elif isinstance(query, Select):
-            table_name, page_name = self._get_table_page_names(query.from_table)
+            if isinstance(query.from_table, Select):
+                # partitioning mode
+                sub_result = self.query(query.from_table)
+                if sub_result.error_message is not None:
+                    raise RuntimeError(sub_result.error_message)

-            df = self.file_controller.get_file_data(table_name, page_name)
+                df = sub_result.data_frame
+                query.from_table = Identifier("t")
+            elif isinstance(query.from_table, Identifier):
+                table_name, page_name = self._get_table_page_names(query.from_table)
+
+                df = self.file_controller.get_file_data(table_name, page_name)
+            else:
+                raise RuntimeError(f"Not supported query target: {query}")

             # Process the SELECT query
             result_df = query_df(df, query)
@@ -191,9 +199,7 @@ class FileHandler(DatabaseHandler):
                 data_frame=pd.DataFrame(
                     [
                         {
-                            "Field": x["name"].strip()
-                            if isinstance(x, dict)
-                            else x.strip(),
+                            "Field": x["name"].strip() if isinstance(x, dict) else x.strip(),
                             "Type": "str",
                         }
                         for x in file_meta["columns"]
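
Note on the FileHandler hunks: SELECT handling now dispatches on the type of query.from_table. A nested Select is evaluated recursively and its DataFrame becomes the outer query's source (re-pointed at the placeholder identifier "t"), a plain Identifier keeps the original file_controller path, and any other target raises RuntimeError. A toy sketch of that dispatch pattern, independent of MindsDB's AST and Response classes:

import pandas as pd


def resolve_source(from_table, tables):
    # Nested query: assume it has already been evaluated to a DataFrame.
    if isinstance(from_table, pd.DataFrame):
        return from_table
    # Plain identifier: look the table up by name, like the file_controller path.
    if isinstance(from_table, str):
        return tables[from_table]
    raise RuntimeError(f"Not supported query target: {from_table!r}")


tables = {"my_file": pd.DataFrame({"a": [1, 2, 3]})}
inner = resolve_source("my_file", tables)   # inner SELECT reads the file table
outer = resolve_source(inner, tables)       # outer SELECT runs over the inner result
assert outer.equals(tables["my_file"])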