MindsDB 25.5.4.2__py3-none-any.whl → 25.6.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this release of MindsDB has been flagged as potentially problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/api/a2a/agent.py +50 -26
- mindsdb/api/a2a/common/server/server.py +32 -26
- mindsdb/api/a2a/task_manager.py +68 -6
- mindsdb/api/executor/command_executor.py +69 -14
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +49 -65
- mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +91 -84
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +29 -48
- mindsdb/api/executor/datahub/datanodes/system_tables.py +35 -61
- mindsdb/api/executor/planner/plan_join.py +67 -77
- mindsdb/api/executor/planner/query_planner.py +176 -155
- mindsdb/api/executor/planner/steps.py +37 -12
- mindsdb/api/executor/sql_query/result_set.py +45 -64
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +14 -18
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +17 -18
- mindsdb/api/executor/sql_query/steps/insert_step.py +13 -33
- mindsdb/api/executor/sql_query/steps/subselect_step.py +43 -35
- mindsdb/api/executor/utilities/sql.py +42 -48
- mindsdb/api/http/namespaces/config.py +1 -1
- mindsdb/api/http/namespaces/file.py +14 -23
- mindsdb/api/http/namespaces/knowledge_bases.py +132 -154
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_datum.py +12 -28
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/binary_resultset_row_package.py +59 -50
- mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/resultset_row_package.py +9 -8
- mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +449 -461
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +87 -36
- mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +219 -28
- mindsdb/integrations/handlers/file_handler/file_handler.py +15 -9
- mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +43 -24
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +10 -3
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +29 -33
- mindsdb/integrations/handlers/openai_handler/openai_handler.py +277 -356
- mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +74 -51
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +305 -98
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +145 -40
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +136 -6
- mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +352 -83
- mindsdb/integrations/libs/api_handler.py +279 -57
- mindsdb/integrations/libs/base.py +185 -30
- mindsdb/integrations/utilities/files/file_reader.py +99 -73
- mindsdb/integrations/utilities/handler_utils.py +23 -8
- mindsdb/integrations/utilities/sql_utils.py +35 -40
- mindsdb/interfaces/agents/agents_controller.py +226 -196
- mindsdb/interfaces/agents/constants.py +8 -1
- mindsdb/interfaces/agents/langchain_agent.py +42 -11
- mindsdb/interfaces/agents/mcp_client_agent.py +29 -21
- mindsdb/interfaces/agents/mindsdb_database_agent.py +23 -18
- mindsdb/interfaces/data_catalog/__init__.py +0 -0
- mindsdb/interfaces/data_catalog/base_data_catalog.py +54 -0
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +375 -0
- mindsdb/interfaces/data_catalog/data_catalog_reader.py +38 -0
- mindsdb/interfaces/database/database.py +81 -57
- mindsdb/interfaces/database/integrations.py +222 -234
- mindsdb/interfaces/database/log.py +72 -104
- mindsdb/interfaces/database/projects.py +156 -193
- mindsdb/interfaces/file/file_controller.py +21 -65
- mindsdb/interfaces/knowledge_base/controller.py +66 -25
- mindsdb/interfaces/knowledge_base/evaluate.py +516 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +75 -0
- mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +83 -43
- mindsdb/interfaces/skills/skills_controller.py +31 -36
- mindsdb/interfaces/skills/sql_agent.py +113 -86
- mindsdb/interfaces/storage/db.py +242 -82
- mindsdb/migrations/versions/2025-05-28_a44643042fe8_added_data_catalog_tables.py +118 -0
- mindsdb/migrations/versions/2025-06-09_608e376c19a7_updated_data_catalog_data_types.py +58 -0
- mindsdb/utilities/config.py +13 -2
- mindsdb/utilities/log.py +35 -26
- mindsdb/utilities/ml_task_queue/task.py +19 -22
- mindsdb/utilities/render/sqlalchemy_render.py +129 -181
- mindsdb/utilities/starters.py +40 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/METADATA +257 -257
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/RECORD +76 -68
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.5.4.2.dist-info → mindsdb-25.6.3.0.dist-info}/top_level.txt +0 -0
mindsdb/api/mysql/mysql_proxy/utilities/dump.py

@@ -1,6 +1,7 @@
-import …
+import struct
 import datetime
 from typing import Any
+from array import array

 import numpy as np
 from numpy import dtype as np_dtype
@@ -9,11 +10,19 @@ from pandas.api import types as pd_types

 from mindsdb.api.executor.sql_query.result_set import ResultSet, get_mysql_data_type_from_series, Column
 from mindsdb.api.mysql.mysql_proxy.utilities.lightwood_dtype import dtype as lightwood_dtype
-from mindsdb.api.mysql.mysql_proxy.libs.constants.mysql import …
+from mindsdb.api.mysql.mysql_proxy.libs.constants.mysql import (
+    MYSQL_DATA_TYPE,
+    DATA_C_TYPE_MAP,
+    CTypeProperties,
+    CHARSET_NUMBERS,
+)
 from mindsdb.utilities import log
+from mindsdb.utilities.json_encoder import CustomJSONEncoder

 logger = log.getLogger(__name__)

+json_encoder = CustomJSONEncoder()
+
 def column_to_mysql_column_dict(column: Column, database_name: str | None = None) -> dict[str, str | int]:
     """Convert Column object to dict with column properties.
@@ -52,9 +61,13 @@ def column_to_mysql_column_dict(column: Column, database_name: str | None = None
     # endregion

     if isinstance(column.type, MYSQL_DATA_TYPE) is False:
-        logger.warning(f…
+        logger.warning(f"Unexpected column type: {column.type}. Use TEXT as fallback.")
         column.type = MYSQL_DATA_TYPE.TEXT

+    charset = CHARSET_NUMBERS["utf8_unicode_ci"]
+    if column.type in (MYSQL_DATA_TYPE.JSON, MYSQL_DATA_TYPE.VECTOR):
+        charset = CHARSET_NUMBERS["binary"]
+
     type_properties: CTypeProperties = DATA_C_TYPE_MAP[column.type]

     result = {
@@ -66,6 +79,7 @@ def column_to_mysql_column_dict(column: Column, database_name: str | None = None
         "size": type_properties.size,
         "flags": type_properties.flags,
         "type": type_properties.code,
+        "charset": charset,
     }
     return result

@@ -82,7 +96,7 @@ def _dump_bool(var: Any) -> int | None:
     """
     if pd.isna(var):
         return None
-    return …
+    return "1" if var else "0"


 def _dump_str(var: Any) -> str | None:
@@ -94,18 +108,19 @@ def _dump_str(var: Any) -> str | None:
     Returns:
         str | None: The string representation of the value or None if the value is None
     """
-    if pd.isna(var):
-        return None
     if isinstance(var, bytes):
         try:
-            return var.decode(…
+            return var.decode("utf-8")
         except Exception:
             return str(var)[2:-1]
-    if isinstance(var, dict):
+    if isinstance(var, (dict, list)):
         try:
-            return …
+            return json_encoder.encode(var)
         except Exception:
             return str(var)
+    if isinstance(var, list) is False and pd.isna(var):
+        # pd.isna returns array of bools for list, so we need to check if it is not a list
+        return None
     return str(var)


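For illustration, a minimal sketch of what the new dict/list branch of _dump_str does: structured values are serialized to a JSON string before being written to the wire. The standard json module stands in here for MindsDB's CustomJSONEncoder.

import json
from typing import Any

import pandas as pd


def dump_str_sketch(var: Any) -> str | None:
    """Approximate the new _dump_str behaviour for dict/list values."""
    if isinstance(var, (dict, list)):
        # Structured values go out as JSON text instead of str(repr).
        return json.dumps(var)
    if pd.isna(var):
        return None
    return str(var)


print(dump_str_sketch({"a": 1}))   # {"a": 1}
print(dump_str_sketch([1, 2, 3]))  # [1, 2, 3]
print(dump_str_sketch(None))       # None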
@@ -142,7 +157,7 @@ def _dump_date(var: datetime.date | str | None) -> str | None:
         return var
     elif pd.isna(var):
         return None
-    logger.warning(f…
+    logger.warning(f"Unexpected value type for DATE: {type(var)}, {var}")
     return _dump_str(var)


@@ -157,18 +172,18 @@ def _dump_datetime(var: datetime.datetime | str | None) -> str | None:
         str | None: The string representation of the datetime value or None if the value is None
     """
     if isinstance(var, datetime.date): # it is also datetime.datetime
-        if hasattr(var, …
+        if hasattr(var, "tzinfo") and var.tzinfo is not None:
             return var.astimezone(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
         return var.strftime("%Y-%m-%d %H:%M:%S")
     elif isinstance(var, pd.Timestamp):
         if var.tzinfo is not None:
-            return var.tz_convert(…
+            return var.tz_convert("UTC").strftime("%Y-%m-%d %H:%M:%S")
         return var.strftime("%Y-%m-%d %H:%M:%S")
     elif isinstance(var, str):
         return var
     elif pd.isna(var):
         return None
-    logger.warning(f…
+    logger.warning(f"Unexpected value type for DATETIME: {type(var)}, {var}")
     return _dump_str(var)


@@ -198,16 +213,34 @@ def _dump_time(var: datetime.time | str | None) -> str | None:
         return var.strftime("%H:%M:%S")
     elif isinstance(var, pd.Timestamp):
         if var.tzinfo is not None:
-            return var.tz_convert(…
+            return var.tz_convert("UTC").strftime("%H:%M:%S")
         return var.strftime("%H:%M:%S")
     elif isinstance(var, str):
         return var
     elif pd.isna(var):
         return None
-    logger.warning(f…
+    logger.warning(f"Unexpected value type for TIME: {type(var)}, {var}")
     return _dump_str(var)


+def _dump_vector(value: Any) -> bytes | None:
+    """Convert array or list of floats to a bytes.
+
+    Args:
+        value (Any): The value to dump
+
+    Returns:
+        bytes | None: The bytes representation of the vector value or None if the value is None
+    """
+    if isinstance(value, (array, list, np.ndarray)):
+        return b"".join([struct.pack("<f", el) for el in value])
+    elif pd.isna(value):
+        return None
+    err_msg = f"Unexpected value type for VECTOR: {type(value)}, {value}"
+    logger.error(err_msg)
+    raise ValueError(err_msg)
+
+
 def _handle_series_as_date(series: pd.Series) -> pd.Series:
     """Convert values in a series to a string representation of a date.
     NOTE: MySQL require exactly %Y-%m-%d for DATE type.
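A quick sketch of the wire format the new _dump_vector helper produces: each float is packed as a 4-byte little-endian IEEE-754 value and the results are concatenated. The unpacking side below is only for illustration of the round trip.

import struct


def pack_vector(values):
    # 4 bytes per float, little-endian, concatenated in order.
    return b"".join(struct.pack("<f", v) for v in values)


def unpack_vector(blob):
    # Inverse operation, shown here only to verify the packing.
    return [v for (v,) in struct.iter_unpack("<f", blob)]


blob = pack_vector([1.0, 2.5, -3.0])
print(len(blob))            # 12 bytes: 3 floats * 4 bytes
print(unpack_vector(blob))  # [1.0, 2.5, -3.0]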
@@ -219,10 +252,10 @@ def _handle_series_as_date(series: pd.Series) -> pd.Series:
         pd.Series: The series with the date values as strings
     """
     if pd_types.is_datetime64_any_dtype(series.dtype):
-        return series.dt.strftime(…
+        return series.dt.strftime("%Y-%m-%d")
     elif pd_types.is_object_dtype(series.dtype):
         return series.apply(_dump_date)
-    logger.info(f…
+    logger.info(f"Unexpected dtype: {series.dtype} for column with type DATE")
     return series.apply(_dump_str)


@@ -237,10 +270,10 @@ def _handle_series_as_datetime(series: pd.Series) -> pd.Series:
         pd.Series: The series with the datetime values as strings
     """
     if pd_types.is_datetime64_any_dtype(series.dtype):
-        return series.dt.strftime(…
+        return series.dt.strftime("%Y-%m-%d %H:%M:%S")
     elif pd_types.is_object_dtype(series.dtype):
         return series.apply(_dump_datetime)
-    logger.info(f…
+    logger.info(f"Unexpected dtype: {series.dtype} for column with type DATETIME")
     return series.apply(_dump_str)


@@ -255,14 +288,14 @@ def _handle_series_as_time(series: pd.Series) -> pd.Series:
         pd.Series: The series with the time values as strings
     """
     if pd_types.is_timedelta64_ns_dtype(series.dtype):
-        base_time = pd.Timestamp(…
-        series = (…
+        base_time = pd.Timestamp("2000-01-01")
+        series = (base_time + series).dt.strftime("%H:%M:%S")
     elif pd_types.is_datetime64_dtype(series.dtype):
-        series = series.dt.strftime(…
+        series = series.dt.strftime("%H:%M:%S")
     elif pd_types.is_object_dtype(series.dtype):
         series = series.apply(_dump_time)
     else:
-        logger.info(f…
+        logger.info(f"Unexpected dtype: {series.dtype} for column with type TIME")
         series = series.apply(_dump_str)
     return series

@@ -278,14 +311,29 @@ def _handle_series_as_int(series: pd.Series) -> pd.Series:
         pd.Series: The series with the int values as strings
     """
     if pd_types.is_integer_dtype(series.dtype):
-        if series.dtype == …
+        if series.dtype == "Int64":
            # NOTE: 'apply' converts values to python floats
            return series.astype(object).apply(_dump_str)
         return series.apply(_dump_str)
     return series.apply(_dump_int_or_str)


-def …
+def _handle_series_as_vector(series: pd.Series) -> pd.Series:
+    """Convert values in a series to a bytes representation of a vector.
+    NOTE: MySQL's VECTOR type require exactly 4 bytes per float.
+
+    Args:
+        series (pd.Series): The series to handle
+
+    Returns:
+        pd.Series: The series with the vector values as bytes
+    """
+    return series.apply(_dump_vector)
+
+
+def dump_result_set_to_mysql(
+    result_set: ResultSet, infer_column_size: bool = False
+) -> tuple[pd.DataFrame, list[dict[str, str | int]]]:
     """
     Dumps the ResultSet to a format that can be used to send as MySQL response packet.
     NOTE: This method modifies the original DataFrame and columns.
@@ -319,10 +367,16 @@ def dump_result_set_to_mysql(result_set: ResultSet, infer_column_size: bool = Fa
             case MYSQL_DATA_TYPE.TIME:
                 series = _handle_series_as_time(series)
             case (
-                MYSQL_DATA_TYPE.INT…
-                | MYSQL_DATA_TYPE.…
+                MYSQL_DATA_TYPE.INT
+                | MYSQL_DATA_TYPE.TINYINT
+                | MYSQL_DATA_TYPE.SMALLINT
+                | MYSQL_DATA_TYPE.MEDIUMINT
+                | MYSQL_DATA_TYPE.BIGINT
+                | MYSQL_DATA_TYPE.YEAR
             ):
                 series = _handle_series_as_int(series)
+            case MYSQL_DATA_TYPE.VECTOR:
+                series = _handle_series_as_vector(series)
             case _:
                 series = series.apply(_dump_str)

@@ -330,22 +384,19 @@ def dump_result_set_to_mysql(result_set: ResultSet, infer_column_size: bool = Fa
         # we may split this operation for dt and other types for optimisation
         df[i] = series.replace([np.NaN, pd.NA, pd.NaT], None)

-    columns_dicts = [
-        column_to_mysql_column_dict(column)
-        for column in result_set.columns
-    ]
+    columns_dicts = [column_to_mysql_column_dict(column) for column in result_set.columns]

-    if infer_column_size and any(column_info.get(…
+    if infer_column_size and any(column_info.get("size") is None for column_info in columns_dicts):
         if len(df) == 0:
             for column_info in columns_dicts:
-                if column_info[…
-                    column_info[…
+                if column_info["size"] is None:
+                    column_info["size"] = 1
         else:
             sample = df.head(100)
             for i, column_info in enumerate(columns_dicts):
                 try:
-                    column_info[…
+                    column_info["size"] = sample[sample.columns[i]].astype(str).str.len().max()
                 except Exception:
-                    column_info[…
+                    column_info["size"] = 1

     return df, columns_dicts
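To illustrate the size-inference path above: when a column's display size is unknown, the new code samples up to the first 100 rows and takes the longest string representation, falling back to 1 on any error. A minimal pandas sketch with an illustrative DataFrame:

import pandas as pd

df = pd.DataFrame({"name": ["a", "abcd", "ab"], "score": [1, 123456, 12]})

sample = df.head(100)
sizes = []
for col in sample.columns:
    try:
        # Longest string representation seen in the sample.
        sizes.append(int(sample[col].astype(str).str.len().max()))
    except Exception:
        sizes.append(1)  # fallback used by the handler when inference fails

print(sizes)  # [4, 6]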
mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py

@@ -1,26 +1,28 @@
-from …
+from google.cloud.bigquery import Client, QueryJobConfig
 from google.api_core.exceptions import BadRequest
+import pandas as pd
 from sqlalchemy_bigquery.base import BigQueryDialect
-from …
+from typing import Any, Dict, Optional, Text

 from mindsdb.utilities import log
 from mindsdb_sql_parser.ast.base import ASTNode
-from mindsdb.integrations.libs.base import …
+from mindsdb.integrations.libs.base import MetaDatabaseHandler
 from mindsdb.utilities.render.sqlalchemy_render import SqlalchemyRender
 from mindsdb.integrations.utilities.handlers.auth_utilities.google import GoogleServiceAccountOAuth2Manager
 from mindsdb.integrations.libs.response import (
     HandlerStatusResponse as StatusResponse,
     HandlerResponse as Response,
-    RESPONSE_TYPE
+    RESPONSE_TYPE,
 )

 logger = log.getLogger(__name__)


-class BigQueryHandler(…
+class BigQueryHandler(MetaDatabaseHandler):
     """
     This handler handles connection and execution of Google BigQuery statements.
     """
+
     name = "bigquery"

     def __init__(self, name: Text, connection_data: Dict, **kwargs: Any):
@@ -49,19 +51,16 @@ class BigQueryHandler(DatabaseHandler):
             return self.connection

         # Mandatory connection parameters
-        if not all(key in self.connection_data for key in […
-            raise ValueError(…
+        if not all(key in self.connection_data for key in ["project_id", "dataset"]):
+            raise ValueError("Required parameters (project_id, dataset) must be provided.")

         google_sa_oauth2_manager = GoogleServiceAccountOAuth2Manager(
-            credentials_file=self.connection_data.get(…
-            credentials_json=self.connection_data.get(…
+            credentials_file=self.connection_data.get("service_account_keys"),
+            credentials_json=self.connection_data.get("service_account_json"),
         )
         credentials = google_sa_oauth2_manager.get_oauth2_credentials()

-        client = Client(
-            project=self.connection_data["project_id"],
-            credentials=credentials
-        )
+        client = Client(project=self.connection_data["project_id"], credentials=credentials)
         self.is_connected = True
         self.connection = client
         return self.connection
@@ -86,14 +85,14 @@ class BigQueryHandler(DatabaseHandler):

         try:
             connection = self.connect()
-            connection.query(…
+            connection.query("SELECT 1;")

             # Check if the dataset exists
-            connection.get_dataset(self.connection_data[…
+            connection.get_dataset(self.connection_data["dataset"])

             response.success = True
         except (BadRequest, ValueError) as e:
-            logger.error(f…
+            logger.error(f"Error connecting to BigQuery {self.connection_data['project_id']}, {e}!")
             response.error_message = e

         if response.success is False and self.is_connected is True:
@@ -113,22 +112,18 @@ class BigQueryHandler(DatabaseHandler):
         """
         connection = self.connect()
         try:
-            job_config = QueryJobConfig(…
+            job_config = QueryJobConfig(
+                default_dataset=f"{self.connection_data['project_id']}.{self.connection_data['dataset']}"
+            )
             query = connection.query(query, job_config=job_config)
             result = query.to_dataframe()
             if not result.empty:
-                response = Response(
-                    RESPONSE_TYPE.TABLE,
-                    result
-                )
+                response = Response(RESPONSE_TYPE.TABLE, result)
             else:
                 response = Response(RESPONSE_TYPE.OK)
         except Exception as e:
-            logger.error(f…
-            response = Response(
-                RESPONSE_TYPE.ERROR,
-                error_message=str(e)
-            )
+            logger.error(f"Error running query: {query} on {self.connection_data['project_id']}!")
+            response = Response(RESPONSE_TYPE.ERROR, error_message=str(e))
         return response

     def query(self, query: ASTNode) -> Response:
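The native_query change above pins every job to a default dataset so unqualified table names resolve correctly. A minimal usage sketch with the google-cloud-bigquery client; the project, dataset, and table names are placeholders, and in the handler these come from connection_data:

from google.cloud.bigquery import Client, QueryJobConfig

# Placeholder identifiers; credentials are resolved from the environment here.
client = Client(project="my-project")
job_config = QueryJobConfig(default_dataset="my-project.my_dataset")

# The unqualified table name "orders" resolves against the default dataset.
job = client.query("SELECT COUNT(*) AS n FROM orders", job_config=job_config)
print(job.to_dataframe())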
@@ -154,7 +149,7 @@ class BigQueryHandler(DatabaseHandler):
         """
         query = f"""
             SELECT table_name, table_schema, table_type
-            FROM `{self.connection_data[…
+            FROM `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.TABLES`
             WHERE table_type IN ('BASE TABLE', 'VIEW')
         """
         result = self.native_query(query)
@@ -174,8 +169,204 @@ class BigQueryHandler(DatabaseHandler):
         """
         query = f"""
             SELECT column_name AS Field, data_type as Type
-            FROM `{self.connection_data[…
+            FROM `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.COLUMNS`
             WHERE table_name = '{table_name}'
         """
         result = self.native_query(query)
         return result
+
+    def meta_get_tables(self, table_names: Optional[list] = None) -> Response:
+        """
+        Retrieves table metadata for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve metadata information.
+
+        Returns:
+            Response: A response object containing the metadata information, formatted as per the `Response` class.
+        """
+        query = f"""
+            SELECT
+                t.table_name,
+                t.table_schema,
+                t.table_type,
+                st.row_count
+            FROM
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.TABLES` AS t
+            JOIN
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.__TABLES__` AS st
+            ON
+                t.table_name = st.table_id
+            WHERE
+                t.table_type IN ('BASE TABLE', 'VIEW')
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names = [f"'{t}'" for t in table_names]
+            query += f" AND t.table_name IN ({','.join(table_names)})"
+
+        result = self.native_query(query)
+        return result
+
+    def meta_get_columns(self, table_names: Optional[list] = None) -> Response:
+        """
+        Retrieves column metadata for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve column metadata.
+
+        Returns:
+            Response: A response object containing the column metadata.
+        """
+        query = f"""
+            SELECT
+                table_name,
+                column_name,
+                data_type,
+                column_default,
+                CASE is_nullable
+                    WHEN 'YES' THEN TRUE
+                    ELSE FALSE
+                END AS is_nullable
+            FROM
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.COLUMNS`
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names = [f"'{t}'" for t in table_names]
+            query += f" WHERE table_name IN ({','.join(table_names)})"
+
+        result = self.native_query(query)
+        return result
+
+    def meta_get_column_statistics_for_table(self, table_name: str, columns: list) -> Response:
+        """
+        Retrieves statistics for the specified columns in a table.
+
+        Args:
+            table_name (str): The name of the table.
+            columns (list): A list of column names to retrieve statistics for.
+
+        Returns:
+            Response: A response object containing the column statistics.
+        """
+        # To avoid hitting BigQuery's query size limits, we will chunk the columns into batches.
+        # This is because the queries are combined using UNION ALL, which can lead to very large queries if there are many columns.
+        BATCH_SIZE = 20
+
+        def chunked(lst, n):
+            """
+            Yields successive n-sized chunks from lst.
+            """
+            for i in range(0, len(lst), n):
+                yield lst[i : i + n]
+
+        queries = []
+        for column_batch in chunked(columns, BATCH_SIZE):
+            batch_queries = []
+            for column in column_batch:
+                batch_queries.append(
+                    f"""
+                    SELECT
+                        '{table_name}' AS table_name,
+                        '{column}' AS column_name,
+                        SAFE_DIVIDE(COUNTIF({column} IS NULL), COUNT(*)) * 100 AS null_percentage,
+                        CAST(MIN(`{column}`) AS STRING) AS minimum_value,
+                        CAST(MAX(`{column}`) AS STRING) AS maximum_value,
+                        COUNT(DISTINCT {column}) AS distinct_values_count
+                    FROM
+                        `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.{table_name}`
+                    """
+                )
+
+            query = " UNION ALL ".join(batch_queries)
+            queries.append(query)
+
+        results = []
+        for query in queries:
+            try:
+                result = self.native_query(query)
+                if result.resp_type == RESPONSE_TYPE.TABLE:
+                    results.append(result.data_frame)
+                else:
+                    logger.error(f"Error retrieving column statistics for table {table_name}: {result.error_message}")
+            except Exception as e:
+                logger.error(f"Exception occurred while retrieving column statistics for table {table_name}: {e}")
+
+        if not results:
+            logger.warning(f"No column statistics could be retrieved for table {table_name}.")
+            return Response(
+                RESPONSE_TYPE.ERROR, error_message=f"No column statistics could be retrieved for table {table_name}."
+            )
+        return Response(RESPONSE_TYPE.TABLE, pd.concat(results, ignore_index=True) if results else pd.DataFrame())
+
+    def meta_get_primary_keys(self, table_names: Optional[list] = None) -> Response:
+        """
+        Retrieves primary key information for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve primary key information.
+
+        Returns:
+            Response: A response object containing the primary key information.
+        """
+        query = f"""
+            SELECT
+                tc.table_name,
+                kcu.column_name,
+                kcu.ordinal_position,
+                tc.constraint_name,
+            FROM
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.TABLE_CONSTRAINTS` AS tc
+            JOIN
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.KEY_COLUMN_USAGE` AS kcu
+            ON
+                tc.constraint_name = kcu.constraint_name
+            WHERE
+                tc.constraint_type = 'PRIMARY KEY'
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names = [f"'{t}'" for t in table_names]
+            query += f" AND tc.table_name IN ({','.join(table_names)})"
+
+        result = self.native_query(query)
+        return result
+
+    def meta_get_foreign_keys(self, table_names: Optional[list] = None) -> Response:
+        """
+        Retrieves foreign key information for the specified tables (or all tables if no list is provided).
+
+        Args:
+            table_names (list): A list of table names for which to retrieve foreign key information.
+
+        Returns:
+            Response: A response object containing the foreign key information.
+        """
+        query = f"""
+            SELECT
+                ccu.table_name AS parent_table_name,
+                ccu.column_name AS parent_column_name,
+                kcu.table_name AS child_table_name,
+                kcu.column_name AS child_column_name,
+                tc.constraint_name
+            FROM
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.TABLE_CONSTRAINTS` AS tc
+            JOIN
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.KEY_COLUMN_USAGE` AS kcu
+            ON
+                tc.constraint_name = kcu.constraint_name
+            JOIN
+                `{self.connection_data["project_id"]}.{self.connection_data["dataset"]}.INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE` AS ccu
+            ON
+                tc.constraint_name = ccu.constraint_name
+            WHERE
+                tc.constraint_type = 'FOREIGN KEY'
+        """
+
+        if table_names is not None and len(table_names) > 0:
+            table_names = [f"'{t}'" for t in table_names]
+            query += f" AND tc.table_name IN ({','.join(table_names)})"
+
+        result = self.native_query(query)
+        return result
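The column-statistics method above builds one SELECT per column and joins them with UNION ALL, batching columns in groups of 20 so each query stays under BigQuery's size limits. A small standalone sketch of that batching step; the table and column names are illustrative:

def chunked(lst, n):
    # Yield successive n-sized chunks from lst.
    for i in range(0, len(lst), n):
        yield lst[i : i + n]


columns = [f"col_{i}" for i in range(45)]
BATCH_SIZE = 20

queries = []
for batch in chunked(columns, BATCH_SIZE):
    parts = [
        f"SELECT '{c}' AS column_name, COUNT(DISTINCT {c}) AS n FROM `proj.ds.t`"
        for c in batch
    ]
    queries.append(" UNION ALL ".join(parts))

print(len(queries))  # 3 batches: 20 + 20 + 5 columns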
mindsdb/integrations/handlers/file_handler/file_handler.py

@@ -75,10 +75,7 @@ class FileHandler(DatabaseHandler):
     def query(self, query: ASTNode) -> Response:
         if type(query) is DropTables:
             for table_identifier in query.tables:
-                if (
-                    len(table_identifier.parts) == 2
-                    and table_identifier.parts[0] != self.name
-                ):
+                if len(table_identifier.parts) == 2 and table_identifier.parts[0] != self.name:
                     return Response(
                         RESPONSE_TYPE.ERROR,
                         error_message=f"Can't delete table from database '{table_identifier.parts[0]}'",
@@ -136,9 +133,20 @@ class FileHandler(DatabaseHandler):
             return Response(RESPONSE_TYPE.OK)

         elif isinstance(query, Select):
-…
+            if isinstance(query.from_table, Select):
+                # partitioning mode
+                sub_result = self.query(query.from_table)
+                if sub_result.error_message is not None:
+                    raise RuntimeError(sub_result.error_message)

-…
+                df = sub_result.data_frame
+                query.from_table = Identifier("t")
+            elif isinstance(query.from_table, Identifier):
+                table_name, page_name = self._get_table_page_names(query.from_table)
+
+                df = self.file_controller.get_file_data(table_name, page_name)
+            else:
+                raise RuntimeError(f"Not supported query target: {query}")

             # Process the SELECT query
             result_df = query_df(df, query)
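Conceptually, the new partitioning branch above evaluates the inner SELECT first and then applies the outer SELECT to the intermediate result, which the handler exposes under the alias "t". A plain-pandas sketch of that two-stage evaluation; the real handler delegates both stages to MindsDB's query_df, and the column names here are illustrative:

import pandas as pd

# Stage 1: the "inner query" result, analogous to self.query(query.from_table).
inner = pd.DataFrame({"city": ["a", "b", "a"], "amount": [10, 20, 30]})

# Stage 2: the outer SELECT runs against the intermediate frame rather than a file.
outer = inner.groupby("city", as_index=False)["amount"].sum()
print(outer)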
@@ -191,9 +199,7 @@ class FileHandler(DatabaseHandler):
                 data_frame=pd.DataFrame(
                     [
                         {
-                            "Field": x["name"].strip()
-                            if isinstance(x, dict)
-                            else x.strip(),
+                            "Field": x["name"].strip() if isinstance(x, dict) else x.strip(),
                             "Type": "str",
                         }
                         for x in file_meta["columns"]