MindsDB 25.5.4.1__py3-none-any.whl → 25.6.2.0__py3-none-any.whl

This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between versions as they appear in the public registry.

Potentially problematic release.

This version of MindsDB might be problematic.

Files changed (70)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/api/a2a/agent.py +28 -25
  3. mindsdb/api/a2a/common/server/server.py +32 -26
  4. mindsdb/api/a2a/run_a2a.py +1 -1
  5. mindsdb/api/executor/command_executor.py +69 -14
  6. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +49 -65
  7. mindsdb/api/executor/datahub/datanodes/project_datanode.py +29 -48
  8. mindsdb/api/executor/datahub/datanodes/system_tables.py +35 -61
  9. mindsdb/api/executor/planner/plan_join.py +67 -77
  10. mindsdb/api/executor/planner/query_planner.py +176 -155
  11. mindsdb/api/executor/planner/steps.py +37 -12
  12. mindsdb/api/executor/sql_query/result_set.py +45 -64
  13. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +14 -18
  14. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +17 -18
  15. mindsdb/api/executor/sql_query/steps/insert_step.py +13 -33
  16. mindsdb/api/executor/sql_query/steps/subselect_step.py +43 -35
  17. mindsdb/api/executor/utilities/sql.py +42 -48
  18. mindsdb/api/http/namespaces/config.py +1 -1
  19. mindsdb/api/http/namespaces/file.py +14 -23
  20. mindsdb/api/mysql/mysql_proxy/data_types/mysql_datum.py +12 -28
  21. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/binary_resultset_row_package.py +59 -50
  22. mindsdb/api/mysql/mysql_proxy/data_types/mysql_packets/resultset_row_package.py +9 -8
  23. mindsdb/api/mysql/mysql_proxy/libs/constants/mysql.py +449 -461
  24. mindsdb/api/mysql/mysql_proxy/utilities/dump.py +87 -36
  25. mindsdb/integrations/handlers/file_handler/file_handler.py +15 -9
  26. mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +43 -24
  27. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +10 -3
  28. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +26 -33
  29. mindsdb/integrations/handlers/oracle_handler/oracle_handler.py +74 -51
  30. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +305 -98
  31. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +53 -34
  32. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +136 -6
  33. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +334 -83
  34. mindsdb/integrations/libs/api_handler.py +261 -57
  35. mindsdb/integrations/libs/base.py +100 -29
  36. mindsdb/integrations/utilities/files/file_reader.py +99 -73
  37. mindsdb/integrations/utilities/handler_utils.py +23 -8
  38. mindsdb/integrations/utilities/sql_utils.py +35 -40
  39. mindsdb/interfaces/agents/agents_controller.py +196 -192
  40. mindsdb/interfaces/agents/constants.py +7 -1
  41. mindsdb/interfaces/agents/langchain_agent.py +42 -11
  42. mindsdb/interfaces/agents/mcp_client_agent.py +29 -21
  43. mindsdb/interfaces/data_catalog/__init__.py +0 -0
  44. mindsdb/interfaces/data_catalog/base_data_catalog.py +54 -0
  45. mindsdb/interfaces/data_catalog/data_catalog_loader.py +359 -0
  46. mindsdb/interfaces/data_catalog/data_catalog_reader.py +34 -0
  47. mindsdb/interfaces/database/database.py +81 -57
  48. mindsdb/interfaces/database/integrations.py +220 -234
  49. mindsdb/interfaces/database/log.py +72 -104
  50. mindsdb/interfaces/database/projects.py +156 -193
  51. mindsdb/interfaces/file/file_controller.py +21 -65
  52. mindsdb/interfaces/knowledge_base/controller.py +63 -10
  53. mindsdb/interfaces/knowledge_base/evaluate.py +519 -0
  54. mindsdb/interfaces/knowledge_base/llm_client.py +75 -0
  55. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +83 -43
  56. mindsdb/interfaces/skills/skills_controller.py +54 -36
  57. mindsdb/interfaces/skills/sql_agent.py +109 -86
  58. mindsdb/interfaces/storage/db.py +223 -79
  59. mindsdb/migrations/versions/2025-05-28_a44643042fe8_added_data_catalog_tables.py +118 -0
  60. mindsdb/migrations/versions/2025-06-09_608e376c19a7_updated_data_catalog_data_types.py +58 -0
  61. mindsdb/utilities/config.py +9 -2
  62. mindsdb/utilities/log.py +35 -26
  63. mindsdb/utilities/ml_task_queue/task.py +19 -22
  64. mindsdb/utilities/render/sqlalchemy_render.py +129 -181
  65. mindsdb/utilities/starters.py +49 -1
  66. {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/METADATA +268 -268
  67. {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/RECORD +70 -62
  68. {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/WHEEL +0 -0
  69. {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/licenses/LICENSE +0 -0
  70. {mindsdb-25.5.4.1.dist-info → mindsdb-25.6.2.0.dist-info}/top_level.txt +0 -0

mindsdb/api/mysql/mysql_proxy/utilities/dump.py

@@ -1,6 +1,7 @@
-import json
+import struct
 import datetime
 from typing import Any
+from array import array
 
 import numpy as np
 from numpy import dtype as np_dtype
@@ -9,11 +10,19 @@ from pandas.api import types as pd_types
 
 from mindsdb.api.executor.sql_query.result_set import ResultSet, get_mysql_data_type_from_series, Column
 from mindsdb.api.mysql.mysql_proxy.utilities.lightwood_dtype import dtype as lightwood_dtype
-from mindsdb.api.mysql.mysql_proxy.libs.constants.mysql import MYSQL_DATA_TYPE, DATA_C_TYPE_MAP, CTypeProperties
+from mindsdb.api.mysql.mysql_proxy.libs.constants.mysql import (
+    MYSQL_DATA_TYPE,
+    DATA_C_TYPE_MAP,
+    CTypeProperties,
+    CHARSET_NUMBERS,
+)
 from mindsdb.utilities import log
+from mindsdb.utilities.json_encoder import CustomJSONEncoder
 
 logger = log.getLogger(__name__)
 
+json_encoder = CustomJSONEncoder()
+
 
 def column_to_mysql_column_dict(column: Column, database_name: str | None = None) -> dict[str, str | int]:
     """Convert Column object to dict with column properties.
@@ -52,9 +61,13 @@ def column_to_mysql_column_dict(column: Column, database_name: str | None = None
     # endregion
 
     if isinstance(column.type, MYSQL_DATA_TYPE) is False:
-        logger.warning(f'Unexpected column type: {column.type}. Use TEXT as fallback.')
+        logger.warning(f"Unexpected column type: {column.type}. Use TEXT as fallback.")
         column.type = MYSQL_DATA_TYPE.TEXT
 
+    charset = CHARSET_NUMBERS["utf8_unicode_ci"]
+    if column.type in (MYSQL_DATA_TYPE.JSON, MYSQL_DATA_TYPE.VECTOR):
+        charset = CHARSET_NUMBERS["binary"]
+
     type_properties: CTypeProperties = DATA_C_TYPE_MAP[column.type]
 
     result = {
@@ -66,6 +79,7 @@ def column_to_mysql_column_dict(column: Column, database_name: str | None = None
         "size": type_properties.size,
         "flags": type_properties.flags,
         "type": type_properties.code,
+        "charset": charset,
     }
     return result
 
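
Note: the new charset field matters because JSON and VECTOR cells are sent as raw bytes; advertising the binary pseudo-charset tells MySQL clients not to decode them as text. A minimal sketch of the selection logic, with protocol ids assumed from the standard MySQL collation table (binary is 63, utf8_unicode_ci is 192):

    CHARSET_NUMBERS = {"utf8_unicode_ci": 192, "binary": 63}  # excerpt, assumed values

    def pick_charset(type_name: str) -> int:
        # raw-byte payloads must not be decoded as text by the client
        if type_name in ("JSON", "VECTOR"):
            return CHARSET_NUMBERS["binary"]
        return CHARSET_NUMBERS["utf8_unicode_ci"]

    assert pick_charset("VECTOR") == 63
    assert pick_charset("TEXT") == 192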
@@ -82,7 +96,7 @@ def _dump_bool(var: Any) -> int | None:
     """
     if pd.isna(var):
         return None
-    return '1' if var else '0'
+    return "1" if var else "0"
 
 
 def _dump_str(var: Any) -> str | None:
@@ -94,18 +108,19 @@ def _dump_str(var: Any) -> str | None:
     Returns:
         str | None: The string representation of the value or None if the value is None
     """
-    if pd.isna(var):
-        return None
     if isinstance(var, bytes):
         try:
-            return var.decode('utf-8')
+            return var.decode("utf-8")
         except Exception:
             return str(var)[2:-1]
-    if isinstance(var, dict):
+    if isinstance(var, (dict, list)):
         try:
-            return json.dumps(var)
+            return json_encoder.encode(var)
         except Exception:
             return str(var)
+    if isinstance(var, list) is False and pd.isna(var):
+        # pd.isna returns array of bools for list, so we need to check if it is not a list
+        return None
     return str(var)
 
 
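
Note: the reordered checks in _dump_str work around a pandas pitfall: pd.isna on a list returns an elementwise boolean array, which raises "truth value of an array is ambiguous" in a bare if. A quick demonstration:

    import pandas as pd

    print(pd.isna(None))    # True
    print(pd.isna([1, 2]))  # [False False] -- an array, not a scalar bool
    # hence lists are serialized to JSON first, and the pd.isna() null check
    # runs only for non-list values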
@@ -142,7 +157,7 @@ def _dump_date(var: datetime.date | str | None) -> str | None:
         return var
     elif pd.isna(var):
         return None
-    logger.warning(f'Unexpected value type for DATE: {type(var)}, {var}')
+    logger.warning(f"Unexpected value type for DATE: {type(var)}, {var}")
     return _dump_str(var)
 
 
@@ -157,18 +172,18 @@ def _dump_datetime(var: datetime.datetime | str | None) -> str | None:
         str | None: The string representation of the datetime value or None if the value is None
     """
     if isinstance(var, datetime.date):  # it is also datetime.datetime
-        if hasattr(var, 'tzinfo') and var.tzinfo is not None:
+        if hasattr(var, "tzinfo") and var.tzinfo is not None:
             return var.astimezone(datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")
         return var.strftime("%Y-%m-%d %H:%M:%S")
     elif isinstance(var, pd.Timestamp):
         if var.tzinfo is not None:
-            return var.tz_convert('UTC').strftime("%Y-%m-%d %H:%M:%S")
+            return var.tz_convert("UTC").strftime("%Y-%m-%d %H:%M:%S")
         return var.strftime("%Y-%m-%d %H:%M:%S")
     elif isinstance(var, str):
         return var
     elif pd.isna(var):
         return None
-    logger.warning(f'Unexpected value type for DATETIME: {type(var)}, {var}')
+    logger.warning(f"Unexpected value type for DATETIME: {type(var)}, {var}")
     return _dump_str(var)
 
 
@@ -198,16 +213,34 @@ def _dump_time(var: datetime.time | str | None) -> str | None:
         return var.strftime("%H:%M:%S")
     elif isinstance(var, pd.Timestamp):
         if var.tzinfo is not None:
-            return var.tz_convert('UTC').strftime("%H:%M:%S")
+            return var.tz_convert("UTC").strftime("%H:%M:%S")
         return var.strftime("%H:%M:%S")
     elif isinstance(var, str):
         return var
     elif pd.isna(var):
         return None
-    logger.warning(f'Unexpected value type for TIME: {type(var)}, {var}')
+    logger.warning(f"Unexpected value type for TIME: {type(var)}, {var}")
     return _dump_str(var)
 
 
+def _dump_vector(value: Any) -> bytes | None:
+    """Convert array or list of floats to a bytes.
+
+    Args:
+        value (Any): The value to dump
+
+    Returns:
+        bytes | None: The bytes representation of the vector value or None if the value is None
+    """
+    if isinstance(value, (array, list, np.ndarray)):
+        return b"".join([struct.pack("<f", el) for el in value])
+    elif pd.isna(value):
+        return None
+    err_msg = f"Unexpected value type for VECTOR: {type(value)}, {value}"
+    logger.error(err_msg)
+    raise ValueError(err_msg)
+
+
 def _handle_series_as_date(series: pd.Series) -> pd.Series:
     """Convert values in a series to a string representation of a date.
     NOTE: MySQL require exactly %Y-%m-%d for DATE type.
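
Note: _dump_vector packs each element as a little-endian float32, i.e. exactly 4 bytes per value, which is the layout MySQL's VECTOR type expects. A stdlib-only round-trip sketch:

    import struct
    from array import array

    vec = array("f", [0.25, -1.5, 3.0])
    payload = b"".join(struct.pack("<f", el) for el in vec)
    assert len(payload) == 4 * len(vec)  # 4 bytes per float32
    decoded = [struct.unpack_from("<f", payload, i * 4)[0] for i in range(len(vec))]
    assert decoded == [0.25, -1.5, 3.0]  # exact: these values are representable in float32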
@@ -219,10 +252,10 @@ def _handle_series_as_date(series: pd.Series) -> pd.Series:
         pd.Series: The series with the date values as strings
     """
     if pd_types.is_datetime64_any_dtype(series.dtype):
-        return series.dt.strftime('%Y-%m-%d')
+        return series.dt.strftime("%Y-%m-%d")
     elif pd_types.is_object_dtype(series.dtype):
         return series.apply(_dump_date)
-    logger.info(f'Unexpected dtype: {series.dtype} for column with type DATE')
+    logger.info(f"Unexpected dtype: {series.dtype} for column with type DATE")
     return series.apply(_dump_str)
 
 
@@ -237,10 +270,10 @@ def _handle_series_as_datetime(series: pd.Series) -> pd.Series:
         pd.Series: The series with the datetime values as strings
     """
     if pd_types.is_datetime64_any_dtype(series.dtype):
-        return series.dt.strftime('%Y-%m-%d %H:%M:%S')
+        return series.dt.strftime("%Y-%m-%d %H:%M:%S")
     elif pd_types.is_object_dtype(series.dtype):
         return series.apply(_dump_datetime)
-    logger.info(f'Unexpected dtype: {series.dtype} for column with type DATETIME')
+    logger.info(f"Unexpected dtype: {series.dtype} for column with type DATETIME")
     return series.apply(_dump_str)
 
 
@@ -255,14 +288,14 @@ def _handle_series_as_time(series: pd.Series) -> pd.Series:
         pd.Series: The series with the time values as strings
     """
     if pd_types.is_timedelta64_ns_dtype(series.dtype):
-        base_time = pd.Timestamp('2000-01-01')
-        series = ((base_time + series).dt.strftime('%H:%M:%S'))
+        base_time = pd.Timestamp("2000-01-01")
+        series = (base_time + series).dt.strftime("%H:%M:%S")
     elif pd_types.is_datetime64_dtype(series.dtype):
-        series = series.dt.strftime('%H:%M:%S')
+        series = series.dt.strftime("%H:%M:%S")
     elif pd_types.is_object_dtype(series.dtype):
         series = series.apply(_dump_time)
     else:
-        logger.info(f'Unexpected dtype: {series.dtype} for column with type TIME')
+        logger.info(f"Unexpected dtype: {series.dtype} for column with type TIME")
         series = series.apply(_dump_str)
     return series
 
@@ -278,14 +311,29 @@ def _handle_series_as_int(series: pd.Series) -> pd.Series:
         pd.Series: The series with the int values as strings
     """
     if pd_types.is_integer_dtype(series.dtype):
-        if series.dtype == 'Int64':
+        if series.dtype == "Int64":
             # NOTE: 'apply' converts values to python floats
             return series.astype(object).apply(_dump_str)
         return series.apply(_dump_str)
     return series.apply(_dump_int_or_str)
 
 
-def dump_result_set_to_mysql(result_set: ResultSet, infer_column_size: bool = False) -> tuple[pd.DataFrame, list[dict[str, str | int]]]:
+def _handle_series_as_vector(series: pd.Series) -> pd.Series:
+    """Convert values in a series to a bytes representation of a vector.
+    NOTE: MySQL's VECTOR type require exactly 4 bytes per float.
+
+    Args:
+        series (pd.Series): The series to handle
+
+    Returns:
+        pd.Series: The series with the vector values as bytes
+    """
+    return series.apply(_dump_vector)
+
+
+def dump_result_set_to_mysql(
+    result_set: ResultSet, infer_column_size: bool = False
+) -> tuple[pd.DataFrame, list[dict[str, str | int]]]:
     """
     Dumps the ResultSet to a format that can be used to send as MySQL response packet.
     NOTE: This method modifies the original DataFrame and columns.
@@ -319,10 +367,16 @@ dump_result_set_to_mysql(result_set: ResultSet, infer_column_size: bool = Fa
             case MYSQL_DATA_TYPE.TIME:
                 series = _handle_series_as_time(series)
             case (
-                MYSQL_DATA_TYPE.INT | MYSQL_DATA_TYPE.TINYINT | MYSQL_DATA_TYPE.SMALLINT
-                | MYSQL_DATA_TYPE.MEDIUMINT | MYSQL_DATA_TYPE.BIGINT | MYSQL_DATA_TYPE.YEAR
+                MYSQL_DATA_TYPE.INT
+                | MYSQL_DATA_TYPE.TINYINT
+                | MYSQL_DATA_TYPE.SMALLINT
+                | MYSQL_DATA_TYPE.MEDIUMINT
+                | MYSQL_DATA_TYPE.BIGINT
+                | MYSQL_DATA_TYPE.YEAR
             ):
                 series = _handle_series_as_int(series)
+            case MYSQL_DATA_TYPE.VECTOR:
+                series = _handle_series_as_vector(series)
             case _:
                 series = series.apply(_dump_str)
 
@@ -330,22 +384,19 @@ dump_result_set_to_mysql(result_set: ResultSet, infer_column_size: bool = Fa
         # we may split this operation for dt and other types for optimisation
         df[i] = series.replace([np.NaN, pd.NA, pd.NaT], None)
 
-    columns_dicts = [
-        column_to_mysql_column_dict(column)
-        for column in result_set.columns
-    ]
+    columns_dicts = [column_to_mysql_column_dict(column) for column in result_set.columns]
 
-    if infer_column_size and any(column_info.get('size') is None for column_info in columns_dicts):
+    if infer_column_size and any(column_info.get("size") is None for column_info in columns_dicts):
         if len(df) == 0:
             for column_info in columns_dicts:
-                if column_info['size'] is None:
-                    column_info['size'] = 1
+                if column_info["size"] is None:
+                    column_info["size"] = 1
         else:
             sample = df.head(100)
             for i, column_info in enumerate(columns_dicts):
                 try:
-                    column_info['size'] = sample[sample.columns[i]].astype(str).str.len().max()
+                    column_info["size"] = sample[sample.columns[i]].astype(str).str.len().max()
                 except Exception:
-                    column_info['size'] = 1
+                    column_info["size"] = 1
 
     return df, columns_dicts
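
Note: when infer_column_size is set and a column has no static size, the display size is estimated as the maximum stringified length over a 100-row sample, with 1 as the fallback. A sketch of the pandas idiom:

    import pandas as pd

    sample = pd.DataFrame({"a": ["x", "hello", None]}).head(100)
    print(sample["a"].astype(str).str.len().max())  # 5
    # caveat: astype(str) renders None as "None", so NULL cells count as length 4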

mindsdb/integrations/handlers/file_handler/file_handler.py

@@ -75,10 +75,7 @@ class FileHandler(DatabaseHandler):
     def query(self, query: ASTNode) -> Response:
         if type(query) is DropTables:
             for table_identifier in query.tables:
-                if (
-                    len(table_identifier.parts) == 2
-                    and table_identifier.parts[0] != self.name
-                ):
+                if len(table_identifier.parts) == 2 and table_identifier.parts[0] != self.name:
                     return Response(
                         RESPONSE_TYPE.ERROR,
                         error_message=f"Can't delete table from database '{table_identifier.parts[0]}'",
@@ -136,9 +133,20 @@
             return Response(RESPONSE_TYPE.OK)
 
         elif isinstance(query, Select):
-            table_name, page_name = self._get_table_page_names(query.from_table)
+            if isinstance(query.from_table, Select):
+                # partitioning mode
+                sub_result = self.query(query.from_table)
+                if sub_result.error_message is not None:
+                    raise RuntimeError(sub_result.error_message)
 
-            df = self.file_controller.get_file_data(table_name, page_name)
+                df = sub_result.data_frame
+                query.from_table = Identifier("t")
+            elif isinstance(query.from_table, Identifier):
+                table_name, page_name = self._get_table_page_names(query.from_table)
+
+                df = self.file_controller.get_file_data(table_name, page_name)
+            else:
+                raise RuntimeError(f"Not supported query target: {query}")
 
             # Process the SELECT query
             result_df = query_df(df, query)
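
Note: the Select branch now also accepts a nested Select as the FROM target (used for partitioned execution): the inner query runs first and its frame is re-queried under the alias t. A hedged sketch, assuming mindsdb_sql_parser's parse_sql and an uploaded file table named my_csv:

    from mindsdb_sql_parser import parse_sql  # assumed import path

    query = parse_sql("SELECT * FROM (SELECT a, b FROM my_csv) AS t WHERE a > 1")
    # file_handler.query(query) now recurses into query.from_table instead of
    # failing on a non-Identifier FROM target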
@@ -191,9 +199,7 @@
             data_frame=pd.DataFrame(
                 [
                     {
-                        "Field": x["name"].strip()
-                        if isinstance(x, dict)
-                        else x.strip(),
+                        "Field": x["name"].strip() if isinstance(x, dict) else x.strip(),
                         "Type": "str",
                     }
                     for x in file_meta["columns"]

mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py

@@ -7,12 +7,22 @@ from pathlib import Path
 import pandas
 import pytest
 from mindsdb_sql_parser.exceptions import ParsingException
-from mindsdb_sql_parser.ast import CreateTable, DropTables, Identifier, Insert, TableColumn, Update
+from mindsdb_sql_parser.ast import (
+    CreateTable,
+    DropTables,
+    Identifier,
+    Insert,
+    TableColumn,
+    Update,
+)
 
 from mindsdb.integrations.handlers.file_handler.file_handler import FileHandler
 from mindsdb.integrations.libs.response import RESPONSE_TYPE
 
-from mindsdb.integrations.utilities.files.file_reader import FileReader
+from mindsdb.integrations.utilities.files.file_reader import (
+    FileReader,
+    FileProcessingError,
+)
 
 
 # Define a table to use as content for all of the file types
@@ -103,21 +113,18 @@ class TestIsItX:
 
     def test_is_it_csv(self):
         # We can't test xlsx or parquet here because they're binary files
-        for file_path, result in (
-            (csv_file(), True),
-            (json_file(), False)
-        ):
+        for file_path, result in ((csv_file(), True), (json_file(), False)):
             with open(file_path, "r") as fh:
                 assert FileReader.is_csv(StringIO(fh.read())) is result
 
     def test_format(self):
         for file_path, result in (
-            (csv_file(), 'csv'),
-            (xlsx_file(), 'xlsx'),
-            (json_file(), 'json'),
-            (parquet_file(), 'parquet'),
-            (txt_file(), 'txt'),
-            (pdf_file(), 'pdf'),
+            (csv_file(), "csv"),
+            (xlsx_file(), "xlsx"),
+            (json_file(), "json"),
+            (parquet_file(), "parquet"),
+            (txt_file(), "txt"),
+            (pdf_file(), "pdf"),
         ):
             assert FileReader(path=file_path).get_format() == result
 
@@ -182,6 +189,7 @@ class TestQuery:
 
         def mock_get_file_path(self, name):
             return csv_tmp
+
         monkeypatch.setattr(MockFileController, "get_file_path", mock_get_file_path)
 
         file_handler = FileHandler(file_controller=MockFileController())
@@ -255,14 +263,13 @@
 
 
 def test_handle_source():
-
     def get_reader(file_path):
         # using path
         reader = FileReader(path=file_path)
         yield reader
 
         # using file descriptor
-        with open(file_path, 'rb') as fd:
+        with open(file_path, "rb") as fd:
            reader = FileReader(file=fd)
            yield reader
            fd.seek(0)
@@ -310,14 +317,31 @@ def test_check_valid_dialects(csv_string, delimiter):
 def test_tsv():
     file = BytesIO(b"example;csv;file\tname")
 
-    reader = FileReader(file=file, name='test.tsv')
-    assert reader.get_format() == 'csv'
-    assert reader.parameters['delimiter'] == '\t'
+    reader = FileReader(file=file, name="test.tsv")
+    assert reader.get_format() == "csv"
+    assert reader.parameters["delimiter"] == "\t"
 
     df = reader.get_page_content()
     assert len(df.columns) == 2
 
 
+def test_bad_csv_header():
+    file = BytesIO(b" a,b ,c\n1,2,3\n")
+    reader = FileReader(file=file, name="test.tsv")
+    df = reader.get_page_content()
+    assert set(df.columns) == set(["a", "b", "c"])
+
+    wrong_data = [
+        b"a, ,c\n1,2,3\n",
+        b"a, \t,c\n1,2,3\n",
+        b" ,b,c\n1,2,3\n",
+    ]
+    for data in wrong_data:
+        reader = FileReader(file=BytesIO(data), name="test.tsv")
+        with pytest.raises(FileProcessingError):
+            df = reader.get_page_content()
+
+
 def test_check_invalid_dialects():
     with pytest.raises(Exception):
         FileHandler._get_csv_dialect("example csv file")
@@ -334,10 +358,7 @@ def test_get_tables():
     assert response.type == RESPONSE_TYPE.TABLE
 
     expected_df = pandas.DataFrame(
-        [
-            {"TABLE_NAME": x[0], "TABLE_ROWS": x[1], "TABLE_TYPE": "BASE TABLE"}
-            for x in file_records
-        ]
+        [{"TABLE_NAME": x[0], "TABLE_ROWS": x[1], "TABLE_TYPE": "BASE TABLE"} for x in file_records]
     )
 
     assert response.data_frame.equals(expected_df)
@@ -349,8 +370,6 @@ def test_get_columns():
 
     assert response.type == RESPONSE_TYPE.TABLE
 
-    expected_df = pandas.DataFrame(
-        [{"Field": x, "Type": "str"} for x in file_records[0][2]]
-    )
+    expected_df = pandas.DataFrame([{"Field": x, "Type": "str"} for x in file_records[0][2]])
 
     assert response.data_frame.equals(expected_df)

mindsdb/integrations/handlers/litellm_handler/litellm_handler.py

@@ -2,7 +2,7 @@ import ast
 from typing import Dict, Optional, List
 
 
-from litellm import completion, batch_completion, embedding
+from litellm import completion, batch_completion, embedding, acompletion
 import pandas as pd
 
 from mindsdb.integrations.libs.base import BaseMLEngine
@@ -42,10 +42,17 @@ class LiteLLMHandler(BaseMLEngine):
                 f"https://{args['snowflake_account_id']}.snowflakecomputing.com/api/v2/cortex/inference:complete"
             )
 
-        from litellm import acompletion
-
         return await acompletion(model=model, messages=messages, stream=False, **args)
 
+    @staticmethod
+    def completion(model: str, messages: List[dict], args: dict):
+        if model.startswith("snowflake/") and "snowflake_account_id" in args:
+            args["api_base"] = (
+                f"https://{args['snowflake_account_id']}.snowflakecomputing.com/api/v2/cortex/inference:complete"
+            )
+
+        return completion(model=model, messages=messages, stream=False, **args)
+
     def create(
         self,
         target: str,
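
Note: the new synchronous completion staticmethod mirrors acompletion, including the Snowflake Cortex api_base rewrite. An illustrative call (model id and args are placeholders, not from the diff):

    messages = [{"role": "user", "content": "hello"}]
    resp = LiteLLMHandler.completion(
        model="openai/gpt-4o-mini",  # hypothetical model id
        messages=messages,
        args={"api_key": "..."},  # forwarded to litellm.completion as **args
    )
    print(resp.choices[0].message.content)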

mindsdb/integrations/handlers/mysql_handler/mysql_handler.py

@@ -31,9 +31,7 @@ def _map_type(mysql_type_text: str) -> MYSQL_DATA_TYPE:
     try:
         return MYSQL_DATA_TYPE(mysql_type_text.upper())
     except Exception:
-        logger.warning(
-            f"MySQL handler: unknown type: {mysql_type_text}, use TEXT as fallback."
-        )
+        logger.warning(f"MySQL handler: unknown type: {mysql_type_text}, use TEXT as fallback.")
         return MYSQL_DATA_TYPE.TEXT
 
 
@@ -65,22 +63,23 @@ def _make_table_response(result: list[dict], cursor: mysql.connector.cursor.MySQ
             mysql_types.append(reverse_c_type_map[type_int])
             continue
 
-        if type_int != C_TYPES.MYSQL_TYPE_BLOB:
-            raise ValueError(f'Unknown MySQL type id={type_int} in column {col[0]}')
-
-        # region determine text/blob type by flags
-        # Unfortunately, there is no way to determine particular type of text/blob column by flags.
-        # Subtype have to be determined by 8-s element of description tuple, but mysql.conector
-        # return the same value for all text types (TINYTEXT, TEXT, MEDIUMTEXT, LONGTEXT), and for
-        # all blob types (TINYBLOB, BLOB, MEDIUMBLOB, LONGBLOB).
-        if col[7] == 16:  # and col[8] == 45
-            mysql_types.append(MYSQL_DATA_TYPE.TEXT)
-        elif col[7] == 144:  # and col[8] == 63
-            mysql_types.append(MYSQL_DATA_TYPE.BLOB)
+        if type_int == C_TYPES.MYSQL_TYPE_BLOB:
+            # region determine text/blob type by flags
+            # Unfortunately, there is no way to determine particular type of text/blob column by flags.
+            # Subtype have to be determined by 8-s element of description tuple, but mysql.conector
+            # return the same value for all text types (TINYTEXT, TEXT, MEDIUMTEXT, LONGTEXT), and for
+            # all blob types (TINYBLOB, BLOB, MEDIUMBLOB, LONGBLOB).
+            if col[7] == 16:  # and col[8] == 45
+                mysql_types.append(MYSQL_DATA_TYPE.TEXT)
+            elif col[7] == 144:  # and col[8] == 63
+                mysql_types.append(MYSQL_DATA_TYPE.BLOB)
+            else:
+                logger.debug(f"MySQL handler: unknown type code {col[7]}, use TEXT as fallback.")
+                mysql_types.append(MYSQL_DATA_TYPE.TEXT)
+            # endregion
         else:
-            logger.debug(f'MySQL handler: unknown type code {col[7]}, use TEXT as fallback.')
+            logger.warning(f"MySQL handler: unknown type id={type_int} in column {col[0]}, use TEXT as fallback.")
             mysql_types.append(MYSQL_DATA_TYPE.TEXT)
-        # endregion
 
     # region cast int and bool to nullable types
     serieses = []
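
Note: mysql-connector reports every TEXT and BLOB variant with the same protocol type code (MYSQL_TYPE_BLOB, 252), so the handler falls back to the flags field of the cursor description. A hedged sketch with an illustrative description row (tuple layout assumed from the handler code above):

    MYSQL_TYPE_BLOB = 252  # protocol type code shared by TEXT and BLOB variants

    # illustrative description tuple: (name, type, ..., flags, charset_id)
    col = ("payload", MYSQL_TYPE_BLOB, None, None, None, None, 1, 144, 63)

    kind = "TEXT"  # fallback
    if col[1] == MYSQL_TYPE_BLOB:
        if col[7] == 16:     # observed for TEXT columns
            kind = "TEXT"
        elif col[7] == 144:  # observed for BLOB columns
            kind = "BLOB"
    print(kind)  # BLOB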
@@ -88,22 +87,20 @@
         expected_dtype = None
         column_name = description[i][0]
         if mysql_type in (
-            MYSQL_DATA_TYPE.SMALLINT, MYSQL_DATA_TYPE.INT, MYSQL_DATA_TYPE.MEDIUMINT,
-            MYSQL_DATA_TYPE.BIGINT, MYSQL_DATA_TYPE.TINYINT
+            MYSQL_DATA_TYPE.SMALLINT,
+            MYSQL_DATA_TYPE.INT,
+            MYSQL_DATA_TYPE.MEDIUMINT,
+            MYSQL_DATA_TYPE.BIGINT,
+            MYSQL_DATA_TYPE.TINYINT,
         ):
-            expected_dtype = 'Int64'
+            expected_dtype = "Int64"
         elif mysql_type in (MYSQL_DATA_TYPE.BOOL, MYSQL_DATA_TYPE.BOOLEAN):
-            expected_dtype = 'boolean'
+            expected_dtype = "boolean"
         serieses.append(pd.Series([row[column_name] for row in result], dtype=expected_dtype, name=description[i][0]))
     df = pd.concat(serieses, axis=1, copy=False)
     # endregion
 
-    response = Response(
-        RESPONSE_TYPE.TABLE,
-        df,
-        affected_rows=cursor.rowcount,
-        mysql_types=mysql_types
-    )
+    response = Response(RESPONSE_TYPE.TABLE, df, affected_rows=cursor.rowcount, mysql_types=mysql_types)
     return response
 
 
@@ -219,9 +216,7 @@ class MySQLHandler(DatabaseHandler):
             connection = self.connect()
             result.success = connection.is_connected()
         except mysql.connector.Error as e:
-            logger.error(
-                f'Error connecting to MySQL {self.connection_data["database"]}, {e}!'
-            )
+            logger.error(f"Error connecting to MySQL {self.connection_data['database']}, {e}!")
             result.error_message = str(e)
 
         if result.success and need_to_close:
@@ -252,9 +247,7 @@
             else:
                 response = Response(RESPONSE_TYPE.OK, affected_rows=cur.rowcount)
         except mysql.connector.Error as e:
-            logger.error(
-                f'Error running query: {query} on {self.connection_data["database"]}!'
-            )
+            logger.error(f"Error running query: {query} on {self.connection_data['database']}!")
             response = Response(RESPONSE_TYPE.ERROR, error_message=str(e))
         if connection is not None and connection.is_connected():
             connection.rollback()