MindsDB: mindsdb-25.6.4.0-py3-none-any.whl → mindsdb-25.7.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (61)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +53 -94
  3. mindsdb/api/a2a/agent.py +30 -206
  4. mindsdb/api/a2a/common/server/server.py +26 -27
  5. mindsdb/api/a2a/task_manager.py +93 -227
  6. mindsdb/api/a2a/utils.py +21 -0
  7. mindsdb/api/executor/command_executor.py +8 -6
  8. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +1 -1
  9. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +9 -11
  10. mindsdb/api/executor/datahub/datanodes/system_tables.py +1 -1
  11. mindsdb/api/executor/planner/query_prepare.py +68 -87
  12. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +6 -1
  13. mindsdb/api/executor/sql_query/steps/union_step.py +11 -9
  14. mindsdb/api/executor/utilities/sql.py +97 -21
  15. mindsdb/api/http/namespaces/agents.py +126 -201
  16. mindsdb/api/http/namespaces/config.py +12 -1
  17. mindsdb/api/http/namespaces/file.py +49 -24
  18. mindsdb/api/mcp/start.py +45 -31
  19. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +45 -52
  20. mindsdb/integrations/handlers/huggingface_handler/__init__.py +17 -12
  21. mindsdb/integrations/handlers/huggingface_handler/finetune.py +223 -223
  22. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +383 -383
  23. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -6
  24. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -6
  25. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  26. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +22 -15
  27. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +244 -141
  28. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +1 -1
  29. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +3 -2
  30. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +1 -1
  31. mindsdb/integrations/handlers/statsforecast_handler/requirements.txt +1 -0
  32. mindsdb/integrations/handlers/statsforecast_handler/requirements_extra.txt +1 -0
  33. mindsdb/integrations/libs/keyword_search_base.py +41 -0
  34. mindsdb/integrations/libs/vectordatabase_handler.py +114 -84
  35. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +36 -42
  36. mindsdb/integrations/utilities/sql_utils.py +11 -0
  37. mindsdb/interfaces/agents/agents_controller.py +29 -9
  38. mindsdb/interfaces/agents/langchain_agent.py +7 -5
  39. mindsdb/interfaces/agents/mcp_client_agent.py +4 -4
  40. mindsdb/interfaces/agents/mindsdb_database_agent.py +10 -43
  41. mindsdb/interfaces/data_catalog/data_catalog_reader.py +3 -1
  42. mindsdb/interfaces/database/projects.py +1 -3
  43. mindsdb/interfaces/functions/controller.py +54 -64
  44. mindsdb/interfaces/functions/to_markdown.py +47 -14
  45. mindsdb/interfaces/knowledge_base/controller.py +228 -110
  46. mindsdb/interfaces/knowledge_base/evaluate.py +18 -6
  47. mindsdb/interfaces/knowledge_base/executor.py +346 -0
  48. mindsdb/interfaces/knowledge_base/llm_client.py +5 -6
  49. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +20 -45
  50. mindsdb/interfaces/knowledge_base/preprocessing/models.py +36 -69
  51. mindsdb/interfaces/skills/custom/text2sql/mindsdb_kb_tools.py +2 -0
  52. mindsdb/interfaces/skills/sql_agent.py +181 -130
  53. mindsdb/interfaces/storage/db.py +9 -7
  54. mindsdb/utilities/config.py +58 -40
  55. mindsdb/utilities/exception.py +58 -7
  56. mindsdb/utilities/security.py +54 -11
  57. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/METADATA +245 -259
  58. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/RECORD +61 -58
  59. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/WHEEL +0 -0
  60. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/licenses/LICENSE +0 -0
  61. {mindsdb-25.6.4.0.dist-info → mindsdb-25.7.2.0.dist-info}/top_level.txt +0 -0
@@ -8,7 +8,7 @@ from mindsdb.api.executor.planner import utils
8
8
 
9
9
  def to_string(identifier):
10
10
  # alternative to AST.to_string() but without quoting
11
- return '.'.join(identifier.parts)
11
+ return ".".join(identifier.parts)
12
12
 
13
13
 
14
14
  class Table:
@@ -32,7 +32,6 @@ class Column:
32
32
  def __init__(self, node=None, table=None, name=None, type=None):
33
33
  alias = None
34
34
  if node is not None:
35
-
36
35
  if isinstance(node, ast.Identifier):
37
36
  # set name
38
37
  name = node.parts[-1] # ???
@@ -67,26 +66,25 @@ class Statement:
67
66
  self.offset = 0
68
67
 
69
68
 
70
- class PreparedStatementPlanner():
71
-
69
+ class PreparedStatementPlanner:
72
70
  def __init__(self, planner):
73
71
  self.planner = planner
74
72
 
75
73
  def get_type_of_var(self, v):
76
74
  if isinstance(v, str):
77
- return 'str'
75
+ return "str"
78
76
  elif isinstance(v, float):
79
- return 'float'
77
+ return "float"
80
78
  elif isinstance(v, int):
81
- return 'integer'
79
+ return "integer"
82
80
 
83
- return 'str'
81
+ return "str"
84
82
 
85
83
  def get_statement_info(self):
86
84
  stmt = self.planner.statement
87
85
 
88
86
  if stmt is None:
89
- raise PlanningException('Statement is not prepared')
87
+ raise PlanningException("Statement is not prepared")
90
88
 
91
89
  columns_result = []
92
90
 
@@ -95,45 +93,45 @@ class PreparedStatementPlanner():
95
93
  if column.table is not None:
96
94
  table = column.table.name
97
95
  ds = column.table.ds
98
- columns_result.append(dict(
99
- alias=column.alias,
100
- type=column.type,
101
- name=column.name,
102
- table_name=table,
103
- table_alias=table,
104
- ds=ds,
105
- ))
96
+ columns_result.append(
97
+ dict(
98
+ alias=column.alias,
99
+ type=column.type,
100
+ name=column.name,
101
+ table_name=table,
102
+ table_alias=table,
103
+ ds=ds,
104
+ )
105
+ )
106
106
 
107
107
  parameters = []
108
108
  for param in stmt.params:
109
- name = '?'
110
- parameters.append(dict(
111
- alias=name,
112
- type='str',
113
- name=name,
114
- ))
115
-
116
- return {
117
- 'parameters': parameters,
118
- 'columns': columns_result
119
- }
109
+ name = "?"
110
+ parameters.append(
111
+ dict(
112
+ alias=name,
113
+ type="str",
114
+ name=name,
115
+ )
116
+ )
120
117
 
121
- def get_table_of_column(self, t):
118
+ return {"parameters": parameters, "columns": columns_result}
122
119
 
120
+ def get_table_of_column(self, t):
123
121
  tables_map = self.planner.statement.tables_map
124
122
 
125
123
  # get tables to check
126
124
  if len(t.parts) > 1:
127
125
  # try to find table
128
126
  table_parts = t.parts[:-1]
129
- table_name = '.'.join(table_parts)
127
+ table_name = ".".join(table_parts)
130
128
  if table_name in tables_map:
131
129
  return tables_map[table_name]
132
130
 
133
131
  elif len(table_parts) > 1:
134
132
  # maybe datasource is 1st part
135
133
  table_parts = table_parts[1:]
136
- table_name = '.'.join(table_parts)
134
+ table_name = ".".join(table_parts)
137
135
  if table_name in tables_map:
138
136
  return tables_map[table_name]
139
137
 
@@ -158,14 +156,10 @@ class PreparedStatementPlanner():
158
156
  # in reverse order
159
157
  for p in table.parts[::-1]:
160
158
  parts.insert(0, p)
161
- keys.append('.'.join(parts))
159
+ keys.append(".".join(parts))
162
160
 
163
161
  # remember table
164
- tbl = Table(
165
- ds=ds,
166
- node=table,
167
- is_predictor=is_predictor
168
- )
162
+ tbl = Table(ds=ds, node=table, is_predictor=is_predictor)
169
163
  tbl.keys = keys
170
164
 
171
165
  return tbl
@@ -189,7 +183,6 @@ class PreparedStatementPlanner():
189
183
  stmt.tables_map = {}
190
184
  stmt.tables_lvl1 = []
191
185
  if query.from_table is not None:
192
-
193
186
  if isinstance(query.from_table, ast.Join):
194
187
  # get all tables
195
188
  join_tables = utils.convert_join_to_list(query.from_table)
@@ -198,21 +191,17 @@ class PreparedStatementPlanner():
198
191
 
199
192
  if isinstance(query.from_table, ast.Select):
200
193
  # nested select, get only last select
201
- join_tables = [
202
- dict(
203
- table=utils.get_deepest_select(query.from_table).from_table
204
- )
205
- ]
194
+ join_tables = [dict(table=utils.get_deepest_select(query.from_table).from_table)]
206
195
 
207
196
  for i, join_table in enumerate(join_tables):
208
- table = join_table['table']
197
+ table = join_table["table"]
209
198
  if isinstance(table, ast.Identifier):
210
199
  tbl = self.table_from_identifier(table)
211
200
 
212
201
  if tbl.is_predictor:
213
202
  # Is the last table?
214
203
  if i + 1 < len(join_tables):
215
- raise PlanningException('Predictor must be last table in query')
204
+ raise PlanningException("Predictor must be last table in query")
216
205
 
217
206
  stmt.tables_lvl1.append(tbl)
218
207
  for key in tbl.keys:
@@ -225,13 +214,12 @@ class PreparedStatementPlanner():
225
214
  # is there any predictors at other levels?
226
215
  lvl1_predictors = [i for i in stmt.tables_lvl1 if i.is_predictor]
227
216
  if len(query_predictors) != len(lvl1_predictors):
228
- raise PlanningException('Predictor is not at first level')
217
+ raise PlanningException("Predictor is not at first level")
229
218
 
230
219
  # === get targets ===
231
220
  columns = []
232
221
  get_all_tables = False
233
222
  for t in query.targets:
234
-
235
223
  column = Column(t)
236
224
 
237
225
  # column alias
@@ -264,10 +252,10 @@ class PreparedStatementPlanner():
264
252
  column.type = self.get_type_of_var(t.value)
265
253
  elif isinstance(t, ast.Function):
266
254
  # mysql function
267
- if t.op == 'connection_id':
268
- column.type = 'integer'
255
+ if t.op == "connection_id":
256
+ column.type = "integer"
269
257
  else:
270
- column.type = 'str'
258
+ column.type = "str"
271
259
  else:
272
260
  # TODO go down into lower level.
273
261
  # It can be function, operation, select.
@@ -276,7 +264,7 @@ class PreparedStatementPlanner():
276
264
  # TODO add several known types for function, i.e ABS-int
277
265
 
278
266
  # TODO TypeCast - as casted type
279
- column.type = 'str'
267
+ column.type = "str"
280
268
 
281
269
  if alias is not None:
282
270
  column.alias = alias
@@ -299,28 +287,25 @@ class PreparedStatementPlanner():
299
287
  if step.result_data is not None:
300
288
  # save results
301
289
 
302
- if len(step.result_data['tables']) > 0:
303
- table_info = step.result_data['tables'][0]
304
- columns_info = step.result_data['columns'][table_info]
290
+ if len(step.result_data["tables"]) > 0:
291
+ table_info = step.result_data["tables"][0]
292
+ columns_info = step.result_data["columns"][table_info]
305
293
 
306
294
  table.columns = []
307
295
  table.ds = table_info[0]
308
296
  for col in columns_info:
309
297
  if isinstance(col, tuple):
310
298
  # is predictor
311
- col = dict(name=col[0], type='str')
299
+ col = dict(name=col[0], type="str")
312
300
  table.columns.append(
313
301
  Column(
314
- name=col['name'],
315
- type=col['type'],
302
+ name=col["name"],
303
+ type=col["type"],
316
304
  )
317
305
  )
318
306
 
319
307
  # map by names
320
- table.columns_map = {
321
- i.name.upper(): i
322
- for i in table.columns
323
- }
308
+ table.columns_map = {i.name.upper(): i for i in table.columns}
324
309
 
325
310
  # === create columns list ===
326
311
  columns_result = []
@@ -329,7 +314,7 @@ class PreparedStatementPlanner():
329
314
  # add data from all tables
330
315
  for table in stmt.tables_lvl1:
331
316
  if table.columns is None:
332
- raise PlanningException(f'Table is not found {table.name}')
317
+ raise PlanningException(f"Table is not found {table.name}")
333
318
 
334
319
  for col in table.columns:
335
320
  # col = {name: 'col', type: 'str'}
@@ -354,7 +339,7 @@ class PreparedStatementPlanner():
354
339
  column.type = table.columns_map[col_name].type
355
340
  else:
356
341
  # continue
357
- raise PlanningException(f'Column not found {col_name}')
342
+ raise PlanningException(f"Column not found {col_name}")
358
343
 
359
344
  else:
360
345
  # table is not found, looking for in all tables
@@ -368,11 +353,11 @@ class PreparedStatementPlanner():
368
353
 
369
354
  # forcing alias
370
355
  if column.alias is None:
371
- column.alias = f'column_{i}'
356
+ column.alias = f"column_{i}"
372
357
 
373
358
  # forcing type
374
359
  if column.type is None:
375
- column.type = 'str'
360
+ column.type = "str"
376
361
 
377
362
  columns_result.append(column)
378
363
 
@@ -393,28 +378,25 @@ class PreparedStatementPlanner():
393
378
  if step.result_data is not None:
394
379
  # save results
395
380
 
396
- if len(step.result_data['tables']) > 0:
397
- table_info = step.result_data['tables'][0]
398
- columns_info = step.result_data['columns'][table_info]
381
+ if len(step.result_data["tables"]) > 0:
382
+ table_info = step.result_data["tables"][0]
383
+ columns_info = step.result_data["columns"][table_info]
399
384
 
400
385
  table.columns = []
401
386
  table.ds = table_info[0]
402
387
  for col in columns_info:
403
388
  if isinstance(col, tuple):
404
389
  # is predictor
405
- col = dict(name=col[0], type='str')
390
+ col = dict(name=col[0], type="str")
406
391
  table.columns.append(
407
392
  Column(
408
- name=col['name'],
409
- type=col['type'],
393
+ name=col["name"],
394
+ type=col["type"],
410
395
  )
411
396
  )
412
397
 
413
398
  # map by names
414
- table.columns_map = {
415
- i.name.upper(): i
416
- for i in table.columns
417
- }
399
+ table.columns_map = {i.name.upper(): i for i in table.columns}
418
400
 
419
401
  # save results
420
402
  columns_result = []
@@ -430,7 +412,7 @@ class PreparedStatementPlanner():
430
412
 
431
413
  if column.type is None:
432
414
  # forcing type
433
- column.type = 'str'
415
+ column.type = "str"
434
416
 
435
417
  columns_result.append(column)
436
418
 
@@ -440,13 +422,12 @@ class PreparedStatementPlanner():
440
422
  stmt = self.planner.statement
441
423
 
442
424
  stmt.columns = [
443
- Column(name='Variable_name', type='str'),
444
- Column(name='Value', type='str'),
425
+ Column(name="Variable_name", type="str"),
426
+ Column(name="Value", type="str"),
445
427
  ]
446
428
  return []
447
429
 
448
430
  def prepare_steps(self, query):
449
-
450
431
  stmt = Statement()
451
432
  self.planner.statement = stmt
452
433
 
@@ -476,7 +457,6 @@ class PreparedStatementPlanner():
476
457
  if isinstance(query, ast.Show):
477
458
  return self.prepare_show(query)
478
459
  else:
479
-
480
460
  # do nothing
481
461
  return []
482
462
  # raise NotImplementedError(query.__name__)
@@ -496,7 +476,6 @@ class PreparedStatementPlanner():
496
476
  query = self.planner.query
497
477
 
498
478
  if params is not None:
499
-
500
479
  if len(params) != len(stmt.params):
501
480
  raise PlanningException("Count of execution parameters don't match prepared statement")
502
481
 
@@ -508,12 +487,14 @@ class PreparedStatementPlanner():
508
487
  stmt.params = None
509
488
 
510
489
  if (
511
- isinstance(query, ast.Select)
512
- or isinstance(query, ast.Union)
513
- or isinstance(query, ast.CreateTable)
514
- or isinstance(query, ast.Insert)
515
- or isinstance(query, ast.Update)
516
- or isinstance(query, ast.Delete)
490
+ isinstance(query, ast.Select)
491
+ or isinstance(query, ast.Union)
492
+ or isinstance(query, ast.CreateTable)
493
+ or isinstance(query, ast.Insert)
494
+ or isinstance(query, ast.Update)
495
+ or isinstance(query, ast.Delete)
496
+ or isinstance(query, ast.Intersect)
497
+ or isinstance(query, ast.Except)
517
498
  ):
518
499
  return self.plan_query(query)
519
500
  else:
@@ -6,6 +6,8 @@ from mindsdb_sql_parser.ast import (
6
6
  Parameter,
7
7
  BinaryOperation,
8
8
  Tuple,
9
+ Union,
10
+ Intersect,
9
11
  )
10
12
 
11
13
  from mindsdb.api.executor.planner.steps import FetchDataframeStep
@@ -92,7 +94,10 @@ class FetchDataframeStepCall(BaseStepCall):
92
94
  response: DataHubResponse = dn.query(native_query=step.raw_query, session=self.session)
93
95
  df = response.data_frame
94
96
  else:
95
- table_alias = get_table_alias(step.query.from_table, self.context.get("database"))
97
+ if isinstance(step.query, (Union, Intersect)):
98
+ table_alias = ["", "", ""]
99
+ else:
100
+ table_alias = get_table_alias(step.query.from_table, self.context.get("database"))
96
101
 
97
102
  # TODO for information_schema we have 'database' = 'mindsdb'
98
103
 
@@ -9,7 +9,6 @@ from .base import BaseStepCall
9
9
 
10
10
 
11
11
  class UnionStepCall(BaseStepCall):
12
-
13
12
  bind = UnionStep
14
13
 
15
14
  def call(self, step):
@@ -19,7 +18,8 @@ class UnionStepCall(BaseStepCall):
19
18
  # count of columns have to match
20
19
  if len(left_result.columns) != len(right_result.columns):
21
20
  raise WrongArgumentError(
22
- f'UNION columns count mismatch: {len(left_result.columns)} != {len(right_result.columns)} ')
21
+ f"UNION columns count mismatch: {len(left_result.columns)} != {len(right_result.columns)} "
22
+ )
23
23
 
24
24
  # types have to match
25
25
  # TODO: return checking type later
@@ -33,19 +33,21 @@ class UnionStepCall(BaseStepCall):
33
33
  table_a, names = left_result.to_df_cols()
34
34
  table_b, _ = right_result.to_df_cols()
35
35
 
36
- op = 'UNION ALL'
37
- if step.unique:
38
- op = 'UNION'
36
+ if step.operation.lower() == "intersect":
37
+ op = "INTERSECT"
38
+ else:
39
+ op = "UNION"
40
+
41
+ if step.unique is not True:
42
+ op += " ALL"
43
+
39
44
  query = f"""
40
45
  SELECT * FROM table_a
41
46
  {op}
42
47
  SELECT * FROM table_b
43
48
  """
44
49
 
45
- resp_df, _description = query_df_with_type_infer_fallback(query, {
46
- 'table_a': table_a,
47
- 'table_b': table_b
48
- })
50
+ resp_df, _description = query_df_with_type_infer_fallback(query, {"table_a": table_a, "table_b": table_b})
49
51
  resp_df.replace({np.nan: None}, inplace=True)
50
52
 
51
53
  return ResultSet.from_df_cols(df=resp_df, columns_dict=names)
@@ -6,13 +6,14 @@ from duckdb import InvalidInputException
6
6
  import numpy as np
7
7
 
8
8
  from mindsdb_sql_parser import parse_sql
9
- from mindsdb.utilities.render.sqlalchemy_render import SqlalchemyRender
10
- from mindsdb.integrations.utilities.query_traversal import query_traversal
11
9
  from mindsdb_sql_parser.ast import ASTNode, Select, Identifier, Function, Constant
12
- from mindsdb.utilities.functions import resolve_table_identifier, resolve_model_identifier
13
10
 
11
+ from mindsdb.integrations.utilities.query_traversal import query_traversal
14
12
  from mindsdb.utilities import log
13
+ from mindsdb.utilities.exception import format_db_error_message
14
+ from mindsdb.utilities.functions import resolve_table_identifier, resolve_model_identifier
15
15
  from mindsdb.utilities.json_encoder import CustomJSONEncoder
16
+ from mindsdb.utilities.render.sqlalchemy_render import SqlalchemyRender
16
17
 
17
18
  logger = log.getLogger(__name__)
18
19
 
@@ -64,29 +65,85 @@ def query_df_with_type_infer_fallback(query_str: str, dataframes: dict, user_fun
64
65
  pandas.columns
65
66
  """
66
67
 
67
- with duckdb.connect(database=":memory:") as con:
68
- if user_functions:
69
- user_functions.register(con)
70
-
71
- for name, value in dataframes.items():
72
- con.register(name, value)
73
-
74
- exception = None
75
- for sample_size in [1000, 10000, 1000000]:
76
- try:
77
- con.execute(f"set global pandas_analyze_sample={sample_size};")
78
- result_df = con.execute(query_str).fetchdf()
79
- except InvalidInputException as e:
80
- exception = e
68
+ try:
69
+ with duckdb.connect(database=":memory:") as con:
70
+ if user_functions:
71
+ user_functions.register(con)
72
+
73
+ for name, value in dataframes.items():
74
+ con.register(name, value)
75
+
76
+ exception = None
77
+ for sample_size in [1000, 10000, 1000000]:
78
+ try:
79
+ con.execute(f"set global pandas_analyze_sample={sample_size};")
80
+ result_df = con.execute(query_str).fetchdf()
81
+ except InvalidInputException as e:
82
+ exception = e
83
+ else:
84
+ break
81
85
  else:
82
- break
83
- else:
84
- raise exception
85
- description = con.description
86
+ raise exception
87
+ description = con.description
88
+ except Exception as e:
89
+ raise Exception(
90
+ format_db_error_message(db_type="DuckDB", db_error_msg=str(e), failed_query=query_str, is_external=False)
91
+ ) from e
86
92
 
87
93
  return result_df, description
88
94
 
89
95
 
96
+ _duckdb_functions_and_kw_list = None
97
+
98
+
99
+ def get_duckdb_functions_and_kw_list() -> list[str] | None:
100
+ """Returns a list of all functions and keywords supported by DuckDB.
101
+ The list is merge of:
102
+ - list of duckdb's functions: 'select * from duckdb_functions()' or 'pragma functions'
103
+ - ist of keywords, because of some functions are just sintax-sugar
104
+ and not present in the duckdb_functions (like 'if()').
105
+ - hardcoded list of window_functions, because there are no way to get if from duckdb,
106
+ and they are not present in the duckdb_functions()
107
+
108
+ Returns:
109
+ list[str] | None: List of supported functions and keywords, or None if unable to retrieve the list.
110
+ """
111
+ global _duckdb_functions_and_kw_list
112
+ window_functions_list = [
113
+ "cume_dist",
114
+ "dense_rank",
115
+ "first_value",
116
+ "lag",
117
+ "last_value",
118
+ "lead",
119
+ "nth_value",
120
+ "ntile",
121
+ "percent_rank",
122
+ "rank_dense",
123
+ "rank",
124
+ "row_number",
125
+ ]
126
+ if _duckdb_functions_and_kw_list is None:
127
+ try:
128
+ df, _ = query_df_with_type_infer_fallback(
129
+ """
130
+ select distinct name
131
+ from (
132
+ select function_name as name from duckdb_functions()
133
+ union all
134
+ select keyword_name as name from duckdb_keywords()
135
+ ) ta;
136
+ """,
137
+ dataframes={},
138
+ )
139
+ df.columns = [name.lower() for name in df.columns]
140
+ _duckdb_functions_and_kw_list = df["name"].drop_duplicates().str.lower().to_list() + window_functions_list
141
+ except Exception as e:
142
+ logger.warning(f"Unable to get DuckDB functions list: {e}")
143
+
144
+ return _duckdb_functions_and_kw_list
145
+
146
+
90
147
  def query_df(df, query, session=None):
91
148
  """Perform simple query ('select' from one table, without subqueries and joins) on DataFrame.
92
149
 
@@ -100,8 +157,10 @@ def query_df(df, query, session=None):
100
157
 
101
158
  if isinstance(query, str):
102
159
  query_ast = parse_sql(query)
160
+ query_str = query
103
161
  else:
104
162
  query_ast = copy.deepcopy(query)
163
+ query_str = str(query)
105
164
 
106
165
  if isinstance(query_ast, Select) is False or isinstance(query_ast.from_table, Identifier) is False:
107
166
  raise Exception("Only 'SELECT from TABLE' statements supported for internal query")
@@ -125,6 +184,7 @@ def query_df(df, query, session=None):
125
184
  return node
126
185
  if isinstance(node, Function):
127
186
  fnc_name = node.op.lower()
187
+
128
188
  if fnc_name == "database" and len(node.args) == 0:
129
189
  if session is not None:
130
190
  cur_db = session.database
@@ -142,6 +202,22 @@ def query_df(df, query, session=None):
142
202
  if user_functions is not None:
143
203
  user_functions.check_function(node)
144
204
 
205
+ duckdb_functions_and_kw_list = get_duckdb_functions_and_kw_list() or []
206
+ custom_functions_list = [] if user_functions is None else list(user_functions.functions.keys())
207
+ all_functions_list = duckdb_functions_and_kw_list + custom_functions_list
208
+ if len(all_functions_list) > 0 and fnc_name not in all_functions_list:
209
+ raise Exception(
210
+ format_db_error_message(
211
+ db_type="DuckDB",
212
+ db_error_msg=(
213
+ f"Unknown function: '{fnc_name}'. This function is not recognized during internal query processing.\n"
214
+ "Please use DuckDB-supported functions instead."
215
+ ),
216
+ failed_query=query_str,
217
+ is_external=False,
218
+ )
219
+ )
220
+
145
221
  query_traversal(query_ast, adapt_query)
146
222
 
147
223
  # convert json columns