MindsDB 25.7.3.0__py3-none-any.whl → 25.8.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (102)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +11 -1
  3. mindsdb/api/a2a/common/server/server.py +16 -6
  4. mindsdb/api/executor/command_executor.py +215 -150
  5. mindsdb/api/executor/datahub/datanodes/project_datanode.py +14 -3
  6. mindsdb/api/executor/planner/plan_join.py +3 -0
  7. mindsdb/api/executor/planner/plan_join_ts.py +117 -100
  8. mindsdb/api/executor/planner/query_planner.py +1 -0
  9. mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +54 -85
  10. mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +21 -24
  11. mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +9 -3
  12. mindsdb/api/executor/sql_query/steps/subselect_step.py +11 -8
  13. mindsdb/api/executor/utilities/mysql_to_duckdb_functions.py +264 -0
  14. mindsdb/api/executor/utilities/sql.py +30 -0
  15. mindsdb/api/http/initialize.py +18 -44
  16. mindsdb/api/http/namespaces/agents.py +23 -20
  17. mindsdb/api/http/namespaces/chatbots.py +83 -120
  18. mindsdb/api/http/namespaces/file.py +1 -1
  19. mindsdb/api/http/namespaces/jobs.py +38 -60
  20. mindsdb/api/http/namespaces/tree.py +69 -61
  21. mindsdb/api/http/namespaces/views.py +56 -72
  22. mindsdb/api/mcp/start.py +2 -0
  23. mindsdb/api/mysql/mysql_proxy/utilities/dump.py +3 -2
  24. mindsdb/integrations/handlers/autogluon_handler/requirements.txt +1 -1
  25. mindsdb/integrations/handlers/autosklearn_handler/requirements.txt +1 -1
  26. mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +25 -5
  27. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +3 -3
  28. mindsdb/integrations/handlers/db2_handler/db2_handler.py +19 -23
  29. mindsdb/integrations/handlers/flaml_handler/requirements.txt +1 -1
  30. mindsdb/integrations/handlers/gong_handler/__about__.py +2 -0
  31. mindsdb/integrations/handlers/gong_handler/__init__.py +30 -0
  32. mindsdb/integrations/handlers/gong_handler/connection_args.py +37 -0
  33. mindsdb/integrations/handlers/gong_handler/gong_handler.py +164 -0
  34. mindsdb/integrations/handlers/gong_handler/gong_tables.py +508 -0
  35. mindsdb/integrations/handlers/gong_handler/icon.svg +25 -0
  36. mindsdb/integrations/handlers/gong_handler/test_gong_handler.py +125 -0
  37. mindsdb/integrations/handlers/google_calendar_handler/google_calendar_tables.py +82 -73
  38. mindsdb/integrations/handlers/hubspot_handler/requirements.txt +1 -1
  39. mindsdb/integrations/handlers/huggingface_handler/__init__.py +8 -12
  40. mindsdb/integrations/handlers/huggingface_handler/finetune.py +203 -223
  41. mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +360 -383
  42. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -7
  43. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -7
  44. mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
  45. mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +83 -77
  46. mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
  47. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +5 -2
  48. mindsdb/integrations/handlers/litellm_handler/settings.py +2 -1
  49. mindsdb/integrations/handlers/openai_handler/constants.py +11 -30
  50. mindsdb/integrations/handlers/openai_handler/helpers.py +27 -34
  51. mindsdb/integrations/handlers/openai_handler/openai_handler.py +14 -12
  52. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +106 -90
  53. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +41 -39
  54. mindsdb/integrations/handlers/salesforce_handler/constants.py +215 -0
  55. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +141 -80
  56. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +0 -1
  57. mindsdb/integrations/handlers/tpot_handler/requirements.txt +1 -1
  58. mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +32 -17
  59. mindsdb/integrations/handlers/web_handler/web_handler.py +19 -22
  60. mindsdb/integrations/libs/llm/config.py +0 -14
  61. mindsdb/integrations/libs/llm/utils.py +0 -15
  62. mindsdb/integrations/libs/vectordatabase_handler.py +10 -1
  63. mindsdb/integrations/utilities/files/file_reader.py +5 -19
  64. mindsdb/integrations/utilities/handler_utils.py +32 -12
  65. mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +1 -1
  66. mindsdb/interfaces/agents/agents_controller.py +246 -149
  67. mindsdb/interfaces/agents/constants.py +0 -1
  68. mindsdb/interfaces/agents/langchain_agent.py +11 -6
  69. mindsdb/interfaces/data_catalog/data_catalog_loader.py +4 -4
  70. mindsdb/interfaces/database/database.py +38 -13
  71. mindsdb/interfaces/database/integrations.py +20 -5
  72. mindsdb/interfaces/database/projects.py +174 -23
  73. mindsdb/interfaces/database/views.py +86 -60
  74. mindsdb/interfaces/jobs/jobs_controller.py +103 -110
  75. mindsdb/interfaces/knowledge_base/controller.py +33 -6
  76. mindsdb/interfaces/knowledge_base/evaluate.py +2 -1
  77. mindsdb/interfaces/knowledge_base/executor.py +24 -0
  78. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +6 -10
  79. mindsdb/interfaces/knowledge_base/preprocessing/text_splitter.py +73 -0
  80. mindsdb/interfaces/query_context/context_controller.py +111 -145
  81. mindsdb/interfaces/skills/skills_controller.py +18 -6
  82. mindsdb/interfaces/storage/db.py +40 -6
  83. mindsdb/interfaces/variables/variables_controller.py +8 -15
  84. mindsdb/utilities/config.py +5 -3
  85. mindsdb/utilities/fs.py +54 -17
  86. mindsdb/utilities/functions.py +72 -60
  87. mindsdb/utilities/log.py +38 -6
  88. mindsdb/utilities/ps.py +7 -7
  89. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/METADATA +282 -268
  90. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/RECORD +94 -92
  91. mindsdb/integrations/handlers/anyscale_endpoints_handler/__about__.py +0 -9
  92. mindsdb/integrations/handlers/anyscale_endpoints_handler/__init__.py +0 -20
  93. mindsdb/integrations/handlers/anyscale_endpoints_handler/anyscale_endpoints_handler.py +0 -290
  94. mindsdb/integrations/handlers/anyscale_endpoints_handler/creation_args.py +0 -14
  95. mindsdb/integrations/handlers/anyscale_endpoints_handler/icon.svg +0 -4
  96. mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -2
  97. mindsdb/integrations/handlers/anyscale_endpoints_handler/settings.py +0 -51
  98. mindsdb/integrations/handlers/anyscale_endpoints_handler/tests/test_anyscale_endpoints_handler.py +0 -212
  99. /mindsdb/integrations/handlers/{anyscale_endpoints_handler/tests/__init__.py → gong_handler/requirements.txt} +0 -0
  100. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/WHEEL +0 -0
  101. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/licenses/LICENSE +0 -0
  102. {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/openai_handler/helpers.py

@@ -14,6 +14,7 @@ class PendingFT(openai.OpenAIError):
     """
     Custom exception to handle pending fine-tuning status.
     """
+
     message: str

     def __init__(self, message) -> None:
@@ -65,10 +66,7 @@ def retry_with_exponential_backoff(

         if isinstance(hour_budget, float) or isinstance(hour_budget, int):
             try:
-                max_retries = round(
-                    (math.log((hour_budget * 3600) / initial_delay))
-                    / math.log(exponential_base)
-                )
+                max_retries = round((math.log((hour_budget * 3600) / initial_delay)) / math.log(exponential_base))
             except ValueError:
                 max_retries = 10
         else:
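
Note: the condensed `max_retries` expression solves, with jitter ignored, `initial_delay * exponential_base**n = hour_budget * 3600` for n, i.e. the retry index at which a single delay would reach the full hour budget. A quick standalone check (parameter values are illustrative, not necessarily the handler defaults):

    import math

    initial_delay = 1.0      # seconds (illustrative)
    exponential_base = 2.0   # illustrative
    hour_budget = 0.3        # hours, roughly 18 minutes

    max_retries = round(math.log((hour_budget * 3600) / initial_delay) / math.log(exponential_base))
    print(max_retries)  # 10 for these inputs: 2**10 = 1024 s, about the 18-minute budget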
@@ -81,22 +79,20 @@ def retry_with_exponential_backoff(

             except status_errors as e:
                 raise Exception(
-                    f'Error status {e.status_code} raised by OpenAI API: {e.body.get("message", "Please refer to `https://platform.openai.com/docs/guides/error-codes` for more information.")}'  # noqa
+                    f"Error status {e.status_code} raised by OpenAI API: {e.body.get('message', 'Please refer to `https://platform.openai.com/docs/guides/error-codes` for more information.')}"  # noqa
                 )  # noqa

             except wait_errors:
                 num_retries += 1
                 if num_retries > max_retries:
-                    raise Exception(
-                        f"Maximum number of retries ({max_retries}) exceeded."
-                    )
+                    raise Exception(f"Maximum number of retries ({max_retries}) exceeded.")
                 # Increment the delay and wait
                 delay *= exponential_base * (1 + jitter * random.random())
                 time.sleep(delay)

             except openai.OpenAIError as e:
                 raise Exception(
-                    f'General {str(e)} error raised by OpenAI. Please refer to `https://platform.openai.com/docs/guides/error-codes` for more information.'  # noqa
+                    f"General {str(e)} error raised by OpenAI. Please refer to `https://platform.openai.com/docs/guides/error-codes` for more information."  # noqa
                 )

             except Exception as e:
@@ -107,7 +103,7 @@ def retry_with_exponential_backoff(
     return _retry_with_exponential_backoff


-def truncate_msgs_for_token_limit(messages: List[Dict], model_name: Text, max_tokens: int, truncate: Text = 'first'):
+def truncate_msgs_for_token_limit(messages: List[Dict], model_name: Text, max_tokens: int, truncate: Text = "first"):
     """
     Truncates message list to fit within the token limit.
     The first message for chat completion models are general directives with the system role, which will ideally be kept at all times.
@@ -129,20 +125,18 @@ def truncate_msgs_for_token_limit(messages: List[Dict], model_name: Text, max_to
     except KeyError:
         # If the encoding is not found, defualt to cl100k_base.
         # This is applicable for handlers that extend the OpenAI handler such as Anyscale.
-        model_name = 'gpt-3.5-turbo-0301'
-        encoder = tiktoken.get_encoding('cl100k_base')
+        model_name = "gpt-3.5-turbo-0301"
+        encoder = tiktoken.get_encoding("cl100k_base")

     sys_priming = messages[0:1]
     n_tokens = count_tokens(messages, encoder, model_name)
     while n_tokens > max_tokens:
         if len(messages) == 2:
-            return messages[
-                :-1
-            ]  # edge case: if limit is surpassed by just one input, we remove initial instruction
+            return messages[:-1]  # edge case: if limit is surpassed by just one input, we remove initial instruction
         elif len(messages) == 1:
             return messages

-        if truncate == 'first':
+        if truncate == "first":
             messages = sys_priming + messages[2:]
         else:
             messages = sys_priming + messages[1:-1]
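
Note: the truncation loop always keeps `sys_priming` (the first, system-role message) and drops one message per pass until `count_tokens` fits the budget. An illustration with a hypothetical four-message history:

    msgs = [
        {"role": "system", "content": "directives"},  # sys_priming, always kept
        {"role": "user", "content": "oldest"},
        {"role": "assistant", "content": "middle"},
        {"role": "user", "content": "newest"},
    ]
    # truncate == "first": sys_priming + msgs[2:] drops the oldest exchange
    #   -> [directives, middle, newest]
    # otherwise: sys_priming + msgs[1:-1] drops the most recent message
    #   -> [directives, oldest, middle]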
@@ -151,7 +145,7 @@ def truncate_msgs_for_token_limit(messages: List[Dict], model_name: Text, max_to
     return messages


-def count_tokens(messages: List[Dict], encoder: tiktoken.core.Encoding, model_name: Text = 'gpt-3.5-turbo-0301'):
+def count_tokens(messages: List[Dict], encoder: tiktoken.core.Encoding, model_name: Text = "gpt-3.5-turbo-0301"):
     """
     Counts the number of tokens in a list of messages.

@@ -160,24 +154,23 @@ def count_tokens(messages: List[Dict], encoder: tiktoken.core.Encoding, model_na
         encoder: Tokenizer
         model_name: Model name
     """
-    if (
-        "gpt-3.5-turbo" in model_name
-    ):  # note: future models may deviate from this (only 0301 really complies)
-        num_tokens = 0
-        for message in messages:
-            num_tokens += (
-                4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
-            )
-            for key, value in message.items():
-                num_tokens += len(encoder.encode(value))
-                if key == "name":  # if there's a name, the role is omitted
-                    num_tokens += -1  # role is always required and always 1 token
-        num_tokens += 2  # every reply is primed with <im_start>assistant
-        return num_tokens
+    if "gpt-3.5-turbo" in model_name:  # note: future models may deviate from this (only 0301 really complies)
+        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
+        tokens_per_name = -1
     else:
-        raise NotImplementedError(
-            f"""_count_tokens() is not presently implemented for model {model_name}."""
-        )
+        tokens_per_message = 3
+        tokens_per_name = 1
+
+    num_tokens = 0
+    for message in messages:
+        num_tokens += tokens_per_message
+
+        for key, value in message.items():
+            num_tokens += len(encoder.encode(value))
+            if key == "name":  # if there's a name, the role is omitted
+                num_tokens += tokens_per_name
+    num_tokens += 2  # every reply is primed with <im_start>assistant
+    return num_tokens


 def get_available_models(client) -> List[Text]:
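
Note: the rewrite replaces the gpt-3.5-turbo-only branch (which raised NotImplementedError for every other model) with per-model `tokens_per_message`/`tokens_per_name` constants, so other models fall through to the 3/1 defaults instead of failing. A self-contained sketch of the same counting scheme (assumes tiktoken is installed; the messages are illustrative):

    import tiktoken

    encoder = tiktoken.get_encoding("cl100k_base")
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
    ]

    tokens_per_message, tokens_per_name = 3, 1  # non-gpt-3.5-turbo defaults above
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoder.encode(value))
            if key == "name":  # a name replaces the role token
                num_tokens += tokens_per_name
    num_tokens += 2  # reply priming, as in the helper
    print(num_tokens)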
mindsdb/integrations/handlers/openai_handler/openai_handler.py

@@ -24,7 +24,7 @@ from mindsdb.integrations.handlers.openai_handler.helpers import (
     PendingFT,
 )
 from mindsdb.integrations.handlers.openai_handler.constants import (
-    CHAT_MODELS,
+    CHAT_MODELS_PREFIXES,
     IMAGE_MODELS,
     FINETUNING_MODELS,
     OPENAI_API_BASE,
@@ -62,7 +62,6 @@ class OpenAIHandler(BaseMLEngine):
         self.rate_limit = 60  # requests per minute
         self.max_batch_size = 20
         self.default_max_tokens = 100
-        self.chat_completion_models = CHAT_MODELS
         self.supported_ft_models = FINETUNING_MODELS  # base models compatible with finetuning
         # For now this are only used for handlers that inherits OpenAIHandler and don't need to override base methods
         self.api_key_name = getattr(self, "api_key_name", self.name)
@@ -89,6 +88,13 @@ class OpenAIHandler(BaseMLEngine):
         client = self._get_client(api_key=api_key, base_url=api_base, org=org, args=connection_args)
         OpenAIHandler._check_client_connection(client)

+    @staticmethod
+    def is_chat_model(model_name):
+        for prefix in CHAT_MODELS_PREFIXES:
+            if model_name.startswith(prefix):
+                return True
+        return False
+
     @staticmethod
     def _check_client_connection(client: OpenAI) -> None:
         """
@@ -350,11 +356,6 @@ class OpenAIHandler(BaseMLEngine):
             "user": pred_args.get("user", None),
         }

-        if args.get("mode", self.default_mode) != "default" and model_name not in self.chat_completion_models:
-            raise Exception(
-                f"Conversational modes are only available for the following models: {', '.join(self.chat_completion_models)}"
-            )  # noqa
-
         if args.get("prompt_template", False):
             prompts, empty_prompt_ids = get_completed_prompts(base_template, df, strict=strict_prompt_template)

@@ -515,7 +516,7 @@ class OpenAIHandler(BaseMLEngine):
             return _submit_image_completion(kwargs, prompts, api_args)
         elif model_name == "embedding":
             return _submit_embedding_completion(kwargs, prompts, api_args)
-        elif model_name in self.chat_completion_models:
+        elif self.is_chat_model(model_name):
             if model_name == "gpt-3.5-turbo-instruct":
                 return _submit_normal_completion(kwargs, prompts, api_args)
             else:
@@ -579,13 +580,14 @@ class OpenAIHandler(BaseMLEngine):
                     tidy_comps.append(c.text.strip("\n").strip(""))
                 return tidy_comps

-            kwargs["prompt"] = prompts
             kwargs = {**kwargs, **api_args}

             before_openai_query(kwargs)
-            resp = _tidy(client.completions.create(**kwargs))
-            _log_api_call(kwargs, resp)
-            return resp
+            responses = []
+            for prompt in prompts:
+                responses.extend(_tidy(client.completions.create(prompt=prompt, **kwargs)))
+            _log_api_call(kwargs, responses)
+            return responses

         def _submit_embedding_completion(kwargs: Dict, prompts: List[Text], api_args: Dict) -> List[float]:
             """
mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py

@@ -5,7 +5,19 @@ from urllib.parse import urlparse

 import pandas as pd
 import psycopg
-from mindsdb_sql_parser.ast import Parameter, Identifier, Update, BinaryOperation
+from mindsdb_sql_parser.ast import (
+    Parameter,
+    Identifier,
+    BinaryOperation,
+    Tuple as AstTuple,
+    Constant,
+    Select,
+    OrderBy,
+    TypeCast,
+    Delete,
+    Update,
+    Function,
+)
 from pgvector.psycopg import register_vector

 from mindsdb.integrations.handlers.postgres_handler.postgres_handler import (
@@ -17,6 +29,7 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
     VectorStoreHandler,
     DistanceFunction,
     TableField,
+    FilterOperator,
 )
 from mindsdb.integrations.libs.keyword_search_base import KeywordSearchBase
 from mindsdb.integrations.utilities.sql_utils import KeywordSearchArgs
@@ -169,31 +182,42 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
         embedding_condition = None

         for condition in conditions:
+            is_embedding = condition.column == "embeddings"
+
             parts = condition.column.split(".")
-            key = parts[0]
+            key = Identifier(parts[0])
+
             # converts 'col.el1.el2' to col->'el1'->>'el2'
             if len(parts) > 1:
                 # intermediate elements
                 for el in parts[1:-1]:
-                    key += f" -> '{el}'"
+                    key = BinaryOperation(op="->", args=[key, Constant(el)])

                 # last element
-                key += f" ->> '{parts[-1]}'"
+                key = BinaryOperation(op="->>", args=[key, Constant(parts[-1])])

             type_cast = None
-            if isinstance(condition.value, int):
+            value = condition.value
+            if (
+                isinstance(value, list)
+                and len(value) > 0
+                and condition.op in (FilterOperator.IN, FilterOperator.NOT_IN)
+            ):
+                value = condition.value[0]
+
+            if isinstance(value, int):
                 type_cast = "int"
-            elif isinstance(condition.value, float):
+            elif isinstance(value, float):
                 type_cast = "float"
             if type_cast is not None:
-                key = f"({key})::{type_cast}"
+                key = TypeCast(type_cast, key)

             item = {
                 "name": key,
                 "op": condition.op.value,
                 "value": condition.value,
             }
-            if key == "embeddings":
+            if is_embedding:
                 embedding_condition = item
             else:
                 filter_conditions.append(item)
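
Note: the dotted-path handling above now builds JSON operators as AST nodes instead of concatenating strings; a condition on "metadata.author.name" becomes roughly metadata -> 'author' ->> 'name'. A minimal sketch using the same imports as the diff:

    from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier

    parts = "metadata.author.name".split(".")
    key = Identifier(parts[0])
    for el in parts[1:-1]:  # intermediate elements use ->
        key = BinaryOperation(op="->", args=[key, Constant(el)])
    key = BinaryOperation(op="->>", args=[key, Constant(parts[-1])])  # last element as text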
@@ -205,64 +229,24 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
         """
         Construct where clauses from filter conditions
         """
-        if filter_conditions is None:
-            return ""

-        where_clauses = []
+        where_clause = None

         for item in filter_conditions:
             key = item["name"]

             if item["op"].lower() in ("in", "not in"):
-                values = list(repr(i) for i in item["value"])
-                item["value"] = "({})".format(", ".join(values))
+                values = [Constant(i) for i in item["value"]]
+                value = AstTuple(values)
             else:
-                if item["value"] is None:
-                    item["value"] = "null"
-                else:
-                    item["value"] = repr(item["value"])
-            where_clauses.append(f"{key} {item['op']} {item['value']}")
-
-        if len(where_clauses) > 1:
-            return f"WHERE {' AND '.join(where_clauses)}"
-        elif len(where_clauses) == 1:
-            return f"WHERE {where_clauses[0]}"
-        else:
-            return ""
-
-    @staticmethod
-    def _construct_where_clause_with_keywords(filter_conditions=None, keyword_query=None, content_column_name=None):
-        if not keyword_query or not content_column_name:
-            return PgVectorHandler._construct_where_clause(filter_conditions)
-
-        keyword_query_condition = (
-            f"""to_tsvector('english', {content_column_name}) @@ websearch_to_tsquery('english', '{keyword_query}')"""
-        )
-        if filter_conditions is None:
-            return ""
-
-        where_clauses = []
-
-        for item in filter_conditions:
-            key = item["name"]
+                value = Constant(item["value"])
+            condition = BinaryOperation(op=item["op"], args=[key, value])

-            if item["op"].lower() in ("in", "not in"):
-                values = list(repr(i) for i in item["value"])
-                item["value"] = "({})".format(", ".join(values))
+            if where_clause is None:
+                where_clause = condition
             else:
-                if item["value"] is None:
-                    item["value"] = "null"
-                else:
-                    item["value"] = repr(item["value"])
-                where_clauses.append(f"{key} {item['op']} {item['value']}")
-
-            where_clauses.append(keyword_query_condition)
-        if len(where_clauses) > 1:
-            return f"WHERE {' AND '.join(where_clauses)}"
-        elif len(where_clauses) == 1:
-            return f"WHERE {where_clauses[0]}"
-        else:
-            return ""
+                where_clause = BinaryOperation(op="AND", args=[where_clause, condition])
+        return where_clause

     @staticmethod
     def _construct_full_after_from_clause(
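
Note: with this rewrite, values are wrapped in Constant/Tuple nodes and conditions are AND-ed into a single BinaryOperation tree, leaving quoting and escaping to the SQL renderer instead of repr(). The chaining pattern in isolation (the condition dicts are illustrative):

    from mindsdb_sql_parser.ast import BinaryOperation, Constant, Identifier, Tuple as AstTuple

    items = [
        {"name": Identifier("id"), "op": "in", "value": [1, 2, 3]},
        {"name": Identifier("content"), "op": "=", "value": "hello"},
    ]
    where_clause = None
    for item in items:
        if item["op"].lower() in ("in", "not in"):
            value = AstTuple([Constant(i) for i in item["value"]])
        else:
            value = Constant(item["value"])
        condition = BinaryOperation(op=item["op"], args=[item["name"], value])
        where_clause = condition if where_clause is None else BinaryOperation(op="AND", args=[where_clause, condition])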
@@ -275,9 +259,8 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
     def _build_keyword_bm25_query(
         self,
         table_name: str,
-        query: str,
+        keyword_search_args: KeywordSearchArgs,
         columns: List[str] = None,
-        content_column_name: str = "content",
         conditions: List[FilterCondition] = None,
         limit: int = None,
         offset: int = None,
@@ -286,21 +269,44 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
             columns = ["id", "content", "metadata"]

         filter_conditions, _ = self._translate_conditions(conditions)
+        where_clause = self._construct_where_clause(filter_conditions)

-        # given filter conditions, construct where clause
-        where_clause = self._construct_where_clause_with_keywords(filter_conditions, query, content_column_name)
-
-        query = f"""
-            SELECT
-                {", ".join(columns)},
-                ts_rank_cd(to_tsvector('english', {content_column_name}), websearch_to_tsquery('english', '{query}')) as distance
-            FROM
-                {table_name}
-            {where_clause if where_clause else ""}
-            {f"LIMIT {limit}" if limit else ""}
-            {f"OFFSET {offset}" if offset else ""};"""
+        if keyword_search_args:
+            keyword_query_condition = BinaryOperation(
+                op="@@",
+                args=[
+                    Function("to_tsvector", args=[Constant("english"), Identifier(keyword_search_args.column)]),
+                    Function("websearch_to_tsquery", args=[Constant("english"), Constant(keyword_search_args.query)]),
+                ],
+            )

-        return query
+            if where_clause:
+                where_clause = BinaryOperation(op="AND", args=[where_clause, keyword_query_condition])
+            else:
+                where_clause = keyword_query_condition
+
+        distance = Function(
+            "ts_rank_cd",
+            args=[
+                Function("to_tsvector", args=[Constant("english"), Identifier(keyword_search_args.column)]),
+                Function("websearch_to_tsquery", args=[Constant("english"), Constant(keyword_search_args.query)]),
+            ],
+            alias=Identifier("distance"),
+        )
+
+        targets = [Identifier(col) for col in columns]
+        targets.append(distance)
+
+        limit_clause = Constant(limit) if limit else None
+        offset_clause = Constant(offset) if offset else None
+
+        return Select(
+            targets=targets,
+            from_table=Identifier(table_name),
+            where=where_clause,
+            limit=limit_clause,
+            offset=offset_clause,
+        )

     def _build_select_query(
         self,
@@ -309,12 +315,12 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
         conditions: List[FilterCondition] = None,
         limit: int = None,
         offset: int = None,
-    ) -> str:
+    ) -> Select:
         """
         given inputs, build string query
         """
-        limit_clause = f"LIMIT {limit}" if limit else ""
-        offset_clause = f"OFFSET {offset}" if offset else ""
+        limit_clause = Constant(limit) if limit else None
+        offset_clause = Constant(offset) if offset else None

         # translate filter conditions to dictionary
         filter_conditions, embedding_search = self._translate_conditions(conditions)
@@ -335,7 +341,15 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
             modified_columns = ["id", "content", "embeddings", "metadata"]
             has_distance = True

-        targets = ", ".join(modified_columns)
+        targets = [Identifier(col) for col in modified_columns]
+
+        query = Select(
+            targets=targets,
+            from_table=Identifier(table_name),
+            where=where_clause,
+            limit=limit_clause,
+            offset=offset_clause,
+        )

         if embedding_search:
             search_vector = embedding_search["value"]
@@ -352,15 +366,18 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
             if isinstance(search_vector, list):
                 search_vector = f"[{','.join(str(x) for x in search_vector)}]"

+            vector_op = BinaryOperation(
+                op=self.distance_op,
+                args=[Identifier("embeddings"), Constant(search_vector)],
+                alias=Identifier("distance"),
+            )
             # Calculate distance as part of the query if needed
             if has_distance:
-                targets = f"{targets}, (embeddings {self.distance_op} '{search_vector}') as distance"
+                query.targets.append(vector_op)

-            return f"SELECT {targets} FROM {table_name} {where_clause} ORDER BY embeddings {self.distance_op} '{search_vector}' ASC {limit_clause} {offset_clause} "
+            query.order_by = [OrderBy(vector_op, direction="ASC")]

-        else:
-            # if filter conditions, return rows that satisfy the conditions
-            return f"SELECT {targets} FROM {table_name} {where_clause} {limit_clause} {offset_clause}"
+        return query

     def _check_table(self, table_name: str):
         # Apply namespace for a user
@@ -386,8 +403,8 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
             columns = ["id", "content", "embeddings", "metadata"]

         query = self._build_select_query(table_name, columns, conditions, limit, offset)
-
-        result = self.raw_query(query)
+        query_str = self.renderer.get_string(query, with_failback=True)
+        result = self.raw_query(query_str)

         # ensure embeddings are returned as string so they can be parsed by mindsdb
         if "embeddings" in columns:
@@ -408,12 +425,10 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):

         if columns is None:
             columns = ["id", "content", "embeddings", "metadata"]
-        content_column_name = keyword_search_args.column
-        query = self._build_keyword_bm25_query(
-            table_name, keyword_search_args.query, columns, content_column_name, conditions, limit, offset
-        )

-        result = self.raw_query(query)
+        query = self._build_keyword_bm25_query(table_name, keyword_search_args, columns, conditions, limit, offset)
+        query_str = self.renderer.get_string(query, with_failback=True)
+        result = self.raw_query(query_str)

         # ensure embeddings are returned as string so they can be parsed by mindsdb
         if "embeddings" in columns:
@@ -622,8 +637,9 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
         filter_conditions, _ = self._translate_conditions(conditions)
         where_clause = self._construct_where_clause(filter_conditions)

-        query = f"DELETE FROM {table_name} {where_clause}"
-        self.raw_query(query)
+        query = Delete(table=Identifier(table_name), where=where_clause)
+        query_str = self.renderer.get_string(query, with_failback=True)
+        self.raw_query(query_str)

     def drop_table(self, table_name: str, if_exists=True):
         """
mindsdb/integrations/handlers/postgres_handler/postgres_handler.py

@@ -1,5 +1,3 @@
-import csv
-import io
 import time
 import json
 from typing import Optional, Any
@@ -625,7 +623,7 @@ class PostgresHandler(MetaDatabaseHandler):
         result = self.native_query(query)
         return result

-    def meta_get_column_statistics(self, table_names: Optional[list] = None) -> dict:
+    def meta_get_column_statistics(self, table_names: Optional[list] = None) -> Response:
         """
         Retrieves column statistics (e.g., most common values, frequencies, null percentage, and distinct value count)
         for the specified tables or all tables if no list is provided.
@@ -634,54 +632,58 @@ class PostgresHandler(MetaDatabaseHandler):
             table_names (list): A list of table names for which to retrieve column statistics.

         Returns:
-            dict: A dictionary containing the column statistics.
+            Response: A response object containing the column statistics.
         """
-        query = """
+        table_filter = ""
+        if table_names is not None and len(table_names) > 0:
+            quoted_names = [f"'{t}'" for t in table_names]
+            table_filter = f" AND ps.tablename IN ({','.join(quoted_names)})"
+
+        query = (
+            """
             SELECT
-                ps.attname AS column_name,
-                ps.tablename AS table_name,
-                ps.most_common_vals AS most_common_values,
-                ps.most_common_freqs::text AS most_common_frequencies,
-                ps.null_frac * 100 AS null_percentage,
-                ps.n_distinct AS distinct_values_count,
-                ps.histogram_bounds AS histogram_bounds
+                ps.tablename AS TABLE_NAME,
+                ps.attname AS COLUMN_NAME,
+                ROUND(ps.null_frac::numeric * 100, 2) AS NULL_PERCENTAGE,
+                CASE
+                    WHEN ps.n_distinct < 0 THEN NULL
+                    ELSE ps.n_distinct::bigint
+                END AS DISTINCT_VALUES_COUNT,
+                ps.most_common_vals AS MOST_COMMON_VALUES,
+                ps.most_common_freqs AS MOST_COMMON_FREQUENCIES,
+                ps.histogram_bounds
             FROM pg_stats ps
             WHERE ps.schemaname = current_schema()
                 AND ps.tablename NOT LIKE 'pg_%'
                 AND ps.tablename NOT LIKE 'sql_%'
             """
-
-        if table_names is not None and len(table_names) > 0:
-            table_names = [f"'{t}'" for t in table_names]
-            query += f" AND ps.tablename IN ({','.join(table_names)})"
+            + table_filter
+            + """
+            ORDER BY ps.tablename, ps.attname
+            """
+        )

         result = self.native_query(query)
-        df = result.data_frame

-        def parse_pg_array_string(x):
-            try:
-                return (
-                    [item.strip(" ,") for row in csv.reader(io.StringIO(x.strip("{}"))) for item in row if item.strip()]
-                    if x
-                    else []
-                )
-            except IndexError:
-                logger.error(f"Error parsing PostgreSQL array string: {x}")
-                return []
-
-        # Convert most_common_values and most_common_frequencies from string representation to lists.
-        df["most_common_values"] = df["most_common_values"].apply(lambda x: parse_pg_array_string(x))
-        df["most_common_frequencies"] = df["most_common_frequencies"].apply(lambda x: parse_pg_array_string(x))
-
-        # Get the minimum and maximum values from the histogram bounds.
-        df["minimum_value"] = df["histogram_bounds"].apply(lambda x: parse_pg_array_string(x)[0] if x else None)
-        df["maximum_value"] = df["histogram_bounds"].apply(lambda x: parse_pg_array_string(x)[-1] if x else None)
-
-        # Handle cases where distinct_values_count is negative (indicating an approximation).
-        df["distinct_values_count"] = df["distinct_values_count"].apply(lambda x: x if x >= 0 else None)
+        if result.type == RESPONSE_TYPE.TABLE and result.data_frame is not None:
+            df = result.data_frame

-        result.data_frame = df.drop(columns=["histogram_bounds"])
+            # Extract min/max from histogram bounds
+            def extract_min_max(histogram_str):
+                if histogram_str and str(histogram_str) != "nan":
+                    clean = str(histogram_str).strip("{}")
+                    if clean:
+                        values = clean.split(",")
+                        min_val = values[0].strip(" \"'") if values else None
+                        max_val = values[-1].strip(" \"'") if values else None
+                        return min_val, max_val
+                return None, None

+            min_max_values = df["histogram_bounds"].apply(extract_min_max)
+            df["MINIMUM_VALUE"] = min_max_values.apply(lambda x: x[0])
+            df["MAXIMUM_VALUE"] = min_max_values.apply(lambda x: x[1])
+
+            result.data_frame = df.drop(columns=["histogram_bounds"])
         return result

     def meta_get_primary_keys(self, table_names: Optional[list] = None) -> Response:
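
Note: in pg_stats, a negative n_distinct is an estimate expressed as a fraction of the row count (e.g. -1 means all rows are distinct), which is why the CASE expression above maps negatives to NULL rather than reporting a misleading count. histogram_bounds arrives as a Postgres array literal; a quick self-contained check of the parsing logic above (the input value is illustrative):

    def extract_min_max(histogram_str):
        if histogram_str and str(histogram_str) != "nan":
            clean = str(histogram_str).strip("{}")
            if clean:
                values = clean.split(",")
                return values[0].strip(" \"'"), values[-1].strip(" \"'")
        return None, None

    print(extract_min_max("{1,25,50,75,100}"))  # ('1', '100')
    print(extract_min_max(None))                # (None, None)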