MindsDB 25.7.3.0__py3-none-any.whl → 25.8.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of MindsDB might be problematic; see its registry page for details.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +11 -1
- mindsdb/api/a2a/common/server/server.py +16 -6
- mindsdb/api/executor/command_executor.py +215 -150
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +14 -3
- mindsdb/api/executor/planner/plan_join.py +3 -0
- mindsdb/api/executor/planner/plan_join_ts.py +117 -100
- mindsdb/api/executor/planner/query_planner.py +1 -0
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +54 -85
- mindsdb/api/executor/sql_query/steps/fetch_dataframe.py +21 -24
- mindsdb/api/executor/sql_query/steps/fetch_dataframe_partition.py +9 -3
- mindsdb/api/executor/sql_query/steps/subselect_step.py +11 -8
- mindsdb/api/executor/utilities/mysql_to_duckdb_functions.py +264 -0
- mindsdb/api/executor/utilities/sql.py +30 -0
- mindsdb/api/http/initialize.py +18 -44
- mindsdb/api/http/namespaces/agents.py +23 -20
- mindsdb/api/http/namespaces/chatbots.py +83 -120
- mindsdb/api/http/namespaces/file.py +1 -1
- mindsdb/api/http/namespaces/jobs.py +38 -60
- mindsdb/api/http/namespaces/tree.py +69 -61
- mindsdb/api/http/namespaces/views.py +56 -72
- mindsdb/api/mcp/start.py +2 -0
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +3 -2
- mindsdb/integrations/handlers/autogluon_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/autosklearn_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +25 -5
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +3 -3
- mindsdb/integrations/handlers/db2_handler/db2_handler.py +19 -23
- mindsdb/integrations/handlers/flaml_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/gong_handler/__about__.py +2 -0
- mindsdb/integrations/handlers/gong_handler/__init__.py +30 -0
- mindsdb/integrations/handlers/gong_handler/connection_args.py +37 -0
- mindsdb/integrations/handlers/gong_handler/gong_handler.py +164 -0
- mindsdb/integrations/handlers/gong_handler/gong_tables.py +508 -0
- mindsdb/integrations/handlers/gong_handler/icon.svg +25 -0
- mindsdb/integrations/handlers/gong_handler/test_gong_handler.py +125 -0
- mindsdb/integrations/handlers/google_calendar_handler/google_calendar_tables.py +82 -73
- mindsdb/integrations/handlers/hubspot_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/huggingface_handler/__init__.py +8 -12
- mindsdb/integrations/handlers/huggingface_handler/finetune.py +203 -223
- mindsdb/integrations/handlers/huggingface_handler/huggingface_handler.py +360 -383
- mindsdb/integrations/handlers/huggingface_handler/requirements.txt +7 -7
- mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +7 -7
- mindsdb/integrations/handlers/huggingface_handler/settings.py +25 -25
- mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +83 -77
- mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +5 -2
- mindsdb/integrations/handlers/litellm_handler/settings.py +2 -1
- mindsdb/integrations/handlers/openai_handler/constants.py +11 -30
- mindsdb/integrations/handlers/openai_handler/helpers.py +27 -34
- mindsdb/integrations/handlers/openai_handler/openai_handler.py +14 -12
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +106 -90
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +41 -39
- mindsdb/integrations/handlers/salesforce_handler/constants.py +215 -0
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +141 -80
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +0 -1
- mindsdb/integrations/handlers/tpot_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +32 -17
- mindsdb/integrations/handlers/web_handler/web_handler.py +19 -22
- mindsdb/integrations/libs/llm/config.py +0 -14
- mindsdb/integrations/libs/llm/utils.py +0 -15
- mindsdb/integrations/libs/vectordatabase_handler.py +10 -1
- mindsdb/integrations/utilities/files/file_reader.py +5 -19
- mindsdb/integrations/utilities/handler_utils.py +32 -12
- mindsdb/integrations/utilities/rag/rerankers/base_reranker.py +1 -1
- mindsdb/interfaces/agents/agents_controller.py +246 -149
- mindsdb/interfaces/agents/constants.py +0 -1
- mindsdb/interfaces/agents/langchain_agent.py +11 -6
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +4 -4
- mindsdb/interfaces/database/database.py +38 -13
- mindsdb/interfaces/database/integrations.py +20 -5
- mindsdb/interfaces/database/projects.py +174 -23
- mindsdb/interfaces/database/views.py +86 -60
- mindsdb/interfaces/jobs/jobs_controller.py +103 -110
- mindsdb/interfaces/knowledge_base/controller.py +33 -6
- mindsdb/interfaces/knowledge_base/evaluate.py +2 -1
- mindsdb/interfaces/knowledge_base/executor.py +24 -0
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +6 -10
- mindsdb/interfaces/knowledge_base/preprocessing/text_splitter.py +73 -0
- mindsdb/interfaces/query_context/context_controller.py +111 -145
- mindsdb/interfaces/skills/skills_controller.py +18 -6
- mindsdb/interfaces/storage/db.py +40 -6
- mindsdb/interfaces/variables/variables_controller.py +8 -15
- mindsdb/utilities/config.py +5 -3
- mindsdb/utilities/fs.py +54 -17
- mindsdb/utilities/functions.py +72 -60
- mindsdb/utilities/log.py +38 -6
- mindsdb/utilities/ps.py +7 -7
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/METADATA +282 -268
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/RECORD +94 -92
- mindsdb/integrations/handlers/anyscale_endpoints_handler/__about__.py +0 -9
- mindsdb/integrations/handlers/anyscale_endpoints_handler/__init__.py +0 -20
- mindsdb/integrations/handlers/anyscale_endpoints_handler/anyscale_endpoints_handler.py +0 -290
- mindsdb/integrations/handlers/anyscale_endpoints_handler/creation_args.py +0 -14
- mindsdb/integrations/handlers/anyscale_endpoints_handler/icon.svg +0 -4
- mindsdb/integrations/handlers/anyscale_endpoints_handler/requirements.txt +0 -2
- mindsdb/integrations/handlers/anyscale_endpoints_handler/settings.py +0 -51
- mindsdb/integrations/handlers/anyscale_endpoints_handler/tests/test_anyscale_endpoints_handler.py +0 -212
- /mindsdb/integrations/handlers/{anyscale_endpoints_handler/tests/__init__.py → gong_handler/requirements.txt} +0 -0
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.7.3.0.dist-info → mindsdb-25.8.2.0.dist-info}/top_level.txt +0 -0
```diff
--- a/mindsdb/integrations/handlers/openai_handler/helpers.py
+++ b/mindsdb/integrations/handlers/openai_handler/helpers.py
@@ -14,6 +14,7 @@ class PendingFT(openai.OpenAIError):
     """
     Custom exception to handle pending fine-tuning status.
     """
+
     message: str

     def __init__(self, message) -> None:
@@ -65,10 +66,7 @@ def retry_with_exponential_backoff(

     if isinstance(hour_budget, float) or isinstance(hour_budget, int):
         try:
-            max_retries = round(
-                (math.log((hour_budget * 3600) / initial_delay))
-                / math.log(exponential_base)
-            )
+            max_retries = round((math.log((hour_budget * 3600) / initial_delay)) / math.log(exponential_base))
         except ValueError:
             max_retries = 10
     else:
```
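The collapsed expression above solves `initial_delay * exponential_base ** n = hour_budget * 3600` for `n`, i.e. how many exponentially growing delays fit in the time budget. A minimal standalone sketch of that budget-to-retries math (the parameter defaults are assumed for illustration, not taken from the handler):

```python
import math

def max_retries_for_budget(hour_budget: float, initial_delay: float = 1.0, exponential_base: float = 2.0) -> int:
    # Number of exponentially growing delays that fit in the time budget:
    # solve initial_delay * exponential_base ** n == hour_budget * 3600 for n.
    return round(math.log((hour_budget * 3600) / initial_delay) / math.log(exponential_base))

print(max_retries_for_budget(1))  # 12: after twelve doublings of a 1s delay, ~4096s exceeds one hour
```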
```diff
@@ -81,22 +79,20 @@ def retry_with_exponential_backoff(

             except status_errors as e:
                 raise Exception(
-                    f
+                    f"Error status {e.status_code} raised by OpenAI API: {e.body.get('message', 'Please refer to `https://platform.openai.com/docs/guides/error-codes` for more information.')}"  # noqa
                 )  # noqa

             except wait_errors:
                 num_retries += 1
                 if num_retries > max_retries:
-                    raise Exception(
-                        f"Maximum number of retries ({max_retries}) exceeded."
-                    )
+                    raise Exception(f"Maximum number of retries ({max_retries}) exceeded.")
                 # Increment the delay and wait
                 delay *= exponential_base * (1 + jitter * random.random())
                 time.sleep(delay)

             except openai.OpenAIError as e:
                 raise Exception(
-                    f
+                    f"General {str(e)} error raised by OpenAI. Please refer to `https://platform.openai.com/docs/guides/error-codes` for more information."  # noqa
                 )

             except Exception as e:
@@ -107,7 +103,7 @@ def retry_with_exponential_backoff(
     return _retry_with_exponential_backoff


-def truncate_msgs_for_token_limit(messages: List[Dict], model_name: Text, max_tokens: int, truncate: Text =
+def truncate_msgs_for_token_limit(messages: List[Dict], model_name: Text, max_tokens: int, truncate: Text = "first"):
     """
     Truncates message list to fit within the token limit.
     The first message for chat completion models are general directives with the system role, which will ideally be kept at all times.
@@ -129,20 +125,18 @@ def truncate_msgs_for_token_limit(messages: List[Dict], model_name: Text, max_to
     except KeyError:
         # If the encoding is not found, defualt to cl100k_base.
         # This is applicable for handlers that extend the OpenAI handler such as Anyscale.
-        model_name =
-        encoder = tiktoken.get_encoding(
+        model_name = "gpt-3.5-turbo-0301"
+        encoder = tiktoken.get_encoding("cl100k_base")

     sys_priming = messages[0:1]
     n_tokens = count_tokens(messages, encoder, model_name)
     while n_tokens > max_tokens:
         if len(messages) == 2:
-            return messages[
-                :-1
-            ]  # edge case: if limit is surpassed by just one input, we remove initial instruction
+            return messages[:-1]  # edge case: if limit is surpassed by just one input, we remove initial instruction
         elif len(messages) == 1:
             return messages

-        if truncate ==
+        if truncate == "first":
             messages = sys_priming + messages[2:]
         else:
             messages = sys_priming + messages[1:-1]
@@ -151,7 +145,7 @@ def truncate_msgs_for_token_limit(messages: List[Dict], model_name: Text, max_to
     return messages


-def count_tokens(messages: List[Dict], encoder: tiktoken.core.Encoding, model_name: Text =
+def count_tokens(messages: List[Dict], encoder: tiktoken.core.Encoding, model_name: Text = "gpt-3.5-turbo-0301"):
     """
     Counts the number of tokens in a list of messages.

@@ -160,24 +154,23 @@ def count_tokens(messages: List[Dict], encoder: tiktoken.core.Encoding, model_na
         encoder: Tokenizer
         model_name: Model name
     """
-    if (
-
-
-        num_tokens = 0
-        for message in messages:
-            num_tokens += (
-                4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
-            )
-            for key, value in message.items():
-                num_tokens += len(encoder.encode(value))
-                if key == "name":  # if there's a name, the role is omitted
-                    num_tokens += -1  # role is always required and always 1 token
-            num_tokens += 2  # every reply is primed with <im_start>assistant
-        return num_tokens
+    if "gpt-3.5-turbo" in model_name:  # note: future models may deviate from this (only 0301 really complies)
+        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
+        tokens_per_name = -1
     else:
-
-
-
+        tokens_per_message = 3
+        tokens_per_name = 1
+
+    num_tokens = 0
+    for message in messages:
+        num_tokens += tokens_per_message
+
+        for key, value in message.items():
+            num_tokens += len(encoder.encode(value))
+            if key == "name":  # if there's a name, the role is omitted
+                num_tokens += tokens_per_name
+    num_tokens += 2  # every reply is primed with <im_start>assistant
+    return num_tokens


 def get_available_models(client) -> List[Text]:
```
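The rewritten `count_tokens` follows the usual OpenAI cookbook accounting: a fixed framing cost per message, a per-name adjustment, and two tokens of reply priming. A self-contained usage sketch (the sample messages are illustrative; requires the tiktoken package):

```python
import tiktoken

def count_tokens_sketch(messages, model_name="gpt-3.5-turbo-0301"):
    encoder = tiktoken.get_encoding("cl100k_base")
    # Older gpt-3.5-turbo snapshots use 4 framing tokens per message and drop
    # one token when an explicit "name" field replaces the role.
    if "gpt-3.5-turbo" in model_name:
        tokens_per_message, tokens_per_name = 4, -1
    else:
        tokens_per_message, tokens_per_name = 3, 1
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoder.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    return num_tokens + 2  # every reply is primed with <im_start>assistant

print(count_tokens_sketch([{"role": "system", "content": "You are terse."},
                           {"role": "user", "content": "Hi!"}]))
```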
```diff
--- a/mindsdb/integrations/handlers/openai_handler/openai_handler.py
+++ b/mindsdb/integrations/handlers/openai_handler/openai_handler.py
@@ -24,7 +24,7 @@ from mindsdb.integrations.handlers.openai_handler.helpers import (
     PendingFT,
 )
 from mindsdb.integrations.handlers.openai_handler.constants import (
-
+    CHAT_MODELS_PREFIXES,
     IMAGE_MODELS,
     FINETUNING_MODELS,
     OPENAI_API_BASE,
@@ -62,7 +62,6 @@ class OpenAIHandler(BaseMLEngine):
         self.rate_limit = 60  # requests per minute
         self.max_batch_size = 20
         self.default_max_tokens = 100
-        self.chat_completion_models = CHAT_MODELS
         self.supported_ft_models = FINETUNING_MODELS  # base models compatible with finetuning
         # For now this are only used for handlers that inherits OpenAIHandler and don't need to override base methods
         self.api_key_name = getattr(self, "api_key_name", self.name)
@@ -89,6 +88,13 @@ class OpenAIHandler(BaseMLEngine):
         client = self._get_client(api_key=api_key, base_url=api_base, org=org, args=connection_args)
         OpenAIHandler._check_client_connection(client)

+    @staticmethod
+    def is_chat_model(model_name):
+        for prefix in CHAT_MODELS_PREFIXES:
+            if model_name.startswith(prefix):
+                return True
+        return False
+
     @staticmethod
     def _check_client_connection(client: OpenAI) -> None:
         """
```
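Together with the constants change (constants.py is +11 -30 in the file list), this swaps an explicit `CHAT_MODELS` list for prefix matching. A quick sketch of the behaviour; the prefix values here are assumptions for illustration, not the actual contents of `CHAT_MODELS_PREFIXES`:

```python
# Illustrative only: the real prefixes live in openai_handler/constants.py.
CHAT_MODELS_PREFIXES = ["gpt-3.5-turbo", "gpt-4", "o1", "o3"]

def is_chat_model(model_name: str) -> bool:
    return any(model_name.startswith(p) for p in CHAT_MODELS_PREFIXES)

assert is_chat_model("gpt-4o-mini")  # matches the "gpt-4" prefix
assert not is_chat_model("text-embedding-3-small")
```

Prefix matching means newly released model snapshots are recognised without updating an enumerated list.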
```diff
@@ -350,11 +356,6 @@ class OpenAIHandler(BaseMLEngine):
             "user": pred_args.get("user", None),
         }

-        if args.get("mode", self.default_mode) != "default" and model_name not in self.chat_completion_models:
-            raise Exception(
-                f"Conversational modes are only available for the following models: {', '.join(self.chat_completion_models)}"
-            )  # noqa
-
         if args.get("prompt_template", False):
             prompts, empty_prompt_ids = get_completed_prompts(base_template, df, strict=strict_prompt_template)

@@ -515,7 +516,7 @@ class OpenAIHandler(BaseMLEngine):
             return _submit_image_completion(kwargs, prompts, api_args)
         elif model_name == "embedding":
             return _submit_embedding_completion(kwargs, prompts, api_args)
-        elif
+        elif self.is_chat_model(model_name):
             if model_name == "gpt-3.5-turbo-instruct":
                 return _submit_normal_completion(kwargs, prompts, api_args)
             else:
@@ -579,13 +580,14 @@ class OpenAIHandler(BaseMLEngine):
                 tidy_comps.append(c.text.strip("\n").strip(""))
             return tidy_comps

-        kwargs["prompt"] = prompts
         kwargs = {**kwargs, **api_args}

         before_openai_query(kwargs)
-
-
-
+        responses = []
+        for prompt in prompts:
+            responses.extend(_tidy(client.completions.create(prompt=prompt, **kwargs)))
+        _log_api_call(kwargs, responses)
+        return responses

 def _submit_embedding_completion(kwargs: Dict, prompts: List[Text], api_args: Dict) -> List[float]:
     """
```
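The last hunk switches normal completions from one batched prompt list to one API call per prompt. A minimal sketch of that loop against the modern OpenAI Python client (model and prompts are illustrative; expects `OPENAI_API_KEY` in the environment):

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

prompts = ["Say hi.", "Name a colour."]
responses = []
for prompt in prompts:
    completion = client.completions.create(model="gpt-3.5-turbo-instruct", prompt=prompt, max_tokens=16)
    # One completion object per prompt; collect the text choices in order.
    responses.extend(c.text.strip() for c in completion.choices)
print(responses)
```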
```diff
--- a/mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py
+++ b/mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py
@@ -5,7 +5,19 @@ from urllib.parse import urlparse

 import pandas as pd
 import psycopg
-from mindsdb_sql_parser.ast import
+from mindsdb_sql_parser.ast import (
+    Parameter,
+    Identifier,
+    BinaryOperation,
+    Tuple as AstTuple,
+    Constant,
+    Select,
+    OrderBy,
+    TypeCast,
+    Delete,
+    Update,
+    Function,
+)
 from pgvector.psycopg import register_vector

 from mindsdb.integrations.handlers.postgres_handler.postgres_handler import (
@@ -17,6 +29,7 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
     VectorStoreHandler,
     DistanceFunction,
     TableField,
+    FilterOperator,
 )
 from mindsdb.integrations.libs.keyword_search_base import KeywordSearchBase
 from mindsdb.integrations.utilities.sql_utils import KeywordSearchArgs
@@ -169,31 +182,42 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
         embedding_condition = None

         for condition in conditions:
+            is_embedding = condition.column == "embeddings"
+
             parts = condition.column.split(".")
-            key = parts[0]
+            key = Identifier(parts[0])
+
             # converts 'col.el1.el2' to col->'el1'->>'el2'
             if len(parts) > 1:
                 # intermediate elements
                 for el in parts[1:-1]:
-                    key
+                    key = BinaryOperation(op="->", args=[key, Constant(el)])

                 # last element
-                key
+                key = BinaryOperation(op="->>", args=[key, Constant(parts[-1])])

             type_cast = None
-
+            value = condition.value
+            if (
+                isinstance(value, list)
+                and len(value) > 0
+                and condition.op in (FilterOperator.IN, FilterOperator.NOT_IN)
+            ):
+                value = condition.value[0]
+
+            if isinstance(value, int):
                 type_cast = "int"
-            elif isinstance(
+            elif isinstance(value, float):
                 type_cast = "float"
             if type_cast is not None:
-                key =
+                key = TypeCast(type_cast, key)

             item = {
                 "name": key,
                 "op": condition.op.value,
                 "value": condition.value,
             }
-            if
+            if is_embedding:
                 embedding_condition = item
             else:
                 filter_conditions.append(item)
```
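The hunk above replaces string concatenation with nested `BinaryOperation` nodes when translating a dotted metadata path. A dependency-free sketch of the same folding, using tiny stand-in classes instead of mindsdb_sql_parser's AST:

```python
# Minimal stand-ins for the AST nodes used in the hunk (illustrative only).
class Constant:
    def __init__(self, value): self.value = value
    def render(self): return repr(self.value)

class Identifier:
    def __init__(self, name): self.name = name
    def render(self): return self.name

class BinaryOperation:
    def __init__(self, op, args): self.op, self.args = op, args
    def render(self): return f"{self.args[0].render()}{self.op}{self.args[1].render()}"

def json_path_key(column: str):
    parts = column.split(".")
    key = Identifier(parts[0])
    if len(parts) > 1:
        for el in parts[1:-1]:
            key = BinaryOperation("->", [key, Constant(el)])  # intermediate elements stay jsonb
        key = BinaryOperation("->>", [key, Constant(parts[-1])])  # last element extracted as text
    return key

print(json_path_key("metadata.author.name").render())  # metadata->'author'->>'name'
```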
```diff
@@ -205,64 +229,24 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
         """
         Construct where clauses from filter conditions
         """
-        if filter_conditions is None:
-            return ""

-
+        where_clause = None

         for item in filter_conditions:
             key = item["name"]

             if item["op"].lower() in ("in", "not in"):
-                values =
-
+                values = [Constant(i) for i in item["value"]]
+                value = AstTuple(values)
             else:
-
-
-            else:
-                item["value"] = repr(item["value"])
-            where_clauses.append(f"{key} {item['op']} {item['value']}")
-
-        if len(where_clauses) > 1:
-            return f"WHERE {' AND '.join(where_clauses)}"
-        elif len(where_clauses) == 1:
-            return f"WHERE {where_clauses[0]}"
-        else:
-            return ""
-
-    @staticmethod
-    def _construct_where_clause_with_keywords(filter_conditions=None, keyword_query=None, content_column_name=None):
-        if not keyword_query or not content_column_name:
-            return PgVectorHandler._construct_where_clause(filter_conditions)
-
-        keyword_query_condition = (
-            f"""to_tsvector('english', {content_column_name}) @@ websearch_to_tsquery('english', '{keyword_query}')"""
-        )
-        if filter_conditions is None:
-            return ""
-
-        where_clauses = []
-
-        for item in filter_conditions:
-            key = item["name"]
+                value = Constant(item["value"])
+            condition = BinaryOperation(op=item["op"], args=[key, value])

-            if
-
-                item["value"] = "({})".format(", ".join(values))
+            if where_clause is None:
+                where_clause = condition
             else:
-
-
-            else:
-                item["value"] = repr(item["value"])
-            where_clauses.append(f"{key} {item['op']} {item['value']}")
-
-            where_clauses.append(keyword_query_condition)
-        if len(where_clauses) > 1:
-            return f"WHERE {' AND '.join(where_clauses)}"
-        elif len(where_clauses) == 1:
-            return f"WHERE {where_clauses[0]}"
-        else:
-            return ""
+                where_clause = BinaryOperation(op="AND", args=[where_clause, condition])
+        return where_clause

     @staticmethod
     def _construct_full_after_from_clause(
```
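`_construct_where_clause` now returns a single AST node rather than a `WHERE ...` string, folding conditions left-to-right under `AND`. The same fold with tuples standing in for AST nodes:

```python
# Illustrative stand-in: ("AND", lhs, rhs) plays the role of BinaryOperation.
def construct_where_clause(filter_conditions):
    where_clause = None
    for item in filter_conditions:
        condition = (item["op"], item["name"], item["value"])
        # Fold each condition into one tree: c1 AND c2 AND c3 ...
        where_clause = condition if where_clause is None else ("AND", where_clause, condition)
    return where_clause

print(construct_where_clause([
    {"name": "author", "op": "=", "value": "bob"},
    {"name": "year", "op": ">", "value": 2020},
]))
# ('AND', ('=', 'author', 'bob'), ('>', 'year', 2020))
```

Building an AST and handing it to the SQL renderer also sidesteps the `repr()`-based value quoting the deleted code relied on.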
```diff
@@ -275,9 +259,8 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
     def _build_keyword_bm25_query(
         self,
         table_name: str,
-
+        keyword_search_args: KeywordSearchArgs,
         columns: List[str] = None,
-        content_column_name: str = "content",
         conditions: List[FilterCondition] = None,
         limit: int = None,
         offset: int = None,
@@ -286,21 +269,44 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
             columns = ["id", "content", "metadata"]

         filter_conditions, _ = self._translate_conditions(conditions)
+        where_clause = self._construct_where_clause(filter_conditions)

-
-
-
-
-
-
-
-
-            {table_name}
-            {where_clause if where_clause else ""}
-            {f"LIMIT {limit}" if limit else ""}
-            {f"OFFSET {offset}" if offset else ""};"""
+        if keyword_search_args:
+            keyword_query_condition = BinaryOperation(
+                op="@@",
+                args=[
+                    Function("to_tsvector", args=[Constant("english"), Identifier(keyword_search_args.column)]),
+                    Function("websearch_to_tsquery", args=[Constant("english"), Constant(keyword_search_args.query)]),
+                ],
+            )

-
+            if where_clause:
+                where_clause = BinaryOperation(op="AND", args=[where_clause, keyword_query_condition])
+            else:
+                where_clause = keyword_query_condition
+
+        distance = Function(
+            "ts_rank_cd",
+            args=[
+                Function("to_tsvector", args=[Constant("english"), Identifier(keyword_search_args.column)]),
+                Function("websearch_to_tsquery", args=[Constant("english"), Constant(keyword_search_args.query)]),
+            ],
+            alias=Identifier("distance"),
+        )
+
+        targets = [Identifier(col) for col in columns]
+        targets.append(distance)
+
+        limit_clause = Constant(limit) if limit else None
+        offset_clause = Constant(offset) if offset else None
+
+        return Select(
+            targets=targets,
+            from_table=Identifier(table_name),
+            where=where_clause,
+            limit=limit_clause,
+            offset=offset_clause,
+        )

     def _build_select_query(
         self,
```
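For orientation, the keyword branch above should render to SQL of roughly this shape once the renderer serialises the AST; the table name, column, and search text below are invented for illustration, and exact quoting depends on the renderer:

```python
# Approximate rendered form of the BM25-style keyword query built above (illustrative values).
rendered_sketch = """
SELECT id, content, metadata,
       ts_rank_cd(to_tsvector('english', content),
                  websearch_to_tsquery('english', 'vector databases')) AS distance
FROM my_kb_chunks
WHERE to_tsvector('english', content) @@ websearch_to_tsquery('english', 'vector databases')
LIMIT 10
"""
```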
```diff
@@ -309,12 +315,12 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
         conditions: List[FilterCondition] = None,
         limit: int = None,
         offset: int = None,
-    ) ->
+    ) -> Select:
         """
         given inputs, build string query
         """
-        limit_clause =
-        offset_clause =
+        limit_clause = Constant(limit) if limit else None
+        offset_clause = Constant(offset) if offset else None

         # translate filter conditions to dictionary
         filter_conditions, embedding_search = self._translate_conditions(conditions)
@@ -335,7 +341,15 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
         modified_columns = ["id", "content", "embeddings", "metadata"]
         has_distance = True

-        targets =
+        targets = [Identifier(col) for col in modified_columns]
+
+        query = Select(
+            targets=targets,
+            from_table=Identifier(table_name),
+            where=where_clause,
+            limit=limit_clause,
+            offset=offset_clause,
+        )

         if embedding_search:
             search_vector = embedding_search["value"]
@@ -352,15 +366,18 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
         if isinstance(search_vector, list):
             search_vector = f"[{','.join(str(x) for x in search_vector)}]"

+        vector_op = BinaryOperation(
+            op=self.distance_op,
+            args=[Identifier("embeddings"), Constant(search_vector)],
+            alias=Identifier("distance"),
+        )
         # Calculate distance as part of the query if needed
         if has_distance:
-            targets
+            query.targets.append(vector_op)

-
+        query.order_by = [OrderBy(vector_op, direction="ASC")]

-
-        # if filter conditions, return rows that satisfy the conditions
-        return f"SELECT {targets} FROM {table_name} {where_clause} {limit_clause} {offset_clause}"
+        return query

     def _check_table(self, table_name: str):
         # Apply namespace for a user
```
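The vector branch appends a distance expression and orders by it ascending; whichever operator `self.distance_op` resolves to (pgvector's `<->` is L2 distance, for instance), the result is plain nearest-neighbour ranking. The same ranking in pure Python, for intuition:

```python
# ORDER BY distance ASC is nearest-neighbour ranking; here with Euclidean
# distance, matching pgvector's <-> operator.
def nearest(rows, query_vec, k=2):
    def dist(v):
        return sum((a - b) ** 2 for a, b in zip(v, query_vec)) ** 0.5
    return sorted(rows, key=lambda r: dist(r["embeddings"]))[:k]

rows = [{"id": 1, "embeddings": [0.0, 1.0]},
        {"id": 2, "embeddings": [0.9, 0.1]},
        {"id": 3, "embeddings": [0.5, 0.5]}]
print([r["id"] for r in nearest(rows, [1.0, 0.0])])  # [2, 3]
```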
```diff
@@ -386,8 +403,8 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
         columns = ["id", "content", "embeddings", "metadata"]

         query = self._build_select_query(table_name, columns, conditions, limit, offset)
-
-        result = self.raw_query(
+        query_str = self.renderer.get_string(query, with_failback=True)
+        result = self.raw_query(query_str)

         # ensure embeddings are returned as string so they can be parsed by mindsdb
         if "embeddings" in columns:
@@ -408,12 +425,10 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):

         if columns is None:
             columns = ["id", "content", "embeddings", "metadata"]
-        content_column_name = keyword_search_args.column
-        query = self._build_keyword_bm25_query(
-            table_name, keyword_search_args.query, columns, content_column_name, conditions, limit, offset
-        )

-
+        query = self._build_keyword_bm25_query(table_name, keyword_search_args, columns, conditions, limit, offset)
+        query_str = self.renderer.get_string(query, with_failback=True)
+        result = self.raw_query(query_str)

         # ensure embeddings are returned as string so they can be parsed by mindsdb
         if "embeddings" in columns:
@@ -622,8 +637,9 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
         filter_conditions, _ = self._translate_conditions(conditions)
         where_clause = self._construct_where_clause(filter_conditions)

-        query =
-        self.
+        query = Delete(table=Identifier(table_name), where=where_clause)
+        query_str = self.renderer.get_string(query, with_failback=True)
+        self.raw_query(query_str)

     def drop_table(self, table_name: str, if_exists=True):
         """
```
```diff
--- a/mindsdb/integrations/handlers/postgres_handler/postgres_handler.py
+++ b/mindsdb/integrations/handlers/postgres_handler/postgres_handler.py
@@ -1,5 +1,3 @@
-import csv
-import io
 import time
 import json
 from typing import Optional, Any
@@ -625,7 +623,7 @@ class PostgresHandler(MetaDatabaseHandler):
         result = self.native_query(query)
         return result

-    def meta_get_column_statistics(self, table_names: Optional[list] = None) ->
+    def meta_get_column_statistics(self, table_names: Optional[list] = None) -> Response:
         """
         Retrieves column statistics (e.g., most common values, frequencies, null percentage, and distinct value count)
         for the specified tables or all tables if no list is provided.
```
```diff
@@ -634,54 +632,58 @@ class PostgresHandler(MetaDatabaseHandler):
             table_names (list): A list of table names for which to retrieve column statistics.

         Returns:
-
+            Response: A response object containing the column statistics.
         """
-
+        table_filter = ""
+        if table_names is not None and len(table_names) > 0:
+            quoted_names = [f"'{t}'" for t in table_names]
+            table_filter = f" AND ps.tablename IN ({','.join(quoted_names)})"
+
+        query = (
+            """
             SELECT
-                ps.
-                ps.
-                ps.
-
-
-
-
+                ps.tablename AS TABLE_NAME,
+                ps.attname AS COLUMN_NAME,
+                ROUND(ps.null_frac::numeric * 100, 2) AS NULL_PERCENTAGE,
+                CASE
+                    WHEN ps.n_distinct < 0 THEN NULL
+                    ELSE ps.n_distinct::bigint
+                END AS DISTINCT_VALUES_COUNT,
+                ps.most_common_vals AS MOST_COMMON_VALUES,
+                ps.most_common_freqs AS MOST_COMMON_FREQUENCIES,
+                ps.histogram_bounds
             FROM pg_stats ps
             WHERE ps.schemaname = current_schema()
                 AND ps.tablename NOT LIKE 'pg_%'
                 AND ps.tablename NOT LIKE 'sql_%'
             """
-
-
-
-
+            + table_filter
+            + """
+            ORDER BY ps.tablename, ps.attname
+            """
+        )

         result = self.native_query(query)
-        df = result.data_frame

-
-
-        return (
-            [item.strip(" ,") for row in csv.reader(io.StringIO(x.strip("{}"))) for item in row if item.strip()]
-            if x
-            else []
-        )
-        except IndexError:
-            logger.error(f"Error parsing PostgreSQL array string: {x}")
-            return []
-
-        # Convert most_common_values and most_common_frequencies from string representation to lists.
-        df["most_common_values"] = df["most_common_values"].apply(lambda x: parse_pg_array_string(x))
-        df["most_common_frequencies"] = df["most_common_frequencies"].apply(lambda x: parse_pg_array_string(x))
-
-        # Get the minimum and maximum values from the histogram bounds.
-        df["minimum_value"] = df["histogram_bounds"].apply(lambda x: parse_pg_array_string(x)[0] if x else None)
-        df["maximum_value"] = df["histogram_bounds"].apply(lambda x: parse_pg_array_string(x)[-1] if x else None)
-
-        # Handle cases where distinct_values_count is negative (indicating an approximation).
-        df["distinct_values_count"] = df["distinct_values_count"].apply(lambda x: x if x >= 0 else None)
+        if result.type == RESPONSE_TYPE.TABLE and result.data_frame is not None:
+            df = result.data_frame

-
+            # Extract min/max from histogram bounds
+            def extract_min_max(histogram_str):
+                if histogram_str and str(histogram_str) != "nan":
+                    clean = str(histogram_str).strip("{}")
+                    if clean:
+                        values = clean.split(",")
+                        min_val = values[0].strip(" \"'") if values else None
+                        max_val = values[-1].strip(" \"'") if values else None
+                        return min_val, max_val
+                return None, None

+            min_max_values = df["histogram_bounds"].apply(extract_min_max)
+            df["MINIMUM_VALUE"] = min_max_values.apply(lambda x: x[0])
+            df["MAXIMUM_VALUE"] = min_max_values.apply(lambda x: x[1])
+
+            result.data_frame = df.drop(columns=["histogram_bounds"])
         return result

     def meta_get_primary_keys(self, table_names: Optional[list] = None) -> Response:
```
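pg_stats exposes `histogram_bounds` as a Postgres array literal whose first and last entries bound the column's observed values; the new code simply peels those off. A standalone run of the same extraction:

```python
def extract_min_max(histogram_str):
    # pg_stats renders histogram_bounds as a Postgres array literal, e.g. "{1,5,23,100}";
    # its first and last bounds approximate the column's min and max.
    if histogram_str and str(histogram_str) != "nan":
        clean = str(histogram_str).strip("{}")
        if clean:
            values = clean.split(",")
            return values[0].strip(" \"'"), values[-1].strip(" \"'")
    return None, None

print(extract_min_max("{1,5,23,100}"))   # ('1', '100')
print(extract_min_max('{"a","b","z"}'))  # ('a', 'z')
print(extract_min_max(None))             # (None, None)
```

Note that the naive `split(",")` would mis-split text bounds that themselves contain commas; for coarse min/max metadata that trade-off is presumably acceptable.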