MindsDB 25.7.1.0__py3-none-any.whl → 25.7.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +54 -95
- mindsdb/api/a2a/agent.py +30 -206
- mindsdb/api/a2a/common/server/server.py +26 -27
- mindsdb/api/a2a/task_manager.py +93 -227
- mindsdb/api/a2a/utils.py +21 -0
- mindsdb/api/executor/command_executor.py +7 -2
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +5 -1
- mindsdb/api/executor/utilities/sql.py +97 -21
- mindsdb/api/http/namespaces/agents.py +127 -202
- mindsdb/api/http/namespaces/config.py +12 -1
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +11 -1
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +94 -1
- mindsdb/integrations/handlers/s3_handler/s3_handler.py +72 -70
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +4 -3
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +12 -3
- mindsdb/integrations/handlers/slack_handler/slack_tables.py +141 -161
- mindsdb/integrations/handlers/youtube_handler/youtube_tables.py +183 -55
- mindsdb/integrations/libs/keyword_search_base.py +41 -0
- mindsdb/integrations/libs/vectordatabase_handler.py +35 -14
- mindsdb/integrations/utilities/sql_utils.py +11 -0
- mindsdb/interfaces/agents/agents_controller.py +2 -2
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +18 -4
- mindsdb/interfaces/database/projects.py +1 -3
- mindsdb/interfaces/functions/controller.py +54 -64
- mindsdb/interfaces/functions/to_markdown.py +47 -14
- mindsdb/interfaces/knowledge_base/controller.py +134 -35
- mindsdb/interfaces/knowledge_base/evaluate.py +53 -10
- mindsdb/interfaces/knowledge_base/llm_client.py +3 -3
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +21 -13
- mindsdb/utilities/config.py +46 -39
- mindsdb/utilities/exception.py +11 -0
- {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/METADATA +236 -236
- {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/RECORD +38 -36
- {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.7.1.0.dist-info → mindsdb-25.7.3.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py

@@ -18,6 +18,8 @@ from mindsdb.integrations.libs.vectordatabase_handler import (
     DistanceFunction,
     TableField,
 )
+from mindsdb.integrations.libs.keyword_search_base import KeywordSearchBase
+from mindsdb.integrations.utilities.sql_utils import KeywordSearchArgs
 from mindsdb.utilities import log
 from mindsdb.utilities.profiler import profiler
 from mindsdb.utilities.context import context as ctx

@@ -26,7 +28,7 @@ logger = log.getLogger(__name__)


 # todo Issue #7316 add support for different indexes and search algorithms e.g. cosine similarity or L2 norm
-class PgVectorHandler(PostgresHandler, VectorStoreHandler):
+class PgVectorHandler(PostgresHandler, VectorStoreHandler, KeywordSearchBase):
     """This handler handles connection and execution of the PostgreSQL with pgvector extension statements."""

     name = "pgvector"
@@ -228,6 +230,40 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
         else:
             return ""

+    @staticmethod
+    def _construct_where_clause_with_keywords(filter_conditions=None, keyword_query=None, content_column_name=None):
+        if not keyword_query or not content_column_name:
+            return PgVectorHandler._construct_where_clause(filter_conditions)
+
+        keyword_query_condition = (
+            f"""to_tsvector('english', {content_column_name}) @@ websearch_to_tsquery('english', '{keyword_query}')"""
+        )
+        if filter_conditions is None:
+            return ""
+
+        where_clauses = []
+
+        for item in filter_conditions:
+            key = item["name"]
+
+            if item["op"].lower() in ("in", "not in"):
+                values = list(repr(i) for i in item["value"])
+                item["value"] = "({})".format(", ".join(values))
+            else:
+                if item["value"] is None:
+                    item["value"] = "null"
+                else:
+                    item["value"] = repr(item["value"])
+            where_clauses.append(f"{key} {item['op']} {item['value']}")
+
+        where_clauses.append(keyword_query_condition)
+        if len(where_clauses) > 1:
+            return f"WHERE {' AND '.join(where_clauses)}"
+        elif len(where_clauses) == 1:
+            return f"WHERE {where_clauses[0]}"
+        else:
+            return ""
+
     @staticmethod
     def _construct_full_after_from_clause(
         where_clause: str,
@@ -236,6 +272,36 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
     ) -> str:
         return f"{where_clause} {offset_clause} {limit_clause}"

+    def _build_keyword_bm25_query(
+        self,
+        table_name: str,
+        query: str,
+        columns: List[str] = None,
+        content_column_name: str = "content",
+        conditions: List[FilterCondition] = None,
+        limit: int = None,
+        offset: int = None,
+    ):
+        if columns is None:
+            columns = ["id", "content", "metadata"]
+
+        filter_conditions, _ = self._translate_conditions(conditions)
+
+        # given filter conditions, construct where clause
+        where_clause = self._construct_where_clause_with_keywords(filter_conditions, query, content_column_name)
+
+        query = f"""
+            SELECT
+                {", ".join(columns)},
+                ts_rank_cd(to_tsvector('english', {content_column_name}), websearch_to_tsquery('english', '{query}')) as distance
+            FROM
+                {table_name}
+            {where_clause if where_clause else ""}
+            {f"LIMIT {limit}" if limit else ""}
+            {f"OFFSET {offset}" if offset else ""};"""
+
+        return query
+
     def _build_select_query(
         self,
         table_name: str,
@@ -320,6 +386,33 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
             columns = ["id", "content", "embeddings", "metadata"]

         query = self._build_select_query(table_name, columns, conditions, limit, offset)
+
+        result = self.raw_query(query)
+
+        # ensure embeddings are returned as string so they can be parsed by mindsdb
+        if "embeddings" in columns:
+            result["embeddings"] = result["embeddings"].astype(str)
+
+        return result
+
+    def keyword_select(
+        self,
+        table_name: str,
+        columns: List[str] = None,
+        conditions: List[FilterCondition] = None,
+        offset: int = None,
+        limit: int = None,
+        keyword_search_args: KeywordSearchArgs = None,
+    ) -> pd.DataFrame:
+        table_name = self._check_table(table_name)
+
+        if columns is None:
+            columns = ["id", "content", "embeddings", "metadata"]
+        content_column_name = keyword_search_args.column
+        query = self._build_keyword_bm25_query(
+            table_name, keyword_search_args.query, columns, content_column_name, conditions, limit, offset
+        )
+
         result = self.raw_query(query)

         # ensure embeddings are returned as string so they can be parsed by mindsdb
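For orientation, the new `_build_keyword_bm25_query` template above renders to ordinary PostgreSQL full-text search, with `ts_rank_cd` (a relevance score, higher is better) returned under the `distance` column the vector-store interface expects. A rough sketch of the rendered SQL, assuming a hypothetical table `my_kb_table`, the search string 'vector search', a single metadata filter, and `limit=10` (all of these names and values are illustrative, not taken from the release):

    SELECT
        id, content, metadata,
        ts_rank_cd(to_tsvector('english', content),
                   websearch_to_tsquery('english', 'vector search')) AS distance
    FROM
        my_kb_table
    WHERE metadata->>'source' = 'docs'
      AND to_tsvector('english', content) @@ websearch_to_tsquery('english', 'vector search')
    LIMIT 10;

Note that, as written, the keyword condition is only attached when filter conditions are supplied; with no filters the helper returns an empty WHERE clause and the query falls back to a plain ranked scan.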
mindsdb/integrations/handlers/s3_handler/s3_handler.py

@@ -7,6 +7,7 @@ from duckdb import HTTPException
 from mindsdb_sql_parser import parse_sql
 import pandas as pd
 from typing import Text, Dict, Optional
+from botocore.client import Config
 from botocore.exceptions import ClientError

 from mindsdb_sql_parser.ast.base import ASTNode

@@ -16,7 +17,7 @@ from mindsdb.utilities import log
 from mindsdb.integrations.libs.response import (
     HandlerStatusResponse as StatusResponse,
     HandlerResponse as Response,
-    RESPONSE_TYPE
+    RESPONSE_TYPE,
 )

 from mindsdb.integrations.libs.api_handler import APIResource, APIHandler

@@ -26,16 +27,12 @@ logger = log.getLogger(__name__)


 class ListFilesTable(APIResource):
-
-
-
-        conditions: List[FilterCondition] = None,
-        limit: int = None,
-        *args, **kwargs) -> pd.DataFrame:
-
+    def list(
+        self, targets: List[str] = None, conditions: List[FilterCondition] = None, limit: int = None, *args, **kwargs
+    ) -> pd.DataFrame:
         buckets = None
         for condition in conditions:
-            if condition.column ==
+            if condition.column == "bucket":
                 if condition.op == FilterOperator.IN:
                     buckets = condition.value
                 elif condition.op == FilterOperator.EQUAL:
@@ -44,25 +41,27 @@ class ListFilesTable(APIResource):

         data = []
         for obj in self.handler.get_objects(limit=limit, buckets=buckets):
-            path = obj[
-            path = path.replace(
+            path = obj["Key"]
+            path = path.replace("`", "")
             item = {
-
-
-
-
+                "path": path,
+                "bucket": obj["Bucket"],
+                "name": path[path.rfind("/") + 1 :],
+                "extension": path[path.rfind(".") + 1 :],
             }

+            if targets and "public_url" in targets:
+                item["public_url"] = self.handler.generate_sas_url(path, obj["Bucket"])
+
             data.append(item)

         return pd.DataFrame(data=data, columns=self.get_columns())

     def get_columns(self) -> List[str]:
-        return ["path", "name", "extension", "bucket", "content"]
+        return ["path", "name", "extension", "bucket", "content", "public_url"]


 class FileTable(APIResource):
-
     def list(self, targets: List[str] = None, table_name=None, *args, **kwargs) -> pd.DataFrame:
         return self.handler.read_as_table(table_name)
@@ -76,9 +75,9 @@ class S3Handler(APIHandler):
     This handler handles connection and execution of the SQL statements on AWS S3.
     """

-    name =
+    name = "s3"
     # TODO: Can other file formats be supported?
-    supported_file_formats = [
+    supported_file_formats = ["csv", "tsv", "json", "parquet"]

     def __init__(self, name: Text, connection_data: Optional[Dict], **kwargs):
         """

@@ -96,7 +95,7 @@ class S3Handler(APIHandler):
         self.connection = None
         self.is_connected = False
         self.thread_safe = True
-        self.bucket = self.connection_data.get(
+        self.bucket = self.connection_data.get("bucket")
         self._regions = {}

         self._files_table = ListFilesTable(self)

@@ -119,8 +118,8 @@ class S3Handler(APIHandler):
             return self.connection

         # Validate mandatory parameters.
-        if not all(key in self.connection_data for key in [
-            raise ValueError(
+        if not all(key in self.connection_data for key in ["aws_access_key_id", "aws_secret_access_key"]):
+            raise ValueError("Required parameters (aws_access_key_id, aws_secret_access_key) must be provided.")

         # Connect to S3 and configure mandatory credentials.
         self.connection = self._connect_boto3()

@@ -152,13 +151,13 @@ class S3Handler(APIHandler):
         duckdb_conn.execute(f"SET s3_secret_access_key='{self.connection_data['aws_secret_access_key']}'")

         # Configure optional parameters.
-        if
+        if "aws_session_token" in self.connection_data:
             duckdb_conn.execute(f"SET s3_session_token='{self.connection_data['aws_session_token']}'")

         # detect region for bucket
         if bucket not in self._regions:
             client = self.connect()
-            self._regions[bucket] = client.get_bucket_location(Bucket=bucket)[
+            self._regions[bucket] = client.get_bucket_location(Bucket=bucket)["LocationConstraint"]

         region = self._regions[bucket]
         duckdb_conn.execute(f"SET s3_region='{region}'")
@@ -177,15 +176,17 @@ class S3Handler(APIHandler):
         """
         # Configure mandatory credentials.
         config = {
-
-
+            "aws_access_key_id": self.connection_data["aws_access_key_id"],
+            "aws_secret_access_key": self.connection_data["aws_secret_access_key"],
         }

         # Configure optional parameters.
-
-
+        optional_parameters = ["region_name", "aws_session_token"]
+        for parameter in optional_parameters:
+            if parameter in self.connection_data:
+                config[parameter] = self.connection_data[parameter]

-        client = boto3.client(
+        client = boto3.client("s3", **config, config=Config(signature_version="s3v4"))

         # check connection
         if self.bucket is not None:
@@ -219,7 +220,7 @@ class S3Handler(APIHandler):
             self._connect_boto3()
             response.success = True
         except (ClientError, ValueError) as e:
-            logger.error(f
+            logger.error(f"Error connecting to S3 with the given credentials, {e}!")
             response.error_message = str(e)

         if response.success and need_to_close:

@@ -235,8 +236,8 @@ class S3Handler(APIHandler):
             return self.bucket, key

         # get bucket from first part of the key
-        ar = key.split(
-        return ar[0],
+        ar = key.split("/")
+        return ar[0], "/".join(ar[1:])

     def read_as_table(self, key) -> pd.DataFrame:
         """

@@ -245,7 +246,6 @@ class S3Handler(APIHandler):
         bucket, key = self._get_bucket(key)

         with self._connect_duckdb(bucket) as connection:
-
             cursor = connection.execute(f"SELECT * FROM 's3://{bucket}/{key}'")

             return cursor.fetchdf()

@@ -259,7 +259,7 @@ class S3Handler(APIHandler):
         client = self.connect()

         obj = client.get_object(Bucket=bucket, Key=key)
-        content = obj[
+        content = obj["Body"].read()
         return content

     def add_data_to_table(self, key, df) -> None:

@@ -277,7 +277,7 @@ class S3Handler(APIHandler):
             client = self.connect()
             client.head_object(Bucket=bucket, Key=key)
         except ClientError as e:
-            logger.error(f
+            logger.error(f"Error querying the file {key} in the bucket {bucket}, {e}!")
             raise e

         with self._connect_duckdb(bucket) as connection:
@@ -309,31 +309,28 @@ class S3Handler(APIHandler):
         if isinstance(query, Select):
             table_name = query.from_table.parts[-1]

-            if table_name ==
+            if table_name == "files":
                 table = self._files_table
                 df = table.select(query)

                 # add content
                 has_content = False
                 for target in query.targets:
-                    if isinstance(target, Identifier) and target.parts[-1].lower() ==
+                    if isinstance(target, Identifier) and target.parts[-1].lower() == "content":
                         has_content = True
                         break
                 if has_content:
-                    df[
+                    df["content"] = df["path"].apply(self._read_as_content)
             else:
-                extension = table_name.split(
+                extension = table_name.split(".")[-1]
                 if extension not in self.supported_file_formats:
-                    logger.error(f
-                    raise ValueError(f
+                    logger.error(f"The file format {extension} is not supported!")
+                    raise ValueError(f"The file format {extension} is not supported!")

                 table = FileTable(self, table_name=table_name)
                 df = table.select(query)

-            response = Response(
-                RESPONSE_TYPE.TABLE,
-                data_frame=df
-            )
+            response = Response(RESPONSE_TYPE.TABLE, data_frame=df)
         elif isinstance(query, Insert):
             table_name = query.table.parts[-1]
             table = FileTable(self, table_name=table_name)
@@ -364,7 +361,7 @@ class S3Handler(APIHandler):
             scan_buckets = [self.bucket]
         else:
             add_bucket_to_name = True
-            scan_buckets = [b[
+            scan_buckets = [b["Name"] for b in client.list_buckets()["Buckets"]]

         objects = []
         for bucket in scan_buckets:

@@ -372,23 +369,38 @@ class S3Handler(APIHandler):
                 continue

             resp = client.list_objects_v2(Bucket=bucket)
-            if
+            if "Contents" not in resp:
                 continue

-            for obj in resp[
-                if obj.get(
+            for obj in resp["Contents"]:
+                if obj.get("StorageClass", "STANDARD") != "STANDARD":
                     continue

-                obj[
+                obj["Bucket"] = bucket
                 if add_bucket_to_name:
                     # bucket is part of the name
-                    obj[
+                    obj["Key"] = f"{bucket}/{obj['Key']}"
                 objects.append(obj)
                 if limit is not None and len(objects) >= limit:
                     break

         return objects

+    def generate_sas_url(self, key: str, bucket: str) -> str:
+        """
+        Generates a pre-signed URL for accessing an object in the S3 bucket.
+
+        Args:
+            key (str): The key (path) of the object in the S3 bucket.
+            bucket (str): The name of the S3 bucket.
+
+        Returns:
+            str: The pre-signed URL for accessing the object.
+        """
+        client = self.connect()
+        url = client.generate_presigned_url("get_object", Params={"Bucket": bucket, "Key": key}, ExpiresIn=3600)
+        return url
+
     def get_tables(self) -> Response:
         """
         Retrieves a list of tables (objects) in the S3 bucket.
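Together with the `public_url` column added to `ListFilesTable` above, the new `generate_sas_url` method lets a query against the handler's virtual `files` table return time-limited links (the URL expires after 3600 seconds per the `ExpiresIn` argument). A sketch of how this might be used from MindsDB SQL, assuming an S3 datasource created under the illustrative name `my_s3`:

    SELECT path, bucket, public_url
    FROM my_s3.files
    LIMIT 5;

Selecting `public_url` in the target list is what triggers the presigned-URL generation; queries that omit the column skip the extra S3 calls.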
@@ -402,21 +414,13 @@ class S3Handler(APIHandler):
         # Get only the supported file formats.
         # Wrap the object names with backticks to prevent SQL syntax errors.
         supported_names = [
-            f"`{obj['Key']}`"
-            for obj in self.get_objects()
-            if obj['Key'].split('.')[-1] in self.supported_file_formats
+            f"`{obj['Key']}`" for obj in self.get_objects() if obj["Key"].split(".")[-1] in self.supported_file_formats
         ]

         # virtual table with list of files
-        supported_names.insert(0,
+        supported_names.insert(0, "files")

-        response = Response(
-            RESPONSE_TYPE.TABLE,
-            data_frame=pd.DataFrame(
-                supported_names,
-                columns=['table_name']
-            )
-        )
+        response = Response(RESPONSE_TYPE.TABLE, data_frame=pd.DataFrame(supported_names, columns=["table_name"]))

         return response

@@ -433,11 +437,7 @@ class S3Handler(APIHandler):
         Returns:
             Response: A response object containing the column details, formatted as per the `Response` class.
         """
-        query = Select(
-            targets=[Star()],
-            from_table=Identifier(parts=[table_name]),
-            limit=Constant(1)
-        )
+        query = Select(targets=[Star()], from_table=Identifier(parts=[table_name]), limit=Constant(1))

         result = self.query(query)

@@ -445,10 +445,12 @@ class S3Handler(APIHandler):
             RESPONSE_TYPE.TABLE,
             data_frame=pd.DataFrame(
                 {
-
-
+                    "column_name": result.data_frame.columns,
+                    "data_type": [
+                        data_type if data_type != "object" else "string" for data_type in result.data_frame.dtypes
+                    ],
                 }
-            )
+            ),
         )

         return response
mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py

@@ -72,7 +72,7 @@ class SalesforceHandler(MetaAPIHandler):

         resource_tables = self._get_resource_names()
         for resource_name in resource_tables:
-            table_class = create_table_class(resource_name)
+            table_class = create_table_class(resource_name.lower())
             self._register_table(resource_name, table_class(self))

         return self.connection

@@ -271,10 +271,11 @@ class SalesforceHandler(MetaAPIHandler):

         # Retrieve the metadata for all Salesforce resources.
         main_metadata = connection.sobjects.describe()
-
         if table_names:
             # Filter the metadata for the specified tables.
-            main_metadata = [
+            main_metadata = [
+                resource for resource in main_metadata["sobjects"] if resource["name"].lower() in table_names
+            ]
         else:
             main_metadata = main_metadata["sobjects"]
mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py

@@ -164,9 +164,18 @@ def create_table_class(resource_name: Text) -> MetaAPIResource:
         """
         client = self.handler.connect()

-
-
-
+        try:
+            resource_metadata = next(
+                (resource for resource in main_metadata if resource["name"].lower() == resource_name),
+            )
+        except Exception as e:
+            logger.warning(f"Failed to get resource metadata for {resource_name}: {e}")
+            return {
+                "table_name": table_name,
+                "table_type": "BASE TABLE",
+                "table_description": "",
+                "row_count": None,
+            }

         # Get row count if Id column is aggregatable.
         row_count = None
|