linkml-store 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +414 -0
- linkml_store/api/collection.py +1280 -0
- linkml_store/api/config.py +187 -0
- linkml_store/api/database.py +862 -0
- linkml_store/api/queries.py +69 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/chromadb/__init__.py +7 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/dremio/__init__.py +10 -0
- linkml_store/api/stores/dremio/dremio_collection.py +555 -0
- linkml_store/api/stores/dremio/dremio_database.py +1052 -0
- linkml_store/api/stores/dremio/mappings.py +105 -0
- linkml_store/api/stores/dremio_rest/__init__.py +11 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
- linkml_store/api/stores/duckdb/__init__.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
- linkml_store/api/stores/duckdb/mappings.py +8 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/ibis/__init__.py +5 -0
- linkml_store/api/stores/ibis/ibis_collection.py +488 -0
- linkml_store/api/stores/ibis/ibis_database.py +328 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
- linkml_store/api/stores/neo4j/__init__.py +0 -0
- linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
- linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +224 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +1147 -0
- linkml_store/constants.py +7 -0
- linkml_store/graphs/__init__.py +0 -0
- linkml_store/graphs/graph_map.py +24 -0
- linkml_store/index/__init__.py +53 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_indexer.py +174 -0
- linkml_store/index/implementations/simple_indexer.py +43 -0
- linkml_store/index/indexer.py +211 -0
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/llm_inference_engine.py +154 -0
- linkml_store/inference/implementations/rag_inference_engine.py +276 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
- linkml_store/inference/inference_config.py +66 -0
- linkml_store/inference/inference_engine.py +209 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +826 -0
- linkml_store/plotting/dimensionality_reduction.py +453 -0
- linkml_store/plotting/embedding_plot.py +489 -0
- linkml_store/plotting/facet_chart.py +73 -0
- linkml_store/plotting/heatmap.py +383 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/embedding_matcher.py +424 -0
- linkml_store/utils/embedding_utils.py +299 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +550 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/llm_utils.py +122 -0
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/neo4j_utils.py +42 -0
- linkml_store/utils/object_utils.py +190 -0
- linkml_store/utils/pandas_utils.py +93 -0
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/sql_utils.py +177 -0
- linkml_store/utils/stats_utils.py +53 -0
- linkml_store/utils/vector_utils.py +158 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +43 -0
- linkml_store/webapi/main.py +855 -0
- linkml_store-0.3.0.dist-info/METADATA +226 -0
- linkml_store-0.3.0.dist-info/RECORD +101 -0
- linkml_store-0.3.0.dist-info/WHEEL +4 -0
- linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
- linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/api/stores/dremio_rest/dremio_rest_database.py
@@ -0,0 +1,1023 @@
"""Dremio REST API database adapter.

This module provides a Database implementation for connecting to Dremio
data lakehouse using the REST API v3. This is useful when the Arrow Flight
SQL port (32010) is not accessible, such as behind Cloudflare or firewalls.
"""

import logging
import os
import re
import time
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple, Union
from urllib.parse import parse_qs, quote, urlparse

import pandas as pd
import requests
from linkml_runtime import SchemaView
from linkml_runtime.linkml_model import SlotDefinition
from linkml_runtime.utils.schema_builder import SchemaBuilder

from linkml_store.api import Database
from linkml_store.api.queries import Query, QueryResult
from linkml_store.api.stores.dremio_rest.dremio_rest_collection import DremioRestCollection

logger = logging.getLogger(__name__)


@dataclass
class ForeignKeyInfo:
    """Information about a foreign key constraint."""

    constraint_name: str
    source_table: str
    source_columns: List[str]
    target_table: str
    target_columns: List[str]
    source_schema: Optional[str] = None
    target_schema: Optional[str] = None


@dataclass
class ColumnInfo:
    """Information about a column including comments and nested structure."""

    name: str
    data_type: str
    is_nullable: bool = True
    comment: Optional[str] = None
    ordinal_position: int = 0
    nested_fields: List["ColumnInfo"] = field(default_factory=list)


@dataclass
class TableInfo:
    """Information about a table including comments."""

    name: str
    schema_name: Optional[str] = None
    comment: Optional[str] = None
    columns: List[ColumnInfo] = field(default_factory=list)


# Mapping from Dremio SQL type names to LinkML types
DREMIO_SQL_TO_LINKML = {
    "VARCHAR": "string",
    "CHAR": "string",
    "BIGINT": "integer",
    "INTEGER": "integer",
    "INT": "integer",
    "SMALLINT": "integer",
    "TINYINT": "integer",
    "BOOLEAN": "boolean",
    "DOUBLE": "float",
    "FLOAT": "float",
    "DECIMAL": "float",
    "DATE": "date",
    "TIMESTAMP": "datetime",
    "TIME": "string",
    "BINARY": "string",
    "VARBINARY": "string",
    "LIST": "string",
    "STRUCT": "string",
    "MAP": "string",
}


class DremioRestDatabase(Database):
    """
    An adapter for Dremio data lakehouse using the REST API v3.

    This adapter connects to Dremio using the standard REST API, which is
    useful when the Arrow Flight SQL port is not accessible.

    Handle format:
        dremio-rest://[username:password@]host[:port][/path][?params]

    Examples:
        - dremio-rest://localhost
        - dremio-rest://user:pass@lakehouse.example.com
        - dremio-rest://lakehouse.example.com?schema=gold.study
        - dremio-rest://lakehouse.example.com?cf_token_env=CF_AUTHORIZATION

    Parameters (query string):
        - schema: Default schema/space to use for unqualified table names
        - verify_ssl: Whether to verify SSL certificates (default: true)
        - cf_token_env: Environment variable name for Cloudflare Access token
        - username_env: Environment variable for username (default: DREMIO_USER)
        - password_env: Environment variable for password (default: DREMIO_PASSWORD)

    Environment variables:
        - DREMIO_USER: Default username
        - DREMIO_PASSWORD: Default password
        - CF_AUTHORIZATION: Cloudflare Access token (if behind Cloudflare)
    """

    _auth_token: Optional[str] = None
    _connection_info: Optional[Dict[str, Any]] = None
    _session: Optional[requests.Session] = None
    collection_class = DremioRestCollection

    def __init__(
        self,
        handle: Optional[str] = None,
        recreate_if_exists: bool = False,
        username: Optional[str] = None,
        password: Optional[str] = None,
        **kwargs,
    ):
        """Initialize a Dremio REST database connection.

        Args:
            handle: Connection string in format dremio-rest://host
            recreate_if_exists: Not applicable for Dremio (ignored)
            username: Optional username (overrides env var)
            password: Optional password (overrides env var)
            **kwargs: Additional arguments passed to parent
        """
        if handle is None:
            handle = "dremio-rest://localhost"

        self._connection_info = self._parse_handle(handle)

        # Override with explicit credentials if provided
        if username:
            self._connection_info["username"] = username
        if password:
            self._connection_info["password"] = password

        super().__init__(handle=handle, **kwargs)

    def _parse_handle(self, handle: str) -> Dict[str, Any]:
        """Parse a Dremio REST connection handle.

        Args:
            handle: Connection string like dremio-rest://user:pass@host:port/path?params

        Returns:
            Dictionary with connection parameters.
        """
        # Ensure scheme is present
        if not handle.startswith("dremio-rest://"):
            handle = f"dremio-rest://{handle}"

        parsed = urlparse(handle)

        # Parse query parameters
        params = parse_qs(parsed.query)

        # Extract parameters with defaults
        verify_ssl = params.get("verify_ssl", ["true"])[0].lower() == "true"
        default_schema = params.get("schema", [None])[0]
        cf_token_env = params.get("cf_token_env", ["CF_AUTHORIZATION"])[0]
        username_env = params.get("username_env", ["DREMIO_USER"])[0]
        password_env = params.get("password_env", ["DREMIO_PASSWORD"])[0]

        # Get credentials from URL or environment
        username = parsed.username or os.environ.get(username_env)
        password = parsed.password or os.environ.get(password_env)
        cf_token = os.environ.get(cf_token_env)

        # Determine port (default to 443 for HTTPS)
        port = parsed.port or 443

        return {
            "host": parsed.hostname or "localhost",
            "port": port,
            "username": username,
            "password": password,
            "path": parsed.path.lstrip("/") if parsed.path else None,
            "default_schema": default_schema,
            "verify_ssl": verify_ssl,
            "cf_token": cf_token,
        }
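
For illustration, a hypothetical handle parses as follows. Parsing is purely local (no request is made); the host, credentials, and port below are invented, and this assumes the base `Database` initializer does not itself open a connection:

```python
# Hypothetical connection string -- no network I/O happens at parse time.
db = DremioRestDatabase(
    "dremio-rest://alice:secret@lakehouse.example.com:9047?schema=gold.study&verify_ssl=false"
)
info = db._connection_info
assert info["host"] == "lakehouse.example.com"
assert info["port"] == 9047
assert info["username"] == "alice"      # URL credentials take precedence over DREMIO_USER
assert info["default_schema"] == "gold.study"
assert info["verify_ssl"] is False      # any value other than "true" disables verification
```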

    @property
    def session(self) -> requests.Session:
        """Get or create the requests session."""
        if self._session is None:
            self._session = requests.Session()
            if not self._connection_info["verify_ssl"]:
                self._session.verify = False
                import urllib3
                urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        return self._session

    @property
    def base_url(self) -> str:
        """Get the base URL for API requests."""
        info = self._connection_info
        host = info["host"]
        port = info["port"]
        if port == 443:
            return f"https://{host}"
        else:
            return f"https://{host}:{port}"

    def _get_cookies(self) -> Dict[str, str]:
        """Get cookies for requests (e.g., Cloudflare Access token)."""
        cookies = {}
        cf_token = self._connection_info.get("cf_token")
        if cf_token:
            cookies["CF_Authorization"] = cf_token
        return cookies

    def _authenticate(self) -> str:
        """Authenticate with Dremio and get auth token.

        Returns:
            Authentication token for subsequent requests.

        Raises:
            ConnectionError: If authentication fails.
        """
        if self._auth_token:
            return self._auth_token

        info = self._connection_info
        username = info.get("username")
        password = info.get("password")

        if not username or not password:
            raise ConnectionError(
                "Dremio credentials required. Set DREMIO_USER and DREMIO_PASSWORD "
                "environment variables or provide in connection string."
            )

        url = f"{self.base_url}/apiv2/login"
        cookies = self._get_cookies()

        logger.info(f"Authenticating to Dremio at {self.base_url}")

        response = self.session.post(
            url,
            json={"userName": username, "password": password},
            cookies=cookies,
        )

        if not response.ok:
            raise ConnectionError(
                f"Dremio authentication failed: {response.status_code} - {response.text[:200]}"
            )

        token = response.json().get("token")
        if not token:
            raise ConnectionError("No token in authentication response")

        self._auth_token = f"_dremio{token}"
        logger.info("Dremio authentication successful")
        return self._auth_token

    def _get_headers(self) -> Dict[str, str]:
        """Get headers for authenticated requests."""
        token = self._authenticate()
        return {"Authorization": token, "Content-Type": "application/json"}

    def _execute_query(self, sql: str, timeout: int = 300) -> pd.DataFrame:
        """Execute a SQL query and return results as DataFrame.

        Args:
            sql: SQL query string.
            timeout: Maximum time to wait for query completion in seconds.

        Returns:
            Pandas DataFrame with query results.

        Raises:
            RuntimeError: If query fails or times out.
        """
        headers = self._get_headers()
        cookies = self._get_cookies()

        # Submit query
        url = f"{self.base_url}/api/v3/sql"
        logger.debug(f"Executing SQL: {sql}")

        response = self.session.post(
            url,
            headers=headers,
            json={"sql": sql},
            cookies=cookies,
        )

        if not response.ok:
            raise RuntimeError(f"Query submission failed: {response.status_code} - {response.text[:200]}")

        job_id = response.json().get("id")
        if not job_id:
            raise RuntimeError("No job ID in query response")

        logger.debug(f"Query job ID: {job_id}")

        # Wait for completion
        start_time = time.time()
        while True:
            if time.time() - start_time > timeout:
                raise RuntimeError(f"Query timed out after {timeout} seconds")

            status_url = f"{self.base_url}/api/v3/job/{job_id}"
            status_response = self.session.get(status_url, headers=headers, cookies=cookies)
            status = status_response.json()

            job_state = status.get("jobState")
            if job_state == "COMPLETED":
                break
            elif job_state in ("FAILED", "CANCELED"):
                error_msg = status.get("errorMessage", "Unknown error")
                raise RuntimeError(f"Query {job_state}: {error_msg}")

            time.sleep(0.5)

        # Fetch results with pagination
        row_count = status.get("rowCount", 0)
        logger.debug(f"Query completed with {row_count} rows")

        all_rows = []
        offset = 0
        limit = 500  # Dremio max per request

        while offset < row_count:
            results_url = f"{self.base_url}/api/v3/job/{job_id}/results"
            results_response = self.session.get(
                results_url,
                headers=headers,
                cookies=cookies,
                params={"offset": offset, "limit": limit},
            )

            if not results_response.ok:
                raise RuntimeError(f"Failed to fetch results: {results_response.status_code}")

            results = results_response.json()
            rows = results.get("rows", [])
            if not rows:
                break

            all_rows.extend(rows)
            offset += limit

        return pd.DataFrame(all_rows)

    def _execute_update(self, sql: str) -> int:
        """Execute a SQL update/insert/delete statement.

        Args:
            sql: SQL statement.

        Returns:
            Number of affected rows (-1 if unknown).
        """
        # For DML, we just execute and check for success
        self._execute_query(sql)
        return -1

    def commit(self, **kwargs):
        """Commit pending changes (no-op for Dremio REST)."""
        pass

    def close(self, **kwargs):
        """Close the Dremio connection."""
        if self._session:
            self._session.close()
            self._session = None
        self._auth_token = None

    def drop(self, missing_ok=True, **kwargs):
        """Drop the database.

        Note: This is not supported for Dremio as it's typically a read/query layer.
        """
        self.close()
        logger.warning("Dremio does not support dropping databases through this adapter")

    def query(self, query: Query, **kwargs) -> QueryResult:
        """Execute a query against Dremio.

        Args:
            query: Query object specifying the query parameters.
            **kwargs: Additional arguments.

        Returns:
            QueryResult with matching rows.
        """
        from_table = query.from_table
        if not from_table:
            raise ValueError("Query must specify from_table")

        collection = self.get_collection(from_table, create_if_not_exists=False)
        if collection:
            return collection.query(query, **kwargs)
        else:
            return QueryResult(query=query, num_rows=0, rows=[])

    @property
    def supports_sql(self) -> bool:
        """Return True - Dremio REST supports raw SQL queries."""
        return True

    def _qualify_table_names(self, sql: str) -> str:
        """Qualify unqualified table names in SQL using configured schema.

        Handles FROM and JOIN clauses, qualifying table names that don't
        already contain dots or quotes.

        Args:
            sql: SQL query string.

        Returns:
            SQL with qualified table names.
        """
        default_schema = self._connection_info.get("default_schema")
        if not default_schema:
            return sql

        # Pattern matches FROM/JOIN followed by an unqualified table name
        # Unqualified = no dots, no quotes, just a simple identifier
        pattern = r'(?i)((?:FROM|JOIN)\s+)([a-zA-Z_][a-zA-Z0-9_]*)(\s+(?:AS\s+)?[a-zA-Z_][a-zA-Z0-9_]*|\s+(?:WHERE|ORDER|GROUP|LIMIT|HAVING|UNION|INTERSECT|EXCEPT|ON|LEFT|RIGHT|INNER|OUTER|CROSS|FULL|;)|$)'

        def replace_table(match):
            prefix = match.group(1)  # "FROM " or "JOIN "
            table = match.group(2)  # table name
            suffix = match.group(3)  # rest (alias, WHERE, etc.)

            # Check if this looks like a keyword (not a table name)
            keywords = {'WHERE', 'ORDER', 'GROUP', 'LIMIT', 'HAVING', 'UNION',
                        'INTERSECT', 'EXCEPT', 'SELECT', 'AS', 'ON', 'AND', 'OR',
                        'LEFT', 'RIGHT', 'INNER', 'OUTER', 'CROSS', 'FULL', 'JOIN'}
            if table.upper() in keywords:
                return match.group(0)

            qualified = self._get_table_path(table)
            return f"{prefix}{qualified}{suffix}"

        return re.sub(pattern, replace_table, sql)

    def execute_sql(self, sql: str, **kwargs) -> QueryResult:
        """
        Execute a raw SQL query against Dremio via REST API.

        If a default schema is configured in the connection URL, unqualified
        table names in FROM/JOIN clauses will be automatically qualified.

        :param sql: SQL query string
        :param kwargs: Additional arguments
        :return: QueryResult containing the results
        """
        sql = self._qualify_table_names(sql)
        logger.debug(f"Qualified SQL: {sql}")
        df = self._execute_query(sql)
        return QueryResult(num_rows=len(df), rows=df.to_dict("records"))

    def _needs_quoting(self, identifier: str) -> bool:
        """Check if an identifier needs quoting in SQL.

        Identifiers need quoting if they contain special characters
        like hyphens, spaces, or start with a digit.
        """
        if not identifier:
            return False
        # Needs quoting if contains non-alphanumeric/underscore or starts with digit
        if not identifier[0].isalpha() and identifier[0] != "_":
            return True
        return not all(c.isalnum() or c == "_" for c in identifier)

    def _quote_if_needed(self, identifier: str) -> str:
        """Quote an identifier if it contains special characters."""
        if self._needs_quoting(identifier):
            return f'"{identifier}"'
        return identifier

    def _get_table_path(self, table_name: str) -> str:
        """Get the full table path including schema if configured.

        Args:
            table_name: Table name.

        Returns:
            Full table path for SQL queries.
        """
        default_schema = self._connection_info.get("default_schema")
        path = self._connection_info.get("path")

        # If already has dots/quotes, assume it's qualified
        if "." in table_name or '"' in table_name:
            return table_name

        if default_schema:
            # Schema like "gold-db-2 postgresql.gold" needs proper quoting
            # Split into source and schema.table parts
            parts = default_schema.split(".")
            if len(parts) >= 2:
                # Source name may have spaces/hyphens - quote if needed
                source = self._quote_if_needed(parts[0])
                schema = ".".join(parts[1:])
                return f'{source}.{schema}.{table_name}'
            else:
                return f'{self._quote_if_needed(default_schema)}.{table_name}'
        elif path:
            return f'"{path}"."{table_name}"'
        else:
            return f'"{table_name}"'

    def _table_exists(self, table_name: str) -> bool:
        """Check if a table exists in Dremio.

        Args:
            table_name: Name of the table to check.

        Returns:
            True if table exists.
        """
        full_path = self._get_table_path(table_name)
        sql = f"SELECT * FROM {full_path} LIMIT 0"
        try:
            self._execute_query(sql)
            return True
        except Exception:
            return False

    def _list_table_names(self) -> List[str]:
        """List all table names in the database."""
        try:
            path = self._connection_info.get("path")
            default_schema = self._connection_info.get("default_schema")

            sql = """
            SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE
            FROM INFORMATION_SCHEMA."TABLES"
            WHERE TABLE_TYPE IN ('TABLE', 'VIEW')
            """

            if path:
                sql += f" AND TABLE_SCHEMA = '{path}'"
            elif default_schema:
                sql += f" AND TABLE_SCHEMA = '{default_schema}'"

            df = self._execute_query(sql)
            return df["TABLE_NAME"].tolist() if not df.empty else []
        except Exception as e:
            logger.warning(f"Could not list tables: {e}")
            return []
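
To make the qualification rules concrete, here is a sketch of how `_qualify_table_names` and `_get_table_path` compose; the table name `biosample` is invented, while the schema paths come from the docstrings above:

```python
# Assuming the handle was dremio-rest://host?schema=gold.study
db._qualify_table_names("SELECT * FROM biosample WHERE id = 5")
# -> "SELECT * FROM gold.study.biosample WHERE id = 5"

# A source segment with spaces or hyphens is quoted by _quote_if_needed;
# with schema="gold-db-2 postgresql.gold" the same table resolves to:
#   "gold-db-2 postgresql".gold.biosample

# Names that already contain dots or quotes pass through unchanged:
db._get_table_path('"space".biosample')  # -> '"space".biosample'
```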

    def init_collections(self):
        """Initialize collections dict.

        Note: Unlike other adapters, we don't scan INFORMATION_SCHEMA here
        because it can be very slow on large Dremio instances. Collections
        are created on-demand when get_collection() is called with a table path.

        Use discover_collections() to explicitly scan for available tables.
        """
        if self._collections is None:
            self._collections = {}

    def discover_collections(self):
        """Discover and register all available tables from Dremio.

        This queries INFORMATION_SCHEMA to find all tables. This can be slow
        on large Dremio instances - use only when you need to list all tables.
        """
        if self._collections is None:
            self._collections = {}

        path = self._connection_info.get("path")
        default_schema = self._connection_info.get("default_schema")

        sql = """
        SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE
        FROM INFORMATION_SCHEMA."TABLES"
        WHERE TABLE_TYPE IN ('TABLE', 'VIEW')
        """

        if path:
            sql += f" AND TABLE_SCHEMA = '{path}'"
        elif default_schema:
            sql += f" AND TABLE_SCHEMA = '{default_schema}'"

        df = self._execute_query(sql)

        for _, row in df.iterrows():
            table_name = row["TABLE_NAME"]
            collection_name = table_name

            if collection_name not in self._collections:
                collection = DremioRestCollection(name=collection_name, parent=self)
                collection.metadata.is_prepopulated = True
                self._collections[collection_name] = collection

        logger.info(f"Discovered {len(self._collections)} tables in Dremio")

    def _detect_source_type(self, source_name: str) -> Optional[str]:
        """Detect the type of a Dremio data source.

        Args:
            source_name: Name of the source (e.g., 'gold-db-2 postgresql').

        Returns:
            Source type ('postgresql', 'mysql', 'mongodb', etc.) or None.
        """
        source_lower = source_name.lower()
        if "postgresql" in source_lower or "postgres" in source_lower:
            return "postgresql"
        elif "mysql" in source_lower:
            return "mysql"
        elif "mongo" in source_lower:
            return "mongodb"
        elif "iceberg" in source_lower:
            return "iceberg"
        elif "hive" in source_lower:
            return "hive"
        return None

    def _get_source_from_schema(self, schema_name: str) -> Tuple[Optional[str], Optional[str]]:
        """Extract source name and schema from a full schema path.

        Args:
            schema_name: Full schema path like 'gold-db-2 postgresql.gold'.

        Returns:
            Tuple of (source_name, schema_within_source).
        """
        if "." in schema_name:
            parts = schema_name.split(".", 1)
            return parts[0], parts[1] if len(parts) > 1 else None
        return schema_name, None

    def get_foreign_keys(
        self, schema_name: Optional[str] = None, table_name: Optional[str] = None
    ) -> List[ForeignKeyInfo]:
        """Get foreign key constraints from PostgreSQL sources via pg_catalog.

        This method queries PostgreSQL's pg_catalog.pg_constraint to retrieve
        foreign key information. Only works for PostgreSQL-backed sources.

        Args:
            schema_name: Full schema path (e.g., 'gold-db-2 postgresql.gold').
                If None, uses the default schema from connection.
            table_name: Optional table name to filter results.

        Returns:
            List of ForeignKeyInfo objects describing FK relationships.
        """
        if schema_name is None:
            schema_name = self._connection_info.get("default_schema") or self._connection_info.get("path")

        if not schema_name:
            logger.warning("No schema specified for FK introspection")
            return []

        source_name, pg_schema = self._get_source_from_schema(schema_name)
        source_type = self._detect_source_type(source_name)

        if source_type != "postgresql":
            logger.info(f"FK introspection only supported for PostgreSQL sources, not {source_type}")
            return []

        # Query FK constraints from pg_catalog
        fk_sql = f'''
        SELECT
            con.conname as constraint_name,
            src_class.relname as source_table,
            tgt_class.relname as target_table,
            con.conkey as source_col_nums,
            con.confkey as target_col_nums,
            con.conrelid as source_oid,
            con.confrelid as target_oid
        FROM "{source_name}".pg_catalog.pg_constraint con
        JOIN "{source_name}".pg_catalog.pg_class src_class ON con.conrelid = src_class.oid
        JOIN "{source_name}".pg_catalog.pg_class tgt_class ON con.confrelid = tgt_class.oid
        JOIN "{source_name}".pg_catalog.pg_namespace nsp ON src_class.relnamespace = nsp.oid
        WHERE con.contype = 'f'
        '''

        if pg_schema:
            fk_sql += f" AND nsp.nspname = '{pg_schema}'"
        if table_name:
            fk_sql += f" AND src_class.relname = '{table_name}'"

        # Query column info for resolving column numbers to names
        col_sql = f'''
        SELECT
            c.oid as table_oid,
            a.attnum as col_num,
            a.attname as col_name
        FROM "{source_name}".pg_catalog.pg_class c
        JOIN "{source_name}".pg_catalog.pg_attribute a ON a.attrelid = c.oid
        JOIN "{source_name}".pg_catalog.pg_namespace nsp ON c.relnamespace = nsp.oid
        WHERE a.attnum > 0 AND NOT a.attisdropped
        '''

        if pg_schema:
            col_sql += f" AND nsp.nspname = '{pg_schema}'"

        try:
            fk_df = self._execute_query(fk_sql)
            col_df = self._execute_query(col_sql)

            # Build column lookup: (table_oid, col_num) -> col_name
            col_lookup = {}
            for _, row in col_df.iterrows():
                key = (row["table_oid"], row["col_num"])
                col_lookup[key] = row["col_name"]

            # Build FK info list
            fk_list = []

            for _, fk in fk_df.iterrows():
                # Parse array strings like '{1}' or '{1,2}'
                src_nums = [int(x) for x in str(fk["source_col_nums"]).strip("{}").split(",") if x]
                tgt_nums = [int(x) for x in str(fk["target_col_nums"]).strip("{}").split(",") if x]

                src_cols = [col_lookup.get((fk["source_oid"], n), f"col_{n}") for n in src_nums]
                tgt_cols = [col_lookup.get((fk["target_oid"], n), f"col_{n}") for n in tgt_nums]

                fk_list.append(
                    ForeignKeyInfo(
                        constraint_name=fk["constraint_name"],
                        source_table=fk["source_table"],
                        source_columns=src_cols,
                        target_table=fk["target_table"],
                        target_columns=tgt_cols,
                        source_schema=pg_schema,
                        target_schema=pg_schema,  # Assumes same schema
                    )
                )

            logger.info(f"Found {len(fk_list)} foreign key constraints in {schema_name}")
            return fk_list

        except Exception as e:
            logger.warning(f"Could not retrieve FK constraints: {e}")
            return []
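
A sketch of consuming the FK metadata; the schema path is taken from the docstring example and must name a PostgreSQL-backed Dremio source, and the table/column names in the comment are invented:

```python
fks = db.get_foreign_keys(schema_name="gold-db-2 postgresql.gold")
for fk in fks:
    # e.g. biosample.project_id -> project.id
    print(f"{fk.source_table}.{','.join(fk.source_columns)}"
          f" -> {fk.target_table}.{','.join(fk.target_columns)}")
```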

    def get_table_comments(
        self, schema_name: Optional[str] = None, table_name: Optional[str] = None
    ) -> Dict[str, TableInfo]:
        """Get table and column comments from PostgreSQL sources via pg_description.

        Args:
            schema_name: Full schema path (e.g., 'gold-db-2 postgresql.gold').
            table_name: Optional table name to filter results.

        Returns:
            Dictionary mapping table names to TableInfo with comments.
        """
        if schema_name is None:
            schema_name = self._connection_info.get("default_schema") or self._connection_info.get("path")

        if not schema_name:
            return {}

        source_name, pg_schema = self._get_source_from_schema(schema_name)
        source_type = self._detect_source_type(source_name)

        if source_type != "postgresql":
            logger.info("Comment introspection only supported for PostgreSQL sources")
            return {}

        sql = f'''
        SELECT
            c.relname as table_name,
            a.attname as column_name,
            a.attnum as col_num,
            td.description as table_comment,
            cd.description as column_comment
        FROM "{source_name}".pg_catalog.pg_class c
        JOIN "{source_name}".pg_catalog.pg_namespace nsp ON c.relnamespace = nsp.oid
        LEFT JOIN "{source_name}".pg_catalog.pg_attribute a
            ON a.attrelid = c.oid AND a.attnum > 0 AND NOT a.attisdropped
        LEFT JOIN "{source_name}".pg_catalog.pg_description td
            ON td.objoid = c.oid AND td.objsubid = 0
        LEFT JOIN "{source_name}".pg_catalog.pg_description cd
            ON cd.objoid = c.oid AND cd.objsubid = a.attnum
        WHERE c.relkind IN ('r', 'v')
        '''

        if pg_schema:
            sql += f" AND nsp.nspname = '{pg_schema}'"
        if table_name:
            sql += f" AND c.relname = '{table_name}'"

        sql += " ORDER BY c.relname, a.attnum"

        try:
            df = self._execute_query(sql)

            tables = {}
            for tbl_name in df["table_name"].unique():
                tbl_df = df[df["table_name"] == tbl_name]
                table_comment = tbl_df["table_comment"].iloc[0] if not tbl_df["table_comment"].isna().all() else None

                columns = []
                for _, row in tbl_df.iterrows():
                    if row["column_name"]:
                        columns.append(
                            ColumnInfo(
                                name=row["column_name"],
                                data_type="",  # Not fetched here
                                comment=row["column_comment"] if not pd.isna(row["column_comment"]) else None,
                                ordinal_position=int(row["col_num"]) if row["col_num"] else 0,
                            )
                        )

                tables[tbl_name] = TableInfo(
                    name=tbl_name, schema_name=pg_schema, comment=table_comment, columns=columns
                )

            return tables

        except Exception as e:
            logger.warning(f"Could not retrieve table comments: {e}")
            return {}

    def get_nested_schema(self, table_path: str) -> Dict[str, Any]:
        """Get full schema including nested types by querying with LIMIT 0.

        For complex types (ARRAY, STRUCT/ROW), the INFORMATION_SCHEMA doesn't
        return nested field information. This method uses the REST catalog API
        to get the full schema with nested structure.

        Args:
            table_path: Full table path (e.g., '"schema".table').

        Returns:
            Dictionary with column names and their type info.
        """
        # First try the REST catalog API for detailed type info
        path = self._connection_info.get("path")
        default_schema = self._connection_info.get("default_schema")

        # Parse table path to get catalog path
        table_path_clean = table_path.replace('"', '')
        parts = table_path_clean.split(".")

        # Try to find table via catalog API
        try:
            headers = self._get_headers()
            cookies = self._get_cookies()

            # Build catalog path
            if len(parts) >= 2:
                catalog_path = ".".join(parts[:-1])
                table_name = parts[-1]
            else:
                catalog_path = path or default_schema or ""
                table_name = parts[0] if parts else table_path_clean

            # URL encode the path
            encoded_path = quote(f"{catalog_path}.{table_name}" if catalog_path else table_name, safe="")
            url = f"{self.base_url}/api/v3/catalog/by-path/{encoded_path}"

            response = self.session.get(url, headers=headers, cookies=cookies)

            if response.ok:
                catalog_data = response.json()
                fields = catalog_data.get("fields", [])

                schema_info = {}
                for field_data in fields:
                    field_name = field_data.get("name")
                    field_type = field_data.get("type", {})

                    type_name = field_type.get("name", "UNKNOWN")
                    field_info = {
                        "name": field_name,
                        "dremio_type": type_name,
                        "nullable": True,  # Not always available in REST API
                    }

                    # Handle complex types with subSchema
                    sub_schema = field_type.get("subSchema")
                    if sub_schema:
                        field_info["nested_fields"] = []
                        for sub_field in sub_schema:
                            sub_name = sub_field.get("name")
                            sub_type = sub_field.get("type", {})
                            field_info["nested_fields"].append({
                                "name": sub_name,
                                "dremio_type": sub_type.get("name", "UNKNOWN"),
                            })

                    # Handle list element types
                    if type_name == "LIST":
                        sub_schema = field_type.get("subSchema", [])
                        if sub_schema:
                            field_info["element_type"] = sub_schema[0].get("type", {}).get("name", "UNKNOWN")

                    schema_info[field_name] = field_info

                return schema_info

        except Exception as e:
            logger.debug(f"Could not get schema via catalog API: {e}")

        # Fall back to LIMIT 0 query for column info (without nested structure)
        try:
            sql = f"SELECT * FROM {table_path} LIMIT 0"
            df = self._execute_query(sql)

            schema_info = {}
            for col in df.columns:
                schema_info[col] = {
                    "name": col,
                    "dremio_type": str(df[col].dtype),
                    "nullable": True,
                }

            return schema_info

        except Exception as e:
            logger.warning(f"Could not get nested schema for {table_path}: {e}")
            return {}
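
The returned mapping has one entry per column. For a hypothetical STRUCT column named `location`, the catalog-API path would yield roughly the following (the field names and types are invented):

```python
{
    "location": {
        "name": "location",
        "dremio_type": "STRUCT",
        "nullable": True,  # the REST API does not always report nullability
        "nested_fields": [
            {"name": "lat", "dremio_type": "DOUBLE"},
            {"name": "lon", "dremio_type": "DOUBLE"},
        ],
    }
}
```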

    def induce_schema_view(self, include_foreign_keys: bool = True) -> SchemaView:
        """Induce a schema view from Dremio table structures.

        Args:
            include_foreign_keys: If True, attempt to retrieve FK info from
                PostgreSQL sources and add relationships.

        Returns:
            SchemaView representing the database schema.
        """
        logger.info(f"Inducing schema view for {self.metadata.handle}")
        sb = SchemaBuilder()

        if not self._collections:
            self.discover_collections()

        path = self._connection_info.get("path")
        default_schema = self._connection_info.get("default_schema")

        try:
            sql = """
            SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE,
                   IS_NULLABLE, ORDINAL_POSITION
            FROM INFORMATION_SCHEMA."COLUMNS"
            """

            if path:
                sql += f" WHERE TABLE_SCHEMA = '{path}'"
            elif default_schema:
                sql += f" WHERE TABLE_SCHEMA = '{default_schema}'"

            sql += " ORDER BY TABLE_SCHEMA, TABLE_NAME, ORDINAL_POSITION"

            df = self._execute_query(sql)

            current_table = None
            for _, row in df.iterrows():
                schema_name = row["TABLE_SCHEMA"]
                table_name = row["TABLE_NAME"]
                column_name = row["COLUMN_NAME"]
                data_type = row["DATA_TYPE"]
                is_nullable = row["IS_NULLABLE"]

                # Get class name
                if schema_name in (path, default_schema):
                    class_name = table_name
                else:
                    class_name = f"{schema_name}_{table_name}"

                if class_name != current_table:
                    sb.add_class(class_name)
                    current_table = class_name

                # Map Dremio type to LinkML type
                base_type = re.split(r"[\(\[]", str(data_type))[0].upper()
                linkml_type = DREMIO_SQL_TO_LINKML.get(base_type, "string")

                sd = SlotDefinition(column_name, required=is_nullable == "NO", range=linkml_type)
                sb.schema.classes[class_name].attributes[sd.name] = sd
                logger.debug(f"Introspected slot: {class_name}.{sd.name}: {sd.range}")

        except Exception as e:
            logger.warning(f"Could not introspect schema from INFORMATION_SCHEMA: {e}")

        # Add foreign key relationships
        if include_foreign_keys:
            schema_to_use = path or default_schema
            if schema_to_use:
                fks = self.get_foreign_keys(schema_to_use)
                for fk in fks:
                    # Get or derive class names
                    src_class = fk.source_table
                    tgt_class = fk.target_table

                    # Skip if classes don't exist
                    if src_class not in sb.schema.classes or tgt_class not in sb.schema.classes:
                        continue

                    # For single-column FKs, update the slot's range to point to target class
                    if len(fk.source_columns) == 1:
                        src_col = fk.source_columns[0]
                        if src_col in sb.schema.classes[src_class].attributes:
                            slot = sb.schema.classes[src_class].attributes[src_col]
                            # Set range to target class, indicating a relationship
                            slot.range = tgt_class
                            slot.description = f"Foreign key to {tgt_class}"
                            logger.debug(f"Added FK relationship: {src_class}.{src_col} -> {tgt_class}")

        sb.add_defaults()
        return SchemaView(sb.schema)
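
Taken together, a typical session with this adapter might look like the following sketch; the host comes from the docstring examples, the table name is invented, and every statement goes through the submit/poll/paginate loop in `_execute_query`:

```python
db = DremioRestDatabase("dremio-rest://lakehouse.example.com?schema=gold.study")

# Raw SQL; the unqualified "biosample" is rewritten to gold.study.biosample.
result = db.execute_sql("SELECT id, name FROM biosample LIMIT 10")
print(result.num_rows, result.rows[:2])

# Induce a LinkML schema, folding in FK ranges where the source is PostgreSQL.
sv = db.induce_schema_view()
print(list(sv.all_classes()))

db.close()
```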