linkml_store-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- linkml_store/__init__.py +7 -0
- linkml_store/api/__init__.py +8 -0
- linkml_store/api/client.py +414 -0
- linkml_store/api/collection.py +1280 -0
- linkml_store/api/config.py +187 -0
- linkml_store/api/database.py +862 -0
- linkml_store/api/queries.py +69 -0
- linkml_store/api/stores/__init__.py +0 -0
- linkml_store/api/stores/chromadb/__init__.py +7 -0
- linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
- linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
- linkml_store/api/stores/dremio/__init__.py +10 -0
- linkml_store/api/stores/dremio/dremio_collection.py +555 -0
- linkml_store/api/stores/dremio/dremio_database.py +1052 -0
- linkml_store/api/stores/dremio/mappings.py +105 -0
- linkml_store/api/stores/dremio_rest/__init__.py +11 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
- linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
- linkml_store/api/stores/duckdb/__init__.py +16 -0
- linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
- linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
- linkml_store/api/stores/duckdb/mappings.py +8 -0
- linkml_store/api/stores/filesystem/__init__.py +15 -0
- linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
- linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
- linkml_store/api/stores/hdf5/__init__.py +7 -0
- linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
- linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
- linkml_store/api/stores/ibis/__init__.py +5 -0
- linkml_store/api/stores/ibis/ibis_collection.py +488 -0
- linkml_store/api/stores/ibis/ibis_database.py +328 -0
- linkml_store/api/stores/mongodb/__init__.py +25 -0
- linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
- linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
- linkml_store/api/stores/neo4j/__init__.py +0 -0
- linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
- linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
- linkml_store/api/stores/solr/__init__.py +3 -0
- linkml_store/api/stores/solr/solr_collection.py +224 -0
- linkml_store/api/stores/solr/solr_database.py +83 -0
- linkml_store/api/stores/solr/solr_utils.py +0 -0
- linkml_store/api/types.py +4 -0
- linkml_store/cli.py +1147 -0
- linkml_store/constants.py +7 -0
- linkml_store/graphs/__init__.py +0 -0
- linkml_store/graphs/graph_map.py +24 -0
- linkml_store/index/__init__.py +53 -0
- linkml_store/index/implementations/__init__.py +0 -0
- linkml_store/index/implementations/llm_indexer.py +174 -0
- linkml_store/index/implementations/simple_indexer.py +43 -0
- linkml_store/index/indexer.py +211 -0
- linkml_store/inference/__init__.py +13 -0
- linkml_store/inference/evaluation.py +195 -0
- linkml_store/inference/implementations/__init__.py +0 -0
- linkml_store/inference/implementations/llm_inference_engine.py +154 -0
- linkml_store/inference/implementations/rag_inference_engine.py +276 -0
- linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
- linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
- linkml_store/inference/inference_config.py +66 -0
- linkml_store/inference/inference_engine.py +209 -0
- linkml_store/inference/inference_engine_registry.py +74 -0
- linkml_store/plotting/__init__.py +5 -0
- linkml_store/plotting/cli.py +826 -0
- linkml_store/plotting/dimensionality_reduction.py +453 -0
- linkml_store/plotting/embedding_plot.py +489 -0
- linkml_store/plotting/facet_chart.py +73 -0
- linkml_store/plotting/heatmap.py +383 -0
- linkml_store/utils/__init__.py +0 -0
- linkml_store/utils/change_utils.py +17 -0
- linkml_store/utils/dat_parser.py +95 -0
- linkml_store/utils/embedding_matcher.py +424 -0
- linkml_store/utils/embedding_utils.py +299 -0
- linkml_store/utils/enrichment_analyzer.py +217 -0
- linkml_store/utils/file_utils.py +37 -0
- linkml_store/utils/format_utils.py +550 -0
- linkml_store/utils/io.py +38 -0
- linkml_store/utils/llm_utils.py +122 -0
- linkml_store/utils/mongodb_utils.py +145 -0
- linkml_store/utils/neo4j_utils.py +42 -0
- linkml_store/utils/object_utils.py +190 -0
- linkml_store/utils/pandas_utils.py +93 -0
- linkml_store/utils/patch_utils.py +126 -0
- linkml_store/utils/query_utils.py +89 -0
- linkml_store/utils/schema_utils.py +23 -0
- linkml_store/utils/sklearn_utils.py +193 -0
- linkml_store/utils/sql_utils.py +177 -0
- linkml_store/utils/stats_utils.py +53 -0
- linkml_store/utils/vector_utils.py +158 -0
- linkml_store/webapi/__init__.py +0 -0
- linkml_store/webapi/html/__init__.py +3 -0
- linkml_store/webapi/html/base.html.j2 +24 -0
- linkml_store/webapi/html/collection_details.html.j2 +15 -0
- linkml_store/webapi/html/database_details.html.j2 +16 -0
- linkml_store/webapi/html/databases.html.j2 +14 -0
- linkml_store/webapi/html/generic.html.j2 +43 -0
- linkml_store/webapi/main.py +855 -0
- linkml_store-0.3.0.dist-info/METADATA +226 -0
- linkml_store-0.3.0.dist-info/RECORD +101 -0
- linkml_store-0.3.0.dist-info/WHEEL +4 -0
- linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
- linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
linkml_store/api/stores/dremio/dremio_database.py

@@ -0,0 +1,1052 @@

```python
"""Dremio database adapter using Arrow Flight SQL.

This module provides a Database implementation for connecting to Dremio
data lakehouse using the Arrow Flight SQL protocol for high-performance
data access.
"""

import logging
import os
import re
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple, Union
from urllib.parse import parse_qs, urlparse

import pandas as pd
from linkml_runtime import SchemaView
from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
from linkml_runtime.utils.schema_builder import SchemaBuilder

from linkml_store.api import Database
from linkml_store.api.queries import Query, QueryResult
from linkml_store.api.stores.dremio.dremio_collection import DremioCollection
from linkml_store.api.stores.dremio.mappings import get_linkml_type_from_arrow
from linkml_store.utils.format_utils import Format

logger = logging.getLogger(__name__)


@dataclass
class ForeignKeyInfo:
    """Information about a foreign key constraint."""

    constraint_name: str
    source_table: str
    source_columns: List[str]
    target_table: str
    target_columns: List[str]
    source_schema: Optional[str] = None
    target_schema: Optional[str] = None


@dataclass
class ColumnInfo:
    """Information about a column including comments and nested structure."""

    name: str
    data_type: str
    is_nullable: bool = True
    comment: Optional[str] = None
    ordinal_position: int = 0
    nested_fields: List["ColumnInfo"] = field(default_factory=list)


@dataclass
class TableInfo:
    """Information about a table including comments."""

    name: str
    schema_name: Optional[str] = None
    comment: Optional[str] = None
    columns: List[ColumnInfo] = field(default_factory=list)


class DremioDatabase(Database):
    """
    An adapter for Dremio data lakehouse using Arrow Flight SQL.

    This adapter connects to Dremio using the Arrow Flight SQL protocol,
    which provides high-performance data transfer using Apache Arrow.

    Handle format:
        dremio://[username:password@]host[:port][/path][?params]

    Examples:
        - dremio://localhost:32010
        - dremio://user:pass@localhost:32010
        - dremio://localhost:32010?useEncryption=false
        - dremio://user:pass@dremio.example.com:32010/Samples

    Parameters (query string):
        - useEncryption: Whether to use TLS (default: true)
        - disableCertificateVerification: Skip cert verification (default: false)
        - schema: Default schema/space to use
        - username_env: Environment variable for username (default: DREMIO_USER)
        - password_env: Environment variable for password (default: DREMIO_PASSWORD)

    Environment variables:
        - DREMIO_USER: Default username (if not in URL)
        - DREMIO_PASSWORD: Default password (if not in URL)

    Note:
        Requires pyarrow with Flight SQL support. Install with:
            pip install pyarrow
    """
```
```python
    _flight_client = None
    _adbc_connection = None
    _connection_info: dict = None
    collection_class = DremioCollection

    def __init__(
        self,
        handle: Optional[str] = None,
        recreate_if_exists: bool = False,
        username: Optional[str] = None,
        password: Optional[str] = None,
        **kwargs,
    ):
        """Initialize a Dremio database connection.

        Args:
            handle: Connection string in format dremio://host:port
            recreate_if_exists: Not applicable for Dremio (ignored)
            username: Optional username (can also be in handle)
            password: Optional password (can also be in handle)
            **kwargs: Additional arguments passed to parent
        """
        if handle is None:
            handle = "dremio://localhost:32010"

        self._connection_info = self._parse_handle(handle)

        # Override with explicit credentials if provided
        if username:
            self._connection_info["username"] = username
        if password:
            self._connection_info["password"] = password

        super().__init__(handle=handle, **kwargs)

    def _parse_handle(self, handle: str) -> dict:
        """Parse a Dremio connection handle.

        Args:
            handle: Connection string like dremio://user:pass@host:port/path?params

        Returns:
            Dictionary with connection parameters.
        """
        # Ensure scheme is present
        if not handle.startswith("dremio://"):
            handle = f"dremio://{handle}"

        parsed = urlparse(handle)

        # Parse query parameters
        params = parse_qs(parsed.query)

        # Extract single values from query params
        use_encryption = params.get("useEncryption", ["true"])[0].lower() == "true"
        disable_cert_verify = params.get("disableCertificateVerification", ["false"])[0].lower() == "true"
        default_schema = params.get("schema", [None])[0]

        # Get env var names (configurable via query params)
        username_env = params.get("username_env", ["DREMIO_USER"])[0]
        password_env = params.get("password_env", ["DREMIO_PASSWORD"])[0]

        # Get credentials from URL or environment variables
        username = parsed.username or os.environ.get(username_env)
        password = parsed.password or os.environ.get(password_env)

        return {
            "host": parsed.hostname or "localhost",
            "port": parsed.port or 32010,
            "username": username,
            "password": password,
            "path": parsed.path.lstrip("/") if parsed.path else None,
            "use_encryption": use_encryption,
            "disable_cert_verify": disable_cert_verify,
            "default_schema": default_schema,
        }
```
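Tracing `_parse_handle` on one of the documented handle forms, the returned dictionary looks like this (host and credentials are placeholders):

```python
# For handle "dremio://alice:secret@dremio.example.com/Samples?useEncryption=false",
# _parse_handle returns:
{
    "host": "dremio.example.com",
    "port": 32010,  # default Flight port
    "username": "alice",
    "password": "secret",
    "path": "Samples",
    "use_encryption": False,
    "disable_cert_verify": False,
    "default_schema": None,
}
```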
```python
    @property
    def flight_client(self):
        """Get or create the Arrow Flight SQL client.

        Returns:
            FlightSqlClient connected to Dremio.

        Raises:
            ImportError: If pyarrow is not installed.
            ConnectionError: If connection to Dremio fails.
        """
        if self._flight_client is None:
            try:
                import pyarrow.flight as flight
            except ImportError as e:
                raise ImportError(
                    "pyarrow with Flight support is required for Dremio adapter. "
                    "Install with: pip install pyarrow"
                ) from e

            info = self._connection_info
            host = info["host"]
            port = info["port"]

            # Build location
            if info["use_encryption"]:
                location = f"grpc+tls://{host}:{port}"
            else:
                location = f"grpc://{host}:{port}"

            logger.info(f"Connecting to Dremio at {location}")

            # Build connection options
            client_options = []

            if info["disable_cert_verify"]:
                client_options.append(("disable_server_verification", "true"))

            try:
                client = flight.FlightClient(location)

                # Authenticate if credentials provided
                if info["username"] and info["password"]:
                    # Get auth token using basic auth
                    bearer_token = self._authenticate(client, info["username"], info["password"])
                    # Store token for subsequent requests
                    self._bearer_token = bearer_token
                else:
                    self._bearer_token = None

                self._flight_client = client

            except Exception as e:
                raise ConnectionError(f"Failed to connect to Dremio at {location}: {e}") from e

        return self._flight_client

    def _authenticate(self, client, username: str, password: str) -> bytes:
        """Authenticate with Dremio and get bearer token.

        Args:
            client: Flight client
            username: Dremio username
            password: Dremio password

        Returns:
            Bearer token for subsequent requests.
        """
        # Use basic authentication; authenticate_basic_token returns an
        # (auth-header-name, header-value) pair
        token_pair = client.authenticate_basic_token(username, password)
        return token_pair[1]  # Return the token value

    def _get_call_options(self):
        """Get Flight call options with authentication headers.

        Returns:
            FlightCallOptions with bearer token if authenticated.
        """
        import pyarrow.flight as flight

        if hasattr(self, "_bearer_token") and self._bearer_token:
            return flight.FlightCallOptions(headers=[(b"authorization", self._bearer_token)])
        return flight.FlightCallOptions()
```
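The token handshake relies on pyarrow's `FlightClient.authenticate_basic_token`, which returns a header-name/header-value pair that is replayed on later calls as a gRPC header. A standalone sketch of the same flow outside the adapter (endpoint and credentials are placeholders):

```python
import pyarrow.flight as flight

client = flight.FlightClient("grpc://localhost:32010")
# Returns a pair like (b"authorization", b"Bearer <token>")
header = client.authenticate_basic_token("alice", "secret")
options = flight.FlightCallOptions(headers=[header])
# Any subsequent RPC carries the bearer token:
info = client.get_flight_info(
    flight.FlightDescriptor.for_command(b"SELECT 1"), options
)
```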
```python
    def _execute_query(self, sql: str) -> "pyarrow.Table":
        """Execute a SQL query and return results as Arrow Table.

        Uses ADBC Flight SQL driver when available (faster), falls back to
        raw Flight RPC otherwise.

        Args:
            sql: SQL query string.

        Returns:
            PyArrow Table with query results.
        """
        logger.debug(f"Executing SQL: {sql}")

        # Try ADBC first (much faster for Flight SQL)
        try:
            return self._execute_query_adbc(sql)
        except ImportError:
            logger.debug("ADBC not available, falling back to raw Flight")
            return self._execute_query_flight(sql)

    @property
    def adbc_connection(self):
        """Get or create cached ADBC Flight SQL connection.

        Returns:
            ADBC connection to Dremio.

        Raises:
            ImportError: If ADBC driver is not installed.
        """
        if self._adbc_connection is None:
            import adbc_driver_flightsql.dbapi as flightsql

            info = self._connection_info
            host = info["host"]
            port = info["port"]

            # Build URI
            if info["use_encryption"]:
                uri = f"grpc+tls://{host}:{port}"
            else:
                uri = f"grpc://{host}:{port}"

            # Build connection kwargs
            connect_kwargs = {"uri": uri}

            # Add auth if available
            if info["username"] and info["password"]:
                connect_kwargs["db_kwargs"] = {
                    "username": info["username"],
                    "password": info["password"],
                }

            logger.info(f"Establishing ADBC Flight SQL connection to {uri}")
            self._adbc_connection = flightsql.connect(**connect_kwargs)

        return self._adbc_connection

    def _execute_query_adbc(self, sql: str) -> "pyarrow.Table":
        """Execute query using ADBC Flight SQL driver (fast path).

        Args:
            sql: SQL query string.

        Returns:
            PyArrow Table with query results.

        Raises:
            ImportError: If ADBC driver is not installed.
        """
        import pyarrow as pa

        conn = self.adbc_connection
        sql_upper = sql.strip().upper()

        # Handle context-setting statements (USE, SET, ALTER SESSION, etc.)
        # These don't return meaningful results but set session state
        if sql_upper.startswith(("USE ", "SET ", "ALTER SESSION")):
            with conn.cursor() as cursor:
                cursor.execute(sql)
                # Don't try to fetch results - just execute for side effect
                return pa.table({})

        with conn.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetch_arrow_table()
```
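The fast path goes through `adbc_driver_flightsql`, whose DB-API cursors hand back Arrow tables directly, avoiding row-by-row conversion. A minimal standalone equivalent (URI and credentials are placeholders; requires the `adbc-driver-flightsql` package):

```python
import adbc_driver_flightsql.dbapi as flightsql

conn = flightsql.connect(
    uri="grpc://localhost:32010",
    db_kwargs={"username": "alice", "password": "secret"},
)
with conn.cursor() as cur:
    cur.execute("SELECT 1 AS x")
    table = cur.fetch_arrow_table()  # a pyarrow.Table
```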
```python
    def _execute_query_flight(self, sql: str) -> "pyarrow.Table":
        """Execute query using raw Flight RPC (fallback path).

        Args:
            sql: SQL query string.

        Returns:
            PyArrow Table with query results.
        """
        import pyarrow.flight as flight

        client = self.flight_client
        options = self._get_call_options()

        # Create Flight SQL command
        flight_desc = flight.FlightDescriptor.for_command(sql.encode("utf-8"))

        # Get flight info
        try:
            # For Dremio, we use the execute method directly
            # Prepare the SQL statement
            info = client.get_flight_info(flight_desc, options)

            # Get the data from all endpoints
            tables = []
            for endpoint in info.endpoints:
                reader = client.do_get(endpoint.ticket, options)
                tables.append(reader.read_all())

            if not tables:
                import pyarrow as pa

                return pa.table({})

            # Concatenate all tables
            import pyarrow as pa

            return pa.concat_tables(tables)

        except Exception as e:
            logger.error(f"Query execution failed: {e}")
            raise

    def _execute_update(self, sql: str) -> int:
        """Execute a SQL update/insert/delete statement.

        Args:
            sql: SQL statement.

        Returns:
            Number of affected rows (-1 if unknown).
        """
        import pyarrow.flight as flight

        logger.debug(f"Executing update: {sql}")

        client = self.flight_client
        options = self._get_call_options()

        try:
            # For DML statements, use do_action
            action = flight.Action("query", sql.encode("utf-8"))
            results = list(client.do_action(action, options))
            # Try to parse affected rows from result
            if results:
                try:
                    return int(results[0].body.to_pybytes().decode("utf-8"))
                except (ValueError, AttributeError):
                    pass
            return -1
        except Exception as e:
            logger.warning(f"Update execution failed, trying alternative method: {e}")
            # Some Dremio versions may not support do_action for DML
            # Fall back to regular query
            try:
                self._execute_query(sql)
                return -1
            except Exception as e2:
                logger.error(f"Update failed: {e2}")
                raise

    def commit(self, **kwargs):
        """Commit pending changes.

        Note: Dremio auto-commits, this is a no-op.
        """
        pass

    def close(self, **kwargs):
        """Close the Dremio connection."""
        if self._flight_client:
            self._flight_client.close()
            self._flight_client = None
        if self._adbc_connection:
            self._adbc_connection.close()
            self._adbc_connection = None

    def drop(self, missing_ok=True, **kwargs):
        """Drop the database.

        Note: This is not supported for Dremio as it's typically a read/query layer.
        Individual tables can be dropped if you have permissions.
        """
        self.close()
        logger.warning("Dremio does not support dropping databases through this adapter")

    def query(self, query: Query, **kwargs) -> QueryResult:
        """Execute a query against Dremio.

        Args:
            query: Query object specifying the query parameters.
            **kwargs: Additional arguments.

        Returns:
            QueryResult with matching rows.
        """
        from_table = query.from_table
        if not from_table:
            raise ValueError("Query must specify from_table")

        # Check if collection exists
        collection = self.get_collection(from_table, create_if_not_exists=False)
        if collection:
            return collection.query(query, **kwargs)
        else:
            return QueryResult(query=query, num_rows=0, rows=[])

    @property
    def supports_sql(self) -> bool:
        """Return True - Dremio supports raw SQL queries."""
        return True

    def _qualify_table_names(self, sql: str) -> str:
        """Qualify unqualified table names in SQL using configured schema.

        Handles FROM and JOIN clauses, qualifying table names that don't
        already contain dots or quotes.

        Args:
            sql: SQL query string.

        Returns:
            SQL with qualified table names.
        """
        default_schema = self._connection_info.get("default_schema")
        if not default_schema:
            return sql

        # Pattern matches FROM/JOIN followed by an unqualified table name
        # Unqualified = no dots, no quotes, just a simple identifier
        # Captures: (FROM|JOIN) (tablename) (optional: AS? alias | WHERE | ORDER | LIMIT | GROUP | ; | end)
        pattern = r'(?i)((?:FROM|JOIN)\s+)([a-zA-Z_][a-zA-Z0-9_]*)(\s+(?:AS\s+)?[a-zA-Z_][a-zA-Z0-9_]*|\s+(?:WHERE|ORDER|GROUP|LIMIT|HAVING|UNION|INTERSECT|EXCEPT|ON|LEFT|RIGHT|INNER|OUTER|CROSS|FULL|;)|$)'

        def replace_table(match):
            prefix = match.group(1)  # "FROM " or "JOIN "
            table = match.group(2)  # table name
            suffix = match.group(3)  # rest (alias, WHERE, etc.)

            # Check if this looks like a keyword (not a table name)
            keywords = {'WHERE', 'ORDER', 'GROUP', 'LIMIT', 'HAVING', 'UNION',
                        'INTERSECT', 'EXCEPT', 'SELECT', 'AS', 'ON', 'AND', 'OR',
                        'LEFT', 'RIGHT', 'INNER', 'OUTER', 'CROSS', 'FULL', 'JOIN'}
            if table.upper() in keywords:
                return match.group(0)

            qualified = self._get_table_path(table)
            return f"{prefix}{qualified}{suffix}"

        return re.sub(pattern, replace_table, sql)
```
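Tracing the rewrite: with `schema=Samples` in the connection URL, an unqualified FROM target is expanded through `_get_table_path`, while already-qualified or quoted names fall outside the pattern and pass through untouched:

```python
# db connected with dremio://...?schema=Samples
db._qualify_table_names("SELECT * FROM users WHERE id = 1")
# -> 'SELECT * FROM Samples.users WHERE id = 1'
db._qualify_table_names('SELECT * FROM "other".users WHERE id = 1')
# -> unchanged: the leading quote means the name is already qualified
```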
```python
    def execute_sql(self, sql: str, **kwargs) -> QueryResult:
        """
        Execute a raw SQL query against Dremio.

        If a default schema is configured in the connection URL, unqualified
        table names in FROM/JOIN clauses will be automatically qualified.

        :param sql: SQL query string
        :param kwargs: Additional arguments
        :return: QueryResult containing the results
        """
        sql = self._qualify_table_names(sql)
        logger.debug(f"Qualified SQL: {sql}")
        result = self._execute_query(sql)
        df = result.to_pandas()
        return QueryResult(num_rows=len(df), rows=df.to_dict("records"))

    def _needs_quoting(self, identifier: str) -> bool:
        """Check if an identifier needs quoting in SQL.

        Identifiers need quoting if they contain special characters
        like hyphens, spaces, or start with a digit.
        """
        if not identifier:
            return False
        # Needs quoting if contains non-alphanumeric/underscore or starts with digit
        if not identifier[0].isalpha() and identifier[0] != "_":
            return True
        return not all(c.isalnum() or c == "_" for c in identifier)

    def _quote_if_needed(self, identifier: str) -> str:
        """Quote an identifier if it contains special characters."""
        if self._needs_quoting(identifier):
            return f'"{identifier}"'
        return identifier

    def _get_table_path(self, table_name: str) -> str:
        """Get the full table path including schema if configured.

        Args:
            table_name: Table name.

        Returns:
            Full table path.
        """
        default_schema = self._connection_info.get("default_schema")
        path = self._connection_info.get("path")

        if "." in table_name or '"' in table_name:
            # Already qualified
            return table_name

        if default_schema:
            # Schema like "gold-db-2 postgresql.gold" needs proper quoting
            # Split into source and schema.table parts
            parts = default_schema.split(".")
            if len(parts) >= 2:
                # Source name may have spaces/hyphens - quote if needed
                source = self._quote_if_needed(parts[0])
                schema = ".".join(parts[1:])
                return f'{source}.{schema}.{table_name}'
            else:
                return f'{self._quote_if_needed(default_schema)}.{table_name}'
        elif path:
            return f'"{path}"."{table_name}"'
        else:
            return f'"{table_name}"'
```
```python
    def _table_exists(self, table_name: str) -> bool:
        """Check if a table exists in Dremio.

        Args:
            table_name: Name of the table to check.

        Returns:
            True if table exists.
        """
        try:
            # Try to get table info by querying with LIMIT 0
            full_path = self._get_table_path(table_name)
            sql = f"SELECT * FROM {full_path} LIMIT 0"
            self._execute_query(sql)
            return True
        except Exception:
            return False

    def init_collections(self):
        """Initialize collections from Dremio tables.

        This queries the INFORMATION_SCHEMA to discover available tables.
        """
        if self._collections is None:
            self._collections = {}

        try:
            # Query information schema for tables
            path = self._connection_info.get("path")
            default_schema = self._connection_info.get("default_schema")

            # Build query for tables
            sql = """
                SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE
                FROM INFORMATION_SCHEMA."TABLES"
                WHERE TABLE_TYPE IN ('TABLE', 'VIEW')
            """

            if path:
                sql += f" AND TABLE_SCHEMA = '{path}'"
            elif default_schema:
                sql += f" AND TABLE_SCHEMA = '{default_schema}'"

            result = self._execute_query(sql)

            for i in range(result.num_rows):
                schema_name = result.column("TABLE_SCHEMA")[i].as_py()
                table_name = result.column("TABLE_NAME")[i].as_py()

                # Use simple name if in default schema, otherwise qualified name
                if schema_name in (path, default_schema):
                    collection_name = table_name
                else:
                    collection_name = f"{schema_name}.{table_name}"

                if collection_name not in self._collections:
                    collection = DremioCollection(name=collection_name, parent=self)
                    collection.metadata.is_prepopulated = True
                    self._collections[collection_name] = collection

            logger.info(f"Discovered {len(self._collections)} tables in Dremio")

        except Exception as e:
            logger.warning(f"Could not query INFORMATION_SCHEMA: {e}")
            # Collections will be created on-demand

    def _detect_source_type(self, source_name: str) -> Optional[str]:
        """Detect the type of a Dremio data source.

        Args:
            source_name: Name of the source (e.g., 'gold-db-2 postgresql').

        Returns:
            Source type ('postgresql', 'mysql', 'mongodb', etc.) or None.
        """
        source_lower = source_name.lower()
        if "postgresql" in source_lower or "postgres" in source_lower:
            return "postgresql"
        elif "mysql" in source_lower:
            return "mysql"
        elif "mongo" in source_lower:
            return "mongodb"
        elif "iceberg" in source_lower:
            return "iceberg"
        elif "hive" in source_lower:
            return "hive"
        return None

    def _get_source_from_schema(self, schema_name: str) -> Tuple[Optional[str], Optional[str]]:
        """Extract source name and schema from a full schema path.

        Args:
            schema_name: Full schema path like 'gold-db-2 postgresql.gold'.

        Returns:
            Tuple of (source_name, schema_within_source).
        """
        if "." in schema_name:
            parts = schema_name.split(".", 1)
            return parts[0], parts[1] if len(parts) > 1 else None
        return schema_name, None

    def get_foreign_keys(
        self, schema_name: Optional[str] = None, table_name: Optional[str] = None
    ) -> List[ForeignKeyInfo]:
        """Get foreign key constraints from PostgreSQL sources via pg_catalog.

        This method queries PostgreSQL's pg_catalog.pg_constraint to retrieve
        foreign key information. Only works for PostgreSQL-backed sources.

        Args:
            schema_name: Full schema path (e.g., 'gold-db-2 postgresql.gold').
                If None, uses the default schema from connection.
            table_name: Optional table name to filter results.

        Returns:
            List of ForeignKeyInfo objects describing FK relationships.

        Example:
            >>> db = DremioDatabase("dremio://lakehouse:32010")
            >>> fks = db.get_foreign_keys("gold-db-2 postgresql.gold")
            >>> for fk in fks[:3]:
            ...     print(f"{fk.source_table}.{fk.source_columns} -> {fk.target_table}")
        """
        if schema_name is None:
            schema_name = self._connection_info.get("default_schema") or self._connection_info.get("path")

        if not schema_name:
            logger.warning("No schema specified for FK introspection")
            return []

        source_name, pg_schema = self._get_source_from_schema(schema_name)
        source_type = self._detect_source_type(source_name)

        if source_type != "postgresql":
            logger.info(f"FK introspection only supported for PostgreSQL sources, not {source_type}")
            return []

        # Query FK constraints from pg_catalog
        fk_sql = f'''
            SELECT
                con.conname as constraint_name,
                src_class.relname as source_table,
                tgt_class.relname as target_table,
                con.conkey as source_col_nums,
                con.confkey as target_col_nums,
                con.conrelid as source_oid,
                con.confrelid as target_oid
            FROM "{source_name}".pg_catalog.pg_constraint con
            JOIN "{source_name}".pg_catalog.pg_class src_class ON con.conrelid = src_class.oid
            JOIN "{source_name}".pg_catalog.pg_class tgt_class ON con.confrelid = tgt_class.oid
            JOIN "{source_name}".pg_catalog.pg_namespace nsp ON src_class.relnamespace = nsp.oid
            WHERE con.contype = 'f'
        '''

        if pg_schema:
            fk_sql += f" AND nsp.nspname = '{pg_schema}'"
        if table_name:
            fk_sql += f" AND src_class.relname = '{table_name}'"

        # Query column info for resolving column numbers to names
        col_sql = f'''
            SELECT
                c.oid as table_oid,
                a.attnum as col_num,
                a.attname as col_name
            FROM "{source_name}".pg_catalog.pg_class c
            JOIN "{source_name}".pg_catalog.pg_attribute a ON a.attrelid = c.oid
            JOIN "{source_name}".pg_catalog.pg_namespace nsp ON c.relnamespace = nsp.oid
            WHERE a.attnum > 0 AND NOT a.attisdropped
        '''

        if pg_schema:
            col_sql += f" AND nsp.nspname = '{pg_schema}'"

        try:
            fk_result = self._execute_query(fk_sql)
            col_result = self._execute_query(col_sql)

            # Build column lookup: (table_oid, col_num) -> col_name
            col_df = col_result.to_pandas()
            col_lookup = {}
            for _, row in col_df.iterrows():
                key = (row["table_oid"], row["col_num"])
                col_lookup[key] = row["col_name"]

            # Build FK info list
            fk_df = fk_result.to_pandas()
            fk_list = []

            for _, fk in fk_df.iterrows():
                # Parse array strings like '{1}' or '{1,2}'
                src_nums = [int(x) for x in str(fk["source_col_nums"]).strip("{}").split(",") if x]
                tgt_nums = [int(x) for x in str(fk["target_col_nums"]).strip("{}").split(",") if x]

                src_cols = [col_lookup.get((fk["source_oid"], n), f"col_{n}") for n in src_nums]
                tgt_cols = [col_lookup.get((fk["target_oid"], n), f"col_{n}") for n in tgt_nums]

                fk_list.append(
                    ForeignKeyInfo(
                        constraint_name=fk["constraint_name"],
                        source_table=fk["source_table"],
                        source_columns=src_cols,
                        target_table=fk["target_table"],
                        target_columns=tgt_cols,
                        source_schema=pg_schema,
                        target_schema=pg_schema,  # Assumes same schema
                    )
                )

            logger.info(f"Found {len(fk_list)} foreign key constraints in {schema_name}")
            return fk_list

        except Exception as e:
            logger.warning(f"Could not retrieve FK constraints: {e}")
            return []
```
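For a PostgreSQL-backed source the result is a flat list of `ForeignKeyInfo` records; the table and column names below are hypothetical, but the shape follows the dataclass defined at the top of the module:

```python
fks = db.get_foreign_keys("gold-db-2 postgresql.gold")
# e.g. [ForeignKeyInfo(constraint_name='orders_customer_id_fkey',
#                      source_table='orders', source_columns=['customer_id'],
#                      target_table='customers', target_columns=['id'],
#                      source_schema='gold', target_schema='gold')]
```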
```python
    def get_table_comments(
        self, schema_name: Optional[str] = None, table_name: Optional[str] = None
    ) -> Dict[str, TableInfo]:
        """Get table and column comments from PostgreSQL sources via pg_description.

        Args:
            schema_name: Full schema path (e.g., 'gold-db-2 postgresql.gold').
            table_name: Optional table name to filter results.

        Returns:
            Dictionary mapping table names to TableInfo with comments.
        """
        if schema_name is None:
            schema_name = self._connection_info.get("default_schema") or self._connection_info.get("path")

        if not schema_name:
            return {}

        source_name, pg_schema = self._get_source_from_schema(schema_name)
        source_type = self._detect_source_type(source_name)

        if source_type != "postgresql":
            logger.info(f"Comment introspection only supported for PostgreSQL sources, not {source_type}")
            return {}

        sql = f'''
            SELECT
                c.relname as table_name,
                a.attname as column_name,
                a.attnum as col_num,
                td.description as table_comment,
                cd.description as column_comment
            FROM "{source_name}".pg_catalog.pg_class c
            JOIN "{source_name}".pg_catalog.pg_namespace nsp ON c.relnamespace = nsp.oid
            LEFT JOIN "{source_name}".pg_catalog.pg_attribute a
                ON a.attrelid = c.oid AND a.attnum > 0 AND NOT a.attisdropped
            LEFT JOIN "{source_name}".pg_catalog.pg_description td
                ON td.objoid = c.oid AND td.objsubid = 0
            LEFT JOIN "{source_name}".pg_catalog.pg_description cd
                ON cd.objoid = c.oid AND cd.objsubid = a.attnum
            WHERE c.relkind IN ('r', 'v')
        '''

        if pg_schema:
            sql += f" AND nsp.nspname = '{pg_schema}'"
        if table_name:
            sql += f" AND c.relname = '{table_name}'"

        sql += " ORDER BY c.relname, a.attnum"

        try:
            result = self._execute_query(sql)
            df = result.to_pandas()

            tables = {}
            for tbl_name in df["table_name"].unique():
                tbl_df = df[df["table_name"] == tbl_name]
                table_comment = tbl_df["table_comment"].iloc[0] if not tbl_df["table_comment"].isna().all() else None

                columns = []
                for _, row in tbl_df.iterrows():
                    if row["column_name"]:
                        columns.append(
                            ColumnInfo(
                                name=row["column_name"],
                                data_type="",  # Not fetched here
                                comment=row["column_comment"] if not pd.isna(row["column_comment"]) else None,
                                ordinal_position=int(row["col_num"]) if row["col_num"] else 0,
                            )
                        )

                tables[tbl_name] = TableInfo(
                    name=tbl_name, schema_name=pg_schema, comment=table_comment, columns=columns
                )

            return tables

        except Exception as e:
            logger.warning(f"Could not retrieve table comments: {e}")
            return {}
```
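A usage sketch, assuming a hypothetical `orders` table that carries `COMMENT ON` metadata in the underlying PostgreSQL database:

```python
info = db.get_table_comments("gold-db-2 postgresql.gold", table_name="orders")
# {'orders': TableInfo(name='orders', schema_name='gold',
#                      comment='Customer orders',
#                      columns=[ColumnInfo(name='id', data_type='',
#                                          comment='Primary key', ...), ...])}
```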
```python
    def get_nested_schema(self, table_path: str) -> Dict[str, Any]:
        """Get full schema including nested types by querying with LIMIT 0.

        For complex types (ARRAY, STRUCT/ROW), the metadata methods don't
        return nested field information. This method executes a LIMIT 0
        query to get the full Arrow schema with nested structure.

        Args:
            table_path: Full table path (e.g., '"schema".table').

        Returns:
            Dictionary with column names and their Arrow type info.
        """
        sql = f"SELECT * FROM {table_path} LIMIT 0"

        try:
            result = self._execute_query(sql)
            schema_info = {}

            for field in result.schema:
                type_str = str(field.type)
                field_info = {
                    "name": field.name,
                    "arrow_type": type_str,
                    "nullable": field.nullable,
                }

                # Parse nested structure for struct types
                if type_str.startswith("struct<"):
                    field_info["nested_fields"] = []
                    for nested in field.type:
                        field_info["nested_fields"].append(
                            {"name": nested.name, "arrow_type": str(nested.type), "nullable": nested.nullable}
                        )

                # Parse list element type
                if hasattr(field.type, "value_type"):
                    field_info["element_type"] = str(field.type.value_type)

                schema_info[field.name] = field_info

            return schema_info

        except Exception as e:
            logger.warning(f"Could not get nested schema for {table_path}: {e}")
            return {}
```
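For a table with a STRUCT column, the returned dictionary might look like this (table and column names are hypothetical; the shape follows the code above):

```python
db.get_nested_schema('"Samples".events')
# {'id':      {'name': 'id', 'arrow_type': 'int64', 'nullable': True},
#  'payload': {'name': 'payload',
#              'arrow_type': 'struct<kind: string, ts: timestamp[ms]>',
#              'nullable': True,
#              'nested_fields': [
#                  {'name': 'kind', 'arrow_type': 'string', 'nullable': True},
#                  {'name': 'ts', 'arrow_type': 'timestamp[ms]', 'nullable': True}]}}
```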
```python
    def induce_schema_view(self, include_foreign_keys: bool = True) -> SchemaView:
        """Induce a schema view from Dremio table structures.

        Args:
            include_foreign_keys: If True, attempt to retrieve FK info from
                PostgreSQL sources and add relationships.

        Returns:
            SchemaView representing the database schema.
        """
        logger.info(f"Inducing schema view for {self.metadata.handle}")
        sb = SchemaBuilder()

        # Ensure collections are initialized
        if not self._collections:
            self.init_collections()

        path = self._connection_info.get("path")
        default_schema = self._connection_info.get("default_schema")

        try:
            # Query columns from INFORMATION_SCHEMA
            sql = """
                SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE,
                       IS_NULLABLE, ORDINAL_POSITION
                FROM INFORMATION_SCHEMA."COLUMNS"
            """

            if path:
                sql += f" WHERE TABLE_SCHEMA = '{path}'"
            elif default_schema:
                sql += f" WHERE TABLE_SCHEMA = '{default_schema}'"

            sql += " ORDER BY TABLE_SCHEMA, TABLE_NAME, ORDINAL_POSITION"

            result = self._execute_query(sql)

            # Group columns by table
            current_table = None
            for i in range(result.num_rows):
                schema_name = result.column("TABLE_SCHEMA")[i].as_py()
                table_name = result.column("TABLE_NAME")[i].as_py()
                column_name = result.column("COLUMN_NAME")[i].as_py()
                data_type = result.column("DATA_TYPE")[i].as_py()
                is_nullable = result.column("IS_NULLABLE")[i].as_py()

                # Get class name
                if schema_name in (path, default_schema):
                    class_name = table_name
                else:
                    class_name = f"{schema_name}_{table_name}"

                # Add class if new
                if class_name != current_table:
                    sb.add_class(class_name)
                    current_table = class_name

                # Map Dremio type to LinkML type
                from linkml_store.api.stores.dremio.mappings import DREMIO_SQL_TO_LINKML

                # Extract base type (before any parentheses)
                base_type = re.split(r"[\(\[]", data_type)[0].upper()
                linkml_type = DREMIO_SQL_TO_LINKML.get(base_type, "string")

                # Create slot definition
                sd = SlotDefinition(column_name, required=is_nullable == "NO", range=linkml_type)
                sb.schema.classes[class_name].attributes[sd.name] = sd
                logger.debug(f"Introspected slot: {class_name}.{sd.name}: {sd.range}")

        except Exception as e:
            logger.warning(f"Could not introspect schema from INFORMATION_SCHEMA: {e}")

        # Add foreign key relationships
        if include_foreign_keys:
            schema_to_use = path or default_schema
            if schema_to_use:
                fks = self.get_foreign_keys(schema_to_use)
                for fk in fks:
                    # Get or derive class names
                    src_class = fk.source_table
                    tgt_class = fk.target_table

                    # Skip if classes don't exist
                    if src_class not in sb.schema.classes or tgt_class not in sb.schema.classes:
                        continue

                    # For single-column FKs, update the slot's range to point to target class
                    if len(fk.source_columns) == 1:
                        src_col = fk.source_columns[0]
                        if src_col in sb.schema.classes[src_class].attributes:
                            slot = sb.schema.classes[src_class].attributes[src_col]
                            # Set range to target class, indicating a relationship
                            slot.range = tgt_class
                            slot.description = f"Foreign key to {tgt_class}"
                            logger.debug(f"Added FK relationship: {src_class}.{src_col} -> {tgt_class}")

        sb.add_defaults()
        return SchemaView(sb.schema)
```
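The induced `SchemaView` can be serialized with the standard linkml-runtime dumper; a short sketch:

```python
from linkml_runtime.dumpers import yaml_dumper

sv = db.induce_schema_view()
print(yaml_dumper.dumps(sv.schema))  # the induced LinkML schema as YAML
```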
```python
    def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
        """Export database to a file.

        Args:
            location: Output file path.
            target_format: Output format.
            **kwargs: Additional arguments.
        """
        # Use default export logic from parent
        super().export_database(location, target_format=target_format, **kwargs)

    def import_database(self, location: str, source_format: Optional[str] = None, **kwargs):
        """Import data into Dremio.

        Note: Direct import is limited in Dremio. Data typically needs to be
        loaded through Dremio's data sources or uploaded to connected storage.

        Args:
            location: Source file path.
            source_format: Source format.
            **kwargs: Additional arguments.
        """
        super().import_database(location, source_format=source_format, **kwargs)
```