linkml-store 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. linkml_store/__init__.py +7 -0
  2. linkml_store/api/__init__.py +8 -0
  3. linkml_store/api/client.py +414 -0
  4. linkml_store/api/collection.py +1280 -0
  5. linkml_store/api/config.py +187 -0
  6. linkml_store/api/database.py +862 -0
  7. linkml_store/api/queries.py +69 -0
  8. linkml_store/api/stores/__init__.py +0 -0
  9. linkml_store/api/stores/chromadb/__init__.py +7 -0
  10. linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
  11. linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
  12. linkml_store/api/stores/dremio/__init__.py +10 -0
  13. linkml_store/api/stores/dremio/dremio_collection.py +555 -0
  14. linkml_store/api/stores/dremio/dremio_database.py +1052 -0
  15. linkml_store/api/stores/dremio/mappings.py +105 -0
  16. linkml_store/api/stores/dremio_rest/__init__.py +11 -0
  17. linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
  18. linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
  19. linkml_store/api/stores/duckdb/__init__.py +16 -0
  20. linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
  21. linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
  22. linkml_store/api/stores/duckdb/mappings.py +8 -0
  23. linkml_store/api/stores/filesystem/__init__.py +15 -0
  24. linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
  25. linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
  26. linkml_store/api/stores/hdf5/__init__.py +7 -0
  27. linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
  28. linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
  29. linkml_store/api/stores/ibis/__init__.py +5 -0
  30. linkml_store/api/stores/ibis/ibis_collection.py +488 -0
  31. linkml_store/api/stores/ibis/ibis_database.py +328 -0
  32. linkml_store/api/stores/mongodb/__init__.py +25 -0
  33. linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
  34. linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
  35. linkml_store/api/stores/neo4j/__init__.py +0 -0
  36. linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
  37. linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
  38. linkml_store/api/stores/solr/__init__.py +3 -0
  39. linkml_store/api/stores/solr/solr_collection.py +224 -0
  40. linkml_store/api/stores/solr/solr_database.py +83 -0
  41. linkml_store/api/stores/solr/solr_utils.py +0 -0
  42. linkml_store/api/types.py +4 -0
  43. linkml_store/cli.py +1147 -0
  44. linkml_store/constants.py +7 -0
  45. linkml_store/graphs/__init__.py +0 -0
  46. linkml_store/graphs/graph_map.py +24 -0
  47. linkml_store/index/__init__.py +53 -0
  48. linkml_store/index/implementations/__init__.py +0 -0
  49. linkml_store/index/implementations/llm_indexer.py +174 -0
  50. linkml_store/index/implementations/simple_indexer.py +43 -0
  51. linkml_store/index/indexer.py +211 -0
  52. linkml_store/inference/__init__.py +13 -0
  53. linkml_store/inference/evaluation.py +195 -0
  54. linkml_store/inference/implementations/__init__.py +0 -0
  55. linkml_store/inference/implementations/llm_inference_engine.py +154 -0
  56. linkml_store/inference/implementations/rag_inference_engine.py +276 -0
  57. linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
  58. linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
  59. linkml_store/inference/inference_config.py +66 -0
  60. linkml_store/inference/inference_engine.py +209 -0
  61. linkml_store/inference/inference_engine_registry.py +74 -0
  62. linkml_store/plotting/__init__.py +5 -0
  63. linkml_store/plotting/cli.py +826 -0
  64. linkml_store/plotting/dimensionality_reduction.py +453 -0
  65. linkml_store/plotting/embedding_plot.py +489 -0
  66. linkml_store/plotting/facet_chart.py +73 -0
  67. linkml_store/plotting/heatmap.py +383 -0
  68. linkml_store/utils/__init__.py +0 -0
  69. linkml_store/utils/change_utils.py +17 -0
  70. linkml_store/utils/dat_parser.py +95 -0
  71. linkml_store/utils/embedding_matcher.py +424 -0
  72. linkml_store/utils/embedding_utils.py +299 -0
  73. linkml_store/utils/enrichment_analyzer.py +217 -0
  74. linkml_store/utils/file_utils.py +37 -0
  75. linkml_store/utils/format_utils.py +550 -0
  76. linkml_store/utils/io.py +38 -0
  77. linkml_store/utils/llm_utils.py +122 -0
  78. linkml_store/utils/mongodb_utils.py +145 -0
  79. linkml_store/utils/neo4j_utils.py +42 -0
  80. linkml_store/utils/object_utils.py +190 -0
  81. linkml_store/utils/pandas_utils.py +93 -0
  82. linkml_store/utils/patch_utils.py +126 -0
  83. linkml_store/utils/query_utils.py +89 -0
  84. linkml_store/utils/schema_utils.py +23 -0
  85. linkml_store/utils/sklearn_utils.py +193 -0
  86. linkml_store/utils/sql_utils.py +177 -0
  87. linkml_store/utils/stats_utils.py +53 -0
  88. linkml_store/utils/vector_utils.py +158 -0
  89. linkml_store/webapi/__init__.py +0 -0
  90. linkml_store/webapi/html/__init__.py +3 -0
  91. linkml_store/webapi/html/base.html.j2 +24 -0
  92. linkml_store/webapi/html/collection_details.html.j2 +15 -0
  93. linkml_store/webapi/html/database_details.html.j2 +16 -0
  94. linkml_store/webapi/html/databases.html.j2 +14 -0
  95. linkml_store/webapi/html/generic.html.j2 +43 -0
  96. linkml_store/webapi/main.py +855 -0
  97. linkml_store-0.3.0.dist-info/METADATA +226 -0
  98. linkml_store-0.3.0.dist-info/RECORD +101 -0
  99. linkml_store-0.3.0.dist-info/WHEEL +4 -0
  100. linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
  101. linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
@@ -0,0 +1,1052 @@
1
+ """Dremio database adapter using Arrow Flight SQL.
2
+
3
+ This module provides a Database implementation for connecting to Dremio
4
+ data lakehouse using the Arrow Flight SQL protocol for high-performance
5
+ data access.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import re
11
+ from dataclasses import dataclass, field
12
+ from typing import Any, Dict, List, Optional, Tuple, Union
13
+ from urllib.parse import parse_qs, urlparse
14
+
15
+ import pandas as pd
16
+ from linkml_runtime import SchemaView
17
+ from linkml_runtime.linkml_model import ClassDefinition, SlotDefinition
18
+ from linkml_runtime.utils.schema_builder import SchemaBuilder
19
+
20
+ from linkml_store.api import Database
21
+ from linkml_store.api.queries import Query, QueryResult
22
+ from linkml_store.api.stores.dremio.dremio_collection import DremioCollection
23
+ from linkml_store.api.stores.dremio.mappings import get_linkml_type_from_arrow
24
+ from linkml_store.utils.format_utils import Format
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
@dataclass
class ForeignKeyInfo:
    """Information about a foreign key constraint."""

    # Constraint name as reported by the source catalog (e.g. pg_constraint.conname).
    constraint_name: str
    # Table that declares the foreign key.
    source_table: str
    # Referencing column names, in constraint order.
    source_columns: List[str]
    # Table the foreign key points at.
    target_table: str
    # Referenced column names, in constraint order.
    target_columns: List[str]
    # Optional schema qualifiers; get_foreign_keys fills both with the
    # same schema (cross-schema FKs are not distinguished).
    source_schema: Optional[str] = None
    target_schema: Optional[str] = None
40
+
41
+
42
@dataclass
class ColumnInfo:
    """Information about a column including comments and nested structure."""

    name: str
    # Data type string as reported by the source.
    data_type: str
    is_nullable: bool = True
    # Column comment/description, when the source provides one.
    comment: Optional[str] = None
    # Position of the column within the table; defaults to 0.
    ordinal_position: int = 0
    # Sub-fields for nested/complex column types (empty for scalars).
    nested_fields: List["ColumnInfo"] = field(default_factory=list)
52
+
53
+
54
@dataclass
class TableInfo:
    """Information about a table including comments."""

    name: str
    # Schema the table lives in, when known.
    schema_name: Optional[str] = None
    # Table comment/description, when the source provides one.
    comment: Optional[str] = None
    # Per-column metadata for this table.
    columns: List[ColumnInfo] = field(default_factory=list)
62
+
63
+
64
class DremioDatabase(Database):
    """
    An adapter for Dremio data lakehouse using Arrow Flight SQL.

    This adapter connects to Dremio using the Arrow Flight SQL protocol,
    which provides high-performance data transfer using Apache Arrow.

    Handle format:
        dremio://[username:password@]host[:port][/path][?params]

    Examples:
        - dremio://localhost:32010
        - dremio://user:pass@localhost:32010
        - dremio://localhost:32010?useEncryption=false
        - dremio://user:pass@dremio.example.com:32010/Samples

    Parameters (query string):
        - useEncryption: Whether to use TLS (default: true)
        - disableCertificateVerification: Skip cert verification (default: false)
        - schema: Default schema/space to use
        - username_env: Environment variable for username (default: DREMIO_USER)
        - password_env: Environment variable for password (default: DREMIO_PASSWORD)

    Environment variables:
        - DREMIO_USER: Default username (if not in URL)
        - DREMIO_PASSWORD: Default password (if not in URL)

    Note:
        Requires pyarrow with Flight SQL support. Install with:
        pip install pyarrow
    """

    # Lazily created pyarrow FlightClient (see the ``flight_client`` property).
    _flight_client = None
    # Lazily created ADBC Flight SQL connection (see ``adbc_connection``).
    _adbc_connection = None
    # Parsed connection parameters from the handle; populated in __init__.
    _connection_info: Optional[dict] = None
    # Collection class used for tables discovered in this database.
    collection_class = DremioCollection
100
+
101
+ def __init__(
102
+ self,
103
+ handle: Optional[str] = None,
104
+ recreate_if_exists: bool = False,
105
+ username: Optional[str] = None,
106
+ password: Optional[str] = None,
107
+ **kwargs,
108
+ ):
109
+ """Initialize a Dremio database connection.
110
+
111
+ Args:
112
+ handle: Connection string in format dremio://host:port
113
+ recreate_if_exists: Not applicable for Dremio (ignored)
114
+ username: Optional username (can also be in handle)
115
+ password: Optional password (can also be in handle)
116
+ **kwargs: Additional arguments passed to parent
117
+ """
118
+ if handle is None:
119
+ handle = "dremio://localhost:32010"
120
+
121
+ self._connection_info = self._parse_handle(handle)
122
+
123
+ # Override with explicit credentials if provided
124
+ if username:
125
+ self._connection_info["username"] = username
126
+ if password:
127
+ self._connection_info["password"] = password
128
+
129
+ super().__init__(handle=handle, **kwargs)
130
+
131
+ def _parse_handle(self, handle: str) -> dict:
132
+ """Parse a Dremio connection handle.
133
+
134
+ Args:
135
+ handle: Connection string like dremio://user:pass@host:port/path?params
136
+
137
+ Returns:
138
+ Dictionary with connection parameters.
139
+ """
140
+ # Ensure scheme is present
141
+ if not handle.startswith("dremio://"):
142
+ handle = f"dremio://{handle}"
143
+
144
+ parsed = urlparse(handle)
145
+
146
+ # Parse query parameters
147
+ params = parse_qs(parsed.query)
148
+
149
+ # Extract single values from query params
150
+ use_encryption = params.get("useEncryption", ["true"])[0].lower() == "true"
151
+ disable_cert_verify = params.get("disableCertificateVerification", ["false"])[0].lower() == "true"
152
+ default_schema = params.get("schema", [None])[0]
153
+
154
+ # Get env var names (configurable via query params)
155
+ username_env = params.get("username_env", ["DREMIO_USER"])[0]
156
+ password_env = params.get("password_env", ["DREMIO_PASSWORD"])[0]
157
+
158
+ # Get credentials from URL or environment variables
159
+ username = parsed.username or os.environ.get(username_env)
160
+ password = parsed.password or os.environ.get(password_env)
161
+
162
+ return {
163
+ "host": parsed.hostname or "localhost",
164
+ "port": parsed.port or 32010,
165
+ "username": username,
166
+ "password": password,
167
+ "path": parsed.path.lstrip("/") if parsed.path else None,
168
+ "use_encryption": use_encryption,
169
+ "disable_cert_verify": disable_cert_verify,
170
+ "default_schema": default_schema,
171
+ }
172
+
173
+ @property
174
+ def flight_client(self):
175
+ """Get or create the Arrow Flight SQL client.
176
+
177
+ Returns:
178
+ FlightSqlClient connected to Dremio.
179
+
180
+ Raises:
181
+ ImportError: If pyarrow is not installed.
182
+ ConnectionError: If connection to Dremio fails.
183
+ """
184
+ if self._flight_client is None:
185
+ try:
186
+ import pyarrow.flight as flight
187
+ except ImportError as e:
188
+ raise ImportError(
189
+ "pyarrow with Flight support is required for Dremio adapter. "
190
+ "Install with: pip install pyarrow"
191
+ ) from e
192
+
193
+ info = self._connection_info
194
+ host = info["host"]
195
+ port = info["port"]
196
+
197
+ # Build location
198
+ if info["use_encryption"]:
199
+ location = f"grpc+tls://{host}:{port}"
200
+ else:
201
+ location = f"grpc://{host}:{port}"
202
+
203
+ logger.info(f"Connecting to Dremio at {location}")
204
+
205
+ # Build connection options
206
+ client_options = []
207
+
208
+ if info["disable_cert_verify"]:
209
+ client_options.append(("disable_server_verification", "true"))
210
+
211
+ try:
212
+ client = flight.FlightClient(location)
213
+
214
+ # Authenticate if credentials provided
215
+ if info["username"] and info["password"]:
216
+ # Get auth token using basic auth
217
+ bearer_token = self._authenticate(client, info["username"], info["password"])
218
+ # Store token for subsequent requests
219
+ self._bearer_token = bearer_token
220
+ else:
221
+ self._bearer_token = None
222
+
223
+ self._flight_client = client
224
+
225
+ except Exception as e:
226
+ raise ConnectionError(f"Failed to connect to Dremio at {location}: {e}") from e
227
+
228
+ return self._flight_client
229
+
230
+ def _authenticate(self, client, username: str, password: str) -> bytes:
231
+ """Authenticate with Dremio and get bearer token.
232
+
233
+ Args:
234
+ client: Flight client
235
+ username: Dremio username
236
+ password: Dremio password
237
+
238
+ Returns:
239
+ Bearer token for subsequent requests.
240
+ """
241
+ import pyarrow.flight as flight
242
+
243
+ # Use basic authentication
244
+ auth_handler = flight.BasicAuth(username, password)
245
+ token_pair = client.authenticate_basic_token(username, password)
246
+ return token_pair[1] # Return the token value
247
+
248
+ def _get_call_options(self):
249
+ """Get Flight call options with authentication headers.
250
+
251
+ Returns:
252
+ FlightCallOptions with bearer token if authenticated.
253
+ """
254
+ import pyarrow.flight as flight
255
+
256
+ if hasattr(self, "_bearer_token") and self._bearer_token:
257
+ return flight.FlightCallOptions(headers=[(b"authorization", self._bearer_token)])
258
+ return flight.FlightCallOptions()
259
+
260
+ def _execute_query(self, sql: str) -> "pyarrow.Table":
261
+ """Execute a SQL query and return results as Arrow Table.
262
+
263
+ Uses ADBC Flight SQL driver when available (faster), falls back to
264
+ raw Flight RPC otherwise.
265
+
266
+ Args:
267
+ sql: SQL query string.
268
+
269
+ Returns:
270
+ PyArrow Table with query results.
271
+ """
272
+ logger.debug(f"Executing SQL: {sql}")
273
+
274
+ # Try ADBC first (much faster for Flight SQL)
275
+ try:
276
+ return self._execute_query_adbc(sql)
277
+ except ImportError:
278
+ logger.debug("ADBC not available, falling back to raw Flight")
279
+ return self._execute_query_flight(sql)
280
+
281
+ @property
282
+ def adbc_connection(self):
283
+ """Get or create cached ADBC Flight SQL connection.
284
+
285
+ Returns:
286
+ ADBC connection to Dremio.
287
+
288
+ Raises:
289
+ ImportError: If ADBC driver is not installed.
290
+ """
291
+ if self._adbc_connection is None:
292
+ import adbc_driver_flightsql.dbapi as flightsql
293
+
294
+ info = self._connection_info
295
+ host = info["host"]
296
+ port = info["port"]
297
+
298
+ # Build URI
299
+ if info["use_encryption"]:
300
+ uri = f"grpc+tls://{host}:{port}"
301
+ else:
302
+ uri = f"grpc://{host}:{port}"
303
+
304
+ # Build connection kwargs
305
+ connect_kwargs = {"uri": uri}
306
+
307
+ # Add auth if available
308
+ if info["username"] and info["password"]:
309
+ connect_kwargs["db_kwargs"] = {
310
+ "username": info["username"],
311
+ "password": info["password"],
312
+ }
313
+
314
+ logger.info(f"Establishing ADBC Flight SQL connection to {uri}")
315
+ self._adbc_connection = flightsql.connect(**connect_kwargs)
316
+
317
+ return self._adbc_connection
318
+
319
+ def _execute_query_adbc(self, sql: str) -> "pyarrow.Table":
320
+ """Execute query using ADBC Flight SQL driver (fast path).
321
+
322
+ Args:
323
+ sql: SQL query string.
324
+
325
+ Returns:
326
+ PyArrow Table with query results.
327
+
328
+ Raises:
329
+ ImportError: If ADBC driver is not installed.
330
+ """
331
+ import pyarrow as pa
332
+
333
+ conn = self.adbc_connection
334
+ sql_upper = sql.strip().upper()
335
+
336
+ # Handle context-setting statements (USE, SET, ALTER SESSION, etc.)
337
+ # These don't return meaningful results but set session state
338
+ if sql_upper.startswith(("USE ", "SET ", "ALTER SESSION")):
339
+ with conn.cursor() as cursor:
340
+ cursor.execute(sql)
341
+ # Don't try to fetch results - just execute for side effect
342
+ return pa.table({})
343
+
344
+ with conn.cursor() as cursor:
345
+ cursor.execute(sql)
346
+ return cursor.fetch_arrow_table()
347
+
348
+ def _execute_query_flight(self, sql: str) -> "pyarrow.Table":
349
+ """Execute query using raw Flight RPC (fallback path).
350
+
351
+ Args:
352
+ sql: SQL query string.
353
+
354
+ Returns:
355
+ PyArrow Table with query results.
356
+ """
357
+ import pyarrow.flight as flight
358
+
359
+ client = self.flight_client
360
+ options = self._get_call_options()
361
+
362
+ # Create Flight SQL command
363
+ flight_desc = flight.FlightDescriptor.for_command(sql.encode("utf-8"))
364
+
365
+ # Get flight info
366
+ try:
367
+ # For Dremio, we use the execute method directly
368
+ # Prepare the SQL statement
369
+ info = client.get_flight_info(flight_desc, options)
370
+
371
+ # Get the data from all endpoints
372
+ tables = []
373
+ for endpoint in info.endpoints:
374
+ reader = client.do_get(endpoint.ticket, options)
375
+ tables.append(reader.read_all())
376
+
377
+ if not tables:
378
+ import pyarrow as pa
379
+
380
+ return pa.table({})
381
+
382
+ # Concatenate all tables
383
+ import pyarrow as pa
384
+
385
+ return pa.concat_tables(tables)
386
+
387
+ except Exception as e:
388
+ logger.error(f"Query execution failed: {e}")
389
+ raise
390
+
391
    def _execute_update(self, sql: str) -> int:
        """Execute a SQL update/insert/delete statement.

        Args:
            sql: SQL statement.

        Returns:
            Number of affected rows (-1 if unknown).
        """
        import pyarrow.flight as flight

        logger.debug(f"Executing update: {sql}")

        client = self.flight_client
        options = self._get_call_options()

        try:
            # For DML statements, use do_action with a "query" action type.
            action = flight.Action("query", sql.encode("utf-8"))
            results = list(client.do_action(action, options))
            # Some servers return the affected-row count as a UTF-8 integer
            # in the first result body; otherwise report -1 (unknown).
            if results:
                try:
                    return int(results[0].body.to_pybytes().decode("utf-8"))
                except (ValueError, AttributeError):
                    pass
            return -1
        except Exception as e:
            logger.warning(f"Update execution failed, trying alternative method: {e}")
            # Some Dremio versions may not support do_action for DML.
            # Fall back to submitting the statement as a regular query;
            # the row count is still unknown in that case.
            try:
                self._execute_query(sql)
                return -1
            except Exception as e2:
                logger.error(f"Update failed: {e2}")
                raise
428
+
429
+ def commit(self, **kwargs):
430
+ """Commit pending changes.
431
+
432
+ Note: Dremio auto-commits, this is a no-op.
433
+ """
434
+ pass
435
+
436
+ def close(self, **kwargs):
437
+ """Close the Dremio connection."""
438
+ if self._flight_client:
439
+ self._flight_client.close()
440
+ self._flight_client = None
441
+ if self._adbc_connection:
442
+ self._adbc_connection.close()
443
+ self._adbc_connection = None
444
+
445
+ def drop(self, missing_ok=True, **kwargs):
446
+ """Drop the database.
447
+
448
+ Note: This is not supported for Dremio as it's typically a read/query layer.
449
+ Individual tables can be dropped if you have permissions.
450
+ """
451
+ self.close()
452
+ logger.warning("Dremio does not support dropping databases through this adapter")
453
+
454
+ def query(self, query: Query, **kwargs) -> QueryResult:
455
+ """Execute a query against Dremio.
456
+
457
+ Args:
458
+ query: Query object specifying the query parameters.
459
+ **kwargs: Additional arguments.
460
+
461
+ Returns:
462
+ QueryResult with matching rows.
463
+ """
464
+ from_table = query.from_table
465
+ if not from_table:
466
+ raise ValueError("Query must specify from_table")
467
+
468
+ # Check if collection exists
469
+ collection = self.get_collection(from_table, create_if_not_exists=False)
470
+ if collection:
471
+ return collection.query(query, **kwargs)
472
+ else:
473
+ return QueryResult(query=query, num_rows=0, rows=[])
474
+
475
+ @property
476
+ def supports_sql(self) -> bool:
477
+ """Return True - Dremio supports raw SQL queries."""
478
+ return True
479
+
480
    def _qualify_table_names(self, sql: str) -> str:
        """Qualify unqualified table names in SQL using configured schema.

        Handles FROM and JOIN clauses, qualifying table names that don't
        already contain dots or quotes.

        Args:
            sql: SQL query string.

        Returns:
            SQL with qualified table names.
        """
        default_schema = self._connection_info.get("default_schema")
        # Without a configured schema there is nothing to qualify.
        if not default_schema:
            return sql

        # Pattern matches FROM/JOIN followed by an unqualified table name
        # Unqualified = no dots, no quotes, just a simple identifier
        # Captures: (FROM|JOIN) (tablename) (optional: AS? alias | WHERE | ORDER | LIMIT | GROUP | ; | end)
        pattern = r'(?i)((?:FROM|JOIN)\s+)([a-zA-Z_][a-zA-Z0-9_]*)(\s+(?:AS\s+)?[a-zA-Z_][a-zA-Z0-9_]*|\s+(?:WHERE|ORDER|GROUP|LIMIT|HAVING|UNION|INTERSECT|EXCEPT|ON|LEFT|RIGHT|INNER|OUTER|CROSS|FULL|;)|$)'

        def replace_table(match):
            prefix = match.group(1)  # "FROM " or "JOIN "
            table = match.group(2)  # table name
            suffix = match.group(3)  # rest (alias, WHERE, etc.)

            # The second capture group can also match a SQL keyword (e.g. in
            # "FROM (SELECT ..." shapes); leave those untouched.
            keywords = {'WHERE', 'ORDER', 'GROUP', 'LIMIT', 'HAVING', 'UNION',
                        'INTERSECT', 'EXCEPT', 'SELECT', 'AS', 'ON', 'AND', 'OR',
                        'LEFT', 'RIGHT', 'INNER', 'OUTER', 'CROSS', 'FULL', 'JOIN'}
            if table.upper() in keywords:
                return match.group(0)

            # NOTE(review): regex-based rewriting cannot handle every SQL
            # shape (subqueries, CTEs, quoted identifiers); it only targets
            # the simple FROM/JOIN <identifier> case — confirm adequate for
            # the queries sent through this adapter.
            qualified = self._get_table_path(table)
            return f"{prefix}{qualified}{suffix}"

        return re.sub(pattern, replace_table, sql)
517
+
518
+ def execute_sql(self, sql: str, **kwargs) -> QueryResult:
519
+ """
520
+ Execute a raw SQL query against Dremio.
521
+
522
+ If a default schema is configured in the connection URL, unqualified
523
+ table names in FROM/JOIN clauses will be automatically qualified.
524
+
525
+ :param sql: SQL query string
526
+ :param kwargs: Additional arguments
527
+ :return: QueryResult containing the results
528
+ """
529
+ sql = self._qualify_table_names(sql)
530
+ logger.debug(f"Qualified SQL: {sql}")
531
+ result = self._execute_query(sql)
532
+ df = result.to_pandas()
533
+ return QueryResult(num_rows=len(df), rows=df.to_dict("records"))
534
+
535
+ def _needs_quoting(self, identifier: str) -> bool:
536
+ """Check if an identifier needs quoting in SQL.
537
+
538
+ Identifiers need quoting if they contain special characters
539
+ like hyphens, spaces, or start with a digit.
540
+ """
541
+ if not identifier:
542
+ return False
543
+ # Needs quoting if contains non-alphanumeric/underscore or starts with digit
544
+ if not identifier[0].isalpha() and identifier[0] != "_":
545
+ return True
546
+ return not all(c.isalnum() or c == "_" for c in identifier)
547
+
548
+ def _quote_if_needed(self, identifier: str) -> str:
549
+ """Quote an identifier if it contains special characters."""
550
+ if self._needs_quoting(identifier):
551
+ return f'"{identifier}"'
552
+ return identifier
553
+
554
+ def _get_table_path(self, table_name: str) -> str:
555
+ """Get the full table path including schema if configured.
556
+
557
+ Args:
558
+ table_name: Table name.
559
+
560
+ Returns:
561
+ Full table path.
562
+ """
563
+ default_schema = self._connection_info.get("default_schema")
564
+ path = self._connection_info.get("path")
565
+
566
+ if "." in table_name or '"' in table_name:
567
+ # Already qualified
568
+ return table_name
569
+
570
+ if default_schema:
571
+ # Schema like "gold-db-2 postgresql.gold" needs proper quoting
572
+ # Split into source and schema.table parts
573
+ parts = default_schema.split(".")
574
+ if len(parts) >= 2:
575
+ # Source name may have spaces/hyphens - quote if needed
576
+ source = self._quote_if_needed(parts[0])
577
+ schema = ".".join(parts[1:])
578
+ return f'{source}.{schema}.{table_name}'
579
+ else:
580
+ return f'{self._quote_if_needed(default_schema)}.{table_name}'
581
+ elif path:
582
+ return f'"{path}"."{table_name}"'
583
+ else:
584
+ return f'"{table_name}"'
585
+
586
+ def _table_exists(self, table_name: str) -> bool:
587
+ """Check if a table exists in Dremio.
588
+
589
+ Args:
590
+ table_name: Name of the table to check.
591
+
592
+ Returns:
593
+ True if table exists.
594
+ """
595
+ try:
596
+ # Try to get table info by querying with LIMIT 0
597
+ full_path = self._get_table_path(table_name)
598
+ sql = f"SELECT * FROM {full_path} LIMIT 0"
599
+ self._execute_query(sql)
600
+ return True
601
+ except Exception:
602
+ return False
603
+
604
    def init_collections(self):
        """Initialize collections from Dremio tables.

        This queries the INFORMATION_SCHEMA to discover available tables.
        """
        if self._collections is None:
            self._collections = {}

        try:
            # Query information schema for tables
            path = self._connection_info.get("path")
            default_schema = self._connection_info.get("default_schema")

            # Build query for tables and views only (excludes system tables).
            sql = """
            SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE
            FROM INFORMATION_SCHEMA."TABLES"
            WHERE TABLE_TYPE IN ('TABLE', 'VIEW')
            """

            # NOTE(review): path/default_schema are interpolated directly into
            # the SQL; they come from the connection handle rather than end
            # users, but confirm they cannot carry quote characters.
            if path:
                sql += f" AND TABLE_SCHEMA = '{path}'"
            elif default_schema:
                sql += f" AND TABLE_SCHEMA = '{default_schema}'"

            result = self._execute_query(sql)

            for i in range(result.num_rows):
                schema_name = result.column("TABLE_SCHEMA")[i].as_py()
                table_name = result.column("TABLE_NAME")[i].as_py()

                # Use simple name if in default schema, otherwise qualified name
                if schema_name in (path, default_schema):
                    collection_name = table_name
                else:
                    collection_name = f"{schema_name}.{table_name}"

                if collection_name not in self._collections:
                    collection = DremioCollection(name=collection_name, parent=self)
                    # Mark as backed by an existing table, not created by us.
                    collection.metadata.is_prepopulated = True
                    self._collections[collection_name] = collection

            logger.info(f"Discovered {len(self._collections)} tables in Dremio")

        except Exception as e:
            # Discovery is best-effort; collections can still be created
            # on demand if the catalog query fails.
            logger.warning(f"Could not query INFORMATION_SCHEMA: {e}")
652
+ def _detect_source_type(self, source_name: str) -> Optional[str]:
653
+ """Detect the type of a Dremio data source.
654
+
655
+ Args:
656
+ source_name: Name of the source (e.g., 'gold-db-2 postgresql').
657
+
658
+ Returns:
659
+ Source type ('postgresql', 'mysql', 'mongodb', etc.) or None.
660
+ """
661
+ source_lower = source_name.lower()
662
+ if "postgresql" in source_lower or "postgres" in source_lower:
663
+ return "postgresql"
664
+ elif "mysql" in source_lower:
665
+ return "mysql"
666
+ elif "mongo" in source_lower:
667
+ return "mongodb"
668
+ elif "iceberg" in source_lower:
669
+ return "iceberg"
670
+ elif "hive" in source_lower:
671
+ return "hive"
672
+ return None
673
+
674
+ def _get_source_from_schema(self, schema_name: str) -> Tuple[Optional[str], Optional[str]]:
675
+ """Extract source name and schema from a full schema path.
676
+
677
+ Args:
678
+ schema_name: Full schema path like 'gold-db-2 postgresql.gold'.
679
+
680
+ Returns:
681
+ Tuple of (source_name, schema_within_source).
682
+ """
683
+ if "." in schema_name:
684
+ parts = schema_name.split(".", 1)
685
+ return parts[0], parts[1] if len(parts) > 1 else None
686
+ return schema_name, None
687
+
688
    def get_foreign_keys(
        self, schema_name: Optional[str] = None, table_name: Optional[str] = None
    ) -> List[ForeignKeyInfo]:
        """Get foreign key constraints from PostgreSQL sources via pg_catalog.

        This method queries PostgreSQL's pg_catalog.pg_constraint to retrieve
        foreign key information. Only works for PostgreSQL-backed sources.

        Args:
            schema_name: Full schema path (e.g., 'gold-db-2 postgresql.gold').
                If None, uses the default schema from connection.
            table_name: Optional table name to filter results.

        Returns:
            List of ForeignKeyInfo objects describing FK relationships.

        Example:
            >>> db = DremioDatabase("dremio://lakehouse:32010")
            >>> fks = db.get_foreign_keys("gold-db-2 postgresql.gold")
            >>> for fk in fks[:3]:
            ...     print(f"{fk.source_table}.{fk.source_columns} -> {fk.target_table}")
        """
        if schema_name is None:
            schema_name = self._connection_info.get("default_schema") or self._connection_info.get("path")

        if not schema_name:
            logger.warning("No schema specified for FK introspection")
            return []

        source_name, pg_schema = self._get_source_from_schema(schema_name)
        source_type = self._detect_source_type(source_name)

        # pg_catalog only exists on PostgreSQL-backed sources.
        if source_type != "postgresql":
            logger.info(f"FK introspection only supported for PostgreSQL sources, not {source_type}")
            return []

        # Query FK constraints from pg_catalog ('f' = foreign key).
        # NOTE(review): source/schema/table names are interpolated into the
        # SQL; they originate from configuration, but confirm they cannot
        # contain quote characters before exposing this to untrusted input.
        fk_sql = f'''
        SELECT
            con.conname as constraint_name,
            src_class.relname as source_table,
            tgt_class.relname as target_table,
            con.conkey as source_col_nums,
            con.confkey as target_col_nums,
            con.conrelid as source_oid,
            con.confrelid as target_oid
        FROM "{source_name}".pg_catalog.pg_constraint con
        JOIN "{source_name}".pg_catalog.pg_class src_class ON con.conrelid = src_class.oid
        JOIN "{source_name}".pg_catalog.pg_class tgt_class ON con.confrelid = tgt_class.oid
        JOIN "{source_name}".pg_catalog.pg_namespace nsp ON src_class.relnamespace = nsp.oid
        WHERE con.contype = 'f'
        '''

        if pg_schema:
            fk_sql += f" AND nsp.nspname = '{pg_schema}'"
        if table_name:
            fk_sql += f" AND src_class.relname = '{table_name}'"

        # Query column info for resolving column numbers to names
        col_sql = f'''
        SELECT
            c.oid as table_oid,
            a.attnum as col_num,
            a.attname as col_name
        FROM "{source_name}".pg_catalog.pg_class c
        JOIN "{source_name}".pg_catalog.pg_attribute a ON a.attrelid = c.oid
        JOIN "{source_name}".pg_catalog.pg_namespace nsp ON c.relnamespace = nsp.oid
        WHERE a.attnum > 0 AND NOT a.attisdropped
        '''

        if pg_schema:
            col_sql += f" AND nsp.nspname = '{pg_schema}'"

        try:
            fk_result = self._execute_query(fk_sql)
            col_result = self._execute_query(col_sql)

            # Build column lookup: (table_oid, col_num) -> col_name
            col_df = col_result.to_pandas()
            col_lookup = {}
            for _, row in col_df.iterrows():
                key = (row["table_oid"], row["col_num"])
                col_lookup[key] = row["col_name"]

            # Build FK info list
            fk_df = fk_result.to_pandas()
            fk_list = []

            for _, fk in fk_df.iterrows():
                # conkey/confkey arrive as array strings like '{1}' or '{1,2}';
                # parse them into lists of 1-based column numbers.
                src_nums = [int(x) for x in str(fk["source_col_nums"]).strip("{}").split(",") if x]
                tgt_nums = [int(x) for x in str(fk["target_col_nums"]).strip("{}").split(",") if x]

                # Resolve numbers to names; fall back to a placeholder when a
                # column is missing from the lookup.
                src_cols = [col_lookup.get((fk["source_oid"], n), f"col_{n}") for n in src_nums]
                tgt_cols = [col_lookup.get((fk["target_oid"], n), f"col_{n}") for n in tgt_nums]

                fk_list.append(
                    ForeignKeyInfo(
                        constraint_name=fk["constraint_name"],
                        source_table=fk["source_table"],
                        source_columns=src_cols,
                        target_table=fk["target_table"],
                        target_columns=tgt_cols,
                        source_schema=pg_schema,
                        target_schema=pg_schema,  # Assumes same schema
                    )
                )

            logger.info(f"Found {len(fk_list)} foreign key constraints in {schema_name}")
            return fk_list

        except Exception as e:
            # Best-effort introspection: callers get an empty list on failure.
            logger.warning(f"Could not retrieve FK constraints: {e}")
            return []
802
+
803
+ def get_table_comments(
804
+ self, schema_name: Optional[str] = None, table_name: Optional[str] = None
805
+ ) -> Dict[str, TableInfo]:
806
+ """Get table and column comments from PostgreSQL sources via pg_description.
807
+
808
+ Args:
809
+ schema_name: Full schema path (e.g., 'gold-db-2 postgresql.gold').
810
+ table_name: Optional table name to filter results.
811
+
812
+ Returns:
813
+ Dictionary mapping table names to TableInfo with comments.
814
+ """
815
+ if schema_name is None:
816
+ schema_name = self._connection_info.get("default_schema") or self._connection_info.get("path")
817
+
818
+ if not schema_name:
819
+ return {}
820
+
821
+ source_name, pg_schema = self._get_source_from_schema(schema_name)
822
+ source_type = self._detect_source_type(source_name)
823
+
824
+ if source_type != "postgresql":
825
+ logger.info(f"Comment introspection only supported for PostgreSQL sources")
826
+ return {}
827
+
828
+ sql = f'''
829
+ SELECT
830
+ c.relname as table_name,
831
+ a.attname as column_name,
832
+ a.attnum as col_num,
833
+ td.description as table_comment,
834
+ cd.description as column_comment
835
+ FROM "{source_name}".pg_catalog.pg_class c
836
+ JOIN "{source_name}".pg_catalog.pg_namespace nsp ON c.relnamespace = nsp.oid
837
+ LEFT JOIN "{source_name}".pg_catalog.pg_attribute a
838
+ ON a.attrelid = c.oid AND a.attnum > 0 AND NOT a.attisdropped
839
+ LEFT JOIN "{source_name}".pg_catalog.pg_description td
840
+ ON td.objoid = c.oid AND td.objsubid = 0
841
+ LEFT JOIN "{source_name}".pg_catalog.pg_description cd
842
+ ON cd.objoid = c.oid AND cd.objsubid = a.attnum
843
+ WHERE c.relkind IN ('r', 'v')
844
+ '''
845
+
846
+ if pg_schema:
847
+ sql += f" AND nsp.nspname = '{pg_schema}'"
848
+ if table_name:
849
+ sql += f" AND c.relname = '{table_name}'"
850
+
851
+ sql += " ORDER BY c.relname, a.attnum"
852
+
853
+ try:
854
+ result = self._execute_query(sql)
855
+ df = result.to_pandas()
856
+
857
+ tables = {}
858
+ for tbl_name in df["table_name"].unique():
859
+ tbl_df = df[df["table_name"] == tbl_name]
860
+ table_comment = tbl_df["table_comment"].iloc[0] if not tbl_df["table_comment"].isna().all() else None
861
+
862
+ columns = []
863
+ for _, row in tbl_df.iterrows():
864
+ if row["column_name"]:
865
+ columns.append(
866
+ ColumnInfo(
867
+ name=row["column_name"],
868
+ data_type="", # Not fetched here
869
+ comment=row["column_comment"] if not pd.isna(row["column_comment"]) else None,
870
+ ordinal_position=int(row["col_num"]) if row["col_num"] else 0,
871
+ )
872
+ )
873
+
874
+ tables[tbl_name] = TableInfo(
875
+ name=tbl_name, schema_name=pg_schema, comment=table_comment, columns=columns
876
+ )
877
+
878
+ return tables
879
+
880
+ except Exception as e:
881
+ logger.warning(f"Could not retrieve table comments: {e}")
882
+ return {}
883
+
884
+ def get_nested_schema(self, table_path: str) -> Dict[str, Any]:
885
+ """Get full schema including nested types by querying with LIMIT 0.
886
+
887
+ For complex types (ARRAY, STRUCT/ROW), the metadata methods don't
888
+ return nested field information. This method executes a LIMIT 0
889
+ query to get the full Arrow schema with nested structure.
890
+
891
+ Args:
892
+ table_path: Full table path (e.g., '"schema".table').
893
+
894
+ Returns:
895
+ Dictionary with column names and their Arrow type info.
896
+ """
897
+ sql = f"SELECT * FROM {table_path} LIMIT 0"
898
+
899
+ try:
900
+ result = self._execute_query(sql)
901
+ schema_info = {}
902
+
903
+ for field in result.schema:
904
+ type_str = str(field.type)
905
+ field_info = {
906
+ "name": field.name,
907
+ "arrow_type": type_str,
908
+ "nullable": field.nullable,
909
+ }
910
+
911
+ # Parse nested structure for struct types
912
+ if type_str.startswith("struct<"):
913
+ field_info["nested_fields"] = []
914
+ for nested in field.type:
915
+ field_info["nested_fields"].append(
916
+ {"name": nested.name, "arrow_type": str(nested.type), "nullable": nested.nullable}
917
+ )
918
+
919
+ # Parse list element type
920
+ if hasattr(field.type, "value_type"):
921
+ field_info["element_type"] = str(field.type.value_type)
922
+
923
+ schema_info[field.name] = field_info
924
+
925
+ return schema_info
926
+
927
+ except Exception as e:
928
+ logger.warning(f"Could not get nested schema for {table_path}: {e}")
929
+ return {}
930
+
931
+ def induce_schema_view(self, include_foreign_keys: bool = True) -> SchemaView:
932
+ """Induce a schema view from Dremio table structures.
933
+
934
+ Args:
935
+ include_foreign_keys: If True, attempt to retrieve FK info from
936
+ PostgreSQL sources and add relationships.
937
+
938
+ Returns:
939
+ SchemaView representing the database schema.
940
+ """
941
+ logger.info(f"Inducing schema view for {self.metadata.handle}")
942
+ sb = SchemaBuilder()
943
+
944
+ # Ensure collections are initialized
945
+ if not self._collections:
946
+ self.init_collections()
947
+
948
+ path = self._connection_info.get("path")
949
+ default_schema = self._connection_info.get("default_schema")
950
+
951
+ try:
952
+ # Query columns from INFORMATION_SCHEMA
953
+ sql = """
954
+ SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE,
955
+ IS_NULLABLE, ORDINAL_POSITION
956
+ FROM INFORMATION_SCHEMA."COLUMNS"
957
+ """
958
+
959
+ if path:
960
+ sql += f" WHERE TABLE_SCHEMA = '{path}'"
961
+ elif default_schema:
962
+ sql += f" WHERE TABLE_SCHEMA = '{default_schema}'"
963
+
964
+ sql += " ORDER BY TABLE_SCHEMA, TABLE_NAME, ORDINAL_POSITION"
965
+
966
+ result = self._execute_query(sql)
967
+
968
+ # Group columns by table
969
+ current_table = None
970
+ for i in range(result.num_rows):
971
+ schema_name = result.column("TABLE_SCHEMA")[i].as_py()
972
+ table_name = result.column("TABLE_NAME")[i].as_py()
973
+ column_name = result.column("COLUMN_NAME")[i].as_py()
974
+ data_type = result.column("DATA_TYPE")[i].as_py()
975
+ is_nullable = result.column("IS_NULLABLE")[i].as_py()
976
+
977
+ # Get class name
978
+ if schema_name in (path, default_schema):
979
+ class_name = table_name
980
+ else:
981
+ class_name = f"{schema_name}_{table_name}"
982
+
983
+ # Add class if new
984
+ if class_name != current_table:
985
+ sb.add_class(class_name)
986
+ current_table = class_name
987
+
988
+ # Map Dremio type to LinkML type
989
+ from linkml_store.api.stores.dremio.mappings import DREMIO_SQL_TO_LINKML
990
+
991
+ # Extract base type (before any parentheses)
992
+ base_type = re.split(r"[\(\[]", data_type)[0].upper()
993
+ linkml_type = DREMIO_SQL_TO_LINKML.get(base_type, "string")
994
+
995
+ # Create slot definition
996
+ sd = SlotDefinition(column_name, required=is_nullable == "NO", range=linkml_type)
997
+ sb.schema.classes[class_name].attributes[sd.name] = sd
998
+ logger.debug(f"Introspected slot: {class_name}.{sd.name}: {sd.range}")
999
+
1000
+ except Exception as e:
1001
+ logger.warning(f"Could not introspect schema from INFORMATION_SCHEMA: {e}")
1002
+
1003
+ # Add foreign key relationships
1004
+ if include_foreign_keys:
1005
+ schema_to_use = path or default_schema
1006
+ if schema_to_use:
1007
+ fks = self.get_foreign_keys(schema_to_use)
1008
+ for fk in fks:
1009
+ # Get or derive class names
1010
+ src_class = fk.source_table
1011
+ tgt_class = fk.target_table
1012
+
1013
+ # Skip if classes don't exist
1014
+ if src_class not in sb.schema.classes or tgt_class not in sb.schema.classes:
1015
+ continue
1016
+
1017
+ # For single-column FKs, update the slot's range to point to target class
1018
+ if len(fk.source_columns) == 1:
1019
+ src_col = fk.source_columns[0]
1020
+ if src_col in sb.schema.classes[src_class].attributes:
1021
+ slot = sb.schema.classes[src_class].attributes[src_col]
1022
+ # Set range to target class, indicating a relationship
1023
+ slot.range = tgt_class
1024
+ slot.description = f"Foreign key to {tgt_class}"
1025
+ logger.debug(f"Added FK relationship: {src_class}.{src_col} -> {tgt_class}")
1026
+
1027
+ sb.add_defaults()
1028
+ return SchemaView(sb.schema)
1029
+
1030
+ def export_database(self, location: str, target_format: Optional[Union[str, Format]] = None, **kwargs):
1031
+ """Export database to a file.
1032
+
1033
+ Args:
1034
+ location: Output file path.
1035
+ target_format: Output format.
1036
+ **kwargs: Additional arguments.
1037
+ """
1038
+ # Use default export logic from parent
1039
+ super().export_database(location, target_format=target_format, **kwargs)
1040
+
1041
+ def import_database(self, location: str, source_format: Optional[str] = None, **kwargs):
1042
+ """Import data into Dremio.
1043
+
1044
+ Note: Direct import is limited in Dremio. Data typically needs to be
1045
+ loaded through Dremio's data sources or uploaded to connected storage.
1046
+
1047
+ Args:
1048
+ location: Source file path.
1049
+ source_format: Source format.
1050
+ **kwargs: Additional arguments.
1051
+ """
1052
+ super().import_database(location, source_format=source_format, **kwargs)