linkml-store 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. linkml_store/__init__.py +7 -0
  2. linkml_store/api/__init__.py +8 -0
  3. linkml_store/api/client.py +414 -0
  4. linkml_store/api/collection.py +1280 -0
  5. linkml_store/api/config.py +187 -0
  6. linkml_store/api/database.py +862 -0
  7. linkml_store/api/queries.py +69 -0
  8. linkml_store/api/stores/__init__.py +0 -0
  9. linkml_store/api/stores/chromadb/__init__.py +7 -0
  10. linkml_store/api/stores/chromadb/chromadb_collection.py +121 -0
  11. linkml_store/api/stores/chromadb/chromadb_database.py +89 -0
  12. linkml_store/api/stores/dremio/__init__.py +10 -0
  13. linkml_store/api/stores/dremio/dremio_collection.py +555 -0
  14. linkml_store/api/stores/dremio/dremio_database.py +1052 -0
  15. linkml_store/api/stores/dremio/mappings.py +105 -0
  16. linkml_store/api/stores/dremio_rest/__init__.py +11 -0
  17. linkml_store/api/stores/dremio_rest/dremio_rest_collection.py +502 -0
  18. linkml_store/api/stores/dremio_rest/dremio_rest_database.py +1023 -0
  19. linkml_store/api/stores/duckdb/__init__.py +16 -0
  20. linkml_store/api/stores/duckdb/duckdb_collection.py +339 -0
  21. linkml_store/api/stores/duckdb/duckdb_database.py +283 -0
  22. linkml_store/api/stores/duckdb/mappings.py +8 -0
  23. linkml_store/api/stores/filesystem/__init__.py +15 -0
  24. linkml_store/api/stores/filesystem/filesystem_collection.py +186 -0
  25. linkml_store/api/stores/filesystem/filesystem_database.py +81 -0
  26. linkml_store/api/stores/hdf5/__init__.py +7 -0
  27. linkml_store/api/stores/hdf5/hdf5_collection.py +104 -0
  28. linkml_store/api/stores/hdf5/hdf5_database.py +79 -0
  29. linkml_store/api/stores/ibis/__init__.py +5 -0
  30. linkml_store/api/stores/ibis/ibis_collection.py +488 -0
  31. linkml_store/api/stores/ibis/ibis_database.py +328 -0
  32. linkml_store/api/stores/mongodb/__init__.py +25 -0
  33. linkml_store/api/stores/mongodb/mongodb_collection.py +379 -0
  34. linkml_store/api/stores/mongodb/mongodb_database.py +114 -0
  35. linkml_store/api/stores/neo4j/__init__.py +0 -0
  36. linkml_store/api/stores/neo4j/neo4j_collection.py +429 -0
  37. linkml_store/api/stores/neo4j/neo4j_database.py +154 -0
  38. linkml_store/api/stores/solr/__init__.py +3 -0
  39. linkml_store/api/stores/solr/solr_collection.py +224 -0
  40. linkml_store/api/stores/solr/solr_database.py +83 -0
  41. linkml_store/api/stores/solr/solr_utils.py +0 -0
  42. linkml_store/api/types.py +4 -0
  43. linkml_store/cli.py +1147 -0
  44. linkml_store/constants.py +7 -0
  45. linkml_store/graphs/__init__.py +0 -0
  46. linkml_store/graphs/graph_map.py +24 -0
  47. linkml_store/index/__init__.py +53 -0
  48. linkml_store/index/implementations/__init__.py +0 -0
  49. linkml_store/index/implementations/llm_indexer.py +174 -0
  50. linkml_store/index/implementations/simple_indexer.py +43 -0
  51. linkml_store/index/indexer.py +211 -0
  52. linkml_store/inference/__init__.py +13 -0
  53. linkml_store/inference/evaluation.py +195 -0
  54. linkml_store/inference/implementations/__init__.py +0 -0
  55. linkml_store/inference/implementations/llm_inference_engine.py +154 -0
  56. linkml_store/inference/implementations/rag_inference_engine.py +276 -0
  57. linkml_store/inference/implementations/rule_based_inference_engine.py +169 -0
  58. linkml_store/inference/implementations/sklearn_inference_engine.py +314 -0
  59. linkml_store/inference/inference_config.py +66 -0
  60. linkml_store/inference/inference_engine.py +209 -0
  61. linkml_store/inference/inference_engine_registry.py +74 -0
  62. linkml_store/plotting/__init__.py +5 -0
  63. linkml_store/plotting/cli.py +826 -0
  64. linkml_store/plotting/dimensionality_reduction.py +453 -0
  65. linkml_store/plotting/embedding_plot.py +489 -0
  66. linkml_store/plotting/facet_chart.py +73 -0
  67. linkml_store/plotting/heatmap.py +383 -0
  68. linkml_store/utils/__init__.py +0 -0
  69. linkml_store/utils/change_utils.py +17 -0
  70. linkml_store/utils/dat_parser.py +95 -0
  71. linkml_store/utils/embedding_matcher.py +424 -0
  72. linkml_store/utils/embedding_utils.py +299 -0
  73. linkml_store/utils/enrichment_analyzer.py +217 -0
  74. linkml_store/utils/file_utils.py +37 -0
  75. linkml_store/utils/format_utils.py +550 -0
  76. linkml_store/utils/io.py +38 -0
  77. linkml_store/utils/llm_utils.py +122 -0
  78. linkml_store/utils/mongodb_utils.py +145 -0
  79. linkml_store/utils/neo4j_utils.py +42 -0
  80. linkml_store/utils/object_utils.py +190 -0
  81. linkml_store/utils/pandas_utils.py +93 -0
  82. linkml_store/utils/patch_utils.py +126 -0
  83. linkml_store/utils/query_utils.py +89 -0
  84. linkml_store/utils/schema_utils.py +23 -0
  85. linkml_store/utils/sklearn_utils.py +193 -0
  86. linkml_store/utils/sql_utils.py +177 -0
  87. linkml_store/utils/stats_utils.py +53 -0
  88. linkml_store/utils/vector_utils.py +158 -0
  89. linkml_store/webapi/__init__.py +0 -0
  90. linkml_store/webapi/html/__init__.py +3 -0
  91. linkml_store/webapi/html/base.html.j2 +24 -0
  92. linkml_store/webapi/html/collection_details.html.j2 +15 -0
  93. linkml_store/webapi/html/database_details.html.j2 +16 -0
  94. linkml_store/webapi/html/databases.html.j2 +14 -0
  95. linkml_store/webapi/html/generic.html.j2 +43 -0
  96. linkml_store/webapi/main.py +855 -0
  97. linkml_store-0.3.0.dist-info/METADATA +226 -0
  98. linkml_store-0.3.0.dist-info/RECORD +101 -0
  99. linkml_store-0.3.0.dist-info/WHEEL +4 -0
  100. linkml_store-0.3.0.dist-info/entry_points.txt +3 -0
  101. linkml_store-0.3.0.dist-info/licenses/LICENSE +22 -0
@@ -0,0 +1,1023 @@
1
+ """Dremio REST API database adapter.
2
+
3
+ This module provides a Database implementation for connecting to Dremio
4
+ data lakehouse using the REST API v3. This is useful when the Arrow Flight
5
+ SQL port (32010) is not accessible, such as behind Cloudflare or firewalls.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import re
11
+ import time
12
+ from dataclasses import dataclass, field
13
+ from typing import Any, Dict, List, Optional, Tuple, Union
14
+ from urllib.parse import parse_qs, quote, urlparse
15
+
16
+ import pandas as pd
17
+ import requests
18
+ from linkml_runtime import SchemaView
19
+ from linkml_runtime.linkml_model import SlotDefinition
20
+ from linkml_runtime.utils.schema_builder import SchemaBuilder
21
+
22
+ from linkml_store.api import Database
23
+ from linkml_store.api.queries import Query, QueryResult
24
+ from linkml_store.api.stores.dremio_rest.dremio_rest_collection import DremioRestCollection
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
@dataclass
class ForeignKeyInfo:
    """Information about a foreign key constraint.

    One FK relationship as discovered from a backing source's catalog
    (see ``get_foreign_keys``), including the ordered column lists on
    both sides of the constraint.
    """

    # Constraint name as reported by the source catalog (pg_constraint.conname).
    constraint_name: str
    # Table that holds the foreign-key columns.
    source_table: str
    # Columns in source_table participating in the constraint, in order.
    source_columns: List[str]
    # Table referenced by the constraint.
    target_table: str
    # Referenced columns in target_table, aligned index-wise with source_columns.
    target_columns: List[str]
    # Schemas are optional; introspection may not resolve them.
    source_schema: Optional[str] = None
    target_schema: Optional[str] = None
40
+
41
+
42
@dataclass
class ColumnInfo:
    """Information about a column including comments and nested structure."""

    # Column name.
    name: str
    # Source-reported SQL type name (may be "" when not fetched, e.g. by
    # get_table_comments).
    data_type: str
    # Whether NULLs are allowed; defaults to permissive.
    is_nullable: bool = True
    # Column comment/description from the source catalog, if any.
    comment: Optional[str] = None
    # Position within the table; 0 when unknown.
    ordinal_position: int = 0
    # Child columns for nested (STRUCT/LIST-style) types.
    nested_fields: List["ColumnInfo"] = field(default_factory=list)
52
+
53
+
54
@dataclass
class TableInfo:
    """Information about a table including comments.

    Returned by ``get_table_comments``; aggregates a table-level comment
    with per-column metadata.
    """

    # Table name.
    name: str
    # Schema the table lives in, when known.
    schema_name: Optional[str] = None
    # Table-level comment from the source catalog, if any.
    comment: Optional[str] = None
    # Per-column metadata (names/comments; data types may be unset).
    columns: List[ColumnInfo] = field(default_factory=list)
62
+
63
+
64
# Mapping from Dremio SQL type names to LinkML types.
# Scalar SQL types map to their natural LinkML scalar; temporal types map to
# LinkML date/datetime.
DREMIO_SQL_TO_LINKML = {
    "VARCHAR": "string",
    "CHAR": "string",
    "BIGINT": "integer",
    "INTEGER": "integer",
    "INT": "integer",
    "SMALLINT": "integer",
    "TINYINT": "integer",
    "BOOLEAN": "boolean",
    "DOUBLE": "float",
    "FLOAT": "float",
    # NOTE: DECIMAL loses exactness when mapped to float.
    "DECIMAL": "float",
    "DATE": "date",
    "TIMESTAMP": "datetime",
    # Types with no direct LinkML scalar fall back to string.
    "TIME": "string",
    "BINARY": "string",
    "VARBINARY": "string",
    "LIST": "string",
    "STRUCT": "string",
    "MAP": "string",
}
86
+
87
+
88
class DremioRestDatabase(Database):
    """
    An adapter for Dremio data lakehouse using the REST API v3.

    This adapter connects to Dremio using the standard REST API, which is
    useful when the Arrow Flight SQL port is not accessible.

    Handle format:
        dremio-rest://[username:password@]host[:port][/path][?params]

    Examples:
        - dremio-rest://localhost
        - dremio-rest://user:pass@lakehouse.example.com
        - dremio-rest://lakehouse.example.com?schema=gold.study
        - dremio-rest://lakehouse.example.com?cf_token_env=CF_AUTHORIZATION

    Parameters (query string):
        - schema: Default schema/space to use for unqualified table names
        - verify_ssl: Whether to verify SSL certificates (default: true)
        - cf_token_env: Environment variable name for Cloudflare Access token
        - username_env: Environment variable for username (default: DREMIO_USER)
        - password_env: Environment variable for password (default: DREMIO_PASSWORD)

    Environment variables:
        - DREMIO_USER: Default username
        - DREMIO_PASSWORD: Default password
        - CF_AUTHORIZATION: Cloudflare Access token (if behind Cloudflare)
    """

    # Cached "_dremio<token>" Authorization header value; set lazily by
    # _authenticate() and cleared by close().
    _auth_token: Optional[str] = None
    # Connection parameters parsed from the handle (see _parse_handle()).
    _connection_info: Optional[Dict[str, Any]] = None
    # Lazily created HTTP session (see the `session` property).
    _session: Optional[requests.Session] = None
    # Collection implementation instantiated for tables in this database.
    collection_class = DremioRestCollection
121
+
122
+ def __init__(
123
+ self,
124
+ handle: Optional[str] = None,
125
+ recreate_if_exists: bool = False,
126
+ username: Optional[str] = None,
127
+ password: Optional[str] = None,
128
+ **kwargs,
129
+ ):
130
+ """Initialize a Dremio REST database connection.
131
+
132
+ Args:
133
+ handle: Connection string in format dremio-rest://host
134
+ recreate_if_exists: Not applicable for Dremio (ignored)
135
+ username: Optional username (overrides env var)
136
+ password: Optional password (overrides env var)
137
+ **kwargs: Additional arguments passed to parent
138
+ """
139
+ if handle is None:
140
+ handle = "dremio-rest://localhost"
141
+
142
+ self._connection_info = self._parse_handle(handle)
143
+
144
+ # Override with explicit credentials if provided
145
+ if username:
146
+ self._connection_info["username"] = username
147
+ if password:
148
+ self._connection_info["password"] = password
149
+
150
+ super().__init__(handle=handle, **kwargs)
151
+
152
+ def _parse_handle(self, handle: str) -> Dict[str, Any]:
153
+ """Parse a Dremio REST connection handle.
154
+
155
+ Args:
156
+ handle: Connection string like dremio-rest://user:pass@host:port/path?params
157
+
158
+ Returns:
159
+ Dictionary with connection parameters.
160
+ """
161
+ # Ensure scheme is present
162
+ if not handle.startswith("dremio-rest://"):
163
+ handle = f"dremio-rest://{handle}"
164
+
165
+ parsed = urlparse(handle)
166
+
167
+ # Parse query parameters
168
+ params = parse_qs(parsed.query)
169
+
170
+ # Extract parameters with defaults
171
+ verify_ssl = params.get("verify_ssl", ["true"])[0].lower() == "true"
172
+ default_schema = params.get("schema", [None])[0]
173
+ cf_token_env = params.get("cf_token_env", ["CF_AUTHORIZATION"])[0]
174
+ username_env = params.get("username_env", ["DREMIO_USER"])[0]
175
+ password_env = params.get("password_env", ["DREMIO_PASSWORD"])[0]
176
+
177
+ # Get credentials from URL or environment
178
+ username = parsed.username or os.environ.get(username_env)
179
+ password = parsed.password or os.environ.get(password_env)
180
+ cf_token = os.environ.get(cf_token_env)
181
+
182
+ # Determine port (default to 443 for HTTPS)
183
+ port = parsed.port or 443
184
+
185
+ return {
186
+ "host": parsed.hostname or "localhost",
187
+ "port": port,
188
+ "username": username,
189
+ "password": password,
190
+ "path": parsed.path.lstrip("/") if parsed.path else None,
191
+ "default_schema": default_schema,
192
+ "verify_ssl": verify_ssl,
193
+ "cf_token": cf_token,
194
+ }
195
+
196
    @property
    def session(self) -> requests.Session:
        """Lazily created :class:`requests.Session` shared by all API calls.

        When the handle was given ``verify_ssl=false``, certificate
        verification is disabled on the session and urllib3's
        insecure-request warning is silenced (once, at session creation).
        """
        if self._session is None:
            self._session = requests.Session()
            if not self._connection_info["verify_ssl"]:
                self._session.verify = False
                # Imported here so urllib3 is only touched when verification
                # is actually disabled.
                import urllib3
                urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        return self._session
206
+
207
+ @property
208
+ def base_url(self) -> str:
209
+ """Get the base URL for API requests."""
210
+ info = self._connection_info
211
+ host = info["host"]
212
+ port = info["port"]
213
+ if port == 443:
214
+ return f"https://{host}"
215
+ else:
216
+ return f"https://{host}:{port}"
217
+
218
+ def _get_cookies(self) -> Dict[str, str]:
219
+ """Get cookies for requests (e.g., Cloudflare Access token)."""
220
+ cookies = {}
221
+ cf_token = self._connection_info.get("cf_token")
222
+ if cf_token:
223
+ cookies["CF_Authorization"] = cf_token
224
+ return cookies
225
+
226
    def _authenticate(self) -> str:
        """Authenticate with Dremio and get auth token.

        Logs in once against the legacy ``/apiv2/login`` endpoint and caches
        the token; subsequent calls return the cached value until close()
        resets it.

        Returns:
            Authentication token for subsequent requests, in the
            ``_dremio<token>`` form expected by the Authorization header.

        Raises:
            ConnectionError: If credentials are missing or authentication fails.
        """
        # Fast path: token already obtained during this connection's lifetime.
        if self._auth_token:
            return self._auth_token

        info = self._connection_info
        username = info.get("username")
        password = info.get("password")

        if not username or not password:
            raise ConnectionError(
                "Dremio credentials required. Set DREMIO_USER and DREMIO_PASSWORD "
                "environment variables or provide in connection string."
            )

        url = f"{self.base_url}/apiv2/login"
        # Cookies may carry a Cloudflare Access token when the server sits
        # behind Cloudflare.
        cookies = self._get_cookies()

        logger.info(f"Authenticating to Dremio at {self.base_url}")

        response = self.session.post(
            url,
            json={"userName": username, "password": password},
            cookies=cookies,
        )

        if not response.ok:
            raise ConnectionError(
                f"Dremio authentication failed: {response.status_code} - {response.text[:200]}"
            )

        token = response.json().get("token")
        if not token:
            raise ConnectionError("No token in authentication response")

        # Dremio expects the raw token prefixed with "_dremio" in the
        # Authorization header.
        self._auth_token = f"_dremio{token}"
        logger.info("Dremio authentication successful")
        return self._auth_token
271
+
272
+ def _get_headers(self) -> Dict[str, str]:
273
+ """Get headers for authenticated requests."""
274
+ token = self._authenticate()
275
+ return {"Authorization": token, "Content-Type": "application/json"}
276
+
277
+ def _execute_query(self, sql: str, timeout: int = 300) -> pd.DataFrame:
278
+ """Execute a SQL query and return results as DataFrame.
279
+
280
+ Args:
281
+ sql: SQL query string.
282
+ timeout: Maximum time to wait for query completion in seconds.
283
+
284
+ Returns:
285
+ Pandas DataFrame with query results.
286
+
287
+ Raises:
288
+ RuntimeError: If query fails or times out.
289
+ """
290
+ headers = self._get_headers()
291
+ cookies = self._get_cookies()
292
+
293
+ # Submit query
294
+ url = f"{self.base_url}/api/v3/sql"
295
+ logger.debug(f"Executing SQL: {sql}")
296
+
297
+ response = self.session.post(
298
+ url,
299
+ headers=headers,
300
+ json={"sql": sql},
301
+ cookies=cookies,
302
+ )
303
+
304
+ if not response.ok:
305
+ raise RuntimeError(f"Query submission failed: {response.status_code} - {response.text[:200]}")
306
+
307
+ job_id = response.json().get("id")
308
+ if not job_id:
309
+ raise RuntimeError("No job ID in query response")
310
+
311
+ logger.debug(f"Query job ID: {job_id}")
312
+
313
+ # Wait for completion
314
+ start_time = time.time()
315
+ while True:
316
+ if time.time() - start_time > timeout:
317
+ raise RuntimeError(f"Query timed out after {timeout} seconds")
318
+
319
+ status_url = f"{self.base_url}/api/v3/job/{job_id}"
320
+ status_response = self.session.get(status_url, headers=headers, cookies=cookies)
321
+ status = status_response.json()
322
+
323
+ job_state = status.get("jobState")
324
+ if job_state == "COMPLETED":
325
+ break
326
+ elif job_state in ("FAILED", "CANCELED"):
327
+ error_msg = status.get("errorMessage", "Unknown error")
328
+ raise RuntimeError(f"Query {job_state}: {error_msg}")
329
+
330
+ time.sleep(0.5)
331
+
332
+ # Fetch results with pagination
333
+ row_count = status.get("rowCount", 0)
334
+ logger.debug(f"Query completed with {row_count} rows")
335
+
336
+ all_rows = []
337
+ offset = 0
338
+ limit = 500 # Dremio max per request
339
+
340
+ while offset < row_count:
341
+ results_url = f"{self.base_url}/api/v3/job/{job_id}/results"
342
+ results_response = self.session.get(
343
+ results_url,
344
+ headers=headers,
345
+ cookies=cookies,
346
+ params={"offset": offset, "limit": limit},
347
+ )
348
+
349
+ if not results_response.ok:
350
+ raise RuntimeError(f"Failed to fetch results: {results_response.status_code}")
351
+
352
+ results = results_response.json()
353
+ rows = results.get("rows", [])
354
+ if not rows:
355
+ break
356
+
357
+ all_rows.extend(rows)
358
+ offset += limit
359
+
360
+ return pd.DataFrame(all_rows)
361
+
362
+ def _execute_update(self, sql: str) -> int:
363
+ """Execute a SQL update/insert/delete statement.
364
+
365
+ Args:
366
+ sql: SQL statement.
367
+
368
+ Returns:
369
+ Number of affected rows (-1 if unknown).
370
+ """
371
+ # For DML, we just execute and check for success
372
+ self._execute_query(sql)
373
+ return -1
374
+
375
    def commit(self, **kwargs):
        """Commit pending changes (no-op for Dremio REST).

        Each statement is submitted to Dremio as its own job, so there is no
        pending transaction state to flush.
        """
        pass
378
+
379
+ def close(self, **kwargs):
380
+ """Close the Dremio connection."""
381
+ if self._session:
382
+ self._session.close()
383
+ self._session = None
384
+ self._auth_token = None
385
+
386
    def drop(self, missing_ok=True, **kwargs):
        """Drop the database.

        Note: This is not supported for Dremio as it's typically a read/query
        layer; the call only closes the connection and logs a warning.

        Args:
            missing_ok: Accepted for interface compatibility; unused here.
        """
        self.close()
        logger.warning("Dremio does not support dropping databases through this adapter")
393
+
394
+ def query(self, query: Query, **kwargs) -> QueryResult:
395
+ """Execute a query against Dremio.
396
+
397
+ Args:
398
+ query: Query object specifying the query parameters.
399
+ **kwargs: Additional arguments.
400
+
401
+ Returns:
402
+ QueryResult with matching rows.
403
+ """
404
+ from_table = query.from_table
405
+ if not from_table:
406
+ raise ValueError("Query must specify from_table")
407
+
408
+ collection = self.get_collection(from_table, create_if_not_exists=False)
409
+ if collection:
410
+ return collection.query(query, **kwargs)
411
+ else:
412
+ return QueryResult(query=query, num_rows=0, rows=[])
413
+
414
    @property
    def supports_sql(self) -> bool:
        """Return True - Dremio REST supports raw SQL queries (see execute_sql)."""
        return True
418
+
419
    def _qualify_table_names(self, sql: str) -> str:
        """Qualify unqualified table names in SQL using configured schema.

        Handles FROM and JOIN clauses, qualifying table names that don't
        already contain dots or quotes. Returns the SQL unchanged when no
        default schema is configured.

        Args:
            sql: SQL query string.

        Returns:
            SQL with qualified table names.
        """
        default_schema = self._connection_info.get("default_schema")
        if not default_schema:
            return sql

        # Pattern matches FROM/JOIN followed by an unqualified table name.
        # Unqualified = no dots, no quotes, just a simple identifier.
        # Group 3 anchors what follows the name: an alias, a clause keyword,
        # a ';', or end-of-string.
        # NOTE(review): this is a heuristic, not a SQL parser — subqueries and
        # unusual whitespace/comments may defeat it; confirm against callers.
        pattern = r'(?i)((?:FROM|JOIN)\s+)([a-zA-Z_][a-zA-Z0-9_]*)(\s+(?:AS\s+)?[a-zA-Z_][a-zA-Z0-9_]*|\s+(?:WHERE|ORDER|GROUP|LIMIT|HAVING|UNION|INTERSECT|EXCEPT|ON|LEFT|RIGHT|INNER|OUTER|CROSS|FULL|;)|$)'

        def replace_table(match):
            prefix = match.group(1)  # "FROM " or "JOIN "
            table = match.group(2)  # table name
            suffix = match.group(3)  # rest (alias, WHERE, etc.)

            # Check if this looks like a keyword (not a table name); the regex
            # can capture a clause keyword in the name position.
            keywords = {'WHERE', 'ORDER', 'GROUP', 'LIMIT', 'HAVING', 'UNION',
                        'INTERSECT', 'EXCEPT', 'SELECT', 'AS', 'ON', 'AND', 'OR',
                        'LEFT', 'RIGHT', 'INNER', 'OUTER', 'CROSS', 'FULL', 'JOIN'}
            if table.upper() in keywords:
                return match.group(0)

            qualified = self._get_table_path(table)
            return f"{prefix}{qualified}{suffix}"

        return re.sub(pattern, replace_table, sql)
455
+
456
+ def execute_sql(self, sql: str, **kwargs) -> QueryResult:
457
+ """
458
+ Execute a raw SQL query against Dremio via REST API.
459
+
460
+ If a default schema is configured in the connection URL, unqualified
461
+ table names in FROM/JOIN clauses will be automatically qualified.
462
+
463
+ :param sql: SQL query string
464
+ :param kwargs: Additional arguments
465
+ :return: QueryResult containing the results
466
+ """
467
+ sql = self._qualify_table_names(sql)
468
+ logger.debug(f"Qualified SQL: {sql}")
469
+ df = self._execute_query(sql)
470
+ return QueryResult(num_rows=len(df), rows=df.to_dict("records"))
471
+
472
+ def _needs_quoting(self, identifier: str) -> bool:
473
+ """Check if an identifier needs quoting in SQL.
474
+
475
+ Identifiers need quoting if they contain special characters
476
+ like hyphens, spaces, or start with a digit.
477
+ """
478
+ if not identifier:
479
+ return False
480
+ # Needs quoting if contains non-alphanumeric/underscore or starts with digit
481
+ if not identifier[0].isalpha() and identifier[0] != "_":
482
+ return True
483
+ return not all(c.isalnum() or c == "_" for c in identifier)
484
+
485
+ def _quote_if_needed(self, identifier: str) -> str:
486
+ """Quote an identifier if it contains special characters."""
487
+ if self._needs_quoting(identifier):
488
+ return f'"{identifier}"'
489
+ return identifier
490
+
491
+ def _get_table_path(self, table_name: str) -> str:
492
+ """Get the full table path including schema if configured.
493
+
494
+ Args:
495
+ table_name: Table name.
496
+
497
+ Returns:
498
+ Full table path for SQL queries.
499
+ """
500
+ default_schema = self._connection_info.get("default_schema")
501
+ path = self._connection_info.get("path")
502
+
503
+ # If already has dots/quotes, assume it's qualified
504
+ if "." in table_name or '"' in table_name:
505
+ return table_name
506
+
507
+ if default_schema:
508
+ # Schema like "gold-db-2 postgresql.gold" needs proper quoting
509
+ # Split into source and schema.table parts
510
+ parts = default_schema.split(".")
511
+ if len(parts) >= 2:
512
+ # Source name may have spaces/hyphens - quote if needed
513
+ source = self._quote_if_needed(parts[0])
514
+ schema = ".".join(parts[1:])
515
+ return f'{source}.{schema}.{table_name}'
516
+ else:
517
+ return f'{self._quote_if_needed(default_schema)}.{table_name}'
518
+ elif path:
519
+ return f'"{path}"."{table_name}"'
520
+ else:
521
+ return f'"{table_name}"'
522
+
523
+ def _table_exists(self, table_name: str) -> bool:
524
+ """Check if a table exists in Dremio.
525
+
526
+ Args:
527
+ table_name: Name of the table to check.
528
+
529
+ Returns:
530
+ True if table exists.
531
+ """
532
+ full_path = self._get_table_path(table_name)
533
+ sql = f"SELECT * FROM {full_path} LIMIT 0"
534
+ try:
535
+ self._execute_query(sql)
536
+ return True
537
+ except Exception:
538
+ return False
539
+
540
+ def _list_table_names(self) -> List[str]:
541
+ """List all table names in the database."""
542
+ try:
543
+ path = self._connection_info.get("path")
544
+ default_schema = self._connection_info.get("default_schema")
545
+
546
+ sql = """
547
+ SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE
548
+ FROM INFORMATION_SCHEMA."TABLES"
549
+ WHERE TABLE_TYPE IN ('TABLE', 'VIEW')
550
+ """
551
+
552
+ if path:
553
+ sql += f" AND TABLE_SCHEMA = '{path}'"
554
+ elif default_schema:
555
+ sql += f" AND TABLE_SCHEMA = '{default_schema}'"
556
+
557
+ df = self._execute_query(sql)
558
+ return df["TABLE_NAME"].tolist() if not df.empty else []
559
+ except Exception as e:
560
+ logger.warning(f"Could not list tables: {e}")
561
+ return []
562
+
563
    def init_collections(self):
        """Initialize collections dict.

        Note: Unlike other adapters, we don't scan INFORMATION_SCHEMA here
        because it can be very slow on large Dremio instances. Collections
        are created on-demand when get_collection() is called with a table path.

        Use discover_collections() to explicitly scan for available tables.
        """
        # Idempotent: an already-populated mapping is left untouched.
        if self._collections is None:
            self._collections = {}
574
+
575
    def discover_collections(self):
        """Discover and register all available tables from Dremio.

        This queries INFORMATION_SCHEMA to find all tables. This can be slow
        on large Dremio instances - use only when you need to list all tables.
        """
        if self._collections is None:
            self._collections = {}

        path = self._connection_info.get("path")
        default_schema = self._connection_info.get("default_schema")

        sql = """
        SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE
        FROM INFORMATION_SCHEMA."TABLES"
        WHERE TABLE_TYPE IN ('TABLE', 'VIEW')
        """

        # Restrict to the handle path when given, else the default schema.
        if path:
            sql += f" AND TABLE_SCHEMA = '{path}'"
        elif default_schema:
            sql += f" AND TABLE_SCHEMA = '{default_schema}'"

        df = self._execute_query(sql)

        for _, row in df.iterrows():
            table_name = row["TABLE_NAME"]
            collection_name = table_name

            # Keep any collection already registered under this name.
            if collection_name not in self._collections:
                collection = DremioRestCollection(name=collection_name, parent=self)
                # Mark as backed by an existing table so it is not re-created.
                collection.metadata.is_prepopulated = True
                self._collections[collection_name] = collection

        logger.info(f"Discovered {len(self._collections)} tables in Dremio")
610
+
611
+ def _detect_source_type(self, source_name: str) -> Optional[str]:
612
+ """Detect the type of a Dremio data source.
613
+
614
+ Args:
615
+ source_name: Name of the source (e.g., 'gold-db-2 postgresql').
616
+
617
+ Returns:
618
+ Source type ('postgresql', 'mysql', 'mongodb', etc.) or None.
619
+ """
620
+ source_lower = source_name.lower()
621
+ if "postgresql" in source_lower or "postgres" in source_lower:
622
+ return "postgresql"
623
+ elif "mysql" in source_lower:
624
+ return "mysql"
625
+ elif "mongo" in source_lower:
626
+ return "mongodb"
627
+ elif "iceberg" in source_lower:
628
+ return "iceberg"
629
+ elif "hive" in source_lower:
630
+ return "hive"
631
+ return None
632
+
633
+ def _get_source_from_schema(self, schema_name: str) -> Tuple[Optional[str], Optional[str]]:
634
+ """Extract source name and schema from a full schema path.
635
+
636
+ Args:
637
+ schema_name: Full schema path like 'gold-db-2 postgresql.gold'.
638
+
639
+ Returns:
640
+ Tuple of (source_name, schema_within_source).
641
+ """
642
+ if "." in schema_name:
643
+ parts = schema_name.split(".", 1)
644
+ return parts[0], parts[1] if len(parts) > 1 else None
645
+ return schema_name, None
646
+
647
    def get_foreign_keys(
        self, schema_name: Optional[str] = None, table_name: Optional[str] = None
    ) -> List[ForeignKeyInfo]:
        """Get foreign key constraints from PostgreSQL sources via pg_catalog.

        This method queries PostgreSQL's pg_catalog.pg_constraint to retrieve
        foreign key information. Only works for PostgreSQL-backed sources.

        Args:
            schema_name: Full schema path (e.g., 'gold-db-2 postgresql.gold').
                If None, uses the default schema from connection.
            table_name: Optional table name to filter results.

        Returns:
            List of ForeignKeyInfo objects describing FK relationships.
            Empty on any introspection failure (logged as a warning).
        """
        if schema_name is None:
            schema_name = self._connection_info.get("default_schema") or self._connection_info.get("path")

        if not schema_name:
            logger.warning("No schema specified for FK introspection")
            return []

        source_name, pg_schema = self._get_source_from_schema(schema_name)
        source_type = self._detect_source_type(source_name)

        if source_type != "postgresql":
            logger.info(f"FK introspection only supported for PostgreSQL sources, not {source_type}")
            return []

        # Query FK constraints from pg_catalog.
        # NOTE(review): source/schema/table names are interpolated directly
        # into SQL; values come from connection config, not end users —
        # confirm they are trusted upstream.
        fk_sql = f'''
        SELECT
            con.conname as constraint_name,
            src_class.relname as source_table,
            tgt_class.relname as target_table,
            con.conkey as source_col_nums,
            con.confkey as target_col_nums,
            con.conrelid as source_oid,
            con.confrelid as target_oid
        FROM "{source_name}".pg_catalog.pg_constraint con
        JOIN "{source_name}".pg_catalog.pg_class src_class ON con.conrelid = src_class.oid
        JOIN "{source_name}".pg_catalog.pg_class tgt_class ON con.confrelid = tgt_class.oid
        JOIN "{source_name}".pg_catalog.pg_namespace nsp ON src_class.relnamespace = nsp.oid
        WHERE con.contype = 'f'
        '''

        if pg_schema:
            fk_sql += f" AND nsp.nspname = '{pg_schema}'"
        if table_name:
            fk_sql += f" AND src_class.relname = '{table_name}'"

        # Query column info for resolving column numbers to names
        col_sql = f'''
        SELECT
            c.oid as table_oid,
            a.attnum as col_num,
            a.attname as col_name
        FROM "{source_name}".pg_catalog.pg_class c
        JOIN "{source_name}".pg_catalog.pg_attribute a ON a.attrelid = c.oid
        JOIN "{source_name}".pg_catalog.pg_namespace nsp ON c.relnamespace = nsp.oid
        WHERE a.attnum > 0 AND NOT a.attisdropped
        '''

        if pg_schema:
            col_sql += f" AND nsp.nspname = '{pg_schema}'"

        try:
            fk_df = self._execute_query(fk_sql)
            col_df = self._execute_query(col_sql)

            # Build column lookup: (table_oid, col_num) -> col_name
            col_lookup = {}
            for _, row in col_df.iterrows():
                key = (row["table_oid"], row["col_num"])
                col_lookup[key] = row["col_name"]

            # Build FK info list
            fk_list = []

            for _, fk in fk_df.iterrows():
                # Parse array strings like '{1}' or '{1,2}' into column numbers.
                src_nums = [int(x) for x in str(fk["source_col_nums"]).strip("{}").split(",") if x]
                tgt_nums = [int(x) for x in str(fk["target_col_nums"]).strip("{}").split(",") if x]

                # Fall back to a synthetic "col_<n>" name when the lookup misses.
                src_cols = [col_lookup.get((fk["source_oid"], n), f"col_{n}") for n in src_nums]
                tgt_cols = [col_lookup.get((fk["target_oid"], n), f"col_{n}") for n in tgt_nums]

                fk_list.append(
                    ForeignKeyInfo(
                        constraint_name=fk["constraint_name"],
                        source_table=fk["source_table"],
                        source_columns=src_cols,
                        target_table=fk["target_table"],
                        target_columns=tgt_cols,
                        source_schema=pg_schema,
                        target_schema=pg_schema,  # Assumes same schema
                    )
                )

            logger.info(f"Found {len(fk_list)} foreign key constraints in {schema_name}")
            return fk_list

        except Exception as e:
            logger.warning(f"Could not retrieve FK constraints: {e}")
            return []
753
+
754
    def get_table_comments(
        self, schema_name: Optional[str] = None, table_name: Optional[str] = None
    ) -> Dict[str, TableInfo]:
        """Get table and column comments from PostgreSQL sources via pg_description.

        Args:
            schema_name: Full schema path (e.g., 'gold-db-2 postgresql.gold').
                If None, uses the default schema (or path) from the connection.
            table_name: Optional table name to filter results.

        Returns:
            Dictionary mapping table names to TableInfo with comments.
            Empty on non-PostgreSQL sources or any introspection failure.
        """
        if schema_name is None:
            schema_name = self._connection_info.get("default_schema") or self._connection_info.get("path")

        if not schema_name:
            return {}

        source_name, pg_schema = self._get_source_from_schema(schema_name)
        source_type = self._detect_source_type(source_name)

        if source_type != "postgresql":
            logger.info(f"Comment introspection only supported for PostgreSQL sources")
            return {}

        # relkind 'r' = ordinary table, 'v' = view; comments come from
        # pg_description keyed on the table OID (objsubid 0 = table-level,
        # otherwise the column number).
        sql = f'''
        SELECT
            c.relname as table_name,
            a.attname as column_name,
            a.attnum as col_num,
            td.description as table_comment,
            cd.description as column_comment
        FROM "{source_name}".pg_catalog.pg_class c
        JOIN "{source_name}".pg_catalog.pg_namespace nsp ON c.relnamespace = nsp.oid
        LEFT JOIN "{source_name}".pg_catalog.pg_attribute a
            ON a.attrelid = c.oid AND a.attnum > 0 AND NOT a.attisdropped
        LEFT JOIN "{source_name}".pg_catalog.pg_description td
            ON td.objoid = c.oid AND td.objsubid = 0
        LEFT JOIN "{source_name}".pg_catalog.pg_description cd
            ON cd.objoid = c.oid AND cd.objsubid = a.attnum
        WHERE c.relkind IN ('r', 'v')
        '''

        if pg_schema:
            sql += f" AND nsp.nspname = '{pg_schema}'"
        if table_name:
            sql += f" AND c.relname = '{table_name}'"

        sql += " ORDER BY c.relname, a.attnum"

        try:
            df = self._execute_query(sql)

            tables = {}
            for tbl_name in df["table_name"].unique():
                tbl_df = df[df["table_name"] == tbl_name]
                # Table comment is repeated on every row; take the first
                # unless the whole column is NaN.
                table_comment = tbl_df["table_comment"].iloc[0] if not tbl_df["table_comment"].isna().all() else None

                columns = []
                for _, row in tbl_df.iterrows():
                    # NOTE(review): NaN is truthy, so a row whose column_name
                    # is NaN (table with no columns) would still pass this
                    # check — confirm whether that case occurs in practice.
                    if row["column_name"]:
                        columns.append(
                            ColumnInfo(
                                name=row["column_name"],
                                data_type="",  # Not fetched here
                                comment=row["column_comment"] if not pd.isna(row["column_comment"]) else None,
                                ordinal_position=int(row["col_num"]) if row["col_num"] else 0,
                            )
                        )

                tables[tbl_name] = TableInfo(
                    name=tbl_name, schema_name=pg_schema, comment=table_comment, columns=columns
                )

            return tables

        except Exception as e:
            logger.warning(f"Could not retrieve table comments: {e}")
            return {}
+ return {}
833
+
834
+ def get_nested_schema(self, table_path: str) -> Dict[str, Any]:
835
+ """Get full schema including nested types by querying with LIMIT 0.
836
+
837
+ For complex types (ARRAY, STRUCT/ROW), the INFORMATION_SCHEMA doesn't
838
+ return nested field information. This method uses the REST catalog API
839
+ to get the full schema with nested structure.
840
+
841
+ Args:
842
+ table_path: Full table path (e.g., '"schema".table').
843
+
844
+ Returns:
845
+ Dictionary with column names and their type info.
846
+ """
847
+ # First try the REST catalog API for detailed type info
848
+ path = self._connection_info.get("path")
849
+ default_schema = self._connection_info.get("default_schema")
850
+
851
+ # Parse table path to get catalog path
852
+ table_path_clean = table_path.replace('"', '')
853
+ parts = table_path_clean.split(".")
854
+
855
+ # Try to find table via catalog API
856
+ try:
857
+ headers = self._get_headers()
858
+ cookies = self._get_cookies()
859
+
860
+ # Build catalog path
861
+ if len(parts) >= 2:
862
+ catalog_path = ".".join(parts[:-1])
863
+ table_name = parts[-1]
864
+ else:
865
+ catalog_path = path or default_schema or ""
866
+ table_name = parts[0] if parts else table_path_clean
867
+
868
+ # URL encode the path
869
+ encoded_path = quote(f"{catalog_path}.{table_name}" if catalog_path else table_name, safe="")
870
+ url = f"{self.base_url}/api/v3/catalog/by-path/{encoded_path}"
871
+
872
+ response = self.session.get(url, headers=headers, cookies=cookies)
873
+
874
+ if response.ok:
875
+ catalog_data = response.json()
876
+ fields = catalog_data.get("fields", [])
877
+
878
+ schema_info = {}
879
+ for field_data in fields:
880
+ field_name = field_data.get("name")
881
+ field_type = field_data.get("type", {})
882
+
883
+ type_name = field_type.get("name", "UNKNOWN")
884
+ field_info = {
885
+ "name": field_name,
886
+ "dremio_type": type_name,
887
+ "nullable": True, # Not always available in REST API
888
+ }
889
+
890
+ # Handle complex types with subSchema
891
+ sub_schema = field_type.get("subSchema")
892
+ if sub_schema:
893
+ field_info["nested_fields"] = []
894
+ for sub_field in sub_schema:
895
+ sub_name = sub_field.get("name")
896
+ sub_type = sub_field.get("type", {})
897
+ field_info["nested_fields"].append({
898
+ "name": sub_name,
899
+ "dremio_type": sub_type.get("name", "UNKNOWN"),
900
+ })
901
+
902
+ # Handle list element types
903
+ if type_name == "LIST":
904
+ sub_schema = field_type.get("subSchema", [])
905
+ if sub_schema:
906
+ field_info["element_type"] = sub_schema[0].get("type", {}).get("name", "UNKNOWN")
907
+
908
+ schema_info[field_name] = field_info
909
+
910
+ return schema_info
911
+
912
+ except Exception as e:
913
+ logger.debug(f"Could not get schema via catalog API: {e}")
914
+
915
+ # Fall back to LIMIT 0 query for column info (without nested structure)
916
+ try:
917
+ sql = f"SELECT * FROM {table_path} LIMIT 0"
918
+ df = self._execute_query(sql)
919
+
920
+ schema_info = {}
921
+ for col in df.columns:
922
+ schema_info[col] = {
923
+ "name": col,
924
+ "dremio_type": str(df[col].dtype),
925
+ "nullable": True,
926
+ }
927
+
928
+ return schema_info
929
+
930
+ except Exception as e:
931
+ logger.warning(f"Could not get nested schema for {table_path}: {e}")
932
+ return {}
933
+
934
+ def induce_schema_view(self, include_foreign_keys: bool = True) -> SchemaView:
935
+ """Induce a schema view from Dremio table structures.
936
+
937
+ Args:
938
+ include_foreign_keys: If True, attempt to retrieve FK info from
939
+ PostgreSQL sources and add relationships.
940
+
941
+ Returns:
942
+ SchemaView representing the database schema.
943
+ """
944
+ logger.info(f"Inducing schema view for {self.metadata.handle}")
945
+ sb = SchemaBuilder()
946
+
947
+ if not self._collections:
948
+ self.discover_collections()
949
+
950
+ path = self._connection_info.get("path")
951
+ default_schema = self._connection_info.get("default_schema")
952
+
953
+ try:
954
+ sql = """
955
+ SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, DATA_TYPE,
956
+ IS_NULLABLE, ORDINAL_POSITION
957
+ FROM INFORMATION_SCHEMA."COLUMNS"
958
+ """
959
+
960
+ if path:
961
+ sql += f" WHERE TABLE_SCHEMA = '{path}'"
962
+ elif default_schema:
963
+ sql += f" WHERE TABLE_SCHEMA = '{default_schema}'"
964
+
965
+ sql += " ORDER BY TABLE_SCHEMA, TABLE_NAME, ORDINAL_POSITION"
966
+
967
+ df = self._execute_query(sql)
968
+
969
+ current_table = None
970
+ for _, row in df.iterrows():
971
+ schema_name = row["TABLE_SCHEMA"]
972
+ table_name = row["TABLE_NAME"]
973
+ column_name = row["COLUMN_NAME"]
974
+ data_type = row["DATA_TYPE"]
975
+ is_nullable = row["IS_NULLABLE"]
976
+
977
+ # Get class name
978
+ if schema_name in (path, default_schema):
979
+ class_name = table_name
980
+ else:
981
+ class_name = f"{schema_name}_{table_name}"
982
+
983
+ if class_name != current_table:
984
+ sb.add_class(class_name)
985
+ current_table = class_name
986
+
987
+ # Map Dremio type to LinkML type
988
+ base_type = re.split(r"[\(\[]", str(data_type))[0].upper()
989
+ linkml_type = DREMIO_SQL_TO_LINKML.get(base_type, "string")
990
+
991
+ sd = SlotDefinition(column_name, required=is_nullable == "NO", range=linkml_type)
992
+ sb.schema.classes[class_name].attributes[sd.name] = sd
993
+ logger.debug(f"Introspected slot: {class_name}.{sd.name}: {sd.range}")
994
+
995
+ except Exception as e:
996
+ logger.warning(f"Could not introspect schema from INFORMATION_SCHEMA: {e}")
997
+
998
+ # Add foreign key relationships
999
+ if include_foreign_keys:
1000
+ schema_to_use = path or default_schema
1001
+ if schema_to_use:
1002
+ fks = self.get_foreign_keys(schema_to_use)
1003
+ for fk in fks:
1004
+ # Get or derive class names
1005
+ src_class = fk.source_table
1006
+ tgt_class = fk.target_table
1007
+
1008
+ # Skip if classes don't exist
1009
+ if src_class not in sb.schema.classes or tgt_class not in sb.schema.classes:
1010
+ continue
1011
+
1012
+ # For single-column FKs, update the slot's range to point to target class
1013
+ if len(fk.source_columns) == 1:
1014
+ src_col = fk.source_columns[0]
1015
+ if src_col in sb.schema.classes[src_class].attributes:
1016
+ slot = sb.schema.classes[src_class].attributes[src_col]
1017
+ # Set range to target class, indicating a relationship
1018
+ slot.range = tgt_class
1019
+ slot.description = f"Foreign key to {tgt_class}"
1020
+ logger.debug(f"Added FK relationship: {src_class}.{src_col} -> {tgt_class}")
1021
+
1022
+ sb.add_defaults()
1023
+ return SchemaView(sb.schema)