signalpilot-ai-internal 0.7.6__py3-none-any.whl → 0.10.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. signalpilot_ai_internal/__init__.py +1 -0
  2. signalpilot_ai_internal/_version.py +1 -1
  3. signalpilot_ai_internal/databricks_schema_service.py +902 -0
  4. signalpilot_ai_internal/file_scanner_service.py +2 -1
  5. signalpilot_ai_internal/handlers.py +72 -2
  6. signalpilot_ai_internal/mcp_handlers.py +508 -0
  7. signalpilot_ai_internal/mcp_server_manager.py +298 -0
  8. signalpilot_ai_internal/mcp_service.py +1303 -0
  9. signalpilot_ai_internal/schema_search_config.yml +8 -8
  10. signalpilot_ai_internal/schema_search_service.py +62 -1
  11. signalpilot_ai_internal/test_dbt_mcp_server.py +180 -0
  12. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/package.json +5 -3
  13. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/package.json.orig +4 -2
  14. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/plugin.json +7 -1
  15. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/110.224e83db03814fd03955.js +7 -0
  16. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/353.972abe1d2d66f083f9cc.js +1 -0
  17. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/476.ad22ccddd74ee306fb56.js +1 -0
  18. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/57.c4232851631fb2e7e59a.js +1 -0
  19. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/726.318e4e791edb63cc788f.js +1 -0
  20. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/785.2d75de1a8d2c3131a8db.js +1 -0
  21. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/801.ca9e114a30896b669a3c.js +1 -0
  22. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/880.d9914229e4f120e7e9e4.js +1 -0
  23. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/936.d80de1e4da5b520d2f3b.js +1 -0
  24. signalpilot_ai_internal-0.10.22.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/remoteEntry.b63c429ca81e743b403c.js +1 -0
  25. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/third-party-licenses.json +38 -20
  26. {signalpilot_ai_internal-0.7.6.dist-info → signalpilot_ai_internal-0.10.22.dist-info}/METADATA +3 -2
  27. signalpilot_ai_internal-0.10.22.dist-info/RECORD +56 -0
  28. {signalpilot_ai_internal-0.7.6.dist-info → signalpilot_ai_internal-0.10.22.dist-info}/WHEEL +1 -1
  29. signalpilot_ai_internal-0.7.6.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/353.72484b768a04f89bd3dd.js +0 -1
  30. signalpilot_ai_internal-0.7.6.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/476.9b4f05a99f5003f82094.js +0 -1
  31. signalpilot_ai_internal-0.7.6.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/490.b4ccb9601c8112407c5d.js +0 -1
  32. signalpilot_ai_internal-0.7.6.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/785.3aa564fc148b37d1d719.js +0 -1
  33. signalpilot_ai_internal-0.7.6.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/839.ed04fa601a43e8dd24d1.js +0 -1
  34. signalpilot_ai_internal-0.7.6.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/898.4e9edb7f224152c1dcb4.js +0 -2
  35. signalpilot_ai_internal-0.7.6.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/898.4e9edb7f224152c1dcb4.js.LICENSE.txt +0 -1
  36. signalpilot_ai_internal-0.7.6.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/remoteEntry.ee8951353b00c13b8070.js +0 -1
  37. signalpilot_ai_internal-0.7.6.dist-info/RECORD +0 -49
  38. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/etc/jupyter/jupyter_server_config.d/signalpilot_ai.json +0 -0
  39. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/install.json +0 -0
  40. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/122.e2dadf63dc64d7b5f1ee.js +0 -0
  41. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/220.328403b5545f268b95c6.js +0 -0
  42. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/262.726e1da31a50868cb297.js +0 -0
  43. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/364.dbec4c2dc12e7b050dcc.js +0 -0
  44. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/384.fa432bdb7fb6b1c95ad6.js +0 -0
  45. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/439.37e271d7a80336daabe2.js +0 -0
  46. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/481.73c7a9290b7d35a8b9c1.js +0 -0
  47. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/512.b58fc0093d080b8ee61c.js +0 -0
  48. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/553.b4042a795c91d9ff71ef.js +0 -0
  49. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/553.b4042a795c91d9ff71ef.js.LICENSE.txt +0 -0
  50. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/635.9720593ee20b768da3ca.js +0 -0
  51. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/713.8e6edc9a965bdd578ca7.js +0 -0
  52. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/741.dc49867fafb03ea2ba4d.js +0 -0
  53. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/742.91e7b516c8699eea3373.js +0 -0
  54. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/888.34054db17bcf6e87ec95.js +0 -0
  55. {signalpilot_ai_internal-0.7.6.data → signalpilot_ai_internal-0.10.22.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/style.js +0 -0
  56. {signalpilot_ai_internal-0.7.6.dist-info → signalpilot_ai_internal-0.10.22.dist-info}/licenses/LICENSE +0 -0
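
The hunk below is the new signalpilot_ai_internal/databricks_schema_service.py (entry 3 in the list above, +902 lines).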
@@ -0,0 +1,902 @@
+ """
+ Databricks schema service handlers for SignalPilot AI.
+ Provides REST API handlers for Databricks SQL Warehouse schema retrieval and query execution.
+
+ Supports two authentication methods:
+ - Personal Access Token (PAT): User pastes token directly
+ - Service Principal: OAuth client credentials flow with automatic token refresh
+
+ Uses Unity Catalog with 3-level namespace: catalog.schema.table
+ """
+
+ import json
+ import os
+ import subprocess
+ import sys
+ import time
+ from typing import Any, Dict, Optional, List, Tuple
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from functools import lru_cache
+ import threading
+
+ from jupyter_server.base.handlers import APIHandler
+ import tornado
+
+
+ # In-memory token cache for Service Principal OAuth tokens
+ # Key: connection_id or hash of client credentials
+ # Value: {"access_token": str, "expires_at": float}
+ _sp_token_cache: Dict[str, Dict[str, Any]] = {}
+
+
+ class DatabricksSchemaHandler(APIHandler):
+     """Handler for Databricks schema operations"""
+
+     def _setup_databricks_environment(self):
+         """Install required Databricks packages if not available"""
+         def install_package(package_name):
+             try:
+                 subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
+                 return True
+             except subprocess.CalledProcessError:
+                 return False
+
+         missing_packages = []
+
+         try:
+             from databricks import sql as databricks_sql
+         except ImportError:
+             if install_package("databricks-sql-connector"):
+                 try:
+                     from databricks import sql as databricks_sql
+                 except ImportError as e:
+                     missing_packages.append(f"databricks-sql-connector: {str(e)}")
+             else:
+                 missing_packages.append("databricks-sql-connector: installation failed")
+
+         if missing_packages:
+             raise ImportError("Required modules could not be installed: " + ", ".join(missing_packages))
+
+         from databricks import sql as databricks_sql
+         return databricks_sql
+
+     def _get_databricks_config(self, provided_config: Optional[Dict] = None) -> Optional[Dict]:
+         """Get Databricks configuration from request or environment variables"""
+         if provided_config:
+             return provided_config
+
+         # Look for a Databricks database configuration in the environment
+         for key, value in os.environ.items():
+             if key.endswith('_CONNECTION_JSON'):
+                 try:
+                     config = json.loads(value)
+                     if config.get('type') == 'databricks':
+                         return config
+                 except Exception as e:
+                     print(f"[DatabricksSchemaHandler] Error parsing database config {key}: {e}")
+                     continue
+
+         return None
+
+     def _get_access_token(self, config: Dict) -> str:
+         """Get access token for authentication.
+
+         For PAT: returns the token directly
+         For Service Principal: obtains OAuth token via client credentials flow
+         """
+         auth_type = config.get('authType', 'pat')
+
+         if auth_type == 'pat':
+             # Personal Access Token - use directly
+             token = config.get('accessToken')
+             if not token:
+                 raise ValueError("Personal Access Token is required for PAT authentication")
+             return token
+
+         elif auth_type == 'service_principal':
+             # Service Principal - OAuth client credentials flow
+             return self._get_sp_access_token(config)
+
+         else:
+             raise ValueError(f"Unknown authentication type: {auth_type}")
+
+     def _get_sp_access_token(self, config: Dict) -> str:
+         """Get access token via Service Principal OAuth client credentials flow."""
+         client_id = config.get('clientId')
+         client_secret = config.get('clientSecret')
+
+         if not client_id or not client_secret:
+             raise ValueError("Client ID and Client Secret are required for Service Principal authentication")
+
+         # Create cache key from client credentials
+         cache_key = f"{client_id}:{hash(client_secret)}"
+
+         # Check cache for a valid token
+         cached = _sp_token_cache.get(cache_key)
+         if cached:
+             # Reuse the cached token unless it expires within 60 seconds
+             if cached.get('expires_at', 0) > time.time() + 60:
+                 return cached['access_token']
+
+         # Get the OAuth token URL
+         # Derive it from the workspace URL if not specified
+         token_url = config.get('oauthTokenUrl')
+         if not token_url:
+             workspace_url = config.get('connectionUrl', '')
+             if 'azuredatabricks.net' in workspace_url:
+                 # Azure Databricks - use Azure AD
+                 tenant_id = config.get('tenantId', 'common')
+                 token_url = f"https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token"
+             else:
+                 # AWS/GCP - use the Databricks OAuth endpoint
+                 # Extract host from workspace URL
+                 import re
+                 match = re.match(r'https?://([^/]+)', workspace_url)
+                 if match:
+                     host = match.group(1)
+                     token_url = f"https://{host}/oidc/v1/token"
+                 else:
+                     raise ValueError("Cannot determine OAuth token URL. Please provide oauthTokenUrl in config.")
+
+         # Request a new token
+         import urllib.request
+         import urllib.parse
+         import urllib.error
+
+         # Prepare the token request
+         scopes = config.get('scopes', ['2ff814a6-3304-4ab8-85cb-cd0e6f879c1d/.default'])
+         if isinstance(scopes, str):
+             scopes = [scopes]
+
+         data = {
+             'grant_type': 'client_credentials',
+             'client_id': client_id,
+             'client_secret': client_secret,
+             'scope': ' '.join(scopes)
+         }
+
+         encoded_data = urllib.parse.urlencode(data).encode('utf-8')
+
+         req = urllib.request.Request(
+             token_url,
+             data=encoded_data,
+             headers={
+                 'Content-Type': 'application/x-www-form-urlencoded'
+             }
+         )
+
+         try:
+             with urllib.request.urlopen(req, timeout=30) as response:
+                 result = json.loads(response.read().decode('utf-8'))
+
+             access_token = result.get('access_token')
+             expires_in = result.get('expires_in', 3600)
+
+             if not access_token:
+                 raise ValueError("No access_token in OAuth response")
+
+             # Cache the token
+             _sp_token_cache[cache_key] = {
+                 'access_token': access_token,
+                 'expires_at': time.time() + expires_in
+             }
+
+             return access_token
+
+         except urllib.error.HTTPError as e:
+             error_body = e.read().decode('utf-8') if e.fp else str(e)
+             raise ValueError(f"OAuth token request failed: {e.code} - {error_body}")
+         except Exception as e:
+             raise ValueError(f"Failed to obtain OAuth token: {str(e)}")
+
+     def _get_connection_params(self, config: Dict) -> Dict[str, Any]:
+         """Build Databricks connection parameters from configuration"""
+         import re
+
+         # Extract host from connectionUrl
+         connection_url = config.get('connectionUrl', '')
+         if not connection_url:
+             raise ValueError("connectionUrl (workspace URL) is required for Databricks")
+
+         url_match = re.match(r'https?://([^/]+)', connection_url)
+         if not url_match:
+             raise ValueError(f"Invalid Databricks connectionUrl format: {connection_url}")
+
+         server_hostname = url_match.group(1)
+
+         # Get the HTTP path for the SQL warehouse
+         http_path = config.get('warehouseHttpPath') or config.get('httpPath')
+         if not http_path:
+             warehouse_id = config.get('warehouseId')
+             if warehouse_id:
+                 http_path = f"/sql/1.0/warehouses/{warehouse_id}"
+             else:
+                 raise ValueError("Either warehouseHttpPath or warehouseId is required")
+
+         # Get access token
+         access_token = self._get_access_token(config)
+
+         conn_params = {
+             'server_hostname': server_hostname,
+             'http_path': http_path,
+             'access_token': access_token,
+         }
+
+         # Optional catalog (Unity Catalog)
+         catalog = config.get('catalog')
+         if catalog:
+             conn_params['catalog'] = catalog
+
+         # Optional schema
+         schema = config.get('schema')
+         if schema:
+             conn_params['schema'] = schema
+
+         return conn_params
+
+     def _list_catalogs(self, cursor) -> List[str]:
+         """List all accessible catalogs"""
+         cursor.execute("SHOW CATALOGS")
+         rows = cursor.fetchall()
+         return [row[0] for row in rows if row[0] not in ('system', 'samples')]
+
+     def _list_schemas(self, cursor, catalog: str) -> List[str]:
+         """List all schemas in a catalog"""
+         cursor.execute(f"SHOW SCHEMAS IN `{catalog}`")
+         rows = cursor.fetchall()
+         return [row[0] for row in rows if row[0] not in ('information_schema',)]
+
+     def _list_tables(self, cursor, catalog: str, schema: str) -> List[Dict]:
+         """List all tables in a schema with their type"""
+         cursor.execute(f"SHOW TABLES IN `{catalog}`.`{schema}`")
+         rows = cursor.fetchall()
+         tables = []
+         for row in rows:
+             # SHOW TABLES returns: database, tableName, isTemporary
+             table_name = row[1] if len(row) > 1 else row[0]
+             tables.append({
+                 'table_name': table_name,
+                 'table_type': 'TABLE'
+             })
+         return tables
+
+     def _get_table_columns(self, cursor, catalog: str, schema: str, table: str) -> List[Dict]:
+         """Get column information for a table"""
+         try:
+             cursor.execute(f"DESCRIBE TABLE `{catalog}`.`{schema}`.`{table}`")
+             rows = cursor.fetchall()
+             columns = []
+             for row in rows:
+                 # DESCRIBE TABLE returns: col_name, data_type, comment
+                 col_name = row[0]
+
+                 # Skip metadata rows (partition info, etc.)
+                 if col_name.startswith('#') or not col_name.strip():
+                     continue
+
+                 columns.append({
+                     'column_name': col_name,
+                     'data_type': row[1] if len(row) > 1 else 'unknown',
+                     'is_nullable': 'YES',  # Databricks doesn't always expose this
+                     'column_default': None,
+                     'description': row[2] if len(row) > 2 and row[2] else None
+                 })
+             return columns
+         except Exception as e:
+             print(f"[DatabricksSchemaHandler] Error getting columns for {catalog}.{schema}.{table}: {e}")
+             return []
+
+     def _fetch_table_with_columns(self, databricks_sql, conn_params: Dict, catalog: str, schema: str, table_info: Dict) -> Dict:
+         """Fetch a single table with its columns using a new connection (for parallel execution)"""
+         connection = None
+         try:
+             connection = databricks_sql.connect(**conn_params)
+             cursor = connection.cursor()
+
+             table_name = table_info['table_name']
+             columns = self._get_table_columns(cursor, catalog, schema, table_name)
+
+             cursor.close()
+
+             return {
+                 'catalog': catalog,
+                 'schema': schema,
+                 'table': table_name,
+                 'type': table_info.get('table_type', 'TABLE'),
+                 'columns': columns,
+                 'error': None
+             }
+         except Exception as e:
+             print(f"[DatabricksSchemaHandler] Error fetching table {catalog}.{schema}.{table_info.get('table_name', 'unknown')}: {e}")
+             return {
+                 'catalog': catalog,
+                 'schema': schema,
+                 'table': table_info.get('table_name', 'unknown'),
+                 'type': table_info.get('table_type', 'TABLE'),
+                 'columns': [],
+                 'error': str(e)
+             }
+         finally:
+             if connection:
+                 try:
+                     connection.close()
+                 except Exception:
+                     pass
+
+     def _fetch_schema_tables(self, databricks_sql, conn_params: Dict, catalog: str, schema: str) -> Dict:
+         """Fetch all tables for a schema with parallel column fetching"""
+         connection = None
+         try:
+             # Get the list of tables first
+             connection = databricks_sql.connect(**conn_params)
+             cursor = connection.cursor()
+
+             tables = self._list_tables(cursor, catalog, schema)
+             cursor.close()
+             connection.close()
+             connection = None
+
+             print(f"  Schema {schema}: {len(tables)} tables - fetching in parallel...")
+
+             # Fetch table details in parallel
+             schema_obj = {
+                 'schema': schema,
+                 'tables': [],
+                 'error': None
+             }
+
+             if not tables:
+                 return schema_obj
+
+             # Use ThreadPoolExecutor for parallel table fetching
+             max_workers = min(10, len(tables))  # Limit concurrent connections
+             with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                 future_to_table = {
+                     executor.submit(
+                         self._fetch_table_with_columns,
+                         databricks_sql,
+                         conn_params,
+                         catalog,
+                         schema,
+                         table_info
+                     ): table_info for table_info in tables
+                 }
+
+                 for future in as_completed(future_to_table):
+                     try:
+                         table_obj = future.result()
+                         schema_obj['tables'].append(table_obj)
+                     except Exception as e:
+                         table_info = future_to_table[future]
+                         print(f"  Error processing table {table_info.get('table_name')}: {e}")
+
+             return schema_obj
+
+         except Exception as e:
+             print(f"  Error processing schema {schema}: {e}")
+             return {
+                 'schema': schema,
+                 'tables': [],
+                 'error': str(e)
+             }
+         finally:
+             if connection:
+                 try:
+                     connection.close()
+                 except Exception:
+                     pass
+
+     def _build_catalog(self, databricks_sql, conn_params: Dict, specified_catalog: Optional[str] = None, specified_schema: Optional[str] = None) -> Dict:
+         """Build the complete catalog structure with parallel processing"""
+         connection = databricks_sql.connect(**conn_params)
+         cursor = connection.cursor()
+
+         try:
+             catalog_data = []
+
+             # Get catalogs to process
+             if specified_catalog:
+                 catalogs = [specified_catalog]
+             else:
+                 catalogs = self._list_catalogs(cursor)
+
+             print(f"[DatabricksSchemaHandler] Processing {len(catalogs)} catalogs with parallel optimization...")
+
+             for catalog in catalogs:
+                 print(f"  Processing catalog: {catalog}")
+                 catalog_obj = {
+                     'catalog': catalog,
+                     'schemas': []
+                 }
+
+                 try:
+                     schemas_list = self._list_schemas(cursor, catalog)
+
+                     # Filter schemas if specified_schema is provided
+                     if specified_schema:
+                         schemas = [s for s in schemas_list if s == specified_schema]
+                     else:
+                         schemas = schemas_list
+
+                     print(f"  Found {len(schemas)} schemas - processing in parallel...")
+
+                     if not schemas:
+                         catalog_data.append(catalog_obj)
+                         continue
+
+                     # Process schemas in parallel
+                     max_workers = min(5, len(schemas))  # Limit concurrent schema processing
+                     with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                         future_to_schema = {
+                             executor.submit(
+                                 self._fetch_schema_tables,
+                                 databricks_sql,
+                                 conn_params,
+                                 catalog,
+                                 schema
+                             ): schema for schema in schemas
+                         }
+
+                         for future in as_completed(future_to_schema):
+                             try:
+                                 schema_obj = future.result()
+                                 catalog_obj['schemas'].append(schema_obj)
+                             except Exception as e:
+                                 schema = future_to_schema[future]
+                                 print(f"  Error processing schema {schema}: {e}")
+                                 catalog_obj['schemas'].append({
+                                     'schema': schema,
+                                     'tables': [],
+                                     'error': str(e)
+                                 })
+
+                 except Exception as e:
+                     print(f"  Error processing catalog {catalog}: {e}")
+                     catalog_obj['schemas'].append({
+                         'schema': 'default',
+                         'tables': [],
+                         'error': str(e)
+                     })
+
+                 catalog_data.append(catalog_obj)
+
+             return {'catalogs': catalog_data}
+
+         finally:
+             cursor.close()
+             connection.close()
+
+     def _format_catalog_as_markdown(self, catalog_data: Dict) -> Tuple[str, Dict]:
+         """Format the catalog as markdown and build the table_schemas dict"""
+         lines = ["# Databricks Database Schema\n"]
+         table_schemas = {}
+
+         total_tables = 0
+         for cat in catalog_data.get('catalogs', []):
+             for sch in cat.get('schemas', []):
+                 total_tables += len(sch.get('tables', []))
+
+         lines.append(f"Found **{total_tables}** table(s)\n")
+
+         for cat in catalog_data.get('catalogs', []):
+             catalog_name = cat['catalog']
+
+             for sch in cat.get('schemas', []):
+                 schema_name = sch['schema']
+
+                 if sch.get('error'):
+                     lines.append(f"\n## {catalog_name}.{schema_name}\n")
+                     lines.append(f"Error: {sch['error']}\n")
+                     continue
+
+                 for table in sch.get('tables', []):
+                     table_name = table['table']
+                     full_name = f"{catalog_name}.{schema_name}.{table_name}"
+
+                     lines.append(f"\n## {full_name}\n")
+
+                     columns = table.get('columns', [])
+                     lines.append(f"\n### Columns ({len(columns)})\n")
+
+                     for col in columns:
+                         col_name = col.get('column_name', 'unknown')
+                         data_type = col.get('data_type', 'unknown')
+                         description = col.get('description')
+
+                         if description:
+                             lines.append(f"- **{col_name}**: {data_type} - {description}\n")
+                         else:
+                             lines.append(f"- **{col_name}**: {data_type}\n")
+
+                     # Store in table_schemas
+                     table_schemas[full_name] = {
+                         'catalog': catalog_name,
+                         'schema': schema_name,
+                         'table_name': table_name,
+                         'full_name': full_name,
+                         'columns': [dict(col) for col in columns],
+                         'primary_keys': [],  # Databricks doesn't always expose PK info
+                         'foreign_keys': [],
+                         'indices': []
+                     }
+
+                     lines.append("\n---\n")
+
+         return ''.join(lines).strip(), table_schemas
+
+     @tornado.web.authenticated
+     def post(self):
+         """Get Databricks database schema information"""
+         try:
+             # Parse request body
+             try:
+                 body = json.loads(self.request.body.decode('utf-8'))
+             except json.JSONDecodeError:
+                 self.set_status(400)
+                 self.finish(json.dumps({
+                     "error": "Invalid JSON in request body"
+                 }))
+                 return
+
+             # Get Databricks configuration from request or environment
+             config = self._get_databricks_config(body.get('config'))
+
+             if not config:
+                 self.set_status(400)
+                 self.finish(json.dumps({
+                     "error": "No Databricks configuration provided and no Databricks configurations found in environment"
+                 }))
+                 return
+
+             # Set up the Databricks environment
+             try:
+                 databricks_sql = self._setup_databricks_environment()
+             except ImportError as e:
+                 self.set_status(500)
+                 self.finish(json.dumps({
+                     "error": str(e)
+                 }))
+                 return
+
+             # Get the database schema
+             try:
+                 conn_params = self._get_connection_params(config)
+                 specified_catalog = config.get('catalog')
+                 specified_schema = config.get('schema')
+
+                 print(f"[DatabricksSchemaHandler] Connecting to {conn_params['server_hostname']}")
+                 if specified_catalog:
+                     print(f"[DatabricksSchemaHandler] Filtering to catalog: {specified_catalog}")
+                 if specified_schema:
+                     print(f"[DatabricksSchemaHandler] Filtering to schema: {specified_schema}")
+
+                 catalog_data = self._build_catalog(
+                     databricks_sql,
+                     conn_params,
+                     specified_catalog=specified_catalog,
+                     specified_schema=specified_schema
+                 )
+
+                 markdown_result, table_schemas = self._format_catalog_as_markdown(catalog_data)
+
+                 self.finish(json.dumps({
+                     "result": markdown_result,
+                     "table_schemas": table_schemas,
+                     "catalogs": catalog_data.get('catalogs', [])
+                 }))
+
+             except Exception as e:
+                 error_msg = str(e)
+                 # Provide helpful error messages
+                 if 'PAT' in error_msg.upper() or 'token' in error_msg.lower():
+                     error_msg = f"Authentication failed: {error_msg}. If PATs are disabled in your workspace, try Service Principal authentication."
+                 elif 'warehouse' in error_msg.lower():
+                     error_msg = f"SQL Warehouse error: {error_msg}. Ensure your warehouse is running and accessible."
+
+                 self.set_status(500)
+                 self.finish(json.dumps({
+                     "error": f"Error connecting to Databricks: {error_msg}"
+                 }))
+
+         except Exception as e:
+             self.set_status(500)
+             self.finish(json.dumps({
+                 "error": "Internal server error",
+                 "message": str(e)
+             }))
+
+
+ class DatabricksQueryHandler(APIHandler):
+     """Handler for Databricks query execution"""
+
+     def _setup_databricks_environment(self):
+         """Install required Databricks packages if not available"""
+         def install_package(package_name):
+             try:
+                 subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
+                 return True
+             except subprocess.CalledProcessError:
+                 return False
+
+         missing_packages = []
+
+         try:
+             from databricks import sql as databricks_sql
+         except ImportError:
+             if install_package("databricks-sql-connector"):
+                 try:
+                     from databricks import sql as databricks_sql
+                 except ImportError as e:
+                     missing_packages.append(f"databricks-sql-connector: {str(e)}")
+             else:
+                 missing_packages.append("databricks-sql-connector: installation failed")
+
+         if missing_packages:
+             raise ImportError("Required modules could not be installed: " + ", ".join(missing_packages))
+
+         from databricks import sql as databricks_sql
+         return databricks_sql
+
+     def _get_databricks_config(self, provided_config: Optional[Dict] = None) -> Optional[Dict]:
+         """Get Databricks configuration from request or environment variables"""
+         if provided_config:
+             return provided_config
+
+         # Look for a Databricks database configuration in the environment
+         for key, value in os.environ.items():
+             if key.endswith('_CONNECTION_JSON'):
+                 try:
+                     config = json.loads(value)
+                     if config.get('type') == 'databricks':
+                         return config
+                 except Exception as e:
+                     print(f"[DatabricksQueryHandler] Error parsing database config {key}: {e}")
+                     continue
+
+         return None
+
+     def _get_access_token(self, config: Dict) -> str:
+         """Get access token for authentication - delegates to the schema handler logic"""
+         # Reuse the schema handler's token logic
+         handler = DatabricksSchemaHandler(self.application, self.request)
+         return handler._get_access_token(config)
+
+     def _get_connection_params(self, config: Dict) -> Dict[str, Any]:
+         """Build Databricks connection parameters from configuration"""
+         import re
+
+         connection_url = config.get('connectionUrl', '')
+         if not connection_url:
+             raise ValueError("connectionUrl (workspace URL) is required for Databricks")
+
+         url_match = re.match(r'https?://([^/]+)', connection_url)
+         if not url_match:
+             raise ValueError(f"Invalid Databricks connectionUrl format: {connection_url}")
+
+         server_hostname = url_match.group(1)
+
+         http_path = config.get('warehouseHttpPath') or config.get('httpPath')
+         if not http_path:
+             warehouse_id = config.get('warehouseId')
+             if warehouse_id:
+                 http_path = f"/sql/1.0/warehouses/{warehouse_id}"
+             else:
+                 raise ValueError("Either warehouseHttpPath or warehouseId is required")
+
+         access_token = self._get_access_token(config)
+
+         conn_params = {
+             'server_hostname': server_hostname,
+             'http_path': http_path,
+             'access_token': access_token,
+         }
+
+         catalog = config.get('catalog')
+         if catalog:
+             conn_params['catalog'] = catalog
+
+         schema = config.get('schema')
+         if schema:
+             conn_params['schema'] = schema
+
+         return conn_params
+
+     @tornado.web.authenticated
+     def post(self):
+         """Execute a read-only SQL query on Databricks"""
+         try:
+             # Parse request body
+             try:
+                 body = json.loads(self.request.body.decode('utf-8'))
+             except json.JSONDecodeError:
+                 self.set_status(400)
+                 self.finish(json.dumps({
+                     "error": "Invalid JSON in request body"
+                 }))
+                 return
+
+             # Get the query from the request
+             query = body.get('query')
+             if not query:
+                 self.set_status(400)
+                 self.finish(json.dumps({
+                     "error": "Missing 'query' field in request body"
+                 }))
+                 return
+
+             # Basic validation for read-only queries
+             normalized_query = query.strip().upper()
+             allowed_starts = ['SELECT', 'WITH', 'SHOW', 'DESCRIBE', 'EXPLAIN']
+
+             if not any(normalized_query.startswith(start) for start in allowed_starts):
+                 self.set_status(400)
+                 self.finish(json.dumps({
+                     "error": f"Only {', '.join(allowed_starts)} statements are allowed for read queries."
+                 }))
+                 return
+
+             # Get Databricks configuration from request or environment
+             config = self._get_databricks_config(body.get('config'))
+
+             if not config:
+                 self.set_status(400)
+                 self.finish(json.dumps({
+                     "error": "No Databricks configuration provided and no Databricks configurations found in environment"
+                 }))
+                 return
+
+             # Set up the Databricks environment
+             try:
+                 databricks_sql = self._setup_databricks_environment()
+             except ImportError as e:
+                 self.set_status(500)
+                 self.finish(json.dumps({
+                     "error": str(e)
+                 }))
+                 return
+
+             # Execute the query
+             try:
+                 conn_params = self._get_connection_params(config)
+
+                 # Allow specifying a specific catalog for the query
+                 catalog = body.get('catalog')
+                 if catalog:
+                     conn_params['catalog'] = catalog
+
+                 connection = databricks_sql.connect(**conn_params)
+                 cursor = connection.cursor()
+
+                 try:
+                     cursor.execute(query)
+
+                     # Get column names from the cursor description
+                     columns = [desc[0] for desc in cursor.description] if cursor.description else []
+
+                     # Fetch all results
+                     rows = cursor.fetchall()
+
+                     # Convert the result to a list of dictionaries
+                     result_rows = [
+                         {columns[i]: row[i] for i in range(len(columns))}
+                         for row in rows
+                     ]
+
+                     self.finish(json.dumps({
+                         "result": result_rows
+                     }))
+
+                 finally:
+                     cursor.close()
+                     connection.close()
+
+             except Exception as e:
+                 self.set_status(500)
+                 self.finish(json.dumps({
+                     "error": f"Databricks query failed: {str(e)}"
+                 }))
+
+         except Exception as e:
+             self.set_status(500)
+             self.finish(json.dumps({
+                 "error": "Internal server error",
+                 "message": str(e)
+             }))
+
+
+ class DatabricksTestHandler(APIHandler):
+     """Handler for testing the Databricks connection"""
+
+     @tornado.web.authenticated
+     def post(self):
+         """Test the Databricks connection and return status"""
+         try:
+             # Parse request body
+             try:
+                 body = json.loads(self.request.body.decode('utf-8'))
+             except json.JSONDecodeError:
+                 self.set_status(400)
+                 self.finish(json.dumps({
+                     "error": "Invalid JSON in request body"
+                 }))
+                 return
+
+             config = body.get('config')
+             if not config:
+                 self.set_status(400)
+                 self.finish(json.dumps({
+                     "error": "No configuration provided"
+                 }))
+                 return
+
+             # Set up the environment
+             schema_handler = DatabricksSchemaHandler(self.application, self.request)
+             try:
+                 databricks_sql = schema_handler._setup_databricks_environment()
+             except ImportError as e:
+                 self.set_status(500)
+                 self.finish(json.dumps({
+                     "ok": False,
+                     "error": str(e)
+                 }))
+                 return
+
+             # Test the connection
+             try:
+                 start_time = time.time()
+
+                 conn_params = schema_handler._get_connection_params(config)
+                 connection = databricks_sql.connect(**conn_params)
+                 cursor = connection.cursor()
+
+                 try:
+                     # Run a basic test query
+                     cursor.execute("SELECT 1 as test")
+                     cursor.fetchall()
+
+                     sql_latency = int((time.time() - start_time) * 1000)
+
+                     # Try to get the current user
+                     identity_info = {"type": "unknown", "name": "unknown"}
+                     try:
+                         cursor.execute("SELECT current_user() as user")
+                         user_row = cursor.fetchone()
+                         if user_row:
+                             auth_type = config.get('authType', 'pat')
+                             identity_info = {
+                                 "type": "user" if auth_type == 'pat' else "service_principal",
+                                 "name": user_row[0]
+                             }
+                     except Exception:
+                         pass
+
+                     self.finish(json.dumps({
+                         "ok": True,
+                         "identity": identity_info,
+                         "sql": {"ok": True, "latency_ms": sql_latency},
+                         "api": {"ok": True}
+                     }))
+
+                 finally:
+                     cursor.close()
+                     connection.close()
+
+             except Exception as e:
+                 error_msg = str(e)
+                 self.finish(json.dumps({
+                     "ok": False,
+                     "error": error_msg,
+                     "identity": None,
+                     "sql": {"ok": False, "error": error_msg},
+                     "api": {"ok": False}
+                 }))
+
+         except Exception as e:
+             self.set_status(500)
+             self.finish(json.dumps({
+                 "ok": False,
+                 "error": "Internal server error",
+                 "message": str(e)
+             }))
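
All three handlers read their connection settings either from the request body ("config") or from any environment variable ending in _CONNECTION_JSON whose JSON payload has "type": "databricks". The two configuration shapes below are a minimal sketch assembled from the keys the code above actually reads; every value is a placeholder, not a real credential or workspace.

# Hypothetical example configs - field names come from the handler code above.

pat_config = {
    "type": "databricks",
    "authType": "pat",
    "connectionUrl": "https://adb-1234567890123456.7.azuredatabricks.net",
    "warehouseId": "abc123def456",  # or "warehouseHttpPath": "/sql/1.0/warehouses/abc123def456"
    "accessToken": "dapi-...",      # personal access token
    "catalog": "main",              # optional: restrict the schema scan to one catalog
    "schema": "default",            # optional: restrict the schema scan to one schema
}

sp_config = {
    "type": "databricks",
    "authType": "service_principal",
    "connectionUrl": "https://adb-1234567890123456.7.azuredatabricks.net",
    "warehouseHttpPath": "/sql/1.0/warehouses/abc123def456",
    "clientId": "00000000-0000-0000-0000-000000000000",
    "clientSecret": "<secret>",
    "tenantId": "11111111-1111-1111-1111-111111111111",  # used to build the Azure AD token URL
    # "oauthTokenUrl": "https://.../oidc/v1/token",  # set explicitly if auto-derivation fails
    # "scopes": ["2ff814a6-3304-4ab8-85cb-cd0e6f879c1d/.default"],  # the default scope above
}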
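
The routes that expose these handlers are registered in handlers.py (also changed in this release, +72 -2) and are not shown in this diff, so the endpoint path below is an assumption. A client call against the connection-test handler might look like:

import requests

# Hypothetical endpoint path - the actual route registration lives in handlers.py.
resp = requests.post(
    "http://localhost:8888/signalpilot-ai-internal/databricks/test",
    headers={"Authorization": "token <jupyter-server-token>"},
    json={"config": pat_config},
)
# The response keys match DatabricksTestHandler above, e.g.
# {"ok": true, "identity": {...}, "sql": {"ok": true, "latency_ms": 412}, "api": {"ok": true}}
print(resp.json())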