signalpilot-ai-internal 0.10.0__py3-none-any.whl → 0.11.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. signalpilot_ai_internal/__init__.py +1 -0
  2. signalpilot_ai_internal/_version.py +1 -1
  3. signalpilot_ai_internal/cache_service.py +22 -21
  4. signalpilot_ai_internal/composio_handlers.py +224 -0
  5. signalpilot_ai_internal/composio_service.py +511 -0
  6. signalpilot_ai_internal/database_config_handlers.py +182 -0
  7. signalpilot_ai_internal/database_config_service.py +166 -0
  8. signalpilot_ai_internal/databricks_schema_service.py +907 -0
  9. signalpilot_ai_internal/file_scanner_service.py +5 -146
  10. signalpilot_ai_internal/handlers.py +388 -9
  11. signalpilot_ai_internal/integrations_config.py +256 -0
  12. signalpilot_ai_internal/log_utils.py +31 -0
  13. signalpilot_ai_internal/mcp_handlers.py +532 -0
  14. signalpilot_ai_internal/mcp_server_manager.py +298 -0
  15. signalpilot_ai_internal/mcp_service.py +1255 -0
  16. signalpilot_ai_internal/oauth_token_store.py +141 -0
  17. signalpilot_ai_internal/schema_search_config.yml +17 -11
  18. signalpilot_ai_internal/schema_search_service.py +85 -4
  19. signalpilot_ai_internal/signalpilot_home.py +961 -0
  20. signalpilot_ai_internal/snowflake_schema_service.py +2 -0
  21. signalpilot_ai_internal/test_dbt_mcp_server.py +180 -0
  22. signalpilot_ai_internal/unified_database_schema_service.py +2 -0
  23. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/package.json.orig → signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/package.json +15 -48
  24. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/package.json → signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/package.json.orig +9 -52
  25. {signalpilot_ai_internal-0.10.0.data → signalpilot_ai_internal-0.11.24.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/schemas/signalpilot-ai-internal/plugin.json +7 -1
  26. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/122.bab318d6caadb055e29c.js +1 -0
  27. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/129.868ca665e6fc225c20a0.js +1 -0
  28. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/179.fd45a2e75d471d0aa3b9.js +7 -0
  29. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/220.81105a94aa873fc51a94.js +1 -0
  30. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/262.a002dd4630d3b6404a90.js +1 -0
  31. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/353.cc6f6ecacd703bcdb468.js +1 -0
  32. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/364.817a883549d55a0e0576.js +1 -0
  33. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/384.a4daecd44f1e9364e44a.js +1 -0
  34. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/439.667225aab294fb5ed161.js +1 -0
  35. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/447.8138af2522716e5a926f.js +1 -0
  36. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/476.925c73e32f3c07448da0.js +1 -0
  37. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/477.aaa4cc9e87801fb45f5b.js +1 -0
  38. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/481.370056149a59022b700c.js +1 -0
  39. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/510.868ca665e6fc225c20a0.js +1 -0
  40. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/512.835f97f7ccfc70ff5c93.js +1 -0
  41. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/57.6c13335f73de089d6b1e.js +1 -0
  42. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/574.ad2709e91ebcac5bbe68.js +1 -0
  43. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/635.bddbab8e464fe31f0393.js +1 -0
  44. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/713.fda1bcdb10497b0a6ade.js +1 -0
  45. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/741.d046701f475fcbf6697d.js +1 -0
  46. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/785.c306dffd4cfe8a613d13.js +1 -0
  47. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/801.e39898b6f336539f228c.js +1 -0
  48. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/880.77cc0ca10a1860df1b52.js +1 -0
  49. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/936.4e2850b2af985ed0d378.js +1 -0
  50. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/956.eeffe67d7781fd63ef4b.js +2 -0
  51. signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/remoteEntry.055f50d20a31f3068c72.js +1 -0
  52. {signalpilot_ai_internal-0.10.0.data → signalpilot_ai_internal-0.11.24.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/third-party-licenses.json +47 -29
  53. {signalpilot_ai_internal-0.10.0.dist-info → signalpilot_ai_internal-0.11.24.dist-info}/METADATA +14 -31
  54. signalpilot_ai_internal-0.11.24.dist-info/RECORD +66 -0
  55. signalpilot_ai_internal-0.11.24.dist-info/licenses/LICENSE +7 -0
  56. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/122.e2dadf63dc64d7b5f1ee.js +0 -1
  57. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/220.328403b5545f268b95c6.js +0 -1
  58. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/262.726e1da31a50868cb297.js +0 -1
  59. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/330.af2e9cb5def5ae2b84d5.js +0 -1
  60. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/353.972abe1d2d66f083f9cc.js +0 -1
  61. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/364.dbec4c2dc12e7b050dcc.js +0 -1
  62. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/384.fa432bdb7fb6b1c95ad6.js +0 -1
  63. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/439.37e271d7a80336daabe2.js +0 -1
  64. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/476.ad22ccddd74ee306fb56.js +0 -1
  65. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/481.73c7a9290b7d35a8b9c1.js +0 -1
  66. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/512.b58fc0093d080b8ee61c.js +0 -1
  67. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/553.b4042a795c91d9ff71ef.js +0 -2
  68. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/57.e9acd2e1f9739037f1ab.js +0 -1
  69. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/635.9720593ee20b768da3ca.js +0 -1
  70. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/713.8e6edc9a965bdd578ca7.js +0 -1
  71. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/741.dc49867fafb03ea2ba4d.js +0 -1
  72. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/742.91e7b516c8699eea3373.js +0 -1
  73. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/785.2d75de1a8d2c3131a8db.js +0 -1
  74. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/786.770dc7bcab77e14cc135.js +0 -7
  75. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/801.ca9e114a30896b669a3c.js +0 -1
  76. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/880.25ddd15aca09421d3765.js +0 -1
  77. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/888.34054db17bcf6e87ec95.js +0 -1
  78. signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/remoteEntry.b05b2f0c9617ba28370d.js +0 -1
  79. signalpilot_ai_internal-0.10.0.dist-info/RECORD +0 -50
  80. signalpilot_ai_internal-0.10.0.dist-info/licenses/LICENSE +0 -29
  81. {signalpilot_ai_internal-0.10.0.data → signalpilot_ai_internal-0.11.24.data}/data/etc/jupyter/jupyter_server_config.d/signalpilot_ai.json +0 -0
  82. {signalpilot_ai_internal-0.10.0.data → signalpilot_ai_internal-0.11.24.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/install.json +0 -0
  83. /signalpilot_ai_internal-0.10.0.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/553.b4042a795c91d9ff71ef.js.LICENSE.txt → /signalpilot_ai_internal-0.11.24.data/data/share/jupyter/labextensions/signalpilot-ai-internal/static/956.eeffe67d7781fd63ef4b.js.LICENSE.txt +0 -0
  84. {signalpilot_ai_internal-0.10.0.data → signalpilot_ai_internal-0.11.24.data}/data/share/jupyter/labextensions/signalpilot-ai-internal/static/style.js +0 -0
  85. {signalpilot_ai_internal-0.10.0.dist-info → signalpilot_ai_internal-0.11.24.dist-info}/WHEEL +0 -0
@@ -0,0 +1,907 @@
+"""
+Databricks schema service handlers for SignalPilot AI.
+Provides REST API handlers for Databricks SQL Warehouse schema retrieval and query execution.
+
+Supports two authentication methods:
+- Personal Access Token (PAT): User pastes token directly
+- Service Principal: OAuth client credentials flow with automatic token refresh
+
+Uses Unity Catalog with 3-level namespace: catalog.schema.table
+"""
+
+import json
+import os
+import subprocess
+import sys
+import time
+from typing import Any, Dict, Optional, List, Tuple
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from functools import lru_cache
+import threading
+
+from jupyter_server.base.handlers import APIHandler
+import tornado
+
+from .log_utils import print
+
+# In-memory token cache for Service Principal OAuth tokens
+# Key: connection_id or hash of client credentials
+# Value: {"access_token": str, "expires_at": float}
+_sp_token_cache: Dict[str, Dict[str, Any]] = {}
+
+
+class DatabricksSchemaHandler(APIHandler):
+    """Handler for Databricks schema operations"""
+
+    def _setup_databricks_environment(self):
+        """Install required Databricks packages if not available"""
+        def install_package(package_name):
+            try:
+                subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
+                return True
+            except subprocess.CalledProcessError:
+                return False
+
+        missing_packages = []
+
+        try:
+            from databricks import sql as databricks_sql
+        except ImportError:
+            if install_package("databricks-sql-connector"):
+                try:
+                    from databricks import sql as databricks_sql
+                except ImportError as e:
+                    missing_packages.append(f"databricks-sql-connector: {str(e)}")
+            else:
+                missing_packages.append("databricks-sql-connector: installation failed")
+
+        if missing_packages:
+            raise ImportError("Required modules could not be installed: " + ", ".join(missing_packages))
+
+        from databricks import sql as databricks_sql
+        return databricks_sql
+
+    def _get_databricks_config(self, provided_config: Optional[Dict] = None) -> Optional[Dict]:
+        """Get Databricks configuration from request or environment variables"""
+        if provided_config:
+            return provided_config
+
+        # Look for Databricks database configuration in the environment
+        for key, value in os.environ.items():
+            if key.endswith('_CONNECTION_JSON'):
+                try:
+                    config = json.loads(value)
+                    if config.get('type') == 'databricks':
+                        return config
+                except Exception as e:
+                    print(f"[DatabricksSchemaHandler] Error parsing database config {key}: {e}")
+                    continue
+
+        return None
+
+    def _get_access_token(self, config: Dict) -> str:
+        """Get access token for authentication.
+
+        For PAT: returns the token directly
+        For Service Principal: obtains OAuth token via client credentials flow
+        """
+        auth_type = config.get('authType', 'pat')
+
+        if auth_type == 'pat':
+            # Personal Access Token - use directly
+            token = config.get('accessToken')
+            if not token:
+                raise ValueError("Personal Access Token is required for PAT authentication")
+            return token
+
+        elif auth_type == 'service_principal':
+            # Service Principal - OAuth client credentials flow
+            return self._get_sp_access_token(config)
+
+        else:
+            raise ValueError(f"Unknown authentication type: {auth_type}")
+
+    def _get_sp_access_token(self, config: Dict) -> str:
+        """Get access token via Service Principal OAuth client credentials flow."""
+        client_id = config.get('clientId')
+        client_secret = config.get('clientSecret')
+
+        if not client_id or not client_secret:
+            raise ValueError("Client ID and Client Secret are required for Service Principal authentication")
+
+        # Create cache key from client credentials
+        cache_key = f"{client_id}:{hash(client_secret)}"
+
+        # Check cache for valid token
+        cached = _sp_token_cache.get(cache_key)
+        if cached:
+            # Refresh if within 60 seconds of expiry
+            if cached.get('expires_at', 0) > time.time() + 60:
+                return cached['access_token']
+
+        # Get OAuth token URL
+        # Default to Azure AD endpoint if not specified
+        token_url = config.get('oauthTokenUrl')
+        if not token_url:
+            # Try to derive from workspace URL for Azure
+            workspace_url = config.get('connectionUrl', '')
+            if 'azuredatabricks.net' in workspace_url:
+                # Azure Databricks - use Azure AD
+                tenant_id = config.get('tenantId', 'common')
+                token_url = f"https://login.microsoftonline.com/{tenant_id}/oauth2/v2.0/token"
+            else:
+                # AWS/GCP - use Databricks OAuth endpoint
+                # Extract host from workspace URL
+                import re
+                match = re.match(r'https?://([^/]+)', workspace_url)
+                if match:
+                    host = match.group(1)
+                    token_url = f"https://{host}/oidc/v1/token"
+                else:
+                    raise ValueError("Cannot determine OAuth token URL. Please provide oauthTokenUrl in config.")
+
+        # Request new token
+        import urllib.request
+        import urllib.parse
+
+        # Prepare token request
+        scopes = config.get('scopes', ['2ff814a6-3304-4ab8-85cb-cd0e6f879c1d/.default'])
+        if isinstance(scopes, str):
+            scopes = [scopes]
+
+        data = {
+            'grant_type': 'client_credentials',
+            'client_id': client_id,
+            'client_secret': client_secret,
+            'scope': ' '.join(scopes)
+        }
+
+        encoded_data = urllib.parse.urlencode(data).encode('utf-8')
+
+        req = urllib.request.Request(
+            token_url,
+            data=encoded_data,
+            headers={
+                'Content-Type': 'application/x-www-form-urlencoded'
+            }
+        )
+
+        try:
+            with urllib.request.urlopen(req, timeout=30) as response:
+                result = json.loads(response.read().decode('utf-8'))
+
+            access_token = result.get('access_token')
+            expires_in = result.get('expires_in', 3600)
+
+            if not access_token:
+                raise ValueError("No access_token in OAuth response")
+
+            # Cache the token
+            _sp_token_cache[cache_key] = {
+                'access_token': access_token,
+                'expires_at': time.time() + expires_in
+            }
+
+            return access_token
+
+        except urllib.error.HTTPError as e:
+            error_body = e.read().decode('utf-8') if e.fp else str(e)
+            raise ValueError(f"OAuth token request failed: {e.code} - {error_body}")
+        except Exception as e:
+            raise ValueError(f"Failed to obtain OAuth token: {str(e)}")
+
+    def _get_connection_params(self, config: Dict) -> Dict[str, Any]:
+        """Build Databricks connection parameters from configuration"""
+        import re
+
+        # Extract host - check 'host' first, then fall back to 'connectionUrl' for backwards compatibility
+        connection_url = config.get('host') or config.get('connectionUrl', '')
+        if not connection_url:
+            raise ValueError("host (workspace URL) is required for Databricks")
+
+        # Extract host from URL - support both with and without protocol prefix
+        url_match = re.match(r'https?://([^/]+)', connection_url)
+        if url_match:
+            server_hostname = url_match.group(1)
+        else:
+            # Assume it's just the hostname without protocol
+            server_hostname = connection_url.split('/')[0].strip()
+
+        # Get HTTP path for SQL warehouse
+        http_path = config.get('warehouseHttpPath') or config.get('httpPath')
+        if not http_path:
+            warehouse_id = config.get('warehouseId')
+            if warehouse_id:
+                http_path = f"/sql/1.0/warehouses/{warehouse_id}"
+            else:
+                raise ValueError("Either warehouseHttpPath or warehouseId is required")
+
+        # Get access token
+        access_token = self._get_access_token(config)
+
+        conn_params = {
+            'server_hostname': server_hostname,
+            'http_path': http_path,
+            'access_token': access_token,
+        }
+
+        # Optional catalog (Unity Catalog)
+        catalog = config.get('catalog')
+        if catalog:
+            conn_params['catalog'] = catalog
+
+        # Optional schema
+        schema = config.get('schema')
+        if schema:
+            conn_params['schema'] = schema
+
+        return conn_params
+
+    def _list_catalogs(self, cursor) -> List[str]:
+        """List all accessible catalogs"""
+        cursor.execute("SHOW CATALOGS")
+        rows = cursor.fetchall()
+        return [row[0] for row in rows if row[0] not in ('system', 'samples')]
+
+    def _list_schemas(self, cursor, catalog: str) -> List[str]:
+        """List all schemas in a catalog"""
+        cursor.execute(f"SHOW SCHEMAS IN `{catalog}`")
+        rows = cursor.fetchall()
+        return [row[0] for row in rows if row[0] not in ('information_schema',)]
+
+    def _list_tables(self, cursor, catalog: str, schema: str) -> List[Dict]:
+        """List all tables in a schema with their type"""
+        cursor.execute(f"SHOW TABLES IN `{catalog}`.`{schema}`")
+        rows = cursor.fetchall()
+        tables = []
+        for row in rows:
+            # SHOW TABLES returns: database, tableName, isTemporary
+            table_name = row[1] if len(row) > 1 else row[0]
+            tables.append({
+                'table_name': table_name,
+                'table_type': 'TABLE'
+            })
+        return tables
+
+    def _get_table_columns(self, cursor, catalog: str, schema: str, table: str) -> List[Dict]:
+        """Get column information for a table"""
+        try:
+            cursor.execute(f"DESCRIBE TABLE `{catalog}`.`{schema}`.`{table}`")
+            rows = cursor.fetchall()
+            columns = []
+            for row in rows:
+                # DESCRIBE TABLE returns: col_name, data_type, comment
+                col_name = row[0]
+
+                # Skip metadata rows (partition info, etc.)
+                if col_name.startswith('#') or not col_name.strip():
+                    continue
+
+                columns.append({
+                    'column_name': col_name,
+                    'data_type': row[1] if len(row) > 1 else 'unknown',
+                    'is_nullable': 'YES',  # Databricks doesn't always expose this
+                    'column_default': None,
+                    'description': row[2] if len(row) > 2 and row[2] else None
+                })
+            return columns
+        except Exception as e:
+            print(f"[DatabricksSchemaHandler] Error getting columns for {catalog}.{schema}.{table}: {e}")
+            return []
+
+    def _fetch_table_with_columns(self, databricks_sql, conn_params: Dict, catalog: str, schema: str, table_info: Dict) -> Dict:
+        """Fetch a single table with its columns using a new connection (for parallel execution)"""
+        connection = None
+        try:
+            connection = databricks_sql.connect(**conn_params)
+            cursor = connection.cursor()
+
+            table_name = table_info['table_name']
+            columns = self._get_table_columns(cursor, catalog, schema, table_name)
+
+            cursor.close()
+
+            return {
+                'catalog': catalog,
+                'schema': schema,
+                'table': table_name,
+                'type': table_info.get('table_type', 'TABLE'),
+                'columns': columns,
+                'error': None
+            }
+        except Exception as e:
+            print(f"[DatabricksSchemaHandler] Error fetching table {catalog}.{schema}.{table_info.get('table_name', 'unknown')}: {e}")
+            return {
+                'catalog': catalog,
+                'schema': schema,
+                'table': table_info.get('table_name', 'unknown'),
+                'type': table_info.get('table_type', 'TABLE'),
+                'columns': [],
+                'error': str(e)
+            }
+        finally:
+            if connection:
+                try:
+                    connection.close()
+                except:
+                    pass
+
+    def _fetch_schema_tables(self, databricks_sql, conn_params: Dict, catalog: str, schema: str) -> Dict:
+        """Fetch all tables for a schema with parallel column fetching"""
+        connection = None
+        try:
+            # Get list of tables first
+            connection = databricks_sql.connect(**conn_params)
+            cursor = connection.cursor()
+
+            tables = self._list_tables(cursor, catalog, schema)
+            cursor.close()
+            connection.close()
+            connection = None
+
+            print(f" Schema {schema}: {len(tables)} tables - fetching in parallel...")
+
+            # Fetch table details in parallel
+            schema_obj = {
+                'schema': schema,
+                'tables': [],
+                'error': None
+            }
+
+            if not tables:
+                return schema_obj
+
+            # Use ThreadPoolExecutor for parallel table fetching
+            max_workers = min(10, len(tables))  # Limit concurrent connections
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                future_to_table = {
+                    executor.submit(
+                        self._fetch_table_with_columns,
+                        databricks_sql,
+                        conn_params,
+                        catalog,
+                        schema,
+                        table_info
+                    ): table_info for table_info in tables
+                }
+
+                for future in as_completed(future_to_table):
+                    try:
+                        table_obj = future.result()
+                        schema_obj['tables'].append(table_obj)
+                    except Exception as e:
+                        table_info = future_to_table[future]
+                        print(f" Error processing table {table_info.get('table_name')}: {e}")
+
+            return schema_obj
+
+        except Exception as e:
+            print(f" Error processing schema {schema}: {e}")
+            return {
+                'schema': schema,
+                'tables': [],
+                'error': str(e)
+            }
+        finally:
+            if connection:
+                try:
+                    connection.close()
+                except:
+                    pass
+
+    def _build_catalog(self, databricks_sql, conn_params: Dict, specified_catalog: Optional[str] = None, specified_schema: Optional[str] = None) -> Dict:
+        """Build complete catalog structure with parallel processing"""
+        connection = databricks_sql.connect(**conn_params)
+        cursor = connection.cursor()
+
+        try:
+            catalog_data = []
+
+            # Get catalogs to process
+            if specified_catalog:
+                catalogs = [specified_catalog]
+            else:
+                catalogs = self._list_catalogs(cursor)
+
+            print(f"[DatabricksSchemaHandler] Processing {len(catalogs)} catalogs with parallel optimization...")
+
+            for catalog in catalogs:
+                print(f" Processing catalog: {catalog}")
+                catalog_obj = {
+                    'catalog': catalog,
+                    'schemas': []
+                }
+
+                try:
+                    schemas_list = self._list_schemas(cursor, catalog)
+
+                    # Filter schemas if specified_schema is provided
+                    if specified_schema:
+                        schemas = [s for s in schemas_list if s == specified_schema]
+                    else:
+                        schemas = schemas_list
+
+                    print(f" Found {len(schemas)} schemas - processing in parallel...")
+
+                    if not schemas:
+                        catalog_data.append(catalog_obj)
+                        continue
+
+                    # Process schemas in parallel
+                    max_workers = min(5, len(schemas))  # Limit concurrent schema processing
+                    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                        future_to_schema = {
+                            executor.submit(
+                                self._fetch_schema_tables,
+                                databricks_sql,
+                                conn_params,
+                                catalog,
+                                schema
+                            ): schema for schema in schemas
+                        }
+
+                        for future in as_completed(future_to_schema):
+                            try:
+                                schema_obj = future.result()
+                                catalog_obj['schemas'].append(schema_obj)
+                            except Exception as e:
+                                schema = future_to_schema[future]
+                                print(f" Error processing schema {schema}: {e}")
+                                catalog_obj['schemas'].append({
+                                    'schema': schema,
+                                    'tables': [],
+                                    'error': str(e)
+                                })
+
+                except Exception as e:
+                    print(f" Error processing catalog {catalog}: {e}")
+                    catalog_obj['schemas'].append({
+                        'schema': 'default',
+                        'tables': [],
+                        'error': str(e)
+                    })
+
+                catalog_data.append(catalog_obj)
+
+            return {'catalogs': catalog_data}
+
+        finally:
+            cursor.close()
+            connection.close()
+
+    def _format_catalog_as_markdown(self, catalog_data: Dict) -> Tuple[str, Dict]:
+        """Format the catalog as markdown and build table_schemas dict"""
+        lines = ["# Databricks Database Schema\n"]
+        table_schemas = {}
+
+        total_tables = 0
+        for cat in catalog_data.get('catalogs', []):
+            for sch in cat.get('schemas', []):
+                total_tables += len(sch.get('tables', []))
+
+        lines.append(f"Found **{total_tables}** table(s)\n")
+
+        for cat in catalog_data.get('catalogs', []):
+            catalog_name = cat['catalog']
+
+            for sch in cat.get('schemas', []):
+                schema_name = sch['schema']
+
+                if sch.get('error'):
+                    lines.append(f"\n## {catalog_name}.{schema_name}\n")
+                    lines.append(f"Error: {sch['error']}\n")
+                    continue
+
+                for table in sch.get('tables', []):
+                    table_name = table['table']
+                    full_name = f"{catalog_name}.{schema_name}.{table_name}"
+
+                    lines.append(f"\n## {full_name}\n")
+
+                    columns = table.get('columns', [])
+                    lines.append(f"\n### Columns ({len(columns)})\n")
+
+                    for col in columns:
+                        col_name = col.get('column_name', 'unknown')
+                        data_type = col.get('data_type', 'unknown')
+                        description = col.get('description')
+
+                        if description:
+                            lines.append(f"- **{col_name}**: {data_type} - {description}\n")
+                        else:
+                            lines.append(f"- **{col_name}**: {data_type}\n")
+
+                    # Store in table_schemas
+                    table_schemas[full_name] = {
+                        'catalog': catalog_name,
+                        'schema': schema_name,
+                        'table_name': table_name,
+                        'full_name': full_name,
+                        'columns': [dict(col) for col in columns],
+                        'primary_keys': [],  # Databricks doesn't always expose PK info
+                        'foreign_keys': [],
+                        'indices': []
+                    }
+
+                    lines.append("\n---\n")
+
+        return ''.join(lines).strip(), table_schemas
+
+    @tornado.web.authenticated
+    def post(self):
+        """Get Databricks database schema information"""
+        try:
+            # Parse request body
+            try:
+                body = json.loads(self.request.body.decode('utf-8'))
+            except json.JSONDecodeError:
+                self.set_status(400)
+                self.finish(json.dumps({
+                    "error": "Invalid JSON in request body"
+                }))
+                return
+
+            # Get Databricks configuration from request or environment
+            config = self._get_databricks_config(body.get('config'))
+
+            if not config:
+                self.set_status(400)
+                self.finish(json.dumps({
+                    "error": "No Databricks configuration provided and no Databricks configurations found in environment"
+                }))
+                return
+
+            # Setup Databricks environment
+            try:
+                databricks_sql = self._setup_databricks_environment()
+            except ImportError as e:
+                self.set_status(500)
+                self.finish(json.dumps({
+                    "error": str(e)
+                }))
+                return
+
+            # Get database schema
+            try:
+                conn_params = self._get_connection_params(config)
+                specified_catalog = config.get('catalog')
+                specified_schema = config.get('schema')
+
+                print(f"[DatabricksSchemaHandler] Connecting to {conn_params['server_hostname']}")
+                if specified_catalog:
+                    print(f"[DatabricksSchemaHandler] Filtering to catalog: {specified_catalog}")
+                if specified_schema:
+                    print(f"[DatabricksSchemaHandler] Filtering to schema: {specified_schema}")
+
+                catalog_data = self._build_catalog(
+                    databricks_sql,
+                    conn_params,
+                    specified_catalog=specified_catalog,
+                    specified_schema=specified_schema
+                )
+
+                markdown_result, table_schemas = self._format_catalog_as_markdown(catalog_data)
+
+                self.finish(json.dumps({
+                    "result": markdown_result,
+                    "table_schemas": table_schemas,
+                    "catalogs": catalog_data.get('catalogs', [])
+                }))
+
+            except Exception as e:
+                error_msg = str(e)
+                # Provide helpful error messages
+                if 'PAT' in error_msg.upper() or 'token' in error_msg.lower():
+                    error_msg = f"Authentication failed: {error_msg}. If PATs are disabled in your workspace, try Service Principal authentication."
+                elif 'warehouse' in error_msg.lower():
+                    error_msg = f"SQL Warehouse error: {error_msg}. Ensure your warehouse is running and accessible."
+
+                self.set_status(500)
+                self.finish(json.dumps({
+                    "error": f"Error connecting to Databricks: {error_msg}"
+                }))
+
+        except Exception as e:
+            self.set_status(500)
+            self.finish(json.dumps({
+                "error": "Internal server error",
+                "message": str(e)
+            }))
+
+
+class DatabricksQueryHandler(APIHandler):
+    """Handler for Databricks query execution"""
+
+    def _setup_databricks_environment(self):
+        """Install required Databricks packages if not available"""
+        def install_package(package_name):
+            try:
+                subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
+                return True
+            except subprocess.CalledProcessError:
+                return False
+
+        missing_packages = []
+
+        try:
+            from databricks import sql as databricks_sql
+        except ImportError:
+            if install_package("databricks-sql-connector"):
+                try:
+                    from databricks import sql as databricks_sql
+                except ImportError as e:
+                    missing_packages.append(f"databricks-sql-connector: {str(e)}")
+            else:
+                missing_packages.append("databricks-sql-connector: installation failed")
+
+        if missing_packages:
+            raise ImportError("Required modules could not be installed: " + ", ".join(missing_packages))
+
+        from databricks import sql as databricks_sql
+        return databricks_sql
+
+    def _get_databricks_config(self, provided_config: Optional[Dict] = None) -> Optional[Dict]:
+        """Get Databricks configuration from request or environment variables"""
+        if provided_config:
+            return provided_config
+
+        # Look for Databricks database configuration in the environment
+        for key, value in os.environ.items():
+            if key.endswith('_CONNECTION_JSON'):
+                try:
+                    config = json.loads(value)
+                    if config.get('type') == 'databricks':
+                        return config
+                except Exception as e:
+                    print(f"[DatabricksQueryHandler] Error parsing database config {key}: {e}")
+                    continue
+
+        return None
+
+    def _get_access_token(self, config: Dict) -> str:
+        """Get access token for authentication - delegates to schema handler logic"""
+        # Reuse the schema handler's token logic
+        handler = DatabricksSchemaHandler(self.application, self.request)
+        return handler._get_access_token(config)
+
+    def _get_connection_params(self, config: Dict) -> Dict[str, Any]:
+        """Build Databricks connection parameters from configuration"""
+        import re
+
+        # Extract host - check 'host' first, then fall back to 'connectionUrl' for backwards compatibility
+        connection_url = config.get('host') or config.get('connectionUrl', '')
+        if not connection_url:
+            raise ValueError("host (workspace URL) is required for Databricks")
+
+        # Extract host from URL - support both with and without protocol prefix
+        url_match = re.match(r'https?://([^/]+)', connection_url)
+        if url_match:
+            server_hostname = url_match.group(1)
+        else:
+            # Assume it's just the hostname without protocol
+            server_hostname = connection_url.split('/')[0].strip()
+
+        http_path = config.get('warehouseHttpPath') or config.get('httpPath')
+        if not http_path:
+            warehouse_id = config.get('warehouseId')
+            if warehouse_id:
+                http_path = f"/sql/1.0/warehouses/{warehouse_id}"
+            else:
+                raise ValueError("Either warehouseHttpPath or warehouseId is required")
+
+        access_token = self._get_access_token(config)
+
+        conn_params = {
+            'server_hostname': server_hostname,
+            'http_path': http_path,
+            'access_token': access_token,
+        }
+
+        catalog = config.get('catalog')
+        if catalog:
+            conn_params['catalog'] = catalog
+
+        schema = config.get('schema')
+        if schema:
+            conn_params['schema'] = schema
+
+        return conn_params
+
+    @tornado.web.authenticated
+    def post(self):
+        """Execute a read-only SQL query on Databricks"""
+        try:
+            # Parse request body
+            try:
+                body = json.loads(self.request.body.decode('utf-8'))
+            except json.JSONDecodeError:
+                self.set_status(400)
+                self.finish(json.dumps({
+                    "error": "Invalid JSON in request body"
+                }))
+                return
+
+            # Get query from request
+            query = body.get('query')
+            if not query:
+                self.set_status(400)
+                self.finish(json.dumps({
+                    "error": "Missing 'query' field in request body"
+                }))
+                return
+
+            # Basic validation for read-only queries
+            normalized_query = query.strip().upper()
+            allowed_starts = ['SELECT', 'WITH', 'SHOW', 'DESCRIBE', 'EXPLAIN']
+
+            if not any(normalized_query.startswith(start) for start in allowed_starts):
+                self.set_status(400)
+                self.finish(json.dumps({
+                    "error": f"Only {', '.join(allowed_starts)} statements are allowed for read queries."
+                }))
+                return
+
+            # Get Databricks configuration from request or environment
+            config = self._get_databricks_config(body.get('config'))
+
+            if not config:
+                self.set_status(400)
+                self.finish(json.dumps({
+                    "error": "No Databricks configuration provided and no Databricks configurations found in environment"
+                }))
+                return
+
+            # Setup Databricks environment
+            try:
+                databricks_sql = self._setup_databricks_environment()
+            except ImportError as e:
+                self.set_status(500)
+                self.finish(json.dumps({
+                    "error": str(e)
+                }))
+                return
+
+            # Execute query
+            try:
+                conn_params = self._get_connection_params(config)
+
+                # Allow specifying a specific catalog for the query
+                catalog = body.get('catalog')
+                if catalog:
+                    conn_params['catalog'] = catalog
+
+                connection = databricks_sql.connect(**conn_params)
+                cursor = connection.cursor()
+
+                try:
+                    cursor.execute(query)
+
+                    # Get column names from cursor description
+                    columns = [desc[0] for desc in cursor.description] if cursor.description else []
+
+                    # Fetch all results
+                    rows = cursor.fetchall()
+
+                    # Convert result to list of dictionaries
+                    result_rows = [
+                        {columns[i]: row[i] for i in range(len(columns))}
+                        for row in rows
+                    ]
+
+                    self.finish(json.dumps({
+                        "result": result_rows
+                    }))
+
+                finally:
+                    cursor.close()
+                    connection.close()
+
+            except Exception as e:
+                self.set_status(500)
+                self.finish(json.dumps({
+                    "error": f"Databricks query failed: {str(e)}"
+                }))
+
+        except Exception as e:
+            self.set_status(500)
+            self.finish(json.dumps({
+                "error": "Internal server error",
+                "message": str(e)
+            }))
+
+
+class DatabricksTestHandler(APIHandler):
+    """Handler for testing Databricks connection"""
+
+    @tornado.web.authenticated
+    def post(self):
+        """Test Databricks connection and return status"""
+        try:
+            # Parse request body
+            try:
+                body = json.loads(self.request.body.decode('utf-8'))
+            except json.JSONDecodeError:
+                self.set_status(400)
+                self.finish(json.dumps({
+                    "error": "Invalid JSON in request body"
+                }))
+                return
+
+            config = body.get('config')
+            if not config:
+                self.set_status(400)
+                self.finish(json.dumps({
+                    "error": "No configuration provided"
+                }))
+                return
+
+            # Setup environment
+            schema_handler = DatabricksSchemaHandler(self.application, self.request)
+            try:
+                databricks_sql = schema_handler._setup_databricks_environment()
+            except ImportError as e:
+                self.set_status(500)
+                self.finish(json.dumps({
+                    "ok": False,
+                    "error": str(e)
+                }))
+                return
+
+            # Test connection
+            try:
+                import time
+                start_time = time.time()
+
+                conn_params = schema_handler._get_connection_params(config)
+                connection = databricks_sql.connect(**conn_params)
+                cursor = connection.cursor()
+
+                try:
+                    # Test basic query
+                    cursor.execute("SELECT 1 as test")
+                    cursor.fetchall()
+
+                    sql_latency = int((time.time() - start_time) * 1000)
+
+                    # Try to get current user
+                    identity_info = {"type": "unknown", "name": "unknown"}
+                    try:
+                        cursor.execute("SELECT current_user() as user")
+                        user_row = cursor.fetchone()
+                        if user_row:
+                            auth_type = config.get('authType', 'pat')
+                            identity_info = {
+                                "type": "user" if auth_type == 'pat' else "service_principal",
+                                "name": user_row[0]
+                            }
+                    except Exception:
+                        pass
+
+                    self.finish(json.dumps({
+                        "ok": True,
+                        "identity": identity_info,
+                        "sql": {"ok": True, "latency_ms": sql_latency},
+                        "api": {"ok": True}
+                    }))
+
+                finally:
+                    cursor.close()
+                    connection.close()
+
+            except Exception as e:
+                error_msg = str(e)
+                self.finish(json.dumps({
+                    "ok": False,
+                    "error": error_msg,
+                    "identity": None,
+                    "sql": {"ok": False, "error": error_msg},
+                    "api": {"ok": False}
+                }))
+
+        except Exception as e:
+            self.set_status(500)
+            self.finish(json.dumps({
+                "ok": False,
+                "error": "Internal server error",
+                "message": str(e)
+            }))
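
For orientation, the sketch below shows the shape of the connection configuration the new databricks_schema_service.py handlers read, either from the request body's "config" field or from any environment variable ending in "_CONNECTION_JSON" whose JSON payload has "type": "databricks". The key names mirror what the handlers look up (host/connectionUrl, authType, accessToken, clientId/clientSecret, warehouseHttpPath/warehouseId, catalog, schema); the variable name and all values are illustrative placeholders, not something shipped in the package.

import json
import os

# Hypothetical examples; only the key names are taken from the handler code above.

# Personal Access Token (authType "pat"): the token is passed through unchanged.
pat_config = {
    "type": "databricks",
    "host": "https://adb-1234567890123456.7.azuredatabricks.net",
    "authType": "pat",
    "accessToken": "<personal-access-token>",
    "warehouseId": "abc123def456",  # or "warehouseHttpPath": "/sql/1.0/warehouses/abc123def456"
    "catalog": "main",              # optional Unity Catalog filter
    "schema": "default",            # optional schema filter
}

# Service Principal (authType "service_principal"): an OAuth client-credentials
# token is requested and cached in _sp_token_cache; oauthTokenUrl, tenantId and
# scopes are optional and otherwise derived from the workspace URL.
sp_config = {
    "type": "databricks",
    "host": "https://dbc-xxxxxxxx-xxxx.cloud.databricks.com",
    "authType": "service_principal",
    "clientId": "<service-principal-client-id>",
    "clientSecret": "<service-principal-secret>",
    "warehouseHttpPath": "/sql/1.0/warehouses/abc123def456",
}

# With a variable like this set (the name is made up; only the suffix matters),
# a request whose body omits "config" falls back to the environment lookup in
# _get_databricks_config.
os.environ["ACME_DATABRICKS_CONNECTION_JSON"] = json.dumps(pat_config)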