classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,1279 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
from collections import deque
|
|
7
|
+
from collections.abc import AsyncGenerator, Generator
|
|
8
|
+
from contextlib import closing
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from datetime import UTC, datetime, timedelta
|
|
11
|
+
from typing import Any
|
|
12
|
+
from urllib.parse import urlparse
|
|
13
|
+
|
|
14
|
+
import requests
|
|
15
|
+
|
|
16
|
+
from ...models.generated_input import (
|
|
17
|
+
DatabricksInput,
|
|
18
|
+
DatabricksMaskedPat,
|
|
19
|
+
DatabricksMaskedServicePrincipal,
|
|
20
|
+
DatabricksOptionalConnection,
|
|
21
|
+
DatabricksOptionalExtraction,
|
|
22
|
+
DatabricksOptionalScope,
|
|
23
|
+
DatabricksRequiredPat,
|
|
24
|
+
DatabricksRequiredServicePrincipal,
|
|
25
|
+
SamplingConfig,
|
|
26
|
+
SamplingStrategy,
|
|
27
|
+
)
|
|
28
|
+
from ...models.generated_single_asset_scan_results import (
|
|
29
|
+
AssetType as OutputAssetType,
|
|
30
|
+
)
|
|
31
|
+
from ...models.generated_single_asset_scan_results import (
|
|
32
|
+
DetectionResult,
|
|
33
|
+
SingleAssetScanResults,
|
|
34
|
+
)
|
|
35
|
+
from ...utils.hashing import hash_id, unhash_id
|
|
36
|
+
from ..base import BaseSource
|
|
37
|
+
from ..dependencies import require_module
|
|
38
|
+
from ..tabular_utils import build_tabular_location, format_tabular_sample_content
|
|
39
|
+
|
|
40
|
+
logger = logging.getLogger(__name__)
|
|
41
|
+
|
|
42
|
+
_DEFAULT_EXCLUDED_CATALOGS = {"system"}
|
|
43
|
+
_DEFAULT_EXCLUDED_SCHEMAS = {"information_schema"}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass(frozen=True)
class TableRef:
    """Immutable reference to a single catalog object (table or view)."""

    catalog: str  # catalog the object lives in
    schema: str  # schema inside that catalog
    table: str  # object name inside the schema
    object_type: str  # "TABLE" or "VIEW", as produced by _coerce_object_type
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass(frozen=True)
class NotebookRef:
    """Immutable description of a notebook found while listing the workspace."""

    path: str  # workspace path reported by the listing
    object_id: str | None  # stringified "object_id" from the listing, if present
    language: str | None  # stringified "language" from the listing, if present
    created_at_ms: int | None  # "created_at" from the listing; presumably epoch milliseconds
    modified_at_ms: int | None  # "modified_at" from the listing; presumably epoch milliseconds
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass(frozen=True)
class PipelineRef:
    """Immutable description of a pipeline returned by the pipelines listing."""

    pipeline_id: str  # "pipeline_id" (or "id") of the listing entry
    name: str  # listing "name"; falls back to the pipeline id when missing
    state: str | None  # listing "state" (or "health") when it is a non-empty string
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _quote_identifier(identifier: str) -> str:
|
|
71
|
+
return f"`{identifier.replace('`', '``')}`"
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _quote_literal(value: str) -> str:
|
|
75
|
+
return "'" + value.replace("'", "''") + "'"
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class DatabricksSource(BaseSource):
|
|
79
|
+
source_type = "databricks"
|
|
80
|
+
|
|
81
|
+
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe and prepare connector, HTTP session and caches.

    Args:
        recipe: Raw source recipe; validated into a ``DatabricksInput``.
        source_id: Identifier forwarded to the base class.
        runner_id: Identifier of the run; defaults to ``"local-run"``.

    Raises:
        ValueError: If the auth section of the recipe is inconsistent
            (see ``_validate_auth_configuration``).
    """
    super().__init__(recipe, source_id, runner_id)
    self.config = DatabricksInput.model_validate(recipe)
    self.runner_id = runner_id or "local-run"

    self._databricks_sql = require_module(
        module_name="databricks.sql",
        source_name="Databricks",
        uv_groups=["databricks"],
        detail="The Databricks SQL connector is optional.",
    )

    # pyarrow→pandas conversion calls pytz.timezone() on the timezone name
    # embedded in Arrow schema metadata. Databricks uses 'Etc/UTC' which is
    # absent from pytz's built-in zone list. Pre-populating the cache makes
    # pytz.timezone('Etc/UTC') return UTC without hitting the lookup failure.
    try:
        import pytz

        pytz._tzinfo_cache.setdefault("Etc/UTC", pytz.UTC)
    except Exception as exc:
        # Best-effort workaround: never fail construction because of it,
        # but leave a trace so a broken workaround can be diagnosed.
        logger.debug("Could not pre-populate pytz cache for 'Etc/UTC': %s", exc)

    self._validate_auth_configuration()

    self.session = requests.Session()
    # Service-principal token cache; unused in PAT mode.
    self._access_token: str | None = None
    self._access_token_expiry: datetime | None = None

    self._table_lookup: dict[str, TableRef] = {}
    self._content_cache: dict[str, tuple[str, str]] = {}
|
|
117
|
+
|
|
118
|
+
def _validate_auth_configuration(self) -> None:
    """Ensure the ``required`` and ``masked`` config sections use the same auth mode.

    Raises:
        ValueError: If the masked section does not match the required
            section, or the required section is of an unknown kind.
    """
    required, masked = self.config.required, self.config.masked

    if isinstance(required, DatabricksRequiredPat):
        if isinstance(masked, DatabricksMaskedPat):
            return
        raise ValueError("DATABRICKS PAT_TOKEN auth requires masked.token")

    if isinstance(required, DatabricksRequiredServicePrincipal):
        if isinstance(masked, DatabricksMaskedServicePrincipal):
            return
        raise ValueError("DATABRICKS SERVICE_PRINCIPAL auth requires masked.client_secret")

    raise ValueError("Unsupported DATABRICKS auth configuration")
|
|
133
|
+
|
|
134
|
+
def _asset_type_value(self) -> str:
|
|
135
|
+
type_value = self.config.type
|
|
136
|
+
return type_value.value if hasattr(type_value, "value") else str(type_value)
|
|
137
|
+
|
|
138
|
+
def _sampling(self) -> SamplingConfig:
|
|
139
|
+
return self.config.sampling
|
|
140
|
+
|
|
141
|
+
def _connection_options(self) -> DatabricksOptionalConnection:
|
|
142
|
+
if self.config.optional and self.config.optional.connection:
|
|
143
|
+
return self.config.optional.connection
|
|
144
|
+
return DatabricksOptionalConnection()
|
|
145
|
+
|
|
146
|
+
def _scope_options(self) -> DatabricksOptionalScope:
|
|
147
|
+
if self.config.optional and self.config.optional.scope:
|
|
148
|
+
return self.config.optional.scope
|
|
149
|
+
return DatabricksOptionalScope()
|
|
150
|
+
|
|
151
|
+
def _extraction_options(self) -> DatabricksOptionalExtraction:
|
|
152
|
+
if self.config.optional and self.config.optional.extraction:
|
|
153
|
+
return self.config.optional.extraction
|
|
154
|
+
return DatabricksOptionalExtraction()
|
|
155
|
+
|
|
156
|
+
def _workspace_url(self) -> str:
|
|
157
|
+
return str(self.config.required.workspace_url).rstrip("/")
|
|
158
|
+
|
|
159
|
+
def _workspace_host(self) -> str:
|
|
160
|
+
parsed = urlparse(self._workspace_url())
|
|
161
|
+
return parsed.netloc
|
|
162
|
+
|
|
163
|
+
def _warehouse_id(self) -> str:
|
|
164
|
+
return self.config.required.warehouse_id
|
|
165
|
+
|
|
166
|
+
def _timeout_seconds(self) -> int:
|
|
167
|
+
timeout = self._connection_options().timeout_seconds
|
|
168
|
+
return int(timeout or 30)
|
|
169
|
+
|
|
170
|
+
def _statement_timeout_seconds(self) -> int:
|
|
171
|
+
timeout = self._connection_options().statement_timeout_seconds
|
|
172
|
+
return int(timeout or 60)
|
|
173
|
+
|
|
174
|
+
def _is_pat_mode(self) -> bool:
    """Tell whether the required config section is the PAT variant."""
    required = self.config.required
    return isinstance(required, DatabricksRequiredPat)
|
|
176
|
+
|
|
177
|
+
def _masked_pat_token(self) -> str:
    """Return the PAT from the masked config section.

    Raises:
        ValueError: If the masked section is not the PAT variant.
    """
    masked = self.config.masked
    if isinstance(masked, DatabricksMaskedPat):
        return masked.token
    raise ValueError("DATABRICKS PAT_TOKEN auth requires masked.token")
|
|
182
|
+
|
|
183
|
+
def _service_principal_credentials(self) -> tuple[str, str]:
    """Return ``(client_id, client_secret)`` for service-principal auth.

    Raises:
        ValueError: If either config section is not the service-principal
            variant.
    """
    required, masked = self.config.required, self.config.masked

    if not isinstance(required, DatabricksRequiredServicePrincipal):
        raise ValueError("SERVICE_PRINCIPAL auth mode is required")
    if not isinstance(masked, DatabricksMaskedServicePrincipal):
        raise ValueError("DATABRICKS SERVICE_PRINCIPAL auth requires masked.client_secret")

    credentials = (required.client_id, masked.client_secret)
    return credentials
|
|
191
|
+
|
|
192
|
+
def _is_access_token_expired(self) -> bool:
|
|
193
|
+
if self._access_token_expiry is None:
|
|
194
|
+
return True
|
|
195
|
+
return self._access_token_expiry <= datetime.now(UTC)
|
|
196
|
+
|
|
197
|
+
def _acquire_service_principal_token(self) -> str:
|
|
198
|
+
client_id, client_secret = self._service_principal_credentials()
|
|
199
|
+
|
|
200
|
+
response = self.session.post(
|
|
201
|
+
f"{self._workspace_url()}/oidc/v1/token",
|
|
202
|
+
data={
|
|
203
|
+
"grant_type": "client_credentials",
|
|
204
|
+
"client_id": client_id,
|
|
205
|
+
"client_secret": client_secret,
|
|
206
|
+
"scope": "all-apis",
|
|
207
|
+
},
|
|
208
|
+
timeout=self._timeout_seconds(),
|
|
209
|
+
)
|
|
210
|
+
response.raise_for_status()
|
|
211
|
+
|
|
212
|
+
payload = response.json()
|
|
213
|
+
token = payload.get("access_token")
|
|
214
|
+
if not isinstance(token, str) or not token.strip():
|
|
215
|
+
raise ValueError("Databricks token response did not include access_token")
|
|
216
|
+
|
|
217
|
+
expires_in = int(payload.get("expires_in", 3600))
|
|
218
|
+
safety_seconds = 300
|
|
219
|
+
valid_for_seconds = max(expires_in - safety_seconds, 0)
|
|
220
|
+
self._access_token_expiry = datetime.now(UTC) + timedelta(seconds=valid_for_seconds)
|
|
221
|
+
|
|
222
|
+
return token.strip()
|
|
223
|
+
|
|
224
|
+
def _access_token_value(self) -> str:
|
|
225
|
+
if self._is_pat_mode():
|
|
226
|
+
return self._masked_pat_token().strip()
|
|
227
|
+
|
|
228
|
+
if self._access_token and not self._is_access_token_expired():
|
|
229
|
+
return self._access_token
|
|
230
|
+
|
|
231
|
+
self._access_token = self._acquire_service_principal_token()
|
|
232
|
+
return self._access_token
|
|
233
|
+
|
|
234
|
+
def _authorization_header(self) -> str:
|
|
235
|
+
return f"Bearer {self._access_token_value()}"
|
|
236
|
+
|
|
237
|
+
def _request_json(
|
|
238
|
+
self,
|
|
239
|
+
method: str,
|
|
240
|
+
path: str,
|
|
241
|
+
*,
|
|
242
|
+
params: dict[str, Any] | None = None,
|
|
243
|
+
json_payload: dict[str, Any] | None = None,
|
|
244
|
+
) -> dict[str, Any]:
|
|
245
|
+
url = (
|
|
246
|
+
path
|
|
247
|
+
if path.startswith("http://") or path.startswith("https://")
|
|
248
|
+
else f"{self._workspace_url()}/{path.lstrip('/')}"
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
headers = {
|
|
252
|
+
"Authorization": self._authorization_header(),
|
|
253
|
+
"Accept": "application/json",
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
response = self.session.request(
|
|
257
|
+
method=method,
|
|
258
|
+
url=url,
|
|
259
|
+
headers=headers,
|
|
260
|
+
params=params,
|
|
261
|
+
json=json_payload,
|
|
262
|
+
timeout=self._timeout_seconds(),
|
|
263
|
+
)
|
|
264
|
+
response.raise_for_status()
|
|
265
|
+
|
|
266
|
+
if response.status_code == 204 or not response.text.strip():
|
|
267
|
+
return {}
|
|
268
|
+
|
|
269
|
+
return response.json()
|
|
270
|
+
|
|
271
|
+
def _paged_values(
|
|
272
|
+
self,
|
|
273
|
+
path: str,
|
|
274
|
+
*,
|
|
275
|
+
params: dict[str, Any] | None = None,
|
|
276
|
+
value_keys: tuple[str, ...],
|
|
277
|
+
) -> list[dict[str, Any]]:
|
|
278
|
+
collected: list[dict[str, Any]] = []
|
|
279
|
+
|
|
280
|
+
next_page_token: str | None = None
|
|
281
|
+
while True:
|
|
282
|
+
current_params = dict(params or {})
|
|
283
|
+
if next_page_token:
|
|
284
|
+
current_params["page_token"] = next_page_token
|
|
285
|
+
|
|
286
|
+
payload = self._request_json("get", path, params=current_params)
|
|
287
|
+
values: Any = None
|
|
288
|
+
for key in value_keys:
|
|
289
|
+
candidate = payload.get(key)
|
|
290
|
+
if isinstance(candidate, list):
|
|
291
|
+
values = candidate
|
|
292
|
+
break
|
|
293
|
+
|
|
294
|
+
if isinstance(values, list):
|
|
295
|
+
for entry in values:
|
|
296
|
+
if isinstance(entry, dict):
|
|
297
|
+
collected.append(entry)
|
|
298
|
+
|
|
299
|
+
token = payload.get("next_page_token")
|
|
300
|
+
if not isinstance(token, str) or not token.strip():
|
|
301
|
+
break
|
|
302
|
+
next_page_token = token
|
|
303
|
+
|
|
304
|
+
return collected
|
|
305
|
+
|
|
306
|
+
def _connect_sql(self):
|
|
307
|
+
return self._databricks_sql.connect(
|
|
308
|
+
server_hostname=self._workspace_host(),
|
|
309
|
+
http_path=f"/sql/1.0/warehouses/{self._warehouse_id()}",
|
|
310
|
+
access_token=self._access_token_value(),
|
|
311
|
+
_socket_timeout=self._timeout_seconds(),
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
def _catalog_allowlist(self) -> set[str] | None:
|
|
315
|
+
configured = self._scope_options().include_catalogs
|
|
316
|
+
if not configured:
|
|
317
|
+
return None
|
|
318
|
+
return {entry.strip().lower() for entry in configured if entry and entry.strip()}
|
|
319
|
+
|
|
320
|
+
def _catalog_denylist(self) -> set[str]:
|
|
321
|
+
configured = self._scope_options().exclude_catalogs or []
|
|
322
|
+
denylist = {entry.strip().lower() for entry in configured if entry and entry.strip()}
|
|
323
|
+
if not denylist:
|
|
324
|
+
denylist = set(_DEFAULT_EXCLUDED_CATALOGS)
|
|
325
|
+
return denylist
|
|
326
|
+
|
|
327
|
+
def _schema_allowlist(self) -> set[str] | None:
|
|
328
|
+
configured = self._scope_options().include_schemas
|
|
329
|
+
if not configured:
|
|
330
|
+
return None
|
|
331
|
+
return {entry.strip().lower() for entry in configured if entry and entry.strip()}
|
|
332
|
+
|
|
333
|
+
def _schema_denylist(self) -> set[str]:
|
|
334
|
+
configured = self._scope_options().exclude_schemas or []
|
|
335
|
+
denylist = {entry.strip().lower() for entry in configured if entry and entry.strip()}
|
|
336
|
+
if not denylist:
|
|
337
|
+
denylist = set(_DEFAULT_EXCLUDED_SCHEMAS)
|
|
338
|
+
return denylist
|
|
339
|
+
|
|
340
|
+
def _table_allowlist(self) -> set[str] | None:
|
|
341
|
+
configured = self._scope_options().include_tables
|
|
342
|
+
if not configured:
|
|
343
|
+
return None
|
|
344
|
+
return {entry.strip().lower() for entry in configured if entry and entry.strip()}
|
|
345
|
+
|
|
346
|
+
def _catalog_allowed(self, catalog: str) -> bool:
|
|
347
|
+
normalized = catalog.lower()
|
|
348
|
+
|
|
349
|
+
if normalized in self._catalog_denylist():
|
|
350
|
+
return False
|
|
351
|
+
|
|
352
|
+
if normalized == "hive_metastore" and not self._scope_options().include_hive_metastore:
|
|
353
|
+
return False
|
|
354
|
+
|
|
355
|
+
allowlist = self._catalog_allowlist()
|
|
356
|
+
if allowlist and normalized not in allowlist:
|
|
357
|
+
return False
|
|
358
|
+
return True
|
|
359
|
+
|
|
360
|
+
def _schema_allowed(self, catalog: str, schema: str) -> bool:
|
|
361
|
+
scoped_schema = f"{catalog}.{schema}".lower()
|
|
362
|
+
|
|
363
|
+
denylist = self._schema_denylist()
|
|
364
|
+
if schema.lower() in denylist or scoped_schema in denylist:
|
|
365
|
+
return False
|
|
366
|
+
|
|
367
|
+
allowlist = self._schema_allowlist()
|
|
368
|
+
if not allowlist:
|
|
369
|
+
return True
|
|
370
|
+
|
|
371
|
+
return schema.lower() in allowlist or scoped_schema in allowlist
|
|
372
|
+
|
|
373
|
+
def _table_allowed(self, table_ref: TableRef) -> bool:
|
|
374
|
+
allowlist = self._table_allowlist()
|
|
375
|
+
if not allowlist:
|
|
376
|
+
return True
|
|
377
|
+
|
|
378
|
+
table = table_ref.table.lower()
|
|
379
|
+
schema_table = f"{table_ref.schema}.{table_ref.table}".lower()
|
|
380
|
+
catalog_schema_table = f"{table_ref.catalog}.{table_ref.schema}.{table_ref.table}".lower()
|
|
381
|
+
|
|
382
|
+
return table in allowlist or schema_table in allowlist or catalog_schema_table in allowlist
|
|
383
|
+
|
|
384
|
+
def _list_catalogs(self) -> list[str]:
    """Return the sorted names of all in-scope catalogs from the catalogs endpoint."""
    entries = self._paged_values(
        "/api/2.1/unity-catalog/catalogs",
        value_keys=("catalogs", "value", "items"),
    )

    candidate_names = (entry.get("name") for entry in entries)
    catalogs = sorted(
        name
        for name in candidate_names
        if isinstance(name, str) and name and self._catalog_allowed(name)
    )

    logger.info("Found %d catalog(s): %s", len(catalogs), ", ".join(catalogs) or "(none)")
    return catalogs
|
|
399
|
+
|
|
400
|
+
def _list_schemas_for_catalog(self, catalog: str) -> list[str]:
    """Return the sorted names of all in-scope schemas of *catalog*."""
    entries = self._paged_values(
        "/api/2.1/unity-catalog/schemas",
        params={"catalog_name": catalog},
        value_keys=("schemas", "value", "items"),
    )

    candidate_names = (entry.get("name") for entry in entries)
    schemas = sorted(
        name
        for name in candidate_names
        if isinstance(name, str) and name and self._schema_allowed(catalog, name)
    )

    logger.info("Catalog %s: found %d schema(s)", catalog, len(schemas))
    return schemas
|
|
418
|
+
|
|
419
|
+
def _coerce_object_type(self, table_type: Any) -> str:
|
|
420
|
+
normalized = str(table_type or "TABLE").upper()
|
|
421
|
+
if "VIEW" in normalized:
|
|
422
|
+
return "VIEW"
|
|
423
|
+
return "TABLE"
|
|
424
|
+
|
|
425
|
+
def _list_tables_for_schema(self, catalog: str, schema: str) -> list[TableRef]:
    """Return in-scope tables of ``catalog.schema``, capped by ``table_limit_per_schema``."""
    entries = self._paged_values(
        "/api/2.1/unity-catalog/tables",
        params={"catalog_name": catalog, "schema_name": schema},
        value_keys=("tables", "value", "items"),
    )

    configured_limit = self._scope_options().table_limit_per_schema
    limit = int(configured_limit) if configured_limit else None

    selected: list[TableRef] = []
    for entry in entries:
        name = entry.get("name") or entry.get("table_name")
        if isinstance(name, str) and name:
            raw_type = entry.get("table_type") or entry.get("type")
            candidate = TableRef(
                catalog=catalog,
                schema=schema,
                table=name,
                object_type=self._coerce_object_type(raw_type),
            )
            if self._table_allowed(candidate):
                selected.append(candidate)
                # The cap is checked only after an append.
                if limit is not None and len(selected) >= limit:
                    break

    logger.info("Schema %s.%s: found %d table(s)", catalog, schema, len(selected))
    return selected
|
|
457
|
+
|
|
458
|
+
def _iter_tables(self) -> list[TableRef]:
    """Discover every in-scope table across catalogs and schemas.

    Listing failures for a catalog or schema are logged and skipped;
    discovery stops early once ``self._aborted`` is set.
    """
    discovered: list[TableRef] = []

    for catalog in self._list_catalogs():
        if self._aborted:
            break

        try:
            schemas = self._list_schemas_for_catalog(catalog)
        except Exception as exc:
            logger.warning("Skipping catalog %s due to schema listing error: %s", catalog, exc)
            continue

        for schema in schemas:
            if self._aborted:
                break

            try:
                found = self._list_tables_for_schema(catalog, schema)
            except Exception as exc:
                logger.warning(
                    "Skipping schema %s.%s due to table listing error: %s",
                    catalog,
                    schema,
                    exc,
                )
            else:
                discovered.extend(found)

    logger.info("Discovery complete: %d table(s) in scope", len(discovered))
    return discovered
|
|
487
|
+
|
|
488
|
+
def _table_key(self, table_ref: TableRef) -> tuple[str, str, str]:
|
|
489
|
+
return (table_ref.catalog, table_ref.schema, table_ref.table)
|
|
490
|
+
|
|
491
|
+
def _table_raw_id(self, table_ref: TableRef) -> str:
|
|
492
|
+
return f"{table_ref.catalog}_#_{table_ref.schema}_#_{table_ref.table}"
|
|
493
|
+
|
|
494
|
+
def _parse_qualified_table_name(self, value: str) -> tuple[str, str, str] | None:
|
|
495
|
+
cleaned = value.strip().strip("`")
|
|
496
|
+
if not cleaned:
|
|
497
|
+
return None
|
|
498
|
+
|
|
499
|
+
parts = [part.strip().strip("`") for part in cleaned.split(".") if part.strip()]
|
|
500
|
+
if len(parts) < 3:
|
|
501
|
+
return None
|
|
502
|
+
|
|
503
|
+
return (parts[-3], parts[-2], parts[-1])
|
|
504
|
+
|
|
505
|
+
def _lineage_table_ref_from_payload(
|
|
506
|
+
self,
|
|
507
|
+
payload: dict[str, Any],
|
|
508
|
+
) -> tuple[str, str, str] | None:
|
|
509
|
+
nested = payload.get("tableInfo")
|
|
510
|
+
if isinstance(nested, dict):
|
|
511
|
+
payload = nested
|
|
512
|
+
|
|
513
|
+
catalog = payload.get("catalog_name") or payload.get("catalog")
|
|
514
|
+
schema = payload.get("schema_name") or payload.get("schema")
|
|
515
|
+
table = payload.get("name") or payload.get("table")
|
|
516
|
+
|
|
517
|
+
if all(isinstance(value, str) and value for value in (catalog, schema, table)):
|
|
518
|
+
return (catalog, schema, table)
|
|
519
|
+
|
|
520
|
+
table_name = payload.get("table_name") or payload.get("full_name")
|
|
521
|
+
if isinstance(table_name, str) and table_name.strip():
|
|
522
|
+
return self._parse_qualified_table_name(table_name)
|
|
523
|
+
|
|
524
|
+
return None
|
|
525
|
+
|
|
526
|
+
def _lineage_refs_for_table(self, table_ref: TableRef) -> set[tuple[str, str, str]]:
|
|
527
|
+
response = self._request_json(
|
|
528
|
+
"get",
|
|
529
|
+
"/api/2.0/lineage-tracking/table-lineage",
|
|
530
|
+
params={
|
|
531
|
+
"table_name": f"{table_ref.catalog}.{table_ref.schema}.{table_ref.table}",
|
|
532
|
+
"include_entity_lineage": str(
|
|
533
|
+
bool(self._extraction_options().include_notebooks)
|
|
534
|
+
).lower(),
|
|
535
|
+
},
|
|
536
|
+
)
|
|
537
|
+
|
|
538
|
+
refs: set[tuple[str, str, str]] = set()
|
|
539
|
+
|
|
540
|
+
upstreams = response.get("upstreams")
|
|
541
|
+
if isinstance(upstreams, list):
|
|
542
|
+
for entry in upstreams:
|
|
543
|
+
if isinstance(entry, dict):
|
|
544
|
+
parsed = self._lineage_table_ref_from_payload(entry)
|
|
545
|
+
if parsed:
|
|
546
|
+
refs.add(parsed)
|
|
547
|
+
|
|
548
|
+
upstream_tables = response.get("upstream_tables")
|
|
549
|
+
if isinstance(upstream_tables, list):
|
|
550
|
+
for entry in upstream_tables:
|
|
551
|
+
if isinstance(entry, dict):
|
|
552
|
+
parsed = self._lineage_table_ref_from_payload(entry)
|
|
553
|
+
if parsed:
|
|
554
|
+
refs.add(parsed)
|
|
555
|
+
|
|
556
|
+
return refs
|
|
557
|
+
|
|
558
|
+
def _iter_notebooks(self) -> Generator[NotebookRef, None, None]:
|
|
559
|
+
if not self._extraction_options().include_notebooks:
|
|
560
|
+
return
|
|
561
|
+
|
|
562
|
+
queue: deque[str] = deque(["/"])
|
|
563
|
+
visited_paths: set[str] = set()
|
|
564
|
+
|
|
565
|
+
while queue:
|
|
566
|
+
if self._aborted:
|
|
567
|
+
break
|
|
568
|
+
|
|
569
|
+
path = queue.popleft()
|
|
570
|
+
if path in visited_paths:
|
|
571
|
+
continue
|
|
572
|
+
visited_paths.add(path)
|
|
573
|
+
|
|
574
|
+
try:
|
|
575
|
+
payload = self._request_json(
|
|
576
|
+
"get",
|
|
577
|
+
"/api/2.0/workspace/list",
|
|
578
|
+
params={"path": path},
|
|
579
|
+
)
|
|
580
|
+
except Exception as exc:
|
|
581
|
+
logger.warning("Skipping workspace path %s due to listing error: %s", path, exc)
|
|
582
|
+
continue
|
|
583
|
+
|
|
584
|
+
objects = payload.get("objects")
|
|
585
|
+
if not isinstance(objects, list):
|
|
586
|
+
continue
|
|
587
|
+
|
|
588
|
+
for obj in objects:
|
|
589
|
+
if not isinstance(obj, dict):
|
|
590
|
+
continue
|
|
591
|
+
|
|
592
|
+
object_type = str(obj.get("object_type") or "").upper()
|
|
593
|
+
object_path = obj.get("path")
|
|
594
|
+
if not isinstance(object_path, str) or not object_path:
|
|
595
|
+
continue
|
|
596
|
+
|
|
597
|
+
if object_type == "DIRECTORY":
|
|
598
|
+
queue.append(object_path)
|
|
599
|
+
continue
|
|
600
|
+
|
|
601
|
+
if object_type != "NOTEBOOK":
|
|
602
|
+
continue
|
|
603
|
+
|
|
604
|
+
object_id = obj.get("object_id")
|
|
605
|
+
yield NotebookRef(
|
|
606
|
+
path=object_path,
|
|
607
|
+
object_id=str(object_id) if object_id is not None else None,
|
|
608
|
+
language=(
|
|
609
|
+
str(obj.get("language")) if obj.get("language") is not None else None
|
|
610
|
+
),
|
|
611
|
+
created_at_ms=(
|
|
612
|
+
int(obj["created_at"]) if isinstance(obj.get("created_at"), int) else None
|
|
613
|
+
),
|
|
614
|
+
modified_at_ms=(
|
|
615
|
+
int(obj["modified_at"]) if isinstance(obj.get("modified_at"), int) else None
|
|
616
|
+
),
|
|
617
|
+
)
|
|
618
|
+
|
|
619
|
+
def _iter_pipelines(self) -> Generator[PipelineRef, None, None]:
|
|
620
|
+
if not self._extraction_options().include_pipelines:
|
|
621
|
+
return
|
|
622
|
+
|
|
623
|
+
next_page_token: str | None = None
|
|
624
|
+
while True:
|
|
625
|
+
params = {}
|
|
626
|
+
if next_page_token:
|
|
627
|
+
params["page_token"] = next_page_token
|
|
628
|
+
|
|
629
|
+
try:
|
|
630
|
+
payload = self._request_json("get", "/api/2.0/pipelines", params=params)
|
|
631
|
+
except Exception as exc:
|
|
632
|
+
logger.warning("Could not list Databricks pipelines: %s", exc)
|
|
633
|
+
break
|
|
634
|
+
|
|
635
|
+
values: list[dict[str, Any]] = []
|
|
636
|
+
for key in ("statuses", "pipelines", "value", "items"):
|
|
637
|
+
candidate = payload.get(key)
|
|
638
|
+
if isinstance(candidate, list):
|
|
639
|
+
values = candidate
|
|
640
|
+
break
|
|
641
|
+
|
|
642
|
+
for entry in values:
|
|
643
|
+
pipeline_id = entry.get("pipeline_id") or entry.get("id")
|
|
644
|
+
if not isinstance(pipeline_id, str) or not pipeline_id:
|
|
645
|
+
continue
|
|
646
|
+
|
|
647
|
+
name = entry.get("name")
|
|
648
|
+
state = entry.get("state") or entry.get("health")
|
|
649
|
+
yield PipelineRef(
|
|
650
|
+
pipeline_id=pipeline_id,
|
|
651
|
+
name=str(name) if isinstance(name, str) and name else pipeline_id,
|
|
652
|
+
state=str(state) if isinstance(state, str) and state else None,
|
|
653
|
+
)
|
|
654
|
+
|
|
655
|
+
token = payload.get("next_page_token")
|
|
656
|
+
if not isinstance(token, str) or not token.strip():
|
|
657
|
+
break
|
|
658
|
+
next_page_token = token
|
|
659
|
+
|
|
660
|
+
def test_connection(self) -> dict[str, Any]:
    """Check catalog listing and SQL connectivity and report the outcome.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status``
        (``"SUCCESS"`` or ``"FAILURE"``) and a human-readable ``message``.
        Failures are reported in the dict rather than raised.
    """
    logger.info("Testing connection to Databricks Unity Catalog...")
    report: dict[str, Any] = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        catalogs = self._list_catalogs()
        if not catalogs:
            raise ValueError("No Unity Catalog catalogs available for scanning")

        # Run a trivial query to confirm the SQL connection works.
        with closing(self._connect_sql()) as conn, conn.cursor() as cursor:
            cursor.execute("SELECT 1")
            cursor.fetchone()

        if isinstance(self.config.required, DatabricksRequiredPat):
            auth_mode = "PAT_TOKEN"
        else:
            auth_mode = "SERVICE_PRINCIPAL"
        report.update(
            status="SUCCESS",
            message=(
                "Successfully connected to Databricks Unity Catalog "
                f"using {auth_mode}. Reachable catalogs: {len(catalogs)}."
            ),
        )
    except Exception as exc:
        report.update(
            status="FAILURE",
            message=f"Failed to connect to Databricks Unity Catalog: {exc}",
        )

    return report
|
|
692
|
+
|
|
693
|
+
def _table_to_asset(
    self,
    table_ref: TableRef,
    *,
    links: list[str] | None = None,
) -> SingleAssetScanResults:
    """Build the scan-result asset that describes one table.

    Args:
        table_ref: Catalog, schema, table name and object type of the table.
        links: Values stored in the asset's ``links`` field; ``None`` or an
            empty list produces an asset with no links.

    Returns:
        A ``TABLE`` asset whose ``created_at`` and ``updated_at`` are both
        the current UTC time and whose checksum is computed from the table
        identity plus the configured sampling strategy.
    """
    catalog, schema, table = table_ref.catalog, table_ref.schema, table_ref.table
    qualified_name = f"{catalog}.{schema}.{table}"
    hashed_id = self.generate_hash_id(self._table_raw_id(table_ref))
    url = f"{self._workspace_url()}/explore/data/{catalog}/{schema}/{table}"

    strategy_label = str(self._sampling().strategy)
    checksum_input = {
        "catalog": catalog,
        "schema": schema,
        "table": table,
        "object_type": table_ref.object_type,
        "sampling": {
            "strategy": strategy_label,
        },
    }

    timestamp = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=hashed_id,
        checksum=self.calculate_checksum(checksum_input),
        name=qualified_name,
        external_url=url,
        links=links if links else [],
        asset_type=OutputAssetType.TABLE,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
|
|
730
|
+
|
|
731
|
+
def _notebook_raw_id(self, notebook: NotebookRef) -> str:
|
|
732
|
+
return f"notebook_#_{notebook.path}"
|
|
733
|
+
|
|
734
|
+
def _notebook_to_asset(self, notebook: NotebookRef) -> SingleAssetScanResults:
    """Build the scan-result asset for a notebook and pre-cache its content.

    The notebook metadata is serialised twice: as JSON (raw content) and as
    ``key=value`` lines (text content). Both are stored in
    ``self._content_cache`` under the asset hash.

    Args:
        notebook: Reference carrying the notebook path, object id, language
            and creation/modification timestamps.

    Returns:
        A ``TXT`` asset named after the notebook path, with no links.
    """
    hashed_id = self.generate_hash_id(self._notebook_raw_id(notebook))

    details = {
        "kind": "notebook",
        "path": notebook.path,
        "object_id": notebook.object_id,
        "language": notebook.language,
        "created_at_ms": notebook.created_at_ms,
        "modified_at_ms": notebook.modified_at_ms,
    }

    as_json = json.dumps(details, ensure_ascii=False)
    summary_lines = (
        "kind=notebook",
        f"path={notebook.path}",
        f"language={notebook.language or 'unknown'}",
        f"object_id={notebook.object_id or 'unknown'}",
    )
    as_text = "\n".join(summary_lines)
    self._content_cache[hashed_id] = (as_json, as_text)

    timestamp = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=hashed_id,
        checksum=self.calculate_checksum(details),
        name=notebook.path,
        external_url=f"{self._workspace_url()}/#workspace{notebook.path}",
        links=[],
        asset_type=OutputAssetType.TXT,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
|
|
771
|
+
|
|
772
|
+
def _pipeline_raw_id(self, pipeline: PipelineRef) -> str:
|
|
773
|
+
return f"pipeline_#_{pipeline.pipeline_id}"
|
|
774
|
+
|
|
775
|
+
def _pipeline_to_asset(self, pipeline: PipelineRef) -> SingleAssetScanResults:
    """Build the scan-result asset for a pipeline and pre-cache its content.

    The pipeline metadata is serialised twice: as JSON (raw content) and as
    ``key=value`` lines (text content). Both are stored in
    ``self._content_cache`` under the asset hash.

    Args:
        pipeline: Reference carrying the pipeline id, name and state.

    Returns:
        A ``TXT`` asset named after the pipeline, with no links.
    """
    hashed_id = self.generate_hash_id(self._pipeline_raw_id(pipeline))

    details = {
        "kind": "pipeline",
        "pipeline_id": pipeline.pipeline_id,
        "name": pipeline.name,
        "state": pipeline.state,
    }

    as_json = json.dumps(details, ensure_ascii=False)
    summary_lines = (
        "kind=pipeline",
        f"pipeline_id={pipeline.pipeline_id}",
        f"name={pipeline.name}",
        f"state={pipeline.state or 'unknown'}",
    )
    as_text = "\n".join(summary_lines)
    self._content_cache[hashed_id] = (as_json, as_text)

    timestamp = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=hashed_id,
        checksum=self.calculate_checksum(details),
        name=pipeline.name,
        external_url=f"{self._workspace_url()}/#joblist/pipelines/{pipeline.pipeline_id}",
        links=[],
        asset_type=OutputAssetType.TXT,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
|
|
810
|
+
|
|
811
|
+
# NOTE(review): class-level flag; presumably tells the runner to consume
# fetch_content_pages() row by row instead of fetch_content() — confirm
# against the base class that reads it.
STREAM_DETECTIONS = True
|
|
812
|
+
|
|
813
|
+
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Discover tables, notebooks and pipelines and yield them in batches.

    Tables are listed up front so lineage links can be limited to tables
    that are part of this scan. Assets accumulate in one shared batch that
    is yielded each time it reaches ``BATCH_SIZE``; whatever is left over is
    yielded at the end.

    Abort handling: an abort seen before or during the table loop ends the
    generator immediately, without yielding pending assets. An abort seen
    during the notebook or pipeline loop only stops that loop.

    Yields:
        Lists of assets, each at most ``BATCH_SIZE`` long.
    """
    if self._aborted:
        return

    # 1. Discover all tables first to establish the scope for lineage links
    logger.info("Starting Databricks extraction: discovering tables...")
    tables = self._iter_tables()
    table_hash_by_key: dict[tuple[str, str, str], str] = {}
    for known in tables:
        known_key = self._table_key(known)
        table_hash_by_key[known_key] = self.generate_hash_id(self._table_raw_id(known))

    # 2. Process tables
    want_lineage = self._extraction_options().include_table_lineage
    if want_lineage and tables:
        logger.info("Fetching table lineage for %d table(s)...", len(tables))

    pending: list[SingleAssetScanResults] = []
    emitted_tables = 0

    for ordinal, table_ref in enumerate(tables, start=1):
        if self._aborted:
            return

        label = f"{table_ref.catalog}.{table_ref.schema}.{table_ref.table}"
        logger.info("Processing table %d/%d: %s", ordinal, len(tables), label)

        upstream_hashes: list[str] = []
        if want_lineage:
            # Lineage is best-effort: a failure is logged and the table is
            # still emitted, just without links.
            try:
                upstream_keys = self._lineage_refs_for_table(table_ref)
                upstream_hashes = [
                    table_hash_by_key[ref_key]
                    for ref_key in sorted(upstream_keys)
                    if ref_key in table_hash_by_key
                ]
                if upstream_hashes:
                    logger.debug("%s has %d upstream link(s)", label, len(upstream_hashes))
            except Exception as exc:
                logger.warning(
                    "Could not resolve Databricks lineage for %s: %s", label, exc
                )

        table_asset = self._table_to_asset(table_ref, links=upstream_hashes)
        self._table_lookup[table_asset.hash] = table_ref
        pending.append(table_asset)
        emitted_tables += 1

        if len(pending) < self.BATCH_SIZE:
            continue
        logger.info(
            "Emitting batch of %d table asset(s) (total so far: %d)",
            len(pending),
            emitted_tables,
        )
        yield pending
        pending = []

    # 3. Process notebooks
    notebook_count = 0
    for notebook in self._iter_notebooks():
        if self._aborted:
            break

        pending.append(self._notebook_to_asset(notebook))
        notebook_count += 1

        if len(pending) >= self.BATCH_SIZE:
            yield pending
            pending = []

    if notebook_count:
        logger.info("Discovered %d notebook(s)", notebook_count)

    # 4. Process pipelines
    pipeline_count = 0
    for pipeline_ref in self._iter_pipelines():
        if self._aborted:
            break

        pending.append(self._pipeline_to_asset(pipeline_ref))
        pipeline_count += 1

        if len(pending) >= self.BATCH_SIZE:
            yield pending
            pending = []

    if pipeline_count:
        logger.info("Discovered %d pipeline(s)", pipeline_count)

    if pending:
        logger.info("Emitting final batch of %d asset(s)", len(pending))
        yield pending

    logger.info(
        "Extraction complete: %d table(s), %d notebook(s), %d pipeline(s)",
        emitted_tables,
        notebook_count,
        pipeline_count,
    )
|
|
914
|
+
|
|
915
|
+
def generate_hash_id(self, asset_id: str) -> str:
    """Hash a raw asset id together with this source's asset-type value."""
    type_value = self._asset_type_value()
    return hash_id(type_value, asset_id)
|
|
917
|
+
|
|
918
|
+
def _parse_table_ref_from_asset_id(self, asset_id: str) -> TableRef | None:
|
|
919
|
+
if asset_id in self._table_lookup:
|
|
920
|
+
return self._table_lookup[asset_id]
|
|
921
|
+
|
|
922
|
+
decoded = asset_id
|
|
923
|
+
if "_#_" not in decoded:
|
|
924
|
+
try:
|
|
925
|
+
decoded = unhash_id(asset_id)
|
|
926
|
+
except Exception:
|
|
927
|
+
decoded = asset_id
|
|
928
|
+
|
|
929
|
+
parts = decoded.split("_#_")
|
|
930
|
+
if len(parts) >= 2 and parts[-2] in {"notebook", "pipeline"}:
|
|
931
|
+
return None
|
|
932
|
+
|
|
933
|
+
if len(parts) >= 4 and parts[0].upper() == "DATABRICKS":
|
|
934
|
+
return TableRef(
|
|
935
|
+
catalog=parts[-3],
|
|
936
|
+
schema=parts[-2],
|
|
937
|
+
table=parts[-1],
|
|
938
|
+
object_type="TABLE",
|
|
939
|
+
)
|
|
940
|
+
|
|
941
|
+
if len(parts) >= 3:
|
|
942
|
+
return TableRef(
|
|
943
|
+
catalog=parts[-3],
|
|
944
|
+
schema=parts[-2],
|
|
945
|
+
table=parts[-1],
|
|
946
|
+
object_type="TABLE",
|
|
947
|
+
)
|
|
948
|
+
|
|
949
|
+
return None
|
|
950
|
+
|
|
951
|
+
def _available_columns(self, table_ref: TableRef) -> list[str]:
    """List a table's column names in ordinal order.

    Queries ``system.information_schema.columns`` for the given catalog,
    schema and table.

    Args:
        table_ref: Table whose columns are listed.

    Returns:
        Column names, in ``ordinal_position`` order. Rows whose first field
        cannot be read or is not a string are skipped.
    """
    query = " ".join(
        [
            "SELECT column_name",
            "FROM system.information_schema.columns",
            f"WHERE table_catalog = {_quote_literal(table_ref.catalog)}",
            f"AND table_schema = {_quote_literal(table_ref.schema)}",
            f"AND table_name = {_quote_literal(table_ref.table)}",
            "ORDER BY ordinal_position",
        ]
    )

    names: list[str] = []
    with closing(self._connect_sql()) as conn, conn.cursor() as cursor:
        cursor.execute(query)
        for record in cursor.fetchall():
            try:
                first_field: Any | None = record[0]  # type: ignore[index]
            except Exception:
                continue
            if isinstance(first_field, str):
                names.append(first_field)
    return names
|
|
974
|
+
|
|
975
|
+
def _resolve_latest_order_column(self, columns: list[str]) -> str | None:
|
|
976
|
+
sampling = self._sampling()
|
|
977
|
+
configured = sampling.order_by_column
|
|
978
|
+
if configured and configured in columns:
|
|
979
|
+
return configured
|
|
980
|
+
|
|
981
|
+
priority_candidates = (
|
|
982
|
+
"updated_at",
|
|
983
|
+
"modified_at",
|
|
984
|
+
"created_at",
|
|
985
|
+
"inserted_at",
|
|
986
|
+
"timestamp",
|
|
987
|
+
"ts",
|
|
988
|
+
"date",
|
|
989
|
+
)
|
|
990
|
+
|
|
991
|
+
for candidate in priority_candidates:
|
|
992
|
+
if candidate in columns:
|
|
993
|
+
return candidate
|
|
994
|
+
return None
|
|
995
|
+
|
|
996
|
+
def _build_sampling_query(
    self, table_ref: TableRef, columns: list[str]
) -> tuple[str, list[Any]]:
    """Build the ``SELECT`` statement used to sample a table.

    The ordering depends on the sampling strategy: "latest" orders
    descending by the resolved order column, or randomly when none is found
    and ``fallback_to_random`` is not ``False``; "random" orders by
    ``rand()``. Every strategy except "all" appends a ``LIMIT`` of
    ``rows_per_page`` (100 when unset).

    Args:
        table_ref: Table to read from.
        columns: Column names to select; must not be empty.

    Returns:
        The SQL text and a parameter list, which is always empty.

    Raises:
        ValueError: If ``columns`` is empty.
    """
    sampling = self._sampling()

    if not columns:
        raise ValueError(
            f"Table {table_ref.catalog}.{table_ref.schema}.{table_ref.table} has no readable columns"
        )

    select_list = ", ".join(_quote_identifier(name) for name in columns)
    source = ".".join(
        [
            _quote_identifier(table_ref.catalog),
            _quote_identifier(table_ref.schema),
            _quote_identifier(table_ref.table),
        ]
    )
    clauses = [f"SELECT {select_list} FROM {source}"]

    strategy = sampling.strategy
    if strategy == SamplingStrategy.LATEST:
        order_column = self._resolve_latest_order_column(columns)
        if order_column:
            clauses.append(f" ORDER BY {_quote_identifier(order_column)} DESC")
        elif sampling.fallback_to_random is not False:
            clauses.append(" ORDER BY rand()")
    elif strategy == SamplingStrategy.RANDOM:
        clauses.append(" ORDER BY rand()")

    if strategy != SamplingStrategy.ALL:
        clauses.append(f" LIMIT {int(sampling.rows_per_page or 100)}")

    return "".join(clauses), []
|
|
1029
|
+
|
|
1030
|
+
def _count_table_rows(self, table_ref: TableRef) -> int | None:
    """Return the table's row count, or ``None`` when it cannot be obtained.

    This is best-effort: any failure (connection, permissions, SQL error)
    is logged at debug level and reported as ``None`` rather than raised.

    Args:
        table_ref: Table to count.

    Returns:
        The ``COUNT(*)`` result as an int, or ``None`` on failure or when the
        query returns no row.
    """
    try:
        qualified = ".".join(
            _quote_identifier(part)
            for part in (table_ref.catalog, table_ref.schema, table_ref.table)
        )
        with closing(self._connect_sql()) as conn, conn.cursor() as cursor:
            cursor.execute(f"SELECT COUNT(*) FROM {qualified}")
            row = cursor.fetchone()
            return int(row[0]) if row else None
    except Exception as exc:
        # Keep the best-effort contract, but leave a trace for diagnosis.
        logger.debug(
            "Could not count rows for %s.%s.%s: %s",
            table_ref.catalog,
            table_ref.schema,
            table_ref.table,
            exc,
        )
        return None
|
|
1041
|
+
|
|
1042
|
+
def _serialize_cell(self, value: Any) -> str:
|
|
1043
|
+
if value is None:
|
|
1044
|
+
return "null"
|
|
1045
|
+
if isinstance(value, (bytes, bytearray, memoryview)):
|
|
1046
|
+
return f"<{len(bytes(value))} bytes>"
|
|
1047
|
+
if isinstance(value, datetime):
|
|
1048
|
+
return value.isoformat()
|
|
1049
|
+
return str(value)
|
|
1050
|
+
|
|
1051
|
+
def _format_sample_content(
    self,
    table_ref: TableRef,
    column_names: list[str],
    rows: list[tuple[Any, ...]],
    row_offset: int = 0,
) -> tuple[str, str]:
    """Format sampled rows by delegating to ``format_tabular_sample_content``.

    Args:
        table_ref: Table the rows came from.
        column_names: Names of the columns, in row order.
        rows: Row tuples to format.
        row_offset: Passed through as ``row_offset``; callers pass the
            number of rows that preceded ``rows`` in the result set.

    Returns:
        Whatever the shared formatter returns (annotated as a pair of
        strings).
    """
    sampling = self._sampling()
    qualified_name = f"{table_ref.catalog}.{table_ref.schema}.{table_ref.table}"
    identity = {
        "catalog": table_ref.catalog,
        "schema": table_ref.schema,
        "table": table_ref.table,
    }
    # Column names are included unless explicitly switched off.
    with_header = sampling.include_column_names is not False

    return format_tabular_sample_content(
        scope_label="table",
        scope_value=qualified_name,
        strategy=sampling.strategy,
        rows=rows,
        column_names=column_names,
        serialize_cell=self._serialize_cell,
        include_column_names=with_header,
        object_type=table_ref.object_type,
        raw_metadata=identity,
        row_offset=row_offset,
    )
|
|
1075
|
+
|
|
1076
|
+
def _fetch_one_page(
|
|
1077
|
+
self, table_ref: TableRef, base_query: str, page_size: int, offset: int
|
|
1078
|
+
) -> tuple[list[tuple[Any, ...]], list[str]]:
|
|
1079
|
+
with closing(self._connect_sql()) as conn:
|
|
1080
|
+
paginated_query = f"{base_query} LIMIT {page_size} OFFSET {offset}"
|
|
1081
|
+
with conn.cursor() as cursor:
|
|
1082
|
+
cursor.execute(paginated_query)
|
|
1083
|
+
rows = list(cursor.fetchall())
|
|
1084
|
+
column_names = (
|
|
1085
|
+
[desc[0] for desc in cursor.description] if cursor.description else []
|
|
1086
|
+
)
|
|
1087
|
+
return rows, column_names
|
|
1088
|
+
|
|
1089
|
+
def _fetch_one_page_on_conn(
|
|
1090
|
+
self,
|
|
1091
|
+
conn: Any,
|
|
1092
|
+
base_query: str,
|
|
1093
|
+
page_size: int,
|
|
1094
|
+
offset: int,
|
|
1095
|
+
) -> tuple[list[tuple[Any, ...]], list[str]]:
|
|
1096
|
+
paginated_query = f"{base_query} LIMIT {page_size} OFFSET {offset}"
|
|
1097
|
+
with conn.cursor() as cursor:
|
|
1098
|
+
cursor.execute(paginated_query)
|
|
1099
|
+
rows = list(cursor.fetchall())
|
|
1100
|
+
column_names = [desc[0] for desc in cursor.description] if cursor.description else []
|
|
1101
|
+
return rows, column_names
|
|
1102
|
+
|
|
1103
|
+
@staticmethod
|
|
1104
|
+
def _cursor_execute(cursor: Any, query: str) -> list[str]:
|
|
1105
|
+
cursor.execute(query)
|
|
1106
|
+
return [desc[0] for desc in cursor.description] if cursor.description else []
|
|
1107
|
+
|
|
1108
|
+
@staticmethod
|
|
1109
|
+
def _cursor_fetchmany(cursor: Any, size: int) -> list[tuple[Any, ...]]:
|
|
1110
|
+
return list(cursor.fetchmany(size))
|
|
1111
|
+
|
|
1112
|
+
def _fetch_sample_rows(
    self, table_ref: TableRef
) -> tuple[list[tuple[Any, ...]], list[str]] | None:
    """Fetch the sample rows for a table according to the sampling config.

    For the "all" strategy only the first page (``rows_per_page`` rows, 100
    when unset) is fetched; for every other strategy the sampling query
    already carries its own ``LIMIT`` and is fetched in full.

    Args:
        table_ref: Table to sample.

    Returns:
        The rows and their column names, or ``None`` when the query
        reported no columns.
    """
    label = f"{table_ref.catalog}.{table_ref.schema}.{table_ref.table}"
    available = self._available_columns(table_ref)
    sampling = self._sampling()
    statement, _params = self._build_sampling_query(table_ref, available)

    logger.info(
        "Sampling %s (%d column(s), strategy=%s)",
        label,
        len(available),
        str(sampling.strategy),
    )

    if sampling.strategy == SamplingStrategy.ALL:
        first_page_size = int(sampling.rows_per_page or 100)
        rows, column_names = self._fetch_one_page(table_ref, statement, first_page_size, 0)
    else:
        with closing(self._connect_sql()) as conn, conn.cursor() as cursor:
            cursor.execute(statement)
            rows = cursor.fetchall()
            column_names = [entry[0] for entry in cursor.description or []]

    if column_names:
        logger.info("Fetched %d row(s) from %s", len(rows), label)
        return rows, column_names

    logger.warning("No columns returned for %s; skipping", label)
    return None
|
|
1143
|
+
|
|
1144
|
+
def _sample_table_rows(self, table_ref: TableRef) -> tuple[str, str] | None:
|
|
1145
|
+
result = self._fetch_sample_rows(table_ref)
|
|
1146
|
+
if result is None:
|
|
1147
|
+
return None
|
|
1148
|
+
rows, column_names = result
|
|
1149
|
+
return self._format_sample_content(table_ref, column_names, rows)
|
|
1150
|
+
|
|
1151
|
+
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
|
|
1152
|
+
cached = self._content_cache.get(asset_id)
|
|
1153
|
+
if cached:
|
|
1154
|
+
return cached
|
|
1155
|
+
|
|
1156
|
+
table_ref = self._parse_table_ref_from_asset_id(asset_id)
|
|
1157
|
+
if not table_ref:
|
|
1158
|
+
return None
|
|
1159
|
+
|
|
1160
|
+
sampled = self._sample_table_rows(table_ref)
|
|
1161
|
+
|
|
1162
|
+
if sampled is None:
|
|
1163
|
+
return None
|
|
1164
|
+
|
|
1165
|
+
self._content_cache[asset_id] = sampled
|
|
1166
|
+
return sampled
|
|
1167
|
+
|
|
1168
|
+
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield a table's content one formatted row at a time.

    For every strategy other than "all", the sample is fetched in one go and
    each row is formatted and yielded individually. For "all", the full
    query is executed once and rows are pulled with ``fetchmany`` in pages
    of ``rows_per_page`` (100 when unset) until the result set is exhausted
    or the extraction is aborted. In the "all" path the most recently
    yielded row is also stored in ``self._content_cache`` under
    ``asset_id``.

    Args:
        asset_id: Asset identifier; ids that do not resolve to a table
            produce no output.

    Yields:
        The formatted content of a single row.
    """
    sampling = self._sampling()
    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    if not table_ref:
        return

    table_label = f"{table_ref.catalog}.{table_ref.schema}.{table_ref.table}"

    if sampling.strategy != SamplingStrategy.ALL:
        result = self._fetch_sample_rows(table_ref)
        if result is None:
            return
        rows, column_names = result
        logger.info(
            "Scanning %s: %d row(s) [strategy=%s]",
            table_label,
            len(rows),
            str(sampling.strategy),
        )
        for i, row in enumerate(rows):
            formatted = self._format_sample_content(
                table_ref, column_names, [row], row_offset=i
            )
            if formatted:
                yield formatted
        return

    columns = self._available_columns(table_ref)
    query, _ = self._build_sampling_query(table_ref, columns)
    rows_per_page = int(sampling.rows_per_page or 100)

    # The row count is only used for progress logging; None or 0 disables it.
    total_rows = self._count_table_rows(table_ref)
    total_batches = ((total_rows + rows_per_page - 1) // rows_per_page) if total_rows else None
    if total_rows is not None and total_batches is not None:
        logger.info(
            "Full scan %s: %d rows, %d batches of %d",
            table_label,
            total_rows,
            total_batches,
            rows_per_page,
        )

    # Stream rows via fetchmany — O(1) per page at any offset, no PK needed.
    # Each fetchmany() advances the server-side result pointer without re-scanning.
    row_offset = 0
    page_num = 1

    # closing() guarantees the connection is released even when cursor
    # creation itself fails.
    with closing(self._connect_sql()) as conn:
        cursor = conn.cursor()
        try:
            column_names = await asyncio.to_thread(self._cursor_execute, cursor, query)
            if not column_names:
                return

            while not self._aborted:
                if total_batches is not None:
                    logger.info("%s batch %d/%d", table_label, page_num, total_batches)

                rows = await asyncio.to_thread(self._cursor_fetchmany, cursor, rows_per_page)
                if not rows:
                    break

                # Yield each row individually so detection runs in parallel with fetching.
                for i, row in enumerate(rows):
                    formatted = self._format_sample_content(
                        table_ref, column_names, [row], row_offset=row_offset + i
                    )
                    if formatted:
                        self._content_cache[asset_id] = formatted
                        yield formatted

                row_offset += len(rows)
                page_num += 1
                # A short page means the result set is exhausted.
                if len(rows) < rows_per_page:
                    break
        finally:
            # A cursor that fails to close must not mask the real outcome
            # or prevent the connection from closing.
            try:
                cursor.close()
            except Exception as exc:
                logger.debug("Ignoring error while closing cursor for %s: %s", table_label, exc)
|
|
1250
|
+
|
|
1251
|
+
def enrich_finding_location(
|
|
1252
|
+
self,
|
|
1253
|
+
finding: DetectionResult,
|
|
1254
|
+
asset: SingleAssetScanResults,
|
|
1255
|
+
text_content: str,
|
|
1256
|
+
) -> None:
|
|
1257
|
+
del text_content
|
|
1258
|
+
table_ref = self._table_lookup.get(asset.hash)
|
|
1259
|
+
if not table_ref:
|
|
1260
|
+
return
|
|
1261
|
+
|
|
1262
|
+
path = f"{table_ref.catalog}.{table_ref.schema}.{table_ref.table}"
|
|
1263
|
+
cached = self._content_cache.get(asset.hash)
|
|
1264
|
+
raw_content = cached[0] if cached else None
|
|
1265
|
+
metadata = finding.metadata or {}
|
|
1266
|
+
finding.location = build_tabular_location(
|
|
1267
|
+
raw_content=raw_content,
|
|
1268
|
+
matched_content=finding.matched_content,
|
|
1269
|
+
base_path=path,
|
|
1270
|
+
row_index=metadata.get("tabular_row_index"),
|
|
1271
|
+
column_name=metadata.get("tabular_column_name"),
|
|
1272
|
+
)
|
|
1273
|
+
|
|
1274
|
+
def abort(self) -> None:
    """Log the abort request, then delegate to the parent class's ``abort``."""
    logger.info("Aborting Databricks extraction...")
    super().abort()
|
|
1277
|
+
|
|
1278
|
+
def cleanup(self) -> None:
|
|
1279
|
+
self.session.close()
|