classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,1279 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ import logging
6
+ from collections import deque
7
+ from collections.abc import AsyncGenerator, Generator
8
+ from contextlib import closing
9
+ from dataclasses import dataclass
10
+ from datetime import UTC, datetime, timedelta
11
+ from typing import Any
12
+ from urllib.parse import urlparse
13
+
14
+ import requests
15
+
16
+ from ...models.generated_input import (
17
+ DatabricksInput,
18
+ DatabricksMaskedPat,
19
+ DatabricksMaskedServicePrincipal,
20
+ DatabricksOptionalConnection,
21
+ DatabricksOptionalExtraction,
22
+ DatabricksOptionalScope,
23
+ DatabricksRequiredPat,
24
+ DatabricksRequiredServicePrincipal,
25
+ SamplingConfig,
26
+ SamplingStrategy,
27
+ )
28
+ from ...models.generated_single_asset_scan_results import (
29
+ AssetType as OutputAssetType,
30
+ )
31
+ from ...models.generated_single_asset_scan_results import (
32
+ DetectionResult,
33
+ SingleAssetScanResults,
34
+ )
35
+ from ...utils.hashing import hash_id, unhash_id
36
+ from ..base import BaseSource
37
+ from ..dependencies import require_module
38
+ from ..tabular_utils import build_tabular_location, format_tabular_sample_content
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+ _DEFAULT_EXCLUDED_CATALOGS = {"system"}
43
+ _DEFAULT_EXCLUDED_SCHEMAS = {"information_schema"}
44
+
45
+
46
@dataclass(frozen=True)
class TableRef:
    """Immutable reference to a single table or view in a catalog/schema."""

    # Catalog that contains the object.
    catalog: str
    # Schema (inside ``catalog``) that contains the object.
    schema: str
    # Unqualified object name.
    table: str
    # Coarse object kind; the discovery code stores "TABLE" or "VIEW" here.
    object_type: str
52
+
53
+
54
@dataclass(frozen=True)
class NotebookRef:
    """Immutable reference to a workspace notebook found during listing."""

    # Workspace path of the notebook.
    path: str
    # Object id reported by the workspace listing, stringified; None if absent.
    object_id: str | None
    # Notebook language as reported by the listing; None if absent.
    language: str | None
    # Creation / modification stamps copied from the listing when they are
    # integers. NOTE(review): the ``_ms`` suffix suggests epoch milliseconds —
    # confirm against the workspace API.
    created_at_ms: int | None
    modified_at_ms: int | None
61
+
62
+
63
@dataclass(frozen=True)
class PipelineRef:
    """Immutable reference to a pipeline returned by the pipelines listing."""

    # Identifier of the pipeline.
    pipeline_id: str
    # Display name; the listing code falls back to the id when no name is given.
    name: str
    # State/health string from the listing, or None when not reported.
    state: str | None
68
+
69
+
70
+ def _quote_identifier(identifier: str) -> str:
71
+ return f"`{identifier.replace('`', '``')}`"
72
+
73
+
74
+ def _quote_literal(value: str) -> str:
75
+ return "'" + value.replace("'", "''") + "'"
76
+
77
+
78
+ class DatabricksSource(BaseSource):
79
+ source_type = "databricks"
80
+
81
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe and prepare HTTP/SQL client state.

    Args:
        recipe: Raw source recipe; validated into a ``DatabricksInput``.
        source_id: Optional source identifier forwarded to the base class.
        runner_id: Optional runner identifier; ``"local-run"`` is used when
            it is missing or empty.

    Raises:
        ValueError: If the auth section of the recipe is inconsistent.
    """
    super().__init__(recipe, source_id, runner_id)
    self.config = DatabricksInput.model_validate(recipe)
    self.runner_id = runner_id or "local-run"

    self._databricks_sql = require_module(
        module_name="databricks.sql",
        source_name="Databricks",
        uv_groups=["databricks"],
        detail="The Databricks SQL connector is optional.",
    )

    # pyarrow→pandas conversion calls pytz.timezone() on the timezone name
    # embedded in Arrow schema metadata. Databricks uses 'Etc/UTC' which is
    # absent from pytz's built-in zone list. Pre-populating the cache makes
    # pytz.timezone('Etc/UTC') return UTC without hitting the lookup failure.
    try:
        import pytz

        pytz._tzinfo_cache.setdefault("Etc/UTC", pytz.UTC)
    except Exception as exc:
        # Best-effort workaround that relies on a private pytz attribute:
        # never fail construction over it, but leave a trace for debugging.
        logger.debug("Could not pre-populate pytz cache for 'Etc/UTC': %s", exc)

    self._validate_auth_configuration()

    self.session = requests.Session()
    # Cached service-principal token and its (safety-adjusted) expiry.
    self._access_token: str | None = None
    self._access_token_expiry: datetime | None = None

    # asset hash -> table reference, filled while assets are emitted.
    self._table_lookup: dict[str, TableRef] = {}
    # asset hash -> (raw content, text content) for notebook/pipeline assets.
    self._content_cache: dict[str, tuple[str, str]] = {}
117
+
118
def _validate_auth_configuration(self) -> None:
    """Ensure the masked credentials match the auth mode in ``required``.

    Raises:
        ValueError: If the masked section has the wrong shape for the
            selected auth mode, or the auth mode is not recognised.
    """
    auth_required = self.config.required
    auth_masked = self.config.masked

    if isinstance(auth_required, DatabricksRequiredPat):
        if isinstance(auth_masked, DatabricksMaskedPat):
            return
        raise ValueError("DATABRICKS PAT_TOKEN auth requires masked.token")

    if isinstance(auth_required, DatabricksRequiredServicePrincipal):
        if isinstance(auth_masked, DatabricksMaskedServicePrincipal):
            return
        raise ValueError("DATABRICKS SERVICE_PRINCIPAL auth requires masked.client_secret")

    raise ValueError("Unsupported DATABRICKS auth configuration")
133
+
134
+ def _asset_type_value(self) -> str:
135
+ type_value = self.config.type
136
+ return type_value.value if hasattr(type_value, "value") else str(type_value)
137
+
138
+ def _sampling(self) -> SamplingConfig:
139
+ return self.config.sampling
140
+
141
+ def _connection_options(self) -> DatabricksOptionalConnection:
142
+ if self.config.optional and self.config.optional.connection:
143
+ return self.config.optional.connection
144
+ return DatabricksOptionalConnection()
145
+
146
+ def _scope_options(self) -> DatabricksOptionalScope:
147
+ if self.config.optional and self.config.optional.scope:
148
+ return self.config.optional.scope
149
+ return DatabricksOptionalScope()
150
+
151
+ def _extraction_options(self) -> DatabricksOptionalExtraction:
152
+ if self.config.optional and self.config.optional.extraction:
153
+ return self.config.optional.extraction
154
+ return DatabricksOptionalExtraction()
155
+
156
+ def _workspace_url(self) -> str:
157
+ return str(self.config.required.workspace_url).rstrip("/")
158
+
159
+ def _workspace_host(self) -> str:
160
+ parsed = urlparse(self._workspace_url())
161
+ return parsed.netloc
162
+
163
+ def _warehouse_id(self) -> str:
164
+ return self.config.required.warehouse_id
165
+
166
+ def _timeout_seconds(self) -> int:
167
+ timeout = self._connection_options().timeout_seconds
168
+ return int(timeout or 30)
169
+
170
+ def _statement_timeout_seconds(self) -> int:
171
+ timeout = self._connection_options().statement_timeout_seconds
172
+ return int(timeout or 60)
173
+
174
def _is_pat_mode(self) -> bool:
    """Return True when the required section selects PAT-token auth."""
    required = self.config.required
    return isinstance(required, DatabricksRequiredPat)
176
+
177
def _masked_pat_token(self) -> str:
    """Return the PAT token from the masked configuration section.

    Raises:
        ValueError: If the masked section is not the PAT variant.
    """
    masked = self.config.masked
    if isinstance(masked, DatabricksMaskedPat):
        return masked.token
    raise ValueError("DATABRICKS PAT_TOKEN auth requires masked.token")
182
+
183
def _service_principal_credentials(self) -> tuple[str, str]:
    """Return ``(client_id, client_secret)`` for service-principal auth.

    Raises:
        ValueError: If the configuration is not in service-principal mode
            or the masked section lacks the client secret variant.
    """
    config = self.config
    required, masked = config.required, config.masked

    # The required section is checked first, then the masked one.
    if not isinstance(required, DatabricksRequiredServicePrincipal):
        raise ValueError("SERVICE_PRINCIPAL auth mode is required")
    if not isinstance(masked, DatabricksMaskedServicePrincipal):
        raise ValueError("DATABRICKS SERVICE_PRINCIPAL auth requires masked.client_secret")

    return (required.client_id, masked.client_secret)
191
+
192
+ def _is_access_token_expired(self) -> bool:
193
+ if self._access_token_expiry is None:
194
+ return True
195
+ return self._access_token_expiry <= datetime.now(UTC)
196
+
197
+ def _acquire_service_principal_token(self) -> str:
198
+ client_id, client_secret = self._service_principal_credentials()
199
+
200
+ response = self.session.post(
201
+ f"{self._workspace_url()}/oidc/v1/token",
202
+ data={
203
+ "grant_type": "client_credentials",
204
+ "client_id": client_id,
205
+ "client_secret": client_secret,
206
+ "scope": "all-apis",
207
+ },
208
+ timeout=self._timeout_seconds(),
209
+ )
210
+ response.raise_for_status()
211
+
212
+ payload = response.json()
213
+ token = payload.get("access_token")
214
+ if not isinstance(token, str) or not token.strip():
215
+ raise ValueError("Databricks token response did not include access_token")
216
+
217
+ expires_in = int(payload.get("expires_in", 3600))
218
+ safety_seconds = 300
219
+ valid_for_seconds = max(expires_in - safety_seconds, 0)
220
+ self._access_token_expiry = datetime.now(UTC) + timedelta(seconds=valid_for_seconds)
221
+
222
+ return token.strip()
223
+
224
+ def _access_token_value(self) -> str:
225
+ if self._is_pat_mode():
226
+ return self._masked_pat_token().strip()
227
+
228
+ if self._access_token and not self._is_access_token_expired():
229
+ return self._access_token
230
+
231
+ self._access_token = self._acquire_service_principal_token()
232
+ return self._access_token
233
+
234
+ def _authorization_header(self) -> str:
235
+ return f"Bearer {self._access_token_value()}"
236
+
237
+ def _request_json(
238
+ self,
239
+ method: str,
240
+ path: str,
241
+ *,
242
+ params: dict[str, Any] | None = None,
243
+ json_payload: dict[str, Any] | None = None,
244
+ ) -> dict[str, Any]:
245
+ url = (
246
+ path
247
+ if path.startswith("http://") or path.startswith("https://")
248
+ else f"{self._workspace_url()}/{path.lstrip('/')}"
249
+ )
250
+
251
+ headers = {
252
+ "Authorization": self._authorization_header(),
253
+ "Accept": "application/json",
254
+ }
255
+
256
+ response = self.session.request(
257
+ method=method,
258
+ url=url,
259
+ headers=headers,
260
+ params=params,
261
+ json=json_payload,
262
+ timeout=self._timeout_seconds(),
263
+ )
264
+ response.raise_for_status()
265
+
266
+ if response.status_code == 204 or not response.text.strip():
267
+ return {}
268
+
269
+ return response.json()
270
+
271
+ def _paged_values(
272
+ self,
273
+ path: str,
274
+ *,
275
+ params: dict[str, Any] | None = None,
276
+ value_keys: tuple[str, ...],
277
+ ) -> list[dict[str, Any]]:
278
+ collected: list[dict[str, Any]] = []
279
+
280
+ next_page_token: str | None = None
281
+ while True:
282
+ current_params = dict(params or {})
283
+ if next_page_token:
284
+ current_params["page_token"] = next_page_token
285
+
286
+ payload = self._request_json("get", path, params=current_params)
287
+ values: Any = None
288
+ for key in value_keys:
289
+ candidate = payload.get(key)
290
+ if isinstance(candidate, list):
291
+ values = candidate
292
+ break
293
+
294
+ if isinstance(values, list):
295
+ for entry in values:
296
+ if isinstance(entry, dict):
297
+ collected.append(entry)
298
+
299
+ token = payload.get("next_page_token")
300
+ if not isinstance(token, str) or not token.strip():
301
+ break
302
+ next_page_token = token
303
+
304
+ return collected
305
+
306
def _connect_sql(self):
    """Open a Databricks SQL connection to the configured warehouse.

    Returns whatever the connector's ``connect`` returns; callers are
    responsible for closing it.
    """
    connect_kwargs = {
        "server_hostname": self._workspace_host(),
        "http_path": f"/sql/1.0/warehouses/{self._warehouse_id()}",
        "access_token": self._access_token_value(),
        "_socket_timeout": self._timeout_seconds(),
    }
    return self._databricks_sql.connect(**connect_kwargs)
313
+
314
+ def _catalog_allowlist(self) -> set[str] | None:
315
+ configured = self._scope_options().include_catalogs
316
+ if not configured:
317
+ return None
318
+ return {entry.strip().lower() for entry in configured if entry and entry.strip()}
319
+
320
+ def _catalog_denylist(self) -> set[str]:
321
+ configured = self._scope_options().exclude_catalogs or []
322
+ denylist = {entry.strip().lower() for entry in configured if entry and entry.strip()}
323
+ if not denylist:
324
+ denylist = set(_DEFAULT_EXCLUDED_CATALOGS)
325
+ return denylist
326
+
327
+ def _schema_allowlist(self) -> set[str] | None:
328
+ configured = self._scope_options().include_schemas
329
+ if not configured:
330
+ return None
331
+ return {entry.strip().lower() for entry in configured if entry and entry.strip()}
332
+
333
+ def _schema_denylist(self) -> set[str]:
334
+ configured = self._scope_options().exclude_schemas or []
335
+ denylist = {entry.strip().lower() for entry in configured if entry and entry.strip()}
336
+ if not denylist:
337
+ denylist = set(_DEFAULT_EXCLUDED_SCHEMAS)
338
+ return denylist
339
+
340
+ def _table_allowlist(self) -> set[str] | None:
341
+ configured = self._scope_options().include_tables
342
+ if not configured:
343
+ return None
344
+ return {entry.strip().lower() for entry in configured if entry and entry.strip()}
345
+
346
+ def _catalog_allowed(self, catalog: str) -> bool:
347
+ normalized = catalog.lower()
348
+
349
+ if normalized in self._catalog_denylist():
350
+ return False
351
+
352
+ if normalized == "hive_metastore" and not self._scope_options().include_hive_metastore:
353
+ return False
354
+
355
+ allowlist = self._catalog_allowlist()
356
+ if allowlist and normalized not in allowlist:
357
+ return False
358
+ return True
359
+
360
+ def _schema_allowed(self, catalog: str, schema: str) -> bool:
361
+ scoped_schema = f"{catalog}.{schema}".lower()
362
+
363
+ denylist = self._schema_denylist()
364
+ if schema.lower() in denylist or scoped_schema in denylist:
365
+ return False
366
+
367
+ allowlist = self._schema_allowlist()
368
+ if not allowlist:
369
+ return True
370
+
371
+ return schema.lower() in allowlist or scoped_schema in allowlist
372
+
373
+ def _table_allowed(self, table_ref: TableRef) -> bool:
374
+ allowlist = self._table_allowlist()
375
+ if not allowlist:
376
+ return True
377
+
378
+ table = table_ref.table.lower()
379
+ schema_table = f"{table_ref.schema}.{table_ref.table}".lower()
380
+ catalog_schema_table = f"{table_ref.catalog}.{table_ref.schema}.{table_ref.table}".lower()
381
+
382
+ return table in allowlist or schema_table in allowlist or catalog_schema_table in allowlist
383
+
384
def _list_catalogs(self) -> list[str]:
    """Return the sorted names of all in-scope Unity Catalog catalogs."""
    entries = self._paged_values(
        "/api/2.1/unity-catalog/catalogs",
        value_keys=("catalogs", "value", "items"),
    )

    # Keep only non-empty string names that pass the scope filters.
    names = (entry.get("name") for entry in entries)
    catalogs = sorted(
        name for name in names if isinstance(name, str) and name and self._catalog_allowed(name)
    )

    logger.info("Found %d catalog(s): %s", len(catalogs), ", ".join(catalogs) or "(none)")
    return catalogs
399
+
400
def _list_schemas_for_catalog(self, catalog: str) -> list[str]:
    """Return the sorted names of in-scope schemas within *catalog*."""
    entries = self._paged_values(
        "/api/2.1/unity-catalog/schemas",
        params={"catalog_name": catalog},
        value_keys=("schemas", "value", "items"),
    )

    # Keep only non-empty string names that pass the schema scope filters.
    names = (entry.get("name") for entry in entries)
    schemas = sorted(
        name
        for name in names
        if isinstance(name, str) and name and self._schema_allowed(catalog, name)
    )

    logger.info("Catalog %s: found %d schema(s)", catalog, len(schemas))
    return schemas
418
+
419
+ def _coerce_object_type(self, table_type: Any) -> str:
420
+ normalized = str(table_type or "TABLE").upper()
421
+ if "VIEW" in normalized:
422
+ return "VIEW"
423
+ return "TABLE"
424
+
425
def _list_tables_for_schema(self, catalog: str, schema: str) -> list[TableRef]:
    """Return in-scope tables/views of ``catalog.schema``.

    Entries are kept in the order the API returned them and collection stops
    once ``table_limit_per_schema`` (when configured) is reached.
    """
    entries = self._paged_values(
        "/api/2.1/unity-catalog/tables",
        params={"catalog_name": catalog, "schema_name": schema},
        value_keys=("tables", "value", "items"),
    )

    configured_limit = self._scope_options().table_limit_per_schema
    max_tables = int(configured_limit) if configured_limit else None

    found: list[TableRef] = []
    for entry in entries:
        name = entry.get("name") or entry.get("table_name")
        if not (isinstance(name, str) and name):
            continue

        raw_type = entry.get("table_type") or entry.get("type")
        candidate = TableRef(
            catalog=catalog,
            schema=schema,
            table=name,
            object_type=self._coerce_object_type(raw_type),
        )
        if not self._table_allowed(candidate):
            continue

        found.append(candidate)
        if max_tables is not None and len(found) >= max_tables:
            break

    logger.info("Schema %s.%s: found %d table(s)", catalog, schema, len(found))
    return found
457
+
458
def _iter_tables(self) -> list[TableRef]:
    """Discover every in-scope table across catalogs and schemas.

    Listing errors for one catalog or schema are logged and skipped so the
    remaining ones are still discovered. Discovery stops early when the
    source has been aborted.
    """
    discovered: list[TableRef] = []

    for catalog in self._list_catalogs():
        if self._aborted:
            break

        try:
            schema_names = self._list_schemas_for_catalog(catalog)
        except Exception as exc:
            logger.warning("Skipping catalog %s due to schema listing error: %s", catalog, exc)
            continue

        for schema in schema_names:
            if self._aborted:
                break

            try:
                schema_tables = self._list_tables_for_schema(catalog, schema)
            except Exception as exc:
                logger.warning(
                    "Skipping schema %s.%s due to table listing error: %s",
                    catalog,
                    schema,
                    exc,
                )
            else:
                discovered.extend(schema_tables)

    logger.info("Discovery complete: %d table(s) in scope", len(discovered))
    return discovered
487
+
488
+ def _table_key(self, table_ref: TableRef) -> tuple[str, str, str]:
489
+ return (table_ref.catalog, table_ref.schema, table_ref.table)
490
+
491
+ def _table_raw_id(self, table_ref: TableRef) -> str:
492
+ return f"{table_ref.catalog}_#_{table_ref.schema}_#_{table_ref.table}"
493
+
494
+ def _parse_qualified_table_name(self, value: str) -> tuple[str, str, str] | None:
495
+ cleaned = value.strip().strip("`")
496
+ if not cleaned:
497
+ return None
498
+
499
+ parts = [part.strip().strip("`") for part in cleaned.split(".") if part.strip()]
500
+ if len(parts) < 3:
501
+ return None
502
+
503
+ return (parts[-3], parts[-2], parts[-1])
504
+
505
+ def _lineage_table_ref_from_payload(
506
+ self,
507
+ payload: dict[str, Any],
508
+ ) -> tuple[str, str, str] | None:
509
+ nested = payload.get("tableInfo")
510
+ if isinstance(nested, dict):
511
+ payload = nested
512
+
513
+ catalog = payload.get("catalog_name") or payload.get("catalog")
514
+ schema = payload.get("schema_name") or payload.get("schema")
515
+ table = payload.get("name") or payload.get("table")
516
+
517
+ if all(isinstance(value, str) and value for value in (catalog, schema, table)):
518
+ return (catalog, schema, table)
519
+
520
+ table_name = payload.get("table_name") or payload.get("full_name")
521
+ if isinstance(table_name, str) and table_name.strip():
522
+ return self._parse_qualified_table_name(table_name)
523
+
524
+ return None
525
+
526
+ def _lineage_refs_for_table(self, table_ref: TableRef) -> set[tuple[str, str, str]]:
527
+ response = self._request_json(
528
+ "get",
529
+ "/api/2.0/lineage-tracking/table-lineage",
530
+ params={
531
+ "table_name": f"{table_ref.catalog}.{table_ref.schema}.{table_ref.table}",
532
+ "include_entity_lineage": str(
533
+ bool(self._extraction_options().include_notebooks)
534
+ ).lower(),
535
+ },
536
+ )
537
+
538
+ refs: set[tuple[str, str, str]] = set()
539
+
540
+ upstreams = response.get("upstreams")
541
+ if isinstance(upstreams, list):
542
+ for entry in upstreams:
543
+ if isinstance(entry, dict):
544
+ parsed = self._lineage_table_ref_from_payload(entry)
545
+ if parsed:
546
+ refs.add(parsed)
547
+
548
+ upstream_tables = response.get("upstream_tables")
549
+ if isinstance(upstream_tables, list):
550
+ for entry in upstream_tables:
551
+ if isinstance(entry, dict):
552
+ parsed = self._lineage_table_ref_from_payload(entry)
553
+ if parsed:
554
+ refs.add(parsed)
555
+
556
+ return refs
557
+
558
+ def _iter_notebooks(self) -> Generator[NotebookRef, None, None]:
559
+ if not self._extraction_options().include_notebooks:
560
+ return
561
+
562
+ queue: deque[str] = deque(["/"])
563
+ visited_paths: set[str] = set()
564
+
565
+ while queue:
566
+ if self._aborted:
567
+ break
568
+
569
+ path = queue.popleft()
570
+ if path in visited_paths:
571
+ continue
572
+ visited_paths.add(path)
573
+
574
+ try:
575
+ payload = self._request_json(
576
+ "get",
577
+ "/api/2.0/workspace/list",
578
+ params={"path": path},
579
+ )
580
+ except Exception as exc:
581
+ logger.warning("Skipping workspace path %s due to listing error: %s", path, exc)
582
+ continue
583
+
584
+ objects = payload.get("objects")
585
+ if not isinstance(objects, list):
586
+ continue
587
+
588
+ for obj in objects:
589
+ if not isinstance(obj, dict):
590
+ continue
591
+
592
+ object_type = str(obj.get("object_type") or "").upper()
593
+ object_path = obj.get("path")
594
+ if not isinstance(object_path, str) or not object_path:
595
+ continue
596
+
597
+ if object_type == "DIRECTORY":
598
+ queue.append(object_path)
599
+ continue
600
+
601
+ if object_type != "NOTEBOOK":
602
+ continue
603
+
604
+ object_id = obj.get("object_id")
605
+ yield NotebookRef(
606
+ path=object_path,
607
+ object_id=str(object_id) if object_id is not None else None,
608
+ language=(
609
+ str(obj.get("language")) if obj.get("language") is not None else None
610
+ ),
611
+ created_at_ms=(
612
+ int(obj["created_at"]) if isinstance(obj.get("created_at"), int) else None
613
+ ),
614
+ modified_at_ms=(
615
+ int(obj["modified_at"]) if isinstance(obj.get("modified_at"), int) else None
616
+ ),
617
+ )
618
+
619
+ def _iter_pipelines(self) -> Generator[PipelineRef, None, None]:
620
+ if not self._extraction_options().include_pipelines:
621
+ return
622
+
623
+ next_page_token: str | None = None
624
+ while True:
625
+ params = {}
626
+ if next_page_token:
627
+ params["page_token"] = next_page_token
628
+
629
+ try:
630
+ payload = self._request_json("get", "/api/2.0/pipelines", params=params)
631
+ except Exception as exc:
632
+ logger.warning("Could not list Databricks pipelines: %s", exc)
633
+ break
634
+
635
+ values: list[dict[str, Any]] = []
636
+ for key in ("statuses", "pipelines", "value", "items"):
637
+ candidate = payload.get(key)
638
+ if isinstance(candidate, list):
639
+ values = candidate
640
+ break
641
+
642
+ for entry in values:
643
+ pipeline_id = entry.get("pipeline_id") or entry.get("id")
644
+ if not isinstance(pipeline_id, str) or not pipeline_id:
645
+ continue
646
+
647
+ name = entry.get("name")
648
+ state = entry.get("state") or entry.get("health")
649
+ yield PipelineRef(
650
+ pipeline_id=pipeline_id,
651
+ name=str(name) if isinstance(name, str) and name else pipeline_id,
652
+ state=str(state) if isinstance(state, str) and state else None,
653
+ )
654
+
655
+ token = payload.get("next_page_token")
656
+ if not isinstance(token, str) or not token.strip():
657
+ break
658
+ next_page_token = token
659
+
660
def test_connection(self) -> dict[str, Any]:
    """Check REST and SQL connectivity and report the outcome.

    Lists catalogs through the REST API and runs ``SELECT 1`` on the SQL
    warehouse. Failures are captured in the result rather than raised.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status``
        (``"SUCCESS"`` or ``"FAILURE"``) and ``message``.
    """
    logger.info("Testing connection to Databricks Unity Catalog...")
    result = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        catalogs = self._list_catalogs()
        if not catalogs:
            raise ValueError("No Unity Catalog catalogs available for scanning")

        # Round-trip a trivial query to prove the warehouse is reachable.
        with closing(self._connect_sql()) as conn, conn.cursor() as cursor:
            cursor.execute("SELECT 1")
            cursor.fetchone()

        if isinstance(self.config.required, DatabricksRequiredPat):
            auth_mode = "PAT_TOKEN"
        else:
            auth_mode = "SERVICE_PRINCIPAL"

        result["status"] = "SUCCESS"
        result["message"] = (
            "Successfully connected to Databricks Unity Catalog "
            f"using {auth_mode}. Reachable catalogs: {len(catalogs)}."
        )
    except Exception as exc:
        result["status"] = "FAILURE"
        result["message"] = f"Failed to connect to Databricks Unity Catalog: {exc}"

    return result
692
+
693
def _table_to_asset(
    self,
    table_ref: TableRef,
    *,
    links: list[str] | None = None,
) -> SingleAssetScanResults:
    """Build the scan-result asset describing *table_ref*.

    Args:
        table_ref: Table or view to describe.
        links: Hashes of related assets; an empty list is used when omitted.

    Returns:
        A TABLE asset whose checksum is computed from the table's identifying
        metadata and sampling strategy; both timestamps are the current time.
    """
    catalog, schema, table = table_ref.catalog, table_ref.schema, table_ref.table

    checksum_input = {
        "catalog": catalog,
        "schema": schema,
        "table": table,
        "object_type": table_ref.object_type,
        "sampling": {
            "strategy": str(self._sampling().strategy),
        },
    }

    timestamp = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=self.generate_hash_id(self._table_raw_id(table_ref)),
        checksum=self.calculate_checksum(checksum_input),
        name=f"{catalog}.{schema}.{table}",
        external_url=f"{self._workspace_url()}/explore/data/{catalog}/{schema}/{table}",
        links=links or [],
        asset_type=OutputAssetType.TABLE,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
730
+
731
+ def _notebook_raw_id(self, notebook: NotebookRef) -> str:
732
+ return f"notebook_#_{notebook.path}"
733
+
734
def _notebook_to_asset(self, notebook: NotebookRef) -> SingleAssetScanResults:
    """Build the scan-result asset for *notebook* and cache its content.

    The notebook's metadata, as JSON and as ``key=value`` lines, is stored
    in ``self._content_cache`` under the asset hash.
    """
    asset_hash = self.generate_hash_id(self._notebook_raw_id(notebook))

    metadata = {
        "kind": "notebook",
        "path": notebook.path,
        "object_id": notebook.object_id,
        "language": notebook.language,
        "created_at_ms": notebook.created_at_ms,
        "modified_at_ms": notebook.modified_at_ms,
    }

    text_lines = (
        "kind=notebook",
        f"path={notebook.path}",
        f"language={notebook.language or 'unknown'}",
        f"object_id={notebook.object_id or 'unknown'}",
    )
    self._content_cache[asset_hash] = (
        json.dumps(metadata, ensure_ascii=False),
        "\n".join(text_lines),
    )

    timestamp = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(metadata),
        name=notebook.path,
        external_url=f"{self._workspace_url()}/#workspace{notebook.path}",
        links=[],
        asset_type=OutputAssetType.TXT,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
771
+
772
+ def _pipeline_raw_id(self, pipeline: PipelineRef) -> str:
773
+ return f"pipeline_#_{pipeline.pipeline_id}"
774
+
775
def _pipeline_to_asset(self, pipeline: PipelineRef) -> SingleAssetScanResults:
    """Build the scan-result asset for *pipeline* and cache its content.

    The pipeline's metadata, as JSON and as ``key=value`` lines, is stored
    in ``self._content_cache`` under the asset hash.
    """
    asset_hash = self.generate_hash_id(self._pipeline_raw_id(pipeline))

    metadata = {
        "kind": "pipeline",
        "pipeline_id": pipeline.pipeline_id,
        "name": pipeline.name,
        "state": pipeline.state,
    }

    text_lines = (
        "kind=pipeline",
        f"pipeline_id={pipeline.pipeline_id}",
        f"name={pipeline.name}",
        f"state={pipeline.state or 'unknown'}",
    )
    self._content_cache[asset_hash] = (
        json.dumps(metadata, ensure_ascii=False),
        "\n".join(text_lines),
    )

    timestamp = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(metadata),
        name=pipeline.name,
        external_url=f"{self._workspace_url()}/#joblist/pipelines/{pipeline.pipeline_id}",
        links=[],
        asset_type=OutputAssetType.TXT,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
810
+
811
+ STREAM_DETECTIONS = True
812
+
813
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield Databricks assets in batches: tables, then notebooks, then pipelines.

    A batch is yielded whenever it reaches ``self.BATCH_SIZE`` assets; whatever is
    left over is yielded once at the end. When table lineage is enabled, each
    table asset links to the hashes of its upstream tables that were discovered in
    this run.

    Abort handling: an abort seen during the table phase returns immediately
    without yielding the pending batch; an abort seen during the notebook or
    pipeline phase only stops that loop, so the pending batch is still yielded.
    """
    if self._aborted:
        return

    # 1. Discover all tables first to establish the scope for lineage links
    logger.info("Starting Databricks extraction: discovering tables...")
    tables = self._iter_tables()
    table_hash_by_key: dict[tuple[str, str, str], str] = {
        self._table_key(ref): self.generate_hash_id(self._table_raw_id(ref)) for ref in tables
    }

    # 2. Process tables
    with_lineage = self._extraction_options().include_table_lineage
    if with_lineage and tables:
        logger.info("Fetching table lineage for %d table(s)...", len(tables))

    pending: list[SingleAssetScanResults] = []
    emitted_tables = 0

    for position, table_ref in enumerate(tables, start=1):
        if self._aborted:
            return

        table_label = f"{table_ref.catalog}.{table_ref.schema}.{table_ref.table}"
        logger.info("Processing table %d/%d: %s", position, len(tables), table_label)

        upstream_hashes: list[str] = []
        if with_lineage:
            # Lineage is best-effort: a failure is logged and the table is emitted
            # without links.
            try:
                upstream_keys = self._lineage_refs_for_table(table_ref)
                upstream_hashes = [
                    table_hash_by_key[key]
                    for key in sorted(upstream_keys)
                    if key in table_hash_by_key
                ]
                if upstream_hashes:
                    logger.debug(
                        "%s has %d upstream link(s)", table_label, len(upstream_hashes)
                    )
            except Exception as exc:
                logger.warning(
                    "Could not resolve Databricks lineage for %s: %s", table_label, exc
                )

        table_asset = self._table_to_asset(table_ref, links=upstream_hashes)
        self._table_lookup[table_asset.hash] = table_ref
        pending.append(table_asset)
        emitted_tables += 1

        if len(pending) >= self.BATCH_SIZE:
            logger.info(
                "Emitting batch of %d table asset(s) (total so far: %d)",
                len(pending),
                emitted_tables,
            )
            yield pending
            # Rebind instead of clearing: the consumer may still hold the yielded list.
            pending = []

    # 3. Process notebooks
    notebook_count = 0
    for notebook in self._iter_notebooks():
        if self._aborted:
            break

        pending.append(self._notebook_to_asset(notebook))
        notebook_count += 1

        if len(pending) >= self.BATCH_SIZE:
            yield pending
            pending = []

    if notebook_count:
        logger.info("Discovered %d notebook(s)", notebook_count)

    # 4. Process pipelines
    pipeline_count = 0
    for pipeline_ref in self._iter_pipelines():
        if self._aborted:
            break

        pending.append(self._pipeline_to_asset(pipeline_ref))
        pipeline_count += 1

        if len(pending) >= self.BATCH_SIZE:
            yield pending
            pending = []

    if pipeline_count:
        logger.info("Discovered %d pipeline(s)", pipeline_count)

    if pending:
        logger.info("Emitting final batch of %d asset(s)", len(pending))
        yield pending

    logger.info(
        "Extraction complete: %d table(s), %d notebook(s), %d pipeline(s)",
        emitted_tables,
        notebook_count,
        pipeline_count,
    )
914
+
915
def generate_hash_id(self, asset_id: str) -> str:
    """Return ``hash_id`` of this source's asset-type value combined with *asset_id*."""
    asset_type = self._asset_type_value()
    return hash_id(asset_type, asset_id)
917
+
918
def _parse_table_ref_from_asset_id(self, asset_id: str) -> TableRef | None:
    """Resolve *asset_id* to a table reference, or ``None`` if it is not a table.

    Resolution order:
    1. a direct hit in ``self._table_lookup``;
    2. otherwise the id is split on ``_#_`` (after ``unhash_id`` when the id does
       not already contain that separator) and the last three segments are taken
       as catalog, schema and table.

    Ids whose second-to-last segment is ``notebook`` or ``pipeline`` and ids with
    fewer than three segments yield ``None``.
    """
    known_tables = self._table_lookup
    if asset_id in known_tables:
        return known_tables[asset_id]

    decoded = asset_id
    if "_#_" not in asset_id:
        try:
            decoded = unhash_id(asset_id)
        except Exception:
            # Not decodable: fall back to treating the id as already-raw text.
            pass

    segments = decoded.split("_#_")
    if len(segments) >= 2 and segments[-2] in ("notebook", "pipeline"):
        return None

    # Any leading segments (e.g. a source-type prefix) are ignored; only the
    # trailing catalog/schema/table triple matters.
    if len(segments) < 3:
        return None

    catalog, schema, table = segments[-3:]
    return TableRef(
        catalog=catalog,
        schema=schema,
        table=table,
        object_type="TABLE",
    )
950
+
951
def _available_columns(self, table_ref: TableRef) -> list[str]:
    """List the table's column names from ``system.information_schema.columns``.

    Names come back in ``ordinal_position`` order. Rows whose first cell cannot be
    read, or is not a string, are skipped.
    """
    catalog_literal = _quote_literal(table_ref.catalog)
    schema_literal = _quote_literal(table_ref.schema)
    table_literal = _quote_literal(table_ref.table)
    query = (
        "SELECT column_name "
        "FROM system.information_schema.columns "
        f"WHERE table_catalog = {catalog_literal} "
        f"AND table_schema = {schema_literal} "
        f"AND table_name = {table_literal} "
        "ORDER BY ordinal_position"
    )

    def first_cell(row: Any) -> Any | None:
        # Row objects are driver-specific; anything that cannot be indexed is ignored.
        try:
            return row[0]  # type: ignore[index]
        except Exception:
            return None

    with closing(self._connect_sql()) as conn, conn.cursor() as cursor:
        cursor.execute(query)
        cells = (first_cell(row) for row in cursor.fetchall())
        return [cell for cell in cells if isinstance(cell, str)]
974
+
975
def _resolve_latest_order_column(self, columns: list[str]) -> str | None:
    """Pick the column used to order rows for the "latest" sampling strategy.

    The configured ``order_by_column`` wins when it is set and present in
    *columns*. Otherwise the first match from a fixed list of common timestamp
    column names is returned, or ``None`` when nothing matches.
    """
    configured = self._sampling().order_by_column
    if configured and configured in columns:
        return configured

    # Checked in priority order; matching is exact (case-sensitive).
    fallback_names = (
        "updated_at",
        "modified_at",
        "created_at",
        "inserted_at",
        "timestamp",
        "ts",
        "date",
    )
    return next((name for name in fallback_names if name in columns), None)
995
+
996
def _build_sampling_query(
    self, table_ref: TableRef, columns: list[str]
) -> tuple[str, list[Any]]:
    """Build the SELECT statement used to sample *table_ref*.

    Returns ``(query, params)``; ``params`` is always an empty list.

    * ``LATEST``: order by the resolved order column descending; if none is found,
      order randomly unless ``fallback_to_random`` is explicitly ``False``.
    * ``RANDOM``: order by ``rand()``.
    * every strategy except ``ALL`` gets ``LIMIT rows_per_page`` (default 100).

    Raises:
        ValueError: if *columns* is empty.
    """
    sampling = self._sampling()

    if not columns:
        raise ValueError(
            f"Table {table_ref.catalog}.{table_ref.schema}.{table_ref.table} has no readable columns"
        )

    select_list = ", ".join(map(_quote_identifier, columns))
    qualified_table = (
        f"{_quote_identifier(table_ref.catalog)}."
        f"{_quote_identifier(table_ref.schema)}."
        f"{_quote_identifier(table_ref.table)}"
    )

    # Clauses are collected and joined with single spaces at the end.
    clauses = [f"SELECT {select_list} FROM {qualified_table}"]

    strategy = sampling.strategy
    if strategy == SamplingStrategy.LATEST:
        order_column = self._resolve_latest_order_column(columns)
        if order_column:
            clauses.append(f"ORDER BY {_quote_identifier(order_column)} DESC")
        elif sampling.fallback_to_random is not False:
            clauses.append("ORDER BY rand()")
    elif strategy == SamplingStrategy.RANDOM:
        clauses.append("ORDER BY rand()")

    if strategy != SamplingStrategy.ALL:
        clauses.append(f"LIMIT {int(sampling.rows_per_page or 100)}")

    return " ".join(clauses), []
1029
+
1030
def _count_table_rows(self, table_ref: TableRef) -> int | None:
    """Return ``COUNT(*)`` for *table_ref*, or ``None`` when it cannot be determined.

    This is best-effort: any failure (connection, permissions, unexpected result
    shape) yields ``None`` rather than an exception. The failure is logged at debug
    level so it is not lost entirely.
    """
    try:
        with closing(self._connect_sql()) as conn, conn.cursor() as cursor:
            cursor.execute(
                f"SELECT COUNT(*) FROM {_quote_identifier(table_ref.catalog)}.{_quote_identifier(table_ref.schema)}.{_quote_identifier(table_ref.table)}"
            )
            row = cursor.fetchone()
            return int(row[0]) if row else None
    except Exception as exc:
        logger.debug(
            "Could not count rows for %s.%s.%s: %s",
            table_ref.catalog,
            table_ref.schema,
            table_ref.table,
            exc,
        )
        return None
1041
+
1042
def _serialize_cell(self, value: Any) -> str:
    """Render a single cell value as text for sampled content.

    * ``None`` -> ``"null"``
    * binary values -> ``"<N bytes>"`` (the payload itself is never rendered)
    * ``datetime`` -> ISO-8601 via ``isoformat()``
    * anything else -> ``str(value)``
    """
    if value is None:
        return "null"
    # Measure binary values without copying them: ``nbytes`` is the byte size of a
    # memoryview regardless of its item size, and ``len`` is already the byte
    # count for bytes/bytearray.
    if isinstance(value, memoryview):
        return f"<{value.nbytes} bytes>"
    if isinstance(value, (bytes, bytearray)):
        return f"<{len(value)} bytes>"
    if isinstance(value, datetime):
        return value.isoformat()
    return str(value)
1050
+
1051
def _format_sample_content(
    self,
    table_ref: TableRef,
    column_names: list[str],
    rows: list[tuple[Any, ...]],
    row_offset: int = 0,
) -> tuple[str, str]:
    """Format sampled rows via ``format_tabular_sample_content``.

    The table is identified by its ``catalog.schema.table`` name, cells are
    rendered with ``self._serialize_cell``, and column names are included unless
    the sampling config sets ``include_column_names`` to ``False`` explicitly.
    *row_offset* is passed through unchanged.
    """
    sampling = self._sampling()
    qualified_name = f"{table_ref.catalog}.{table_ref.schema}.{table_ref.table}"
    table_identity = {
        "catalog": table_ref.catalog,
        "schema": table_ref.schema,
        "table": table_ref.table,
    }
    # Only an explicit False disables headers; None/unset keeps them.
    with_column_names = sampling.include_column_names is not False
    return format_tabular_sample_content(
        scope_label="table",
        scope_value=qualified_name,
        strategy=sampling.strategy,
        rows=rows,
        column_names=column_names,
        serialize_cell=self._serialize_cell,
        include_column_names=with_column_names,
        object_type=table_ref.object_type,
        raw_metadata=table_identity,
        row_offset=row_offset,
    )
1075
+
1076
def _fetch_one_page(
    self, table_ref: TableRef, base_query: str, page_size: int, offset: int
) -> tuple[list[tuple[Any, ...]], list[str]]:
    """Fetch one ``LIMIT``/``OFFSET`` page of *base_query* on a fresh connection.

    Returns ``(rows, column_names)``; ``column_names`` is empty when the cursor
    reports no description. The connection is closed before returning.
    *table_ref* is accepted but not used by this method.
    """
    paginated_query = f"{base_query} LIMIT {page_size} OFFSET {offset}"
    with closing(self._connect_sql()) as conn, conn.cursor() as cursor:
        cursor.execute(paginated_query)
        rows = list(cursor.fetchall())
        description = cursor.description
        column_names = [column[0] for column in description] if description else []
    return rows, column_names
1088
+
1089
def _fetch_one_page_on_conn(
    self,
    conn: Any,
    base_query: str,
    page_size: int,
    offset: int,
) -> tuple[list[tuple[Any, ...]], list[str]]:
    """Fetch one ``LIMIT``/``OFFSET`` page of *base_query* on an existing connection.

    Returns ``(rows, column_names)``; ``column_names`` is empty when the cursor
    reports no description. The caller keeps ownership of *conn*; only the cursor
    opened here is released.
    """
    paginated_query = f"{base_query} LIMIT {page_size} OFFSET {offset}"
    with conn.cursor() as cursor:
        cursor.execute(paginated_query)
        rows = list(cursor.fetchall())
        description = cursor.description
        column_names = [column[0] for column in description] if description else []
    return rows, column_names
1102
+
1103
@staticmethod
def _cursor_execute(cursor: Any, query: str) -> list[str]:
    """Execute *query* on *cursor* and return the result's column names.

    Returns an empty list when the cursor reports no description.
    """
    cursor.execute(query)
    description = cursor.description
    if not description:
        return []
    return [column[0] for column in description]
1107
+
1108
@staticmethod
def _cursor_fetchmany(cursor: Any, size: int) -> list[tuple[Any, ...]]:
    """Return up to *size* rows from *cursor* as a new list."""
    return [*cursor.fetchmany(size)]
1111
+
1112
def _fetch_sample_rows(
    self, table_ref: TableRef
) -> tuple[list[tuple[Any, ...]], list[str]] | None:
    """Run the sampling query for *table_ref* and return ``(rows, column_names)``.

    For the ``ALL`` strategy only the first page (``rows_per_page`` rows, default
    100, offset 0) is fetched; other strategies run the sampling query as built,
    which already carries its own ``LIMIT``. Returns ``None`` when the result has
    no columns.
    """
    table_label = f"{table_ref.catalog}.{table_ref.schema}.{table_ref.table}"
    columns = self._available_columns(table_ref)
    sampling = self._sampling()
    query, _unused_params = self._build_sampling_query(table_ref, columns)

    logger.info(
        "Sampling %s (%d column(s), strategy=%s)",
        table_label,
        len(columns),
        str(sampling.strategy),
    )

    if sampling.strategy != SamplingStrategy.ALL:
        with closing(self._connect_sql()) as conn, conn.cursor() as cursor:
            cursor.execute(query)
            rows = cursor.fetchall()
            column_names = [column[0] for column in cursor.description or []]
    else:
        first_page_size = int(sampling.rows_per_page or 100)
        rows, column_names = self._fetch_one_page(table_ref, query, first_page_size, 0)

    if not column_names:
        logger.warning("No columns returned for %s; skipping", table_label)
        return None

    logger.info("Fetched %d row(s) from %s", len(rows), table_label)
    return rows, column_names
1143
+
1144
def _sample_table_rows(self, table_ref: TableRef) -> tuple[str, str] | None:
    """Sample *table_ref* and return its formatted content, or ``None`` if nothing was fetched."""
    sample = self._fetch_sample_rows(table_ref)
    if sample is not None:
        rows, column_names = sample
        return self._format_sample_content(table_ref, column_names, rows)
    return None
1150
+
1151
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return the ``(raw, text)`` content for *asset_id*, or ``None`` if unavailable.

    Cached content is returned as-is. Otherwise the id is resolved to a table,
    the table is sampled, and the result is cached under *asset_id*. ``None`` is
    returned when the id does not resolve to a table or sampling yields nothing.
    """
    cached = self._content_cache.get(asset_id)
    if cached is not None:
        return cached

    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    if not table_ref:
        return None

    # Sampling issues blocking SQL calls; run it in a worker thread so the event
    # loop stays responsive, as fetch_content_pages does for its cursor work.
    sampled = await asyncio.to_thread(self._sample_table_rows, table_ref)
    if sampled is None:
        return None

    self._content_cache[asset_id] = sampled
    return sampled
1167
+
1168
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield the formatted content of a table one row at a time.

    Nothing is yielded when *asset_id* does not resolve to a table.

    * Any strategy other than ``ALL``: the sample is fetched once and each sampled
      row is yielded as its own formatted item.
    * ``ALL``: the full query is executed once and rows are pulled with
      ``fetchmany`` in pages of ``rows_per_page`` (default 100). Each row is
      yielded individually and also stored in ``self._content_cache`` under
      *asset_id*, so the cache holds the most recently yielded row. Streaming
      stops when the cursor is exhausted or the extraction is aborted.

    The cursor and connection are always released, including when the consumer
    stops iterating early.
    """
    sampling = self._sampling()
    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    if not table_ref:
        return

    table_label = f"{table_ref.catalog}.{table_ref.schema}.{table_ref.table}"

    if sampling.strategy != SamplingStrategy.ALL:
        result = self._fetch_sample_rows(table_ref)
        if result is None:
            return
        rows, column_names = result
        logger.info(
            "Scanning %s: %d row(s) [strategy=%s]",
            table_label,
            len(rows),
            str(sampling.strategy),
        )
        for i, row in enumerate(rows):
            formatted = self._format_sample_content(
                table_ref, column_names, [row], row_offset=i
            )
            if formatted:
                yield formatted
        return

    columns = self._available_columns(table_ref)
    query, _ = self._build_sampling_query(table_ref, columns)
    rows_per_page = int(sampling.rows_per_page or 100)

    # The row count is only used for progress logging; it may be unavailable.
    total_rows = self._count_table_rows(table_ref)
    total_batches = ((total_rows + rows_per_page - 1) // rows_per_page) if total_rows else None
    if total_rows is not None and total_batches is not None:
        logger.info(
            "Full scan %s: %d rows, %d batches of %d",
            table_label,
            total_rows,
            total_batches,
            rows_per_page,
        )

    # Stream rows via fetchmany on a single executed query, so no OFFSET-based
    # re-querying (and no primary key) is needed to page through the table.
    row_offset = 0
    page_num = 1

    conn = self._connect_sql()
    cursor = None
    try:
        # Created inside the try so the connection is still closed if this raises.
        cursor = conn.cursor()
        column_names = await asyncio.to_thread(self._cursor_execute, cursor, query)
        if not column_names:
            return

        while not self._aborted:
            rows = await asyncio.to_thread(self._cursor_fetchmany, cursor, rows_per_page)
            if not rows:
                break

            # Logged only for pages that actually contain rows; logging before the
            # fetch reported a phantom "N+1/N" batch when the row count was an
            # exact multiple of the page size.
            if total_batches is not None:
                logger.info("%s batch %d/%d", table_label, page_num, total_batches)

            # Yield each row individually so the consumer can process rows while
            # later pages are still being fetched.
            for i, row in enumerate(rows):
                formatted = self._format_sample_content(
                    table_ref, column_names, [row], row_offset=row_offset + i
                )
                if formatted:
                    self._content_cache[asset_id] = formatted
                    yield formatted

            row_offset += len(rows)
            page_num += 1
            # A short page means the cursor is exhausted; skip the extra round trip.
            if len(rows) < rows_per_page:
                break
    finally:
        if cursor is not None:
            try:
                cursor.close()
            except Exception as exc:
                # Best-effort: a failing cursor close must not prevent conn.close().
                logger.debug("Ignoring error closing cursor for %s: %s", table_label, exc)
        conn.close()
1250
+
1251
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set ``finding.location`` for a finding on a known table asset.

    Does nothing when ``asset.hash`` is not in ``self._table_lookup``. Otherwise
    the location is built by ``build_tabular_location`` from the table's
    ``catalog.schema.table`` path, the cached raw content for the asset (if any),
    and the ``tabular_row_index`` / ``tabular_column_name`` entries of the
    finding's metadata. *text_content* is not used.
    """
    del text_content
    table_ref = self._table_lookup.get(asset.hash)
    if not table_ref:
        return

    base_path = f"{table_ref.catalog}.{table_ref.schema}.{table_ref.table}"
    cache_entry = self._content_cache.get(asset.hash)
    # Cache entries are (raw, text) pairs; only the raw half is needed here.
    raw_content = cache_entry[0] if cache_entry else None
    finding_metadata = finding.metadata or {}
    finding.location = build_tabular_location(
        raw_content=raw_content,
        matched_content=finding.matched_content,
        base_path=base_path,
        row_index=finding_metadata.get("tabular_row_index"),
        column_name=finding_metadata.get("tabular_column_name"),
    )
1273
+
1274
def abort(self) -> None:
    """Log that the Databricks extraction is being aborted, then call the parent ``abort()``."""
    logger.info("Aborting Databricks extraction...")
    super().abort()
1277
+
1278
+ def cleanup(self) -> None:
1279
+ self.session.close()