classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,912 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
from collections.abc import AsyncGenerator
|
|
6
|
+
from contextlib import closing
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from datetime import UTC, date, datetime
|
|
9
|
+
from decimal import Decimal
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from ...models.generated_input import (
|
|
13
|
+
SamplingConfig,
|
|
14
|
+
SamplingStrategy,
|
|
15
|
+
SnowflakeInput,
|
|
16
|
+
SnowflakeMaskedDefaultAuthenticator,
|
|
17
|
+
SnowflakeMaskedExternalBrowserAuthenticator,
|
|
18
|
+
SnowflakeMaskedKeyPairAuthenticator,
|
|
19
|
+
SnowflakeMaskedOauthAuthenticatorToken,
|
|
20
|
+
SnowflakeOptionalConnection,
|
|
21
|
+
SnowflakeOptionalExtraction,
|
|
22
|
+
SnowflakeOptionalScope,
|
|
23
|
+
SnowflakeRequiredDefaultAuthenticator,
|
|
24
|
+
SnowflakeRequiredExternalBrowserAuthenticator,
|
|
25
|
+
SnowflakeRequiredKeyPairAuthenticator,
|
|
26
|
+
SnowflakeRequiredOauthAuthenticatorToken,
|
|
27
|
+
)
|
|
28
|
+
from ...models.generated_single_asset_scan_results import (
|
|
29
|
+
AssetType as OutputAssetType,
|
|
30
|
+
)
|
|
31
|
+
from ...models.generated_single_asset_scan_results import (
|
|
32
|
+
DetectionResult,
|
|
33
|
+
SingleAssetScanResults,
|
|
34
|
+
)
|
|
35
|
+
from ...utils.hashing import hash_id, unhash_id
|
|
36
|
+
from ..base import BaseSource
|
|
37
|
+
from ..dependencies import require_module
|
|
38
|
+
from ..tabular_utils import build_tabular_location, format_tabular_sample_content
|
|
39
|
+
|
|
40
|
+
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Database names (upper-case) skipped when no exclude_databases list is configured.
_DEFAULT_EXCLUDED_DATABASES = {"SNOWFLAKE", "SNOWFLAKE_SAMPLE_DATA"}
# Schema names (upper-case) skipped when no exclude_schemas list is configured.
_DEFAULT_EXCLUDED_SCHEMAS = {"INFORMATION_SCHEMA"}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass(frozen=True)
class TableRef:
    """Immutable, hashable reference to one Snowflake table or view."""

    # Name of the database that contains the object.
    database: str
    # Name of the schema that contains the object.
    schema: str
    # Name of the table or view itself.
    table: str
    # Kind of object; the code in this file stores "TABLE" or "VIEW" here.
    object_type: str
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _quote_identifier(identifier: str) -> str:
|
|
55
|
+
return '"' + identifier.replace('"', '""') + '"'
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class SnowflakeSource(BaseSource):
|
|
59
|
+
source_type = "snowflake"
|
|
60
|
+
|
|
61
|
+
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe, load the Snowflake connector and check the auth setup.

    Args:
        recipe: Raw recipe mapping; validated into a ``SnowflakeInput`` model.
        source_id: Forwarded to the base class.
        runner_id: Forwarded to the base class; ``"local-run"`` is stored on
            this instance when it is empty or ``None``.
    """
    super().__init__(recipe, source_id, runner_id)
    self.config = SnowflakeInput.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"

    # The connector module is resolved through require_module and kept on the instance.
    self._snowflake = require_module(
        module_name="snowflake.connector",
        source_name="Snowflake",
        uv_groups=["snowflake"],
        detail="The Snowflake connector is optional.",
    )
    self._validate_auth_configuration()

    # asset hash -> table reference, filled while assets are extracted.
    self._table_lookup: dict[str, TableRef] = {}
    self._content_cache: dict[str, tuple[str, str]] = {}
|
|
80
|
+
|
|
81
|
+
def _validate_auth_configuration(self) -> None:
    """Check that the ``masked`` section matches the ``required`` auth variant.

    Raises:
        ValueError: If the masked model does not belong to the selected
            authenticator, or the required model is none of the known variants.
    """
    required = self.config.required
    masked = self.config.masked

    # (required model, matching masked model, error raised on mismatch)
    expectations = (
        (
            SnowflakeRequiredDefaultAuthenticator,
            SnowflakeMaskedDefaultAuthenticator,
            "SNOWFLAKE DEFAULT_AUTHENTICATOR requires masked.username and masked.password",
        ),
        (
            SnowflakeRequiredExternalBrowserAuthenticator,
            SnowflakeMaskedExternalBrowserAuthenticator,
            "SNOWFLAKE EXTERNAL_BROWSER_AUTHENTICATOR requires masked.username",
        ),
        (
            SnowflakeRequiredKeyPairAuthenticator,
            SnowflakeMaskedKeyPairAuthenticator,
            "SNOWFLAKE KEY_PAIR_AUTHENTICATOR requires masked.username and masked.private_key",
        ),
        (
            SnowflakeRequiredOauthAuthenticatorToken,
            SnowflakeMaskedOauthAuthenticatorToken,
            "SNOWFLAKE OAUTH_AUTHENTICATOR_TOKEN requires masked.username and masked.token",
        ),
    )

    for required_cls, masked_cls, message in expectations:
        if isinstance(required, required_cls):
            if not isinstance(masked, masked_cls):
                raise ValueError(message)
            return

    raise ValueError("Unsupported SNOWFLAKE auth configuration")
|
|
114
|
+
|
|
115
|
+
def _asset_type_value(self) -> str:
|
|
116
|
+
type_value = self.config.type
|
|
117
|
+
return type_value.value if hasattr(type_value, "value") else str(type_value)
|
|
118
|
+
|
|
119
|
+
def _sampling(self) -> SamplingConfig:
|
|
120
|
+
return self.config.sampling
|
|
121
|
+
|
|
122
|
+
def _connection_options(self) -> SnowflakeOptionalConnection:
|
|
123
|
+
if self.config.optional and self.config.optional.connection:
|
|
124
|
+
return self.config.optional.connection
|
|
125
|
+
return SnowflakeOptionalConnection()
|
|
126
|
+
|
|
127
|
+
def _scope_options(self) -> SnowflakeOptionalScope:
|
|
128
|
+
if self.config.optional and self.config.optional.scope:
|
|
129
|
+
return self.config.optional.scope
|
|
130
|
+
return SnowflakeOptionalScope()
|
|
131
|
+
|
|
132
|
+
def _extraction_options(self) -> SnowflakeOptionalExtraction:
|
|
133
|
+
if self.config.optional and self.config.optional.extraction:
|
|
134
|
+
return self.config.optional.extraction
|
|
135
|
+
return SnowflakeOptionalExtraction()
|
|
136
|
+
|
|
137
|
+
def _account_id(self) -> str:
|
|
138
|
+
required = self.config.required
|
|
139
|
+
return required.account_id
|
|
140
|
+
|
|
141
|
+
def _snowflake_domain(self) -> str:
|
|
142
|
+
domain = self._connection_options().snowflake_domain
|
|
143
|
+
return str(domain or "snowflakecomputing.com")
|
|
144
|
+
|
|
145
|
+
def _account_locator(self) -> str:
|
|
146
|
+
account_id = self._account_id().strip().removeprefix("https://").removeprefix("http://")
|
|
147
|
+
account_id = account_id.rstrip("/")
|
|
148
|
+
suffix = f".{self._snowflake_domain()}"
|
|
149
|
+
if account_id.endswith(suffix):
|
|
150
|
+
return account_id[: -len(suffix)]
|
|
151
|
+
return account_id
|
|
152
|
+
|
|
153
|
+
def _username(self) -> str:
|
|
154
|
+
masked = self.config.masked
|
|
155
|
+
return masked.username
|
|
156
|
+
|
|
157
|
+
def _build_private_key_bytes(self, private_key: str, password: str | None) -> bytes:
    """Convert a PEM private key string into unencrypted DER/PKCS8 bytes.

    Args:
        private_key: PEM text; literal ``\\n`` sequences are turned into
            real newlines before parsing.
        password: Passphrase of the PEM key, or a falsy value when the key
            is not encrypted.

    Returns:
        The key serialized as DER, PKCS8, without encryption.
    """
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import serialization

    pem_text = private_key.replace("\\n", "\n")
    passphrase = password.encode() if password else None
    loaded_key = serialization.load_pem_private_key(
        pem_text.encode(),
        password=passphrase,
        backend=default_backend(),
    )
    return loaded_key.private_bytes(
        encoding=serialization.Encoding.DER,
        format=serialization.PrivateFormat.PKCS8,
        encryption_algorithm=serialization.NoEncryption(),
    )
|
|
172
|
+
|
|
173
|
+
def _connect(self):
    """Open a new Snowflake connection for the configured authenticator.

    Returns:
        Whatever ``connect`` of the loaded connector module returns.

    Raises:
        ValueError: If the required section is none of the known
            authenticator variants.
    """
    required = self.config.required
    masked = self.config.masked
    options = self._connection_options()

    connect_kwargs: dict[str, Any] = {
        "account": self._account_locator(),
        "user": self._username(),
        # Falls back to 30 when no timeout is configured.
        "login_timeout": int(options.connect_timeout_seconds or 30),
        "session_parameters": {"QUERY_TAG": "classifyre-snowflake-source"},
    }
    for option_name in ("warehouse", "role"):
        option_value = getattr(options, option_name)
        if option_value:
            connect_kwargs[option_name] = option_value

    auth_type = required.authentication_type
    if isinstance(required, SnowflakeRequiredDefaultAuthenticator):
        assert isinstance(masked, SnowflakeMaskedDefaultAuthenticator)
        connect_kwargs["password"] = masked.password
    elif isinstance(required, SnowflakeRequiredExternalBrowserAuthenticator):
        connect_kwargs["authenticator"] = "externalbrowser"
    elif isinstance(required, SnowflakeRequiredKeyPairAuthenticator):
        assert isinstance(masked, SnowflakeMaskedKeyPairAuthenticator)
        key_bytes = self._build_private_key_bytes(
            masked.private_key,
            masked.private_key_password,
        )
        connect_kwargs["private_key"] = key_bytes
        connect_kwargs["authenticator"] = "snowflake_jwt"
    elif isinstance(required, SnowflakeRequiredOauthAuthenticatorToken):
        assert isinstance(masked, SnowflakeMaskedOauthAuthenticatorToken)
        connect_kwargs["token"] = masked.token
        connect_kwargs["authenticator"] = "oauth"
    else:  # pragma: no cover - guarded in _validate_auth_configuration
        raise ValueError(f"Unsupported SNOWFLAKE authentication type: {auth_type}")

    # User-supplied connect_args are applied last, so they override the keys above.
    extra_args = options.connect_args
    if extra_args and isinstance(extra_args, dict):
        connect_kwargs.update(extra_args)

    return self._snowflake.connect(**connect_kwargs)
|
|
216
|
+
|
|
217
|
+
def _fetch_dict_rows(self, cursor: Any) -> list[dict[str, Any]]:
|
|
218
|
+
rows = cursor.fetchall()
|
|
219
|
+
description = getattr(cursor, "description", None) or []
|
|
220
|
+
columns = [str(col[0]).upper() for col in description if isinstance(col, tuple) and col]
|
|
221
|
+
|
|
222
|
+
result: list[dict[str, Any]] = []
|
|
223
|
+
for row in rows:
|
|
224
|
+
if isinstance(row, dict):
|
|
225
|
+
result.append({str(key).upper(): value for key, value in row.items()})
|
|
226
|
+
continue
|
|
227
|
+
|
|
228
|
+
if isinstance(row, tuple):
|
|
229
|
+
mapped: dict[str, Any] = {}
|
|
230
|
+
for index, value in enumerate(row):
|
|
231
|
+
key = columns[index] if index < len(columns) else f"COL_{index}"
|
|
232
|
+
mapped[key] = value
|
|
233
|
+
result.append(mapped)
|
|
234
|
+
|
|
235
|
+
return result
|
|
236
|
+
|
|
237
|
+
def _excluded_databases(self) -> set[str]:
|
|
238
|
+
configured = self._scope_options().exclude_databases or []
|
|
239
|
+
excluded = {name.strip().upper() for name in configured if name.strip()}
|
|
240
|
+
if not excluded:
|
|
241
|
+
excluded = set(_DEFAULT_EXCLUDED_DATABASES)
|
|
242
|
+
return excluded
|
|
243
|
+
|
|
244
|
+
def _schema_allowlist(self) -> set[str] | None:
|
|
245
|
+
configured = self._scope_options().include_schemas
|
|
246
|
+
if not configured:
|
|
247
|
+
return None
|
|
248
|
+
return {schema.strip().upper() for schema in configured if schema.strip()}
|
|
249
|
+
|
|
250
|
+
def _schema_denylist(self) -> set[str]:
|
|
251
|
+
configured = self._scope_options().exclude_schemas or []
|
|
252
|
+
denylist = {schema.strip().upper() for schema in configured if schema.strip()}
|
|
253
|
+
if not denylist:
|
|
254
|
+
denylist = set(_DEFAULT_EXCLUDED_SCHEMAS)
|
|
255
|
+
return denylist
|
|
256
|
+
|
|
257
|
+
def _object_allowlist(self) -> set[str]:
|
|
258
|
+
include_objects = self._scope_options().include_objects or []
|
|
259
|
+
return {entry.strip().lower() for entry in include_objects if entry.strip()}
|
|
260
|
+
|
|
261
|
+
def _include_tables_enabled(self) -> bool:
|
|
262
|
+
return self._scope_options().include_tables is not False
|
|
263
|
+
|
|
264
|
+
def _include_views_enabled(self) -> bool:
|
|
265
|
+
return self._scope_options().include_views is not False
|
|
266
|
+
|
|
267
|
+
def _include_table_lineage_enabled(self) -> bool:
|
|
268
|
+
return self._extraction_options().include_table_lineage is not False
|
|
269
|
+
|
|
270
|
+
def _include_view_lineage_enabled(self) -> bool:
|
|
271
|
+
return self._extraction_options().include_view_lineage is not False
|
|
272
|
+
|
|
273
|
+
def _resolve_databases(self) -> list[str]:
|
|
274
|
+
scope_options = self._scope_options()
|
|
275
|
+
include_all = bool(scope_options.include_all_databases)
|
|
276
|
+
configured_database = scope_options.database
|
|
277
|
+
|
|
278
|
+
if not include_all:
|
|
279
|
+
if configured_database:
|
|
280
|
+
return [configured_database]
|
|
281
|
+
raise ValueError(
|
|
282
|
+
"SNOWFLAKE source requires optional.scope.database when include_all_databases is false. "
|
|
283
|
+
"Set optional.scope.database (e.g. 'ANALYTICS') or enable include_all_databases."
|
|
284
|
+
)
|
|
285
|
+
|
|
286
|
+
excluded = self._excluded_databases()
|
|
287
|
+
databases: list[str] = []
|
|
288
|
+
with closing(self._connect()) as conn:
|
|
289
|
+
with conn.cursor() as cursor:
|
|
290
|
+
cursor.execute("SHOW DATABASES")
|
|
291
|
+
for row in self._fetch_dict_rows(cursor):
|
|
292
|
+
database_name = row.get("NAME")
|
|
293
|
+
if not isinstance(database_name, str) or not database_name:
|
|
294
|
+
continue
|
|
295
|
+
if database_name.upper() in excluded:
|
|
296
|
+
continue
|
|
297
|
+
databases.append(database_name)
|
|
298
|
+
|
|
299
|
+
if configured_database and configured_database not in databases:
|
|
300
|
+
databases.insert(0, configured_database)
|
|
301
|
+
|
|
302
|
+
return databases
|
|
303
|
+
|
|
304
|
+
def _list_tables_for_database(self, database: str) -> list[TableRef]:
    """List the tables and views of *database* that pass the scope filters.

    Rows come from ``<database>.INFORMATION_SCHEMA.TABLES`` ordered by schema
    and table name. A row is kept when its schema is not denied, is in the
    schema allowlist (if one is set), its kind (table/view) is enabled, and
    it matches the object allowlist (if one is set) either as
    ``schema.table`` or ``database.schema.table``, compared lower-cased.
    Collection stops once ``table_limit`` entries were gathered.

    Args:
        database: Name of the database to inspect.

    Returns:
        Matching references; empty when both tables and views are disabled.
    """
    want_tables = self._include_tables_enabled()
    want_views = self._include_views_enabled()
    if not (want_tables or want_views):
        return []

    allowed_schemas = self._schema_allowlist()
    denied_schemas = self._schema_denylist()
    allowed_objects = self._object_allowlist()
    configured_limit = self._scope_options().table_limit
    max_tables = int(configured_limit) if configured_limit else None

    query = f"""
        SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE
        FROM {_quote_identifier(database)}.INFORMATION_SCHEMA.TABLES
        ORDER BY TABLE_SCHEMA, TABLE_NAME
    """

    found: list[TableRef] = []
    with closing(self._connect()) as conn, conn.cursor() as cursor:
        cursor.execute(query)
        for row in self._fetch_dict_rows(cursor):
            schema_name = row.get("TABLE_SCHEMA")
            table_name = row.get("TABLE_NAME")
            if not (isinstance(schema_name, str) and isinstance(table_name, str)):
                continue

            schema_key = schema_name.upper()
            if schema_key in denied_schemas:
                continue
            if allowed_schemas and schema_key not in allowed_schemas:
                continue

            # Any type whose name contains "VIEW" is treated as a view.
            is_view = "VIEW" in str(row.get("TABLE_TYPE")).upper()
            if not (want_views if is_view else want_tables):
                continue

            if allowed_objects:
                candidates = {
                    f"{schema_name}.{table_name}".lower(),
                    f"{database}.{schema_name}.{table_name}".lower(),
                }
                if allowed_objects.isdisjoint(candidates):
                    continue

            found.append(
                TableRef(
                    database=database,
                    schema=schema_name,
                    table=table_name,
                    object_type="VIEW" if is_view else "TABLE",
                )
            )
            if max_tables is not None and len(found) >= max_tables:
                break

    return found
|
|
366
|
+
|
|
367
|
+
def _iter_tables(self) -> list[TableRef]:
|
|
368
|
+
tables: list[TableRef] = []
|
|
369
|
+
for database in self._resolve_databases():
|
|
370
|
+
if self._aborted:
|
|
371
|
+
break
|
|
372
|
+
try:
|
|
373
|
+
tables.extend(self._list_tables_for_database(database))
|
|
374
|
+
except Exception as exc:
|
|
375
|
+
logger.warning("Skipping database %s due to listing error: %s", database, exc)
|
|
376
|
+
return tables
|
|
377
|
+
|
|
378
|
+
def test_connection(self) -> dict[str, Any]:
    """Probe the Snowflake connection and report the outcome.

    Resolves the databases to scan and runs ``SELECT 1`` on a new
    connection. Exceptions are not propagated; they are reported in the
    returned dictionary.

    Returns:
        A dictionary with ``timestamp``, ``source_type``, ``status``
        (``"SUCCESS"`` or ``"FAILURE"``) and ``message``.
    """
    logger.info("Testing connection to SNOWFLAKE...")
    report: dict[str, Any] = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        databases = self._resolve_databases()
        if not databases:
            raise ValueError("No databases available for scanning")

        with closing(self._connect()) as conn, conn.cursor() as cursor:
            cursor.execute("SELECT 1")
            cursor.fetchone()
    except Exception as exc:
        report.update(
            status="FAILURE",
            message=f"Failed to connect to SNOWFLAKE: {exc}",
        )
    else:
        report.update(
            status="SUCCESS",
            message=f"Successfully connected to SNOWFLAKE. Reachable databases: {len(databases)}.",
        )

    return report
|
|
404
|
+
|
|
405
|
+
def _table_key(self, table_ref: TableRef) -> tuple[str, str, str]:
|
|
406
|
+
return (table_ref.database, table_ref.schema, table_ref.table)
|
|
407
|
+
|
|
408
|
+
def _table_raw_id(self, table_ref: TableRef) -> str:
|
|
409
|
+
return f"{table_ref.database}_#_{table_ref.schema}_#_{table_ref.table}"
|
|
410
|
+
|
|
411
|
+
def _collect_dependency_links(
|
|
412
|
+
self,
|
|
413
|
+
tables: list[TableRef],
|
|
414
|
+
) -> dict[tuple[str, str, str], set[tuple[str, str, str]]]:
|
|
415
|
+
if not self._include_table_lineage_enabled() and not self._include_view_lineage_enabled():
|
|
416
|
+
return {}
|
|
417
|
+
|
|
418
|
+
known_keys = {self._table_key(table_ref) for table_ref in tables}
|
|
419
|
+
view_keys = {
|
|
420
|
+
self._table_key(table_ref) for table_ref in tables if table_ref.object_type == "VIEW"
|
|
421
|
+
}
|
|
422
|
+
table_keys = {
|
|
423
|
+
self._table_key(table_ref) for table_ref in tables if table_ref.object_type == "TABLE"
|
|
424
|
+
}
|
|
425
|
+
if not known_keys:
|
|
426
|
+
return {}
|
|
427
|
+
|
|
428
|
+
query = """
|
|
429
|
+
SELECT
|
|
430
|
+
REFERENCING_DATABASE,
|
|
431
|
+
REFERENCING_SCHEMA,
|
|
432
|
+
REFERENCING_OBJECT_NAME,
|
|
433
|
+
REFERENCING_OBJECT_DOMAIN,
|
|
434
|
+
REFERENCED_DATABASE,
|
|
435
|
+
REFERENCED_SCHEMA,
|
|
436
|
+
REFERENCED_OBJECT_NAME,
|
|
437
|
+
REFERENCED_OBJECT_DOMAIN
|
|
438
|
+
FROM SNOWFLAKE.ACCOUNT_USAGE.OBJECT_DEPENDENCIES
|
|
439
|
+
WHERE REFERENCING_OBJECT_DOMAIN IN ('TABLE', 'VIEW')
|
|
440
|
+
AND REFERENCED_OBJECT_DOMAIN IN ('TABLE', 'VIEW')
|
|
441
|
+
"""
|
|
442
|
+
|
|
443
|
+
links: dict[tuple[str, str, str], set[tuple[str, str, str]]] = {}
|
|
444
|
+
try:
|
|
445
|
+
with closing(self._connect()) as conn:
|
|
446
|
+
with conn.cursor() as cursor:
|
|
447
|
+
cursor.execute(query)
|
|
448
|
+
for row in self._fetch_dict_rows(cursor):
|
|
449
|
+
source_db = row.get("REFERENCING_DATABASE")
|
|
450
|
+
source_schema = row.get("REFERENCING_SCHEMA")
|
|
451
|
+
source_table = row.get("REFERENCING_OBJECT_NAME")
|
|
452
|
+
source_domain = row.get("REFERENCING_OBJECT_DOMAIN")
|
|
453
|
+
target_db = row.get("REFERENCED_DATABASE")
|
|
454
|
+
target_schema = row.get("REFERENCED_SCHEMA")
|
|
455
|
+
target_table = row.get("REFERENCED_OBJECT_NAME")
|
|
456
|
+
if not all(
|
|
457
|
+
isinstance(value, str)
|
|
458
|
+
for value in (
|
|
459
|
+
source_db,
|
|
460
|
+
source_schema,
|
|
461
|
+
source_table,
|
|
462
|
+
source_domain,
|
|
463
|
+
target_db,
|
|
464
|
+
target_schema,
|
|
465
|
+
target_table,
|
|
466
|
+
)
|
|
467
|
+
):
|
|
468
|
+
continue
|
|
469
|
+
|
|
470
|
+
source_key = (source_db, source_schema, source_table)
|
|
471
|
+
target_key = (target_db, target_schema, target_table)
|
|
472
|
+
if source_key not in known_keys or target_key not in known_keys:
|
|
473
|
+
continue
|
|
474
|
+
|
|
475
|
+
source_is_view = source_key in view_keys
|
|
476
|
+
source_is_table = source_key in table_keys
|
|
477
|
+
if source_is_view and not self._include_view_lineage_enabled():
|
|
478
|
+
continue
|
|
479
|
+
if source_is_table and not self._include_table_lineage_enabled():
|
|
480
|
+
continue
|
|
481
|
+
|
|
482
|
+
links.setdefault(source_key, set()).add(target_key)
|
|
483
|
+
except Exception as exc:
|
|
484
|
+
logger.warning("Could not resolve Snowflake lineage links: %s", exc)
|
|
485
|
+
|
|
486
|
+
return links
|
|
487
|
+
|
|
488
|
+
def _table_to_asset(
    self,
    table_ref: TableRef,
    *,
    links: list[str] | None = None,
) -> SingleAssetScanResults:
    """Build the scan-result asset describing *table_ref*.

    Args:
        table_ref: Table or view to describe.
        links: Hashes of linked assets; treated as empty when omitted.

    Returns:
        An asset of type ``TABLE`` whose hash comes from the raw table id and
        whose checksum is computed over the metadata assembled here.
    """
    qualified_name = f"{table_ref.database}.{table_ref.schema}.{table_ref.table}"
    asset_hash = self.generate_hash_id(self._table_raw_id(table_ref))
    external_url = (
        f"snowflake://{self._account_locator()}/"
        f"{table_ref.database}/{table_ref.schema}.{table_ref.table}"
    )

    extraction = self._extraction_options()
    start_time = extraction.start_time
    # NOTE(review): bool() maps an unset (None) flag to False, while the
    # _include_*_lineage_enabled helpers treat None as enabled - confirm intended.
    lineage_info = {
        "start_time": start_time.isoformat() if start_time else None,
        "include_table_lineage": bool(extraction.include_table_lineage),
        "include_view_lineage": bool(extraction.include_view_lineage),
    }
    metadata = {
        "account_id": self._account_id(),
        "database": table_ref.database,
        "schema": table_ref.schema,
        "table": table_ref.table,
        "object_type": table_ref.object_type,
        "lineage": lineage_info,
        "sampling": {"strategy": str(self._sampling().strategy)},
    }

    # created_at and updated_at share one timestamp.
    timestamp = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(metadata),
        name=qualified_name,
        external_url=external_url,
        links=links or [],
        asset_type=OutputAssetType.TABLE,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
|
|
536
|
+
|
|
537
|
+
STREAM_DETECTIONS = True
|
|
538
|
+
|
|
539
|
+
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield assets for all in-scope tables in batches of ``BATCH_SIZE``.

    Each asset carries the hashes of the known objects it depends on, in
    sorted key order. Every produced asset hash is recorded in
    ``_table_lookup``. Generation ends without yielding the pending batch
    once the source is flagged as aborted.
    """
    if self._aborted:
        return

    tables = self._iter_tables()
    hash_for_key: dict[tuple[str, str, str], str] = {}
    for table_ref in tables:
        hash_for_key[self._table_key(table_ref)] = self.generate_hash_id(
            self._table_raw_id(table_ref)
        )
    lineage = self._collect_dependency_links(tables)

    pending: list[SingleAssetScanResults] = []
    for table_ref in tables:
        if self._aborted:
            return

        targets = sorted(lineage.get(self._table_key(table_ref), set()))
        linked_hashes = [hash_for_key[target] for target in targets if target in hash_for_key]

        asset = self._table_to_asset(table_ref, links=linked_hashes)
        self._table_lookup[asset.hash] = table_ref
        pending.append(asset)

        if len(pending) >= self.BATCH_SIZE:
            yield pending
            pending = []

    if pending:
        yield pending
|
|
572
|
+
|
|
573
|
+
def generate_hash_id(self, asset_id: str) -> str:
    """Hash *asset_id* with ``hash_id``, using the configured asset type value."""
    type_value = self._asset_type_value()
    return hash_id(type_value, asset_id)
|
|
575
|
+
|
|
576
|
+
def _parse_table_ref_from_asset_id(self, asset_id: str) -> TableRef | None:
|
|
577
|
+
if asset_id in self._table_lookup:
|
|
578
|
+
return self._table_lookup[asset_id]
|
|
579
|
+
|
|
580
|
+
decoded = asset_id
|
|
581
|
+
if "_#_" not in decoded:
|
|
582
|
+
try:
|
|
583
|
+
decoded = unhash_id(asset_id)
|
|
584
|
+
except Exception:
|
|
585
|
+
decoded = asset_id
|
|
586
|
+
|
|
587
|
+
if decoded.startswith("SNOWFLAKE_#_"):
|
|
588
|
+
decoded = decoded[len("SNOWFLAKE_#_") :]
|
|
589
|
+
|
|
590
|
+
parts = decoded.split("_#_")
|
|
591
|
+
if len(parts) >= 3:
|
|
592
|
+
return TableRef(
|
|
593
|
+
database=parts[-3],
|
|
594
|
+
schema=parts[-2],
|
|
595
|
+
table=parts[-1],
|
|
596
|
+
object_type="TABLE",
|
|
597
|
+
)
|
|
598
|
+
return None
|
|
599
|
+
|
|
600
|
+
def _available_columns(self, table_ref: TableRef) -> list[str]:
    """Return the column names of *table_ref* ordered by ordinal position.

    Names are read from the database's ``INFORMATION_SCHEMA.COLUMNS``; schema
    and table are passed as bound parameters. Non-string names are dropped.
    """
    query = f"""
        SELECT COLUMN_NAME
        FROM {_quote_identifier(table_ref.database)}.INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = %s
          AND TABLE_NAME = %s
        ORDER BY ORDINAL_POSITION
    """
    params = [table_ref.schema, table_ref.table]
    with closing(self._connect()) as conn, conn.cursor() as cursor:
        cursor.execute(query, params)
        names = (row.get("COLUMN_NAME") for row in self._fetch_dict_rows(cursor))
        return [name for name in names if isinstance(name, str)]
|
|
616
|
+
|
|
617
|
+
def _resolve_latest_order_column(self, columns: list[str]) -> str | None:
|
|
618
|
+
sampling = self._sampling()
|
|
619
|
+
configured = sampling.order_by_column
|
|
620
|
+
if configured:
|
|
621
|
+
for column in columns:
|
|
622
|
+
if column == configured or column.lower() == configured.lower():
|
|
623
|
+
return column
|
|
624
|
+
|
|
625
|
+
priority_candidates = (
|
|
626
|
+
"updated_at",
|
|
627
|
+
"modified_at",
|
|
628
|
+
"created_at",
|
|
629
|
+
"inserted_at",
|
|
630
|
+
"timestamp",
|
|
631
|
+
"ts",
|
|
632
|
+
"date",
|
|
633
|
+
)
|
|
634
|
+
lower_lookup = {column.lower(): column for column in columns}
|
|
635
|
+
for candidate in priority_candidates:
|
|
636
|
+
if candidate in lower_lookup:
|
|
637
|
+
return lower_lookup[candidate]
|
|
638
|
+
return None
|
|
639
|
+
|
|
640
|
+
def _build_sampling_query(
    self, table_ref: TableRef, columns: list[str]
) -> tuple[str, list[Any]]:
    """Build the SELECT statement used to sample rows from *table_ref*.

    The ``ALL`` strategy selects every row without ordering or limit.
    ``LATEST`` orders descending by the resolved order column, or randomly
    when none is found and ``fallback_to_random`` is not ``False``.
    ``RANDOM`` orders randomly. All strategies except ``ALL`` are limited to
    ``rows_per_page`` rows (100 when unset).

    Args:
        table_ref: Table or view to sample.
        columns: Column names to select.

    Returns:
        The SQL text and a parameter list, which is always empty here.

    Raises:
        ValueError: If *columns* is empty.
    """
    sampling = self._sampling()
    if not columns:
        raise ValueError(
            f"Table {table_ref.database}.{table_ref.schema}.{table_ref.table} has no readable columns"
        )

    select_list = ", ".join(map(_quote_identifier, columns))
    qualified_name = ".".join(
        _quote_identifier(part)
        for part in (table_ref.database, table_ref.schema, table_ref.table)
    )
    query = f"SELECT {select_list} FROM {qualified_name}"

    strategy = sampling.strategy
    if strategy == SamplingStrategy.ALL:
        return query, []

    order_clause = ""
    if strategy == SamplingStrategy.LATEST:
        order_column = self._resolve_latest_order_column(columns)
        if order_column:
            order_clause = f" ORDER BY {_quote_identifier(order_column)} DESC"
        elif sampling.fallback_to_random is not False:
            order_clause = " ORDER BY RANDOM()"
    elif strategy == SamplingStrategy.RANDOM:
        order_clause = " ORDER BY RANDOM()"

    row_limit = int(sampling.rows_per_page or 100)
    return f"{query}{order_clause} LIMIT {row_limit}", []
|
|
671
|
+
|
|
672
|
+
def _count_table_rows(self, table_ref: TableRef) -> int | None:
    """Return the number of rows in ``table_ref``, or ``None`` if unknown.

    This is a best-effort helper: any failure while connecting, querying or
    converting the result is logged at debug level and reported as ``None``
    instead of being raised.
    """
    try:
        with closing(self._connect()) as conn:
            with conn.cursor() as cursor:
                cursor.execute(
                    f"SELECT COUNT(*) FROM {_quote_identifier(table_ref.database)}.{_quote_identifier(table_ref.schema)}.{_quote_identifier(table_ref.table)}"
                )
                row = cursor.fetchone()
                return int(row[0]) if row else None
    except Exception:
        # The count is optional, so never fail the caller, but leave a trace
        # so a broken COUNT query can still be diagnosed.
        logger.debug("Row count unavailable for %r", table_ref, exc_info=True)
        return None
|
|
683
|
+
|
|
684
|
+
def _serialize_cell(self, value: Any) -> str:
|
|
685
|
+
if value is None:
|
|
686
|
+
return "null"
|
|
687
|
+
if isinstance(value, memoryview):
|
|
688
|
+
value = value.tobytes()
|
|
689
|
+
if isinstance(value, (bytes, bytearray)):
|
|
690
|
+
return f"<{len(value)} bytes>"
|
|
691
|
+
if isinstance(value, (datetime, date)):
|
|
692
|
+
return value.isoformat()
|
|
693
|
+
if isinstance(value, Decimal):
|
|
694
|
+
return str(value)
|
|
695
|
+
return str(value)
|
|
696
|
+
|
|
697
|
+
def _format_sample_content(
    self,
    table_ref: TableRef,
    column_names: list[str],
    rows: list[tuple[Any, ...]],
    row_offset: int = 0,
) -> tuple[str, str]:
    """Format ``rows`` of ``table_ref`` via ``format_tabular_sample_content``.

    The current sampling configuration supplies the strategy and decides
    whether column names are included (they are unless explicitly set to
    ``False``). ``row_offset`` is forwarded unchanged to the formatter.
    """
    sampling = self._sampling()
    database = table_ref.database
    schema = table_ref.schema
    table = table_ref.table
    location = {
        "database": database,
        "schema": schema,
        "table": table,
    }
    with_header = sampling.include_column_names is not False
    return format_tabular_sample_content(
        scope_label="table",
        scope_value=f"{database}.{schema}.{table}",
        strategy=sampling.strategy,
        rows=rows,
        column_names=column_names,
        serialize_cell=self._serialize_cell,
        include_column_names=with_header,
        object_type=table_ref.object_type,
        row_offset=row_offset,
        raw_metadata=location,
    )
|
|
721
|
+
|
|
722
|
+
def _normalize_rows(self, rows: list[Any], column_names: list[str]) -> list[tuple[Any, ...]]:
|
|
723
|
+
normalized: list[tuple[Any, ...]] = []
|
|
724
|
+
for row in rows:
|
|
725
|
+
if isinstance(row, tuple):
|
|
726
|
+
normalized.append(row)
|
|
727
|
+
elif isinstance(row, dict):
|
|
728
|
+
normalized.append(tuple(row.get(column) for column in column_names))
|
|
729
|
+
return normalized
|
|
730
|
+
|
|
731
|
+
def _fetch_one_page(
|
|
732
|
+
self, table_ref: TableRef, base_query: str, page_size: int, offset: int
|
|
733
|
+
) -> tuple[list[tuple[Any, ...]], list[str]]:
|
|
734
|
+
with closing(self._connect()) as conn:
|
|
735
|
+
paginated_query = f"{base_query} LIMIT {page_size} OFFSET {offset}"
|
|
736
|
+
with conn.cursor() as cursor:
|
|
737
|
+
cursor.execute(paginated_query, [])
|
|
738
|
+
raw_batch = list(cursor.fetchall())
|
|
739
|
+
description = getattr(cursor, "description", None) or []
|
|
740
|
+
column_names = [
|
|
741
|
+
str(col[0]) for col in description if isinstance(col, tuple) and col
|
|
742
|
+
]
|
|
743
|
+
rows = self._normalize_rows(raw_batch, column_names)
|
|
744
|
+
return rows, column_names
|
|
745
|
+
|
|
746
|
+
@staticmethod
|
|
747
|
+
def _cursor_execute(cursor: Any, query: str) -> list[str]:
|
|
748
|
+
cursor.execute(query, [])
|
|
749
|
+
description = getattr(cursor, "description", None) or []
|
|
750
|
+
return [str(col[0]) for col in description if isinstance(col, tuple) and col]
|
|
751
|
+
|
|
752
|
+
@staticmethod
|
|
753
|
+
def _cursor_fetchmany(cursor: Any, size: int) -> list[Any]:
|
|
754
|
+
return list(cursor.fetchmany(size))
|
|
755
|
+
|
|
756
|
+
def _fetch_sample_rows(
    self, table_ref: TableRef
) -> tuple[list[tuple[Any, ...]], list[str]] | None:
    """Fetch sample rows for ``table_ref`` under the configured strategy.

    For the ALL strategy only the first page (``rows_per_page`` rows, 100
    when unset) is read through ``_fetch_one_page``. For every other
    strategy the sampling query is executed once and fully fetched.

    Returns:
        ``(rows, column_names)``, or ``None`` when the cursor reported no
        column names.
    """
    available = self._available_columns(table_ref)
    sampling = self._sampling()
    query, params = self._build_sampling_query(table_ref, available)

    if sampling.strategy == SamplingStrategy.ALL:
        first_page_size = int(sampling.rows_per_page or 100)
        rows, column_names = self._fetch_one_page(table_ref, query, first_page_size, 0)
    else:
        with closing(self._connect()) as conn, conn.cursor() as cursor:
            cursor.execute(query, params)
            fetched = cursor.fetchall()
            column_names = []
            for entry in getattr(cursor, "description", None) or []:
                if isinstance(entry, tuple) and entry:
                    column_names.append(str(entry[0]))
            rows = self._normalize_rows(fetched, column_names)

    return (rows, column_names) if column_names else None
|
|
780
|
+
|
|
781
|
+
def _sample_table_rows(self, table_ref: TableRef) -> tuple[str, str] | None:
|
|
782
|
+
result = self._fetch_sample_rows(table_ref)
|
|
783
|
+
if result is None:
|
|
784
|
+
return None
|
|
785
|
+
rows, column_names = result
|
|
786
|
+
return self._format_sample_content(table_ref, column_names, rows)
|
|
787
|
+
|
|
788
|
+
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return the sampled content for ``asset_id``, using the cache first.

    A truthy cache entry is returned directly. Otherwise the asset id is
    resolved to a table reference, the table is sampled, and a successful
    result is stored in the cache before being returned. ``None`` is
    returned when the asset id cannot be resolved or sampling yields
    nothing.
    """
    if hit := self._content_cache.get(asset_id):
        return hit

    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    sampled = self._sample_table_rows(table_ref) if table_ref else None
    if sampled is not None:
        self._content_cache[asset_id] = sampled
    return sampled
|
|
804
|
+
|
|
805
|
+
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield the content of the table behind ``asset_id`` one row at a time.

    For every strategy other than ALL the sample is fetched with a single
    call to ``_fetch_sample_rows`` and each row is formatted and yielded
    separately. For ALL, the query is executed once and the result is
    drained with ``fetchmany`` in batches of ``rows_per_page`` (100 when
    unset) until the cursor is exhausted or the source is aborted.

    Each yielded value is also written to ``self._content_cache`` under
    ``asset_id``, so the cache holds the most recently yielded row.
    Nothing is yielded when the asset id cannot be resolved to a table.
    """
    sampling = self._sampling()

    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    if not table_ref:
        return

    if sampling.strategy != SamplingStrategy.ALL:
        result = self._fetch_sample_rows(table_ref)
        if result is None:
            return
        rows, column_names = result
        for i, row in enumerate(rows):
            formatted = self._format_sample_content(
                table_ref, column_names, [row], row_offset=i
            )
            if formatted:
                self._content_cache[asset_id] = formatted
                yield formatted
        return

    columns = self._available_columns(table_ref)
    query, _ = self._build_sampling_query(table_ref, columns)
    rows_per_page = int(sampling.rows_per_page or 100)
    table_label = f"{table_ref.database}.{table_ref.schema}.{table_ref.table}"

    # The row count is only used for progress logging; it may be None.
    total_rows = self._count_table_rows(table_ref)
    total_batches = ((total_rows + rows_per_page - 1) // rows_per_page) if total_rows else None
    if total_rows is not None and total_batches is not None:
        logger.info(
            "Full scan %s: %d rows, %d batches of %d",
            table_label,
            total_rows,
            total_batches,
            rows_per_page,
        )

    # Stream rows via fetchmany — O(1) per page at any offset, no PK needed.
    # Each fetchmany() advances the server-side result pointer without re-scanning.
    row_offset = 0
    page_num = 1

    conn = self._connect()
    cursor = None
    try:
        # Create the cursor inside the try block so the connection is still
        # closed if cursor creation itself fails.
        cursor = conn.cursor()
        column_names = await asyncio.to_thread(self._cursor_execute, cursor, query)
        if not column_names:
            return

        while not self._aborted:
            if total_batches is not None:
                logger.info("%s batch %d/%d", table_label, page_num, total_batches)

            raw_rows = await asyncio.to_thread(self._cursor_fetchmany, cursor, rows_per_page)
            if not raw_rows:
                break

            rows = self._normalize_rows(raw_rows, column_names)

            # Yield each row individually so detection runs in parallel with fetching.
            for i, row in enumerate(rows):
                formatted = self._format_sample_content(
                    table_ref, column_names, [row], row_offset=row_offset + i
                )
                if formatted:
                    self._content_cache[asset_id] = formatted
                    yield formatted

            row_offset += len(rows)
            page_num += 1
            # A short batch from the cursor means the result set is drained.
            # Compare the raw batch size so rows skipped by normalization
            # cannot end the scan early.
            if len(raw_rows) < rows_per_page:
                break
    finally:
        if cursor is not None:
            try:
                cursor.close()
            except Exception:
                # Closing is best-effort; the connection is closed regardless.
                logger.debug("Failed to close cursor for %s", table_label, exc_info=True)
        conn.close()
|
|
886
|
+
|
|
887
|
+
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set ``finding.location`` from the table the asset refers to.

    The table is looked up by ``asset.hash``; when it is unknown the
    finding is left untouched. The cached content for the asset (if any)
    and the ``tabular_row_index`` / ``tabular_column_name`` metadata
    entries are passed to ``build_tabular_location``. ``text_content`` is
    not used.
    """
    del text_content
    table_ref = self._table_lookup.get(asset.hash)
    if not table_ref:
        return

    base_path = f"{table_ref.database}.{table_ref.schema}.{table_ref.table}"
    cache_entry = self._content_cache.get(asset.hash)
    raw_content = None
    if cache_entry:
        raw_content = cache_entry[0]
    details = finding.metadata or {}
    finding.location = build_tabular_location(
        raw_content=raw_content,
        matched_content=finding.matched_content,
        base_path=base_path,
        row_index=details.get("tabular_row_index"),
        column_name=details.get("tabular_column_name"),
    )
|
|
909
|
+
|
|
910
|
+
def abort(self) -> None:
|
|
911
|
+
logger.info("Aborting SNOWFLAKE extraction...")
|
|
912
|
+
super().abort()
|