classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,912 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ from collections.abc import AsyncGenerator
6
+ from contextlib import closing
7
+ from dataclasses import dataclass
8
+ from datetime import UTC, date, datetime
9
+ from decimal import Decimal
10
+ from typing import Any
11
+
12
+ from ...models.generated_input import (
13
+ SamplingConfig,
14
+ SamplingStrategy,
15
+ SnowflakeInput,
16
+ SnowflakeMaskedDefaultAuthenticator,
17
+ SnowflakeMaskedExternalBrowserAuthenticator,
18
+ SnowflakeMaskedKeyPairAuthenticator,
19
+ SnowflakeMaskedOauthAuthenticatorToken,
20
+ SnowflakeOptionalConnection,
21
+ SnowflakeOptionalExtraction,
22
+ SnowflakeOptionalScope,
23
+ SnowflakeRequiredDefaultAuthenticator,
24
+ SnowflakeRequiredExternalBrowserAuthenticator,
25
+ SnowflakeRequiredKeyPairAuthenticator,
26
+ SnowflakeRequiredOauthAuthenticatorToken,
27
+ )
28
+ from ...models.generated_single_asset_scan_results import (
29
+ AssetType as OutputAssetType,
30
+ )
31
+ from ...models.generated_single_asset_scan_results import (
32
+ DetectionResult,
33
+ SingleAssetScanResults,
34
+ )
35
+ from ...utils.hashing import hash_id, unhash_id
36
+ from ..base import BaseSource
37
+ from ..dependencies import require_module
38
+ from ..tabular_utils import build_tabular_location, format_tabular_sample_content
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+ _DEFAULT_EXCLUDED_DATABASES = {"SNOWFLAKE", "SNOWFLAKE_SAMPLE_DATA"}
43
+ _DEFAULT_EXCLUDED_SCHEMAS = {"INFORMATION_SCHEMA"}
44
+
45
+
46
@dataclass(frozen=True)
class TableRef:
    """Immutable, hashable reference to one Snowflake table or view."""

    # Database (catalog) that owns the object.
    database: str
    # Schema inside ``database``.
    schema: str
    # Table or view name.
    table: str
    # "TABLE" or "VIEW" as assigned by the listing code in this module.
    object_type: str
52
+
53
+
54
+ def _quote_identifier(identifier: str) -> str:
55
+ return '"' + identifier.replace('"', '""') + '"'
56
+
57
+
58
+ class SnowflakeSource(BaseSource):
59
+ source_type = "snowflake"
60
+
61
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe, load the Snowflake connector and reset caches.

    Args:
        recipe: Raw source recipe; validated into a ``SnowflakeInput``.
        source_id: Forwarded to the base-class initializer.
        runner_id: Forwarded to the base-class initializer; ``"local-run"``
            is stored on the instance when it is missing or empty.

    Raises:
        ValueError: If the ``required`` and ``masked`` authentication
            sections of the recipe do not belong together.
    """
    super().__init__(recipe, source_id, runner_id)
    self.config = SnowflakeInput.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"

    # The connector is resolved through require_module rather than a plain
    # import because it is an optional dependency.
    connector = require_module(
        module_name="snowflake.connector",
        source_name="Snowflake",
        uv_groups=["snowflake"],
        detail="The Snowflake connector is optional.",
    )
    self._snowflake = connector
    self._validate_auth_configuration()

    # Asset hash -> TableRef, filled while assets are extracted.
    self._table_lookup: dict[str, TableRef] = {}
    # Asset id -> formatted sample content.
    self._content_cache: dict[str, tuple[str, str]] = {}
80
+
81
def _validate_auth_configuration(self) -> None:
    """Check that ``config.masked`` matches the auth type of ``config.required``.

    Raises:
        ValueError: If the masked section has the wrong type for the selected
            authenticator, or the authenticator type is not supported.
    """
    required = self.config.required
    masked = self.config.masked

    # (required model, matching masked model, error when they do not match)
    pairings = (
        (
            SnowflakeRequiredDefaultAuthenticator,
            SnowflakeMaskedDefaultAuthenticator,
            "SNOWFLAKE DEFAULT_AUTHENTICATOR requires masked.username and masked.password",
        ),
        (
            SnowflakeRequiredExternalBrowserAuthenticator,
            SnowflakeMaskedExternalBrowserAuthenticator,
            "SNOWFLAKE EXTERNAL_BROWSER_AUTHENTICATOR requires masked.username",
        ),
        (
            SnowflakeRequiredKeyPairAuthenticator,
            SnowflakeMaskedKeyPairAuthenticator,
            "SNOWFLAKE KEY_PAIR_AUTHENTICATOR requires masked.username and masked.private_key",
        ),
        (
            SnowflakeRequiredOauthAuthenticatorToken,
            SnowflakeMaskedOauthAuthenticatorToken,
            "SNOWFLAKE OAUTH_AUTHENTICATOR_TOKEN requires masked.username and masked.token",
        ),
    )

    for required_type, masked_type, message in pairings:
        if not isinstance(required, required_type):
            continue
        if not isinstance(masked, masked_type):
            raise ValueError(message)
        return

    raise ValueError("Unsupported SNOWFLAKE auth configuration")
114
+
115
+ def _asset_type_value(self) -> str:
116
+ type_value = self.config.type
117
+ return type_value.value if hasattr(type_value, "value") else str(type_value)
118
+
119
+ def _sampling(self) -> SamplingConfig:
120
+ return self.config.sampling
121
+
122
+ def _connection_options(self) -> SnowflakeOptionalConnection:
123
+ if self.config.optional and self.config.optional.connection:
124
+ return self.config.optional.connection
125
+ return SnowflakeOptionalConnection()
126
+
127
+ def _scope_options(self) -> SnowflakeOptionalScope:
128
+ if self.config.optional and self.config.optional.scope:
129
+ return self.config.optional.scope
130
+ return SnowflakeOptionalScope()
131
+
132
+ def _extraction_options(self) -> SnowflakeOptionalExtraction:
133
+ if self.config.optional and self.config.optional.extraction:
134
+ return self.config.optional.extraction
135
+ return SnowflakeOptionalExtraction()
136
+
137
+ def _account_id(self) -> str:
138
+ required = self.config.required
139
+ return required.account_id
140
+
141
+ def _snowflake_domain(self) -> str:
142
+ domain = self._connection_options().snowflake_domain
143
+ return str(domain or "snowflakecomputing.com")
144
+
145
+ def _account_locator(self) -> str:
146
+ account_id = self._account_id().strip().removeprefix("https://").removeprefix("http://")
147
+ account_id = account_id.rstrip("/")
148
+ suffix = f".{self._snowflake_domain()}"
149
+ if account_id.endswith(suffix):
150
+ return account_id[: -len(suffix)]
151
+ return account_id
152
+
153
+ def _username(self) -> str:
154
+ masked = self.config.masked
155
+ return masked.username
156
+
157
def _build_private_key_bytes(self, private_key: str, password: str | None) -> bytes:
    """Convert a PEM private key string into unencrypted PKCS#8 DER bytes.

    Args:
        private_key: PEM text; literal ``\\n`` sequences are turned into
            real newlines before parsing.
        password: Passphrase of the PEM key, or ``None``/empty when the key
            is not encrypted.

    Returns:
        The key serialized as DER / PKCS8 without encryption.
    """
    # Imported lazily so the dependency is only needed for key-pair auth.
    from cryptography.hazmat.backends import default_backend
    from cryptography.hazmat.primitives import serialization

    pem_bytes = private_key.replace("\\n", "\n").encode()
    passphrase = password.encode() if password else None
    loaded_key = serialization.load_pem_private_key(
        pem_bytes,
        password=passphrase,
        backend=default_backend(),
    )
    return loaded_key.private_bytes(
        encoding=serialization.Encoding.DER,
        format=serialization.PrivateFormat.PKCS8,
        encryption_algorithm=serialization.NoEncryption(),
    )
172
+
173
def _connect(self):
    """Open and return a new Snowflake connection for the configured auth type.

    Base keyword arguments (account, user, login timeout, query tag and the
    optional warehouse/role) are built first, then the credentials for the
    selected authenticator, and finally any user-supplied ``connect_args``,
    which may override earlier keys.

    Raises:
        ValueError: If the authentication type is not supported.
    """
    required = self.config.required
    masked = self.config.masked
    options = self._connection_options()

    kwargs: dict[str, Any] = {
        "account": self._account_locator(),
        "user": self._username(),
        "login_timeout": int(options.connect_timeout_seconds or 30),
        "session_parameters": {"QUERY_TAG": "classifyre-snowflake-source"},
    }
    for option_name in ("warehouse", "role"):
        option_value = getattr(options, option_name)
        if option_value:
            kwargs[option_name] = option_value

    auth_type = required.authentication_type
    if isinstance(required, SnowflakeRequiredDefaultAuthenticator):
        assert isinstance(masked, SnowflakeMaskedDefaultAuthenticator)
        kwargs["password"] = masked.password
    elif isinstance(required, SnowflakeRequiredExternalBrowserAuthenticator):
        kwargs["authenticator"] = "externalbrowser"
    elif isinstance(required, SnowflakeRequiredKeyPairAuthenticator):
        assert isinstance(masked, SnowflakeMaskedKeyPairAuthenticator)
        key_bytes = self._build_private_key_bytes(
            masked.private_key,
            masked.private_key_password,
        )
        kwargs["private_key"] = key_bytes
        kwargs["authenticator"] = "snowflake_jwt"
    elif isinstance(required, SnowflakeRequiredOauthAuthenticatorToken):
        assert isinstance(masked, SnowflakeMaskedOauthAuthenticatorToken)
        kwargs["token"] = masked.token
        kwargs["authenticator"] = "oauth"
    else:  # pragma: no cover - guarded in _validate_auth_configuration
        raise ValueError(f"Unsupported SNOWFLAKE authentication type: {auth_type}")

    # Applied last so explicit connect_args win over the defaults above.
    extra_args = options.connect_args or {}
    if isinstance(extra_args, dict):
        kwargs.update(extra_args)

    return self._snowflake.connect(**kwargs)
216
+
217
+ def _fetch_dict_rows(self, cursor: Any) -> list[dict[str, Any]]:
218
+ rows = cursor.fetchall()
219
+ description = getattr(cursor, "description", None) or []
220
+ columns = [str(col[0]).upper() for col in description if isinstance(col, tuple) and col]
221
+
222
+ result: list[dict[str, Any]] = []
223
+ for row in rows:
224
+ if isinstance(row, dict):
225
+ result.append({str(key).upper(): value for key, value in row.items()})
226
+ continue
227
+
228
+ if isinstance(row, tuple):
229
+ mapped: dict[str, Any] = {}
230
+ for index, value in enumerate(row):
231
+ key = columns[index] if index < len(columns) else f"COL_{index}"
232
+ mapped[key] = value
233
+ result.append(mapped)
234
+
235
+ return result
236
+
237
+ def _excluded_databases(self) -> set[str]:
238
+ configured = self._scope_options().exclude_databases or []
239
+ excluded = {name.strip().upper() for name in configured if name.strip()}
240
+ if not excluded:
241
+ excluded = set(_DEFAULT_EXCLUDED_DATABASES)
242
+ return excluded
243
+
244
+ def _schema_allowlist(self) -> set[str] | None:
245
+ configured = self._scope_options().include_schemas
246
+ if not configured:
247
+ return None
248
+ return {schema.strip().upper() for schema in configured if schema.strip()}
249
+
250
+ def _schema_denylist(self) -> set[str]:
251
+ configured = self._scope_options().exclude_schemas or []
252
+ denylist = {schema.strip().upper() for schema in configured if schema.strip()}
253
+ if not denylist:
254
+ denylist = set(_DEFAULT_EXCLUDED_SCHEMAS)
255
+ return denylist
256
+
257
+ def _object_allowlist(self) -> set[str]:
258
+ include_objects = self._scope_options().include_objects or []
259
+ return {entry.strip().lower() for entry in include_objects if entry.strip()}
260
+
261
+ def _include_tables_enabled(self) -> bool:
262
+ return self._scope_options().include_tables is not False
263
+
264
+ def _include_views_enabled(self) -> bool:
265
+ return self._scope_options().include_views is not False
266
+
267
+ def _include_table_lineage_enabled(self) -> bool:
268
+ return self._extraction_options().include_table_lineage is not False
269
+
270
+ def _include_view_lineage_enabled(self) -> bool:
271
+ return self._extraction_options().include_view_lineage is not False
272
+
273
+ def _resolve_databases(self) -> list[str]:
274
+ scope_options = self._scope_options()
275
+ include_all = bool(scope_options.include_all_databases)
276
+ configured_database = scope_options.database
277
+
278
+ if not include_all:
279
+ if configured_database:
280
+ return [configured_database]
281
+ raise ValueError(
282
+ "SNOWFLAKE source requires optional.scope.database when include_all_databases is false. "
283
+ "Set optional.scope.database (e.g. 'ANALYTICS') or enable include_all_databases."
284
+ )
285
+
286
+ excluded = self._excluded_databases()
287
+ databases: list[str] = []
288
+ with closing(self._connect()) as conn:
289
+ with conn.cursor() as cursor:
290
+ cursor.execute("SHOW DATABASES")
291
+ for row in self._fetch_dict_rows(cursor):
292
+ database_name = row.get("NAME")
293
+ if not isinstance(database_name, str) or not database_name:
294
+ continue
295
+ if database_name.upper() in excluded:
296
+ continue
297
+ databases.append(database_name)
298
+
299
+ if configured_database and configured_database not in databases:
300
+ databases.insert(0, configured_database)
301
+
302
+ return databases
303
+
304
def _list_tables_for_database(self, database: str) -> list[TableRef]:
    """List the tables and views of *database* that pass the scope filters.

    Rows from ``INFORMATION_SCHEMA.TABLES`` are filtered by schema deny/allow
    lists, by the include_tables / include_views switches and by the object
    allowlist (matched as ``schema.table`` or ``database.schema.table``,
    lower-cased). Listing stops once ``table_limit`` objects were collected.
    """
    want_tables = self._include_tables_enabled()
    want_views = self._include_views_enabled()
    if not (want_tables or want_views):
        return []

    allowed_schemas = self._schema_allowlist()
    denied_schemas = self._schema_denylist()
    allowed_objects = self._object_allowlist()
    raw_limit = self._scope_options().table_limit
    max_tables = int(raw_limit) if raw_limit else None

    query = f"""
        SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE
        FROM {_quote_identifier(database)}.INFORMATION_SCHEMA.TABLES
        ORDER BY TABLE_SCHEMA, TABLE_NAME
    """
    found: list[TableRef] = []
    with closing(self._connect()) as conn, conn.cursor() as cursor:
        cursor.execute(query)
        for row in self._fetch_dict_rows(cursor):
            schema_name = row.get("TABLE_SCHEMA")
            table_name = row.get("TABLE_NAME")
            if not (isinstance(schema_name, str) and isinstance(table_name, str)):
                continue

            schema_upper = schema_name.upper()
            if schema_upper in denied_schemas:
                continue
            if allowed_schemas and schema_upper not in allowed_schemas:
                continue

            # Anything whose TABLE_TYPE mentions VIEW is treated as a view.
            is_view = "VIEW" in str(row.get("TABLE_TYPE")).upper()
            if not (want_views if is_view else want_tables):
                continue

            if allowed_objects:
                short_name = f"{schema_name}.{table_name}".lower()
                full_name = f"{database}.{schema_name}.{table_name}".lower()
                if short_name not in allowed_objects and full_name not in allowed_objects:
                    continue

            found.append(
                TableRef(
                    database=database,
                    schema=schema_name,
                    table=table_name,
                    object_type="VIEW" if is_view else "TABLE",
                )
            )
            if max_tables is not None and len(found) >= max_tables:
                break

    return found
366
+
367
+ def _iter_tables(self) -> list[TableRef]:
368
+ tables: list[TableRef] = []
369
+ for database in self._resolve_databases():
370
+ if self._aborted:
371
+ break
372
+ try:
373
+ tables.extend(self._list_tables_for_database(database))
374
+ except Exception as exc:
375
+ logger.warning("Skipping database %s due to listing error: %s", database, exc)
376
+ return tables
377
+
378
def test_connection(self) -> dict[str, Any]:
    """Probe the Snowflake connection and report the outcome as a dict.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status`` (``"SUCCESS"``
        or ``"FAILURE"``) and a human-readable ``message``. Errors are never
        raised; they are reported through the ``FAILURE`` status.
    """
    logger.info("Testing connection to SNOWFLAKE...")
    outcome = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        databases = self._resolve_databases()
        if not databases:
            raise ValueError("No databases available for scanning")

        # A trivial round trip proves that queries can actually be executed.
        with closing(self._connect()) as conn, conn.cursor() as cursor:
            cursor.execute("SELECT 1")
            cursor.fetchone()

        outcome.update(
            status="SUCCESS",
            message=f"Successfully connected to SNOWFLAKE. Reachable databases: {len(databases)}.",
        )
    except Exception as exc:
        outcome.update(
            status="FAILURE",
            message=f"Failed to connect to SNOWFLAKE: {exc}",
        )

    return outcome
404
+
405
+ def _table_key(self, table_ref: TableRef) -> tuple[str, str, str]:
406
+ return (table_ref.database, table_ref.schema, table_ref.table)
407
+
408
+ def _table_raw_id(self, table_ref: TableRef) -> str:
409
+ return f"{table_ref.database}_#_{table_ref.schema}_#_{table_ref.table}"
410
+
411
+ def _collect_dependency_links(
412
+ self,
413
+ tables: list[TableRef],
414
+ ) -> dict[tuple[str, str, str], set[tuple[str, str, str]]]:
415
+ if not self._include_table_lineage_enabled() and not self._include_view_lineage_enabled():
416
+ return {}
417
+
418
+ known_keys = {self._table_key(table_ref) for table_ref in tables}
419
+ view_keys = {
420
+ self._table_key(table_ref) for table_ref in tables if table_ref.object_type == "VIEW"
421
+ }
422
+ table_keys = {
423
+ self._table_key(table_ref) for table_ref in tables if table_ref.object_type == "TABLE"
424
+ }
425
+ if not known_keys:
426
+ return {}
427
+
428
+ query = """
429
+ SELECT
430
+ REFERENCING_DATABASE,
431
+ REFERENCING_SCHEMA,
432
+ REFERENCING_OBJECT_NAME,
433
+ REFERENCING_OBJECT_DOMAIN,
434
+ REFERENCED_DATABASE,
435
+ REFERENCED_SCHEMA,
436
+ REFERENCED_OBJECT_NAME,
437
+ REFERENCED_OBJECT_DOMAIN
438
+ FROM SNOWFLAKE.ACCOUNT_USAGE.OBJECT_DEPENDENCIES
439
+ WHERE REFERENCING_OBJECT_DOMAIN IN ('TABLE', 'VIEW')
440
+ AND REFERENCED_OBJECT_DOMAIN IN ('TABLE', 'VIEW')
441
+ """
442
+
443
+ links: dict[tuple[str, str, str], set[tuple[str, str, str]]] = {}
444
+ try:
445
+ with closing(self._connect()) as conn:
446
+ with conn.cursor() as cursor:
447
+ cursor.execute(query)
448
+ for row in self._fetch_dict_rows(cursor):
449
+ source_db = row.get("REFERENCING_DATABASE")
450
+ source_schema = row.get("REFERENCING_SCHEMA")
451
+ source_table = row.get("REFERENCING_OBJECT_NAME")
452
+ source_domain = row.get("REFERENCING_OBJECT_DOMAIN")
453
+ target_db = row.get("REFERENCED_DATABASE")
454
+ target_schema = row.get("REFERENCED_SCHEMA")
455
+ target_table = row.get("REFERENCED_OBJECT_NAME")
456
+ if not all(
457
+ isinstance(value, str)
458
+ for value in (
459
+ source_db,
460
+ source_schema,
461
+ source_table,
462
+ source_domain,
463
+ target_db,
464
+ target_schema,
465
+ target_table,
466
+ )
467
+ ):
468
+ continue
469
+
470
+ source_key = (source_db, source_schema, source_table)
471
+ target_key = (target_db, target_schema, target_table)
472
+ if source_key not in known_keys or target_key not in known_keys:
473
+ continue
474
+
475
+ source_is_view = source_key in view_keys
476
+ source_is_table = source_key in table_keys
477
+ if source_is_view and not self._include_view_lineage_enabled():
478
+ continue
479
+ if source_is_table and not self._include_table_lineage_enabled():
480
+ continue
481
+
482
+ links.setdefault(source_key, set()).add(target_key)
483
+ except Exception as exc:
484
+ logger.warning("Could not resolve Snowflake lineage links: %s", exc)
485
+
486
+ return links
487
+
488
def _table_to_asset(
    self,
    table_ref: TableRef,
    *,
    links: list[str] | None = None,
) -> SingleAssetScanResults:
    """Build the scan-result asset describing *table_ref*.

    Args:
        table_ref: Table or view to describe.
        links: Hashes of assets this object depends on; ``None`` means none.

    Returns:
        A ``SingleAssetScanResults`` whose checksum is computed from the
        table identity plus the lineage and sampling settings.
    """
    database, schema, table = table_ref.database, table_ref.schema, table_ref.table
    asset_name = f"{database}.{schema}.{table}"
    asset_hash = self.generate_hash_id(self._table_raw_id(table_ref))
    external_url = f"snowflake://{self._account_locator()}/{database}/{schema}.{table}"

    extraction = self._extraction_options()
    start_time = extraction.start_time
    lineage_metadata = {
        "start_time": start_time.isoformat() if start_time else None,
        "include_table_lineage": bool(extraction.include_table_lineage),
        "include_view_lineage": bool(extraction.include_view_lineage),
    }
    # Only used as checksum input; it is not attached to the asset itself.
    metadata = {
        "account_id": self._account_id(),
        "database": database,
        "schema": schema,
        "table": table,
        "object_type": table_ref.object_type,
        "lineage": lineage_metadata,
        "sampling": {
            "strategy": str(self._sampling().strategy),
        },
    }

    timestamp = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(metadata),
        name=asset_name,
        external_url=external_url,
        links=links or [],
        asset_type=OutputAssetType.TABLE,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
536
+
537
# NOTE(review): class-level switch that is not read anywhere in this excerpt;
# presumably consumed by the pipeline/base class — confirm before changing.
STREAM_DETECTIONS = True
538
+
539
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield table/view assets in batches of at most ``BATCH_SIZE``.

    Each asset carries the hashes of the scanned objects it depends on, and
    every yielded asset is remembered in ``_table_lookup`` by hash. When the
    source is aborted the generator stops immediately and a partially filled
    batch is not yielded.
    """
    if self._aborted:
        return

    tables = self._iter_tables()
    hash_by_key: dict[tuple[str, str, str], str] = {}
    for table_ref in tables:
        hash_by_key[self._table_key(table_ref)] = self.generate_hash_id(
            self._table_raw_id(table_ref)
        )
    dependency_links = self._collect_dependency_links(tables)

    pending: list[SingleAssetScanResults] = []
    for table_ref in tables:
        if self._aborted:
            return

        targets = dependency_links.get(self._table_key(table_ref), set())
        # Sorted so the link order is stable between runs.
        linked_hashes = [hash_by_key[target] for target in sorted(targets) if target in hash_by_key]

        asset = self._table_to_asset(table_ref, links=linked_hashes)
        self._table_lookup[asset.hash] = table_ref
        pending.append(asset)

        if len(pending) >= self.BATCH_SIZE:
            yield pending
            pending = []

    if pending:
        yield pending
572
+
573
def generate_hash_id(self, asset_id: str) -> str:
    """Return the hashed id for *asset_id*, namespaced by the source type value."""
    namespace = self._asset_type_value()
    return hash_id(namespace, asset_id)
575
+
576
+ def _parse_table_ref_from_asset_id(self, asset_id: str) -> TableRef | None:
577
+ if asset_id in self._table_lookup:
578
+ return self._table_lookup[asset_id]
579
+
580
+ decoded = asset_id
581
+ if "_#_" not in decoded:
582
+ try:
583
+ decoded = unhash_id(asset_id)
584
+ except Exception:
585
+ decoded = asset_id
586
+
587
+ if decoded.startswith("SNOWFLAKE_#_"):
588
+ decoded = decoded[len("SNOWFLAKE_#_") :]
589
+
590
+ parts = decoded.split("_#_")
591
+ if len(parts) >= 3:
592
+ return TableRef(
593
+ database=parts[-3],
594
+ schema=parts[-2],
595
+ table=parts[-1],
596
+ object_type="TABLE",
597
+ )
598
+ return None
599
+
600
def _available_columns(self, table_ref: TableRef) -> list[str]:
    """Return the column names of *table_ref* ordered by ordinal position.

    Names come from the database's ``INFORMATION_SCHEMA.COLUMNS``; schema and
    table are passed as bind parameters.
    """
    query = f"""
        SELECT COLUMN_NAME
        FROM {_quote_identifier(table_ref.database)}.INFORMATION_SCHEMA.COLUMNS
        WHERE TABLE_SCHEMA = %s
          AND TABLE_NAME = %s
        ORDER BY ORDINAL_POSITION
    """
    with closing(self._connect()) as conn, conn.cursor() as cursor:
        cursor.execute(query, [table_ref.schema, table_ref.table])
        names = (row.get("COLUMN_NAME") for row in self._fetch_dict_rows(cursor))
        return [name for name in names if isinstance(name, str)]
616
+
617
+ def _resolve_latest_order_column(self, columns: list[str]) -> str | None:
618
+ sampling = self._sampling()
619
+ configured = sampling.order_by_column
620
+ if configured:
621
+ for column in columns:
622
+ if column == configured or column.lower() == configured.lower():
623
+ return column
624
+
625
+ priority_candidates = (
626
+ "updated_at",
627
+ "modified_at",
628
+ "created_at",
629
+ "inserted_at",
630
+ "timestamp",
631
+ "ts",
632
+ "date",
633
+ )
634
+ lower_lookup = {column.lower(): column for column in columns}
635
+ for candidate in priority_candidates:
636
+ if candidate in lower_lookup:
637
+ return lower_lookup[candidate]
638
+ return None
639
+
640
def _build_sampling_query(
    self, table_ref: TableRef, columns: list[str]
) -> tuple[str, list[Any]]:
    """Build the SELECT statement used to sample *table_ref*.

    ALL returns the bare SELECT with no ORDER BY or LIMIT. LATEST orders by
    the resolved order column (descending), or randomly when none is found
    and ``fallback_to_random`` is not ``False``. RANDOM orders randomly.
    Every strategy except ALL appends ``LIMIT rows_per_page`` (default 100).

    Returns:
        ``(query, params)``; ``params`` is always an empty list.

    Raises:
        ValueError: If *columns* is empty.
    """
    sampling = self._sampling()
    if not columns:
        raise ValueError(
            f"Table {table_ref.database}.{table_ref.schema}.{table_ref.table} has no readable columns"
        )

    select_list = ", ".join(map(_quote_identifier, columns))
    qualified_table = ".".join(
        _quote_identifier(part)
        for part in (table_ref.database, table_ref.schema, table_ref.table)
    )
    query = f"SELECT {select_list} FROM {qualified_table}"

    strategy = sampling.strategy
    if strategy == SamplingStrategy.ALL:
        return query, []

    order_clause = ""
    if strategy == SamplingStrategy.LATEST:
        order_column = self._resolve_latest_order_column(columns)
        if order_column:
            order_clause = f" ORDER BY {_quote_identifier(order_column)} DESC"
        elif sampling.fallback_to_random is not False:
            order_clause = " ORDER BY RANDOM()"
    elif strategy == SamplingStrategy.RANDOM:
        order_clause = " ORDER BY RANDOM()"

    row_limit = int(sampling.rows_per_page or 100)
    return f"{query}{order_clause} LIMIT {row_limit}", []
671
+
672
def _count_table_rows(self, table_ref: TableRef) -> int | None:
    """Return the row count of *table_ref*, or ``None`` when it cannot be determined.

    Counting is best effort: any error (connection, permission, SQL) is
    logged at debug level and reported as ``None`` rather than raised.
    """
    try:
        qualified_table = ".".join(
            _quote_identifier(part)
            for part in (table_ref.database, table_ref.schema, table_ref.table)
        )
        with closing(self._connect()) as conn, conn.cursor() as cursor:
            cursor.execute(f"SELECT COUNT(*) FROM {qualified_table}")
            row = cursor.fetchone()
            return int(row[0]) if row else None
    except Exception as exc:
        # Keep the best-effort contract but leave a trace for troubleshooting
        # instead of discarding the error silently.
        logger.debug("Could not count rows for %r: %s", table_ref, exc)
        return None
683
+
684
+ def _serialize_cell(self, value: Any) -> str:
685
+ if value is None:
686
+ return "null"
687
+ if isinstance(value, memoryview):
688
+ value = value.tobytes()
689
+ if isinstance(value, (bytes, bytearray)):
690
+ return f"<{len(value)} bytes>"
691
+ if isinstance(value, (datetime, date)):
692
+ return value.isoformat()
693
+ if isinstance(value, Decimal):
694
+ return str(value)
695
+ return str(value)
696
+
697
def _format_sample_content(
    self,
    table_ref: TableRef,
    column_names: list[str],
    rows: list[tuple[Any, ...]],
    row_offset: int = 0,
) -> tuple[str, str]:
    """Format sampled *rows* of *table_ref* via ``format_tabular_sample_content``.

    Args:
        table_ref: Table or view the rows were read from.
        column_names: Column names matching the positions in each row.
        rows: Sampled rows as tuples.
        row_offset: Passed through to the formatter unchanged.

    Returns:
        The two-string tuple produced by the shared tabular formatter.
    """
    sampling = self._sampling()
    qualified_name = f"{table_ref.database}.{table_ref.schema}.{table_ref.table}"
    location = {
        "database": table_ref.database,
        "schema": table_ref.schema,
        "table": table_ref.table,
    }
    # Column names are included unless explicitly switched off.
    with_column_names = sampling.include_column_names is not False
    return format_tabular_sample_content(
        scope_label="table",
        scope_value=qualified_name,
        strategy=sampling.strategy,
        rows=rows,
        column_names=column_names,
        serialize_cell=self._serialize_cell,
        include_column_names=with_column_names,
        object_type=table_ref.object_type,
        row_offset=row_offset,
        raw_metadata=location,
    )
721
+
722
+ def _normalize_rows(self, rows: list[Any], column_names: list[str]) -> list[tuple[Any, ...]]:
723
+ normalized: list[tuple[Any, ...]] = []
724
+ for row in rows:
725
+ if isinstance(row, tuple):
726
+ normalized.append(row)
727
+ elif isinstance(row, dict):
728
+ normalized.append(tuple(row.get(column) for column in column_names))
729
+ return normalized
730
+
731
+ def _fetch_one_page(
732
+ self, table_ref: TableRef, base_query: str, page_size: int, offset: int
733
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
734
+ with closing(self._connect()) as conn:
735
+ paginated_query = f"{base_query} LIMIT {page_size} OFFSET {offset}"
736
+ with conn.cursor() as cursor:
737
+ cursor.execute(paginated_query, [])
738
+ raw_batch = list(cursor.fetchall())
739
+ description = getattr(cursor, "description", None) or []
740
+ column_names = [
741
+ str(col[0]) for col in description if isinstance(col, tuple) and col
742
+ ]
743
+ rows = self._normalize_rows(raw_batch, column_names)
744
+ return rows, column_names
745
+
746
+ @staticmethod
747
+ def _cursor_execute(cursor: Any, query: str) -> list[str]:
748
+ cursor.execute(query, [])
749
+ description = getattr(cursor, "description", None) or []
750
+ return [str(col[0]) for col in description if isinstance(col, tuple) and col]
751
+
752
+ @staticmethod
753
+ def _cursor_fetchmany(cursor: Any, size: int) -> list[Any]:
754
+ return list(cursor.fetchmany(size))
755
+
756
def _fetch_sample_rows(
    self, table_ref: TableRef
) -> tuple[list[tuple[Any, ...]], list[str]] | None:
    """Fetch sample rows for *table_ref* according to the sampling strategy.

    For the ALL strategy only the first page (``rows_per_page``, default
    100) is fetched; other strategies run the sampling query as built.

    Returns:
        ``(rows, column_names)``, or ``None`` when the result exposes no
        column names.
    """
    columns = self._available_columns(table_ref)
    sampling = self._sampling()
    query, params = self._build_sampling_query(table_ref, columns)

    if sampling.strategy == SamplingStrategy.ALL:
        page_size = int(sampling.rows_per_page or 100)
        rows, column_names = self._fetch_one_page(table_ref, query, page_size, 0)
    else:
        with closing(self._connect()) as conn, conn.cursor() as cursor:
            cursor.execute(query, params)
            fetched = cursor.fetchall()
            described = getattr(cursor, "description", None) or []
            column_names = [
                str(entry[0]) for entry in described if isinstance(entry, tuple) and entry
            ]
            rows = self._normalize_rows(fetched, column_names)

    return (rows, column_names) if column_names else None
780
+
781
def _sample_table_rows(self, table_ref: TableRef) -> tuple[str, str] | None:
    """Fetch the sample rows of ``table_ref`` and format them as content.

    Returns ``None`` when ``_fetch_sample_rows`` yields nothing; otherwise
    returns whatever ``_format_sample_content`` produces for the rows.
    """
    fetched = self._fetch_sample_rows(table_ref)
    if fetched is None:
        return None
    sample_rows, header = fetched
    return self._format_sample_content(table_ref, header, sample_rows)
787
+
788
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return the sampled content for ``asset_id``, using the content cache.

    A truthy cache entry is returned directly. Otherwise the asset id is
    parsed into a table reference, the table is sampled, and a non-``None``
    result is stored in ``self._content_cache`` before being returned.
    ``None`` is returned when the id cannot be parsed or sampling yields
    nothing.
    """
    hit = self._content_cache.get(asset_id)
    if hit:
        return hit

    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    if not table_ref:
        return None

    content = self._sample_table_rows(table_ref)
    if content is not None:
        self._content_cache[asset_id] = content
    return content
804
+
805
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield formatted content for the table behind ``asset_id``, one row at a time.

    For every sampling strategy except ``SamplingStrategy.ALL`` the rows come
    from ``_fetch_sample_rows``. For ``ALL`` the sampling query is executed
    once and drained with ``fetchmany`` in batches of ``rows_per_page``
    (default 100) until the result is exhausted or ``self._aborted`` is set.
    Every yielded value also overwrites ``self._content_cache[asset_id]``.

    Nothing is yielded when the asset id cannot be parsed into a table
    reference, when sampling returns ``None``, or when the executed query
    reports no columns.
    """
    sampling = self._sampling()

    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    if not table_ref:
        return

    if sampling.strategy != SamplingStrategy.ALL:
        result = self._fetch_sample_rows(table_ref)
        if result is None:
            return
        rows, column_names = result
        for i, row in enumerate(rows):
            formatted = self._format_sample_content(
                table_ref, column_names, [row], row_offset=i
            )
            if formatted:
                self._content_cache[asset_id] = formatted
                yield formatted
        return

    columns = self._available_columns(table_ref)
    query, _ = self._build_sampling_query(table_ref, columns)
    rows_per_page = int(sampling.rows_per_page or 100)
    table_label = f"{table_ref.database}.{table_ref.schema}.{table_ref.table}"

    total_rows = self._count_table_rows(table_ref)
    # Ceiling division; left as None when the count is unknown or zero.
    total_batches = ((total_rows + rows_per_page - 1) // rows_per_page) if total_rows else None
    if total_batches is not None:
        logger.info(
            "Full scan %s: %d rows, %d batches of %d",
            table_label,
            total_rows,
            total_batches,
            rows_per_page,
        )

    # Stream rows via fetchmany — O(1) per page at any offset, no PK needed.
    # Each fetchmany() advances the server-side result pointer without re-scanning.
    row_offset = 0
    page_num = 1

    conn = self._connect()
    cursor = None
    try:
        # Acquired inside the try so the connection is still closed if
        # creating the cursor fails.
        cursor = conn.cursor()
        column_names = await asyncio.to_thread(self._cursor_execute, cursor, query)
        if not column_names:
            return

        while not self._aborted:
            raw_rows = await asyncio.to_thread(self._cursor_fetchmany, cursor, rows_per_page)
            if not raw_rows:
                break

            # Logged after the fetch so an exhausted result does not report
            # a batch number beyond the announced total.
            if total_batches is not None:
                logger.info("%s batch %d/%d", table_label, page_num, total_batches)

            rows = self._normalize_rows(raw_rows, column_names)

            # Yield each row individually so detection runs in parallel with fetching.
            for i, row in enumerate(rows):
                formatted = self._format_sample_content(
                    table_ref, column_names, [row], row_offset=row_offset + i
                )
                if formatted:
                    self._content_cache[asset_id] = formatted
                    yield formatted

            row_offset += len(rows)
            page_num += 1
            if len(rows) < rows_per_page:
                break
    finally:
        if cursor is not None:
            try:
                cursor.close()
            except Exception:
                # Best effort: a failing cursor close must not mask the
                # original error or prevent the connection from closing.
                logger.debug("Failed to close cursor for %s", table_label, exc_info=True)
        conn.close()
886
+
887
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set ``finding.location`` to a tabular location for a known table asset.

    Does nothing when ``asset.hash`` has no entry in ``self._table_lookup``.
    The raw content passed to ``build_tabular_location`` is the first element
    of the cached content for ``asset.hash`` (``None`` if nothing is cached);
    row index and column name are read from the finding's metadata.
    ``text_content`` is ignored.
    """
    del text_content
    ref = self._table_lookup.get(asset.hash)
    if not ref:
        return

    cache_entry = self._content_cache.get(asset.hash)
    meta = finding.metadata or {}
    finding.location = build_tabular_location(
        raw_content=cache_entry[0] if cache_entry else None,
        matched_content=finding.matched_content,
        base_path=f"{ref.database}.{ref.schema}.{ref.table}",
        row_index=meta.get("tabular_row_index"),
        column_name=meta.get("tabular_column_name"),
    )
909
+
910
def abort(self) -> None:
    """Log that the extraction is being aborted, then call the parent ``abort``."""
    logger.info("Aborting SNOWFLAKE extraction...")
    super().abort()