classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,1034 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ from collections.abc import AsyncGenerator
6
+ from contextlib import closing
7
+ from dataclasses import dataclass
8
+ from datetime import UTC, datetime
9
+ from typing import Any
10
+
11
+ from ...models.generated_input import (
12
+ MSSQLInput,
13
+ MSSQLOptionalConnection,
14
+ MSSQLOptionalExtraction,
15
+ MSSQLOptionalScope,
16
+ SamplingConfig,
17
+ SamplingStrategy,
18
+ )
19
+ from ...models.generated_single_asset_scan_results import (
20
+ AssetType as OutputAssetType,
21
+ )
22
+ from ...models.generated_single_asset_scan_results import (
23
+ DetectionResult,
24
+ SingleAssetScanResults,
25
+ )
26
+ from ...utils.hashing import hash_id, unhash_id
27
+ from ..base import BaseSource
28
+ from ..dependencies import require_module
29
+ from ..tabular_utils import build_tabular_location, format_tabular_sample_content
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+ _DEFAULT_EXCLUDED_DATABASES = {"master", "tempdb", "model"}
34
+ _DEFAULT_EXCLUDED_SCHEMAS = {"INFORMATION_SCHEMA", "sys"}
35
+
36
+
37
@dataclass(frozen=True)
class TableRef:
    """Immutable, hashable reference to one table or view."""

    # Database (catalog) that contains the object.
    database: str
    # Schema that contains the object.
    schema: str
    # Name of the table or view itself.
    table: str
    # "TABLE" or "VIEW", as assigned by the code that builds the reference.
    object_type: str
43
+
44
+
45
+ def _quote_identifier(identifier: str) -> str:
46
+ return f"[{identifier.replace(']', ']]')}]"
47
+
48
+
49
+ class MSSQLSource(BaseSource):
50
+ source_type = "mssql"
51
+
52
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe, load the optional driver and set up caches.

    Args:
        recipe: Raw recipe mapping; validated into ``MSSQLInput``.
        source_id: Forwarded to the base class.
        runner_id: Forwarded to the base class; ``"local-run"`` is stored
            on the instance when it is missing or empty.
    """
    super().__init__(recipe, source_id, runner_id)
    self.config = MSSQLInput.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"

    # pymssql is resolved through require_module rather than imported at
    # module level.
    self._pymssql = require_module(
        module_name="pymssql",
        source_name="MSSQL",
        uv_groups=["mssql"],
        detail="The MSSQL connector is optional.",
    )

    required = self.config.required
    self._host = required.host
    self._port = int(required.port)

    # Per-instance lookup tables and caches.
    self._table_lookup: dict[str, TableRef] = {}
    self._content_cache: dict[str, tuple[str, str]] = {}
    self._pk_columns_cache: dict[tuple[str, str, str], list[str]] = {}
    self._unsupported_feature_warning_logged = False
73
+
74
+ def _asset_type_value(self) -> str:
75
+ type_value = self.config.type
76
+ return type_value.value if hasattr(type_value, "value") else str(type_value)
77
+
78
+ def _sampling(self) -> SamplingConfig:
79
+ return self.config.sampling
80
+
81
+ def _connection_options(self) -> MSSQLOptionalConnection:
82
+ if self.config.optional and self.config.optional.connection:
83
+ return self.config.optional.connection
84
+ return MSSQLOptionalConnection()
85
+
86
+ def _scope_options(self) -> MSSQLOptionalScope:
87
+ if self.config.optional and self.config.optional.scope:
88
+ return self.config.optional.scope
89
+ return MSSQLOptionalScope()
90
+
91
+ def _extraction_options(self) -> MSSQLOptionalExtraction:
92
+ if self.config.optional and self.config.optional.extraction:
93
+ return self.config.optional.extraction
94
+ return MSSQLOptionalExtraction()
95
+
96
+ def _auth_mode(self) -> str:
97
+ connection = self._connection_options()
98
+ mode = getattr(connection, "auth_mode", None)
99
+ if hasattr(mode, "root"):
100
+ mode = mode.root
101
+ value = mode.value if hasattr(mode, "value") else mode
102
+ normalized = str(value or "CUSTOM").strip().upper()
103
+ return normalized if normalized else "CUSTOM"
104
+
105
+ def _ldap_domain(self) -> str | None:
106
+ connection = self._connection_options()
107
+ domain = getattr(connection, "ldap_domain", None)
108
+ if not isinstance(domain, str):
109
+ return None
110
+ cleaned = domain.strip()
111
+ return cleaned if cleaned else None
112
+
113
+ def _username(self) -> str:
114
+ username = self.config.masked.username
115
+ if self._auth_mode() != "LDAP":
116
+ return username
117
+
118
+ if "\\" in username or "@" in username:
119
+ return username
120
+
121
+ domain = self._ldap_domain()
122
+ if domain:
123
+ return f"{domain}\\{username}"
124
+ return username
125
+
126
+ def _password(self) -> str:
127
+ return self.config.masked.password
128
+
129
+ def _is_aws_rds(self) -> bool:
130
+ configured = self._connection_options().is_aws_rds
131
+ if isinstance(configured, bool):
132
+ return configured
133
+ hostname = self._host.strip().lower()
134
+ return hostname.endswith(".rds.amazonaws.com") or ".rds." in hostname
135
+
136
+ def _connect(self, database: str | None = None):
137
+ connection_options = self._connection_options()
138
+ connect_kwargs: dict[str, Any] = {
139
+ "server": self._host,
140
+ "port": int(self._port),
141
+ "user": self._username(),
142
+ "password": self._password(),
143
+ "login_timeout": int(connection_options.connect_timeout_seconds or 30),
144
+ "timeout": int(connection_options.connect_timeout_seconds or 30),
145
+ }
146
+ if database:
147
+ connect_kwargs["database"] = database
148
+
149
+ connection = self._pymssql.connect(**connect_kwargs)
150
+ try:
151
+ connection.autocommit(True)
152
+ except Exception:
153
+ pass
154
+ return connection
155
+
156
+ def _excluded_databases(self) -> set[str]:
157
+ configured = self._scope_options().exclude_databases or []
158
+ excluded = {name.strip() for name in configured if name.strip()}
159
+ if not excluded:
160
+ excluded = set(_DEFAULT_EXCLUDED_DATABASES)
161
+ return excluded
162
+
163
+ def _schema_allowlist(self) -> set[str] | None:
164
+ configured = self._scope_options().include_schemas
165
+ if not configured:
166
+ return None
167
+ return {schema.strip() for schema in configured if schema.strip()}
168
+
169
+ def _schema_denylist(self) -> set[str]:
170
+ configured = self._scope_options().exclude_schemas or []
171
+ denylist = {schema.strip() for schema in configured if schema.strip()}
172
+ if not denylist:
173
+ denylist = set(_DEFAULT_EXCLUDED_SCHEMAS)
174
+ return denylist
175
+
176
+ def _object_allowlist(self) -> set[str]:
177
+ include_objects = self._scope_options().include_objects or []
178
+ return {entry.strip().lower() for entry in include_objects if entry.strip()}
179
+
180
+ def _include_tables_enabled(self) -> bool:
181
+ return self._scope_options().include_tables is not False
182
+
183
+ def _include_views_enabled(self) -> bool:
184
+ return self._scope_options().include_views is not False
185
+
186
+ def _include_table_lineage_enabled(self) -> bool:
187
+ return self._extraction_options().include_table_lineage is not False
188
+
189
+ def _include_view_lineage_enabled(self) -> bool:
190
+ return self._extraction_options().include_view_lineage is not False
191
+
192
+ def _log_unsupported_extraction_features(self) -> None:
193
+ if self._unsupported_feature_warning_logged:
194
+ return
195
+ self._unsupported_feature_warning_logged = True
196
+
197
+ extraction = self._extraction_options()
198
+ unsupported: list[str] = []
199
+ if extraction.include_view_column_lineage:
200
+ unsupported.append("include_view_column_lineage")
201
+ if extraction.include_stored_procedures:
202
+ unsupported.append("include_stored_procedures")
203
+ if extraction.include_stored_procedures_code:
204
+ unsupported.append("include_stored_procedures_code")
205
+ if extraction.include_jobs:
206
+ unsupported.append("include_jobs")
207
+ if extraction.include_query_lineage:
208
+ unsupported.append("include_query_lineage")
209
+ if extraction.include_usage_statistics:
210
+ unsupported.append("include_usage_statistics")
211
+
212
+ if unsupported:
213
+ logger.warning(
214
+ "MSSQL extraction options currently not implemented in this connector: %s",
215
+ ", ".join(sorted(unsupported)),
216
+ )
217
+
218
+ def _resolve_databases(self) -> list[str]:
219
+ scope_options = self._scope_options()
220
+ include_all = bool(scope_options.include_all_databases)
221
+ configured_database = scope_options.database
222
+
223
+ if not include_all:
224
+ if configured_database:
225
+ return [configured_database]
226
+ raise ValueError(
227
+ "MSSQL source requires optional.scope.database when include_all_databases is false. "
228
+ "Set optional.scope.database (e.g. 'msdb') or enable include_all_databases."
229
+ )
230
+
231
+ excluded = self._excluded_databases()
232
+ databases: list[str] = []
233
+ with closing(self._connect()) as conn:
234
+ with conn.cursor() as cursor:
235
+ cursor.execute(
236
+ """
237
+ SELECT name
238
+ FROM sys.databases
239
+ WHERE state_desc = 'ONLINE'
240
+ ORDER BY name
241
+ """
242
+ )
243
+ for row in cursor.fetchall():
244
+ database_name = row[0] if isinstance(row, tuple) else None
245
+ if not isinstance(database_name, str) or not database_name:
246
+ continue
247
+ if database_name in excluded:
248
+ continue
249
+ databases.append(database_name)
250
+
251
+ if configured_database and configured_database not in databases:
252
+ databases.insert(0, configured_database)
253
+
254
+ return databases
255
+
256
+ def _get_primary_key_columns(self, table_ref: TableRef) -> list[str]:
257
+ cache_key = (table_ref.database, table_ref.schema, table_ref.table)
258
+ if cache_key in self._pk_columns_cache:
259
+ return self._pk_columns_cache[cache_key]
260
+
261
+ if table_ref.object_type == "VIEW":
262
+ self._pk_columns_cache[cache_key] = []
263
+ return []
264
+
265
+ try:
266
+ with closing(self._connect(table_ref.database)) as conn:
267
+ with conn.cursor() as cursor:
268
+ cursor.execute(
269
+ """
270
+ SELECT kcu.COLUMN_NAME
271
+ FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS tc
272
+ JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE kcu
273
+ ON tc.CONSTRAINT_NAME = kcu.CONSTRAINT_NAME
274
+ AND tc.TABLE_SCHEMA = kcu.TABLE_SCHEMA
275
+ AND tc.TABLE_NAME = kcu.TABLE_NAME
276
+ WHERE tc.CONSTRAINT_TYPE = 'PRIMARY KEY'
277
+ AND tc.TABLE_CATALOG = %s
278
+ AND tc.TABLE_SCHEMA = %s
279
+ AND tc.TABLE_NAME = %s
280
+ ORDER BY kcu.ORDINAL_POSITION
281
+ """,
282
+ (table_ref.database, table_ref.schema, table_ref.table),
283
+ )
284
+ columns = [
285
+ row[0]
286
+ for row in cursor.fetchall()
287
+ if isinstance(row, tuple) and row and isinstance(row[0], str)
288
+ ]
289
+ except Exception:
290
+ columns = []
291
+
292
+ self._pk_columns_cache[cache_key] = columns
293
+ return columns
294
+
295
+ def _list_tables_for_database(self, database: str) -> list[TableRef]:
296
+ include_tables = self._include_tables_enabled()
297
+ include_views = self._include_views_enabled()
298
+ if not include_tables and not include_views:
299
+ return []
300
+
301
+ schema_allowlist = self._schema_allowlist()
302
+ schema_denylist = self._schema_denylist()
303
+ object_allowlist = self._object_allowlist()
304
+ table_limit = self._scope_options().table_limit
305
+ limit = int(table_limit) if table_limit else None
306
+
307
+ tables: list[TableRef] = []
308
+ with closing(self._connect(database)) as conn:
309
+ with conn.cursor() as cursor:
310
+ cursor.execute(
311
+ """
312
+ SELECT TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE
313
+ FROM INFORMATION_SCHEMA.TABLES
314
+ WHERE TABLE_CATALOG = %s
315
+ ORDER BY TABLE_SCHEMA, TABLE_NAME
316
+ """,
317
+ (database,),
318
+ )
319
+ for row in cursor.fetchall():
320
+ if not isinstance(row, tuple) or len(row) < 3:
321
+ continue
322
+
323
+ schema_name = row[0]
324
+ table_name = row[1]
325
+ table_type = row[2]
326
+ if not isinstance(schema_name, str) or not isinstance(table_name, str):
327
+ continue
328
+ if schema_name in schema_denylist:
329
+ continue
330
+ if schema_allowlist and schema_name not in schema_allowlist:
331
+ continue
332
+
333
+ is_view = str(table_type).upper() == "VIEW"
334
+ if is_view and not include_views:
335
+ continue
336
+ if not is_view and not include_tables:
337
+ continue
338
+
339
+ scoped_name = f"{schema_name}.{table_name}".lower()
340
+ db_scoped_name = f"{database}.{schema_name}.{table_name}".lower()
341
+ if (
342
+ object_allowlist
343
+ and scoped_name not in object_allowlist
344
+ and db_scoped_name not in object_allowlist
345
+ ):
346
+ continue
347
+
348
+ tables.append(
349
+ TableRef(
350
+ database=database,
351
+ schema=schema_name,
352
+ table=table_name,
353
+ object_type="VIEW" if is_view else "TABLE",
354
+ )
355
+ )
356
+ if limit is not None and len(tables) >= limit:
357
+ break
358
+
359
+ return tables
360
+
361
+ def _iter_tables(self) -> list[TableRef]:
362
+ tables: list[TableRef] = []
363
+ for database in self._resolve_databases():
364
+ if self._aborted:
365
+ break
366
+ try:
367
+ tables.extend(self._list_tables_for_database(database))
368
+ except Exception as exc:
369
+ logger.warning("Skipping database %s due to listing error: %s", database, exc)
370
+ return tables
371
+
372
def test_connection(self) -> dict[str, Any]:
    """Check connectivity and report the outcome as a dictionary.

    Resolves the databases to scan, connects to the first one and runs
    ``SELECT 1``.

    Returns:
        A mapping with ``timestamp``, ``source_type``, ``status``
        (``"SUCCESS"`` or ``"FAILURE"``) and a human-readable ``message``.
        Errors are reported through the mapping rather than raised.
    """
    logger.info("Testing connection to MSSQL...")
    outcome: dict[str, Any] = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        reachable = self._resolve_databases()
        if not reachable:
            raise ValueError("No databases available for scanning")

        with closing(self._connect(reachable[0])) as conn, conn.cursor() as cursor:
            cursor.execute("SELECT 1")
            cursor.fetchone()
    except Exception as exc:
        outcome["status"] = "FAILURE"
        outcome["message"] = f"Failed to connect to MSSQL: {exc}"
    else:
        outcome["status"] = "SUCCESS"
        outcome["message"] = (
            f"Successfully connected to MSSQL. Reachable databases: {len(reachable)}."
        )

    return outcome
398
+
399
+ def _table_key(self, table_ref: TableRef) -> tuple[str, str, str]:
400
+ return (table_ref.database, table_ref.schema, table_ref.table)
401
+
402
+ def _table_raw_id(self, table_ref: TableRef) -> str:
403
+ return f"{table_ref.database}_#_{table_ref.schema}_#_{table_ref.table}"
404
+
405
+ def _collect_foreign_key_links(
406
+ self,
407
+ tables: list[TableRef],
408
+ ) -> dict[tuple[str, str, str], set[tuple[str, str, str]]]:
409
+ table_keys = {
410
+ self._table_key(table_ref) for table_ref in tables if table_ref.object_type == "TABLE"
411
+ }
412
+ by_database: dict[str, set[tuple[str, str, str]]] = {}
413
+ for table_ref in tables:
414
+ if table_ref.object_type != "TABLE":
415
+ continue
416
+ by_database.setdefault(table_ref.database, set()).add(self._table_key(table_ref))
417
+
418
+ links: dict[tuple[str, str, str], set[tuple[str, str, str]]] = {}
419
+ for database, scoped_keys in by_database.items():
420
+ try:
421
+ with closing(self._connect(database)) as conn:
422
+ with conn.cursor() as cursor:
423
+ cursor.execute(
424
+ """
425
+ SELECT
426
+ OBJECT_SCHEMA_NAME(fk.parent_object_id) AS source_schema,
427
+ OBJECT_NAME(fk.parent_object_id) AS source_table,
428
+ OBJECT_SCHEMA_NAME(fk.referenced_object_id) AS target_schema,
429
+ OBJECT_NAME(fk.referenced_object_id) AS target_table
430
+ FROM sys.foreign_keys fk
431
+ """
432
+ )
433
+ for row in cursor.fetchall():
434
+ if not isinstance(row, tuple) or len(row) < 4:
435
+ continue
436
+ source_schema, source_table, target_schema, target_table = row
437
+ source_key = (database, str(source_schema), str(source_table))
438
+ target_key = (database, str(target_schema), str(target_table))
439
+ if source_key not in scoped_keys or target_key not in table_keys:
440
+ continue
441
+ links.setdefault(source_key, set()).add(target_key)
442
+ except Exception as exc:
443
+ logger.warning(
444
+ "Could not resolve foreign key links for database %s: %s",
445
+ database,
446
+ exc,
447
+ )
448
+
449
+ return links
450
+
451
+ def _collect_view_dependency_links(
452
+ self,
453
+ tables: list[TableRef],
454
+ ) -> dict[tuple[str, str, str], set[tuple[str, str, str]]]:
455
+ table_keys = {self._table_key(table_ref) for table_ref in tables}
456
+ view_keys = {
457
+ self._table_key(table_ref) for table_ref in tables if table_ref.object_type == "VIEW"
458
+ }
459
+
460
+ by_database: dict[str, set[tuple[str, str, str]]] = {}
461
+ for table_ref in tables:
462
+ if table_ref.object_type != "VIEW":
463
+ continue
464
+ by_database.setdefault(table_ref.database, set()).add(self._table_key(table_ref))
465
+
466
+ links: dict[tuple[str, str, str], set[tuple[str, str, str]]] = {}
467
+ for database, scoped_views in by_database.items():
468
+ try:
469
+ with closing(self._connect(database)) as conn:
470
+ with conn.cursor() as cursor:
471
+ cursor.execute(
472
+ """
473
+ SELECT
474
+ OBJECT_SCHEMA_NAME(dep.referencing_id) AS source_schema,
475
+ OBJECT_NAME(dep.referencing_id) AS source_name,
476
+ OBJECT_SCHEMA_NAME(dep.referenced_id) AS target_schema,
477
+ OBJECT_NAME(dep.referenced_id) AS target_name
478
+ FROM sys.sql_expression_dependencies dep
479
+ JOIN sys.views v ON dep.referencing_id = v.object_id
480
+ WHERE dep.referenced_id IS NOT NULL
481
+ """
482
+ )
483
+ for row in cursor.fetchall():
484
+ if not isinstance(row, tuple) or len(row) < 4:
485
+ continue
486
+
487
+ source_schema, source_name, target_schema, target_name = row
488
+ source_key = (database, str(source_schema), str(source_name))
489
+ target_key = (database, str(target_schema), str(target_name))
490
+
491
+ if source_key not in scoped_views:
492
+ continue
493
+ if target_key not in table_keys:
494
+ continue
495
+ if source_key not in view_keys:
496
+ continue
497
+
498
+ links.setdefault(source_key, set()).add(target_key)
499
+ except Exception as exc:
500
+ logger.warning(
501
+ "Could not resolve view lineage links for database %s: %s",
502
+ database,
503
+ exc,
504
+ )
505
+
506
+ return links
507
+
508
+ def _collect_dependency_links(
509
+ self,
510
+ tables: list[TableRef],
511
+ ) -> dict[tuple[str, str, str], set[tuple[str, str, str]]]:
512
+ links: dict[tuple[str, str, str], set[tuple[str, str, str]]] = {}
513
+
514
+ if self._include_table_lineage_enabled():
515
+ fk_links = self._collect_foreign_key_links(tables)
516
+ for source, targets in fk_links.items():
517
+ links.setdefault(source, set()).update(targets)
518
+
519
+ if self._include_view_lineage_enabled():
520
+ view_links = self._collect_view_dependency_links(tables)
521
+ for source, targets in view_links.items():
522
+ links.setdefault(source, set()).update(targets)
523
+
524
+ return links
525
+
526
def _table_to_asset(
    self,
    table_ref: TableRef,
    *,
    links: list[str] | None = None,
) -> SingleAssetScanResults:
    """Build the scan-result asset describing one table or view.

    Args:
        table_ref: Object to describe.
        links: Hashes of related assets; an empty list is used when
            omitted or empty.

    Returns:
        A ``SingleAssetScanResults`` whose checksum is computed from the
        object's metadata and whose timestamps are both set to now (UTC).
    """
    database, schema, table = table_ref.database, table_ref.schema, table_ref.table

    asset_hash = self.generate_hash_id(self._table_raw_id(table_ref))

    # Only used as input to the checksum below.
    checksum_input = {
        "database": database,
        "schema": schema,
        "table": table,
        "object_type": table_ref.object_type,
        "is_aws_rds": self._is_aws_rds(),
        "sampling": {
            "strategy": str(self._sampling().strategy),
        },
    }

    timestamp = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(checksum_input),
        name=f"{database}.{schema}.{table}",
        external_url=f"mssql://{self._host}:{self._port}/{database}/{schema}.{table}",
        links=links or [],
        asset_type=OutputAssetType.TABLE,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
564
+
565
+ STREAM_DETECTIONS = True
566
+
567
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield table and view assets in batches of up to ``BATCH_SIZE``.

    All objects are listed and their dependency links resolved up front.
    Each produced asset is also recorded in ``_table_lookup`` under its
    hash. If the source is aborted, generation stops without yielding the
    batch being assembled.
    """
    if self._aborted:
        return

    self._log_unsupported_extraction_features()

    tables = self._iter_tables()
    hash_for_key: dict[tuple[str, str, str], str] = {}
    for ref in tables:
        hash_for_key[self._table_key(ref)] = self.generate_hash_id(self._table_raw_id(ref))
    dependency_links = self._collect_dependency_links(tables)

    pending: list[SingleAssetScanResults] = []
    for ref in tables:
        if self._aborted:
            return

        targets = dependency_links.get(self._table_key(ref), set())
        # Sorted so the link order is stable between runs.
        link_hashes = [hash_for_key[t] for t in sorted(targets) if t in hash_for_key]

        asset = self._table_to_asset(ref, links=link_hashes)
        self._table_lookup[asset.hash] = ref
        pending.append(asset)

        if len(pending) >= self.BATCH_SIZE:
            yield pending
            pending = []

    if pending:
        yield pending
602
+
603
def generate_hash_id(self, asset_id: str) -> str:
    """Hash *asset_id* together with this source's asset type value."""
    asset_type = self._asset_type_value()
    return hash_id(asset_type, asset_id)
605
+
606
+ def _parse_table_ref_from_asset_id(self, asset_id: str) -> TableRef | None:
607
+ if asset_id in self._table_lookup:
608
+ return self._table_lookup[asset_id]
609
+
610
+ decoded = asset_id
611
+ if "_#_" not in decoded:
612
+ try:
613
+ decoded = unhash_id(asset_id)
614
+ except Exception:
615
+ decoded = asset_id
616
+
617
+ parts = decoded.split("_#_")
618
+ if len(parts) >= 4 and parts[0].upper() == "MSSQL":
619
+ return TableRef(
620
+ database=parts[-3],
621
+ schema=parts[-2],
622
+ table=parts[-1],
623
+ object_type="TABLE",
624
+ )
625
+ if len(parts) >= 3:
626
+ return TableRef(
627
+ database=parts[-3],
628
+ schema=parts[-2],
629
+ table=parts[-1],
630
+ object_type="TABLE",
631
+ )
632
+ return None
633
+
634
+ def _available_columns(self, table_ref: TableRef) -> list[str]:
635
+ with closing(self._connect(table_ref.database)) as conn:
636
+ with conn.cursor() as cursor:
637
+ cursor.execute(
638
+ """
639
+ SELECT COLUMN_NAME
640
+ FROM INFORMATION_SCHEMA.COLUMNS
641
+ WHERE TABLE_CATALOG = %s
642
+ AND TABLE_SCHEMA = %s
643
+ AND TABLE_NAME = %s
644
+ ORDER BY ORDINAL_POSITION
645
+ """,
646
+ (table_ref.database, table_ref.schema, table_ref.table),
647
+ )
648
+ return [
649
+ row[0]
650
+ for row in cursor.fetchall()
651
+ if isinstance(row, tuple) and row and isinstance(row[0], str)
652
+ ]
653
+
654
+ def _resolve_latest_order_column(self, columns: list[str]) -> str | None:
655
+ sampling = self._sampling()
656
+ configured = sampling.order_by_column
657
+ if configured and configured in columns:
658
+ return configured
659
+
660
+ priority_candidates = (
661
+ "updated_at",
662
+ "modified_at",
663
+ "created_at",
664
+ "inserted_at",
665
+ "timestamp",
666
+ "ts",
667
+ "date",
668
+ )
669
+ for candidate in priority_candidates:
670
+ if candidate in columns:
671
+ return candidate
672
+ return None
673
+
674
+ def _build_sampling_query(
675
+ self, table_ref: TableRef, columns: list[str]
676
+ ) -> tuple[str, list[Any]]:
677
+ sampling = self._sampling()
678
+ if not columns:
679
+ raise ValueError(
680
+ f"Table {table_ref.database}.{table_ref.schema}.{table_ref.table} has no readable columns"
681
+ )
682
+
683
+ quoted_columns = ", ".join(_quote_identifier(column) for column in columns)
684
+
685
+ strategy = sampling.strategy
686
+ if strategy == SamplingStrategy.ALL:
687
+ query = (
688
+ f"SELECT {quoted_columns} FROM "
689
+ f"{_quote_identifier(table_ref.schema)}.{_quote_identifier(table_ref.table)}"
690
+ )
691
+ return query, []
692
+
693
+ rows_per_page = int(sampling.rows_per_page or 100)
694
+ query = (
695
+ f"SELECT TOP {rows_per_page} {quoted_columns} FROM "
696
+ f"{_quote_identifier(table_ref.schema)}.{_quote_identifier(table_ref.table)}"
697
+ )
698
+
699
+ if strategy == SamplingStrategy.LATEST:
700
+ order_column = self._resolve_latest_order_column(columns)
701
+ if order_column:
702
+ query += f" ORDER BY {_quote_identifier(order_column)} DESC"
703
+ elif sampling.fallback_to_random is not False:
704
+ query += " ORDER BY NEWID()"
705
+ elif strategy == SamplingStrategy.RANDOM:
706
+ query += " ORDER BY NEWID()"
707
+
708
+ return query, []
709
+
710
def _count_table_rows(self, table_ref: TableRef) -> int | None:
    """Return the row count of a table or view, or ``None`` on failure.

    Runs ``SELECT COUNT(*)`` on a fresh connection. Any error is logged at
    debug level and reported as ``None``, so callers can treat the count
    as optional.
    """
    try:
        with closing(self._connect(table_ref.database)) as conn:
            with conn.cursor() as cursor:
                cursor.execute(
                    f"SELECT COUNT(*) FROM {_quote_identifier(table_ref.schema)}.{_quote_identifier(table_ref.table)}"
                )
                row = cursor.fetchone()
                return int(row[0]) if row else None
    except Exception as exc:
        # Best effort: keep returning None, but leave a trace of why.
        logger.debug(
            "Could not count rows for %s.%s.%s: %s",
            table_ref.database,
            table_ref.schema,
            table_ref.table,
            exc,
        )
        return None
721
+
722
+ def _serialize_cell(self, value: Any) -> str:
723
+ if value is None:
724
+ return "null"
725
+ if isinstance(value, memoryview):
726
+ value = value.tobytes()
727
+ if isinstance(value, (bytes, bytearray)):
728
+ return f"<{len(value)} bytes>"
729
+ if isinstance(value, datetime):
730
+ return value.isoformat()
731
+ return str(value)
732
+
733
def _format_sample_content(
    self,
    table_ref: TableRef,
    column_names: list[str],
    rows: list[tuple[Any, ...]],
    row_offset: int = 0,
) -> tuple[str, str]:
    """Format sampled rows through the shared tabular formatter.

    Args:
        table_ref: Object the rows were read from.
        column_names: Names of the columns in each row.
        rows: Sampled rows.
        row_offset: Passed through to the formatter unchanged.

    Returns:
        Whatever ``format_tabular_sample_content`` returns for these
        inputs (a pair of strings, per the annotation).
    """
    sampling = self._sampling()
    qualified_name = f"{table_ref.database}.{table_ref.schema}.{table_ref.table}"
    location = {
        "database": table_ref.database,
        "schema": table_ref.schema,
        "table": table_ref.table,
    }
    # Column names are included unless explicitly switched off.
    with_column_names = sampling.include_column_names is not False

    return format_tabular_sample_content(
        scope_label="table",
        scope_value=qualified_name,
        strategy=sampling.strategy,
        rows=rows,
        column_names=column_names,
        serialize_cell=self._serialize_cell,
        include_column_names=with_column_names,
        object_type=table_ref.object_type,
        row_offset=row_offset,
        raw_metadata=location,
    )
757
+
758
+ def _fetch_one_page(
759
+ self, table_ref: TableRef, base_query: str, page_size: int, offset: int
760
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
761
+ with closing(self._connect(table_ref.database)) as conn:
762
+ paginated_query = (
763
+ f"{base_query} ORDER BY (SELECT NULL) "
764
+ f"OFFSET {offset} ROWS FETCH NEXT {page_size} ROWS ONLY"
765
+ )
766
+ with conn.cursor() as cursor:
767
+ cursor.execute(paginated_query)
768
+ rows = list(cursor.fetchall())
769
+ column_names = (
770
+ [desc[0] for desc in cursor.description] if cursor.description else []
771
+ )
772
+ return rows, column_names
773
+
774
+ def _fetch_one_page_on_conn(
775
+ self,
776
+ conn: Any,
777
+ base_query: str,
778
+ page_size: int,
779
+ offset: int,
780
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
781
+ paginated_query = (
782
+ f"{base_query} ORDER BY (SELECT NULL) "
783
+ f"OFFSET {offset} ROWS FETCH NEXT {page_size} ROWS ONLY"
784
+ )
785
+ with conn.cursor() as cursor:
786
+ cursor.execute(paginated_query)
787
+ rows = list(cursor.fetchall())
788
+ column_names = [desc[0] for desc in cursor.description] if cursor.description else []
789
+ return rows, column_names
790
+
791
+ @staticmethod
792
+ def _cursor_execute(cursor: Any, query: str) -> list[str]:
793
+ cursor.execute(query)
794
+ return [desc[0] for desc in cursor.description] if cursor.description else []
795
+
796
+ @staticmethod
797
+ def _cursor_fetchmany(cursor: Any, size: int) -> list[tuple[Any, ...]]:
798
+ return list(cursor.fetchmany(size))
799
+
800
+ def _fetch_page_keyset(
801
+ self,
802
+ conn: Any,
803
+ base_query: str,
804
+ page_size: int,
805
+ pk_columns: list[str],
806
+ pk_order: str,
807
+ last_pk_values: list[Any] | None,
808
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
809
+ """Fetch one page using keyset pagination — O(1) cost at any offset."""
810
+ if last_pk_values is None:
811
+ paginated_query = (
812
+ f"{base_query} ORDER BY {pk_order} OFFSET 0 ROWS FETCH NEXT {page_size} ROWS ONLY"
813
+ )
814
+ params: list[Any] = []
815
+ elif len(pk_columns) == 1:
816
+ where = f"WHERE {_quote_identifier(pk_columns[0])} > ?"
817
+ paginated_query = (
818
+ f"{base_query} {where} ORDER BY {pk_order} "
819
+ f"OFFSET 0 ROWS FETCH NEXT {page_size} ROWS ONLY"
820
+ )
821
+ params = [last_pk_values[0]]
822
+ else:
823
+ # Composite PK: expanded OR form for broad MSSQL compatibility
824
+ conditions = []
825
+ params = []
826
+ for i in range(len(pk_columns)):
827
+ eq_parts = " AND ".join(f"{_quote_identifier(pk_columns[j])} = ?" for j in range(i))
828
+ gt_part = f"{_quote_identifier(pk_columns[i])} > ?"
829
+ if eq_parts:
830
+ conditions.append(f"({eq_parts} AND {gt_part})")
831
+ params.extend(last_pk_values[:i])
832
+ params.append(last_pk_values[i])
833
+ else:
834
+ conditions.append(f"({gt_part})")
835
+ params.append(last_pk_values[i])
836
+ where = "WHERE " + " OR ".join(conditions)
837
+ paginated_query = (
838
+ f"{base_query} {where} ORDER BY {pk_order} "
839
+ f"OFFSET 0 ROWS FETCH NEXT {page_size} ROWS ONLY"
840
+ )
841
+
842
+ with conn.cursor() as cursor:
843
+ cursor.execute(paginated_query, params if params else None)
844
+ rows = list(cursor.fetchall())
845
+ column_names = [desc[0] for desc in cursor.description] if cursor.description else []
846
+ return rows, column_names
847
+
848
def _fetch_sample_rows(
    self, table_ref: TableRef
) -> tuple[list[tuple[Any, ...]], list[str]] | None:
    """Fetch the sample rows for ``table_ref`` under the configured strategy.

    With the ``ALL`` strategy only the first page (``rows_per_page``,
    default 100) is read; otherwise the sampling query is run as built.

    Returns:
        ``(rows, column_names)``, or ``None`` when no column names came back.
    """
    columns = self._available_columns(table_ref)
    sampling = self._sampling()
    # NOTE(review): the params returned alongside the query are discarded and
    # the query is executed without them - confirm it never contains markers.
    query, _params = self._build_sampling_query(table_ref, columns)

    if sampling.strategy != SamplingStrategy.ALL:
        with closing(self._connect(table_ref.database)) as conn, conn.cursor() as cursor:
            cursor.execute(query)
            rows = cursor.fetchall()
            column_names = [desc[0] for desc in cursor.description or []]
    else:
        first_page_size = int(sampling.rows_per_page or 100)
        rows, column_names = self._fetch_one_page(table_ref, query, first_page_size, 0)

    return (rows, column_names) if column_names else None
868
+
869
def _sample_table_rows(self, table_ref: TableRef) -> tuple[str, str] | None:
    """Sample ``table_ref`` and return the formatted content, or ``None`` if nothing was fetched."""
    if (fetched := self._fetch_sample_rows(table_ref)) is None:
        return None
    sample_rows, headers = fetched
    return self._format_sample_content(table_ref, headers, sample_rows)
875
+
876
+ async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
877
+ cached = self._content_cache.get(asset_id)
878
+ if cached:
879
+ return cached
880
+
881
+ table_ref = self._parse_table_ref_from_asset_id(asset_id)
882
+ if not table_ref:
883
+ return None
884
+
885
+ sampled = self._sample_table_rows(table_ref)
886
+
887
+ if sampled is None:
888
+ return None
889
+
890
+ self._content_cache[asset_id] = sampled
891
+ return sampled
892
+
893
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield the rows of the asset one at a time as formatted content.

    For any strategy other than ``ALL`` the configured sample is fetched once
    and each row is yielded individually. For ``ALL`` the whole object is
    scanned in batches of ``rows_per_page`` (default 100):

    * tables whose primary key columns are all among the available columns
      are read with keyset pagination via ``_fetch_page_keyset``;
    * everything else is read by executing the query once and pulling
      batches with ``fetchmany``.

    The most recently yielded content is also stored in the content cache
    under ``asset_id``. Nothing is yielded when the asset id cannot be
    resolved. The scan stops early once ``self._aborted`` is set.
    """
    sampling = self._sampling()
    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    if not table_ref:
        return

    if sampling.strategy != SamplingStrategy.ALL:
        result = self._fetch_sample_rows(table_ref)
        if result is None:
            return
        rows, column_names = result
        for i, row in enumerate(rows):
            formatted = self._format_sample_content(
                table_ref, column_names, [row], row_offset=i
            )
            if formatted:
                self._content_cache[asset_id] = formatted
                yield formatted
        return

    columns = self._available_columns(table_ref)
    query, _ = self._build_sampling_query(table_ref, columns)
    rows_per_page = int(sampling.rows_per_page or 100)
    table_label = f"{table_ref.database}.{table_ref.schema}.{table_ref.table}"

    total_rows = self._count_table_rows(table_ref)
    total_batches = ((total_rows + rows_per_page - 1) // rows_per_page) if total_rows else None
    if total_batches is not None:
        logger.info(
            "Full scan %s: %d rows, %d batches of %d",
            table_label,
            total_rows,
            total_batches,
            rows_per_page,
        )

    # Prefer keyset pagination with a PK-ordered query; fall back to a single
    # streamed cursor for views and for tables without a usable primary key.
    pk_columns = (
        self._get_primary_key_columns(table_ref) if table_ref.object_type == "TABLE" else []
    )
    pk_indices: list[int] = []
    pk_order = ""
    use_keyset = False
    if pk_columns:
        # Reuse the column list fetched above instead of asking for it again.
        indices = [columns.index(col) for col in pk_columns if col in columns]
        if len(indices) == len(pk_columns):
            pk_indices = indices
            pk_order = ", ".join(_quote_identifier(col) for col in pk_columns)
            use_keyset = True

    row_offset = 0
    page_num = 1
    last_pk_values: list[Any] | None = None

    conn = self._connect(table_ref.database)
    cursor = None
    try:
        if not use_keyset:
            # Created inside the try so a failure here still closes ``conn``.
            cursor = conn.cursor()
            # Streaming path: execute once, then fetchmany in a loop.
            column_names = await asyncio.to_thread(self._cursor_execute, cursor, query)
            if not column_names:
                return

        while not self._aborted:
            if total_batches is not None:
                logger.info("%s batch %d/%d", table_label, page_num, total_batches)

            if use_keyset:
                rows, column_names = await asyncio.to_thread(
                    self._fetch_page_keyset,
                    conn,
                    query,
                    rows_per_page,
                    pk_columns,
                    pk_order,
                    last_pk_values,
                )
            else:
                rows = await asyncio.to_thread(self._cursor_fetchmany, cursor, rows_per_page)

            if not rows or not column_names:
                break

            # Yield each row individually so the detection pipeline can start
            # processing rows while the next page is being fetched in a thread.
            for i, row in enumerate(rows):
                formatted = self._format_sample_content(
                    table_ref, column_names, [row], row_offset=row_offset + i
                )
                if formatted:
                    self._content_cache[asset_id] = formatted
                    yield formatted

            if use_keyset:
                # Seek position for the next page: PK values of the last row.
                last_row = rows[-1]
                last_pk_values = [last_row[idx] for idx in pk_indices]

            row_offset += len(rows)
            page_num += 1
            if len(rows) < rows_per_page:
                # A short page means the result set is exhausted.
                break
    finally:
        try:
            if cursor is not None:
                cursor.close()
        except Exception:
            # Best effort: a failing cursor close must not mask the original
            # error, but leave a trace instead of swallowing it silently.
            logger.debug("Ignoring error closing cursor for %s", table_label, exc_info=True)
        finally:
            conn.close()
1005
+
1006
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set ``finding.location`` to a tabular location inside the asset's table.

    Does nothing when ``asset.hash`` has no entry in the table lookup. The
    raw content comes from the content cache (``None`` when not cached) and
    the row/column hints from the finding's metadata.
    """
    del text_content  # not used by this implementation
    table_ref = self._table_lookup.get(asset.hash)
    if not table_ref:
        return

    base_path = f"{table_ref.database}.{table_ref.schema}.{table_ref.table}"
    cache_entry = self._content_cache.get(asset.hash)
    raw_content = None if not cache_entry else cache_entry[0]
    hints = finding.metadata or {}

    # Primary keys are only looked up for real tables.
    if table_ref.object_type == "TABLE":
        key_columns = self._get_primary_key_columns(table_ref)
    else:
        key_columns = []

    finding.location = build_tabular_location(
        raw_content=raw_content,
        matched_content=finding.matched_content,
        base_path=base_path,
        primary_key_columns=key_columns,
        row_index=hints.get("tabular_row_index"),
        column_name=hints.get("tabular_column_name"),
    )
1031
+
1032
def abort(self) -> None:
    """Log that the MSSQL extraction is being aborted, then delegate to the parent ``abort``."""
    logger.info("Aborting MSSQL extraction...")
    super().abort()