classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,774 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ from collections.abc import AsyncGenerator
6
+ from dataclasses import dataclass
7
+ from datetime import UTC, datetime
8
+ from typing import Any
9
+
10
+ from ...models.generated_input import (
11
+ PostgreSQLInput,
12
+ PostgreSQLOptionalConnection,
13
+ PostgreSQLOptionalScope,
14
+ SamplingConfig,
15
+ SamplingStrategy,
16
+ )
17
+ from ...models.generated_single_asset_scan_results import (
18
+ AssetType as OutputAssetType,
19
+ )
20
+ from ...models.generated_single_asset_scan_results import (
21
+ DetectionResult,
22
+ SingleAssetScanResults,
23
+ )
24
+ from ...utils.hashing import hash_id, unhash_id
25
+ from ..base import BaseSource
26
+ from ..dependencies import require_module
27
+ from ..tabular_utils import build_tabular_location, format_tabular_sample_content
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ _DEFAULT_EXCLUDED_SCHEMAS = {"information_schema", "pg_catalog", "pg_toast"}
32
+
33
+
34
@dataclass(frozen=True)
class TableRef:
    """Immutable (and therefore hashable) address of a single table."""

    # Database that contains the table.
    database: str
    # Schema (namespace) inside that database.
    schema: str
    # Unqualified table name.
    table: str
39
+
40
+
41
+ def _quote_identifier(identifier: str) -> str:
42
+ return '"' + identifier.replace('"', '""') + '"'
43
+
44
+
45
+ class PostgreSQLSource(BaseSource):
46
+ source_type = "postgresql"
47
+
48
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe, resolve the psycopg2 driver and set up caches.

    Args:
        recipe: Raw source recipe; validated into ``PostgreSQLInput``.
        source_id: Forwarded unchanged to the base class.
        runner_id: Runner identifier; ``"local-run"`` is used when falsy.
    """
    super().__init__(recipe, source_id, runner_id)
    self.config = PostgreSQLInput.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"

    # The driver is resolved through require_module rather than imported at
    # module level; the call is told the connector is optional and which uv
    # group provides it.
    self._psycopg2 = require_module(
        module_name="psycopg2",
        source_name="PostgreSQL",
        uv_groups=["postgresql"],
        detail="The PostgreSQL connector is optional.",
    )

    # asset hash -> table, filled while assets are emitted by extract_raw().
    self._table_lookup: dict[str, TableRef] = {}
    # asset id -> formatted content tuple most recently produced for it.
    self._content_cache: dict[str, tuple[str, str]] = {}
    # (database, schema, table) -> primary key column names.
    self._pk_columns_cache: dict[tuple[str, str, str], list[str]] = {}
66
+
67
+ def _asset_type_value(self) -> str:
68
+ type_value = self.config.type
69
+ return type_value.value if hasattr(type_value, "value") else str(type_value)
70
+
71
+ def _sampling(self) -> SamplingConfig:
72
+ return self.config.sampling
73
+
74
+ def _connection_options(self) -> PostgreSQLOptionalConnection:
75
+ if self.config.optional and self.config.optional.connection:
76
+ return self.config.optional.connection
77
+ return PostgreSQLOptionalConnection()
78
+
79
+ def _scope_options(self) -> PostgreSQLOptionalScope:
80
+ if self.config.optional and self.config.optional.scope:
81
+ return self.config.optional.scope
82
+ return PostgreSQLOptionalScope()
83
+
84
+ def _username(self) -> str:
85
+ return self.config.masked.username
86
+
87
+ def _password(self) -> str:
88
+ return self.config.masked.password
89
+
90
+ def _connect(self, database: str):
91
+ connection_options = self._connection_options()
92
+ connect_kwargs = {
93
+ "host": self.config.required.host,
94
+ "port": int(self.config.required.port),
95
+ "user": self._username(),
96
+ "password": self._password(),
97
+ "dbname": database,
98
+ "connect_timeout": int(connection_options.connect_timeout_seconds or 30),
99
+ "sslmode": str(connection_options.ssl_mode or "prefer"),
100
+ }
101
+ connection = self._psycopg2.connect(**connect_kwargs)
102
+ connection.autocommit = True
103
+ return connection
104
+
105
+ def _resolve_databases(self) -> list[str]:
106
+ scope_options = self._scope_options()
107
+ include_all = bool(scope_options.include_all_databases)
108
+ configured_database = scope_options.database
109
+
110
+ if not include_all:
111
+ # Default to "postgres" maintenance database when no explicit database is configured,
112
+ # so that connection tests can proceed and report actual auth/connectivity errors.
113
+ return [configured_database or "postgres"]
114
+
115
+ maintenance_database = scope_options.maintenance_database or "postgres"
116
+ databases: list[str] = []
117
+ with self._connect(maintenance_database) as conn:
118
+ with conn.cursor() as cursor:
119
+ cursor.execute(
120
+ """
121
+ SELECT datname
122
+ FROM pg_database
123
+ WHERE datistemplate = false
124
+ AND datallowconn = true
125
+ AND datname <> 'rdsadmin'
126
+ ORDER BY datname
127
+ """
128
+ )
129
+ for (database_name,) in cursor.fetchall():
130
+ if isinstance(database_name, str) and database_name:
131
+ databases.append(database_name)
132
+
133
+ if configured_database and configured_database not in databases:
134
+ databases.insert(0, configured_database)
135
+
136
+ if not databases:
137
+ return [maintenance_database]
138
+ return databases
139
+
140
+ def _table_allowlist(self) -> set[str]:
141
+ allowlist: set[str] = set()
142
+ include_tables = self._scope_options().include_tables or []
143
+ for item in include_tables:
144
+ normalized = item.strip().lower()
145
+ if normalized:
146
+ allowlist.add(normalized)
147
+ return allowlist
148
+
149
+ def _schema_allowlist(self) -> set[str] | None:
150
+ include_schemas = self._scope_options().include_schemas
151
+ if not include_schemas:
152
+ return None
153
+ return {schema.strip() for schema in include_schemas if schema.strip()}
154
+
155
+ def _schema_denylist(self) -> set[str]:
156
+ configured = self._scope_options().exclude_schemas or []
157
+ denylist = {schema.strip() for schema in configured if schema.strip()}
158
+ if not denylist:
159
+ denylist = set(_DEFAULT_EXCLUDED_SCHEMAS)
160
+ return denylist
161
+
162
+ def _get_primary_key_columns(self, table_ref: TableRef) -> list[str]:
163
+ cache_key = (table_ref.database, table_ref.schema, table_ref.table)
164
+ if cache_key in self._pk_columns_cache:
165
+ return self._pk_columns_cache[cache_key]
166
+ try:
167
+ with self._connect(table_ref.database) as conn:
168
+ with conn.cursor() as cursor:
169
+ cursor.execute(
170
+ """
171
+ SELECT kcu.column_name
172
+ FROM information_schema.table_constraints tc
173
+ JOIN information_schema.key_column_usage kcu
174
+ ON tc.constraint_name = kcu.constraint_name
175
+ AND tc.table_schema = kcu.table_schema
176
+ AND tc.table_name = kcu.table_name
177
+ WHERE tc.constraint_type = 'PRIMARY KEY'
178
+ AND tc.table_schema = %s
179
+ AND tc.table_name = %s
180
+ ORDER BY kcu.ordinal_position
181
+ """,
182
+ (table_ref.schema, table_ref.table),
183
+ )
184
+ cols = [row[0] for row in cursor.fetchall() if isinstance(row[0], str)]
185
+ except Exception:
186
+ cols = []
187
+ self._pk_columns_cache[cache_key] = cols
188
+ return cols
189
+
190
+ def _list_tables_for_database(self, database: str) -> list[TableRef]:
191
+ schema_allowlist = self._schema_allowlist()
192
+ schema_denylist = self._schema_denylist()
193
+ table_allowlist = self._table_allowlist()
194
+ table_limit = self._scope_options().table_limit
195
+ limit = int(table_limit) if table_limit else None
196
+
197
+ tables: list[TableRef] = []
198
+ with self._connect(database) as conn:
199
+ with conn.cursor() as cursor:
200
+ cursor.execute(
201
+ """
202
+ SELECT table_schema, table_name
203
+ FROM information_schema.tables
204
+ WHERE table_type = 'BASE TABLE'
205
+ ORDER BY table_schema, table_name
206
+ """
207
+ )
208
+ for schema_name, table_name in cursor.fetchall():
209
+ if not isinstance(schema_name, str) or not isinstance(table_name, str):
210
+ continue
211
+ if schema_name in schema_denylist:
212
+ continue
213
+ if schema_allowlist and schema_name not in schema_allowlist:
214
+ continue
215
+
216
+ schema_table = f"{schema_name}.{table_name}".lower()
217
+ db_schema_table = f"{database}.{schema_name}.{table_name}".lower()
218
+ if (
219
+ table_allowlist
220
+ and schema_table not in table_allowlist
221
+ and db_schema_table not in table_allowlist
222
+ ):
223
+ continue
224
+
225
+ tables.append(TableRef(database=database, schema=schema_name, table=table_name))
226
+ if limit is not None and len(tables) >= limit:
227
+ break
228
+ return tables
229
+
230
+ def _iter_tables(self) -> list[TableRef]:
231
+ tables: list[TableRef] = []
232
+ for database in self._resolve_databases():
233
+ if self._aborted:
234
+ break
235
+ try:
236
+ tables.extend(self._list_tables_for_database(database))
237
+ except Exception as exc:
238
+ logger.warning("Skipping database %s due to listing error: %s", database, exc)
239
+ return tables
240
+
241
def test_connection(self) -> dict[str, Any]:
    """Probe the server with ``SELECT 1`` and report the outcome.

    Returns:
        A dict with ``timestamp`` and ``source_type``, plus ``status``
        (``"SUCCESS"`` or ``"FAILURE"``) and a human-readable ``message``.
        Failures are reported in the dict, never raised.
    """
    logger.info("Testing connection to PostgreSQL...")
    result: dict[str, Any] = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        databases = self._resolve_databases()
        conn = self._connect(databases[0])
        try:
            # psycopg2's ``with conn`` only ends the transaction; close the
            # probe connection explicitly so it is not leaked.
            with conn, conn.cursor() as cursor:
                cursor.execute("SELECT 1")
                cursor.fetchone()
        finally:
            conn.close()
        result["status"] = "SUCCESS"
        result["message"] = (
            f"Successfully connected to PostgreSQL. Reachable databases: {len(databases)}."
        )
    except Exception as exc:
        result["status"] = "FAILURE"
        result["message"] = f"Failed to connect to PostgreSQL: {exc}"

    return result
263
+
264
+ def _table_key(self, table_ref: TableRef) -> tuple[str, str, str]:
265
+ return (table_ref.database, table_ref.schema, table_ref.table)
266
+
267
+ def _table_raw_id(self, table_ref: TableRef) -> str:
268
+ return f"{table_ref.database}_#_{table_ref.schema}_#_{table_ref.table}"
269
+
270
+ def _collect_foreign_key_links(
271
+ self,
272
+ tables: list[TableRef],
273
+ ) -> dict[tuple[str, str, str], set[tuple[str, str, str]]]:
274
+ by_database: dict[str, set[tuple[str, str, str]]] = {}
275
+ for table_ref in tables:
276
+ by_database.setdefault(table_ref.database, set()).add(self._table_key(table_ref))
277
+
278
+ links: dict[tuple[str, str, str], set[tuple[str, str, str]]] = {}
279
+ for database, scoped_keys in by_database.items():
280
+ try:
281
+ with self._connect(database) as conn:
282
+ with conn.cursor() as cursor:
283
+ cursor.execute(
284
+ """
285
+ SELECT
286
+ source_ns.nspname AS source_schema,
287
+ source_tbl.relname AS source_table,
288
+ target_ns.nspname AS target_schema,
289
+ target_tbl.relname AS target_table
290
+ FROM pg_constraint AS fk
291
+ JOIN pg_class AS source_tbl
292
+ ON source_tbl.oid = fk.conrelid
293
+ JOIN pg_namespace AS source_ns
294
+ ON source_ns.oid = source_tbl.relnamespace
295
+ JOIN pg_class AS target_tbl
296
+ ON target_tbl.oid = fk.confrelid
297
+ JOIN pg_namespace AS target_ns
298
+ ON target_ns.oid = target_tbl.relnamespace
299
+ WHERE fk.contype = 'f'
300
+ """
301
+ )
302
+ for (
303
+ source_schema,
304
+ source_table,
305
+ target_schema,
306
+ target_table,
307
+ ) in cursor.fetchall():
308
+ source_key = (database, source_schema, source_table)
309
+ target_key = (database, target_schema, target_table)
310
+ if source_key not in scoped_keys or target_key not in scoped_keys:
311
+ continue
312
+ links.setdefault(source_key, set()).add(target_key)
313
+ except Exception as exc:
314
+ logger.warning(
315
+ "Could not resolve foreign key links for database %s: %s",
316
+ database,
317
+ exc,
318
+ )
319
+
320
+ return links
321
+
322
def _table_to_asset(
    self, table_ref: TableRef, *, links: list[str] | None = None
) -> SingleAssetScanResults:
    """Build the scan-result asset record describing *table_ref*.

    Args:
        table_ref: Table to describe.
        links: Hashes of related assets; an empty list is used when omitted.

    The checksum is computed over the table's database/schema/table names
    and the configured sampling strategy.
    """
    database, schema, table = table_ref.database, table_ref.schema, table_ref.table
    required = self.config.required

    asset_hash = self.generate_hash_id(self._table_raw_id(table_ref))
    external_url = f"postgresql://{required.host}:{required.port}/{database}/{schema}.{table}"
    metadata = {
        "database": database,
        "schema": schema,
        "table": table,
        "sampling": {"strategy": str(self._sampling().strategy)},
    }

    # A single timestamp is used for both created_at and updated_at.
    timestamp = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(metadata),
        name=f"{database}.{schema}.{table}",
        external_url=external_url,
        links=links or [],
        asset_type=OutputAssetType.TABLE,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
355
+
356
+ STREAM_DETECTIONS = True
357
+
358
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield one asset per in-scope table, in batches of ``BATCH_SIZE``.

    Each asset's ``links`` hold the hashes of in-scope tables it references
    by foreign key. Every emitted asset is remembered in ``_table_lookup``
    under its hash. If the source is aborted mid-way the generator stops
    without yielding the partially filled batch.
    """
    if self._aborted:
        return

    tables = self._iter_tables()
    hash_for_key: dict[tuple[str, str, str], str] = {}
    for ref in tables:
        hash_for_key[self._table_key(ref)] = self.generate_hash_id(self._table_raw_id(ref))
    fk_links = self._collect_foreign_key_links(tables)

    pending: list[SingleAssetScanResults] = []
    for ref in tables:
        if self._aborted:
            return

        # Sorted so the link order is deterministic across runs.
        targets = sorted(fk_links.get(self._table_key(ref), set()))
        linked_hashes = [hash_for_key[target] for target in targets if target in hash_for_key]

        asset = self._table_to_asset(ref, links=linked_hashes)
        self._table_lookup[asset.hash] = ref
        pending.append(asset)

        if len(pending) >= self.BATCH_SIZE:
            yield pending
            pending = []

    if pending:
        yield pending
391
+
392
def generate_hash_id(self, asset_id: str) -> str:
    """Hash *asset_id* together with this source's asset type value."""
    asset_type = self._asset_type_value()
    return hash_id(asset_type, asset_id)
394
+
395
+ def _parse_table_ref_from_asset_id(self, asset_id: str) -> TableRef | None:
396
+ if asset_id in self._table_lookup:
397
+ return self._table_lookup[asset_id]
398
+
399
+ decoded = asset_id
400
+ if "_#_" not in decoded:
401
+ try:
402
+ decoded = unhash_id(asset_id)
403
+ except Exception:
404
+ decoded = asset_id
405
+
406
+ parts = decoded.split("_#_")
407
+ if len(parts) >= 4 and parts[0].upper() == "POSTGRESQL":
408
+ return TableRef(database=parts[-3], schema=parts[-2], table=parts[-1])
409
+ if len(parts) >= 3:
410
+ return TableRef(database=parts[-3], schema=parts[-2], table=parts[-1])
411
+ return None
412
+
413
+ def _available_columns(self, table_ref: TableRef) -> list[str]:
414
+ with self._connect(table_ref.database) as conn:
415
+ with conn.cursor() as cursor:
416
+ cursor.execute(
417
+ """
418
+ SELECT column_name
419
+ FROM information_schema.columns
420
+ WHERE table_schema = %s AND table_name = %s
421
+ ORDER BY ordinal_position
422
+ """,
423
+ (table_ref.schema, table_ref.table),
424
+ )
425
+ columns = [column for (column,) in cursor.fetchall() if isinstance(column, str)]
426
+ return columns
427
+
428
+ def _resolve_latest_order_column(self, columns: list[str]) -> str | None:
429
+ sampling = self._sampling()
430
+ configured = sampling.order_by_column
431
+ if configured and configured in columns:
432
+ return configured
433
+
434
+ priority_candidates = (
435
+ "updated_at",
436
+ "modified_at",
437
+ "created_at",
438
+ "inserted_at",
439
+ "timestamp",
440
+ "ts",
441
+ "date",
442
+ )
443
+ for candidate in priority_candidates:
444
+ if candidate in columns:
445
+ return candidate
446
+ return None
447
+
448
def _build_sampling_query(
    self, table_ref: TableRef, columns: list[str]
) -> tuple[str, list[Any]]:
    """Build the SELECT statement (and its parameters) for the sampling strategy.

    LATEST orders by the resolved order column, or randomly when none is
    found and ``fallback_to_random`` is not explicitly ``False``. RANDOM
    orders randomly. Both are limited to ``rows_per_page`` rows (100 when
    unset). ALL gets neither ORDER BY nor LIMIT, since it is paginated by
    ``fetch_content_pages``.

    Raises:
        ValueError: If *columns* is empty.
    """
    sampling = self._sampling()
    if not columns:
        raise ValueError(
            f"Table {table_ref.database}.{table_ref.schema}.{table_ref.table} has no readable columns"
        )

    select_list = ", ".join(_quote_identifier(column) for column in columns)
    qualified_table = f"{_quote_identifier(table_ref.schema)}.{_quote_identifier(table_ref.table)}"
    strategy = sampling.strategy

    order_clause = ""
    if strategy == SamplingStrategy.LATEST:
        order_column = self._resolve_latest_order_column(columns)
        if order_column:
            order_clause = f" ORDER BY {_quote_identifier(order_column)} DESC NULLS LAST"
        elif sampling.fallback_to_random is not False:
            order_clause = " ORDER BY RANDOM()"
    elif strategy == SamplingStrategy.RANDOM:
        order_clause = " ORDER BY RANDOM()"

    query = f"SELECT {select_list} FROM {qualified_table}{order_clause}"
    if strategy == SamplingStrategy.ALL:
        return query, []
    return query + " LIMIT %s", [int(sampling.rows_per_page or 100)]
479
+
480
def _count_table_rows(self, table_ref: TableRef) -> int | None:
    """Return ``COUNT(*)`` for *table_ref*, or ``None`` if it cannot be determined.

    Best effort: any failure (connect, query, conversion) yields ``None``
    rather than an exception.
    """
    try:
        conn = self._connect(table_ref.database)
        try:
            # psycopg2's ``with conn`` only ends the transaction; close the
            # connection explicitly so it is not leaked.
            with conn, conn.cursor() as cursor:
                cursor.execute(
                    f"SELECT COUNT(*) FROM {_quote_identifier(table_ref.schema)}.{_quote_identifier(table_ref.table)}"
                )
                row = cursor.fetchone()
        finally:
            conn.close()
        return int(row[0]) if row else None
    except Exception as exc:
        # Previously swallowed silently; keep the fallback but leave a trace.
        logger.debug(
            "Could not count rows of %s.%s.%s: %s",
            table_ref.database,
            table_ref.schema,
            table_ref.table,
            exc,
        )
        return None
491
+
492
+ def _serialize_cell(self, value: Any) -> str:
493
+ if value is None:
494
+ return "null"
495
+ if isinstance(value, memoryview):
496
+ value = value.tobytes()
497
+ if isinstance(value, (bytes, bytearray)):
498
+ return f"<{len(value)} bytes>"
499
+ if isinstance(value, datetime):
500
+ return value.isoformat()
501
+ return str(value)
502
+
503
def _format_sample_content(
    self,
    table_ref: TableRef,
    column_names: list[str],
    rows: list[tuple[Any, ...]],
    row_offset: int = 0,
) -> tuple[str, str]:
    """Format *rows* of *table_ref* via the shared tabular formatter.

    Delegates to ``format_tabular_sample_content`` with this source's cell
    serializer and sampling settings. Column names are included unless
    ``include_column_names`` is explicitly ``False``.
    """
    sampling = self._sampling()
    database, schema, table = table_ref.database, table_ref.schema, table_ref.table
    table_metadata = {"database": database, "schema": schema, "table": table}
    with_column_names = sampling.include_column_names is not False
    return format_tabular_sample_content(
        scope_label="table",
        scope_value=f"{database}.{schema}.{table}",
        strategy=sampling.strategy,
        rows=rows,
        column_names=column_names,
        serialize_cell=self._serialize_cell,
        include_column_names=with_column_names,
        raw_metadata=table_metadata,
        row_offset=row_offset,
    )
526
+
527
+ def _fetch_one_page(
528
+ self, table_ref: TableRef, base_query: str, page_size: int, offset: int
529
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
530
+ with self._connect(table_ref.database) as conn:
531
+ paginated_query = f"{base_query} LIMIT %s OFFSET %s"
532
+ with conn.cursor() as cursor:
533
+ cursor.execute(paginated_query, [page_size, offset])
534
+ rows = list(cursor.fetchall())
535
+ column_names = (
536
+ [desc[0] for desc in cursor.description] if cursor.description else []
537
+ )
538
+ return rows, column_names
539
+
540
+ def _fetch_one_page_on_conn(
541
+ self,
542
+ conn: Any,
543
+ base_query: str,
544
+ page_size: int,
545
+ offset: int,
546
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
547
+ paginated_query = f"{base_query} LIMIT %s OFFSET %s"
548
+ with conn.cursor() as cursor:
549
+ cursor.execute(paginated_query, [page_size, offset])
550
+ rows = list(cursor.fetchall())
551
+ column_names = [desc[0] for desc in cursor.description] if cursor.description else []
552
+ return rows, column_names
553
+
554
+ @staticmethod
555
+ def _cursor_execute(cursor: Any, query: str) -> list[str]:
556
+ cursor.execute(query)
557
+ return [desc[0] for desc in cursor.description] if cursor.description else []
558
+
559
+ @staticmethod
560
+ def _cursor_fetchmany(cursor: Any, size: int) -> list[tuple[Any, ...]]:
561
+ return list(cursor.fetchmany(size))
562
+
563
+ def _fetch_page_keyset(
564
+ self,
565
+ conn: Any,
566
+ base_query: str,
567
+ page_size: int,
568
+ pk_columns: list[str],
569
+ pk_order: str,
570
+ last_pk_values: list[Any] | None,
571
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
572
+ """Fetch one page using keyset pagination — O(1) cost at any offset."""
573
+ params: list[Any]
574
+ if last_pk_values is None:
575
+ paginated_query = f"{base_query} ORDER BY {pk_order} LIMIT %s"
576
+ params = [page_size]
577
+ elif len(pk_columns) == 1:
578
+ where = f"WHERE {_quote_identifier(pk_columns[0])} > %s"
579
+ paginated_query = f"{base_query} {where} ORDER BY {pk_order} LIMIT %s"
580
+ params = [last_pk_values[0], page_size]
581
+ else:
582
+ pk_cols_quoted = ", ".join(_quote_identifier(col) for col in pk_columns)
583
+ placeholders = ", ".join("%s" for _ in pk_columns)
584
+ where = f"WHERE ({pk_cols_quoted}) > ({placeholders})"
585
+ paginated_query = f"{base_query} {where} ORDER BY {pk_order} LIMIT %s"
586
+ params = [*last_pk_values, page_size]
587
+
588
+ with conn.cursor() as cursor:
589
+ cursor.execute(paginated_query, params)
590
+ rows = list(cursor.fetchall())
591
+ column_names = [desc[0] for desc in cursor.description] if cursor.description else []
592
+ return rows, column_names
593
+
594
def _fetch_sample_rows(
    self, table_ref: TableRef
) -> tuple[list[tuple[Any, ...]], list[str]] | None:
    """Fetch the sample rows of *table_ref* according to the sampling strategy.

    For the ALL strategy only the first page (``rows_per_page`` rows, 100
    when unset) is fetched; other strategies run the sampling query as
    built, which already carries its own LIMIT.

    Returns:
        ``(rows, column_names)``, or ``None`` when the query reported no
        columns.
    """
    columns = self._available_columns(table_ref)
    sampling = self._sampling()
    query, params = self._build_sampling_query(table_ref, columns)

    if sampling.strategy == SamplingStrategy.ALL:
        rows_per_page = int(sampling.rows_per_page or 100)
        rows, column_names = self._fetch_one_page(table_ref, query, rows_per_page, 0)
    else:
        conn = self._connect(table_ref.database)
        try:
            # psycopg2's ``with conn`` only ends the transaction; close the
            # connection explicitly so it is not leaked.
            with conn, conn.cursor() as cursor:
                cursor.execute(query, params if params else None)
                # Materialize as a list so both branches return the same type.
                rows = list(cursor.fetchall())
                column_names = [desc[0] for desc in cursor.description or []]
        finally:
            conn.close()

    if not column_names:
        return None
    return rows, column_names
614
+
615
+ def _sample_table_rows(self, table_ref: TableRef) -> tuple[str, str] | None:
616
+ result = self._fetch_sample_rows(table_ref)
617
+ if result is None:
618
+ return None
619
+ rows, column_names = result
620
+ return self._format_sample_content(table_ref, column_names, rows)
621
+
622
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return the formatted sample content for *asset_id*, using the cache.

    A cached entry is returned directly. Otherwise the asset id is resolved
    to a table, the table is sampled, and the result is cached. Returns
    ``None`` when the id cannot be resolved or sampling yields nothing.
    """
    hit = self._content_cache.get(asset_id)
    if hit:
        return hit

    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    if not table_ref:
        return None

    sampled = self._sample_table_rows(table_ref)
    if sampled is not None:
        self._content_cache[asset_id] = sampled
    return sampled
638
+
639
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield the formatted content of a table one row at a time.

    For strategies other than ALL the regular sample is fetched once and
    each of its rows is yielded individually.

    For ALL the whole table is walked on a single connection, in pages of
    ``rows_per_page`` rows (100 when unset):

    * keyset pagination ordered by primary key when the table has one and
      all its columns are readable;
    * otherwise one query whose rows are consumed with ``fetchmany``.

    Each yielded row also overwrites ``_content_cache[asset_id]``, so the
    cache holds the most recently yielded row. Iteration stops when the
    source is aborted.
    """
    sampling = self._sampling()
    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    if not table_ref:
        return

    if sampling.strategy != SamplingStrategy.ALL:
        result = self._fetch_sample_rows(table_ref)
        if result is None:
            return
        rows, column_names = result
        for i, row in enumerate(rows):
            formatted = self._format_sample_content(
                table_ref, column_names, [row], row_offset=i
            )
            if formatted:
                yield formatted
        return

    columns = self._available_columns(table_ref)
    query, _ = self._build_sampling_query(table_ref, columns)
    rows_per_page = int(sampling.rows_per_page or 100)
    table_label = f"{table_ref.database}.{table_ref.schema}.{table_ref.table}"

    total_rows = self._count_table_rows(table_ref)
    total_batches = ((total_rows + rows_per_page - 1) // rows_per_page) if total_rows else None
    if total_rows is not None and total_batches is not None:
        logger.info(
            "Full scan %s: %d rows, %d batches of %d",
            table_label,
            total_rows,
            total_batches,
            rows_per_page,
        )

    # Prefer keyset pagination with a PK-ordered query; fall back to a
    # single query consumed with fetchmany for tables without a usable
    # primary key.
    pk_columns = self._get_primary_key_columns(table_ref)
    pk_indices: list[int] = []
    pk_order = ""
    use_keyset = False
    # The SELECT list is built from ``columns``, so positions in ``columns``
    # are positions in each result row; no second metadata query is needed.
    if pk_columns and all(col in columns for col in pk_columns):
        pk_indices = [columns.index(col) for col in pk_columns]
        pk_order = ", ".join(_quote_identifier(col) for col in pk_columns)
        use_keyset = True

    row_offset = 0
    page_num = 1
    last_pk_values: list[Any] | None = None

    conn = self._connect(table_ref.database)
    cursor = None
    try:
        if not use_keyset:
            # Created inside the try so the connection is closed even if
            # cursor creation or the initial execute fails.
            # NOTE(review): a default (unnamed) psycopg2 cursor transfers the
            # whole result set to the client on execute(); confirm whether a
            # named (server-side) cursor is wanted for very large tables.
            cursor = conn.cursor()
            column_names = await asyncio.to_thread(self._cursor_execute, cursor, query)
            if not column_names:
                return

        while not self._aborted:
            if total_batches is not None:
                logger.info("%s batch %d/%d", table_label, page_num, total_batches)

            if use_keyset:
                rows, column_names = await asyncio.to_thread(
                    self._fetch_page_keyset,
                    conn,
                    query,
                    rows_per_page,
                    pk_columns,
                    pk_order,
                    last_pk_values,
                )
            else:
                rows = await asyncio.to_thread(self._cursor_fetchmany, cursor, rows_per_page)

            if not rows or not column_names:
                break

            # Yield each row individually so the detection pipeline can start
            # processing rows while the next page is being fetched in a thread.
            for i, row in enumerate(rows):
                formatted = self._format_sample_content(
                    table_ref, column_names, [row], row_offset=row_offset + i
                )
                if formatted:
                    self._content_cache[asset_id] = formatted
                    yield formatted

            if use_keyset:
                # Remember the key of the last row as the next page's lower bound.
                last_row = rows[-1]
                last_pk_values = [last_row[index] for index in pk_indices]

            row_offset += len(rows)
            page_num += 1
            if len(rows) < rows_per_page:
                # A short page means the table is exhausted.
                break
    finally:
        if cursor is not None:
            try:
                cursor.close()
            except Exception as exc:
                # Best effort: never let cursor cleanup mask the real error.
                logger.debug("Ignoring error while closing cursor for %s: %s", table_label, exc)
        conn.close()
747
+
748
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set ``finding.location`` for a finding on a table asset.

    Does nothing when the asset's hash is not in ``_table_lookup``. The
    location is built by ``build_tabular_location`` from the cached raw
    content for the asset (if any), the matched content, the table's
    primary key columns and the row/column hints in the finding's
    metadata. *text_content* is accepted but not used.
    """
    del text_content
    table_ref = self._table_lookup.get(asset.hash)
    if not table_ref:
        return

    cache_entry = self._content_cache.get(asset.hash)
    hints = finding.metadata or {}
    finding.location = build_tabular_location(
        raw_content=cache_entry[0] if cache_entry else None,
        matched_content=finding.matched_content,
        base_path=f"{table_ref.schema}.{table_ref.table}",
        primary_key_columns=self._get_primary_key_columns(table_ref),
        row_index=hints.get("tabular_row_index"),
        column_name=hints.get("tabular_column_name"),
    )
771
+
772
def abort(self) -> None:
    """Log the abort request, then delegate to the base class's ``abort``."""
    logger.info("Aborting PostgreSQL extraction...")
    super().abort()