classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,797 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ import ssl as ssl_module
6
+ from collections.abc import AsyncGenerator
7
+ from contextlib import closing
8
+ from dataclasses import dataclass
9
+ from datetime import UTC, datetime
10
+ from typing import Any
11
+
12
+ from ...models.generated_input import (
13
+ MySQLInput,
14
+ MySQLOptionalConnection,
15
+ MySQLOptionalScope,
16
+ MySQLSSLMode,
17
+ SamplingConfig,
18
+ SamplingStrategy,
19
+ )
20
+ from ...models.generated_single_asset_scan_results import (
21
+ AssetType as OutputAssetType,
22
+ )
23
+ from ...models.generated_single_asset_scan_results import (
24
+ DetectionResult,
25
+ SingleAssetScanResults,
26
+ )
27
+ from ...utils.hashing import hash_id, unhash_id
28
+ from ..base import BaseSource
29
+ from ..dependencies import require_module
30
+ from ..tabular_utils import build_tabular_location, format_tabular_sample_content
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+ _DEFAULT_EXCLUDED_DATABASES = {
35
+ "information_schema",
36
+ "mysql",
37
+ "performance_schema",
38
+ "sys",
39
+ }
40
+
41
+
42
@dataclass(frozen=True)
class TableRef:
    """Immutable, hashable (database, table) pair identifying one table."""

    # Name of the database (schema) that owns the table.
    database: str
    # Unqualified table name inside ``database``.
    table: str
46
+
47
+
48
+ def _quote_identifier(identifier: str) -> str:
49
+ return f"`{identifier.replace('`', '``')}`"
50
+
51
+
52
+ class MySQLSource(BaseSource):
53
+ source_type = "mysql"
54
+
55
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe, load the optional MySQL driver and create caches.

    Args:
        recipe: Raw source recipe; validated into ``MySQLInput``.
        source_id: Forwarded to the base class.
        runner_id: Forwarded to the base class; ``"local-run"`` is stored on
            the instance when it is missing or empty.
    """
    super().__init__(recipe, source_id, runner_id)
    self.config = MySQLInput.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"
    # The driver is resolved through require_module rather than a top-level
    # import because the connector is an optional dependency group.
    self._pymysql = require_module(
        module_name="pymysql",
        source_name="MySQL",
        uv_groups=["mysql"],
        detail="The MySQL connector is optional.",
    )
    # asset hash -> table reference, filled while assets are extracted.
    self._table_lookup: dict[str, TableRef] = {}
    # asset id -> most recently formatted content for that asset.
    self._content_cache: dict[str, tuple[str, str]] = {}
    # (database, table) -> primary-key column names.
    self._pk_columns_cache: dict[tuple[str, str], list[str]] = {}
73
+
74
+ def _asset_type_value(self) -> str:
75
+ type_value = self.config.type
76
+ return type_value.value if hasattr(type_value, "value") else str(type_value)
77
+
78
+ def _sampling(self) -> SamplingConfig:
79
+ return self.config.sampling
80
+
81
+ def _connection_options(self) -> MySQLOptionalConnection:
82
+ if self.config.optional and self.config.optional.connection:
83
+ return self.config.optional.connection
84
+ return MySQLOptionalConnection()
85
+
86
+ def _scope_options(self) -> MySQLOptionalScope:
87
+ if self.config.optional and self.config.optional.scope:
88
+ return self.config.optional.scope
89
+ return MySQLOptionalScope()
90
+
91
+ def _username(self) -> str:
92
+ return self.config.masked.username
93
+
94
+ def _password(self) -> str:
95
+ return self.config.masked.password
96
+
97
def _build_ssl_kwargs(self, connection_options: MySQLOptionalConnection) -> dict[str, Any]:
    """Translate the configured SSL mode into connect() keyword arguments.

    Args:
        connection_options: Optional connection settings; a missing
            ``ssl_mode`` is treated as ``MySQLSSLMode.PREFERRED``.

    Returns:
        ``{"ssl_disabled": True}`` for DISABLED, ``{}`` for PREFERRED without
        a CA certificate, otherwise ``{"ssl": <ssl.SSLContext>}``.
    """
    mode = connection_options.ssl_mode or MySQLSSLMode.PREFERRED
    ca_pem = self.config.masked.ssl_ca

    if mode == MySQLSSLMode.DISABLED:
        return {"ssl_disabled": True}
    if mode == MySQLSSLMode.PREFERRED and not ca_pem:
        return {}

    context = ssl_module.create_default_context()
    if ca_pem:
        # Undo escaped newlines (JSON/env round-trips), unify CRLF and trim.
        pem_text = ca_pem.replace("\\n", "\n").replace("\r\n", "\n").strip()
        context.load_verify_locations(cadata=pem_text)

    verify_hostname = mode == MySQLSSLMode.VERIFY_IDENTITY
    verify_chain = verify_hostname or mode == MySQLSSLMode.VERIFY_CA
    # check_hostname is assigned first: the ssl module rejects CERT_NONE while
    # hostname checking is still enabled.
    context.check_hostname = verify_hostname
    if verify_chain:
        context.verify_mode = ssl_module.CERT_REQUIRED
    else:
        # REQUIRED, or PREFERRED with a CA cert: encrypt but do not verify.
        context.verify_mode = ssl_module.CERT_NONE

    return {"ssl": context}
125
+
126
+ def _connect(self, database: str | None = None):
127
+ connection_options = self._connection_options()
128
+ connect_kwargs: dict[str, Any] = {
129
+ "host": self.config.required.host,
130
+ "port": int(self.config.required.port),
131
+ "user": self._username(),
132
+ "password": self._password(),
133
+ "connect_timeout": int(connection_options.connect_timeout_seconds or 30),
134
+ }
135
+ if database:
136
+ connect_kwargs["database"] = database
137
+
138
+ connect_kwargs.update(self._build_ssl_kwargs(connection_options))
139
+
140
+ if connection_options.allow_public_key_retrieval:
141
+ connect_kwargs["allow_public_key_retrieval"] = True
142
+
143
+ connection = self._pymysql.connect(**connect_kwargs)
144
+ connection.autocommit(True)
145
+ return connection
146
+
147
+ def _excluded_databases(self) -> set[str]:
148
+ configured = self._scope_options().exclude_databases or []
149
+ excluded = {db.strip() for db in configured if db.strip()}
150
+ if not excluded:
151
+ excluded = set(_DEFAULT_EXCLUDED_DATABASES)
152
+ return excluded
153
+
154
+ def _resolve_databases(self) -> list[str]:
155
+ scope_options = self._scope_options()
156
+ include_all = bool(scope_options.include_all_databases)
157
+ configured_database = scope_options.database
158
+
159
+ if not include_all:
160
+ if configured_database:
161
+ return [configured_database]
162
+ raise ValueError(
163
+ "MySQL source requires optional.scope.database when include_all_databases is false. "
164
+ "Set optional.scope.database (e.g. 'app_db') or enable include_all_databases."
165
+ )
166
+
167
+ excluded = self._excluded_databases()
168
+ databases: list[str] = []
169
+ with closing(self._connect()) as conn:
170
+ with conn.cursor() as cursor:
171
+ cursor.execute("SHOW DATABASES")
172
+ for row in cursor.fetchall():
173
+ database_name = row[0] if isinstance(row, tuple) else None
174
+ if not isinstance(database_name, str) or not database_name:
175
+ continue
176
+ if database_name in excluded:
177
+ continue
178
+ databases.append(database_name)
179
+
180
+ if configured_database and configured_database not in databases:
181
+ databases.insert(0, configured_database)
182
+
183
+ return databases
184
+
185
+ def _table_allowlist(self) -> set[str]:
186
+ allowlist: set[str] = set()
187
+ include_tables = self._scope_options().include_tables or []
188
+ for item in include_tables:
189
+ normalized = item.strip().lower()
190
+ if normalized:
191
+ allowlist.add(normalized)
192
+ return allowlist
193
+
194
+ def _get_primary_key_columns(self, table_ref: TableRef) -> list[str]:
195
+ cache_key = (table_ref.database, table_ref.table)
196
+ if cache_key in self._pk_columns_cache:
197
+ return self._pk_columns_cache[cache_key]
198
+ try:
199
+ with self._connect(table_ref.database) as conn:
200
+ with conn.cursor() as cursor:
201
+ cursor.execute(
202
+ """
203
+ SELECT column_name
204
+ FROM information_schema.key_column_usage
205
+ WHERE constraint_name = 'PRIMARY'
206
+ AND table_schema = %s
207
+ AND table_name = %s
208
+ ORDER BY ordinal_position
209
+ """,
210
+ (table_ref.database, table_ref.table),
211
+ )
212
+ cols = [row[0] for row in cursor.fetchall() if isinstance(row[0], str)]
213
+ except Exception:
214
+ cols = []
215
+ self._pk_columns_cache[cache_key] = cols
216
+ return cols
217
+
218
+ def _list_tables_for_database(self, database: str) -> list[TableRef]:
219
+ table_allowlist = self._table_allowlist()
220
+ table_limit = self._scope_options().table_limit
221
+ limit = int(table_limit) if table_limit else None
222
+
223
+ tables: list[TableRef] = []
224
+ with closing(self._connect(database)) as conn:
225
+ with conn.cursor() as cursor:
226
+ cursor.execute(
227
+ """
228
+ SELECT table_name
229
+ FROM information_schema.tables
230
+ WHERE table_schema = %s
231
+ AND table_type = 'BASE TABLE'
232
+ ORDER BY table_name
233
+ """,
234
+ (database,),
235
+ )
236
+ for row in cursor.fetchall():
237
+ table_name = row[0] if isinstance(row, tuple) else None
238
+ if not isinstance(table_name, str) or not table_name:
239
+ continue
240
+
241
+ normalized_table = table_name.lower()
242
+ normalized_db_table = f"{database}.{table_name}".lower()
243
+ if (
244
+ table_allowlist
245
+ and normalized_table not in table_allowlist
246
+ and normalized_db_table not in table_allowlist
247
+ ):
248
+ continue
249
+
250
+ tables.append(TableRef(database=database, table=table_name))
251
+ if limit is not None and len(tables) >= limit:
252
+ break
253
+ return tables
254
+
255
+ def _iter_tables(self) -> list[TableRef]:
256
+ tables: list[TableRef] = []
257
+ for database in self._resolve_databases():
258
+ if self._aborted:
259
+ break
260
+ try:
261
+ tables.extend(self._list_tables_for_database(database))
262
+ except Exception as exc:
263
+ logger.warning("Skipping database %s due to listing error: %s", database, exc)
264
+ return tables
265
+
266
def test_connection(self) -> dict[str, Any]:
    """Check connectivity by resolving databases and running ``SELECT 1``.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status`` (``"SUCCESS"``
        or ``"FAILURE"``) and a human-readable ``message``. Errors are
        reported in the dict rather than raised.
    """
    logger.info("Testing connection to MySQL...")
    report = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        databases = self._resolve_databases()
        if not databases:
            raise ValueError("No databases available for scanning")
        # Probe the first resolved database with a trivial query.
        with closing(self._connect(databases[0])) as conn, conn.cursor() as cursor:
            cursor.execute("SELECT 1")
            cursor.fetchone()
    except Exception as exc:
        report["status"] = "FAILURE"
        report["message"] = f"Failed to connect to MySQL: {exc}"
    else:
        report["status"] = "SUCCESS"
        report["message"] = (
            f"Successfully connected to MySQL. Reachable databases: {len(databases)}."
        )

    return report
291
+
292
+ def _table_key(self, table_ref: TableRef) -> tuple[str, str]:
293
+ return (table_ref.database, table_ref.table)
294
+
295
+ def _table_raw_id(self, table_ref: TableRef) -> str:
296
+ return f"{table_ref.database}_#_{table_ref.table}"
297
+
298
+ def _collect_foreign_key_links(
299
+ self,
300
+ tables: list[TableRef],
301
+ ) -> dict[tuple[str, str], set[tuple[str, str]]]:
302
+ table_keys = {self._table_key(table_ref) for table_ref in tables}
303
+ by_database: dict[str, set[tuple[str, str]]] = {}
304
+ for table_ref in tables:
305
+ by_database.setdefault(table_ref.database, set()).add(self._table_key(table_ref))
306
+
307
+ links: dict[tuple[str, str], set[tuple[str, str]]] = {}
308
+ for database, scoped_keys in by_database.items():
309
+ try:
310
+ with closing(self._connect(database)) as conn:
311
+ with conn.cursor() as cursor:
312
+ cursor.execute(
313
+ """
314
+ SELECT
315
+ TABLE_SCHEMA AS source_database,
316
+ TABLE_NAME AS source_table,
317
+ REFERENCED_TABLE_SCHEMA AS target_database,
318
+ REFERENCED_TABLE_NAME AS target_table
319
+ FROM information_schema.KEY_COLUMN_USAGE
320
+ WHERE TABLE_SCHEMA = %s
321
+ AND REFERENCED_TABLE_SCHEMA IS NOT NULL
322
+ AND REFERENCED_TABLE_NAME IS NOT NULL
323
+ """,
324
+ (database,),
325
+ )
326
+ for source_db, source_table, target_db, target_table in cursor.fetchall():
327
+ source_key = (source_db, source_table)
328
+ target_key = (target_db, target_table)
329
+ if source_key not in scoped_keys:
330
+ continue
331
+ if target_key not in table_keys:
332
+ continue
333
+ links.setdefault(source_key, set()).add(target_key)
334
+ except Exception as exc:
335
+ logger.warning(
336
+ "Could not resolve foreign key links for database %s: %s",
337
+ database,
338
+ exc,
339
+ )
340
+
341
+ return links
342
+
343
def _table_to_asset(
    self, table_ref: TableRef, *, links: list[str] | None = None
) -> SingleAssetScanResults:
    """Build the scan-result asset record for one table.

    Args:
        table_ref: Table to describe.
        links: Hashes of related assets; an empty list is stored when omitted.

    Returns:
        A ``SingleAssetScanResults`` of type TABLE whose checksum is computed
        from the database name, table name and sampling strategy.
    """
    database, table = table_ref.database, table_ref.table
    asset_hash = self.generate_hash_id(self._table_raw_id(table_ref))
    required = self.config.required
    external_url = f"mysql://{required.host}:{required.port}/" f"{database}/{table}"

    # Only feeds the checksum; it is not attached to the asset itself.
    checksum_input = {
        "database": database,
        "table": table,
        "sampling": {
            "strategy": str(self._sampling().strategy),
        },
    }

    timestamp = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(checksum_input),
        name=f"{database}.{table}",
        external_url=external_url,
        links=links if links else [],
        asset_type=OutputAssetType.TABLE,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
375
+
376
+ STREAM_DETECTIONS = True
377
+
378
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield table assets in batches of at most ``BATCH_SIZE``.

    Every discovered table becomes one asset whose ``links`` are the hashes
    of the scanned tables it references through foreign keys, in sorted
    target order. Each asset's hash is recorded in ``_table_lookup``. The
    generator stops without flushing the pending batch once aborted.
    """
    if self._aborted:
        return

    tables = self._iter_tables()
    hash_for: dict[tuple[str, str], str] = {}
    for table_ref in tables:
        hash_for[self._table_key(table_ref)] = self.generate_hash_id(
            self._table_raw_id(table_ref)
        )
    fk_links = self._collect_foreign_key_links(tables)

    pending: list[SingleAssetScanResults] = []
    for table_ref in tables:
        if self._aborted:
            return

        targets = sorted(fk_links.get(self._table_key(table_ref), set()))
        link_hashes = [hash_for[target] for target in targets if target in hash_for]

        asset = self._table_to_asset(table_ref, links=link_hashes)
        self._table_lookup[asset.hash] = table_ref
        pending.append(asset)

        if len(pending) >= self.BATCH_SIZE:
            yield pending
            pending = []

    if pending:
        yield pending
411
+
412
def generate_hash_id(self, asset_id: str) -> str:
    """Hash *asset_id* together with this source's asset type value."""
    asset_type = self._asset_type_value()
    return hash_id(asset_type, asset_id)
414
+
415
+ def _parse_table_ref_from_asset_id(self, asset_id: str) -> TableRef | None:
416
+ if asset_id in self._table_lookup:
417
+ return self._table_lookup[asset_id]
418
+
419
+ decoded = asset_id
420
+ if "_#_" not in decoded:
421
+ try:
422
+ decoded = unhash_id(asset_id)
423
+ except Exception:
424
+ decoded = asset_id
425
+
426
+ parts = decoded.split("_#_")
427
+ if len(parts) >= 3 and parts[0].upper() == "MYSQL":
428
+ return TableRef(database=parts[-2], table=parts[-1])
429
+ if len(parts) >= 2:
430
+ return TableRef(database=parts[-2], table=parts[-1])
431
+ return None
432
+
433
+ def _available_columns(self, table_ref: TableRef) -> list[str]:
434
+ with closing(self._connect(table_ref.database)) as conn:
435
+ with conn.cursor() as cursor:
436
+ cursor.execute(
437
+ """
438
+ SELECT column_name
439
+ FROM information_schema.columns
440
+ WHERE table_schema = %s
441
+ AND table_name = %s
442
+ ORDER BY ordinal_position
443
+ """,
444
+ (table_ref.database, table_ref.table),
445
+ )
446
+ return [
447
+ row[0]
448
+ for row in cursor.fetchall()
449
+ if isinstance(row, tuple) and row and isinstance(row[0], str)
450
+ ]
451
+
452
+ def _resolve_latest_order_column(self, columns: list[str]) -> str | None:
453
+ sampling = self._sampling()
454
+ configured = sampling.order_by_column
455
+ if configured and configured in columns:
456
+ return configured
457
+
458
+ priority_candidates = (
459
+ "updated_at",
460
+ "modified_at",
461
+ "created_at",
462
+ "inserted_at",
463
+ "timestamp",
464
+ "ts",
465
+ "date",
466
+ )
467
+ for candidate in priority_candidates:
468
+ if candidate in columns:
469
+ return candidate
470
+ return None
471
+
472
+ def _build_sampling_query(
473
+ self, table_ref: TableRef, columns: list[str]
474
+ ) -> tuple[str, list[Any]]:
475
+ sampling = self._sampling()
476
+ if not columns:
477
+ raise ValueError(
478
+ f"Table {table_ref.database}.{table_ref.table} has no readable columns"
479
+ )
480
+
481
+ quoted_columns = ", ".join(_quote_identifier(column) for column in columns)
482
+ query = (
483
+ f"SELECT {quoted_columns} FROM "
484
+ f"{_quote_identifier(table_ref.database)}.{_quote_identifier(table_ref.table)}"
485
+ )
486
+
487
+ strategy = sampling.strategy
488
+ if strategy == SamplingStrategy.LATEST:
489
+ order_column = self._resolve_latest_order_column(columns)
490
+ if order_column:
491
+ query += f" ORDER BY {_quote_identifier(order_column)} DESC"
492
+ elif sampling.fallback_to_random is not False:
493
+ query += " ORDER BY RAND()"
494
+ elif strategy == SamplingStrategy.RANDOM:
495
+ query += " ORDER BY RAND()"
496
+ # SamplingStrategy.ALL: no ORDER BY, no LIMIT — paginated by fetch_content_pages
497
+
498
+ if strategy != SamplingStrategy.ALL:
499
+ query += " LIMIT %s"
500
+ return query, [int(sampling.rows_per_page or 100)]
501
+
502
+ return query, []
503
+
504
def _count_table_rows(self, table_ref: TableRef) -> int | None:
    """Return ``COUNT(*)`` for the table, or ``None`` when it cannot be read.

    Any error (connection, query, conversion) results in ``None``.
    """
    try:
        with closing(self._connect(table_ref.database)) as conn, conn.cursor() as cursor:
            qualified = ".".join(
                (_quote_identifier(table_ref.database), _quote_identifier(table_ref.table))
            )
            cursor.execute("SELECT COUNT(*) FROM " + qualified)
            first = cursor.fetchone()
            if not first:
                return None
            return int(first[0])
    except Exception:
        return None
515
+
516
+ def _serialize_cell(self, value: Any) -> str:
517
+ if value is None:
518
+ return "null"
519
+ if isinstance(value, memoryview):
520
+ value = value.tobytes()
521
+ if isinstance(value, (bytes, bytearray)):
522
+ return f"<{len(value)} bytes>"
523
+ if isinstance(value, datetime):
524
+ return value.isoformat()
525
+ return str(value)
526
+
527
def _format_sample_content(
    self,
    table_ref: TableRef,
    column_names: list[str],
    rows: list[tuple[Any, ...]],
    row_offset: int = 0,
) -> tuple[str, str]:
    """Format rows through the shared tabular formatter.

    Args:
        table_ref: Table the rows belong to.
        column_names: Names matching the positions in each row.
        rows: Rows to format.
        row_offset: Passed through to the formatter as ``row_offset``.

    Returns:
        Whatever ``format_tabular_sample_content`` returns for these rows.
    """
    options = self._sampling()
    qualified_name = f"{table_ref.database}.{table_ref.table}"
    table_metadata = {
        "database": table_ref.database,
        "table": table_ref.table,
    }
    # Column names are included unless the recipe explicitly sets the flag to False.
    with_column_names = options.include_column_names is not False
    return format_tabular_sample_content(
        scope_label="table",
        scope_value=qualified_name,
        strategy=options.strategy,
        rows=rows,
        column_names=column_names,
        serialize_cell=self._serialize_cell,
        include_column_names=with_column_names,
        raw_metadata=table_metadata,
        row_offset=row_offset,
    )
549
+
550
+ def _fetch_one_page(
551
+ self, table_ref: TableRef, base_query: str, page_size: int, offset: int
552
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
553
+ with closing(self._connect(table_ref.database)) as conn:
554
+ paginated_query = f"{base_query} LIMIT %s OFFSET %s"
555
+ with conn.cursor() as cursor:
556
+ cursor.execute(paginated_query, [page_size, offset])
557
+ rows = list(cursor.fetchall())
558
+ column_names = (
559
+ [desc[0] for desc in cursor.description] if cursor.description else []
560
+ )
561
+ return rows, column_names
562
+
563
+ def _fetch_one_page_on_conn(
564
+ self,
565
+ conn: Any,
566
+ base_query: str,
567
+ page_size: int,
568
+ offset: int,
569
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
570
+ paginated_query = f"{base_query} LIMIT %s OFFSET %s"
571
+ with conn.cursor() as cursor:
572
+ cursor.execute(paginated_query, [page_size, offset])
573
+ rows = list(cursor.fetchall())
574
+ column_names = [desc[0] for desc in cursor.description] if cursor.description else []
575
+ return rows, column_names
576
+
577
+ @staticmethod
578
+ def _cursor_execute(cursor: Any, query: str) -> list[str]:
579
+ cursor.execute(query)
580
+ return [desc[0] for desc in cursor.description] if cursor.description else []
581
+
582
+ @staticmethod
583
+ def _cursor_fetchmany(cursor: Any, size: int) -> list[tuple[Any, ...]]:
584
+ return list(cursor.fetchmany(size))
585
+
586
+ def _fetch_page_keyset(
587
+ self,
588
+ conn: Any,
589
+ base_query: str,
590
+ page_size: int,
591
+ pk_columns: list[str],
592
+ pk_order: str,
593
+ last_pk_values: list[Any] | None,
594
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
595
+ """Fetch one page using keyset pagination — O(1) cost at any offset."""
596
+ params: list[Any]
597
+ if last_pk_values is None:
598
+ paginated_query = f"{base_query} ORDER BY {pk_order} LIMIT %s"
599
+ params = [page_size]
600
+ elif len(pk_columns) == 1:
601
+ where = f"WHERE {_quote_identifier(pk_columns[0])} > %s"
602
+ paginated_query = f"{base_query} {where} ORDER BY {pk_order} LIMIT %s"
603
+ params = [last_pk_values[0], page_size]
604
+ else:
605
+ pk_cols_quoted = ", ".join(_quote_identifier(col) for col in pk_columns)
606
+ placeholders = ", ".join("%s" for _ in pk_columns)
607
+ where = f"WHERE ({pk_cols_quoted}) > ({placeholders})"
608
+ paginated_query = f"{base_query} {where} ORDER BY {pk_order} LIMIT %s"
609
+ params = [*last_pk_values, page_size]
610
+
611
+ with conn.cursor() as cursor:
612
+ cursor.execute(paginated_query, params)
613
+ rows = list(cursor.fetchall())
614
+ column_names = [desc[0] for desc in cursor.description] if cursor.description else []
615
+ return rows, column_names
616
+
617
def _fetch_sample_rows(
    self, table_ref: TableRef
) -> tuple[list[tuple[Any, ...]], list[str]] | None:
    """Fetch the sample rows for a table according to the sampling strategy.

    For ALL only the first page (``rows_per_page`` rows, default 100) is
    fetched; other strategies run the sampling query as built.

    Returns:
        ``(rows, column_names)``, or ``None`` when the query reported no
        column names.
    """
    columns = self._available_columns(table_ref)
    sampling = self._sampling()
    query, params = self._build_sampling_query(table_ref, columns)

    if sampling.strategy == SamplingStrategy.ALL:
        page_size = int(sampling.rows_per_page or 100)
        rows, names = self._fetch_one_page(table_ref, query, page_size, 0)
    else:
        bound = params if params else None
        with closing(self._connect(table_ref.database)) as conn, conn.cursor() as cursor:
            cursor.execute(query, bound)
            rows = cursor.fetchall()
            names = [entry[0] for entry in cursor.description or []]

    return (rows, names) if names else None
637
+
638
+ def _sample_table_rows(self, table_ref: TableRef) -> tuple[str, str] | None:
639
+ result = self._fetch_sample_rows(table_ref)
640
+ if result is None:
641
+ return None
642
+ rows, column_names = result
643
+ return self._format_sample_content(table_ref, column_names, rows)
644
+
645
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return the formatted sample content for an asset, using the cache.

    A truthy cached entry is returned as-is. Otherwise the table is resolved
    and sampled, and a successful sample is stored in the cache.

    Returns:
        The formatted content, or ``None`` when the asset id cannot be
        resolved or no sample is available.
    """
    hit = self._content_cache.get(asset_id)
    if hit:
        return hit

    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    if not table_ref:
        return None

    sampled = self._sample_table_rows(table_ref)
    if sampled is not None:
        self._content_cache[asset_id] = sampled
    return sampled
661
+
662
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield formatted content for one table, one row per yielded item.

    For sampling strategies other than ALL, the sample returned by
    ``_fetch_sample_rows`` is emitted row by row. For ALL, the table is read
    in pages of ``rows_per_page`` rows (default 100): with keyset pagination
    on the primary key when every primary-key column is among the selected
    columns, otherwise by executing the query once and draining it with
    ``fetchmany``. Page fetches run in worker threads via
    ``asyncio.to_thread``. Reading stops when the source is aborted.

    Args:
        asset_id: Asset id resolved through ``_parse_table_ref_from_asset_id``;
            nothing is yielded when it cannot be resolved.

    Yields:
        The value produced by ``_format_sample_content`` for each single row.
        In the ALL path each yielded value also replaces the asset's entry in
        ``_content_cache``.
    """
    sampling = self._sampling()
    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    if not table_ref:
        return

    if sampling.strategy != SamplingStrategy.ALL:
        result = self._fetch_sample_rows(table_ref)
        if result is None:
            return
        rows, column_names = result
        for i, row in enumerate(rows):
            formatted = self._format_sample_content(
                table_ref, column_names, [row], row_offset=i
            )
            if formatted:
                yield formatted
        return

    columns = self._available_columns(table_ref)
    query, _ = self._build_sampling_query(table_ref, columns)
    rows_per_page = int(sampling.rows_per_page or 100)
    table_label = f"{table_ref.database}.{table_ref.table}"

    total_rows = self._count_table_rows(table_ref)
    total_batches = ((total_rows + rows_per_page - 1) // rows_per_page) if total_rows else None
    if total_rows is not None and total_batches is not None:
        logger.info(
            "Full scan %s: %d rows, %d batches of %d",
            table_label,
            total_rows,
            total_batches,
            rows_per_page,
        )

    # Prefer keyset pagination with a PK-ordered query; fall back to a single
    # query drained with fetchmany for tables without a usable primary key.
    pk_columns = self._get_primary_key_columns(table_ref)
    pk_indices: list[int] = []
    pk_order = ""
    use_keyset = False
    if pk_columns:
        # Positions are taken from the same column list the SELECT was built
        # from, so they line up with the fetched rows without re-querying
        # the column metadata.
        indices = [columns.index(col) for col in pk_columns if col in columns]
        if len(indices) == len(pk_columns):
            pk_indices = indices
            pk_order = ", ".join(_quote_identifier(col) for col in pk_columns)
            use_keyset = True

    row_offset = 0
    page_num = 1
    last_pk_values: list[Any] | None = None
    column_names: list[str] = []

    conn = self._connect(table_ref.database)
    cursor = None
    try:
        if not use_keyset:
            # Created inside the try so the connection is still closed if
            # cursor creation fails.
            cursor = conn.cursor()
            # NOTE(review): whether this path actually streams depends on the
            # cursor class; a buffered cursor may load the whole result during
            # execute() — confirm against the driver documentation.
            column_names = await asyncio.to_thread(self._cursor_execute, cursor, query)
            if not column_names:
                return

        while not self._aborted:
            if total_batches is not None:
                logger.info("%s batch %d/%d", table_label, page_num, total_batches)

            if use_keyset:
                rows, column_names = await asyncio.to_thread(
                    self._fetch_page_keyset,
                    conn,
                    query,
                    rows_per_page,
                    pk_columns,
                    pk_order,
                    last_pk_values,
                )
            else:
                rows = await asyncio.to_thread(self._cursor_fetchmany, cursor, rows_per_page)

            if not rows or not column_names:
                break

            # Yield each row individually so the consumer can start processing
            # rows before the next page is fetched.
            for i, row in enumerate(rows):
                formatted = self._format_sample_content(
                    table_ref, column_names, [row], row_offset=row_offset + i
                )
                if formatted:
                    self._content_cache[asset_id] = formatted
                    yield formatted

            if use_keyset:
                # Remember the key of the last row as the next page's lower bound.
                last_row = rows[-1]
                last_pk_values = [last_row[index] for index in pk_indices]

            row_offset += len(rows)
            page_num += 1
            if len(rows) < rows_per_page:
                break
    finally:
        if cursor is not None:
            try:
                cursor.close()
            except Exception as exc:
                # Best-effort cleanup; the connection is closed right after.
                logger.debug("Ignoring error while closing cursor for %s: %s", table_label, exc)
        conn.close()
770
+
771
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set ``finding.location`` to a tabular location within the asset's table.

    Does nothing when the asset's hash is not present in ``_table_lookup``.
    The location is built from the cached raw content (if any), the matched
    content, the table name, the primary-key columns and the row/column hints
    found in the finding's metadata.
    """
    del text_content  # Unused.
    table_ref = self._table_lookup.get(asset.hash)
    if not table_ref:
        return

    cached_entry = self._content_cache.get(asset.hash)
    hints = finding.metadata or {}
    finding.location = build_tabular_location(
        raw_content=cached_entry[0] if cached_entry else None,
        matched_content=finding.matched_content,
        base_path=table_ref.table,
        primary_key_columns=self._get_primary_key_columns(table_ref),
        row_index=hints.get("tabular_row_index"),
        column_name=hints.get("tabular_column_name"),
    )
794
+
795
def abort(self) -> None:
    """Log the abort request, then delegate to the base class's ``abort``."""
    logger.info("Aborting MySQL extraction...")
    super().abort()