classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,709 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ from collections.abc import AsyncGenerator
6
+ from contextlib import closing
7
+ from dataclasses import dataclass
8
+ from datetime import UTC, datetime
9
+ from typing import Any
10
+
11
+ from ...models.generated_input import (
12
+ HiveInput,
13
+ HiveOptionalConnection,
14
+ HiveOptionalScope,
15
+ SamplingConfig,
16
+ SamplingStrategy,
17
+ )
18
+ from ...models.generated_single_asset_scan_results import (
19
+ AssetType as OutputAssetType,
20
+ )
21
+ from ...models.generated_single_asset_scan_results import (
22
+ DetectionResult,
23
+ SingleAssetScanResults,
24
+ )
25
+ from ...utils.hashing import hash_id, unhash_id
26
+ from ..base import BaseSource
27
+ from ..dependencies import require_module
28
+ from ..tabular_utils import build_tabular_location, format_tabular_sample_content
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+ _DEFAULT_EXCLUDED_DATABASES = {"information_schema", "sys"}
33
+
34
+
35
@dataclass(frozen=True)
class TableRef:
    """Immutable, hashable reference to one Hive table or view."""

    # Name of the database that contains the object.
    database: str
    # Name of the table or view inside ``database``.
    table: str
    # Object kind; this module only ever stores "TABLE" or "VIEW" here.
    object_type: str
40
+
41
+
42
+ def _quote_identifier(identifier: str) -> str:
43
+ return f"`{identifier.replace('`', '``')}`"
44
+
45
+
46
+ class HiveSource(BaseSource):
47
+ source_type = "hive"
48
+
49
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe, load PyHive and set up per-instance caches.

    Args:
        recipe: Raw source recipe; validated with ``HiveInput.model_validate``.
        source_id: Passed through to the parent initialiser.
        runner_id: Runner identifier; a falsy value is replaced by "local-run".
    """
    super().__init__(recipe, source_id, runner_id)

    self.config = HiveInput.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"

    # PyHive is resolved through require_module rather than a top-level import.
    self._pyhive_hive = require_module(
        module_name="pyhive.hive",
        source_name="Hive",
        uv_groups=["hive"],
        detail="The Hive connector is optional.",
    )

    required = self.config.required
    self._host = required.host
    self._port = int(required.port)

    # asset hash -> TableRef, filled in while assets are emitted.
    self._table_lookup: dict[str, TableRef] = {}
    # asset id -> most recently formatted content tuple.
    self._content_cache: dict[str, tuple[str, str]] = {}
    # (database, table) -> "TABLE" / "VIEW".
    self._table_type_cache: dict[tuple[str, str], str] = {}
69
+
70
+ def _asset_type_value(self) -> str:
71
+ type_value = self.config.type
72
+ return type_value.value if hasattr(type_value, "value") else str(type_value)
73
+
74
+ def _sampling(self) -> SamplingConfig:
75
+ return self.config.sampling
76
+
77
+ def _connection_options(self) -> HiveOptionalConnection:
78
+ if self.config.optional and self.config.optional.connection:
79
+ return self.config.optional.connection
80
+ return HiveOptionalConnection()
81
+
82
+ def _scope_options(self) -> HiveOptionalScope:
83
+ if self.config.optional and self.config.optional.scope:
84
+ return self.config.optional.scope
85
+ return HiveOptionalScope()
86
+
87
+ def _username(self) -> str:
88
+ return self.config.masked.username
89
+
90
+ def _password(self) -> str:
91
+ return self.config.masked.password
92
+
93
+ def _connection_scheme(self) -> str:
94
+ connection = self._connection_options()
95
+ scheme = (
96
+ connection.scheme.value if hasattr(connection.scheme, "value") else connection.scheme
97
+ )
98
+ if not scheme:
99
+ return "hive"
100
+ return str(scheme)
101
+
102
+ def _connect(self, database: str | None = None):
103
+ connection_options = self._connection_options()
104
+ scope_options = self._scope_options()
105
+
106
+ target_database = database or scope_options.database or "default"
107
+ connect_kwargs: dict[str, Any] = {
108
+ "host": self._host,
109
+ "port": int(self._port),
110
+ "username": self._username(),
111
+ "password": self._password(),
112
+ "database": target_database,
113
+ }
114
+
115
+ scheme = self._connection_scheme().lower()
116
+ if scheme == "hive+http":
117
+ connect_kwargs.setdefault("scheme", "http")
118
+ elif scheme in {"hive+https", "databricks+pyhive"}:
119
+ connect_kwargs.setdefault("scheme", "https")
120
+
121
+ connect_args = connection_options.connect_args or {}
122
+ if isinstance(connect_args, dict):
123
+ connect_kwargs.update(connect_args)
124
+
125
+ hive_module = self._pyhive_hive
126
+ if not hasattr(hive_module, "connect"):
127
+ raise RuntimeError("PyHive module does not expose hive.connect")
128
+
129
+ return hive_module.connect(**connect_kwargs)
130
+
131
+ def _excluded_databases(self) -> set[str]:
132
+ configured = self._scope_options().exclude_databases or []
133
+ excluded = {name.strip() for name in configured if name.strip()}
134
+ if not excluded:
135
+ excluded = set(_DEFAULT_EXCLUDED_DATABASES)
136
+ return excluded
137
+
138
+ def _object_allowlist(self) -> set[str]:
139
+ include_objects = self._scope_options().include_objects or []
140
+ return {entry.strip().lower() for entry in include_objects if entry.strip()}
141
+
142
+ def _include_tables_enabled(self) -> bool:
143
+ return self._scope_options().include_tables is not False
144
+
145
+ def _include_views_enabled(self) -> bool:
146
+ return self._scope_options().include_views is not False
147
+
148
+ def _resolve_databases(self) -> list[str]:
149
+ scope_options = self._scope_options()
150
+ include_all = bool(scope_options.include_all_databases)
151
+ configured_database = scope_options.database
152
+
153
+ if not include_all:
154
+ if configured_database:
155
+ return [configured_database]
156
+ raise ValueError(
157
+ "Hive source requires optional.scope.database when include_all_databases is false. "
158
+ "Set optional.scope.database (e.g. 'default') or enable include_all_databases."
159
+ )
160
+
161
+ excluded = self._excluded_databases()
162
+ databases: list[str] = []
163
+
164
+ seed_database = configured_database or "default"
165
+ with closing(self._connect(seed_database)) as conn:
166
+ with conn.cursor() as cursor:
167
+ cursor.execute("SHOW DATABASES")
168
+ for row in cursor.fetchall():
169
+ database_name = row[0] if isinstance(row, tuple) and row else None
170
+ if not isinstance(database_name, str) or not database_name:
171
+ continue
172
+ if database_name in excluded:
173
+ continue
174
+ databases.append(database_name)
175
+
176
+ if configured_database and configured_database not in databases:
177
+ databases.insert(0, configured_database)
178
+
179
+ return databases
180
+
181
+ def _resolve_object_type(self, database: str, table: str) -> str:
182
+ cache_key = (database, table)
183
+ if cache_key in self._table_type_cache:
184
+ return self._table_type_cache[cache_key]
185
+
186
+ object_type = "TABLE"
187
+ try:
188
+ query = f"DESCRIBE FORMATTED {_quote_identifier(database)}.{_quote_identifier(table)}"
189
+ with closing(self._connect(database)) as conn:
190
+ with conn.cursor() as cursor:
191
+ cursor.execute(query)
192
+ for row in cursor.fetchall():
193
+ if not isinstance(row, tuple) or not row:
194
+ continue
195
+
196
+ field_name = row[0]
197
+ details = row[1] if len(row) > 1 else ""
198
+ if not isinstance(field_name, str):
199
+ continue
200
+
201
+ if field_name.strip().lower() == "table type:":
202
+ detail_text = str(details).strip().upper()
203
+ if "VIRTUAL_VIEW" in detail_text or "VIEW" in detail_text:
204
+ object_type = "VIEW"
205
+ break
206
+ except Exception:
207
+ object_type = "TABLE"
208
+
209
+ self._table_type_cache[cache_key] = object_type
210
+ return object_type
211
+
212
+ def _list_tables_for_database(self, database: str) -> list[TableRef]:
213
+ include_tables = self._include_tables_enabled()
214
+ include_views = self._include_views_enabled()
215
+ if not include_tables and not include_views:
216
+ return []
217
+
218
+ object_allowlist = self._object_allowlist()
219
+ table_limit = self._scope_options().table_limit
220
+ limit = int(table_limit) if table_limit else None
221
+
222
+ tables: list[TableRef] = []
223
+ with closing(self._connect(database)) as conn:
224
+ with conn.cursor() as cursor:
225
+ cursor.execute(f"SHOW TABLES IN {_quote_identifier(database)}")
226
+ for row in cursor.fetchall():
227
+ table_name = row[0] if isinstance(row, tuple) and row else None
228
+ if not isinstance(table_name, str) or not table_name:
229
+ continue
230
+
231
+ normalized_table = table_name.lower()
232
+ normalized_db_table = f"{database}.{table_name}".lower()
233
+ if (
234
+ object_allowlist
235
+ and normalized_table not in object_allowlist
236
+ and normalized_db_table not in object_allowlist
237
+ ):
238
+ continue
239
+
240
+ object_type = self._resolve_object_type(database, table_name)
241
+ if object_type == "VIEW" and not include_views:
242
+ continue
243
+ if object_type != "VIEW" and not include_tables:
244
+ continue
245
+
246
+ tables.append(
247
+ TableRef(
248
+ database=database,
249
+ table=table_name,
250
+ object_type=object_type,
251
+ )
252
+ )
253
+ if limit is not None and len(tables) >= limit:
254
+ break
255
+
256
+ return tables
257
+
258
+ def _iter_tables(self) -> list[TableRef]:
259
+ tables: list[TableRef] = []
260
+ for database in self._resolve_databases():
261
+ if self._aborted:
262
+ break
263
+ try:
264
+ tables.extend(self._list_tables_for_database(database))
265
+ except Exception as exc:
266
+ logger.warning("Skipping database %s due to listing error: %s", database, exc)
267
+ return tables
268
+
269
def test_connection(self) -> dict[str, Any]:
    """Probe the Hive endpoint and report the outcome.

    Resolves the databases to scan, then runs ``SELECT 1`` against the first
    one. No exception escapes: any failure is reported in the result.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status`` ("SUCCESS" or
        "FAILURE") and a human-readable ``message``.
    """
    logger.info("Testing connection to Hive...")
    result = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        databases = self._resolve_databases()
        if not databases:
            raise ValueError("No databases available for scanning")

        with closing(self._connect(databases[0])) as conn, conn.cursor() as cursor:
            cursor.execute("SELECT 1")
            cursor.fetchone()
    except Exception as exc:
        result.update(status="FAILURE", message=f"Failed to connect to Hive: {exc}")
    else:
        result.update(
            status="SUCCESS",
            message=f"Successfully connected to Hive. Reachable databases: {len(databases)}.",
        )

    return result
295
+
296
+ def _table_key(self, table_ref: TableRef) -> tuple[str, str]:
297
+ return (table_ref.database, table_ref.table)
298
+
299
+ def _table_raw_id(self, table_ref: TableRef) -> str:
300
+ return f"{table_ref.database}_#_{table_ref.table}"
301
+
302
+ def _collect_foreign_key_links(
303
+ self,
304
+ _tables: list[TableRef],
305
+ ) -> dict[tuple[str, str], set[tuple[str, str]]]:
306
+ return {}
307
+
308
def _table_to_asset(
    self,
    table_ref: TableRef,
    *,
    links: list[str] | None = None,
) -> SingleAssetScanResults:
    """Build the scan-result asset describing ``table_ref``.

    Args:
        table_ref: Table or view to describe.
        links: Hashes of linked assets; ``None`` is treated as no links.

    Returns:
        An asset whose hash derives from the raw table id and whose checksum
        covers the database, table, object type and sampling strategy.
    """
    database, table = table_ref.database, table_ref.table

    asset_hash = self.generate_hash_id(self._table_raw_id(table_ref))
    external_url = f"{self._connection_scheme()}://{self._host}:{self._port}/{database}/{table}"

    # Only these fields feed the checksum.
    metadata = {
        "database": database,
        "table": table,
        "object_type": table_ref.object_type,
        "sampling": {
            "strategy": str(self._sampling().strategy),
        },
    }

    # One timestamp for both fields so created_at == updated_at.
    timestamp = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(metadata),
        name=f"{database}.{table}",
        external_url=external_url,
        links=links or [],
        asset_type=OutputAssetType.TABLE,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
344
+
345
+ STREAM_DETECTIONS = True
346
+
347
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield discovered tables as assets, in batches of ``BATCH_SIZE``.

    Every emitted asset is also recorded in ``_table_lookup`` under its
    hash. If the source is aborted mid-way, the generator stops without
    yielding the partially filled batch.
    """
    if self._aborted:
        return

    tables = self._iter_tables()

    # Hash of every discovered table, so links can be resolved to hashes.
    hash_by_key: dict[tuple[str, str], str] = {}
    for ref in tables:
        hash_by_key[self._table_key(ref)] = self.generate_hash_id(self._table_raw_id(ref))

    fk_links = self._collect_foreign_key_links(tables)

    pending: list[SingleAssetScanResults] = []
    for ref in tables:
        if self._aborted:
            return

        # Sorted for a stable link order; targets outside this scan are dropped.
        targets = sorted(fk_links.get(self._table_key(ref), set()))
        linked_hashes = [hash_by_key[target] for target in targets if target in hash_by_key]

        asset = self._table_to_asset(ref, links=linked_hashes)
        self._table_lookup[asset.hash] = ref
        pending.append(asset)

        if len(pending) >= self.BATCH_SIZE:
            yield pending
            pending = []

    if pending:
        yield pending
380
+
381
def generate_hash_id(self, asset_id: str) -> str:
    """Return the hash of ``asset_id`` combined with this source's asset type."""
    asset_type = self._asset_type_value()
    return hash_id(asset_type, asset_id)
383
+
384
+ def _parse_table_ref_from_asset_id(self, asset_id: str) -> TableRef | None:
385
+ if asset_id in self._table_lookup:
386
+ return self._table_lookup[asset_id]
387
+
388
+ decoded = asset_id
389
+ if "_#_" not in decoded:
390
+ try:
391
+ decoded = unhash_id(asset_id)
392
+ except Exception:
393
+ decoded = asset_id
394
+
395
+ parts = decoded.split("_#_")
396
+ if len(parts) >= 3 and parts[0].upper() == "HIVE":
397
+ return TableRef(
398
+ database=parts[-2],
399
+ table=parts[-1],
400
+ object_type="TABLE",
401
+ )
402
+ if len(parts) >= 2:
403
+ return TableRef(
404
+ database=parts[-2],
405
+ table=parts[-1],
406
+ object_type="TABLE",
407
+ )
408
+ return None
409
+
410
def _available_columns(self, table_ref: TableRef) -> list[str]:
    """Return the column names of ``table_ref`` as reported by ``DESCRIBE``.

    Blank rows and ``#``-prefixed header rows are skipped. Names are
    de-duplicated, keeping the first occurrence: for partitioned tables Hive
    lists partition columns a second time after the "# Partition
    Information" header, which would otherwise put the same column into the
    generated SELECT list twice.
    """
    query = (
        f"DESCRIBE {_quote_identifier(table_ref.database)}.{_quote_identifier(table_ref.table)}"
    )
    with closing(self._connect(table_ref.database)) as conn, conn.cursor() as cursor:
        cursor.execute(query)
        columns: list[str] = []
        seen: set[str] = set()
        for row in cursor.fetchall():
            if not isinstance(row, tuple) or not row:
                continue
            column_name = row[0]
            if not isinstance(column_name, str):
                continue
            normalized = column_name.strip()
            if not normalized or normalized.startswith("#"):
                continue
            if normalized in seen:
                continue
            seen.add(normalized)
            columns.append(normalized)
        return columns
429
+
430
+ def _resolve_latest_order_column(self, columns: list[str]) -> str | None:
431
+ sampling = self._sampling()
432
+ configured = sampling.order_by_column
433
+ if configured and configured in columns:
434
+ return configured
435
+
436
+ priority_candidates = (
437
+ "updated_at",
438
+ "modified_at",
439
+ "created_at",
440
+ "inserted_at",
441
+ "timestamp",
442
+ "ts",
443
+ "date",
444
+ )
445
+ for candidate in priority_candidates:
446
+ if candidate in columns:
447
+ return candidate
448
+ return None
449
+
450
+ def _build_sampling_query(
451
+ self, table_ref: TableRef, columns: list[str]
452
+ ) -> tuple[str, list[Any]]:
453
+ sampling = self._sampling()
454
+ if not columns:
455
+ raise ValueError(
456
+ f"Table {table_ref.database}.{table_ref.table} has no readable columns"
457
+ )
458
+
459
+ quoted_columns = ", ".join(_quote_identifier(column) for column in columns)
460
+ from_expr = f"{_quote_identifier(table_ref.database)}.{_quote_identifier(table_ref.table)}"
461
+
462
+ strategy = sampling.strategy
463
+ if strategy == SamplingStrategy.ALL:
464
+ query = f"SELECT {quoted_columns} FROM {from_expr}"
465
+ return query, []
466
+
467
+ query = f"SELECT {quoted_columns} FROM {from_expr}"
468
+
469
+ if strategy == SamplingStrategy.LATEST:
470
+ order_column = self._resolve_latest_order_column(columns)
471
+ if order_column:
472
+ query += f" ORDER BY {_quote_identifier(order_column)} DESC"
473
+ elif sampling.fallback_to_random is not False:
474
+ query += " ORDER BY rand()"
475
+ elif strategy == SamplingStrategy.RANDOM:
476
+ query += " ORDER BY rand()"
477
+
478
+ query += f" LIMIT {int(sampling.rows_per_page or 100)}"
479
+ return query, []
480
+
481
+ def _count_table_rows(self, table_ref: TableRef) -> int | None:
482
+ try:
483
+ with closing(self._connect(table_ref.database)) as conn:
484
+ with conn.cursor() as cursor:
485
+ cursor.execute(
486
+ f"SELECT COUNT(*) FROM {_quote_identifier(table_ref.database)}.{_quote_identifier(table_ref.table)}"
487
+ )
488
+ row = cursor.fetchone()
489
+ return int(row[0]) if row else None
490
+ except Exception:
491
+ return None
492
+
493
+ def _serialize_cell(self, value: Any) -> str:
494
+ if value is None:
495
+ return "null"
496
+ if isinstance(value, memoryview):
497
+ value = value.tobytes()
498
+ if isinstance(value, (bytes, bytearray)):
499
+ return f"<{len(value)} bytes>"
500
+ if isinstance(value, datetime):
501
+ return value.isoformat()
502
+ return str(value)
503
+
504
def _format_sample_content(
    self,
    table_ref: TableRef,
    column_names: list[str],
    rows: list[tuple[Any, ...]],
    row_offset: int = 0,
) -> tuple[str, str]:
    """Format sampled rows through ``format_tabular_sample_content``.

    Args:
        table_ref: Table the rows came from; supplies label, object type and
            raw metadata.
        column_names: Column headers matching each row tuple.
        rows: Row tuples to format.
        row_offset: Forwarded unchanged to the formatter.

    Returns:
        The tuple produced by ``format_tabular_sample_content``.
    """
    sampling = self._sampling()
    database, table = table_ref.database, table_ref.table
    return format_tabular_sample_content(
        scope_label="table",
        scope_value=f"{database}.{table}",
        strategy=sampling.strategy,
        rows=rows,
        column_names=column_names,
        serialize_cell=self._serialize_cell,
        # Column names are included unless explicitly switched off.
        include_column_names=sampling.include_column_names is not False,
        object_type=table_ref.object_type,
        raw_metadata={
            "database": database,
            "table": table,
        },
        row_offset=row_offset,
    )
527
+
528
+ def _fetch_one_page(
529
+ self, table_ref: TableRef, base_query: str, page_size: int, offset: int
530
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
531
+ with closing(self._connect(table_ref.database)) as conn:
532
+ paginated_query = f"{base_query} LIMIT {page_size} OFFSET {offset}"
533
+ with conn.cursor() as cursor:
534
+ cursor.execute(paginated_query)
535
+ rows = list(cursor.fetchall())
536
+ column_names = (
537
+ [desc[0] for desc in cursor.description] if cursor.description else []
538
+ )
539
+ return rows, column_names
540
+
541
+ def _fetch_one_page_on_conn(
542
+ self,
543
+ conn: Any,
544
+ base_query: str,
545
+ page_size: int,
546
+ offset: int,
547
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
548
+ paginated_query = f"{base_query} LIMIT {page_size} OFFSET {offset}"
549
+ with conn.cursor() as cursor:
550
+ cursor.execute(paginated_query)
551
+ rows = list(cursor.fetchall())
552
+ column_names = [desc[0] for desc in cursor.description] if cursor.description else []
553
+ return rows, column_names
554
+
555
+ @staticmethod
556
+ def _cursor_execute(cursor: Any, query: str) -> list[str]:
557
+ cursor.execute(query)
558
+ return [desc[0] for desc in cursor.description] if cursor.description else []
559
+
560
+ @staticmethod
561
+ def _cursor_fetchmany(cursor: Any, size: int) -> list[tuple[Any, ...]]:
562
+ return list(cursor.fetchmany(size))
563
+
564
def _fetch_sample_rows(
    self, table_ref: TableRef
) -> tuple[list[tuple[Any, ...]], list[str]] | None:
    """Run the sampling query for ``table_ref`` and return its rows.

    For the ALL strategy only the first page (``rows_per_page``, 100 when
    unset) is fetched; other strategies execute the already-limited
    sampling query as is.

    Returns:
        ``(rows, column_names)``, or ``None`` when the result exposes no
        column names.
    """
    columns = self._available_columns(table_ref)
    sampling = self._sampling()
    query, _unused_params = self._build_sampling_query(table_ref, columns)

    if sampling.strategy == SamplingStrategy.ALL:
        page_size = int(sampling.rows_per_page or 100)
        rows, column_names = self._fetch_one_page(table_ref, query, page_size, 0)
    else:
        with closing(self._connect(table_ref.database)) as conn, conn.cursor() as cursor:
            cursor.execute(query)
            rows = cursor.fetchall()
            column_names = [column[0] for column in cursor.description or []]

    return (rows, column_names) if column_names else None
584
+
585
+ def _sample_table_rows(self, table_ref: TableRef) -> tuple[str, str] | None:
586
+ result = self._fetch_sample_rows(table_ref)
587
+ if result is None:
588
+ return None
589
+ rows, column_names = result
590
+ return self._format_sample_content(table_ref, column_names, rows)
591
+
592
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return sampled content for ``asset_id``, using the content cache.

    Returns:
        The cached or freshly sampled content tuple, or ``None`` when the
        asset id cannot be resolved to a table or no sample was produced.
    """
    cached = self._content_cache.get(asset_id)
    if cached:
        return cached

    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    sampled = self._sample_table_rows(table_ref) if table_ref else None
    if sampled is None:
        return None

    self._content_cache[asset_id] = sampled
    return sampled
608
+
609
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield formatted content for ``asset_id`` one row at a time.

    For every strategy except ALL, the sample is fetched in one go and then
    yielded row by row. For ALL, the full ``SELECT`` is executed once and
    rows are pulled with ``fetchmany`` in chunks of ``rows_per_page`` (100
    when unset) until the cursor is exhausted or the source is aborted.

    The content cache entry for ``asset_id`` is overwritten with each
    yielded row, so it always holds the most recently yielded one.

    Nothing is yielded when the asset id cannot be resolved to a table or
    the query exposes no columns.
    """
    sampling = self._sampling()
    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    if not table_ref:
        return

    if sampling.strategy != SamplingStrategy.ALL:
        result = self._fetch_sample_rows(table_ref)
        if result is None:
            return
        rows, column_names = result
        for i, row in enumerate(rows):
            formatted = self._format_sample_content(
                table_ref, column_names, [row], row_offset=i
            )
            self._content_cache[asset_id] = formatted
            yield formatted
        return

    columns = self._available_columns(table_ref)
    query, _ = self._build_sampling_query(table_ref, columns)
    rows_per_page = int(sampling.rows_per_page or 100)
    table_label = f"{table_ref.database}.{table_ref.table}"

    # The row count is only used for progress logging; a missing or zero
    # count simply disables the "batch i/n" messages.
    total_rows = self._count_table_rows(table_ref)
    total_batches = ((total_rows + rows_per_page - 1) // rows_per_page) if total_rows else None
    if total_rows is not None and total_batches is not None:
        logger.info(
            "Full scan %s: %d rows, %d batches of %d",
            table_label,
            total_rows,
            total_batches,
            rows_per_page,
        )

    # Stream rows via fetchmany on a single executed query rather than
    # re-issuing the query with a growing OFFSET for each page.
    row_offset = 0
    page_num = 1

    conn = self._connect(table_ref.database)
    try:
        # Created inside the try so the connection is closed even when
        # cursor creation itself fails.
        cursor = conn.cursor()
        try:
            column_names = await asyncio.to_thread(self._cursor_execute, cursor, query)
            if not column_names:
                return

            while not self._aborted:
                if total_batches is not None:
                    logger.info("%s batch %d/%d", table_label, page_num, total_batches)

                rows = await asyncio.to_thread(self._cursor_fetchmany, cursor, rows_per_page)
                if not rows:
                    break

                # Yield each row individually so the consumer can start work
                # before the whole chunk has been handed over.
                for i, row in enumerate(rows):
                    formatted = self._format_sample_content(
                        table_ref, column_names, [row], row_offset=row_offset + i
                    )
                    if formatted:
                        self._content_cache[asset_id] = formatted
                        yield formatted

                row_offset += len(rows)
                page_num += 1
                # A short chunk means the result set is exhausted.
                if len(rows) < rows_per_page:
                    break
        finally:
            try:
                cursor.close()
            except Exception as exc:
                # Closing is best-effort; the connection is still closed below.
                logger.debug("Ignoring error while closing Hive cursor: %s", exc)
    finally:
        conn.close()
683
+
684
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Attach a tabular location to ``finding`` in place.

    Does nothing when ``asset`` is not a table recorded in the table lookup.
    Otherwise ``finding.location`` is set from ``build_tabular_location``,
    using the cached raw content for the asset (if any) and the
    ``tabular_row_index`` / ``tabular_column_name`` metadata of the finding.

    Args:
        finding: Detection result to update.
        asset: Asset the finding belongs to.
        text_content: Unused.
    """
    del text_content

    table_ref = self._table_lookup.get(asset.hash)
    if not table_ref:
        return

    cached = self._content_cache.get(asset.hash)
    metadata = finding.metadata or {}
    finding.location = build_tabular_location(
        # First element of the cached content tuple, when there is one.
        raw_content=cached[0] if cached else None,
        matched_content=finding.matched_content,
        base_path=f"{table_ref.database}.{table_ref.table}",
        row_index=metadata.get("tabular_row_index"),
        column_name=metadata.get("tabular_column_name"),
    )
706
+
707
def abort(self) -> None:
    """Log the abort request, then delegate to the parent ``abort``."""
    logger.info("Aborting Hive extraction...")
    super().abort()