classifyre-cli 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. classifyre_cli-0.4.2.dist-info/METADATA +167 -0
  2. classifyre_cli-0.4.2.dist-info/RECORD +101 -0
  3. classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
  4. classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
  5. src/__init__.py +1 -0
  6. src/detectors/__init__.py +105 -0
  7. src/detectors/base.py +97 -0
  8. src/detectors/broken_links/__init__.py +3 -0
  9. src/detectors/broken_links/detector.py +280 -0
  10. src/detectors/config.py +59 -0
  11. src/detectors/content/__init__.py +0 -0
  12. src/detectors/custom/__init__.py +13 -0
  13. src/detectors/custom/detector.py +45 -0
  14. src/detectors/custom/runners/__init__.py +56 -0
  15. src/detectors/custom/runners/_base.py +177 -0
  16. src/detectors/custom/runners/_factory.py +51 -0
  17. src/detectors/custom/runners/_feature_extraction.py +138 -0
  18. src/detectors/custom/runners/_gliner2.py +324 -0
  19. src/detectors/custom/runners/_image_classification.py +98 -0
  20. src/detectors/custom/runners/_llm.py +22 -0
  21. src/detectors/custom/runners/_object_detection.py +107 -0
  22. src/detectors/custom/runners/_regex.py +147 -0
  23. src/detectors/custom/runners/_text_classification.py +109 -0
  24. src/detectors/custom/trainer.py +293 -0
  25. src/detectors/dependencies.py +109 -0
  26. src/detectors/pii/__init__.py +0 -0
  27. src/detectors/pii/detector.py +883 -0
  28. src/detectors/secrets/__init__.py +0 -0
  29. src/detectors/secrets/detector.py +399 -0
  30. src/detectors/threat/__init__.py +0 -0
  31. src/detectors/threat/code_security_detector.py +206 -0
  32. src/detectors/threat/yara_detector.py +177 -0
  33. src/main.py +608 -0
  34. src/models/generated_detectors.py +1296 -0
  35. src/models/generated_input.py +2732 -0
  36. src/models/generated_single_asset_scan_results.py +240 -0
  37. src/outputs/__init__.py +3 -0
  38. src/outputs/base.py +69 -0
  39. src/outputs/console.py +62 -0
  40. src/outputs/factory.py +156 -0
  41. src/outputs/file.py +83 -0
  42. src/outputs/rest.py +258 -0
  43. src/pipeline/__init__.py +7 -0
  44. src/pipeline/content_provider.py +26 -0
  45. src/pipeline/detector_pipeline.py +742 -0
  46. src/pipeline/parsed_content_provider.py +59 -0
  47. src/sandbox/__init__.py +5 -0
  48. src/sandbox/runner.py +145 -0
  49. src/sources/__init__.py +95 -0
  50. src/sources/atlassian_common.py +389 -0
  51. src/sources/azure_blob_storage/__init__.py +3 -0
  52. src/sources/azure_blob_storage/source.py +130 -0
  53. src/sources/base.py +296 -0
  54. src/sources/confluence/__init__.py +3 -0
  55. src/sources/confluence/source.py +733 -0
  56. src/sources/databricks/__init__.py +3 -0
  57. src/sources/databricks/source.py +1279 -0
  58. src/sources/dependencies.py +81 -0
  59. src/sources/google_cloud_storage/__init__.py +3 -0
  60. src/sources/google_cloud_storage/source.py +114 -0
  61. src/sources/hive/__init__.py +3 -0
  62. src/sources/hive/source.py +709 -0
  63. src/sources/jira/__init__.py +3 -0
  64. src/sources/jira/source.py +605 -0
  65. src/sources/mongodb/__init__.py +3 -0
  66. src/sources/mongodb/source.py +550 -0
  67. src/sources/mssql/__init__.py +3 -0
  68. src/sources/mssql/source.py +1034 -0
  69. src/sources/mysql/__init__.py +3 -0
  70. src/sources/mysql/source.py +797 -0
  71. src/sources/neo4j/__init__.py +0 -0
  72. src/sources/neo4j/source.py +523 -0
  73. src/sources/object_storage/base.py +679 -0
  74. src/sources/oracle/__init__.py +3 -0
  75. src/sources/oracle/source.py +982 -0
  76. src/sources/postgresql/__init__.py +3 -0
  77. src/sources/postgresql/source.py +774 -0
  78. src/sources/powerbi/__init__.py +3 -0
  79. src/sources/powerbi/source.py +774 -0
  80. src/sources/recipe_normalizer.py +179 -0
  81. src/sources/s3_compatible_storage/README.md +66 -0
  82. src/sources/s3_compatible_storage/__init__.py +3 -0
  83. src/sources/s3_compatible_storage/source.py +150 -0
  84. src/sources/servicedesk/__init__.py +3 -0
  85. src/sources/servicedesk/source.py +620 -0
  86. src/sources/slack/__init__.py +3 -0
  87. src/sources/slack/source.py +534 -0
  88. src/sources/snowflake/__init__.py +3 -0
  89. src/sources/snowflake/source.py +912 -0
  90. src/sources/tableau/__init__.py +3 -0
  91. src/sources/tableau/source.py +799 -0
  92. src/sources/tabular_utils.py +165 -0
  93. src/sources/wordpress/__init__.py +3 -0
  94. src/sources/wordpress/source.py +590 -0
  95. src/telemetry.py +96 -0
  96. src/utils/__init__.py +1 -0
  97. src/utils/content_extraction.py +108 -0
  98. src/utils/file_parser.py +777 -0
  99. src/utils/hashing.py +82 -0
  100. src/utils/uv_sync.py +79 -0
  101. src/utils/validation.py +56 -0
@@ -0,0 +1,982 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ from collections.abc import AsyncGenerator
6
+ from contextlib import closing
7
+ from dataclasses import dataclass
8
+ from datetime import UTC, datetime
9
+ from typing import Any
10
+
11
+ from ...models.generated_input import (
12
+ OracleInput,
13
+ OracleOptionalConnection,
14
+ OracleOptionalScope,
15
+ SamplingConfig,
16
+ SamplingStrategy,
17
+ )
18
+ from ...models.generated_single_asset_scan_results import (
19
+ AssetType as OutputAssetType,
20
+ )
21
+ from ...models.generated_single_asset_scan_results import (
22
+ DetectionResult,
23
+ SingleAssetScanResults,
24
+ )
25
+ from ...utils.hashing import hash_id, unhash_id
26
+ from ..base import BaseSource
27
+ from ..dependencies import require_module
28
+ from ..tabular_utils import build_tabular_location, format_tabular_sample_content
29
+
30
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Schema names skipped by default. OracleSource._schema_denylist falls back to
# a copy of this set only when the scope configures no exclude_schemas entries.
_DEFAULT_EXCLUDED_SCHEMAS = {
    "SYS",
    "SYSTEM",
    "DBSNMP",
    "WMSYS",
    "CTXSYS",
    "XDB",
    "MDSYS",
    "ORDSYS",
    "OUTLN",
    "ORDDATA",
}
44
+
45
+
46
@dataclass(frozen=True)
class ObjectRef:
    """Immutable (hashable) reference to one Oracle table or view."""

    # Service name the object was listed under.
    service_name: str
    # Owning schema; the listing code stores it upper-cased.
    schema: str
    # Object name exactly as returned by the catalog (case preserved).
    name: str
    # "TABLE" or "VIEW" as assigned by the listing code.
    object_type: str
52
+
53
+
54
+ def _quote_identifier(identifier: str) -> str:
55
+ return '"' + identifier.replace('"', '""') + '"'
56
+
57
+
58
+ class OracleSource(BaseSource):
59
+ source_type = "oracle"
60
+
61
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe, load the optional Oracle driver and reset caches.

    Args:
        recipe: Raw source recipe; validated into ``OracleInput``.
        source_id: Identifier forwarded to the base class.
        runner_id: Identifier of the current run; ``"local-run"`` when falsy.
    """
    super().__init__(recipe, source_id, runner_id)
    self.config = OracleInput.model_validate(recipe)
    self.runner_id = runner_id or "local-run"

    # The driver is resolved through require_module because the Oracle
    # connector is an optional dependency group.
    self._oracledb = require_module(
        module_name="oracledb",
        source_name="Oracle",
        uv_groups=["oracle"],
        detail="The Oracle connector is optional.",
    )

    required = self.config.required
    self._host = required.host
    self._port = int(required.port)
    self._service_name = required.service_name

    # asset hash -> object reference, filled while extracting assets.
    self._table_lookup: dict[str, ObjectRef] = {}
    # asset id -> last formatted content tuple produced for it.
    self._content_cache: dict[str, tuple[str, str]] = {}
    # (schema, name) -> primary key column names.
    self._pk_columns_cache: dict[tuple[str, str], list[str]] = {}
82
+
83
+ def _asset_type_value(self) -> str:
84
+ type_value = self.config.type
85
+ return type_value.value if hasattr(type_value, "value") else str(type_value)
86
+
87
+ def _sampling(self) -> SamplingConfig:
88
+ return self.config.sampling
89
+
90
+ def _connection_options(self) -> OracleOptionalConnection:
91
+ if self.config.optional and self.config.optional.connection:
92
+ return self.config.optional.connection
93
+ return OracleOptionalConnection()
94
+
95
+ def _scope_options(self) -> OracleOptionalScope:
96
+ if self.config.optional and self.config.optional.scope:
97
+ return self.config.optional.scope
98
+ return OracleOptionalScope()
99
+
100
+ def _username(self) -> str:
101
+ return self.config.masked.username
102
+
103
+ def _password(self) -> str:
104
+ return self.config.masked.password
105
+
106
+ def _dsn(self) -> str:
107
+ if hasattr(self._oracledb, "makedsn"):
108
+ return str(
109
+ self._oracledb.makedsn(
110
+ self._host,
111
+ int(self._port),
112
+ service_name=self._service_name,
113
+ )
114
+ )
115
+ return f"{self._host}:{self._port}/{self._service_name}"
116
+
117
+ def _connect(self):
118
+ connection_options = self._connection_options()
119
+ connect_kwargs: dict[str, Any] = {
120
+ "user": self._username(),
121
+ "password": self._password(),
122
+ "dsn": self._dsn(),
123
+ "tcp_connect_timeout": int(connection_options.connect_timeout_seconds or 30),
124
+ }
125
+
126
+ try:
127
+ return self._oracledb.connect(**connect_kwargs)
128
+ except TypeError:
129
+ connect_kwargs.pop("tcp_connect_timeout", None)
130
+ return self._oracledb.connect(**connect_kwargs)
131
+
132
+ def _schema_allowlist(self) -> set[str] | None:
133
+ configured = self._scope_options().include_schemas
134
+ if not configured:
135
+ return None
136
+ return {schema.strip().upper() for schema in configured if schema.strip()}
137
+
138
+ def _schema_denylist(self) -> set[str]:
139
+ configured = self._scope_options().exclude_schemas or []
140
+ denylist = {schema.strip().upper() for schema in configured if schema.strip()}
141
+ if not denylist:
142
+ denylist = set(_DEFAULT_EXCLUDED_SCHEMAS)
143
+ return denylist
144
+
145
+ def _object_allowlist(self) -> set[str]:
146
+ include_objects = self._scope_options().include_objects or []
147
+ return {entry.strip().lower() for entry in include_objects if entry.strip()}
148
+
149
+ def _include_tables_enabled(self) -> bool:
150
+ return self._scope_options().include_tables is not False
151
+
152
+ def _include_views_enabled(self) -> bool:
153
+ return self._scope_options().include_views is not False
154
+
155
+ def _include_view_lineage_enabled(self) -> bool:
156
+ scope = self._scope_options()
157
+ return bool(scope.include_view_lineage or scope.include_view_column_lineage)
158
+
159
def _list_objects(self) -> list[ObjectRef]:
    """List the tables and views that fall inside the configured scope.

    Tables are listed before views, each ordered by owner and name. A row is
    kept when its schema is not denied, is in the schema allowlist (if one is
    set) and, when an object allowlist is set, its ``schema.object`` or
    ``service.schema.object`` name (lower-cased) appears in it.

    Returns:
        The matching references, truncated as soon as ``table_limit`` objects
        (tables and views combined) have been collected.
    """
    # (object type label, catalog query) pairs, in the order they are run.
    catalog_queries: list[tuple[str, str]] = []
    if self._include_tables_enabled():
        catalog_queries.append(
            (
                "TABLE",
                """
                SELECT owner, table_name
                FROM all_tables
                ORDER BY owner, table_name
                """,
            )
        )
    if self._include_views_enabled():
        catalog_queries.append(
            (
                "VIEW",
                """
                SELECT owner, view_name
                FROM all_views
                ORDER BY owner, view_name
                """,
            )
        )
    if not catalog_queries:
        return []

    allowed_schemas = self._schema_allowlist()
    denied_schemas = self._schema_denylist()
    allowed_objects = self._object_allowlist()
    configured_limit = self._scope_options().table_limit
    max_objects = int(configured_limit) if configured_limit else None

    found: list[ObjectRef] = []

    with closing(self._connect()) as conn, conn.cursor() as cursor:
        for object_type, catalog_sql in catalog_queries:
            cursor.execute(catalog_sql)
            for row in cursor.fetchall():
                if not isinstance(row, tuple) or len(row) < 2:
                    continue
                owner, object_name = row[0], row[1]
                if not (isinstance(owner, str) and isinstance(object_name, str)):
                    continue

                schema_upper = owner.upper()
                if schema_upper in denied_schemas:
                    continue
                if allowed_schemas and schema_upper not in allowed_schemas:
                    continue

                if allowed_objects:
                    # An object may be allow-listed with or without the
                    # service name prefix.
                    scoped_name = f"{schema_upper}.{object_name}".lower()
                    service_scoped_name = (
                        f"{self._service_name}.{schema_upper}.{object_name}".lower()
                    )
                    if (
                        scoped_name not in allowed_objects
                        and service_scoped_name not in allowed_objects
                    ):
                        continue

                found.append(
                    ObjectRef(
                        service_name=self._service_name,
                        schema=schema_upper,
                        name=object_name,
                        object_type=object_type,
                    )
                )
                if max_objects is not None and len(found) >= max_objects:
                    return found

    return found
264
+
265
+ def _iter_objects(self) -> list[ObjectRef]:
266
+ try:
267
+ return self._list_objects()
268
+ except Exception as exc:
269
+ logger.warning("Oracle object listing failed: %s", exc)
270
+ return []
271
+
272
def test_connection(self) -> dict[str, Any]:
    """Probe the database and report the outcome as a dictionary.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status`` ("SUCCESS" or
        "FAILURE") and a human-readable ``message``. Exceptions are captured
        in the message rather than raised.
    """
    logger.info("Testing connection to Oracle...")
    outcome: dict[str, Any] = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        with closing(self._connect()) as conn, conn.cursor() as cursor:
            cursor.execute("SELECT 1 FROM dual")
            cursor.fetchone()

        # Listing failures are swallowed by _iter_objects, so a reachable
        # database with an unreadable catalog still reports SUCCESS with 0.
        reachable = self._iter_objects()
        outcome["status"] = "SUCCESS"
        outcome["message"] = (
            f"Successfully connected to Oracle. Reachable objects: {len(reachable)}."
        )
    except Exception as exc:
        outcome["status"] = "FAILURE"
        outcome["message"] = f"Failed to connect to Oracle: {exc}"

    return outcome
295
+
296
+ def _object_key(self, object_ref: ObjectRef) -> tuple[str, str]:
297
+ return (object_ref.schema, object_ref.name)
298
+
299
+ def _object_raw_id(self, object_ref: ObjectRef) -> str:
300
+ return f"{object_ref.service_name}_#_{object_ref.schema}_#_{object_ref.name}"
301
+
302
+ def _collect_foreign_key_links(
303
+ self,
304
+ objects: list[ObjectRef],
305
+ ) -> dict[tuple[str, str], set[tuple[str, str]]]:
306
+ table_keys = {
307
+ self._object_key(object_ref)
308
+ for object_ref in objects
309
+ if object_ref.object_type == "TABLE"
310
+ }
311
+ links: dict[tuple[str, str], set[tuple[str, str]]] = {}
312
+
313
+ if not table_keys:
314
+ return links
315
+
316
+ try:
317
+ with closing(self._connect()) as conn:
318
+ with conn.cursor() as cursor:
319
+ cursor.execute(
320
+ """
321
+ SELECT
322
+ src.owner AS source_owner,
323
+ src.table_name AS source_table,
324
+ tgt.owner AS target_owner,
325
+ tgt.table_name AS target_table
326
+ FROM all_constraints src
327
+ JOIN all_constraints tgt
328
+ ON src.r_owner = tgt.owner
329
+ AND src.r_constraint_name = tgt.constraint_name
330
+ WHERE src.constraint_type = 'R'
331
+ """
332
+ )
333
+ for row in cursor.fetchall():
334
+ if not isinstance(row, tuple) or len(row) < 4:
335
+ continue
336
+
337
+ source_schema = row[0]
338
+ source_name = row[1]
339
+ target_schema = row[2]
340
+ target_name = row[3]
341
+ if (
342
+ not isinstance(source_schema, str)
343
+ or not isinstance(source_name, str)
344
+ or not isinstance(target_schema, str)
345
+ or not isinstance(target_name, str)
346
+ ):
347
+ continue
348
+
349
+ source_key = (source_schema.upper(), source_name)
350
+ target_key = (target_schema.upper(), target_name)
351
+ if source_key not in table_keys or target_key not in table_keys:
352
+ continue
353
+ links.setdefault(source_key, set()).add(target_key)
354
+ except Exception as exc:
355
+ logger.warning("Could not resolve Oracle foreign key links: %s", exc)
356
+
357
+ return links
358
+
359
+ def _collect_view_links(
360
+ self,
361
+ objects: list[ObjectRef],
362
+ ) -> dict[tuple[str, str], set[tuple[str, str]]]:
363
+ if not self._include_view_lineage_enabled():
364
+ return {}
365
+
366
+ object_keys = {self._object_key(object_ref) for object_ref in objects}
367
+ view_keys = {
368
+ self._object_key(object_ref)
369
+ for object_ref in objects
370
+ if object_ref.object_type == "VIEW"
371
+ }
372
+
373
+ if not view_keys:
374
+ return {}
375
+
376
+ links: dict[tuple[str, str], set[tuple[str, str]]] = {}
377
+ try:
378
+ with closing(self._connect()) as conn:
379
+ with conn.cursor() as cursor:
380
+ cursor.execute(
381
+ """
382
+ SELECT
383
+ owner,
384
+ name,
385
+ referenced_owner,
386
+ referenced_name,
387
+ referenced_type
388
+ FROM all_dependencies
389
+ WHERE type = 'VIEW'
390
+ AND referenced_type IN ('TABLE', 'VIEW')
391
+ """
392
+ )
393
+ for row in cursor.fetchall():
394
+ if not isinstance(row, tuple) or len(row) < 5:
395
+ continue
396
+
397
+ owner = row[0]
398
+ name = row[1]
399
+ referenced_owner = row[2]
400
+ referenced_name = row[3]
401
+ referenced_type = row[4]
402
+ if (
403
+ not isinstance(owner, str)
404
+ or not isinstance(name, str)
405
+ or not isinstance(referenced_owner, str)
406
+ or not isinstance(referenced_name, str)
407
+ ):
408
+ continue
409
+ if not isinstance(referenced_type, str):
410
+ continue
411
+
412
+ source_key = (owner.upper(), name)
413
+ target_key = (referenced_owner.upper(), referenced_name)
414
+ if source_key not in view_keys or target_key not in object_keys:
415
+ continue
416
+
417
+ links.setdefault(source_key, set()).add(target_key)
418
+ except Exception as exc:
419
+ logger.warning("Could not resolve Oracle view lineage links: %s", exc)
420
+
421
+ return links
422
+
423
def _object_to_asset(
    self,
    object_ref: ObjectRef,
    *,
    links: list[str] | None = None,
) -> SingleAssetScanResults:
    """Build the scan-result asset describing one table or view.

    Args:
        object_ref: The object to describe.
        links: Hashes of related assets; treated as empty when falsy.

    Returns:
        An asset whose checksum is derived from the object's metadata and
        whose created/updated timestamps are both "now" in UTC.
    """
    service, schema, name = object_ref.service_name, object_ref.schema, object_ref.name
    scope = self._scope_options()

    # Key order is kept stable because the checksum is computed from this dict.
    metadata = {
        "service_name": service,
        "schema": schema,
        "object": name,
        "object_type": object_ref.object_type,
        "lineage": {
            "include_view_lineage": bool(scope.include_view_lineage),
            "include_view_column_lineage": bool(scope.include_view_column_lineage),
        },
        "sampling": {
            "strategy": str(self._sampling().strategy),
        },
    }

    timestamp = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=self.generate_hash_id(self._object_raw_id(object_ref)),
        checksum=self.calculate_checksum(metadata),
        name=f"{service}.{schema}.{name}",
        external_url=f"oracle://{self._host}:{self._port}/{service}/{schema}.{name}",
        links=links or [],
        # Views are reported with the TABLE asset type as well.
        asset_type=OutputAssetType.TABLE,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
466
+
467
+ STREAM_DETECTIONS = True
468
+
469
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield the in-scope objects as assets, in batches of ``BATCH_SIZE``.

    Each asset links to the hashes of the in-scope objects it references via
    foreign keys or view dependencies. Every produced asset is recorded in
    ``_table_lookup`` under its hash. Stops silently once the source has
    been aborted.
    """
    if self._aborted:
        return

    objects = self._iter_objects()
    hash_by_key: dict[tuple[str, str], str] = {}
    for candidate in objects:
        candidate_key = self._object_key(candidate)
        hash_by_key[candidate_key] = self.generate_hash_id(self._object_raw_id(candidate))

    fk_links = self._collect_foreign_key_links(objects)
    view_links = self._collect_view_links(objects)

    pending: list[SingleAssetScanResults] = []
    for object_ref in objects:
        if self._aborted:
            return

        key = self._object_key(object_ref)
        targets = {*fk_links.get(key, set()), *view_links.get(key, set())}
        # Sorted so the link order is deterministic between runs.
        linked_hashes = []
        for target in sorted(targets):
            if target in hash_by_key:
                linked_hashes.append(hash_by_key[target])

        asset = self._object_to_asset(object_ref, links=linked_hashes)
        self._table_lookup[asset.hash] = object_ref
        pending.append(asset)

        if len(pending) >= self.BATCH_SIZE:
            yield pending
            pending = []

    if pending:
        yield pending
504
+
505
def generate_hash_id(self, asset_id: str) -> str:
    """Return the hashed id for *asset_id*, namespaced by the source type."""
    namespace = self._asset_type_value()
    return hash_id(namespace, asset_id)
507
+
508
+ def _parse_object_ref_from_asset_id(self, asset_id: str) -> ObjectRef | None:
509
+ if asset_id in self._table_lookup:
510
+ return self._table_lookup[asset_id]
511
+
512
+ decoded = asset_id
513
+ if "_#_" not in decoded:
514
+ try:
515
+ decoded = unhash_id(asset_id)
516
+ except Exception:
517
+ decoded = asset_id
518
+
519
+ parts = decoded.split("_#_")
520
+ if len(parts) >= 5 and parts[0].upper() == "ORACLE":
521
+ # Backward compatibility with older ORACLE_#_ENV_#_SERVICE_#_SCHEMA_#_OBJECT ids.
522
+ return ObjectRef(
523
+ service_name=parts[-3],
524
+ schema=parts[-2],
525
+ name=parts[-1],
526
+ object_type="TABLE",
527
+ )
528
+ if len(parts) >= 4 and parts[0].upper() == "ORACLE":
529
+ return ObjectRef(
530
+ service_name=parts[-3],
531
+ schema=parts[-2],
532
+ name=parts[-1],
533
+ object_type="TABLE",
534
+ )
535
+ if len(parts) >= 4:
536
+ return ObjectRef(
537
+ service_name=parts[-3],
538
+ schema=parts[-2],
539
+ name=parts[-1],
540
+ object_type="TABLE",
541
+ )
542
+ if len(parts) >= 3:
543
+ return ObjectRef(
544
+ service_name=self._service_name,
545
+ schema=parts[-2],
546
+ name=parts[-1],
547
+ object_type="TABLE",
548
+ )
549
+ return None
550
+
551
+ def _available_columns(self, object_ref: ObjectRef) -> list[str]:
552
+ with closing(self._connect()) as conn:
553
+ with conn.cursor() as cursor:
554
+ cursor.execute(
555
+ """
556
+ SELECT column_name
557
+ FROM all_tab_columns
558
+ WHERE owner = :owner
559
+ AND table_name = :table_name
560
+ ORDER BY column_id
561
+ """,
562
+ {
563
+ "owner": object_ref.schema,
564
+ "table_name": object_ref.name,
565
+ },
566
+ )
567
+ return [
568
+ row[0]
569
+ for row in cursor.fetchall()
570
+ if isinstance(row, tuple) and row and isinstance(row[0], str)
571
+ ]
572
+
573
+ def _resolve_latest_order_column(self, columns: list[str]) -> str | None:
574
+ sampling = self._sampling()
575
+ normalized = {column.lower(): column for column in columns}
576
+
577
+ configured = sampling.order_by_column
578
+ if configured:
579
+ configured_column = normalized.get(configured.lower())
580
+ if configured_column:
581
+ return configured_column
582
+
583
+ priority_candidates = (
584
+ "updated_at",
585
+ "modified_at",
586
+ "created_at",
587
+ "inserted_at",
588
+ "timestamp",
589
+ "ts",
590
+ "date",
591
+ )
592
+ for candidate in priority_candidates:
593
+ resolved = normalized.get(candidate)
594
+ if resolved:
595
+ return resolved
596
+ return None
597
+
598
+ def _build_sampling_query(
599
+ self, object_ref: ObjectRef, columns: list[str]
600
+ ) -> tuple[str, list[Any]]:
601
+ sampling = self._sampling()
602
+ if not columns:
603
+ raise ValueError(
604
+ f"Object {object_ref.service_name}.{object_ref.schema}.{object_ref.name} has no readable columns"
605
+ )
606
+
607
+ quoted_columns = ", ".join(_quote_identifier(column) for column in columns)
608
+ quoted_object = (
609
+ f"{_quote_identifier(object_ref.schema)}.{_quote_identifier(object_ref.name)}"
610
+ )
611
+
612
+ strategy = sampling.strategy
613
+ if strategy == SamplingStrategy.ALL:
614
+ return f"SELECT {quoted_columns} FROM {quoted_object}", []
615
+
616
+ rows_per_page = int(sampling.rows_per_page or 100)
617
+ query = f"SELECT {quoted_columns} FROM {quoted_object}"
618
+
619
+ if strategy == SamplingStrategy.LATEST:
620
+ order_column = self._resolve_latest_order_column(columns)
621
+ if order_column:
622
+ query += f" ORDER BY {_quote_identifier(order_column)} DESC"
623
+ elif sampling.fallback_to_random is not False:
624
+ query += " ORDER BY DBMS_RANDOM.VALUE"
625
+ elif strategy == SamplingStrategy.RANDOM:
626
+ query += " ORDER BY DBMS_RANDOM.VALUE"
627
+
628
+ query += f" FETCH FIRST {rows_per_page} ROWS ONLY"
629
+ return query, []
630
+
631
+ def _count_table_rows(self, object_ref: ObjectRef) -> int | None:
632
+ try:
633
+ with closing(self._connect()) as conn:
634
+ with conn.cursor() as cursor:
635
+ cursor.execute(
636
+ f"SELECT COUNT(*) FROM {_quote_identifier(object_ref.schema)}.{_quote_identifier(object_ref.name)}"
637
+ )
638
+ row = cursor.fetchone()
639
+ return int(row[0]) if row else None
640
+ except Exception:
641
+ return None
642
+
643
+ def _serialize_cell(self, value: Any) -> str:
644
+ if value is None:
645
+ return "null"
646
+ if isinstance(value, memoryview):
647
+ value = value.tobytes()
648
+
649
+ if hasattr(value, "read"):
650
+ try:
651
+ value = value.read()
652
+ except Exception:
653
+ value = str(value)
654
+
655
+ if isinstance(value, (bytes, bytearray)):
656
+ return f"<{len(value)} bytes>"
657
+ if isinstance(value, datetime):
658
+ return value.isoformat()
659
+ return str(value)
660
+
661
def _format_sample_content(
    self,
    object_ref: ObjectRef,
    column_names: list[str],
    rows: list[tuple[Any, ...]],
    row_offset: int = 0,
) -> tuple[str, str]:
    """Format sampled rows through the shared tabular formatter.

    Args:
        object_ref: The object the rows were read from.
        column_names: Column names matching the row positions.
        rows: The rows to format.
        row_offset: Offset of the first row, forwarded to the formatter.

    Returns:
        The two-string tuple produced by ``format_tabular_sample_content``.
    """
    sampling = self._sampling()
    qualified_name = f"{object_ref.service_name}.{object_ref.schema}.{object_ref.name}"
    raw_metadata = {
        "service_name": object_ref.service_name,
        "schema": object_ref.schema,
        "object": object_ref.name,
    }
    # Column names are included unless explicitly switched off.
    with_column_names = sampling.include_column_names is not False

    return format_tabular_sample_content(
        scope_label="object",
        scope_value=qualified_name,
        strategy=sampling.strategy,
        rows=rows,
        column_names=column_names,
        serialize_cell=self._serialize_cell,
        include_column_names=with_column_names,
        object_type=object_ref.object_type,
        raw_metadata=raw_metadata,
        row_offset=row_offset,
    )
685
+
686
+ def _fetch_one_page(
687
+ self, object_ref: ObjectRef, base_query: str, page_size: int, offset: int
688
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
689
+ with closing(self._connect()) as conn:
690
+ paginated_query = f"{base_query} OFFSET {offset} ROWS FETCH NEXT {page_size} ROWS ONLY"
691
+ with conn.cursor() as cursor:
692
+ cursor.execute(paginated_query)
693
+ rows = list(cursor.fetchall())
694
+ column_names = (
695
+ [desc[0] for desc in cursor.description] if cursor.description else []
696
+ )
697
+ return rows, column_names
698
+
699
+ def _fetch_one_page_on_conn(
700
+ self,
701
+ conn: Any,
702
+ base_query: str,
703
+ page_size: int,
704
+ offset: int,
705
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
706
+ paginated_query = f"{base_query} OFFSET {offset} ROWS FETCH NEXT {page_size} ROWS ONLY"
707
+ with conn.cursor() as cursor:
708
+ cursor.execute(paginated_query)
709
+ rows = list(cursor.fetchall())
710
+ column_names = [desc[0] for desc in cursor.description] if cursor.description else []
711
+ return rows, column_names
712
+
713
+ @staticmethod
714
+ def _cursor_execute(cursor: Any, query: str) -> list[str]:
715
+ cursor.execute(query)
716
+ return [desc[0] for desc in cursor.description] if cursor.description else []
717
+
718
+ @staticmethod
719
+ def _cursor_fetchmany(cursor: Any, size: int) -> list[tuple[Any, ...]]:
720
+ return list(cursor.fetchmany(size))
721
+
722
+ def _fetch_page_keyset(
723
+ self,
724
+ conn: Any,
725
+ base_query: str,
726
+ page_size: int,
727
+ pk_columns: list[str],
728
+ pk_order: str,
729
+ last_pk_values: list[Any] | None,
730
+ ) -> tuple[list[tuple[Any, ...]], list[str]]:
731
+ """Fetch one page using keyset pagination — O(1) cost at any offset."""
732
+ bind: dict[str, Any] = {}
733
+ if last_pk_values is None:
734
+ paginated_query = f"{base_query} ORDER BY {pk_order} FETCH FIRST {page_size} ROWS ONLY"
735
+ elif len(pk_columns) == 1:
736
+ where = f"WHERE {_quote_identifier(pk_columns[0])} > :pk0"
737
+ paginated_query = (
738
+ f"{base_query} {where} ORDER BY {pk_order} FETCH FIRST {page_size} ROWS ONLY"
739
+ )
740
+ bind = {"pk0": last_pk_values[0]}
741
+ else:
742
+ pk_cols_quoted = ", ".join(_quote_identifier(col) for col in pk_columns)
743
+ placeholders = ", ".join(f":pk{i}" for i in range(len(pk_columns)))
744
+ where = f"WHERE ({pk_cols_quoted}) > ({placeholders})"
745
+ paginated_query = (
746
+ f"{base_query} {where} ORDER BY {pk_order} FETCH FIRST {page_size} ROWS ONLY"
747
+ )
748
+ bind = {f"pk{i}": last_pk_values[i] for i in range(len(pk_columns))}
749
+
750
+ with conn.cursor() as cursor:
751
+ cursor.execute(paginated_query, bind if bind else [])
752
+ rows = list(cursor.fetchall())
753
+ column_names = [desc[0] for desc in cursor.description] if cursor.description else []
754
+ return rows, column_names
755
+
756
def _fetch_sample_rows(
    self, object_ref: ObjectRef
) -> tuple[list[tuple[Any, ...]], list[str]] | None:
    """Read one sample of rows for the object per the sampling strategy.

    For the ALL strategy only the first page (``rows_per_page`` rows, default
    100) is read; the other strategies run the sampling query as built.

    Returns:
        ``(rows, column_names)``, or ``None`` when the query reports no
        columns.
    """
    columns = self._available_columns(object_ref)
    sampling = self._sampling()
    query, _unused_params = self._build_sampling_query(object_ref, columns)

    if sampling.strategy == SamplingStrategy.ALL:
        page_size = int(sampling.rows_per_page or 100)
        rows, column_names = self._fetch_one_page(object_ref, query, page_size, 0)
    else:
        with closing(self._connect()) as conn, conn.cursor() as cursor:
            cursor.execute(query)
            rows = cursor.fetchall()
            column_names = [entry[0] for entry in cursor.description or []]

    return (rows, column_names) if column_names else None
776
+
777
+ def _sample_table_rows(self, object_ref: ObjectRef) -> tuple[str, str] | None:
778
+ result = self._fetch_sample_rows(object_ref)
779
+ if result is None:
780
+ return None
781
+ rows, column_names = result
782
+ return self._format_sample_content(object_ref, column_names, rows)
783
+
784
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return formatted sample content for an asset, using the cache first.

    Returns:
        The cached or freshly sampled content tuple, or ``None`` when the id
        cannot be resolved to an object or sampling produced nothing.
    """
    cached = self._content_cache.get(asset_id)
    if cached:
        return cached

    object_ref = self._parse_object_ref_from_asset_id(asset_id)
    sampled = self._sample_table_rows(object_ref) if object_ref else None
    if sampled is None:
        return None

    self._content_cache[asset_id] = sampled
    return sampled
800
+
801
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield formatted content for an asset one row at a time.

    For sampling strategies other than ALL, a single sample is fetched and
    each of its rows is yielded separately. For ALL, the whole object is
    scanned on one connection:

    * tables whose primary key columns are all readable are paged with
      keyset pagination ordered by the primary key;
    * everything else is streamed from a single executed cursor with
      ``fetchmany``.

    Blocking fetches run in a worker thread via ``asyncio.to_thread``. The
    scan stops when the source is aborted, a page comes back empty, or a
    page is shorter than ``rows_per_page``. Nothing is yielded when the
    asset id cannot be resolved.
    """
    sampling = self._sampling()
    object_ref = self._parse_object_ref_from_asset_id(asset_id)
    if not object_ref:
        return

    if sampling.strategy != SamplingStrategy.ALL:
        result = self._fetch_sample_rows(object_ref)
        if result is None:
            return
        rows, column_names = result
        for i, row in enumerate(rows):
            formatted = self._format_sample_content(
                object_ref, column_names, [row], row_offset=i
            )
            if formatted:
                yield formatted
        return

    columns = self._available_columns(object_ref)
    query, _ = self._build_sampling_query(object_ref, columns)
    rows_per_page = int(sampling.rows_per_page or 100)
    object_label = f"{object_ref.service_name}.{object_ref.schema}.{object_ref.name}"

    # The row count is only used for progress logging; it may be None.
    total_rows = self._count_table_rows(object_ref)
    total_batches = ((total_rows + rows_per_page - 1) // rows_per_page) if total_rows else None
    if total_rows is not None and total_batches is not None:
        logger.info(
            "Full scan %s: %d rows, %d batches of %d",
            object_label,
            total_rows,
            total_batches,
            rows_per_page,
        )

    # Prefer keyset pagination with a PK-ordered query.
    # Fall back to streaming fetchmany for objects without a usable primary key.
    pk_columns = (
        self._get_primary_key_columns(object_ref) if object_ref.object_type == "TABLE" else []
    )
    pk_indices: list[int] = []
    pk_order = ""
    use_keyset = False
    if pk_columns:
        # Reuse the column list fetched above instead of querying the catalog
        # a second time; the SELECT list follows this same order, so these
        # indices address the key values inside each fetched row.
        indices = [columns.index(col) for col in pk_columns if col in columns]
        if len(indices) == len(pk_columns):
            pk_indices = indices
            pk_order = ", ".join(_quote_identifier(col) for col in pk_columns)
            use_keyset = True

    row_offset = 0
    page_num = 1
    last_pk_values: list[Any] | None = None

    conn = self._connect()
    cursor = conn.cursor() if not use_keyset else None
    try:
        if cursor is not None:
            # Streaming path: execute once, fetchmany in a loop — no OFFSET cost.
            column_names = await asyncio.to_thread(self._cursor_execute, cursor, query)
            if not column_names:
                return

        while not self._aborted:
            if total_batches is not None:
                logger.info("%s batch %d/%d", object_label, page_num, total_batches)

            if use_keyset:
                rows, column_names = await asyncio.to_thread(
                    self._fetch_page_keyset,
                    conn,
                    query,
                    rows_per_page,
                    pk_columns,
                    pk_order,
                    last_pk_values,
                )
            else:
                rows = await asyncio.to_thread(self._cursor_fetchmany, cursor, rows_per_page)
                if not rows:
                    break

            if not rows or not column_names:
                break

            # Yield each row individually so the detection pipeline can start
            # processing rows while the next page is being fetched in a thread.
            for i, row in enumerate(rows):
                formatted = self._format_sample_content(
                    object_ref, column_names, [row], row_offset=row_offset + i
                )
                if formatted:
                    # NOTE(review): this overwrites the cache entry per row, so
                    # fetch_content() later returns only the last row — confirm
                    # that is intended.
                    self._content_cache[asset_id] = formatted
                    yield formatted

            if use_keyset:
                # Remember where this page ended so the next one seeks past it.
                last_row = rows[-1]
                last_pk_values = [last_row[index] for index in pk_indices]

            row_offset += len(rows)
            page_num += 1
            if len(rows) < rows_per_page:
                break
    finally:
        if cursor is not None:
            try:
                cursor.close()
            except Exception:
                # Best effort: the connection is closed right below anyway.
                pass
        conn.close()
911
+
912
def _get_primary_key_columns(self, object_ref: ObjectRef) -> list[str]:
    """Return the primary-key column names of ``object_ref``.

    Results are memoised in ``self._pk_columns_cache`` under
    ``(schema, name)``. Views always yield an empty list. For other objects
    the names are read from ``all_constraints`` joined with
    ``all_cons_columns`` (constraint type ``'P'``), ordered by column
    position.

    The lookup is best effort: if connecting or querying fails, the failure
    is logged and an empty list is returned. That empty result is cached like
    any other, so the lookup is not retried for the same object.

    Args:
        object_ref: Reference exposing ``schema``, ``name`` and ``object_type``.

    Returns:
        Primary-key column names, or ``[]`` when none are found.
    """
    cache_key = (object_ref.schema, object_ref.name)
    if cache_key in self._pk_columns_cache:
        return self._pk_columns_cache[cache_key]

    if object_ref.object_type == "VIEW":
        self._pk_columns_cache[cache_key] = []
        return []

    try:
        with closing(self._connect()) as conn, conn.cursor() as cursor:
            cursor.execute(
                """
                SELECT cols.column_name
                FROM all_constraints cons
                JOIN all_cons_columns cols
                ON cons.owner = cols.owner
                AND cons.constraint_name = cols.constraint_name
                WHERE cons.constraint_type = 'P'
                AND cons.owner = :owner
                AND cons.table_name = :table_name
                ORDER BY cols.position
                """,
                {
                    "owner": object_ref.schema,
                    "table_name": object_ref.name,
                },
            )
            # Keep only well-formed rows whose first field is a string.
            columns = [
                row[0]
                for row in cursor.fetchall()
                if isinstance(row, tuple) and row and isinstance(row[0], str)
            ]
    except Exception:
        # Still best effort, but no longer silent: callers treat the object
        # as having no primary key, so leave a trace of why.
        logger.warning(
            "Could not read primary key columns for %s.%s; treating as none",
            object_ref.schema,
            object_ref.name,
            exc_info=True,
        )
        columns = []

    self._pk_columns_cache[cache_key] = columns
    return columns
951
+
952
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Attach a tabular location to ``finding`` for the given asset.

    The asset's hash is looked up in ``self._table_lookup``; when no object
    reference is registered for it the finding is left untouched. Otherwise
    ``finding.location`` is set to the result of ``build_tabular_location``,
    built from the first element of the cached content for the asset (if
    any), the finding's matched content, the dotted
    ``service.schema.name`` path, the primary-key columns (tables only) and
    the ``tabular_row_index`` / ``tabular_column_name`` metadata entries.

    Args:
        finding: Detection result to update in place.
        asset: Scan result whose ``hash`` identifies the scanned object.
        text_content: Accepted for interface compatibility; not used.
    """
    # The parameter is part of the signature but plays no role here.
    del text_content

    table_ref = self._table_lookup.get(asset.hash)
    if not table_ref:
        return

    dotted_path = f"{table_ref.service_name}.{table_ref.schema}.{table_ref.name}"

    cache_entry = self._content_cache.get(asset.hash)
    if cache_entry:
        raw = cache_entry[0]
    else:
        raw = None

    finding_meta = finding.metadata or {}
    matched = finding.matched_content

    # Only tables are asked for primary-key columns.
    if table_ref.object_type == "TABLE":
        key_columns = self._get_primary_key_columns(table_ref)
    else:
        key_columns = []

    row_idx = finding_meta.get("tabular_row_index")
    col_name = finding_meta.get("tabular_column_name")

    finding.location = build_tabular_location(
        raw_content=raw,
        matched_content=matched,
        base_path=dotted_path,
        primary_key_columns=key_columns,
        row_index=row_idx,
        column_name=col_name,
    )
979
+
980
+ def abort(self) -> None:
981
+ logger.info("Aborting Oracle extraction...")
982
+ super().abort()