classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,982 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
from collections.abc import AsyncGenerator
|
|
6
|
+
from contextlib import closing
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from datetime import UTC, datetime
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from ...models.generated_input import (
|
|
12
|
+
OracleInput,
|
|
13
|
+
OracleOptionalConnection,
|
|
14
|
+
OracleOptionalScope,
|
|
15
|
+
SamplingConfig,
|
|
16
|
+
SamplingStrategy,
|
|
17
|
+
)
|
|
18
|
+
from ...models.generated_single_asset_scan_results import (
|
|
19
|
+
AssetType as OutputAssetType,
|
|
20
|
+
)
|
|
21
|
+
from ...models.generated_single_asset_scan_results import (
|
|
22
|
+
DetectionResult,
|
|
23
|
+
SingleAssetScanResults,
|
|
24
|
+
)
|
|
25
|
+
from ...utils.hashing import hash_id, unhash_id
|
|
26
|
+
from ..base import BaseSource
|
|
27
|
+
from ..dependencies import require_module
|
|
28
|
+
from ..tabular_utils import build_tabular_location, format_tabular_sample_content
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
# Schema names skipped when the recipe does not configure its own
# exclude_schemas list (see OracleSource._schema_denylist). Compared against
# upper-cased owner names.
_DEFAULT_EXCLUDED_SCHEMAS = {
    "SYS",
    "SYSTEM",
    "DBSNMP",
    "WMSYS",
    "CTXSYS",
    "XDB",
    "MDSYS",
    "ORDSYS",
    "OUTLN",
    "ORDDATA",
}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass(frozen=True)
class ObjectRef:
    """Immutable reference to one Oracle table or view."""

    # Service name the object belongs to.
    service_name: str
    # Owning schema; upper-cased when produced by the object listing.
    schema: str
    # Table or view name.
    name: str
    # "TABLE" or "VIEW" in the code paths visible in this file.
    object_type: str
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _quote_identifier(identifier: str) -> str:
    """Return *identifier* wrapped in double quotes, doubling embedded quotes."""
    escaped = identifier.replace('"', '""')
    return f'"{escaped}"'
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class OracleSource(BaseSource):
|
|
59
|
+
source_type = "oracle"
|
|
60
|
+
|
|
61
|
+
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe and prepare connection settings and caches.

    Args:
        recipe: Raw recipe mapping; validated into an ``OracleInput``.
        source_id: Forwarded to ``BaseSource``.
        runner_id: Forwarded to ``BaseSource``; ``"local-run"`` is stored on
            this instance when it is falsy.
    """
    super().__init__(recipe, source_id, runner_id)
    self.config = OracleInput.model_validate(recipe)
    self.runner_id = runner_id or "local-run"
    # NOTE(review): presumably imports the optional driver and fails with
    # install guidance when it is missing -- confirm in ..dependencies.
    self._oracledb = require_module(
        module_name="oracledb",
        source_name="Oracle",
        uv_groups=["oracle"],
        detail="The Oracle connector is optional.",
    )
    self._host = self.config.required.host
    self._port = int(self.config.required.port)
    self._service_name = self.config.required.service_name
    # Asset hash -> ObjectRef, filled while assets are emitted by extract_raw.
    self._table_lookup: dict[str, ObjectRef] = {}
    # The two caches below are not populated in the part of the file shown here.
    self._content_cache: dict[str, tuple[str, str]] = {}
    self._pk_columns_cache: dict[tuple[str, str], list[str]] = {}
|
|
82
|
+
|
|
83
|
+
def _asset_type_value(self) -> str:
    """Return the configured source type as a plain string.

    Objects exposing a ``value`` attribute (e.g. enum members) yield that
    attribute; anything else is passed through ``str``.
    """
    configured_type = self.config.type
    if hasattr(configured_type, "value"):
        return configured_type.value
    return str(configured_type)
|
|
86
|
+
|
|
87
|
+
def _sampling(self) -> SamplingConfig:
    """Return the sampling section of the validated recipe."""
    config = self.config
    return config.sampling
|
|
89
|
+
|
|
90
|
+
def _connection_options(self) -> OracleOptionalConnection:
    """Return the configured connection options, or a default instance."""
    optional = self.config.optional
    if not (optional and optional.connection):
        return OracleOptionalConnection()
    return optional.connection
|
|
94
|
+
|
|
95
|
+
def _scope_options(self) -> OracleOptionalScope:
    """Return the configured scope options, or a default instance."""
    optional = self.config.optional
    if not (optional and optional.scope):
        return OracleOptionalScope()
    return optional.scope
|
|
99
|
+
|
|
100
|
+
def _username(self) -> str:
    """Return the user name from the recipe's ``masked`` section."""
    credentials = self.config.masked
    return credentials.username
|
|
102
|
+
|
|
103
|
+
def _password(self) -> str:
    """Return the password from the recipe's ``masked`` section."""
    credentials = self.config.masked
    return credentials.password
|
|
105
|
+
|
|
106
|
+
def _dsn(self) -> str:
    """Build the DSN for the configured host, port and service name.

    The driver's ``makedsn`` helper is used when the module exposes one;
    otherwise the ``host:port/service_name`` form is assembled by hand.
    """
    driver = self._oracledb
    if not hasattr(driver, "makedsn"):
        return f"{self._host}:{self._port}/{self._service_name}"
    dsn = driver.makedsn(
        self._host,
        int(self._port),
        service_name=self._service_name,
    )
    return str(dsn)
|
|
116
|
+
|
|
117
|
+
def _connect(self):
    """Open a new driver connection using the recipe's credentials.

    The first attempt passes ``tcp_connect_timeout`` (configured value, or
    30 when unset/zero). If the driver raises ``TypeError`` the call is
    retried once without that keyword.
    """
    options = self._connection_options()
    connect_kwargs: dict[str, Any] = {}
    connect_kwargs["user"] = self._username()
    connect_kwargs["password"] = self._password()
    connect_kwargs["dsn"] = self._dsn()
    connect_kwargs["tcp_connect_timeout"] = int(options.connect_timeout_seconds or 30)

    driver = self._oracledb
    try:
        return driver.connect(**connect_kwargs)
    except TypeError:
        # NOTE(review): presumably targets driver builds that do not accept
        # tcp_connect_timeout; any other TypeError also triggers the retry.
        connect_kwargs.pop("tcp_connect_timeout", None)
        return driver.connect(**connect_kwargs)
|
|
131
|
+
|
|
132
|
+
def _schema_allowlist(self) -> set[str] | None:
    """Return the upper-cased ``include_schemas`` entries, or None if unset.

    Blank entries are dropped, so a list of only blanks yields an empty set.
    """
    requested = self._scope_options().include_schemas
    if not requested:
        return None
    trimmed = (entry.strip() for entry in requested)
    return {entry.upper() for entry in trimmed if entry}
|
|
137
|
+
|
|
138
|
+
def _schema_denylist(self) -> set[str]:
    """Return the upper-cased schemas to skip.

    Uses the recipe's ``exclude_schemas`` when it yields at least one
    non-blank entry; otherwise a copy of ``_DEFAULT_EXCLUDED_SCHEMAS``.
    A configured list replaces the defaults rather than extending them.
    """
    requested = self._scope_options().exclude_schemas or []
    trimmed = (entry.strip() for entry in requested)
    configured = {entry.upper() for entry in trimmed if entry}
    if configured:
        return configured
    return set(_DEFAULT_EXCLUDED_SCHEMAS)
|
|
144
|
+
|
|
145
|
+
def _object_allowlist(self) -> set[str]:
    """Return the lower-cased, non-blank ``include_objects`` entries."""
    requested = self._scope_options().include_objects or []
    trimmed = (entry.strip() for entry in requested)
    return {entry.lower() for entry in trimmed if entry}
|
|
148
|
+
|
|
149
|
+
def _include_tables_enabled(self) -> bool:
    """Return True unless ``include_tables`` is explicitly ``False``."""
    flag = self._scope_options().include_tables
    return flag is not False
|
|
151
|
+
|
|
152
|
+
def _include_views_enabled(self) -> bool:
    """Return True unless ``include_views`` is explicitly ``False``."""
    flag = self._scope_options().include_views
    return flag is not False
|
|
154
|
+
|
|
155
|
+
def _include_view_lineage_enabled(self) -> bool:
    """Return True when either view-lineage option is truthy."""
    scope = self._scope_options()
    if scope.include_view_lineage:
        return True
    return bool(scope.include_view_column_lineage)
|
|
158
|
+
|
|
159
|
+
def _list_objects(self) -> list[ObjectRef]:
    """List the tables and views that pass the configured scope filters.

    Tables are read from ``all_tables`` first, then views from ``all_views``,
    each ordered by owner and name. A row is kept when its owner is not in
    the schema denylist, is in the schema allowlist (if one is set), and its
    ``schema.object`` or ``service.schema.object`` name (lower-cased) is in
    the object allowlist (if one is set). Listing stops as soon as
    ``table_limit`` objects have been collected; the limit counts tables and
    views together.

    Returns:
        The matching object references, tables before views.
    """
    want_tables = self._include_tables_enabled()
    want_views = self._include_views_enabled()
    if not (want_tables or want_views):
        return []

    schema_allowlist = self._schema_allowlist()
    schema_denylist = self._schema_denylist()
    object_allowlist = self._object_allowlist()
    table_limit = self._scope_options().table_limit
    limit = int(table_limit) if table_limit else None

    # (object_type, catalog query) pairs, in the order they are executed.
    catalog_queries: list[tuple[str, str]] = []
    if want_tables:
        catalog_queries.append(
            (
                "TABLE",
                """
                SELECT owner, table_name
                FROM all_tables
                ORDER BY owner, table_name
                """,
            )
        )
    if want_views:
        catalog_queries.append(
            (
                "VIEW",
                """
                SELECT owner, view_name
                FROM all_views
                ORDER BY owner, view_name
                """,
            )
        )

    found: list[ObjectRef] = []
    with closing(self._connect()) as conn:
        with conn.cursor() as cursor:
            for object_type, sql in catalog_queries:
                cursor.execute(sql)
                for row in cursor.fetchall():
                    if not (isinstance(row, tuple) and len(row) >= 2):
                        continue
                    owner, object_name = row[0], row[1]
                    if not (isinstance(owner, str) and isinstance(object_name, str)):
                        continue

                    schema_upper = owner.upper()
                    if schema_upper in schema_denylist:
                        continue
                    if schema_allowlist and schema_upper not in schema_allowlist:
                        continue

                    if object_allowlist:
                        scoped_name = f"{schema_upper}.{object_name}".lower()
                        service_scoped_name = (
                            f"{self._service_name}.{schema_upper}.{object_name}".lower()
                        )
                        if (
                            scoped_name not in object_allowlist
                            and service_scoped_name not in object_allowlist
                        ):
                            continue

                    found.append(
                        ObjectRef(
                            service_name=self._service_name,
                            schema=schema_upper,
                            name=object_name,
                            object_type=object_type,
                        )
                    )
                    if limit is not None and len(found) >= limit:
                        return found

    return found
|
|
264
|
+
|
|
265
|
+
def _iter_objects(self) -> list[ObjectRef]:
    """Return the listed objects, or an empty list if listing fails.

    Any exception from ``_list_objects`` is logged as a warning and
    swallowed.
    """
    try:
        listed = self._list_objects()
    except Exception as exc:
        logger.warning("Oracle object listing failed: %s", exc)
        return []
    return listed
|
|
271
|
+
|
|
272
|
+
def test_connection(self) -> dict[str, Any]:
    """Probe the database and report the outcome as a dictionary.

    Runs ``SELECT 1 FROM dual`` on a fresh connection, then counts the
    objects returned by ``_iter_objects``.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status``
        (``"SUCCESS"`` or ``"FAILURE"``) and ``message``. Exceptions are
        reported in the message rather than raised.
    """
    logger.info("Testing connection to Oracle...")
    result = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        with closing(self._connect()) as conn, conn.cursor() as cursor:
            cursor.execute("SELECT 1 FROM dual")
            cursor.fetchone()

        reachable = self._iter_objects()
        result.update(
            status="SUCCESS",
            message=f"Successfully connected to Oracle. Reachable objects: {len(reachable)}.",
        )
    except Exception as exc:
        result.update(
            status="FAILURE",
            message=f"Failed to connect to Oracle: {exc}",
        )

    return result
|
|
295
|
+
|
|
296
|
+
def _object_key(self, object_ref: ObjectRef) -> tuple[str, str]:
    """Return the ``(schema, name)`` pair used to key link lookups."""
    schema, name = object_ref.schema, object_ref.name
    return schema, name
|
|
298
|
+
|
|
299
|
+
def _object_raw_id(self, object_ref: ObjectRef) -> str:
    """Return the unhashed asset id: service, schema and name joined by ``_#_``."""
    sep = "_#_"
    return f"{object_ref.service_name}{sep}{object_ref.schema}{sep}{object_ref.name}"
|
|
301
|
+
|
|
302
|
+
def _collect_foreign_key_links(
    self,
    objects: list[ObjectRef],
) -> dict[tuple[str, str], set[tuple[str, str]]]:
    """Map each listed table to the listed tables it references by foreign key.

    Reads referential constraints from ``all_constraints`` and keeps only
    pairs where both ends are tables present in *objects*. Any failure is
    logged as a warning and whatever was collected so far is returned.

    Args:
        objects: Object references from the listing; only ``TABLE`` entries
            take part.

    Returns:
        ``{(schema, table): {(schema, referenced_table), ...}}``.
    """
    table_keys = {
        self._object_key(ref) for ref in objects if ref.object_type == "TABLE"
    }
    links: dict[tuple[str, str], set[tuple[str, str]]] = {}
    if not table_keys:
        return links

    try:
        with closing(self._connect()) as conn, conn.cursor() as cursor:
            cursor.execute(
                """
                SELECT
                src.owner AS source_owner,
                src.table_name AS source_table,
                tgt.owner AS target_owner,
                tgt.table_name AS target_table
                FROM all_constraints src
                JOIN all_constraints tgt
                ON src.r_owner = tgt.owner
                AND src.r_constraint_name = tgt.constraint_name
                WHERE src.constraint_type = 'R'
                """
            )
            for row in cursor.fetchall():
                if not isinstance(row, tuple) or len(row) < 4:
                    continue
                fields = row[:4]
                if not all(isinstance(field, str) for field in fields):
                    continue

                source_schema, source_name, target_schema, target_name = fields
                source_key = (source_schema.upper(), source_name)
                target_key = (target_schema.upper(), target_name)
                if source_key in table_keys and target_key in table_keys:
                    links.setdefault(source_key, set()).add(target_key)
    except Exception as exc:
        logger.warning("Could not resolve Oracle foreign key links: %s", exc)

    return links
|
|
358
|
+
|
|
359
|
+
def _collect_view_links(
    self,
    objects: list[ObjectRef],
) -> dict[tuple[str, str], set[tuple[str, str]]]:
    """Map each listed view to the listed tables/views it depends on.

    Does nothing (returns ``{}``) when view lineage is disabled or no views
    were listed. Dependencies come from ``all_dependencies``; a pair is kept
    only when the source is a listed view and the target is a listed object.
    Any failure is logged as a warning and the partial result is returned.

    Args:
        objects: Object references from the listing.

    Returns:
        ``{(schema, view): {(schema, referenced_object), ...}}``.
    """
    if not self._include_view_lineage_enabled():
        return {}

    object_keys = set()
    view_keys = set()
    for ref in objects:
        key = self._object_key(ref)
        object_keys.add(key)
        if ref.object_type == "VIEW":
            view_keys.add(key)
    if not view_keys:
        return {}

    links: dict[tuple[str, str], set[tuple[str, str]]] = {}
    try:
        with closing(self._connect()) as conn, conn.cursor() as cursor:
            cursor.execute(
                """
                SELECT
                owner,
                name,
                referenced_owner,
                referenced_name,
                referenced_type
                FROM all_dependencies
                WHERE type = 'VIEW'
                AND referenced_type IN ('TABLE', 'VIEW')
                """
            )
            for row in cursor.fetchall():
                if not isinstance(row, tuple) or len(row) < 5:
                    continue
                fields = row[:5]
                # referenced_type is only type-checked; it is not used further.
                if not all(isinstance(field, str) for field in fields):
                    continue

                owner, name, referenced_owner, referenced_name = fields[:4]
                source_key = (owner.upper(), name)
                target_key = (referenced_owner.upper(), referenced_name)
                if source_key in view_keys and target_key in object_keys:
                    links.setdefault(source_key, set()).add(target_key)
    except Exception as exc:
        logger.warning("Could not resolve Oracle view lineage links: %s", exc)

    return links
|
|
422
|
+
|
|
423
|
+
def _object_to_asset(
    self,
    object_ref: ObjectRef,
    *,
    links: list[str] | None = None,
) -> SingleAssetScanResults:
    """Build the scan-result asset describing *object_ref*.

    The asset hash derives from the raw ``service_#_schema_#_name`` id and
    the checksum from a metadata dict (object identity, lineage flags and
    sampling strategy). ``asset_type`` is always ``TABLE``, including for
    views.

    Args:
        object_ref: The table or view to describe.
        links: Hashes of related assets; an empty list is used when omitted.
    """
    service = object_ref.service_name
    schema = object_ref.schema
    name = object_ref.name

    raw_id = self._object_raw_id(object_ref)
    asset_hash = self.generate_hash_id(raw_id)

    scope = self._scope_options()
    lineage = {
        "include_view_lineage": bool(scope.include_view_lineage),
        "include_view_column_lineage": bool(scope.include_view_column_lineage),
    }
    metadata = {
        "service_name": service,
        "schema": schema,
        "object": name,
        "object_type": object_ref.object_type,
        "lineage": lineage,
        "sampling": {"strategy": str(self._sampling().strategy)},
    }

    now = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(metadata),
        name=f"{service}.{schema}.{name}",
        external_url=f"oracle://{self._host}:{self._port}/{service}/{schema}.{name}",
        links=links or [],
        asset_type=OutputAssetType.TABLE,
        source_id=self.source_id,
        created_at=now,
        updated_at=now,
        runner_id=self.runner_id,
    )
|
|
466
|
+
|
|
467
|
+
STREAM_DETECTIONS = True
|
|
468
|
+
|
|
469
|
+
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield the discovered objects as assets, in batches of ``BATCH_SIZE``.

    Each asset carries the hashes of the listed objects it links to through
    foreign keys or view dependencies, in sorted ``(schema, name)`` order.
    Every emitted asset is also recorded in ``_table_lookup`` by hash. If
    the source is aborted mid-run the generator stops and the batch being
    assembled is not yielded.
    """
    if self._aborted:
        return

    objects = self._iter_objects()
    object_hash_by_key: dict[tuple[str, str], str] = {}
    for ref in objects:
        ref_key = self._object_key(ref)
        object_hash_by_key[ref_key] = self.generate_hash_id(self._object_raw_id(ref))

    fk_links = self._collect_foreign_key_links(objects)
    view_links = self._collect_view_links(objects)

    pending: list[SingleAssetScanResults] = []
    for ref in objects:
        if self._aborted:
            return

        key = self._object_key(ref)
        targets = set(fk_links.get(key, set()))
        targets |= set(view_links.get(key, set()))
        linked_hashes = [
            object_hash_by_key[target]
            for target in sorted(targets)
            if target in object_hash_by_key
        ]

        asset = self._object_to_asset(ref, links=linked_hashes)
        self._table_lookup[asset.hash] = ref
        pending.append(asset)

        if len(pending) >= self.BATCH_SIZE:
            yield pending
            pending = []

    if pending:
        yield pending
|
|
504
|
+
|
|
505
|
+
def generate_hash_id(self, asset_id: str) -> str:
    """Hash *asset_id* together with this source's asset type value."""
    asset_type = self._asset_type_value()
    return hash_id(asset_type, asset_id)
|
|
507
|
+
|
|
508
|
+
def _parse_object_ref_from_asset_id(self, asset_id: str) -> ObjectRef | None:
    """Resolve an asset id (hashed or raw) to an ``ObjectRef``.

    Lookup order: the in-memory ``_table_lookup`` filled during extraction,
    then the ``_#_``-separated raw id (un-hashing first when the separator
    is absent; un-hash failures fall back to the id as given).

    Ids with four or more segments use their last three as service, schema
    and object. This covers the older ``ORACLE_#_ENV_#_SERVICE_#_SCHEMA_#_OBJECT``
    and ``ORACLE_#_SERVICE_#_SCHEMA_#_OBJECT`` layouts. Three-segment ids
    take schema and object from the last two segments and the service name
    from this source's configuration. Resolved references are always typed
    ``TABLE``.

    Returns:
        The reference, or None when the id has fewer than three segments.
    """
    try:
        return self._table_lookup[asset_id]
    except KeyError:
        pass

    decoded = asset_id
    if "_#_" not in asset_id:
        try:
            decoded = unhash_id(asset_id)
        except Exception:
            decoded = asset_id

    parts = decoded.split("_#_")
    if len(parts) < 3:
        return None

    service_name = parts[-3] if len(parts) >= 4 else self._service_name
    return ObjectRef(
        service_name=service_name,
        schema=parts[-2],
        name=parts[-1],
        object_type="TABLE",
    )
|
|
550
|
+
|
|
551
|
+
def _available_columns(self, object_ref: ObjectRef) -> list[str]:
    """Return the object's column names from ``all_tab_columns``, by column id.

    Rows that are not tuples, are empty, or whose first item is not a
    string are skipped.
    """
    with closing(self._connect()) as conn, conn.cursor() as cursor:
        binds = {
            "owner": object_ref.schema,
            "table_name": object_ref.name,
        }
        cursor.execute(
            """
            SELECT column_name
            FROM all_tab_columns
            WHERE owner = :owner
            AND table_name = :table_name
            ORDER BY column_id
            """,
            binds,
        )
        names: list[str] = []
        for row in cursor.fetchall():
            if isinstance(row, tuple) and row and isinstance(row[0], str):
                names.append(row[0])
        return names
|
|
572
|
+
|
|
573
|
+
def _resolve_latest_order_column(self, columns: list[str]) -> str | None:
    """Pick the column used to order rows for the LATEST strategy.

    The configured ``order_by_column`` wins when it matches a column
    case-insensitively. Otherwise the first match from a fixed list of
    timestamp-like names is used.

    Returns:
        The column name in its original casing, or None when nothing matches.
    """
    sampling = self._sampling()
    by_lower = {column.lower(): column for column in columns}

    configured = sampling.order_by_column
    if configured:
        match = by_lower.get(configured.lower())
        if match:
            return match

    fallbacks = (
        "updated_at",
        "modified_at",
        "created_at",
        "inserted_at",
        "timestamp",
        "ts",
        "date",
    )
    return next(
        (by_lower[name] for name in fallbacks if by_lower.get(name)),
        None,
    )
|
|
597
|
+
|
|
598
|
+
def _build_sampling_query(
    self, object_ref: ObjectRef, columns: list[str]
) -> tuple[str, list[Any]]:
    """Build the SELECT used to sample rows from *object_ref*.

    * ``ALL``: plain ``SELECT ... FROM ...`` with no ordering or row cap.
    * ``LATEST``: ordered descending by the resolved order column; when none
      is found, ordered randomly unless ``fallback_to_random`` is ``False``.
    * ``RANDOM``: ordered by ``DBMS_RANDOM.VALUE``.

    Every strategy except ``ALL`` is capped at ``rows_per_page`` rows
    (100 when unset).

    Returns:
        ``(sql, params)``; ``params`` is always an empty list.

    Raises:
        ValueError: If *columns* is empty.
    """
    sampling = self._sampling()
    if not columns:
        raise ValueError(
            f"Object {object_ref.service_name}.{object_ref.schema}.{object_ref.name} has no readable columns"
        )

    select_list = ", ".join(_quote_identifier(column) for column in columns)
    target = ".".join(
        (_quote_identifier(object_ref.schema), _quote_identifier(object_ref.name))
    )
    base = f"SELECT {select_list} FROM {target}"

    strategy = sampling.strategy
    if strategy == SamplingStrategy.ALL:
        return base, []

    rows_per_page = int(sampling.rows_per_page or 100)

    order_clause = ""
    if strategy == SamplingStrategy.LATEST:
        order_column = self._resolve_latest_order_column(columns)
        if order_column:
            order_clause = f" ORDER BY {_quote_identifier(order_column)} DESC"
        elif sampling.fallback_to_random is not False:
            order_clause = " ORDER BY DBMS_RANDOM.VALUE"
    elif strategy == SamplingStrategy.RANDOM:
        order_clause = " ORDER BY DBMS_RANDOM.VALUE"

    return f"{base}{order_clause} FETCH FIRST {rows_per_page} ROWS ONLY", []
|
|
630
|
+
|
|
631
|
+
def _count_table_rows(self, object_ref: ObjectRef) -> int | None:
    """Return ``COUNT(*)`` for *object_ref*, or None when it cannot be read.

    This is best-effort: any failure (connection, privileges, missing
    object) yields None instead of raising. The reason is logged at debug
    level so a missing count can still be diagnosed.
    """
    try:
        with closing(self._connect()) as conn:
            with conn.cursor() as cursor:
                cursor.execute(
                    f"SELECT COUNT(*) FROM {_quote_identifier(object_ref.schema)}.{_quote_identifier(object_ref.name)}"
                )
                row = cursor.fetchone()
                return int(row[0]) if row else None
    except Exception as exc:
        # Previously swallowed silently; keep the None result but record why.
        logger.debug("Could not count rows for Oracle object %r: %s", object_ref, exc)
        return None
|
|
642
|
+
|
|
643
|
+
def _serialize_cell(self, value: Any) -> str:
    """Render one cell value as text for sample output.

    ``None`` becomes ``"null"``. Objects with a ``read`` method are read
    first (falling back to ``str`` if reading fails). Binary data is
    summarised as ``<N bytes>``, datetimes use ISO format, and everything
    else goes through ``str``.
    """
    if value is None:
        return "null"

    raw = value.tobytes() if isinstance(value, memoryview) else value

    if hasattr(raw, "read"):
        try:
            raw = raw.read()
        except Exception:
            raw = str(raw)

    if isinstance(raw, datetime):
        return raw.isoformat()
    if isinstance(raw, (bytes, bytearray)):
        return f"<{len(raw)} bytes>"
    return str(raw)
|
|
660
|
+
|
|
661
|
+
def _format_sample_content(
    self,
    object_ref: ObjectRef,
    column_names: list[str],
    rows: list[tuple[Any, ...]],
    row_offset: int = 0,
) -> tuple[str, str]:
    """Format sampled rows via ``format_tabular_sample_content``.

    Passes the object's fully qualified name, the active sampling strategy,
    this source's cell serializer, and the object's identity as raw
    metadata. Column names are included unless ``include_column_names`` is
    explicitly ``False``.

    Returns:
        The two-string tuple produced by ``format_tabular_sample_content``.
    """
    sampling = self._sampling()
    qualified_name = f"{object_ref.service_name}.{object_ref.schema}.{object_ref.name}"
    identity = {
        "service_name": object_ref.service_name,
        "schema": object_ref.schema,
        "object": object_ref.name,
    }
    show_header = sampling.include_column_names is not False
    return format_tabular_sample_content(
        scope_label="object",
        scope_value=qualified_name,
        strategy=sampling.strategy,
        rows=rows,
        column_names=column_names,
        serialize_cell=self._serialize_cell,
        include_column_names=show_header,
        object_type=object_ref.object_type,
        raw_metadata=identity,
        row_offset=row_offset,
    )
|
|
685
|
+
|
|
686
|
+
def _fetch_one_page(
    self, object_ref: ObjectRef, base_query: str, page_size: int, offset: int
) -> tuple[list[tuple[Any, ...]], list[str]]:
    """Fetch one OFFSET/FETCH page of *base_query* on a fresh connection.

    Args:
        object_ref: Not used by this method.
        base_query: SELECT statement the paging clause is appended to.
        page_size: Maximum number of rows to fetch.
        offset: Number of rows to skip.

    Returns:
        ``(rows, column_names)``; ``column_names`` is empty when the cursor
        reports no description.
    """
    with closing(self._connect()) as conn:
        sql = f"{base_query} OFFSET {offset} ROWS FETCH NEXT {page_size} ROWS ONLY"
        with conn.cursor() as cursor:
            cursor.execute(sql)
            rows = list(cursor.fetchall())
            if cursor.description:
                column_names = [column[0] for column in cursor.description]
            else:
                column_names = []
            return rows, column_names
|
|
698
|
+
|
|
699
|
+
def _fetch_one_page_on_conn(
    self,
    conn: Any,
    base_query: str,
    page_size: int,
    offset: int,
) -> tuple[list[tuple[Any, ...]], list[str]]:
    """Fetch one OFFSET/FETCH page of *base_query* on an existing connection.

    The connection is left open for the caller.

    Returns:
        ``(rows, column_names)``; ``column_names`` is empty when the cursor
        reports no description.
    """
    sql = f"{base_query} OFFSET {offset} ROWS FETCH NEXT {page_size} ROWS ONLY"
    with conn.cursor() as cursor:
        cursor.execute(sql)
        rows = list(cursor.fetchall())
        if cursor.description:
            column_names = [column[0] for column in cursor.description]
        else:
            column_names = []
    return rows, column_names
|
|
712
|
+
|
|
713
|
+
@staticmethod
|
|
714
|
+
def _cursor_execute(cursor: Any, query: str) -> list[str]:
|
|
715
|
+
cursor.execute(query)
|
|
716
|
+
return [desc[0] for desc in cursor.description] if cursor.description else []
|
|
717
|
+
|
|
718
|
+
@staticmethod
|
|
719
|
+
def _cursor_fetchmany(cursor: Any, size: int) -> list[tuple[Any, ...]]:
|
|
720
|
+
return list(cursor.fetchmany(size))
|
|
721
|
+
|
|
722
|
+
def _fetch_page_keyset(
|
|
723
|
+
self,
|
|
724
|
+
conn: Any,
|
|
725
|
+
base_query: str,
|
|
726
|
+
page_size: int,
|
|
727
|
+
pk_columns: list[str],
|
|
728
|
+
pk_order: str,
|
|
729
|
+
last_pk_values: list[Any] | None,
|
|
730
|
+
) -> tuple[list[tuple[Any, ...]], list[str]]:
|
|
731
|
+
"""Fetch one page using keyset pagination — O(1) cost at any offset."""
|
|
732
|
+
bind: dict[str, Any] = {}
|
|
733
|
+
if last_pk_values is None:
|
|
734
|
+
paginated_query = f"{base_query} ORDER BY {pk_order} FETCH FIRST {page_size} ROWS ONLY"
|
|
735
|
+
elif len(pk_columns) == 1:
|
|
736
|
+
where = f"WHERE {_quote_identifier(pk_columns[0])} > :pk0"
|
|
737
|
+
paginated_query = (
|
|
738
|
+
f"{base_query} {where} ORDER BY {pk_order} FETCH FIRST {page_size} ROWS ONLY"
|
|
739
|
+
)
|
|
740
|
+
bind = {"pk0": last_pk_values[0]}
|
|
741
|
+
else:
|
|
742
|
+
pk_cols_quoted = ", ".join(_quote_identifier(col) for col in pk_columns)
|
|
743
|
+
placeholders = ", ".join(f":pk{i}" for i in range(len(pk_columns)))
|
|
744
|
+
where = f"WHERE ({pk_cols_quoted}) > ({placeholders})"
|
|
745
|
+
paginated_query = (
|
|
746
|
+
f"{base_query} {where} ORDER BY {pk_order} FETCH FIRST {page_size} ROWS ONLY"
|
|
747
|
+
)
|
|
748
|
+
bind = {f"pk{i}": last_pk_values[i] for i in range(len(pk_columns))}
|
|
749
|
+
|
|
750
|
+
with conn.cursor() as cursor:
|
|
751
|
+
cursor.execute(paginated_query, bind if bind else [])
|
|
752
|
+
rows = list(cursor.fetchall())
|
|
753
|
+
column_names = [desc[0] for desc in cursor.description] if cursor.description else []
|
|
754
|
+
return rows, column_names
|
|
755
|
+
|
|
756
|
+
def _fetch_sample_rows(
    self, object_ref: ObjectRef
) -> tuple[list[tuple[Any, ...]], list[str]] | None:
    """Run the sampling query for ``object_ref`` and return its rows and headers.

    With the ``ALL`` strategy only the first page (``rows_per_page``, default
    100) is fetched; any other strategy executes the sampling query as built.

    Returns:
        ``(rows, column_names)``, or ``None`` when no column names came back.
    """
    available = self._available_columns(object_ref)
    sampling_cfg = self._sampling()
    # NOTE(review): the bind parameters returned by the query builder are not
    # passed to execute(); confirm the builder never emits placeholders.
    sql, _unused_binds = self._build_sampling_query(object_ref, available)

    if sampling_cfg.strategy != SamplingStrategy.ALL:
        with closing(self._connect()) as connection, connection.cursor() as cur:
            cur.execute(sql)
            fetched = cur.fetchall()
            headers = [entry[0] for entry in cur.description or []]
    else:
        first_page_size = int(sampling_cfg.rows_per_page or 100)
        fetched, headers = self._fetch_one_page(object_ref, sql, first_page_size, 0)

    return (fetched, headers) if headers else None
|
|
776
|
+
|
|
777
|
+
def _sample_table_rows(self, object_ref: ObjectRef) -> tuple[str, str] | None:
|
|
778
|
+
result = self._fetch_sample_rows(object_ref)
|
|
779
|
+
if result is None:
|
|
780
|
+
return None
|
|
781
|
+
rows, column_names = result
|
|
782
|
+
return self._format_sample_content(object_ref, column_names, rows)
|
|
783
|
+
|
|
784
|
+
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return sampled content for ``asset_id``, serving repeats from the cache.

    Returns ``None`` when the asset id cannot be resolved to an object or when
    sampling yields nothing; only successful results are cached.
    """
    hit = self._content_cache.get(asset_id)
    if hit:
        return hit

    object_ref = self._parse_object_ref_from_asset_id(asset_id)
    content = self._sample_table_rows(object_ref) if object_ref else None
    if content is not None:
        self._content_cache[asset_id] = content
    return content
|
|
800
|
+
|
|
801
|
+
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield formatted content for ``asset_id`` one row at a time.

    For any sampling strategy other than ``ALL`` the sampled rows are fetched
    once and yielded row by row. For ``ALL`` the whole object is scanned in
    pages of ``rows_per_page`` (default 100):

    * tables whose primary-key columns are all among the selected columns are
      read with keyset pagination via ``_fetch_page_keyset``;
    * everything else is read by executing the query once and calling
      ``fetchmany`` repeatedly on the same cursor.

    The scan stops when ``self._aborted`` is set, when a page comes back empty
    or short, or when no column names are available. In the full-scan path the
    most recently yielded item is also stored in ``self._content_cache``.
    """
    sampling = self._sampling()
    object_ref = self._parse_object_ref_from_asset_id(asset_id)
    if not object_ref:
        return

    if sampling.strategy != SamplingStrategy.ALL:
        result = self._fetch_sample_rows(object_ref)
        if result is None:
            return
        rows, column_names = result
        for i, row in enumerate(rows):
            formatted = self._format_sample_content(
                object_ref, column_names, [row], row_offset=i
            )
            if formatted:
                yield formatted
        return

    columns = self._available_columns(object_ref)
    query, _ = self._build_sampling_query(object_ref, columns)
    rows_per_page = int(sampling.rows_per_page or 100)
    object_label = f"{object_ref.service_name}.{object_ref.schema}.{object_ref.name}"

    total_rows = self._count_table_rows(object_ref)
    total_batches = ((total_rows + rows_per_page - 1) // rows_per_page) if total_rows else None
    if total_rows is not None and total_batches is not None:
        logger.info(
            "Full scan %s: %d rows, %d batches of %d",
            object_label,
            total_rows,
            total_batches,
            rows_per_page,
        )

    # Prefer keyset pagination with a PK-ordered query.
    # Fall back to streaming fetchmany for objects without a usable primary key.
    pk_columns = (
        self._get_primary_key_columns(object_ref) if object_ref.object_type == "TABLE" else []
    )
    pk_indices: list[int] = []
    pk_order = ""  # always bound; only meaningful when use_keyset is True
    use_keyset = False
    if pk_columns:
        # Reuse the column list fetched above rather than querying it again.
        indices = [columns.index(col) for col in pk_columns if col in columns]
        # Keyset needs every key column in the result row to build the next bound.
        if len(indices) == len(pk_columns):
            pk_indices = indices
            pk_order = ", ".join(_quote_identifier(col) for col in pk_columns)
            use_keyset = True

    row_offset = 0
    page_num = 1
    last_pk_values: list[Any] | None = None

    conn = self._connect()
    cursor = None
    try:
        if not use_keyset:
            # Created inside the try so a failure here still closes the connection.
            cursor = conn.cursor()
            # Streaming path: execute once, fetchmany in a loop — no OFFSET cost.
            column_names = await asyncio.to_thread(self._cursor_execute, cursor, query)
            if not column_names:
                return

        while not self._aborted:
            if total_batches is not None:
                logger.info("%s batch %d/%d", object_label, page_num, total_batches)

            if use_keyset:
                rows, column_names = await asyncio.to_thread(
                    self._fetch_page_keyset,
                    conn,
                    query,
                    rows_per_page,
                    pk_columns,
                    pk_order,
                    last_pk_values,
                )
            else:
                rows = await asyncio.to_thread(self._cursor_fetchmany, cursor, rows_per_page)

            if not rows or not column_names:
                break

            # Yield each row individually so the consumer can start working on
            # rows before the next page is requested.
            for i, row in enumerate(rows):
                formatted = self._format_sample_content(
                    object_ref, column_names, [row], row_offset=row_offset + i
                )
                if formatted:
                    self._content_cache[asset_id] = formatted
                    yield formatted

            if use_keyset:
                # The key of the last row is the lower bound for the next page.
                last_row = rows[-1]
                last_pk_values = [last_row[index] for index in pk_indices]

            row_offset += len(rows)
            page_num += 1
            if len(rows) < rows_per_page:
                break
    finally:
        if cursor is not None:
            try:
                cursor.close()
            except Exception:
                # Best effort: a failing cursor close must not stop conn.close().
                logger.debug(
                    "Ignoring error while closing cursor for %s", object_label, exc_info=True
                )
        conn.close()
|
|
911
|
+
|
|
912
|
+
def _get_primary_key_columns(self, object_ref: ObjectRef) -> list[str]:
|
|
913
|
+
cache_key = (object_ref.schema, object_ref.name)
|
|
914
|
+
if cache_key in self._pk_columns_cache:
|
|
915
|
+
return self._pk_columns_cache[cache_key]
|
|
916
|
+
|
|
917
|
+
if object_ref.object_type == "VIEW":
|
|
918
|
+
self._pk_columns_cache[cache_key] = []
|
|
919
|
+
return []
|
|
920
|
+
|
|
921
|
+
try:
|
|
922
|
+
with closing(self._connect()) as conn:
|
|
923
|
+
with conn.cursor() as cursor:
|
|
924
|
+
cursor.execute(
|
|
925
|
+
"""
|
|
926
|
+
SELECT cols.column_name
|
|
927
|
+
FROM all_constraints cons
|
|
928
|
+
JOIN all_cons_columns cols
|
|
929
|
+
ON cons.owner = cols.owner
|
|
930
|
+
AND cons.constraint_name = cols.constraint_name
|
|
931
|
+
WHERE cons.constraint_type = 'P'
|
|
932
|
+
AND cons.owner = :owner
|
|
933
|
+
AND cons.table_name = :table_name
|
|
934
|
+
ORDER BY cols.position
|
|
935
|
+
""",
|
|
936
|
+
{
|
|
937
|
+
"owner": object_ref.schema,
|
|
938
|
+
"table_name": object_ref.name,
|
|
939
|
+
},
|
|
940
|
+
)
|
|
941
|
+
columns = [
|
|
942
|
+
row[0]
|
|
943
|
+
for row in cursor.fetchall()
|
|
944
|
+
if isinstance(row, tuple) and row and isinstance(row[0], str)
|
|
945
|
+
]
|
|
946
|
+
except Exception:
|
|
947
|
+
columns = []
|
|
948
|
+
|
|
949
|
+
self._pk_columns_cache[cache_key] = columns
|
|
950
|
+
return columns
|
|
951
|
+
|
|
952
|
+
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Attach a tabular location to ``finding`` for the object behind ``asset``.

    Does nothing when ``asset.hash`` is not present in ``self._table_lookup``.
    Otherwise ``finding.location`` is replaced with the result of
    ``build_tabular_location``, fed from the cached content for the asset (if
    any), the finding's matched content and metadata, and the table's
    primary-key columns (empty for anything that is not a ``TABLE``).
    """
    del text_content  # part of the signature, not read here
    table_ref = self._table_lookup.get(asset.hash)
    if not table_ref:
        return

    cache_entry = self._content_cache.get(asset.hash)
    details = finding.metadata or {}
    if table_ref.object_type == "TABLE":
        key_columns = self._get_primary_key_columns(table_ref)
    else:
        key_columns = []

    finding.location = build_tabular_location(
        raw_content=cache_entry[0] if cache_entry else None,
        matched_content=finding.matched_content,
        base_path=f"{table_ref.service_name}.{table_ref.schema}.{table_ref.name}",
        primary_key_columns=key_columns,
        row_index=details.get("tabular_row_index"),
        column_name=details.get("tabular_column_name"),
    )
|
|
979
|
+
|
|
980
|
+
def abort(self) -> None:
|
|
981
|
+
logger.info("Aborting Oracle extraction...")
|
|
982
|
+
super().abort()
|