classifyre-cli 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- classifyre_cli-0.4.2.dist-info/METADATA +167 -0
- classifyre_cli-0.4.2.dist-info/RECORD +101 -0
- classifyre_cli-0.4.2.dist-info/WHEEL +4 -0
- classifyre_cli-0.4.2.dist-info/entry_points.txt +2 -0
- src/__init__.py +1 -0
- src/detectors/__init__.py +105 -0
- src/detectors/base.py +97 -0
- src/detectors/broken_links/__init__.py +3 -0
- src/detectors/broken_links/detector.py +280 -0
- src/detectors/config.py +59 -0
- src/detectors/content/__init__.py +0 -0
- src/detectors/custom/__init__.py +13 -0
- src/detectors/custom/detector.py +45 -0
- src/detectors/custom/runners/__init__.py +56 -0
- src/detectors/custom/runners/_base.py +177 -0
- src/detectors/custom/runners/_factory.py +51 -0
- src/detectors/custom/runners/_feature_extraction.py +138 -0
- src/detectors/custom/runners/_gliner2.py +324 -0
- src/detectors/custom/runners/_image_classification.py +98 -0
- src/detectors/custom/runners/_llm.py +22 -0
- src/detectors/custom/runners/_object_detection.py +107 -0
- src/detectors/custom/runners/_regex.py +147 -0
- src/detectors/custom/runners/_text_classification.py +109 -0
- src/detectors/custom/trainer.py +293 -0
- src/detectors/dependencies.py +109 -0
- src/detectors/pii/__init__.py +0 -0
- src/detectors/pii/detector.py +883 -0
- src/detectors/secrets/__init__.py +0 -0
- src/detectors/secrets/detector.py +399 -0
- src/detectors/threat/__init__.py +0 -0
- src/detectors/threat/code_security_detector.py +206 -0
- src/detectors/threat/yara_detector.py +177 -0
- src/main.py +608 -0
- src/models/generated_detectors.py +1296 -0
- src/models/generated_input.py +2732 -0
- src/models/generated_single_asset_scan_results.py +240 -0
- src/outputs/__init__.py +3 -0
- src/outputs/base.py +69 -0
- src/outputs/console.py +62 -0
- src/outputs/factory.py +156 -0
- src/outputs/file.py +83 -0
- src/outputs/rest.py +258 -0
- src/pipeline/__init__.py +7 -0
- src/pipeline/content_provider.py +26 -0
- src/pipeline/detector_pipeline.py +742 -0
- src/pipeline/parsed_content_provider.py +59 -0
- src/sandbox/__init__.py +5 -0
- src/sandbox/runner.py +145 -0
- src/sources/__init__.py +95 -0
- src/sources/atlassian_common.py +389 -0
- src/sources/azure_blob_storage/__init__.py +3 -0
- src/sources/azure_blob_storage/source.py +130 -0
- src/sources/base.py +296 -0
- src/sources/confluence/__init__.py +3 -0
- src/sources/confluence/source.py +733 -0
- src/sources/databricks/__init__.py +3 -0
- src/sources/databricks/source.py +1279 -0
- src/sources/dependencies.py +81 -0
- src/sources/google_cloud_storage/__init__.py +3 -0
- src/sources/google_cloud_storage/source.py +114 -0
- src/sources/hive/__init__.py +3 -0
- src/sources/hive/source.py +709 -0
- src/sources/jira/__init__.py +3 -0
- src/sources/jira/source.py +605 -0
- src/sources/mongodb/__init__.py +3 -0
- src/sources/mongodb/source.py +550 -0
- src/sources/mssql/__init__.py +3 -0
- src/sources/mssql/source.py +1034 -0
- src/sources/mysql/__init__.py +3 -0
- src/sources/mysql/source.py +797 -0
- src/sources/neo4j/__init__.py +0 -0
- src/sources/neo4j/source.py +523 -0
- src/sources/object_storage/base.py +679 -0
- src/sources/oracle/__init__.py +3 -0
- src/sources/oracle/source.py +982 -0
- src/sources/postgresql/__init__.py +3 -0
- src/sources/postgresql/source.py +774 -0
- src/sources/powerbi/__init__.py +3 -0
- src/sources/powerbi/source.py +774 -0
- src/sources/recipe_normalizer.py +179 -0
- src/sources/s3_compatible_storage/README.md +66 -0
- src/sources/s3_compatible_storage/__init__.py +3 -0
- src/sources/s3_compatible_storage/source.py +150 -0
- src/sources/servicedesk/__init__.py +3 -0
- src/sources/servicedesk/source.py +620 -0
- src/sources/slack/__init__.py +3 -0
- src/sources/slack/source.py +534 -0
- src/sources/snowflake/__init__.py +3 -0
- src/sources/snowflake/source.py +912 -0
- src/sources/tableau/__init__.py +3 -0
- src/sources/tableau/source.py +799 -0
- src/sources/tabular_utils.py +165 -0
- src/sources/wordpress/__init__.py +3 -0
- src/sources/wordpress/source.py +590 -0
- src/telemetry.py +96 -0
- src/utils/__init__.py +1 -0
- src/utils/content_extraction.py +108 -0
- src/utils/file_parser.py +777 -0
- src/utils/hashing.py +82 -0
- src/utils/uv_sync.py +79 -0
- src/utils/validation.py +56 -0
|
@@ -0,0 +1,774 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import logging
|
|
5
|
+
from collections.abc import AsyncGenerator
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from datetime import UTC, datetime
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from ...models.generated_input import (
|
|
11
|
+
PostgreSQLInput,
|
|
12
|
+
PostgreSQLOptionalConnection,
|
|
13
|
+
PostgreSQLOptionalScope,
|
|
14
|
+
SamplingConfig,
|
|
15
|
+
SamplingStrategy,
|
|
16
|
+
)
|
|
17
|
+
from ...models.generated_single_asset_scan_results import (
|
|
18
|
+
AssetType as OutputAssetType,
|
|
19
|
+
)
|
|
20
|
+
from ...models.generated_single_asset_scan_results import (
|
|
21
|
+
DetectionResult,
|
|
22
|
+
SingleAssetScanResults,
|
|
23
|
+
)
|
|
24
|
+
from ...utils.hashing import hash_id, unhash_id
|
|
25
|
+
from ..base import BaseSource
|
|
26
|
+
from ..dependencies import require_module
|
|
27
|
+
from ..tabular_utils import build_tabular_location, format_tabular_sample_content
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
_DEFAULT_EXCLUDED_SCHEMAS = {"information_schema", "pg_catalog", "pg_toast"}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass(frozen=True)
|
|
35
|
+
class TableRef:
|
|
36
|
+
database: str
|
|
37
|
+
schema: str
|
|
38
|
+
table: str
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _quote_identifier(identifier: str) -> str:
|
|
42
|
+
return '"' + identifier.replace('"', '""') + '"'
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class PostgreSQLSource(BaseSource):
|
|
46
|
+
source_type = "postgresql"
|
|
47
|
+
|
|
48
|
+
def __init__(
    self,
    recipe: dict[str, Any],
    source_id: str | None = None,
    runner_id: str | None = None,
) -> None:
    """Validate the recipe, load the psycopg2 driver and create empty caches.

    Args:
        recipe: Raw source configuration; validated into ``PostgreSQLInput``.
        source_id: Identifier forwarded to the base class.
        runner_id: Identifier of the current run; ``"local-run"`` is used
            when none (or an empty value) is supplied.
    """
    super().__init__(recipe, source_id, runner_id)
    self.config = PostgreSQLInput.model_validate(recipe)
    self.runner_id = runner_id if runner_id else "local-run"

    # The driver is resolved through require_module rather than imported
    # at module level.
    driver_request = {
        "module_name": "psycopg2",
        "source_name": "PostgreSQL",
        "uv_groups": ["postgresql"],
        "detail": "The PostgreSQL connector is optional.",
    }
    self._psycopg2 = require_module(**driver_request)

    # asset hash -> table reference, filled while assets are extracted
    self._table_lookup: dict[str, TableRef] = {}
    # asset id -> most recently formatted content tuple
    self._content_cache: dict[str, tuple[str, str]] = {}
    # (database, schema, table) -> primary key column names
    self._pk_columns_cache: dict[tuple[str, str, str], list[str]] = {}
|
|
66
|
+
|
|
67
|
+
def _asset_type_value(self) -> str:
|
|
68
|
+
type_value = self.config.type
|
|
69
|
+
return type_value.value if hasattr(type_value, "value") else str(type_value)
|
|
70
|
+
|
|
71
|
+
def _sampling(self) -> SamplingConfig:
|
|
72
|
+
return self.config.sampling
|
|
73
|
+
|
|
74
|
+
def _connection_options(self) -> PostgreSQLOptionalConnection:
|
|
75
|
+
if self.config.optional and self.config.optional.connection:
|
|
76
|
+
return self.config.optional.connection
|
|
77
|
+
return PostgreSQLOptionalConnection()
|
|
78
|
+
|
|
79
|
+
def _scope_options(self) -> PostgreSQLOptionalScope:
|
|
80
|
+
if self.config.optional and self.config.optional.scope:
|
|
81
|
+
return self.config.optional.scope
|
|
82
|
+
return PostgreSQLOptionalScope()
|
|
83
|
+
|
|
84
|
+
def _username(self) -> str:
|
|
85
|
+
return self.config.masked.username
|
|
86
|
+
|
|
87
|
+
def _password(self) -> str:
|
|
88
|
+
return self.config.masked.password
|
|
89
|
+
|
|
90
|
+
def _connect(self, database: str):
|
|
91
|
+
connection_options = self._connection_options()
|
|
92
|
+
connect_kwargs = {
|
|
93
|
+
"host": self.config.required.host,
|
|
94
|
+
"port": int(self.config.required.port),
|
|
95
|
+
"user": self._username(),
|
|
96
|
+
"password": self._password(),
|
|
97
|
+
"dbname": database,
|
|
98
|
+
"connect_timeout": int(connection_options.connect_timeout_seconds or 30),
|
|
99
|
+
"sslmode": str(connection_options.ssl_mode or "prefer"),
|
|
100
|
+
}
|
|
101
|
+
connection = self._psycopg2.connect(**connect_kwargs)
|
|
102
|
+
connection.autocommit = True
|
|
103
|
+
return connection
|
|
104
|
+
|
|
105
|
+
def _resolve_databases(self) -> list[str]:
|
|
106
|
+
scope_options = self._scope_options()
|
|
107
|
+
include_all = bool(scope_options.include_all_databases)
|
|
108
|
+
configured_database = scope_options.database
|
|
109
|
+
|
|
110
|
+
if not include_all:
|
|
111
|
+
# Default to "postgres" maintenance database when no explicit database is configured,
|
|
112
|
+
# so that connection tests can proceed and report actual auth/connectivity errors.
|
|
113
|
+
return [configured_database or "postgres"]
|
|
114
|
+
|
|
115
|
+
maintenance_database = scope_options.maintenance_database or "postgres"
|
|
116
|
+
databases: list[str] = []
|
|
117
|
+
with self._connect(maintenance_database) as conn:
|
|
118
|
+
with conn.cursor() as cursor:
|
|
119
|
+
cursor.execute(
|
|
120
|
+
"""
|
|
121
|
+
SELECT datname
|
|
122
|
+
FROM pg_database
|
|
123
|
+
WHERE datistemplate = false
|
|
124
|
+
AND datallowconn = true
|
|
125
|
+
AND datname <> 'rdsadmin'
|
|
126
|
+
ORDER BY datname
|
|
127
|
+
"""
|
|
128
|
+
)
|
|
129
|
+
for (database_name,) in cursor.fetchall():
|
|
130
|
+
if isinstance(database_name, str) and database_name:
|
|
131
|
+
databases.append(database_name)
|
|
132
|
+
|
|
133
|
+
if configured_database and configured_database not in databases:
|
|
134
|
+
databases.insert(0, configured_database)
|
|
135
|
+
|
|
136
|
+
if not databases:
|
|
137
|
+
return [maintenance_database]
|
|
138
|
+
return databases
|
|
139
|
+
|
|
140
|
+
def _table_allowlist(self) -> set[str]:
|
|
141
|
+
allowlist: set[str] = set()
|
|
142
|
+
include_tables = self._scope_options().include_tables or []
|
|
143
|
+
for item in include_tables:
|
|
144
|
+
normalized = item.strip().lower()
|
|
145
|
+
if normalized:
|
|
146
|
+
allowlist.add(normalized)
|
|
147
|
+
return allowlist
|
|
148
|
+
|
|
149
|
+
def _schema_allowlist(self) -> set[str] | None:
|
|
150
|
+
include_schemas = self._scope_options().include_schemas
|
|
151
|
+
if not include_schemas:
|
|
152
|
+
return None
|
|
153
|
+
return {schema.strip() for schema in include_schemas if schema.strip()}
|
|
154
|
+
|
|
155
|
+
def _schema_denylist(self) -> set[str]:
|
|
156
|
+
configured = self._scope_options().exclude_schemas or []
|
|
157
|
+
denylist = {schema.strip() for schema in configured if schema.strip()}
|
|
158
|
+
if not denylist:
|
|
159
|
+
denylist = set(_DEFAULT_EXCLUDED_SCHEMAS)
|
|
160
|
+
return denylist
|
|
161
|
+
|
|
162
|
+
def _get_primary_key_columns(self, table_ref: TableRef) -> list[str]:
|
|
163
|
+
cache_key = (table_ref.database, table_ref.schema, table_ref.table)
|
|
164
|
+
if cache_key in self._pk_columns_cache:
|
|
165
|
+
return self._pk_columns_cache[cache_key]
|
|
166
|
+
try:
|
|
167
|
+
with self._connect(table_ref.database) as conn:
|
|
168
|
+
with conn.cursor() as cursor:
|
|
169
|
+
cursor.execute(
|
|
170
|
+
"""
|
|
171
|
+
SELECT kcu.column_name
|
|
172
|
+
FROM information_schema.table_constraints tc
|
|
173
|
+
JOIN information_schema.key_column_usage kcu
|
|
174
|
+
ON tc.constraint_name = kcu.constraint_name
|
|
175
|
+
AND tc.table_schema = kcu.table_schema
|
|
176
|
+
AND tc.table_name = kcu.table_name
|
|
177
|
+
WHERE tc.constraint_type = 'PRIMARY KEY'
|
|
178
|
+
AND tc.table_schema = %s
|
|
179
|
+
AND tc.table_name = %s
|
|
180
|
+
ORDER BY kcu.ordinal_position
|
|
181
|
+
""",
|
|
182
|
+
(table_ref.schema, table_ref.table),
|
|
183
|
+
)
|
|
184
|
+
cols = [row[0] for row in cursor.fetchall() if isinstance(row[0], str)]
|
|
185
|
+
except Exception:
|
|
186
|
+
cols = []
|
|
187
|
+
self._pk_columns_cache[cache_key] = cols
|
|
188
|
+
return cols
|
|
189
|
+
|
|
190
|
+
def _list_tables_for_database(self, database: str) -> list[TableRef]:
|
|
191
|
+
schema_allowlist = self._schema_allowlist()
|
|
192
|
+
schema_denylist = self._schema_denylist()
|
|
193
|
+
table_allowlist = self._table_allowlist()
|
|
194
|
+
table_limit = self._scope_options().table_limit
|
|
195
|
+
limit = int(table_limit) if table_limit else None
|
|
196
|
+
|
|
197
|
+
tables: list[TableRef] = []
|
|
198
|
+
with self._connect(database) as conn:
|
|
199
|
+
with conn.cursor() as cursor:
|
|
200
|
+
cursor.execute(
|
|
201
|
+
"""
|
|
202
|
+
SELECT table_schema, table_name
|
|
203
|
+
FROM information_schema.tables
|
|
204
|
+
WHERE table_type = 'BASE TABLE'
|
|
205
|
+
ORDER BY table_schema, table_name
|
|
206
|
+
"""
|
|
207
|
+
)
|
|
208
|
+
for schema_name, table_name in cursor.fetchall():
|
|
209
|
+
if not isinstance(schema_name, str) or not isinstance(table_name, str):
|
|
210
|
+
continue
|
|
211
|
+
if schema_name in schema_denylist:
|
|
212
|
+
continue
|
|
213
|
+
if schema_allowlist and schema_name not in schema_allowlist:
|
|
214
|
+
continue
|
|
215
|
+
|
|
216
|
+
schema_table = f"{schema_name}.{table_name}".lower()
|
|
217
|
+
db_schema_table = f"{database}.{schema_name}.{table_name}".lower()
|
|
218
|
+
if (
|
|
219
|
+
table_allowlist
|
|
220
|
+
and schema_table not in table_allowlist
|
|
221
|
+
and db_schema_table not in table_allowlist
|
|
222
|
+
):
|
|
223
|
+
continue
|
|
224
|
+
|
|
225
|
+
tables.append(TableRef(database=database, schema=schema_name, table=table_name))
|
|
226
|
+
if limit is not None and len(tables) >= limit:
|
|
227
|
+
break
|
|
228
|
+
return tables
|
|
229
|
+
|
|
230
|
+
def _iter_tables(self) -> list[TableRef]:
|
|
231
|
+
tables: list[TableRef] = []
|
|
232
|
+
for database in self._resolve_databases():
|
|
233
|
+
if self._aborted:
|
|
234
|
+
break
|
|
235
|
+
try:
|
|
236
|
+
tables.extend(self._list_tables_for_database(database))
|
|
237
|
+
except Exception as exc:
|
|
238
|
+
logger.warning("Skipping database %s due to listing error: %s", database, exc)
|
|
239
|
+
return tables
|
|
240
|
+
|
|
241
|
+
def test_connection(self) -> dict[str, Any]:
    """Check that the server is reachable by running ``SELECT 1``.

    Connects to the first database returned by ``_resolve_databases``.

    Returns:
        A dict with ``timestamp``, ``source_type``, ``status`` (``"SUCCESS"``
        or ``"FAILURE"``) and a human-readable ``message``. Exceptions are
        reported in the message instead of being raised.
    """
    logger.info("Testing connection to PostgreSQL...")
    result: dict[str, Any] = {
        "timestamp": datetime.now(UTC).isoformat(),
        "source_type": self.recipe.get("type"),
    }

    try:
        databases = self._resolve_databases()
        conn = self._connect(databases[0])
        try:
            with conn, conn.cursor() as cursor:
                cursor.execute("SELECT 1")
                cursor.fetchone()
        finally:
            # psycopg2's ``with connection`` only ends the transaction; the
            # connection has to be closed explicitly.
            conn.close()
        result["status"] = "SUCCESS"
        result["message"] = (
            f"Successfully connected to PostgreSQL. Reachable databases: {len(databases)}."
        )
    except Exception as exc:
        result["status"] = "FAILURE"
        result["message"] = f"Failed to connect to PostgreSQL: {exc}"

    return result
|
|
263
|
+
|
|
264
|
+
def _table_key(self, table_ref: TableRef) -> tuple[str, str, str]:
|
|
265
|
+
return (table_ref.database, table_ref.schema, table_ref.table)
|
|
266
|
+
|
|
267
|
+
def _table_raw_id(self, table_ref: TableRef) -> str:
|
|
268
|
+
return f"{table_ref.database}_#_{table_ref.schema}_#_{table_ref.table}"
|
|
269
|
+
|
|
270
|
+
def _collect_foreign_key_links(
|
|
271
|
+
self,
|
|
272
|
+
tables: list[TableRef],
|
|
273
|
+
) -> dict[tuple[str, str, str], set[tuple[str, str, str]]]:
|
|
274
|
+
by_database: dict[str, set[tuple[str, str, str]]] = {}
|
|
275
|
+
for table_ref in tables:
|
|
276
|
+
by_database.setdefault(table_ref.database, set()).add(self._table_key(table_ref))
|
|
277
|
+
|
|
278
|
+
links: dict[tuple[str, str, str], set[tuple[str, str, str]]] = {}
|
|
279
|
+
for database, scoped_keys in by_database.items():
|
|
280
|
+
try:
|
|
281
|
+
with self._connect(database) as conn:
|
|
282
|
+
with conn.cursor() as cursor:
|
|
283
|
+
cursor.execute(
|
|
284
|
+
"""
|
|
285
|
+
SELECT
|
|
286
|
+
source_ns.nspname AS source_schema,
|
|
287
|
+
source_tbl.relname AS source_table,
|
|
288
|
+
target_ns.nspname AS target_schema,
|
|
289
|
+
target_tbl.relname AS target_table
|
|
290
|
+
FROM pg_constraint AS fk
|
|
291
|
+
JOIN pg_class AS source_tbl
|
|
292
|
+
ON source_tbl.oid = fk.conrelid
|
|
293
|
+
JOIN pg_namespace AS source_ns
|
|
294
|
+
ON source_ns.oid = source_tbl.relnamespace
|
|
295
|
+
JOIN pg_class AS target_tbl
|
|
296
|
+
ON target_tbl.oid = fk.confrelid
|
|
297
|
+
JOIN pg_namespace AS target_ns
|
|
298
|
+
ON target_ns.oid = target_tbl.relnamespace
|
|
299
|
+
WHERE fk.contype = 'f'
|
|
300
|
+
"""
|
|
301
|
+
)
|
|
302
|
+
for (
|
|
303
|
+
source_schema,
|
|
304
|
+
source_table,
|
|
305
|
+
target_schema,
|
|
306
|
+
target_table,
|
|
307
|
+
) in cursor.fetchall():
|
|
308
|
+
source_key = (database, source_schema, source_table)
|
|
309
|
+
target_key = (database, target_schema, target_table)
|
|
310
|
+
if source_key not in scoped_keys or target_key not in scoped_keys:
|
|
311
|
+
continue
|
|
312
|
+
links.setdefault(source_key, set()).add(target_key)
|
|
313
|
+
except Exception as exc:
|
|
314
|
+
logger.warning(
|
|
315
|
+
"Could not resolve foreign key links for database %s: %s",
|
|
316
|
+
database,
|
|
317
|
+
exc,
|
|
318
|
+
)
|
|
319
|
+
|
|
320
|
+
return links
|
|
321
|
+
|
|
322
|
+
def _table_to_asset(
    self, table_ref: TableRef, *, links: list[str] | None = None
) -> SingleAssetScanResults:
    """Build the scan-result asset describing ``table_ref``.

    Args:
        table_ref: Table to describe.
        links: Hashes of related assets; an empty list is used when omitted.

    Returns:
        A ``SingleAssetScanResults`` of type ``TABLE`` whose checksum is
        computed from the table's database, schema, table name and the
        sampling strategy.
    """
    database, schema, table = table_ref.database, table_ref.schema, table_ref.table
    asset_hash = self.generate_hash_id(self._table_raw_id(table_ref))

    required = self.config.required
    external_url = f"postgresql://{required.host}:{required.port}/{database}/{schema}.{table}"

    checksum_input = {
        "database": database,
        "schema": schema,
        "table": table,
        "sampling": {"strategy": str(self._sampling().strategy)},
    }

    # The same instant is used for both timestamps.
    timestamp = datetime.now(UTC)
    return SingleAssetScanResults(
        hash=asset_hash,
        checksum=self.calculate_checksum(checksum_input),
        name=f"{database}.{schema}.{table}",
        external_url=external_url,
        links=links or [],
        asset_type=OutputAssetType.TABLE,
        source_id=self.source_id,
        created_at=timestamp,
        updated_at=timestamp,
        runner_id=self.runner_id,
    )
|
|
355
|
+
|
|
356
|
+
STREAM_DETECTIONS = True
|
|
357
|
+
|
|
358
|
+
async def extract_raw(self) -> AsyncGenerator[list[SingleAssetScanResults], None]:
    """Yield table assets in batches of at most ``BATCH_SIZE``.

    Each asset's ``links`` holds the hashes of in-scope tables it references
    through foreign keys, in sorted key order. Every yielded asset is also
    recorded in ``_table_lookup`` under its hash. If the source is aborted
    part-way through, the batch being built is dropped.
    """
    if self._aborted:
        return

    tables = self._iter_tables()
    hash_by_key: dict[tuple[str, str, str], str] = {}
    for ref in tables:
        hash_by_key[self._table_key(ref)] = self.generate_hash_id(self._table_raw_id(ref))
    fk_links = self._collect_foreign_key_links(tables)

    pending: list[SingleAssetScanResults] = []
    for ref in tables:
        if self._aborted:
            return

        targets = sorted(fk_links.get(self._table_key(ref), set()))
        link_hashes = [hash_by_key[target] for target in targets if target in hash_by_key]

        asset = self._table_to_asset(ref, links=link_hashes)
        self._table_lookup[asset.hash] = ref
        pending.append(asset)

        if len(pending) < self.BATCH_SIZE:
            continue
        yield pending
        pending = []

    if pending:
        yield pending
|
|
391
|
+
|
|
392
|
+
def generate_hash_id(self, asset_id: str) -> str:
    """Return the hash for ``asset_id``, computed by ``hash_id`` with this source's type value."""
    asset_kind = self._asset_type_value()
    return hash_id(asset_kind, asset_id)
|
|
394
|
+
|
|
395
|
+
def _parse_table_ref_from_asset_id(self, asset_id: str) -> TableRef | None:
|
|
396
|
+
if asset_id in self._table_lookup:
|
|
397
|
+
return self._table_lookup[asset_id]
|
|
398
|
+
|
|
399
|
+
decoded = asset_id
|
|
400
|
+
if "_#_" not in decoded:
|
|
401
|
+
try:
|
|
402
|
+
decoded = unhash_id(asset_id)
|
|
403
|
+
except Exception:
|
|
404
|
+
decoded = asset_id
|
|
405
|
+
|
|
406
|
+
parts = decoded.split("_#_")
|
|
407
|
+
if len(parts) >= 4 and parts[0].upper() == "POSTGRESQL":
|
|
408
|
+
return TableRef(database=parts[-3], schema=parts[-2], table=parts[-1])
|
|
409
|
+
if len(parts) >= 3:
|
|
410
|
+
return TableRef(database=parts[-3], schema=parts[-2], table=parts[-1])
|
|
411
|
+
return None
|
|
412
|
+
|
|
413
|
+
def _available_columns(self, table_ref: TableRef) -> list[str]:
|
|
414
|
+
with self._connect(table_ref.database) as conn:
|
|
415
|
+
with conn.cursor() as cursor:
|
|
416
|
+
cursor.execute(
|
|
417
|
+
"""
|
|
418
|
+
SELECT column_name
|
|
419
|
+
FROM information_schema.columns
|
|
420
|
+
WHERE table_schema = %s AND table_name = %s
|
|
421
|
+
ORDER BY ordinal_position
|
|
422
|
+
""",
|
|
423
|
+
(table_ref.schema, table_ref.table),
|
|
424
|
+
)
|
|
425
|
+
columns = [column for (column,) in cursor.fetchall() if isinstance(column, str)]
|
|
426
|
+
return columns
|
|
427
|
+
|
|
428
|
+
def _resolve_latest_order_column(self, columns: list[str]) -> str | None:
|
|
429
|
+
sampling = self._sampling()
|
|
430
|
+
configured = sampling.order_by_column
|
|
431
|
+
if configured and configured in columns:
|
|
432
|
+
return configured
|
|
433
|
+
|
|
434
|
+
priority_candidates = (
|
|
435
|
+
"updated_at",
|
|
436
|
+
"modified_at",
|
|
437
|
+
"created_at",
|
|
438
|
+
"inserted_at",
|
|
439
|
+
"timestamp",
|
|
440
|
+
"ts",
|
|
441
|
+
"date",
|
|
442
|
+
)
|
|
443
|
+
for candidate in priority_candidates:
|
|
444
|
+
if candidate in columns:
|
|
445
|
+
return candidate
|
|
446
|
+
return None
|
|
447
|
+
|
|
448
|
+
def _build_sampling_query(
|
|
449
|
+
self, table_ref: TableRef, columns: list[str]
|
|
450
|
+
) -> tuple[str, list[Any]]:
|
|
451
|
+
sampling = self._sampling()
|
|
452
|
+
if not columns:
|
|
453
|
+
raise ValueError(
|
|
454
|
+
f"Table {table_ref.database}.{table_ref.schema}.{table_ref.table} has no readable columns"
|
|
455
|
+
)
|
|
456
|
+
|
|
457
|
+
quoted_columns = ", ".join(_quote_identifier(column) for column in columns)
|
|
458
|
+
query = (
|
|
459
|
+
f"SELECT {quoted_columns} FROM "
|
|
460
|
+
f"{_quote_identifier(table_ref.schema)}.{_quote_identifier(table_ref.table)}"
|
|
461
|
+
)
|
|
462
|
+
|
|
463
|
+
strategy = sampling.strategy
|
|
464
|
+
if strategy == SamplingStrategy.LATEST:
|
|
465
|
+
order_column = self._resolve_latest_order_column(columns)
|
|
466
|
+
if order_column:
|
|
467
|
+
query += f" ORDER BY {_quote_identifier(order_column)} DESC NULLS LAST"
|
|
468
|
+
elif sampling.fallback_to_random is not False:
|
|
469
|
+
query += " ORDER BY RANDOM()"
|
|
470
|
+
elif strategy == SamplingStrategy.RANDOM:
|
|
471
|
+
query += " ORDER BY RANDOM()"
|
|
472
|
+
# SamplingStrategy.ALL: no ORDER BY, no LIMIT — paginated by fetch_content_pages
|
|
473
|
+
|
|
474
|
+
if strategy != SamplingStrategy.ALL:
|
|
475
|
+
query += " LIMIT %s"
|
|
476
|
+
return query, [int(sampling.rows_per_page or 100)]
|
|
477
|
+
|
|
478
|
+
return query, []
|
|
479
|
+
|
|
480
|
+
def _count_table_rows(self, table_ref: TableRef) -> int | None:
    """Return ``COUNT(*)`` for ``table_ref``, or ``None`` if it cannot be read.

    The count is informational, so any failure is logged at debug level and
    reported as ``None`` instead of being raised.
    """
    try:
        conn = self._connect(table_ref.database)
        try:
            with conn, conn.cursor() as cursor:
                cursor.execute(
                    f"SELECT COUNT(*) FROM {_quote_identifier(table_ref.schema)}.{_quote_identifier(table_ref.table)}"
                )
                row = cursor.fetchone()
                return int(row[0]) if row else None
        finally:
            # psycopg2's ``with connection`` only ends the transaction; the
            # connection has to be closed explicitly.
            conn.close()
    except Exception as exc:
        logger.debug(
            "Could not count rows of %s.%s.%s: %s",
            table_ref.database,
            table_ref.schema,
            table_ref.table,
            exc,
        )
        return None
|
|
491
|
+
|
|
492
|
+
def _serialize_cell(self, value: Any) -> str:
|
|
493
|
+
if value is None:
|
|
494
|
+
return "null"
|
|
495
|
+
if isinstance(value, memoryview):
|
|
496
|
+
value = value.tobytes()
|
|
497
|
+
if isinstance(value, (bytes, bytearray)):
|
|
498
|
+
return f"<{len(value)} bytes>"
|
|
499
|
+
if isinstance(value, datetime):
|
|
500
|
+
return value.isoformat()
|
|
501
|
+
return str(value)
|
|
502
|
+
|
|
503
|
+
def _format_sample_content(
    self,
    table_ref: TableRef,
    column_names: list[str],
    rows: list[tuple[Any, ...]],
    row_offset: int = 0,
) -> tuple[str, str]:
    """Format ``rows`` through the shared tabular formatter.

    Delegates to ``format_tabular_sample_content`` with the table's
    qualified name as the scope, the configured sampling strategy, and
    ``_serialize_cell`` as the cell serializer. Column names are included
    unless ``include_column_names`` is explicitly False.
    """
    sampling = self._sampling()
    database, schema, table = table_ref.database, table_ref.schema, table_ref.table
    table_metadata = {
        "database": database,
        "schema": schema,
        "table": table,
    }
    return format_tabular_sample_content(
        scope_label="table",
        scope_value=f"{database}.{schema}.{table}",
        strategy=sampling.strategy,
        rows=rows,
        column_names=column_names,
        serialize_cell=self._serialize_cell,
        include_column_names=sampling.include_column_names is not False,
        raw_metadata=table_metadata,
        row_offset=row_offset,
    )
|
|
526
|
+
|
|
527
|
+
def _fetch_one_page(
|
|
528
|
+
self, table_ref: TableRef, base_query: str, page_size: int, offset: int
|
|
529
|
+
) -> tuple[list[tuple[Any, ...]], list[str]]:
|
|
530
|
+
with self._connect(table_ref.database) as conn:
|
|
531
|
+
paginated_query = f"{base_query} LIMIT %s OFFSET %s"
|
|
532
|
+
with conn.cursor() as cursor:
|
|
533
|
+
cursor.execute(paginated_query, [page_size, offset])
|
|
534
|
+
rows = list(cursor.fetchall())
|
|
535
|
+
column_names = (
|
|
536
|
+
[desc[0] for desc in cursor.description] if cursor.description else []
|
|
537
|
+
)
|
|
538
|
+
return rows, column_names
|
|
539
|
+
|
|
540
|
+
def _fetch_one_page_on_conn(
|
|
541
|
+
self,
|
|
542
|
+
conn: Any,
|
|
543
|
+
base_query: str,
|
|
544
|
+
page_size: int,
|
|
545
|
+
offset: int,
|
|
546
|
+
) -> tuple[list[tuple[Any, ...]], list[str]]:
|
|
547
|
+
paginated_query = f"{base_query} LIMIT %s OFFSET %s"
|
|
548
|
+
with conn.cursor() as cursor:
|
|
549
|
+
cursor.execute(paginated_query, [page_size, offset])
|
|
550
|
+
rows = list(cursor.fetchall())
|
|
551
|
+
column_names = [desc[0] for desc in cursor.description] if cursor.description else []
|
|
552
|
+
return rows, column_names
|
|
553
|
+
|
|
554
|
+
@staticmethod
|
|
555
|
+
def _cursor_execute(cursor: Any, query: str) -> list[str]:
|
|
556
|
+
cursor.execute(query)
|
|
557
|
+
return [desc[0] for desc in cursor.description] if cursor.description else []
|
|
558
|
+
|
|
559
|
+
@staticmethod
|
|
560
|
+
def _cursor_fetchmany(cursor: Any, size: int) -> list[tuple[Any, ...]]:
|
|
561
|
+
return list(cursor.fetchmany(size))
|
|
562
|
+
|
|
563
|
+
def _fetch_page_keyset(
|
|
564
|
+
self,
|
|
565
|
+
conn: Any,
|
|
566
|
+
base_query: str,
|
|
567
|
+
page_size: int,
|
|
568
|
+
pk_columns: list[str],
|
|
569
|
+
pk_order: str,
|
|
570
|
+
last_pk_values: list[Any] | None,
|
|
571
|
+
) -> tuple[list[tuple[Any, ...]], list[str]]:
|
|
572
|
+
"""Fetch one page using keyset pagination — O(1) cost at any offset."""
|
|
573
|
+
params: list[Any]
|
|
574
|
+
if last_pk_values is None:
|
|
575
|
+
paginated_query = f"{base_query} ORDER BY {pk_order} LIMIT %s"
|
|
576
|
+
params = [page_size]
|
|
577
|
+
elif len(pk_columns) == 1:
|
|
578
|
+
where = f"WHERE {_quote_identifier(pk_columns[0])} > %s"
|
|
579
|
+
paginated_query = f"{base_query} {where} ORDER BY {pk_order} LIMIT %s"
|
|
580
|
+
params = [last_pk_values[0], page_size]
|
|
581
|
+
else:
|
|
582
|
+
pk_cols_quoted = ", ".join(_quote_identifier(col) for col in pk_columns)
|
|
583
|
+
placeholders = ", ".join("%s" for _ in pk_columns)
|
|
584
|
+
where = f"WHERE ({pk_cols_quoted}) > ({placeholders})"
|
|
585
|
+
paginated_query = f"{base_query} {where} ORDER BY {pk_order} LIMIT %s"
|
|
586
|
+
params = [*last_pk_values, page_size]
|
|
587
|
+
|
|
588
|
+
with conn.cursor() as cursor:
|
|
589
|
+
cursor.execute(paginated_query, params)
|
|
590
|
+
rows = list(cursor.fetchall())
|
|
591
|
+
column_names = [desc[0] for desc in cursor.description] if cursor.description else []
|
|
592
|
+
return rows, column_names
|
|
593
|
+
|
|
594
|
+
def _fetch_sample_rows(
    self, table_ref: TableRef
) -> tuple[list[tuple[Any, ...]], list[str]] | None:
    """Fetch one sample of rows from ``table_ref`` per the sampling config.

    For the ALL strategy only the first page (``rows_per_page`` rows,
    default 100) is fetched; other strategies run the sampling query as
    built, which already carries its own LIMIT.

    Returns:
        ``(rows, column_names)``, or ``None`` when the query reports no
        columns.
    """
    columns = self._available_columns(table_ref)
    sampling = self._sampling()
    query, params = self._build_sampling_query(table_ref, columns)

    if sampling.strategy == SamplingStrategy.ALL:
        rows_per_page = int(sampling.rows_per_page or 100)
        rows, column_names = self._fetch_one_page(table_ref, query, rows_per_page, 0)
    else:
        conn = self._connect(table_ref.database)
        try:
            with conn, conn.cursor() as cursor:
                cursor.execute(query, params if params else None)
                rows = cursor.fetchall()
                column_names = [desc[0] for desc in cursor.description or []]
        finally:
            # psycopg2's ``with connection`` only ends the transaction; the
            # connection has to be closed explicitly.
            conn.close()

    if not column_names:
        return None
    return rows, column_names
|
|
614
|
+
|
|
615
|
+
def _sample_table_rows(self, table_ref: TableRef) -> tuple[str, str] | None:
|
|
616
|
+
result = self._fetch_sample_rows(table_ref)
|
|
617
|
+
if result is None:
|
|
618
|
+
return None
|
|
619
|
+
rows, column_names = result
|
|
620
|
+
return self._format_sample_content(table_ref, column_names, rows)
|
|
621
|
+
|
|
622
|
+
async def fetch_content(self, asset_id: str) -> tuple[str, str] | None:
    """Return sampled, formatted content for ``asset_id``.

    A previously cached result is returned as-is. Otherwise the table is
    resolved from the asset id, sampled, and the result is cached.

    Returns:
        The formatted content tuple, or ``None`` when the asset id cannot be
        resolved to a table or no sample could be produced.
    """
    hit = self._content_cache.get(asset_id)
    if hit:
        return hit

    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    content = self._sample_table_rows(table_ref) if table_ref else None
    if content is None:
        return None

    self._content_cache[asset_id] = content
    return content
|
|
638
|
+
|
|
639
|
+
async def fetch_content_pages(self, asset_id: str) -> AsyncGenerator[tuple[str, str], None]:
    """Yield the formatted content of a table one row at a time.

    For sampling strategies other than ALL, a single sample is fetched and
    each of its rows is yielded. For ALL, the whole table is read in pages
    of ``rows_per_page`` (default 100) on one connection:

    * with a primary key whose columns are all selected, pages are fetched
      by keyset pagination ordered by that key;
    * otherwise the query is executed once and rows are pulled with
      ``fetchmany``.

    In the ALL path, ``_content_cache[asset_id]`` is overwritten with each
    row's formatted content just before it is yielded. Nothing is yielded
    when the asset id cannot be resolved to a table. The loop stops when the
    source is aborted.
    """
    sampling = self._sampling()
    table_ref = self._parse_table_ref_from_asset_id(asset_id)
    if not table_ref:
        return

    if sampling.strategy != SamplingStrategy.ALL:
        result = self._fetch_sample_rows(table_ref)
        if result is None:
            return
        rows, column_names = result
        for i, row in enumerate(rows):
            formatted = self._format_sample_content(
                table_ref, column_names, [row], row_offset=i
            )
            if formatted:
                yield formatted
        return

    columns = self._available_columns(table_ref)
    query, _ = self._build_sampling_query(table_ref, columns)
    rows_per_page = int(sampling.rows_per_page or 100)
    table_label = f"{table_ref.database}.{table_ref.schema}.{table_ref.table}"

    total_rows = self._count_table_rows(table_ref)
    total_batches = ((total_rows + rows_per_page - 1) // rows_per_page) if total_rows else None
    if total_batches is not None:
        logger.info(
            "Full scan %s: %d rows, %d batches of %d",
            table_label,
            total_rows,
            total_batches,
            rows_per_page,
        )

    # Prefer keyset pagination with a PK-ordered query; fall back to a
    # single execute + fetchmany loop for tables without a usable key.
    pk_columns = self._get_primary_key_columns(table_ref)
    pk_indices: list[int] = []
    pk_order = ""
    use_keyset = False
    if pk_columns and all(col in columns for col in pk_columns):
        # Positions are taken from the same list the SELECT was built from,
        # so they index directly into each fetched row.
        pk_indices = [columns.index(col) for col in pk_columns]
        pk_order = ", ".join(_quote_identifier(col) for col in pk_columns)
        use_keyset = True

    row_offset = 0
    page_num = 1
    last_pk_values: list[Any] | None = None
    column_names: list[str] = []

    conn = self._connect(table_ref.database)
    cursor = None
    try:
        if not use_keyset:
            # Created inside the try so the connection is still closed if
            # cursor creation fails.
            cursor = conn.cursor()
            # NOTE(review): a default (unnamed) psycopg2 cursor transfers
            # the whole result set to the client on execute, so memory use
            # here grows with table size — confirm this is acceptable.
            column_names = await asyncio.to_thread(self._cursor_execute, cursor, query)
            if not column_names:
                return

        while not self._aborted:
            if total_batches is not None:
                logger.info("%s batch %d/%d", table_label, page_num, total_batches)

            if use_keyset:
                rows, column_names = await asyncio.to_thread(
                    self._fetch_page_keyset,
                    conn,
                    query,
                    rows_per_page,
                    pk_columns,
                    pk_order,
                    last_pk_values,
                )
            else:
                rows = await asyncio.to_thread(self._cursor_fetchmany, cursor, rows_per_page)

            if not rows or not column_names:
                break

            # Yield each row individually so the detection pipeline can start
            # processing rows while the next page is being fetched in a thread.
            for i, row in enumerate(rows):
                formatted = self._format_sample_content(
                    table_ref, column_names, [row], row_offset=row_offset + i
                )
                if formatted:
                    self._content_cache[asset_id] = formatted
                    yield formatted

            if use_keyset:
                last_row = rows[-1]
                last_pk_values = [last_row[index] for index in pk_indices]

            row_offset += len(rows)
            page_num += 1
            # A short page means the table is exhausted.
            if len(rows) < rows_per_page:
                break
    finally:
        if cursor is not None:
            try:
                cursor.close()
            except Exception as exc:
                # Best effort: the connection is closed right after anyway.
                logger.debug("Ignoring cursor close error for %s: %s", table_label, exc)
        conn.close()
|
|
747
|
+
|
|
748
|
+
def enrich_finding_location(
    self,
    finding: DetectionResult,
    asset: SingleAssetScanResults,
    text_content: str,
) -> None:
    """Set ``finding.location`` for a finding detected in a table asset.

    Does nothing when the asset's hash is not in ``_table_lookup``.
    Otherwise the location is built by ``build_tabular_location`` from the
    cached raw content for the asset (if any), the table's ``schema.table``
    path, its primary key columns, and the ``tabular_row_index`` /
    ``tabular_column_name`` entries of the finding's metadata.
    ``text_content`` is not used.
    """
    del text_content
    table_ref = self._table_lookup.get(asset.hash)
    if not table_ref:
        return

    cache_entry = self._content_cache.get(asset.hash)
    finding_metadata = finding.metadata or {}
    finding.location = build_tabular_location(
        raw_content=cache_entry[0] if cache_entry else None,
        matched_content=finding.matched_content,
        base_path=f"{table_ref.schema}.{table_ref.table}",
        primary_key_columns=self._get_primary_key_columns(table_ref),
        row_index=finding_metadata.get("tabular_row_index"),
        column_name=finding_metadata.get("tabular_column_name"),
    )
|
|
771
|
+
|
|
772
|
+
def abort(self) -> None:
    """Log the abort request, then delegate to the base class ``abort``."""
    logger.info("Aborting PostgreSQL extraction...")
    super().abort()
|