ethyca-fides 2.63.0rc2__py2.py3-none-any.whl → 2.63.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- {ethyca_fides-2.63.0rc2.dist-info → ethyca_fides-2.63.1.dist-info}/METADATA +1 -1
- {ethyca_fides-2.63.0rc2.dist-info → ethyca_fides-2.63.1.dist-info}/RECORD +131 -112
- fides/_version.py +3 -3
- fides/api/alembic/migrations/versions/29e56fa1fdb3_add_monitor_tasks.py +147 -0
- fides/api/alembic/migrations/versions/5efcdf18438e_add_manual_task_tables.py +160 -0
- fides/api/alembic/migrations/versions/bf713b5a021d_staged_resource_ancestor_link_data_.py +20 -11
- fides/api/api/v1/endpoints/privacy_request_endpoints.py +4 -4
- fides/api/db/base.py +7 -1
- fides/api/migrations/post_upgrade_index_creation.py +3 -3
- fides/api/models/connectionconfig.py +1 -1
- fides/api/models/detection_discovery/__init__.py +35 -0
- fides/api/models/detection_discovery/monitor_task.py +162 -0
- fides/api/models/field_types/__init__.py +5 -0
- fides/api/models/field_types/encrypted_large_data.py +151 -0
- fides/api/models/manual_tasks/__init__.py +8 -0
- fides/api/models/manual_tasks/manual_task.py +110 -0
- fides/api/models/manual_tasks/manual_task_log.py +100 -0
- fides/api/models/privacy_preference.py +1 -1
- fides/api/models/privacy_request/execution_log.py +3 -31
- fides/api/models/privacy_request/privacy_request.py +16 -3
- fides/api/models/privacy_request/request_task.py +36 -25
- fides/api/models/worker_task.py +96 -0
- fides/api/schemas/external_storage.py +22 -0
- fides/api/schemas/manual_tasks/__init__.py +0 -0
- fides/api/schemas/manual_tasks/manual_task_schemas.py +79 -0
- fides/api/schemas/manual_tasks/manual_task_status.py +151 -0
- fides/api/schemas/privacy_request.py +1 -12
- fides/api/service/connectors/base_erasure_email_connector.py +1 -1
- fides/api/service/connectors/consent_email_connector.py +2 -1
- fides/api/service/connectors/dynamic_erasure_email_connector.py +2 -1
- fides/api/service/connectors/erasure_email_connector.py +1 -1
- fides/api/service/external_data_storage.py +371 -0
- fides/api/service/privacy_request/request_runner_service.py +5 -5
- fides/api/service/privacy_request/request_service.py +1 -1
- fides/api/task/create_request_tasks.py +1 -1
- fides/api/task/execute_request_tasks.py +9 -8
- fides/api/task/graph_task.py +22 -10
- fides/api/util/consent_util.py +1 -1
- fides/api/util/data_size.py +102 -0
- fides/api/util/encryption/aes_gcm_encryption_util.py +271 -0
- fides/service/manual_tasks/__init__.py +0 -0
- fides/service/manual_tasks/manual_task_service.py +150 -0
- fides/service/privacy_request/privacy_request_service.py +1 -1
- fides/ui-build/static/admin/404.html +1 -1
- fides/ui-build/static/admin/add-systems/manual.html +1 -1
- fides/ui-build/static/admin/add-systems/multiple.html +1 -1
- fides/ui-build/static/admin/add-systems.html +1 -1
- fides/ui-build/static/admin/consent/configure/add-vendors.html +1 -1
- fides/ui-build/static/admin/consent/configure.html +1 -1
- fides/ui-build/static/admin/consent/privacy-experience/[id].html +1 -1
- fides/ui-build/static/admin/consent/privacy-experience/new.html +1 -1
- fides/ui-build/static/admin/consent/privacy-experience.html +1 -1
- fides/ui-build/static/admin/consent/privacy-notices/[id].html +1 -1
- fides/ui-build/static/admin/consent/privacy-notices/new.html +1 -1
- fides/ui-build/static/admin/consent/privacy-notices.html +1 -1
- fides/ui-build/static/admin/consent/properties.html +1 -1
- fides/ui-build/static/admin/consent/reporting.html +1 -1
- fides/ui-build/static/admin/consent.html +1 -1
- fides/ui-build/static/admin/data-catalog/[systemId]/projects/[projectUrn]/[resourceUrn].html +1 -1
- fides/ui-build/static/admin/data-catalog/[systemId]/projects/[projectUrn].html +1 -1
- fides/ui-build/static/admin/data-catalog/[systemId]/projects.html +1 -1
- fides/ui-build/static/admin/data-catalog/[systemId]/resources/[resourceUrn].html +1 -1
- fides/ui-build/static/admin/data-catalog/[systemId]/resources.html +1 -1
- fides/ui-build/static/admin/data-catalog.html +1 -1
- fides/ui-build/static/admin/data-discovery/action-center/[monitorId]/[systemId].html +1 -1
- fides/ui-build/static/admin/data-discovery/action-center/[monitorId].html +1 -1
- fides/ui-build/static/admin/data-discovery/action-center.html +1 -1
- fides/ui-build/static/admin/data-discovery/activity.html +1 -1
- fides/ui-build/static/admin/data-discovery/detection/[resourceUrn].html +1 -1
- fides/ui-build/static/admin/data-discovery/detection.html +1 -1
- fides/ui-build/static/admin/data-discovery/discovery/[resourceUrn].html +1 -1
- fides/ui-build/static/admin/data-discovery/discovery.html +1 -1
- fides/ui-build/static/admin/datamap.html +1 -1
- fides/ui-build/static/admin/dataset/[datasetId]/[collectionName]/[...subfieldNames].html +1 -1
- fides/ui-build/static/admin/dataset/[datasetId]/[collectionName].html +1 -1
- fides/ui-build/static/admin/dataset/[datasetId].html +1 -1
- fides/ui-build/static/admin/dataset/new.html +1 -1
- fides/ui-build/static/admin/dataset.html +1 -1
- fides/ui-build/static/admin/datastore-connection/[id].html +1 -1
- fides/ui-build/static/admin/datastore-connection/new.html +1 -1
- fides/ui-build/static/admin/datastore-connection.html +1 -1
- fides/ui-build/static/admin/index.html +1 -1
- fides/ui-build/static/admin/integrations/[id].html +1 -1
- fides/ui-build/static/admin/integrations.html +1 -1
- fides/ui-build/static/admin/login/[provider].html +1 -1
- fides/ui-build/static/admin/login.html +1 -1
- fides/ui-build/static/admin/messaging/[id].html +1 -1
- fides/ui-build/static/admin/messaging/add-template.html +1 -1
- fides/ui-build/static/admin/messaging.html +1 -1
- fides/ui-build/static/admin/poc/ant-components.html +1 -1
- fides/ui-build/static/admin/poc/form-experiments/AntForm.html +1 -1
- fides/ui-build/static/admin/poc/form-experiments/FormikAntFormItem.html +1 -1
- fides/ui-build/static/admin/poc/form-experiments/FormikControlled.html +1 -1
- fides/ui-build/static/admin/poc/form-experiments/FormikField.html +1 -1
- fides/ui-build/static/admin/poc/form-experiments/FormikSpreadField.html +1 -1
- fides/ui-build/static/admin/poc/forms.html +1 -1
- fides/ui-build/static/admin/poc/table-migration.html +1 -1
- fides/ui-build/static/admin/privacy-requests/[id].html +1 -1
- fides/ui-build/static/admin/privacy-requests/configure/messaging.html +1 -1
- fides/ui-build/static/admin/privacy-requests/configure/storage.html +1 -1
- fides/ui-build/static/admin/privacy-requests/configure.html +1 -1
- fides/ui-build/static/admin/privacy-requests.html +1 -1
- fides/ui-build/static/admin/properties/[id].html +1 -1
- fides/ui-build/static/admin/properties/add-property.html +1 -1
- fides/ui-build/static/admin/properties.html +1 -1
- fides/ui-build/static/admin/reporting/datamap.html +1 -1
- fides/ui-build/static/admin/settings/about/alpha.html +1 -1
- fides/ui-build/static/admin/settings/about.html +1 -1
- fides/ui-build/static/admin/settings/consent/[configuration_id]/[purpose_id].html +1 -1
- fides/ui-build/static/admin/settings/consent.html +1 -1
- fides/ui-build/static/admin/settings/custom-fields.html +1 -1
- fides/ui-build/static/admin/settings/domain-records.html +1 -1
- fides/ui-build/static/admin/settings/domains.html +1 -1
- fides/ui-build/static/admin/settings/email-templates.html +1 -1
- fides/ui-build/static/admin/settings/locations.html +1 -1
- fides/ui-build/static/admin/settings/organization.html +1 -1
- fides/ui-build/static/admin/settings/regulations.html +1 -1
- fides/ui-build/static/admin/systems/configure/[id]/test-datasets.html +1 -1
- fides/ui-build/static/admin/systems/configure/[id].html +1 -1
- fides/ui-build/static/admin/systems.html +1 -1
- fides/ui-build/static/admin/taxonomy.html +1 -1
- fides/ui-build/static/admin/user-management/new.html +1 -1
- fides/ui-build/static/admin/user-management/profile/[id].html +1 -1
- fides/ui-build/static/admin/user-management.html +1 -1
- {ethyca_fides-2.63.0rc2.dist-info → ethyca_fides-2.63.1.dist-info}/WHEEL +0 -0
- {ethyca_fides-2.63.0rc2.dist-info → ethyca_fides-2.63.1.dist-info}/entry_points.txt +0 -0
- {ethyca_fides-2.63.0rc2.dist-info → ethyca_fides-2.63.1.dist-info}/licenses/LICENSE +0 -0
- {ethyca_fides-2.63.0rc2.dist-info → ethyca_fides-2.63.1.dist-info}/top_level.txt +0 -0
- /fides/api/models/{detection_discovery.py → detection_discovery/core.py} +0 -0
- /fides/ui-build/static/admin/_next/static/{Fb70i-8GI-owNAvgEJWhA → SZn_Fpr_qG1COMjkdloep}/_buildManifest.js +0 -0
- /fides/ui-build/static/admin/_next/static/{Fb70i-8GI-owNAvgEJWhA → SZn_Fpr_qG1COMjkdloep}/_ssgManifest.js +0 -0
fides/api/service/external_data_storage.py
ADDED
@@ -0,0 +1,371 @@
+"""
+Service for handling external storage of large encrypted data.
+
+This service provides a generic interface for storing large data that would
+otherwise exceed database column size limits or impact performance.
+"""
+
+import os
+from io import BytesIO
+from typing import Any, Optional
+
+from loguru import logger
+from sqlalchemy.orm import Session
+
+from fides.api.models.storage import StorageConfig, get_active_default_storage_config
+from fides.api.schemas.external_storage import ExternalStorageMetadata
+from fides.api.schemas.storage.storage import StorageDetails, StorageType
+from fides.api.service.storage.gcs import get_gcs_client
+from fides.api.service.storage.s3 import generic_delete_from_s3, generic_upload_to_s3
+from fides.api.service.storage.util import get_local_filename
+from fides.api.util.aws_util import get_s3_client
+from fides.api.util.encryption.aes_gcm_encryption_util import decrypt_data, encrypt_data
+
+
+class ExternalDataStorageError(Exception):
+    """Raised when external data storage operations fail."""
+
+
+class ExternalDataStorageService:
+    """
+    Service for storing large encrypted data externally.
+
+    Handles:
+    - Automatic encryption/decryption
+    - Multiple storage backends (S3, local, GCS, etc.)
+    - Consistent file organization
+    - Cleanup operations
+    """
+
+    @staticmethod
+    def _get_storage_config(db: Session, storage_key: Optional[str]) -> "StorageConfig":
+        """Resolve and return the StorageConfig to use.
+
+        Preference order:
+
+        1. If *storage_key* is provided, fetch that specific configuration.
+        2. Otherwise, fall back to the *active* default storage configuration.
+
+        Raises ExternalDataStorageError when no suitable configuration is found.
+        """
+
+        if storage_key:
+            storage_config = (
+                db.query(StorageConfig).filter(StorageConfig.key == storage_key).first()
+            )
+            if not storage_config:
+                msg = f"Storage configuration with key '{storage_key}' not found"
+                logger.error(msg)
+                raise ExternalDataStorageError(msg)
+            return storage_config
+
+        # No explicit key – use the active default
+        storage_config = get_active_default_storage_config(db)
+        if not storage_config:
+            msg = "No active default storage configuration available for large data"
+            logger.error(msg)
+            raise ExternalDataStorageError(msg)
+
+        return storage_config
+
+    @staticmethod
+    def store_data(
+        db: Session,
+        storage_path: str,
+        data: Any,
+        storage_key: Optional[str] = None,
+    ) -> ExternalStorageMetadata:
+        """
+        Store data in external storage with encryption.
+
+        Args:
+            db: Database session
+            storage_path: Path where data should be stored (e.g., "model/id/field/timestamp")
+            data: The data to store (will be serialized and encrypted)
+            storage_key: Optional specific storage config key to use
+
+        Returns:
+            ExternalStorageMetadata with storage details
+
+        Raises:
+            ExternalDataStorageError: If storage operation fails
+        """
+        try:
+            storage_config = ExternalDataStorageService._get_storage_config(
+                db, storage_key
+            )
+
+            # Serialize and encrypt the data
+            encrypted_data = encrypt_data(data)
+            file_size = len(encrypted_data)
+
+            # Store to external storage based on type
+            if storage_config.type == StorageType.s3:
+                ExternalDataStorageService._store_to_s3(
+                    storage_config, storage_path, encrypted_data
+                )
+            elif storage_config.type == StorageType.gcs:
+                ExternalDataStorageService._store_to_gcs(
+                    storage_config, storage_path, encrypted_data
+                )
+            elif storage_config.type == StorageType.local:
+                ExternalDataStorageService._store_to_local(storage_path, encrypted_data)
+            else:
+                raise ExternalDataStorageError(
+                    f"Unsupported storage type: {storage_config.type}"
+                )
+
+            # Create and return metadata
+            metadata = ExternalStorageMetadata(
+                storage_type=StorageType(storage_config.type.value),
+                file_key=storage_path,
+                filesize=file_size,
+                storage_key=storage_config.key,
+            )
+
+            logger.info(
+                f"Stored {file_size:,} bytes to {storage_config.type} storage "
+                f"at path: {storage_path}"
+            )
+
+            return metadata
+
+        except Exception as e:
+            logger.error(f"Failed to store data externally: {str(e)}")
+            raise ExternalDataStorageError(f"Failed to store data: {str(e)}") from e
+
+    @staticmethod
+    def retrieve_data(
+        db: Session,
+        metadata: ExternalStorageMetadata,
+    ) -> Any:
+        """
+        Retrieve and decrypt data from external storage.
+
+        Args:
+            db: Database session
+            metadata: Storage metadata containing location and details
+
+        Returns:
+            Decrypted and deserialized data
+
+        Raises:
+            ExternalDataStorageError: If retrieval operation fails
+        """
+        try:
+            storage_config = ExternalDataStorageService._get_storage_config(
+                db, metadata.storage_key
+            )
+
+            # Retrieve encrypted data based on storage type
+            storage_type_value = (
+                metadata.storage_type.value
+                if isinstance(metadata.storage_type, StorageType)
+                else metadata.storage_type
+            )
+
+            if storage_type_value == StorageType.s3.value:
+                encrypted_data = ExternalDataStorageService._retrieve_from_s3(
+                    storage_config, metadata
+                )
+            elif storage_type_value == StorageType.gcs.value:
+                encrypted_data = ExternalDataStorageService._retrieve_from_gcs(
+                    storage_config, metadata
+                )
+            elif storage_type_value == StorageType.local.value:
+                encrypted_data = ExternalDataStorageService._retrieve_from_local(
+                    metadata
+                )
+            else:
+                raise ExternalDataStorageError(
+                    f"Unsupported storage type: {storage_type_value}"
+                )
+
+            # Handle case where download returns None
+            if encrypted_data is None:
+                raise ExternalDataStorageError(
+                    f"No data found at path: {metadata.file_key}"
+                )
+
+            # Decrypt and deserialize
+            data = decrypt_data(encrypted_data)
+
+            logger.info(
+                f"Retrieved {metadata.filesize:,} bytes from {storage_type_value} storage "
+                f"at path: {metadata.file_key}"
+            )
+
+            return data
+
+        except ExternalDataStorageError:
+            raise
+        except Exception as e:
+            logger.error(f"Failed to retrieve data from external storage: {str(e)}")
+            raise ExternalDataStorageError(f"Failed to retrieve data: {str(e)}") from e
+
+    @staticmethod
+    def delete_data(
+        db: Session,
+        metadata: ExternalStorageMetadata,
+    ) -> None:
+        """
+        Delete data from external storage.
+
+        Args:
+            db: Database session
+            metadata: Storage metadata containing location
+
+        Note:
+            This operation is best-effort and will log warnings on failure
+            rather than raising exceptions, to support cleanup scenarios.
+        """
+        try:
+            storage_config = ExternalDataStorageService._get_storage_config(
+                db, metadata.storage_key
+            )
+
+            # Delete from external storage based on type
+            storage_type_value = (
+                metadata.storage_type.value
+                if isinstance(metadata.storage_type, StorageType)
+                else metadata.storage_type
+            )
+
+            if storage_type_value == StorageType.s3.value:
+                ExternalDataStorageService._delete_from_s3(storage_config, metadata)
+            elif storage_type_value == StorageType.gcs.value:
+                ExternalDataStorageService._delete_from_gcs(storage_config, metadata)
+            elif storage_type_value == StorageType.local.value:
+                ExternalDataStorageService._delete_from_local(metadata)
+            else:
+                logger.warning(
+                    f"Unsupported storage type for cleanup: {storage_type_value}"
+                )
+                return
+
+            logger.info(
+                f"Deleted external storage file from {storage_type_value} storage "
+                f"at path: {metadata.file_key}"
+            )
+
+        except Exception as e:
+            # Log but don't raise - cleanup should be best effort
+            logger.warning(
+                f"Failed to delete external storage file at {metadata.file_key}: {str(e)}"
+            )
+
+    # Private helper methods for each storage type
+
+    @staticmethod
+    def _store_to_s3(config: StorageConfig, file_key: str, data: bytes) -> None:
+        """Store data to S3 using existing generic_upload_to_s3"""
+        bucket_name = config.details[StorageDetails.BUCKET.value]
+        auth_method = config.details[StorageDetails.AUTH_METHOD.value]
+
+        document = BytesIO(data)
+        generic_upload_to_s3(
+            storage_secrets=config.secrets,
+            bucket_name=bucket_name,
+            file_key=file_key,
+            auth_method=auth_method,
+            document=document,
+        )
+
+    @staticmethod
+    def _store_to_gcs(config: StorageConfig, file_key: str, data: bytes) -> None:
+        """Store data to GCS using existing get_gcs_client"""
+        bucket_name = config.details[StorageDetails.BUCKET.value]
+        auth_method = config.details[StorageDetails.AUTH_METHOD.value]
+
+        storage_client = get_gcs_client(auth_method, config.secrets)
+        bucket = storage_client.bucket(bucket_name)
+        blob = bucket.blob(file_key)
+
+        blob.upload_from_string(data, content_type="application/octet-stream")
+
+    @staticmethod
+    def _store_to_local(file_key: str, data: bytes) -> None:
+        """Store data to local filesystem using existing get_local_filename"""
+        file_path = get_local_filename(file_key)
+        os.makedirs(os.path.dirname(file_path), exist_ok=True)
+        with open(file_path, "wb") as f:
+            f.write(data)
+
+    @staticmethod
+    def _retrieve_from_s3(
+        config: StorageConfig, metadata: ExternalStorageMetadata
+    ) -> bytes:
+        """Retrieve data from S3 directly, bypassing file size limits"""
+
+        bucket_name = config.details[StorageDetails.BUCKET.value]
+        auth_method = config.details[StorageDetails.AUTH_METHOD.value]
+
+        # Get S3 client directly and download content regardless of file size
+        s3_client = get_s3_client(auth_method, config.secrets)
+
+        try:
+            # Download content directly to BytesIO buffer
+            file_obj = BytesIO()
+            s3_client.download_fileobj(
+                Bucket=bucket_name, Key=metadata.file_key, Fileobj=file_obj
+            )
+            file_obj.seek(0)  # Reset file pointer to beginning
+            return file_obj.read()
+        except Exception as e:
+            logger.error(f"Error retrieving file from S3: {e}")
+            raise e
+
+    @staticmethod
+    def _retrieve_from_gcs(
+        config: StorageConfig, metadata: ExternalStorageMetadata
+    ) -> bytes:
+        """Retrieve data from GCS using existing get_gcs_client"""
+        bucket_name = config.details[StorageDetails.BUCKET.value]
+        auth_method = config.details[StorageDetails.AUTH_METHOD.value]
+
+        storage_client = get_gcs_client(auth_method, config.secrets)
+        bucket = storage_client.bucket(bucket_name)
+        blob = bucket.blob(metadata.file_key)
+        return blob.download_as_bytes()
+
+    @staticmethod
+    def _retrieve_from_local(metadata: ExternalStorageMetadata) -> bytes:
+        """Retrieve data from local filesystem"""
+        file_path = get_local_filename(metadata.file_key)
+        with open(file_path, "rb") as f:
+            return f.read()
+
+    @staticmethod
+    def _delete_from_s3(
+        config: StorageConfig, metadata: ExternalStorageMetadata
+    ) -> None:
+        """Delete data from S3 using existing generic_delete_from_s3"""
+        bucket_name = config.details[StorageDetails.BUCKET.value]
+        auth_method = config.details[StorageDetails.AUTH_METHOD.value]
+
+        generic_delete_from_s3(
+            storage_secrets=config.secrets,
+            bucket_name=bucket_name,
+            file_key=metadata.file_key,
+            auth_method=auth_method,
+        )
+
+    @staticmethod
+    def _delete_from_gcs(
+        config: StorageConfig, metadata: ExternalStorageMetadata
+    ) -> None:
+        """Delete data from GCS using existing get_gcs_client"""
+        bucket_name = config.details[StorageDetails.BUCKET.value]
+        auth_method = config.details[StorageDetails.AUTH_METHOD.value]
+
+        storage_client = get_gcs_client(auth_method, config.secrets)
+        bucket = storage_client.bucket(bucket_name)
+        blob = bucket.blob(metadata.file_key)
+        blob.delete()
+
+    @staticmethod
+    def _delete_from_local(metadata: ExternalStorageMetadata) -> None:
+        """Delete data from local filesystem"""
+        file_path = get_local_filename(metadata.file_key)
+        if os.path.exists(file_path):
+            os.remove(file_path)
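The service is exposed as static methods, so a full round trip needs only a SQLAlchemy Session and a storage path. A minimal usage sketch, assuming `db` is an open Session bound to the Fides database and an active default storage configuration exists (the path and payload below are illustrative, not taken from this diff):

from fides.api.service.external_data_storage import ExternalDataStorageService

def round_trip(db):
    payload = {"customers": [{"email": "jane@example.com"}]}

    # Encrypts, uploads, and returns ExternalStorageMetadata (type, file key, size)
    metadata = ExternalDataStorageService.store_data(
        db=db,
        storage_path="privacy_request/pr_123/access_results/2024-01-01T00-00-00",
        data=payload,
    )

    # Downloads, decrypts, and deserializes back to the original object
    restored = ExternalDataStorageService.retrieve_data(db=db, metadata=metadata)
    assert restored == payload

    # Best-effort cleanup: logs a warning instead of raising on failure
    ExternalDataStorageService.delete_data(db=db, metadata=metadata)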
fides/api/service/privacy_request/request_runner_service.py
CHANGED
@@ -262,14 +262,14 @@ def upload_access_results(  # pylint: disable=R0912
         privacy_request.add_success_execution_log(
             session,
             connection_key=None,
-            dataset_name="Access
+            dataset_name="Access package upload",
             collection_name=None,
-            message="Access
+            message="Access package upload successful for privacy request.",
             action_type=ActionType.access,
         )
         logger.bind(
             time_taken=time.time() - start_time,
-        ).info("Access
+        ).info("Access package upload successful for privacy request.")
     except common_exceptions.StorageUploadError as exc:
         logger.bind(
             policy_key=policy.key,
@@ -279,9 +279,9 @@ def upload_access_results(  # pylint: disable=R0912
         privacy_request.add_error_execution_log(
             session,
             connection_key=None,
-            dataset_name="Access
+            dataset_name="Access package upload",
             collection_name=None,
-            message="Access
+            message="Access package upload failed for privacy request.",
             action_type=ActionType.access,
         )
         privacy_request.status = PrivacyRequestStatus.error
fides/service/privacy_request/privacy_request_service.py
CHANGED
@@ -17,11 +17,11 @@ from fides.api.models.privacy_request import (
     PrivacyRequest,
     RequestTask,
 )
+from fides.api.models.worker_task import ExecutionLogStatus
 from fides.api.schemas.drp_privacy_request import DrpPrivacyRequestCreate
 from fides.api.schemas.masking.masking_secrets import MaskingSecretCache
 from fides.api.schemas.policy import ActionType
 from fides.api.schemas.privacy_request import (
-    ExecutionLogStatus,
     PrivacyRequestResponse,
     PrivacyRequestStatus,
 )
fides/api/service/privacy_request/request_service.py
CHANGED
@@ -29,8 +29,8 @@ from fides.api.models.privacy_request import (
     RequestTask,
     TraversalDetails,
 )
+from fides.api.models.worker_task import ExecutionLogStatus
 from fides.api.schemas.policy import ActionType
-from fides.api.schemas.privacy_request import ExecutionLogStatus
 from fides.api.task.deprecated_graph_task import format_data_use_map_for_caching
 from fides.api.task.execute_request_tasks import log_task_queued, queue_request_task
 from fides.api.util.logger_context_utils import log_context
fides/api/task/execute_request_tasks.py
CHANGED
@@ -22,8 +22,9 @@ from fides.api.common_exceptions import (
 from fides.api.graph.config import TERMINATOR_ADDRESS, CollectionAddress
 from fides.api.models.connectionconfig import ConnectionConfig
 from fides.api.models.privacy_request import ExecutionLog, PrivacyRequest, RequestTask
+from fides.api.models.worker_task import ExecutionLogStatus
 from fides.api.schemas.policy import ActionType, CurrentStep
-from fides.api.schemas.privacy_request import
+from fides.api.schemas.privacy_request import PrivacyRequestStatus
 from fides.api.task.graph_task import (
     GraphTask,
     mark_current_and_downstream_nodes_as_failed,
@@ -145,7 +146,7 @@ def can_run_task_body(
     if request_task.is_terminator_task:
         logger.info(
             "Terminator {} task reached.",
-            request_task.action_type
+            request_task.action_type,
         )
         return False
     if request_task.is_root_task:
@@ -154,7 +155,7 @@ def can_run_task_body(
     if request_task.status != ExecutionLogStatus.pending:
         logger_method(request_task)(
             "Skipping {} task {} with status {}.",
-            request_task.action_type
+            request_task.action_type,
             request_task.collection_address,
             request_task.status.value,
         )
@@ -449,7 +450,7 @@ def log_task_complete(request_task: RequestTask) -> None:
     """Convenience method for logging task completion"""
     logger.info(
         "{} task {} is {}.",
-        request_task.action_type.
+        request_task.action_type.capitalize(),
         request_task.collection_address,
         request_task.status.value,
     )
@@ -478,9 +479,9 @@ def _order_tasks_by_input_key(
 
 
 mapping = {
-    ActionType.access: run_access_node,
-    ActionType.erasure: run_erasure_node,
-    ActionType.consent: run_consent_node,
+    ActionType.access.value: run_access_node,
+    ActionType.erasure.value: run_erasure_node,
+    ActionType.consent.value: run_consent_node,
 }
 
 
@@ -504,7 +505,7 @@ def log_task_queued(request_task: RequestTask, location: str) -> None:
     """Helper for logging that tasks are queued"""
     logger_method(request_task)(
         "Queuing {} task {} from {}.",
-        request_task.action_type
+        request_task.action_type,
         request_task.collection_address,
         location,
    )
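The mapping hunk above keys the dict by `ActionType.<member>.value` instead of the Enum member itself. A standalone sketch of why string-valued keys matter when the lookup key arrives as a plain string, for example after deserialization (the `Color` enum below is illustrative, not from this codebase):

from enum import Enum

class Color(Enum):
    red = "red"

by_member = {Color.red: "handler"}
by_value = {Color.red.value: "handler"}

incoming = "red"  # e.g. a value read back from a queue message or database column

print(by_member.get(incoming))  # None: a plain Enum member neither equals nor hashes like its value
print(by_value.get(incoming))   # "handler"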
fides/api/task/graph_task.py
CHANGED
@@ -39,8 +39,8 @@ from fides.api.models.datasetconfig import DatasetConfig
 from fides.api.models.policy import Policy, Rule
 from fides.api.models.privacy_preference import PrivacyPreferenceHistory
 from fides.api.models.privacy_request import ExecutionLog, PrivacyRequest, RequestTask
+from fides.api.models.worker_task import ExecutionLogStatus
 from fides.api.schemas.policy import ActionType, CurrentStep
-from fides.api.schemas.privacy_request import ExecutionLogStatus
 from fides.api.service.connectors.base_connector import BaseConnector
 from fides.api.task.consolidate_query_matches import consolidate_query_matches
 from fides.api.task.filter_element_match import filter_element_match
@@ -503,12 +503,20 @@ class GraphTask(ABC):  # pylint: disable=too-many-instance-attributes
             self.post_process_input_data(formatted_input_data)
         )
 
-        # For erasures:
-
-
+        # For erasures: build placeholder version incrementally to avoid holding two full
+        # copies of the data in memory simultaneously.
+        placeholder_output: List[Row] = []
+        for original_row in output:
+            # Create a deep copy of the *single* row, transform it, then append to
+            # the placeholder list. Peak memory at any point is one extra row rather
+            # than an entire dataset.
+            row_copy = copy.deepcopy(original_row)
             filter_element_match(
-
+                row_copy,
+                query_paths=post_processed_node_input_data,
+                delete_elements=False,
             )
+            placeholder_output.append(row_copy)
 
         # For DSR 3.0, save data to build masking requests directly
         # on the Request Task.
@@ -519,11 +527,14 @@ class GraphTask(ABC):  # pylint: disable=too-many-instance-attributes
         # TODO Remove when we stop support for DSR 2.0
         # Save data to build masking requests for DSR 2.0 in Redis.
         # Results saved with matching array elements preserved
-
-
-
+        if not CONFIG.execution.use_dsr_3_0:
+            self.resources.cache_results_with_placeholders(
+                f"access_request__{self.key}", placeholder_output
+            )
 
-        # For access request results,
+        # For access request results, mutate rows in-place to remove non-matching
+        # array elements. We already iterated over `output` above, so reuse the same
+        # loop structure to keep cache locality.
         for row in output:
             logger.info(
                 "Filtering row in {} for matching array elements.",
@@ -537,7 +548,8 @@ class GraphTask(ABC):  # pylint: disable=too-many-instance-attributes
 
         # TODO Remove when we stop support for DSR 2.0
         # Saves intermediate access results for DSR 2.0 in Redis
-
+        if not CONFIG.execution.use_dsr_3_0:
+            self.resources.cache_object(f"access_request__{self.key}", output)
 
         # Return filtered rows with non-matched array data removed.
         return output
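The erasure hunk above replaces a whole-dataset deep copy with a row-at-a-time copy, transform, append loop. A self-contained sketch of the same pattern (`transform_rows` and the masking lambda are illustrative names, not from this diff):

import copy
from typing import Any, Callable, Dict, List

Row = Dict[str, Any]

def transform_rows(output: List[Row], transform: Callable[[Row], None]) -> List[Row]:
    """Deep-copy and transform one row at a time: peak extra memory is a
    single row rather than a second full copy of the dataset."""
    placeholder_output: List[Row] = []
    for original_row in output:
        row_copy = copy.deepcopy(original_row)  # isolate mutation from the source row
        transform(row_copy)
        placeholder_output.append(row_copy)
    return placeholder_output

rows = [{"emails": ["a@example.com", "b@example.com"]}]
masked = transform_rows(rows, lambda row: row["emails"].clear())
print(rows)    # original rows untouched
print(masked)  # [{'emails': []}]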
fides/api/util/consent_util.py
CHANGED
@@ -18,7 +18,7 @@ from fides.api.models.privacy_request import (
 )
 from fides.api.models.sql_models import System  # type: ignore[attr-defined]
 from fides.api.models.tcf_purpose_overrides import TCFPurposeOverride
-from fides.api.
+from fides.api.models.worker_task import ExecutionLogStatus
 from fides.api.schemas.redis_cache import Identity
 
 
fides/api/util/data_size.py
ADDED
@@ -0,0 +1,102 @@
+"""
+Helpers for estimating the size of large collections of access data.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from typing import List, Optional
+
+from loguru import logger
+
+from fides.api.util.collection_util import Row
+from fides.api.util.custom_json_encoder import CustomJSONEncoder
+
+# 640MB threshold for external storage
+# We only generate an estimated size for large datasets so we want to be conservative
+# and fallback to external storage even if we haven't hit the 1GB max limit.
+# We also want to pad for encryption and base64 encoding.
+LARGE_DATA_THRESHOLD_BYTES = 640 * 1024 * 1024  # 640MB
+
+
+def calculate_data_size(data: List[Row]) -> int:  # noqa: D401 – utility function
+    """Return an approximate JSON-serialized size (in bytes) for a list of *Row*.
+
+    The implementation purposefully avoids serializing the entire payload when
+    *data* is large. For collections >1000 rows we sample a subset, measure the
+    encoded size, then extrapolate. This keeps memory usage bounded while still
+    giving us an order-of-magnitude estimate suitable for "should I stream this
+    out to S3?" decisions.
+    """
+
+    if not data:
+        return 0
+
+    try:
+        data_count = len(data)
+
+        # For very large datasets, estimate size from a sample to avoid memory issues
+        if data_count > 1000:
+            logger.debug(
+                f"Calculating size for large dataset ({data_count} rows) using sampling"
+            )
+
+            sample_size = min(500, max(100, data_count // 20))  # 5 % capped at 500
+
+            # stratified sampling – take items spaced across the set when possible
+            if data_count > sample_size * 3:
+                step = data_count // sample_size
+                sample_indices = list(range(0, data_count, step))[:sample_size]
+                sample = [data[i] for i in sample_indices]
+            else:
+                sample = data[:sample_size]
+
+            sample_json = json.dumps(
+                sample, cls=CustomJSONEncoder, separators=(",", ":")
+            )
+            sample_bytes = len(sample_json.encode("utf-8"))
+
+            avg_record_size = sample_bytes / sample_size
+            content_size = int(avg_record_size * data_count)
+
+            # overhead: 2 bytes for [] plus a comma between every record plus 1 % slack
+            structure_overhead = 2 + (data_count - 1) + int(content_size * 0.01)
+            return content_size + structure_overhead
+
+        # small datasets – just measure
+        json_str = json.dumps(data, cls=CustomJSONEncoder, separators=(",", ":"))
+        return len(json_str.encode("utf-8"))
+
+    except (TypeError, ValueError) as exc:
+        logger.warning(
+            f"Failed to calculate JSON size, falling back to sys.getsizeof: {exc}"
+        )
+        return sys.getsizeof(data)
+
+
+def is_large_data(
+    data: List[Row], threshold_bytes: Optional[int] = None
+) -> bool:  # noqa: D401
+    """Return *True* if *data* is likely to exceed *threshold_bytes* when serialized."""
+
+    if not data:
+        return False
+
+    threshold = (
+        threshold_bytes if threshold_bytes is not None else LARGE_DATA_THRESHOLD_BYTES
+    )
+    size = calculate_data_size(data)
+    if size > threshold:
+        logger.info(
+            f"Data size ({size:,} bytes) exceeds threshold ({threshold:,} bytes) – using external storage"
+        )
+        return True
+    return False
+
+
+__all__ = [
+    "calculate_data_size",
+    "is_large_data",
+    "LARGE_DATA_THRESHOLD_BYTES",
+]
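Both helpers ship in this wheel, so the following usage sketch only assumes the package is importable; the sample rows are illustrative:

from fides.api.util.data_size import (
    LARGE_DATA_THRESHOLD_BYTES,
    calculate_data_size,
    is_large_data,
)

rows = [{"id": i, "email": f"user{i}@example.com"} for i in range(5_000)]

# Above 1000 rows the size is extrapolated from a sample rather than fully serialized
approx_bytes = calculate_data_size(rows)
print(f"~{approx_bytes:,} bytes (default threshold {LARGE_DATA_THRESHOLD_BYTES:,})")

print(is_large_data(rows))                         # False: well under 640MB
print(is_large_data(rows, threshold_bytes=1_024))  # True: forces the external-storage path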