ethyca-fides 2.63.0rc2__py2.py3-none-any.whl → 2.63.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. {ethyca_fides-2.63.0rc2.dist-info → ethyca_fides-2.63.1.dist-info}/METADATA +1 -1
  2. {ethyca_fides-2.63.0rc2.dist-info → ethyca_fides-2.63.1.dist-info}/RECORD +131 -112
  3. fides/_version.py +3 -3
  4. fides/api/alembic/migrations/versions/29e56fa1fdb3_add_monitor_tasks.py +147 -0
  5. fides/api/alembic/migrations/versions/5efcdf18438e_add_manual_task_tables.py +160 -0
  6. fides/api/alembic/migrations/versions/bf713b5a021d_staged_resource_ancestor_link_data_.py +20 -11
  7. fides/api/api/v1/endpoints/privacy_request_endpoints.py +4 -4
  8. fides/api/db/base.py +7 -1
  9. fides/api/migrations/post_upgrade_index_creation.py +3 -3
  10. fides/api/models/connectionconfig.py +1 -1
  11. fides/api/models/detection_discovery/__init__.py +35 -0
  12. fides/api/models/detection_discovery/monitor_task.py +162 -0
  13. fides/api/models/field_types/__init__.py +5 -0
  14. fides/api/models/field_types/encrypted_large_data.py +151 -0
  15. fides/api/models/manual_tasks/__init__.py +8 -0
  16. fides/api/models/manual_tasks/manual_task.py +110 -0
  17. fides/api/models/manual_tasks/manual_task_log.py +100 -0
  18. fides/api/models/privacy_preference.py +1 -1
  19. fides/api/models/privacy_request/execution_log.py +3 -31
  20. fides/api/models/privacy_request/privacy_request.py +16 -3
  21. fides/api/models/privacy_request/request_task.py +36 -25
  22. fides/api/models/worker_task.py +96 -0
  23. fides/api/schemas/external_storage.py +22 -0
  24. fides/api/schemas/manual_tasks/__init__.py +0 -0
  25. fides/api/schemas/manual_tasks/manual_task_schemas.py +79 -0
  26. fides/api/schemas/manual_tasks/manual_task_status.py +151 -0
  27. fides/api/schemas/privacy_request.py +1 -12
  28. fides/api/service/connectors/base_erasure_email_connector.py +1 -1
  29. fides/api/service/connectors/consent_email_connector.py +2 -1
  30. fides/api/service/connectors/dynamic_erasure_email_connector.py +2 -1
  31. fides/api/service/connectors/erasure_email_connector.py +1 -1
  32. fides/api/service/external_data_storage.py +371 -0
  33. fides/api/service/privacy_request/request_runner_service.py +5 -5
  34. fides/api/service/privacy_request/request_service.py +1 -1
  35. fides/api/task/create_request_tasks.py +1 -1
  36. fides/api/task/execute_request_tasks.py +9 -8
  37. fides/api/task/graph_task.py +22 -10
  38. fides/api/util/consent_util.py +1 -1
  39. fides/api/util/data_size.py +102 -0
  40. fides/api/util/encryption/aes_gcm_encryption_util.py +271 -0
  41. fides/service/manual_tasks/__init__.py +0 -0
  42. fides/service/manual_tasks/manual_task_service.py +150 -0
  43. fides/service/privacy_request/privacy_request_service.py +1 -1
  44. fides/ui-build/static/admin/404.html +1 -1
  45. fides/ui-build/static/admin/add-systems/manual.html +1 -1
  46. fides/ui-build/static/admin/add-systems/multiple.html +1 -1
  47. fides/ui-build/static/admin/add-systems.html +1 -1
  48. fides/ui-build/static/admin/consent/configure/add-vendors.html +1 -1
  49. fides/ui-build/static/admin/consent/configure.html +1 -1
  50. fides/ui-build/static/admin/consent/privacy-experience/[id].html +1 -1
  51. fides/ui-build/static/admin/consent/privacy-experience/new.html +1 -1
  52. fides/ui-build/static/admin/consent/privacy-experience.html +1 -1
  53. fides/ui-build/static/admin/consent/privacy-notices/[id].html +1 -1
  54. fides/ui-build/static/admin/consent/privacy-notices/new.html +1 -1
  55. fides/ui-build/static/admin/consent/privacy-notices.html +1 -1
  56. fides/ui-build/static/admin/consent/properties.html +1 -1
  57. fides/ui-build/static/admin/consent/reporting.html +1 -1
  58. fides/ui-build/static/admin/consent.html +1 -1
  59. fides/ui-build/static/admin/data-catalog/[systemId]/projects/[projectUrn]/[resourceUrn].html +1 -1
  60. fides/ui-build/static/admin/data-catalog/[systemId]/projects/[projectUrn].html +1 -1
  61. fides/ui-build/static/admin/data-catalog/[systemId]/projects.html +1 -1
  62. fides/ui-build/static/admin/data-catalog/[systemId]/resources/[resourceUrn].html +1 -1
  63. fides/ui-build/static/admin/data-catalog/[systemId]/resources.html +1 -1
  64. fides/ui-build/static/admin/data-catalog.html +1 -1
  65. fides/ui-build/static/admin/data-discovery/action-center/[monitorId]/[systemId].html +1 -1
  66. fides/ui-build/static/admin/data-discovery/action-center/[monitorId].html +1 -1
  67. fides/ui-build/static/admin/data-discovery/action-center.html +1 -1
  68. fides/ui-build/static/admin/data-discovery/activity.html +1 -1
  69. fides/ui-build/static/admin/data-discovery/detection/[resourceUrn].html +1 -1
  70. fides/ui-build/static/admin/data-discovery/detection.html +1 -1
  71. fides/ui-build/static/admin/data-discovery/discovery/[resourceUrn].html +1 -1
  72. fides/ui-build/static/admin/data-discovery/discovery.html +1 -1
  73. fides/ui-build/static/admin/datamap.html +1 -1
  74. fides/ui-build/static/admin/dataset/[datasetId]/[collectionName]/[...subfieldNames].html +1 -1
  75. fides/ui-build/static/admin/dataset/[datasetId]/[collectionName].html +1 -1
  76. fides/ui-build/static/admin/dataset/[datasetId].html +1 -1
  77. fides/ui-build/static/admin/dataset/new.html +1 -1
  78. fides/ui-build/static/admin/dataset.html +1 -1
  79. fides/ui-build/static/admin/datastore-connection/[id].html +1 -1
  80. fides/ui-build/static/admin/datastore-connection/new.html +1 -1
  81. fides/ui-build/static/admin/datastore-connection.html +1 -1
  82. fides/ui-build/static/admin/index.html +1 -1
  83. fides/ui-build/static/admin/integrations/[id].html +1 -1
  84. fides/ui-build/static/admin/integrations.html +1 -1
  85. fides/ui-build/static/admin/login/[provider].html +1 -1
  86. fides/ui-build/static/admin/login.html +1 -1
  87. fides/ui-build/static/admin/messaging/[id].html +1 -1
  88. fides/ui-build/static/admin/messaging/add-template.html +1 -1
  89. fides/ui-build/static/admin/messaging.html +1 -1
  90. fides/ui-build/static/admin/poc/ant-components.html +1 -1
  91. fides/ui-build/static/admin/poc/form-experiments/AntForm.html +1 -1
  92. fides/ui-build/static/admin/poc/form-experiments/FormikAntFormItem.html +1 -1
  93. fides/ui-build/static/admin/poc/form-experiments/FormikControlled.html +1 -1
  94. fides/ui-build/static/admin/poc/form-experiments/FormikField.html +1 -1
  95. fides/ui-build/static/admin/poc/form-experiments/FormikSpreadField.html +1 -1
  96. fides/ui-build/static/admin/poc/forms.html +1 -1
  97. fides/ui-build/static/admin/poc/table-migration.html +1 -1
  98. fides/ui-build/static/admin/privacy-requests/[id].html +1 -1
  99. fides/ui-build/static/admin/privacy-requests/configure/messaging.html +1 -1
  100. fides/ui-build/static/admin/privacy-requests/configure/storage.html +1 -1
  101. fides/ui-build/static/admin/privacy-requests/configure.html +1 -1
  102. fides/ui-build/static/admin/privacy-requests.html +1 -1
  103. fides/ui-build/static/admin/properties/[id].html +1 -1
  104. fides/ui-build/static/admin/properties/add-property.html +1 -1
  105. fides/ui-build/static/admin/properties.html +1 -1
  106. fides/ui-build/static/admin/reporting/datamap.html +1 -1
  107. fides/ui-build/static/admin/settings/about/alpha.html +1 -1
  108. fides/ui-build/static/admin/settings/about.html +1 -1
  109. fides/ui-build/static/admin/settings/consent/[configuration_id]/[purpose_id].html +1 -1
  110. fides/ui-build/static/admin/settings/consent.html +1 -1
  111. fides/ui-build/static/admin/settings/custom-fields.html +1 -1
  112. fides/ui-build/static/admin/settings/domain-records.html +1 -1
  113. fides/ui-build/static/admin/settings/domains.html +1 -1
  114. fides/ui-build/static/admin/settings/email-templates.html +1 -1
  115. fides/ui-build/static/admin/settings/locations.html +1 -1
  116. fides/ui-build/static/admin/settings/organization.html +1 -1
  117. fides/ui-build/static/admin/settings/regulations.html +1 -1
  118. fides/ui-build/static/admin/systems/configure/[id]/test-datasets.html +1 -1
  119. fides/ui-build/static/admin/systems/configure/[id].html +1 -1
  120. fides/ui-build/static/admin/systems.html +1 -1
  121. fides/ui-build/static/admin/taxonomy.html +1 -1
  122. fides/ui-build/static/admin/user-management/new.html +1 -1
  123. fides/ui-build/static/admin/user-management/profile/[id].html +1 -1
  124. fides/ui-build/static/admin/user-management.html +1 -1
  125. {ethyca_fides-2.63.0rc2.dist-info → ethyca_fides-2.63.1.dist-info}/WHEEL +0 -0
  126. {ethyca_fides-2.63.0rc2.dist-info → ethyca_fides-2.63.1.dist-info}/entry_points.txt +0 -0
  127. {ethyca_fides-2.63.0rc2.dist-info → ethyca_fides-2.63.1.dist-info}/licenses/LICENSE +0 -0
  128. {ethyca_fides-2.63.0rc2.dist-info → ethyca_fides-2.63.1.dist-info}/top_level.txt +0 -0
  129. /fides/api/models/{detection_discovery.py → detection_discovery/core.py} +0 -0
  130. /fides/ui-build/static/admin/_next/static/{Fb70i-8GI-owNAvgEJWhA → SZn_Fpr_qG1COMjkdloep}/_buildManifest.js +0 -0
  131. /fides/ui-build/static/admin/_next/static/{Fb70i-8GI-owNAvgEJWhA → SZn_Fpr_qG1COMjkdloep}/_ssgManifest.js +0 -0
fides/api/service/external_data_storage.py
@@ -0,0 +1,371 @@
+ """
+ Service for handling external storage of large encrypted data.
+
+ This service provides a generic interface for storing large data that would
+ otherwise exceed database column size limits or impact performance.
+ """
+
+ import os
+ from io import BytesIO
+ from typing import Any, Optional
+
+ from loguru import logger
+ from sqlalchemy.orm import Session
+
+ from fides.api.models.storage import StorageConfig, get_active_default_storage_config
+ from fides.api.schemas.external_storage import ExternalStorageMetadata
+ from fides.api.schemas.storage.storage import StorageDetails, StorageType
+ from fides.api.service.storage.gcs import get_gcs_client
+ from fides.api.service.storage.s3 import generic_delete_from_s3, generic_upload_to_s3
+ from fides.api.service.storage.util import get_local_filename
+ from fides.api.util.aws_util import get_s3_client
+ from fides.api.util.encryption.aes_gcm_encryption_util import decrypt_data, encrypt_data
+
+
+ class ExternalDataStorageError(Exception):
+     """Raised when external data storage operations fail."""
+
+
+ class ExternalDataStorageService:
+     """
+     Service for storing large encrypted data externally.
+
+     Handles:
+     - Automatic encryption/decryption
+     - Multiple storage backends (S3, local, GCS, etc.)
+     - Consistent file organization
+     - Cleanup operations
+     """
+
+     @staticmethod
+     def _get_storage_config(db: Session, storage_key: Optional[str]) -> "StorageConfig":
+         """Resolve and return the StorageConfig to use.
+
+         Preference order:
+
+         1. If *storage_key* is provided, fetch that specific configuration.
+         2. Otherwise, fall back to the *active* default storage configuration.
+
+         Raises ExternalDataStorageError when no suitable configuration is found.
+         """
+
+         if storage_key:
+             storage_config = (
+                 db.query(StorageConfig).filter(StorageConfig.key == storage_key).first()
+             )
+             if not storage_config:
+                 msg = f"Storage configuration with key '{storage_key}' not found"
+                 logger.error(msg)
+                 raise ExternalDataStorageError(msg)
+             return storage_config
+
+         # No explicit key – use the active default
+         storage_config = get_active_default_storage_config(db)
+         if not storage_config:
+             msg = "No active default storage configuration available for large data"
+             logger.error(msg)
+             raise ExternalDataStorageError(msg)
+
+         return storage_config
+
+     @staticmethod
+     def store_data(
+         db: Session,
+         storage_path: str,
+         data: Any,
+         storage_key: Optional[str] = None,
+     ) -> ExternalStorageMetadata:
+         """
+         Store data in external storage with encryption.
+
+         Args:
+             db: Database session
+             storage_path: Path where data should be stored (e.g., "model/id/field/timestamp")
+             data: The data to store (will be serialized and encrypted)
+             storage_key: Optional specific storage config key to use
+
+         Returns:
+             ExternalStorageMetadata with storage details
+
+         Raises:
+             ExternalDataStorageError: If storage operation fails
+         """
+         try:
+             storage_config = ExternalDataStorageService._get_storage_config(
+                 db, storage_key
+             )
+
+             # Serialize and encrypt the data
+             encrypted_data = encrypt_data(data)
+             file_size = len(encrypted_data)
+
+             # Store to external storage based on type
+             if storage_config.type == StorageType.s3:
+                 ExternalDataStorageService._store_to_s3(
+                     storage_config, storage_path, encrypted_data
+                 )
+             elif storage_config.type == StorageType.gcs:
+                 ExternalDataStorageService._store_to_gcs(
+                     storage_config, storage_path, encrypted_data
+                 )
+             elif storage_config.type == StorageType.local:
+                 ExternalDataStorageService._store_to_local(storage_path, encrypted_data)
+             else:
+                 raise ExternalDataStorageError(
+                     f"Unsupported storage type: {storage_config.type}"
+                 )
+
+             # Create and return metadata
+             metadata = ExternalStorageMetadata(
+                 storage_type=StorageType(storage_config.type.value),
+                 file_key=storage_path,
+                 filesize=file_size,
+                 storage_key=storage_config.key,
+             )
+
+             logger.info(
+                 f"Stored {file_size:,} bytes to {storage_config.type} storage "
+                 f"at path: {storage_path}"
+             )
+
+             return metadata
+
+         except Exception as e:
+             logger.error(f"Failed to store data externally: {str(e)}")
+             raise ExternalDataStorageError(f"Failed to store data: {str(e)}") from e
+
+     @staticmethod
+     def retrieve_data(
+         db: Session,
+         metadata: ExternalStorageMetadata,
+     ) -> Any:
+         """
+         Retrieve and decrypt data from external storage.
+
+         Args:
+             db: Database session
+             metadata: Storage metadata containing location and details
+
+         Returns:
+             Decrypted and deserialized data
+
+         Raises:
+             ExternalDataStorageError: If retrieval operation fails
+         """
+         try:
+             storage_config = ExternalDataStorageService._get_storage_config(
+                 db, metadata.storage_key
+             )
+
+             # Retrieve encrypted data based on storage type
+             storage_type_value = (
+                 metadata.storage_type.value
+                 if isinstance(metadata.storage_type, StorageType)
+                 else metadata.storage_type
+             )
+
+             if storage_type_value == StorageType.s3.value:
+                 encrypted_data = ExternalDataStorageService._retrieve_from_s3(
+                     storage_config, metadata
+                 )
+             elif storage_type_value == StorageType.gcs.value:
+                 encrypted_data = ExternalDataStorageService._retrieve_from_gcs(
+                     storage_config, metadata
+                 )
+             elif storage_type_value == StorageType.local.value:
+                 encrypted_data = ExternalDataStorageService._retrieve_from_local(
+                     metadata
+                 )
+             else:
+                 raise ExternalDataStorageError(
+                     f"Unsupported storage type: {storage_type_value}"
+                 )
+
+             # Handle case where download returns None
+             if encrypted_data is None:
+                 raise ExternalDataStorageError(
+                     f"No data found at path: {metadata.file_key}"
+                 )
+
+             # Decrypt and deserialize
+             data = decrypt_data(encrypted_data)
+
+             logger.info(
+                 f"Retrieved {metadata.filesize:,} bytes from {storage_type_value} storage "
+                 f"at path: {metadata.file_key}"
+             )
+
+             return data
+
+         except ExternalDataStorageError:
+             raise
+         except Exception as e:
+             logger.error(f"Failed to retrieve data from external storage: {str(e)}")
+             raise ExternalDataStorageError(f"Failed to retrieve data: {str(e)}") from e
+
+     @staticmethod
+     def delete_data(
+         db: Session,
+         metadata: ExternalStorageMetadata,
+     ) -> None:
+         """
+         Delete data from external storage.
+
+         Args:
+             db: Database session
+             metadata: Storage metadata containing location
+
+         Note:
+             This operation is best-effort and will log warnings on failure
+             rather than raising exceptions, to support cleanup scenarios.
+         """
+         try:
+             storage_config = ExternalDataStorageService._get_storage_config(
+                 db, metadata.storage_key
+             )
+
+             # Delete from external storage based on type
+             storage_type_value = (
+                 metadata.storage_type.value
+                 if isinstance(metadata.storage_type, StorageType)
+                 else metadata.storage_type
+             )
+
+             if storage_type_value == StorageType.s3.value:
+                 ExternalDataStorageService._delete_from_s3(storage_config, metadata)
+             elif storage_type_value == StorageType.gcs.value:
+                 ExternalDataStorageService._delete_from_gcs(storage_config, metadata)
+             elif storage_type_value == StorageType.local.value:
+                 ExternalDataStorageService._delete_from_local(metadata)
+             else:
+                 logger.warning(
+                     f"Unsupported storage type for cleanup: {storage_type_value}"
+                 )
+                 return
+
+             logger.info(
+                 f"Deleted external storage file from {storage_type_value} storage "
+                 f"at path: {metadata.file_key}"
+             )
+
+         except Exception as e:
+             # Log but don't raise - cleanup should be best effort
+             logger.warning(
+                 f"Failed to delete external storage file at {metadata.file_key}: {str(e)}"
+             )
+
+     # Private helper methods for each storage type
+
+     @staticmethod
+     def _store_to_s3(config: StorageConfig, file_key: str, data: bytes) -> None:
+         """Store data to S3 using existing generic_upload_to_s3"""
+         bucket_name = config.details[StorageDetails.BUCKET.value]
+         auth_method = config.details[StorageDetails.AUTH_METHOD.value]
+
+         document = BytesIO(data)
+         generic_upload_to_s3(
+             storage_secrets=config.secrets,
+             bucket_name=bucket_name,
+             file_key=file_key,
+             auth_method=auth_method,
+             document=document,
+         )
+
+     @staticmethod
+     def _store_to_gcs(config: StorageConfig, file_key: str, data: bytes) -> None:
+         """Store data to GCS using existing get_gcs_client"""
+         bucket_name = config.details[StorageDetails.BUCKET.value]
+         auth_method = config.details[StorageDetails.AUTH_METHOD.value]
+
+         storage_client = get_gcs_client(auth_method, config.secrets)
+         bucket = storage_client.bucket(bucket_name)
+         blob = bucket.blob(file_key)
+
+         blob.upload_from_string(data, content_type="application/octet-stream")
+
+     @staticmethod
+     def _store_to_local(file_key: str, data: bytes) -> None:
+         """Store data to local filesystem using existing get_local_filename"""
+         file_path = get_local_filename(file_key)
+         os.makedirs(os.path.dirname(file_path), exist_ok=True)
+         with open(file_path, "wb") as f:
+             f.write(data)
+
+     @staticmethod
+     def _retrieve_from_s3(
+         config: StorageConfig, metadata: ExternalStorageMetadata
+     ) -> bytes:
+         """Retrieve data from S3 directly, bypassing file size limits"""
+
+         bucket_name = config.details[StorageDetails.BUCKET.value]
+         auth_method = config.details[StorageDetails.AUTH_METHOD.value]
+
+         # Get S3 client directly and download content regardless of file size
+         s3_client = get_s3_client(auth_method, config.secrets)
+
+         try:
+             # Download content directly to BytesIO buffer
+             file_obj = BytesIO()
+             s3_client.download_fileobj(
+                 Bucket=bucket_name, Key=metadata.file_key, Fileobj=file_obj
+             )
+             file_obj.seek(0)  # Reset file pointer to beginning
+             return file_obj.read()
+         except Exception as e:
+             logger.error(f"Error retrieving file from S3: {e}")
+             raise e
+
+     @staticmethod
+     def _retrieve_from_gcs(
+         config: StorageConfig, metadata: ExternalStorageMetadata
+     ) -> bytes:
+         """Retrieve data from GCS using existing get_gcs_client"""
+         bucket_name = config.details[StorageDetails.BUCKET.value]
+         auth_method = config.details[StorageDetails.AUTH_METHOD.value]
+
+         storage_client = get_gcs_client(auth_method, config.secrets)
+         bucket = storage_client.bucket(bucket_name)
+         blob = bucket.blob(metadata.file_key)
+         return blob.download_as_bytes()
+
+     @staticmethod
+     def _retrieve_from_local(metadata: ExternalStorageMetadata) -> bytes:
+         """Retrieve data from local filesystem"""
+         file_path = get_local_filename(metadata.file_key)
+         with open(file_path, "rb") as f:
+             return f.read()
+
+     @staticmethod
+     def _delete_from_s3(
+         config: StorageConfig, metadata: ExternalStorageMetadata
+     ) -> None:
+         """Delete data from S3 using existing generic_delete_from_s3"""
+         bucket_name = config.details[StorageDetails.BUCKET.value]
+         auth_method = config.details[StorageDetails.AUTH_METHOD.value]
+
+         generic_delete_from_s3(
+             storage_secrets=config.secrets,
+             bucket_name=bucket_name,
+             file_key=metadata.file_key,
+             auth_method=auth_method,
+         )
+
+     @staticmethod
+     def _delete_from_gcs(
+         config: StorageConfig, metadata: ExternalStorageMetadata
+     ) -> None:
+         """Delete data from GCS using existing get_gcs_client"""
+         bucket_name = config.details[StorageDetails.BUCKET.value]
+         auth_method = config.details[StorageDetails.AUTH_METHOD.value]
+
+         storage_client = get_gcs_client(auth_method, config.secrets)
+         bucket = storage_client.bucket(bucket_name)
+         blob = bucket.blob(metadata.file_key)
+         blob.delete()
+
+     @staticmethod
+     def _delete_from_local(metadata: ExternalStorageMetadata) -> None:
+         """Delete data from local filesystem"""
+         file_path = get_local_filename(metadata.file_key)
+         if os.path.exists(file_path):
+             os.remove(file_path)
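
Note (editorial, not part of the wheel): a minimal usage sketch for the new service, assuming a SQLAlchemy Session and an active default StorageConfig; the storage_path value is invented for illustration.

    from sqlalchemy.orm import Session

    from fides.api.service.external_data_storage import ExternalDataStorageService


    def round_trip_example(db: Session) -> None:
        rows = [{"email": "jane@example.com", "orders": [1, 2, 3]}]

        # store_data serializes, encrypts, and writes to the resolved backend
        metadata = ExternalDataStorageService.store_data(
            db=db,
            storage_path="privacy_request/123/access_results/2024-01-01",  # hypothetical path
            data=rows,
        )

        # retrieve_data resolves the same config from metadata, downloads, and decrypts
        restored = ExternalDataStorageService.retrieve_data(db=db, metadata=metadata)
        assert restored == rows

        # delete_data is best-effort: failures are logged, not raised
        ExternalDataStorageService.delete_data(db=db, metadata=metadata)
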
fides/api/service/privacy_request/request_runner_service.py
@@ -262,14 +262,14 @@ def upload_access_results(  # pylint: disable=R0912
          privacy_request.add_success_execution_log(
              session,
              connection_key=None,
-             dataset_name="Access Package Upload",
+             dataset_name="Access package upload",
              collection_name=None,
-             message="Access Package Upload successful for privacy request.",
+             message="Access package upload successful for privacy request.",
              action_type=ActionType.access,
          )
          logger.bind(
              time_taken=time.time() - start_time,
-         ).info("Access Package Upload successful for privacy request.")
+         ).info("Access package upload successful for privacy request.")
      except common_exceptions.StorageUploadError as exc:
          logger.bind(
              policy_key=policy.key,
@@ -279,9 +279,9 @@ def upload_access_results(  # pylint: disable=R0912
          privacy_request.add_error_execution_log(
              session,
              connection_key=None,
-             dataset_name="Access Package Upload",
+             dataset_name="Access package upload",
              collection_name=None,
-             message="Access Package Upload failed for privacy request.",
+             message="Access package upload failed for privacy request.",
              action_type=ActionType.access,
          )
          privacy_request.status = PrivacyRequestStatus.error
fides/api/service/privacy_request/request_service.py
@@ -17,11 +17,11 @@ from fides.api.models.privacy_request import (
      PrivacyRequest,
      RequestTask,
  )
+ from fides.api.models.worker_task import ExecutionLogStatus
  from fides.api.schemas.drp_privacy_request import DrpPrivacyRequestCreate
  from fides.api.schemas.masking.masking_secrets import MaskingSecretCache
  from fides.api.schemas.policy import ActionType
  from fides.api.schemas.privacy_request import (
-     ExecutionLogStatus,
      PrivacyRequestResponse,
      PrivacyRequestStatus,
  )
fides/api/task/create_request_tasks.py
@@ -29,8 +29,8 @@ from fides.api.models.privacy_request import (
      RequestTask,
      TraversalDetails,
  )
+ from fides.api.models.worker_task import ExecutionLogStatus
  from fides.api.schemas.policy import ActionType
- from fides.api.schemas.privacy_request import ExecutionLogStatus
  from fides.api.task.deprecated_graph_task import format_data_use_map_for_caching
  from fides.api.task.execute_request_tasks import log_task_queued, queue_request_task
  from fides.api.util.logger_context_utils import log_context
fides/api/task/execute_request_tasks.py
@@ -22,8 +22,9 @@ from fides.api.common_exceptions import (
  from fides.api.graph.config import TERMINATOR_ADDRESS, CollectionAddress
  from fides.api.models.connectionconfig import ConnectionConfig
  from fides.api.models.privacy_request import ExecutionLog, PrivacyRequest, RequestTask
+ from fides.api.models.worker_task import ExecutionLogStatus
  from fides.api.schemas.policy import ActionType, CurrentStep
- from fides.api.schemas.privacy_request import ExecutionLogStatus, PrivacyRequestStatus
+ from fides.api.schemas.privacy_request import PrivacyRequestStatus
  from fides.api.task.graph_task import (
      GraphTask,
      mark_current_and_downstream_nodes_as_failed,
@@ -145,7 +146,7 @@ def can_run_task_body(
      if request_task.is_terminator_task:
          logger.info(
              "Terminator {} task reached.",
-             request_task.action_type.value,
+             request_task.action_type,
          )
          return False
      if request_task.is_root_task:
@@ -154,7 +155,7 @@ def can_run_task_body(
      if request_task.status != ExecutionLogStatus.pending:
          logger_method(request_task)(
              "Skipping {} task {} with status {}.",
-             request_task.action_type.value,
+             request_task.action_type,
              request_task.collection_address,
              request_task.status.value,
          )
@@ -449,7 +450,7 @@ def log_task_complete(request_task: RequestTask) -> None:
      """Convenience method for logging task completion"""
      logger.info(
          "{} task {} is {}.",
-         request_task.action_type.value.capitalize(),
+         request_task.action_type.capitalize(),
          request_task.collection_address,
          request_task.status.value,
      )
@@ -478,9 +479,9 @@ def _order_tasks_by_input_key(


  mapping = {
-     ActionType.access: run_access_node,
-     ActionType.erasure: run_erasure_node,
-     ActionType.consent: run_consent_node,
+     ActionType.access.value: run_access_node,
+     ActionType.erasure.value: run_erasure_node,
+     ActionType.consent.value: run_consent_node,
  }


@@ -504,7 +505,7 @@ def log_task_queued(request_task: RequestTask, location: str) -> None:
      """Helper for logging that tasks are queued"""
      logger_method(request_task)(
          "Queuing {} task {} from {}.",
-         request_task.action_type.value,
+         request_task.action_type,
          request_task.collection_address,
          location,
      )
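
Note (editorial): the hunks above drop ".value" when logging request_task.action_type and re-key the dispatch mapping by the enums' string values, which is consistent with action_type now arriving as a plain string (apparently via the new worker_task model) rather than an ActionType member. A sketch of that assumption, with stand-in names:

    # A string action_type indexes a mapping keyed by enum *values*.
    run_access_node = object()  # stand-in for the real task callable
    mapping = {"access": run_access_node}  # i.e. ActionType.access.value

    action_type = "access"  # as loaded from the task row
    assert mapping[action_type] is run_access_node
    print(action_type.capitalize())  # "Access": str methods apply directly
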
fides/api/task/graph_task.py
@@ -39,8 +39,8 @@ from fides.api.models.datasetconfig import DatasetConfig
  from fides.api.models.policy import Policy, Rule
  from fides.api.models.privacy_preference import PrivacyPreferenceHistory
  from fides.api.models.privacy_request import ExecutionLog, PrivacyRequest, RequestTask
+ from fides.api.models.worker_task import ExecutionLogStatus
  from fides.api.schemas.policy import ActionType, CurrentStep
- from fides.api.schemas.privacy_request import ExecutionLogStatus
  from fides.api.service.connectors.base_connector import BaseConnector
  from fides.api.task.consolidate_query_matches import consolidate_query_matches
  from fides.api.task.filter_element_match import filter_element_match
@@ -503,12 +503,20 @@ class GraphTask(ABC):  # pylint: disable=too-many-instance-attributes
              self.post_process_input_data(formatted_input_data)
          )

-         # For erasures: cache results with non-matching array elements *replaced* with placeholder text
-         placeholder_output: List[Row] = copy.deepcopy(output)
-         for row in placeholder_output:
+         # For erasures: build placeholder version incrementally to avoid holding two full
+         # copies of the data in memory simultaneously.
+         placeholder_output: List[Row] = []
+         for original_row in output:
+             # Create a deep copy of the *single* row, transform it, then append to
+             # the placeholder list. Peak memory at any point is one extra row rather
+             # than an entire dataset.
+             row_copy = copy.deepcopy(original_row)
              filter_element_match(
-                 row, query_paths=post_processed_node_input_data, delete_elements=False
+                 row_copy,
+                 query_paths=post_processed_node_input_data,
+                 delete_elements=False,
              )
+             placeholder_output.append(row_copy)

          # For DSR 3.0, save data to build masking requests directly
          # on the Request Task.
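
Note (editorial): the incremental copy above is a general pattern, shown standalone below with hypothetical names; it deep-copies one row per iteration instead of deep-copying the whole list in a single call.

    import copy
    from typing import Any, Callable, Dict, List

    Row = Dict[str, Any]

    def build_transformed_copy(
        rows: List[Row], transform: Callable[[Row], None]
    ) -> List[Row]:
        """Copy and transform one row at a time; the source rows stay untouched."""
        out: List[Row] = []
        for row in rows:
            row_copy = copy.deepcopy(row)  # copy a single row, not the entire list
            transform(row_copy)  # mutate only the copy
            out.append(row_copy)
        return out
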
@@ -519,11 +527,14 @@ class GraphTask(ABC):  # pylint: disable=too-many-instance-attributes
          # TODO Remove when we stop support for DSR 2.0
          # Save data to build masking requests for DSR 2.0 in Redis.
          # Results saved with matching array elements preserved
-         self.resources.cache_results_with_placeholders(
-             f"access_request__{self.key}", placeholder_output
-         )
+         if not CONFIG.execution.use_dsr_3_0:
+             self.resources.cache_results_with_placeholders(
+                 f"access_request__{self.key}", placeholder_output
+             )

-         # For access request results, cache results with non-matching array elements *removed*
+         # For access request results, mutate rows in-place to remove non-matching
+         # array elements. We already iterated over `output` above, so reuse the same
+         # loop structure to keep cache locality.
          for row in output:
              logger.info(
                  "Filtering row in {} for matching array elements.",
@@ -537,7 +548,8 @@ class GraphTask(ABC):  # pylint: disable=too-many-instance-attributes

          # TODO Remove when we stop support for DSR 2.0
          # Saves intermediate access results for DSR 2.0 in Redis
-         self.resources.cache_object(f"access_request__{self.key}", output)
+         if not CONFIG.execution.use_dsr_3_0:
+             self.resources.cache_object(f"access_request__{self.key}", output)

          # Return filtered rows with non-matched array data removed.
          return output
fides/api/util/consent_util.py
@@ -18,7 +18,7 @@ from fides.api.models.privacy_request import (
  )
  from fides.api.models.sql_models import System  # type: ignore[attr-defined]
  from fides.api.models.tcf_purpose_overrides import TCFPurposeOverride
- from fides.api.schemas.privacy_request import ExecutionLogStatus
+ from fides.api.models.worker_task import ExecutionLogStatus
  from fides.api.schemas.redis_cache import Identity


fides/api/util/data_size.py
@@ -0,0 +1,102 @@
+ """
+ Helpers for estimating the size of large collections of access data.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import sys
+ from typing import List, Optional
+
+ from loguru import logger
+
+ from fides.api.util.collection_util import Row
+ from fides.api.util.custom_json_encoder import CustomJSONEncoder
+
+ # 640MB threshold for external storage
+ # We only generate an estimated size for large datasets so we want to be conservative
+ # and fallback to external storage even if we haven't hit the 1GB max limit.
+ # We also want to pad for encryption and base64 encoding.
+ LARGE_DATA_THRESHOLD_BYTES = 640 * 1024 * 1024  # 640MB
+
+
+ def calculate_data_size(data: List[Row]) -> int:  # noqa: D401 – utility function
+     """Return an approximate JSON-serialized size (in bytes) for a list of *Row*.
+
+     The implementation purposefully avoids serializing the entire payload when
+     *data* is large. For collections >1000 rows we sample a subset, measure the
+     encoded size, then extrapolate. This keeps memory usage bounded while still
+     giving us an order-of-magnitude estimate suitable for "should I stream this
+     out to S3?" decisions.
+     """
+
+     if not data:
+         return 0
+
+     try:
+         data_count = len(data)
+
+         # For very large datasets, estimate size from a sample to avoid memory issues
+         if data_count > 1000:
+             logger.debug(
+                 f"Calculating size for large dataset ({data_count} rows) using sampling"
+             )
+
+             sample_size = min(500, max(100, data_count // 20))  # 5% capped at 500
+
+             # stratified sampling – take items spaced across the set when possible
+             if data_count > sample_size * 3:
+                 step = data_count // sample_size
+                 sample_indices = list(range(0, data_count, step))[:sample_size]
+                 sample = [data[i] for i in sample_indices]
+             else:
+                 sample = data[:sample_size]
+
+             sample_json = json.dumps(
+                 sample, cls=CustomJSONEncoder, separators=(",", ":")
+             )
+             sample_bytes = len(sample_json.encode("utf-8"))
+
+             avg_record_size = sample_bytes / sample_size
+             content_size = int(avg_record_size * data_count)
+
+             # overhead: 2 bytes for [] plus a comma between every record plus 1% slack
+             structure_overhead = 2 + (data_count - 1) + int(content_size * 0.01)
+             return content_size + structure_overhead
+
+         # small datasets – just measure
+         json_str = json.dumps(data, cls=CustomJSONEncoder, separators=(",", ":"))
+         return len(json_str.encode("utf-8"))
+
+     except (TypeError, ValueError) as exc:
+         logger.warning(
+             f"Failed to calculate JSON size, falling back to sys.getsizeof: {exc}"
+         )
+         return sys.getsizeof(data)
+
+
+ def is_large_data(
+     data: List[Row], threshold_bytes: Optional[int] = None
+ ) -> bool:  # noqa: D401
+     """Return *True* if *data* is likely to exceed *threshold_bytes* when serialized."""
+
+     if not data:
+         return False
+
+     threshold = (
+         threshold_bytes if threshold_bytes is not None else LARGE_DATA_THRESHOLD_BYTES
+     )
+     size = calculate_data_size(data)
+     if size > threshold:
+         logger.info(
+             f"Data size ({size:,} bytes) exceeds threshold ({threshold:,} bytes) – using external storage"
+         )
+         return True
+     return False
+
+
+ __all__ = [
+     "calculate_data_size",
+     "is_large_data",
+     "LARGE_DATA_THRESHOLD_BYTES",
+ ]
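
Note (editorial): a short sketch combining these helpers with the external storage service added earlier in this diff; the 10 MB threshold override is invented for illustration.

    from fides.api.util.data_size import calculate_data_size, is_large_data

    rows = [{"id": i, "email": f"user{i}@example.com"} for i in range(2500)]

    approx_bytes = calculate_data_size(rows)  # sampled estimate once len(rows) > 1000
    print(f"approx size: {approx_bytes:,} bytes")

    # Override the 640MB default to force the external-storage branch, e.g. in tests
    if is_large_data(rows, threshold_bytes=10 * 1024 * 1024):
        print("would offload to external storage")
    else:
        print("small enough to keep inline")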