ethyca-fides 2.69.1rc0__py2.py3-none-any.whl → 2.69.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {ethyca_fides-2.69.1rc0.dist-info → ethyca_fides-2.69.2.dist-info}/METADATA +1 -1
- {ethyca_fides-2.69.1rc0.dist-info → ethyca_fides-2.69.2.dist-info}/RECORD +100 -100
- fides/_version.py +3 -3
- fides/api/service/privacy_request/dsr_package/dsr_report_builder.py +12 -4
- fides/api/service/privacy_request/request_runner_service.py +2 -2
- fides/api/service/storage/streaming/schemas.py +27 -19
- fides/api/service/storage/streaming/smart_open_client.py +2 -2
- fides/api/service/storage/streaming/smart_open_streaming_storage.py +202 -119
- fides/api/service/storage/util.py +9 -4
- fides/api/util/rate_limit.py +25 -7
- fides/config/security_settings.py +4 -1
- fides/ui-build/static/admin/404.html +1 -1
- fides/ui-build/static/admin/_next/static/{OmXHlY9MvjoZH9jDkAytl → Mzh6ue6wVfRTXIvDbuNvr}/_buildManifest.js +1 -1
- fides/ui-build/static/admin/_next/static/chunks/pages/integrations/[id]-5d40db1ce8810e1d.js +1 -0
- fides/ui-build/static/admin/add-systems/manual.html +1 -1
- fides/ui-build/static/admin/add-systems/multiple.html +1 -1
- fides/ui-build/static/admin/add-systems.html +1 -1
- fides/ui-build/static/admin/consent/configure/add-vendors.html +1 -1
- fides/ui-build/static/admin/consent/configure.html +1 -1
- fides/ui-build/static/admin/consent/privacy-experience/[id].html +1 -1
- fides/ui-build/static/admin/consent/privacy-experience/new.html +1 -1
- fides/ui-build/static/admin/consent/privacy-experience.html +1 -1
- fides/ui-build/static/admin/consent/privacy-notices/[id].html +1 -1
- fides/ui-build/static/admin/consent/privacy-notices/new.html +1 -1
- fides/ui-build/static/admin/consent/privacy-notices.html +1 -1
- fides/ui-build/static/admin/consent/properties.html +1 -1
- fides/ui-build/static/admin/consent/reporting.html +1 -1
- fides/ui-build/static/admin/consent.html +1 -1
- fides/ui-build/static/admin/data-catalog/[systemId]/projects/[projectUrn]/[resourceUrn].html +1 -1
- fides/ui-build/static/admin/data-catalog/[systemId]/projects/[projectUrn].html +1 -1
- fides/ui-build/static/admin/data-catalog/[systemId]/projects.html +1 -1
- fides/ui-build/static/admin/data-catalog/[systemId]/resources/[resourceUrn].html +1 -1
- fides/ui-build/static/admin/data-catalog/[systemId]/resources.html +1 -1
- fides/ui-build/static/admin/data-catalog.html +1 -1
- fides/ui-build/static/admin/data-discovery/action-center/[monitorId]/[systemId].html +1 -1
- fides/ui-build/static/admin/data-discovery/action-center/[monitorId].html +1 -1
- fides/ui-build/static/admin/data-discovery/action-center.html +1 -1
- fides/ui-build/static/admin/data-discovery/activity.html +1 -1
- fides/ui-build/static/admin/data-discovery/detection/[resourceUrn].html +1 -1
- fides/ui-build/static/admin/data-discovery/detection.html +1 -1
- fides/ui-build/static/admin/data-discovery/discovery/[resourceUrn].html +1 -1
- fides/ui-build/static/admin/data-discovery/discovery.html +1 -1
- fides/ui-build/static/admin/datamap.html +1 -1
- fides/ui-build/static/admin/dataset/[datasetId]/[collectionName]/[...subfieldNames].html +1 -1
- fides/ui-build/static/admin/dataset/[datasetId]/[collectionName].html +1 -1
- fides/ui-build/static/admin/dataset/[datasetId].html +1 -1
- fides/ui-build/static/admin/dataset/new.html +1 -1
- fides/ui-build/static/admin/dataset.html +1 -1
- fides/ui-build/static/admin/datastore-connection/[id].html +1 -1
- fides/ui-build/static/admin/datastore-connection/new.html +1 -1
- fides/ui-build/static/admin/datastore-connection.html +1 -1
- fides/ui-build/static/admin/index.html +1 -1
- fides/ui-build/static/admin/integrations/[id].html +1 -1
- fides/ui-build/static/admin/integrations.html +1 -1
- fides/ui-build/static/admin/login/[provider].html +1 -1
- fides/ui-build/static/admin/login.html +1 -1
- fides/ui-build/static/admin/messaging/[id].html +1 -1
- fides/ui-build/static/admin/messaging/add-template.html +1 -1
- fides/ui-build/static/admin/messaging.html +1 -1
- fides/ui-build/static/admin/poc/ant-components.html +1 -1
- fides/ui-build/static/admin/poc/form-experiments/AntForm.html +1 -1
- fides/ui-build/static/admin/poc/form-experiments/FormikAntFormItem.html +1 -1
- fides/ui-build/static/admin/poc/form-experiments/FormikControlled.html +1 -1
- fides/ui-build/static/admin/poc/form-experiments/FormikField.html +1 -1
- fides/ui-build/static/admin/poc/form-experiments/FormikSpreadField.html +1 -1
- fides/ui-build/static/admin/poc/forms.html +1 -1
- fides/ui-build/static/admin/poc/table-migration.html +1 -1
- fides/ui-build/static/admin/privacy-requests/[id].html +1 -1
- fides/ui-build/static/admin/privacy-requests/configure/messaging.html +1 -1
- fides/ui-build/static/admin/privacy-requests/configure/storage.html +1 -1
- fides/ui-build/static/admin/privacy-requests/configure.html +1 -1
- fides/ui-build/static/admin/privacy-requests.html +1 -1
- fides/ui-build/static/admin/properties/[id].html +1 -1
- fides/ui-build/static/admin/properties/add-property.html +1 -1
- fides/ui-build/static/admin/properties.html +1 -1
- fides/ui-build/static/admin/reporting/datamap.html +1 -1
- fides/ui-build/static/admin/settings/about/alpha.html +1 -1
- fides/ui-build/static/admin/settings/about.html +1 -1
- fides/ui-build/static/admin/settings/consent/[configuration_id]/[purpose_id].html +1 -1
- fides/ui-build/static/admin/settings/consent.html +1 -1
- fides/ui-build/static/admin/settings/custom-fields.html +1 -1
- fides/ui-build/static/admin/settings/domain-records.html +1 -1
- fides/ui-build/static/admin/settings/domains.html +1 -1
- fides/ui-build/static/admin/settings/email-templates.html +1 -1
- fides/ui-build/static/admin/settings/locations.html +1 -1
- fides/ui-build/static/admin/settings/organization.html +1 -1
- fides/ui-build/static/admin/settings/privacy-requests.html +1 -1
- fides/ui-build/static/admin/settings/regulations.html +1 -1
- fides/ui-build/static/admin/systems/configure/[id]/test-datasets.html +1 -1
- fides/ui-build/static/admin/systems/configure/[id].html +1 -1
- fides/ui-build/static/admin/systems.html +1 -1
- fides/ui-build/static/admin/taxonomy.html +1 -1
- fides/ui-build/static/admin/user-management/new.html +1 -1
- fides/ui-build/static/admin/user-management/profile/[id].html +1 -1
- fides/ui-build/static/admin/user-management.html +1 -1
- fides/ui-build/static/admin/_next/static/chunks/pages/integrations/[id]-4c3c413a2668df53.js +0 -1
- {ethyca_fides-2.69.1rc0.dist-info → ethyca_fides-2.69.2.dist-info}/WHEEL +0 -0
- {ethyca_fides-2.69.1rc0.dist-info → ethyca_fides-2.69.2.dist-info}/entry_points.txt +0 -0
- {ethyca_fides-2.69.1rc0.dist-info → ethyca_fides-2.69.2.dist-info}/licenses/LICENSE +0 -0
- {ethyca_fides-2.69.1rc0.dist-info → ethyca_fides-2.69.2.dist-info}/top_level.txt +0 -0
- /fides/ui-build/static/admin/_next/static/{OmXHlY9MvjoZH9jDkAytl → Mzh6ue6wVfRTXIvDbuNvr}/_ssgManifest.js +0 -0
fides/api/service/storage/streaming/smart_open_streaming_storage.py
CHANGED

@@ -3,11 +3,12 @@ from __future__ import annotations
 
 import csv
 import json
+import time
 from datetime import datetime
 from io import BytesIO, StringIO
 from itertools import chain
 from typing import Any, Generator, Iterable, Optional, Tuple
-from urllib.parse import urlparse
+from urllib.parse import unquote, urlparse
 
 from fideslang.validation import AnyHttpUrlString
 from loguru import logger
@@ -25,10 +26,11 @@ from fides.api.service.storage.streaming.dsr_storage import (
 )
 from fides.api.service.storage.streaming.retry import retry_cloud_storage_operation
 from fides.api.service.storage.streaming.schemas import (
-
+    DEFAULT_CHUNK_SIZE,
+    MAX_FILE_SIZE,
     AttachmentInfo,
     AttachmentProcessingInfo,
-
+    SmartOpenStreamingStorageConfig,
    StorageUploadConfig,
    StreamingBufferConfig,
 )
@@ -66,16 +68,22 @@ class SmartOpenStreamingStorage:
     def __init__(
         self,
         storage_client: SmartOpenStorageClient,
-        chunk_size: int =
+        chunk_size: int = DEFAULT_CHUNK_SIZE,
     ):
         """Initialize with a smart-open storage client.
 
         Args:
             storage_client: Smart-open based storage client
-            chunk_size: Size of chunks for streaming attachments (default:
+            chunk_size: Size of chunks for streaming attachments (default: 5MB)
+
+        Raises:
+            ValidationError: If chunk_size is outside valid range (1KB - 2GB)
         """
+        # Validate parameters using Pydantic schema
+        config = SmartOpenStreamingStorageConfig(chunk_size=chunk_size)
+
         self.storage_client = storage_client
-        self.chunk_size = chunk_size
+        self.chunk_size = config.chunk_size
         # Track used filenames per dataset to match DSR report builder behavior
         # Maps dataset_name -> set of used filenames
         self.used_filenames_per_dataset: dict[str, set[str]] = {}
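Note: the constructor now routes chunk_size through a Pydantic schema instead of trusting the raw argument. The schema itself lives in schemas.py, which this section does not show; a minimal sketch of what such a model could look like, assuming the 1KB to 2GB bounds named in the new Raises: docstring (the field bounds and constant values here are assumptions, not the released code):

from pydantic import BaseModel, Field

DEFAULT_CHUNK_SIZE = 5 * 1024 * 1024  # 5MB, matching the docstring default (assumed)
MAX_FILE_SIZE = 2 * 1024 * 1024 * 1024  # 2GB, mirroring LARGE_FILE_THRESHOLD below (assumed)

class SmartOpenStreamingStorageConfig(BaseModel):
    # ge/le enforce the documented "1KB - 2GB" range at construction time
    chunk_size: int = Field(default=DEFAULT_CHUNK_SIZE, ge=1024, le=MAX_FILE_SIZE)

config = SmartOpenStreamingStorageConfig(chunk_size=8 * 1024 * 1024)  # accepted
# SmartOpenStreamingStorageConfig(chunk_size=512) would raise pydantic.ValidationError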
@@ -98,10 +106,15 @@ class SmartOpenStreamingStorage:
         Raises:
             ValueError: If URL cannot be parsed
         """
+        if storage_key is None or storage_key == "":
+            logger.error(f"Storage key cannot be empty: {storage_key}")
+            raise ValueError("Storage key cannot be empty")
+
         if storage_key.startswith("s3://"):
             # Extract bucket from S3 URL: s3://bucket/path
             parts = storage_key.split("/")
             if len(parts) < 4:
+                logger.error(f"Invalid S3 URL format: {storage_key}")
                 raise ValueError(f"Invalid S3 URL format: {storage_key}")
             return parts[2], "/".join(parts[3:])
 
@@ -111,16 +124,16 @@ class SmartOpenStreamingStorage:
         parts = clean_url.split(S3_AMAZONAWS_COM_DOMAIN)
         if len(parts) == 2:
             bucket = parts[0].replace("https://", "").replace("http://", "")
-            key =
-                "/"
-            ) #
+            key = unquote(
+                parts[1].lstrip("/")
+            )  # URL-decode and strip leading slash for S3 compatibility
             return bucket, key
 
         # Handle generic HTTP(S) URLs
         if storage_key.startswith(("http://", "https://")):
             parsed = urlparse(storage_key)
             bucket = parsed.netloc
-            key = parsed.path.lstrip("/")
+            key = unquote(parsed.path.lstrip("/"))  # URL-decode the path
             return bucket, key
 
         raise ValueError(f"Could not parse storage URL: {storage_key}")
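Note: both parsing branches now pass keys through urllib.parse.unquote, so a percent-encoded object key taken from a URL round-trips back to the raw key the storage client expects. A standalone sketch of the same stdlib calls (the URL is a made-up example):

from urllib.parse import unquote, urlparse

url = "https://example-bucket.s3.amazonaws.com/attachments/my%20report%20%281%29.pdf"
parsed = urlparse(url)
bucket = parsed.netloc.split(".s3.amazonaws.com")[0]  # virtual-hosted-style bucket name
key = unquote(parsed.path.lstrip("/"))  # -> "attachments/my report (1).pdf"
print(bucket, key)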
@@ -143,103 +156,6 @@ class SmartOpenStreamingStorage:
             [content_bytes]
         )
 
-    def build_attachments_list(
-        self, data: dict, config: PackageSplitConfig
-    ) -> list[tuple[str, dict, int]]:
-        """
-        Build a list of attachments from the data.
-
-        Args:
-            data: The data to build the attachments list from
-            config: The configuration for package splitting
-
-        Returns:
-            A list of AttachmentInfo objects
-        """
-        attachments_list = []
-        for key, value in data.items():
-            if not isinstance(value, list):
-                continue
-
-            for item in value:
-                attachments = item.get("attachments", [])
-                if not isinstance(attachments, list):
-                    attachments = []
-
-                attachment_count = len(attachments)
-
-                # Only include items that have attachments
-                if attachment_count > 0:
-                    # If a single item has more attachments than the limit, we need to split it
-                    if attachment_count > config.max_attachments:
-                        # Split the item into multiple sub-items
-                        for i in range(0, attachment_count, config.max_attachments):
-                            sub_attachments = attachments[
-                                i : i + config.max_attachments
-                            ]
-                            sub_item = item.copy()
-                            sub_item["attachments"] = sub_attachments
-                            attachments_list.append(
-                                (key, sub_item, len(sub_attachments))
-                            )
-                    else:
-                        attachments_list.append((key, item, attachment_count))
-
-        return attachments_list
-
-    def split_data_into_packages(
-        self, data: dict, config: Optional[PackageSplitConfig] = None
-    ) -> list[dict]:
-        """Split large datasets into multiple smaller packages.
-
-        Uses a best-fit decreasing algorithm to optimize package distribution:
-        1. Sort items by attachment count (largest first)
-        2. Try to fit each item in the package with the most remaining space
-        3. Create new packages only when necessary
-        4. Handle items that exceed the max_attachments limit by splitting them
-
-        Args:
-            data: The data to split
-            config: Configuration for package splitting (defaults to PackageSplitConfig())
-
-        Returns:
-            List of data packages
-        """
-        # Use default config if none provided
-        if config is None:
-            config = PackageSplitConfig()
-
-        # Collect all items with their attachment counts
-        all_items = self.build_attachments_list(data, config)
-
-        # Sort by attachment count (largest first) for better space utilization
-        all_items.sort(key=lambda x: x[2], reverse=True)
-
-        packages: list[dict[str, Any]] = []
-        package_attachment_counts: list[int] = []
-
-        for key, item, attachment_count in all_items:
-            # Try to find a package with enough space
-            package_found = False
-
-            for i, current_count in enumerate(package_attachment_counts):
-                if current_count + attachment_count <= config.max_attachments:
-                    # Add to existing package
-                    if key not in packages[i]:
-                        packages[i][key] = []
-                    packages[i][key].append(item)
-                    package_attachment_counts[i] += attachment_count
-                    package_found = True
-                    break
-
-            if not package_found:
-                # Create new package - this item cannot fit in any existing package
-                new_package = {key: [item]}
-                packages.append(new_package)
-                package_attachment_counts.append(attachment_count)
-
-        return packages
-
     def _validate_attachment(
         self, attachment: dict
     ) -> Optional[AttachmentProcessingInfo]:
@@ -294,23 +210,75 @@ class SmartOpenStreamingStorage:
         Returns:
             Iterator that yields chunks of the attachment content
         """
+        for arg in [bucket, key, storage_key]:
+            if arg is None or arg == "":
+                logger.error(f"{arg} cannot be empty: {arg}")
+                raise ValueError(f"{arg} cannot be empty")
+
         try:
             with self.storage_client.stream_read(bucket, key) as content_stream:
                 # Stream in chunks instead of reading entire file
-                chunk_count = 0
-
-
-
+                chunk_count = total_bytes = 0
+                max_chunks = (
+                    MAX_FILE_SIZE // self.chunk_size + 1
+                )  # Safety limit to prevent infinite loops
+
+                size_based_timeout = MAX_FILE_SIZE // (10 * 1024 * 1024)  # 1s per 10MB
+                timeout = 300 + size_based_timeout  # 5 minutes base + 1s per 10MB
+                start_time = time.time()
+
+                # Log the calculated timeout for debugging
+                logger.debug(
+                    f"Starting stream for {storage_key} with timeout: {timeout}s "
+                    f"(base: 300s + size-based: {size_based_timeout}s)"
+                )
+
+                while chunk_count < max_chunks and total_bytes < MAX_FILE_SIZE:
+                    elapsed_time = time.time() - start_time
+                    if elapsed_time >= timeout:
+                        raise TimeoutError(
+                            f"Timeout reached ({timeout}s) while streaming attachment {storage_key}."
+                        )
+
+                    try:
+                        chunk = content_stream.read(self.chunk_size)
+                    except Exception as read_error:
+                        logger.error(
+                            f"Error reading chunk from stream for {storage_key}: {read_error}"
+                        )
+                        raise StorageUploadError(
+                            f"Stream read error for {storage_key}: {read_error}"
+                        ) from read_error
+
                     if not chunk:
+                        # End of stream reached normally
+                        logger.debug(
+                            f"Successfully streamed attachment {storage_key}: "
+                            f"{total_bytes} bytes in {chunk_count} chunks"
+                        )
                         break
+
                     chunk_count += 1
                     total_bytes += len(chunk)
                     yield chunk
 
+                # Log if we hit limits
+                if chunk_count >= max_chunks:
+                    logger.warning(
+                        f"Maximum chunk count ({max_chunks}) reached for attachment {storage_key}. "
+                        f"Streamed {total_bytes} bytes. Stream may be incomplete."
+                    )
+                elif total_bytes >= MAX_FILE_SIZE:
+                    logger.warning(
+                        f"Maximum file size ({MAX_FILE_SIZE} bytes) reached for attachment {storage_key}. "
+                        f"Streamed {total_bytes} bytes in {chunk_count} chunks. Stream may be incomplete."
+                    )
+
         except Exception as e:
-            logger.
-
-
+            logger.error(f"Failed to stream attachment {storage_key}: {e}")
+            raise StorageUploadError(
+                f"Failed to stream attachment {storage_key}: {e}"
+            ) from e
 
     def _collect_and_validate_attachments(
         self, data: dict
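Note: to make the new loop bounds concrete, assuming MAX_FILE_SIZE is 2GB (matching LARGE_FILE_THRESHOLD further down in this release) and the default 5MB chunk size, the guards work out as follows (constants assumed here, not read from schemas.py):

MAX_FILE_SIZE = 2 * 1024 * 1024 * 1024  # 2GB, assumed
chunk_size = 5 * 1024 * 1024  # 5MB default, assumed

max_chunks = MAX_FILE_SIZE // chunk_size + 1  # 410 reads at most
size_based_timeout = MAX_FILE_SIZE // (10 * 1024 * 1024)  # 204 (1s per 10MB)
timeout = 300 + size_based_timeout  # 504s total budget per attachment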
@@ -332,6 +300,8 @@ class SmartOpenStreamingStorage:
         processed_attachments: dict[tuple[str, str], str] = {}
 
         # Use the shared contextual processing function
+        # Note: This method should only be used when DSR report builder is not available
+        # For HTML format, use _collect_and_validate_attachments_from_dsr_builder instead
         processed_attachments_list = process_attachments_contextually(
             data,
             used_filenames_data,
@@ -557,8 +527,9 @@ class SmartOpenStreamingStorage:
             raise StorageUploadError(f"Failed to generate DSR report: {e}") from e
 
         # Use the DSR report builder's processed attachments to avoid duplicates
+        # Use the redacted data from the DSR report builder instead of the original data
         all_attachments = self._collect_and_validate_attachments_from_dsr_builder(
-
+            dsr_builder.dsr_data, dsr_builder
         )
 
         if not all_attachments:
@@ -595,7 +566,9 @@ class SmartOpenStreamingStorage:
         )
 
         # Create ZIP generator with attachment files
-        attachment_files_generator = self._create_attachment_files(
+        attachment_files_generator = self._create_attachment_files(
+            all_attachments, buffer_config
+        )
 
         # Combine both generators and stream the complete ZIP to storage
         combined_entries = chain(attachment_files_generator, dsr_files_generator)
@@ -673,6 +646,7 @@ class SmartOpenStreamingStorage:
             max_workers,
             batch_size,
             resp_format,
+            buffer_config,
         )
 
         # Use smart-open's streaming upload capability
@@ -718,6 +692,7 @@ class SmartOpenStreamingStorage:
         max_workers: int,
         batch_size: int,
         resp_format: str,
+        buffer_config: Optional[StreamingBufferConfig] = None,
     ) -> Generator[Tuple[str, datetime, int, Any, Iterable[bytes]], None, None]:
         """Create a generator for ZIP file contents including data and attachments.
 
@@ -743,7 +718,9 @@ class SmartOpenStreamingStorage:
         yield from self._convert_to_stream_zip_format(data_files_generator)
 
         # Then, yield attachment files (already in stream_zip format, stream directly)
-        attachment_files_generator = self._create_attachment_files(
+        attachment_files_generator = self._create_attachment_files(
+            all_attachments, buffer_config
+        )
         yield from attachment_files_generator
 
     def _create_data_files(
@@ -785,9 +762,62 @@ class SmartOpenStreamingStorage:
             data_content = json.dumps(value, default=str).encode("utf-8")
             yield f"{key}.json", BytesIO(data_content), {}
 
+    def _handle_attachment_error(
+        self,
+        all_attachments: list[AttachmentProcessingInfo],
+        failed_attachments: list[dict[str, Optional[str]]],
+    ) -> Generator[Tuple[str, datetime, int, Any, Iterable[bytes]], None, None]:
+        """Handle attachment errors and create a summary file."""
+
+        try:
+            # Calculate success rate with division by zero protection
+            total_attempted = len(all_attachments)
+            total_failed = len(failed_attachments)
+            success_rate = "N/A"
+
+            if total_attempted > 0:
+                success_rate = (
+                    f"{((total_attempted - total_failed) / total_attempted * 100):.1f}%"
+                )
+
+            error_summary = {
+                "failed_attachments": failed_attachments,
+                "total_failed": total_failed,
+                "total_attempted": total_attempted,
+                "success_rate": success_rate,
+                "timestamp": datetime.now().isoformat(),
+            }
+
+            error_summary_content = json.dumps(error_summary, indent=2).encode("utf-8")
+            yield (
+                "errors/attachment_failures_summary.json",
+                datetime.now(),
+                DEFAULT_FILE_MODE,
+                _ZIP_32_TYPE(),
+                iter([error_summary_content]),
+            )
+        except Exception as summary_error:
+            logger.error(f"Failed to create error summary: {summary_error}")
+            # Create a minimal error summary as fallback
+            fallback_summary = {
+                "error": "Failed to generate detailed error summary",
+                "total_failed": len(failed_attachments),
+                "total_attempted": len(all_attachments),
+                "timestamp": datetime.now().isoformat(),
+            }
+            fallback_content = json.dumps(fallback_summary, indent=2).encode("utf-8")
+            yield (
+                "errors/attachment_failures_summary.json",
+                datetime.now(),
+                DEFAULT_FILE_MODE,
+                _ZIP_32_TYPE(),
+                iter([fallback_content]),
+            )
+
     def _create_attachment_files(
         self,
         all_attachments: list[AttachmentProcessingInfo],
+        buffer_config: Optional[StreamingBufferConfig] = None,
     ) -> Generator[Tuple[str, datetime, int, Any, Iterable[bytes]], None, None]:
         """Create attachment files for the ZIP using true cloud-to-cloud streaming.
 
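Note: the 5-tuples yielded here, (name, modified time, mode, zip method, byte iterator), are the member-file format consumed by the stream_zip library, which is what lets the archive be emitted incrementally instead of buffered in memory; _ZIP_32_TYPE() is an instance of the same marker type stream_zip ships as ZIP_32. A minimal sketch using stream_zip directly, with placeholder content:

import stat
from datetime import datetime
from stream_zip import ZIP_32, stream_zip

member_files = [
    (
        "errors/attachment_failures_summary.json",  # path inside the archive
        datetime.now(),  # modified time
        stat.S_IFREG | 0o600,  # regular file, owner read/write
        ZIP_32,  # zip format/method marker
        iter([b'{"total_failed": 0}']),  # lazy chunks of file content
    ),
]

with open("report.zip", "wb") as f:
    for chunk in stream_zip(member_files):  # yields the ZIP as a byte stream
        f.write(chunk)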
@@ -797,13 +827,66 @@ class SmartOpenStreamingStorage:
 
         Args:
             all_attachments: List of validated attachments
+            buffer_config: Configuration for error handling behavior
 
         Returns:
             Generator yielding attachment file entries in stream_zip format
         """
+        if buffer_config is None:
+            buffer_config = StreamingBufferConfig()
+
+        failed_attachments = []
+
         for attachment_info in all_attachments:
-
-
+            try:
+                result = self._process_attachment_safely(attachment_info)
+                yield result
+            except StorageUploadError as e:
+                # Log the failure
+                failed_attachments.append(
+                    {
+                        "attachment": attachment_info.attachment.file_name,
+                        "storage_key": attachment_info.attachment.storage_key,
+                        "error": str(e),
+                    }
+                )
+                logger.error(
+                    f"Failed to process attachment {attachment_info.attachment.file_name} "
+                    f"({attachment_info.attachment.storage_key}): {e}"
+                )
+
+                # If fail_fast is enabled, re-raise the exception
+                if buffer_config.fail_fast_on_attachment_errors:
+                    raise
+
+                # Create a placeholder file with error information if error details are enabled
+                if buffer_config.include_error_details:
+                    error_content = (
+                        f"Error: Failed to retrieve attachment - {e}".encode("utf-8")
+                    )
+                    error_filename = (
+                        f"ERROR_{attachment_info.attachment.file_name or 'unknown'}"
+                    )
+                    yield (
+                        f"errors/{error_filename}",
+                        datetime.now(),
+                        DEFAULT_FILE_MODE,
+                        _ZIP_32_TYPE(),
+                        iter([error_content]),
+                    )
+
+        # Log summary of failed attachments
+        if failed_attachments:
+            logger.warning(
+                f"Failed to process {len(failed_attachments)} attachments: "
+                f"{[att['attachment'] for att in failed_attachments]}"
+            )
+
+            # Create a summary error file with details about all failures if error details are enabled
+            if buffer_config.include_error_details:
+                yield from self._handle_attachment_error(
+                    all_attachments, failed_attachments
+                )
 
     def _transform_data_for_access_package(
         self, data: dict[str, Any], all_attachments: list[AttachmentProcessingInfo]
fides/api/service/storage/util.py
CHANGED

@@ -10,7 +10,7 @@ from fides.api.util.storage_util import format_size
 
 # This is the max file size for downloading the content of an attachment.
 # This is an industry standard used by companies like Google and Microsoft.
-LARGE_FILE_THRESHOLD =
+LARGE_FILE_THRESHOLD = 2 * 1024 * 1024 * 1024  # 2 GB
 
 
 class AllowedFileType(EnumType):
@@ -172,6 +172,7 @@ def generate_attachment_url_from_storage_path(
     1. Using resolve_attachment_storage_path() to calculate the actual storage path
     2. Handling different directory structures (attachments vs data/dataset/collection)
     3. Generating proper relative paths from HTML template locations to attachment files
+    4. URL-encoding filenames for proper HTML link functionality
 
     Used by:
     - _process_attachment_list() in this file
@@ -191,17 +192,21 @@ def generate_attachment_url_from_storage_path(
     # Calculate the actual storage path
     storage_path = resolve_attachment_storage_path(unique_filename, base_path)
 
+    # URL-encode the filename for proper HTML link functionality
+    # Always encode when streaming is enabled to ensure consistency
+    encoded_filename = quote(unique_filename, safe="")
+
     # Generate relative path from HTML template directory to storage path
     if html_directory == "attachments" and base_path == "attachments":
         # From attachments/index.html to attachments/filename.pdf (same directory)
-        return
+        return encoded_filename
     if html_directory.startswith("data/") and base_path.startswith("data/"):
         # From data/dataset/collection/index.html to data/dataset/collection/attachments/filename.pdf
         # Both are in data/ structure, so go to attachments subdirectory
-        return f"attachments/{
+        return f"attachments/{encoded_filename}"
     # For other cases, calculate relative path
     # This is a simplified approach - in practice, you might need more sophisticated path resolution
-    return f"../{storage_path}"
+    return f"../{storage_path.replace(unique_filename, encoded_filename)}"
     return download_url
 
 
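Note: quote(..., safe="") percent-encodes every reserved character, including "/", so a filename with spaces or parentheses becomes a single safe path segment in the generated HTML link. For illustration (made-up filename):

from urllib.parse import quote

print(quote("annual report (final).pdf", safe=""))
# annual%20report%20%28final%29.pdf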
fides/api/util/rate_limit.py
CHANGED

@@ -180,15 +180,33 @@ is_rate_limit_enabled = (
     CONFIG.security.rate_limit_client_ip_header is not None
     and CONFIG.security.rate_limit_client_ip_header != ""
 )
-
-
-
-        CONFIG.security.request_rate_limit
-    ],  # Creates ONE shared bucket for all endpoints
+
+disabled_limiter = Limiter(
+    default_limits=[CONFIG.security.request_rate_limit],
     headers_enabled=True,
     key_prefix=CONFIG.security.rate_limit_prefix,
     key_func=safe_rate_limit_key,
     retry_after="http-date",
-
-    enabled=is_rate_limit_enabled,
+    enabled=False,
 )
+
+try:
+    if is_rate_limit_enabled:
+        fides_limiter = Limiter(
+            storage_uri=CONFIG.redis.connection_url_unencoded,
+            application_limits=[
+                CONFIG.security.request_rate_limit
+            ],  # Creates ONE shared bucket for all endpoints
+            headers_enabled=True,
+            key_prefix=CONFIG.security.rate_limit_prefix,
+            key_func=safe_rate_limit_key,
+            retry_after="http-date",
+            in_memory_fallback_enabled=False,  # Fall back to no rate limiting if Redis unavailable
+            enabled=is_rate_limit_enabled,
+        )
+    else:
+        fides_limiter = disabled_limiter
+except Exception as e:
+    logger.exception("Error instantiating rate limiter: {}", e)
+    if is_rate_limit_enabled:
+        raise e
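Note: Limiter here is slowapi's. The new shape builds a permanently disabled fallback limiter first and only attempts the Redis-backed limiter inside a try block, so a bad Redis URL can no longer break import when rate limiting is effectively off. For context, the canonical slowapi wiring into a FastAPI app looks roughly like this (the limit string and app are placeholders, not fides' configuration):

from fastapi import FastAPI
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.util import get_remote_address

limiter = Limiter(
    key_func=get_remote_address,  # how callers are identified
    application_limits=["1000/minute"],  # one shared bucket across all endpoints
    headers_enabled=True,  # emit X-RateLimit-* response headers
)

app = FastAPI()
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)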
fides/config/security_settings.py
CHANGED

@@ -222,10 +222,13 @@ class SecuritySettings(FidesSettings):
     def validate_rate_limit_client_ip_header(
         cls,
         v: str,
-    ) -> str:
+    ) -> Optional[str]:
         """Validate supported `rate_limit_client_ip_header`"""
         insecure_headers = ["x-forwarded-for"]
 
+        if not v:
+            return None
+
         if v.lower() in insecure_headers:
             raise ValueError(
                 "The rate_limit_client_ip_header cannot be set to a header that is not secure."
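Note: the validator now returns Optional[str] and normalizes empty strings to None, so the is_rate_limit_enabled check in rate_limit.py only has one falsy shape to test. A self-contained sketch of the same pattern on a hypothetical model (Pydantic v2 style; fides' actual base class and decorators may differ):

from typing import Optional
from pydantic import BaseModel, field_validator

class ExampleSecuritySettings(BaseModel):  # hypothetical stand-in
    rate_limit_client_ip_header: Optional[str] = None

    @field_validator("rate_limit_client_ip_header", mode="before")
    @classmethod
    def validate_header(cls, v: Optional[str]) -> Optional[str]:
        if not v:
            return None  # treat "" and None identically
        if v.lower() == "x-forwarded-for":  # spoofable header, rejected
            raise ValueError("rate_limit_client_ip_header must be a secure header")
        return v

assert ExampleSecuritySettings(rate_limit_client_ip_header="").rate_limit_client_ip_header is None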
fides/ui-build/static/admin/404.html
CHANGED

@@ -1 +1 @@
-<!DOCTYPE html><html lang="en"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link data-next-font="" rel="preconnect" href="/" crossorigin="anonymous"/><link rel="preload" href="/_next/static/css/650df9c348000a26.css" as="style"/><link rel="stylesheet" href="/_next/static/css/650df9c348000a26.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/_next/static/chunks/polyfills-42372ed130431b0a.js"></script><script src="/_next/static/chunks/webpack-678e89d68dbcd94f.js" defer=""></script><script src="/_next/static/chunks/framework-c92fc3344e6fd165.js" defer=""></script><script src="/_next/static/chunks/main-090643377c8254e6.js" defer=""></script><script src="/_next/static/chunks/pages/_app-fcdad91f6f66292b.js" defer=""></script><script src="/_next/static/chunks/pages/404-471a6b18e712f050.js" defer=""></script><script src="/_next/static/
+<!DOCTYPE html><html lang="en"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link data-next-font="" rel="preconnect" href="/" crossorigin="anonymous"/><link rel="preload" href="/_next/static/css/650df9c348000a26.css" as="style"/><link rel="stylesheet" href="/_next/static/css/650df9c348000a26.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/_next/static/chunks/polyfills-42372ed130431b0a.js"></script><script src="/_next/static/chunks/webpack-678e89d68dbcd94f.js" defer=""></script><script src="/_next/static/chunks/framework-c92fc3344e6fd165.js" defer=""></script><script src="/_next/static/chunks/main-090643377c8254e6.js" defer=""></script><script src="/_next/static/chunks/pages/_app-fcdad91f6f66292b.js" defer=""></script><script src="/_next/static/chunks/pages/404-471a6b18e712f050.js" defer=""></script><script src="/_next/static/Mzh6ue6wVfRTXIvDbuNvr/_buildManifest.js" defer=""></script><script src="/_next/static/Mzh6ue6wVfRTXIvDbuNvr/_ssgManifest.js" defer=""></script><style>.data-ant-cssinjs-cache-path{content:"";}</style></head><body><div id="__next"><div style="height:100%;display:flex"></div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/404","query":{},"buildId":"Mzh6ue6wVfRTXIvDbuNvr","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>