atlan-application-sdk 0.1.1rc39__py3-none-any.whl → 0.1.1rc41__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- application_sdk/activities/.cursor/BUGBOT.md +424 -0
- application_sdk/activities/metadata_extraction/sql.py +400 -25
- application_sdk/application/__init__.py +2 -0
- application_sdk/application/metadata_extraction/sql.py +3 -0
- application_sdk/clients/.cursor/BUGBOT.md +280 -0
- application_sdk/clients/models.py +42 -0
- application_sdk/clients/sql.py +127 -87
- application_sdk/clients/temporal.py +3 -1
- application_sdk/common/.cursor/BUGBOT.md +316 -0
- application_sdk/common/aws_utils.py +259 -11
- application_sdk/common/utils.py +145 -9
- application_sdk/constants.py +8 -0
- application_sdk/decorators/.cursor/BUGBOT.md +279 -0
- application_sdk/handlers/__init__.py +8 -1
- application_sdk/handlers/sql.py +63 -22
- application_sdk/inputs/.cursor/BUGBOT.md +250 -0
- application_sdk/interceptors/.cursor/BUGBOT.md +320 -0
- application_sdk/interceptors/cleanup.py +171 -0
- application_sdk/interceptors/events.py +6 -6
- application_sdk/observability/decorators/observability_decorator.py +36 -22
- application_sdk/outputs/.cursor/BUGBOT.md +295 -0
- application_sdk/outputs/iceberg.py +4 -0
- application_sdk/outputs/json.py +6 -0
- application_sdk/outputs/parquet.py +13 -3
- application_sdk/server/.cursor/BUGBOT.md +442 -0
- application_sdk/server/fastapi/__init__.py +59 -3
- application_sdk/server/fastapi/models.py +27 -0
- application_sdk/services/objectstore.py +16 -3
- application_sdk/version.py +1 -1
- application_sdk/workflows/.cursor/BUGBOT.md +218 -0
- {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/METADATA +1 -1
- {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/RECORD +35 -24
- {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/licenses/NOTICE +0 -0
--- /dev/null
+++ b/application_sdk/interceptors/cleanup.py
@@ -0,0 +1,171 @@
+import os
+import shutil
+from datetime import timedelta
+from typing import Any, Dict, List, Optional, Type
+
+from pydantic import BaseModel
+from temporalio import activity, workflow
+from temporalio.common import RetryPolicy
+from temporalio.worker import (
+    ExecuteWorkflowInput,
+    Interceptor,
+    WorkflowInboundInterceptor,
+    WorkflowInterceptorClassInput,
+)
+
+from application_sdk.activities.common.utils import build_output_path
+from application_sdk.constants import CLEANUP_BASE_PATHS, TEMPORARY_PATH
+from application_sdk.observability.logger_adaptor import get_logger
+
+logger = get_logger(__name__)
+activity.logger = logger
+workflow.logger = logger
+
+
+class CleanupResult(BaseModel):
+    """Result model for cleanup operations.
+
+    Attributes:
+        path_results (Dict[str, bool]): Cleanup results for each path (True=success, False=failure)
+    """
+
+    path_results: Dict[str, bool]
+
+
+@activity.defn
+async def cleanup() -> CleanupResult:
+    """Clean up temporary artifacts and activity state for the current workflow.
+
+    Performs two types of cleanup:
+    1. File cleanup: Removes all contents from configured base paths or default workflow directory
+    2. State cleanup: Clears activity state for the current workflow (includes resource cleanup)
+
+    Uses CLEANUP_BASE_PATHS constant or defaults to workflow-specific artifacts directory.
+
+    Returns:
+        CleanupResult: Structured cleanup results with path results and summary statistics.
+    """
+    path_results: Dict[str, bool] = {}
+    base_paths: List[str] = [os.path.join(TEMPORARY_PATH, build_output_path())]
+
+    # Use configured paths or default to workflow-specific artifacts directory
+    if CLEANUP_BASE_PATHS:
+        base_paths = CLEANUP_BASE_PATHS
+        logger.info(f"Using CLEANUP_BASE_PATHS: {base_paths} for cleanup")
+
+    logger.info(f"Cleaning up all contents from base paths: {base_paths}")
+
+    for base_path in base_paths:
+        try:
+            if os.path.exists(base_path):
+                if os.path.isdir(base_path):
+                    # Remove entire directory and recreate it empty
+                    shutil.rmtree(base_path)
+                    logger.info(f"Cleaned up all contents from: {base_path}")
+                    path_results[base_path] = True
+                else:
+                    logger.warning(f"Path is not a directory: {base_path}")
+                    path_results[base_path] = False
+            else:
+                logger.debug(f"Directory doesn't exist: {base_path}")
+                path_results[base_path] = True
+
+        except Exception as e:
+            logger.error(f"Unexpected error cleaning up {base_path}: {e}")
+            path_results[base_path] = False
+
+    return CleanupResult(
+        path_results=path_results,
+    )
+
+
+class CleanupWorkflowInboundInterceptor(WorkflowInboundInterceptor):
+    """Interceptor for workflow-level app artifacts cleanup.
+
+    This interceptor cleans up the entire app directory structure when the workflow
+    completes or fails, following the pattern: base_path/appname/workflow_id/run_id
+    Supports multiple base paths for comprehensive cleanup.
+    """
+
+    async def execute_workflow(self, input: ExecuteWorkflowInput) -> Any:
+        """Execute a workflow with app artifacts cleanup.
+
+        Args:
+            input (ExecuteWorkflowInput): The workflow execution input
+
+        Returns:
+            Any: The result of the workflow execution
+
+        Raises:
+            Exception: Re-raises any exceptions from workflow execution
+        """
+        output = None
+        try:
+            output = await super().execute_workflow(input)
+        except Exception:
+            raise
+
+        finally:
+            # Always attempt cleanup regardless of workflow success/failure
+            try:
+                await workflow.execute_activity(
+                    cleanup,
+                    schedule_to_close_timeout=timedelta(minutes=5),
+                    retry_policy=RetryPolicy(
+                        maximum_attempts=3,
+                    ),
+                    summary="This activity is used to cleanup the local artifacts and the activity state after the workflow is completed.",
+                )
+
+                logger.info("Cleanup completed successfully")
+
+            except Exception as e:
+                logger.warning(f"Failed to cleanup artifacts: {e}")
+                # Don't re-raise - cleanup failures shouldn't fail the workflow
+
+        return output
+
+
+class CleanupInterceptor(Interceptor):
+    """Temporal interceptor for automatic app artifacts cleanup.
+
+    This interceptor provides cleanup capabilities for application artifacts
+    across multiple base paths following the pattern: base_path/appname/workflow_id/run_id
+
+    Features:
+    - Automatic cleanup of app-specific artifact directories
+    - Cleanup on workflow completion or failure
+    - Supports multiple cleanup paths via ATLAN_CLEANUP_BASE_PATHS env var
+    - Simple activity-based cleanup logic
+    - Comprehensive error handling and logging
+
+    Example:
+        >>> # Register the interceptor with Temporal worker
+        >>> worker = Worker(
+        ...     client,
+        ...     task_queue="my-task-queue",
+        ...     workflows=[MyWorkflow],
+        ...     activities=[my_activity, cleanup],
+        ...     interceptors=[CleanupInterceptor()]
+        ... )
+
+    Environment Configuration:
+        >>> # Single path (default)
+        >>> ATLAN_CLEANUP_BASE_PATHS="./local/tmp/artifacts/apps"
+
+        >>> # Multiple paths (comma-separated)
+        >>> ATLAN_CLEANUP_BASE_PATHS="./local/tmp/artifacts/apps,/storage/temp/apps,/shared/cleanup/apps"
+    """
+
+    def workflow_interceptor_class(
+        self, input: WorkflowInterceptorClassInput
+    ) -> Optional[Type[WorkflowInboundInterceptor]]:
+        """Get the workflow interceptor class for cleanup.
+
+        Args:
+            input (WorkflowInterceptorClassInput): The interceptor input
+
+        Returns:
+            Optional[Type[WorkflowInboundInterceptor]]: The workflow interceptor class
+        """
+        return CleanupWorkflowInboundInterceptor
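The `CleanupInterceptor` docstring shows the worker arguments in isolation; the sketch below puts them in a runnable context. It is illustrative only: the Temporal address, task queue name, and `GreetWorkflow` are assumptions for the example, while `CleanupInterceptor` and the `cleanup` activity come from the new module above.

```python
# Illustrative sketch, not from the SDK: register the cleanup activity and
# interceptor on a Temporal worker. Assumes a server at localhost:7233;
# GreetWorkflow and the task queue name are made up for the example.
import asyncio

from temporalio import workflow
from temporalio.client import Client
from temporalio.worker import Worker

from application_sdk.interceptors.cleanup import CleanupInterceptor, cleanup


@workflow.defn
class GreetWorkflow:
    @workflow.run
    async def run(self, name: str) -> str:
        return f"Hello, {name}"


async def main() -> None:
    client = await Client.connect("localhost:7233")
    worker = Worker(
        client,
        task_queue="my-task-queue",
        workflows=[GreetWorkflow],
        activities=[cleanup],  # the cleanup activity must be registered explicitly
        interceptors=[CleanupInterceptor()],
    )
    await worker.run()


if __name__ == "__main__":
    asyncio.run(main())
```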
--- a/application_sdk/interceptors/events.py
+++ b/application_sdk/interceptors/events.py
@@ -23,6 +23,8 @@ from application_sdk.observability.logger_adaptor import get_logger
 from application_sdk.services.eventstore import EventStore
 
 logger = get_logger(__name__)
+activity.logger = logger
+workflow.logger = logger
 
 TEMPORAL_NOT_FOUND_FAILURE = (
     "type.googleapis.com/temporal.api.errordetails.v1.NotFoundFailure"
@@ -41,9 +43,9 @@ async def publish_event(event_data: dict) -> None:
     try:
         event = Event(**event_data)
         await EventStore.publish_event(event)
-
+        logger.info(f"Published event: {event_data.get('event_name','')}")
     except Exception as e:
-
+        logger.error(f"Failed to publish event: {e}")
         raise
 
 
@@ -123,7 +125,7 @@ class EventWorkflowInboundInterceptor(WorkflowInboundInterceptor):
                 retry_policy=RetryPolicy(maximum_attempts=3),
             )
         except Exception as e:
-
+            logger.warning(f"Failed to publish workflow start event: {e}")
             # Don't fail the workflow if event publishing fails
 
         output = None
@@ -152,9 +154,7 @@ class EventWorkflowInboundInterceptor(WorkflowInboundInterceptor):
                     retry_policy=RetryPolicy(maximum_attempts=3),
                 )
            except Exception as publish_error:
-                logger.warning(
-                    f"Failed to publish workflow end event: {publish_error}"
-                )
+                logger.warning(f"Failed to publish workflow end event: {publish_error}")
 
         return output
 
--- a/application_sdk/observability/decorators/observability_decorator.py
+++ b/application_sdk/observability/decorators/observability_decorator.py
@@ -4,7 +4,9 @@ import time
 import uuid
 from typing import Any, Callable, TypeVar, cast
 
-from application_sdk.observability.
+from application_sdk.observability.logger_adaptor import get_logger
+from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
+from application_sdk.observability.traces_adaptor import get_traces
 
 T = TypeVar("T")
 
@@ -136,9 +138,9 @@ def _record_error_observability(
 
 
 def observability(
-    logger: Any,
-    metrics: Any,
-    traces: Any,
+    logger: Any = None,
+    metrics: Any = None,
+    traces: Any = None,
 ) -> Callable[[Callable[..., T]], Callable[..., T]]:
     """Decorator for adding observability to functions.
 
@@ -146,16 +148,23 @@ def observability(
     It handles both synchronous and asynchronous functions.
 
     Args:
-        logger: Logger instance for operation logging
-        metrics: Metrics adapter for recording operation metrics
-        traces: Traces adapter for recording operation traces
+        logger: Logger instance for operation logging. If None, auto-initializes using get_logger()
+        metrics: Metrics adapter for recording operation metrics. If None, auto-initializes using get_metrics()
+        traces: Traces adapter for recording operation traces. If None, auto-initializes using get_traces()
 
     Returns:
         Callable: Decorated function with observability
 
     Example:
         ```python
+        # With explicit observability components
         @observability(logger, metrics, traces)
+        async def my_function():
+            # Function implementation
+            pass
+
+        # With auto-initialization (recommended)
+        @observability()
         async def my_function():
             # Function implementation
             pass
@@ -163,6 +172,11 @@ def observability(
     """
 
     def decorator(func: Callable[..., T]) -> Callable[..., T]:
+        # Auto-initialize observability components if not provided
+        actual_logger = logger or get_logger(func.__module__)
+        actual_metrics = metrics or get_metrics()
+        actual_traces = traces or get_traces()
+
         # Get function metadata
         func_name = func.__name__
         func_doc = func.__doc__ or f"Executing {func_name}"
@@ -170,7 +184,7 @@ def observability(
         is_async = inspect.iscoroutinefunction(func)
 
         # Debug logging for function decoration
-
+        actual_logger.debug(f"Decorating function {func_name} (async={is_async})")
 
         @functools.wraps(func)
         async def async_wrapper(*args: Any, **kwargs: Any) -> T:
@@ -181,16 +195,16 @@ def observability(
 
             try:
                 # Log start of operation
-
+                actual_logger.debug(f"Starting async function {func_name}")
 
                 # Execute the function
                 result = await func(*args, **kwargs)
 
                 # Record success observability
                 _record_success_observability(
-                    logger,
-                    metrics,
-                    traces,
+                    actual_logger,
+                    actual_metrics,
+                    actual_traces,
                     func_name,
                     func_doc,
                     func_module,
@@ -204,9 +218,9 @@ def observability(
             except Exception as e:
                 # Record error observability
                 _record_error_observability(
-                    logger,
-                    metrics,
-                    traces,
+                    actual_logger,
+                    actual_metrics,
+                    actual_traces,
                     func_name,
                     func_doc,
                     func_module,
@@ -226,16 +240,16 @@ def observability(
 
             try:
                 # Log start of operation
-
+                actual_logger.debug(f"Starting sync function {func_name}")
 
                 # Execute the function
                 result = func(*args, **kwargs)
 
                 # Record success observability
                 _record_success_observability(
-                    logger,
-                    metrics,
-                    traces,
+                    actual_logger,
+                    actual_metrics,
+                    actual_traces,
                     func_name,
                     func_doc,
                     func_module,
@@ -249,9 +263,9 @@ def observability(
             except Exception as e:
                 # Record error observability
                 _record_error_observability(
-                    logger,
-                    metrics,
-                    traces,
+                    actual_logger,
+                    actual_metrics,
+                    actual_traces,
                     func_name,
                     func_doc,
                     func_module,
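The decorator change above is an instance of a common pattern: accept the observability components as optional arguments and resolve per-module defaults inside `decorator`. Below is a standalone sketch of that pattern using the standard `logging` module as a stand-in for the SDK's adaptors; it is not the SDK's implementation.

```python
# Standalone sketch of the "optional dependencies with auto-initialization"
# decorator pattern; get_logger here is a plain stdlib stand-in, not the SDK's.
import functools
import inspect
import logging
from typing import Any, Callable, Optional, TypeVar, cast

T = TypeVar("T")


def get_logger(name: str) -> logging.Logger:
    # Stand-in for the SDK's get_logger(): returns a per-module stdlib logger.
    return logging.getLogger(name)


def observability(
    logger: Optional[logging.Logger] = None,
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    def decorator(func: Callable[..., T]) -> Callable[..., T]:
        # Resolve the default once, at decoration time, if none was supplied.
        actual_logger = logger or get_logger(func.__module__)

        if inspect.iscoroutinefunction(func):

            @functools.wraps(func)
            async def async_wrapper(*args: Any, **kwargs: Any) -> T:
                actual_logger.debug("Starting async function %s", func.__name__)
                return await func(*args, **kwargs)

            return cast(Callable[..., T], async_wrapper)

        @functools.wraps(func)
        def sync_wrapper(*args: Any, **kwargs: Any) -> T:
            actual_logger.debug("Starting sync function %s", func.__name__)
            return func(*args, **kwargs)

        return cast(Callable[..., T], sync_wrapper)

    return decorator


@observability()  # no arguments: the logger is resolved automatically
def add(a: int, b: int) -> int:
    return a + b
```

Resolving the defaults at decoration time rather than per call keeps the runtime overhead limited to the wrapped call itself.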
--- /dev/null
+++ b/application_sdk/outputs/.cursor/BUGBOT.md
@@ -0,0 +1,295 @@
+# Output Code Review Guidelines - Data Output Processing
+
+## Context-Specific Patterns
+
+This directory contains output processing implementations for various data formats (JSON, Parquet, Iceberg). Output processors must handle data uploads efficiently while maintaining data integrity and correct destination paths.
+
+### Phase 1: Critical Output Safety Issues
+
+**Object Store Path Management:**
+
+- **Correct destination paths**: Upload paths must respect user-configured output prefixes
+- **Path construction accuracy**: Object store keys must be calculated correctly, not hardcoded
+- **User prefix preservation**: Respect user-provided output directories and naming conventions
+- **Path validation**: Ensure upload paths don't conflict with existing data
+
+**Data Integrity and Security:**
+
+- All output data must be validated before upload
+- File permissions and access controls must be properly set
+- Data serialization must be consistent and recoverable
+- Prevent overwriting critical data without confirmation
+- Maintain data lineage information in output metadata
+
+```python
+# ✅ DO: Proper object store upload path handling
+class JsonOutput:
+    async def upload_to_object_store(
+        self,
+        data: List[dict],
+        output_prefix: str,  # User-provided output location
+        filename: str
+    ) -> dict:
+        """Upload data with correct path handling."""
+
+        # Construct full object store path respecting user's output prefix
+        object_store_key = os.path.join(output_prefix, filename)
+
+        # Serialize data
+        json_data = orjson.dumps(data, option=orjson.OPT_APPEND_NEWLINE)
+
+        # Upload to correct location
+        result = await self.object_store.upload_file(
+            data=json_data,
+            destination=object_store_key  # Respect user's intended location
+        )
+
+        return result
+
+# ❌ REJECT: Incorrect path handling
+class BadJsonOutput:
+    async def upload_to_object_store(self, data: List[dict], filename: str):
+        # Wrong: hardcoded or derived path, ignoring user configuration
+        object_store_key = get_object_store_prefix(f"/tmp/{filename}")  # Ignores output_prefix!
+
+        result = await self.object_store.upload_file(
+            data=orjson.dumps(data),
+            destination=object_store_key  # Wrong destination!
+        )
+        return result
+```
+
+### Phase 2: Output Architecture Patterns
+
+**Performance Optimization Requirements:**
+
+- **Parallelization opportunities**: Flag sequential upload operations that could be parallelized
+- **Batch processing**: Group related uploads to reduce overhead
+- **Streaming uploads**: Use streaming for large datasets instead of loading into memory
+- **Connection optimization**: Reuse object store connections across operations
+
+**Resource Management:**
+
+- Use proper connection pooling for object store operations
+- Implement timeout handling for upload operations
+- Clean up temporary files after upload
+- Handle partial upload failures gracefully
+- Monitor memory usage during large data serialization
+
+```python
+# ✅ DO: Parallel upload processing
+async def upload_multiple_datasets_parallel(
+    self,
+    datasets: List[Tuple[List[dict], str]],  # (data, filename) pairs
+    output_prefix: str
+) -> List[dict]:
+    """Upload multiple datasets in parallel for better performance."""
+
+    async def upload_single_dataset(data: List[dict], filename: str) -> dict:
+        """Upload a single dataset with error handling."""
+        try:
+            object_store_key = os.path.join(output_prefix, filename)
+            serialized_data = orjson.dumps(data, option=orjson.OPT_APPEND_NEWLINE)
+
+            return await self.object_store.upload_file(
+                data=serialized_data,
+                destination=object_store_key
+            )
+        except Exception as e:
+            logger.error(f"Failed to upload {filename}: {e}")
+            raise
+
+    # Parallel processing with controlled concurrency
+    semaphore = asyncio.Semaphore(5)  # Limit concurrent uploads
+
+    async def upload_with_semaphore(data: List[dict], filename: str) -> dict:
+        async with semaphore:
+            return await upload_single_dataset(data, filename)
+
+    tasks = [upload_with_semaphore(data, filename) for data, filename in datasets]
+    return await asyncio.gather(*tasks)
+
+# ❌ REJECT: Sequential upload processing
+async def upload_multiple_datasets_sequential(
+    self,
+    datasets: List[Tuple[List[dict], str]],
+    output_prefix: str
+) -> List[dict]:
+    """Sequential uploads - should be flagged for parallelization."""
+    results = []
+    for data, filename in datasets:  # FLAG: Could be parallelized
+        object_store_key = os.path.join(output_prefix, filename)
+        result = await self.object_store.upload_file(data, object_store_key)
+        results.append(result)
+    return results
+```
+
+### Phase 3: Output Testing Requirements
+
+**Data Output Testing:**
+
+- Test with various data formats and sizes
+- Test serialization and deserialization consistency
+- Test partial upload scenarios and recovery
+- Mock object store operations in unit tests
+- Include integration tests with real object store
+- Test data corruption detection and prevention
+
+**Performance Testing:**
+
+- Include tests for large dataset uploads
+- Test memory usage during serialization
+- Test concurrent upload operations
+- Verify timeout handling works correctly
+- Test connection pool behavior under load
+
+### Phase 4: Performance and Scalability
+
+**Data Upload Efficiency:**
+
+- Use streaming uploads for large datasets
+- Implement proper chunking for oversized data
+- Use compression for large text-based outputs
+- Monitor upload progress and provide feedback
+- Optimize serialization performance (use orjson over json)
+
+**Object Store Optimization:**
+
+- Use connection pooling for object store clients
+- Implement proper retry logic for upload failures
+- Use parallel uploads where appropriate
+- Monitor upload metrics and error rates
+- Handle bandwidth limitations gracefully
+
+### Phase 5: Output Maintainability
+
+**Error Handling and Recovery:**
+
+- Implement comprehensive error handling for all upload operations
+- Provide meaningful error messages with upload context
+- Handle partial upload failures gracefully
+- Implement proper retry logic for transient failures
+- Log all upload operations with destination information
+
+**Configuration Management:**
+
+- Externalize all output-related configuration
+- Support different output destinations and formats
+- Validate output configuration before processing
+- Document all supported output parameters
+- Handle environment-specific output requirements
+
+---
+
+## Output-Specific Anti-Patterns
+
+**Always Reject:**
+
+- **Path derivation errors**: Deriving object store paths from local temporary paths
+- **Sequential uploads**: Uploading multiple files sequentially when parallel uploads are possible
+- **Memory inefficiency**: Loading entire datasets into memory for serialization
+- **Missing upload verification**: Not verifying successful uploads
+- **Poor error recovery**: Not handling partial upload failures gracefully
+- **Resource leaks**: Not cleaning up temporary files or connections
+
+**Object Store Upload Anti-Patterns:**
+
+```python
+# ❌ REJECT: Incorrect upload path handling
+class BadOutputProcessor:
+    async def upload_results(self, results: List[dict]):
+        # Wrong: derives upload path from temporary local path
+        local_temp_file = "/tmp/results.json"
+        upload_key = get_object_store_prefix(local_temp_file)  # Incorrect!
+
+        await self.object_store.upload_file(results, upload_key)
+
+# ✅ REQUIRE: Correct upload path handling
+class GoodOutputProcessor:
+    async def upload_results(
+        self,
+        results: List[dict],
+        output_prefix: str,  # User-specified destination
+        filename: str = "results.json"
+    ):
+        # Use actual user-configured output location
+        upload_key = os.path.join(output_prefix, filename)
+
+        await self.object_store.upload_file(
+            data=orjson.dumps(results),
+            destination=upload_key  # Correct destination
+        )
+```
+
+**Performance Anti-Patterns:**
+
+```python
+# ❌ REJECT: Sequential upload processing
+async def upload_multiple_files_sequential(file_data_pairs: List[Tuple]):
+    results = []
+    for data, filename in file_data_pairs:  # Should be parallelized
+        result = await upload_single_file(data, filename)
+        results.append(result)
+    return results
+
+# ✅ REQUIRE: Parallel upload processing with proper error handling
+async def upload_multiple_files_parallel(
+    file_data_pairs: List[Tuple],
+    max_concurrency: int = 5
+) -> List[dict]:
+    semaphore = asyncio.Semaphore(max_concurrency)
+
+    async def upload_with_semaphore(data, filename):
+        async with semaphore:
+            try:
+                return await upload_single_file(data, filename)
+            except Exception as e:
+                logger.error(f"Upload failed for {filename}: {e}")
+                return {"filename": filename, "status": "failed", "error": str(e)}
+
+    tasks = [upload_with_semaphore(data, filename) for data, filename in file_data_pairs]
+    return await asyncio.gather(*tasks)
+```
+
+**Memory Management Anti-Patterns:**
+
+```python
+# ❌ REJECT: Loading entire dataset for serialization
+async def bad_large_dataset_upload(large_dataset: List[dict]):
+    # Loads entire dataset into memory
+    json_data = orjson.dumps(large_dataset)  # Could exceed memory limits
+    await upload_data(json_data)
+
+# ✅ REQUIRE: Streaming serialization for large datasets
+async def good_large_dataset_upload(large_dataset: List[dict], chunk_size: int = 1000):
+    """Stream large datasets to avoid memory issues."""
+
+    async def serialize_chunk(chunk: List[dict]) -> bytes:
+        return orjson.dumps(chunk, option=orjson.OPT_APPEND_NEWLINE)
+
+    # Process in chunks to manage memory
+    for i in range(0, len(large_dataset), chunk_size):
+        chunk = large_dataset[i:i + chunk_size]
+        serialized_chunk = await serialize_chunk(chunk)
+
+        await upload_chunk(
+            data=serialized_chunk,
+            chunk_index=i // chunk_size
+        )
+```
+
+## Educational Context for Output Reviews
+
+When reviewing output code, emphasize:
+
+1. **Data Integrity Impact**: "Incorrect upload path handling can cause data to be stored in wrong locations, making it inaccessible to downstream processes. This breaks data pipelines and can cause data loss."
+
+2. **Performance Impact**: "Sequential uploads create unnecessary bottlenecks. For enterprise datasets with multiple output files, parallelization can significantly reduce processing time and improve user experience."
+
+3. **Resource Impact**: "Poor memory management during serialization can cause out-of-memory errors with large datasets. Streaming and chunking are essential for enterprise-scale data output."
+
+4. **User Experience Impact**: "Output path errors are often discovered late in processing, causing wasted computation and frustrating delays. Proper validation and clear error messages improve reliability."
+
+5. **Scalability Impact**: "Output patterns that work for small datasets can fail at enterprise scale. Always design output processes to handle the largest expected dataset sizes efficiently."
+
+6. **Data Pipeline Impact**: "Output processing is the final step in data pipelines. Failures here can invalidate all upstream processing work. Robust error handling and verification are critical for pipeline reliability."
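Phases 4 and 5 of the guideline above call for retry logic on transient upload failures but stop at bullet points; the sketch below shows one way that could look. `upload_single_file` and the retry parameters are placeholders, not SDK APIs.

```python
# Minimal sketch of retry-with-backoff for transient upload failures;
# upload_single_file is a placeholder callable, not an SDK API.
import asyncio
import logging
import random
from typing import Awaitable, Callable

logger = logging.getLogger(__name__)


async def upload_with_retry(
    data: bytes,
    destination: str,
    upload_single_file: Callable[[bytes, str], Awaitable[dict]],  # placeholder uploader
    max_attempts: int = 3,
    base_delay_seconds: float = 1.0,
) -> dict:
    """Retry an upload with jittered exponential backoff on failure."""
    for attempt in range(1, max_attempts + 1):
        try:
            return await upload_single_file(data, destination)
        except Exception as e:
            if attempt == max_attempts:
                logger.error(f"Upload to {destination} failed after {attempt} attempts: {e}")
                raise
            delay = base_delay_seconds * (2 ** (attempt - 1)) + random.uniform(0, 0.5)
            logger.warning(
                f"Upload to {destination} failed (attempt {attempt}): {e}; retrying in {delay:.1f}s"
            )
            await asyncio.sleep(delay)
    raise RuntimeError("unreachable")  # the loop always returns or re-raises
```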
--- a/application_sdk/outputs/iceberg.py
+++ b/application_sdk/outputs/iceberg.py
@@ -29,6 +29,7 @@ class IcebergOutput(Output):
         mode: str = "append",
         total_record_count: int = 0,
         chunk_count: int = 0,
+        retain_local_copy: bool = False,
     ):
         """Initialize the Iceberg output class.
 
@@ -39,6 +40,8 @@ class IcebergOutput(Output):
             mode (str, optional): Write mode for the iceberg table. Defaults to "append".
             total_record_count (int, optional): Total record count written to the iceberg table. Defaults to 0.
             chunk_count (int, optional): Number of chunks written to the iceberg table. Defaults to 0.
+            retain_local_copy (bool, optional): Whether to retain the local copy of the files.
+                Defaults to False.
         """
         self.total_record_count = total_record_count
         self.chunk_count = chunk_count
@@ -47,6 +50,7 @@ class IcebergOutput(Output):
         self.iceberg_table = iceberg_table
         self.mode = mode
         self.metrics = get_metrics()
+        self.retain_local_copy = retain_local_copy
 
     async def write_dataframe(self, dataframe: "pd.DataFrame"):
         """