atlan-application-sdk 0.1.1rc39__py3-none-any.whl → 0.1.1rc41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. application_sdk/activities/.cursor/BUGBOT.md +424 -0
  2. application_sdk/activities/metadata_extraction/sql.py +400 -25
  3. application_sdk/application/__init__.py +2 -0
  4. application_sdk/application/metadata_extraction/sql.py +3 -0
  5. application_sdk/clients/.cursor/BUGBOT.md +280 -0
  6. application_sdk/clients/models.py +42 -0
  7. application_sdk/clients/sql.py +127 -87
  8. application_sdk/clients/temporal.py +3 -1
  9. application_sdk/common/.cursor/BUGBOT.md +316 -0
  10. application_sdk/common/aws_utils.py +259 -11
  11. application_sdk/common/utils.py +145 -9
  12. application_sdk/constants.py +8 -0
  13. application_sdk/decorators/.cursor/BUGBOT.md +279 -0
  14. application_sdk/handlers/__init__.py +8 -1
  15. application_sdk/handlers/sql.py +63 -22
  16. application_sdk/inputs/.cursor/BUGBOT.md +250 -0
  17. application_sdk/interceptors/.cursor/BUGBOT.md +320 -0
  18. application_sdk/interceptors/cleanup.py +171 -0
  19. application_sdk/interceptors/events.py +6 -6
  20. application_sdk/observability/decorators/observability_decorator.py +36 -22
  21. application_sdk/outputs/.cursor/BUGBOT.md +295 -0
  22. application_sdk/outputs/iceberg.py +4 -0
  23. application_sdk/outputs/json.py +6 -0
  24. application_sdk/outputs/parquet.py +13 -3
  25. application_sdk/server/.cursor/BUGBOT.md +442 -0
  26. application_sdk/server/fastapi/__init__.py +59 -3
  27. application_sdk/server/fastapi/models.py +27 -0
  28. application_sdk/services/objectstore.py +16 -3
  29. application_sdk/version.py +1 -1
  30. application_sdk/workflows/.cursor/BUGBOT.md +218 -0
  31. {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/METADATA +1 -1
  32. {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/RECORD +35 -24
  33. {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/WHEEL +0 -0
  34. {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/licenses/LICENSE +0 -0
  35. {atlan_application_sdk-0.1.1rc39.dist-info → atlan_application_sdk-0.1.1rc41.dist-info}/licenses/NOTICE +0 -0
application_sdk/interceptors/cleanup.py
@@ -0,0 +1,171 @@
+import os
+import shutil
+from datetime import timedelta
+from typing import Any, Dict, List, Optional, Type
+
+from pydantic import BaseModel
+from temporalio import activity, workflow
+from temporalio.common import RetryPolicy
+from temporalio.worker import (
+    ExecuteWorkflowInput,
+    Interceptor,
+    WorkflowInboundInterceptor,
+    WorkflowInterceptorClassInput,
+)
+
+from application_sdk.activities.common.utils import build_output_path
+from application_sdk.constants import CLEANUP_BASE_PATHS, TEMPORARY_PATH
+from application_sdk.observability.logger_adaptor import get_logger
+
+logger = get_logger(__name__)
+activity.logger = logger
+workflow.logger = logger
+
+
+class CleanupResult(BaseModel):
+    """Result model for cleanup operations.
+
+    Attributes:
+        path_results (Dict[str, bool]): Cleanup results for each path (True=success, False=failure)
+    """
+
+    path_results: Dict[str, bool]
+
+
+@activity.defn
+async def cleanup() -> CleanupResult:
+    """Clean up temporary artifacts and activity state for the current workflow.
+
+    Performs two types of cleanup:
+    1. File cleanup: Removes all contents from configured base paths or the default workflow directory
+    2. State cleanup: Clears activity state for the current workflow (includes resource cleanup)
+
+    Uses the CLEANUP_BASE_PATHS constant or defaults to the workflow-specific artifacts directory.
+
+    Returns:
+        CleanupResult: Structured cleanup results with per-path success status.
+    """
+    path_results: Dict[str, bool] = {}
+    base_paths: List[str] = [os.path.join(TEMPORARY_PATH, build_output_path())]
+
+    # Use configured paths or default to the workflow-specific artifacts directory
+    if CLEANUP_BASE_PATHS:
+        base_paths = CLEANUP_BASE_PATHS
+        logger.info(f"Using CLEANUP_BASE_PATHS: {base_paths} for cleanup")
+
+    logger.info(f"Cleaning up all contents from base paths: {base_paths}")
+
+    for base_path in base_paths:
+        try:
+            if os.path.exists(base_path):
+                if os.path.isdir(base_path):
+                    # Remove the entire directory and its contents
+                    shutil.rmtree(base_path)
+                    logger.info(f"Cleaned up all contents from: {base_path}")
+                    path_results[base_path] = True
+                else:
+                    logger.warning(f"Path is not a directory: {base_path}")
+                    path_results[base_path] = False
+            else:
+                logger.debug(f"Directory doesn't exist: {base_path}")
+                path_results[base_path] = True
+
+        except Exception as e:
+            logger.error(f"Unexpected error cleaning up {base_path}: {e}")
+            path_results[base_path] = False
+
+    return CleanupResult(
+        path_results=path_results,
+    )
+
+
+class CleanupWorkflowInboundInterceptor(WorkflowInboundInterceptor):
+    """Interceptor for workflow-level app artifacts cleanup.
+
+    This interceptor cleans up the entire app directory structure when the workflow
+    completes or fails, following the pattern: base_path/appname/workflow_id/run_id.
+    Supports multiple base paths for comprehensive cleanup.
+    """
+
+    async def execute_workflow(self, input: ExecuteWorkflowInput) -> Any:
+        """Execute a workflow with app artifacts cleanup.
+
+        Args:
+            input (ExecuteWorkflowInput): The workflow execution input
+
+        Returns:
+            Any: The result of the workflow execution
+
+        Raises:
+            Exception: Re-raises any exceptions from workflow execution
+        """
+        output = None
+        try:
+            output = await super().execute_workflow(input)
+        except Exception:
+            raise
+
+        finally:
+            # Always attempt cleanup regardless of workflow success/failure
+            try:
+                await workflow.execute_activity(
+                    cleanup,
+                    schedule_to_close_timeout=timedelta(minutes=5),
+                    retry_policy=RetryPolicy(
+                        maximum_attempts=3,
+                    ),
+                    summary="This activity is used to cleanup the local artifacts and the activity state after the workflow is completed.",
+                )
+
+                logger.info("Cleanup completed successfully")
+
+            except Exception as e:
+                logger.warning(f"Failed to cleanup artifacts: {e}")
+                # Don't re-raise - cleanup failures shouldn't fail the workflow
+
+        return output
+
+
+class CleanupInterceptor(Interceptor):
+    """Temporal interceptor for automatic app artifacts cleanup.
+
+    This interceptor provides cleanup capabilities for application artifacts
+    across multiple base paths following the pattern: base_path/appname/workflow_id/run_id.
+
+    Features:
+    - Automatic cleanup of app-specific artifact directories
+    - Cleanup on workflow completion or failure
+    - Supports multiple cleanup paths via the ATLAN_CLEANUP_BASE_PATHS env var
+    - Simple activity-based cleanup logic
+    - Comprehensive error handling and logging
+
+    Example:
+        >>> # Register the interceptor with a Temporal worker
+        >>> worker = Worker(
+        ...     client,
+        ...     task_queue="my-task-queue",
+        ...     workflows=[MyWorkflow],
+        ...     activities=[my_activity, cleanup],
+        ...     interceptors=[CleanupInterceptor()]
+        ... )
+
+    Environment Configuration:
+        >>> # Single path (default)
+        >>> ATLAN_CLEANUP_BASE_PATHS="./local/tmp/artifacts/apps"
+
+        >>> # Multiple paths (comma-separated)
+        >>> ATLAN_CLEANUP_BASE_PATHS="./local/tmp/artifacts/apps,/storage/temp/apps,/shared/cleanup/apps"
+    """
+
+    def workflow_interceptor_class(
+        self, input: WorkflowInterceptorClassInput
+    ) -> Optional[Type[WorkflowInboundInterceptor]]:
+        """Get the workflow interceptor class for cleanup.
+
+        Args:
+            input (WorkflowInterceptorClassInput): The interceptor input
+
+        Returns:
+            Optional[Type[WorkflowInboundInterceptor]]: The workflow interceptor class
+        """
+        return CleanupWorkflowInboundInterceptor
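
For orientation, the `CleanupInterceptor` docstring above already shows the intended wiring. Below is a minimal, self-contained sketch of that registration; the task queue name, the example workflow, and the local server address are placeholders, not SDK defaults:

```python
# Sketch of registering the cleanup activity and interceptor with a Temporal worker.
# ExampleWorkflow, the task queue name, and the server address are illustrative only.
import asyncio

from temporalio import workflow
from temporalio.client import Client
from temporalio.worker import Worker

from application_sdk.interceptors.cleanup import CleanupInterceptor, cleanup


@workflow.defn
class ExampleWorkflow:
    @workflow.run
    async def run(self) -> str:
        return "done"


async def main() -> None:
    client = await Client.connect("localhost:7233")  # assumes a local Temporal server
    worker = Worker(
        client,
        task_queue="example-task-queue",
        workflows=[ExampleWorkflow],
        activities=[cleanup],                 # the cleanup activity must be registered
        interceptors=[CleanupInterceptor()],  # triggers cleanup when workflows finish
    )
    await worker.run()


if __name__ == "__main__":
    asyncio.run(main())
```

With this in place, the interceptor schedules the `cleanup` activity from a `finally` block, so local artifacts are removed whether the workflow succeeds or fails.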
application_sdk/interceptors/events.py
@@ -23,6 +23,8 @@ from application_sdk.observability.logger_adaptor import get_logger
 from application_sdk.services.eventstore import EventStore
 
 logger = get_logger(__name__)
+activity.logger = logger
+workflow.logger = logger
 
 TEMPORAL_NOT_FOUND_FAILURE = (
     "type.googleapis.com/temporal.api.errordetails.v1.NotFoundFailure"
@@ -41,9 +43,9 @@ async def publish_event(event_data: dict) -> None:
     try:
         event = Event(**event_data)
         await EventStore.publish_event(event)
-        activity.logger.info(f"Published event: {event_data.get('event_name','')}")
+        logger.info(f"Published event: {event_data.get('event_name','')}")
     except Exception as e:
-        activity.logger.error(f"Failed to publish event: {e}")
+        logger.error(f"Failed to publish event: {e}")
         raise
 
 
@@ -123,7 +125,7 @@ class EventWorkflowInboundInterceptor(WorkflowInboundInterceptor):
                 retry_policy=RetryPolicy(maximum_attempts=3),
             )
         except Exception as e:
-            workflow.logger.warning(f"Failed to publish workflow start event: {e}")
+            logger.warning(f"Failed to publish workflow start event: {e}")
            # Don't fail the workflow if event publishing fails
 
         output = None
@@ -152,9 +154,7 @@ class EventWorkflowInboundInterceptor(WorkflowInboundInterceptor):
                 retry_policy=RetryPolicy(maximum_attempts=3),
             )
         except Exception as publish_error:
-            workflow.logger.warning(
-                f"Failed to publish workflow end event: {publish_error}"
-            )
+            logger.warning(f"Failed to publish workflow end event: {publish_error}")
 
         return output
 
application_sdk/observability/decorators/observability_decorator.py
@@ -4,7 +4,9 @@ import time
 import uuid
 from typing import Any, Callable, TypeVar, cast
 
-from application_sdk.observability.metrics_adaptor import MetricType
+from application_sdk.observability.logger_adaptor import get_logger
+from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
+from application_sdk.observability.traces_adaptor import get_traces
 
 T = TypeVar("T")
 
@@ -136,9 +138,9 @@ def _record_error_observability(
 
 
 def observability(
-    logger: Any,
-    metrics: Any,
-    traces: Any,
+    logger: Any = None,
+    metrics: Any = None,
+    traces: Any = None,
 ) -> Callable[[Callable[..., T]], Callable[..., T]]:
     """Decorator for adding observability to functions.
 
@@ -146,16 +148,23 @@ def observability(
     It handles both synchronous and asynchronous functions.
 
     Args:
-        logger: Logger instance for operation logging
-        metrics: Metrics adapter for recording operation metrics
-        traces: Traces adapter for recording operation traces
+        logger: Logger instance for operation logging. If None, auto-initializes using get_logger()
+        metrics: Metrics adapter for recording operation metrics. If None, auto-initializes using get_metrics()
+        traces: Traces adapter for recording operation traces. If None, auto-initializes using get_traces()
 
     Returns:
         Callable: Decorated function with observability
 
     Example:
         ```python
+        # With explicit observability components
         @observability(logger, metrics, traces)
+        async def my_function():
+            # Function implementation
+            pass
+
+        # With auto-initialization (recommended)
+        @observability()
         async def my_function():
             # Function implementation
             pass
@@ -163,6 +172,11 @@ def observability(
     """
 
     def decorator(func: Callable[..., T]) -> Callable[..., T]:
+        # Auto-initialize observability components if not provided
+        actual_logger = logger or get_logger(func.__module__)
+        actual_metrics = metrics or get_metrics()
+        actual_traces = traces or get_traces()
+
         # Get function metadata
         func_name = func.__name__
         func_doc = func.__doc__ or f"Executing {func_name}"
@@ -170,7 +184,7 @@ def observability(
         is_async = inspect.iscoroutinefunction(func)
 
         # Debug logging for function decoration
-        logger.debug(f"Decorating function {func_name} (async={is_async})")
+        actual_logger.debug(f"Decorating function {func_name} (async={is_async})")
 
         @functools.wraps(func)
         async def async_wrapper(*args: Any, **kwargs: Any) -> T:
@@ -181,16 +195,16 @@ def observability(
 
             try:
                 # Log start of operation
-                logger.debug(f"Starting async function {func_name}")
+                actual_logger.debug(f"Starting async function {func_name}")
 
                 # Execute the function
                 result = await func(*args, **kwargs)
 
                 # Record success observability
                 _record_success_observability(
-                    logger,
-                    metrics,
-                    traces,
+                    actual_logger,
+                    actual_metrics,
+                    actual_traces,
                     func_name,
                     func_doc,
                     func_module,
@@ -204,9 +218,9 @@ def observability(
             except Exception as e:
                 # Record error observability
                 _record_error_observability(
-                    logger,
-                    metrics,
-                    traces,
+                    actual_logger,
+                    actual_metrics,
+                    actual_traces,
                     func_name,
                     func_doc,
                     func_module,
@@ -226,16 +240,16 @@ def observability(
 
             try:
                 # Log start of operation
-                logger.debug(f"Starting sync function {func_name}")
+                actual_logger.debug(f"Starting sync function {func_name}")
 
                 # Execute the function
                 result = func(*args, **kwargs)
 
                 # Record success observability
                 _record_success_observability(
-                    logger,
-                    metrics,
-                    traces,
+                    actual_logger,
+                    actual_metrics,
+                    actual_traces,
                     func_name,
                     func_doc,
                     func_module,
@@ -249,9 +263,9 @@ def observability(
             except Exception as e:
                 # Record error observability
                 _record_error_observability(
-                    logger,
-                    metrics,
-                    traces,
+                    actual_logger,
+                    actual_metrics,
+                    actual_traces,
                     func_name,
                     func_doc,
                     func_module,
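
Net effect of these changes: the decorator's observability components are now optional, so callers can write `@observability()` with no arguments. A hedged usage sketch (the decorated function below is illustrative, not an SDK API):

```python
# Sketch of using the decorator with auto-initialised logger/metrics/traces.
# fetch_table_count is an illustrative function, not part of the SDK.
from application_sdk.observability.decorators.observability_decorator import observability


@observability()  # falls back to get_logger(func.__module__), get_metrics(), get_traces()
async def fetch_table_count(connection_id: str) -> int:
    """Pretend to count tables for a connection."""
    return 42
```

Explicitly passed logger, metrics, or traces arguments still take precedence over the auto-initialised instances.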
application_sdk/outputs/.cursor/BUGBOT.md
@@ -0,0 +1,295 @@
+# Output Code Review Guidelines - Data Output Processing
+
+## Context-Specific Patterns
+
+This directory contains output processing implementations for various data formats (JSON, Parquet, Iceberg). Output processors must handle data uploads efficiently while maintaining data integrity and correct destination paths.
+
+### Phase 1: Critical Output Safety Issues
+
+**Object Store Path Management:**
+
+- **Correct destination paths**: Upload paths must respect user-configured output prefixes
+- **Path construction accuracy**: Object store keys must be calculated correctly, not hardcoded
+- **User prefix preservation**: Respect user-provided output directories and naming conventions
+- **Path validation**: Ensure upload paths don't conflict with existing data
+
+**Data Integrity and Security:**
+
+- All output data must be validated before upload
+- File permissions and access controls must be properly set
+- Data serialization must be consistent and recoverable
+- Prevent overwriting critical data without confirmation
+- Maintain data lineage information in output metadata
+
+```python
+# ✅ DO: Proper object store upload path handling
+class JsonOutput:
+    async def upload_to_object_store(
+        self,
+        data: List[dict],
+        output_prefix: str,  # User-provided output location
+        filename: str
+    ) -> dict:
+        """Upload data with correct path handling."""
+
+        # Construct full object store path respecting user's output prefix
+        object_store_key = os.path.join(output_prefix, filename)
+
+        # Serialize data
+        json_data = orjson.dumps(data, option=orjson.OPT_APPEND_NEWLINE)
+
+        # Upload to correct location
+        result = await self.object_store.upload_file(
+            data=json_data,
+            destination=object_store_key  # Respect user's intended location
+        )
+
+        return result
+
+# ❌ REJECT: Incorrect path handling
+class BadJsonOutput:
+    async def upload_to_object_store(self, data: List[dict], filename: str):
+        # Wrong: hardcoded or derived path, ignoring user configuration
+        object_store_key = get_object_store_prefix(f"/tmp/{filename}")  # Ignores output_prefix!
+
+        result = await self.object_store.upload_file(
+            data=orjson.dumps(data),
+            destination=object_store_key  # Wrong destination!
+        )
+        return result
+```
+
+### Phase 2: Output Architecture Patterns
+
+**Performance Optimization Requirements:**
+
+- **Parallelization opportunities**: Flag sequential upload operations that could be parallelized
+- **Batch processing**: Group related uploads to reduce overhead
+- **Streaming uploads**: Use streaming for large datasets instead of loading into memory
+- **Connection optimization**: Reuse object store connections across operations
+
+**Resource Management:**
+
+- Use proper connection pooling for object store operations
+- Implement timeout handling for upload operations
+- Clean up temporary files after upload
+- Handle partial upload failures gracefully
+- Monitor memory usage during large data serialization
+
+```python
+# ✅ DO: Parallel upload processing
+async def upload_multiple_datasets_parallel(
+    self,
+    datasets: List[Tuple[List[dict], str]],  # (data, filename) pairs
+    output_prefix: str
+) -> List[dict]:
+    """Upload multiple datasets in parallel for better performance."""
+
+    async def upload_single_dataset(data: List[dict], filename: str) -> dict:
+        """Upload a single dataset with error handling."""
+        try:
+            object_store_key = os.path.join(output_prefix, filename)
+            serialized_data = orjson.dumps(data, option=orjson.OPT_APPEND_NEWLINE)
+
+            return await self.object_store.upload_file(
+                data=serialized_data,
+                destination=object_store_key
+            )
+        except Exception as e:
+            logger.error(f"Failed to upload {filename}: {e}")
+            raise
+
+    # Parallel processing with controlled concurrency
+    semaphore = asyncio.Semaphore(5)  # Limit concurrent uploads
+
+    async def upload_with_semaphore(data: List[dict], filename: str) -> dict:
+        async with semaphore:
+            return await upload_single_dataset(data, filename)
+
+    tasks = [upload_with_semaphore(data, filename) for data, filename in datasets]
+    return await asyncio.gather(*tasks)
+
+# ❌ REJECT: Sequential upload processing
+async def upload_multiple_datasets_sequential(
+    self,
+    datasets: List[Tuple[List[dict], str]],
+    output_prefix: str
+) -> List[dict]:
+    """Sequential uploads - should be flagged for parallelization."""
+    results = []
+    for data, filename in datasets:  # FLAG: Could be parallelized
+        object_store_key = os.path.join(output_prefix, filename)
+        result = await self.object_store.upload_file(data, object_store_key)
+        results.append(result)
+    return results
+```
+
+### Phase 3: Output Testing Requirements
+
+**Data Output Testing:**
+
+- Test with various data formats and sizes
+- Test serialization and deserialization consistency
+- Test partial upload scenarios and recovery
+- Mock object store operations in unit tests
+- Include integration tests with real object store
+- Test data corruption detection and prevention
+
+**Performance Testing:**
+
+- Include tests for large dataset uploads
+- Test memory usage during serialization
+- Test concurrent upload operations
+- Verify timeout handling works correctly
+- Test connection pool behavior under load
+
+### Phase 4: Performance and Scalability
+
+**Data Upload Efficiency:**
+
+- Use streaming uploads for large datasets
+- Implement proper chunking for oversized data
+- Use compression for large text-based outputs
+- Monitor upload progress and provide feedback
+- Optimize serialization performance (use orjson over json)
+
+**Object Store Optimization:**
+
+- Use connection pooling for object store clients
+- Implement proper retry logic for upload failures
+- Use parallel uploads where appropriate
+- Monitor upload metrics and error rates
+- Handle bandwidth limitations gracefully
+
+### Phase 5: Output Maintainability
+
+**Error Handling and Recovery:**
+
+- Implement comprehensive error handling for all upload operations
+- Provide meaningful error messages with upload context
+- Handle partial upload failures gracefully
+- Implement proper retry logic for transient failures
+- Log all upload operations with destination information
+
+**Configuration Management:**
+
+- Externalize all output-related configuration
+- Support different output destinations and formats
+- Validate output configuration before processing
+- Document all supported output parameters
+- Handle environment-specific output requirements
+
+---
+
+## Output-Specific Anti-Patterns
+
+**Always Reject:**
+
+- **Path derivation errors**: Deriving object store paths from local temporary paths
+- **Sequential uploads**: Uploading multiple files sequentially when parallel uploads are possible
+- **Memory inefficiency**: Loading entire datasets into memory for serialization
+- **Missing upload verification**: Not verifying successful uploads
+- **Poor error recovery**: Not handling partial upload failures gracefully
+- **Resource leaks**: Not cleaning up temporary files or connections
+
+**Object Store Upload Anti-Patterns:**
+
+```python
+# ❌ REJECT: Incorrect upload path handling
+class BadOutputProcessor:
+    async def upload_results(self, results: List[dict]):
+        # Wrong: derives upload path from temporary local path
+        local_temp_file = "/tmp/results.json"
+        upload_key = get_object_store_prefix(local_temp_file)  # Incorrect!
+
+        await self.object_store.upload_file(results, upload_key)
+
+# ✅ REQUIRE: Correct upload path handling
+class GoodOutputProcessor:
+    async def upload_results(
+        self,
+        results: List[dict],
+        output_prefix: str,  # User-specified destination
+        filename: str = "results.json"
+    ):
+        # Use actual user-configured output location
+        upload_key = os.path.join(output_prefix, filename)
+
+        await self.object_store.upload_file(
+            data=orjson.dumps(results),
+            destination=upload_key  # Correct destination
+        )
+```
+
+**Performance Anti-Patterns:**
+
+```python
+# ❌ REJECT: Sequential upload processing
+async def upload_multiple_files_sequential(file_data_pairs: List[Tuple]):
+    results = []
+    for data, filename in file_data_pairs:  # Should be parallelized
+        result = await upload_single_file(data, filename)
+        results.append(result)
+    return results
+
+# ✅ REQUIRE: Parallel upload processing with proper error handling
+async def upload_multiple_files_parallel(
+    file_data_pairs: List[Tuple],
+    max_concurrency: int = 5
+) -> List[dict]:
+    semaphore = asyncio.Semaphore(max_concurrency)
+
+    async def upload_with_semaphore(data, filename):
+        async with semaphore:
+            try:
+                return await upload_single_file(data, filename)
+            except Exception as e:
+                logger.error(f"Upload failed for {filename}: {e}")
+                return {"filename": filename, "status": "failed", "error": str(e)}
+
+    tasks = [upload_with_semaphore(data, filename) for data, filename in file_data_pairs]
+    return await asyncio.gather(*tasks)
+```
+
+**Memory Management Anti-Patterns:**
+
+```python
+# ❌ REJECT: Loading entire dataset for serialization
+async def bad_large_dataset_upload(large_dataset: List[dict]):
+    # Loads entire dataset into memory
+    json_data = orjson.dumps(large_dataset)  # Could exceed memory limits
+    await upload_data(json_data)
+
+# ✅ REQUIRE: Streaming serialization for large datasets
+async def good_large_dataset_upload(large_dataset: List[dict], chunk_size: int = 1000):
+    """Stream large datasets to avoid memory issues."""
+
+    async def serialize_chunk(chunk: List[dict]) -> bytes:
+        return orjson.dumps(chunk, option=orjson.OPT_APPEND_NEWLINE)
+
+    # Process in chunks to manage memory
+    for i in range(0, len(large_dataset), chunk_size):
+        chunk = large_dataset[i:i + chunk_size]
+        serialized_chunk = await serialize_chunk(chunk)
+
+        await upload_chunk(
+            data=serialized_chunk,
+            chunk_index=i // chunk_size
+        )
+```
+
+## Educational Context for Output Reviews
+
+When reviewing output code, emphasize:
+
+1. **Data Integrity Impact**: "Incorrect upload path handling can cause data to be stored in wrong locations, making it inaccessible to downstream processes. This breaks data pipelines and can cause data loss."
+
+2. **Performance Impact**: "Sequential uploads create unnecessary bottlenecks. For enterprise datasets with multiple output files, parallelization can significantly reduce processing time and improve user experience."
+
+3. **Resource Impact**: "Poor memory management during serialization can cause out-of-memory errors with large datasets. Streaming and chunking are essential for enterprise-scale data output."
+
+4. **User Experience Impact**: "Output path errors are often discovered late in processing, causing wasted computation and frustrating delays. Proper validation and clear error messages improve reliability."
+
+5. **Scalability Impact**: "Output patterns that work for small datasets can fail at enterprise scale. Always design output processes to handle the largest expected dataset sizes efficiently."
+
+6. **Data Pipeline Impact**: "Output processing is the final step in data pipelines. Failures here can invalidate all upstream processing work. Robust error handling and verification are critical for pipeline reliability."
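
Phase 3 above lists testing requirements without an accompanying snippet. Here is a hedged sketch of one such unit test, assuming pytest with pytest-asyncio and a mocked object store; `GoodOutputProcessor` mirrors the guideline example above and is not an SDK class:

```python
# Sketch of a unit test for upload path handling (pytest + pytest-asyncio assumed).
# GoodOutputProcessor follows the guideline example; it is not part of the SDK.
import os
from unittest.mock import AsyncMock

import orjson
import pytest


class GoodOutputProcessor:
    def __init__(self, object_store):
        self.object_store = object_store

    async def upload_results(self, results, output_prefix, filename="results.json"):
        # Build the destination key from the user-supplied prefix
        upload_key = os.path.join(output_prefix, filename)
        await self.object_store.upload_file(
            data=orjson.dumps(results), destination=upload_key
        )


@pytest.mark.asyncio
async def test_upload_respects_user_output_prefix():
    store = AsyncMock()
    processor = GoodOutputProcessor(object_store=store)

    await processor.upload_results([{"id": 1}], output_prefix="datasets/run-42")

    store.upload_file.assert_awaited_once()
    kwargs = store.upload_file.await_args.kwargs
    assert kwargs["destination"] == os.path.join("datasets", "run-42", "results.json")
```

The assertion pins the destination to the user-supplied prefix, which is exactly the failure mode the Phase 1 guidance warns about.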
application_sdk/outputs/iceberg.py
@@ -29,6 +29,7 @@ class IcebergOutput(Output):
         mode: str = "append",
         total_record_count: int = 0,
         chunk_count: int = 0,
+        retain_local_copy: bool = False,
     ):
         """Initialize the Iceberg output class.
 
@@ -39,6 +40,8 @@ class IcebergOutput(Output):
             mode (str, optional): Write mode for the iceberg table. Defaults to "append".
             total_record_count (int, optional): Total record count written to the iceberg table. Defaults to 0.
             chunk_count (int, optional): Number of chunks written to the iceberg table. Defaults to 0.
+            retain_local_copy (bool, optional): Whether to retain the local copy of the files.
+                Defaults to False.
         """
         self.total_record_count = total_record_count
         self.chunk_count = chunk_count
@@ -47,6 +50,7 @@ class IcebergOutput(Output):
         self.iceberg_table = iceberg_table
         self.mode = mode
         self.metrics = get_metrics()
+        self.retain_local_copy = retain_local_copy
 
     async def write_dataframe(self, dataframe: "pd.DataFrame"):
         """