atlan-application-sdk 0.1.1rc38__py3-none-any.whl → 0.1.1rc40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/.cursor/BUGBOT.md +424 -0
- application_sdk/clients/.cursor/BUGBOT.md +280 -0
- application_sdk/clients/sql.py +110 -74
- application_sdk/clients/temporal.py +4 -2
- application_sdk/common/.cursor/BUGBOT.md +316 -0
- application_sdk/constants.py +8 -0
- application_sdk/decorators/.cursor/BUGBOT.md +279 -0
- application_sdk/inputs/.cursor/BUGBOT.md +250 -0
- application_sdk/interceptors/.cursor/BUGBOT.md +320 -0
- application_sdk/interceptors/cleanup.py +171 -0
- application_sdk/interceptors/events.py +6 -6
- application_sdk/outputs/.cursor/BUGBOT.md +295 -0
- application_sdk/outputs/iceberg.py +4 -0
- application_sdk/outputs/json.py +6 -0
- application_sdk/outputs/parquet.py +89 -34
- application_sdk/server/.cursor/BUGBOT.md +442 -0
- application_sdk/services/objectstore.py +98 -20
- application_sdk/version.py +1 -1
- application_sdk/workflows/.cursor/BUGBOT.md +218 -0
- {atlan_application_sdk-0.1.1rc38.dist-info → atlan_application_sdk-0.1.1rc40.dist-info}/METADATA +1 -1
- {atlan_application_sdk-0.1.1rc38.dist-info → atlan_application_sdk-0.1.1rc40.dist-info}/RECORD +24 -14
- {atlan_application_sdk-0.1.1rc38.dist-info → atlan_application_sdk-0.1.1rc40.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-0.1.1rc38.dist-info → atlan_application_sdk-0.1.1rc40.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-0.1.1rc38.dist-info → atlan_application_sdk-0.1.1rc40.dist-info}/licenses/NOTICE +0 -0
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
# Decorator Code Review Guidelines - Centralized Function Decorators
|
|
2
|
+
|
|
3
|
+
## Context-Specific Patterns
|
|
4
|
+
|
|
5
|
+
This directory contains all decorator implementations for the Application SDK. Decorators must be centralized here to avoid scattered functionality and ensure consistent patterns.
|
|
6
|
+
|
|
7
|
+
### Phase 1: Critical Decorator Safety Issues
|
|
8
|
+
|
|
9
|
+
**Decorator Centralization:**
|
|
10
|
+
|
|
11
|
+
- **ALL decorators must be in this directory**: No decorators should exist in other modules (lock/, observability/, etc.)
|
|
12
|
+
- **Consolidate scattered decorators**: If decorators are found elsewhere, they must be moved here
|
|
13
|
+
- **Single responsibility per file**: Each decorator type should have its own file (locks.py, observability_decorator.py)
|
|
14
|
+
- **Proper imports**: Other modules should import decorators from here, not define their own
|
|
15
|
+
|
|
16
|
+
**Type Safety and Function Signatures:**
|
|
17
|
+
|
|
18
|
+
- All decorators must preserve function signatures and type hints
|
|
19
|
+
- Use `functools.wraps` to maintain function metadata
|
|
20
|
+
- Generic decorators must use proper type annotations
|
|
21
|
+
- Return types must match the original function's return type
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
# ✅ DO: Proper decorator type safety
|
|
25
|
+
from typing import Callable, Any, TypeVar, ParamSpec
|
|
26
|
+
from functools import wraps
|
|
27
|
+
|
|
28
|
+
P = ParamSpec('P')
|
|
29
|
+
T = TypeVar('T')
|
|
30
|
+
|
|
31
|
+
def my_decorator(func: Callable[P, T]) -> Callable[P, T]:
|
|
32
|
+
@wraps(func)
|
|
33
|
+
def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
|
|
34
|
+
# Decorator logic
|
|
35
|
+
return func(*args, **kwargs)
|
|
36
|
+
return wrapper
|
|
37
|
+
|
|
38
|
+
# ❌ NEVER: Poor type annotations
|
|
39
|
+
def bad_decorator(func): # No type hints
|
|
40
|
+
def wrapper(*args, **kwargs): # No type preservation
|
|
41
|
+
return func(*args, **kwargs)
|
|
42
|
+
return wrapper # Missing @wraps
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Phase 2: Decorator Architecture Patterns
|
|
46
|
+
|
|
47
|
+
**Proper Decorator Structure:**
|
|
48
|
+
|
|
49
|
+
- **Parameterized decorators**: Support both `@decorator` and `@decorator(param=value)` usage patterns
|
|
50
|
+
- **Error handling**: Decorators must not swallow exceptions unless explicitly designed to do so
|
|
51
|
+
- **Resource cleanup**: Decorators that acquire resources must ensure cleanup in finally blocks
|
|
52
|
+
- **Context preservation**: Maintain original function context and metadata
|
|
53
|
+
|
|
54
|
+
**Configuration Management:**
|
|
55
|
+
|
|
56
|
+
- **Centralized constants**: All decorator configuration should use constants from this directory
|
|
57
|
+
- **Shared configuration**: Related decorators should share configuration patterns
|
|
58
|
+
- **Environment awareness**: Decorators should work in both development and production environments
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
# ✅ DO: Proper decorator configuration
|
|
62
|
+
from application_sdk.constants import DEFAULT_LOCK_TTL, DEFAULT_MAX_LOCKS
|
|
63
|
+
|
|
64
|
+
# Shared configuration for lock decorators
|
|
65
|
+
from application_sdk.constants import DISTRIBUTED_LOCK_CONFIG_KEY as LOCK_CONFIG_KEY # Centralized key, defined once in constants.py
|
|
66
|
+
|
|
67
|
+
def distributed_lock(
|
|
68
|
+
lock_name: Optional[str] = None,
|
|
69
|
+
max_locks: int = DEFAULT_MAX_LOCKS,
|
|
70
|
+
ttl_seconds: int = DEFAULT_LOCK_TTL
|
|
71
|
+
):
|
|
72
|
+
"""Distributed lock decorator with proper defaults and configuration."""
|
|
73
|
+
|
|
74
|
+
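def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: # async wrapper: func is awaited, so it must return Awaitable[T] (from typing)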
|
|
75
|
+
@wraps(func)
|
|
76
|
+
async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
|
|
77
|
+
# Use centralized configuration
|
|
78
|
+
actual_lock_name = lock_name or f"{func.__module__}.{func.__name__}"
|
|
79
|
+
|
|
80
|
+
# Store config in activity context using shared key
|
|
81
|
+
activity_info = activity.info()
|
|
82
|
+
activity_info.memo[LOCK_CONFIG_KEY] = {
|
|
83
|
+
"lock_name": actual_lock_name,
|
|
84
|
+
"max_locks": max_locks,
|
|
85
|
+
"ttl_seconds": ttl_seconds
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
return await func(*args, **kwargs)
|
|
89
|
+
return wrapper
|
|
90
|
+
return decorator
|
|
91
|
+
|
|
92
|
+
# ❌ REJECT: Scattered constants and configuration
|
|
93
|
+
def bad_lock_decorator(max_locks=10): # Hardcoded default
|
|
94
|
+
LOCK_KEY = "my_lock_key" # Should be centralized
|
|
95
|
+
# Configuration scattered across files
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Phase 3: Decorator Testing Requirements
|
|
99
|
+
|
|
100
|
+
**Comprehensive Decorator Testing:**
|
|
101
|
+
|
|
102
|
+
- **Function preservation**: Test that decorators preserve original function behavior
|
|
103
|
+
- **Type safety**: Verify type hints are maintained after decoration
|
|
104
|
+
- **Error propagation**: Ensure exceptions are properly handled and propagated
|
|
105
|
+
- **Resource cleanup**: Test cleanup behavior in both success and failure cases
|
|
106
|
+
- **Configuration validation**: Test all configuration parameters and edge cases
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
# ✅ DO: Comprehensive decorator testing
|
|
110
|
+
@pytest.mark.asyncio
|
|
111
|
+
class TestDistributedLockDecorator:
|
|
112
|
+
"""Test suite for distributed lock decorator."""
|
|
113
|
+
|
|
114
|
+
async def test_function_signature_preservation(self):
|
|
115
|
+
"""Test that decorator preserves function signature and types."""
|
|
116
|
+
|
|
117
|
+
@distributed_lock("test_lock")
|
|
118
|
+
async def test_function(param1: str, param2: int = 10) -> dict:
|
|
119
|
+
"""Test function docstring."""
|
|
120
|
+
return {"param1": param1, "param2": param2}
|
|
121
|
+
|
|
122
|
+
# Verify signature preservation
|
|
123
|
+
assert test_function.__name__ == "test_function"
|
|
124
|
+
assert test_function.__doc__ == "Test function docstring."
|
|
125
|
+
|
|
126
|
+
# Verify function still works
|
|
127
|
+
result = await test_function("test", 20)
|
|
128
|
+
assert result == {"param1": "test", "param2": 20}
|
|
129
|
+
|
|
130
|
+
async def test_error_propagation(self):
|
|
131
|
+
"""Test that decorator properly propagates exceptions."""
|
|
132
|
+
|
|
133
|
+
@distributed_lock("error_lock")
|
|
134
|
+
async def failing_function():
|
|
135
|
+
raise ValueError("Test error")
|
|
136
|
+
|
|
137
|
+
# Verify exception is propagated, not swallowed
|
|
138
|
+
with pytest.raises(ValueError, match="Test error"):
|
|
139
|
+
await failing_function()
|
|
140
|
+
|
|
141
|
+
async def test_resource_cleanup_on_failure(self, mock_lock_manager):
|
|
142
|
+
"""Test that resources are cleaned up even when function fails."""
|
|
143
|
+
|
|
144
|
+
@distributed_lock("cleanup_test")
|
|
145
|
+
async def failing_function():
|
|
146
|
+
raise RuntimeError("Simulated failure")
|
|
147
|
+
|
|
148
|
+
mock_lock_manager.acquire_lock.return_value.__aenter__ = AsyncMock()
|
|
149
|
+
mock_lock_manager.acquire_lock.return_value.__aexit__ = AsyncMock()
|
|
150
|
+
|
|
151
|
+
with pytest.raises(RuntimeError):
|
|
152
|
+
await failing_function()
|
|
153
|
+
|
|
154
|
+
# Verify cleanup was called
|
|
155
|
+
mock_lock_manager.acquire_lock.return_value.__aexit__.assert_called_once()
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Phase 4: Performance and Integration
|
|
159
|
+
|
|
160
|
+
**Decorator Performance:**
|
|
161
|
+
|
|
162
|
+
- **Minimal overhead**: Decorators should add minimal performance overhead
|
|
163
|
+
- **Async compatibility**: All decorators must work correctly with async functions
|
|
164
|
+
- **Context manager efficiency**: Use efficient context managers for resource management
|
|
165
|
+
- **Caching**: Cache expensive decorator setup operations where appropriate
|
|
166
|
+
|
|
167
|
+
**Integration Patterns:**
|
|
168
|
+
|
|
169
|
+
- **Temporal integration**: Decorators must work correctly with Temporal activities and workflows
|
|
170
|
+
- **Observability integration**: Integrate with logging, metrics, and tracing systems
|
|
171
|
+
- **Error handling integration**: Work correctly with the SDK's error handling patterns
|
|
172
|
+
|
|
173
|
+
### Phase 5: Decorator Maintainability
|
|
174
|
+
|
|
175
|
+
**Code Organization:**
|
|
176
|
+
|
|
177
|
+
- **One decorator type per file**: Keep related decorators together (all lock decorators in locks.py)
|
|
178
|
+
- **Clear naming**: Decorator files should clearly indicate their purpose
|
|
179
|
+
- **Consistent patterns**: All decorators should follow the same structural patterns
|
|
180
|
+
- **Documentation**: Each decorator must have comprehensive docstrings with usage examples
|
|
181
|
+
|
|
182
|
+
**Backwards Compatibility:**
|
|
183
|
+
|
|
184
|
+
- **API stability**: Decorator APIs should be stable across versions
|
|
185
|
+
- **Graceful deprecation**: Deprecated decorators should include migration guidance
|
|
186
|
+
- **Version compatibility**: Support existing usage patterns when adding new features
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
|
|
190
|
+
## Decorator-Specific Anti-Patterns
|
|
191
|
+
|
|
192
|
+
**Always Reject:**
|
|
193
|
+
|
|
194
|
+
- **Scattered decorators**: Decorators defined outside this directory
|
|
195
|
+
- **Missing type safety**: Decorators without proper type annotations
|
|
196
|
+
- **Resource leaks**: Decorators that don't clean up resources properly
|
|
197
|
+
- **Exception swallowing**: Decorators that hide exceptions unintentionally
|
|
198
|
+
- **Poor configuration**: Hardcoded values that should be configurable
|
|
199
|
+
- **No function preservation**: Decorators that don't preserve original function metadata
|
|
200
|
+
|
|
201
|
+
**Centralization Anti-Patterns:**
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
# ❌ REJECT: Decorators in wrong locations
|
|
205
|
+
# Found in application_sdk/lock/__init__.py
|
|
206
|
+
def needs_lock(max_locks=10):
|
|
207
|
+
"""Should be in decorators/locks.py instead"""
|
|
208
|
+
|
|
209
|
+
# Found in application_sdk/observability/some_module.py
|
|
210
|
+
def trace_activity(func):
|
|
211
|
+
"""Should be in decorators/observability_decorator.py"""
|
|
212
|
+
|
|
213
|
+
# ✅ REQUIRE: Centralized decorators
|
|
214
|
+
# In application_sdk/decorators/locks.py
|
|
215
|
+
def needs_lock(max_locks: int = DEFAULT_MAX_LOCKS):
|
|
216
|
+
"""Properly located distributed lock decorator"""
|
|
217
|
+
|
|
218
|
+
# In application_sdk/decorators/observability_decorator.py
|
|
219
|
+
def observability(logger=None, metrics=None, traces=None):
|
|
220
|
+
"""Properly located observability decorator"""
|
|
221
|
+
```
|
|
222
|
+
|
|
223
|
+
**Type Safety Anti-Patterns:**
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
# ❌ REJECT: Poor type safety
|
|
227
|
+
def bad_decorator(func): # No type annotations
|
|
228
|
+
def wrapper(*args, **kwargs): # No parameter specifications
|
|
229
|
+
return func(*args, **kwargs)
|
|
230
|
+
return wrapper # Missing @wraps, no return type
|
|
231
|
+
|
|
232
|
+
# ✅ REQUIRE: Proper type safety
|
|
233
|
+
from typing import Callable, TypeVar, ParamSpec
|
|
234
|
+
from functools import wraps
|
|
235
|
+
|
|
236
|
+
P = ParamSpec('P')
|
|
237
|
+
T = TypeVar('T')
|
|
238
|
+
|
|
239
|
+
def good_decorator(func: Callable[P, T]) -> Callable[P, T]:
|
|
240
|
+
@wraps(func)
|
|
241
|
+
def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
|
|
242
|
+
return func(*args, **kwargs)
|
|
243
|
+
return wrapper
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
**Configuration Anti-Patterns:**
|
|
247
|
+
|
|
248
|
+
```python
|
|
249
|
+
# ❌ REJECT: Scattered configuration
|
|
250
|
+
# Different files using different keys for same concept
|
|
251
|
+
LOCK_KEY_1 = "lock_config" # In locks.py
|
|
252
|
+
LOCK_KEY_2 = "distributed_lock" # In interceptors.py
|
|
253
|
+
DEFAULT_TTL = 300 # Hardcoded in decorator
|
|
254
|
+
|
|
255
|
+
# ✅ REQUIRE: Centralized configuration
|
|
256
|
+
# In application_sdk/constants.py
|
|
257
|
+
DISTRIBUTED_LOCK_CONFIG_KEY = "distributed_lock_config"
|
|
258
|
+
DEFAULT_LOCK_TTL = 300
|
|
259
|
+
DEFAULT_MAX_LOCKS = 10
|
|
260
|
+
|
|
261
|
+
# In decorators using shared constants
|
|
262
|
+
from application_sdk.constants import DISTRIBUTED_LOCK_CONFIG_KEY, DEFAULT_LOCK_TTL
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
## Educational Context for Decorator Reviews
|
|
266
|
+
|
|
267
|
+
When reviewing decorator code, emphasize:
|
|
268
|
+
|
|
269
|
+
1. **Centralization Impact**: "Scattered decorators create maintenance nightmares. When the same decorator logic appears in multiple places, bugs get fixed in some places but not others. Centralization ensures consistency and reduces maintenance burden."
|
|
270
|
+
|
|
271
|
+
2. **Type Safety Impact**: "Decorators that don't preserve type information break IDE support, static analysis, and developer productivity. Proper type annotations are essential for maintaining code quality in large codebases."
|
|
272
|
+
|
|
273
|
+
3. **Resource Management Impact**: "Decorators often manage resources (locks, connections, contexts). Poor resource management in decorators can cause system-wide issues because they're used across many functions."
|
|
274
|
+
|
|
275
|
+
4. **Function Preservation Impact**: "Decorators that don't preserve original function metadata break debugging, introspection, and documentation tools. Using @functools.wraps is not optional."
|
|
276
|
+
|
|
277
|
+
5. **Testing Impact**: "Decorators are cross-cutting concerns that affect many functions. Bugs in decorators have amplified impact, making thorough testing especially critical."
|
|
278
|
+
|
|
279
|
+
6. **Performance Impact**: "Decorators add overhead to every function call they wrap. Inefficient decorators can degrade system performance across the entire application."
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
# Input Code Review Guidelines - Data Input Processing
|
|
2
|
+
|
|
3
|
+
## Context-Specific Patterns
|
|
4
|
+
|
|
5
|
+
This directory contains input processing implementations for various data formats (JSON, Parquet, SQL). Input processors must handle data efficiently while maintaining data integrity and performance.
|
|
6
|
+
|
|
7
|
+
### Phase 1: Critical Input Safety Issues
|
|
8
|
+
|
|
9
|
+
**Object Store Path Management:**
|
|
10
|
+
|
|
11
|
+
- **Correct path calculation**: Source paths must use the actual object store prefix, not derived local paths
|
|
12
|
+
- **Path validation**: Verify that object store keys are valid and within constraints
|
|
13
|
+
- **User-provided prefixes**: Respect user-configured input prefixes and download paths
|
|
14
|
+
- **Path consistency**: Ensure downloaded files match the expected object store locations
|
|
15
|
+
|
|
16
|
+
**Data Validation and Security:**
|
|
17
|
+
|
|
18
|
+
- All input data must be validated before processing
|
|
19
|
+
- File size limits must be enforced to prevent resource exhaustion
|
|
20
|
+
- File type validation required for uploaded/downloaded files
|
|
21
|
+
- Malicious file content detection for executable or script files
|
|
22
|
+
- Input path traversal prevention
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
# ✅ DO: Proper object store path handling
|
|
26
|
+
class JsonInput:
|
|
27
|
+
async def download_from_object_store(
|
|
28
|
+
self,
|
|
29
|
+
input_prefix: str, # User-provided prefix
|
|
30
|
+
local_destination: str
|
|
31
|
+
) -> List[str]:
|
|
32
|
+
"""Download files with correct path handling."""
|
|
33
|
+
|
|
34
|
+
# Use the actual input prefix, not derived local path
|
|
35
|
+
object_store_source = input_prefix # Keep user's intended source
|
|
36
|
+
|
|
37
|
+
downloaded_files = await self.object_store.download_files(
|
|
38
|
+
source=object_store_source,
|
|
39
|
+
destination=local_destination
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
return downloaded_files
|
|
43
|
+
|
|
44
|
+
# ❌ REJECT: Incorrect path handling
|
|
45
|
+
class BadJsonInput:
|
|
46
|
+
async def download_from_object_store(
|
|
47
|
+
self,
|
|
48
|
+
input_prefix: str,
|
|
49
|
+
local_destination: str
|
|
50
|
+
) -> List[str]:
|
|
51
|
+
# Wrong: derives object store path from local path
|
|
52
|
+
object_store_source = get_object_store_prefix(local_destination)
|
|
53
|
+
# This ignores the user's actual input_prefix!
|
|
54
|
+
|
|
55
|
+
return await self.object_store.download_files(
|
|
56
|
+
source=object_store_source, # Wrong source!
|
|
57
|
+
destination=local_destination
|
|
58
|
+
)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
### Phase 2: Input Architecture Patterns
|
|
62
|
+
|
|
63
|
+
**Performance Optimization Requirements:**
|
|
64
|
+
|
|
65
|
+
- **Parallelization opportunities**: Flag sequential file operations that could be parallelized
|
|
66
|
+
- **Batch processing**: Group related operations to reduce overhead
|
|
67
|
+
- **Memory efficiency**: Process large files in chunks, not all at once
|
|
68
|
+
- **Connection reuse**: Optimize object store connections across operations
|
|
69
|
+
|
|
70
|
+
**Resource Management:**
|
|
71
|
+
|
|
72
|
+
- Use proper connection pooling for object store operations
|
|
73
|
+
- Implement timeout handling for download operations
|
|
74
|
+
- Clean up temporary files after processing
|
|
75
|
+
- Handle partial download failures gracefully
|
|
76
|
+
- Monitor memory usage during large file processing
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
# ✅ DO: Parallelized file processing
|
|
80
|
+
async def download_multiple_files_parallel(
|
|
81
|
+
self,
|
|
82
|
+
file_paths: List[str],
|
|
83
|
+
destination_dir: str
|
|
84
|
+
) -> List[str]:
|
|
85
|
+
"""Download multiple files in parallel for better performance."""
|
|
86
|
+
|
|
87
|
+
async def download_single_file(file_path: str) -> str:
|
|
88
|
+
"""Download a single file with error handling."""
|
|
89
|
+
try:
|
|
90
|
+
return await self.object_store.download_file(
|
|
91
|
+
source=file_path,
|
|
92
|
+
destination=os.path.join(destination_dir, os.path.basename(file_path))
|
|
93
|
+
)
|
|
94
|
+
except Exception as e:
|
|
95
|
+
logger.error(f"Failed to download {file_path}: {e}")
|
|
96
|
+
raise
|
|
97
|
+
|
|
98
|
+
# Parallel processing with controlled concurrency
|
|
99
|
+
semaphore = asyncio.Semaphore(10) # Limit concurrent downloads
|
|
100
|
+
|
|
101
|
+
async def download_with_semaphore(file_path: str) -> str:
|
|
102
|
+
async with semaphore:
|
|
103
|
+
return await download_single_file(file_path)
|
|
104
|
+
|
|
105
|
+
tasks = [download_with_semaphore(path) for path in file_paths]
|
|
106
|
+
return await asyncio.gather(*tasks)
|
|
107
|
+
|
|
108
|
+
# ❌ REJECT: Sequential processing
|
|
109
|
+
async def download_multiple_files_sequential(self, file_paths: List[str]) -> List[str]:
|
|
110
|
+
"""Sequential download - should be flagged for parallelization."""
|
|
111
|
+
downloaded = []
|
|
112
|
+
for file_path in file_paths: # FLAG: Could be parallelized
|
|
113
|
+
result = await self.object_store.download_file(file_path)
|
|
114
|
+
downloaded.append(result)
|
|
115
|
+
return downloaded
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
### Phase 3: Input Testing Requirements
|
|
119
|
+
|
|
120
|
+
**Data Input Testing:**
|
|
121
|
+
|
|
122
|
+
- Test with various file formats and sizes
|
|
123
|
+
- Test malformed data handling
|
|
124
|
+
- Test partial download/upload scenarios
|
|
125
|
+
- Mock object store operations in unit tests
|
|
126
|
+
- Include integration tests with real object store
|
|
127
|
+
- Test error recovery and retry logic
|
|
128
|
+
|
|
129
|
+
**Performance Testing:**
|
|
130
|
+
|
|
131
|
+
- Include tests for large file processing
|
|
132
|
+
- Test memory usage with different chunk sizes
|
|
133
|
+
- Test concurrent download/upload operations
|
|
134
|
+
- Verify timeout handling works correctly
|
|
135
|
+
- Test connection pool behavior
|
|
136
|
+
|
|
137
|
+
### Phase 4: Performance and Scalability
|
|
138
|
+
|
|
139
|
+
**Data Processing Efficiency:**
|
|
140
|
+
|
|
141
|
+
- Use streaming for large files instead of loading entirely into memory
|
|
142
|
+
- Implement proper chunking for batch operations
|
|
143
|
+
- Use async generators for memory-efficient data processing
|
|
144
|
+
- Monitor memory usage and processing time
|
|
145
|
+
- Optimize file I/O operations
|
|
146
|
+
|
|
147
|
+
**Object Store Optimization:**
|
|
148
|
+
|
|
149
|
+
- Use connection pooling for object store clients
|
|
150
|
+
- Implement proper retry logic for transient failures
|
|
151
|
+
- Use parallel operations where appropriate
|
|
152
|
+
- Cache frequently accessed metadata
|
|
153
|
+
- Monitor object store operation metrics
|
|
154
|
+
|
|
155
|
+
### Phase 5: Input Data Maintainability
|
|
156
|
+
|
|
157
|
+
**Error Handling and Recovery:**
|
|
158
|
+
|
|
159
|
+
- Implement comprehensive error handling for all input operations
|
|
160
|
+
- Provide meaningful error messages with context
|
|
161
|
+
- Handle partial failures gracefully (some files fail, others succeed)
|
|
162
|
+
- Implement proper retry logic for transient failures
|
|
163
|
+
- Log all input operations with sufficient context
|
|
164
|
+
|
|
165
|
+
**Configuration Management:**
|
|
166
|
+
|
|
167
|
+
- Externalize all input-related configuration
|
|
168
|
+
- Support different input sources and formats
|
|
169
|
+
- Validate input configuration before processing
|
|
170
|
+
- Document all supported input parameters
|
|
171
|
+
- Handle environment-specific input requirements
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## Input-Specific Anti-Patterns
|
|
176
|
+
|
|
177
|
+
**Always Reject:**
|
|
178
|
+
|
|
179
|
+
- **Path calculation errors**: Using local paths to derive object store paths
|
|
180
|
+
- **Sequential processing**: Processing multiple files sequentially when parallel processing is possible
|
|
181
|
+
- **Memory inefficiency**: Loading large files entirely into memory
|
|
182
|
+
- **Missing error handling**: Input operations without proper `try`/`except` blocks
|
|
183
|
+
- **Poor path validation**: Not validating object store keys or file paths
|
|
184
|
+
- **Resource leaks**: Not cleaning up temporary files or connections
|
|
185
|
+
|
|
186
|
+
**Object Store Anti-Patterns:**
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
# ❌ REJECT: Incorrect object store usage
|
|
190
|
+
class BadInputProcessor:
|
|
191
|
+
async def process_files(self, local_files: List[str]):
|
|
192
|
+
# Wrong: derives object store path from local path
|
|
193
|
+
for local_file in local_files:
|
|
194
|
+
object_store_key = get_object_store_prefix(local_file) # Incorrect!
|
|
195
|
+
await self.object_store.download_file(object_store_key, local_file)
|
|
196
|
+
|
|
197
|
+
# ✅ REQUIRE: Correct object store usage
|
|
198
|
+
class GoodInputProcessor:
|
|
199
|
+
async def process_files(
|
|
200
|
+
self,
|
|
201
|
+
object_store_paths: List[str], # Actual object store paths
|
|
202
|
+
local_destination_dir: str
|
|
203
|
+
):
|
|
204
|
+
# Use actual object store paths, not derived ones
|
|
205
|
+
for object_store_path in object_store_paths:
|
|
206
|
+
local_file = os.path.join(
|
|
207
|
+
local_destination_dir,
|
|
208
|
+
os.path.basename(object_store_path)
|
|
209
|
+
)
|
|
210
|
+
await self.object_store.download_file(object_store_path, local_file)
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
**Performance Anti-Patterns:**
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
# ❌ REJECT: Sequential file processing
|
|
217
|
+
async def process_files_sequential(file_list: List[str]):
|
|
218
|
+
results = []
|
|
219
|
+
for file_path in file_list: # Should be parallelized
|
|
220
|
+
result = await process_single_file(file_path)
|
|
221
|
+
results.append(result)
|
|
222
|
+
return results
|
|
223
|
+
|
|
224
|
+
# ✅ REQUIRE: Parallel file processing
|
|
225
|
+
async def process_files_parallel(file_list: List[str], max_concurrency: int = 10):
|
|
226
|
+
semaphore = asyncio.Semaphore(max_concurrency)
|
|
227
|
+
|
|
228
|
+
async def process_with_semaphore(file_path: str):
|
|
229
|
+
async with semaphore:
|
|
230
|
+
return await process_single_file(file_path)
|
|
231
|
+
|
|
232
|
+
tasks = [process_with_semaphore(path) for path in file_list]
|
|
233
|
+
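return await asyncio.gather(*tasks, return_exceptions=True) # NOTE: failures are returned as exception objects in the result list; callers must check for and handle them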
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
## Educational Context for Input Reviews
|
|
237
|
+
|
|
238
|
+
When reviewing input code, emphasize:
|
|
239
|
+
|
|
240
|
+
1. **Data Integrity Impact**: "Incorrect object store path handling can cause data loss or corruption. Files downloaded from the wrong prefix, or uploaded to the wrong location, become inaccessible to downstream steps, breaking data processing pipelines."
|
|
241
|
+
|
|
242
|
+
2. **Performance Impact**: "Sequential file processing creates unnecessary bottlenecks. For enterprise datasets with hundreds of files, parallelization can reduce processing time from hours to minutes."
|
|
243
|
+
|
|
244
|
+
3. **Resource Impact**: "Poor memory management in input processing can cause out-of-memory errors with large datasets. Streaming and chunking are essential for enterprise-scale data processing."
|
|
245
|
+
|
|
246
|
+
4. **User Experience Impact**: "Input path handling errors are often silent until runtime, causing difficult-to-debug failures. Proper validation and clear error messages save hours of troubleshooting."
|
|
247
|
+
|
|
248
|
+
5. **Scalability Impact**: "Input processing patterns that work for small datasets can fail catastrophically at enterprise scale. Always design for the largest expected dataset size."
|
|
249
|
+
|
|
250
|
+
6. **Reliability Impact**: "Input operations are often the first point of failure in data pipelines. Robust error handling and retry logic in input processing prevents entire workflows from failing due to transient issues."
|