atlan-application-sdk 0.1.1rc38__py3-none-any.whl → 0.1.1rc40__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/.cursor/BUGBOT.md +424 -0
- application_sdk/clients/.cursor/BUGBOT.md +280 -0
- application_sdk/clients/sql.py +110 -74
- application_sdk/clients/temporal.py +4 -2
- application_sdk/common/.cursor/BUGBOT.md +316 -0
- application_sdk/constants.py +8 -0
- application_sdk/decorators/.cursor/BUGBOT.md +279 -0
- application_sdk/inputs/.cursor/BUGBOT.md +250 -0
- application_sdk/interceptors/.cursor/BUGBOT.md +320 -0
- application_sdk/interceptors/cleanup.py +171 -0
- application_sdk/interceptors/events.py +6 -6
- application_sdk/outputs/.cursor/BUGBOT.md +295 -0
- application_sdk/outputs/iceberg.py +4 -0
- application_sdk/outputs/json.py +6 -0
- application_sdk/outputs/parquet.py +89 -34
- application_sdk/server/.cursor/BUGBOT.md +442 -0
- application_sdk/services/objectstore.py +98 -20
- application_sdk/version.py +1 -1
- application_sdk/workflows/.cursor/BUGBOT.md +218 -0
- {atlan_application_sdk-0.1.1rc38.dist-info → atlan_application_sdk-0.1.1rc40.dist-info}/METADATA +1 -1
- {atlan_application_sdk-0.1.1rc38.dist-info → atlan_application_sdk-0.1.1rc40.dist-info}/RECORD +24 -14
- {atlan_application_sdk-0.1.1rc38.dist-info → atlan_application_sdk-0.1.1rc40.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-0.1.1rc38.dist-info → atlan_application_sdk-0.1.1rc40.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-0.1.1rc38.dist-info → atlan_application_sdk-0.1.1rc40.dist-info}/licenses/NOTICE +0 -0
application_sdk/outputs/.cursor/BUGBOT.md
ADDED
@@ -0,0 +1,295 @@

# Output Code Review Guidelines - Data Output Processing

## Context-Specific Patterns

This directory contains output processing implementations for various data formats (JSON, Parquet, Iceberg). Output processors must handle data uploads efficiently while maintaining data integrity and correct destination paths.
### Phase 1: Critical Output Safety Issues

**Object Store Path Management:**

- **Correct destination paths**: Upload paths must respect user-configured output prefixes
- **Path construction accuracy**: Object store keys must be calculated correctly, not hardcoded
- **User prefix preservation**: Respect user-provided output directories and naming conventions
- **Path validation**: Ensure upload paths don't conflict with existing data

**Data Integrity and Security:**

- All output data must be validated before upload
- File permissions and access controls must be properly set
- Data serialization must be consistent and recoverable
- Prevent overwriting critical data without confirmation
- Maintain data lineage information in output metadata

```python
# ✅ DO: Proper object store upload path handling
class JsonOutput:
    async def upload_to_object_store(
        self,
        data: List[dict],
        output_prefix: str,  # User-provided output location
        filename: str
    ) -> dict:
        """Upload data with correct path handling."""

        # Construct full object store path respecting user's output prefix
        object_store_key = os.path.join(output_prefix, filename)

        # Serialize data
        json_data = orjson.dumps(data, option=orjson.OPT_APPEND_NEWLINE)

        # Upload to correct location
        result = await self.object_store.upload_file(
            data=json_data,
            destination=object_store_key  # Respect user's intended location
        )

        return result

# ❌ REJECT: Incorrect path handling
class BadJsonOutput:
    async def upload_to_object_store(self, data: List[dict], filename: str):
        # Wrong: hardcoded or derived path, ignoring user configuration
        object_store_key = get_object_store_prefix(f"/tmp/{filename}")  # Ignores output_prefix!

        result = await self.object_store.upload_file(
            data=orjson.dumps(data),
            destination=object_store_key  # Wrong destination!
        )
        return result
```

### Phase 2: Output Architecture Patterns

**Performance Optimization Requirements:**

- **Parallelization opportunities**: Flag sequential upload operations that could be parallelized
- **Batch processing**: Group related uploads to reduce overhead
- **Streaming uploads**: Use streaming for large datasets instead of loading into memory
- **Connection optimization**: Reuse object store connections across operations

**Resource Management:**

- Use proper connection pooling for object store operations
- Implement timeout handling for upload operations
- Clean up temporary files after upload
- Handle partial upload failures gracefully
- Monitor memory usage during large data serialization

```python
# ✅ DO: Parallel upload processing
async def upload_multiple_datasets_parallel(
    self,
    datasets: List[Tuple[List[dict], str]],  # (data, filename) pairs
    output_prefix: str
) -> List[dict]:
    """Upload multiple datasets in parallel for better performance."""

    async def upload_single_dataset(data: List[dict], filename: str) -> dict:
        """Upload a single dataset with error handling."""
        try:
            object_store_key = os.path.join(output_prefix, filename)
            serialized_data = orjson.dumps(data, option=orjson.OPT_APPEND_NEWLINE)

            return await self.object_store.upload_file(
                data=serialized_data,
                destination=object_store_key
            )
        except Exception as e:
            logger.error(f"Failed to upload {filename}: {e}")
            raise

    # Parallel processing with controlled concurrency
    semaphore = asyncio.Semaphore(5)  # Limit concurrent uploads

    async def upload_with_semaphore(data: List[dict], filename: str) -> dict:
        async with semaphore:
            return await upload_single_dataset(data, filename)

    tasks = [upload_with_semaphore(data, filename) for data, filename in datasets]
    return await asyncio.gather(*tasks)

# ❌ REJECT: Sequential upload processing
async def upload_multiple_datasets_sequential(
    self,
    datasets: List[Tuple[List[dict], str]],
    output_prefix: str
) -> List[dict]:
    """Sequential uploads - should be flagged for parallelization."""
    results = []
    for data, filename in datasets:  # FLAG: Could be parallelized
        object_store_key = os.path.join(output_prefix, filename)
        result = await self.object_store.upload_file(data, object_store_key)
        results.append(result)
    return results
```

### Phase 3: Output Testing Requirements

**Data Output Testing:**

- Test with various data formats and sizes
- Test serialization and deserialization consistency
- Test partial upload scenarios and recovery
- Mock object store operations in unit tests
- Include integration tests with real object store
- Test data corruption detection and prevention

**Performance Testing:**

- Include tests for large dataset uploads
- Test memory usage during serialization
- Test concurrent upload operations
- Verify timeout handling works correctly
- Test connection pool behavior under load

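To make the "mock object store operations in unit tests" item above concrete, here is a minimal pytest-style sketch. It is an illustration, not part of the packaged guidelines or the SDK: `JsonOutputSketch` mirrors the toy example in Phase 1, and it assumes `pytest-asyncio` is available for the async test.

```python
import os
from unittest.mock import AsyncMock

import pytest


class JsonOutputSketch:
    """Toy stand-in for the JsonOutput example above (not the SDK class)."""

    def __init__(self, object_store):
        self.object_store = object_store

    async def upload_to_object_store(self, data, output_prefix, filename):
        destination = os.path.join(output_prefix, filename)
        return await self.object_store.upload_file(data=data, destination=destination)


@pytest.mark.asyncio  # assumes pytest-asyncio is installed
async def test_upload_respects_output_prefix():
    # Mock the object store so the unit test makes no network calls
    mock_store = AsyncMock()
    mock_store.upload_file.return_value = {"status": "ok"}

    output = JsonOutputSketch(object_store=mock_store)
    result = await output.upload_to_object_store(
        data=[{"id": 1}], output_prefix="datasets/run-42", filename="chunk-0.json"
    )

    # The destination must come from the user-supplied prefix, not a temp path
    mock_store.upload_file.assert_awaited_once_with(
        data=[{"id": 1}],
        destination=os.path.join("datasets/run-42", "chunk-0.json"),
    )
    assert result == {"status": "ok"}
```
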
### Phase 4: Performance and Scalability

**Data Upload Efficiency:**

- Use streaming uploads for large datasets
- Implement proper chunking for oversized data
- Use compression for large text-based outputs
- Monitor upload progress and provide feedback
- Optimize serialization performance (use orjson over json)

**Object Store Optimization:**

- Use connection pooling for object store clients
- Implement proper retry logic for upload failures
- Use parallel uploads where appropriate
- Monitor upload metrics and error rates
- Handle bandwidth limitations gracefully

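As an illustration of the "proper retry logic for upload failures" item above, the following sketch wraps an upload call in exponential backoff with jitter. The `upload_file` callable is hypothetical; in practice you would also narrow the `except` clause to transient error types rather than catching every exception.

```python
import asyncio
import logging
import random

logger = logging.getLogger(__name__)


async def upload_with_retry(
    upload_file,          # async callable performing the actual upload (hypothetical)
    data: bytes,
    destination: str,
    max_attempts: int = 4,
    base_delay: float = 0.5,
) -> dict:
    """Retry transient upload failures with exponential backoff and jitter."""
    for attempt in range(1, max_attempts + 1):
        try:
            return await upload_file(data=data, destination=destination)
        except Exception as e:
            if attempt == max_attempts:
                logger.error(f"Upload to {destination} failed after {attempt} attempts: {e}")
                raise
            # Exponential backoff with jitter so concurrent retries do not stampede
            delay = base_delay * (2 ** (attempt - 1)) + random.uniform(0, 0.1)
            logger.warning(
                f"Upload to {destination} failed (attempt {attempt}): {e}; retrying in {delay:.2f}s"
            )
            await asyncio.sleep(delay)
```
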
### Phase 5: Output Maintainability

**Error Handling and Recovery:**

- Implement comprehensive error handling for all upload operations
- Provide meaningful error messages with upload context
- Handle partial upload failures gracefully
- Implement proper retry logic for transient failures
- Log all upload operations with destination information

**Configuration Management:**

- Externalize all output-related configuration
- Support different output destinations and formats
- Validate output configuration before processing
- Document all supported output parameters
- Handle environment-specific output requirements

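One way to apply the "validate output configuration before processing" item above is a small, fail-fast configuration object. The field names below are illustrative and are not the SDK's actual configuration schema.

```python
from dataclasses import dataclass

SUPPORTED_FORMATS = {"json", "parquet", "iceberg"}


@dataclass(frozen=True)
class OutputConfig:
    """Illustrative output configuration, validated at construction time."""

    output_prefix: str
    output_format: str = "json"
    chunk_size: int = 100_000

    def __post_init__(self) -> None:
        if not self.output_prefix or self.output_prefix.startswith("/tmp"):
            raise ValueError(
                f"output_prefix must be a non-temporary destination, got {self.output_prefix!r}"
            )
        if self.output_format not in SUPPORTED_FORMATS:
            raise ValueError(
                f"output_format must be one of {sorted(SUPPORTED_FORMATS)}, got {self.output_format!r}"
            )
        if self.chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {self.chunk_size}")


# Validation happens before any data is written:
config = OutputConfig(output_prefix="datasets/run-42", output_format="parquet")
```
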
---

## Output-Specific Anti-Patterns

**Always Reject:**

- **Path derivation errors**: Deriving object store paths from local temporary paths
- **Sequential uploads**: Uploading multiple files sequentially when parallel uploads are possible
- **Memory inefficiency**: Loading entire datasets into memory for serialization
- **Missing upload verification**: Not verifying successful uploads
- **Poor error recovery**: Not handling partial upload failures gracefully
- **Resource leaks**: Not cleaning up temporary files or connections

**Object Store Upload Anti-Patterns:**

```python
# ❌ REJECT: Incorrect upload path handling
class BadOutputProcessor:
    async def upload_results(self, results: List[dict]):
        # Wrong: derives upload path from temporary local path
        local_temp_file = "/tmp/results.json"
        upload_key = get_object_store_prefix(local_temp_file)  # Incorrect!

        await self.object_store.upload_file(results, upload_key)

# ✅ REQUIRE: Correct upload path handling
class GoodOutputProcessor:
    async def upload_results(
        self,
        results: List[dict],
        output_prefix: str,  # User-specified destination
        filename: str = "results.json"
    ):
        # Use actual user-configured output location
        upload_key = os.path.join(output_prefix, filename)

        await self.object_store.upload_file(
            data=orjson.dumps(results),
            destination=upload_key  # Correct destination
        )
```

**Performance Anti-Patterns:**

```python
# ❌ REJECT: Sequential upload processing
async def upload_multiple_files_sequential(file_data_pairs: List[Tuple]):
    results = []
    for data, filename in file_data_pairs:  # Should be parallelized
        result = await upload_single_file(data, filename)
        results.append(result)
    return results

# ✅ REQUIRE: Parallel upload processing with proper error handling
async def upload_multiple_files_parallel(
    file_data_pairs: List[Tuple],
    max_concurrency: int = 5
) -> List[dict]:
    semaphore = asyncio.Semaphore(max_concurrency)

    async def upload_with_semaphore(data, filename):
        async with semaphore:
            try:
                return await upload_single_file(data, filename)
            except Exception as e:
                logger.error(f"Upload failed for {filename}: {e}")
                return {"filename": filename, "status": "failed", "error": str(e)}

    tasks = [upload_with_semaphore(data, filename) for data, filename in file_data_pairs]
    return await asyncio.gather(*tasks)
```

**Memory Management Anti-Patterns:**

```python
# ❌ REJECT: Loading entire dataset for serialization
async def bad_large_dataset_upload(large_dataset: List[dict]):
    # Loads entire dataset into memory
    json_data = orjson.dumps(large_dataset)  # Could exceed memory limits
    await upload_data(json_data)

# ✅ REQUIRE: Streaming serialization for large datasets
async def good_large_dataset_upload(large_dataset: List[dict], chunk_size: int = 1000):
    """Stream large datasets to avoid memory issues."""

    async def serialize_chunk(chunk: List[dict]) -> bytes:
        return orjson.dumps(chunk, option=orjson.OPT_APPEND_NEWLINE)

    # Process in chunks to manage memory
    for i in range(0, len(large_dataset), chunk_size):
        chunk = large_dataset[i:i + chunk_size]
        serialized_chunk = await serialize_chunk(chunk)

        await upload_chunk(
            data=serialized_chunk,
            chunk_index=i // chunk_size
        )
```

## Educational Context for Output Reviews

When reviewing output code, emphasize:

1. **Data Integrity Impact**: "Incorrect upload path handling can cause data to be stored in wrong locations, making it inaccessible to downstream processes. This breaks data pipelines and can cause data loss."

2. **Performance Impact**: "Sequential uploads create unnecessary bottlenecks. For enterprise datasets with multiple output files, parallelization can significantly reduce processing time and improve user experience."

3. **Resource Impact**: "Poor memory management during serialization can cause out-of-memory errors with large datasets. Streaming and chunking are essential for enterprise-scale data output."

4. **User Experience Impact**: "Output path errors are often discovered late in processing, causing wasted computation and frustrating delays. Proper validation and clear error messages improve reliability."

5. **Scalability Impact**: "Output patterns that work for small datasets can fail at enterprise scale. Always design output processes to handle the largest expected dataset sizes efficiently."

6. **Data Pipeline Impact**: "Output processing is the final step in data pipelines. Failures here can invalidate all upstream processing work. Robust error handling and verification are critical for pipeline reliability."

application_sdk/outputs/iceberg.py
CHANGED
@@ -29,6 +29,7 @@ class IcebergOutput(Output):
         mode: str = "append",
         total_record_count: int = 0,
         chunk_count: int = 0,
+        retain_local_copy: bool = False,
     ):
         """Initialize the Iceberg output class.

@@ -39,6 +40,8 @@ class IcebergOutput(Output):
             mode (str, optional): Write mode for the iceberg table. Defaults to "append".
             total_record_count (int, optional): Total record count written to the iceberg table. Defaults to 0.
             chunk_count (int, optional): Number of chunks written to the iceberg table. Defaults to 0.
+            retain_local_copy (bool, optional): Whether to retain the local copy of the files.
+                Defaults to False.
         """
         self.total_record_count = total_record_count
         self.chunk_count = chunk_count
@@ -47,6 +50,7 @@ class IcebergOutput(Output):
         self.iceberg_table = iceberg_table
         self.mode = mode
         self.metrics = get_metrics()
+        self.retain_local_copy = retain_local_copy

     async def write_dataframe(self, dataframe: "pd.DataFrame"):
         """
application_sdk/outputs/json.py
CHANGED
@@ -93,6 +93,7 @@ class JsonOutput(Output):
         path_gen: Callable[[int | None, int], str] = path_gen,
         start_marker: Optional[str] = None,
         end_marker: Optional[str] = None,
+        retain_local_copy: bool = False,
         **kwargs: Dict[str, Any],
     ):
         """Initialize the JSON output handler.
@@ -113,6 +114,8 @@ class JsonOutput(Output):
                 Defaults to 0.
             path_gen (Callable, optional): Function to generate file paths.
                 Defaults to path_gen function.
+            retain_local_copy (bool, optional): Whether to retain the local copy of the files.
+                Defaults to False.
         """
         self.output_path = output_path
         self.output_suffix = output_suffix
@@ -133,6 +136,7 @@ class JsonOutput(Output):
         self.start_marker = start_marker
         self.end_marker = end_marker
         self.metrics = get_metrics()
+        self.retain_local_copy = retain_local_copy

         if not self.output_path:
             raise ValueError("output_path is required")
@@ -282,6 +286,7 @@ class JsonOutput(Output):
             await ObjectStore.upload_prefix(
                 source=self.output_path,
                 destination=get_object_store_prefix(self.output_path),
+                retain_local_copy=self.retain_local_copy,
             )

         except Exception as e:
@@ -367,6 +372,7 @@ class JsonOutput(Output):
             await ObjectStore.upload_file(
                 source=output_file_name,
                 destination=get_object_store_prefix(output_file_name),
+                retain_local_copy=self.retain_local_copy,
             )

             self.buffer.clear()
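Taken together, these hunks let callers keep the locally written JSON files after they are uploaded to the object store. A hedged usage sketch, based only on the argument names visible above (other constructor arguments may be required in practice):

```python
from application_sdk.outputs.json import JsonOutput

# Sketch only: argument names come from the hunks above; any other required
# constructor arguments are omitted here.
output = JsonOutput(
    output_path="./local/tmp/json-output",  # hypothetical local staging directory
    retain_local_copy=True,                 # keep local files after they are uploaded
)

# On flush, the upload calls now forward the flag, roughly:
#   await ObjectStore.upload_prefix(
#       source=output.output_path,
#       destination=get_object_store_prefix(output.output_path),
#       retain_local_copy=output.retain_local_copy,
#   )
```
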
application_sdk/outputs/parquet.py
CHANGED

@@ -1,5 +1,6 @@
 import os
-from
+from enum import Enum
+from typing import TYPE_CHECKING, List, Optional, Union

 from temporalio import activity

@@ -18,6 +19,14 @@ if TYPE_CHECKING:
     import pandas as pd


+class WriteMode(Enum):
+    """Enumeration of write modes for Parquet output operations."""
+
+    APPEND = "append"
+    OVERWRITE = "overwrite"
+    OVERWRITE_PARTITIONS = "overwrite-partitions"
+
+
 class ParquetOutput(Output):
     """Output handler for writing data to Parquet files.

@@ -29,7 +38,6 @@ class ParquetOutput(Output):
         output_prefix (str): Prefix for files when uploading to object store.
         output_suffix (str): Suffix for output files.
         typename (Optional[str]): Type name of the entity e.g database, schema, table.
-        mode (str): Write mode for parquet files ("append" or "overwrite").
         chunk_size (int): Maximum number of records per chunk.
         total_record_count (int): Total number of records processed.
         chunk_count (int): Number of chunks created.
@@ -45,7 +53,6 @@ class ParquetOutput(Output):
         output_suffix: str = "",
         output_prefix: str = "",
         typename: Optional[str] = None,
-        write_mode: Literal["append", "overwrite", "overwrite-partitions"] = "append",
         chunk_size: Optional[int] = 100000,
         buffer_size: Optional[int] = 100000,
         total_record_count: int = 0,
@@ -53,6 +60,7 @@ class ParquetOutput(Output):
         chunk_start: Optional[int] = None,
         start_marker: Optional[str] = None,
         end_marker: Optional[str] = None,
+        retain_local_copy: bool = False,
     ):
         """Initialize the Parquet output handler.

@@ -61,7 +69,6 @@ class ParquetOutput(Output):
             output_suffix (str): Suffix for output files.
             output_prefix (str): Prefix for files when uploading to object store.
             typename (Optional[str], optional): Type name of the entity e.g database, schema, table.
-            mode (str, optional): Write mode for parquet files. Defaults to "append".
             chunk_size (int, optional): Maximum records per chunk. Defaults to 100000.
             total_record_count (int, optional): Initial total record count. Defaults to 0.
             chunk_count (int, optional): Initial chunk count. Defaults to 0.
@@ -73,12 +80,13 @@ class ParquetOutput(Output):
                 Defaults to None.
             end_marker (Optional[str], optional): End marker for query extraction.
                 Defaults to None.
+            retain_local_copy (bool, optional): Whether to retain the local copy of the files.
+                Defaults to False.
         """
         self.output_path = output_path
         self.output_suffix = output_suffix
         self.output_prefix = output_prefix
         self.typename = typename
-        self.write_mode = write_mode
         self.chunk_size = chunk_size
         self.buffer_size = buffer_size
         self.buffer: List[Union["pd.DataFrame", "daft.DataFrame"]] = []  # noqa: F821
@@ -94,6 +102,7 @@ class ParquetOutput(Output):
         self.end_marker = end_marker
         self.statistics = []
         self.metrics = get_metrics()
+        self.retain_local_copy = retain_local_copy

         # Create output directory
         self.output_path = os.path.join(self.output_path, self.output_suffix)
@@ -103,7 +112,7 @@ class ParquetOutput(Output):

     def path_gen(
         self,
-        chunk_start: int
+        chunk_start: Optional[int] = None,
         chunk_count: int = 0,
         start_marker: Optional[str] = None,
         end_marker: Optional[str] = None,
@@ -111,7 +120,7 @@ class ParquetOutput(Output):
         """Generate a file path for a chunk.

         Args:
-            chunk_start (int
+            chunk_start (Optional[int]): Starting index of the chunk, or None for single chunk.
             chunk_count (int): Total number of chunks.
             start_marker (Optional[str]): Start marker for query extraction.
             end_marker (Optional[str]): End marker for query extraction.
@@ -182,7 +191,7 @@ class ParquetOutput(Output):
                 name="parquet_write_records",
                 value=len(dataframe),
                 metric_type=MetricType.COUNTER,
-                labels={"type": "pandas", "mode":
+                labels={"type": "pandas", "mode": WriteMode.APPEND.value},
                 description="Number of records written to Parquet files from pandas DataFrame",
             )

@@ -191,7 +200,7 @@ class ParquetOutput(Output):
                 name="parquet_chunks_written",
                 value=1,
                 metric_type=MetricType.COUNTER,
-                labels={"type": "pandas", "mode":
+                labels={"type": "pandas", "mode": WriteMode.APPEND.value},
                 description="Number of chunks written to Parquet files",
             )

@@ -203,69 +212,115 @@ class ParquetOutput(Output):
                 name="parquet_write_errors",
                 value=1,
                 metric_type=MetricType.COUNTER,
-                labels={
+                labels={
+                    "type": "pandas",
+                    "mode": WriteMode.APPEND.value,
+                    "error": str(e),
+                },
                 description="Number of errors while writing to Parquet files",
             )
             logger.error(f"Error writing pandas dataframe to parquet: {str(e)}")
             raise

-    async def write_daft_dataframe(
+    async def write_daft_dataframe(
+        self,
+        dataframe: "daft.DataFrame",  # noqa: F821
+        partition_cols: Optional[List] = None,
+        write_mode: Union[WriteMode, str] = WriteMode.APPEND,
+        morsel_size: int = 100_000,
+    ):
         """Write a daft DataFrame to Parquet files and upload to object store.

+        Uses Daft's native file size management to automatically split large DataFrames
+        into multiple parquet files based on the configured target file size. Supports
+        Hive partitioning for efficient data organization.
+
         Args:
             dataframe (daft.DataFrame): The DataFrame to write.
+            partition_cols (Optional[List]): Column names or expressions to use for Hive partitioning.
+                Can be strings (column names) or daft column expressions. If None (default), no partitioning is applied.
+            write_mode (Union[WriteMode, str]): Write mode for parquet files.
+                Use WriteMode.APPEND, WriteMode.OVERWRITE, WriteMode.OVERWRITE_PARTITIONS, or their string equivalents.
+            morsel_size (int): Default number of rows in a morsel used for the new local executor, when running locally on just a single machine,
+                Daft does not use partitions. Instead of using partitioning to control parallelism, the local execution engine performs a streaming-based
+                execution on small "morsels" of data, which provides much more stable memory utilization while improving the user experience with not having
+                to worry about partitioning.
+
+        Note:
+            - Daft automatically handles file chunking based on parquet_target_filesize
+            - Multiple files will be created if DataFrame exceeds DAPR limit
+            - If partition_cols is set, creates Hive-style directory structure
         """
         try:
+            import daft
+
+            # Convert string to enum if needed for backward compatibility
+            if isinstance(write_mode, str):
+                write_mode = WriteMode(write_mode)
+
             row_count = dataframe.count_rows()
             if row_count == 0:
                 return

+            # Use Daft's execution context for temporary configuration
+            with daft.execution_config_ctx(
+                parquet_target_filesize=self.max_file_size_bytes,
+                default_morsel_size=morsel_size,
+            ):
+                # Daft automatically handles file splitting and naming
+                dataframe.write_parquet(
+                    root_dir=self.output_path,
+                    write_mode=write_mode.value,
+                    partition_cols=partition_cols if partition_cols else [],
+                )
+
             # Update counters
             self.chunk_count += 1
             self.total_record_count += row_count

-            # Generate file path using path_gen function
-            if self.start_marker and self.end_marker:
-                file_path = self.output_path
-            else:
-                file_path = f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count, self.start_marker, self.end_marker)}"
-
-            # Write the dataframe to parquet using daft
-            dataframe.write_parquet(
-                file_path,
-                write_mode=self.write_mode,
-            )
-
             # Record metrics for successful write
             self.metrics.record_metric(
                 name="parquet_write_records",
                 value=row_count,
                 metric_type=MetricType.COUNTER,
-                labels={"type": "daft", "mode":
+                labels={"type": "daft", "mode": write_mode.value},
                 description="Number of records written to Parquet files from daft DataFrame",
             )

-            # Record
+            # Record operation metrics (note: actual file count may be higher due to Daft's splitting)
             self.metrics.record_metric(
-                name="
+                name="parquet_write_operations",
                 value=1,
                 metric_type=MetricType.COUNTER,
-                labels={"type": "daft", "mode":
-                description="Number of
+                labels={"type": "daft", "mode": write_mode.value},
+                description="Number of write operations to Parquet files",
             )

-            #
-
-
-
+            # Upload the entire directory (contains multiple parquet files created by Daft)
+            if write_mode == WriteMode.OVERWRITE:
+                # Delete the directory from object store
+                try:
+                    await ObjectStore.delete_prefix(
+                        prefix=get_object_store_prefix(self.output_path)
+                    )
+                except FileNotFoundError as e:
+                    logger.info(
+                        f"No files found under prefix {get_object_store_prefix(self.output_path)}: {str(e)}"
+                    )
+
+            await ObjectStore.upload_prefix(
+                source=self.output_path,
+                destination=get_object_store_prefix(self.output_path),
+                retain_local_copy=self.retain_local_copy,
             )
+
         except Exception as e:
             # Record metrics for failed write
             self.metrics.record_metric(
                 name="parquet_write_errors",
                 value=1,
                 metric_type=MetricType.COUNTER,
-                labels={"type": "daft", "mode":
+                labels={"type": "daft", "mode": write_mode, "error": str(e)},
                 description="Number of errors while writing to Parquet files",
             )
             logger.error(f"Error writing daft dataframe to parquet: {str(e)}")
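For context, a hedged usage sketch of the reworked `write_daft_dataframe` signature shown in the hunk above. Only the parameter names and the `WriteMode` enum come from the diff; the DataFrame construction and the constructor call are illustrative, and other `ParquetOutput` arguments may be required in practice.

```python
import daft

from application_sdk.outputs.parquet import ParquetOutput, WriteMode


async def export_tables() -> None:
    # Illustrative only: other ParquetOutput constructor arguments are omitted.
    output = ParquetOutput(output_path="./local/tmp/parquet-output")

    df = daft.from_pydict(
        {"database_name": ["db1", "db1", "db2"], "table_count": [10, 12, 3]}
    )

    # write_mode accepts the enum or its string value ("append", "overwrite",
    # "overwrite-partitions"); strings are converted to WriteMode internally.
    await output.write_daft_dataframe(
        df,
        partition_cols=["database_name"],  # optional Hive-style partitioning
        write_mode=WriteMode.OVERWRITE,    # clears the object store prefix before upload
    )
```
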
@@ -279,7 +334,7 @@ class ParquetOutput(Output):
         """
         return self.output_path

-    async def _flush_buffer(self, chunk_part):
+    async def _flush_buffer(self, chunk_part: int):
         """Flush the current buffer to a Parquet file.

         This method combines all DataFrames in the buffer, writes them to a Parquet file,