atlan-application-sdk 0.1.1rc38__py3-none-any.whl → 0.1.1rc40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,295 @@
1
+ # Output Code Review Guidelines - Data Output Processing
2
+
3
+ ## Context-Specific Patterns
4
+
5
+ This directory contains output processing implementations for various data formats (JSON, Parquet, Iceberg). Output processors must handle data uploads efficiently while maintaining data integrity and correct destination paths.
6
+
7
+ ### Phase 1: Critical Output Safety Issues
8
+
9
+ **Object Store Path Management:**
10
+
11
+ - **Correct destination paths**: Upload paths must respect user-configured output prefixes
12
+ - **Path construction accuracy**: Object store keys must be calculated correctly, not hardcoded
13
+ - **User prefix preservation**: Respect user-provided output directories and naming conventions
14
+ - **Path validation**: Ensure upload paths don't conflict with existing data
15
+
16
+ **Data Integrity and Security:**
17
+
18
+ - All output data must be validated before upload
19
+ - File permissions and access controls must be properly set
20
+ - Data serialization must be consistent and recoverable
21
+ - Prevent overwriting critical data without confirmation (an overwrite-guard sketch follows this list)
22
+ - Maintain data lineage information in output metadata
23
+
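+ A minimal overwrite-guard sketch for the path-validation and overwrite bullets above (the `object_store.exists` call and the `overwrite` flag are illustrative assumptions, not SDK API):
+
+ ```python
+ # ✅ DO: Refuse to silently overwrite existing output objects
+ async def guard_against_overwrite(object_store, destination: str, overwrite: bool = False) -> None:
+     """Raise before upload if the destination already holds data."""
+     if not overwrite and await object_store.exists(destination):  # assumed API
+         raise FileExistsError(
+             f"Refusing to overwrite existing object at {destination}; pass overwrite=True to replace it"
+         )
+ ```
+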
24
+ ```python
25
+ # ✅ DO: Proper object store upload path handling
26
+ class JsonOutput:
27
+ async def upload_to_object_store(
28
+ self,
29
+ data: List[dict],
30
+ output_prefix: str, # User-provided output location
31
+ filename: str
32
+ ) -> dict:
33
+ """Upload data with correct path handling."""
34
+
35
+ # Construct full object store path respecting user's output prefix
36
+ object_store_key = os.path.join(output_prefix, filename)
37
+
38
+ # Serialize data
39
+ json_data = orjson.dumps(data, option=orjson.OPT_APPEND_NEWLINE)
40
+
41
+ # Upload to correct location
42
+ result = await self.object_store.upload_file(
43
+ data=json_data,
44
+ destination=object_store_key # Respect user's intended location
45
+ )
46
+
47
+ return result
48
+
49
+ # ❌ REJECT: Incorrect path handling
50
+ class BadJsonOutput:
51
+ async def upload_to_object_store(self, data: List[dict], filename: str):
52
+ # Wrong: hardcoded or derived path, ignoring user configuration
53
+ object_store_key = get_object_store_prefix(f"/tmp/{filename}") # Ignores output_prefix!
54
+
55
+ result = await self.object_store.upload_file(
56
+ data=orjson.dumps(data),
57
+ destination=object_store_key # Wrong destination!
58
+ )
59
+ return result
60
+ ```
61
+
62
+ ### Phase 2: Output Architecture Patterns
63
+
64
+ **Performance Optimization Requirements:**
65
+
66
+ - **Parallelization opportunities**: Flag sequential upload operations that could be parallelized
67
+ - **Batch processing**: Group related uploads to reduce overhead
68
+ - **Streaming uploads**: Stream large datasets instead of loading them entirely into memory
69
+ - **Connection optimization**: Reuse object store connections across operations
70
+
71
+ **Resource Management:**
72
+
73
+ - Use proper connection pooling for object store operations
74
+ - Implement timeout handling for upload operations (a timeout-and-cleanup sketch follows the example below)
75
+ - Clean up temporary files after upload
76
+ - Handle partial upload failures gracefully
77
+ - Monitor memory usage during large data serialization
78
+
79
+ ```python
80
+ # ✅ DO: Parallel upload processing
81
+ async def upload_multiple_datasets_parallel(
82
+ self,
83
+ datasets: List[Tuple[List[dict], str]], # (data, filename) pairs
84
+ output_prefix: str
85
+ ) -> List[dict]:
86
+ """Upload multiple datasets in parallel for better performance."""
87
+
88
+ async def upload_single_dataset(data: List[dict], filename: str) -> dict:
89
+ """Upload a single dataset with error handling."""
90
+ try:
91
+ object_store_key = os.path.join(output_prefix, filename)
92
+ serialized_data = orjson.dumps(data, option=orjson.OPT_APPEND_NEWLINE)
93
+
94
+ return await self.object_store.upload_file(
95
+ data=serialized_data,
96
+ destination=object_store_key
97
+ )
98
+ except Exception as e:
99
+ logger.error(f"Failed to upload {filename}: {e}")
100
+ raise
101
+
102
+ # Parallel processing with controlled concurrency
103
+ semaphore = asyncio.Semaphore(5) # Limit concurrent uploads
104
+
105
+ async def upload_with_semaphore(data: List[dict], filename: str) -> dict:
106
+ async with semaphore:
107
+ return await upload_single_dataset(data, filename)
108
+
109
+ tasks = [upload_with_semaphore(data, filename) for data, filename in datasets]
110
+ return await asyncio.gather(*tasks)
111
+
112
+ # ❌ REJECT: Sequential upload processing
113
+ async def upload_multiple_datasets_sequential(
114
+ self,
115
+ datasets: List[Tuple[List[dict], str]],
116
+ output_prefix: str
117
+ ) -> List[dict]:
118
+ """Sequential uploads - should be flagged for parallelization."""
119
+ results = []
120
+ for data, filename in datasets: # FLAG: Could be parallelized
121
+ object_store_key = os.path.join(output_prefix, filename)
122
+ result = await self.object_store.upload_file(data, object_store_key)
123
+ results.append(result)
124
+ return results
125
+ ```
126
+
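+ The resource-management bullets above also call for upload timeouts and temporary-file cleanup, which the parallel example does not show. A minimal sketch (the `upload_single_file` coroutine and the 300-second budget are assumptions for illustration):
+
+ ```python
+ # ✅ DO: Bound upload time and always remove the local temporary file
+ import asyncio
+ import os
+
+ async def upload_temp_file_with_timeout(temp_path: str, filename: str, timeout_seconds: float = 300.0) -> dict:
+     """Upload a local temp file within a time budget, then clean it up."""
+     try:
+         with open(temp_path, "rb") as f:
+             data = f.read()
+         # Cancel the upload if it exceeds the time budget
+         return await asyncio.wait_for(
+             upload_single_file(data, filename),  # assumed helper
+             timeout=timeout_seconds,
+         )
+     finally:
+         # Remove the temp file whether the upload succeeded, failed, or timed out
+         if os.path.exists(temp_path):
+             os.remove(temp_path)
+ ```
+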
127
+ ### Phase 3: Output Testing Requirements
128
+
129
+ **Data Output Testing:**
130
+
131
+ - Test with various data formats and sizes
132
+ - Test serialization and deserialization consistency
133
+ - Test partial upload scenarios and recovery
134
+ - Mock object store operations in unit tests (a pytest sketch follows this list)
135
+ - Include integration tests with real object store
136
+ - Test data corruption detection and prevention
137
+
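+ A minimal unit-test sketch for the mocking bullet above, assuming pytest with pytest-asyncio; the `JsonOutput` class and its `upload_to_object_store` signature are taken from the illustrative example earlier in this document, not from the SDK itself:
+
+ ```python
+ # ✅ DO: Mock the object store and assert the upload destination
+ import os
+ from unittest.mock import AsyncMock
+
+ import pytest
+
+ @pytest.mark.asyncio
+ async def test_upload_respects_output_prefix():
+     output = JsonOutput()
+     output.object_store = AsyncMock()
+     output.object_store.upload_file.return_value = {"status": "ok"}
+
+     await output.upload_to_object_store(
+         data=[{"id": 1}],
+         output_prefix="artifacts/run-42",
+         filename="chunk-0.json",
+     )
+
+     # The upload must target the user-configured prefix, not a derived path
+     output.object_store.upload_file.assert_awaited_once()
+     _, kwargs = output.object_store.upload_file.call_args
+     assert kwargs["destination"] == os.path.join("artifacts/run-42", "chunk-0.json")
+ ```
+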
138
+ **Performance Testing:**
139
+
140
+ - Include tests for large dataset uploads
141
+ - Test memory usage during serialization
142
+ - Test concurrent upload operations
143
+ - Verify timeout handling works correctly
144
+ - Test connection pool behavior under load
145
+
146
+ ### Phase 4: Performance and Scalability
147
+
148
+ **Data Upload Efficiency:**
149
+
150
+ - Use streaming uploads for large datasets
151
+ - Implement proper chunking for oversized data
152
+ - Use compression for large text-based outputs (a gzip sketch follows this list)
153
+ - Monitor upload progress and provide feedback
154
+ - Optimize serialization performance (prefer orjson to the standard library json module)
155
+
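+ A minimal compression sketch for the bullet above (the gzip level and the `.json.gz` naming hint are illustrative choices, not SDK conventions):
+
+ ```python
+ # ✅ DO: Compress large text-based outputs before upload
+ import gzip
+ from typing import List
+
+ import orjson
+
+ def serialize_compressed(records: List[dict]) -> bytes:
+     """Serialize with orjson, then gzip to shrink text-heavy payloads."""
+     raw = orjson.dumps(records, option=orjson.OPT_APPEND_NEWLINE)
+     return gzip.compress(raw, compresslevel=6)
+
+ # Upload the compressed bytes under a key that records the encoding,
+ # e.g. destination=os.path.join(output_prefix, "results.json.gz")
+ ```
+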
156
+ **Object Store Optimization:**
157
+
158
+ - Use connection pooling for object store clients
159
+ - Implement proper retry logic for upload failures (a backoff sketch follows this list)
160
+ - Use parallel uploads where appropriate
161
+ - Monitor upload metrics and error rates
162
+ - Handle bandwidth limitations gracefully
163
+
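+ A minimal retry-with-backoff sketch for the retry bullet above (the `upload_single_file` helper, attempt count, and delays are assumptions for illustration, not SDK defaults; real code should retry only transient errors):
+
+ ```python
+ # ✅ DO: Retry transient upload failures with exponential backoff
+ import asyncio
+
+ async def upload_with_retries(data: bytes, filename: str, max_attempts: int = 3, base_delay_seconds: float = 1.0) -> dict:
+     """Retry an upload a few times before surfacing the error."""
+     for attempt in range(1, max_attempts + 1):
+         try:
+             return await upload_single_file(data, filename)  # assumed helper
+         except Exception as e:
+             if attempt == max_attempts:
+                 raise
+             delay = base_delay_seconds * 2 ** (attempt - 1)
+             logger.warning(
+                 f"Upload of {filename} failed (attempt {attempt}/{max_attempts}): {e}; retrying in {delay:.0f}s"
+             )
+             await asyncio.sleep(delay)
+ ```
+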
164
+ ### Phase 5: Output Maintainability
165
+
166
+ **Error Handling and Recovery:**
167
+
168
+ - Implement comprehensive error handling for all upload operations
169
+ - Provide meaningful error messages with upload context (a logging sketch follows this list)
170
+ - Handle partial upload failures gracefully
171
+ - Implement proper retry logic for transient failures
172
+ - Log all upload operations with destination information
173
+
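+ A minimal error-context sketch for the logging bullets above (`upload_single_file` is the same assumed helper used elsewhere in this guide; the log fields are illustrative):
+
+ ```python
+ # ✅ DO: Attach upload context to both success and failure logs
+ async def upload_with_context(data: bytes, filename: str) -> dict:
+     logger.info(f"Starting upload: destination={filename}, size={len(data)} bytes")
+     try:
+         result = await upload_single_file(data, filename)  # assumed helper
+         logger.info(f"Upload succeeded: destination={filename}")
+         return result
+     except Exception as e:
+         # Include the destination so failures can be traced to a concrete object
+         logger.error(f"Upload failed: destination={filename}, error={e}")
+         raise
+ ```
+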
174
+ **Configuration Management:**
175
+
176
+ - Externalize all output-related configuration
177
+ - Support different output destinations and formats
178
+ - Validate output configuration before processing (a validation sketch follows this list)
179
+ - Document all supported output parameters
180
+ - Handle environment-specific output requirements
181
+
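+ A minimal configuration-validation sketch for the bullets above (the `OutputConfig` dataclass and its fields are illustrative assumptions, not the SDK's configuration model):
+
+ ```python
+ # ✅ DO: Validate output configuration before any data is processed
+ from dataclasses import dataclass
+
+ @dataclass
+ class OutputConfig:
+     output_prefix: str
+     output_format: str = "json"
+     retain_local_copy: bool = False
+
+     SUPPORTED_FORMATS = ("json", "parquet", "iceberg")  # plain class attribute, not a field
+
+     def validate(self) -> None:
+         """Fail fast with a clear message instead of late in the pipeline."""
+         if not self.output_prefix:
+             raise ValueError("output_prefix is required")
+         if self.output_format not in self.SUPPORTED_FORMATS:
+             raise ValueError(
+                 f"Unsupported output_format {self.output_format!r}; expected one of {self.SUPPORTED_FORMATS}"
+             )
+ ```
+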
182
+ ---
183
+
184
+ ## Output-Specific Anti-Patterns
185
+
186
+ **Always Reject:**
187
+
188
+ - **Path derivation errors**: Deriving object store paths from local temporary paths
189
+ - **Sequential uploads**: Uploading multiple files sequentially when parallel uploads are possible
190
+ - **Memory inefficiency**: Loading entire datasets into memory for serialization
191
+ - **Missing upload verification**: Not verifying successful uploads
192
+ - **Poor error recovery**: Not handling partial upload failures gracefully
193
+ - **Resource leaks**: Not cleaning up temporary files or connections
194
+
195
+ **Object Store Upload Anti-Patterns:**
196
+
197
+ ```python
198
+ # ❌ REJECT: Incorrect upload path handling
199
+ class BadOutputProcessor:
200
+ async def upload_results(self, results: List[dict]):
201
+ # Wrong: derives upload path from temporary local path
202
+ local_temp_file = "/tmp/results.json"
203
+ upload_key = get_object_store_prefix(local_temp_file) # Incorrect!
204
+
205
+ await self.object_store.upload_file(results, upload_key)
206
+
207
+ # ✅ REQUIRE: Correct upload path handling
208
+ class GoodOutputProcessor:
209
+ async def upload_results(
210
+ self,
211
+ results: List[dict],
212
+ output_prefix: str, # User-specified destination
213
+ filename: str = "results.json"
214
+ ):
215
+ # Use actual user-configured output location
216
+ upload_key = os.path.join(output_prefix, filename)
217
+
218
+ await self.object_store.upload_file(
219
+ data=orjson.dumps(results),
220
+ destination=upload_key # Correct destination
221
+ )
222
+ ```
223
+
224
+ **Performance Anti-Patterns:**
225
+
226
+ ```python
227
+ # ❌ REJECT: Sequential upload processing
228
+ async def upload_multiple_files_sequential(file_data_pairs: List[Tuple]):
229
+ results = []
230
+ for data, filename in file_data_pairs: # Should be parallelized
231
+ result = await upload_single_file(data, filename)
232
+ results.append(result)
233
+ return results
234
+
235
+ # ✅ REQUIRE: Parallel upload processing with proper error handling
236
+ async def upload_multiple_files_parallel(
237
+ file_data_pairs: List[Tuple],
238
+ max_concurrency: int = 5
239
+ ) -> List[dict]:
240
+ semaphore = asyncio.Semaphore(max_concurrency)
241
+
242
+ async def upload_with_semaphore(data, filename):
243
+ async with semaphore:
244
+ try:
245
+ return await upload_single_file(data, filename)
246
+ except Exception as e:
247
+ logger.error(f"Upload failed for {filename}: {e}")
248
+ return {"filename": filename, "status": "failed", "error": str(e)}
249
+
250
+ tasks = [upload_with_semaphore(data, filename) for data, filename in file_data_pairs]
251
+ return await asyncio.gather(*tasks)
252
+ ```
253
+
254
+ **Memory Management Anti-Patterns:**
255
+
256
+ ```python
257
+ # ❌ REJECT: Loading entire dataset for serialization
258
+ async def bad_large_dataset_upload(large_dataset: List[dict]):
259
+ # Loads entire dataset into memory
260
+ json_data = orjson.dumps(large_dataset) # Could exceed memory limits
261
+ await upload_data(json_data)
262
+
263
+ # ✅ REQUIRE: Streaming serialization for large datasets
264
+ async def good_large_dataset_upload(large_dataset: List[dict], chunk_size: int = 1000):
265
+ """Stream large datasets to avoid memory issues."""
266
+
267
+ async def serialize_chunk(chunk: List[dict]) -> bytes:
268
+ return orjson.dumps(chunk, option=orjson.OPT_APPEND_NEWLINE)
269
+
270
+ # Process in chunks to manage memory
271
+ for i in range(0, len(large_dataset), chunk_size):
272
+ chunk = large_dataset[i:i + chunk_size]
273
+ serialized_chunk = await serialize_chunk(chunk)
274
+
275
+ await upload_chunk(
276
+ data=serialized_chunk,
277
+ chunk_index=i // chunk_size
278
+ )
279
+ ```
280
+
281
+ ## Educational Context for Output Reviews
282
+
283
+ When reviewing output code, emphasize:
284
+
285
+ 1. **Data Integrity Impact**: "Incorrect upload path handling can cause data to be stored in the wrong location, making it inaccessible to downstream processes. This breaks data pipelines and can cause data loss."
286
+
287
+ 2. **Performance Impact**: "Sequential uploads create unnecessary bottlenecks. For enterprise datasets with multiple output files, parallelization can significantly reduce processing time and improve user experience."
288
+
289
+ 3. **Resource Impact**: "Poor memory management during serialization can cause out-of-memory errors with large datasets. Streaming and chunking are essential for enterprise-scale data output."
290
+
291
+ 4. **User Experience Impact**: "Output path errors are often discovered late in processing, causing wasted computation and frustrating delays. Proper validation and clear error messages improve reliability."
292
+
293
+ 5. **Scalability Impact**: "Output patterns that work for small datasets can fail at enterprise scale. Always design output processes to handle the largest expected dataset sizes efficiently."
294
+
295
+ 6. **Data Pipeline Impact**: "Output processing is the final step in data pipelines. Failures here can invalidate all upstream processing work. Robust error handling and verification are critical for pipeline reliability."
@@ -29,6 +29,7 @@ class IcebergOutput(Output):
29
29
  mode: str = "append",
30
30
  total_record_count: int = 0,
31
31
  chunk_count: int = 0,
32
+ retain_local_copy: bool = False,
32
33
  ):
33
34
  """Initialize the Iceberg output class.
34
35
 
@@ -39,6 +40,8 @@ class IcebergOutput(Output):
39
40
  mode (str, optional): Write mode for the iceberg table. Defaults to "append".
40
41
  total_record_count (int, optional): Total record count written to the iceberg table. Defaults to 0.
41
42
  chunk_count (int, optional): Number of chunks written to the iceberg table. Defaults to 0.
43
+ retain_local_copy (bool, optional): Whether to retain the local copy of the files.
44
+ Defaults to False.
42
45
  """
43
46
  self.total_record_count = total_record_count
44
47
  self.chunk_count = chunk_count
@@ -47,6 +50,7 @@ class IcebergOutput(Output):
47
50
  self.iceberg_table = iceberg_table
48
51
  self.mode = mode
49
52
  self.metrics = get_metrics()
53
+ self.retain_local_copy = retain_local_copy
50
54
 
51
55
  async def write_dataframe(self, dataframe: "pd.DataFrame"):
52
56
  """
@@ -93,6 +93,7 @@ class JsonOutput(Output):
93
93
  path_gen: Callable[[int | None, int], str] = path_gen,
94
94
  start_marker: Optional[str] = None,
95
95
  end_marker: Optional[str] = None,
96
+ retain_local_copy: bool = False,
96
97
  **kwargs: Dict[str, Any],
97
98
  ):
98
99
  """Initialize the JSON output handler.
@@ -113,6 +114,8 @@ class JsonOutput(Output):
113
114
  Defaults to 0.
114
115
  path_gen (Callable, optional): Function to generate file paths.
115
116
  Defaults to path_gen function.
117
+ retain_local_copy (bool, optional): Whether to retain the local copy of the files.
118
+ Defaults to False.
116
119
  """
117
120
  self.output_path = output_path
118
121
  self.output_suffix = output_suffix
@@ -133,6 +136,7 @@ class JsonOutput(Output):
133
136
  self.start_marker = start_marker
134
137
  self.end_marker = end_marker
135
138
  self.metrics = get_metrics()
139
+ self.retain_local_copy = retain_local_copy
136
140
 
137
141
  if not self.output_path:
138
142
  raise ValueError("output_path is required")
@@ -282,6 +286,7 @@ class JsonOutput(Output):
282
286
  await ObjectStore.upload_prefix(
283
287
  source=self.output_path,
284
288
  destination=get_object_store_prefix(self.output_path),
289
+ retain_local_copy=self.retain_local_copy,
285
290
  )
286
291
 
287
292
  except Exception as e:
@@ -367,6 +372,7 @@ class JsonOutput(Output):
367
372
  await ObjectStore.upload_file(
368
373
  source=output_file_name,
369
374
  destination=get_object_store_prefix(output_file_name),
375
+ retain_local_copy=self.retain_local_copy,
370
376
  )
371
377
 
372
378
  self.buffer.clear()
@@ -1,5 +1,6 @@
1
1
  import os
2
- from typing import TYPE_CHECKING, List, Literal, Optional, Union
2
+ from enum import Enum
3
+ from typing import TYPE_CHECKING, List, Optional, Union
3
4
 
4
5
  from temporalio import activity
5
6
 
@@ -18,6 +19,14 @@ if TYPE_CHECKING:
18
19
  import pandas as pd
19
20
 
20
21
 
22
+ class WriteMode(Enum):
23
+ """Enumeration of write modes for Parquet output operations."""
24
+
25
+ APPEND = "append"
26
+ OVERWRITE = "overwrite"
27
+ OVERWRITE_PARTITIONS = "overwrite-partitions"
28
+
29
+
21
30
  class ParquetOutput(Output):
22
31
  """Output handler for writing data to Parquet files.
23
32
 
@@ -29,7 +38,6 @@ class ParquetOutput(Output):
29
38
  output_prefix (str): Prefix for files when uploading to object store.
30
39
  output_suffix (str): Suffix for output files.
31
40
  typename (Optional[str]): Type name of the entity e.g database, schema, table.
32
- mode (str): Write mode for parquet files ("append" or "overwrite").
33
41
  chunk_size (int): Maximum number of records per chunk.
34
42
  total_record_count (int): Total number of records processed.
35
43
  chunk_count (int): Number of chunks created.
@@ -45,7 +53,6 @@ class ParquetOutput(Output):
45
53
  output_suffix: str = "",
46
54
  output_prefix: str = "",
47
55
  typename: Optional[str] = None,
48
- write_mode: Literal["append", "overwrite", "overwrite-partitions"] = "append",
49
56
  chunk_size: Optional[int] = 100000,
50
57
  buffer_size: Optional[int] = 100000,
51
58
  total_record_count: int = 0,
@@ -53,6 +60,7 @@ class ParquetOutput(Output):
53
60
  chunk_start: Optional[int] = None,
54
61
  start_marker: Optional[str] = None,
55
62
  end_marker: Optional[str] = None,
63
+ retain_local_copy: bool = False,
56
64
  ):
57
65
  """Initialize the Parquet output handler.
58
66
 
@@ -61,7 +69,6 @@ class ParquetOutput(Output):
61
69
  output_suffix (str): Suffix for output files.
62
70
  output_prefix (str): Prefix for files when uploading to object store.
63
71
  typename (Optional[str], optional): Type name of the entity e.g database, schema, table.
64
- mode (str, optional): Write mode for parquet files. Defaults to "append".
65
72
  chunk_size (int, optional): Maximum records per chunk. Defaults to 100000.
66
73
  total_record_count (int, optional): Initial total record count. Defaults to 0.
67
74
  chunk_count (int, optional): Initial chunk count. Defaults to 0.
@@ -73,12 +80,13 @@ class ParquetOutput(Output):
73
80
  Defaults to None.
74
81
  end_marker (Optional[str], optional): End marker for query extraction.
75
82
  Defaults to None.
83
+ retain_local_copy (bool, optional): Whether to retain the local copy of the files.
84
+ Defaults to False.
76
85
  """
77
86
  self.output_path = output_path
78
87
  self.output_suffix = output_suffix
79
88
  self.output_prefix = output_prefix
80
89
  self.typename = typename
81
- self.write_mode = write_mode
82
90
  self.chunk_size = chunk_size
83
91
  self.buffer_size = buffer_size
84
92
  self.buffer: List[Union["pd.DataFrame", "daft.DataFrame"]] = [] # noqa: F821
@@ -94,6 +102,7 @@ class ParquetOutput(Output):
94
102
  self.end_marker = end_marker
95
103
  self.statistics = []
96
104
  self.metrics = get_metrics()
105
+ self.retain_local_copy = retain_local_copy
97
106
 
98
107
  # Create output directory
99
108
  self.output_path = os.path.join(self.output_path, self.output_suffix)
@@ -103,7 +112,7 @@ class ParquetOutput(Output):
103
112
 
104
113
  def path_gen(
105
114
  self,
106
- chunk_start: int | None = None,
115
+ chunk_start: Optional[int] = None,
107
116
  chunk_count: int = 0,
108
117
  start_marker: Optional[str] = None,
109
118
  end_marker: Optional[str] = None,
@@ -111,7 +120,7 @@ class ParquetOutput(Output):
111
120
  """Generate a file path for a chunk.
112
121
 
113
122
  Args:
114
- chunk_start (int | None): Starting index of the chunk, or None for single chunk.
123
+ chunk_start (Optional[int]): Starting index of the chunk, or None for single chunk.
115
124
  chunk_count (int): Total number of chunks.
116
125
  start_marker (Optional[str]): Start marker for query extraction.
117
126
  end_marker (Optional[str]): End marker for query extraction.
@@ -182,7 +191,7 @@ class ParquetOutput(Output):
182
191
  name="parquet_write_records",
183
192
  value=len(dataframe),
184
193
  metric_type=MetricType.COUNTER,
185
- labels={"type": "pandas", "mode": self.write_mode},
194
+ labels={"type": "pandas", "mode": WriteMode.APPEND.value},
186
195
  description="Number of records written to Parquet files from pandas DataFrame",
187
196
  )
188
197
 
@@ -191,7 +200,7 @@ class ParquetOutput(Output):
191
200
  name="parquet_chunks_written",
192
201
  value=1,
193
202
  metric_type=MetricType.COUNTER,
194
- labels={"type": "pandas", "mode": self.write_mode},
203
+ labels={"type": "pandas", "mode": WriteMode.APPEND.value},
195
204
  description="Number of chunks written to Parquet files",
196
205
  )
197
206
 
@@ -203,69 +212,115 @@ class ParquetOutput(Output):
203
212
  name="parquet_write_errors",
204
213
  value=1,
205
214
  metric_type=MetricType.COUNTER,
206
- labels={"type": "pandas", "mode": self.write_mode, "error": str(e)},
215
+ labels={
216
+ "type": "pandas",
217
+ "mode": WriteMode.APPEND.value,
218
+ "error": str(e),
219
+ },
207
220
  description="Number of errors while writing to Parquet files",
208
221
  )
209
222
  logger.error(f"Error writing pandas dataframe to parquet: {str(e)}")
210
223
  raise
211
224
 
212
- async def write_daft_dataframe(self, dataframe: "daft.DataFrame"): # noqa: F821
225
+ async def write_daft_dataframe(
226
+ self,
227
+ dataframe: "daft.DataFrame", # noqa: F821
228
+ partition_cols: Optional[List] = None,
229
+ write_mode: Union[WriteMode, str] = WriteMode.APPEND,
230
+ morsel_size: int = 100_000,
231
+ ):
213
232
  """Write a daft DataFrame to Parquet files and upload to object store.
214
233
 
234
+ Uses Daft's native file size management to automatically split large DataFrames
235
+ into multiple parquet files based on the configured target file size. Supports
236
+ Hive partitioning for efficient data organization.
237
+
215
238
  Args:
216
239
  dataframe (daft.DataFrame): The DataFrame to write.
240
+ partition_cols (Optional[List]): Column names or expressions to use for Hive partitioning.
241
+ Can be strings (column names) or daft column expressions. If None (default), no partitioning is applied.
242
+ write_mode (Union[WriteMode, str]): Write mode for parquet files.
243
+ Use WriteMode.APPEND, WriteMode.OVERWRITE, WriteMode.OVERWRITE_PARTITIONS, or their string equivalents.
244
+ morsel_size (int): Default number of rows in a morsel used for the new local executor, when running locally on just a single machine,
245
+ Daft does not use partitions. Instead of using partitioning to control parallelism, the local execution engine performs a streaming-based
246
+ execution on small "morsels" of data, which provides much more stable memory utilization while improving the user experience with not having
247
+ to worry about partitioning.
248
+
249
+ Note:
250
+ - Daft automatically handles file chunking based on parquet_target_filesize
251
+ - Multiple files will be created if DataFrame exceeds DAPR limit
252
+ - If partition_cols is set, creates Hive-style directory structure
217
253
  """
218
254
  try:
255
+ import daft
256
+
257
+ # Convert string to enum if needed for backward compatibility
258
+ if isinstance(write_mode, str):
259
+ write_mode = WriteMode(write_mode)
260
+
219
261
  row_count = dataframe.count_rows()
220
262
  if row_count == 0:
221
263
  return
222
264
 
265
+ # Use Daft's execution context for temporary configuration
266
+ with daft.execution_config_ctx(
267
+ parquet_target_filesize=self.max_file_size_bytes,
268
+ default_morsel_size=morsel_size,
269
+ ):
270
+ # Daft automatically handles file splitting and naming
271
+ dataframe.write_parquet(
272
+ root_dir=self.output_path,
273
+ write_mode=write_mode.value,
274
+ partition_cols=partition_cols if partition_cols else [],
275
+ )
276
+
223
277
  # Update counters
224
278
  self.chunk_count += 1
225
279
  self.total_record_count += row_count
226
280
 
227
- # Generate file path using path_gen function
228
- if self.start_marker and self.end_marker:
229
- file_path = self.output_path
230
- else:
231
- file_path = f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count, self.start_marker, self.end_marker)}"
232
-
233
- # Write the dataframe to parquet using daft
234
- dataframe.write_parquet(
235
- file_path,
236
- write_mode=self.write_mode,
237
- )
238
-
239
281
  # Record metrics for successful write
240
282
  self.metrics.record_metric(
241
283
  name="parquet_write_records",
242
284
  value=row_count,
243
285
  metric_type=MetricType.COUNTER,
244
- labels={"type": "daft", "mode": self.write_mode},
286
+ labels={"type": "daft", "mode": write_mode.value},
245
287
  description="Number of records written to Parquet files from daft DataFrame",
246
288
  )
247
289
 
248
- # Record chunk metrics
290
+ # Record operation metrics (note: actual file count may be higher due to Daft's splitting)
249
291
  self.metrics.record_metric(
250
- name="parquet_chunks_written",
292
+ name="parquet_write_operations",
251
293
  value=1,
252
294
  metric_type=MetricType.COUNTER,
253
- labels={"type": "daft", "mode": self.write_mode},
254
- description="Number of chunks written to Parquet files",
295
+ labels={"type": "daft", "mode": write_mode.value},
296
+ description="Number of write operations to Parquet files",
255
297
  )
256
298
 
257
- # Upload the file to object store
258
- await ObjectStore.upload_file(
259
- source=file_path,
260
- destination=get_object_store_prefix(file_path),
299
+ # Upload the entire directory (contains multiple parquet files created by Daft)
300
+ if write_mode == WriteMode.OVERWRITE:
301
+ # Delete the directory from object store
302
+ try:
303
+ await ObjectStore.delete_prefix(
304
+ prefix=get_object_store_prefix(self.output_path)
305
+ )
306
+ except FileNotFoundError as e:
307
+ logger.info(
308
+ f"No files found under prefix {get_object_store_prefix(self.output_path)}: {str(e)}"
309
+ )
310
+
311
+ await ObjectStore.upload_prefix(
312
+ source=self.output_path,
313
+ destination=get_object_store_prefix(self.output_path),
314
+ retain_local_copy=self.retain_local_copy,
261
315
  )
316
+
262
317
  except Exception as e:
263
318
  # Record metrics for failed write
264
319
  self.metrics.record_metric(
265
320
  name="parquet_write_errors",
266
321
  value=1,
267
322
  metric_type=MetricType.COUNTER,
268
- labels={"type": "daft", "mode": self.write_mode, "error": str(e)},
323
+ labels={"type": "daft", "mode": write_mode, "error": str(e)},
269
324
  description="Number of errors while writing to Parquet files",
270
325
  )
271
326
  logger.error(f"Error writing daft dataframe to parquet: {str(e)}")
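
A short, hypothetical usage sketch of the reworked daft write path above; `ParquetOutput`, `WriteMode`, and `retain_local_copy` come from this diff, while the import path, data, and prefix values are illustrative assumptions:

```python
import daft

# ParquetOutput and WriteMode are imported from the SDK's parquet output module (path omitted here)
output = ParquetOutput(
    output_path="./local-output",
    output_prefix="artifacts/run-42",
    retain_local_copy=True,  # new in this release: keep local files after upload
)

df = daft.from_pydict({"id": [1, 2, 3], "region": ["us", "us", "eu"]})

async def write_results() -> None:
    # write_mode accepts the new WriteMode enum or its string value ("overwrite")
    await output.write_daft_dataframe(
        df,
        partition_cols=["region"],       # Hive-style partition directories
        write_mode=WriteMode.OVERWRITE,  # deletes the existing object store prefix before upload
    )
```
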
@@ -279,7 +334,7 @@ class ParquetOutput(Output):
279
334
  """
280
335
  return self.output_path
281
336
 
282
- async def _flush_buffer(self, chunk_part):
337
+ async def _flush_buffer(self, chunk_part: int):
283
338
  """Flush the current buffer to a Parquet file.
284
339
 
285
340
  This method combines all DataFrames in the buffer, writes them to a Parquet file,