atlan-application-sdk 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. application_sdk/activities/common/sql_utils.py +308 -0
  2. application_sdk/activities/common/utils.py +1 -45
  3. application_sdk/activities/metadata_extraction/sql.py +110 -353
  4. application_sdk/activities/query_extraction/sql.py +12 -11
  5. application_sdk/application/__init__.py +1 -1
  6. application_sdk/clients/sql.py +167 -1
  7. application_sdk/clients/temporal.py +6 -6
  8. application_sdk/common/types.py +8 -0
  9. application_sdk/common/utils.py +1 -8
  10. application_sdk/constants.py +1 -1
  11. application_sdk/handlers/sql.py +10 -25
  12. application_sdk/interceptors/events.py +1 -1
  13. application_sdk/io/__init__.py +654 -0
  14. application_sdk/io/json.py +429 -0
  15. application_sdk/{outputs → io}/parquet.py +358 -47
  16. application_sdk/io/utils.py +307 -0
  17. application_sdk/observability/observability.py +23 -12
  18. application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
  19. application_sdk/server/fastapi/middleware/metrics.py +27 -24
  20. application_sdk/server/fastapi/models.py +1 -1
  21. application_sdk/server/fastapi/routers/server.py +1 -1
  22. application_sdk/server/fastapi/utils.py +10 -0
  23. application_sdk/services/eventstore.py +4 -4
  24. application_sdk/services/objectstore.py +30 -7
  25. application_sdk/services/secretstore.py +1 -1
  26. application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
  27. application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
  28. application_sdk/version.py +1 -1
  29. application_sdk/worker.py +1 -1
  30. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/METADATA +9 -11
  31. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/RECORD +36 -43
  32. application_sdk/common/dataframe_utils.py +0 -42
  33. application_sdk/events/__init__.py +0 -5
  34. application_sdk/inputs/.cursor/BUGBOT.md +0 -250
  35. application_sdk/inputs/__init__.py +0 -168
  36. application_sdk/inputs/iceberg.py +0 -75
  37. application_sdk/inputs/json.py +0 -136
  38. application_sdk/inputs/parquet.py +0 -272
  39. application_sdk/inputs/sql_query.py +0 -271
  40. application_sdk/outputs/.cursor/BUGBOT.md +0 -295
  41. application_sdk/outputs/__init__.py +0 -445
  42. application_sdk/outputs/iceberg.py +0 -139
  43. application_sdk/outputs/json.py +0 -268
  44. /application_sdk/{events → interceptors}/models.py +0 -0
  45. /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
  46. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/WHEEL +0 -0
  47. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/LICENSE +0 -0
  48. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/NOTICE +0 -0
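Read together, these renames collapse the former application_sdk.inputs and application_sdk.outputs packages into a single application_sdk.io package; the hunks below show this for application_sdk/{outputs → io}/parquet.py (item 15). A minimal sketch of the corresponding import moves, assuming the new module paths match the file locations listed above (the published 2.0.0 re-exports may differ):

    # 1.1.0 modules removed in 2.0.0
    # from application_sdk.outputs import Output
    # from application_sdk.outputs.parquet import ParquetOutput, WriteMode

    # 2.0.0: the consolidated io package
    from application_sdk.io import DataframeType, Reader, WriteMode, Writer
    from application_sdk.io.parquet import ParquetFileReader, ParquetFileWriter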
@@ -1,21 +1,34 @@
  import inspect
  import os
  import shutil
- from enum import Enum
- from typing import TYPE_CHECKING, AsyncGenerator, Generator, List, Optional, Union, cast
+ from typing import (
+     TYPE_CHECKING,
+     AsyncGenerator,
+     AsyncIterator,
+     Generator,
+     List,
+     Optional,
+     Union,
+     cast,
+ )

  from temporalio import activity

  from application_sdk.activities.common.utils import get_object_store_prefix
- from application_sdk.common.dataframe_utils import is_empty_dataframe
  from application_sdk.constants import (
      DAPR_MAX_GRPC_MESSAGE_LENGTH,
      ENABLE_ATLAN_UPLOAD,
      UPSTREAM_OBJECT_STORE_NAME,
  )
+ from application_sdk.io import DataframeType, Reader, WriteMode, Writer
+ from application_sdk.io.utils import (
+     PARQUET_FILE_EXTENSION,
+     download_files,
+     is_empty_dataframe,
+     path_gen,
+ )
  from application_sdk.observability.logger_adaptor import get_logger
  from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
- from application_sdk.outputs import Output
  from application_sdk.services.objectstore import ObjectStore

  logger = get_logger(__name__)
@@ -26,23 +39,314 @@ if TYPE_CHECKING:
      import pandas as pd


- class WriteMode(Enum):
-     """Enumeration of write modes for Parquet output operations."""
+ class ParquetFileReader(Reader):
+     """
+     Parquet File Reader class to read data from Parquet files using daft and pandas.
+     Supports reading both single files and directories containing multiple parquet files.
+     """
+
+     def __init__(
+         self,
+         path: str,
+         chunk_size: Optional[int] = 100000,
+         buffer_size: Optional[int] = 5000,
+         file_names: Optional[List[str]] = None,
+         dataframe_type: DataframeType = DataframeType.pandas,
+     ):
+         """Initialize the Parquet input class.
+
+         Args:
+             path (str): Path to parquet file or directory containing parquet files.
+                 It accepts both types of paths:
+                 local path or object store path.
+                 Wildcards are not supported.
+             chunk_size (int): Number of rows per batch. Defaults to 100000.
+             buffer_size (int): Number of rows per batch. Defaults to 5000.
+             file_names (Optional[List[str]]): List of file names to read. Defaults to None.
+             dataframe_type (DataframeType): Type of dataframe to read. Defaults to DataframeType.pandas.
+
+         Raises:
+             ValueError: When path is not provided or when single file path is combined with file_names
+         """
+
+         # Validate that single file path and file_names are not both specified
+         if path.endswith(PARQUET_FILE_EXTENSION) and file_names:
+             raise ValueError(
+                 f"Cannot specify both a single file path ('{path}') and file_names filter. "
+                 f"Either provide a directory path with file_names, or specify the exact file path without file_names."
+             )
+
+         self.path = path
+         self.chunk_size = chunk_size
+         self.buffer_size = buffer_size
+         self.file_names = file_names
+         self.dataframe_type = dataframe_type
+
+     async def read(self) -> Union["pd.DataFrame", "daft.DataFrame"]:
+         """
+         Method to read the data from the parquet files in the path
+         and return as a single combined pandas dataframe
+         """
+         if self.dataframe_type == DataframeType.pandas:
+             return await self._get_dataframe()
+         elif self.dataframe_type == DataframeType.daft:
+             return await self._get_daft_dataframe()
+         else:
+             raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+     def read_batches(
+         self,
+     ) -> Union[
+         AsyncIterator["pd.DataFrame"],
+         AsyncIterator["daft.DataFrame"],
+     ]:
+         """
+         Method to read the data from the parquet files in the path
+         and return as a batched pandas dataframe
+         """
+         if self.dataframe_type == DataframeType.pandas:
+             return self._get_batched_dataframe()
+         elif self.dataframe_type == DataframeType.daft:
+             return self._get_batched_daft_dataframe()
+         else:
+             raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+     async def _get_dataframe(self) -> "pd.DataFrame":
+         """Read data from parquet file(s) and return as pandas DataFrame.
+
+         Returns:
+             pd.DataFrame: Combined dataframe from specified parquet files
+
+         Raises:
+             ValueError: When no valid path can be determined or no matching files found
+             Exception: When reading parquet files fails
+
+         Example transformation:
+             Input files:
+             +------------------+
+             | file1.parquet    |
+             | file2.parquet    |
+             | file3.parquet    |
+             +------------------+
+
+             With file_names=["file1.parquet", "file3.parquet"]:
+             +-------+-------+-------+
+             | col1  | col2  | col3  |
+             +-------+-------+-------+
+             | val1  | val2  | val3  |  # from file1.parquet
+             | val7  | val8  | val9  |  # from file3.parquet
+             +-------+-------+-------+
+
+             Transformations:
+             - Only specified files are read and combined
+             - Column schemas must be compatible across files
+             - Only reads files in the specified directory
+         """
+         try:
+             import pandas as pd
+
+             # Ensure files are available (local or downloaded)
+             parquet_files = await download_files(
+                 self.path, PARQUET_FILE_EXTENSION, self.file_names
+             )
+             logger.info(f"Reading {len(parquet_files)} parquet files")
+
+             return pd.concat(
+                 (pd.read_parquet(parquet_file) for parquet_file in parquet_files),
+                 ignore_index=True,
+             )
+         except Exception as e:
+             logger.error(f"Error reading data from parquet file(s): {str(e)}")
+             raise
+
+     async def _get_batched_dataframe(
+         self,
+     ) -> AsyncIterator["pd.DataFrame"]:
+         """Read data from parquet file(s) in batches as pandas DataFrames.
+
+         Returns:
+             AsyncIterator[pd.DataFrame]: Async iterator of pandas dataframes
+
+         Raises:
+             ValueError: When no parquet files found locally or in object store
+             Exception: When reading parquet files fails
+
+         Example transformation:
+             Input files:
+             +------------------+
+             | file1.parquet    |
+             | file2.parquet    |
+             | file3.parquet    |
+             +------------------+
+
+             With file_names=["file1.parquet", "file2.parquet"] and chunk_size=2:
+             Batch 1:
+             +-------+-------+
+             | col1  | col2  |
+             +-------+-------+
+             | val1  | val2  |  # from file1.parquet
+             | val3  | val4  |  # from file1.parquet
+             +-------+-------+
+
+             Batch 2:
+             +-------+-------+
+             | col1  | col2  |
+             +-------+-------+
+             | val5  | val6  |  # from file2.parquet
+             | val7  | val8  |  # from file2.parquet
+             +-------+-------+
+
+             Transformations:
+             - Only specified files are combined then split into chunks
+             - Each batch is a separate DataFrame
+             - Only reads files in the specified directory
+         """
+         try:
+             import pandas as pd
+
+             # Ensure files are available (local or downloaded)
+             parquet_files = await download_files(
+                 self.path, PARQUET_FILE_EXTENSION, self.file_names
+             )
+             logger.info(f"Reading {len(parquet_files)} parquet files in batches")

-     APPEND = "append"
-     OVERWRITE = "overwrite"
-     OVERWRITE_PARTITIONS = "overwrite-partitions"
+             # Process each file individually to maintain memory efficiency
+             for parquet_file in parquet_files:
+                 df = pd.read_parquet(parquet_file)
+                 for i in range(0, len(df), self.chunk_size):
+                     yield df.iloc[i : i + self.chunk_size]  # type: ignore
+         except Exception as e:
+             logger.error(
+                 f"Error reading data from parquet file(s) in batches: {str(e)}"
+             )
+             raise

+     async def _get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
+         """Read data from parquet file(s) and return as daft DataFrame.

- class ParquetOutput(Output):
+         Returns:
+             daft.DataFrame: Combined daft dataframe from specified parquet files
+
+         Raises:
+             ValueError: When no parquet files found locally or in object store
+             Exception: When reading parquet files fails
+
+         Example transformation:
+             Input files:
+             +------------------+
+             | file1.parquet    |
+             | file2.parquet    |
+             | file3.parquet    |
+             +------------------+
+
+             With file_names=["file1.parquet", "file3.parquet"]:
+             +-------+-------+-------+
+             | col1  | col2  | col3  |
+             +-------+-------+-------+
+             | val1  | val2  | val3  |  # from file1.parquet
+             | val7  | val8  | val9  |  # from file3.parquet
+             +-------+-------+-------+
+
+             Transformations:
+             - Only specified parquet files combined into single daft DataFrame
+             - Lazy evaluation for better performance
+             - Column schemas must be compatible across files
+         """
+         try:
+             import daft  # type: ignore
+
+             # Ensure files are available (local or downloaded)
+             parquet_files = await download_files(
+                 self.path, PARQUET_FILE_EXTENSION, self.file_names
+             )
+             logger.info(f"Reading {len(parquet_files)} parquet files with daft")
+
+             # Use the discovered/downloaded files directly
+             return daft.read_parquet(parquet_files)
+         except Exception as e:
+             logger.error(
+                 f"Error reading data from parquet file(s) using daft: {str(e)}"
+             )
+             raise
+
+     async def _get_batched_daft_dataframe(self) -> AsyncIterator["daft.DataFrame"]:  # type: ignore
+         """Get batched daft dataframe from parquet file(s).
+
+         Returns:
+             AsyncIterator[daft.DataFrame]: An async iterator of daft DataFrames, each containing
+                 a batch of data from individual parquet files
+
+         Raises:
+             ValueError: When no parquet files found locally or in object store
+             Exception: When reading parquet files fails
+
+         Example transformation:
+             Input files:
+             +------------------+
+             | file1.parquet    |
+             | file2.parquet    |
+             | file3.parquet    |
+             +------------------+
+
+             With file_names=["file1.parquet", "file3.parquet"]:
+             Batch 1 (file1.parquet):
+             +-------+-------+
+             | col1  | col2  |
+             +-------+-------+
+             | val1  | val2  |
+             | val3  | val4  |
+             +-------+-------+
+
+             Batch 2 (file3.parquet):
+             +-------+-------+
+             | col1  | col2  |
+             +-------+-------+
+             | val7  | val8  |
+             | val9  | val10 |
+             +-------+-------+
+
+             Transformations:
+             - Each specified file becomes a separate daft DataFrame batch
+             - Lazy evaluation for better performance
+             - Files processed individually for memory efficiency
+         """
+         try:
+             import daft  # type: ignore
+
+             # Ensure files are available (local or downloaded)
+             parquet_files = await download_files(
+                 self.path, PARQUET_FILE_EXTENSION, self.file_names
+             )
+             logger.info(f"Reading {len(parquet_files)} parquet files as daft batches")
+
+             # Create a lazy dataframe without loading data into memory
+             lazy_df = daft.read_parquet(parquet_files)
+
+             # Get total count efficiently
+             total_rows = lazy_df.count_rows()
+
+             # Yield chunks without loading everything into memory
+             for offset in range(0, total_rows, self.buffer_size):
+                 chunk = lazy_df.offset(offset).limit(self.buffer_size)
+                 yield chunk
+
+             del lazy_df
+
+         except Exception as error:
+             logger.error(
+                 f"Error reading data from parquet file(s) in batches using daft: {error}"
+             )
+             raise
+
+
+ class ParquetFileWriter(Writer):
      """Output handler for writing data to Parquet files.

      This class handles writing DataFrames to Parquet files with support for chunking
      and automatic uploading to object store.

      Attributes:
-         output_path (str): Base path where Parquet files will be written.
-         output_suffix (str): Suffix for output files.
+         path (str): Base path where Parquet files will be written.
          typename (Optional[str]): Type name of the entity e.g database, schema, table.
          chunk_size (int): Maximum number of records per chunk.
          total_record_count (int): Total number of records processed.
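Before the writer hunks that follow, a usage sketch of the ParquetFileReader introduced above. It relies only on the constructor and the two read methods shown in this hunk; the application_sdk.io.parquet module path is inferred from the file rename, and the paths and file names are placeholders:

    import asyncio

    from application_sdk.io import DataframeType
    from application_sdk.io.parquet import ParquetFileReader


    async def main() -> None:
        reader = ParquetFileReader(
            path="./local/tables",            # directory of parquet files, or a single .parquet file
            file_names=["chunk-0.parquet"],   # optional filter; rejected when path is a single file
            chunk_size=50_000,
            dataframe_type=DataframeType.pandas,
        )

        combined = await reader.read()        # one combined pandas DataFrame
        print(len(combined))

        async for batch in reader.read_batches():
            print(len(batch))                 # each batch is at most chunk_size rows


    asyncio.run(main())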
@@ -54,29 +358,26 @@ class ParquetOutput(Output):
          use_consolidation (bool): Whether to use consolidation.
      """

-     _EXTENSION = ".parquet"
-
      def __init__(
          self,
-         output_path: str = "",
-         output_suffix: str = "",
+         path: str,
          typename: Optional[str] = None,
          chunk_size: Optional[int] = 100000,
-         buffer_size: int = 5000,
-         total_record_count: int = 0,
-         chunk_count: int = 0,
-         chunk_part: int = 0,
+         buffer_size: Optional[int] = 5000,
+         total_record_count: Optional[int] = 0,
+         chunk_count: Optional[int] = 0,
+         chunk_part: Optional[int] = 0,
          chunk_start: Optional[int] = None,
          start_marker: Optional[str] = None,
          end_marker: Optional[str] = None,
-         retain_local_copy: bool = False,
-         use_consolidation: bool = False,
+         retain_local_copy: Optional[bool] = False,
+         use_consolidation: Optional[bool] = False,
+         dataframe_type: DataframeType = DataframeType.pandas,
      ):
          """Initialize the Parquet output handler.

          Args:
-             output_path (str): Base path where Parquet files will be written.
-             output_suffix (str): Suffix for output files.
+             path (str): Base path where Parquet files will be written.
              typename (Optional[str], optional): Type name of the entity e.g database, schema, table.
              chunk_size (int, optional): Maximum records per chunk. Defaults to 100000.
              total_record_count (int, optional): Initial total record count. Defaults to 0.
@@ -91,9 +392,10 @@ class ParquetOutput(Output):
                  Defaults to False.
              use_consolidation (bool, optional): Whether to use consolidation.
                  Defaults to False.
+             dataframe_type (DataframeType, optional): Type of dataframe to write. Defaults to DataframeType.pandas.
          """
-         self.output_path = output_path
-         self.output_suffix = output_suffix
+         self.extension = PARQUET_FILE_EXTENSION
+         self.path = path
          self.typename = typename
          self.chunk_size = chunk_size
          self.buffer_size = buffer_size
@@ -112,6 +414,9 @@ class ParquetOutput(Output):
          self.partitions = []
          self.metrics = get_metrics()
          self.retain_local_copy = retain_local_copy
+         self.dataframe_type = dataframe_type
+         self._is_closed = False
+         self._statistics = None

          # Consolidation-specific attributes
          # Use consolidation to efficiently write parquet files in buffered manner
@@ -128,13 +433,14 @@ class ParquetOutput(Output):
          if self.chunk_start:
              self.chunk_count = self.chunk_start + self.chunk_count

+         if not self.path:
+             raise ValueError("path is required")
          # Create output directory
-         self.output_path = os.path.join(self.output_path, self.output_suffix)
          if self.typename:
-             self.output_path = os.path.join(self.output_path, self.typename)
-         os.makedirs(self.output_path, exist_ok=True)
+             self.path = os.path.join(self.path, self.typename)
+         os.makedirs(self.path, exist_ok=True)

-     async def write_batched_dataframe(
+     async def _write_batched_dataframe(
          self,
          batched_dataframe: Union[
              AsyncGenerator["pd.DataFrame", None], Generator["pd.DataFrame", None, None]
@@ -155,7 +461,7 @@ class ParquetOutput(Output):
          """
          if not self.use_consolidation:
              # Fallback to base class implementation
-             await super().write_batched_dataframe(batched_dataframe)
+             await super()._write_batched_dataframe(batched_dataframe)
              return

          try:
@@ -186,12 +492,13 @@ class ParquetOutput(Output):
              await self._cleanup_temp_folders()  # Cleanup on error
              raise

-     async def write_daft_dataframe(
+     async def _write_daft_dataframe(
          self,
          dataframe: "daft.DataFrame",  # noqa: F821
          partition_cols: Optional[List] = None,
-         write_mode: Union[WriteMode, str] = WriteMode.APPEND,
+         write_mode: Union[WriteMode, str] = WriteMode.APPEND.value,
          morsel_size: int = 100_000,
+         **kwargs,
      ):
          """Write a daft DataFrame to Parquet files and upload to object store.

@@ -234,7 +541,7 @@
          ):
              # Daft automatically handles file splitting and naming
              result = dataframe.write_parquet(
-                 root_dir=self.output_path,
+                 root_dir=self.path,
                  write_mode=write_mode.value,
                  partition_cols=partition_cols,
              )
@@ -267,11 +574,11 @@ class ParquetOutput(Output):
          # Delete the directory from object store
          try:
              await ObjectStore.delete_prefix(
-                 prefix=get_object_store_prefix(self.output_path)
+                 prefix=get_object_store_prefix(self.path)
              )
          except FileNotFoundError as e:
              logger.info(
-                 f"No files found under prefix {get_object_store_prefix(self.output_path)}: {str(e)}"
+                 f"No files found under prefix {get_object_store_prefix(self.path)}: {str(e)}"
              )
          for path in file_paths:
              if ENABLE_ATLAN_UPLOAD:
@@ -311,20 +618,24 @@ class ParquetOutput(Output):
          Returns:
              str: The full path of the output file.
          """
-         return self.output_path
+         return self.path

      # Consolidation helper methods

      def _get_temp_folder_path(self, folder_index: int) -> str:
          """Generate temp folder path consistent with existing structure."""
-         temp_base_path = os.path.join(self.output_path, "temp_accumulation")
+         temp_base_path = os.path.join(self.path, "temp_accumulation")
          return os.path.join(temp_base_path, f"folder-{folder_index}")

      def _get_consolidated_file_path(self, folder_index: int, chunk_part: int) -> str:
          """Generate final consolidated file path using existing path_gen logic."""
          return os.path.join(
-             self.output_path,
-             self.path_gen(chunk_count=folder_index, chunk_part=chunk_part),
+             self.path,
+             path_gen(
+                 chunk_count=folder_index,
+                 chunk_part=chunk_part,
+                 extension=self.extension,
+             ),
          )

      async def _accumulate_dataframe(self, dataframe: "pd.DataFrame"):
@@ -374,14 +685,14 @@ class ParquetOutput(Output):
              [
                  f
                  for f in os.listdir(self.current_temp_folder_path)
-                 if f.endswith(".parquet")
+                 if f.endswith(self.extension)
              ]
          )
-         chunk_file_name = f"chunk-{existing_files}.parquet"
+         chunk_file_name = f"chunk-{existing_files}{self.extension}"
          chunk_file_path = os.path.join(self.current_temp_folder_path, chunk_file_name)

          # Write chunk using existing write_chunk method
-         await self.write_chunk(chunk, chunk_file_path)
+         await self._write_chunk(chunk, chunk_file_path)

      async def _consolidate_current_folder(self):
          """Consolidate current temp folder using Daft."""
@@ -392,7 +703,7 @@ class ParquetOutput(Output):
          import daft

          # Read all parquet files in temp folder
-         pattern = os.path.join(self.current_temp_folder_path, "*.parquet")
+         pattern = os.path.join(self.current_temp_folder_path, f"*{self.extension}")
          daft_df = daft.read_parquet(pattern)
          partitions = 0

@@ -408,7 +719,7 @@ class ParquetOutput(Output):
          result_dict = result.to_pydict()
          partitions = len(result_dict["path"])
          for i, file_path in enumerate(result_dict["path"]):
-             if file_path.endswith(".parquet"):
+             if file_path.endswith(self.extension):
                  consolidated_file_path = self._get_consolidated_file_path(
                      folder_index=self.chunk_count,
                      chunk_part=i,
@@ -462,7 +773,7 @@ class ParquetOutput(Output):
              shutil.rmtree(temp_folder, ignore_errors=True)

          # Clean up base temp directory if it exists and is empty
-         temp_base_path = os.path.join(self.output_path, "temp_accumulation")
+         temp_base_path = os.path.join(self.path, "temp_accumulation")
          if os.path.exists(temp_base_path) and not os.listdir(temp_base_path):
              os.rmdir(temp_base_path)

@@ -475,7 +786,7 @@ class ParquetOutput(Output):
          except Exception as e:
              logger.warning(f"Error cleaning up temp folders: {str(e)}")

-     async def write_chunk(self, chunk: "pd.DataFrame", file_name: str):
+     async def _write_chunk(self, chunk: "pd.DataFrame", file_name: str):
          """Write a chunk to a Parquet file.

          This method writes a chunk to a Parquet file and uploads the file to the object store.
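The writer keeps the 1.1.0 chunking and consolidation behaviour, but its construction changes: path replaces output_path/output_suffix and is now required, dataframe_type is new, and the former public write_* methods gain a leading underscore, which suggests the public write entry point now lives on the base Writer class in application_sdk.io rather than on this subclass. A construction-only sketch under those assumptions, with placeholder paths; the actual write call is omitted because it is not shown in this diff:

    from application_sdk.io import DataframeType
    from application_sdk.io.parquet import ParquetFileWriter

    writer = ParquetFileWriter(
        path="./local/output",            # required in 2.0.0; replaces output_path + output_suffix
        typename="table",                 # still appended to the path, as in 1.1.0
        chunk_size=100_000,
        use_consolidation=True,           # buffered accumulation, consolidated via daft
        dataframe_type=DataframeType.pandas,
    )
    # writer.<write method>(dataframe) is defined on the base Writer class and is
    # not part of this file's diff, so it is deliberately left out here.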