atlan-application-sdk 1.1.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. application_sdk/activities/common/sql_utils.py +312 -0
  2. application_sdk/activities/common/utils.py +1 -45
  3. application_sdk/activities/metadata_extraction/sql.py +110 -353
  4. application_sdk/activities/query_extraction/sql.py +12 -11
  5. application_sdk/application/__init__.py +1 -1
  6. application_sdk/clients/sql.py +167 -1
  7. application_sdk/clients/temporal.py +6 -6
  8. application_sdk/common/types.py +8 -0
  9. application_sdk/common/utils.py +1 -8
  10. application_sdk/constants.py +1 -1
  11. application_sdk/handlers/sql.py +10 -25
  12. application_sdk/interceptors/events.py +1 -1
  13. application_sdk/io/__init__.py +749 -0
  14. application_sdk/io/json.py +473 -0
  15. application_sdk/{outputs → io}/parquet.py +414 -47
  16. application_sdk/io/utils.py +307 -0
  17. application_sdk/observability/observability.py +16 -12
  18. application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
  19. application_sdk/server/fastapi/middleware/metrics.py +27 -24
  20. application_sdk/server/fastapi/models.py +1 -1
  21. application_sdk/server/fastapi/routers/server.py +1 -1
  22. application_sdk/server/fastapi/utils.py +10 -0
  23. application_sdk/services/eventstore.py +4 -4
  24. application_sdk/services/objectstore.py +14 -1
  25. application_sdk/services/secretstore.py +1 -1
  26. application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
  27. application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
  28. application_sdk/version.py +1 -1
  29. application_sdk/worker.py +1 -1
  30. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/METADATA +9 -11
  31. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/RECORD +36 -43
  32. application_sdk/common/dataframe_utils.py +0 -42
  33. application_sdk/events/__init__.py +0 -5
  34. application_sdk/inputs/.cursor/BUGBOT.md +0 -250
  35. application_sdk/inputs/__init__.py +0 -168
  36. application_sdk/inputs/iceberg.py +0 -75
  37. application_sdk/inputs/json.py +0 -136
  38. application_sdk/inputs/parquet.py +0 -272
  39. application_sdk/inputs/sql_query.py +0 -271
  40. application_sdk/outputs/.cursor/BUGBOT.md +0 -295
  41. application_sdk/outputs/__init__.py +0 -453
  42. application_sdk/outputs/iceberg.py +0 -139
  43. application_sdk/outputs/json.py +0 -268
  44. /application_sdk/{events → interceptors}/models.py +0 -0
  45. /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
  46. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/WHEEL +0 -0
  47. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/LICENSE +0 -0
  48. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/NOTICE +0 -0
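The only file expanded below is application_sdk/{outputs → io}/parquet.py (entry 15 above). It captures the main breaking change in 2.1.0: the old inputs/outputs packages are consolidated into application_sdk.io, and ParquetOutput is replaced by ParquetFileWriter alongside a new ParquetFileReader. The following is a minimal migration sketch based only on the constructor signatures and docstring examples visible in the diff; the paths are hypothetical, and the writer's public write methods come from the Writer base class, which this diff does not show.

    import asyncio

    from application_sdk.io import DataframeType
    from application_sdk.io.parquet import ParquetFileReader, ParquetFileWriter


    async def main() -> None:
        # 2.1.0 reader: file-style read/close semantics with async context manager
        # support, per the ParquetFileReader docstring in the diff below.
        async with ParquetFileReader(
            path="/data/input",  # hypothetical local or object-store path
            dataframe_type=DataframeType.pandas,
        ) as reader:
            df = await reader.read()  # single combined pandas DataFrame
            print(len(df))

        # 2.1.0 writer: ParquetOutput(output_path=..., output_suffix=...) becomes
        # ParquetFileWriter(path=...); the suffix is folded into the path, and
        # typename (if given) is still appended to it in __init__.
        writer = ParquetFileWriter(
            path="/data/output/raw",  # hypothetical
            typename="table",
            chunk_size=100_000,
        )
        print(writer.path)  # e.g. /data/output/raw/table


    asyncio.run(main())

Batched reads are also available through reader.read_batches(), as shown in the docstring examples inside the diff.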
@@ -1,21 +1,34 @@
 import inspect
 import os
 import shutil
-from enum import Enum
-from typing import TYPE_CHECKING, AsyncGenerator, Generator, List, Optional, Union, cast
+from typing import (
+    TYPE_CHECKING,
+    AsyncGenerator,
+    AsyncIterator,
+    Generator,
+    List,
+    Optional,
+    Union,
+    cast,
+)

 from temporalio import activity

 from application_sdk.activities.common.utils import get_object_store_prefix
-from application_sdk.common.dataframe_utils import is_empty_dataframe
 from application_sdk.constants import (
     DAPR_MAX_GRPC_MESSAGE_LENGTH,
     ENABLE_ATLAN_UPLOAD,
     UPSTREAM_OBJECT_STORE_NAME,
 )
+from application_sdk.io import DataframeType, Reader, WriteMode, Writer
+from application_sdk.io.utils import (
+    PARQUET_FILE_EXTENSION,
+    download_files,
+    is_empty_dataframe,
+    path_gen,
+)
 from application_sdk.observability.logger_adaptor import get_logger
 from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
-from application_sdk.outputs import Output
 from application_sdk.services.objectstore import ObjectStore

 logger = get_logger(__name__)
@@ -26,23 +39,370 @@ if TYPE_CHECKING:
     import pandas as pd


-class WriteMode(Enum):
-    """Enumeration of write modes for Parquet output operations."""
+class ParquetFileReader(Reader):
+    """Parquet File Reader class to read data from Parquet files using daft and pandas.
+
+    Supports reading both single files and directories containing multiple parquet files.
+    Follows Python's file I/O pattern with read/close semantics and supports context managers.
+
+    Attributes:
+        path (str): Path to parquet file or directory containing parquet files.
+        chunk_size (int): Number of rows per batch.
+        buffer_size (int): Number of rows per batch for daft.
+        file_names (Optional[List[str]]): List of specific file names to read.
+        dataframe_type (DataframeType): Type of dataframe to return (pandas or daft).
+        cleanup_on_close (bool): Whether to clean up downloaded temp files on close.
+
+    Example:
+        Using context manager (recommended)::
+
+            async with ParquetFileReader(path="/data/input") as reader:
+                df = await reader.read()
+                # close() called automatically, temp files cleaned up
+
+        Reading in batches::
+
+            async with ParquetFileReader(path="/data/input", chunk_size=50000) as reader:
+                async for batch in reader.read_batches():
+                    process(batch)
+
+        Using close() explicitly::
+
+            reader = ParquetFileReader(path="/data/input")
+            df = await reader.read()
+            await reader.close()  # Clean up downloaded temp files
+    """
+
+    def __init__(
+        self,
+        path: str,
+        chunk_size: Optional[int] = 100000,
+        buffer_size: Optional[int] = 5000,
+        file_names: Optional[List[str]] = None,
+        dataframe_type: DataframeType = DataframeType.pandas,
+        cleanup_on_close: bool = True,
+    ):
+        """Initialize the Parquet input class.
+
+        Args:
+            path (str): Path to parquet file or directory containing parquet files.
+                It accepts both types of paths:
+                local path or object store path
+                Wildcards are not supported.
+            chunk_size (int): Number of rows per batch. Defaults to 100000.
+            buffer_size (int): Number of rows per batch. Defaults to 5000.
+            file_names (Optional[List[str]]): List of file names to read. Defaults to None.
+            dataframe_type (DataframeType): Type of dataframe to read. Defaults to DataframeType.pandas.
+            cleanup_on_close (bool): Whether to clean up downloaded temp files on close. Defaults to True.
+
+        Raises:
+            ValueError: When path is not provided or when single file path is combined with file_names
+        """
+
+        # Validate that single file path and file_names are not both specified
+        if path.endswith(PARQUET_FILE_EXTENSION) and file_names:
+            raise ValueError(
+                f"Cannot specify both a single file path ('{path}') and file_names filter. "
+                f"Either provide a directory path with file_names, or specify the exact file path without file_names."
+            )
+
+        self.path = path
+        self.chunk_size = chunk_size
+        self.buffer_size = buffer_size
+        self.file_names = file_names
+        self.dataframe_type = dataframe_type
+        self.cleanup_on_close = cleanup_on_close
+        self._is_closed = False
+        self._downloaded_files: List[str] = []
+
+    async def read(self) -> Union["pd.DataFrame", "daft.DataFrame"]:
+        """Read the data from the parquet files and return as a single DataFrame.
+
+        Returns:
+            Union[pd.DataFrame, daft.DataFrame]: Combined dataframe from parquet files.
+
+        Raises:
+            ValueError: If the reader has been closed or dataframe_type is unsupported.
+        """
+        if self._is_closed:
+            raise ValueError("Cannot read from a closed reader")
+
+        if self.dataframe_type == DataframeType.pandas:
+            return await self._get_dataframe()
+        elif self.dataframe_type == DataframeType.daft:
+            return await self._get_daft_dataframe()
+        else:
+            raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+    def read_batches(
+        self,
+    ) -> Union[
+        AsyncIterator["pd.DataFrame"],
+        AsyncIterator["daft.DataFrame"],
+    ]:
+        """Read the data from the parquet files and return as batched DataFrames.
+
+        Returns:
+            Union[AsyncIterator[pd.DataFrame], AsyncIterator[daft.DataFrame]]:
+                Async iterator of DataFrames.
+
+        Raises:
+            ValueError: If the reader has been closed or dataframe_type is unsupported.
+        """
+        if self._is_closed:
+            raise ValueError("Cannot read from a closed reader")
+
+        if self.dataframe_type == DataframeType.pandas:
+            return self._get_batched_dataframe()
+        elif self.dataframe_type == DataframeType.daft:
+            return self._get_batched_daft_dataframe()
+        else:
+            raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+    async def _get_dataframe(self) -> "pd.DataFrame":
+        """Read data from parquet file(s) and return as pandas DataFrame.
+
+        Returns:
+            pd.DataFrame: Combined dataframe from specified parquet files
+
+        Raises:
+            ValueError: When no valid path can be determined or no matching files found
+            Exception: When reading parquet files fails
+
+        Example transformation:
+            Input files:
+                +------------------+
+                | file1.parquet    |
+                | file2.parquet    |
+                | file3.parquet    |
+                +------------------+
+
+            With file_names=["file1.parquet", "file3.parquet"]:
+                +-------+-------+-------+
+                | col1  | col2  | col3  |
+                +-------+-------+-------+
+                | val1  | val2  | val3  |  # from file1.parquet
+                | val7  | val8  | val9  |  # from file3.parquet
+                +-------+-------+-------+
+
+            Transformations:
+                - Only specified files are read and combined
+                - Column schemas must be compatible across files
+                - Only reads files in the specified directory
+        """
+        try:
+            import pandas as pd
+
+            # Ensure files are available (local or downloaded)
+            parquet_files = await download_files(
+                self.path, PARQUET_FILE_EXTENSION, self.file_names
+            )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(parquet_files)
+            logger.info(f"Reading {len(parquet_files)} parquet files")
+
+            return pd.concat(
+                (pd.read_parquet(parquet_file) for parquet_file in parquet_files),
+                ignore_index=True,
+            )
+        except Exception as e:
+            logger.error(f"Error reading data from parquet file(s): {str(e)}")
+            raise
+
+    async def _get_batched_dataframe(
+        self,
+    ) -> AsyncIterator["pd.DataFrame"]:
+        """Read data from parquet file(s) in batches as pandas DataFrames.
+
+        Returns:
+            AsyncIterator[pd.DataFrame]: Async iterator of pandas dataframes
+
+        Raises:
+            ValueError: When no parquet files found locally or in object store
+            Exception: When reading parquet files fails
+
+        Example transformation:
+            Input files:
+                +------------------+
+                | file1.parquet    |
+                | file2.parquet    |
+                | file3.parquet    |
+                +------------------+
+
+            With file_names=["file1.parquet", "file2.parquet"] and chunk_size=2:
+                Batch 1:
+                +-------+-------+
+                | col1  | col2  |
+                +-------+-------+
+                | val1  | val2  |  # from file1.parquet
+                | val3  | val4  |  # from file1.parquet
+                +-------+-------+
+
+                Batch 2:
+                +-------+-------+
+                | col1  | col2  |
+                +-------+-------+
+                | val5  | val6  |  # from file2.parquet
+                | val7  | val8  |  # from file2.parquet
+                +-------+-------+
+
+            Transformations:
+                - Only specified files are combined then split into chunks
+                - Each batch is a separate DataFrame
+                - Only reads files in the specified directory
+        """
+        try:
+            import pandas as pd
+
+            # Ensure files are available (local or downloaded)
+            parquet_files = await download_files(
+                self.path, PARQUET_FILE_EXTENSION, self.file_names
+            )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(parquet_files)
+            logger.info(f"Reading {len(parquet_files)} parquet files in batches")
+
+            # Process each file individually to maintain memory efficiency
+            for parquet_file in parquet_files:
+                df = pd.read_parquet(parquet_file)
+                for i in range(0, len(df), self.chunk_size):
+                    yield df.iloc[i : i + self.chunk_size]  # type: ignore
+        except Exception as e:
+            logger.error(
+                f"Error reading data from parquet file(s) in batches: {str(e)}"
+            )
+            raise
+
+    async def _get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
+        """Read data from parquet file(s) and return as daft DataFrame.
+
+        Returns:
+            daft.DataFrame: Combined daft dataframe from specified parquet files
+
+        Raises:
+            ValueError: When no parquet files found locally or in object store
+            Exception: When reading parquet files fails
+
+        Example transformation:
+            Input files:
+                +------------------+
+                | file1.parquet    |
+                | file2.parquet    |
+                | file3.parquet    |
+                +------------------+
+
+            With file_names=["file1.parquet", "file3.parquet"]:
+                +-------+-------+-------+
+                | col1  | col2  | col3  |
+                +-------+-------+-------+
+                | val1  | val2  | val3  |  # from file1.parquet
+                | val7  | val8  | val9  |  # from file3.parquet
+                +-------+-------+-------+
+
+            Transformations:
+                - Only specified parquet files combined into single daft DataFrame
+                - Lazy evaluation for better performance
+                - Column schemas must be compatible across files
+        """
+        try:
+            import daft  # type: ignore
+
+            # Ensure files are available (local or downloaded)
+            parquet_files = await download_files(
+                self.path, PARQUET_FILE_EXTENSION, self.file_names
+            )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(parquet_files)
+            logger.info(f"Reading {len(parquet_files)} parquet files with daft")
+
+            # Use the discovered/downloaded files directly
+            return daft.read_parquet(parquet_files)
+        except Exception as e:
+            logger.error(
+                f"Error reading data from parquet file(s) using daft: {str(e)}"
+            )
+            raise
+
+    async def _get_batched_daft_dataframe(self) -> AsyncIterator["daft.DataFrame"]:  # type: ignore
+        """Get batched daft dataframe from parquet file(s).
+
+        Returns:
+            AsyncIterator[daft.DataFrame]: An async iterator of daft DataFrames, each containing
+                a batch of data from individual parquet files
+
+        Raises:
+            ValueError: When no parquet files found locally or in object store
+            Exception: When reading parquet files fails
+
+        Example transformation:
+            Input files:
+                +------------------+
+                | file1.parquet    |
+                | file2.parquet    |
+                | file3.parquet    |
+                +------------------+
+
+            With file_names=["file1.parquet", "file3.parquet"]:
+                Batch 1 (file1.parquet):
+                +-------+-------+
+                | col1  | col2  |
+                +-------+-------+
+                | val1  | val2  |
+                | val3  | val4  |
+                +-------+-------+
+
+                Batch 2 (file3.parquet):
+                +-------+-------+
+                | col1  | col2  |
+                +-------+-------+
+                | val7  | val8  |
+                | val9  | val10 |
+                +-------+-------+
+
+            Transformations:
+                - Each specified file becomes a separate daft DataFrame batch
+                - Lazy evaluation for better performance
+                - Files processed individually for memory efficiency
+        """
+        try:
+            import daft  # type: ignore
+
+            # Ensure files are available (local or downloaded)
+            parquet_files = await download_files(
+                self.path, PARQUET_FILE_EXTENSION, self.file_names
+            )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(parquet_files)
+            logger.info(f"Reading {len(parquet_files)} parquet files as daft batches")
+
+            # Create a lazy dataframe without loading data into memory
+            lazy_df = daft.read_parquet(parquet_files)
+
+            # Get total count efficiently
+            total_rows = lazy_df.count_rows()

-    APPEND = "append"
-    OVERWRITE = "overwrite"
-    OVERWRITE_PARTITIONS = "overwrite-partitions"
+            # Yield chunks without loading everything into memory
+            for offset in range(0, total_rows, self.buffer_size):
+                chunk = lazy_df.offset(offset).limit(self.buffer_size)
+                yield chunk

+            del lazy_df

-class ParquetOutput(Output):
+        except Exception as error:
+            logger.error(
+                f"Error reading data from parquet file(s) in batches using daft: {error}"
+            )
+            raise
+
+
+class ParquetFileWriter(Writer):
     """Output handler for writing data to Parquet files.

     This class handles writing DataFrames to Parquet files with support for chunking
     and automatic uploading to object store.

     Attributes:
-        output_path (str): Base path where Parquet files will be written.
-        output_suffix (str): Suffix for output files.
+        path (str): Base path where Parquet files will be written.
         typename (Optional[str]): Type name of the entity e.g database, schema, table.
         chunk_size (int): Maximum number of records per chunk.
         total_record_count (int): Total number of records processed.
@@ -54,29 +414,26 @@ class ParquetOutput(Output):
         use_consolidation (bool): Whether to use consolidation.
     """

-    _EXTENSION = ".parquet"
-
     def __init__(
         self,
-        output_path: str = "",
-        output_suffix: str = "",
+        path: str,
         typename: Optional[str] = None,
         chunk_size: Optional[int] = 100000,
-        buffer_size: int = 5000,
-        total_record_count: int = 0,
-        chunk_count: int = 0,
-        chunk_part: int = 0,
+        buffer_size: Optional[int] = 5000,
+        total_record_count: Optional[int] = 0,
+        chunk_count: Optional[int] = 0,
+        chunk_part: Optional[int] = 0,
         chunk_start: Optional[int] = None,
         start_marker: Optional[str] = None,
         end_marker: Optional[str] = None,
-        retain_local_copy: bool = False,
-        use_consolidation: bool = False,
+        retain_local_copy: Optional[bool] = False,
+        use_consolidation: Optional[bool] = False,
+        dataframe_type: DataframeType = DataframeType.pandas,
     ):
         """Initialize the Parquet output handler.

         Args:
-            output_path (str): Base path where Parquet files will be written.
-            output_suffix (str): Suffix for output files.
+            path (str): Base path where Parquet files will be written.
             typename (Optional[str], optional): Type name of the entity e.g database, schema, table.
             chunk_size (int, optional): Maximum records per chunk. Defaults to 100000.
             total_record_count (int, optional): Initial total record count. Defaults to 0.
@@ -91,9 +448,10 @@ class ParquetOutput(Output):
                 Defaults to False.
             use_consolidation (bool, optional): Whether to use consolidation.
                 Defaults to False.
+            dataframe_type (DataframeType, optional): Type of dataframe to write. Defaults to DataframeType.pandas.
         """
-        self.output_path = output_path
-        self.output_suffix = output_suffix
+        self.extension = PARQUET_FILE_EXTENSION
+        self.path = path
         self.typename = typename
         self.chunk_size = chunk_size
         self.buffer_size = buffer_size
@@ -112,6 +470,9 @@ class ParquetOutput(Output):
         self.partitions = []
         self.metrics = get_metrics()
         self.retain_local_copy = retain_local_copy
+        self.dataframe_type = dataframe_type
+        self._is_closed = False
+        self._statistics = None

         # Consolidation-specific attributes
         # Use consolidation to efficiently write parquet files in buffered manner
@@ -128,13 +489,14 @@ class ParquetOutput(Output):
         if self.chunk_start:
             self.chunk_count = self.chunk_start + self.chunk_count

+        if not self.path:
+            raise ValueError("path is required")
         # Create output directory
-        self.output_path = os.path.join(self.output_path, self.output_suffix)
         if self.typename:
-            self.output_path = os.path.join(self.output_path, self.typename)
-        os.makedirs(self.output_path, exist_ok=True)
+            self.path = os.path.join(self.path, self.typename)
+        os.makedirs(self.path, exist_ok=True)

-    async def write_batched_dataframe(
+    async def _write_batched_dataframe(
         self,
         batched_dataframe: Union[
             AsyncGenerator["pd.DataFrame", None], Generator["pd.DataFrame", None, None]
@@ -155,7 +517,7 @@ class ParquetOutput(Output):
         """
         if not self.use_consolidation:
             # Fallback to base class implementation
-            await super().write_batched_dataframe(batched_dataframe)
+            await super()._write_batched_dataframe(batched_dataframe)
             return

         try:
@@ -186,12 +548,13 @@ class ParquetOutput(Output):
             await self._cleanup_temp_folders()  # Cleanup on error
             raise

-    async def write_daft_dataframe(
+    async def _write_daft_dataframe(
         self,
         dataframe: "daft.DataFrame",  # noqa: F821
         partition_cols: Optional[List] = None,
-        write_mode: Union[WriteMode, str] = WriteMode.APPEND,
+        write_mode: Union[WriteMode, str] = WriteMode.APPEND.value,
         morsel_size: int = 100_000,
+        **kwargs,
     ):
         """Write a daft DataFrame to Parquet files and upload to object store.

@@ -234,7 +597,7 @@ class ParquetOutput(Output):
        ):
            # Daft automatically handles file splitting and naming
            result = dataframe.write_parquet(
-                root_dir=self.output_path,
+                root_dir=self.path,
                write_mode=write_mode.value,
                partition_cols=partition_cols,
            )
@@ -267,11 +630,11 @@ class ParquetOutput(Output):
        # Delete the directory from object store
        try:
            await ObjectStore.delete_prefix(
-                prefix=get_object_store_prefix(self.output_path)
+                prefix=get_object_store_prefix(self.path)
            )
        except FileNotFoundError as e:
            logger.info(
-                f"No files found under prefix {get_object_store_prefix(self.output_path)}: {str(e)}"
+                f"No files found under prefix {get_object_store_prefix(self.path)}: {str(e)}"
            )
        for path in file_paths:
            if ENABLE_ATLAN_UPLOAD:
@@ -311,20 +674,24 @@ class ParquetOutput(Output):
         Returns:
             str: The full path of the output file.
         """
-        return self.output_path
+        return self.path

     # Consolidation helper methods

     def _get_temp_folder_path(self, folder_index: int) -> str:
         """Generate temp folder path consistent with existing structure."""
-        temp_base_path = os.path.join(self.output_path, "temp_accumulation")
+        temp_base_path = os.path.join(self.path, "temp_accumulation")
         return os.path.join(temp_base_path, f"folder-{folder_index}")

     def _get_consolidated_file_path(self, folder_index: int, chunk_part: int) -> str:
         """Generate final consolidated file path using existing path_gen logic."""
         return os.path.join(
-            self.output_path,
-            self.path_gen(chunk_count=folder_index, chunk_part=chunk_part),
+            self.path,
+            path_gen(
+                chunk_count=folder_index,
+                chunk_part=chunk_part,
+                extension=self.extension,
+            ),
         )

     async def _accumulate_dataframe(self, dataframe: "pd.DataFrame"):
@@ -374,14 +741,14 @@ class ParquetOutput(Output):
             [
                 f
                 for f in os.listdir(self.current_temp_folder_path)
-                if f.endswith(".parquet")
+                if f.endswith(self.extension)
             ]
         )
-        chunk_file_name = f"chunk-{existing_files}.parquet"
+        chunk_file_name = f"chunk-{existing_files}{self.extension}"
         chunk_file_path = os.path.join(self.current_temp_folder_path, chunk_file_name)

         # Write chunk using existing write_chunk method
-        await self.write_chunk(chunk, chunk_file_path)
+        await self._write_chunk(chunk, chunk_file_path)

     async def _consolidate_current_folder(self):
         """Consolidate current temp folder using Daft."""
@@ -392,7 +759,7 @@ class ParquetOutput(Output):
         import daft

         # Read all parquet files in temp folder
-        pattern = os.path.join(self.current_temp_folder_path, "*.parquet")
+        pattern = os.path.join(self.current_temp_folder_path, f"*{self.extension}")
         daft_df = daft.read_parquet(pattern)
         partitions = 0

@@ -408,7 +775,7 @@ class ParquetOutput(Output):
         result_dict = result.to_pydict()
         partitions = len(result_dict["path"])
         for i, file_path in enumerate(result_dict["path"]):
-            if file_path.endswith(".parquet"):
+            if file_path.endswith(self.extension):
                 consolidated_file_path = self._get_consolidated_file_path(
                     folder_index=self.chunk_count,
                     chunk_part=i,
@@ -462,7 +829,7 @@ class ParquetOutput(Output):
            shutil.rmtree(temp_folder, ignore_errors=True)

        # Clean up base temp directory if it exists and is empty
-        temp_base_path = os.path.join(self.output_path, "temp_accumulation")
+        temp_base_path = os.path.join(self.path, "temp_accumulation")
        if os.path.exists(temp_base_path) and not os.listdir(temp_base_path):
            os.rmdir(temp_base_path)

@@ -475,7 +842,7 @@ class ParquetOutput(Output):
         except Exception as e:
             logger.warning(f"Error cleaning up temp folders: {str(e)}")

-    async def write_chunk(self, chunk: "pd.DataFrame", file_name: str):
+    async def _write_chunk(self, chunk: "pd.DataFrame", file_name: str):
         """Write a chunk to a Parquet file.

         This method writes a chunk to a Parquet file and uploads the file to the object store.