atlan-application-sdk 1.1.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/common/sql_utils.py +312 -0
- application_sdk/activities/common/utils.py +1 -45
- application_sdk/activities/metadata_extraction/sql.py +110 -353
- application_sdk/activities/query_extraction/sql.py +12 -11
- application_sdk/application/__init__.py +1 -1
- application_sdk/clients/sql.py +167 -1
- application_sdk/clients/temporal.py +6 -6
- application_sdk/common/types.py +8 -0
- application_sdk/common/utils.py +1 -8
- application_sdk/constants.py +1 -1
- application_sdk/handlers/sql.py +10 -25
- application_sdk/interceptors/events.py +1 -1
- application_sdk/io/__init__.py +749 -0
- application_sdk/io/json.py +473 -0
- application_sdk/{outputs → io}/parquet.py +414 -47
- application_sdk/io/utils.py +307 -0
- application_sdk/observability/observability.py +16 -12
- application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
- application_sdk/server/fastapi/middleware/metrics.py +27 -24
- application_sdk/server/fastapi/models.py +1 -1
- application_sdk/server/fastapi/routers/server.py +1 -1
- application_sdk/server/fastapi/utils.py +10 -0
- application_sdk/services/eventstore.py +4 -4
- application_sdk/services/objectstore.py +14 -1
- application_sdk/services/secretstore.py +1 -1
- application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
- application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
- application_sdk/version.py +1 -1
- application_sdk/worker.py +1 -1
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/METADATA +9 -11
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/RECORD +36 -43
- application_sdk/common/dataframe_utils.py +0 -42
- application_sdk/events/__init__.py +0 -5
- application_sdk/inputs/.cursor/BUGBOT.md +0 -250
- application_sdk/inputs/__init__.py +0 -168
- application_sdk/inputs/iceberg.py +0 -75
- application_sdk/inputs/json.py +0 -136
- application_sdk/inputs/parquet.py +0 -272
- application_sdk/inputs/sql_query.py +0 -271
- application_sdk/outputs/.cursor/BUGBOT.md +0 -295
- application_sdk/outputs/__init__.py +0 -453
- application_sdk/outputs/iceberg.py +0 -139
- application_sdk/outputs/json.py +0 -268
- /application_sdk/{events → interceptors}/models.py +0 -0
- /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/NOTICE +0 -0
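
The structural change behind most of this diff is the replacement of the `application_sdk/inputs` and `application_sdk/outputs` packages with a single `application_sdk/io` package (plus `io/utils.py`), with `parquet.py` moving from `outputs` to `io`. As orientation only, the docstrings in the parquet diff below suggest the relocated reader is used roughly as in the following sketch; the input path and the `process_batch` helper are illustrative assumptions, not part of the package.

    # Hedged sketch based on the ParquetFileReader docstrings in the diff below.
    # The path and process_batch() are illustrative assumptions only.
    import asyncio

    from application_sdk.io import DataframeType
    from application_sdk.io.parquet import ParquetFileReader


    def process_batch(df) -> None:
        # Placeholder: a real handler would transform or upload the batch.
        print(f"got {len(df)} rows")


    async def main() -> None:
        # The reader supports async context management; close() cleans up
        # any temp files it downloaded from the object store.
        async with ParquetFileReader(
            path="./local/tmp/raw/database",      # local or object store path, no wildcards
            chunk_size=50_000,                    # rows per pandas batch
            dataframe_type=DataframeType.pandas,
        ) as reader:
            async for batch in reader.read_batches():
                process_batch(batch)


    asyncio.run(main())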
@@ -1,21 +1,34 @@
 import inspect
 import os
 import shutil
-from
-
+from typing import (
+    TYPE_CHECKING,
+    AsyncGenerator,
+    AsyncIterator,
+    Generator,
+    List,
+    Optional,
+    Union,
+    cast,
+)
 
 from temporalio import activity
 
 from application_sdk.activities.common.utils import get_object_store_prefix
-from application_sdk.common.dataframe_utils import is_empty_dataframe
 from application_sdk.constants import (
     DAPR_MAX_GRPC_MESSAGE_LENGTH,
     ENABLE_ATLAN_UPLOAD,
     UPSTREAM_OBJECT_STORE_NAME,
 )
+from application_sdk.io import DataframeType, Reader, WriteMode, Writer
+from application_sdk.io.utils import (
+    PARQUET_FILE_EXTENSION,
+    download_files,
+    is_empty_dataframe,
+    path_gen,
+)
 from application_sdk.observability.logger_adaptor import get_logger
 from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
-from application_sdk.outputs import Output
 from application_sdk.services.objectstore import ObjectStore
 
 logger = get_logger(__name__)
@@ -26,23 +39,370 @@ if TYPE_CHECKING:
     import pandas as pd
 
 
-class
-"""
+class ParquetFileReader(Reader):
+    """Parquet File Reader class to read data from Parquet files using daft and pandas.
+
+    Supports reading both single files and directories containing multiple parquet files.
+    Follows Python's file I/O pattern with read/close semantics and supports context managers.
+
+    Attributes:
+        path (str): Path to parquet file or directory containing parquet files.
+        chunk_size (int): Number of rows per batch.
+        buffer_size (int): Number of rows per batch for daft.
+        file_names (Optional[List[str]]): List of specific file names to read.
+        dataframe_type (DataframeType): Type of dataframe to return (pandas or daft).
+        cleanup_on_close (bool): Whether to clean up downloaded temp files on close.
+
+    Example:
+        Using context manager (recommended)::
+
+            async with ParquetFileReader(path="/data/input") as reader:
+                df = await reader.read()
+            # close() called automatically, temp files cleaned up
+
+        Reading in batches::
+
+            async with ParquetFileReader(path="/data/input", chunk_size=50000) as reader:
+                async for batch in reader.read_batches():
+                    process(batch)
+
+        Using close() explicitly::
+
+            reader = ParquetFileReader(path="/data/input")
+            df = await reader.read()
+            await reader.close()  # Clean up downloaded temp files
+    """
+
+    def __init__(
+        self,
+        path: str,
+        chunk_size: Optional[int] = 100000,
+        buffer_size: Optional[int] = 5000,
+        file_names: Optional[List[str]] = None,
+        dataframe_type: DataframeType = DataframeType.pandas,
+        cleanup_on_close: bool = True,
+    ):
+        """Initialize the Parquet input class.
+
+        Args:
+            path (str): Path to parquet file or directory containing parquet files.
+                It accepts both types of paths:
+                local path or object store path
+                Wildcards are not supported.
+            chunk_size (int): Number of rows per batch. Defaults to 100000.
+            buffer_size (int): Number of rows per batch. Defaults to 5000.
+            file_names (Optional[List[str]]): List of file names to read. Defaults to None.
+            dataframe_type (DataframeType): Type of dataframe to read. Defaults to DataframeType.pandas.
+            cleanup_on_close (bool): Whether to clean up downloaded temp files on close. Defaults to True.
+
+        Raises:
+            ValueError: When path is not provided or when single file path is combined with file_names
+        """
+
+        # Validate that single file path and file_names are not both specified
+        if path.endswith(PARQUET_FILE_EXTENSION) and file_names:
+            raise ValueError(
+                f"Cannot specify both a single file path ('{path}') and file_names filter. "
+                f"Either provide a directory path with file_names, or specify the exact file path without file_names."
+            )
+
+        self.path = path
+        self.chunk_size = chunk_size
+        self.buffer_size = buffer_size
+        self.file_names = file_names
+        self.dataframe_type = dataframe_type
+        self.cleanup_on_close = cleanup_on_close
+        self._is_closed = False
+        self._downloaded_files: List[str] = []
+
+    async def read(self) -> Union["pd.DataFrame", "daft.DataFrame"]:
+        """Read the data from the parquet files and return as a single DataFrame.
+
+        Returns:
+            Union[pd.DataFrame, daft.DataFrame]: Combined dataframe from parquet files.
+
+        Raises:
+            ValueError: If the reader has been closed or dataframe_type is unsupported.
+        """
+        if self._is_closed:
+            raise ValueError("Cannot read from a closed reader")
+
+        if self.dataframe_type == DataframeType.pandas:
+            return await self._get_dataframe()
+        elif self.dataframe_type == DataframeType.daft:
+            return await self._get_daft_dataframe()
+        else:
+            raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+    def read_batches(
+        self,
+    ) -> Union[
+        AsyncIterator["pd.DataFrame"],
+        AsyncIterator["daft.DataFrame"],
+    ]:
+        """Read the data from the parquet files and return as batched DataFrames.
+
+        Returns:
+            Union[AsyncIterator[pd.DataFrame], AsyncIterator[daft.DataFrame]]:
+                Async iterator of DataFrames.
+
+        Raises:
+            ValueError: If the reader has been closed or dataframe_type is unsupported.
+        """
+        if self._is_closed:
+            raise ValueError("Cannot read from a closed reader")
+
+        if self.dataframe_type == DataframeType.pandas:
+            return self._get_batched_dataframe()
+        elif self.dataframe_type == DataframeType.daft:
+            return self._get_batched_daft_dataframe()
+        else:
+            raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+    async def _get_dataframe(self) -> "pd.DataFrame":
+        """Read data from parquet file(s) and return as pandas DataFrame.
+
+        Returns:
+            pd.DataFrame: Combined dataframe from specified parquet files
+
+        Raises:
+            ValueError: When no valid path can be determined or no matching files found
+            Exception: When reading parquet files fails
+
+        Example transformation:
+            Input files:
+                +------------------+
+                | file1.parquet    |
+                | file2.parquet    |
+                | file3.parquet    |
+                +------------------+
+
+            With file_names=["file1.parquet", "file3.parquet"]:
+                +-------+-------+-------+
+                | col1  | col2  | col3  |
+                +-------+-------+-------+
+                | val1  | val2  | val3  |  # from file1.parquet
+                | val7  | val8  | val9  |  # from file3.parquet
+                +-------+-------+-------+
+
+            Transformations:
+                - Only specified files are read and combined
+                - Column schemas must be compatible across files
+                - Only reads files in the specified directory
+        """
+        try:
+            import pandas as pd
+
+            # Ensure files are available (local or downloaded)
+            parquet_files = await download_files(
+                self.path, PARQUET_FILE_EXTENSION, self.file_names
+            )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(parquet_files)
+            logger.info(f"Reading {len(parquet_files)} parquet files")
+
+            return pd.concat(
+                (pd.read_parquet(parquet_file) for parquet_file in parquet_files),
+                ignore_index=True,
+            )
+        except Exception as e:
+            logger.error(f"Error reading data from parquet file(s): {str(e)}")
+            raise
+
+    async def _get_batched_dataframe(
+        self,
+    ) -> AsyncIterator["pd.DataFrame"]:
+        """Read data from parquet file(s) in batches as pandas DataFrames.
+
+        Returns:
+            AsyncIterator[pd.DataFrame]: Async iterator of pandas dataframes
+
+        Raises:
+            ValueError: When no parquet files found locally or in object store
+            Exception: When reading parquet files fails
+
+        Example transformation:
+            Input files:
+                +------------------+
+                | file1.parquet    |
+                | file2.parquet    |
+                | file3.parquet    |
+                +------------------+
+
+            With file_names=["file1.parquet", "file2.parquet"] and chunk_size=2:
+                Batch 1:
+                +-------+-------+
+                | col1  | col2  |
+                +-------+-------+
+                | val1  | val2  |  # from file1.parquet
+                | val3  | val4  |  # from file1.parquet
+                +-------+-------+
+
+                Batch 2:
+                +-------+-------+
+                | col1  | col2  |
+                +-------+-------+
+                | val5  | val6  |  # from file2.parquet
+                | val7  | val8  |  # from file2.parquet
+                +-------+-------+
+
+            Transformations:
+                - Only specified files are combined then split into chunks
+                - Each batch is a separate DataFrame
+                - Only reads files in the specified directory
+        """
+        try:
+            import pandas as pd
+
+            # Ensure files are available (local or downloaded)
+            parquet_files = await download_files(
+                self.path, PARQUET_FILE_EXTENSION, self.file_names
+            )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(parquet_files)
+            logger.info(f"Reading {len(parquet_files)} parquet files in batches")
+
+            # Process each file individually to maintain memory efficiency
+            for parquet_file in parquet_files:
+                df = pd.read_parquet(parquet_file)
+                for i in range(0, len(df), self.chunk_size):
+                    yield df.iloc[i : i + self.chunk_size]  # type: ignore
+        except Exception as e:
+            logger.error(
+                f"Error reading data from parquet file(s) in batches: {str(e)}"
+            )
+            raise
+
+    async def _get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
+        """Read data from parquet file(s) and return as daft DataFrame.
+
+        Returns:
+            daft.DataFrame: Combined daft dataframe from specified parquet files
+
+        Raises:
+            ValueError: When no parquet files found locally or in object store
+            Exception: When reading parquet files fails
+
+        Example transformation:
+            Input files:
+                +------------------+
+                | file1.parquet    |
+                | file2.parquet    |
+                | file3.parquet    |
+                +------------------+
+
+            With file_names=["file1.parquet", "file3.parquet"]:
+                +-------+-------+-------+
+                | col1  | col2  | col3  |
+                +-------+-------+-------+
+                | val1  | val2  | val3  |  # from file1.parquet
+                | val7  | val8  | val9  |  # from file3.parquet
+                +-------+-------+-------+
+
+            Transformations:
+                - Only specified parquet files combined into single daft DataFrame
+                - Lazy evaluation for better performance
+                - Column schemas must be compatible across files
+        """
+        try:
+            import daft  # type: ignore
+
+            # Ensure files are available (local or downloaded)
+            parquet_files = await download_files(
+                self.path, PARQUET_FILE_EXTENSION, self.file_names
+            )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(parquet_files)
+            logger.info(f"Reading {len(parquet_files)} parquet files with daft")
+
+            # Use the discovered/downloaded files directly
+            return daft.read_parquet(parquet_files)
+        except Exception as e:
+            logger.error(
+                f"Error reading data from parquet file(s) using daft: {str(e)}"
+            )
+            raise
+
+    async def _get_batched_daft_dataframe(self) -> AsyncIterator["daft.DataFrame"]:  # type: ignore
+        """Get batched daft dataframe from parquet file(s).
+
+        Returns:
+            AsyncIterator[daft.DataFrame]: An async iterator of daft DataFrames, each containing
+                a batch of data from individual parquet files
+
+        Raises:
+            ValueError: When no parquet files found locally or in object store
+            Exception: When reading parquet files fails
+
+        Example transformation:
+            Input files:
+                +------------------+
+                | file1.parquet    |
+                | file2.parquet    |
+                | file3.parquet    |
+                +------------------+
+
+            With file_names=["file1.parquet", "file3.parquet"]:
+                Batch 1 (file1.parquet):
+                +-------+-------+
+                | col1  | col2  |
+                +-------+-------+
+                | val1  | val2  |
+                | val3  | val4  |
+                +-------+-------+
+
+                Batch 2 (file3.parquet):
+                +-------+-------+
+                | col1  | col2  |
+                +-------+-------+
+                | val7  | val8  |
+                | val9  | val10 |
+                +-------+-------+
+
+            Transformations:
+                - Each specified file becomes a separate daft DataFrame batch
+                - Lazy evaluation for better performance
+                - Files processed individually for memory efficiency
+        """
+        try:
+            import daft  # type: ignore
+
+            # Ensure files are available (local or downloaded)
+            parquet_files = await download_files(
+                self.path, PARQUET_FILE_EXTENSION, self.file_names
+            )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(parquet_files)
+            logger.info(f"Reading {len(parquet_files)} parquet files as daft batches")
+
+            # Create a lazy dataframe without loading data into memory
+            lazy_df = daft.read_parquet(parquet_files)
+
+            # Get total count efficiently
+            total_rows = lazy_df.count_rows()
 
-
-
-
+            # Yield chunks without loading everything into memory
+            for offset in range(0, total_rows, self.buffer_size):
+                chunk = lazy_df.offset(offset).limit(self.buffer_size)
+                yield chunk
 
+            del lazy_df
 
-
+        except Exception as error:
+            logger.error(
+                f"Error reading data from parquet file(s) in batches using daft: {error}"
+            )
+            raise
+
+
+class ParquetFileWriter(Writer):
     """Output handler for writing data to Parquet files.
 
     This class handles writing DataFrames to Parquet files with support for chunking
     and automatic uploading to object store.
 
     Attributes:
-
-        output_suffix (str): Suffix for output files.
+        path (str): Base path where Parquet files will be written.
         typename (Optional[str]): Type name of the entity e.g database, schema, table.
         chunk_size (int): Maximum number of records per chunk.
         total_record_count (int): Total number of records processed.
@@ -54,29 +414,26 @@ class ParquetOutput(Output):
         use_consolidation (bool): Whether to use consolidation.
     """
 
-    _EXTENSION = ".parquet"
-
     def __init__(
         self,
-
-        output_suffix: str = "",
+        path: str,
         typename: Optional[str] = None,
         chunk_size: Optional[int] = 100000,
-        buffer_size: int = 5000,
-        total_record_count: int = 0,
-        chunk_count: int = 0,
-        chunk_part: int = 0,
+        buffer_size: Optional[int] = 5000,
+        total_record_count: Optional[int] = 0,
+        chunk_count: Optional[int] = 0,
+        chunk_part: Optional[int] = 0,
         chunk_start: Optional[int] = None,
         start_marker: Optional[str] = None,
         end_marker: Optional[str] = None,
-        retain_local_copy: bool = False,
-        use_consolidation: bool = False,
+        retain_local_copy: Optional[bool] = False,
+        use_consolidation: Optional[bool] = False,
+        dataframe_type: DataframeType = DataframeType.pandas,
     ):
         """Initialize the Parquet output handler.
 
         Args:
-
-            output_suffix (str): Suffix for output files.
+            path (str): Base path where Parquet files will be written.
             typename (Optional[str], optional): Type name of the entity e.g database, schema, table.
             chunk_size (int, optional): Maximum records per chunk. Defaults to 100000.
             total_record_count (int, optional): Initial total record count. Defaults to 0.
@@ -91,9 +448,10 @@ class ParquetOutput(Output):
                 Defaults to False.
             use_consolidation (bool, optional): Whether to use consolidation.
                 Defaults to False.
+            dataframe_type (DataframeType, optional): Type of dataframe to write. Defaults to DataframeType.pandas.
         """
-        self.
-        self.
+        self.extension = PARQUET_FILE_EXTENSION
+        self.path = path
         self.typename = typename
         self.chunk_size = chunk_size
         self.buffer_size = buffer_size
@@ -112,6 +470,9 @@ class ParquetOutput(Output):
         self.partitions = []
         self.metrics = get_metrics()
         self.retain_local_copy = retain_local_copy
+        self.dataframe_type = dataframe_type
+        self._is_closed = False
+        self._statistics = None
 
         # Consolidation-specific attributes
         # Use consolidation to efficiently write parquet files in buffered manner
@@ -128,13 +489,14 @@ class ParquetOutput(Output):
         if self.chunk_start:
             self.chunk_count = self.chunk_start + self.chunk_count
 
+        if not self.path:
+            raise ValueError("path is required")
         # Create output directory
-        self.output_path = os.path.join(self.output_path, self.output_suffix)
         if self.typename:
-            self.
-            os.makedirs(self.
+            self.path = os.path.join(self.path, self.typename)
+            os.makedirs(self.path, exist_ok=True)
 
-    async def
+    async def _write_batched_dataframe(
         self,
         batched_dataframe: Union[
             AsyncGenerator["pd.DataFrame", None], Generator["pd.DataFrame", None, None]
@@ -155,7 +517,7 @@ class ParquetOutput(Output):
         """
         if not self.use_consolidation:
             # Fallback to base class implementation
-            await super().
+            await super()._write_batched_dataframe(batched_dataframe)
             return
 
         try:
@@ -186,12 +548,13 @@ class ParquetOutput(Output):
             await self._cleanup_temp_folders()  # Cleanup on error
             raise
 
-    async def
+    async def _write_daft_dataframe(
         self,
         dataframe: "daft.DataFrame",  # noqa: F821
         partition_cols: Optional[List] = None,
-        write_mode: Union[WriteMode, str] = WriteMode.APPEND,
+        write_mode: Union[WriteMode, str] = WriteMode.APPEND.value,
         morsel_size: int = 100_000,
+        **kwargs,
     ):
         """Write a daft DataFrame to Parquet files and upload to object store.
 
@@ -234,7 +597,7 @@ class ParquetOutput(Output):
         ):
             # Daft automatically handles file splitting and naming
             result = dataframe.write_parquet(
-                root_dir=self.
+                root_dir=self.path,
                 write_mode=write_mode.value,
                 partition_cols=partition_cols,
             )
@@ -267,11 +630,11 @@ class ParquetOutput(Output):
         # Delete the directory from object store
         try:
             await ObjectStore.delete_prefix(
-                prefix=get_object_store_prefix(self.
+                prefix=get_object_store_prefix(self.path)
             )
         except FileNotFoundError as e:
             logger.info(
-                f"No files found under prefix {get_object_store_prefix(self.
+                f"No files found under prefix {get_object_store_prefix(self.path)}: {str(e)}"
             )
         for path in file_paths:
             if ENABLE_ATLAN_UPLOAD:
@@ -311,20 +674,24 @@ class ParquetOutput(Output):
         Returns:
             str: The full path of the output file.
         """
-        return self.
+        return self.path
 
     # Consolidation helper methods
 
     def _get_temp_folder_path(self, folder_index: int) -> str:
         """Generate temp folder path consistent with existing structure."""
-        temp_base_path = os.path.join(self.
+        temp_base_path = os.path.join(self.path, "temp_accumulation")
         return os.path.join(temp_base_path, f"folder-{folder_index}")
 
     def _get_consolidated_file_path(self, folder_index: int, chunk_part: int) -> str:
         """Generate final consolidated file path using existing path_gen logic."""
         return os.path.join(
-            self.
-
+            self.path,
+            path_gen(
+                chunk_count=folder_index,
+                chunk_part=chunk_part,
+                extension=self.extension,
+            ),
         )
 
     async def _accumulate_dataframe(self, dataframe: "pd.DataFrame"):
@@ -374,14 +741,14 @@ class ParquetOutput(Output):
             [
                 f
                 for f in os.listdir(self.current_temp_folder_path)
-                if f.endswith(
+                if f.endswith(self.extension)
             ]
         )
-        chunk_file_name = f"chunk-{existing_files}.
+        chunk_file_name = f"chunk-{existing_files}{self.extension}"
         chunk_file_path = os.path.join(self.current_temp_folder_path, chunk_file_name)
 
         # Write chunk using existing write_chunk method
-        await self.
+        await self._write_chunk(chunk, chunk_file_path)
 
     async def _consolidate_current_folder(self):
         """Consolidate current temp folder using Daft."""
@@ -392,7 +759,7 @@ class ParquetOutput(Output):
         import daft
 
         # Read all parquet files in temp folder
-        pattern = os.path.join(self.current_temp_folder_path, "
+        pattern = os.path.join(self.current_temp_folder_path, f"*{self.extension}")
         daft_df = daft.read_parquet(pattern)
         partitions = 0
 
@@ -408,7 +775,7 @@ class ParquetOutput(Output):
             result_dict = result.to_pydict()
             partitions = len(result_dict["path"])
             for i, file_path in enumerate(result_dict["path"]):
-                if file_path.endswith(
+                if file_path.endswith(self.extension):
                     consolidated_file_path = self._get_consolidated_file_path(
                         folder_index=self.chunk_count,
                         chunk_part=i,
@@ -462,7 +829,7 @@ class ParquetOutput(Output):
                 shutil.rmtree(temp_folder, ignore_errors=True)
 
             # Clean up base temp directory if it exists and is empty
-            temp_base_path = os.path.join(self.
+            temp_base_path = os.path.join(self.path, "temp_accumulation")
             if os.path.exists(temp_base_path) and not os.listdir(temp_base_path):
                 os.rmdir(temp_base_path)
 
@@ -475,7 +842,7 @@ class ParquetOutput(Output):
         except Exception as e:
             logger.warning(f"Error cleaning up temp folders: {str(e)}")
 
-    async def
+    async def _write_chunk(self, chunk: "pd.DataFrame", file_name: str):
         """Write a chunk to a Parquet file.
 
         This method writes a chunk to a Parquet file and uploads the file to the object store.