atlan-application-sdk 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/common/sql_utils.py +308 -0
- application_sdk/activities/common/utils.py +1 -45
- application_sdk/activities/metadata_extraction/sql.py +110 -353
- application_sdk/activities/query_extraction/sql.py +12 -11
- application_sdk/application/__init__.py +1 -1
- application_sdk/clients/sql.py +167 -1
- application_sdk/clients/temporal.py +6 -6
- application_sdk/common/types.py +8 -0
- application_sdk/common/utils.py +1 -8
- application_sdk/constants.py +1 -1
- application_sdk/handlers/sql.py +10 -25
- application_sdk/interceptors/events.py +1 -1
- application_sdk/io/__init__.py +654 -0
- application_sdk/io/json.py +429 -0
- application_sdk/{outputs → io}/parquet.py +358 -47
- application_sdk/io/utils.py +307 -0
- application_sdk/observability/observability.py +23 -12
- application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
- application_sdk/server/fastapi/middleware/metrics.py +27 -24
- application_sdk/server/fastapi/models.py +1 -1
- application_sdk/server/fastapi/routers/server.py +1 -1
- application_sdk/server/fastapi/utils.py +10 -0
- application_sdk/services/eventstore.py +4 -4
- application_sdk/services/objectstore.py +30 -7
- application_sdk/services/secretstore.py +1 -1
- application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
- application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
- application_sdk/version.py +1 -1
- application_sdk/worker.py +1 -1
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/METADATA +9 -11
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/RECORD +36 -43
- application_sdk/common/dataframe_utils.py +0 -42
- application_sdk/events/__init__.py +0 -5
- application_sdk/inputs/.cursor/BUGBOT.md +0 -250
- application_sdk/inputs/__init__.py +0 -168
- application_sdk/inputs/iceberg.py +0 -75
- application_sdk/inputs/json.py +0 -136
- application_sdk/inputs/parquet.py +0 -272
- application_sdk/inputs/sql_query.py +0 -271
- application_sdk/outputs/.cursor/BUGBOT.md +0 -295
- application_sdk/outputs/__init__.py +0 -445
- application_sdk/outputs/iceberg.py +0 -139
- application_sdk/outputs/json.py +0 -268
- /application_sdk/{events → interceptors}/models.py +0 -0
- /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/NOTICE +0 -0
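The listing above captures the main structural change in 2.0.0: the separate application_sdk/inputs/ and application_sdk/outputs/ packages are removed and replaced by a single application_sdk/io/ package, and outputs/parquet.py is renamed to io/parquet.py (its diff follows below). A minimal migration sketch based on the imports visible in that diff; the exact re-exports of application_sdk.io and the module path application_sdk.io.parquet for the new classes are assumptions inferred from the rename:

# 1.1.0 layout (modules deleted in 2.0.0):
#   from application_sdk.inputs.parquet import ...   # application_sdk/inputs/parquet.py removed
#   from application_sdk.outputs.parquet import ...  # application_sdk/outputs/parquet.py removed
#   from application_sdk.outputs import Output        # old base-class import, dropped in the diff below

# 2.0.0 layout (imports shown in the diff below):
from application_sdk.io import DataframeType, Reader, WriteMode, Writer

# Assumed location of the renamed module (outputs/parquet.py -> io/parquet.py):
from application_sdk.io.parquet import ParquetFileReader, ParquetFileWriter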
application_sdk/{outputs → io}/parquet.py
--- application_sdk/outputs/parquet.py
+++ application_sdk/io/parquet.py
@@ -1,21 +1,34 @@
 import inspect
 import os
 import shutil
-from
-
+from typing import (
+    TYPE_CHECKING,
+    AsyncGenerator,
+    AsyncIterator,
+    Generator,
+    List,
+    Optional,
+    Union,
+    cast,
+)
 
 from temporalio import activity
 
 from application_sdk.activities.common.utils import get_object_store_prefix
-from application_sdk.common.dataframe_utils import is_empty_dataframe
 from application_sdk.constants import (
     DAPR_MAX_GRPC_MESSAGE_LENGTH,
     ENABLE_ATLAN_UPLOAD,
     UPSTREAM_OBJECT_STORE_NAME,
 )
+from application_sdk.io import DataframeType, Reader, WriteMode, Writer
+from application_sdk.io.utils import (
+    PARQUET_FILE_EXTENSION,
+    download_files,
+    is_empty_dataframe,
+    path_gen,
+)
 from application_sdk.observability.logger_adaptor import get_logger
 from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
-from application_sdk.outputs import Output
 from application_sdk.services.objectstore import ObjectStore
 
 logger = get_logger(__name__)
@@ -26,23 +39,314 @@ if TYPE_CHECKING:
     import pandas as pd
 
 
-class
-    """
+class ParquetFileReader(Reader):
+    """
+    Parquet File Reader class to read data from Parquet files using daft and pandas.
+    Supports reading both single files and directories containing multiple parquet files.
+    """
+
+    def __init__(
+        self,
+        path: str,
+        chunk_size: Optional[int] = 100000,
+        buffer_size: Optional[int] = 5000,
+        file_names: Optional[List[str]] = None,
+        dataframe_type: DataframeType = DataframeType.pandas,
+    ):
+        """Initialize the Parquet input class.
+
+        Args:
+            path (str): Path to parquet file or directory containing parquet files.
+                It accepts both types of paths:
+                local path or object store path
+                Wildcards are not supported.
+            chunk_size (int): Number of rows per batch. Defaults to 100000.
+            buffer_size (int): Number of rows per batch. Defaults to 5000.
+            file_names (Optional[List[str]]): List of file names to read. Defaults to None.
+            dataframe_type (DataframeType): Type of dataframe to read. Defaults to DataframeType.pandas.
+
+        Raises:
+            ValueError: When path is not provided or when single file path is combined with file_names
+        """
+
+        # Validate that single file path and file_names are not both specified
+        if path.endswith(PARQUET_FILE_EXTENSION) and file_names:
+            raise ValueError(
+                f"Cannot specify both a single file path ('{path}') and file_names filter. "
+                f"Either provide a directory path with file_names, or specify the exact file path without file_names."
+            )
+
+        self.path = path
+        self.chunk_size = chunk_size
+        self.buffer_size = buffer_size
+        self.file_names = file_names
+        self.dataframe_type = dataframe_type
+
+    async def read(self) -> Union["pd.DataFrame", "daft.DataFrame"]:
+        """
+        Method to read the data from the parquet files in the path
+        and return as a single combined pandas dataframe
+        """
+        if self.dataframe_type == DataframeType.pandas:
+            return await self._get_dataframe()
+        elif self.dataframe_type == DataframeType.daft:
+            return await self._get_daft_dataframe()
+        else:
+            raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+    def read_batches(
+        self,
+    ) -> Union[
+        AsyncIterator["pd.DataFrame"],
+        AsyncIterator["daft.DataFrame"],
+    ]:
+        """
+        Method to read the data from the parquet files in the path
+        and return as a batched pandas dataframe
+        """
+        if self.dataframe_type == DataframeType.pandas:
+            return self._get_batched_dataframe()
+        elif self.dataframe_type == DataframeType.daft:
+            return self._get_batched_daft_dataframe()
+        else:
+            raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+    async def _get_dataframe(self) -> "pd.DataFrame":
+        """Read data from parquet file(s) and return as pandas DataFrame.
+
+        Returns:
+            pd.DataFrame: Combined dataframe from specified parquet files
+
+        Raises:
+            ValueError: When no valid path can be determined or no matching files found
+            Exception: When reading parquet files fails
+
+        Example transformation:
+            Input files:
+            +------------------+
+            | file1.parquet    |
+            | file2.parquet    |
+            | file3.parquet    |
+            +------------------+
+
+            With file_names=["file1.parquet", "file3.parquet"]:
+            +-------+-------+-------+
+            | col1  | col2  | col3  |
+            +-------+-------+-------+
+            | val1  | val2  | val3  |  # from file1.parquet
+            | val7  | val8  | val9  |  # from file3.parquet
+            +-------+-------+-------+
+
+            Transformations:
+            - Only specified files are read and combined
+            - Column schemas must be compatible across files
+            - Only reads files in the specified directory
+        """
+        try:
+            import pandas as pd
+
+            # Ensure files are available (local or downloaded)
+            parquet_files = await download_files(
+                self.path, PARQUET_FILE_EXTENSION, self.file_names
+            )
+            logger.info(f"Reading {len(parquet_files)} parquet files")
+
+            return pd.concat(
+                (pd.read_parquet(parquet_file) for parquet_file in parquet_files),
+                ignore_index=True,
+            )
+        except Exception as e:
+            logger.error(f"Error reading data from parquet file(s): {str(e)}")
+            raise
+
+    async def _get_batched_dataframe(
+        self,
+    ) -> AsyncIterator["pd.DataFrame"]:
+        """Read data from parquet file(s) in batches as pandas DataFrames.
+
+        Returns:
+            AsyncIterator[pd.DataFrame]: Async iterator of pandas dataframes
+
+        Raises:
+            ValueError: When no parquet files found locally or in object store
+            Exception: When reading parquet files fails
+
+        Example transformation:
+            Input files:
+            +------------------+
+            | file1.parquet    |
+            | file2.parquet    |
+            | file3.parquet    |
+            +------------------+
+
+            With file_names=["file1.parquet", "file2.parquet"] and chunk_size=2:
+            Batch 1:
+            +-------+-------+
+            | col1  | col2  |
+            +-------+-------+
+            | val1  | val2  |  # from file1.parquet
+            | val3  | val4  |  # from file1.parquet
+            +-------+-------+
+
+            Batch 2:
+            +-------+-------+
+            | col1  | col2  |
+            +-------+-------+
+            | val5  | val6  |  # from file2.parquet
+            | val7  | val8  |  # from file2.parquet
+            +-------+-------+
+
+            Transformations:
+            - Only specified files are combined then split into chunks
+            - Each batch is a separate DataFrame
+            - Only reads files in the specified directory
+        """
+        try:
+            import pandas as pd
+
+            # Ensure files are available (local or downloaded)
+            parquet_files = await download_files(
+                self.path, PARQUET_FILE_EXTENSION, self.file_names
+            )
+            logger.info(f"Reading {len(parquet_files)} parquet files in batches")
 
-
-
-
+            # Process each file individually to maintain memory efficiency
+            for parquet_file in parquet_files:
+                df = pd.read_parquet(parquet_file)
+                for i in range(0, len(df), self.chunk_size):
+                    yield df.iloc[i : i + self.chunk_size]  # type: ignore
+        except Exception as e:
+            logger.error(
+                f"Error reading data from parquet file(s) in batches: {str(e)}"
+            )
+            raise
 
+    async def _get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
+        """Read data from parquet file(s) and return as daft DataFrame.
 
-
+        Returns:
+            daft.DataFrame: Combined daft dataframe from specified parquet files
+
+        Raises:
+            ValueError: When no parquet files found locally or in object store
+            Exception: When reading parquet files fails
+
+        Example transformation:
+            Input files:
+            +------------------+
+            | file1.parquet    |
+            | file2.parquet    |
+            | file3.parquet    |
+            +------------------+
+
+            With file_names=["file1.parquet", "file3.parquet"]:
+            +-------+-------+-------+
+            | col1  | col2  | col3  |
+            +-------+-------+-------+
+            | val1  | val2  | val3  |  # from file1.parquet
+            | val7  | val8  | val9  |  # from file3.parquet
+            +-------+-------+-------+
+
+            Transformations:
+            - Only specified parquet files combined into single daft DataFrame
+            - Lazy evaluation for better performance
+            - Column schemas must be compatible across files
+        """
+        try:
+            import daft  # type: ignore
+
+            # Ensure files are available (local or downloaded)
+            parquet_files = await download_files(
+                self.path, PARQUET_FILE_EXTENSION, self.file_names
+            )
+            logger.info(f"Reading {len(parquet_files)} parquet files with daft")
+
+            # Use the discovered/downloaded files directly
+            return daft.read_parquet(parquet_files)
+        except Exception as e:
+            logger.error(
+                f"Error reading data from parquet file(s) using daft: {str(e)}"
+            )
+            raise
+
+    async def _get_batched_daft_dataframe(self) -> AsyncIterator["daft.DataFrame"]:  # type: ignore
+        """Get batched daft dataframe from parquet file(s).
+
+        Returns:
+            AsyncIterator[daft.DataFrame]: An async iterator of daft DataFrames, each containing
+                a batch of data from individual parquet files
+
+        Raises:
+            ValueError: When no parquet files found locally or in object store
+            Exception: When reading parquet files fails
+
+        Example transformation:
+            Input files:
+            +------------------+
+            | file1.parquet    |
+            | file2.parquet    |
+            | file3.parquet    |
+            +------------------+
+
+            With file_names=["file1.parquet", "file3.parquet"]:
+            Batch 1 (file1.parquet):
+            +-------+-------+
+            | col1  | col2  |
+            +-------+-------+
+            | val1  | val2  |
+            | val3  | val4  |
+            +-------+-------+
+
+            Batch 2 (file3.parquet):
+            +-------+-------+
+            | col1  | col2  |
+            +-------+-------+
+            | val7  | val8  |
+            | val9  | val10 |
+            +-------+-------+
+
+            Transformations:
+            - Each specified file becomes a separate daft DataFrame batch
+            - Lazy evaluation for better performance
+            - Files processed individually for memory efficiency
+        """
+        try:
+            import daft  # type: ignore
+
+            # Ensure files are available (local or downloaded)
+            parquet_files = await download_files(
+                self.path, PARQUET_FILE_EXTENSION, self.file_names
+            )
+            logger.info(f"Reading {len(parquet_files)} parquet files as daft batches")
+
+            # Create a lazy dataframe without loading data into memory
+            lazy_df = daft.read_parquet(parquet_files)
+
+            # Get total count efficiently
+            total_rows = lazy_df.count_rows()
+
+            # Yield chunks without loading everything into memory
+            for offset in range(0, total_rows, self.buffer_size):
+                chunk = lazy_df.offset(offset).limit(self.buffer_size)
+                yield chunk
+
+            del lazy_df
+
+        except Exception as error:
+            logger.error(
+                f"Error reading data from parquet file(s) in batches using daft: {error}"
+            )
+            raise
+
+
+class ParquetFileWriter(Writer):
     """Output handler for writing data to Parquet files.
 
     This class handles writing DataFrames to Parquet files with support for chunking
     and automatic uploading to object store.
 
     Attributes:
-
-        output_suffix (str): Suffix for output files.
+        path (str): Base path where Parquet files will be written.
         typename (Optional[str]): Type name of the entity e.g database, schema, table.
         chunk_size (int): Maximum number of records per chunk.
         total_record_count (int): Total number of records processed.
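The hunk above replaces the old input class with ParquetFileReader: read() returns one combined dataframe, read_batches() returns an async iterator of chunks, and dataframe_type switches between pandas and daft. A usage sketch based only on the signatures shown in this hunk; the import path application_sdk.io.parquet, the directory, and the file name are assumptions:

import asyncio

from application_sdk.io import DataframeType
from application_sdk.io.parquet import ParquetFileReader  # assumed module path


async def main():
    # Load a directory of parquet files as one pandas DataFrame.
    reader = ParquetFileReader(
        path="./local/artifacts/raw/table",   # hypothetical directory (local or object store path)
        file_names=["chunk-0.parquet"],       # optional filter; hypothetical file name
        dataframe_type=DataframeType.pandas,
    )
    df = await reader.read()
    print(len(df))

    # Stream the same directory in chunks instead of loading it all at once.
    batched_reader = ParquetFileReader(
        path="./local/artifacts/raw/table",
        chunk_size=10_000,
        dataframe_type=DataframeType.pandas,
    )
    async for batch in batched_reader.read_batches():
        print(batch.shape)


asyncio.run(main())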
@@ -54,29 +358,26 @@ class ParquetOutput(Output):
         use_consolidation (bool): Whether to use consolidation.
     """
 
-    _EXTENSION = ".parquet"
-
     def __init__(
         self,
-
-        output_suffix: str = "",
+        path: str,
         typename: Optional[str] = None,
         chunk_size: Optional[int] = 100000,
-        buffer_size: int = 5000,
-        total_record_count: int = 0,
-        chunk_count: int = 0,
-        chunk_part: int = 0,
+        buffer_size: Optional[int] = 5000,
+        total_record_count: Optional[int] = 0,
+        chunk_count: Optional[int] = 0,
+        chunk_part: Optional[int] = 0,
         chunk_start: Optional[int] = None,
         start_marker: Optional[str] = None,
         end_marker: Optional[str] = None,
-        retain_local_copy: bool = False,
-        use_consolidation: bool = False,
+        retain_local_copy: Optional[bool] = False,
+        use_consolidation: Optional[bool] = False,
+        dataframe_type: DataframeType = DataframeType.pandas,
     ):
         """Initialize the Parquet output handler.
 
         Args:
-
-            output_suffix (str): Suffix for output files.
+            path (str): Base path where Parquet files will be written.
             typename (Optional[str], optional): Type name of the entity e.g database, schema, table.
             chunk_size (int, optional): Maximum records per chunk. Defaults to 100000.
             total_record_count (int, optional): Initial total record count. Defaults to 0.
@@ -91,9 +392,10 @@ class ParquetOutput(Output):
                 Defaults to False.
             use_consolidation (bool, optional): Whether to use consolidation.
                 Defaults to False.
+            dataframe_type (DataframeType, optional): Type of dataframe to write. Defaults to DataframeType.pandas.
         """
-        self.
-        self.
+        self.extension = PARQUET_FILE_EXTENSION
+        self.path = path
         self.typename = typename
         self.chunk_size = chunk_size
         self.buffer_size = buffer_size
@@ -112,6 +414,9 @@ class ParquetOutput(Output):
         self.partitions = []
         self.metrics = get_metrics()
         self.retain_local_copy = retain_local_copy
+        self.dataframe_type = dataframe_type
+        self._is_closed = False
+        self._statistics = None
 
         # Consolidation-specific attributes
         # Use consolidation to efficiently write parquet files in buffered manner
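The three hunks above change the writer's constructor: the output_suffix argument from 1.1.0 is replaced by a single required path, and dataframe_type, _is_closed, and _statistics are added. A construction sketch based on the new __init__ signature; the import path is an assumption, and the public write entry point is not part of this diff (only the private _write_batched_dataframe, _write_daft_dataframe, and _write_chunk helpers appear here):

from application_sdk.io import DataframeType
from application_sdk.io.parquet import ParquetFileWriter  # assumed module path

writer = ParquetFileWriter(
    path="./local/artifacts/transformed/table",  # hypothetical base path; now required
    typename="table",                            # files are written under <path>/<typename>
    chunk_size=100_000,
    use_consolidation=True,                      # buffered consolidation path shown later in this diff
    dataframe_type=DataframeType.pandas,
)
# The public write/close API comes from the Writer base class in application_sdk/io/__init__.py
# and is not shown in this hunk.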
@@ -128,13 +433,14 @@ class ParquetOutput(Output):
         if self.chunk_start:
             self.chunk_count = self.chunk_start + self.chunk_count
 
+        if not self.path:
+            raise ValueError("path is required")
         # Create output directory
-        self.output_path = os.path.join(self.output_path, self.output_suffix)
         if self.typename:
-            self.
-        os.makedirs(self.
+            self.path = os.path.join(self.path, self.typename)
+        os.makedirs(self.path, exist_ok=True)
 
-    async def
+    async def _write_batched_dataframe(
         self,
         batched_dataframe: Union[
             AsyncGenerator["pd.DataFrame", None], Generator["pd.DataFrame", None, None]
@@ -155,7 +461,7 @@ class ParquetOutput(Output):
         """
         if not self.use_consolidation:
             # Fallback to base class implementation
-            await super().
+            await super()._write_batched_dataframe(batched_dataframe)
             return
 
         try:
@@ -186,12 +492,13 @@ class ParquetOutput(Output):
             await self._cleanup_temp_folders()  # Cleanup on error
             raise
 
-    async def
+    async def _write_daft_dataframe(
         self,
         dataframe: "daft.DataFrame",  # noqa: F821
         partition_cols: Optional[List] = None,
-        write_mode: Union[WriteMode, str] = WriteMode.APPEND,
+        write_mode: Union[WriteMode, str] = WriteMode.APPEND.value,
         morsel_size: int = 100_000,
+        **kwargs,
     ):
         """Write a daft DataFrame to Parquet files and upload to object store.
 
@@ -234,7 +541,7 @@ class ParquetOutput(Output):
         ):
             # Daft automatically handles file splitting and naming
             result = dataframe.write_parquet(
-                root_dir=self.
+                root_dir=self.path,
                 write_mode=write_mode.value,
                 partition_cols=partition_cols,
             )
@@ -267,11 +574,11 @@ class ParquetOutput(Output):
         # Delete the directory from object store
         try:
             await ObjectStore.delete_prefix(
-                prefix=get_object_store_prefix(self.
+                prefix=get_object_store_prefix(self.path)
             )
         except FileNotFoundError as e:
             logger.info(
-                f"No files found under prefix {get_object_store_prefix(self.
+                f"No files found under prefix {get_object_store_prefix(self.path)}: {str(e)}"
             )
         for path in file_paths:
             if ENABLE_ATLAN_UPLOAD:
@@ -311,20 +618,24 @@ class ParquetOutput(Output):
         Returns:
             str: The full path of the output file.
         """
-        return self.
+        return self.path
 
     # Consolidation helper methods
 
     def _get_temp_folder_path(self, folder_index: int) -> str:
         """Generate temp folder path consistent with existing structure."""
-        temp_base_path = os.path.join(self.
+        temp_base_path = os.path.join(self.path, "temp_accumulation")
         return os.path.join(temp_base_path, f"folder-{folder_index}")
 
     def _get_consolidated_file_path(self, folder_index: int, chunk_part: int) -> str:
         """Generate final consolidated file path using existing path_gen logic."""
         return os.path.join(
-            self.
-
+            self.path,
+            path_gen(
+                chunk_count=folder_index,
+                chunk_part=chunk_part,
+                extension=self.extension,
+            ),
         )
 
     async def _accumulate_dataframe(self, dataframe: "pd.DataFrame"):
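In the hunk above, _get_consolidated_file_path now builds the final file name with path_gen from application_sdk.io.utils (imported at the top of the file). path_gen's implementation is not part of this diff, so the helper below is a purely illustrative stand-in that only mirrors the chunk_count/chunk_part/extension parameters passed to it:

import os


def path_gen_sketch(chunk_count: int, chunk_part: int, extension: str) -> str:
    # Hypothetical stand-in: the real path_gen lives in application_sdk/io/utils.py
    # and its actual naming scheme is not shown in this diff.
    return f"chunk-{chunk_count}-part-{chunk_part}{extension}"


# Mirrors the call in _get_consolidated_file_path: join the writer's path with the generated name.
print(os.path.join("./local/artifacts/transformed/table", path_gen_sketch(0, 2, ".parquet")))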
@@ -374,14 +685,14 @@ class ParquetOutput(Output):
             [
                 f
                 for f in os.listdir(self.current_temp_folder_path)
-                if f.endswith(
+                if f.endswith(self.extension)
             ]
         )
-        chunk_file_name = f"chunk-{existing_files}.
+        chunk_file_name = f"chunk-{existing_files}{self.extension}"
         chunk_file_path = os.path.join(self.current_temp_folder_path, chunk_file_name)
 
         # Write chunk using existing write_chunk method
-        await self.
+        await self._write_chunk(chunk, chunk_file_path)
 
     async def _consolidate_current_folder(self):
         """Consolidate current temp folder using Daft."""
@@ -392,7 +703,7 @@ class ParquetOutput(Output):
             import daft
 
             # Read all parquet files in temp folder
-            pattern = os.path.join(self.current_temp_folder_path, "
+            pattern = os.path.join(self.current_temp_folder_path, f"*{self.extension}")
             daft_df = daft.read_parquet(pattern)
             partitions = 0
 
@@ -408,7 +719,7 @@ class ParquetOutput(Output):
             result_dict = result.to_pydict()
             partitions = len(result_dict["path"])
             for i, file_path in enumerate(result_dict["path"]):
-                if file_path.endswith(
+                if file_path.endswith(self.extension):
                     consolidated_file_path = self._get_consolidated_file_path(
                         folder_index=self.chunk_count,
                         chunk_part=i,
@@ -462,7 +773,7 @@ class ParquetOutput(Output):
                 shutil.rmtree(temp_folder, ignore_errors=True)
 
             # Clean up base temp directory if it exists and is empty
-            temp_base_path = os.path.join(self.
+            temp_base_path = os.path.join(self.path, "temp_accumulation")
             if os.path.exists(temp_base_path) and not os.listdir(temp_base_path):
                 os.rmdir(temp_base_path)
 
@@ -475,7 +786,7 @@ class ParquetOutput(Output):
         except Exception as e:
             logger.warning(f"Error cleaning up temp folders: {str(e)}")
 
-    async def
+    async def _write_chunk(self, chunk: "pd.DataFrame", file_name: str):
         """Write a chunk to a Parquet file.
 
         This method writes a chunk to a Parquet file and uploads the file to the object store.