atlan-application-sdk 1.1.1-py3-none-any.whl → 2.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/common/sql_utils.py +312 -0
- application_sdk/activities/common/utils.py +1 -45
- application_sdk/activities/metadata_extraction/sql.py +110 -353
- application_sdk/activities/query_extraction/sql.py +12 -11
- application_sdk/application/__init__.py +1 -1
- application_sdk/clients/sql.py +167 -1
- application_sdk/clients/temporal.py +6 -6
- application_sdk/common/types.py +8 -0
- application_sdk/common/utils.py +1 -8
- application_sdk/constants.py +1 -1
- application_sdk/handlers/sql.py +10 -25
- application_sdk/interceptors/events.py +1 -1
- application_sdk/io/__init__.py +749 -0
- application_sdk/io/json.py +473 -0
- application_sdk/{outputs → io}/parquet.py +414 -47
- application_sdk/io/utils.py +307 -0
- application_sdk/observability/observability.py +16 -12
- application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
- application_sdk/server/fastapi/middleware/metrics.py +27 -24
- application_sdk/server/fastapi/models.py +1 -1
- application_sdk/server/fastapi/routers/server.py +1 -1
- application_sdk/server/fastapi/utils.py +10 -0
- application_sdk/services/eventstore.py +4 -4
- application_sdk/services/objectstore.py +14 -1
- application_sdk/services/secretstore.py +1 -1
- application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
- application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
- application_sdk/version.py +1 -1
- application_sdk/worker.py +1 -1
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/METADATA +9 -11
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/RECORD +36 -43
- application_sdk/common/dataframe_utils.py +0 -42
- application_sdk/events/__init__.py +0 -5
- application_sdk/inputs/.cursor/BUGBOT.md +0 -250
- application_sdk/inputs/__init__.py +0 -168
- application_sdk/inputs/iceberg.py +0 -75
- application_sdk/inputs/json.py +0 -136
- application_sdk/inputs/parquet.py +0 -272
- application_sdk/inputs/sql_query.py +0 -271
- application_sdk/outputs/.cursor/BUGBOT.md +0 -295
- application_sdk/outputs/__init__.py +0 -453
- application_sdk/outputs/iceberg.py +0 -139
- application_sdk/outputs/json.py +0 -268
- /application_sdk/{events → interceptors}/models.py +0 -0
- /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/NOTICE +0 -0
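The biggest structural change in this release is the removal of the separate application_sdk/inputs and application_sdk/outputs packages in favour of a single application_sdk/io package built around async Reader and Writer base classes; the full text of the new io/__init__.py follows below. As a rough orientation, a minimal usage sketch based on the docstrings in that file could look like this (the import paths for ParquetFileReader and JsonFileWriter are assumptions inferred from the io/parquet.py and io/json.py entries above, not something this diff confirms):

# Hedged sketch of the new application_sdk.io surface; the class names come from
# the docstrings in io/__init__.py, the module paths are assumed from the file layout.
import asyncio

from application_sdk.io.json import JsonFileWriter        # assumed module path
from application_sdk.io.parquet import ParquetFileReader  # assumed module path


async def copy_parquet_to_json() -> None:
    # Readers are async context managers; close() runs automatically on exit.
    async with ParquetFileReader(path="/data/input") as reader:
        dataframe = await reader.read()

    writer = JsonFileWriter(path="/data/output")
    await writer.write(dataframe)          # DataFrame input
    await writer.write({"key": "value"})   # dicts become single-row DataFrames
    stats = await writer.close()           # flush, upload, return ActivityStatistics
    print(f"Wrote {stats.total_record_count} records in {stats.chunk_count} chunks")


if __name__ == "__main__":
    asyncio.run(copy_parquet_to_json())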
application_sdk/io/__init__.py (new file)
@@ -0,0 +1,749 @@
"""Output module for handling data output operations.

This module provides base classes and utilities for handling various types of data outputs
in the application, including file outputs and object store interactions.
"""

import gc
import inspect
import os
from abc import ABC, abstractmethod
from enum import Enum
from typing import (
    TYPE_CHECKING,
    Any,
    AsyncGenerator,
    AsyncIterator,
    Dict,
    Generator,
    Iterator,
    List,
    Optional,
    Union,
    cast,
)

import orjson
from temporalio import activity

from application_sdk.activities.common.models import ActivityStatistics
from application_sdk.activities.common.utils import get_object_store_prefix
from application_sdk.common.types import DataframeType
from application_sdk.constants import ENABLE_ATLAN_UPLOAD, UPSTREAM_OBJECT_STORE_NAME
from application_sdk.io.utils import (
    estimate_dataframe_record_size,
    is_empty_dataframe,
    path_gen,
)
from application_sdk.observability.logger_adaptor import get_logger
from application_sdk.observability.metrics_adaptor import MetricType
from application_sdk.services.objectstore import ObjectStore

logger = get_logger(__name__)
activity.logger = logger


if TYPE_CHECKING:
    import daft  # type: ignore
    import pandas as pd


class Reader(ABC):
    """Abstract base class for reader data sources.

    This class defines the interface for reader handlers that can read data
    from various sources in different formats. Follows Python's file I/O
    pattern with read/close semantics and supports context managers.

    Attributes:
        path (str): Path where the reader will read from.
        _is_closed (bool): Whether the reader has been closed.
        _downloaded_files (List[str]): List of downloaded temporary files to clean up.
        cleanup_on_close (bool): Whether to clean up downloaded temp files on close.

    Example:
        Using close() explicitly::

            reader = ParquetFileReader(path="/data/input")
            df = await reader.read()
            await reader.close()  # Cleans up any downloaded temp files

        Using context manager (recommended)::

            async with ParquetFileReader(path="/data/input") as reader:
                df = await reader.read()
            # close() called automatically

        Reading in batches with context manager::

            async with JsonFileReader(path="/data/input") as reader:
                async for batch in reader.read_batches():
                    process(batch)
            # close() called automatically
    """

    path: str
    _is_closed: bool = False
    _downloaded_files: List[str] = []
    cleanup_on_close: bool = True

    async def __aenter__(self) -> "Reader":
        """Enter the async context manager.

        Returns:
            Reader: The reader instance.
        """
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Exit the async context manager, closing the reader.

        Args:
            exc_type: Exception type if an exception was raised.
            exc_val: Exception value if an exception was raised.
            exc_tb: Exception traceback if an exception was raised.
        """
        await self.close()

    async def close(self) -> None:
        """Close the reader and clean up any downloaded temporary files.

        This method cleans up any temporary files that were downloaded from
        the object store during read operations. Calling close() multiple
        times is safe (subsequent calls are no-ops).

        Note:
            Set ``cleanup_on_close=False`` during initialization to retain
            downloaded files after closing.

        Example::

            reader = ParquetFileReader(path="/data/input")
            df = await reader.read()
            await reader.close()  # Cleans up temp files
        """
        if self._is_closed:
            return

        if self.cleanup_on_close and self._downloaded_files:
            await self._cleanup_downloaded_files()

        self._is_closed = True

    async def _cleanup_downloaded_files(self) -> None:
        """Clean up downloaded temporary files.

        Override this method in subclasses for custom cleanup behavior.
        """
        import shutil

        for file_path in self._downloaded_files:
            try:
                if os.path.isfile(file_path):
                    os.remove(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path, ignore_errors=True)
            except Exception as e:
                logger.warning(f"Failed to clean up temporary file {file_path}: {e}")

        self._downloaded_files.clear()

    @abstractmethod
    def read_batches(
        self,
    ) -> Union[
        Iterator["pd.DataFrame"],
        AsyncIterator["pd.DataFrame"],
        Iterator["daft.DataFrame"],
        AsyncIterator["daft.DataFrame"],
    ]:
        """Get an iterator of batched pandas DataFrames.

        Returns:
            Iterator["pd.DataFrame"]: An iterator of batched pandas DataFrames.

        Raises:
            NotImplementedError: If the method is not implemented.
            ValueError: If the reader has been closed.
        """
        raise NotImplementedError

    @abstractmethod
    async def read(self) -> Union["pd.DataFrame", "daft.DataFrame"]:
        """Get a single pandas or daft DataFrame.

        Returns:
            Union["pd.DataFrame", "daft.DataFrame"]: A pandas or daft DataFrame.

        Raises:
            NotImplementedError: If the method is not implemented.
            ValueError: If the reader has been closed.
        """
        raise NotImplementedError


class WriteMode(Enum):
    """Enumeration of write modes for output operations."""

    APPEND = "append"
    OVERWRITE = "overwrite"
    OVERWRITE_PARTITIONS = "overwrite-partitions"


class Writer(ABC):
    """Abstract base class for writer handlers.

    This class defines the interface for writer handlers that can write data
    to various destinations in different formats. Follows Python's file I/O
    pattern with open/write/close semantics and supports context managers.

    Attributes:
        path (str): Path where the writer will be written.
        output_prefix (str): Prefix for files when uploading to object store.
        total_record_count (int): Total number of records processed.
        chunk_count (int): Number of chunks the writer was split into.
        buffer_size (int): Size of the buffer to write data to.
        max_file_size_bytes (int): Maximum size of the file to write data to.
        current_buffer_size (int): Current size of the buffer to write data to.
        current_buffer_size_bytes (int): Current size of the buffer to write data to.
        partitions (List[int]): Partitions of the writer.

    Example:
        Using close() explicitly::

            writer = JsonFileWriter(path="/data/output")
            await writer.write(dataframe)
            await writer.write({"key": "value"})  # Dict support
            stats = await writer.close()

        Using context manager (recommended)::

            async with JsonFileWriter(path="/data/output") as writer:
                await writer.write(dataframe)
            # close() called automatically
    """

    path: str
    output_prefix: str
    total_record_count: int
    chunk_count: int
    chunk_part: int
    buffer_size: int
    max_file_size_bytes: int
    current_buffer_size: int
    current_buffer_size_bytes: int
    partitions: List[int]
    extension: str
    dataframe_type: DataframeType
    _is_closed: bool = False
    _statistics: Optional[ActivityStatistics] = None

    async def __aenter__(self) -> "Writer":
        """Enter the async context manager.

        Returns:
            Writer: The writer instance.
        """
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        """Exit the async context manager, closing the writer.

        Args:
            exc_type: Exception type if an exception was raised.
            exc_val: Exception value if an exception was raised.
            exc_tb: Exception traceback if an exception was raised.
        """
        await self.close()

    def _convert_to_dataframe(
        self,
        data: Union[
            "pd.DataFrame", "daft.DataFrame", Dict[str, Any], List[Dict[str, Any]]
        ],
    ) -> Union["pd.DataFrame", "daft.DataFrame"]:
        """Convert input data to a DataFrame if needed.

        Args:
            data: Input data - can be a DataFrame, dict, or list of dicts.

        Returns:
            A pandas or daft DataFrame depending on self.dataframe_type.

        Raises:
            TypeError: If data type is not supported or if dict/list input is used with daft when daft is not available.
        """
        import pandas as pd

        # Already a pandas DataFrame - return as-is or convert to daft if needed
        if isinstance(data, pd.DataFrame):
            if self.dataframe_type == DataframeType.daft:
                try:
                    import daft

                    return daft.from_pandas(data)
                except ImportError:
                    raise TypeError(
                        "daft is not installed. Please install daft to use DataframeType.daft, "
                        "or use DataframeType.pandas instead."
                    )
            return data

        # Check for daft DataFrame
        try:
            import daft

            if isinstance(data, daft.DataFrame):
                return data
        except ImportError:
            pass

        # Convert dict or list of dicts to DataFrame
        if isinstance(data, dict) or (
            isinstance(data, list) and len(data) > 0 and isinstance(data[0], dict)
        ):
            # For daft dataframe_type, convert to daft DataFrame directly
            if self.dataframe_type == DataframeType.daft:
                try:
                    import daft

                    # Convert to columnar format for daft.from_pydict()
                    if isinstance(data, dict):
                        # Single dict: {"col1": "val1", "col2": "val2"} -> {"col1": ["val1"], "col2": ["val2"]}
                        columnar_data = {k: [v] for k, v in data.items()}
                    else:
                        # List of dicts: [{"col1": "v1"}, {"col1": "v2"}] -> {"col1": ["v1", "v2"]}
                        columnar_data = {}
                        for record in data:
                            for key, value in record.items():
                                if key not in columnar_data:
                                    columnar_data[key] = []
                                columnar_data[key].append(value)
                    return daft.from_pydict(columnar_data)
                except ImportError:
                    raise TypeError(
                        "Dict and list inputs require daft to be installed when using DataframeType.daft. "
                        "Please install daft or use DataframeType.pandas instead."
                    )
            # For pandas dataframe_type, convert to pandas DataFrame
            return pd.DataFrame([data] if isinstance(data, dict) else data)

        raise TypeError(
            f"Unsupported data type: {type(data).__name__}. "
            "Expected DataFrame, dict, or list of dicts."
        )

    async def write(
        self,
        data: Union[
            "pd.DataFrame", "daft.DataFrame", Dict[str, Any], List[Dict[str, Any]]
        ],
        **kwargs: Any,
    ) -> None:
        """Write data to the output destination.

        Supports writing DataFrames, dicts (converted to single-row DataFrame),
        or lists of dicts (converted to multi-row DataFrame).

        Args:
            data: Data to write - DataFrame, dict, or list of dicts.
            **kwargs: Additional parameters passed to the underlying write method.

        Raises:
            ValueError: If the writer has been closed or dataframe_type is unsupported.
            TypeError: If data type is not supported.
        """
        if self._is_closed:
            raise ValueError("Cannot write to a closed writer")

        # Convert to DataFrame if needed
        dataframe = self._convert_to_dataframe(data)

        if self.dataframe_type == DataframeType.pandas:
            await self._write_dataframe(dataframe, **kwargs)
        elif self.dataframe_type == DataframeType.daft:
            await self._write_daft_dataframe(dataframe, **kwargs)
        else:
            raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")

    async def write_batches(
        self,
        dataframe: Union[
            AsyncGenerator["pd.DataFrame", None],
            Generator["pd.DataFrame", None, None],
            AsyncGenerator["daft.DataFrame", None],
            Generator["daft.DataFrame", None, None],
        ],
    ) -> None:
        """Write batched DataFrames to the output destination.

        Args:
            dataframe: Async or sync generator yielding DataFrames.

        Raises:
            ValueError: If the writer has been closed or dataframe_type is unsupported.
        """
        if self._is_closed:
            raise ValueError("Cannot write to a closed writer")

        if self.dataframe_type == DataframeType.pandas:
            await self._write_batched_dataframe(dataframe)
        elif self.dataframe_type == DataframeType.daft:
            await self._write_batched_daft_dataframe(dataframe)
        else:
            raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")

    async def _write_batched_dataframe(
        self,
        batched_dataframe: Union[
            AsyncGenerator["pd.DataFrame", None], Generator["pd.DataFrame", None, None]
        ],
    ):
        """Write a batched pandas DataFrame to Output.

        This method writes the DataFrame to Output provided, potentially splitting it
        into chunks based on chunk_size and buffer_size settings.

        Args:
            dataframe (pd.DataFrame): The DataFrame to write.

        Note:
            If the DataFrame is empty, the method returns without writing.
        """
        try:
            if inspect.isasyncgen(batched_dataframe):
                async for dataframe in batched_dataframe:
                    if not is_empty_dataframe(dataframe):
                        await self._write_dataframe(dataframe)
            else:
                # Cast to Generator since we've confirmed it's not an AsyncGenerator
                sync_generator = cast(
                    Generator["pd.DataFrame", None, None], batched_dataframe
                )
                for dataframe in sync_generator:
                    if not is_empty_dataframe(dataframe):
                        await self._write_dataframe(dataframe)
        except Exception as e:
            logger.error(f"Error writing batched dataframe: {str(e)}")
            raise

    async def _write_dataframe(self, dataframe: "pd.DataFrame", **kwargs):
        """Write a pandas DataFrame to Parquet files and upload to object store.

        Args:
            dataframe (pd.DataFrame): The DataFrame to write.
            **kwargs: Additional parameters (currently unused for pandas DataFrames).
        """
        try:
            if self.chunk_start is None:
                self.chunk_part = 0
            if len(dataframe) == 0:
                return

            chunk_size_bytes = estimate_dataframe_record_size(dataframe, self.extension)

            for i in range(0, len(dataframe), self.buffer_size):
                chunk = dataframe[i : i + self.buffer_size]

                if (
                    self.current_buffer_size_bytes + chunk_size_bytes
                    > self.max_file_size_bytes
                ):
                    output_file_name = f"{self.path}/{path_gen(self.chunk_count, self.chunk_part, extension=self.extension)}"
                    if os.path.exists(output_file_name):
                        await self._upload_file(output_file_name)
                        self.chunk_part += 1

                self.current_buffer_size += len(chunk)
                self.current_buffer_size_bytes += chunk_size_bytes * len(chunk)
                await self._flush_buffer(chunk, self.chunk_part)

                del chunk
                gc.collect()

            if self.current_buffer_size_bytes > 0:
                # Finally upload the final file to the object store
                output_file_name = f"{self.path}/{path_gen(self.chunk_count, self.chunk_part, extension=self.extension)}"
                if os.path.exists(output_file_name):
                    await self._upload_file(output_file_name)
                    self.chunk_part += 1

            # Record metrics for successful write
            self.metrics.record_metric(
                name="write_records",
                value=len(dataframe),
                metric_type=MetricType.COUNTER,
                labels={"type": "pandas", "mode": WriteMode.APPEND.value},
                description="Number of records written to files from pandas DataFrame",
            )

            # Record chunk metrics
            self.metrics.record_metric(
                name="chunks_written",
                value=1,
                metric_type=MetricType.COUNTER,
                labels={"type": "pandas", "mode": WriteMode.APPEND.value},
                description="Number of chunks written to files",
            )

            # If chunk_start is set we don't want to increment the chunk_count
            # Since it should only increment the chunk_part in this case
            if self.chunk_start is None:
                self.chunk_count += 1
            self.partitions.append(self.chunk_part)
        except Exception as e:
            # Record metrics for failed write
            self.metrics.record_metric(
                name="write_errors",
                value=1,
                metric_type=MetricType.COUNTER,
                labels={
                    "type": "pandas",
                    "mode": WriteMode.APPEND.value,
                    "error": str(e),
                },
                description="Number of errors while writing to files",
            )
            logger.error(f"Error writing pandas dataframe to files: {str(e)}")
            raise

    async def _write_batched_daft_dataframe(
        self,
        batched_dataframe: Union[
            AsyncGenerator["daft.DataFrame", None],  # noqa: F821
            Generator["daft.DataFrame", None, None],  # noqa: F821
        ],
    ):
        """Write a batched daft DataFrame to JSON files.

        This method writes the DataFrame to JSON files, potentially splitting it
        into chunks based on chunk_size and buffer_size settings.

        Args:
            dataframe (daft.DataFrame): The DataFrame to write.

        Note:
            If the DataFrame is empty, the method returns without writing.
        """
        try:
            if inspect.isasyncgen(batched_dataframe):
                async for dataframe in batched_dataframe:
                    if not is_empty_dataframe(dataframe):
                        await self._write_daft_dataframe(dataframe)
            else:
                # Cast to Generator since we've confirmed it's not an AsyncGenerator
                sync_generator = cast(
                    Generator["daft.DataFrame", None, None], batched_dataframe
                )  # noqa: F821
                for dataframe in sync_generator:
                    if not is_empty_dataframe(dataframe):
                        await self._write_daft_dataframe(dataframe)
        except Exception as e:
            logger.error(f"Error writing batched daft dataframe: {str(e)}")
            raise

    @abstractmethod
    async def _write_daft_dataframe(self, dataframe: "daft.DataFrame", **kwargs):  # noqa: F821
        """Write a daft DataFrame to the output destination.

        Args:
            dataframe (daft.DataFrame): The DataFrame to write.
            **kwargs: Additional parameters passed through from write().
        """
        pass

    @property
    def statistics(self) -> ActivityStatistics:
        """Get current statistics without closing the writer.

        Returns:
            ActivityStatistics: Current statistics (record count, chunk count, partitions).

        Note:
            This returns the current state. For final statistics after all
            writes complete, use close() instead.
        """
        return ActivityStatistics(
            total_record_count=self.total_record_count,
            chunk_count=len(self.partitions),
            partitions=self.partitions,
        )

    async def _finalize(self) -> None:
        """Finalize the writer before closing.

        Override this method in subclasses to perform any final flush operations,
        upload remaining files, etc. This is called by close() before writing statistics.
        """
        pass

    async def close(self) -> ActivityStatistics:
        """Close the writer, flush buffers, upload files, and return statistics.

        This method finalizes all pending writes, uploads any remaining files to
        the object store, writes statistics, and marks the writer as closed.
        Calling close() multiple times is safe (subsequent calls are no-ops).

        The typename for statistics is automatically taken from `self.typename`
        if it was set during initialization.

        Returns:
            ActivityStatistics: Final statistics including total_record_count,
            chunk_count, and partitions.

        Raises:
            ValueError: If statistics data is invalid.
            Exception: If there's an error during finalization or writing statistics.

        Example:
            ```python
            writer = JsonFileWriter(path="/data/output", typename="table")
            await writer.write(dataframe)
            stats = await writer.close()
            print(f"Wrote {stats.total_record_count} records")
            ```
        """
        if self._is_closed:
            if self._statistics:
                return self._statistics
            return self.statistics

        try:
            # Allow subclasses to perform final flush/upload operations
            await self._finalize()

            # Use self.typename if available
            typename = getattr(self, "typename", None)

            # Write statistics to file and object store
            statistics_dict = await self._write_statistics(typename)
            if not statistics_dict:
                raise ValueError("No statistics data available")

            self._statistics = ActivityStatistics.model_validate(statistics_dict)
            if typename:
                self._statistics.typename = typename

            self._is_closed = True
            return self._statistics

        except Exception as e:
            logger.error(f"Error closing writer: {str(e)}")
            raise

    async def _upload_file(self, file_name: str):
        """Upload a file to the object store."""
        # Get retain_local_copy from the writer instance, defaulting to False
        retain_local = getattr(self, "retain_local_copy", False)

        if ENABLE_ATLAN_UPLOAD:
            await ObjectStore.upload_file(
                source=file_name,
                store_name=UPSTREAM_OBJECT_STORE_NAME,
                retain_local_copy=True,  # Always retain for the second upload to deployment store
                destination=get_object_store_prefix(file_name),
            )
        await ObjectStore.upload_file(
            source=file_name,
            destination=get_object_store_prefix(file_name),
            retain_local_copy=retain_local,  # Respect the writer's retain_local_copy setting
        )

        self.current_buffer_size_bytes = 0

    async def _flush_buffer(self, chunk: "pd.DataFrame", chunk_part: int):
        """Flush the current buffer to a JSON file.

        This method combines all DataFrames in the buffer, writes them to a JSON file,
        and uploads the file to the object store.

        Note:
            If the buffer is empty or has no records, the method returns without writing.
        """
        try:
            if not is_empty_dataframe(chunk):
                self.total_record_count += len(chunk)
                output_file_name = f"{self.path}/{path_gen(self.chunk_count, chunk_part, extension=self.extension)}"
                await self._write_chunk(chunk, output_file_name)

            self.current_buffer_size = 0

            # Record chunk metrics
            self.metrics.record_metric(
                name="chunks_written",
                value=1,
                metric_type=MetricType.COUNTER,
                labels={"type": "output"},
                description="Number of chunks written to files",
            )

        except Exception as e:
            # Record metrics for failed write
            self.metrics.record_metric(
                name="write_errors",
                value=1,
                metric_type=MetricType.COUNTER,
                labels={"type": "output", "error": str(e)},
                description="Number of errors while writing to files",
            )
            logger.error(f"Error flushing buffer to files: {str(e)}")
            raise e

    async def _write_statistics(
        self, typename: Optional[str] = None
    ) -> Optional[Dict[str, Any]]:
        """Write statistics about the output to a JSON file.

        Internal method called by close() to persist statistics.

        Args:
            typename (str, optional): Type name for organizing statistics.

        Returns:
            Dict containing statistics data.

        Raises:
            Exception: If there's an error writing or uploading the statistics.
        """
        try:
            # prepare the statistics
            statistics = {
                "total_record_count": self.total_record_count,
                "chunk_count": len(self.partitions),
                "partitions": self.partitions,
            }

            # Ensure typename is included in the statistics payload (if provided)
            if typename:
                statistics["typename"] = typename

            # Write the statistics to a json file inside a dedicated statistics/ folder
            statistics_dir = os.path.join(self.path, "statistics")
            os.makedirs(statistics_dir, exist_ok=True)
            output_file_name = os.path.join(statistics_dir, "statistics.json.ignore")
            # If chunk_start is provided, include it in the statistics filename
            try:
                cs = getattr(self, "chunk_start", None)
                if cs is not None:
                    output_file_name = os.path.join(
                        statistics_dir, f"statistics-chunk-{cs}.json.ignore"
                    )
            except Exception:
                # If accessing chunk_start fails, fallback to default filename
                pass

            # Write the statistics dictionary to the JSON file
            with open(output_file_name, "wb") as f:
                f.write(orjson.dumps(statistics))

            destination_file_path = get_object_store_prefix(output_file_name)
            # Push the file to the object store
            await ObjectStore.upload_file(
                source=output_file_name,
                destination=destination_file_path,
            )

            return statistics
        except Exception as e:
            logger.error(f"Error writing statistics: {str(e)}")
            raise