atlan-application-sdk 1.1.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registries and is provided for informational purposes only.
- application_sdk/activities/common/sql_utils.py +312 -0
- application_sdk/activities/common/utils.py +1 -45
- application_sdk/activities/metadata_extraction/sql.py +110 -353
- application_sdk/activities/query_extraction/sql.py +12 -11
- application_sdk/application/__init__.py +1 -1
- application_sdk/clients/sql.py +167 -1
- application_sdk/clients/temporal.py +6 -6
- application_sdk/common/types.py +8 -0
- application_sdk/common/utils.py +1 -8
- application_sdk/constants.py +1 -1
- application_sdk/handlers/sql.py +10 -25
- application_sdk/interceptors/events.py +1 -1
- application_sdk/io/__init__.py +749 -0
- application_sdk/io/json.py +473 -0
- application_sdk/{outputs → io}/parquet.py +414 -47
- application_sdk/io/utils.py +307 -0
- application_sdk/observability/observability.py +16 -12
- application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
- application_sdk/server/fastapi/middleware/metrics.py +27 -24
- application_sdk/server/fastapi/models.py +1 -1
- application_sdk/server/fastapi/routers/server.py +1 -1
- application_sdk/server/fastapi/utils.py +10 -0
- application_sdk/services/eventstore.py +4 -4
- application_sdk/services/objectstore.py +14 -1
- application_sdk/services/secretstore.py +1 -1
- application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
- application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
- application_sdk/version.py +1 -1
- application_sdk/worker.py +1 -1
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/METADATA +9 -11
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/RECORD +36 -43
- application_sdk/common/dataframe_utils.py +0 -42
- application_sdk/events/__init__.py +0 -5
- application_sdk/inputs/.cursor/BUGBOT.md +0 -250
- application_sdk/inputs/__init__.py +0 -168
- application_sdk/inputs/iceberg.py +0 -75
- application_sdk/inputs/json.py +0 -136
- application_sdk/inputs/parquet.py +0 -272
- application_sdk/inputs/sql_query.py +0 -271
- application_sdk/outputs/.cursor/BUGBOT.md +0 -295
- application_sdk/outputs/__init__.py +0 -453
- application_sdk/outputs/iceberg.py +0 -139
- application_sdk/outputs/json.py +0 -268
- /application_sdk/{events → interceptors}/models.py +0 -0
- /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/NOTICE +0 -0
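
The listing above shows the 1.x application_sdk.inputs and application_sdk.outputs modules being removed in favor of a consolidated application_sdk.io package (io/__init__.py, io/json.py, io/parquet.py, io/utils.py). As a quick orientation aid, the sketch below shows the new JSON reader in use, following the docstring example from the full diff of application_sdk/io/json.py further down; the /data/input path and the asyncio entry point are illustrative placeholders, not part of the SDK.

```python
# Minimal sketch of the 2.1.0 reader API, based on the JsonFileReader docstring
# shown in the diff below. The input path and the asyncio wrapper are placeholders.
import asyncio

from application_sdk.io.json import JsonFileReader


async def main() -> None:
    # The async context manager calls close() on exit, which also cleans up
    # any temp files downloaded from the object store.
    async with JsonFileReader(path="/data/input") as reader:
        df = await reader.read()  # pandas DataFrame by default
        print(len(df), "records read")


if __name__ == "__main__":
    asyncio.run(main())
```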
application_sdk/io/json.py (new file):

```diff
@@ -0,0 +1,473 @@
+import os
+from typing import TYPE_CHECKING, Any, AsyncIterator, Dict, List, Optional, Union
+
+import orjson
+from temporalio import activity
+
+from application_sdk.common.types import DataframeType
+from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
+from application_sdk.io.utils import (
+    JSON_FILE_EXTENSION,
+    convert_datetime_to_epoch,
+    download_files,
+    path_gen,
+    process_null_fields,
+)
+from application_sdk.observability.logger_adaptor import get_logger
+from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
+
+if TYPE_CHECKING:
+    import daft
+    import pandas as pd
+
+from application_sdk.io import Reader, Writer
+
+logger = get_logger(__name__)
+activity.logger = logger
+
+
+class JsonFileReader(Reader):
+    """JSON File Reader class to read data from JSON files using daft and pandas.
+
+    Supports reading both single files and directories containing multiple JSON files.
+    Follows Python's file I/O pattern with read/close semantics and supports context managers.
+
+    Attributes:
+        path (str): Path to JSON file or directory containing JSON files.
+        chunk_size (int): Number of rows per batch.
+        file_names (Optional[List[str]]): List of specific file names to read.
+        dataframe_type (DataframeType): Type of dataframe to return (pandas or daft).
+        cleanup_on_close (bool): Whether to clean up downloaded temp files on close.
+
+    Example:
+        Using context manager (recommended)::
+
+            async with JsonFileReader(path="/data/input") as reader:
+                df = await reader.read()
+            # close() called automatically, temp files cleaned up
+
+        Reading in batches::
+
+            async with JsonFileReader(path="/data/input", chunk_size=50000) as reader:
+                async for batch in reader.read_batches():
+                    process(batch)
+
+        Using close() explicitly::
+
+            reader = JsonFileReader(path="/data/input")
+            df = await reader.read()
+            await reader.close()  # Clean up downloaded temp files
+    """
+
+    def __init__(
+        self,
+        path: str,
+        file_names: Optional[List[str]] = None,
+        chunk_size: Optional[int] = 100000,
+        dataframe_type: DataframeType = DataframeType.pandas,
+        cleanup_on_close: bool = True,
+    ):
+        """Initialize the JsonInput class.
+
+        Args:
+            path (str): Path to JSON file or directory containing JSON files.
+                It accepts both types of paths:
+                local path or object store path
+                Wildcards are not supported.
+            file_names (Optional[List[str]]): List of specific file names to read. Defaults to None.
+            chunk_size (int): Number of rows per batch. Defaults to 100000.
+            dataframe_type (DataframeType): Type of dataframe to read. Defaults to DataframeType.pandas.
+            cleanup_on_close (bool): Whether to clean up downloaded temp files on close. Defaults to True.
+
+        Raises:
+            ValueError: When path is not provided or when single file path is combined with file_names
+        """
+        self.extension = JSON_FILE_EXTENSION
+
+        # Validate that single file path and file_names are not both specified
+        if path.endswith(self.extension) and file_names:
+            raise ValueError(
+                f"Cannot specify both a single file path ('{path}') and file_names filter. "
+                f"Either provide a directory path with file_names, or specify the exact file path without file_names."
+            )
+
+        self.path = path
+        self.chunk_size = chunk_size
+        self.file_names = file_names
+        self.dataframe_type = dataframe_type
+        self.cleanup_on_close = cleanup_on_close
+        self._is_closed = False
+        self._downloaded_files: List[str] = []
+
+    async def read(self) -> Union["pd.DataFrame", "daft.DataFrame"]:
+        """Read the data from the JSON files and return as a single DataFrame.
+
+        Returns:
+            Union[pd.DataFrame, daft.DataFrame]: Combined dataframe from JSON files.
+
+        Raises:
+            ValueError: If the reader has been closed or dataframe_type is unsupported.
+        """
+        if self._is_closed:
+            raise ValueError("Cannot read from a closed reader")
+
+        if self.dataframe_type == DataframeType.pandas:
+            return await self._get_dataframe()
+        elif self.dataframe_type == DataframeType.daft:
+            return await self._get_daft_dataframe()
+        else:
+            raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+    def read_batches(
+        self,
+    ) -> Union[
+        AsyncIterator["pd.DataFrame"],
+        AsyncIterator["daft.DataFrame"],
+    ]:
+        """Read the data from the JSON files and return as batched DataFrames.
+
+        Returns:
+            Union[AsyncIterator[pd.DataFrame], AsyncIterator[daft.DataFrame]]:
+                Async iterator of DataFrames.
+
+        Raises:
+            ValueError: If the reader has been closed or dataframe_type is unsupported.
+        """
+        if self._is_closed:
+            raise ValueError("Cannot read from a closed reader")
+
+        if self.dataframe_type == DataframeType.pandas:
+            return self._get_batched_dataframe()
+        elif self.dataframe_type == DataframeType.daft:
+            return self._get_batched_daft_dataframe()
+        else:
+            raise ValueError(f"Unsupported dataframe_type: {self.dataframe_type}")
+
+    async def _get_batched_dataframe(
+        self,
+    ) -> AsyncIterator["pd.DataFrame"]:
+        """Read the data from the JSON files and return as a batched pandas dataframe."""
+        try:
+            import pandas as pd
+
+            # Ensure files are available (local or downloaded)
+            json_files = await download_files(
+                self.path, self.extension, self.file_names
+            )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(json_files)
+            logger.info(f"Reading {len(json_files)} JSON files in batches")
+
+            for json_file in json_files:
+                json_reader_obj = pd.read_json(
+                    json_file,
+                    chunksize=self.chunk_size,
+                    lines=True,
+                )
+                for chunk in json_reader_obj:
+                    yield chunk
+        except Exception as e:
+            logger.error(f"Error reading batched data from JSON: {str(e)}")
+            raise
+
+    async def _get_dataframe(self) -> "pd.DataFrame":
+        """Read the data from the JSON files and return as a single pandas dataframe."""
+        try:
+            import pandas as pd
+
+            # Ensure files are available (local or downloaded)
+            json_files = await download_files(
+                self.path, self.extension, self.file_names
+            )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(json_files)
+            logger.info(f"Reading {len(json_files)} JSON files as pandas dataframe")
+
+            return pd.concat(
+                (pd.read_json(json_file, lines=True) for json_file in json_files),
+                ignore_index=True,
+            )
+
+        except Exception as e:
+            logger.error(f"Error reading data from JSON: {str(e)}")
+            raise
+
+    async def _get_batched_daft_dataframe(
+        self,
+    ) -> AsyncIterator["daft.DataFrame"]:  # noqa: F821
+        """Read the data from the JSON files and return as a batched daft dataframe."""
+        try:
+            import daft
+
+            # Ensure files are available (local or downloaded)
+            json_files = await download_files(
+                self.path, self.extension, self.file_names
+            )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(json_files)
+            logger.info(f"Reading {len(json_files)} JSON files as daft batches")
+
+            # Yield each discovered file as separate batch with chunking
+            for json_file in json_files:
+                yield daft.read_json(json_file, _chunk_size=self.chunk_size)
+        except Exception as e:
+            logger.error(f"Error reading batched data from JSON using daft: {str(e)}")
+            raise
+
+    async def _get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
+        """Read the data from the JSON files and return as a single daft dataframe."""
+        try:
+            import daft
+
+            # Ensure files are available (local or downloaded)
+            json_files = await download_files(
+                self.path, self.extension, self.file_names
+            )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(json_files)
+            logger.info(f"Reading {len(json_files)} JSON files with daft")
+
+            # Use the discovered/downloaded files directly
+            return daft.read_json(json_files)
+        except Exception as e:
+            logger.error(f"Error reading data from JSON using daft: {str(e)}")
+            raise
+
+
+class JsonFileWriter(Writer):
+    """Output handler for writing data to JSON files.
+
+    This class provides functionality for writing data to JSON files with support
+    for chunking large datasets, buffering, and automatic file path generation.
+    It can handle both pandas and daft DataFrames as input.
+
+    The output can be written to local files and optionally uploaded to an object
+    store. Files are named using a configurable path generation scheme that
+    includes chunk numbers for split files.
+
+    Attributes:
+        path (str): Full path where JSON files will be written.
+        typename (Optional[str]): Type identifier for the data being written.
+        chunk_start (Optional[int]): Starting index for chunk numbering.
+        buffer_size (int): Size of the write buffer in bytes.
+        chunk_size (int): Maximum number of records per chunk.
+        total_record_count (int): Total number of records processed.
+        chunk_count (int): Number of chunks written.
+        buffer (List[Union[pd.DataFrame, daft.DataFrame]]): Buffer for accumulating
+            data before writing.
+    """
+
+    def __init__(
+        self,
+        path: str,
+        typename: Optional[str] = None,
+        chunk_start: Optional[int] = None,
+        buffer_size: Optional[int] = 5000,
+        chunk_size: Optional[int] = 50000,  # to limit the memory usage on upload
+        total_record_count: Optional[int] = 0,
+        chunk_count: Optional[int] = 0,
+        start_marker: Optional[str] = None,
+        end_marker: Optional[str] = None,
+        retain_local_copy: Optional[bool] = False,
+        dataframe_type: DataframeType = DataframeType.pandas,
+        **kwargs: Dict[str, Any],
+    ):
+        """Initialize the JSON output handler.
+
+        Args:
+            path (str): Full path where JSON files will be written.
+            typename (Optional[str], optional): Type identifier for the data being written.
+                If provided, a subdirectory with this name will be created under path.
+                Defaults to None.
+            chunk_start (Optional[int], optional): Starting index for chunk numbering.
+                Defaults to None.
+            buffer_size (int, optional): Size of the buffer in bytes.
+                Defaults to 10MB (1024 * 1024 * 10).
+            chunk_size (Optional[int], optional): Maximum number of records per chunk. If None, uses config value.
+                Defaults to None.
+            total_record_count (int, optional): Initial total record count.
+                Defaults to 0.
+            chunk_count (int, optional): Initial chunk count.
+                Defaults to 0.
+            retain_local_copy (bool, optional): Whether to retain the local copy of the files.
+                Defaults to False.
+            dataframe_type (DataframeType, optional): Type of dataframe to write. Defaults to DataframeType.pandas.
+        """
+        self.path = path
+        self.typename = typename
+        self.chunk_start = chunk_start
+        self.total_record_count = total_record_count
+        self.chunk_count = chunk_count
+        self.buffer_size = buffer_size
+        self.chunk_size = chunk_size or 50000  # to limit the memory usage on upload
+        self.buffer: List[Union["pd.DataFrame", "daft.DataFrame"]] = []  # noqa: F821
+        self.current_buffer_size = 0
+        self.current_buffer_size_bytes = 0  # Track estimated buffer size in bytes
+        self.max_file_size_bytes = int(
+            DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
+        )  # 90% of DAPR limit as safety buffer
+        self.start_marker = start_marker
+        self.end_marker = end_marker
+        self.partitions = []
+        self.chunk_part = 0
+        self.metrics = get_metrics()
+        self.retain_local_copy = retain_local_copy
+        self.extension = JSON_FILE_EXTENSION
+        self.dataframe_type = dataframe_type
+        self._is_closed = False
+        self._statistics = None
+
+        if not self.path:
+            raise ValueError("path is required")
+
+        if typename:
+            self.path = os.path.join(self.path, typename)
+            os.makedirs(self.path, exist_ok=True)
+
+        if self.chunk_start:
+            self.chunk_count = self.chunk_start + self.chunk_count
+
+    async def _write_daft_dataframe(
+        self,
+        dataframe: "daft.DataFrame",
+        preserve_fields: Optional[List[str]] = None,
+        null_to_empty_dict_fields: Optional[List[str]] = None,
+        **kwargs,
+    ):  # noqa: F821
+        """Write a daft DataFrame to JSON files.
+
+        This method converts the daft DataFrame to pandas and writes it to JSON files.
+
+        Args:
+            dataframe (daft.DataFrame): The DataFrame to write.
+            preserve_fields (Optional[List[str]]): List of fields to preserve during null processing.
+                Defaults to ["identity_cycle", "number_columns_in_part_key",
+                "columns_participating_in_part_key", "engine", "is_insertable_into", "is_typed"].
+            null_to_empty_dict_fields (Optional[List[str]]): List of fields to convert from null to empty dict.
+                Defaults to ["attributes", "customAttributes"].
+
+        Note:
+            Daft does not have built-in JSON writing support, so we are using orjson.
+        """
+        # Initialize default values for mutable arguments
+        if preserve_fields is None:
+            preserve_fields = [
+                "identity_cycle",
+                "number_columns_in_part_key",
+                "columns_participating_in_part_key",
+                "engine",
+                "is_insertable_into",
+                "is_typed",
+            ]
+        if null_to_empty_dict_fields is None:
+            null_to_empty_dict_fields = [
+                "attributes",
+                "customAttributes",
+            ]
+
+        try:
+            if self.chunk_start is None:
+                self.chunk_part = 0
+
+            buffer = []
+            for row in dataframe.iter_rows():
+                self.total_record_count += 1
+                # Convert datetime fields to epoch timestamps before serialization
+                row = convert_datetime_to_epoch(row)
+                # Remove null attributes from the row recursively, preserving specified fields
+                cleaned_row = process_null_fields(
+                    row, preserve_fields, null_to_empty_dict_fields
+                )
+                # Serialize the row and add it to the buffer
+                serialized_row = orjson.dumps(
+                    cleaned_row, option=orjson.OPT_APPEND_NEWLINE
+                )
+                buffer.append(serialized_row)
+                self.current_buffer_size += 1
+                self.current_buffer_size_bytes += len(serialized_row)
+
+                # If the buffer size is reached append to the file and clear the buffer
+                if self.current_buffer_size >= self.buffer_size:
+                    await self._flush_daft_buffer(buffer, self.chunk_part)
+
+                if self.current_buffer_size_bytes > self.max_file_size_bytes or (
+                    self.total_record_count > 0
+                    and self.total_record_count % self.chunk_size == 0
+                ):
+                    output_file_name = f"{self.path}/{path_gen(self.chunk_count, self.chunk_part, self.start_marker, self.end_marker, extension=self.extension)}"
+                    if os.path.exists(output_file_name):
+                        await self._upload_file(output_file_name)
+                        self.chunk_part += 1
+
+            # Write any remaining rows in the buffer
+            if self.current_buffer_size > 0:
+                await self._flush_daft_buffer(buffer, self.chunk_part)
+
+            # Record metrics for successful write
+            self.metrics.record_metric(
+                name="json_write_records",
+                value=dataframe.count_rows(),
+                metric_type=MetricType.COUNTER,
+                labels={"type": "daft"},
+                description="Number of records written to JSON files from daft DataFrame",
+            )
+        except Exception as e:
+            # Record metrics for failed write
+            self.metrics.record_metric(
+                name="json_write_errors",
+                value=1,
+                metric_type=MetricType.COUNTER,
+                labels={"type": "daft", "error": str(e)},
+                description="Number of errors while writing to JSON files",
+            )
+            logger.error(f"Error writing daft dataframe to json: {str(e)}")
+            raise
+
+    async def _flush_daft_buffer(self, buffer: List[str], chunk_part: int):
+        """Flush the current buffer to a JSON file.
+
+        This method combines all DataFrames in the buffer, writes them to a JSON file,
+        and uploads the file to the object store.
+        """
+        output_file_name = f"{self.path}/{path_gen(self.chunk_count, chunk_part, self.start_marker, self.end_marker, extension=self.extension)}"
+        with open(output_file_name, "ab+") as f:
+            f.writelines(buffer)
+            buffer.clear()  # Clear the buffer
+
+        self.current_buffer_size = 0
+
+        # Record chunk metrics
+        self.metrics.record_metric(
+            name="json_chunks_written",
+            value=1,
+            metric_type=MetricType.COUNTER,
+            labels={"type": "daft"},
+            description="Number of chunks written to JSON files",
+        )
+
+    async def _write_chunk(self, chunk: "pd.DataFrame", file_name: str):
+        """Write a chunk to a JSON file.
+
+        This method writes a chunk to a JSON file and uploads the file to the object store.
+        """
+        mode = "w" if not os.path.exists(file_name) else "a"
+        with open(file_name, mode=mode) as f:
+            chunk.to_json(f, orient="records", lines=True)
+
+    async def _finalize(self) -> None:
+        """Finalize the JSON writer before closing.
+
+        Uploads any remaining buffered data to the object store.
+        """
+        # Upload the final file if there's remaining buffered data
+        if self.current_buffer_size_bytes > 0:
+            output_file_name = f"{self.path}/{path_gen(self.chunk_count, self.chunk_part, self.start_marker, self.end_marker, extension=self.extension)}"
+            if os.path.exists(output_file_name):
+                await self._upload_file(output_file_name)
+                self.chunk_part += 1
+
+        # If chunk_start is set we don't want to increment the chunk_count
+        # Since it should only increment the chunk_part in this case
+        if self.chunk_start is None:
+            self.chunk_count += 1
+        self.partitions.append(self.chunk_part)
```
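
The docstring example only covers the default pandas path. For the daft-backed path, here is a hedged sketch built from the constructor arguments and the read_batches() signature shown in the diff above: passing dataframe_type=DataframeType.daft makes each yielded batch a daft DataFrame, one per discovered JSON file. The directory path is a placeholder and error handling is minimal.

```python
# Sketch only: batched reads with the daft backend, using the JsonFileReader
# constructor and read_batches() as defined in the diff above. The path passed
# in is a placeholder; each batch corresponds to one JSON file read via daft.
import asyncio

from application_sdk.common.types import DataframeType
from application_sdk.io.json import JsonFileReader


async def count_rows(path: str) -> int:
    reader = JsonFileReader(
        path=path,                          # local or object store directory
        chunk_size=50_000,                  # rows-per-batch hint
        dataframe_type=DataframeType.daft,  # yield daft.DataFrame batches
    )
    total = 0
    try:
        # read_batches() returns an async iterator of daft DataFrames.
        async for batch in reader.read_batches():
            total += batch.count_rows()
    finally:
        await reader.close()  # clean up any downloaded temp files
    return total


if __name__ == "__main__":
    print(asyncio.run(count_rows("/data/input")))
```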