atlan-application-sdk 0.1.1rc43__py3-none-any.whl → 0.1.1rc44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/metadata_extraction/sql.py +9 -35
- application_sdk/activities/query_extraction/sql.py +0 -2
- application_sdk/inputs/parquet.py +15 -3
- application_sdk/inputs/sql_query.py +2 -2
- application_sdk/interceptors/cleanup.py +0 -1
- application_sdk/outputs/__init__.py +176 -12
- application_sdk/outputs/json.py +57 -181
- application_sdk/outputs/parquet.py +230 -161
- application_sdk/transformers/query/__init__.py +1 -1
- application_sdk/version.py +1 -1
- application_sdk/workflows/metadata_extraction/sql.py +5 -4
- {atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/METADATA +1 -1
- {atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/RECORD +16 -16
- {atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/licenses/NOTICE +0 -0
application_sdk/outputs/json.py
CHANGED
@@ -1,16 +1,15 @@
 import os
 from datetime import datetime
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

 import orjson
 from temporalio import activity

-from application_sdk.activities.common.
+from application_sdk.activities.common.models import ActivityStatistics
 from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
 from application_sdk.observability.logger_adaptor import get_logger
 from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
 from application_sdk.outputs import Output
-from application_sdk.services.objectstore import ObjectStore

 logger = get_logger(__name__)
 activity.logger = logger
@@ -20,22 +19,6 @@ if TYPE_CHECKING:
     import pandas as pd


-def path_gen(chunk_start: int | None, chunk_count: int) -> str:
-    """Generate a file path for a chunk.
-
-    Args:
-        chunk_start (int | None): Starting index of the chunk, or None for single chunk.
-        chunk_count (int): Total number of chunks.
-
-    Returns:
-        str: Generated file path for the chunk.
-    """
-    if chunk_start is None:
-        return f"{str(chunk_count)}.json"
-    else:
-        return f"chunk-{chunk_start}-part{chunk_count}.json"
-
-
 def convert_datetime_to_epoch(data: Any) -> Any:
     """Convert datetime objects to epoch timestamps in milliseconds.

@@ -68,7 +51,6 @@ class JsonOutput(Output):
     Attributes:
         output_path (Optional[str]): Base path where JSON files will be written.
         output_suffix (str): Suffix added to file paths when uploading to object store.
-        output_prefix (Optional[str]): Prefix for output files and object store paths.
        typename (Optional[str]): Type identifier for the data being written.
        chunk_start (Optional[int]): Starting index for chunk numbering.
        buffer_size (int): Size of the write buffer in bytes.
@@ -79,18 +61,18 @@ class JsonOutput(Output):
            data before writing.
     """

+    _EXTENSION = ".json"
+
     def __init__(
         self,
         output_suffix: str,
         output_path: Optional[str] = None,
-        output_prefix: Optional[str] = None,
         typename: Optional[str] = None,
         chunk_start: Optional[int] = None,
-        buffer_size: int =
-        chunk_size: Optional[int] =
+        buffer_size: int = 5000,
+        chunk_size: Optional[int] = 50000,  # to limit the memory usage on upload
         total_record_count: int = 0,
         chunk_count: int = 0,
-        path_gen: Callable[[int | None, int], str] = path_gen,
         start_marker: Optional[str] = None,
         end_marker: Optional[str] = None,
         retain_local_copy: bool = False,
@@ -101,7 +83,6 @@ class JsonOutput(Output):
        Args:
            output_path (str): Path where JSON files will be written.
            output_suffix (str): Prefix for files when uploading to object store.
-            output_prefix (Optional[str], optional): Prefix for files where the files will be written and uploaded.
            chunk_start (Optional[int], optional): Starting index for chunk numbering.
                Defaults to None.
            buffer_size (int, optional): Size of the buffer in bytes.
@@ -112,29 +93,27 @@ class JsonOutput(Output):
                Defaults to 0.
            chunk_count (int, optional): Initial chunk count.
                Defaults to 0.
-            path_gen (Callable, optional): Function to generate file paths.
-                Defaults to path_gen function.
            retain_local_copy (bool, optional): Whether to retain the local copy of the files.
                Defaults to False.
        """
        self.output_path = output_path
        self.output_suffix = output_suffix
-        self.output_prefix = output_prefix
        self.typename = typename
        self.chunk_start = chunk_start
        self.total_record_count = total_record_count
        self.chunk_count = chunk_count
        self.buffer_size = buffer_size
-        self.chunk_size = chunk_size or
+        self.chunk_size = chunk_size or 50000  # to limit the memory usage on upload
        self.buffer: List[Union["pd.DataFrame", "daft.DataFrame"]] = []  # noqa: F821
        self.current_buffer_size = 0
        self.current_buffer_size_bytes = 0  # Track estimated buffer size in bytes
        self.max_file_size_bytes = int(
            DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
        )  # 90% of DAPR limit as safety buffer
-        self.path_gen = path_gen
        self.start_marker = start_marker
        self.end_marker = end_marker
+        self.partitions = []
+        self.chunk_part = 0
        self.metrics = get_metrics()
        self.retain_local_copy = retain_local_copy

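The constructor drops `output_prefix` and the injectable `path_gen`, pins `buffer_size` to 5000 rows and `chunk_size` to 50000 records, and starts tracking `partitions` and `chunk_part`. A rough sketch of how the three write thresholds relate, assuming a 4 MiB value for `DAPR_MAX_GRPC_MESSAGE_LENGTH` purely to make the arithmetic concrete (not the SDK's actual constant):

```python
# Illustrative arithmetic only: mirrors the defaults in this hunk, not SDK internals.
DAPR_MAX_GRPC_MESSAGE_LENGTH = 4 * 1024 * 1024  # assumed 4 MiB for this sketch

buffer_size = 5000   # rows buffered in memory before a flush to the current chunk file
chunk_size = 50000   # records per chunk, to limit memory usage on upload
max_file_size_bytes = int(DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9)  # 90% safety margin

print(max_file_size_bytes)  # 3774873 bytes with the assumed 4 MiB limit
```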
@@ -146,81 +125,8 @@ class JsonOutput(Output):
            self.output_path = os.path.join(self.output_path, typename)
        os.makedirs(self.output_path, exist_ok=True)

-
-
-            self.path_gen = (
-                lambda chunk_start,
-                chunk_count: f"{self.start_marker}_{self.end_marker}.json"
-            )
-
-    async def write_dataframe(self, dataframe: "pd.DataFrame"):
-        """Write a pandas DataFrame to JSON files.
-
-        This method writes the DataFrame to JSON files, potentially splitting it
-        into chunks based on chunk_size and buffer_size settings.
-
-        Args:
-            dataframe (pd.DataFrame): The DataFrame to write.
-
-        Note:
-            If the DataFrame is empty, the method returns without writing.
-        """
-        if len(dataframe) == 0:
-            return
-
-        try:
-            # Split the DataFrame into chunks
-            partition = (
-                self.chunk_size
-                if self.chunk_start is None
-                else min(self.chunk_size, self.buffer_size)
-            )
-            chunks = [
-                dataframe[i : i + partition]
-                for i in range(0, len(dataframe), partition)
-            ]
-
-            for chunk in chunks:
-                # Estimate size of this chunk
-                chunk_size_bytes = self.estimate_dataframe_file_size(chunk, "json")
-
-                # Check if adding this chunk would exceed size limit
-                if (
-                    self.current_buffer_size_bytes + chunk_size_bytes
-                    > self.max_file_size_bytes
-                    and self.current_buffer_size > 0
-                ):
-                    # Flush current buffer before adding this chunk
-                    await self._flush_buffer()
-
-                self.buffer.append(chunk)
-                self.current_buffer_size += len(chunk)
-                self.current_buffer_size_bytes += chunk_size_bytes
-
-                if self.current_buffer_size >= partition:
-                    await self._flush_buffer()
-
-            await self._flush_buffer()
-
-            # Record metrics for successful write
-            self.metrics.record_metric(
-                name="json_write_records",
-                value=len(dataframe),
-                metric_type=MetricType.COUNTER,
-                labels={"type": "pandas"},
-                description="Number of records written to JSON files from pandas DataFrame",
-            )
-
-        except Exception as e:
-            # Record metrics for failed write
-            self.metrics.record_metric(
-                name="json_write_errors",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={"type": "pandas", "error": str(e)},
-                description="Number of errors while writing to JSON files",
-            )
-            logger.error(f"Error writing dataframe to json: {str(e)}")
+        if self.chunk_start:
+            self.chunk_count = self.chunk_start + self.chunk_count

    async def write_daft_dataframe(
        self,
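The pandas-specific `write_dataframe` implementation is removed from `JsonOutput`; the base `Output` module (`application_sdk/outputs/__init__.py`, +176 lines in this release) presumably absorbs the shared write path. What remains here is an offset so that numbering continues from `chunk_start`. A hypothetical illustration of that offset, with made-up values:

```python
# Hypothetical numbers illustrating the chunk numbering offset added above.
chunk_start = 100  # e.g. this output resumes after 100 chunks written elsewhere
chunk_count = 0    # chunks produced by this JsonOutput instance so far

if chunk_start:
    chunk_count = chunk_start + chunk_count

print(chunk_count)  # 100: new chunks continue from the resumed position
```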
@@ -249,6 +155,9 @@ class JsonOutput(Output):
        Daft does not have built-in JSON writing support, so we are using orjson.
        """
        try:
+            if self.chunk_start is None:
+                self.chunk_part = 0
+
            buffer = []
            for row in dataframe.iter_rows():
                self.total_record_count += 1
@@ -261,17 +170,27 @@ class JsonOutput(Output):
                # Serialize the row and add it to the buffer
                serialized_row = orjson.dumps(
                    cleaned_row, option=orjson.OPT_APPEND_NEWLINE
-                )
+                )
                buffer.append(serialized_row)
+                self.current_buffer_size += 1
                self.current_buffer_size_bytes += len(serialized_row)
-
-
+
+                # If the buffer size is reached append to the file and clear the buffer
+                if self.current_buffer_size >= self.buffer_size:
+                    await self.flush_daft_buffer(buffer, self.chunk_part)
+
+                if self.current_buffer_size_bytes > self.max_file_size_bytes or (
+                    self.total_record_count > 0
+                    and self.total_record_count % self.chunk_size == 0
                ):
-
+                    output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_count, self.chunk_part, self.start_marker, self.end_marker)}"
+                    if os.path.exists(output_file_name):
+                        await self._upload_file(output_file_name)
+                    self.chunk_part += 1

            # Write any remaining rows in the buffer
-            if
-                await self.flush_daft_buffer(buffer)
+            if self.current_buffer_size > 0:
+                await self.flush_daft_buffer(buffer, self.chunk_part)

            # Record metrics for successful write
            self.metrics.record_metric(
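The daft write loop now flushes on two independent triggers: a row-count threshold (`buffer_size`) that appends the in-memory buffer to the current chunk file, and a byte-size or record-count boundary that uploads the finished file and advances `chunk_part`. A standalone sketch of that control flow, with deliberately tiny thresholds and the file I/O and object-store upload replaced by prints (the real code also checks that the chunk file exists before uploading):

```python
# Standalone sketch of the flush/rotate logic above; names mirror the diff, values are made up.
buffer_size = 3          # rows per flush (real default: 5000)
chunk_size = 6           # records per chunk (real default: 50000)
max_file_size_bytes = 10_000

rows = [{"id": i} for i in range(14)]
buffer: list[dict] = []
current_buffer_size = 0
current_buffer_size_bytes = 0
total_record_count = 0
chunk_part = 0


def flush(buf: list[dict], part: int) -> None:
    # Stand-in for flush_daft_buffer(buffer, chunk_part): append rows to the part file.
    print(f"append {len(buf)} rows to chunk part {part}")
    buf.clear()


for row in rows:
    total_record_count += 1
    buffer.append(row)
    current_buffer_size += 1
    current_buffer_size_bytes += 20  # pretend every serialized row is 20 bytes

    if current_buffer_size >= buffer_size:
        flush(buffer, chunk_part)
        current_buffer_size = 0  # the real flush resets this counter

    if current_buffer_size_bytes > max_file_size_bytes or (
        total_record_count > 0 and total_record_count % chunk_size == 0
    ):
        print(f"upload finished file for chunk part {chunk_part}")
        chunk_part += 1

# Write any remaining rows in the buffer.
if current_buffer_size > 0:
    flush(buffer, chunk_part)
```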
@@ -281,14 +200,6 @@ class JsonOutput(Output):
                labels={"type": "daft"},
                description="Number of records written to JSON files from daft DataFrame",
            )
-
-            # Push files to the object store
-            await ObjectStore.upload_prefix(
-                source=self.output_path,
-                destination=get_object_store_prefix(self.output_path),
-                retain_local_copy=self.retain_local_copy,
-            )
-
        except Exception as e:
            # Record metrics for failed write
            self.metrics.record_metric(
@@ -300,22 +211,20 @@ class JsonOutput(Output):
            )
            logger.error(f"Error writing daft dataframe to json: {str(e)}")

-    async def flush_daft_buffer(self, buffer: List[str]):
+    async def flush_daft_buffer(self, buffer: List[str], chunk_part: int):
        """Flush the current buffer to a JSON file.

        This method combines all DataFrames in the buffer, writes them to a JSON file,
        and uploads the file to the object store.
        """
-        self.chunk_count += 1
        output_file_name = (
-            f"{self.output_path}/{self.path_gen(self.
+            f"{self.output_path}/{self.path_gen(self.chunk_count, chunk_part)}"
        )
-        with open(output_file_name, "
+        with open(output_file_name, "ab+") as f:
            f.writelines(buffer)
            buffer.clear()  # Clear the buffer

        self.current_buffer_size = 0
-        self.current_buffer_size_bytes = 0

        # Record chunk metrics
        self.metrics.record_metric(
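`flush_daft_buffer` now takes the chunk part explicitly, opens the target file in append mode (`"ab+"`), and no longer increments `chunk_count` itself, so several flushes can accumulate into the same chunk file before it is rotated and uploaded. A minimal sketch of that append-mode JSONL pattern with orjson; the file name is a made-up example, since the SDK derives it via `path_gen`:

```python
import orjson

# Three serialized rows, newline-terminated, as write_daft_dataframe produces them.
buffer = [orjson.dumps({"id": i}, option=orjson.OPT_APPEND_NEWLINE) for i in range(3)]

output_file_name = "1-part0.json"  # hypothetical name for this sketch
with open(output_file_name, "ab+") as f:  # append: repeated flushes grow the same file
    f.writelines(buffer)
buffer.clear()
```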
@@ -326,67 +235,34 @@ class JsonOutput(Output):
            description="Number of chunks written to JSON files",
        )

-    async def
-        """
-
-        This method combines all DataFrames in the buffer, writes them to a JSON file,
-        and uploads the file to the object store.
+    async def write_chunk(self, chunk: "pd.DataFrame", file_name: str):
+        """Write a chunk to a JSON file.

-
-        If the buffer is empty or has no records, the method returns without writing.
+        This method writes a chunk to a JSON file and uploads the file to the object store.
        """
-
+        mode = "w" if not os.path.exists(file_name) else "a"
+        chunk.to_json(file_name, orient="records", lines=True, mode=mode)

-
-
+    async def get_statistics(
+        self, typename: Optional[str] = None
+    ) -> ActivityStatistics:
+        """Get the statistics of the JSON files.

-
-
-
+        This method returns the statistics of the JSON files.
+        """
+        # Finally upload the final file
+        if self.current_buffer_size_bytes > 0:
+            output_file_name = (
+                f"{self.output_path}/{self.path_gen(self.chunk_count, self.chunk_part)}"
            )
+            if os.path.exists(output_file_name):
+                await self._upload_file(output_file_name)
+            self.chunk_part += 1

-
-
-
-
-
-            # Write DataFrame to JSON file
-            if not combined_dataframe.empty:
-                self.chunk_count += 1
-                self.total_record_count += len(combined_dataframe)
-                output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count)}"
-                combined_dataframe.to_json(
-                    output_file_name, orient="records", lines=True
-                )
-
-                # Record chunk metrics
-                self.metrics.record_metric(
-                    name="json_chunks_written",
-                    value=1,
-                    metric_type=MetricType.COUNTER,
-                    labels={"type": "pandas"},
-                    description="Number of chunks written to JSON files",
-                )
-
-                # Push the file to the object store
-                await ObjectStore.upload_file(
-                    source=output_file_name,
-                    destination=get_object_store_prefix(output_file_name),
-                    retain_local_copy=self.retain_local_copy,
-                )
-
-                self.buffer.clear()
-                self.current_buffer_size = 0
-                self.current_buffer_size_bytes = 0
+        # If chunk_start is set we don't want to increment the chunk_count
+        # Since it should only increment the chunk_part in this case
+        if self.chunk_start is None:
+            self.chunk_count += 1
+        self.partitions.append(self.chunk_part)

-
-            # Record metrics for failed write
-            self.metrics.record_metric(
-                name="json_write_errors",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={"type": "pandas", "error": str(e)},
-                description="Number of errors while writing to JSON files",
-            )
-            logger.error(f"Error flushing buffer to json: {str(e)}")
-            raise e
+        return await super().get_statistics(typename)