atlan-application-sdk 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- application_sdk/activities/common/sql_utils.py +308 -0
- application_sdk/activities/common/utils.py +1 -45
- application_sdk/activities/metadata_extraction/sql.py +110 -353
- application_sdk/activities/query_extraction/sql.py +12 -11
- application_sdk/application/__init__.py +1 -1
- application_sdk/clients/sql.py +167 -1
- application_sdk/clients/temporal.py +6 -6
- application_sdk/common/types.py +8 -0
- application_sdk/common/utils.py +1 -8
- application_sdk/constants.py +1 -1
- application_sdk/handlers/sql.py +10 -25
- application_sdk/interceptors/events.py +1 -1
- application_sdk/io/__init__.py +654 -0
- application_sdk/io/json.py +429 -0
- application_sdk/{outputs → io}/parquet.py +358 -47
- application_sdk/io/utils.py +307 -0
- application_sdk/observability/observability.py +23 -12
- application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
- application_sdk/server/fastapi/middleware/metrics.py +27 -24
- application_sdk/server/fastapi/models.py +1 -1
- application_sdk/server/fastapi/routers/server.py +1 -1
- application_sdk/server/fastapi/utils.py +10 -0
- application_sdk/services/eventstore.py +4 -4
- application_sdk/services/objectstore.py +30 -7
- application_sdk/services/secretstore.py +1 -1
- application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
- application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
- application_sdk/version.py +1 -1
- application_sdk/worker.py +1 -1
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/METADATA +9 -11
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/RECORD +36 -43
- application_sdk/common/dataframe_utils.py +0 -42
- application_sdk/events/__init__.py +0 -5
- application_sdk/inputs/.cursor/BUGBOT.md +0 -250
- application_sdk/inputs/__init__.py +0 -168
- application_sdk/inputs/iceberg.py +0 -75
- application_sdk/inputs/json.py +0 -136
- application_sdk/inputs/parquet.py +0 -272
- application_sdk/inputs/sql_query.py +0 -271
- application_sdk/outputs/.cursor/BUGBOT.md +0 -295
- application_sdk/outputs/__init__.py +0 -445
- application_sdk/outputs/iceberg.py +0 -139
- application_sdk/outputs/json.py +0 -268
- /application_sdk/{events → interceptors}/models.py +0 -0
- /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/NOTICE +0 -0

--- a/application_sdk/outputs/__init__.py
+++ /dev/null
@@ -1,445 +0,0 @@
-"""Output module for handling data output operations.
-
-This module provides base classes and utilities for handling various types of data outputs
-in the application, including file outputs and object store interactions.
-"""
-
-import gc
-import inspect
-import os
-from abc import ABC, abstractmethod
-from enum import Enum
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    AsyncGenerator,
-    Dict,
-    Generator,
-    List,
-    Optional,
-    Union,
-    cast,
-)
-
-import orjson
-from temporalio import activity
-
-from application_sdk.activities.common.models import ActivityStatistics
-from application_sdk.activities.common.utils import get_object_store_prefix
-from application_sdk.common.dataframe_utils import is_empty_dataframe
-from application_sdk.observability.logger_adaptor import get_logger
-from application_sdk.observability.metrics_adaptor import MetricType
-from application_sdk.services.objectstore import ObjectStore
-
-logger = get_logger(__name__)
-activity.logger = logger
-
-
-if TYPE_CHECKING:
-    import daft  # type: ignore
-    import pandas as pd
-
-
-class WriteMode(Enum):
-    """Enumeration of write modes for output operations."""
-
-    APPEND = "append"
-    OVERWRITE = "overwrite"
-    OVERWRITE_PARTITIONS = "overwrite-partitions"
-
-
-class Output(ABC):
-    """Abstract base class for output handlers.
-
-    This class defines the interface for output handlers that can write data
-    to various destinations in different formats.
-
-    Attributes:
-        output_path (str): Path where the output will be written.
-        upload_file_prefix (str): Prefix for files when uploading to object store.
-        total_record_count (int): Total number of records processed.
-        chunk_count (int): Number of chunks the output was split into.
-    """
-
-    output_path: str
-    output_prefix: str
-    total_record_count: int
-    chunk_count: int
-    chunk_part: int
-    buffer_size: int
-    max_file_size_bytes: int
-    current_buffer_size: int
-    current_buffer_size_bytes: int
-    partitions: List[int]
-
-    def estimate_dataframe_record_size(self, dataframe: "pd.DataFrame") -> int:
-        """Estimate File size of a DataFrame by sampling a few records."""
-        if len(dataframe) == 0:
-            return 0
-
-        # Sample up to 10 records to estimate average size
-        sample_size = min(10, len(dataframe))
-        sample = dataframe.head(sample_size)
-        file_type = type(self).__name__.lower().replace("output", "")
-        compression_factor = 1
-        if file_type == "json":
-            sample_file = sample.to_json(orient="records", lines=True)
-        else:
-            sample_file = sample.to_parquet(index=False, compression="snappy")
-            compression_factor = 0.01
-        if sample_file is not None:
-            avg_record_size = len(sample_file) / sample_size * compression_factor
-            return int(avg_record_size)
-
-        return 0
-
-    def path_gen(
-        self,
-        chunk_count: Optional[int] = None,
-        chunk_part: int = 0,
-        start_marker: Optional[str] = None,
-        end_marker: Optional[str] = None,
-    ) -> str:
-        """Generate a file path for a chunk.
-
-        Args:
-            chunk_start (Optional[int]): Starting index of the chunk, or None for single chunk.
-            chunk_count (int): Total number of chunks.
-            start_marker (Optional[str]): Start marker for query extraction.
-            end_marker (Optional[str]): End marker for query extraction.
-
-        Returns:
-            str: Generated file path for the chunk.
-        """
-        # For Query Extraction - use start and end markers without chunk count
-        if start_marker and end_marker:
-            return f"{start_marker}_{end_marker}{self._EXTENSION}"
-
-        # For regular chunking - include chunk count
-        if chunk_count is None:
-            return f"{str(chunk_part)}{self._EXTENSION}"
-        else:
-            return f"chunk-{str(chunk_count)}-part{str(chunk_part)}{self._EXTENSION}"
-
-    def process_null_fields(
-        self,
-        obj: Any,
-        preserve_fields: Optional[List[str]] = None,
-        null_to_empty_dict_fields: Optional[List[str]] = None,
-    ) -> Any:
-        """
-        By default the method removes null values from dictionaries and lists.
-        Except for the fields specified in preserve_fields.
-        And fields in null_to_empty_dict_fields are replaced with empty dict if null.
-
-        Args:
-            obj: The object to clean (dict, list, or other value)
-            preserve_fields: Optional list of field names that should be preserved even if they contain null values
-            null_to_empty_dict_fields: Optional list of field names that should be replaced with empty dict if null
-
-        Returns:
-            The cleaned object with null values removed
-        """
-        if isinstance(obj, dict):
-            result = {}
-            for k, v in obj.items():
-                # Handle null fields that should be converted to empty dicts
-                if k in (null_to_empty_dict_fields or []) and v is None:
-                    result[k] = {}
-                    continue
-
-                # Process the value recursively
-                processed_value = self.process_null_fields(
-                    v, preserve_fields, null_to_empty_dict_fields
-                )
-
-                # Keep the field if it's in preserve_fields or has a non-None processed value
-                if k in (preserve_fields or []) or processed_value is not None:
-                    result[k] = processed_value
-
-            return result
-        return obj
-
-    async def write_batched_dataframe(
-        self,
-        batched_dataframe: Union[
-            AsyncGenerator["pd.DataFrame", None], Generator["pd.DataFrame", None, None]
-        ],
-    ):
-        """Write a batched pandas DataFrame to Output.
-
-        This method writes the DataFrame to Output provided, potentially splitting it
-        into chunks based on chunk_size and buffer_size settings.
-
-        Args:
-            dataframe (pd.DataFrame): The DataFrame to write.
-
-        Note:
-            If the DataFrame is empty, the method returns without writing.
-        """
-        try:
-            if inspect.isasyncgen(batched_dataframe):
-                async for dataframe in batched_dataframe:
-                    if not is_empty_dataframe(dataframe):
-                        await self.write_dataframe(dataframe)
-            else:
-                # Cast to Generator since we've confirmed it's not an AsyncGenerator
-                sync_generator = cast(
-                    Generator["pd.DataFrame", None, None], batched_dataframe
-                )
-                for dataframe in sync_generator:
-                    if not is_empty_dataframe(dataframe):
-                        await self.write_dataframe(dataframe)
-        except Exception as e:
-            logger.error(f"Error writing batched dataframe: {str(e)}")
-            raise
-
-    async def write_dataframe(self, dataframe: "pd.DataFrame"):
-        """Write a pandas DataFrame to Parquet files and upload to object store.
-
-        Args:
-            dataframe (pd.DataFrame): The DataFrame to write.
-        """
-        try:
-            if self.chunk_start is None:
-                self.chunk_part = 0
-            if len(dataframe) == 0:
-                return
-
-            chunk_size_bytes = self.estimate_dataframe_record_size(dataframe)
-
-            for i in range(0, len(dataframe), self.buffer_size):
-                chunk = dataframe[i : i + self.buffer_size]
-
-                if (
-                    self.current_buffer_size_bytes + chunk_size_bytes
-                    > self.max_file_size_bytes
-                ):
-                    output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_count, self.chunk_part)}"
-                    if os.path.exists(output_file_name):
-                        await self._upload_file(output_file_name)
-                        self.chunk_part += 1
-
-                self.current_buffer_size += len(chunk)
-                self.current_buffer_size_bytes += chunk_size_bytes * len(chunk)
-                await self._flush_buffer(chunk, self.chunk_part)
-
-                del chunk
-                gc.collect()
-
-            if self.current_buffer_size_bytes > 0:
-                # Finally upload the final file to the object store
-                output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_count, self.chunk_part)}"
-                if os.path.exists(output_file_name):
-                    await self._upload_file(output_file_name)
-                    self.chunk_part += 1
-
-            # Record metrics for successful write
-            self.metrics.record_metric(
-                name="write_records",
-                value=len(dataframe),
-                metric_type=MetricType.COUNTER,
-                labels={"type": "pandas", "mode": WriteMode.APPEND.value},
-                description="Number of records written to files from pandas DataFrame",
-            )
-
-            # Record chunk metrics
-            self.metrics.record_metric(
-                name="chunks_written",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={"type": "pandas", "mode": WriteMode.APPEND.value},
-                description="Number of chunks written to files",
-            )
-
-            # If chunk_start is set we don't want to increment the chunk_count
-            # Since it should only increment the chunk_part in this case
-            if self.chunk_start is None:
-                self.chunk_count += 1
-                self.partitions.append(self.chunk_part)
-        except Exception as e:
-            # Record metrics for failed write
-            self.metrics.record_metric(
-                name="write_errors",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={
-                    "type": "pandas",
-                    "mode": WriteMode.APPEND.value,
-                    "error": str(e),
-                },
-                description="Number of errors while writing to files",
-            )
-            logger.error(f"Error writing pandas dataframe to files: {str(e)}")
-            raise
-
-    async def write_batched_daft_dataframe(
-        self,
-        batched_dataframe: Union[
-            AsyncGenerator["daft.DataFrame", None],  # noqa: F821
-            Generator["daft.DataFrame", None, None],  # noqa: F821
-        ],
-    ):
-        """Write a batched daft DataFrame to JSON files.
-
-        This method writes the DataFrame to JSON files, potentially splitting it
-        into chunks based on chunk_size and buffer_size settings.
-
-        Args:
-            dataframe (daft.DataFrame): The DataFrame to write.
-
-        Note:
-            If the DataFrame is empty, the method returns without writing.
-        """
-        try:
-            if inspect.isasyncgen(batched_dataframe):
-                async for dataframe in batched_dataframe:
-                    if not is_empty_dataframe(dataframe):
-                        await self.write_daft_dataframe(dataframe)
-            else:
-                # Cast to Generator since we've confirmed it's not an AsyncGenerator
-                sync_generator = cast(
-                    Generator["daft.DataFrame", None, None], batched_dataframe
-                )  # noqa: F821
-                for dataframe in sync_generator:
-                    if not is_empty_dataframe(dataframe):
-                        await self.write_daft_dataframe(dataframe)
-        except Exception as e:
-            logger.error(f"Error writing batched daft dataframe: {str(e)}")
-
-    @abstractmethod
-    async def write_daft_dataframe(self, dataframe: "daft.DataFrame"):  # noqa: F821
-        """Write a daft DataFrame to the output destination.
-
-        Args:
-            dataframe (daft.DataFrame): The DataFrame to write.
-        """
-        pass
-
-    async def get_statistics(
-        self, typename: Optional[str] = None
-    ) -> ActivityStatistics:
-        """Returns statistics about the output.
-
-        This method returns a ActivityStatistics object with total record count and chunk count.
-
-        Args:
-            typename (str): Type name of the entity e.g database, schema, table.
-
-        Raises:
-            ValidationError: If the statistics data is invalid
-            Exception: If there's an error writing the statistics
-        """
-        try:
-            statistics = await self.write_statistics(typename)
-            if not statistics:
-                raise ValueError("No statistics data available")
-            statistics = ActivityStatistics.model_validate(statistics)
-            if typename:
-                statistics.typename = typename
-            return statistics
-        except Exception as e:
-            logger.error(f"Error getting statistics: {str(e)}")
-            raise
-
-    async def _upload_file(self, file_name: str):
-        """Upload a file to the object store."""
-        await ObjectStore.upload_file(
-            source=file_name,
-            destination=get_object_store_prefix(file_name),
-        )
-
-        self.current_buffer_size_bytes = 0
-
-    async def _flush_buffer(self, chunk: "pd.DataFrame", chunk_part: int):
-        """Flush the current buffer to a JSON file.
-
-        This method combines all DataFrames in the buffer, writes them to a JSON file,
-        and uploads the file to the object store.
-
-        Note:
-            If the buffer is empty or has no records, the method returns without writing.
-        """
-        try:
-            if not is_empty_dataframe(chunk):
-                self.total_record_count += len(chunk)
-                output_file_name = (
-                    f"{self.output_path}/{self.path_gen(self.chunk_count, chunk_part)}"
-                )
-                await self.write_chunk(chunk, output_file_name)
-
-            self.current_buffer_size = 0
-
-            # Record chunk metrics
-            self.metrics.record_metric(
-                name="chunks_written",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={"type": "output"},
-                description="Number of chunks written to files",
-            )
-
-        except Exception as e:
-            # Record metrics for failed write
-            self.metrics.record_metric(
-                name="write_errors",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={"type": "output", "error": str(e)},
-                description="Number of errors while writing to files",
-            )
-            logger.error(f"Error flushing buffer to files: {str(e)}")
-            raise e
-
-    async def write_statistics(
-        self, typename: Optional[str] = None
-    ) -> Optional[Dict[str, Any]]:
-        """Write statistics about the output to a JSON file.
-
-        This method writes statistics including total record count and chunk count
-        to a JSON file and uploads it to the object store.
-
-        Raises:
-            Exception: If there's an error writing or uploading the statistics.
-        """
-        try:
-            # prepare the statistics
-            statistics = {
-                "total_record_count": self.total_record_count,
-                "chunk_count": len(self.partitions),
-                "partitions": self.partitions,
-            }
-
-            # Ensure typename is included in the statistics payload (if provided)
-            if typename:
-                statistics["typename"] = typename
-
-            # Write the statistics to a json file inside a dedicated statistics/ folder
-            statistics_dir = os.path.join(self.output_path, "statistics")
-            os.makedirs(statistics_dir, exist_ok=True)
-            output_file_name = os.path.join(statistics_dir, "statistics.json.ignore")
-            # If chunk_start is provided, include it in the statistics filename
-            try:
-                cs = getattr(self, "chunk_start", None)
-                if cs is not None:
-                    output_file_name = os.path.join(
-                        statistics_dir, f"statistics-chunk-{cs}.json.ignore"
-                    )
-            except Exception:
-                # If accessing chunk_start fails, fallback to default filename
-                pass
-
-            # Write the statistics dictionary to the JSON file
-            with open(output_file_name, "wb") as f:
-                f.write(orjson.dumps(statistics))
-
-            destination_file_path = get_object_store_prefix(output_file_name)
-            # Push the file to the object store
-            await ObjectStore.upload_file(
-                source=output_file_name,
-                destination=destination_file_path,
-            )
-
-            return statistics
-        except Exception as e:
-            logger.error(f"Error writing statistics: {str(e)}")
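
The hunk above removes the generic Output base class that concrete outputs built on in 1.1.0; per the file list, its role is taken over by the new application_sdk/io package in 2.0.0. For orientation, here is a minimal, hypothetical sketch of how a concrete subclass was wired up against this removed 1.1.0 API. The subclass name MyJsonOutput, the buffer values, and the write_chunk hook are illustrative assumptions (the base class only declares the attributes, reads _EXTENSION in path_gen, and awaits write_chunk from _flush_buffer); this is not the 2.0.0 io API.

# Hypothetical sketch against the removed 1.1.0 application_sdk.outputs API.
import os

import pandas as pd

from application_sdk.observability.metrics_adaptor import get_metrics
from application_sdk.outputs import Output


class MyJsonOutput(Output):
    _EXTENSION = ".json"  # assumed: consumed by Output.path_gen()

    def __init__(self, output_path: str):
        self.output_path = output_path
        os.makedirs(self.output_path, exist_ok=True)
        self.total_record_count = 0
        self.chunk_count = 0
        self.chunk_part = 0
        self.chunk_start = None  # write_dataframe() branches on this attribute
        self.buffer_size = 100_000  # records per buffered slice (illustrative)
        self.max_file_size_bytes = 50 * 1024 * 1024  # illustrative rollover threshold
        self.current_buffer_size = 0
        self.current_buffer_size_bytes = 0
        self.partitions = []
        self.metrics = get_metrics()  # the base class records counters through this

    async def write_chunk(self, chunk: pd.DataFrame, output_file_name: str):
        # Hook awaited by Output._flush_buffer(); a JSON output serializes the slice here.
        chunk.to_json(output_file_name, orient="records", lines=True)

    async def write_daft_dataframe(self, dataframe):
        raise NotImplementedError  # abstract on Output; not needed for pandas writes

Awaiting write_batched_dataframe(...) on such an instance then drives the chunking, size-based rollover, and object-store upload logic shown in the removed base class.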

--- a/application_sdk/outputs/iceberg.py
+++ /dev/null
@@ -1,139 +0,0 @@
-from typing import TYPE_CHECKING, Union
-
-from pyiceberg.catalog import Catalog
-from pyiceberg.table import Table
-from temporalio import activity
-
-from application_sdk.observability.logger_adaptor import get_logger
-from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
-from application_sdk.outputs import Output
-
-logger = get_logger(__name__)
-activity.logger = logger
-
-if TYPE_CHECKING:
-    import daft
-    import pandas as pd
-
-
-class IcebergOutput(Output):
-    """
-    Iceberg Output class to write data to Iceberg tables using daft and pandas
-    """
-
-    def __init__(
-        self,
-        iceberg_catalog: Catalog,
-        iceberg_namespace: str,
-        iceberg_table: Union[str, Table],
-        mode: str = "append",
-        total_record_count: int = 0,
-        chunk_count: int = 0,
-        retain_local_copy: bool = False,
-    ):
-        """Initialize the Iceberg output class.
-
-        Args:
-            iceberg_catalog (Catalog): Iceberg catalog object.
-            iceberg_namespace (str): Iceberg namespace.
-            iceberg_table (Union[str, Table]): Iceberg table object or table name.
-            mode (str, optional): Write mode for the iceberg table. Defaults to "append".
-            total_record_count (int, optional): Total record count written to the iceberg table. Defaults to 0.
-            chunk_count (int, optional): Number of chunks written to the iceberg table. Defaults to 0.
-            retain_local_copy (bool, optional): Whether to retain the local copy of the files.
-                Defaults to False.
-        """
-        self.total_record_count = total_record_count
-        self.chunk_count = chunk_count
-        self.iceberg_catalog = iceberg_catalog
-        self.iceberg_namespace = iceberg_namespace
-        self.iceberg_table = iceberg_table
-        self.mode = mode
-        self.metrics = get_metrics()
-        self.retain_local_copy = retain_local_copy
-
-    async def write_dataframe(self, dataframe: "pd.DataFrame"):
-        """
-        Method to write the pandas dataframe to an iceberg table
-        """
-        try:
-            import daft
-
-            if len(dataframe) == 0:
-                return
-            # convert the pandas dataframe to a daft dataframe
-            daft_dataframe = daft.from_pandas(dataframe)
-            await self.write_daft_dataframe(daft_dataframe)
-
-            # Record metrics for successful write
-            self.metrics.record_metric(
-                name="iceberg_write_records",
-                value=len(dataframe),
-                metric_type=MetricType.COUNTER,
-                labels={"mode": self.mode, "type": "pandas"},
-                description="Number of records written to Iceberg table from pandas DataFrame",
-            )
-        except Exception as e:
-            # Record metrics for failed write
-            self.metrics.record_metric(
-                name="iceberg_write_errors",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={"mode": self.mode, "type": "pandas", "error": str(e)},
-                description="Number of errors while writing to Iceberg table",
-            )
-            logger.error(f"Error writing pandas dataframe to iceberg table: {str(e)}")
-            raise e
-
-    async def write_daft_dataframe(self, dataframe: "daft.DataFrame"):  # noqa: F821
-        """
-        Method to write the daft dataframe to an iceberg table
-        """
-        try:
-            if dataframe.count_rows() == 0:
-                return
-            # Create a new table in the iceberg catalog
-            self.chunk_count += 1
-            self.total_record_count += dataframe.count_rows()
-
-            # check if iceberg table is already created
-            if isinstance(self.iceberg_table, Table):
-                # if yes, use the existing iceberg table
-                table = self.iceberg_table
-            else:
-                # if not, create a new table in the iceberg catalog
-                table = self.iceberg_catalog.create_table_if_not_exists(
-                    f"{self.iceberg_namespace}.{self.iceberg_table}",
-                    schema=dataframe.to_arrow().schema,
-                )
-            # write the dataframe to the iceberg table
-            dataframe.write_iceberg(table, mode=self.mode)
-
-            # Record metrics for successful write
-            self.metrics.record_metric(
-                name="iceberg_write_records",
-                value=dataframe.count_rows(),
-                metric_type=MetricType.COUNTER,
-                labels={"mode": self.mode, "type": "daft"},
-                description="Number of records written to Iceberg table from daft DataFrame",
-            )
-
-            # Record chunk metrics
-            self.metrics.record_metric(
-                name="iceberg_chunks_written",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={"mode": self.mode},
-                description="Number of chunks written to Iceberg table",
-            )
-        except Exception as e:
-            # Record metrics for failed write
-            self.metrics.record_metric(
-                name="iceberg_write_errors",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={"mode": self.mode, "type": "daft", "error": str(e)},
-                description="Number of errors while writing to Iceberg table",
-            )
-            logger.error(f"Error writing daft dataframe to iceberg table: {str(e)}")
-            raise e
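
The removed IcebergOutput above was the 1.1.0 path for landing DataFrames in Iceberg tables; no replacement iceberg module appears among the 2.0.0 files listed above. As a hedged usage sketch only: it assumes pyiceberg and daft are installed, and the SqlCatalog settings below are placeholders rather than anything specified by this diff.

# Hedged usage sketch for the removed 1.1.0 IcebergOutput.
import asyncio

import pandas as pd
from pyiceberg.catalog.sql import SqlCatalog  # any pyiceberg Catalog implementation works

from application_sdk.outputs.iceberg import IcebergOutput  # gone in 2.0.0


async def main() -> None:
    # Placeholder local catalog; real deployments configure their own catalog and warehouse.
    catalog = SqlCatalog(
        "local",
        uri="sqlite:////tmp/iceberg_catalog.db",
        warehouse="file:///tmp/iceberg_warehouse",
    )
    catalog.create_namespace("raw")

    output = IcebergOutput(
        iceberg_catalog=catalog,
        iceberg_namespace="raw",
        iceberg_table="assets",  # created lazily via create_table_if_not_exists
        mode="append",
    )
    # Converted to a daft DataFrame internally, then appended to raw.assets.
    await output.write_dataframe(pd.DataFrame({"name": ["db1"], "type": ["database"]}))
    print(output.total_record_count, output.chunk_count)


asyncio.run(main())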