atlan-application-sdk 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- application_sdk/activities/common/sql_utils.py +308 -0
- application_sdk/activities/common/utils.py +1 -45
- application_sdk/activities/metadata_extraction/sql.py +110 -353
- application_sdk/activities/query_extraction/sql.py +12 -11
- application_sdk/application/__init__.py +1 -1
- application_sdk/clients/sql.py +167 -1
- application_sdk/clients/temporal.py +6 -6
- application_sdk/common/types.py +8 -0
- application_sdk/common/utils.py +1 -8
- application_sdk/constants.py +1 -1
- application_sdk/handlers/sql.py +10 -25
- application_sdk/interceptors/events.py +1 -1
- application_sdk/io/__init__.py +654 -0
- application_sdk/io/json.py +429 -0
- application_sdk/{outputs → io}/parquet.py +358 -47
- application_sdk/io/utils.py +307 -0
- application_sdk/observability/observability.py +23 -12
- application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
- application_sdk/server/fastapi/middleware/metrics.py +27 -24
- application_sdk/server/fastapi/models.py +1 -1
- application_sdk/server/fastapi/routers/server.py +1 -1
- application_sdk/server/fastapi/utils.py +10 -0
- application_sdk/services/eventstore.py +4 -4
- application_sdk/services/objectstore.py +30 -7
- application_sdk/services/secretstore.py +1 -1
- application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
- application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
- application_sdk/version.py +1 -1
- application_sdk/worker.py +1 -1
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/METADATA +9 -11
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/RECORD +36 -43
- application_sdk/common/dataframe_utils.py +0 -42
- application_sdk/events/__init__.py +0 -5
- application_sdk/inputs/.cursor/BUGBOT.md +0 -250
- application_sdk/inputs/__init__.py +0 -168
- application_sdk/inputs/iceberg.py +0 -75
- application_sdk/inputs/json.py +0 -136
- application_sdk/inputs/parquet.py +0 -272
- application_sdk/inputs/sql_query.py +0 -271
- application_sdk/outputs/.cursor/BUGBOT.md +0 -295
- application_sdk/outputs/__init__.py +0 -445
- application_sdk/outputs/iceberg.py +0 -139
- application_sdk/outputs/json.py +0 -268
- /application_sdk/{events → interceptors}/models.py +0 -0
- /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/NOTICE +0 -0
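The listing shows that the 1.x `inputs`/`outputs` modules are removed outright and a new `application_sdk/io` package takes their place, so 2.0.0 is import-path breaking for consumers of those modules. A minimal sketch of how a consumer might guard the upgrade, assuming only what the listing shows — the 1.x paths below are real, but the exact 2.0.0 replacement names are not visible in this diff:

```python
# Sketch only: the 1.x import path below existed in 1.1.0 (see the deleted
# application_sdk/outputs/json.py later in this diff). The 2.0.0 replacements
# live somewhere under application_sdk.io, but their public names are not
# shown here, so no 2.0.0 symbol is imported or assumed.
try:
    from application_sdk.outputs.json import JsonOutput  # 1.x location, deleted in 2.0.0
except ImportError:
    JsonOutput = None  # after upgrading, switch to the application_sdk.io equivalents
```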
application_sdk/outputs/json.py
DELETED
@@ -1,268 +0,0 @@

```python
import os
from datetime import datetime
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

import orjson
from temporalio import activity

from application_sdk.activities.common.models import ActivityStatistics
from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
from application_sdk.observability.logger_adaptor import get_logger
from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
from application_sdk.outputs import Output

logger = get_logger(__name__)
activity.logger = logger

if TYPE_CHECKING:
    import daft  # type: ignore
    import pandas as pd


def convert_datetime_to_epoch(data: Any) -> Any:
    """Convert datetime objects to epoch timestamps in milliseconds.

    Args:
        data: The data to convert

    Returns:
        The converted data with datetime fields as epoch timestamps
    """
    if isinstance(data, datetime):
        return int(data.timestamp() * 1000)
    elif isinstance(data, dict):
        return {k: convert_datetime_to_epoch(v) for k, v in data.items()}
    elif isinstance(data, list):
        return [convert_datetime_to_epoch(item) for item in data]
    return data


class JsonOutput(Output):
    """Output handler for writing data to JSON files.

    This class provides functionality for writing data to JSON files with support
    for chunking large datasets, buffering, and automatic file path generation.
    It can handle both pandas and daft DataFrames as input.

    The output can be written to local files and optionally uploaded to an object
    store. Files are named using a configurable path generation scheme that
    includes chunk numbers for split files.

    Attributes:
        output_path (Optional[str]): Base path where JSON files will be written.
        output_suffix (str): Suffix added to file paths when uploading to object store.
        typename (Optional[str]): Type identifier for the data being written.
        chunk_start (Optional[int]): Starting index for chunk numbering.
        buffer_size (int): Size of the write buffer in bytes.
        chunk_size (int): Maximum number of records per chunk.
        total_record_count (int): Total number of records processed.
        chunk_count (int): Number of chunks written.
        buffer (List[Union[pd.DataFrame, daft.DataFrame]]): Buffer for accumulating
            data before writing.
    """

    _EXTENSION = ".json"

    def __init__(
        self,
        output_suffix: str,
        output_path: Optional[str] = None,
        typename: Optional[str] = None,
        chunk_start: Optional[int] = None,
        buffer_size: int = 5000,
        chunk_size: Optional[int] = 50000,  # to limit the memory usage on upload
        total_record_count: int = 0,
        chunk_count: int = 0,
        start_marker: Optional[str] = None,
        end_marker: Optional[str] = None,
        retain_local_copy: bool = False,
        **kwargs: Dict[str, Any],
    ):
        """Initialize the JSON output handler.

        Args:
            output_path (str): Path where JSON files will be written.
            output_suffix (str): Prefix for files when uploading to object store.
            chunk_start (Optional[int], optional): Starting index for chunk numbering.
                Defaults to None.
            buffer_size (int, optional): Size of the buffer in bytes.
                Defaults to 10MB (1024 * 1024 * 10).
            chunk_size (Optional[int], optional): Maximum number of records per chunk. If None, uses config value.
                Defaults to None.
            total_record_count (int, optional): Initial total record count.
                Defaults to 0.
            chunk_count (int, optional): Initial chunk count.
                Defaults to 0.
            retain_local_copy (bool, optional): Whether to retain the local copy of the files.
                Defaults to False.
        """
        self.output_path = output_path
        self.output_suffix = output_suffix
        self.typename = typename
        self.chunk_start = chunk_start
        self.total_record_count = total_record_count
        self.chunk_count = chunk_count
        self.buffer_size = buffer_size
        self.chunk_size = chunk_size or 50000  # to limit the memory usage on upload
        self.buffer: List[Union["pd.DataFrame", "daft.DataFrame"]] = []  # noqa: F821
        self.current_buffer_size = 0
        self.current_buffer_size_bytes = 0  # Track estimated buffer size in bytes
        self.max_file_size_bytes = int(
            DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
        )  # 90% of DAPR limit as safety buffer
        self.start_marker = start_marker
        self.end_marker = end_marker
        self.partitions = []
        self.chunk_part = 0
        self.metrics = get_metrics()
        self.retain_local_copy = retain_local_copy

        if not self.output_path:
            raise ValueError("output_path is required")

        self.output_path = os.path.join(self.output_path, output_suffix)
        if typename:
            self.output_path = os.path.join(self.output_path, typename)
        os.makedirs(self.output_path, exist_ok=True)

        if self.chunk_start:
            self.chunk_count = self.chunk_start + self.chunk_count

    async def write_daft_dataframe(
        self,
        dataframe: "daft.DataFrame",
        preserve_fields: Optional[List[str]] = [
            "identity_cycle",
            "number_columns_in_part_key",
            "columns_participating_in_part_key",
            "engine",
            "is_insertable_into",
            "is_typed",
        ],
        null_to_empty_dict_fields: Optional[List[str]] = [
            "attributes",
            "customAttributes",
        ],
    ):  # noqa: F821
        """Write a daft DataFrame to JSON files.

        This method converts the daft DataFrame to pandas and writes it to JSON files.

        Args:
            dataframe (daft.DataFrame): The DataFrame to write.

        Note:
            Daft does not have built-in JSON writing support, so we are using orjson.
        """
        try:
            if self.chunk_start is None:
                self.chunk_part = 0

            buffer = []
            for row in dataframe.iter_rows():
                self.total_record_count += 1
                # Convert datetime fields to epoch timestamps before serialization
                row = convert_datetime_to_epoch(row)
                # Remove null attributes from the row recursively, preserving specified fields
                cleaned_row = self.process_null_fields(
                    row, preserve_fields, null_to_empty_dict_fields
                )
                # Serialize the row and add it to the buffer
                serialized_row = orjson.dumps(
                    cleaned_row, option=orjson.OPT_APPEND_NEWLINE
                )
                buffer.append(serialized_row)
                self.current_buffer_size += 1
                self.current_buffer_size_bytes += len(serialized_row)

                # If the buffer size is reached append to the file and clear the buffer
                if self.current_buffer_size >= self.buffer_size:
                    await self.flush_daft_buffer(buffer, self.chunk_part)

                if self.current_buffer_size_bytes > self.max_file_size_bytes or (
                    self.total_record_count > 0
                    and self.total_record_count % self.chunk_size == 0
                ):
                    output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_count, self.chunk_part, self.start_marker, self.end_marker)}"
                    if os.path.exists(output_file_name):
                        await self._upload_file(output_file_name)
                        self.chunk_part += 1

            # Write any remaining rows in the buffer
            if self.current_buffer_size > 0:
                await self.flush_daft_buffer(buffer, self.chunk_part)

            # Record metrics for successful write
            self.metrics.record_metric(
                name="json_write_records",
                value=dataframe.count_rows(),
                metric_type=MetricType.COUNTER,
                labels={"type": "daft"},
                description="Number of records written to JSON files from daft DataFrame",
            )
        except Exception as e:
            # Record metrics for failed write
            self.metrics.record_metric(
                name="json_write_errors",
                value=1,
                metric_type=MetricType.COUNTER,
                labels={"type": "daft", "error": str(e)},
                description="Number of errors while writing to JSON files",
            )
            logger.error(f"Error writing daft dataframe to json: {str(e)}")

    async def flush_daft_buffer(self, buffer: List[str], chunk_part: int):
        """Flush the current buffer to a JSON file.

        This method combines all DataFrames in the buffer, writes them to a JSON file,
        and uploads the file to the object store.
        """
        output_file_name = (
            f"{self.output_path}/{self.path_gen(self.chunk_count, chunk_part)}"
        )
        with open(output_file_name, "ab+") as f:
            f.writelines(buffer)
            buffer.clear()  # Clear the buffer

        self.current_buffer_size = 0

        # Record chunk metrics
        self.metrics.record_metric(
            name="json_chunks_written",
            value=1,
            metric_type=MetricType.COUNTER,
            labels={"type": "daft"},
            description="Number of chunks written to JSON files",
        )

    async def write_chunk(self, chunk: "pd.DataFrame", file_name: str):
        """Write a chunk to a JSON file.

        This method writes a chunk to a JSON file and uploads the file to the object store.
        """
        mode = "w" if not os.path.exists(file_name) else "a"
        chunk.to_json(file_name, orient="records", lines=True, mode=mode)

    async def get_statistics(
        self, typename: Optional[str] = None
    ) -> ActivityStatistics:
        """Get the statistics of the JSON files.

        This method returns the statistics of the JSON files.
        """
        # Finally upload the final file
        if self.current_buffer_size_bytes > 0:
            output_file_name = (
                f"{self.output_path}/{self.path_gen(self.chunk_count, self.chunk_part)}"
            )
            if os.path.exists(output_file_name):
                await self._upload_file(output_file_name)
                self.chunk_part += 1

        # If chunk_start is set we don't want to increment the chunk_count
        # Since it should only increment the chunk_part in this case
        if self.chunk_start is None:
            self.chunk_count += 1
        self.partitions.append(self.chunk_part)

        return await super().get_statistics(typename)
```

/application_sdk/{events → interceptors}/models.py
RENAMED
File without changes

/application_sdk/{common/dapr_utils.py → services/_utils.py}
RENAMED
File without changes

{atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/WHEEL
RENAMED
File without changes

{atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/LICENSE
RENAMED
File without changes

{atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/NOTICE
RENAMED
File without changes
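For reference, here is a rough sketch of how the deleted `JsonOutput` above was typically driven under the 1.x API. The helpers it relies on (`path_gen`, `process_null_fields`, `_upload_file`, and the base `Output.get_statistics`) live in the removed `application_sdk.outputs` base class and are not shown in this diff, so this is an illustrative reconstruction of the call flow from the visible signatures, not documented usage; the sample data and local path are made up.

```python
import asyncio
from datetime import datetime

import daft

from application_sdk.outputs.json import JsonOutput, convert_datetime_to_epoch

# convert_datetime_to_epoch walks dicts and lists, turning datetime values into
# epoch milliseconds and leaving everything else untouched.
row = {"table": "orders", "last_seen": datetime(2024, 1, 1)}
print(convert_datetime_to_epoch(row))  # last_seen becomes an integer epoch-ms value


async def dump_tables() -> None:
    # Illustrative data; real callers pass the daft DataFrame produced by an
    # extraction activity.
    df = daft.from_pydict({"typeName": ["Table", "Schema"], "name": ["orders", "public"]})

    output = JsonOutput(
        output_suffix="raw",          # appended to output_path
        output_path="./local/tmp",    # hypothetical local base directory
        typename="table",             # adds a per-type sub-directory
        chunk_size=50_000,            # records per chunk before a new part is cut
    )

    # Rows are serialized with orjson, buffered, flushed in chunks, and each
    # completed chunk file is uploaded via the (unshown) Output._upload_file.
    await output.write_daft_dataframe(df)

    # get_statistics uploads the final part and delegates the record counting
    # to the removed Output base class.
    stats = await output.get_statistics(typename="table")
    print(stats)


# asyncio.run(dump_tables())  # requires the SDK's object store configuration
```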