atlan-application-sdk 0.1.1rc36__py3-none-any.whl → 0.1.1rc38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/common/models.py +2 -1
- application_sdk/clients/atlan.py +56 -0
- application_sdk/inputs/parquet.py +4 -4
- application_sdk/outputs/__init__.py +24 -1
- application_sdk/outputs/json.py +57 -38
- application_sdk/outputs/parquet.py +116 -18
- application_sdk/version.py +1 -1
- application_sdk/workflows/metadata_extraction/sql.py +18 -5
- {atlan_application_sdk-0.1.1rc36.dist-info → atlan_application_sdk-0.1.1rc38.dist-info}/METADATA +1 -1
- {atlan_application_sdk-0.1.1rc36.dist-info → atlan_application_sdk-0.1.1rc38.dist-info}/RECORD +13 -14
- application_sdk/clients/async_atlan.py +0 -70
- {atlan_application_sdk-0.1.1rc36.dist-info → atlan_application_sdk-0.1.1rc38.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-0.1.1rc36.dist-info → atlan_application_sdk-0.1.1rc38.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-0.1.1rc36.dist-info → atlan_application_sdk-0.1.1rc38.dist-info}/licenses/NOTICE +0 -0

application_sdk/activities/common/models.py
CHANGED

@@ -4,7 +4,7 @@ This module contains Pydantic models used to represent various data structures
 needed by activities, such as statistics and configuration.
 """
 
-from typing import Optional
+from typing import List, Optional
 
 from pydantic import BaseModel
 
@@ -34,4 +34,5 @@ class ActivityStatistics(BaseModel):
 
     total_record_count: int = 0
     chunk_count: int = 0
+    partitions: Optional[List[int]] = []
    typename: Optional[str] = None
application_sdk/clients/atlan.py
CHANGED

@@ -1,5 +1,6 @@
 from typing import Optional
 
+from pyatlan.client.aio import AsyncAtlanClient
 from pyatlan.client.atlan import AtlanClient
 
 from application_sdk.common.error_codes import ClientError
@@ -68,3 +69,58 @@ def _get_client_from_token(api_token_guid: str):
             f"{ClientError.AUTH_CONFIG_ERROR}: Environment variable CLIENT_SECRET is required when API_TOKEN_GUID is set."
         )
     return AtlanClient.from_token_guid(guid=api_token_guid)
+
+
+async def get_async_client(
+    base_url: Optional[str] = None,
+    api_key: Optional[str] = None,
+    api_token_guid: Optional[str] = None,
+) -> AsyncAtlanClient:
+    """
+    Returns an authenticated AsyncAtlanClient instance using provided parameters or environment variables.
+
+    Selects authentication method based on the presence of parameters or environment variables and validates the required configuration.
+    In general, the use of environment variables is recommended. Any parameters specified will override the environment variables.
+
+    Args:
+        base_url: Atlan base URL (overrides ATLAN_BASE_URL)
+        api_key: Atlan API key (overrides ATLAN_API_KEY)
+        api_token_guid: API token GUID (overrides API_TOKEN_GUID)
+    """
+    # Resolve final values (parameters override env vars)
+    final_token_guid = api_token_guid or ATLAN_API_TOKEN_GUID
+    final_base_url = base_url or ATLAN_BASE_URL
+    final_api_key = api_key or ATLAN_API_KEY
+
+    # Priority 1: Token-based auth (recommended for production)
+    if final_token_guid:
+        if final_base_url or final_api_key:
+            logger.warning(
+                "Token auth takes precedence - ignoring base_url/api_key parameters as well as ATLAN_BASE_URL and ATLAN_API_KEY environment variables."
+            )
+        return await _get_async_client_from_token(final_token_guid)
+
+    # Priority 2: API key + base URL auth
+    if not final_base_url:
+        raise ClientError(
+            "ATLAN_BASE_URL is required (via parameter or environment variable)"
+        )
+    if not final_api_key:
+        raise ClientError(
+            "ATLAN_API_KEY is required (via parameter or environment variable)"
+        )
+
+    logger.info("Using API key-based authentication")
+    return AsyncAtlanClient(base_url=final_base_url, api_key=final_api_key)
+
+
+async def _get_async_client_from_token(api_token_guid: str):
+    if not ATLAN_CLIENT_ID:
+        raise ClientError(
+            f"{ClientError.AUTH_CONFIG_ERROR}: Environment variable CLIENT_ID is required when API_TOKEN_GUID is set."
+        )
+    if not ATLAN_CLIENT_SECRET:
+        raise ClientError(
+            f"{ClientError.AUTH_CONFIG_ERROR}: Environment variable CLIENT_SECRET is required when API_TOKEN_GUID is set."
+        )
+    return await AsyncAtlanClient.from_token_guid(guid=api_token_guid)
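
The new `get_async_client` mirrors the existing synchronous helper: token-based auth wins when an API token GUID is configured, otherwise a base URL plus API key are required. A hedged usage sketch, assuming the environment variables described in the docstring above are already set:

```python
# Usage sketch only; assumes ATLAN_BASE_URL and ATLAN_API_KEY (or API_TOKEN_GUID
# together with CLIENT_ID/CLIENT_SECRET) are configured in the environment.
import asyncio

from application_sdk.clients.atlan import get_async_client


async def main() -> None:
    client = await get_async_client()  # parameters, if passed, override env vars
    print(type(client).__name__)  # AsyncAtlanClient


asyncio.run(main())
```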

application_sdk/inputs/parquet.py
CHANGED

@@ -10,7 +10,7 @@ from application_sdk.services.objectstore import ObjectStore
 logger = get_logger(__name__)
 
 if TYPE_CHECKING:
-    import daft
+    import daft  # type: ignore
     import pandas as pd
 
 
@@ -136,7 +136,7 @@ class ParquetInput(Input):
             daft.DataFrame: Combined daft dataframe from all parquet files.
         """
         try:
-            import daft
+            import daft  # type: ignore
 
             if self.file_names:
                 path = f"{self.path}/{self.file_names[0].split('/')[0]}"
@@ -161,11 +161,11 @@ class ParquetInput(Input):
             a batch of data from the parquet file(s).
         """
         try:
-            import daft
+            import daft  # type: ignore
 
             if self.file_names:
                 for file_name in self.file_names:
-                    path = f"{self.path}/{file_name
+                    path = f"{self.path}/{file_name}"
                     if self.input_prefix and path:
                         await self.download_files(path)
                     yield daft.read_parquet(path)
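
Beyond the `# type: ignore` hints, the per-file loop now closes the path f-string correctly before downloading and reading each file. A hedged sketch of that read pattern outside the SDK; the path is a made-up example and `daft` must be installed:

```python
# Illustrative only: read one downloaded chunk file with daft, as the fixed loop
# above does per file_name. The path below is a hypothetical example.
import daft

path = "/tmp/artifacts/raw/table/chunk-0-part1.parquet"
df = daft.read_parquet(path)
df.show()
```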

application_sdk/outputs/__init__.py
CHANGED

@@ -13,6 +13,7 @@ from typing import (
     Dict,
     Generator,
     List,
+    Literal,
     Optional,
     Union,
     cast,
@@ -31,7 +32,7 @@ logger = get_logger(__name__)
 activity.logger = logger
 
 if TYPE_CHECKING:
-    import daft
+    import daft  # type: ignore
     import pandas as pd
 
 
@@ -52,6 +53,27 @@ class Output(ABC):
     output_prefix: str
     total_record_count: int
     chunk_count: int
+    statistics: List[int] = []
+
+    def estimate_dataframe_file_size(
+        self, dataframe: "pd.DataFrame", file_type: Literal["json", "parquet"]
+    ) -> int:
+        """Estimate File size of a DataFrame by sampling a few records."""
+        if len(dataframe) == 0:
+            return 0
+
+        # Sample up to 10 records to estimate average size
+        sample_size = min(10, len(dataframe))
+        sample = dataframe.head(sample_size)
+        if file_type == "json":
+            sample_file = sample.to_json(orient="records", lines=True)
+        else:
+            sample_file = sample.to_parquet(index=False, compression="snappy")
+        if sample_file is not None:
+            avg_record_size = len(sample_file) / sample_size
+            return int(avg_record_size * len(dataframe))
+
+        return 0
 
     def process_null_fields(
         self,
@@ -217,6 +239,7 @@ class Output(ABC):
         statistics = {
             "total_record_count": self.total_record_count,
             "chunk_count": self.chunk_count,
+            "partitions": self.statistics,
         }
 
         # Write the statistics to a json file
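
The new `estimate_dataframe_file_size` helper serializes a sample of up to 10 rows and scales the average serialized row size by the full row count, so outputs can decide when a buffered chunk would push a file past the Dapr message limit. A standalone sketch of the same idea using plain pandas; the function name and data below are illustrative, not SDK code:

```python
# Standalone re-implementation of the sampling-based size estimate, for illustration.
import pandas as pd


def estimate_json_lines_size(df: pd.DataFrame, sample_rows: int = 10) -> int:
    """Extrapolate the JSON-lines size of a DataFrame from a small sample."""
    if len(df) == 0:
        return 0
    sample = df.head(min(sample_rows, len(df)))
    sample_bytes = len(sample.to_json(orient="records", lines=True))
    return int(sample_bytes / len(sample) * len(df))


df = pd.DataFrame({"id": range(1_000), "name": [f"row-{i}" for i in range(1_000)]})
print(estimate_json_lines_size(df))  # rough byte estimate used to cap file sizes
```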
application_sdk/outputs/json.py
CHANGED

@@ -6,6 +6,7 @@ import orjson
 from temporalio import activity
 
 from application_sdk.activities.common.utils import get_object_store_prefix
+from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
 from application_sdk.observability.logger_adaptor import get_logger
 from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
 from application_sdk.outputs import Output
@@ -15,7 +16,7 @@ logger = get_logger(__name__)
 activity.logger = logger
 
 if TYPE_CHECKING:
-    import daft
+    import daft  # type: ignore
     import pandas as pd
 
 
@@ -32,7 +33,7 @@ def path_gen(chunk_start: int | None, chunk_count: int) -> str:
     if chunk_start is None:
         return f"{str(chunk_count)}.json"
     else:
-        return f"{
+        return f"chunk-{chunk_start}-part{chunk_count}.json"
 
 
 def convert_datetime_to_epoch(data: Any) -> Any:
@@ -124,6 +125,10 @@ class JsonOutput(Output):
         self.chunk_size = chunk_size or 100000
         self.buffer: List[Union["pd.DataFrame", "daft.DataFrame"]] = []  # noqa: F821
         self.current_buffer_size = 0
+        self.current_buffer_size_bytes = 0  # Track estimated buffer size in bytes
+        self.max_file_size_bytes = int(
+            DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
+        )  # 90% of DAPR limit as safety buffer
         self.path_gen = path_gen
         self.start_marker = start_marker
         self.end_marker = end_marker
@@ -172,8 +177,21 @@ class JsonOutput(Output):
             ]
 
             for chunk in chunks:
+                # Estimate size of this chunk
+                chunk_size_bytes = self.estimate_dataframe_file_size(chunk, "json")
+
+                # Check if adding this chunk would exceed size limit
+                if (
+                    self.current_buffer_size_bytes + chunk_size_bytes
+                    > self.max_file_size_bytes
+                    and self.current_buffer_size > 0
+                ):
+                    # Flush current buffer before adding this chunk
+                    await self._flush_buffer()
+
                 self.buffer.append(chunk)
                 self.current_buffer_size += len(chunk)
+                self.current_buffer_size_bytes += chunk_size_bytes
 
                 if self.current_buffer_size >= partition:
                     await self._flush_buffer()
@@ -237,45 +255,19 @@ class JsonOutput(Output):
                     row, preserve_fields, null_to_empty_dict_fields
                 )
                 # Serialize the row and add it to the buffer
-
-
-
-
-                )
-
-
-
-
-                output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count)}"
-                with open(output_file_name, "w") as f:
-                    f.writelines(buffer)
-                buffer.clear()  # Clear the buffer
-
-                # Record chunk metrics
-                self.metrics.record_metric(
-                    name="json_chunks_written",
-                    value=1,
-                    metric_type=MetricType.COUNTER,
-                    labels={"type": "daft"},
-                    description="Number of chunks written to JSON files",
-                )
+                serialized_row = orjson.dumps(
+                    cleaned_row, option=orjson.OPT_APPEND_NEWLINE
+                ).decode("utf-8")
+                buffer.append(serialized_row)
+                self.current_buffer_size_bytes += len(serialized_row)
+                if (self.chunk_size and len(buffer) >= self.chunk_size) or (
+                    self.current_buffer_size_bytes > self.max_file_size_bytes
+                ):
+                    await self.flush_daft_buffer(buffer)
 
             # Write any remaining rows in the buffer
             if buffer:
-                self.
-                output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count)}"
-                with open(output_file_name, "w") as f:
-                    f.writelines(buffer)
-                buffer.clear()
-
-                # Record chunk metrics
-                self.metrics.record_metric(
-                    name="json_chunks_written",
-                    value=1,
-                    metric_type=MetricType.COUNTER,
-                    labels={"type": "daft"},
-                    description="Number of chunks written to JSON files",
-                )
+                await self.flush_daft_buffer(buffer)
 
             # Record metrics for successful write
             self.metrics.record_metric(
@@ -303,6 +295,32 @@ class JsonOutput(Output):
             )
             logger.error(f"Error writing daft dataframe to json: {str(e)}")
 
+    async def flush_daft_buffer(self, buffer: List[str]):
+        """Flush the current buffer to a JSON file.
+
+        This method combines all DataFrames in the buffer, writes them to a JSON file,
+        and uploads the file to the object store.
+        """
+        self.chunk_count += 1
+        output_file_name = (
+            f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count)}"
+        )
+        with open(output_file_name, "w") as f:
+            f.writelines(buffer)
+        buffer.clear()  # Clear the buffer
+
+        self.current_buffer_size = 0
+        self.current_buffer_size_bytes = 0
+
+        # Record chunk metrics
+        self.metrics.record_metric(
+            name="json_chunks_written",
+            value=1,
+            metric_type=MetricType.COUNTER,
+            labels={"type": "daft"},
+            description="Number of chunks written to JSON files",
+        )
+
     async def _flush_buffer(self):
         """Flush the current buffer to a JSON file.
 
@@ -353,6 +371,7 @@ class JsonOutput(Output):
 
             self.buffer.clear()
             self.current_buffer_size = 0
+            self.current_buffer_size_bytes = 0
 
         except Exception as e:
             # Record metrics for failed write
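
Both JSON write paths now flush whenever the estimated buffer size would cross `max_file_size_bytes`, set to 90% of the Dapr gRPC message limit, and chunked files are named `chunk-{chunk_start}-part{chunk_count}.json`. A small sketch of that arithmetic and naming; the 4 MiB limit below is an assumed example value, not the SDK constant:

```python
# Illustrative threshold math and file naming; DAPR_MAX_GRPC_MESSAGE_LENGTH here is
# an assumed example value (4 MiB), not imported from application_sdk.constants.
DAPR_MAX_GRPC_MESSAGE_LENGTH = 4 * 1024 * 1024
max_file_size_bytes = int(DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9)  # 90% safety buffer

buffered_bytes, next_chunk_bytes = 3_500_000, 600_000
if buffered_bytes + next_chunk_bytes > max_file_size_bytes:
    print("flush current buffer before appending the next chunk")

chunk_start, chunk_count = 0, 2
print(f"chunk-{chunk_start}-part{chunk_count}.json")  # -> chunk-0-part2.json
```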

application_sdk/outputs/parquet.py
CHANGED

@@ -1,9 +1,10 @@
 import os
-from typing import TYPE_CHECKING, Literal, Optional
+from typing import TYPE_CHECKING, List, Literal, Optional, Union
 
 from temporalio import activity
 
 from application_sdk.activities.common.utils import get_object_store_prefix
+from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
 from application_sdk.observability.logger_adaptor import get_logger
 from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
 from application_sdk.outputs import Output
@@ -13,7 +14,7 @@ logger = get_logger(__name__)
 activity.logger = logger
 
 if TYPE_CHECKING:
-    import daft
+    import daft  # type: ignore
     import pandas as pd
 
 
@@ -46,6 +47,7 @@ class ParquetOutput(Output):
         typename: Optional[str] = None,
         write_mode: Literal["append", "overwrite", "overwrite-partitions"] = "append",
         chunk_size: Optional[int] = 100000,
+        buffer_size: Optional[int] = 100000,
         total_record_count: int = 0,
         chunk_count: int = 0,
         chunk_start: Optional[int] = None,
@@ -78,11 +80,19 @@ class ParquetOutput(Output):
         self.typename = typename
         self.write_mode = write_mode
         self.chunk_size = chunk_size
+        self.buffer_size = buffer_size
+        self.buffer: List[Union["pd.DataFrame", "daft.DataFrame"]] = []  # noqa: F821
         self.total_record_count = total_record_count
         self.chunk_count = chunk_count
+        self.current_buffer_size = 0
+        self.current_buffer_size_bytes = 0  # Track estimated buffer size in bytes
+        self.max_file_size_bytes = int(
+            DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
+        )  # 90% of DAPR limit as safety buffer
         self.chunk_start = chunk_start
         self.start_marker = start_marker
         self.end_marker = end_marker
+        self.statistics = []
         self.metrics = get_metrics()
 
         # Create output directory
@@ -117,7 +127,7 @@ class ParquetOutput(Output):
         if chunk_start is None:
             return f"{str(chunk_count)}.parquet"
         else:
-            return f"{str(chunk_start
+            return f"chunk-{str(chunk_start)}-part{str(chunk_count)}.parquet"
 
     async def write_dataframe(self, dataframe: "pd.DataFrame"):
         """Write a pandas DataFrame to Parquet files and upload to object store.
@@ -126,20 +136,46 @@ class ParquetOutput(Output):
             dataframe (pd.DataFrame): The DataFrame to write.
         """
         try:
+            chunk_part = 0
             if len(dataframe) == 0:
                 return
 
-            #
-
-
-
-
-            # Write the dataframe to parquet using pandas native method
-            dataframe.to_parquet(
-                file_path,
-                index=False,
-                compression="snappy",  # Using snappy compression by default
+            # Split the DataFrame into chunks
+            partition = (
+                self.chunk_size
+                if self.chunk_start is None
+                else min(self.chunk_size, self.buffer_size)
             )
+            chunks = [
+                dataframe[i : i + partition]  # type: ignore
+                for i in range(0, len(dataframe), partition)
+            ]
+
+            for chunk in chunks:
+                # Estimate size of this chunk
+                chunk_size_bytes = self.estimate_dataframe_file_size(chunk, "parquet")
+
+                # Check if adding this chunk would exceed size limit
+                if (
+                    self.current_buffer_size_bytes + chunk_size_bytes
+                    > self.max_file_size_bytes
+                    and self.current_buffer_size > 0
+                ):
+                    # Flush current buffer before adding this chunk
+                    chunk_part += 1
+                    await self._flush_buffer(chunk_part)
+
+                self.buffer.append(chunk)
+                self.current_buffer_size += len(chunk)
+                self.current_buffer_size_bytes += chunk_size_bytes
+
+                if self.current_buffer_size >= partition:  # type: ignore
+                    chunk_part += 1
+                    await self._flush_buffer(chunk_part)
+
+            if self.buffer and self.current_buffer_size > 0:
+                chunk_part += 1
+                await self._flush_buffer(chunk_part)
 
             # Record metrics for successful write
             self.metrics.record_metric(
@@ -159,11 +195,8 @@ class ParquetOutput(Output):
                 description="Number of chunks written to Parquet files",
             )
 
-
-
-                source=file_path,
-                destination=get_object_store_prefix(file_path),
-            )
+            self.chunk_count += 1
+            self.statistics.append(chunk_part)
         except Exception as e:
             # Record metrics for failed write
             self.metrics.record_metric(
@@ -245,3 +278,68 @@ class ParquetOutput(Output):
             str: The full path of the output file.
         """
         return self.output_path
+
+    async def _flush_buffer(self, chunk_part):
+        """Flush the current buffer to a Parquet file.
+
+        This method combines all DataFrames in the buffer, writes them to a Parquet file,
+        and uploads the file to the object store.
+
+        Note:
+            If the buffer is empty or has no records, the method returns without writing.
+        """
+        import pandas as pd
+
+        if not self.buffer or not self.current_buffer_size:
+            return
+
+        if not all(isinstance(df, pd.DataFrame) for df in self.buffer):
+            raise TypeError(
+                "_flush_buffer encountered non-DataFrame elements in buffer. This should not happen."
+            )
+
+        try:
+            # Now it's safe to cast for pd.concat
+            pd_buffer: List[pd.DataFrame] = self.buffer  # type: ignore
+            combined_dataframe = pd.concat(pd_buffer)
+
+            # Write DataFrame to Parquet file
+            if not combined_dataframe.empty:
+                self.total_record_count += len(combined_dataframe)
+                output_file_name = (
+                    f"{self.output_path}/{self.path_gen(self.chunk_count, chunk_part)}"
+                )
+                combined_dataframe.to_parquet(
+                    output_file_name, index=False, compression="snappy"
+                )
+
+                # Record chunk metrics
+                self.metrics.record_metric(
+                    name="parquet_chunks_written",
+                    value=1,
+                    metric_type=MetricType.COUNTER,
+                    labels={"type": "pandas"},
+                    description="Number of chunks written to Parquet files",
+                )
+
+                # Push the file to the object store
+                await ObjectStore.upload_file(
+                    source=output_file_name,
+                    destination=get_object_store_prefix(output_file_name),
+                )
+
+            self.buffer.clear()
+            self.current_buffer_size = 0
+            self.current_buffer_size_bytes = 0
+
+        except Exception as e:
+            # Record metrics for failed write
+            self.metrics.record_metric(
+                name="parquet_write_errors",
+                value=1,
+                metric_type=MetricType.COUNTER,
+                labels={"type": "pandas", "error": str(e)},
+                description="Number of errors while writing to Parquet files",
+            )
+            logger.error(f"Error flushing buffer to parquet: {str(e)}")
+            raise e
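
With these changes each `write_dataframe` call may flush several part files for the same logical chunk, named by the new `path_gen`, and the number of parts is appended to `self.statistics` (surfaced as `partitions` in the activity statistics). A small sketch of the naming, re-implementing `path_gen` locally for illustration only:

```python
# Local re-implementation of the new naming scheme, for illustration only.
from typing import Optional


def path_gen(chunk_start: Optional[int], chunk_count: int) -> str:
    if chunk_start is None:
        return f"{chunk_count}.parquet"
    return f"chunk-{chunk_start}-part{chunk_count}.parquet"


# A chunk with index 0 that was flushed three times within one write_dataframe call:
print([path_gen(0, part) for part in (1, 2, 3)])
# ['chunk-0-part1.parquet', 'chunk-0-part2.parquet', 'chunk-0-part3.parquet']
# statistics would then gain the entry 3, i.e. partitions == [3] for this chunk.
```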
application_sdk/version.py
CHANGED

application_sdk/workflows/metadata_extraction/sql.py
CHANGED

@@ -107,7 +107,11 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
         activity_statistics = ActivityStatistics.model_validate(raw_statistics)
         transform_activities: List[Any] = []
 
-        if
+        if (
+            activity_statistics is None
+            or activity_statistics.chunk_count == 0
+            or not activity_statistics.partitions
+        ):
             # to handle the case where the fetch_fn returns None or no chunks
             return
 
@@ -115,7 +119,9 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
             raise ValueError("Invalid typename")
 
         batches, chunk_starts = self.get_transform_batches(
-            activity_statistics.chunk_count,
+            activity_statistics.chunk_count,
+            activity_statistics.typename,
+            activity_statistics.partitions,
         )
 
         for i in range(len(batches)):
@@ -144,7 +150,9 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
             total_record_count += metadata_model.total_record_count
             chunk_count += metadata_model.chunk_count
 
-    def get_transform_batches(
+    def get_transform_batches(
+        self, chunk_count: int, typename: str, partitions: List[int]
+    ):
         """Get batches for parallel transformation processing.
 
         Args:
@@ -159,12 +167,17 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
         batches: List[List[str]] = []
         chunk_start_numbers: List[int] = []
 
-        for i in
+        for i, partition in enumerate(partitions):
            # Track starting chunk number (which is just i)
            chunk_start_numbers.append(i)
 
            # Each batch contains exactly one chunk
-            batches.append(
+            batches.append(
+                [
+                    f"{typename}/chunk-{i}-part{file+1}.parquet"
+                    for file in range(partition)
+                ]
+            )
 
         return batches, chunk_start_numbers
 
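
`get_transform_batches` now expands the `partitions` counts into explicit per-chunk lists of parquet part files. A standalone sketch of that expansion (dropping `self` and the unused `chunk_count` argument to stay self-contained; the `TABLE` typename and counts are illustrative):

```python
# Standalone sketch of the batch expansion; mirrors the loop in the diff above.
from typing import List, Tuple


def get_transform_batches(
    typename: str, partitions: List[int]
) -> Tuple[List[List[str]], List[int]]:
    batches: List[List[str]] = []
    chunk_start_numbers: List[int] = []
    for i, partition in enumerate(partitions):
        chunk_start_numbers.append(i)  # starting chunk number is just i
        batches.append(
            [f"{typename}/chunk-{i}-part{part + 1}.parquet" for part in range(partition)]
        )
    return batches, chunk_start_numbers


batches, starts = get_transform_batches("TABLE", [2, 1])
print(batches)  # [['TABLE/chunk-0-part1.parquet', 'TABLE/chunk-0-part2.parquet'],
                #  ['TABLE/chunk-1-part1.parquet']]
print(starts)   # [0, 1]
```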
{atlan_application_sdk-0.1.1rc36.dist-info → atlan_application_sdk-0.1.1rc38.dist-info}/METADATA
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: atlan-application-sdk
-Version: 0.1.
+Version: 0.1.1rc38
 Summary: Atlan Application SDK is a Python library for developing applications on the Atlan Platform
 Project-URL: Repository, https://github.com/atlanhq/application-sdk
 Project-URL: Documentation, https://github.com/atlanhq/application-sdk/README.md
{atlan_application_sdk-0.1.1rc36.dist-info → atlan_application_sdk-0.1.1rc38.dist-info}/RECORD
RENAMED

@@ -1,11 +1,11 @@
 application_sdk/__init__.py,sha256=2e2mvmLJ5dxmJGPELtb33xwP-j6JMdoIuqKycEn7hjg,151
 application_sdk/constants.py,sha256=GzwZO0pa9M-FgibmfIs1lh-Fwo06K9Tk6WzGqMyJgpI,10362
-application_sdk/version.py,sha256=
+application_sdk/version.py,sha256=4PQKkm_QOy4mEJgwY5eIcUeeEIpIWPjzc_pKiJQmpXw,88
 application_sdk/worker.py,sha256=i5f0AeKI39IfsLO05QkwC6uMz0zDPSJqP7B2byri1VI,7489
 application_sdk/activities/__init__.py,sha256=QaXLOBYbb0zPOY5kfDQh56qbXQFaYNXOjJ5PCvatiZ4,9530
 application_sdk/activities/lock_management.py,sha256=L__GZ9BsArwU1ntYwAgCKsSjCqN6QBeOfT-OT4WyD4Y,3983
 application_sdk/activities/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-application_sdk/activities/common/models.py,sha256=
+application_sdk/activities/common/models.py,sha256=LIZfWvTtgtbAUvvn-rwrPQgD7fP2J0Gxdxr_ITgw-jM,1243
 application_sdk/activities/common/utils.py,sha256=F4Fq9Gl_gvUQj_fSdwzTU7obqUnemYL1dgb_yS34vTM,6967
 application_sdk/activities/metadata_extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 application_sdk/activities/metadata_extraction/base.py,sha256=ENFojpxqKdN_eVSL4iet3cGfylPOfcl1jnflfo4zhs8,3920
@@ -16,8 +16,7 @@ application_sdk/activities/query_extraction/sql.py,sha256=mesGP_kiWzrJ8wboWFVt2j
 application_sdk/application/__init__.py,sha256=WDWDWP-IQ-ny7okqsrdTwH60cXKgXBRcnlJ1XVYfiNU,7957
 application_sdk/application/metadata_extraction/sql.py,sha256=ohpV4qZ92uKRlH7I_8G67ocnWkZJAZCU_7XdvqYPiN4,7966
 application_sdk/clients/__init__.py,sha256=C9T84J7V6ZumcoWJPAxdd3tqSmbyciaGBJn-CaCCny0,1341
-application_sdk/clients/
-application_sdk/clients/atlan.py,sha256=f2-Uk5KiPIDJEhGkfYctA_f3CwoVB_mWNBMVvxeLuY4,2684
+application_sdk/clients/atlan.py,sha256=l6yV39fr1006SJFwkOTNDQlbSFlHCZQaUPfdUlzdVEg,5053
 application_sdk/clients/atlan_auth.py,sha256=D7FuNqv81ohNXLJtdx1AFw_jU6a3g0Pw6149ia4ucFY,8930
 application_sdk/clients/base.py,sha256=TIn3pG89eXUc1XSYf4jk66m1vajWp0WxcCQOOltdazA,14021
 application_sdk/clients/redis.py,sha256=IfAD32vLp88BCvsDTaQtxFHxzHlEx4V7TK7h1HwDDBg,15917
@@ -56,7 +55,7 @@ application_sdk/handlers/sql.py,sha256=oeB-sgWwPYo31xaD87TyMc0h51Sary1F-CmhExt9_
 application_sdk/inputs/__init__.py,sha256=_d-cUhcDyoJTJR3PdQkC831go6VDw9AM6Bg7-qm3NHI,1900
 application_sdk/inputs/iceberg.py,sha256=xiv1kNtVx1k0h3ZJbJeXjZwdfBGSy9j9orYP_AyCYlI,2756
 application_sdk/inputs/json.py,sha256=Yv70Y9YuutN2trqK5-z2UNtBL0895ZbdEiBDt9cYM9s,6216
-application_sdk/inputs/parquet.py,sha256=
+application_sdk/inputs/parquet.py,sha256=GnyB0r4-7GNLBl3ooVFUzsxunZsrHStKK2h7XRc7AIY,6723
 application_sdk/inputs/sql_query.py,sha256=1EREgea6kKNaMIyX2HLJgbJ07rtAgLasd9NyvDcdZok,10636
 application_sdk/interceptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 application_sdk/interceptors/events.py,sha256=Kh0dEsc6q7YtlN9cxatiL_ZrmBxriv55r9lxvIKGg3A,6548
@@ -68,10 +67,10 @@ application_sdk/observability/observability.py,sha256=DP0I4bHyg3TA4hxCqDFy2IiRmB
 application_sdk/observability/traces_adaptor.py,sha256=0eQJPN-tYA_dV8D3uEa5ZiX9g12NDuLnPaFuQMVDdL0,18242
 application_sdk/observability/utils.py,sha256=MKEpT0WYtpATUgLgJDkGQaAP_t-jpDYMUKDfEvr8Phg,2448
 application_sdk/observability/decorators/observability_decorator.py,sha256=JNrWNXT5W4klmlAc5b8C3_VBjDu0PI64W2ptr7LMzk4,8110
-application_sdk/outputs/__init__.py,sha256
+application_sdk/outputs/__init__.py,sha256=HIENr2w9gu6u3sF_nvraj45yk53NDAddtaXSUHIVBjs,9469
 application_sdk/outputs/iceberg.py,sha256=IGtj5WDgqLu6vzDEvw5DLsKsjm29Krto3AHvWpemr0A,5311
-application_sdk/outputs/json.py,sha256=
-application_sdk/outputs/parquet.py,sha256=
+application_sdk/outputs/json.py,sha256=zyYQjGj5tb7bJhNt3ObwsuHT6Gakj8qNey-siUlWdP4,15065
+application_sdk/outputs/parquet.py,sha256=pJkOw-CV-JXr0Q4nJXCu0SScbv3I0usONZ4CT1KYxYI,13973
 application_sdk/server/__init__.py,sha256=KTqE1YPw_3WDVMWatJUuf9OOiobLM2K5SMaBrI62sCo,1568
 application_sdk/server/fastapi/__init__.py,sha256=YOdWNE-qqiXfo-exvxPg8T0PSuOxTdeSetUn6-BXxZg,27704
 application_sdk/server/fastapi/models.py,sha256=K6eNl3XXiTXKUvRTpq3oqdGH3jY1-ApobXma04J86fE,6665
@@ -139,11 +138,11 @@ application_sdk/transformers/query/templates/table.yaml,sha256=QQAGLD1UFjbpSA5wv
 application_sdk/transformers/query/templates/tag_attachment.yaml,sha256=dWNDGwRU4_P-t7ibv5XelMP36aGLG29U6MEXOA8zYt0,2884
 application_sdk/workflows/__init__.py,sha256=byluvgzTovr4L1co7YGb4--ktMBqt2pXBjYoxz4dIeU,3869
 application_sdk/workflows/metadata_extraction/__init__.py,sha256=jHUe_ZBQ66jx8bgyduPuECo2RdmJtQsQAKlakADEQbc,120
-application_sdk/workflows/metadata_extraction/sql.py,sha256=
+application_sdk/workflows/metadata_extraction/sql.py,sha256=BhaZavEL8H3Jvf28FGcHtZwqdsUT_EHZ4VTqiaieWek,12278
 application_sdk/workflows/query_extraction/__init__.py,sha256=n066_CX5RpJz6DIxGMkKS3eGSRg03ilaCtsqfJWQb7Q,117
 application_sdk/workflows/query_extraction/sql.py,sha256=kT_JQkLCRZ44ZpaC4QvPL6DxnRIIVh8gYHLqRbMI-hA,4826
-atlan_application_sdk-0.1.
-atlan_application_sdk-0.1.
-atlan_application_sdk-0.1.
-atlan_application_sdk-0.1.
-atlan_application_sdk-0.1.
+atlan_application_sdk-0.1.1rc38.dist-info/METADATA,sha256=dQKtTnaVMPF0DU96vA850Gl6gzpRSmuL48ovbq1wD5o,5567
+atlan_application_sdk-0.1.1rc38.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+atlan_application_sdk-0.1.1rc38.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+atlan_application_sdk-0.1.1rc38.dist-info/licenses/NOTICE,sha256=A-XVVGt3KOYuuMmvSMIFkg534F1vHiCggEBp4Ez3wGk,1041
+atlan_application_sdk-0.1.1rc38.dist-info/RECORD,,

application_sdk/clients/async_atlan.py
DELETED

@@ -1,70 +0,0 @@
-from typing import Optional
-
-from pyatlan.client.aio.client import AsyncAtlanClient
-
-from application_sdk.common.error_codes import ClientError
-from application_sdk.constants import (
-    ATLAN_API_KEY,
-    ATLAN_API_TOKEN_GUID,
-    ATLAN_BASE_URL,
-    ATLAN_CLIENT_ID,
-    ATLAN_CLIENT_SECRET,
-)
-from application_sdk.observability.logger_adaptor import get_logger
-
-logger = get_logger(__name__)
-
-
-async def get_client(
-    base_url: Optional[str] = None,
-    api_key: Optional[str] = None,
-    api_token_guid: Optional[str] = None,
-) -> AsyncAtlanClient:
-    """
-    Returns an authenticated AsyncAtlanClient instance using provided parameters or environment variables.
-
-    Selects authentication method based on the presence of parameters or environment variables and validates the required configuration.
-    In general, the use of environment variables is recommended. Any parameters specified will override the environment variables.
-
-    Args:
-        base_url: Atlan base URL (overrides ATLAN_BASE_URL)
-        api_key: Atlan API key (overrides ATLAN_API_KEY)
-        api_token_guid: API token GUID (overrides API_TOKEN_GUID)
-    """
-    # Resolve final values (parameters override env vars)
-    final_token_guid = api_token_guid or ATLAN_API_TOKEN_GUID
-    final_base_url = base_url or ATLAN_BASE_URL
-    final_api_key = api_key or ATLAN_API_KEY
-
-    # Priority 1: Token-based auth (recommended for production)
-    if final_token_guid:
-        if final_base_url or final_api_key:
-            logger.warning(
-                "Token auth takes precedence - ignoring base_url/api_key parameters as well as ATLAN_BASE_URL and ATLAN_API_KEY environment variables."
-            )
-        return await _get_client_from_token(final_token_guid)
-
-    # Priority 2: API key + base URL auth
-    if not final_base_url:
-        raise ClientError(
-            "ATLAN_BASE_URL is required (via parameter or environment variable)"
-        )
-    if not final_api_key:
-        raise ClientError(
-            "ATLAN_API_KEY is required (via parameter or environment variable)"
-        )
-
-    logger.info("Using API key-based authentication")
-    return AsyncAtlanClient(base_url=final_base_url, api_key=final_api_key)
-
-
-async def _get_client_from_token(api_token_guid: str):
-    if not ATLAN_CLIENT_ID:
-        raise ClientError(
-            f"{ClientError.AUTH_CONFIG_ERROR}: Environment variable CLIENT_ID is required when API_TOKEN_GUID is set."
-        )
-    if not ATLAN_CLIENT_SECRET:
-        raise ClientError(
-            f"{ClientError.AUTH_CONFIG_ERROR}: Environment variable CLIENT_SECRET is required when API_TOKEN_GUID is set."
-        )
-    return await AsyncAtlanClient.from_token_guid(guid=api_token_guid)

{atlan_application_sdk-0.1.1rc36.dist-info → atlan_application_sdk-0.1.1rc38.dist-info}/WHEEL
RENAMED
File without changes

{atlan_application_sdk-0.1.1rc36.dist-info → atlan_application_sdk-0.1.1rc38.dist-info}/licenses/LICENSE
RENAMED
File without changes

{atlan_application_sdk-0.1.1rc36.dist-info → atlan_application_sdk-0.1.1rc38.dist-info}/licenses/NOTICE
RENAMED
File without changes