atlan-application-sdk 0.1.1rc35__py3-none-any.whl → 0.1.1rc37__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/common/models.py +2 -1
- application_sdk/activities/lock_management.py +110 -0
- application_sdk/clients/redis.py +443 -0
- application_sdk/clients/temporal.py +31 -187
- application_sdk/common/error_codes.py +24 -3
- application_sdk/constants.py +18 -1
- application_sdk/decorators/__init__.py +0 -0
- application_sdk/decorators/locks.py +42 -0
- application_sdk/handlers/base.py +18 -1
- application_sdk/inputs/parquet.py +4 -4
- application_sdk/interceptors/__init__.py +0 -0
- application_sdk/interceptors/events.py +193 -0
- application_sdk/interceptors/lock.py +139 -0
- application_sdk/outputs/__init__.py +24 -1
- application_sdk/outputs/json.py +57 -38
- application_sdk/outputs/parquet.py +116 -18
- application_sdk/version.py +1 -1
- application_sdk/workflows/metadata_extraction/sql.py +18 -5
- {atlan_application_sdk-0.1.1rc35.dist-info → atlan_application_sdk-0.1.1rc37.dist-info}/METADATA +4 -2
- {atlan_application_sdk-0.1.1rc35.dist-info → atlan_application_sdk-0.1.1rc37.dist-info}/RECORD +23 -16
- {atlan_application_sdk-0.1.1rc35.dist-info → atlan_application_sdk-0.1.1rc37.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-0.1.1rc35.dist-info → atlan_application_sdk-0.1.1rc37.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-0.1.1rc35.dist-info → atlan_application_sdk-0.1.1rc37.dist-info}/licenses/NOTICE +0 -0
application_sdk/interceptors/lock.py ADDED

@@ -0,0 +1,139 @@
+"""Redis lock interceptor for Temporal workflows.
+
+Manages distributed locks for activities decorated with @needs_lock using
+separate lock acquisition and release activities to avoid workflow deadlocks.
+"""
+
+from datetime import timedelta
+from typing import Any, Dict, Optional, Type
+
+from temporalio import workflow
+from temporalio.common import RetryPolicy
+from temporalio.worker import (
+    Interceptor,
+    StartActivityInput,
+    WorkflowInboundInterceptor,
+    WorkflowInterceptorClassInput,
+    WorkflowOutboundInterceptor,
+)
+
+from application_sdk.common.error_codes import WorkflowError
+from application_sdk.constants import (
+    APPLICATION_NAME,
+    IS_LOCKING_DISABLED,
+    LOCK_METADATA_KEY,
+)
+from application_sdk.observability.logger_adaptor import get_logger
+
+logger = get_logger(__name__)
+
+
+class RedisLockInterceptor(Interceptor):
+    """Main interceptor class for Redis distributed locking."""
+
+    def __init__(self, activities: Dict[str, Any]):
+        """Initialize Redis lock interceptor.
+
+        Args:
+            activities: Dictionary mapping activity names to activity functions
+        """
+        self.activities = activities
+
+    def workflow_interceptor_class(
+        self, input: WorkflowInterceptorClassInput
+    ) -> Optional[Type[WorkflowInboundInterceptor]]:
+        activities = self.activities
+
+        class RedisLockWorkflowInboundInterceptor(WorkflowInboundInterceptor):
+            """Inbound interceptor that manages Redis locks for activities."""
+
+            def init(self, outbound: WorkflowOutboundInterceptor) -> None:
+                """Initialize with Redis lock outbound interceptor."""
+                lock_outbound = RedisLockOutboundInterceptor(outbound, activities)
+                super().init(lock_outbound)
+
+        return RedisLockWorkflowInboundInterceptor
+
+
+class RedisLockOutboundInterceptor(WorkflowOutboundInterceptor):
+    """Outbound interceptor that acquires Redis locks before activity execution."""
+
+    def __init__(self, next: WorkflowOutboundInterceptor, activities: Dict[str, Any]):
+        super().__init__(next)
+        self.activities = activities
+
+    async def start_activity(  # type: ignore[override]
+        self, input: StartActivityInput
+    ) -> workflow.ActivityHandle[Any]:
+        """Start activity with distributed lock if required."""
+
+        # Check if activity needs locking
+        activity_fn = self.activities.get(input.activity)
+        if (
+            not activity_fn
+            or not hasattr(activity_fn, LOCK_METADATA_KEY)
+            or IS_LOCKING_DISABLED
+        ):
+            return await self.next.start_activity(input)
+
+        lock_config = getattr(activity_fn, LOCK_METADATA_KEY)
+        lock_name = lock_config.get("lock_name", input.activity)
+        max_locks = lock_config.get("max_locks", 5)
+        if not input.schedule_to_close_timeout:
+            logger.error(
+                f"Activity '{input.activity}' with @needs_lock decorator requires schedule_to_close_timeout"
+            )
+            raise WorkflowError(
+                f"{WorkflowError.WORKFLOW_CONFIG_ERROR}: Activity '{input.activity}' with @needs_lock decorator must be called with schedule_to_close_timeout parameter. "
+                f"Example: workflow.execute_activity('{input.activity}', schedule_to_close_timeout=timedelta(minutes=10))"
+            )
+        ttl_seconds = int(input.schedule_to_close_timeout.total_seconds())
+
+        # Orchestrate lock acquisition -> business activity -> lock release
+        return await self._execute_with_lock_orchestration(
+            input, lock_name, max_locks, ttl_seconds
+        )
+
+    async def _execute_with_lock_orchestration(
+        self,
+        input: StartActivityInput,
+        lock_name: str,
+        max_locks: int,
+        ttl_seconds: int,
+    ) -> workflow.ActivityHandle[Any]:
+        """Execute activity with distributed lock orchestration."""
+        owner_id = f"{APPLICATION_NAME}:{workflow.info().run_id}"
+        lock_result = None
+
+        try:
+            # Step 1: Acquire lock via dedicated activity (can take >2s safely)
+            start_to_close_timeout = workflow.info().execution_timeout
+            lock_result = await workflow.execute_activity(
+                "acquire_distributed_lock",
+                args=[lock_name, max_locks, ttl_seconds, owner_id],
+                start_to_close_timeout=start_to_close_timeout,
+                retry_policy=RetryPolicy(maximum_attempts=1),
+            )
+
+            logger.debug(f"Lock acquired: {lock_result}, executing {input.activity}")
+
+            # Step 2: Execute the business activity and return its handle
+            return await self.next.start_activity(input)
+
+        finally:
+            # Step 3: Release lock (fire-and-forget with short timeout)
+            if lock_result is not None:
+                try:
+                    await workflow.execute_local_activity(
+                        "release_distributed_lock",
+                        args=[lock_result["resource_id"], lock_result["owner_id"]],
+                        start_to_close_timeout=timedelta(seconds=5),
+                        retry_policy=RetryPolicy(maximum_attempts=1),
+                    )
+                    logger.debug(f"Lock released: {lock_result['resource_id']}")
+                except Exception as e:
+                    # Silent failure - TTL will handle cleanup
+                    logger.warning(
+                        f"Lock release failed for {lock_result['resource_id']}: {e}. "
+                        f"TTL will handle cleanup."
+                    )
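For orientation, here is a minimal, hypothetical sketch of how a workflow would call an activity guarded by this interceptor. The `fetch_tables` activity name and the `@needs_lock(lock_name=..., max_locks=...)` decorator call are assumptions (the decorator ships in `application_sdk/decorators/locks.py`, whose diff is not expanded here); the `schedule_to_close_timeout` requirement and its reuse as the Redis lock TTL come directly from the interceptor above.

```python
from datetime import timedelta

from temporalio import workflow

# Assumed, for illustration only -- the decorator lives in
# application_sdk/decorators/locks.py but its exact signature is not shown here:
#
#     @activity.defn
#     @needs_lock(lock_name="source-db", max_locks=5)
#     async def fetch_tables(...): ...


@workflow.defn
class ExampleWorkflow:
    @workflow.run
    async def run(self) -> None:
        # The interceptor rejects lock-decorated activities started without a
        # schedule_to_close_timeout; that timeout doubles as the lock TTL.
        await workflow.execute_activity(
            "fetch_tables",  # hypothetical activity name
            schedule_to_close_timeout=timedelta(minutes=10),
        )
```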
application_sdk/outputs/__init__.py CHANGED

@@ -13,6 +13,7 @@ from typing import (
     Dict,
     Generator,
     List,
+    Literal,
     Optional,
     Union,
     cast,

@@ -31,7 +32,7 @@ logger = get_logger(__name__)
 activity.logger = logger

 if TYPE_CHECKING:
-    import daft
+    import daft  # type: ignore
     import pandas as pd

@@ -52,6 +53,27 @@ class Output(ABC):
     output_prefix: str
     total_record_count: int
     chunk_count: int
+    statistics: List[int] = []
+
+    def estimate_dataframe_file_size(
+        self, dataframe: "pd.DataFrame", file_type: Literal["json", "parquet"]
+    ) -> int:
+        """Estimate File size of a DataFrame by sampling a few records."""
+        if len(dataframe) == 0:
+            return 0
+
+        # Sample up to 10 records to estimate average size
+        sample_size = min(10, len(dataframe))
+        sample = dataframe.head(sample_size)
+        if file_type == "json":
+            sample_file = sample.to_json(orient="records", lines=True)
+        else:
+            sample_file = sample.to_parquet(index=False, compression="snappy")
+        if sample_file is not None:
+            avg_record_size = len(sample_file) / sample_size
+            return int(avg_record_size * len(dataframe))
+
+        return 0

     def process_null_fields(
         self,

@@ -217,6 +239,7 @@ class Output(ABC):
         statistics = {
             "total_record_count": self.total_record_count,
             "chunk_count": self.chunk_count,
+            "partitions": self.statistics,
         }

         # Write the statistics to a json file
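The new `estimate_dataframe_file_size` helper extrapolates from a small sample rather than serializing the whole frame: it serializes up to 10 rows, takes the average serialized size per row, and multiplies by the total row count. A standalone illustration of that arithmetic, with made-up data:

```python
import pandas as pd

# Toy frame standing in for an extracted-metadata chunk.
df = pd.DataFrame(
    {"name": [f"table_{i}" for i in range(1000)], "schema": ["public"] * 1000}
)

# Same sampling logic as estimate_dataframe_file_size with file_type="json".
sample = df.head(min(10, len(df)))
sample_json = sample.to_json(orient="records", lines=True)
avg_record_size = len(sample_json) / len(sample)

estimated_bytes = int(avg_record_size * len(df))
print(estimated_bytes)  # average bytes per sampled row * total row count
```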
application_sdk/outputs/json.py
CHANGED
@@ -6,6 +6,7 @@ import orjson
 from temporalio import activity

 from application_sdk.activities.common.utils import get_object_store_prefix
+from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
 from application_sdk.observability.logger_adaptor import get_logger
 from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
 from application_sdk.outputs import Output

@@ -15,7 +16,7 @@ logger = get_logger(__name__)
 activity.logger = logger

 if TYPE_CHECKING:
-    import daft
+    import daft  # type: ignore
     import pandas as pd

@@ -32,7 +33,7 @@ def path_gen(chunk_start: int | None, chunk_count: int) -> str:
     if chunk_start is None:
         return f"{str(chunk_count)}.json"
     else:
-        return f"{
+        return f"chunk-{chunk_start}-part{chunk_count}.json"


 def convert_datetime_to_epoch(data: Any) -> Any:

@@ -124,6 +125,10 @@ class JsonOutput(Output):
         self.chunk_size = chunk_size or 100000
         self.buffer: List[Union["pd.DataFrame", "daft.DataFrame"]] = []  # noqa: F821
         self.current_buffer_size = 0
+        self.current_buffer_size_bytes = 0  # Track estimated buffer size in bytes
+        self.max_file_size_bytes = int(
+            DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
+        )  # 90% of DAPR limit as safety buffer
         self.path_gen = path_gen
         self.start_marker = start_marker
         self.end_marker = end_marker

@@ -172,8 +177,21 @@ class JsonOutput(Output):
         ]

         for chunk in chunks:
+            # Estimate size of this chunk
+            chunk_size_bytes = self.estimate_dataframe_file_size(chunk, "json")
+
+            # Check if adding this chunk would exceed size limit
+            if (
+                self.current_buffer_size_bytes + chunk_size_bytes
+                > self.max_file_size_bytes
+                and self.current_buffer_size > 0
+            ):
+                # Flush current buffer before adding this chunk
+                await self._flush_buffer()
+
             self.buffer.append(chunk)
             self.current_buffer_size += len(chunk)
+            self.current_buffer_size_bytes += chunk_size_bytes

             if self.current_buffer_size >= partition:
                 await self._flush_buffer()

@@ -237,45 +255,19 @@ class JsonOutput(Output):
                 row, preserve_fields, null_to_empty_dict_fields
             )
             # Serialize the row and add it to the buffer
-
-
-
-
-            )
-
-
-
-
-            output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count)}"
-            with open(output_file_name, "w") as f:
-                f.writelines(buffer)
-            buffer.clear()  # Clear the buffer
-
-            # Record chunk metrics
-            self.metrics.record_metric(
-                name="json_chunks_written",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={"type": "daft"},
-                description="Number of chunks written to JSON files",
-            )
+            serialized_row = orjson.dumps(
+                cleaned_row, option=orjson.OPT_APPEND_NEWLINE
+            ).decode("utf-8")
+            buffer.append(serialized_row)
+            self.current_buffer_size_bytes += len(serialized_row)
+            if (self.chunk_size and len(buffer) >= self.chunk_size) or (
+                self.current_buffer_size_bytes > self.max_file_size_bytes
+            ):
+                await self.flush_daft_buffer(buffer)

         # Write any remaining rows in the buffer
         if buffer:
-            self.
-            output_file_name = f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count)}"
-            with open(output_file_name, "w") as f:
-                f.writelines(buffer)
-            buffer.clear()
-
-            # Record chunk metrics
-            self.metrics.record_metric(
-                name="json_chunks_written",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={"type": "daft"},
-                description="Number of chunks written to JSON files",
-            )
+            await self.flush_daft_buffer(buffer)

         # Record metrics for successful write
         self.metrics.record_metric(

@@ -303,6 +295,32 @@ class JsonOutput(Output):
             )
             logger.error(f"Error writing daft dataframe to json: {str(e)}")

+    async def flush_daft_buffer(self, buffer: List[str]):
+        """Flush the current buffer to a JSON file.
+
+        This method combines all DataFrames in the buffer, writes them to a JSON file,
+        and uploads the file to the object store.
+        """
+        self.chunk_count += 1
+        output_file_name = (
+            f"{self.output_path}/{self.path_gen(self.chunk_start, self.chunk_count)}"
+        )
+        with open(output_file_name, "w") as f:
+            f.writelines(buffer)
+        buffer.clear()  # Clear the buffer
+
+        self.current_buffer_size = 0
+        self.current_buffer_size_bytes = 0
+
+        # Record chunk metrics
+        self.metrics.record_metric(
+            name="json_chunks_written",
+            value=1,
+            metric_type=MetricType.COUNTER,
+            labels={"type": "daft"},
+            description="Number of chunks written to JSON files",
+        )
+
     async def _flush_buffer(self):
         """Flush the current buffer to a JSON file.

@@ -353,6 +371,7 @@ class JsonOutput(Output):

             self.buffer.clear()
             self.current_buffer_size = 0
+            self.current_buffer_size_bytes = 0

         except Exception as e:
             # Record metrics for failed write
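The byte-budget logic above caps each written JSON file at 90% of the Dapr gRPC message limit, so no single chunk file exceeds what the sidecar can accept on upload. A minimal sketch of the flush decision; the 4 MiB value is an assumption about `DAPR_MAX_GRPC_MESSAGE_LENGTH` (the real value comes from `application_sdk.constants`):

```python
# Assumed limit for illustration; the SDK reads DAPR_MAX_GRPC_MESSAGE_LENGTH
# from application_sdk.constants.
DAPR_MAX_GRPC_MESSAGE_LENGTH = 4 * 1024 * 1024
max_file_size_bytes = int(DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9)  # 90% safety buffer

current_buffer_size = 42_000              # records already buffered
current_buffer_size_bytes = 3_500_000     # estimated bytes already buffered
chunk_size_bytes = 600_000                # estimated size of the incoming chunk

# Mirror of the check in write_dataframe: flush the buffer to its own file
# first if appending this chunk would push the file past the cap.
if (
    current_buffer_size_bytes + chunk_size_bytes > max_file_size_bytes
    and current_buffer_size > 0
):
    print("flush current buffer, then buffer the new chunk")
```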
application_sdk/outputs/parquet.py CHANGED

@@ -1,9 +1,10 @@
 import os
-from typing import TYPE_CHECKING, Literal, Optional
+from typing import TYPE_CHECKING, List, Literal, Optional, Union

 from temporalio import activity

 from application_sdk.activities.common.utils import get_object_store_prefix
+from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
 from application_sdk.observability.logger_adaptor import get_logger
 from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
 from application_sdk.outputs import Output

@@ -13,7 +14,7 @@ logger = get_logger(__name__)
 activity.logger = logger

 if TYPE_CHECKING:
-    import daft
+    import daft  # type: ignore
     import pandas as pd

@@ -46,6 +47,7 @@ class ParquetOutput(Output):
         typename: Optional[str] = None,
         write_mode: Literal["append", "overwrite", "overwrite-partitions"] = "append",
         chunk_size: Optional[int] = 100000,
+        buffer_size: Optional[int] = 100000,
         total_record_count: int = 0,
         chunk_count: int = 0,
         chunk_start: Optional[int] = None,

@@ -78,11 +80,19 @@ class ParquetOutput(Output):
         self.typename = typename
         self.write_mode = write_mode
         self.chunk_size = chunk_size
+        self.buffer_size = buffer_size
+        self.buffer: List[Union["pd.DataFrame", "daft.DataFrame"]] = []  # noqa: F821
         self.total_record_count = total_record_count
         self.chunk_count = chunk_count
+        self.current_buffer_size = 0
+        self.current_buffer_size_bytes = 0  # Track estimated buffer size in bytes
+        self.max_file_size_bytes = int(
+            DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
+        )  # 90% of DAPR limit as safety buffer
         self.chunk_start = chunk_start
         self.start_marker = start_marker
         self.end_marker = end_marker
+        self.statistics = []
         self.metrics = get_metrics()

         # Create output directory

@@ -117,7 +127,7 @@ class ParquetOutput(Output):
         if chunk_start is None:
             return f"{str(chunk_count)}.parquet"
         else:
-            return f"{str(chunk_start
+            return f"chunk-{str(chunk_start)}-part{str(chunk_count)}.parquet"

     async def write_dataframe(self, dataframe: "pd.DataFrame"):
         """Write a pandas DataFrame to Parquet files and upload to object store.

@@ -126,20 +136,46 @@ class ParquetOutput(Output):
             dataframe (pd.DataFrame): The DataFrame to write.
         """
         try:
+            chunk_part = 0
             if len(dataframe) == 0:
                 return

-            #
-
-
-
-
-            # Write the dataframe to parquet using pandas native method
-            dataframe.to_parquet(
-                file_path,
-                index=False,
-                compression="snappy",  # Using snappy compression by default
+            # Split the DataFrame into chunks
+            partition = (
+                self.chunk_size
+                if self.chunk_start is None
+                else min(self.chunk_size, self.buffer_size)
             )
+            chunks = [
+                dataframe[i : i + partition]  # type: ignore
+                for i in range(0, len(dataframe), partition)
+            ]
+
+            for chunk in chunks:
+                # Estimate size of this chunk
+                chunk_size_bytes = self.estimate_dataframe_file_size(chunk, "parquet")
+
+                # Check if adding this chunk would exceed size limit
+                if (
+                    self.current_buffer_size_bytes + chunk_size_bytes
+                    > self.max_file_size_bytes
+                    and self.current_buffer_size > 0
+                ):
+                    # Flush current buffer before adding this chunk
+                    chunk_part += 1
+                    await self._flush_buffer(chunk_part)
+
+                self.buffer.append(chunk)
+                self.current_buffer_size += len(chunk)
+                self.current_buffer_size_bytes += chunk_size_bytes
+
+                if self.current_buffer_size >= partition:  # type: ignore
+                    chunk_part += 1
+                    await self._flush_buffer(chunk_part)
+
+            if self.buffer and self.current_buffer_size > 0:
+                chunk_part += 1
+                await self._flush_buffer(chunk_part)

             # Record metrics for successful write
             self.metrics.record_metric(

@@ -159,11 +195,8 @@ class ParquetOutput(Output):
                 description="Number of chunks written to Parquet files",
             )

-
-
-                source=file_path,
-                destination=get_object_store_prefix(file_path),
-            )
+            self.chunk_count += 1
+            self.statistics.append(chunk_part)
         except Exception as e:
             # Record metrics for failed write
             self.metrics.record_metric(

@@ -245,3 +278,68 @@ class ParquetOutput(Output):
             str: The full path of the output file.
         """
         return self.output_path
+
+    async def _flush_buffer(self, chunk_part):
+        """Flush the current buffer to a Parquet file.
+
+        This method combines all DataFrames in the buffer, writes them to a Parquet file,
+        and uploads the file to the object store.
+
+        Note:
+            If the buffer is empty or has no records, the method returns without writing.
+        """
+        import pandas as pd
+
+        if not self.buffer or not self.current_buffer_size:
+            return
+
+        if not all(isinstance(df, pd.DataFrame) for df in self.buffer):
+            raise TypeError(
+                "_flush_buffer encountered non-DataFrame elements in buffer. This should not happen."
+            )
+
+        try:
+            # Now it's safe to cast for pd.concat
+            pd_buffer: List[pd.DataFrame] = self.buffer  # type: ignore
+            combined_dataframe = pd.concat(pd_buffer)
+
+            # Write DataFrame to Parquet file
+            if not combined_dataframe.empty:
+                self.total_record_count += len(combined_dataframe)
+                output_file_name = (
+                    f"{self.output_path}/{self.path_gen(self.chunk_count, chunk_part)}"
+                )
+                combined_dataframe.to_parquet(
+                    output_file_name, index=False, compression="snappy"
+                )
+
+                # Record chunk metrics
+                self.metrics.record_metric(
+                    name="parquet_chunks_written",
+                    value=1,
+                    metric_type=MetricType.COUNTER,
+                    labels={"type": "pandas"},
+                    description="Number of chunks written to Parquet files",
+                )
+
+                # Push the file to the object store
+                await ObjectStore.upload_file(
+                    source=output_file_name,
+                    destination=get_object_store_prefix(output_file_name),
+                )
+
+                self.buffer.clear()
+                self.current_buffer_size = 0
+                self.current_buffer_size_bytes = 0
+
+        except Exception as e:
+            # Record metrics for failed write
+            self.metrics.record_metric(
+                name="parquet_write_errors",
+                value=1,
+                metric_type=MetricType.COUNTER,
+                labels={"type": "pandas", "error": str(e)},
+                description="Number of errors while writing to Parquet files",
+            )
+            logger.error(f"Error flushing buffer to parquet: {str(e)}")
+            raise e
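Two things change together here: chunk files are now named `chunk-<chunk_start>-part<part>.parquet`, and each `write_dataframe` call records how many part files it flushed in `self.statistics` (surfaced as `partitions` in the activity statistics). A small illustration of the naming, with made-up numbers:

```python
# Mirror of ParquetOutput.path_gen for the chunked case.
def path_gen(chunk_start, chunk_count):
    if chunk_start is None:
        return f"{chunk_count}.parquet"
    return f"chunk-{chunk_start}-part{chunk_count}.parquet"

# Suppose the write_dataframe call for chunk 0 flushed its buffer three times:
print(path_gen(0, 1))  # chunk-0-part1.parquet
print(path_gen(0, 2))  # chunk-0-part2.parquet
print(path_gen(0, 3))  # chunk-0-part3.parquet

# statistics then records 3 parts for that chunk and is written out as the
# "partitions" entry in the activity statistics JSON.
statistics = [3]
```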
application_sdk/version.py
CHANGED
application_sdk/workflows/metadata_extraction/sql.py CHANGED

@@ -107,7 +107,11 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
         activity_statistics = ActivityStatistics.model_validate(raw_statistics)
         transform_activities: List[Any] = []

-        if
+        if (
+            activity_statistics is None
+            or activity_statistics.chunk_count == 0
+            or not activity_statistics.partitions
+        ):
             # to handle the case where the fetch_fn returns None or no chunks
             return

@@ -115,7 +119,9 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
             raise ValueError("Invalid typename")

         batches, chunk_starts = self.get_transform_batches(
-            activity_statistics.chunk_count,
+            activity_statistics.chunk_count,
+            activity_statistics.typename,
+            activity_statistics.partitions,
         )

         for i in range(len(batches)):

@@ -144,7 +150,9 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
             total_record_count += metadata_model.total_record_count
             chunk_count += metadata_model.chunk_count

-    def get_transform_batches(
+    def get_transform_batches(
+        self, chunk_count: int, typename: str, partitions: List[int]
+    ):
         """Get batches for parallel transformation processing.

         Args:

@@ -159,12 +167,17 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
         batches: List[List[str]] = []
         chunk_start_numbers: List[int] = []

-        for i in
+        for i, partition in enumerate(partitions):
             # Track starting chunk number (which is just i)
             chunk_start_numbers.append(i)

             # Each batch contains exactly one chunk
-            batches.append(
+            batches.append(
+                [
+                    f"{typename}/chunk-{i}-part{file+1}.parquet"
+                    for file in range(partition)
+                ]
+            )

         return batches, chunk_start_numbers

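With the partition counts flowing through the statistics, `get_transform_batches` now expands each chunk into one batch containing all of its part files. A worked example with assumed values (`typename="table"`, `partitions=[2, 1, 3]`):

```python
typename = "table"      # assumed typename for illustration
partitions = [2, 1, 3]  # part-file counts reported per chunk

batches, chunk_start_numbers = [], []
for i, partition in enumerate(partitions):
    chunk_start_numbers.append(i)
    # Each batch contains exactly one chunk, expanded into its part files.
    batches.append(
        [f"{typename}/chunk-{i}-part{part + 1}.parquet" for part in range(partition)]
    )

print(chunk_start_numbers)  # [0, 1, 2]
print(batches[0])  # ['table/chunk-0-part1.parquet', 'table/chunk-0-part2.parquet']
print(batches[2])  # ['table/chunk-2-part1.parquet', 'table/chunk-2-part2.parquet',
                   #  'table/chunk-2-part3.parquet']
```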
{atlan_application_sdk-0.1.1rc35.dist-info → atlan_application_sdk-0.1.1rc37.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: atlan-application-sdk
-Version: 0.1.1rc35
+Version: 0.1.1rc37
 Summary: Atlan Application SDK is a Python library for developing applications on the Atlan Platform
 Project-URL: Repository, https://github.com/atlanhq/application-sdk
 Project-URL: Documentation, https://github.com/atlanhq/application-sdk/README.md

@@ -26,12 +26,14 @@ Requires-Dist: fastapi[standard]>=0.115.0
 Requires-Dist: loguru>=0.7.3
 Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
 Requires-Dist: psutil>=7.0.0
-Requires-Dist: pyatlan>=8.0.
+Requires-Dist: pyatlan>=8.0.2
 Requires-Dist: pydantic>=2.10.6
 Requires-Dist: python-dotenv>=1.1.0
 Requires-Dist: uvloop>=0.21.0; sys_platform != 'win32'
 Provides-Extra: daft
 Requires-Dist: daft>=0.4.12; extra == 'daft'
+Provides-Extra: distributed-lock
+Requires-Dist: redis[hiredis]>=5.2.0; extra == 'distributed-lock'
 Provides-Extra: iam-auth
 Requires-Dist: boto3>=1.38.6; extra == 'iam-auth'
 Provides-Extra: iceberg