atlan-application-sdk 0.1.1rc43__py3-none-any.whl → 0.1.1rc44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/metadata_extraction/sql.py +9 -35
- application_sdk/activities/query_extraction/sql.py +0 -2
- application_sdk/inputs/parquet.py +15 -3
- application_sdk/inputs/sql_query.py +2 -2
- application_sdk/interceptors/cleanup.py +0 -1
- application_sdk/outputs/__init__.py +176 -12
- application_sdk/outputs/json.py +57 -181
- application_sdk/outputs/parquet.py +230 -161
- application_sdk/transformers/query/__init__.py +1 -1
- application_sdk/version.py +1 -1
- application_sdk/workflows/metadata_extraction/sql.py +5 -4
- {atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/METADATA +1 -1
- {atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/RECORD +16 -16
- {atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/licenses/NOTICE +0 -0
application_sdk/outputs/parquet.py
CHANGED

@@ -1,10 +1,13 @@
+import inspect
 import os
+import shutil
 from enum import Enum
-from typing import TYPE_CHECKING, List, Optional, Union
+from typing import TYPE_CHECKING, AsyncGenerator, Generator, List, Optional, Union, cast
 
 from temporalio import activity
 
 from application_sdk.activities.common.utils import get_object_store_prefix
+from application_sdk.common.dataframe_utils import is_empty_dataframe
 from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
 from application_sdk.observability.logger_adaptor import get_logger
 from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
@@ -35,57 +38,57 @@ class ParquetOutput(Output):
 
     Attributes:
         output_path (str): Base path where Parquet files will be written.
-        output_prefix (str): Prefix for files when uploading to object store.
         output_suffix (str): Suffix for output files.
         typename (Optional[str]): Type name of the entity e.g database, schema, table.
         chunk_size (int): Maximum number of records per chunk.
         total_record_count (int): Total number of records processed.
         chunk_count (int): Number of chunks created.
         chunk_start (Optional[int]): Starting index for chunk numbering.
-        path_gen (Callable): Function to generate file paths.
         start_marker (Optional[str]): Start marker for query extraction.
         end_marker (Optional[str]): End marker for query extraction.
+        retain_local_copy (bool): Whether to retain the local copy of the files.
+        use_consolidation (bool): Whether to use consolidation.
     """
 
+    _EXTENSION = ".parquet"
+
     def __init__(
         self,
         output_path: str = "",
         output_suffix: str = "",
-        output_prefix: str = "",
         typename: Optional[str] = None,
         chunk_size: Optional[int] = 100000,
-        buffer_size:
+        buffer_size: int = 5000,
         total_record_count: int = 0,
         chunk_count: int = 0,
         chunk_start: Optional[int] = None,
         start_marker: Optional[str] = None,
         end_marker: Optional[str] = None,
         retain_local_copy: bool = False,
+        use_consolidation: bool = False,
     ):
         """Initialize the Parquet output handler.
 
         Args:
             output_path (str): Base path where Parquet files will be written.
             output_suffix (str): Suffix for output files.
-            output_prefix (str): Prefix for files when uploading to object store.
             typename (Optional[str], optional): Type name of the entity e.g database, schema, table.
             chunk_size (int, optional): Maximum records per chunk. Defaults to 100000.
             total_record_count (int, optional): Initial total record count. Defaults to 0.
             chunk_count (int, optional): Initial chunk count. Defaults to 0.
             chunk_start (Optional[int], optional): Starting index for chunk numbering.
                 Defaults to None.
-            path_gen (Callable, optional): Function to generate file paths.
-                Defaults to path_gen function.
             start_marker (Optional[str], optional): Start marker for query extraction.
                 Defaults to None.
             end_marker (Optional[str], optional): End marker for query extraction.
                 Defaults to None.
             retain_local_copy (bool, optional): Whether to retain the local copy of the files.
                 Defaults to False.
+            use_consolidation (bool, optional): Whether to use consolidation.
+                Defaults to False.
         """
         self.output_path = output_path
         self.output_suffix = output_suffix
-        self.output_prefix = output_prefix
         self.typename = typename
         self.chunk_size = chunk_size
         self.buffer_size = buffer_size
@@ -98,128 +101,84 @@ class ParquetOutput(Output):
             DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
         ) # 90% of DAPR limit as safety buffer
         self.chunk_start = chunk_start
+        self.chunk_part = 0
         self.start_marker = start_marker
         self.end_marker = end_marker
-        self.
+        self.partitions = []
         self.metrics = get_metrics()
         self.retain_local_copy = retain_local_copy
 
+        # Consolidation-specific attributes
+        # Use consolidation to efficiently write parquet files in buffered manner
+        # since there's no cleaner way to write parquet files incrementally
+        self.use_consolidation = use_consolidation
+        self.consolidation_threshold = (
+            chunk_size or 100000
+        )  # Use chunk_size as threshold
+        self.current_folder_records = 0  # Track records in current temp folder
+        self.temp_folder_index = 0  # Current temp folder index
+        self.temp_folders_created: List[int] = []  # Track temp folders for cleanup
+        self.current_temp_folder_path: Optional[str] = None  # Current temp folder path
+
+        if self.chunk_start:
+            self.chunk_count = self.chunk_start + self.chunk_count
+
         # Create output directory
         self.output_path = os.path.join(self.output_path, self.output_suffix)
         if self.typename:
             self.output_path = os.path.join(self.output_path, self.typename)
         os.makedirs(self.output_path, exist_ok=True)
 
-    def
+    async def write_batched_dataframe(
         self,
-
-
-
-
-
-        """Generate a file path for a chunk.
-
-        Args:
-            chunk_start (Optional[int]): Starting index of the chunk, or None for single chunk.
-            chunk_count (int): Total number of chunks.
-            start_marker (Optional[str]): Start marker for query extraction.
-            end_marker (Optional[str]): End marker for query extraction.
-
-        Returns:
-            str: Generated file path for the chunk.
-        """
-        # For Query Extraction - use start and end markers without chunk count
-        if start_marker and end_marker:
-            return f"{start_marker}_{end_marker}.parquet"
+        batched_dataframe: Union[
+            AsyncGenerator["pd.DataFrame", None], Generator["pd.DataFrame", None, None]
+        ],
+    ):
+        """Write a batched pandas DataFrame to Parquet files with consolidation support.
 
-
-
-            return f"{str(chunk_count)}.parquet"
-        else:
-            return f"chunk-{str(chunk_start)}-part{str(chunk_count)}.parquet"
+        This method implements a consolidation strategy to efficiently write parquet files
+        in a buffered manner, since there's no cleaner way to write parquet files incrementally.
 
-
-
+        The process:
+        1. Accumulate DataFrames into temp folders (buffer_size chunks each)
+        2. When consolidation_threshold is reached, use Daft to merge into optimized files
+        3. Clean up temporary files after consolidation
 
         Args:
-
+            batched_dataframe: AsyncGenerator or Generator of pandas DataFrames to write.
         """
-
-
-
-
-
-        # Split the DataFrame into chunks
-        partition = (
-            self.chunk_size
-            if self.chunk_start is None
-            else min(self.chunk_size, self.buffer_size)
-        )
-        chunks = [
-            dataframe[i : i + partition]  # type: ignore
-            for i in range(0, len(dataframe), partition)
-        ]
-
-        for chunk in chunks:
-            # Estimate size of this chunk
-            chunk_size_bytes = self.estimate_dataframe_file_size(chunk, "parquet")
-
-            # Check if adding this chunk would exceed size limit
-            if (
-                self.current_buffer_size_bytes + chunk_size_bytes
-                > self.max_file_size_bytes
-                and self.current_buffer_size > 0
-            ):
-                # Flush current buffer before adding this chunk
-                chunk_part += 1
-                await self._flush_buffer(chunk_part)
-
-            self.buffer.append(chunk)
-            self.current_buffer_size += len(chunk)
-            self.current_buffer_size_bytes += chunk_size_bytes
-
-            if self.current_buffer_size >= partition:  # type: ignore
-                chunk_part += 1
-                await self._flush_buffer(chunk_part)
+        if not self.use_consolidation:
+            # Fallback to base class implementation
+            await super().write_batched_dataframe(batched_dataframe)
+            return
 
-
-
-
+        try:
+            # Phase 1: Accumulate DataFrames into temp folders
+            if inspect.isasyncgen(batched_dataframe):
+                async for dataframe in batched_dataframe:
+                    if not is_empty_dataframe(dataframe):
+                        await self._accumulate_dataframe(dataframe)
+            else:
+                sync_generator = cast(
+                    Generator["pd.DataFrame", None, None], batched_dataframe
+                )
+                for dataframe in sync_generator:
+                    if not is_empty_dataframe(dataframe):
+                        await self._accumulate_dataframe(dataframe)
 
-            #
-            self.
-
-                value=len(dataframe),
-                metric_type=MetricType.COUNTER,
-                labels={"type": "pandas", "mode": WriteMode.APPEND.value},
-                description="Number of records written to Parquet files from pandas DataFrame",
-            )
+            # Phase 2: Consolidate any remaining temp folder
+            if self.current_folder_records > 0:
+                await self._consolidate_current_folder()
 
-            #
-            self.
-                name="parquet_chunks_written",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={"type": "pandas", "mode": WriteMode.APPEND.value},
-                description="Number of chunks written to Parquet files",
-            )
+            # Phase 3: Cleanup temp folders
+            await self._cleanup_temp_folders()
 
-            self.chunk_count += 1
-            self.statistics.append(chunk_part)
         except Exception as e:
-
-
-                name="parquet_write_errors",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={
-                    "type": "pandas",
-                    "mode": WriteMode.APPEND.value,
-                    "error": str(e),
-                },
-                description="Number of errors while writing to Parquet files",
+            logger.error(
+                f"Error in batched dataframe writing with consolidation: {str(e)}"
             )
-
+            await self._cleanup_temp_folders()  # Cleanup on error
             raise
 
     async def write_daft_dataframe(
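For orientation, here is a minimal sketch of how the consolidation path introduced in this hunk might be driven from application code. The import path, the sample data, and the output paths are assumptions for illustration; only the constructor parameters and the write_batched_dataframe signature come from the diff above.

import asyncio

import pandas as pd

from application_sdk.outputs.parquet import ParquetOutput  # assumed import path


def batches():
    # A synchronous generator of DataFrames; an async generator is also accepted
    # (the method checks inspect.isasyncgen and iterates accordingly).
    for start in range(0, 30_000, 10_000):
        yield pd.DataFrame({"id": range(start, start + 10_000)})


async def main():
    output = ParquetOutput(
        output_path="/tmp/extract",  # hypothetical local path
        output_suffix="raw",
        typename="table",
        chunk_size=100_000,          # also used as the consolidation threshold
        buffer_size=5_000,           # records per temp-folder chunk file
        use_consolidation=True,      # route through the new temp-folder flow
    )
    # Phase 1 accumulates chunks into temp folders, phase 2 consolidates them
    # with Daft, and phase 3 removes the temp folders, as described above.
    await output.write_batched_dataframe(batches())


if __name__ == "__main__":
    asyncio.run(main())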
@@ -320,7 +279,13 @@ class ParquetOutput(Output):
                 name="parquet_write_errors",
                 value=1,
                 metric_type=MetricType.COUNTER,
-                labels={
+                labels={
+                    "type": "daft",
+                    "mode": write_mode.value
+                    if isinstance(write_mode, WriteMode)
+                    else write_mode,
+                    "error": str(e),
+                },
                 description="Number of errors while writing to Parquet files",
             )
             logger.error(f"Error writing daft dataframe to parquet: {str(e)}")
@@ -334,67 +299,171 @@ class ParquetOutput(Output):
         """
         return self.output_path
 
-
-
-
-
-
+    # Consolidation helper methods
+
+    def _get_temp_folder_path(self, folder_index: int) -> str:
+        """Generate temp folder path consistent with existing structure."""
+        temp_base_path = os.path.join(self.output_path, "temp_accumulation")
+        return os.path.join(temp_base_path, f"folder-{folder_index}")
+
+    def _get_consolidated_file_path(self, folder_index: int, chunk_part: int) -> str:
+        """Generate final consolidated file path using existing path_gen logic."""
+        return os.path.join(
+            self.output_path,
+            self.path_gen(chunk_count=folder_index, chunk_part=chunk_part),
+        )
+
+    async def _accumulate_dataframe(self, dataframe: "pd.DataFrame"):
+        """Accumulate DataFrame into temp folders, writing in buffer_size chunks."""
+
+        # Process dataframe in buffer_size chunks
+        for i in range(0, len(dataframe), self.buffer_size):
+            chunk = dataframe[i : i + self.buffer_size]
+
+            # Check if we need to consolidate current folder before adding this chunk
+            if (
+                self.current_folder_records + len(chunk)
+            ) > self.consolidation_threshold:
+                if self.current_folder_records > 0:
+                    await self._consolidate_current_folder()
+                self._start_new_temp_folder()
+
+            # Ensure we have a temp folder ready
+            if self.current_temp_folder_path is None:
+                self._start_new_temp_folder()
+
+            # Write chunk to current temp folder
+            await self._write_chunk_to_temp_folder(cast("pd.DataFrame", chunk))
+            self.current_folder_records += len(chunk)
+
+    def _start_new_temp_folder(self):
+        """Start a new temp folder for accumulation and create the directory."""
+        if self.current_temp_folder_path is not None:
+            self.temp_folders_created.append(self.temp_folder_index)
+            self.temp_folder_index += 1
+
+        self.current_folder_records = 0
+        self.current_temp_folder_path = self._get_temp_folder_path(
+            self.temp_folder_index
+        )
+
+        # Create the directory
+        os.makedirs(self.current_temp_folder_path, exist_ok=True)
+
+    async def _write_chunk_to_temp_folder(self, chunk: "pd.DataFrame"):
+        """Write a chunk to the current temp folder."""
+        if self.current_temp_folder_path is None:
+            raise ValueError("No temp folder path available")
+
+        # Generate file name for this chunk within the temp folder
+        existing_files = len(
+            [
+                f
+                for f in os.listdir(self.current_temp_folder_path)
+                if f.endswith(".parquet")
+            ]
+        )
+        chunk_file_name = f"chunk-{existing_files}.parquet"
+        chunk_file_path = os.path.join(self.current_temp_folder_path, chunk_file_name)
 
-
-
-        """
-        import pandas as pd
+        # Write chunk using existing write_chunk method
+        await self.write_chunk(chunk, chunk_file_path)
 
-
+    async def _consolidate_current_folder(self):
+        """Consolidate current temp folder using Daft."""
+        if self.current_folder_records == 0 or self.current_temp_folder_path is None:
             return
 
-        if not all(isinstance(df, pd.DataFrame) for df in self.buffer):
-            raise TypeError(
-                "_flush_buffer encountered non-DataFrame elements in buffer. This should not happen."
-            )
-
         try:
-
-            pd_buffer: List[pd.DataFrame] = self.buffer  # type: ignore
-            combined_dataframe = pd.concat(pd_buffer)
-
-            # Write DataFrame to Parquet file
-            if not combined_dataframe.empty:
-                self.total_record_count += len(combined_dataframe)
-                output_file_name = (
-                    f"{self.output_path}/{self.path_gen(self.chunk_count, chunk_part)}"
-                )
-                combined_dataframe.to_parquet(
-                    output_file_name, index=False, compression="snappy"
-                )
+            import daft
 
-
-
-
-
-                metric_type=MetricType.COUNTER,
-                labels={"type": "pandas"},
-                description="Number of chunks written to Parquet files",
-            )
+            # Read all parquet files in temp folder
+            pattern = os.path.join(self.current_temp_folder_path, "*.parquet")
+            daft_df = daft.read_parquet(pattern)
+            partitions = 0
 
-
-
-
-
-
-
-
-
-
+            # Write consolidated file using Daft with size management
+            with daft.execution_config_ctx(
+                parquet_target_filesize=self.max_file_size_bytes
+            ):
+                # Write to a temp location first
+                temp_consolidated_dir = f"{self.current_temp_folder_path}_temp"
+                result = daft_df.write_parquet(root_dir=temp_consolidated_dir)
+
+                # Get the generated file path and rename to final location
+                result_dict = result.to_pydict()
+                partitions = len(result_dict["path"])
+                for i, file_path in enumerate(result_dict["path"]):
+                    if file_path.endswith(".parquet"):
+                        consolidated_file_path = self._get_consolidated_file_path(
+                            folder_index=self.chunk_count,
+                            chunk_part=i,
+                        )
+                        os.rename(file_path, consolidated_file_path)
+
+                        # Upload consolidated file to object store
+                        await ObjectStore.upload_file(
+                            source=consolidated_file_path,
+                            destination=get_object_store_prefix(consolidated_file_path),
+                        )
+
+            # Clean up temp consolidated dir
+            shutil.rmtree(temp_consolidated_dir, ignore_errors=True)
+
+            # Update statistics
+            self.chunk_count += 1
+            self.total_record_count += self.current_folder_records
+            self.partitions.append(partitions)
 
-            # Record metrics for failed write
+            # Record metrics
             self.metrics.record_metric(
-                name="
+                name="consolidated_files",
                 value=1,
                 metric_type=MetricType.COUNTER,
-                labels={"type": "
-                description="Number of
+                labels={"type": "daft_consolidation"},
+                description="Number of consolidated parquet files created",
+            )
+
+            logger.info(
+                f"Consolidated folder {self.temp_folder_index} with {self.current_folder_records} records"
             )
-
-
+
+        except Exception as e:
+            logger.error(
+                f"Error consolidating folder {self.temp_folder_index}: {str(e)}"
+            )
+            raise
+
+    async def _cleanup_temp_folders(self):
+        """Clean up all temp folders after consolidation."""
+        try:
+            # Add current folder to cleanup list if it exists
+            if self.current_temp_folder_path is not None:
+                self.temp_folders_created.append(self.temp_folder_index)
+
+            # Clean up all temp folders
+            for folder_index in self.temp_folders_created:
+                temp_folder = self._get_temp_folder_path(folder_index)
+                if os.path.exists(temp_folder):
+                    shutil.rmtree(temp_folder, ignore_errors=True)
+
+            # Clean up base temp directory if it exists and is empty
+            temp_base_path = os.path.join(self.output_path, "temp_accumulation")
+            if os.path.exists(temp_base_path) and not os.listdir(temp_base_path):
+                os.rmdir(temp_base_path)
+
+            # Reset state
+            self.temp_folders_created.clear()
+            self.current_temp_folder_path = None
+            self.temp_folder_index = 0
+            self.current_folder_records = 0
+
+        except Exception as e:
+            logger.warning(f"Error cleaning up temp folders: {str(e)}")
+
+    async def write_chunk(self, chunk: "pd.DataFrame", file_name: str):
+        """Write a chunk to a Parquet file.
+
+        This method writes a chunk to a Parquet file and uploads the file to the object store.
+        """
+        chunk.to_parquet(file_name, index=False, compression="snappy")
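As a standalone illustration of the Daft consolidation step performed by _consolidate_current_folder above: the daft calls mirror the ones shown in the hunk, while the directory names and the size cap are hypothetical placeholders.

import os
import shutil

import daft

# Hypothetical paths; the SDK derives them from output_path/temp_accumulation.
temp_folder = "/tmp/extract/raw/table/temp_accumulation/folder-0"
consolidated_dir = f"{temp_folder}_temp"

# Read every small chunk file written during accumulation.
daft_df = daft.read_parquet(os.path.join(temp_folder, "*.parquet"))

# Bound the size of each consolidated file (the SDK caps this at ~90% of
# DAPR_MAX_GRPC_MESSAGE_LENGTH; 256 MB here is an arbitrary stand-in).
with daft.execution_config_ctx(parquet_target_filesize=256 * 1024 * 1024):
    result = daft_df.write_parquet(root_dir=consolidated_dir)

# write_parquet reports the written file paths; rename each part into place.
for i, path in enumerate(result.to_pydict()["path"]):
    if path.endswith(".parquet"):
        os.rename(path, f"/tmp/extract/raw/table/chunk-0-part{i}.parquet")

# Remove the temporary consolidation directory.
shutil.rmtree(consolidated_dir, ignore_errors=True)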
application_sdk/transformers/query/__init__.py
CHANGED

@@ -415,7 +415,7 @@ class QueryBasedTransformer(TransformerInterface):
         )
 
         # run the SQL on the dataframe
-        logger.
+        logger.debug(
             f"Running transformer for asset [{typename}] with SQL:\n {entity_sql_template}"
         )
         transformed_df = daft.sql(entity_sql_template)
application_sdk/version.py
CHANGED

application_sdk/workflows/metadata_extraction/sql.py
CHANGED

@@ -10,6 +10,7 @@ from typing import Any, Callable, Coroutine, Dict, List, Sequence, Type
 
 from temporalio import workflow
 from temporalio.common import RetryPolicy
+from typing_extensions import Tuple
 
 from application_sdk.activities.common.models import ActivityStatistics
 from application_sdk.activities.metadata_extraction.sql import (
@@ -152,15 +153,15 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
 
     def get_transform_batches(
         self, chunk_count: int, typename: str, partitions: List[int]
-    ):
+    ) -> Tuple[List[List[str]], List[int]]: # noqa: F821
         """Get batches for parallel transformation processing.
 
         Args:
             chunk_count (int): Total number of chunks to process.
             typename (str): Type name for the chunks.
-
+            partitions (List[int]): List of partitions for each chunk.
         Returns:
-            Tuple[List[List[str]], List[int]]: A
+            Tuple[List[List[str]], List[int]]: A list of file paths.
             - List of batches, where each batch is a list of file paths
             - List of starting chunk numbers for each batch
         """
@@ -174,7 +175,7 @@ class BaseSQLMetadataExtractionWorkflow(MetadataExtractionWorkflow):
         # Each batch contains exactly one chunk
         batches.append(
             [
-                f"{typename}/chunk-{i}-part{file
+                f"{typename}/chunk-{i}-part{file}.parquet"
                 for file in range(partition)
             ]
        )
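To make the corrected path template concrete, here is a small self-contained sketch of the batch layout get_transform_batches produces. The surrounding loop and the starting-chunk bookkeeping are reconstructions for illustration; only the chunk-{i}-part{file}.parquet pattern and the one-chunk-per-batch behaviour come from the hunk above.

from typing import List, Tuple


def build_batches(
    chunk_count: int, typename: str, partitions: List[int]
) -> Tuple[List[List[str]], List[int]]:
    """Illustrative reconstruction: one batch per chunk, one file per partition part."""
    batches: List[List[str]] = []
    chunk_starts: List[int] = []
    for i in range(chunk_count):
        chunk_starts.append(i)
        batches.append(
            [f"{typename}/chunk-{i}-part{file}.parquet" for file in range(partitions[i])]
        )
    return batches, chunk_starts


print(build_batches(2, "TABLE", [2, 1]))
# ([['TABLE/chunk-0-part0.parquet', 'TABLE/chunk-0-part1.parquet'],
#   ['TABLE/chunk-1-part0.parquet']], [0, 1])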
{atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: atlan-application-sdk
-Version: 0.1.
+Version: 0.1.1rc44
 Summary: Atlan Application SDK is a Python library for developing applications on the Atlan Platform
 Project-URL: Repository, https://github.com/atlanhq/application-sdk
 Project-URL: Documentation, https://github.com/atlanhq/application-sdk/README.md
{atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/RECORD
RENAMED
@@ -1,6 +1,6 @@
 application_sdk/__init__.py,sha256=2e2mvmLJ5dxmJGPELtb33xwP-j6JMdoIuqKycEn7hjg,151
 application_sdk/constants.py,sha256=1THiejjOEgm4kHFN-PrwrUkfRk7q1pjOLWLm-t2ph1Q,10674
-application_sdk/version.py,sha256=
+application_sdk/version.py,sha256=PUC8knGCYDRf-xg5lgaXJ_F5evubFFRTEkWX9EULiq0,88
 application_sdk/worker.py,sha256=i5f0AeKI39IfsLO05QkwC6uMz0zDPSJqP7B2byri1VI,7489
 application_sdk/activities/__init__.py,sha256=QaXLOBYbb0zPOY5kfDQh56qbXQFaYNXOjJ5PCvatiZ4,9530
 application_sdk/activities/lock_management.py,sha256=L__GZ9BsArwU1ntYwAgCKsSjCqN6QBeOfT-OT4WyD4Y,3983
@@ -11,9 +11,9 @@ application_sdk/activities/common/utils.py,sha256=nSNGkY5eS5pPc8etdPWkXBFTSaConG
 application_sdk/activities/metadata_extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 application_sdk/activities/metadata_extraction/base.py,sha256=ENFojpxqKdN_eVSL4iet3cGfylPOfcl1jnflfo4zhs8,3920
 application_sdk/activities/metadata_extraction/rest.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-application_sdk/activities/metadata_extraction/sql.py,sha256=
+application_sdk/activities/metadata_extraction/sql.py,sha256=ivIbTrkKAonijQQPfiOigoiXLWtA_-nLUn9lz09lpaU,34725
 application_sdk/activities/query_extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-application_sdk/activities/query_extraction/sql.py,sha256=
+application_sdk/activities/query_extraction/sql.py,sha256=l64cGyTmbtaGcg3qj1YXKyNWiWeRsWPEuQyqW06rxxQ,21165
 application_sdk/application/__init__.py,sha256=PbSImXYaQQ2IIee2SvI8AjDiSo2QcCFrM1PX3x-_RQs,8035
 application_sdk/application/metadata_extraction/sql.py,sha256=rOd06Wodr4GyzupCYxVSCsNcuNar1rJM66ej9vocNHw,8138
 application_sdk/clients/__init__.py,sha256=C9T84J7V6ZumcoWJPAxdd3tqSmbyciaGBJn-CaCCny0,1341
@@ -60,11 +60,11 @@ application_sdk/handlers/sql.py,sha256=6A_9xCtkXyNY5gPhImbftzrdPIEWIeTTqjyIewVES
 application_sdk/inputs/__init__.py,sha256=_O5lK2A5EYyqwid8txKNEds3pHkoHGKrSTTWnQ-UzRA,6022
 application_sdk/inputs/iceberg.py,sha256=xiv1kNtVx1k0h3ZJbJeXjZwdfBGSy9j9orYP_AyCYlI,2756
 application_sdk/inputs/json.py,sha256=ZOgB3tuZSsb2m_KxiAdnbUQgU5ythCs-Mq-n4pPfeHA,4905
-application_sdk/inputs/parquet.py,sha256=
-application_sdk/inputs/sql_query.py,sha256=
+application_sdk/inputs/parquet.py,sha256=9OzbrLZfkWdabqyLvHklwt4bONDp0WvNS5PHn2D0bnA,9519
+application_sdk/inputs/sql_query.py,sha256=9deGGI5Wob8mDuq-vRjYgQvVrfuU_IjLkiYufZ2NqTo,10632
 application_sdk/inputs/.cursor/BUGBOT.md,sha256=hwKGDbopv3NU0bpC_ElpAPDFcS59GWS3TunObGC6eLQ,9731
 application_sdk/interceptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-application_sdk/interceptors/cleanup.py,sha256=
+application_sdk/interceptors/cleanup.py,sha256=JlFcM_2Y5AIEfGTSNe0aoon7eoE68MIXI0rA3LHsSeY,5966
 application_sdk/interceptors/events.py,sha256=TeStWmBbc4v1-dm2DWeKYsUfUhJLR8CtTQhu3TWOZWM,6524
 application_sdk/interceptors/lock.py,sha256=Xe9TSjYKtDZUB94hbV7rHG_9rgKUJPTACeB8z8xsJ0w,5577
 application_sdk/interceptors/.cursor/BUGBOT.md,sha256=pxmUF2c7dtaXAX8yAa1-LBa6FCrj_uw7aQcHrppjf1A,14570
@@ -75,10 +75,10 @@ application_sdk/observability/observability.py,sha256=DP0I4bHyg3TA4hxCqDFy2IiRmB
 application_sdk/observability/traces_adaptor.py,sha256=0eQJPN-tYA_dV8D3uEa5ZiX9g12NDuLnPaFuQMVDdL0,18242
 application_sdk/observability/utils.py,sha256=MKEpT0WYtpATUgLgJDkGQaAP_t-jpDYMUKDfEvr8Phg,2448
 application_sdk/observability/decorators/observability_decorator.py,sha256=yd6qfrg1MmH5KcZ5Ydzb0RaBzmxx5FrmiI9qwvZx3EU,8963
-application_sdk/outputs/__init__.py,sha256=
+application_sdk/outputs/__init__.py,sha256=hrOPw0xuG9xP720Bt309TfbY2Qq_i51R8Xt3ZjwWDUY,15906
 application_sdk/outputs/iceberg.py,sha256=TdppOMEMfojMhGyBmhWeu1AJQexRyHM-huAYeJmhjdY,5533
-application_sdk/outputs/json.py,sha256=
-application_sdk/outputs/parquet.py,sha256=
+application_sdk/outputs/json.py,sha256=gYDDNOVb8EFxxeOkb6zKWZWjTEVgZLoapFM97_roK4A,10883
+application_sdk/outputs/parquet.py,sha256=OLK7fF--ZrKrXLw6TP85nPqrIN1wW1I_rDs3FT2rQuA,20225
 application_sdk/outputs/.cursor/BUGBOT.md,sha256=KxEC3CIyRSK1YftZou5BgKc6PRXT3qQmBNFJp-HSyYE,11496
 application_sdk/server/__init__.py,sha256=KTqE1YPw_3WDVMWatJUuf9OOiobLM2K5SMaBrI62sCo,1568
 application_sdk/server/.cursor/BUGBOT.md,sha256=p_MMoWUW5G1894WfOKYReZKWCuyJT_OJz3rL5g21NbI,16566
@@ -138,7 +138,7 @@ application_sdk/transformers/atlas/__init__.py,sha256=fw3D8bBtt61SseAfYut3JZddpX
 application_sdk/transformers/atlas/sql.py,sha256=rkQXNZ7oebts5oF5E_Bw8NpcHHKScU0TmKciH_1l_k4,50419
 application_sdk/transformers/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 application_sdk/transformers/common/utils.py,sha256=4ISMIQ0Gzghmi31p51FOFm5KLF7XF-fmH9PVT7i0DFE,4899
-application_sdk/transformers/query/__init__.py,sha256=
+application_sdk/transformers/query/__init__.py,sha256=yG1dGP3NhUizwkCgyFAzsr9SV9uWYZKjXoCWPrsIxVw,17358
 application_sdk/transformers/query/templates/column.yaml,sha256=EXLYwGXN7LKT-v51n2EZnY99o6vHucyFaVSpM-sUSXw,7679
 application_sdk/transformers/query/templates/database.yaml,sha256=SD1hJg5LI7gsBHQL5mW341sa51EkhcsIDDFlIOi9zdk,1374
 application_sdk/transformers/query/templates/extras-procedure.yaml,sha256=XhAfVY4zm99K8fcgkYA1XPLv4ks-SA6SzMO3SMtQ60s,2298
@@ -149,11 +149,11 @@ application_sdk/transformers/query/templates/tag_attachment.yaml,sha256=dWNDGwRU
 application_sdk/workflows/__init__.py,sha256=byluvgzTovr4L1co7YGb4--ktMBqt2pXBjYoxz4dIeU,3869
 application_sdk/workflows/.cursor/BUGBOT.md,sha256=ybjRfSNgVSDzOrYoSvG8zIyL1JEVcsIj3AffizSfZKY,8162
 application_sdk/workflows/metadata_extraction/__init__.py,sha256=jHUe_ZBQ66jx8bgyduPuECo2RdmJtQsQAKlakADEQbc,120
-application_sdk/workflows/metadata_extraction/sql.py,sha256=
+application_sdk/workflows/metadata_extraction/sql.py,sha256=6ZaVt84n-8U2ZvR9GR7uIJKv5v8CuyQjhlnoRJvDszc,12435
 application_sdk/workflows/query_extraction/__init__.py,sha256=n066_CX5RpJz6DIxGMkKS3eGSRg03ilaCtsqfJWQb7Q,117
 application_sdk/workflows/query_extraction/sql.py,sha256=kT_JQkLCRZ44ZpaC4QvPL6DxnRIIVh8gYHLqRbMI-hA,4826
-atlan_application_sdk-0.1.
-atlan_application_sdk-0.1.
-atlan_application_sdk-0.1.
-atlan_application_sdk-0.1.
-atlan_application_sdk-0.1.
+atlan_application_sdk-0.1.1rc44.dist-info/METADATA,sha256=FoTta0zU5XJfLfr0hmUeEyEXFOqZ0-Plb_CerNBVfcM,5567
+atlan_application_sdk-0.1.1rc44.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+atlan_application_sdk-0.1.1rc44.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+atlan_application_sdk-0.1.1rc44.dist-info/licenses/NOTICE,sha256=A-XVVGt3KOYuuMmvSMIFkg534F1vHiCggEBp4Ez3wGk,1041
+atlan_application_sdk-0.1.1rc44.dist-info/RECORD,,
{atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/WHEEL
RENAMED
File without changes

{atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/licenses/LICENSE
RENAMED
File without changes

{atlan_application_sdk-0.1.1rc43.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/licenses/NOTICE
RENAMED
File without changes