atlan-application-sdk 0.1.1rc42__py3-none-any.whl → 0.1.1rc44__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/metadata_extraction/sql.py +9 -35
- application_sdk/activities/query_extraction/sql.py +0 -2
- application_sdk/inputs/parquet.py +15 -3
- application_sdk/inputs/sql_query.py +2 -2
- application_sdk/interceptors/cleanup.py +0 -1
- application_sdk/outputs/__init__.py +176 -12
- application_sdk/outputs/json.py +57 -181
- application_sdk/outputs/parquet.py +230 -161
- application_sdk/services/objectstore.py +12 -6
- application_sdk/services/statestore.py +19 -22
- application_sdk/transformers/query/__init__.py +1 -1
- application_sdk/version.py +1 -1
- application_sdk/workflows/metadata_extraction/sql.py +5 -4
- {atlan_application_sdk-0.1.1rc42.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/METADATA +1 -1
- {atlan_application_sdk-0.1.1rc42.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/RECORD +18 -18
- {atlan_application_sdk-0.1.1rc42.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-0.1.1rc42.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-0.1.1rc42.dist-info → atlan_application_sdk-0.1.1rc44.dist-info}/licenses/NOTICE +0 -0
application_sdk/outputs/parquet.py CHANGED

@@ -1,10 +1,13 @@
+import inspect
 import os
+import shutil
 from enum import Enum
-from typing import TYPE_CHECKING, List, Optional, Union
+from typing import TYPE_CHECKING, AsyncGenerator, Generator, List, Optional, Union, cast

 from temporalio import activity

 from application_sdk.activities.common.utils import get_object_store_prefix
+from application_sdk.common.dataframe_utils import is_empty_dataframe
 from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
 from application_sdk.observability.logger_adaptor import get_logger
 from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
@@ -35,57 +38,57 @@ class ParquetOutput(Output):

     Attributes:
         output_path (str): Base path where Parquet files will be written.
-        output_prefix (str): Prefix for files when uploading to object store.
         output_suffix (str): Suffix for output files.
         typename (Optional[str]): Type name of the entity e.g database, schema, table.
         chunk_size (int): Maximum number of records per chunk.
         total_record_count (int): Total number of records processed.
         chunk_count (int): Number of chunks created.
         chunk_start (Optional[int]): Starting index for chunk numbering.
-        path_gen (Callable): Function to generate file paths.
         start_marker (Optional[str]): Start marker for query extraction.
         end_marker (Optional[str]): End marker for query extraction.
+        retain_local_copy (bool): Whether to retain the local copy of the files.
+        use_consolidation (bool): Whether to use consolidation.
     """

+    _EXTENSION = ".parquet"
+
     def __init__(
         self,
         output_path: str = "",
         output_suffix: str = "",
-        output_prefix: str = "",
         typename: Optional[str] = None,
         chunk_size: Optional[int] = 100000,
-        buffer_size:
+        buffer_size: int = 5000,
         total_record_count: int = 0,
         chunk_count: int = 0,
         chunk_start: Optional[int] = None,
         start_marker: Optional[str] = None,
         end_marker: Optional[str] = None,
         retain_local_copy: bool = False,
+        use_consolidation: bool = False,
     ):
         """Initialize the Parquet output handler.

         Args:
             output_path (str): Base path where Parquet files will be written.
             output_suffix (str): Suffix for output files.
-            output_prefix (str): Prefix for files when uploading to object store.
             typename (Optional[str], optional): Type name of the entity e.g database, schema, table.
             chunk_size (int, optional): Maximum records per chunk. Defaults to 100000.
             total_record_count (int, optional): Initial total record count. Defaults to 0.
             chunk_count (int, optional): Initial chunk count. Defaults to 0.
             chunk_start (Optional[int], optional): Starting index for chunk numbering.
                 Defaults to None.
-            path_gen (Callable, optional): Function to generate file paths.
-                Defaults to path_gen function.
             start_marker (Optional[str], optional): Start marker for query extraction.
                 Defaults to None.
             end_marker (Optional[str], optional): End marker for query extraction.
                 Defaults to None.
             retain_local_copy (bool, optional): Whether to retain the local copy of the files.
                 Defaults to False.
+            use_consolidation (bool, optional): Whether to use consolidation.
+                Defaults to False.
         """
         self.output_path = output_path
         self.output_suffix = output_suffix
-        self.output_prefix = output_prefix
         self.typename = typename
         self.chunk_size = chunk_size
         self.buffer_size = buffer_size
@@ -98,128 +101,84 @@ class ParquetOutput(Output):
             DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
         )  # 90% of DAPR limit as safety buffer
         self.chunk_start = chunk_start
+        self.chunk_part = 0
         self.start_marker = start_marker
         self.end_marker = end_marker
-        self.
+        self.partitions = []
         self.metrics = get_metrics()
         self.retain_local_copy = retain_local_copy

+        # Consolidation-specific attributes
+        # Use consolidation to efficiently write parquet files in buffered manner
+        # since there's no cleaner way to write parquet files incrementally
+        self.use_consolidation = use_consolidation
+        self.consolidation_threshold = (
+            chunk_size or 100000
+        )  # Use chunk_size as threshold
+        self.current_folder_records = 0  # Track records in current temp folder
+        self.temp_folder_index = 0  # Current temp folder index
+        self.temp_folders_created: List[int] = []  # Track temp folders for cleanup
+        self.current_temp_folder_path: Optional[str] = None  # Current temp folder path
+
+        if self.chunk_start:
+            self.chunk_count = self.chunk_start + self.chunk_count
+
         # Create output directory
         self.output_path = os.path.join(self.output_path, self.output_suffix)
         if self.typename:
             self.output_path = os.path.join(self.output_path, self.typename)
         os.makedirs(self.output_path, exist_ok=True)

-    def
+    async def write_batched_dataframe(
         self,
-
-
-
-
-
-        """Generate a file path for a chunk.
-
-        Args:
-            chunk_start (Optional[int]): Starting index of the chunk, or None for single chunk.
-            chunk_count (int): Total number of chunks.
-            start_marker (Optional[str]): Start marker for query extraction.
-            end_marker (Optional[str]): End marker for query extraction.
-
-        Returns:
-            str: Generated file path for the chunk.
-        """
-        # For Query Extraction - use start and end markers without chunk count
-        if start_marker and end_marker:
-            return f"{start_marker}_{end_marker}.parquet"
+        batched_dataframe: Union[
+            AsyncGenerator["pd.DataFrame", None], Generator["pd.DataFrame", None, None]
+        ],
+    ):
+        """Write a batched pandas DataFrame to Parquet files with consolidation support.

-
-
-            return f"{str(chunk_count)}.parquet"
-        else:
-            return f"chunk-{str(chunk_start)}-part{str(chunk_count)}.parquet"
+        This method implements a consolidation strategy to efficiently write parquet files
+        in a buffered manner, since there's no cleaner way to write parquet files incrementally.

-
-
+        The process:
+        1. Accumulate DataFrames into temp folders (buffer_size chunks each)
+        2. When consolidation_threshold is reached, use Daft to merge into optimized files
+        3. Clean up temporary files after consolidation

         Args:
-
+            batched_dataframe: AsyncGenerator or Generator of pandas DataFrames to write.
         """
-
-
-
-
-
-        # Split the DataFrame into chunks
-        partition = (
-            self.chunk_size
-            if self.chunk_start is None
-            else min(self.chunk_size, self.buffer_size)
-        )
-        chunks = [
-            dataframe[i : i + partition]  # type: ignore
-            for i in range(0, len(dataframe), partition)
-        ]
-
-        for chunk in chunks:
-            # Estimate size of this chunk
-            chunk_size_bytes = self.estimate_dataframe_file_size(chunk, "parquet")
-
-            # Check if adding this chunk would exceed size limit
-            if (
-                self.current_buffer_size_bytes + chunk_size_bytes
-                > self.max_file_size_bytes
-                and self.current_buffer_size > 0
-            ):
-                # Flush current buffer before adding this chunk
-                chunk_part += 1
-                await self._flush_buffer(chunk_part)
-
-            self.buffer.append(chunk)
-            self.current_buffer_size += len(chunk)
-            self.current_buffer_size_bytes += chunk_size_bytes
-
-            if self.current_buffer_size >= partition:  # type: ignore
-                chunk_part += 1
-                await self._flush_buffer(chunk_part)
+        if not self.use_consolidation:
+            # Fallback to base class implementation
+            await super().write_batched_dataframe(batched_dataframe)
+            return

-
-
-
+        try:
+            # Phase 1: Accumulate DataFrames into temp folders
+            if inspect.isasyncgen(batched_dataframe):
+                async for dataframe in batched_dataframe:
+                    if not is_empty_dataframe(dataframe):
+                        await self._accumulate_dataframe(dataframe)
+            else:
+                sync_generator = cast(
+                    Generator["pd.DataFrame", None, None], batched_dataframe
+                )
+                for dataframe in sync_generator:
+                    if not is_empty_dataframe(dataframe):
+                        await self._accumulate_dataframe(dataframe)

-            #
-            self.
-
-                value=len(dataframe),
-                metric_type=MetricType.COUNTER,
-                labels={"type": "pandas", "mode": WriteMode.APPEND.value},
-                description="Number of records written to Parquet files from pandas DataFrame",
-            )
+            # Phase 2: Consolidate any remaining temp folder
+            if self.current_folder_records > 0:
+                await self._consolidate_current_folder()

-            #
-            self.
-                name="parquet_chunks_written",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={"type": "pandas", "mode": WriteMode.APPEND.value},
-                description="Number of chunks written to Parquet files",
-            )
+            # Phase 3: Cleanup temp folders
+            await self._cleanup_temp_folders()

-            self.chunk_count += 1
-            self.statistics.append(chunk_part)
         except Exception as e:
-
-
-                name="parquet_write_errors",
-                value=1,
-                metric_type=MetricType.COUNTER,
-                labels={
-                    "type": "pandas",
-                    "mode": WriteMode.APPEND.value,
-                    "error": str(e),
-                },
-                description="Number of errors while writing to Parquet files",
+            logger.error(
+                f"Error in batched dataframe writing with consolidation: {str(e)}"
             )
-
+            await self._cleanup_temp_folders()  # Cleanup on error
             raise

     async def write_daft_dataframe(
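For orientation, here is a minimal usage sketch of the new consolidation path. It is not taken from the diff: the paths and the `batches()` generator are hypothetical, the constructor arguments are the ones shown in the hunk above, and actually uploading the consolidated files would additionally require a reachable Dapr object store binding.

```python
import asyncio

import pandas as pd

from application_sdk.outputs.parquet import ParquetOutput


def batches():
    # Hypothetical stand-in for a real reader that yields pandas DataFrames.
    yield pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
    yield pd.DataFrame({"id": [3, 4], "name": ["c", "d"]})


async def main():
    output = ParquetOutput(
        output_path="/tmp/extract",  # assumed local scratch directory
        output_suffix="raw",
        typename="table",
        chunk_size=100_000,          # also used as the consolidation threshold
        buffer_size=5_000,           # records per temp chunk file
        use_consolidation=True,      # opt in to the Daft-based consolidation path
    )
    # Phases 1-3 (accumulate, consolidate, clean up) run inside this call.
    await output.write_batched_dataframe(batches())


if __name__ == "__main__":
    asyncio.run(main())
```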
@@ -320,7 +279,13 @@ class ParquetOutput(Output):
                 name="parquet_write_errors",
                 value=1,
                 metric_type=MetricType.COUNTER,
-                labels={
+                labels={
+                    "type": "daft",
+                    "mode": write_mode.value
+                    if isinstance(write_mode, WriteMode)
+                    else write_mode,
+                    "error": str(e),
+                },
                 description="Number of errors while writing to Parquet files",
             )
             logger.error(f"Error writing daft dataframe to parquet: {str(e)}")
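The changed `labels` block normalizes the metric's mode label so it accepts either a `WriteMode` member or a plain string. A self-contained sketch of that pattern follows; the enum members here are stand-ins for illustration, not necessarily the SDK's.

```python
from enum import Enum


class WriteMode(Enum):
    # Stand-in members for illustration; the SDK's enum may differ.
    APPEND = "append"
    OVERWRITE = "overwrite"


def mode_label(write_mode) -> str:
    # Accept either an enum member or a raw string for the metric label.
    return write_mode.value if isinstance(write_mode, WriteMode) else write_mode


assert mode_label(WriteMode.APPEND) == "append"
assert mode_label("overwrite") == "overwrite"
```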
@@ -334,67 +299,171 @@ class ParquetOutput(Output):
         """
         return self.output_path

-
-
-
-
-
+    # Consolidation helper methods
+
+    def _get_temp_folder_path(self, folder_index: int) -> str:
+        """Generate temp folder path consistent with existing structure."""
+        temp_base_path = os.path.join(self.output_path, "temp_accumulation")
+        return os.path.join(temp_base_path, f"folder-{folder_index}")
+
+    def _get_consolidated_file_path(self, folder_index: int, chunk_part: int) -> str:
+        """Generate final consolidated file path using existing path_gen logic."""
+        return os.path.join(
+            self.output_path,
+            self.path_gen(chunk_count=folder_index, chunk_part=chunk_part),
+        )
+
+    async def _accumulate_dataframe(self, dataframe: "pd.DataFrame"):
+        """Accumulate DataFrame into temp folders, writing in buffer_size chunks."""
+
+        # Process dataframe in buffer_size chunks
+        for i in range(0, len(dataframe), self.buffer_size):
+            chunk = dataframe[i : i + self.buffer_size]
+
+            # Check if we need to consolidate current folder before adding this chunk
+            if (
+                self.current_folder_records + len(chunk)
+            ) > self.consolidation_threshold:
+                if self.current_folder_records > 0:
+                    await self._consolidate_current_folder()
+                    self._start_new_temp_folder()
+
+            # Ensure we have a temp folder ready
+            if self.current_temp_folder_path is None:
+                self._start_new_temp_folder()
+
+            # Write chunk to current temp folder
+            await self._write_chunk_to_temp_folder(cast("pd.DataFrame", chunk))
+            self.current_folder_records += len(chunk)
+
+    def _start_new_temp_folder(self):
+        """Start a new temp folder for accumulation and create the directory."""
+        if self.current_temp_folder_path is not None:
+            self.temp_folders_created.append(self.temp_folder_index)
+            self.temp_folder_index += 1
+
+        self.current_folder_records = 0
+        self.current_temp_folder_path = self._get_temp_folder_path(
+            self.temp_folder_index
+        )
+
+        # Create the directory
+        os.makedirs(self.current_temp_folder_path, exist_ok=True)
+
+    async def _write_chunk_to_temp_folder(self, chunk: "pd.DataFrame"):
+        """Write a chunk to the current temp folder."""
+        if self.current_temp_folder_path is None:
+            raise ValueError("No temp folder path available")
+
+        # Generate file name for this chunk within the temp folder
+        existing_files = len(
+            [
+                f
+                for f in os.listdir(self.current_temp_folder_path)
+                if f.endswith(".parquet")
+            ]
+        )
+        chunk_file_name = f"chunk-{existing_files}.parquet"
+        chunk_file_path = os.path.join(self.current_temp_folder_path, chunk_file_name)

-
-
-        """
-        import pandas as pd
+        # Write chunk using existing write_chunk method
+        await self.write_chunk(chunk, chunk_file_path)

-
+    async def _consolidate_current_folder(self):
+        """Consolidate current temp folder using Daft."""
+        if self.current_folder_records == 0 or self.current_temp_folder_path is None:
             return

-        if not all(isinstance(df, pd.DataFrame) for df in self.buffer):
-            raise TypeError(
-                "_flush_buffer encountered non-DataFrame elements in buffer. This should not happen."
-            )
-
         try:
-
-            pd_buffer: List[pd.DataFrame] = self.buffer  # type: ignore
-            combined_dataframe = pd.concat(pd_buffer)
-
-            # Write DataFrame to Parquet file
-            if not combined_dataframe.empty:
-                self.total_record_count += len(combined_dataframe)
-                output_file_name = (
-                    f"{self.output_path}/{self.path_gen(self.chunk_count, chunk_part)}"
-                )
-                combined_dataframe.to_parquet(
-                    output_file_name, index=False, compression="snappy"
-                )
+            import daft

-
-
-
-
-                    metric_type=MetricType.COUNTER,
-                    labels={"type": "pandas"},
-                    description="Number of chunks written to Parquet files",
-                )
+            # Read all parquet files in temp folder
+            pattern = os.path.join(self.current_temp_folder_path, "*.parquet")
+            daft_df = daft.read_parquet(pattern)
+            partitions = 0

-
-
-
-
-
-
-
-
-
+            # Write consolidated file using Daft with size management
+            with daft.execution_config_ctx(
+                parquet_target_filesize=self.max_file_size_bytes
+            ):
+                # Write to a temp location first
+                temp_consolidated_dir = f"{self.current_temp_folder_path}_temp"
+                result = daft_df.write_parquet(root_dir=temp_consolidated_dir)
+
+                # Get the generated file path and rename to final location
+                result_dict = result.to_pydict()
+                partitions = len(result_dict["path"])
+                for i, file_path in enumerate(result_dict["path"]):
+                    if file_path.endswith(".parquet"):
+                        consolidated_file_path = self._get_consolidated_file_path(
+                            folder_index=self.chunk_count,
+                            chunk_part=i,
+                        )
+                        os.rename(file_path, consolidated_file_path)
+
+                        # Upload consolidated file to object store
+                        await ObjectStore.upload_file(
+                            source=consolidated_file_path,
+                            destination=get_object_store_prefix(consolidated_file_path),
+                        )
+
+            # Clean up temp consolidated dir
+            shutil.rmtree(temp_consolidated_dir, ignore_errors=True)
+
+            # Update statistics
+            self.chunk_count += 1
+            self.total_record_count += self.current_folder_records
+            self.partitions.append(partitions)

-
-                # Record metrics for failed write
+            # Record metrics
             self.metrics.record_metric(
-                name="
+                name="consolidated_files",
                 value=1,
                 metric_type=MetricType.COUNTER,
-                labels={"type": "
-                description="Number of
+                labels={"type": "daft_consolidation"},
+                description="Number of consolidated parquet files created",
+            )
+
+            logger.info(
+                f"Consolidated folder {self.temp_folder_index} with {self.current_folder_records} records"
             )
-
-
+
+        except Exception as e:
+            logger.error(
+                f"Error consolidating folder {self.temp_folder_index}: {str(e)}"
+            )
+            raise
+
+    async def _cleanup_temp_folders(self):
+        """Clean up all temp folders after consolidation."""
+        try:
+            # Add current folder to cleanup list if it exists
+            if self.current_temp_folder_path is not None:
+                self.temp_folders_created.append(self.temp_folder_index)
+
+            # Clean up all temp folders
+            for folder_index in self.temp_folders_created:
+                temp_folder = self._get_temp_folder_path(folder_index)
+                if os.path.exists(temp_folder):
+                    shutil.rmtree(temp_folder, ignore_errors=True)
+
+            # Clean up base temp directory if it exists and is empty
+            temp_base_path = os.path.join(self.output_path, "temp_accumulation")
+            if os.path.exists(temp_base_path) and not os.listdir(temp_base_path):
+                os.rmdir(temp_base_path)
+
+            # Reset state
+            self.temp_folders_created.clear()
+            self.current_temp_folder_path = None
+            self.temp_folder_index = 0
+            self.current_folder_records = 0
+
+        except Exception as e:
+            logger.warning(f"Error cleaning up temp folders: {str(e)}")
+
+    async def write_chunk(self, chunk: "pd.DataFrame", file_name: str):
+        """Write a chunk to a Parquet file.
+
+        This method writes a chunk to a Parquet file and uploads the file to the object store.
+        """
+        chunk.to_parquet(file_name, index=False, compression="snappy")
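The consolidation step itself boils down to the Daft calls used in `_consolidate_current_folder`. Below is a standalone sketch of that idea with hypothetical paths and an assumed target file size; the SDK additionally renames the outputs via `path_gen` and uploads them to the object store.

```python
import os

import daft

# Hypothetical locations; the SDK uses <output_path>/temp_accumulation/folder-<n>.
SMALL_FILES_DIR = "/tmp/extract/raw/table/temp_accumulation/folder-0"
CONSOLIDATED_DIR = "/tmp/extract/raw/table/consolidated"
TARGET_FILE_SIZE = 256 * 1024 * 1024  # assumed cap per output file, in bytes

# Read every small chunk file in the temp folder as one logical DataFrame.
df = daft.read_parquet(os.path.join(SMALL_FILES_DIR, "*.parquet"))

# parquet_target_filesize caps how large each file written by Daft may grow.
with daft.execution_config_ctx(parquet_target_filesize=TARGET_FILE_SIZE):
    result = df.write_parquet(root_dir=CONSOLIDATED_DIR)

# write_parquet returns a DataFrame listing the files it produced.
for path in result.to_pydict()["path"]:
    print("wrote", path)
```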
application_sdk/services/objectstore.py CHANGED

@@ -114,14 +114,17 @@ class ObjectStore:

     @classmethod
     async def get_content(
-        cls,
-
+        cls,
+        key: str,
+        store_name: str = DEPLOYMENT_OBJECT_STORE_NAME,
+        suppress_error: bool = False,
+    ) -> bytes | None:
         """Get raw file content from the object store.

         Args:
             key: The path of the file in the object store.
             store_name: Name of the Dapr object store binding to use.
-
+            suppress_error: Whether to suppress the error and return None if the file does not exist.
         Returns:
             The raw file content as bytes.

@@ -138,14 +141,18 @@ class ObjectStore:
                     store_name=store_name,
                 )
                 if not response_data:
+                    if suppress_error:
+                        return None
                     raise Exception(f"No data received for file: {key}")

                 logger.debug(f"Successfully retrieved file content: {key}")
                 return response_data

         except Exception as e:
+            if suppress_error:
+                return None
             logger.error(f"Error getting file content for {key}: {str(e)}")
-            raise
+            raise

     @classmethod
     async def exists(
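A short sketch of how a caller might use the new `suppress_error` flag; the key and helper function are hypothetical, and a configured Dapr object store binding is assumed.

```python
import json

from application_sdk.services.objectstore import ObjectStore


async def read_optional_json(key: str) -> dict:
    # With suppress_error=True, a missing object yields None instead of raising.
    content = await ObjectStore.get_content(key, suppress_error=True)
    if content is None:
        return {}
    return json.loads(content)
```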
@@ -463,8 +470,7 @@ class ObjectStore:
                 binding_metadata=metadata,
             )
             return response.data
-        except Exception
-            logger.error(f"Error in Dapr binding operation '{operation}': {str(e)}")
+        except Exception:
             raise

     @classmethod
application_sdk/services/statestore.py CHANGED

@@ -104,33 +104,26 @@ class StateStore:
         >>> creds = await StateStore.get_state("db-cred-456", StateType.CREDENTIALS)
         >>> print(f"Database: {creds.get('database')}")
         """
-
         state_file_path = build_state_store_path(id, type)
-        state = {}
-
         try:
-
-
-                source=get_object_store_prefix(state_file_path),
-                destination=state_file_path,
+            object_store_content = await ObjectStore.get_content(
+                get_object_store_prefix(state_file_path),
                 store_name=UPSTREAM_OBJECT_STORE_NAME,
+                suppress_error=True,
             )
-
-
-                state = json.load(file)
-
-            logger.info(f"State object downloaded for {id} with type {type}")
-        except Exception as e:
-            # local error message is "file not found", while in object store it is "object not found"
-            if "not found" in str(e).lower():
-                logger.info(
+            if not object_store_content:
+                logger.warning(
                     f"No state found for {type.value} with id '{id}', returning empty dict"
                 )
-
-            logger.error(f"Failed to extract state: {str(e)}")
-            raise
+                return {}

-
+            state = json.loads(object_store_content)
+            logger.info(f"State object retrieved for {id} with type {type}")
+
+            return state
+        except Exception as e:
+            logger.error(f"Failed to extract state: {str(e)}")
+            raise

     @classmethod
     async def save_state(cls, key: str, value: Any, id: str, type: StateType) -> None:
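With this change, `get_state` reads the state object through `ObjectStore.get_content(..., suppress_error=True)` and returns an empty dict when nothing is stored, rather than matching "not found" error strings. A behavior sketch reusing the docstring's example id, assuming `StateType` is importable alongside `StateStore`:

```python
from application_sdk.services.statestore import StateStore, StateType


async def get_credentials_or_default() -> dict:
    creds = await StateStore.get_state("db-cred-456", StateType.CREDENTIALS)
    if not creds:
        # No state object exists yet; get_state now returns {} instead of raising.
        creds = {"database": "default"}  # hypothetical fallback
    return creds
```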
@@ -240,7 +233,9 @@ class StateStore:
         ... )
         """
         try:
-            logger.info(
+            logger.info(
+                f"Saving state object in object store for {id} with type {type}"
+            )
             # get the current state from object store
             current_state = await cls.get_state(id, type)
             state_file_path = build_state_store_path(id, type)
@@ -260,7 +255,9 @@ class StateStore:
                 destination=get_object_store_prefix(state_file_path),
                 store_name=UPSTREAM_OBJECT_STORE_NAME,
             )
-            logger.info(
+            logger.info(
+                f"State object created in object store for {id} with type {type}"
+            )
             return current_state
         except Exception as e:
             logger.error(f"Failed to store state: {str(e)}")
application_sdk/transformers/query/__init__.py CHANGED

@@ -415,7 +415,7 @@ class QueryBasedTransformer(TransformerInterface):
         )

         # run the SQL on the dataframe
-        logger.
+        logger.debug(
             f"Running transformer for asset [{typename}] with SQL:\n {entity_sql_template}"
         )
         transformed_df = daft.sql(entity_sql_template)
application_sdk/version.py CHANGED