atlan-application-sdk 0.1.1rc42__py3-none-any.whl → 0.1.1rc44__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,10 +1,13 @@
+ import inspect
  import os
+ import shutil
  from enum import Enum
- from typing import TYPE_CHECKING, List, Optional, Union
+ from typing import TYPE_CHECKING, AsyncGenerator, Generator, List, Optional, Union, cast

  from temporalio import activity

  from application_sdk.activities.common.utils import get_object_store_prefix
+ from application_sdk.common.dataframe_utils import is_empty_dataframe
  from application_sdk.constants import DAPR_MAX_GRPC_MESSAGE_LENGTH
  from application_sdk.observability.logger_adaptor import get_logger
  from application_sdk.observability.metrics_adaptor import MetricType, get_metrics
@@ -35,57 +38,57 @@ class ParquetOutput(Output):

      Attributes:
          output_path (str): Base path where Parquet files will be written.
-         output_prefix (str): Prefix for files when uploading to object store.
          output_suffix (str): Suffix for output files.
          typename (Optional[str]): Type name of the entity e.g database, schema, table.
          chunk_size (int): Maximum number of records per chunk.
          total_record_count (int): Total number of records processed.
          chunk_count (int): Number of chunks created.
          chunk_start (Optional[int]): Starting index for chunk numbering.
-         path_gen (Callable): Function to generate file paths.
          start_marker (Optional[str]): Start marker for query extraction.
          end_marker (Optional[str]): End marker for query extraction.
+         retain_local_copy (bool): Whether to retain the local copy of the files.
+         use_consolidation (bool): Whether to use consolidation.
      """

+     _EXTENSION = ".parquet"
+
      def __init__(
          self,
          output_path: str = "",
          output_suffix: str = "",
-         output_prefix: str = "",
          typename: Optional[str] = None,
          chunk_size: Optional[int] = 100000,
-         buffer_size: Optional[int] = 100000,
+         buffer_size: int = 5000,
          total_record_count: int = 0,
          chunk_count: int = 0,
          chunk_start: Optional[int] = None,
          start_marker: Optional[str] = None,
          end_marker: Optional[str] = None,
          retain_local_copy: bool = False,
+         use_consolidation: bool = False,
      ):
          """Initialize the Parquet output handler.

          Args:
              output_path (str): Base path where Parquet files will be written.
              output_suffix (str): Suffix for output files.
-             output_prefix (str): Prefix for files when uploading to object store.
              typename (Optional[str], optional): Type name of the entity e.g database, schema, table.
              chunk_size (int, optional): Maximum records per chunk. Defaults to 100000.
              total_record_count (int, optional): Initial total record count. Defaults to 0.
              chunk_count (int, optional): Initial chunk count. Defaults to 0.
              chunk_start (Optional[int], optional): Starting index for chunk numbering.
                  Defaults to None.
-             path_gen (Callable, optional): Function to generate file paths.
-                 Defaults to path_gen function.
              start_marker (Optional[str], optional): Start marker for query extraction.
                  Defaults to None.
              end_marker (Optional[str], optional): End marker for query extraction.
                  Defaults to None.
              retain_local_copy (bool, optional): Whether to retain the local copy of the files.
                  Defaults to False.
+             use_consolidation (bool, optional): Whether to use consolidation.
+                 Defaults to False.
          """
          self.output_path = output_path
          self.output_suffix = output_suffix
-         self.output_prefix = output_prefix
          self.typename = typename
          self.chunk_size = chunk_size
          self.buffer_size = buffer_size
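
A minimal construction sketch for the new __init__ signature follows. The module path and argument values are assumptions for illustration only; output_prefix is no longer accepted, and buffer_size now defaults to 5000 instead of 100000.

    # Sketch only: the module path is assumed, not confirmed by this diff.
    from application_sdk.outputs.parquet import ParquetOutput  # assumed import path

    parquet_output = ParquetOutput(
        output_path="./local/artifacts",  # hypothetical local path
        output_suffix="raw",
        typename="table",
        chunk_size=100_000,      # also reused as the consolidation threshold
        buffer_size=5_000,       # size of each accumulated temp chunk
        use_consolidation=True,  # opt in to the Daft-based consolidation flow
    )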
@@ -98,128 +101,84 @@ class ParquetOutput(Output):
              DAPR_MAX_GRPC_MESSAGE_LENGTH * 0.9
          ) # 90% of DAPR limit as safety buffer
          self.chunk_start = chunk_start
+         self.chunk_part = 0
          self.start_marker = start_marker
          self.end_marker = end_marker
-         self.statistics = []
+         self.partitions = []
          self.metrics = get_metrics()
          self.retain_local_copy = retain_local_copy

+         # Consolidation-specific attributes
+         # Use consolidation to efficiently write parquet files in buffered manner
+         # since there's no cleaner way to write parquet files incrementally
+         self.use_consolidation = use_consolidation
+         self.consolidation_threshold = (
+             chunk_size or 100000
+         ) # Use chunk_size as threshold
+         self.current_folder_records = 0 # Track records in current temp folder
+         self.temp_folder_index = 0 # Current temp folder index
+         self.temp_folders_created: List[int] = [] # Track temp folders for cleanup
+         self.current_temp_folder_path: Optional[str] = None # Current temp folder path
+
+         if self.chunk_start:
+             self.chunk_count = self.chunk_start + self.chunk_count
+
          # Create output directory
          self.output_path = os.path.join(self.output_path, self.output_suffix)
          if self.typename:
              self.output_path = os.path.join(self.output_path, self.typename)
          os.makedirs(self.output_path, exist_ok=True)

-     def path_gen(
+     async def write_batched_dataframe(
          self,
-         chunk_start: Optional[int] = None,
-         chunk_count: int = 0,
-         start_marker: Optional[str] = None,
-         end_marker: Optional[str] = None,
-     ) -> str:
-         """Generate a file path for a chunk.
-
-         Args:
-             chunk_start (Optional[int]): Starting index of the chunk, or None for single chunk.
-             chunk_count (int): Total number of chunks.
-             start_marker (Optional[str]): Start marker for query extraction.
-             end_marker (Optional[str]): End marker for query extraction.
-
-         Returns:
-             str: Generated file path for the chunk.
-         """
-         # For Query Extraction - use start and end markers without chunk count
-         if start_marker and end_marker:
-             return f"{start_marker}_{end_marker}.parquet"
+         batched_dataframe: Union[
+             AsyncGenerator["pd.DataFrame", None], Generator["pd.DataFrame", None, None]
+         ],
+     ):
+         """Write a batched pandas DataFrame to Parquet files with consolidation support.

-         # For regular chunking - include chunk count
-         if chunk_start is None:
-             return f"{str(chunk_count)}.parquet"
-         else:
-             return f"chunk-{str(chunk_start)}-part{str(chunk_count)}.parquet"
+         This method implements a consolidation strategy to efficiently write parquet files
+         in a buffered manner, since there's no cleaner way to write parquet files incrementally.

-     async def write_dataframe(self, dataframe: "pd.DataFrame"):
-         """Write a pandas DataFrame to Parquet files and upload to object store.
+         The process:
+         1. Accumulate DataFrames into temp folders (buffer_size chunks each)
+         2. When consolidation_threshold is reached, use Daft to merge into optimized files
+         3. Clean up temporary files after consolidation

          Args:
-             dataframe (pd.DataFrame): The DataFrame to write.
+             batched_dataframe: AsyncGenerator or Generator of pandas DataFrames to write.
          """
-         try:
-             chunk_part = 0
-             if len(dataframe) == 0:
-                 return
-
-             # Split the DataFrame into chunks
-             partition = (
-                 self.chunk_size
-                 if self.chunk_start is None
-                 else min(self.chunk_size, self.buffer_size)
-             )
-             chunks = [
-                 dataframe[i : i + partition] # type: ignore
-                 for i in range(0, len(dataframe), partition)
-             ]
-
-             for chunk in chunks:
-                 # Estimate size of this chunk
-                 chunk_size_bytes = self.estimate_dataframe_file_size(chunk, "parquet")
-
-                 # Check if adding this chunk would exceed size limit
-                 if (
-                     self.current_buffer_size_bytes + chunk_size_bytes
-                     > self.max_file_size_bytes
-                     and self.current_buffer_size > 0
-                 ):
-                     # Flush current buffer before adding this chunk
-                     chunk_part += 1
-                     await self._flush_buffer(chunk_part)
-
-                 self.buffer.append(chunk)
-                 self.current_buffer_size += len(chunk)
-                 self.current_buffer_size_bytes += chunk_size_bytes
-
-                 if self.current_buffer_size >= partition: # type: ignore
-                     chunk_part += 1
-                     await self._flush_buffer(chunk_part)
+         if not self.use_consolidation:
+             # Fallback to base class implementation
+             await super().write_batched_dataframe(batched_dataframe)
+             return

-             if self.buffer and self.current_buffer_size > 0:
-                 chunk_part += 1
-                 await self._flush_buffer(chunk_part)
+         try:
+             # Phase 1: Accumulate DataFrames into temp folders
+             if inspect.isasyncgen(batched_dataframe):
+                 async for dataframe in batched_dataframe:
+                     if not is_empty_dataframe(dataframe):
+                         await self._accumulate_dataframe(dataframe)
+             else:
+                 sync_generator = cast(
+                     Generator["pd.DataFrame", None, None], batched_dataframe
+                 )
+                 for dataframe in sync_generator:
+                     if not is_empty_dataframe(dataframe):
+                         await self._accumulate_dataframe(dataframe)

-             # Record metrics for successful write
-             self.metrics.record_metric(
-                 name="parquet_write_records",
-                 value=len(dataframe),
-                 metric_type=MetricType.COUNTER,
-                 labels={"type": "pandas", "mode": WriteMode.APPEND.value},
-                 description="Number of records written to Parquet files from pandas DataFrame",
-             )
+             # Phase 2: Consolidate any remaining temp folder
+             if self.current_folder_records > 0:
+                 await self._consolidate_current_folder()

-             # Record chunk metrics
-             self.metrics.record_metric(
-                 name="parquet_chunks_written",
-                 value=1,
-                 metric_type=MetricType.COUNTER,
-                 labels={"type": "pandas", "mode": WriteMode.APPEND.value},
-                 description="Number of chunks written to Parquet files",
-             )
+             # Phase 3: Cleanup temp folders
+             await self._cleanup_temp_folders()

-             self.chunk_count += 1
-             self.statistics.append(chunk_part)
          except Exception as e:
-             # Record metrics for failed write
-             self.metrics.record_metric(
-                 name="parquet_write_errors",
-                 value=1,
-                 metric_type=MetricType.COUNTER,
-                 labels={
-                     "type": "pandas",
-                     "mode": WriteMode.APPEND.value,
-                     "error": str(e),
-                 },
-                 description="Number of errors while writing to Parquet files",
+             logger.error(
+                 f"Error in batched dataframe writing with consolidation: {str(e)}"
              )
-             logger.error(f"Error writing pandas dataframe to parquet: {str(e)}")
+             await self._cleanup_temp_folders() # Cleanup on error
              raise

      async def write_daft_dataframe(
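
The rewritten write_batched_dataframe accepts either a synchronous or an asynchronous generator of pandas DataFrames. A short driving sketch follows, reusing the parquet_output instance from the sketch above; the batch source is invented for illustration.

    import asyncio

    import pandas as pd

    async def dataframe_batches():
        # Hypothetical batch source; in the SDK these batches usually come from a SQL read.
        for start in range(0, 20_000, 5_000):
            yield pd.DataFrame({"id": range(start, start + 5_000)})

    async def main():
        # With use_consolidation=True the batches are accumulated into temp folders and
        # merged with Daft; with False the call falls back to the base Output implementation.
        await parquet_output.write_batched_dataframe(dataframe_batches())

    asyncio.run(main())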
@@ -320,7 +279,13 @@
                  name="parquet_write_errors",
                  value=1,
                  metric_type=MetricType.COUNTER,
-                 labels={"type": "daft", "mode": write_mode, "error": str(e)},
+                 labels={
+                     "type": "daft",
+                     "mode": write_mode.value
+                     if isinstance(write_mode, WriteMode)
+                     else write_mode,
+                     "error": str(e),
+                 },
                  description="Number of errors while writing to Parquet files",
              )
              logger.error(f"Error writing daft dataframe to parquet: {str(e)}")
@@ -334,67 +299,171 @@ class ParquetOutput(Output):
          """
          return self.output_path

-     async def _flush_buffer(self, chunk_part: int):
-         """Flush the current buffer to a Parquet file.
-
-         This method combines all DataFrames in the buffer, writes them to a Parquet file,
-         and uploads the file to the object store.
+     # Consolidation helper methods
+
+     def _get_temp_folder_path(self, folder_index: int) -> str:
+         """Generate temp folder path consistent with existing structure."""
+         temp_base_path = os.path.join(self.output_path, "temp_accumulation")
+         return os.path.join(temp_base_path, f"folder-{folder_index}")
+
+     def _get_consolidated_file_path(self, folder_index: int, chunk_part: int) -> str:
+         """Generate final consolidated file path using existing path_gen logic."""
+         return os.path.join(
+             self.output_path,
+             self.path_gen(chunk_count=folder_index, chunk_part=chunk_part),
+         )
+
+     async def _accumulate_dataframe(self, dataframe: "pd.DataFrame"):
+         """Accumulate DataFrame into temp folders, writing in buffer_size chunks."""
+
+         # Process dataframe in buffer_size chunks
+         for i in range(0, len(dataframe), self.buffer_size):
+             chunk = dataframe[i : i + self.buffer_size]
+
+             # Check if we need to consolidate current folder before adding this chunk
+             if (
+                 self.current_folder_records + len(chunk)
+             ) > self.consolidation_threshold:
+                 if self.current_folder_records > 0:
+                     await self._consolidate_current_folder()
+                 self._start_new_temp_folder()
+
+             # Ensure we have a temp folder ready
+             if self.current_temp_folder_path is None:
+                 self._start_new_temp_folder()
+
+             # Write chunk to current temp folder
+             await self._write_chunk_to_temp_folder(cast("pd.DataFrame", chunk))
+             self.current_folder_records += len(chunk)
+
+     def _start_new_temp_folder(self):
+         """Start a new temp folder for accumulation and create the directory."""
+         if self.current_temp_folder_path is not None:
+             self.temp_folders_created.append(self.temp_folder_index)
+             self.temp_folder_index += 1
+
+         self.current_folder_records = 0
+         self.current_temp_folder_path = self._get_temp_folder_path(
+             self.temp_folder_index
+         )
+
+         # Create the directory
+         os.makedirs(self.current_temp_folder_path, exist_ok=True)
+
+     async def _write_chunk_to_temp_folder(self, chunk: "pd.DataFrame"):
+         """Write a chunk to the current temp folder."""
+         if self.current_temp_folder_path is None:
+             raise ValueError("No temp folder path available")
+
+         # Generate file name for this chunk within the temp folder
+         existing_files = len(
+             [
+                 f
+                 for f in os.listdir(self.current_temp_folder_path)
+                 if f.endswith(".parquet")
+             ]
+         )
+         chunk_file_name = f"chunk-{existing_files}.parquet"
+         chunk_file_path = os.path.join(self.current_temp_folder_path, chunk_file_name)

-         Note:
-             If the buffer is empty or has no records, the method returns without writing.
-         """
-         import pandas as pd
+         # Write chunk using existing write_chunk method
+         await self.write_chunk(chunk, chunk_file_path)

-         if not self.buffer or not self.current_buffer_size:
+     async def _consolidate_current_folder(self):
+         """Consolidate current temp folder using Daft."""
+         if self.current_folder_records == 0 or self.current_temp_folder_path is None:
              return

-         if not all(isinstance(df, pd.DataFrame) for df in self.buffer):
-             raise TypeError(
-                 "_flush_buffer encountered non-DataFrame elements in buffer. This should not happen."
-             )
-
          try:
-             # Now it's safe to cast for pd.concat
-             pd_buffer: List[pd.DataFrame] = self.buffer # type: ignore
-             combined_dataframe = pd.concat(pd_buffer)
-
-             # Write DataFrame to Parquet file
-             if not combined_dataframe.empty:
-                 self.total_record_count += len(combined_dataframe)
-                 output_file_name = (
-                     f"{self.output_path}/{self.path_gen(self.chunk_count, chunk_part)}"
-                 )
-                 combined_dataframe.to_parquet(
-                     output_file_name, index=False, compression="snappy"
-                 )
+             import daft

-                 # Record chunk metrics
-                 self.metrics.record_metric(
-                     name="parquet_chunks_written",
-                     value=1,
-                     metric_type=MetricType.COUNTER,
-                     labels={"type": "pandas"},
-                     description="Number of chunks written to Parquet files",
-                 )
+             # Read all parquet files in temp folder
+             pattern = os.path.join(self.current_temp_folder_path, "*.parquet")
+             daft_df = daft.read_parquet(pattern)
+             partitions = 0

-                 # Push the file to the object store
-                 await ObjectStore.upload_file(
-                     source=output_file_name,
-                     destination=get_object_store_prefix(output_file_name),
-                 )
-
-             self.buffer.clear()
-             self.current_buffer_size = 0
-             self.current_buffer_size_bytes = 0
+             # Write consolidated file using Daft with size management
+             with daft.execution_config_ctx(
+                 parquet_target_filesize=self.max_file_size_bytes
+             ):
+                 # Write to a temp location first
+                 temp_consolidated_dir = f"{self.current_temp_folder_path}_temp"
+                 result = daft_df.write_parquet(root_dir=temp_consolidated_dir)
+
+                 # Get the generated file path and rename to final location
+                 result_dict = result.to_pydict()
+                 partitions = len(result_dict["path"])
+                 for i, file_path in enumerate(result_dict["path"]):
+                     if file_path.endswith(".parquet"):
+                         consolidated_file_path = self._get_consolidated_file_path(
+                             folder_index=self.chunk_count,
+                             chunk_part=i,
+                         )
+                         os.rename(file_path, consolidated_file_path)
+
+                         # Upload consolidated file to object store
+                         await ObjectStore.upload_file(
+                             source=consolidated_file_path,
+                             destination=get_object_store_prefix(consolidated_file_path),
+                         )
+
+             # Clean up temp consolidated dir
+             shutil.rmtree(temp_consolidated_dir, ignore_errors=True)
+
+             # Update statistics
+             self.chunk_count += 1
+             self.total_record_count += self.current_folder_records
+             self.partitions.append(partitions)

-         except Exception as e:
-             # Record metrics for failed write
+             # Record metrics
              self.metrics.record_metric(
-                 name="parquet_write_errors",
+                 name="consolidated_files",
                  value=1,
                  metric_type=MetricType.COUNTER,
-                 labels={"type": "pandas", "error": str(e)},
-                 description="Number of errors while writing to Parquet files",
+                 labels={"type": "daft_consolidation"},
+                 description="Number of consolidated parquet files created",
+             )
+
+             logger.info(
+                 f"Consolidated folder {self.temp_folder_index} with {self.current_folder_records} records"
              )
-             logger.error(f"Error flushing buffer to parquet: {str(e)}")
-             raise e
+
+         except Exception as e:
+             logger.error(
+                 f"Error consolidating folder {self.temp_folder_index}: {str(e)}"
+             )
+             raise
+
+     async def _cleanup_temp_folders(self):
+         """Clean up all temp folders after consolidation."""
+         try:
+             # Add current folder to cleanup list if it exists
+             if self.current_temp_folder_path is not None:
+                 self.temp_folders_created.append(self.temp_folder_index)
+
+             # Clean up all temp folders
+             for folder_index in self.temp_folders_created:
+                 temp_folder = self._get_temp_folder_path(folder_index)
+                 if os.path.exists(temp_folder):
+                     shutil.rmtree(temp_folder, ignore_errors=True)
+
+             # Clean up base temp directory if it exists and is empty
+             temp_base_path = os.path.join(self.output_path, "temp_accumulation")
+             if os.path.exists(temp_base_path) and not os.listdir(temp_base_path):
+                 os.rmdir(temp_base_path)
+
+             # Reset state
+             self.temp_folders_created.clear()
+             self.current_temp_folder_path = None
+             self.temp_folder_index = 0
+             self.current_folder_records = 0
+
+         except Exception as e:
+             logger.warning(f"Error cleaning up temp folders: {str(e)}")
+
+     async def write_chunk(self, chunk: "pd.DataFrame", file_name: str):
+         """Write a chunk to a Parquet file.
+
+         This method writes a chunk to a Parquet file and uploads the file to the object store.
+         """
+         chunk.to_parquet(file_name, index=False, compression="snappy")
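
The consolidation step relies on Daft calls that appear in this hunk (daft.read_parquet, daft.execution_config_ctx with parquet_target_filesize, and DataFrame.write_parquet). A standalone sketch of that pattern, with the paths and size cap invented for illustration:

    import daft

    # Hypothetical folder of small chunk files produced by the accumulation phase.
    small_files = "./local/artifacts/raw/table/temp_accumulation/folder-0/*.parquet"
    daft_df = daft.read_parquet(small_files)

    # Cap the size of each consolidated output file, mirroring the class's use of
    # max_file_size_bytes (roughly 90% of the Dapr gRPC message limit).
    with daft.execution_config_ctx(parquet_target_filesize=3 * 1024 * 1024):
        result = daft_df.write_parquet(root_dir="./local/artifacts/consolidated")

    # The result frame lists the files Daft wrote; ParquetOutput renames these into
    # its chunk naming scheme and uploads them to the object store.
    print(result.to_pydict()["path"])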
@@ -114,14 +114,17 @@ class ObjectStore:

      @classmethod
      async def get_content(
-         cls, key: str, store_name: str = DEPLOYMENT_OBJECT_STORE_NAME
-     ) -> bytes:
+         cls,
+         key: str,
+         store_name: str = DEPLOYMENT_OBJECT_STORE_NAME,
+         suppress_error: bool = False,
+     ) -> bytes | None:
          """Get raw file content from the object store.

          Args:
              key: The path of the file in the object store.
              store_name: Name of the Dapr object store binding to use.
-
+             suppress_error: Whether to suppress the error and return None if the file does not exist.
          Returns:
              The raw file content as bytes.

@@ -138,14 +141,18 @@
                  store_name=store_name,
              )
              if not response_data:
+                 if suppress_error:
+                     return None
                  raise Exception(f"No data received for file: {key}")

              logger.debug(f"Successfully retrieved file content: {key}")
              return response_data

          except Exception as e:
+             if suppress_error:
+                 return None
              logger.error(f"Error getting file content for {key}: {str(e)}")
-             raise e
+             raise

      @classmethod
      async def exists(
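
With suppress_error=True a missing key now yields None instead of raising, which is how the reworked StateStore.get_state later in this diff falls back to an empty state. A small sketch under an assumed import path and a hypothetical key:

    import asyncio
    import json

    from application_sdk.services.objectstore import ObjectStore  # assumed import path

    async def load_state() -> dict:
        content = await ObjectStore.get_content(
            "artifacts/apps/my-app/state/workflow-123.json",  # hypothetical key
            suppress_error=True,
        )
        return json.loads(content) if content else {}

    print(asyncio.run(load_state()))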
@@ -463,8 +470,7 @@
                  binding_metadata=metadata,
              )
              return response.data
-         except Exception as e:
-             logger.error(f"Error in Dapr binding operation '{operation}': {str(e)}")
+         except Exception:
              raise

      @classmethod
@@ -104,33 +104,26 @@ class StateStore:
              >>> creds = await StateStore.get_state("db-cred-456", StateType.CREDENTIALS)
              >>> print(f"Database: {creds.get('database')}")
          """
-
          state_file_path = build_state_store_path(id, type)
-         state = {}
-
          try:
-             logger.info(f"Trying to download state object for {id} with type {type}")
-             await ObjectStore.download_file(
-                 source=get_object_store_prefix(state_file_path),
-                 destination=state_file_path,
+             object_store_content = await ObjectStore.get_content(
+                 get_object_store_prefix(state_file_path),
                  store_name=UPSTREAM_OBJECT_STORE_NAME,
+                 suppress_error=True,
              )
-
-             with open(state_file_path, "r") as file:
-                 state = json.load(file)
-
-             logger.info(f"State object downloaded for {id} with type {type}")
-         except Exception as e:
-             # local error message is "file not found", while in object store it is "object not found"
-             if "not found" in str(e).lower():
-                 logger.info(
+             if not object_store_content:
+                 logger.warning(
                      f"No state found for {type.value} with id '{id}', returning empty dict"
                  )
-             else:
-                 logger.error(f"Failed to extract state: {str(e)}")
-                 raise
+                 return {}

-         return state
+             state = json.loads(object_store_content)
+             logger.info(f"State object retrieved for {id} with type {type}")
+
+             return state
+         except Exception as e:
+             logger.error(f"Failed to extract state: {str(e)}")
+             raise

      @classmethod
      async def save_state(cls, key: str, value: Any, id: str, type: StateType) -> None:
@@ -240,7 +233,9 @@
              ... )
          """
          try:
-             logger.info(f"Saving state object for {id} with type {type}")
+             logger.info(
+                 f"Saving state object in object store for {id} with type {type}"
+             )
              # get the current state from object store
              current_state = await cls.get_state(id, type)
              state_file_path = build_state_store_path(id, type)
@@ -260,7 +255,9 @@
                  destination=get_object_store_prefix(state_file_path),
                  store_name=UPSTREAM_OBJECT_STORE_NAME,
              )
-             logger.info(f"State object saved for {id} with type {type}")
+             logger.info(
+                 f"State object created in object store for {id} with type {type}"
+             )
              return current_state
          except Exception as e:
              logger.error(f"Failed to store state: {str(e)}")
@@ -415,7 +415,7 @@ class QueryBasedTransformer(TransformerInterface):
          )

          # run the SQL on the dataframe
-         logger.info(
+         logger.debug(
              f"Running transformer for asset [{typename}] with SQL:\n {entity_sql_template}"
          )
          transformed_df = daft.sql(entity_sql_template)
@@ -2,4 +2,4 @@
  Version information for the application_sdk package.
  """

- __version__ = "0.1.1rc42"
+ __version__ = "0.1.1rc44"