atlan-application-sdk 2.0.0__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -15,6 +15,7 @@ Example:
 
 import os
 from abc import ABC
+from datetime import datetime, timedelta
 from typing import Any, Dict, Generic, Optional, TypeVar
 
 from pydantic import BaseModel
@@ -62,6 +63,7 @@ class ActivitiesState(BaseModel, Generic[HandlerType]):
     model_config = {"arbitrary_types_allowed": True}
     handler: Optional[HandlerType] = None
     workflow_args: Optional[Dict[str, Any]] = None
+    last_updated_timestamp: Optional[datetime] = None
 
 
 ActivitiesStateType = TypeVar("ActivitiesStateType", bound=ActivitiesState)
@@ -113,12 +115,15 @@ class ActivitiesInterface(ABC, Generic[ActivitiesStateType]):
         Note:
             The workflow ID is automatically retrieved from the current activity context.
             If no state exists for the current workflow, a new one will be created.
+            This method also updates the last_updated_timestamp to enable time-based
+            state refresh functionality.
         """
         workflow_id = get_workflow_id()
         if not self._state.get(workflow_id):
             self._state[workflow_id] = ActivitiesState()
 
         self._state[workflow_id].workflow_args = workflow_args
+        self._state[workflow_id].last_updated_timestamp = datetime.now()
 
     async def _get_state(self, workflow_args: Dict[str, Any]) -> ActivitiesStateType:
         """Retrieve the state for the current workflow.
@@ -142,6 +147,15 @@ class ActivitiesInterface(ABC, Generic[ActivitiesStateType]):
             workflow_id = get_workflow_id()
             if workflow_id not in self._state:
                 await self._set_state(workflow_args)
+
+            else:
+                current_timestamp = datetime.now()
+                # if difference of current_timestamp and last_updated_timestamp is greater than 15 minutes, then again _set_state
+                last_updated = self._state[workflow_id].last_updated_timestamp
+                if last_updated and current_timestamp - last_updated > timedelta(
+                    minutes=15
+                ):
+                    await self._set_state(workflow_args)
             return self._state[workflow_id]
         except OrchestratorError as e:
             logger.error(
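
A minimal, self-contained sketch (not SDK code) of the 15-minute, time-based refresh these hunks introduce: state is rebuilt when it is missing or when its last_updated_timestamp is older than the TTL. The class and method names below are illustrative only.

    from datetime import datetime, timedelta
    from typing import Any, Dict, Optional

    STATE_TTL = timedelta(minutes=15)

    class CachedState:
        def __init__(self) -> None:
            self.value: Optional[Dict[str, Any]] = None
            self.last_updated_timestamp: Optional[datetime] = None

        def set(self, value: Dict[str, Any]) -> None:
            self.value = value
            self.last_updated_timestamp = datetime.now()

        def get(self, fresh_value: Dict[str, Any]) -> Dict[str, Any]:
            # Rebuild when the state is missing or older than the 15-minute TTL
            is_stale = (
                self.last_updated_timestamp is not None
                and datetime.now() - self.last_updated_timestamp > STATE_TTL
            )
            if self.value is None or is_stale:
                self.set(fresh_value)
            return self.value
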
@@ -131,7 +131,9 @@ async def finalize_multidb_results(
     dataframe_list: List[
         Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
     ],
-    setup_parquet_output_func: Callable[[str, bool], Optional[ParquetFileWriter]],
+    setup_parquet_output_func: Callable[
+        [str, bool, Optional[str]], Optional[ParquetFileWriter]
+    ],
     output_path: str,
     typename: str,
 ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
@@ -189,7 +191,9 @@ async def finalize_multidb_results(
         return concatenated
 
     # Create new parquet output for concatenated data
-    concatenated_parquet_output = setup_parquet_output_func(output_path, True)
+    concatenated_parquet_output = setup_parquet_output_func(
+        output_path, True, typename
+    )
     if concatenated_parquet_output:
         await concatenated_parquet_output.write(concatenated)  # type: ignore
         return await concatenated_parquet_output.close()
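
A hedged sketch of a callable compatible with the widened setup_parquet_output_func type. Only the (str, bool, Optional[str]) shape comes from the hunk above; the ParquetFileWriter class here is a stand-in, and the parameter names and per-typename layout are assumptions made for illustration.

    from typing import Optional

    class ParquetFileWriter:  # stand-in for the SDK's writer type
        def __init__(self, path: str) -> None:
            self.path = path

    def setup_parquet_output(
        output_path: str, consolidate: bool, typename: Optional[str] = None
    ) -> Optional[ParquetFileWriter]:
        # The third argument is the new part: route output per typename when one is given
        target = f"{output_path}/{typename}" if typename else output_path
        return ParquetFileWriter(target) if consolidate else None

    writer = setup_parquet_output("/tmp/output", True, "TABLE")
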
@@ -1,4 +1,5 @@
 import os
+from datetime import datetime
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -60,6 +61,7 @@ class BaseSQLMetadataExtractionActivitiesState(ActivitiesState):
     sql_client: Optional[BaseSQLClient] = None
     handler: Optional[BaseSQLHandler] = None
     transformer: Optional[TransformerInterface] = None
+    last_updated_timestamp: Optional[datetime] = None
 
 
 class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
@@ -149,13 +151,30 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
 
         Args:
             workflow_args (Dict[str, Any]): Arguments passed to the workflow.
+
+        Note:
+            This method creates and configures the new SQL client before closing
+            the old one to ensure state is never left with a closed client if
+            initialization fails. The timestamp is only updated after the new
+            client is successfully created and assigned.
         """
         workflow_id = get_workflow_id()
         if not self._state.get(workflow_id):
             self._state[workflow_id] = BaseSQLMetadataExtractionActivitiesState()
 
-        await super()._set_state(workflow_args)
+        existing_state = self._state[workflow_id]
+
+        # Update workflow_args early, but preserve old timestamp until new client is ready
+        # This ensures that if initialization fails, the state can still be refreshed
+        existing_state.workflow_args = workflow_args
+
+        # Store reference to old client for cleanup after new client is ready
+        old_sql_client = None
+        if existing_state and existing_state.sql_client is not None:
+            old_sql_client = existing_state.sql_client
 
+        # Create and configure new client BEFORE closing old one
+        # This ensures state is never left with a closed client if initialization fails
         sql_client = self.sql_client_class()
 
         # Load credentials BEFORE creating handler to avoid race condition
@@ -165,10 +184,29 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         )
         await sql_client.load(credentials)
 
-        # Assign sql_client and handler to state AFTER credentials are loaded
+        # Only after new client is successfully created and configured,
+        # close old client and assign new one to state
+        if old_sql_client is not None:
+            try:
+                await old_sql_client.close()
+                logger.debug(
+                    f"Closed existing SQL client for workflow {workflow_id} during state refresh"
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Failed to close existing SQL client for workflow {workflow_id}: {e}",
+                    exc_info=True,
+                )
+                # Continue even if close fails - new client is already ready
+
+        # Assign sql_client and handler to state AFTER new client is ready
         self._state[workflow_id].sql_client = sql_client
         handler = self.handler_class(sql_client)
         self._state[workflow_id].handler = handler
+        # Update timestamp only after successful client creation and assignment
+        # This ensures that if initialization fails, the old timestamp remains
+        # and the state can be refreshed again immediately
+        self._state[workflow_id].last_updated_timestamp = datetime.now()
 
         # Create transformer with required parameters from ApplicationConstants
         transformer_params = {
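
A compact sketch of the swap-before-close ordering the Note above describes, using a hypothetical client class rather than the SDK's BaseSQLClient: the replacement client is built and loaded first, the old client is closed best-effort, and the timestamp is set only after the swap succeeds.

    import asyncio
    from datetime import datetime
    from typing import Optional

    class FakeSQLClient:  # hypothetical stand-in for the SDK's SQL client
        async def load(self, credentials: dict) -> None: ...
        async def close(self) -> None: ...

    class State:
        sql_client: Optional[FakeSQLClient] = None
        last_updated_timestamp: Optional[datetime] = None

    async def refresh(state: State, credentials: dict) -> None:
        old_client = state.sql_client           # kept only for later cleanup
        new_client = FakeSQLClient()
        await new_client.load(credentials)      # may raise; state is untouched so far
        if old_client is not None:
            try:
                await old_client.close()        # best effort; failure is non-fatal
            except Exception:
                pass
        state.sql_client = new_client
        state.last_updated_timestamp = datetime.now()  # stamped only after success

    asyncio.run(refresh(State(), {"user": "example"}))
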
@@ -49,9 +49,104 @@ if TYPE_CHECKING:
 
 
 class Reader(ABC):
+    """Abstract base class for reader data sources.
+
+    This class defines the interface for reader handlers that can read data
+    from various sources in different formats. Follows Python's file I/O
+    pattern with read/close semantics and supports context managers.
+
+    Attributes:
+        path (str): Path where the reader will read from.
+        _is_closed (bool): Whether the reader has been closed.
+        _downloaded_files (List[str]): List of downloaded temporary files to clean up.
+        cleanup_on_close (bool): Whether to clean up downloaded temp files on close.
+
+    Example:
+        Using close() explicitly::
+
+            reader = ParquetFileReader(path="/data/input")
+            df = await reader.read()
+            await reader.close()  # Cleans up any downloaded temp files
+
+        Using context manager (recommended)::
+
+            async with ParquetFileReader(path="/data/input") as reader:
+                df = await reader.read()
+            # close() called automatically
+
+        Reading in batches with context manager::
+
+            async with JsonFileReader(path="/data/input") as reader:
+                async for batch in reader.read_batches():
+                    process(batch)
+            # close() called automatically
     """
-    Abstract base class for reader data sources.
-    """
+
+    path: str
+    _is_closed: bool = False
+    _downloaded_files: List[str] = []
+    cleanup_on_close: bool = True
+
+    async def __aenter__(self) -> "Reader":
+        """Enter the async context manager.
+
+        Returns:
+            Reader: The reader instance.
+        """
+        return self
+
+    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
+        """Exit the async context manager, closing the reader.
+
+        Args:
+            exc_type: Exception type if an exception was raised.
+            exc_val: Exception value if an exception was raised.
+            exc_tb: Exception traceback if an exception was raised.
+        """
+        await self.close()
+
+    async def close(self) -> None:
+        """Close the reader and clean up any downloaded temporary files.
+
+        This method cleans up any temporary files that were downloaded from
+        the object store during read operations. Calling close() multiple
+        times is safe (subsequent calls are no-ops).
+
+        Note:
+            Set ``cleanup_on_close=False`` during initialization to retain
+            downloaded files after closing.
+
+        Example::
+
+            reader = ParquetFileReader(path="/data/input")
+            df = await reader.read()
+            await reader.close()  # Cleans up temp files
+        """
+        if self._is_closed:
+            return
+
+        if self.cleanup_on_close and self._downloaded_files:
+            await self._cleanup_downloaded_files()
+
+        self._is_closed = True
+
+    async def _cleanup_downloaded_files(self) -> None:
+        """Clean up downloaded temporary files.
+
+        Override this method in subclasses for custom cleanup behavior.
+        """
+        import shutil
+
+        for file_path in self._downloaded_files:
+            try:
+                if os.path.isfile(file_path):
+                    os.remove(file_path)
+                elif os.path.isdir(file_path):
+                    shutil.rmtree(file_path, ignore_errors=True)
+            except Exception as e:
+                logger.warning(f"Failed to clean up temporary file {file_path}: {e}")
+
+        self._downloaded_files.clear()
 
     @abstractmethod
     def read_batches(
@@ -62,27 +157,27 @@ class Reader(ABC):
         Iterator["daft.DataFrame"],
         AsyncIterator["daft.DataFrame"],
     ]:
-        """
-        Get an iterator of batched pandas DataFrames.
+        """Get an iterator of batched pandas DataFrames.
 
         Returns:
            Iterator["pd.DataFrame"]: An iterator of batched pandas DataFrames.
 
         Raises:
            NotImplementedError: If the method is not implemented.
+            ValueError: If the reader has been closed.
        """
        raise NotImplementedError
 
     @abstractmethod
     async def read(self) -> Union["pd.DataFrame", "daft.DataFrame"]:
-        """
-        Get a single pandas or daft DataFrame.
+        """Get a single pandas or daft DataFrame.
 
         Returns:
            Union["pd.DataFrame", "daft.DataFrame"]: A pandas or daft DataFrame.
 
         Raises:
            NotImplementedError: If the method is not implemented.
+            ValueError: If the reader has been closed.
        """
        raise NotImplementedError
 
@@ -27,9 +27,36 @@ activity.logger = logger
 
 
 class JsonFileReader(Reader):
-    """
-    JSON File Reader class to read data from JSON files using daft and pandas.
+    """JSON File Reader class to read data from JSON files using daft and pandas.
+
     Supports reading both single files and directories containing multiple JSON files.
+    Follows Python's file I/O pattern with read/close semantics and supports context managers.
+
+    Attributes:
+        path (str): Path to JSON file or directory containing JSON files.
+        chunk_size (int): Number of rows per batch.
+        file_names (Optional[List[str]]): List of specific file names to read.
+        dataframe_type (DataframeType): Type of dataframe to return (pandas or daft).
+        cleanup_on_close (bool): Whether to clean up downloaded temp files on close.
+
+    Example:
+        Using context manager (recommended)::
+
+            async with JsonFileReader(path="/data/input") as reader:
+                df = await reader.read()
+            # close() called automatically, temp files cleaned up
+
+        Reading in batches::
+
+            async with JsonFileReader(path="/data/input", chunk_size=50000) as reader:
+                async for batch in reader.read_batches():
+                    process(batch)
+
+        Using close() explicitly::
+
+            reader = JsonFileReader(path="/data/input")
+            df = await reader.read()
+            await reader.close()  # Clean up downloaded temp files
     """
 
     def __init__(
@@ -38,6 +65,7 @@ class JsonFileReader(Reader):
         file_names: Optional[List[str]] = None,
         chunk_size: Optional[int] = 100000,
         dataframe_type: DataframeType = DataframeType.pandas,
+        cleanup_on_close: bool = True,
     ):
         """Initialize the JsonInput class.
 
@@ -48,6 +76,8 @@ class JsonFileReader(Reader):
                 Wildcards are not supported.
             file_names (Optional[List[str]]): List of specific file names to read. Defaults to None.
             chunk_size (int): Number of rows per batch. Defaults to 100000.
+            dataframe_type (DataframeType): Type of dataframe to read. Defaults to DataframeType.pandas.
+            cleanup_on_close (bool): Whether to clean up downloaded temp files on close. Defaults to True.
 
         Raises:
             ValueError: When path is not provided or when single file path is combined with file_names
@@ -65,12 +95,22 @@ class JsonFileReader(Reader):
         self.chunk_size = chunk_size
         self.file_names = file_names
         self.dataframe_type = dataframe_type
+        self.cleanup_on_close = cleanup_on_close
+        self._is_closed = False
+        self._downloaded_files: List[str] = []
 
     async def read(self) -> Union["pd.DataFrame", "daft.DataFrame"]:
+        """Read the data from the JSON files and return as a single DataFrame.
+
+        Returns:
+            Union[pd.DataFrame, daft.DataFrame]: Combined dataframe from JSON files.
+
+        Raises:
+            ValueError: If the reader has been closed or dataframe_type is unsupported.
         """
-        Method to read the data from the json files in the path
-        and return as a single combined pandas dataframe
-        """
+        if self._is_closed:
+            raise ValueError("Cannot read from a closed reader")
+
 
         if self.dataframe_type == DataframeType.pandas:
             return await self._get_dataframe()
@@ -84,10 +124,18 @@ class JsonFileReader(Reader):
         AsyncIterator["pd.DataFrame"],
         AsyncIterator["daft.DataFrame"],
     ]:
+        """Read the data from the JSON files and return as batched DataFrames.
+
+        Returns:
+            Union[AsyncIterator[pd.DataFrame], AsyncIterator[daft.DataFrame]]:
+                Async iterator of DataFrames.
+
+        Raises:
+            ValueError: If the reader has been closed or dataframe_type is unsupported.
         """
-        Method to read the data from the json files in the path
-        and return as a batched pandas dataframe
-        """
+        if self._is_closed:
+            raise ValueError("Cannot read from a closed reader")
+
         if self.dataframe_type == DataframeType.pandas:
             return self._get_batched_dataframe()
         elif self.dataframe_type == DataframeType.daft:
@@ -98,10 +146,7 @@ class JsonFileReader(Reader):
     async def _get_batched_dataframe(
         self,
     ) -> AsyncIterator["pd.DataFrame"]:
-        """
-        Method to read the data from the json files in the path
-        and return as a batched pandas dataframe
-        """
+        """Read the data from the JSON files and return as a batched pandas dataframe."""
         try:
             import pandas as pd
 
@@ -109,6 +154,8 @@ class JsonFileReader(Reader):
             json_files = await download_files(
                 self.path, self.extension, self.file_names
             )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(json_files)
             logger.info(f"Reading {len(json_files)} JSON files in batches")
 
             for json_file in json_files:
@@ -124,10 +171,7 @@ class JsonFileReader(Reader):
             raise
 
     async def _get_dataframe(self) -> "pd.DataFrame":
-        """
-        Method to read the data from the json files in the path
-        and return as a single combined pandas dataframe
-        """
+        """Read the data from the JSON files and return as a single pandas dataframe."""
         try:
             import pandas as pd
 
@@ -135,6 +179,8 @@ class JsonFileReader(Reader):
             json_files = await download_files(
                 self.path, self.extension, self.file_names
             )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(json_files)
             logger.info(f"Reading {len(json_files)} JSON files as pandas dataframe")
 
             return pd.concat(
@@ -149,10 +195,7 @@ class JsonFileReader(Reader):
     async def _get_batched_daft_dataframe(
         self,
     ) -> AsyncIterator["daft.DataFrame"]:  # noqa: F821
-        """
-        Method to read the data from the json files in the path
-        and return as a batched daft dataframe
-        """
+        """Read the data from the JSON files and return as a batched daft dataframe."""
         try:
             import daft
 
@@ -160,6 +203,8 @@ class JsonFileReader(Reader):
             json_files = await download_files(
                 self.path, self.extension, self.file_names
            )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(json_files)
             logger.info(f"Reading {len(json_files)} JSON files as daft batches")
 
             # Yield each discovered file as separate batch with chunking
@@ -170,10 +215,7 @@ class JsonFileReader(Reader):
             raise
 
     async def _get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
-        """
-        Method to read the data from the json files in the path
-        and return as a single combined daft dataframe
-        """
+        """Read the data from the JSON files and return as a single daft dataframe."""
         try:
             import daft
 
@@ -181,6 +223,8 @@ class JsonFileReader(Reader):
             json_files = await download_files(
                 self.path, self.extension, self.file_names
             )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(json_files)
             logger.info(f"Reading {len(json_files)} JSON files with daft")
 
             # Use the discovered/downloaded files directly
@@ -40,9 +40,37 @@ if TYPE_CHECKING:
 
 
 class ParquetFileReader(Reader):
-    """
-    Parquet File Reader class to read data from Parquet files using daft and pandas.
+    """Parquet File Reader class to read data from Parquet files using daft and pandas.
+
     Supports reading both single files and directories containing multiple parquet files.
+    Follows Python's file I/O pattern with read/close semantics and supports context managers.
+
+    Attributes:
+        path (str): Path to parquet file or directory containing parquet files.
+        chunk_size (int): Number of rows per batch.
+        buffer_size (int): Number of rows per batch for daft.
+        file_names (Optional[List[str]]): List of specific file names to read.
+        dataframe_type (DataframeType): Type of dataframe to return (pandas or daft).
+        cleanup_on_close (bool): Whether to clean up downloaded temp files on close.
+
+    Example:
+        Using context manager (recommended)::
+
+            async with ParquetFileReader(path="/data/input") as reader:
+                df = await reader.read()
+            # close() called automatically, temp files cleaned up
+
+        Reading in batches::
+
+            async with ParquetFileReader(path="/data/input", chunk_size=50000) as reader:
+                async for batch in reader.read_batches():
+                    process(batch)
+
+        Using close() explicitly::
+
+            reader = ParquetFileReader(path="/data/input")
+            df = await reader.read()
+            await reader.close()  # Clean up downloaded temp files
     """
 
     def __init__(
@@ -52,6 +80,7 @@ class ParquetFileReader(Reader):
         chunk_size: Optional[int] = 100000,
         buffer_size: Optional[int] = 5000,
         file_names: Optional[List[str]] = None,
         dataframe_type: DataframeType = DataframeType.pandas,
+        cleanup_on_close: bool = True,
     ):
         """Initialize the Parquet input class.
@@ -64,6 +93,7 @@ class ParquetFileReader(Reader):
             buffer_size (int): Number of rows per batch. Defaults to 5000.
             file_names (Optional[List[str]]): List of file names to read. Defaults to None.
             dataframe_type (DataframeType): Type of dataframe to read. Defaults to DataframeType.pandas.
+            cleanup_on_close (bool): Whether to clean up downloaded temp files on close. Defaults to True.
 
         Raises:
             ValueError: When path is not provided or when single file path is combined with file_names
@@ -81,12 +111,22 @@ class ParquetFileReader(Reader):
         self.buffer_size = buffer_size
         self.file_names = file_names
         self.dataframe_type = dataframe_type
+        self.cleanup_on_close = cleanup_on_close
+        self._is_closed = False
+        self._downloaded_files: List[str] = []
 
     async def read(self) -> Union["pd.DataFrame", "daft.DataFrame"]:
+        """Read the data from the parquet files and return as a single DataFrame.
+
+        Returns:
+            Union[pd.DataFrame, daft.DataFrame]: Combined dataframe from parquet files.
+
+        Raises:
+            ValueError: If the reader has been closed or dataframe_type is unsupported.
         """
-        Method to read the data from the parquet files in the path
-        and return as a single combined pandas dataframe
-        """
+        if self._is_closed:
+            raise ValueError("Cannot read from a closed reader")
+
         if self.dataframe_type == DataframeType.pandas:
             return await self._get_dataframe()
         elif self.dataframe_type == DataframeType.daft:
@@ -100,10 +140,18 @@ class ParquetFileReader(Reader):
         AsyncIterator["pd.DataFrame"],
         AsyncIterator["daft.DataFrame"],
     ]:
+        """Read the data from the parquet files and return as batched DataFrames.
+
+        Returns:
+            Union[AsyncIterator[pd.DataFrame], AsyncIterator[daft.DataFrame]]:
+                Async iterator of DataFrames.
+
+        Raises:
+            ValueError: If the reader has been closed or dataframe_type is unsupported.
         """
-        Method to read the data from the parquet files in the path
-        and return as a batched pandas dataframe
-        """
+        if self._is_closed:
+            raise ValueError("Cannot read from a closed reader")
+
         if self.dataframe_type == DataframeType.pandas:
             return self._get_batched_dataframe()
         elif self.dataframe_type == DataframeType.daft:
@@ -149,6 +197,8 @@ class ParquetFileReader(Reader):
             parquet_files = await download_files(
                 self.path, PARQUET_FILE_EXTENSION, self.file_names
             )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(parquet_files)
             logger.info(f"Reading {len(parquet_files)} parquet files")
 
             return pd.concat(
@@ -208,6 +258,8 @@ class ParquetFileReader(Reader):
             parquet_files = await download_files(
                 self.path, PARQUET_FILE_EXTENSION, self.file_names
             )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(parquet_files)
             logger.info(f"Reading {len(parquet_files)} parquet files in batches")
 
             # Process each file individually to maintain memory efficiency
@@ -259,6 +311,8 @@ class ParquetFileReader(Reader):
             parquet_files = await download_files(
                 self.path, PARQUET_FILE_EXTENSION, self.file_names
             )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(parquet_files)
             logger.info(f"Reading {len(parquet_files)} parquet files with daft")
 
             # Use the discovered/downloaded files directly
@@ -317,6 +371,8 @@ class ParquetFileReader(Reader):
             parquet_files = await download_files(
                 self.path, PARQUET_FILE_EXTENSION, self.file_names
             )
+            # Track downloaded files for cleanup on close
+            self._downloaded_files.extend(parquet_files)
             logger.info(f"Reading {len(parquet_files)} parquet files as daft batches")
 
             # Create a lazy dataframe without loading data into memory
@@ -426,16 +426,9 @@ class AtlanObservability(Generic[T], ABC):
                     chunk_start=0,
                     chunk_part=int(time()),
                 )
-                logging.info(
-                    f"Successfully instantiated ParquetFileWriter for partition: {partition_path}"
-                )
 
                 await parquet_writer._write_dataframe(dataframe=df)
 
-                logging.info(
-                    f"Successfully wrote {len(df)} records to partition: {partition_path}"
-                )
-
             except Exception as partition_error:
                 logging.error(
                     f"Error processing partition {partition_path}: {str(partition_error)}"
@@ -459,9 +459,22 @@ class ObjectStore:
 
         logger.info(f"Found {len(file_list)} files to download from: {source}")
 
+        # Normalize source prefix to use forward slashes for comparison
+        normalized_source = cls._normalize_object_store_key(source)
+
         # Download each file
         for file_path in file_list:
-            local_file_path = os.path.join(destination, file_path)
+            normalized_file_path = cls._normalize_object_store_key(file_path)
+            if normalized_file_path.startswith(normalized_source):
+                # Extract relative path after the prefix
+                relative_path = normalized_file_path[
+                    len(normalized_source) :
+                ].lstrip("/")
+            else:
+                # Fallback to just the filename
+                relative_path = os.path.basename(normalized_file_path)
+
+            local_file_path = os.path.join(destination, relative_path)
             await cls.download_file(file_path, local_file_path, store_name)
 
         logger.info(f"Successfully downloaded all files from: {source}")
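
A standalone sketch of the relative-path handling added above. The SDK's cls._normalize_object_store_key is not shown in this diff, so a simple backslash-to-slash normalization stands in for it here; treat that helper as an assumption.

    import os

    def normalize_key(key: str) -> str:
        # Stand-in for ObjectStore._normalize_object_store_key (assumed behaviour)
        return key.replace("\\", "/")

    def local_path_for(source: str, file_path: str, destination: str) -> str:
        normalized_source = normalize_key(source)
        normalized_file_path = normalize_key(file_path)
        if normalized_file_path.startswith(normalized_source):
            # Preserve the directory structure below the source prefix
            relative_path = normalized_file_path[len(normalized_source):].lstrip("/")
        else:
            # Fallback: flatten to the bare file name
            relative_path = os.path.basename(normalized_file_path)
        return os.path.join(destination, relative_path)

    print(local_path_for("bucket/raw", "bucket/raw/db/table.parquet", "/tmp/out"))
    # prints /tmp/out/db/table.parquet on POSIX systems
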
@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple, Type
 
 import daft
 import yaml
+from daft.functions import to_struct, when
 from pyatlan.model.enums import AtlanConnectorType
 
 from application_sdk.observability.logger_adaptor import get_logger
@@ -227,7 +228,7 @@ class QueryBasedTransformer(TransformerInterface):
         # Only create a struct if we have fields
         if struct_fields:
             # Create the struct first
-            struct = daft.struct(*struct_fields)
+            struct = to_struct(*struct_fields)
 
             # If we have non-null checks, apply them
             if non_null_fields:
@@ -236,8 +237,8 @@ class QueryBasedTransformer(TransformerInterface):
                 for check in non_null_fields[1:]:
                     any_non_null = any_non_null | check
 
-                # Use if_else on the any_non_null Expression
-                return any_non_null.if_else(struct, None).alias(prefix)
+                # Use when().otherwise() for conditional expression (replaces if_else in daft 0.7+)
+                return when(any_non_null, struct).otherwise(None).alias(prefix)
 
             return struct.alias(prefix)
 
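Since the transformer now targets daft's newer expression API (see the daft>=0.7.1 pin further down), a minimal sketch of the when()/otherwise() conditional-struct pattern may help. It assumes daft >= 0.7; the column names and the not_null() checks are illustrative, not taken from the SDK's transformer.

    import daft
    from daft.functions import to_struct, when

    df = daft.from_pydict({"table_name": ["orders", None], "table_schema": ["public", None]})

    struct_fields = [daft.col("table_name"), daft.col("table_schema")]
    non_null_checks = [field.not_null() for field in struct_fields]

    any_non_null = non_null_checks[0]
    for check in non_null_checks[1:]:
        any_non_null = any_non_null | check

    # when(condition, value).otherwise(fallback) replaces Expression.if_else(value, fallback)
    df = df.select(
        when(any_non_null, to_struct(*struct_fields)).otherwise(None).alias("attributes")
    )
    df.show()
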
@@ -2,4 +2,4 @@
 Version information for the application_sdk package.
 """
 
-__version__ = "2.0.0"
+__version__ = "2.1.1"
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: atlan-application-sdk
-Version: 2.0.0
+Version: 2.1.1
 Summary: Atlan Application SDK is a Python library for developing applications on the Atlan Platform
 Project-URL: Repository, https://github.com/atlanhq/application-sdk
 Project-URL: Documentation, https://github.com/atlanhq/application-sdk/README.md
@@ -31,7 +31,7 @@ Requires-Dist: pydantic<2.13.0,>=2.10.6
 Requires-Dist: python-dotenv<1.3.0,>=1.1.0
 Requires-Dist: uvloop<0.23.0,>=0.21.0; sys_platform != 'win32'
 Provides-Extra: daft
-Requires-Dist: daft<0.8.0,>=0.4.12; extra == 'daft'
+Requires-Dist: daft<0.8.0,>=0.7.1; extra == 'daft'
 Provides-Extra: distributed-lock
 Requires-Dist: redis[hiredis]<7.2.0,>=5.2.0; extra == 'distributed-lock'
 Provides-Extra: iam-auth
@@ -1,18 +1,18 @@
 application_sdk/__init__.py,sha256=2e2mvmLJ5dxmJGPELtb33xwP-j6JMdoIuqKycEn7hjg,151
 application_sdk/constants.py,sha256=TvdmKQShVWBNQZdVF2y-fxuE31FmeraTnqQ9jT_n5XY,11567
-application_sdk/version.py,sha256=TY0SZFUH9N2qGF6tlbmJIww-CY7x-myMGotNEhFJko4,84
+application_sdk/version.py,sha256=sNbvXviG7NgxM58lOHKhbZfERat5qAJNr3UZy_toVQs,84
 application_sdk/worker.py,sha256=DLMocpHvvwpdAopyXhxwM7ftaNlKvZMQfkgy1MFyiik,7561
-application_sdk/activities/__init__.py,sha256=6SiefuOPUDGfN3z6oPY4RkQLiUmkHpoDy5xadzpDzAw,11588
+application_sdk/activities/__init__.py,sha256=i7iY6aL1VFg185n2rLLvD_sI2BA9zJ33jL5rD_sY__U,12350
 application_sdk/activities/lock_management.py,sha256=6Wdf3jMKitoarHQP91PIJOoGFz4aaOLS_40c7n1yAOA,3902
 application_sdk/activities/.cursor/BUGBOT.md,sha256=FNykX5aMkdOhzgpiGqstOnSp9JN63iR2XP3onU4AGh8,15843
 application_sdk/activities/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 application_sdk/activities/common/models.py,sha256=43MF_w0EzEQiJvGIqF_FNet4X6MEmwqYd3YAsHdQn08,1362
-application_sdk/activities/common/sql_utils.py,sha256=csLM8H9L5NY5_rhxHBFo-jkMoOKjxhB9xaFbnLbkBGg,10177
+application_sdk/activities/common/sql_utils.py,sha256=QD4qOGkgJmlAGZKaSxqfC0AkjZVdTqdr6Q_Tw2CjIsM,10246
 application_sdk/activities/common/utils.py,sha256=ngyFmiZnMCAQtyu6vGeAlkzwNkM29MD_gBU5pWqOxJ4,8392
 application_sdk/activities/metadata_extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 application_sdk/activities/metadata_extraction/base.py,sha256=ENFojpxqKdN_eVSL4iet3cGfylPOfcl1jnflfo4zhs8,3920
 application_sdk/activities/metadata_extraction/rest.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-application_sdk/activities/metadata_extraction/sql.py,sha256=IkI1ZhOKyoSwosRT-g8c8IDBuFBq7mwyHLpDvwYO_B4,25451
+application_sdk/activities/metadata_extraction/sql.py,sha256=CmE77EsgbOuDL5AKaRCnq1jApJnDWNVxx-RZ49cJwus,27415
 application_sdk/activities/query_extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 application_sdk/activities/query_extraction/sql.py,sha256=Gsa79R8CYY0uyt3rA2nLMfQs8-C4_zg1pJ_yYSF2cZw,21193
 application_sdk/application/__init__.py,sha256=vcrQsqlfmGvKcCZuOtHHaNRqHSGdXlEDftkb8Tv_shI,9867
@@ -64,15 +64,15 @@ application_sdk/interceptors/events.py,sha256=e0O6uK9_aCTmOORaTGN9RbcTg9_KNaakq-
 application_sdk/interceptors/lock.py,sha256=5ETm20zrTaH2b9fepN4Ckp1tGJV-uINqDrno_5RW3aw,6169
 application_sdk/interceptors/models.py,sha256=kEzJKvb-G1M7aKrLPgAmsukJXLXeh8hIJKwEkOiaY28,6115
 application_sdk/interceptors/.cursor/BUGBOT.md,sha256=pxmUF2c7dtaXAX8yAa1-LBa6FCrj_uw7aQcHrppjf1A,14570
-application_sdk/io/__init__.py,sha256=i5EbFW0IInErum4DTRAhWBjwT2EwiUHTpdYBHNmI-I4,24317
-application_sdk/io/json.py,sha256=yjgCMzbz4Ltyt0PCylVRrUqGl2FhQL_wfWW1KtSi-II,17435
-application_sdk/io/parquet.py,sha256=1J_C5c_LwDfuugIpiQqYgmU2uVcGearF5Kkrx-_34_I,31740
+application_sdk/io/__init__.py,sha256=Fse-fEyrpMlLUxwyFkH8vWWSXz8rdWGlAjZy5ulAZCU,27767
+application_sdk/io/json.py,sha256=sNSyHZCM_ZeaiJHUrolYVHKreBQqSCBsfsjD3JSkoD0,19729
+application_sdk/io/parquet.py,sha256=zy9H_TvWI5CkktJ582NH7Ut_5rUH_S0Jy7ZbTD0JxeI,34227
 application_sdk/io/utils.py,sha256=sn_8Q6HgjeC8uyZp2XGMAfqdJ8XzkIllOEVYXIH54DY,10724
 application_sdk/observability/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 application_sdk/observability/context.py,sha256=lJjpfxEjMY_hrdSDqq519YaWcztgc_1nM4d-mGV5shs,634
 application_sdk/observability/logger_adaptor.py,sha256=Fq5OE579ozr0EzsNYEh2H0q3POVAxtlWfJ-PSwWDGLM,30194
 application_sdk/observability/metrics_adaptor.py,sha256=5Oz02lUED60duryoVDF9mbD11fpxhbXi7P1609n_15Y,16446
-application_sdk/observability/observability.py,sha256=m5OgD_akc3YrkF5mCme2HcRaY7ysjLbVSaEs2C-K2Fs,24062
+application_sdk/observability/observability.py,sha256=O2rBal_0pmFRen7Yx4c4dSH1NyiT937b4bY2w63q-4U,23751
 application_sdk/observability/traces_adaptor.py,sha256=0eQJPN-tYA_dV8D3uEa5ZiX9g12NDuLnPaFuQMVDdL0,18242
 application_sdk/observability/utils.py,sha256=-02GAFom8Bg4SNyCTNYySmen2dzvLfTu43bqsNq1AH0,3096
 application_sdk/observability/decorators/observability_decorator.py,sha256=yd6qfrg1MmH5KcZ5Ydzb0RaBzmxx5FrmiI9qwvZx3EU,8963
@@ -92,7 +92,7 @@ application_sdk/services/__init__.py,sha256=H-5HZEPdr53MUfAggyHqHhRXDRLZFZsxvJgW
 application_sdk/services/_utils.py,sha256=0yHqDP6qNb1OT-bX2XRYQPZ5xkGkV13nyRw6GkPlHs8,1136
 application_sdk/services/atlan_storage.py,sha256=TKzXxu0yXeUcmZehwp8PcnQTC4A9w9RlZ0Fl-Xp1bLE,8509
 application_sdk/services/eventstore.py,sha256=wCT921KRzUe3fAWKC-bbM6_OtIJTKpSQrOutPQzMEgs,6745
-application_sdk/services/objectstore.py,sha256=JgQrL9z_6aG5micVd7I2N6l3RA4EUJh4T2BCuC_ATwQ,20161
+application_sdk/services/objectstore.py,sha256=dLljCsPPSr24bPKh0l3-xRblofzKVQ4LDfqDrMp8JGc,20819
 application_sdk/services/secretstore.py,sha256=Pmn1WlmHmgaDhWz5OXBB5_rKXQQMyLMzadwZSNKwc6Q,19070
 application_sdk/services/statestore.py,sha256=3-afiM3Vsoe1XDYRokdGTB5I5CwOKyieuX5RwIZf77o,9413
 application_sdk/test_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -138,7 +138,7 @@ application_sdk/transformers/atlas/__init__.py,sha256=fw3D8bBtt61SseAfYut3JZddpX
 application_sdk/transformers/atlas/sql.py,sha256=rkQXNZ7oebts5oF5E_Bw8NpcHHKScU0TmKciH_1l_k4,50419
 application_sdk/transformers/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 application_sdk/transformers/common/utils.py,sha256=4ISMIQ0Gzghmi31p51FOFm5KLF7XF-fmH9PVT7i0DFE,4899
-application_sdk/transformers/query/__init__.py,sha256=yG1dGP3NhUizwkCgyFAzsr9SV9uWYZKjXoCWPrsIxVw,17358
+application_sdk/transformers/query/__init__.py,sha256=4uVCU-NfDe08PlffjWQ5p4smQa7c518IL2rDgIk6694,17446
 application_sdk/transformers/query/templates/column.yaml,sha256=EXLYwGXN7LKT-v51n2EZnY99o6vHucyFaVSpM-sUSXw,7679
 application_sdk/transformers/query/templates/database.yaml,sha256=SD1hJg5LI7gsBHQL5mW341sa51EkhcsIDDFlIOi9zdk,1374
 application_sdk/transformers/query/templates/extras-procedure.yaml,sha256=XhAfVY4zm99K8fcgkYA1XPLv4ks-SA6SzMO3SMtQ60s,2298
@@ -152,8 +152,8 @@ application_sdk/workflows/metadata_extraction/__init__.py,sha256=jHUe_ZBQ66jx8bg
 application_sdk/workflows/metadata_extraction/sql.py,sha256=6ZaVt84n-8U2ZvR9GR7uIJKv5v8CuyQjhlnoRJvDszc,12435
 application_sdk/workflows/query_extraction/__init__.py,sha256=n066_CX5RpJz6DIxGMkKS3eGSRg03ilaCtsqfJWQb7Q,117
 application_sdk/workflows/query_extraction/sql.py,sha256=kT_JQkLCRZ44ZpaC4QvPL6DxnRIIVh8gYHLqRbMI-hA,4826
-atlan_application_sdk-2.0.0.dist-info/METADATA,sha256=B9-UVeLlDGDuxko6Nvb6Y8zrLNueaATYaJZMESikYZU,5806
-atlan_application_sdk-2.0.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-atlan_application_sdk-2.0.0.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-atlan_application_sdk-2.0.0.dist-info/licenses/NOTICE,sha256=A-XVVGt3KOYuuMmvSMIFkg534F1vHiCggEBp4Ez3wGk,1041
-atlan_application_sdk-2.0.0.dist-info/RECORD,,
+atlan_application_sdk-2.1.1.dist-info/METADATA,sha256=Vc2uG2FMhuNXyZFXmGMmvc_LRpCBaNTcQEHpSV8NpOE,5805
+atlan_application_sdk-2.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+atlan_application_sdk-2.1.1.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+atlan_application_sdk-2.1.1.dist-info/licenses/NOTICE,sha256=A-XVVGt3KOYuuMmvSMIFkg534F1vHiCggEBp4Ez3wGk,1041
+atlan_application_sdk-2.1.1.dist-info/RECORD,,