atlan-application-sdk 1.1.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/common/sql_utils.py +308 -0
- application_sdk/activities/common/utils.py +1 -45
- application_sdk/activities/metadata_extraction/sql.py +110 -353
- application_sdk/activities/query_extraction/sql.py +12 -11
- application_sdk/application/__init__.py +1 -1
- application_sdk/clients/sql.py +167 -1
- application_sdk/clients/temporal.py +6 -6
- application_sdk/common/types.py +8 -0
- application_sdk/common/utils.py +1 -8
- application_sdk/constants.py +1 -1
- application_sdk/handlers/sql.py +10 -25
- application_sdk/interceptors/events.py +1 -1
- application_sdk/io/__init__.py +654 -0
- application_sdk/io/json.py +429 -0
- application_sdk/{outputs → io}/parquet.py +358 -47
- application_sdk/io/utils.py +307 -0
- application_sdk/observability/observability.py +23 -12
- application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
- application_sdk/server/fastapi/middleware/metrics.py +27 -24
- application_sdk/server/fastapi/models.py +1 -1
- application_sdk/server/fastapi/routers/server.py +1 -1
- application_sdk/server/fastapi/utils.py +10 -0
- application_sdk/services/eventstore.py +4 -4
- application_sdk/services/secretstore.py +1 -1
- application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
- application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
- application_sdk/version.py +1 -1
- application_sdk/worker.py +1 -1
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/METADATA +9 -11
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/RECORD +35 -42
- application_sdk/common/dataframe_utils.py +0 -42
- application_sdk/events/__init__.py +0 -5
- application_sdk/inputs/.cursor/BUGBOT.md +0 -250
- application_sdk/inputs/__init__.py +0 -168
- application_sdk/inputs/iceberg.py +0 -75
- application_sdk/inputs/json.py +0 -136
- application_sdk/inputs/parquet.py +0 -272
- application_sdk/inputs/sql_query.py +0 -271
- application_sdk/outputs/.cursor/BUGBOT.md +0 -295
- application_sdk/outputs/__init__.py +0 -453
- application_sdk/outputs/iceberg.py +0 -139
- application_sdk/outputs/json.py +0 -268
- /application_sdk/{events → interceptors}/models.py +0 -0
- /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/NOTICE +0 -0
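Of the changes above, the most visible API break is the removal of the application_sdk.inputs and application_sdk.outputs packages in favor of a consolidated application_sdk.io package (with outputs/parquet.py carried over as io/parquet.py). For orientation only, a minimal import sketch of the new layout; the ParquetFileWriter import path is confirmed by the sql_utils.py diff below, while the commented 1.x paths are simply the deleted modules listed above (the class names they exported are not visible in this diff):

    # 2.0.0: file readers/writers are consolidated under application_sdk.io
    from application_sdk.io.parquet import ParquetFileWriter  # import path confirmed by sql_utils.py below

    # 1.1.1 modules deleted in this release (their exported names are not shown in this diff):
    #   application_sdk.inputs.parquet, application_sdk.inputs.json,
    #   application_sdk.inputs.sql_query, application_sdk.inputs.iceberg,
    #   application_sdk.outputs.parquet, application_sdk.outputs.json,
    #   application_sdk.outputs.iceberg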
application_sdk/activities/common/sql_utils.py
@@ -0,0 +1,308 @@
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    AsyncIterator,
+    Callable,
+    Dict,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
+
+from application_sdk.activities.common.models import ActivityStatistics
+from application_sdk.clients.sql import BaseSQLClient
+from application_sdk.common.utils import (
+    get_database_names,
+    parse_credentials_extra,
+    prepare_query,
+)
+from application_sdk.io.parquet import ParquetFileWriter
+from application_sdk.observability.logger_adaptor import get_logger
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+logger = get_logger(__name__)
+
+
+async def setup_database_connection(
+    sql_client: BaseSQLClient,
+    database_name: str,
+) -> None:
+    """Setup connection for a specific database.
+
+    Args:
+        sql_client: The SQL client to configure.
+        database_name: The name of the database to connect to.
+    """
+    extra = parse_credentials_extra(sql_client.credentials)
+    extra["database"] = database_name
+    sql_client.credentials["extra"] = extra
+    await sql_client.load(sql_client.credentials)
+
+
+def prepare_database_query(
+    sql_query: str,
+    database_name: Optional[str],
+    workflow_args: Dict[str, Any],
+    temp_table_regex_sql: str = "",
+    use_posix_regex: bool = False,
+) -> str:
+    """Prepare query for database execution with proper substitutions.
+
+    Args:
+        sql_query: The raw SQL query string.
+        database_name: The database name to substitute into the query.
+        workflow_args: Workflow arguments for query preparation.
+        temp_table_regex_sql: SQL regex for temp table exclusion.
+        use_posix_regex: Whether to use POSIX regex syntax.
+
+    Returns:
+        The prepared SQL query string.
+
+    Raises:
+        ValueError: If query preparation fails.
+    """
+    # Replace database name placeholder if provided
+    fetch_sql = sql_query
+    if database_name:
+        fetch_sql = fetch_sql.replace("{database_name}", database_name)
+
+    # Prepare the query
+    prepared_query = prepare_query(
+        query=fetch_sql,
+        workflow_args=workflow_args,
+        temp_table_regex_sql=temp_table_regex_sql,
+        use_posix_regex=use_posix_regex,
+    )
+
+    if prepared_query is None:
+        db_context = f" for database {database_name}" if database_name else ""
+        raise ValueError(f"Failed to prepare query{db_context}")
+
+    return prepared_query
+
+
+async def execute_single_db(
+    sql_client: BaseSQLClient,
+    prepared_query: Optional[str],
+    parquet_output: Optional[ParquetFileWriter],
+    write_to_file: bool,
+) -> Tuple[
+    bool, Optional[Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]]
+]:
+    """Execute a query against a single database.
+
+    Args:
+        sql_client: The SQL client to use.
+        prepared_query: The prepared SQL query to execute.
+        parquet_output: Optional parquet writer for output.
+        write_to_file: Whether to write results to file.
+
+    Returns:
+        Tuple of (success boolean, optional result iterator).
+    """
+    if not prepared_query:
+        logger.error("Prepared query is None, cannot execute")
+        return False, None
+
+    try:
+        batched_iterator = await sql_client.get_batched_results(prepared_query)
+
+        if write_to_file and parquet_output:
+            await parquet_output.write_batches(batched_iterator)  # type: ignore
+            return True, None
+
+        return True, batched_iterator
+    except Exception as e:
+        logger.error(
+            f"Error during query execution or output writing: {e}", exc_info=True
+        )
+        raise
+
+
+async def finalize_multidb_results(
+    write_to_file: bool,
+    concatenate: bool,
+    return_dataframe: bool,
+    parquet_output: Optional[ParquetFileWriter],
+    dataframe_list: List[
+        Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
+    ],
+    setup_parquet_output_func: Callable[[str, bool], Optional[ParquetFileWriter]],
+    output_path: str,
+    typename: str,
+) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
+    """Finalize results for multi-database execution.
+
+    Args:
+        write_to_file: Whether results were written to file.
+        concatenate: Whether to concatenate in-memory results.
+        return_dataframe: Whether to return the concatenated dataframe.
+        parquet_output: The parquet writer used (if any).
+        dataframe_list: List of dataframe iterators from each DB.
+        setup_parquet_output_func: Callback to create new parquet output.
+        output_path: Full path for output files.
+        typename: Type name for statistics.
+
+    Returns:
+        Statistics or DataFrame, or None.
+    """
+    if write_to_file and parquet_output:
+        return await parquet_output.close()
+
+    if not write_to_file and concatenate:
+        try:
+            import pandas as pd
+
+            valid_dataframes: List[pd.DataFrame] = []
+            for df_generator in dataframe_list:
+                if df_generator is None:
+                    continue
+                # Handle both async and sync iterators
+                if hasattr(df_generator, "__aiter__"):
+                    async for dataframe in df_generator:  # type: ignore
+                        if dataframe is None or (
+                            hasattr(dataframe, "empty") and dataframe.empty
+                        ):
+                            continue
+                        valid_dataframes.append(dataframe)
+                else:
+                    for dataframe in df_generator:  # type: ignore
+                        if dataframe is None or (
+                            hasattr(dataframe, "empty") and dataframe.empty
+                        ):
+                            continue
+                        valid_dataframes.append(dataframe)
+
+            if not valid_dataframes:
+                logger.warning(
+                    "No valid dataframes collected across databases for concatenation"
+                )
+                return None
+
+            concatenated = pd.concat(valid_dataframes, ignore_index=True)
+
+            if return_dataframe:
+                return concatenated
+
+            # Create new parquet output for concatenated data
+            concatenated_parquet_output = setup_parquet_output_func(output_path, True)
+            if concatenated_parquet_output:
+                await concatenated_parquet_output.write(concatenated)  # type: ignore
+                return await concatenated_parquet_output.close()
+        except Exception as e:
+            logger.error(
+                f"Error concatenating multi-DB dataframes: {str(e)}",
+                exc_info=True,
+            )
+            raise
+
+    logger.warning(
+        "multidb execution returned no output to write (write_to_file=False, concatenate=False)"
+    )
+    return None
+
+
+async def execute_multidb_flow(
+    sql_client: BaseSQLClient,
+    sql_query: str,
+    workflow_args: Dict[str, Any],
+    fetch_database_sql: Optional[str],
+    output_path: str,
+    typename: str,
+    write_to_file: bool,
+    concatenate: bool,
+    return_dataframe: bool,
+    parquet_output: Optional[ParquetFileWriter],
+    temp_table_regex_sql: str,
+    setup_parquet_output_func: Callable[[str, bool], Optional[ParquetFileWriter]],
+) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
+    """Execute multi-database flow with proper error handling and result finalization.
+
+    Args:
+        sql_client: The SQL client to use.
+        sql_query: The SQL query to execute on each database.
+        workflow_args: Workflow arguments.
+        fetch_database_sql: SQL to fetch list of databases.
+        output_path: Full path for output files.
+        typename: Type name for statistics.
+        write_to_file: Whether to write results to file.
+        concatenate: Whether to concatenate in-memory results.
+        return_dataframe: Whether to return the concatenated dataframe.
+        parquet_output: The parquet writer used (if any).
+        temp_table_regex_sql: SQL regex for temp table exclusion.
+        setup_parquet_output_func: Callback to create new parquet output.
+
+    Returns:
+        Statistics or DataFrame, or None.
+    """
+    # Resolve databases to iterate
+    database_names = await get_database_names(
+        sql_client, workflow_args, fetch_database_sql
+    )
+    if not database_names:
+        logger.warning("No databases found to process")
+        return None
+
+    successful_databases: List[str] = []
+    dataframe_list: List[
+        Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
+    ] = []
+
+    # Iterate databases and execute
+    for database_name in database_names or []:
+        try:
+            # Setup connection for this database
+            await setup_database_connection(sql_client, database_name)
+
+            # Prepare query for this database
+            prepared_query = prepare_database_query(
+                sql_query,
+                database_name,
+                workflow_args,
+                temp_table_regex_sql,
+                use_posix_regex=True,
+            )
+
+            # Execute using helper method
+            success, batched_iterator = await execute_single_db(
+                sql_client,
+                prepared_query,
+                parquet_output,
+                write_to_file,
+            )
+
+            if success:
+                logger.info(f"Successfully processed database: {database_name}")
+
+        except Exception as e:
+            logger.error(
+                f"Failed to process database '{database_name}': {str(e)}. Failing the workflow.",
+                exc_info=True,
+            )
+            raise
+
+        if success:
+            successful_databases.append(database_name)
+            if not write_to_file and batched_iterator:
+                dataframe_list.append(batched_iterator)
+
+    # Log results
+    logger.info(
+        f"Successfully processed {len(successful_databases)} databases: {successful_databases}"
+    )
+
+    # Finalize results
+    return await finalize_multidb_results(
+        write_to_file,
+        concatenate,
+        return_dataframe,
+        parquet_output,
+        dataframe_list,
+        setup_parquet_output_func,
+        output_path,
+        typename,
+    )
application_sdk/activities/common/utils.py
@@ -5,11 +5,10 @@ including workflow ID retrieval, automatic heartbeating, and periodic heartbeat
 """
 
 import asyncio
-import glob
 import os
 from datetime import timedelta
 from functools import wraps
-from typing import Any, Awaitable, Callable,
+from typing import Any, Awaitable, Callable, Optional, TypeVar, cast
 
 from temporalio import activity
 
@@ -230,46 +229,3 @@ async def send_periodic_heartbeat(delay: float, *details: Any) -> None:
     while True:
         await asyncio.sleep(delay)
         activity.heartbeat(*details)
-
-
-def find_local_files_by_extension(
-    path: str,
-    extension: str,
-    file_names: Optional[List[str]] = None,
-) -> List[str]:
-    """Find local files at the specified local path, optionally filtering by file names.
-
-    Args:
-        path (str): Local path to search in (file or directory)
-        extension (str): File extension to filter by (e.g., '.parquet', '.json')
-        file_names (Optional[List[str]]): List of file names (basenames) to filter by, paths are not supported
-
-    Returns:
-        List[str]: List of matching file paths
-
-    Example:
-        >>> find_local_files_by_extension("/data", ".parquet", ["file1.parquet", "file2.parquet"])
-        ['file1.parquet', 'file2.parquet']
-
-        >>> find_local_files_by_extension("/data/single.json", ".json")
-        ['single.json']
-    """
-    if os.path.isfile(path) and path.endswith(extension):
-        # Single file - return it directly
-        return [path]
-
-    elif os.path.isdir(path):
-        # Directory - find all files in directory
-        all_files = glob.glob(
-            os.path.join(path, "**", f"*{extension}"),
-            recursive=True,
-        )
-
-        # Filter by file names if specified
-        if file_names:
-            file_names_set = set(file_names)  # Convert to set for O(1) lookup
-            return [f for f in all_files if os.path.basename(f) in file_names_set]
-        else:
-            return all_files
-
-    return []