atlan-application-sdk 1.1.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. application_sdk/activities/common/sql_utils.py +312 -0
  2. application_sdk/activities/common/utils.py +1 -45
  3. application_sdk/activities/metadata_extraction/sql.py +110 -353
  4. application_sdk/activities/query_extraction/sql.py +12 -11
  5. application_sdk/application/__init__.py +1 -1
  6. application_sdk/clients/sql.py +167 -1
  7. application_sdk/clients/temporal.py +6 -6
  8. application_sdk/common/types.py +8 -0
  9. application_sdk/common/utils.py +1 -8
  10. application_sdk/constants.py +1 -1
  11. application_sdk/handlers/sql.py +10 -25
  12. application_sdk/interceptors/events.py +1 -1
  13. application_sdk/io/__init__.py +749 -0
  14. application_sdk/io/json.py +473 -0
  15. application_sdk/{outputs → io}/parquet.py +414 -47
  16. application_sdk/io/utils.py +307 -0
  17. application_sdk/observability/observability.py +16 -12
  18. application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
  19. application_sdk/server/fastapi/middleware/metrics.py +27 -24
  20. application_sdk/server/fastapi/models.py +1 -1
  21. application_sdk/server/fastapi/routers/server.py +1 -1
  22. application_sdk/server/fastapi/utils.py +10 -0
  23. application_sdk/services/eventstore.py +4 -4
  24. application_sdk/services/objectstore.py +14 -1
  25. application_sdk/services/secretstore.py +1 -1
  26. application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
  27. application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
  28. application_sdk/version.py +1 -1
  29. application_sdk/worker.py +1 -1
  30. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/METADATA +9 -11
  31. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/RECORD +36 -43
  32. application_sdk/common/dataframe_utils.py +0 -42
  33. application_sdk/events/__init__.py +0 -5
  34. application_sdk/inputs/.cursor/BUGBOT.md +0 -250
  35. application_sdk/inputs/__init__.py +0 -168
  36. application_sdk/inputs/iceberg.py +0 -75
  37. application_sdk/inputs/json.py +0 -136
  38. application_sdk/inputs/parquet.py +0 -272
  39. application_sdk/inputs/sql_query.py +0 -271
  40. application_sdk/outputs/.cursor/BUGBOT.md +0 -295
  41. application_sdk/outputs/__init__.py +0 -453
  42. application_sdk/outputs/iceberg.py +0 -139
  43. application_sdk/outputs/json.py +0 -268
  44. /application_sdk/{events → interceptors}/models.py +0 -0
  45. /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
  46. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/WHEEL +0 -0
  47. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/LICENSE +0 -0
  48. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/NOTICE +0 -0
@@ -0,0 +1,312 @@
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     AsyncIterator,
+     Callable,
+     Dict,
+     Iterator,
+     List,
+     Optional,
+     Tuple,
+     Union,
+ )
+
+ from application_sdk.activities.common.models import ActivityStatistics
+ from application_sdk.clients.sql import BaseSQLClient
+ from application_sdk.common.utils import (
+     get_database_names,
+     parse_credentials_extra,
+     prepare_query,
+ )
+ from application_sdk.io.parquet import ParquetFileWriter
+ from application_sdk.observability.logger_adaptor import get_logger
+
+ if TYPE_CHECKING:
+     import pandas as pd
+
+ logger = get_logger(__name__)
+
+
+ async def setup_database_connection(
+     sql_client: BaseSQLClient,
+     database_name: str,
+ ) -> None:
+     """Setup connection for a specific database.
+
+     Args:
+         sql_client: The SQL client to configure.
+         database_name: The name of the database to connect to.
+     """
+     extra = parse_credentials_extra(sql_client.credentials)
+     extra["database"] = database_name
+     sql_client.credentials["extra"] = extra
+     await sql_client.load(sql_client.credentials)
+
+
+ def prepare_database_query(
+     sql_query: str,
+     database_name: Optional[str],
+     workflow_args: Dict[str, Any],
+     temp_table_regex_sql: str = "",
+     use_posix_regex: bool = False,
+ ) -> str:
+     """Prepare query for database execution with proper substitutions.
+
+     Args:
+         sql_query: The raw SQL query string.
+         database_name: The database name to substitute into the query.
+         workflow_args: Workflow arguments for query preparation.
+         temp_table_regex_sql: SQL regex for temp table exclusion.
+         use_posix_regex: Whether to use POSIX regex syntax.
+
+     Returns:
+         The prepared SQL query string.
+
+     Raises:
+         ValueError: If query preparation fails.
+     """
+     # Replace database name placeholder if provided
+     fetch_sql = sql_query
+     if database_name:
+         fetch_sql = fetch_sql.replace("{database_name}", database_name)
+
+     # Prepare the query
+     prepared_query = prepare_query(
+         query=fetch_sql,
+         workflow_args=workflow_args,
+         temp_table_regex_sql=temp_table_regex_sql,
+         use_posix_regex=use_posix_regex,
+     )
+
+     if prepared_query is None:
+         db_context = f" for database {database_name}" if database_name else ""
+         raise ValueError(f"Failed to prepare query{db_context}")
+
+     return prepared_query
+
+
+ async def execute_single_db(
+     sql_client: BaseSQLClient,
+     prepared_query: Optional[str],
+     parquet_output: Optional[ParquetFileWriter],
+     write_to_file: bool,
+ ) -> Tuple[
+     bool, Optional[Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]]
+ ]:
+     """Execute a query against a single database.
+
+     Args:
+         sql_client: The SQL client to use.
+         prepared_query: The prepared SQL query to execute.
+         parquet_output: Optional parquet writer for output.
+         write_to_file: Whether to write results to file.
+
+     Returns:
+         Tuple of (success boolean, optional result iterator).
+     """
+     if not prepared_query:
+         logger.error("Prepared query is None, cannot execute")
+         return False, None
+
+     try:
+         batched_iterator = await sql_client.get_batched_results(prepared_query)
+
+         if write_to_file and parquet_output:
+             await parquet_output.write_batches(batched_iterator)  # type: ignore
+             return True, None
+
+         return True, batched_iterator
+     except Exception as e:
+         logger.error(
+             f"Error during query execution or output writing: {e}", exc_info=True
+         )
+         raise
+
+
+ async def finalize_multidb_results(
+     write_to_file: bool,
+     concatenate: bool,
+     return_dataframe: bool,
+     parquet_output: Optional[ParquetFileWriter],
+     dataframe_list: List[
+         Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
+     ],
+     setup_parquet_output_func: Callable[
+         [str, bool, Optional[str]], Optional[ParquetFileWriter]
+     ],
+     output_path: str,
+     typename: str,
+ ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
+     """Finalize results for multi-database execution.
+
+     Args:
+         write_to_file: Whether results were written to file.
+         concatenate: Whether to concatenate in-memory results.
+         return_dataframe: Whether to return the concatenated dataframe.
+         parquet_output: The parquet writer used (if any).
+         dataframe_list: List of dataframe iterators from each DB.
+         setup_parquet_output_func: Callback to create new parquet output.
+         output_path: Full path for output files.
+         typename: Type name for statistics.
+
+     Returns:
+         Statistics or DataFrame, or None.
+     """
+     if write_to_file and parquet_output:
+         return await parquet_output.close()
+
+     if not write_to_file and concatenate:
+         try:
+             import pandas as pd
+
+             valid_dataframes: List[pd.DataFrame] = []
+             for df_generator in dataframe_list:
+                 if df_generator is None:
+                     continue
+                 # Handle both async and sync iterators
+                 if hasattr(df_generator, "__aiter__"):
+                     async for dataframe in df_generator:  # type: ignore
+                         if dataframe is None or (
+                             hasattr(dataframe, "empty") and dataframe.empty
+                         ):
+                             continue
+                         valid_dataframes.append(dataframe)
+                 else:
+                     for dataframe in df_generator:  # type: ignore
+                         if dataframe is None or (
+                             hasattr(dataframe, "empty") and dataframe.empty
+                         ):
+                             continue
+                         valid_dataframes.append(dataframe)
+
+             if not valid_dataframes:
+                 logger.warning(
+                     "No valid dataframes collected across databases for concatenation"
+                 )
+                 return None
+
+             concatenated = pd.concat(valid_dataframes, ignore_index=True)
+
+             if return_dataframe:
+                 return concatenated
+
+             # Create new parquet output for concatenated data
+             concatenated_parquet_output = setup_parquet_output_func(
+                 output_path, True, typename
+             )
+             if concatenated_parquet_output:
+                 await concatenated_parquet_output.write(concatenated)  # type: ignore
+                 return await concatenated_parquet_output.close()
+         except Exception as e:
+             logger.error(
+                 f"Error concatenating multi-DB dataframes: {str(e)}",
+                 exc_info=True,
+             )
+             raise
+
+     logger.warning(
+         "multidb execution returned no output to write (write_to_file=False, concatenate=False)"
+     )
+     return None
+
+
+ async def execute_multidb_flow(
+     sql_client: BaseSQLClient,
+     sql_query: str,
+     workflow_args: Dict[str, Any],
+     fetch_database_sql: Optional[str],
+     output_path: str,
+     typename: str,
+     write_to_file: bool,
+     concatenate: bool,
+     return_dataframe: bool,
+     parquet_output: Optional[ParquetFileWriter],
+     temp_table_regex_sql: str,
+     setup_parquet_output_func: Callable[[str, bool], Optional[ParquetFileWriter]],
+ ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
+     """Execute multi-database flow with proper error handling and result finalization.
+
+     Args:
+         sql_client: The SQL client to use.
+         sql_query: The SQL query to execute on each database.
+         workflow_args: Workflow arguments.
+         fetch_database_sql: SQL to fetch list of databases.
+         output_path: Full path for output files.
+         typename: Type name for statistics.
+         write_to_file: Whether to write results to file.
+         concatenate: Whether to concatenate in-memory results.
+         return_dataframe: Whether to return the concatenated dataframe.
+         parquet_output: The parquet writer used (if any).
+         temp_table_regex_sql: SQL regex for temp table exclusion.
+         setup_parquet_output_func: Callback to create new parquet output.
+
+     Returns:
+         Statistics or DataFrame, or None.
+     """
+     # Resolve databases to iterate
+     database_names = await get_database_names(
+         sql_client, workflow_args, fetch_database_sql
+     )
+     if not database_names:
+         logger.warning("No databases found to process")
+         return None
+
+     successful_databases: List[str] = []
+     dataframe_list: List[
+         Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
+     ] = []
+
+     # Iterate databases and execute
+     for database_name in database_names or []:
+         try:
+             # Setup connection for this database
+             await setup_database_connection(sql_client, database_name)
+
+             # Prepare query for this database
+             prepared_query = prepare_database_query(
+                 sql_query,
+                 database_name,
+                 workflow_args,
+                 temp_table_regex_sql,
+                 use_posix_regex=True,
+             )
+
+             # Execute using helper method
+             success, batched_iterator = await execute_single_db(
+                 sql_client,
+                 prepared_query,
+                 parquet_output,
+                 write_to_file,
+             )
+
+             if success:
+                 logger.info(f"Successfully processed database: {database_name}")
+
+         except Exception as e:
+             logger.error(
+                 f"Failed to process database '{database_name}': {str(e)}. Failing the workflow.",
+                 exc_info=True,
+             )
+             raise
+
+         if success:
+             successful_databases.append(database_name)
+             if not write_to_file and batched_iterator:
+                 dataframe_list.append(batched_iterator)
+
+     # Log results
+     logger.info(
+         f"Successfully processed {len(successful_databases)} databases: {successful_databases}"
+     )
+
+     # Finalize results
+     return await finalize_multidb_results(
+         write_to_file,
+         concatenate,
+         return_dataframe,
+         parquet_output,
+         dataframe_list,
+         setup_parquet_output_func,
+         output_path,
+         typename,
+     )
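The helpers above compose into execute_multidb_flow, which iterates the resolved databases, re-prepares the query per database, and either streams batches to a ParquetFileWriter or concatenates them in memory. Below is a minimal caller sketch, not part of the SDK: it assumes an already-loaded BaseSQLClient, and the query strings, path, and function name are purely illustrative.

    # Hypothetical usage sketch — argument values are placeholders, not SDK defaults.
    from application_sdk.activities.common.sql_utils import execute_multidb_flow

    async def extract_tables(sql_client, workflow_args):
        # With write_to_file=False and return_dataframe=True, batches from every
        # database are concatenated and returned as a single pandas DataFrame.
        return await execute_multidb_flow(
            sql_client=sql_client,
            sql_query="SELECT * FROM {database_name}.information_schema.tables",
            workflow_args=workflow_args,
            fetch_database_sql="SHOW DATABASES",  # assumed dialect-specific
            output_path="/tmp/artifacts/tables",  # illustrative path
            typename="table",
            write_to_file=False,
            concatenate=True,
            return_dataframe=True,
            parquet_output=None,
            temp_table_regex_sql="",
            setup_parquet_output_func=lambda *_: None,  # never called in this configuration
        )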
@@ -5,11 +5,10 @@ including workflow ID retrieval, automatic heartbeating, and periodic heartbeat
  """

  import asyncio
- import glob
  import os
  from datetime import timedelta
  from functools import wraps
- from typing import Any, Awaitable, Callable, List, Optional, TypeVar, cast
+ from typing import Any, Awaitable, Callable, Optional, TypeVar, cast

  from temporalio import activity

@@ -230,46 +229,3 @@ async def send_periodic_heartbeat(delay: float, *details: Any) -> None:
      while True:
          await asyncio.sleep(delay)
          activity.heartbeat(*details)
-
-
- def find_local_files_by_extension(
-     path: str,
-     extension: str,
-     file_names: Optional[List[str]] = None,
- ) -> List[str]:
-     """Find local files at the specified local path, optionally filtering by file names.
-
-     Args:
-         path (str): Local path to search in (file or directory)
-         extension (str): File extension to filter by (e.g., '.parquet', '.json')
-         file_names (Optional[List[str]]): List of file names (basenames) to filter by, paths are not supported
-
-     Returns:
-         List[str]: List of matching file paths
-
-     Example:
-         >>> find_local_files_by_extension("/data", ".parquet", ["file1.parquet", "file2.parquet"])
-         ['file1.parquet', 'file2.parquet']
-
-         >>> find_local_files_by_extension("/data/single.json", ".json")
-         ['single.json']
-     """
-     if os.path.isfile(path) and path.endswith(extension):
-         # Single file - return it directly
-         return [path]
-
-     elif os.path.isdir(path):
-         # Directory - find all files in directory
-         all_files = glob.glob(
-             os.path.join(path, "**", f"*{extension}"),
-             recursive=True,
-         )
-
-         # Filter by file names if specified
-         if file_names:
-             file_names_set = set(file_names)  # Convert to set for O(1) lookup
-             return [f for f in all_files if os.path.basename(f) in file_names_set]
-         else:
-             return all_files
-
-     return []
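Note that find_local_files_by_extension is removed in 2.1.0 with no direct replacement appearing in this diff. Callers that still need the behaviour can inline it; the following stdlib-only sketch mirrors the deleted logic, and its function name is hypothetical rather than part of the 2.1.0 API.

    # Hypothetical inline replacement for the removed helper; not an SDK function.
    import glob
    import os
    from typing import List, Optional

    def find_files_by_extension(
        path: str, extension: str, file_names: Optional[List[str]] = None
    ) -> List[str]:
        # Single file: return it directly if the extension matches.
        if os.path.isfile(path) and path.endswith(extension):
            return [path]
        # Directory: search recursively, then optionally filter by basename.
        if os.path.isdir(path):
            matches = glob.glob(os.path.join(path, "**", f"*{extension}"), recursive=True)
            if file_names:
                wanted = set(file_names)  # set for O(1) membership checks
                return [f for f in matches if os.path.basename(f) in wanted]
            return matches
        return []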