atlan-application-sdk 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. application_sdk/activities/common/sql_utils.py +308 -0
  2. application_sdk/activities/common/utils.py +1 -45
  3. application_sdk/activities/metadata_extraction/sql.py +110 -353
  4. application_sdk/activities/query_extraction/sql.py +12 -11
  5. application_sdk/application/__init__.py +1 -1
  6. application_sdk/clients/sql.py +167 -1
  7. application_sdk/clients/temporal.py +6 -6
  8. application_sdk/common/types.py +8 -0
  9. application_sdk/common/utils.py +1 -8
  10. application_sdk/constants.py +1 -1
  11. application_sdk/handlers/sql.py +10 -25
  12. application_sdk/interceptors/events.py +1 -1
  13. application_sdk/io/__init__.py +654 -0
  14. application_sdk/io/json.py +429 -0
  15. application_sdk/{outputs → io}/parquet.py +358 -47
  16. application_sdk/io/utils.py +307 -0
  17. application_sdk/observability/observability.py +23 -12
  18. application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
  19. application_sdk/server/fastapi/middleware/metrics.py +27 -24
  20. application_sdk/server/fastapi/models.py +1 -1
  21. application_sdk/server/fastapi/routers/server.py +1 -1
  22. application_sdk/server/fastapi/utils.py +10 -0
  23. application_sdk/services/eventstore.py +4 -4
  24. application_sdk/services/objectstore.py +30 -7
  25. application_sdk/services/secretstore.py +1 -1
  26. application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
  27. application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
  28. application_sdk/version.py +1 -1
  29. application_sdk/worker.py +1 -1
  30. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/METADATA +9 -11
  31. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/RECORD +36 -43
  32. application_sdk/common/dataframe_utils.py +0 -42
  33. application_sdk/events/__init__.py +0 -5
  34. application_sdk/inputs/.cursor/BUGBOT.md +0 -250
  35. application_sdk/inputs/__init__.py +0 -168
  36. application_sdk/inputs/iceberg.py +0 -75
  37. application_sdk/inputs/json.py +0 -136
  38. application_sdk/inputs/parquet.py +0 -272
  39. application_sdk/inputs/sql_query.py +0 -271
  40. application_sdk/outputs/.cursor/BUGBOT.md +0 -295
  41. application_sdk/outputs/__init__.py +0 -445
  42. application_sdk/outputs/iceberg.py +0 -139
  43. application_sdk/outputs/json.py +0 -268
  44. /application_sdk/{events → interceptors}/models.py +0 -0
  45. /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
  46. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/WHEEL +0 -0
  47. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/LICENSE +0 -0
  48. {atlan_application_sdk-1.1.0.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/NOTICE +0 -0
application_sdk/activities/common/sql_utils.py (new file)
@@ -0,0 +1,308 @@
+ from typing import (
+     TYPE_CHECKING,
+     Any,
+     AsyncIterator,
+     Callable,
+     Dict,
+     Iterator,
+     List,
+     Optional,
+     Tuple,
+     Union,
+ )
+
+ from application_sdk.activities.common.models import ActivityStatistics
+ from application_sdk.clients.sql import BaseSQLClient
+ from application_sdk.common.utils import (
+     get_database_names,
+     parse_credentials_extra,
+     prepare_query,
+ )
+ from application_sdk.io.parquet import ParquetFileWriter
+ from application_sdk.observability.logger_adaptor import get_logger
+
+ if TYPE_CHECKING:
+     import pandas as pd
+
+ logger = get_logger(__name__)
+
+
+ async def setup_database_connection(
+     sql_client: BaseSQLClient,
+     database_name: str,
+ ) -> None:
+     """Setup connection for a specific database.
+
+     Args:
+         sql_client: The SQL client to configure.
+         database_name: The name of the database to connect to.
+     """
+     extra = parse_credentials_extra(sql_client.credentials)
+     extra["database"] = database_name
+     sql_client.credentials["extra"] = extra
+     await sql_client.load(sql_client.credentials)
+
+
+ def prepare_database_query(
+     sql_query: str,
+     database_name: Optional[str],
+     workflow_args: Dict[str, Any],
+     temp_table_regex_sql: str = "",
+     use_posix_regex: bool = False,
+ ) -> str:
+     """Prepare query for database execution with proper substitutions.
+
+     Args:
+         sql_query: The raw SQL query string.
+         database_name: The database name to substitute into the query.
+         workflow_args: Workflow arguments for query preparation.
+         temp_table_regex_sql: SQL regex for temp table exclusion.
+         use_posix_regex: Whether to use POSIX regex syntax.
+
+     Returns:
+         The prepared SQL query string.
+
+     Raises:
+         ValueError: If query preparation fails.
+     """
+     # Replace database name placeholder if provided
+     fetch_sql = sql_query
+     if database_name:
+         fetch_sql = fetch_sql.replace("{database_name}", database_name)
+
+     # Prepare the query
+     prepared_query = prepare_query(
+         query=fetch_sql,
+         workflow_args=workflow_args,
+         temp_table_regex_sql=temp_table_regex_sql,
+         use_posix_regex=use_posix_regex,
+     )
+
+     if prepared_query is None:
+         db_context = f" for database {database_name}" if database_name else ""
+         raise ValueError(f"Failed to prepare query{db_context}")
+
+     return prepared_query
+
+
+ async def execute_single_db(
+     sql_client: BaseSQLClient,
+     prepared_query: Optional[str],
+     parquet_output: Optional[ParquetFileWriter],
+     write_to_file: bool,
+ ) -> Tuple[
+     bool, Optional[Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]]
+ ]:
+     """Execute a query against a single database.
+
+     Args:
+         sql_client: The SQL client to use.
+         prepared_query: The prepared SQL query to execute.
+         parquet_output: Optional parquet writer for output.
+         write_to_file: Whether to write results to file.
+
+     Returns:
+         Tuple of (success boolean, optional result iterator).
+     """
+     if not prepared_query:
+         logger.error("Prepared query is None, cannot execute")
+         return False, None
+
+     try:
+         batched_iterator = await sql_client.get_batched_results(prepared_query)
+
+         if write_to_file and parquet_output:
+             await parquet_output.write_batches(batched_iterator)  # type: ignore
+             return True, None
+
+         return True, batched_iterator
+     except Exception as e:
+         logger.error(
+             f"Error during query execution or output writing: {e}", exc_info=True
+         )
+         raise
+
+
+ async def finalize_multidb_results(
+     write_to_file: bool,
+     concatenate: bool,
+     return_dataframe: bool,
+     parquet_output: Optional[ParquetFileWriter],
+     dataframe_list: List[
+         Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
+     ],
+     setup_parquet_output_func: Callable[[str, bool], Optional[ParquetFileWriter]],
+     output_path: str,
+     typename: str,
+ ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
+     """Finalize results for multi-database execution.
+
+     Args:
+         write_to_file: Whether results were written to file.
+         concatenate: Whether to concatenate in-memory results.
+         return_dataframe: Whether to return the concatenated dataframe.
+         parquet_output: The parquet writer used (if any).
+         dataframe_list: List of dataframe iterators from each DB.
+         setup_parquet_output_func: Callback to create new parquet output.
+         output_path: Full path for output files.
+         typename: Type name for statistics.
+
+     Returns:
+         Statistics or DataFrame, or None.
+     """
+     if write_to_file and parquet_output:
+         return await parquet_output.close()
+
+     if not write_to_file and concatenate:
+         try:
+             import pandas as pd
+
+             valid_dataframes: List[pd.DataFrame] = []
+             for df_generator in dataframe_list:
+                 if df_generator is None:
+                     continue
+                 # Handle both async and sync iterators
+                 if hasattr(df_generator, "__aiter__"):
+                     async for dataframe in df_generator:  # type: ignore
+                         if dataframe is None or (
+                             hasattr(dataframe, "empty") and dataframe.empty
+                         ):
+                             continue
+                         valid_dataframes.append(dataframe)
+                 else:
+                     for dataframe in df_generator:  # type: ignore
+                         if dataframe is None or (
+                             hasattr(dataframe, "empty") and dataframe.empty
+                         ):
+                             continue
+                         valid_dataframes.append(dataframe)
+
+             if not valid_dataframes:
+                 logger.warning(
+                     "No valid dataframes collected across databases for concatenation"
+                 )
+                 return None
+
+             concatenated = pd.concat(valid_dataframes, ignore_index=True)
+
+             if return_dataframe:
+                 return concatenated
+
+             # Create new parquet output for concatenated data
+             concatenated_parquet_output = setup_parquet_output_func(output_path, True)
+             if concatenated_parquet_output:
+                 await concatenated_parquet_output.write(concatenated)  # type: ignore
+                 return await concatenated_parquet_output.close()
+         except Exception as e:
+             logger.error(
+                 f"Error concatenating multi-DB dataframes: {str(e)}",
+                 exc_info=True,
+             )
+             raise
+
+     logger.warning(
+         "multidb execution returned no output to write (write_to_file=False, concatenate=False)"
+     )
+     return None
+
+
+ async def execute_multidb_flow(
+     sql_client: BaseSQLClient,
+     sql_query: str,
+     workflow_args: Dict[str, Any],
+     fetch_database_sql: Optional[str],
+     output_path: str,
+     typename: str,
+     write_to_file: bool,
+     concatenate: bool,
+     return_dataframe: bool,
+     parquet_output: Optional[ParquetFileWriter],
+     temp_table_regex_sql: str,
+     setup_parquet_output_func: Callable[[str, bool], Optional[ParquetFileWriter]],
+ ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
+     """Execute multi-database flow with proper error handling and result finalization.
+
+     Args:
+         sql_client: The SQL client to use.
+         sql_query: The SQL query to execute on each database.
+         workflow_args: Workflow arguments.
+         fetch_database_sql: SQL to fetch list of databases.
+         output_path: Full path for output files.
+         typename: Type name for statistics.
+         write_to_file: Whether to write results to file.
+         concatenate: Whether to concatenate in-memory results.
+         return_dataframe: Whether to return the concatenated dataframe.
+         parquet_output: The parquet writer used (if any).
+         temp_table_regex_sql: SQL regex for temp table exclusion.
+         setup_parquet_output_func: Callback to create new parquet output.
+
+     Returns:
+         Statistics or DataFrame, or None.
+     """
+     # Resolve databases to iterate
+     database_names = await get_database_names(
+         sql_client, workflow_args, fetch_database_sql
+     )
+     if not database_names:
+         logger.warning("No databases found to process")
+         return None
+
+     successful_databases: List[str] = []
+     dataframe_list: List[
+         Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
+     ] = []
+
+     # Iterate databases and execute
+     for database_name in database_names or []:
+         try:
+             # Setup connection for this database
+             await setup_database_connection(sql_client, database_name)
+
+             # Prepare query for this database
+             prepared_query = prepare_database_query(
+                 sql_query,
+                 database_name,
+                 workflow_args,
+                 temp_table_regex_sql,
+                 use_posix_regex=True,
+             )
+
+             # Execute using helper method
+             success, batched_iterator = await execute_single_db(
+                 sql_client,
+                 prepared_query,
+                 parquet_output,
+                 write_to_file,
+             )
+
+             if success:
+                 logger.info(f"Successfully processed database: {database_name}")
+
+         except Exception as e:
+             logger.error(
+                 f"Failed to process database '{database_name}': {str(e)}. Failing the workflow.",
+                 exc_info=True,
+             )
+             raise
+
+         if success:
+             successful_databases.append(database_name)
+             if not write_to_file and batched_iterator:
+                 dataframe_list.append(batched_iterator)
+
+     # Log results
+     logger.info(
+         f"Successfully processed {len(successful_databases)} databases: {successful_databases}"
+     )
+
+     # Finalize results
+     return await finalize_multidb_results(
+         write_to_file,
+         concatenate,
+         return_dataframe,
+         parquet_output,
+         dataframe_list,
+         setup_parquet_output_func,
+         output_path,
+         typename,
+     )
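For orientation, a hypothetical sketch of how an activity might call execute_multidb_flow. The function names, BaseSQLClient, and ParquetFileWriter come from the hunk above; the queries, paths, the extract_tables wrapper, and the ParquetFileWriter constructor arguments are illustrative assumptions, not the SDK's documented API.

# Hypothetical usage sketch; only the imported names are taken from the diff above.
from typing import Any, Dict, Optional

from application_sdk.activities.common.sql_utils import execute_multidb_flow
from application_sdk.clients.sql import BaseSQLClient
from application_sdk.io.parquet import ParquetFileWriter


def setup_parquet_output(path: str, create: bool) -> Optional[ParquetFileWriter]:
    # Matches the Callable[[str, bool], Optional[ParquetFileWriter]] signature that
    # execute_multidb_flow expects; the constructor argument is an assumption.
    return ParquetFileWriter(path=path) if create else None


async def extract_tables(sql_client: BaseSQLClient, workflow_args: Dict[str, Any]) -> None:
    # {database_name} is substituted per database by prepare_database_query.
    await execute_multidb_flow(
        sql_client=sql_client,
        sql_query="SELECT * FROM {database_name}.information_schema.tables",
        workflow_args=workflow_args,
        fetch_database_sql="SELECT database_name FROM information_schema.databases",
        output_path="/tmp/raw/table",
        typename="table",
        write_to_file=True,
        concatenate=False,
        return_dataframe=False,
        parquet_output=setup_parquet_output("/tmp/raw/table", True),
        temp_table_regex_sql="",
        setup_parquet_output_func=setup_parquet_output,
    )

Note that per-database failures re-raise after logging "Failing the workflow.", so a single bad database aborts the whole flow rather than being skipped.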
application_sdk/activities/common/utils.py
@@ -5,11 +5,10 @@ including workflow ID retrieval, automatic heartbeating, and periodic heartbeat
  """
 
  import asyncio
- import glob
  import os
  from datetime import timedelta
  from functools import wraps
- from typing import Any, Awaitable, Callable, List, Optional, TypeVar, cast
+ from typing import Any, Awaitable, Callable, Optional, TypeVar, cast
 
  from temporalio import activity
 
@@ -230,46 +229,3 @@ async def send_periodic_heartbeat(delay: float, *details: Any) -> None:
      while True:
          await asyncio.sleep(delay)
          activity.heartbeat(*details)
-
-
- def find_local_files_by_extension(
-     path: str,
-     extension: str,
-     file_names: Optional[List[str]] = None,
- ) -> List[str]:
-     """Find local files at the specified local path, optionally filtering by file names.
-
-     Args:
-         path (str): Local path to search in (file or directory)
-         extension (str): File extension to filter by (e.g., '.parquet', '.json')
-         file_names (Optional[List[str]]): List of file names (basenames) to filter by, paths are not supported
-
-     Returns:
-         List[str]: List of matching file paths
-
-     Example:
-         >>> find_local_files_by_extension("/data", ".parquet", ["file1.parquet", "file2.parquet"])
-         ['file1.parquet', 'file2.parquet']
-
-         >>> find_local_files_by_extension("/data/single.json", ".json")
-         ['single.json']
-     """
-     if os.path.isfile(path) and path.endswith(extension):
-         # Single file - return it directly
-         return [path]
-
-     elif os.path.isdir(path):
-         # Directory - find all files in directory
-         all_files = glob.glob(
-             os.path.join(path, "**", f"*{extension}"),
-             recursive=True,
-         )
-
-         # Filter by file names if specified
-         if file_names:
-             file_names_set = set(file_names)  # Convert to set for O(1) lookup
-             return [f for f in all_files if os.path.basename(f) in file_names_set]
-         else:
-             return all_files
-
-     return []
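find_local_files_by_extension is removed from the 2.0.0 activity utilities along with the glob import. Callers that still depended on it can vendor a local copy; a minimal standalone sketch, kept behaviourally equivalent to the removed 1.x helper and not part of the 2.0.0 SDK:

# Standalone replacement for the removed helper; mirrors the 1.x behaviour shown above.
import glob
import os
from typing import List, Optional


def find_local_files_by_extension(
    path: str, extension: str, file_names: Optional[List[str]] = None
) -> List[str]:
    """Return files under path matching extension, optionally filtered by basename."""
    if os.path.isfile(path) and path.endswith(extension):
        return [path]
    if os.path.isdir(path):
        matches = glob.glob(os.path.join(path, "**", f"*{extension}"), recursive=True)
        if file_names:
            wanted = set(file_names)  # set for O(1) basename lookups
            return [f for f in matches if os.path.basename(f) in wanted]
        return matches
    return []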