atlan-application-sdk 1.1.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- application_sdk/activities/common/sql_utils.py +312 -0
- application_sdk/activities/common/utils.py +1 -45
- application_sdk/activities/metadata_extraction/sql.py +110 -353
- application_sdk/activities/query_extraction/sql.py +12 -11
- application_sdk/application/__init__.py +1 -1
- application_sdk/clients/sql.py +167 -1
- application_sdk/clients/temporal.py +6 -6
- application_sdk/common/types.py +8 -0
- application_sdk/common/utils.py +1 -8
- application_sdk/constants.py +1 -1
- application_sdk/handlers/sql.py +10 -25
- application_sdk/interceptors/events.py +1 -1
- application_sdk/io/__init__.py +749 -0
- application_sdk/io/json.py +473 -0
- application_sdk/{outputs → io}/parquet.py +414 -47
- application_sdk/io/utils.py +307 -0
- application_sdk/observability/observability.py +16 -12
- application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
- application_sdk/server/fastapi/middleware/metrics.py +27 -24
- application_sdk/server/fastapi/models.py +1 -1
- application_sdk/server/fastapi/routers/server.py +1 -1
- application_sdk/server/fastapi/utils.py +10 -0
- application_sdk/services/eventstore.py +4 -4
- application_sdk/services/objectstore.py +14 -1
- application_sdk/services/secretstore.py +1 -1
- application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
- application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
- application_sdk/version.py +1 -1
- application_sdk/worker.py +1 -1
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/METADATA +9 -11
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/RECORD +36 -43
- application_sdk/common/dataframe_utils.py +0 -42
- application_sdk/events/__init__.py +0 -5
- application_sdk/inputs/.cursor/BUGBOT.md +0 -250
- application_sdk/inputs/__init__.py +0 -168
- application_sdk/inputs/iceberg.py +0 -75
- application_sdk/inputs/json.py +0 -136
- application_sdk/inputs/parquet.py +0 -272
- application_sdk/inputs/sql_query.py +0 -271
- application_sdk/outputs/.cursor/BUGBOT.md +0 -295
- application_sdk/outputs/__init__.py +0 -453
- application_sdk/outputs/iceberg.py +0 -139
- application_sdk/outputs/json.py +0 -268
- /application_sdk/{events → interceptors}/models.py +0 -0
- /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/NOTICE +0 -0
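The most visible structural change in this release is that the former `application_sdk.inputs` and `application_sdk.outputs` packages are gone, replaced by a consolidated `application_sdk.io` package (with `outputs/parquet.py` carried over as `io/parquet.py`). A hedged sketch of how downstream imports might move; the 2.1.0 names are taken from the diff below, and the commented 1.1.1 names are the imports shown as removed:

```python
# Migration sketch (illustrative only) for code that imported the 1.1.1 modules.
# The 2.1.0 names come from the diff on this page; anything else is an assumption.

# 1.1.1 (modules removed in 2.1.0):
# from application_sdk.common.dataframe_utils import is_empty_dataframe
# from application_sdk.outputs.json import JsonOutput
# from application_sdk.outputs.parquet import ParquetOutput

# 2.1.0 (consolidated io package):
from application_sdk.io import DataframeType
from application_sdk.io.json import JsonFileWriter
from application_sdk.io.parquet import ParquetFileReader, ParquetFileWriter
from application_sdk.io.utils import is_empty_dataframe
```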
application_sdk/activities/metadata_extraction/sql.py

```diff
@@ -2,10 +2,7 @@ import os
 from typing import (
     TYPE_CHECKING,
     Any,
-    AsyncIterator,
     Dict,
-    Iterator,
-    List,
     Optional,
     Tuple,
     Type,
@@ -17,6 +14,7 @@ from typing import (
 from temporalio import activity
 
 from application_sdk.activities import ActivitiesInterface, ActivitiesState
+from application_sdk.activities.common import sql_utils
 from application_sdk.activities.common.models import ActivityStatistics
 from application_sdk.activities.common.utils import (
     auto_heartbeater,
@@ -24,21 +22,15 @@ from application_sdk.activities.common.utils import (
     get_workflow_id,
 )
 from application_sdk.clients.sql import BaseSQLClient
-from application_sdk.common.dataframe_utils import is_empty_dataframe
 from application_sdk.common.error_codes import ActivityError
-from application_sdk.common.utils import (
-    get_database_names,
-    parse_credentials_extra,
-    prepare_query,
-    read_sql_files,
-)
+from application_sdk.common.utils import prepare_query, read_sql_files
 from application_sdk.constants import APP_TENANT_ID, APPLICATION_NAME, SQL_QUERIES_PATH
 from application_sdk.handlers.sql import BaseSQLHandler
-from application_sdk.
-from application_sdk.
+from application_sdk.io import DataframeType
+from application_sdk.io.json import JsonFileWriter
+from application_sdk.io.parquet import ParquetFileReader, ParquetFileWriter
+from application_sdk.io.utils import is_empty_dataframe
 from application_sdk.observability.logger_adaptor import get_logger
-from application_sdk.outputs.json import JsonOutput
-from application_sdk.outputs.parquet import ParquetOutput
 from application_sdk.services.atlan_storage import AtlanStorage
 from application_sdk.services.secretstore import SecretStore
 from application_sdk.transformers import TransformerInterface
```
```diff
@@ -237,105 +229,108 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
     @overload
     async def query_executor(
         self,
-
+        sql_client: BaseSQLClient,
         sql_query: Optional[str],
         workflow_args: Dict[str, Any],
-
+        output_path: str,
         typename: str,
         write_to_file: bool = True,
         concatenate: bool = False,
         return_dataframe: bool = False,
-        sql_client: Optional[BaseSQLClient] = None,
     ) -> Optional[ActivityStatistics]: ...
 
     @overload
     async def query_executor(
         self,
-
+        sql_client: BaseSQLClient,
         sql_query: Optional[str],
         workflow_args: Dict[str, Any],
-
+        output_path: str,
         typename: str,
         write_to_file: bool = True,
         concatenate: bool = False,
         return_dataframe: bool = True,
-        sql_client: Optional[BaseSQLClient] = None,
     ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]: ...
 
     async def query_executor(
         self,
-
+        sql_client: BaseSQLClient,
         sql_query: Optional[str],
         workflow_args: Dict[str, Any],
-
+        output_path: str,
         typename: str,
         write_to_file: bool = True,
         concatenate: bool = False,
         return_dataframe: bool = False,
-        sql_client: Optional[BaseSQLClient] = None,
     ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
         """
-        Executes a SQL query using the provided
+        Executes a SQL query using the provided client and saves the results to Parquet.
 
-        This method validates the input
-        workflow arguments, executes it, writes the resulting
+        This method validates the input client and query, prepares the query using
+        workflow arguments, executes it, writes the resulting DataFrame to
         a Parquet file, and returns statistics about the output.
 
         Args:
-
+            sql_client: The SQL client instance to use for executing the query.
             sql_query: The SQL query string to execute. Placeholders can be used which
                 will be replaced using `workflow_args`.
-            workflow_args: Dictionary containing arguments for the workflow
-
-                - "output_prefix": Prefix for the output path.
-                - "output_path": Base directory for the output.
-            output_suffix: Suffix to append to the output file name.
+            workflow_args: Dictionary containing arguments for the workflow.
+            output_path: Full path where the output files will be written.
             typename: Type name used for generating output statistics.
+            write_to_file: Whether to write results to file. Defaults to True.
+            concatenate: Whether to concatenate results in multidb mode. Defaults to False.
+            return_dataframe: Whether to return a DataFrame instead of statistics. Defaults to False.
 
         Returns:
             Optional[Union[ActivityStatistics, pd.DataFrame]]: Statistics about the generated Parquet file,
             or a DataFrame if return_dataframe=True, or None if the query is empty or execution fails.
 
         Raises:
-            ValueError: If `
+            ValueError: If `sql_client` is not provided.
         """
         # Common pre-checks and setup shared by both multidb and single-db paths
+        if not sql_client:
+            logger.error("SQL client is not provided")
+            raise ValueError("SQL client is required for query execution")
+
         if not sql_query:
             logger.warning("Query is empty, skipping execution.")
             return None
 
-        if not sql_engine:
-            logger.error("SQL engine is not set.")
-            raise ValueError("SQL engine must be provided.")
-
         # Setup parquet output using helper method
         parquet_output = self._setup_parquet_output(
-
+            output_path, write_to_file, typename
         )
 
         # If multidb mode is enabled, run per-database flow
         if getattr(self, "multidb", False):
-            return await
-                sql_client,
-                sql_query,
-                workflow_args,
-
-
-
-
-
-
+            return await sql_utils.execute_multidb_flow(
+                sql_client=sql_client,
+                sql_query=sql_query,
+                workflow_args=workflow_args,
+                fetch_database_sql=self.fetch_database_sql,
+                output_path=output_path,
+                typename=typename,
+                write_to_file=write_to_file,
+                concatenate=concatenate,
+                return_dataframe=return_dataframe,
+                parquet_output=parquet_output,
+                temp_table_regex_sql=self._get_temp_table_regex_sql(typename),
+                setup_parquet_output_func=self._setup_parquet_output,
             )
 
         # Single-db execution path
         # Prepare query for single-db execution
-        prepared_query =
-            sql_query,
+        prepared_query = sql_utils.prepare_database_query(
+            sql_query,
+            None,
+            workflow_args,
+            self._get_temp_table_regex_sql(typename),
         )
 
         # Execute using helper method
-        success, _ = await
-
+        success, _ = await sql_utils.execute_single_db(
+            sql_client, prepared_query, parquet_output, write_to_file
         )
 
         if not success:
```
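In 2.1.0 the executor no longer pulls the engine and the output location (`output_prefix`/`output_suffix`) out of `workflow_args`; callers pass the SQL client and the full `output_path` explicitly, and the multi-database and single-database paths are delegated to `sql_utils`. A minimal caller sketch, modeled on the `fetch_*` activities later in this diff; the subclass, method, and `fetch_view_sql` attribute are illustrative, not part of the SDK:

```python
# Illustrative subclass showing the 2.1.0 calling convention for query_executor.
import os
from typing import Any, Dict, Optional

from application_sdk.activities.common.models import ActivityStatistics
from application_sdk.activities.metadata_extraction.sql import (
    BaseSQLMetadataExtractionActivities,
)


class MyExtractionActivities(BaseSQLMetadataExtractionActivities):
    fetch_view_sql = "SELECT * FROM information_schema.views"  # hypothetical query

    async def fetch_views(
        self, workflow_args: Dict[str, Any]
    ) -> Optional[ActivityStatistics]:
        state = await self._get_state(workflow_args)
        base_output_path = workflow_args.get("output_path", "")
        return await self.query_executor(
            sql_client=state.sql_client,  # required in 2.1.0, no longer Optional
            sql_query=self.fetch_view_sql,
            workflow_args=workflow_args,
            # a full path replaces the old output_prefix/output_suffix pair
            output_path=os.path.join(base_output_path, "raw"),
            typename="view",
        )
```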
```diff
@@ -346,30 +341,33 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             logger.info(
                 f"Successfully wrote query results to {parquet_output.get_full_path()}"
             )
-            return await parquet_output.
+            return await parquet_output.close()
 
         logger.warning("No parquet output configured for single-db execution")
         return None
 
     def _setup_parquet_output(
         self,
-
-        output_suffix: str,
+        output_path: str,
         write_to_file: bool,
-
+        typename: Optional[str] = None,
+    ) -> Optional[ParquetFileWriter]:
+        """Create a ParquetFileWriter for the given output path.
+
+        Args:
+            output_path: Full path where the output files will be written.
+            write_to_file: Whether to write results to file.
+
+        Returns:
+            Optional[ParquetFileWriter]: A ParquetFileWriter instance, or None if write_to_file is False.
+        """
         if not write_to_file:
             return None
-
-
-
-            logger.error("Output prefix or path not provided in workflow_args.")
-            raise ValueError(
-                "Output prefix and path must be specified in workflow_args."
-            )
-        return ParquetOutput(
-            output_path=output_path,
-            output_suffix=output_suffix,
+
+        return ParquetFileWriter(
+            path=output_path,
             use_consolidation=True,
+            typename=typename,
         )
 
     def _get_temp_table_regex_sql(self, typename: str) -> str:
```
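`_setup_parquet_output` now builds a `ParquetFileWriter` from a single full path instead of a `ParquetOutput` assembled from `output_path` plus `output_suffix`. A rough sketch of the writer life cycle implied by this diff; the constructor arguments and `close()` come from the hunks on this page, while `write()` is an assumption mirrored from the `JsonFileWriter.write()` call shown in `transform_data` further down:

```python
# Rough sketch of the 2.1.0 ParquetFileWriter life cycle; write() is assumed by analogy.
from application_sdk.io.parquet import ParquetFileWriter


async def write_results(dataframe, output_path: str):
    writer = ParquetFileWriter(
        path=output_path,        # one full destination path, no prefix/suffix split
        use_consolidation=True,
        typename="table",
    )
    await writer.write(dataframe)  # assumed API, mirroring JsonFileWriter.write()
    return await writer.close()    # close() returns statistics at the call sites above
```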
```diff
@@ -381,243 +379,6 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         else:
             return ""
 
-    def _prepare_database_query(
-        self,
-        sql_query: str,
-        database_name: Optional[str],
-        workflow_args: Dict[str, Any],
-        typename: str,
-        use_posix_regex: bool = False,
-    ) -> Optional[str]:
-        """Prepare query for database execution with proper substitutions."""
-        # Replace database name placeholder if provided
-        fetch_sql = sql_query
-        if database_name:
-            fetch_sql = fetch_sql.replace("{database_name}", database_name)
-
-        # Get temp table regex SQL
-        temp_table_regex_sql = self._get_temp_table_regex_sql(typename)
-
-        # Prepare the query
-        prepared_query = prepare_query(
-            query=fetch_sql,
-            workflow_args=workflow_args,
-            temp_table_regex_sql=temp_table_regex_sql,
-            use_posix_regex=use_posix_regex,
-        )
-
-        if prepared_query is None:
-            db_context = f" for database {database_name}" if database_name else ""
-            raise ValueError(f"Failed to prepare query{db_context}")
-
-        return prepared_query
-
-    async def _setup_database_connection(
-        self,
-        sql_client: BaseSQLClient,
-        database_name: str,
-    ) -> None:
-        """Setup connection for a specific database."""
-        extra = parse_credentials_extra(sql_client.credentials)
-        extra["database"] = database_name
-        sql_client.credentials["extra"] = extra
-        await sql_client.load(sql_client.credentials)
-
-    # NOTE: Consolidated: per-database processing is now inlined in the multi-DB loop
-
-    async def _finalize_multidb_results(
-        self,
-        write_to_file: bool,
-        concatenate: bool,
-        return_dataframe: bool,
-        parquet_output: Optional[ParquetOutput],
-        dataframe_list: List[
-            Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
-        ],
-        workflow_args: Dict[str, Any],
-        output_suffix: str,
-        typename: str,
-    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
-        """Finalize results for multi-database execution."""
-        if write_to_file and parquet_output:
-            return await parquet_output.get_statistics(typename=typename)
-
-        if not write_to_file and concatenate:
-            try:
-                import pandas as pd  # type: ignore
-
-                valid_dataframes: List[pd.DataFrame] = []
-                for df_generator in dataframe_list:
-                    if df_generator is None:
-                        continue
-                    for dataframe in df_generator:  # type: ignore[assignment]
-                        if dataframe is None:
-                            continue
-                        if hasattr(dataframe, "empty") and getattr(dataframe, "empty"):
-                            continue
-                        valid_dataframes.append(dataframe)
-
-                if not valid_dataframes:
-                    logger.warning(
-                        "No valid dataframes collected across databases for concatenation"
-                    )
-                    return None
-
-                concatenated = pd.concat(valid_dataframes, ignore_index=True)
-
-                if return_dataframe:
-                    return concatenated  # type: ignore[return-value]
-
-                # Create new parquet output for concatenated data
-                concatenated_parquet_output = self._setup_parquet_output(
-                    workflow_args, output_suffix, True
-                )
-                if concatenated_parquet_output:
-                    await concatenated_parquet_output.write_dataframe(concatenated)  # type: ignore[arg-type]
-                    return await concatenated_parquet_output.get_statistics(
-                        typename=typename
-                    )
-            except Exception as e:  # noqa: BLE001
-                logger.error(
-                    f"Error concatenating multi-DB dataframes: {str(e)}",
-                    exc_info=True,
-                )
-                raise
-
-        logger.warning(
-            "multidb execution returned no output to write (write_to_file=False, concatenate=False)"
-        )
-        return None
-
-    async def _execute_multidb_flow(
-        self,
-        sql_client: Optional[BaseSQLClient],
-        sql_query: str,
-        workflow_args: Dict[str, Any],
-        output_suffix: str,
-        typename: str,
-        write_to_file: bool,
-        concatenate: bool,
-        return_dataframe: bool,
-        parquet_output: Optional[ParquetOutput],
-    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
-        """Execute multi-database flow with proper error handling and result finalization."""
-        # Get effective SQL client
-        effective_sql_client = sql_client
-        if effective_sql_client is None:
-            state = cast(
-                BaseSQLMetadataExtractionActivitiesState,
-                await self._get_state(workflow_args),
-            )
-            effective_sql_client = state.sql_client
-
-        if not effective_sql_client:
-            logger.error("SQL client not initialized for multidb execution")
-            raise ValueError("SQL client not initialized")
-
-        # Resolve databases to iterate
-        database_names = await get_database_names(
-            effective_sql_client, workflow_args, self.fetch_database_sql
-        )
-        if not database_names:
-            logger.warning("No databases found to process")
-            return None
-
-        # Validate client
-        if not effective_sql_client.engine:
-            logger.error("SQL client engine not initialized")
-            raise ValueError("SQL client engine not initialized")
-
-        successful_databases: List[str] = []
-        dataframe_list: List[
-            Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
-        ] = []
-
-        # Iterate databases and execute (consolidated single-db processing)
-        for database_name in database_names or []:
-            try:
-                # Setup connection for this database
-                await self._setup_database_connection(
-                    effective_sql_client, database_name
-                )
-
-                # Prepare query for this database
-                prepared_query = self._prepare_database_query(
-                    sql_query,
-                    database_name,
-                    workflow_args,
-                    typename,
-                    use_posix_regex=True,
-                )
-
-                # Execute using helper method
-                success, batched_iterator = await self._execute_single_db(
-                    effective_sql_client.engine,
-                    prepared_query,
-                    parquet_output,
-                    write_to_file,
-                )
-
-                if success:
-                    logger.info(f"Successfully processed database: {database_name}")
-
-            except Exception as e:  # noqa: BLE001
-                logger.error(
-                    f"Failed to process database '{database_name}': {str(e)}. Failing the workflow.",
-                    exc_info=True,
-                )
-                raise
-
-            if success:
-                successful_databases.append(database_name)
-                if not write_to_file and batched_iterator:
-                    dataframe_list.append(batched_iterator)
-
-        # Log results
-        logger.info(
-            f"Successfully processed {len(successful_databases)} databases: {successful_databases}"
-        )
-
-        # Finalize results
-        return await self._finalize_multidb_results(
-            write_to_file,
-            concatenate,
-            return_dataframe,
-            parquet_output,
-            dataframe_list,
-            workflow_args,
-            output_suffix,
-            typename,
-        )
-
-    async def _execute_single_db(
-        self,
-        sql_engine: Any,
-        prepared_query: Optional[str],
-        parquet_output: Optional[ParquetOutput],
-        write_to_file: bool,
-    ) -> Tuple[
-        bool, Optional[Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]]
-    ]:  # type: ignore
-        if not prepared_query:
-            logger.error("Prepared query is None, cannot execute")
-            return False, None
-
-        try:
-            sql_input = SQLQueryInput(engine=sql_engine, query=prepared_query)
-            batched_iterator = await sql_input.get_batched_dataframe()
-
-            if write_to_file and parquet_output:
-                await parquet_output.write_batched_dataframe(batched_iterator)  # type: ignore
-                return True, None
-
-            return True, batched_iterator
-        except Exception as e:
-            logger.error(
-                f"Error during query execution or output writing: {e}", exc_info=True
-            )
-            raise
-
     @activity.defn
     @auto_heartbeater
     async def fetch_databases(
@@ -626,29 +387,28 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         """Fetch databases from the source database.
 
         Args:
-
-            raw_output: JsonOutput instance for writing raw data.
-            **kwargs: Additional keyword arguments.
+            workflow_args: Dictionary containing arguments for the workflow.
 
         Returns:
-
+            Optional[ActivityStatistics]: Statistics about the extracted databases.
         """
         state = cast(
             BaseSQLMetadataExtractionActivitiesState,
             await self._get_state(workflow_args),
         )
-        if not state.sql_client
-            logger.error("SQL client
-            raise ValueError("SQL client
+        if not state.sql_client:
+            logger.error("SQL client not initialized")
+            raise ValueError("SQL client not initialized")
 
         prepared_query = prepare_query(
             query=self.fetch_database_sql, workflow_args=workflow_args
         )
+        base_output_path = workflow_args.get("output_path", "")
         statistics = await self.query_executor(
-
+            sql_client=state.sql_client,
             sql_query=prepared_query,
             workflow_args=workflow_args,
-
+            output_path=os.path.join(base_output_path, "raw"),
             typename="database",
         )
         return statistics
```
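Each `fetch_*` activity now derives its destination from `workflow_args["output_path"]` plus a fixed subdirectory, rather than from `output_prefix`/`output_suffix`. The resulting layout, with an illustrative path value:

```python
# Output layout implied by the updated activities: raw extracts land under
# <output_path>/raw, and transform_data (below) writes under <output_path>/transformed.
import os

workflow_args = {"output_path": "/tmp/artifacts/wf-123"}  # illustrative value
base_output_path = workflow_args.get("output_path", "")

raw_path = os.path.join(base_output_path, "raw")                   # /tmp/artifacts/wf-123/raw
transformed_path = os.path.join(base_output_path, "transformed")   # /tmp/artifacts/wf-123/transformed
```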
```diff
@@ -661,29 +421,28 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         """Fetch schemas from the source database.
 
         Args:
-
-            raw_output: JsonOutput instance for writing raw data.
-            **kwargs: Additional keyword arguments.
+            workflow_args: Dictionary containing arguments for the workflow.
 
         Returns:
-
+            Optional[ActivityStatistics]: Statistics about the extracted schemas.
         """
         state = cast(
             BaseSQLMetadataExtractionActivitiesState,
             await self._get_state(workflow_args),
         )
-        if not state.sql_client
-            logger.error("SQL client
-            raise ValueError("SQL client
+        if not state.sql_client:
+            logger.error("SQL client not initialized")
+            raise ValueError("SQL client not initialized")
 
         prepared_query = prepare_query(
             query=self.fetch_schema_sql, workflow_args=workflow_args
         )
+        base_output_path = workflow_args.get("output_path", "")
         statistics = await self.query_executor(
-
+            sql_client=state.sql_client,
             sql_query=prepared_query,
             workflow_args=workflow_args,
-
+            output_path=os.path.join(base_output_path, "raw"),
             typename="schema",
         )
         return statistics
@@ -696,9 +455,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         """Fetch tables from the source database.
 
         Args:
-
-            raw_output: JsonOutput instance for writing raw data.
-            **kwargs: Additional keyword arguments.
+            workflow_args: Dictionary containing arguments for the workflow.
 
         Returns:
             Optional[ActivityStatistics]: Statistics about the extracted tables, or None if extraction failed.
@@ -707,20 +464,21 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             BaseSQLMetadataExtractionActivitiesState,
             await self._get_state(workflow_args),
         )
-        if not state.sql_client
-            logger.error("SQL client
-            raise ValueError("SQL client
+        if not state.sql_client:
+            logger.error("SQL client not initialized")
+            raise ValueError("SQL client not initialized")
 
         prepared_query = prepare_query(
             query=self.fetch_table_sql,
             workflow_args=workflow_args,
             temp_table_regex_sql=self.extract_temp_table_regex_table_sql,
         )
+        base_output_path = workflow_args.get("output_path", "")
         statistics = await self.query_executor(
-
+            sql_client=state.sql_client,
             sql_query=prepared_query,
             workflow_args=workflow_args,
-
+            output_path=os.path.join(base_output_path, "raw"),
             typename="table",
         )
         return statistics
@@ -733,9 +491,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         """Fetch columns from the source database.
 
         Args:
-
-            raw_output: JsonOutput instance for writing raw data.
-            **kwargs: Additional keyword arguments.
+            workflow_args: Dictionary containing arguments for the workflow.
 
         Returns:
             Optional[ActivityStatistics]: Statistics about the extracted columns, or None if extraction failed.
@@ -744,20 +500,21 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             BaseSQLMetadataExtractionActivitiesState,
             await self._get_state(workflow_args),
         )
-        if not state.sql_client
-            logger.error("SQL client
-            raise ValueError("SQL client
+        if not state.sql_client:
+            logger.error("SQL client not initialized")
+            raise ValueError("SQL client not initialized")
 
         prepared_query = prepare_query(
             query=self.fetch_column_sql,
             workflow_args=workflow_args,
             temp_table_regex_sql=self.extract_temp_table_regex_column_sql,
         )
+        base_output_path = workflow_args.get("output_path", "")
         statistics = await self.query_executor(
-
+            sql_client=state.sql_client,
             sql_query=prepared_query,
             workflow_args=workflow_args,
-
+            output_path=os.path.join(base_output_path, "raw"),
             typename="column",
         )
         return statistics
@@ -770,9 +527,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         """Fetch procedures from the source database.
 
         Args:
-
-            raw_output: JsonOutput instance for writing raw data.
-            **kwargs: Additional keyword arguments.
+            workflow_args: Dictionary containing arguments for the workflow.
 
         Returns:
             Optional[ActivityStatistics]: Statistics about the extracted procedures, or None if extraction failed.
@@ -781,18 +536,19 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             BaseSQLMetadataExtractionActivitiesState,
             await self._get_state(workflow_args),
         )
-        if not state.sql_client
-            logger.error("SQL client
-            raise ValueError("SQL client
+        if not state.sql_client:
+            logger.error("SQL client not initialized")
+            raise ValueError("SQL client not initialized")
 
         prepared_query = prepare_query(
             query=self.fetch_procedure_sql, workflow_args=workflow_args
         )
+        base_output_path = workflow_args.get("output_path", "")
         statistics = await self.query_executor(
-
+            sql_client=state.sql_client,
             sql_query=prepared_query,
             workflow_args=workflow_args,
-
+            output_path=os.path.join(base_output_path, "raw"),
             typename="extras-procedure",
         )
         return statistics
@@ -807,7 +563,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
 
         Args:
             raw_input (Any): Input data to transform.
-            transformed_output (
+            transformed_output (JsonFileWriter): Output handler for transformed data.
             **kwargs: Additional keyword arguments.
 
         Returns:
@@ -824,17 +580,18 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             self._validate_output_args(workflow_args)
         )
 
-        raw_input =
+        raw_input = ParquetFileReader(
             path=os.path.join(output_path, "raw"),
             file_names=workflow_args.get("file_names"),
+            dataframe_type=DataframeType.daft,
         )
-        raw_input = raw_input.
+        raw_input = raw_input.read_batches()
 
-        transformed_output =
-
-            output_suffix="transformed",
+        transformed_output = JsonFileWriter(
+            path=os.path.join(output_path, "transformed"),
             typename=typename,
             chunk_start=workflow_args.get("chunk_start"),
+            dataframe_type=DataframeType.daft,
         )
         if state.transformer:
             workflow_args["connection_name"] = workflow_args.get("connection", {}).get(
@@ -849,8 +606,8 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             transform_metadata = state.transformer.transform_metadata(
                 dataframe=dataframe, **workflow_args
             )
-
-            return await transformed_output.
+            await transformed_output.write(transform_metadata)
+            return await transformed_output.close()
 
     @activity.defn
     @auto_heartbeater
```