atlan-application-sdk 1.1.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. application_sdk/activities/common/sql_utils.py +308 -0
  2. application_sdk/activities/common/utils.py +1 -45
  3. application_sdk/activities/metadata_extraction/sql.py +110 -353
  4. application_sdk/activities/query_extraction/sql.py +12 -11
  5. application_sdk/application/__init__.py +1 -1
  6. application_sdk/clients/sql.py +167 -1
  7. application_sdk/clients/temporal.py +6 -6
  8. application_sdk/common/types.py +8 -0
  9. application_sdk/common/utils.py +1 -8
  10. application_sdk/constants.py +1 -1
  11. application_sdk/handlers/sql.py +10 -25
  12. application_sdk/interceptors/events.py +1 -1
  13. application_sdk/io/__init__.py +654 -0
  14. application_sdk/io/json.py +429 -0
  15. application_sdk/{outputs → io}/parquet.py +358 -47
  16. application_sdk/io/utils.py +307 -0
  17. application_sdk/observability/observability.py +23 -12
  18. application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
  19. application_sdk/server/fastapi/middleware/metrics.py +27 -24
  20. application_sdk/server/fastapi/models.py +1 -1
  21. application_sdk/server/fastapi/routers/server.py +1 -1
  22. application_sdk/server/fastapi/utils.py +10 -0
  23. application_sdk/services/eventstore.py +4 -4
  24. application_sdk/services/secretstore.py +1 -1
  25. application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
  26. application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
  27. application_sdk/version.py +1 -1
  28. application_sdk/worker.py +1 -1
  29. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/METADATA +9 -11
  30. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/RECORD +35 -42
  31. application_sdk/common/dataframe_utils.py +0 -42
  32. application_sdk/events/__init__.py +0 -5
  33. application_sdk/inputs/.cursor/BUGBOT.md +0 -250
  34. application_sdk/inputs/__init__.py +0 -168
  35. application_sdk/inputs/iceberg.py +0 -75
  36. application_sdk/inputs/json.py +0 -136
  37. application_sdk/inputs/parquet.py +0 -272
  38. application_sdk/inputs/sql_query.py +0 -271
  39. application_sdk/outputs/.cursor/BUGBOT.md +0 -295
  40. application_sdk/outputs/__init__.py +0 -453
  41. application_sdk/outputs/iceberg.py +0 -139
  42. application_sdk/outputs/json.py +0 -268
  43. /application_sdk/{events → interceptors}/models.py +0 -0
  44. /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
  45. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/WHEEL +0 -0
  46. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/LICENSE +0 -0
  47. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/NOTICE +0 -0
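
Note: the hunks shown below all come from application_sdk/activities/metadata_extraction/sql.py (item 3 above, +110 -353); hunks for the other files are not included in this view. The most visible 2.0.0 change is that the separate application_sdk/inputs and application_sdk/outputs packages are replaced by a consolidated application_sdk/io package. A minimal import-migration sketch, using only the names that appear in this diff:

    # 1.1.1 (removed in 2.0.0)
    # from application_sdk.inputs.parquet import ParquetInput
    # from application_sdk.inputs.sql_query import SQLQueryInput
    # from application_sdk.outputs.json import JsonOutput
    # from application_sdk.outputs.parquet import ParquetOutput
    # from application_sdk.common.dataframe_utils import is_empty_dataframe

    # 2.0.0 replacements, per the import hunk below
    from application_sdk.io import DataframeType
    from application_sdk.io.json import JsonFileWriter
    from application_sdk.io.parquet import ParquetFileReader, ParquetFileWriter
    from application_sdk.io.utils import is_empty_dataframe
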
@@ -2,10 +2,7 @@ import os
 from typing import (
     TYPE_CHECKING,
     Any,
-    AsyncIterator,
     Dict,
-    Iterator,
-    List,
     Optional,
     Tuple,
     Type,
@@ -17,6 +14,7 @@ from typing import (
 from temporalio import activity
 
 from application_sdk.activities import ActivitiesInterface, ActivitiesState
+from application_sdk.activities.common import sql_utils
 from application_sdk.activities.common.models import ActivityStatistics
 from application_sdk.activities.common.utils import (
     auto_heartbeater,
@@ -24,21 +22,15 @@ from application_sdk.activities.common.utils import (
     get_workflow_id,
 )
 from application_sdk.clients.sql import BaseSQLClient
-from application_sdk.common.dataframe_utils import is_empty_dataframe
 from application_sdk.common.error_codes import ActivityError
-from application_sdk.common.utils import (
-    get_database_names,
-    parse_credentials_extra,
-    prepare_query,
-    read_sql_files,
-)
+from application_sdk.common.utils import prepare_query, read_sql_files
 from application_sdk.constants import APP_TENANT_ID, APPLICATION_NAME, SQL_QUERIES_PATH
 from application_sdk.handlers.sql import BaseSQLHandler
-from application_sdk.inputs.parquet import ParquetInput
-from application_sdk.inputs.sql_query import SQLQueryInput
+from application_sdk.io import DataframeType
+from application_sdk.io.json import JsonFileWriter
+from application_sdk.io.parquet import ParquetFileReader, ParquetFileWriter
+from application_sdk.io.utils import is_empty_dataframe
 from application_sdk.observability.logger_adaptor import get_logger
-from application_sdk.outputs.json import JsonOutput
-from application_sdk.outputs.parquet import ParquetOutput
 from application_sdk.services.atlan_storage import AtlanStorage
 from application_sdk.services.secretstore import SecretStore
 from application_sdk.transformers import TransformerInterface
@@ -237,105 +229,108 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
     @overload
     async def query_executor(
         self,
-        sql_engine: Any,
+        sql_client: BaseSQLClient,
         sql_query: Optional[str],
         workflow_args: Dict[str, Any],
-        output_suffix: str,
+        output_path: str,
         typename: str,
         write_to_file: bool = True,
         concatenate: bool = False,
         return_dataframe: bool = False,
-        sql_client: Optional[BaseSQLClient] = None,
     ) -> Optional[ActivityStatistics]: ...
 
     @overload
     async def query_executor(
         self,
-        sql_engine: Any,
+        sql_client: BaseSQLClient,
         sql_query: Optional[str],
         workflow_args: Dict[str, Any],
-        output_suffix: str,
+        output_path: str,
         typename: str,
         write_to_file: bool = True,
         concatenate: bool = False,
         return_dataframe: bool = True,
-        sql_client: Optional[BaseSQLClient] = None,
     ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]: ...
 
     async def query_executor(
         self,
-        sql_engine: Any,
+        sql_client: BaseSQLClient,
         sql_query: Optional[str],
         workflow_args: Dict[str, Any],
-        output_suffix: str,
+        output_path: str,
         typename: str,
         write_to_file: bool = True,
         concatenate: bool = False,
         return_dataframe: bool = False,
-        sql_client: Optional[BaseSQLClient] = None,
     ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
         """
-        Executes a SQL query using the provided engine and saves the results to Parquet.
+        Executes a SQL query using the provided client and saves the results to Parquet.
 
-        This method validates the input engine and query, prepares the query using
-        workflow arguments, executes it, writes the resulting Daft DataFrame to
+        This method validates the input client and query, prepares the query using
+        workflow arguments, executes it, writes the resulting DataFrame to
         a Parquet file, and returns statistics about the output.
 
         Args:
-            sql_engine: The SQL engine instance to use for executing the query.
+            sql_client: The SQL client instance to use for executing the query.
             sql_query: The SQL query string to execute. Placeholders can be used which
                 will be replaced using `workflow_args`.
-            workflow_args: Dictionary containing arguments for the workflow, used for
-                preparing the query and defining output paths. Expected keys:
-                - "output_prefix": Prefix for the output path.
-                - "output_path": Base directory for the output.
-            output_suffix: Suffix to append to the output file name.
+            workflow_args: Dictionary containing arguments for the workflow.
+            output_path: Full path where the output files will be written.
             typename: Type name used for generating output statistics.
+            write_to_file: Whether to write results to file. Defaults to True.
+            concatenate: Whether to concatenate results in multidb mode. Defaults to False.
+            return_dataframe: Whether to return a DataFrame instead of statistics. Defaults to False.
 
         Returns:
            Optional[Union[ActivityStatistics, pd.DataFrame]]: Statistics about the generated Parquet file,
                or a DataFrame if return_dataframe=True, or None if the query is empty or execution fails.
 
         Raises:
-            ValueError: If `sql_engine` is not provided.
+            ValueError: If `sql_client` is not provided.
         """
         # Common pre-checks and setup shared by both multidb and single-db paths
+        if not sql_client:
+            logger.error("SQL client is not provided")
+            raise ValueError("SQL client is required for query execution")
+
         if not sql_query:
             logger.warning("Query is empty, skipping execution.")
             return None
 
-        if not sql_engine:
-            logger.error("SQL engine is not set.")
-            raise ValueError("SQL engine must be provided.")
-
         # Setup parquet output using helper method
         parquet_output = self._setup_parquet_output(
-            workflow_args, output_suffix, write_to_file
+            output_path, write_to_file, typename
         )
 
         # If multidb mode is enabled, run per-database flow
         if getattr(self, "multidb", False):
-            return await self._execute_multidb_flow(
-                sql_client,
-                sql_query,
-                workflow_args,
-                output_suffix,
-                typename,
-                write_to_file,
-                concatenate,
-                return_dataframe,
-                parquet_output,
+            return await sql_utils.execute_multidb_flow(
+                sql_client=sql_client,
+                sql_query=sql_query,
+                workflow_args=workflow_args,
+                fetch_database_sql=self.fetch_database_sql,
+                output_path=output_path,
+                typename=typename,
+                write_to_file=write_to_file,
+                concatenate=concatenate,
+                return_dataframe=return_dataframe,
+                parquet_output=parquet_output,
+                temp_table_regex_sql=self._get_temp_table_regex_sql(typename),
+                setup_parquet_output_func=self._setup_parquet_output,
             )
 
         # Single-db execution path
         # Prepare query for single-db execution
-        prepared_query = self._prepare_database_query(
-            sql_query, None, workflow_args, typename
+        prepared_query = sql_utils.prepare_database_query(
+            sql_query,
+            None,
+            workflow_args,
+            self._get_temp_table_regex_sql(typename),
         )
 
         # Execute using helper method
-        success, _ = await self._execute_single_db(
-            sql_engine, prepared_query, parquet_output, write_to_file
+        success, _ = await sql_utils.execute_single_db(
+            sql_client, prepared_query, parquet_output, write_to_file
        )
 
         if not success:
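
Note: the practical effect of the signature change above is easiest to see at a call site. A hedged fragment showing how a caller inside an activity method migrates; the names mirror the fetch_* activities later in this diff:

    # 1.1.1: pass the engine object and a suffix under the workflow output
    # stats = await self.query_executor(
    #     sql_engine=state.sql_client.engine,
    #     sql_query=prepared_query,
    #     workflow_args=workflow_args,
    #     output_suffix="raw/table",
    #     typename="table",
    # )

    # 2.0.0: pass the BaseSQLClient itself and a full output path
    base_output_path = workflow_args.get("output_path", "")
    stats = await self.query_executor(
        sql_client=state.sql_client,
        sql_query=prepared_query,
        workflow_args=workflow_args,
        output_path=os.path.join(base_output_path, "raw"),
        typename="table",
    )
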
@@ -346,30 +341,33 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             logger.info(
                 f"Successfully wrote query results to {parquet_output.get_full_path()}"
             )
-            return await parquet_output.get_statistics(typename=typename)
+            return await parquet_output.close()
 
         logger.warning("No parquet output configured for single-db execution")
         return None
 
     def _setup_parquet_output(
         self,
-        workflow_args: Dict[str, Any],
-        output_suffix: str,
+        output_path: str,
         write_to_file: bool,
-    ) -> Optional[ParquetOutput]:
+        typename: Optional[str] = None,
+    ) -> Optional[ParquetFileWriter]:
+        """Create a ParquetFileWriter for the given output path.
+
+        Args:
+            output_path: Full path where the output files will be written.
+            write_to_file: Whether to write results to file.
+
+        Returns:
+            Optional[ParquetFileWriter]: A ParquetFileWriter instance, or None if write_to_file is False.
+        """
         if not write_to_file:
             return None
-        output_prefix = workflow_args.get("output_prefix")
-        output_path = workflow_args.get("output_path")
-        if not output_prefix or not output_path:
-            logger.error("Output prefix or path not provided in workflow_args.")
-            raise ValueError(
-                "Output prefix and path must be specified in workflow_args."
-            )
-        return ParquetOutput(
-            output_path=output_path,
-            output_suffix=output_suffix,
+
+        return ParquetFileWriter(
+            path=output_path,
             use_consolidation=True,
+            typename=typename,
         )
 
     def _get_temp_table_regex_sql(self, typename: str) -> str:
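
Note: a related lifecycle change shows up in the writer returned by _setup_parquet_output: statistics are no longer requested explicitly via get_statistics(); closing the writer finalizes the output and returns them. A hedged sketch using only the constructor arguments and calls visible in this diff (the path literal is illustrative):

    writer = ParquetFileWriter(
        path="/tmp/artifacts/raw",  # illustrative path, not from the SDK docs
        use_consolidation=True,
        typename="table",
    )
    # ... results are written through the sql_utils helpers (not shown in this diff) ...

    # 1.1.1: stats = await parquet_output.get_statistics(typename=typename)
    # 2.0.0: closing the writer returns the ActivityStatistics
    statistics = await writer.close()
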
@@ -381,243 +379,6 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         else:
             return ""
 
-    def _prepare_database_query(
-        self,
-        sql_query: str,
-        database_name: Optional[str],
-        workflow_args: Dict[str, Any],
-        typename: str,
-        use_posix_regex: bool = False,
-    ) -> Optional[str]:
-        """Prepare query for database execution with proper substitutions."""
-        # Replace database name placeholder if provided
-        fetch_sql = sql_query
-        if database_name:
-            fetch_sql = fetch_sql.replace("{database_name}", database_name)
-
-        # Get temp table regex SQL
-        temp_table_regex_sql = self._get_temp_table_regex_sql(typename)
-
-        # Prepare the query
-        prepared_query = prepare_query(
-            query=fetch_sql,
-            workflow_args=workflow_args,
-            temp_table_regex_sql=temp_table_regex_sql,
-            use_posix_regex=use_posix_regex,
-        )
-
-        if prepared_query is None:
-            db_context = f" for database {database_name}" if database_name else ""
-            raise ValueError(f"Failed to prepare query{db_context}")
-
-        return prepared_query
-
-    async def _setup_database_connection(
-        self,
-        sql_client: BaseSQLClient,
-        database_name: str,
-    ) -> None:
-        """Setup connection for a specific database."""
-        extra = parse_credentials_extra(sql_client.credentials)
-        extra["database"] = database_name
-        sql_client.credentials["extra"] = extra
-        await sql_client.load(sql_client.credentials)
-
-    # NOTE: Consolidated: per-database processing is now inlined in the multi-DB loop
-
-    async def _finalize_multidb_results(
-        self,
-        write_to_file: bool,
-        concatenate: bool,
-        return_dataframe: bool,
-        parquet_output: Optional[ParquetOutput],
-        dataframe_list: List[
-            Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
-        ],
-        workflow_args: Dict[str, Any],
-        output_suffix: str,
-        typename: str,
-    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
-        """Finalize results for multi-database execution."""
-        if write_to_file and parquet_output:
-            return await parquet_output.get_statistics(typename=typename)
-
-        if not write_to_file and concatenate:
-            try:
-                import pandas as pd  # type: ignore
-
-                valid_dataframes: List[pd.DataFrame] = []
-                for df_generator in dataframe_list:
-                    if df_generator is None:
-                        continue
-                    for dataframe in df_generator:  # type: ignore[assignment]
-                        if dataframe is None:
-                            continue
-                        if hasattr(dataframe, "empty") and getattr(dataframe, "empty"):
-                            continue
-                        valid_dataframes.append(dataframe)
-
-                if not valid_dataframes:
-                    logger.warning(
-                        "No valid dataframes collected across databases for concatenation"
-                    )
-                    return None
-
-                concatenated = pd.concat(valid_dataframes, ignore_index=True)
-
-                if return_dataframe:
-                    return concatenated  # type: ignore[return-value]
-
-                # Create new parquet output for concatenated data
-                concatenated_parquet_output = self._setup_parquet_output(
-                    workflow_args, output_suffix, True
-                )
-                if concatenated_parquet_output:
-                    await concatenated_parquet_output.write_dataframe(concatenated)  # type: ignore[arg-type]
-                    return await concatenated_parquet_output.get_statistics(
-                        typename=typename
-                    )
-            except Exception as e:  # noqa: BLE001
-                logger.error(
-                    f"Error concatenating multi-DB dataframes: {str(e)}",
-                    exc_info=True,
-                )
-                raise
-
-        logger.warning(
-            "multidb execution returned no output to write (write_to_file=False, concatenate=False)"
-        )
-        return None
-
-    async def _execute_multidb_flow(
-        self,
-        sql_client: Optional[BaseSQLClient],
-        sql_query: str,
-        workflow_args: Dict[str, Any],
-        output_suffix: str,
-        typename: str,
-        write_to_file: bool,
-        concatenate: bool,
-        return_dataframe: bool,
-        parquet_output: Optional[ParquetOutput],
-    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
-        """Execute multi-database flow with proper error handling and result finalization."""
-        # Get effective SQL client
-        effective_sql_client = sql_client
-        if effective_sql_client is None:
-            state = cast(
-                BaseSQLMetadataExtractionActivitiesState,
-                await self._get_state(workflow_args),
-            )
-            effective_sql_client = state.sql_client
-
-        if not effective_sql_client:
-            logger.error("SQL client not initialized for multidb execution")
-            raise ValueError("SQL client not initialized")
-
-        # Resolve databases to iterate
-        database_names = await get_database_names(
-            effective_sql_client, workflow_args, self.fetch_database_sql
-        )
-        if not database_names:
-            logger.warning("No databases found to process")
-            return None
-
-        # Validate client
-        if not effective_sql_client.engine:
-            logger.error("SQL client engine not initialized")
-            raise ValueError("SQL client engine not initialized")
-
-        successful_databases: List[str] = []
-        dataframe_list: List[
-            Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
-        ] = []
-
-        # Iterate databases and execute (consolidated single-db processing)
-        for database_name in database_names or []:
-            try:
-                # Setup connection for this database
-                await self._setup_database_connection(
-                    effective_sql_client, database_name
-                )
-
-                # Prepare query for this database
-                prepared_query = self._prepare_database_query(
-                    sql_query,
-                    database_name,
-                    workflow_args,
-                    typename,
-                    use_posix_regex=True,
-                )
-
-                # Execute using helper method
-                success, batched_iterator = await self._execute_single_db(
-                    effective_sql_client.engine,
-                    prepared_query,
-                    parquet_output,
-                    write_to_file,
-                )
-
-                if success:
-                    logger.info(f"Successfully processed database: {database_name}")
-
-            except Exception as e:  # noqa: BLE001
-                logger.error(
-                    f"Failed to process database '{database_name}': {str(e)}. Failing the workflow.",
-                    exc_info=True,
-                )
-                raise
-
-            if success:
-                successful_databases.append(database_name)
-                if not write_to_file and batched_iterator:
-                    dataframe_list.append(batched_iterator)
-
-        # Log results
-        logger.info(
-            f"Successfully processed {len(successful_databases)} databases: {successful_databases}"
-        )
-
-        # Finalize results
-        return await self._finalize_multidb_results(
-            write_to_file,
-            concatenate,
-            return_dataframe,
-            parquet_output,
-            dataframe_list,
-            workflow_args,
-            output_suffix,
-            typename,
-        )
-
-    async def _execute_single_db(
-        self,
-        sql_engine: Any,
-        prepared_query: Optional[str],
-        parquet_output: Optional[ParquetOutput],
-        write_to_file: bool,
-    ) -> Tuple[
-        bool, Optional[Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]]
-    ]:  # type: ignore
-        if not prepared_query:
-            logger.error("Prepared query is None, cannot execute")
-            return False, None
-
-        try:
-            sql_input = SQLQueryInput(engine=sql_engine, query=prepared_query)
-            batched_iterator = await sql_input.get_batched_dataframe()
-
-            if write_to_file and parquet_output:
-                await parquet_output.write_batched_dataframe(batched_iterator)  # type: ignore
-                return True, None
-
-            return True, batched_iterator
-        except Exception as e:
-            logger.error(
-                f"Error during query execution or output writing: {e}", exc_info=True
-            )
-            raise
-
     @activity.defn
     @auto_heartbeater
     async def fetch_databases(
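
Note: the block deleted above is not dropped functionality; the per-database helpers move to module-level functions in the new application_sdk/activities/common/sql_utils module (+308 lines in the file list), and the new call sites earlier in this diff show the correspondence. A rough mapping, based only on what this diff exposes:

    # self._prepare_database_query(query, db, workflow_args, typename)
    #     -> sql_utils.prepare_database_query(query, db, workflow_args, temp_table_regex_sql)
    # self._execute_single_db(sql_engine, prepared_query, parquet_output, write_to_file)
    #     -> sql_utils.execute_single_db(sql_client, prepared_query, parquet_output, write_to_file)
    # self._execute_multidb_flow(...)
    #     -> sql_utils.execute_multidb_flow(..., fetch_database_sql=..., temp_table_regex_sql=...,
    #            setup_parquet_output_func=self._setup_parquet_output)
    # _setup_database_connection and _finalize_multidb_results have no call sites in this diff;
    # whatever replaces them lives inside sql_utils and is not shown here.
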
@@ -626,29 +387,28 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         """Fetch databases from the source database.
 
         Args:
-            batch_input: DataFrame containing the raw database data.
-            raw_output: JsonOutput instance for writing raw data.
-            **kwargs: Additional keyword arguments.
+            workflow_args: Dictionary containing arguments for the workflow.
 
         Returns:
-            Dict containing chunk count, typename, and total record count.
+            Optional[ActivityStatistics]: Statistics about the extracted databases.
         """
         state = cast(
             BaseSQLMetadataExtractionActivitiesState,
             await self._get_state(workflow_args),
         )
-        if not state.sql_client or not state.sql_client.engine:
-            logger.error("SQL client or engine not initialized")
-            raise ValueError("SQL client or engine not initialized")
+        if not state.sql_client:
+            logger.error("SQL client not initialized")
+            raise ValueError("SQL client not initialized")
 
         prepared_query = prepare_query(
             query=self.fetch_database_sql, workflow_args=workflow_args
         )
+        base_output_path = workflow_args.get("output_path", "")
         statistics = await self.query_executor(
-            sql_engine=state.sql_client.engine,
+            sql_client=state.sql_client,
             sql_query=prepared_query,
             workflow_args=workflow_args,
-            output_suffix="raw/database",
+            output_path=os.path.join(base_output_path, "raw"),
             typename="database",
         )
         return statistics
@@ -661,29 +421,28 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         """Fetch schemas from the source database.
 
         Args:
-            batch_input: DataFrame containing the raw schema data.
-            raw_output: JsonOutput instance for writing raw data.
-            **kwargs: Additional keyword arguments.
+            workflow_args: Dictionary containing arguments for the workflow.
 
         Returns:
-            Dict containing chunk count, typename, and total record count.
+            Optional[ActivityStatistics]: Statistics about the extracted schemas.
         """
         state = cast(
             BaseSQLMetadataExtractionActivitiesState,
             await self._get_state(workflow_args),
         )
-        if not state.sql_client or not state.sql_client.engine:
-            logger.error("SQL client or engine not initialized")
-            raise ValueError("SQL client or engine not initialized")
+        if not state.sql_client:
+            logger.error("SQL client not initialized")
+            raise ValueError("SQL client not initialized")
 
         prepared_query = prepare_query(
             query=self.fetch_schema_sql, workflow_args=workflow_args
         )
+        base_output_path = workflow_args.get("output_path", "")
         statistics = await self.query_executor(
-            sql_engine=state.sql_client.engine,
+            sql_client=state.sql_client,
             sql_query=prepared_query,
             workflow_args=workflow_args,
-            output_suffix="raw/schema",
+            output_path=os.path.join(base_output_path, "raw"),
             typename="schema",
         )
         return statistics
@@ -696,9 +455,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         """Fetch tables from the source database.
 
         Args:
-            batch_input: DataFrame containing the raw table data.
-            raw_output: JsonOutput instance for writing raw data.
-            **kwargs: Additional keyword arguments.
+            workflow_args: Dictionary containing arguments for the workflow.
 
         Returns:
             Optional[ActivityStatistics]: Statistics about the extracted tables, or None if extraction failed.
@@ -707,20 +464,21 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             BaseSQLMetadataExtractionActivitiesState,
             await self._get_state(workflow_args),
         )
-        if not state.sql_client or not state.sql_client.engine:
-            logger.error("SQL client or engine not initialized")
-            raise ValueError("SQL client or engine not initialized")
+        if not state.sql_client:
+            logger.error("SQL client not initialized")
+            raise ValueError("SQL client not initialized")
 
         prepared_query = prepare_query(
             query=self.fetch_table_sql,
             workflow_args=workflow_args,
             temp_table_regex_sql=self.extract_temp_table_regex_table_sql,
         )
+        base_output_path = workflow_args.get("output_path", "")
         statistics = await self.query_executor(
-            sql_engine=state.sql_client.engine,
+            sql_client=state.sql_client,
             sql_query=prepared_query,
             workflow_args=workflow_args,
-            output_suffix="raw/table",
+            output_path=os.path.join(base_output_path, "raw"),
             typename="table",
         )
         return statistics
@@ -733,9 +491,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         """Fetch columns from the source database.
 
         Args:
-            batch_input: DataFrame containing the raw column data.
-            raw_output: JsonOutput instance for writing raw data.
-            **kwargs: Additional keyword arguments.
+            workflow_args: Dictionary containing arguments for the workflow.
 
         Returns:
             Optional[ActivityStatistics]: Statistics about the extracted columns, or None if extraction failed.
@@ -744,20 +500,21 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             BaseSQLMetadataExtractionActivitiesState,
             await self._get_state(workflow_args),
         )
-        if not state.sql_client or not state.sql_client.engine:
-            logger.error("SQL client or engine not initialized")
-            raise ValueError("SQL client or engine not initialized")
+        if not state.sql_client:
+            logger.error("SQL client not initialized")
+            raise ValueError("SQL client not initialized")
 
         prepared_query = prepare_query(
             query=self.fetch_column_sql,
             workflow_args=workflow_args,
             temp_table_regex_sql=self.extract_temp_table_regex_column_sql,
         )
+        base_output_path = workflow_args.get("output_path", "")
         statistics = await self.query_executor(
-            sql_engine=state.sql_client.engine,
+            sql_client=state.sql_client,
             sql_query=prepared_query,
             workflow_args=workflow_args,
-            output_suffix="raw/column",
+            output_path=os.path.join(base_output_path, "raw"),
             typename="column",
         )
         return statistics
@@ -770,9 +527,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         """Fetch procedures from the source database.
 
         Args:
-            batch_input: DataFrame containing the raw column data.
-            raw_output: JsonOutput instance for writing raw data.
-            **kwargs: Additional keyword arguments.
+            workflow_args: Dictionary containing arguments for the workflow.
 
         Returns:
             Optional[ActivityStatistics]: Statistics about the extracted procedures, or None if extraction failed.
@@ -781,18 +536,19 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             BaseSQLMetadataExtractionActivitiesState,
             await self._get_state(workflow_args),
         )
-        if not state.sql_client or not state.sql_client.engine:
-            logger.error("SQL client or engine not initialized")
-            raise ValueError("SQL client or engine not initialized")
+        if not state.sql_client:
+            logger.error("SQL client not initialized")
+            raise ValueError("SQL client not initialized")
 
         prepared_query = prepare_query(
             query=self.fetch_procedure_sql, workflow_args=workflow_args
         )
+        base_output_path = workflow_args.get("output_path", "")
         statistics = await self.query_executor(
-            sql_engine=state.sql_client.engine,
+            sql_client=state.sql_client,
             sql_query=prepared_query,
             workflow_args=workflow_args,
-            output_suffix="raw/extras-procedure",
+            output_path=os.path.join(base_output_path, "raw"),
             typename="extras-procedure",
         )
         return statistics
@@ -807,7 +563,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
 
         Args:
             raw_input (Any): Input data to transform.
-            transformed_output (JsonOutput): Output handler for transformed data.
+            transformed_output (JsonFileWriter): Output handler for transformed data.
             **kwargs: Additional keyword arguments.
 
         Returns:
@@ -824,17 +580,18 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             self._validate_output_args(workflow_args)
         )
 
-        raw_input = ParquetInput(
+        raw_input = ParquetFileReader(
             path=os.path.join(output_path, "raw"),
             file_names=workflow_args.get("file_names"),
+            dataframe_type=DataframeType.daft,
         )
-        raw_input = raw_input.get_batched_daft_dataframe()
+        raw_input = raw_input.read_batches()
 
-        transformed_output = JsonOutput(
-            output_path=output_path,
-            output_suffix="transformed",
+        transformed_output = JsonFileWriter(
+            path=os.path.join(output_path, "transformed"),
             typename=typename,
             chunk_start=workflow_args.get("chunk_start"),
+            dataframe_type=DataframeType.daft,
         )
         if state.transformer:
             workflow_args["connection_name"] = workflow_args.get("connection", {}).get(
@@ -849,8 +606,8 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
             transform_metadata = state.transformer.transform_metadata(
                 dataframe=dataframe, **workflow_args
             )
-            await transformed_output.write_daft_dataframe(transform_metadata)
-            return await transformed_output.get_statistics(typename=typename)
+            await transformed_output.write(transform_metadata)
+            return await transformed_output.close()
 
     @activity.defn
     @auto_heartbeater
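
Note: the reader/writer pairing from the last three hunks can be assembled into a rough end-to-end sketch of the transform path. The wrapper function, the transformer variable, and the loop shape are assumptions (the surrounding code is not part of this diff); the constructor arguments and the write()/close() calls are taken from the hunks above:

    async def transform_raw_to_json(output_path, typename, workflow_args, transformer):
        # Hypothetical wrapper; mirrors the transform activity body shown above.
        raw_reader = ParquetFileReader(
            path=os.path.join(output_path, "raw"),
            file_names=workflow_args.get("file_names"),
            dataframe_type=DataframeType.daft,
        )
        batches = raw_reader.read_batches()

        transformed_output = JsonFileWriter(
            path=os.path.join(output_path, "transformed"),
            typename=typename,
            chunk_start=workflow_args.get("chunk_start"),
            dataframe_type=DataframeType.daft,
        )
        for dataframe in batches:  # iteration style is an assumption, not shown in the diff
            transformed = transformer.transform_metadata(dataframe=dataframe, **workflow_args)
            await transformed_output.write(transformed)

        # close() finalizes the JSON output and returns the ActivityStatistics
        return await transformed_output.close()
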