atlan-application-sdk 0.1.1rc40__py3-none-any.whl → 0.1.1rc42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24)
  1. application_sdk/activities/common/utils.py +78 -4
  2. application_sdk/activities/metadata_extraction/sql.py +400 -27
  3. application_sdk/application/__init__.py +2 -0
  4. application_sdk/application/metadata_extraction/sql.py +3 -0
  5. application_sdk/clients/models.py +42 -0
  6. application_sdk/clients/sql.py +17 -13
  7. application_sdk/common/aws_utils.py +259 -11
  8. application_sdk/common/utils.py +145 -9
  9. application_sdk/handlers/__init__.py +8 -1
  10. application_sdk/handlers/sql.py +63 -22
  11. application_sdk/inputs/__init__.py +98 -2
  12. application_sdk/inputs/json.py +59 -87
  13. application_sdk/inputs/parquet.py +173 -94
  14. application_sdk/observability/decorators/observability_decorator.py +36 -22
  15. application_sdk/server/fastapi/__init__.py +59 -3
  16. application_sdk/server/fastapi/models.py +27 -0
  17. application_sdk/test_utils/hypothesis/strategies/inputs/json_input.py +10 -5
  18. application_sdk/test_utils/hypothesis/strategies/inputs/parquet_input.py +9 -4
  19. application_sdk/version.py +1 -1
  20. {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc42.dist-info}/METADATA +1 -1
  21. {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc42.dist-info}/RECORD +24 -23
  22. {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc42.dist-info}/WHEEL +0 -0
  23. {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc42.dist-info}/licenses/LICENSE +0 -0
  24. {atlan_application_sdk-0.1.1rc40.dist-info → atlan_application_sdk-0.1.1rc42.dist-info}/licenses/NOTICE +0 -0
application_sdk/activities/common/utils.py
@@ -5,10 +5,11 @@ including workflow ID retrieval, automatic heartbeating, and periodic heartbeat
 """
 
 import asyncio
+import glob
 import os
 from datetime import timedelta
 from functools import wraps
-from typing import Any, Awaitable, Callable, Optional, TypeVar, cast
+from typing import Any, Awaitable, Callable, List, Optional, TypeVar, cast
 
 from temporalio import activity
 
@@ -79,17 +80,47 @@ def build_output_path() -> str:
 
 def get_object_store_prefix(path: str) -> str:
     """Get the object store prefix for the path.
+
+    This function handles two types of paths:
+    1. Paths under TEMPORARY_PATH - converts them to relative object store paths
+    2. User-provided paths - returns them as-is (already relative object store paths)
+
     Args:
-        path: The path to the output directory.
+        path: The path to convert to object store prefix.
 
     Returns:
         The object store prefix for the path.
 
-    Example:
+    Examples:
+        >>> # Temporary path case
        >>> get_object_store_prefix("./local/tmp/artifacts/apps/appName/workflows/wf-123/run-456")
        "artifacts/apps/appName/workflows/wf-123/run-456"
+
+        >>> # User-provided path case
+        >>> get_object_store_prefix("datasets/sales/2024/")
+        "datasets/sales/2024"
     """
-    return os.path.relpath(path, TEMPORARY_PATH)
+    # Normalize paths for comparison
+    abs_path = os.path.abspath(path)
+    abs_temp_path = os.path.abspath(TEMPORARY_PATH)
+
+    # Check if path is under TEMPORARY_PATH
+    try:
+        # Use os.path.commonpath to properly check if path is under temp directory
+        # This prevents false positives like '/tmp/local123' matching '/tmp/local'
+        common_path = os.path.commonpath([abs_path, abs_temp_path])
+        if common_path == abs_temp_path:
+            # Path is under temp directory, convert to relative object store path
+            relative_path = os.path.relpath(abs_path, abs_temp_path)
+            # Normalize path separators to forward slashes for object store
+            return relative_path.replace(os.path.sep, "/")
+        else:
+            # Path is already a relative object store path, return as-is
+            return path.strip("/")
+    except ValueError:
+        # os.path.commonpath or os.path.relpath can raise ValueError on Windows with different drives
+        # In this case, treat as user-provided path, return as-is
+        return path.strip("/")
 
 
 def auto_heartbeater(fn: F) -> F:
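
The commonpath check above is worth seeing in isolation: unlike a plain startswith comparison, it matches whole path components. A minimal standalone sketch (hypothetical paths, standard library only; TEMPORARY_PATH here stands in for the SDK constant of the same name):

import os

TEMPORARY_PATH = "./local/tmp"  # assumption: mirrors the SDK constant

def is_under(path: str, root: str) -> bool:
    # commonpath compares whole components, so "./local/tmp123" is NOT under "./local/tmp"
    try:
        abs_path, abs_root = os.path.abspath(path), os.path.abspath(root)
        return os.path.commonpath([abs_path, abs_root]) == abs_root
    except ValueError:
        # raised on Windows when the two paths sit on different drives
        return False

assert is_under("./local/tmp/artifacts/x", TEMPORARY_PATH)
assert not is_under("./local/tmp123/artifacts/x", TEMPORARY_PATH)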
@@ -199,3 +230,46 @@ async def send_periodic_heartbeat(delay: float, *details: Any) -> None:
     while True:
         await asyncio.sleep(delay)
         activity.heartbeat(*details)
+
+
+def find_local_files_by_extension(
+    path: str,
+    extension: str,
+    file_names: Optional[List[str]] = None,
+) -> List[str]:
+    """Find local files at the specified local path, optionally filtering by file names.
+
+    Args:
+        path (str): Local path to search in (file or directory)
+        extension (str): File extension to filter by (e.g., '.parquet', '.json')
+        file_names (Optional[List[str]]): List of file names (basenames) to filter by; paths are not supported
+
+    Returns:
+        List[str]: List of matching file paths
+
+    Example:
+        >>> find_local_files_by_extension("/data", ".parquet", ["file1.parquet", "file2.parquet"])
+        ['/data/file1.parquet', '/data/file2.parquet']
+
+        >>> find_local_files_by_extension("/data/single.json", ".json")
+        ['/data/single.json']
+    """
+    if os.path.isfile(path) and path.endswith(extension):
+        # Single file - return it directly
+        return [path]
+
+    elif os.path.isdir(path):
+        # Directory - find all files in directory
+        all_files = glob.glob(
+            os.path.join(path, "**", f"*{extension}"),
+            recursive=True,
+        )
+
+        # Filter by file names if specified
+        if file_names:
+            file_names_set = set(file_names)  # Convert to set for O(1) lookup
+            return [f for f in all_files if os.path.basename(f) in file_names_set]
+        else:
+            return all_files
+
+    return []
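
Since the directory branch globs with "**" and recursive=True, files in nested subdirectories are matched as well, and the optional file_names filter compares basenames only. An illustrative run against a hypothetical layout (not from the diff):

# Layout: /data/a.parquet, /data/2024/b.parquet, /data/2024/c.json
find_local_files_by_extension("/data", ".parquet")
# -> ['/data/a.parquet', '/data/2024/b.parquet']

find_local_files_by_extension("/data", ".parquet", file_names=["b.parquet"])
# -> ['/data/2024/b.parquet']  (matched by basename, wherever it sits)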
application_sdk/activities/metadata_extraction/sql.py
@@ -1,5 +1,20 @@
 import os
-from typing import Any, Dict, Optional, Tuple, Type, cast
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    AsyncGenerator,
+    AsyncIterator,
+    Dict,
+    Generator,
+    Iterator,
+    List,
+    Optional,
+    Tuple,
+    Type,
+    Union,
+    cast,
+    overload,
+)
 
 from temporalio import activity
 
@@ -13,7 +28,12 @@ from application_sdk.activities.common.utils import (
 from application_sdk.clients.sql import BaseSQLClient
 from application_sdk.common.dataframe_utils import is_empty_dataframe
 from application_sdk.common.error_codes import ActivityError
-from application_sdk.common.utils import prepare_query, read_sql_files
+from application_sdk.common.utils import (
+    get_database_names,
+    parse_credentials_extra,
+    prepare_query,
+    read_sql_files,
+)
 from application_sdk.constants import APP_TENANT_ID, APPLICATION_NAME, SQL_QUERIES_PATH
 from application_sdk.handlers.sql import BaseSQLHandler
 from application_sdk.inputs.parquet import ParquetInput
@@ -31,6 +51,9 @@ activity.logger = logger
 
 queries = read_sql_files(queries_prefix=SQL_QUERIES_PATH)
 
+if TYPE_CHECKING:
+    import pandas as pd
+
 
 class BaseSQLMetadataExtractionActivitiesState(ActivitiesState):
     """State class for SQL metadata extraction activities.
@@ -90,6 +113,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         sql_client_class: Optional[Type[BaseSQLClient]] = None,
         handler_class: Optional[Type[BaseSQLHandler]] = None,
         transformer_class: Optional[Type[TransformerInterface]] = None,
+        multidb: bool = False,
     ):
         """Initialize the SQL metadata extraction activities.
 
@@ -100,6 +124,8 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
                 Defaults to BaseSQLHandler.
             transformer_class (Type[TransformerInterface], optional): Class for metadata transformation.
                 Defaults to QueryBasedTransformer.
+            multidb (bool): When True, executes queries across multiple databases using
+                `multidb_query_executor`. Defaults to False.
         """
         if sql_client_class:
             self.sql_client_class = sql_client_class
@@ -108,6 +134,9 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):
         if transformer_class:
             self.transformer_class = transformer_class
 
+        # Control whether to execute per-db using multidb executor
+        self.multidb = multidb
+
         super().__init__()
 
         # State methods
@@ -206,6 +235,7 @@
             raise ValueError("Missing required workflow arguments")
         return output_prefix, output_path, typename, workflow_id, workflow_run_id
 
+    @overload
     async def query_executor(
         self,
         sql_engine: Any,
@@ -213,7 +243,38 @@
         workflow_args: Dict[str, Any],
         output_suffix: str,
         typename: str,
-    ) -> Optional[ActivityStatistics]:
+        write_to_file: bool = True,
+        concatenate: bool = False,
+        return_dataframe: bool = False,
+        sql_client: Optional[BaseSQLClient] = None,
+    ) -> Optional[ActivityStatistics]: ...
+
+    @overload
+    async def query_executor(
+        self,
+        sql_engine: Any,
+        sql_query: Optional[str],
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        typename: str,
+        write_to_file: bool = True,
+        concatenate: bool = False,
+        return_dataframe: bool = True,
+        sql_client: Optional[BaseSQLClient] = None,
+    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]: ...
+
+    async def query_executor(
+        self,
+        sql_engine: Any,
+        sql_query: Optional[str],
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        typename: str,
+        write_to_file: bool = True,
+        concatenate: bool = False,
+        return_dataframe: bool = False,
+        sql_client: Optional[BaseSQLClient] = None,
+    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
         """
         Executes a SQL query using the provided engine and saves the results to Parquet.
 
@@ -233,44 +294,358 @@
             typename: Type name used for generating output statistics.
 
         Returns:
-            Optional[ActivityStatistics]: Statistics about the generated Parquet file,
-            or None if the query is empty or execution fails before writing output.
+            Optional[Union[ActivityStatistics, pd.DataFrame]]: Statistics about the generated Parquet file,
+            or a DataFrame if return_dataframe=True, or None if the query is empty or execution fails.
 
         Raises:
             ValueError: If `sql_engine` is not provided.
         """
+        # Common pre-checks and setup shared by both multidb and single-db paths
+        if not sql_query:
+            logger.warning("Query is empty, skipping execution.")
+            return None
+
         if not sql_engine:
             logger.error("SQL engine is not set.")
             raise ValueError("SQL engine must be provided.")
-        if not sql_query:
-            logger.warning("Query is empty, skipping execution.")
+
+        # Setup parquet output using helper method
+        parquet_output = self._setup_parquet_output(
+            workflow_args, output_suffix, write_to_file
+        )
+
+        # If multidb mode is enabled, run per-database flow
+        if getattr(self, "multidb", False):
+            return await self._execute_multidb_flow(
+                sql_client,
+                sql_query,
+                workflow_args,
+                output_suffix,
+                typename,
+                write_to_file,
+                concatenate,
+                return_dataframe,
+                parquet_output,
+            )
+
+        # Single-db execution path
+        # Prepare query for single-db execution
+        prepared_query = self._prepare_database_query(
+            sql_query, None, workflow_args, typename
+        )
+
+        # Execute using helper method
+        success, _ = await self._execute_single_db(
+            sql_engine, prepared_query, parquet_output, write_to_file
+        )
+
+        if not success:
+            logger.error("Failed to execute single-db query")
             return None
 
-        try:
-            sql_input = SQLQueryInput(engine=sql_engine, query=sql_query)
-            dataframe = await sql_input.get_batched_dataframe()
+        if parquet_output:
+            logger.info(
+                f"Successfully wrote query results to {parquet_output.get_full_path()}"
+            )
+            return await parquet_output.get_statistics(typename=typename)
+
+        logger.warning("No parquet output configured for single-db execution")
+        return None
+
+    def _setup_parquet_output(
+        self,
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        write_to_file: bool,
+    ) -> Optional[ParquetOutput]:
+        if not write_to_file:
+            return None
+        output_prefix = workflow_args.get("output_prefix")
+        output_path = workflow_args.get("output_path")
+        if not output_prefix or not output_path:
+            logger.error("Output prefix or path not provided in workflow_args.")
+            raise ValueError(
+                "Output prefix and path must be specified in workflow_args."
+            )
+        return ParquetOutput(
+            output_prefix=output_prefix,
+            output_path=output_path,
+            output_suffix=output_suffix,
+        )
 
-            output_prefix = workflow_args.get("output_prefix")
-            output_path = workflow_args.get("output_path")
+    def _get_temp_table_regex_sql(self, typename: str) -> str:
+        """Get the appropriate temp table regex SQL based on typename."""
+        if typename == "column":
+            return self.extract_temp_table_regex_column_sql or ""
+        elif typename == "table":
+            return self.extract_temp_table_regex_table_sql or ""
+        else:
+            return ""
 
-            if not output_prefix or not output_path:
-                logger.error("Output prefix or path not provided in workflow_args.")
-                raise ValueError(
-                    "Output prefix and path must be specified in workflow_args."
+    def _prepare_database_query(
+        self,
+        sql_query: str,
+        database_name: Optional[str],
+        workflow_args: Dict[str, Any],
+        typename: str,
+        use_posix_regex: bool = False,
+    ) -> Optional[str]:
+        """Prepare query for database execution with proper substitutions."""
+        # Replace database name placeholder if provided
+        fetch_sql = sql_query
+        if database_name:
+            fetch_sql = fetch_sql.replace("{database_name}", database_name)
+
+        # Get temp table regex SQL
+        temp_table_regex_sql = self._get_temp_table_regex_sql(typename)
+
+        # Prepare the query
+        prepared_query = prepare_query(
+            query=fetch_sql,
+            workflow_args=workflow_args,
+            temp_table_regex_sql=temp_table_regex_sql,
+            use_posix_regex=use_posix_regex,
+        )
+
+        if prepared_query is None:
+            db_context = f" for database {database_name}" if database_name else ""
+            raise ValueError(f"Failed to prepare query{db_context}")
+
+        return prepared_query
+
+    async def _setup_database_connection(
+        self,
+        sql_client: BaseSQLClient,
+        database_name: str,
+    ) -> None:
+        """Setup connection for a specific database."""
+        extra = parse_credentials_extra(sql_client.credentials)
+        extra["database"] = database_name
+        sql_client.credentials["extra"] = extra
+        await sql_client.load(sql_client.credentials)
+
+    # NOTE: Consolidated: per-database processing is now inlined in the multi-DB loop
+
+    async def _finalize_multidb_results(
+        self,
+        write_to_file: bool,
+        concatenate: bool,
+        return_dataframe: bool,
+        parquet_output: Optional[ParquetOutput],
+        dataframe_list: List[
+            Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
+        ],
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        typename: str,
+    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
+        """Finalize results for multi-database execution."""
+        if write_to_file and parquet_output:
+            return await parquet_output.get_statistics(typename=typename)
+
+        if not write_to_file and concatenate:
+            try:
+                import pandas as pd  # type: ignore
+
+                valid_dataframes: List[pd.DataFrame] = []
+                for df_generator in dataframe_list:
+                    if df_generator is None:
+                        continue
+                    for dataframe in df_generator:  # type: ignore[assignment]
+                        if dataframe is None:
+                            continue
+                        if hasattr(dataframe, "empty") and getattr(dataframe, "empty"):
+                            continue
+                        valid_dataframes.append(dataframe)
+
+                if not valid_dataframes:
+                    logger.warning(
+                        "No valid dataframes collected across databases for concatenation"
+                    )
+                    return None
+
+                concatenated = pd.concat(valid_dataframes, ignore_index=True)
+
+                if return_dataframe:
+                    return concatenated  # type: ignore[return-value]
+
+                # Create new parquet output for concatenated data
+                concatenated_parquet_output = self._setup_parquet_output(
+                    workflow_args, output_suffix, True
                 )
+                if concatenated_parquet_output:
+                    await concatenated_parquet_output.write_dataframe(concatenated)  # type: ignore[arg-type]
+                    return await concatenated_parquet_output.get_statistics(
+                        typename=typename
+                    )
+            except Exception as e:  # noqa: BLE001
+                logger.error(
+                    f"Error concatenating multi-DB dataframes: {str(e)}",
+                    exc_info=True,
+                )
+                raise
+
+        logger.warning(
+            "multidb execution returned no output to write (write_to_file=False, concatenate=False)"
+        )
+        return None
 
-            parquet_output = ParquetOutput(
-                output_prefix=output_prefix,
-                output_path=output_path,
-                output_suffix=output_suffix,
+    async def _execute_multidb_flow(
+        self,
+        sql_client: Optional[BaseSQLClient],
+        sql_query: str,
+        workflow_args: Dict[str, Any],
+        output_suffix: str,
+        typename: str,
+        write_to_file: bool,
+        concatenate: bool,
+        return_dataframe: bool,
+        parquet_output: Optional[ParquetOutput],
+    ) -> Optional[Union[ActivityStatistics, "pd.DataFrame"]]:
+        """Execute multi-database flow with proper error handling and result finalization."""
+        # Get effective SQL client
+        effective_sql_client = sql_client
+        if effective_sql_client is None:
+            state = cast(
+                BaseSQLMetadataExtractionActivitiesState,
+                await self._get_state(workflow_args),
             )
-            await parquet_output.write_batched_dataframe(dataframe)
-            logger.info(
-                f"Successfully wrote query results to {parquet_output.get_full_path()}"
+            effective_sql_client = state.sql_client
+
+        if not effective_sql_client:
+            logger.error("SQL client not initialized for multidb execution")
+            raise ValueError("SQL client not initialized")
+
+        # Resolve databases to iterate
+        database_names = await get_database_names(
+            effective_sql_client, workflow_args, self.fetch_database_sql
+        )
+        if not database_names:
+            logger.warning("No databases found to process")
+            return None
+
+        # Validate client
+        if not effective_sql_client.engine:
+            logger.error("SQL client engine not initialized")
+            raise ValueError("SQL client engine not initialized")
+
+        successful_databases: List[str] = []
+        failed_databases: List[str] = []
+        dataframe_list: List[
+            Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]
+        ] = []
+
+        # Iterate databases and execute (consolidated single-db processing)
+        for database_name in database_names or []:
+            try:
+                # Setup connection for this database
+                await self._setup_database_connection(
+                    effective_sql_client, database_name
+                )
+
+                # Prepare query for this database
+                prepared_query = self._prepare_database_query(
+                    sql_query,
+                    database_name,
+                    workflow_args,
+                    typename,
+                    use_posix_regex=True,
+                )
+
+                # Execute using helper method
+                success, batched_iter = await self._execute_single_db(
+                    effective_sql_client.engine,
+                    prepared_query,
+                    parquet_output,
+                    write_to_file,
+                )
+
+                if success:
+                    logger.info(f"Successfully processed database: {database_name}")
+                else:
+                    logger.warning(
+                        f"Failed to execute query for database: {database_name}"
+                    )
+            except Exception as e:  # noqa: BLE001
+                logger.warning(
+                    f"Failed to process database '{database_name}': {str(e)}. Skipping to next database."
+                )
+                success, batched_iter = False, None
+
+            if success:
+                successful_databases.append(database_name)
+                if not write_to_file and batched_iter:
+                    dataframe_list.append(batched_iter)
+            else:
+                failed_databases.append(database_name)
+
+        # Log results
+        logger.info(
+            f"Successfully processed {len(successful_databases)} databases: {successful_databases}"
+        )
+        if failed_databases:
+            logger.warning(
+                f"Failed to process {len(failed_databases)} databases: {failed_databases}"
             )
 
-            statistics = await parquet_output.get_statistics(typename=typename)
-            return statistics
+        # Finalize results
+        return await self._finalize_multidb_results(
+            write_to_file,
+            concatenate,
+            return_dataframe,
+            parquet_output,
+            dataframe_list,
+            workflow_args,
+            output_suffix,
+            typename,
+        )
+
+    async def _execute_single_db(
+        self,
+        sql_engine: Any,
+        prepared_query: Optional[str],
+        parquet_output: Optional[ParquetOutput],
+        write_to_file: bool,
+    ) -> Tuple[
+        bool, Optional[Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]]
+    ]:  # type: ignore
+        if not prepared_query:
+            logger.error("Prepared query is None, cannot execute")
+            return False, None
+
+        try:
+            sql_input = SQLQueryInput(engine=sql_engine, query=prepared_query)
+            batched_iter = await sql_input.get_batched_dataframe()
+
+            if write_to_file and parquet_output:
+                # Wrap iterator into a proper (async)generator for type safety
+                if hasattr(batched_iter, "__anext__"):
+
+                    async def _to_async_gen(
+                        it: AsyncIterator["pd.DataFrame"],
+                    ) -> AsyncGenerator["pd.DataFrame", None]:
+                        async for item in it:
+                            yield item
+
+                    wrapped: AsyncGenerator["pd.DataFrame", None] = _to_async_gen(  # type: ignore
+                        batched_iter  # type: ignore
+                    )
+                    await parquet_output.write_batched_dataframe(wrapped)
+                else:
+
+                    def _to_gen(
+                        it: Iterator["pd.DataFrame"],
+                    ) -> Generator["pd.DataFrame", None, None]:
+                        for item in it:
+                            yield item
+
+                    wrapped_sync: Generator["pd.DataFrame", None, None] = _to_gen(  # type: ignore
+                        batched_iter  # type: ignore
+                    )
+                    await parquet_output.write_batched_dataframe(wrapped_sync)
+                return True, None
+
+            return True, batched_iter
         except Exception as e:
             logger.error(
                 f"Error during query execution or output writing: {e}", exc_info=True
@@ -485,9 +860,7 @@
 
         raw_input = ParquetInput(
             path=os.path.join(output_path, "raw"),
-            input_prefix=output_prefix,
             file_names=workflow_args.get("file_names"),
-            chunk_size=None,
         )
         raw_input = raw_input.get_batched_daft_dataframe()
         transformed_output = JsonOutput(
application_sdk/application/__init__.py
@@ -164,6 +164,7 @@ class BaseApplication:
         self,
         workflow_class,
         ui_enabled: bool = True,
+        has_configmap: bool = False,
     ):
         """
        Optionally set up a server for the application. (No-op by default)
@@ -176,6 +177,7 @@ class BaseApplication:
             workflow_client=self.workflow_client,
             ui_enabled=ui_enabled,
             handler=self.handler_class(client=self.client_class()),
+            has_configmap=has_configmap,
         )
 
         if self.event_subscriptions:
application_sdk/application/metadata_extraction/sql.py
@@ -161,12 +161,14 @@ class BaseSQLMetadataExtractionApplication(BaseApplication):
         workflow_class: Type[
             BaseSQLMetadataExtractionWorkflow
         ] = BaseSQLMetadataExtractionWorkflow,
+        has_configmap: bool = False,
     ) -> Any:
         """
         Set up the FastAPI server for the SQL metadata extraction application.
 
         Args:
             workflow_class (Type): Workflow class to register with the server. Defaults to BaseSQLMetadataExtractionWorkflow.
+            has_configmap (bool): Whether the application has a configmap. Defaults to False.
 
         Returns:
             Any: None
@@ -178,6 +180,7 @@ class BaseSQLMetadataExtractionApplication(BaseApplication):
         self.server = APIServer(
             handler=self.handler_class(sql_client=self.client_class()),
             workflow_client=self.workflow_client,
+            has_configmap=has_configmap,
         )
 
         # register the workflow on the application server
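
Both setup hooks now thread the flag through to APIServer. A hedged sketch of opting in; the application constructor arguments and client classes here are hypothetical, and the enclosing method is assumed to be setup_server, as its docstring suggests:

app = BaseSQLMetadataExtractionApplication(
    name="my-sql-app",          # hypothetical constructor arguments
    client_class=MySQLClient,   # hypothetical BaseSQLClient subclass
    handler_class=MySQLHandler, # hypothetical BaseSQLHandler subclass
)

await app.setup_server(
    workflow_class=BaseSQLMetadataExtractionWorkflow,
    has_configmap=True,  # forwarded as APIServer(..., has_configmap=True)
)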