atlan-application-sdk 0.1.1rc41__py3-none-any.whl → 0.1.1rc42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,10 +5,11 @@ including workflow ID retrieval, automatic heartbeating, and periodic heartbeat
  """

  import asyncio
+ import glob
  import os
  from datetime import timedelta
  from functools import wraps
- from typing import Any, Awaitable, Callable, Optional, TypeVar, cast
+ from typing import Any, Awaitable, Callable, List, Optional, TypeVar, cast

  from temporalio import activity

@@ -79,17 +80,47 @@ def build_output_path() -> str:

  def get_object_store_prefix(path: str) -> str:
      """Get the object store prefix for the path.
+
+     This function handles two types of paths:
+     1. Paths under TEMPORARY_PATH - converts them to relative object store paths
+     2. User-provided paths - returns them as-is (already relative object store paths)
+
      Args:
-         path: The path to the output directory.
+         path: The path to convert to object store prefix.

      Returns:
          The object store prefix for the path.

-     Example:
+     Examples:
+         >>> # Temporary path case
          >>> get_object_store_prefix("./local/tmp/artifacts/apps/appName/workflows/wf-123/run-456")
          "artifacts/apps/appName/workflows/wf-123/run-456"
+
+         >>> # User-provided path case
+         >>> get_object_store_prefix("datasets/sales/2024/")
+         "datasets/sales/2024"
      """
-     return os.path.relpath(path, TEMPORARY_PATH)
+     # Normalize paths for comparison
+     abs_path = os.path.abspath(path)
+     abs_temp_path = os.path.abspath(TEMPORARY_PATH)
+
+     # Check if path is under TEMPORARY_PATH
+     try:
+         # Use os.path.commonpath to properly check if path is under temp directory
+         # This prevents false positives like '/tmp/local123' matching '/tmp/local'
+         common_path = os.path.commonpath([abs_path, abs_temp_path])
+         if common_path == abs_temp_path:
+             # Path is under temp directory, convert to relative object store path
+             relative_path = os.path.relpath(abs_path, abs_temp_path)
+             # Normalize path separators to forward slashes for object store
+             return relative_path.replace(os.path.sep, "/")
+         else:
+             # Path is already a relative object store path, return as-is
+             return path.strip("/")
+     except ValueError:
+         # os.path.commonpath or os.path.relpath can raise ValueError on Windows with different drives
+         # In this case, treat as user-provided path, return as-is
+         return path.strip("/")


  def auto_heartbeater(fn: F) -> F:
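
A short usage sketch of the new behaviour, assuming TEMPORARY_PATH resolves to "./local/tmp" as the doctest above implies (illustrative only, not part of the diff):

from application_sdk.activities.common.utils import get_object_store_prefix

# A path under the temporary directory is rewritten to a store-relative prefix:
get_object_store_prefix("./local/tmp/artifacts/apps/appName/workflows/wf-123/run-456")
# -> "artifacts/apps/appName/workflows/wf-123/run-456"

# A path outside TEMPORARY_PATH is now returned as-is (the previous
# os.path.relpath-only implementation would have produced a "../"-style
# result for this input):
get_object_store_prefix("datasets/sales/2024/")
# -> "datasets/sales/2024"
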
@@ -199,3 +230,46 @@ async def send_periodic_heartbeat(delay: float, *details: Any) -> None:
      while True:
          await asyncio.sleep(delay)
          activity.heartbeat(*details)
+
+
+ def find_local_files_by_extension(
+     path: str,
+     extension: str,
+     file_names: Optional[List[str]] = None,
+ ) -> List[str]:
+     """Find local files at the specified local path, optionally filtering by file names.
+
+     Args:
+         path (str): Local path to search in (file or directory)
+         extension (str): File extension to filter by (e.g., '.parquet', '.json')
+         file_names (Optional[List[str]]): List of file names (basenames) to filter by, paths are not supported
+
+     Returns:
+         List[str]: List of matching file paths
+
+     Example:
+         >>> find_local_files_by_extension("/data", ".parquet", ["file1.parquet", "file2.parquet"])
+         ['file1.parquet', 'file2.parquet']
+
+         >>> find_local_files_by_extension("/data/single.json", ".json")
+         ['single.json']
+     """
+     if os.path.isfile(path) and path.endswith(extension):
+         # Single file - return it directly
+         return [path]
+
+     elif os.path.isdir(path):
+         # Directory - find all files in directory
+         all_files = glob.glob(
+             os.path.join(path, "**", f"*{extension}"),
+             recursive=True,
+         )
+
+         # Filter by file names if specified
+         if file_names:
+             file_names_set = set(file_names)  # Convert to set for O(1) lookup
+             return [f for f in all_files if os.path.basename(f) in file_names_set]
+         else:
+             return all_files
+
+     return []
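
A brief usage sketch under a hypothetical /data/run-1 layout; because the directory branch uses glob, the returned entries include the directory prefix, and the recursive "**" pattern also matches files in subdirectories (listing order may vary):

from application_sdk.activities.common.utils import find_local_files_by_extension

# Hypothetical layout:
#   /data/run-1/chunk-0.parquet
#   /data/run-1/nested/chunk-1.parquet
find_local_files_by_extension("/data/run-1", ".parquet")
# -> ['/data/run-1/chunk-0.parquet', '/data/run-1/nested/chunk-1.parquet']

# The file_names filter matches on basenames only:
find_local_files_by_extension("/data/run-1", ".parquet", ["chunk-1.parquet"])
# -> ['/data/run-1/nested/chunk-1.parquet']
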
@@ -860,9 +860,7 @@ class BaseSQLMetadataExtractionActivities(ActivitiesInterface):

          raw_input = ParquetInput(
              path=os.path.join(output_path, "raw"),
-             input_prefix=output_prefix,
              file_names=workflow_args.get("file_names"),
-             chunk_size=None,
          )
          raw_input = raw_input.get_batched_daft_dataframe()
          transformed_output = JsonOutput(
@@ -1,7 +1,15 @@
+ import os
  from abc import ABC, abstractmethod
- from typing import TYPE_CHECKING, AsyncIterator, Iterator, Union
-
+ from typing import TYPE_CHECKING, AsyncIterator, Iterator, List, Union
+
+ from application_sdk.activities.common.utils import (
+     find_local_files_by_extension,
+     get_object_store_prefix,
+ )
+ from application_sdk.common.error_codes import IOError
+ from application_sdk.constants import TEMPORARY_PATH
  from application_sdk.observability.logger_adaptor import get_logger
+ from application_sdk.services.objectstore import ObjectStore

  logger = get_logger(__name__)

@@ -15,6 +23,94 @@ class Input(ABC):
      Abstract base class for input data sources.
      """

+     async def download_files(self) -> List[str]:
+         """Download files from object store if not available locally.
+
+         Flow:
+             1. Check if files exist locally at self.path
+             2. If not, try to download from object store
+             3. Filter by self.file_names if provided
+             4. Return list of file paths for logging purposes
+
+         Returns:
+             List[str]: List of file paths
+
+         Raises:
+             AttributeError: When the input class doesn't support file operations or _extension
+             IOError: When no files found locally or in object store
+         """
+         # Step 1: Check if files exist locally
+         local_files = find_local_files_by_extension(
+             self.path, self._EXTENSION, self.file_names
+         )
+         if local_files:
+             logger.info(
+                 f"Found {len(local_files)} {self._EXTENSION} files locally at: {self.path}"
+             )
+             return local_files
+
+         # Step 2: Try to download from object store
+         logger.info(
+             f"No local {self._EXTENSION} files found at {self.path}, checking object store..."
+         )
+
+         try:
+             # Determine what to download based on path type and filters
+             downloaded_paths = []
+
+             if self.path.endswith(self._EXTENSION):
+                 # Single file case (file_names validation already ensures this is valid)
+                 source_path = get_object_store_prefix(self.path)
+                 destination_path = os.path.join(TEMPORARY_PATH, source_path)
+                 await ObjectStore.download_file(
+                     source=source_path,
+                     destination=destination_path,
+                 )
+                 downloaded_paths.append(destination_path)
+
+             elif self.file_names:
+                 # Directory with specific files - download each file individually
+                 for file_name in self.file_names:
+                     file_path = os.path.join(self.path, file_name)
+                     source_path = get_object_store_prefix(file_path)
+                     destination_path = os.path.join(TEMPORARY_PATH, source_path)
+                     await ObjectStore.download_file(
+                         source=source_path,
+                         destination=destination_path,
+                     )
+                     downloaded_paths.append(destination_path)
+             else:
+                 # Download entire directory
+                 source_path = get_object_store_prefix(self.path)
+                 destination_path = os.path.join(TEMPORARY_PATH, source_path)
+                 await ObjectStore.download_prefix(
+                     source=source_path,
+                     destination=destination_path,
+                 )
+                 # Find the actual files in the downloaded directory
+                 found_files = find_local_files_by_extension(
+                     destination_path, self._EXTENSION, getattr(self, "file_names", None)
+                 )
+                 downloaded_paths.extend(found_files)
+
+             # Check results
+             if downloaded_paths:
+                 logger.info(
+                     f"Successfully downloaded {len(downloaded_paths)} {self._EXTENSION} files from object store"
+                 )
+                 return downloaded_paths
+             else:
+                 raise IOError(
+                     f"{IOError.OBJECT_STORE_READ_ERROR}: Downloaded from object store but no {self._EXTENSION} files found"
+                 )
+
+         except Exception as e:
+             logger.error(f"Failed to download from object store: {str(e)}")
+             raise IOError(
+                 f"{IOError.OBJECT_STORE_DOWNLOAD_ERROR}: No {self._EXTENSION} files found locally at '{self.path}' and failed to download from object store. "
+                 f"Error: {str(e)}"
+             )
+
      @abstractmethod
      async def get_batched_dataframe(
          self,
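
The new helper deliberately reads self.path, self.file_names, and the class-level _EXTENSION, which the concrete inputs below (JsonInput, ParquetInput) define. A minimal sketch of that contract with a hypothetical CSV input, used purely for illustration and not part of the SDK:

from typing import List, Optional

from application_sdk.inputs import Input


class CsvInput(Input):
    """Hypothetical input, only to illustrate the attributes download_files() expects."""

    _EXTENSION = ".csv"  # extension used for local discovery and object store lookups

    def __init__(self, path: str, file_names: Optional[List[str]] = None):
        self.path = path              # local path or object-store-relative path
        self.file_names = file_names  # optional basename filter
        # The abstract dataframe methods (get_dataframe, get_batched_dataframe, ...)
        # would still need to be implemented before this class can be instantiated.


async def resolve(csv_input: CsvInput) -> List[str]:
    # Returns local matches if present, otherwise downloads them under TEMPORARY_PATH.
    return await csv_input.download_files()
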
@@ -1,11 +1,7 @@
- import os
  from typing import TYPE_CHECKING, AsyncIterator, Iterator, List, Optional, Union

- from application_sdk.activities.common.utils import get_object_store_prefix
- from application_sdk.common.error_codes import IOError
  from application_sdk.inputs import Input
  from application_sdk.observability.logger_adaptor import get_logger
- from application_sdk.services.objectstore import ObjectStore

  if TYPE_CHECKING:
      import daft
@@ -15,56 +11,43 @@ logger = get_logger(__name__)


  class JsonInput(Input):
-     path: str
-     chunk_size: Optional[int]
-     file_names: Optional[List[str]]
-     download_file_prefix: Optional[str]
+     """
+     JSON Input class to read data from JSON files using daft and pandas.
+     Supports reading both single files and directories containing multiple JSON files.
+     """
+
+     _EXTENSION = ".json"

      def __init__(
          self,
          path: str,
          file_names: Optional[List[str]] = None,
-         download_file_prefix: Optional[str] = None,
-         chunk_size: Optional[int] = None,
+         chunk_size: int = 100000,
      ):
          """Initialize the JsonInput class.

          Args:
-             path (str): The path to the input directory.
-             file_names (Optional[List[str]]): The list of files to read.
-             download_file_prefix (Optional[str]): The prefix path in object store.
-             chunk_size (Optional[int]): The chunk size to read the data. If None, uses config value.
+             path (str): Path to a JSON file or directory containing JSON files.
+                 It accepts both kinds of paths:
+                 a local path or an object store path.
+                 Wildcards are not supported.
+             file_names (Optional[List[str]]): List of specific file names to read. Defaults to None.
+             chunk_size (int): Number of rows per batch. Defaults to 100000.
+
+         Raises:
+             ValueError: When path is not provided or when a single file path is combined with file_names
          """
+
+         # Validate that single file path and file_names are not both specified
+         if path.endswith(self._EXTENSION) and file_names:
+             raise ValueError(
+                 f"Cannot specify both a single file path ('{path}') and file_names filter. "
+                 f"Either provide a directory path with file_names, or specify the exact file path without file_names."
+             )
+
          self.path = path
-         # If chunk_size is provided, use it; otherwise default to 100,000 rows per batch
-         self.chunk_size = chunk_size if chunk_size is not None else 100000
+         self.chunk_size = chunk_size
          self.file_names = file_names
-         self.download_file_prefix = download_file_prefix
-
-     async def download_files(self):
-         """Download the files from the object store to the local path"""
-         if not self.file_names:
-             logger.debug("No files to download")
-             return
-
-         for file_name in self.file_names or []:
-             try:
-                 if self.download_file_prefix is not None and not os.path.exists(
-                     os.path.join(self.path, file_name)
-                 ):
-                     destination_file_path = os.path.join(self.path, file_name)
-                     await ObjectStore.download_file(
-                         source=get_object_store_prefix(destination_file_path),
-                         destination=destination_file_path,
-                     )
-             except IOError as e:
-                 logger.error(
-                     f"{IOError.OBJECT_STORE_DOWNLOAD_ERROR}: Error downloading file {file_name}: {str(e)}",
-                     error_code=IOError.OBJECT_STORE_DOWNLOAD_ERROR.code,
-                 )
-                 raise IOError(
-                     f"{IOError.OBJECT_STORE_DOWNLOAD_ERROR}: Error downloading file {file_name}: {str(e)}"
-                 )

      async def get_batched_dataframe(
          self,
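
For callers, this constructor change plus the shared download_files() above replaces the old explicit download_file_prefix flow. A hedged before/after sketch with illustrative values only:

from application_sdk.inputs.json import JsonInput

# 0.1.1rc41 (old): the object store prefix had to be passed explicitly.
# json_input = JsonInput(
#     path="./local/tmp/artifacts/raw",
#     file_names=["chunk-0.json"],
#     download_file_prefix="artifacts/raw",
# )

# 0.1.1rc42 (new): the path may be local or object-store-relative; any missing
# files are fetched by Input.download_files() when a read method is called.
json_input = JsonInput(
    path="./local/tmp/artifacts/raw",
    file_names=["chunk-0.json"],
    chunk_size=100000,  # default batch size
)
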
@@ -76,22 +59,20 @@ class JsonInput(Input):
          try:
              import pandas as pd

-             await self.download_files()
+             # Ensure files are available (local or downloaded)
+             json_files = await self.download_files()
+             logger.info(f"Reading {len(json_files)} JSON files in batches")

-             for file_name in self.file_names or []:
-                 file_path = os.path.join(self.path, file_name)
+             for json_file in json_files:
                  json_reader_obj = pd.read_json(
-                     file_path,
+                     json_file,
                      chunksize=self.chunk_size,
                      lines=True,
                  )
                  for chunk in json_reader_obj:
                      yield chunk
-         except IOError as e:
-             logger.error(
-                 f"{IOError.OBJECT_STORE_DOWNLOAD_ERROR}: Error reading batched data from JSON: {str(e)}",
-                 error_code=IOError.OBJECT_STORE_DOWNLOAD_ERROR.code,
-             )
+         except Exception as e:
+             logger.error(f"Error reading batched data from JSON: {str(e)}")
              raise

      async def get_dataframe(self) -> "pd.DataFrame":
@@ -102,21 +83,17 @@ class JsonInput(Input):
          try:
              import pandas as pd

-             dataframes = []
-             await self.download_files()
-             for file_name in self.file_names or []:
-                 dataframes.append(
-                     pd.read_json(
-                         os.path.join(self.path, file_name),
-                         lines=True,
-                     )
-                 )
-             return pd.concat(dataframes, ignore_index=True)
-         except IOError as e:
-             logger.error(
-                 f"{IOError.OBJECT_STORE_DOWNLOAD_ERROR}: Error reading data from JSON: {str(e)}",
-                 error_code=IOError.OBJECT_STORE_DOWNLOAD_ERROR.code,
+             # Ensure files are available (local or downloaded)
+             json_files = await self.download_files()
+             logger.info(f"Reading {len(json_files)} JSON files as pandas dataframe")
+
+             return pd.concat(
+                 (pd.read_json(json_file, lines=True) for json_file in json_files),
+                 ignore_index=True,
              )
+
+         except Exception as e:
+             logger.error(f"Error reading data from JSON: {str(e)}")
              raise

      async def get_batched_daft_dataframe(
@@ -129,18 +106,15 @@ class JsonInput(Input):
          try:
              import daft

-             await self.download_files()
-             for file_name in self.file_names or []:
-                 json_reader_obj = daft.read_json(
-                     path=os.path.join(self.path, file_name),
-                     _chunk_size=self.chunk_size,
-                 )
-                 yield json_reader_obj
-         except IOError as e:
-             logger.error(
-                 f"{IOError.OBJECT_STORE_DOWNLOAD_ERROR}: Error reading batched data from JSON: {str(e)}",
-                 error_code=IOError.OBJECT_STORE_DOWNLOAD_ERROR.code,
-             )
+             # Ensure files are available (local or downloaded)
+             json_files = await self.download_files()
+             logger.info(f"Reading {len(json_files)} JSON files as daft batches")
+
+             # Yield each discovered file as separate batch with chunking
+             for json_file in json_files:
+                 yield daft.read_json(json_file, _chunk_size=self.chunk_size)
+         except Exception as e:
+             logger.error(f"Error reading batched data from JSON using daft: {str(e)}")
              raise

      async def get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
@@ -151,14 +125,12 @@ class JsonInput(Input):
          try:
              import daft

-             await self.download_files()
-             if not self.file_names or len(self.file_names) == 0:
-                 raise ValueError("No files to read")
-             directory = os.path.join(self.path, self.file_names[0].split("/")[0])
-             return daft.read_json(path=f"{directory}/*.json")
-         except IOError as e:
-             logger.error(
-                 f"{IOError.OBJECT_STORE_DOWNLOAD_ERROR}: Error reading data from JSON using daft: {str(e)}",
-                 error_code=IOError.OBJECT_STORE_DOWNLOAD_ERROR.code,
-             )
+             # Ensure files are available (local or downloaded)
+             json_files = await self.download_files()
+             logger.info(f"Reading {len(json_files)} JSON files with daft")
+
+             # Use the discovered/downloaded files directly
+             return daft.read_json(json_files)
+         except Exception as e:
+             logger.error(f"Error reading data from JSON using daft: {str(e)}")
              raise
@@ -1,11 +1,7 @@
- import glob
- import os
  from typing import TYPE_CHECKING, AsyncIterator, Iterator, List, Optional, Union

- from application_sdk.activities.common.utils import get_object_store_prefix
  from application_sdk.inputs import Input
  from application_sdk.observability.logger_adaptor import get_logger
- from application_sdk.services.objectstore import ObjectStore

  logger = get_logger(__name__)

@@ -20,107 +16,139 @@ class ParquetInput(Input):
      Supports reading both single files and directories containing multiple parquet files.
      """

+     _EXTENSION = ".parquet"
+
      def __init__(
          self,
-         path: Optional[str] = None,
-         chunk_size: Optional[int] = 100000,
-         input_prefix: Optional[str] = None,
+         path: str,
+         chunk_size: int = 100000,
          file_names: Optional[List[str]] = None,
      ):
          """Initialize the Parquet input class.

          Args:
              path (str): Path to parquet file or directory containing parquet files.
-             chunk_size (Optional[int], optional): Number of rows per batch.
-                 Defaults to 100000.
-             input_prefix (Optional[str], optional): Prefix for files when reading from object store.
-                 If provided, files will be read from object store. Defaults to None.
-             file_names (Optional[List[str]], optional): List of file names to read.
-                 Defaults to None.
+                 It accepts both kinds of paths:
+                 a local path or an object store path.
+                 Wildcards are not supported.
+             chunk_size (int): Number of rows per batch. Defaults to 100000.
+             file_names (Optional[List[str]]): List of file names to read. Defaults to None.
+
+         Raises:
+             ValueError: When path is not provided or when a single file path is combined with file_names
          """
+
+         # Validate that single file path and file_names are not both specified
+         if path.endswith(self._EXTENSION) and file_names:
+             raise ValueError(
+                 f"Cannot specify both a single file path ('{path}') and file_names filter. "
+                 f"Either provide a directory path with file_names, or specify the exact file path without file_names."
+             )
+
          self.path = path
          self.chunk_size = chunk_size
-         self.input_prefix = input_prefix
          self.file_names = file_names

-     async def download_files(self, local_path: str) -> Optional[str]:
-         """Read a file from the object store.
-
-         Args:
-             local_path (str): Path to the local data in the temp directory.
+     async def get_dataframe(self) -> "pd.DataFrame":
+         """Read data from parquet file(s) and return as pandas DataFrame.

          Returns:
-             Optional[str]: Path to the downloaded local file.
-         """
-         # if the path is a directory, then check if the directory has any parquet files
-         parquet_files = []
-         if os.path.isdir(local_path):
-             parquet_files = glob.glob(os.path.join(local_path, "*.parquet"))
-         else:
-             parquet_files = glob.glob(local_path)
-         if not parquet_files:
-             if self.input_prefix:
-                 logger.info(
-                     f"Reading file from object store: {local_path} from {self.input_prefix}"
-                 )
-                 if os.path.isdir(local_path):
-                     await ObjectStore.download_prefix(
-                         source=get_object_store_prefix(local_path),
-                         destination=local_path,
-                     )
-                 else:
-                     await ObjectStore.download_file(
-                         source=get_object_store_prefix(local_path),
-                         destination=local_path,
-                     )
-             else:
-                 raise ValueError(
-                     f"No parquet files found in {local_path} and no input prefix provided"
-                 )
+             pd.DataFrame: Combined dataframe from specified parquet files

-     async def get_dataframe(self) -> "pd.DataFrame":
-         """
-         Method to read the data from the parquet file(s)
-         and return as a single combined pandas dataframe.
+         Raises:
+             ValueError: When no valid path can be determined or no matching files found
+             Exception: When reading parquet files fails

-         Returns:
-             "pd.DataFrame": Combined dataframe from all parquet files.
+         Example transformation:
+             Input files:
+             +------------------+
+             |  file1.parquet   |
+             |  file2.parquet   |
+             |  file3.parquet   |
+             +------------------+
+
+             With file_names=["file1.parquet", "file3.parquet"]:
+             +-------+-------+-------+
+             | col1  | col2  | col3  |
+             +-------+-------+-------+
+             | val1  | val2  | val3  |  # from file1.parquet
+             | val7  | val8  | val9  |  # from file3.parquet
+             +-------+-------+-------+
+
+         Transformations:
+             - Only specified files are read and combined
+             - Column schemas must be compatible across files
+             - Only reads files in the specified directory
          """
          try:
              import pandas as pd

-             path = self.path
-             if self.input_prefix and self.path:
-                 path = await self.download_files(self.path)
-             # Use pandas native read_parquet which can handle both single files and directories
-             return pd.read_parquet(path)
+             # Ensure files are available (local or downloaded)
+             parquet_files = await self.download_files()
+             logger.info(f"Reading {len(parquet_files)} parquet files")
+
+             return pd.concat(
+                 (pd.read_parquet(parquet_file) for parquet_file in parquet_files),
+                 ignore_index=True,
+             )
          except Exception as e:
              logger.error(f"Error reading data from parquet file(s): {str(e)}")
-             # Re-raise to match IcebergInput behavior
              raise

      async def get_batched_dataframe(
          self,
      ) -> Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]:
-         """
-         Method to read the data from the parquet file(s) in batches
-         and return as an async iterator of pandas dataframes.
+         """Read data from parquet file(s) in batches as pandas DataFrames.

          Returns:
-             AsyncIterator["pd.DataFrame"]: Async iterator of pandas dataframes.
+             AsyncIterator[pd.DataFrame]: Async iterator of pandas dataframes
+
+         Raises:
+             ValueError: When no parquet files found locally or in object store
+             Exception: When reading parquet files fails
+
+         Example transformation:
+             Input files:
+             +------------------+
+             |  file1.parquet   |
+             |  file2.parquet   |
+             |  file3.parquet   |
+             +------------------+
+
+             With file_names=["file1.parquet", "file2.parquet"] and chunk_size=2:
+             Batch 1:
+             +-------+-------+
+             | col1  | col2  |
+             +-------+-------+
+             | val1  | val2  |  # from file1.parquet
+             | val3  | val4  |  # from file1.parquet
+             +-------+-------+
+
+             Batch 2:
+             +-------+-------+
+             | col1  | col2  |
+             +-------+-------+
+             | val5  | val6  |  # from file2.parquet
+             | val7  | val8  |  # from file2.parquet
+             +-------+-------+
+
+         Transformations:
+             - Only specified files are combined then split into chunks
+             - Each batch is a separate DataFrame
+             - Only reads files in the specified directory
          """
          try:
              import pandas as pd

-             path = self.path
-             if self.input_prefix and self.path:
-                 path = await self.download_files(self.path)
-             df = pd.read_parquet(path)
-             if self.chunk_size:
+             # Ensure files are available (local or downloaded)
+             parquet_files = await self.download_files()
+             logger.info(f"Reading {len(parquet_files)} parquet files in batches")
+
+             # Process each file individually to maintain memory efficiency
+             for parquet_file in parquet_files:
+                 df = pd.read_parquet(parquet_file)
                  for i in range(0, len(df), self.chunk_size):
                      yield df.iloc[i : i + self.chunk_size]
-             else:
-                 yield df
          except Exception as e:
              logger.error(
                  f"Error reading data from parquet file(s) in batches: {str(e)}"
@@ -128,51 +156,102 @@ class ParquetInput(Input):
              raise

      async def get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
-         """
-         Method to read the data from the parquet file(s)
-         and return as a single combined daft dataframe.
+         """Read data from parquet file(s) and return as daft DataFrame.

          Returns:
-             daft.DataFrame: Combined daft dataframe from all parquet files.
+             daft.DataFrame: Combined daft dataframe from specified parquet files
+
+         Raises:
+             ValueError: When no parquet files found locally or in object store
+             Exception: When reading parquet files fails
+
+         Example transformation:
+             Input files:
+             +------------------+
+             |  file1.parquet   |
+             |  file2.parquet   |
+             |  file3.parquet   |
+             +------------------+
+
+             With file_names=["file1.parquet", "file3.parquet"]:
+             +-------+-------+-------+
+             | col1  | col2  | col3  |
+             +-------+-------+-------+
+             | val1  | val2  | val3  |  # from file1.parquet
+             | val7  | val8  | val9  |  # from file3.parquet
+             +-------+-------+-------+
+
+         Transformations:
+             - Only specified parquet files combined into single daft DataFrame
+             - Lazy evaluation for better performance
+             - Column schemas must be compatible across files
          """
          try:
              import daft  # type: ignore

-             if self.file_names:
-                 path = f"{self.path}/{self.file_names[0].split('/')[0]}"
-             else:
-                 path = self.path
-             if self.input_prefix and path:
-                 await self.download_files(path)
-             return daft.read_parquet(f"{path}/*.parquet")
+             # Ensure files are available (local or downloaded)
+             parquet_files = await self.download_files()
+             logger.info(f"Reading {len(parquet_files)} parquet files with daft")
+
+             # Use the discovered/downloaded files directly
+             return daft.read_parquet(parquet_files)
          except Exception as e:
              logger.error(
                  f"Error reading data from parquet file(s) using daft: {str(e)}"
              )
-             # Re-raise to match IcebergInput behavior
              raise

      async def get_batched_daft_dataframe(self) -> AsyncIterator["daft.DataFrame"]:  # type: ignore
-         """
-         Get batched daft dataframe from parquet file(s)
+         """Get batched daft dataframe from parquet file(s).

          Returns:
              AsyncIterator[daft.DataFrame]: An async iterator of daft DataFrames, each containing
-                 a batch of data from the parquet file(s).
+                 a batch of data from individual parquet files
+
+         Raises:
+             ValueError: When no parquet files found locally or in object store
+             Exception: When reading parquet files fails
+
+         Example transformation:
+             Input files:
+             +------------------+
+             |  file1.parquet   |
+             |  file2.parquet   |
+             |  file3.parquet   |
+             +------------------+
+
+             With file_names=["file1.parquet", "file3.parquet"]:
+             Batch 1 (file1.parquet):
+             +-------+-------+
+             | col1  | col2  |
+             +-------+-------+
+             | val1  | val2  |
+             | val3  | val4  |
+             +-------+-------+
+
+             Batch 2 (file3.parquet):
+             +-------+-------+
+             | col1  | col2  |
+             +-------+-------+
+             | val7  | val8  |
+             | val9  | val10 |
+             +-------+-------+
+
+         Transformations:
+             - Each specified file becomes a separate daft DataFrame batch
+             - Lazy evaluation for better performance
+             - Files processed individually for memory efficiency
          """
          try:
              import daft  # type: ignore

-             if self.file_names:
-                 for file_name in self.file_names:
-                     path = f"{self.path}/{file_name}"
-                     if self.input_prefix and path:
-                         await self.download_files(path)
-                     yield daft.read_parquet(path)
-             else:
-                 if self.path and self.input_prefix:
-                     await self.download_files(self.path)
-                 yield daft.read_parquet(f"{self.path}/*.parquet")
+             # Ensure files are available (local or downloaded)
+             parquet_files = await self.download_files()
+             logger.info(f"Reading {len(parquet_files)} parquet files as daft batches")
+
+             # Yield each discovered file as separate batch
+             for parquet_file in parquet_files:
+                 yield daft.read_parquet(parquet_file)

          except Exception as error:
              logger.error(
@@ -2,11 +2,17 @@ from hypothesis import strategies as st

  # Strategy for generating safe file path components
  safe_path_strategy = st.text(
-     alphabet=st.characters(),
- )
+     alphabet="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-",
+     min_size=1,
+     max_size=20,
+ ).map(lambda x: f"/data/{x}")

  # Strategy for generating file names
- file_name_strategy = st.builds(lambda name: f"{name}.json", name=safe_path_strategy)
+ file_name_strategy = st.text(
+     alphabet="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-",
+     min_size=1,
+     max_size=10,
+ ).map(lambda x: f"{x}.json")

  # Strategy for generating lists of file names
  file_names_strategy = st.lists(file_name_strategy, unique=True)
@@ -18,7 +24,6 @@ download_prefix_strategy = safe_path_strategy
  json_input_config_strategy = st.fixed_dictionaries(
      {
          "path": safe_path_strategy,
-         "download_file_prefix": download_prefix_strategy,
-         "file_names": file_names_strategy,
+         "file_names": st.one_of(st.none(), file_names_strategy),
      }
  )
@@ -2,11 +2,17 @@ from hypothesis import strategies as st

  # Strategy for generating safe file path components
  safe_path_strategy = st.text(
-     alphabet=st.characters(),
- )
+     alphabet="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-",
+     min_size=1,
+     max_size=20,
+ ).map(lambda x: f"/data/{x}")

  # Strategy for generating file names
- file_name_strategy = st.builds(lambda name: f"{name}.parquet", name=safe_path_strategy)
+ file_name_strategy = st.text(
+     alphabet="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_-",
+     min_size=1,
+     max_size=10,
+ ).map(lambda x: f"{x}.parquet")

  # Strategy for generating lists of file names
  file_names_strategy = st.lists(file_name_strategy, unique=True)
@@ -22,7 +28,6 @@ parquet_input_config_strategy = st.fixed_dictionaries(
      {
          "path": safe_path_strategy,
          "chunk_size": chunk_size_strategy,
-         "input_prefix": st.one_of(st.none(), input_prefix_strategy),
          "file_names": st.one_of(st.none(), file_names_strategy),
      }
  )
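
The tightened strategies now draw filesystem-safe names instead of arbitrary Unicode. A hedged sketch of how a property test might consume the parquet config strategy (representative values only; the chunk_size range comes from the unchanged chunk_size_strategy):

from hypothesis import given

from application_sdk.test_utils.hypothesis.strategies.inputs.parquet_input import (
    parquet_input_config_strategy,
)


@given(config=parquet_input_config_strategy)
def test_parquet_input_config_shape(config):
    # Representative draw: {"path": "/data/aB3_x", "chunk_size": ..., "file_names": ["run_01.parquet"]}
    # or file_names=None.
    assert config["path"].startswith("/data/")
    assert config["file_names"] is None or all(
        name.endswith(".parquet") for name in config["file_names"]
    )
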
@@ -2,4 +2,4 @@
  Version information for the application_sdk package.
  """

- __version__ = "0.1.1rc41"
+ __version__ = "0.1.1rc42"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: atlan-application-sdk
- Version: 0.1.1rc41
+ Version: 0.1.1rc42
  Summary: Atlan Application SDK is a Python library for developing applications on the Atlan Platform
  Project-URL: Repository, https://github.com/atlanhq/application-sdk
  Project-URL: Documentation, https://github.com/atlanhq/application-sdk/README.md
@@ -1,17 +1,17 @@
  application_sdk/__init__.py,sha256=2e2mvmLJ5dxmJGPELtb33xwP-j6JMdoIuqKycEn7hjg,151
  application_sdk/constants.py,sha256=1THiejjOEgm4kHFN-PrwrUkfRk7q1pjOLWLm-t2ph1Q,10674
- application_sdk/version.py,sha256=zV2HH6SOEd5nUl8rfci_44ydqpKFM-6ShVxYm8ggtCk,88
+ application_sdk/version.py,sha256=Q8mQScZPLmTAD4YMM35CTgtJ-tDFJqdKLeeq94J4GkU,88
  application_sdk/worker.py,sha256=i5f0AeKI39IfsLO05QkwC6uMz0zDPSJqP7B2byri1VI,7489
  application_sdk/activities/__init__.py,sha256=QaXLOBYbb0zPOY5kfDQh56qbXQFaYNXOjJ5PCvatiZ4,9530
  application_sdk/activities/lock_management.py,sha256=L__GZ9BsArwU1ntYwAgCKsSjCqN6QBeOfT-OT4WyD4Y,3983
  application_sdk/activities/.cursor/BUGBOT.md,sha256=FNykX5aMkdOhzgpiGqstOnSp9JN63iR2XP3onU4AGh8,15843
  application_sdk/activities/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  application_sdk/activities/common/models.py,sha256=LIZfWvTtgtbAUvvn-rwrPQgD7fP2J0Gxdxr_ITgw-jM,1243
- application_sdk/activities/common/utils.py,sha256=F4Fq9Gl_gvUQj_fSdwzTU7obqUnemYL1dgb_yS34vTM,6967
+ application_sdk/activities/common/utils.py,sha256=nSNGkY5eS5pPc8etdPWkXBFTSaConGAD8LDtNqOMHF4,9836
  application_sdk/activities/metadata_extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  application_sdk/activities/metadata_extraction/base.py,sha256=ENFojpxqKdN_eVSL4iet3cGfylPOfcl1jnflfo4zhs8,3920
  application_sdk/activities/metadata_extraction/rest.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- application_sdk/activities/metadata_extraction/sql.py,sha256=02u1KXT0EViqKRhYoJ-aYytohAu4IbmTe_JPyC3axTw,35898
+ application_sdk/activities/metadata_extraction/sql.py,sha256=I6TfA_sRb9w6slBhXuqJtw_2_4YSyK-1MiCHb4NWf-E,35829
  application_sdk/activities/query_extraction/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  application_sdk/activities/query_extraction/sql.py,sha256=mesGP_kiWzrJ8wboWFVt2jbDuGG_Fl3kQVvVMdH3KWA,21228
  application_sdk/application/__init__.py,sha256=PbSImXYaQQ2IIee2SvI8AjDiSo2QcCFrM1PX3x-_RQs,8035
@@ -57,10 +57,10 @@ application_sdk/events/models.py,sha256=7Esqp3WlbriT2EqT4kNiY_sHtRXRPLj27b8SbeC5
  application_sdk/handlers/__init__.py,sha256=3Wf7jCVFR2nYOyHZEc9jj8BQUnHCylFqoezp70J2Df0,1329
  application_sdk/handlers/base.py,sha256=ieWFbv8Gm7vfrrpS-mdMSm-mHGuQY02qiAVX2qPdj3w,2467
  application_sdk/handlers/sql.py,sha256=6A_9xCtkXyNY5gPhImbftzrdPIEWIeTTqjyIewVESHA,17815
- application_sdk/inputs/__init__.py,sha256=_d-cUhcDyoJTJR3PdQkC831go6VDw9AM6Bg7-qm3NHI,1900
+ application_sdk/inputs/__init__.py,sha256=_O5lK2A5EYyqwid8txKNEds3pHkoHGKrSTTWnQ-UzRA,6022
  application_sdk/inputs/iceberg.py,sha256=xiv1kNtVx1k0h3ZJbJeXjZwdfBGSy9j9orYP_AyCYlI,2756
- application_sdk/inputs/json.py,sha256=Yv70Y9YuutN2trqK5-z2UNtBL0895ZbdEiBDt9cYM9s,6216
- application_sdk/inputs/parquet.py,sha256=GnyB0r4-7GNLBl3ooVFUzsxunZsrHStKK2h7XRc7AIY,6723
+ application_sdk/inputs/json.py,sha256=ZOgB3tuZSsb2m_KxiAdnbUQgU5ythCs-Mq-n4pPfeHA,4905
+ application_sdk/inputs/parquet.py,sha256=51Wyvbv8vS6T_3bKHgq6pCva8w3PKCDH5jDuENy0z8c,9060
  application_sdk/inputs/sql_query.py,sha256=1EREgea6kKNaMIyX2HLJgbJ07rtAgLasd9NyvDcdZok,10636
  application_sdk/inputs/.cursor/BUGBOT.md,sha256=hwKGDbopv3NU0bpC_ElpAPDFcS59GWS3TunObGC6eLQ,9731
  application_sdk/interceptors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -115,8 +115,8 @@ application_sdk/test_utils/hypothesis/strategies/handlers/sql/__init__.py,sha256
  application_sdk/test_utils/hypothesis/strategies/handlers/sql/sql_metadata.py,sha256=xnf62RyS4UzxW1kTDys_4mg3Avg7KRfqRdP6O81FHp4,1883
  application_sdk/test_utils/hypothesis/strategies/handlers/sql/sql_preflight.py,sha256=e9uo6Bx5w_ZAEu6bDTWbMbmzqB0MYl2dH-JlXg3bkV8,2648
  application_sdk/test_utils/hypothesis/strategies/inputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- application_sdk/test_utils/hypothesis/strategies/inputs/json_input.py,sha256=9mPI585J4tfQTZiM_uoFXCq7qrLML7jwH5eDm_mzvm4,749
- application_sdk/test_utils/hypothesis/strategies/inputs/parquet_input.py,sha256=qOi5En011KiOWAusP6dCvzr1rtC_sc2IkWfj7hWvSh0,950
+ application_sdk/test_utils/hypothesis/strategies/inputs/json_input.py,sha256=HfdCZnXIZFJiRuORpnmioXh8qHls9sWNSFDysy8il-o,913
+ application_sdk/test_utils/hypothesis/strategies/inputs/parquet_input.py,sha256=agjRA9agpak_GmWiIt9bi_oLGvLM_eunxXfxcNHK3MQ,1081
  application_sdk/test_utils/hypothesis/strategies/outputs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py,sha256=p9wotUJwc-Wmm54_qVG5Ivp_mgl7YTeAcQfC6RXlxCc,1835
  application_sdk/test_utils/hypothesis/strategies/outputs/statestore.py,sha256=gmYBwePNoSI_pl2WTXOClgkruzRwkOX_1SmBaUTha0c,2903
@@ -152,8 +152,8 @@ application_sdk/workflows/metadata_extraction/__init__.py,sha256=jHUe_ZBQ66jx8bg
  application_sdk/workflows/metadata_extraction/sql.py,sha256=BhaZavEL8H3Jvf28FGcHtZwqdsUT_EHZ4VTqiaieWek,12278
  application_sdk/workflows/query_extraction/__init__.py,sha256=n066_CX5RpJz6DIxGMkKS3eGSRg03ilaCtsqfJWQb7Q,117
  application_sdk/workflows/query_extraction/sql.py,sha256=kT_JQkLCRZ44ZpaC4QvPL6DxnRIIVh8gYHLqRbMI-hA,4826
- atlan_application_sdk-0.1.1rc41.dist-info/METADATA,sha256=foqYmvYq45g_A7zojtZJ0F06AjTxMOeAEnBE9FkkMFE,5567
- atlan_application_sdk-0.1.1rc41.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- atlan_application_sdk-0.1.1rc41.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
- atlan_application_sdk-0.1.1rc41.dist-info/licenses/NOTICE,sha256=A-XVVGt3KOYuuMmvSMIFkg534F1vHiCggEBp4Ez3wGk,1041
- atlan_application_sdk-0.1.1rc41.dist-info/RECORD,,
+ atlan_application_sdk-0.1.1rc42.dist-info/METADATA,sha256=i1UmEi69_Uelfws3QsQ_AnUCMksuaeBb2O00kAXVGDc,5567
+ atlan_application_sdk-0.1.1rc42.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ atlan_application_sdk-0.1.1rc42.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ atlan_application_sdk-0.1.1rc42.dist-info/licenses/NOTICE,sha256=A-XVVGt3KOYuuMmvSMIFkg534F1vHiCggEBp4Ez3wGk,1041
+ atlan_application_sdk-0.1.1rc42.dist-info/RECORD,,