atlan-application-sdk 1.1.1-py3-none-any.whl → 2.0.0-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (47)
  1. application_sdk/activities/common/sql_utils.py +308 -0
  2. application_sdk/activities/common/utils.py +1 -45
  3. application_sdk/activities/metadata_extraction/sql.py +110 -353
  4. application_sdk/activities/query_extraction/sql.py +12 -11
  5. application_sdk/application/__init__.py +1 -1
  6. application_sdk/clients/sql.py +167 -1
  7. application_sdk/clients/temporal.py +6 -6
  8. application_sdk/common/types.py +8 -0
  9. application_sdk/common/utils.py +1 -8
  10. application_sdk/constants.py +1 -1
  11. application_sdk/handlers/sql.py +10 -25
  12. application_sdk/interceptors/events.py +1 -1
  13. application_sdk/io/__init__.py +654 -0
  14. application_sdk/io/json.py +429 -0
  15. application_sdk/{outputs → io}/parquet.py +358 -47
  16. application_sdk/io/utils.py +307 -0
  17. application_sdk/observability/observability.py +23 -12
  18. application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
  19. application_sdk/server/fastapi/middleware/metrics.py +27 -24
  20. application_sdk/server/fastapi/models.py +1 -1
  21. application_sdk/server/fastapi/routers/server.py +1 -1
  22. application_sdk/server/fastapi/utils.py +10 -0
  23. application_sdk/services/eventstore.py +4 -4
  24. application_sdk/services/secretstore.py +1 -1
  25. application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
  26. application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
  27. application_sdk/version.py +1 -1
  28. application_sdk/worker.py +1 -1
  29. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/METADATA +9 -11
  30. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/RECORD +35 -42
  31. application_sdk/common/dataframe_utils.py +0 -42
  32. application_sdk/events/__init__.py +0 -5
  33. application_sdk/inputs/.cursor/BUGBOT.md +0 -250
  34. application_sdk/inputs/__init__.py +0 -168
  35. application_sdk/inputs/iceberg.py +0 -75
  36. application_sdk/inputs/json.py +0 -136
  37. application_sdk/inputs/parquet.py +0 -272
  38. application_sdk/inputs/sql_query.py +0 -271
  39. application_sdk/outputs/.cursor/BUGBOT.md +0 -295
  40. application_sdk/outputs/__init__.py +0 -453
  41. application_sdk/outputs/iceberg.py +0 -139
  42. application_sdk/outputs/json.py +0 -268
  43. /application_sdk/{events → interceptors}/models.py +0 -0
  44. /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
  45. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/WHEEL +0 -0
  46. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/LICENSE +0 -0
  47. {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.0.0.dist-info}/licenses/NOTICE +0 -0
application_sdk/inputs/__init__.py (file removed)
@@ -1,168 +0,0 @@
- import os
- from abc import ABC, abstractmethod
- from typing import TYPE_CHECKING, AsyncIterator, Iterator, List, Union
-
- from application_sdk.activities.common.utils import (
-     find_local_files_by_extension,
-     get_object_store_prefix,
- )
- from application_sdk.common.error_codes import IOError
- from application_sdk.constants import TEMPORARY_PATH
- from application_sdk.observability.logger_adaptor import get_logger
- from application_sdk.services.objectstore import ObjectStore
-
- logger = get_logger(__name__)
-
- if TYPE_CHECKING:
-     import daft
-     import pandas as pd
-
-
- class Input(ABC):
-     """
-     Abstract base class for input data sources.
-     """
-
-     async def download_files(self) -> List[str]:
-         """Download files from object store if not available locally.
-
-         Flow:
-         1. Check if files exist locally at self.path
-         2. If not, try to download from object store
-         3. Filter by self.file_names if provided
-         4. Return list of file paths for logging purposes
-
-         Returns:
-             List[str]: List of file paths
-
-         Raises:
-             AttributeError: When the input class doesn't support file operations or _extension
-             IOError: When no files found locally or in object store
-         """
-         # Step 1: Check if files exist locally
-         local_files = find_local_files_by_extension(
-             self.path, self._EXTENSION, self.file_names
-         )
-         if local_files:
-             logger.info(
-                 f"Found {len(local_files)} {self._EXTENSION} files locally at: {self.path}"
-             )
-             return local_files
-
-         # Step 2: Try to download from object store
-         logger.info(
-             f"No local {self._EXTENSION} files found at {self.path}, checking object store..."
-         )
-
-         try:
-             # Determine what to download based on path type and filters
-             downloaded_paths = []
-
-             if self.path.endswith(self._EXTENSION):
-                 # Single file case (file_names validation already ensures this is valid)
-                 source_path = get_object_store_prefix(self.path)
-                 destination_path = os.path.join(TEMPORARY_PATH, source_path)
-                 await ObjectStore.download_file(
-                     source=source_path,
-                     destination=destination_path,
-                 )
-                 downloaded_paths.append(destination_path)
-
-             elif self.file_names:
-                 # Directory with specific files - download each file individually
-                 for file_name in self.file_names:
-                     file_path = os.path.join(self.path, file_name)
-                     source_path = get_object_store_prefix(file_path)
-                     destination_path = os.path.join(TEMPORARY_PATH, source_path)
-                     await ObjectStore.download_file(
-                         source=source_path,
-                         destination=destination_path,
-                     )
-                     downloaded_paths.append(destination_path)
-             else:
-                 # Download entire directory
-                 source_path = get_object_store_prefix(self.path)
-                 destination_path = os.path.join(TEMPORARY_PATH, source_path)
-                 await ObjectStore.download_prefix(
-                     source=source_path,
-                     destination=destination_path,
-                 )
-                 # Find the actual files in the downloaded directory
-                 found_files = find_local_files_by_extension(
-                     destination_path, self._EXTENSION, getattr(self, "file_names", None)
-                 )
-                 downloaded_paths.extend(found_files)
-
-             # Check results
-             if downloaded_paths:
-                 logger.info(
-                     f"Successfully downloaded {len(downloaded_paths)} {self._EXTENSION} files from object store"
-                 )
-                 return downloaded_paths
-             else:
-                 raise IOError(
-                     f"{IOError.OBJECT_STORE_READ_ERROR}: Downloaded from object store but no {self._EXTENSION} files found"
-                 )
-
-         except Exception as e:
-             logger.error(f"Failed to download from object store: {str(e)}")
-             raise IOError(
-                 f"{IOError.OBJECT_STORE_DOWNLOAD_ERROR}: No {self._EXTENSION} files found locally at '{self.path}' and failed to download from object store. "
-                 f"Error: {str(e)}"
-             )
-
-     @abstractmethod
-     async def get_batched_dataframe(
-         self,
-     ) -> Union[Iterator["pd.DataFrame"], AsyncIterator["pd.DataFrame"]]:
-         """
-         Get an iterator of batched pandas DataFrames.
-
-         Returns:
-             Iterator["pd.DataFrame"]: An iterator of batched pandas DataFrames.
-
-         Raises:
-             NotImplementedError: If the method is not implemented.
-         """
-         raise NotImplementedError
-
-     @abstractmethod
-     async def get_dataframe(self) -> "pd.DataFrame":
-         """
-         Get a single pandas DataFrame.
-
-         Returns:
-             "pd.DataFrame": A pandas DataFrame.
-
-         Raises:
-             NotImplementedError: If the method is not implemented.
-         """
-         raise NotImplementedError
-
-     @abstractmethod
-     async def get_batched_daft_dataframe(
-         self,
-     ) -> Union[Iterator["daft.DataFrame"], AsyncIterator["daft.DataFrame"]]:  # noqa: F821
-         """
-         Get an iterator of batched daft DataFrames.
-
-         Returns:
-             Iterator[daft.DataFrame]: An iterator of batched daft DataFrames.
-
-         Raises:
-             NotImplementedError: If the method is not implemented.
-         """
-         raise NotImplementedError
-
-     @abstractmethod
-     async def get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
-         """
-         Get a single daft DataFrame.
-
-         Returns:
-             daft.DataFrame: A daft DataFrame.
-
-         Raises:
-             NotImplementedError: If the method is not implemented.
-         """
-         raise NotImplementedError
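
For orientation, here is a minimal sketch of how a concrete input was expected to plug into this base class in 1.x: download_files() above reads self.path, self.file_names, and a class-level _EXTENSION that the base class never declares, so every subclass had to set them itself. The CsvInput below is a hypothetical example and never shipped with the SDK.

    # Hypothetical subclass for illustration only; not part of the SDK.
    from typing import AsyncIterator, List, Optional

    import pandas as pd

    from application_sdk.inputs import Input  # module removed in 2.0.0


    class CsvInput(Input):
        _EXTENSION = ".csv"  # used by Input.download_files() for file discovery

        def __init__(self, path: str, file_names: Optional[List[str]] = None):
            self.path = path              # local path or object store prefix
            self.file_names = file_names  # optional per-file filter

        async def get_dataframe(self) -> "pd.DataFrame":
            files = await self.download_files()  # local-first, then object store
            return pd.concat((pd.read_csv(f) for f in files), ignore_index=True)

        async def get_batched_dataframe(self) -> AsyncIterator["pd.DataFrame"]:
            for f in await self.download_files():
                yield pd.read_csv(f)  # one batch per file

        async def get_daft_dataframe(self):
            raise NotImplementedError  # pandas-only example

        async def get_batched_daft_dataframe(self):
            raise NotImplementedError  # pandas-only example
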
application_sdk/inputs/iceberg.py (file removed)
@@ -1,75 +0,0 @@
- from typing import TYPE_CHECKING, AsyncIterator, Iterator, Optional, Union
-
- from pyiceberg.table import Table
-
- from application_sdk.inputs import Input
- from application_sdk.observability.logger_adaptor import get_logger
-
- logger = get_logger(__name__)
-
- if TYPE_CHECKING:
-     import daft
-     import pandas as pd
-
-
- class IcebergInput(Input):
-     """
-     Iceberg Input class to read data from Iceberg tables using daft and pandas
-     """
-
-     table: Table
-     chunk_size: Optional[int]
-
-     def __init__(self, table: Table, chunk_size: Optional[int] = 100000):
-         """Initialize the Iceberg input class.
-
-         Args:
-             table (Table): Iceberg table object.
-             chunk_size (Optional[int], optional): Number of rows per batch.
-                 Defaults to 100000.
-         """
-         self.table = table
-         self.chunk_size = chunk_size
-
-     async def get_dataframe(self) -> "pd.DataFrame":
-         """
-         Method to read the data from the iceberg table
-         and return as a single combined pandas dataframe
-         """
-         try:
-             daft_dataframe = await self.get_daft_dataframe()
-             return daft_dataframe.to_pandas()
-         except Exception as e:
-             logger.error(f"Error reading data from Iceberg table: {str(e)}")
-             raise
-
-     async def get_batched_dataframe(
-         self,
-     ) -> Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]:
-         # We are not implementing this method as we have to partition the daft dataframe
-         # using dataframe.into_partitions() method. This method does all the partitions in memory
-         # and using that can cause out of memory issues.
-         # ref: https://www.getdaft.io/projects/docs/en/stable/user_guide/poweruser/partitioning.html
-         raise NotImplementedError
-
-     async def get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
-         """
-         Method to read the data from the iceberg table
-         and return as a single combined daft dataframe
-         """
-         try:
-             import daft
-
-             return daft.read_iceberg(self.table)
-         except Exception as e:
-             logger.error(f"Error reading data from Iceberg table using daft: {str(e)}")
-             raise
-
-     async def get_batched_daft_dataframe(
-         self,
-     ) -> Union[AsyncIterator["daft.DataFrame"], Iterator["daft.DataFrame"]]:  # noqa: F821
-         # We are not implementing this method as we have to partition the daft dataframe
-         # using dataframe.into_partitions() method. This method does all the partitions in memory
-         # and using that can cause out of memory issues.
-         # ref: https://www.getdaft.io/projects/docs/en/stable/user_guide/poweruser/partitioning.html
-         raise NotImplementedError
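
As a usage illustration (not code from the SDK itself), this is roughly how the removed IcebergInput was driven; the catalog name, URI, and table identifier below are placeholders.

    # Illustrative sketch; catalog configuration and table name are placeholders.
    import asyncio

    from pyiceberg.catalog import load_catalog

    from application_sdk.inputs.iceberg import IcebergInput  # removed in 2.0.0


    async def main() -> None:
        catalog = load_catalog("default", uri="http://localhost:8181")  # placeholder REST catalog
        table = catalog.load_table("analytics.orders")  # placeholder identifier

        iceberg_input = IcebergInput(table=table)
        daft_df = await iceberg_input.get_daft_dataframe()  # lazy daft.DataFrame
        pandas_df = await iceberg_input.get_dataframe()     # materialized pandas DataFrame
        print(daft_df.schema(), len(pandas_df))


    asyncio.run(main())
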
application_sdk/inputs/json.py (file removed)
@@ -1,136 +0,0 @@
- from typing import TYPE_CHECKING, AsyncIterator, Iterator, List, Optional, Union
-
- from application_sdk.inputs import Input
- from application_sdk.observability.logger_adaptor import get_logger
-
- if TYPE_CHECKING:
-     import daft
-     import pandas as pd
-
- logger = get_logger(__name__)
-
-
- class JsonInput(Input):
-     """
-     JSON Input class to read data from JSON files using daft and pandas.
-     Supports reading both single files and directories containing multiple JSON files.
-     """
-
-     _EXTENSION = ".json"
-
-     def __init__(
-         self,
-         path: str,
-         file_names: Optional[List[str]] = None,
-         chunk_size: int = 100000,
-     ):
-         """Initialize the JsonInput class.
-
-         Args:
-             path (str): Path to JSON file or directory containing JSON files.
-                 It accepts both types of paths:
-                 local path or object store path
-                 Wildcards are not supported.
-             file_names (Optional[List[str]]): List of specific file names to read. Defaults to None.
-             chunk_size (int): Number of rows per batch. Defaults to 100000.
-
-         Raises:
-             ValueError: When path is not provided or when single file path is combined with file_names
-         """
-
-         # Validate that single file path and file_names are not both specified
-         if path.endswith(self._EXTENSION) and file_names:
-             raise ValueError(
-                 f"Cannot specify both a single file path ('{path}') and file_names filter. "
-                 f"Either provide a directory path with file_names, or specify the exact file path without file_names."
-             )
-
-         self.path = path
-         self.chunk_size = chunk_size
-         self.file_names = file_names
-
-     async def get_batched_dataframe(
-         self,
-     ) -> Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]:
-         """
-         Method to read the data from the json files in the path
-         and return as a batched pandas dataframe
-         """
-         try:
-             import pandas as pd
-
-             # Ensure files are available (local or downloaded)
-             json_files = await self.download_files()
-             logger.info(f"Reading {len(json_files)} JSON files in batches")
-
-             for json_file in json_files:
-                 json_reader_obj = pd.read_json(
-                     json_file,
-                     chunksize=self.chunk_size,
-                     lines=True,
-                 )
-                 for chunk in json_reader_obj:
-                     yield chunk
-         except Exception as e:
-             logger.error(f"Error reading batched data from JSON: {str(e)}")
-             raise
-
-     async def get_dataframe(self) -> "pd.DataFrame":
-         """
-         Method to read the data from the json files in the path
-         and return as a single combined pandas dataframe
-         """
-         try:
-             import pandas as pd
-
-             # Ensure files are available (local or downloaded)
-             json_files = await self.download_files()
-             logger.info(f"Reading {len(json_files)} JSON files as pandas dataframe")
-
-             return pd.concat(
-                 (pd.read_json(json_file, lines=True) for json_file in json_files),
-                 ignore_index=True,
-             )
-
-         except Exception as e:
-             logger.error(f"Error reading data from JSON: {str(e)}")
-             raise
-
-     async def get_batched_daft_dataframe(
-         self,
-     ) -> Union[AsyncIterator["daft.DataFrame"], Iterator["daft.DataFrame"]]:  # noqa: F821
-         """
-         Method to read the data from the json files in the path
-         and return as a batched daft dataframe
-         """
-         try:
-             import daft
-
-             # Ensure files are available (local or downloaded)
-             json_files = await self.download_files()
-             logger.info(f"Reading {len(json_files)} JSON files as daft batches")
-
-             # Yield each discovered file as separate batch with chunking
-             for json_file in json_files:
-                 yield daft.read_json(json_file, _chunk_size=self.chunk_size)
-         except Exception as e:
-             logger.error(f"Error reading batched data from JSON using daft: {str(e)}")
-             raise
-
-     async def get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
-         """
-         Method to read the data from the json files in the path
-         and return as a single combined daft dataframe
-         """
-         try:
-             import daft
-
-             # Ensure files are available (local or downloaded)
-             json_files = await self.download_files()
-             logger.info(f"Reading {len(json_files)} JSON files with daft")
-
-             # Use the discovered/downloaded files directly
-             return daft.read_json(json_files)
-         except Exception as e:
-             logger.error(f"Error reading data from JSON using daft: {str(e)}")
-             raise
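
A hedged usage sketch for the removed JsonInput: because it reads with pandas' lines=True/chunksize and daft.read_json, the files are expected to be newline-delimited JSON. The path and file names below are placeholders.

    # Illustrative sketch; path and file names are placeholders.
    import asyncio

    from application_sdk.inputs.json import JsonInput  # removed in 2.0.0


    async def main() -> None:
        json_input = JsonInput(
            path="./local/tmp/raw-json/",                # placeholder directory (local or object store prefix)
            file_names=["tables.json", "columns.json"],  # optional filter; omit to read the whole prefix
            chunk_size=50_000,
        )
        async for chunk in json_input.get_batched_dataframe():
            print(len(chunk))  # each chunk is a pandas DataFrame of up to chunk_size rows


    asyncio.run(main())
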
application_sdk/inputs/parquet.py (file removed)
@@ -1,272 +0,0 @@
- from typing import TYPE_CHECKING, AsyncIterator, Iterator, List, Optional, Union
-
- from application_sdk.inputs import Input
- from application_sdk.observability.logger_adaptor import get_logger
-
- logger = get_logger(__name__)
-
- if TYPE_CHECKING:
-     import daft  # type: ignore
-     import pandas as pd
-
-
- class ParquetInput(Input):
-     """
-     Parquet Input class to read data from Parquet files using daft and pandas.
-     Supports reading both single files and directories containing multiple parquet files.
-     """
-
-     _EXTENSION = ".parquet"
-
-     def __init__(
-         self,
-         path: str,
-         chunk_size: int = 100000,
-         buffer_size: int = 5000,
-         file_names: Optional[List[str]] = None,
-     ):
-         """Initialize the Parquet input class.
-
-         Args:
-             path (str): Path to parquet file or directory containing parquet files.
-                 It accepts both types of paths:
-                 local path or object store path
-                 Wildcards are not supported.
-             chunk_size (int): Number of rows per batch. Defaults to 100000.
-             buffer_size (int): Number of rows per batch. Defaults to 5000.
-             file_names (Optional[List[str]]): List of file names to read. Defaults to None.
-
-         Raises:
-             ValueError: When path is not provided or when single file path is combined with file_names
-         """
-
-         # Validate that single file path and file_names are not both specified
-         if path.endswith(self._EXTENSION) and file_names:
-             raise ValueError(
-                 f"Cannot specify both a single file path ('{path}') and file_names filter. "
-                 f"Either provide a directory path with file_names, or specify the exact file path without file_names."
-             )
-
-         self.path = path
-         self.chunk_size = chunk_size
-         self.buffer_size = buffer_size
-         self.file_names = file_names
-
-     async def get_dataframe(self) -> "pd.DataFrame":
-         """Read data from parquet file(s) and return as pandas DataFrame.
-
-         Returns:
-             pd.DataFrame: Combined dataframe from specified parquet files
-
-         Raises:
-             ValueError: When no valid path can be determined or no matching files found
-             Exception: When reading parquet files fails
-
-         Example transformation:
-             Input files:
-             +-----------------+
-             | file1.parquet   |
-             | file2.parquet   |
-             | file3.parquet   |
-             +-----------------+
-
-             With file_names=["file1.parquet", "file3.parquet"]:
-             +-------+-------+-------+
-             | col1  | col2  | col3  |
-             +-------+-------+-------+
-             | val1  | val2  | val3  |  # from file1.parquet
-             | val7  | val8  | val9  |  # from file3.parquet
-             +-------+-------+-------+
-
-             Transformations:
-             - Only specified files are read and combined
-             - Column schemas must be compatible across files
-             - Only reads files in the specified directory
-         """
-         try:
-             import pandas as pd
-
-             # Ensure files are available (local or downloaded)
-             parquet_files = await self.download_files()
-             logger.info(f"Reading {len(parquet_files)} parquet files")
-
-             return pd.concat(
-                 (pd.read_parquet(parquet_file) for parquet_file in parquet_files),
-                 ignore_index=True,
-             )
-         except Exception as e:
-             logger.error(f"Error reading data from parquet file(s): {str(e)}")
-             raise
-
-     async def get_batched_dataframe(
-         self,
-     ) -> Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]:
-         """Read data from parquet file(s) in batches as pandas DataFrames.
-
-         Returns:
-             AsyncIterator[pd.DataFrame]: Async iterator of pandas dataframes
-
-         Raises:
-             ValueError: When no parquet files found locally or in object store
-             Exception: When reading parquet files fails
-
-         Example transformation:
-             Input files:
-             +-----------------+
-             | file1.parquet   |
-             | file2.parquet   |
-             | file3.parquet   |
-             +-----------------+
-
-             With file_names=["file1.parquet", "file2.parquet"] and chunk_size=2:
-             Batch 1:
-             +-------+-------+
-             | col1  | col2  |
-             +-------+-------+
-             | val1  | val2  |  # from file1.parquet
-             | val3  | val4  |  # from file1.parquet
-             +-------+-------+
-
-             Batch 2:
-             +-------+-------+
-             | col1  | col2  |
-             +-------+-------+
-             | val5  | val6  |  # from file2.parquet
-             | val7  | val8  |  # from file2.parquet
-             +-------+-------+
-
-             Transformations:
-             - Only specified files are combined then split into chunks
-             - Each batch is a separate DataFrame
-             - Only reads files in the specified directory
-         """
-         try:
-             import pandas as pd
-
-             # Ensure files are available (local or downloaded)
-             parquet_files = await self.download_files()
-             logger.info(f"Reading {len(parquet_files)} parquet files in batches")
-
-             # Process each file individually to maintain memory efficiency
-             for parquet_file in parquet_files:
-                 df = pd.read_parquet(parquet_file)
-                 for i in range(0, len(df), self.chunk_size):
-                     yield df.iloc[i : i + self.chunk_size]
-         except Exception as e:
-             logger.error(
-                 f"Error reading data from parquet file(s) in batches: {str(e)}"
-             )
-             raise
-
-     async def get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
-         """Read data from parquet file(s) and return as daft DataFrame.
-
-         Returns:
-             daft.DataFrame: Combined daft dataframe from specified parquet files
-
-         Raises:
-             ValueError: When no parquet files found locally or in object store
-             Exception: When reading parquet files fails
-
-         Example transformation:
-             Input files:
-             +-----------------+
-             | file1.parquet   |
-             | file2.parquet   |
-             | file3.parquet   |
-             +-----------------+
-
-             With file_names=["file1.parquet", "file3.parquet"]:
-             +-------+-------+-------+
-             | col1  | col2  | col3  |
-             +-------+-------+-------+
-             | val1  | val2  | val3  |  # from file1.parquet
-             | val7  | val8  | val9  |  # from file3.parquet
-             +-------+-------+-------+
-
-             Transformations:
-             - Only specified parquet files combined into single daft DataFrame
-             - Lazy evaluation for better performance
-             - Column schemas must be compatible across files
-         """
-         try:
-             import daft  # type: ignore
-
-             # Ensure files are available (local or downloaded)
-             parquet_files = await self.download_files()
-             logger.info(f"Reading {len(parquet_files)} parquet files with daft")
-
-             # Use the discovered/downloaded files directly
-             return daft.read_parquet(parquet_files)
-         except Exception as e:
-             logger.error(
-                 f"Error reading data from parquet file(s) using daft: {str(e)}"
-             )
-             raise
-
-     async def get_batched_daft_dataframe(self) -> AsyncIterator["daft.DataFrame"]:  # type: ignore
-         """Get batched daft dataframe from parquet file(s).
-
-         Returns:
-             AsyncIterator[daft.DataFrame]: An async iterator of daft DataFrames, each containing
-                 a batch of data from individual parquet files
-
-         Raises:
-             ValueError: When no parquet files found locally or in object store
-             Exception: When reading parquet files fails
-
-         Example transformation:
-             Input files:
-             +-----------------+
-             | file1.parquet   |
-             | file2.parquet   |
-             | file3.parquet   |
-             +-----------------+
-
-             With file_names=["file1.parquet", "file3.parquet"]:
-             Batch 1 (file1.parquet):
-             +-------+-------+
-             | col1  | col2  |
-             +-------+-------+
-             | val1  | val2  |
-             | val3  | val4  |
-             +-------+-------+
-
-             Batch 2 (file3.parquet):
-             +-------+-------+
-             | col1  | col2  |
-             +-------+-------+
-             | val7  | val8  |
-             | val9  | val10 |
-             +-------+-------+
-
-             Transformations:
-             - Each specified file becomes a separate daft DataFrame batch
-             - Lazy evaluation for better performance
-             - Files processed individually for memory efficiency
-         """
-         try:
-             import daft  # type: ignore
-
-             # Ensure files are available (local or downloaded)
-             parquet_files = await self.download_files()
-             logger.info(f"Reading {len(parquet_files)} parquet files as daft batches")
-
-             # Create a lazy dataframe without loading data into memory
-             lazy_df = daft.read_parquet(parquet_files)
-
-             # Get total count efficiently
-             total_rows = lazy_df.count_rows()
-
-             # Yield chunks without loading everything into memory
-             for offset in range(0, total_rows, self.buffer_size):
-                 chunk = lazy_df.offset(offset).limit(self.buffer_size)
-                 yield chunk
-
-             del lazy_df
-
-         except Exception as error:
-             logger.error(
-                 f"Error reading data from parquet file(s) in batches using daft: {error}"
-             )
-             raise
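
Finally, a usage sketch for the removed ParquetInput. Note the two batching knobs: chunk_size sizes the pandas batches in get_batched_dataframe, while buffer_size sizes the daft offset/limit batches in get_batched_daft_dataframe. The path below is a placeholder.

    # Illustrative sketch; the path is a placeholder.
    import asyncio

    from application_sdk.inputs.parquet import ParquetInput  # removed in 2.0.0


    async def main() -> None:
        parquet_input = ParquetInput(
            path="./local/tmp/transformed/",  # placeholder directory (local or object store prefix)
            chunk_size=100_000,               # rows per pandas batch
            buffer_size=5_000,                # rows per daft batch
        )

        # Pandas batches: each file is read, then sliced into chunk_size rows.
        async for pandas_batch in parquet_input.get_batched_dataframe():
            print(len(pandas_batch))

        # Daft batches: one lazy read, then offset/limit slices of buffer_size rows.
        async for daft_batch in parquet_input.get_batched_daft_dataframe():
            print(daft_batch.count_rows())


    asyncio.run(main())
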