atlan-application-sdk 1.1.1__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.
- application_sdk/activities/common/sql_utils.py +312 -0
- application_sdk/activities/common/utils.py +1 -45
- application_sdk/activities/metadata_extraction/sql.py +110 -353
- application_sdk/activities/query_extraction/sql.py +12 -11
- application_sdk/application/__init__.py +1 -1
- application_sdk/clients/sql.py +167 -1
- application_sdk/clients/temporal.py +6 -6
- application_sdk/common/types.py +8 -0
- application_sdk/common/utils.py +1 -8
- application_sdk/constants.py +1 -1
- application_sdk/handlers/sql.py +10 -25
- application_sdk/interceptors/events.py +1 -1
- application_sdk/io/__init__.py +749 -0
- application_sdk/io/json.py +473 -0
- application_sdk/{outputs → io}/parquet.py +414 -47
- application_sdk/io/utils.py +307 -0
- application_sdk/observability/observability.py +16 -12
- application_sdk/server/fastapi/middleware/logmiddleware.py +23 -17
- application_sdk/server/fastapi/middleware/metrics.py +27 -24
- application_sdk/server/fastapi/models.py +1 -1
- application_sdk/server/fastapi/routers/server.py +1 -1
- application_sdk/server/fastapi/utils.py +10 -0
- application_sdk/services/eventstore.py +4 -4
- application_sdk/services/objectstore.py +14 -1
- application_sdk/services/secretstore.py +1 -1
- application_sdk/test_utils/hypothesis/strategies/outputs/json_output.py +0 -1
- application_sdk/test_utils/hypothesis/strategies/server/fastapi/__init__.py +1 -1
- application_sdk/version.py +1 -1
- application_sdk/worker.py +1 -1
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/METADATA +9 -11
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/RECORD +36 -43
- application_sdk/common/dataframe_utils.py +0 -42
- application_sdk/events/__init__.py +0 -5
- application_sdk/inputs/.cursor/BUGBOT.md +0 -250
- application_sdk/inputs/__init__.py +0 -168
- application_sdk/inputs/iceberg.py +0 -75
- application_sdk/inputs/json.py +0 -136
- application_sdk/inputs/parquet.py +0 -272
- application_sdk/inputs/sql_query.py +0 -271
- application_sdk/outputs/.cursor/BUGBOT.md +0 -295
- application_sdk/outputs/__init__.py +0 -453
- application_sdk/outputs/iceberg.py +0 -139
- application_sdk/outputs/json.py +0 -268
- /application_sdk/{events → interceptors}/models.py +0 -0
- /application_sdk/{common/dapr_utils.py → services/_utils.py} +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/WHEEL +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/LICENSE +0 -0
- {atlan_application_sdk-1.1.1.dist-info → atlan_application_sdk-2.1.0.dist-info}/licenses/NOTICE +0 -0
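The list above shows that the 1.x `application_sdk.inputs` and `application_sdk.outputs` packages are removed in 2.1.0 and a consolidated `application_sdk.io` package takes their place, so 1.1.1-style imports of the deleted modules below will stop resolving after an upgrade. As a rough illustration (the `has_legacy_inputs` helper is hypothetical, and the 2.1.0 replacement class names are not visible in this diff), a caller could guard for the old layout like this:

# Hypothetical guard, not part of either package version: detect whether the
# removed 1.x layout is still installed before importing from it.
import importlib.util


def has_legacy_inputs() -> bool:
    # application_sdk.inputs exists in 1.1.1 but is deleted in 2.1.0
    return importlib.util.find_spec("application_sdk.inputs") is not None


if has_legacy_inputs():
    from application_sdk.inputs.parquet import ParquetInput  # 1.1.1 layout
else:
    # 2.1.0 consolidates readers/writers under application_sdk.io;
    # consult the 2.1.0 docs for the replacement class names.
    ParquetInput = None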
application_sdk/inputs/__init__.py
DELETED

@@ -1,168 +0,0 @@
-import os
-from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, AsyncIterator, Iterator, List, Union
-
-from application_sdk.activities.common.utils import (
-    find_local_files_by_extension,
-    get_object_store_prefix,
-)
-from application_sdk.common.error_codes import IOError
-from application_sdk.constants import TEMPORARY_PATH
-from application_sdk.observability.logger_adaptor import get_logger
-from application_sdk.services.objectstore import ObjectStore
-
-logger = get_logger(__name__)
-
-if TYPE_CHECKING:
-    import daft
-    import pandas as pd
-
-
-class Input(ABC):
-    """
-    Abstract base class for input data sources.
-    """
-
-    async def download_files(self) -> List[str]:
-        """Download files from object store if not available locally.
-
-        Flow:
-        1. Check if files exist locally at self.path
-        2. If not, try to download from object store
-        3. Filter by self.file_names if provided
-        4. Return list of file paths for logging purposes
-
-        Returns:
-            List[str]: List of file paths
-
-        Raises:
-            AttributeError: When the input class doesn't support file operations or _extension
-            IOError: When no files found locally or in object store
-        """
-        # Step 1: Check if files exist locally
-        local_files = find_local_files_by_extension(
-            self.path, self._EXTENSION, self.file_names
-        )
-        if local_files:
-            logger.info(
-                f"Found {len(local_files)} {self._EXTENSION} files locally at: {self.path}"
-            )
-            return local_files
-
-        # Step 2: Try to download from object store
-        logger.info(
-            f"No local {self._EXTENSION} files found at {self.path}, checking object store..."
-        )
-
-        try:
-            # Determine what to download based on path type and filters
-            downloaded_paths = []
-
-            if self.path.endswith(self._EXTENSION):
-                # Single file case (file_names validation already ensures this is valid)
-                source_path = get_object_store_prefix(self.path)
-                destination_path = os.path.join(TEMPORARY_PATH, source_path)
-                await ObjectStore.download_file(
-                    source=source_path,
-                    destination=destination_path,
-                )
-                downloaded_paths.append(destination_path)
-
-            elif self.file_names:
-                # Directory with specific files - download each file individually
-                for file_name in self.file_names:
-                    file_path = os.path.join(self.path, file_name)
-                    source_path = get_object_store_prefix(file_path)
-                    destination_path = os.path.join(TEMPORARY_PATH, source_path)
-                    await ObjectStore.download_file(
-                        source=source_path,
-                        destination=destination_path,
-                    )
-                    downloaded_paths.append(destination_path)
-            else:
-                # Download entire directory
-                source_path = get_object_store_prefix(self.path)
-                destination_path = os.path.join(TEMPORARY_PATH, source_path)
-                await ObjectStore.download_prefix(
-                    source=source_path,
-                    destination=destination_path,
-                )
-                # Find the actual files in the downloaded directory
-                found_files = find_local_files_by_extension(
-                    destination_path, self._EXTENSION, getattr(self, "file_names", None)
-                )
-                downloaded_paths.extend(found_files)
-
-            # Check results
-            if downloaded_paths:
-                logger.info(
-                    f"Successfully downloaded {len(downloaded_paths)} {self._EXTENSION} files from object store"
-                )
-                return downloaded_paths
-            else:
-                raise IOError(
-                    f"{IOError.OBJECT_STORE_READ_ERROR}: Downloaded from object store but no {self._EXTENSION} files found"
-                )
-
-        except Exception as e:
-            logger.error(f"Failed to download from object store: {str(e)}")
-            raise IOError(
-                f"{IOError.OBJECT_STORE_DOWNLOAD_ERROR}: No {self._EXTENSION} files found locally at '{self.path}' and failed to download from object store. "
-                f"Error: {str(e)}"
-            )
-
-    @abstractmethod
-    async def get_batched_dataframe(
-        self,
-    ) -> Union[Iterator["pd.DataFrame"], AsyncIterator["pd.DataFrame"]]:
-        """
-        Get an iterator of batched pandas DataFrames.
-
-        Returns:
-            Iterator["pd.DataFrame"]: An iterator of batched pandas DataFrames.
-
-        Raises:
-            NotImplementedError: If the method is not implemented.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    async def get_dataframe(self) -> "pd.DataFrame":
-        """
-        Get a single pandas DataFrame.
-
-        Returns:
-            "pd.DataFrame": A pandas DataFrame.
-
-        Raises:
-            NotImplementedError: If the method is not implemented.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    async def get_batched_daft_dataframe(
-        self,
-    ) -> Union[Iterator["daft.DataFrame"], AsyncIterator["daft.DataFrame"]]:  # noqa: F821
-        """
-        Get an iterator of batched daft DataFrames.
-
-        Returns:
-            Iterator[daft.DataFrame]: An iterator of batched daft DataFrames.
-
-        Raises:
-            NotImplementedError: If the method is not implemented.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    async def get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
-        """
-        Get a single daft DataFrame.
-
-        Returns:
-            daft.DataFrame: A daft DataFrame.
-
-        Raises:
-            NotImplementedError: If the method is not implemented.
-        """
-        raise NotImplementedError
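The removed Input base class implemented download_files() once and left path, _EXTENSION, and file_names to its subclasses. A minimal sketch of how a 1.x subclass plugged into that contract (the CsvInput class below is hypothetical and never shipped with the SDK):

# Illustrative only: CsvInput shows the attributes download_files() expected
# subclasses to define; it is not part of the package in either version.
from typing import Any, AsyncIterator, List, Optional

from application_sdk.inputs import Input  # 1.1.1 import path, removed in 2.1.0


class CsvInput(Input):
    _EXTENSION = ".csv"  # download_files() filters local/object-store files by this suffix

    def __init__(self, path: str, file_names: Optional[List[str]] = None, chunk_size: int = 100000):
        self.path = path              # local path or object-store path
        self.file_names = file_names  # optional explicit file list
        self.chunk_size = chunk_size

    async def get_dataframe(self) -> Any:
        import pandas as pd

        # download_files() resolves local files first, then falls back to the object store
        files = await self.download_files()
        return pd.concat((pd.read_csv(f) for f in files), ignore_index=True)

    async def get_batched_dataframe(self) -> AsyncIterator[Any]:
        import pandas as pd

        for csv_file in await self.download_files():
            for chunk in pd.read_csv(csv_file, chunksize=self.chunk_size):
                yield chunk

    async def get_daft_dataframe(self) -> Any:
        import daft

        return daft.read_csv(await self.download_files())

    async def get_batched_daft_dataframe(self) -> AsyncIterator[Any]:
        raise NotImplementedError  # mirrors IcebergInput: no batched daft reads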
application_sdk/inputs/iceberg.py
DELETED

@@ -1,75 +0,0 @@
-from typing import TYPE_CHECKING, AsyncIterator, Iterator, Optional, Union
-
-from pyiceberg.table import Table
-
-from application_sdk.inputs import Input
-from application_sdk.observability.logger_adaptor import get_logger
-
-logger = get_logger(__name__)
-
-if TYPE_CHECKING:
-    import daft
-    import pandas as pd
-
-
-class IcebergInput(Input):
-    """
-    Iceberg Input class to read data from Iceberg tables using daft and pandas
-    """
-
-    table: Table
-    chunk_size: Optional[int]
-
-    def __init__(self, table: Table, chunk_size: Optional[int] = 100000):
-        """Initialize the Iceberg input class.
-
-        Args:
-            table (Table): Iceberg table object.
-            chunk_size (Optional[int], optional): Number of rows per batch.
-                Defaults to 100000.
-        """
-        self.table = table
-        self.chunk_size = chunk_size
-
-    async def get_dataframe(self) -> "pd.DataFrame":
-        """
-        Method to read the data from the iceberg table
-        and return as a single combined pandas dataframe
-        """
-        try:
-            daft_dataframe = await self.get_daft_dataframe()
-            return daft_dataframe.to_pandas()
-        except Exception as e:
-            logger.error(f"Error reading data from Iceberg table: {str(e)}")
-            raise
-
-    async def get_batched_dataframe(
-        self,
-    ) -> Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]:
-        # We are not implementing this method as we have to partition the daft dataframe
-        # using dataframe.into_partitions() method. This method does all the partitions in memory
-        # and using that can cause out of memory issues.
-        # ref: https://www.getdaft.io/projects/docs/en/stable/user_guide/poweruser/partitioning.html
-        raise NotImplementedError
-
-    async def get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
-        """
-        Method to read the data from the iceberg table
-        and return as a single combined daft dataframe
-        """
-        try:
-            import daft
-
-            return daft.read_iceberg(self.table)
-        except Exception as e:
-            logger.error(f"Error reading data from Iceberg table using daft: {str(e)}")
-            raise
-
-    async def get_batched_daft_dataframe(
-        self,
-    ) -> Union[AsyncIterator["daft.DataFrame"], Iterator["daft.DataFrame"]]:  # noqa: F821
-        # We are not implementing this method as we have to partition the daft dataframe
-        # using dataframe.into_partitions() method. This method does all the partitions in memory
-        # and using that can cause out of memory issues.
-        # ref: https://www.getdaft.io/projects/docs/en/stable/user_guide/poweruser/partitioning.html
-        raise NotImplementedError
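For reference, this is roughly how the removed IcebergInput was driven in 1.1.1. Loading the table is plain pyiceberg rather than SDK functionality, and the catalog name and table identifier below are placeholders:

# Usage sketch for the removed 1.x IcebergInput; assumes a configured
# pyiceberg catalog and a real table identifier in place of the placeholders.
import asyncio

from pyiceberg.catalog import load_catalog

from application_sdk.inputs.iceberg import IcebergInput  # removed in 2.1.0


async def main() -> None:
    catalog = load_catalog("default")                # placeholder catalog name
    table = catalog.load_table("analytics.events")   # placeholder table identifier

    iceberg_input = IcebergInput(table=table)
    df = await iceberg_input.get_daft_dataframe()    # daft.read_iceberg under the hood
    pandas_df = await iceberg_input.get_dataframe()  # same data, converted via to_pandas()
    print(len(pandas_df), df.column_names)


asyncio.run(main())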
application_sdk/inputs/json.py
DELETED

@@ -1,136 +0,0 @@
-from typing import TYPE_CHECKING, AsyncIterator, Iterator, List, Optional, Union
-
-from application_sdk.inputs import Input
-from application_sdk.observability.logger_adaptor import get_logger
-
-if TYPE_CHECKING:
-    import daft
-    import pandas as pd
-
-logger = get_logger(__name__)
-
-
-class JsonInput(Input):
-    """
-    JSON Input class to read data from JSON files using daft and pandas.
-    Supports reading both single files and directories containing multiple JSON files.
-    """
-
-    _EXTENSION = ".json"
-
-    def __init__(
-        self,
-        path: str,
-        file_names: Optional[List[str]] = None,
-        chunk_size: int = 100000,
-    ):
-        """Initialize the JsonInput class.
-
-        Args:
-            path (str): Path to JSON file or directory containing JSON files.
-                It accepts both types of paths:
-                local path or object store path
-                Wildcards are not supported.
-            file_names (Optional[List[str]]): List of specific file names to read. Defaults to None.
-            chunk_size (int): Number of rows per batch. Defaults to 100000.
-
-        Raises:
-            ValueError: When path is not provided or when single file path is combined with file_names
-        """
-
-        # Validate that single file path and file_names are not both specified
-        if path.endswith(self._EXTENSION) and file_names:
-            raise ValueError(
-                f"Cannot specify both a single file path ('{path}') and file_names filter. "
-                f"Either provide a directory path with file_names, or specify the exact file path without file_names."
-            )
-
-        self.path = path
-        self.chunk_size = chunk_size
-        self.file_names = file_names
-
-    async def get_batched_dataframe(
-        self,
-    ) -> Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]:
-        """
-        Method to read the data from the json files in the path
-        and return as a batched pandas dataframe
-        """
-        try:
-            import pandas as pd
-
-            # Ensure files are available (local or downloaded)
-            json_files = await self.download_files()
-            logger.info(f"Reading {len(json_files)} JSON files in batches")
-
-            for json_file in json_files:
-                json_reader_obj = pd.read_json(
-                    json_file,
-                    chunksize=self.chunk_size,
-                    lines=True,
-                )
-                for chunk in json_reader_obj:
-                    yield chunk
-        except Exception as e:
-            logger.error(f"Error reading batched data from JSON: {str(e)}")
-            raise
-
-    async def get_dataframe(self) -> "pd.DataFrame":
-        """
-        Method to read the data from the json files in the path
-        and return as a single combined pandas dataframe
-        """
-        try:
-            import pandas as pd
-
-            # Ensure files are available (local or downloaded)
-            json_files = await self.download_files()
-            logger.info(f"Reading {len(json_files)} JSON files as pandas dataframe")
-
-            return pd.concat(
-                (pd.read_json(json_file, lines=True) for json_file in json_files),
-                ignore_index=True,
-            )
-
-        except Exception as e:
-            logger.error(f"Error reading data from JSON: {str(e)}")
-            raise
-
-    async def get_batched_daft_dataframe(
-        self,
-    ) -> Union[AsyncIterator["daft.DataFrame"], Iterator["daft.DataFrame"]]:  # noqa: F821
-        """
-        Method to read the data from the json files in the path
-        and return as a batched daft dataframe
-        """
-        try:
-            import daft
-
-            # Ensure files are available (local or downloaded)
-            json_files = await self.download_files()
-            logger.info(f"Reading {len(json_files)} JSON files as daft batches")
-
-            # Yield each discovered file as separate batch with chunking
-            for json_file in json_files:
-                yield daft.read_json(json_file, _chunk_size=self.chunk_size)
-        except Exception as e:
-            logger.error(f"Error reading batched data from JSON using daft: {str(e)}")
-            raise
-
-    async def get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
-        """
-        Method to read the data from the json files in the path
-        and return as a single combined daft dataframe
-        """
-        try:
-            import daft
-
-            # Ensure files are available (local or downloaded)
-            json_files = await self.download_files()
-            logger.info(f"Reading {len(json_files)} JSON files with daft")
-
-            # Use the discovered/downloaded files directly
-            return daft.read_json(json_files)
-        except Exception as e:
-            logger.error(f"Error reading data from JSON using daft: {str(e)}")
-            raise
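A short usage sketch for the removed 1.x JsonInput follows; the directory path and file names are placeholders. Files had to be line-delimited JSON, since the class read them with pandas' lines=True mode:

# Usage sketch for the removed 1.x JsonInput; path and file names are placeholders.
import asyncio

from application_sdk.inputs.json import JsonInput  # removed in 2.1.0


async def main() -> None:
    json_input = JsonInput(
        path="./local/tmp/raw",                      # directory; a single *.json path also works
        file_names=["tables.json", "columns.json"],  # optional filter; omit to read the whole directory
        chunk_size=50_000,
    )
    async for chunk in json_input.get_batched_dataframe():
        print(chunk.shape)  # each chunk is a pandas DataFrame of up to chunk_size rows


asyncio.run(main())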
application_sdk/inputs/parquet.py
DELETED

@@ -1,272 +0,0 @@
-from typing import TYPE_CHECKING, AsyncIterator, Iterator, List, Optional, Union
-
-from application_sdk.inputs import Input
-from application_sdk.observability.logger_adaptor import get_logger
-
-logger = get_logger(__name__)
-
-if TYPE_CHECKING:
-    import daft  # type: ignore
-    import pandas as pd
-
-
-class ParquetInput(Input):
-    """
-    Parquet Input class to read data from Parquet files using daft and pandas.
-    Supports reading both single files and directories containing multiple parquet files.
-    """
-
-    _EXTENSION = ".parquet"
-
-    def __init__(
-        self,
-        path: str,
-        chunk_size: int = 100000,
-        buffer_size: int = 5000,
-        file_names: Optional[List[str]] = None,
-    ):
-        """Initialize the Parquet input class.
-
-        Args:
-            path (str): Path to parquet file or directory containing parquet files.
-                It accepts both types of paths:
-                local path or object store path
-                Wildcards are not supported.
-            chunk_size (int): Number of rows per batch. Defaults to 100000.
-            buffer_size (int): Number of rows per batch. Defaults to 5000.
-            file_names (Optional[List[str]]): List of file names to read. Defaults to None.
-
-        Raises:
-            ValueError: When path is not provided or when single file path is combined with file_names
-        """
-
-        # Validate that single file path and file_names are not both specified
-        if path.endswith(self._EXTENSION) and file_names:
-            raise ValueError(
-                f"Cannot specify both a single file path ('{path}') and file_names filter. "
-                f"Either provide a directory path with file_names, or specify the exact file path without file_names."
-            )
-
-        self.path = path
-        self.chunk_size = chunk_size
-        self.buffer_size = buffer_size
-        self.file_names = file_names
-
-    async def get_dataframe(self) -> "pd.DataFrame":
-        """Read data from parquet file(s) and return as pandas DataFrame.
-
-        Returns:
-            pd.DataFrame: Combined dataframe from specified parquet files
-
-        Raises:
-            ValueError: When no valid path can be determined or no matching files found
-            Exception: When reading parquet files fails
-
-        Example transformation:
-            Input files:
-            +------------------+
-            | file1.parquet    |
-            | file2.parquet    |
-            | file3.parquet    |
-            +------------------+
-
-            With file_names=["file1.parquet", "file3.parquet"]:
-            +-------+-------+-------+
-            | col1  | col2  | col3  |
-            +-------+-------+-------+
-            | val1  | val2  | val3  |  # from file1.parquet
-            | val7  | val8  | val9  |  # from file3.parquet
-            +-------+-------+-------+
-
-            Transformations:
-            - Only specified files are read and combined
-            - Column schemas must be compatible across files
-            - Only reads files in the specified directory
-        """
-        try:
-            import pandas as pd
-
-            # Ensure files are available (local or downloaded)
-            parquet_files = await self.download_files()
-            logger.info(f"Reading {len(parquet_files)} parquet files")
-
-            return pd.concat(
-                (pd.read_parquet(parquet_file) for parquet_file in parquet_files),
-                ignore_index=True,
-            )
-        except Exception as e:
-            logger.error(f"Error reading data from parquet file(s): {str(e)}")
-            raise
-
-    async def get_batched_dataframe(
-        self,
-    ) -> Union[AsyncIterator["pd.DataFrame"], Iterator["pd.DataFrame"]]:
-        """Read data from parquet file(s) in batches as pandas DataFrames.
-
-        Returns:
-            AsyncIterator[pd.DataFrame]: Async iterator of pandas dataframes
-
-        Raises:
-            ValueError: When no parquet files found locally or in object store
-            Exception: When reading parquet files fails
-
-        Example transformation:
-            Input files:
-            +------------------+
-            | file1.parquet    |
-            | file2.parquet    |
-            | file3.parquet    |
-            +------------------+
-
-            With file_names=["file1.parquet", "file2.parquet"] and chunk_size=2:
-            Batch 1:
-            +-------+-------+
-            | col1  | col2  |
-            +-------+-------+
-            | val1  | val2  |  # from file1.parquet
-            | val3  | val4  |  # from file1.parquet
-            +-------+-------+
-
-            Batch 2:
-            +-------+-------+
-            | col1  | col2  |
-            +-------+-------+
-            | val5  | val6  |  # from file2.parquet
-            | val7  | val8  |  # from file2.parquet
-            +-------+-------+
-
-            Transformations:
-            - Only specified files are combined then split into chunks
-            - Each batch is a separate DataFrame
-            - Only reads files in the specified directory
-        """
-        try:
-            import pandas as pd
-
-            # Ensure files are available (local or downloaded)
-            parquet_files = await self.download_files()
-            logger.info(f"Reading {len(parquet_files)} parquet files in batches")
-
-            # Process each file individually to maintain memory efficiency
-            for parquet_file in parquet_files:
-                df = pd.read_parquet(parquet_file)
-                for i in range(0, len(df), self.chunk_size):
-                    yield df.iloc[i : i + self.chunk_size]
-        except Exception as e:
-            logger.error(
-                f"Error reading data from parquet file(s) in batches: {str(e)}"
-            )
-            raise
-
-    async def get_daft_dataframe(self) -> "daft.DataFrame":  # noqa: F821
-        """Read data from parquet file(s) and return as daft DataFrame.
-
-        Returns:
-            daft.DataFrame: Combined daft dataframe from specified parquet files
-
-        Raises:
-            ValueError: When no parquet files found locally or in object store
-            Exception: When reading parquet files fails
-
-        Example transformation:
-            Input files:
-            +------------------+
-            | file1.parquet    |
-            | file2.parquet    |
-            | file3.parquet    |
-            +------------------+
-
-            With file_names=["file1.parquet", "file3.parquet"]:
-            +-------+-------+-------+
-            | col1  | col2  | col3  |
-            +-------+-------+-------+
-            | val1  | val2  | val3  |  # from file1.parquet
-            | val7  | val8  | val9  |  # from file3.parquet
-            +-------+-------+-------+
-
-            Transformations:
-            - Only specified parquet files combined into single daft DataFrame
-            - Lazy evaluation for better performance
-            - Column schemas must be compatible across files
-        """
-        try:
-            import daft  # type: ignore
-
-            # Ensure files are available (local or downloaded)
-            parquet_files = await self.download_files()
-            logger.info(f"Reading {len(parquet_files)} parquet files with daft")
-
-            # Use the discovered/downloaded files directly
-            return daft.read_parquet(parquet_files)
-        except Exception as e:
-            logger.error(
-                f"Error reading data from parquet file(s) using daft: {str(e)}"
-            )
-            raise
-
-    async def get_batched_daft_dataframe(self) -> AsyncIterator["daft.DataFrame"]:  # type: ignore
-        """Get batched daft dataframe from parquet file(s).
-
-        Returns:
-            AsyncIterator[daft.DataFrame]: An async iterator of daft DataFrames, each containing
-                a batch of data from individual parquet files
-
-        Raises:
-            ValueError: When no parquet files found locally or in object store
-            Exception: When reading parquet files fails
-
-        Example transformation:
-            Input files:
-            +------------------+
-            | file1.parquet    |
-            | file2.parquet    |
-            | file3.parquet    |
-            +------------------+
-
-            With file_names=["file1.parquet", "file3.parquet"]:
-            Batch 1 (file1.parquet):
-            +-------+-------+
-            | col1  | col2  |
-            +-------+-------+
-            | val1  | val2  |
-            | val3  | val4  |
-            +-------+-------+
-
-            Batch 2 (file3.parquet):
-            +-------+-------+
-            | col1  | col2  |
-            +-------+-------+
-            | val7  | val8  |
-            | val9  | val10 |
-            +-------+-------+
-
-            Transformations:
-            - Each specified file becomes a separate daft DataFrame batch
-            - Lazy evaluation for better performance
-            - Files processed individually for memory efficiency
-        """
-        try:
-            import daft  # type: ignore
-
-            # Ensure files are available (local or downloaded)
-            parquet_files = await self.download_files()
-            logger.info(f"Reading {len(parquet_files)} parquet files as daft batches")
-
-            # Create a lazy dataframe without loading data into memory
-            lazy_df = daft.read_parquet(parquet_files)
-
-            # Get total count efficiently
-            total_rows = lazy_df.count_rows()
-
-            # Yield chunks without loading everything into memory
-            for offset in range(0, total_rows, self.buffer_size):
-                chunk = lazy_df.offset(offset).limit(self.buffer_size)
-                yield chunk
-
-            del lazy_df
-
-        except Exception as error:
-            logger.error(
-                f"Error reading data from parquet file(s) in batches using daft: {error}"
-            )
-            raise
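Finally, a usage sketch for the removed 1.x ParquetInput; the path below is a placeholder. chunk_size controlled the pandas batches, while buffer_size controlled the daft batches:

# Usage sketch for the removed 1.x ParquetInput; the path is a placeholder.
import asyncio

from application_sdk.inputs.parquet import ParquetInput  # removed in 2.1.0


async def main() -> None:
    parquet_input = ParquetInput(path="./local/tmp/extracted", buffer_size=5_000)

    # One combined pandas DataFrame across every matching .parquet file
    df = await parquet_input.get_dataframe()
    print(df.shape)

    # Lazy daft batches of roughly buffer_size rows each
    async for batch in parquet_input.get_batched_daft_dataframe():
        print(batch.count_rows())


asyncio.run(main())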