adapta 2.11.9__py3-none-any.whl → 3.5.13__py3-none-any.whl
This diff shows the changes between publicly released package versions, as they appear in their respective public registries. It is provided for informational purposes only.
- adapta/__init__.py +1 -1
- adapta/_version.py +1 -1
- adapta/connectors/__init__.py +1 -1
- adapta/connectors/service_bus/__init__.py +1 -1
- adapta/connectors/service_bus/_connector.py +2 -3
- adapta/logs/__init__.py +1 -1
- adapta/logs/_async_logger.py +38 -24
- adapta/logs/_base.py +21 -21
- adapta/logs/_internal.py +6 -7
- adapta/logs/_internal_logger.py +113 -41
- adapta/logs/_logger_interface.py +9 -10
- adapta/logs/handlers/__init__.py +1 -1
- adapta/logs/handlers/datadog_api_handler.py +7 -7
- adapta/logs/handlers/safe_stream_handler.py +4 -4
- adapta/logs/models/__init__.py +1 -1
- adapta/logs/models/_log_level.py +1 -1
- adapta/logs/models/_logs_metadata.py +4 -5
- adapta/metrics/__init__.py +1 -1
- adapta/metrics/_base.py +14 -15
- adapta/metrics/providers/__init__.py +1 -1
- adapta/metrics/providers/datadog_provider.py +21 -22
- adapta/metrics/providers/void_provider.py +34 -0
- adapta/ml/__init__.py +1 -1
- adapta/ml/_model.py +1 -1
- adapta/ml/mlflow/__init__.py +1 -1
- adapta/ml/mlflow/_client.py +101 -5
- adapta/ml/mlflow/_functions.py +44 -13
- adapta/process_communication/__init__.py +1 -1
- adapta/process_communication/_models.py +8 -6
- adapta/schema_management/README.md +0 -1
- adapta/schema_management/__init__.py +1 -1
- adapta/schema_management/schema_entity.py +3 -3
- adapta/security/__init__.py +1 -1
- adapta/security/clients/__init__.py +1 -1
- adapta/security/clients/_azure_client.py +14 -12
- adapta/security/clients/_base.py +11 -6
- adapta/security/clients/_local_client.py +6 -6
- adapta/security/clients/aws/__init__.py +1 -1
- adapta/security/clients/aws/_aws_client.py +12 -10
- adapta/security/clients/aws/_aws_credentials.py +7 -8
- adapta/security/clients/hashicorp_vault/__init__.py +1 -1
- adapta/security/clients/hashicorp_vault/hashicorp_vault_client.py +7 -6
- adapta/security/clients/hashicorp_vault/kubernetes_client.py +2 -2
- adapta/security/clients/hashicorp_vault/oidc_client.py +2 -2
- adapta/security/clients/hashicorp_vault/token_client.py +2 -2
- adapta/storage/__init__.py +1 -1
- adapta/storage/blob/README.md +14 -10
- adapta/storage/blob/__init__.py +1 -1
- adapta/storage/blob/azure_storage_client.py +76 -24
- adapta/storage/blob/base.py +15 -13
- adapta/storage/blob/local_storage_client.py +28 -16
- adapta/storage/blob/s3_storage_client.py +19 -24
- adapta/storage/cache/__init__.py +1 -1
- adapta/storage/cache/_base.py +5 -5
- adapta/storage/cache/redis_cache.py +5 -5
- adapta/storage/database/__init__.py +4 -1
- adapta/storage/database/{README.md → v2/README.md} +2 -0
- adapta/storage/database/v2/__init__.py +17 -0
- adapta/storage/database/v2/azure_sql.py +143 -0
- adapta/storage/{distributed_object_store/datastax_astra → database/v2/models}/__init__.py +5 -5
- adapta/storage/database/v2/models/_models.py +53 -0
- adapta/storage/database/{odbc.py → v2/odbc.py} +22 -13
- adapta/storage/database/{snowflake_sql.py → v2/snowflake_sql.py} +20 -12
- adapta/storage/database/{trino_sql.py → v2/trino_sql.py} +15 -6
- adapta/storage/database/v3/README.md +109 -0
- adapta/storage/database/v3/__init__.py +14 -0
- adapta/storage/database/{azure_sql.py → v3/azure_sql.py} +7 -9
- adapta/storage/database/v3/models/__init__.py +19 -0
- adapta/storage/database/{models → v3/models}/_models.py +2 -3
- adapta/storage/database/v3/odbc.py +217 -0
- adapta/storage/database/v3/snowflake_sql.py +241 -0
- adapta/storage/database/v3/trino_sql.py +154 -0
- adapta/storage/delta_lake/__init__.py +2 -3
- adapta/storage/delta_lake/{README.md → v2/README.md} +2 -0
- adapta/storage/delta_lake/v2/__init__.py +19 -0
- adapta/storage/delta_lake/{_functions.py → v2/_functions.py} +43 -27
- adapta/storage/delta_lake/v2/_models.py +72 -0
- adapta/storage/delta_lake/v3/README.md +147 -0
- adapta/storage/delta_lake/v3/__init__.py +20 -0
- adapta/storage/delta_lake/v3/_functions.py +315 -0
- adapta/storage/delta_lake/{_models.py → v3/_models.py} +4 -5
- adapta/storage/distributed_object_store/__init__.py +3 -1
- adapta/storage/distributed_object_store/v2/__init__.py +18 -0
- adapta/storage/distributed_object_store/{datastax_astra → v2/datastax_astra}/README.md +2 -0
- adapta/storage/distributed_object_store/v2/datastax_astra/__init__.py +20 -0
- adapta/storage/distributed_object_store/{datastax_astra → v2/datastax_astra}/_models.py +16 -0
- adapta/storage/distributed_object_store/{datastax_astra → v2/datastax_astra}/astra_client.py +61 -52
- adapta/storage/{database/models → distributed_object_store/v3}/__init__.py +4 -5
- adapta/storage/distributed_object_store/v3/datastax_astra/README.md +277 -0
- adapta/storage/distributed_object_store/v3/datastax_astra/__init__.py +20 -0
- adapta/storage/distributed_object_store/v3/datastax_astra/_model_mappers.py +469 -0
- adapta/storage/distributed_object_store/v3/datastax_astra/_models.py +134 -0
- adapta/storage/distributed_object_store/v3/datastax_astra/astra_client.py +569 -0
- adapta/storage/exceptions.py +1 -1
- adapta/storage/models/__init__.py +1 -1
- adapta/storage/models/_functions.py +5 -5
- adapta/storage/models/astra.py +4 -4
- adapta/storage/models/aws.py +1 -1
- adapta/storage/models/azure.py +2 -3
- adapta/storage/models/base.py +9 -1
- adapta/storage/models/enum.py +19 -0
- adapta/storage/models/filter_expression.py +124 -10
- adapta/storage/models/format.py +16 -205
- adapta/storage/models/formatters/__init__.py +36 -0
- adapta/storage/models/formatters/dict.py +43 -0
- adapta/storage/models/formatters/exceptions.py +7 -0
- adapta/storage/models/formatters/metaframe.py +48 -0
- adapta/storage/models/formatters/pandas.py +139 -0
- adapta/storage/models/formatters/pickle.py +36 -0
- adapta/storage/models/formatters/polars.py +240 -0
- adapta/storage/models/formatters/unit.py +26 -0
- adapta/storage/models/hive.py +24 -16
- adapta/storage/models/local.py +1 -1
- adapta/storage/models/trino.py +56 -0
- adapta/storage/query_enabled_store/README.md +1 -1
- adapta/storage/query_enabled_store/__init__.py +7 -1
- adapta/storage/query_enabled_store/_models.py +42 -13
- adapta/storage/query_enabled_store/_qes_astra.py +27 -14
- adapta/storage/query_enabled_store/_qes_delta.py +32 -10
- adapta/storage/query_enabled_store/_qes_local.py +81 -0
- adapta/storage/query_enabled_store/_qes_trino.py +133 -0
- adapta/storage/secrets/__init__.py +1 -1
- adapta/storage/secrets/_base.py +5 -4
- adapta/storage/secrets/azure_secret_client.py +3 -4
- adapta/storage/secrets/hashicorp_vault_secret_storage_client.py +5 -5
- adapta/utils/README.md +92 -0
- adapta/utils/__init__.py +2 -1
- adapta/utils/_common.py +50 -17
- adapta/utils/_requests.py +53 -0
- adapta/utils/concurrent_task_runner.py +10 -9
- adapta/utils/data_structures/_functions.py +6 -6
- adapta/utils/decorators/_logging.py +3 -3
- adapta/utils/decorators/_rate_limit.py +2 -2
- adapta/utils/metaframe.py +172 -0
- adapta/utils/python_typing/_functions.py +5 -10
- {adapta-2.11.9.dist-info → adapta-3.5.13.dist-info}/METADATA +18 -14
- adapta-3.5.13.dist-info/RECORD +146 -0
- {adapta-2.11.9.dist-info → adapta-3.5.13.dist-info}/WHEEL +1 -1
- adapta-2.11.9.dist-info/RECORD +0 -110
- {adapta-2.11.9.dist-info → adapta-3.5.13.dist-info/licenses}/LICENSE +0 -0
adapta/storage/delta_lake/{_functions.py → v2/_functions.py}:

```diff
@@ -1,7 +1,8 @@
+# pylint: disable=duplicate-code
 """
  Operations on Delta Lake tables.
 """
-# Copyright (c) 2023-
+# Copyright (c) 2023-2026. ECCO Data & AI and other project contributors.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
```
```diff
@@ -19,7 +20,8 @@
 import datetime
 import hashlib
 import zlib
-from
+from collections.abc import Iterator, Iterable
+from warnings import warn
 
 from pandas import DataFrame, concat
 import pyarrow
```
```diff
@@ -30,27 +32,29 @@ from pyarrow._dataset_parquet import ParquetReadOptions  # pylint: disable=E0611
 from adapta.logs import SemanticLogger
 from adapta.security.clients._base import AuthenticationClient
 from adapta.storage.models.base import DataPath
-from adapta.storage.delta_lake._models import DeltaTransaction
+from adapta.storage.delta_lake.v2._models import DeltaTransaction
 from adapta.storage.cache import KeyValueCache
-from adapta.storage.models.
+from adapta.storage.models.formatters import PandasDataFrameParquetSerializationFormat
 from adapta.storage.models.filter_expression import Expression, ArrowFilterExpression, compile_expression
 
 
 def load(  # pylint: disable=R0913
     auth_client: AuthenticationClient,
     path: DataPath,
-    version:
-
-
-
-
-
+    version: int | None = None,
+    timestamp: datetime.datetime | None = None,
+    row_filter: Expression | pyarrow.compute.Expression | None = None,
+    columns: list[str] | None = None,
+    batch_size: int | None = None,
+    partition_filter_expressions: list[tuple] | None = None,
+) -> DeltaTable | DataFrame | Iterator[DataFrame]:
     """
     Loads Delta Lake table from Azure or AWS storage and converts it to a pandas dataframe.
 
     :param auth_client: AuthenticationClient for target storage.
     :param path: Path to delta table, in HDFS format: abfss://container@account.dfs.core.windows.net/my/path
-    :param version: Optional version to read. Defaults to latest.
+    :param version: Optional version to read. Defaults to latest. If set, timestamp will be ignored.
+    :param timestamp: Optional time travel timestamp. Allows to read data as of a specific time. Ignored if version is set.
     :param row_filter: Optional filter to apply, as pyarrow expression. Example:
         from pyarrow.dataset import field as pyarrow_field
 
```
```diff
@@ -66,9 +70,21 @@ def load(  # pylint: disable=R0913
 
     :return: A DeltaTable wrapped Rust class, pandas Dataframe or an iterator of pandas Dataframes, for batched reads.
     """
-
-
-
+    warn(
+        "You are using version 2 of the load function. "
+        "This is deprecated and will be removed in adapta version 4. "
+        "Please upgrade to version 3: adapta.storage.delta_lake.v3",
+        DeprecationWarning,
+    )
+    if version:
+        timestamp = None
+
+    pyarrow_ds = DeltaTable(path.to_delta_rs_path(), version=version, storage_options=auth_client.connect_storage(path))
+
+    if timestamp:
+        pyarrow_ds.load_as_version(timestamp)
+
+    pyarrow_ds = pyarrow_ds.to_pyarrow_dataset(
         partitions=partition_filter_expressions,
         parquet_read_options=ParquetReadOptions(coerce_int96_timestamp_unit="ms"),
         filesystem=auth_client.get_pyarrow_filesystem(path),
```
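To make the new time-travel parameters concrete, here is a minimal sketch of the v2 `load` call. It assumes `load` is re-exported from `adapta.storage.delta_lake.v2` (as the v3 import index does for its own module) and uses placeholder storage-account details:

```python
import datetime

from adapta.security.clients import AzureClient
from adapta.storage.models.azure import AdlsGen2Path
from adapta.storage.delta_lake.v2 import load

client = AzureClient()  # assumes PROTEUS__USE_AZURE_CREDENTIAL=1 is set
path = AdlsGen2Path.from_hdfs_path("abfss://container@account.dfs.core.windows.net/my/table")

# Time travel by timestamp: reads the table as of yesterday.
yesterday = datetime.datetime.now() - datetime.timedelta(days=1)
df = load(client, path, timestamp=yesterday)

# If both are given, version wins and timestamp is silently dropped,
# per the `if version: timestamp = None` guard in the hunk above.
df_v5 = load(client, path, version=5, timestamp=yesterday)
```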
```diff
@@ -90,7 +106,7 @@ def load(  # pylint: disable=R0913
     return pyarrow_table.to_pandas(timestamp_as_object=True)
 
 
-def history(auth_client: AuthenticationClient, path: DataPath, limit:
+def history(auth_client: AuthenticationClient, path: DataPath, limit: int | None = 1) -> Iterable[DeltaTransaction]:
     """
     Returns transaction history for the table under path.
 
```
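Continuing the sketch above, the updated `history` signature can be exercised like this (again assuming the function is re-exported from the v2 package):

```python
from adapta.storage.delta_lake.v2 import history

# Inspect the last 10 transactions; the default limit is 1,
# and limit=None would return the full transaction log.
for tx in history(client, path, limit=10):
    print(tx.version, tx.operation, tx.is_blind_append)
```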
```diff
@@ -108,10 +124,10 @@ def get_cache_key(
     auth_client: AuthenticationClient,
     path: DataPath,
     batch_size=1000,
-    version:
-    row_filter:
-    columns:
-    partition_filter_expressions:
+    version: int | None = None,
+    row_filter: Expression | None = None,
+    columns: list[str] | None = None,
+    partition_filter_expressions: list[tuple] | None = None,
 ) -> str:
     """
     Returns a cache key for the path and data read arguments
```
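A quick sketch of how the cache key relates to the read arguments; `get_cache_key` is assumed importable from the private v2 `_functions` module shown in this diff:

```python
from adapta.storage.delta_lake.v2._functions import get_cache_key

# The key is derived from the path plus every read argument, so identical
# arguments map to the same cache entry and changed arguments to a new one.
key_a = get_cache_key(client, path, columns=["my_column"], version=5)
key_b = get_cache_key(client, path, columns=["my_column"], version=5)
assert key_a == key_b
```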
```diff
@@ -156,13 +172,13 @@ def load_cached(  # pylint: disable=R0913
     auth_client: AuthenticationClient,
     path: DataPath,
     cache: KeyValueCache,
-    cache_expires_after:
+    cache_expires_after: datetime.timedelta | None = datetime.timedelta(hours=1),
     batch_size=1000,
-    version:
-    row_filter:
-    columns:
-    partition_filter_expressions:
-    logger:
+    version: int | None = None,
+    row_filter: Expression | None = None,
+    columns: list[str] | None = None,
+    partition_filter_expressions: list[tuple] | None = None,
+    logger: SemanticLogger | None = None,
 ) -> DataFrame:
     """
     Loads Delta Lake table from an external cache and converts it to a single pandas dataframe (after applying column projections and row filters).
```
```diff
@@ -217,7 +233,7 @@ def load_cached(  # pylint: disable=R0913
     try:
         return concat(
             [
-
+                PandasDataFrameParquetSerializationFormat().deserialize(zlib.decompress(cached_batch))
                 for batch_key, cached_batch in cache.get(cache_key, is_map=True).items()
                 if batch_key != b"completed"
             ]
```
```diff
@@ -255,7 +271,7 @@ def load_cached(  # pylint: disable=R0913
                 cache.include(
                     key=cache_key,
                     attribute=str(batch_index),
-                    value=zlib.compress(
+                    value=zlib.compress(PandasDataFrameParquetSerializationFormat().serialize(batch)),
                 )
                 for batch_index, batch in enumerate(data)
             ],
```
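The two hunks above are mirror images of each other: batches are written as zlib-compressed parquet bytes and read back the same way. A self-contained round-trip using the formatter from this diff:

```python
import zlib

import pandas as pd
from adapta.storage.models.formatters import PandasDataFrameParquetSerializationFormat

fmt = PandasDataFrameParquetSerializationFormat()
frame = pd.DataFrame({"a": [1, 2, 3]})

# What load_cached stores per batch: parquet bytes, zlib-compressed.
blob = zlib.compress(fmt.serialize(frame))

# What it does on a cache hit.
restored = fmt.deserialize(zlib.decompress(blob))
assert restored.equals(frame)
```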
adapta/storage/delta_lake/v2/_models.py (new file):

```diff
@@ -0,0 +1,72 @@
+# pylint: disable=duplicate-code
+"""
+ Models used by delta lake functions.
+"""
+# Copyright (c) 2023-2026. ECCO Data & AI and other project contributors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from dataclasses import dataclass
+from enum import Enum
+
+
+class DeltaOperation(Enum):
+    """
+    Possible Delta table operations.
+    """
+
+    DELETE = "DELETE"
+    UPDATE = "UPDATE"
+    WRITE = "WRITE"
+    MERGE = "MERGE"
+    CREATE_TABLE = "CREATE TABLE"
+    CREATE_TABLE_AS_SELECT = "CREATE TABLE AS SELECT"
+    CREATE_OR_REPLACE_TABLE_AS_SELECT = "CREATE OR REPLACE TABLE AS SELECT"
+    CHANGE_COLUMN = "CHANGE COLUMN"
+    VACUUM_START = "VACUUM START"
+    VACUUM_END = "VACUUM END"
+    UNDEFINED = "UNDEFINED"
+
+
+@dataclass
+class DeltaTransaction:
+    """
+    A subset of Delta table transaction entry properties.
+    """
+
+    version: int
+    timestamp: int
+    operation: DeltaOperation
+    operation_parameters: dict
+    read_version: int
+    is_blind_append: bool
+
+    @classmethod
+    def from_dict(cls, value: dict) -> "DeltaTransaction":
+        """
+        Converts delta transaction log entry to DeltaTransaction.
+        :param value: single entry from `describe history ...`
+        :return:
+        """
+        delta_op = value.get("operation", DeltaOperation.UNDEFINED.value)
+        supported_ops = {item.value for item in DeltaOperation}
+
+        return cls(
+            version=value.get("version", -1),
+            timestamp=value["timestamp"],
+            operation=DeltaOperation(delta_op) if delta_op in supported_ops else DeltaOperation.UNDEFINED,
+            operation_parameters=value.get("operationParameters", {}),
+            read_version=value.get("readVersion", -1),
+            is_blind_append=value.get("isBlindAppend", False),
+        )
```
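`from_dict` is tolerant of missing keys: everything except `timestamp` has a fallback, and unknown operations degrade to `UNDEFINED`. A quick sketch with a hypothetical raw history entry (field names follow the Delta transaction log as read by the code above):

```python
from adapta.storage.delta_lake.v2._models import DeltaTransaction, DeltaOperation

entry = {
    "version": 12,
    "timestamp": 1700000000000,
    "operation": "MERGE",
    "operationParameters": {"predicate": "target.id = source.id"},
    "readVersion": 11,
    "isBlindAppend": False,
}

tx = DeltaTransaction.from_dict(entry)
assert tx.operation is DeltaOperation.MERGE

# An operation outside the enum maps to UNDEFINED instead of raising.
other = DeltaTransaction.from_dict({"timestamp": 0, "operation": "RESTORE"})
assert other.operation is DeltaOperation.UNDEFINED
```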
adapta/storage/delta_lake/v3/README.md (new file):

````diff
@@ -0,0 +1,147 @@
+# Delta Lake Operations
+
+Supported API:
+- read delta table as `MetaFrame` which can easily be converted to `pandas.DataFrame` or `polars.DataFrame`
+- read delta table in batches of a provided size, each batch being a `MetaFrame`
+- read a subset of columns from delta table
+- read and filter a delta table without loading all rows in memory
+
+## Example usage
+Prepare a connection and load
+### For Azure Datalake Gen2
+
+```python
+import os
+from adapta.security.clients import AzureClient
+from adapta.storage.models.azure import AdlsGen2Path
+from adapta.storage.delta_lake import load
+
+os.environ["PROTEUS__USE_AZURE_CREDENTIAL"] = "1"
+azure_client = AzureClient()
+adls_path = AdlsGen2Path.from_hdfs_path('abfss://container@account.dfs.core.windows.net/path/to/my/table')
+
+# get Iterable[MetaFrame]
+batches = load(azure_client, adls_path, batch_size=1000)
+```
+
+### For AWS Simple Storage Service (S3) or S3-Compatible Storage
+
+```python
+import os
+from adapta.security.clients import AwsClient
+from adapta.security.clients.aws._aws_credentials import EnvironmentAwsCredentials
+from adapta.storage.delta_lake import load
+import pandas as pd
+import pyarrow as pa
+
+# Set up environment variables
+os.environ["PROTEUS__AWS_ACCESS_KEY_ID"] = minio_access_key_id
+os.environ["PROTEUS__AWS_SECRET_ACCESS_KEY"] = minio_secret_key
+os.environ["PROTEUS__AWS_REGION"] = "eu-central-1"
+os.environ["PROTEUS__AWS_ENDPOINT"] = "http://example.com"
+
+# Create client
+credentials = EnvironmentAwsCredentials()
+aws_client = AwsClient(credentials)
+
+# Initialize session
+aws_client.initialize_session()
+
+# Creating a delta lake table with sample data
+data = {
+    'Character': ['Boromir', 'Harry Potter', 'Sherlock Holmes', 'Tony Stark', 'Darth Vader'],
+    'Occupation': ['Professional succumber to temptation', 'Wizard', 'Detective', 'Iron Man', 'Sith Lord'],
+    'Catchphrase': [
+        'One does not simply walk into Mordor.',
+        'Expecto Patronum!',
+        'Elementary, my dear Watson.',
+        'I am Iron Man.',
+        'I find your lack of faith disturbing.'
+    ]
+}
+
+df = pd.DataFrame(data)  # Create a pandas DataFrame from the data
+table = pa.Table.from_pandas(df)  # Convert the DataFrame to a PyArrow Table
+path_test = '/path/to/store/locally/delta/lake/table'
+deltalake.write_deltalake(path_test, table)  # Write the PyArrow Table to a Delta Lake table
+
+# Save the Delta Lake table to S3 blob storage
+s3_client.save_data(path_test, s3_path)
+
+# Get Iterable[pandas.DataFrame]
+batches = load(aws_client, s3_path, batch_size=1000)
+
+# Print each loaded batch
+for batch in batches:
+    print(batch.to_pandas())
+    print("\n---\n")
+
+# The content of the Delta Lake table should be printed to the screen
+#          Character  ...                            Catchphrase
+# 0          Boromir  ...  One does not simply walk into Mordor.
+# 1     Harry Potter  ...                      Expecto Patronum!
+# 2  Sherlock Holmes  ...            Elementary, my dear Watson.
+# 3       Tony Stark  ...                         I am Iron Man.
+# 4      Darth Vader  ...  I find your lack of faith disturbing.
+#
+# [5 rows x 3 columns]
+# ---
+```
+## Using the Filtering API
+1. Create generic filter expressions
+```python
+from adapta.storage.models.filter_expression import FilterField
+
+simple_filter = FilterField("my_column") == "some-value"
+combined_filter = (FilterField("my_column") == "some-value") & (FilterField("other_column") == "another-value")
+combined_filter_with_collection = (FilterField("my_column") == "something1") & (FilterField("other_column").isin(['else', 'nonexistent']))
+complex_filter = (FilterField("my_column") == "something1") | (FilterField("my_other_column") == "else") & (FilterField("another_column") == 123)
+```
+2. Load and apply the expression
+```python
+# simple_filtered is of type pandas.DataFrame
+simple_filtered = load(azure_client, adls_path, row_filter=simple_filter, columns=["my_column", "my_other_column"]).to_pandas()
+# my_column my_other_column
+# 0 some-value 123
+# 1 some-value another-value
+
+print(load(azure_client, adls_path, row_filter=combined_filter, columns=["my_column", "my_other_column"]).to_pandas())
+# my_column my_other_column
+# 0 some-value another-value
+
+print(load(azure_client, adls_path, row_filter=combined_filter_with_collection, columns=["my_column", "my_other_column"]).to_pandas())
+# my_column my_other_column
+# 0 something1 else
+# 1 something1 nonexistent
+
+print(load(azure_client, adls_path, row_filter=complex_filter, columns=["my_column", "my_other_column", "another_column"]).to_pandas())
+# my_column my_other_column another_column
+# 0 something1 else 1
+# 1 something1 nonexistent 2
+# 2 something1 nonexistent1 123
+
+```
+## Using with Hive paths
+```python
+logger: SemanticLogger  # review adapta.logs readme to learn how to construct a logger instance
+os.environ['PROTEUS__HIVE_USER'] = 'delamain'
+os.environ['PROTEUS__HIVE_PASSWORD'] = 'secret'
+hive_path = HivePath.from_hdfs_path(
+    "hive://sqlserver@myserver.database.windows.net:1433/sparkdatalake/bronze/bronze_table")
+
+adls_path2 = AdlsGen2Path.from_hdfs_path(hive_path.get_physical_path(logger=logger))
+
+# get Iterable[MetaFrame]
+batches2 = load(azure_client, adls_path2, batch_size=1000)
+
+# read data using Redis Cache, improves read time by a factor of >10 on single-node Redis.
+# for big tables, choose bigger batch sizes to speed up cache population. General rule:
+# batch_size = row_count / 10
+# if there is no cache hit, load_cached() will fall back to load() behaviour
+r_cache = RedisCache(host="esd-superset-test.redis.cache.windows.net", database_number=1)
+os.environ['PROTEUS__CACHE_REDIS_PASSWORD'] = '...'
+read_raw = load_cached(azure_client, adls_path, r_cache, row_filter=filter,
+                       cache_expires_after=datetime.timedelta(minutes=15), batch_size=int(1e6))
+```
+
+
````
adapta/storage/delta_lake/v3/__init__.py (new file):

```diff
@@ -0,0 +1,20 @@
+"""
+ Import index
+"""
+# Copyright (c) 2023-2026. ECCO Data & AI and other project contributors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from adapta.storage.delta_lake.v3._functions import *
+from adapta.storage.delta_lake.v3._models import *
```