vast-daft 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vast_daft-0.1.1/.gitignore +43 -0
- vast_daft-0.1.1/PKG-INFO +82 -0
- vast_daft-0.1.1/README.md +55 -0
- vast_daft-0.1.1/pyproject.toml +59 -0
- vast_daft-0.1.1/src/vast_daft/__init__.py +186 -0
- vast_daft-0.1.1/src/vast_daft/_pushdown.py +245 -0
- vast_daft-0.1.1/src/vast_daft/config.py +80 -0
- vast_daft-0.1.1/src/vast_daft/connection.py +132 -0
- vast_daft-0.1.1/src/vast_daft/py.typed +0 -0
- vast_daft-0.1.1/src/vast_daft/scan.py +377 -0
- vast_daft-0.1.1/src/vast_daft/sink.py +165 -0
- vast_daft-0.1.1/src/vast_daft/source.py +553 -0
- vast_daft-0.1.1/src/vast_daft/table.py +568 -0
- vast_daft-0.1.1/tests/__init__.py +0 -0
- vast_daft-0.1.1/tests/test_catalog_identifiers.py +236 -0
- vast_daft-0.1.1/tests/test_config_and_predicates.py +102 -0
- vast_daft-0.1.1/tests/test_pushdown.py +284 -0
- vast_daft-0.1.1/tests/test_scan.py +147 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
*.egg
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
.eggs/
|
|
10
|
+
|
|
11
|
+
# Virtual environments
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
env/
|
|
15
|
+
|
|
16
|
+
# uv
|
|
17
|
+
uv.lock
|
|
18
|
+
|
|
19
|
+
# IDE
|
|
20
|
+
.idea/
|
|
21
|
+
.vscode/
|
|
22
|
+
*.swp
|
|
23
|
+
*.swo
|
|
24
|
+
*~
|
|
25
|
+
.DS_Store
|
|
26
|
+
|
|
27
|
+
# Testing
|
|
28
|
+
.pytest_cache/
|
|
29
|
+
.coverage
|
|
30
|
+
htmlcov/
|
|
31
|
+
.mypy_cache/
|
|
32
|
+
|
|
33
|
+
# Distribution
|
|
34
|
+
*.whl
|
|
35
|
+
*.tar.gz
|
|
36
|
+
|
|
37
|
+
# Environment
|
|
38
|
+
.env
|
|
39
|
+
.env.*
|
|
40
|
+
!.env.example
|
|
41
|
+
|
|
42
|
+
# Local example data (Lance datasets, downloaded corpora)
|
|
43
|
+
data/
|
vast_daft-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: vast-daft
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Daft custom connector (DataSource/DataSink) for VastDB
|
|
5
|
+
Project-URL: Homepage, https://github.com/vast-data/vast-daft
|
|
6
|
+
Project-URL: Repository, https://github.com/vast-data/vast-daft
|
|
7
|
+
Author-email: Ofer Helman <ofer.helman@vastdata.com>
|
|
8
|
+
License-Expression: Apache-2.0
|
|
9
|
+
Requires-Python: >=3.12
|
|
10
|
+
Requires-Dist: daft==0.7.10
|
|
11
|
+
Requires-Dist: ibis-framework>=9.0
|
|
12
|
+
Requires-Dist: numpy<2
|
|
13
|
+
Requires-Dist: pyarrow>=15.0
|
|
14
|
+
Requires-Dist: vastdb>=1.2
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: pyiceberg[s3fs,sql-sqlite]>=0.11.1; extra == 'dev'
|
|
17
|
+
Requires-Dist: pylance>=0.39.0; extra == 'dev'
|
|
18
|
+
Requires-Dist: pyright>=1.1; extra == 'dev'
|
|
19
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
20
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
21
|
+
Requires-Dist: python-dotenv>=1.0; extra == 'dev'
|
|
22
|
+
Requires-Dist: ray==2.53.0; extra == 'dev'
|
|
23
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
24
|
+
Provides-Extra: examples
|
|
25
|
+
Requires-Dist: python-dotenv>=1.0; extra == 'examples'
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# vast-daft
|
|
29
|
+
|
|
30
|
+
Daft custom connector (`DataSource` / `DataSink`) for [VastDB](https://vastdata.com).
|
|
31
|
+
|
|
32
|
+
## Installation
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install vast-daft
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Or with uv:
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
uv add vast-daft
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Quick Start
|
|
45
|
+
|
|
46
|
+
### Reading from VastDB
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
import pyarrow as pa
|
|
50
|
+
from vast_daft import VastDBConfig, VastDBDataSource
|
|
51
|
+
|
|
52
|
+
config = VastDBConfig(
|
|
53
|
+
endpoint="http://vastdb:9090",
|
|
54
|
+
access_key="YOUR_ACCESS_KEY",
|
|
55
|
+
secret_key="YOUR_SECRET_KEY",
|
|
56
|
+
bucket="my-bucket",
|
|
57
|
+
schema="my-schema",
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
schema = pa.schema([
|
|
61
|
+
("id", pa.string()),
|
|
62
|
+
("name", pa.string()),
|
|
63
|
+
("value", pa.float64()),
|
|
64
|
+
])
|
|
65
|
+
|
|
66
|
+
source = VastDBDataSource(config, "my_table", schema)
|
|
67
|
+
df = source.read()
|
|
68
|
+
df.show()
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Writing to VastDB
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
import daft
import pyarrow as pa
|
|
75
|
+
from vast_daft import VastDBConfig, VastDBDataSink
|
|
76
|
+
|
|
77
|
+
config = VastDBConfig(...)
|
|
78
|
+
schema = pa.schema([("id", pa.string()), ("value", pa.float64())])
|
|
79
|
+
|
|
80
|
+
sink = VastDBDataSink(config, "my_table", schema)
|
|
81
|
+
daft.from_pydict({"id": ["a", "b"], "value": [1.0, 2.0]}).write_sink(sink).show()
|
|
82
|
+
```
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# vast-daft
|
|
2
|
+
|
|
3
|
+
Daft custom connector (`DataSource` / `DataSink`) for [VastDB](https://vastdata.com).
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install vast-daft
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Or with uv:
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
uv add vast-daft
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Quick Start
|
|
18
|
+
|
|
19
|
+
### Reading from VastDB
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
import pyarrow as pa
|
|
23
|
+
from vast_daft import VastDBConfig, VastDBDataSource
|
|
24
|
+
|
|
25
|
+
config = VastDBConfig(
|
|
26
|
+
endpoint="http://vastdb:9090",
|
|
27
|
+
access_key="YOUR_ACCESS_KEY",
|
|
28
|
+
secret_key="YOUR_SECRET_KEY",
|
|
29
|
+
bucket="my-bucket",
|
|
30
|
+
schema="my-schema",
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
schema = pa.schema([
|
|
34
|
+
("id", pa.string()),
|
|
35
|
+
("name", pa.string()),
|
|
36
|
+
("value", pa.float64()),
|
|
37
|
+
])
|
|
38
|
+
|
|
39
|
+
source = VastDBDataSource(config, "my_table", schema)
|
|
40
|
+
df = source.read()
|
|
41
|
+
df.show()
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Writing to VastDB
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
import daft
import pyarrow as pa
|
|
48
|
+
from vast_daft import VastDBConfig, VastDBDataSink
|
|
49
|
+
|
|
50
|
+
config = VastDBConfig(...)
|
|
51
|
+
schema = pa.schema([("id", pa.string()), ("value", pa.float64())])
|
|
52
|
+
|
|
53
|
+
sink = VastDBDataSink(config, "my_table", schema)
|
|
54
|
+
daft.from_pydict({"id": ["a", "b"], "value": [1.0, 2.0]}).write_sink(sink).show()
|
|
55
|
+
```
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "vast-daft"
|
|
3
|
+
version = "0.1.1"
|
|
4
|
+
description = "Daft custom connector (DataSource/DataSink) for VastDB"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
license = "Apache-2.0"
|
|
8
|
+
authors = [{ name = "Ofer Helman", email = "ofer.helman@vastdata.com" }]
|
|
9
|
+
dependencies = [
|
|
10
|
+
"daft==0.7.10",
|
|
11
|
+
"numpy<2",
|
|
12
|
+
"vastdb>=1.2",
|
|
13
|
+
"pyarrow>=15.0",
|
|
14
|
+
"ibis-framework>=9.0",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.urls]
|
|
18
|
+
Homepage = "https://github.com/vast-data/vast-daft"
|
|
19
|
+
Repository = "https://github.com/vast-data/vast-daft"
|
|
20
|
+
|
|
21
|
+
[project.optional-dependencies]
|
|
22
|
+
dev = [
|
|
23
|
+
"pytest>=8.0",
|
|
24
|
+
"pytest-cov>=5.0",
|
|
25
|
+
"ruff>=0.4",
|
|
26
|
+
"pyright>=1.1",
|
|
27
|
+
# "ty>=0.0.0a1",
|
|
28
|
+
"python-dotenv>=1.0",
|
|
29
|
+
"ray==2.53.0",
|
|
30
|
+
"pylance>=0.39.0",
|
|
31
|
+
"pyiceberg[s3fs,sql-sqlite]>=0.11.1",
|
|
32
|
+
]
|
|
33
|
+
examples = [
|
|
34
|
+
"python-dotenv>=1.0",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[build-system]
|
|
38
|
+
requires = ["hatchling"]
|
|
39
|
+
build-backend = "hatchling.build"
|
|
40
|
+
|
|
41
|
+
[tool.pyright]
|
|
42
|
+
typeCheckingMode = "basic"
|
|
43
|
+
|
|
44
|
+
[tool.pytest]
|
|
45
|
+
testpaths = ["tests"]
|
|
46
|
+
|
|
47
|
+
[tool.ruff]
|
|
48
|
+
target-version = "py312"
|
|
49
|
+
line-length = 120
|
|
50
|
+
|
|
51
|
+
[tool.ruff.lint]
|
|
52
|
+
select = ["E", "F", "I", "W", "UP"]
|
|
53
|
+
|
|
54
|
+
[tool.ty.environment]
|
|
55
|
+
python-version = "3.12"
|
|
56
|
+
|
|
57
|
+
[tool.ty.rules]
|
|
58
|
+
unsupported-operator = "ignore"
|
|
59
|
+
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""vast-daft: Daft custom connector for VastDB.
|
|
2
|
+
|
|
3
|
+
Provides ``DataSource`` / ``DataSink`` implementations for reading from
|
|
4
|
+
and writing to VastDB, and a Daft catalog integration for native SQL queries.
|
|
5
|
+
|
|
6
|
+
Importing this module also exposes two convenience entry points:

* ``vast_daft.read_vastdb(config, table_name, ...)`` — module-level function that reads a VastDB table
* ``df.write_vastdb(config, table_name, table_schema, ...)`` — method registered on ``daft.DataFrame`` that writes to a VastDB table
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
import daft
|
|
18
|
+
import pyarrow as pa
|
|
19
|
+
|
|
20
|
+
from vast_daft.config import VastDBConfig
|
|
21
|
+
from vast_daft.connection import VastDBConnection
|
|
22
|
+
from vast_daft.scan import VastDBScanOperator
|
|
23
|
+
from vast_daft.sink import VastDBDataSink
|
|
24
|
+
from vast_daft.source import VastDBDataSource
|
|
25
|
+
from vast_daft.table import VastDBCatalog, VastDBTable
|
|
26
|
+
|
|
27
|
+
# ---------------------------------------------------------------------------
|
|
28
|
+
# vast_daft.read_vastdb()
|
|
29
|
+
# ---------------------------------------------------------------------------
|
|
30
|
+
|
|
31
|
+
def _read_vastdb(
    config: VastDBConfig,
    table_name: str,
    *,
    bucket: str | None = None,
    schema: str | None = None,
    columns: list[str] | None = None,
    num_splits: int | None = None,
) -> daft.DataFrame:
    """Create a Daft DataFrame backed by a scan of a VastDB table.

    Module-level factory in the spirit of ``daft.read_parquet()`` /
    ``daft.read_iceberg()``; it is exported as ``vast_daft.read_vastdb``.

    Parameters
    ----------
    config:
        VastDB connection configuration.
    table_name:
        Name of the VastDB table to read.
    bucket:
        VastDB bucket. Falls back to ``config.bucket`` when not given.
    schema:
        VastDB schema. Falls back to ``config.schema`` when not given.
    columns:
        Optional list of column names to project. Each name is looked up in
        the table's schema.
    num_splits:
        Number of parallel read splits. Only forwarded to the scan operator
        when given; otherwise the operator's own default applies.

    Returns
    -------
    daft.DataFrame

    Raises
    ------
    ValueError
        If neither the arguments nor ``config`` supply both a bucket and a
        schema.

    Examples
    --------
    >>> import vast_daft
    >>> from vast_daft import VastDBConfig
    >>> config = VastDBConfig(endpoint="http://vastdb:9090", access_key="...", secret_key="...")
    >>> df = vast_daft.read_vastdb(config, "orders")
    >>> df.show()
    """
    from daft.daft import ScanOperatorHandle
    from daft.logical.builder import LogicalPlanBuilder

    resolved_bucket = bucket or config.bucket
    resolved_schema = schema or config.schema
    if resolved_bucket is None or resolved_schema is None:
        raise ValueError("bucket and schema must be provided either as arguments or via VastDBConfig")

    # Ask the live table for its PyArrow schema inside a short transaction.
    connection = VastDBConnection(config)
    with connection.session.transaction() as tx:
        arrow_schema = tx.bucket(resolved_bucket).schema(resolved_schema).table(table_name).columns()

    # Narrow the schema to the requested projection, in the caller's order.
    if columns is not None:
        arrow_schema = pa.schema([arrow_schema.field(name) for name in columns])

    # Leave num_splits out entirely unless the caller chose a value.
    split_options: dict[str, Any] = {} if num_splits is None else {"num_splits": num_splits}

    operator = VastDBScanOperator(
        config,
        table_name,
        arrow_schema,
        bucket=resolved_bucket,
        db_schema=resolved_schema,
        columns=columns,
        **split_options,
    )
    plan = LogicalPlanBuilder.from_tabular_scan(
        scan_operator=ScanOperatorHandle.from_python_scan_operator(operator),
    )
    return daft.DataFrame(plan)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# ---------------------------------------------------------------------------
|
|
108
|
+
# df.write_vastdb()
|
|
109
|
+
# ---------------------------------------------------------------------------
|
|
110
|
+
|
|
111
|
+
def _write_vastdb(
    self: daft.DataFrame,
    config: VastDBConfig,
    table_name: str,
    table_schema: pa.Schema,
    *,
    bucket: str | None = None,
    schema: str | None = None,
    create_if_missing: bool = True,
) -> daft.DataFrame:
    """Write this DataFrame to a VastDB table through a ``VastDBDataSink``.

    Attached to ``daft.DataFrame`` as ``write_vastdb`` when ``vast_daft`` is
    imported.

    Parameters
    ----------
    config:
        VastDB connection configuration.
    table_name:
        Name of the target VastDB table.
    table_schema:
        PyArrow schema of the target table.
    bucket:
        VastDB bucket, forwarded to the sink (``None`` leaves the choice to
        the sink).
    schema:
        VastDB schema, forwarded to the sink (``None`` leaves the choice to
        the sink).
    create_if_missing:
        Forwarded to the sink; asks it to create the table when it does not
        exist (default: ``True``).

    Returns
    -------
    daft.DataFrame
        Whatever ``self.write_sink(...)`` returns for the sink. This is
        expected to be a write summary with ``rows_written`` and
        ``bytes_written`` columns, but that is determined by the sink, not
        by this function.

    Examples
    --------
    >>> import daft, pyarrow as pa, vast_daft
    >>> from vast_daft import VastDBConfig
    >>> config = VastDBConfig(endpoint="http://vastdb:9090", access_key="...", secret_key="...")
    >>> schema = pa.schema([("id", pa.int64()), ("value", pa.float64())])
    >>> df = daft.from_pydict({"id": [1, 2], "value": [0.1, 0.2]})
    >>> df.write_vastdb(config, "my_table", schema)
    """
    return self.write_sink(
        VastDBDataSink(
            config,
            table_name,
            table_schema,
            bucket=bucket,
            schema=schema,
            create_if_missing=create_if_missing,
        )
    )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
# Attach the writer to daft.DataFrame so it can be called as df.write_vastdb(...).
daft.DataFrame.write_vastdb = _write_vastdb  # type: ignore[attr-defined]

# Public name for the reader — lets users call vast_daft.read_vastdb(...).
read_vastdb = _read_vastdb

__all__ = [
    # Core
    "VastDBConfig",
    "VastDBConnection",
    # Daft connectors
    "VastDBDataSource",
    "VastDBDataSink",
    "VastDBScanOperator",
    # Catalog / Table (Daft interfaces — use with daft.attach_catalog + daft.sql)
    "VastDBCatalog",
    "VastDBTable",
    # Convenience top-level functions (mirrors daft.read_parquet etc.)
    "read_vastdb",
]
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""Translate Daft ``Pushdowns`` to ibis predicates and projection columns.
|
|
2
|
+
|
|
3
|
+
Daft calls ``get_tasks(pushdowns)`` with a :class:`~daft.io.pushdowns.Pushdowns`
|
|
4
|
+
object that may carry:
|
|
5
|
+
|
|
6
|
+
* ``filters`` — a ``daft.expressions.Expression`` (the combined filter predicate)
|
|
7
|
+
* ``columns`` — a ``list[str]`` of column names to project
|
|
8
|
+
* ``limit`` — an ``int`` row limit
|
|
9
|
+
|
|
10
|
+
This module converts the ``filters`` expression into an ibis predicate that can
|
|
11
|
+
be passed to :func:`vastdb.table.Table.select_splits`, achieving genuine
|
|
12
|
+
server-side push-down for filter, column projection, and row limit.
|
|
13
|
+
|
|
14
|
+
Supported Daft expression nodes
|
|
15
|
+
---------------------------------
|
|
16
|
+
Binary comparisons: ``==``, ``!=``, ``>``, ``>=``, ``<``, ``<=``
|
|
17
|
+
Membership: ``col.is_in([...])``
|
|
18
|
+
Between: ``col.between(lo, hi)``
|
|
19
|
+
Null checks: ``col.is_null()``, ``col.not_null()``
|
|
20
|
+
Logical: ``expr & expr``, ``expr | expr``, ``~expr``
|
|
21
|
+
String: ``col.contains(s)``, ``col.startswith(s)``, ``col.endswith(s)``
|
|
22
|
+
|
|
23
|
+
Any unsupported node causes the entire filter to be returned un-translated
|
|
24
|
+
(``None``), leaving Daft to apply it client-side as a post-filter.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import logging
|
|
30
|
+
from typing import Any
|
|
31
|
+
|
|
32
|
+
import ibis
|
|
33
|
+
from daft.expressions import Expression, ExpressionVisitor
|
|
34
|
+
from daft.io.pushdowns import Pushdowns
|
|
35
|
+
from ibis.common.deferred import Deferred
|
|
36
|
+
from ibis.expr.types import Column as BooleanColumn
|
|
37
|
+
|
|
38
|
+
logger = logging.getLogger(__name__)

# What a successfully translated filter looks like: either an ibis column
# expression (``BooleanColumn`` is this module's import alias for
# ``ibis.expr.types.Column``) or a deferred expression built from ``ibis._``.
Predicate = BooleanColumn | Deferred

# Unique marker returned by the visitor for "cannot translate this node".
# Always compared by identity (``is``), and signals that the filter must be
# applied client-side instead.
_UNSUPPORTED = object()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
# Visitor
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class _DaftToIbisVisitor(ExpressionVisitor):
    """Translate a Daft filter expression tree into an ibis predicate.

    Each ``visit_*`` method returns either the translated ibis value (or a
    plain Python literal) or the module-level ``_UNSUPPORTED`` sentinel. The
    sentinel propagates upwards, so a single untranslatable node makes the
    whole translation ``_UNSUPPORTED`` without raising.

    Every column name seen by ``visit_col`` is collected in
    ``referenced_columns``.
    """

    # Daft binary-operator name -> function combining two translated operands.
    _BINARY_OPS = {
        "equal": lambda a, b: a == b,
        "not_equal": lambda a, b: a != b,
        "greater_than": lambda a, b: a > b,
        "greater_than_or_equal": lambda a, b: a >= b,
        "less_than": lambda a, b: a < b,
        "less_than_or_equal": lambda a, b: a <= b,
        "and": lambda a, b: a & b,
        "or": lambda a, b: a | b,
    }

    def __init__(self) -> None:
        super().__init__()
        self.referenced_columns: set[str] = set()

    # -- Leaves --------------------------------------------------------------

    def visit_col(self, name: str) -> Any:
        """Record the column name and return a deferred ibis reference to it."""
        self.referenced_columns.add(name)
        return ibis._[name]

    def visit_lit(self, value: Any) -> Any:
        """Return the literal unchanged; it is used directly as an operand."""
        return value

    # -- Structural nodes ----------------------------------------------------

    def visit_alias(self, expr: Expression, alias: str) -> Any:
        """Ignore the alias and translate the wrapped expression."""
        return self.visit(expr)

    def visit_cast(self, expr: Expression, dtype: Any) -> Any:
        """Casts are never pushed down."""
        return _UNSUPPORTED

    def visit_coalesce(self, *args: Expression) -> Any:
        """Coalesce is never pushed down (kept for newer Daft visitor APIs)."""
        return _UNSUPPORTED

    # -- Function dispatch ---------------------------------------------------

    def visit_function(self, name: str, args: list) -> Any:
        """Fallback for unrecognised function names."""
        return _UNSUPPORTED

    # Functions that can be translated are exposed below as
    # visit_<daft_function_name> methods; anything without such a method is
    # expected to reach visit_function() above via the base-class dispatch.

    # -- Shared helpers ------------------------------------------------------

    def _binop(self, args: list, op: str) -> Any:
        """Translate ``args[0]`` and ``args[1]`` and combine them with *op*.

        Both operands are visited before the unsupported check, so column
        references on either side are always recorded.
        """
        left, right = self.visit(args[0]), self.visit(args[1])
        if left is _UNSUPPORTED or right is _UNSUPPORTED:
            return _UNSUPPORTED
        return self._BINARY_OPS[op](left, right)

    def _unary(self, expr: Expression, apply: Any) -> Any:
        """Translate *expr* and pass the result to *apply*, propagating the sentinel."""
        operand = self.visit(expr)
        return _UNSUPPORTED if operand is _UNSUPPORTED else apply(operand)

    def _string_predicate(self, expr: Expression, arg: Expression, method: str) -> Any:
        """Translate both operands and call the ibis string method named *method*."""
        target = self.visit(expr)
        needle = self.visit(arg)
        if target is _UNSUPPORTED or needle is _UNSUPPORTED:
            return _UNSUPPORTED
        return getattr(target, method)(needle)

    # -- Comparison operators ------------------------------------------------

    def visit_equal(self, lhs: Expression, rhs: Expression) -> Any:
        return self._binop([lhs, rhs], "equal")

    def visit_not_equal(self, lhs: Expression, rhs: Expression) -> Any:
        return self._binop([lhs, rhs], "not_equal")

    def visit_greater_than(self, lhs: Expression, rhs: Expression) -> Any:
        return self._binop([lhs, rhs], "greater_than")

    def visit_greater_than_or_equal(self, lhs: Expression, rhs: Expression) -> Any:
        return self._binop([lhs, rhs], "greater_than_or_equal")

    def visit_less_than(self, lhs: Expression, rhs: Expression) -> Any:
        return self._binop([lhs, rhs], "less_than")

    def visit_less_than_or_equal(self, lhs: Expression, rhs: Expression) -> Any:
        return self._binop([lhs, rhs], "less_than_or_equal")

    # -- Logical operators ---------------------------------------------------

    def visit_and(self, lhs: Expression, rhs: Expression) -> Any:
        return self._binop([lhs, rhs], "and")

    def visit_or(self, lhs: Expression, rhs: Expression) -> Any:
        return self._binop([lhs, rhs], "or")

    def visit_not(self, expr: Expression) -> Any:
        return self._unary(expr, lambda operand: ~operand)

    # -- Null checks ---------------------------------------------------------

    def visit_is_null(self, expr: Expression) -> Any:
        return self._unary(expr, lambda operand: operand.isnull())

    def visit_not_null(self, expr: Expression) -> Any:
        return self._unary(expr, lambda operand: operand.notnull())

    # -- Membership: is_in(expr, [lit, lit, ...]) ------------------------------

    def visit_is_in(self, expr: Expression, values: list) -> Any:
        """Translate ``expr.is_in(values)``.

        *values* arrives as a list of not-yet-visited expressions. It is only
        visited once the tested expression itself has translated.
        """
        subject = self.visit(expr)
        if subject is _UNSUPPORTED:
            return _UNSUPPORTED
        members = [self.visit(item) for item in values]
        for member in members:
            if member is _UNSUPPORTED:
                return _UNSUPPORTED
        return subject.isin(members)

    # -- Between: between(expr, lo, hi) --------------------------------------

    def visit_between(self, expr: Expression, lo: Expression, hi: Expression) -> Any:
        """Translate ``expr.between(lo, hi)``; all three parts are visited first."""
        subject = self.visit(expr)
        lower = self.visit(lo)
        upper = self.visit(hi)
        if subject is _UNSUPPORTED or lower is _UNSUPPORTED or upper is _UNSUPPORTED:
            return _UNSUPPORTED
        return subject.between(lower, upper)

    # -- String functions ----------------------------------------------------

    def visit_starts_with(self, expr: Expression, prefix: Expression) -> Any:
        return self._string_predicate(expr, prefix, "startswith")

    def visit_ends_with(self, expr: Expression, suffix: Expression) -> Any:
        return self._string_predicate(expr, suffix, "endswith")

    def visit_contains(self, expr: Expression, pattern: Expression) -> Any:
        return self._string_predicate(expr, pattern, "contains")
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
# ---------------------------------------------------------------------------
|
|
204
|
+
# Public entry-point
|
|
205
|
+
# ---------------------------------------------------------------------------
|
|
206
|
+
|
|
207
|
+
def pushdowns_to_predicate(pushdowns: Pushdowns) -> Predicate | None:
    """Return only the ibis predicate for *pushdowns.filters*.

    Thin wrapper over :func:`pushdowns_to_predicate_and_columns` that drops
    the referenced-column set.

    The result is ``None`` when there is no filter or when the filter could
    not be translated. In that case the caller should not pass a predicate to
    VastDB, and the filter is left for Daft to apply client-side.
    """
    return pushdowns_to_predicate_and_columns(pushdowns)[0]
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def pushdowns_to_predicate_and_columns(
    pushdowns: Pushdowns,
) -> tuple[Predicate | None, set[str]]:
    """Translate *pushdowns.filters* and report which columns it touches.

    Callers that also push down a column projection need the column set:
    VastDB's ``select_splits`` requires every column used by the predicate to
    be listed in ``columns`` (the server raises ``FieldNotFound`` otherwise),
    so this set must be unioned with the projection before calling VastDB.

    Returns
    -------
    tuple[Predicate | None, set[str]]
        ``(predicate, referenced_columns)`` on success. ``(None, set())`` when
        there is no filter, when the visitor raises, or when the filter
        contains an unsupported node.
    """
    daft_filter = pushdowns.filters
    if daft_filter is None:
        return None, set()

    translator = _DaftToIbisVisitor()
    try:
        translated = translator.visit(daft_filter)
    except Exception as exc:
        # Deliberate best-effort: any translation failure falls back to
        # client-side filtering rather than failing the scan.
        logger.debug("Daft filter not pushed down to VastDB — visitor error: %s", exc)
        return None, set()

    if translated is not _UNSUPPORTED:
        return translated, translator.referenced_columns

    logger.debug(
        "Daft filter not pushed down to VastDB — unsupported expression: %s",
        daft_filter,
    )
    return None, set()
|