planar 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff reflects the changes between two publicly available package versions as released to a supported registry. It is provided for informational purposes only and shows the package content as it appears in that registry.
- planar/_version.py +1 -1
- planar/ai/agent.py +19 -3
- planar/ai/agent_base.py +1 -5
- planar/ai/agent_utils.py +0 -72
- planar/ai/models.py +30 -0
- planar/ai/pydantic_ai.py +12 -11
- planar/app.py +1 -7
- planar/config.py +2 -0
- planar/data/__init__.py +17 -0
- planar/data/config.py +49 -0
- planar/data/dataset.py +272 -0
- planar/data/exceptions.py +19 -0
- planar/data/test_dataset.py +354 -0
- planar/dependencies.py +30 -0
- planar/routers/agents_router.py +52 -4
- planar/routers/test_routes_security.py +3 -2
- planar/rules/__init__.py +12 -18
- planar/scaffold_templates/planar.dev.yaml.j2 +9 -0
- planar/scaffold_templates/planar.prod.yaml.j2 +14 -0
- planar/testing/workflow_observer.py +2 -2
- planar/workflows/notifications.py +39 -3
- {planar-0.8.0.dist-info → planar-0.9.0.dist-info}/METADATA +5 -1
- {planar-0.8.0.dist-info → planar-0.9.0.dist-info}/RECORD +25 -19
- {planar-0.8.0.dist-info → planar-0.9.0.dist-info}/WHEEL +0 -0
- {planar-0.8.0.dist-info → planar-0.9.0.dist-info}/entry_points.txt +0 -0
planar/data/dataset.py
ADDED
@@ -0,0 +1,272 @@
+"""PlanarDataset implementation for working with Ducklake tables."""
+
+import asyncio
+from typing import Literal, Self
+
+import ibis
+import polars as pl
+import pyarrow as pa
+from ibis.backends.duckdb import Backend as DuckDBBackend
+from ibis.common.exceptions import TableNotFound
+from pydantic import BaseModel
+
+from planar.config import PlanarConfig
+from planar.files.storage.config import LocalDirectoryConfig, S3Config
+from planar.logging import get_logger
+from planar.session import get_config
+
+from .exceptions import DataError, DatasetAlreadyExistsError, DatasetNotFoundError
+
+logger = get_logger(__name__)
+
+
+class PlanarDataset(BaseModel):
+    """Reference to a Ducklake table.
+
+    This class provides a simple interface for working with datasets in Ducklake,
+    handling creation, reading, writing, and deletion of tabular data.
+    """
+
+    # TODO: Add support for schema name (ie. namespace)
+    name: str  # Table name in Ducklake
+    # TODO: Add snapshot version: no version = latest, otherwise time travel on read operations
+    # TODO: Add partition support? A Dataset representation could be a table with a partition column
+
+    model_config = {"arbitrary_types_allowed": True}
+    # TODO: Add serialization metadata to make clear this is a dataset reference
+    # like EntityField.
+
+    @classmethod
+    async def create(cls, name: str, if_not_exists: bool = True) -> Self:
+        """Create a dataset reference.
+
+        Note: The actual table is created when data is first written to avoid
+        DuckDB's requirement that tables have at least one column.
+
+        Args:
+            name: Name of the dataset
+            if_not_exists: If True, don't raise error if dataset exists. default: True
+            catalog: Catalog name in Ducklake
+
+        Returns:
+            PlanarDataset instance
+
+        Raises:
+            DatasetAlreadyExistsError: If dataset exists and if_not_exists=False
+        """
+        dataset = cls(name=name)
+
+        # Check if dataset already exists
+        if await dataset.exists():
+            if not if_not_exists:
+                raise DatasetAlreadyExistsError(f"Dataset {name} already exists")
+            logger.debug("dataset already exists", dataset_name=name)
+        else:
+            logger.debug("dataset reference created", dataset_name=name)
+
+        return dataset
+
+    async def exists(self) -> bool:
+        """Check if the dataset exists in Ducklake."""
+        con = await self._get_connection()
+        try:
+            # TODO: Query for the table name directly
+            tables = await asyncio.to_thread(con.list_tables)
+            return self.name in tables
+        except Exception as e:
+            logger.warning("failed to check dataset existence", error=str(e))
+            return False
+
+    async def write(
+        self,
+        data: pl.DataFrame | ibis.Table | list | dict,
+        mode: Literal["overwrite", "append"] = "append",
+    ) -> None:
+        """Write data to the dataset.
+
+        Args:
+            data: Data to write (Polars DataFrame, PyArrow Table, or Ibis expression)
+            mode: Write mode - "append" or "overwrite"
+        """
+        con = await self._get_connection()
+        overwrite = mode == "overwrite"
+
+        try:
+            if not await self.exists():
+                await asyncio.to_thread(
+                    con.create_table, self.name, data, overwrite=overwrite
+                )
+            else:
+                # TODO: Explore if workflow context can be used to set metadata
+                # on the snapshot version for lineage
+                if isinstance(data, pl.DataFrame):
+                    await asyncio.to_thread(
+                        con.insert,
+                        self.name,
+                        ibis.memtable(data),
+                        overwrite=overwrite,
+                    )
+                else:
+                    await asyncio.to_thread(
+                        con.insert, self.name, data, overwrite=overwrite
+                    )
+
+            logger.debug(
+                "wrote data to dataset",
+                dataset_name=self.name,
+                mode=mode,
+            )
+        except Exception as e:
+            raise DataError(f"Failed to write data: {e}") from e
+
+    async def read(
+        self,
+        columns: list[str] | None = None,
+        limit: int | None = None,
+    ) -> ibis.Table:
+        """Read data as an Ibis table expression.
+
+        Args:
+            columns: Optional list of columns to select
+            limit: Optional row limit
+
+        Returns:
+            Ibis table expression that can be further filtered using Ibis methods
+        """
+        con = await self._get_connection()
+
+        try:
+            table = await asyncio.to_thread(con.table, self.name)
+
+            if columns:
+                table = table.select(columns)
+
+            if limit:
+                table = table.limit(limit)
+
+            return table
+        except TableNotFound as e:
+            raise DatasetNotFoundError(f"Dataset {self.name} not found") from e
+        except Exception as e:
+            raise DataError(f"Failed to read data: {e}") from e
+
+    async def to_polars(self) -> pl.DataFrame:
+        """Read entire dataset as Polars DataFrame."""
+        table = await self.read()
+        return await asyncio.to_thread(table.to_polars)
+
+    async def to_pyarrow(self) -> pa.Table:
+        """Read entire dataset as PyArrow Table."""
+        table = await self.read()
+        return await asyncio.to_thread(table.to_pyarrow)
+
+    async def delete(self) -> None:
+        """Delete the dataset."""
+        con = await self._get_connection()
+        try:
+            await asyncio.to_thread(con.drop_table, self.name, force=True)
+            logger.info("deleted dataset", dataset_name=self.name)
+        except Exception as e:
+            raise DataError(f"Failed to delete dataset: {e}") from e
+
+    async def _get_connection(self) -> DuckDBBackend:
+        """Get Ibis connection to Ducklake."""
+        config = get_config()
+
+        if not config.data:
+            raise DataError(
+                "Data configuration not found. Please configure 'data' in your planar.yaml"
+            )
+
+        # TODO: Add cached connection pooling or memoize the connection
+        return await self._create_connection(config)
+
+    async def _create_connection(self, config: PlanarConfig) -> DuckDBBackend:
+        """Create Ibis DuckDB connection with Ducklake."""
+        data_config = config.data
+        if not data_config:
+            raise DataError("Data configuration not found")
+
+        # Connect to DuckDB with Ducklake extension
+        con = await asyncio.to_thread(ibis.duckdb.connect, extensions=["ducklake"])
+
+        # Build Ducklake connection string based on catalog type
+        catalog_config = data_config.catalog
+
+        if catalog_config.type == "duckdb":
+            metadata_path = catalog_config.path
+        elif catalog_config.type == "postgres":
+            # Use connection components to build postgres connection string
+            pg = catalog_config
+            metadata_path = f"postgres:dbname={pg.db}"
+            if pg.host:
+                metadata_path += f" host={pg.host}"
+            if pg.port:
+                metadata_path += f" port={pg.port}"
+            if pg.user:
+                metadata_path += f" user={pg.user}"
+            if pg.password:
+                metadata_path += f" password={pg.password}"
+        elif catalog_config.type == "sqlite":
+            metadata_path = f"sqlite:{catalog_config.path}"
+        else:
+            raise ValueError(f"Unsupported catalog type: {catalog_config.type}")
+
+        # Build ATTACH statement
+        attach_sql = f"ATTACH 'ducklake:{metadata_path}'"
+
+        # Add data path from storage config
+        storage = data_config.storage
+        if isinstance(storage, LocalDirectoryConfig):
+            data_path = storage.directory
+        elif isinstance(storage, S3Config):
+            data_path = f"s3://{storage.bucket_name}/"
+        else:
+            # Generic fallback
+            data_path = getattr(storage, "path", None) or getattr(
+                storage, "directory", "."
+            )
+
+        attach_sql += f" (DATA_PATH '{data_path}')"
+
+        # Attach to Ducklake
+        try:
+            await asyncio.to_thread(con.raw_sql, attach_sql)
+        except Exception as e:
+            raise DataError(f"Failed to attach to Ducklake: {e}") from e
+
+        # Check available catalogs (what ibis calls schemas) and use the correct one
+        try:
+            catalogs = await asyncio.to_thread(con.list_catalogs)
+            logger.debug("available catalogs", catalogs=catalogs)
+
+            # Find the ducklake catalog (it might have a different name pattern)
+            ducklake_catalog = None
+            for cat in catalogs:
+                if "ducklake" in cat.lower() or cat == data_config.catalog_name:
+                    ducklake_catalog = cat
+                    break
+
+            if ducklake_catalog:
+                if not ducklake_catalog.replace("_", "").replace("-", "").isalnum():
+                    raise DataError(f"Invalid catalog name format: {ducklake_catalog}")
+                await asyncio.to_thread(con.raw_sql, f"USE {ducklake_catalog}")
+                logger.debug("using catalog", catalog=ducklake_catalog)
+            else:
+                catalog_name = data_config.catalog_name
+                if not catalog_name.replace("_", "").replace("-", "").isalnum():
+                    raise DataError(f"Invalid catalog name format: {catalog_name}")
+                await asyncio.to_thread(
+                    con.raw_sql,
+                    f"CREATE SCHEMA IF NOT EXISTS {catalog_name}",
+                )
+                await asyncio.to_thread(con.raw_sql, f"USE {catalog_name}")
+                logger.debug(
+                    "created and using catalog", catalog=data_config.catalog_name
+                )
+
+        except Exception as e:
+            logger.warning("failed to set catalog, using default", error=str(e))
+            # Continue without setting catalog - will use qualified names
+
+        return con
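Editor's note: the sketch below is not part of the diff; it shows how the PlanarDataset API above composes inside a workflow step, assuming a running Planar app whose configuration includes the new data section. The step name summarize_orders and the dataset name "orders" are illustrative, not names from the package.

import polars as pl

from planar.data import PlanarDataset
from planar.workflows import step


@step()
async def summarize_orders() -> float:
    # create() returns a reference; the underlying Ducklake table appears on first write.
    orders = await PlanarDataset.create("orders")
    await orders.write(
        pl.DataFrame({"sku": ["a", "b", "a"], "qty": [1, 2, 3]}),
        mode="append",
    )

    # read() hands back a lazy Ibis expression, so projections and filters
    # compose before anything is materialized.
    table = await orders.read(columns=["sku", "qty"])
    large = table.filter(table.qty > 1).to_polars()

    return float(large["qty"].sum())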
planar/data/exceptions.py
ADDED
@@ -0,0 +1,19 @@
+"""Exceptions for the Planar data module."""
+
+
+class DataError(Exception):
+    """Base exception for data-related errors."""
+
+    pass
+
+
+class DatasetNotFoundError(DataError):
+    """Raised when a dataset is not found."""
+
+    pass
+
+
+class DatasetAlreadyExistsError(DataError):
+    """Raised when trying to create a dataset that already exists."""
+
+    pass
planar/data/test_dataset.py
ADDED
@@ -0,0 +1,354 @@
+"""Tests for PlanarDataset."""
+
+import polars as pl
+import pyarrow as pa
+import pytest
+from ibis import literal
+
+from planar import PlanarApp
+from planar.data import PlanarDataset
+from planar.data.config import DataConfig, SQLiteCatalogConfig
+from planar.data.exceptions import (
+    DataError,
+    DatasetAlreadyExistsError,
+    DatasetNotFoundError,
+)
+from planar.files.storage.config import LocalDirectoryConfig
+from planar.workflows import step
+
+
+@pytest.fixture
+def data_config(tmp_path):
+    """Create a test data configuration."""
+    data_dir = tmp_path / "data"
+    data_dir.mkdir(exist_ok=True)
+
+    catalog_path = data_dir / "test.sqlite"
+    storage_path = data_dir / "ducklake_files"
+    storage_path.mkdir(exist_ok=True)
+
+    return DataConfig(
+        catalog=SQLiteCatalogConfig(type="sqlite", path=str(catalog_path)),
+        storage=LocalDirectoryConfig(backend="localdir", directory=str(storage_path)),
+    )
+
+
+@pytest.fixture(name="app")
+def app_fixture(data_config):
+    """Create a PlanarApp with data configuration."""
+    app = PlanarApp()
+    # Add data config to the app's config
+    app.config.data = data_config
+    return app
+
+
+@pytest.mark.asyncio
+async def test_dataset_create(client):
+    """Test creating a dataset reference."""
+    dataset = await PlanarDataset.create("test_table")
+    assert dataset.name == "test_table"
+
+    # Dataset reference exists but table isn't created until first write
+    assert not await dataset.exists()
+
+    # Write some data to actually create the table
+    df = pl.DataFrame({"id": [1], "name": ["test"]})
+    await dataset.write(df, mode="overwrite")
+
+    # Now it should exist
+    assert await dataset.exists()
+
+    # Cleanup
+    await dataset.delete()
+
+
+@pytest.mark.asyncio
+async def test_dataset_create_if_not_exists(client):
+    """Test creating a dataset with if_not_exists behavior."""
+    # Create dataset and write data to make it exist
+    dataset1 = await PlanarDataset.create("test_table")
+    df = pl.DataFrame({"id": [1], "name": ["test"]})
+    await dataset1.write(df, mode="overwrite")
+
+    # Create again with if_not_exists=True (default) - should not raise
+    dataset2 = await PlanarDataset.create("test_table", if_not_exists=True)
+    assert dataset2.name == dataset1.name
+
+    # Create again with if_not_exists=False - should raise
+    with pytest.raises(DatasetAlreadyExistsError):
+        await PlanarDataset.create("test_table", if_not_exists=False)
+
+    # Cleanup
+    await dataset1.delete()
+
+
+@pytest.mark.asyncio
+async def test_dataset_write_and_read_polars(client):
+    """Test writing and reading data with Polars."""
+    dataset = await PlanarDataset.create("test_polars")
+
+    # Create test data
+    df = pl.DataFrame(
+        {
+            "id": [1, 2, 3],
+            "name": ["Alice", "Bob", "Charlie"],
+            "amount": [100.5, 200.0, 150.75],
+        }
+    )
+
+    # Write data
+    await dataset.write(df, mode="overwrite")
+
+    # Read data back
+    result = await dataset.to_polars()
+
+    # Verify
+    assert result.shape == df.shape
+    assert set(result.columns) == set(df.columns)
+    assert result["id"].to_list() == [1, 2, 3]
+    assert result["name"].to_list() == ["Alice", "Bob", "Charlie"]
+
+    # Cleanup
+    await dataset.delete()
+
+
+@pytest.mark.asyncio
+async def test_dataset_write_and_read_pyarrow(client):
+    """Test writing and reading data with PyArrow."""
+    dataset = await PlanarDataset.create("test_pyarrow")
+
+    # Create test data
+    table = pa.table(
+        {
+            "id": [1, 2, 3],
+            "name": ["Alice", "Bob", "Charlie"],
+            "amount": [100.5, 200.0, 150.75],
+        }
+    )
+
+    # Write data
+    await dataset.write(table, mode="overwrite")
+
+    # Read data back
+    result = await dataset.to_pyarrow()
+
+    # Verify
+    assert result.num_rows == table.num_rows
+    assert result.column_names == table.column_names
+
+    # Cleanup
+    await dataset.delete()
+
+
+@pytest.mark.asyncio
+async def test_dataset_append_mode(client):
+    """Test appending data to a dataset."""
+    dataset = await PlanarDataset.create("test_append")
+
+    # Write initial data
+    df1 = pl.DataFrame({"id": [1, 2], "value": ["a", "b"]})
+    await dataset.write(df1, mode="overwrite")
+
+    # Append more data
+    df2 = pl.DataFrame({"id": [3, 4], "value": ["c", "d"]})
+    await dataset.write(df2, mode="append")
+
+    result = await dataset.to_polars()
+
+    # Verify
+    assert len(result) == 4
+    assert set(result["id"].to_list()) == {1, 2, 3, 4}
+    assert set(result["value"].to_list()) == {"a", "b", "c", "d"}
+
+    # Cleanup
+    await dataset.delete()
+
+
+@pytest.mark.asyncio
+async def test_dataset_overwrite_replaces_existing(client):
+    """Overwrite should replace existing rows completely."""
+    dataset = await PlanarDataset.create("test_overwrite")
+
+    df1 = pl.DataFrame({"id": [1, 2], "value": ["a", "b"]})
+    await dataset.write(df1, mode="overwrite")
+    result1 = await dataset.to_polars()
+    assert result1.shape == (2, 2)
+
+    df2 = pl.DataFrame({"id": [3], "value": ["c"]})
+    await dataset.write(df2, mode="overwrite")
+    result2 = await dataset.to_polars()
+    assert result2.shape == (1, 2)
+    assert result2["id"].to_list() == [3]
+    assert result2["value"].to_list() == ["c"]
+
+    await dataset.delete()
+
+
+@pytest.mark.asyncio
+async def test_dataset_read_with_filter(client):
+    """Test reading data with Ibis filtering."""
+    dataset = await PlanarDataset.create("test_filter")
+
+    # Write test data
+    df = pl.DataFrame({"id": range(1, 11), "value": range(10, 101, 10)})
+    await dataset.write(df, mode="overwrite")
+
+    table = await dataset.read()
+    filtered_table = table.filter(table.value > literal(50))
+    filtered_df = filtered_table.to_polars()
+
+    assert len(filtered_df) == 5
+    assert all(v > 50 for v in filtered_df["value"].to_list())
+
+    # Cleanup
+    await dataset.delete()
+
+
+@pytest.mark.asyncio
+async def test_dataset_read_with_columns_and_limit(client):
+    """Test reading specific columns with limit."""
+    dataset = await PlanarDataset.create("test_select")
+
+    # Write test data
+    df = pl.DataFrame(
+        {
+            "id": range(1, 11),
+            "name": [f"user_{i}" for i in range(1, 11)],
+            "value": range(10, 101, 10),
+        }
+    )
+    await dataset.write(df, mode="overwrite")
+
+    # Read specific columns with limit
+    table = await dataset.read(columns=["id", "name"], limit=5)
+    result_df = table.to_polars()
+
+    # Verify
+    assert len(result_df) == 5
+    assert set(result_df.columns) == {"id", "name"}
+    assert "value" not in result_df.columns
+
+    # Cleanup
+    await dataset.delete()
+
+
+@pytest.mark.asyncio
+async def test_dataset_not_found(client):
+    """Test reading from non-existent dataset."""
+    dataset = PlanarDataset(name="nonexistent")
+
+    # Check exists returns False
+    assert not await dataset.exists()
+
+    # Try to read - should raise
+    with pytest.raises(DatasetNotFoundError):
+        await dataset.read()
+
+
+@pytest.mark.asyncio
+async def test_dataset_delete(client):
+    """Test deleting a dataset."""
+    dataset = await PlanarDataset.create("test_delete")
+
+    # Write some data
+    df = pl.DataFrame({"id": [1, 2, 3]})
+    await dataset.write(df)
+
+    # Verify it exists
+    assert await dataset.exists()
+
+    # Delete it
+    await dataset.delete()
+
+    # Verify it's gone
+    assert not await dataset.exists()
+
+
+@pytest.mark.asyncio
+async def test_dataset_write_list_of_dicts(client):
+    """Write list-of-dicts input and read back with Polars."""
+    dataset = await PlanarDataset.create("test_list_of_dicts")
+
+    rows = [{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]
+    await dataset.write(rows, mode="overwrite")
+
+    result = await dataset.to_polars()
+    assert set(result.columns) == {"id", "name"}
+    assert sorted(result["id"].to_list()) == [1, 2]
+
+    await dataset.delete()
+
+
+@pytest.mark.asyncio
+async def test_dataset_write_dict_of_lists(client):
+    """Write dict-of-lists input and read back with Polars."""
+    dataset = await PlanarDataset.create("test_dict_of_lists")
+
+    data = {"id": [1, 2], "name": ["a", "b"]}
+    await dataset.write(data, mode="overwrite")
+
+    result = await dataset.to_polars()
+    assert result.shape == (2, 2)
+    assert set(result["name"].to_list()) == {"a", "b"}
+
+    await dataset.delete()
+
+
+@pytest.mark.asyncio
+async def test_dataset_workflow_serialization(client):
+    """Test that PlanarDataset can be used as workflow input/output."""
+
+    @step()
+    async def create_data() -> PlanarDataset:
+        """Create a dataset with sample data."""
+        dataset = await PlanarDataset.create("workflow_data")
+
+        df = pl.DataFrame(
+            {"product": ["A", "B", "C", "D"], "sales": [100, 200, 150, 300]}
+        )
+        await dataset.write(df, mode="overwrite")
+
+        return dataset
+
+    @step()
+    async def analyze_data(dataset: PlanarDataset) -> float:
+        """Analyze the dataset and return total sales."""
+        df = await dataset.to_polars()
+        return float(df["sales"].sum())
+
+    # Test basic workflow functionality without API
+    dataset = await create_data()
+    total = await analyze_data(dataset)
+
+    # Verify results
+    assert total == 750.0  # Sum of [100, 200, 150, 300]
+
+    # Cleanup
+    await dataset.delete()
+
+
+@pytest.mark.asyncio
+async def test_no_data_config_error(client):
+    """Test error when data config is not set."""
+    # Remove data config
+    client.app.config.data = None
+
+    dataset = PlanarDataset(name="test")
+
+    with pytest.raises(DataError, match="Data configuration not found"):
+        await dataset._get_connection()
+
+
+@pytest.mark.asyncio
+async def test_write_with_invalid_input_raises(client):
+    """Unknown input types to write() should raise a DataError."""
+
+    class Foo:
+        pass
+
+    dataset = await PlanarDataset.create("test_invalid_input")
+
+    with pytest.raises(DataError):
+        await dataset.write(Foo(), mode="overwrite")  # type: ignore
+
+    await dataset.delete()
planar/dependencies.py
ADDED
@@ -0,0 +1,30 @@
+from __future__ import annotations
+
+import sys
+from importlib import import_module
+from typing import Mapping, Tuple
+
+
+# mapping: public_name -> (relative_submodule, attribute_in_submodule)
+# This is a PEP 562 compliant way to lazily import modules
+# which is a way to avoid circular dependencies in __init__.py.
+def lazy_exports(module_name: str, mapping: Mapping[str, Tuple[str, str]]) -> None:
+    mod = sys.modules[module_name]
+    mod.__all__ = list(mapping.keys())  # type: ignore
+
+    def __getattr__(name: str):
+        try:
+            submod, attr = mapping[name]
+        except KeyError:
+            raise AttributeError(
+                f"module {module_name!r} has no attribute {name!r}"
+            ) from None
+        obj = getattr(import_module(submod, module_name), attr)
+        setattr(mod, name, obj)  # cache
+        return obj
+
+    def __dir__():
+        return sorted(set(mod.__dict__.keys()) | set(mod.__all__))
+
+    mod.__getattr__ = __getattr__  # PEP 562
+    mod.__dir__ = __dir__
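Editor's note: a sketch (not part of this diff) of how a package __init__.py might register lazy exports with the helper above. The actual planar/data/__init__.py added in this release (+17 lines) is not shown in this section, so the mapping below is an assumed example of the pattern, not its real contents.

# contents of a hypothetical package __init__.py, e.g. planar/data/__init__.py
from planar.dependencies import lazy_exports

lazy_exports(
    __name__,
    {
        # public attribute on the package -> (relative submodule, attribute there)
        "PlanarDataset": (".dataset", "PlanarDataset"),
    },
)

With a mapping like this in place, a statement such as "from planar.data import PlanarDataset" would resolve through the module-level __getattr__ installed above: the submodule is imported and the attribute cached on the package only on first access, which is how the helper avoids circular imports at package-import time.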