patito 0.5.1__py3-none-any.whl → 0.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- patito/__init__.py +4 -23
- patito/_docs.py +1 -0
- patito/_pydantic/__init__.py +0 -0
- patito/_pydantic/column_info.py +94 -0
- patito/_pydantic/dtypes/__init__.py +25 -0
- patito/_pydantic/dtypes/dtypes.py +249 -0
- patito/_pydantic/dtypes/utils.py +227 -0
- patito/_pydantic/repr.py +139 -0
- patito/_pydantic/schema.py +96 -0
- patito/exceptions.py +174 -7
- patito/polars.py +310 -102
- patito/pydantic.py +361 -511
- patito/validators.py +229 -96
- {patito-0.5.1.dist-info → patito-0.6.2.dist-info}/METADATA +12 -26
- patito-0.6.2.dist-info/RECORD +17 -0
- patito/database.py +0 -658
- patito/duckdb.py +0 -2793
- patito/sql.py +0 -88
- patito/xdg.py +0 -22
- patito-0.5.1.dist-info/RECORD +0 -14
- {patito-0.5.1.dist-info → patito-0.6.2.dist-info}/LICENSE +0 -0
- {patito-0.5.1.dist-info → patito-0.6.2.dist-info}/WHEEL +0 -0
patito/duckdb.py
DELETED
|
@@ -1,2793 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Module which wraps around the duckdb module in an opinionated manner.
|
|
3
|
-
"""
|
|
4
|
-
from __future__ import annotations
|
|
5
|
-
|
|
6
|
-
import hashlib
|
|
7
|
-
from collections.abc import Collection, Iterable, Iterator
|
|
8
|
-
from pathlib import Path
|
|
9
|
-
from typing import (
|
|
10
|
-
TYPE_CHECKING,
|
|
11
|
-
Any,
|
|
12
|
-
Dict,
|
|
13
|
-
Generic,
|
|
14
|
-
List,
|
|
15
|
-
Optional,
|
|
16
|
-
Set,
|
|
17
|
-
Tuple,
|
|
18
|
-
Type,
|
|
19
|
-
TypeVar,
|
|
20
|
-
Union,
|
|
21
|
-
cast,
|
|
22
|
-
)
|
|
23
|
-
|
|
24
|
-
import numpy as np
|
|
25
|
-
import polars as pl
|
|
26
|
-
import pyarrow as pa # type: ignore[import]
|
|
27
|
-
from pydantic import create_model
|
|
28
|
-
from typing_extensions import Literal
|
|
29
|
-
|
|
30
|
-
from patito import sql
|
|
31
|
-
from patito.exceptions import MultipleRowsReturned, RowDoesNotExist
|
|
32
|
-
from patito.polars import DataFrame
|
|
33
|
-
from patito.pydantic import Model, ModelType
|
|
34
|
-
|
|
35
|
-
try:
|
|
36
|
-
import pandas as pd
|
|
37
|
-
|
|
38
|
-
_PANDAS_AVAILABLE = True
|
|
39
|
-
except ImportError:
|
|
40
|
-
_PANDAS_AVAILABLE = False
|
|
41
|
-
|
|
42
|
-
if TYPE_CHECKING:
|
|
43
|
-
import duckdb
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
# Types which can be used to instantiate a DuckDB Relation object
|
|
47
|
-
RelationSource = Union[
|
|
48
|
-
DataFrame,
|
|
49
|
-
pl.DataFrame,
|
|
50
|
-
"pd.DataFrame",
|
|
51
|
-
Path,
|
|
52
|
-
str,
|
|
53
|
-
"duckdb.DuckDBPyRelation",
|
|
54
|
-
"Relation",
|
|
55
|
-
]
|
|
56
|
-
|
|
57
|
-
# Used to refer to type(self) in Relation methods which preserve the type.
|
|
58
|
-
# Hard-coding Relation or Relation[ModelType] does not work for subclasses
|
|
59
|
-
# that return type(self) since that will refer to the parent class.
|
|
60
|
-
# See relevant SO answer: https://stackoverflow.com/a/63178532
|
|
61
|
-
RelationType = TypeVar("RelationType", bound="Relation")
|
|
62
|
-
|
|
63
|
-
# The SQL types supported by DuckDB
|
|
64
|
-
# See: https://duckdb.org/docs/sql/data_types/overview
|
|
65
|
-
# fmt: off
|
|
66
|
-
DuckDBSQLType = Literal[
|
|
67
|
-
"BIGINT", "INT8", "LONG",
|
|
68
|
-
"BLOB", "BYTEA", "BINARY", "VARBINARY",
|
|
69
|
-
"BOOLEAN", "BOOL", "LOGICAL",
|
|
70
|
-
"DATE",
|
|
71
|
-
"DOUBLE", "FLOAT8", "NUMERIC", "DECIMAL",
|
|
72
|
-
"HUGEINT",
|
|
73
|
-
"INTEGER", "INT4", "INT", "SIGNED",
|
|
74
|
-
"INTERVAL",
|
|
75
|
-
"REAL", "FLOAT4", "FLOAT",
|
|
76
|
-
"SMALLINT", "INT2", "SHORT",
|
|
77
|
-
"TIME",
|
|
78
|
-
"TIMESTAMP", "DATETIME",
|
|
79
|
-
"TIMESTAMP WITH TIMEZONE", "TIMESTAMPTZ",
|
|
80
|
-
"TINYINT", "INT1",
|
|
81
|
-
"UBIGINT",
|
|
82
|
-
"UINTEGER",
|
|
83
|
-
"USMALLINT",
|
|
84
|
-
"UTINYINT",
|
|
85
|
-
"UUID",
|
|
86
|
-
"VARCHAR", "CHAR", "BPCHAR", "TEXT", "STRING",
|
|
87
|
-
]
|
|
88
|
-
# fmt: on
|
|
89
|
-
|
|
90
|
-
# Used for backward-compatible patches
|
|
91
|
-
POLARS_VERSION: Optional[Tuple[int, int, int]]
|
|
92
|
-
try:
|
|
93
|
-
POLARS_VERSION = cast(
|
|
94
|
-
Tuple[int, int, int],
|
|
95
|
-
tuple(map(int, pl.__version__.split("."))),
|
|
96
|
-
)
|
|
97
|
-
except ValueError: # pragma: no cover
|
|
98
|
-
POLARS_VERSION = None
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
def create_pydantic_model(relation: "duckdb.DuckDBPyRelation") -> Type[Model]:
    """
    Build a dynamic ``Model`` subclass mirroring the columns of a DuckDB relation.

    Args:
        relation: Native DuckDB relation whose ``columns`` and ``alias`` are read.

    Returns:
        A model class named after ``relation.alias`` with one required field,
        annotated as ``Any``, for every column of the relation.
    """
    # ``(Any, ...)`` declares a required field which accepts any value.
    field_definitions = dict.fromkeys(relation.columns, (Any, ...))
    dynamic_model = create_model(  # type: ignore
        relation.alias,
        __base__=Model,
        **field_definitions,  # pyright: ignore
    )
    return dynamic_model
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
def _enum_type_name(field_properties: dict) -> str:
    """
    Derive the DuckDB SQL enum type name for a field from its enum values.

    The values are sorted before being hashed, so the same set of enum values
    always yields the same name, regardless of the order they are listed in.

    Args:
        field_properties: Field property dictionary; only the ``"enum"`` entry
            is read.

    Returns:
        The string ``"enum__"`` followed by the hexadecimal MD5 digest of the
        comma-separated ``repr()`` of the sorted enum values.
    """
    ordered_values = sorted(field_properties["enum"])
    canonical = ", ".join(map(repr, ordered_values))
    digest = hashlib.md5(canonical.encode("utf-8"))  # noqa: #S303
    return "enum__" + digest.hexdigest()
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
def _is_missing_enum_type_exception(exception: BaseException) -> bool:
    """
    Tell whether an exception might stem from a missing SQL enum type definition.

    The decision is based solely on how the string form of the exception starts.

    Args:
        exception: Exception raised by DuckDB.

    Returns:
        True if the exception message starts with one of the prefixes associated
        with a missing enum type definition, otherwise False.
    """
    known_prefixes = (
        # DuckDB version <= 0.3.4
        "Not implemented Error: DataType",
        # DuckDB version >= 0.4.0
        "Catalog Error: Type with name enum_",
    )
    return str(exception).startswith(known_prefixes)
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
class Relation(Generic[ModelType]):
|
|
141
|
-
# The database connection which the given relation belongs to
|
|
142
|
-
database: Database
|
|
143
|
-
|
|
144
|
-
# The underlying DuckDB relation object which this class wraps around
|
|
145
|
-
_relation: duckdb.DuckDBPyRelation
|
|
146
|
-
|
|
147
|
-
# Can be set by subclasses in order to specify the serialization class for rows.
|
|
148
|
-
# Must accept column names as keyword arguments.
|
|
149
|
-
model: Optional[Type[ModelType]] = None
|
|
150
|
-
|
|
151
|
-
# The alias that can be used to refer to the relation in queries
|
|
152
|
-
alias: str
|
|
153
|
-
|
|
154
|
-
def __init__(  # noqa: C901
    self,
    derived_from: RelationSource,
    database: Optional[Database] = None,
    model: Optional[Type[ModelType]] = None,
) -> None:
    """
    Create a new relation object containing data to be queried with DuckDB.

    Args:
        derived_from: Data to be represented as a DuckDB relation object.
            Can be one of the following types:

            - A pandas or polars DataFrame.
            - An SQL query represented as a string.
            - A ``Path`` object pointing to a CSV or a parquet file.
              The path must have either a ``.csv`` or ``.parquet`` file
              extension (compared case-insensitively).
            - A native DuckDB relation object (``duckdb.DuckDBPyRelation``).
            - A ``patito.duckdb.Relation`` object.

        database: Which database to load the relation into. If not provided,
            the default DuckDB database will be used.

        model: Sub-class of ``patito.Model`` which specifies how to deserialize rows
            when fetched with methods such as
            :ref:`Relation.get()<duckdb.Relation.get>` and ``__iter__()``.

            Will also be used to create a strict table schema if
            :ref:`Relation.create_table()<duckdb.Relation.create_table>`
            is invoked.

            If not provided, a dynamic model fitting the relation schema will be
            created when required.

            Can also be set later dynamically by invoking
            :ref:`Relation.set_model()<duckdb.Relation.set_model>`.

    Raises:
        ValueError: If any one of the following cases are encountered:

            - If a provided ``Path`` object does not have a ``.csv`` or
              ``.parquet`` file extension.
            - If a database and relation object is provided, but the relation object
              does not belong to the database.

        TypeError: If the type of ``derived_from`` is not supported.

    Examples:
        Instantiated from a dataframe:

        >>> import patito as pt
        >>> df = pt.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        >>> pt.duckdb.Relation(df).filter("a > 2").to_df()
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 3   ┆ 6   │
        └─────┴─────┘

        Instantiated from an SQL query:

        >>> pt.duckdb.Relation("select 1 as a, 2 as b").to_df()
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 2   │
        └─────┴─────┘
    """
    # Imported lazily so that the module can be imported without duckdb.
    import duckdb

    if isinstance(derived_from, Relation):
        if (
            database is not None
            and derived_from.database.connection is not database.connection
        ):
            raise ValueError(
                "Relations can't be casted between database connections."
            )
        # NOTE: the ``model`` argument is not consulted in this branch; the
        # model of the wrapped relation is carried over unchanged.
        self.database = derived_from.database
        self._relation = derived_from._relation
        self.model = derived_from.model
        return

    if database is None:
        self.database = Database.default()
    else:
        self.database = database

    if isinstance(derived_from, duckdb.DuckDBPyRelation):
        relation = derived_from
    elif isinstance(derived_from, str):
        relation = self.database.connection.from_query(derived_from)
    elif _PANDAS_AVAILABLE and isinstance(derived_from, pd.DataFrame):
        # We must replace pd.NA with np.nan in order for it to be considered
        # as null by DuckDB. Otherwise it will be cast to the string <NA>
        # or even segfault.
        derived_from = derived_from.fillna(np.nan)
        relation = self.database.connection.from_df(derived_from)
    elif isinstance(derived_from, pl.DataFrame):
        relation = self.database.connection.from_arrow(derived_from.to_arrow())
    elif isinstance(derived_from, Path):
        suffix = derived_from.suffix.lower()
        if suffix == ".parquet":
            relation = self.database.connection.from_parquet(str(derived_from))
        elif suffix == ".csv":
            relation = self.database.connection.from_csv_auto(str(derived_from))
        else:
            raise ValueError(
                f"Unsupported file suffix {derived_from.suffix!r} for data import!"
            )
    else:
        # Name the offending type so that the error is actionable.
        raise TypeError(  # pragma: no cover
            f"Unsupported type {type(derived_from).__name__!r} for derived_from!"
        )

    self._relation = relation
    if model is not None:
        self.model = model  # pyright: ignore
|
|
276
|
-
|
|
277
|
-
def aggregate(
    self,
    *aggregations: str,
    group_by: Union[str, Iterable[str]],
    **named_aggregations: str,
) -> Relation:
    """
    Return relation formed by ``GROUP BY`` SQL aggregation(s).

    Args:
        aggregations: Zero or more aggregation expressions such as
            "sum(column_name)" and "count(distinct column_name)".
        named_aggregations: Zero or more aggregated expressions where the keyword
            names the resulting column. For example,
            ``my_column="sum(column_name)"`` is inserted as
            ``"sum(column_name) as my_column"`` in the executed SQL query.
        group_by: A single column name or an iterable of column names to group
            by. Multiple names are joined with ``", "``.

    Returns:
        A new relation wrapping the aggregated DuckDB relation.

    Examples:
        >>> import patito as pt
        >>> df = pt.DataFrame({"a": [1, 2, 3], "b": ["X", "Y", "X"]})
        >>> relation = pt.duckdb.Relation(df)
        >>> relation.aggregate(
        ...     "b",
        ...     "sum(a)",
        ...     "greatest(b)",
        ...     max_a="max(a)",
        ...     group_by="b",
        ... ).to_df()
        shape: (2, 4)
        ┌─────┬────────┬─────────────┬───────┐
        │ b   ┆ sum(a) ┆ greatest(b) ┆ max_a │
        │ --- ┆ ---    ┆ ---         ┆ ---   │
        │ str ┆ f64    ┆ str         ┆ i64   │
        ╞═════╪════════╪═════════════╪═══════╡
        │ X   ┆ 4.0    ┆ X           ┆ 3     │
        │ Y   ┆ 2.0    ┆ Y           ┆ 2     │
        └─────┴────────┴─────────────┴───────┘
    """
    # Named aggregations follow the positional ones as "<expression> as <name>".
    aliased = [
        f"{sql_expression} as {alias}"
        for alias, sql_expression in named_aggregations.items()
    ]
    select_clause = ", ".join([*aggregations, *aliased])

    if isinstance(group_by, str):
        group_clause = group_by
    else:
        group_clause = ", ".join(group_by)

    grouped = self._relation.aggregate(
        aggr_expr=select_clause,
        group_expr=group_clause,
    )
    return self._wrap(relation=grouped, schema_change=True)
|
|
329
|
-
|
|
330
|
-
def add_suffix(
    self,
    suffix: str,
    include: Optional[Collection[str]] = None,
    exclude: Optional[Collection[str]] = None,
) -> Relation:
    """
    Append a suffix to the column names of the relation.

    Args:
        suffix: The string to append to the column names.
        include: If provided, only the given columns will be renamed.
        exclude: If provided, the given columns will `not` be renamed.

    Returns:
        The result of selecting every column, where each renamed column is
        projected as ``"<column> as <column><suffix>"``.

    Raises:
        TypeError: If both include `and` exclude are provided at the same time.

    Examples:
        >>> import patito as pt
        >>> relation = pt.duckdb.Relation("select 1 as column_1, 2 as column_2")
        >>> relation.add_suffix("_renamed").to_df()
        shape: (1, 2)
        ┌──────────────────┬──────────────────┐
        │ column_1_renamed ┆ column_2_renamed │
        │ ---              ┆ ---              │
        │ i64              ┆ i64              │
        ╞══════════════════╪══════════════════╡
        │ 1                ┆ 2                │
        └──────────────────┴──────────────────┘

        >>> relation.add_suffix("_renamed", include=["column_1"]).columns
        ['column_1_renamed', 'column_2']

        >>> relation.add_suffix("_renamed", exclude=["column_1"]).columns
        ['column_1', 'column_2_renamed']
    """
    if include is not None and exclude is not None:
        raise TypeError("Both include and exclude provided at the same time!")

    projections = []
    for column in self.columns:
        # Without any filter, every column is renamed.
        if include is not None:
            rename = column in include
        elif exclude is not None:
            rename = column not in exclude
        else:
            rename = True
        projections.append(f"{column} as {column}{suffix}" if rename else column)
    return self.select(", ".join(projections))
|
|
395
|
-
|
|
396
|
-
def add_prefix(
    self,
    prefix: str,
    include: Optional[Iterable[str]] = None,
    exclude: Optional[Iterable[str]] = None,
) -> Relation:
    """
    Prepend a prefix to the column names of the relation.

    Args:
        prefix: The string to prepend to the column names.
        include: If provided, only the given columns will be renamed. Any
            iterable is accepted, including one-shot iterators and generators.
        exclude: If provided, the given columns will `not` be renamed. Any
            iterable is accepted, including one-shot iterators and generators.

    Returns:
        The result of selecting every column, where each renamed column is
        projected as ``"<column> as <prefix><column>"``.

    Raises:
        TypeError: If both include `and` exclude are provided at the same time.

    Examples:
        >>> import patito as pt
        >>> relation = pt.duckdb.Relation("select 1 as column_1, 2 as column_2")
        >>> relation.add_prefix("renamed_").to_df()
        shape: (1, 2)
        ┌──────────────────┬──────────────────┐
        │ renamed_column_1 ┆ renamed_column_2 │
        │ ---              ┆ ---              │
        │ i64              ┆ i64              │
        ╞══════════════════╪══════════════════╡
        │ 1                ┆ 2                │
        └──────────────────┴──────────────────┘

        >>> relation.add_prefix("renamed_", include=["column_1"]).columns
        ['renamed_column_1', 'column_2']

        >>> relation.add_prefix("renamed_", exclude=["column_1"]).columns
        ['column_1', 'renamed_column_2']
    """
    if include is not None and exclude is not None:
        raise TypeError("Both include and exclude provided at the same time!")

    # A membership test against a one-shot iterator consumes it, which would
    # give wrong answers for every column after the first. Such iterables are
    # therefore materialized once; real collections are used as they are.
    if include is not None and not isinstance(include, Collection):
        include = tuple(include)
    if exclude is not None and not isinstance(exclude, Collection):
        exclude = tuple(exclude)

    projections = []
    for column in self.columns:
        # Without any filter, every column is renamed.
        if include is not None:
            rename = column in include
        elif exclude is not None:
            rename = column not in exclude
        else:
            rename = True
        projections.append(f"{column} as {prefix}{column}" if rename else column)
    return self.select(", ".join(projections))
|
|
461
|
-
|
|
462
|
-
def all(self, *filters: str, **equalities: Union[int, float, str]) -> bool:
    """
    Return ``True`` if the given predicate(s) are true for all rows in the relation.

    The number of rows remaining after filtering is compared with the total
    number of rows. See :func:`Relation.filter()` for additional information
    regarding the parameters.

    Args:
        filters: SQL predicates to satisfy.
        equalities: SQL equality predicates to satisfy.

    Returns:
        ``True`` if the filtered row count equals the total row count.

    Examples:
        >>> import patito as pt
        >>> df = pt.DataFrame(
        ...     {
        ...         "even_number": [2, 4, 6],
        ...         "odd_number": [1, 3, 5],
        ...         "zero": [0, 0, 0],
        ...     }
        ... )
        >>> relation = pt.duckdb.Relation(df)
        >>> relation.all(zero=0)
        True
        >>> relation.all(
        ...     "even_number % 2 = 0",
        ...     "odd_number % 2 = 1",
        ...     zero=0,
        ... )
        True
        >>> relation.all(zero=1)
        False
        >>> relation.all("odd_number % 2 = 0")
        False
    """
    matching_rows = self.filter(*filters, **equalities).count()
    total_rows = self.count()
    return matching_rows == total_rows
|
|
497
|
-
|
|
498
|
-
def case(
    self,
    *,
    from_column: str,
    to_column: str,
    mapping: Dict[sql.SQLLiteral, sql.SQLLiteral],
    default: sql.SQLLiteral,
) -> Relation:
    """
    Map values of one column over to a new column.

    Args:
        from_column: Name of column defining the domain of the mapping.
        to_column: Name of column to insert the mapped values into.
        mapping: Dictionary defining the mapping. The dictionary keys represent the
            input values, while the dictionary values represent the output values.
            Items are inserted into the SQL case statement by their repr() string
            value.
        default: Default output value for inputs which have no provided mapping.

    Returns:
        A new relation with all existing columns plus the mapped column.

    Examples:
        The following case statement...

        >>> import patito as pt
        >>> db = pt.duckdb.Database()
        >>> relation = db.to_relation("select 1 as a union select 2 as a")
        >>> relation.case(
        ...     from_column="a",
        ...     to_column="b",
        ...     mapping={1: "one", 2: "two"},
        ...     default="three",
        ... ).order(by="a").to_df()
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ str │
        ╞═════╪═════╡
        │ 1   ┆ one │
        │ 2   ┆ two │
        └─────┴─────┘

        ... is equivalent with:

        >>> case_statement = pt.sql.Case(
        ...     on_column="a",
        ...     mapping={1: "one", 2: "two"},
        ...     default="three",
        ...     as_column="b",
        ... )
        >>> relation.select(f"*, {case_statement}").order(by="a").to_df()
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ str │
        ╞═════╪═════╡
        │ 1   ┆ one │
        │ 2   ┆ two │
        └─────┴─────┘
    """
    statement = sql.Case(
        on_column=from_column,
        mapping=mapping,
        default=default,
        as_column=to_column,
    )
    # Keep every existing column and append the case expression.
    projection = f"*, {statement}"
    projected = self._relation.project(projection)
    return self._wrap(relation=projected, schema_change=True)
|
|
568
|
-
|
|
569
|
-
def cast(
    self: RelationType,
    model: Optional[ModelType] = None,
    strict: bool = False,
    include: Optional[Collection[str]] = None,
    exclude: Optional[Collection[str]] = None,
) -> RelationType:
    """
    Cast the columns of the relation to types compatible with the associated model.

    The associated model must either be set by invoking
    :ref:`Relation.set_model() <duckdb.Relation.set_model>` or provided with the
    ``model`` parameter.

    Any columns of the relation that are not part of the given model schema will be
    left as-is.

    Args:
        model: The model to cast according to. Required if
            :ref:`Relation.set_model() <duckdb.Relation.set_model>` has not
            been invoked; takes precedence over the already set model otherwise.
        strict: If set to ``False``, columns which are technically compliant with
            the specified field type, will not be casted. For example, a column
            annotated with ``int`` is technically compliant with ``SMALLINT``, even
            if ``INTEGER`` is the default SQL type associated with ``int``-annotated
            fields. If ``strict`` is set to ``True``, the resulting dtypes will
            be forced to the default dtype associated with each python type.
        include: If provided, only the given columns will be casted.
        exclude: If provided, the given columns will `not` be casted.

    Returns:
        New relation where the columns have been casted according to the model
        schema.

    Raises:
        TypeError: If no model has been set and none is provided.
        ValueError: If both ``include`` and ``exclude`` are provided.

    Examples:
        >>> import patito as pt
        >>> class Schema(pt.Model):
        ...     float_column: float
        ...
        >>> relation = pt.duckdb.Relation("select 1 as float_column")
        >>> relation.types["float_column"]
        INTEGER
        >>> relation.cast(model=Schema).types["float_column"]
        DOUBLE

        >>> relation = pt.duckdb.Relation("select 1::FLOAT as float_column")
        >>> relation.cast(model=Schema).types["float_column"]
        FLOAT
        >>> relation.cast(model=Schema, strict=True).types["float_column"]
        DOUBLE

        >>> class Schema(pt.Model):
        ...     column_1: float
        ...     column_2: float
        ...
        >>> relation = pt.duckdb.Relation(
        ...     "select 1 as column_1, 2 as column_2"
        ... ).set_model(Schema)
        >>> relation.types
        {'column_1': INTEGER, 'column_2': INTEGER}
        >>> relation.cast(include=["column_1"]).types
        {'column_1': DOUBLE, 'column_2': INTEGER}
        >>> relation.cast(exclude=["column_1"]).types
        {'column_1': INTEGER, 'column_2': DOUBLE}
    """
    if model is None and self.model is None:
        class_name = self.__class__.__name__
        raise TypeError(
            f"{class_name}.cast() invoked without "
            f"{class_name}.model having been set! "
            f"You should invoke {class_name}.set_model() first "
            "or explicitly provide a model to .cast()."
        )

    if model is not None:
        # An explicitly provided model takes precedence over the stored one.
        relation = self.set_model(model)
        schema = model
    else:
        relation = self
        schema = cast(ModelType, self.model)

    if include is not None and exclude is not None:
        raise ValueError(
            "Both include and exclude provided to "
            f"{self.__class__.__name__}.cast()!"
        )

    # Resolve the set of columns that are candidates for casting.
    if include is not None:
        castable = set(include)
    elif exclude is not None:
        castable = set(relation.columns) - set(exclude)
    else:
        castable = set(relation.columns)

    projections = []
    for column, current_type in relation.types.items():
        needs_cast = (
            column in schema.columns
            and column in castable
            and (strict or current_type not in schema.valid_sql_types[column])
        )
        if needs_cast:
            target_type = schema.sql_types[column]
            projections.append(f"{column}::{target_type} as {column}")
        else:
            projections.append(column)
    return cast(RelationType, self.select(*projections))
|
|
672
|
-
|
|
673
|
-
def coalesce(
    self: RelationType,
    **column_expressions: Union[str, int, float],
) -> RelationType:
    """
    Replace null-values in given columns with respective values.

    For example, ``coalesce(column_name=value)`` is compiled to:
    ``f"coalesce({column_name}, {repr(value)}) as column_name"`` in the resulting
    SQL. Columns of the relation which are not named are selected unchanged.

    Args:
        column_expressions: Keywords indicate which columns to coalesce, while the
            ``repr()`` of the respective arguments are used as the
            null-replacement.

    Returns:
        Relation where values have been filled in for nulls in the given columns.

    Examples:
        >>> import patito as pt
        >>> df = pt.DataFrame(
        ...     {
        ...         "a": [1, None, 3],
        ...         "b": ["four", "five", None],
        ...         "c": [None, 8.0, 9.0],
        ...     }
        ... )
        >>> relation = pt.duckdb.Relation(df)
        >>> relation.coalesce(a=2, b="six").to_df()
        shape: (3, 3)
        ┌─────┬──────┬──────┐
        │ a   ┆ b    ┆ c    │
        │ --- ┆ ---  ┆ ---  │
        │ i64 ┆ str  ┆ f64  │
        ╞═════╪══════╪══════╡
        │ 1   ┆ four ┆ null │
        │ 2   ┆ five ┆ 8.0  │
        │ 3   ┆ six  ┆ 9.0  │
        └─────┴──────┴──────┘
    """
    selected = [
        f"coalesce({column}, {column_expressions[column]!r}) as {column}"
        if column in column_expressions
        else column
        for column in self.columns
    ]
    return cast(RelationType, self.select(*selected))
|
|
723
|
-
|
|
724
|
-
@property
def columns(self) -> List[str]:
    """
    Return the columns of the relation as a list of strings.

    Everything from the first ``":"`` in a column name of the underlying DuckDB
    relation is stripped away.

    Examples:
        >>> import patito as pt
        >>> pt.duckdb.Relation("select 1 as a, 2 as b").columns
        ['a', 'b']
    """
    names = []
    for raw_name in self._relation.columns:
        # Under certain specific circumstances columns are suffixed with
        # :1, which need to be removed from the column name.
        name, _, _ = raw_name.partition(":")
        names.append(name)
    return names
|
|
737
|
-
|
|
738
|
-
def count(self) -> int:
    """
    Return the number of rows in the given relation.

    Returns:
        Number of rows in the relation as an integer, as computed by a
        ``count(*)`` aggregation on the underlying DuckDB relation.

    Examples:
        >>> import patito as pt
        >>> relation = pt.duckdb.Relation("select 1 as a")
        >>> relation.count()
        1
        >>> (relation + relation).count()
        2

        The :ref:`Relation.__len__()<duckdb.Relation.__len__>` method invokes
        ``Relation.count()`` under the hood, and is equivalent:

        >>> len(relation)
        1
        >>> len(relation + relation)
        2
    """
    row = self._relation.aggregate("count(*)").fetchone()
    # The aggregation yields a single row with the count as its only value.
    return cast(Tuple[int], row)[0]
|
|
762
|
-
|
|
763
|
-
def create_table(self: RelationType, name: str) -> RelationType:
    """
    Create new database table based on relation.

    If ``self.model`` is set with
    :ref:`Relation.set_model()<duckdb.Relation.set_model>`, the table is first
    created from the model through the database, after which the relation is
    inserted into it. Otherwise, the table is created directly from the
    underlying DuckDB relation.

    Args:
        name: Name of the table to create.

    Returns:
        Relation: A relation pointing to the newly created table.

    Examples:
        >>> from typing import Literal
        >>> import patito as pt

        >>> df = pt.DataFrame({"enum_column": ["A", "A", "B"]})
        >>> relation = pt.duckdb.Relation(df)
        >>> relation.create_table("permissive_table").types
        {'enum_column': VARCHAR}

        >>> class TableSchema(pt.Model):
        ...     enum_column: Literal["A", "B", "C"]
        ...
        >>> relation.set_model(TableSchema).create_table("strict_table").types
        {'enum_column': enum__7ba49365cc1b0fd57e61088b3bc9aa25}
    """
    if self.model is None:
        # No model to derive a schema from; let DuckDB create the table.
        self._relation.create(table_name=name)
    else:
        self.database.create_table(name=name, model=self.model)
        self.insert_into(table=name)
    created_table = self.database.table(name)
    return cast(RelationType, created_table)
|
|
796
|
-
|
|
797
|
-
def create_view(
    self: RelationType,
    name: str,
    replace: bool = False,
) -> RelationType:
    """
    Register the relation as a database view and return that view.

    Args:
        name: Name of the view to create.
        replace: Forwarded unchanged to DuckDB's ``create_view``; presumably
            controls whether an existing view of the same name is replaced.

    Returns:
        Relation: A relation pointing to the newly created view.

    Examples:
        >>> import patito as pt
        >>> db = pt.duckdb.Database()
        >>> df = pt.DataFrame({"column": ["A", "A", "B"]})
        >>> relation = db.to_relation(df)
        >>> view = relation.create_view("my_view")
        >>> db.query("select * from my_view").to_df()
        shape: (3, 1)
        ┌────────┐
        │ column │
        │ ---    │
        │ str    │
        ╞════════╡
        │ A      │
        │ A      │
        │ B      │
        └────────┘
    """
    self._relation.create_view(view_name=name, replace=replace)
    created_view = self.database.view(name)
    return cast(RelationType, created_view)
|
|
828
|
-
|
|
829
|
-
def drop(self, *columns: str) -> Relation:
    """
    Return the relation without the given column(s).

    Args:
        columns (str): Any number of string column names to be dropped.

    Raises:
        ValueError: If a given name is not among the remaining columns.

    Examples:
        >>> import patito as pt
        >>> relation = pt.duckdb.Relation("select 1 as a, 2 as b, 3 as c")
        >>> relation.columns
        ['a', 'b', 'c']
        >>> relation.drop("c").columns
        ['a', 'b']
        >>> relation.drop("b", "c").columns
        ['a']
    """
    # Work on a copy so that self.columns is left untouched.
    remaining = self.columns[:]
    for unwanted in columns:
        remaining.remove(unwanted)
    return self[remaining]
|
|
850
|
-
|
|
851
|
-
def distinct(self: RelationType) -> RelationType:
    """
    Return the relation with duplicate rows removed.

    Example:
        >>> import patito as pt
        >>> df = pt.DataFrame(
        ...     [[1, 2, 3], [1, 2, 3], [3, 2, 1]],
        ...     schema=["a", "b", "c"],
        ...     orient="row",
        ... )
        >>> relation = pt.duckdb.Relation(df)
        >>> relation.distinct().to_df()
        shape: (2, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 2   ┆ 3   │
        │ 3   ┆ 2   ┆ 1   │
        └─────┴─────┴─────┘
    """
    deduplicated = self._relation.distinct()
    # The set of columns is unchanged, hence schema_change=False.
    return self._wrap(deduplicated, schema_change=False)
|
|
886
|
-
|
|
887
|
-
def except_(self: RelationType, other: RelationSource) -> RelationType:
    """
    Remove all rows that can be found in the other relation.

    Args:
        other: Another relation or something that can be casted to a relation.

    Returns:
        New relation without the rows that can be found in the other relation.

    Example:
        >>> import patito as pt
        >>> relation_123 = pt.duckdb.Relation(
        ...     "select 1 union select 2 union select 3"
        ... )
        >>> relation_2 = pt.duckdb.Relation("select 2")
        >>> relation_123.except_(relation_2).order(by="1").to_df()
        shape: (2, 1)
        ┌─────┐
        │ 1   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 3   │
        └─────┘
    """
    # Normalise ``other`` to a Relation living in the same database first.
    subtrahend = self.database.to_relation(other)
    difference = self._relation.except_(subtrahend._relation)
    return self._wrap(difference, schema_change=False)
|
|
938
|
-
|
|
939
|
-
def execute(self) -> duckdb.DuckDBPyRelation:
    """
    Run the query built up by the relation and return DuckDB's result object.

    Returns:
        The native DuckDB object produced by executing the query; it exposes
        e.g. ``description`` and ``fetchall()`` as shown below.

    Examples:
        >>> import patito as pt
        >>> relation = pt.duckdb.Relation(
        ...     "select 1 as a, 2 as b union select 3 as a, 4 as b"
        ... )
        >>> result = relation.aggregate("sum(a)", group_by="").execute()
        >>> result.description
        [('sum(a)', 'NUMBER', None, None, None, None, None)]
        >>> result.fetchall()
        [(4,)]
    """
    # Executing through an explicit star-select works around certain DuckDB bugs.
    star_selected = self._relation.project("*")
    return star_selected.execute()
|
|
959
|
-
|
|
960
|
-
def get(self, *filters: str, **equalities: Union[str, int, float]) -> ModelType:
    """
    Fetch the single row that matches the given filter(s).

    Called without arguments, the relation itself is expected to contain
    exactly one row, which is then returned.

    Args:
        filters (str): A conjunction of SQL where clauses.
        equalities (Any): A conjunction of SQL equality clauses. The keyword name
            is the column and the parameter is the value of the equality.

    Returns:
        Model: A Patito model representing the given row.

    Raises:
        RowDoesNotExist: If no row matches.
        MultipleRowsReturned: If more than one row matches.

    Examples:
        >>> import patito as pt
        >>> import polars as pl
        >>> df = pt.DataFrame({"product_id": [1, 2, 3], "price": [10, 10, 20]})
        >>> relation = pt.duckdb.Relation(df).set_alias("my_relation")

        Without an associated model, a dynamically constructed Patito model is
        returned:

        >>> relation.get(product_id=1)
        my_relation(product_id=1, price=10)

        If a model has been associated with the relation through
        :ref:`Relation.set_model()<duckdb.Relation.set_model>`, that model is
        used to represent the row:

        >>> class Product(pt.Model):
        ...     product_id: int = pt.Field(unique=True)
        ...     price: float
        ...
        >>> relation.set_model(Product).get(product_id=1)
        Product(product_id=1, price=10.0)

        ``.get()`` can be invoked without arguments on one-row relations:

        >>> relation.filter(product_id=1).get()
        my_relation(product_id=1, price=10)

        Several matching rows raise ``MultipleRowsReturned``:

        >>> try:
        ...     relation.get(price=10)
        ... except pt.exceptions.MultipleRowsReturned as e:
        ...     print(e)
        ...
        Relation.get(price=10) returned 2 rows!

        Zero matching rows raise ``RowDoesNotExist``:

        >>> try:
        ...     relation.get(price=0)
        ... except pt.exceptions.RowDoesNotExist as e:
        ...     print(e)
        ...
        Relation.get(price=0) returned 0 rows!
    """
    candidates = self.filter(*filters, **equalities) if filters or equalities else self
    cursor = candidates.execute()
    first_row = cursor.fetchone()
    # A second fetch is only attempted when a first row exists; it must be empty.
    if first_row is not None and cursor.fetchone() is None:
        return self._to_model(row=first_row)

    # Failure path: reconstruct the call for the error message and count the rows.
    described = [repr(condition) for condition in filters]
    described += [f"{column}={expected!r}" for column, expected in equalities.items()]
    call_arguments = ",".join(described)

    row_count = candidates.count()
    if row_count == 0:
        raise RowDoesNotExist(f"Relation.get({call_arguments}) returned 0 rows!")
    raise MultipleRowsReturned(
        f"Relation.get({call_arguments}) returned {row_count} rows!"
    )
|
|
1047
|
-
|
|
1048
|
-
def _to_model(self, row: tuple) -> ModelType:
    """
    Convert a raw row tuple into the model type returned to the caller.

    The values of ``row`` are paired positionally with ``self.columns``. If
    ``self.model`` is set, that class is instantiated with the resulting keyword
    arguments. Otherwise a pydantic model is created dynamically from the
    underlying relation and instantiated instead.
    """
    values_by_column = dict(zip(self.columns, row))
    if not self.model:
        # No explicit model: derive one from the relation's own schema.
        RowModel = create_pydantic_model(relation=self._relation)
        return cast(ModelType, RowModel(**values_by_column))
    return self.model(**values_by_column)
|
|
1066
|
-
|
|
1067
|
-
def filter(
    self: RelationType,
    *filters: str,
    **equalities: Union[str, int, float],
) -> RelationType:
    """
    Return subset of rows of relation that satisfy the given predicates.

    The method returns self if no filters are provided.

    Args:
        filters: A conjunction of SQL ``WHERE`` clauses. These are inserted into
            the query verbatim and must therefore be valid, trusted SQL.
        equalities: A conjunction of SQL equality clauses. The keyword name
            is the column and the parameter is the value of the equality.
            String values are rendered as SQL string literals, with embedded
            single quotes doubled; other values are rendered with ``repr()``.

    Returns:
        Relation: A new relation where all rows satisfy the given criteria.

    Examples:
        >>> import patito as pt
        >>> df = pt.DataFrame(
        ...     {
        ...         "number": [1, 2, 3, 4],
        ...         "string": ["A", "A", "B", "B"],
        ...     }
        ... )
        >>> relation = pt.duckdb.Relation(df)
        >>> relation.filter("number % 2 = 0").to_df()
        shape: (2, 2)
        ┌────────┬────────┐
        │ number ┆ string │
        │ ---    ┆ ---    │
        │ i64    ┆ str    │
        ╞════════╪════════╡
        │ 2      ┆ A      │
        │ 4      ┆ B      │
        └────────┴────────┘

        >>> relation.filter(number=1, string="A").to_df()
        shape: (1, 2)
        ┌────────┬────────┐
        │ number ┆ string │
        │ ---    ┆ ---    │
        │ i64    ┆ str    │
        ╞════════╪════════╡
        │ 1      ┆ A      │
        └────────┴────────┘
    """
    if not filters and not equalities:
        return self

    def sql_literal(value: Union[str, int, float]) -> str:
        # repr() applies Python quoting rules: a string containing a single
        # quote comes out double-quoted, which SQL reads as an identifier.
        # Strings are therefore emitted as proper SQL literals instead.
        if isinstance(value, str):
            return "'" + value.replace("'", "''") + "'"
        return repr(value)

    clauses: List[str] = list(filters)
    clauses.extend(f"{key}={sql_literal(value)}" for key, value in equalities.items())
    # Each clause is parenthesised so that "or" inside a clause cannot leak out.
    filter_string = " and ".join(f"({clause})" for clause in clauses)
    return self._wrap(self._relation.filter(filter_string), schema_change=False)
|
|
1125
|
-
|
|
1126
|
-
def join(
    self: RelationType,
    other: RelationSource,
    *,
    on: str,
    how: Literal["inner", "left"] = "inner",
) -> RelationType:
    """
    Join relation with other relation source based on condition.

    :ref:`Relation.inner_join() <duckdb.Relation.inner_join>` and
    :ref:`Relation.left_join() <duckdb.Relation.left_join>` are shortcuts that
    can be used instead of specifying ``how``.

    Args:
        other: A source which can be casted to a ``Relation`` object, and be used
            as the right table in the join.
        on: Join condition following the ``INNER JOIN ... ON`` in the SQL query.
        how: Either ``"left"`` or ``"inner"`` for what type of SQL join operation to
            perform.

    Returns:
        Relation: New relation based on the joined relations.

    Example:
        >>> import patito as pt
        >>> products_df = pt.DataFrame(
        ...     {
        ...         "product_name": ["apple", "banana", "oranges"],
        ...         "supplier_id": [2, 1, 3],
        ...     }
        ... )
        >>> products = pt.duckdb.Relation(products_df)
        >>> supplier_df = pt.DataFrame(
        ...     {
        ...         "id": [1, 2],
        ...         "supplier_name": ["Banana Republic", "Applies Inc."],
        ...     }
        ... )
        >>> suppliers = pt.duckdb.Relation(supplier_df)
        >>> products.set_alias("p").join(
        ...     suppliers.set_alias("s"),
        ...     on="p.supplier_id = s.id",
        ...     how="inner",
        ... ).to_df()
        shape: (2, 4)
        ┌──────────────┬─────────────┬─────┬─────────────────┐
        │ product_name ┆ supplier_id ┆ id  ┆ supplier_name   │
        │ ---          ┆ ---         ┆ --- ┆ ---             │
        │ str          ┆ i64         ┆ i64 ┆ str             │
        ╞══════════════╪═════════════╪═════╪═════════════════╡
        │ apple        ┆ 2           ┆ 2   ┆ Applies Inc.    │
        │ banana       ┆ 1           ┆ 1   ┆ Banana Republic │
        └──────────────┴─────────────┴─────┴─────────────────┘

        >>> products.set_alias("p").join(
        ...     suppliers.set_alias("s"),
        ...     on="p.supplier_id = s.id",
        ...     how="left",
        ... ).to_df()
        shape: (3, 4)
        ┌──────────────┬─────────────┬──────┬─────────────────┐
        │ product_name ┆ supplier_id ┆ id   ┆ supplier_name   │
        │ ---          ┆ ---         ┆ ---  ┆ ---             │
        │ str          ┆ i64         ┆ i64  ┆ str             │
        ╞══════════════╪═════════════╪══════╪═════════════════╡
        │ apple        ┆ 2           ┆ 2    ┆ Applies Inc.    │
        │ banana       ┆ 1           ┆ 1    ┆ Banana Republic │
        │ oranges      ┆ 3           ┆ null ┆ null            │
        └──────────────┴─────────────┴──────┴─────────────────┘
    """
    # Normalise the right-hand side to a Relation in the same database.
    right_side = self.database.to_relation(other)
    joined = self._relation.join(right_side._relation, condition=on, how=how)
    # A join adds columns, so any associated model no longer matches.
    return self._wrap(joined, schema_change=True)
|
|
1203
|
-
|
|
1204
|
-
def inner_join(self: RelationType, other: RelationSource, on: str) -> RelationType:
    """
    Inner join relation with other relation source based on condition.

    Args:
        other: A source which can be casted to a ``Relation`` object, and be used
            as the right table in the join.
        on: Join condition following the ``INNER JOIN ... ON`` in the SQL query.

    Returns:
        Relation: New relation based on the joined relations.

    Example:
        >>> import patito as pt
        >>> products_df = pt.DataFrame(
        ...     {
        ...         "product_name": ["apple", "banana", "oranges"],
        ...         "supplier_id": [2, 1, 3],
        ...     }
        ... )
        >>> products = pt.duckdb.Relation(products_df)
        >>> supplier_df = pt.DataFrame(
        ...     {
        ...         "id": [1, 2],
        ...         "supplier_name": ["Banana Republic", "Applies Inc."],
        ...     }
        ... )
        >>> suppliers = pt.duckdb.Relation(supplier_df)
        >>> products.set_alias("p").inner_join(
        ...     suppliers.set_alias("s"),
        ...     on="p.supplier_id = s.id",
        ... ).to_df()
        shape: (2, 4)
        ┌──────────────┬─────────────┬─────┬─────────────────┐
        │ product_name ┆ supplier_id ┆ id  ┆ supplier_name   │
        │ ---          ┆ ---         ┆ --- ┆ ---             │
        │ str          ┆ i64         ┆ i64 ┆ str             │
        ╞══════════════╪═════════════╪═════╪═════════════════╡
        │ apple        ┆ 2           ┆ 2   ┆ Applies Inc.    │
        │ banana       ┆ 1           ┆ 1   ┆ Banana Republic │
        └──────────────┴─────────────┴─────┴─────────────────┘
    """
    right_side = self.database.to_relation(other)
    joined = self._relation.join(
        other_rel=right_side._relation,
        condition=on,
        how="inner",
    )
    # A join adds columns, so any associated model no longer matches.
    return self._wrap(joined, schema_change=True)
|
|
1254
|
-
|
|
1255
|
-
def left_join(self: RelationType, other: RelationSource, on: str) -> RelationType:
    """
    Left join relation with other relation source based on condition.

    Args:
        other: A source which can be casted to a Relation object, and be used as
            the right table in the join.
        on: Join condition following the ``LEFT JOIN ... ON`` in the SQL query.

    Returns:
        Relation: New relation based on the joined tables.

    Example:
        >>> import patito as pt
        >>> products_df = pt.DataFrame(
        ...     {
        ...         "product_name": ["apple", "banana", "oranges"],
        ...         "supplier_id": [2, 1, 3],
        ...     }
        ... )
        >>> products = pt.duckdb.Relation(products_df)
        >>> supplier_df = pt.DataFrame(
        ...     {
        ...         "id": [1, 2],
        ...         "supplier_name": ["Banana Republic", "Applies Inc."],
        ...     }
        ... )
        >>> suppliers = pt.duckdb.Relation(supplier_df)
        >>> products.set_alias("p").left_join(
        ...     suppliers.set_alias("s"),
        ...     on="p.supplier_id = s.id",
        ... ).to_df()
        shape: (3, 4)
        ┌──────────────┬─────────────┬──────┬─────────────────┐
        │ product_name ┆ supplier_id ┆ id   ┆ supplier_name   │
        │ ---          ┆ ---         ┆ ---  ┆ ---             │
        │ str          ┆ i64         ┆ i64  ┆ str             │
        ╞══════════════╪═════════════╪══════╪═════════════════╡
        │ apple        ┆ 2           ┆ 2    ┆ Applies Inc.    │
        │ banana       ┆ 1           ┆ 1    ┆ Banana Republic │
        │ oranges      ┆ 3           ┆ null ┆ null            │
        └──────────────┴─────────────┴──────┴─────────────────┘
    """
    right_side = self.database.to_relation(other)
    joined = self._relation.join(
        other_rel=right_side._relation,
        condition=on,
        how="left",
    )
    # A join adds columns, so any associated model no longer matches.
    return self._wrap(joined, schema_change=True)
|
|
1306
|
-
|
|
1307
|
-
def limit(self: RelationType, n: int, *, offset: int = 0) -> RelationType:
    """
    Keep at most ``n`` rows, optionally skipping the first ``offset`` rows.

    Args:
        n: The number of rows to keep.
        offset: Disregard the first ``offset`` rows before starting to count which
            rows to keep.

    Returns:
        New relation with only n rows.

    Example:
        >>> import patito as pt
        >>> relation = (
        ...     pt.duckdb.Relation("select 1 as column")
        ...     + pt.duckdb.Relation("select 2 as column")
        ...     + pt.duckdb.Relation("select 3 as column")
        ...     + pt.duckdb.Relation("select 4 as column")
        ... )
        >>> relation.limit(2).to_df()
        shape: (2, 1)
        ┌────────┐
        │ column │
        │ ---    │
        │ i64    │
        ╞════════╡
        │ 1      │
        │ 2      │
        └────────┘
        >>> relation.limit(2, offset=2).to_df()
        shape: (2, 1)
        ┌────────┐
        │ column │
        │ ---    │
        │ i64    │
        ╞════════╡
        │ 3      │
        │ 4      │
        └────────┘
    """
    truncated = self._relation.limit(n=n, offset=offset)
    return self._wrap(truncated, schema_change=False)
|
|
1349
|
-
|
|
1350
|
-
def order(self: RelationType, by: Union[str, Iterable[str]]) -> RelationType:
    """
    Change the order of the rows of the relation.

    Args:
        by: An ``ORDER BY`` SQL expression such as ``"age DESC"`` or
            ``("age DESC", "name ASC")``. Several expressions are combined
            with ``", "`` into a single ``ORDER BY`` expression.

    Returns:
        New relation where the rows have been ordered according to ``by``.

    Example:
        >>> import patito as pt
        >>> df = pt.DataFrame(
        ...     {
        ...         "name": ["Alice", "Bob", "Charles", "Diana"],
        ...         "age": [20, 20, 30, 35],
        ...     }
        ... )
        >>> relation = pt.duckdb.Relation(df)
        >>> relation.order(by="age desc").to_df()
        shape: (4, 2)
        ┌─────────┬─────┐
        │ name    ┆ age │
        │ ---     ┆ --- │
        │ str     ┆ i64 │
        ╞═════════╪═════╡
        │ Diana   ┆ 35  │
        │ Charles ┆ 30  │
        │ Alice   ┆ 20  │
        │ Bob     ┆ 20  │
        └─────────┴─────┘
        >>> relation.order(by=["age desc", "name desc"]).to_df()
        shape: (4, 2)
        ┌─────────┬─────┐
        │ name    ┆ age │
        │ ---     ┆ --- │
        │ str     ┆ i64 │
        ╞═════════╪═════╡
        │ Diana   ┆ 35  │
        │ Charles ┆ 30  │
        │ Bob     ┆ 20  │
        │ Alice   ┆ 20  │
        └─────────┴─────┘
    """
    if isinstance(by, str):
        # A single string is already a complete ORDER BY expression.
        ordering = by
    else:
        ordering = ", ".join(by)
    reordered = self._relation.order(order_expr=ordering)
    return self._wrap(reordered, schema_change=False)
|
|
1412
|
-
|
|
1413
|
-
def insert_into(
    self: RelationType,
    table: str,
) -> RelationType:
    """
    Insert all rows of the relation into a given table.

    The relation must contain every column of the target table. Extra columns
    are ignored, and the columns are reordered to match the target table before
    the rows are inserted.

    Args:
        table: Name of table for which to insert values into.

    Returns:
        Relation: The original relation, i.e. ``self``.

    Raises:
        TypeError: If the relation lacks one or more columns of the target table.

    Examples:
        >>> import patito as pt
        >>> db = pt.duckdb.Database()
        >>> _ = db.to_relation("select 1 as a").create_table("my_table")
        >>> _ = db.to_relation("select 2 as a").insert_into("my_table")
        >>> db.table("my_table").to_df()
        shape: (2, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 2   │
        └─────┘
    """
    target = self.database.table(table)
    absent = set(target.columns) - set(self.columns)
    if absent:
        raise TypeError(
            f"Relation is missing column(s) {absent} "
            f"in order to be inserted into table '{table}'!",
        )

    # Select exactly the target's columns, in the target's own order.
    aligned = self[target.columns]
    aligned._relation.insert_into(table_name=table)
    return self
|
|
1466
|
-
|
|
1467
|
-
def intersect(self: RelationType, other: RelationSource) -> RelationType:
    """
    Return a new relation containing the rows that are present in both relations.

    This is a set operation which will remove duplicate rows as well.

    Args:
        other: Another relation with the same column names.

    Returns:
        Relation[Model]: A new relation with only those rows that are present in
        both relations.

    Example:
        >>> import patito as pt
        >>> df1 = pt.DataFrame({"a": [1, 1, 2], "b": [1, 1, 2]})
        >>> df2 = pt.DataFrame({"a": [1, 1, 3], "b": [1, 1, 3]})
        >>> pt.duckdb.Relation(df1).intersect(pt.duckdb.Relation(df2)).to_df()
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 1   │
        └─────┴─────┘
    """
    # Normalise ``other`` to a Relation living in the same database first.
    counterpart = self.database.to_relation(other)
    common_rows = self._relation.intersect(counterpart._relation)
    return self._wrap(common_rows, schema_change=False)
|
|
1499
|
-
|
|
1500
|
-
def select(
    self,
    *projections: Union[str, int, float],
    **named_projections: Union[str, int, float],
) -> Relation:
    """
    Return relation based on one or more SQL ``SELECT`` projections.

    Keyword arguments are converted into ``{arg} as {keyword}`` in the executed SQL
    query.

    Args:
        *projections: One or more SQL expressions to be selected, for example
            ``"2"`` or ``"another_column"``. Numbers are accepted as well and
            are inserted into the query as literals.
        **named_projections: One or more keyword arguments where the keyword
            specifies the name of the new column and the value is an SQL statement
            defining the content of the new column. For example
            ``new_column="2 * another_column"``.

    Examples:
        >>> import patito as pt
        >>> db = pt.duckdb.Database()
        >>> relation = db.to_relation(pt.DataFrame({"original_column": [1, 2, 3]}))
        >>> relation.select("*").to_df()
        shape: (3, 1)
        ┌─────────────────┐
        │ original_column │
        │ ---             │
        │ i64             │
        ╞═════════════════╡
        │ 1               │
        │ 2               │
        │ 3               │
        └─────────────────┘
        >>> relation.select("*", multiplied_column="2 * original_column").to_df()
        shape: (3, 2)
        ┌─────────────────┬───────────────────┐
        │ original_column ┆ multiplied_column │
        │ ---             ┆ ---               │
        │ i64             ┆ i64               │
        ╞═════════════════╪═══════════════════╡
        │ 1               ┆ 2                 │
        │ 2               ┆ 4                 │
        │ 3               ┆ 6                 │
        └─────────────────┴───────────────────┘
    """
    # We expand '*' to an explicit list of columns in order to support redefining
    # columns within the star expressed columns. Only the first '*' is expanded.
    expanded_projections: list = list(projections)
    if "*" in expanded_projections:
        star_index = expanded_projections.index("*")
        # Explicitly named projections overwrite star-selected columns.
        expanded_projections[star_index : star_index + 1] = [
            column for column in self.columns if column not in named_projections
        ]

    # Positional projections may be int/float according to the signature, so
    # every item is converted to text before joining; str.join() would otherwise
    # raise a TypeError for non-string items.
    projection = ", ".join(
        [str(expression) for expression in expanded_projections]
        + [
            f"{expression} as {column_name}"
            for column_name, expression in named_projections.items()
        ]
    )
    try:
        relation = self._relation.project(projection)
    except RuntimeError as exc:  # pragma: no cover
        # We might get a RunTime error if the enum type has not
        # been created yet. If so, we create all enum types for
        # this model and retry once.
        if self.model is not None and _is_missing_enum_type_exception(exc):
            self.database.create_enum_types(model=self.model)
            relation = self._relation.project(projection)
        else:
            raise
    return self._wrap(relation=relation, schema_change=True)
|
|
1580
|
-
|
|
1581
|
-
def rename(self, **columns: str) -> Relation:
    """
    Rename columns as specified.

    The original column order is preserved. A column that is not itself renamed
    but whose name is taken over by a renamed column is overwritten, i.e. it is
    dropped from the result.

    Args:
        **columns: A set of keyword arguments where the keyword is the old column
            name and the value is the new column name.

    Raises:
        ValueError: If any of the given keywords do not exist as columns in the
            relation.

    Examples:
        >>> import patito as pt
        >>> relation = pt.duckdb.Relation("select 1 as a, 2 as b")
        >>> relation.rename(b="c").to_df()
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ c   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 2   │
        └─────┴─────┘
    """
    current_columns = list(self.columns)
    missing = [old_name for old_name in columns if old_name not in current_columns]
    if missing:
        raise ValueError(
            f"Column '{missing[0]}' can not be renamed as it does not exist. "
            f"The columns of the relation are: {', '.join(current_columns)}."
        )

    new_names = set(columns.values())
    projections = []
    # Iterate over the list, not a set, so the output column order is stable.
    for column in current_columns:
        if column in columns:
            # A renamed column is always kept, even if its old name is also the
            # target of another rename (e.g. when swapping two column names).
            projections.append(f"{column} as {columns[column]}")
        elif column not in new_names:
            projections.append(f"{column} as {column}")
        # Otherwise the column is overwritten by a renamed column; drop it.

    relation = self._relation.project(", ".join(projections))
    return self._wrap(relation=relation, schema_change=True)
|
|
1623
|
-
|
|
1624
|
-
def set_alias(self: RelationType, name: str) -> RelationType:
    """
    Set SQL alias for the given relation to be used in further queries.

    Args:
        name: The new alias for the given relation.

    Returns:
        Relation: A new relation containing the same query but addressable with the
        new alias.

    Example:
        >>> import patito as pt
        >>> relation_1 = pt.duckdb.Relation("select 1 as a, 2 as b")
        >>> relation_2 = pt.duckdb.Relation("select 1 as a, 3 as c")
        >>> relation_1.set_alias("x").inner_join(
        ...     relation_2.set_alias("y"),
        ...     on="x.a = y.a",
        ... ).select("x.a", "y.a", "b", "c").to_df()
        shape: (1, 4)
        ┌─────┬─────┬─────┬─────┐
        │ a   ┆ a:1 ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╪═════╡
        │ 1   ┆ 1   ┆ 2   ┆ 3   │
        └─────┴─────┴─────┴─────┘
    """
    aliased = self._relation.set_alias(name)
    # Only the name of the relation changes, not its columns.
    return self._wrap(aliased, schema_change=False)
|
|
1656
|
-
|
|
1657
|
-
def set_model(self, model):  # type: ignore[no-untyped-def] # noqa: ANN
    """
    Associate a given Patito model with the relation.

    The relation which is returned carries a ``.model`` attribute. That attribute
    is in turn used by several methods such as
    :ref:`Relation.get()<duckdb.Relation.get>`,
    :ref:`Relation.create_table()<duckdb.Relation.create_table>`, and
    :ref:`Relation.__iter__<duckdb.Relation.__iter__>`.

    Args:
        model: A Patito Model class specifying the intended schema of the relation.

    Returns:
        Relation[model]: A new relation with the associated model.

    Example:
        >>> from typing import Literal
        >>> import patito as pt
        >>> class MySchema(pt.Model):
        ...     float_column: float
        ...     enum_column: Literal["A", "B", "C"]
        ...
        >>> relation = pt.duckdb.Relation(
        ...     "select 1 as float_column, 'A' as enum_column"
        ... )
        >>> relation.get()
        query_relation(float_column=1, enum_column='A')
        >>> relation.set_model(MySchema).get()
        MySchema(float_column=1.0, enum_column='A')
        >>> relation.create_table("unmodeled_table").types
        {'float_column': INTEGER, 'enum_column': VARCHAR}
        >>> relation.set_model(MySchema).create_table("modeled_table").types
        {'float_column': DOUBLE,
         'enum_column': enum__7ba49365cc1b0fd57e61088b3bc9aa25}
    """
    # The precise return type, type(self)[type(model)], can not be annotated
    # because python lacks higher-kinded generics as of this writing.
    # See: https://github.com/python/typing/issues/548
    # Note that the cast() below is inaccurate for sub-classes of Relation.
    modeled_type = Relation[model]
    modeled_relation = type(self)(
        derived_from=self._relation,
        database=self.database,
        model=model,
    )
    return cast(modeled_type, modeled_relation)
|
|
1704
|
-
|
|
1705
|
-
@property
def types(self):  # type: ignore[no-untyped-def] # noqa
    """
    Return the SQL types of all the columns of the given relation.

    Returns:
        dict: A dictionary mapping each column name of the relation to the SQL
        type reported for that column by the underlying DuckDB relation.

    Examples:
        >>> import patito as pt
        >>> pt.duckdb.Relation("select 1 as a, 'my_value' as b").types
        {'a': INTEGER, 'b': VARCHAR}
    """
    column_names = self.columns
    sql_types = self._relation.types
    # Names and types are paired up positionally.
    return dict(zip(column_names, sql_types))
|
|
1720
|
-
|
|
1721
|
-
def to_pandas(self) -> "pd.DataFrame":
    """
    Return a pandas DataFrame representation of the relation.

    Returns: A ``pandas.DataFrame`` object containing all the data of the relation.

    Example:
        >>> import patito as pt
        >>> pt.duckdb.Relation("select 1 as column union select 2 as column").order(
        ...     by="1"
        ... ).to_pandas()
           column
        0       1
        1       2
    """
    # The conversion is delegated to the wrapped DuckDB relation's `to_df()`.
    pandas_dataframe = self._relation.to_df()
    return pandas_dataframe
|
|
1737
|
-
|
|
1738
|
-
def to_df(self) -> DataFrame:
    """
    Return a polars DataFrame representation of the relation.

    Note that the wrapped DuckDB relation stored on ``self`` is replaced with a
    star-selected version of itself as part of the conversion.

    Returns: A ``patito.DataFrame`` object which inherits from ``polars.DataFrame``.

    Example:
        >>> import patito as pt
        >>> pt.duckdb.Relation("select 1 as column union select 2 as column").order(
        ...     by="1"
        ... ).to_df()
        shape: (2, 1)
        ┌────────┐
        │ column │
        │ ---    │
        │ i64    │
        ╞════════╡
        │ 1      │
        │ 2      │
        └────────┘
    """

    def _arrow_to_polars(table: pa.lib.Table) -> DataFrame:
        # `INTEGER`-typed columns are cast to `pl.Int64` when converting to
        # polars, since polars is much more eager to store integer Series as
        # 64-bit integers. Without this, a lot of manual casting would be needed
        # whenever the boundary between DuckDB and polars is crossed.
        return DataFrame._from_arrow(table).with_columns(
            pl.col(pl.Int32).cast(pl.Int64)
        )

    # A star-select is performed to work around certain weird issues with DuckDB.
    self._relation = self._relation.project("*")
    arrow_table = cast(pa.lib.Table, self._relation.to_arrow_table())
    try:
        return _arrow_to_polars(arrow_table)
    except (pa.ArrowInvalid, pl.ArrowError):  # pragma: no cover
        # Empty relations with enum columns can sometimes produce errors.
        # As a last-ditch effort, such (dictionary-encoded) columns are
        # converted to VARCHAR before a second conversion attempt.
        projections = []
        for field in arrow_table.schema:
            if isinstance(field.type, pa.DictionaryType):
                projections.append(f"{field.name}::VARCHAR as {field.name}")
            else:
                projections.append(field.name)
        varchar_relation = self._relation.project(", ".join(projections))
        return _arrow_to_polars(varchar_relation.to_arrow_table())
|
|
1784
|
-
|
|
1785
|
-
def to_series(self) -> pl.Series:
    """
    Convert the given single-column relation to a polars Series.

    Raises:
        TypeError: If the given relation does not contain exactly one column.

    Returns: A ``polars.Series`` object containing the data of the relation.

    Example:
        >>> import patito as pt
        >>> relation = pt.duckdb.Relation("select 1 as a union select 2 as a")
        >>> relation.order(by="a").to_series()
        shape: (2,)
        Series: 'a' [i32]
        [
                1
                2
        ]
    """
    column_count = len(self._relation.columns)
    if column_count != 1:
        raise TypeError(
            f"{self.__class__.__name__}.to_series() was invoked on a relation with "
            f"{column_count} columns, while exactly 1 is required!"
        )
    dataframe: DataFrame = DataFrame._from_arrow(self._relation.to_arrow_table())
    # The sole column is extracted and named after the relation's column.
    only_series = dataframe.to_series(index=0)
    return only_series.alias(name=self.columns[0])
|
|
1812
|
-
|
|
1813
|
-
def union(self: RelationType, other: RelationSource) -> RelationType:
    """
    Produce a new relation that contains the rows of both relations.

    The ``+`` operator can also be used to union two relations.

    Both relations must have the same column names, but the columns may come in
    a different order; unlike regular SQL, the columns are automatically
    reordered before the union is performed.

    Duplicates are `not` dropped.

    Args:
        other: A ``patito.duckdb.Relation`` object or something that can be
            *casted* to ``patito.duckdb.Relation``.
            See :ref:`Relation<duckdb.Relation.__init__>`.

    Returns:
        New relation containing the rows of both ``self`` and ``other``.

    Raises:
        TypeError: If the two relations do not contain the same columns.

    Examples:
        >>> import patito as pt
        >>> relation_1 = pt.duckdb.Relation("select 1 as a")
        >>> relation_2 = pt.duckdb.Relation("select 2 as a")
        >>> relation_1.union(relation_2).to_df()
        shape: (2, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 2   │
        └─────┘

        >>> (relation_1 + relation_2).to_df()
        shape: (2, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 2   │
        └─────┘
    """
    other_relation = self.database.to_relation(other)
    left_columns = set(self.columns)
    right_columns = set(other_relation.columns)

    if left_columns != right_columns:
        only_in_left = left_columns - right_columns
        only_in_right = right_columns - left_columns
        message_parts = [
            "Union between relations with different column names is not allowed."
        ]
        if only_in_left:
            message_parts.append(
                f" Additional columns in left relation: {only_in_left}."
            )
        if only_in_right:
            message_parts.append(
                f" Additional columns in right relation: {only_in_right}."
            )
        raise TypeError("".join(message_parts))

    # Same column names at this point; align the column order of the right-hand
    # side with the left-hand side if they differ.
    aligned_relation = (
        other_relation[self.columns]
        if other_relation.columns != self.columns
        else other_relation
    )
    unioned_relation = self._relation.union(aligned_relation._relation)
    return self._wrap(relation=unioned_relation, schema_change=False)
|
|
1878
|
-
|
|
1879
|
-
def with_columns(
    self,
    **named_projections: Union[str, int, float],
) -> Relation:
    """
    Return relation with additional columns.

    If a provided column expression is named the same as a column which already
    exists on the relation, the existing column is overwritten.

    Args:
        named_projections: A set of column expressions, where the keyword is used
            as the column name, while the right-hand argument is a valid SQL
            expression.

    Returns:
        Relation with the given columns appended, or possibly overwritten.

    Examples:
        >>> import patito as pt
        >>> db = pt.duckdb.Database()
        >>> relation = db.to_relation("select 1 as a, 2 as b")
        >>> relation.with_columns(c="a + b").to_df()
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 2   ┆ 3   │
        └─────┴─────┴─────┘
    """
    # A star-select keeps the existing columns; the named projections are
    # forwarded to select() unchanged.
    extended_relation = self.select("*", **named_projections)
    return extended_relation
|
|
1912
|
-
|
|
1913
|
-
def with_missing_defaultable_columns(
    self: RelationType,
    include: Optional[Iterable[str]] = None,
    exclude: Optional[Iterable[str]] = None,
) -> RelationType:
    """
    Add missing defaultable columns filled with the default values of correct type.

    Make sure to invoke :ref:`Relation.set_model()<duckdb.Relation.set_model>` with
    the correct model schema before executing
    ``Relation.with_missing_defaultable_columns()``.

    The filled-in columns are appended after the existing columns, in the order
    in which they are listed in ``model.columns``.

    Args:
        include: If provided, only fill in default values for missing columns part
            of this collection of column names.
        exclude: If provided, do `not` fill in default values for missing columns
            part of this collection of column names.

    Returns:
        Relation: New relation where missing columns with default values according
        to the schema have been filled in.

    Raises:
        TypeError: If no model has been set on the relation, or if both
            ``include`` and ``exclude`` are provided.

    Example:
        >>> import patito as pt
        >>> class MyModel(pt.Model):
        ...     non_default_column: int
        ...     another_non_default_column: int
        ...     default_column: int = 42
        ...     another_default_column: int = 42
        ...
        >>> relation = pt.duckdb.Relation(
        ...     "select 1 as non_default_column, 2 as default_column"
        ... )
        >>> relation.to_df()
        shape: (1, 2)
        ┌────────────────────┬────────────────┐
        │ non_default_column ┆ default_column │
        │ ---                ┆ ---            │
        │ i64                ┆ i64            │
        ╞════════════════════╪════════════════╡
        │ 1                  ┆ 2              │
        └────────────────────┴────────────────┘
        >>> relation.set_model(MyModel).with_missing_defaultable_columns().to_df()
        shape: (1, 3)
        ┌────────────────────┬────────────────┬────────────────────────┐
        │ non_default_column ┆ default_column ┆ another_default_column │
        │ ---                ┆ ---            ┆ ---                    │
        │ i64                ┆ i64            ┆ i64                    │
        ╞════════════════════╪════════════════╪════════════════════════╡
        │ 1                  ┆ 2              ┆ 42                     │
        └────────────────────┴────────────────┴────────────────────────┘
    """
    if self.model is None:
        class_name = self.__class__.__name__
        raise TypeError(
            f"{class_name}.with_missing_defaultable_columns() invoked without "
            f"{class_name}.model having been set! "
            f"You should invoke {class_name}.set_model() first!"
        )
    if include is not None and exclude is not None:
        raise TypeError("Both include and exclude provided at the same time!")

    missing_columns = set(self.model.columns) - set(self.columns)
    missing_defaultable_columns = missing_columns & self.model.defaults.keys()

    if exclude is not None:
        missing_defaultable_columns -= set(exclude)
    elif include is not None:
        missing_defaultable_columns &= set(include)

    # Walk the model's columns rather than the set itself: set iteration order of
    # strings differs between interpreter runs, which would make the column order
    # of the resulting relation non-deterministic.
    projections = ["*"]
    for column_name in self.model.columns:
        if column_name not in missing_defaultable_columns:
            continue
        sql_type = self.model.sql_types[column_name]
        default_value = self.model.defaults[column_name]
        # NOTE(review): the default is rendered as a SQL literal with repr();
        # presumably only suitable for simple values -- confirm for strings
        # containing quote characters.
        projections.append(f"{default_value!r}::{sql_type} as {column_name}")
    projection = ", ".join(projections)

    try:
        relation = self._relation.project(projection)
    except Exception as exc:  # pragma: no cover
        # We might get a RunTime error if the enum type has not
        # been created yet. If so, we create all enum types for
        # this model and retry the projection once.
        if not _is_missing_enum_type_exception(exc):
            raise
        self.database.create_enum_types(model=self.model)
        relation = self._relation.project(projection)
    # The model stays associated with the returned relation.
    return self._wrap(relation=relation, schema_change=False)
|
|
2002
|
-
|
|
2003
|
-
def with_missing_nullable_columns(
    self: RelationType,
    include: Optional[Iterable[str]] = None,
    exclude: Optional[Iterable[str]] = None,
) -> RelationType:
    """
    Add missing nullable columns filled with correctly typed nulls.

    Make sure to invoke :ref:`Relation.set_model()<duckdb.Relation.set_model>` with
    the correct model schema before executing
    ``Relation.with_missing_nullable_columns()``.

    The filled-in columns are appended after the existing columns, in the order
    in which they are listed in ``model.columns``.

    Args:
        include: If provided, only fill in null values for missing columns part of
            this collection of column names.
        exclude: If provided, do `not` fill in null values for missing columns
            part of this collection of column names.

    Returns:
        Relation: New relation where missing nullable columns have been filled in
        with null values.

    Raises:
        TypeError: If no model has been set on the relation, or if both
            ``include`` and ``exclude`` are provided.

    Example:
        >>> from typing import Optional
        >>> import patito as pt
        >>> class MyModel(pt.Model):
        ...     non_nullable_column: int
        ...     nullable_column: Optional[int]
        ...     another_nullable_column: Optional[int]
        ...
        >>> relation = pt.duckdb.Relation("select 1 as nullable_column")
        >>> relation.to_df()
        shape: (1, 1)
        ┌─────────────────┐
        │ nullable_column │
        │ ---             │
        │ i64             │
        ╞═════════════════╡
        │ 1               │
        └─────────────────┘
        >>> relation.set_model(MyModel).with_missing_nullable_columns().to_df()
        shape: (1, 2)
        ┌─────────────────┬─────────────────────────┐
        │ nullable_column ┆ another_nullable_column │
        │ ---             ┆ ---                     │
        │ i64             ┆ i64                     │
        ╞═════════════════╪═════════════════════════╡
        │ 1               ┆ null                    │
        └─────────────────┴─────────────────────────┘
    """
    if self.model is None:
        class_name = self.__class__.__name__
        raise TypeError(
            f"{class_name}.with_missing_nullable_columns() invoked without "
            f"{class_name}.model having been set! "
            f"You should invoke {class_name}.set_model() first!"
        )
    if include is not None and exclude is not None:
        raise TypeError("Both include and exclude provided at the same time!")

    missing_columns = set(self.model.columns) - set(self.columns)
    missing_nullable_columns = self.model.nullable_columns & missing_columns

    if exclude is not None:
        missing_nullable_columns -= set(exclude)
    elif include is not None:
        missing_nullable_columns = missing_nullable_columns & set(include)

    # Walk the model's columns rather than the set itself: set iteration order of
    # strings differs between interpreter runs, which would make the column order
    # of the resulting relation non-deterministic.
    projections = ["*"]
    for column_name in self.model.columns:
        if column_name not in missing_nullable_columns:
            continue
        sql_type = self.model.sql_types[column_name]
        projections.append(f"null::{sql_type} as {column_name}")
    projection = ", ".join(projections)

    try:
        relation = self._relation.project(projection)
    except Exception as exc:  # pragma: no cover
        # We might get a RunTime error if the enum type has not
        # been created yet. If so, we create all enum types for
        # this model and retry the projection once.
        if not _is_missing_enum_type_exception(exc):
            raise
        self.database.create_enum_types(model=self.model)
        relation = self._relation.project(projection)
    # The model stays associated with the returned relation.
    return self._wrap(relation=relation, schema_change=False)
|
|
2088
|
-
|
|
2089
|
-
def __add__(self: RelationType, other: RelationSource) -> RelationType:
    """
    Implement the ``+`` operator by executing ``self.union(other)``.

    See :ref:`Relation.union()<duckdb.Relation.union>` for full documentation.
    """
    unioned_relation = self.union(other)
    return unioned_relation
|
|
2096
|
-
|
|
2097
|
-
def __eq__(self, other: object) -> bool:
    """
    Check if Relation is equal to a Relation-able data source.

    ``other`` is first converted to a relation in the same database; the two
    are considered equal when they have the same number of rows and every pair
    of rows, taken in iteration order, compares equal.
    """
    other_relation = self.database.to_relation(other)  # type: ignore
    # A differing row count settles the matter without iterating any rows.
    if not (self.count() == other_relation.count()):
        return False
    # Use zip(self, other_relation, strict=True) when we upgrade to Python 3.10.
    for row, other_row in zip(self, other_relation):
        if not (row == other_row):
            return False
    return True
|
|
2105
|
-
|
|
2106
|
-
def __getitem__(self, key: Union[str, Iterable[str]]) -> Relation:
    """
    Return Relation with selected columns.

    The selection is carried out as a SQL projection, in the same manner as
    :ref:`Relation.select()<duckdb.Relation.select>`. It can therefore
    technically be used to rename columns, define derived columns, and so on,
    but prefer the use of Relation.select() for such use cases.

    Args:
        key: Columns to select, either a single column represented as a string, or
            an iterable of strings.

    Returns:
        New relation only containing the column subset specified.

    Example:
        >>> import patito as pt
        >>> relation = pt.duckdb.Relation("select 1 as a, 2 as b, 3 as c")
        >>> relation.to_df()
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 2   ┆ 3   │
        └─────┴─────┴─────┘
        >>> relation[["a", "b"]].to_df()
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 2   │
        └─────┴─────┘
        >>> relation["a"].to_df()
        shape: (1, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        └─────┘
    """
    if isinstance(key, str):
        # A single string is used as the projection as-is.
        projection = key
    else:
        projection = ", ".join(key)
    selected_relation = self._relation.project(projection)
    # The set of columns may have changed, so this counts as a schema change.
    return self._wrap(relation=selected_relation, schema_change=True)
|
|
2158
|
-
|
|
2159
|
-
def __iter__(self) -> Iterator[ModelType]:
    """
    Iterate over rows in relation.

    If :ref:`Relation.set_model()<duckdb.Relation.set_model>` has been invoked
    first, the given model will be used to deserialize each row. Otherwise a Patito
    model is dynamically constructed which fits the schema of the relation.

    Returns:
        Iterator[Model]: An iterator of patito Model objects representing each row.

    Example:
        >>> from typing import Literal
        >>> import patito as pt
        >>> df = pt.DataFrame({"float_column": [1, 2], "enum_column": ["A", "B"]})
        >>> relation = pt.duckdb.Relation(df).set_alias("my_relation")
        >>> for row in relation:
        ...     print(row)
        ...
        float_column=1 enum_column='A'
        float_column=2 enum_column='B'
        >>> list(relation)
        [my_relation(float_column=1, enum_column='A'),
         my_relation(float_column=2, enum_column='B')]

        >>> class MySchema(pt.Model):
        ...     float_column: float
        ...     enum_column: Literal["A", "B", "C"]
        ...
        >>> relation = relation.set_model(MySchema)
        >>> for row in relation:
        ...     print(row)
        ...
        float_column=1.0 enum_column='A'
        float_column=2.0 enum_column='B'
        >>> list(relation)
        [MySchema(float_column=1.0, enum_column='A'),
         MySchema(float_column=2.0, enum_column='B')]
    """
    query_result = self._relation.execute()
    # Rows are fetched one at a time; a falsy fetch result ends the iteration.
    row_tuple = query_result.fetchone()
    while row_tuple:
        yield self._to_model(row_tuple)
        row_tuple = query_result.fetchone()
|
|
2205
|
-
|
|
2206
|
-
def __len__(self) -> int:
    """
    Return the number of rows in the relation, as given by ``self.count()``.

    See :ref:`Relation.count()<duckdb.Relation.count>` for full documentation.
    """
    row_count = self.count()
    return row_count
|
|
2213
|
-
|
|
2214
|
-
def __str__(self) -> str:
    """
    Return string representation of Relation object.

    The string is produced by the wrapped DuckDB relation and includes an
    expression tree, the result columns, and a result preview.

    Example:
        >>> import patito as pt
        >>> products = pt.duckdb.Relation(
        ...     pt.DataFrame(
        ...         {
        ...             "product_name": ["apple", "red_apple", "banana", "oranges"],
        ...             "supplier_id": [2, 2, 1, 3],
        ...         }
        ...     )
        ... ).set_alias("products")
        >>> print(str(products))  # xdoctest: +SKIP
        ---------------------
        --- Relation Tree ---
        ---------------------
        arrow_scan(94609350519648, 140317161740928, 140317161731168, 1000000)\

        ---------------------
        -- Result Columns --
        ---------------------
        - product_name (VARCHAR)
        - supplier_id (BIGINT)\

        ---------------------
        -- Result Preview --
        ---------------------
        product_name supplier_id
        VARCHAR BIGINT
        [ Rows: 4]
        apple 2
        red_apple 2
        banana 1
        oranges 3

        >>> suppliers = pt.duckdb.Relation(
        ...     pt.DataFrame(
        ...         {
        ...             "id": [1, 2],
        ...             "supplier_name": ["Banana Republic", "Applies Inc."],
        ...         }
        ...     )
        ... ).set_alias("suppliers")
        >>> relation = (
        ...     products.set_alias("p")
        ...     .inner_join(
        ...         suppliers.set_alias("s"),
        ...         on="p.supplier_id = s.id",
        ...     )
        ...     .aggregate(
        ...         "supplier_name",
        ...         num_products="count(product_name)",
        ...         group_by=["supplier_id", "supplier_name"],
        ...     )
        ... )
        >>> print(str(relation))  # xdoctest: +SKIP
        ---------------------
        --- Relation Tree ---
        ---------------------
        Aggregate [supplier_name, count(product_name)]
          Join INNER p.supplier_id = s.id
            arrow_scan(94609350519648, 140317161740928, 140317161731168, 1000000)
            arrow_scan(94609436221024, 140317161740928, 140317161731168, 1000000)\

        ---------------------
        -- Result Columns --
        ---------------------
        - supplier_name (VARCHAR)
        - num_products (BIGINT)\

        ---------------------
        -- Result Preview --
        ---------------------
        supplier_name num_products
        VARCHAR BIGINT
        [ Rows: 2]
        Applies Inc. 2
        Banana Republic 1

    """
    # The textual rendering is delegated entirely to the wrapped relation.
    rendered = str(self._relation)
    return rendered
|
|
2299
|
-
|
|
2300
|
-
def _wrap(
    self: RelationType,
    relation: "duckdb.DuckDBPyRelation",
    schema_change: bool = False,
) -> RelationType:
    """
    Wrap DuckDB Relation object in same Relation wrapper class as self.

    The wrapper is always constructed with ``type(self)``, so the type of the
    relation is preserved, even for subclasses of Relation. The new wrapper
    shares the database of ``self``.

    Args:
        relation: The native DuckDB relation to wrap.
        schema_change: Set to ``False`` only for relations which can be
            considered schema-compatible with the original relation; the model
            of ``self`` is then carried over. If ``True``, the new wrapper is
            created without any associated model.

    Returns:
        A new wrapper of the same class as ``self`` around ``relation``.
    """
    database = self.database
    # A model describes a specific schema and is dropped once the schema changes.
    model = None if schema_change else self.model
    return type(self)(derived_from=relation, database=database, model=model)
|
|
2318
|
-
|
|
2319
|
-
|
|
2320
|
-
class Database:
|
|
2321
|
-
# Types created in order to represent enum strings
|
|
2322
|
-
enum_types: Set[str]
|
|
2323
|
-
|
|
2324
|
-
def __init__(
    self,
    path: Optional[Path] = None,
    read_only: bool = False,
    **kwargs: Any,  # noqa: ANN401
) -> None:
    """
    Instantiate a new DuckDB database, either persisted to disk or in-memory.

    Args:
        path: Optional path to store all the data to. If ``None`` the data is
            persisted in-memory only.
        read_only: If the database connection should be a read-only connection.
        **kwargs: Additional keywords forwarded to ``duckdb.connect()``.

    Examples:
        >>> import patito as pt
        >>> db = pt.duckdb.Database()
        >>> db.to_relation("select 1 as a, 2 as b").create_table("my_table")
        >>> db.query("select * from my_table").to_df()
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 2   │
        └─────┴─────┘
    """
    # duckdb is imported at call time rather than at module import time.
    import duckdb

    self.path = path
    # Without a path, DuckDB's special ":memory:" database name is used.
    if path:
        database_name = str(path)
    else:
        database_name = ":memory:"
    self.connection = duckdb.connect(
        database=database_name,
        read_only=read_only,
        **kwargs,
    )
    # No enum types are registered initially.
    self.enum_types: Set[str] = set()
|
|
2362
|
-
|
|
2363
|
-
@classmethod
def default(cls) -> Database:
    """
    Return the default DuckDB database.

    Returns:
        A patito :ref:`Database<duckdb.Database>` object wrapping around
        ``duckdb.default_connection``.

    Example:
        >>> import patito as pt
        >>> db = pt.duckdb.Database.default()
        >>> db.query("select 1 as a, 2 as b").to_df()
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 2   │
        └─────┴─────┘
    """
    # duckdb is imported at call time rather than at module import time.
    import duckdb

    default_connection = duckdb.default_connection
    return cls.from_connection(default_connection)
|
|
2388
|
-
|
|
2389
|
-
@classmethod
def from_connection(cls, connection: "duckdb.DuckDBPyConnection") -> Database:
    """
    Create database from native DuckDB connection object.

    The instance is created without invoking ``__init__``, so no new connection
    is opened. Its ``path`` attribute is set to ``None`` since no path is
    provided alongside the connection.

    Args:
        connection: A native DuckDB connection object created with
            ``duckdb.connect()``.

    Returns:
        A :ref:`Database<duckdb.Database>` object wrapping around the given
        connection.

    Example:
        >>> import duckdb
        >>> import patito as pt
        >>> connection = duckdb.connect()
        >>> database = pt.duckdb.Database.from_connection(connection)
    """
    # __new__ is used in order to bypass __init__, which would otherwise open
    # a brand new connection.
    obj = cls.__new__(cls)
    # __init__ always sets `path`; set it here as well so that the attribute
    # exists on every instance instead of raising AttributeError on access.
    obj.path = None
    obj.connection = connection
    obj.enum_types = set()
    return obj
|
|
2412
|
-
|
|
2413
|
-
def to_relation(
    self,
    derived_from: RelationSource,
) -> Relation:
    """
    Create a new relation object based on data source.

    The given data will be represented as a relation associated with the database.
    ``Database(x).to_relation(y)`` is equivalent to
    ``Relation(y, database=Database(x))``.

    Args:
        derived_from (RelationSource): One of either a polars or pandas
            ``DataFrame``, a ``pathlib.Path`` to a parquet or CSV file, a SQL query
            string, or an existing relation.

    Returns:
        Relation: A relation constructed from ``derived_from`` and associated
        with this database.

    Example:
        >>> import patito as pt
        >>> db = pt.duckdb.Database()
        >>> db.to_relation("select 1 as a, 2 as b").to_df()
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 2   │
        └─────┴─────┘
        >>> db.to_relation(pt.DataFrame({"c": [3, 4], "d": ["5", "6"]})).to_df()
        shape: (2, 2)
        ┌─────┬─────┐
        │ c   ┆ d   │
        │ --- ┆ --- │
        │ i64 ┆ str │
        ╞═════╪═════╡
        │ 3   ┆ 5   │
        │ 4   ┆ 6   │
        └─────┴─────┘
    """
    new_relation = Relation(derived_from=derived_from, database=self)
    return new_relation
|
|
2456
|
-
|
|
2457
|
-
def execute(
    self,
    query: str,
    *parameters: Collection[Union[str, int, float, bool]],
) -> None:
    """
    Run a SQL statement against the wrapped DuckDB connection.

    Args:
        query: The SQL statement to run. A terminating semicolon (``;``) is
            `not` required.
        parameters: Zero or more parameter sets for a prepared statement. The
            values of a set are substituted for the question marks (``?``) in
            the query. With exactly one set the statement is executed once
            with those values; with several sets, all of them are handed to the
            connection as multiple parameter sets.

    Example:
        >>> import patito as pt
        >>> db = pt.duckdb.Database()
        >>> db.execute("create table my_table (x bigint);")
        >>> db.execute("insert into my_table values (1), (2), (3)")
        >>> db.table("my_table").to_df()
        shape: (3, 1)
        ┌─────┐
        │ x   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 2   │
        │ 3   │
        └─────┘

        Parameters can be specified when executing prepared queries.

        >>> db.execute("delete from my_table where x = ?", (2,))
        >>> db.table("my_table").to_df()
        shape: (2, 1)
        ┌─────┐
        │ x   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 3   │
        └─────┘

        Multiple parameter sets can be specified when executing multiple prepared
        queries.

        >>> db.execute(
        ...     "delete from my_table where x = ?",
        ...     (1,),
        ...     (3,),
        ... )
        >>> db.table("my_table").to_df()
        shape: (0, 1)
        ┌─────┐
        │ x   │
        │ --- │
        │ i64 │
        ╞═════╡
        └─────┘
    """
    bound_values: Union[
        Collection[Union[str, int, float, bool]],
        Collection[Collection[Union[str, int, float, bool]]],
        None,
    ]
    # ``*parameters`` is always a tuple, so only its length needs inspecting.
    parameter_set_count = len(parameters)
    if parameter_set_count > 1:
        # Several sets: forward the whole tuple and flag it as a batch.
        bound_values, is_batch = parameters, True
    elif parameter_set_count == 1:
        # A single set is unwrapped and executed as a plain prepared query.
        bound_values, is_batch = parameters[0], False
    else:
        bound_values, is_batch = [], False

    self.connection.execute(
        query=query,
        parameters=bound_values,
        multiple_parameter_sets=is_batch,
    )
|
|
2540
|
-
|
|
2541
|
-
def query(self, query: str, alias: str = "query_relation") -> Relation:
    """
    Run an arbitrary SQL select query and wrap its result in a relation.

    Args:
        query: Arbitrary SQL select query.
        alias: Name given to the resulting relation, so that it can be referred
            to in further queries.

    Returns:
        A relation representing the data produced by the given query, bound to
        this database.

    Example:
        >>> import patito as pt
        >>> db = pt.duckdb.Database()
        >>> relation = db.query("select 1 as a, 2 as b, 3 as c")
        >>> relation.to_df()
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 2   ┆ 3   │
        └─────┴─────┴─────┘

        >>> relation = db.query("select 1 as a, 2 as b, 3 as c", alias="my_alias")
        >>> relation.select("my_alias.a").to_df()
        shape: (1, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        └─────┘
    """
    # Let the underlying connection evaluate the query, then wrap the result.
    duckdb_relation = self.connection.query(query=query, alias=alias)
    return Relation(duckdb_relation, database=self)
|
|
2581
|
-
|
|
2582
|
-
def empty_relation(self, schema: Type[ModelType]) -> Relation[ModelType]:
    """
    Build a zero-row relation whose columns follow the given model.

    The relation is derived from ``schema.examples()`` and then limited to zero
    rows, so only the column names and types remain.

    Args:
        schema: A patito model which specifies the column names and types of
            the resulting relation.

    Example:
        >>> import patito as pt
        >>> class Schema(pt.Model):
        ...     string_column: str
        ...     bool_column: bool
        ...
        >>> db = pt.duckdb.Database()
        >>> empty_relation = db.empty_relation(Schema)
        >>> empty_relation.to_df()
        shape: (0, 2)
        ┌───────────────┬─────────────┐
        │ string_column ┆ bool_column │
        │ ---           ┆ ---         │
        │ str           ┆ bool        │
        ╞═══════════════╪═════════════╡
        └───────────────┴─────────────┘
        >>> non_empty_relation = db.query(
        ...     "select 'dummy' as string_column, true as bool_column"
        ... )
        >>> non_empty_relation.union(empty_relation).to_df()
        shape: (1, 2)
        ┌───────────────┬─────────────┐
        │ string_column ┆ bool_column │
        │ ---           ┆ ---         │
        │ str           ┆ bool        │
        ╞═══════════════╪═════════════╡
        │ dummy         ┆ true        │
        └───────────────┴─────────────┘
    """
    example_data = schema.examples()
    populated_relation = self.to_relation(example_data)
    # Dropping every row leaves just the schema behind.
    return populated_relation.limit(0)
|
|
2620
|
-
|
|
2621
|
-
def table(self, name: str) -> Relation:
    """
    Look up a table by name and return it as a relation.

    Args:
        name: The name of the table.

    Example:
        >>> import patito as pt
        >>> df = pt.DataFrame({"a": [1, 2], "b": [3, 4]})
        >>> db = pt.duckdb.Database()
        >>> relation = db.to_relation(df)
        >>> relation.create_table(name="my_table")
        >>> db.table("my_table").to_df()
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 3   │
        │ 2   ┆ 4   │
        └─────┴─────┘
    """
    table_relation = self.connection.table(name)
    # NOTE(review): the relation is bound to a new wrapper created by
    # from_connection() rather than to ``self`` (query() and to_relation()
    # pass ``self``) -- confirm whether this is intentional.
    owning_database = self.from_connection(self.connection)
    return Relation(table_relation, database=owning_database)
|
|
2649
|
-
|
|
2650
|
-
def view(self, name: str) -> Relation:
    """
    Look up a view by name and return it as a relation.

    Args:
        name: The name of the view.

    Example:
        >>> import patito as pt
        >>> df = pt.DataFrame({"a": [1, 2], "b": [3, 4]})
        >>> db = pt.duckdb.Database()
        >>> relation = db.to_relation(df)
        >>> relation.create_view(name="my_view")
        >>> db.view("my_view").to_df()
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 3   │
        │ 2   ┆ 4   │
        └─────┴─────┘
    """
    view_relation = self.connection.view(name)
    # NOTE(review): the relation is bound to a new wrapper created by
    # from_connection() rather than to ``self`` (query() and to_relation()
    # pass ``self``) -- confirm whether this is intentional.
    owning_database = self.from_connection(self.connection)
    return Relation(view_relation, database=owning_database)
|
|
2678
|
-
|
|
2679
|
-
def create_table(
    self,
    name: str,
    model: Type[ModelType],
) -> Relation[ModelType]:
    """
    Create a new table whose columns are defined by a Patito model.

    Data can be inserted after creation with
    :ref:`Relation.insert_into()<duckdb.Relation.insert_into>`.
    To create a table from an existing relation `and` insert its data in one
    step, use :ref:`Relation.create_table()<duckdb.Relation.create_table>`
    instead.

    Args:
        name: Name of new database table.
        model (Type[Model]): Patito model indicating names and types of table
            columns. Fields listed as required in the model schema become
            ``not null`` columns.

    Returns:
        Relation[ModelType]: Relation pointing to the new table.

    Example:
        >>> from typing import Optional
        >>> import patito as pt
        >>> class MyModel(pt.Model):
        ...     str_column: str
        ...     nullable_string_column: Optional[str]
        ...
        >>> db = pt.duckdb.Database()
        >>> db.create_table(name="my_table", model=MyModel)
        >>> db.table("my_table").types
        {'str_column': VARCHAR, 'nullable_string_column': VARCHAR}
    """
    # Enum types referenced by the column types must exist before the table.
    self.create_enum_types(model=model)
    required_columns = model.schema().get("required", [])
    column_clauses = [
        f"{column_name} {sql_type} not null"
        if column_name in required_columns
        else f"{column_name} {sql_type}"
        for column_name, sql_type in model.sql_types.items()
    ]
    self.connection.execute(f"create table {name} ({','.join(column_clauses)})")
    # TODO: Fix typing
    created_table = self.table(name)
    return created_table.set_model(model)  # pyright: ignore
|
|
2724
|
-
|
|
2725
|
-
def create_enum_types(self, model: Type[ModelType]) -> None:
    """
    Define SQL enum types in DuckDB database.

    Every schema property of the model that carries an ``enum`` entry and is of
    type ``string`` gets a corresponding DuckDB enum type. Names of the types
    created (or found to already exist) are recorded in ``self.enum_types`` so
    later calls skip them.

    Args:
        model: Model for which all Literal-annotated or enum-annotated string fields
            will get respective DuckDB enum types.

    Raises:
        duckdb.CatalogException: If creating a type fails for any reason other
            than the type already existing.

    Example:
        >>> import patito as pt
        >>> class EnumModel(pt.Model):
        ...     enum_column: Literal["A", "B", "C"]
        ...
        >>> db = pt.duckdb.Database()
        >>> db.create_enum_types(EnumModel)
        >>> db.enum_types
        {'enum__7ba49365cc1b0fd57e61088b3bc9aa25'}
    """
    import duckdb

    for props in model._schema_properties().values():
        # DuckDB enums only support string values. ``.get`` is used so that an
        # enum property without a "type" entry is skipped instead of raising
        # KeyError.
        if "enum" not in props or props.get("type") != "string":
            continue

        enum_type_name = _enum_type_name(field_properties=props)
        if enum_type_name in self.enum_types:
            # This enum type has already been created
            continue

        # Render each value as a SQL string literal: single-quoted, with
        # embedded single quotes doubled. repr() is not suitable here; it
        # switches to double quotes (an identifier in SQL) when the value
        # contains an apostrophe, and it escapes backslashes and newlines.
        enum_values = ", ".join(
            "'" + str(value).replace("'", "''") + "'"
            for value in sorted(props["enum"])
        )
        try:
            self.connection.execute(
                f"create type {enum_type_name} as enum ({enum_values})"
            )
        except duckdb.CatalogException as e:
            # A pre-existing type is fine; anything else is a real error.
            if "already exists" not in str(e):
                raise  # pragma: no cover
        self.enum_types.add(enum_type_name)
|
|
2764
|
-
|
|
2765
|
-
def create_view(
    self,
    name: str,
    data: RelationSource,
) -> Relation:
    """
    Create a view with the given name from the given data source.

    Args:
        name: Name passed on to the relation's ``create_view`` method.
        data: Data source which is first converted with ``to_relation``.

    Returns:
        Whatever ``create_view`` of the intermediate relation returns.
    """
    source_relation = self.to_relation(derived_from=data)
    return source_relation.create_view(name)
|
|
2772
|
-
|
|
2773
|
-
def __contains__(self, table: str) -> bool:
    """
    Check whether a table with the given name can be found in the database.

    The check is performed by asking the connection for the table; any
    exception raised by that lookup is interpreted as the table being absent.

    Args:
        table: The name of the table to be checked for.

    Returns:
        ``True`` if the lookup succeeds, ``False`` if it raises.

    Examples:
        >>> import patito as pt
        >>> db = pt.duckdb.Database()
        >>> "my_table" in db
        False
        >>> db.to_relation("select 1 as a, 2 as b").create_table(name="my_table")
        >>> "my_table" in db
        True
    """
    try:
        self.connection.table(table_name=table)
    except Exception:
        # Best-effort membership test: a failed lookup means "not present".
        table_found = False
    else:
        table_found = True
    return table_found
|