patito 0.5.1__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
patito/duckdb.py DELETED
@@ -1,2793 +0,0 @@
1
- """
2
- Module which wraps around the duckdb module in an opiniated manner.
3
- """
4
- from __future__ import annotations
5
-
6
- import hashlib
7
- from collections.abc import Collection, Iterable, Iterator
8
- from pathlib import Path
9
- from typing import (
10
- TYPE_CHECKING,
11
- Any,
12
- Dict,
13
- Generic,
14
- List,
15
- Optional,
16
- Set,
17
- Tuple,
18
- Type,
19
- TypeVar,
20
- Union,
21
- cast,
22
- )
23
-
24
- import numpy as np
25
- import polars as pl
26
- import pyarrow as pa # type: ignore[import]
27
- from pydantic import create_model
28
- from typing_extensions import Literal
29
-
30
- from patito import sql
31
- from patito.exceptions import MultipleRowsReturned, RowDoesNotExist
32
- from patito.polars import DataFrame
33
- from patito.pydantic import Model, ModelType
34
-
35
- try:
36
- import pandas as pd
37
-
38
- _PANDAS_AVAILABLE = True
39
- except ImportError:
40
- _PANDAS_AVAILABLE = False
41
-
42
- if TYPE_CHECKING:
43
- import duckdb
44
-
45
-
46
- # Types which can be used to instantiate a DuckDB Relation object
47
- RelationSource = Union[
48
- DataFrame,
49
- pl.DataFrame,
50
- "pd.DataFrame",
51
- Path,
52
- str,
53
- "duckdb.DuckDBPyRelation",
54
- "Relation",
55
- ]
56
-
57
- # Used to refer to type(self) in Relation methods which preserve the type.
58
- # Hard-coding Relation or Relation[ModelType] does not work for subclasses
59
- # that return type(self) since that will refer to the parent class.
60
- # See relevant SO answer: https://stackoverflow.com/a/63178532
61
- RelationType = TypeVar("RelationType", bound="Relation")
62
-
63
- # The SQL types supported by DuckDB
64
- # See: https://duckdb.org/docs/sql/data_types/overview
65
- # fmt: off
66
- DuckDBSQLType = Literal[
67
- "BIGINT", "INT8", "LONG",
68
- "BLOB", "BYTEA", "BINARY", "VARBINARY",
69
- "BOOLEAN", "BOOL", "LOGICAL",
70
- "DATE",
71
- "DOUBLE", "FLOAT8", "NUMERIC", "DECIMAL",
72
- "HUGEINT",
73
- "INTEGER", "INT4", "INT", "SIGNED",
74
- "INTERVAL",
75
- "REAL", "FLOAT4", "FLOAT",
76
- "SMALLINT", "INT2", "SHORT",
77
- "TIME",
78
- "TIMESTAMP", "DATETIME",
79
- "TIMESTAMP WITH TIMEZONE", "TIMESTAMPTZ",
80
- "TINYINT", "INT1",
81
- "UBIGINT",
82
- "UINTEGER",
83
- "USMALLINT",
84
- "UTINYINT",
85
- "UUID",
86
- "VARCHAR", "CHAR", "BPCHAR", "TEXT", "STRING",
87
- ]
88
- # fmt: on
89
-
90
- # Used for backward-compatible patches
91
- POLARS_VERSION: Optional[Tuple[int, int, int]]
92
- try:
93
- POLARS_VERSION = cast(
94
- Tuple[int, int, int],
95
- tuple(map(int, pl.__version__.split("."))),
96
- )
97
- except ValueError: # pragma: no cover
98
- POLARS_VERSION = None
99
-
100
-
101
- def create_pydantic_model(relation: "duckdb.DuckDBPyRelation") -> Type[Model]:
102
- """Create pydantic model deserialization of the given relation."""
103
- pydantic_annotations = {column: (Any, ...) for column in relation.columns}
104
- return create_model( # type: ignore
105
- relation.alias,
106
- __base__=Model,
107
- **pydantic_annotations, # pyright: ignore
108
- )
109
-
110
-
111
- def _enum_type_name(field_properties: dict) -> str:
112
- """
113
- Return enum DuckDB SQL type name based on enum values.
114
-
115
- The same enum values, regardless of ordering, will always be given the same name.
116
- """
117
- enum_values = ", ".join(repr(value) for value in sorted(field_properties["enum"]))
118
- value_hash = hashlib.md5(enum_values.encode("utf-8")).hexdigest() # noqa: #S303
119
- return f"enum__{value_hash}"
120
-
121
-
122
- def _is_missing_enum_type_exception(exception: BaseException) -> bool:
123
- """
124
- Return True if the given exception might be caused by missing enum type definitions.
125
-
126
- Args:
127
- exception: Exception raised by DuckDB.
128
-
129
- Returns:
130
- True if the exception might be caused by a missing SQL enum type definition.
131
- """
132
- description = str(exception)
133
- # DuckDB version <= 0.3.4
134
- old_exception = description.startswith("Not implemented Error: DataType")
135
- # DuckDB version >= 0.4.0
136
- new_exception = description.startswith("Catalog Error: Type with name enum_")
137
- return old_exception or new_exception
138
-
139
-
140
- class Relation(Generic[ModelType]):
141
- # The database connection which the given relation belongs to
142
- database: Database
143
-
144
- # The underlying DuckDB relation object which this class wraps around
145
- _relation: duckdb.DuckDBPyRelation
146
-
147
- # Can be set by subclasses in order to specify the serialization class for rows.
148
- # Must accept column names as keyword arguments.
149
- model: Optional[Type[ModelType]] = None
150
-
151
- # The alias that can be used to refer to the relation in queries
152
- alias: str
153
-
154
- def __init__( # noqa: C901
155
- self,
156
- derived_from: RelationSource,
157
- database: Optional[Database] = None,
158
- model: Optional[Type[ModelType]] = None,
159
- ) -> None:
160
- """
161
- Create a new relation object containing data to be queried with DuckDB.
162
-
163
- Args:
164
- derived_from: Data to be represented as a DuckDB relation object.
165
- Can be one of the following types:
166
-
167
- - A pandas or polars DataFrame.
168
- - An SQL query represented as a string.
169
- - A ``Path`` object pointing to a CSV or a parquet file.
170
- The path must point to an existing file with either a ``.csv``
171
- or ``.parquet`` file extension.
172
- - A native DuckDB relation object (``duckdb.DuckDBPyRelation``).
173
- - A ``patito.duckdb.Relation`` object.
174
-
175
- database: Which database to load the relation into. If not provided,
176
- the default DuckDB database will be used.
177
-
178
- model: Sub-class of ``patito.Model`` which specifies how to deserialize rows
179
- when fetched with methods such as
180
- :ref:`Relation.get()<duckdb.Relation.get>` and ``__iter__()``.
181
-
182
- Will also be used to create a strict table schema if
183
- :ref:`Relation.create_table()<duckdb.Relation.create_table>`.
184
- schema should be constructed.
185
-
186
- If not provided, a dynamic model fitting the relation schema will be
187
- created when required.
188
-
189
- Can also be set later dynamically by invoking
190
- :ref:`Relation.set_model()<duckdb.Relation.set_model>`.
191
-
192
- Raises:
193
- ValueError: If any one of the following cases are encountered:
194
-
195
- - If a provided ``Path`` object does not have a ``.csv`` or
196
- ``.parquet`` file extension.
197
- - If a database and relation object is provided, but the relation object
198
- does not belong to the database.
199
-
200
- TypeError: If the type of ``derived_from`` is not supported.
201
-
202
- Examples:
203
- Instantiated from a dataframe:
204
-
205
- >>> import patito as pt
206
- >>> df = pt.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
207
- >>> pt.duckdb.Relation(df).filter("a > 2").to_df()
208
- shape: (1, 2)
209
- ┌─────┬─────┐
210
- │ a ┆ b │
211
- │ --- ┆ --- │
212
- │ i64 ┆ i64 │
213
- ╞═════╪═════╡
214
- │ 3 ┆ 6 │
215
- └─────┴─────┘
216
-
217
- Instantiated from an SQL query:
218
-
219
- >>> pt.duckdb.Relation("select 1 as a, 2 as b").to_df()
220
- shape: (1, 2)
221
- ┌─────┬─────┐
222
- │ a ┆ b │
223
- │ --- ┆ --- │
224
- │ i64 ┆ i64 │
225
- ╞═════╪═════╡
226
- │ 1 ┆ 2 │
227
- └─────┴─────┘
228
- """
229
- import duckdb
230
-
231
- if isinstance(derived_from, Relation):
232
- if (
233
- database is not None
234
- and derived_from.database.connection is not database.connection
235
- ):
236
- raise ValueError(
237
- "Relations can't be casted between database connections."
238
- )
239
- self.database = derived_from.database
240
- self._relation = derived_from._relation
241
- self.model = derived_from.model
242
- return
243
-
244
- if database is None:
245
- self.database = Database.default()
246
- else:
247
- self.database = database
248
-
249
- if isinstance(derived_from, duckdb.DuckDBPyRelation):
250
- relation = derived_from
251
- elif isinstance(derived_from, str):
252
- relation = self.database.connection.from_query(derived_from)
253
- elif _PANDAS_AVAILABLE and isinstance(derived_from, pd.DataFrame):
254
- # We must replace pd.NA with np.nan in order for it to be considered
255
- # as null by DuckDB. Otherwise it will casted to the string <NA>
256
- # or even segfault.
257
- derived_from = derived_from.fillna(np.nan)
258
- relation = self.database.connection.from_df(derived_from)
259
- elif isinstance(derived_from, pl.DataFrame):
260
- relation = self.database.connection.from_arrow(derived_from.to_arrow())
261
- elif isinstance(derived_from, Path):
262
- if derived_from.suffix.lower() == ".parquet":
263
- relation = self.database.connection.from_parquet(str(derived_from))
264
- elif derived_from.suffix.lower() == ".csv":
265
- relation = self.database.connection.from_csv_auto(str(derived_from))
266
- else:
267
- raise ValueError(
268
- f"Unsupported file suffix {derived_from.suffix!r} for data import!"
269
- )
270
- else:
271
- raise TypeError # pragma: no cover
272
-
273
- self._relation = relation
274
- if model is not None:
275
- self.model = model # pyright: ignore
276
-
277
- def aggregate(
278
- self,
279
- *aggregations: str,
280
- group_by: Union[str, Iterable[str]],
281
- **named_aggregations: str,
282
- ) -> Relation:
283
- """
284
- Return relation formed by ``GROUP BY`` SQL aggregation(s).
285
-
286
- Args:
287
- aggregations: Zero or more aggregation expressions such as
288
- "sum(column_name)" and "count(distinct column_name)".
289
- named_aggregations: Zero or more aggregated expressions where the keyword is
290
- used to name the given aggregation. For example,
291
- ``my_column="sum(column_name)"`` is inserted as
292
- ``"sum(column_name) as my_column"`` in the executed SQL query.
293
- group_by: A single column name or iterable collection of column names to
294
- group by.
295
-
296
- Examples:
297
- >>> import patito as pt
298
- >>> df = pt.DataFrame({"a": [1, 2, 3], "b": ["X", "Y", "X"]})
299
- >>> relation = pt.duckdb.Relation(df)
300
- >>> relation.aggregate(
301
- ... "b",
302
- ... "sum(a)",
303
- ... "greatest(b)",
304
- ... max_a="max(a)",
305
- ... group_by="b",
306
- ... ).to_df()
307
- shape: (2, 4)
308
- ┌─────┬────────┬─────────────┬───────┐
309
- │ b ┆ sum(a) ┆ greatest(b) ┆ max_a │
310
- │ --- ┆ --- ┆ --- ┆ --- │
311
- │ str ┆ f64 ┆ str ┆ i64 │
312
- ╞═════╪════════╪═════════════╪═══════╡
313
- │ X ┆ 4.0 ┆ X ┆ 3 │
314
- │ Y ┆ 2.0 ┆ Y ┆ 2 │
315
- └─────┴────────┴─────────────┴───────┘
316
- """
317
- expression = ", ".join(
318
- aggregations
319
- + tuple(
320
- f"{expression} as {column_name}"
321
- for column_name, expression in named_aggregations.items()
322
- )
323
- )
324
- relation = self._relation.aggregate(
325
- aggr_expr=expression,
326
- group_expr=group_by if isinstance(group_by, str) else ", ".join(group_by),
327
- )
328
- return self._wrap(relation=relation, schema_change=True)
329
-
330
- def add_suffix(
331
- self,
332
- suffix: str,
333
- include: Optional[Collection[str]] = None,
334
- exclude: Optional[Collection[str]] = None,
335
- ) -> Relation:
336
- """
337
- Add a suffix to all the columns of the relation.
338
-
339
- Args:
340
- suffix: A string to append to add to all columns names.
341
- include: If provided, only the given columns will be renamed.
342
- exclude: If provided, the given columns will `not` be renamed.
343
-
344
- Raises:
345
- TypeError: If both include `and` exclude are provided at the same time.
346
-
347
- Examples:
348
- >>> import patito as pt
349
- >>> relation = pt.duckdb.Relation("select 1 as column_1, 2 as column_2")
350
- >>> relation.add_suffix("_renamed").to_df()
351
- shape: (1, 2)
352
- ┌──────────────────┬──────────────────┐
353
- │ column_1_renamed ┆ column_2_renamed │
354
- │ --- ┆ --- │
355
- │ i64 ┆ i64 │
356
- ╞══════════════════╪══════════════════╡
357
- │ 1 ┆ 2 │
358
- └──────────────────┴──────────────────┘
359
-
360
- >>> relation.add_suffix("_renamed", include=["column_1"]).to_df()
361
- shape: (1, 2)
362
- ┌──────────────────┬──────────┐
363
- │ column_1_renamed ┆ column_2 │
364
- │ --- ┆ --- │
365
- │ i64 ┆ i64 │
366
- ╞══════════════════╪══════════╡
367
- │ 1 ┆ 2 │
368
- └──────────────────┴──────────┘
369
-
370
- >>> relation.add_suffix("_renamed", exclude=["column_1"]).to_df()
371
- shape: (1, 2)
372
- ┌──────────┬──────────────────┐
373
- │ column_1 ┆ column_2_renamed │
374
- │ --- ┆ --- │
375
- │ i64 ┆ i64 │
376
- ╞══════════╪══════════════════╡
377
- │ 1 ┆ 2 │
378
- └──────────┴──────────────────┘
379
- """
380
- if include is not None and exclude is not None:
381
- raise TypeError("Both include and exclude provided at the same time!")
382
- elif include is not None:
383
- included = lambda column: column in include
384
- elif exclude is not None:
385
- included = lambda column: column not in exclude
386
- else:
387
- included = lambda _: True # noqa: E731
388
-
389
- return self.select(
390
- ", ".join(
391
- f"{column} as {column}{suffix}" if included(column) else column
392
- for column in self.columns
393
- )
394
- )
395
-
396
- def add_prefix(
397
- self,
398
- prefix: str,
399
- include: Optional[Iterable[str]] = None,
400
- exclude: Optional[Iterable[str]] = None,
401
- ) -> Relation:
402
- """
403
- Add a prefix to all the columns of the relation.
404
-
405
- Args:
406
- prefix: A string to prepend to add to all the columns names.
407
- include: If provided, only the given columns will be renamed.
408
- exclude: If provided, the given columns will `not` be renamed.
409
-
410
- Raises:
411
- TypeError: If both include `and` exclude are provided at the same time.
412
-
413
- Examples:
414
- >>> import patito as pt
415
- >>> relation = pt.duckdb.Relation("select 1 as column_1, 2 as column_2")
416
- >>> relation.add_prefix("renamed_").to_df()
417
- shape: (1, 2)
418
- ┌──────────────────┬──────────────────┐
419
- │ renamed_column_1 ┆ renamed_column_2 │
420
- │ --- ┆ --- │
421
- │ i64 ┆ i64 │
422
- ╞══════════════════╪══════════════════╡
423
- │ 1 ┆ 2 │
424
- └──────────────────┴──────────────────┘
425
-
426
- >>> relation.add_prefix("renamed_", include=["column_1"]).to_df()
427
- shape: (1, 2)
428
- ┌──────────────────┬──────────┐
429
- │ renamed_column_1 ┆ column_2 │
430
- │ --- ┆ --- │
431
- │ i64 ┆ i64 │
432
- ╞══════════════════╪══════════╡
433
- │ 1 ┆ 2 │
434
- └──────────────────┴──────────┘
435
-
436
- >>> relation.add_prefix("renamed_", exclude=["column_1"]).to_df()
437
- shape: (1, 2)
438
- ┌──────────┬──────────────────┐
439
- │ column_1 ┆ renamed_column_2 │
440
- │ --- ┆ --- │
441
- │ i64 ┆ i64 │
442
- ╞══════════╪══════════════════╡
443
- │ 1 ┆ 2 │
444
- └──────────┴──────────────────┘
445
- """
446
- if include is not None and exclude is not None:
447
- raise TypeError("Both include and exclude provided at the same time!")
448
- elif include is not None:
449
- included = lambda column: column in include
450
- elif exclude is not None:
451
- included = lambda column: column not in exclude
452
- else:
453
- included = lambda _: True
454
-
455
- return self.select(
456
- ", ".join(
457
- f"{column} as {prefix}{column}" if included(column) else column
458
- for column in self.columns
459
- )
460
- )
461
-
462
- def all(self, *filters: str, **equalities: Union[int, float, str]) -> bool:
463
- """
464
- Return ``True`` if the given predicate(s) are true for all rows in the relation.
465
-
466
- See :func:`Relation.filter()` for additional information regarding the
467
- parameters.
468
-
469
- Args:
470
- filters: SQL predicates to satisfy.
471
- equalities: SQL equality predicates to satisfy.
472
-
473
- Examples:
474
- >>> import patito as pt
475
- >>> df = pt.DataFrame(
476
- ... {
477
- ... "even_number": [2, 4, 6],
478
- ... "odd_number": [1, 3, 5],
479
- ... "zero": [0, 0, 0],
480
- ... }
481
- ... )
482
- >>> relation = pt.duckdb.Relation(df)
483
- >>> relation.all(zero=0)
484
- True
485
- >>> relation.all(
486
- ... "even_number % 2 = 0",
487
- ... "odd_number % 2 = 1",
488
- ... zero=0,
489
- ... )
490
- True
491
- >>> relation.all(zero=1)
492
- False
493
- >>> relation.all("odd_number % 2 = 0")
494
- False
495
- """
496
- return self.filter(*filters, **equalities).count() == self.count()
497
-
498
- def case(
499
- self,
500
- *,
501
- from_column: str,
502
- to_column: str,
503
- mapping: Dict[sql.SQLLiteral, sql.SQLLiteral],
504
- default: sql.SQLLiteral,
505
- ) -> Relation:
506
- """
507
- Map values of one column over to a new column.
508
-
509
- Args:
510
- from_column: Name of column defining the domain of the mapping.
511
- to_column: Name of column to insert the mapped values into.
512
- mapping: Dictionary defining the mapping. The dictionary keys represent the
513
- input values, while the dictionary values represent the output values.
514
- Items are inserted into the SQL case statement by their repr() string
515
- value.
516
- default: Default output value for inputs which have no provided mapping.
517
-
518
- Examples:
519
- The following case statement...
520
-
521
- >>> import patito as pt
522
- >>> db = pt.duckdb.Database()
523
- >>> relation = db.to_relation("select 1 as a union select 2 as a")
524
- >>> relation.case(
525
- ... from_column="a",
526
- ... to_column="b",
527
- ... mapping={1: "one", 2: "two"},
528
- ... default="three",
529
- ... ).order(by="a").to_df()
530
- shape: (2, 2)
531
- ┌─────┬─────┐
532
- │ a ┆ b │
533
- │ --- ┆ --- │
534
- │ i64 ┆ str │
535
- ╞═════╪═════╡
536
- │ 1 ┆ one │
537
- │ 2 ┆ two │
538
- └─────┴─────┘
539
-
540
- ... is equivalent with:
541
-
542
- >>> case_statement = pt.sql.Case(
543
- ... on_column="a",
544
- ... mapping={1: "one", 2: "two"},
545
- ... default="three",
546
- ... as_column="b",
547
- ... )
548
- >>> relation.select(f"*, {case_statement}").order(by="a").to_df()
549
- shape: (2, 2)
550
- ┌─────┬─────┐
551
- │ a ┆ b │
552
- │ --- ┆ --- │
553
- │ i64 ┆ str │
554
- ╞═════╪═════╡
555
- │ 1 ┆ one │
556
- │ 2 ┆ two │
557
- └─────┴─────┘
558
- """
559
-
560
- case_statement = sql.Case(
561
- on_column=from_column,
562
- mapping=mapping,
563
- default=default,
564
- as_column=to_column,
565
- )
566
- new_relation = self._relation.project(f"*, {case_statement}")
567
- return self._wrap(relation=new_relation, schema_change=True)
568
-
569
- def cast(
570
- self: RelationType,
571
- model: Optional[ModelType] = None,
572
- strict: bool = False,
573
- include: Optional[Collection[str]] = None,
574
- exclude: Optional[Collection[str]] = None,
575
- ) -> RelationType:
576
- """
577
- Cast the columns of the relation to types compatible with the associated model.
578
-
579
- The associated model must either be set by invoking
580
- :ref:`Relation.set_model() <duckdb.Relation.set_model>` or provided with the
581
- ``model`` parameter.
582
-
583
- Any columns of the relation that are not part of the given model schema will be
584
- left as-is.
585
-
586
- Args:
587
- model: If :ref:`Relation.set_model() <duckdb.Relation.set_model>` has not
588
- been invoked or is intended to be overwritten.
589
- strict: If set to ``False``, columns which are technically compliant with
590
- the specified field type, will not be casted. For example, a column
591
- annotated with ``int`` is technically compliant with ``SMALLINT``, even
592
- if ``INTEGER`` is the default SQL type associated with ``int``-annotated
593
- fields. If ``strict`` is set to ``True``, the resulting dtypes will
594
- be forced to the default dtype associated with each python type.
595
- include: If provided, only the given columns will be casted.
596
- exclude: If provided, the given columns will `not` be casted.
597
-
598
- Returns:
599
- New relation where the columns have been casted according to the model
600
- schema.
601
-
602
- Examples:
603
- >>> import patito as pt
604
- >>> class Schema(pt.Model):
605
- ... float_column: float
606
- ...
607
- >>> relation = pt.duckdb.Relation("select 1 as float_column")
608
- >>> relation.types["float_column"]
609
- INTEGER
610
- >>> relation.cast(model=Schema).types["float_column"]
611
- DOUBLE
612
-
613
- >>> relation = pt.duckdb.Relation("select 1::FLOAT as float_column")
614
- >>> relation.cast(model=Schema).types["float_column"]
615
- FLOAT
616
- >>> relation.cast(model=Schema, strict=True).types["float_column"]
617
- DOUBLE
618
-
619
- >>> class Schema(pt.Model):
620
- ... column_1: float
621
- ... column_2: float
622
- ...
623
- >>> relation = pt.duckdb.Relation(
624
- ... "select 1 as column_1, 2 as column_2"
625
- ... ).set_model(Schema)
626
- >>> relation.types
627
- {'column_1': INTEGER, 'column_2': INTEGER}
628
- >>> relation.cast(include=["column_1"]).types
629
- {'column_1': DOUBLE, 'column_2': INTEGER}
630
- >>> relation.cast(exclude=["column_1"]).types
631
- {'column_1': INTEGER, 'column_2': DOUBLE}
632
- """
633
- if model is not None:
634
- relation = self.set_model(model)
635
- schema = model
636
- elif self.model is not None:
637
- relation = self
638
- schema = cast(ModelType, self.model)
639
- else:
640
- class_name = self.__class__.__name__
641
- raise TypeError(
642
- f"{class_name}.cast() invoked without "
643
- f"{class_name}.model having been set! "
644
- f"You should invoke {class_name}.set_model() first "
645
- "or explicitly provide a model to .cast()."
646
- )
647
-
648
- if include is not None and exclude is not None:
649
- raise ValueError(
650
- "Both include and exclude provided to "
651
- f"{self.__class__.__name__}.cast()!"
652
- )
653
- elif include is not None:
654
- include = set(include)
655
- elif exclude is not None:
656
- include = set(relation.columns) - set(exclude)
657
- else:
658
- include = set(relation.columns)
659
-
660
- new_columns = []
661
- for column, current_type in relation.types.items():
662
- if column not in schema.columns:
663
- new_columns.append(column)
664
- elif column in include and (
665
- strict or current_type not in schema.valid_sql_types[column]
666
- ):
667
- new_type = schema.sql_types[column]
668
- new_columns.append(f"{column}::{new_type} as {column}")
669
- else:
670
- new_columns.append(column)
671
- return cast(RelationType, self.select(*new_columns))
672
-
673
- def coalesce(
674
- self: RelationType,
675
- **column_expressions: Union[str, int, float],
676
- ) -> RelationType:
677
- """
678
- Replace null-values in given columns with respective values.
679
-
680
- For example, ``coalesce(column_name=value)`` is compiled to:
681
- ``f"coalesce({column_name}, {repr(value)}) as column_name"`` in the resulting
682
- SQL.
683
-
684
- Args:
685
- column_expressions: Keywords indicate which columns to coalesce, while the
686
- string representation of the respective arguments are used as the
687
- null-replacement.
688
-
689
- Return:
690
- Relation: Relation where values have been filled in for nulls in the given
691
- columns.
692
-
693
- Examples:
694
- >>> import patito as pt
695
- >>> df = pt.DataFrame(
696
- ... {
697
- ... "a": [1, None, 3],
698
- ... "b": ["four", "five", None],
699
- ... "c": [None, 8.0, 9.0],
700
- ... }
701
- ... )
702
- >>> relation = pt.duckdb.Relation(df)
703
- >>> relation.coalesce(a=2, b="six").to_df()
704
- shape: (3, 3)
705
- ┌─────┬──────┬──────┐
706
- │ a ┆ b ┆ c │
707
- │ --- ┆ --- ┆ --- │
708
- │ i64 ┆ str ┆ f64 │
709
- ╞═════╪══════╪══════╡
710
- │ 1 ┆ four ┆ null │
711
- │ 2 ┆ five ┆ 8.0 │
712
- │ 3 ┆ six ┆ 9.0 │
713
- └─────┴──────┴──────┘
714
- """
715
- projections = []
716
- for column in self.columns:
717
- if column in column_expressions:
718
- expression = column_expressions[column]
719
- projections.append(f"coalesce({column}, {expression!r}) as {column}")
720
- else:
721
- projections.append(column)
722
- return cast(RelationType, self.select(*projections))
723
-
724
- @property
725
- def columns(self) -> List[str]:
726
- """
727
- Return the columns of the relation as a list of strings.
728
-
729
- Examples:
730
- >>> import patito as pt
731
- >>> pt.duckdb.Relation("select 1 as a, 2 as b").columns
732
- ['a', 'b']
733
- """
734
- # Under certain specific circumstances columns are suffixed with
735
- # :1, which need to be removed from the column name.
736
- return [column.partition(":")[0] for column in self._relation.columns]
737
-
738
- def count(self) -> int:
739
- """
740
- Return the number of rows in the given relation.
741
-
742
- Returns:
743
- Number of rows in the relation as an integer.
744
-
745
- Examples:
746
- >>> import patito as pt
747
- >>> relation = pt.duckdb.Relation("select 1 as a")
748
- >>> relation.count()
749
- 1
750
- >>> (relation + relation).count()
751
- 2
752
-
753
- The :ref:`Relation.__len__()<duckdb.Relation.__len__>` method invokes
754
- ``Relation.count()`` under the hood, and is equivalent:
755
-
756
- >>> len(relation)
757
- 1
758
- >>> len(relation + relation)
759
- 2
760
- """
761
- return cast(Tuple[int], self._relation.aggregate("count(*)").fetchone())[0]
762
-
763
- def create_table(self: RelationType, name: str) -> RelationType:
764
- """
765
- Create new database table based on relation.
766
-
767
- If ``self.model`` is set with
768
- :ref:`Relation.set_model()<duckdb.Relation.set_model>`, then the model is used
769
- to infer the table schema. Otherwise, a permissive table schema is created based
770
- on the relation data.
771
-
772
- Returns:
773
- Relation: A relation pointing to the newly created table.
774
-
775
- Examples:
776
- >>> from typing import Literal
777
- >>> import patito as pt
778
-
779
- >>> df = pt.DataFrame({"enum_column": ["A", "A", "B"]})
780
- >>> relation = pt.duckdb.Relation(df)
781
- >>> relation.create_table("permissive_table").types
782
- {'enum_column': VARCHAR}
783
-
784
- >>> class TableSchema(pt.Model):
785
- ... enum_column: Literal["A", "B", "C"]
786
- ...
787
- >>> relation.set_model(TableSchema).create_table("strict_table").types
788
- {'enum_column': enum__7ba49365cc1b0fd57e61088b3bc9aa25}
789
- """
790
- if self.model is not None:
791
- self.database.create_table(name=name, model=self.model)
792
- self.insert_into(table=name)
793
- else:
794
- self._relation.create(table_name=name)
795
- return cast(RelationType, self.database.table(name))
796
-
797
- def create_view(
798
- self: RelationType,
799
- name: str,
800
- replace: bool = False,
801
- ) -> RelationType:
802
- """
803
- Create new database view based on relation.
804
-
805
- Returns:
806
- Relation: A relation pointing to the newly created view.
807
-
808
- Examples:
809
- >>> import patito as pt
810
- >>> db = pt.duckdb.Database()
811
- >>> df = pt.DataFrame({"column": ["A", "A", "B"]})
812
- >>> relation = db.to_relation(df)
813
- >>> relation.create_view("my_view")
814
- >>> db.query("select * from my_view").to_df()
815
- shape: (3, 1)
816
- ┌────────┐
817
- │ column │
818
- │ --- │
819
- │ str │
820
- ╞════════╡
821
- │ A │
822
- │ A │
823
- │ B │
824
- └────────┘
825
- """
826
- self._relation.create_view(view_name=name, replace=replace)
827
- return cast(RelationType, self.database.view(name))
828
-
829
- def drop(self, *columns: str) -> Relation:
830
- """
831
- Remove specified column(s) from relation.
832
-
833
- Args:
834
- columns (str): Any number of string column names to be dropped.
835
-
836
- Examples:
837
- >>> import patito as pt
838
- >>> relation = pt.duckdb.Relation("select 1 as a, 2 as b, 3 as c")
839
- >>> relation.columns
840
- ['a', 'b', 'c']
841
- >>> relation.drop("c").columns
842
- ['a', 'b']
843
- >>> relation.drop("b", "c").columns
844
- ['a']
845
- """
846
- new_columns = self.columns.copy()
847
- for column in columns:
848
- new_columns.remove(column)
849
- return self[new_columns]
850
-
851
- def distinct(self: RelationType) -> RelationType:
852
- """
853
- Drop all duplicate rows of the relation.
854
-
855
- Example:
856
- >>> import patito as pt
857
- >>> df = pt.DataFrame(
858
- ... [[1, 2, 3], [1, 2, 3], [3, 2, 1]],
859
- ... schema=["a", "b", "c"],
860
- ... orient="row",
861
- ... )
862
- >>> relation = pt.duckdb.Relation(df)
863
- >>> relation.to_df()
864
- shape: (3, 3)
865
- ┌─────┬─────┬─────┐
866
- │ a ┆ b ┆ c │
867
- │ --- ┆ --- ┆ --- │
868
- │ i64 ┆ i64 ┆ i64 │
869
- ╞═════╪═════╪═════╡
870
- │ 1 ┆ 2 ┆ 3 │
871
- │ 1 ┆ 2 ┆ 3 │
872
- │ 3 ┆ 2 ┆ 1 │
873
- └─────┴─────┴─────┘
874
- >>> relation.distinct().to_df()
875
- shape: (2, 3)
876
- ┌─────┬─────┬─────┐
877
- │ a ┆ b ┆ c │
878
- │ --- ┆ --- ┆ --- │
879
- │ i64 ┆ i64 ┆ i64 │
880
- ╞═════╪═════╪═════╡
881
- │ 1 ┆ 2 ┆ 3 │
882
- │ 3 ┆ 2 ┆ 1 │
883
- └─────┴─────┴─────┘
884
- """
885
- return self._wrap(self._relation.distinct(), schema_change=False)
886
-
887
- def except_(self: RelationType, other: RelationSource) -> RelationType:
888
- """
889
- Remove all rows that can be found in the other other relation.
890
-
891
- Args:
892
- other: Another relation or something that can be casted to a relation.
893
-
894
- Returns:
895
- New relation without the rows that can be found in the other relation.
896
-
897
- Example:
898
- >>> import patito as pt
899
- >>> relation_123 = pt.duckdb.Relation(
900
- ... "select 1 union select 2 union select 3"
901
- ... )
902
- >>> relation_123.order(by="1").to_df()
903
- shape: (3, 1)
904
- ┌─────┐
905
- │ 1 │
906
- │ --- │
907
- │ i64 │
908
- ╞═════╡
909
- │ 1 │
910
- │ 2 │
911
- │ 3 │
912
- └─────┘
913
- >>> relation_2 = pt.duckdb.Relation("select 2")
914
- >>> relation_2.to_df()
915
- shape: (1, 1)
916
- ┌─────┐
917
- │ 2 │
918
- │ --- │
919
- │ i64 │
920
- ╞═════╡
921
- │ 2 │
922
- └─────┘
923
- >>> relation_123.except_(relation_2).order(by="1").to_df()
924
- shape: (2, 1)
925
- ┌─────┐
926
- │ 1 │
927
- │ --- │
928
- │ i64 │
929
- ╞═════╡
930
- │ 1 │
931
- │ 3 │
932
- └─────┘
933
- """
934
- return self._wrap(
935
- self._relation.except_(self.database.to_relation(other)._relation),
936
- schema_change=False,
937
- )
938
-
939
- def execute(self) -> duckdb.DuckDBPyRelation:
940
- """
941
- Execute built relation query and return result object.
942
-
943
- Returns:
944
- A native ``duckdb.DuckDBPyResult`` object representing the executed query.
945
-
946
- Examples:
947
- >>> import patito as pt
948
- >>> relation = pt.duckdb.Relation(
949
- ... "select 1 as a, 2 as b union select 3 as a, 4 as b"
950
- ... )
951
- >>> result = relation.aggregate("sum(a)", group_by="").execute()
952
- >>> result.description
953
- [('sum(a)', 'NUMBER', None, None, None, None, None)]
954
- >>> result.fetchall()
955
- [(4,)]
956
- """
957
- # A star-select is here performed in order to work around certain DuckDB bugs
958
- return self._relation.project("*").execute()
959
-
960
def get(self, *filters: str, **equalities: Union[str, int, float]) -> ModelType:
    """
    Return the one and only row matching the given filter(s).

    When the relation is already known to contain exactly one row, ``get()`` can
    be invoked without any arguments in order to retrieve that row.

    Args:
        filters (str): A conjunction of SQL where clauses.
        equalities (Any): A conjunction of SQL equality clauses. The keyword name
            is the column and the parameter is the value of the equality.

    Returns:
        Model: A Patito model representing the given row.

    Raises:
        RowDoesNotExist: If no row matches the given filter(s).
        MultipleRowsReturned: If more than one row matches the given filter(s).

    Examples:
        >>> import patito as pt
        >>> import polars as pl
        >>> df = pt.DataFrame({"product_id": [1, 2, 3], "price": [10, 10, 20]})
        >>> relation = pt.duckdb.Relation(df).set_alias("my_relation")

        The ``.get()`` method will by default return a dynamically constructed
        Patito model if no model has been associated with the given relation:

        >>> relation.get(product_id=1)
        my_relation(product_id=1, price=10)

        If a Patito model has been associated with the relation, by the use of
        :ref:`Relation.set_model()<duckdb.Relation.set_model>`, then the given model
        will be used to represent the return type:

        >>> class Product(pt.Model):
        ...     product_id: int = pt.Field(unique=True)
        ...     price: float
        ...
        >>> relation.set_model(Product).get(product_id=1)
        Product(product_id=1, price=10.0)

        You can invoke ``.get()`` without any arguments on relations containing
        exactly one row:

        >>> relation.filter(product_id=1).get()
        my_relation(product_id=1, price=10)

        If the given predicate matches multiple rows a ``MultipleRowsReturned``
        exception will be raised:

        >>> try:
        ...     relation.get(price=10)
        ... except pt.exceptions.MultipleRowsReturned as e:
        ...     print(e)
        ...
        Relation.get(price=10) returned 2 rows!

        If the given predicate matches zero rows a ``RowDoesNotExist`` exception
        will be raised:

        >>> try:
        ...     relation.get(price=0)
        ... except pt.exceptions.RowDoesNotExist as e:
        ...     print(e)
        ...
        Relation.get(price=0) returned 0 rows!
    """
    relation = self.filter(*filters, **equalities) if filters or equalities else self
    result = relation.execute()
    row = result.fetchone()

    # Happy path: a first row exists and there is no second one.
    if row is not None and result.fetchone() is None:
        return self._to_model(row=row)

    # Rebuild the call signature so the error message mirrors the invocation.
    call_args = [repr(clause) for clause in filters]
    call_args += [f"{key}={value!r}" for key, value in equalities.items()]
    args_string = ",".join(call_args)

    num_rows = relation.count()
    if num_rows == 0:
        raise RowDoesNotExist(f"Relation.get({args_string}) returned 0 rows!")
    raise MultipleRowsReturned(
        f"Relation.get({args_string}) returned {num_rows} rows!"
    )
1047
-
1048
- def _to_model(self, row: tuple) -> ModelType:
1049
- """
1050
- Cast row tuple to proper return type.
1051
-
1052
- If self.model is set, either by a class variable of a subclass or by the
1053
- invocation of Relation.set_model(), that type is used to construct the return
1054
- value. Otherwise, a pydantic model is dynamically created based on the column
1055
- schema of the relation.
1056
- """
1057
- kwargs = {column: value for column, value in zip(self.columns, row)}
1058
- if self.model:
1059
- return self.model(**kwargs)
1060
- else:
1061
- RowModel = create_pydantic_model(relation=self._relation)
1062
- return cast(
1063
- ModelType,
1064
- RowModel(**kwargs),
1065
- )
1066
-
1067
def filter(
    self: RelationType,
    *filters: str,
    **equalities: Union[str, int, float],
) -> RelationType:
    """
    Return subset of rows of relation that satisfy the given predicates.

    The method returns self if no filters are provided.

    Args:
        filters: A conjunction of SQL ``WHERE`` clauses.
        equalities: A conjunction of SQL equality clauses. The keyword name
            is the column and the parameter is the value of the equality.
            String values are rendered as SQL string literals, with embedded
            single quotes escaped by doubling them.

    Returns:
        Relation: A new relation where all rows satisfy the given criteria.

    Examples:
        >>> import patito as pt
        >>> df = pt.DataFrame(
        ...     {
        ...         "number": [1, 2, 3, 4],
        ...         "string": ["A", "A", "B", "B"],
        ...     }
        ... )
        >>> relation = pt.duckdb.Relation(df)
        >>> relation.filter("number % 2 = 0").to_df()
        shape: (2, 2)
        ┌────────┬────────┐
        │ number ┆ string │
        │ ---    ┆ ---    │
        │ i64    ┆ str    │
        ╞════════╪════════╡
        │ 2      ┆ A      │
        │ 4      ┆ B      │
        └────────┴────────┘

        >>> relation.filter(number=1, string="A").to_df()
        shape: (1, 2)
        ┌────────┬────────┐
        │ number ┆ string │
        │ ---    ┆ ---    │
        │ i64    ┆ str    │
        ╞════════╪════════╡
        │ 1      ┆ A      │
        └────────┴────────┘
    """
    if not filters and not equalities:
        return self

    def _sql_literal(value: Union[str, int, float]) -> str:
        # Python's repr() is not a SQL quoting function: for a string containing
        # a single quote it switches to double quotes (which SQL parses as an
        # identifier), and it doubles backslashes. Strings are therefore quoted
        # the SQL way, by wrapping in single quotes and doubling embedded ones.
        if isinstance(value, str):
            return "'" + value.replace("'", "''") + "'"
        return repr(value)

    clauses: List[str] = list(filters)
    clauses.extend(
        f"{key}={_sql_literal(value)}" for key, value in equalities.items()
    )
    # Each clause is parenthesized so that operator precedence inside one clause
    # can not leak into the surrounding conjunction.
    filter_string = " and ".join(f"({clause})" for clause in clauses)
    return self._wrap(self._relation.filter(filter_string), schema_change=False)
1125
-
1126
def join(
    self: RelationType,
    other: RelationSource,
    *,
    on: str,
    how: Literal["inner", "left"] = "inner",
) -> RelationType:
    """
    Combine this relation with another relation source using a join condition.

    The shortcut methods
    :ref:`duckdb.Relation.inner_join() <duckdb.Relation.inner_join>` and
    :ref:`Relation.left_join() <duckdb.Relation.left_join>` can be used instead
    of specifying ``how``.

    Args:
        other: A source which can be casted to a ``Relation`` object, and be used
            as the right table in the join.
        on: Join condition following the ``INNER JOIN ... ON`` in the SQL query.
        how: Either ``"left"`` or ``"inner"`` for what type of SQL join operation to
            perform.

    Returns:
        Relation: New relation based on the joined relations.

    Example:
        >>> import patito as pt
        >>> products_df = pt.DataFrame(
        ...     {
        ...         "product_name": ["apple", "banana", "oranges"],
        ...         "supplier_id": [2, 1, 3],
        ...     }
        ... )
        >>> products = pt.duckdb.Relation(products_df)
        >>> supplier_df = pt.DataFrame(
        ...     {
        ...         "id": [1, 2],
        ...         "supplier_name": ["Banana Republic", "Applies Inc."],
        ...     }
        ... )
        >>> suppliers = pt.duckdb.Relation(supplier_df)
        >>> products.set_alias("p").join(
        ...     suppliers.set_alias("s"),
        ...     on="p.supplier_id = s.id",
        ...     how="inner",
        ... ).to_df()
        shape: (2, 4)
        ┌──────────────┬─────────────┬─────┬─────────────────┐
        │ product_name ┆ supplier_id ┆ id  ┆ supplier_name   │
        │ ---          ┆ ---         ┆ --- ┆ ---             │
        │ str          ┆ i64         ┆ i64 ┆ str             │
        ╞══════════════╪═════════════╪═════╪═════════════════╡
        │ apple        ┆ 2           ┆ 2   ┆ Applies Inc.    │
        │ banana       ┆ 1           ┆ 1   ┆ Banana Republic │
        └──────────────┴─────────────┴─────┴─────────────────┘

        >>> products.set_alias("p").join(
        ...     suppliers.set_alias("s"),
        ...     on="p.supplier_id = s.id",
        ...     how="left",
        ... ).to_df()
        shape: (3, 4)
        ┌──────────────┬─────────────┬──────┬─────────────────┐
        │ product_name ┆ supplier_id ┆ id   ┆ supplier_name   │
        │ ---          ┆ ---         ┆ ---  ┆ ---             │
        │ str          ┆ i64         ┆ i64  ┆ str             │
        ╞══════════════╪═════════════╪══════╪═════════════════╡
        │ apple        ┆ 2           ┆ 2    ┆ Applies Inc.    │
        │ banana       ┆ 1           ┆ 1    ┆ Banana Republic │
        │ oranges      ┆ 3           ┆ null ┆ null            │
        └──────────────┴─────────────┴──────┴─────────────────┘
    """
    # Normalize the right-hand side through the database of the left-hand side.
    right_relation = self.database.to_relation(other)._relation
    joined = self._relation.join(right_relation, condition=on, how=how)
    # A join introduces new columns, hence the schema change.
    return self._wrap(joined, schema_change=True)
1203
-
1204
def inner_join(self: RelationType, other: RelationSource, on: str) -> RelationType:
    """
    Perform an SQL inner join between this relation and another relation source.

    Args:
        other: A source which can be casted to a ``Relation`` object, and be used
            as the right table in the join.
        on: Join condition following the ``INNER JOIN ... ON`` in the SQL query.

    Returns:
        Relation: New relation based on the joined relations.

    Example:
        >>> import patito as pt
        >>> products_df = pt.DataFrame(
        ...     {
        ...         "product_name": ["apple", "banana", "oranges"],
        ...         "supplier_id": [2, 1, 3],
        ...     }
        ... )
        >>> products = pt.duckdb.Relation(products_df)
        >>> supplier_df = pt.DataFrame(
        ...     {
        ...         "id": [1, 2],
        ...         "supplier_name": ["Banana Republic", "Applies Inc."],
        ...     }
        ... )
        >>> suppliers = pt.duckdb.Relation(supplier_df)
        >>> products.set_alias("p").inner_join(
        ...     suppliers.set_alias("s"),
        ...     on="p.supplier_id = s.id",
        ... ).to_df()
        shape: (2, 4)
        ┌──────────────┬─────────────┬─────┬─────────────────┐
        │ product_name ┆ supplier_id ┆ id  ┆ supplier_name   │
        │ ---          ┆ ---         ┆ --- ┆ ---             │
        │ str          ┆ i64         ┆ i64 ┆ str             │
        ╞══════════════╪═════════════╪═════╪═════════════════╡
        │ apple        ┆ 2           ┆ 2   ┆ Applies Inc.    │
        │ banana       ┆ 1           ┆ 1   ┆ Banana Republic │
        └──────────────┴─────────────┴─────┴─────────────────┘
    """
    right_relation = self.database.to_relation(other)._relation
    joined = self._relation.join(
        other_rel=right_relation,
        condition=on,
        how="inner",
    )
    # A join introduces new columns, hence the schema change.
    return self._wrap(joined, schema_change=True)
1254
-
1255
def left_join(self: RelationType, other: RelationSource, on: str) -> RelationType:
    """
    Perform an SQL left join between this relation and another relation source.

    Args:
        other: A source which can be casted to a Relation object, and be used as
            the right table in the join.
        on: Join condition following the ``LEFT JOIN ... ON`` in the SQL query.

    Returns:
        Relation: New relation based on the joined tables.

    Example:
        >>> import patito as pt
        >>> products_df = pt.DataFrame(
        ...     {
        ...         "product_name": ["apple", "banana", "oranges"],
        ...         "supplier_id": [2, 1, 3],
        ...     }
        ... )
        >>> products = pt.duckdb.Relation(products_df)
        >>> supplier_df = pt.DataFrame(
        ...     {
        ...         "id": [1, 2],
        ...         "supplier_name": ["Banana Republic", "Applies Inc."],
        ...     }
        ... )
        >>> suppliers = pt.duckdb.Relation(supplier_df)
        >>> products.set_alias("p").left_join(
        ...     suppliers.set_alias("s"),
        ...     on="p.supplier_id = s.id",
        ... ).to_df()
        shape: (3, 4)
        ┌──────────────┬─────────────┬──────┬─────────────────┐
        │ product_name ┆ supplier_id ┆ id   ┆ supplier_name   │
        │ ---          ┆ ---         ┆ ---  ┆ ---             │
        │ str          ┆ i64         ┆ i64  ┆ str             │
        ╞══════════════╪═════════════╪══════╪═════════════════╡
        │ apple        ┆ 2           ┆ 2    ┆ Applies Inc.    │
        │ banana       ┆ 1           ┆ 1    ┆ Banana Republic │
        │ oranges      ┆ 3           ┆ null ┆ null            │
        └──────────────┴─────────────┴──────┴─────────────────┘
    """
    right_relation = self.database.to_relation(other)._relation
    joined = self._relation.join(
        other_rel=right_relation,
        condition=on,
        how="left",
    )
    # A join introduces new columns, hence the schema change.
    return self._wrap(joined, schema_change=True)
1306
-
1307
def limit(self: RelationType, n: int, *, offset: int = 0) -> RelationType:
    """
    Keep at most ``n`` rows of the relation.

    Args:
        n: The number of rows to keep.
        offset: Disregard the first ``offset`` rows before starting to count which
            rows to keep.

    Returns:
        New relation with only n rows.

    Example:
        >>> import patito as pt
        >>> relation = (
        ...     pt.duckdb.Relation("select 1 as column")
        ...     + pt.duckdb.Relation("select 2 as column")
        ...     + pt.duckdb.Relation("select 3 as column")
        ...     + pt.duckdb.Relation("select 4 as column")
        ... )
        >>> relation.limit(2).to_df()
        shape: (2, 1)
        ┌────────┐
        │ column │
        │ ---    │
        │ i64    │
        ╞════════╡
        │ 1      │
        │ 2      │
        └────────┘
        >>> relation.limit(2, offset=2).to_df()
        shape: (2, 1)
        ┌────────┐
        │ column │
        │ ---    │
        │ i64    │
        ╞════════╡
        │ 3      │
        │ 4      │
        └────────┘
    """
    limited = self._relation.limit(n=n, offset=offset)
    # Limiting rows leaves the columns untouched.
    return self._wrap(limited, schema_change=False)
1349
-
1350
def order(self: RelationType, by: Union[str, Iterable[str]]) -> RelationType:
    """
    Sort the rows of the relation.

    Args:
        by: An ``ORDER BY`` SQL expression such as ``"age DESC"`` or
            ``("age DESC", "name ASC")``. Several expressions are joined with
            ``", "`` before being handed to DuckDB.

    Returns:
        New relation where the rows have been ordered according to ``by``.

    Example:
        >>> import patito as pt
        >>> df = pt.DataFrame(
        ...     {
        ...         "name": ["Alice", "Bob", "Charles", "Diana"],
        ...         "age": [20, 20, 30, 35],
        ...     }
        ... )
        >>> df
        shape: (4, 2)
        ┌─────────┬─────┐
        │ name    ┆ age │
        │ ---     ┆ --- │
        │ str     ┆ i64 │
        ╞═════════╪═════╡
        │ Alice   ┆ 20  │
        │ Bob     ┆ 20  │
        │ Charles ┆ 30  │
        │ Diana   ┆ 35  │
        └─────────┴─────┘
        >>> relation = pt.duckdb.Relation(df)
        >>> relation.order(by="age desc").to_df()
        shape: (4, 2)
        ┌─────────┬─────┐
        │ name    ┆ age │
        │ ---     ┆ --- │
        │ str     ┆ i64 │
        ╞═════════╪═════╡
        │ Diana   ┆ 35  │
        │ Charles ┆ 30  │
        │ Alice   ┆ 20  │
        │ Bob     ┆ 20  │
        └─────────┴─────┘
        >>> relation.order(by=["age desc", "name desc"]).to_df()
        shape: (4, 2)
        ┌─────────┬─────┐
        │ name    ┆ age │
        │ ---     ┆ --- │
        │ str     ┆ i64 │
        ╞═════════╪═════╡
        │ Diana   ┆ 35  │
        │ Charles ┆ 30  │
        │ Bob     ┆ 20  │
        │ Alice   ┆ 20  │
        └─────────┴─────┘
    """
    if isinstance(by, str):
        order_expr = by
    else:
        # Collapse several expressions into a single ORDER BY expression.
        order_expr = ", ".join(by)
    ordered = self._relation.order(order_expr=order_expr)
    return self._wrap(ordered, schema_change=False)
1412
-
1413
def insert_into(
    self: RelationType,
    table: str,
) -> RelationType:
    """
    Append every row of the relation to an existing table.

    All columns of the target table must be present in the relation. Columns of
    the relation that the table does not have are left out, and the remaining
    columns are reordered to match the table.

    Args:
        table: Name of table for which to insert values into.

    Returns:
        Relation: The original relation, i.e. ``self``.

    Raises:
        TypeError: If the relation lacks one or more columns of the target table.

    Examples:
        >>> import patito as pt
        >>> db = pt.duckdb.Database()
        >>> db.to_relation("select 1 as a").create_table("my_table")
        >>> db.table("my_table").to_df()
        shape: (1, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        └─────┘
        >>> db.to_relation("select 2 as a").insert_into("my_table")
        >>> db.table("my_table").to_df()
        shape: (2, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 2   │
        └─────┘
    """
    target = self.database.table(table)
    missing_columns = set(target.columns) - set(self.columns)
    if missing_columns:
        raise TypeError(
            f"Relation is missing column(s) {missing_columns} "
            f"in order to be inserted into table '{table}'!"
        )

    # Selecting the table's columns drops extras and fixes the column order.
    aligned = self[target.columns]
    aligned._relation.insert_into(table_name=table)
    return self
1466
-
1467
def intersect(self: RelationType, other: RelationSource) -> RelationType:
    """
    Keep only the rows that occur in both this relation and ``other``.

    Being a set operation, duplicate rows are removed from the result as well.

    Args:
        other: Another relation with the same column names.

    Returns:
        Relation[Model]: A new relation with only those rows that are present in
        both relations.

    Example:
        >>> import patito as pt
        >>> df1 = pt.DataFrame({"a": [1, 1, 2], "b": [1, 1, 2]})
        >>> df2 = pt.DataFrame({"a": [1, 1, 3], "b": [1, 1, 3]})
        >>> pt.duckdb.Relation(df1).intersect(pt.duckdb.Relation(df2)).to_df()
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 1   │
        └─────┴─────┘
    """
    right = self.database.to_relation(other)
    intersection = self._relation.intersect(right._relation)
    return self._wrap(intersection, schema_change=False)
1499
-
1500
def select(
    self,
    *projections: Union[str, int, float],
    **named_projections: Union[str, int, float],
) -> Relation:
    """
    Return relation based on one or more SQL ``SELECT`` projections.

    Keyword arguments are converted into ``{arg} as {keyword}`` in the executed SQL
    query.

    Args:
        *projections: One or more SQL expressions to be selected, for example
            ``"2"`` or ``"another_column"``. Numeric values are accepted as well
            and are rendered with ``str()``.
        **named_projections: One or more keyword arguments where the keyword
            specifies the name of the new column and the value is an SQL statement
            defining the content of the new column. For example
            ``new_column="2 * another_column"``.

    Returns:
        A new relation containing the projected columns.

    Examples:
        >>> import patito as pt
        >>> db = pt.duckdb.Database()
        >>> relation = db.to_relation(pt.DataFrame({"original_column": [1, 2, 3]}))
        >>> relation.select("*").to_df()
        shape: (3, 1)
        ┌─────────────────┐
        │ original_column │
        │ ---             │
        │ i64             │
        ╞═════════════════╡
        │ 1               │
        │ 2               │
        │ 3               │
        └─────────────────┘
        >>> relation.select("*", multiplied_column="2 * original_column").to_df()
        shape: (3, 2)
        ┌─────────────────┬───────────────────┐
        │ original_column ┆ multiplied_column │
        │ ---             ┆ ---               │
        │ i64             ┆ i64               │
        ╞═════════════════╪═══════════════════╡
        │ 1               ┆ 2                 │
        │ 2               ┆ 4                 │
        │ 3               ┆ 6                 │
        └─────────────────┴───────────────────┘
    """
    # The signature allows int/float projections, so everything is converted to
    # str up front; str.join() would otherwise raise TypeError on numbers.
    expanded_projections: List[str] = [str(item) for item in projections]

    # The first '*' is expanded to an explicit list of columns so that named
    # projections are able to redefine columns covered by the star.
    try:
        star_index = expanded_projections.index("*")
    except ValueError:
        pass
    else:
        expanded_projections[star_index : star_index + 1] = [
            column for column in self.columns if column not in named_projections
        ]

    expanded_projections.extend(
        f"{expression} as {column_name}"
        for column_name, expression in named_projections.items()
    )
    projection = ", ".join(expanded_projections)

    try:
        relation = self._relation.project(projection)
    except RuntimeError as exc:  # pragma: no cover
        # A RuntimeError may be caused by an enum type that has not been created
        # yet. If so, all enum types of the model are created before retrying.
        if self.model is not None and _is_missing_enum_type_exception(exc):
            self.database.create_enum_types(model=self.model)
            relation = self._relation.project(projection)
        else:
            raise
    return self._wrap(relation=relation, schema_change=True)
1580
-
1581
def rename(self, **columns: str) -> Relation:
    """
    Rename columns as specified.

    The relative order of the columns is preserved. If a column is renamed to the
    name of another existing column, that other column is overwritten, unless it
    is itself being renamed in the same call (which makes swaps such as
    ``rename(a="b", b="a")`` possible).

    Args:
        **columns: A set of keyword arguments where the keyword is the old column
            name and the value is the new column name.

    Returns:
        A new relation with the renamed columns.

    Raises:
        ValueError: If any of the given keywords do not exist as columns in the
            relation.

    Examples:
        >>> import patito as pt
        >>> relation = pt.duckdb.Relation("select 1 as a, 2 as b")
        >>> relation.rename(b="c").to_df().select(["a", "c"])
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ c   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 2   │
        └─────┴─────┘
    """
    # A list (rather than a set) keeps the column order deterministic, both in
    # the resulting relation and in the error message.
    existing_columns = list(self.columns)
    missing = [column for column in columns if column not in existing_columns]
    if missing:
        raise ValueError(
            f"Column '{missing[0]}' can not be renamed as it does not exist. "
            f"The columns of the relation are: {', '.join(existing_columns)}."
        )

    # An existing column is dropped only when another column takes over its name
    # and it is not renamed itself; otherwise swaps and no-op renames would lose
    # columns.
    overwritten = set(columns.values())
    kept_columns = [
        column
        for column in existing_columns
        if column in columns or column not in overwritten
    ]
    relation = self._relation.project(
        ", ".join(
            f"{column} as {columns.get(column, column)}" for column in kept_columns
        )
    )
    return self._wrap(relation=relation, schema_change=True)
1623
-
1624
def set_alias(self: RelationType, name: str) -> RelationType:
    """
    Give the relation an SQL alias that later queries can refer to.

    Args:
        name: The new alias for the given relation.

    Returns:
        Relation: A new relation containing the same query but addressable with the
        new alias.

    Example:
        >>> import patito as pt
        >>> relation_1 = pt.duckdb.Relation("select 1 as a, 2 as b")
        >>> relation_2 = pt.duckdb.Relation("select 1 as a, 3 as c")
        >>> relation_1.set_alias("x").inner_join(
        ...     relation_2.set_alias("y"),
        ...     on="x.a = y.a",
        ... ).select("x.a", "y.a", "b", "c").to_df()
        shape: (1, 4)
        ┌─────┬─────┬─────┬─────┐
        │ a   ┆ a:1 ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╪═════╡
        │ 1   ┆ 1   ┆ 2   ┆ 3   │
        └─────┴─────┴─────┴─────┘
    """
    aliased = self._relation.set_alias(name)
    return self._wrap(aliased, schema_change=False)
1656
-
1657
def set_model(self, model):  # type: ignore[no-untyped-def] # noqa: ANN
    """
    Associate a given Patito model with the relation.

    The returned relation carries the model in its ``.model`` attribute, which is
    in turn used by several methods such as
    :ref:`Relation.get()<duckdb.Relation.get>`,
    :ref:`Relation.create_table()<duckdb.Relation.create_table>`, and
    :ref:`Relation.__iter__<duckdb.Relation.__iter__>`.

    Args:
        model: A Patito Model class specifying the intended schema of the relation.

    Returns:
        Relation[model]: A new relation with the associated model.

    Example:
        >>> from typing import Literal
        >>> import patito as pt
        >>> class MySchema(pt.Model):
        ...     float_column: float
        ...     enum_column: Literal["A", "B", "C"]
        ...
        >>> relation = pt.duckdb.Relation(
        ...     "select 1 as float_column, 'A' as enum_column"
        ... )
        >>> relation.get()
        query_relation(float_column=1, enum_column='A')
        >>> relation.set_model(MySchema).get()
        MySchema(float_column=1.0, enum_column='A')
        >>> relation.create_table("unmodeled_table").types
        {'float_column': INTEGER, 'enum_column': VARCHAR}
        >>> relation.set_model(MySchema).create_table("modeled_table").types
        {'float_column': DOUBLE,
         'enum_column': enum__7ba49365cc1b0fd57e61088b3bc9aa25}
    """
    # The generic instance type(self)[type(model)] can not be annotated since
    # python lacks higher-kinded generics as of this writing.
    # See: https://github.com/python/typing/issues/548
    # Consequently, this cast() will be wrong for sub-classes of Relation...
    annotated_type = Relation[model]
    modeled_relation = type(self)(
        derived_from=self._relation,
        database=self.database,
        model=model,
    )
    return cast(annotated_type, modeled_relation)
1704
-
1705
@property
def types(self):  # type: ignore[no-untyped-def] # noqa
    """
    Map every column name of the relation to its SQL type.

    Returns:
        dict[str, str]: A dictionary where the keys are the column names and the
        values are SQL types as strings.

    Examples:
        >>> import patito as pt
        >>> pt.duckdb.Relation("select 1 as a, 'my_value' as b").types
        {'a': INTEGER, 'b': VARCHAR}
    """
    column_names = self.columns
    sql_types = self._relation.types
    # Names and types are paired by position.
    return {name: sql_type for name, sql_type in zip(column_names, sql_types)}
1720
-
1721
def to_pandas(self) -> "pd.DataFrame":
    """
    Convert the relation to a pandas DataFrame.

    Returns: A ``pandas.DataFrame`` object containing all the data of the relation.

    Example:
        >>> import patito as pt
        >>> pt.duckdb.Relation("select 1 as column union select 2 as column").order(
        ...     by="1"
        ... ).to_pandas()
           column
        0       1
        1       2
    """
    # The native DuckDB relation's ``to_df()`` performs the conversion.
    pandas_frame = self._relation.to_df()
    return pandas_frame
1737
-
1738
def to_df(self) -> DataFrame:
    """
    Convert the relation to a polars-based DataFrame.

    Note that the wrapped DuckDB relation is replaced by a star-projection of
    itself as part of the conversion.

    Returns: A ``patito.DataFrame`` object which inherits from ``polars.DataFrame``.

    Example:
        >>> import patito as pt
        >>> pt.duckdb.Relation("select 1 as column union select 2 as column").order(
        ...     by="1"
        ... ).to_df()
        shape: (2, 1)
        ┌────────┐
        │ column │
        │ ---    │
        │ i64    │
        ╞════════╡
        │ 1      │
        │ 2      │
        └────────┘
    """

    def _arrow_to_dataframe(table):  # noqa: ANN
        # `INTEGER`-typed columns are widened to `pl.Int64` since polars is much
        # more eager to store integer Series as 64-bit integers. Without this, a
        # lot of manual casting is needed whenever the boundary between DuckDB
        # and polars is crossed.
        return DataFrame._from_arrow(table).with_columns(
            pl.col(pl.Int32).cast(pl.Int64)
        )

    # A star-select works around certain weird issues with DuckDB.
    self._relation = self._relation.project("*")
    arrow_table = cast(pa.lib.Table, self._relation.to_arrow_table())
    try:
        return _arrow_to_dataframe(arrow_table)
    except (pa.ArrowInvalid, pl.ArrowError):  # pragma: no cover
        # Empty relations with enum columns can sometimes produce errors.
        # As a last-ditch effort, such columns are converted to VARCHAR.
        casted_columns = []
        for field in arrow_table.schema:
            if isinstance(field.type, pa.DictionaryType):
                casted_columns.append(f"{field.name}::VARCHAR as {field.name}")
            else:
                casted_columns.append(field.name)
        non_enum_relation = self._relation.project(", ".join(casted_columns))
        return _arrow_to_dataframe(non_enum_relation.to_arrow_table())
1784
-
1785
def to_series(self) -> pl.Series:
    """
    Turn a single-column relation into a polars Series.

    Raises:
        TypeError: If the given relation does not contain exactly one column.

    Returns: A ``polars.Series`` object containing the data of the relation.

    Example:
        >>> import patito as pt
        >>> relation = pt.duckdb.Relation("select 1 as a union select 2 as a")
        >>> relation.order(by="a").to_series()
        shape: (2,)
        Series: 'a' [i32]
        [
            1
            2
        ]
    """
    column_count = len(self._relation.columns)
    if column_count != 1:
        raise TypeError(
            f"{self.__class__.__name__}.to_series() was invoked on a relation with "
            f"{column_count} columns, while exactly 1 is required!"
        )
    arrow_table = self._relation.to_arrow_table()
    dataframe: DataFrame = DataFrame._from_arrow(arrow_table)
    # The series is named after the relation's only column.
    only_column = dataframe.to_series(index=0)
    return only_column.alias(name=self.columns[0])
1812
-
1813
def union(self: RelationType, other: RelationSource) -> RelationType:
    """
    Stack the rows of ``other`` onto the rows of this relation.

    The ``+`` operator can also be used to union two relations.

    Both relations must have the same column names. The order of the columns may
    differ, though, since the columns of ``other`` are automatically reordered to
    match, unlike regular SQL.

    Duplicates are `not` dropped.

    Args:
        other: A ``patito.duckdb.Relation`` object or something that can be
            *casted* to ``patito.duckdb.Relation``.
            See :ref:`Relation<duckdb.Relation.__init__>`.

    Returns:
        New relation containing the rows of both ``self`` and ``other``.

    Raises:
        TypeError: If the two relations do not contain the same columns.

    Examples:
        >>> import patito as pt
        >>> relation_1 = pt.duckdb.Relation("select 1 as a")
        >>> relation_2 = pt.duckdb.Relation("select 2 as a")
        >>> relation_1.union(relation_2).to_df()
        shape: (2, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 2   │
        └─────┘

        >>> (relation_1 + relation_2).to_df()
        shape: (2, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        │ 2   │
        └─────┘
    """
    right = self.database.to_relation(other)
    left_names = set(self.columns)
    right_names = set(right.columns)

    if left_names != right_names:
        # Spell out which side carries the surplus columns.
        msg = "Union between relations with different column names is not allowed."
        additional_left = left_names - right_names
        additional_right = right_names - left_names
        if additional_left:
            msg += f" Additional columns in left relation: {additional_left}."
        if additional_right:
            msg += f" Additional columns in right relation: {additional_right}."
        raise TypeError(msg)

    # Same names, possibly in another order: align the right side with the left.
    aligned = right[self.columns] if right.columns != self.columns else right
    unioned_relation = self._relation.union(aligned._relation)
    return self._wrap(relation=unioned_relation, schema_change=False)
1878
-
1879
def with_columns(
    self,
    **named_projections: Union[str, int, float],
) -> Relation:
    """
    Return a relation extended with the given columns.

    A column expression whose name already exists as a column on the relation
    overwrites that column.

    Args:
        named_projections: A set of column expressions, where the keyword is used
            as the column name, while the right-hand argument is a valid SQL
            expression.

    Returns:
        Relation with the given columns appended, or possibly overwritten.

    Examples:
        >>> import patito as pt
        >>> db = pt.duckdb.Database()
        >>> relation = db.to_relation("select 1 as a, 2 as b")
        >>> relation.with_columns(c="a + b").to_df()
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 2   ┆ 3   │
        └─────┴─────┴─────┘
    """
    # Delegates to select(): every existing column plus the named projections.
    extended = self.select("*", **named_projections)
    return extended
1912
-
1913
def with_missing_defaultable_columns(
    self: RelationType,
    include: Optional[Iterable[str]] = None,
    exclude: Optional[Iterable[str]] = None,
) -> RelationType:
    """
    Add missing defaultable columns filled with the default values of correct type.

    Make sure to invoke :ref:`Relation.set_model()<duckdb.Relation.set_model>` with
    the correct model schema before executing
    ``Relation.with_missing_defaultable_columns()``.

    Args:
        include: If provided, only fill in default values for missing columns part
            of this collection of column names.
        exclude: If provided, do `not` fill in default values for missing columns
            part of this collection of column names.

    Returns:
        Relation: New relation where missing columns with default values according
        to the schema have been filled in. The new columns are appended in the
        same order as they are listed in ``model.columns``.

    Raises:
        TypeError: If no model has been set on the relation, or if both ``include``
            and ``exclude`` are provided at the same time.

    Example:
        >>> import patito as pt
        >>> class MyModel(pt.Model):
        ...     non_default_column: int
        ...     another_non_default_column: int
        ...     default_column: int = 42
        ...     another_default_column: int = 42
        ...
        >>> relation = pt.duckdb.Relation(
        ...     "select 1 as non_default_column, 2 as default_column"
        ... )
        >>> relation.to_df()
        shape: (1, 2)
        ┌────────────────────┬────────────────┐
        │ non_default_column ┆ default_column │
        │ ---                ┆ ---            │
        │ i64                ┆ i64            │
        ╞════════════════════╪════════════════╡
        │ 1                  ┆ 2              │
        └────────────────────┴────────────────┘
        >>> relation.set_model(MyModel).with_missing_defaultable_columns().to_df()
        shape: (1, 3)
        ┌────────────────────┬────────────────┬────────────────────────┐
        │ non_default_column ┆ default_column ┆ another_default_column │
        │ ---                ┆ ---            ┆ ---                    │
        │ i64                ┆ i64            ┆ i64                    │
        ╞════════════════════╪════════════════╪════════════════════════╡
        │ 1                  ┆ 2              ┆ 42                     │
        └────────────────────┴────────────────┴────────────────────────┘
    """
    if self.model is None:
        class_name = self.__class__.__name__
        raise TypeError(
            f"{class_name}.with_missing_defaultable_columns() invoked without "
            f"{class_name}.model having been set! "
            f"You should invoke {class_name}.set_model() first!"
        )
    elif include is not None and exclude is not None:
        raise TypeError("Both include and exclude provided at the same time!")

    defaults = self.model.defaults
    present_columns = set(self.columns)
    excluded_columns = set(exclude) if exclude is not None else set()
    included_columns = set(include) if include is not None else None

    # Walk the model's own column listing rather than a set, so that the order of
    # the appended columns is stable from run to run.
    projection_parts = ["*"]
    for column_name in self.model.columns:
        if column_name in present_columns or column_name not in defaults:
            continue
        if column_name in excluded_columns:
            continue
        if included_columns is not None and column_name not in included_columns:
            continue

        default_value = defaults[column_name]
        if default_value is None:
            # repr(None) would be parsed as a column reference by the SQL engine
            sql_literal = "null"
        elif isinstance(default_value, str):
            # Python's repr() is not a SQL literal: it switches to double quotes
            # (an identifier in SQL) when the text contains a single quote.
            sql_literal = "'" + default_value.replace("'", "''") + "'"
        else:
            sql_literal = repr(default_value)

        sql_type = self.model.sql_types[column_name]
        projection_parts.append(f"{sql_literal}::{sql_type} as {column_name}")
    projection = ", ".join(projection_parts)

    try:
        relation = self._relation.project(projection)
    except Exception as exc:  # pragma: no cover
        # We might get a RunTime error if the enum type has not
        # been created yet. If so, we create all enum types for
        # this model and retry once.
        if not _is_missing_enum_type_exception(exc):
            raise
        self.database.create_enum_types(model=self.model)
        relation = self._relation.project(projection)
    return self._wrap(relation=relation, schema_change=False)
2002
-
2003
def with_missing_nullable_columns(
    self: RelationType,
    include: Optional[Iterable[str]] = None,
    exclude: Optional[Iterable[str]] = None,
) -> RelationType:
    """
    Add missing nullable columns filled with correctly typed nulls.

    Make sure to invoke :ref:`Relation.set_model()<duckdb.Relation.set_model>` with
    the correct model schema before executing
    ``Relation.with_missing_nullable_columns()``.

    Args:
        include: If provided, only fill in null values for missing columns part of
            this collection of column names.
        exclude: If provided, do `not` fill in null values for missing columns
            part of this collection of column names.

    Returns:
        Relation: New relation where missing nullable columns have been filled in
        with null values. The new columns are appended in the same order as they
        are listed in ``model.columns``.

    Raises:
        TypeError: If no model has been set on the relation, or if both ``include``
            and ``exclude`` are provided at the same time.

    Example:
        >>> from typing import Optional
        >>> import patito as pt
        >>> class MyModel(pt.Model):
        ...     non_nullable_column: int
        ...     nullable_column: Optional[int]
        ...     another_nullable_column: Optional[int]
        ...
        >>> relation = pt.duckdb.Relation("select 1 as nullable_column")
        >>> relation.to_df()
        shape: (1, 1)
        ┌─────────────────┐
        │ nullable_column │
        │ ---             │
        │ i64             │
        ╞═════════════════╡
        │ 1               │
        └─────────────────┘
        >>> relation.set_model(MyModel).with_missing_nullable_columns().to_df()
        shape: (1, 2)
        ┌─────────────────┬─────────────────────────┐
        │ nullable_column ┆ another_nullable_column │
        │ ---             ┆ ---                     │
        │ i64             ┆ i64                     │
        ╞═════════════════╪═════════════════════════╡
        │ 1               ┆ null                    │
        └─────────────────┴─────────────────────────┘
    """
    if self.model is None:
        class_name = self.__class__.__name__
        raise TypeError(
            f"{class_name}.with_missing_nullable_columns() invoked without "
            f"{class_name}.model having been set! "
            f"You should invoke {class_name}.set_model() first!"
        )
    elif include is not None and exclude is not None:
        raise TypeError("Both include and exclude provided at the same time!")

    nullable_columns = self.model.nullable_columns
    present_columns = set(self.columns)
    excluded_columns = set(exclude) if exclude is not None else set()
    included_columns = set(include) if include is not None else None

    # Walk the model's own column listing rather than a set, so that the order of
    # the appended columns is stable from run to run.
    projection_parts = ["*"]
    for column_name in self.model.columns:
        if column_name in present_columns or column_name not in nullable_columns:
            continue
        if column_name in excluded_columns:
            continue
        if included_columns is not None and column_name not in included_columns:
            continue
        sql_type = self.model.sql_types[column_name]
        projection_parts.append(f"null::{sql_type} as {column_name}")
    projection = ", ".join(projection_parts)

    try:
        relation = self._relation.project(projection)
    except Exception as exc:  # pragma: no cover
        # We might get a RunTime error if the enum type has not
        # been created yet. If so, we create all enum types for
        # this model and retry once.
        if not _is_missing_enum_type_exception(exc):
            raise
        self.database.create_enum_types(model=self.model)
        relation = self._relation.project(projection)
    return self._wrap(relation=relation, schema_change=False)
2088
-
2089
def __add__(self: RelationType, other: RelationSource) -> RelationType:
    """
    Implement the ``+`` operator as a union of the two operands.

    Delegates directly to ``self.union(other)``; see
    :ref:`Relation.union()<duckdb.Relation.union>` for full documentation.
    """
    combined = self.union(other)
    return combined
2096
-
2097
def __eq__(self, other: object) -> bool:
    """
    Compare this relation, row by row, against a Relation-able data source.

    The other operand is first converted with ``self.database.to_relation()``.
    The two are equal when their row counts match and every pair of rows, taken
    in iteration order, compares equal.
    """
    candidate = self.database.to_relation(other)  # type: ignore
    # The row counts are compared up front because zip() silently stops at the
    # shorter of the two iterables.
    # Use zip(self, candidate, strict=True) when we upgrade to Python 3.10.
    if self.count() != candidate.count():
        return False
    for own_row, candidate_row in zip(self, candidate):
        if not (own_row == candidate_row):
            return False
    return True
2105
-
2106
def __getitem__(self, key: Union[str, Iterable[str]]) -> Relation:
    """
    Return Relation with selected columns.

    A single string is passed to the underlying relation's ``project()`` as-is,
    while an iterable of strings is joined with ``", "`` first. Since the
    projection is handed over verbatim it can technically be used to rename
    columns, define derived columns, and so on, but prefer the use of
    :ref:`Relation.select()<duckdb.Relation.select>` for such use cases.

    Args:
        key: Columns to select, either a single column represented as a string, or
            an iterable of strings.

    Returns:
        New relation only containing the column subset specified.

    Example:
        >>> import patito as pt
        >>> relation = pt.duckdb.Relation("select 1 as a, 2 as b, 3 as c")
        >>> relation.to_df()
        shape: (1, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 2   ┆ 3   │
        └─────┴─────┴─────┘
        >>> relation[["a", "b"]].to_df()
        shape: (1, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 2   │
        └─────┴─────┘
        >>> relation["a"].to_df()
        shape: (1, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ i64 │
        ╞═════╡
        │ 1   │
        └─────┘
    """
    if isinstance(key, str):
        column_expression = key
    else:
        column_expression = ", ".join(key)
    projected = self._relation.project(column_expression)
    # The column set changes, so the wrapper must not keep the current model.
    return self._wrap(relation=projected, schema_change=True)
2158
-
2159
def __iter__(self) -> Iterator[ModelType]:
    """
    Iterate over rows in relation.

    The relation is executed once, and rows are then fetched one at a time, each
    being converted with ``self._to_model()`` before it is yielded.

    If :ref:`Relation.set_model()<duckdb.Relation.set_model>` has been invoked
    first, the given model will be used to deserialize each row. Otherwise a Patito
    model is dynamically constructed which fits the schema of the relation.

    Returns:
        Iterator[Model]: An iterator of patito Model objects representing each row.

    Example:
        >>> from typing import Literal
        >>> import patito as pt
        >>> df = pt.DataFrame({"float_column": [1, 2], "enum_column": ["A", "B"]})
        >>> relation = pt.duckdb.Relation(df).set_alias("my_relation")
        >>> for row in relation:
        ...     print(row)
        ...
        float_column=1 enum_column='A'
        float_column=2 enum_column='B'
        >>> list(relation)
        [my_relation(float_column=1, enum_column='A'),
         my_relation(float_column=2, enum_column='B')]

        >>> class MySchema(pt.Model):
        ...     float_column: float
        ...     enum_column: Literal["A", "B", "C"]
        ...
        >>> relation = relation.set_model(MySchema)
        >>> for row in relation:
        ...     print(row)
        ...
        float_column=1.0 enum_column='A'
        float_column=2.0 enum_column='B'
        >>> list(relation)
        [MySchema(float_column=1.0, enum_column='A'),
         MySchema(float_column=2.0, enum_column='B')]
    """
    cursor = self._relation.execute()
    record = cursor.fetchone()
    # A falsy fetch result marks the end of the result set.
    while record:
        yield self._to_model(record)
        record = cursor.fetchone()
2205
-
2206
def __len__(self) -> int:
    """
    Return the number of rows in the relation, enabling ``len(relation)``.

    Delegates to ``self.count()``; see
    :ref:`Relation.count()<duckdb.Relation.count>` for full documentation.
    """
    number_of_rows = self.count()
    return number_of_rows
2213
-
2214
def __str__(self) -> str:
    """
    Return string representation of Relation object.

    The text is produced by the wrapped DuckDB relation itself. It includes an
    expression tree, the result columns, and a result preview.

    Example:
        >>> import patito as pt
        >>> products = pt.duckdb.Relation(
        ...     pt.DataFrame(
        ...         {
        ...             "product_name": ["apple", "red_apple", "banana", "oranges"],
        ...             "supplier_id": [2, 2, 1, 3],
        ...         }
        ...     )
        ... ).set_alias("products")
        >>> print(str(products))  # xdoctest: +SKIP
        ---------------------
        --- Relation Tree ---
        ---------------------
        arrow_scan(94609350519648, 140317161740928, 140317161731168, 1000000)\

        ---------------------
        -- Result Columns  --
        ---------------------
        - product_name (VARCHAR)
        - supplier_id (BIGINT)\

        ---------------------
        -- Result Preview  --
        ---------------------
        product_name    supplier_id
        VARCHAR BIGINT
        [ Rows: 4]
        apple   2
        red_apple       2
        banana  1
        oranges 3

        >>> suppliers = pt.duckdb.Relation(
        ...     pt.DataFrame(
        ...         {
        ...             "id": [1, 2],
        ...             "supplier_name": ["Banana Republic", "Applies Inc."],
        ...         }
        ...     )
        ... ).set_alias("suppliers")
        >>> relation = (
        ...     products.set_alias("p")
        ...     .inner_join(
        ...         suppliers.set_alias("s"),
        ...         on="p.supplier_id = s.id",
        ...     )
        ...     .aggregate(
        ...         "supplier_name",
        ...         num_products="count(product_name)",
        ...         group_by=["supplier_id", "supplier_name"],
        ...     )
        ... )
        >>> print(str(relation))  # xdoctest: +SKIP
        ---------------------
        --- Relation Tree ---
        ---------------------
        Aggregate [supplier_name, count(product_name)]
          Join INNER p.supplier_id = s.id
            arrow_scan(94609350519648, 140317161740928, 140317161731168, 1000000)
            arrow_scan(94609436221024, 140317161740928, 140317161731168, 1000000)\

        ---------------------
        -- Result Columns  --
        ---------------------
        - supplier_name (VARCHAR)
        - num_products (BIGINT)\

        ---------------------
        -- Result Preview  --
        ---------------------
        supplier_name   num_products
        VARCHAR BIGINT
        [ Rows: 2]
        Applies Inc.    2
        Banana Republic 1

    """
    rendered = str(self._relation)
    return rendered
2299
-
2300
def _wrap(
    self: RelationType,
    relation: "duckdb.DuckDBPyRelation",
    schema_change: bool = False,
) -> RelationType:
    """
    Wrap DuckDB Relation object in same Relation wrapper class as self.

    The wrapper is always an instance of ``type(self)``, which preserves the type
    of the relation even for subclasses of Relation, and it is attached to the
    same database as ``self``.

    Args:
        relation: The native DuckDB relation to wrap.
        schema_change: Leave as ``False`` for relations which can be considered
            schema-compatible with the original relation, in which case the
            model of ``self`` is carried over. If ``True``, the new wrapper is
            created without a model.

    Returns:
        A new wrapper of the same class as ``self`` around ``relation``.
    """
    wrapper_class = type(self)
    if schema_change:
        carried_model = None
    else:
        carried_model = self.model
    return wrapper_class(
        derived_from=relation,
        database=self.database,
        model=carried_model,
    )
2318
-
2319
-
2320
class Database:
    """Wrapper around a DuckDB connection which hands out patito relations."""

    # Types created in order to represent enum strings
    enum_types: Set[str]

    def __init__(
        self,
        path: Optional[Path] = None,
        read_only: bool = False,
        **kwargs: Any,  # noqa: ANN401
    ) -> None:
        """
        Instantiate a new DuckDB database, either persisted to disk or in-memory.

        Args:
            path: Optional path to store all the data to. If ``None`` the data is
                persisted in-memory only.
            read_only: If the database connection should be a read-only connection.
            **kwargs: Additional keywords forwarded to ``duckdb.connect()``.

        Examples:
            >>> import patito as pt
            >>> db = pt.duckdb.Database()
            >>> db.to_relation("select 1 as a, 2 as b").create_table("my_table")
            >>> db.query("select * from my_table").to_df()
            shape: (1, 2)
            ┌─────┬─────┐
            │ a   ┆ b   │
            │ --- ┆ --- │
            │ i64 ┆ i64 │
            ╞═════╪═════╡
            │ 1   ┆ 2   │
            └─────┴─────┘
        """
        import duckdb

        self.path = path
        self.connection = duckdb.connect(
            database=str(path) if path else ":memory:",
            read_only=read_only,
            **kwargs,
        )
        self.enum_types: Set[str] = set()

    @classmethod
    def default(cls) -> Database:
        """
        Return the default DuckDB database.

        Returns:
            A patito :ref:`Database<duckdb.Database>` object wrapping around
            ``duckdb.default_connection``.

        Example:
            >>> import patito as pt
            >>> db = pt.duckdb.Database.default()
            >>> db.query("select 1 as a, 2 as b").to_df()
            shape: (1, 2)
            ┌─────┬─────┐
            │ a   ┆ b   │
            │ --- ┆ --- │
            │ i64 ┆ i64 │
            ╞═════╪═════╡
            │ 1   ┆ 2   │
            └─────┴─────┘
        """
        import duckdb

        return cls.from_connection(duckdb.default_connection)

    @classmethod
    def from_connection(cls, connection: "duckdb.DuckDBPyConnection") -> Database:
        """
        Create database from native DuckDB connection object.

        The object is created without invoking ``__init__()``, so no new
        connection is opened.

        Args:
            connection: A native DuckDB connection object created with
                ``duckdb.connect()``.

        Returns:
            A :ref:`Database<duckdb.Database>` object wrapping around the given
            connection. Its ``path`` attribute is ``None``, since the location
            backing an externally created connection is not known here.

        Example:
            >>> import duckdb
            >>> import patito as pt
            >>> connection = duckdb.connect()
            >>> database = pt.duckdb.Database.from_connection(connection)
        """
        obj = cls.__new__(cls)
        # __init__() is bypassed, so every attribute it would have set must be
        # assigned here; otherwise reading e.g. obj.path raises AttributeError.
        obj.path = None
        obj.connection = connection
        obj.enum_types = set()
        return obj

    def to_relation(
        self,
        derived_from: RelationSource,
    ) -> Relation:
        """
        Create a new relation object based on data source.

        The given data will be represented as a relation associated with the database.
        ``Database(x).to_relation(y)`` is equivalent to
        ``Relation(y, database=Database(x))``.

        Args:
            derived_from (RelationSource): One of either a polars or pandas
                ``DataFrame``, a ``pathlib.Path`` to a parquet or CSV file, a SQL query
                string, or an existing relation.

        Example:
            >>> import patito as pt
            >>> db = pt.duckdb.Database()
            >>> db.to_relation("select 1 as a, 2 as b").to_df()
            shape: (1, 2)
            ┌─────┬─────┐
            │ a   ┆ b   │
            │ --- ┆ --- │
            │ i64 ┆ i64 │
            ╞═════╪═════╡
            │ 1   ┆ 2   │
            └─────┴─────┘
            >>> db.to_relation(pt.DataFrame({"c": [3, 4], "d": ["5", "6"]})).to_df()
            shape: (2, 2)
            ┌─────┬─────┐
            │ c   ┆ d   │
            │ --- ┆ --- │
            │ i64 ┆ str │
            ╞═════╪═════╡
            │ 3   ┆ 5   │
            │ 4   ┆ 6   │
            └─────┴─────┘
        """
        return Relation(
            derived_from=derived_from,
            database=self,
        )

    def execute(
        self,
        query: str,
        *parameters: Collection[Union[str, int, float, bool]],
    ) -> None:
        """
        Execute SQL query in DuckDB database.

        Args:
            query: A SQL statement to execute. Does `not` have to be terminated with
                a semicolon (``;``).
            parameters: One or more sets of parameters to insert into prepared
                statements. The values are replaced in place of the question marks
                (``?``) in the prepared query.

        Example:
            >>> import patito as pt
            >>> db = pt.duckdb.Database()
            >>> db.execute("create table my_table (x bigint);")
            >>> db.execute("insert into my_table values (1), (2), (3)")
            >>> db.table("my_table").to_df()
            shape: (3, 1)
            ┌─────┐
            │ x   │
            │ --- │
            │ i64 │
            ╞═════╡
            │ 1   │
            │ 2   │
            │ 3   │
            └─────┘

            Parameters can be specified when executing prepared queries.

            >>> db.execute("delete from my_table where x = ?", (2,))
            >>> db.table("my_table").to_df()
            shape: (2, 1)
            ┌─────┐
            │ x   │
            │ --- │
            │ i64 │
            ╞═════╡
            │ 1   │
            │ 3   │
            └─────┘

            Multiple parameter sets can be specified when executing multiple prepared
            queries.

            >>> db.execute(
            ...     "delete from my_table where x = ?",
            ...     (1,),
            ...     (3,),
            ... )
            >>> db.table("my_table").to_df()
            shape: (0, 1)
            ┌─────┐
            │ x   │
            │ --- │
            │ i64 │
            ╞═════╡
            └─────┘
        """
        duckdb_parameters: Union[
            Collection[Union[str, int, float, bool]],
            Collection[Collection[Union[str, int, float, bool]]],
            None,
        ]
        # *parameters is always a tuple, so an emptiness check is sufficient.
        if not parameters:
            duckdb_parameters = []
            multiple_parameter_sets = False
        elif len(parameters) == 1:
            # A single parameter set is passed on unwrapped
            duckdb_parameters = parameters[0]
            multiple_parameter_sets = False
        else:
            duckdb_parameters = parameters
            multiple_parameter_sets = True

        self.connection.execute(
            query=query,
            parameters=duckdb_parameters,
            multiple_parameter_sets=multiple_parameter_sets,
        )

    def query(self, query: str, alias: str = "query_relation") -> Relation:
        """
        Execute arbitrary SQL select query and return the relation.

        Args:
            query: Arbitrary SQL select query.
            alias: The alias to assign to the resulting relation, to be used in further
                queries.

        Returns: A relation representing the data produced by the given query.

        Example:
            >>> import patito as pt
            >>> db = pt.duckdb.Database()
            >>> relation = db.query("select 1 as a, 2 as b, 3 as c")
            >>> relation.to_df()
            shape: (1, 3)
            ┌─────┬─────┬─────┐
            │ a   ┆ b   ┆ c   │
            │ --- ┆ --- ┆ --- │
            │ i64 ┆ i64 ┆ i64 │
            ╞═════╪═════╪═════╡
            │ 1   ┆ 2   ┆ 3   │
            └─────┴─────┴─────┘

            >>> relation = db.query("select 1 as a, 2 as b, 3 as c", alias="my_alias")
            >>> relation.select("my_alias.a").to_df()
            shape: (1, 1)
            ┌─────┐
            │ a   │
            │ --- │
            │ i64 │
            ╞═════╡
            │ 1   │
            └─────┘
        """
        return Relation(
            self.connection.query(query=query, alias=alias),
            database=self,
        )

    def empty_relation(self, schema: Type[ModelType]) -> Relation[ModelType]:
        """
        Create relation with zero rows, but correct schema that matches the given model.

        The relation is built from ``schema.examples()`` and then limited to
        zero rows.

        Args:
            schema: A patito model which specifies the column names and types of the
                given relation.

        Example:
            >>> import patito as pt
            >>> class Schema(pt.Model):
            ...     string_column: str
            ...     bool_column: bool
            ...
            >>> db = pt.duckdb.Database()
            >>> empty_relation = db.empty_relation(Schema)
            >>> empty_relation.to_df()
            shape: (0, 2)
            ┌───────────────┬─────────────┐
            │ string_column ┆ bool_column │
            │ ---           ┆ ---         │
            │ str           ┆ bool        │
            ╞═══════════════╪═════════════╡
            └───────────────┴─────────────┘
            >>> non_empty_relation = db.query(
            ...     "select 'dummy' as string_column, true as bool_column"
            ... )
            >>> non_empty_relation.union(empty_relation).to_df()
            shape: (1, 2)
            ┌───────────────┬─────────────┐
            │ string_column ┆ bool_column │
            │ ---           ┆ ---         │
            │ str           ┆ bool        │
            ╞═══════════════╪═════════════╡
            │ dummy         ┆ true        │
            └───────────────┴─────────────┘
        """
        return self.to_relation(schema.examples()).limit(0)

    def table(self, name: str) -> Relation:
        """
        Return relation representing all the data in the given table.

        Args:
            name: The name of the table.

        Example:
            >>> import patito as pt
            >>> df = pt.DataFrame({"a": [1, 2], "b": [3, 4]})
            >>> db = pt.duckdb.Database()
            >>> relation = db.to_relation(df)
            >>> relation.create_table(name="my_table")
            >>> db.table("my_table").to_df()
            shape: (2, 2)
            ┌─────┬─────┐
            │ a   ┆ b   │
            │ --- ┆ --- │
            │ i64 ┆ i64 │
            ╞═════╪═════╡
            │ 1   ┆ 3   │
            │ 2   ┆ 4   │
            └─────┴─────┘
        """
        # Attach the relation to this very Database object, like query() and
        # to_relation() do, so that the enum type cache is shared.
        return Relation(
            self.connection.table(name),
            database=self,
        )

    def view(self, name: str) -> Relation:
        """
        Return relation representing all the data in the given view.

        Args:
            name: The name of the view.

        Example:
            >>> import patito as pt
            >>> df = pt.DataFrame({"a": [1, 2], "b": [3, 4]})
            >>> db = pt.duckdb.Database()
            >>> relation = db.to_relation(df)
            >>> relation.create_view(name="my_view")
            >>> db.view("my_view").to_df()
            shape: (2, 2)
            ┌─────┬─────┐
            │ a   ┆ b   │
            │ --- ┆ --- │
            │ i64 ┆ i64 │
            ╞═════╪═════╡
            │ 1   ┆ 3   │
            │ 2   ┆ 4   │
            └─────┴─────┘
        """
        # Attach the relation to this very Database object, like query() and
        # to_relation() do, so that the enum type cache is shared.
        return Relation(
            self.connection.view(name),
            database=self,
        )

    def create_table(
        self,
        name: str,
        model: Type[ModelType],
    ) -> Relation[ModelType]:
        """
        Create table with schema matching the provided Patito model.

        Fields listed as ``required`` in the model schema become ``not null``
        columns. Enum types needed by the model are created first.

        See :ref:`Relation.insert_into()<duckdb.Relation.insert_into>` for how to insert
        data into the table after creation.
        The :ref:`Relation.create_table()<duckdb.Relation.create_table>` method can also
        be used to create a table from a given relation `and` insert the data at the
        same time.

        Args:
            name: Name of new database table.
            model (Type[Model]): Patito model indicating names and types of table
                columns.
        Returns:
            Relation[ModelType]: Relation pointing to the new table.

        Example:
            >>> from typing import Optional
            >>> import patito as pt
            >>> class MyModel(pt.Model):
            ...     str_column: str
            ...     nullable_string_column: Optional[str]
            ...
            >>> db = pt.duckdb.Database()
            >>> db.create_table(name="my_table", model=MyModel)
            >>> db.table("my_table").types
            {'str_column': VARCHAR, 'nullable_string_column': VARCHAR}
        """
        self.create_enum_types(model=model)
        schema = model.schema()
        non_nullable = schema.get("required", [])
        columns = []
        for column_name, sql_type in model.sql_types.items():
            column = f"{column_name} {sql_type}"
            if column_name in non_nullable:
                column += " not null"
            columns.append(column)
        self.connection.execute(f"create table {name} ({','.join(columns)})")
        # TODO: Fix typing
        return self.table(name).set_model(model)  # pyright: ignore

    def create_enum_types(self, model: Type[ModelType]) -> None:
        """
        Define SQL enum types in DuckDB database.

        Enum types which this object has already created are skipped, and an
        "already exists" error from the database is tolerated.

        Args:
            model: Model for which all Literal-annotated or enum-annotated string fields
                will get respective DuckDB enum types.

        Example:
            >>> import patito as pt
            >>> class EnumModel(pt.Model):
            ...     enum_column: Literal["A", "B", "C"]
            ...
            >>> db = pt.duckdb.Database()
            >>> db.create_enum_types(EnumModel)
            >>> db.enum_types
            {'enum__7ba49365cc1b0fd57e61088b3bc9aa25'}
        """
        import duckdb

        for props in model._schema_properties().values():
            # .get() since a property is not guaranteed to carry a "type" key
            if "enum" not in props or props.get("type") != "string":
                # DuckDB enums only support string values
                continue

            enum_type_name = _enum_type_name(field_properties=props)
            if enum_type_name in self.enum_types:
                # This enum type has already been created
                continue

            # Build proper SQL string literals. Python's repr() switches to double
            # quotes, an identifier in SQL, for values containing a single quote.
            enum_values = ", ".join(
                "'" + value.replace("'", "''") + "'"
                for value in sorted(props["enum"])
            )
            try:
                self.connection.execute(
                    f"create type {enum_type_name} as enum ({enum_values})"
                )
            except duckdb.CatalogException as e:
                if "already exists" not in str(e):
                    raise  # pragma: no cover
            self.enum_types.add(enum_type_name)

    def create_view(
        self,
        name: str,
        data: RelationSource,
    ) -> Relation:
        """Create a view based on the given data source."""
        return self.to_relation(derived_from=data).create_view(name)

    def __contains__(self, table: str) -> bool:
        """
        Return ``True`` if the database contains a table with the given name.

        Args:
            table: The name of the table to be checked for.

        Examples:
            >>> import patito as pt
            >>> db = pt.duckdb.Database()
            >>> "my_table" in db
            False
            >>> db.to_relation("select 1 as a, 2 as b").create_table(name="my_table")
            >>> "my_table" in db
            True
        """
        try:
            self.connection.table(table_name=table)
        except Exception:
            # Deliberately broad: any failure to look up the table is reported as
            # the table not being present.
            return False
        return True