patito-0.4.3-py3-none-any.whl → patito-0.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- patito/__init__.py +12 -6
- patito/database.py +658 -0
- patito/duckdb.py +153 -186
- patito/polars.py +52 -45
- patito/pydantic.py +99 -88
- patito/sql.py +2 -3
- patito/validators.py +87 -1
- patito/xdg.py +22 -0
- {patito-0.4.3.dist-info → patito-0.5.0.dist-info}/LICENSE +1 -0
- {patito-0.4.3.dist-info → patito-0.5.0.dist-info}/METADATA +18 -17
- patito-0.5.0.dist-info/RECORD +14 -0
- {patito-0.4.3.dist-info → patito-0.5.0.dist-info}/WHEEL +1 -1
- patito-0.4.3.dist-info/RECORD +0 -12
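Two of the listed files are brand new: patito/xdg.py implements the XDG Base Directory lookup that patito/database.py uses to place its parquet query caches. The implementation of patito/xdg.py itself is not shown in this diff, so the following is only a sketch of the behaviour implied by database.py's call to xdg.cache_home(application="patito") and by the default documented in the Database docstring (${XDG_CACHE_HOME}/patito, falling back to ~/.cache/patito):

    import os
    from pathlib import Path

    # Hypothetical stand-in for patito.xdg.cache_home; mirrors the documented
    # default of ${XDG_CACHE_HOME}/patito with a ~/.cache/patito fallback.
    def cache_home(application: str) -> Path:
        base = os.environ.get("XDG_CACHE_HOME") or str(Path.home() / ".cache")
        return Path(base) / application

    print(cache_home(application="patito"))  # e.g. /home/user/.cache/patito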
patito/__init__.py
CHANGED
@@ -6,6 +6,7 @@ from patito.exceptions import ValidationError
 from patito.polars import DataFrame, LazyFrame
 from patito.pydantic import Field, Model
 
+_CACHING_AVAILABLE = False
 _DUCKDB_AVAILABLE = False
 field = col("_")
 __all__ = [
@@ -16,6 +17,7 @@ __all__ = [
     "Model",
     "Series",
     "ValidationError",
+    "_CACHING_AVAILABLE",
     "_DUCKDB_AVAILABLE",
     "col",
     "exceptions",
@@ -24,17 +26,21 @@ __all__ = [
 ]
 
 try:
-    from patito
+    from patito import duckdb
 
     _DUCKDB_AVAILABLE = True
-    __all__ += [
-        "Database",
-        "Relation",
-        "RelationSource",
-    ]
+    __all__ += ["duckdb"]
 except ImportError:  # pragma: no cover
     pass
 
+try:
+    from patito.database import Database
+
+    _CACHING_AVAILABLE = True
+    __all__ += ["Database"]
+except ImportError:
+    pass
+
 
 try:
     from importlib.metadata import PackageNotFoundError, version
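The net effect of this change: patito.Database now refers to the new caching layer in patito.database, the DuckDB wrapper classes are no longer re-exported at the top level, and each optional feature is guarded by its own flag. A minimal sketch of feature detection in downstream code, assuming only the names exported by the diff above:

    import patito as pt

    # Both flags are defined unconditionally and flipped to True only when the
    # corresponding optional import succeeds (see the try/except blocks above).
    if pt._CACHING_AVAILABLE:
        print("Query caching is available:", pt.Database)

    if pt._DUCKDB_AVAILABLE:
        # In 0.5.0 the DuckDB functionality lives in the patito.duckdb
        # submodule instead of being re-exported as patito.Database et al.
        from patito import duckdb

        print("DuckDB integration is available:", duckdb)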
patito/database.py
ADDED
@@ -0,0 +1,658 @@
+"""Module containing utilities for retrieving data from external databases."""
+import glob
+import hashlib
+import inspect
+import re
+from datetime import datetime, timedelta
+from functools import wraps
+from pathlib import Path
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Dict,
+    Generic,
+    Optional,
+    Type,
+    TypeVar,
+    Union,
+    cast,
+    overload,
+)
+
+import polars as pl
+import pyarrow as pa  # type: ignore[import]
+import pyarrow.parquet as pq  # type: ignore[import]
+from typing_extensions import Literal, ParamSpec, Protocol
+
+from patito import xdg
+
+if TYPE_CHECKING:
+    from patito import Model
+
+
+P = ParamSpec("P")
+DF = TypeVar("DF", bound=Union[pl.DataFrame, pl.LazyFrame], covariant=True)
+
+# Increment this integer whenever you make backwards-incompatible changes to
+# the parquet caching implemented in DatabaseQuery; such caches are then
+# invalidated the next time the wrapper tries to read from them.
+CACHE_VERSION = 1
+
+
+class QueryConstructor(Protocol[P]):
+    """A function taking arbitrary arguments and returning an SQL query string."""
+
+    __name__: str
+
+    def __call__(self, *args: P.args, **kwargs: P.kwargs) -> str:
+        """
+        Return SQL query constructed from the given parameters.
+
+        Args:
+            *args: Positional arguments used to build SQL query.
+            **kwargs: Keyword arguments used to build SQL query.
+        """
+        ...  # pragma: no cover
+
+
+class DatabaseQuery(Generic[P, DF]):
+    """A class acting as a function that returns a polars.DataFrame when called."""
+
+    _cache: Union[bool, Path]
+
+    def __init__(  # noqa: C901
+        self,
+        query_constructor: QueryConstructor[P],
+        cache_directory: Path,
+        query_handler: Callable[..., pa.Table],
+        ttl: timedelta,
+        lazy: bool = False,
+        cache: Union[str, Path, bool] = False,
+        model: Union[Type["Model"], None] = None,
+        query_handler_kwargs: Optional[Dict[Any, Any]] = None,
+    ) -> None:
+        """
+        Convert SQL string query function to polars.DataFrame function.
+
+        Args:
+            query_constructor: A function that takes arbitrary arguments and returns
+                an SQL query string.
+            cache_directory: Path to directory to store parquet cache files in.
+            query_handler: Function used to execute SQL queries and return arrow
+                tables.
+            ttl: See Database.query for documentation.
+            lazy: See Database.query for documentation.
+            cache: See Database.query for documentation.
+            model: See Database.query for documentation.
+            query_handler_kwargs: Arbitrary keyword arguments forwarded to the provided
+                query handler.
+
+        Raises:
+            ValueError: If the given path does not have a '.parquet' file extension.
+        """
+        if not isinstance(cache, bool) and Path(cache).suffix != ".parquet":
+            raise ValueError("Cache paths must have the '.parquet' file extension!")
+
+        if isinstance(cache, (Path, str)):
+            self._cache = cache_directory.joinpath(cache)
+        else:
+            self._cache = cache
+        self._query_constructor = query_constructor
+        self.cache_directory = cache_directory
+
+        self._query_handler_kwargs = query_handler_kwargs or {}
+        # Unless explicitly specified otherwise by the end-user, we retrieve query
+        # results as arrow tables with column types directly supported by polars.
+        # Otherwise the resulting parquet files that are written to disk cannot be
+        # lazily read with polars.scan_parquet.
+        self._query_handler_kwargs.setdefault("cast_to_polars_equivalent_types", True)
+
+        # We construct the new function with the same parameter signature as
+        # wrapped_function, but with polars.DataFrame as the return type.
+        @wraps(query_constructor)
+        def cached_func(*args: P.args, **kwargs: P.kwargs) -> DF:
+            query = query_constructor(*args, **kwargs)
+            cache_path = self.cache_path(*args, **kwargs)
+            if cache_path and cache_path.exists():
+                metadata: Dict[bytes, bytes] = pq.read_schema(cache_path).metadata or {}
+
+                # Check if the cache file was produced by an identical SQL query
+                is_same_query = metadata.get(b"query") == query.encode("utf-8")
+
+                # Check if the cache is too old to be re-used
+                cache_created_time = datetime.fromisoformat(
+                    metadata.get(
+                        b"query_start_time", b"1900-01-01T00:00:00.000000"
+                    ).decode("utf-8")
+                )
+                is_fresh_cache = (datetime.now() - cache_created_time) < ttl
+
+                # Check if the cache was produced by an incompatible version
+                cache_version = int.from_bytes(
+                    metadata.get(
+                        b"cache_version",
+                        (0).to_bytes(length=16, byteorder="little", signed=False),
+                    ),
+                    byteorder="little",
+                    signed=False,
+                )
+                is_compatible_version = cache_version >= CACHE_VERSION
+
+                if is_same_query and is_fresh_cache and is_compatible_version:
+                    if lazy:
+                        return pl.scan_parquet(cache_path)  # type: ignore
+                    else:
+                        return pl.read_parquet(cache_path)  # type: ignore
+
+            arrow_table = query_handler(query, **self._query_handler_kwargs)
+            if cache_path:
+                cache_path.parent.mkdir(parents=True, exist_ok=True)
+                # We write the cache *before* any potential model validation since
+                # we don't want to lose the result of an expensive query just because
+                # the model specification is wrong.
+                # We also use pyarrow.parquet.write_table instead of
+                # polars.write_parquet since we want to write the arrow table's metadata
+                # to the parquet file, such as the executed query, time, etc.
+                # This metadata is not preserved by polars.
+                metadata = arrow_table.schema.metadata
+                metadata[
+                    b"wrapped_function_name"
+                ] = self._query_constructor.__name__.encode("utf-8")
+                # Store the cache version as a 16-byte unsigned little-endian number
+                metadata[b"cache_version"] = CACHE_VERSION.to_bytes(
+                    length=16,
+                    byteorder="little",
+                    signed=False,
+                )
+                pq.write_table(
+                    table=arrow_table.replace_schema_metadata(metadata),
+                    where=cache_path,
+                    # In order to support nanosecond-resolution timestamps, we must
+                    # use parquet version >= 2.6.
+                    version="2.6",
+                )
+
+            polars_df = cast(pl.DataFrame, pl.from_arrow(arrow_table))
+            if model:
+                model.validate(polars_df)
+
+            if lazy:
+                if cache_path:
+                    # Delete in-memory representation of data and read from the new
+                    # parquet file instead. That way we get consistent memory pressure
+                    # the first and subsequent times this function is invoked.
+                    del polars_df, arrow_table
+                    return pl.scan_parquet(source=cache_path)  # type: ignore
+                else:
+                    return polars_df.lazy()  # type: ignore
+            else:
+                return polars_df  # type: ignore
+
+        self._cached_func = cached_func
+
+    def cache_path(self, *args: P.args, **kwargs: P.kwargs) -> Optional[Path]:
+        """
+        Return the path to the parquet cache that would store the result of the query.
+
+        Args:
+            args: The positional arguments passed to the wrapped function.
+            kwargs: The keyword arguments passed to the wrapped function.
+
+        Returns:
+            A deterministic path to a parquet cache. None if caching is disabled.
+        """
+        # We convert args+kwargs to kwargs-only and use it to format the string
+        function_signature = inspect.signature(self._query_constructor)
+        bound_arguments = function_signature.bind(*args, **kwargs)
+
+        if isinstance(self._cache, Path):
+            # Interpret relative paths relative to the main query cache directory
+            return Path(str(self._cache).format(**bound_arguments.arguments))
+        elif self._cache is True:
+            directory: Path = self.cache_directory / self._query_constructor.__name__
+            directory.mkdir(exist_ok=True, parents=True)
+            sql_query = self.query_string(*args, **kwargs)
+            sql_query_hash = hashlib.sha1(  # noqa: S324,S303
+                sql_query.encode("utf-8")
+            ).hexdigest()
+            return directory / f"{sql_query_hash}.parquet"
+        else:
+            return None
+
+    def __call__(self, *args: P.args, **kwargs: P.kwargs) -> DF:  # noqa: D102
+        return self._cached_func(*args, **kwargs)
+
+    def query_string(self, *args: P.args, **kwargs: P.kwargs) -> str:
+        """
+        Return the query to be executed for the given parameters.
+
+        Args:
+            *args: Positional arguments used to construct the query string.
+            **kwargs: Keyword arguments used to construct the query string.
+
+        Returns:
+            The query string produced for the given input parameters.
+        """
+        return self._query_constructor(*args, **kwargs)
+
+    def refresh_cache(self, *args: P.args, **kwargs: P.kwargs) -> DF:
+        """
+        Force query execution by refreshing the cache.
+
+        Args:
+            *args: Positional arguments used to construct the SQL query string.
+            **kwargs: Keyword arguments used to construct the SQL query string.
+
+        Returns:
+            A DataFrame representing the result of the newly executed query.
+        """
+        cache_path = self.cache_path(*args, **kwargs)
+        if cache_path and cache_path.exists():
+            cache_path.unlink()
+        return self._cached_func(*args, **kwargs)
+
+    def clear_caches(self) -> None:
+        """Delete all parquet cache files produced by this query wrapper."""
+        if self._cache is False:
+            # Caching is not enabled, so this is simply a no-op
+            return
+
+        if self._cache is True:
+            glob_pattern = str(
+                self.cache_directory / self._query_constructor.__name__ / "*.parquet"
+            )
+        else:
+            # We replace all formatting specifiers of the form '{variable}' with
+            # recursive globs '**' (in case strings containing '/' are inserted) and
+            # search for all occurrences of such file paths.
+            # For example if cache="{a}/{b}.parquet" is specified, we search for
+            # all files matching the glob pattern '**/**.parquet'.
+            glob_pattern = re.sub(  # noqa: PD005
+                # We specify the reluctant qualifier (?) in order to get narrow matches
+                pattern=r"\{.+?\}",
+                repl="**",
+                string=str(self._cache),
+            )
+
+        for parquet_path in glob.iglob(glob_pattern):
+            try:
+                metadata: Dict[bytes, bytes] = (
+                    pq.read_schema(where=parquet_path).metadata or {}
+                )
+                if metadata.get(
+                    b"wrapped_function_name"
+                ) == self._query_constructor.__name__.encode("utf-8"):
+                    Path(parquet_path).unlink()
+            except Exception:  # noqa: S112
+                # If we can't read the parquet metadata for some reason,
+                # it is probably not a cache anyway.
+                continue
+
+
+class Database:
+    """
+    Construct a manager for executing SQL queries and caching the results.
+
+    Args:
+        query_handler: The function that the Database object should use for executing
+            SQL queries. Its first argument should be the SQL query string to execute,
+            and it should return the query result as an arrow table, for instance
+            pyarrow.Table.
+        cache_directory: Path to the directory where caches should be stored as parquet
+            files. If not provided, the `XDG`_ Base Directory Specification will be
+            used to determine the suitable cache directory, by default
+            ``~/.cache/patito`` or ``${XDG_CACHE_HOME}/patito``.
+        default_ttl: The default Time To Live (TTL), or in other words, how long to
+            wait until caches are refreshed due to old age. The given default TTL can
+            be overwritten by specifying the ``ttl`` parameter in
+            :func:`Database.query`. The default is 52 weeks.
+
+    Examples:
+        We start by importing the necessary modules:
+
+        >>> from pathlib import Path
+        ...
+        >>> import patito as pt
+        >>> import pyarrow as pa
+
+        In order to construct a ``Database``, we need to provide the constructor with
+        a function that can *execute* query strings. How to construct this function
+        will depend on what you actually want to run your queries against, for
+        example a local or remote database. For the purposes of demonstration we will
+        use SQLite since it is built into Python's standard library, but you can use
+        anything, for example Snowflake or PostgreSQL.
+
+        We will use Python's standard library
+        `documentation <https://docs.python.org/3/library/sqlite3.html>`_
+        to create an in-memory SQLite database.
+        It will contain a single table named ``movies`` containing some dummy data.
+        The details do not really matter here; the only important part is that we
+        construct a database which we can run SQL queries against.
+
+        >>> import sqlite3
+        ...
+        >>> def dummy_database() -> sqlite3.Cursor:
+        ...     connection = sqlite3.connect(":memory:")
+        ...     cursor = connection.cursor()
+        ...     cursor.execute("CREATE TABLE movies(title, year, score)")
+        ...     data = [
+        ...         ("Monty Python Live at the Hollywood Bowl", 1982, 7.9),
+        ...         ("Monty Python's The Meaning of Life", 1983, 7.5),
+        ...         ("Monty Python's Life of Brian", 1979, 8.0),
+        ...     ]
+        ...     cursor.executemany("INSERT INTO movies VALUES(?, ?, ?)", data)
+        ...     connection.commit()
+        ...     return cursor
+
+        Using this dummy database, we are now able to construct a function which
+        accepts SQL queries as its first parameter, executes the query, and returns
+        the query result in the form of an Arrow table.
+
+        >>> def query_handler(query: str) -> pa.Table:
+        ...     cursor = dummy_database()
+        ...     cursor.execute(query)
+        ...     columns = [description[0] for description in cursor.description]
+        ...     data = [dict(zip(columns, row)) for row in cursor.fetchall()]
+        ...     return pa.Table.from_pylist(data)
+
+        We can now construct a ``Database`` object, providing ``query_handler``
+        as the way to execute SQL queries.
+
+        >>> db = pt.Database(query_handler=query_handler)
+
+        The resulting object can now be used to execute SQL queries against the
+        database and return the result in the form of a polars ``DataFrame`` object.
+
+        >>> db.query("select * from movies order by year limit 1")
+        shape: (1, 3)
+        ┌──────────────────────────────┬──────┬───────┐
+        │ title                        ┆ year ┆ score │
+        │ ---                          ┆ ---  ┆ ---   │
+        │ str                          ┆ i64  ┆ f64   │
+        ╞══════════════════════════════╪══════╪═══════╡
+        │ Monty Python's Life of Brian ┆ 1979 ┆ 8.0   │
+        └──────────────────────────────┴──────┴───────┘
+
+        But the main way to use a ``Database`` object is to use the
+        ``@Database.as_query`` decorator to wrap functions which return SQL
+        query *strings*.
+
+        >>> @db.as_query()
+        ... def movies(newer_than_year: int):
+        ...     return f"select * from movies where year > {newer_than_year}"
+
+        This decorator will convert the function from producing query strings to
+        actually executing the given query, returning the query result in the form
+        of a polars ``DataFrame`` object.
+
+        >>> movies(newer_than_year=1980)
+        shape: (2, 3)
+        ┌───────────────────────────────────┬──────┬───────┐
+        │ title                             ┆ year ┆ score │
+        │ ---                               ┆ ---  ┆ ---   │
+        │ str                               ┆ i64  ┆ f64   │
+        ╞═══════════════════════════════════╪══════╪═══════╡
+        │ Monty Python Live at the Hollywo… ┆ 1982 ┆ 7.9   │
+        │ Monty Python's The Meaning of Li… ┆ 1983 ┆ 7.5   │
+        └───────────────────────────────────┴──────┴───────┘
+
+        Caching is not enabled by default, but it can be enabled by specifying
+        ``cache=True`` to the ``@db.as_query(...)`` decorator. Other arguments are
+        also accepted, such as ``lazy=True`` if you want to retrieve the results in the
+        form of a ``LazyFrame`` instead of a ``DataFrame``, ``ttl`` if you want to
+        specify another TTL, and any additional keyword arguments are forwarded to
+        ``query_handler`` when the SQL query is executed. You can read more about these
+        parameters in the documentation of :func:`Database.query`.
+
+    .. _XDG: https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html
+    """
+
+    Query = DatabaseQuery
+
+    def __init__(  # noqa: D107
+        self,
+        query_handler: Callable[..., pa.Table],
+        cache_directory: Optional[Path] = None,
+        default_ttl: timedelta = timedelta(weeks=52),  # noqa: B008
+    ) -> None:
+        self.query_handler = query_handler
+        self.cache_directory = cache_directory or xdg.cache_home(application="patito")
+        self.default_ttl = default_ttl
+
+        self.cache_directory.mkdir(exist_ok=True, parents=True)
+
+    # With lazy = False a DataFrame-producing wrapper is returned
+    @overload
+    def as_query(
+        self,
+        *,
+        lazy: Literal[False] = False,
+        cache: Union[str, Path, bool] = False,
+        ttl: Optional[timedelta] = None,
+        model: Union[Type["Model"], None] = None,
+        **kwargs: Any,  # noqa: ANN401
+    ) -> Callable[[QueryConstructor[P]], DatabaseQuery[P, pl.DataFrame]]:
+        ...  # pragma: no cover
+
+    # With lazy = True a LazyFrame-producing wrapper is returned
+    @overload
+    def as_query(
+        self,
+        *,
+        lazy: Literal[True],
+        cache: Union[str, Path, bool] = False,
+        ttl: Optional[timedelta] = None,
+        model: Union[Type["Model"], None] = None,
+        **kwargs: Any,  # noqa: ANN401
+    ) -> Callable[[QueryConstructor[P]], DatabaseQuery[P, pl.LazyFrame]]:
+        ...  # pragma: no cover
+
+    def as_query(
+        self,
+        *,
+        lazy: bool = False,
+        cache: Union[str, Path, bool] = False,
+        ttl: Optional[timedelta] = None,
+        model: Union[Type["Model"], None] = None,
+        **kwargs: Any,  # noqa: ANN401
+    ) -> Callable[
+        [QueryConstructor[P]], DatabaseQuery[P, Union[pl.DataFrame, pl.LazyFrame]]
+    ]:
+        """
+        Convert a function that returns an SQL query string into one that executes it.
+
+        Args:
+            lazy: If the result should be returned as a LazyFrame rather than a
+                DataFrame. Allows more efficient reading from parquet caches if caching
+                is enabled.
+            cache: If queries should be cached in order to save time and costs.
+                The cache will only be used if the exact same SQL string has
+                been executed before.
+                If the parameter is specified as ``True``, a parquet file is
+                created for each unique query string, and is located at:
+                <cache_directory>/<function_name>/<query_sha1_hash>.parquet
+                If a string or ``pathlib.Path`` object is provided, the given path
+                will be used, but it must have a '.parquet' file extension.
+                Relative paths are interpreted relative to the cache directory of
+                the ``Database`` object. The given parquet path will be overwritten
+                if the query string changes, so only the latest query string value
+                will be cached.
+            ttl: The Time to Live (TTL) of the cache specified as a datetime.timedelta
+                object. When the cache becomes older than the specified TTL, the query
+                will be re-executed on the next invocation of the query function
+                and the cache will be refreshed.
+            model: An optional Patito model used to validate the content of the
+                dataframe before it is returned.
+            **kwargs: Additional keyword arguments forwarded to the query handler
+                when the query is executed.
+
+        Returns:
+            A new function which returns a polars DataFrame based on the query
+            specified by the original function's return string.
+        """
+
+        def wrapper(query_constructor: QueryConstructor) -> DatabaseQuery:
+            return self.Query(
+                query_constructor=query_constructor,
+                lazy=lazy,
+                cache=cache,
+                ttl=ttl if ttl is not None else self.default_ttl,
+                cache_directory=self.cache_directory,
+                model=model,
+                query_handler=_with_query_metadata(self.query_handler),
+                query_handler_kwargs=kwargs,
+            )
+
+        return wrapper
+
+    # With lazy=False, a DataFrame is returned
+    @overload
+    def query(
+        self,
+        query: str,
+        *,
+        lazy: Literal[False] = False,
+        cache: Union[str, Path, bool] = False,
+        ttl: Optional[timedelta] = None,
+        model: Union[Type["Model"], None] = None,
+        **kwargs: Any,  # noqa: ANN401
+    ) -> pl.DataFrame:
+        ...  # pragma: no cover
+
+    # With lazy=True, a LazyFrame is returned
+    @overload
+    def query(
+        self,
+        query: str,
+        *,
+        lazy: Literal[True],
+        cache: Union[str, Path, bool] = False,
+        ttl: Optional[timedelta] = None,
+        model: Union[Type["Model"], None] = None,
+        **kwargs: Any,  # noqa: ANN401
+    ) -> pl.LazyFrame:
+        ...  # pragma: no cover
+
+    def query(
+        self,
+        query: str,
+        *,
+        lazy: bool = False,
+        cache: Union[str, Path, bool] = False,
+        ttl: Optional[timedelta] = None,
+        model: Union[Type["Model"], None] = None,
+        **kwargs: Any,  # noqa: ANN401
+    ) -> Union[pl.DataFrame, pl.LazyFrame]:
+        """
+        Execute the given query and return the result as a DataFrame or LazyFrame.
+
+        See :ref:`Database.as_query` for a more powerful way to build and execute
+        queries.
+
+        Args:
+            query: The query string to be executed, for instance an SQL query.
+            lazy: If the query result should be returned in the form of a LazyFrame
+                instead of a DataFrame.
+            cache: If the query result should be saved and re-used the next time the
+                same query is executed. Can also be provided as a path. See
+                :func:`Database.as_query` for full documentation.
+            ttl: How long to use cached results until the query is re-executed anyway.
+            model: A :ref:`Model` to optionally validate the query result.
+            **kwargs: All additional keyword arguments are forwarded to the query
+                handler which executes the given query.
+
+        Returns:
+            The result of the query in the form of a ``DataFrame`` if ``lazy=False``,
+            or a ``LazyFrame`` otherwise.
+
+        Examples:
+            We will use DuckDB as our example database.
+
+            >>> import duckdb
+            >>> import patito as pt
+
+            We will construct a really simple query source from an in-memory database.
+
+            >>> db = duckdb.connect(":memory:")
+            >>> query_handler = lambda query: db.cursor().query(query).arrow()
+            >>> query_source = pt.Database(query_handler=query_handler)
+
+            We can now use :func:`Database.query` in order to execute queries against
+            the in-memory database.
+
+            >>> query_source.query("select 1 as a, 2 as b, 3 as c")
+            shape: (1, 3)
+            ┌─────┬─────┬─────┐
+            │ a   ┆ b   ┆ c   │
+            │ --- ┆ --- ┆ --- │
+            │ i32 ┆ i32 ┆ i32 │
+            ╞═════╪═════╪═════╡
+            │ 1   ┆ 2   ┆ 3   │
+            └─────┴─────┴─────┘
+        """
+
+        def __direct_query() -> str:
+            """
+            A regular named function in order to store parquet files correctly.
+
+            Returns:
+                The user-provided query string.
+            """
+            return query
+
+        return self.as_query(
+            lazy=lazy,  # type: ignore
+            cache=cache,
+            ttl=ttl,
+            model=model,
+            **kwargs,
+        )(__direct_query)()
+
+
+def _with_query_metadata(query_handler: Callable[P, pa.Table]) -> Callable[P, pa.Table]:
+    """
+    Wrap SQL-query handler with additional logic.
+
+    Args:
+        query_handler: Function accepting an SQL query as its first argument and
+            returning an Arrow table.
+
+    Returns:
+        New function that returns an Arrow table with additional metadata. Arrow types
+        which are not supported by polars directly have also been converted to
+        compatible ones where applicable.
+    """
+
+    @wraps(query_handler)
+    def wrapped_query_handler(
+        *args: P.args,
+        **kwargs: P.kwargs,
+    ) -> pa.Table:
+        cast_to_polars_equivalent_types = kwargs.pop(
+            "cast_to_polars_equivalent_types", True
+        )
+        start_time = datetime.now()
+        arrow_table = query_handler(*args, **kwargs)
+        finish_time = datetime.now()
+        metadata: dict = arrow_table.schema.metadata or {}
+        if cast_to_polars_equivalent_types:
+            # We perform a round-trip to polars and back in order to get an arrow table
+            # with column types that are directly supported by polars.
+            arrow_table = pl.from_arrow(arrow_table).to_arrow()
+
+        # Store additional metadata which is useful when the arrow table is written to
+        # a parquet file as a caching mechanism.
+        metadata.update(
+            {
+                "query": args[0],
+                "query_start_time": start_time.isoformat(),
+                "query_end_time": finish_time.isoformat(),
+            }
+        )
+        return arrow_table.replace_schema_metadata(metadata)
+
+    return wrapped_query_handler
+
+
+__all__ = ["Database"]
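Putting the new module together end to end: the wrapper returned by Database.as_query is a DatabaseQuery, so it also exposes the refresh_cache and clear_caches methods defined above. The following sketch combines them with the DuckDB-backed query handler shown in the Database.query docstring; the ttl value and the range query are illustrative choices, not part of this diff:

    from datetime import timedelta

    import duckdb
    import patito as pt

    connection = duckdb.connect(":memory:")

    def query_handler(query: str):
        # Execute the query against DuckDB and return the result as an Arrow table
        return connection.cursor().query(query).arrow()

    db = pt.Database(query_handler=query_handler)

    @db.as_query(cache=True, ttl=timedelta(hours=12))
    def range_table(upper: int):
        return f"select * from range({upper})"

    df = range_table(upper=3)  # executes the query and writes a parquet cache
    df = range_table(upper=3)  # now served from the parquet cache instead
    df = range_table.refresh_cache(upper=3)  # force re-execution of the query
    range_table.clear_caches()  # delete every cache file this wrapper wrote

With cache=True the parquet file name is derived from a SHA-1 hash of the rendered query string, so range_table(upper=3) and range_table(upper=4) are cached in separate files under <cache_directory>/range_table/.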