patito 0.5.1__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
patito/database.py DELETED
@@ -1,658 +0,0 @@
1
- """Module containing utilities for retrieving data from external databases."""
2
- import glob
3
- import hashlib
4
- import inspect
5
- import re
6
- from datetime import datetime, timedelta
7
- from functools import wraps
8
- from pathlib import Path
9
- from typing import (
10
- TYPE_CHECKING,
11
- Any,
12
- Callable,
13
- Dict,
14
- Generic,
15
- Optional,
16
- Type,
17
- TypeVar,
18
- Union,
19
- cast,
20
- overload,
21
- )
22
-
23
- import polars as pl
24
- import pyarrow as pa # type: ignore[import]
25
- import pyarrow.parquet as pq # type: ignore[import]
26
- from typing_extensions import Literal, ParamSpec, Protocol
27
-
28
- from patito import xdg
29
-
30
- if TYPE_CHECKING:
31
- from patito import Model
32
-
33
-
34
# Captures the full parameter signature of a wrapped query-constructor function
# so that the resulting wrapper can expose the very same parameters.
P = ParamSpec("P")

# The kind of frame a query wrapper produces: an eager DataFrame or a LazyFrame.
DF = TypeVar(
    "DF",
    bound=Union[pl.DataFrame, pl.LazyFrame],
    covariant=True,
)

# Version tag stored in the metadata of every parquet cache written by
# DatabaseQuery. Bump this integer whenever the caching scheme changes in a
# backwards-incompatible way; cache files carrying a lower version are then
# ignored and regenerated the next time the wrapper tries to read them.
CACHE_VERSION = 1
41
-
42
-
43
class QueryConstructor(Protocol[P]):
    """Structural type of any callable that builds and returns an SQL query string."""

    # The callable's name is used to name cache directories and is recorded in
    # the metadata of the parquet cache files.
    __name__: str

    def __call__(self, *args: P.args, **kwargs: P.kwargs) -> str:
        """
        Build an SQL query from the given parameters.

        Args:
            *args: Positional arguments used to build the SQL query.
            **kwargs: Keyword arguments used to build the SQL query.

        Returns:
            The SQL query in the form of a string.
        """
        ...  # pragma: no cover
57
-
58
-
59
class DatabaseQuery(Generic[P, DF]):
    """A class acting as a function that returns a polars.DataFrame when called."""

    # False: caching disabled. True: one cache file per unique query string.
    # Path: explicit (possibly templated) cache file location.
    _cache: Union[bool, Path]

    def __init__(
        self,
        query_constructor: QueryConstructor[P],
        cache_directory: Path,
        query_handler: Callable[..., pa.Table],
        ttl: timedelta,
        lazy: bool = False,
        cache: Union[str, Path, bool] = False,
        model: Union[Type["Model"], None] = None,
        query_handler_kwargs: Optional[Dict[Any, Any]] = None,
    ) -> None:
        """
        Convert SQL string query function to polars.DataFrame function.

        Args:
            query_constructor: A function that takes arbitrary arguments and returns
                an SQL query string.
            cache_directory: Path to directory to store parquet cache files in.
            query_handler: Function used to execute SQL queries and return arrow
                tables.
            ttl: See Database.query for documentation.
            lazy: See Database.query for documentation.
            cache: See Database.query for documentation.
            model: See Database.query for documentation.
            query_handler_kwargs: Arbitrary keyword arguments forwarded to the provided
                query handler. The given dictionary is copied, never mutated.

        Raises:
            ValueError: If the given path does not have a '.parquet' file extension.
        """
        if not isinstance(cache, bool) and Path(cache).suffix != ".parquet":
            raise ValueError("Cache paths must have the '.parquet' file extension!")

        if isinstance(cache, (Path, str)):
            # Relative paths are interpreted relative to the cache directory
            self._cache = cache_directory.joinpath(cache)
        else:
            self._cache = cache
        self._query_constructor = query_constructor
        self.cache_directory = cache_directory
        self._query_handler = query_handler
        self._ttl = ttl
        self._lazy = lazy
        self._model = model

        # Copy the keyword arguments so that the default inserted below does not
        # leak into a dictionary owned by the caller.
        self._query_handler_kwargs = dict(query_handler_kwargs or {})
        # Unless explicitly specified otherwise by the end-user, we retrieve query
        # results as arrow tables with column types directly supported by polars.
        # Otherwise the resulting parquet files that are written to disk can not be
        # lazily read with polars.scan_parquet.
        self._query_handler_kwargs.setdefault("cast_to_polars_equivalent_types", True)

        # We construct the new function with the same parameter signature as
        # the query constructor, but with a polars frame as the return type.
        @wraps(query_constructor)
        def cached_func(*args: P.args, **kwargs: P.kwargs) -> DF:
            return self._execute(args, kwargs)

        self._cached_func = cached_func

    def _execute(self, args: tuple, kwargs: Dict[str, Any]) -> DF:
        """Return the query result, from a valid cache if any, else by executing it."""
        query = self._query_constructor(*args, **kwargs)
        cache_path = self.cache_path(*args, **kwargs)
        if (
            cache_path is not None
            and cache_path.exists()
            and self._is_cache_valid(cache_path=cache_path, query=query)
        ):
            if self._lazy:
                return pl.scan_parquet(cache_path)  # type: ignore
            return pl.read_parquet(cache_path)  # type: ignore

        arrow_table = self._query_handler(query, **self._query_handler_kwargs)
        if cache_path is not None:
            # We write the cache *before* any potential model validation since
            # we don't want to lose the result of an expensive query just because
            # the model specification is wrong.
            self._write_cache(arrow_table=arrow_table, cache_path=cache_path)

        polars_df = cast(pl.DataFrame, pl.from_arrow(arrow_table))
        if self._model is not None:
            self._model.validate(polars_df)

        if not self._lazy:
            return polars_df  # type: ignore
        if cache_path is None:
            return polars_df.lazy()  # type: ignore

        # Delete in-memory representation of data and read from the new
        # parquet file instead. That way we get consistent memory pressure
        # the first and subsequent times this function is invoked.
        del polars_df, arrow_table
        return pl.scan_parquet(source=cache_path)  # type: ignore

    def _is_cache_valid(self, cache_path: Path, query: str) -> bool:
        """Check if an existing cache file can be re-used for the given query."""
        metadata: Dict[bytes, bytes] = pq.read_schema(cache_path).metadata or {}

        # Check if the cache file was produced by an identical SQL query
        is_same_query = metadata.get(b"query") == query.encode("utf-8")

        # Check if the cache is too old to be re-used. A missing timestamp is
        # treated as a cache created in the year 1900.
        cache_created_time = datetime.fromisoformat(
            metadata.get(b"query_start_time", b"1900-01-01T00:00:00.000000").decode(
                "utf-8"
            )
        )
        is_fresh_cache = (datetime.now() - cache_created_time) < self._ttl

        # Check if the cache was produced by an incompatible version. A missing
        # version field is interpreted as version 0.
        cache_version = int.from_bytes(
            metadata.get(
                b"cache_version",
                (0).to_bytes(length=16, byteorder="little", signed=False),
            ),
            byteorder="little",
            signed=False,
        )
        is_compatible_version = cache_version >= CACHE_VERSION

        return is_same_query and is_fresh_cache and is_compatible_version

    def _write_cache(self, arrow_table: pa.Table, cache_path: Path) -> None:
        """Write the arrow table, including cache metadata, to the given parquet path."""
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        # We use pyarrow.parquet.write_table instead of polars.write_parquet since
        # we want to write the arrow table's metadata to the parquet file, such as
        # the executed query, time, etc.. This metadata is not preserved by polars.
        # The schema metadata is None for tables without any metadata, so we fall
        # back to an empty dictionary in order to support arbitrary query handlers.
        metadata = dict(arrow_table.schema.metadata or {})
        metadata[b"wrapped_function_name"] = self._query_constructor.__name__.encode(
            "utf-8"
        )
        # Store the cache version as a 16-byte unsigned little-endian number
        metadata[b"cache_version"] = CACHE_VERSION.to_bytes(
            length=16,
            byteorder="little",
            signed=False,
        )
        pq.write_table(
            table=arrow_table.replace_schema_metadata(metadata),
            where=cache_path,
            # In order to support nanosecond-resolution timestamps, we must
            # use parquet version >= 2.6.
            version="2.6",
        )

    def cache_path(self, *args: P.args, **kwargs: P.kwargs) -> Optional[Path]:
        """
        Return the path to the parquet cache that would store the result of the query.

        Args:
            *args: The positional arguments passed to the wrapped function.
            **kwargs: The keyword arguments passed to the wrapped function.

        Returns:
            A deterministic path to a parquet cache. None if caching is disabled.
        """
        # We convert args+kwargs to kwargs-only and use it to format the string
        function_signature = inspect.signature(self._query_constructor)
        bound_arguments = function_signature.bind(*args, **kwargs)

        if isinstance(self._cache, Path):
            # Parameters left at their default value must also be available as
            # formatting fields, otherwise a template such as "{year}.parquet"
            # raises KeyError whenever ``year`` is not explicitly provided.
            bound_arguments.apply_defaults()
            return Path(str(self._cache).format(**bound_arguments.arguments))
        elif self._cache is True:
            directory: Path = self.cache_directory / self._query_constructor.__name__
            directory.mkdir(exist_ok=True, parents=True)
            sql_query = self.query_string(*args, **kwargs)
            # The hash is only used to derive a file name, not for security
            sql_query_hash = hashlib.sha1(  # noqa: S324,S303
                sql_query.encode("utf-8")
            ).hexdigest()
            return directory / f"{sql_query_hash}.parquet"
        else:
            return None

    def __call__(self, *args: P.args, **kwargs: P.kwargs) -> DF:  # noqa: D102
        return self._cached_func(*args, **kwargs)

    def query_string(self, *args: P.args, **kwargs: P.kwargs) -> str:
        """
        Return the query to be executed for the given parameters.

        Args:
            *args: Positional arguments used to construct the query string.
            **kwargs: Keyword arguments used to construct the query string.

        Returns:
            The query string produced for the given input parameters.
        """
        return self._query_constructor(*args, **kwargs)

    def refresh_cache(self, *args: P.args, **kwargs: P.kwargs) -> DF:
        """
        Force query execution by refreshing the cache.

        Args:
            *args: Positional arguments used to construct the SQL query string.
            **kwargs: Keyword arguments used to construct the SQL query string.

        Returns:
            A DataFrame representing the result of the newly executed query.
        """
        cache_path = self.cache_path(*args, **kwargs)
        if cache_path and cache_path.exists():
            cache_path.unlink()
        return self._cached_func(*args, **kwargs)

    def clear_caches(self) -> None:
        """Delete all parquet cache files produced by this query wrapper."""
        if self._cache is False:
            # Caching is not enabled, so this is simply a no-op
            return

        if self._cache is True:
            glob_pattern = str(
                self.cache_directory / self._query_constructor.__name__ / "*.parquet"
            )
        else:
            # We replace all formatting specifiers of the form '{variable}' with
            # recursive globs '**' (in case strings containing '/' are inserted) and
            # search for all occurrences of such file paths.
            # For example if cache="{a}/{b}.parquet" is specified, we search for
            # all files matching the glob pattern '**/**.parquet'.
            glob_pattern = re.sub(  # noqa: PD005
                # We specify the reluctant qualifier (?) in order to get narrow matches
                pattern=r"\{.+?\}",
                repl="**",
                string=str(self._cache),
            )

        function_name = self._query_constructor.__name__.encode("utf-8")
        # recursive=True is required for a '**' path component to span several
        # directory levels; without it glob treats '**' exactly like '*'.
        for parquet_path in glob.iglob(glob_pattern, recursive=True):
            try:
                metadata: Dict[bytes, bytes] = (
                    pq.read_schema(where=parquet_path).metadata or {}
                )
                # Only delete files that were written by this very wrapper
                if metadata.get(b"wrapped_function_name") == function_name:
                    Path(parquet_path).unlink()
            except Exception:  # noqa: S112
                # If we can't read the parquet metadata for some reason,
                # it is probably not a cache anyway.
                continue
291
-
292
-
293
class Database:
    """
    Construct manager for executing SQL queries and caching the results.

    Args:
        query_handler: The function that the Database object should use for executing
            SQL queries. Its first argument should be the SQL query string to execute,
            and it should return the query result as an arrow table, for instance
            pyarrow.Table.
        cache_directory: Path to the directory where caches should be stored as parquet
            files. If not provided, the `XDG`_ Base Directory Specification will be
            used to determine the suitable cache directory, by default
            ``~/.cache/patito`` or ``${XDG_CACHE_HOME}/patito``.
        default_ttl: The default Time To Live (TTL), or with other words, how long to
            wait until caches are refreshed due to old age. The given default TTL can be
            overwritten by specifying the ``ttl`` parameter in
            :func:`Database.query`. The default is 52 weeks.

    Examples:
        We start by importing the necessary modules:

        >>> from pathlib import Path
        ...
        >>> import patito as pt
        >>> import pyarrow as pa

        In order to construct a ``Database``, we need to provide the constructor with
        a function that can *execute* query strings. How to construct this function will
        depend on what you actually want to run your queries against, for example a
        local or remote database. For the purposes of demonstration we will use
        SQLite since it is built into Python's standard library, but you can use
        anything; for example Snowflake or PostgreSQL.

        We will use Python's standard library
        `documentation <https://docs.python.org/3/library/sqlite3.html>`_
        to create an in-memory SQLite database.
        It will contain a single table named ``movies`` containing some dummy data.
        The details do not really matter here, the only important part is that we
        construct a database which we can run SQL queries against.

        >>> import sqlite3
        ...
        >>> def dummy_database() -> sqlite3.Cursor:
        ...     connection = sqlite3.connect(":memory:")
        ...     cursor = connection.cursor()
        ...     cursor.execute("CREATE TABLE movies(title, year, score)")
        ...     data = [
        ...         ("Monty Python Live at the Hollywood Bowl", 1982, 7.9),
        ...         ("Monty Python's The Meaning of Life", 1983, 7.5),
        ...         ("Monty Python's Life of Brian", 1979, 8.0),
        ...     ]
        ...     cursor.executemany("INSERT INTO movies VALUES(?, ?, ?)", data)
        ...     connection.commit()
        ...     return cursor

        Using this dummy database, we are now able to construct a function which accepts
        SQL queries as its first parameter, executes the query, and returns the query
        result in the form of an Arrow table.

        >>> def query_handler(query: str) -> pa.Table:
        ...     cursor = dummy_database()
        ...     cursor.execute(query)
        ...     columns = [description[0] for description in cursor.description]
        ...     data = [dict(zip(columns, row)) for row in cursor.fetchall()]
        ...     return pa.Table.from_pylist(data)

        We can now construct a ``Database`` object, providing ``query_handler``
        as the way to execute SQL queries.

        >>> db = pt.Database(query_handler=query_handler)

        The resulting object can now be used to execute SQL queries against the database
        and return the result in the form of a polars ``DataFrame`` object.

        >>> db.query("select * from movies order by year limit 1")
        shape: (1, 3)
        ┌──────────────────────────────┬──────┬───────┐
        │ title                        ┆ year ┆ score │
        │ ---                          ┆ ---  ┆ ---   │
        │ str                          ┆ i64  ┆ f64   │
        ╞══════════════════════════════╪══════╪═══════╡
        │ Monty Python's Life of Brian ┆ 1979 ┆ 8.0   │
        └──────────────────────────────┴──────┴───────┘

        But the main way to use a ``Database`` object is to use the
        ``@Database.as_query`` decorator to wrap functions which return SQL
        query *strings*.

        >>> @db.as_query()
        ... def movies(newer_than_year: int):
        ...     return f"select * from movies where year > {newer_than_year}"

        This decorator will convert the function from producing query strings, to
        actually executing the given query and return the query result in the form of
        a polars ``DataFrame`` object.

        >>> movies(newer_than_year=1980)
        shape: (2, 3)
        ┌───────────────────────────────────┬──────┬───────┐
        │ title                             ┆ year ┆ score │
        │ ---                               ┆ ---  ┆ ---   │
        │ str                               ┆ i64  ┆ f64   │
        ╞═══════════════════════════════════╪══════╪═══════╡
        │ Monty Python Live at the Hollywo… ┆ 1982 ┆ 7.9   │
        │ Monty Python's The Meaning of Li… ┆ 1983 ┆ 7.5   │
        └───────────────────────────────────┴──────┴───────┘

        Caching is not enabled by default, but it can be enabled by specifying
        ``cache=True`` to the ``@db.as_query(...)`` decorator. Other arguments are
        also accepted, such as ``lazy=True`` if you want to retrieve the results in the
        form of a ``LazyFrame`` instead of a ``DataFrame``, ``ttl`` if you want to
        specify another TTL, and any additional keyword arguments are forwarded to
        ``query_handler`` when the SQL query is executed. You can read more about these
        parameters in the documentation of :func:`Database.query`.

    .. _XDG: https://specifications.freedesktop.org/basedir-spec/basedir-spec-latest.html
    """

    # The wrapper class instantiated by Database.as_query
    Query = DatabaseQuery

    def __init__(  # noqa: D107
        self,
        query_handler: Callable[..., pa.Table],
        cache_directory: Optional[Union[str, Path]] = None,
        default_ttl: timedelta = timedelta(weeks=52),  # noqa: B008
    ) -> None:
        self.query_handler = query_handler
        # Plain strings are accepted as well and converted to Path objects, since
        # the directory is created and joined with pathlib operations.
        self.cache_directory = (
            Path(cache_directory)
            if cache_directory
            else xdg.cache_home(application="patito")
        )
        self.default_ttl = default_ttl

        self.cache_directory.mkdir(exist_ok=True, parents=True)

    # With lazy = False a DataFrame-producing wrapper is returned
    @overload
    def as_query(
        self,
        *,
        lazy: Literal[False] = False,
        cache: Union[str, Path, bool] = False,
        ttl: Optional[timedelta] = None,
        model: Union[Type["Model"], None] = None,
        **kwargs: Any,  # noqa: ANN401
    ) -> Callable[[QueryConstructor[P]], DatabaseQuery[P, pl.DataFrame]]:
        ...  # pragma: no cover

    # With lazy = True a LazyFrame-producing wrapper is returned
    @overload
    def as_query(
        self,
        *,
        lazy: Literal[True],
        cache: Union[str, Path, bool] = False,
        ttl: Optional[timedelta] = None,
        model: Union[Type["Model"], None] = None,
        **kwargs: Any,  # noqa: ANN401
    ) -> Callable[[QueryConstructor[P]], DatabaseQuery[P, pl.LazyFrame]]:
        ...  # pragma: no cover

    def as_query(
        self,
        *,
        lazy: bool = False,
        cache: Union[str, Path, bool] = False,
        ttl: Optional[timedelta] = None,
        model: Union[Type["Model"], None] = None,
        **kwargs: Any,  # noqa: ANN401
    ) -> Callable[
        [QueryConstructor[P]], DatabaseQuery[P, Union[pl.DataFrame, pl.LazyFrame]]
    ]:
        """
        Return decorator converting query string functions to dataframe functions.

        The decorated function should return a query string. The resulting wrapper
        executes that query string and returns the result as a polars dataframe.

        Args:
            lazy: If the result should be returned as a LazyFrame rather than a
                DataFrame. Allows more efficient reading from parquet caches if caching
                is enabled.
            cache: If queries should be cached in order to save time and costs.
                The cache will only be used if the exact same SQL string has
                been executed before.
                If the parameter is specified as ``True``, a parquet file is
                created for each unique query string, and is located at:
                    <cache_directory>/<function_name>/<query_sha1_hash>.parquet
                If a string or ``pathlib.Path`` object is provided, the given path
                will be used, but it must have a '.parquet' file extension.
                Relative paths are interpreted relative to the cache directory of
                the ``Database`` object. The given parquet path will be overwritten
                if the query string changes, so only the latest query string value
                will be cached.
            ttl: The Time to Live (TTL) of the cache specified as a datetime.timedelta
                object. When the cache becomes older than the specified TTL, the query
                will be re-executed on the next invocation of the query function
                and the cache will be refreshed. Defaults to the ``default_ttl`` of
                the ``Database`` object.
            model: An optional Patito model used to validate the content of the
                dataframe before return.
            **kwargs: All additional keyword arguments are forwarded to the query
                handler when the query is executed.

        Returns:
            A decorator which converts a function returning a query string into a
            ``DatabaseQuery`` object. Calling that object executes the query produced
            by the original function and returns the result as a polars frame.
        """

        def wrapper(query_constructor: QueryConstructor) -> DatabaseQuery:
            return self.Query(
                query_constructor=query_constructor,
                lazy=lazy,
                cache=cache,
                ttl=ttl if ttl is not None else self.default_ttl,
                cache_directory=self.cache_directory,
                model=model,
                # The handler is wrapped so that the returned arrow tables carry
                # the executed query and its timing as schema metadata.
                query_handler=_with_query_metadata(self.query_handler),
                query_handler_kwargs=kwargs,
            )

        return wrapper

    # With lazy=False, a DataFrame is returned
    @overload
    def query(
        self,
        query: str,
        *,
        lazy: Literal[False] = False,
        cache: Union[str, Path, bool] = False,
        ttl: Optional[timedelta] = None,
        model: Union[Type["Model"], None] = None,
        **kwargs: Any,  # noqa: ANN401
    ) -> pl.DataFrame:
        ...  # pragma: no cover

    # With lazy=True, a LazyFrame is returned
    @overload
    def query(
        self,
        query: str,
        *,
        lazy: Literal[True],
        cache: Union[str, Path, bool] = False,
        ttl: Optional[timedelta] = None,
        model: Union[Type["Model"], None] = None,
        **kwargs: Any,  # noqa: ANN401
    ) -> pl.LazyFrame:
        ...  # pragma: no cover

    def query(
        self,
        query: str,
        *,
        lazy: bool = False,
        cache: Union[str, Path, bool] = False,
        ttl: Optional[timedelta] = None,
        model: Union[Type["Model"], None] = None,
        **kwargs: Any,  # noqa: ANN401
    ) -> Union[pl.DataFrame, pl.LazyFrame]:
        """
        Execute the given query and return the query result as a DataFrame or LazyFrame.

        See :ref:`Database.as_query` for a more powerful way to build and execute
        queries.

        Args:
            query: The query string to be executed, for instance an SQL query.
            lazy: If the query result should be returned in the form of a LazyFrame
                instead of a DataFrame.
            cache: If the query result should be saved and re-used the next time the
                same query is executed. Can also be provided as a path. See
                :func:`Database.as_query` for full documentation.
            ttl: How long to use cached results until the query is re-executed anyway.
            model: A :ref:`Model` to optionally validate the query result.
            **kwargs: All additional keyword arguments are forwarded to the query
                handler which executes the given query.

        Returns:
            The result of the query in the form of a ``DataFrame`` if ``lazy=False``, or
            a ``LazyFrame`` otherwise.

        Examples:
            We will use DuckDB as our example database.

            >>> import duckdb
            >>> import patito as pt

            We will construct a really simple query source from an in-memory database.

            >>> db = duckdb.connect(":memory:")
            >>> query_handler = lambda query: db.cursor().query(query).arrow()
            >>> query_source = pt.Database(query_handler=query_handler)

            We can now use :func:`Database.query` in order to execute queries against
            the in-memory database.

            >>> query_source.query("select 1 as a, 2 as b, 3 as c")
            shape: (1, 3)
            ┌─────┬─────┬─────┐
            │ a   ┆ b   ┆ c   │
            │ --- ┆ --- ┆ --- │
            │ i32 ┆ i32 ┆ i32 │
            ╞═════╪═════╪═════╡
            │ 1   ┆ 2   ┆ 3   │
            └─────┴─────┴─────┘
        """

        # The name of this function determines the cache sub-directory and the
        # function name recorded in the cache metadata, so it must stay unchanged.
        def __direct_query() -> str:
            """
            A regular named function in order to store parquet files correctly.

            Returns:
                The user-provided query string.
            """
            return query

        return self.as_query(
            lazy=lazy,  # type: ignore
            cache=cache,
            ttl=ttl,
            model=model,
            **kwargs,
        )(__direct_query)()
611
-
612
-
613
def _with_query_metadata(query_handler: Callable[P, pa.Table]) -> Callable[P, pa.Table]:
    """
    Wrap SQL-query handler with additional logic.

    Args:
        query_handler: Function accepting an SQL query as its first argument and
            returning an Arrow table.

    Returns:
        New function that returns Arrow table with additional metadata, namely the
        executed query and the start and end time of the query execution. Arrow types
        which are not supported by polars directly have also been converted to
        compatible ones, unless the wrapper is invoked with
        ``cast_to_polars_equivalent_types=False``.
    """

    def as_bytes(value: Union[str, bytes]) -> bytes:
        """Return UTF-8 encoded version of the given metadata value."""
        return value if isinstance(value, bytes) else value.encode("utf-8")

    @wraps(query_handler)
    def wrapped_query_handler(
        *args: P.args,
        **kwargs: P.kwargs,
    ) -> pa.Table:
        # This keyword argument is intended for this wrapper only, so it is
        # removed before the remaining arguments are forwarded to the handler.
        cast_to_polars_equivalent_types = kwargs.pop(
            "cast_to_polars_equivalent_types", True
        )
        start_time = datetime.now()
        arrow_table = query_handler(*args, **kwargs)
        finish_time = datetime.now()
        # Copy the existing metadata (None if the table has none) before the
        # optional type conversion below replaces the table object.
        metadata: dict = dict(arrow_table.schema.metadata or {})
        if cast_to_polars_equivalent_types:
            # We perform a round-trip to polars and back in order to get an arrow table
            # with column types that are directly supported by polars.
            arrow_table = pl.from_arrow(arrow_table).to_arrow()

        # Store additional metadata which is useful when the arrow table is written to a
        # parquet file as a caching mechanism. The cache reader looks these entries
        # up by bytes keys, so bytes keys are used here as well; that way the new
        # values replace, rather than sit alongside, any same-named entries
        # already present in the metadata returned by the handler.
        metadata.update(
            {
                b"query": as_bytes(args[0]),
                b"query_start_time": as_bytes(start_time.isoformat()),
                b"query_end_time": as_bytes(finish_time.isoformat()),
            }
        )
        return arrow_table.replace_schema_metadata(metadata)

    return wrapped_query_handler
656
-
657
-
658
# Only the Database class is part of the public interface of this module.
__all__ = [
    "Database",
]