polars-runtime-compat 1.34.0b2__cp39-abi3-manylinux_2_24_aarch64.whl
This diff shows the content of publicly available package versions released to one of the supported registries. The information in this diff is provided for informational purposes only, and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of polars-runtime-compat might be problematic. Click here for more details.
- _polars_runtime_compat/.gitkeep +0 -0
- _polars_runtime_compat/_polars_runtime_compat.abi3.so +0 -0
- polars/__init__.py +528 -0
- polars/_cpu_check.py +265 -0
- polars/_dependencies.py +355 -0
- polars/_plr.py +99 -0
- polars/_plr.pyi +2496 -0
- polars/_reexport.py +23 -0
- polars/_typing.py +478 -0
- polars/_utils/__init__.py +37 -0
- polars/_utils/async_.py +102 -0
- polars/_utils/cache.py +176 -0
- polars/_utils/cloud.py +40 -0
- polars/_utils/constants.py +29 -0
- polars/_utils/construction/__init__.py +46 -0
- polars/_utils/construction/dataframe.py +1397 -0
- polars/_utils/construction/other.py +72 -0
- polars/_utils/construction/series.py +560 -0
- polars/_utils/construction/utils.py +118 -0
- polars/_utils/convert.py +224 -0
- polars/_utils/deprecation.py +406 -0
- polars/_utils/getitem.py +457 -0
- polars/_utils/logging.py +11 -0
- polars/_utils/nest_asyncio.py +264 -0
- polars/_utils/parquet.py +15 -0
- polars/_utils/parse/__init__.py +12 -0
- polars/_utils/parse/expr.py +242 -0
- polars/_utils/polars_version.py +19 -0
- polars/_utils/pycapsule.py +53 -0
- polars/_utils/scan.py +27 -0
- polars/_utils/serde.py +63 -0
- polars/_utils/slice.py +215 -0
- polars/_utils/udfs.py +1251 -0
- polars/_utils/unstable.py +63 -0
- polars/_utils/various.py +782 -0
- polars/_utils/wrap.py +25 -0
- polars/api.py +370 -0
- polars/catalog/__init__.py +0 -0
- polars/catalog/unity/__init__.py +19 -0
- polars/catalog/unity/client.py +733 -0
- polars/catalog/unity/models.py +152 -0
- polars/config.py +1571 -0
- polars/convert/__init__.py +25 -0
- polars/convert/general.py +1046 -0
- polars/convert/normalize.py +261 -0
- polars/dataframe/__init__.py +5 -0
- polars/dataframe/_html.py +186 -0
- polars/dataframe/frame.py +12582 -0
- polars/dataframe/group_by.py +1067 -0
- polars/dataframe/plotting.py +257 -0
- polars/datatype_expr/__init__.py +5 -0
- polars/datatype_expr/array.py +56 -0
- polars/datatype_expr/datatype_expr.py +304 -0
- polars/datatype_expr/list.py +18 -0
- polars/datatype_expr/struct.py +69 -0
- polars/datatypes/__init__.py +122 -0
- polars/datatypes/_parse.py +195 -0
- polars/datatypes/_utils.py +48 -0
- polars/datatypes/classes.py +1213 -0
- polars/datatypes/constants.py +11 -0
- polars/datatypes/constructor.py +172 -0
- polars/datatypes/convert.py +366 -0
- polars/datatypes/group.py +130 -0
- polars/exceptions.py +230 -0
- polars/expr/__init__.py +7 -0
- polars/expr/array.py +964 -0
- polars/expr/binary.py +346 -0
- polars/expr/categorical.py +306 -0
- polars/expr/datetime.py +2620 -0
- polars/expr/expr.py +11272 -0
- polars/expr/list.py +1408 -0
- polars/expr/meta.py +444 -0
- polars/expr/name.py +321 -0
- polars/expr/string.py +3045 -0
- polars/expr/struct.py +357 -0
- polars/expr/whenthen.py +185 -0
- polars/functions/__init__.py +193 -0
- polars/functions/aggregation/__init__.py +33 -0
- polars/functions/aggregation/horizontal.py +298 -0
- polars/functions/aggregation/vertical.py +341 -0
- polars/functions/as_datatype.py +848 -0
- polars/functions/business.py +138 -0
- polars/functions/col.py +384 -0
- polars/functions/datatype.py +121 -0
- polars/functions/eager.py +524 -0
- polars/functions/escape_regex.py +29 -0
- polars/functions/lazy.py +2751 -0
- polars/functions/len.py +68 -0
- polars/functions/lit.py +210 -0
- polars/functions/random.py +22 -0
- polars/functions/range/__init__.py +19 -0
- polars/functions/range/_utils.py +15 -0
- polars/functions/range/date_range.py +303 -0
- polars/functions/range/datetime_range.py +370 -0
- polars/functions/range/int_range.py +348 -0
- polars/functions/range/linear_space.py +311 -0
- polars/functions/range/time_range.py +287 -0
- polars/functions/repeat.py +301 -0
- polars/functions/whenthen.py +353 -0
- polars/interchange/__init__.py +10 -0
- polars/interchange/buffer.py +77 -0
- polars/interchange/column.py +190 -0
- polars/interchange/dataframe.py +230 -0
- polars/interchange/from_dataframe.py +328 -0
- polars/interchange/protocol.py +303 -0
- polars/interchange/utils.py +170 -0
- polars/io/__init__.py +64 -0
- polars/io/_utils.py +317 -0
- polars/io/avro.py +49 -0
- polars/io/clipboard.py +36 -0
- polars/io/cloud/__init__.py +17 -0
- polars/io/cloud/_utils.py +80 -0
- polars/io/cloud/credential_provider/__init__.py +17 -0
- polars/io/cloud/credential_provider/_builder.py +520 -0
- polars/io/cloud/credential_provider/_providers.py +618 -0
- polars/io/csv/__init__.py +9 -0
- polars/io/csv/_utils.py +38 -0
- polars/io/csv/batched_reader.py +142 -0
- polars/io/csv/functions.py +1495 -0
- polars/io/database/__init__.py +6 -0
- polars/io/database/_arrow_registry.py +70 -0
- polars/io/database/_cursor_proxies.py +147 -0
- polars/io/database/_executor.py +578 -0
- polars/io/database/_inference.py +314 -0
- polars/io/database/_utils.py +144 -0
- polars/io/database/functions.py +516 -0
- polars/io/delta.py +499 -0
- polars/io/iceberg/__init__.py +3 -0
- polars/io/iceberg/_utils.py +697 -0
- polars/io/iceberg/dataset.py +556 -0
- polars/io/iceberg/functions.py +151 -0
- polars/io/ipc/__init__.py +8 -0
- polars/io/ipc/functions.py +514 -0
- polars/io/json/__init__.py +3 -0
- polars/io/json/read.py +101 -0
- polars/io/ndjson.py +332 -0
- polars/io/parquet/__init__.py +17 -0
- polars/io/parquet/field_overwrites.py +140 -0
- polars/io/parquet/functions.py +722 -0
- polars/io/partition.py +491 -0
- polars/io/plugins.py +187 -0
- polars/io/pyarrow_dataset/__init__.py +5 -0
- polars/io/pyarrow_dataset/anonymous_scan.py +109 -0
- polars/io/pyarrow_dataset/functions.py +79 -0
- polars/io/scan_options/__init__.py +5 -0
- polars/io/scan_options/_options.py +59 -0
- polars/io/scan_options/cast_options.py +126 -0
- polars/io/spreadsheet/__init__.py +6 -0
- polars/io/spreadsheet/_utils.py +52 -0
- polars/io/spreadsheet/_write_utils.py +647 -0
- polars/io/spreadsheet/functions.py +1323 -0
- polars/lazyframe/__init__.py +9 -0
- polars/lazyframe/engine_config.py +61 -0
- polars/lazyframe/frame.py +8564 -0
- polars/lazyframe/group_by.py +669 -0
- polars/lazyframe/in_process.py +42 -0
- polars/lazyframe/opt_flags.py +333 -0
- polars/meta/__init__.py +14 -0
- polars/meta/build.py +33 -0
- polars/meta/index_type.py +27 -0
- polars/meta/thread_pool.py +50 -0
- polars/meta/versions.py +120 -0
- polars/ml/__init__.py +0 -0
- polars/ml/torch.py +213 -0
- polars/ml/utilities.py +30 -0
- polars/plugins.py +155 -0
- polars/py.typed +0 -0
- polars/pyproject.toml +96 -0
- polars/schema.py +265 -0
- polars/selectors.py +3117 -0
- polars/series/__init__.py +5 -0
- polars/series/array.py +776 -0
- polars/series/binary.py +254 -0
- polars/series/categorical.py +246 -0
- polars/series/datetime.py +2275 -0
- polars/series/list.py +1087 -0
- polars/series/plotting.py +191 -0
- polars/series/series.py +9197 -0
- polars/series/string.py +2367 -0
- polars/series/struct.py +154 -0
- polars/series/utils.py +191 -0
- polars/sql/__init__.py +7 -0
- polars/sql/context.py +677 -0
- polars/sql/functions.py +139 -0
- polars/string_cache.py +185 -0
- polars/testing/__init__.py +13 -0
- polars/testing/asserts/__init__.py +9 -0
- polars/testing/asserts/frame.py +231 -0
- polars/testing/asserts/series.py +219 -0
- polars/testing/asserts/utils.py +12 -0
- polars/testing/parametric/__init__.py +33 -0
- polars/testing/parametric/profiles.py +107 -0
- polars/testing/parametric/strategies/__init__.py +22 -0
- polars/testing/parametric/strategies/_utils.py +14 -0
- polars/testing/parametric/strategies/core.py +615 -0
- polars/testing/parametric/strategies/data.py +452 -0
- polars/testing/parametric/strategies/dtype.py +436 -0
- polars/testing/parametric/strategies/legacy.py +169 -0
- polars/type_aliases.py +24 -0
- polars_runtime_compat-1.34.0b2.dist-info/METADATA +190 -0
- polars_runtime_compat-1.34.0b2.dist-info/RECORD +203 -0
- polars_runtime_compat-1.34.0b2.dist-info/WHEEL +4 -0
- polars_runtime_compat-1.34.0b2.dist-info/licenses/LICENSE +20 -0
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
import polars._reexport as pl
|
|
6
|
+
import polars.functions as F
|
|
7
|
+
from polars._utils.various import qualified_type_name
|
|
8
|
+
from polars.datatypes import Boolean, Enum, Int64, String, UInt8, UInt32
|
|
9
|
+
from polars.exceptions import InvalidOperationError
|
|
10
|
+
from polars.interchange.dataframe import PolarsDataFrame
|
|
11
|
+
from polars.interchange.protocol import ColumnNullType, CopyNotAllowedError, DtypeKind
|
|
12
|
+
from polars.interchange.utils import (
|
|
13
|
+
dtype_to_polars_dtype,
|
|
14
|
+
get_buffer_length_in_elements,
|
|
15
|
+
polars_dtype_to_data_buffer_dtype,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from polars import DataFrame, Series
|
|
20
|
+
from polars._typing import PolarsDataType
|
|
21
|
+
from polars.interchange.protocol import Buffer, Column, Dtype, SupportsInterchange
|
|
22
|
+
from polars.interchange.protocol import DataFrame as InterchangeDataFrame
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def from_dataframe(df: SupportsInterchange, *, allow_copy: bool = True) -> DataFrame:
    """
    Build a Polars DataFrame from any dataframe supporting the interchange protocol.

    Parameters
    ----------
    df
        Object supporting the dataframe interchange protocol, i.e. must have implemented
        the `__dataframe__` method.
    allow_copy
        Allow memory to be copied to perform the conversion. If set to False, causes
        conversions that are not zero-copy to fail.

    Raises
    ------
    TypeError
        If `df` does not implement `__dataframe__`.
    """
    # Fast paths: the object already is (or wraps) a Polars DataFrame.
    if isinstance(df, pl.DataFrame):
        return df
    if isinstance(df, PolarsDataFrame):
        return df._df

    if not hasattr(df, "__dataframe__"):
        msg = f"`df` of type {qualified_type_name(df)!r} does not support the dataframe interchange protocol"
        raise TypeError(msg)

    interchange_df = df.__dataframe__(allow_copy=allow_copy)  # type: ignore[arg-type]
    return _from_dataframe(interchange_df, allow_copy=allow_copy)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _from_dataframe(df: InterchangeDataFrame, *, allow_copy: bool) -> DataFrame:
    """Convert an interchange-level dataframe to Polars, one chunk at a time."""
    chunks = [
        _protocol_df_chunk_to_polars(chunk, allow_copy=allow_copy)
        for chunk in df.get_chunks()
    ]

    # Handle implementations that incorrectly yield no chunks for an empty dataframe
    if not chunks:
        chunks = [_protocol_df_chunk_to_polars(df, allow_copy=allow_copy)]

    return F.concat(chunks, rechunk=False)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _protocol_df_chunk_to_polars(
    df: InterchangeDataFrame, *, allow_copy: bool
) -> DataFrame:
    """Convert a single interchange dataframe chunk into a Polars DataFrame."""
    series = []
    for name, column in zip(df.column_names(), df.get_columns()):
        dtype = dtype_to_polars_dtype(column.dtype)
        # String and categorical columns carry extra buffers (offsets /
        # categories), so they take dedicated conversion paths.
        if dtype == String:
            s = _string_column_to_series(column, allow_copy=allow_copy)
        elif dtype == Enum:
            s = _categorical_column_to_series(column, allow_copy=allow_copy)
        else:
            s = _column_to_series(column, dtype, allow_copy=allow_copy)
        series.append(s.alias(name))

    return pl.DataFrame(series)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _column_to_series(
    column: Column, dtype: PolarsDataType, *, allow_copy: bool
) -> Series:
    """Convert a fixed-width interchange column into a Polars Series."""
    buffers = column.get_buffers()
    col_offset = column.offset

    data = _construct_data_buffer(
        *buffers["data"], column.size(), col_offset, allow_copy=allow_copy
    )
    validity = _construct_validity_buffer(
        buffers["validity"], column, dtype, data, col_offset, allow_copy=allow_copy
    )
    return pl.Series._from_buffers(dtype, data=data, validity=validity)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _string_column_to_series(column: Column, *, allow_copy: bool) -> Series:
    """
    Convert an interchange String column into a Polars Series.

    String columns are backed by two buffers (raw UTF-8 bytes plus element
    offsets) and always require a conversion copy, except for the trivial
    empty case.

    Raises
    ------
    CopyNotAllowedError
        If the column is non-empty and `allow_copy` is False.
    RuntimeError
        If the producer supplied no offsets buffer.
    """
    if column.size() == 0:
        # Empty column: nothing to copy, return an empty String Series
        return pl.Series(dtype=String)
    elif not allow_copy:
        msg = "string buffers must be converted"
        raise CopyNotAllowedError(msg)

    buffers = column.get_buffers()
    offset = column.offset

    offsets_buffer_info = buffers["offsets"]
    if offsets_buffer_info is None:
        msg = "cannot create String column without an offsets buffer"
        raise RuntimeError(msg)
    offsets_buffer = _construct_offsets_buffer(
        *offsets_buffer_info, offset, allow_copy=allow_copy
    )

    # The data buffer holds raw UTF-8 bytes: its element count is the buffer
    # size in bytes, not the column's logical size, and it carries no offset.
    buffer, dtype = buffers["data"]
    data_buffer = _construct_data_buffer(
        buffer, dtype, buffer.bufsize, offset=0, allow_copy=allow_copy
    )

    # First construct a Series without a validity buffer
    # to allow constructing the validity buffer from a sentinel value
    data_buffers = [data_buffer, offsets_buffer]
    data = pl.Series._from_buffers(String, data=data_buffers, validity=None)

    # Add the validity buffer if present
    validity_buffer = _construct_validity_buffer(
        buffers["validity"], column, String, data, offset, allow_copy=allow_copy
    )
    if validity_buffer is not None:
        data = pl.Series._from_buffers(
            String, data=data_buffers, validity=validity_buffer
        )

    return data
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _categorical_column_to_series(column: Column, *, allow_copy: bool) -> Series:
    """
    Convert an interchange categorical column into a Polars Enum Series.

    Raises
    ------
    NotImplementedError
        If the categorical is not dictionary-encoded, or its categories are
        not strings.
    CopyNotAllowedError
        If a cast of the codes to UInt32 is required but `allow_copy` is False.
    """
    categorical = column.describe_categorical
    if not categorical["is_dictionary"]:
        msg = "non-dictionary categoricals are not yet supported"
        raise NotImplementedError(msg)

    # Build the target Enum dtype from the dictionary's (string) categories
    categories_col = categorical["categories"]
    if categories_col.size() == 0:
        dtype = Enum([])
    elif categories_col.dtype[0] != DtypeKind.STRING:
        msg = "non-string categories are not supported"
        raise NotImplementedError(msg)
    else:
        categories = _string_column_to_series(categories_col, allow_copy=allow_copy)
        dtype = Enum(categories)

    buffers = column.get_buffers()
    offset = column.offset

    # The data buffer holds the integer codes into the category dictionary
    data_buffer = _construct_data_buffer(
        *buffers["data"], column.size(), offset, allow_copy=allow_copy
    )
    validity_buffer = _construct_validity_buffer(
        buffers["validity"], column, dtype, data_buffer, offset, allow_copy=allow_copy
    )

    # First construct a physical Series without categories
    # to allow for sentinel values that do not fit in UInt32
    data_dtype = data_buffer.dtype
    out = pl.Series._from_buffers(
        data_dtype, data=data_buffer, validity=validity_buffer
    )

    # Polars only supports UInt32 categoricals
    if data_dtype != UInt32:
        if not allow_copy and column.size() > 0:
            msg = f"data buffer must be cast from {data_dtype} to UInt32"
            raise CopyNotAllowedError(msg)

        # TODO: Cast directly to Enum
        # https://github.com/pola-rs/polars/issues/13409
        out = out.cast(UInt32)

    return out.cast(dtype)
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _construct_data_buffer(
    buffer: Buffer,
    dtype: Dtype,
    length: int,
    offset: int = 0,
    *,
    allow_copy: bool,
) -> Series:
    """
    Wrap an interchange data buffer in a Polars Series.

    Parameters
    ----------
    buffer
        The interchange buffer holding the raw values.
    dtype
        The interchange dtype describing the buffer contents.
    length
        Number of elements in the buffer.
    offset
        Element offset of the first value within the buffer.
    allow_copy
        Allow data to be copied; required to convert byte-packed booleans.

    Raises
    ------
    CopyNotAllowedError
        If a non-empty byte-packed boolean buffer must be converted but
        `allow_copy` is False.
    """
    polars_dtype = dtype_to_polars_dtype(dtype)

    # Handle implementations that incorrectly set the data buffer dtype
    # to the column dtype
    # https://github.com/pola-rs/polars/pull/10787
    polars_dtype = polars_dtype_to_data_buffer_dtype(polars_dtype)

    buffer_info = (buffer.ptr, offset, length)

    # Handle byte-packed boolean buffer
    if polars_dtype == Boolean and dtype[1] == 8:
        if length == 0:
            return pl.Series(dtype=Boolean)
        elif not allow_copy:
            msg = "byte-packed boolean buffer must be converted to bit-packed boolean"
            raise CopyNotAllowedError(msg)
        # Interpret the bytes as UInt8, then cast to Polars' bit-packed
        # Boolean representation (this copies)
        return pl.Series._from_buffer(UInt8, buffer_info, owner=buffer).cast(Boolean)

    return pl.Series._from_buffer(polars_dtype, buffer_info, owner=buffer)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _construct_offsets_buffer(
    buffer: Buffer,
    dtype: Dtype,
    offset: int,
    *,
    allow_copy: bool,
) -> Series:
    """
    Wrap an interchange offsets buffer in a Series of Int64 offsets.

    Raises
    ------
    CopyNotAllowedError
        If a cast to Int64 is required but `allow_copy` is False.
    """
    polars_dtype = dtype_to_polars_dtype(dtype)
    n_offsets = get_buffer_length_in_elements(buffer.bufsize, dtype) - offset

    offsets = pl.Series._from_buffer(
        polars_dtype, (buffer.ptr, offset, n_offsets), owner=buffer
    )
    if polars_dtype == Int64:
        return offsets

    # Polars only supports Int64 offsets
    if not allow_copy:
        msg = f"offsets buffer must be cast from {polars_dtype} to Int64"
        raise CopyNotAllowedError(msg)
    return offsets.cast(Int64)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _construct_validity_buffer(
    validity_buffer_info: tuple[Buffer, Dtype] | None,
    column: Column,
    column_dtype: PolarsDataType,
    data: Series,
    offset: int = 0,
    *,
    allow_copy: bool,
) -> Series | None:
    """
    Derive a Boolean validity Series for a column, or None if fully valid.

    The strategy depends on the producer's null representation: bitmask and
    bytemask buffers are wrapped/converted directly, while NaN and sentinel
    representations compute the mask from the data itself (which copies).

    Raises
    ------
    CopyNotAllowedError
        If constructing the mask requires a copy and `allow_copy` is False.
    TypeError
        If the sentinel value is invalid for the column dtype.
    NotImplementedError
        If the null representation is not recognized.
    """
    null_type, null_value = column.describe_null
    if null_type == ColumnNullType.NON_NULLABLE or column.null_count == 0:
        # No nulls possible or present: no validity buffer is needed
        return None

    elif null_type == ColumnNullType.USE_BITMASK:
        if validity_buffer_info is None:
            return None
        buffer = validity_buffer_info[0]
        return _construct_validity_buffer_from_bitmask(
            buffer, null_value, column.size(), offset, allow_copy=allow_copy
        )

    elif null_type == ColumnNullType.USE_BYTEMASK:
        if validity_buffer_info is None:
            return None
        buffer = validity_buffer_info[0]
        return _construct_validity_buffer_from_bytemask(
            buffer, null_value, allow_copy=allow_copy
        )

    elif null_type == ColumnNullType.USE_NAN:
        if not allow_copy:
            msg = "bitmask must be constructed"
            raise CopyNotAllowedError(msg)
        # Entries are valid wherever the data is not NaN
        return data.is_not_nan()

    elif null_type == ColumnNullType.USE_SENTINEL:
        if not allow_copy:
            msg = "bitmask must be constructed"
            raise CopyNotAllowedError(msg)

        sentinel = pl.Series([null_value])
        try:
            # Temporal sentinels must be cast to the column dtype before
            # comparison; other dtypes compare via implicit coercion
            if column_dtype.is_temporal():
                sentinel = sentinel.cast(column_dtype)
            return data != sentinel  # noqa: TRY300
        except InvalidOperationError as e:
            msg = f"invalid sentinel value for column of type {column_dtype}: {null_value!r}"
            raise TypeError(msg) from e

    else:
        msg = f"unsupported null type: {null_type!r}"
        raise NotImplementedError(msg)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def _construct_validity_buffer_from_bitmask(
    buffer: Buffer,
    null_value: int,
    length: int,
    offset: int = 0,
    *,
    allow_copy: bool,
) -> Series:
    """
    Wrap an interchange bitmask buffer in a Boolean validity Series.

    Parameters
    ----------
    buffer
        Buffer holding the bit-packed mask.
    null_value
        The bit value that marks a null entry; if non-zero the mask must be
        inverted, since Polars marks valid entries with 1.
    length
        Number of mask entries (one bit per element).
    offset
        Element offset of the first entry within the buffer.
    allow_copy
        Allow data to be copied; required when the mask must be inverted.

    Raises
    ------
    CopyNotAllowedError
        If the mask must be inverted but `allow_copy` is False.
    """
    buffer_info = (buffer.ptr, offset, length)
    # Pass `owner` by keyword for consistency with the other `_from_buffer`
    # call sites in this module.
    s = pl.Series._from_buffer(Boolean, buffer_info, owner=buffer)

    if null_value != 0:
        if not allow_copy:
            msg = "bitmask must be inverted"
            raise CopyNotAllowedError(msg)
        s = ~s

    return s
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _construct_validity_buffer_from_bytemask(
    buffer: Buffer,
    null_value: int,
    *,
    allow_copy: bool,
) -> Series:
    """
    Convert a byte-per-element interchange mask into a Boolean validity Series.

    This always copies, so it raises `CopyNotAllowedError` when `allow_copy`
    is False.
    """
    if not allow_copy:
        msg = "bytemask must be converted into a bitmask"
        raise CopyNotAllowedError(msg)

    buffer_info = (buffer.ptr, 0, buffer.bufsize)
    mask = pl.Series._from_buffer(UInt8, buffer_info, owner=buffer).cast(Boolean)

    # Polars marks valid entries with 1; flip when the producer uses 1 = null
    return ~mask if null_value != 0 else mask
|
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from enum import IntEnum
|
|
4
|
+
from typing import (
|
|
5
|
+
TYPE_CHECKING,
|
|
6
|
+
Any,
|
|
7
|
+
ClassVar,
|
|
8
|
+
Literal,
|
|
9
|
+
Protocol,
|
|
10
|
+
TypedDict,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
from polars._utils.unstable import issue_unstable_warning
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
import sys
|
|
17
|
+
from collections.abc import Iterable, Sequence
|
|
18
|
+
|
|
19
|
+
from polars.interchange.buffer import PolarsBuffer
|
|
20
|
+
from polars.interchange.column import PolarsColumn
|
|
21
|
+
|
|
22
|
+
if sys.version_info >= (3, 10):
|
|
23
|
+
from typing import TypeAlias
|
|
24
|
+
else:
|
|
25
|
+
from typing_extensions import TypeAlias
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class DlpackDeviceType(IntEnum):
    """Integer enum for device type codes matching DLPack."""

    # NOTE: the numeric values mirror DLPack's device type codes and must
    # not be renumbered.
    CPU = 1
    CUDA = 2
    CPU_PINNED = 3
    OPENCL = 4
    VULKAN = 7
    METAL = 8
    VPI = 9
    ROCM = 10
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class DtypeKind(IntEnum):
    """
    Integer enum for data types.

    Attributes
    ----------
    INT : int
        Matches to signed integer data type.
    UINT : int
        Matches to unsigned integer data type.
    FLOAT : int
        Matches to floating point data type.
    BOOL : int
        Matches to boolean data type.
    STRING : int
        Matches to string data type (UTF-8 encoded).
    DATETIME : int
        Matches to datetime data type.
    CATEGORICAL : int
        Matches to categorical data type.
    """

    INT = 0
    UINT = 1
    FLOAT = 2
    # NOTE: the jump to 20+ follows the dataframe interchange protocol
    # specification — do not renumber.
    BOOL = 20
    STRING = 21  # UTF-8
    DATETIME = 22
    CATEGORICAL = 23
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
Dtype: TypeAlias = tuple[DtypeKind, int, str, str] # see Column.dtype
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class ColumnNullType(IntEnum):
    """
    Integer enum for null type representation.

    Attributes
    ----------
    NON_NULLABLE : int
        Non-nullable column.
    USE_NAN : int
        Use explicit float NaN value.
    USE_SENTINEL : int
        Sentinel value besides NaN.
    USE_BITMASK : int
        The bit is set/unset representing a null on a certain position.
    USE_BYTEMASK : int
        The byte is set/unset representing a null on a certain position.
    """

    # NOTE: values are fixed by the dataframe interchange protocol spec
    NON_NULLABLE = 0
    USE_NAN = 1
    USE_SENTINEL = 2
    USE_BITMASK = 3
    USE_BYTEMASK = 4
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class ColumnBuffers(TypedDict):
    """Buffers backing a column, as returned by `Column.get_buffers`."""

    # first element is a buffer containing the column data;
    # second element is the data buffer's associated dtype
    data: tuple[PolarsBuffer, Dtype]

    # first element is a buffer containing mask values indicating missing data;
    # second element is the mask value buffer's associated dtype.
    # None if the null representation is not a bit or byte mask
    validity: tuple[PolarsBuffer, Dtype] | None

    # first element is a buffer containing the offset values for
    # variable-size binary data (e.g., variable-length strings);
    # second element is the offsets buffer's associated dtype.
    # None if the data buffer does not have an associated offsets buffer
    offsets: tuple[PolarsBuffer, Dtype] | None
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class CategoricalDescription(TypedDict):
    """Description of a categorical column, as returned by `Column.describe_categorical`."""

    # whether the ordering of dictionary indices is semantically meaningful
    is_ordered: bool
    # whether a dictionary-style mapping of categorical values to other objects exists
    is_dictionary: Literal[True]
    # column holding the category values that the data buffer's integer
    # codes index into
    categories: PolarsColumn
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class Buffer(Protocol):
    """Interchange buffer object (structural protocol; bodies are stubs)."""

    @property
    def bufsize(self) -> int:
        """Buffer size in bytes."""

    @property
    def ptr(self) -> int:
        """Pointer to start of the buffer as an integer."""

    def __dlpack__(self) -> Any:
        """Represent this structure as DLPack interface."""

    def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]:
        """Device type and device ID for where the data in the buffer resides."""
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class Column(Protocol):
    """Interchange column object (structural protocol; bodies are stubs)."""

    def size(self) -> int:
        """Size of the column in elements."""

    @property
    def offset(self) -> int:
        """Offset of the first element with respect to the start of the underlying buffer."""  # noqa: W505

    @property
    def dtype(self) -> Dtype:
        """Data type of the column."""

    @property
    def describe_categorical(self) -> CategoricalDescription:
        """Description of the categorical data type of the column."""

    @property
    def describe_null(self) -> tuple[ColumnNullType, Any]:
        """Description of the null representation the column uses, as a tuple of the null type and its associated value (e.g. a sentinel)."""  # noqa: W505

    @property
    def null_count(self) -> int | None:
        """Number of null elements, if known."""

    @property
    def metadata(self) -> dict[str, Any]:
        """The metadata for the column."""

    def num_chunks(self) -> int:
        """Return the number of chunks the column consists of."""

    def get_chunks(self, n_chunks: int | None = None) -> Iterable[Column]:
        """Return an iterator yielding the column chunks."""

    def get_buffers(self) -> ColumnBuffers:
        """Return a dictionary containing the underlying buffers."""
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
class DataFrame(Protocol):
    """Interchange dataframe object (structural protocol; bodies are stubs)."""

    version: ClassVar[int]  # Version of the protocol

    def __dataframe__(
        self,
        nan_as_null: bool = False,  # noqa: FBT001
        allow_copy: bool = True,  # noqa: FBT001
    ) -> DataFrame:
        """Convert to a dataframe object implementing the dataframe interchange protocol."""  # noqa: W505

    @property
    def metadata(self) -> dict[str, Any]:
        """The metadata for the dataframe."""

    def num_columns(self) -> int:
        """Return the number of columns in the dataframe."""

    def num_rows(self) -> int | None:
        """Return the number of rows in the dataframe, if available."""

    def num_chunks(self) -> int:
        """Return the number of chunks the dataframe consists of."""

    def column_names(self) -> Iterable[str]:
        """Return the column names."""

    def get_column(self, i: int) -> Column:
        """Return the column at the indicated position."""

    def get_column_by_name(self, name: str) -> Column:
        """Return the column with the given name."""

    def get_columns(self) -> Iterable[Column]:
        """Return an iterator yielding the columns."""

    def select_columns(self, indices: Sequence[int]) -> DataFrame:
        """Create a new dataframe by selecting a subset of columns by index."""

    def select_columns_by_name(self, names: Sequence[str]) -> DataFrame:
        """Create a new dataframe by selecting a subset of columns by name."""

    def get_chunks(self, n_chunks: int | None = None) -> Iterable[DataFrame]:
        """Return an iterator yielding the chunks of the dataframe."""
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
class SupportsInterchange(Protocol):
    """Dataframe that supports conversion into an interchange dataframe object."""

    def __dataframe__(
        self,
        nan_as_null: bool = False,  # noqa: FBT001
        allow_copy: bool = True,  # noqa: FBT001
    ) -> SupportsInterchange:
        """Convert to a dataframe object implementing the dataframe interchange protocol."""  # noqa: W505
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
class Endianness:
    """Enum indicating the byte-order of a data type."""

    LITTLE = "<"
    BIG = ">"
    NATIVE = "="
    NA = "|"  # byte order not applicable
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
# Raised by the interchange conversion paths whenever a zero-copy conversion
# is impossible (e.g. string buffers, byte-packed booleans, mask inversion).
class CopyNotAllowedError(RuntimeError):
    """Exception raised when a copy is required, but `allow_copy` is set to `False`."""
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
class CompatLevel:
    """Data structure compatibility level."""

    # Protocol version number carried by each instance
    _version: int

    def __init__(self) -> None:
        # Direct construction is forbidden; instances are created internally
        # via `_with_version`.
        msg = "it is not allowed to create a CompatLevel object"
        raise TypeError(msg)

    @staticmethod
    def _with_version(version: int) -> CompatLevel:
        # Allocate directly with __new__ to bypass the raising __init__
        instance = CompatLevel.__new__(CompatLevel)
        instance._version = version
        return instance

    @staticmethod
    def _newest() -> CompatLevel:
        return CompatLevel._future1  # type: ignore[attr-defined]

    @staticmethod
    def newest() -> CompatLevel:
        """
        Get the highest supported compatibility level.

        .. warning::
            Highest compatibility level is considered **unstable**. It may be changed
            at any point without it being considered a breaking change.
        """
        issue_unstable_warning(
            "using the highest compatibility level is considered unstable."
        )
        return CompatLevel._newest()

    @staticmethod
    def oldest() -> CompatLevel:
        """Get the most compatible level."""
        return CompatLevel._compatible  # type: ignore[attr-defined]

    def __repr__(self) -> str:
        cls = self.__class__
        return f"<{cls.__module__}.{cls.__qualname__}: {self._version}>"
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
# Attach the singleton levels after class creation; `_with_version` bypasses
# the deliberately blocked `__init__`.
CompatLevel._compatible = CompatLevel._with_version(0)  # type: ignore[attr-defined]
CompatLevel._future1 = CompatLevel._with_version(1)  # type: ignore[attr-defined]
|