pydiverse-common 0.3.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,55 @@
1
+ # Copyright (c) QuantCo and pydiverse contributors 2025-2025
2
+ # SPDX-License-Identifier: BSD-3-Clause
3
+ from .dtypes import (
4
+ Bool,
5
+ Date,
6
+ Datetime,
7
+ Decimal,
8
+ Dtype,
9
+ Duration,
10
+ Float,
11
+ Float32,
12
+ Float64,
13
+ Int,
14
+ Int8,
15
+ Int16,
16
+ Int32,
17
+ Int64,
18
+ List,
19
+ NullType,
20
+ PandasBackend,
21
+ String,
22
+ Time,
23
+ UInt8,
24
+ UInt16,
25
+ UInt32,
26
+ UInt64,
27
+ )
28
+ from .version import __version__
29
+
30
+ __all__ = [
31
+ "__version__",
32
+ "Dtype",
33
+ "Bool",
34
+ "Date",
35
+ "Datetime",
36
+ "Decimal",
37
+ "Duration",
38
+ "Float",
39
+ "Float32",
40
+ "Float64",
41
+ "Int",
42
+ "Int8",
43
+ "Int16",
44
+ "Int32",
45
+ "Int64",
46
+ "NullType",
47
+ "String",
48
+ "Time",
49
+ "UInt8",
50
+ "UInt16",
51
+ "UInt32",
52
+ "UInt64",
53
+ "List",
54
+ "PandasBackend",
55
+ ]
@@ -0,0 +1,418 @@
1
+ # Copyright (c) QuantCo and pydiverse contributors 2025-2025
2
+ # SPDX-License-Identifier: BSD-3-Clause
3
+ from enum import Enum
4
+
5
+
6
class PandasBackend(str, Enum):
    """Selects which storage backend a pandas dtype should be produced for.

    The members are also plain strings, so ``PandasBackend.ARROW == "arrow"``
    holds and raw string values can be compared against members directly.
    """

    # numpy-backed pandas dtypes
    NUMPY = "numpy"
    # pyarrow-backed pandas dtypes
    ARROW = "arrow"
9
+
10
+
11
class Dtype:
    """Base class for all data types.

    Plain dtypes carry no state: two instances compare equal exactly when they
    are instances of the same class, and they hash by class. This is what
    allows instances such as ``Int64()`` to be used as dictionary keys in the
    conversion tables of the ``to_*`` methods.
    """

    def __eq__(self, rhs):
        """Return True if ``rhs`` is a Dtype of exactly the same class."""
        return isinstance(rhs, Dtype) and type(self) is type(rhs)

    def __hash__(self):
        """Hash by class so that equal dtypes hash equally."""
        return hash(type(self))

    def __repr__(self):
        """Return the bare class name, e.g. ``Int64``."""
        return self.__class__.__name__

    @classmethod
    def is_int(cls):
        """Return True for integer dtypes; the base class answers False."""
        return False

    @classmethod
    def is_float(cls):
        """Return True for float dtypes; the base class answers False."""
        return False

    @classmethod
    def is_subtype(cls, rhs):
        """Return True if ``cls`` is compatible with the dtype instance ``rhs``.

        That is the case when ``rhs`` is an instance of exactly ``cls``, when
        ``rhs`` is the generic ``Int()`` and ``cls`` is an integer dtype, or
        when ``rhs`` is the generic ``Float()`` and ``cls`` is a float dtype.
        """
        rhs_cls = type(rhs)
        return (
            (cls is rhs_cls)
            or (rhs_cls is Int and cls.is_int())
            or (rhs_cls is Float and cls.is_float())
        )

    @staticmethod
    def from_sql(sql_type) -> "Dtype":
        """Convert a SQLAlchemy type instance to the matching Dtype.

        Raises:
            TypeError: if ``sql_type`` is not one of the supported types.
        """
        import sqlalchemy as sqa

        # The specific integer widths are tested before the generic Integer.
        if isinstance(sql_type, sqa.SmallInteger):
            return Int16()
        if isinstance(sql_type, sqa.BigInteger):
            return Int64()
        if isinstance(sql_type, sqa.Integer):
            return Int32()
        if isinstance(sql_type, sqa.Float):
            # A missing precision is treated as double precision (53 bits).
            precision = sql_type.precision or 53
            if precision <= 24:
                return Float32()
            return Float64()
        if isinstance(sql_type, sqa.Numeric | sqa.DECIMAL):
            # Just to be safe, we always use FLOAT64 for fixpoint numbers.
            # Databases are obsessed about fixpoint. However, in dataframes, it
            # is more common to just work with double precision floating point.
            # We see Decimal as subtype of Float. Pydiverse.transform will
            # convert Decimal to Float64 whenever it cannot guarantee semantic
            # correctness otherwise.
            return Float64()
        if isinstance(sql_type, sqa.String):
            return String()
        if isinstance(sql_type, sqa.Boolean):
            return Bool()
        if isinstance(sql_type, sqa.Date):
            return Date()
        if isinstance(sql_type, sqa.Time):
            return Time()
        if isinstance(sql_type, sqa.DateTime):
            return Datetime()
        if isinstance(sql_type, sqa.Interval):
            return Duration()
        if isinstance(sql_type, sqa.ARRAY):
            # Recurse on the element type itself (not on a bound method of it).
            return List(Dtype.from_sql(sql_type.item_type))
        # ``to_sql`` emits ``sqa.types.NullType()`` for NullType, so accept it
        # here as well to make the conversion round-trip.
        if isinstance(sql_type, (sqa.Null, sqa.types.NullType)):
            return NullType()

        raise TypeError(f"unsupported SQL type: {sql_type!r}")

    @staticmethod
    def from_pandas(pandas_type) -> "Dtype":
        """Convert a pandas dtype to the matching Dtype.

        Arrow-backed dtypes are delegated to :meth:`from_arrow`.

        Raises:
            TypeError: if ``pandas_type`` is not one of the supported dtypes.
        """
        import numpy as np
        import pandas as pd

        if isinstance(pandas_type, pd.ArrowDtype):
            return Dtype.from_arrow(pandas_type.pyarrow_dtype)

        def is_np_dtype(type_, np_dtype):
            # NOTE(review): relies on private pandas helpers
            # (``pd.core.dtypes.common``); verify on pandas upgrades.
            return pd.core.dtypes.common._is_dtype_type(
                type_, pd.core.dtypes.common.classes(np_dtype)
            )

        if pd.api.types.is_signed_integer_dtype(pandas_type):
            if is_np_dtype(pandas_type, np.int64):
                return Int64()
            elif is_np_dtype(pandas_type, np.int32):
                return Int32()
            elif is_np_dtype(pandas_type, np.int16):
                return Int16()
            elif is_np_dtype(pandas_type, np.int8):
                return Int8()
            raise TypeError(f"unsupported pandas integer dtype: {pandas_type!r}")
        if pd.api.types.is_unsigned_integer_dtype(pandas_type):
            if is_np_dtype(pandas_type, np.uint64):
                return UInt64()
            elif is_np_dtype(pandas_type, np.uint32):
                return UInt32()
            elif is_np_dtype(pandas_type, np.uint16):
                return UInt16()
            elif is_np_dtype(pandas_type, np.uint8):
                return UInt8()
            raise TypeError(
                f"unsupported pandas unsigned integer dtype: {pandas_type!r}"
            )
        if pd.api.types.is_float_dtype(pandas_type):
            if is_np_dtype(pandas_type, np.float64):
                return Float64()
            elif is_np_dtype(pandas_type, np.float32):
                return Float32()
            raise TypeError(f"unsupported pandas float dtype: {pandas_type!r}")
        if pd.api.types.is_string_dtype(pandas_type):
            # We reserve the use of the object column for string.
            return String()
        if pd.api.types.is_bool_dtype(pandas_type):
            return Bool()
        if pd.api.types.is_datetime64_any_dtype(pandas_type):
            return Datetime()
        if pd.api.types.is_timedelta64_dtype(pandas_type):
            return Duration()
        # we don't know any decimal/time dtypes in pandas if column is not
        # arrow backed

        raise TypeError(f"unsupported pandas dtype: {pandas_type!r}")

    @staticmethod
    def from_arrow(arrow_type) -> "Dtype":
        """Convert a pyarrow data type to the matching Dtype.

        Raises:
            TypeError: if ``arrow_type`` is not one of the supported types.
        """
        import pyarrow as pa

        if pa.types.is_signed_integer(arrow_type):
            if pa.types.is_int64(arrow_type):
                return Int64()
            if pa.types.is_int32(arrow_type):
                return Int32()
            if pa.types.is_int16(arrow_type):
                return Int16()
            if pa.types.is_int8(arrow_type):
                return Int8()
            raise TypeError(f"unsupported arrow integer type: {arrow_type!r}")
        if pa.types.is_unsigned_integer(arrow_type):
            if pa.types.is_uint64(arrow_type):
                return UInt64()
            if pa.types.is_uint32(arrow_type):
                return UInt32()
            if pa.types.is_uint16(arrow_type):
                return UInt16()
            if pa.types.is_uint8(arrow_type):
                return UInt8()
            raise TypeError(
                f"unsupported arrow unsigned integer type: {arrow_type!r}"
            )
        if pa.types.is_floating(arrow_type):
            if pa.types.is_float64(arrow_type):
                return Float64()
            if pa.types.is_float32(arrow_type):
                return Float32()
            if pa.types.is_float16(arrow_type):
                # There is no 16 bit float Dtype; widen to Float32.
                return Float32()
            raise TypeError(f"unsupported arrow float type: {arrow_type!r}")
        if pa.types.is_decimal(arrow_type):
            # We don't recommend using Decimal in dataframes, but we support it.
            return Decimal()
        if pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type):
            return String()
        if pa.types.is_boolean(arrow_type):
            return Bool()
        if pa.types.is_timestamp(arrow_type):
            return Datetime()
        if pa.types.is_date(arrow_type):
            return Date()
        if pa.types.is_time(arrow_type):
            return Time()
        if pa.types.is_duration(arrow_type):
            return Duration()
        if pa.types.is_list(arrow_type):
            return List(Dtype.from_arrow(arrow_type.value_type))
        if pa.types.is_null(arrow_type):
            return NullType()
        raise TypeError(f"unsupported arrow type: {arrow_type!r}")

    @staticmethod
    def from_polars(polars_type) -> "Dtype":
        """Convert a polars data type to the matching Dtype.

        Raises:
            KeyError: if the base type of ``polars_type`` is not in the
                conversion table.
        """
        import polars as pl

        if isinstance(polars_type, pl.List):
            return List(Dtype.from_polars(polars_type.inner))

        # Look up by base type so parametrized types (e.g. a Datetime with a
        # time unit) hit the same entry as the bare class.
        return {
            pl.Int64: Int64(),
            pl.Int32: Int32(),
            pl.Int16: Int16(),
            pl.Int8: Int8(),
            pl.UInt64: UInt64(),
            pl.UInt32: UInt32(),
            pl.UInt16: UInt16(),
            pl.UInt8: UInt8(),
            pl.Float64: Float64(),
            pl.Float32: Float32(),
            pl.Decimal: Decimal(),
            pl.Utf8: String(),
            pl.Boolean: Bool(),
            pl.Datetime: Datetime(),
            pl.Time: Time(),
            pl.Date: Date(),
            pl.Null: NullType(),
            pl.Duration: Duration(),
            pl.Enum: String(),
        }[polars_type.base_type()]

    def to_sql(self):
        """Return the SQLAlchemy type instance for this dtype.

        Raises:
            KeyError: if this dtype has no entry in the conversion table.
        """
        import sqlalchemy as sqa

        # Small types and unsigned types are mapped to the SQL integer types
        # chosen in the table below.
        return {
            Int(): sqa.BigInteger(),  # we default to 64 bit
            Int8(): sqa.SmallInteger(),
            Int16(): sqa.SmallInteger(),
            Int32(): sqa.Integer(),
            Int64(): sqa.BigInteger(),
            UInt8(): sqa.SmallInteger(),
            UInt16(): sqa.Integer(),
            UInt32(): sqa.BigInteger(),
            UInt64(): sqa.BigInteger(),
            Float(): sqa.Float(53),  # we default to 64 bit
            Float32(): sqa.Float(24),
            Float64(): sqa.Float(53),
            Decimal(): sqa.DECIMAL(),
            String(): sqa.String(),
            Bool(): sqa.Boolean(),
            Date(): sqa.Date(),
            Time(): sqa.Time(),
            Datetime(): sqa.DateTime(),
            Duration(): sqa.Interval(),
            NullType(): sqa.types.NullType(),
        }[self]

    def to_pandas(self, backend: PandasBackend = PandasBackend.ARROW):
        """Return the pandas dtype for this dtype under the given backend.

        Raises:
            ValueError: if ``backend`` is neither NUMPY nor ARROW.
        """
        import pandas as pd

        if backend == PandasBackend.NUMPY:
            return self.to_pandas_nullable(backend)
        if backend == PandasBackend.ARROW:
            if self == String():
                return pd.StringDtype(storage="pyarrow")
            return pd.ArrowDtype(self.to_arrow())
        # Fail loudly instead of implicitly returning None.
        raise ValueError(f"unsupported pandas backend: {backend!r}")

    def to_pandas_nullable(self, backend: PandasBackend = PandasBackend.ARROW):
        """Return the nullable pandas dtype for this dtype.

        ``Time`` is only available with the ARROW backend.

        Raises:
            TypeError: for ``Time`` with any backend other than ARROW.
            KeyError: if this dtype has no entry in the conversion table.
        """
        import pandas as pd

        if self == Time():
            if backend == PandasBackend.ARROW:
                return pd.ArrowDtype(self.to_arrow())
            raise TypeError("pandas doesn't have a native time dtype")

        # Time() is fully handled above, so it needs no entry in this table.
        return {
            Int(): pd.Int64Dtype(),  # we default to 64 bit
            Int8(): pd.Int8Dtype(),
            Int16(): pd.Int16Dtype(),
            Int32(): pd.Int32Dtype(),
            Int64(): pd.Int64Dtype(),
            UInt8(): pd.UInt8Dtype(),
            UInt16(): pd.UInt16Dtype(),
            UInt32(): pd.UInt32Dtype(),
            UInt64(): pd.UInt64Dtype(),
            Float(): pd.Float64Dtype(),  # we default to 64 bit
            Float32(): pd.Float32Dtype(),
            Float64(): pd.Float64Dtype(),
            Decimal(): pd.Float64Dtype(),  # Decimal falls back to 64 bit float
            String(): pd.StringDtype(),
            Bool(): pd.BooleanDtype(),
            Date(): "datetime64[s]",
            Datetime(): "datetime64[us]",
            Duration(): "timedelta64[us]",
        }[self]

    def to_arrow(self):
        """Return the pyarrow data type for this dtype.

        Raises:
            KeyError: if this dtype has no entry in the conversion table.
        """
        import pyarrow as pa

        return {
            Int(): pa.int64(),  # we default to 64 bit
            Int8(): pa.int8(),
            Int16(): pa.int16(),
            Int32(): pa.int32(),
            Int64(): pa.int64(),
            UInt8(): pa.uint8(),
            UInt16(): pa.uint16(),
            UInt32(): pa.uint32(),
            UInt64(): pa.uint64(),
            Float(): pa.float64(),  # we default to 64 bit
            Float32(): pa.float32(),
            Float64(): pa.float64(),
            Decimal(): pa.decimal128(35, 10),  # Arbitrary precision
            String(): pa.string(),
            Bool(): pa.bool_(),
            Date(): pa.date32(),
            Time(): pa.time64("us"),
            Datetime(): pa.timestamp("us"),
            Duration(): pa.duration("us"),
            NullType(): pa.null(),  # kept in line with to_sql / to_polars
        }[self]

    def to_polars(self: "Dtype"):
        """Return the polars data type for this dtype.

        Raises:
            KeyError: if this dtype has no entry in the conversion table.
        """
        import polars as pl

        return {
            Int(): pl.Int64,  # we default to 64 bit
            Int64(): pl.Int64,
            Int32(): pl.Int32,
            Int16(): pl.Int16,
            Int8(): pl.Int8,
            UInt64(): pl.UInt64,
            UInt32(): pl.UInt32,
            UInt16(): pl.UInt16,
            UInt8(): pl.UInt8,
            Float(): pl.Float64,  # we default to 64 bit
            Float64(): pl.Float64,
            Float32(): pl.Float32,
            Decimal(): pl.Decimal(scale=10),  # Arbitrary precision
            String(): pl.Utf8,
            Bool(): pl.Boolean,
            Datetime(): pl.Datetime("us"),
            Duration(): pl.Duration("us"),
            Time(): pl.Time,  # Polars uses nanoseconds since midnight
            Date(): pl.Date,
            NullType(): pl.Null,
        }[self]
329
+
330
+
331
class Float(Dtype):
    """Generic floating point dtype and parent of all float dtypes."""

    @classmethod
    def is_float(cls):
        """Report this class and every subclass as a float dtype."""
        return True


class Float64(Float):
    """Floating point dtype, 64 bit."""


class Float32(Float):
    """Floating point dtype, 32 bit."""


class Decimal(Float):
    """Fixed point decimal dtype; modelled as a kind of Float."""
344
+
345
+
346
class Int(Dtype):
    """Generic integer dtype and parent of all sized integer dtypes."""

    @classmethod
    def is_int(cls):
        """Report this class and every subclass as an integer dtype."""
        return True


class Int64(Int):
    """Signed integer dtype, 64 bit."""


class Int32(Int):
    """Signed integer dtype, 32 bit."""


class Int16(Int):
    """Signed integer dtype, 16 bit."""


class Int8(Int):
    """Signed integer dtype, 8 bit."""


class UInt64(Int):
    """Unsigned integer dtype, 64 bit."""


class UInt32(Int):
    """Unsigned integer dtype, 32 bit."""


class UInt16(Int):
    """Unsigned integer dtype, 16 bit."""


class UInt8(Int):
    """Unsigned integer dtype, 8 bit."""
374
+
375
+
376
class String(Dtype):
    """Text dtype."""


class Bool(Dtype):
    """Boolean dtype."""


class Datetime(Dtype):
    """Combined date and time (timestamp) dtype."""


class Date(Dtype):
    """Calendar date dtype."""


class Time(Dtype):
    """Time of day dtype."""


class Duration(Dtype):
    """Time span (interval) dtype."""


class NullType(Dtype):
    """Dtype of a null value or column without a concrete type."""
395
+
396
+
397
class List(Dtype):
    """List dtype whose elements all share the dtype ``inner``.

    Unlike the plain dtypes, a List carries state: equality and hashing take
    the element dtype into account.
    """

    def __init__(self, inner: "Dtype"):
        """Store the element dtype."""
        self.inner = inner

    def __eq__(self, rhs):
        """Return True if ``rhs`` is a List with an equal element dtype."""
        return isinstance(rhs, List) and self.inner == rhs.inner

    def __hash__(self):
        """Hash by the element dtype so that equal lists hash equally."""
        return hash((0, hash(self.inner)))

    def __repr__(self):
        """Return e.g. ``List[Int64]``."""
        return f"List[{repr(self.inner)}]"

    def to_sql(self):
        """Return a SQLAlchemy ARRAY of the element's SQL type."""
        import sqlalchemy as sqa

        return sqa.ARRAY(self.inner.to_sql())

    def to_polars(self):
        """Return a polars List of the element's polars type."""
        import polars as pl

        return pl.List(self.inner.to_polars())

    def to_arrow(self):
        """Return a pyarrow list type of the element's arrow type.

        Without this override the lookup table of the base class is used,
        which has no entry for List instances and fails with KeyError.
        """
        import pyarrow as pa

        return pa.list_(self.inner.to_arrow())
@@ -0,0 +1,9 @@
1
+ # Copyright (c) QuantCo and pydiverse contributors 2025-2025
2
+ # SPDX-License-Identifier: BSD-3-Clause
3
+
4
+
5
class DisposedError(Exception):
    """
    Exception raised when an object has been disposed, but some of its
    attributes are being accessed nevertheless.
    """
@@ -0,0 +1,13 @@
1
+ # Copyright (c) QuantCo and pydiverse contributors 2025-2025
2
+ # SPDX-License-Identifier: BSD-3-Clause
3
+ from .deep_map import deep_map
4
+ from .deep_merge import deep_merge
5
+ from .disposable import Disposable
6
+ from .import_ import requires
7
+
8
+ __all__ = [
9
+ "deep_map",
10
+ "deep_merge",
11
+ "Disposable",
12
+ "requires",
13
+ ]