pydiverse-common 0.3.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pydiverse/common/__init__.py +55 -0
- pydiverse/common/dtypes.py +418 -0
- pydiverse/common/errors/__init__.py +9 -0
- pydiverse/common/util/__init__.py +13 -0
- pydiverse/common/util/computation_tracing.py +341 -0
- pydiverse/common/util/deep_map.py +100 -0
- pydiverse/common/util/deep_merge.py +55 -0
- pydiverse/common/util/disposable.py +28 -0
- pydiverse/common/util/hashing.py +32 -0
- pydiverse/common/util/import_.py +135 -0
- pydiverse/common/util/structlog.py +115 -0
- pydiverse/common/version.py +10 -0
- pydiverse_common-0.3.2.dist-info/METADATA +64 -0
- pydiverse_common-0.3.2.dist-info/RECORD +16 -0
- pydiverse_common-0.3.2.dist-info/WHEEL +5 -0
- pydiverse_common-0.3.2.dist-info/licenses/LICENSE +29 -0
@@ -0,0 +1,55 @@
|
|
1
|
+
# Copyright (c) QuantCo and pydiverse contributors 2025-2025
|
2
|
+
# SPDX-License-Identifier: BSD-3-Clause
|
3
|
+
from .dtypes import (
|
4
|
+
Bool,
|
5
|
+
Date,
|
6
|
+
Datetime,
|
7
|
+
Decimal,
|
8
|
+
Dtype,
|
9
|
+
Duration,
|
10
|
+
Float,
|
11
|
+
Float32,
|
12
|
+
Float64,
|
13
|
+
Int,
|
14
|
+
Int8,
|
15
|
+
Int16,
|
16
|
+
Int32,
|
17
|
+
Int64,
|
18
|
+
List,
|
19
|
+
NullType,
|
20
|
+
PandasBackend,
|
21
|
+
String,
|
22
|
+
Time,
|
23
|
+
UInt8,
|
24
|
+
UInt16,
|
25
|
+
UInt32,
|
26
|
+
UInt64,
|
27
|
+
)
|
28
|
+
from .version import __version__
|
29
|
+
|
30
|
+
# Public API of the package: the names exported by ``from <package> import *``.
# Every entry is imported above from ``.dtypes`` or ``.version``.
__all__ = [
    "__version__",
    "Dtype",
    "Bool",
    "Date",
    "Datetime",
    "Decimal",
    "Duration",
    "Float",
    "Float32",
    "Float64",
    "Int",
    "Int8",
    "Int16",
    "Int32",
    "Int64",
    "NullType",
    "String",
    "Time",
    "UInt8",
    "UInt16",
    "UInt32",
    "UInt64",
    "List",
    "PandasBackend",
]
|
@@ -0,0 +1,418 @@
|
|
1
|
+
# Copyright (c) QuantCo and pydiverse contributors 2025-2025
|
2
|
+
# SPDX-License-Identifier: BSD-3-Clause
|
3
|
+
from enum import Enum
|
4
|
+
|
5
|
+
|
6
|
+
class PandasBackend(str, Enum):
    """Storage backend to target when converting a dtype to pandas.

    Members are also ``str`` instances, so they compare equal to their plain
    string values (``"numpy"`` / ``"arrow"``).
    """

    # NumPy-based storage.
    NUMPY = "numpy"
    # pyarrow-based storage.
    ARROW = "arrow"
|
9
|
+
|
10
|
+
|
11
|
+
class Dtype:
    """Base class for all data types.

    Plain dtypes carry no state: two dtypes are equal exactly when they are
    instances of the same class, and they hash by class. This makes dtype
    instances usable as dictionary keys, which the ``to_*`` converters rely
    on for their lookup tables.

    The ``from_*`` static methods translate a backend type (SQLAlchemy,
    pandas, pyarrow, polars) into a ``Dtype``; the ``to_*`` methods translate
    a ``Dtype`` back. Backend libraries are imported inside each method, so
    none of them is needed merely to import this module.
    """

    def __eq__(self, rhs):
        """Return True if ``rhs`` is a dtype of exactly the same class."""
        return isinstance(rhs, Dtype) and type(self) is type(rhs)

    def __hash__(self):
        """Hash by class, consistent with ``__eq__``."""
        return hash(type(self))

    def __repr__(self):
        """Return the bare class name, e.g. ``Int64``."""
        return self.__class__.__name__

    @classmethod
    def is_int(cls):
        """Return whether this is an integer dtype (False on the base class)."""
        return False

    @classmethod
    def is_float(cls):
        """Return whether this is a float dtype (False on the base class)."""
        return False

    @classmethod
    def is_subtype(cls, rhs):
        """Return whether this dtype class is acceptable where ``rhs`` is expected.

        ``rhs`` is a dtype *instance*. The result is True when ``rhs`` has
        exactly this class, when ``rhs`` is the generic ``Int`` and this class
        is an integer dtype, or when ``rhs`` is the generic ``Float`` and this
        class is a float dtype.
        """
        rhs_cls = type(rhs)
        return (
            (cls is rhs_cls)
            or (rhs_cls is Int and cls.is_int())
            or (rhs_cls is Float and cls.is_float())
        )

    @staticmethod
    def from_sql(sql_type) -> "Dtype":
        """Convert a SQLAlchemy type instance to a ``Dtype``.

        Raises:
            TypeError: if ``sql_type`` is not one of the supported types.
        """
        import sqlalchemy as sqa

        # SmallInteger and BigInteger derive from Integer in SQLAlchemy, so
        # they have to be tested before the generic Integer check.
        if isinstance(sql_type, sqa.SmallInteger):
            return Int16()
        if isinstance(sql_type, sqa.BigInteger):
            return Int64()
        if isinstance(sql_type, sqa.Integer):
            return Int32()
        if isinstance(sql_type, sqa.Float):
            # No explicit precision is treated as double precision (53 bits).
            precision = sql_type.precision or 53
            if precision <= 24:
                return Float32()
            return Float64()
        # DECIMAL is a subclass of Numeric, so a single check covers both.
        if isinstance(sql_type, sqa.Numeric):
            # Just to be safe, we always use FLOAT64 for fixpoint numbers.
            # Databases are obsessed about fixpoint. However, in dataframes, it
            # is more common to just work with double precision floating point.
            # We see Decimal as subtype of Float. Pydiverse.transform will convert
            # Decimal to Float64 whenever it cannot guarantee semantic correctness
            # otherwise.
            return Float64()
        if isinstance(sql_type, sqa.String):
            return String()
        if isinstance(sql_type, sqa.Boolean):
            return Bool()
        if isinstance(sql_type, sqa.Date):
            return Date()
        if isinstance(sql_type, sqa.Time):
            return Time()
        if isinstance(sql_type, sqa.DateTime):
            return Datetime()
        if isinstance(sql_type, sqa.Interval):
            return Duration()
        if isinstance(sql_type, sqa.ARRAY):
            # Recurse on the element type itself (the previous code passed a
            # non-existent ``item_type.from_sql`` attribute).
            return List(Dtype.from_sql(sql_type.item_type))
        # ``to_sql`` emits ``sqa.types.NullType``, so accept it here as well;
        # ``sqa.Null`` is kept for backward compatibility.
        if isinstance(sql_type, (sqa.types.NullType, sqa.Null)):
            return NullType()

        raise TypeError(f"Unsupported SQL type: {sql_type!r}")

    @staticmethod
    def from_pandas(pandas_type) -> "Dtype":
        """Convert a pandas / NumPy dtype to a ``Dtype``.

        Arrow-backed dtypes (``pd.ArrowDtype``) are delegated to
        ``from_arrow``.

        Raises:
            TypeError: if ``pandas_type`` is not one of the supported dtypes.
        """
        import numpy as np
        import pandas as pd

        if isinstance(pandas_type, pd.ArrowDtype):
            return Dtype.from_arrow(pandas_type.pyarrow_dtype)

        def is_np_dtype(type_, np_dtype):
            # NOTE(review): relies on a private pandas helper; verify on
            # pandas upgrades.
            return pd.core.dtypes.common._is_dtype_type(
                type_, pd.core.dtypes.common.classes(np_dtype)
            )

        def first_match(candidates):
            # Return an instance of the first dtype class whose NumPy type
            # matches ``pandas_type``.
            for np_dtype, dtype_cls in candidates:
                if is_np_dtype(pandas_type, np_dtype):
                    return dtype_cls()
            raise TypeError(f"Unsupported pandas dtype: {pandas_type!r}")

        if pd.api.types.is_signed_integer_dtype(pandas_type):
            return first_match(
                (
                    (np.int64, Int64),
                    (np.int32, Int32),
                    (np.int16, Int16),
                    (np.int8, Int8),
                )
            )
        if pd.api.types.is_unsigned_integer_dtype(pandas_type):
            return first_match(
                (
                    (np.uint64, UInt64),
                    (np.uint32, UInt32),
                    (np.uint16, UInt16),
                    (np.uint8, UInt8),
                )
            )
        if pd.api.types.is_float_dtype(pandas_type):
            return first_match(
                (
                    (np.float64, Float64),
                    (np.float32, Float32),
                )
            )
        if pd.api.types.is_string_dtype(pandas_type):
            # We reserve the use of the object column for string.
            return String()
        if pd.api.types.is_bool_dtype(pandas_type):
            return Bool()
        if pd.api.types.is_datetime64_any_dtype(pandas_type):
            return Datetime()
        if pd.api.types.is_timedelta64_dtype(pandas_type):
            return Duration()
        # we don't know any decimal/time dtypes in pandas if column is not
        # arrow backed

        raise TypeError(f"Unsupported pandas dtype: {pandas_type!r}")

    @staticmethod
    def from_arrow(arrow_type) -> "Dtype":
        """Convert a pyarrow data type to a ``Dtype``.

        Raises:
            TypeError: if ``arrow_type`` is not one of the supported types.
        """
        import pyarrow as pa

        def first_match(candidates):
            # Return an instance of the first dtype class whose predicate
            # accepts ``arrow_type``.
            for predicate, dtype_cls in candidates:
                if predicate(arrow_type):
                    return dtype_cls()
            raise TypeError(f"Unsupported arrow type: {arrow_type!r}")

        if pa.types.is_signed_integer(arrow_type):
            return first_match(
                (
                    (pa.types.is_int64, Int64),
                    (pa.types.is_int32, Int32),
                    (pa.types.is_int16, Int16),
                    (pa.types.is_int8, Int8),
                )
            )
        if pa.types.is_unsigned_integer(arrow_type):
            return first_match(
                (
                    (pa.types.is_uint64, UInt64),
                    (pa.types.is_uint32, UInt32),
                    (pa.types.is_uint16, UInt16),
                    (pa.types.is_uint8, UInt8),
                )
            )
        if pa.types.is_floating(arrow_type):
            return first_match(
                (
                    (pa.types.is_float64, Float64),
                    (pa.types.is_float32, Float32),
                    # There is no half-precision dtype here; widen to Float32.
                    (pa.types.is_float16, Float32),
                )
            )
        if pa.types.is_decimal(arrow_type):
            # We don't recommend using Decimal in dataframes, but we support it.
            return Decimal()
        if pa.types.is_string(arrow_type) or pa.types.is_large_string(arrow_type):
            return String()
        if pa.types.is_boolean(arrow_type):
            return Bool()
        if pa.types.is_timestamp(arrow_type):
            return Datetime()
        if pa.types.is_date(arrow_type):
            return Date()
        if pa.types.is_time(arrow_type):
            return Time()
        if pa.types.is_duration(arrow_type):
            return Duration()
        if pa.types.is_list(arrow_type) or pa.types.is_large_list(arrow_type):
            return List(Dtype.from_arrow(arrow_type.value_type))
        if pa.types.is_null(arrow_type):
            return NullType()
        raise TypeError(f"Unsupported arrow type: {arrow_type!r}")

    @staticmethod
    def from_polars(polars_type) -> "Dtype":
        """Convert a polars data type to a ``Dtype``.

        List types are converted recursively. Other types are looked up by
        ``polars_type.base_type()``; polars enums are mapped to ``String``.

        Raises:
            KeyError: if the polars type has no entry in the lookup table.
        """
        import polars as pl

        if isinstance(polars_type, pl.List):
            return List(Dtype.from_polars(polars_type.inner))

        return {
            pl.Int64: Int64(),
            pl.Int32: Int32(),
            pl.Int16: Int16(),
            pl.Int8: Int8(),
            pl.UInt64: UInt64(),
            pl.UInt32: UInt32(),
            pl.UInt16: UInt16(),
            pl.UInt8: UInt8(),
            pl.Float64: Float64(),
            pl.Float32: Float32(),
            pl.Decimal: Decimal(),
            pl.Utf8: String(),
            pl.Boolean: Bool(),
            pl.Datetime: Datetime(),
            pl.Time: Time(),
            pl.Date: Date(),
            pl.Null: NullType(),
            pl.Duration: Duration(),
            pl.Enum: String(),
        }[polars_type.base_type()]

    def to_sql(self):
        """Return the SQLAlchemy type instance for this dtype.

        Unsigned integers are mapped to the next wider signed SQL integer
        where one exists; ``UInt64`` maps to ``BigInteger``.

        Raises:
            KeyError: if this dtype has no entry in the lookup table.
        """
        import sqlalchemy as sqa

        return {
            Int(): sqa.BigInteger(),  # we default to 64 bit
            Int8(): sqa.SmallInteger(),
            Int16(): sqa.SmallInteger(),
            Int32(): sqa.Integer(),
            Int64(): sqa.BigInteger(),
            UInt8(): sqa.SmallInteger(),
            UInt16(): sqa.Integer(),
            UInt32(): sqa.BigInteger(),
            UInt64(): sqa.BigInteger(),
            Float(): sqa.Float(53),  # we default to 64 bit
            Float32(): sqa.Float(24),
            Float64(): sqa.Float(53),
            Decimal(): sqa.DECIMAL(),
            String(): sqa.String(),
            Bool(): sqa.Boolean(),
            Date(): sqa.Date(),
            Time(): sqa.Time(),
            Datetime(): sqa.DateTime(),
            Duration(): sqa.Interval(),
            NullType(): sqa.types.NullType(),
        }[self]

    def to_pandas(self, backend: PandasBackend = PandasBackend.ARROW):
        """Return the pandas dtype for this dtype and the given backend.

        With the NUMPY backend this delegates to ``to_pandas_nullable``. With
        the ARROW backend, ``String`` becomes a pyarrow-backed
        ``pd.StringDtype`` and everything else a ``pd.ArrowDtype`` wrapping
        ``to_arrow()``.

        Raises:
            ValueError: if ``backend`` is neither NUMPY nor ARROW (previously
                this case silently returned ``None``).
        """
        import pandas as pd

        if backend == PandasBackend.NUMPY:
            return self.to_pandas_nullable(backend)
        if backend == PandasBackend.ARROW:
            if self == String():
                return pd.StringDtype(storage="pyarrow")
            return pd.ArrowDtype(self.to_arrow())
        raise ValueError(f"Unknown pandas backend: {backend!r}")

    def to_pandas_nullable(self, backend: PandasBackend = PandasBackend.ARROW):
        """Return a pandas dtype for this dtype that can represent missing values.

        ``backend`` only matters for ``Time``: it is returned as an arrow
        dtype with the ARROW backend and rejected otherwise. All other dtypes
        are looked up in a fixed table regardless of ``backend``.

        Raises:
            TypeError: for ``Time`` with a non-ARROW backend.
            KeyError: if this dtype has no entry in the lookup table.
        """
        import pandas as pd

        if self == Time():
            if backend == PandasBackend.ARROW:
                return pd.ArrowDtype(self.to_arrow())
            raise TypeError("pandas doesn't have a native time dtype")

        # ``Time`` never reaches this table; it is fully handled above.
        return {
            Int(): pd.Int64Dtype(),  # we default to 64 bit
            Int8(): pd.Int8Dtype(),
            Int16(): pd.Int16Dtype(),
            Int32(): pd.Int32Dtype(),
            Int64(): pd.Int64Dtype(),
            UInt8(): pd.UInt8Dtype(),
            UInt16(): pd.UInt16Dtype(),
            UInt32(): pd.UInt32Dtype(),
            UInt64(): pd.UInt64Dtype(),
            Float(): pd.Float64Dtype(),  # we default to 64 bit
            Float32(): pd.Float32Dtype(),
            Float64(): pd.Float64Dtype(),
            Decimal(): pd.Float64Dtype(),  # decimals are represented as float64
            String(): pd.StringDtype(),
            Bool(): pd.BooleanDtype(),
            Date(): "datetime64[s]",
            Datetime(): "datetime64[us]",
            Duration(): "timedelta64[us]",
        }[self]

    def to_arrow(self):
        """Return the pyarrow data type for this dtype.

        Raises:
            KeyError: if this dtype has no entry in the lookup table.
        """
        import pyarrow as pa

        return {
            Int(): pa.int64(),  # we default to 64 bit
            Int8(): pa.int8(),
            Int16(): pa.int16(),
            Int32(): pa.int32(),
            Int64(): pa.int64(),
            UInt8(): pa.uint8(),
            UInt16(): pa.uint16(),
            UInt32(): pa.uint32(),
            UInt64(): pa.uint64(),
            Float(): pa.float64(),  # we default to 64 bit
            Float32(): pa.float32(),
            Float64(): pa.float64(),
            Decimal(): pa.decimal128(35, 10),  # Arbitrary precision
            String(): pa.string(),
            Bool(): pa.bool_(),
            Date(): pa.date32(),
            Time(): pa.time64("us"),
            Datetime(): pa.timestamp("us"),
            Duration(): pa.duration("us"),
            NullType(): pa.null(),
        }[self]

    def to_polars(self: "Dtype"):
        """Return the polars data type for this dtype.

        Raises:
            KeyError: if this dtype has no entry in the lookup table.
        """
        import polars as pl

        return {
            Int(): pl.Int64,  # we default to 64 bit
            Int64(): pl.Int64,
            Int32(): pl.Int32,
            Int16(): pl.Int16,
            Int8(): pl.Int8,
            UInt64(): pl.UInt64,
            UInt32(): pl.UInt32,
            UInt16(): pl.UInt16,
            UInt8(): pl.UInt8,
            Float(): pl.Float64,  # we default to 64 bit
            Float64(): pl.Float64,
            Float32(): pl.Float32,
            Decimal(): pl.Decimal(scale=10),  # Arbitrary precision
            String(): pl.Utf8,
            Bool(): pl.Boolean,
            Datetime(): pl.Datetime("us"),
            Duration(): pl.Duration("us"),
            Time(): pl.Time,  # Polars uses nanoseconds since midnight
            Date(): pl.Date,
            NullType(): pl.Null,
        }[self]
|
329
|
+
|
330
|
+
|
331
|
+
class Float(Dtype):
    """Generic floating point dtype and base class of all float dtypes."""

    @classmethod
    def is_float(cls):
        """Return True: this class and its subclasses are float dtypes."""
        return True
|
335
|
+
|
336
|
+
|
337
|
+
class Float64(Float):
    """64-bit (double precision) floating point dtype."""
|
338
|
+
|
339
|
+
|
340
|
+
class Float32(Float):
    """32-bit (single precision) floating point dtype."""
|
341
|
+
|
342
|
+
|
343
|
+
class Decimal(Float):
    """Fixed-point decimal dtype; modelled as a subtype of ``Float``."""
|
344
|
+
|
345
|
+
|
346
|
+
class Int(Dtype):
    """Generic integer dtype and base class of all integer dtypes."""

    @classmethod
    def is_int(cls):
        """Return True: this class and its subclasses are integer dtypes."""
        return True
|
350
|
+
|
351
|
+
|
352
|
+
class Int64(Int):
    """Signed 64-bit integer dtype."""
|
353
|
+
|
354
|
+
|
355
|
+
class Int32(Int):
    """Signed 32-bit integer dtype."""
|
356
|
+
|
357
|
+
|
358
|
+
class Int16(Int):
    """Signed 16-bit integer dtype."""
|
359
|
+
|
360
|
+
|
361
|
+
class Int8(Int):
    """Signed 8-bit integer dtype."""
|
362
|
+
|
363
|
+
|
364
|
+
class UInt64(Int):
    """Unsigned 64-bit integer dtype."""
|
365
|
+
|
366
|
+
|
367
|
+
class UInt32(Int):
    """Unsigned 32-bit integer dtype."""
|
368
|
+
|
369
|
+
|
370
|
+
class UInt16(Int):
    """Unsigned 16-bit integer dtype."""
|
371
|
+
|
372
|
+
|
373
|
+
class UInt8(Int):
    """Unsigned 8-bit integer dtype."""
|
374
|
+
|
375
|
+
|
376
|
+
class String(Dtype):
    """Text (string) dtype."""
|
377
|
+
|
378
|
+
|
379
|
+
class Bool(Dtype):
    """Boolean dtype."""
|
380
|
+
|
381
|
+
|
382
|
+
class Datetime(Dtype):
    """Timestamp (date and time of day) dtype."""
|
383
|
+
|
384
|
+
|
385
|
+
class Date(Dtype):
    """Calendar date dtype."""
|
386
|
+
|
387
|
+
|
388
|
+
class Time(Dtype):
    """Time-of-day dtype."""
|
389
|
+
|
390
|
+
|
391
|
+
class Duration(Dtype):
    """Time span (interval / timedelta) dtype."""
|
392
|
+
|
393
|
+
|
394
|
+
class NullType(Dtype):
    """Dtype of a column that holds only nulls."""
|
395
|
+
|
396
|
+
|
397
|
+
class List(Dtype):
    """Variable-length list dtype whose elements all have dtype ``inner``.

    Unlike the stateless dtypes, equality and hashing also take the element
    dtype into account, so ``List(a) == List(b)`` exactly when ``a == b``.
    """

    def __init__(self, inner: "Dtype"):
        # Element dtype; may itself be a List for nested lists.
        self.inner = inner

    def __eq__(self, rhs):
        """Return True if ``rhs`` is a List with an equal element dtype."""
        return isinstance(rhs, List) and self.inner == rhs.inner

    def __hash__(self):
        """Hash on the element dtype, consistent with ``__eq__``."""
        # The leading 0 keeps the hash distinct from the element's own hash.
        return hash((0, hash(self.inner)))

    def __repr__(self):
        """Return e.g. ``List[Int64]``."""
        return f"List[{self.inner!r}]"

    def to_sql(self):
        """Return a SQLAlchemy ``ARRAY`` of the element's SQL type."""
        import sqlalchemy as sqa

        return sqa.ARRAY(self.inner.to_sql())

    def to_arrow(self):
        """Return a pyarrow list type of the element's arrow type.

        Without this override the inherited table lookup raised ``KeyError``
        for lists, which also broke the arrow-backed ``to_pandas``.
        """
        import pyarrow as pa

        return pa.list_(self.inner.to_arrow())

    def to_polars(self):
        """Return a polars ``List`` of the element's polars type."""
        import polars as pl

        return pl.List(self.inner.to_polars())
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# Copyright (c) QuantCo and pydiverse contributors 2025-2025
|
2
|
+
# SPDX-License-Identifier: BSD-3-Clause
|
3
|
+
from .deep_map import deep_map
|
4
|
+
from .deep_merge import deep_merge
|
5
|
+
from .disposable import Disposable
|
6
|
+
from .import_ import requires
|
7
|
+
|
8
|
+
# Public API of the ``util`` subpackage: the helpers imported above and
# re-exported by ``from <package>.util import *``.
__all__ = [
    "deep_map",
    "deep_merge",
    "Disposable",
    "requires",
]
|