df-npy 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- df_npy/__init__.py +5 -0
- df_npy/_arrays.py +30 -0
- df_npy/_axis.py +141 -0
- df_npy/_constants.py +51 -0
- df_npy/_dtypes.py +169 -0
- df_npy/_json.py +22 -0
- df_npy/_paths.py +18 -0
- df_npy/_serializer.py +110 -0
- df_npy-0.1.0.dist-info/METADATA +7 -0
- df_npy-0.1.0.dist-info/RECORD +11 -0
- df_npy-0.1.0.dist-info/WHEEL +4 -0
df_npy/__init__.py
ADDED
df_npy/_arrays.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from ._constants import (
|
|
11
|
+
NUMPY_DTYPE_FLOAT64,
|
|
12
|
+
NUMPY_DTYPE_UNICODE,
|
|
13
|
+
STRING_MISSING_VALUE_SENTINEL,
|
|
14
|
+
)
|
|
15
|
+
from ._dtypes import DtypePlan, is_string_dtype
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def prepare_writable_array(df: pd.DataFrame, plan: DtypePlan) -> np.ndarray:
    """Convert ``df`` into a single Fortran-ordered NumPy array.

    Args:
        df: Frame whose values are exported.
        plan: Dtype plan that selects the conversion path.

    Returns:
        The result of ``np.asfortranarray`` applied to the converted values.
        String-like plans are exported with ``NUMPY_DTYPE_UNICODE`` after
        missing cells are replaced by ``STRING_MISSING_VALUE_SENTINEL``;
        other plans use ``NUMPY_DTYPE_FLOAT64`` when ``plan.mixed_numeric``
        is set and ``plan.representative_dtype`` otherwise.
    """
    if is_string_dtype(plan.representative_dtype):
        as_objects = df.astype(object)
        # Swap missing cells for the sentinel before converting to unicode.
        filled = as_objects.where(df.notna(), other=STRING_MISSING_VALUE_SENTINEL)
        values = filled.to_numpy(dtype=NUMPY_DTYPE_UNICODE, copy=False)
    else:
        target_dtype = (
            NUMPY_DTYPE_FLOAT64 if plan.mixed_numeric else plan.representative_dtype
        )
        values = df.to_numpy(dtype=target_dtype, copy=False)

    return np.asfortranarray(values)
|
df_npy/_axis.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from contextlib import suppress
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from ._constants import DEFAULT_TIME_UNIT, AxisMetaKey, AxisType
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _extract_time_unit(dtype_name: str | None, default: str = DEFAULT_TIME_UNIT) -> str:
    """Return the word that follows the first ``[`` in ``dtype_name``.

    Args:
        dtype_name: Dtype name to inspect; may be ``None`` or empty.
        default: Value returned when no bracketed word is found.

    Returns:
        The captured word, or ``default`` when ``dtype_name`` is falsy or
        contains no ``[`` followed by word characters.
    """
    found = re.search(r"\[(\w+)", dtype_name) if dtype_name else None
    return found.group(1) if found else default
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def serialise_axis(index: pd.Index | pd.MultiIndex) -> dict[str, Any]:
    """Describe ``index`` as a dictionary of metadata entries.

    A ``MultiIndex`` is described by its names, level count, recursively
    serialised levels, codes and sort order. Every other index starts with
    its class name and ``name`` and is completed with entries that depend on
    its kind: range bounds, datetime details, timedelta details, or the plain
    list of values with the dtype string.

    Args:
        index: The axis to describe.

    Returns:
        A dictionary keyed by ``AxisMetaKey`` values.
    """
    if isinstance(index, pd.MultiIndex):
        level_payloads = [serialise_axis(level) for level in index.levels]
        code_lists = [codes.tolist() for codes in index.codes]
        return {
            AxisMetaKey.TYPE.value: AxisType.MULTIINDEX.value,
            AxisMetaKey.NAMES.value: list(index.names),
            AxisMetaKey.NLEVELS.value: index.nlevels,
            AxisMetaKey.LEVELS.value: level_payloads,
            AxisMetaKey.CODES.value: code_lists,
            AxisMetaKey.SORTORDER.value: index.sortorder,
        }

    header: dict[str, Any] = {
        AxisMetaKey.TYPE.value: type(index).__name__,
        AxisMetaKey.NAME.value: index.name,
    }

    details: dict[str, Any]
    if isinstance(index, pd.RangeIndex):
        details = {
            AxisMetaKey.RANGE.value: True,
            AxisMetaKey.START.value: int(index.start),
            AxisMetaKey.STOP.value: int(index.stop),
            AxisMetaKey.STEP.value: int(index.step),
        }
    elif isinstance(index, pd.DatetimeIndex):
        zone = index.tz
        details = {
            AxisMetaKey.DATETIME.value: True,
            AxisMetaKey.DTYPE.value: index.dtype.name,
            AxisMetaKey.TZ.value: None if zone is None else str(zone),
            AxisMetaKey.FREQ.value: index.freqstr,
            AxisMetaKey.VALUES_I8.value: index.asi8.tolist(),
        }
    elif isinstance(index, pd.TimedeltaIndex):
        details = {
            AxisMetaKey.TIMEDELTA.value: True,
            AxisMetaKey.DTYPE.value: index.dtype.name,
            AxisMetaKey.FREQ.value: index.freqstr,
            AxisMetaKey.VALUES_I8.value: index.asi8.tolist(),
        }
    else:
        details = {
            AxisMetaKey.VALUES.value: index.tolist(),
            AxisMetaKey.DTYPE.value: str(getattr(index, "dtype", "object")),
        }

    # Header entries come first so the key order of the payload is stable.
    return {**header, **details}
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def deserialise_axis(metadata: dict[str, Any]) -> pd.Index | pd.MultiIndex:
    """Rebuild a pandas index from the dictionary made by ``serialise_axis``.

    The branches are tried in this order: multi-index, range, datetime,
    timedelta, and finally a plain ``pd.Index`` built from the stored values.

    Args:
        metadata: Axis description keyed by ``AxisMetaKey`` values.

    Returns:
        The reconstructed index.

    Raises:
        KeyError: If a multi-index payload lacks levels or codes, or a range
            payload lacks start, stop or step.
    """
    if metadata.get(AxisMetaKey.TYPE.value) == AxisType.MULTIINDEX.value:
        rebuilt_levels = [
            deserialise_axis(level_meta)
            for level_meta in metadata[AxisMetaKey.LEVELS.value]
        ]
        return pd.MultiIndex(
            levels=rebuilt_levels,
            codes=metadata[AxisMetaKey.CODES.value],
            names=metadata.get(AxisMetaKey.NAMES.value),
            sortorder=metadata.get(AxisMetaKey.SORTORDER.value),
        )

    axis_name = metadata.get(AxisMetaKey.NAME.value)

    if metadata.get(AxisMetaKey.RANGE.value):
        return pd.RangeIndex(
            start=metadata[AxisMetaKey.START.value],
            stop=metadata[AxisMetaKey.STOP.value],
            step=metadata[AxisMetaKey.STEP.value],
            name=axis_name,
        )

    if metadata.get(AxisMetaKey.DATETIME.value):
        unit = _extract_time_unit(metadata.get(AxisMetaKey.DTYPE.value))
        raw_values = metadata.get(AxisMetaKey.VALUES_I8.value, [])
        zone = metadata.get(AxisMetaKey.TZ.value)

        if zone:
            # Parse the integers as UTC, then convert to the stored zone.
            parsed = pd.to_datetime(raw_values, unit=unit, utc=True)
            restored = pd.DatetimeIndex(parsed, name=axis_name).tz_convert(zone)
        else:
            restored = pd.DatetimeIndex(
                pd.to_datetime(raw_values, unit=unit),
                name=axis_name,
            )

        stored_freq = metadata.get(AxisMetaKey.FREQ.value)
        if stored_freq:
            # Best effort: keep the frequency-less index if pandas rejects it.
            with suppress(ValueError):
                restored = pd.DatetimeIndex(
                    restored, name=restored.name, freq=stored_freq
                )
        return restored

    if metadata.get(AxisMetaKey.TIMEDELTA.value):
        unit = _extract_time_unit(metadata.get(AxisMetaKey.DTYPE.value))
        raw_values = metadata.get(AxisMetaKey.VALUES_I8.value, [])
        deltas = np.array(raw_values, dtype=f"timedelta64[{unit}]")
        restored = pd.TimedeltaIndex(deltas, name=axis_name)

        stored_freq = metadata.get(AxisMetaKey.FREQ.value)
        if stored_freq:
            # Best effort: keep the frequency-less index if pandas rejects it.
            with suppress(ValueError):
                restored = pd.TimedeltaIndex(
                    restored, name=restored.name, freq=stored_freq
                )
        return restored

    return pd.Index(metadata.get(AxisMetaKey.VALUES.value, []), name=axis_name)
|
df_npy/_constants.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from enum import StrEnum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class MetadataKey(StrEnum):
|
|
5
|
+
COLUMNS = "columns"
|
|
6
|
+
INDEX = "index"
|
|
7
|
+
DTYPE = "dtype"
|
|
8
|
+
STORAGE_DTYPE = "storage_dtype"
|
|
9
|
+
COLUMN_DTYPES = "column_dtypes"
|
|
10
|
+
SHAPE = "shape"
|
|
11
|
+
ORDER = "order"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class AxisMetaKey(StrEnum):
|
|
15
|
+
TYPE = "type"
|
|
16
|
+
NAME = "name"
|
|
17
|
+
RANGE = "range"
|
|
18
|
+
START = "start"
|
|
19
|
+
STOP = "stop"
|
|
20
|
+
STEP = "step"
|
|
21
|
+
DATETIME = "datetime"
|
|
22
|
+
TIMEDELTA = "timedelta"
|
|
23
|
+
DTYPE = "dtype"
|
|
24
|
+
TZ = "tz"
|
|
25
|
+
FREQ = "freq"
|
|
26
|
+
VALUES_I8 = "values_i8"
|
|
27
|
+
VALUES = "values"
|
|
28
|
+
LEVELS = "levels"
|
|
29
|
+
CODES = "codes"
|
|
30
|
+
NAMES = "names"
|
|
31
|
+
SORTORDER = "sortorder"
|
|
32
|
+
NLEVELS = "nlevels"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class AxisType(StrEnum):
|
|
36
|
+
MULTIINDEX = "multiindex"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Placeholder written in place of missing cells in string frames and turned
# back into NaN when such a frame is loaded.
STRING_MISSING_VALUE_SENTINEL = "<MISSING_STRING_VALUE_SENTINEL_df_npy>"

# NumPy dtype names used when exporting frame values.
NUMPY_DTYPE_FLOAT64 = "float64"
NUMPY_DTYPE_UNICODE = "U"

# pandas dtype names.
PANDAS_NULLABLE_INT_DTYPE = "Int64"
PANDAS_BOOL_DTYPE = "bool"
PANDAS_NULLABLE_BOOL_DTYPE = "boolean"

# Time unit assumed when a dtype name carries no bracketed unit.
DEFAULT_TIME_UNIT = "ns"

# File suffixes of the array file and its metadata sidecar.
NPY_SUFFIX = ".npy"
JSON_SUFFIX = ".json"

# Memory-order flag recorded in the metadata.
ARRAY_ORDER_FORTRAN = "F"

# Dtype names treated as string-like when pandas cannot classify a dtype.
STRING_DTYPE_NAMES = {"object", "string", "str"}
|
df_npy/_dtypes.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from numpy.typing import DTypeLike
|
|
8
|
+
|
|
9
|
+
from ._constants import (
|
|
10
|
+
NUMPY_DTYPE_FLOAT64,
|
|
11
|
+
PANDAS_BOOL_DTYPE,
|
|
12
|
+
PANDAS_NULLABLE_BOOL_DTYPE,
|
|
13
|
+
STRING_DTYPE_NAMES,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
class DtypePlan:
    """Immutable record of how a frame's columns map onto one stored array."""

    # Dtype that represents the frame as a whole.
    representative_dtype: DTypeLike
    # True when integer and float columns are combined into one float array.
    mixed_numeric: bool
    # ``str(dtype)`` of every column, in column order.
    column_dtypes: list[str]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def is_string_dtype(dtype: object) -> bool:
    """Report whether ``dtype`` is a pandas string or object dtype.

    Args:
        dtype: A dtype, dtype name, or ``None``.

    Returns:
        ``False`` for ``None``. Otherwise ``True`` when pandas classifies
        ``dtype`` as a string or object dtype. If pandas raises ``TypeError``
        or ``ValueError``, the result is whether ``str(dtype)`` is listed in
        ``STRING_DTYPE_NAMES``.
    """
    if dtype is None:
        return False
    try:
        return pd.api.types.is_string_dtype(dtype) or pd.api.types.is_object_dtype(
            dtype,
        )
    except (TypeError, ValueError):
        # The tuple is parenthesised on purpose: the bare ``except A, B:``
        # spelling is a SyntaxError on interpreters older than Python 3.14.
        return str(dtype) in STRING_DTYPE_NAMES
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def is_numeric_dtype(dtype: object) -> bool:
    """Report whether pandas classifies ``dtype`` as numeric.

    Args:
        dtype: A dtype or dtype name.

    Returns:
        The pandas verdict, or ``False`` if pandas raises ``TypeError`` or
        ``ValueError`` while inspecting ``dtype``.
    """
    try:
        return pd.api.types.is_numeric_dtype(dtype)
    except (TypeError, ValueError):
        # Parenthesised so the module also parses before Python 3.14.
        return False
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def is_integer_dtype(dtype: object) -> bool:
    """Report whether pandas classifies ``dtype`` as an integer dtype.

    Args:
        dtype: A dtype or dtype name.

    Returns:
        The pandas verdict, or ``False`` if pandas raises ``TypeError`` or
        ``ValueError`` while inspecting ``dtype``.
    """
    try:
        return pd.api.types.is_integer_dtype(dtype)
    except (TypeError, ValueError):
        # Parenthesised so the module also parses before Python 3.14.
        return False
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def is_float_dtype(dtype: object) -> bool:
    """Report whether pandas classifies ``dtype`` as a floating-point dtype.

    Args:
        dtype: A dtype or dtype name.

    Returns:
        The pandas verdict, or ``False`` if pandas raises ``TypeError`` or
        ``ValueError`` while inspecting ``dtype``.
    """
    try:
        return pd.api.types.is_float_dtype(dtype)
    except (TypeError, ValueError):
        # Parenthesised so the module also parses before Python 3.14.
        return False
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def extract_dtype_plan(df: pd.DataFrame) -> DtypePlan:
    """Decide how the columns of ``df`` can be stored as one array.

    Args:
        df: Frame to inspect.

    Returns:
        A plan carrying the per-column dtype names. The representative dtype
        is float64 for a frame without columns or for an int+float mix (the
        latter is also flagged ``mixed_numeric``); otherwise it is the dtype
        of the first column.

    Raises:
        ValueError: If a string/object column is combined with any other
            dtype, if a string/object frame holds a non-missing value that is
            not a ``str``, or if several distinct dtypes are present that are
            not an int+float mix.
    """
    dtypes = df.dtypes
    column_dtypes = [str(dtype) for dtype in dtypes]
    n_distinct_dtypes = dtypes.nunique()

    def make_plan(representative: object, *, mixed: bool = False) -> DtypePlan:
        # Every returned plan shares the same per-column dtype names.
        return DtypePlan(
            representative_dtype=representative,
            mixed_numeric=mixed,
            column_dtypes=column_dtypes,
        )

    if n_distinct_dtypes == 0:
        return make_plan(NUMPY_DTYPE_FLOAT64)

    if any(is_string_dtype(dt) for dt in dtypes):
        if n_distinct_dtypes > 1:
            raise ValueError(
                f"DataFrame has {n_distinct_dtypes} distinct dtypes; "
                "only single-dtype frames are supported "
                "(string/object cannot be mixed).",
            )
        present_values = df.stack().dropna()
        only_strings = present_values.map(lambda value: isinstance(value, str)).all()
        if not only_strings:
            raise ValueError(
                "Pickle-backed object serialization is disabled; "
                "object/string frames must contain only string values "
                "and missing values.",
            )
        return make_plan(dtypes.iloc[0])

    all_numeric = all(is_numeric_dtype(dt) for dt in dtypes)
    if (
        all_numeric
        and any(is_integer_dtype(dt) for dt in dtypes)
        and any(is_float_dtype(dt) for dt in dtypes)
    ):
        return make_plan(NUMPY_DTYPE_FLOAT64, mixed=True)

    if n_distinct_dtypes > 1:
        # Only the wording differs between the numeric and the general case.
        if all_numeric:
            msg = (
                f"DataFrame has {n_distinct_dtypes} distinct numeric dtypes; "
                "only int+float mixing is supported."
            )
        else:
            msg = (
                f"DataFrame has {n_distinct_dtypes} distinct dtypes; "
                "only single-dtype frames are supported."
            )
        raise ValueError(msg)

    return make_plan(dtypes.iloc[0])
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _nullable_integer_dtype(dtype_name: str) -> str:
    """Map a NumPy integer dtype name to the pandas nullable dtype name.

    Args:
        dtype_name: Name such as ``"int32"`` or ``"uint8"``.

    Returns:
        ``"Int<bits>"`` or ``"UInt<bits>"`` for 8/16/32/64-bit names, and
        ``"Int64"`` for anything that does not match that pattern.
    """
    parsed = re.fullmatch(r"(u?)int(8|16|32|64)", dtype_name)
    if parsed is None:
        return "Int64"
    sign_marker, width = parsed.groups()
    return ("UInt" if sign_marker else "Int") + width
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def restore_column_dtypes(df: pd.DataFrame, column_dtypes: list[str]) -> pd.DataFrame:
    """Cast each column of ``df`` to the dtype named at the same position.

    The frame is modified in place and also returned.

    Args:
        df: Frame whose columns are re-typed.
        column_dtypes: One dtype name per column, in column order.

    Returns:
        The same ``df`` object with its columns reassigned.

    Raises:
        ValueError: If the number of dtype names differs from the number of
            columns.
    """
    if len(df.columns) != len(column_dtypes):
        raise ValueError(
            "column_dtypes length does not match DataFrame columns length.",
        )

    for column, dtype_name in zip(df.columns, column_dtypes):
        series = df[column]

        if dtype_name.startswith(("int", "uint")):
            if series.isna().any():
                # Plain integer dtypes cannot hold missing values, so use the
                # matching pandas nullable integer dtype instead.
                nullable_dtype = _nullable_integer_dtype(dtype_name)
                restored = pd.to_numeric(series, errors="coerce").astype(
                    nullable_dtype
                )
            else:
                restored = pd.to_numeric(series, errors="raise").astype(dtype_name)
        elif dtype_name.startswith("float"):
            restored = pd.to_numeric(series, errors="coerce").astype(dtype_name)
        elif dtype_name == PANDAS_BOOL_DTYPE:
            restored = series.astype(PANDAS_BOOL_DTYPE)
        elif dtype_name == PANDAS_NULLABLE_BOOL_DTYPE:
            restored = series.astype(PANDAS_NULLABLE_BOOL_DTYPE)
        else:
            restored = series.astype(dtype_name)

        df[column] = restored

    return df
|
df_npy/_json.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def safe_str(obj: object) -> str:
    """Return ``str(obj)``, or ``repr(obj)`` when ``str`` raises.

    Args:
        obj: Any object.

    Returns:
        The textual form of ``obj``.
    """
    try:
        text = str(obj)
    except Exception:
        # Deliberate catch-all: a failing ``__str__`` must not stop the caller.
        text = repr(obj)
    return text
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def json_default(obj: object) -> bool | float | int | str:
    """Convert an object ``json`` cannot encode into a JSON-friendly value.

    Args:
        obj: The value handed to the ``default`` hook of ``json.dumps``.

    Returns:
        ``str(obj)`` for NumPy datetime and timedelta scalars, a built-in
        ``int``/``float``/``bool`` for the other NumPy scalar kinds, and
        ``safe_str(obj)`` for everything else.
    """
    # This check must come before the integer one: ``np.timedelta64`` is a
    # subclass of ``np.signedinteger``, so testing ``np.integer`` first would
    # turn a timedelta into a bare number and drop its unit.
    if isinstance(obj, (np.datetime64, np.timedelta64)):
        return str(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    return safe_str(obj)
|
df_npy/_paths.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from ._constants import JSON_SUFFIX, NPY_SUFFIX
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def ensure_npy_path(file_path: Path | str, *, for_write: bool) -> Path:
    """Return ``file_path`` as a ``Path`` whose suffix is ``NPY_SUFFIX``.

    Args:
        file_path: Target location; any other suffix is replaced.
        for_write: When true, the parent directory is created if missing.

    Returns:
        The normalised path.
    """
    candidate = Path(file_path)
    target = (
        candidate
        if candidate.suffix == NPY_SUFFIX
        else candidate.with_suffix(NPY_SUFFIX)
    )
    if for_write:
        target.parent.mkdir(parents=True, exist_ok=True)
    return target
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def metadata_file_from_npy_file(npy_file: Path) -> Path:
    """Return the metadata path: ``npy_file`` with its suffix set to ``JSON_SUFFIX``."""
    metadata_file = npy_file.with_suffix(JSON_SUFFIX)
    return metadata_file
|
df_npy/_serializer.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
import numpy as np
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from loguru import logger
|
|
9
|
+
|
|
10
|
+
from ._arrays import prepare_writable_array
|
|
11
|
+
from ._axis import deserialise_axis, serialise_axis
|
|
12
|
+
from ._constants import ARRAY_ORDER_FORTRAN, STRING_MISSING_VALUE_SENTINEL, MetadataKey
|
|
13
|
+
from ._dtypes import extract_dtype_plan, is_string_dtype, restore_column_dtypes
|
|
14
|
+
from ._json import json_default
|
|
15
|
+
from ._paths import ensure_npy_path, metadata_file_from_npy_file
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class NpySerializer:
    """Store a DataFrame as an ``.npy`` array plus a JSON metadata sidecar."""

    @classmethod
    def to_npy(cls, df: pd.DataFrame, file_path: Path | str) -> None:
        """Write ``df`` to ``file_path`` and its metadata to the sidecar file.

        Args:
            df: Frame to store; its column labels must be unique.
            file_path: Destination of the array file.

        Raises:
            ValueError: If the column labels are not unique, or if the dtype
                plan cannot be built for the frame.
        """
        # Resolve the path without touching the filesystem first, so that a
        # frame rejected below does not leave freshly created directories.
        path = ensure_npy_path(file_path, for_write=False)
        if not df.columns.is_unique:
            raise ValueError("Columns must be unique for identifier-based subsetting.")

        logger.info(f"Serializing DataFrame to {path}")
        logger.debug("Extracting dtype from DataFrame")
        plan = extract_dtype_plan(df)

        logger.debug("Converting DataFrame to Fortran-ordered NumPy array")
        np_array = prepare_writable_array(df, plan)

        logger.debug("Creating metadata for DataFrame serialization")
        metadata = {
            MetadataKey.COLUMNS.value: serialise_axis(df.columns),
            MetadataKey.INDEX.value: serialise_axis(df.index),
            MetadataKey.DTYPE.value: str(plan.representative_dtype),
            MetadataKey.STORAGE_DTYPE.value: str(np_array.dtype),
            MetadataKey.COLUMN_DTYPES.value: plan.column_dtypes,
            MetadataKey.SHAPE.value: list(np_array.shape),
            MetadataKey.ORDER.value: ARRAY_ORDER_FORTRAN,
        }

        logger.debug("Saving array")
        # Everything is validated and converted: create the parent directory.
        path = ensure_npy_path(path, for_write=True)
        np.save(path, np_array, allow_pickle=False)

        logger.debug("Saving metadata")
        metadata_path = metadata_file_from_npy_file(path)
        metadata_path.write_text(
            json.dumps(metadata, indent=4, default=json_default),
            encoding="utf-8",
        )
        logger.success(f"DataFrame serialized to {path}.")

    @classmethod
    def from_npy(
        cls,
        file_path: Path | str,
        *,
        identifiers: list[str] | None = None,
    ) -> pd.DataFrame:
        """Load a frame previously written by ``to_npy``.

        Args:
            file_path: Location of the array file.
            identifiers: Optional column labels to load, in the order given;
                ``None`` loads every column.

        Returns:
            The reconstructed frame with per-column dtypes restored.

        Raises:
            FileNotFoundError: If the array file does not exist.
            ValueError: If the metadata has no ``column_dtypes`` entry or the
                stored array is not Fortran contiguous.
            KeyError: If an identifier is not among the stored columns.
        """
        path = ensure_npy_path(file_path, for_write=False)
        logger.info(f"Deserializing DataFrame from {path}")
        if not path.exists():
            raise FileNotFoundError(path)

        logger.debug("Loading metadata from JSON file")
        metadata_path = metadata_file_from_npy_file(path)
        metadata = json.loads(metadata_path.read_text(encoding="utf-8"))

        if MetadataKey.COLUMN_DTYPES.value not in metadata:
            raise ValueError("Metadata is missing required key: column_dtypes")
        column_dtypes = metadata[MetadataKey.COLUMN_DTYPES.value]

        logger.debug("Loading NumPy array from file")
        # Memory-mapped read-only, so a column subset does not read the
        # whole file.
        np_array = np.load(path, allow_pickle=False, mmap_mode="r")
        if not np_array.flags["F_CONTIGUOUS"]:
            raise ValueError("Stored array is not Fortran contiguous as expected.")

        logger.debug("Deserializing index")
        index = deserialise_axis(metadata[MetadataKey.INDEX.value])

        logger.debug("Deserializing columns")
        columns = deserialise_axis(metadata[MetadataKey.COLUMNS.value])

        if identifiers is not None:
            logger.debug("Subsetting columns based on provided identifiers")
            positions = {column: position for position, column in enumerate(columns)}
            try:
                column_indices = [positions[column] for column in identifiers]
            except KeyError as exc:
                raise KeyError(f"Identifier not found in columns: {exc}") from None
            np_array = np_array[:, np.asarray(column_indices, dtype=np.int64)]
            columns = pd.Index(identifiers, name=columns.name)
            column_dtypes = [column_dtypes[index] for index in column_indices]

        logger.debug("Creating DataFrame from NumPy array, columns, and index")
        df = pd.DataFrame(np_array, columns=columns, index=index, copy=False)

        if is_string_dtype(metadata.get(MetadataKey.DTYPE.value)):
            logger.debug("Replacing sentinel values with NaN for string/object dtype")
            df.replace(STRING_MISSING_VALUE_SENTINEL, np.nan, inplace=True)

        logger.debug("Restoring per-column dtypes from metadata")
        df = restore_column_dtypes(df, column_dtypes)

        logger.success(f"DataFrame deserialized from {path}.")
        logger.debug(f"Deserialized DataFrame shape: {df.shape}")
        return df
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
df_npy/__init__.py,sha256=Ox1NtpAz9uHwGMOw6cz0qXed8RBemmxMTsBd9y0F3g4,106
|
|
2
|
+
df_npy/_arrays.py,sha256=8dmfaI7iRQh9KmONoUTSwsLIrUh8i4obZwpvWHzTgNM,852
|
|
3
|
+
df_npy/_axis.py,sha256=tJVwl4eyRMuSuNSecPu27VxsXsASSBBFMZue8Vz1mf0,4828
|
|
4
|
+
df_npy/_constants.py,sha256=vO86zJvEavu2p4owU1j0jpP3hNDVhVHTY0-htxuOjjU,1079
|
|
5
|
+
df_npy/_dtypes.py,sha256=pDlP7bsERr035gLzb3QWpWNA_IzQBBLqqjMGlvgQEug,5152
|
|
6
|
+
df_npy/_json.py,sha256=knr5EsUnj88L6d9dVEUDu5YNNknWUu_0JH4WrbSQ1kI,520
|
|
7
|
+
df_npy/_paths.py,sha256=XiGRhZxQPKPiYVQmhAAVqvpf65kl_vdPG-PfS8aevZs,482
|
|
8
|
+
df_npy/_serializer.py,sha256=Yn0v9eYaLYejlvgklLEAkNCYLDsLORF7MUcWyZpqDkU,4583
|
|
9
|
+
df_npy-0.1.0.dist-info/METADATA,sha256=A82OGlNX-seTKYIw7eJxxO-ydxcqFuTAS9bAra88xXM,160
|
|
10
|
+
df_npy-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
11
|
+
df_npy-0.1.0.dist-info/RECORD,,
|