df-npy 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- df_npy-0.1.0/.github/workflows/ci-release.yml +140 -0
- df_npy-0.1.0/.gitignore +6 -0
- df_npy-0.1.0/.pre-commit-config.yaml +26 -0
- df_npy-0.1.0/PKG-INFO +7 -0
- df_npy-0.1.0/README.md +39 -0
- df_npy-0.1.0/df_npy/__init__.py +5 -0
- df_npy-0.1.0/df_npy/_arrays.py +30 -0
- df_npy-0.1.0/df_npy/_axis.py +141 -0
- df_npy-0.1.0/df_npy/_constants.py +51 -0
- df_npy-0.1.0/df_npy/_dtypes.py +169 -0
- df_npy-0.1.0/df_npy/_json.py +22 -0
- df_npy-0.1.0/df_npy/_paths.py +18 -0
- df_npy-0.1.0/df_npy/_serializer.py +110 -0
- df_npy-0.1.0/docs/api.md +5 -0
- df_npy-0.1.0/docs/index.md +53 -0
- df_npy-0.1.0/docs/usage/performance.md +35 -0
- df_npy-0.1.0/docs/usage/quickstart.md +49 -0
- df_npy-0.1.0/justfile +34 -0
- df_npy-0.1.0/mkdocs.yml +28 -0
- df_npy-0.1.0/pyproject.toml +45 -0
- df_npy-0.1.0/tests/test_integration_performance.py +135 -0
- df_npy-0.1.0/tests/test_private_helpers.py +132 -0
- df_npy-0.1.0/tests/test_serialisation.py +74 -0
- df_npy-0.1.0/uv.lock +689 -0
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
name: CI and Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
push:
|
|
6
|
+
branches:
|
|
7
|
+
- "**"
|
|
8
|
+
workflow_dispatch:
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
quality:
|
|
12
|
+
name: Lint, Typecheck, Test
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
steps:
|
|
15
|
+
- name: Checkout
|
|
16
|
+
uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- name: Setup Python
|
|
19
|
+
uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: "3.14"
|
|
22
|
+
|
|
23
|
+
- name: Setup uv
|
|
24
|
+
uses: astral-sh/setup-uv@v6
|
|
25
|
+
|
|
26
|
+
- name: Sync dependencies
|
|
27
|
+
run: uv sync --group dev
|
|
28
|
+
|
|
29
|
+
- name: Ruff lint
|
|
30
|
+
run: uv run ruff check
|
|
31
|
+
|
|
32
|
+
- name: Ruff format check
|
|
33
|
+
run: uv run ruff format --check
|
|
34
|
+
|
|
35
|
+
- name: Type check
|
|
36
|
+
run: uv run ty check
|
|
37
|
+
|
|
38
|
+
- name: Test
|
|
39
|
+
run: uv run python -m pytest -q
|
|
40
|
+
|
|
41
|
+
docs:
|
|
42
|
+
name: Build and Publish Docs
|
|
43
|
+
needs: quality
|
|
44
|
+
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
|
45
|
+
runs-on: ubuntu-latest
|
|
46
|
+
permissions:
|
|
47
|
+
contents: write
|
|
48
|
+
steps:
|
|
49
|
+
- name: Checkout
|
|
50
|
+
uses: actions/checkout@v4
|
|
51
|
+
|
|
52
|
+
- name: Setup Python
|
|
53
|
+
uses: actions/setup-python@v5
|
|
54
|
+
with:
|
|
55
|
+
python-version: "3.14"
|
|
56
|
+
|
|
57
|
+
- name: Setup uv
|
|
58
|
+
uses: astral-sh/setup-uv@v6
|
|
59
|
+
|
|
60
|
+
- name: Sync dependencies
|
|
61
|
+
run: uv sync --group dev
|
|
62
|
+
|
|
63
|
+
- name: Build docs
|
|
64
|
+
run: uv run zensical build
|
|
65
|
+
|
|
66
|
+
- name: Deploy to gh-pages
|
|
67
|
+
uses: peaceiris/actions-gh-pages@v4
|
|
68
|
+
with:
|
|
69
|
+
github_token: ${{ secrets.GITHUB_TOKEN }}
|
|
70
|
+
publish_branch: gh-pages
|
|
71
|
+
publish_dir: ./site
|
|
72
|
+
force_orphan: true
|
|
73
|
+
|
|
74
|
+
pypi:
|
|
75
|
+
name: Publish to PyPI (if new version)
|
|
76
|
+
needs: quality
|
|
77
|
+
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
|
|
78
|
+
runs-on: ubuntu-latest
|
|
79
|
+
permissions:
|
|
80
|
+
contents: read
|
|
81
|
+
id-token: write
|
|
82
|
+
steps:
|
|
83
|
+
- name: Checkout
|
|
84
|
+
uses: actions/checkout@v4
|
|
85
|
+
|
|
86
|
+
- name: Setup Python
|
|
87
|
+
uses: actions/setup-python@v5
|
|
88
|
+
with:
|
|
89
|
+
python-version: "3.14"
|
|
90
|
+
|
|
91
|
+
- name: Setup uv
|
|
92
|
+
uses: astral-sh/setup-uv@v6
|
|
93
|
+
|
|
94
|
+
- name: Sync dependencies
|
|
95
|
+
run: uv sync --group dev
|
|
96
|
+
|
|
97
|
+
- name: Build package
|
|
98
|
+
run: uv build
|
|
99
|
+
|
|
100
|
+
- name: Check if version already exists on PyPI
|
|
101
|
+
id: version_check
|
|
102
|
+
run: |
|
|
103
|
+
python - <<'PY'
|
|
104
|
+
import json
|
|
105
|
+
import os
|
|
106
|
+
import pathlib
|
|
107
|
+
import tomllib
|
|
108
|
+
import urllib.error
|
|
109
|
+
import urllib.request
|
|
110
|
+
|
|
111
|
+
pyproject = tomllib.loads(pathlib.Path("pyproject.toml").read_text())
|
|
112
|
+
name = pyproject["project"]["name"]
|
|
113
|
+
version = pyproject["project"]["version"]
|
|
114
|
+
exists = False
|
|
115
|
+
|
|
116
|
+
url = f"https://pypi.org/pypi/{name}/json"
|
|
117
|
+
try:
|
|
118
|
+
with urllib.request.urlopen(url, timeout=15) as response:
|
|
119
|
+
payload = json.load(response)
|
|
120
|
+
releases = payload.get("releases", {})
|
|
121
|
+
exists = version in releases and bool(releases.get(version))
|
|
122
|
+
except urllib.error.HTTPError as exc:
|
|
123
|
+
if exc.code != 404:
|
|
124
|
+
raise
|
|
125
|
+
|
|
126
|
+
with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as f:
|
|
127
|
+
f.write(f"package_name={name}\n")
|
|
128
|
+
f.write(f"package_version={version}\n")
|
|
129
|
+
f.write(f"version_exists={str(exists).lower()}\n")
|
|
130
|
+
PY
|
|
131
|
+
|
|
132
|
+
- name: Publish to PyPI with uv (OIDC)
|
|
133
|
+
if: steps.version_check.outputs.version_exists != 'true'
|
|
134
|
+
run: uv publish --trusted-publishing always dist/*
|
|
135
|
+
|
|
136
|
+
- name: Report skipped publish
|
|
137
|
+
if: steps.version_check.outputs.version_exists == 'true'
|
|
138
|
+
run: |
|
|
139
|
+
echo "PyPI publish skipped."
|
|
140
|
+
echo "version_exists=${{ steps.version_check.outputs.version_exists }}"
|
df_npy-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
3
|
+
rev: v5.0.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: trailing-whitespace
|
|
6
|
+
- id: end-of-file-fixer
|
|
7
|
+
- id: check-yaml
|
|
8
|
+
- id: check-toml
|
|
9
|
+
- id: check-added-large-files
|
|
10
|
+
- id: check-merge-conflict
|
|
11
|
+
|
|
12
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
13
|
+
rev: v0.15.20
|
|
14
|
+
hooks:
|
|
15
|
+
- id: ruff
|
|
16
|
+
args: [--fix]
|
|
17
|
+
- id: ruff-format
|
|
18
|
+
|
|
19
|
+
- repo: local
|
|
20
|
+
hooks:
|
|
21
|
+
- id: ty-check
|
|
22
|
+
name: ty check
|
|
23
|
+
entry: uv run ty check
|
|
24
|
+
language: system
|
|
25
|
+
pass_filenames: false
|
|
26
|
+
types: [python]
|
df_npy-0.1.0/PKG-INFO
ADDED
df_npy-0.1.0/README.md
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# df-npy
|
|
2
|
+
|
|
3
|
+
Serialize and deserialize pandas DataFrames to `.npy` plus JSON metadata.
|
|
4
|
+
|
|
5
|
+
## Security
|
|
6
|
+
|
|
7
|
+
### Pickle policy
|
|
8
|
+
|
|
9
|
+
`df-npy` has a strict no-pickle policy.
|
|
10
|
+
|
|
11
|
+
- Serialization always writes NumPy arrays with `allow_pickle=False`.
|
|
12
|
+
- Deserialization always reads with `allow_pickle=False`.
|
|
13
|
+
- DataFrames that would require pickle-backed object serialization are rejected.
|
|
14
|
+
|
|
15
|
+
This behavior is intentional for security-sensitive environments.
|
|
16
|
+
|
|
17
|
+
### What this means in practice
|
|
18
|
+
|
|
19
|
+
Supported data should be representable without Python object pickling.
|
|
20
|
+
|
|
21
|
+
Examples of unsupported frames:
|
|
22
|
+
|
|
23
|
+
- Object/string frames containing non-string Python objects.
|
|
24
|
+
- Heterogeneous object columns that rely on pickle to round-trip values.
|
|
25
|
+
|
|
26
|
+
When unsupported data is provided, `df-npy` raises `ValueError` rather than falling back to pickle.
|
|
27
|
+
|
|
28
|
+
## API
|
|
29
|
+
|
|
30
|
+
Public API:
|
|
31
|
+
|
|
32
|
+
- `NpySerializer.to_npy(df, file_path)`
|
|
33
|
+
- `NpySerializer.from_npy(file_path, identifiers=None)`
|
|
34
|
+
|
|
35
|
+
Import:
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from df_npy import NpySerializer
|
|
39
|
+
```
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from ._constants import (
|
|
11
|
+
NUMPY_DTYPE_FLOAT64,
|
|
12
|
+
NUMPY_DTYPE_UNICODE,
|
|
13
|
+
STRING_MISSING_VALUE_SENTINEL,
|
|
14
|
+
)
|
|
15
|
+
from ._dtypes import DtypePlan, is_string_dtype
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def prepare_writable_array(df: pd.DataFrame, plan: DtypePlan) -> np.ndarray:
    """Turn *df* into a single Fortran-ordered NumPy array ready to be saved.

    Args:
        df: Frame whose values are written out.
        plan: Dtype decision for the frame; its ``representative_dtype`` and
            ``mixed_numeric`` fields select the conversion below.

    Returns:
        A Fortran-ordered array holding the frame's values.
    """
    if not is_string_dtype(plan.representative_dtype):
        # Mixed numeric frames are stored as float64; otherwise the
        # representative dtype of the plan is used unchanged.
        target_dtype = (
            NUMPY_DTYPE_FLOAT64 if plan.mixed_numeric else plan.representative_dtype
        )
        return np.asfortranarray(df.to_numpy(dtype=target_dtype, copy=False))

    # String frames: missing cells are replaced by the sentinel before the
    # values are converted to a NumPy unicode array.
    as_objects = df.astype(object)
    filled = as_objects.where(df.notna(), other=STRING_MISSING_VALUE_SENTINEL)
    return np.asfortranarray(filled.to_numpy(dtype=NUMPY_DTYPE_UNICODE, copy=False))
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from contextlib import suppress
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import numpy as np
|
|
8
|
+
import pandas as pd
|
|
9
|
+
|
|
10
|
+
from ._constants import DEFAULT_TIME_UNIT, AxisMetaKey, AxisType
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _extract_time_unit(dtype_name: str | None, default: str = DEFAULT_TIME_UNIT) -> str:
    """Return the unit found after the first ``[`` in *dtype_name*.

    Args:
        dtype_name: Dtype name such as ``"datetime64[ns]"``; may be empty or ``None``.
        default: Value returned when no bracketed unit is found.

    Returns:
        The run of word characters following the first ``[``, or *default*.
    """
    found = re.search(r"\[(\w+)", dtype_name) if dtype_name else None
    return found.group(1) if found else default
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def serialise_axis(index: pd.Index | pd.MultiIndex) -> dict[str, Any]:
    """Describe a pandas axis as a dictionary of plain metadata values.

    Args:
        index: The index (or column index) to describe.

    Returns:
        A dictionary keyed by ``AxisMetaKey`` values. MultiIndex, RangeIndex,
        DatetimeIndex and TimedeltaIndex each get a dedicated layout; every
        other index stores its values as a list together with its dtype string.
    """
    key = AxisMetaKey

    if isinstance(index, pd.MultiIndex):
        # Each level is described recursively; the codes are stored as lists.
        return {
            key.TYPE.value: AxisType.MULTIINDEX.value,
            key.NAMES.value: list(index.names),
            key.NLEVELS.value: index.nlevels,
            key.LEVELS.value: list(map(serialise_axis, index.levels)),
            key.CODES.value: [level_codes.tolist() for level_codes in index.codes],
            key.SORTORDER.value: index.sortorder,
        }

    header: dict[str, Any] = {
        key.TYPE.value: type(index).__name__,
        key.NAME.value: index.name,
    }

    if isinstance(index, pd.RangeIndex):
        # A range is fully described by its three integers.
        extra: dict[str, Any] = {
            key.RANGE.value: True,
            key.START.value: int(index.start),
            key.STOP.value: int(index.stop),
            key.STEP.value: int(index.step),
        }
    elif isinstance(index, pd.DatetimeIndex):
        # Values are kept as the integers exposed by ``asi8``.
        extra = {
            key.DATETIME.value: True,
            key.DTYPE.value: index.dtype.name,
            key.TZ.value: None if index.tz is None else str(index.tz),
            key.FREQ.value: index.freqstr,
            key.VALUES_I8.value: index.asi8.tolist(),
        }
    elif isinstance(index, pd.TimedeltaIndex):
        extra = {
            key.TIMEDELTA.value: True,
            key.DTYPE.value: index.dtype.name,
            key.FREQ.value: index.freqstr,
            key.VALUES_I8.value: index.asi8.tolist(),
        }
    else:
        extra = {
            key.VALUES.value: index.tolist(),
            key.DTYPE.value: str(getattr(index, "dtype", "object")),
        }

    return {**header, **extra}
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def deserialise_axis(metadata: dict[str, Any]) -> pd.Index | pd.MultiIndex:
    """Rebuild a pandas axis from a metadata dictionary.

    Args:
        metadata: Dictionary keyed by ``AxisMetaKey`` values, in the layout
            produced by ``serialise_axis``.

    Returns:
        A MultiIndex, RangeIndex, DatetimeIndex or TimedeltaIndex when the
        matching marker is present; otherwise a plain ``pd.Index`` built from
        the stored values.

    Raises:
        KeyError: If a MultiIndex entry lacks levels/codes or a range entry
            lacks start/stop/step.
    """
    key = AxisMetaKey
    lookup = metadata.get

    if lookup(key.TYPE.value) == AxisType.MULTIINDEX.value:
        return pd.MultiIndex(
            levels=list(map(deserialise_axis, metadata[key.LEVELS.value])),
            codes=metadata[key.CODES.value],
            names=lookup(key.NAMES.value),
            sortorder=lookup(key.SORTORDER.value),
        )

    if lookup(key.RANGE.value):
        return pd.RangeIndex(
            start=metadata[key.START.value],
            stop=metadata[key.STOP.value],
            step=metadata[key.STEP.value],
            name=lookup(key.NAME.value),
        )

    if lookup(key.DATETIME.value):
        unit = _extract_time_unit(lookup(key.DTYPE.value))
        stamps = lookup(key.VALUES_I8.value, [])
        zone = lookup(key.TZ.value)
        label = lookup(key.NAME.value)

        if zone:
            # Parse as UTC first, then convert to the stored time zone.
            parsed = pd.to_datetime(stamps, unit=unit, utc=True)
            restored = pd.DatetimeIndex(parsed, name=label).tz_convert(zone)
        else:
            restored = pd.DatetimeIndex(pd.to_datetime(stamps, unit=unit), name=label)

        frequency = lookup(key.FREQ.value)
        if frequency:
            # If pandas rejects the stored freq with ValueError, the index is
            # returned without a freq instead of failing.
            with suppress(ValueError):
                restored = pd.DatetimeIndex(
                    restored,
                    name=restored.name,
                    freq=frequency,
                )
        return restored

    if lookup(key.TIMEDELTA.value):
        unit = _extract_time_unit(lookup(key.DTYPE.value))
        raw_i8 = lookup(key.VALUES_I8.value, [])
        deltas = np.array(raw_i8, dtype=f"timedelta64[{unit}]")
        restored = pd.TimedeltaIndex(deltas, name=lookup(key.NAME.value))

        frequency = lookup(key.FREQ.value)
        if frequency:
            with suppress(ValueError):
                restored = pd.TimedeltaIndex(
                    restored,
                    name=restored.name,
                    freq=frequency,
                )
        return restored

    # Generic fallback: values and name only.
    return pd.Index(lookup(key.VALUES.value, []), name=lookup(key.NAME.value))
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
from enum import StrEnum
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class MetadataKey(StrEnum):
|
|
5
|
+
COLUMNS = "columns"
|
|
6
|
+
INDEX = "index"
|
|
7
|
+
DTYPE = "dtype"
|
|
8
|
+
STORAGE_DTYPE = "storage_dtype"
|
|
9
|
+
COLUMN_DTYPES = "column_dtypes"
|
|
10
|
+
SHAPE = "shape"
|
|
11
|
+
ORDER = "order"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class AxisMetaKey(StrEnum):
|
|
15
|
+
TYPE = "type"
|
|
16
|
+
NAME = "name"
|
|
17
|
+
RANGE = "range"
|
|
18
|
+
START = "start"
|
|
19
|
+
STOP = "stop"
|
|
20
|
+
STEP = "step"
|
|
21
|
+
DATETIME = "datetime"
|
|
22
|
+
TIMEDELTA = "timedelta"
|
|
23
|
+
DTYPE = "dtype"
|
|
24
|
+
TZ = "tz"
|
|
25
|
+
FREQ = "freq"
|
|
26
|
+
VALUES_I8 = "values_i8"
|
|
27
|
+
VALUES = "values"
|
|
28
|
+
LEVELS = "levels"
|
|
29
|
+
CODES = "codes"
|
|
30
|
+
NAMES = "names"
|
|
31
|
+
SORTORDER = "sortorder"
|
|
32
|
+
NLEVELS = "nlevels"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class AxisType(StrEnum):
|
|
36
|
+
MULTIINDEX = "multiindex"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Placeholder substituted for missing cells when a string frame is converted
# to a NumPy unicode array.
STRING_MISSING_VALUE_SENTINEL = "<MISSING_STRING_VALUE_SENTINEL_df_npy>"

# NumPy dtype names.
NUMPY_DTYPE_FLOAT64 = "float64"
NUMPY_DTYPE_UNICODE = "U"

# pandas dtype names.
PANDAS_NULLABLE_INT_DTYPE = "Int64"
PANDAS_BOOL_DTYPE = "bool"
PANDAS_NULLABLE_BOOL_DTYPE = "boolean"

# Unit assumed when a datetime/timedelta dtype name carries no bracketed unit.
DEFAULT_TIME_UNIT = "ns"

# File suffixes of the array file and its metadata companion.
NPY_SUFFIX = ".npy"
JSON_SUFFIX = ".json"

ARRAY_ORDER_FORTRAN = "F"


# Dtype names treated as string-like when pandas cannot classify a dtype.
STRING_DTYPE_NAMES = {"object", "string", "str"}
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from numpy.typing import DTypeLike
|
|
8
|
+
|
|
9
|
+
from ._constants import (
|
|
10
|
+
NUMPY_DTYPE_FLOAT64,
|
|
11
|
+
PANDAS_BOOL_DTYPE,
|
|
12
|
+
PANDAS_NULLABLE_BOOL_DTYPE,
|
|
13
|
+
STRING_DTYPE_NAMES,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
class DtypePlan:
    """Immutable record of how a frame's dtypes are handled.

    Attributes:
        representative_dtype: Dtype chosen to represent the whole frame.
        mixed_numeric: True when integer and float columns were both present.
        column_dtypes: String form of every column dtype, in column order.
    """

    representative_dtype: DTypeLike
    mixed_numeric: bool
    column_dtypes: list[str]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def is_string_dtype(dtype: object) -> bool:
    """Report whether *dtype* is a string or object dtype.

    Args:
        dtype: A dtype object or dtype name; ``None`` is accepted.

    Returns:
        ``False`` for ``None``. Otherwise ``True`` when pandas classifies the
        dtype as string or object. If pandas raises ``TypeError`` or
        ``ValueError``, the answer falls back to a name lookup in
        ``STRING_DTYPE_NAMES``.
    """
    if dtype is None:
        return False
    try:
        return pd.api.types.is_string_dtype(dtype) or pd.api.types.is_object_dtype(
            dtype,
        )
    # Parenthesised tuple keeps this clause valid on interpreters older than 3.14.
    except (TypeError, ValueError):
        return str(dtype) in STRING_DTYPE_NAMES
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def is_numeric_dtype(dtype: object) -> bool:
    """Report whether pandas classifies *dtype* as numeric.

    Args:
        dtype: A dtype object or dtype name.

    Returns:
        The result of ``pd.api.types.is_numeric_dtype``; ``False`` if that
        call raises ``TypeError`` or ``ValueError``.
    """
    try:
        return pd.api.types.is_numeric_dtype(dtype)
    # Parenthesised tuple keeps this clause valid on interpreters older than 3.14.
    except (TypeError, ValueError):
        return False
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def is_integer_dtype(dtype: object) -> bool:
    """Report whether pandas classifies *dtype* as an integer dtype.

    Args:
        dtype: A dtype object or dtype name.

    Returns:
        The result of ``pd.api.types.is_integer_dtype``; ``False`` if that
        call raises ``TypeError`` or ``ValueError``.
    """
    try:
        return pd.api.types.is_integer_dtype(dtype)
    # Parenthesised tuple keeps this clause valid on interpreters older than 3.14.
    except (TypeError, ValueError):
        return False
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def is_float_dtype(dtype: object) -> bool:
    """Report whether pandas classifies *dtype* as a float dtype.

    Args:
        dtype: A dtype object or dtype name.

    Returns:
        The result of ``pd.api.types.is_float_dtype``; ``False`` if that
        call raises ``TypeError`` or ``ValueError``.
    """
    try:
        return pd.api.types.is_float_dtype(dtype)
    # Parenthesised tuple keeps this clause valid on interpreters older than 3.14.
    except (TypeError, ValueError):
        return False
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def extract_dtype_plan(df: pd.DataFrame) -> DtypePlan:
    """Decide which single dtype represents *df*, or reject the frame.

    Args:
        df: Frame to inspect.

    Returns:
        A ``DtypePlan`` carrying the representative dtype, the mixed-numeric
        flag and the string form of every column dtype.

    Raises:
        ValueError: If a string/object frame has more than one dtype or holds
            non-string, non-missing values; if an all-numeric frame has
            several dtypes without both an integer and a float dtype; or if
            any other frame has more than one dtype.
    """
    dtypes = df.dtypes
    column_dtypes = [str(dtype) for dtype in dtypes]
    distinct = dtypes.nunique()

    def plan_for(representative: object, *, mixed: bool = False) -> DtypePlan:
        # All plans share the same per-column dtype listing.
        return DtypePlan(
            representative_dtype=representative,
            mixed_numeric=mixed,
            column_dtypes=column_dtypes,
        )

    # No dtypes at all (no columns): default to float64.
    if distinct == 0:
        return plan_for(NUMPY_DTYPE_FLOAT64)

    if any(map(is_string_dtype, dtypes)):
        if distinct > 1:
            msg = (
                f"DataFrame has {distinct} distinct dtypes; "
                "only single-dtype frames are supported "
                "(string/object cannot be mixed)."
            )
            raise ValueError(msg)

        # Every non-missing cell must be a real ``str``.
        present = df.stack().dropna()
        only_strings = present.map(lambda value: isinstance(value, str)).all()
        if not only_strings:
            msg = (
                "Pickle-backed object serialization is disabled; "
                "object/string frames must contain only string values "
                "and missing values."
            )
            raise ValueError(msg)
        return plan_for(dtypes.iloc[0])

    if all(map(is_numeric_dtype, dtypes)):
        has_int = any(map(is_integer_dtype, dtypes))
        has_float = any(map(is_float_dtype, dtypes))
        if has_int and has_float:
            # Integer + float columns are stored together as float64.
            return plan_for(NUMPY_DTYPE_FLOAT64, mixed=True)
        if distinct > 1:
            msg = (
                f"DataFrame has {distinct} distinct numeric dtypes; "
                "only int+float mixing is supported."
            )
            raise ValueError(msg)
        return plan_for(dtypes.iloc[0])

    if distinct > 1:
        msg = (
            f"DataFrame has {distinct} distinct dtypes; "
            "only single-dtype frames are supported."
        )
        raise ValueError(msg)

    return plan_for(dtypes.iloc[0])
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _nullable_integer_dtype(dtype_name: str) -> str:
|
|
135
|
+
match = re.fullmatch(r"(u?)int(8|16|32|64)", dtype_name)
|
|
136
|
+
if not match:
|
|
137
|
+
return "Int64"
|
|
138
|
+
unsigned, bits = match.groups()
|
|
139
|
+
prefix = "UInt" if unsigned else "Int"
|
|
140
|
+
return f"{prefix}{bits}"
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def restore_column_dtypes(df: pd.DataFrame, column_dtypes: list[str]) -> pd.DataFrame:
    """Cast each column of *df* back to its recorded dtype, in place.

    Columns are addressed by position, so frames with duplicate column labels
    are handled as well.

    Args:
        df: Frame whose columns are re-typed; it is modified and returned.
        column_dtypes: One dtype name per column, in column order.

    Returns:
        The same ``df`` object with its columns cast.

    Raises:
        ValueError: If ``column_dtypes`` and ``df.columns`` differ in length.
    """
    if len(column_dtypes) != len(df.columns):
        raise ValueError(
            "column_dtypes length does not match DataFrame columns length.",
        )

    for position, dtype_name in enumerate(column_dtypes):
        # Positional access always yields a Series, even when labels repeat.
        series = df.iloc[:, position]

        if dtype_name.startswith(("int", "uint")):
            if series.isna().any():
                # Missing values cannot live in a NumPy integer column, so the
                # matching pandas nullable integer dtype is used instead.
                restored = pd.to_numeric(series, errors="coerce").astype(
                    _nullable_integer_dtype(dtype_name),
                )
            else:
                restored = pd.to_numeric(series, errors="raise").astype(dtype_name)
        elif dtype_name.startswith("float"):
            restored = pd.to_numeric(series, errors="coerce").astype(dtype_name)
        else:
            # Covers "bool", "boolean" and every other dtype name: a direct cast.
            restored = series.astype(dtype_name)

        # Replace the column at this position without touching same-named ones.
        df.isetitem(position, restored)

    return df
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def safe_str(obj: object) -> str:
    """Return ``str(obj)``, or ``repr(obj)`` when ``str`` raises."""
    try:
        text = str(obj)
    except Exception:
        # A failing ``__str__`` must not abort the caller; use repr instead.
        text = repr(obj)
    return text
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def json_default(obj: object) -> bool | float | int | str:
    """Convert NumPy scalars and unknown objects to plain Python values.

    Args:
        obj: Value to convert.

    Returns:
        A string for ``np.datetime64`` / ``np.timedelta64``, an ``int`` for
        NumPy integers, a ``float`` for NumPy floats, a ``bool`` for NumPy
        booleans, and ``safe_str(obj)`` for everything else.
    """
    # Checked first: np.timedelta64 is a subclass of np.signedinteger, so the
    # integer test below would otherwise turn it into a unit-less int.
    if isinstance(obj, (np.datetime64, np.timedelta64)):
        return str(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    return safe_str(obj)
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from ._constants import JSON_SUFFIX, NPY_SUFFIX
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def ensure_npy_path(file_path: Path | str, *, for_write: bool) -> Path:
    """Return *file_path* as a ``Path`` whose suffix is the ``.npy`` suffix.

    Args:
        file_path: Location given by the caller.
        for_write: When true, the parent directory is created if missing.

    Returns:
        The path unchanged if it already has the expected suffix, otherwise
        the path with its suffix replaced.
    """
    candidate = Path(file_path)
    target = (
        candidate
        if candidate.suffix == NPY_SUFFIX
        else candidate.with_suffix(NPY_SUFFIX)
    )
    if for_write:
        target.parent.mkdir(parents=True, exist_ok=True)
    return target
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def metadata_file_from_npy_file(npy_file: Path) -> Path:
    """Return *npy_file* with its suffix swapped for the JSON suffix."""
    sidecar = npy_file.with_suffix(JSON_SUFFIX)
    return sidecar
|