sciduckdb 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sciduckdb/__init__.py +31 -0
- sciduckdb/sciduckdb.py +1276 -0
- sciduckdb-0.1.0.dist-info/METADATA +57 -0
- sciduckdb-0.1.0.dist-info/RECORD +5 -0
- sciduckdb-0.1.0.dist-info/WHEEL +4 -0
sciduckdb/__init__.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""SciDuck — A thin DuckDB layer for managing versioned scientific data."""
|
|
2
|
+
|
|
3
|
+
from .sciduckdb import (
|
|
4
|
+
SciDuck,
|
|
5
|
+
_infer_duckdb_type,
|
|
6
|
+
_numpy_dtype_to_duckdb,
|
|
7
|
+
_python_to_storage,
|
|
8
|
+
_storage_to_python,
|
|
9
|
+
_storage_to_python_column,
|
|
10
|
+
_infer_data_columns,
|
|
11
|
+
_value_to_storage_row,
|
|
12
|
+
_dataframe_to_storage_rows,
|
|
13
|
+
_bulk_df_to_storage_rows,
|
|
14
|
+
_flatten_dict,
|
|
15
|
+
_unflatten_dict,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"SciDuck",
|
|
20
|
+
"_infer_duckdb_type",
|
|
21
|
+
"_numpy_dtype_to_duckdb",
|
|
22
|
+
"_python_to_storage",
|
|
23
|
+
"_storage_to_python",
|
|
24
|
+
"_storage_to_python_column",
|
|
25
|
+
"_infer_data_columns",
|
|
26
|
+
"_value_to_storage_row",
|
|
27
|
+
"_dataframe_to_storage_rows",
|
|
28
|
+
"_bulk_df_to_storage_rows",
|
|
29
|
+
"_flatten_dict",
|
|
30
|
+
"_unflatten_dict",
|
|
31
|
+
]
|
sciduckdb/sciduckdb.py
ADDED
|
@@ -0,0 +1,1276 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SciDuck — A thin DuckDB layer for managing versioned scientific data.
|
|
3
|
+
|
|
4
|
+
Each variable is stored in its own table. Variables are associated with a
|
|
5
|
+
hierarchical dataset schema (e.g. subject → session → trial) and can be
|
|
6
|
+
saved at any level of that hierarchy. Multiple versions of each variable
|
|
7
|
+
are supported natively.
|
|
8
|
+
|
|
9
|
+
All data — including arrays — is stored in queryable DuckDB types (LIST,
|
|
10
|
+
nested LIST, JSON) so the database can be inspected with DBeaver or any
|
|
11
|
+
DuckDB-compatible viewer.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import duckdb
|
|
15
|
+
import logging
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import numpy as np
|
|
18
|
+
import json
|
|
19
|
+
import datetime
|
|
20
|
+
import threading
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger("sciduck")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _schema_str(value):
|
|
28
|
+
"""Stringify a schema key value, converting whole-number floats to int.
|
|
29
|
+
|
|
30
|
+
Schema keys are stored as VARCHAR. str(1.0) → "1.0" but str(1) → "1".
|
|
31
|
+
MATLAB sends all numbers as float, so without this conversion queries
|
|
32
|
+
and cache lookups fail because "1.0" ≠ "1".
|
|
33
|
+
"""
|
|
34
|
+
if isinstance(value, float) and value.is_integer():
|
|
35
|
+
return str(int(value))
|
|
36
|
+
return str(value)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# ---------------------------------------------------------------------------
|
|
40
|
+
# Type mapping helpers
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
|
|
43
|
+
def _numpy_dtype_to_duckdb(dtype: np.dtype) -> str:
|
|
44
|
+
"""Map a numpy scalar dtype to a DuckDB type string."""
|
|
45
|
+
kind = dtype.kind
|
|
46
|
+
size = dtype.itemsize
|
|
47
|
+
if kind == "f":
|
|
48
|
+
return "FLOAT" if size <= 4 else "DOUBLE"
|
|
49
|
+
if kind in ("i", "u"):
|
|
50
|
+
mapping = {1: "TINYINT", 2: "SMALLINT", 4: "INTEGER", 8: "BIGINT"}
|
|
51
|
+
base = mapping.get(size, "BIGINT")
|
|
52
|
+
if kind == "u":
|
|
53
|
+
return "U" + base
|
|
54
|
+
return base
|
|
55
|
+
if kind == "b":
|
|
56
|
+
return "BOOLEAN"
|
|
57
|
+
if kind in ("U", "S", "O"):
|
|
58
|
+
return "VARCHAR"
|
|
59
|
+
if kind == "M":
|
|
60
|
+
return "TIMESTAMP"
|
|
61
|
+
if kind == "m":
|
|
62
|
+
return "INTERVAL"
|
|
63
|
+
return "VARCHAR"
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _infer_duckdb_type(value: Any) -> Tuple[str, dict]:
|
|
67
|
+
"""
|
|
68
|
+
Infer the DuckDB column type and a metadata dict for round-trip
|
|
69
|
+
restoration from a single Python/numpy value.
|
|
70
|
+
|
|
71
|
+
Returns (duckdb_type_str, metadata_dict).
|
|
72
|
+
"""
|
|
73
|
+
meta: dict = {}
|
|
74
|
+
|
|
75
|
+
# --- numpy arrays ---
|
|
76
|
+
if isinstance(value, np.ndarray):
|
|
77
|
+
base = _numpy_dtype_to_duckdb(value.dtype)
|
|
78
|
+
meta["python_type"] = "ndarray"
|
|
79
|
+
meta["numpy_dtype"] = str(value.dtype)
|
|
80
|
+
meta["ndim"] = value.ndim
|
|
81
|
+
meta["shape_hint"] = list(value.shape)
|
|
82
|
+
if value.ndim == 1:
|
|
83
|
+
return f"{base}[]", meta
|
|
84
|
+
if value.ndim == 2:
|
|
85
|
+
meta["shape_hint"] = [None, value.shape[1]] # rows vary, cols fixed
|
|
86
|
+
return f"{base}[][]", meta
|
|
87
|
+
# 3-D+ : store as JSON
|
|
88
|
+
meta["python_type"] = "ndarray_json"
|
|
89
|
+
return "VARCHAR", meta
|
|
90
|
+
|
|
91
|
+
# --- Python scalars ---
|
|
92
|
+
if isinstance(value, bool):
|
|
93
|
+
meta["python_type"] = "bool"
|
|
94
|
+
return "BOOLEAN", meta
|
|
95
|
+
if isinstance(value, int):
|
|
96
|
+
meta["python_type"] = "int"
|
|
97
|
+
return "BIGINT", meta
|
|
98
|
+
if isinstance(value, float):
|
|
99
|
+
meta["python_type"] = "float"
|
|
100
|
+
return "DOUBLE", meta
|
|
101
|
+
if isinstance(value, str):
|
|
102
|
+
meta["python_type"] = "str"
|
|
103
|
+
return "VARCHAR", meta
|
|
104
|
+
|
|
105
|
+
# --- Python lists ---
|
|
106
|
+
if isinstance(value, list):
|
|
107
|
+
meta["python_type"] = "list"
|
|
108
|
+
if len(value) > 0:
|
|
109
|
+
inner = value[0]
|
|
110
|
+
# Check for homogeneous list
|
|
111
|
+
if isinstance(inner, list):
|
|
112
|
+
if not all(isinstance(v, list) for v in value):
|
|
113
|
+
raise TypeError(
|
|
114
|
+
"Heterogeneous lists are not supported. "
|
|
115
|
+
"All elements must be the same type."
|
|
116
|
+
)
|
|
117
|
+
meta["nested"] = True
|
|
118
|
+
return "DOUBLE[][]", meta
|
|
119
|
+
if isinstance(inner, np.ndarray):
|
|
120
|
+
if not all(isinstance(v, np.ndarray) for v in value):
|
|
121
|
+
raise TypeError(
|
|
122
|
+
"Heterogeneous lists are not supported. "
|
|
123
|
+
"All elements must be the same type."
|
|
124
|
+
)
|
|
125
|
+
meta["nested"] = True
|
|
126
|
+
meta["contains_ndarray"] = True
|
|
127
|
+
meta["ndarray_dtype"] = str(inner.dtype)
|
|
128
|
+
return "DOUBLE[][]", meta
|
|
129
|
+
if isinstance(inner, (int, float)):
|
|
130
|
+
if not all(isinstance(v, (int, float)) for v in value):
|
|
131
|
+
raise TypeError(
|
|
132
|
+
"Heterogeneous lists are not supported. "
|
|
133
|
+
"All elements must be the same type."
|
|
134
|
+
)
|
|
135
|
+
return "DOUBLE[]", meta
|
|
136
|
+
if isinstance(inner, str):
|
|
137
|
+
if not all(isinstance(v, str) for v in value):
|
|
138
|
+
raise TypeError(
|
|
139
|
+
"Heterogeneous lists are not supported. "
|
|
140
|
+
"All elements must be the same type."
|
|
141
|
+
)
|
|
142
|
+
return "VARCHAR[]", meta
|
|
143
|
+
return "VARCHAR[]", meta
|
|
144
|
+
|
|
145
|
+
# --- dict → JSON ---
|
|
146
|
+
if isinstance(value, dict):
|
|
147
|
+
meta["python_type"] = "dict"
|
|
148
|
+
# Track ndarray values for restoration
|
|
149
|
+
ndarray_keys = {}
|
|
150
|
+
for k, v in value.items():
|
|
151
|
+
if isinstance(v, np.ndarray):
|
|
152
|
+
ndarray_keys[k] = {
|
|
153
|
+
"dtype": str(v.dtype),
|
|
154
|
+
"shape": list(v.shape),
|
|
155
|
+
}
|
|
156
|
+
if ndarray_keys:
|
|
157
|
+
meta["ndarray_keys"] = ndarray_keys
|
|
158
|
+
return "JSON", meta
|
|
159
|
+
|
|
160
|
+
# --- datetime ---
|
|
161
|
+
if isinstance(value, (datetime.datetime, pd.Timestamp)):
|
|
162
|
+
meta["python_type"] = "datetime"
|
|
163
|
+
return "TIMESTAMP", meta
|
|
164
|
+
if isinstance(value, datetime.date):
|
|
165
|
+
meta["python_type"] = "date"
|
|
166
|
+
return "DATE", meta
|
|
167
|
+
if isinstance(value, (datetime.timedelta, pd.Timedelta)):
|
|
168
|
+
meta["python_type"] = "INTERVAL"
|
|
169
|
+
return "INTERVAL", meta
|
|
170
|
+
|
|
171
|
+
# --- pandas categorical (shouldn't normally arrive here, but handle) ---
|
|
172
|
+
if isinstance(value, pd.Categorical):
|
|
173
|
+
meta["python_type"] = "categorical"
|
|
174
|
+
return "VARCHAR", meta
|
|
175
|
+
|
|
176
|
+
# --- fallback: JSON-serialize ---
|
|
177
|
+
meta["python_type"] = "json_fallback"
|
|
178
|
+
return "VARCHAR", meta
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _convert_for_json(value: Any) -> Any:
|
|
182
|
+
"""Recursively convert ndarrays/DataFrames to lists for JSON serialization."""
|
|
183
|
+
if isinstance(value, pd.DataFrame):
|
|
184
|
+
return _convert_for_json(value.to_dict("list"))
|
|
185
|
+
if isinstance(value, pd.Series):
|
|
186
|
+
return value.tolist()
|
|
187
|
+
if isinstance(value, np.ndarray):
|
|
188
|
+
return value.tolist()
|
|
189
|
+
if isinstance(value, dict):
|
|
190
|
+
return {k: _convert_for_json(v) for k, v in value.items()}
|
|
191
|
+
if isinstance(value, list):
|
|
192
|
+
return [_convert_for_json(v) for v in value]
|
|
193
|
+
return value
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _python_to_storage(value: Any, meta: dict) -> Any:
|
|
197
|
+
"""Convert a Python value to its DuckDB-storable form."""
|
|
198
|
+
ptype = meta.get("python_type", "")
|
|
199
|
+
|
|
200
|
+
if ptype == "ndarray":
|
|
201
|
+
arr = value
|
|
202
|
+
# Scalar in a column typed as ndarray (e.g. ragged vectors): wrap as 1-element list
|
|
203
|
+
if not isinstance(arr, np.ndarray):
|
|
204
|
+
return [arr]
|
|
205
|
+
if arr.ndim == 1:
|
|
206
|
+
return arr.tolist()
|
|
207
|
+
if arr.ndim == 2:
|
|
208
|
+
return [row.tolist() for row in arr]
|
|
209
|
+
|
|
210
|
+
if ptype == "ndarray_json":
|
|
211
|
+
return json.dumps(value.tolist())
|
|
212
|
+
|
|
213
|
+
if ptype == "dict":
|
|
214
|
+
return json.dumps(_convert_for_json(value))
|
|
215
|
+
|
|
216
|
+
if ptype == "json_fallback":
|
|
217
|
+
return json.dumps(_convert_for_json(value))
|
|
218
|
+
|
|
219
|
+
if ptype == "list":
|
|
220
|
+
# Convert ndarrays within list to nested lists
|
|
221
|
+
if meta.get("contains_ndarray"):
|
|
222
|
+
return [v.tolist() if isinstance(v, np.ndarray) else v for v in value]
|
|
223
|
+
return value # DuckDB handles native lists
|
|
224
|
+
|
|
225
|
+
return value
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def _storage_to_python(value: Any, meta: dict) -> Any:
|
|
229
|
+
"""Restore a stored DuckDB value back to its original Python type."""
|
|
230
|
+
ptype = meta.get("python_type", "")
|
|
231
|
+
|
|
232
|
+
if ptype == "ndarray":
|
|
233
|
+
dtype = np.dtype(meta.get("numpy_dtype", "float64"))
|
|
234
|
+
ndim = meta.get("ndim", 1)
|
|
235
|
+
if ndim >= 2:
|
|
236
|
+
# DuckDB returns ndarray of ndarrays; stack them
|
|
237
|
+
return np.stack([np.asarray(row) for row in value]).astype(dtype)
|
|
238
|
+
return np.asarray(value, dtype=dtype)
|
|
239
|
+
|
|
240
|
+
if ptype == "ndarray_json":
|
|
241
|
+
dtype = np.dtype(meta.get("numpy_dtype", "float64"))
|
|
242
|
+
return np.array(json.loads(value), dtype=dtype)
|
|
243
|
+
|
|
244
|
+
if ptype == "dict":
|
|
245
|
+
if isinstance(value, str):
|
|
246
|
+
result = json.loads(value)
|
|
247
|
+
else:
|
|
248
|
+
result = value # DuckDB JSON type may already return dict
|
|
249
|
+
# Restore ndarray values if metadata exists
|
|
250
|
+
ndarray_keys = meta.get("ndarray_keys", {})
|
|
251
|
+
for k, arr_meta in ndarray_keys.items():
|
|
252
|
+
if k in result:
|
|
253
|
+
dtype = np.dtype(arr_meta.get("dtype", "float64"))
|
|
254
|
+
result[k] = np.array(result[k], dtype=dtype)
|
|
255
|
+
return result
|
|
256
|
+
|
|
257
|
+
if ptype == "json_fallback":
|
|
258
|
+
return json.loads(value)
|
|
259
|
+
|
|
260
|
+
if ptype == "list":
|
|
261
|
+
# DuckDB may return ndarray; convert back to list
|
|
262
|
+
if meta.get("contains_ndarray"):
|
|
263
|
+
# Restore as list of ndarrays
|
|
264
|
+
dtype = np.dtype(meta.get("ndarray_dtype", "float64"))
|
|
265
|
+
if isinstance(value, np.ndarray):
|
|
266
|
+
return [np.asarray(v, dtype=dtype) for v in value]
|
|
267
|
+
return [np.asarray(v, dtype=dtype) for v in value]
|
|
268
|
+
if isinstance(value, np.ndarray):
|
|
269
|
+
if meta.get("nested"):
|
|
270
|
+
return [v.tolist() if isinstance(v, np.ndarray) else v for v in value]
|
|
271
|
+
return value.tolist()
|
|
272
|
+
return value
|
|
273
|
+
|
|
274
|
+
if ptype == "int":
|
|
275
|
+
return int(value) if value is not None else None
|
|
276
|
+
|
|
277
|
+
if ptype == "float":
|
|
278
|
+
return float(value) if value is not None else None
|
|
279
|
+
|
|
280
|
+
if ptype == "bool":
|
|
281
|
+
return bool(value) if value is not None else None
|
|
282
|
+
|
|
283
|
+
if ptype == "str":
|
|
284
|
+
return str(value) if value is not None else None
|
|
285
|
+
|
|
286
|
+
return value
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _storage_to_python_column(series: "pd.Series", meta: dict) -> "pd.Series":
|
|
290
|
+
"""Vectorized column-level dispatch of _storage_to_python.
|
|
291
|
+
|
|
292
|
+
Applied once per column in bulk loads instead of once per cell (N records ×
|
|
293
|
+
M columns calls vs N×M calls for the per-element path). Pass-through types
|
|
294
|
+
(float, int, bool, str) are returned unchanged — DuckDB already emits the
|
|
295
|
+
right pandas dtype for those. Complex types use pd.Series.apply, which is
|
|
296
|
+
still faster than an explicit Python for-loop.
|
|
297
|
+
"""
|
|
298
|
+
ptype = meta.get("python_type", "")
|
|
299
|
+
|
|
300
|
+
# Scalar types: DuckDB already returns the right pandas dtype — no-op.
|
|
301
|
+
if ptype in ("float", "int", "bool", "str", ""):
|
|
302
|
+
return series
|
|
303
|
+
|
|
304
|
+
# JSON blob types: decode string once per cell, but at column granularity.
|
|
305
|
+
if ptype in ("dict", "json_fallback"):
|
|
306
|
+
return series.apply(lambda v: json.loads(v) if isinstance(v, str) else v)
|
|
307
|
+
|
|
308
|
+
# All remaining types (ndarray, ndarray_json, list, …): delegate per-element.
|
|
309
|
+
return series.apply(lambda v: _storage_to_python(v, meta))
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def _flatten_dict(d, _prefix=()):
|
|
313
|
+
"""Flatten a nested dict into {dot.separated.key: leaf_value} pairs.
|
|
314
|
+
Returns (flat_dict, path_map) where path_map maps each dot-key
|
|
315
|
+
to its tuple-of-keys path for faithful reconstruction."""
|
|
316
|
+
flat = {}
|
|
317
|
+
paths = {}
|
|
318
|
+
for k, v in d.items():
|
|
319
|
+
current = _prefix + (k,)
|
|
320
|
+
if isinstance(v, dict):
|
|
321
|
+
sub_flat, sub_paths = _flatten_dict(v, current)
|
|
322
|
+
flat.update(sub_flat)
|
|
323
|
+
paths.update(sub_paths)
|
|
324
|
+
else:
|
|
325
|
+
dot_key = ".".join(current)
|
|
326
|
+
flat[dot_key] = v
|
|
327
|
+
paths[dot_key] = list(current)
|
|
328
|
+
return flat, paths
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
def _unflatten_dict(flat, path_map):
|
|
332
|
+
"""Reconstruct a nested dict from flat dot-keys using stored path_map."""
|
|
333
|
+
result = {}
|
|
334
|
+
for dot_key, value in flat.items():
|
|
335
|
+
path = path_map.get(dot_key, dot_key.split("."))
|
|
336
|
+
current = result
|
|
337
|
+
for key in path[:-1]:
|
|
338
|
+
current = current.setdefault(key, {})
|
|
339
|
+
current[path[-1]] = value
|
|
340
|
+
return result
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
# ---------------------------------------------------------------------------
|
|
344
|
+
# Column inference & storage-row helpers (module-level, used by SciDuck and
|
|
345
|
+
# DatabaseManager)
|
|
346
|
+
# ---------------------------------------------------------------------------
|
|
347
|
+
|
|
348
|
+
def _infer_data_columns(
|
|
349
|
+
sample_value: Any, data_col_name: Optional[str] = None
|
|
350
|
+
) -> Tuple[dict, dict]:
|
|
351
|
+
"""
|
|
352
|
+
From a sample data value, return:
|
|
353
|
+
- data_col_types: dict of {col_name: duckdb_type_str}
|
|
354
|
+
- dtype_meta: metadata dict for round-trip restoration
|
|
355
|
+
"""
|
|
356
|
+
# DataFrame mode: each DataFrame column → its own DuckDB column.
|
|
357
|
+
# One DuckDB row is stored per DataFrame row; the column type reflects
|
|
358
|
+
# the individual cell value type (independent of table height).
|
|
359
|
+
if isinstance(sample_value, pd.DataFrame):
|
|
360
|
+
col_types = {}
|
|
361
|
+
meta = {
|
|
362
|
+
"mode": "dataframe",
|
|
363
|
+
"columns": {},
|
|
364
|
+
"df_columns": list(sample_value.columns),
|
|
365
|
+
}
|
|
366
|
+
for col_name in sample_value.columns:
|
|
367
|
+
col_series = sample_value[col_name]
|
|
368
|
+
if len(sample_value) == 0:
|
|
369
|
+
ddb_type = "VARCHAR"
|
|
370
|
+
col_meta = {"python_type": "str"}
|
|
371
|
+
else:
|
|
372
|
+
cell_val = col_series.iloc[0]
|
|
373
|
+
if isinstance(cell_val, np.generic):
|
|
374
|
+
cell_val = cell_val.item()
|
|
375
|
+
# to_python.m sends array cells as Python lists (via .tolist()).
|
|
376
|
+
# Normalise to ndarray so _infer_duckdb_type handles them correctly.
|
|
377
|
+
if isinstance(cell_val, list) and len(cell_val) > 0:
|
|
378
|
+
cell_val = np.asarray(cell_val)
|
|
379
|
+
ddb_type, col_meta = _infer_duckdb_type(cell_val)
|
|
380
|
+
col_types[col_name] = ddb_type
|
|
381
|
+
meta["columns"][col_name] = col_meta
|
|
382
|
+
return col_types, meta
|
|
383
|
+
|
|
384
|
+
# Dict mode: each key → its own DuckDB column (nested dicts are flattened)
|
|
385
|
+
if isinstance(sample_value, dict):
|
|
386
|
+
has_nested = any(isinstance(v, dict) for v in sample_value.values())
|
|
387
|
+
if has_nested:
|
|
388
|
+
flat, path_map = _flatten_dict(sample_value)
|
|
389
|
+
else:
|
|
390
|
+
flat = sample_value
|
|
391
|
+
path_map = {k: [k] for k in sample_value}
|
|
392
|
+
col_types = {}
|
|
393
|
+
meta = {"mode": "multi_column", "columns": {}}
|
|
394
|
+
if has_nested:
|
|
395
|
+
meta["nested"] = True
|
|
396
|
+
meta["path_map"] = path_map
|
|
397
|
+
for col_name, val in flat.items():
|
|
398
|
+
# Unwrap length-1 arrays to scalars before type inference
|
|
399
|
+
if isinstance(val, np.ndarray) and val.size == 1:
|
|
400
|
+
val = val.item()
|
|
401
|
+
ddb_type, col_meta = _infer_duckdb_type(val)
|
|
402
|
+
col_types[col_name] = ddb_type
|
|
403
|
+
meta["columns"][col_name] = col_meta
|
|
404
|
+
return col_types, meta
|
|
405
|
+
|
|
406
|
+
# Single-column mode — use provided name or default to "value"
|
|
407
|
+
col_name = data_col_name or "value"
|
|
408
|
+
ddb_type, col_meta = _infer_duckdb_type(sample_value)
|
|
409
|
+
meta = {"mode": "single_column", "columns": {col_name: col_meta}}
|
|
410
|
+
return {col_name: ddb_type}, meta
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def _dataframe_to_storage_rows(df: pd.DataFrame, dtype_meta: dict) -> list:
|
|
414
|
+
"""Convert a DataFrame to a list of per-row storage values.
|
|
415
|
+
|
|
416
|
+
Returns a list of lists: one inner list per DataFrame row, each containing
|
|
417
|
+
one storage-ready value per column in the order defined by dtype_meta["columns"].
|
|
418
|
+
"""
|
|
419
|
+
col_metas = dtype_meta["columns"]
|
|
420
|
+
rows = []
|
|
421
|
+
for i in range(len(df)):
|
|
422
|
+
row = []
|
|
423
|
+
for col, col_meta in col_metas.items():
|
|
424
|
+
cell_val = df[col].iloc[i]
|
|
425
|
+
if isinstance(cell_val, np.generic):
|
|
426
|
+
cell_val = cell_val.item()
|
|
427
|
+
# to_python.m sends array cells as Python lists (via .tolist()).
|
|
428
|
+
# Normalise to ndarray so _python_to_storage handles them correctly.
|
|
429
|
+
if isinstance(cell_val, list) and len(cell_val) > 0:
|
|
430
|
+
cell_val = np.asarray(cell_val)
|
|
431
|
+
row.append(_python_to_storage(cell_val, col_meta))
|
|
432
|
+
rows.append(row)
|
|
433
|
+
return rows
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def _bulk_df_to_storage_rows(
|
|
437
|
+
df_list: list, record_ids: list, dtype_meta: dict
|
|
438
|
+
) -> list:
|
|
439
|
+
"""Bulk convert N DataFrames to (record_id, ...storage_values) rows.
|
|
440
|
+
|
|
441
|
+
Equivalent to calling _dataframe_to_storage_rows N times and assembling
|
|
442
|
+
(record_id, ...) tuples, but processes each column as a whole to avoid
|
|
443
|
+
O(N×C) per-cell pandas iloc overhead.
|
|
444
|
+
|
|
445
|
+
Falls back to the per-row path when DataFrame schemas differ.
|
|
446
|
+
"""
|
|
447
|
+
if not df_list:
|
|
448
|
+
return []
|
|
449
|
+
|
|
450
|
+
col_metas = dtype_meta["columns"]
|
|
451
|
+
first_cols = list(df_list[0].columns)
|
|
452
|
+
|
|
453
|
+
# Fall back to per-row path if schemas differ (shouldn't happen in normal use).
|
|
454
|
+
if not all(list(df.columns) == first_cols for df in df_list):
|
|
455
|
+
rows: list = []
|
|
456
|
+
for rid, df in zip(record_ids, df_list):
|
|
457
|
+
for storage_row in _dataframe_to_storage_rows(df, dtype_meta):
|
|
458
|
+
rows.append((rid,) + tuple(storage_row))
|
|
459
|
+
return rows
|
|
460
|
+
|
|
461
|
+
# Build flat record_id list: one entry per storage row (multi-row records
|
|
462
|
+
# contribute len(df) entries, typical 1-row records contribute 1).
|
|
463
|
+
expanded_rids = []
|
|
464
|
+
for rid, df in zip(record_ids, df_list):
|
|
465
|
+
expanded_rids.extend([rid] * len(df))
|
|
466
|
+
|
|
467
|
+
# Concat once so column operations don't cross DataFrame boundaries.
|
|
468
|
+
big_df = pd.concat(df_list, ignore_index=True)
|
|
469
|
+
|
|
470
|
+
# Build per-column storage arrays using column-level operations.
|
|
471
|
+
col_arrays: dict = {}
|
|
472
|
+
for col, col_meta in col_metas.items():
|
|
473
|
+
ptype = col_meta.get("python_type", "")
|
|
474
|
+
raw = big_df[col]
|
|
475
|
+
|
|
476
|
+
if ptype == "ndarray":
|
|
477
|
+
vals = raw.to_numpy()
|
|
478
|
+
col_arrays[col] = [
|
|
479
|
+
v.tolist() if isinstance(v, np.ndarray)
|
|
480
|
+
else (v if isinstance(v, list) else [v])
|
|
481
|
+
for v in vals
|
|
482
|
+
]
|
|
483
|
+
elif ptype in ("dict", "json_fallback"):
|
|
484
|
+
col_arrays[col] = [json.dumps(_convert_for_json(v)) for v in raw.to_numpy()]
|
|
485
|
+
elif ptype == "list":
|
|
486
|
+
if col_meta.get("contains_ndarray"):
|
|
487
|
+
col_arrays[col] = [
|
|
488
|
+
[e.tolist() if isinstance(e, np.ndarray) else e for e in v]
|
|
489
|
+
for v in raw.to_numpy()
|
|
490
|
+
]
|
|
491
|
+
else:
|
|
492
|
+
col_arrays[col] = raw.tolist()
|
|
493
|
+
else:
|
|
494
|
+
# Scalar types (float, int, str, bool …): tolist() converts numpy
|
|
495
|
+
# scalars to Python builtins, which is what DuckDB expects.
|
|
496
|
+
col_arrays[col] = raw.tolist()
|
|
497
|
+
|
|
498
|
+
cols_in_order = list(col_metas.keys())
|
|
499
|
+
n = len(big_df)
|
|
500
|
+
return [
|
|
501
|
+
(expanded_rids[i],) + tuple(col_arrays[col][i] for col in cols_in_order)
|
|
502
|
+
for i in range(n)
|
|
503
|
+
]
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def _value_to_storage_row(value: Any, dtype_meta: dict) -> list:
    """Convert a data value to a list of storage-ready column values.

    In "multi_column" mode ``value`` is a dict (flattened first when the
    metadata is marked nested) and one converted value is produced per
    column in dtype_meta["columns"]. Otherwise the whole value is converted
    using the metadata of the single stored column.

    Length-1 ndarrays are unwrapped to scalars for columns that are not
    array-typed, mirroring the unwrapping _infer_data_columns applies before
    inferring the column type; without it such a cell would be handed on as
    an array for a scalar column.

    For DataFrames use _dataframe_to_storage_rows() instead.

    Raises:
        KeyError: in multi_column mode, if a stored column is missing from
            ``value``.
    """
    mode = dtype_meta.get("mode", "single_column")
    col_metas = dtype_meta["columns"]

    if mode != "multi_column":
        # Single column — take the one entry (could be "value" or a named column)
        col_meta = next(iter(col_metas.values()))
        return [_python_to_storage(value, col_meta)]

    flat = _flatten_dict(value)[0] if dtype_meta.get("nested") else value
    row = []
    for col, col_meta in col_metas.items():
        cell = flat[col]
        if (
            isinstance(cell, np.ndarray)
            and cell.size == 1
            and col_meta.get("python_type") not in ("ndarray", "ndarray_json")
        ):
            cell = cell.item()
        row.append(_python_to_storage(cell, col_meta))
    return row
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
# ---------------------------------------------------------------------------
|
|
531
|
+
# Main class
|
|
532
|
+
# ---------------------------------------------------------------------------
|
|
533
|
+
|
|
534
|
+
class SciDuck:
|
|
535
|
+
"""
|
|
536
|
+
A thin DuckDB layer for managing versioned, schema-aware scientific data.
|
|
537
|
+
|
|
538
|
+
Parameters
|
|
539
|
+
----------
|
|
540
|
+
db_path : str or Path
|
|
541
|
+
Path to the DuckDB database file. Use ":memory:" for in-memory.
|
|
542
|
+
dataset_schema : list of str
|
|
543
|
+
Ordered hierarchy, e.g. ["subject", "session", "trial"].
|
|
544
|
+
"""
|
|
545
|
+
|
|
546
|
+
def __init__(self, db_path: Union[str, Path], dataset_schema: List[str]):
    """Open the database and make sure the metadata tables exist.

    Args:
        db_path: Path to the DuckDB database file (stored as ``str``).
        dataset_schema: Ordered hierarchy of schema levels; copied into a
            new list.

    Raises:
        ValueError: propagated from ``_init_metadata_tables`` when the
            database already holds schema rows for a different hierarchy.
            The connection is closed before the error is re-raised so a
            failed construction does not keep the database open.
    """
    self.db_path = str(db_path)
    self.dataset_schema = list(dataset_schema)
    self._lock = threading.Lock()
    logger.info("DuckDB lock ACQUIRED: %s", self.db_path)
    self.con = duckdb.connect(self.db_path)
    try:
        self._init_metadata_tables()
    except Exception:
        # Nobody can close a half-constructed instance, so release the
        # connection here before propagating the failure.
        self.con.close()
        raise
|
|
553
|
+
|
|
554
|
+
# ------------------------------------------------------------------
|
|
555
|
+
# Thin internal interface (future backend swap point)
|
|
556
|
+
# ------------------------------------------------------------------
|
|
557
|
+
|
|
558
|
+
def _execute(self, sql: str, params=None):
|
|
559
|
+
# NOTE: DuckDB's Python connection returns itself from execute(), so
|
|
560
|
+
# execute() and fetchXxx() share the same connection state. All callers
|
|
561
|
+
# that fetch results must hold _lock for the entire execute+fetch sequence.
|
|
562
|
+
# Use _fetchall / _fetchdf for queries that return rows; call _execute
|
|
563
|
+
# directly (under _lock) only for DDL/DML that needs no fetch.
|
|
564
|
+
with self._lock:
|
|
565
|
+
if params:
|
|
566
|
+
return self.con.execute(sql, params)
|
|
567
|
+
return self.con.execute(sql)
|
|
568
|
+
|
|
569
|
+
def _executemany(self, sql: str, params_list):
|
|
570
|
+
with self._lock:
|
|
571
|
+
return self.con.executemany(sql, params_list)
|
|
572
|
+
|
|
573
|
+
def _begin(self):
|
|
574
|
+
with self._lock:
|
|
575
|
+
self.con.execute("BEGIN TRANSACTION")
|
|
576
|
+
|
|
577
|
+
def _commit(self):
|
|
578
|
+
with self._lock:
|
|
579
|
+
self.con.execute("COMMIT")
|
|
580
|
+
|
|
581
|
+
def _rollback(self):
|
|
582
|
+
with self._lock:
|
|
583
|
+
self.con.execute("ROLLBACK")
|
|
584
|
+
|
|
585
|
+
def _fetchall(self, sql: str, params=None) -> list:
|
|
586
|
+
with self._lock:
|
|
587
|
+
if params:
|
|
588
|
+
return self.con.execute(sql, params).fetchall()
|
|
589
|
+
return self.con.execute(sql).fetchall()
|
|
590
|
+
|
|
591
|
+
def fetchall(self, sql: str, params=None) -> list:
|
|
592
|
+
"""Public alias for _fetchall — accessible from MATLAB (underscore methods are not)."""
|
|
593
|
+
return self._fetchall(sql, params)
|
|
594
|
+
|
|
595
|
+
def _fetchdf(self, sql: str, params=None) -> pd.DataFrame:
|
|
596
|
+
with self._lock:
|
|
597
|
+
if params:
|
|
598
|
+
return self.con.execute(sql, params).fetchdf()
|
|
599
|
+
return self.con.execute(sql).fetchdf()
|
|
600
|
+
|
|
601
|
+
def _table_exists(self, name: str) -> bool:
|
|
602
|
+
rows = self._fetchall(
|
|
603
|
+
"SELECT COUNT(*) FROM information_schema.tables "
|
|
604
|
+
"WHERE table_name = ?", [name]
|
|
605
|
+
)
|
|
606
|
+
return rows[0][0] > 0
|
|
607
|
+
|
|
608
|
+
# ------------------------------------------------------------------
|
|
609
|
+
# Metadata table creation
|
|
610
|
+
# ------------------------------------------------------------------
|
|
611
|
+
|
|
612
|
+
def _init_metadata_tables(self):
    """Create the metadata tables if needed and validate the stored schema.

    Creates ``_schema`` (one VARCHAR column per entry of
    ``self.dataset_schema``), the ``_schema_id_seq`` sequence,
    ``_variables`` and ``_variable_groups``. If ``_schema`` already holds
    rows, its key columns must equal ``self.dataset_schema`` in order.

    Raises:
        ValueError: if ``_schema`` has rows and its key columns differ from
            ``self.dataset_schema``.
    """
    # --- _schema ---
    schema_cols = ", ".join(f'"{s}" VARCHAR' for s in self.dataset_schema)
    self._execute(f"""
        CREATE TABLE IF NOT EXISTS _schema (
            schema_id INTEGER PRIMARY KEY,
            schema_level VARCHAR NOT NULL,
            {schema_cols}
        )
    """)
    # Create a sequence for schema_id if it doesn't exist.
    # IF NOT EXISTS already covers the "already exists" case, so an exception
    # here is unexpected. Stay best-effort (do not abort start-up) but record
    # the failure instead of discarding it silently.
    try:
        self._execute("CREATE SEQUENCE IF NOT EXISTS _schema_id_seq START 1")
    except Exception:
        logger.warning(
            "Could not create sequence _schema_id_seq", exc_info=True
        )

    # --- _variables ---
    self._execute("""
        CREATE TABLE IF NOT EXISTS _variables (
            variable_name VARCHAR PRIMARY KEY,
            schema_level VARCHAR NOT NULL,
            dtype VARCHAR,
            created_at TIMESTAMP DEFAULT current_timestamp,
            description VARCHAR DEFAULT ''
        )
    """)

    # --- _variable_groups ---
    self._execute("""
        CREATE TABLE IF NOT EXISTS _variable_groups (
            group_name VARCHAR NOT NULL,
            variable_name VARCHAR NOT NULL,
            PRIMARY KEY (group_name, variable_name)
        )
    """)

    # Validate schema consistency if _schema already has data
    if self._fetchall("SELECT COUNT(*) FROM _schema")[0][0] > 0:
        existing_cols = [
            row[0] for row in self._fetchall(
                "SELECT column_name FROM information_schema.columns "
                "WHERE table_name = '_schema' "
                "AND column_name NOT IN ('schema_id', 'schema_level') "
                "ORDER BY ordinal_position"
            )
        ]
        if existing_cols != self.dataset_schema:
            raise ValueError(
                f"Database schema mismatch. "
                f"Existing: {existing_cols}, Provided: {self.dataset_schema}"
            )
|
|
663
|
+
|
|
664
|
+
# ------------------------------------------------------------------
|
|
665
|
+
# Schema entry management
|
|
666
|
+
# ------------------------------------------------------------------
|
|
667
|
+
|
|
668
|
+
def _schema_key_columns(self, schema_level: str) -> List[str]:
|
|
669
|
+
"""Return schema columns from the top down to (and including) schema_level."""
|
|
670
|
+
idx = self.dataset_schema.index(schema_level)
|
|
671
|
+
return self.dataset_schema[: idx + 1]
|
|
672
|
+
|
|
673
|
+
def _get_or_create_schema_id(self, schema_level: str, key_values: dict) -> int:
|
|
674
|
+
"""Look up or insert a row in _schema. Return the schema_id."""
|
|
675
|
+
key_cols = [k for k in self.dataset_schema if k in key_values]
|
|
676
|
+
|
|
677
|
+
# Build WHERE clause
|
|
678
|
+
conditions = []
|
|
679
|
+
params = [schema_level]
|
|
680
|
+
for col in key_cols:
|
|
681
|
+
conditions.append(f'"{col}" = ?')
|
|
682
|
+
params.append(_schema_str(key_values[col]))
|
|
683
|
+
# Columns above the level that should be NULL are implicit —
|
|
684
|
+
# but to be safe, also require NULLs for levels below.
|
|
685
|
+
for col in self.dataset_schema:
|
|
686
|
+
if col not in key_cols:
|
|
687
|
+
conditions.append(f'"{col}" IS NULL')
|
|
688
|
+
|
|
689
|
+
where = " AND ".join(conditions)
|
|
690
|
+
rows = self._fetchall(
|
|
691
|
+
f'SELECT schema_id FROM _schema WHERE schema_level = ? AND {where}',
|
|
692
|
+
params,
|
|
693
|
+
)
|
|
694
|
+
if rows:
|
|
695
|
+
return rows[0][0]
|
|
696
|
+
|
|
697
|
+
# Insert new entry — use MAX+1 for consistency with batch path
|
|
698
|
+
new_id = self._fetchall(
|
|
699
|
+
"SELECT COALESCE(MAX(schema_id), 0) + 1 FROM _schema"
|
|
700
|
+
)[0][0]
|
|
701
|
+
col_names = ["schema_id", "schema_level"] + key_cols
|
|
702
|
+
placeholders = ", ".join(["?"] * len(col_names))
|
|
703
|
+
col_str = ", ".join(f'"{c}"' for c in col_names)
|
|
704
|
+
values = [new_id, schema_level] + [_schema_str(key_values[c]) for c in key_cols]
|
|
705
|
+
self._execute(
|
|
706
|
+
f"INSERT INTO _schema ({col_str}) VALUES ({placeholders})", values
|
|
707
|
+
)
|
|
708
|
+
return new_id
|
|
709
|
+
|
|
710
|
+
def batch_get_or_create_schema_ids(
    self,
    combos: dict,  # {(schema_level, key_tuple): key_values_dict}
) -> dict:
    """
    Batch-resolve schema IDs for multiple (schema_level, key_values) combos.

    Instead of N individual SELECT+INSERT round-trips, this does, for each
    group of combos that share a schema level and a set of key names:
        1. One SELECT to fetch the existing schema entries of that group
        2. One MAX(schema_id) query to allocate a block of new IDs
        3. One batch INSERT for the missing entries

    Combos whose key values stringify (via ``_schema_str``) to the same tuple
    describe the same schema entry; they share a single ID and the entry is
    inserted only once.

    Args:
        combos: dict mapping (schema_level, key_tuple) -> key_values dict

    Returns:
        dict mapping (schema_level, key_tuple) -> schema_id
    """
    if not combos:
        return {}

    result = {}

    # Group combos by (schema_level, key-name set): every combo in a group
    # uses the same key columns and the same NULL columns, so a single
    # SELECT serves the whole group.
    by_level_and_keys = {}
    for (schema_level, key_tuple), key_values in combos.items():
        group_key = (schema_level, frozenset(key_values.keys()))
        by_level_and_keys.setdefault(group_key, []).append(
            ((schema_level, key_tuple), key_values)
        )

    for (schema_level, key_set), entries in by_level_and_keys.items():
        key_cols = [k for k in self.dataset_schema if k in key_set]
        null_cols = [c for c in self.dataset_schema if c not in key_cols]

        # Fetch every row of this level whose unused columns are NULL and
        # match in Python. Building both clauses from lists keeps the SQL
        # valid when there are no key columns or no NULL columns.
        select_list = ", ".join(["schema_id"] + [f'"{c}"' for c in key_cols])
        where_clause = " AND ".join(
            ["schema_level = ?"] + [f'"{col}" IS NULL' for col in null_cols]
        )
        rows = self._fetchall(
            f"SELECT {select_list} FROM _schema WHERE {where_clause}",
            [schema_level],
        )

        # Lookup: tuple of stringified key-column values -> schema_id.
        # NULLs are represented as "" so they can live in a tuple key.
        existing_lookup = {
            tuple(_schema_str(v) if v is not None else "" for v in row[1:]): row[0]
            for row in rows
        }

        # Split the group into already-known entries and missing ones.
        missing = []  # [(combo_key, match_key), ...]
        for combo_key, key_values in entries:
            match_key = tuple(_schema_str(key_values[c]) for c in key_cols)
            if match_key in existing_lookup:
                result[combo_key] = existing_lookup[match_key]
            else:
                missing.append((combo_key, match_key))

        if not missing:
            continue

        # Allocate a block of IDs from the current max instead of issuing
        # one ID query per entry.
        max_row = self._fetchall(
            "SELECT COALESCE(MAX(schema_id), 0) FROM _schema"
        )
        first_id = max_row[0][0] + 1

        col_names = ["schema_id", "schema_level"] + key_cols
        col_str = ", ".join(f'"{c}"' for c in col_names)

        allocated = {}  # match_key -> new schema_id (dedupes within the batch)
        insert_rows = []
        for combo_key, match_key in missing:
            if match_key not in allocated:
                allocated[match_key] = first_id + len(insert_rows)
                insert_rows.append(
                    [allocated[match_key], schema_level, *match_key]
                )
            result[combo_key] = allocated[match_key]

        # DataFrame-based insert for speed. The SQL refers to the local
        # variable ``insert_df`` by name, so that name must not change.
        insert_df = pd.DataFrame(insert_rows, columns=col_names)
        self.con.execute(
            f"INSERT INTO _schema ({col_str}) SELECT * FROM insert_df"
        )

    return result
|
|
803
|
+
|
|
804
|
+
# ------------------------------------------------------------------
|
|
805
|
+
# Save
|
|
806
|
+
# ------------------------------------------------------------------
|
|
807
|
+
|
|
808
|
+
def save(
    self,
    name: str,
    data: Any,
    schema_level: Optional[str] = None,
    description: str = "",
    force: bool = False,
    **schema_keys,
):
    """
    Save a variable to the database.

    Parameters
    ----------
    name : str
        Variable name (becomes the table name).
    data : Any
        The data to save. Can be:
        - pd.DataFrame with schema-level columns (Mode A)
        - Any Python/numpy object + schema_keys kwargs (Mode B, single entry)
        - dict mapping tuples → values (Mode C, batch)
    schema_level : str, optional
        Which schema level to store at. Defaults to the lowest schema
        column named in ``schema_keys`` (Mode B), otherwise to the lowest
        level of the dataset schema.
    description : str
        Optional description for this variable.
    force : bool
        Deprecated, kept for backward compatibility. Not used.
    **schema_keys
        Keyword arguments specifying the schema entry for Mode B.
        e.g. subject="S01", session=1, trial=3.
        Keys that are not schema columns are ignored.
        Note: all schema key values are coerced to strings before storage.

    Raises
    ------
    ValueError
        If ``schema_level`` is not a schema column, if the save mode cannot
        be determined from ``data``, if a Mode C key tuple has the wrong
        length, or if there is nothing to save (e.g. an empty DataFrame).
    """
    # --- Resolve and validate the schema level ---
    provided_schema_cols = [k for k in self.dataset_schema if k in schema_keys]
    if schema_level is None:
        schema_level = (
            provided_schema_cols[-1]
            if provided_schema_cols
            else self.dataset_schema[-1]
        )
    if schema_level not in self.dataset_schema:
        raise ValueError(
            f"schema_level '{schema_level}' not in {self.dataset_schema}"
        )

    # --- Determine save mode ---
    data_col_name = None  # Override for single-column name preservation

    if schema_keys:
        # Mode B: single entry via kwargs
        entries = [({k: schema_keys[k] for k in provided_schema_cols}, data)]
    else:
        key_cols = self._schema_key_columns(schema_level)

        if isinstance(data, pd.DataFrame) and all(
            c in data.columns for c in key_cols
        ):
            # Mode A: DataFrame with schema columns
            entries, data_col_name = self._entries_from_dataframe(
                data, key_cols, schema_level
            )
        elif isinstance(data, dict) and data and isinstance(next(iter(data)), tuple):
            # Mode C: dict with tuple keys (only the first key is inspected)
            entries = []
            for key_tuple, value in data.items():
                if len(key_tuple) != len(key_cols):
                    raise ValueError(
                        f"Key tuple length {len(key_tuple)} != "
                        f"expected {len(key_cols)} for level '{schema_level}'"
                    )
                entries.append((dict(zip(key_cols, key_tuple)), value))
        else:
            raise ValueError(
                "Cannot determine save mode. Provide either:\n"
                "  (A) a DataFrame with schema-level columns,\n"
                "  (B) schema key kwargs (e.g. subject='S01', session=1), or\n"
                "  (C) a dict mapping tuples to values."
            )

    # Column types are inferred from the first entry, so there must be one.
    if not entries:
        raise ValueError(
            f"Nothing to save for variable '{name}': data contains no entries."
        )

    # --- Determine column types from the first entry's data ---
    sample_value = entries[0][1]
    data_col_types, dtype_meta = self._infer_data_columns(sample_value, data_col_name)

    # --- Ensure the variable table exists ---
    is_dataframe = dtype_meta.get("mode") == "dataframe"
    self._ensure_variable_table(name, data_col_types, schema_level,
                                is_dataframe=is_dataframe)

    # --- Insert rows (INSERT OR REPLACE for "latest wins" semantics) ---
    col_names = ["schema_id"] + list(data_col_types.keys())
    col_str = ", ".join(f'"{c}"' for c in col_names)
    placeholders = ", ".join(["?"] * len(col_names))

    for key_dict, value in entries:
        schema_id = self._get_or_create_schema_id(schema_level, key_dict)
        if isinstance(value, pd.DataFrame):
            # Delete old rows for this schema_id, then insert one per DataFrame row.
            self._execute(f'DELETE FROM "{name}" WHERE schema_id = ?', [schema_id])
            for storage_row in _dataframe_to_storage_rows(value, dtype_meta):
                self._execute(
                    f'INSERT INTO "{name}" ({col_str}) VALUES ({placeholders})',
                    [schema_id] + storage_row,
                )
        else:
            storage_values = self._value_to_storage_row(value, dtype_meta)
            self._execute(
                f'INSERT OR REPLACE INTO "{name}" ({col_str}) VALUES ({placeholders})',
                [schema_id] + storage_values,
            )

    # --- Register in _variables (one row per variable) ---
    # On a repeat save only the dtype metadata is refreshed.
    self._execute(
        "INSERT INTO _variables (variable_name, schema_level, dtype, description) "
        "VALUES (?, ?, ?, ?) "
        "ON CONFLICT (variable_name) DO UPDATE SET dtype = excluded.dtype",
        [name, schema_level, json.dumps(dtype_meta), description],
    )
|
|
929
|
+
|
|
930
|
+
def _entries_from_dataframe(
|
|
931
|
+
self, df: pd.DataFrame, key_cols: List[str], schema_level: str
|
|
932
|
+
) -> Tuple[List[Tuple[dict, Any]], Optional[str]]:
|
|
933
|
+
"""
|
|
934
|
+
Convert a DataFrame (Mode A) into a list of (key_dict, row_data) entries.
|
|
935
|
+
|
|
936
|
+
Each row in the DataFrame becomes one entry. The non-schema columns
|
|
937
|
+
become the stored data (as a dict / single value).
|
|
938
|
+
|
|
939
|
+
Returns (entries, single_col_name) where single_col_name is set if
|
|
940
|
+
there's exactly one data column (so we can preserve its name).
|
|
941
|
+
"""
|
|
942
|
+
data_cols = [c for c in df.columns if c not in key_cols]
|
|
943
|
+
entries = []
|
|
944
|
+
single_col_name = data_cols[0] if len(data_cols) == 1 else None
|
|
945
|
+
for _, row in df.iterrows():
|
|
946
|
+
key_dict = {k: row[k] for k in key_cols}
|
|
947
|
+
if len(data_cols) == 1:
|
|
948
|
+
value = row[data_cols[0]]
|
|
949
|
+
# Convert numpy types to Python types for cleaner handling
|
|
950
|
+
if isinstance(value, (np.integer,)):
|
|
951
|
+
value = int(value)
|
|
952
|
+
elif isinstance(value, (np.floating,)):
|
|
953
|
+
value = float(value)
|
|
954
|
+
elif isinstance(value, (np.bool_,)):
|
|
955
|
+
value = bool(value)
|
|
956
|
+
else:
|
|
957
|
+
value = {c: row[c] for c in data_cols}
|
|
958
|
+
entries.append((key_dict, value))
|
|
959
|
+
return entries, single_col_name
|
|
960
|
+
|
|
961
|
+
def _infer_data_columns(
    self, sample_value: Any, data_col_name: Optional[str] = None
) -> Tuple[dict, dict]:
    """Forward to the module-level ``_infer_data_columns`` helper.

    Both arguments are passed through unchanged and the helper's result is
    returned as is. ``save`` unpacks that result as
    ``(data_col_types, dtype_meta)``.
    """
    inferred = _infer_data_columns(sample_value, data_col_name)
    return inferred
|
|
966
|
+
|
|
967
|
+
def _value_to_storage_row(self, value: Any, dtype_meta: dict) -> list:
    """Forward to the module-level ``_value_to_storage_row`` helper.

    Both arguments are passed through unchanged. ``save`` appends the
    returned list to ``[schema_id]`` to form the row it inserts.
    """
    storage_row = _value_to_storage_row(value, dtype_meta)
    return storage_row
|
|
970
|
+
|
|
971
|
+
def _ensure_variable_table(self, name: str, data_col_types: dict, schema_level: str,
|
|
972
|
+
is_dataframe: bool = False):
|
|
973
|
+
"""Create the variable table if it doesn't exist."""
|
|
974
|
+
if self._table_exists(name):
|
|
975
|
+
return
|
|
976
|
+
data_cols_sql = ", ".join(
|
|
977
|
+
f'"{col}" {dtype}' for col, dtype in data_col_types.items()
|
|
978
|
+
)
|
|
979
|
+
# DataFrames store one DuckDB row per table row: no unique constraint
|
|
980
|
+
# on schema_id. Other types use schema_id as a primary key so that
|
|
981
|
+
# INSERT OR REPLACE gives "latest wins" semantics.
|
|
982
|
+
if is_dataframe:
|
|
983
|
+
schema_id_col = "schema_id INTEGER NOT NULL"
|
|
984
|
+
else:
|
|
985
|
+
schema_id_col = "schema_id INTEGER PRIMARY KEY"
|
|
986
|
+
self._execute(f"""
|
|
987
|
+
CREATE TABLE "{name}" (
|
|
988
|
+
{schema_id_col},
|
|
989
|
+
{data_cols_sql}
|
|
990
|
+
)
|
|
991
|
+
""")
|
|
992
|
+
|
|
993
|
+
# ------------------------------------------------------------------
|
|
994
|
+
# Load
|
|
995
|
+
# ------------------------------------------------------------------
|
|
996
|
+
|
|
997
|
+
def load(
    self,
    name: str,
    raw: bool = True,
    **schema_keys,
) -> Union[pd.DataFrame, Any]:
    """
    Load a variable from the database.

    Parameters
    ----------
    name : str
        Variable name.
    raw : bool
        If True and the result is a single row, return the reconstructed
        Python object instead of a DataFrame.
    **schema_keys
        Optional filters, e.g. subject="S01" to load a subset. Keys that
        are not schema columns are ignored; values are compared after
        being passed through ``_schema_str``.

    Returns
    -------
    pd.DataFrame or Python object (if raw=True and single row). Variables
    saved in "dataframe" mode always come back as a DataFrame holding only
    the data columns.

    Raises
    ------
    KeyError
        If the variable has no table or no row in ``_variables``.
    """
    if not self._table_exists(name):
        raise KeyError(f"Variable '{name}' not found in database.")

    # Stored metadata describes how to rebuild the original Python types.
    meta_rows = self._fetchall(
        "SELECT schema_level, dtype FROM _variables WHERE variable_name = ?",
        [name],
    )
    if not meta_rows:
        raise KeyError(f"Variable '{name}' not found.")
    _stored_level, dtype_json = meta_rows[0]
    dtype_meta = json.loads(dtype_json)

    # Select every schema column so non-contiguous keys appear in results.
    schema_cols = self.dataset_schema
    schema_select = ", ".join(f's."{c}"' for c in schema_cols)
    data_select = ", ".join(f'v."{c}"' for c in dtype_meta["columns"].keys())
    sql = (
        f'SELECT {schema_select}, {data_select} '
        f'FROM "{name}" v '
        f'JOIN _schema s ON v.schema_id = s.schema_id'
    )

    # Only keyword filters that name a schema column become conditions.
    filters = [(col, val) for col, val in schema_keys.items() if col in schema_cols]
    params: list = [_schema_str(val) for _, val in filters]
    if filters:
        sql += ' WHERE ' + ' AND '.join(f's."{col}" = ?' for col, _ in filters)

    df = self._fetchdf(sql, params or None)

    mode = dtype_meta.get("mode", "single_column")
    columns_meta = dtype_meta.get("columns", {})

    if mode == "dataframe":
        # One DuckDB row per DataFrame row: convert each cell and drop the
        # schema columns, keeping only the data columns.
        restored = {
            c: [_storage_to_python(df[c].iloc[i], meta) for i in range(len(df))]
            for c, meta in columns_meta.items()
            if c in df.columns
        }
        df_columns = dtype_meta.get("df_columns", list(columns_meta.keys()))
        return pd.DataFrame(restored, columns=df_columns)

    # Non-DataFrame: restore types, then unwrap only a single-row result.
    df = self._restore_types(df, dtype_meta)
    if not raw or len(df) != 1:
        return df

    # NOTE(review): the cells below were already converted by _restore_types,
    # so _storage_to_python is applied a second time here — confirm that the
    # helper is idempotent.
    if mode == "single_column":
        col_name = next(iter(columns_meta))
        return _storage_to_python(df[col_name].iloc[0], columns_meta[col_name])
    if mode == "multi_column":
        values = {
            c: _storage_to_python(df[c].iloc[0], meta)
            for c, meta in columns_meta.items()
        }
        if dtype_meta.get("nested"):
            return _unflatten_dict(values, dtype_meta["path_map"])
        return values

    return df
|
|
1090
|
+
|
|
1091
|
+
def _restore_types(self, df: pd.DataFrame, dtype_meta: dict) -> pd.DataFrame:
|
|
1092
|
+
"""Apply type restoration to data columns of a loaded DataFrame."""
|
|
1093
|
+
columns_meta = dtype_meta.get("columns", {})
|
|
1094
|
+
for col_name, col_meta in columns_meta.items():
|
|
1095
|
+
if col_name in df.columns:
|
|
1096
|
+
restored = [
|
|
1097
|
+
_storage_to_python(df[col_name].iloc[i], col_meta)
|
|
1098
|
+
for i in range(len(df))
|
|
1099
|
+
]
|
|
1100
|
+
df[col_name] = restored
|
|
1101
|
+
return df
|
|
1102
|
+
|
|
1103
|
+
# ------------------------------------------------------------------
|
|
1104
|
+
# List / inspect
|
|
1105
|
+
# ------------------------------------------------------------------
|
|
1106
|
+
|
|
1107
|
+
def list_variables(self) -> pd.DataFrame:
    """
    Return one row per registered variable, ordered by variable name.

    Columns: variable_name, schema_level, created_at, description.
    """
    sql = """
        SELECT variable_name, schema_level, created_at, description
        FROM _variables
        ORDER BY variable_name
    """
    return self._fetchdf(sql)
|
|
1116
|
+
|
|
1117
|
+
def list_versions(self, name: str) -> pd.DataFrame:
    """
    Return the metadata row of a variable plus its number of schema entries.

    Columns: variable_name, schema_level, created_at, description and
    ``num_entries``, the number of distinct ``schema_id`` values stored in
    the variable's table. Counting distinct IDs matters for DataFrame-mode
    variables, which store several rows per schema entry.

    Returns an empty DataFrame when the variable has no table.
    """
    if not self._table_exists(name):
        return pd.DataFrame()
    # The LEFT JOIN ... ON 1=1 pairs the single metadata row with every data
    # row, so an empty data table still yields one row with num_entries = 0.
    return self._fetchdf(
        "SELECT v.variable_name, v.schema_level, v.created_at, v.description, "
        "COUNT(DISTINCT d.schema_id) AS num_entries "
        f'FROM _variables v LEFT JOIN "{name}" d ON 1=1 '
        "WHERE v.variable_name = ? "
        "GROUP BY v.variable_name, v.schema_level, v.created_at, v.description",
        [name],
    )
|
|
1131
|
+
|
|
1132
|
+
# ------------------------------------------------------------------
|
|
1133
|
+
# Delete
|
|
1134
|
+
# ------------------------------------------------------------------
|
|
1135
|
+
|
|
1136
|
+
def delete(self, name: str):
    """
    Delete a variable, dropping its data table and all metadata records.

    The rows in ``_variables`` and ``_variable_groups`` are removed even if
    the data table is already gone, so leftover metadata is cleaned up too.

    Raises
    ------
    ValueError
        If ``name`` is one of the internal bookkeeping tables, which must
        never be dropped through this method.
    """
    internal_tables = ("_schema", "_variables", "_variable_groups")
    if name in internal_tables:
        raise ValueError(
            f"'{name}' is an internal table and cannot be deleted as a variable."
        )

    if self._table_exists(name):
        self._execute(f'DROP TABLE "{name}"')
    self._execute(
        "DELETE FROM _variables WHERE variable_name = ?", [name]
    )
    self._execute(
        "DELETE FROM _variable_groups WHERE variable_name = ?", [name]
    )
|
|
1148
|
+
|
|
1149
|
+
# ------------------------------------------------------------------
|
|
1150
|
+
# Groups
|
|
1151
|
+
# ------------------------------------------------------------------
|
|
1152
|
+
|
|
1153
|
+
def add_to_group(self, group_name: str, variable_names: Union[str, List[str]]):
    """Add one variable (a bare string) or several (a list) to ``group_name``.

    Pairs that are already present are left untouched (ON CONFLICT DO NOTHING).
    """
    names = [variable_names] if isinstance(variable_names, str) else variable_names
    insert_sql = (
        "INSERT INTO _variable_groups (group_name, variable_name) "
        "VALUES (?, ?) ON CONFLICT DO NOTHING"
    )
    for variable in names:
        self._execute(insert_sql, [group_name, variable])
|
|
1163
|
+
|
|
1164
|
+
def remove_from_group(self, group_name: str, variable_names: Union[str, List[str]]):
    """Remove one variable (a bare string) or several (a list) from ``group_name``."""
    names = [variable_names] if isinstance(variable_names, str) else variable_names
    delete_sql = (
        "DELETE FROM _variable_groups "
        "WHERE group_name = ? AND variable_name = ?"
    )
    for variable in names:
        self._execute(delete_sql, [group_name, variable])
|
|
1174
|
+
|
|
1175
|
+
def list_groups(self) -> List[str]:
    """Return the distinct group names, sorted alphabetically by the query."""
    sql = "SELECT DISTINCT group_name FROM _variable_groups ORDER BY group_name"
    return [row[0] for row in self._fetchall(sql)]
|
|
1181
|
+
|
|
1182
|
+
def get_group(self, group_name: str) -> List[str]:
    """Return the names of the variables in ``group_name``, sorted by name."""
    sql = (
        "SELECT variable_name FROM _variable_groups "
        "WHERE group_name = ? ORDER BY variable_name"
    )
    members = self._fetchall(sql, [group_name])
    return [row[0] for row in members]
|
|
1190
|
+
|
|
1191
|
+
# ------------------------------------------------------------------
|
|
1192
|
+
# Schema introspection
|
|
1193
|
+
# ------------------------------------------------------------------
|
|
1194
|
+
|
|
1195
|
+
def distinct_schema_values(self, key: str) -> List:
    """Return the distinct non-NULL values of schema column ``key``, sorted.

    Raises ``ValueError`` when ``key`` is not one of ``self.dataset_schema``.
    """
    if key not in self.dataset_schema:
        raise ValueError(
            f"'{key}' is not a schema column. "
            f"Available: {self.dataset_schema}"
        )
    quoted = f'"{key}"'
    sql = (
        f"SELECT DISTINCT {quoted} FROM _schema "
        f"WHERE {quoted} IS NOT NULL "
        f"ORDER BY {quoted}"
    )
    return [row[0] for row in self._fetchall(sql)]
|
|
1208
|
+
|
|
1209
|
+
def distinct_schema_combinations(self, keys: list[str]) -> list[tuple]:
    """Return all distinct non-null combinations for multiple schema columns.

    Args:
        keys: List of schema column names to query.

    Returns:
        List of tuples, each tuple being one existing combination of values
        (as strings, since _schema stores VARCHAR columns). Sorted by the
        column order given. An empty ``keys`` list yields an empty list.

    Raises:
        ValueError: If any entry of ``keys`` is not a schema column.
    """
    for k in keys:
        if k not in self.dataset_schema:
            raise ValueError(
                f"'{k}' is not a schema column. "
                f"Available: {self.dataset_schema}"
            )
    # With no columns the query below would be syntactically invalid
    # ("SELECT DISTINCT  FROM ..."), so there is nothing to ask the database.
    if not keys:
        return []

    # The same quoted column list serves both SELECT and ORDER BY.
    col_list = ", ".join(f'"{k}"' for k in keys)
    where_clause = " AND ".join(f'"{k}" IS NOT NULL' for k in keys)
    rows = self._fetchall(
        f"SELECT DISTINCT {col_list} FROM _schema "
        f"WHERE {where_clause} "
        f"ORDER BY {col_list}"
    )
    return [tuple(r) for r in rows]
|
|
1235
|
+
|
|
1236
|
+
# ------------------------------------------------------------------
|
|
1237
|
+
# Direct query access
|
|
1238
|
+
# ------------------------------------------------------------------
|
|
1239
|
+
|
|
1240
|
+
def query(self, sql: str, params=None) -> pd.DataFrame:
    """Run ``sql`` (with optional ``params``) and return the result as a DataFrame.

    The statement is handed to ``self._fetchdf`` unchanged.
    """
    frame = self._fetchdf(sql, params)
    return frame
|
|
1243
|
+
|
|
1244
|
+
# ------------------------------------------------------------------
|
|
1245
|
+
# Context manager / cleanup
|
|
1246
|
+
# ------------------------------------------------------------------
|
|
1247
|
+
|
|
1248
|
+
def close(self):
    """Close the DuckDB connection, then log that the lock was released."""
    connection = self.con
    connection.close()
    logger.info("DuckDB lock RELEASED: %s", self.db_path)
|
|
1252
|
+
|
|
1253
|
+
def reopen(self):
    """Open a fresh DuckDB connection to ``self.db_path`` after ``close()``.

    The "ACQUIRED" message is logged before the connection attempt.
    """
    target = str(self.db_path)
    logger.info("DuckDB lock ACQUIRED (reopen): %s", self.db_path)
    self.con = duckdb.connect(target)
|
|
1257
|
+
|
|
1258
|
+
def __enter__(self):
    """Enter the context manager and return this instance unchanged."""
    return self
|
|
1261
|
+
|
|
1262
|
+
def __exit__(self, *args):
|
|
1263
|
+
"""Exit context manager, closing the DuckDB connection."""
|
|
1264
|
+
self.close()
|
|
1265
|
+
|
|
1266
|
+
def __repr__(self):
|
|
1267
|
+
try:
|
|
1268
|
+
n_vars = self._fetchall(
|
|
1269
|
+
"SELECT COUNT(DISTINCT variable_name) FROM _variables"
|
|
1270
|
+
)[0][0]
|
|
1271
|
+
except Exception:
|
|
1272
|
+
n_vars = "?"
|
|
1273
|
+
return (
|
|
1274
|
+
f"SciDuck(path='{self.db_path}', "
|
|
1275
|
+
f"schema={self.dataset_schema}, variables={n_vars})"
|
|
1276
|
+
)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sciduckdb
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A thin DuckDB layer for managing versioned scientific data
|
|
5
|
+
Author: SciStack Contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Keywords: data-management,duckdb,scientific-data,versioning
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Requires-Dist: duckdb>=0.9.0
|
|
21
|
+
Requires-Dist: numpy>=1.20
|
|
22
|
+
Requires-Dist: pandas>=1.3
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: mypy>=1.0; extra == 'dev'
|
|
25
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# SciDuck
|
|
31
|
+
|
|
32
|
+
A thin DuckDB layer for managing versioned scientific data.
|
|
33
|
+
|
|
34
|
+
Each variable is stored in its own table. Variables are associated with a hierarchical dataset schema (e.g. subject -> session -> trial) and can be saved at any level of that hierarchy. Multiple versions of each variable are supported natively.
|
|
35
|
+
|
|
36
|
+
All data -- including arrays -- is stored in queryable DuckDB types (LIST, nested LIST, JSON) so the database can be inspected with DBeaver or any DuckDB-compatible viewer.
|
|
37
|
+
|
|
38
|
+
## Usage
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from sciduckdb import SciDuck
|
|
42
|
+
|
|
43
|
+
duck = SciDuck("data.duckdb", dataset_schema=["subject", "session"])
|
|
44
|
+
duck.save("MyVar", data, subject="S01", session=1)
|
|
45
|
+
loaded = duck.load("MyVar", subject="S01", session=1)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Features
|
|
49
|
+
|
|
50
|
+
- **Three save modes**: DataFrame with schema columns (Mode A), single entry via kwargs (Mode B), or dict mapping tuples to values (Mode C)
|
|
51
|
+
- **Automatic type inference**: Maps Python/numpy types to DuckDB types
|
|
52
|
+
- **Round-trip restoration**: Metadata tracks original types for lossless load
|
|
53
|
+
- **Version management**: Automatic version numbering, duplicate hash detection
|
|
54
|
+
- **Variable groups**: Organize variables into named groups
|
|
55
|
+
- **Schema validation**: Validates dataset schema consistency across sessions
|
|
56
|
+
|
|
57
|
+
Note: all schema key values are coerced to strings before storage.
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
sciduckdb/__init__.py,sha256=o2xnzbv-_7AakjaAxf-934Xp3mULCBtHKmVD3P4FrPE,741
|
|
2
|
+
sciduckdb/sciduckdb.py,sha256=CX-Vs8SxkbKeaoYqJndDrfBOJud6NCGIaX-CZ_ipSkI,49538
|
|
3
|
+
sciduckdb-0.1.0.dist-info/METADATA,sha256=cjvpli30jLVGZIvX6ctaHkhcZ-9UsrvVFK9lYIUcWgA,2441
|
|
4
|
+
sciduckdb-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
5
|
+
sciduckdb-0.1.0.dist-info/RECORD,,
|