sciduckdb 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sciduckdb/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ """SciDuck — A thin DuckDB layer for managing versioned scientific data."""
2
+
3
+ from .sciduckdb import (
4
+ SciDuck,
5
+ _infer_duckdb_type,
6
+ _numpy_dtype_to_duckdb,
7
+ _python_to_storage,
8
+ _storage_to_python,
9
+ _storage_to_python_column,
10
+ _infer_data_columns,
11
+ _value_to_storage_row,
12
+ _dataframe_to_storage_rows,
13
+ _bulk_df_to_storage_rows,
14
+ _flatten_dict,
15
+ _unflatten_dict,
16
+ )
17
+
18
+ __all__ = [
19
+ "SciDuck",
20
+ "_infer_duckdb_type",
21
+ "_numpy_dtype_to_duckdb",
22
+ "_python_to_storage",
23
+ "_storage_to_python",
24
+ "_storage_to_python_column",
25
+ "_infer_data_columns",
26
+ "_value_to_storage_row",
27
+ "_dataframe_to_storage_rows",
28
+ "_bulk_df_to_storage_rows",
29
+ "_flatten_dict",
30
+ "_unflatten_dict",
31
+ ]
sciduckdb/sciduckdb.py ADDED
@@ -0,0 +1,1276 @@
1
+ """
2
+ SciDuck — A thin DuckDB layer for managing versioned scientific data.
3
+
4
+ Each variable is stored in its own table. Variables are associated with a
5
+ hierarchical dataset schema (e.g. subject → session → trial) and can be
6
+ saved at any level of that hierarchy. Multiple versions of each variable
7
+ are supported natively.
8
+
9
+ All data — including arrays — is stored in queryable DuckDB types (LIST,
10
+ nested LIST, JSON) so the database can be inspected with DBeaver or any
11
+ DuckDB-compatible viewer.
12
+ """
13
+
14
+ import duckdb
15
+ import logging
16
+ import pandas as pd
17
+ import numpy as np
18
+ import json
19
+ import datetime
20
+ import threading
21
+ from pathlib import Path
22
+ from typing import Any, Dict, List, Optional, Tuple, Union
23
+
24
+ logger = logging.getLogger("sciduck")
25
+
26
+
27
+ def _schema_str(value):
28
+ """Stringify a schema key value, converting whole-number floats to int.
29
+
30
+ Schema keys are stored as VARCHAR. str(1.0) → "1.0" but str(1) → "1".
31
+ MATLAB sends all numbers as float, so without this conversion queries
32
+ and cache lookups fail because "1.0" ≠ "1".
33
+ """
34
+ if isinstance(value, float) and value.is_integer():
35
+ return str(int(value))
36
+ return str(value)
37
+
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Type mapping helpers
41
+ # ---------------------------------------------------------------------------
42
+
43
+ def _numpy_dtype_to_duckdb(dtype: np.dtype) -> str:
44
+ """Map a numpy scalar dtype to a DuckDB type string."""
45
+ kind = dtype.kind
46
+ size = dtype.itemsize
47
+ if kind == "f":
48
+ return "FLOAT" if size <= 4 else "DOUBLE"
49
+ if kind in ("i", "u"):
50
+ mapping = {1: "TINYINT", 2: "SMALLINT", 4: "INTEGER", 8: "BIGINT"}
51
+ base = mapping.get(size, "BIGINT")
52
+ if kind == "u":
53
+ return "U" + base
54
+ return base
55
+ if kind == "b":
56
+ return "BOOLEAN"
57
+ if kind in ("U", "S", "O"):
58
+ return "VARCHAR"
59
+ if kind == "M":
60
+ return "TIMESTAMP"
61
+ if kind == "m":
62
+ return "INTERVAL"
63
+ return "VARCHAR"
64
+
65
+
66
+ def _infer_duckdb_type(value: Any) -> Tuple[str, dict]:
67
+ """
68
+ Infer the DuckDB column type and a metadata dict for round-trip
69
+ restoration from a single Python/numpy value.
70
+
71
+ Returns (duckdb_type_str, metadata_dict).
72
+ """
73
+ meta: dict = {}
74
+
75
+ # --- numpy arrays ---
76
+ if isinstance(value, np.ndarray):
77
+ base = _numpy_dtype_to_duckdb(value.dtype)
78
+ meta["python_type"] = "ndarray"
79
+ meta["numpy_dtype"] = str(value.dtype)
80
+ meta["ndim"] = value.ndim
81
+ meta["shape_hint"] = list(value.shape)
82
+ if value.ndim == 1:
83
+ return f"{base}[]", meta
84
+ if value.ndim == 2:
85
+ meta["shape_hint"] = [None, value.shape[1]] # rows vary, cols fixed
86
+ return f"{base}[][]", meta
87
+ # 3-D+ : store as JSON
88
+ meta["python_type"] = "ndarray_json"
89
+ return "VARCHAR", meta
90
+
91
+ # --- Python scalars ---
92
+ if isinstance(value, bool):
93
+ meta["python_type"] = "bool"
94
+ return "BOOLEAN", meta
95
+ if isinstance(value, int):
96
+ meta["python_type"] = "int"
97
+ return "BIGINT", meta
98
+ if isinstance(value, float):
99
+ meta["python_type"] = "float"
100
+ return "DOUBLE", meta
101
+ if isinstance(value, str):
102
+ meta["python_type"] = "str"
103
+ return "VARCHAR", meta
104
+
105
+ # --- Python lists ---
106
+ if isinstance(value, list):
107
+ meta["python_type"] = "list"
108
+ if len(value) > 0:
109
+ inner = value[0]
110
+ # Check for homogeneous list
111
+ if isinstance(inner, list):
112
+ if not all(isinstance(v, list) for v in value):
113
+ raise TypeError(
114
+ "Heterogeneous lists are not supported. "
115
+ "All elements must be the same type."
116
+ )
117
+ meta["nested"] = True
118
+ return "DOUBLE[][]", meta
119
+ if isinstance(inner, np.ndarray):
120
+ if not all(isinstance(v, np.ndarray) for v in value):
121
+ raise TypeError(
122
+ "Heterogeneous lists are not supported. "
123
+ "All elements must be the same type."
124
+ )
125
+ meta["nested"] = True
126
+ meta["contains_ndarray"] = True
127
+ meta["ndarray_dtype"] = str(inner.dtype)
128
+ return "DOUBLE[][]", meta
129
+ if isinstance(inner, (int, float)):
130
+ if not all(isinstance(v, (int, float)) for v in value):
131
+ raise TypeError(
132
+ "Heterogeneous lists are not supported. "
133
+ "All elements must be the same type."
134
+ )
135
+ return "DOUBLE[]", meta
136
+ if isinstance(inner, str):
137
+ if not all(isinstance(v, str) for v in value):
138
+ raise TypeError(
139
+ "Heterogeneous lists are not supported. "
140
+ "All elements must be the same type."
141
+ )
142
+ return "VARCHAR[]", meta
143
+ return "VARCHAR[]", meta
144
+
145
+ # --- dict → JSON ---
146
+ if isinstance(value, dict):
147
+ meta["python_type"] = "dict"
148
+ # Track ndarray values for restoration
149
+ ndarray_keys = {}
150
+ for k, v in value.items():
151
+ if isinstance(v, np.ndarray):
152
+ ndarray_keys[k] = {
153
+ "dtype": str(v.dtype),
154
+ "shape": list(v.shape),
155
+ }
156
+ if ndarray_keys:
157
+ meta["ndarray_keys"] = ndarray_keys
158
+ return "JSON", meta
159
+
160
+ # --- datetime ---
161
+ if isinstance(value, (datetime.datetime, pd.Timestamp)):
162
+ meta["python_type"] = "datetime"
163
+ return "TIMESTAMP", meta
164
+ if isinstance(value, datetime.date):
165
+ meta["python_type"] = "date"
166
+ return "DATE", meta
167
+ if isinstance(value, (datetime.timedelta, pd.Timedelta)):
168
+ meta["python_type"] = "INTERVAL"
169
+ return "INTERVAL", meta
170
+
171
+ # --- pandas categorical (shouldn't normally arrive here, but handle) ---
172
+ if isinstance(value, pd.Categorical):
173
+ meta["python_type"] = "categorical"
174
+ return "VARCHAR", meta
175
+
176
+ # --- fallback: JSON-serialize ---
177
+ meta["python_type"] = "json_fallback"
178
+ return "VARCHAR", meta
179
+
180
+
181
+ def _convert_for_json(value: Any) -> Any:
182
+ """Recursively convert ndarrays/DataFrames to lists for JSON serialization."""
183
+ if isinstance(value, pd.DataFrame):
184
+ return _convert_for_json(value.to_dict("list"))
185
+ if isinstance(value, pd.Series):
186
+ return value.tolist()
187
+ if isinstance(value, np.ndarray):
188
+ return value.tolist()
189
+ if isinstance(value, dict):
190
+ return {k: _convert_for_json(v) for k, v in value.items()}
191
+ if isinstance(value, list):
192
+ return [_convert_for_json(v) for v in value]
193
+ return value
194
+
195
+
196
+ def _python_to_storage(value: Any, meta: dict) -> Any:
197
+ """Convert a Python value to its DuckDB-storable form."""
198
+ ptype = meta.get("python_type", "")
199
+
200
+ if ptype == "ndarray":
201
+ arr = value
202
+ # Scalar in a column typed as ndarray (e.g. ragged vectors): wrap as 1-element list
203
+ if not isinstance(arr, np.ndarray):
204
+ return [arr]
205
+ if arr.ndim == 1:
206
+ return arr.tolist()
207
+ if arr.ndim == 2:
208
+ return [row.tolist() for row in arr]
209
+
210
+ if ptype == "ndarray_json":
211
+ return json.dumps(value.tolist())
212
+
213
+ if ptype == "dict":
214
+ return json.dumps(_convert_for_json(value))
215
+
216
+ if ptype == "json_fallback":
217
+ return json.dumps(_convert_for_json(value))
218
+
219
+ if ptype == "list":
220
+ # Convert ndarrays within list to nested lists
221
+ if meta.get("contains_ndarray"):
222
+ return [v.tolist() if isinstance(v, np.ndarray) else v for v in value]
223
+ return value # DuckDB handles native lists
224
+
225
+ return value
226
+
227
+
228
+ def _storage_to_python(value: Any, meta: dict) -> Any:
229
+ """Restore a stored DuckDB value back to its original Python type."""
230
+ ptype = meta.get("python_type", "")
231
+
232
+ if ptype == "ndarray":
233
+ dtype = np.dtype(meta.get("numpy_dtype", "float64"))
234
+ ndim = meta.get("ndim", 1)
235
+ if ndim >= 2:
236
+ # DuckDB returns ndarray of ndarrays; stack them
237
+ return np.stack([np.asarray(row) for row in value]).astype(dtype)
238
+ return np.asarray(value, dtype=dtype)
239
+
240
+ if ptype == "ndarray_json":
241
+ dtype = np.dtype(meta.get("numpy_dtype", "float64"))
242
+ return np.array(json.loads(value), dtype=dtype)
243
+
244
+ if ptype == "dict":
245
+ if isinstance(value, str):
246
+ result = json.loads(value)
247
+ else:
248
+ result = value # DuckDB JSON type may already return dict
249
+ # Restore ndarray values if metadata exists
250
+ ndarray_keys = meta.get("ndarray_keys", {})
251
+ for k, arr_meta in ndarray_keys.items():
252
+ if k in result:
253
+ dtype = np.dtype(arr_meta.get("dtype", "float64"))
254
+ result[k] = np.array(result[k], dtype=dtype)
255
+ return result
256
+
257
+ if ptype == "json_fallback":
258
+ return json.loads(value)
259
+
260
+ if ptype == "list":
261
+ # DuckDB may return ndarray; convert back to list
262
+ if meta.get("contains_ndarray"):
263
+ # Restore as list of ndarrays
264
+ dtype = np.dtype(meta.get("ndarray_dtype", "float64"))
265
+ if isinstance(value, np.ndarray):
266
+ return [np.asarray(v, dtype=dtype) for v in value]
267
+ return [np.asarray(v, dtype=dtype) for v in value]
268
+ if isinstance(value, np.ndarray):
269
+ if meta.get("nested"):
270
+ return [v.tolist() if isinstance(v, np.ndarray) else v for v in value]
271
+ return value.tolist()
272
+ return value
273
+
274
+ if ptype == "int":
275
+ return int(value) if value is not None else None
276
+
277
+ if ptype == "float":
278
+ return float(value) if value is not None else None
279
+
280
+ if ptype == "bool":
281
+ return bool(value) if value is not None else None
282
+
283
+ if ptype == "str":
284
+ return str(value) if value is not None else None
285
+
286
+ return value
287
+
288
+
289
+ def _storage_to_python_column(series: "pd.Series", meta: dict) -> "pd.Series":
290
+ """Vectorized column-level dispatch of _storage_to_python.
291
+
292
+ Applied once per column in bulk loads instead of once per cell (N records ×
293
+ M columns calls vs N×M calls for the per-element path). Pass-through types
294
+ (float, int, bool, str) are returned unchanged — DuckDB already emits the
295
+ right pandas dtype for those. Complex types use pd.Series.apply, which is
296
+ still faster than an explicit Python for-loop.
297
+ """
298
+ ptype = meta.get("python_type", "")
299
+
300
+ # Scalar types: DuckDB already returns the right pandas dtype — no-op.
301
+ if ptype in ("float", "int", "bool", "str", ""):
302
+ return series
303
+
304
+ # JSON blob types: decode string once per cell, but at column granularity.
305
+ if ptype in ("dict", "json_fallback"):
306
+ return series.apply(lambda v: json.loads(v) if isinstance(v, str) else v)
307
+
308
+ # All remaining types (ndarray, ndarray_json, list, …): delegate per-element.
309
+ return series.apply(lambda v: _storage_to_python(v, meta))
310
+
311
+
312
+ def _flatten_dict(d, _prefix=()):
313
+ """Flatten a nested dict into {dot.separated.key: leaf_value} pairs.
314
+ Returns (flat_dict, path_map) where path_map maps each dot-key
315
+ to its tuple-of-keys path for faithful reconstruction."""
316
+ flat = {}
317
+ paths = {}
318
+ for k, v in d.items():
319
+ current = _prefix + (k,)
320
+ if isinstance(v, dict):
321
+ sub_flat, sub_paths = _flatten_dict(v, current)
322
+ flat.update(sub_flat)
323
+ paths.update(sub_paths)
324
+ else:
325
+ dot_key = ".".join(current)
326
+ flat[dot_key] = v
327
+ paths[dot_key] = list(current)
328
+ return flat, paths
329
+
330
+
331
+ def _unflatten_dict(flat, path_map):
332
+ """Reconstruct a nested dict from flat dot-keys using stored path_map."""
333
+ result = {}
334
+ for dot_key, value in flat.items():
335
+ path = path_map.get(dot_key, dot_key.split("."))
336
+ current = result
337
+ for key in path[:-1]:
338
+ current = current.setdefault(key, {})
339
+ current[path[-1]] = value
340
+ return result
341
+
342
+
343
+ # ---------------------------------------------------------------------------
344
+ # Column inference & storage-row helpers (module-level, used by SciDuck and
345
+ # DatabaseManager)
346
+ # ---------------------------------------------------------------------------
347
+
348
+ def _infer_data_columns(
349
+ sample_value: Any, data_col_name: Optional[str] = None
350
+ ) -> Tuple[dict, dict]:
351
+ """
352
+ From a sample data value, return:
353
+ - data_col_types: dict of {col_name: duckdb_type_str}
354
+ - dtype_meta: metadata dict for round-trip restoration
355
+ """
356
+ # DataFrame mode: each DataFrame column → its own DuckDB column.
357
+ # One DuckDB row is stored per DataFrame row; the column type reflects
358
+ # the individual cell value type (independent of table height).
359
+ if isinstance(sample_value, pd.DataFrame):
360
+ col_types = {}
361
+ meta = {
362
+ "mode": "dataframe",
363
+ "columns": {},
364
+ "df_columns": list(sample_value.columns),
365
+ }
366
+ for col_name in sample_value.columns:
367
+ col_series = sample_value[col_name]
368
+ if len(sample_value) == 0:
369
+ ddb_type = "VARCHAR"
370
+ col_meta = {"python_type": "str"}
371
+ else:
372
+ cell_val = col_series.iloc[0]
373
+ if isinstance(cell_val, np.generic):
374
+ cell_val = cell_val.item()
375
+ # to_python.m sends array cells as Python lists (via .tolist()).
376
+ # Normalise to ndarray so _infer_duckdb_type handles them correctly.
377
+ if isinstance(cell_val, list) and len(cell_val) > 0:
378
+ cell_val = np.asarray(cell_val)
379
+ ddb_type, col_meta = _infer_duckdb_type(cell_val)
380
+ col_types[col_name] = ddb_type
381
+ meta["columns"][col_name] = col_meta
382
+ return col_types, meta
383
+
384
+ # Dict mode: each key → its own DuckDB column (nested dicts are flattened)
385
+ if isinstance(sample_value, dict):
386
+ has_nested = any(isinstance(v, dict) for v in sample_value.values())
387
+ if has_nested:
388
+ flat, path_map = _flatten_dict(sample_value)
389
+ else:
390
+ flat = sample_value
391
+ path_map = {k: [k] for k in sample_value}
392
+ col_types = {}
393
+ meta = {"mode": "multi_column", "columns": {}}
394
+ if has_nested:
395
+ meta["nested"] = True
396
+ meta["path_map"] = path_map
397
+ for col_name, val in flat.items():
398
+ # Unwrap length-1 arrays to scalars before type inference
399
+ if isinstance(val, np.ndarray) and val.size == 1:
400
+ val = val.item()
401
+ ddb_type, col_meta = _infer_duckdb_type(val)
402
+ col_types[col_name] = ddb_type
403
+ meta["columns"][col_name] = col_meta
404
+ return col_types, meta
405
+
406
+ # Single-column mode — use provided name or default to "value"
407
+ col_name = data_col_name or "value"
408
+ ddb_type, col_meta = _infer_duckdb_type(sample_value)
409
+ meta = {"mode": "single_column", "columns": {col_name: col_meta}}
410
+ return {col_name: ddb_type}, meta
411
+
412
+
413
+ def _dataframe_to_storage_rows(df: pd.DataFrame, dtype_meta: dict) -> list:
414
+ """Convert a DataFrame to a list of per-row storage values.
415
+
416
+ Returns a list of lists: one inner list per DataFrame row, each containing
417
+ one storage-ready value per column in the order defined by dtype_meta["columns"].
418
+ """
419
+ col_metas = dtype_meta["columns"]
420
+ rows = []
421
+ for i in range(len(df)):
422
+ row = []
423
+ for col, col_meta in col_metas.items():
424
+ cell_val = df[col].iloc[i]
425
+ if isinstance(cell_val, np.generic):
426
+ cell_val = cell_val.item()
427
+ # to_python.m sends array cells as Python lists (via .tolist()).
428
+ # Normalise to ndarray so _python_to_storage handles them correctly.
429
+ if isinstance(cell_val, list) and len(cell_val) > 0:
430
+ cell_val = np.asarray(cell_val)
431
+ row.append(_python_to_storage(cell_val, col_meta))
432
+ rows.append(row)
433
+ return rows
434
+
435
+
436
+ def _bulk_df_to_storage_rows(
437
+ df_list: list, record_ids: list, dtype_meta: dict
438
+ ) -> list:
439
+ """Bulk convert N DataFrames to (record_id, ...storage_values) rows.
440
+
441
+ Equivalent to calling _dataframe_to_storage_rows N times and assembling
442
+ (record_id, ...) tuples, but processes each column as a whole to avoid
443
+ O(N×C) per-cell pandas iloc overhead.
444
+
445
+ Falls back to the per-row path when DataFrame schemas differ.
446
+ """
447
+ if not df_list:
448
+ return []
449
+
450
+ col_metas = dtype_meta["columns"]
451
+ first_cols = list(df_list[0].columns)
452
+
453
+ # Fall back to per-row path if schemas differ (shouldn't happen in normal use).
454
+ if not all(list(df.columns) == first_cols for df in df_list):
455
+ rows: list = []
456
+ for rid, df in zip(record_ids, df_list):
457
+ for storage_row in _dataframe_to_storage_rows(df, dtype_meta):
458
+ rows.append((rid,) + tuple(storage_row))
459
+ return rows
460
+
461
+ # Build flat record_id list: one entry per storage row (multi-row records
462
+ # contribute len(df) entries, typical 1-row records contribute 1).
463
+ expanded_rids = []
464
+ for rid, df in zip(record_ids, df_list):
465
+ expanded_rids.extend([rid] * len(df))
466
+
467
+ # Concat once so column operations don't cross DataFrame boundaries.
468
+ big_df = pd.concat(df_list, ignore_index=True)
469
+
470
+ # Build per-column storage arrays using column-level operations.
471
+ col_arrays: dict = {}
472
+ for col, col_meta in col_metas.items():
473
+ ptype = col_meta.get("python_type", "")
474
+ raw = big_df[col]
475
+
476
+ if ptype == "ndarray":
477
+ vals = raw.to_numpy()
478
+ col_arrays[col] = [
479
+ v.tolist() if isinstance(v, np.ndarray)
480
+ else (v if isinstance(v, list) else [v])
481
+ for v in vals
482
+ ]
483
+ elif ptype in ("dict", "json_fallback"):
484
+ col_arrays[col] = [json.dumps(_convert_for_json(v)) for v in raw.to_numpy()]
485
+ elif ptype == "list":
486
+ if col_meta.get("contains_ndarray"):
487
+ col_arrays[col] = [
488
+ [e.tolist() if isinstance(e, np.ndarray) else e for e in v]
489
+ for v in raw.to_numpy()
490
+ ]
491
+ else:
492
+ col_arrays[col] = raw.tolist()
493
+ else:
494
+ # Scalar types (float, int, str, bool …): tolist() converts numpy
495
+ # scalars to Python builtins, which is what DuckDB expects.
496
+ col_arrays[col] = raw.tolist()
497
+
498
+ cols_in_order = list(col_metas.keys())
499
+ n = len(big_df)
500
+ return [
501
+ (expanded_rids[i],) + tuple(col_arrays[col][i] for col in cols_in_order)
502
+ for i in range(n)
503
+ ]
504
+
505
+
506
+ def _value_to_storage_row(value: Any, dtype_meta: dict) -> list:
507
+ """Convert a data value to a list of storage-ready column values.
508
+
509
+ For DataFrames use _dataframe_to_storage_rows() instead.
510
+ """
511
+ mode = dtype_meta.get("mode", "single_column")
512
+ col_metas = dtype_meta["columns"]
513
+
514
+ if mode == "multi_column":
515
+ if dtype_meta.get("nested"):
516
+ flat, _ = _flatten_dict(value)
517
+ else:
518
+ flat = value
519
+ return [
520
+ _python_to_storage(flat[col], col_metas[col])
521
+ for col in col_metas
522
+ ]
523
+ else:
524
+ # Single column — get the one key (could be "value" or a named column)
525
+ col_name = next(iter(col_metas))
526
+ col_meta = col_metas[col_name]
527
+ return [_python_to_storage(value, col_meta)]
528
+
529
+
530
+ # ---------------------------------------------------------------------------
531
+ # Main class
532
+ # ---------------------------------------------------------------------------
533
+
534
+ class SciDuck:
535
+ """
536
+ A thin DuckDB layer for managing versioned, schema-aware scientific data.
537
+
538
+ Parameters
539
+ ----------
540
+ db_path : str or Path
541
+ Path to the DuckDB database file. Use ":memory:" for in-memory.
542
+ dataset_schema : list of str
543
+ Ordered hierarchy, e.g. ["subject", "session", "trial"].
544
+ """
545
+
546
def __init__(self, db_path: Union[str, Path], dataset_schema: List[str]):
    """Open the database and make sure the metadata tables exist.

    Parameters
    ----------
    db_path : str or Path
        Path to the DuckDB database file; stored as a string.
    dataset_schema : list of str
        Ordered schema hierarchy; a copy is stored.

    Raises
    ------
    Exception
        Whatever ``duckdb.connect`` or ``_init_metadata_tables`` raises
        (the latter raises ``ValueError`` on a schema mismatch).  If
        initialisation fails the connection is closed before re-raising so
        the database is not left open by a half-constructed object.
    """
    self.db_path = str(db_path)
    self.dataset_schema = list(dataset_schema)
    # Serialises every use of the shared connection.
    self._lock = threading.Lock()
    self.con = duckdb.connect(self.db_path)
    # Logged only once the connection exists, so the message is never
    # emitted for a connect that failed.
    logger.info("DuckDB lock ACQUIRED: %s", self.db_path)
    try:
        self._init_metadata_tables()
    except Exception:
        self.con.close()
        raise
553
+
554
+ # ------------------------------------------------------------------
555
+ # Thin internal interface (future backend swap point)
556
+ # ------------------------------------------------------------------
557
+
558
+ def _execute(self, sql: str, params=None):
559
+ # NOTE: DuckDB's Python connection returns itself from execute(), so
560
+ # execute() and fetchXxx() share the same connection state. All callers
561
+ # that fetch results must hold _lock for the entire execute+fetch sequence.
562
+ # Use _fetchall / _fetchdf for queries that return rows; call _execute
563
+ # directly (under _lock) only for DDL/DML that needs no fetch.
564
+ with self._lock:
565
+ if params:
566
+ return self.con.execute(sql, params)
567
+ return self.con.execute(sql)
568
+
569
+ def _executemany(self, sql: str, params_list):
570
+ with self._lock:
571
+ return self.con.executemany(sql, params_list)
572
+
573
+ def _begin(self):
574
+ with self._lock:
575
+ self.con.execute("BEGIN TRANSACTION")
576
+
577
+ def _commit(self):
578
+ with self._lock:
579
+ self.con.execute("COMMIT")
580
+
581
+ def _rollback(self):
582
+ with self._lock:
583
+ self.con.execute("ROLLBACK")
584
+
585
+ def _fetchall(self, sql: str, params=None) -> list:
586
+ with self._lock:
587
+ if params:
588
+ return self.con.execute(sql, params).fetchall()
589
+ return self.con.execute(sql).fetchall()
590
+
591
def fetchall(self, sql: str, params=None) -> list:
    """Public alias for _fetchall — accessible from MATLAB (underscore methods are not)."""
    rows = self._fetchall(sql, params)
    return rows
594
+
595
+ def _fetchdf(self, sql: str, params=None) -> pd.DataFrame:
596
+ with self._lock:
597
+ if params:
598
+ return self.con.execute(sql, params).fetchdf()
599
+ return self.con.execute(sql).fetchdf()
600
+
601
+ def _table_exists(self, name: str) -> bool:
602
+ rows = self._fetchall(
603
+ "SELECT COUNT(*) FROM information_schema.tables "
604
+ "WHERE table_name = ?", [name]
605
+ )
606
+ return rows[0][0] > 0
607
+
608
+ # ------------------------------------------------------------------
609
+ # Metadata table creation
610
+ # ------------------------------------------------------------------
611
+
612
+ def _init_metadata_tables(self):
613
+ # --- _schema ---
614
+ schema_cols = ", ".join(f'"{s}" VARCHAR' for s in self.dataset_schema)
615
+ self._execute(f"""
616
+ CREATE TABLE IF NOT EXISTS _schema (
617
+ schema_id INTEGER PRIMARY KEY,
618
+ schema_level VARCHAR NOT NULL,
619
+ {schema_cols}
620
+ )
621
+ """)
622
+ # Create a sequence for schema_id if it doesn't exist
623
+ try:
624
+ self._execute("CREATE SEQUENCE IF NOT EXISTS _schema_id_seq START 1")
625
+ except Exception:
626
+ pass # sequence already exists
627
+
628
+ # --- _variables ---
629
+ self._execute("""
630
+ CREATE TABLE IF NOT EXISTS _variables (
631
+ variable_name VARCHAR PRIMARY KEY,
632
+ schema_level VARCHAR NOT NULL,
633
+ dtype VARCHAR,
634
+ created_at TIMESTAMP DEFAULT current_timestamp,
635
+ description VARCHAR DEFAULT ''
636
+ )
637
+ """)
638
+
639
+ # --- _variable_groups ---
640
+ self._execute("""
641
+ CREATE TABLE IF NOT EXISTS _variable_groups (
642
+ group_name VARCHAR NOT NULL,
643
+ variable_name VARCHAR NOT NULL,
644
+ PRIMARY KEY (group_name, variable_name)
645
+ )
646
+ """)
647
+
648
+ # Validate schema consistency if _schema already has data
649
+ if self._fetchall("SELECT COUNT(*) FROM _schema")[0][0] > 0:
650
+ existing_cols = [
651
+ row[0] for row in self._fetchall(
652
+ "SELECT column_name FROM information_schema.columns "
653
+ "WHERE table_name = '_schema' "
654
+ "AND column_name NOT IN ('schema_id', 'schema_level') "
655
+ "ORDER BY ordinal_position"
656
+ )
657
+ ]
658
+ if existing_cols != self.dataset_schema:
659
+ raise ValueError(
660
+ f"Database schema mismatch. "
661
+ f"Existing: {existing_cols}, Provided: {self.dataset_schema}"
662
+ )
663
+
664
+ # ------------------------------------------------------------------
665
+ # Schema entry management
666
+ # ------------------------------------------------------------------
667
+
668
+ def _schema_key_columns(self, schema_level: str) -> List[str]:
669
+ """Return schema columns from the top down to (and including) schema_level."""
670
+ idx = self.dataset_schema.index(schema_level)
671
+ return self.dataset_schema[: idx + 1]
672
+
673
+ def _get_or_create_schema_id(self, schema_level: str, key_values: dict) -> int:
674
+ """Look up or insert a row in _schema. Return the schema_id."""
675
+ key_cols = [k for k in self.dataset_schema if k in key_values]
676
+
677
+ # Build WHERE clause
678
+ conditions = []
679
+ params = [schema_level]
680
+ for col in key_cols:
681
+ conditions.append(f'"{col}" = ?')
682
+ params.append(_schema_str(key_values[col]))
683
+ # Columns above the level that should be NULL are implicit —
684
+ # but to be safe, also require NULLs for levels below.
685
+ for col in self.dataset_schema:
686
+ if col not in key_cols:
687
+ conditions.append(f'"{col}" IS NULL')
688
+
689
+ where = " AND ".join(conditions)
690
+ rows = self._fetchall(
691
+ f'SELECT schema_id FROM _schema WHERE schema_level = ? AND {where}',
692
+ params,
693
+ )
694
+ if rows:
695
+ return rows[0][0]
696
+
697
+ # Insert new entry — use MAX+1 for consistency with batch path
698
+ new_id = self._fetchall(
699
+ "SELECT COALESCE(MAX(schema_id), 0) + 1 FROM _schema"
700
+ )[0][0]
701
+ col_names = ["schema_id", "schema_level"] + key_cols
702
+ placeholders = ", ".join(["?"] * len(col_names))
703
+ col_str = ", ".join(f'"{c}"' for c in col_names)
704
+ values = [new_id, schema_level] + [_schema_str(key_values[c]) for c in key_cols]
705
+ self._execute(
706
+ f"INSERT INTO _schema ({col_str}) VALUES ({placeholders})", values
707
+ )
708
+ return new_id
709
+
710
def batch_get_or_create_schema_ids(
    self,
    combos: dict,  # {(schema_level, key_tuple): key_values_dict}
) -> dict:
    """
    Batch-resolve schema IDs for multiple (schema_level, key_values) combos.

    Combos are grouped by (schema_level, set of provided keys).  For each
    group this does:
      1. One SELECT fetching the group's existing ``_schema`` rows, which
         are matched against the combos in Python.
      2. For combos with no match, one MAX(schema_id) query and one bulk
         INSERT, with IDs allocated sequentially from MAX + 1.

    ID allocation and INSERT run under a single ``_lock`` hold so no other
    user of the shared connection can interleave between them.  Combos
    whose key values normalise to the same strings (e.g. ``1`` and ``1.0``)
    share one inserted row and one ID.

    Args:
        combos: dict mapping (schema_level, key_tuple) -> key_values dict

    Returns:
        dict mapping (schema_level, key_tuple) -> schema_id
    """
    if not combos:
        return {}

    result = {}

    # Group combos by (schema_level, key set) so each group needs one query.
    by_level_and_keys = {}
    for (schema_level, key_tuple), key_values in combos.items():
        group_key = (schema_level, frozenset(key_values.keys()))
        by_level_and_keys.setdefault(group_key, []).append(
            ((schema_level, key_tuple), key_values)
        )

    for (schema_level, key_set), entries in by_level_and_keys.items():
        key_cols = [k for k in self.dataset_schema if k in key_set]
        null_cols = [c for c in self.dataset_schema if c not in key_cols]

        where_clause = "schema_level = ?"
        if null_cols:
            where_clause += " AND " + " AND ".join(
                f'"{col}" IS NULL' for col in null_cols
            )

        # Building the select list this way keeps the SQL valid even when
        # the group has no key columns.
        select_list = ", ".join(["schema_id"] + [f'"{c}"' for c in key_cols])
        rows = self._fetchall(
            f"SELECT {select_list} FROM _schema WHERE {where_clause}",
            [schema_level],
        )

        # Lookup: tuple of normalised key strings -> schema_id.
        existing_lookup = {
            tuple(_schema_str(v) if v is not None else "" for v in row[1:]): row[0]
            for row in rows
        }

        # Split entries into already-known and to-be-inserted.  Pending
        # entries are keyed by normalised key so duplicates collapse.
        pending = {}  # match_key -> [combo_key, ...]
        for combo_key, key_values in entries:
            match_key = tuple(
                _schema_str(key_values.get(c, "")) for c in key_cols
            )
            if match_key in existing_lookup:
                result[combo_key] = existing_lookup[match_key]
            else:
                pending.setdefault(match_key, []).append(combo_key)

        if not pending:
            continue

        col_names = ["schema_id", "schema_level"] + key_cols
        col_str = ", ".join(f'"{c}"' for c in col_names)

        with self._lock:
            # Allocate a block of IDs from the current max.
            first_id = self.con.execute(
                "SELECT COALESCE(MAX(schema_id), 0) FROM _schema"
            ).fetchall()[0][0] + 1

            insert_rows = [
                [first_id + offset, schema_level, *match_key]
                for offset, match_key in enumerate(pending)
            ]
            # NOTE: insert_df is referenced by name inside the SQL below;
            # DuckDB resolves it from this frame's local variables, so the
            # variable must keep this name and stay in this scope.
            insert_df = pd.DataFrame(insert_rows, columns=col_names)
            self.con.execute(
                f"INSERT INTO _schema ({col_str}) SELECT * FROM insert_df"
            )

        for offset, combo_keys in enumerate(pending.values()):
            for combo_key in combo_keys:
                result[combo_key] = first_id + offset

    return result
803
+
804
+ # ------------------------------------------------------------------
805
+ # Save
806
+ # ------------------------------------------------------------------
807
+
808
def save(
    self,
    name: str,
    data: Any,
    schema_level: Optional[str] = None,
    description: str = "",
    force: bool = False,
    **schema_keys,
):
    """
    Save a variable to the database.

    Parameters
    ----------
    name : str
        Variable name (becomes the table name).
    data : Any
        The data to save. Can be:
        - pd.DataFrame with schema-level columns (Mode A)
        - Any Python/numpy object + schema_keys kwargs (Mode B, single entry)
        - dict mapping tuples → values (Mode C, batch)
    schema_level : str, optional
        Which schema level to store at. Defaults to the lowest level
        (in Mode B: the deepest schema column present in the kwargs).
    description : str
        Optional description for this variable.
    force : bool
        Deprecated, kept for backward compatibility. Not read by this method.
    **schema_keys
        Keyword arguments specifying the schema entry for Mode B.
        e.g. subject="S01", session=1, trial=3.
        Kwargs that do not name a schema column are ignored.
        Note: all schema key values are coerced to strings before storage.

    Raises
    ------
    ValueError
        If ``schema_level`` is not in the dataset schema, if a Mode C key
        tuple has the wrong length, if no save mode matches ``data``, or if
        ``data`` yields no entries (e.g. a Mode A DataFrame with zero rows).
    """
    # Set only by Mode A when the DataFrame has exactly one data column,
    # so that column's name is preserved in storage.
    data_col_name = None

    if schema_keys:
        # Mode B: single entry via kwargs, taken in dataset_schema order.
        provided_schema_cols = [k for k in self.dataset_schema if k in schema_keys]
        if schema_level is None:
            schema_level = (
                provided_schema_cols[-1]
                if provided_schema_cols
                else self.dataset_schema[-1]
            )
        if schema_level not in self.dataset_schema:
            raise ValueError(
                f"schema_level '{schema_level}' not in {self.dataset_schema}"
            )
        key_cols = provided_schema_cols
        entries = [(
            {k: schema_keys[k] for k in key_cols},
            data,
        )]

    else:
        if schema_level is None:
            schema_level = self.dataset_schema[-1]
        if schema_level not in self.dataset_schema:
            raise ValueError(
                f"schema_level '{schema_level}' not in {self.dataset_schema}"
            )
        key_cols = self._schema_key_columns(schema_level)

        # Mode A: DataFrame with schema columns
        if isinstance(data, pd.DataFrame) and all(c in data.columns for c in key_cols):
            entries, data_col_name = self._entries_from_dataframe(data, key_cols, schema_level)

        # Mode C: dict with tuple keys (only the first key is inspected)
        elif isinstance(data, dict) and data and isinstance(next(iter(data.keys())), tuple):
            entries = []
            for key_tuple, value in data.items():
                if len(key_tuple) != len(key_cols):
                    raise ValueError(
                        f"Key tuple length {len(key_tuple)} != "
                        f"expected {len(key_cols)} for level '{schema_level}'"
                    )
                key_dict = dict(zip(key_cols, key_tuple))
                entries.append((key_dict, value))

        else:
            raise ValueError(
                "Cannot determine save mode. Provide either:\n"
                "  (A) a DataFrame with schema-level columns,\n"
                "  (B) schema key kwargs (e.g. subject='S01', session=1), or\n"
                "  (C) a dict mapping tuples to values."
            )

    # Column types are inferred from the first entry, so there must be one.
    # Without this guard an empty Mode A DataFrame fails with a bare IndexError.
    if not entries:
        raise ValueError(
            f"No entries to save for variable '{name}': the supplied data is empty."
        )

    # --- Determine column types from the first entry's data ---
    sample_value = entries[0][1]
    data_col_types, dtype_meta = self._infer_data_columns(sample_value, data_col_name)

    # --- Ensure the variable table exists ---
    is_dataframe = dtype_meta.get("mode") == "dataframe"
    self._ensure_variable_table(name, data_col_types, schema_level,
                                is_dataframe=is_dataframe)

    # --- Insert rows (INSERT OR REPLACE for "latest wins" semantics) ---
    col_names = ["schema_id"] + list(data_col_types.keys())
    col_str = ", ".join(f'"{c}"' for c in col_names)
    placeholders = ", ".join(["?"] * len(col_names))

    # The statements do not depend on the entry, so build them once.
    delete_sql = f'DELETE FROM "{name}" WHERE schema_id = ?'
    insert_sql = f'INSERT INTO "{name}" ({col_str}) VALUES ({placeholders})'
    replace_sql = f'INSERT OR REPLACE INTO "{name}" ({col_str}) VALUES ({placeholders})'

    for key_dict, value in entries:
        schema_id = self._get_or_create_schema_id(schema_level, key_dict)
        if isinstance(value, pd.DataFrame):
            # Delete old rows for this schema_id, then insert one per DataFrame row.
            self._execute(delete_sql, [schema_id])
            for storage_row in _dataframe_to_storage_rows(value, dtype_meta):
                self._execute(insert_sql, [schema_id] + storage_row)
        else:
            storage_values = self._value_to_storage_row(value, dtype_meta)
            self._execute(replace_sql, [schema_id] + storage_values)

    # --- Register in _variables (one row per variable) ---
    # On conflict only dtype is refreshed; schema_level and description
    # keep the values from the first registration.
    self._execute(
        "INSERT INTO _variables (variable_name, schema_level, dtype, description) "
        "VALUES (?, ?, ?, ?) "
        "ON CONFLICT (variable_name) DO UPDATE SET dtype = excluded.dtype",
        [name, schema_level, json.dumps(dtype_meta), description],
    )
929
+
930
def _entries_from_dataframe(
    self, df: pd.DataFrame, key_cols: List[str], schema_level: str
) -> Tuple[List[Tuple[dict, Any]], Optional[str]]:
    """
    Convert a DataFrame (Mode A) into a list of (key_dict, row_data) entries.

    Each row in the DataFrame becomes one entry. The non-schema columns
    become the stored data: a bare value when there is exactly one data
    column, otherwise a dict keyed by column name.

    Values are read column by column rather than with ``df.iterrows()``.
    ``iterrows`` packs each row into a single Series, which upcasts mixed
    numeric columns (an int schema key next to a float data column would
    come back as a float, e.g. 1 -> 1.0).

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing every column named in ``key_cols``.
    key_cols : list of str
        Columns that form the schema key of each entry.
    schema_level : str
        Not read by this method.

    Returns
    -------
    (entries, single_col_name)
        ``single_col_name`` is set if there's exactly one data column
        (so its name can be preserved), otherwise None.
    """
    data_cols = [c for c in df.columns if c not in key_cols]
    single_col_name = data_cols[0] if len(data_cols) == 1 else None

    # Series.tolist() yields Python scalars for numeric dtypes while
    # keeping each column's own type.
    key_values = {k: df[k].tolist() for k in key_cols}
    data_values = {c: df[c].tolist() for c in data_cols}

    entries = []
    for i in range(len(df)):
        key_dict = {k: key_values[k][i] for k in key_cols}
        if single_col_name is not None:
            value = data_values[single_col_name][i]
            # Object columns can still hold numpy scalars; normalise them.
            if isinstance(value, np.integer):
                value = int(value)
            elif isinstance(value, np.floating):
                value = float(value)
            elif isinstance(value, np.bool_):
                value = bool(value)
        else:
            value = {c: data_values[c][i] for c in data_cols}
        entries.append((key_dict, value))
    return entries, single_col_name
960
+
961
def _infer_data_columns(
    self, sample_value: Any, data_col_name: Optional[str] = None
) -> Tuple[dict, dict]:
    """
    Forward to the module-level ``_infer_data_columns`` function.

    Both arguments are passed through unchanged and the function's result
    (a pair of dicts, per the return annotation) is returned as-is.
    """
    inferred = _infer_data_columns(sample_value, data_col_name)
    return inferred
966
+
967
def _value_to_storage_row(self, value: Any, dtype_meta: dict) -> list:
    """
    Forward to the module-level ``_value_to_storage_row`` function.

    Both arguments are passed through unchanged and the resulting list is
    returned as-is.
    """
    storage_row = _value_to_storage_row(value, dtype_meta)
    return storage_row
970
+
971
def _ensure_variable_table(self, name: str, data_col_types: dict, schema_level: str,
                           is_dataframe: bool = False):
    """
    Create the table for variable ``name`` unless it already exists.

    Parameters
    ----------
    name : str
        Table name; interpolated into the statement inside double quotes.
    data_col_types : dict
        Maps each data column name to the SQL type text used in the
        column definition.
    schema_level : str
        Not read by this method.
    is_dataframe : bool
        If True, ``schema_id`` is declared ``INTEGER NOT NULL`` so several
        rows may share one schema_id. Otherwise it is the primary key, which
        lets INSERT OR REPLACE give "latest wins" semantics.
    """
    if not self._table_exists(name):
        column_defs = [f'"{col}" {dtype}' for col, dtype in data_col_types.items()]
        data_cols_sql = ", ".join(column_defs)
        schema_id_col = (
            "schema_id INTEGER NOT NULL"
            if is_dataframe
            else "schema_id INTEGER PRIMARY KEY"
        )
        self._execute(f"""
            CREATE TABLE "{name}" (
                {schema_id_col},
                {data_cols_sql}
            )
        """)
992
+
993
+ # ------------------------------------------------------------------
994
+ # Load
995
+ # ------------------------------------------------------------------
996
+
997
def load(
    self,
    name: str,
    raw: bool = True,
    **schema_keys,
) -> Union[pd.DataFrame, Any]:
    """
    Load a variable from the database.

    Parameters
    ----------
    name : str
        Variable name.
    raw : bool
        If True and the result is a single row, return the reconstructed
        Python object instead of a DataFrame.
    **schema_keys
        Optional filters, e.g. subject="S01" to load a subset. Only kwargs
        that name a column of the dataset schema are applied; others are
        ignored.

    Returns
    -------
    pd.DataFrame or Python object (if raw=True and single row).
    Variables stored in "dataframe" mode always come back as a DataFrame
    holding only the data columns.

    Raises
    ------
    KeyError
        If the variable has no table or no row in ``_variables``.
    """
    if not self._table_exists(name):
        raise KeyError(f"Variable '{name}' not found in database.")

    # Fetch the stored metadata for this variable.
    meta_rows = self._fetchall(
        "SELECT schema_level, dtype FROM _variables WHERE variable_name = ?",
        [name],
    )
    if not meta_rows:
        raise KeyError(f"Variable '{name}' not found.")
    _schema_level, dtype_json = meta_rows[0]
    dtype_meta = json.loads(dtype_json)

    # Every schema column is selected so non-contiguous keys appear in results.
    schema_cols = self.dataset_schema
    schema_select = ", ".join(f's."{c}"' for c in schema_cols)
    data_select = ", ".join(f'v."{c}"' for c in dtype_meta["columns"])

    sql = (
        f'SELECT {schema_select}, {data_select} '
        f'FROM "{name}" v '
        f'JOIN _schema s ON v.schema_id = s.schema_id'
    )

    # Keep only the kwargs that are real schema columns, in call order.
    filter_cols = [col for col in schema_keys if col in schema_cols]
    params: list = [_schema_str(schema_keys[col]) for col in filter_cols]
    if filter_cols:
        sql += ' WHERE ' + ' AND '.join(f's."{col}" = ?' for col in filter_cols)

    df = self._fetchdf(sql, params or None)

    mode = dtype_meta.get("mode", "single_column")
    columns_meta = dtype_meta.get("columns", {})

    if mode == "dataframe":
        # One DuckDB row per DataFrame row: convert cell by cell and return
        # the data columns only (schema columns are dropped).
        restored = {
            c: [_storage_to_python(df[c].iloc[i], meta) for i in range(len(df))]
            for c, meta in columns_meta.items()
            if c in df.columns
        }
        df_columns = dtype_meta.get("df_columns", list(columns_meta.keys()))
        return pd.DataFrame(restored, columns=df_columns)

    # Non-DataFrame modes: restore column types first.
    df = self._restore_types(df, dtype_meta)

    if not (raw and len(df) == 1):
        return df

    # NOTE(review): the cells below were already passed through
    # _storage_to_python by _restore_types, so the conversion runs a second
    # time here; confirm the helper is idempotent.
    if mode == "single_column":
        col_name = next(iter(columns_meta))
        return _storage_to_python(df[col_name].iloc[0], columns_meta[col_name])

    if mode == "multi_column":
        result = {
            c: _storage_to_python(df[c].iloc[0], meta)
            for c, meta in columns_meta.items()
        }
        if dtype_meta.get("nested"):
            return _unflatten_dict(result, dtype_meta["path_map"])
        return result

    return df
1090
+
1091
def _restore_types(self, df: pd.DataFrame, dtype_meta: dict) -> pd.DataFrame:
    """
    Run ``_storage_to_python`` over the data columns of a loaded DataFrame.

    Each column listed under ``dtype_meta["columns"]`` that is present in
    ``df`` is replaced by its converted values. ``df`` is modified in place
    and the same object is returned.
    """
    for col_name, col_meta in dtype_meta.get("columns", {}).items():
        if col_name not in df.columns:
            continue
        stored = df[col_name]
        df[col_name] = [
            _storage_to_python(stored.iloc[i], col_meta)
            for i in range(len(df))
        ]
    return df
1102
+
1103
+ # ------------------------------------------------------------------
1104
+ # List / inspect
1105
+ # ------------------------------------------------------------------
1106
+
1107
def list_variables(self) -> pd.DataFrame:
    """
    Return the contents of ``_variables`` as a DataFrame.

    The result has the columns variable_name, schema_level, created_at and
    description, ordered by variable_name.
    """
    listing_sql = """
            SELECT variable_name, schema_level, created_at, description
            FROM _variables
            ORDER BY variable_name
        """
    listing = self._fetchdf(listing_sql)
    return listing
1116
+
1117
def list_versions(self, name: str) -> pd.DataFrame:
    """
    List variable metadata and the number of distinct schema entries saved for it.

    Parameters
    ----------
    name : str
        Variable name (also the name of its data table).

    Returns
    -------
    pd.DataFrame
        The variable's row from ``_variables`` (variable_name, schema_level,
        created_at, description) plus ``num_entries``. An empty DataFrame is
        returned when the variable has no table.
    """
    if not self._table_exists(name):
        return pd.DataFrame()
    # COUNT(DISTINCT ...) because DataFrame-mode tables store one row per
    # DataFrame row, so one schema entry can span many rows. A plain COUNT
    # would report rows rather than entries.
    sql = (
        "SELECT v.variable_name, v.schema_level, v.created_at, v.description, "
        "COUNT(DISTINCT d.schema_id) AS num_entries "
        f'FROM _variables v LEFT JOIN "{name}" d ON 1=1 '
        "WHERE v.variable_name = ? "
        "GROUP BY v.variable_name, v.schema_level, v.created_at, v.description"
    )
    return self._fetchdf(sql, [name])
1131
+
1132
+ # ------------------------------------------------------------------
1133
+ # Delete
1134
+ # ------------------------------------------------------------------
1135
+
1136
def delete(self, name: str):
    """
    Delete a variable: drop its data table and remove its metadata rows.

    The table is dropped only if it exists. The rows for ``name`` in
    ``_variables`` and ``_variable_groups`` are deleted either way.
    """
    if self._table_exists(name):
        self._execute(f'DROP TABLE "{name}"')
    for meta_table in ("_variables", "_variable_groups"):
        self._execute(
            f"DELETE FROM {meta_table} WHERE variable_name = ?", [name]
        )
1148
+
1149
+ # ------------------------------------------------------------------
1150
+ # Groups
1151
+ # ------------------------------------------------------------------
1152
+
1153
def add_to_group(self, group_name: str, variable_names: Union[str, List[str]]):
    """
    Add one or more variables to a group.

    A single string is treated as one variable name. Pairs that are already
    present are left untouched (``ON CONFLICT DO NOTHING``).
    """
    members = [variable_names] if isinstance(variable_names, str) else variable_names
    insert_sql = (
        "INSERT INTO _variable_groups (group_name, variable_name) "
        "VALUES (?, ?) ON CONFLICT DO NOTHING"
    )
    for member in members:
        self._execute(insert_sql, [group_name, member])
1163
+
1164
def remove_from_group(self, group_name: str, variable_names: Union[str, List[str]]):
    """
    Remove one or more variables from a group.

    A single string is treated as one variable name. One DELETE is issued
    per variable.
    """
    members = [variable_names] if isinstance(variable_names, str) else variable_names
    delete_sql = (
        "DELETE FROM _variable_groups "
        "WHERE group_name = ? AND variable_name = ?"
    )
    for member in members:
        self._execute(delete_sql, [group_name, member])
1174
+
1175
def list_groups(self) -> List[str]:
    """Return every distinct group name in ``_variable_groups``, sorted."""
    group_rows = self._fetchall(
        "SELECT DISTINCT group_name FROM _variable_groups ORDER BY group_name"
    )
    names = []
    for row in group_rows:
        names.append(row[0])
    return names
1181
+
1182
def get_group(self, group_name: str) -> List[str]:
    """Return the names of all variables in ``group_name``, sorted."""
    member_sql = (
        "SELECT variable_name FROM _variable_groups "
        "WHERE group_name = ? ORDER BY variable_name"
    )
    member_rows = self._fetchall(member_sql, [group_name])
    members = []
    for row in member_rows:
        members.append(row[0])
    return members
1190
+
1191
+ # ------------------------------------------------------------------
1192
+ # Schema introspection
1193
+ # ------------------------------------------------------------------
1194
+
1195
def distinct_schema_values(self, key: str) -> List:
    """
    Return all distinct non-null values of one schema column, sorted.

    Raises
    ------
    ValueError
        If ``key`` is not a column of the dataset schema.
    """
    if key not in self.dataset_schema:
        raise ValueError(
            f"'{key}' is not a schema column. "
            f"Available: {self.dataset_schema}"
        )
    quoted = f'"{key}"'
    sql = (
        f'SELECT DISTINCT {quoted} FROM _schema '
        f'WHERE {quoted} IS NOT NULL '
        f'ORDER BY {quoted}'
    )
    values = []
    for row in self._fetchall(sql):
        values.append(row[0])
    return values
1208
+
1209
def distinct_schema_combinations(self, keys: list[str]) -> list[tuple]:
    """Return all distinct non-null combinations for multiple schema columns.

    Args:
        keys: List of schema column names to query. An empty list returns
            an empty result without querying, because no valid SELECT can
            be built from zero columns.

    Returns:
        List of tuples, each tuple being one existing combination of values
        (as strings, since _schema stores VARCHAR columns). Sorted by the
        column order given.

    Raises:
        ValueError: If any key is not a column of the dataset schema.
    """
    if not keys:
        return []
    for k in keys:
        if k not in self.dataset_schema:
            raise ValueError(
                f"'{k}' is not a schema column. "
                f"Available: {self.dataset_schema}"
            )
    # The SELECT list and the ORDER BY list are the same quoted columns.
    col_list = ", ".join(f'"{k}"' for k in keys)
    where_clause = " AND ".join(f'"{k}" IS NOT NULL' for k in keys)
    rows = self._fetchall(
        f"SELECT DISTINCT {col_list} FROM _schema "
        f"WHERE {where_clause} "
        f"ORDER BY {col_list}"
    )
    return [tuple(r) for r in rows]
1235
+
1236
+ # ------------------------------------------------------------------
1237
+ # Direct query access
1238
+ # ------------------------------------------------------------------
1239
+
1240
def query(self, sql: str, params=None) -> pd.DataFrame:
    """
    Run arbitrary SQL and return the result as a DataFrame.

    ``sql`` and ``params`` are handed to ``_fetchdf`` unchanged.
    """
    frame = self._fetchdf(sql, params)
    return frame
1243
+
1244
+ # ------------------------------------------------------------------
1245
+ # Context manager / cleanup
1246
+ # ------------------------------------------------------------------
1247
+
1248
def close(self):
    """Close the DuckDB connection, then log that the lock was released."""
    connection = self.con
    connection.close()
    logger.info("DuckDB lock RELEASED: %s", self.db_path)
1252
+
1253
def reopen(self):
    """
    Reopen the DuckDB connection after close().

    Logs the acquisition first, then connects to ``self.db_path`` and
    stores the new connection on ``self.con``.
    """
    logger.info("DuckDB lock ACQUIRED (reopen): %s", self.db_path)
    db_location = str(self.db_path)
    self.con = duckdb.connect(db_location)
1257
+
1258
def __enter__(self):
    """Enter the ``with`` block; the instance itself is the bound value."""
    return self
1261
+
1262
def __exit__(self, *args):
    """
    Leave the ``with`` block by calling ``close()``.

    The exception details passed in ``args`` are not inspected, and None is
    returned, so an exception raised inside the block propagates.
    """
    shutdown = self.close
    shutdown()
1265
+
1266
def __repr__(self):
    """
    Return a summary showing the path, the schema and the variable count.

    The count is queried from ``_variables``. If that query raises for any
    reason, ``?`` is shown instead so that repr() itself never raises.
    """
    count_sql = "SELECT COUNT(DISTINCT variable_name) FROM _variables"
    try:
        count_rows = self._fetchall(count_sql)
        n_vars = count_rows[0][0]
    except Exception:
        # Deliberate best-effort fallback.
        n_vars = "?"
    summary = (
        f"SciDuck(path='{self.db_path}', "
        f"schema={self.dataset_schema}, variables={n_vars})"
    )
    return summary
@@ -0,0 +1,57 @@
1
+ Metadata-Version: 2.4
2
+ Name: sciduckdb
3
+ Version: 0.1.0
4
+ Summary: A thin DuckDB layer for managing versioned scientific data
5
+ Author: SciStack Contributors
6
+ License-Expression: MIT
7
+ Keywords: data-management,duckdb,scientific-data,versioning
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Requires-Python: >=3.9
20
+ Requires-Dist: duckdb>=0.9.0
21
+ Requires-Dist: numpy>=1.20
22
+ Requires-Dist: pandas>=1.3
23
+ Provides-Extra: dev
24
+ Requires-Dist: mypy>=1.0; extra == 'dev'
25
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
26
+ Requires-Dist: pytest>=7.0; extra == 'dev'
27
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # SciDuck
31
+
32
+ A thin DuckDB layer for managing versioned scientific data.
33
+
34
+ Each variable is stored in its own table. Variables are associated with a hierarchical dataset schema (e.g. subject -> session -> trial) and can be saved at any level of that hierarchy. Multiple versions of each variable are supported natively.
35
+
36
+ All data -- including arrays -- is stored in queryable DuckDB types (LIST, nested LIST, JSON) so the database can be inspected with DBeaver or any DuckDB-compatible viewer.
37
+
38
+ ## Usage
39
+
40
+ ```python
41
+ from sciduckdb import SciDuck
42
+
43
+ duck = SciDuck("data.duckdb", dataset_schema=["subject", "session"])
44
+ duck.save("MyVar", data, subject="S01", session=1)
45
+ loaded = duck.load("MyVar", subject="S01", session=1)
46
+ ```
47
+
48
+ ## Features
49
+
50
+ - **Three save modes**: DataFrame with schema columns (Mode A), single entry via kwargs (Mode B), or dict mapping tuples to values (Mode C)
51
+ - **Automatic type inference**: Maps Python/numpy types to DuckDB types
52
+ - **Round-trip restoration**: Metadata tracks original types for lossless load
53
+ - **Version management**: Automatic version numbering, duplicate hash detection
54
+ - **Variable groups**: Organize variables into named groups
55
+ - **Schema validation**: Validates dataset schema consistency across sessions
56
+
57
+ Note: all schema key values are coerced to strings before storage.
@@ -0,0 +1,5 @@
1
+ sciduckdb/__init__.py,sha256=o2xnzbv-_7AakjaAxf-934Xp3mULCBtHKmVD3P4FrPE,741
2
+ sciduckdb/sciduckdb.py,sha256=CX-Vs8SxkbKeaoYqJndDrfBOJud6NCGIaX-CZ_ipSkI,49538
3
+ sciduckdb-0.1.0.dist-info/METADATA,sha256=cjvpli30jLVGZIvX6ctaHkhcZ-9UsrvVFK9lYIUcWgA,2441
4
+ sciduckdb-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
5
+ sciduckdb-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any