juniper-data 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. juniper_data/__init__.py +88 -0
  2. juniper_data/__main__.py +78 -0
  3. juniper_data/api/__init__.py +10 -0
  4. juniper_data/api/app.py +111 -0
  5. juniper_data/api/middleware.py +95 -0
  6. juniper_data/api/routes/__init__.py +9 -0
  7. juniper_data/api/routes/datasets.py +414 -0
  8. juniper_data/api/routes/generators.py +125 -0
  9. juniper_data/api/routes/health.py +49 -0
  10. juniper_data/api/security.py +238 -0
  11. juniper_data/api/settings.py +109 -0
  12. juniper_data/core/__init__.py +32 -0
  13. juniper_data/core/artifacts.py +63 -0
  14. juniper_data/core/dataset_id.py +38 -0
  15. juniper_data/core/models.py +135 -0
  16. juniper_data/core/split.py +120 -0
  17. juniper_data/generators/__init__.py +15 -0
  18. juniper_data/generators/arc_agi/__init__.py +11 -0
  19. juniper_data/generators/arc_agi/generator.py +229 -0
  20. juniper_data/generators/arc_agi/params.py +56 -0
  21. juniper_data/generators/checkerboard/__init__.py +15 -0
  22. juniper_data/generators/checkerboard/generator.py +114 -0
  23. juniper_data/generators/checkerboard/params.py +32 -0
  24. juniper_data/generators/circles/__init__.py +11 -0
  25. juniper_data/generators/circles/generator.py +112 -0
  26. juniper_data/generators/circles/params.py +31 -0
  27. juniper_data/generators/csv_import/__init__.py +15 -0
  28. juniper_data/generators/csv_import/generator.py +198 -0
  29. juniper_data/generators/csv_import/params.py +48 -0
  30. juniper_data/generators/gaussian/__init__.py +11 -0
  31. juniper_data/generators/gaussian/generator.py +149 -0
  32. juniper_data/generators/gaussian/params.py +53 -0
  33. juniper_data/generators/mnist/__init__.py +11 -0
  34. juniper_data/generators/mnist/generator.py +124 -0
  35. juniper_data/generators/mnist/params.py +39 -0
  36. juniper_data/generators/spiral/__init__.py +57 -0
  37. juniper_data/generators/spiral/defaults.py +39 -0
  38. juniper_data/generators/spiral/generator.py +206 -0
  39. juniper_data/generators/spiral/params.py +148 -0
  40. juniper_data/generators/xor/__init__.py +11 -0
  41. juniper_data/generators/xor/generator.py +162 -0
  42. juniper_data/generators/xor/params.py +30 -0
  43. juniper_data/storage/__init__.py +120 -0
  44. juniper_data/storage/base.py +279 -0
  45. juniper_data/storage/cached.py +211 -0
  46. juniper_data/storage/hf_store.py +257 -0
  47. juniper_data/storage/kaggle_store.py +333 -0
  48. juniper_data/storage/local_fs.py +232 -0
  49. juniper_data/storage/memory.py +136 -0
  50. juniper_data/storage/postgres_store.py +373 -0
  51. juniper_data/storage/redis_store.py +264 -0
  52. juniper_data/tests/__init__.py +1 -0
  53. juniper_data/tests/conftest.py +68 -0
  54. juniper_data/tests/fixtures/generate_golden_datasets.py +199 -0
  55. juniper_data/tests/integration/__init__.py +1 -0
  56. juniper_data/tests/integration/test_api.py +283 -0
  57. juniper_data/tests/integration/test_e2e_workflow.py +378 -0
  58. juniper_data/tests/integration/test_lifecycle_api.py +304 -0
  59. juniper_data/tests/integration/test_security_integration.py +189 -0
  60. juniper_data/tests/integration/test_storage_workflow.py +259 -0
  61. juniper_data/tests/performance/__init__.py +1 -0
  62. juniper_data/tests/performance/test_generator_benchmarks.py +178 -0
  63. juniper_data/tests/performance/test_storage_benchmarks.py +257 -0
  64. juniper_data/tests/unit/__init__.py +1 -0
  65. juniper_data/tests/unit/test_api_app.py +206 -0
  66. juniper_data/tests/unit/test_api_routes.py +407 -0
  67. juniper_data/tests/unit/test_api_settings.py +100 -0
  68. juniper_data/tests/unit/test_arc_agi_generator.py +525 -0
  69. juniper_data/tests/unit/test_artifacts.py +145 -0
  70. juniper_data/tests/unit/test_cached_store.py +423 -0
  71. juniper_data/tests/unit/test_checkerboard_generator.py +232 -0
  72. juniper_data/tests/unit/test_circles_generator.py +256 -0
  73. juniper_data/tests/unit/test_csv_import_generator.py +345 -0
  74. juniper_data/tests/unit/test_dataset_id.py +181 -0
  75. juniper_data/tests/unit/test_gaussian_generator.py +333 -0
  76. juniper_data/tests/unit/test_hf_store.py +416 -0
  77. juniper_data/tests/unit/test_init.py +93 -0
  78. juniper_data/tests/unit/test_kaggle_store.py +469 -0
  79. juniper_data/tests/unit/test_lifecycle.py +394 -0
  80. juniper_data/tests/unit/test_main.py +127 -0
  81. juniper_data/tests/unit/test_middleware.py +79 -0
  82. juniper_data/tests/unit/test_mnist_generator.py +370 -0
  83. juniper_data/tests/unit/test_postgres_store.py +490 -0
  84. juniper_data/tests/unit/test_redis_store.py +500 -0
  85. juniper_data/tests/unit/test_security.py +281 -0
  86. juniper_data/tests/unit/test_security_boundaries.py +517 -0
  87. juniper_data/tests/unit/test_spiral_generator.py +566 -0
  88. juniper_data/tests/unit/test_split.py +245 -0
  89. juniper_data/tests/unit/test_storage.py +767 -0
  90. juniper_data/tests/unit/test_xor_generator.py +223 -0
  91. juniper_data-0.4.2.dist-info/METADATA +216 -0
  92. juniper_data-0.4.2.dist-info/RECORD +95 -0
  93. juniper_data-0.4.2.dist-info/WHEEL +5 -0
  94. juniper_data-0.4.2.dist-info/licenses/LICENSE +9 -0
  95. juniper_data-0.4.2.dist-info/top_level.txt +1 -0
@@ -0,0 +1,333 @@
1
+ """Kaggle datasets integration for downloading and caching datasets."""
2
+
3
+ from datetime import UTC, datetime
4
+ from pathlib import Path
5
+ from typing import Any
6
+
7
+ import numpy as np
8
+
9
+ from juniper_data.core.models import DatasetMeta
10
+
11
+ from .base import DatasetStore
12
+ from .memory import InMemoryDatasetStore
13
+
14
+ try:
15
+ from kaggle.api.kaggle_api_extended import KaggleApi
16
+
17
+ KAGGLE_AVAILABLE = True
18
+ except ImportError:
19
+ KAGGLE_AVAILABLE = False
20
+ KaggleApi = None # type: ignore[assignment, misc]
21
+
22
+
23
class KaggleDatasetStore(DatasetStore):
    """Kaggle API integration for downloading datasets.

    Downloads datasets from Kaggle and caches them locally.
    Primarily used as a data source, not for persistent storage: every
    ``DatasetStore`` operation (``save``, ``get_meta``, ...) is delegated
    unchanged to the configured cache store.

    Requires the `kaggle` package: pip install kaggle
    Also requires Kaggle API credentials in ~/.kaggle/kaggle.json
    or via KAGGLE_USERNAME and KAGGLE_KEY environment variables.
    """

    def __init__(
        self,
        download_path: Path | None = None,
        cache_store: DatasetStore | None = None,
        auto_authenticate: bool = True,
    ) -> None:
        """Initialize the Kaggle store.

        Args:
            download_path: Path for downloading and extracting datasets.
                Defaults to ``./data/kaggle``; the directory is created if missing.
            cache_store: Optional store for caching loaded datasets.
                Defaults to a fresh ``InMemoryDatasetStore``.
            auto_authenticate: Automatically authenticate with Kaggle API.

        Raises:
            ImportError: If kaggle package is not installed.
        """
        if not KAGGLE_AVAILABLE:
            raise ImportError("Kaggle package not installed. Install with: pip install kaggle")

        # Coerce to Path so a plain string path is accepted as well.
        self._download_path = Path(download_path) if download_path else Path("./data/kaggle")
        self._download_path.mkdir(parents=True, exist_ok=True)
        # Compare against None explicitly: a store implementation that happens to be
        # falsy (e.g. defines __len__ and is empty) must not be replaced silently.
        self._cache_store = cache_store if cache_store is not None else InMemoryDatasetStore()

        self._api: Any | None = None
        if auto_authenticate:
            self._authenticate()

    def _authenticate(self) -> None:
        """Authenticate with Kaggle API.

        The client is stored on the instance only after ``authenticate()``
        returned, so a failed login never leaves a half-initialised client
        that would pass the ``self._api is None`` guards.
        """
        api = KaggleApi()
        api.authenticate()
        self._api = api

    def download_dataset(
        self,
        dataset_ref: str,
        unzip: bool = True,
        force: bool = False,
    ) -> Path:
        """Download a dataset from Kaggle.

        Args:
            dataset_ref: Dataset reference in format "owner/dataset-name".
            unzip: Whether to unzip downloaded files.
            force: Force re-download even if already exists.

        Returns:
            Path to the downloaded/extracted dataset directory.

        Raises:
            RuntimeError: If authentication failed or API not available.
        """
        if self._api is None:
            raise RuntimeError("Kaggle API not authenticated. Call _authenticate() first.")

        dataset_path = self._download_path / dataset_ref.replace("/", "_")

        if not force and dataset_path.exists():
            # An empty directory is what an earlier, failed download leaves behind;
            # it must not be mistaken for a cached dataset.
            if not dataset_path.is_dir() or any(dataset_path.iterdir()):
                return dataset_path

        dataset_path.mkdir(parents=True, exist_ok=True)

        self._api.dataset_download_files(
            dataset_ref,
            path=str(dataset_path),
            unzip=unzip,
            force=force,
        )

        return dataset_path

    @staticmethod
    def _resolve_data_file(dataset_path: Path, file_name: str) -> Path:
        """Locate the CSV file to load inside a downloaded dataset directory.

        Returns ``dataset_path / file_name`` when it exists. Otherwise falls back
        to a CSV found anywhere below ``dataset_path``: one with the requested
        base name is preferred, else the first CSV in sorted order (sorted so the
        choice does not depend on filesystem enumeration order).

        Raises:
            FileNotFoundError: If the file is missing and no CSV fallback exists.
        """
        file_path = dataset_path / file_name
        if file_path.exists():
            return file_path

        all_files = sorted(p for p in dataset_path.glob("**/*") if p.is_file())
        csv_files = [p for p in all_files if p.suffix.lower() == ".csv"]
        if not csv_files:
            raise FileNotFoundError(
                f"File '{file_name}' not found in dataset. Available files: {[f.name for f in all_files]}"
            )

        wanted_name = Path(file_name).name
        same_name = [p for p in csv_files if p.name == wanted_name]
        return (same_name or csv_files)[0]

    @staticmethod
    def _read_csv_rows(file_path: Path, delimiter: str) -> tuple[list[Any], list[dict[Any, Any]]]:
        """Read a CSV file and return ``(header columns, rows as dicts)``.

        Columns come from the CSV header (duplicates removed, order kept) rather
        than from the keys of an arbitrary row.
        """
        import csv

        # newline="" is what the csv module requires so that quoted fields
        # containing line breaks are parsed correctly.
        with open(file_path, encoding="utf-8", newline="") as f:
            reader = csv.DictReader(f, delimiter=delimiter)
            rows = list(reader)
            columns = list(dict.fromkeys(reader.fieldnames or []))
        return columns, rows

    @staticmethod
    def _to_float(value: Any) -> float:
        """Convert a CSV cell to float; missing or unparsable cells become 0.0."""
        try:
            return float(value)
        except (ValueError, TypeError):
            return 0.0

    def load_kaggle_dataset(
        self,
        dataset_ref: str,
        file_name: str,
        feature_columns: list[str] | None = None,
        label_column: str = "label",
        delimiter: str = ",",
        n_samples: int | None = None,
        seed: int | None = None,
        one_hot_labels: bool = True,
        normalize_features: bool = False,
        train_ratio: float = 0.8,
    ) -> tuple[str, DatasetMeta, dict[str, np.ndarray]]:
        """Download and load a CSV dataset from Kaggle.

        The rows are optionally shuffled (only when ``seed`` is given), truncated
        to ``n_samples``, converted to a float32 feature matrix and an encoded
        label array, split into train/test by position, and saved to the cache
        store.

        Args:
            dataset_ref: Dataset reference in format "owner/dataset-name".
            file_name: Name of the CSV file within the dataset.
            feature_columns: Column names for features (None = every header
                column except the label column).
            label_column: Column name for labels.
            delimiter: CSV delimiter.
            n_samples: Optional limit on number of samples.
            seed: Random seed for shuffling. No shuffling happens when None.
            one_hot_labels: One-hot encode labels; otherwise labels are class
                indices of shape (n, 1).
            normalize_features: Normalize features to [0, 1] per column.
            train_ratio: Ratio for train/test split, between 0 and 1.

        Returns:
            Tuple of (dataset_id, metadata, arrays).

        Raises:
            ValueError: If ``train_ratio`` is outside [0, 1], the CSV has no data
                rows, the label column is absent from the header, or
                ``n_samples`` leaves no rows.
            FileNotFoundError: If no suitable CSV file exists in the dataset.
            RuntimeError: If the Kaggle API is not authenticated.
        """
        # Fail before any download happens.
        if not 0.0 <= train_ratio <= 1.0:
            raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")

        dataset_path = self.download_dataset(dataset_ref)
        file_path = self._resolve_data_file(dataset_path, file_name)
        all_columns, data = self._read_csv_rows(file_path, delimiter)

        if not data:
            raise ValueError("No data found in CSV file")

        # Without this check a wrong label column yields a single bogus "None" class.
        if label_column not in all_columns:
            raise ValueError(f"Label column '{label_column}' not found in CSV columns: {all_columns}")

        if seed is not None:
            import random

            # A private generator gives the same permutation as seeding the global
            # one, without disturbing the process-wide random state.
            random.Random(seed).shuffle(data)

        if n_samples is not None:
            data = data[:n_samples]
            if not data:
                raise ValueError(f"n_samples={n_samples} selects no rows from the CSV file")

        if feature_columns is None:
            feature_columns = [c for c in all_columns if c != label_column]

        # NOTE(review): missing or non-numeric feature cells are coerced to 0.0
        # (best-effort behaviour kept as-is) - confirm this is what callers want.
        features = [[self._to_float(row.get(col)) for col in feature_columns] for row in data]
        labels = [str(row.get(label_column)) for row in data]

        # reshape keeps X two-dimensional even when there are no feature columns.
        X = np.array(features, dtype=np.float32).reshape(len(data), len(feature_columns))

        if normalize_features:
            X_min = X.min(axis=0, keepdims=True)
            X_max = X.max(axis=0, keepdims=True)
            X_range = X_max - X_min
            X_range[X_range == 0] = 1  # constant columns: avoid division by zero
            X = (X - X_min) / X_range

        # Class indices follow the sorted order of the label strings.
        unique_labels = sorted(set(labels))
        label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
        n_classes = len(unique_labels)

        label_indices = np.array([label_to_idx[lbl] for lbl in labels], dtype=np.intp)

        if one_hot_labels:
            y = np.zeros((len(labels), n_classes), dtype=np.float32)
            y[np.arange(len(labels)), label_indices] = 1.0
        else:
            y = label_indices.astype(np.float32).reshape(-1, 1)

        class_distribution = {str(i): int((label_indices == i).sum()) for i in range(n_classes)}

        n_train = int(len(X) * train_ratio)
        X_train, X_test = X[:n_train], X[n_train:]
        y_train, y_test = y[:n_train], y[n_train:]

        # NOTE(review): the ID only encodes the dataset ref and the row count, so
        # loads that differ in file, seed or encoding overwrite each other in the
        # cache. Format kept for backward compatibility.
        dataset_id = f"kaggle-{dataset_ref.replace('/', '-')}-{len(X)}"

        meta = DatasetMeta(
            dataset_id=dataset_id,
            generator="kaggle",
            generator_version="1.0.0",
            params={
                "dataset_ref": dataset_ref,
                "file_name": file_name,
                "n_samples": len(X),
                "seed": seed,
                "normalize_features": normalize_features,
                "one_hot_labels": one_hot_labels,
            },
            n_samples=len(X),
            n_features=X.shape[1],
            n_classes=n_classes,
            n_train=n_train,
            n_test=len(X) - n_train,
            class_distribution=class_distribution,
            created_at=datetime.now(UTC),
            tags=["kaggle", dataset_ref.split("/")[0]],
        )

        arrays = {
            "X_train": X_train,
            "y_train": y_train,
            "X_test": X_test,
            "y_test": y_test,
            "X_full": X,
            "y_full": y,
        }

        self._cache_store.save(dataset_id, meta, arrays)

        return dataset_id, meta, arrays

    def list_competitions(self, search: str | None = None) -> list[dict]:
        """List available Kaggle competitions.

        Args:
            search: Optional search term.

        Returns:
            List of competition info dictionaries with the keys
            "ref", "title", "deadline" and "category".

        Raises:
            RuntimeError: If the Kaggle API is not authenticated.
        """
        if self._api is None:
            raise RuntimeError("Kaggle API not authenticated.")

        competitions = self._api.competitions_list(search=search)
        return [
            {
                "ref": c.ref,
                "title": c.title,
                "deadline": c.deadline,
                "category": c.category,
            }
            for c in competitions
        ]

    def list_kaggle_datasets(self, search: str | None = None, page: int = 1) -> list[dict]:
        """List available Kaggle datasets.

        Args:
            search: Optional search term.
            page: Page number for pagination.

        Returns:
            List of dataset info dictionaries with the keys
            "ref", "title", "size" and "lastUpdated".

        Raises:
            RuntimeError: If the Kaggle API is not authenticated.
        """
        if self._api is None:
            raise RuntimeError("Kaggle API not authenticated.")

        datasets = self._api.dataset_list(search=search, page=page)
        return [
            {
                "ref": d.ref,
                "title": d.title,
                "size": d.totalBytes,
                "lastUpdated": d.lastUpdated,
            }
            for d in datasets
        ]

    def save(
        self,
        dataset_id: str,
        meta: DatasetMeta,
        arrays: dict[str, np.ndarray],
    ) -> None:
        """Save to cache store."""
        self._cache_store.save(dataset_id, meta, arrays)

    def get_meta(self, dataset_id: str) -> DatasetMeta | None:
        """Get from cache store."""
        return self._cache_store.get_meta(dataset_id)

    def get_artifact_bytes(self, dataset_id: str) -> bytes | None:
        """Get from cache store."""
        return self._cache_store.get_artifact_bytes(dataset_id)

    def exists(self, dataset_id: str) -> bool:
        """Check cache store."""
        return self._cache_store.exists(dataset_id)

    def delete(self, dataset_id: str) -> bool:
        """Delete from cache store."""
        return self._cache_store.delete(dataset_id)

    def list_datasets(self, limit: int = 100, offset: int = 0) -> list[str]:
        """List from cache store."""
        return self._cache_store.list_datasets(limit, offset)

    def update_meta(self, dataset_id: str, meta: DatasetMeta) -> bool:
        """Update in cache store."""
        return self._cache_store.update_meta(dataset_id, meta)

    def list_all_metadata(self) -> list[DatasetMeta]:
        """List from cache store."""
        return self._cache_store.list_all_metadata()
@@ -0,0 +1,232 @@
1
+ """Local filesystem dataset store."""
2
+
3
+ import io
4
+ import json
5
+ import logging
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+
9
+ # from typing import Any, Dict, List, Optional
10
+ from typing import Any
11
+
12
+ import numpy as np
13
+
14
+ from juniper_data.core.models import DatasetMeta
15
+ from juniper_data.storage.base import DatasetStore
16
+
17
+
18
+ def _json_serializer(obj: Any) -> str:
19
+ """JSON serializer for objects not serializable by default."""
20
+ if isinstance(obj, datetime):
21
+ return obj.isoformat()
22
+ raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
23
+
24
+
25
class LocalFSDatasetStore(DatasetStore):
    """Local filesystem implementation of DatasetStore.

    Stores datasets as JSON metadata files and NPZ array files.

    Storage layout:
        {base_path}/{dataset_id}.meta.json
        {base_path}/{dataset_id}.npz

    NOTE(review): dataset IDs are interpolated into file names without any
    sanitising, so an ID containing path separators or ".." resolves outside
    ``base_path``. Confirm that callers validate IDs before they reach this store.
    """

    # File-name suffixes of the two artifacts that make up one dataset.
    _META_SUFFIX = ".meta.json"
    _NPZ_SUFFIX = ".npz"

    def __init__(self, base_path: Path) -> None:
        """Initialize the local filesystem store.

        Args:
            base_path: Base directory for storing datasets. Created if it doesn't exist.
        """
        self._base_path = Path(base_path)
        self._base_path.mkdir(parents=True, exist_ok=True)

    def _meta_path(self, dataset_id: str) -> Path:
        """Get path to metadata file."""
        return self._base_path / f"{dataset_id}{self._META_SUFFIX}"

    def _npz_path(self, dataset_id: str) -> Path:
        """Get path to NPZ file."""
        return self._base_path / f"{dataset_id}{self._NPZ_SUFFIX}"

    def _dataset_id_of(self, meta_file: Path) -> str:
        """Recover the dataset ID from a ``*.meta.json`` path.

        Only the trailing suffix is removed, so IDs that themselves contain
        ".meta" are returned intact.
        """
        return meta_file.name[: -len(self._META_SUFFIX)]

    @staticmethod
    def _serialize_meta(meta: DatasetMeta) -> str:
        """Render metadata as indented JSON (datetimes as ISO strings)."""
        return json.dumps(
            meta.model_dump(),
            default=_json_serializer,
            indent=2,
        )

    @staticmethod
    def _discard(path: Path, label: str) -> None:
        """Best-effort removal of a temporary file; failures are only logged."""
        try:
            path.unlink(missing_ok=True)
        except OSError:
            logging.getLogger(__name__).debug(
                "Failed to remove temporary %s file %s during cleanup",
                label,
                path,
                exc_info=True,
            )

    def save(
        self,
        dataset_id: str,
        meta: DatasetMeta,
        arrays: dict[str, np.ndarray],
    ) -> None:
        """Save dataset metadata and arrays to filesystem.

        Both files are written to ``*.tmp`` siblings first and then moved into
        place, so readers never observe a partially written file.

        Args:
            dataset_id: Unique identifier for the dataset.
            meta: Dataset metadata.
            arrays: Dictionary of numpy arrays.

        Raises:
            IOError: If the save operation fails.
        """
        meta_path = self._meta_path(dataset_id)
        npz_path = self._npz_path(dataset_id)

        # Write to temporary files first, then atomically replace the final files
        tmp_meta_path = meta_path.with_name(meta_path.name + ".tmp")
        tmp_npz_path = npz_path.with_name(npz_path.name + ".tmp")

        meta_json = self._serialize_meta(meta)

        try:
            tmp_meta_path.write_text(meta_json, encoding="utf-8")

            # Stream straight into the temporary file. A file object (not a path)
            # is passed because numpy appends ".npz" to path arguments lacking it.
            # Keys are sorted so the archive layout does not depend on dict order,
            # matching what the in-memory store does.
            sorted_arrays = {key: arrays[key] for key in sorted(arrays)}
            with tmp_npz_path.open("wb") as npz_file:
                np.savez_compressed(npz_file, **sorted_arrays)  # type: ignore[arg-type] # numpy stubs incomplete for **kwargs

            # Atomically replace final files with the temporary ones.
            # Write NPZ first so we never have metadata without its NPZ.
            tmp_npz_path.replace(npz_path)
            tmp_meta_path.replace(meta_path)
        except Exception:
            # Best-effort cleanup of temporary files on failure
            self._discard(tmp_meta_path, "metadata")
            self._discard(tmp_npz_path, "NPZ")
            raise

    def get_meta(self, dataset_id: str) -> DatasetMeta | None:
        """Get dataset metadata from filesystem.

        Args:
            dataset_id: Unique identifier for the dataset.

        Returns:
            Dataset metadata if found, None otherwise.
        """
        meta_path = self._meta_path(dataset_id)
        if not meta_path.exists():
            return None

        try:
            meta_json = meta_path.read_text(encoding="utf-8")
        except FileNotFoundError:
            # Deleted between the existence check and the read.
            return None

        return DatasetMeta(**json.loads(meta_json))

    def get_artifact_bytes(self, dataset_id: str) -> bytes | None:
        """Get dataset artifact as NPZ bytes.

        Args:
            dataset_id: Unique identifier for the dataset.

        Returns:
            NPZ file contents as bytes if found, None otherwise.
        """
        npz_path = self._npz_path(dataset_id)
        if not npz_path.exists():
            return None

        try:
            return npz_path.read_bytes()
        except FileNotFoundError:
            # Deleted between the existence check and the read.
            return None

    def exists(self, dataset_id: str) -> bool:
        """Check if dataset exists on filesystem.

        Args:
            dataset_id: Unique identifier for the dataset.

        Returns:
            True if both metadata and NPZ files exist, False otherwise.
        """
        return self._meta_path(dataset_id).exists() and self._npz_path(dataset_id).exists()

    def delete(self, dataset_id: str) -> bool:
        """Delete dataset from filesystem.

        Args:
            dataset_id: Unique identifier for the dataset.

        Returns:
            True if the dataset was deleted, False if it didn't exist.
        """
        meta_path = self._meta_path(dataset_id)
        npz_path = self._npz_path(dataset_id)

        meta_present = meta_path.exists()
        npz_present = npz_path.exists()
        if not (meta_present or npz_present):
            return False

        # missing_ok tolerates a concurrent delete between the check and the unlink.
        if meta_present:
            meta_path.unlink(missing_ok=True)
        if npz_present:
            npz_path.unlink(missing_ok=True)

        return True

    def list_datasets(self, limit: int = 100, offset: int = 0) -> list[str]:
        """List dataset IDs from filesystem.

        Finds datasets by globbing for .meta.json files.

        Args:
            limit: Maximum number of dataset IDs to return.
            offset: Number of dataset IDs to skip.

        Returns:
            List of dataset IDs, ordered by metadata file path.
        """
        meta_files = sorted(self._base_path.glob(f"*{self._META_SUFFIX}"))
        dataset_ids = [self._dataset_id_of(f) for f in meta_files]
        return dataset_ids[offset : offset + limit]

    @property
    def base_path(self) -> Path:
        """Get the base storage path."""
        return self._base_path

    def update_meta(self, dataset_id: str, meta: DatasetMeta) -> bool:
        """Update dataset metadata on filesystem.

        The new metadata is written to a temporary file and moved into place,
        the same way ``save`` does, so a failed write cannot leave a truncated
        metadata file behind.

        Args:
            dataset_id: Unique identifier for the dataset.
            meta: Updated dataset metadata.

        Returns:
            True if the dataset was updated, False if it didn't exist.
        """
        meta_path = self._meta_path(dataset_id)
        if not meta_path.exists():
            return False

        meta_json = self._serialize_meta(meta)
        tmp_meta_path = meta_path.with_name(meta_path.name + ".tmp")
        try:
            tmp_meta_path.write_text(meta_json, encoding="utf-8")
            tmp_meta_path.replace(meta_path)
        except Exception:
            self._discard(tmp_meta_path, "metadata")
            raise
        return True

    def list_all_metadata(self) -> list[DatasetMeta]:
        """List all dataset metadata from filesystem.

        Returns:
            List of all DatasetMeta objects, ordered by metadata file path.
        """
        result = []
        # Sorted so the order is deterministic and agrees with list_datasets.
        for meta_file in sorted(self._base_path.glob(f"*{self._META_SUFFIX}")):
            meta = self.get_meta(self._dataset_id_of(meta_file))
            if meta is not None:
                result.append(meta)
        return result
@@ -0,0 +1,136 @@
1
+ """In-memory dataset store for testing and development."""
2
+
3
+ import io
4
+
5
+ import numpy as np
6
+
7
+ from juniper_data.core.models import DatasetMeta
8
+ from juniper_data.storage.base import DatasetStore
9
+
10
+
11
class InMemoryDatasetStore(DatasetStore):
    """Dictionary-backed DatasetStore.

    Metadata and arrays live in two plain dicts keyed by dataset ID, which
    makes this store convenient for tests and local development. Nothing is
    persisted: the contents disappear with the process.
    """

    def __init__(self) -> None:
        """Create an empty store."""
        self._metadata: dict[str, DatasetMeta] = {}
        self._arrays: dict[str, dict[str, np.ndarray]] = {}

    def save(
        self,
        dataset_id: str,
        meta: DatasetMeta,
        arrays: dict[str, np.ndarray],
    ) -> None:
        """Store metadata and a private copy of each array under ``dataset_id``.

        Args:
            dataset_id: Unique identifier for the dataset.
            meta: Dataset metadata (kept by reference).
            arrays: Dictionary of numpy arrays; each array is copied.
        """
        self._metadata[dataset_id] = meta
        copies: dict[str, np.ndarray] = {}
        for name, array in arrays.items():
            copies[name] = array.copy()
        self._arrays[dataset_id] = copies

    def get_meta(self, dataset_id: str) -> DatasetMeta | None:
        """Look up the metadata stored for ``dataset_id``.

        Args:
            dataset_id: Unique identifier for the dataset.

        Returns:
            The stored metadata, or None when the ID is unknown.
        """
        return self._metadata.get(dataset_id)

    def get_artifact_bytes(self, dataset_id: str) -> bytes | None:
        """Serialize the stored arrays into a compressed NPZ archive.

        Args:
            dataset_id: Unique identifier for the dataset.

        Returns:
            The NPZ archive as bytes, or None when the ID is unknown.
        """
        stored = self._arrays.get(dataset_id)
        if stored is None:
            return None

        # Emit arrays in key order so the archive does not depend on the
        # insertion order of the dict that was saved.
        ordered: dict[str, np.ndarray] = {}
        for name in sorted(stored):
            ordered[name] = stored[name]

        sink = io.BytesIO()
        np.savez_compressed(sink, **ordered)
        return sink.getvalue()

    def exists(self, dataset_id: str) -> bool:
        """Report whether metadata is stored for ``dataset_id``.

        Args:
            dataset_id: Unique identifier for the dataset.

        Returns:
            True if the dataset exists, False otherwise.
        """
        known = dataset_id in self._metadata
        return known

    def delete(self, dataset_id: str) -> bool:
        """Remove a dataset's metadata and arrays.

        Args:
            dataset_id: Unique identifier for the dataset.

        Returns:
            True if the dataset was deleted, False if it didn't exist.
        """
        if dataset_id in self._metadata:
            del self._metadata[dataset_id]
            del self._arrays[dataset_id]
            return True
        return False

    def list_datasets(self, limit: int = 100, offset: int = 0) -> list[str]:
        """Return one page of dataset IDs in sorted order.

        Args:
            limit: Maximum number of dataset IDs to return.
            offset: Number of dataset IDs to skip.

        Returns:
            List of dataset IDs.
        """
        ordered_ids = list(self._metadata)
        ordered_ids.sort()
        stop = offset + limit
        return ordered_ids[offset:stop]

    def clear(self) -> None:
        """Drop every stored dataset. Useful for test cleanup."""
        for mapping in (self._metadata, self._arrays):
            mapping.clear()

    def update_meta(self, dataset_id: str, meta: DatasetMeta) -> bool:
        """Replace the metadata of an existing dataset.

        Args:
            dataset_id: Unique identifier for the dataset.
            meta: Updated dataset metadata.

        Returns:
            True if the dataset was updated, False if it didn't exist.
        """
        if dataset_id in self._metadata:
            self._metadata[dataset_id] = meta
            return True
        return False

    def list_all_metadata(self) -> list[DatasetMeta]:
        """Return the metadata of every stored dataset.

        Returns:
            List of all DatasetMeta objects.
        """
        return [meta for meta in self._metadata.values()]