sdf-xarray 0.2.0__cp312-cp312-win_amd64.whl → 0.5.0__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sdf_xarray/__init__.py CHANGED
@@ -1,22 +1,43 @@
1
+ import contextlib
1
2
  import os
2
- import pathlib
3
3
  import re
4
4
  from collections import Counter, defaultdict
5
+ from collections.abc import Callable, Iterable
6
+ from importlib.metadata import version
5
7
  from itertools import product
6
- from typing import Iterable
8
+ from os import PathLike as os_PathLike
9
+ from pathlib import Path
10
+ from typing import ClassVar
7
11
 
8
12
  import numpy as np
9
13
  import xarray as xr
14
+ from packaging.version import Version
10
15
  from xarray.backends import AbstractDataStore, BackendArray, BackendEntrypoint
11
16
  from xarray.backends.file_manager import CachingFileManager
12
17
  from xarray.backends.locks import ensure_lock
13
18
  from xarray.core import indexing
19
+ from xarray.core.types import T_Chunks
14
20
  from xarray.core.utils import close_on_error, try_read_magic_number_from_path
15
21
  from xarray.core.variable import Variable
16
22
 
23
+ # NOTE: Do not delete these lines, otherwise the "epoch" dataset and dataarray
24
+ # accessors will not be imported when the user imports sdf_xarray
25
+ import sdf_xarray.dataset_accessor
26
+ import sdf_xarray.download
17
27
  import sdf_xarray.plotting # noqa: F401
18
28
 
19
- from .sdf_interface import Constant, SDFFile
29
+ # NOTE: This attempts to enable the "pint" accessor if the user
30
+ # has installed the package
31
+ with contextlib.suppress(ImportError):
32
+ import pint_xarray # noqa: F401
33
+
34
+ from .sdf_interface import Constant, SDFFile # type: ignore # noqa: PGH003
35
+
36
+ # TODO Remove this once the new kwarg options are fully implemented
37
+ if Version(version("xarray")) >= Version("2025.8.0"):
38
+ xr.set_options(use_new_combine_kwarg_defaults=True)
39
+
40
+ PathLike = str | os_PathLike
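# Hedged usage sketch (not part of the diff; the file name is an assumption):
# importing sdf_xarray registers the "epoch" accessors named in the NOTE above
# and the SDF backend as side effects, so plain xarray calls work afterwards.
import sdf_xarray  # noqa: F401
import xarray as xr

ds = xr.open_dataset("0000.sdf")
print(ds.epoch)  # dataset accessor provided by sdf_xarray.dataset_accessor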
20
41
 
21
42
 
22
43
  def _rename_with_underscore(name: str) -> str:
@@ -48,24 +69,140 @@ def _process_latex_name(variable_name: str) -> str:
48
69
  return variable_name
49
70
 
50
71
 
51
- def combine_datasets(path_glob: Iterable | str, **kwargs) -> xr.Dataset:
52
- """Combine all datasets using a single time dimension"""
72
+ def _resolve_glob(path_glob: PathLike | Iterable[PathLike]):
73
+ """
74
+ Normalise input path_glob into a sorted list of absolute, resolved Path objects.
75
+ """
76
+
77
+ try:
78
+ p = Path(path_glob)
79
+ paths = list(p.parent.glob(p.name)) if p.name == "*.sdf" else list(p)
80
+ except TypeError:
81
+ paths = list({Path(p) for p in path_glob})
82
+
83
+ paths = sorted(p.resolve() for p in paths)
84
+ if not paths:
85
+ raise FileNotFoundError(f"No files matched pattern or input: {path_glob!r}")
86
+ return paths
87
+
88
+
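# Hedged sketch of _resolve_glob (defined above); directory and file names are
# assumptions. A "*.sdf" glob and an explicit collection of paths both normalise
# to a sorted list of absolute, resolved Path objects.
from pathlib import Path

from_glob = _resolve_glob("run1/*.sdf")                         # glob inside run1/
from_list = _resolve_glob([Path("run1/0000.sdf"), "run1/0001.sdf"])
# Either form raises FileNotFoundError when nothing matches.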
89
+ def _build_datatree_from_dataset(
90
+ ds: xr.Dataset,
91
+ ) -> xr.DataTree:
92
+ """
93
+ An `xarray.DataTree` is constructed utilising the original names in the SDF
94
+ file, because these names include slashes which `xarray`
95
+ can use to automatically build up a datatree. We additionally replace
96
+ spaces with underscores to be more pythonic. The original
97
+ `xarray.Dataset` name is kept in ``attrs["flat_structure_name"]`` for reference.
98
+
99
+ In some cases the user may output the ``always + species`` dumpmask, which
100
+ means that the SDF variable will have species data plus a general one. A node
101
+ of a `xarray.DataTree` cannot both hold variable data and have leaves that
102
+ hold variables, so we move the node's data to a leaf named ``node/All``
103
+ (see the example of ``Derived/Number_Density/All`` in the table
104
+ below).
105
+
106
+ Below are some examples of how variable names are translated from the
107
+ regular `xarray.open_dataset` result into their more traditional names.
108
+
109
+ =================================== ===================================
110
+ Dataset variable name DataTree variable name
111
+ =================================== ===================================
112
+ ``Derived_Number_Density`` ``Derived/Number_Density/All``
113
+ ``Derived_Number_Density_Electron`` ``Derived/Number_Density/Electron``
114
+ ``Derived_Number_Density_Ion`` ``Derived/Number_Density/Ion``
115
+ ``Derived_Number_Density_Photon`` ``Derived/Number_Density/Photon``
116
+ ``Derived_Average_Particle_Energy`` ``Derived/Average_Particle_Energy``
117
+ =================================== ===================================
118
+
119
+ Parameters
120
+ ----------
121
+ ds
122
+ Incoming `xarray.Dataset` to convert to a `xarray.DataTree`
123
+ """
124
+ renames = {}
125
+ for name, var in ds.data_vars.items():
126
+ # Append the current variable name to the attributes
127
+ var.attrs["flat_structure_name"] = name
128
+ renames.update({name: var.attrs["full_name"].replace(" ", "_")})
129
+
130
+ new_names = renames.values()
131
+
132
+ final_renames = {
133
+ key: (
134
+ f"{path}/All"
135
+ if any(other.startswith(f"{path}/") for other in new_names)
136
+ else path
137
+ )
138
+ for key, path in renames.items()
139
+ }
140
+
141
+ ds = ds.rename_vars(final_renames)
142
+ dt = xr.DataTree.from_dict(ds)
143
+ dt.attrs = ds.attrs
144
+ return dt
145
+
146
+
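# Worked sketch (toy names) of the final_renames rule above: a path that is also
# a prefix of other paths gains an "/All" leaf so the tree node itself holds no data.
renames = {
    "Derived_Number_Density": "Derived/Number_Density",
    "Derived_Number_Density_Electron": "Derived/Number_Density/Electron",
    "Derived_Average_Particle_Energy": "Derived/Average_Particle_Energy",
}
new_names = renames.values()
final_renames = {
    key: f"{path}/All" if any(o.startswith(f"{path}/") for o in new_names) else path
    for key, path in renames.items()
}
# {'Derived_Number_Density': 'Derived/Number_Density/All',
#  'Derived_Number_Density_Electron': 'Derived/Number_Density/Electron',
#  'Derived_Average_Particle_Energy': 'Derived/Average_Particle_Energy'}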
147
+ def purge_unselected_data_vars(ds: xr.Dataset, data_vars: list[str]) -> xr.Dataset:
148
+ """
149
+ If the user has exclusively requested only certain variables be
150
+ loaded in then we purge all other variables and dimensions
151
+ """
152
+ existing_data_vars = set(ds.data_vars.keys())
153
+ vars_to_keep = set(data_vars) & existing_data_vars
154
+ vars_to_drop = existing_data_vars - vars_to_keep
155
+ ds = ds.drop_vars(vars_to_drop)
156
+
157
+ existing_dims = set(ds.sizes)
158
+ dims_to_keep = set()
159
+ for var in vars_to_keep:
160
+ dims_to_keep.update(ds[var].coords._names)
161
+ dims_to_keep.update(ds[var].dims)
162
+
163
+ coords_to_drop = existing_dims - dims_to_keep
164
+ return ds.drop_dims(coords_to_drop)
165
+
166
+
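# Hedged usage sketch (toy data) for purge_unselected_data_vars above: unrequested
# variables are dropped, and so are any dimensions only they used.
import numpy as np
import xarray as xr

ds = xr.Dataset(
    {
        "Electric_Field_Ex": ("x", np.zeros(8)),
        "Derived_Number_Density": ("y", np.zeros(4)),
    }
)
kept = purge_unselected_data_vars(ds, ["Electric_Field_Ex"])
assert set(kept.data_vars) == {"Electric_Field_Ex"}
assert "y" not in kept.dims  # the dimension used only by the dropped variable is gone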
167
+ def combine_datasets(
168
+ path_glob: Iterable | str, data_vars: list[str], **kwargs
169
+ ) -> xr.Dataset:
170
+ """
171
+ Combine all datasets using a single time dimension, optionally extracting
172
+ data for only the variables listed in ``data_vars``
173
+ """
174
+
175
+ if data_vars is not None:
176
+ return xr.open_mfdataset(
177
+ path_glob,
178
+ join="outer",
179
+ coords="different",
180
+ compat="no_conflicts",
181
+ combine="nested",
182
+ concat_dim="time",
183
+ preprocess=SDFPreprocess(data_vars=data_vars),
184
+ **kwargs,
185
+ )
53
186
 
54
187
  return xr.open_mfdataset(
55
188
  path_glob,
56
- data_vars="minimal",
57
- coords="minimal",
58
- compat="override",
189
+ data_vars="all",
190
+ coords="different",
191
+ compat="no_conflicts",
192
+ join="outer",
59
193
  preprocess=SDFPreprocess(),
60
194
  **kwargs,
61
195
  )
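# Hedged usage sketch for combine_datasets above (glob and variable names are
# assumptions): everything shares one "time" dimension, and passing data_vars
# switches to the preprocess-based path that filters each file before combining.
ds_all = combine_datasets("run1/*.sdf", data_vars=None)
ds_sub = combine_datasets("run1/*.sdf", data_vars=["Electric_Field_Ex"])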
62
196
 
63
197
 
64
198
  def open_mfdataset(
65
- path_glob: Iterable | str | pathlib.Path | pathlib.Path.glob,
199
+ path_glob: Iterable | str | Path | Callable[..., Iterable[Path]],
66
200
  *,
67
201
  separate_times: bool = False,
68
202
  keep_particles: bool = False,
203
+ probe_names: list[str] | None = None,
204
+ data_vars: list[str] | None = None,
205
+ chunks: T_Chunks = "auto",
69
206
  ) -> xr.Dataset:
70
207
  """Open a set of EPOCH SDF files as one `xarray.Dataset`
71
208
 
@@ -95,20 +232,47 @@ def open_mfdataset(
95
232
  different output frequencies
96
233
  keep_particles :
97
234
  If ``True``, also load particle data (this may use a lot of memory!)
235
+ probe_names :
236
+ List of EPOCH probe names
237
+ data_vars :
238
+ List of data variables to load (if not specified, all variables are loaded)
239
+ chunks :
240
+ Dictionary with keys given by dimension names and values given by chunk sizes.
241
+ In general, these should divide the dimensions of each dataset. By default
242
+ chunks are automatically set so that they are the same size as the dimensions
243
+ stored in each of the SDF files. See `Xarray chunking-and-performance
244
+ <https://docs.xarray.dev/en/stable/user-guide/dask.html#chunking-and-performance>`_
245
+ for details on why this is useful for large datasets. This automatic chunking
246
+ is the default behaviour and can be disabled with ``chunks=None``.
98
247
  """
99
248
 
100
- # TODO: This is not very robust, look at how xarray.open_mfdataset does it
101
- if isinstance(path_glob, str):
102
- path_glob = pathlib.Path().glob(path_glob)
103
-
104
- # Coerce to list because we might need to use the sequence multiple times
105
- path_glob = sorted(list(path_glob))
249
+ path_glob = _resolve_glob(path_glob)
106
250
 
107
251
  if not separate_times:
108
- return combine_datasets(path_glob, keep_particles=keep_particles)
252
+ return combine_datasets(
253
+ path_glob,
254
+ data_vars=data_vars,
255
+ keep_particles=keep_particles,
256
+ probe_names=probe_names,
257
+ chunks=chunks,
258
+ )
259
+
260
+ _, var_times_map = make_time_dims(path_glob)
261
+
262
+ all_dfs = []
263
+ for f in path_glob:
264
+ ds = xr.open_dataset(
265
+ f, keep_particles=keep_particles, probe_names=probe_names, chunks=chunks
266
+ )
267
+
268
+ # If the data_vars are specified then only load them in and disregard the rest.
269
+ # If there are no remaining data variables then skip adding the dataset to list
270
+ if data_vars is not None:
271
+ ds = purge_unselected_data_vars(ds, data_vars)
272
+ if not ds.data_vars:
273
+ continue
109
274
 
110
- time_dims, var_times_map = make_time_dims(path_glob)
111
- all_dfs = [xr.open_dataset(f, keep_particles=keep_particles) for f in path_glob]
275
+ all_dfs.append(ds)
112
276
 
113
277
  for df in all_dfs:
114
278
  for da in df:
@@ -125,10 +289,155 @@ def open_mfdataset(
125
289
  )
126
290
 
127
291
  return xr.combine_by_coords(
128
- all_dfs, data_vars="minimal", combine_attrs="drop_conflicts"
292
+ all_dfs,
293
+ coords="different",
294
+ combine_attrs="drop_conflicts",
295
+ join="outer",
296
+ compat="no_conflicts",
297
+ )
298
+
299
+
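# Hedged usage sketch for open_mfdataset above (paths and variable names are
# assumptions): one lazily-loaded Dataset spanning the whole run, restricted to a
# single variable and chunked to match each file's on-disk shape.
ds = open_mfdataset(
    "run1/*.sdf",
    keep_particles=False,
    data_vars=["Electric_Field_Ex"],
    chunks="auto",
)
print(ds["Electric_Field_Ex"].sizes)  # includes the combined "time" dimension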
300
+ def open_datatree(
301
+ path: PathLike,
302
+ *,
303
+ keep_particles: bool = False,
304
+ probe_names: list[str] | None = None,
305
+ ) -> xr.DataTree:
306
+ """
307
+ An `xarray.DataTree` is constructed utilising the original names in the SDF
308
+ file, because these names include slashes which `xarray`
309
+ can use to automatically build up a datatree. We additionally replace
310
+ spaces with underscores to be more pythonic. The original
311
+ `xarray.Dataset` name is kept in ``attrs["flat_structure_name"]`` for reference.
312
+
313
+ In some cases the user may output the ``always + species`` dumpmask, which
314
+ means that the SDF variable will have species data plus a general one. A node
315
+ of a `xarray.DataTree` cannot both hold variable data and have leaves that
316
+ hold variables, so we move the node's data to a leaf named ``node/All``
317
+ (see the example of ``Derived/Number_Density/All`` in the table
318
+ below).
319
+
320
+ Below are some examples of how variable names are translated from the
321
+ regular `xarray.open_dataset` result into their more traditional names.
322
+
323
+ =================================== ===================================
324
+ Dataset variable name DataTree variable name
325
+ =================================== ===================================
326
+ ``Derived_Number_Density`` ``Derived/Number_Density/All``
327
+ ``Derived_Number_Density_Electron`` ``Derived/Number_Density/Electron``
328
+ ``Derived_Number_Density_Ion`` ``Derived/Number_Density/Ion``
329
+ ``Derived_Number_Density_Photon`` ``Derived/Number_Density/Photon``
330
+ ``Derived_Average_Particle_Energy`` ``Derived/Average_Particle_Energy``
331
+ =================================== ===================================
332
+
333
+ Parameters
334
+ ----------
335
+ path
336
+ The path to the SDF file
337
+ keep_particles
338
+ If ``True``, also load particle data (this may use a lot of memory!)
339
+ probe_names
340
+ List of EPOCH probe names
341
+
342
+ Examples
343
+ --------
344
+ >>> dt = open_datatree("0000.sdf")
345
+ >>> dt["Electric_Field"]["Ex"].values # Access all Electric_Field_Ex data
346
+ """
347
+
348
+ return xr.open_datatree(
349
+ path, keep_particles=keep_particles, probe_names=probe_names
129
350
  )
130
351
 
131
352
 
353
+ def open_mfdatatree(
354
+ path_glob: Iterable | str | Path | Callable[..., Iterable[Path]],
355
+ *,
356
+ separate_times: bool = False,
357
+ keep_particles: bool = False,
358
+ probe_names: list[str] | None = None,
359
+ data_vars: list[str] | None = None,
360
+ ) -> xr.DataTree:
361
+ """Open a set of EPOCH SDF files as one `xarray.DataTree`
362
+
363
+ EPOCH can output variables at different periods, so each individual
364
+ SDF file from one EPOCH run may have different variables in it. In
365
+ order to combine all files into one `xarray.Dataset`, we need to
366
+ concatenate variables across their time dimension.
367
+
368
+ We have two choices:
369
+
370
+ 1. One time dimension where some variables may not be defined at all time
371
+ points, and so will be filled with NaNs at missing points; or
372
+ 2. Multiple time dimensions, one for each output frequency
373
+
374
+ The second option is better for memory consumption, as the missing data with
375
+ the first option still takes up space. However, proper lazy-loading may
376
+ mitigate this.
377
+
378
+ The ``separate_times`` argument can be used to switch between these choices.
379
+
380
+ An `xarray.DataTree` is constructed utilising the original names in the SDF
381
+ file, because these names include slashes which `xarray`
382
+ can use to automatically build up a datatree. We additionally replace
383
+ spaces with underscores to be more pythonic. The original
384
+ `xarray.Dataset` name is kept in ``attrs["flat_structure_name"]`` for reference.
385
+
386
+ This function combines multiple SDF files into a single `xarray.DataTree` with a
387
+ unified time dimension and hierarchical organization of variables.
388
+
389
+ In some cases the user may output the ``always + species`` dumpmask, which
390
+ means that the SDF variable will have species data plus a general one. A node
391
+ of a `xarray.DataTree` cannot both hold variable data and have leaves that
392
+ hold variables, so we move the node's data to a leaf named ``node/All``
393
+ (see the example of ``Derived/Number_Density/All`` in the table
394
+ below).
395
+
396
+ Below are some examples of how variable names are translated from the
397
+ regular `xarray.open_dataset` result into their more traditional names.
398
+
399
+ =================================== ===================================
400
+ Dataset variable name DataTree variable name
401
+ =================================== ===================================
402
+ ``Derived_Number_Density`` ``Derived/Number_Density/All``
403
+ ``Derived_Number_Density_Electron`` ``Derived/Number_Density/Electron``
404
+ ``Derived_Number_Density_Ion`` ``Derived/Number_Density/Ion``
405
+ ``Derived_Number_Density_Photon`` ``Derived/Number_Density/Photon``
406
+ ``Derived_Average_Particle_Energy`` ``Derived/Average_Particle_Energy``
407
+ =================================== ===================================
408
+
409
+ Parameters
410
+ ----------
411
+ path_glob
412
+ List of filenames or string glob pattern
413
+ separate_times
414
+ If ``True``, create separate time dimensions for variables defined at
415
+ different output frequencies
416
+ keep_particles
417
+ If ``True``, also load particle data (this may use a lot of memory!)
418
+ probe_names
419
+ List of EPOCH probe names
420
+ data_vars
421
+ List of data variables to load (if not specified, all variables are loaded)
422
+
423
+ Examples
424
+ --------
425
+ >>> dt = open_mfdatatree("*.sdf")
426
+ >>> dt["Electric_Field"]["Ex"].values # Access all Electric_Field_Ex data
427
+ >>> dt.coords["time"].values # Access combined time dimension
428
+ """
429
+ # First, combine the datasets as usual
430
+ combined_ds = open_mfdataset(
431
+ path_glob,
432
+ separate_times=separate_times,
433
+ keep_particles=keep_particles,
434
+ probe_names=probe_names,
435
+ data_vars=data_vars,
436
+ )
437
+
438
+ return _build_datatree_from_dataset(combined_ds)
439
+
440
+
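# Hedged sketch (file name assumed): each DataTree leaf keeps the flat Dataset
# name it had before the tree was built, as described in the docstring above.
dt = open_mfdatatree("run1/*.sdf")
da = dt["Derived/Number_Density/Electron"]
print(da.attrs["flat_structure_name"])  # -> "Derived_Number_Density_Electron"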
132
441
  def make_time_dims(path_glob):
133
442
  """Extract the distinct set of time arrays from a collection of
134
443
  SDF files, along with a mapping from variable names to their time
@@ -146,14 +455,12 @@ def make_time_dims(path_glob):
146
455
  )
147
456
 
148
457
  # Count the unique set of lists of times
149
- times_count = Counter((tuple(v) for v in vars_count.values()))
458
+ times_count = Counter(tuple(v) for v in vars_count.values())
150
459
 
151
460
  # Give each set of times a unique name
152
461
  time_dims = {}
153
- count = 0
154
- for t in times_count:
462
+ for count, t in enumerate(times_count):
155
463
  time_dims[f"time{count}"] = t
156
- count += 1
157
464
 
158
465
  # Map each variable to the name of its time dimension
159
466
  var_times_map = {}
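# Hedged sketch (toy values) of the naming above: each distinct tuple of output
# times becomes its own dimension, named "time0", "time1", ... in encounter order.
from collections import Counter

times_count = Counter([(0.0, 1.0, 2.0), (0.0, 2.0), (0.0, 1.0, 2.0)])
time_dims = {f"time{count}": t for count, t in enumerate(times_count)}
# {'time0': (0.0, 1.0, 2.0), 'time1': (0.0, 2.0)}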
@@ -174,13 +481,11 @@ class SDFBackendArray(BackendArray):
174
481
 
175
482
  __slots__ = ("datastore", "dtype", "shape", "variable_name")
176
483
 
177
- def __init__(self, variable_name, datastore):
484
+ def __init__(self, variable_name, datastore, shape, dtype):
178
485
  self.datastore = datastore
179
486
  self.variable_name = variable_name
180
-
181
- array = self.get_array()
182
- self.shape = array.shape
183
- self.dtype = array.dtype
487
+ self.shape = shape
488
+ self.dtype = dtype
184
489
 
185
490
  def get_array(self, needs_lock=True):
186
491
  with self.datastore.acquire_context(needs_lock) as ds:
@@ -205,19 +510,28 @@ class SDFDataStore(AbstractDataStore):
205
510
  """Store for reading and writing data via the SDF library."""
206
511
 
207
512
  __slots__ = (
208
- "lock",
209
- "drop_variables",
210
- "keep_particles",
211
513
  "_filename",
212
514
  "_manager",
515
+ "drop_variables",
516
+ "keep_particles",
517
+ "lock",
518
+ "probe_names",
213
519
  )
214
520
 
215
- def __init__(self, manager, drop_variables=None, keep_particles=False, lock=None):
521
+ def __init__(
522
+ self,
523
+ manager,
524
+ drop_variables=None,
525
+ keep_particles=False,
526
+ lock=None,
527
+ probe_names=None,
528
+ ):
216
529
  self._manager = manager
217
530
  self._filename = self.ds.filename
218
531
  self.drop_variables = drop_variables
219
532
  self.keep_particles = keep_particles
220
533
  self.lock = ensure_lock(lock)
534
+ self.probe_names = probe_names
221
535
 
222
536
  @classmethod
223
537
  def open(
@@ -226,6 +540,7 @@ class SDFDataStore(AbstractDataStore):
226
540
  lock=None,
227
541
  drop_variables=None,
228
542
  keep_particles=False,
543
+ probe_names=None,
229
544
  ):
230
545
  if isinstance(filename, os.PathLike):
231
546
  filename = os.fspath(filename)
@@ -236,6 +551,7 @@ class SDFDataStore(AbstractDataStore):
236
551
  lock=lock,
237
552
  drop_variables=drop_variables,
238
553
  keep_particles=keep_particles,
554
+ probe_names=probe_names,
239
555
  )
240
556
 
241
557
  def _acquire(self, needs_lock=True):
@@ -249,12 +565,21 @@ class SDFDataStore(AbstractDataStore):
249
565
  def acquire_context(self, needs_lock=True):
250
566
  return self._manager.acquire_context(needs_lock)
251
567
 
252
- def load(self):
568
+ def load(self): # noqa: PLR0912, PLR0915
253
569
  # Drop any requested variables
254
570
  if self.drop_variables:
571
+ # Build a mapping from underscored names to real variable names
572
+ name_map = {_rename_with_underscore(var): var for var in self.ds.variables}
573
+
255
574
  for variable in self.drop_variables:
256
- # TODO: nicer error handling
257
- self.ds.variables.pop(variable)
575
+ key = _rename_with_underscore(variable)
576
+ original_name = name_map.get(key)
577
+
578
+ if original_name is None:
579
+ raise KeyError(
580
+ f"Variable '{variable}' not found (interpreted as '{key}')."
581
+ )
582
+ self.ds.variables.pop(original_name)
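# Hedged usage sketch (file and variable names assumed): drop_variables now accepts
# the underscored xarray name, or the original SDF name if _rename_with_underscore
# maps it to the same key; unknown names raise KeyError instead of failing silently.
ds = xr.open_dataset("0000.sdf", drop_variables=["Electric_Field_Ex"])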
258
583
 
259
584
  # These two dicts are global metadata about the run or file
260
585
  attrs = {**self.ds.header, **self.ds.run_info}
@@ -274,8 +599,7 @@ class SDFDataStore(AbstractDataStore):
274
599
  def _process_grid_name(grid_name: str, transform_func) -> str:
275
600
  """Apply the given transformation function and then rename with underscores."""
276
601
  transformed_name = transform_func(grid_name)
277
- renamed_name = _rename_with_underscore(transformed_name)
278
- return renamed_name
602
+ return _rename_with_underscore(transformed_name)
279
603
 
280
604
  for key, value in self.ds.grids.items():
281
605
  if "cpu" in key.lower():
@@ -310,6 +634,8 @@ class SDFDataStore(AbstractDataStore):
310
634
  # Had some problems with these variables, so just ignore them for now
311
635
  if "cpu" in key.lower():
312
636
  continue
637
+ if "boundary" in key.lower():
638
+ continue
313
639
  if "output file" in key.lower():
314
640
  continue
315
641
 
@@ -331,12 +657,38 @@ class SDFDataStore(AbstractDataStore):
331
657
  if value.units is not None:
332
658
  data_attrs["units"] = value.units
333
659
 
334
- data_vars[base_name] = Variable(dims, value.data, attrs=data_attrs)
660
+ var = Variable(dims, value.data, attrs=data_attrs)
661
+
662
+ # Provide preferred_chunks for constants so dask aligns to natural shapes
663
+ var.encoding["preferred_chunks"] = dict(zip(dims, shape))
664
+
665
+ data_vars[base_name] = var
335
666
  continue
336
667
 
337
668
  if value.is_point_data:
338
669
  # Point (particle) variables are 1D
339
- var_coords = (f"ID_{_process_grid_name(key, _grid_species_name)}",)
670
+
671
+ # Particle data does not maintain a fixed dimension size
672
+ # throughout the simulation. A particle name comes, for example,
673
+ # in the form of `Particles/Px/Ion_H` which is then modified
674
+ # using `_process_grid_name()` into `Ion_H`. This is fine as the
675
+ # other components of the momentum (`Py`, `Pz`) will have the same
676
+ # size as they represent the same bunch of particles.
677
+
678
+ # Probes however have names in the form of `Electron_Front_Probe/Px`
679
+ # which are changed to just `Px`; this is fine when there is only one
680
+ # probe in the system, but when there are multiple they will have
681
+ # conflicting sizes, so we can't keep the name as simply `Px`; we
682
+ # instead set their dimension as the full name `Electron_Front_Probe_Px`.
683
+ is_probe_name_match = self.probe_names is not None and any(
684
+ name in key for name in self.probe_names
685
+ )
686
+ name_processor = (
687
+ _rename_with_underscore
688
+ if is_probe_name_match
689
+ else _grid_species_name
690
+ )
691
+ var_coords = (f"ID_{_process_grid_name(key, name_processor)}",)
340
692
  else:
341
693
  # These are DataArrays
342
694
 
@@ -359,9 +711,9 @@ class SDFDataStore(AbstractDataStore):
359
711
  grid_mid = self.ds.grids[value.grid_mid]
360
712
  grid_mid_base_name = _process_grid_name(grid_mid.name, _norm_grid_name)
361
713
  for dim_size, dim_name in zip(grid_mid.shape, grid_mid.labels):
362
- dim_size_lookup[dim_name][
363
- dim_size
364
- ] = f"{dim_name}_{grid_mid_base_name}"
714
+ dim_size_lookup[dim_name][dim_size] = (
715
+ f"{dim_name}_{grid_mid_base_name}"
716
+ )
365
717
 
366
718
  var_coords = [
367
719
  dim_size_lookup[dim_name][dim_size]
@@ -377,8 +729,24 @@ class SDFDataStore(AbstractDataStore):
377
729
  "full_name": key,
378
730
  "long_name": long_name,
379
731
  }
380
- lazy_data = indexing.LazilyIndexedArray(SDFBackendArray(key, self))
381
- data_vars[base_name] = Variable(var_coords, lazy_data, data_attrs)
732
+ lazy_data = indexing.LazilyIndexedArray(
733
+ SDFBackendArray(key, self, shape=value.shape, dtype=value.data.dtype)
734
+ )
735
+ var = Variable(var_coords, lazy_data, data_attrs)
736
+ # Set preferred chunks to match on-disk layout
737
+ # For point data (1D): full dimension
738
+ # For grid data (N-D): individual grid chunk sizes
739
+ if value.is_point_data:
740
+ var.encoding["preferred_chunks"] = {var_coords[0]: len(value.data)}
741
+ else:
742
+ # Align with on-disk grid structure
743
+ chunk_dict = {}
744
+ for dim_name, size in zip(var_coords, value.shape):
745
+ # Use natural on-disk boundaries
746
+ chunk_dict[dim_name] = size
747
+ var.encoding["preferred_chunks"] = chunk_dict
748
+
749
+ data_vars[base_name] = var
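# Hedged usage sketch (file and variable names assumed): because every variable
# now carries "preferred_chunks" in its encoding, opening with a dask-backed
# request such as chunks={} should yield one chunk per on-disk SDF block.
ds = xr.open_dataset("0000.sdf", chunks={})
print(ds["Electric_Field_Ex"].chunks)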
382
750
 
383
751
  # TODO: might need to decode if mult is set?
384
752
 
@@ -397,14 +765,23 @@ class SDFDataStore(AbstractDataStore):
397
765
 
398
766
 
399
767
  class SDFEntrypoint(BackendEntrypoint):
768
+ supports_groups = True
769
+ open_dataset_parameters: ClassVar[list[str]] = [
770
+ "filename_or_obj",
771
+ "drop_variables",
772
+ "keep_particles",
773
+ "probe_names",
774
+ ]
775
+
400
776
  def open_dataset(
401
777
  self,
402
778
  filename_or_obj,
403
779
  *,
404
780
  drop_variables=None,
405
781
  keep_particles=False,
782
+ probe_names=None,
406
783
  ):
407
- if isinstance(filename_or_obj, pathlib.Path):
784
+ if isinstance(filename_or_obj, Path):
408
785
  # sdf library takes a filename only
409
786
  # TODO: work out if we need to deal with file handles
410
787
  filename_or_obj = str(filename_or_obj)
@@ -413,33 +790,89 @@ class SDFEntrypoint(BackendEntrypoint):
413
790
  filename_or_obj,
414
791
  drop_variables=drop_variables,
415
792
  keep_particles=keep_particles,
793
+ probe_names=probe_names,
416
794
  )
417
795
  with close_on_error(store):
418
796
  return store.load()
419
797
 
420
- open_dataset_parameters = ["filename_or_obj", "drop_variables", "keep_particles"]
798
+ open_datatree_parameters: ClassVar[list[str]] = [
799
+ "filename_or_obj",
800
+ "drop_variables",
801
+ "keep_particles",
802
+ "probe_names",
803
+ ]
804
+
805
+ def open_datatree(
806
+ self,
807
+ filename_or_obj,
808
+ *,
809
+ drop_variables=None,
810
+ keep_particles=False,
811
+ probe_names=None,
812
+ ):
813
+ ds = self.open_dataset(
814
+ filename_or_obj,
815
+ drop_variables=drop_variables,
816
+ keep_particles=keep_particles,
817
+ probe_names=probe_names,
818
+ )
819
+ return _build_datatree_from_dataset(ds)
421
820
 
422
821
  def guess_can_open(self, filename_or_obj):
423
822
  magic_number = try_read_magic_number_from_path(filename_or_obj)
424
823
  if magic_number is not None:
425
824
  return magic_number.startswith(b"SDF1")
426
825
 
427
- try:
428
- _, ext = os.path.splitext(filename_or_obj)
429
- except TypeError:
430
- return False
431
- return ext in {".sdf", ".SDF"}
826
+ return Path(filename_or_obj).suffix in {".sdf", ".SDF"}
432
827
 
433
828
  description = "Use .sdf files in Xarray"
434
829
 
435
- url = "https://epochpic.github.io/documentation/visualising_output/python.html"
830
+ url = "https://epochpic.github.io/documentation/visualising_output/python_beam.html"
831
+
832
+
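# Hedged usage sketch (file and probe names assumed): with the entrypoint
# registered, xarray recognises .sdf files directly and forwards the extra
# backend keywords declared in open_dataset_parameters.
ds = xr.open_dataset(
    "0010.sdf",
    keep_particles=True,
    probe_names=["Electron_Front_Probe"],
)
dt = xr.open_datatree("0010.sdf", keep_particles=True)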
833
+ class XrTUIEntrpoint:
834
+ def open_mfdatatree(self, paths: list[Path]) -> xr.DataTree:
835
+ return open_mfdatatree(paths)
436
836
 
437
837
 
438
838
  class SDFPreprocess:
439
- """Preprocess SDF files for xarray ensuring matching job ids and sets time dimension"""
839
+ """Preprocess SDF files for xarray ensuring matching job ids and sets
840
+ time dimension.
841
+
842
+ This class is used as a 'preprocess' function within ``xr.open_mfdataset``. It
843
+ performs three main duties on each individual file's Dataset:
844
+
845
+ 1. Checks for a **matching job ID** across all files to ensure dataset consistency.
846
+ 2. **Filters** the Dataset to keep only the variables specified in `data_vars`
847
+ and their required coordinates.
848
+ 3. **Expands dimensions** to include a single 'time' coordinate, preparing the
849
+ Dataset for concatenation.
850
+
851
+ EPOCH can output variables at different intervals, so some SDF files
852
+ may not contain the requested variable. We combine this data into one
853
+ dataset by concatenating across the time dimension.
854
+
855
+ The combination is performed using ``join="outer"`` (in the calling ``open_mfdataset`` function),
856
+ meaning that the final combined dataset will contain the variable across the
857
+ entire time span, with NaNs filling the time steps where the variable was absent
858
+ from the individual files.
440
859
 
441
- def __init__(self):
860
+ With large SDF files, this filtering method will save on memory consumption when
861
+ compared to loading all variables from all files before concatenation.
862
+
863
+ Parameters
864
+ ----------
865
+ data_vars :
866
+ A list of data variables to load (if not specified, all
867
+ variables are loaded)
868
+ """
869
+
870
+ def __init__(
871
+ self,
872
+ data_vars: list[str] | None = None,
873
+ ):
442
874
  self.job_id: int | None = None
875
+ self.data_vars = data_vars
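# Hedged usage sketch (glob and variable names assumed), mirroring the call made
# in combine_datasets above: SDFPreprocess is passed as the preprocess hook so
# each file is filtered and given a "time" dimension before concatenation.
ds = xr.open_mfdataset(
    "run1/*.sdf",
    combine="nested",
    concat_dim="time",
    join="outer",
    preprocess=SDFPreprocess(data_vars=["Electric_Field_Ex"]),
)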
443
876
 
444
877
  def __call__(self, ds: xr.Dataset) -> xr.Dataset:
445
878
  if self.job_id is None:
@@ -450,17 +883,23 @@ class SDFPreprocess:
450
883
  f"Mismatching job ids (got {ds.attrs['jobid1']}, expected {self.job_id})"
451
884
  )
452
885
 
453
- ds = ds.expand_dims(time=[ds.attrs["time"]])
886
+ # If the user has exclusively requested only certain variables be
887
+ # loaded in then we purge all other variables and coordinates
888
+ if self.data_vars:
889
+ ds = purge_unselected_data_vars(ds, self.data_vars)
890
+
891
+ time_val = ds.attrs.get("time", np.nan)
892
+ ds = ds.expand_dims(time=[time_val])
454
893
  ds = ds.assign_coords(
455
894
  time=(
456
895
  "time",
457
- [ds.attrs["time"]],
896
+ [time_val],
458
897
  {"units": "s", "long_name": "Time", "full_name": "time"},
459
898
  )
460
899
  )
461
900
  # Particles' spatial coordinates also evolve in time
462
901
  for coord, value in ds.coords.items():
463
902
  if value.attrs.get("point_data", False):
464
- ds.coords[coord] = value.expand_dims(time=[ds.attrs["time"]])
903
+ ds.coords[coord] = value.expand_dims(time=[time_val])
465
904
 
466
905
  return ds