sdf-xarray 0.5.0__cp314-cp314t-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sdf_xarray/__init__.py ADDED
@@ -0,0 +1,905 @@
1
+ import contextlib
2
+ import os
3
+ import re
4
+ from collections import Counter, defaultdict
5
+ from collections.abc import Callable, Iterable
6
+ from importlib.metadata import version
7
+ from itertools import product
8
+ from os import PathLike as os_PathLike
9
+ from pathlib import Path
10
+ from typing import ClassVar
11
+
12
+ import numpy as np
13
+ import xarray as xr
14
+ from packaging.version import Version
15
+ from xarray.backends import AbstractDataStore, BackendArray, BackendEntrypoint
16
+ from xarray.backends.file_manager import CachingFileManager
17
+ from xarray.backends.locks import ensure_lock
18
+ from xarray.core import indexing
19
+ from xarray.core.types import T_Chunks
20
+ from xarray.core.utils import close_on_error, try_read_magic_number_from_path
21
+ from xarray.core.variable import Variable
22
+
23
+ # NOTE: Do not delete these lines, otherwise the "epoch" dataset and dataarray
24
+ # accessors will not be imported when the user imports sdf_xarray
25
+ import sdf_xarray.dataset_accessor
26
+ import sdf_xarray.download
27
+ import sdf_xarray.plotting # noqa: F401
28
+
29
+ # NOTE: This registers the "pint" accessor if the user has the
30
+ # pint-xarray package installed
31
+ with contextlib.suppress(ImportError):
32
+ import pint_xarray # noqa: F401
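+ # Illustrative note: when pint-xarray is installed, this import registers the
+ # ``.pint`` accessor, so users can e.g. call ``ds.pint.quantify()`` on
+ # datasets opened with this backend.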
33
+
34
+ from .sdf_interface import Constant, SDFFile # type: ignore # noqa: PGH003
35
+
36
+ # TODO Remove this once the new kwarg options are fully implemented
37
+ if Version(version("xarray")) >= Version("2025.8.0"):
38
+ xr.set_options(use_new_combine_kwarg_defaults=True)
39
+
40
+ PathLike = str | os_PathLike
41
+
42
+
43
+ def _rename_with_underscore(name: str) -> str:
44
+ """A lot of the variable names have spaces, forward slashes and dashes in them, which
45
+ are not valid in netCDF names so we replace them with underscores."""
46
+ return name.replace("/", "_").replace(" ", "_").replace("-", "_")
47
+
48
+
49
+ def _process_latex_name(variable_name: str) -> str:
50
+ """Converts variable names to LaTeX format where possible
51
+ using the following rules:
52
+ - Ex -> $E_x$
53
+ - Ey -> $E_y$
54
+ - Ez -> $E_z$
55
+
56
+ This repeats for B, J and P. It only changes the variable
57
+ name when the affix (prefix + suffix) appears as a standalone word,
58
+ i.e. bounded by spaces or the start/end of the string. This avoids changing
59
+ variable names that merely contain these affixes as a substring.
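+ 
+ Examples
+ --------
+ Illustrative behaviour of the word-boundary rule:
+ 
+ >>> _process_latex_name("Electric Field Ex")
+ 'Electric Field $E_x$'
+ >>> _process_latex_name("Extra Field")
+ 'Extra Field'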
60
+ """
61
+ prefixes = ["E", "B", "J", "P"]
62
+ suffixes = ["x", "y", "z"]
63
+ for prefix, suffix in product(prefixes, suffixes):
64
+ # Match affix with preceding space and trailing space or end of string
65
+ affix_pattern = rf"\b{prefix}{suffix}\b"
66
+ # Insert LaTeX format while preserving spaces
67
+ replacement = rf"${prefix}_{suffix}$"
68
+ variable_name = re.sub(affix_pattern, replacement, variable_name)
69
+ return variable_name
70
+
71
+
72
+ def _resolve_glob(path_glob: PathLike | Iterable[PathLike]):
73
+ """
74
+ Normalise input path_glob into a sorted list of absolute, resolved Path objects.
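+ 
+ For example (illustrative), ``_resolve_glob("run1/*.sdf")`` globs and returns
+ the matching files as sorted, resolved ``Path`` objects, while an iterable of
+ paths is de-duplicated and sorted.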
75
+ """
76
+
77
+ try:
78
+ p = Path(path_glob)
79
+ paths = list(p.parent.glob(p.name)) if "*" in p.name else [p]
80
+ except TypeError:
81
+ paths = list({Path(p) for p in path_glob})
82
+
83
+ paths = sorted(p.resolve() for p in paths)
84
+ if not paths:
85
+ raise FileNotFoundError(f"No files matched pattern or input: {path_glob!r}")
86
+ return paths
87
+
88
+
89
+ def _build_datatree_from_dataset(
90
+ ds: xr.Dataset,
91
+ ) -> xr.DataTree:
92
+ """
93
+ An `xarray.DataTree` is constructed utilising the original names in the SDF
94
+ file. This is because these names include slashes, which `xarray`
95
+ can use to automatically build up a datatree. We additionally replace
96
+ spaces with underscores to be more Pythonic. The original flat
97
+ `xarray.Dataset` variable name is kept under ``attrs["flat_structure_name"]`` for reference.
98
+
99
+ In some cases the user may output the ``always + species`` dumpmask, which
100
+ means that the SDF variable will have per-species data plus a general one. When
101
+ defining an `xarray.DataTree`, a node cannot both hold variable data and
102
+ have leaves containing variables, so we move the node's data to a leaf
103
+ named ``node/All`` (see the example of
104
+ ``Derived/Number_Density/All`` in the table below).
105
+
106
+ Below are some examples of how variable names are translated from the
107
+ regular `xarray.open_dataset` result into their more traditional names.
108
+
109
+ =================================== ===================================
110
+ Dataset variable name DataTree variable name
111
+ =================================== ===================================
112
+ ``Derived_Number_Density`` ``Derived/Number_Density/All``
113
+ ``Derived_Number_Density_Electron`` ``Derived/Number_Density/Electron``
114
+ ``Derived_Number_Density_Ion`` ``Derived/Number_Density/Ion``
115
+ ``Derived_Number_Density_Photon`` ``Derived/Number_Density/Photon``
116
+ ``Derived_Average_Particle_Energy`` ``Derived/Average_Particle_Energy``
117
+ =================================== ===================================
118
+
119
+ Parameters
120
+ ----------
121
+ ds
122
+ Incoming `xarray.Dataset` to convert to a `xarray.DataTree`
123
+ """
124
+ renames = {}
125
+ for name, var in ds.data_vars.items():
126
+ # Append the current variable name to the attributes
127
+ var.attrs["flat_structure_name"] = name
128
+ renames.update({name: var.attrs["full_name"].replace(" ", "_")})
129
+
130
+ new_names = renames.values()
131
+
132
+ final_renames = {
133
+ key: (
134
+ f"{path}/All"
135
+ if any(other.startswith(f"{path}/") for other in new_names)
136
+ else path
137
+ )
138
+ for key, path in renames.items()
139
+ }
140
+
141
+ ds = ds.rename_vars(final_renames)
142
+ dt = xr.DataTree.from_dict(ds)
143
+ dt.attrs = ds.attrs
144
+ return dt
145
+
146
+
147
+ def purge_unselected_data_vars(ds: xr.Dataset, data_vars: list[str]) -> xr.Dataset:
148
+ """
149
+ If the user has requested that only certain variables be loaded,
150
+ purge all other variables and dimensions.
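+ 
+ For example (illustrative), ``purge_unselected_data_vars(ds, ["Electric_Field_Ex"])``
+ keeps only that variable plus the coordinates and dimensions it uses.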
151
+ """
152
+ existing_data_vars = set(ds.data_vars.keys())
153
+ vars_to_keep = set(data_vars) & existing_data_vars
154
+ vars_to_drop = existing_data_vars - vars_to_keep
155
+ ds = ds.drop_vars(vars_to_drop)
156
+
157
+ existing_dims = set(ds.sizes)
158
+ dims_to_keep = set()
159
+ for var in vars_to_keep:
160
+ dims_to_keep.update(ds[var].coords._names)
161
+ dims_to_keep.update(ds[var].dims)
162
+
163
+ coords_to_drop = existing_dims - dims_to_keep
164
+ return ds.drop_dims(coords_to_drop)
165
+
166
+
167
+ def combine_datasets(
168
+ path_glob: Iterable | str, data_vars: list[str], **kwargs
169
+ ) -> xr.Dataset:
170
+ """
171
+ Combine all datasets using a single time dimension, optionally extract
172
+ data from only the listed data_vars
173
+ """
174
+
175
+ if data_vars is not None:
176
+ return xr.open_mfdataset(
177
+ path_glob,
178
+ join="outer",
179
+ coords="different",
180
+ compat="no_conflicts",
181
+ combine="nested",
182
+ concat_dim="time",
183
+ preprocess=SDFPreprocess(data_vars=data_vars),
184
+ **kwargs,
185
+ )
186
+
187
+ return xr.open_mfdataset(
188
+ path_glob,
189
+ data_vars="all",
190
+ coords="different",
191
+ compat="no_conflicts",
192
+ join="outer",
193
+ preprocess=SDFPreprocess(),
194
+ **kwargs,
195
+ )
196
+
197
+
198
+ def open_mfdataset(
199
+ path_glob: Iterable | str | Path | Callable[..., Iterable[Path]],
200
+ *,
201
+ separate_times: bool = False,
202
+ keep_particles: bool = False,
203
+ probe_names: list[str] | None = None,
204
+ data_vars: list[str] | None = None,
205
+ chunks: T_Chunks = "auto",
206
+ ) -> xr.Dataset:
207
+ """Open a set of EPOCH SDF files as one `xarray.Dataset`
208
+
209
+ EPOCH can output variables at different periods, so each individual
210
+ SDF file from one EPOCH run may have different variables in it. In
211
+ order to combine all files into one `xarray.Dataset`, we need to
212
+ concatenate variables across their time dimension.
213
+
214
+ We have two choices:
215
+
216
+ 1. One time dimension where some variables may not be defined at all time
217
+ points, and so will be filled with NaNs at missing points; or
218
+ 2. Multiple time dimensions, one for each output frequency
219
+
220
+ The second option is better for memory consumption, as the missing data with
221
+ the first option still takes up space. However, proper lazy-loading may
222
+ mitigate this.
223
+
224
+ The ``separate_times`` argument can be used to switch between these choices.
225
+
226
+ Parameters
227
+ ----------
228
+ path_glob :
229
+ List of filenames or string glob pattern
230
+ separate_times :
231
+ If ``True``, create separate time dimensions for variables defined at
232
+ different output frequencies
233
+ keep_particles :
234
+ If ``True``, also load particle data (this may use a lot of memory!)
235
+ probe_names :
236
+ List of EPOCH probe names
237
+ data_vars :
238
+ List of data vars to load in (If not specified loads in all variables)
239
+ chunks :
240
+ Dictionary with keys given by dimension names and values given by chunk sizes.
241
+ In general, these should divide the dimensions of each dataset. By default
242
+ chunks are automatically set so that they are the same size as the dimensions
243
+ stored in each of the SDF files. See `Xarray chunking-and-performance
244
+ <https://docs.xarray.dev/en/stable/user-guide/dask.html#chunking-and-performance>`_
245
+ for details on why this is useful for large datasets. This automatic
246
+ behaviour can be disabled with ``chunks=None``.
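+ 
+ Examples
+ --------
+ Illustrative usage (variable names depend on your EPOCH output):
+ 
+ >>> ds = open_mfdataset("*.sdf", data_vars=["Electric_Field_Ex"])
+ >>> ds["Electric_Field_Ex"].isel(time=0)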
247
+ """
248
+
249
+ path_glob = _resolve_glob(path_glob)
250
+
251
+ if not separate_times:
252
+ return combine_datasets(
253
+ path_glob,
254
+ data_vars=data_vars,
255
+ keep_particles=keep_particles,
256
+ probe_names=probe_names,
257
+ chunks=chunks,
258
+ )
259
+
260
+ _, var_times_map = make_time_dims(path_glob)
261
+
262
+ all_dfs = []
263
+ for f in path_glob:
264
+ ds = xr.open_dataset(
265
+ f, keep_particles=keep_particles, probe_names=probe_names, chunks=chunks
266
+ )
267
+
268
+ # If the data_vars are specified then only load them in and disregard the rest.
269
+ # If there are no remaining data variables then skip adding the dataset to list
270
+ if data_vars is not None:
271
+ ds = purge_unselected_data_vars(ds, data_vars)
272
+ if not ds.data_vars:
273
+ continue
274
+
275
+ all_dfs.append(ds)
276
+
277
+ for df in all_dfs:
278
+ for da in df:
279
+ df[da] = df[da].expand_dims(
280
+ dim={var_times_map[str(da)]: [df.attrs["time"]]}
281
+ )
282
+ for coord in df.coords:
283
+ if df.coords[coord].attrs.get("point_data", False):
284
+ # We need to undo our renaming of the coordinates
285
+ base_name = coord.split("_", maxsplit=1)[-1]
286
+ sdf_coord_name = f"Grid_{base_name}"
287
+ df.coords[coord] = df.coords[coord].expand_dims(
288
+ dim={var_times_map[sdf_coord_name]: [df.attrs["time"]]}
289
+ )
290
+
291
+ return xr.combine_by_coords(
292
+ all_dfs,
293
+ coords="different",
294
+ combine_attrs="drop_conflicts",
295
+ join="outer",
296
+ compat="no_conflicts",
297
+ )
298
+
299
+
300
+ def open_datatree(
301
+ path: PathLike,
302
+ *,
303
+ keep_particles: bool = False,
304
+ probe_names: list[str] | None = None,
305
+ ) -> xr.DataTree:
306
+ """
307
+ An `xarray.DataTree` is constructed utilising the original names in the SDF
308
+ file. This is because these names include slashes, which `xarray`
309
+ can use to automatically build up a datatree. We additionally replace
310
+ spaces with underscores to be more Pythonic. The original flat
311
+ `xarray.Dataset` variable name is kept under ``attrs["flat_structure_name"]`` for reference.
312
+
313
+ In some cases the user may output the ``always + species`` dumpmask, which
314
+ means that the SDF variable will have per-species data plus a general one. When
315
+ defining an `xarray.DataTree`, a node cannot both hold variable data and
316
+ have leaves containing variables, so we move the node's data to a leaf
317
+ named ``node/All`` (see the example of
318
+ ``Derived/Number_Density/All`` in the table below).
319
+
320
+ Below are some examples of how variable names are translated from the
321
+ regular `xarray.open_dataset` result into their more traditional names.
322
+
323
+ =================================== ===================================
324
+ Dataset variable name DataTree variable name
325
+ =================================== ===================================
326
+ ``Derived_Number_Density`` ``Derived/Number_Density/All``
327
+ ``Derived_Number_Density_Electron`` ``Derived/Number_Density/Electron``
328
+ ``Derived_Number_Density_Ion`` ``Derived/Number_Density/Ion``
329
+ ``Derived_Number_Density_Photon`` ``Derived/Number_Density/Photon``
330
+ ``Derived_Average_Particle_Energy`` ``Derived/Average_Particle_Energy``
331
+ =================================== ===================================
332
+
333
+ Parameters
334
+ ----------
335
+ path
336
+ The path to the SDF file
337
+ keep_particles
338
+ If ``True``, also load particle data (this may use a lot of memory!)
339
+ probe_names
340
+ List of EPOCH probe names
341
+
342
+ Examples
343
+ --------
344
+ >>> dt = open_datatree("0000.sdf")
345
+ >>> dt["Electric_Field"]["Ex"].values # Access all Electric_Field_Ex data
346
+ """
347
+
348
+ return xr.open_datatree(
349
+ path, keep_particles=keep_particles, probe_names=probe_names
350
+ )
351
+
352
+
353
+ def open_mfdatatree(
354
+ path_glob: Iterable | str | Path | Callable[..., Iterable[Path]],
355
+ *,
356
+ separate_times: bool = False,
357
+ keep_particles: bool = False,
358
+ probe_names: list[str] | None = None,
359
+ data_vars: list[str] | None = None,
360
+ ) -> xr.DataTree:
361
+ """Open a set of EPOCH SDF files as one `xarray.DataTree`
362
+
363
+ EPOCH can output variables at different periods, so each individual
364
+ SDF file from one EPOCH run may have different variables in it. In
365
+ order to combine all files into one `xarray.Dataset`, we need to
366
+ concatenate variables across their time dimension.
367
+
368
+ We have two choices:
369
+
370
+ 1. One time dimension where some variables may not be defined at all time
371
+ points, and so will be filled with NaNs at missing points; or
372
+ 2. Multiple time dimensions, one for each output frequency
373
+
374
+ The second option is better for memory consumption, as the missing data with
375
+ the first option still takes up space. However, proper lazy-loading may
376
+ mitigate this.
377
+
378
+ The ``separate_times`` argument can be used to switch between these choices.
379
+
380
+ An `xarray.DataTree` is constructed utilising the original names in the SDF
381
+ file. This is because these names include slashes, which `xarray`
382
+ can use to automatically build up a datatree. We additionally replace
383
+ spaces with underscores to be more Pythonic. The original flat
384
+ `xarray.Dataset` variable name is kept under ``attrs["flat_structure_name"]`` for reference.
385
+
386
+ This function combines multiple SDF files into a single `xarray.DataTree` with a
387
+ unified time dimension and hierarchical organization of variables.
388
+
389
+ In some cases the user may output the ``always + species`` dumpmask, which
390
+ means that the SDF variable will have per-species data plus a general one. When
391
+ defining an `xarray.DataTree`, a node cannot both hold variable data and
392
+ have leaves containing variables, so we move the node's data to a leaf
393
+ named ``node/All`` (see the example of
394
+ ``Derived/Number_Density/All`` in the table below).
395
+
396
+ Below are some examples of how variable names are translated from the
397
+ regular `xarray.open_dataset` result into their more traditional names.
398
+
399
+ =================================== ===================================
400
+ Dataset variable name DataTree variable name
401
+ =================================== ===================================
402
+ ``Derived_Number_Density`` ``Derived/Number_Density/All``
403
+ ``Derived_Number_Density_Electron`` ``Derived/Number_Density/Electron``
404
+ ``Derived_Number_Density_Ion`` ``Derived/Number_Density/Ion``
405
+ ``Derived_Number_Density_Photon`` ``Derived/Number_Density/Photon``
406
+ ``Derived_Average_Particle_Energy`` ``Derived/Average_Particle_Energy``
407
+ =================================== ===================================
408
+
409
+ Parameters
410
+ ----------
411
+ path_glob
412
+ List of filenames or string glob pattern
413
+ separate_times
414
+ If ``True``, create separate time dimensions for variables defined at
415
+ different output frequencies
416
+ keep_particles
417
+ If ``True``, also load particle data (this may use a lot of memory!)
418
+ probe_names
419
+ List of EPOCH probe names
420
+ data_vars
421
+ List of data vars to load in (If not specified loads in all variables)
422
+
423
+ Examples
424
+ --------
425
+ >>> dt = open_mfdatatree("*.sdf")
426
+ >>> dt["Electric_Field"]["Ex"].values # Access all Electric_Field_Ex data
427
+ >>> dt.coords["time"].values # Access combined time dimension
428
+ """
429
+ # First, combine the datasets as usual
430
+ combined_ds = open_mfdataset(
431
+ path_glob,
432
+ separate_times=separate_times,
433
+ keep_particles=keep_particles,
434
+ probe_names=probe_names,
435
+ data_vars=data_vars,
436
+ )
437
+
438
+ return _build_datatree_from_dataset(combined_ds)
439
+
440
+
441
+ def make_time_dims(path_glob):
442
+ """Extract the distinct set of time arrays from a collection of
443
+ SDF files, along with a mapping from variable names to their time
444
+ dimension.
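+ 
+ For example (illustrative), if ``Electric_Field_Ex`` is written at every dump
+ but particle data only at every other dump, each variable is mapped to one of
+ two dimension names (``"time0"``, ``"time1"``), one per distinct set of output times.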
445
+ """
446
+ # Map variable names to list of times
447
+ vars_count = defaultdict(list)
448
+ for f in path_glob:
449
+ with SDFFile(str(f)) as sdf_file:
450
+ for key in sdf_file.variables:
451
+ vars_count[_rename_with_underscore(key)].append(sdf_file.header["time"])
452
+ for grid in sdf_file.grids.values():
453
+ vars_count[_rename_with_underscore(grid.name)].append(
454
+ sdf_file.header["time"]
455
+ )
456
+
457
+ # Count the unique set of lists of times
458
+ times_count = Counter(tuple(v) for v in vars_count.values())
459
+
460
+ # Give each set of times a unique name
461
+ time_dims = {}
462
+ for count, t in enumerate(times_count):
463
+ time_dims[f"time{count}"] = t
464
+
465
+ # Map each variable to the name of its time dimension
466
+ var_times_map = {}
467
+ for key, value in vars_count.items():
468
+ v_tuple = tuple(value)
469
+ for time_name, time_dim in time_dims.items():
470
+ if v_tuple == time_dim:
471
+ var_times_map[key] = time_name
472
+ break
473
+ else:
474
+ raise ValueError(f"Didn't find time dim for {key!r} with {v_tuple}")
475
+
476
+ return time_dims, var_times_map
477
+
478
+
479
+ class SDFBackendArray(BackendArray):
480
+ """Adapater class required for lazy loading"""
481
+
482
+ __slots__ = ("datastore", "dtype", "shape", "variable_name")
483
+
484
+ def __init__(self, variable_name, datastore, shape, dtype):
485
+ self.datastore = datastore
486
+ self.variable_name = variable_name
487
+ self.shape = shape
488
+ self.dtype = dtype
489
+
490
+ def get_array(self, needs_lock=True):
491
+ with self.datastore.acquire_context(needs_lock) as ds:
492
+ return ds.variables[self.variable_name]
493
+
494
+ def __getitem__(self, key: indexing.ExplicitIndexer) -> np.typing.ArrayLike:
495
+ return indexing.explicit_indexing_adapter(
496
+ key,
497
+ self.shape,
498
+ indexing.IndexingSupport.OUTER,
499
+ self._raw_indexing_method,
500
+ )
501
+
502
+ def _raw_indexing_method(self, key: tuple) -> np.typing.ArrayLike:
503
+ # Thread-safe method that accesses the data on disk
504
+ with self.datastore.acquire_context():
505
+ original_array = self.get_array(needs_lock=False)
506
+ return original_array.data[key]
507
+
508
+
509
+ class SDFDataStore(AbstractDataStore):
510
+ """Store for reading and writing data via the SDF library."""
511
+
512
+ __slots__ = (
513
+ "_filename",
514
+ "_manager",
515
+ "drop_variables",
516
+ "keep_particles",
517
+ "lock",
518
+ "probe_names",
519
+ )
520
+
521
+ def __init__(
522
+ self,
523
+ manager,
524
+ drop_variables=None,
525
+ keep_particles=False,
526
+ lock=None,
527
+ probe_names=None,
528
+ ):
529
+ self._manager = manager
530
+ self._filename = self.ds.filename
531
+ self.drop_variables = drop_variables
532
+ self.keep_particles = keep_particles
533
+ self.lock = ensure_lock(lock)
534
+ self.probe_names = probe_names
535
+
536
+ @classmethod
537
+ def open(
538
+ cls,
539
+ filename,
540
+ lock=None,
541
+ drop_variables=None,
542
+ keep_particles=False,
543
+ probe_names=None,
544
+ ):
545
+ if isinstance(filename, os.PathLike):
546
+ filename = os.fspath(filename)
547
+
548
+ manager = CachingFileManager(SDFFile, filename, lock=lock)
549
+ return cls(
550
+ manager,
551
+ lock=lock,
552
+ drop_variables=drop_variables,
553
+ keep_particles=keep_particles,
554
+ probe_names=probe_names,
555
+ )
556
+
557
+ def _acquire(self, needs_lock=True):
558
+ with self._manager.acquire_context(needs_lock) as ds:
559
+ return ds
560
+
561
+ @property
562
+ def ds(self):
563
+ return self._acquire()
564
+
565
+ def acquire_context(self, needs_lock=True):
566
+ return self._manager.acquire_context(needs_lock)
567
+
568
+ def load(self): # noqa: PLR0912, PLR0915
569
+ # Drop any requested variables
570
+ if self.drop_variables:
571
+ # Build a mapping from underscored names to real variable names
572
+ name_map = {_rename_with_underscore(var): var for var in self.ds.variables}
573
+
574
+ for variable in self.drop_variables:
575
+ key = _rename_with_underscore(variable)
576
+ original_name = name_map.get(key)
577
+
578
+ if original_name is None:
579
+ raise KeyError(
580
+ f"Variable '{variable}' not found (interpreted as '{key}')."
581
+ )
582
+ self.ds.variables.pop(original_name)
583
+
584
+ # These two dicts are global metadata about the run or file
585
+ attrs = {**self.ds.header, **self.ds.run_info}
586
+
587
+ data_vars = {}
588
+ coords = {}
589
+
590
+ def _norm_grid_name(grid_name: str) -> str:
591
+ """There may be multiple grids all with the same coordinate names, so
592
+ drop the "Grid/" from the start, and append the rest to the
593
+ dimension name. This lets us disambiguate them all. Probably"""
594
+ return grid_name.split("/", maxsplit=1)[-1]
595
+
596
+ def _grid_species_name(grid_name: str) -> str:
597
+ return grid_name.split("/")[-1]
598
+
599
+ def _process_grid_name(grid_name: str, transform_func) -> str:
600
+ """Apply the given transformation function and then rename with underscores."""
601
+ transformed_name = transform_func(grid_name)
602
+ return _rename_with_underscore(transformed_name)
603
+
604
+ for key, value in self.ds.grids.items():
605
+ if "cpu" in key.lower():
606
+ # Had some problems with these variables, so just ignore them for now
607
+ continue
608
+
609
+ if not self.keep_particles and value.is_point_data:
610
+ continue
611
+
612
+ base_name = _process_grid_name(value.name, _norm_grid_name)
613
+
614
+ for label, coord, unit in zip(value.labels, value.data, value.units):
615
+ full_name = f"{label}_{base_name}"
616
+ dim_name = (
617
+ f"ID_{_process_grid_name(key, _grid_species_name)}"
618
+ if value.is_point_data
619
+ else full_name
620
+ )
621
+ coords[full_name] = (
622
+ dim_name,
623
+ coord,
624
+ {
625
+ "long_name": label.replace("_", " "),
626
+ "units": unit,
627
+ "point_data": value.is_point_data,
628
+ "full_name": value.name,
629
+ },
630
+ )
631
+
632
+ # Read and convert SDF variables and meshes to xarray DataArrays and Coordinates
633
+ for key, value in self.ds.variables.items():
634
+ # Had some problems with these variables, so just ignore them for now
635
+ if "cpu" in key.lower():
636
+ continue
637
+ if "boundary" in key.lower():
638
+ continue
639
+ if "output file" in key.lower():
640
+ continue
641
+
642
+ if not self.keep_particles and value.is_point_data:
643
+ continue
644
+
645
+ if isinstance(value, Constant) or value.grid is None:
646
+ # We don't have a grid, either because it's just a
647
+ # scalar, or because it's an array over something
648
+ # else. We have no more information, so just make up
649
+ # some (hopefully) unique dimension names
650
+ shape = getattr(value.data, "shape", ())
651
+ dims = [f"dim_{key}_{n}" for n, _ in enumerate(shape)]
652
+ base_name = _rename_with_underscore(key)
653
+
654
+ data_attrs = {}
655
+ data_attrs["full_name"] = key
656
+ data_attrs["long_name"] = base_name.replace("_", " ")
657
+ if value.units is not None:
658
+ data_attrs["units"] = value.units
659
+
660
+ var = Variable(dims, value.data, attrs=data_attrs)
661
+
662
+ # Provide preferred_chunks for constants so dask aligns to natural shapes
663
+ var.encoding["preferred_chunks"] = dict(zip(dims, shape))
664
+
665
+ data_vars[base_name] = var
666
+ continue
667
+
668
+ if value.is_point_data:
669
+ # Point (particle) variables are 1D
670
+
671
+ # Particle data does not maintain a fixed dimension size
672
+ # throughout the simulation. An example of a particle name comes
673
+ # in the form of `Particles/Px/Ion_H` which is then modified
674
+ # using `_process_grid_name()` into `Ion_H`. This is fine as the
675
+ # other components of the momentum (`Py`, `Pz`) will have the same
676
+ # size as they represent the same bunch of particles.
677
+
678
+ # Probes however have names in the form of `Electron_Front_Probe/Px`
679
+ # which are changed to just `Px`; this is fine when there is only one
680
+ # probe in the system, but with multiple probes the sizes would conflict,
681
+ # so we can't keep the name as simply `Px` and instead use the full name
682
+ # `Electron_Front_Probe_Px` as the dimension.
683
+ is_probe_name_match = self.probe_names is not None and any(
684
+ name in key for name in self.probe_names
685
+ )
686
+ name_processor = (
687
+ _rename_with_underscore
688
+ if is_probe_name_match
689
+ else _grid_species_name
690
+ )
691
+ var_coords = (f"ID_{_process_grid_name(key, name_processor)}",)
692
+ else:
693
+ # These are DataArrays
694
+
695
+ # SDF makes matching up the coordinates a bit convoluted. Each
696
+ # dimension on a variable can be defined either on "grid" or
697
+ # "grid_mid", and the only way to tell which one is to compare the
698
+ # variable's dimension sizes for each grid. We do this by making a
699
+ # nested dict that looks something like:
700
+ #
701
+ # {"X": {129: "X_Grid", 129: "X_Grid_mid"}}
702
+ #
703
+ # Then we can look up the dimension label and size to get *our* name
704
+ # for the corresponding coordinate
705
+ dim_size_lookup = defaultdict(dict)
706
+ grid = self.ds.grids[value.grid]
707
+ grid_base_name = _process_grid_name(grid.name, _norm_grid_name)
708
+ for dim_size, dim_name in zip(grid.shape, grid.labels):
709
+ dim_size_lookup[dim_name][dim_size] = f"{dim_name}_{grid_base_name}"
710
+
711
+ grid_mid = self.ds.grids[value.grid_mid]
712
+ grid_mid_base_name = _process_grid_name(grid_mid.name, _norm_grid_name)
713
+ for dim_size, dim_name in zip(grid_mid.shape, grid_mid.labels):
714
+ dim_size_lookup[dim_name][dim_size] = (
715
+ f"{dim_name}_{grid_mid_base_name}"
716
+ )
717
+
718
+ var_coords = [
719
+ dim_size_lookup[dim_name][dim_size]
720
+ for dim_name, dim_size in zip(grid.labels, value.shape)
721
+ ]
722
+
723
+ # TODO: error handling here? other attributes?
724
+ base_name = _rename_with_underscore(key)
725
+ long_name = _process_latex_name(base_name.replace("_", " "))
726
+ data_attrs = {
727
+ "units": value.units,
728
+ "point_data": value.is_point_data,
729
+ "full_name": key,
730
+ "long_name": long_name,
731
+ }
732
+ lazy_data = indexing.LazilyIndexedArray(
733
+ SDFBackendArray(key, self, shape=value.shape, dtype=value.data.dtype)
734
+ )
735
+ var = Variable(var_coords, lazy_data, data_attrs)
736
+ # Set preferred chunks to match on-disk layout
737
+ # For point data (1D): full dimension
738
+ # For grid data (N-D): individual grid chunk sizes
739
+ if value.is_point_data:
740
+ var.encoding["preferred_chunks"] = {var_coords[0]: len(value.data)}
741
+ else:
742
+ # Align with on-disk grid structure
743
+ chunk_dict = {}
744
+ for dim_name, size in zip(var_coords, value.shape):
745
+ # Use natural on-disk boundaries
746
+ chunk_dict[dim_name] = size
747
+ var.encoding["preferred_chunks"] = chunk_dict
748
+
749
+ data_vars[base_name] = var
750
+
751
+ # TODO: might need to decode if mult is set?
752
+
753
+ # # see also conventions.decode_cf_variables
754
+ # vars, attrs, coords = my_decode_variables(
755
+ # vars, attrs, decode_times, decode_timedelta, decode_coords
756
+ # )
757
+
758
+ ds = xr.Dataset(data_vars, attrs=attrs, coords=coords)
759
+ ds.set_close(self.ds.close)
760
+
761
+ return ds
762
+
763
+ def close(self, **kwargs):
764
+ self._manager.close(**kwargs)
765
+
766
+
767
+ class SDFEntrypoint(BackendEntrypoint):
768
+ supports_groups = True
769
+ open_dataset_parameters: ClassVar[list[str]] = [
770
+ "filename_or_obj",
771
+ "drop_variables",
772
+ "keep_particles",
773
+ "probe_names",
774
+ ]
775
+
776
+ def open_dataset(
777
+ self,
778
+ filename_or_obj,
779
+ *,
780
+ drop_variables=None,
781
+ keep_particles=False,
782
+ probe_names=None,
783
+ ):
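+ # Illustrative: this backend entry point lets xarray open SDF files directly,
+ # e.g. xr.open_dataset("0000.sdf", keep_particles=True),
+ # with file-type detection handled by ``guess_can_open`` below.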
784
+ if isinstance(filename_or_obj, Path):
785
+ # sdf library takes a filename only
786
+ # TODO: work out if we need to deal with file handles
787
+ filename_or_obj = str(filename_or_obj)
788
+
789
+ store = SDFDataStore.open(
790
+ filename_or_obj,
791
+ drop_variables=drop_variables,
792
+ keep_particles=keep_particles,
793
+ probe_names=probe_names,
794
+ )
795
+ with close_on_error(store):
796
+ return store.load()
797
+
798
+ open_datatree_parameters: ClassVar[list[str]] = [
799
+ "filename_or_obj",
800
+ "drop_variables",
801
+ "keep_particles",
802
+ "probe_names",
803
+ ]
804
+
805
+ def open_datatree(
806
+ self,
807
+ filename_or_obj,
808
+ *,
809
+ drop_variables=None,
810
+ keep_particles=False,
811
+ probe_names=None,
812
+ ):
813
+ ds = self.open_dataset(
814
+ filename_or_obj,
815
+ drop_variables=drop_variables,
816
+ keep_particles=keep_particles,
817
+ probe_names=probe_names,
818
+ )
819
+ return _build_datatree_from_dataset(ds)
820
+
821
+ def guess_can_open(self, filename_or_obj):
822
+ magic_number = try_read_magic_number_from_path(filename_or_obj)
823
+ if magic_number is not None:
824
+ return magic_number.startswith(b"SDF1")
825
+
826
+ return Path(filename_or_obj).suffix in {".sdf", ".SDF"}
827
+
828
+ description = "Use .sdf files in Xarray"
829
+
830
+ url = "https://epochpic.github.io/documentation/visualising_output/python_beam.html"
831
+
832
+
833
+ class XrTUIEntrpoint:
834
+ def open_mfdatatree(self, paths: list[Path]) -> xr.DataTree:
835
+ return open_mfdatatree(paths)
836
+
837
+
838
+ class SDFPreprocess:
839
+ """Preprocess SDF files for xarray ensuring matching job ids and sets
840
+ time dimension.
841
+
842
+ This class is used as a 'preprocess' function within ``xr.open_mfdataset``. It
843
+ performs three main duties on each individual file's Dataset:
844
+
845
+ 1. Checks for a **matching job ID** across all files to ensure dataset consistency.
846
+ 2. **Filters** the Dataset to keep only the variables specified in `data_vars`
847
+ and their required coordinates.
848
+ 3. **Expands dimensions** to include a single 'time' coordinate, preparing the
849
+ Dataset for concatenation.
850
+
851
+ EPOCH can output variables at different intervals, so some SDF files
852
+ may not contain the requested variable. We combine this data into one
853
+ dataset by concatenating across the time dimension.
854
+
855
+ The combination is performed using ``join="outer"`` (in the calling ``open_mfdataset`` function),
856
+ meaning that the final combined dataset will contain the variable across the
857
+ entire time span, with NaNs filling the time steps where the variable was absent in
858
+ the individual file.
859
+
860
+ With large SDF files, this filtering method will save on memory consumption when
861
+ compared to loading all variables from all files before concatenation.
862
+
863
+ Parameters
864
+ ----------
865
+ data_vars :
866
+ A list of data variables to load in (If not specified loads
867
+ in all variables)
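+ 
+ Examples
+ --------
+ Illustrative usage (normally handled for you by `open_mfdataset`):
+ 
+ >>> ds = xr.open_mfdataset(
+ ...     "*.sdf",
+ ...     combine="nested",
+ ...     concat_dim="time",
+ ...     preprocess=SDFPreprocess(data_vars=["Electric_Field_Ex"]),
+ ... )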
868
+ """
869
+
870
+ def __init__(
871
+ self,
872
+ data_vars: list[str] | None = None,
873
+ ):
874
+ self.job_id: int | None = None
875
+ self.data_vars = data_vars
876
+
877
+ def __call__(self, ds: xr.Dataset) -> xr.Dataset:
878
+ if self.job_id is None:
879
+ self.job_id = ds.attrs["jobid1"]
880
+
881
+ if self.job_id != ds.attrs["jobid1"]:
882
+ raise ValueError(
883
+ f"Mismatching job ids (got {ds.attrs['jobid1']}, expected {self.job_id})"
884
+ )
885
+
886
+ # If the user has exclusively requested only certain variables be
887
+ # loaded in then we purge all other variables and coordinates
888
+ if self.data_vars:
889
+ ds = purge_unselected_data_vars(ds, self.data_vars)
890
+
891
+ time_val = ds.attrs.get("time", np.nan)
892
+ ds = ds.expand_dims(time=[time_val])
893
+ ds = ds.assign_coords(
894
+ time=(
895
+ "time",
896
+ [time_val],
897
+ {"units": "s", "long_name": "Time", "full_name": "time"},
898
+ )
899
+ )
900
+ # Particles' spatial coordinates also evolve in time
901
+ for coord, value in ds.coords.items():
902
+ if value.attrs.get("point_data", False):
903
+ ds.coords[coord] = value.expand_dims(time=[time_val])
904
+
905
+ return ds