dcnum 0.23.2-py3-none-any.whl → 0.25.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dcnum might be problematic.

dcnum/read/hdf5_data.py CHANGED
@@ -102,10 +102,12 @@ class HDF5Data:
         elif (feat in self.h5["events"]
                 and len(self.h5["events"][feat].shape) == 1):  # cache scalar
             if self.index_mapping is None:
-                idx_map = slice(None)  # no mapping indices, just slice
+                # no mapping indices, just slice
+                dat_sc = self.h5["events"][feat][:]
             else:
-                idx_map = get_mapping_indices(self.index_mapping)
-            self._cache_scalar[feat] = self.h5["events"][feat][idx_map]
+                dat_sc = get_mapped_object(self.h5["events"][feat],
+                                           index_mapping=self.index_mapping)[:]
+            self._cache_scalar[feat] = dat_sc
             return self._cache_scalar[feat]
         else:
             if feat in self.h5["events"]:
@@ -117,9 +119,11 @@ class HDF5Data:
         else:
             # Check the basins
             for idx in range(len(self.basins)):
-                bn, bn_features = self.get_basin_data(idx)
-                if bn_features and feat in bn_features:
-                    return bn[feat]  # already index-mapped
+                bn_grp, bn_feats, bn_map = self.get_basin_data(idx)
+                if bn_feats and feat in bn_feats:
+                    mapped_ds = get_mapped_object(obj=bn_grp[feat],
+                                                  index_mapping=bn_map)
+                    return mapped_ds
             # If we got here, then the feature data does not exist.
             raise KeyError(f"Feature '{feat}' not found in {self}!")
 
@@ -200,12 +204,7 @@ class HDF5Data:
                     np.array(h5["tables"][tab][tkey]).reshape(-1)
                 self.tables[tab] = tabdict
         # basins
-        basins = []
-        for bnkey in h5.get("basins", {}).keys():
-            bn_data = "\n".join(
-                [s.decode() for s in h5["basins"][bnkey][:].tolist()])
-            bn_dict = json.loads(bn_data)
-            basins.append(bn_dict)
+        basins = self.extract_basin_dicts(h5)
         self.basins = sorted(basins, key=lambda x: x["name"])
 
         if state["pixel_size"] is not None:
@@ -271,6 +270,30 @@ class HDF5Data:
         pixel_size = float(f"{pixel_size:.8f}")
         self.meta["imaging:pixel size"] = pixel_size
 
+    @staticmethod
+    def extract_basin_dicts(h5, check=True):
+        """Return list of basin dictionaries"""
+        # TODO:
+        #  - support iterative mapped basins and catch
+        #    circular basin definitions.
+        basins = []
+        for bnkey in h5.get("basins", {}).keys():
+            bn_data = "\n".join(
+                [s.decode() for s in h5["basins"][bnkey][:].tolist()])
+            bn_dict = json.loads(bn_data)
+            if check:
+                if bn_dict["type"] not in ["internal", "file"]:
+                    # we only support file-based and internal basins
+                    continue
+                basinmap = bn_dict.get("mapping")
+                if basinmap is not None and basinmap not in h5["events"]:
+                    # basinmap feature is missing
+                    continue
+            # Add the basin
+            basins.append(bn_dict)
+
+        return basins
+
     @property
     def features_scalar_frame(self):
         """Scalar features that apply to all events in a frame
@@ -289,9 +312,10 @@ class HDF5Data:
 
     def close(self):
         """Close the underlying HDF5 file"""
-        for bn, _ in self._basin_data.values():
-            if bn is not None:
-                bn.close()
+        for bn_group, _, _ in self._basin_data.values():
+            if bn_group is not None:
+                if bn_group.id.valid:
+                    bn_group.file.close()
         self._image_cache.clear()
         self._basin_data.clear()
         self.h5.close()
@@ -369,66 +393,110 @@ class HDF5Data:
             raise ValueError(f"Invalid parameter '{var}'!")
         return kwargs
 
-    def get_basin_data(self, index):
+    def get_basin_data(self, index: int) -> (
+            h5py.Group,
+            List,
+            int | slice | List | np.ndarray,
+    ):
         """Return HDF5Data info for a basin index in `self.basins`
 
+        Parameters
+        ----------
+        index: int
+            index of the basin from which to get data
+
         Returns
         -------
-        data: HDF5Data
-            Data instance
+        group: h5py.Group
+            HDF5 group containing HDF5 Datasets with the names
+            listed in `features`
         features: list of str
-            List of features made available by this data instance
+            list of features made available by this basin
+        index_mapping:
+            a mapping (see `__init__`) that defines mapping from
+            the basin dataset to the referring dataset
         """
         if index not in self._basin_data:
             bn_dict = self.basins[index]
-            for ff in bn_dict["paths"]:
-                pp = pathlib.Path(ff)
-                if pp.is_absolute() and pp.exists():
-                    path = pp
-                    break
-                else:
-                    # try relative path
-                    prel = pathlib.Path(self.path).parent / pp
-                    if prel.exists():
-                        path = prel
-                        break
+
+            # HDF5 group containing the feature data
+            if bn_dict["type"] == "file":
+                h5group, features = self._get_basin_data_file(bn_dict)
+            elif bn_dict["type"] == "internal":
+                h5group, features = self._get_basin_data_internal(bn_dict)
             else:
-                path = None
-            if path is None:
-                self._basin_data[index] = (None, None)
+                raise ValueError(f"Invalid basin type '{bn_dict['type']}'")
+
+            # index mapping
+            feat_basinmap = bn_dict.get("mapping", None)
+            if feat_basinmap is None:
+                # This is NOT a mapped basin.
+                index_mapping = self.index_mapping
             else:
-                feat_basinmap = bn_dict.get("mapping", None)
-                if feat_basinmap is None:
-                    # This is NOT a mapped basin.
-                    index_mapping = self.index_mapping
+                # This is a mapped basin. Create an indexing list.
+                if self.index_mapping is None:
+                    # The current dataset is not mapped.
+                    basinmap_idx = slice(None)
                 else:
-                    # This is a mapped basin. Create an indexing list.
-                    if self.index_mapping is None:
-                        # The current dataset is not mapped.
-                        basinmap_idx = slice(None)
-                    else:
-                        # The current dataset is also mapped.
-                        basinmap_idx = get_mapping_indices(self.index_mapping)
-                    basinmap = self.h5[f"events/{feat_basinmap}"]
-                    index_mapping = basinmap[basinmap_idx]
-
-                h5dat = HDF5Data(path, index_mapping=index_mapping)
-                features = bn_dict.get("features")
-                if features is None:
-                    # Only get the features from the actual HDF5 file.
-                    # If this file has basins as well, the basin metadata
-                    # should have been copied over to the parent file. This
-                    # makes things a little cleaner, because basins are not
-                    # nested, but all basins are available in the top file.
-                    # See :func:`write.store_metadata` for copying metadata
-                    # between files.
-                    # The writer can still specify "features" in the basin
-                    # metadata, then these basins are indeed nested, and
-                    # we consider that ok as well.
-                    features = sorted(h5dat.h5["events"].keys())
-                self._basin_data[index] = (h5dat, features)
+                    # The current dataset is also mapped.
+                    basinmap_idx = get_mapping_indices(self.index_mapping)
+                basinmap = self.h5[f"events/{feat_basinmap}"]
+                index_mapping = basinmap[basinmap_idx]
+
+            self._basin_data[index] = (h5group, features, index_mapping)
         return self._basin_data[index]
 
+    def _get_basin_data_file(self, bn_dict):
+        for ff in bn_dict["paths"]:
+            pp = pathlib.Path(ff)
+            if pp.is_absolute() and pp.exists():
+                path = pp
+                break
+            else:
+                # try relative path
+                prel = pathlib.Path(self.path).parent / pp
+                if prel.exists():
+                    path = prel
+                    break
+        else:
+            path = None
+        if path is None:
+            # Cannot get data from this basin / cannot find file
+            h5group = None
+            features = []
+        else:
+            h5 = h5py.File(path, "r")
+            h5group = h5["events"]
+            # features defined in the basin
+            features = bn_dict.get("features")
+            if features is None:
+                # Only get the features from the actual HDF5 file.
+                # If this file has basins as well, the basin metadata
+                # should have been copied over to the parent file. This
+                # makes things a little cleaner, because basins are not
+                # nested, but all basins are available in the top file.
+                # See :func:`write.store_metadata` for copying metadata
+                # between files.
+                # The writer can still specify "features" in the basin
+                # metadata, then these basins are indeed nested, and
+                # we consider that ok as well.
+                features = sorted(h5group.keys())
+        return h5group, features
+
+    def _get_basin_data_internal(self, bn_dict):
+        # The group name is normally "basin_events"
+        group_name = bn_dict["paths"][0]
+        if group_name != "basin_events":
+            warnings.warn(
+                f"Uncommon group name for basin features: {group_name}")
+        h5group = self.h5[group_name]
+        features = bn_dict.get("features")
+        if features is None:
+            raise ValueError(
+                f"Encountered invalid internal basin '{bn_dict}': "
+                f"'features' must be defined")
+        return h5group, features
+
     def get_image_cache(self, feat):
         """Create an HDF5ImageCache object for the current dataset
 
@@ -442,15 +510,15 @@ class HDF5Data:
         idx_map = None
         # search all basins
         for idx in range(len(self.basins)):
-            bn_dat, features = self.get_basin_data(idx)
-            if features is not None:
-                if feat in features:
+            bn_grp, bn_feats, bn_map = self.get_basin_data(idx)
+            if bn_feats is not None:
+                if feat in bn_feats:
                     # HDF5 dataset
-                    ds = bn_dat.h5[f"events/{feat}"]
+                    ds = bn_grp[feat]
                     # Index mapping (taken from the basins which
                     # already includes the mapping from the current
                     # instance).
-                    idx_map = bn_dat.index_mapping
+                    idx_map = bn_map
                     break
         else:
             ds = None
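With this refactoring, `get_basin_data` no longer returns a nested `HDF5Data` instance but a plain `h5py.Group`, the list of basin features, and the index mapping to apply. A rough usage sketch, assuming `HDF5Data` is importable from `dcnum.read`; the file path and feature name are placeholders:

from dcnum.read import HDF5Data  # assumed import path

data = HDF5Data("measurement.rtdc")  # placeholder path
for idx in range(len(data.basins)):
    bn_grp, bn_feats, bn_map = data.get_basin_data(idx)
    if bn_feats and "deform" in bn_feats:
        # bn_grp is an h5py.Group; bn_map already folds in the index
        # mapping of the referring dataset (or is a plain slice/None).
        print(bn_grp["deform"].shape)
        break
data.close()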
@@ -471,9 +539,9 @@ class HDF5Data:
         features = sorted(self.h5["/events"].keys())
         # add basin features
         for ii in range(len(self.basins)):
-            _, bfeats = self.get_basin_data(ii)
-            if bfeats:
-                features += bfeats
+            _, bn_feats, _ = self.get_basin_data(ii)
+            if bn_feats:
+                features += bn_feats
         self._keys = sorted(set(features))
         return self._keys
 
dcnum/read/mapped.py CHANGED
@@ -27,8 +27,21 @@ class MappedHDF5Dataset:
         if isinstance(idx, numbers.Integral):
             return self.h5ds[self.mapping_indices[idx]]
         else:
-            idx_mapped = self.mapping_indices[idx]
-            return self.h5ds[idx_mapped]
+            midx = self.mapping_indices[idx]
+            start = np.min(midx)
+            # Add one, because the final index must be included
+            stop = np.max(midx) + 1
+            # We have to perform mapping.
+            # Since h5py is very slow at indexing with arrays,
+            # we instead read the data in chunks from the input file,
+            # and perform the mapping afterward using the numpy arrays.
+            data_in = self.h5ds[start:stop]
+            # Determine the indices that we need from that chunk.
+            data = data_in[midx - start]
+            return data
+
+    def __len__(self):
+        return self.shape[0]
 
 
 def get_mapping_indices(
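The rewritten `__getitem__` avoids h5py's slow fancy indexing: it reads one contiguous slab covering all requested mapping indices and then selects from that slab in memory with numpy. A minimal, self-contained illustration of the same trick (plain numpy stands in for the h5py dataset):

import numpy as np

h5ds = np.arange(1000) * 2          # stand-in for self.h5ds
midx = np.array([10, 12, 17, 40])   # mapping indices for the request

start = np.min(midx)
stop = np.max(midx) + 1             # add one to include the final index
data_in = h5ds[start:stop]          # one contiguous read from the file
data = data_in[midx - start]        # fancy indexing happens in memory

assert np.array_equal(data, h5ds[midx])

The trade-off is that the slab spans min(midx) to max(midx), so widely scattered indices read more data than strictly needed.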
@@ -36,7 +36,10 @@ class SegmentTorchMPO(TorchSegmenterBase, MPOSegmenter):
         # Set number of pytorch threads to 1, because dcnum is doing
         # all the multiprocessing.
         # https://pytorch.org/docs/stable/generated/torch.set_num_threads.html#torch.set_num_threads
-        torch.set_num_threads(1)
+        if torch.get_num_threads() != 1:
+            torch.set_num_threads(1)
+        if torch.get_num_interop_threads() != 1:
+            torch.set_num_interop_threads(1)
         device = torch.device("cpu")
 
         # Load model and metadata
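Guarding the calls with `get_num_threads()`/`get_num_interop_threads()` avoids re-setting values that are already 1; this matters for `set_num_interop_threads`, which PyTorch only allows before inter-op parallel work has started, so an unconditional repeated call from a re-used worker could raise. A small sketch of the same guard in isolation (assuming torch is installed):

import torch

# Limit intra-op and inter-op parallelism to one thread each, e.g. when
# the surrounding application already spreads work across processes.
if torch.get_num_threads() != 1:
    torch.set_num_threads(1)
if torch.get_num_interop_threads() != 1:
    torch.set_num_interop_threads(1)

print(torch.get_num_threads(), torch.get_num_interop_threads())  # 1 1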
dcnum/write/__init__.py CHANGED
@@ -2,5 +2,5 @@
 from .deque_writer_thread import DequeWriterThread
 from .queue_collector_thread import EventStash, QueueCollectorThread
 from .writer import (
-    HDF5Writer, copy_features, copy_metadata, create_with_basins,
+    HDF5Writer, copy_basins, copy_features, copy_metadata, create_with_basins,
     set_default_filter_kwargs)
@@ -8,8 +8,6 @@ from typing import List
 
 import numpy as np
 
-from ..read import HDF5Data
-
 
 class EventStash:
     def __init__(self,
@@ -61,11 +59,10 @@ class EventStash:
             Event dictionary
         """
         idx_loc = index - self.index_offset
-        idx_stop = self.nev_idx[idx_loc]
-        self._tracker[idx_loc] = True
 
         if events:
             slice_loc = None
+            idx_stop = self.nev_idx[idx_loc]
             for feat in events:
                 dev = events[feat]
                 if dev.size:
@@ -76,6 +73,8 @@
             if slice_loc:
                 self.indices_for_data[slice_loc] = index
 
+        self._tracker[idx_loc] = True
+
     def require_feature(self, feat, sample_data):
         """Create a new empty feature array in `self.events` and return it
 
@@ -87,10 +86,10 @@ class EventStash:
             Sample data for one event of the feature (used to determine
             shape and dtype of the feature array)
         """
-        sample_data = np.array(sample_data)
-        event_shape = sample_data.shape
-        dtype = sample_data.dtype
         if feat not in self.events:
+            sample_data = np.array(sample_data)
+            event_shape = sample_data.shape
+            dtype = sample_data.dtype
             darr = np.zeros((self.size,) + tuple(event_shape),
                             dtype=dtype)
             self.events[feat] = darr
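Moving the `np.array(sample_data)` conversion under `if feat not in self.events` means shape and dtype are only inspected when the feature array actually has to be created. A stripped-down sketch of that lazy allocation; `events` and `size` are local stand-ins for the EventStash attributes:

import numpy as np

events = {}   # stand-in for EventStash.events
size = 100    # stand-in for EventStash.size

def require_feature(feat, sample_data):
    # Allocate the per-stash array only on first use of the feature.
    if feat not in events:
        sample_data = np.array(sample_data)
        events[feat] = np.zeros((size,) + sample_data.shape,
                                dtype=sample_data.dtype)
    return events[feat]

arr = require_feature("deform", 0.05)
print(arr.shape, arr.dtype)  # (100,) float64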
@@ -99,7 +98,6 @@
 
 class QueueCollectorThread(threading.Thread):
     def __init__(self,
-                 data: HDF5Data,
                  event_queue: mp.Queue,
                  writer_dq: deque,
                  feat_nevents: mp.Array,
@@ -115,9 +113,6 @@ class QueueCollectorThread(threading.Thread):
 
         Parameters
         ----------
-        data:
-            Data source object. This is used for appending additional
-            information
         event_queue:
             A queue object to which other processes or threads write
             events as tuples `(frame_index, events_dict)`.
@@ -146,8 +141,6 @@ class QueueCollectorThread(threading.Thread):
         super(QueueCollectorThread, self).__init__(
             name="QueueCollector", *args, **kwargs)
         self.logger = logging.getLogger("dcnum.write.QueueCollector")
-        #: HDF5 data instance
-        self.data = data
         #: Event queue from which to collect event data
         self.event_queue = event_queue
         #: Writer deque to which event arrays are appended
@@ -169,7 +162,7 @@ class QueueCollectorThread(threading.Thread):
         # We are not writing to `event_queue` so we can safely cancel
         # our queue thread if we are told to stop.
         self.event_queue.cancel_join_thread()
-        # Indexes the current frame in `self.data`.
+        # Indexes the current frame in the input HDF5Data instance.
         last_idx = 0
         self.logger.debug("Started collector thread")
         while True: