zea-0.0.7-py3-none-any.whl → zea-0.0.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- zea/__init__.py +1 -1
- zea/backend/tensorflow/dataloader.py +0 -4
- zea/beamform/pixelgrid.py +1 -1
- zea/data/__init__.py +0 -9
- zea/data/augmentations.py +221 -28
- zea/data/convert/__init__.py +1 -6
- zea/data/convert/__main__.py +123 -0
- zea/data/convert/camus.py +99 -39
- zea/data/convert/echonet.py +183 -82
- zea/data/convert/echonetlvh/README.md +2 -3
- zea/data/convert/echonetlvh/{convert_raw_to_usbmd.py → __init__.py} +173 -102
- zea/data/convert/echonetlvh/manual_rejections.txt +73 -0
- zea/data/convert/echonetlvh/precompute_crop.py +43 -64
- zea/data/convert/picmus.py +37 -40
- zea/data/convert/utils.py +86 -0
- zea/data/convert/{matlab.py → verasonics.py} +33 -61
- zea/data/data_format.py +124 -4
- zea/data/dataloader.py +12 -7
- zea/data/datasets.py +109 -70
- zea/data/file.py +91 -82
- zea/data/file_operations.py +496 -0
- zea/data/preset_utils.py +1 -1
- zea/display.py +7 -8
- zea/internal/checks.py +6 -12
- zea/internal/operators.py +4 -0
- zea/io_lib.py +108 -160
- zea/models/__init__.py +1 -1
- zea/models/diffusion.py +62 -11
- zea/models/lv_segmentation.py +2 -0
- zea/ops.py +398 -158
- zea/scan.py +18 -8
- zea/tensor_ops.py +82 -62
- zea/tools/fit_scan_cone.py +90 -160
- zea/tracking/__init__.py +16 -0
- zea/tracking/base.py +94 -0
- zea/tracking/lucas_kanade.py +474 -0
- zea/tracking/segmentation.py +110 -0
- zea/utils.py +11 -2
- {zea-0.0.7.dist-info → zea-0.0.8.dist-info}/METADATA +3 -1
- {zea-0.0.7.dist-info → zea-0.0.8.dist-info}/RECORD +43 -35
- {zea-0.0.7.dist-info → zea-0.0.8.dist-info}/WHEEL +0 -0
- {zea-0.0.7.dist-info → zea-0.0.8.dist-info}/entry_points.txt +0 -0
- {zea-0.0.7.dist-info → zea-0.0.8.dist-info}/licenses/LICENSE +0 -0
zea/data/data_format.py
CHANGED
```diff
@@ -6,6 +6,7 @@ import inspect
 from dataclasses import dataclass
 from pathlib import Path
 
+import h5py
 import numpy as np
 from keras.utils import pad_sequences
 
@@ -20,15 +21,15 @@ class DatasetElement:
     """Class to store a dataset element with a name, data, description and unit. Used to
     supply additional dataset elements to the generate_zea_dataset function."""
 
-    # The group name to store the dataset under. This can be a nested group, e.g.
-    # "scan/waveforms"
-    group_name: str
     # The name of the dataset. This will be the key in the group.
    dataset_name: str
     # The data to store in the dataset.
     data: np.ndarray
     description: str
     unit: str
+    # The group name to store the dataset under. This can be a nested group, e.g.
+    # "lens/profiles"
+    group_name: str = ""
 
 
 def generate_example_dataset(
@@ -111,9 +112,43 @@ def generate_example_dataset(
         focus_distances=focus_distances,
         polar_angles=polar_angles,
         azimuth_angles=azimuth_angles,
+        additional_elements=_generate_example_dataset_elements(),
+        description="This is an example dataset generated by zea",
     )
 
 
+def _generate_example_dataset_elements() -> list[DatasetElement]:
+    """Generates a list of example DatasetElement objects to be used as additional
+    elements in the generate_zea_dataset function.
+
+    Returns:
+        list: A list of DatasetElement objects.
+    """
+    example_elements = [
+        DatasetElement(
+            dataset_name="temperature",
+            data=np.array(42),
+            description="The temperature during the measurement",
+            unit="unitless",
+        ),
+        DatasetElement(
+            dataset_name="lens_profile",
+            data=np.random.rand(100),
+            description="An example lens profile",
+            unit="mm",
+            group_name="lens",
+        ),
+        DatasetElement(
+            dataset_name="lens_material",
+            data=np.array(["material1", "material2", "material3"], dtype=h5py.string_dtype()),
+            description="An example lens material list",
+            unit="unitless",
+            group_name="lens",
+        ),
+    ]
+    return example_elements
+
+
 def validate_input_data(raw_data, aligned_data, envelope_data, beamformed_data, image, image_sc):
     """
     Validates input data for generate_zea_dataset
@@ -498,9 +533,18 @@ def _write_datasets(
 
     # Add additional elements
     if additional_elements is not None:
+        # Write scan group
+        non_standard_elements_group_name = "non_standard_elements"
+        non_standard_elements_group = dataset.create_group(non_standard_elements_group_name)
+        non_standard_elements_group.attrs["description"] = (
+            "This group contains non-standard elements that can be added by the user."
+        )
         for element in additional_elements:
+            group_name = non_standard_elements_group_name
+            if element.group_name != "":
+                group_name += f"/{element.group_name}"
            _add_dataset(
-                group_name=element.group_name,
+                group_name=group_name,
                 name=element.dataset_name,
                 data=element.data,
                 description=element.description,
```
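Taken together with the example elements above, this writer hunk nests every user-supplied element under a dedicated `non_standard_elements` group, appending the element's optional `group_name` as a sub-path. A minimal inspection sketch of the resulting layout, assuming a file written with the example elements (`example.hdf5` is a placeholder path):

```python
import h5py

# "example.hdf5" is a hypothetical file written with the example elements above.
with h5py.File("example.hdf5", "r") as f:
    # visititems walks the group recursively, yielding paths relative to it.
    f["non_standard_elements"].visititems(
        lambda name, obj: print(name, "->", type(obj).__name__)
    )
    # Expected layout per the writer logic in this hunk (order may vary):
    #   lens               -> Group
    #   lens/lens_material -> Dataset
    #   lens/lens_profile  -> Dataset
    #   temperature        -> Dataset
```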
```diff
@@ -721,3 +765,79 @@ def generate_zea_dataset(
 
     validate_file(path)
     log.info(f"zea dataset written to {log.yellow(path)}")
+
+
+def load_description(path):
+    """Loads the description of a zea dataset.
+
+    Args:
+        path (str): The path to the zea dataset.
+
+    Returns:
+        str: The description of the dataset, or an empty string if not found.
+    """
+    path = Path(path)
+
+    with File(path, "r") as file:
+        description = file.attrs.get("description", "")
+
+    return description
+
+
+def load_additional_elements(path):
+    """Loads additional dataset elements from a zea dataset.
+
+    Args:
+        path (str): The path to the zea dataset.
+
+    Returns:
+        list: A list of DatasetElement objects.
+    """
+    path = Path(path)
+
+    with File(path, "r") as file:
+        if "non_standard_elements" not in file:
+            return []
+
+        additional_elements = _load_additional_elements_from_group(file, "non_standard_elements")
+
+    return additional_elements
+
+
+def _load_additional_elements_from_group(file, path):
+    """Recursively loads additional dataset elements from a group."""
+    elements = []
+    for name, item in file[path].items():
+        if isinstance(item, h5py.Dataset):
+            elements.append(_load_dataset_element_from_group(file, f"{path}/{name}"))
+        elif isinstance(item, h5py.Group):
+            elements.extend(_load_additional_elements_from_group(file, f"{path}/{name}"))
+    return elements
+
+
+def _load_dataset_element_from_group(file, path):
+    """Loads a specific dataset element from a group.
+
+    Args:
+        file (h5py.File): The HDF5 file object.
+        path (str): The full path to the dataset element,
+            e.g. "non_standard_elements/lens/lens_profile".
+
+    Returns:
+        DatasetElement: The loaded dataset element.
+    """
+
+    dataset = file[path]
+    description = dataset.attrs.get("description", "")
+    unit = dataset.attrs.get("unit", "")
+    data = dataset[()]
+
+    path_parts = path.split("/")
+
+    return DatasetElement(
+        dataset_name=path_parts[-1],
+        data=data,
+        description=description,
+        unit=unit,
+        group_name="/".join(path_parts[1:-1]),
+    )
```
zea/data/dataloader.py
CHANGED
```diff
@@ -5,7 +5,7 @@ H5 dataloader for loading images from zea datasets.
 import re
 from itertools import product
 from pathlib import Path
-from typing import List
+from typing import List, Tuple, Union
 
 import numpy as np
 
@@ -65,12 +65,12 @@ def generate_h5_indices(
         (
             "/folder/path_to_file.hdf5",
             "data/image",
-
+            (range(0, 1), slice(None, 256, None), slice(None, 256, None)),
         ),
         (
             "/folder/path_to_file.hdf5",
             "data/image",
-
+            (range(1, 2), slice(None, 256, None), slice(None, 256, None)),
         ),
         ...,
     ]
@@ -117,7 +117,7 @@ def generate_h5_indices(
         # Optionally limit frames to load from each file
         n_frames_in_file = min(n_frames_in_file, limit_n_frames)
         indices = [
-            range(i, i + block_size, frame_index_stride)
+            list(range(i, i + block_size, frame_index_stride))
             for i in range(0, n_frames_in_file - block_size + 1, block_step_size)
         ]
         yield [indices]
@@ -132,7 +132,7 @@ def generate_h5_indices(
             continue
 
         if additional_axes_iter:
-            axis_indices += [range(shape[axis]) for axis in additional_axes_iter]
+            axis_indices += [list(range(shape[axis])) for axis in additional_axes_iter]
 
         axis_indices = product(*axis_indices)
 
@@ -140,7 +140,7 @@ def generate_h5_indices(
             full_indices = [slice(size) for size in shape]
             for i, axis in enumerate([initial_frame_axis] + list(additional_axes_iter)):
                 full_indices[axis] = axis_index[i]
-            indices.append((file, key, full_indices))
+            indices.append((file, key, tuple(full_indices)))
 
         if skipped_files > 0:
             log.warning(
@@ -321,7 +321,12 @@ class H5Generator(Dataset):
         initial_delay=INITIAL_RETRY_DELAY,
         retry_action=_h5_reopen_on_io_error,
     )
-    def load(
+    def load(
+        self,
+        file: File,
+        key: str,
+        indices: Tuple[Union[list, slice, int], ...] | List[int] | int | None = None,
+    ):
         """Extract data from hdf5 file.
         Args:
             file_name (str): name of the file to extract image from.
```
zea/data/datasets.py
CHANGED
```diff
@@ -31,9 +31,12 @@ Features
 
 """
 
+import functools
+import multiprocessing
+import os
 from collections import OrderedDict
 from pathlib import Path
-from typing import List
+from typing import List, Tuple
 
 import numpy as np
 import tqdm
@@ -48,16 +51,12 @@ from zea.data.preset_utils import (
     _hf_resolve_path,
 )
 from zea.datapaths import format_data_path
-from zea.internal.
-
-
-)
+from zea.internal.cache import cache_output
+from zea.internal.core import hash_elements
+from zea.internal.utils import calculate_file_hash, reduce_to_signature
 from zea.io_lib import search_file_tree
 from zea.tools.hf import HFPath
-from zea.utils import (
-    date_string_to_readable,
-    get_date_string,
-)
+from zea.utils import date_string_to_readable, get_date_string
 
 _CHECK_MAX_DATASET_SIZE = 10000
 _VALIDATED_FLAG_FILE = "validated.flag"
@@ -106,16 +105,78 @@ class H5FileHandleCache:
 
         return self._file_handle_cache[file_path]
 
+    def close(self):
+        """Close all cached file handles."""
+        cache: OrderedDict = getattr(self, "_file_handle_cache", None)
+        if not cache:
+            return
+
+        # iterate over a static list to avoid mutation during iteration
+        for fh in list(cache.values()):
+            if fh is None:
+                continue
+            try:
+                # attempt to close unconditionally and swallow exceptions
+                fh.close()
+            except Exception:
+                # During interpreter shutdown or if the h5py internals are already
+                # torn down, close() can raise weird errors (e.g. TypeError).
+                # Swallow them here to avoid exceptions from __del__.
+                pass
+
+        cache.clear()  # clear the cache dict
+
     def __del__(self):
-
-
-
-
-
-
+        self.close()
+
+
+@cache_output("filepaths", "key", "_filepath_hash", verbose=True)
+def _find_h5_file_shapes(filepaths, key, _filepath_hash, verbose=True):
+    # NOTE: we cache the output of this function such that file loading over the network is
+    # faster for repeated calls with the same filepaths, key and _filepath_hash
+
+    assert _filepath_hash is not None
+
+    get_shape = functools.partial(File.get_shape, key=key)
+
+    if os.environ.get("ZEA_FIND_H5_SHAPES_PARALLEL", "1") in ("1", "true", "yes"):
+        # using multiprocessing to speed up reading hdf5 files
+        # make sure to call find_h5_file_shapes from within a function
+        # or use if __name__ == "__main__" to avoid freezing the main process
+
+        with multiprocessing.Pool() as pool:
+            file_shapes = list(
+                tqdm.tqdm(
+                    pool.imap(get_shape, filepaths),
+                    total=len(filepaths),
+                    desc="Getting file shapes in each h5 file",
+                    disable=not verbose,
+                )
+            )
+    else:
+        file_shapes = []
+        for file_path in tqdm.tqdm(
+            filepaths,
+            desc="Getting file shapes in each h5 file",
+            disable=not verbose,
+        ):
+            file_shapes.append(get_shape(file_path))
 
+    return file_shapes
 
-
+
+def _file_hash(filepaths):
+    # NOTE: this is really fast, even over network filesystems
+    total_size = 0
+    modified_times = []
+    for fp in filepaths:
+        if os.path.isfile(fp):
+            total_size += os.path.getsize(fp)
+            modified_times.append(os.path.getmtime(fp))
+    return hash_elements([total_size, modified_times])
+
+
+def find_h5_files(paths: str | list, key: str = None) -> Tuple[List[str], List[tuple]]:
     """
     Find HDF5 files from a directory or list of directories and optionally retrieve their shapes.
 
@@ -123,17 +184,11 @@ def find_h5_files(paths: str | list, key: str = None, search_file_tree_kwargs: d
         paths (str or list): A single directory path, a list of directory paths,
             or a single HDF5 file path.
         key (str, optional): The key to get the file shapes for.
-        search_file_tree_kwargs (dict, optional): Additional keyword arguments for the
-            search_file_tree function. Defaults to None.
 
     Returns:
-        - file_paths (list): List of file paths to the HDF5 files.
-        - file_shapes (list): List of shapes of the HDF5 datasets.
+        - file_paths (list): List of file paths (str) to the HDF5 files.
+        - file_shapes (list): List of shapes (tuple) of the HDF5 datasets.
     """
-
-    if search_file_tree_kwargs is None:
-        search_file_tree_kwargs = {}
-
     # Make sure paths is a list
     if not isinstance(paths, (tuple, list)):
         paths = [paths]
@@ -154,14 +209,12 @@ def find_h5_files(paths: str | list, key: str = None, search_file_tree_kwargs: d
             file_paths.append(str(path))
             continue
 
-
-
-
-
-
-
-        file_shapes += dataset_info["file_shapes"]
-        file_paths += [str(Path(path) / file_path) for file_path in dataset_info["file_paths"]]
+        _filepaths = list(search_file_tree(path, filetypes=FILE_TYPES))
+        file_shapes += _find_h5_file_shapes(_filepaths, key, _file_hash(_filepaths))
+        file_paths += _filepaths
+
+    # Convert file paths to strings
+    file_paths = [str(fp) for fp in file_paths]
 
     return file_paths, file_shapes
 
```
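The directory branch no longer forwards `search_file_tree_kwargs`; shape discovery is now a separate step whose results are cached via `cache_output`, keyed on a cheap hash of total file size plus modification times, with an opt-out environment variable for the multiprocessing fast path. A minimal usage sketch (the directory and key are placeholders):

```python
import os

# Optional: disable the multiprocessing fast path, e.g. where fork-based
# pools are problematic. It defaults to enabled ("1").
os.environ["ZEA_FIND_H5_SHAPES_PARALLEL"] = "0"

from zea.data.datasets import find_h5_files

if __name__ == "__main__":  # the guard matters when the pool is enabled
    # The first call reads one shape per file; unchanged files hit the cache
    # on repeated calls, since _file_hash only stats sizes and mtimes.
    file_paths, file_shapes = find_h5_files("/data/ultrasound", key="data/image")
```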
```diff
@@ -174,8 +227,7 @@ class Folder:
     def __init__(
         self,
         folder_path: list[str] | list[Path],
-        key: str
-        search_file_tree_kwargs: dict | None = None,
+        key: str,
         validate: bool = True,
         hf_cache_dir: str = HF_DATASETS_DIR,
         **kwargs,
@@ -197,11 +249,8 @@ class Folder:
 
         self.folder_path = Path(folder_path)
         self.key = key
-        self.search_file_tree_kwargs = search_file_tree_kwargs
         self.validate = validate
-        self.file_paths, self.file_shapes = find_h5_files(
-            folder_path, self.key, self.search_file_tree_kwargs
-        )
+        self.file_paths, self.file_shapes = find_h5_files(folder_path, self.key)
         assert self.n_files > 0, f"No files in folder: {folder_path}"
         if self.validate:
             self.validate_folder()
@@ -321,24 +370,27 @@ class Folder:
         data_types = self.get_data_types(self.file_paths[0])
 
         number_of_frames = sum(num_frames_per_file)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            with open(validation_file_path, "w", encoding="utf-8") as f:
+                f.write(f"Dataset: {path}\n")
+                f.write(f"Validated on: {get_date_string()}\n")
+                f.write(f"Number of files: {self.n_files}\n")
+                f.write(f"Number of frames: {number_of_frames}\n")
+                f.write(f"Data types: {', '.join(data_types)}\n")
+                f.write(f"{'-' * 80}\n")
+                # write all file names (not entire path) with number of frames on a new line
+                for file_path, num_frames in zip(self.file_paths, num_frames_per_file):
+                    f.write(f"{file_path.name}: {num_frames}\n")
+                f.write(f"{'-' * 80}\n")
+
+            # Write the hash of the validation file
+            validation_file_hash = calculate_file_hash(validation_file_path)
+            with open(validation_file_path, "a", encoding="utf-8") as f:
+                # *** validation file hash *** (80 total line length)
+                f.write("*** validation file hash ***\n")
+                f.write(f"hash: {validation_file_hash}")
+        except Exception as e:
+            log.warning(f"Unable to write validation flag: {e}")
 
     def __repr__(self):
         return (
@@ -415,7 +467,6 @@ class Dataset(H5FileHandleCache):
         self,
         file_paths: List[str] | str,
         key: str,
-        search_file_tree_kwargs: dict | None = None,
         validate: bool = True,
         directory_splits: list | None = None,
         **kwargs,
@@ -426,9 +477,6 @@ class Dataset(H5FileHandleCache):
             file_paths (str or list): (list of) path(s) to the folder(s) containing the HDF5 file(s)
                 or list of HDF5 file paths. Can be a mixed list of folders and files.
             key (str): The key to access the HDF5 dataset.
-            search_file_tree_kwargs (dict, optional): Additional keyword arguments for the
-                search_file_tree function. These are only used when `file_paths` are directories.
-                Defaults to None.
             validate (bool, optional): Whether to validate the dataset. Defaults to True.
             directory_splits (list, optional): List of directory split by. Is a list of floats
                 between 0 and 1, with the same length as the number of file_paths given.
@@ -437,7 +485,6 @@ class Dataset(H5FileHandleCache):
         """
         super().__init__(**kwargs)
         self.key = key
-        self.search_file_tree_kwargs = search_file_tree_kwargs
         self.validate = validate
 
         self.file_paths, self.file_shapes = self.find_files_and_shapes(file_paths)
@@ -477,7 +524,7 @@ class Dataset(H5FileHandleCache):
             file_path = Path(file_path)
 
             if file_path.is_dir():
-                folder = Folder(file_path, self.key, self.
+                folder = Folder(file_path, self.key, self.validate)
                 file_paths += folder.file_paths
                 file_shapes += folder.file_shapes
                 del folder
@@ -541,14 +588,6 @@ class Dataset(H5FileHandleCache):
     def __str__(self):
         return f"Dataset with {self.n_files} files (key='{self.key}')"
 
-    def close(self):
-        """Close all cached file handles."""
-        for file in self._file_handle_cache.values():
-            if file is not None and file.id.valid:
-                file.close()
-        self._file_handle_cache.clear()
-        log.info("Closed all cached file handles.")
-
     def __enter__(self):
         return self
 
```
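`Dataset.close` disappears here because the equivalent, more defensive logic moved up to `H5FileHandleCache.close` in the first hunk of this file, with `__del__` delegating to it. A minimal lifecycle sketch, assuming `Dataset` is constructed with a directory and key as in its docstring (paths are placeholders):

```python
from zea.data.datasets import Dataset

# Placeholder directory and key.
dataset = Dataset("/data/ultrasound", key="data/image")
try:
    ...  # file handles are opened lazily and cached per file while reading
finally:
    # Inherited from H5FileHandleCache: closes every cached handle and
    # swallows errors that can occur during interpreter teardown.
    dataset.close()
```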