PyPI - etiket-sync-agent-quantify - Versions diffs - 0.3.0b1__py3-none-any.whl - Mend

etiket-sync-agent-quantify 0.3.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

etiket_sync_agent_quantify/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+from .quantify_sync_class import QuantifySync
+from .quantify_config_class import QuantifyConfigData
+__all__ = ["QuantifySync", "QuantifyConfigData"]
+__version__ = "0.3.0b1"

etiket_sync_agent_quantify/live_sync.py ADDED Viewed

@@ -0,0 +1,360 @@
+from contextlib import contextmanager
+from pathlib import Path
+from qdrive.dataset.dataset import dataset
+from etiket_client.remote.endpoints.models.types import FileStatusLocal, FileType
+from etiket_sync_agent_quantify.utility import to_gridded_dataset
+import time
+import logging
+import tempfile
+import h5py
+import uuid
+import xarray as xr
+import numpy as np
+logger = logging.getLogger(__name__)
+@contextmanager
+def with_dataset_snapshot(file_location: Path) -> Path:
+    """
+    Create a temporary copy of an HDF5/netCDF dataset file and yield the path of the copy.
+    Reading the copy instead of the original prevents conflicts when the original is being
+    actively written to by another process. The copy is removed when the context exits.
+    Args:
+        file_location: Path to the original dataset file
+    Yields:
+        Path to the temporary copy that can be safely read
+    Raises:
+        OSError: if the original cannot be opened/read or the copy cannot be written.
+        Any exception raised inside the ``with`` body is logged and re-raised unchanged.
+    Example:
+        with with_dataset_snapshot(data_file) as safe_file:
+            dataset = xr.open_dataset(safe_file, engine='h5netcdf')
+            # Process dataset without worrying about concurrent writes
+    """
+    import shutil  # local import, only needed for the chunked copy below
+    temp_file = tempfile.NamedTemporaryFile(suffix=".hdf5", delete=False)
+    snapshot_path = Path(temp_file.name)
+    try:
+        # ``with temp_file`` closes the temporary handle even when opening or copying the source
+        # fails; a handle that stays open leaks and may prevent the unlink in ``finally``.
+        with temp_file:
+            with open(file_location, 'rb') as src:
+                # copy in chunks instead of loading the complete file into memory
+                shutil.copyfileobj(src, temp_file)
+        yield snapshot_path
+    except (IOError, OSError) as e:
+        logger.exception(f"Error creating dataset snapshot for {file_location}: {e}")
+        raise
+    except Exception as e:
+        logger.exception(f"Unexpected error handling dataset {file_location}: {e}")
+        raise
+    finally:
+        try:
+            snapshot_path.unlink()
+        except OSError:
+            logger.warning(f"Failed to remove temporary snapshot file: {temp_file.name}")
+def is_dataset_live(file_location: Path, perform_NAN_check = True, n_th_attempt = 0) -> bool:
+    """
+    Decide whether a dataset file is still being written ("live").
+    Returns False if one of the following conditions is met:
+    - The file has not been modified in the last 2 minutes
+    - A sibling of the file's parent directory has a newer modification time than the file
+    - A sibling of the file's grandparent directory has a newer modification time than the file
+    - The dataset does not contain any NaN values (only if perform_NAN_check is True)
+    - The NaN check fails with an unexpected (non-OSError) exception
+    Args:
+        file_location: Path to the dataset file
+        perform_NAN_check: when True, a snapshot of the file is opened and checked for NaN values
+        n_th_attempt: internal retry counter for the NaN check, callers should leave it at 0
+    Returns:
+        True if the dataset is considered live, False otherwise
+    Raises:
+        ValueError: if the snapshot still cannot be read (OSError) after 3 retries
+    """
+    # stat once, so the age check and the directory comparison use the same timestamp
+    current_dataset_mtime = file_location.stat().st_mtime
+    if (time.time() - current_dataset_mtime) > 120:
+        return False
+    parent_dir = file_location.parent
+    # A newer sibling directory next to the parent or the grandparent directory is taken as a sign
+    # that a newer dataset has been started in the meantime.
+    try:
+        for item in parent_dir.parent.iterdir():
+            if item.is_dir() and item.name != parent_dir.name:
+                dir_mtime = item.stat().st_mtime
+                if dir_mtime > current_dataset_mtime:
+                    logger.debug(f"Found newer directory {item} in parent directory")
+                    return False
+        if parent_dir.parent.parent.exists():
+            for item in parent_dir.parent.parent.iterdir():
+                if item.is_dir() and item.name != parent_dir.parent.name:  # Check siblings of grandparent
+                    dir_mtime = item.stat().st_mtime
+                    if dir_mtime > current_dataset_mtime:
+                        logger.debug(f"Found newer directory {item} in grandparent directory")
+                        return False
+    except (PermissionError, FileNotFoundError) as e:
+        # best effort: a failing directory scan does not decide the outcome
+        logger.warning(f"Error checking for new directories: {e}")
+    if perform_NAN_check is True:
+        logger.debug(f"Checking {file_location} for NaN values")
+        try:
+            with with_dataset_snapshot(file_location) as safe_file:
+                with xr.open_dataset(safe_file, engine='h5netcdf') as dataset:
+                    return has_nan_values(dataset)
+        except OSError as e:
+            # likely partially written dataset, try again (max 3 times)
+            if n_th_attempt < 3:
+                # sleep to give io time to finish.
+                time.sleep(0.5)
+                return is_dataset_live(file_location, perform_NAN_check, n_th_attempt + 1)
+            else:
+                logger.exception("Error checking for NaN values in dataset")
+                # file is corrupted in some way, probs because something is writing.
+                raise ValueError("Unable to read HDF5 files, maybe something is locked?") from e
+        except Exception:
+            logger.exception("Error checking for NaN values in dataset")
+            return False
+    return True
+class XArrayReplicator:
+    '''
+    Replicates the xarray dataset into a new HDF5 file. This file is launched in SWMR mode, and gets the expected attributes present in the qdrive dataset.
+    The sync process works by following the state of the NAN values in the dataset.
+    '''
+    def __init__(self, ds_name : str, dataset_location: Path, dataset_uuid: uuid.UUID):
+        '''
+        Create the live HDF5 file from the current content of the source file and attach one DatasetFollower per variable.
+        Args:
+            ds_name: name of the file inside the qdrive dataset.
+            dataset_location: path of the source file that is being written.
+            dataset_uuid: uuid of the qdrive dataset that receives the live file.
+        '''
+        self.dataset_location = dataset_location
+        self.qdrive_dataset = dataset(dataset_uuid)
+        self.dataset_followers = {}
+        self.last_mod_sync = dataset_location.stat().st_mtime
+        with with_dataset_snapshot(dataset_location) as safe_file:
+            with xr.open_dataset(safe_file, engine='h5netcdf') as xr_dataset:
+                with tempfile.TemporaryDirectory() as temp_dir:
+                    temp = Path(temp_dir) / "temp.hdf5"
+                    xr_dataset = self._gridded_or_original(xr_dataset)
+                    xr_dataset.to_netcdf(temp, engine='h5netcdf', invalid_netcdf=True)
+                    m_file = Path(temp_dir) / "measurement.hdf5"
+                    # kinda have to do some hacky stuff to get the superblock to work ... (standard superblock is v2, but we need at least v3 for the qdrive dataset)
+                    # the intermediate file is opened in a ``with`` so its handle is released before the temporary directory is removed
+                    with h5py.File(temp, 'r') as h5_source:
+                        convert_to_superblock_v3(m_file, h5_source)
+                    if ds_name not in self.qdrive_dataset.files.keys():
+                        self.qdrive_dataset._files._add_new_file(
+                            ds_name,
+                            file_path=m_file,
+                            file_type=FileType.HDF5_CACHE,
+                            generator="quantify_sync_module",
+                            file_status=FileStatusLocal.writing,
+                        )
+                self.hdf5_file = h5py.File(self.qdrive_dataset[ds_name].path, 'a', locking=False, libver='v112')
+                for name in xr_dataset.variables :
+                    self.dataset_followers[name] = DatasetFollower(self.hdf5_file[name], xr_dataset[name])
+        self.hdf5_file.swmr_mode = True
+    @staticmethod
+    def _gridded_or_original(xr_dataset):
+        '''Return the gridded form of the dataset, or the dataset itself when the conversion fails.'''
+        try:
+            return to_gridded_dataset(xr_dataset)
+        except Exception:
+            # best effort: datasets that cannot be gridded are replicated as they are
+            logger.debug("Dataset could not be converted to a gridded dataset, using it as-is", exc_info=True)
+            return xr_dataset
+    def sync(self):
+        '''
+        Follow the source file until it is done and copy new values to the live HDF5 file.
+        Blocks the caller; polls every 0.5 s. When the loop ends, all followers are marked as completed.
+        '''
+        keep_syncing = True
+        while keep_syncing:
+            # evaluated before the update, so one more update pass is done after the dataset is found to be done
+            keep_syncing = not self._check_done()
+            if self._has_update():
+                try:
+                    with with_dataset_snapshot(self.dataset_location) as safe_file:
+                        with xr.open_dataset(safe_file, engine='h5netcdf') as xr_dataset:
+                            xr_dataset = self._gridded_or_original(xr_dataset)
+                            for name in xr_dataset.variables:
+                                self.dataset_followers[name].update(xr_dataset[name])
+                            self.hdf5_file.flush()
+                except Exception:
+                    logger.exception("Error reading dataset")
+                    time.sleep(0.5)
+            else:
+                time.sleep(0.5) # default write interval in quantify is 0.5s
+        for follower in self.dataset_followers.values():
+            follower.complete()
+    def _has_update(self) -> bool:
+        '''Return True (and remember the new timestamp) if the source file was modified since the last call that returned True.'''
+        last_mod = self.dataset_location.stat().st_mtime
+        if last_mod > self.last_mod_sync:
+            self.last_mod_sync = last_mod
+            return True
+        return False
+    def _check_done(self):
+        '''Return True when no follower has NaN values left, or when the source file is no longer considered live.'''
+        if all(follower.noNanValues for follower in self.dataset_followers.values()):
+            return True
+        return not is_dataset_live(self.dataset_location, perform_NAN_check=False)
+class DatasetFollower:
+    '''
+    Caches the write progress of a single variable of the netCDF source file. If the source has new values, they are written to the matching dataset in the live HDF5 file.
+    The progress is kept as a "cursor" (index of the last value that is not NaN) in the `__cursor` attribute of the HDF5 dataset; the `completed` attribute is set to True by complete().
+    '''
+    def __init__(self, h5_dataset: h5py.Dataset, initial_state: xr.DataArray):
+        '''
+        Args:
+            h5_dataset: dataset in the live HDF5 file that receives the values.
+            initial_state: current content of the corresponding variable in the source file.
+        '''
+        self.dataset = h5_dataset
+        # switched to True by __get_cursor as soon as the data contains no NaN values at all
+        self.noNanValues = False
+        raw_data = np.asarray(initial_state.data)
+        cursor = self.__get_cursor(raw_data)
+        # one cursor entry per dimension; a 0-dimensional variable gets an attribute of shape (1,)
+        cursor_shape = (1,) if raw_data.ndim == 0 else (raw_data.ndim,)
+        h5_dataset.attrs.create('__cursor', cursor, dtype=np.int32, shape=cursor_shape)
+        h5_dataset.attrs['completed'] = False
+    def update(self, data_array: xr.DataArray):
+        '''
+        Write the values that were added since the stored cursor to the HDF5 dataset and move the cursor.
+        Nothing is written when the cursor did not move. Errors are logged and the stored cursor is reset to its previous value.
+        '''
+        data = data_array.values
+        old_cursor = self.dataset.attrs['__cursor']
+        new_cursor = self.__get_cursor(data)
+        if not np.array_equal(old_cursor, new_cursor):
+            try:
+                if self.dataset.shape != data.shape:
+                    self.dataset.resize(data.shape)
+                # Build the selection dimension by dimension: dimensions in which the cursor did not move
+                # are pinned to the cursor index; the first dimension in which it moved gets a range and
+                # ends the loop, so any later dimensions are not restricted.
+                slices = []
+                for i in range(len(data.shape)):
+                    if old_cursor[i] == new_cursor[i]:
+                        slices.append(slice(new_cursor[i], new_cursor[i]+1))
+                    else:
+                        if i == data.ndim-1:
+                            # NOTE(review): on the last dimension the range ends before new_cursor[i] (exclusive) -- confirm this is intended
+                            slices.append(slice(old_cursor[i], new_cursor[i]))
+                        else:
+                            slices.append(slice(old_cursor[i], new_cursor[i]+1))
+                        break
+                # the same selection is used for the source array and for the destination dataset
+                self.dataset.write_direct(data, np.s_[tuple(slices)], np.s_[tuple(slices)])
+                self.dataset.attrs['__cursor'] = new_cursor
+                self.dataset.attrs['completed'] = False
+            except Exception:
+                self.dataset.attrs['__cursor'] = old_cursor
+                self.dataset.attrs['completed'] = False
+                logger.exception("Error updating dataset")
+    def complete(self):
+        '''Mark the HDF5 dataset as completed.'''
+        self.dataset.attrs['completed'] = True
+    def __get_cursor(self, raw_data: np.ndarray):
+        '''
+        Finds the position of the last value that is not NaN in the data array.
+        This helps track how much of the dataset has already been written.
+        Side effect: sets self.noNanValues to True when the array contains no NaN values.
+        Returns:
+            Tuple with one index per dimension; all zeros when every value is NaN.
+        '''
+        non_nan_mask = ~np.isnan(raw_data)
+        if np.all(non_nan_mask):
+            self.noNanValues = True
+            # everything is written: the cursor points at the last element
+            return np.unravel_index(raw_data.size - 1, raw_data.shape)
+        # If all values are NaN, return zeros
+        if not np.any(non_nan_mask):
+            return tuple([0] * len(raw_data.shape))
+        # Find the last non-NaN value
+        flat_indices = np.flatnonzero(non_nan_mask)
+        if len(flat_indices) > 0:
+            last_non_nan_idx = flat_indices[-1]
+            # Convert flat index to dimensional indices
+            return np.unravel_index(last_non_nan_idx, raw_data.shape)
+        # Fallback to all zeros (shouldn't reach here given the checks above)
+        return tuple([0] * len(raw_data.shape))
+def convert_to_superblock_v3(new_file : Path, h5_old_file : h5py.File):
+    '''
+    Rebuild the content of an opened HDF5 file in a new file that is created with libver='v112'.
+    Only the top level of the old file is copied: datasets with their data, groups as empty groups,
+    the file attributes and the attributes of the top-level objects. Object references stored in the
+    DIMENSION_LIST and REFERENCE_LIST attributes are re-created so that they point to objects of the new file.
+    Args:
+        new_file: path of the file to create (opened with mode 'w').
+        h5_old_file: opened source file; it is not closed by this function.
+    Raises:
+        ValueError: if a top-level object is neither a group nor a dataset.
+    '''
+    # presumably libver='v112' is what yields the newer superblock version the function name refers to -- confirm against the h5py documentation
+    with h5py.File(new_file, 'w', locking=False, libver='v112') as h5_new_file:
+        # create all groups and dataset of the original file (normally not nested)
+        for h5_name, h5_object in h5_old_file.items():
+            if isinstance(h5_object, h5py.Group):
+                h5_new_file.create_group(h5_name)
+            elif isinstance(h5_object, h5py.Dataset):
+                h5_new_file.create_dataset(h5_name, data=h5_object[()])
+            else:
+                raise ValueError("Unknown type in HDF5 file")
+        # Copy file attributes
+        for h5_name, h5_object in h5_old_file.attrs.items():
+            h5_new_file.attrs[h5_name] = h5_object
+        # Copy object attributes and handle special cases
+        # (done in a second pass, so every object a reference can point to already exists in the new file)
+        for h5_name, h5_object in h5_old_file.items():
+            for attr_name, attr_value in h5_object.attrs.items():
+                if attr_name == 'DIMENSION_LIST':
+                    # translate every reference: old reference -> object name -> reference to the same name in the new file
+                    dimension_scale = [np.array([h5_new_file[h5py.h5r.get_name(ds_ref, h5_old_file.id)].ref
+                                            for ds_ref in reference_list], dtype=np.object_)
+                                            for reference_list in attr_value]
+                    create_dimension_list_attr(h5_new_file, h5_name, dimension_scale)
+                elif attr_name == 'REFERENCE_LIST':
+                    # extract from compound datatype
+                    reference_list = [(h5_new_file[h5py.h5r.get_name(ref_compound['dataset'], h5_old_file.id)].ref, ref_compound['dimension']) for ref_compound in attr_value]
+                    create_reference_list_attr(h5_new_file, h5_name, reference_list)
+                elif attr_name == 'CLASS':
+                    # CLASS and NAME are decoded from bytes and rewritten through create_str_attr
+                    create_str_attr(h5_new_file[h5_name], 'CLASS', str(attr_value.decode('utf-8')))
+                elif attr_name == 'NAME':
+                    create_str_attr(h5_new_file[h5_name], 'NAME', str(attr_value.decode('utf-8')))
+                else :
+                    h5_new_file[h5_name].attrs[attr_name] = attr_value
+def create_dimension_list_attr(h5_new_file, h5_name, dimension_scale):
+    '''Create the DIMENSION_LIST attribute on h5_new_file[h5_name]: a 1-D attribute of variable-length object references with one entry per item of dimension_scale.'''
+    n_entries = len(dimension_scale)
+    vlen_ref_type = h5py.h5t.vlen_create(h5py.h5t.STD_REF_OBJ)
+    dataspace = h5py.h5s.create_simple((n_entries,), (n_entries,))
+    target_id = h5_new_file[h5_name].id
+    dim_list_attr = h5py.h5a.create(target_id, 'DIMENSION_LIST'.encode('utf-8'), vlen_ref_type, dataspace)
+    # Append and remove an empty string to ensure correct type
+    padded = np.array(dimension_scale + [''], dtype=object)
+    dim_list_attr.write(padded[:-1])
+def create_reference_list_attr(h5_new_file, h5_name, reference_list):
+    '''Create the REFERENCE_LIST attribute on h5_new_file[h5_name]: a 1-D attribute of compound records (object reference 'dataset', uint32 'dimension'), one record per item of reference_list.'''
+    ref_type = h5py.h5t.STD_REF_OBJ
+    index_type = h5py.h5t.NATIVE_UINT32
+    ref_size = ref_type.get_size()
+    # compound layout: the reference first, directly followed by the dimension index
+    record_type = h5py.h5t.create(h5py.h5t.COMPOUND, ref_size + index_type.get_size())
+    record_type.insert('dataset'.encode('utf-8'), 0, ref_type)
+    record_type.insert('dimension'.encode('utf-8'), ref_size, index_type)
+    n_records = len(reference_list)
+    dataspace = h5py.h5s.create_simple((n_records,), (n_records,))
+    ref_list_attr = h5py.h5a.create(h5_new_file[h5_name].id, 'REFERENCE_LIST'.encode('utf-8'), record_type, dataspace)
+    records = np.array(reference_list, dtype=[('dataset', 'O'), ('dimension', np.uint32)])
+    ref_list_attr.write(records)
+def create_str_attr(dataset : h5py.Dataset, attr_name : str, string_value: str):
+    '''Write string_value as a fixed-length, null-terminated ASCII string attribute on the given object, replacing an existing attribute with the same name.'''
+    encoded_name = attr_name.encode('utf-8')
+    # one extra byte for the terminating null character
+    padded_length = len(string_value)+1
+    if h5py.h5a.exists(dataset.id, encoded_name):
+        h5py.h5a.delete(dataset.id, name = encoded_name)
+    str_type = h5py.h5t.TypeID.copy(h5py.h5t.C_S1)
+    str_type.set_size(padded_length)
+    str_type.set_strpad(h5py.h5t.STR_NULLTERM)
+    scalar_space = h5py.h5s.create(h5py.h5s.SCALAR)
+    new_attr = h5py.h5a.create(dataset.id, encoded_name, str_type, scalar_space)
+    payload = np.array(string_value.encode('ascii'), dtype=h5py.string_dtype('ascii', padded_length))
+    new_attr.write(payload)
+def has_nan_values(dataset: xr.Dataset) -> bool:
+    """
+    Report whether any data variable or coordinate of an xarray Dataset holds a NaN value.
+    Data variables are inspected first, then coordinates; arrays that cannot be tested (non-numeric) are skipped.
+    Args:
+        dataset: The xarray Dataset to check
+    Returns:
+        True if any NaN values are found, False otherwise
+    """
+    def _contains_nan(array) -> bool:
+        # np.isnan raises TypeError for non-numeric arrays (e.g., strings): treat those as NaN-free
+        try:
+            return bool(np.isnan(array.values).any())
+        except TypeError:
+            return False
+    for var_name, da in dataset.data_vars.items():
+        if _contains_nan(da):
+            logger.debug(f"Found NaN values in data variable: {var_name}")
+            return True
+    for coord_name, coord in dataset.coords.items():
+        if _contains_nan(coord):
+            logger.debug(f"Found NaN values in coordinate: {coord_name}")
+            return True
+    return False

etiket_sync_agent_quantify/quantify_config_class.py ADDED Viewed

@@ -0,0 +1,76 @@
+import pathlib
+import dataclasses
+import etiket_sync_agent_quantify
+from typing import Optional
+from etiket_sync_agent.db import get_db_session_context
+from etiket_sync_agent.crud.sync_sources import crud_sync_sources, SyncSources
+@dataclasses.dataclass
+class QuantifyConfigData:
+    """
+    Configuration of a Quantify sync source.
+    """
+    # base directory of the Quantify data; a str is accepted and converted in __post_init__
+    quantify_directory: pathlib.Path
+    # stored as the "set-up" attribute of the datasets created from this source
+    set_up : str
+    is_server_folder: bool = False
+    def __post_init__(self):
+        # ensure the path is of the type pathlib.Path (str is converted to Path) and expand ~
+        self.quantify_directory = pathlib.Path(self.quantify_directory).expanduser()
+    async def validate(self, current_sync_source : Optional[SyncSources] = None):
+        """
+        Validates the Quantify base directory configuration.
+        Checks:
+        1. If the quantify_directory exists and is a directory.
+        2. If current_sync_source is given: that its directory is not being changed.
+        3. If the quantify_directory conflicts with another existing Quantify sync source
+            (i.e., it's identical, a subdirectory, or a parent directory).
+        Args:
+            current_sync_source: the sync source that is being updated, None when a new source is added.
+        Raises:
+            ValueError: If any validation check fails.
+        Returns:
+            True if all checks pass.
+        """
+        # Resolve to an absolute path for consistent comparisons
+        try:
+            abs_quantify_dir = self.quantify_directory.expanduser().resolve(strict=True)
+        except FileNotFoundError as e:
+            raise ValueError(f"The specified Quantify directory does not exist: {self.quantify_directory}") from e
+        # check if the path exists and is a directory.
+        if not abs_quantify_dir.is_dir():
+            raise ValueError(f"The specified path is not a directory: {abs_quantify_dir}")
+        # check if the directory is not yet added/is part of a directory that is already added.
+        async with get_db_session_context() as session:
+            sync_sources = await crud_sync_sources.list_sync_sources(session)
+            for sync_source in sync_sources:
+                if current_sync_source is not None and sync_source.id == current_sync_source.id:
+                    # The directory of an existing sync source may not be changed. The stored path is
+                    # resolved the same way as the new one, so a symlink or '~' in the stored value
+                    # is not mistaken for a change.
+                    stored_path = pathlib.Path(sync_source.config_data['quantify_directory']).expanduser().resolve()
+                    if stored_path != abs_quantify_dir:
+                        raise ValueError(f"The directory of sync source '{sync_source.name}' cannot be changed (configured: '{stored_path}', requested: '{abs_quantify_dir}').")
+                    continue
+                # only sources of this backend can conflict
+                if sync_source.backend == etiket_sync_agent_quantify.__name__:
+                    # pre-set, so the warning below can always be formatted (also when the key is missing)
+                    existing_path_str = None
+                    try:
+                        existing_path_str = sync_source.config_data['quantify_directory']
+                        existing_path = pathlib.Path(existing_path_str).expanduser().resolve()
+                    except Exception as e:
+                        # If an existing path can't be resolved, log it but continue validation
+                        print(f"Warning: Could not resolve existing quantify directory '{existing_path_str}' for sync source '{sync_source.name}' ({sync_source.id}): {e}")
+                        continue
+                    # Check for conflicts
+                    if abs_quantify_dir == existing_path:
+                        raise ValueError(f"The directory '{abs_quantify_dir}' is already added as sync source '{sync_source.name}'.")
+                    # Check if the new path is inside an existing path
+                    if abs_quantify_dir.is_relative_to(existing_path):
+                        raise ValueError(f"The directory '{abs_quantify_dir}' is inside the directory '{existing_path}' added by sync source '{sync_source.name}'.")
+                    # Check if an existing path is inside the new path
+                    if existing_path.is_relative_to(abs_quantify_dir):
+                        raise ValueError(f"The directory '{existing_path}' added by sync source '{sync_source.name}' is inside the specified directory '{abs_quantify_dir}'.")
+        return True

etiket_sync_agent_quantify/quantify_sync_class.py ADDED Viewed

@@ -0,0 +1,199 @@
+import os
+import pathlib
+import xarray
+import re
+import typing
+from datetime import datetime
+from pathlib import Path
+from etiket_sync_agent_quantify.live_sync import is_dataset_live, XArrayReplicator
+from etiket_sync_agent_quantify.quantify_config_class import QuantifyConfigData
+from etiket_sync_agent_quantify.utility import to_gridded_dataset
+from etiket_sync_agent.backends.sync_source_abstract import SyncSourceFileBase, ScopeRequirement
+from etiket_sync_agent.sync.sync_records.manager import SyncRecordManager
+from etiket_sync_agent.sync.sync_utilities import dataset_info, file_info, FileType, sync_utilities
+from etiket_sync_agent.schemas import SyncItemSchema
+class QuantifySync(SyncSourceFileBase):
+    """
+    Sync source for Quantify data directories: creates the dataset record, uploads the files of a
+    dataset directory and replicates HDF5 files that are still being written.
+    """
+    # Class-level settings; NOTE(review): their meaning is defined by SyncSourceFileBase, which is not visible here.
+    sync_agent_name: typing.ClassVar[str] = "Quantify"
+    config_data_class: typing.ClassVar[typing.Type[QuantifyConfigData]] = QuantifyConfigData
+    scope_requirement: typing.ClassVar[ScopeRequirement] = ScopeRequirement.REQUIRED
+    supports_scope_mapping: typing.ClassVar[bool] = False
+    live_sync_implemented: typing.ClassVar[bool] = True
+    level: typing.ClassVar[int] = 2
+    has_owner: typing.ClassVar[bool] = True
+    is_single_file: typing.ClassVar[bool] = False
+    @staticmethod
+    def rootPath(config_data: QuantifyConfigData) -> pathlib.Path:
+        """Return the configured Quantify directory as a Path."""
+        return pathlib.Path(config_data.quantify_directory)
+    @staticmethod
+    async def checkLiveDataset(config_data: QuantifyConfigData, syncIdentifier: SyncItemSchema, maxPriority: bool) -> bool:
+        """
+        Return True if the dataset directory contains an HDF5 file that is_dataset_live() reports as live.
+        Returns False when maxPriority is False, when a newer sibling directory is found next to the
+        parent or grandparent directory, or when the directories cannot be inspected.
+        """
+        if not maxPriority:
+            return False
+        dataset_dir = Path(os.path.join(config_data.quantify_directory, syncIdentifier.dataIdentifier))
+        # Check if any new directories in parent directory have newer modification time
+        try:
+            parent_dir = dataset_dir.parent
+            # note: this is the modification time of the dataset directory, not of a file inside it
+            current_dataset_mtime = dataset_dir.stat().st_mtime
+            # Check siblings in the parent directory
+            for item in parent_dir.parent.iterdir():
+                if item.is_dir() and item.name != parent_dir.name:
+                    dir_mtime = item.stat().st_mtime
+                    if dir_mtime > current_dataset_mtime:
+                        return False
+            # Check siblings in the grandparent directory
+            if parent_dir.parent.parent.exists():
+                for item in parent_dir.parent.parent.iterdir():
+                    if item.is_dir() and item.name != parent_dir.parent.name:
+                        dir_mtime = item.stat().st_mtime
+                        if dir_mtime > current_dataset_mtime:
+                            return False
+        except (PermissionError, FileNotFoundError):
+            return False
+        # Also check for any HDF5 files in the dataset directory that might be live
+        for root, _, files in os.walk(dataset_dir):
+            for file in files:
+                if file.endswith(".hdf5") or file.endswith(".h5"):
+                    file_path = Path(os.path.join(root, file))
+                    if is_dataset_live(file_path):
+                        return True
+        return False
+    @staticmethod
+    async def syncDatasetNormal(configData: QuantifyConfigData, syncIdentifier: SyncItemSchema, sync_record: SyncRecordManager):
+        """
+        Synchronize a dataset directory: create/update the dataset, upload all non-HDF5 files, then upload the HDF5 files.
+        An HDF5 file that is still live is first replicated (blocking) until the replicator exits.
+        Raises:
+            Exception: if an HDF5 file is still reported as live after the replicator has exited.
+        """
+        with sync_record.task("Creating dataset from Quantify dataset (not live)"):
+            await create_ds_from_quantify(configData, syncIdentifier, False, sync_record)
+        dataset_path = pathlib.Path(os.path.join(configData.quantify_directory, syncIdentifier.dataIdentifier))
+        with sync_record.task("Uploading auxiliary files to the server"):
+            for root, dirs, files in os.walk(dataset_path):
+                for file in files:
+                    if not (file.endswith(".hdf5") or file.endswith(".h5")):
+                        name, file_path = process_file_name(root, file, dataset_path)
+                        # hidden files (name starting with ".") are skipped
+                        if name is None:
+                            continue
+                        f_type = FileType.UNKNOWN
+                        if file.endswith(".json"):
+                            f_type = FileType.JSON
+                        if file.endswith(".txt"):
+                            f_type = FileType.TEXT
+                        # the modification time of the file is used as its creation time
+                        f_info = file_info(name = name, fileName = file,
+                            created = datetime.fromtimestamp(pathlib.Path(os.path.join(root, file)).stat().st_mtime),
+                            fileType = f_type, file_generator = "Quantify")
+                        await sync_utilities.upload_file(file_path, syncIdentifier, f_info, sync_record)
+        with sync_record.task("Uploading HDF5 datasets to the server"):
+            for root, dirs, files in os.walk(dataset_path):
+                for file in files:
+                    if file.endswith(".hdf5") or file.endswith(".h5"):
+                        name, file_path = process_file_name(root, file, dataset_path)
+                        if name is None:
+                            continue
+                        if is_dataset_live(file_path) is True:
+                            # blocks until the replicator considers the dataset done
+                            replicator = XArrayReplicator(name, file_path, syncIdentifier.datasetUUID)
+                            replicator.sync()
+                        # upload only if the dataset is not live anymore, assuming the exit of the replicator is triggered on finish
+                        if is_dataset_live(file_path) is False:
+                            f_info = file_info(name = name, fileName = file,
+                                                created = datetime.fromtimestamp(pathlib.Path(os.path.join(root, file)).stat().st_mtime),
+                                                fileType = FileType.HDF5_NETCDF, file_generator = "Quantify")
+                            ds = xarray.load_dataset(file_path, engine='h5netcdf')
+                            # gridding is best effort: on failure the dataset is uploaded as it is
+                            try:
+                                ds = to_gridded_dataset(ds)
+                            except ValueError as e:
+                                sync_record.add_log(f"Error converting dataset to gridded dataset: {e} -- proceeding without conversion")
+                            except Exception as e:
+                                sync_record.add_log(f"Error converting dataset to gridded dataset (unknown error) {e} -- proceeding without conversion")
+                            # check if fields in the datasets are standard deviations and mark them as such -- this is useful for plotting
+                            # (a variable "<name>_u" is linked to the variable "<name>" when both exist)
+                            data_vars = list(ds)
+                            for var_name in data_vars:
+                                if var_name.endswith("_u") and var_name[:-2] in data_vars:
+                                    ds[var_name[:-2]].attrs['__std'] = var_name
+                                    ds[var_name].attrs['__is_std'] = 1
+                            await sync_utilities.upload_xarray(ds, syncIdentifier, f_info, sync_record)
+                        else:
+                            raise Exception("Live dataset is still live after replicator exit")
+    @staticmethod
+    async def syncDatasetLive(configData: QuantifyConfigData, syncIdentifier: SyncItemSchema, sync_record: SyncRecordManager):
+        """
+        Create/update the dataset in live mode and replicate every HDF5 file of the dataset directory that is live.
+        Files are handled one after the other; each replication blocks until the replicator exits. Nothing is uploaded here.
+        """
+        with sync_record.task("Creating dataset from Quantify dataset (live)"):
+            await create_ds_from_quantify(configData, syncIdentifier, True, sync_record)
+        dataset_path = pathlib.Path(os.path.join(configData.quantify_directory, syncIdentifier.dataIdentifier))
+        with sync_record.task("Starting live replication from Quantify datasets"):
+            for root, dirs, files in os.walk(dataset_path):
+                for file in files:
+                    if file.endswith(".hdf5") or file.endswith(".h5"):
+                        name, file_path = process_file_name(root, file, dataset_path)
+                        if name is None:
+                            continue
+                        if is_dataset_live(file_path) is True:
+                            replicator = XArrayReplicator(name, file_path, syncIdentifier.datasetUUID)
+                            replicator.sync()
+def process_file_name(file_dir : str, file_name : str, dataset_path : str) -> typing.Tuple[typing.Optional[str], typing.Optional[pathlib.Path]]:
+    """
+    Derive the upload name and the full path of a file inside a dataset directory.
+    The name is the path relative to dataset_path, with the TUID prefix
+    ("YYYYmmdd-HHMMSS-sss-xxxxxx-") removed from every path component and the components joined by ".".
+    Args:
+        file_dir: directory that contains the file.
+        file_name: name of the file.
+        dataset_path: root directory of the dataset (str or path-like).
+    Returns:
+        (name, path) of the file, or (None, None) for hidden files (name starting with ".").
+    """
+    if file_name.startswith("."):
+        return None, None
+    file_path = pathlib.Path(os.path.join(file_dir, file_name))
+    relative_path = os.path.relpath(file_path, start=dataset_path)
+    name_parts = [re.sub(r"\d{8}-\d{6}-\d{3}-[a-z0-9]{6}-", "", part)
+                    for part in pathlib.Path(relative_path).parts]
+    return ".".join(name_parts), file_path
+async def create_ds_from_quantify(configData: QuantifyConfigData, syncIdentifier: SyncItemSchema, live : bool, sync_record: SyncRecordManager):
+    """
+    Create or update the dataset that belongs to a Quantify dataset directory.
+    Name, TUID and creation time are taken from the directory name ("<TUID>-<name>", second component of
+    the data identifier); the 'long_name' (or else 'name') attributes of all variables and coordinates of
+    the HDF5 files in the directory are collected as keywords.
+    Args:
+        configData: configuration of the sync source.
+        syncIdentifier: identifies the dataset directory (dataIdentifier) and the target dataset/scope.
+        live: passed on to create_or_update_dataset.
+        sync_record: receives the log messages of this step.
+    """
+    sync_record.add_log("Extracting metadata from Quantify dataset: " + syncIdentifier.dataIdentifier)
+    folder_name = syncIdentifier.dataIdentifier.split('/')[1]
+    # the TUID "YYYYmmdd-HHMMSS-sss-xxxxxx" is 26 characters long, followed by "-" and the name
+    tuid = folder_name[:26]
+    name = folder_name[27:]
+    # the timestamp part "YYYYmmdd-HHMMSS-sss" is 19 characters long; a shorter slice drops the last millisecond digit
+    created = datetime.strptime(tuid[:19], "%Y%m%d-%H%M%S-%f")
+    # get variable names in the dataset, this is handy for searching!
+    keywords = set()
+    dataset_dir = os.path.join(configData.quantify_directory, syncIdentifier.dataIdentifier)
+    # loop through all datasets in the dataset folder (not recursive) and get the keywords
+    for file in os.listdir(dataset_dir):
+        if not (file.endswith(".hdf5") or file.endswith(".h5")):
+            continue
+        try:
+            with xarray.load_dataset(os.path.join(dataset_dir, file), engine='h5netcdf') as xr_ds:
+                for key in list(xr_ds.keys()) + list(xr_ds.coords):
+                    attrs = xr_ds[key].attrs
+                    # 'long_name' is preferred over 'name'
+                    if 'long_name' in attrs:
+                        keywords.add(attrs['long_name'])
+                    elif 'name' in attrs:
+                        keywords.add(attrs['name'])
+        except Exception as e:
+            # keyword extraction is best effort; record the failure in the sync record instead of printing it
+            sync_record.add_log(f"Error loading dataset {file}: {e}")
+    sync_record.add_log("Creating dataset info")
+    ds_info = dataset_info(name = name, datasetUUID = syncIdentifier.datasetUUID,
+                alt_uid = tuid, scopeUUID = syncIdentifier.scopeUUID,
+                created = created, keywords = list(keywords),
+                attributes = {"set-up" : configData.set_up})
+    await sync_utilities.create_or_update_dataset(live, syncIdentifier, ds_info, sync_record)

etiket_sync_agent_quantify/utility.py ADDED Viewed

@@ -0,0 +1,37 @@
+import xarray as xr
+import numpy as np
+def to_gridded_dataset(dataset: xr.Dataset, dimension: str = "dim_0") -> xr.Dataset:
+    '''
+    Converts a quantify dataset to a gridded dataset.
+    '''
+    if dimension not in dataset.dims:
+        raise ValueError(f"Dimension {dimension} not in dims {dataset.dims}.")
+    if "grid_2d" in dataset.attrs:
+        # In some cases the type does not seem to be a python type, so checking for numpy type as well.
+        if isinstance(dataset.attrs["grid_2d"], bool) and dataset.attrs["grid_2d"] is False:
+            raise ValueError("Dataset is not gridded, this function cannot be applied.")
+        if isinstance(dataset.attrs["grid_2d"], np.bool_) and dataset.attrs["grid_2d"] == np.bool_(False):
+            raise ValueError("Dataset is not gridded, this function cannot be applied.")
+    coords_names = sorted(v for v in dataset.variables if v.startswith("x"))[::-1]
+    # legacy datasets saved this in vars ...
+    dataset = dataset.set_coords(coords_names)
+    if len(coords_names) == 1:
+        # No unstacking needed just swap the dimension
+        for var in dataset.data_vars:
+            if dimension in dataset[var].dims:
+                dataset = dataset.update(
+                    {var: dataset[var].swap_dims({dimension: coords_names[0]})},
+                )
+    else:
+        dataset = dataset.set_index({dimension: coords_names})
+        dataset = dataset.unstack(dim=dimension)
+    # per quantify convention.
+    if "grid_2d" in dataset.attrs:
+        dataset.attrs["grid_2d"] = False
+    return dataset

etiket_sync_agent_quantify-0.3.0b1.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,158 @@
+Metadata-Version: 2.4
+Name: etiket_sync_agent_quantify
+Version: 0.3.0b1
+Summary: Quantify backend for eTiKeT sync agent
+Author: QHarbor team
+License-Expression: LicenseRef-Proprietary
+Project-URL: Homepage, https://qharbor.nl
+Project-URL: Documentation, https://docs.qharbor.nl
+Keywords: etiket,sync,backend,quantify,quantum
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: Operating System :: POSIX :: Linux
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: Microsoft :: Windows
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
+Classifier: Topic :: Scientific/Engineering
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENCE
+Requires-Dist: etiket_sync_agent>=0.3.0b1
+Requires-Dist: xarray
+Provides-Extra: test
+Requires-Dist: quantify-core>=0.7; extra == "test"
+Dynamic: license-file
+# eTiKeT Sync Agent - Quantify Backend
+Backend for synchronizing Quantify datasets with the eTiKeT platform. This backend scans Quantify data directories and syncs HDF5 datasets to the cloud.
+## Installation
+```bash
+pip install etiket_sync_agent_quantify
+```
+The package is automatically discovered by `etiket_sync_agent` through the entry-point system.
+## What Gets Synchronized
+When a Quantify dataset is synced, the following data is extracted and uploaded:
+| Quantify Data | eTiKeT Field | Description |
+|---------------|--------------|-------------|
+| TUID (first 26 chars) | `alt_uid` | Unique identifier extracted from the folder name |
+| Folder name (after TUID) | `name` | Name of the measurement |
+| TUID timestamp | `collected` | Measurement timestamp parsed from TUID format |
+| Variable `long_name`/`name` attrs | `tags` | Extracted from HDF5 dataset attributes |
+| Config `set_up` | `attributes.set-up` | Experimental setup from configuration |
+| HDF5 files | Data files | xarray datasets converted and uploaded |
+| JSON/text files | Auxiliary files | Additional files in the dataset folder |
+### Data Processing
+- **Grid conversion**: Datasets are automatically converted to gridded format for efficient visualization
+- **Standard deviation detection**: Variables ending with `_u` are marked as uncertainties for their base variable
+- **TUID extraction**: The unique identifier follows the format `YYYYMMDD-HHMMSS-fff-xxxxxx`
+---
+## Configuration
+The Quantify backend requires a `QuantifyConfigData` configuration with the following fields:
+| Field | Type | Required | Description |
+|-------|------|----------|-------------|
+| `quantify_directory` | `Path` or `str` | Yes | Path to the Quantify data directory |
+| `set_up` | `str` | Yes | Name of the experimental setup (added as `set-up` attribute) |
+| `is_server_folder` | `bool` | No | Whether this is a server folder, e.g. one on a university network drive (default: `False`). |
+### Example Configuration
+Example using the `etiket-sdk` package:
+```python
+from etiket_sdk.sync import SyncSources
+SyncSources.create(
+    name="my_quantify_source",
+    backend_identifier="etiket_sync_agent_quantify",
+    config_data={
+        "quantify_directory": "~/quantify-data",
+        "set_up": "dilution_fridge_1",
+        "is_server_folder": False
+    },
+    default_scope="xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+)
+```
+## Live Sync
+The Quantify backend supports **real-time synchronization** of running measurements.
+We currently do not support non-gridded datasets.
+### How Live Detection Works
+A dataset is considered "live" (still being written to) if:
+1. **Recent modification**: The HDF5 file was modified within the last **2 minutes**
+2. **Contains NaN values**: Quantify pre-allocates arrays with NaN values that get filled during measurement
+3. **No newer datasets**: No newer dataset directories have been created since this one (indicating the measurement is still active)
+### How Live Sync Works
+The `XArrayReplicator` class performs real-time replication:
+1. **Creates a qdrive dataset** with SWMR (Single-Writer Multiple-Reader) HDF5 mode
+2. **Monitors file modifications** by checking the source file's modification time
+3. **Tracks data using NaN cursor**: Follows the position of NaN values to detect new data
+4. **Polls for updates** every **0.5 seconds**.
+5. **Completes when**: All NaN values are replaced OR the 2-minute timeout is reached OR a newer dataset directory is created
+### Timeout Behavior
+| Condition | Timeout | Action |
+|-----------|---------|--------|
+| No file modification | 120 seconds (2 min) | Dataset marked as complete |
+| Newer directory created | Immediate | Live sync stops, dataset marked complete |
+| All NaN values filled | Immediate | Dataset marked complete |
+### File Snapshot Safety
+To avoid conflicts with concurrent file writes, the backend creates **temporary snapshots** of HDF5 files before reading:
+```python
+with with_dataset_snapshot(file_path) as safe_file:
+    dataset = xr.open_dataset(safe_file, engine='h5netcdf')
+    # Process dataset safely
+```
+This is necessary because Quantify continuously overwrites the HDF5 file during measurement. Creating a temporary copy avoids file corruption or read errors when the source file is being written to.
+---
+## Features
+- **Direct Quantify integration**: Reads HDF5 datasets written by Quantify's data management system
+- **Live sync support**: Real-time monitoring of running measurements via SWMR HDF5
+- **Automatic TUID parsing**: Extracts timestamps and identifiers from Quantify's TUID format
+- **Grid conversion**: Converts irregular datasets to gridded format for efficient visualization
+- **Auxiliary file upload**: Syncs JSON, text, and other files alongside the main data
+## Requirements
+- Python >= 3.10
+- xarray
+- h5py
+- h5netcdf
+## License
+Copyright © 2025 QHarbor. All Rights Reserved. See [LICENCE](LICENCE) for details.

etiket_sync_agent_quantify-0.3.0b1.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,11 @@
+etiket_sync_agent_quantify/__init__.py,sha256=oG9nX60PYq6eftC2ZfkE3HDzvFO9ZlJbSFxsfsLairU,174
+etiket_sync_agent_quantify/live_sync.py,sha256=AYDVsI9o3urVbI-QwwxCR62zP6wsXV-fUC6JC2cN90o,16344
+etiket_sync_agent_quantify/quantify_config_class.py,sha256=UyBUSXq0AZwzxaVPoI3yDtcMIBh7RaNYygTfNrtBCkU,4018
+etiket_sync_agent_quantify/quantify_sync_class.py,sha256=sCXXJuMXFMUjmdxViZOaP9KQub36X5TTexscox0BLSI,11071
+etiket_sync_agent_quantify/utility.py,sha256=oE6JWLAecg0uPfGV9bs08yz1lt90oKmIUOfe9wtkH6I,1584
+etiket_sync_agent_quantify-0.3.0b1.dist-info/licenses/LICENCE,sha256=tdZwE43Th9efUgN8-4UpMUyh0kYQ4Dk678agSuhpnc0,2357
+etiket_sync_agent_quantify-0.3.0b1.dist-info/METADATA,sha256=EB4Kot_woslLewePV9sv_NPTxWDoPASK0_RMMSe9arI,6202
+etiket_sync_agent_quantify-0.3.0b1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+etiket_sync_agent_quantify-0.3.0b1.dist-info/entry_points.txt,sha256=NVgaHloUE0GgtwSkWJGgQ4GYl6i1bT2l6qQ79OEwwzY,91
+etiket_sync_agent_quantify-0.3.0b1.dist-info/top_level.txt,sha256=fKH3mxReAXUBerdivtAr9yJ4_JFc2oP_QxCX4xeEBEc,27
+etiket_sync_agent_quantify-0.3.0b1.dist-info/RECORD,,

etiket_sync_agent_quantify-0.3.0b1.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

etiket_sync_agent_quantify-0.3.0b1.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [etiket_sync_agent.backends]
2	+ quantify_sync_agent = etiket_sync_agent_quantify:QuantifySync

etiket_sync_agent_quantify-0.3.0b1.dist-info/licenses/LICENCE ADDED Viewed

@@ -0,0 +1,34 @@
+All Rights Reserved License
+Copyright ©️ 2024-2026 QHarbor B.V. All Rights Reserved.
+Agreement to Terms
+By accessing, downloading, installing, or viewing this Software (including its source code, binaries, or application files), you acknowledge and agree to the terms outlined below.
+Terms and Conditions
+This software and its source code (the "Software") are the exclusive property of QHarbor B.V. and are protected by copyright and other intellectual property laws.
+License and Testing Exemptions
+Commercial License: If you have entered into a separate commercial license agreement with QHarbor B.V., the terms of that agreement shall supersede the restrictions listed below.
+Testing Permission: If you have obtained written permission for testing/evaluation from QHarbor B.V., you are permitted to install and use the Software for evaluation purposes. However, this permission strictly excludes the right to modify, alter, create derivative works, or reverse engineer the Software.
+Prohibited Actions
+Unless explicitly authorized by the exemptions above, you are NOT permitted to:
+	•	Copy, reproduce, or duplicate the Software in any form (except as reasonably necessary for viewing or authorized installation)
+	•	Modify, alter, or create derivative works based on the Software
+	•	Distribute, publish, or share the Software with others
+	•	Reverse engineer, decompile, or disassemble the Software
+	•	Use the Software for any commercial or non-commercial purposes
+	•	Transfer, sell, lease, or sublicense the Software
+	•	Remove or alter any copyright notices or proprietary markings
+Viewing Only
+For those without a commercial license or written testing permission, the Software is made available for viewing and reference purposes only. Any access to view the source code or application does not grant any rights to use, copy, or modify the Software.
+No Implied Rights
+No rights are granted by implication, estoppel, or otherwise. All rights not expressly granted are reserved by the copyright holder.
+Governing Law
+This license and any disputes arising from it shall be governed by the laws of the Netherlands.
+Disclaimer
+This Software is provided "AS IS" without warranty of any kind. The copyright holder disclaims all warranties and shall not be liable for any damages arising from the use or inability to use this Software.

etiket_sync_agent_quantify-0.3.0b1.dist-info/top_level.txt ADDED Viewed

	@@ -0,0 +1 @@
1	+ etiket_sync_agent_quantify