etiket-sync-agent-quantify 0.3.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
1
+ from .quantify_sync_class import QuantifySync
2
+ from .quantify_config_class import QuantifyConfigData
3
+
4
+ __all__ = ["QuantifySync", "QuantifyConfigData"]
5
+ __version__ = "0.3.0b1"
@@ -0,0 +1,360 @@
1
+ from contextlib import contextmanager
2
+ from pathlib import Path
3
+
4
+ from qdrive.dataset.dataset import dataset
5
+ from etiket_client.remote.endpoints.models.types import FileStatusLocal, FileType
6
+ from etiket_sync_agent_quantify.utility import to_gridded_dataset
7
+
8
+ import time
9
+ import logging
10
+ import tempfile
11
+ import h5py
12
+ import uuid
13
+
14
+ import xarray as xr
15
+ import numpy as np
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
@contextmanager
def with_dataset_snapshot(file_location: Path) -> Path:
    """
    Context manager that copies a dataset file to a temporary ``.hdf5`` file and
    yields the path of that copy, so it can be read while another process may
    still be writing to the original.

    The temporary file is closed and removed when the ``with`` block exits,
    whether or not an exception occurred.

    Args:
        file_location: Path to the original dataset file.

    Yields:
        Path to the temporary copy.

    Raises:
        OSError: if the source cannot be read or the copy cannot be written.
        Any exception raised inside the ``with`` block is logged here and
        re-raised unchanged.

    Example:
        with with_dataset_snapshot(data_file) as safe_file:
            dataset = xr.open_dataset(safe_file, engine='h5netcdf')
            # Process dataset without worrying about concurrent writes
    """
    import shutil  # local import: only needed by this function

    temp_file = tempfile.NamedTemporaryFile(suffix=".hdf5", delete=False)
    snapshot_path = Path(temp_file.name)
    try:
        with open(file_location, 'rb') as src:
            # Stream the copy in chunks instead of loading the whole dataset in memory.
            shutil.copyfileobj(src, temp_file)
        # Close before yielding so the consumer can open the file on every platform.
        temp_file.close()
        yield snapshot_path
    except OSError as e:
        logger.exception(f"Error creating dataset snapshot for {file_location}: {e}")
        raise
    except Exception as e:
        logger.exception(f"Unexpected error handling dataset {file_location}: {e}")
        raise
    finally:
        # close() is a no-op when already closed; this guarantees the handle is
        # released before unlinking even when the copy itself failed.
        temp_file.close()
        try:
            snapshot_path.unlink()
        except OSError:
            logger.warning(f"Failed to remove temporary snapshot file: {temp_file.name}")
56
+
57
def is_dataset_live(file_location: Path, perform_NAN_check = True, n_th_attempt = 0) -> bool:
    """
    Heuristically decide whether a dataset file is still being written.

    Returns False if one of the following conditions is met:
    - the file has not been modified in the last 120 seconds;
    - another directory next to the file's own directory has a modification
      time newer than the file;
    - another directory next to the directory that contains the file's
      directory has a modification time newer than the file;
    - ``perform_NAN_check`` is True and the dataset contains no NaN values,
      or an unexpected (non-OSError) error occurs while checking.

    Args:
        file_location: path of the dataset file to inspect.
        perform_NAN_check: when True, the result is whether a snapshot of the
            file still contains NaN values.
        n_th_attempt: internal retry counter for the NaN check.

    Raises:
        ValueError: if the snapshot still cannot be read (OSError) after the
            initial attempt and 3 retries.
    """
    # Check modification time
    current_dataset_mtime = file_location.stat().st_mtime
    if (time.time() - current_dataset_mtime) > 120:
        return False

    own_dir = file_location.parent
    container_dir = own_dir.parent
    outer_dir = container_dir.parent

    # A newer directory at either level is taken as a sign that this dataset is finished.
    try:
        for item in container_dir.iterdir():
            if item.is_dir() and item.name != own_dir.name:
                if item.stat().st_mtime > current_dataset_mtime:
                    logger.debug(f"Found newer directory {item} in parent directory")
                    return False

        if outer_dir.exists():
            for item in outer_dir.iterdir():
                if item.is_dir() and item.name != container_dir.name:
                    if item.stat().st_mtime > current_dataset_mtime:
                        logger.debug(f"Found newer directory {item} in grandparent directory")
                        return False
    except (PermissionError, FileNotFoundError) as e:
        # Best effort: an unreadable directory does not decide the outcome.
        logger.warning(f"Error checking for new directories: {e}")

    if perform_NAN_check is True:
        logger.debug("Checking dataset for NaN values")
        try:
            with with_dataset_snapshot(file_location) as safe_file:
                with xr.open_dataset(safe_file, engine='h5netcdf') as dataset:
                    # Remaining NaN values are taken to mean the dataset is not fully written yet.
                    return has_nan_values(dataset)
        except OSError as e:
            # likely partially written dataset, try again (max 3 times)
            if n_th_attempt < 3:
                # sleep to give io time to finish.
                time.sleep(0.5)
                return is_dataset_live(file_location, perform_NAN_check, n_th_attempt + 1)
            else:
                logger.exception("Error checking for NaN values in dataset")
                # file is corrupted in some way, probably because something is writing.
                raise ValueError("Unable to read HDF5 files, maybe something is locked?") from e
        except Exception:
            logger.exception("Error checking for NaN values in dataset")
            return False

    return True
114
+
115
class XArrayReplicator:
    '''
    Replicates an xarray (netCDF/HDF5) dataset into a new HDF5 file registered in a qdrive dataset.

    The replica is opened for appending and switched to SWMR mode; every variable gets a
    DatasetFollower that tracks how far the source has been filled by following its NaN values.

    NOTE(review): ``self.hdf5_file`` is never closed by this class -- confirm that the owner closes it.
    '''
    def __init__(self, ds_name : str, dataset_location: Path, dataset_uuid: uuid.UUID):
        '''
        Create the replica file for ``dataset_location`` and attach followers.

        Args:
            ds_name: name under which the file is registered in the qdrive dataset.
            dataset_location: path of the source dataset file.
            dataset_uuid: identifier passed to ``dataset(...)`` to obtain the qdrive dataset.
        '''
        self.dataset_location = dataset_location
        self.qdrive_dataset = dataset(dataset_uuid)

        self.dataset_followers = {}

        self.last_mod_sync = dataset_location.stat().st_mtime

        with with_dataset_snapshot(dataset_location) as safe_file:
            with xr.open_dataset(safe_file, engine='h5netcdf') as xr_dataset:
                with tempfile.TemporaryDirectory() as temp_dir:
                    temp = Path(temp_dir) / "temp.hdf5"

                    try:
                        xr_dataset = to_gridded_dataset(xr_dataset)
                    except Exception:
                        # Best effort: datasets that cannot be gridded are replicated as they are.
                        logger.debug("Dataset could not be gridded, replicating it unchanged", exc_info=True)

                    xr_dataset.to_netcdf(temp, engine='h5netcdf', invalid_netcdf=True)
                    m_file = Path(temp_dir) / "measurement.hdf5"
                    # kinda have to do some hacky stuff to get the superblock to work ... (standard superblock is v2, but we need at least v3 for the qdrive dataset)
                    # The intermediate file is opened in a `with` block so its handle is
                    # released before the temporary directory is cleaned up.
                    with h5py.File(temp, 'r') as h5_intermediate:
                        convert_to_superblock_v3(m_file, h5_intermediate)
                    if ds_name not in self.qdrive_dataset.files.keys():
                        self.qdrive_dataset._files._add_new_file(ds_name,
                                                                 file_path=m_file,
                                                                 file_type=FileType.HDF5_CACHE,
                                                                 generator="quantify_sync_module",
                                                                 file_status = FileStatusLocal.writing)

                    self.hdf5_file = h5py.File(self.qdrive_dataset[ds_name].path, 'a', locking=False, libver='v112')

                    for name in xr_dataset.variables :
                        self.dataset_followers[name] = DatasetFollower(self.hdf5_file[name], xr_dataset[name])

                    self.hdf5_file.swmr_mode = True

    def sync(self):
        '''
        Copy new values from the source file to the replica until the dataset is done.

        Each iteration first evaluates ``_check_done`` and then, if the source file was
        modified, reads a snapshot and lets every follower write its new values. The loop
        therefore performs one more update pass after the dataset was found to be done.
        Finally all followers are marked as completed.

        This method blocks (it sleeps 0.5 s between polls).
        '''
        keep_syncing = True

        while keep_syncing:
            keep_syncing = not self._check_done()
            if self._has_update():
                try:
                    with with_dataset_snapshot(self.dataset_location) as safe_file:
                        with xr.open_dataset(safe_file, engine='h5netcdf') as xr_dataset:
                            try:
                                xr_dataset = to_gridded_dataset(xr_dataset)
                            except Exception:
                                # Best effort, same as in __init__.
                                logger.debug("Dataset could not be gridded, replicating it unchanged", exc_info=True)

                            for name in xr_dataset.variables:
                                self.dataset_followers[name].update(xr_dataset[name])
                            self.hdf5_file.flush()
                except Exception:
                    # The source may be half-written; wait and retry on the next iteration.
                    time.sleep(0.5)
                    logger.exception("Error reading dataset")
            else:
                time.sleep(0.5) # default write interval in quantify is 0.5s

        for follower in self.dataset_followers.values():
            follower.complete()

    def _has_update(self) -> bool:
        '''Return True (and remember the new timestamp) if the source file was modified since the last check.'''
        last_mod = self.dataset_location.stat().st_mtime
        if last_mod > self.last_mod_sync:
            self.last_mod_sync = last_mod
            return True
        return False

    def _check_done(self):
        '''
        Return True when every follower has seen data without NaN values, or when the
        source file is no longer considered live (checked without the NaN test).
        '''
        if all(follower.noNanValues for follower in self.dataset_followers.values()):
            return True
        return not is_dataset_live(self.dataset_location, perform_NAN_check=False)
198
+
199
class DatasetFollower:
    '''
    Caches the fill state of one variable of the source netCDF file. When the source
    contains new values, they are written to the matching dataset of the live HDF5 file.

    The fill state is stored on the HDF5 dataset in the ``__cursor`` attribute (index of
    the last non-NaN value) and the ``completed`` attribute.

    NOTE(review): for 0-dimensional data the cursor is an empty tuple while the attribute
    shape is (1,) -- confirm that such variables do not occur.
    '''
    def __init__(self, h5_dataset: h5py.Dataset, initial_state: xr.DataArray):
        '''
        Args:
            h5_dataset: dataset in the live HDF5 file that receives the values.
            initial_state: current content of the variable in the source file.
        '''
        self.dataset = h5_dataset
        # Set to True by __get_cursor once data without any NaN value has been seen.
        self.noNanValues = False

        raw_data = np.asarray(initial_state.data)
        cursor = self.__get_cursor(raw_data)
        cursor_shape = (1,) if raw_data.ndim == 0 else (raw_data.ndim,)

        h5_dataset.attrs.create('__cursor', cursor, dtype=np.int32, shape=cursor_shape)
        h5_dataset.attrs['completed'] = False

    def update(self, data_array: xr.DataArray):
        '''
        Write the values that appeared in ``data_array`` since the previous cursor position.

        Nothing is written when the cursor did not move. Errors are logged and the stored
        cursor is restored, so the same range is retried on the next call.
        '''
        data = data_array.values
        old_cursor = self.dataset.attrs['__cursor']
        new_cursor = self.__get_cursor(data)
        if np.array_equal(old_cursor, new_cursor):
            return

        try:
            if self.dataset.shape != data.shape:
                self.dataset.resize(data.shape)

            # Build the selection up to and including the first dimension in which the
            # cursor moved; dimensions after it are left out and therefore taken in full.
            slices = []
            for i in range(data.ndim):
                if old_cursor[i] == new_cursor[i]:
                    slices.append(slice(new_cursor[i], new_cursor[i]+1))
                else:
                    # The cursor is the index of the last non-NaN value, so the stop is
                    # new_cursor + 1 in every dimension. Starting at the old cursor
                    # rewrites one known value, which also covers the case where index 0
                    # was still NaN at the previous check.
                    slices.append(slice(old_cursor[i], new_cursor[i]+1))
                    break
            selection = tuple(slices)
            self.dataset.write_direct(data, selection, selection)
            self.dataset.attrs['__cursor'] = new_cursor
            self.dataset.attrs['completed'] = False
        except Exception:
            self.dataset.attrs['__cursor'] = old_cursor
            self.dataset.attrs['completed'] = False
            logger.exception("Error updating dataset")

    def complete(self):
        '''Mark the HDF5 dataset as completed.'''
        self.dataset.attrs['completed'] = True

    def __get_cursor(self, raw_data: np.ndarray):
        '''
        Finds the position of the last value that is not NaN in the data array.
        This helps track how much of the dataset has already been written.

        Returns the index of the last element when no NaN is present (and sets
        ``noNanValues``), and all zeros when every value is NaN.
        '''
        non_nan_mask = ~np.isnan(raw_data)

        if np.all(non_nan_mask):
            self.noNanValues = True
            return np.unravel_index(raw_data.size - 1, raw_data.shape)

        # If all values are NaN, return zeros
        if not np.any(non_nan_mask):
            return tuple([0] * len(raw_data.shape))

        # At least one non-NaN value exists here; convert the flat index of the
        # last one to dimensional indices.
        last_non_nan_idx = np.flatnonzero(non_nan_mask)[-1]
        return np.unravel_index(last_non_nan_idx, raw_data.shape)
268
+
269
def convert_to_superblock_v3(new_file : Path, h5_old_file : h5py.File):
    """
    Re-create the top-level content of an open HDF5 file in a new file written with libver='v112'.

    Top-level groups are created empty (their members are not copied), top-level datasets are
    copied with their full data, and file/object attributes are copied. Attributes that hold
    object references (DIMENSION_LIST, REFERENCE_LIST) are rebuilt so that they point to the
    objects of the new file; CLASS and NAME are rewritten as fixed-length strings.

    Args:
        new_file: path of the file to create (opened with mode 'w', so existing content is replaced).
        h5_old_file: open source file; it is not closed by this function.

    Raises:
        ValueError: if a top-level object is neither a group nor a dataset.
    """
    with h5py.File(new_file, 'w', locking=False, libver='v112') as h5_new_file:
        # create all groups and dataset of the original file (normally not nested)
        for h5_name, h5_object in h5_old_file.items():
            if isinstance(h5_object, h5py.Group):
                h5_new_file.create_group(h5_name)
            elif isinstance(h5_object, h5py.Dataset):
                h5_new_file.create_dataset(h5_name, data=h5_object[()])
            else:
                raise ValueError("Unknown type in HDF5 file")

        # Copy file attributes
        for h5_name, h5_object in h5_old_file.attrs.items():
            h5_new_file.attrs[h5_name] = h5_object

        # Copy object attributes and handle special cases
        # (done in a second pass: the references below are resolved by name in the new
        # file, so every object has to exist already)
        for h5_name, h5_object in h5_old_file.items():
            for attr_name, attr_value in h5_object.attrs.items():
                if attr_name == 'DIMENSION_LIST':
                    # one array of references per entry of the attribute, each reference
                    # re-pointed to the object with the same name in the new file
                    dimension_scale = [np.array([h5_new_file[h5py.h5r.get_name(ds_ref, h5_old_file.id)].ref
                                                 for ds_ref in reference_list], dtype=np.object_)
                                       for reference_list in attr_value]

                    create_dimension_list_attr(h5_new_file, h5_name, dimension_scale)
                elif attr_name == 'REFERENCE_LIST':
                    # extract from compound datatype
                    reference_list = [(h5_new_file[h5py.h5r.get_name(ref_compound['dataset'], h5_old_file.id)].ref, ref_compound['dimension']) for ref_compound in attr_value]
                    create_reference_list_attr(h5_new_file, h5_name, reference_list)
                elif attr_name == 'CLASS':
                    # assumes the value is read back as bytes -- TODO confirm for all writers
                    create_str_attr(h5_new_file[h5_name], 'CLASS', str(attr_value.decode('utf-8')))
                elif attr_name == 'NAME':
                    create_str_attr(h5_new_file[h5_name], 'NAME', str(attr_value.decode('utf-8')))
                else :
                    h5_new_file[h5_name].attrs[attr_name] = attr_value
303
+
304
+
305
def create_dimension_list_attr(h5_new_file, h5_name, dimension_scale):
    """
    Write the DIMENSION_LIST attribute of ``h5_new_file[h5_name]`` through the low-level h5py API.

    Args:
        h5_new_file: open, writable h5py file.
        h5_name: name of the object that receives the attribute.
        dimension_scale: list with one array of object references per attribute entry.
    """
    entry_count = len(dimension_scale)
    vlen_reference_type = h5py.h5t.vlen_create(h5py.h5t.STD_REF_OBJ)
    dataspace = h5py.h5s.create_simple((entry_count,), (entry_count,))
    target_id = h5_new_file[h5_name].id
    attribute = h5py.h5a.create(target_id, b'DIMENSION_LIST', vlen_reference_type, dataspace)

    # Append and remove an empty string to ensure correct type
    padded = np.array(dimension_scale + [''], dtype=object)
    attribute.write(padded[:-1])
311
+
312
def create_reference_list_attr(h5_new_file, h5_name, reference_list):
    """
    Write the REFERENCE_LIST attribute of ``h5_new_file[h5_name]`` through the low-level h5py API.

    The attribute uses a compound type with an object reference ('dataset') followed by
    an unsigned 32-bit integer ('dimension').

    Args:
        h5_new_file: open, writable h5py file.
        h5_name: name of the object that receives the attribute.
        reference_list: sequence of ``(reference, dimension)`` tuples.
    """
    reference_type = h5py.h5t.STD_REF_OBJ
    index_type = h5py.h5t.NATIVE_UINT32
    reference_size = reference_type.get_size()

    compound_type = h5py.h5t.create(h5py.h5t.COMPOUND, reference_size + index_type.get_size())
    compound_type.insert(b'dataset', 0, reference_type)
    # the dimension index is stored directly after the reference
    compound_type.insert(b'dimension', reference_size, index_type)

    entry_count = len(reference_list)
    dataspace = h5py.h5s.create_simple((entry_count,), (entry_count,))
    attribute = h5py.h5a.create(h5_new_file[h5_name].id, b'REFERENCE_LIST', compound_type, dataspace)

    records = np.array(reference_list, dtype=[('dataset', 'O'), ('dimension', np.uint32)])
    attribute.write(records)
319
+
320
+
321
def create_str_attr(dataset : h5py.Dataset, attr_name : str, string_value: str):
    """
    (Re)create a scalar, null-terminated, fixed-length string attribute on ``dataset``.

    An existing attribute with the same name is deleted first.

    Args:
        dataset: object that receives the attribute.
        attr_name: attribute name.
        string_value: attribute value; it is encoded as ASCII.
    """
    encoded_name = attr_name.encode('utf-8')
    if h5py.h5a.exists(dataset.id, encoded_name):
        h5py.h5a.delete(dataset.id, name=encoded_name)

    # one extra byte for the terminating null character
    storage_size = len(string_value) + 1

    string_type = h5py.h5t.TypeID.copy(h5py.h5t.C_S1)
    string_type.set_size(storage_size)
    string_type.set_strpad(h5py.h5t.STR_NULLTERM)
    scalar_space = h5py.h5s.create(h5py.h5s.SCALAR)

    attribute = h5py.h5a.create(dataset.id, encoded_name, string_type, scalar_space)
    payload = np.array(string_value.encode('ascii'), dtype=h5py.string_dtype('ascii', storage_size))
    attribute.write(payload)
333
+
334
def has_nan_values(dataset: xr.Dataset) -> bool:
    """
    Tell whether an xarray Dataset holds a NaN value in any data variable or coordinate.

    Data variables are inspected first, then coordinates; arrays for which
    ``np.isnan`` raises TypeError (e.g. strings) are skipped.

    Args:
        dataset: The xarray Dataset to check

    Returns:
        True if any NaN values are found, False otherwise
    """
    def _contains_nan(array) -> bool:
        # Non-numeric arrays cannot be tested and count as NaN-free.
        try:
            return bool(np.isnan(array.values).any())
        except TypeError:
            return False

    for var_name, variable in dataset.data_vars.items():
        if _contains_nan(variable):
            logger.debug(f"Found NaN values in data variable: {var_name}")
            return True

    for coord_name, coordinate in dataset.coords.items():
        if _contains_nan(coordinate):
            logger.debug(f"Found NaN values in coordinate: {coord_name}")
            return True

    return False
@@ -0,0 +1,76 @@
1
+ import pathlib
2
+ import dataclasses
3
+ import etiket_sync_agent_quantify
4
+
5
+ from typing import Optional
6
+
7
+ from etiket_sync_agent.db import get_db_session_context
8
+ from etiket_sync_agent.crud.sync_sources import crud_sync_sources, SyncSources
9
+
10
@dataclasses.dataclass
class QuantifyConfigData:
    """Configuration of a Quantify sync source."""
    # Base directory of the Quantify data; str input is converted to Path and ~ is expanded.
    quantify_directory: pathlib.Path
    # Stored as the "set-up" attribute of datasets created from this source.
    set_up : str
    is_server_folder: bool = False

    def __post_init__(self):
        # ensure the path is of the type pathlib.Path (str is converted to Path) and expand ~
        self.quantify_directory = pathlib.Path(self.quantify_directory).expanduser()

    async def validate(self, current_sync_source : Optional[SyncSources] = None):
        """
        Validates the Quantify base directory configuration.

        Checks:
        1. If the quantify_directory exists and is a directory.
        2. If ``current_sync_source`` is given: that its stored directory is not being changed.
        3. If the quantify_directory conflicts with another existing Quantify sync source
           (i.e., it's identical, a subdirectory, or a parent directory).

        Args:
            current_sync_source: the sync source that is being updated, if any.

        Raises:
            ValueError: If any validation check fails.

        Returns:
            True if all checks pass.
        """
        # Resolve to an absolute path for consistent comparisons
        try:
            abs_quantify_dir = self.quantify_directory.expanduser().resolve(strict=True)
        except FileNotFoundError as e:
            raise ValueError(f"The specified Quantify directory does not exist: {self.quantify_directory}") from e

        # check if the path exists and is a directory.
        if not abs_quantify_dir.is_dir():
            raise ValueError(f"The specified path is not a directory: {abs_quantify_dir}")

        # check if the directory is not yet added/is part of a directory that is already added.
        async with get_db_session_context() as session:
            sync_sources = await crud_sync_sources.list_sync_sources(session)
            for sync_source in sync_sources:
                if current_sync_source is not None and sync_source.id == current_sync_source.id:
                    # The path of an existing sync source may not be changed. The stored
                    # path is normalised like the new one, so that an unexpanded or
                    # symlinked stored path is not mistaken for a change.
                    stored_path = pathlib.Path(sync_source.config_data['quantify_directory']).expanduser().resolve()
                    if stored_path != abs_quantify_dir:
                        raise ValueError(f"The directory of sync source '{sync_source.name}' cannot be changed (configured: '{stored_path}', requested: '{abs_quantify_dir}').")
                    continue

                # Only other Quantify sync sources can conflict.
                if sync_source.backend != etiket_sync_agent_quantify.__name__:
                    continue

                # Pre-set so that the warning below never refers to an unbound name or
                # to the value of a previous iteration.
                existing_path_str = None
                try:
                    existing_path_str = sync_source.config_data['quantify_directory']
                    existing_path = pathlib.Path(existing_path_str).expanduser().resolve()
                except Exception as e:
                    # If an existing path can't be resolved, log it but continue validation
                    print(f"Warning: Could not resolve existing quantify directory '{existing_path_str}' for sync source '{sync_source.name}' ({sync_source.id}): {e}")
                    continue

                # Check for conflicts
                if abs_quantify_dir == existing_path:
                    raise ValueError(f"The directory '{abs_quantify_dir}' is already added as sync source '{sync_source.name}'.")
                # Check if the new path is inside an existing path
                if abs_quantify_dir.is_relative_to(existing_path):
                    raise ValueError(f"The directory '{abs_quantify_dir}' is inside the directory '{existing_path}' added by sync source '{sync_source.name}'.")
                # Check if an existing path is inside the new path
                if existing_path.is_relative_to(abs_quantify_dir):
                    raise ValueError(f"The directory '{existing_path}' added by sync source '{sync_source.name}' is inside the specified directory '{abs_quantify_dir}'.")

        return True
@@ -0,0 +1,199 @@
1
+ import os
2
+ import pathlib
3
+ import xarray
4
+ import re
5
+ import typing
6
+
7
+ from datetime import datetime
8
+ from pathlib import Path
9
+
10
+ from etiket_sync_agent_quantify.live_sync import is_dataset_live, XArrayReplicator
11
+ from etiket_sync_agent_quantify.quantify_config_class import QuantifyConfigData
12
+ from etiket_sync_agent_quantify.utility import to_gridded_dataset
13
+
14
+ from etiket_sync_agent.backends.sync_source_abstract import SyncSourceFileBase, ScopeRequirement
15
+ from etiket_sync_agent.sync.sync_records.manager import SyncRecordManager
16
+ from etiket_sync_agent.sync.sync_utilities import dataset_info, file_info, FileType, sync_utilities
17
+ from etiket_sync_agent.schemas import SyncItemSchema
18
+
19
class QuantifySync(SyncSourceFileBase):
    """Sync source that uploads Quantify datasets (HDF5 files plus auxiliary files)."""
    sync_agent_name: typing.ClassVar[str] = "Quantify"
    config_data_class: typing.ClassVar[typing.Type[QuantifyConfigData]] = QuantifyConfigData
    scope_requirement: typing.ClassVar[ScopeRequirement] = ScopeRequirement.REQUIRED
    supports_scope_mapping: typing.ClassVar[bool] = False
    live_sync_implemented: typing.ClassVar[bool] = True
    level: typing.ClassVar[int] = 2
    has_owner: typing.ClassVar[bool] = True
    is_single_file: typing.ClassVar[bool] = False

    @staticmethod
    def rootPath(config_data: QuantifyConfigData) -> pathlib.Path:
        """Return the configured Quantify base directory as a Path."""
        return pathlib.Path(config_data.quantify_directory)

    @staticmethod
    async def checkLiveDataset(config_data: QuantifyConfigData, syncIdentifier: SyncItemSchema, maxPriority: bool) -> bool:
        """
        Return True if the dataset directory holds an HDF5 file that is still being written.

        Returns False when ``maxPriority`` is False, when a directory next to the
        dataset's containing directory is newer than the dataset directory, or when the
        directories cannot be inspected.
        """
        if not maxPriority:
            return False

        dataset_dir = Path(os.path.join(config_data.quantify_directory, syncIdentifier.dataIdentifier))

        try:
            current_dataset_mtime = dataset_dir.stat().st_mtime
            container_dir = dataset_dir.parent

            # A newer directory next to the containing directory is taken as a sign that
            # this dataset is finished. The scan deliberately stops at this level: one
            # level higher would inspect directories next to the data root, which are
            # unrelated to the measurement data.
            # NOTE(review): assumes dataIdentifier has two levels below quantify_directory
            # (consistent with `level = 2`) -- confirm.
            for item in container_dir.parent.iterdir():
                if item.is_dir() and item.name != container_dir.name:
                    if item.stat().st_mtime > current_dataset_mtime:
                        return False
        except (PermissionError, FileNotFoundError):
            return False

        # Also check for any HDF5 files in the dataset directory that might be live
        for root, _, files in os.walk(dataset_dir):
            for file in files:
                if file.endswith((".hdf5", ".h5")):
                    if is_dataset_live(Path(os.path.join(root, file))):
                        return True

        return False

    @staticmethod
    async def syncDatasetNormal(configData: QuantifyConfigData, syncIdentifier: SyncItemSchema, sync_record: SyncRecordManager):
        """
        Create the dataset record and upload all files found in the dataset directory.

        Non-HDF5 files are uploaded as they are. HDF5 files are first replicated live if
        they are still being written, and are then uploaded as xarray datasets.

        Raises:
            Exception: if an HDF5 file is still live after the replicator returned.
        """
        with sync_record.task("Creating dataset from Quantify dataset (not live)"):
            await create_ds_from_quantify(configData, syncIdentifier, False, sync_record)

        dataset_path = pathlib.Path(os.path.join(configData.quantify_directory, syncIdentifier.dataIdentifier))
        with sync_record.task("Uploading auxiliary files to the server"):
            for root, dirs, files in os.walk(dataset_path):
                for file in files:
                    if file.endswith((".hdf5", ".h5")):
                        continue

                    name, file_path = process_file_name(root, file, dataset_path)
                    if name is None:
                        continue

                    f_type = FileType.UNKNOWN
                    if file.endswith(".json"):
                        f_type = FileType.JSON
                    if file.endswith(".txt"):
                        f_type = FileType.TEXT

                    f_info = file_info(name = name, fileName = file,
                                       created = datetime.fromtimestamp(file_path.stat().st_mtime),
                                       fileType = f_type, file_generator = "Quantify")

                    await sync_utilities.upload_file(file_path, syncIdentifier, f_info, sync_record)

        with sync_record.task("Uploading HDF5 datasets to the server"):
            for root, dirs, files in os.walk(dataset_path):
                for file in files:
                    if not file.endswith((".hdf5", ".h5")):
                        continue

                    name, file_path = process_file_name(root, file, dataset_path)
                    if name is None:
                        continue

                    if is_dataset_live(file_path) is True:
                        # blocks until the replicator considers the dataset done
                        replicator = XArrayReplicator(name, file_path, syncIdentifier.datasetUUID)
                        replicator.sync()

                    # upload only if the dataset is not live anymore, assuming the exit of the replicator is triggered on finish
                    if is_dataset_live(file_path) is False:
                        f_info = file_info(name = name, fileName = file,
                                           created = datetime.fromtimestamp(file_path.stat().st_mtime),
                                           fileType = FileType.HDF5_NETCDF, file_generator = "Quantify")
                        ds = xarray.load_dataset(file_path, engine='h5netcdf')

                        try:
                            ds = to_gridded_dataset(ds)
                        except ValueError as e:
                            sync_record.add_log(f"Error converting dataset to gridded dataset: {e} -- proceeding without conversion")
                        except Exception as e:
                            sync_record.add_log(f"Error converting dataset to gridded dataset (unknown error) {e} -- proceeding without conversion")

                        # check if fields in the datasets are standard deviations and mark them as such -- this is useful for plotting
                        data_vars = list(ds)
                        for var_name in data_vars:
                            if var_name.endswith("_u") and var_name[:-2] in data_vars:
                                ds[var_name[:-2]].attrs['__std'] = var_name
                                ds[var_name].attrs['__is_std'] = 1

                        await sync_utilities.upload_xarray(ds, syncIdentifier, f_info, sync_record)
                    else:
                        raise Exception("Live dataset is still live after replicator exit")

    @staticmethod
    async def syncDatasetLive(configData: QuantifyConfigData, syncIdentifier: SyncItemSchema, sync_record: SyncRecordManager):
        """
        Create the dataset record (flagged as live) and replicate every HDF5 file in the
        dataset directory that is still being written.
        """
        with sync_record.task("Creating dataset from Quantify dataset (live)"):
            await create_ds_from_quantify(configData, syncIdentifier, True, sync_record)
        dataset_path = pathlib.Path(os.path.join(configData.quantify_directory, syncIdentifier.dataIdentifier))

        with sync_record.task("Starting live replication from Quantify datasets"):
            for root, dirs, files in os.walk(dataset_path):
                for file in files:
                    if not file.endswith((".hdf5", ".h5")):
                        continue

                    name, file_path = process_file_name(root, file, dataset_path)
                    if name is None:
                        continue

                    if is_dataset_live(file_path) is True:
                        replicator = XArrayReplicator(name, file_path, syncIdentifier.datasetUUID)
                        replicator.sync()
152
+
153
def process_file_name(file_dir : str, file_name : str, dataset_path : typing.Union[str, os.PathLike]) -> typing.Tuple[typing.Optional[str], typing.Optional[pathlib.Path]]:
    """
    Derive the upload name and the full path of a file inside a dataset directory.

    The name is the path of the file relative to ``dataset_path``, with a leading
    ``YYYYmmdd-HHMMSS-sss-xxxxxx-`` identifier removed from every path component and the
    components joined with dots.

    Args:
        file_dir: directory that contains the file.
        file_name: name of the file.
        dataset_path: root directory of the dataset.

    Returns:
        ``(name, path)``, or ``(None, None)`` for hidden files (name starting with a dot).
    """
    if file_name.startswith("."):
        return None, None

    file_path = pathlib.Path(os.path.join(file_dir, file_name))
    relative_path = os.path.relpath(os.path.join(file_dir, file_name), start=dataset_path)
    name_parts = [re.sub(r"\d{8}-\d{6}-\d{3}-[a-z0-9]{6}-", "", part)
                  for part in pathlib.Path(relative_path).parts]
    return ".".join(name_parts), file_path
163
+
164
async def create_ds_from_quantify(configData: QuantifyConfigData, syncIdentifier: SyncItemSchema, live : bool, sync_record: SyncRecordManager):
    """
    Create or update the dataset record for a Quantify dataset directory.

    The TUID (first 26 characters) and the name are taken from the second component of
    ``syncIdentifier.dataIdentifier``; the creation time is parsed from the TUID. The
    ``long_name`` (or else ``name``) attribute of every data variable and coordinate of
    the HDF5 files directly inside the directory is collected as keyword.

    Args:
        configData: configuration of the sync source.
        syncIdentifier: item that is being synchronised.
        live: passed on to ``create_or_update_dataset``.
        sync_record: record that receives the log messages.
    """
    sync_record.add_log("Extracting metadata from Quantify dataset: " + syncIdentifier.dataIdentifier)
    directory_name = syncIdentifier.dataIdentifier.split('/')[1]
    tuid = directory_name[:26]
    name = directory_name[27:]
    # The timestamp part "YYYYmmdd-HHMMSS-sss" is 19 characters long; taking only 18
    # would drop the last millisecond digit.
    created = datetime.strptime(tuid[:19], "%Y%m%d-%H%M%S-%f")

    # get variable names in the dataset, this is handy for searching!
    keywords = set()

    # loop through all datasets in the dataset folder (not recursive) and get the keywords
    dataset_dir = os.path.join(configData.quantify_directory, syncIdentifier.dataIdentifier)
    for file in os.listdir(dataset_dir):
        if not file.endswith((".hdf5", ".h5")):
            continue

        try:
            with xarray.load_dataset(os.path.join(dataset_dir, file), engine='h5netcdf') as xr_ds:
                for key in [*xr_ds.keys(), *xr_ds.coords]:
                    attrs = xr_ds[key].attrs
                    if 'long_name' in attrs:
                        keywords.add(attrs['long_name'])
                    elif 'name' in attrs:
                        keywords.add(attrs['name'])
        except Exception as e:
            # Keywords are best effort; record the problem in the sync record instead of stdout.
            sync_record.add_log(f"Error loading dataset: {e}")

    sync_record.add_log("Creating dataset info")
    ds_info = dataset_info(name = name, datasetUUID = syncIdentifier.datasetUUID,
                           alt_uid = tuid, scopeUUID = syncIdentifier.scopeUUID,
                           created = created, keywords = list(keywords),
                           attributes = {"set-up" : configData.set_up})
    await sync_utilities.create_or_update_dataset(live, syncIdentifier, ds_info, sync_record)
@@ -0,0 +1,37 @@
1
+ import xarray as xr
2
+ import numpy as np
3
+
4
def to_gridded_dataset(dataset: xr.Dataset, dimension: str = "dim_0") -> xr.Dataset:
    '''
    Converts a quantify dataset to a gridded dataset.

    All variables whose name starts with "x" are used as coordinates. With a single
    coordinate the given dimension is swapped for it; otherwise the dimension is
    indexed by the coordinates and unstacked.

    Raises:
        ValueError: if ``dimension`` is not a dimension of the dataset, or if the
            ``grid_2d`` attribute is a boolean False.
    '''
    if dimension not in dataset.dims:
        raise ValueError(f"Dimension {dimension} not in dims {dataset.dims}.")

    # In some cases the type does not seem to be a python type, so checking for numpy type as well.
    grid_flag = dataset.attrs.get("grid_2d")
    if isinstance(grid_flag, (bool, np.bool_)) and not grid_flag:
        raise ValueError("Dataset is not gridded, this function cannot be applied.")

    coords_names = sorted((v for v in dataset.variables if v.startswith("x")), reverse=True)
    # legacy datasets saved this in vars ...
    dataset = dataset.set_coords(coords_names)

    if len(coords_names) != 1:
        dataset = dataset.set_index({dimension: coords_names})
        dataset = dataset.unstack(dim=dimension)
    else:
        # No unstacking needed just swap the dimension
        only_coord = coords_names[0]
        for var in dataset.data_vars:
            if dimension in dataset[var].dims:
                swapped = dataset[var].swap_dims({dimension: only_coord})
                dataset = dataset.update({var: swapped})

    # per quantify convention.
    if "grid_2d" in dataset.attrs:
        dataset.attrs["grid_2d"] = False

    return dataset
@@ -0,0 +1,158 @@
1
+ Metadata-Version: 2.4
2
+ Name: etiket_sync_agent_quantify
3
+ Version: 0.3.0b1
4
+ Summary: Quantify backend for eTiKeT sync agent
5
+ Author: QHarbor team
6
+ License-Expression: LicenseRef-Proprietary
7
+ Project-URL: Homepage, https://qharbor.nl
8
+ Project-URL: Documentation, https://docs.qharbor.nl
9
+ Keywords: etiket,sync,backend,quantify,quantum
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Operating System :: POSIX :: Linux
14
+ Classifier: Operating System :: MacOS :: MacOS X
15
+ Classifier: Operating System :: Microsoft :: Windows
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Programming Language :: Python :: 3.13
21
+ Classifier: Programming Language :: Python :: 3.14
22
+ Classifier: Topic :: Scientific/Engineering
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENCE
27
+ Requires-Dist: etiket_sync_agent>=0.3.0b1
28
+ Requires-Dist: xarray
29
+ Provides-Extra: test
30
+ Requires-Dist: quantify-core>=0.7; extra == "test"
31
+ Dynamic: license-file
32
+
33
+ # eTiKeT Sync Agent - Quantify Backend
34
+
35
+ Backend for synchronizing Quantify datasets with the eTiKeT platform. This backend scans Quantify data directories and syncs HDF5 datasets to the cloud.
36
+
37
+ ## Installation
38
+
39
+ ```bash
40
+ pip install etiket_sync_agent_quantify
41
+ ```
42
+
43
+ The package is automatically discovered by `etiket_sync_agent` through the entry-point system.
44
+
45
+ ## What Gets Synchronized
46
+
47
+ When a Quantify dataset is synced, the following data is extracted and uploaded:
48
+
49
+ | Quantify Data | eTiKeT Field | Description |
50
+ |---------------|--------------|-------------|
51
+ | TUID (first 26 chars) | `alt_uid` | Unique identifier extracted from the folder name |
52
+ | Folder name (after TUID) | `name` | Name of the measurement |
53
+ | TUID timestamp | `collected` | Measurement timestamp parsed from TUID format |
54
+ | Variable `long_name`/`name` attrs | `tags` | Extracted from HDF5 dataset attributes |
55
+ | Config `set_up` | `attributes.set-up` | Experimental setup from configuration |
56
+ | HDF5 files | Data files | xarray datasets converted and uploaded |
57
+ | JSON/text files | Auxiliary files | Additional files in the dataset folder |
58
+
59
+ ### Data Processing
60
+
61
+ - **Grid conversion**: Datasets are automatically converted to gridded format for efficient visualization
62
+ - **Standard deviation detection**: Variables ending with `_u` are marked as uncertainties for their base variable
63
+ - **TUID extraction**: The unique identifier follows the format `YYYYMMDD-HHMMSS-fff-xxxxxx`
64
+
65
+ ---
66
+
67
+ ## Configuration
68
+
69
+ The Quantify backend requires a `QuantifyConfigData` configuration with the following fields:
70
+
71
+ | Field | Type | Required | Description |
72
+ |-------|------|----------|-------------|
73
+ | `quantify_directory` | `Path` or `str` | Yes | Path to the Quantify data directory |
74
+ | `set_up` | `str` | Yes | Name of the experimental setup (added as `set-up` attribute) |
75
+ | `is_server_folder` | `bool` | No | Whether this is a server folder, e.g. on a university network drive (default: `False`). |
76
+
77
+ ### Example Configuration
78
+
79
+ Example using the `etiket-sdk` package:
80
+
81
+ ```python
82
+ from etiket_sdk.sync import SyncSources
83
+
84
+ SyncSources.create(
85
+ name="my_quantify_source",
86
+ backend_identifier="etiket_sync_agent_quantify",
87
+ config_data={
88
+ "quantify_directory": "~/quantify-data",
89
+ "set_up": "dilution_fridge_1",
90
+ "is_server_folder": False
91
+ },
92
+ default_scope="xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
93
+ )
94
+ ```
95
+
96
+ ## Live Sync
97
+
98
+ The Quantify backend supports **real-time synchronization** of running measurements.
99
+ We currently do not support non-gridded datasets.
100
+
101
+ ### How Live Detection Works
102
+
103
+ A dataset is considered "live" (still being written to) if:
104
+
105
+ 1. **Recent modification**: The HDF5 file was modified within the last **2 minutes**
106
+ 2. **Contains NaN values**: Quantify pre-allocates arrays with NaN values that get filled during measurement
107
+ 3. **No newer datasets**: No other dataset directories have been created (indicating the measurement is still active)
108
+
109
+ ### How Live Sync Works
110
+
111
+ The `XArrayReplicator` class performs real-time replication:
112
+
113
+ 1. **Creates a qdrive dataset** with SWMR (Single-Writer Multiple-Reader) HDF5 mode
114
+ 2. **Monitors file modifications** by checking the source file's modification time
115
+ 3. **Tracks data using NaN cursor**: Follows the position of NaN values to detect new data
116
+ 4. **Polls for updates** every **0.5 seconds**.
117
+ 5. **Completes when**: All NaN values are replaced OR the 2-minute timeout is reached OR a newer dataset directory is created
118
+
119
+ ### Timeout Behavior
120
+
121
+ | Condition | Timeout | Action |
122
+ |-----------|---------|--------|
123
+ | No file modification | 120 seconds (2 min) | Dataset marked as complete |
124
+ | Newer directory created | Immediate | Live sync stops, dataset marked complete |
125
+ | All NaN values filled | Immediate | Dataset marked complete |
126
+
127
+ ### File Snapshot Safety
128
+
129
+ To avoid conflicts with concurrent file writes, the backend creates **temporary snapshots** of HDF5 files before reading:
130
+
131
+ ```python
132
+ with with_dataset_snapshot(file_path) as safe_file:
133
+ dataset = xr.open_dataset(safe_file, engine='h5netcdf')
134
+ # Process dataset safely
135
+ ```
136
+
137
+ This is necessary because Quantify continuously overwrites the HDF5 file during measurement. Creating a temporary copy avoids file corruption or read errors when the source file is being written to.
138
+
139
+ ---
140
+
141
+ ## Features
142
+
143
+ - **Direct Quantify integration**: Reads HDF5 datasets written by Quantify's data management system
144
+ - **Live sync support**: Real-time monitoring of running measurements via SWMR HDF5
145
+ - **Automatic TUID parsing**: Extracts timestamps and identifiers from Quantify's TUID format
146
+ - **Grid conversion**: Converts flat (stacked) Quantify datasets to gridded format for efficient visualization
147
+ - **Auxiliary file upload**: Syncs JSON, text, and other files alongside the main data
148
+
149
+ ## Requirements
150
+
151
+ - Python >= 3.10
152
+ - xarray
153
+ - h5py
154
+ - h5netcdf
155
+
156
+ ## License
157
+
158
+ Copyright © 2025 QHarbor. All Rights Reserved. See [LICENCE](LICENCE) for details.
@@ -0,0 +1,11 @@
1
+ etiket_sync_agent_quantify/__init__.py,sha256=oG9nX60PYq6eftC2ZfkE3HDzvFO9ZlJbSFxsfsLairU,174
2
+ etiket_sync_agent_quantify/live_sync.py,sha256=AYDVsI9o3urVbI-QwwxCR62zP6wsXV-fUC6JC2cN90o,16344
3
+ etiket_sync_agent_quantify/quantify_config_class.py,sha256=UyBUSXq0AZwzxaVPoI3yDtcMIBh7RaNYygTfNrtBCkU,4018
4
+ etiket_sync_agent_quantify/quantify_sync_class.py,sha256=sCXXJuMXFMUjmdxViZOaP9KQub36X5TTexscox0BLSI,11071
5
+ etiket_sync_agent_quantify/utility.py,sha256=oE6JWLAecg0uPfGV9bs08yz1lt90oKmIUOfe9wtkH6I,1584
6
+ etiket_sync_agent_quantify-0.3.0b1.dist-info/licenses/LICENCE,sha256=tdZwE43Th9efUgN8-4UpMUyh0kYQ4Dk678agSuhpnc0,2357
7
+ etiket_sync_agent_quantify-0.3.0b1.dist-info/METADATA,sha256=EB4Kot_woslLewePV9sv_NPTxWDoPASK0_RMMSe9arI,6202
8
+ etiket_sync_agent_quantify-0.3.0b1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
9
+ etiket_sync_agent_quantify-0.3.0b1.dist-info/entry_points.txt,sha256=NVgaHloUE0GgtwSkWJGgQ4GYl6i1bT2l6qQ79OEwwzY,91
10
+ etiket_sync_agent_quantify-0.3.0b1.dist-info/top_level.txt,sha256=fKH3mxReAXUBerdivtAr9yJ4_JFc2oP_QxCX4xeEBEc,27
11
+ etiket_sync_agent_quantify-0.3.0b1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [etiket_sync_agent.backends]
2
+ quantify_sync_agent = etiket_sync_agent_quantify:QuantifySync
@@ -0,0 +1,34 @@
1
+ All Rights Reserved License
2
+ Copyright ©️ 2024-2026 QHarbor B.V. All Rights Reserved.
3
+
4
+ Agreement to Terms
5
+ By accessing, downloading, installing, or viewing this Software (including its source code, binaries, or application files), you acknowledge and agree to the terms outlined below.
6
+
7
+ Terms and Conditions
8
+ This software and its source code (the "Software") are the exclusive property of QHarbor B.V. and are protected by copyright and other intellectual property laws.
9
+
10
+ License and Testing Exemptions
11
+ Commercial License: If you have entered into a separate commercial license agreement with QHarbor B.V., the terms of that agreement shall supersede the restrictions listed below.
12
+ Testing Permission: If you have obtained written permission for testing/evaluation from QHarbor B.V., you are permitted to install and use the Software for evaluation purposes. However, this permission strictly excludes the right to modify, alter, create derivative works, or reverse engineer the Software.
13
+
14
+ Prohibited Actions
15
+ Unless explicitly authorized by the exemptions above, you are NOT permitted to:
16
+ • Copy, reproduce, or duplicate the Software in any form (except as reasonably necessary for viewing or authorized installation)
17
+ • Modify, alter, or create derivative works based on the Software
18
+ • Distribute, publish, or share the Software with others
19
+ • Reverse engineer, decompile, or disassemble the Software
20
+ • Use the Software for any commercial or non-commercial purposes
21
+ • Transfer, sell, lease, or sublicense the Software
22
+ • Remove or alter any copyright notices or proprietary markings
23
+
24
+ Viewing Only
25
+ For those without a commercial license or written testing permission, the Software is made available for viewing and reference purposes only. Any access to view the source code or application does not grant any rights to use, copy, or modify the Software.
26
+
27
+ No Implied Rights
28
+ No rights are granted by implication, estoppel, or otherwise. All rights not expressly granted are reserved by the copyright holder.
29
+
30
+ Governing Law
31
+ This license and any disputes arising from it shall be governed by the laws of the Netherlands.
32
+
33
+ Disclaimer
34
+ This Software is provided "AS IS" without warranty of any kind. The copyright holder disclaims all warranties and shall not be liable for any damages arising from the use or inability to use this Software.
@@ -0,0 +1 @@
1
+ etiket_sync_agent_quantify