ONE-api 3.0b1-py3-none-any.whl → 3.0b4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {ONE_api-3.0b1.dist-info → ONE_api-3.0b4.dist-info}/LICENSE +21 -21
  2. {ONE_api-3.0b1.dist-info → ONE_api-3.0b4.dist-info}/METADATA +115 -115
  3. ONE_api-3.0b4.dist-info/RECORD +37 -0
  4. one/__init__.py +2 -2
  5. one/alf/__init__.py +1 -1
  6. one/alf/cache.py +640 -653
  7. one/alf/exceptions.py +105 -105
  8. one/alf/io.py +876 -876
  9. one/alf/path.py +1450 -1450
  10. one/alf/spec.py +519 -504
  11. one/api.py +2949 -2973
  12. one/converters.py +850 -850
  13. one/params.py +414 -414
  14. one/registration.py +845 -845
  15. one/remote/__init__.py +1 -1
  16. one/remote/aws.py +313 -313
  17. one/remote/base.py +142 -142
  18. one/remote/globus.py +1254 -1254
  19. one/tests/fixtures/params/.caches +6 -6
  20. one/tests/fixtures/params/.test.alyx.internationalbrainlab.org +8 -8
  21. one/tests/fixtures/rest_responses/1f187d80fd59677b395fcdb18e68e4401bfa1cc9 +1 -1
  22. one/tests/fixtures/rest_responses/47893cf67c985e6361cdee009334963f49fb0746 +1 -1
  23. one/tests/fixtures/rest_responses/535d0e9a1e2c1efbdeba0d673b131e00361a2edb +1 -1
  24. one/tests/fixtures/rest_responses/6dc96f7e9bcc6ac2e7581489b9580a6cd3f28293 +1 -1
  25. one/tests/fixtures/rest_responses/db1731fb8df0208944ae85f76718430813a8bf50 +1 -1
  26. one/tests/fixtures/rest_responses/dcce48259bb929661f60a02a48563f70aa6185b3 +1 -1
  27. one/tests/fixtures/rest_responses/f530d6022f61cdc9e38cc66beb3cb71f3003c9a1 +1 -1
  28. one/tests/fixtures/test_dbs.json +14 -14
  29. one/util.py +524 -524
  30. one/webclient.py +1366 -1354
  31. ONE_api-3.0b1.dist-info/RECORD +0 -37
  32. {ONE_api-3.0b1.dist-info → ONE_api-3.0b4.dist-info}/WHEEL +0 -0
  33. {ONE_api-3.0b1.dist-info → ONE_api-3.0b4.dist-info}/top_level.txt +0 -0
one/alf/cache.py CHANGED
@@ -1,653 +1,640 @@
1
- """Construct Parquet database from local file system.
2
-
3
- NB: If using a remote Alyx instance it is advisable to generate the cache via the Alyx one_cache
4
- management command, otherwise the resulting cache UUIDs will not match those on the database.
5
-
6
- Examples
7
- --------
8
- >>> from one.api import One
9
- >>> cache_dir = 'path/to/data'
10
- >>> make_parquet_db(cache_dir)
11
- >>> one = One(cache_dir=cache_dir)
12
-
13
- """
14
-
15
- # -------------------------------------------------------------------------------------------------
16
- # Imports
17
- # -------------------------------------------------------------------------------------------------
18
-
19
- import datetime
20
- import uuid
21
- from functools import partial
22
- from pathlib import Path
23
- import warnings
24
- import logging
25
-
26
- import pandas as pd
27
- import numpy as np
28
- from packaging import version
29
- from iblutil.util import Bunch
30
- from iblutil.io import parquet
31
- from iblutil.io.hashfile import md5
32
-
33
- from one.alf.spec import QC, is_uuid_string
34
- from one.alf.io import iter_sessions
35
- from one.alf.path import session_path_parts, get_alf_path
36
-
37
- __all__ = [
38
- 'make_parquet_db', 'patch_tables', 'merge_tables', 'QC_TYPE', 'remove_table_files',
39
- 'remove_missing_datasets', 'load_tables', 'EMPTY_DATASETS_FRAME', 'EMPTY_SESSIONS_FRAME']
40
- _logger = logging.getLogger(__name__)
41
-
42
- # -------------------------------------------------------------------------------------------------
43
- # Global variables
44
- # -------------------------------------------------------------------------------------------------
45
-
46
- QC_TYPE = pd.CategoricalDtype(categories=[e.name for e in sorted(QC)], ordered=True)
47
- """pandas.api.types.CategoricalDtype: The cache table QC column data type."""
48
-
49
- SESSIONS_COLUMNS = {
50
- 'id': object, # str
51
- 'lab': object, # str
52
- 'subject': object, # str
53
- 'date': object, # datetime.date
54
- 'number': np.uint16, # int
55
- 'task_protocol': object, # str
56
- 'projects': object # str
57
- }
58
- """dict: A map of sessions table fields and their data types."""
59
-
60
- DATASETS_COLUMNS = {
61
- 'eid': object, # str
62
- 'id': object, # str
63
- 'rel_path': object, # relative to the session path, includes the filename
64
- 'file_size': 'UInt64', # file size in bytes (nullable)
65
- 'hash': object, # sha1/md5, computed in load function
66
- 'exists': bool, # bool
67
- 'qc': QC_TYPE # one.alf.spec.QC enumeration
68
- }
69
- """dict: A map of datasets table fields and their data types."""
70
-
71
- EMPTY_DATASETS_FRAME = (pd.DataFrame(columns=DATASETS_COLUMNS)
72
- .astype(DATASETS_COLUMNS)
73
- .set_index(['eid', 'id']))
74
- """pandas.DataFrame: An empty datasets dataframe with correct columns and dtypes."""
75
-
76
- EMPTY_SESSIONS_FRAME = (pd.DataFrame(columns=SESSIONS_COLUMNS)
77
- .astype(SESSIONS_COLUMNS)
78
- .set_index('id'))
79
- """pandas.DataFrame: An empty sessions dataframe with correct columns and dtypes."""
80
-
81
-
82
- # -------------------------------------------------------------------------------------------------
83
- # Parsing util functions
84
- # -------------------------------------------------------------------------------------------------
85
-
86
- def _ses_str_id(session_path):
87
- """Returns a str id from a session path in the form '(lab/)subject/date/number'."""
88
- return Path(*filter(None, session_path_parts(session_path, assert_valid=True))).as_posix()
89
-
90
-
91
- def _get_session_info(rel_ses_path):
92
- """Parse a relative session path.
93
-
94
- Parameters
95
- ----------
96
- rel_ses_path : str, pathlib.Path
97
- A session path relative to the cache root, e.g. 'lab/Subjects/subject/date/number'.
98
-
99
- Returns
100
- -------
101
- str
102
- Experiment ID expressed as a relative session posix path.
103
- str
104
- The lab name (empty str).
105
- datetime.date
106
- The session date.
107
- int
108
- The session number.
109
- str
110
- The task protocol (empty str).
111
- str
112
- The associated project (empty str).
113
-
114
- """
115
- lab, subject, s_date, num = session_path_parts(rel_ses_path, as_dict=False, assert_valid=True)
116
- eid = _ses_str_id(rel_ses_path)
117
- s_date = pd.to_datetime(s_date).date()
118
- return eid, lab or '', subject, s_date, int(num), '', ''
119
-
120
-
121
- def _get_dataset_info(dset_path, ses_eid=None, compute_hash=False):
122
- """Create dataset record from local path.
123
-
124
- Parameters
125
- ----------
126
- dset_path : one.alf.ALFPath
127
- A full ALF path.
128
- ses_eid : str, UUID, optional
129
- A session uuid.
130
- compute_hash : bool, optional
131
- Whether to compute a file hash.
132
-
133
- Returns
134
- -------
135
- str, uuid.UUID
136
- The session uuid.
137
- str
138
- The dataset ID expressed as a posix path relative to the session.
139
- str
140
- The dataset posix path, relative to the session.
141
- int
142
- The dataset file size.
143
- str
144
- The file hash, or empty str if `compute_hash` is false.
145
- True
146
- Whether the file exists.
147
- str
148
- The QC value for the dataset ('NOT_SET').
149
-
150
- """
151
- rel_dset_path = get_alf_path(dset_path.relative_to_session())
152
- ses_eid = ses_eid or _ses_str_id(dset_path.session_path())
153
- file_size = dset_path.stat().st_size
154
- file_hash = md5(dset_path) if compute_hash else ''
155
- return ses_eid or pd.NA, rel_dset_path, rel_dset_path, file_size, file_hash, True, 'NOT_SET'
156
-
157
-
158
- def _rel_path_to_uuid(df, id_key='rel_path', base_id=None, keep_old=False):
159
- base_id = base_id or uuid.uuid1() # Base hash based on system by default
160
- toUUID = partial(uuid.uuid3, base_id) # MD5 hash from base uuid and rel session path string
161
- if keep_old:
162
- df[f'{id_key}_'] = df[id_key].copy()
163
- df.loc[:, id_key] = df.groupby(id_key)[id_key].transform(lambda x: toUUID(x.name))
164
- return df
165
-
166
-
167
- def _ids_to_uuid(df_ses, df_dsets):
168
- ns = uuid.uuid1()
169
- df_dsets = _rel_path_to_uuid(df_dsets, id_key='id', base_id=ns)
170
- df_ses = _rel_path_to_uuid(df_ses, id_key='id', base_id=ns, keep_old=True)
171
- # Copy new eids into datasets frame
172
- df_dsets['eid_'] = df_dsets['eid'].copy()
173
- df_dsets['eid'] = (df_ses
174
- .set_index('id_')
175
- .loc[df_dsets['eid'], 'id']
176
- .values)
177
- # Check that the session int IDs in both frames match
178
- ses_id_set = df_ses.set_index('id_')['id']
179
- assert (df_dsets
180
- .set_index('eid_')['eid']
181
- .drop_duplicates()
182
- .equals(ses_id_set)), 'session int ID mismatch between frames'
183
-
184
- # Set index
185
- df_ses = df_ses.set_index('id').drop('id_', axis=1).sort_index()
186
- df_dsets = df_dsets.set_index(['eid', 'id']).drop('eid_', axis=1).sort_index()
187
-
188
- return df_ses, df_dsets
189
-
190
-
191
- # -------------------------------------------------------------------------------------------------
192
- # Main functions
193
- # -------------------------------------------------------------------------------------------------
194
-
195
- def _metadata(origin):
196
- """Metadata dictionary for Parquet files.
197
-
198
- Parameters
199
- ----------
200
- origin : str, pathlib.Path
201
- Path to full directory, or computer name / db name.
202
-
203
- """
204
- return {
205
- 'date_created': datetime.datetime.now().isoformat(sep=' ', timespec='minutes'),
206
- 'origin': str(origin),
207
- }
208
-
209
-
210
- def _make_sessions_df(root_dir) -> pd.DataFrame:
211
- """Given a root directory, recursively finds all sessions and returns a sessions DataFrame.
212
-
213
- Parameters
214
- ----------
215
- root_dir : str, pathlib.Path
216
- The folder to look for sessions.
217
-
218
- Returns
219
- -------
220
- pandas.DataFrame
221
- A pandas DataFrame of session info.
222
-
223
- """
224
- rows = []
225
- for full_path in iter_sessions(root_dir):
226
- # Get the lab/Subjects/subject/date/number part of a file path
227
- rel_path = get_alf_path(full_path)
228
- # A dict of session info extracted from path
229
- ses_info = _get_session_info(rel_path)
230
- assert len(ses_info) == len(SESSIONS_COLUMNS)
231
- rows.append(ses_info)
232
- df = pd.DataFrame(rows, columns=SESSIONS_COLUMNS).astype(SESSIONS_COLUMNS)
233
- return df
234
-
235
-
236
- def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame:
237
- """Given a root directory, recursively finds all datasets and returns a datasets DataFrame.
238
-
239
- Parameters
240
- ----------
241
- root_dir : str, pathlib.Path
242
- The folder to look for sessions.
243
- hash_files : bool
244
- If True, an MD5 is computed for each file and stored in the 'hash' column.
245
-
246
- Returns
247
- -------
248
- pandas.DataFrame
249
- A pandas DataFrame of dataset info.
250
-
251
- """
252
- # Go through sessions and append datasets
253
- rows = []
254
- for session_path in iter_sessions(root_dir):
255
- for dset_path in session_path.iter_datasets(recursive=True):
256
- file_info = _get_dataset_info(dset_path, compute_hash=hash_files)
257
- assert len(file_info) == len(DATASETS_COLUMNS)
258
- rows.append(file_info)
259
- return pd.DataFrame(rows, columns=DATASETS_COLUMNS).astype(DATASETS_COLUMNS)
260
-
261
-
262
- def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab=None):
263
- """Given a data directory, index the ALF datasets and save the generated cache tables.
264
-
265
- Parameters
266
- ----------
267
- root_dir : str, pathlib.Path
268
- The file directory to index.
269
- out_dir : str, pathlib.Path
270
- Optional output directory to save cache tables. If None, the files are saved into the
271
- root directory.
272
- hash_ids : bool
273
- If True, experiment and dataset IDs will be UUIDs generated from the system and relative
274
- paths (required for use with ONE API).
275
- hash_files : bool
276
- If True, an MD5 hash is computed for each dataset and stored in the datasets table.
277
- This will substantially increase cache generation time.
278
- lab : str
279
- An optional lab name to associate with the data. If the folder structure
280
- contains 'lab/Subjects', the lab name will be taken from the folder name.
281
-
282
- Returns
283
- -------
284
- pathlib.Path
285
- The full path of the saved sessions parquet table.
286
- pathlib.Path
287
- The full path of the saved datasets parquet table.
288
-
289
- """
290
- root_dir = Path(root_dir).resolve()
291
-
292
- # Make the data frames.
293
- df_ses = _make_sessions_df(root_dir)
294
- df_dsets = _make_datasets_df(root_dir, hash_files=hash_files)
295
-
296
- # Add integer id columns
297
- if hash_ids and len(df_ses) > 0:
298
- df_ses, df_dsets = _ids_to_uuid(df_ses, df_dsets)
299
- # For parquet all indices must be str
300
- df_ses.index = df_ses.index.map(str)
301
- df_dsets.index = df_dsets.index.map(lambda x: tuple(map(str, x)))
302
-
303
- if lab: # Fill in lab name field
304
- assert not df_ses['lab'].any() or (df_ses['lab'] == 'lab').all(), 'lab name conflict'
305
- df_ses['lab'] = lab
306
-
307
- # Check any files were found
308
- if df_ses.empty or df_dsets.empty:
309
- warnings.warn(f'No {"sessions" if df_ses.empty else "datasets"} found', RuntimeWarning)
310
-
311
- # Output directory.
312
- out_dir = Path(out_dir or root_dir)
313
- assert out_dir.is_dir()
314
- assert out_dir.exists()
315
-
316
- # Parquet files to save.
317
- fn_ses = out_dir / 'sessions.pqt'
318
- fn_dsets = out_dir / 'datasets.pqt'
319
-
320
- # Parquet metadata.
321
- metadata = _metadata(root_dir)
322
-
323
- # Save the Parquet files.
324
- parquet.save(fn_ses, df_ses, metadata)
325
- parquet.save(fn_dsets, df_dsets, metadata)
326
-
327
- return fn_ses, fn_dsets
328
-
329
-
330
- def cast_index_object(df: pd.DataFrame, dtype: type = uuid.UUID) -> pd.DataFrame:
331
- """Cast the index object to the specified dtype.
332
-
333
- NB: The data frame index will remain as 'object', however the underlying object type will be
334
- modified.
335
-
336
- Parameters
337
- ----------
338
- df : pandas.DataFrame
339
- A data frame with an index to cast.
340
- dtype : type, function
341
- The desired dtype or a mapping function.
342
-
343
- Returns
344
- -------
345
- pandas.DataFrame
346
- An updated data frame with a new index data type.
347
-
348
- """
349
- if isinstance(df.index, pd.MultiIndex):
350
- # df.index = df.index.map(lambda x: tuple(map(UUID, x)))
351
- levels = range(df.index.nlevels)
352
- df.index = pd.MultiIndex.from_arrays(
353
- [df.index.get_level_values(i).map(dtype, na_action='ignore') for i in levels],
354
- names=df.index.names
355
- )
356
- else:
357
- df.index = df.index.map(dtype, na_action='ignore')
358
- return df
359
-
360
-
361
- def load_tables(tables_dir, glob_pattern='*.pqt'):
362
- """Load parquet cache files from a local directory.
363
-
364
- Parameters
365
- ----------
366
- tables_dir : str, pathlib.Path
367
- The directory location of the parquet files.
368
- glob_pattern : str
369
- A glob pattern to match the cache files.
370
-
371
-
372
- Returns
373
- -------
374
- Bunch
375
- A Bunch object containing the loaded cache tables and associated metadata.
376
-
377
- """
378
- meta = {
379
- 'expired': False,
380
- 'created_time': None,
381
- 'loaded_time': None,
382
- 'modified_time': None,
383
- 'saved_time': None,
384
- 'raw': {}
385
- }
386
- caches = Bunch({
387
- 'datasets': EMPTY_DATASETS_FRAME.copy(),
388
- 'sessions': EMPTY_SESSIONS_FRAME.copy(),
389
- '_meta': meta})
390
- INDEX_KEY = '.?id'
391
- for cache_file in Path(tables_dir).glob(glob_pattern):
392
- table = cache_file.stem
393
- # we need to keep this part fast enough for transient objects
394
- cache, meta['raw'][table] = parquet.load(cache_file)
395
- if 'date_created' not in meta['raw'][table]:
396
- _logger.warning(f"{cache_file} does not appear to be a valid table. Skipping")
397
- continue
398
- meta['loaded_time'] = datetime.datetime.now()
399
-
400
- # Set the appropriate index if none already set
401
- if isinstance(cache.index, pd.RangeIndex):
402
- idx_columns = sorted(cache.filter(regex=INDEX_KEY).columns)
403
- if len(idx_columns) == 0:
404
- raise KeyError('Failed to set index')
405
- cache.set_index(idx_columns, inplace=True)
406
-
407
- # Patch older tables
408
- cache = patch_tables(cache, meta['raw'][table].get('min_api_version'), table)
409
-
410
- # Cast indices to UUID
411
- cache = cast_index_object(cache, uuid.UUID)
412
-
413
- # Check sorted
414
- # Sorting makes MultiIndex indexing O(N) -> O(1)
415
- if not cache.index.is_monotonic_increasing:
416
- cache.sort_index(inplace=True)
417
-
418
- caches[table] = cache
419
-
420
- created = [datetime.datetime.fromisoformat(x['date_created'])
421
- for x in meta['raw'].values() if 'date_created' in x]
422
- if created:
423
- meta['created_time'] = min(created)
424
- return caches
425
-
426
-
427
- def merge_tables(cache, strict=False, **kwargs):
428
- """Update the cache tables with new records.
429
-
430
- Parameters
431
- ----------
432
- cache : dict
433
- A map of cache tables to update.
434
- strict : bool
435
- If False (default), the columns need not match exactly: extra columns in input tables are
436
- dropped and missing columns are added and filled with np.nan.
437
- kwargs
438
- pandas.DataFrame or pandas.Series to insert/update for each table.
439
-
440
- Returns
441
- -------
442
- datetime.datetime:
443
- A timestamp of when the cache was updated.
444
-
445
- Example
446
- -------
447
- >>> session, datasets = ses2records(self.get_details(eid, full=True))
448
- ... self._update_cache_from_records(sessions=session, datasets=datasets)
449
-
450
- Raises
451
- ------
452
- AssertionError
453
- When strict is True, the input columns must exactly match those of the cache table,
454
- including the order.
455
- KeyError
456
- One or more of the keyword arguments does not match a table in cache.
457
-
458
- """
459
- updated = None
460
- for table, records in kwargs.items():
461
- if records is None or records.empty:
462
- continue
463
- if table not in cache:
464
- raise KeyError(f'Table "{table}" not in cache')
465
- if isinstance(records, pd.Series):
466
- records = pd.DataFrame([records])
467
- records.index.set_names(cache[table].index.names, inplace=True)
468
- # Drop duplicate indices
469
- records = records[~records.index.duplicated(keep='first')]
470
- if not strict:
471
- # Deal with case where there are extra columns in the cache
472
- extra_columns = list(set(cache[table].columns) - set(records.columns))
473
- # Convert these columns to nullable, if required
474
- cache_columns = cache[table][extra_columns]
475
- cache[table][extra_columns] = cache_columns.convert_dtypes()
476
- column_ids = map(list(cache[table].columns).index, extra_columns)
477
- for col, n in sorted(zip(extra_columns, column_ids), key=lambda x: x[1]):
478
- dtype = cache[table][col].dtype
479
- nan = getattr(dtype, 'na_value', np.nan)
480
- val = records.get('exists', True) if col.startswith('exists_') else nan
481
- records.insert(n, col, val)
482
- # Drop any extra columns in the records that aren't in cache table
483
- to_drop = set(records.columns) - set(cache[table].columns)
484
- records = records.drop(to_drop, axis=1)
485
- records = records.reindex(columns=cache[table].columns)
486
- assert set(cache[table].columns) == set(records.columns)
487
- records = records.astype(cache[table].dtypes)
488
- # Update existing rows
489
- to_update = records.index.isin(cache[table].index)
490
- cache[table].loc[records.index[to_update], :] = records[to_update]
491
- # Assign new rows
492
- to_assign = records[~to_update]
493
- frames = [cache[table], to_assign]
494
- # Concatenate and sort
495
- cache[table] = pd.concat(frames).sort_index()
496
- updated = datetime.datetime.now()
497
- cache['_meta']['modified_time'] = updated
498
- return updated
499
-
500
-
501
- def remove_missing_datasets(cache_dir, tables=None, remove_empty_sessions=True, dry=True):
502
- """Remove dataset files and session folders that are not in the provided cache.
503
-
504
- NB: This *does not* remove entries from the cache tables that are missing on disk.
505
- Non-ALF files are not removed. Empty sessions that exist in the sessions table are not removed.
506
-
507
- Parameters
508
- ----------
509
- cache_dir : str, pathlib.Path
510
- tables : dict[str, pandas.DataFrame], optional
511
- A dict with keys ('sessions', 'datasets'), containing the cache tables as DataFrames.
512
- remove_empty_sessions : bool
513
- Attempt to remove session folders that are empty and not in the sessions table.
514
- dry : bool
515
- If true, do not remove anything.
516
-
517
- Returns
518
- -------
519
- list
520
- A sorted list of paths to be removed.
521
-
522
- """
523
- cache_dir = Path(cache_dir)
524
- if tables is None:
525
- tables = {}
526
- for name in ('datasets', 'sessions'):
527
- table, m = parquet.load(cache_dir / f'{name}.pqt')
528
- tables[name] = patch_tables(table, m.get('min_api_version'), name)
529
-
530
- INDEX_KEY = '.?id'
531
- for name in tables:
532
- # Set the appropriate index if none already set
533
- if isinstance(tables[name].index, pd.RangeIndex):
534
- idx_columns = sorted(tables[name].filter(regex=INDEX_KEY).columns)
535
- tables[name].set_index(idx_columns, inplace=True)
536
-
537
- # Cast indices to UUID
538
- if any(map(is_uuid_string, tables[name].index.get_level_values(0))):
539
- tables[name] = cast_index_object(tables[name], uuid.UUID)
540
-
541
- to_delete = set()
542
- from one.converters import session_record2path # imported here due to circular imports
543
- gen_path = partial(session_record2path, root_dir=cache_dir)
544
- # map of session path to eid
545
- sessions = {gen_path(rec): eid for eid, rec in tables['sessions'].iterrows()}
546
- for session_path in iter_sessions(cache_dir):
547
- try:
548
- datasets = tables['datasets'].loc[sessions[session_path]]
549
- except KeyError:
550
- datasets = tables['datasets'].iloc[0:0, :]
551
- for dataset in session_path.iter_datasets():
552
- if dataset.relative_to_session().as_posix() not in datasets['rel_path']:
553
- to_delete.add(dataset)
554
- if session_path not in sessions and remove_empty_sessions:
555
- to_delete.add(session_path)
556
-
557
- if dry:
558
- print('The following session and datasets would be removed:', end='\n\t')
559
- print('\n\t'.join(sorted(map(str, to_delete))))
560
- return sorted(to_delete)
561
-
562
- # Delete datasets
563
- for path in to_delete:
564
- if path.is_file():
565
- _logger.debug(f'Removing {path}')
566
- path.unlink()
567
- else:
568
- # Recursively remove empty folders
569
- while path.parent != cache_dir and not next(path.rglob('*'), False):
570
- _logger.debug(f'Removing {path}')
571
- path.rmdir()
572
- path = path.parent
573
-
574
- return sorted(to_delete)
575
-
576
-
577
- def remove_table_files(folder, tables=('sessions', 'datasets')):
578
- """Delete cache tables on disk.
579
-
580
- Parameters
581
- ----------
582
- folder : pathlib.Path
583
- The directory path containing cache tables to remove.
584
- tables : list of str
585
- A list of table names to remove, e.g. ['sessions', 'datasets'].
586
- NB: This will also delete the cache_info.json metadata file.
587
-
588
- Returns
589
- -------
590
- list of pathlib.Path
591
- A list of the removed files.
592
-
593
- """
594
- filenames = ('cache_info.json', *(f'{t}.pqt' for t in tables))
595
- removed = []
596
- for file in map(folder.joinpath, filenames):
597
- if file.exists():
598
- file.unlink()
599
- removed.append(file)
600
- else:
601
- _logger.warning('%s not found', file)
602
- return removed
603
-
604
-
605
- def _cache_int2str(table: pd.DataFrame) -> pd.DataFrame:
606
- """Convert int ids to str ids for cache table.
607
-
608
- Parameters
609
- ----------
610
- table : pd.DataFrame
611
- A cache table (from One._cache).
612
-
613
- """
614
- # Convert integer uuids to str uuids
615
- if table.index.nlevels < 2 or not any(x.endswith('_0') for x in table.index.names):
616
- return table
617
- table = table.reset_index()
618
- int_cols = table.filter(regex=r'_\d{1}$').columns.sort_values()
619
- assert not len(int_cols) % 2, 'expected even number of columns ending in _0 or _1'
620
- names = sorted(set(c.rsplit('_', 1)[0] for c in int_cols.values))
621
- for i, name in zip(range(0, len(int_cols), 2), names):
622
- table[name] = parquet.np2str(table[int_cols[i:i + 2]])
623
- table = table.drop(int_cols, axis=1).set_index(names)
624
- return table
625
-
626
-
627
- def patch_tables(table: pd.DataFrame, min_api_version=None, name=None) -> pd.DataFrame:
628
- """Reformat older cache tables to comply with this version of ONE.
629
-
630
- Currently this function will 1. convert integer UUIDs to string UUIDs; 2. rename the 'project'
631
- column to 'projects'; 3. add QC column; 4. drop session_path column.
632
-
633
- Parameters
634
- ----------
635
- table : pd.DataFrame
636
- A cache table (from One._cache).
637
- min_api_version : str
638
- The minimum API version supported by this cache table.
639
- name : {'datasets', 'sessions'}
640
- The name of the table.
641
-
642
- """
643
- min_version = version.parse(min_api_version or '0.0.0')
644
- table = _cache_int2str(table)
645
- # Rename project column
646
- if min_version < version.Version('1.13.0') and 'project' in table.columns:
647
- table.rename(columns={'project': 'projects'}, inplace=True)
648
- if name == 'datasets' and min_version < version.Version('2.7.0') and 'qc' not in table.columns:
649
- qc = pd.Categorical.from_codes(np.zeros(len(table.index), dtype=int), dtype=QC_TYPE)
650
- table = table.assign(qc=qc)
651
- if name == 'datasets' and 'session_path' in table.columns:
652
- table = table.drop('session_path', axis=1)
653
- return table
1
+ """Construct Parquet database from local file system.
2
+
3
+ NB: If using a remote Alyx instance it is advisable to generate the cache via the Alyx one_cache
4
+ management command, otherwise the resulting cache UUIDs will not match those on the database.
5
+
6
+ Examples
7
+ --------
8
+ >>> from one.api import One
9
+ >>> cache_dir = 'path/to/data'
10
+ >>> make_parquet_db(cache_dir)
11
+ >>> one = One(cache_dir=cache_dir)
12
+
13
+ """
14
+
15
+ # -------------------------------------------------------------------------------------------------
16
+ # Imports
17
+ # -------------------------------------------------------------------------------------------------
18
+
19
+ import datetime
20
+ import uuid
21
+ from functools import partial
22
+ from pathlib import Path
23
+ import warnings
24
+ import logging
25
+
26
+ import pandas as pd
27
+ import numpy as np
28
+ from packaging import version
29
+ from iblutil.util import Bunch
30
+ from iblutil.io import parquet
31
+ from iblutil.io.hashfile import md5
32
+
33
+ from one.alf.spec import QC, is_uuid_string
34
+ from one.alf.io import iter_sessions
35
+ from one.alf.path import session_path_parts, get_alf_path
36
+
37
+ __all__ = [
38
+ 'make_parquet_db', 'patch_tables', 'merge_tables', 'QC_TYPE', 'remove_table_files',
39
+ 'remove_missing_datasets', 'load_tables', 'EMPTY_DATASETS_FRAME', 'EMPTY_SESSIONS_FRAME']
40
+ _logger = logging.getLogger(__name__)
41
+
42
+ # -------------------------------------------------------------------------------------------------
43
+ # Global variables
44
+ # -------------------------------------------------------------------------------------------------
45
+
46
+ QC_TYPE = pd.CategoricalDtype(categories=[e.name for e in sorted(QC)], ordered=True)
47
+ """pandas.api.types.CategoricalDtype: The cache table QC column data type."""
48
+
49
+ SESSIONS_COLUMNS = {
50
+ 'id': object, # str
51
+ 'lab': object, # str
52
+ 'subject': object, # str
53
+ 'date': object, # datetime.date
54
+ 'number': np.uint16, # int
55
+ 'task_protocol': object, # str
56
+ 'projects': object # str
57
+ }
58
+ """dict: A map of sessions table fields and their data types."""
59
+
60
+ DATASETS_COLUMNS = {
61
+ 'eid': object, # str
62
+ 'id': object, # str
63
+ 'rel_path': object, # relative to the session path, includes the filename
64
+ 'file_size': 'UInt64', # file size in bytes (nullable)
65
+ 'hash': object, # sha1/md5, computed in load function
66
+ 'exists': bool, # bool
67
+ 'qc': QC_TYPE # one.alf.spec.QC enumeration
68
+ }
69
+ """dict: A map of datasets table fields and their data types."""
70
+
71
+ EMPTY_DATASETS_FRAME = (pd.DataFrame(columns=DATASETS_COLUMNS)
72
+ .astype(DATASETS_COLUMNS)
73
+ .set_index(['eid', 'id']))
74
+ """pandas.DataFrame: An empty datasets dataframe with correct columns and dtypes."""
75
+
76
+ EMPTY_SESSIONS_FRAME = (pd.DataFrame(columns=SESSIONS_COLUMNS)
77
+ .astype(SESSIONS_COLUMNS)
78
+ .set_index('id'))
79
+ """pandas.DataFrame: An empty sessions dataframe with correct columns and dtypes."""
80
+
81
+
82
+ # -------------------------------------------------------------------------------------------------
83
+ # Parsing util functions
84
+ # -------------------------------------------------------------------------------------------------
85
+
86
+ def _ses_str_id(session_path):
87
+ """Returns a str id from a session path in the form '(lab/)subject/date/number'."""
88
+ return Path(*filter(None, session_path_parts(session_path, assert_valid=True))).as_posix()
89
+
90
+
91
+ def _get_session_info(rel_ses_path):
92
+ """Parse a relative session path.
93
+
94
+ Parameters
95
+ ----------
96
+ rel_ses_path : str, pathlib.Path
97
+ A session path relative to the cache root, e.g. 'lab/Subjects/subject/date/number'.
98
+
99
+ Returns
100
+ -------
101
+ str
102
+ Experiment ID expressed as a relative session posix path.
103
+ str
104
+ The lab name (empty str).
105
+ datetime.date
106
+ The session date.
107
+ int
108
+ The session number.
109
+ str
110
+ The task protocol (empty str).
111
+ str
112
+ The associated project (empty str).
113
+
114
+ """
115
+ lab, subject, s_date, num = session_path_parts(rel_ses_path, as_dict=False, assert_valid=True)
116
+ eid = _ses_str_id(rel_ses_path)
117
+ s_date = pd.to_datetime(s_date).date()
118
+ return eid, lab or '', subject, s_date, int(num), '', ''
119
+
120
+
121
+ def _get_dataset_info(dset_path, ses_eid=None, compute_hash=False):
122
+ """Create dataset record from local path.
123
+
124
+ Parameters
125
+ ----------
126
+ dset_path : one.alf.ALFPath
127
+ A full ALF path.
128
+ ses_eid : str, UUID, optional
129
+ A session uuid.
130
+ compute_hash : bool, optional
131
+ Whether to compute a file hash.
132
+
133
+ Returns
134
+ -------
135
+ str, uuid.UUID
136
+ The session uuid.
137
+ str
138
+ The dataset ID expressed as a posix path relative to the session.
139
+ str
140
+ The dataset posix path, relative to the session.
141
+ int
142
+ The dataset file size.
143
+ str
144
+ The file hash, or empty str if `compute_hash` is false.
145
+ True
146
+ Whether the file exists.
147
+ str
148
+ The QC value for the dataset ('NOT_SET').
149
+
150
+ """
151
+ rel_dset_path = get_alf_path(dset_path.relative_to_session())
152
+ ses_eid = ses_eid or _ses_str_id(dset_path.session_path())
153
+ file_size = dset_path.stat().st_size
154
+ file_hash = md5(dset_path) if compute_hash else ''
155
+ return ses_eid or pd.NA, rel_dset_path, rel_dset_path, file_size, file_hash, True, 'NOT_SET'
156
+
157
+
158
+ def _rel_path_to_uuid(df, id_key='rel_path', base_id=None, keep_old=False):
159
+ base_id = base_id or uuid.uuid1() # Base hash based on system by default
160
+ toUUID = partial(uuid.uuid3, base_id) # MD5 hash from base uuid and rel session path string
161
+ if keep_old:
162
+ df[f'{id_key}_'] = df[id_key].copy()
163
+ df.loc[:, id_key] = df.groupby(id_key)[id_key].transform(lambda x: toUUID(x.name))
164
+ return df
165
+
166
+
167
+ def _ids_to_uuid(df_ses, df_dsets):
168
+ ns = uuid.uuid1()
169
+ df_dsets = _rel_path_to_uuid(df_dsets, id_key='id', base_id=ns)
170
+ df_ses = _rel_path_to_uuid(df_ses, id_key='id', base_id=ns, keep_old=True)
171
+ # Copy new eids into datasets frame
172
+ df_dsets['eid_'] = df_dsets['eid'].copy()
173
+ df_dsets['eid'] = (df_ses
174
+ .set_index('id_')
175
+ .loc[df_dsets['eid'], 'id']
176
+ .values)
177
+ # Check that the session int IDs in both frames match
178
+ ses_id_set = df_ses.set_index('id_')['id']
179
+ assert (df_dsets
180
+ .set_index('eid_')['eid']
181
+ .drop_duplicates()
182
+ .equals(ses_id_set)), 'session int ID mismatch between frames'
183
+
184
+ # Set index
185
+ df_ses = df_ses.set_index('id').drop('id_', axis=1).sort_index()
186
+ df_dsets = df_dsets.set_index(['eid', 'id']).drop('eid_', axis=1).sort_index()
187
+
188
+ return df_ses, df_dsets
189
+
190
+
191
+ # -------------------------------------------------------------------------------------------------
192
+ # Main functions
193
+ # -------------------------------------------------------------------------------------------------
194
+
195
+ def _metadata(origin):
196
+ """Metadata dictionary for Parquet files.
197
+
198
+ Parameters
199
+ ----------
200
+ origin : str, pathlib.Path
201
+ Path to full directory, or computer name / db name.
202
+
203
+ """
204
+ return {
205
+ 'date_created': datetime.datetime.now().isoformat(sep=' ', timespec='minutes'),
206
+ 'origin': str(origin),
207
+ }
208
+
209
+
210
+ def _make_sessions_df(root_dir) -> pd.DataFrame:
211
+ """Given a root directory, recursively finds all sessions and returns a sessions DataFrame.
212
+
213
+ Parameters
214
+ ----------
215
+ root_dir : str, pathlib.Path
216
+ The folder to look for sessions.
217
+
218
+ Returns
219
+ -------
220
+ pandas.DataFrame
221
+ A pandas DataFrame of session info.
222
+
223
+ """
224
+ rows = []
225
+ for full_path in iter_sessions(root_dir):
226
+ # Get the lab/Subjects/subject/date/number part of a file path
227
+ rel_path = get_alf_path(full_path)
228
+ # A dict of session info extracted from path
229
+ ses_info = _get_session_info(rel_path)
230
+ assert len(ses_info) == len(SESSIONS_COLUMNS)
231
+ rows.append(ses_info)
232
+ df = pd.DataFrame(rows, columns=SESSIONS_COLUMNS).astype(SESSIONS_COLUMNS)
233
+ return df
234
+
235
+
236
+ def _make_datasets_df(root_dir, hash_files=False) -> pd.DataFrame:
237
+ """Given a root directory, recursively finds all datasets and returns a datasets DataFrame.
238
+
239
+ Parameters
240
+ ----------
241
+ root_dir : str, pathlib.Path
242
+ The folder to look for sessions.
243
+ hash_files : bool
244
+ If True, an MD5 is computed for each file and stored in the 'hash' column.
245
+
246
+ Returns
247
+ -------
248
+ pandas.DataFrame
249
+ A pandas DataFrame of dataset info.
250
+
251
+ """
252
+ # Go through sessions and append datasets
253
+ rows = []
254
+ for session_path in iter_sessions(root_dir):
255
+ for dset_path in session_path.iter_datasets(recursive=True):
256
+ file_info = _get_dataset_info(dset_path, compute_hash=hash_files)
257
+ assert len(file_info) == len(DATASETS_COLUMNS)
258
+ rows.append(file_info)
259
+ return pd.DataFrame(rows, columns=DATASETS_COLUMNS).astype(DATASETS_COLUMNS)
260
+
261
+
262
+ def make_parquet_db(root_dir, out_dir=None, hash_ids=True, hash_files=False, lab=None):
263
+ """Given a data directory, index the ALF datasets and save the generated cache tables.
264
+
265
+ Parameters
266
+ ----------
267
+ root_dir : str, pathlib.Path
268
+ The file directory to index.
269
+ out_dir : str, pathlib.Path
270
+ Optional output directory to save cache tables. If None, the files are saved into the
271
+ root directory.
272
+ hash_ids : bool
273
+ If True, experiment and dataset IDs will be UUIDs generated from the system and relative
274
+ paths (required for use with ONE API).
275
+ hash_files : bool
276
+ If True, an MD5 hash is computed for each dataset and stored in the datasets table.
277
+ This will substantially increase cache generation time.
278
+ lab : str
279
+ An optional lab name to associate with the data. If the folder structure
280
+ contains 'lab/Subjects', the lab name will be taken from the folder name.
281
+
282
+ Returns
283
+ -------
284
+ pathlib.Path
285
+ The full path of the saved sessions parquet table.
286
+ pathlib.Path
287
+ The full path of the saved datasets parquet table.
288
+
289
+ """
290
+ root_dir = Path(root_dir).resolve()
291
+
292
+ # Make the data frames.
293
+ df_ses = _make_sessions_df(root_dir)
294
+ df_dsets = _make_datasets_df(root_dir, hash_files=hash_files)
295
+
296
+ # Add integer id columns
297
+ if hash_ids and len(df_ses) > 0:
298
+ df_ses, df_dsets = _ids_to_uuid(df_ses, df_dsets)
299
+ # For parquet all indices must be str
300
+ df_ses.index = df_ses.index.map(str)
301
+ df_dsets.index = df_dsets.index.map(lambda x: tuple(map(str, x)))
302
+
303
+ if lab: # Fill in lab name field
304
+ assert not df_ses['lab'].any() or (df_ses['lab'] == 'lab').all(), 'lab name conflict'
305
+ df_ses['lab'] = lab
306
+
307
+ # Check any files were found
308
+ if df_ses.empty or df_dsets.empty:
309
+ warnings.warn(f'No {"sessions" if df_ses.empty else "datasets"} found', RuntimeWarning)
310
+
311
+ # Output directory.
312
+ out_dir = Path(out_dir or root_dir)
313
+ assert out_dir.is_dir()
314
+ assert out_dir.exists()
315
+
316
+ # Parquet files to save.
317
+ fn_ses = out_dir / 'sessions.pqt'
318
+ fn_dsets = out_dir / 'datasets.pqt'
319
+
320
+ # Parquet metadata.
321
+ metadata = _metadata(root_dir)
322
+
323
+ # Save the Parquet files.
324
+ parquet.save(fn_ses, df_ses, metadata)
325
+ parquet.save(fn_dsets, df_dsets, metadata)
326
+
327
+ return fn_ses, fn_dsets
328
+
329
+
330
+ def cast_index_object(df: pd.DataFrame, dtype: type = uuid.UUID) -> pd.DataFrame:
331
+ """Cast the index object to the specified dtype.
332
+
333
+ NB: The data frame index will remain as 'object', however the underlying object type will be
334
+ modified.
335
+
336
+ Parameters
337
+ ----------
338
+ df : pandas.DataFrame
339
+ A data frame with an index to cast.
340
+ dtype : type, function
341
+ The desired dtype or a mapping function.
342
+
343
+ Returns
344
+ -------
345
+ pandas.DataFrame
346
+ An updated data frame with a new index data type.
347
+
348
+ """
349
+ if isinstance(df.index, pd.MultiIndex):
350
+ # df.index = df.index.map(lambda x: tuple(map(UUID, x)))
351
+ levels = range(df.index.nlevels)
352
+ df.index = pd.MultiIndex.from_arrays(
353
+ [df.index.get_level_values(i).map(dtype, na_action='ignore') for i in levels],
354
+ names=df.index.names
355
+ )
356
+ else:
357
+ df.index = df.index.map(dtype, na_action='ignore')
358
+ return df
359
+
360
+
361
+ def load_tables(tables_dir, glob_pattern='*.pqt'):
362
+ """Load parquet cache files from a local directory.
363
+
364
+ Parameters
365
+ ----------
366
+ tables_dir : str, pathlib.Path
367
+ The directory location of the parquet files.
368
+ glob_pattern : str
369
+ A glob pattern to match the cache files.
370
+
371
+
372
+ Returns
373
+ -------
374
+ Bunch
375
+ A Bunch object containing the loaded cache tables and associated metadata.
376
+
377
+ """
378
+ meta = {
379
+ 'created_time': None,
380
+ 'loaded_time': None,
381
+ 'modified_time': None,
382
+ 'saved_time': None,
383
+ 'raw': {}
384
+ }
385
+ caches = Bunch({
386
+ 'datasets': EMPTY_DATASETS_FRAME.copy(),
387
+ 'sessions': EMPTY_SESSIONS_FRAME.copy(),
388
+ '_meta': meta})
389
+ INDEX_KEY = '.?id'
390
+ for cache_file in Path(tables_dir).glob(glob_pattern):
391
+ table = cache_file.stem
392
+ # we need to keep this part fast enough for transient objects
393
+ cache, meta['raw'][table] = parquet.load(cache_file)
394
+ if 'date_created' not in meta['raw'][table]:
395
+ _logger.warning(f"{cache_file} does not appear to be a valid table. Skipping")
396
+ continue
397
+ meta['loaded_time'] = datetime.datetime.now()
398
+
399
+ # Set the appropriate index if none already set
400
+ if isinstance(cache.index, pd.RangeIndex):
401
+ idx_columns = sorted(cache.filter(regex=INDEX_KEY).columns)
402
+ if len(idx_columns) == 0:
403
+ raise KeyError('Failed to set index')
404
+ cache.set_index(idx_columns, inplace=True)
405
+
406
+ # Patch older tables
407
+ cache = patch_tables(cache, meta['raw'][table].get('min_api_version'), table)
408
+
409
+ # Cast indices to UUID
410
+ # NB: Old caches may have pathstr indices
411
+ if any(map(is_uuid_string, cache.index.get_level_values(0))):
412
+ cache = cast_index_object(cache, uuid.UUID)
413
+
414
+ # Check sorted
415
+ # Sorting makes MultiIndex indexing O(N) -> O(1)
416
+ if not cache.index.is_monotonic_increasing:
417
+ cache.sort_index(inplace=True)
418
+
419
+ caches[table] = cache
420
+
421
+ created = [datetime.datetime.fromisoformat(x['date_created'])
422
+ for x in meta['raw'].values() if 'date_created' in x]
423
+ if created:
424
+ meta['created_time'] = min(created)
425
+ return caches
426
+
427
+
428
+ def merge_tables(cache, strict=False, **kwargs):
429
+ """Update the cache tables with new records.
430
+
431
+ Parameters
432
+ ----------
433
+ cache : dict
434
+ A map of cache tables to update.
435
+ strict : bool
436
+ If False (default), the columns need not match exactly: extra columns in input tables are
437
+ dropped and missing columns are added and filled with np.nan.
438
+ kwargs
439
+ pandas.DataFrame or pandas.Series to insert/update for each table.
440
+
441
+ Returns
442
+ -------
443
+ datetime.datetime:
444
+ A timestamp of when the cache was updated.
445
+
446
+ Example
447
+ -------
448
+ >>> session, datasets = ses2records(self.get_details(eid, full=True))
449
+ ... self._update_cache_from_records(sessions=session, datasets=datasets)
450
+
451
+ Raises
452
+ ------
453
+ AssertionError
454
+ When strict is True, the input columns must exactly match those of the cache table,
455
+ including the order.
456
+ KeyError
457
+ One or more of the keyword arguments does not match a table in cache.
458
+
459
+ """
460
+ updated = None
461
+ for table, records in kwargs.items():
462
+ if records is None or records.empty:
463
+ continue
464
+ if table not in cache:
465
+ raise KeyError(f'Table "{table}" not in cache')
466
+ if isinstance(records, pd.Series):
467
+ records = pd.DataFrame([records])
468
+ records.index.set_names(cache[table].index.names, inplace=True)
469
+ # Drop duplicate indices
470
+ records = records[~records.index.duplicated(keep='first')]
471
+ if not strict:
472
+ # Deal with case where there are extra columns in the cache
473
+ extra_columns = list(set(cache[table].columns) - set(records.columns))
474
+ # Convert these columns to nullable, if required
475
+ cache_columns = cache[table][extra_columns]
476
+ cache[table][extra_columns] = cache_columns.convert_dtypes()
477
+ column_ids = map(list(cache[table].columns).index, extra_columns)
478
+ for col, n in sorted(zip(extra_columns, column_ids), key=lambda x: x[1]):
479
+ dtype = cache[table][col].dtype
480
+ nan = getattr(dtype, 'na_value', np.nan)
481
+ val = records.get('exists', True) if col.startswith('exists_') else nan
482
+ records.insert(n, col, val)
483
+ # Drop any extra columns in the records that aren't in cache table
484
+ to_drop = set(records.columns) - set(cache[table].columns)
485
+ records = records.drop(to_drop, axis=1)
486
+ records = records.reindex(columns=cache[table].columns)
487
+ assert set(cache[table].columns) == set(records.columns)
488
+ records = records.astype(cache[table].dtypes)
489
+ # Update existing rows
490
+ to_update = records.index.isin(cache[table].index)
491
+ cache[table].loc[records.index[to_update], :] = records[to_update]
492
+ # Assign new rows
493
+ to_assign = records[~to_update]
494
+ frames = [cache[table], to_assign]
495
+ # Concatenate and sort
496
+ cache[table] = pd.concat(frames).sort_index()
497
+ updated = datetime.datetime.now()
498
+ cache['_meta']['modified_time'] = updated
499
+ return updated
500
+
501
+
502
+ def remove_missing_datasets(cache_dir, tables=None, remove_empty_sessions=True, dry=True):
503
+ """Remove dataset files and session folders that are not in the provided cache.
504
+
505
+ NB: This *does not* remove entries from the cache tables that are missing on disk.
506
+ Non-ALF files are not removed. Empty sessions that exist in the sessions table are not removed.
507
+
508
+ Parameters
509
+ ----------
510
+ cache_dir : str, pathlib.Path
511
+ tables : dict[str, pandas.DataFrame], optional
512
+ A dict with keys ('sessions', 'datasets'), containing the cache tables as DataFrames.
513
+ remove_empty_sessions : bool
514
+ Attempt to remove session folders that are empty and not in the sessions table.
515
+ dry : bool
516
+ If true, do not remove anything.
517
+
518
+ Returns
519
+ -------
520
+ list
521
+ A sorted list of paths to be removed.
522
+
523
+ """
524
+ cache_dir = Path(cache_dir)
525
+ if tables is None:
526
+ tables = load_tables(cache_dir)
527
+
528
+ to_delete = set()
529
+ from one.converters import session_record2path # imported here due to circular imports
530
+ gen_path = partial(session_record2path, root_dir=cache_dir)
531
+ # map of session path to eid
532
+ sessions = {gen_path(rec): eid for eid, rec in tables['sessions'].iterrows()}
533
+ for session_path in iter_sessions(cache_dir):
534
+ try:
535
+ datasets = tables['datasets'].loc[sessions[session_path]]
536
+ except KeyError:
537
+ datasets = tables['datasets'].iloc[0:0, :]
538
+ for dataset in session_path.iter_datasets():
539
+ if dataset.relative_to_session().as_posix() not in datasets['rel_path']:
540
+ to_delete.add(dataset)
541
+ if session_path not in sessions and remove_empty_sessions:
542
+ to_delete.add(session_path)
543
+
544
+ if dry:
545
+ print('The following session and datasets would be removed:', end='\n\t')
546
+ print('\n\t'.join(sorted(map(str, to_delete))))
547
+ return sorted(to_delete)
548
+
549
+ # Delete datasets
550
+ for path in to_delete:
551
+ if path.is_file():
552
+ _logger.debug(f'Removing {path}')
553
+ path.unlink()
554
+ else:
555
+ # Recursively remove empty folders
556
+ while path.parent != cache_dir and not next(path.rglob('*'), False):
557
+ _logger.debug(f'Removing {path}')
558
+ path.rmdir()
559
+ path = path.parent
560
+
561
+ return sorted(to_delete)
562
+
563
+
564
+ def remove_table_files(folder, tables=('sessions', 'datasets')):
565
+ """Delete cache tables on disk.
566
+
567
+ Parameters
568
+ ----------
569
+ folder : pathlib.Path
570
+ The directory path containing cache tables to remove.
571
+ tables : list of str
572
+ A list of table names to remove, e.g. ['sessions', 'datasets'].
573
+ NB: This will also delete the cache_info.json metadata file.
574
+
575
+ Returns
576
+ -------
577
+ list of pathlib.Path
578
+ A list of the removed files.
579
+
580
+ """
581
+ filenames = ('cache_info.json', *(f'{t}.pqt' for t in tables))
582
+ removed = []
583
+ for file in map(folder.joinpath, filenames):
584
+ if file.exists():
585
+ file.unlink()
586
+ removed.append(file)
587
+ else:
588
+ _logger.warning('%s not found', file)
589
+ return removed
590
+
591
+
592
+ def _cache_int2str(table: pd.DataFrame) -> pd.DataFrame:
593
+ """Convert int ids to str ids for cache table.
594
+
595
+ Parameters
596
+ ----------
597
+ table : pd.DataFrame
598
+ A cache table (from One._cache).
599
+
600
+ """
601
+ # Convert integer uuids to str uuids
602
+ if table.index.nlevels < 2 or not any(x.endswith('_0') for x in table.index.names):
603
+ return table
604
+ table = table.reset_index()
605
+ int_cols = table.filter(regex=r'_\d{1}$').columns.sort_values()
606
+ assert not len(int_cols) % 2, 'expected even number of columns ending in _0 or _1'
607
+ names = sorted(set(c.rsplit('_', 1)[0] for c in int_cols.values))
608
+ for i, name in zip(range(0, len(int_cols), 2), names):
609
+ table[name] = parquet.np2str(table[int_cols[i:i + 2]])
610
+ table = table.drop(int_cols, axis=1).set_index(names)
611
+ return table
612
+
613
+
614
+ def patch_tables(table: pd.DataFrame, min_api_version=None, name=None) -> pd.DataFrame:
615
+ """Reformat older cache tables to comply with this version of ONE.
616
+
617
+ Currently this function will 1. convert integer UUIDs to string UUIDs; 2. rename the 'project'
618
+ column to 'projects'; 3. add QC column; 4. drop session_path column.
619
+
620
+ Parameters
621
+ ----------
622
+ table : pd.DataFrame
623
+ A cache table (from One._cache).
624
+ min_api_version : str
625
+ The minimum API version supported by this cache table.
626
+ name : {'datasets', 'sessions'}
627
+ The name of the table.
628
+
629
+ """
630
+ min_version = version.parse(min_api_version or '0.0.0')
631
+ table = _cache_int2str(table)
632
+ # Rename project column
633
+ if min_version < version.Version('1.13.0') and 'project' in table.columns:
634
+ table.rename(columns={'project': 'projects'}, inplace=True)
635
+ if name == 'datasets' and min_version < version.Version('2.7.0') and 'qc' not in table.columns:
636
+ qc = pd.Categorical.from_codes(np.zeros(len(table.index), dtype=int), dtype=QC_TYPE)
637
+ table = table.assign(qc=qc)
638
+ if name == 'datasets' and 'session_path' in table.columns:
639
+ table = table.drop('session_path', axis=1)
640
+ return table
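For reference, the cache workflow in one/alf/cache.py can be exercised end to end as below. This is a minimal sketch in the doctest style of the module docstring above; 'path/to/data' is a placeholder directory and the sketch assumes it contains at least one ALF session with datasets.

>>> from one.alf.cache import make_parquet_db, load_tables, merge_tables
>>> # Index an ALF directory and write sessions.pqt / datasets.pqt alongside the data
>>> fn_ses, fn_dsets = make_parquet_db('path/to/data', hash_ids=True, hash_files=False)
>>> # Load the tables back as a Bunch keyed 'sessions' and 'datasets', with bookkeeping under '_meta'
>>> tables = load_tables('path/to/data')
>>> # Insert or update records for a given table; the Bunch is updated in place and the
>>> # modification timestamp is returned and stored in tables['_meta']['modified_time']
>>> modified = merge_tables(tables, datasets=tables['datasets'].iloc[:1])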