ONE-api 3.0b3-py3-none-any.whl → 3.0b4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {ONE_api-3.0b3.dist-info → ONE_api-3.0b4.dist-info}/LICENSE +21 -21
  2. {ONE_api-3.0b3.dist-info → ONE_api-3.0b4.dist-info}/METADATA +115 -115
  3. ONE_api-3.0b4.dist-info/RECORD +37 -0
  4. one/__init__.py +2 -2
  5. one/alf/__init__.py +1 -1
  6. one/alf/cache.py +640 -653
  7. one/alf/exceptions.py +105 -105
  8. one/alf/io.py +876 -876
  9. one/alf/path.py +1450 -1450
  10. one/alf/spec.py +519 -519
  11. one/api.py +2949 -2973
  12. one/converters.py +850 -850
  13. one/params.py +414 -414
  14. one/registration.py +845 -845
  15. one/remote/__init__.py +1 -1
  16. one/remote/aws.py +313 -313
  17. one/remote/base.py +142 -142
  18. one/remote/globus.py +1254 -1254
  19. one/tests/fixtures/params/.caches +6 -6
  20. one/tests/fixtures/params/.test.alyx.internationalbrainlab.org +8 -8
  21. one/tests/fixtures/rest_responses/1f187d80fd59677b395fcdb18e68e4401bfa1cc9 +1 -1
  22. one/tests/fixtures/rest_responses/47893cf67c985e6361cdee009334963f49fb0746 +1 -1
  23. one/tests/fixtures/rest_responses/535d0e9a1e2c1efbdeba0d673b131e00361a2edb +1 -1
  24. one/tests/fixtures/rest_responses/6dc96f7e9bcc6ac2e7581489b9580a6cd3f28293 +1 -1
  25. one/tests/fixtures/rest_responses/db1731fb8df0208944ae85f76718430813a8bf50 +1 -1
  26. one/tests/fixtures/rest_responses/dcce48259bb929661f60a02a48563f70aa6185b3 +1 -1
  27. one/tests/fixtures/rest_responses/f530d6022f61cdc9e38cc66beb3cb71f3003c9a1 +1 -1
  28. one/tests/fixtures/test_dbs.json +14 -14
  29. one/util.py +524 -524
  30. one/webclient.py +1366 -1354
  31. ONE_api-3.0b3.dist-info/RECORD +0 -37
  32. {ONE_api-3.0b3.dist-info → ONE_api-3.0b4.dist-info}/WHEEL +0 -0
  33. {ONE_api-3.0b3.dist-info → ONE_api-3.0b4.dist-info}/top_level.txt +0 -0
one/alf/io.py CHANGED
@@ -1,876 +1,876 @@
1
- """I/O functions for ALyx Files.
2
-
3
- Provides support for time-series reading and interpolation as per the specifications
4
- For a full overview of the scope of the format, see:
5
-
6
- https://int-brain-lab.github.io/ONE/alf_intro.html
7
- """
8
-
9
- import json
10
- import copy
11
- import logging
12
- import re
13
- from fnmatch import fnmatch
14
- from pathlib import Path
15
- from typing import Union
16
- from functools import partial
17
- from itertools import chain
18
- import warnings
19
-
20
- import numpy as np
21
- import pandas as pd
22
- import yaml
23
-
24
- from iblutil.util import Bunch
25
- from iblutil.io import parquet
26
- from iblutil.io import jsonable
27
- from .exceptions import ALFObjectNotFound
28
- from . import path, spec
29
- from .spec import FILE_SPEC
30
-
31
- _logger = logging.getLogger(__name__)
32
-
33
-
34
- class AlfBunch(Bunch):
35
- """A dict-like object that supports dot indexing and conversion to DataFrame."""
36
-
37
- @property
38
- def check_dimensions(self):
39
- """int: 0 for consistent dimensions, 1 for inconsistent dimensions."""
40
- return check_dimensions(self)
41
-
42
- def append(self, b, inplace=False):
43
- """Appends one bunch to another, key by key.
44
-
45
- Parameters
46
- ----------
47
- b : Bunch, dict
48
- A Bunch of data to append
49
- inplace : bool
50
- If true, the data are appended in place, otherwise a copy is returned
51
-
52
- Returns
53
- -------
54
- ALFBunch, None
55
- An ALFBunch with the data appended, or None if inplace is True
56
-
57
- """
58
- # default is to return a copy
59
- if inplace:
60
- a = self
61
- else:
62
- a = AlfBunch(copy.deepcopy(self))
63
- # handles empty bunches for convenience if looping
64
- if b == {}:
65
- return a
66
- if a == {}:
67
- return AlfBunch(b)
68
- # right now supports only strictly matching keys. Will implement other cases as needed
69
- if set(a.keys()) != set(b.keys()):
70
- raise NotImplementedError('Append bunches only works with strictly matching keys'
71
- 'For more complex merges, convert to pandas dataframe.')
72
- # do the merge; only concatenate lists and np arrays right now
73
- for k in a:
74
- if isinstance(a[k], np.ndarray):
75
- a[k] = np.concatenate((a[k], b[k]), axis=0)
76
- elif isinstance(a[k], list):
77
- a[k].extend(b[k])
78
- else:
79
- _logger.warning(f'bunch key "{k}" is a {a[k].__class__}. I don\'t know how to'
80
- f' handle that. Use pandas for advanced features')
81
- if a.check_dimensions != 0:
82
- print_sizes = '\n'.join(f'{v.shape},\t{k}' for k, v in a.items())
83
- _logger.warning(f'Inconsistent dimensions for object: \n{print_sizes}')
84
-
85
- return a
86
-
87
- def to_df(self) -> pd.DataFrame:
88
- """Return DataFrame with data keys as columns."""
89
- return dataframe(self)
90
-
91
- @staticmethod
92
- def from_df(df) -> 'AlfBunch':
93
- data = dict(zip(df.columns, df.values.T))
94
- split_keys = sorted(x for x in data.keys() if re.match(r'.+?_[01]$', x))
95
- for x1, x2 in zip(*[iter(split_keys)] * 2):
96
- data[x1[:-2]] = np.c_[data.pop(x1), data.pop(x2)]
97
- return AlfBunch(data)
98
-
99
-
100
- def dataframe(adict):
101
- """Convert an Bunch conforming to size conventions into a pandas DataFrame.
102
-
103
- For 2-D arrays, stops at 10 columns per attribute.
104
-
105
- Parameters
106
- ----------
107
- adict : dict, Bunch
108
- A dict-like object of data to convert to DataFrame
109
-
110
- Returns
111
- -------
112
- pd.DataFrame
113
- A pandas DataFrame of data
114
-
115
- """
116
- if check_dimensions(adict) != 0:
117
- raise ValueError('Can only convert to DataFrame objects with consistent size')
118
- # easy case where there are only vectors
119
- if all([len(adict[k].shape) == 1 for k in adict]):
120
- return pd.DataFrame(adict)
121
- # pandas has trouble with 2d data, chop it off with a limit of 10 columns per dataset
122
- df = pd.DataFrame()
123
- for k in adict.keys():
124
- if adict[k].ndim == 1:
125
- df[k] = adict[k]
126
- elif adict[k].ndim == 2 and adict[k].shape[1] == 1:
127
- df[k] = adict[k][:, 0]
128
- elif adict[k].ndim == 2:
129
- for i in np.arange(adict[k].shape[1]):
130
- df[f"{k}_{i}"] = adict[k][:, i]
131
- if i == 9:
132
- break
133
- else:
134
- _logger.warning(f'{k} attribute is 3D or more and won\'t convert to dataframe')
135
- continue
136
- return df
137
-
138
-
139
- def _find_metadata(file_alf) -> path.ALFPath:
140
- """File path for an existing meta-data file for an alf_file.
141
-
142
- Parameters
143
- ----------
144
- file_alf : str, pathlib.Path
145
- A path of existing ALF.
146
-
147
- Returns
148
- -------
149
- one.alf.path.ALFPath
150
- Path of meta-data file if exists.
151
-
152
- """
153
- file_alf = path.ALFPath(file_alf)
154
- ns, obj = file_alf.name.split('.')[:2]
155
- return next(file_alf.parent.glob(f'{ns}.{obj}*.metadata*.json'), None)
156
-
157
-
158
- def read_ts(filename):
159
- """Load time-series from ALF format.
160
-
161
- Parameters
162
- ----------
163
- filename : str, pathlib.Path
164
- An ALF path whose values to load
165
-
166
- Returns
167
- -------
168
- numpy.ndarray
169
- An array of timestamps belonging to the ALF path object
170
- numpy.ndarray
171
- An array of values in filename
172
-
173
- Examples
174
- --------
175
- >>> t, d = read_ts(filename)
176
-
177
- """
178
- filename = path.ensure_alf_path(filename)
179
-
180
- # alf format is object.attribute.extension, for example '_ibl_wheel.position.npy'
181
- _, obj, attr, *_, ext = filename.dataset_name_parts
182
-
183
- try:
184
- # looking for matching object with attribute timestamps: '_ibl_wheel.timestamps.npy'
185
- (time_file,), _ = filter_by(filename.parent, object=obj, attribute='times*', extension=ext)
186
- assert time_file
187
- except (ValueError, AssertionError):
188
- name = spec.to_alf(obj, attr, ext)
189
- raise FileNotFoundError(name + ' not found! No time-scale for ' + str(filename))
190
-
191
- ts = np.load(filename.parent / time_file)
192
- val = np.load(filename)
193
- # Ensure timestamps
194
- return ts2vec(ts, val.shape[0]), _ensure_flat(val)
195
-
196
-
197
- def _ensure_flat(arr):
198
- """Given a single column array, returns a flat vector. Other shapes are returned unchanged.
199
-
200
- Parameters
201
- ----------
202
- arr : numpy.array
203
- An array with shape (n, 1)
204
-
205
- Returns
206
- -------
207
- numpy.ndarray
208
- A vector with shape (n,)
209
-
210
- """
211
- return arr.flatten() if arr.ndim == 2 and arr.shape[1] == 1 else arr
212
-
213
-
214
- def ts2vec(ts: np.ndarray, n_samples: int) -> np.ndarray:
215
- """Interpolate a continuous timeseries of the shape (2, 2).
216
-
217
- Parameters
218
- ----------
219
- ts : numpy.array
220
- a 2x2 numpy array of the form (sample, ts)
221
- n_samples : int
222
- Number of samples; i.e. the size of the resulting vector
223
-
224
- Returns
225
- -------
226
- numpy.ndarray
227
- A vector of interpolated timestamps
228
-
229
- """
230
- if len(ts.shape) == 1:
231
- return ts
232
- elif ts.ndim == 2 and ts.shape[1] == 1:
233
- return ts.flatten() # Deal with MATLAB single column array
234
- if ts.ndim > 2 or ts.shape[1] != 2:
235
- raise ValueError('Array shape should be (2, 2)')
236
- # Linearly interpolate the times
237
- x = np.arange(n_samples)
238
- return np.interp(x, ts[:, 0], ts[:, 1])
239
-
240
-
241
- def check_dimensions(dico):
242
- """Test for consistency of dimensions as per ALF specs in a dictionary.
243
-
244
- Alf broadcasting rules: only accepts consistent dimensions for a given axis
245
- a dimension is consistent with another if it's empty, 1, or equal to the other arrays
246
- dims [a, 1], [1, b] and [a, b] are all consistent, [c, 1] is not
247
-
248
- Parameters
249
- ----------
250
- dico : ALFBunch, dict
251
- Dictionary containing data
252
-
253
- Returns
254
- -------
255
- int
256
- Status 0 for consistent dimensions, 1 for inconsistent dimensions
257
-
258
- """
259
- supported = (np.ndarray, pd.DataFrame) # Data types that have a shape attribute
260
- shapes = [dico[lab].shape for lab in dico
261
- if isinstance(dico[lab], supported) and not lab.startswith('timestamps')]
262
- first_shapes = [sh[0] for sh in shapes]
263
- # Continuous timeseries are permitted to be a (2, 2)
264
- timeseries = [k for k, v in dico.items()
265
- if k.startswith('timestamps') and isinstance(v, np.ndarray)]
266
- if any(timeseries):
267
- for key in timeseries:
268
- if dico[key].ndim == 1 or (dico[key].ndim == 2 and dico[key].shape[1] == 1):
269
- # Should be vector with same length as other attributes
270
- first_shapes.append(dico[key].shape[0])
271
- elif dico[key].ndim > 1 and dico[key].shape != (2, 2):
272
- return 1 # ts not a (2, 2) arr or a vector
273
-
274
- ok = len(first_shapes) == 0 or set(first_shapes).issubset({max(first_shapes), 1})
275
- return int(ok is False)
276
-
277
-
278
- def load_file_content(fil):
279
- """Return content of a file path.
280
-
281
- Designed for very generic data file formats such as `json`, `npy`, `csv`, `(h)tsv`, `ssv`.
282
-
283
- Parameters
284
- ----------
285
- fil : str, pathlib.Path
286
- File to read
287
-
288
- Returns
289
- -------
290
- Any
291
- Array/json/pandas dataframe depending on format
292
-
293
- """
294
- if not fil:
295
- return
296
- fil = Path(fil)
297
- if fil.stat().st_size == 0:
298
- return
299
- if fil.suffix == '.csv':
300
- return pd.read_csv(fil).squeeze('columns')
301
- if fil.suffix == '.json':
302
- try:
303
- with open(fil) as _fil:
304
- return json.loads(_fil.read())
305
- except Exception as e:
306
- _logger.error(e)
307
- return None
308
- if fil.suffix == '.jsonable':
309
- return jsonable.read(fil)
310
- if fil.suffix == '.npy':
311
- return _ensure_flat(np.load(file=fil, allow_pickle=True))
312
- if fil.suffix == '.npz':
313
- arr = np.load(file=fil)
314
- # If single array with the default name ('arr_0') return individual array
315
- return arr['arr_0'] if set(arr.files) == {'arr_0'} else arr
316
- if fil.suffix == '.pqt':
317
- return parquet.load(fil)[0]
318
- if fil.suffix == '.ssv':
319
- return pd.read_csv(fil, delimiter=' ').squeeze('columns')
320
- if fil.suffix in ('.tsv', '.htsv'):
321
- return pd.read_csv(fil, delimiter='\t').squeeze('columns')
322
- if fil.suffix in ('.yml', '.yaml'):
323
- with open(fil, 'r') as _fil:
324
- return yaml.safe_load(_fil)
325
- if fil.suffix == '.sparse_npz':
326
- try:
327
- import sparse
328
- return sparse.load_npz(fil)
329
- except ModuleNotFoundError:
330
- warnings.warn(f'{Path(fil).name} requires the pydata sparse package to load.')
331
- return path.ALFPath(fil)
332
- return path.ALFPath(fil)
333
-
334
-
335
- def _ls(alfpath, object=None, **kwargs) -> (list, tuple):
336
- """Given a path, an object and a filter, returns all files and associated attributes.
337
-
338
- Parameters
339
- ----------
340
- alfpath : str, pathlib.Path
341
- The folder to list
342
- object : str, list
343
- An ALF object name to filter by
344
- wildcards : bool
345
- If true uses unix shell style pattern matching, otherwise uses regular expressions
346
- kwargs
347
- Other ALF parts to filter, including namespace, attribute, etc.
348
-
349
- Returns
350
- -------
351
- list of one.alf.path.ALFPath
352
- A list of ALF paths.
353
- tuple
354
- A tuple of ALF attributes corresponding to the file paths.
355
-
356
- Raises
357
- ------
358
- ALFObjectNotFound
359
- No matching ALF object was found in the alfpath directory
360
-
361
- """
362
- alfpath = path.ALFPath(alfpath)
363
- if not alfpath.exists():
364
- files_alf = attributes = None
365
- elif alfpath.is_dir():
366
- if object is None:
367
- # List all ALF files
368
- files_alf, attributes = filter_by(alfpath, **kwargs)
369
- else:
370
- files_alf, attributes = filter_by(alfpath, object=object, **kwargs)
371
- else:
372
- object = alfpath.object
373
- alfpath = alfpath.parent
374
- files_alf, attributes = filter_by(alfpath, object=object, **kwargs)
375
-
376
- # raise error if no files found
377
- if not files_alf:
378
- err_str = f'object "{object}"' if object else 'ALF files'
379
- raise ALFObjectNotFound(f'No {err_str} found in {alfpath}')
380
-
381
- return [alfpath.joinpath(f) for f in files_alf], attributes
382
-
383
-
384
- def iter_sessions(root_dir, pattern='*'):
385
- """Recursively iterate over session paths in a given directory.
386
-
387
- Parameters
388
- ----------
389
- root_dir : str, pathlib.Path
390
- The folder to look for sessions.
391
- pattern : str
392
- Glob pattern to use. Default searches all folders. Providing a more specific pattern makes
393
- this more performant (see examples).
394
-
395
- Yields
396
- ------
397
- pathlib.Path
398
- The next session path in lexicographical order.
399
-
400
- Examples
401
- --------
402
- Efficient iteration when `root_dir` contains <lab>/Subjects folders
403
-
404
- >>> sessions = list(iter_sessions(root_dir, pattern='*/Subjects/*/????-??-??/*'))
405
-
406
- Efficient iteration when `root_dir` contains subject folders
407
-
408
- >>> sessions = list(iter_sessions(root_dir, pattern='*/????-??-??/*'))
409
-
410
- """
411
- if spec.is_session_path(root_dir):
412
- yield path.ALFPath(root_dir)
413
- for p in sorted(Path(root_dir).rglob(pattern)):
414
- if p.is_dir() and spec.is_session_path(p):
415
- yield path.ALFPath(p)
416
-
417
-
418
- def iter_datasets(session_path):
419
- """Iterate over all files in a session, and yield relative dataset paths.
420
-
421
- Parameters
422
- ----------
423
- session_path : str, pathlib.Path
424
- The folder to look for datasets.
425
-
426
- Yields
427
- ------
428
- one.alf.path.ALFPath
429
- The next dataset path (relative to the session path) in lexicographical order.
430
-
431
- """
432
- for dataset in path.ALFPath(session_path).iter_datasets(recursive=True):
433
- yield dataset.relative_to(session_path)
434
-
435
-
436
- def exists(alfpath, object, attributes=None, **kwargs) -> bool:
437
- """Test if ALF object and optionally specific attributes exist in the given path.
438
-
439
- Parameters
440
- ----------
441
- alfpath : str, pathlib.Path
442
- The folder to look into
443
- object : str
444
- ALF object name
445
- attributes : str, list
446
- Wanted attributes
447
- wildcards : bool
448
- If true uses unix shell style pattern matching, otherwise uses regular expressions
449
- kwargs
450
- Other ALF parts to filter by
451
-
452
- Returns
453
- -------
454
- bool
455
- For multiple attributes, returns True only if all attributes are found
456
-
457
- """
458
- # if the object is not found, return False
459
- try:
460
- _, attributes_found = _ls(alfpath, object, **kwargs)
461
- except (FileNotFoundError, ALFObjectNotFound):
462
- return False
463
-
464
- # if object found and no attribute provided, True
465
- if not attributes:
466
- return True
467
-
468
- # if attributes provided, test if all are found
469
- if isinstance(attributes, str):
470
- attributes = [attributes]
471
- attributes_found = set(part[2] for part in attributes_found)
472
- return set(attributes).issubset(attributes_found)
473
-
474
-
475
- def load_object(alfpath, object=None, short_keys=False, **kwargs):
476
- """Reads all files sharing the same object name.
477
-
478
- For example, if the file provided to the function is `spikes.times`, the function will
479
- load `spikes.times`, `spikes.clusters`, `spikes.depths`, `spike.amps` in a dictionary
480
- whose keys will be `times`, `clusters`, `depths`, `amps`
481
-
482
- Full Reference here: https://int-brain-lab.github.io/ONE/alf_intro.html
483
-
484
- Simplified example: _namespace_object.attribute_timescale.part1.part2.extension
485
-
486
- Parameters
487
- ----------
488
- alfpath : str, pathlib.Path, list
489
- Any ALF path pertaining to the object OR directory containing ALFs OR list of paths.
490
- object : str, list, None
491
- The ALF object(s) to filter by. If a directory is provided and object is None, all valid
492
- ALF files returned.
493
- short_keys : bool
494
- By default, the output dictionary keys will be compounds of attributes, timescale and
495
- any eventual parts separated by a dot. Use True to shorten the keys to the attribute
496
- and timescale.
497
- wildcards : bool
498
- If true uses unix shell style pattern matching, otherwise uses regular expressions.
499
- kwargs
500
- Other ALF parts to filter by.
501
-
502
- Returns
503
- -------
504
- AlfBunch
505
- An ALFBunch (dict-like) of all attributes pertaining to the object.
506
-
507
- Examples
508
- --------
509
- Load 'spikes' object
510
-
511
- >>> spikes = load_object('full/path/to/my/alffolder/', 'spikes')
512
-
513
- Load 'trials' object under the 'ibl' namespace
514
-
515
- >>> trials = load_object('/subject/2021-01-01/001', 'trials', namespace='ibl')
516
-
517
- """
518
- if isinstance(alfpath, (Path, str)):
519
- if Path(alfpath).is_dir() and object is None:
520
- raise ValueError('If a directory is provided, the object name should be provided too')
521
- files_alf, parts = _ls(alfpath, object, **kwargs)
522
- else: # A list of paths allows us to load an object from different revisions
523
- files_alf = list(map(path.ALFPath, alfpath))
524
- parts = [x.dataset_name_parts for x in files_alf]
525
- assert len(set(p[1] for p in parts)) == 1
526
- object = next(x[1] for x in parts)
527
- # Take attribute and timescale from parts list
528
- attributes = [p[2] if not p[3] else '_'.join(p[2:4]) for p in parts]
529
- if not short_keys: # Include extra parts in the keys
530
- attributes = ['.'.join(filter(None, (attr, p[4]))) for attr, p in zip(attributes, parts)]
531
- # TODO List duplicates; raise ALFError
532
- assert len(set(attributes)) == len(attributes), (
533
- f'multiple object {object} with the same attribute in {alfpath}, restrict parts/namespace')
534
- out = AlfBunch({})
535
-
536
- # load content for each file
537
- for fil, att in zip(files_alf, attributes):
538
- # if there is a corresponding metadata file, read it:
539
- meta_data_file = _find_metadata(fil)
540
- # if this is the actual meta-data file, skip and it will be read later
541
- if meta_data_file == fil:
542
- continue
543
- out[att] = load_file_content(fil)
544
- if meta_data_file:
545
- meta = load_file_content(meta_data_file)
546
- # the columns keyword splits array along the last dimension
547
- if 'columns' in meta.keys():
548
- out.update({v: out[att][::, k] for k, v in enumerate(meta['columns'])})
549
- out.pop(att)
550
- meta.pop('columns')
551
- # if there is other stuff in the dictionary, save it, otherwise disregard
552
- if meta:
553
- out[att + 'metadata'] = meta
554
- # Merge 'table' dataframe into bunch
555
- table_key = next(filter(re.compile(r'^table([_.]|$)').match, out), None) # py 3.8
556
- if table_key:
557
- table = out.pop(table_key)
558
-
559
- def rename_columns(field):
560
- """"Rename DataFrame fields to include timescale or extra ALF parts from table_key.
561
-
562
- For example...
563
- with table_key = table_clock, field1 -> field1_clock;
564
- with table_key = table_clock.extra, field1_0 -> field1_clock.extra_0;
565
- with table_key = table, field1 -> field1
566
- """
567
- return (field[:-2] + table_key[5:] + field[-2:]
568
- if re.match(r'.+?_[01]$', field)
569
- else field + table_key[5:])
570
- table.rename(columns=rename_columns, inplace=True)
571
- out.update(AlfBunch.from_df(table))
572
- status = out.check_dimensions
573
- timeseries = [k for k in out.keys() if 'timestamps' in k]
574
- if any(timeseries) and len(out.keys()) > len(timeseries) and status == 0:
575
- # Get length of one of the other arrays
576
- ignore = ('timestamps', 'meta')
577
- n_samples = next(v for k, v in out.items() if not any(x in k for x in ignore)).shape[0]
578
- for key in timeseries:
579
- # Expand timeseries if necessary
580
- out[key] = ts2vec(out[key], n_samples)
581
- if status != 0:
582
- supported = (np.ndarray, pd.DataFrame)
583
- print_sizes = '\n'.join(
584
- f'{v.shape},\t{k}' for k, v in out.items() if isinstance(v, supported)
585
- )
586
- _logger.warning(f'Inconsistent dimensions for object: {object} \n{print_sizes}')
587
- return out
588
-
589
-
590
- def save_object_npy(alfpath, dico, object, parts=None, namespace=None, timescale=None) -> list:
591
- """Save dictionary in `ALF format`_ using dictionary keys as attribute names.
592
-
593
- Dimensions have to be consistent.
594
-
595
- Simplified ALF example: _namespace_object.attribute.part1.part2.extension.
596
-
597
- Parameters
598
- ----------
599
- alfpath : str, pathlib.Path
600
- Path of the folder to save data to.
601
- dico : dict
602
- Dictionary to save to npy; keys correspond to ALF attributes.
603
- object : str
604
- Name of the object to save.
605
- parts : str, list, None
606
- Extra parts to the ALF name.
607
- namespace : str, None
608
- The optional namespace of the object.
609
- timescale : str, None
610
- The optional timescale of the object.
611
-
612
- Returns
613
- -------
614
- list of one.alf.path.ALFPath
615
- List of written files.
616
-
617
- Examples
618
- --------
619
- >>> spikes = {'times': np.arange(50), 'depths': np.random.random(50)}
620
- >>> files = save_object_npy('/path/to/my/alffolder/', spikes, 'spikes')
621
-
622
- .. _ALF format:
623
- https://int-brain-lab.github.io/ONE/alf_intro.html
624
-
625
- """
626
- alfpath = path.ALFPath(alfpath)
627
- status = check_dimensions(dico)
628
- if status != 0:
629
- raise ValueError('Dimensions are not consistent to save all arrays in ALF format: ' +
630
- str([(k, v.shape) for k, v in dico.items()]))
631
- out_files = []
632
- for k, v in dico.items():
633
- out_file = alfpath / spec.to_alf(object, k, 'npy',
634
- extra=parts, namespace=namespace, timescale=timescale)
635
- np.save(out_file, v)
636
- out_files.append(out_file)
637
- return out_files
638
-
639
-
640
- def save_metadata(file_alf, dico) -> path.ALFPath:
641
- """Writes a meta data file matching a current ALF file object.
642
-
643
- For example given an alf file `clusters.ccfLocation.ssv` this will write a dictionary in JSON
644
- format in `clusters.ccfLocation.metadata.json`
645
-
646
- Reserved keywords:
647
- - **columns**: column names for binary tables.
648
- - **row**: row names for binary tables.
649
- - **unit**
650
-
651
- Parameters
652
- ----------
653
- file_alf : str, pathlib.Path
654
- Full path to the alf object
655
- dico : dict, ALFBunch
656
- Dictionary containing meta-data
657
-
658
- Returns
659
- -------
660
- one.alf.path.ALFPath
661
- The saved metadata file path.
662
-
663
- """
664
- file_alf = path.ALFPath(file_alf)
665
- assert file_alf.is_dataset, 'ALF filename not valid'
666
- file_meta_data = file_alf.parent / (file_alf.stem + '.metadata.json')
667
- with open(file_meta_data, 'w+') as fid:
668
- fid.write(json.dumps(dico, indent=1))
669
- return file_meta_data
670
-
671
-
672
- def next_num_folder(session_date_folder: Union[str, Path]) -> str:
673
- """Return the next number for a session given a session_date_folder."""
674
- session_date_folder = Path(session_date_folder)
675
- if not session_date_folder.exists():
676
- return '001'
677
- session_nums = [
678
- int(x.name) for x in session_date_folder.iterdir()
679
- if x.is_dir() and not x.name.startswith('.') and x.name.isdigit()
680
- ]
681
- out = f'{max(session_nums or [0]) + 1:03d}'
682
- assert len(out) == 3, 'ALF spec does not support session numbers > 999'
683
- return out
684
-
685
-
686
- def remove_empty_folders(folder: Union[str, Path]) -> None:
687
- """Iteratively remove any empty child folders."""
688
- all_folders = sorted(x for x in Path(folder).rglob('*') if x.is_dir())
689
- for f in reversed(all_folders): # Reversed sorted ensures we remove deepest first
690
- try:
691
- f.rmdir()
692
- except Exception:
693
- continue
694
-
695
-
696
- def filter_by(alf_path, wildcards=True, **kwargs):
697
- """Given a path and optional filters, returns all ALF files and their associated parts.
698
-
699
- The filters constitute a logical AND. For all but `extra`, if a list is provided, one or more
700
- elements must match (a logical OR).
701
-
702
- Parameters
703
- ----------
704
- alf_path : str, pathlib.Path
705
- A path to a folder containing ALF datasets.
706
- wildcards : bool
707
- If true, kwargs are matched as unix-style patterns, otherwise as regular expressions.
708
- object : str, list
709
- Filter by a given object (e.g. 'spikes').
710
- attribute : str, list
711
- Filter by a given attribute (e.g. 'intervals').
712
- extension : str, list
713
- Filter by extension (e.g. 'npy').
714
- namespace : str, list
715
- Filter by a given namespace (e.g. 'ibl') or None for files without one.
716
- timescale : str, list
717
- Filter by a given timescale (e.g. 'bpod') or None for files without one.
718
- extra : str, list
719
- Filter by extra parameters (e.g. 'raw') or None for files without extra parts
720
- NB: Wild cards not permitted here.
721
-
722
- Returns
723
- -------
724
- alf_files : list of one.alf.path.ALFPath
725
- A Path to a directory containing ALF files.
726
- attributes : list of dicts
727
- A list of parsed file parts.
728
-
729
- Examples
730
- --------
731
- Filter files with universal timescale
732
-
733
- >>> filter_by(alf_path, timescale=None)
734
-
735
- Filter files by a given ALF object
736
-
737
- >>> filter_by(alf_path, object='wheel')
738
-
739
- Filter using wildcard, e.g. 'wheel' and 'wheelMoves' ALF objects
740
-
741
- >>> filter_by(alf_path, object='wh*')
742
-
743
- Filter all intervals that are in bpod time
744
-
745
- >>> filter_by(alf_path, attribute='intervals', timescale='bpod')
746
-
747
- Filter all files containing either 'intervals' OR 'timestamps' attributes
748
-
749
- >>> filter_by(alf_path, attribute=['intervals', 'timestamps'])
750
-
751
- Filter all files using a regular expression
752
-
753
- >>> filter_by(alf_path, object='^wheel.*', wildcards=False)
754
- >>> filter_by(alf_path, object=['^wheel$', '.*Moves'], wildcards=False)
755
-
756
- """
757
- alf_files = [f.relative_to(alf_path) for f in path.ALFPath(alf_path).iter_datasets()]
758
- attributes = list(map(path.ALFPath.parse_alf_name, alf_files))
759
-
760
- if kwargs:
761
- # Validate keyword arguments against regex group names
762
- invalid = kwargs.keys() - spec.regex(FILE_SPEC).groupindex.keys()
763
- if invalid:
764
- raise TypeError('%s() got an unexpected keyword argument "%s"'
765
- % (__name__, set(invalid).pop()))
766
-
767
- # # Ensure 'extra' input is a list; if str split on dot
768
- if 'extra' in kwargs and isinstance(kwargs['extra'], str):
769
- kwargs['extra'] = kwargs['extra'].split('.')
770
-
771
- def _match(part, pattern, split=None):
772
- if pattern is None or part is None:
773
- # If either is None, both should be None to match
774
- return pattern is part
775
- elif split:
776
- # Check all provided extra fields match those in ALF
777
- return all(elem in part.split(split) for elem in pattern if elem)
778
- elif not isinstance(pattern, str):
779
- if wildcards:
780
- return any(_match(part, x, split) for x in pattern)
781
- else:
782
- return re.match('|'.join(pattern), part) is not None
783
- else:
784
- # Check given attribute matches, allowing wildcards
785
- return fnmatch(part, pattern) if wildcards else re.match(pattern, part) is not None
786
-
787
- # Iterate over ALF files
788
- for file, attr in zip(alf_files.copy(), attributes.copy()):
789
- for k, v in kwargs.items(): # Iterate over attributes
790
- match = _match(attr[k], v, '.' if k == 'extra' else None)
791
-
792
- if not match: # Remove file from list and move on to next file
793
- alf_files.remove(file)
794
- attributes.remove(attr)
795
- break
796
-
797
- return alf_files, [tuple(attr.values()) for attr in attributes]
798
-
799
-
800
- def find_variants(file_list, namespace=True, timescale=True, extra=True, extension=True):
801
- """Find variant datasets.
802
-
803
- Finds any datasets on disk that are considered a variant of the input datasets. At minimum, a
804
- dataset is uniquely defined by session path, collection, object and attribute. Therefore,
805
- datasets with the same name and collection in a different revision folder are considered a
806
- variant. If any of the keyword arguments are set to False, those parts are ignored when
807
- comparing datasets.
808
-
809
- Parameters
810
- ----------
811
- file_list : list of str, list of pathlib.Path
812
- A list of ALF paths to find variants of.
813
- namespace : bool
814
- If true, treat datasets with a different namespace as unique.
815
- timescale : bool
816
- If true, treat datasets with a different timescale as unique.
817
- extra : bool
818
- If true, treat datasets with a different extra parts as unique.
819
- extension : bool
820
- If true, treat datasets with a different extension as unique.
821
-
822
- Returns
823
- -------
824
- Dict[pathlib.Path, list of pathlib.Path]
825
- A map of input file paths to a list variant dataset paths.
826
-
827
- Raises
828
- ------
829
- ValueError
830
- One or more input file paths are not valid ALF datasets.
831
-
832
- Examples
833
- --------
834
- Find all datasets with an identical name and collection in a different revision folder
835
-
836
- >>> find_variants(['/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'])
837
- {Path('/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'): [
838
- Path('/sub/2020-10-01/001/alf/obj.attr.npy')
839
- ]}
840
-
841
- Find all datasets with different namespace or revision
842
-
843
- >>> find_variants(['/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'], namespace=False)
844
- {Path('/sub/2020-10-01/001/#2020-01-01#/obj.attr.npy'): [
845
- Path('/sub/2020-10-01/001/#2020-01-01#/_ns_obj.attr.npy'),
846
- Path('/sub/2020-10-01/001/obj.attr.npy'),
847
- ]}
848
-
849
- """
850
- # Initialize map of unique files to their duplicates
851
- duplicates = {}
852
- # Determine which parts to filter
853
- variables = locals()
854
- filters = {'namespace', 'timescale', 'extra', 'extension'}
855
- to_compare = ('lab', 'subject', 'date', 'number', 'collection', 'object', 'attribute',
856
- *(arg for arg in filters if variables[arg]))
857
-
858
- def parts_match(parts, file):
859
- """Compare a file's unique parts to a given file."""
860
- other = file.parse_alf_path()
861
- return all(parts[k] == other[k] for k in to_compare)
862
-
863
- # iterate over unique files and their parts
864
- for f in map(path.ALFPath, file_list):
865
- parts = f.parse_alf_path()
866
- # first glob for files matching object.attribute (including revisions)
867
- pattern = f'*{parts["object"]}.{parts["attribute"]}*'
868
- # this works because revision will always be last folder;
869
- # i.e. revisions can't contain collections
870
- globbed = map(f.without_revision().parent.glob, (pattern, '#*#/' + pattern))
871
- globbed = chain.from_iterable(globbed) # unite revision and non-revision globs
872
- # refine duplicates based on other parts (this also ensures we don't catch similar objects)
873
- globbed = filter(partial(parts_match, parts), globbed)
874
- # key = f.relative_to_session().as_posix()
875
- duplicates[f] = [x for x in globbed if x != f] # map file to list of its duplicates
876
- return duplicates
1
+ """I/O functions for ALyx Files.
2
+
3
+ Provides support for time-series reading and interpolation as per the specifications
4
+ For a full overview of the scope of the format, see:
5
+
6
+ https://int-brain-lab.github.io/ONE/alf_intro.html
7
+ """
8
+
9
+ import json
10
+ import copy
11
+ import logging
12
+ import re
13
+ from fnmatch import fnmatch
14
+ from pathlib import Path
15
+ from typing import Union
16
+ from functools import partial
17
+ from itertools import chain
18
+ import warnings
19
+
20
+ import numpy as np
21
+ import pandas as pd
22
+ import yaml
23
+
24
+ from iblutil.util import Bunch
25
+ from iblutil.io import parquet
26
+ from iblutil.io import jsonable
27
+ from .exceptions import ALFObjectNotFound
28
+ from . import path, spec
29
+ from .spec import FILE_SPEC
30
+
31
+ _logger = logging.getLogger(__name__)
32
+
33
+
34
+ class AlfBunch(Bunch):
35
+ """A dict-like object that supports dot indexing and conversion to DataFrame."""
36
+
37
+ @property
38
+ def check_dimensions(self):
39
+ """int: 0 for consistent dimensions, 1 for inconsistent dimensions."""
40
+ return check_dimensions(self)
41
+
42
+ def append(self, b, inplace=False):
43
+ """Appends one bunch to another, key by key.
44
+
45
+ Parameters
46
+ ----------
47
+ b : Bunch, dict
48
+ A Bunch of data to append
49
+ inplace : bool
50
+ If true, the data are appended in place, otherwise a copy is returned
51
+
52
+ Returns
53
+ -------
54
+ ALFBunch, None
55
+ An ALFBunch with the data appended, or None if inplace is True
56
+
57
+ """
58
+ # default is to return a copy
59
+ if inplace:
60
+ a = self
61
+ else:
62
+ a = AlfBunch(copy.deepcopy(self))
63
+ # handles empty bunches for convenience if looping
64
+ if b == {}:
65
+ return a
66
+ if a == {}:
67
+ return AlfBunch(b)
68
+ # right now supports only strictly matching keys. Will implement other cases as needed
69
+ if set(a.keys()) != set(b.keys()):
70
+ raise NotImplementedError('Append bunches only works with strictly matching keys'
71
+ 'For more complex merges, convert to pandas dataframe.')
72
+ # do the merge; only concatenate lists and np arrays right now
73
+ for k in a:
74
+ if isinstance(a[k], np.ndarray):
75
+ a[k] = np.concatenate((a[k], b[k]), axis=0)
76
+ elif isinstance(a[k], list):
77
+ a[k].extend(b[k])
78
+ else:
79
+ _logger.warning(f'bunch key "{k}" is a {a[k].__class__}. I don\'t know how to'
80
+ f' handle that. Use pandas for advanced features')
81
+ if a.check_dimensions != 0:
82
+ print_sizes = '\n'.join(f'{v.shape},\t{k}' for k, v in a.items())
83
+ _logger.warning(f'Inconsistent dimensions for object: \n{print_sizes}')
84
+
85
+ return a
86
+
87
+ def to_df(self) -> pd.DataFrame:
88
+ """Return DataFrame with data keys as columns."""
89
+ return dataframe(self)
90
+
91
+ @staticmethod
92
+ def from_df(df) -> 'AlfBunch':
93
+ data = dict(zip(df.columns, df.values.T))
94
+ split_keys = sorted(x for x in data.keys() if re.match(r'.+?_[01]$', x))
95
+ for x1, x2 in zip(*[iter(split_keys)] * 2):
96
+ data[x1[:-2]] = np.c_[data.pop(x1), data.pop(x2)]
97
+ return AlfBunch(data)
98
+
99
+
100
+ def dataframe(adict):
101
+ """Convert an Bunch conforming to size conventions into a pandas DataFrame.
102
+
103
+ For 2-D arrays, stops at 10 columns per attribute.
104
+
105
+ Parameters
106
+ ----------
107
+ adict : dict, Bunch
108
+ A dict-like object of data to convert to DataFrame
109
+
110
+ Returns
111
+ -------
112
+ pd.DataFrame
113
+ A pandas DataFrame of data
114
+
115
+ """
116
+ if check_dimensions(adict) != 0:
117
+ raise ValueError('Can only convert to DataFrame objects with consistent size')
118
+ # easy case where there are only vectors
119
+ if all([len(adict[k].shape) == 1 for k in adict]):
120
+ return pd.DataFrame(adict)
121
+ # pandas has trouble with 2d data, chop it off with a limit of 10 columns per dataset
122
+ df = pd.DataFrame()
123
+ for k in adict.keys():
124
+ if adict[k].ndim == 1:
125
+ df[k] = adict[k]
126
+ elif adict[k].ndim == 2 and adict[k].shape[1] == 1:
127
+ df[k] = adict[k][:, 0]
128
+ elif adict[k].ndim == 2:
129
+ for i in np.arange(adict[k].shape[1]):
130
+ df[f"{k}_{i}"] = adict[k][:, i]
131
+ if i == 9:
132
+ break
133
+ else:
134
+ _logger.warning(f'{k} attribute is 3D or more and won\'t convert to dataframe')
135
+ continue
136
+ return df
137
+
138
+
139
+ def _find_metadata(file_alf) -> path.ALFPath:
140
+ """File path for an existing meta-data file for an alf_file.
141
+
142
+ Parameters
143
+ ----------
144
+ file_alf : str, pathlib.Path
145
+ A path of existing ALF.
146
+
147
+ Returns
148
+ -------
149
+ one.alf.path.ALFPath
150
+ Path of meta-data file if exists.
151
+
152
+ """
153
+ file_alf = path.ALFPath(file_alf)
154
+ ns, obj = file_alf.name.split('.')[:2]
155
+ return next(file_alf.parent.glob(f'{ns}.{obj}*.metadata*.json'), None)
156
+
157
+
158
+ def read_ts(filename):
159
+ """Load time-series from ALF format.
160
+
161
+ Parameters
162
+ ----------
163
+ filename : str, pathlib.Path
164
+ An ALF path whose values to load
165
+
166
+ Returns
167
+ -------
168
+ numpy.ndarray
169
+ An array of timestamps belonging to the ALF path object
170
+ numpy.ndarray
171
+ An array of values in filename
172
+
173
+ Examples
174
+ --------
175
+ >>> t, d = read_ts(filename)
176
+
177
+ """
178
+ filename = path.ensure_alf_path(filename)
179
+
180
+ # alf format is object.attribute.extension, for example '_ibl_wheel.position.npy'
181
+ _, obj, attr, *_, ext = filename.dataset_name_parts
182
+
183
+ try:
184
+ # looking for matching object with attribute timestamps: '_ibl_wheel.timestamps.npy'
185
+ (time_file,), _ = filter_by(filename.parent, object=obj, attribute='times*', extension=ext)
186
+ assert time_file
187
+ except (ValueError, AssertionError):
188
+ name = spec.to_alf(obj, attr, ext)
189
+ raise FileNotFoundError(name + ' not found! No time-scale for ' + str(filename))
190
+
191
+ ts = np.load(filename.parent / time_file)
192
+ val = np.load(filename)
193
+ # Ensure timestamps
194
+ return ts2vec(ts, val.shape[0]), _ensure_flat(val)
195
+
196
+
197
+ def _ensure_flat(arr):
198
+ """Given a single column array, returns a flat vector. Other shapes are returned unchanged.
199
+
200
+ Parameters
201
+ ----------
202
+ arr : numpy.array
203
+ An array with shape (n, 1)
204
+
205
+ Returns
206
+ -------
207
+ numpy.ndarray
208
+ A vector with shape (n,)
209
+
210
+ """
211
+ return arr.flatten() if arr.ndim == 2 and arr.shape[1] == 1 else arr
212
+
213
+
214
+ def ts2vec(ts: np.ndarray, n_samples: int) -> np.ndarray:
215
+ """Interpolate a continuous timeseries of the shape (2, 2).
216
+
217
+ Parameters
218
+ ----------
219
+ ts : numpy.array
220
+ a 2x2 numpy array of the form (sample, ts)
221
+ n_samples : int
222
+ Number of samples; i.e. the size of the resulting vector
223
+
224
+ Returns
225
+ -------
226
+ numpy.ndarray
227
+ A vector of interpolated timestamps
228
+
229
+ """
230
+ if len(ts.shape) == 1:
231
+ return ts
232
+ elif ts.ndim == 2 and ts.shape[1] == 1:
233
+ return ts.flatten() # Deal with MATLAB single column array
234
+ if ts.ndim > 2 or ts.shape[1] != 2:
235
+ raise ValueError('Array shape should be (2, 2)')
236
+ # Linearly interpolate the times
237
+ x = np.arange(n_samples)
238
+ return np.interp(x, ts[:, 0], ts[:, 1])
239
+
240
+
241
+ def check_dimensions(dico):
242
+ """Test for consistency of dimensions as per ALF specs in a dictionary.
243
+
244
+ Alf broadcasting rules: only accepts consistent dimensions for a given axis
245
+ a dimension is consistent with another if it's empty, 1, or equal to the other arrays
246
+ dims [a, 1], [1, b] and [a, b] are all consistent, [c, 1] is not
247
+
248
+ Parameters
249
+ ----------
250
+ dico : ALFBunch, dict
251
+ Dictionary containing data
252
+
253
+ Returns
254
+ -------
255
+ int
256
+ Status 0 for consistent dimensions, 1 for inconsistent dimensions
257
+
258
+ """
259
+ supported = (np.ndarray, pd.DataFrame) # Data types that have a shape attribute
260
+ shapes = [dico[lab].shape for lab in dico
261
+ if isinstance(dico[lab], supported) and not lab.startswith('timestamps')]
262
+ first_shapes = [sh[0] for sh in shapes]
263
+ # Continuous timeseries are permitted to be a (2, 2)
264
+ timeseries = [k for k, v in dico.items()
265
+ if k.startswith('timestamps') and isinstance(v, np.ndarray)]
266
+ if any(timeseries):
267
+ for key in timeseries:
268
+ if dico[key].ndim == 1 or (dico[key].ndim == 2 and dico[key].shape[1] == 1):
269
+ # Should be vector with same length as other attributes
270
+ first_shapes.append(dico[key].shape[0])
271
+ elif dico[key].ndim > 1 and dico[key].shape != (2, 2):
272
+ return 1 # ts not a (2, 2) arr or a vector
273
+
274
+ ok = len(first_shapes) == 0 or set(first_shapes).issubset({max(first_shapes), 1})
275
+ return int(ok is False)
276
+
277
+
278
+ def load_file_content(fil):
279
+ """Return content of a file path.
280
+
281
+ Designed for very generic data file formats such as `json`, `npy`, `csv`, `(h)tsv`, `ssv`.
282
+
283
+ Parameters
284
+ ----------
285
+ fil : str, pathlib.Path
286
+ File to read
287
+
288
+ Returns
289
+ -------
290
+ Any
291
+ Array/json/pandas dataframe depending on format
292
+
293
+ """
294
+ if not fil:
295
+ return
296
+ fil = Path(fil)
297
+ if fil.stat().st_size == 0:
298
+ return
299
+ if fil.suffix == '.csv':
300
+ return pd.read_csv(fil).squeeze('columns')
301
+ if fil.suffix == '.json':
302
+ try:
303
+ with open(fil) as _fil:
304
+ return json.loads(_fil.read())
305
+ except Exception as e:
306
+ _logger.error(e)
307
+ return None
308
+ if fil.suffix == '.jsonable':
309
+ return jsonable.read(fil)
310
+ if fil.suffix == '.npy':
311
+ return _ensure_flat(np.load(file=fil, allow_pickle=True))
312
+ if fil.suffix == '.npz':
313
+ arr = np.load(file=fil)
314
+ # If single array with the default name ('arr_0') return individual array
315
+ return arr['arr_0'] if set(arr.files) == {'arr_0'} else arr
316
+ if fil.suffix == '.pqt':
317
+ return parquet.load(fil)[0]
318
+ if fil.suffix == '.ssv':
319
+ return pd.read_csv(fil, delimiter=' ').squeeze('columns')
320
+ if fil.suffix in ('.tsv', '.htsv'):
321
+ return pd.read_csv(fil, delimiter='\t').squeeze('columns')
322
+ if fil.suffix in ('.yml', '.yaml'):
323
+ with open(fil, 'r') as _fil:
324
+ return yaml.safe_load(_fil)
325
+ if fil.suffix == '.sparse_npz':
326
+ try:
327
+ import sparse
328
+ return sparse.load_npz(fil)
329
+ except ModuleNotFoundError:
330
+ warnings.warn(f'{Path(fil).name} requires the pydata sparse package to load.')
331
+ return path.ALFPath(fil)
332
+ return path.ALFPath(fil)
333
+
334
+
335
+ def _ls(alfpath, object=None, **kwargs) -> (list, tuple):
336
+ """Given a path, an object and a filter, returns all files and associated attributes.
337
+
338
+ Parameters
339
+ ----------
340
+ alfpath : str, pathlib.Path
341
+ The folder to list
342
+ object : str, list
343
+ An ALF object name to filter by
344
+ wildcards : bool
345
+ If true uses unix shell style pattern matching, otherwise uses regular expressions
346
+ kwargs
347
+ Other ALF parts to filter, including namespace, attribute, etc.
348
+
349
+ Returns
350
+ -------
351
+ list of one.alf.path.ALFPath
352
+ A list of ALF paths.
353
+ tuple
354
+ A tuple of ALF attributes corresponding to the file paths.
355
+
356
+ Raises
357
+ ------
358
+ ALFObjectNotFound
359
+ No matching ALF object was found in the alfpath directory
360
+
361
+ """
362
+ alfpath = path.ALFPath(alfpath)
363
+ if not alfpath.exists():
364
+ files_alf = attributes = None
365
+ elif alfpath.is_dir():
366
+ if object is None:
367
+ # List all ALF files
368
+ files_alf, attributes = filter_by(alfpath, **kwargs)
369
+ else:
370
+ files_alf, attributes = filter_by(alfpath, object=object, **kwargs)
371
+ else:
372
+ object = alfpath.object
373
+ alfpath = alfpath.parent
374
+ files_alf, attributes = filter_by(alfpath, object=object, **kwargs)
375
+
376
+ # raise error if no files found
377
+ if not files_alf:
378
+ err_str = f'object "{object}"' if object else 'ALF files'
379
+ raise ALFObjectNotFound(f'No {err_str} found in {alfpath}')
380
+
381
+ return [alfpath.joinpath(f) for f in files_alf], attributes
382
+
383
+
384
+ def iter_sessions(root_dir, pattern='*'):
385
+ """Recursively iterate over session paths in a given directory.
386
+
387
+ Parameters
388
+ ----------
389
+ root_dir : str, pathlib.Path
390
+ The folder to look for sessions.
391
+ pattern : str
392
+ Glob pattern to use. Default searches all folders. Providing a more specific pattern makes
393
+ this more performant (see examples).
394
+
395
+ Yields
396
+ ------
397
+ pathlib.Path
398
+ The next session path in lexicographical order.
399
+
400
+ Examples
401
+ --------
402
+ Efficient iteration when `root_dir` contains <lab>/Subjects folders
403
+
404
+ >>> sessions = list(iter_sessions(root_dir, pattern='*/Subjects/*/????-??-??/*'))
405
+
406
+ Efficient iteration when `root_dir` contains subject folders
407
+
408
+ >>> sessions = list(iter_sessions(root_dir, pattern='*/????-??-??/*'))
409
+
410
+ """
411
+ if spec.is_session_path(root_dir):
412
+ yield path.ALFPath(root_dir)
413
+ for p in sorted(Path(root_dir).rglob(pattern)):
414
+ if p.is_dir() and spec.is_session_path(p):
415
+ yield path.ALFPath(p)
416
+
417
+
418
+ def iter_datasets(session_path):
419
+ """Iterate over all files in a session, and yield relative dataset paths.
420
+
421
+ Parameters
422
+ ----------
423
+ session_path : str, pathlib.Path
424
+ The folder to look for datasets.
425
+
426
+ Yields
427
+ ------
428
+ one.alf.path.ALFPath
429
+ The next dataset path (relative to the session path) in lexicographical order.
430
+
431
+ """
432
+ for dataset in path.ALFPath(session_path).iter_datasets(recursive=True):
433
+ yield dataset.relative_to(session_path)
434
+
435
+
436
+ def exists(alfpath, object, attributes=None, **kwargs) -> bool:
437
+ """Test if ALF object and optionally specific attributes exist in the given path.
438
+
439
+ Parameters
440
+ ----------
441
+ alfpath : str, pathlib.Path
442
+ The folder to look into
443
+ object : str
444
+ ALF object name
445
+ attributes : str, list
446
+ Wanted attributes
447
+ wildcards : bool
448
+ If true uses unix shell style pattern matching, otherwise uses regular expressions
449
+ kwargs
450
+ Other ALF parts to filter by
451
+
452
+ Returns
453
+ -------
454
+ bool
455
+ For multiple attributes, returns True only if all attributes are found
456
+
457
+ """
458
+ # if the object is not found, return False
459
+ try:
460
+ _, attributes_found = _ls(alfpath, object, **kwargs)
461
+ except (FileNotFoundError, ALFObjectNotFound):
462
+ return False
463
+
464
+ # if object found and no attribute provided, True
465
+ if not attributes:
466
+ return True
467
+
468
+ # if attributes provided, test if all are found
469
+ if isinstance(attributes, str):
470
+ attributes = [attributes]
471
+ attributes_found = set(part[2] for part in attributes_found)
472
+ return set(attributes).issubset(attributes_found)
473
+
474
+
475
+ def load_object(alfpath, object=None, short_keys=False, **kwargs):
476
+ """Reads all files sharing the same object name.
477
+
478
+ For example, if the file provided to the function is `spikes.times`, the function will
479
+ load `spikes.times`, `spikes.clusters`, `spikes.depths`, `spike.amps` in a dictionary
480
+ whose keys will be `times`, `clusters`, `depths`, `amps`
481
+
482
+ Full Reference here: https://int-brain-lab.github.io/ONE/alf_intro.html
483
+
484
+ Simplified example: _namespace_object.attribute_timescale.part1.part2.extension
485
+
486
+ Parameters
487
+ ----------
488
+ alfpath : str, pathlib.Path, list
489
+ Any ALF path pertaining to the object OR directory containing ALFs OR list of paths.
490
+ object : str, list, None
491
+ The ALF object(s) to filter by. If a directory is provided and object is None, all valid
492
+ ALF files returned.
493
+ short_keys : bool
494
+ By default, the output dictionary keys will be compounds of attributes, timescale and
495
+ any eventual parts separated by a dot. Use True to shorten the keys to the attribute
496
+ and timescale.
497
+ wildcards : bool
498
+ If true uses unix shell style pattern matching, otherwise uses regular expressions.
499
+ kwargs
500
+ Other ALF parts to filter by.
501
+
502
+ Returns
503
+ -------
504
+ AlfBunch
505
+ An ALFBunch (dict-like) of all attributes pertaining to the object.
506
+
507
+ Examples
508
+ --------
509
+ Load 'spikes' object
510
+
511
+ >>> spikes = load_object('full/path/to/my/alffolder/', 'spikes')
512
+
513
+ Load 'trials' object under the 'ibl' namespace
514
+
515
+ >>> trials = load_object('/subject/2021-01-01/001', 'trials', namespace='ibl')
516
+
517
+ """
518
+ if isinstance(alfpath, (Path, str)):
519
+ if Path(alfpath).is_dir() and object is None:
520
+ raise ValueError('If a directory is provided, the object name should be provided too')
521
+ files_alf, parts = _ls(alfpath, object, **kwargs)
522
+ else: # A list of paths allows us to load an object from different revisions
523
+ files_alf = list(map(path.ALFPath, alfpath))
524
+ parts = [x.dataset_name_parts for x in files_alf]
525
+ assert len(set(p[1] for p in parts)) == 1
526
+ object = next(x[1] for x in parts)
527
+ # Take attribute and timescale from parts list
528
+ attributes = [p[2] if not p[3] else '_'.join(p[2:4]) for p in parts]
529
+ if not short_keys: # Include extra parts in the keys
530
+ attributes = ['.'.join(filter(None, (attr, p[4]))) for attr, p in zip(attributes, parts)]
531
+ # TODO List duplicates; raise ALFError
532
+ assert len(set(attributes)) == len(attributes), (
533
+ f'multiple object {object} with the same attribute in {alfpath}, restrict parts/namespace')
534
+ out = AlfBunch({})
535
+
536
+ # load content for each file
537
+ for fil, att in zip(files_alf, attributes):
538
+ # if there is a corresponding metadata file, read it:
539
+ meta_data_file = _find_metadata(fil)
540
+ # if this is the actual meta-data file, skip and it will be read later
541
+ if meta_data_file == fil:
542
+ continue
543
+ out[att] = load_file_content(fil)
544
+ if meta_data_file:
545
+ meta = load_file_content(meta_data_file)
546
+ # the columns keyword splits array along the last dimension
547
+ if 'columns' in meta.keys():
548
+ out.update({v: out[att][::, k] for k, v in enumerate(meta['columns'])})
549
+ out.pop(att)
550
+ meta.pop('columns')
551
+ # if there is other stuff in the dictionary, save it, otherwise disregard
552
+ if meta:
553
+ out[att + 'metadata'] = meta
554
+ # Merge 'table' dataframe into bunch
555
+ table_key = next(filter(re.compile(r'^table([_.]|$)').match, out), None) # py 3.8
556
+ if table_key:
557
+ table = out.pop(table_key)
558
+
559
+ def rename_columns(field):
560
+ """"Rename DataFrame fields to include timescale or extra ALF parts from table_key.
561
+
562
+ For example...
563
+ with table_key = table_clock, field1 -> field1_clock;
564
+ with table_key = table_clock.extra, field1_0 -> field1_clock.extra_0;
565
+ with table_key = table, field1 -> field1
566
+ """
567
+ return (field[:-2] + table_key[5:] + field[-2:]
568
+ if re.match(r'.+?_[01]$', field)
569
+ else field + table_key[5:])
570
+ table.rename(columns=rename_columns, inplace=True)
571
+ out.update(AlfBunch.from_df(table))
572
+ status = out.check_dimensions
573
+ timeseries = [k for k in out.keys() if 'timestamps' in k]
574
+ if any(timeseries) and len(out.keys()) > len(timeseries) and status == 0:
575
+ # Get length of one of the other arrays
576
+ ignore = ('timestamps', 'meta')
577
+ n_samples = next(v for k, v in out.items() if not any(x in k for x in ignore)).shape[0]
578
+ for key in timeseries:
579
+ # Expand timeseries if necessary
580
+ out[key] = ts2vec(out[key], n_samples)
581
+ if status != 0:
582
+ supported = (np.ndarray, pd.DataFrame)
583
+ print_sizes = '\n'.join(
584
+ f'{v.shape},\t{k}' for k, v in out.items() if isinstance(v, supported)
585
+ )
586
+ _logger.warning(f'Inconsistent dimensions for object: {object} \n{print_sizes}')
587
+ return out
588
+
589
+
590
+ def save_object_npy(alfpath, dico, object, parts=None, namespace=None, timescale=None) -> list:
591
+ """Save dictionary in `ALF format`_ using dictionary keys as attribute names.
592
+
593
+ Dimensions have to be consistent.
594
+
595
+ Simplified ALF example: _namespace_object.attribute.part1.part2.extension.
596
+
597
+ Parameters
598
+ ----------
599
+ alfpath : str, pathlib.Path
600
+ Path of the folder to save data to.
601
+ dico : dict
602
+ Dictionary to save to npy; keys correspond to ALF attributes.
603
+ object : str
604
+ Name of the object to save.
605
+ parts : str, list, None
606
+ Extra parts to the ALF name.
607
+ namespace : str, None
608
+ The optional namespace of the object.
609
+ timescale : str, None
610
+ The optional timescale of the object.
611
+
612
+ Returns
613
+ -------
614
+ list of one.alf.path.ALFPath
615
+ List of written files.
616
+
617
+ Examples
618
+ --------
619
+ >>> spikes = {'times': np.arange(50), 'depths': np.random.random(50)}
620
+ >>> files = save_object_npy('/path/to/my/alffolder/', spikes, 'spikes')
621
+
622
+ .. _ALF format:
623
+ https://int-brain-lab.github.io/ONE/alf_intro.html
624
+
625
+ """
626
+ alfpath = path.ALFPath(alfpath)
627
+ status = check_dimensions(dico)
628
+ if status != 0:
629
+ raise ValueError('Dimensions are not consistent to save all arrays in ALF format: ' +
630
+ str([(k, v.shape) for k, v in dico.items()]))
631
+ out_files = []
632
+ for k, v in dico.items():
633
+ out_file = alfpath / spec.to_alf(object, k, 'npy',
634
+ extra=parts, namespace=namespace, timescale=timescale)
635
+ np.save(out_file, v)
636
+ out_files.append(out_file)
637
+ return out_files
638
+
639
+
640
+ def save_metadata(file_alf, dico) -> path.ALFPath:
641
+ """Writes a meta data file matching a current ALF file object.
642
+
643
+ For example given an alf file `clusters.ccfLocation.ssv` this will write a dictionary in JSON
644
+ format in `clusters.ccfLocation.metadata.json`
645
+
646
+ Reserved keywords:
647
+ - **columns**: column names for binary tables.
648
+ - **row**: row names for binary tables.
649
+ - **unit**
650
+
651
+ Parameters
652
+ ----------
653
+ file_alf : str, pathlib.Path
654
+ Full path to the alf object
655
+ dico : dict, ALFBunch
656
+ Dictionary containing meta-data
657
+
658
+ Returns
659
+ -------
660
+ one.alf.path.ALFPath
661
+ The saved metadata file path.
662
+
663
+ """
664
+ file_alf = path.ALFPath(file_alf)
665
+ assert file_alf.is_dataset, 'ALF filename not valid'
666
+ file_meta_data = file_alf.parent / (file_alf.stem + '.metadata.json')
667
+ with open(file_meta_data, 'w+') as fid:
668
+ fid.write(json.dumps(dico, indent=1))
669
+ return file_meta_data
670
+
671
+
672
+ def next_num_folder(session_date_folder: Union[str, Path]) -> str:
673
+ """Return the next number for a session given a session_date_folder."""
674
+ session_date_folder = Path(session_date_folder)
675
+ if not session_date_folder.exists():
676
+ return '001'
677
+ session_nums = [
678
+ int(x.name) for x in session_date_folder.iterdir()
679
+ if x.is_dir() and not x.name.startswith('.') and x.name.isdigit()
680
+ ]
681
+ out = f'{max(session_nums or [0]) + 1:03d}'
682
+ assert len(out) == 3, 'ALF spec does not support session numbers > 999'
683
+ return out
684
+
685
+
686
+ def remove_empty_folders(folder: Union[str, Path]) -> None:
687
+ """Iteratively remove any empty child folders."""
688
+ all_folders = sorted(x for x in Path(folder).rglob('*') if x.is_dir())
689
+ for f in reversed(all_folders): # Reversed sorted ensures we remove deepest first
690
+ try:
691
+ f.rmdir()
692
+ except Exception:
693
+ continue
694
+
695
+
696
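# A sketch of remove_empty_folders (illustrative, not part of the diff above); the
# session path is hypothetical. Folders are removed deepest-first and any that are
# not empty are silently skipped.
from one.alf.io import remove_empty_folders

remove_empty_folders('/data/KS042/2024-01-01/001')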
+ def filter_by(alf_path, wildcards=True, **kwargs):
697
+ """Given a path and optional filters, returns all ALF files and their associated parts.
698
+
699
+ The filters constitute a logical AND. For all but `extra`, if a list is provided, one or more
700
+ elements must match (a logical OR).
701
+
702
+ Parameters
703
+ ----------
704
+ alf_path : str, pathlib.Path
705
+ A path to a folder containing ALF datasets.
706
+ wildcards : bool
707
+ If true, kwargs are matched as unix-style patterns, otherwise as regular expressions.
708
+ object : str, list
709
+ Filter by a given object (e.g. 'spikes').
710
+ attribute : str, list
711
+ Filter by a given attribute (e.g. 'intervals').
712
+ extension : str, list
713
+ Filter by extension (e.g. 'npy').
714
+ namespace : str, list
715
+ Filter by a given namespace (e.g. 'ibl') or None for files without one.
716
+ timescale : str, list
717
+ Filter by a given timescale (e.g. 'bpod') or None for files without one.
718
+ extra : str, list
719
+ Filter by extra parameters (e.g. 'raw') or None for files without extra parts.
720
+ NB: Wildcards are not permitted here.
721
+
722
+ Returns
723
+ -------
724
+ alf_files : list of one.alf.path.ALFPath
725
+ A list of matching ALF dataset paths, relative to `alf_path`.
726
+ attributes : list of tuple
727
+ The parsed file parts for each returned file.
728
+
729
+ Examples
730
+ --------
731
+ Filter files with universal timescale
732
+
733
+ >>> filter_by(alf_path, timescale=None)
734
+
735
+ Filter files by a given ALF object
736
+
737
+ >>> filter_by(alf_path, object='wheel')
738
+
739
+ Filter using a wildcard to match both 'wheel' and 'wheelMoves' ALF objects
740
+
741
+ >>> filter_by(alf_path, object='wh*')
742
+
743
+ Filter all intervals that are in bpod time
744
+
745
+ >>> filter_by(alf_path, attribute='intervals', timescale='bpod')
746
+
747
+ Filter all files containing either 'intervals' OR 'timestamps' attributes
748
+
749
+ >>> filter_by(alf_path, attribute=['intervals', 'timestamps'])
750
+
751
+ Filter all files using a regular expression
752
+
753
+ >>> filter_by(alf_path, object='^wheel.*', wildcards=False)
754
+ >>> filter_by(alf_path, object=['^wheel$', '.*Moves'], wildcards=False)
755
+
756
+ """
757
+ alf_files = [f.relative_to(alf_path) for f in path.ALFPath(alf_path).iter_datasets()]
758
+ attributes = list(map(path.ALFPath.parse_alf_name, alf_files))
759
+
760
+ if kwargs:
761
+ # Validate keyword arguments against regex group names
762
+ invalid = kwargs.keys() - spec.regex(FILE_SPEC).groupindex.keys()
763
+ if invalid:
764
+ raise TypeError('%s() got an unexpected keyword argument "%s"'
765
+ % (__name__, set(invalid).pop()))
766
+
767
+ # Ensure the 'extra' input is a list; if it is a str, split it on the dot
768
+ if 'extra' in kwargs and isinstance(kwargs['extra'], str):
769
+ kwargs['extra'] = kwargs['extra'].split('.')
770
+
771
+ def _match(part, pattern, split=None):
772
+ if pattern is None or part is None:
773
+ # If either is None, both should be None to match
774
+ return pattern is part
775
+ elif split:
776
+ # Check all provided extra fields match those in ALF
777
+ return all(elem in part.split(split) for elem in pattern if elem)
778
+ elif not isinstance(pattern, str):
779
+ if wildcards:
780
+ return any(_match(part, x, split) for x in pattern)
781
+ else:
782
+ return re.match('|'.join(pattern), part) is not None
783
+ else:
784
+ # Check given attribute matches, allowing wildcards
785
+ return fnmatch(part, pattern) if wildcards else re.match(pattern, part) is not None
786
+
787
+ # Iterate over ALF files
788
+ for file, attr in zip(alf_files.copy(), attributes.copy()):
789
+ for k, v in kwargs.items(): # Iterate over attributes
790
+ match = _match(attr[k], v, '.' if k == 'extra' else None)
791
+
792
+ if not match: # Remove file from list and move on to next file
793
+ alf_files.remove(file)
794
+ attributes.remove(attr)
795
+ break
796
+
797
+ return alf_files, [tuple(attr.values()) for attr in attributes]
798
+
799
+
800
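# A sketch of combining filter_by keyword arguments (illustrative, not part of the
# diff above); the collection path is hypothetical. List elements are ORed while
# separate keyword arguments are ANDed; namespace=None keeps only namespace-less files.
from one.alf.io import filter_by

files, parts = filter_by('/data/KS042/2024-01-01/001/alf',
                         object=['spikes', 'clusters'], extension='npy', namespace=None)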
+ def find_variants(file_list, namespace=True, timescale=True, extra=True, extension=True):
801
+ """Find variant datasets.
802
+
803
+ Finds any datasets on disk that are considered a variant of the input datasets. At minimum, a
804
+ dataset is uniquely defined by session path, collection, object and attribute. Therefore,
805
+ datasets with the same name and collection in a different revision folder are considered a
806
+ variant. If any of the keyword arguments are set to False, those parts are ignored when
807
+ comparing datasets.
808
+
809
+ Parameters
810
+ ----------
811
+ file_list : list of str, list of pathlib.Path
812
+ A list of ALF paths to find variants of.
813
+ namespace : bool
814
+ If true, treat datasets with a different namespace as unique.
815
+ timescale : bool
816
+ If true, treat datasets with a different timescale as unique.
817
+ extra : bool
818
+ If true, treat datasets with different extra parts as unique.
819
+ extension : bool
820
+ If true, treat datasets with a different extension as unique.
821
+
822
+ Returns
823
+ -------
824
+ Dict[pathlib.Path, list of pathlib.Path]
825
+ A map of input file paths to a list of variant dataset paths.
826
+
827
+ Raises
828
+ ------
829
+ ValueError
830
+ One or more input file paths are not valid ALF datasets.
831
+
832
+ Examples
833
+ --------
834
+ Find all datasets with an identical name and collection in a different revision folder
835
+
836
+ >>> find_variants(['/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'])
837
+ {Path('/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'): [
838
+ Path('/sub/2020-10-01/001/alf/obj.attr.npy')
839
+ ]}
840
+
841
+ Find all datasets with different namespace or revision
842
+
843
+ >>> find_variants(['/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'], namespace=False)
844
+ {Path('/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'): [
845
+ Path('/sub/2020-10-01/001/alf/#2020-01-01#/_ns_obj.attr.npy'),
846
+ Path('/sub/2020-10-01/001/alf/obj.attr.npy'),
847
+ ]}
848
+
849
+ """
850
+ # Initialize map of unique files to their duplicates
851
+ duplicates = {}
852
+ # Determine which parts to filter
853
+ variables = locals()
854
+ filters = {'namespace', 'timescale', 'extra', 'extension'}
855
+ to_compare = ('lab', 'subject', 'date', 'number', 'collection', 'object', 'attribute',
856
+ *(arg for arg in filters if variables[arg]))
857
+
858
+ def parts_match(parts, file):
859
+ """Compare a file's unique parts to a given file."""
860
+ other = file.parse_alf_path()
861
+ return all(parts[k] == other[k] for k in to_compare)
862
+
863
+ # iterate over unique files and their parts
864
+ for f in map(path.ALFPath, file_list):
865
+ parts = f.parse_alf_path()
866
+ # first glob for files matching object.attribute (including revisions)
867
+ pattern = f'*{parts["object"]}.{parts["attribute"]}*'
868
+ # this works because revision will always be last folder;
869
+ # i.e. revisions can't contain collections
870
+ globbed = map(f.without_revision().parent.glob, (pattern, '#*#/' + pattern))
871
+ globbed = chain.from_iterable(globbed) # unite revision and non-revision globs
872
+ # refine duplicates based on other parts (this also ensures we don't catch similar objects)
873
+ globbed = filter(partial(parts_match, parts), globbed)
874
+ # key = f.relative_to_session().as_posix()
875
+ duplicates[f] = [x for x in globbed if x != f] # map file to list of its duplicates
876
+ return duplicates
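# A sketch of using find_variants to check for pre-existing copies of a dataset
# (illustrative, not part of the diff above); the session path is hypothetical.
from one.alf.io import find_variants

variants = find_variants(['/sub/2024-01-01/001/alf/spikes.times.npy'], extension=False)
for original, others in variants.items():
    if others:
        print(f'{original} has {len(others)} variant(s) on disk')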