ONE-api 3.0b1-py3-none-any.whl → 3.0b4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ONE_api-3.0b1.dist-info → ONE_api-3.0b4.dist-info}/LICENSE +21 -21
- {ONE_api-3.0b1.dist-info → ONE_api-3.0b4.dist-info}/METADATA +115 -115
- ONE_api-3.0b4.dist-info/RECORD +37 -0
- one/__init__.py +2 -2
- one/alf/__init__.py +1 -1
- one/alf/cache.py +640 -653
- one/alf/exceptions.py +105 -105
- one/alf/io.py +876 -876
- one/alf/path.py +1450 -1450
- one/alf/spec.py +519 -504
- one/api.py +2949 -2973
- one/converters.py +850 -850
- one/params.py +414 -414
- one/registration.py +845 -845
- one/remote/__init__.py +1 -1
- one/remote/aws.py +313 -313
- one/remote/base.py +142 -142
- one/remote/globus.py +1254 -1254
- one/tests/fixtures/params/.caches +6 -6
- one/tests/fixtures/params/.test.alyx.internationalbrainlab.org +8 -8
- one/tests/fixtures/rest_responses/1f187d80fd59677b395fcdb18e68e4401bfa1cc9 +1 -1
- one/tests/fixtures/rest_responses/47893cf67c985e6361cdee009334963f49fb0746 +1 -1
- one/tests/fixtures/rest_responses/535d0e9a1e2c1efbdeba0d673b131e00361a2edb +1 -1
- one/tests/fixtures/rest_responses/6dc96f7e9bcc6ac2e7581489b9580a6cd3f28293 +1 -1
- one/tests/fixtures/rest_responses/db1731fb8df0208944ae85f76718430813a8bf50 +1 -1
- one/tests/fixtures/rest_responses/dcce48259bb929661f60a02a48563f70aa6185b3 +1 -1
- one/tests/fixtures/rest_responses/f530d6022f61cdc9e38cc66beb3cb71f3003c9a1 +1 -1
- one/tests/fixtures/test_dbs.json +14 -14
- one/util.py +524 -524
- one/webclient.py +1366 -1354
- ONE_api-3.0b1.dist-info/RECORD +0 -37
- {ONE_api-3.0b1.dist-info → ONE_api-3.0b4.dist-info}/WHEEL +0 -0
- {ONE_api-3.0b1.dist-info → ONE_api-3.0b4.dist-info}/top_level.txt +0 -0
one/alf/io.py
CHANGED
@@ -1,876 +1,876 @@
(This hunk rewrites all 876 lines of one/alf/io.py; the removed and added lines rendered by the diff viewer are textually identical. The module provides I/O functions for ALyx Files: the AlfBunch class with DataFrame conversion (dataframe, to_df, from_df), time-series reading and interpolation (read_ts, ts2vec, _ensure_flat), dimension checking (check_dimensions), generic file loading (load_file_content), dataset listing and filtering (_ls, filter_by), metadata discovery and saving (_find_metadata, save_metadata), session and dataset iteration (iter_sessions, iter_datasets), object existence checks, loading and saving (exists, load_object, save_object_npy), folder helpers (next_num_folder, remove_empty_folders), and variant dataset discovery (find_variants).)
1
|
+
"""I/O functions for ALyx Files.
|
|
2
|
+
|
|
3
|
+
Provides support for time-series reading and interpolation as per the specifications
|
|
4
|
+
For a full overview of the scope of the format, see:
|
|
5
|
+
|
|
6
|
+
https://int-brain-lab.github.io/ONE/alf_intro.html
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import copy
|
|
11
|
+
import logging
|
|
12
|
+
import re
|
|
13
|
+
from fnmatch import fnmatch
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Union
|
|
16
|
+
from functools import partial
|
|
17
|
+
from itertools import chain
|
|
18
|
+
import warnings
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
import pandas as pd
|
|
22
|
+
import yaml
|
|
23
|
+
|
|
24
|
+
from iblutil.util import Bunch
|
|
25
|
+
from iblutil.io import parquet
|
|
26
|
+
from iblutil.io import jsonable
|
|
27
|
+
from .exceptions import ALFObjectNotFound
|
|
28
|
+
from . import path, spec
|
|
29
|
+
from .spec import FILE_SPEC
|
|
30
|
+
|
|
31
|
+
_logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class AlfBunch(Bunch):
|
|
35
|
+
"""A dict-like object that supports dot indexing and conversion to DataFrame."""
|
|
36
|
+
|
|
37
|
+
@property
|
|
38
|
+
def check_dimensions(self):
|
|
39
|
+
"""int: 0 for consistent dimensions, 1 for inconsistent dimensions."""
|
|
40
|
+
return check_dimensions(self)
|
|
41
|
+
|
|
42
|
+
def append(self, b, inplace=False):
|
|
43
|
+
"""Appends one bunch to another, key by key.
|
|
44
|
+
|
|
45
|
+
Parameters
|
|
46
|
+
----------
|
|
47
|
+
b : Bunch, dict
|
|
48
|
+
A Bunch of data to append
|
|
49
|
+
inplace : bool
|
|
50
|
+
If true, the data are appended in place, otherwise a copy is returned
|
|
51
|
+
|
|
52
|
+
Returns
|
|
53
|
+
-------
|
|
54
|
+
ALFBunch, None
|
|
55
|
+
An ALFBunch with the data appended, or None if inplace is True
|
|
56
|
+
|
|
57
|
+
"""
|
|
58
|
+
# default is to return a copy
|
|
59
|
+
if inplace:
|
|
60
|
+
a = self
|
|
61
|
+
else:
|
|
62
|
+
a = AlfBunch(copy.deepcopy(self))
|
|
63
|
+
# handles empty bunches for convenience if looping
|
|
64
|
+
if b == {}:
|
|
65
|
+
return a
|
|
66
|
+
if a == {}:
|
|
67
|
+
return AlfBunch(b)
|
|
68
|
+
# right now supports only strictly matching keys. Will implement other cases as needed
|
|
69
|
+
if set(a.keys()) != set(b.keys()):
|
|
70
|
+
raise NotImplementedError('Append bunches only works with strictly matching keys'
|
|
71
|
+
'For more complex merges, convert to pandas dataframe.')
|
|
72
|
+
# do the merge; only concatenate lists and np arrays right now
|
|
73
|
+
for k in a:
|
|
74
|
+
if isinstance(a[k], np.ndarray):
|
|
75
|
+
a[k] = np.concatenate((a[k], b[k]), axis=0)
|
|
76
|
+
elif isinstance(a[k], list):
|
|
77
|
+
a[k].extend(b[k])
|
|
78
|
+
else:
|
|
79
|
+
_logger.warning(f'bunch key "{k}" is a {a[k].__class__}. I don\'t know how to'
|
|
80
|
+
f' handle that. Use pandas for advanced features')
|
|
81
|
+
if a.check_dimensions != 0:
|
|
82
|
+
print_sizes = '\n'.join(f'{v.shape},\t{k}' for k, v in a.items())
|
|
83
|
+
_logger.warning(f'Inconsistent dimensions for object: \n{print_sizes}')
|
|
84
|
+
|
|
85
|
+
return a
|
|
86
|
+
|
|
87
|
+
def to_df(self) -> pd.DataFrame:
|
|
88
|
+
"""Return DataFrame with data keys as columns."""
|
|
89
|
+
return dataframe(self)
|
|
90
|
+
|
|
91
|
+
@staticmethod
|
|
92
|
+
def from_df(df) -> 'AlfBunch':
|
|
93
|
+
data = dict(zip(df.columns, df.values.T))
|
|
94
|
+
split_keys = sorted(x for x in data.keys() if re.match(r'.+?_[01]$', x))
|
|
95
|
+
for x1, x2 in zip(*[iter(split_keys)] * 2):
|
|
96
|
+
data[x1[:-2]] = np.c_[data.pop(x1), data.pop(x2)]
|
|
97
|
+
return AlfBunch(data)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def dataframe(adict):
|
|
101
|
+
"""Convert an Bunch conforming to size conventions into a pandas DataFrame.
|
|
102
|
+
|
|
103
|
+
For 2-D arrays, stops at 10 columns per attribute.
|
|
104
|
+
|
|
105
|
+
Parameters
|
|
106
|
+
----------
|
|
107
|
+
adict : dict, Bunch
|
|
108
|
+
A dict-like object of data to convert to DataFrame
|
|
109
|
+
|
|
110
|
+
Returns
|
|
111
|
+
-------
|
|
112
|
+
pd.DataFrame
|
|
113
|
+
A pandas DataFrame of data
|
|
114
|
+
|
|
115
|
+
"""
|
|
116
|
+
if check_dimensions(adict) != 0:
|
|
117
|
+
raise ValueError('Can only convert to DataFrame objects with consistent size')
|
|
118
|
+
# easy case where there are only vectors
|
|
119
|
+
if all([len(adict[k].shape) == 1 for k in adict]):
|
|
120
|
+
return pd.DataFrame(adict)
|
|
121
|
+
# pandas has trouble with 2d data, chop it off with a limit of 10 columns per dataset
|
|
122
|
+
df = pd.DataFrame()
|
|
123
|
+
for k in adict.keys():
|
|
124
|
+
if adict[k].ndim == 1:
|
|
125
|
+
df[k] = adict[k]
|
|
126
|
+
elif adict[k].ndim == 2 and adict[k].shape[1] == 1:
|
|
127
|
+
df[k] = adict[k][:, 0]
|
|
128
|
+
elif adict[k].ndim == 2:
|
|
129
|
+
for i in np.arange(adict[k].shape[1]):
|
|
130
|
+
df[f"{k}_{i}"] = adict[k][:, i]
|
|
131
|
+
if i == 9:
|
|
132
|
+
break
|
|
133
|
+
else:
|
|
134
|
+
_logger.warning(f'{k} attribute is 3D or more and won\'t convert to dataframe')
|
|
135
|
+
continue
|
|
136
|
+
return df
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _find_metadata(file_alf) -> path.ALFPath:
|
|
140
|
+
"""File path for an existing meta-data file for an alf_file.
|
|
141
|
+
|
|
142
|
+
Parameters
|
|
143
|
+
----------
|
|
144
|
+
file_alf : str, pathlib.Path
|
|
145
|
+
A path of existing ALF.
|
|
146
|
+
|
|
147
|
+
Returns
|
|
148
|
+
-------
|
|
149
|
+
one.alf.path.ALFPath
|
|
150
|
+
Path of meta-data file if exists.
|
|
151
|
+
|
|
152
|
+
"""
|
|
153
|
+
file_alf = path.ALFPath(file_alf)
|
|
154
|
+
ns, obj = file_alf.name.split('.')[:2]
|
|
155
|
+
return next(file_alf.parent.glob(f'{ns}.{obj}*.metadata*.json'), None)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def read_ts(filename):
|
|
159
|
+
"""Load time-series from ALF format.
|
|
160
|
+
|
|
161
|
+
Parameters
|
|
162
|
+
----------
|
|
163
|
+
filename : str, pathlib.Path
|
|
164
|
+
An ALF path whose values to load
|
|
165
|
+
|
|
166
|
+
Returns
|
|
167
|
+
-------
|
|
168
|
+
numpy.ndarray
|
|
169
|
+
An array of timestamps belonging to the ALF path object
|
|
170
|
+
numpy.ndarray
|
|
171
|
+
An array of values in filename
|
|
172
|
+
|
|
173
|
+
Examples
|
|
174
|
+
--------
|
|
175
|
+
>>> t, d = read_ts(filename)
|
|
176
|
+
|
|
177
|
+
"""
|
|
178
|
+
filename = path.ensure_alf_path(filename)
|
|
179
|
+
|
|
180
|
+
# alf format is object.attribute.extension, for example '_ibl_wheel.position.npy'
|
|
181
|
+
_, obj, attr, *_, ext = filename.dataset_name_parts
|
|
182
|
+
|
|
183
|
+
try:
|
|
184
|
+
# looking for matching object with attribute timestamps: '_ibl_wheel.timestamps.npy'
|
|
185
|
+
(time_file,), _ = filter_by(filename.parent, object=obj, attribute='times*', extension=ext)
|
|
186
|
+
assert time_file
|
|
187
|
+
except (ValueError, AssertionError):
|
|
188
|
+
name = spec.to_alf(obj, attr, ext)
|
|
189
|
+
raise FileNotFoundError(name + ' not found! No time-scale for ' + str(filename))
|
|
190
|
+
|
|
191
|
+
ts = np.load(filename.parent / time_file)
|
|
192
|
+
val = np.load(filename)
|
|
193
|
+
# Ensure timestamps
|
|
194
|
+
return ts2vec(ts, val.shape[0]), _ensure_flat(val)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _ensure_flat(arr):
|
|
198
|
+
"""Given a single column array, returns a flat vector. Other shapes are returned unchanged.
|
|
199
|
+
|
|
200
|
+
Parameters
|
|
201
|
+
----------
|
|
202
|
+
arr : numpy.array
|
|
203
|
+
An array with shape (n, 1)
|
|
204
|
+
|
|
205
|
+
Returns
|
|
206
|
+
-------
|
|
207
|
+
numpy.ndarray
|
|
208
|
+
A vector with shape (n,)
|
|
209
|
+
|
|
210
|
+
"""
|
|
211
|
+
return arr.flatten() if arr.ndim == 2 and arr.shape[1] == 1 else arr
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def ts2vec(ts: np.ndarray, n_samples: int) -> np.ndarray:
|
|
215
|
+
"""Interpolate a continuous timeseries of the shape (2, 2).
|
|
216
|
+
|
|
217
|
+
Parameters
|
|
218
|
+
----------
|
|
219
|
+
ts : numpy.array
|
|
220
|
+
a 2x2 numpy array of the form (sample, ts)
|
|
221
|
+
n_samples : int
|
|
222
|
+
Number of samples; i.e. the size of the resulting vector
|
|
223
|
+
|
|
224
|
+
Returns
|
|
225
|
+
-------
|
|
226
|
+
numpy.ndarray
|
|
227
|
+
A vector of interpolated timestamps
|
|
228
|
+
|
|
229
|
+
"""
|
|
230
|
+
if len(ts.shape) == 1:
|
|
231
|
+
return ts
|
|
232
|
+
elif ts.ndim == 2 and ts.shape[1] == 1:
|
|
233
|
+
return ts.flatten() # Deal with MATLAB single column array
|
|
234
|
+
if ts.ndim > 2 or ts.shape[1] != 2:
|
|
235
|
+
raise ValueError('Array shape should be (2, 2)')
|
|
236
|
+
# Linearly interpolate the times
|
|
237
|
+
x = np.arange(n_samples)
|
|
238
|
+
return np.interp(x, ts[:, 0], ts[:, 1])
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def check_dimensions(dico):
|
|
242
|
+
"""Test for consistency of dimensions as per ALF specs in a dictionary.
|
|
243
|
+
|
|
244
|
+
Alf broadcasting rules: only accepts consistent dimensions for a given axis
|
|
245
|
+
a dimension is consistent with another if it's empty, 1, or equal to the other arrays
|
|
246
|
+
dims [a, 1], [1, b] and [a, b] are all consistent, [c, 1] is not
|
|
247
|
+
|
|
248
|
+
Parameters
|
|
249
|
+
----------
|
|
250
|
+
dico : ALFBunch, dict
|
|
251
|
+
Dictionary containing data
|
|
252
|
+
|
|
253
|
+
Returns
|
|
254
|
+
-------
|
|
255
|
+
int
|
|
256
|
+
Status 0 for consistent dimensions, 1 for inconsistent dimensions
|
|
257
|
+
|
|
258
|
+
"""
|
|
259
|
+
supported = (np.ndarray, pd.DataFrame) # Data types that have a shape attribute
|
|
260
|
+
shapes = [dico[lab].shape for lab in dico
|
|
261
|
+
if isinstance(dico[lab], supported) and not lab.startswith('timestamps')]
|
|
262
|
+
first_shapes = [sh[0] for sh in shapes]
|
|
263
|
+
# Continuous timeseries are permitted to be a (2, 2)
|
|
264
|
+
timeseries = [k for k, v in dico.items()
|
|
265
|
+
if k.startswith('timestamps') and isinstance(v, np.ndarray)]
|
|
266
|
+
if any(timeseries):
|
|
267
|
+
for key in timeseries:
|
|
268
|
+
if dico[key].ndim == 1 or (dico[key].ndim == 2 and dico[key].shape[1] == 1):
|
|
269
|
+
# Should be vector with same length as other attributes
|
|
270
|
+
first_shapes.append(dico[key].shape[0])
|
|
271
|
+
elif dico[key].ndim > 1 and dico[key].shape != (2, 2):
|
|
272
|
+
return 1 # ts not a (2, 2) arr or a vector
|
|
273
|
+
|
|
274
|
+
ok = len(first_shapes) == 0 or set(first_shapes).issubset({max(first_shapes), 1})
|
|
275
|
+
return int(ok is False)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def load_file_content(fil):
|
|
279
|
+
"""Return content of a file path.
|
|
280
|
+
|
|
281
|
+
Designed for very generic data file formats such as `json`, `npy`, `csv`, `(h)tsv`, `ssv`.
|
|
282
|
+
|
|
283
|
+
Parameters
|
|
284
|
+
----------
|
|
285
|
+
fil : str, pathlib.Path
|
|
286
|
+
File to read
|
|
287
|
+
|
|
288
|
+
Returns
|
|
289
|
+
-------
|
|
290
|
+
Any
|
|
291
|
+
Array/json/pandas dataframe depending on format
|
|
292
|
+
|
|
293
|
+
"""
|
|
294
|
+
if not fil:
|
|
295
|
+
return
|
|
296
|
+
fil = Path(fil)
|
|
297
|
+
if fil.stat().st_size == 0:
|
|
298
|
+
return
|
|
299
|
+
if fil.suffix == '.csv':
|
|
300
|
+
return pd.read_csv(fil).squeeze('columns')
|
|
301
|
+
if fil.suffix == '.json':
|
|
302
|
+
try:
|
|
303
|
+
with open(fil) as _fil:
|
|
304
|
+
return json.loads(_fil.read())
|
|
305
|
+
except Exception as e:
|
|
306
|
+
_logger.error(e)
|
|
307
|
+
return None
|
|
308
|
+
if fil.suffix == '.jsonable':
|
|
309
|
+
return jsonable.read(fil)
|
|
310
|
+
if fil.suffix == '.npy':
|
|
311
|
+
return _ensure_flat(np.load(file=fil, allow_pickle=True))
|
|
312
|
+
if fil.suffix == '.npz':
|
|
313
|
+
arr = np.load(file=fil)
|
|
314
|
+
# If single array with the default name ('arr_0') return individual array
|
|
315
|
+
return arr['arr_0'] if set(arr.files) == {'arr_0'} else arr
|
|
316
|
+
if fil.suffix == '.pqt':
|
|
317
|
+
return parquet.load(fil)[0]
|
|
318
|
+
if fil.suffix == '.ssv':
|
|
319
|
+
return pd.read_csv(fil, delimiter=' ').squeeze('columns')
|
|
320
|
+
if fil.suffix in ('.tsv', '.htsv'):
|
|
321
|
+
return pd.read_csv(fil, delimiter='\t').squeeze('columns')
|
|
322
|
+
if fil.suffix in ('.yml', '.yaml'):
|
|
323
|
+
with open(fil, 'r') as _fil:
|
|
324
|
+
return yaml.safe_load(_fil)
|
|
325
|
+
if fil.suffix == '.sparse_npz':
|
|
326
|
+
try:
|
|
327
|
+
import sparse
|
|
328
|
+
return sparse.load_npz(fil)
|
|
329
|
+
except ModuleNotFoundError:
|
|
330
|
+
warnings.warn(f'{Path(fil).name} requires the pydata sparse package to load.')
|
|
331
|
+
return path.ALFPath(fil)
|
|
332
|
+
return path.ALFPath(fil)
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def _ls(alfpath, object=None, **kwargs) -> (list, tuple):
|
|
336
|
+
"""Given a path, an object and a filter, returns all files and associated attributes.
|
|
337
|
+
|
|
338
|
+
Parameters
|
|
339
|
+
----------
|
|
340
|
+
alfpath : str, pathlib.Path
|
|
341
|
+
The folder to list
|
|
342
|
+
object : str, list
|
|
343
|
+
An ALF object name to filter by
|
|
344
|
+
wildcards : bool
|
|
345
|
+
If true uses unix shell style pattern matching, otherwise uses regular expressions
|
|
346
|
+
kwargs
|
|
347
|
+
Other ALF parts to filter, including namespace, attribute, etc.
|
|
348
|
+
|
|
349
|
+
Returns
|
|
350
|
+
-------
|
|
351
|
+
list of one.alf.path.ALFPath
|
|
352
|
+
A list of ALF paths.
|
|
353
|
+
tuple
|
|
354
|
+
A tuple of ALF attributes corresponding to the file paths.
|
|
355
|
+
|
|
356
|
+
Raises
|
|
357
|
+
------
|
|
358
|
+
ALFObjectNotFound
|
|
359
|
+
No matching ALF object was found in the alfpath directory
|
|
360
|
+
|
|
361
|
+
"""
|
|
362
|
+
alfpath = path.ALFPath(alfpath)
|
|
363
|
+
if not alfpath.exists():
|
|
364
|
+
files_alf = attributes = None
|
|
365
|
+
elif alfpath.is_dir():
|
|
366
|
+
if object is None:
|
|
367
|
+
# List all ALF files
|
|
368
|
+
files_alf, attributes = filter_by(alfpath, **kwargs)
|
|
369
|
+
else:
|
|
370
|
+
files_alf, attributes = filter_by(alfpath, object=object, **kwargs)
|
|
371
|
+
else:
|
|
372
|
+
object = alfpath.object
|
|
373
|
+
alfpath = alfpath.parent
|
|
374
|
+
files_alf, attributes = filter_by(alfpath, object=object, **kwargs)
|
|
375
|
+
|
|
376
|
+
# raise error if no files found
|
|
377
|
+
if not files_alf:
|
|
378
|
+
err_str = f'object "{object}"' if object else 'ALF files'
|
|
379
|
+
raise ALFObjectNotFound(f'No {err_str} found in {alfpath}')
|
|
380
|
+
|
|
381
|
+
return [alfpath.joinpath(f) for f in files_alf], attributes
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
def iter_sessions(root_dir, pattern='*'):
|
|
385
|
+
"""Recursively iterate over session paths in a given directory.
|
|
386
|
+
|
|
387
|
+
Parameters
|
|
388
|
+
----------
|
|
389
|
+
root_dir : str, pathlib.Path
|
|
390
|
+
The folder to look for sessions.
|
|
391
|
+
pattern : str
|
|
392
|
+
Glob pattern to use. Default searches all folders. Providing a more specific pattern makes
|
|
393
|
+
this more performant (see examples).
|
|
394
|
+
|
|
395
|
+
Yields
|
|
396
|
+
------
|
|
397
|
+
pathlib.Path
|
|
398
|
+
The next session path in lexicographical order.
|
|
399
|
+
|
|
400
|
+
Examples
|
|
401
|
+
--------
|
|
402
|
+
Efficient iteration when `root_dir` contains <lab>/Subjects folders
|
|
403
|
+
|
|
404
|
+
>>> sessions = list(iter_sessions(root_dir, pattern='*/Subjects/*/????-??-??/*'))
|
|
405
|
+
|
|
406
|
+
Efficient iteration when `root_dir` contains subject folders
|
|
407
|
+
|
|
408
|
+
>>> sessions = list(iter_sessions(root_dir, pattern='*/????-??-??/*'))
|
|
409
|
+
|
|
410
|
+
"""
|
|
411
|
+
if spec.is_session_path(root_dir):
|
|
412
|
+
yield path.ALFPath(root_dir)
|
|
413
|
+
for p in sorted(Path(root_dir).rglob(pattern)):
|
|
414
|
+
if p.is_dir() and spec.is_session_path(p):
|
|
415
|
+
yield path.ALFPath(p)
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def iter_datasets(session_path):
|
|
419
|
+
"""Iterate over all files in a session, and yield relative dataset paths.
|
|
420
|
+
|
|
421
|
+
Parameters
|
|
422
|
+
----------
|
|
423
|
+
session_path : str, pathlib.Path
|
|
424
|
+
The folder to look for datasets.
|
|
425
|
+
|
|
426
|
+
Yields
|
|
427
|
+
------
|
|
428
|
+
one.alf.path.ALFPath
|
|
429
|
+
The next dataset path (relative to the session path) in lexicographical order.
|
|
430
|
+
|
|
431
|
+
"""
|
|
432
|
+
for dataset in path.ALFPath(session_path).iter_datasets(recursive=True):
|
|
433
|
+
yield dataset.relative_to(session_path)
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
def exists(alfpath, object, attributes=None, **kwargs) -> bool:
|
|
437
|
+
"""Test if ALF object and optionally specific attributes exist in the given path.
|
|
438
|
+
|
|
439
|
+
Parameters
|
|
440
|
+
----------
|
|
441
|
+
alfpath : str, pathlib.Path
|
|
442
|
+
The folder to look into
|
|
443
|
+
object : str
|
|
444
|
+
ALF object name
|
|
445
|
+
attributes : str, list
|
|
446
|
+
Wanted attributes
|
|
447
|
+
wildcards : bool
|
|
448
|
+
If true uses unix shell style pattern matching, otherwise uses regular expressions
|
|
449
|
+
kwargs
|
|
450
|
+
Other ALF parts to filter by
|
|
451
|
+
|
|
452
|
+
Returns
|
|
453
|
+
-------
|
|
454
|
+
bool
|
|
455
|
+
For multiple attributes, returns True only if all attributes are found
|
|
456
|
+
|
|
457
|
+
"""
|
|
458
|
+
# if the object is not found, return False
|
|
459
|
+
try:
|
|
460
|
+
_, attributes_found = _ls(alfpath, object, **kwargs)
|
|
461
|
+
except (FileNotFoundError, ALFObjectNotFound):
|
|
462
|
+
return False
|
|
463
|
+
|
|
464
|
+
# if object found and no attribute provided, True
|
|
465
|
+
if not attributes:
|
|
466
|
+
return True
|
|
467
|
+
|
|
468
|
+
# if attributes provided, test if all are found
|
|
469
|
+
if isinstance(attributes, str):
|
|
470
|
+
attributes = [attributes]
|
|
471
|
+
attributes_found = set(part[2] for part in attributes_found)
|
|
472
|
+
return set(attributes).issubset(attributes_found)
|
|
473
|
+
|
|
474
|
+
def load_object(alfpath, object=None, short_keys=False, **kwargs):
    """Reads all files sharing the same object name.

    For example, if the file provided to the function is `spikes.times`, the function will
    load `spikes.times`, `spikes.clusters`, `spikes.depths`, `spikes.amps` in a dictionary
    whose keys will be `times`, `clusters`, `depths`, `amps`.

    Full Reference here: https://int-brain-lab.github.io/ONE/alf_intro.html

    Simplified example: _namespace_object.attribute_timescale.part1.part2.extension

    Parameters
    ----------
    alfpath : str, pathlib.Path, list
        Any ALF path pertaining to the object OR directory containing ALFs OR list of paths.
    object : str, list, None
        The ALF object(s) to filter by. If a directory is provided and object is None, all valid
        ALF files are returned.
    short_keys : bool
        By default, the output dictionary keys will be compounds of attributes, timescale and
        any eventual parts separated by a dot. Use True to shorten the keys to the attribute
        and timescale.
    wildcards : bool
        If true, uses unix shell style pattern matching, otherwise uses regular expressions.
    kwargs
        Other ALF parts to filter by.

    Returns
    -------
    AlfBunch
        An AlfBunch (dict-like) of all attributes pertaining to the object.

    Examples
    --------
    Load 'spikes' object

    >>> spikes = load_object('full/path/to/my/alffolder/', 'spikes')

    Load 'trials' object under the 'ibl' namespace

    >>> trials = load_object('/subject/2021-01-01/001', 'trials', namespace='ibl')

    """
    if isinstance(alfpath, (Path, str)):
        if Path(alfpath).is_dir() and object is None:
            raise ValueError('If a directory is provided, the object name should be provided too')
        files_alf, parts = _ls(alfpath, object, **kwargs)
    else:  # A list of paths allows us to load an object from different revisions
        files_alf = list(map(path.ALFPath, alfpath))
        parts = [x.dataset_name_parts for x in files_alf]
        assert len(set(p[1] for p in parts)) == 1
        object = next(x[1] for x in parts)
    # Take attribute and timescale from parts list
    attributes = [p[2] if not p[3] else '_'.join(p[2:4]) for p in parts]
    if not short_keys:  # Include extra parts in the keys
        attributes = ['.'.join(filter(None, (attr, p[4]))) for attr, p in zip(attributes, parts)]
    # TODO List duplicates; raise ALFError
    assert len(set(attributes)) == len(attributes), (
        f'multiple object {object} with the same attribute in {alfpath}, restrict parts/namespace')
    out = AlfBunch({})

    # load content for each file
    for fil, att in zip(files_alf, attributes):
        # if there is a corresponding metadata file, read it:
        meta_data_file = _find_metadata(fil)
        # if this is the actual meta-data file, skip and it will be read later
        if meta_data_file == fil:
            continue
        out[att] = load_file_content(fil)
        if meta_data_file:
            meta = load_file_content(meta_data_file)
            # the columns keyword splits array along the last dimension
            if 'columns' in meta.keys():
                out.update({v: out[att][::, k] for k, v in enumerate(meta['columns'])})
                out.pop(att)
                meta.pop('columns')
            # if there is other stuff in the dictionary, save it, otherwise disregard
            if meta:
                out[att + 'metadata'] = meta
    # Merge 'table' dataframe into bunch
    table_key = next(filter(re.compile(r'^table([_.]|$)').match, out), None)  # py 3.8
    if table_key:
        table = out.pop(table_key)

        def rename_columns(field):
            """Rename DataFrame fields to include timescale or extra ALF parts from table_key.

            For example...
            with table_key = table_clock, field1 -> field1_clock;
            with table_key = table_clock.extra, field1_0 -> field1_clock.extra_0;
            with table_key = table, field1 -> field1
            """
            return (field[:-2] + table_key[5:] + field[-2:]
                    if re.match(r'.+?_[01]$', field)
                    else field + table_key[5:])
        table.rename(columns=rename_columns, inplace=True)
        out.update(AlfBunch.from_df(table))
    status = out.check_dimensions
    timeseries = [k for k in out.keys() if 'timestamps' in k]
    if any(timeseries) and len(out.keys()) > len(timeseries) and status == 0:
        # Get length of one of the other arrays
        ignore = ('timestamps', 'meta')
        n_samples = next(v for k, v in out.items() if not any(x in k for x in ignore)).shape[0]
        for key in timeseries:
            # Expand timeseries if necessary
            out[key] = ts2vec(out[key], n_samples)
    if status != 0:
        supported = (np.ndarray, pd.DataFrame)
        print_sizes = '\n'.join(
            f'{v.shape},\t{k}' for k, v in out.items() if isinstance(v, supported)
        )
        _logger.warning(f'Inconsistent dimensions for object: {object} \n{print_sizes}')
    return out
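# Illustrative usage sketch (editor's example, not part of the packaged module).
# Besides the directory form shown in the docstring, an object may be loaded from an
# explicit list of dataset paths, e.g. to combine files from different revisions.
# The paths below are hypothetical; both files must share the same object name:
#
# >>> files = ['/data/subject/2021-01-01/001/alf/#2021-02-01#/spikes.times.npy',
# ...          '/data/subject/2021-01-01/001/alf/spikes.clusters.npy']
# >>> spikes = load_object(files)  # keys: 'times' (revised file) and 'clusters'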
def save_object_npy(alfpath, dico, object, parts=None, namespace=None, timescale=None) -> list:
    """Save dictionary in `ALF format`_ using dictionary keys as attribute names.

    Dimensions have to be consistent.

    Simplified ALF example: _namespace_object.attribute.part1.part2.extension.

    Parameters
    ----------
    alfpath : str, pathlib.Path
        Path of the folder to save data to.
    dico : dict
        Dictionary to save to npy; keys correspond to ALF attributes.
    object : str
        Name of the object to save.
    parts : str, list, None
        Extra parts to the ALF name.
    namespace : str, None
        The optional namespace of the object.
    timescale : str, None
        The optional timescale of the object.

    Returns
    -------
    list of one.alf.path.ALFPath
        List of written files.

    Examples
    --------
    >>> spikes = {'times': np.arange(50), 'depths': np.random.random(50)}
    >>> files = save_object_npy('/path/to/my/alffolder/', spikes, 'spikes')

    .. _ALF format:
        https://int-brain-lab.github.io/ONE/alf_intro.html

    """
    alfpath = path.ALFPath(alfpath)
    status = check_dimensions(dico)
    if status != 0:
        raise ValueError('Dimensions are not consistent to save all arrays in ALF format: ' +
                         str([(k, v.shape) for k, v in dico.items()]))
    out_files = []
    for k, v in dico.items():
        out_file = alfpath / spec.to_alf(object, k, 'npy',
                                         extra=parts, namespace=namespace, timescale=timescale)
        np.save(out_file, v)
        out_files.append(out_file)
    return out_files
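# Illustrative round-trip sketch (editor's example, not part of the packaged module).
# The target directory is hypothetical. One .npy file is written per dictionary key,
# using the standard ALF naming convention, and load_object can read them back:
#
# >>> wheel = {'position': np.random.random(100), 'timestamps': np.linspace(0, 10, 100)}
# >>> files = save_object_npy('/data/subject/2021-01-01/001/alf', wheel, 'wheel', namespace='ibl')
# >>> [f.name for f in files]
# ['_ibl_wheel.position.npy', '_ibl_wheel.timestamps.npy']
# >>> wheel_loaded = load_object('/data/subject/2021-01-01/001/alf', 'wheel')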
def save_metadata(file_alf, dico) -> path.ALFPath:
    """Writes a meta data file matching a current ALF file object.

    For example, given an ALF file `clusters.ccfLocation.ssv`, this will write a dictionary in
    JSON format to `clusters.ccfLocation.metadata.json`.

    Reserved keywords:
     - **columns**: column names for binary tables.
     - **row**: row names for binary tables.
     - **unit**

    Parameters
    ----------
    file_alf : str, pathlib.Path
        Full path to the ALF object.
    dico : dict, AlfBunch
        Dictionary containing meta-data.

    Returns
    -------
    one.alf.path.ALFPath
        The saved metadata file path.

    """
    file_alf = path.ALFPath(file_alf)
    assert file_alf.is_dataset, 'ALF filename not valid'
    file_meta_data = file_alf.parent / (file_alf.stem + '.metadata.json')
    with open(file_meta_data, 'w+') as fid:
        fid.write(json.dumps(dico, indent=1))
    return file_meta_data
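# Illustrative usage sketch (editor's example, not part of the packaged module).
# Paths and data are hypothetical. Writing a 'columns' metadata file next to a 2D
# array should let load_object split that array into one key per column on read:
#
# >>> alf_dir = '/data/subject/2021-01-01/001/alf'
# >>> np.save(alf_dir + '/trials.intervals.npy', np.zeros((10, 2)))
# >>> meta_file = save_metadata(alf_dir + '/trials.intervals.npy', {'columns': ['start', 'end']})
# >>> meta_file.name
# 'trials.intervals.metadata.json'
# >>> trials = load_object(alf_dir, 'trials')  # expected to expose 'start' and 'end' keys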
def next_num_folder(session_date_folder: Union[str, Path]) -> str:
    """Return the next number for a session given a session_date_folder."""
    session_date_folder = Path(session_date_folder)
    if not session_date_folder.exists():
        return '001'
    session_nums = [
        int(x.name) for x in session_date_folder.iterdir()
        if x.is_dir() and not x.name.startswith('.') and x.name.isdigit()
    ]
    out = f'{max(session_nums or [0]) + 1:03d}'
    assert len(out) == 3, 'ALF spec does not support session numbers > 999'
    return out
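# Illustrative usage sketch (editor's example, not part of the packaged module).
# The date folders below are hypothetical; session numbers are zero-padded to three digits:
#
# >>> next_num_folder('/data/subject/2021-01-01')  # date folder does not exist yet
# '001'
# >>> next_num_folder('/data/subject/2021-01-02')  # already contains '001' and '002'
# '003'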
def remove_empty_folders(folder: Union[str, Path]) -> None:
    """Iteratively remove any empty child folders."""
    all_folders = sorted(x for x in Path(folder).rglob('*') if x.is_dir())
    for f in reversed(all_folders):  # Reversed sorted ensures we remove deepest first
        try:
            f.rmdir()
        except Exception:
            continue
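# Illustrative usage sketch (editor's example, not part of the packaged module).
# The path is hypothetical. Only folders whose rmdir() succeeds (i.e. empty ones) are
# removed; folders still containing files are silently left in place:
#
# >>> remove_empty_folders('/data/subject/2021-01-01/001')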
def filter_by(alf_path, wildcards=True, **kwargs):
    """Given a path and optional filters, returns all ALF files and their associated parts.

    The filters constitute a logical AND. For all but `extra`, if a list is provided, one or more
    elements must match (a logical OR).

    Parameters
    ----------
    alf_path : str, pathlib.Path
        A path to a folder containing ALF datasets.
    wildcards : bool
        If true, kwargs are matched as unix-style patterns, otherwise as regular expressions.
    object : str, list
        Filter by a given object (e.g. 'spikes').
    attribute : str, list
        Filter by a given attribute (e.g. 'intervals').
    extension : str, list
        Filter by extension (e.g. 'npy').
    namespace : str, list
        Filter by a given namespace (e.g. 'ibl') or None for files without one.
    timescale : str, list
        Filter by a given timescale (e.g. 'bpod') or None for files without one.
    extra : str, list
        Filter by extra parameters (e.g. 'raw') or None for files without extra parts.
        NB: Wild cards not permitted here.

    Returns
    -------
    alf_files : list of one.alf.path.ALFPath
        A list of ALF dataset paths, relative to `alf_path`, that match the filters.
    attributes : list of tuple
        A list of parsed file parts for each returned file.

    Examples
    --------
    Filter files with universal timescale

    >>> filter_by(alf_path, timescale=None)

    Filter files by a given ALF object

    >>> filter_by(alf_path, object='wheel')

    Filter using wildcard, e.g. 'wheel' and 'wheelMoves' ALF objects

    >>> filter_by(alf_path, object='wh*')

    Filter all intervals that are in bpod time

    >>> filter_by(alf_path, attribute='intervals', timescale='bpod')

    Filter all files containing either 'intervals' OR 'timestamps' attributes

    >>> filter_by(alf_path, attribute=['intervals', 'timestamps'])

    Filter all files using a regular expression

    >>> filter_by(alf_path, object='^wheel.*', wildcards=False)
    >>> filter_by(alf_path, object=['^wheel$', '.*Moves'], wildcards=False)

    """
    alf_files = [f.relative_to(alf_path) for f in path.ALFPath(alf_path).iter_datasets()]
    attributes = list(map(path.ALFPath.parse_alf_name, alf_files))

    if kwargs:
        # Validate keyword arguments against regex group names
        invalid = kwargs.keys() - spec.regex(FILE_SPEC).groupindex.keys()
        if invalid:
            raise TypeError('%s() got an unexpected keyword argument "%s"'
                            % (__name__, set(invalid).pop()))

        # Ensure 'extra' input is a list; if str, split on dot
        if 'extra' in kwargs and isinstance(kwargs['extra'], str):
            kwargs['extra'] = kwargs['extra'].split('.')

        def _match(part, pattern, split=None):
            if pattern is None or part is None:
                # If either is None, both should be None to match
                return pattern is part
            elif split:
                # Check all provided extra fields match those in ALF
                return all(elem in part.split(split) for elem in pattern if elem)
            elif not isinstance(pattern, str):
                if wildcards:
                    return any(_match(part, x, split) for x in pattern)
                else:
                    return re.match('|'.join(pattern), part) is not None
            else:
                # Check given attribute matches, allowing wildcards
                return fnmatch(part, pattern) if wildcards else re.match(pattern, part) is not None

        # Iterate over ALF files
        for file, attr in zip(alf_files.copy(), attributes.copy()):
            for k, v in kwargs.items():  # Iterate over attributes
                match = _match(attr[k], v, '.' if k == 'extra' else None)

                if not match:  # Remove file from list and move on to next file
                    alf_files.remove(file)
                    attributes.remove(attr)
                    break

    return alf_files, [tuple(attr.values()) for attr in attributes]
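# Illustrative usage sketch (editor's example, not part of the packaged module).
# The folder contents are hypothetical (wheel.position.npy and wheel.timestamps.npy
# among other objects). The first return value holds the matching paths relative to
# alf_path; the second holds one tuple of parsed name parts per returned file:
#
# >>> alf_path = '/data/subject/2021-01-01/001/alf'
# >>> files, parts = filter_by(alf_path, object='wheel', extension='npy')
# >>> [str(f) for f in files]
# ['wheel.position.npy', 'wheel.timestamps.npy']
# >>> len(parts) == len(files)
# True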
def find_variants(file_list, namespace=True, timescale=True, extra=True, extension=True):
    """Find variant datasets.

    Finds any datasets on disk that are considered a variant of the input datasets. At minimum, a
    dataset is uniquely defined by session path, collection, object and attribute. Therefore,
    datasets with the same name and collection in a different revision folder are considered a
    variant. If any of the keyword arguments are set to False, those parts are ignored when
    comparing datasets.

    Parameters
    ----------
    file_list : list of str, list of pathlib.Path
        A list of ALF paths to find variants of.
    namespace : bool
        If true, treat datasets with a different namespace as unique.
    timescale : bool
        If true, treat datasets with a different timescale as unique.
    extra : bool
        If true, treat datasets with different extra parts as unique.
    extension : bool
        If true, treat datasets with a different extension as unique.

    Returns
    -------
    Dict[pathlib.Path, list of pathlib.Path]
        A map of input file paths to a list of variant dataset paths.

    Raises
    ------
    ValueError
        One or more input file paths are not valid ALF datasets.

    Examples
    --------
    Find all datasets with an identical name and collection in a different revision folder

    >>> find_variants(['/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'])
    {Path('/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'): [
        Path('/sub/2020-10-01/001/alf/obj.attr.npy')
    ]}

    Find all datasets with different namespace or revision

    >>> find_variants(['/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'], namespace=False)
    {Path('/sub/2020-10-01/001/alf/#2020-01-01#/obj.attr.npy'): [
        Path('/sub/2020-10-01/001/alf/#2020-01-01#/_ns_obj.attr.npy'),
        Path('/sub/2020-10-01/001/alf/obj.attr.npy'),
    ]}

    """
    # Initialize map of unique files to their duplicates
    duplicates = {}
    # Determine which parts to filter
    variables = locals()
    filters = {'namespace', 'timescale', 'extra', 'extension'}
    to_compare = ('lab', 'subject', 'date', 'number', 'collection', 'object', 'attribute',
                  *(arg for arg in filters if variables[arg]))

    def parts_match(parts, file):
        """Compare a file's unique parts to a given file."""
        other = file.parse_alf_path()
        return all(parts[k] == other[k] for k in to_compare)

    # Iterate over unique files and their parts
    for f in map(path.ALFPath, file_list):
        parts = f.parse_alf_path()
        # first glob for files matching object.attribute (including revisions)
        pattern = f'*{parts["object"]}.{parts["attribute"]}*'
        # this works because revision will always be last folder;
        # i.e. revisions can't contain collections
        globbed = map(f.without_revision().parent.glob, (pattern, '#*#/' + pattern))
        globbed = chain.from_iterable(globbed)  # unite revision and non-revision globs
        # refine duplicates based on other parts (this also ensures we don't catch similar objects)
        globbed = filter(partial(parts_match, parts), globbed)
        # key = f.relative_to_session().as_posix()
        duplicates[f] = [x for x in globbed if x != f]  # map file to list of its duplicates
    return duplicates