eegdash 0.0.2__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of eegdash might be problematic. Click here for more details.
- eegdash/data_utils.py → data_utils.py +126 -4
- {eegdash-0.0.2.dist-info → eegdash-0.0.3.dist-info}/METADATA +5 -5
- eegdash-0.0.3.dist-info/RECORD +8 -0
- eegdash-0.0.3.dist-info/top_level.txt +3 -0
- main.py +199 -0
- eegdash/main.py +0 -17
- eegdash/script.py +0 -25
- eegdash/signalstore_data_utils.py +0 -630
- eegdash-0.0.2.dist-info/RECORD +0 -10
- eegdash-0.0.2.dist-info/top_level.txt +0 -1
- /eegdash/__init__.py → /__init__.py +0 -0
- {eegdash-0.0.2.dist-info → eegdash-0.0.3.dist-info}/LICENSE +0 -0
- {eegdash-0.0.2.dist-info → eegdash-0.0.3.dist-info}/WHEEL +0 -0
|
@@ -3,11 +3,104 @@ import sys
|
|
|
3
3
|
from joblib import Parallel, delayed
|
|
4
4
|
import mne
|
|
5
5
|
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
import re
|
|
8
9
|
import json
|
|
10
|
+
from mne.io import BaseRaw
|
|
11
|
+
from mne._fiff.utils import _find_channels, _read_segments_file
|
|
12
|
+
import s3fs
|
|
13
|
+
import tempfile
|
|
14
|
+
from mne._fiff.utils import _read_segments_file
|
|
9
15
|
|
|
10
|
-
|
|
16
|
+
class RawEEGDash(BaseRaw):
    r"""Raw object backed by a file on the OpenNeuro S3 bucket, cached locally.

    The recording is described entirely by ``metadata``; the signal itself is
    downloaded from S3 into ``cache_dir`` (during construction when
    ``preload`` is truthy and no cached copy exists, otherwise on the first
    read) and is then read from the cached copy with dtype ``"<f4"``
    (little-endian 32-bit float).

    Parameters
    ----------
    input_fname : path-like
        Path to the S3 file. Only its basename is used to name the cached
        copy, so two S3 paths sharing a basename map to the same cache file.
    metadata : dict
        Must provide ``'sfreq'``, ``'n_times'``, ``'ch_names'`` and
        ``'ch_types'``. Channel types are lower-cased and ``'heog'`` /
        ``'veog'`` are mapped to ``'eog'`` before the ``Info`` is created.
    eog : list | tuple | 'auto'
        Accepted for signature compatibility; not used by this class.
    preload : bool
        Forwarded to ``BaseRaw.__init__`` (see the review note in
        ``__init__`` about the value that is actually forwarded).
    cache_dir : str
        Directory that holds downloaded files; created if it does not exist.
    uint16_codec, montage_units
        Accepted for signature compatibility; not used by this class.
    verbose
        Forwarded to ``BaseRaw.__init__``.

    See Also
    --------
    mne.io.Raw : Documentation of attributes and methods.
    """

    def __init__(
        self,
        input_fname,
        metadata,
        eog=(),
        preload=False,
        *,
        cache_dir='./.eegdash_cache',
        uint16_codec=None,
        montage_units="auto",
        verbose=None,
    ):
        """Build the measurement info from ``metadata`` and set up the local cache path."""
        # Build the measurement info purely from the supplied metadata.
        sfreq = metadata['sfreq']  # sampling frequency
        n_times = metadata['n_times']
        ch_names = metadata['ch_names']
        ch_types = []
        for ch in metadata['ch_types']:
            chtype = ch.lower()
            # Horizontal / vertical EOG labels are collapsed to the plain 'eog' type.
            if chtype == 'heog' or chtype == 'veog':
                chtype = 'eog'
            ch_types.append(chtype)
        info = mne.create_info(ch_names=ch_names, sfreq=sfreq, ch_types=ch_types)
        self.s3file = input_fname
        os.makedirs(cache_dir, exist_ok=True)
        # The cache file is named after the basename of the S3 path only.
        self.filecache = os.path.join(cache_dir, os.path.basename(self.s3file))

        if preload and not os.path.exists(self.filecache):
            self._download_s3()
            # NOTE(review): after a download, ``preload`` becomes the cache file path.
            # MNE documents a string ``preload`` as the name of a memory-mapped file
            # used to *store* the loaded data -- confirm this does not overwrite the
            # file that was just downloaded.
            preload = self.filecache

        super().__init__(
            info,
            preload,
            last_samps=[n_times-1],  # index of the last sample, hence n_times - 1
            orig_format="single",
            verbose=verbose,
        )

    def _download_s3(self):
        """Download ``self.s3file`` into ``self.filecache`` using anonymous S3 access."""
        filesystem = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})
        print('s3file', self.s3file)
        print('filecache', self.filecache)
        filesystem.download(self.s3file, self.filecache)
        # Point the raw object at the local copy so segment reads use it.
        self.filenames = [self.filecache]

    def _read_segment(
        self, start=0, stop=None, sel=None, data_buffer=None, *, verbose=None
    ):
        """Ensure the cached file exists, then delegate to ``BaseRaw._read_segment``."""
        if not os.path.exists(self.filecache):  # nothing cached yet: fetch it now
            self._download_s3()
        else:  # already cached: just make sure filenames points at the cached copy
            self.filenames = [self.filecache]
        return super()._read_segment(start, stop, sel, data_buffer, verbose=verbose)

    def _read_segment_file(self, data, idx, fi, start, stop, cals, mult):
        """Read a chunk of data from the file."""
        # The file is read with dtype "<f4" (little-endian float32).
        _read_segments_file(self, data, idx, fi, start, stop, cals, mult, dtype="<f4")
|
|
11
104
|
|
|
12
105
|
|
|
13
106
|
class BIDSDataset():
|
|
@@ -20,7 +113,7 @@ class BIDSDataset():
|
|
|
20
113
|
}
|
|
21
114
|
METADATA_FILE_EXTENSIONS = ['eeg.json', 'channels.tsv', 'electrodes.tsv', 'events.tsv', 'events.json']
|
|
22
115
|
def __init__(self,
|
|
23
|
-
data_dir=None, # location of
|
|
116
|
+
data_dir=None, # location of bids dataset
|
|
24
117
|
dataset='', # dataset name
|
|
25
118
|
raw_format='eeglab', # format of raw data
|
|
26
119
|
):
|
|
@@ -51,6 +144,18 @@ class BIDSDataset():
|
|
|
51
144
|
lookup = re.search(rf'{property}-(.*?)[_\/]', filename)
|
|
52
145
|
return lookup.group(1) if lookup else ''
|
|
53
146
|
|
|
147
|
+
def merge_json_inheritance(self, json_files):
    '''
    Merge the JSON files found by get_bids_file_inheritance into one dictionary.

    Parameters
    ----------
    json_files : list of path-like
        Files ordered from the lowest level to the highest level.
        The list is not modified.

    Returns
    -------
    dict
        The merged content. Files are applied from the highest level down to
        the lowest, so a key defined at a lower level overrides the same key
        defined at a higher level.
    '''
    json_dict = {}
    # Iterate over a reversed view instead of reversing in place, so the
    # caller's list keeps its order.
    for f in reversed(json_files):
        # Context manager closes each file even if parsing fails.
        with open(f) as fp:
            json_dict.update(json.load(fp))
    return json_dict
|
|
158
|
+
|
|
54
159
|
def get_bids_file_inheritance(self, path, basename, extension):
|
|
55
160
|
'''
|
|
56
161
|
Get all files with given extension that applies to the basename file
|
|
@@ -72,7 +177,7 @@ class BIDSDataset():
|
|
|
72
177
|
for file in os.listdir(path):
|
|
73
178
|
# target_file = path / f"{cur_file_basename}_{extension}"
|
|
74
179
|
if os.path.isfile(path/file):
|
|
75
|
-
cur_file_basename = file[:file.rfind('_')]
|
|
180
|
+
cur_file_basename = file[:file.rfind('_')] # TODO: change to just search for any file with extension
|
|
76
181
|
if file.endswith(extension) and cur_file_basename in basename:
|
|
77
182
|
filepath = path / file
|
|
78
183
|
bids_files.append(filepath)
|
|
@@ -214,4 +319,21 @@ class BIDSDataset():
|
|
|
214
319
|
return self.get_property_from_filename('run', data_filepath)
|
|
215
320
|
|
|
216
321
|
def subject(self, data_filepath):
    """Return the ``sub`` entity that ``get_property_from_filename`` extracts from ``data_filepath``."""
    return self.get_property_from_filename('sub', data_filepath)
|
|
323
|
+
|
|
324
|
+
def num_channels(self, data_filepath):
    """Return the number of data rows in the first ``channels.tsv`` found for ``data_filepath``."""
    tsv_path = self.get_bids_metadata_files(data_filepath, 'channels.tsv')[0]
    table = pd.read_csv(tsv_path, sep='\t')
    return len(table.index)
|
|
327
|
+
|
|
328
|
+
def channel_labels(self, data_filepath):
    """Return the ``name`` column of the first ``channels.tsv`` found for ``data_filepath`` as a list."""
    tsv_path = self.get_bids_metadata_files(data_filepath, 'channels.tsv')[0]
    table = pd.read_csv(tsv_path, sep='\t')
    names = table['name']
    return names.tolist()
|
|
331
|
+
|
|
332
|
+
def channel_types(self, data_filepath):
    """Return the ``type`` column of the first ``channels.tsv`` found for ``data_filepath`` as a list."""
    tsv_path = self.get_bids_metadata_files(data_filepath, 'channels.tsv')[0]
    table = pd.read_csv(tsv_path, sep='\t')
    kinds = table['type']
    return kinds.tolist()
|
|
335
|
+
|
|
336
|
+
def num_times(self, data_filepath):
    """Return the sample count implied by the merged ``eeg.json`` sidecars.

    The value is ``SamplingFrequency * RecordingDuration`` converted with
    ``int``, i.e. truncated toward zero.
    """
    sidecars = self.get_bids_metadata_files(data_filepath, 'eeg.json')
    merged = self.merge_json_inheritance(sidecars)
    n_samples = merged['SamplingFrequency'] * merged['RecordingDuration']
    return int(n_samples)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.2
|
|
2
2
|
Name: eegdash
|
|
3
|
-
Version: 0.0.
|
|
3
|
+
Version: 0.0.3
|
|
4
4
|
Summary: EEG data for machine learning
|
|
5
5
|
Author-email: Young Truong <dt.young112@gmail.com>, Arnaud Delorme <adelorme@gmail.com>
|
|
6
6
|
License: GNU General Public License
|
|
@@ -108,16 +108,16 @@ Additionally, users can search for a specific dataset by specifying criteria.
|
|
|
108
108
|
EEGDashInstance.find({'task': 'FaceRecognition'})
|
|
109
109
|
```
|
|
110
110
|
|
|
111
|
-
After locating the desired dataset or data record, users can download it locally by executing the following command
|
|
111
|
+
After locating the desired dataset or data record, users can download it locally by executing the following command. This returns a list of xarray `DataArray` objects, one per matching record.
|
|
112
112
|
|
|
113
113
|
```python
|
|
114
|
-
EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})
|
|
114
|
+
XArrayData = EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})
|
|
115
115
|
```
|
|
116
116
|
|
|
117
|
-
Optionally, this is how you may access the raw data for the first record.
|
|
117
|
+
Optionally, this is how you may access the raw data for the first record. This returns a NumPy array.
|
|
118
118
|
|
|
119
119
|
```python
|
|
120
|
-
EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})[0].values
|
|
120
|
+
npData = EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})[0].values
|
|
121
121
|
```
|
|
122
122
|
|
|
123
123
|
## Example use
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
__init__.py,sha256=hgxE8COvPu3EV2Tq3GqtMk68fsd7bYvOs_0GO6rrzfk,32
|
|
2
|
+
data_utils.py,sha256=vzMGVp4PBWyRF8tbYNqkJs0QnUd5CzvmJUkpPfxdJh8,13491
|
|
3
|
+
main.py,sha256=fFZHHdVYNLqKr2X_NDB0XXla7A2QlHexgI9AD79_niY,7217
|
|
4
|
+
eegdash-0.0.3.dist-info/LICENSE,sha256=Xafu48R-h_kyaNj2tuhfgdEv9_ovciktjUEgRRwMZ6w,812
|
|
5
|
+
eegdash-0.0.3.dist-info/METADATA,sha256=k0Lvxj1hHHQOZtLt1id4qnp2LpzlN_AZ4Ed3hSoXZG8,9432
|
|
6
|
+
eegdash-0.0.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
7
|
+
eegdash-0.0.3.dist-info/top_level.txt,sha256=MyqA0HvmlirifVrDoM8jHKwKDiA_-XrVNsV6tFfhpAU,25
|
|
8
|
+
eegdash-0.0.3.dist-info/RECORD,,
|
main.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
import pymongo
|
|
2
|
+
from dotenv import load_dotenv
|
|
3
|
+
import os
|
|
4
|
+
import s3fs
|
|
5
|
+
from joblib import Parallel, delayed
|
|
6
|
+
import tempfile
|
|
7
|
+
import mne
|
|
8
|
+
import numpy as np
|
|
9
|
+
import xarray as xr
|
|
10
|
+
from .data_utils import BIDSDataset
|
|
11
|
+
class EEGDash:
    """Client for the EEG-Dash record database and the OpenNeuro S3 bucket.

    Records live in the ``records`` collection of the ``eegdash`` MongoDB
    database; the raw recordings they point to are read anonymously from
    ``AWS_BUCKET``.
    """

    AWS_BUCKET = 's3://openneuro.org'

    def __init__(self,
                 is_public=True):
        """Connect to MongoDB and prepare anonymous S3 access.

        Parameters
        ----------
        is_public : bool
            When true, use the connection string embedded below. Otherwise
            load a ``.env`` file and read ``DB_CONNECTION_STRING`` from the
            environment.
        """
        if is_public:
            # SECURITY NOTE(review): this credential ships in the package source.
            # Confirm the account is restricted to read-only access.
            DB_CONNECTION_STRING = "mongodb+srv://eegdash-user:mdzoMjQcHWTVnKDq@cluster0.vz35p.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
        else:
            load_dotenv()
            # os.getenv returns None when the variable is unset; the value is
            # passed to MongoClient unchanged.
            DB_CONNECTION_STRING = os.getenv('DB_CONNECTION_STRING')

        self.__client = pymongo.MongoClient(DB_CONNECTION_STRING)
        self.__db = self.__client['eegdash']
        self.__collection = self.__db['records']

        self.is_public = is_public
        self.filesystem = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})

    def find(self, *args):
        """Run ``collection.find(*args)`` and return the matching records as a list."""
        return list(self.__collection.find(*args))

    def exist(self, schema_ref='eeg_signal', data_name=''):
        """Return True if at least one record matches ``schema_ref`` and ``data_name``."""
        query = {
            "schema_ref": schema_ref,
            "data_name": data_name
        }
        sessions = self.find(query)
        return len(sessions) > 0

    def add(self, record:dict):
        """Validate ``record`` and insert it into the collection.

        Raises
        ------
        ValueError
            If the record fails ``_validate_input``.
        """
        input_record = self._validate_input(record)
        print(input_record)
        self.__collection.insert_one(input_record)

    def _validate_input(self, record:dict):
        """Check the keys and value types of ``record`` and return it.

        ``record['schema_ref']`` is set to ``'eeg_signal'`` in place before
        checking.

        Raises
        ------
        ValueError
            If ``data_name`` is missing, a key is not a known field, or a
            value is not an instance of the type expected for its field.
        """
        input_types = {
            'schema_ref': str,
            'data_name': str,
            'dataset': str,
            'bidspath': str,
            'subject': str,
            'task': str,
            'session': str,
            'run': str,
            'sampling_frequency': float,
            'modality': str,
            'nchans': int,
            'ntimes': int,
            'channel_types': list,
            'channel_names': list,
        }
        record['schema_ref'] = 'eeg_signal'
        if 'data_name' not in record:
            raise ValueError("Missing key: data_name")
        # Every key must be a known field and carry a value of the expected type.
        for key, value in record.items():
            if key not in input_types:
                raise ValueError(f"Invalid input: {key}")
            if not isinstance(value, input_types[key]):
                raise ValueError(f"Invalid input: {key}")

        return record

    def load_eeg_data_from_s3(self, s3path):
        """Download ``s3path`` to a temporary ``.set`` file and load it.

        Returns the value of ``load_eeg_data_from_bids_file`` for the
        downloaded copy. The temporary file is removed even when the
        download or the parsing raises.
        """
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.set') as tmp:
                tmp_path = tmp.name
                with self.filesystem.open(s3path) as s3_file:
                    tmp.write(s3_file.read())
            return self.load_eeg_data_from_bids_file(tmp_path)
        finally:
            # delete=False above means cleanup is our responsibility.
            if tmp_path is not None and os.path.exists(tmp_path):
                os.unlink(tmp_path)

    def load_eeg_data_from_bids_file(self, bids_file, eeg_attrs=None):
        '''
        Read an EEGLAB file and return it as an xarray DataArray.

        The array has dims ``('channel', 'time')``; ``time`` is in seconds,
        with sample ``k`` at ``k / sfreq``. ``eeg_attrs`` is accepted but not
        used.
        '''
        EEG = mne.io.read_raw_eeglab(bids_file)
        eeg_data = EEG.get_data()

        fs = EEG.info['sfreq']
        # Sample k is taken at k / fs. An inclusive linspace up to n / fs
        # would stretch the step to n / (fs * (n - 1)).
        time_steps = np.arange(eeg_data.shape[1]) / fs  # in seconds

        channel_names = EEG.ch_names

        eeg_xarray = xr.DataArray(
            data=eeg_data,
            dims=['channel','time'],
            coords={
                'time': time_steps,
                'channel': channel_names
            },
        )
        return eeg_xarray

    def load_eeg_attrs_from_bids_file(self, bids_dataset: "BIDSDataset", bids_file):
        '''
        Build the database record for ``bids_file``.

        bids_file must be a file of the bids_dataset.

        Raises
        ------
        ValueError
            If ``bids_file`` is not in ``bids_dataset.files``.
        '''
        if bids_file not in bids_dataset.files:
            raise ValueError(f'{bids_file} not in {bids_dataset.dataset}')
        f = os.path.basename(bids_file)
        dsnumber = bids_dataset.dataset
        # Keep everything from the FIRST occurrence of the dataset name onward.
        # maxsplit=1 matters when the name appears again later in the path.
        openneuro_path = dsnumber + bids_file.split(dsnumber, 1)[1]

        attrs = {
            'schema_ref': 'eeg_signal',
            'data_name': f'{bids_dataset.dataset}_{f}',
            'dataset': bids_dataset.dataset,
            'bidspath': openneuro_path,
            'subject': bids_dataset.subject(bids_file),
            'nchans': bids_dataset.num_channels(bids_file),
            'ntimes': bids_dataset.num_times(bids_file),
            'channel_types': bids_dataset.channel_types(bids_file),
            'channel_names': bids_dataset.channel_labels(bids_file),
            'task': bids_dataset.task(bids_file),
            'session': bids_dataset.session(bids_file),
            'run': bids_dataset.run(bids_file),
            'sampling_frequency': bids_dataset.sfreq(bids_file),
            'modality': 'EEG',
        }

        return attrs

    def add_bids_dataset(self, dataset, data_dir, raw_format='eeglab', overwrite=True):
        '''
        Create a record for every file of the dataset. Records that already
        exist are updated when ``overwrite`` is true and skipped otherwise.

        Raises
        ------
        ValueError
            If this client was created with ``is_public=True``.
        '''
        if self.is_public:
            raise ValueError('This operation is not allowed for public users')

        bids_dataset = BIDSDataset(
            data_dir=data_dir,
            dataset=dataset,
            raw_format=raw_format,
        )
        for bids_file in bids_dataset.get_files():
            print('bids raw file', bids_file)

            signalstore_data_id = f"{dataset}_{os.path.basename(bids_file)}"

            if self.exist(data_name=signalstore_data_id):
                if overwrite:
                    eeg_attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
                    print('updating record', eeg_attrs['data_name'])
                    self.update(eeg_attrs)
                else:
                    print('data already exist and not overwriting. skipped')
                    continue
            else:
                eeg_attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
                # Assume raw data already exists on Openneuro, recreating record only
                print('adding record', eeg_attrs['data_name'])
                self.add(eeg_attrs)

    def get_s3path(self, record):
        """Return the S3 URL for ``record``: ``AWS_BUCKET`` joined with ``record['bidspath']``."""
        return f"{self.AWS_BUCKET}/{record['bidspath']}"

    def get(self, query:dict):
        '''
        Find the records matching ``query`` and load each one from S3.

        query: {
            'dataset': 'dsxxxx',
        }

        Returns a list with one loaded recording per matching record, or an
        empty list when nothing matches.
        '''
        sessions = self.find(query)
        results = []
        if sessions:
            print(f'Found {len(sessions)} records')
            # Threads fit this download-bound work; a single record runs inline.
            results = Parallel(n_jobs=-1 if len(sessions) > 1 else 1, prefer="threads", verbose=1)(
                delayed(self.load_eeg_data_from_s3)(self.get_s3path(session)) for session in sessions
            )
        return results

    def update(self, record:dict):
        """Set the fields of ``record`` on the stored record that has the same ``data_name``.

        ``record['schema_ref']`` is set to ``'eeg_signal'`` in place first.
        """
        record['schema_ref'] = 'eeg_signal'
        self.__collection.update_one({'schema_ref': record['schema_ref'], 'data_name': record['data_name']},
                                     {'$set': record}
                                     )
|
|
193
|
+
def main():
    """Query the public database for one subject of ds005511 and print the matching records."""
    client = EEGDash()
    query = {'dataset': 'ds005511', 'subject': 'NDARUF236HM7'}
    print(client.find(query))
|
|
197
|
+
|
|
198
|
+
if __name__ == '__main__':
|
|
199
|
+
main()
|
eegdash/main.py
DELETED
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
from eegdash.signalstore_data_utils import SignalstoreBIDS
|
|
2
|
-
|
|
3
|
-
class EEGDash:
|
|
4
|
-
def __init__(self):
|
|
5
|
-
self.sstore = SignalstoreBIDS(
|
|
6
|
-
# dbconnectionstring='mongodb://127.0.0.1:27017/?directConnection=true&serverSelectionTimeoutMS=2000&appName=mongosh+2.3.1',
|
|
7
|
-
dbconnectionstring='mongodb+srv://eegdash-user:mdzoMjQcHWTVnKDq@cluster0.vz35p.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0',
|
|
8
|
-
is_public=True,
|
|
9
|
-
local_filesystem=False,
|
|
10
|
-
project_name='eegdash'
|
|
11
|
-
)
|
|
12
|
-
|
|
13
|
-
def find(self, *args):
|
|
14
|
-
return self.sstore.find(*args)
|
|
15
|
-
|
|
16
|
-
def get(self, *args):
|
|
17
|
-
return self.sstore.get(*args)
|
eegdash/script.py
DELETED
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
import argparse
|
|
2
|
-
from signalstore_data_utils import SignalstoreOpenneuro
|
|
3
|
-
|
|
4
|
-
def main():
|
|
5
|
-
# Create the parser
|
|
6
|
-
parser = argparse.ArgumentParser(description="A simple command line argument parser")
|
|
7
|
-
|
|
8
|
-
# Add arguments
|
|
9
|
-
parser.add_argument('--data', type=str, default="/mnt/nemar/openneuro/ds004186", help="Path to data directory (Default: /mnt/nemar/openneuro/ds004186)")
|
|
10
|
-
parser.add_argument('--dataset', type=str, default="ds004186", help="Dataset name (Default: ds004186)")
|
|
11
|
-
|
|
12
|
-
# Parse the arguments
|
|
13
|
-
args = parser.parse_args()
|
|
14
|
-
print('Arguments:', args)
|
|
15
|
-
|
|
16
|
-
signalstore = SignalstoreOpenneuro(
|
|
17
|
-
is_public=False,
|
|
18
|
-
local_filesystem=False,
|
|
19
|
-
)
|
|
20
|
-
hbn_datasets = ['ds005505', 'ds005510', 'ds005514','ds005512','ds005511','ds005509','ds005508','ds005507','ds005506']
|
|
21
|
-
for ds in hbn_datasets:
|
|
22
|
-
signalstore.add_bids_dataset(dataset=ds, data_dir=f'/mnt/nemar/openneuro/{ds}', raw_format='eeglab')
|
|
23
|
-
|
|
24
|
-
if __name__ == "__main__":
|
|
25
|
-
main()
|
|
@@ -1,630 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
from dotenv import load_dotenv
|
|
3
|
-
import re
|
|
4
|
-
import numpy as np
|
|
5
|
-
import xarray as xr
|
|
6
|
-
import os
|
|
7
|
-
from signalstore.store import UnitOfWorkProvider
|
|
8
|
-
# from mongomock import MongoClient
|
|
9
|
-
from pymongo.mongo_client import MongoClient
|
|
10
|
-
from pymongo.server_api import ServerApi
|
|
11
|
-
from fsspec.implementations.local import LocalFileSystem
|
|
12
|
-
from fsspec.implementations.dirfs import DirFileSystem
|
|
13
|
-
import pandas as pd
|
|
14
|
-
import json
|
|
15
|
-
import s3fs
|
|
16
|
-
from signalstore.store.data_access_objects import FileSystemDAO
|
|
17
|
-
from .data_utils import BIDSDataset
|
|
18
|
-
import tempfile
|
|
19
|
-
import mne
|
|
20
|
-
from joblib import Parallel, delayed
|
|
21
|
-
|
|
22
|
-
class SignalstoreOpenneuro():
|
|
23
|
-
AWS_BUCKET = 'openneuro.org'
|
|
24
|
-
PROJECT_NAME = 'eegdash'
|
|
25
|
-
def __init__(self,
|
|
26
|
-
dbconnectionstring="mongodb://127.0.0.1:27017/?directConnection=true&serverSelectionTimeoutMS=2000&appName=mongosh+2.3.1",
|
|
27
|
-
is_public=False,
|
|
28
|
-
local_filesystem=True,
|
|
29
|
-
):
|
|
30
|
-
self.is_public = is_public
|
|
31
|
-
self.project_name = self.PROJECT_NAME
|
|
32
|
-
if is_public:
|
|
33
|
-
dbconnectionstring='mongodb+srv://eegdash-user:mdzoMjQcHWTVnKDq@cluster0.vz35p.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0',
|
|
34
|
-
else:
|
|
35
|
-
load_dotenv()
|
|
36
|
-
dbconnectionstring = os.getenv('DB_CONNECTION_STRING')
|
|
37
|
-
|
|
38
|
-
# Create a new client and connect to the server
|
|
39
|
-
client = MongoClient(dbconnectionstring, server_api=ServerApi('1'))
|
|
40
|
-
# Send a ping to confirm a successful connection
|
|
41
|
-
try:
|
|
42
|
-
client.admin.command('ping')
|
|
43
|
-
print("Pinged your deployment. You successfully connected to MongoDB!")
|
|
44
|
-
except Exception as e:
|
|
45
|
-
print(e)
|
|
46
|
-
|
|
47
|
-
memory_store = {}
|
|
48
|
-
self.filesystem = self.set_up_filesystem(is_local=local_filesystem)
|
|
49
|
-
self.uow_provider = UnitOfWorkProvider(
|
|
50
|
-
mongo_client=client,
|
|
51
|
-
filesystem=self.filesystem,
|
|
52
|
-
memory_store=memory_store,
|
|
53
|
-
default_filetype='zarr'
|
|
54
|
-
)
|
|
55
|
-
|
|
56
|
-
self.uow = self.uow_provider(self.PROJECT_NAME)
|
|
57
|
-
self.load_domain_models()
|
|
58
|
-
|
|
59
|
-
def set_up_filesystem(self, is_local=True):
|
|
60
|
-
if is_local:
|
|
61
|
-
cache_path='/mnt/nemar/dtyoung/eeg-dash-data' # path where signalstore netCDF files are stored
|
|
62
|
-
# Create a directory for the dataset
|
|
63
|
-
store_path = Path(cache_path)
|
|
64
|
-
if not os.path.exists(store_path):
|
|
65
|
-
os.makedirs(store_path)
|
|
66
|
-
|
|
67
|
-
filesystem = LocalFileSystem()
|
|
68
|
-
tmp_dir_fs = DirFileSystem(
|
|
69
|
-
store_path,
|
|
70
|
-
filesystem=filesystem
|
|
71
|
-
)
|
|
72
|
-
return tmp_dir_fs
|
|
73
|
-
else:
|
|
74
|
-
s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})
|
|
75
|
-
return s3
|
|
76
|
-
|
|
77
|
-
def load_domain_models(self):
|
|
78
|
-
dir_path = os.path.dirname(os.path.realpath(__file__))
|
|
79
|
-
cwd = Path(dir_path)
|
|
80
|
-
domain_models_path = cwd / f"DomainModels/{self.project_name}/data_models.json"
|
|
81
|
-
metamodel_path = cwd / f"DomainModels/{self.project_name}/metamodels.json"
|
|
82
|
-
property_path = cwd / f"DomainModels/{self.project_name}/property_models.json"
|
|
83
|
-
with open(metamodel_path) as f:
|
|
84
|
-
metamodels = json.load(f)
|
|
85
|
-
|
|
86
|
-
with open(property_path) as f:
|
|
87
|
-
property_models = json.load(f)
|
|
88
|
-
|
|
89
|
-
# load domain models json file
|
|
90
|
-
with open(domain_models_path) as f:
|
|
91
|
-
domain_models = json.load(f)
|
|
92
|
-
|
|
93
|
-
with self.uow as uow:
|
|
94
|
-
for property_model in property_models:
|
|
95
|
-
if not uow.domain_models.exists(property_model['schema_name']):
|
|
96
|
-
uow.domain_models.add(property_model)
|
|
97
|
-
model = uow.domain_models.get(property_model['schema_name'])
|
|
98
|
-
print('property model: ', model['schema_name'])
|
|
99
|
-
for metamodel in metamodels:
|
|
100
|
-
if not uow.domain_models.exists(metamodel['schema_name']):
|
|
101
|
-
uow.domain_models.add(metamodel)
|
|
102
|
-
model = uow.domain_models.get(metamodel['schema_name'])
|
|
103
|
-
print('meta model: ', model['schema_name'])
|
|
104
|
-
for domain_model in domain_models:
|
|
105
|
-
if not uow.domain_models.exists(domain_model['schema_name']):
|
|
106
|
-
uow.domain_models.add(domain_model)
|
|
107
|
-
model = uow.domain_models.get(domain_model['schema_name'])
|
|
108
|
-
print('domain model: ', model['schema_name'])
|
|
109
|
-
uow.commit()
|
|
110
|
-
|
|
111
|
-
def extract_attribute(self, pattern, filename):
|
|
112
|
-
match = re.search(pattern, filename)
|
|
113
|
-
return match.group(1) if match else None
|
|
114
|
-
|
|
115
|
-
def load_eeg_attrs_from_bids_file(self, bids_dataset: BIDSDataset, bids_file):
|
|
116
|
-
'''
|
|
117
|
-
bids_file must be a file of the bids_dataset
|
|
118
|
-
'''
|
|
119
|
-
if bids_file not in bids_dataset.files:
|
|
120
|
-
raise ValueError(f'{bids_file} not in {bids_dataset.dataset}')
|
|
121
|
-
f = os.path.basename(bids_file)
|
|
122
|
-
dsnumber = bids_dataset.dataset
|
|
123
|
-
# extract openneuro path by finding the first occurrence of the dataset name in the filename and remove the path before that
|
|
124
|
-
openneuro_path = dsnumber + bids_file.split(dsnumber)[1]
|
|
125
|
-
|
|
126
|
-
attrs = {
|
|
127
|
-
'schema_ref': 'eeg_signal',
|
|
128
|
-
'data_name': f'{bids_dataset.dataset}_{f}',
|
|
129
|
-
'dataset': bids_dataset.dataset,
|
|
130
|
-
'bidspath': openneuro_path,
|
|
131
|
-
'subject': bids_dataset.subject(bids_file),
|
|
132
|
-
'task': bids_dataset.task(bids_file),
|
|
133
|
-
'session': bids_dataset.session(bids_file),
|
|
134
|
-
'run': bids_dataset.run(bids_file),
|
|
135
|
-
'sampling_frequency': bids_dataset.sfreq(bids_file),
|
|
136
|
-
'modality': 'EEG',
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
return attrs
|
|
140
|
-
|
|
141
|
-
def load_eeg_data_from_s3(self, s3path):
|
|
142
|
-
# import boto3
|
|
143
|
-
# import scipy.io
|
|
144
|
-
# import io
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
# # Initialize the S3 client
|
|
148
|
-
# s3 = boto3.client('s3')
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
# # S3 bucket and object key
|
|
152
|
-
# bucket_name = 'your-bucket-name'
|
|
153
|
-
# object_key = 'path/to/your/file.mat'
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
# # Get the object from S3 and stream it into memory
|
|
157
|
-
# response = s3.get_object(Bucket=bucket_name, Key=object_key)
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
# # Read the content into a BytesIO buffer
|
|
161
|
-
# mat_file_stream = io.BytesIO(response['Body'].read())
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
# # Load the MAT file using scipy.io.loadmat
|
|
165
|
-
# data = scipy.io.loadmat(mat_file_stream)
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
# # Work with the data
|
|
169
|
-
# print(data)
|
|
170
|
-
with tempfile.NamedTemporaryFile(delete=False, suffix='.set') as tmp:
|
|
171
|
-
with self.filesystem.open(s3path) as s3_file:
|
|
172
|
-
tmp.write(s3_file.read())
|
|
173
|
-
tmp_path = tmp.name
|
|
174
|
-
eeg_data = self.load_eeg_data_from_bids_file(tmp_path)
|
|
175
|
-
os.unlink(tmp_path)
|
|
176
|
-
return eeg_data
|
|
177
|
-
|
|
178
|
-
def load_eeg_data_from_bids_file(self, bids_file, eeg_attrs=None):
|
|
179
|
-
'''
|
|
180
|
-
bids_file must be a file of the bids_dataset
|
|
181
|
-
'''
|
|
182
|
-
EEG = mne.io.read_raw_eeglab(bids_file)
|
|
183
|
-
eeg_data = EEG.get_data()
|
|
184
|
-
|
|
185
|
-
fs = EEG.info['sfreq']
|
|
186
|
-
max_time = eeg_data.shape[1] / fs
|
|
187
|
-
time_steps = np.linspace(0, max_time, eeg_data.shape[1]).squeeze() # in seconds
|
|
188
|
-
|
|
189
|
-
channel_names = EEG.ch_names
|
|
190
|
-
|
|
191
|
-
eeg_xarray = xr.DataArray(
|
|
192
|
-
data=eeg_data,
|
|
193
|
-
dims=['channel','time'],
|
|
194
|
-
coords={
|
|
195
|
-
'time': time_steps,
|
|
196
|
-
'channel': channel_names
|
|
197
|
-
},
|
|
198
|
-
# attrs=attrs
|
|
199
|
-
)
|
|
200
|
-
return eeg_xarray
|
|
201
|
-
|
|
202
|
-
def exist(self, schema_ref='eeg_signal', data_name=''):
|
|
203
|
-
with self.uow as uow:
|
|
204
|
-
query = {
|
|
205
|
-
"schema_ref": schema_ref,
|
|
206
|
-
"data_name": data_name
|
|
207
|
-
}
|
|
208
|
-
sessions = uow.data.find(query)
|
|
209
|
-
if len(sessions) > 0:
|
|
210
|
-
return True
|
|
211
|
-
else:
|
|
212
|
-
return False
|
|
213
|
-
|
|
214
|
-
def add_bids_dataset(self, dataset, data_dir, raw_format='eeglab', overwrite=False):
|
|
215
|
-
'''
|
|
216
|
-
Create new records for the dataset in the MongoDB database if not found
|
|
217
|
-
'''
|
|
218
|
-
if self.is_public:
|
|
219
|
-
raise ValueError('This operation is not allowed for public users')
|
|
220
|
-
|
|
221
|
-
bids_dataset = BIDSDataset(
|
|
222
|
-
data_dir=data_dir,
|
|
223
|
-
dataset=dataset,
|
|
224
|
-
raw_format=raw_format,
|
|
225
|
-
)
|
|
226
|
-
for bids_file in bids_dataset.get_files():
|
|
227
|
-
print('bids raw file', bids_file)
|
|
228
|
-
|
|
229
|
-
signalstore_data_id = f"{dataset}_{os.path.basename(bids_file)}"
|
|
230
|
-
if overwrite:
|
|
231
|
-
self.remove(signalstore_data_id)
|
|
232
|
-
|
|
233
|
-
if self.exist(data_name=signalstore_data_id):
|
|
234
|
-
print('data already exist. skipped')
|
|
235
|
-
continue
|
|
236
|
-
else:
|
|
237
|
-
eeg_attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
|
|
238
|
-
with self.uow as uow:
|
|
239
|
-
# Assume raw data already exists on Openneuro, recreating record only
|
|
240
|
-
eeg_attrs['has_file'] = True
|
|
241
|
-
print('adding record', eeg_attrs['data_name'])
|
|
242
|
-
uow.data.add(eeg_attrs)
|
|
243
|
-
uow.commit()
|
|
244
|
-
|
|
245
|
-
def update_bids_dataset(self, dataset, data_dir, raw_format='eeglab'):
|
|
246
|
-
'''
|
|
247
|
-
Create new records for the dataset in the MongoDB database if not found
|
|
248
|
-
'''
|
|
249
|
-
if self.is_public:
|
|
250
|
-
raise ValueError('This operation is not allowed for public users')
|
|
251
|
-
|
|
252
|
-
bids_dataset = BIDSDataset(
|
|
253
|
-
data_dir=data_dir,
|
|
254
|
-
dataset=dataset,
|
|
255
|
-
raw_format=raw_format,
|
|
256
|
-
)
|
|
257
|
-
for bids_file in bids_dataset.get_files():
|
|
258
|
-
print('bids raw file', bids_file)
|
|
259
|
-
|
|
260
|
-
signalstore_data_id = f"{dataset}_{os.path.basename(bids_file)}"
|
|
261
|
-
|
|
262
|
-
if not self.exist(data_name=signalstore_data_id):
|
|
263
|
-
raise ValueError('data not found')
|
|
264
|
-
else:
|
|
265
|
-
self.remove(data_name=signalstore_data_id)
|
|
266
|
-
|
|
267
|
-
eeg_attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
|
|
268
|
-
with self.uow as uow:
|
|
269
|
-
# Assume raw data already exists on Openneuro, recreating record only
|
|
270
|
-
eeg_attrs['has_file'] = True
|
|
271
|
-
print('adding record', eeg_attrs['data_name'])
|
|
272
|
-
uow.data.add(eeg_attrs)
|
|
273
|
-
uow.commit()
|
|
274
|
-
|
|
275
|
-
def remove(self, schema_ref='eeg_signal', data_name=''):
|
|
276
|
-
if self.is_public:
|
|
277
|
-
raise ValueError('This operation is not allowed for public users')
|
|
278
|
-
|
|
279
|
-
print('Removing record', data_name)
|
|
280
|
-
with self.uow as uow:
|
|
281
|
-
sessions = uow.data.find({'schema_ref': schema_ref, 'data_name': data_name})
|
|
282
|
-
if len(sessions) > 0:
|
|
283
|
-
for session in sessions:
|
|
284
|
-
uow.data.remove(session['schema_ref'], session['data_name'])
|
|
285
|
-
uow.commit()
|
|
286
|
-
uow.purge()
|
|
287
|
-
assert len(uow.data.find({'schema_ref': schema_ref, 'data_name': data_name})) == 0, 'Data still exists'
|
|
288
|
-
|
|
289
|
-
def remove_all(self):
    '''
    Delete every record in the store, then commit and purge.

    The number of records found is printed before deletion, and the number
    of records remaining is printed afterwards as a verification aid.

    Raises:
        ValueError: if this instance is public.
    '''
    if self.is_public:
        raise ValueError('This operation is not allowed for public users')

    with self.uow as uow:
        sessions = uow.data.find({})
        print(len(sessions))
        # Iterate over the records themselves: indexing an int produced by
        # range(len(...)) with a string key raises TypeError.
        for session in sessions:
            uow.data.remove(session['schema_ref'], session['data_name'])
        uow.commit()

        uow.purge()

        print('Verifying deletion job. Dataset length: ', len(uow.data.find({})))
|
|
303
|
-
|
|
304
|
-
def find(self, query:dict, validate=False):
    '''
    Return the records matching ``query``.

    query: {
        'dataset': 'dsxxxx',

    }

    ``validate`` is forwarded unchanged to the underlying ``find``. The
    number of matches is printed when there is at least one; an empty list
    is returned when nothing matches.
    '''
    with self.uow as uow:
        found = uow.data.find(query, validate=validate)
        if not found:
            return []
        print(f'Found {len(found)} records')
        return found
|
|
317
|
-
|
|
318
|
-
def get(self, query:dict, validate=False):
    '''
    Load the EEG data of every record matching ``query``.

    query: {
        'dataset': 'dsxxxx',

    }

    Each matching record's ``bidspath`` is joined onto ``self.AWS_BUCKET``
    and handed to ``self.load_eeg_data_from_s3``; the loads run through
    joblib with a thread-preferring backend on all available workers.
    Returns the list of loaded results, or an empty list when nothing
    matches.
    '''
    with self.uow as uow:
        sessions = uow.data.find(query, validate=validate)
        if not sessions:
            return []

        print(f'Found {len(sessions)} records')
        load_one = delayed(self.load_eeg_data_from_s3)
        jobs = (load_one(Path(self.AWS_BUCKET) / session['bidspath']) for session in sessions)
        return Parallel(n_jobs=-1, prefer="threads", verbose=1)(jobs)
|
|
333
|
-
|
|
334
|
-
class SignalstoreBIDS():
    """Store and query BIDS EEG recordings through a signalstore unit of work.

    The instance wires a MongoDB client, a filesystem (local directory or
    S3) and an in-memory store into a ``UnitOfWorkProvider`` and exposes
    helpers to add, remove and look up records. Mutating operations are
    refused when the instance was created with ``is_public=True``.
    """
    AWS_BUCKET = 'eegdash'

    def __init__(self,
                 project_name=AWS_BUCKET,
                 dbconnectionstring="mongodb://127.0.0.1:27017/?directConnection=true&serverSelectionTimeoutMS=2000&appName=mongosh+2.3.1",
                 is_public=False,
                 local_filesystem=True,
                 ):
        """Connect to MongoDB and prepare the unit of work.

        Parameters:
            project_name: name handed to the unit-of-work provider.
            dbconnectionstring: MongoDB URI used for non-public instances
                when the ``DB_CONNECTION_STRING`` environment variable is
                not set.
            is_public: use the built-in public connection string and forbid
                mutating operations.
            local_filesystem: store files in a local directory instead of S3.
        """
        self.is_public = is_public
        if is_public:
            # NOTE(review): credentials are embedded in source; presumably a
            # read-only account - confirm, and consider moving to config.
            # (No trailing comma here: it would turn the URI into a tuple.)
            dbconnectionstring = 'mongodb+srv://eegdash-user:mdzoMjQcHWTVnKDq@cluster0.vz35p.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0'
        else:
            load_dotenv()
            # The environment wins; the argument is the fallback rather than
            # being silently replaced by None when the variable is unset.
            dbconnectionstring = os.getenv('DB_CONNECTION_STRING', dbconnectionstring)

        # Create a new client and connect to the server
        client = MongoClient(dbconnectionstring, server_api=ServerApi('1'))
        # Send a ping to confirm a successful connection. A failure is only
        # reported, not raised: construction is deliberately best-effort.
        try:
            client.admin.command('ping')
            print("Pinged your deployment. You successfully connected to MongoDB!")
        except Exception as e:
            print(e)

        memory_store = {}
        filesystem = self.set_up_filesystem(is_local=local_filesystem)
        self.uow_provider = UnitOfWorkProvider(
            mongo_client=client,
            filesystem=filesystem,
            memory_store=memory_store,
            default_filetype='zarr'
        )

        self.project_name = project_name
        self.uow = self.uow_provider(self.project_name)
        # self.load_domain_models()

    def set_up_filesystem(self, is_local=True):
        """Return the filesystem backing the store.

        With ``is_local`` a ``DirFileSystem`` rooted at the local cache
        directory is returned (the directory is created when missing);
        otherwise an S3 filesystem, anonymous for public instances.
        """
        if is_local:
            cache_path = '/mnt/nemar/dtyoung/eeg-ssl-data'  # path where signalstore netCDF files are stored
            # Create a directory for the dataset
            store_path = Path(cache_path)
            os.makedirs(store_path, exist_ok=True)

            filesystem = LocalFileSystem()
            tmp_dir_fs = DirFileSystem(
                store_path,
                filesystem=filesystem
            )
            return tmp_dir_fs

        if self.is_public:
            return s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})
        return s3fs.S3FileSystem(client_kwargs={'region_name': 'us-east-2'})

    def load_domain_models(self):
        """Register property models, metamodels and data models.

        The three JSON files are read from
        ``<cwd>/DomainModels/<project_name>/``; every model is added to the
        unit of work, read back and its schema name printed, then everything
        is committed at once.
        """
        models_dir = Path.cwd() / f"DomainModels/{self.project_name}"
        domain_models_path = models_dir / "data_models.json"
        metamodel_path = models_dir / "metamodels.json"
        property_path = models_dir / "property_models.json"

        with open(metamodel_path) as f:
            metamodels = json.load(f)

        with open(property_path) as f:
            property_models = json.load(f)

        # load domain models json file
        with open(domain_models_path) as f:
            domain_models = json.load(f)

        # Registration order is kept: properties, then metamodels, then
        # data models.
        batches = (
            ('property model: ', property_models),
            ('meta model: ', metamodels),
            ('domain model: ', domain_models),
        )
        with self.uow as uow:
            for label, models in batches:
                for entry in models:
                    uow.domain_models.add(entry)
                    model = uow.domain_models.get(entry['schema_name'])
                    print(label, model['schema_name'])
            uow.commit()

    def extract_attribute(self, pattern, filename):
        """Return the first capture group of ``pattern`` in ``filename``, or None."""
        match = re.search(pattern, filename)
        return match.group(1) if match else None

    def load_eeg_attrs_from_bids_file(self, bids_dataset: 'BIDSDataset', bids_file):
        '''
        Build the record attributes of one file of ``bids_dataset``.

        bids_file must be a file of the bids_dataset

        Raises:
            ValueError: if ``bids_file`` is not listed in ``bids_dataset.files``.
        '''
        if bids_file not in bids_dataset.files:
            raise ValueError(f'{bids_file} not in {bids_dataset.dataset}')
        f = os.path.basename(bids_file)
        attrs = {
            'schema_ref': 'eeg_signal',
            'data_name': f'{bids_dataset.dataset}_{f}',
            'dataset': bids_dataset.dataset,
            'subject': bids_dataset.subject(bids_file),
            'task': bids_dataset.task(bids_file),
            'session': bids_dataset.session(bids_file),
            'run': bids_dataset.run(bids_file),
            'sampling_frequency': bids_dataset.sfreq(bids_file),
            'modality': 'EEG',
        }

        return attrs

    def load_eeg_data_from_bids_file(self, bids_dataset: 'BIDSDataset', bids_file, eeg_attrs=None):
        '''
        Load one file of ``bids_dataset`` as a (channel, time) DataArray.

        bids_file must be a file of the bids_dataset

        Parameters:
            bids_dataset: dataset the file belongs to.
            bids_file: file to load.
            eeg_attrs: attributes to attach; computed from the file when None.

        Raises:
            ValueError: if ``bids_file`` is not listed in ``bids_dataset.files``.
            FileNotFoundError: if no ``channels.tsv`` is available for the file.
        '''
        if bids_file not in bids_dataset.files:
            raise ValueError(f'{bids_file} not in {bids_dataset.dataset}')

        attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file) if eeg_attrs is None else eeg_attrs

        eeg_data = bids_dataset.load_and_preprocess_raw(bids_file)
        print('data shape:', eeg_data.shape)

        fs = attrs['sampling_frequency']
        n_samples = eeg_data.shape[1]
        max_time = n_samples / fs
        # Sample i sits at i / fs seconds. Excluding the endpoint keeps the
        # spacing at exactly 1 / fs; including it would stretch the axis so
        # the last sample landed on n / fs.
        time_steps = np.linspace(0, max_time, n_samples, endpoint=False)  # in seconds

        # replace eeg.set with channels.tsv
        # todo this is still a hacky way
        channels_tsv = bids_dataset.get_bids_metadata_files(bids_file, 'channels.tsv')
        if not channels_tsv:
            raise FileNotFoundError(f'no channels.tsv found for {bids_file}')
        channels_tsv = Path(channels_tsv[0])
        if not channels_tsv.exists():
            # Without the file there are no channel names to label the data.
            raise FileNotFoundError(f'{channels_tsv} does not exist')
        channels = pd.read_csv(channels_tsv, sep='\t')
        # get channel names from channel_coords
        channel_names = channels['name'].values

        eeg_xarray = xr.DataArray(
            data=eeg_data,
            dims=['channel', 'time'],
            coords={
                'time': time_steps,
                'channel': channel_names
            },
            attrs=attrs
        )
        return eeg_xarray

    def exist(self, schema_ref='eeg_signal', data_name=''):
        """Return True when at least one record matches the schema and name."""
        with self.uow as uow:
            query = {
                "schema_ref": schema_ref,
                "data_name": data_name
            }
            sessions = uow.data.find(query)
            return len(sessions) > 0

    def add_bids_dataset(self, dataset, data_dir, raw_format='eeglab', overwrite=False, record_only=False):
        """Add a record (and optionally the data) for every file of a BIDS dataset.

        Parameters:
            dataset: dataset identifier, used as the record-name prefix.
            data_dir: forwarded to ``BIDSDataset``.
            raw_format: forwarded to ``BIDSDataset``.
            overwrite: remove an existing record of the same name first.
            record_only: add only the record, not the loaded EEG data.

        Files whose record already exists (and was not removed through
        ``overwrite``) are skipped.

        Raises:
            ValueError: if this instance is public.
        """
        if self.is_public:
            raise ValueError('This operation is not allowed for public users')

        bids_dataset = BIDSDataset(
            data_dir=data_dir,
            dataset=dataset,
            raw_format=raw_format,
        )
        for bids_file in bids_dataset.get_files():
            print('bids raw file', bids_file)

            signalstore_data_id = f"{dataset}_{os.path.basename(bids_file)}"
            if overwrite:
                # Must be passed by keyword: the first positional parameter
                # of remove() is schema_ref, not the record name.
                self.remove(data_name=signalstore_data_id)

            if self.exist(data_name=signalstore_data_id):
                print('data already exist. skipped')
                continue

            eeg_attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
            with self.uow as uow:
                # Assume raw data already exists, recreating record only
                eeg_attrs['has_file'] = True
                print('adding record', eeg_attrs['data_name'])
                uow.data.add(eeg_attrs)
                uow.commit()
            if not record_only:
                eeg_xarray = self.load_eeg_data_from_bids_file(bids_dataset, bids_file, eeg_attrs)
                with self.uow as uow:
                    print('adding data', eeg_xarray.attrs['data_name'])
                    uow.data.add(eeg_xarray)
                    uow.commit()

    def remove(self, schema_ref='eeg_signal', data_name=''):
        """Delete every record matching ``schema_ref`` and ``data_name``.

        Raises:
            ValueError: if this instance is public.
        """
        if self.is_public:
            raise ValueError('This operation is not allowed for public users')

        with self.uow as uow:
            sessions = uow.data.find({'schema_ref': schema_ref, 'data_name': data_name})
            if len(sessions) > 0:
                # Iterate the records themselves, not their indices.
                for session in sessions:
                    uow.data.remove(session['schema_ref'], session['data_name'])
                uow.commit()

    def remove_all(self):
        """Delete every record, commit, purge, and print the remaining count.

        Raises:
            ValueError: if this instance is public.
        """
        if self.is_public:
            raise ValueError('This operation is not allowed for public users')

        with self.uow as uow:
            sessions = uow.data.find({})
            print(len(sessions))
            # Iterate the records themselves, not their indices.
            for session in sessions:
                uow.data.remove(session['schema_ref'], session['data_name'])
            uow.commit()

            uow.purge()

            print('Verifying deletion job. Dataset length: ', len(uow.data.find({})))

    def find(self, query:dict, validate=False, get_data=False):
        '''
        Return the records matching ``query`` (empty list when none match).

        query: {
            'dataset': 'dsxxxx',

        }

        ``validate`` and ``get_data`` are forwarded to the underlying find.
        '''
        with self.uow as uow:
            sessions = uow.data.find(query, validate=validate, get_data=get_data)
            if sessions:
                print(f'Found {len(sessions)} records')
                return sessions
            return []

    def get(self, query:dict, validate=False):
        '''
        Same as ``find`` but always requests the data (``get_data=True``).

        query: {
            'dataset': 'dsxxxx',

        }
        '''
        with self.uow as uow:
            sessions = uow.data.find(query, validate=validate, get_data=True)
            if sessions:
                print(f'Found {len(sessions)} records')
                return sessions
            return []
|
|
583
|
-
|
|
584
|
-
class OpenneuroFileSystemDAO(FileSystemDAO):
    """File-system DAO reading anonymously from S3 under ``openneuro.org``."""

    def __init__(self):
        # Anonymous access: no AWS credentials are supplied.
        anonymous_s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})
        super().__init__(anonymous_s3, project_dir='openneuro.org')

    def get(self, schema_ref, data_name, version_timestamp=0, nth_most_recent=1, data_adapter=None):
        """Gets an object from the Openneuro S3 bucket.
        Arguments:
            schema_ref {str} -- The type of object to get.
            data_name {str} -- The name of the object to get.
            version_timestamp -- The version_timestamp of the object to get
                (defaults to 0).
            nth_most_recent -- Forwarded to the path lookup (defaults to 1).
            data_adapter -- Adapter used to read the file; the DAO's default
                adapter is used when None.
        Returns:
            The deserialized object, or None when no file path is resolved.
        NOTE(review): the argument check and path lookup live in the parent
        class and may raise (e.g. a not-found error) - confirm there.
        """
        self._check_args(
            schema_ref=schema_ref,
            data_name=data_name,
            nth_most_recent=nth_most_recent,
            version_timestamp=version_timestamp,
            data_adapter=data_adapter
        )
        if data_adapter is not None:
            # A caller-supplied adapter must read through this DAO's filesystem.
            data_adapter.set_filesystem(self._fs)
        else:
            data_adapter = self._default_data_adapter

        path = self._get_file_path(schema_ref, data_name, version_timestamp, nth_most_recent, data_adapter)
        if path is None:
            return None

        raw_object = data_adapter.read_file(path)
        return self._deserialize(raw_object)
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
if __name__ == "__main__":
    # Running this module as a script is currently a no-op. The lines below
    # are a disabled example of building a store and loading data into it.
    # sstore_hbn = SignalstoreHBN()
    # sstore_hbn.add_data()
    # sstore_ds004584 = SignalstoreHBN(
    # data_path='/mnt/nemar/openneuro/ds004584',
    # dataset_name='eegdash',
    # local_filesystem=False,
    # dbconnectionstring='mongodb://23.21.113.214:27017/?directConnection=true&serverSelectionTimeoutMS=2000&appName=mongosh+2.2.1'
    # )
    # sstore_ds004584.load_domain_models()
    # sstore_ds004584.add_data()
    pass
|
eegdash-0.0.2.dist-info/RECORD
DELETED
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
eegdash/__init__.py,sha256=hgxE8COvPu3EV2Tq3GqtMk68fsd7bYvOs_0GO6rrzfk,32
|
|
2
|
-
eegdash/data_utils.py,sha256=CA4lC5MKSoxCp0uJWy_n2okGtTCof2svDzSGxHZcIo0,9080
|
|
3
|
-
eegdash/main.py,sha256=ANyrsVCvDiKNiQAmlQt9FcyOeCoD4Oe6Gq25LM2o38o,675
|
|
4
|
-
eegdash/script.py,sha256=IbxGybE9Bpx0fS9QEw2YMYkakARYsEFelH-xfzlPQxU,974
|
|
5
|
-
eegdash/signalstore_data_utils.py,sha256=g4nSYBIR5obhlKCC1erH4C_KrmuaGVu_JJpcC59yRMY,24198
|
|
6
|
-
eegdash-0.0.2.dist-info/LICENSE,sha256=Xafu48R-h_kyaNj2tuhfgdEv9_ovciktjUEgRRwMZ6w,812
|
|
7
|
-
eegdash-0.0.2.dist-info/METADATA,sha256=rreskaKqIRA0bBmHJOpwQpFLxTX9oX_YCZpk5yd1wBs,9335
|
|
8
|
-
eegdash-0.0.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
|
9
|
-
eegdash-0.0.2.dist-info/top_level.txt,sha256=zavO69HQ6MyZM0aQMR2zUS6TAFc7bnN5GEpDpOpFZzU,8
|
|
10
|
-
eegdash-0.0.2.dist-info/RECORD,,
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
eegdash
|
|
File without changes
|
|
File without changes
|
|
File without changes
|