eegdash 0.0.2__tar.gz → 0.0.3__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: eegdash
3
- Version: 0.0.2
3
+ Version: 0.0.3
4
4
  Summary: EEG data for machine learning
5
5
  Author-email: Young Truong <dt.young112@gmail.com>, Arnaud Delorme <adelorme@gmail.com>
6
6
  License: GNU General Public License
@@ -108,16 +108,16 @@ Additionally, users can search for a specific dataset by specifying criteria.
108
108
  EEGDashInstance.find({'task': 'FaceRecognition'})
109
109
  ```
110
110
 
111
- After locating the desired dataset or data record, users can download it locally by executing the following command:
111
+ After locating the desired dataset or data record, users can download it locally by executing the following command. This returns an xarray Python object.
112
112
 
113
113
  ```python
114
- EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})
114
+ XArrayData = EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})
115
115
  ```
116
116
 
117
- Optionally, this is how you may access the raw data for the first record.
117
+ Optionally, this is how you may access the raw data for the first record. This returns a NumPy array.
118
118
 
119
119
  ```python
120
- EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})[0].values
120
+ npData = EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})[0].values
121
121
  ```
122
122
 
123
123
  ## Example use
@@ -72,16 +72,16 @@ Additionally, users can search for a specific dataset by specifying criteria.
72
72
  EEGDashInstance.find({'task': 'FaceRecognition'})
73
73
  ```
74
74
 
75
- After locating the desired dataset or data record, users can download it locally by executing the following command:
75
+ After locating the desired dataset or data record, users can download it locally by executing the following command. This returns an xarray Python object.
76
76
 
77
77
  ```python
78
- EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})
78
+ XArrayData = EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})
79
79
  ```
80
80
 
81
- Optionally, this is how you may access the raw data for the first record.
81
+ Optionally, this is how you may access the raw data for the first record. This returns a NumPy array.
82
82
 
83
83
  ```python
84
- EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})[0].values
84
+ npData = EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})[0].values
85
85
  ```
86
86
 
87
87
  ## Example use
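Taken together, the two updated snippets above compose end to end. A minimal sketch, assuming the package exposes `EEGDash` from `eegdash.main` (per `src/main.py` in this release) and reusing the query values from the README's own examples:

```python
from eegdash.main import EEGDash  # import path assumed from src/main.py

EEGDashInstance = EEGDash()  # public connection (the default)

# get() returns a list of xarray DataArrays, one per matching record
records = EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})
first = records[0]      # xarray.DataArray with dims ('channel', 'time')
np_data = first.values  # the underlying NumPy array, shape (n_channels, n_times)
```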
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "eegdash"
7
- version = "0.0.2"
7
+ version = "0.0.3"
8
8
  authors = [
9
9
  { name="Young Truong", email="dt.young112@gmail.com" },
10
10
  { name="Arnaud Delorme", email="adelorme@gmail.com" },
@@ -3,11 +3,104 @@ import sys
3
3
  from joblib import Parallel, delayed
4
4
  import mne
5
5
  import numpy as np
6
+ import pandas as pd
6
7
  from pathlib import Path
7
8
  import re
8
9
  import json
10
+ from mne.io import BaseRaw
11
+ from mne._fiff.utils import _find_channels, _read_segments_file
12
+ import s3fs
13
+ import tempfile
14
+ from mne._fiff.utils import _read_segments_file
9
15
 
10
- verbose = False
16
+ class RawEEGDash(BaseRaw):
17
+ r"""Raw object from EEG-Dash connection with Openneuro S3 file.
18
+
19
+ Parameters
20
+ ----------
21
+ input_fname : path-like
22
+ Path to the S3 file
23
+ eog : list | tuple | 'auto'
24
+ Names or indices of channels that should be designated EOG channels.
25
+ If 'auto', the channel names containing ``EOG`` or ``EYE`` are used.
26
+ Defaults to empty tuple.
27
+ %(preload)s
28
+ Note that preload=False will be effective only if the data is stored
29
+ in a separate binary file.
30
+ %(uint16_codec)s
31
+ %(montage_units)s
32
+ %(verbose)s
33
+
34
+ See Also
35
+ --------
36
+ mne.io.Raw : Documentation of attributes and methods.
37
+
38
+ Notes
39
+ -----
40
+ .. versionadded:: 0.11.0
41
+ """
42
+
43
+ def __init__(
44
+ self,
45
+ input_fname,
46
+ metadata,
47
+ eog=(),
48
+ preload=False,
49
+ *,
50
+ cache_dir='./.eegdash_cache',
51
+ uint16_codec=None,
52
+ montage_units="auto",
53
+ verbose=None,
54
+ ):
55
+ '''
56
+ Work against the S3 endpoint directly; downloaded files are cached under cache_dir.
57
+ '''
58
+ # Create a simple RawArray
59
+ sfreq = metadata['sfreq'] # Sampling frequency
60
+ n_times = metadata['n_times']
61
+ ch_names = metadata['ch_names']
62
+ ch_types = []
63
+ for ch in metadata['ch_types']:
64
+ chtype = ch.lower()
65
+ if chtype == 'heog' or chtype == 'veog':
66
+ chtype = 'eog'
67
+ ch_types.append(chtype)
68
+ info = mne.create_info(ch_names=ch_names, sfreq=sfreq, ch_types=ch_types)
69
+ self.s3file = input_fname
70
+ os.makedirs(cache_dir, exist_ok=True)
71
+ self.filecache = os.path.join(cache_dir, os.path.basename(self.s3file))
72
+
73
+ if preload and not os.path.exists(self.filecache):
74
+ self._download_s3()
75
+ preload = self.filecache
76
+
77
+ super().__init__(
78
+ info,
79
+ preload,
80
+ last_samps=[n_times-1],
81
+ orig_format="single",
82
+ verbose=verbose,
83
+ )
84
+
85
+ def _download_s3(self):
86
+ filesystem = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})
87
+ print('s3file', self.s3file)
88
+ print('filecache', self.filecache)
89
+ filesystem.download(self.s3file, self.filecache)
90
+ self.filenames = [self.filecache]
91
+
92
+ def _read_segment(
93
+ self, start=0, stop=None, sel=None, data_buffer=None, *, verbose=None
94
+ ):
95
+ if not os.path.exists(self.filecache): # not preload
96
+ self._download_s3()
97
+ else: # not preload and file is not cached
98
+ self.filenames = [self.filecache]
99
+ return super()._read_segment(start, stop, sel, data_buffer, verbose=verbose)
100
+
101
+ def _read_segment_file(self, data, idx, fi, start, stop, cals, mult):
102
+ """Read a chunk of data from the file."""
103
+ _read_segments_file(self, data, idx, fi, start, stop, cals, mult, dtype="<f4")
11
104
 
12
105
 
13
106
  class BIDSDataset():
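A minimal sketch of driving the new `RawEEGDash` class, assuming a hypothetical OpenNeuro S3 key and a metadata dict carrying the four keys the constructor reads (`sfreq`, `n_times`, `ch_names`, `ch_types`); with `preload=False`, the S3 download is deferred until the first data access:

```python
from eegdash.data_utils import RawEEGDash  # import path assumed from src/data_utils.py

# Hypothetical S3 object and illustrative metadata values
s3_path = 's3://openneuro.org/ds005505/sub-01/eeg/sub-01_task-rest_eeg.set'
metadata = {
    'sfreq': 500.0,       # sampling frequency in Hz
    'n_times': 500 * 60,  # one minute of samples
    'ch_names': ['Cz', 'Pz', 'Oz'],
    'ch_types': ['eeg', 'eeg', 'eeg'],
}

raw = RawEEGDash(s3_path, metadata, preload=False)
data, times = raw[:, :1000]  # first read triggers _read_segment -> _download_s3
```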
@@ -20,7 +113,7 @@ class BIDSDataset():
20
113
  }
21
114
  METADATA_FILE_EXTENSIONS = ['eeg.json', 'channels.tsv', 'electrodes.tsv', 'events.tsv', 'events.json']
22
115
  def __init__(self,
23
- data_dir=None, # location of asr cleaned data
116
+ data_dir=None, # location of bids dataset
24
117
  dataset='', # dataset name
25
118
  raw_format='eeglab', # format of raw data
26
119
  ):
@@ -51,6 +144,18 @@ class BIDSDataset():
51
144
  lookup = re.search(rf'{property}-(.*?)[_\/]', filename)
52
145
  return lookup.group(1) if lookup else ''
53
146
 
147
+ def merge_json_inheritance(self, json_files):
148
+ '''
149
+ Merge list of json files found by get_bids_file_inheritance,
150
+ expecting the order (from left to right) is from lowest level to highest level,
151
+ and return a merged dictionary
152
+ '''
153
+ json_files.reverse()
154
+ json_dict = {}
155
+ for f in json_files:
156
+ json_dict.update(json.load(open(f)))
157
+ return json_dict
158
+
54
159
  def get_bids_file_inheritance(self, path, basename, extension):
55
160
  '''
56
161
  Get all files with given extension that applies to the basename file
@@ -72,7 +177,7 @@ class BIDSDataset():
72
177
  for file in os.listdir(path):
73
178
  # target_file = path / f"{cur_file_basename}_{extension}"
74
179
  if os.path.isfile(path/file):
75
- cur_file_basename = file[:file.rfind('_')]
180
+ cur_file_basename = file[:file.rfind('_')] # TODO: change to just search for any file with extension
76
181
  if file.endswith(extension) and cur_file_basename in basename:
77
182
  filepath = path / file
78
183
  bids_files.append(filepath)
@@ -214,4 +319,21 @@ class BIDSDataset():
214
319
  return self.get_property_from_filename('run', data_filepath)
215
320
 
216
321
  def subject(self, data_filepath):
217
- return self.get_property_from_filename('sub', data_filepath)
322
+ return self.get_property_from_filename('sub', data_filepath)
323
+
324
+ def num_channels(self, data_filepath):
325
+ channels_tsv = pd.read_csv(self.get_bids_metadata_files(data_filepath, 'channels.tsv')[0], sep='\t')
326
+ return len(channels_tsv)
327
+
328
+ def channel_labels(self, data_filepath):
329
+ channels_tsv = pd.read_csv(self.get_bids_metadata_files(data_filepath, 'channels.tsv')[0], sep='\t')
330
+ return channels_tsv['name'].tolist()
331
+
332
+ def channel_types(self, data_filepath):
333
+ channels_tsv = pd.read_csv(self.get_bids_metadata_files(data_filepath, 'channels.tsv')[0], sep='\t')
334
+ return channels_tsv['type'].tolist()
335
+
336
+ def num_times(self, data_filepath):
337
+ eeg_jsons = self.get_bids_metadata_files(data_filepath, 'eeg.json')
338
+ eeg_json_dict = self.merge_json_inheritance(eeg_jsons)
339
+ return int(eeg_json_dict['SamplingFrequency'] * eeg_json_dict['RecordingDuration'])
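The new `num_times` helper leans on `merge_json_inheritance`, which is a reversed `dict.update()` chain: the files arrive ordered from most to least specific, and reversing before merging lets the more specific levels overwrite dataset-level defaults. A self-contained illustration with the file contents inlined as dicts (all values invented):

```python
# Ordered as get_bids_file_inheritance returns them: lowest (most specific) level first
levels = [
    {'SamplingFrequency': 500, 'RecordingDuration': 62.5},  # subject-level eeg.json
    {'SamplingFrequency': 512, 'EEGReference': 'Cz'},       # dataset-level eeg.json
]
merged = {}
for d in reversed(levels):  # apply the dataset level first...
    merged.update(d)        # ...so the subject level wins on conflicts
assert merged['SamplingFrequency'] == 500
n_times = int(merged['SamplingFrequency'] * merged['RecordingDuration'])  # 31250
```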
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: eegdash
3
- Version: 0.0.2
3
+ Version: 0.0.3
4
4
  Summary: EEG data for machine learning
5
5
  Author-email: Young Truong <dt.young112@gmail.com>, Arnaud Delorme <adelorme@gmail.com>
6
6
  License: GNU General Public License
@@ -108,16 +108,16 @@ Additionally, users can search for a specific dataset by specifying criteria.
108
108
  EEGDashInstance.find({'task': 'FaceRecognition'})
109
109
  ```
110
110
 
111
- After locating the desired dataset or data record, users can download it locally by executing the following command:
111
+ After locating the desired dataset or data record, users can download it locally by executing the following command. This returns an xarray Python object.
112
112
 
113
113
  ```python
114
- EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})
114
+ XArrayData = EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})
115
115
  ```
116
116
 
117
- Optionally, this is how you may access the raw data for the first record.
117
+ Optionally, this is how you may access the raw data for the first record. This returns a NumPy array.
118
118
 
119
119
  ```python
120
- EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})[0].values
120
+ npData = EEGDashInstance.get({'task': 'FaceRecognition', 'subject': '019'})[0].values
121
121
  ```
122
122
 
123
123
  ## Example use
@@ -0,0 +1,11 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/__init__.py
5
+ src/data_utils.py
6
+ src/main.py
7
+ src/eegdash.egg-info/PKG-INFO
8
+ src/eegdash.egg-info/SOURCES.txt
9
+ src/eegdash.egg-info/dependency_links.txt
10
+ src/eegdash.egg-info/requires.txt
11
+ src/eegdash.egg-info/top_level.txt
@@ -0,0 +1,4 @@
1
+ DomainModels
2
+ __init__
3
+ data_utils
4
+ main
@@ -0,0 +1,199 @@
1
+ import pymongo
2
+ from dotenv import load_dotenv
3
+ import os
4
+ import s3fs
5
+ from joblib import Parallel, delayed
6
+ import tempfile
7
+ import mne
8
+ import numpy as np
9
+ import xarray as xr
10
+ from .data_utils import BIDSDataset
11
+ class EEGDash:
12
+ AWS_BUCKET = 's3://openneuro.org'
13
+ def __init__(self,
14
+ is_public=True):
15
+ if is_public:
16
+ DB_CONNECTION_STRING="mongodb+srv://eegdash-user:mdzoMjQcHWTVnKDq@cluster0.vz35p.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
17
+ else:
18
+ load_dotenv()
19
+ DB_CONNECTION_STRING = os.getenv('DB_CONNECTION_STRING')
20
+
21
+ self.__client = pymongo.MongoClient(DB_CONNECTION_STRING)
22
+ self.__db = self.__client['eegdash']
23
+ self.__collection = self.__db['records']
24
+
25
+ self.is_public = is_public
26
+ self.filesystem = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})
27
+
28
+ def find(self, *args):
29
+ results = self.__collection.find(*args)
30
+
31
+ # convert to list using get_item on each element
32
+ return [result for result in results]
33
+
34
+ def exist(self, schema_ref='eeg_signal', data_name=''):
35
+ query = {
36
+ "schema_ref": schema_ref,
37
+ "data_name": data_name
38
+ }
39
+ sessions = self.find(query)
40
+ return len(sessions) > 0
41
+
42
+ def add(self, record:dict):
43
+ input_record = self._validate_input(record)
44
+ print(input_record)
45
+ self.__collection.insert_one(input_record)
46
+
47
+ def _validate_input(self, record:dict):
48
+ input_types = {
49
+ 'schema_ref': str,
50
+ 'data_name': str,
51
+ 'dataset': str,
52
+ 'bidspath': str,
53
+ 'subject': str,
54
+ 'task': str,
55
+ 'session': str,
56
+ 'run': str,
57
+ 'sampling_frequency': float,
58
+ 'modality': str,
59
+ 'nchans': int,
60
+ 'ntimes': int,
61
+ 'channel_types': list,
62
+ 'channel_names': list,
63
+ }
64
+ record['schema_ref'] = 'eeg_signal'
65
+ if 'data_name' not in record:
66
+ raise ValueError("Missing key: data_name")
67
+ # check if args are in the keys and has correct type
68
+ for key,value in record.items():
69
+ if key not in input_types:
70
+ raise ValueError(f"Invalid input: {key}")
71
+ if not isinstance(value, input_types[key]):
72
+ raise ValueError(f"Invalid input: {key}")
73
+
74
+ return record
75
+
76
+ def load_eeg_data_from_s3(self, s3path):
77
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.set') as tmp:
78
+ with self.filesystem.open(s3path) as s3_file:
79
+ tmp.write(s3_file.read())
80
+ tmp_path = tmp.name
81
+ eeg_data = self.load_eeg_data_from_bids_file(tmp_path)
82
+ os.unlink(tmp_path)
83
+ return eeg_data
84
+
85
+ def load_eeg_data_from_bids_file(self, bids_file, eeg_attrs=None):
86
+ '''
87
+ bids_file must be a file of the bids_dataset
88
+ '''
89
+ EEG = mne.io.read_raw_eeglab(bids_file)
90
+ eeg_data = EEG.get_data()
91
+
92
+ fs = EEG.info['sfreq']
93
+ max_time = eeg_data.shape[1] / fs
94
+ time_steps = np.linspace(0, max_time, eeg_data.shape[1]).squeeze() # in seconds
95
+
96
+ channel_names = EEG.ch_names
97
+
98
+ eeg_xarray = xr.DataArray(
99
+ data=eeg_data,
100
+ dims=['channel','time'],
101
+ coords={
102
+ 'time': time_steps,
103
+ 'channel': channel_names
104
+ },
105
+ # attrs=attrs
106
+ )
107
+ return eeg_xarray
108
+
109
+ def load_eeg_attrs_from_bids_file(self, bids_dataset: BIDSDataset, bids_file):
110
+ '''
111
+ bids_file must be a file of the bids_dataset
112
+ '''
113
+ if bids_file not in bids_dataset.files:
114
+ raise ValueError(f'{bids_file} not in {bids_dataset.dataset}')
115
+ f = os.path.basename(bids_file)
116
+ dsnumber = bids_dataset.dataset
117
+ # extract the openneuro path by finding the first occurrence of the dataset name in the filename and removing the path before it
118
+ openneuro_path = dsnumber + bids_file.split(dsnumber)[1]
119
+
120
+ attrs = {
121
+ 'schema_ref': 'eeg_signal',
122
+ 'data_name': f'{bids_dataset.dataset}_{f}',
123
+ 'dataset': bids_dataset.dataset,
124
+ 'bidspath': openneuro_path,
125
+ 'subject': bids_dataset.subject(bids_file),
126
+ 'nchans': bids_dataset.num_channels(bids_file),
127
+ 'ntimes': bids_dataset.num_times(bids_file),
128
+ 'channel_types': bids_dataset.channel_types(bids_file),
129
+ 'channel_names': bids_dataset.channel_labels(bids_file),
130
+ 'task': bids_dataset.task(bids_file),
131
+ 'session': bids_dataset.session(bids_file),
132
+ 'run': bids_dataset.run(bids_file),
133
+ 'sampling_frequency': bids_dataset.sfreq(bids_file),
134
+ 'modality': 'EEG',
135
+ }
136
+
137
+ return attrs
138
+
139
+ def add_bids_dataset(self, dataset, data_dir, raw_format='eeglab', overwrite=True):
140
+ '''
141
+ Create new records for the dataset in the MongoDB database if not found
142
+ '''
143
+ if self.is_public:
144
+ raise ValueError('This operation is not allowed for public users')
145
+
146
+ bids_dataset = BIDSDataset(
147
+ data_dir=data_dir,
148
+ dataset=dataset,
149
+ raw_format=raw_format,
150
+ )
151
+ for bids_file in bids_dataset.get_files():
152
+ print('bids raw file', bids_file)
153
+
154
+ signalstore_data_id = f"{dataset}_{os.path.basename(bids_file)}"
155
+
156
+ if self.exist(data_name=signalstore_data_id):
157
+ if overwrite:
158
+ eeg_attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
159
+ print('updating record', eeg_attrs['data_name'])
160
+ self.update(eeg_attrs)
161
+ else:
162
+ print('data already exist and not overwriting. skipped')
163
+ continue
164
+ else:
165
+ eeg_attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
166
+ # Assume raw data already exists on Openneuro, recreating record only
167
+ print('adding record', eeg_attrs['data_name'])
168
+ self.add(eeg_attrs)
169
+
170
+ def get_s3path(self, record):
171
+ return f"{self.AWS_BUCKET}/{record['bidspath']}"
172
+
173
+ def get(self, query:dict):
174
+ '''
175
+ query: {
176
+ 'dataset': 'dsxxxx',
177
+
178
+ }'''
179
+ sessions = self.find(query)
180
+ results = []
181
+ if sessions:
182
+ print(f'Found {len(sessions)} records')
183
+ results = Parallel(n_jobs=-1 if len(sessions) > 1 else 1, prefer="threads", verbose=1)(
184
+ delayed(self.load_eeg_data_from_s3)(self.get_s3path(session)) for session in sessions
185
+ )
186
+ return results
187
+
188
+ def update(self, record:dict):
189
+ record['schema_ref'] = 'eeg_signal'
190
+ self.__collection.update_one({'schema_ref': record['schema_ref'], 'data_name': record['data_name']},
191
+ {'$set': record}
192
+ )
193
+ def main():
194
+ eegdash = EEGDash()
195
+ record = eegdash.find({'dataset': 'ds005511', 'subject': 'NDARUF236HM7'})
196
+ print(record)
197
+
198
+ if __name__ == '__main__':
199
+ main()
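For the non-public ingestion path, a sketch of a record that would pass `_validate_input`. The field names and types are copied from the `input_types` table above; every value is invented for illustration:

```python
record = {
    'data_name': 'ds005505_sub-01_task-rest_eeg.set',  # the one required key
    'dataset': 'ds005505',
    'bidspath': 'ds005505/sub-01/eeg/sub-01_task-rest_eeg.set',
    'subject': '01',
    'task': 'rest',
    'session': '',
    'run': '',
    'sampling_frequency': 500.0,  # must be a float, not an int
    'modality': 'EEG',
    'nchans': 3,
    'ntimes': 30000,
    'channel_names': ['Cz', 'Pz', 'Oz'],
    'channel_types': ['eeg', 'eeg', 'eeg'],
}
# EEGDash(is_public=False).add(record)  # sets schema_ref and inserts after validation
```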
@@ -1,17 +0,0 @@
1
- from eegdash.signalstore_data_utils import SignalstoreBIDS
2
-
3
- class EEGDash:
4
- def __init__(self):
5
- self.sstore = SignalstoreBIDS(
6
- # dbconnectionstring='mongodb://127.0.0.1:27017/?directConnection=true&serverSelectionTimeoutMS=2000&appName=mongosh+2.3.1',
7
- dbconnectionstring='mongodb+srv://eegdash-user:mdzoMjQcHWTVnKDq@cluster0.vz35p.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0',
8
- is_public=True,
9
- local_filesystem=False,
10
- project_name='eegdash'
11
- )
12
-
13
- def find(self, *args):
14
- return self.sstore.find(*args)
15
-
16
- def get(self, *args):
17
- return self.sstore.get(*args)
@@ -1,25 +0,0 @@
1
- import argparse
2
- from signalstore_data_utils import SignalstoreOpenneuro
3
-
4
- def main():
5
- # Create the parser
6
- parser = argparse.ArgumentParser(description="A simple command line argument parser")
7
-
8
- # Add arguments
9
- parser.add_argument('--data', type=str, default="/mnt/nemar/openneuro/ds004186", help="Path to data directory (Default: /mnt/nemar/openneuro/ds004186)")
10
- parser.add_argument('--dataset', type=str, default="ds004186", help="Dataset name (Default: ds004186)")
11
-
12
- # Parse the arguments
13
- args = parser.parse_args()
14
- print('Arguments:', args)
15
-
16
- signalstore = SignalstoreOpenneuro(
17
- is_public=False,
18
- local_filesystem=False,
19
- )
20
- hbn_datasets = ['ds005505', 'ds005510', 'ds005514','ds005512','ds005511','ds005509','ds005508','ds005507','ds005506']
21
- for ds in hbn_datasets:
22
- signalstore.add_bids_dataset(dataset=ds, data_dir=f'/mnt/nemar/openneuro/{ds}', raw_format='eeglab')
23
-
24
- if __name__ == "__main__":
25
- main()
@@ -1,630 +0,0 @@
1
- from pathlib import Path
2
- from dotenv import load_dotenv
3
- import re
4
- import numpy as np
5
- import xarray as xr
6
- import os
7
- from signalstore.store import UnitOfWorkProvider
8
- # from mongomock import MongoClient
9
- from pymongo.mongo_client import MongoClient
10
- from pymongo.server_api import ServerApi
11
- from fsspec.implementations.local import LocalFileSystem
12
- from fsspec.implementations.dirfs import DirFileSystem
13
- import pandas as pd
14
- import json
15
- import s3fs
16
- from signalstore.store.data_access_objects import FileSystemDAO
17
- from .data_utils import BIDSDataset
18
- import tempfile
19
- import mne
20
- from joblib import Parallel, delayed
21
-
22
- class SignalstoreOpenneuro():
23
- AWS_BUCKET = 'openneuro.org'
24
- PROJECT_NAME = 'eegdash'
25
- def __init__(self,
26
- dbconnectionstring="mongodb://127.0.0.1:27017/?directConnection=true&serverSelectionTimeoutMS=2000&appName=mongosh+2.3.1",
27
- is_public=False,
28
- local_filesystem=True,
29
- ):
30
- self.is_public = is_public
31
- self.project_name = self.PROJECT_NAME
32
- if is_public:
33
- dbconnectionstring='mongodb+srv://eegdash-user:mdzoMjQcHWTVnKDq@cluster0.vz35p.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0',
34
- else:
35
- load_dotenv()
36
- dbconnectionstring = os.getenv('DB_CONNECTION_STRING')
37
-
38
- # Create a new client and connect to the server
39
- client = MongoClient(dbconnectionstring, server_api=ServerApi('1'))
40
- # Send a ping to confirm a successful connection
41
- try:
42
- client.admin.command('ping')
43
- print("Pinged your deployment. You successfully connected to MongoDB!")
44
- except Exception as e:
45
- print(e)
46
-
47
- memory_store = {}
48
- self.filesystem = self.set_up_filesystem(is_local=local_filesystem)
49
- self.uow_provider = UnitOfWorkProvider(
50
- mongo_client=client,
51
- filesystem=self.filesystem,
52
- memory_store=memory_store,
53
- default_filetype='zarr'
54
- )
55
-
56
- self.uow = self.uow_provider(self.PROJECT_NAME)
57
- self.load_domain_models()
58
-
59
- def set_up_filesystem(self, is_local=True):
60
- if is_local:
61
- cache_path='/mnt/nemar/dtyoung/eeg-dash-data' # path where signalstore netCDF files are stored
62
- # Create a directory for the dataset
63
- store_path = Path(cache_path)
64
- if not os.path.exists(store_path):
65
- os.makedirs(store_path)
66
-
67
- filesystem = LocalFileSystem()
68
- tmp_dir_fs = DirFileSystem(
69
- store_path,
70
- filesystem=filesystem
71
- )
72
- return tmp_dir_fs
73
- else:
74
- s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})
75
- return s3
76
-
77
- def load_domain_models(self):
78
- dir_path = os.path.dirname(os.path.realpath(__file__))
79
- cwd = Path(dir_path)
80
- domain_models_path = cwd / f"DomainModels/{self.project_name}/data_models.json"
81
- metamodel_path = cwd / f"DomainModels/{self.project_name}/metamodels.json"
82
- property_path = cwd / f"DomainModels/{self.project_name}/property_models.json"
83
- with open(metamodel_path) as f:
84
- metamodels = json.load(f)
85
-
86
- with open(property_path) as f:
87
- property_models = json.load(f)
88
-
89
- # load domain models json file
90
- with open(domain_models_path) as f:
91
- domain_models = json.load(f)
92
-
93
- with self.uow as uow:
94
- for property_model in property_models:
95
- if not uow.domain_models.exists(property_model['schema_name']):
96
- uow.domain_models.add(property_model)
97
- model = uow.domain_models.get(property_model['schema_name'])
98
- print('property model: ', model['schema_name'])
99
- for metamodel in metamodels:
100
- if not uow.domain_models.exists(metamodel['schema_name']):
101
- uow.domain_models.add(metamodel)
102
- model = uow.domain_models.get(metamodel['schema_name'])
103
- print('meta model: ', model['schema_name'])
104
- for domain_model in domain_models:
105
- if not uow.domain_models.exists(domain_model['schema_name']):
106
- uow.domain_models.add(domain_model)
107
- model = uow.domain_models.get(domain_model['schema_name'])
108
- print('domain model: ', model['schema_name'])
109
- uow.commit()
110
-
111
- def extract_attribute(self, pattern, filename):
112
- match = re.search(pattern, filename)
113
- return match.group(1) if match else None
114
-
115
- def load_eeg_attrs_from_bids_file(self, bids_dataset: BIDSDataset, bids_file):
116
- '''
117
- bids_file must be a file of the bids_dataset
118
- '''
119
- if bids_file not in bids_dataset.files:
120
- raise ValueError(f'{bids_file} not in {bids_dataset.dataset}')
121
- f = os.path.basename(bids_file)
122
- dsnumber = bids_dataset.dataset
123
- # extract openneuro path by finding the first occurrence of the dataset name in the filename and remove the path before that
124
- openneuro_path = dsnumber + bids_file.split(dsnumber)[1]
125
-
126
- attrs = {
127
- 'schema_ref': 'eeg_signal',
128
- 'data_name': f'{bids_dataset.dataset}_{f}',
129
- 'dataset': bids_dataset.dataset,
130
- 'bidspath': openneuro_path,
131
- 'subject': bids_dataset.subject(bids_file),
132
- 'task': bids_dataset.task(bids_file),
133
- 'session': bids_dataset.session(bids_file),
134
- 'run': bids_dataset.run(bids_file),
135
- 'sampling_frequency': bids_dataset.sfreq(bids_file),
136
- 'modality': 'EEG',
137
- }
138
-
139
- return attrs
140
-
141
- def load_eeg_data_from_s3(self, s3path):
142
- # import boto3
143
- # import scipy.io
144
- # import io
145
-
146
-
147
- # # Initialize the S3 client
148
- # s3 = boto3.client('s3')
149
-
150
-
151
- # # S3 bucket and object key
152
- # bucket_name = 'your-bucket-name'
153
- # object_key = 'path/to/your/file.mat'
154
-
155
-
156
- # # Get the object from S3 and stream it into memory
157
- # response = s3.get_object(Bucket=bucket_name, Key=object_key)
158
-
159
-
160
- # # Read the content into a BytesIO buffer
161
- # mat_file_stream = io.BytesIO(response['Body'].read())
162
-
163
-
164
- # # Load the MAT file using scipy.io.loadmat
165
- # data = scipy.io.loadmat(mat_file_stream)
166
-
167
-
168
- # # Work with the data
169
- # print(data)
170
- with tempfile.NamedTemporaryFile(delete=False, suffix='.set') as tmp:
171
- with self.filesystem.open(s3path) as s3_file:
172
- tmp.write(s3_file.read())
173
- tmp_path = tmp.name
174
- eeg_data = self.load_eeg_data_from_bids_file(tmp_path)
175
- os.unlink(tmp_path)
176
- return eeg_data
177
-
178
- def load_eeg_data_from_bids_file(self, bids_file, eeg_attrs=None):
179
- '''
180
- bids_file must be a file of the bids_dataset
181
- '''
182
- EEG = mne.io.read_raw_eeglab(bids_file)
183
- eeg_data = EEG.get_data()
184
-
185
- fs = EEG.info['sfreq']
186
- max_time = eeg_data.shape[1] / fs
187
- time_steps = np.linspace(0, max_time, eeg_data.shape[1]).squeeze() # in seconds
188
-
189
- channel_names = EEG.ch_names
190
-
191
- eeg_xarray = xr.DataArray(
192
- data=eeg_data,
193
- dims=['channel','time'],
194
- coords={
195
- 'time': time_steps,
196
- 'channel': channel_names
197
- },
198
- # attrs=attrs
199
- )
200
- return eeg_xarray
201
-
202
- def exist(self, schema_ref='eeg_signal', data_name=''):
203
- with self.uow as uow:
204
- query = {
205
- "schema_ref": schema_ref,
206
- "data_name": data_name
207
- }
208
- sessions = uow.data.find(query)
209
- if len(sessions) > 0:
210
- return True
211
- else:
212
- return False
213
-
214
- def add_bids_dataset(self, dataset, data_dir, raw_format='eeglab', overwrite=False):
215
- '''
216
- Create new records for the dataset in the MongoDB database if not found
217
- '''
218
- if self.is_public:
219
- raise ValueError('This operation is not allowed for public users')
220
-
221
- bids_dataset = BIDSDataset(
222
- data_dir=data_dir,
223
- dataset=dataset,
224
- raw_format=raw_format,
225
- )
226
- for bids_file in bids_dataset.get_files():
227
- print('bids raw file', bids_file)
228
-
229
- signalstore_data_id = f"{dataset}_{os.path.basename(bids_file)}"
230
- if overwrite:
231
- self.remove(signalstore_data_id)
232
-
233
- if self.exist(data_name=signalstore_data_id):
234
- print('data already exist. skipped')
235
- continue
236
- else:
237
- eeg_attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
238
- with self.uow as uow:
239
- # Assume raw data already exists on Openneuro, recreating record only
240
- eeg_attrs['has_file'] = True
241
- print('adding record', eeg_attrs['data_name'])
242
- uow.data.add(eeg_attrs)
243
- uow.commit()
244
-
245
- def update_bids_dataset(self, dataset, data_dir, raw_format='eeglab'):
246
- '''
247
- Create new records for the dataset in the MongoDB database if not found
248
- '''
249
- if self.is_public:
250
- raise ValueError('This operation is not allowed for public users')
251
-
252
- bids_dataset = BIDSDataset(
253
- data_dir=data_dir,
254
- dataset=dataset,
255
- raw_format=raw_format,
256
- )
257
- for bids_file in bids_dataset.get_files():
258
- print('bids raw file', bids_file)
259
-
260
- signalstore_data_id = f"{dataset}_{os.path.basename(bids_file)}"
261
-
262
- if not self.exist(data_name=signalstore_data_id):
263
- raise ValueError('data not found')
264
- else:
265
- self.remove(data_name=signalstore_data_id)
266
-
267
- eeg_attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
268
- with self.uow as uow:
269
- # Assume raw data already exists on Openneuro, recreating record only
270
- eeg_attrs['has_file'] = True
271
- print('adding record', eeg_attrs['data_name'])
272
- uow.data.add(eeg_attrs)
273
- uow.commit()
274
-
275
- def remove(self, schema_ref='eeg_signal', data_name=''):
276
- if self.is_public:
277
- raise ValueError('This operation is not allowed for public users')
278
-
279
- print('Removing record', data_name)
280
- with self.uow as uow:
281
- sessions = uow.data.find({'schema_ref': schema_ref, 'data_name': data_name})
282
- if len(sessions) > 0:
283
- for session in sessions:
284
- uow.data.remove(session['schema_ref'], session['data_name'])
285
- uow.commit()
286
- uow.purge()
287
- assert len(uow.data.find({'schema_ref': schema_ref, 'data_name': data_name})) == 0, 'Data still exists'
288
-
289
- def remove_all(self):
290
- if self.is_public:
291
- raise ValueError('This operation is not allowed for public users')
292
-
293
- with self.uow as uow:
294
- sessions = uow.data.find({})
295
- print(len(sessions))
296
- for session in range(len(sessions)):
297
- uow.data.remove(session['schema_ref'], session['data_name'])
298
- uow.commit()
299
-
300
- uow.purge()
301
-
302
- print('Verifying deletion job. Dataset length: ', len(uow.data.find({})))
303
-
304
- def find(self, query:dict, validate=False):
305
- '''
306
- query: {
307
- 'dataset': 'dsxxxx',
308
-
309
- }'''
310
- with self.uow as uow:
311
- sessions = uow.data.find(query, validate=validate)
312
- if sessions:
313
- print(f'Found {len(sessions)} records')
314
- return sessions
315
- else:
316
- return []
317
-
318
- def get(self, query:dict, validate=False):
319
- '''
320
- query: {
321
- 'dataset': 'dsxxxx',
322
-
323
- }'''
324
- with self.uow as uow:
325
- sessions = uow.data.find(query, validate=validate)
326
- results = []
327
- if sessions:
328
- print(f'Found {len(sessions)} records')
329
- results = Parallel(n_jobs=-1, prefer="threads", verbose=1)(
330
- delayed(self.load_eeg_data_from_s3)(Path(self.AWS_BUCKET) / session['bidspath']) for session in sessions
331
- )
332
- return results
333
-
334
- class SignalstoreBIDS():
335
- AWS_BUCKET = 'eegdash'
336
- def __init__(self,
337
- project_name=AWS_BUCKET,
338
- dbconnectionstring="mongodb://127.0.0.1:27017/?directConnection=true&serverSelectionTimeoutMS=2000&appName=mongosh+2.3.1",
339
- is_public=False,
340
- local_filesystem=True,
341
- ):
342
- self.is_public = is_public
343
- if is_public:
344
- dbconnectionstring='mongodb+srv://eegdash-user:mdzoMjQcHWTVnKDq@cluster0.vz35p.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0',
345
- else:
346
- load_dotenv()
347
- dbconnectionstring = os.getenv('DB_CONNECTION_STRING')
348
-
349
- # Create a new client and connect to the server
350
- client = MongoClient(dbconnectionstring, server_api=ServerApi('1'))
351
- # Send a ping to confirm a successful connection
352
- try:
353
- client.admin.command('ping')
354
- print("Pinged your deployment. You successfully connected to MongoDB!")
355
- except Exception as e:
356
- print(e)
357
-
358
- memory_store = {}
359
- filesystem = self.set_up_filesystem(is_local=local_filesystem)
360
- self.uow_provider = UnitOfWorkProvider(
361
- mongo_client=client,
362
- filesystem=filesystem,
363
- memory_store=memory_store,
364
- default_filetype='zarr'
365
- )
366
-
367
- self.project_name=project_name
368
- self.uow = self.uow_provider(self.project_name)
369
- # self.load_domain_models()
370
-
371
- def set_up_filesystem(self, is_local=True):
372
- if is_local:
373
- cache_path='/mnt/nemar/dtyoung/eeg-ssl-data' # path where signalstore netCDF files are stored
374
- # Create a directory for the dataset
375
- store_path = Path(cache_path)
376
- if not os.path.exists(store_path):
377
- os.makedirs(store_path)
378
-
379
- filesystem = LocalFileSystem()
380
- tmp_dir_fs = DirFileSystem(
381
- store_path,
382
- filesystem=filesystem
383
- )
384
- return tmp_dir_fs
385
- else:
386
- if self.is_public:
387
- s3 = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})
388
- else:
389
- s3 = s3fs.S3FileSystem(client_kwargs={'region_name': 'us-east-2'})
390
- return s3
391
-
392
- def load_domain_models(self):
393
- cwd = Path.cwd()
394
- domain_models_path = cwd / f"DomainModels/{self.project_name}/data_models.json"
395
- metamodel_path = cwd / f"DomainModels/{self.project_name}/metamodels.json"
396
- property_path = cwd / f"DomainModels/{self.project_name}/property_models.json"
397
- with open(metamodel_path) as f:
398
- metamodels = json.load(f)
399
-
400
- with open(property_path) as f:
401
- property_models = json.load(f)
402
-
403
- # load domain models json file
404
- with open(domain_models_path) as f:
405
- domain_models = json.load(f)
406
-
407
- with self.uow as uow:
408
- for property_model in property_models:
409
- uow.domain_models.add(property_model)
410
- model = uow.domain_models.get(property_model['schema_name'])
411
- print('property model: ', model['schema_name'])
412
- for metamodel in metamodels:
413
- uow.domain_models.add(metamodel)
414
- model = uow.domain_models.get(metamodel['schema_name'])
415
- print('meta model: ', model['schema_name'])
416
- for domain_model in domain_models:
417
- uow.domain_models.add(domain_model)
418
- model = uow.domain_models.get(domain_model['schema_name'])
419
- print('domain model: ', model['schema_name'])
420
- uow.commit()
421
-
422
- def extract_attribute(self, pattern, filename):
423
- match = re.search(pattern, filename)
424
- return match.group(1) if match else None
425
-
426
- def load_eeg_attrs_from_bids_file(self, bids_dataset: BIDSDataset, bids_file):
427
- '''
428
- bids_file must be a file of the bids_dataset
429
- '''
430
- if bids_file not in bids_dataset.files:
431
- raise ValueError(f'{bids_file} not in {bids_dataset.dataset}')
432
- f = os.path.basename(bids_file)
433
- attrs = {
434
- 'schema_ref': 'eeg_signal',
435
- 'data_name': f'{bids_dataset.dataset}_{f}',
436
- 'dataset': bids_dataset.dataset,
437
- 'subject': bids_dataset.subject(bids_file),
438
- 'task': bids_dataset.task(bids_file),
439
- 'session': bids_dataset.session(bids_file),
440
- 'run': bids_dataset.run(bids_file),
441
- 'sampling_frequency': bids_dataset.sfreq(bids_file),
442
- 'modality': 'EEG',
443
- }
444
-
445
- return attrs
446
-
447
- def load_eeg_data_from_bids_file(self, bids_dataset: BIDSDataset, bids_file, eeg_attrs=None):
448
- '''
449
- bids_file must be a file of the bids_dataset
450
- '''
451
- if bids_file not in bids_dataset.files:
452
- raise ValueError(f'{bids_file} not in {bids_dataset.dataset}')
453
-
454
- attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file) if eeg_attrs is None else eeg_attrs
455
-
456
- eeg_data = bids_dataset.load_and_preprocess_raw(bids_file)
457
- print('data shape:', eeg_data.shape)
458
-
459
- fs = attrs['sampling_frequency']
460
- max_time = eeg_data.shape[1] / fs
461
- time_steps = np.linspace(0, max_time, eeg_data.shape[1]).squeeze() # in seconds
462
- # print('time steps', len(time_steps))
463
-
464
- # replace eeg.set with channels.tsv
465
- # todo this is still a hacky way
466
- channels_tsv = bids_dataset.get_bids_metadata_files(bids_file, 'channels.tsv')
467
- channels_tsv = Path(channels_tsv[0])
468
- if channels_tsv.exists():
469
- channels = pd.read_csv(channels_tsv, sep='\t')
470
- # get channel names from channel_coords
471
- channel_names = channels['name'].values
472
-
473
- eeg_xarray = xr.DataArray(
474
- data=eeg_data,
475
- dims=['channel','time'],
476
- coords={
477
- 'time': time_steps,
478
- 'channel': channel_names
479
- },
480
- attrs=attrs
481
- )
482
- return eeg_xarray
483
-
484
- def exist(self, schema_ref='eeg_signal', data_name=''):
485
- with self.uow as uow:
486
- query = {
487
- "schema_ref": schema_ref,
488
- "data_name": data_name
489
- }
490
- sessions = uow.data.find(query)
491
- if len(sessions) > 0:
492
- return True
493
- else:
494
- return False
495
-
496
- def add_bids_dataset(self, dataset, data_dir, raw_format='eeglab', overwrite=False, record_only=False):
497
- if self.is_public:
498
- raise ValueError('This operation is not allowed for public users')
499
-
500
- bids_dataset = BIDSDataset(
501
- data_dir=data_dir,
502
- dataset=dataset,
503
- raw_format=raw_format,
504
- )
505
- for bids_file in bids_dataset.get_files():
506
- print('bids raw file', bids_file)
507
-
508
- signalstore_data_id = f"{dataset}_{os.path.basename(bids_file)}"
509
- if overwrite:
510
- self.remove(signalstore_data_id)
511
-
512
- if self.exist(data_name=signalstore_data_id):
513
- print('data already exist. skipped')
514
- continue
515
- else:
516
- eeg_attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
517
- with self.uow as uow:
518
- # Assume raw data already exists, recreating record only
519
- eeg_attrs['has_file'] = True
520
- print('adding record', eeg_attrs['data_name'])
521
- uow.data.add(eeg_attrs)
522
- uow.commit()
523
- if not record_only:
524
- eeg_xarray = self.load_eeg_data_from_bids_file(bids_dataset, bids_file, eeg_attrs)
525
- with self.uow as uow:
526
- print('adding data', eeg_xarray.attrs['data_name'])
527
- uow.data.add(eeg_xarray)
528
- uow.commit()
529
-
530
- def remove(self, schema_ref='eeg_signal', data_name=''):
531
- if self.is_public:
532
- raise ValueError('This operation is not allowed for public users')
533
-
534
- with self.uow as uow:
535
- sessions = uow.data.find({'schema_ref': schema_ref, 'data_name': data_name})
536
- if len(session) > 0:
537
- for session in range(len(sessions)):
538
- uow.data.remove(session['schema_ref'], session['data_name'])
539
- uow.commit()
540
-
541
- def remove_all(self):
542
- if self.is_public:
543
- raise ValueError('This operation is not allowed for public users')
544
-
545
- with self.uow as uow:
546
- sessions = uow.data.find({})
547
- print(len(sessions))
548
- for session in range(len(sessions)):
549
- uow.data.remove(session['schema_ref'], session['data_name'])
550
- uow.commit()
551
-
552
- uow.purge()
553
-
554
- print('Verifying deletion job. Dataset length: ', len(uow.data.find({})))
555
-
556
- def find(self, query:dict, validate=False, get_data=False):
557
- '''
558
- query: {
559
- 'dataset': 'dsxxxx',
560
-
561
- }'''
562
- with self.uow as uow:
563
- sessions = uow.data.find(query, validate=validate, get_data=get_data)
564
- if sessions:
565
- print(f'Found {len(sessions)} records')
566
- return sessions
567
- else:
568
- return []
569
-
570
- def get(self, query:dict, validate=False):
571
- '''
572
- query: {
573
- 'dataset': 'dsxxxx',
574
-
575
- }'''
576
- with self.uow as uow:
577
- sessions = uow.data.find(query, validate=validate, get_data=True)
578
- if sessions:
579
- print(f'Found {len(sessions)} records')
580
- return sessions
581
- else:
582
- return []
583
-
584
- class OpenneuroFileSystemDAO(FileSystemDAO):
585
- def __init__(self):
586
- filesystem = s3fs.S3FileSystem(anon=True, client_kwargs={'region_name': 'us-east-2'})
587
- super().__init__(filesystem, project_dir='openneuro.org')
588
-
589
- def get(self, schema_ref, data_name, version_timestamp=0, nth_most_recent=1, data_adapter=None):
590
- """Gets an object from the Openneuro S3 bucket.
591
- Arguments:
592
- schema_ref {str} -- The type of object to get.
593
- data_name {str} -- The name of the object to get.
594
- version_timestamp {str} -- The version_timestamp of the object to get.
595
- Raises:
596
- FileSystemDAOFileNotFoundError -- If the object is not found.
597
- Returns:
598
- dict -- The object.
599
- """
600
- self._check_args(
601
- schema_ref=schema_ref,
602
- data_name=data_name,
603
- nth_most_recent=nth_most_recent,
604
- version_timestamp=version_timestamp,
605
- data_adapter=data_adapter
606
- )
607
- if data_adapter is None:
608
- data_adapter = self._default_data_adapter
609
- else:
610
- data_adapter.set_filesystem(self._fs)
611
- path = self._get_file_path(schema_ref, data_name, version_timestamp, nth_most_recent, data_adapter)
612
- if path is None:
613
- return None
614
- data_object = data_adapter.read_file(path)
615
- data_object = self._deserialize(data_object)
616
- return data_object
617
-
618
-
619
- if __name__ == "__main__":
620
- # sstore_hbn = SignalstoreHBN()
621
- # sstore_hbn.add_data()
622
- # sstore_ds004584 = SignalstoreHBN(
623
- # data_path='/mnt/nemar/openneuro/ds004584',
624
- # dataset_name='eegdash',
625
- # local_filesystem=False,
626
- # dbconnectionstring='mongodb://23.21.113.214:27017/?directConnection=true&serverSelectionTimeoutMS=2000&appName=mongosh+2.2.1'
627
- # )
628
- # sstore_ds004584.load_domain_models()
629
- # sstore_ds004584.add_data()
630
- pass
@@ -1,14 +0,0 @@
1
- LICENSE
2
- README.md
3
- pyproject.toml
4
- eegdash/__init__.py
5
- eegdash/data_utils.py
6
- eegdash/main.py
7
- eegdash/script.py
8
- eegdash/signalstore_data_utils.py
9
- eegdash.egg-info/PKG-INFO
10
- eegdash.egg-info/SOURCES.txt
11
- eegdash.egg-info/dependency_links.txt
12
- eegdash.egg-info/requires.txt
13
- eegdash.egg-info/top_level.txt
14
- tests/__init__.py
@@ -1 +0,0 @@
1
- eegdash
@@ -1,3 +0,0 @@
1
- from os import path
2
- import sys
3
- sys.path.append(path.abspath('../eegdash'))