eegdash 0.0.7__py3-none-any.whl → 0.0.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of eegdash might be problematic. Click here for more details.

@@ -0,0 +1,87 @@
1
+ """
2
+ Convenience functions for storing and loading of features datasets.
3
+
4
+ see also: https://github.com/braindecode/braindecode/blob/master/braindecode/datautil/serialization.py#L165-L229
5
+ """
6
+
7
+ import json
8
+ from pathlib import Path
9
+
10
+ import pandas as pd
11
+ from joblib import Parallel, delayed
12
+
13
+ from mne.io import read_info
14
+ from braindecode.datautil.serialization import _load_kwargs_json
15
+
16
+ from .datasets import (
17
+ FeaturesDataset,
18
+ FeaturesConcatDataset,
19
+ )
20
+
21
+
22
def load_features_concat_dataset(path, ids_to_load=None, n_jobs=1):
    """Load a stored FeaturesConcatDataset of FeaturesDatasets from files.

    The dataset is expected to be stored as one sub-directory per
    FeaturesDataset, named by its integer id and holding
    ``{id}-feat.parquet``, ``description.json``, ``metadata_df.pkl``,
    optionally ``raw-info.fif``, and the kwargs files read through
    ``_load_kwargs_json``.

    Parameters
    ----------
    path: str | pathlib.Path
        Path to the directory containing the per-dataset sub-directories.
    ids_to_load: list of int | None
        Ids of specific sub-directories to load, in the order given. If None,
        every integer-named sub-directory of ``path`` is loaded in ascending
        numeric order.
    n_jobs: int
        Number of jobs to be used to read files in parallel.

    Returns
    -------
    concat_dataset: FeaturesConcatDataset of FeaturesDatasets
    """
    # Make sure we always work with a pathlib.Path
    path = Path(path)

    if ids_to_load is None:
        # Only integer-named sub-directories are datasets. Skipping everything
        # else keeps stray entries (e.g. ".DS_Store") from crashing int().
        ids_to_load = sorted(
            (p.name for p in path.iterdir() if p.is_dir() and p.name.isdecimal()),
            key=int,
        )
    ids_to_load = [str(i) for i in ids_to_load]

    datasets = Parallel(n_jobs)(delayed(_load_parallel)(path, i) for i in ids_to_load)
    return FeaturesConcatDataset(datasets)
52
+
53
+
54
def _load_parallel(path, i):
    """Load the FeaturesDataset stored in the sub-directory ``path / i``.

    Parameters
    ----------
    path: pathlib.Path
        Directory containing the per-dataset sub-directories.
    i: str
        Name of the sub-directory to load; also the prefix of its parquet file.

    Returns
    -------
    dataset: FeaturesDataset
    """
    dataset_dir = path / i

    features = pd.read_parquet(dataset_dir / f"{i}-feat.parquet")
    description = pd.read_json(dataset_dir / "description.json", typ="series")

    # The measurement info file is optional; raw_info stays None without it.
    info_file = dataset_dir / "raw-info.fif"
    raw_info = read_info(info_file) if info_file.exists() else None

    stored_kwargs = {
        name: _load_kwargs_json(name, dataset_dir)
        for name in (
            "raw_preproc_kwargs",
            "window_kwargs",
            "window_preproc_kwargs",
            "features_kwargs",
        )
    }

    # NOTE(review): unpickling can execute arbitrary code, so only load
    # dataset directories that come from a trusted source.
    metadata = pd.read_pickle(dataset_dir / "metadata_df.pkl")

    return FeaturesDataset(
        features,
        metadata=metadata,
        description=description,
        raw_info=raw_info,
        **stored_kwargs,
    )
@@ -0,0 +1,114 @@
1
+ from typing import Dict, List
2
+ from collections.abc import Callable
3
+ import copy
4
+ import numpy as np
5
+ import pandas as pd
6
+ from joblib import Parallel, delayed
7
+ from tqdm import tqdm
8
+ from torch.utils.data import DataLoader
9
+ from braindecode.datasets.base import (
10
+ EEGWindowsDataset,
11
+ WindowsDataset,
12
+ BaseConcatDataset,
13
+ )
14
+
15
+ from .datasets import FeaturesDataset, FeaturesConcatDataset
16
+ from .extractors import FeatureExtractor
17
+
18
+
19
def _extract_features_from_windowsdataset(
    win_ds: EEGWindowsDataset | WindowsDataset,
    feature_extractor: FeatureExtractor,
    batch_size: int = 512,
):
    """Compute the features of every window of a single windows dataset.

    The windows are read in order through a DataLoader in batches of
    ``batch_size``. Each batch is handed to ``feature_extractor`` and the
    per-feature outputs are concatenated into one DataFrame.

    When the targets do not come from the metadata
    (``win_ds.targets_from != "metadata"``), a deep copy of the metadata is
    made and the targets yielded by the loader are written into its
    ``"target"`` column, leaving ``win_ds.metadata`` itself untouched.

    Returns
    -------
    FeaturesDataset
        Built from the feature DataFrame, the (possibly updated) metadata and
        the description, raw info and kwargs of ``win_ds``.
    """
    targets_in_metadata = win_ds.targets_from == "metadata"
    metadata = win_ds.metadata
    if not targets_in_metadata:
        metadata = copy.deepcopy(metadata)
        # Remember the original index, then index the copy by the window
        # coordinates so each batch can be addressed through ``.loc`` below.
        metadata["orig_index"] = metadata.index
        metadata = metadata.set_index(
            ["i_window_in_trial", "i_start_in_trial", "i_stop_in_trial"],
            drop=False,
        )

    loader = DataLoader(win_ds, batch_size=batch_size, shuffle=False, drop_last=False)
    ch_names = win_ds.raw.ch_names
    features_dict = {}
    for X, y, crop_inds in loader:
        X = X.numpy()
        if hasattr(y, "tolist"):
            y = y.tolist()
        batch_features = dict(
            feature_extractor(X, _batch_size=X.shape[0], _ch_names=ch_names)
        )
        if not targets_in_metadata:
            # presumably crop_inds holds the window coordinates matching the
            # index set above; verify against the windows dataset
            metadata.loc[crop_inds, "target"] = y
        for name, values in batch_features.items():
            features_dict.setdefault(name, []).extend(values)
    features_df = pd.DataFrame(features_dict)

    if not targets_in_metadata:
        metadata = (
            metadata.set_index("orig_index", drop=False)
            .reset_index(drop=True)
            .drop("orig_index", axis=1)
        )

    # FUTURE: truly support WindowsDataset objects
    return FeaturesDataset(
        features_df,
        metadata=metadata,
        description=win_ds.description,
        raw_info=win_ds.raw.info,
        raw_preproc_kwargs=win_ds.raw_preproc_kwargs,
        window_kwargs=win_ds.window_kwargs,
        features_kwargs=feature_extractor.features_kwargs,
    )
66
+
67
+
68
def extract_features(
    concat_dataset: BaseConcatDataset,
    features: FeatureExtractor | Dict[str, Callable] | List[Callable],
    *,
    batch_size: int = 512,
    n_jobs: int = 1,
):
    """Extract features from every windows dataset of ``concat_dataset``.

    Parameters
    ----------
    concat_dataset: BaseConcatDataset
        Its ``datasets`` are processed one by one.
    features: FeatureExtractor | dict | list
        A list is turned into a dict keyed by position, and anything that is
        not already a FeatureExtractor is wrapped into one.
    batch_size: int
        Number of windows passed to the extractor at once.
    n_jobs: int
        Number of joblib jobs used to process the datasets.

    Returns
    -------
    FeaturesConcatDataset
        One FeaturesDataset per input dataset.
    """
    if isinstance(features, list):
        features = dict(enumerate(features))
    if not isinstance(features, FeatureExtractor):
        features = FeatureExtractor(features)

    jobs = (
        delayed(_extract_features_from_windowsdataset)(win_ds, features, batch_size)
        for win_ds in concat_dataset.datasets
    )
    # Results are consumed as a generator so the progress bar advances as
    # each dataset finishes.
    results = Parallel(n_jobs=n_jobs, return_as="generator")(jobs)
    progress = tqdm(
        results,
        total=len(concat_dataset.datasets),
        desc="Extracting features",
    )
    return FeaturesConcatDataset(list(progress))
92
+
93
+
94
def fit_feature_extractors(
    concat_dataset: BaseConcatDataset,
    features: FeatureExtractor | Dict[str, Callable] | List[Callable],
    batch_size: int = 8192,
):
    """Fit the trainable feature extractors on ``concat_dataset``.

    ``features`` is normalised to a FeatureExtractor (a list becomes a dict
    keyed by position first). If the extractor is not fitable it is returned
    unchanged. Otherwise it is cleared, fed every batch of the dataset through
    ``partial_fit``, finalised with ``fit`` and returned.

    Returns
    -------
    FeatureExtractor
    """
    if isinstance(features, list):
        features = dict(enumerate(features))
    if not isinstance(features, FeatureExtractor):
        features = FeatureExtractor(features)

    if features._is_fitable:
        features.clear()
        loader = DataLoader(
            concat_dataset, batch_size=batch_size, shuffle=False, drop_last=False
        )
        progress = tqdm(loader, total=len(loader), desc="Fitting feature extractors")
        for X, y, _ in progress:
            features.partial_fit(X.numpy(), y=np.array(y))
        features.fit()
    return features
eegdash/main.py CHANGED
@@ -1,17 +1,30 @@
1
1
  import pymongo
2
2
  from dotenv import load_dotenv
3
3
  import os
4
+ from pathlib import Path
4
5
  import s3fs
5
6
  from joblib import Parallel, delayed
7
+ import json
6
8
  import tempfile
7
9
  import mne
8
10
  import numpy as np
9
11
  import xarray as xr
10
- from .data_utils import BIDSDataset
12
+ from .data_utils import EEGBIDSDataset, EEGDashBaseRaw, EEGDashBaseDataset
13
+ from .data_config import config as data_config
14
+ from braindecode.datasets import BaseDataset, BaseConcatDataset
15
+ from collections import defaultdict
16
+ from pymongo import MongoClient, InsertOne, UpdateOne, DeleteOne
17
+
11
18
  class EEGDash:
12
19
  AWS_BUCKET = 's3://openneuro.org'
13
20
  def __init__(self,
14
21
  is_public=True):
22
+ # Load config file
23
+ # config_path = Path(__file__).parent / 'config.json'
24
+ # with open(config_path, 'r') as f:
25
+ # self.config = json.load(f)
26
+
27
+ self.config = data_config
15
28
  if is_public:
16
29
  DB_CONNECTION_STRING="mongodb+srv://eegdash-user:mdzoMjQcHWTVnKDq@cluster0.vz35p.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
17
30
  else:
@@ -31,22 +44,14 @@ class EEGDash:
31
44
  # convert to list using get_item on each element
32
45
  return [result for result in results]
33
46
 
34
- def exist(self, schema_ref='eeg_signal', data_name=''):
35
- query = {
36
- "schema_ref": schema_ref,
37
- "data_name": data_name
38
- }
47
def exist(self, query:dict):
    """Return True when at least one record matches ``query``.

    Parameters
    ----------
    query: dict
        Query passed to ``self.find``; only the fields ``data_name`` and
        ``dataset`` are accepted.

    Raises
    ------
    ValueError
        If ``query`` contains any other field.
    """
    accepted_query_fields = ('data_name', 'dataset')
    rejected = [field for field in query if field not in accepted_query_fields]
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if rejected:
        raise ValueError(
            f"Unsupported query fields {rejected}; accepted fields are {list(accepted_query_fields)}"
        )
    sessions = self.find(query)
    return len(sessions) > 0
41
52
 
42
- def add(self, record:dict):
43
- input_record = self._validate_input(record)
44
- print(input_record)
45
- self.__collection.insert_one(input_record)
46
-
47
53
  def _validate_input(self, record:dict):
48
54
  input_types = {
49
- 'schema_ref': str,
50
55
  'data_name': str,
51
56
  'dataset': str,
52
57
  'bidspath': str,
@@ -61,7 +66,6 @@ class EEGDash:
61
66
  'channel_types': list,
62
67
  'channel_names': list,
63
68
  }
64
- record['schema_ref'] = 'eeg_signal'
65
69
  if 'data_name' not in record:
66
70
  raise ValueError("Missing key: data_name")
67
71
  # check if args are in the keys and has correct type
@@ -106,69 +110,126 @@ class EEGDash:
106
110
  )
107
111
  return eeg_xarray
108
112
 
109
- def load_eeg_attrs_from_bids_file(self, bids_dataset: BIDSDataset, bids_file):
113
def get_raw_extensions(self, bids_file, bids_dataset: EEGBIDSDataset):
    """List the existing files that make up the raw recording ``bids_file``.

    The suffix of ``bids_file`` selects the set of companion suffixes of its
    format. Every sibling file with one of those suffixes that exists on disk
    is returned as the string form of
    ``bids_dataset.get_relative_bidspath(...)``.

    Raises
    ------
    KeyError
        If the suffix of ``bids_file`` is not one of the known formats.
    """
    # Suffixes of the files forming one recording, keyed by the main suffix.
    companion_suffixes = {
        '.set': ['.set', '.fdt'],  # eeglab
        '.edf': ['.edf'],  # european
        '.vhdr': ['.eeg', '.vhdr', '.vmrk', '.dat', '.raw'],  # brainvision
        '.bdf': ['.bdf'],  # biosemi
    }
    raw_path = Path(bids_file)
    found = []
    for suffix in companion_suffixes[raw_path.suffix]:
        candidate = raw_path.with_suffix(suffix)
        if candidate.exists():
            found.append(str(bids_dataset.get_relative_bidspath(candidate)))
    return found
122
+
123
+ def load_eeg_attrs_from_bids_file(self, bids_dataset: EEGBIDSDataset, bids_file):
110
124
  '''
111
125
  bids_file must be a file of the bids_dataset
112
126
  '''
113
127
  if bids_file not in bids_dataset.files:
114
128
  raise ValueError(f'{bids_file} not in {bids_dataset.dataset}')
129
+
130
+ # Initialize attrs with None values for all expected fields
131
+ attrs = {field: None for field in self.config['attributes'].keys()}
132
+
115
133
  f = os.path.basename(bids_file)
116
134
  dsnumber = bids_dataset.dataset
117
135
  # extract openneuro path by finding the first occurrence of the dataset name in the filename and remove the path before that
118
136
  openneuro_path = dsnumber + bids_file.split(dsnumber)[1]
119
137
 
120
- attrs = {
121
- 'schema_ref': 'eeg_signal',
122
- 'data_name': f'{bids_dataset.dataset}_{f}',
123
- 'dataset': bids_dataset.dataset,
124
- 'bidspath': openneuro_path,
125
- 'subject': bids_dataset.subject(bids_file),
126
- 'nchans': bids_dataset.num_channels(bids_file),
127
- 'ntimes': bids_dataset.num_times(bids_file),
128
- 'channel_types': bids_dataset.channel_types(bids_file),
129
- 'channel_names': bids_dataset.channel_labels(bids_file),
130
- 'task': bids_dataset.task(bids_file),
131
- 'session': bids_dataset.session(bids_file),
132
- 'run': bids_dataset.run(bids_file),
133
- 'sampling_frequency': bids_dataset.sfreq(bids_file),
134
- 'modality': 'EEG',
138
+ # Update with actual values where available
139
+ try:
140
+ participants_tsv = bids_dataset.subject_participant_tsv(bids_file)
141
+ except Exception as e:
142
+ print(f"Error getting participants_tsv: {str(e)}")
143
+ participants_tsv = None
144
+
145
+ try:
146
+ eeg_json = bids_dataset.eeg_json(bids_file)
147
+ except Exception as e:
148
+ print(f"Error getting eeg_json: {str(e)}")
149
+ eeg_json = None
150
+
151
+ bids_dependencies_files = self.config['bids_dependencies_files']
152
+ bidsdependencies = []
153
+ for extension in bids_dependencies_files:
154
+ try:
155
+ dep_path = bids_dataset.get_bids_metadata_files(bids_file, extension)
156
+ dep_path = [str(bids_dataset.get_relative_bidspath(dep)) for dep in dep_path]
157
+ bidsdependencies.extend(dep_path)
158
+ except Exception as e:
159
+ pass
160
+
161
+ bidsdependencies.extend(self.get_raw_extensions(bids_file, bids_dataset))
162
+
163
+ # Define field extraction functions with error handling
164
+ field_extractors = {
165
+ 'data_name': lambda: f'{bids_dataset.dataset}_{f}',
166
+ 'dataset': lambda: bids_dataset.dataset,
167
+ 'bidspath': lambda: openneuro_path,
168
+ 'subject': lambda: bids_dataset.get_bids_file_attribute('subject', bids_file),
169
+ 'task': lambda: bids_dataset.get_bids_file_attribute('task', bids_file),
170
+ 'session': lambda: bids_dataset.get_bids_file_attribute('session', bids_file),
171
+ 'run': lambda: bids_dataset.get_bids_file_attribute('run', bids_file),
172
+ 'modality': lambda: bids_dataset.get_bids_file_attribute('modality', bids_file),
173
+ 'sampling_frequency': lambda: bids_dataset.get_bids_file_attribute('sfreq', bids_file),
174
+ 'nchans': lambda: bids_dataset.get_bids_file_attribute('nchans', bids_file),
175
+ 'ntimes': lambda: bids_dataset.get_bids_file_attribute('ntimes', bids_file),
176
+ 'participant_tsv': lambda: participants_tsv,
177
+ 'eeg_json': lambda: eeg_json,
178
+ 'bidsdependencies': lambda: bidsdependencies,
135
179
  }
180
+
181
+ # Dynamically populate attrs with error handling
182
+ for field, extractor in field_extractors.items():
183
+ try:
184
+ attrs[field] = extractor()
185
+ except Exception as e:
186
+ print(f"Error extracting {field}: {str(e)}")
187
+ attrs[field] = None
136
188
 
137
189
  return attrs
138
190
 
139
- def add_bids_dataset(self, dataset, data_dir, raw_format='eeglab', overwrite=True):
191
+ def add_bids_dataset(self, dataset, data_dir, overwrite=True):
140
192
  '''
141
193
  Create new records for the dataset in the MongoDB database if not found
142
194
  '''
143
195
  if self.is_public:
144
196
  raise ValueError('This operation is not allowed for public users')
145
197
 
146
- bids_dataset = BIDSDataset(
147
- data_dir=data_dir,
148
- dataset=dataset,
149
- raw_format=raw_format,
150
- )
198
+ if not overwrite and self.exist({'dataset': dataset}):
199
+ print(f'Dataset {dataset} already exists in the database')
200
+ return
201
+ try:
202
+ bids_dataset = EEGBIDSDataset(
203
+ data_dir=data_dir,
204
+ dataset=dataset,
205
+ )
206
+ except Exception as e:
207
+ print(f'Error creating bids dataset {dataset}: {str(e)}')
208
+ raise e
209
+ requests = []
151
210
  for bids_file in bids_dataset.get_files():
152
- print('bids raw file', bids_file)
153
-
154
- signalstore_data_id = f"{dataset}_{os.path.basename(bids_file)}"
211
+ try:
212
+ data_id = f"{dataset}_{os.path.basename(bids_file)}"
155
213
 
156
- if self.exist(data_name=signalstore_data_id):
157
- if overwrite:
158
- eeg_attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
159
- print('updating record', eeg_attrs['data_name'])
160
- self.update(eeg_attrs)
214
+ if self.exist({'data_name':data_id}):
215
+ if overwrite:
216
+ eeg_attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
217
+ requests.append(self.update_request(eeg_attrs))
161
218
  else:
162
- print('data already exist and not overwriting. skipped')
163
- continue
164
- else:
165
- eeg_attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
166
- # Assume raw data already exists on Openneuro, recreating record only
167
- print('adding record', eeg_attrs['data_name'])
168
- self.add(eeg_attrs)
219
+ eeg_attrs = self.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
220
+ requests.append(self.add_request(eeg_attrs))
221
+ except:
222
+ print('error adding record', bids_file)
169
223
 
170
- def get_s3path(self, record):
171
- return f"{self.AWS_BUCKET}/{record['bidspath']}"
224
+ print('Number of database requests', len(requests))
225
+
226
+ if requests:
227
+ result = self.__collection.bulk_write(requests, ordered=False)
228
+ print(f"Inserted: {result.inserted_count}")
229
+ print(f"Modified: {result.modified_count}")
230
+ print(f"Deleted: {result.deleted_count}")
231
+ print(f"Upserted: {result.upserted_count}")
232
+ print(f"Errors: {result.bulk_api_result.get('writeErrors', [])}")
172
233
 
173
234
  def get(self, query:dict):
174
235
  '''
@@ -185,11 +246,110 @@ class EEGDash:
185
246
  )
186
247
  return results
187
248
 
249
def add_request(self, record:dict):
    """Build, without executing it, the ``InsertOne`` operation for ``record``."""
    insert_operation = InsertOne(record)
    return insert_operation
251
+
252
def add(self, record:dict):
    """Insert ``record`` into the collection, reporting failures without raising.

    Failures are printed together with the underlying error instead of being
    propagated, so a single bad record does not abort the caller.

    Parameters
    ----------
    record: dict
        Document to insert; its ``data_name`` (when present) is used in the
        failure messages.
    """
    try:
        # NOTE: validation through self._validate_input is currently disabled.
        self.__collection.insert_one(record)
    except ValueError as e:
        # .get so that reporting cannot itself fail on a record without a name
        print(f"Failed to validate record: {record.get('data_name')}")
        print(e)
    except Exception as e:
        # Deliberately best-effort, but no longer swallows KeyboardInterrupt
        # or SystemExit, and the cause is shown.
        print(f"Error adding record: {record.get('data_name')}")
        print(e)
262
+
263
def update_request(self, record:dict):
    """Build, without executing it, the ``UpdateOne`` operation for ``record``.

    The operation targets the document with the same ``data_name`` and sets
    every field of ``record`` on it.
    """
    selector = {'data_name': record['data_name']}
    changes = {'$set': record}
    return UpdateOne(selector, changes)
265
+
188
266
  def update(self, record:dict):
189
- record['schema_ref'] = 'eeg_signal'
190
- self.__collection.update_one({'schema_ref': record['schema_ref'], 'data_name': record['data_name']},
191
- {'$set': record}
192
- )
267
+ try:
268
+ self.__collection.update_one({'data_name': record['data_name']}, {'$set': record})
269
+ except: # silent failure
270
+ print(f'Error updating record {record["data_name"]}')
271
+
272
def remove_field(self, record, field):
    """Unset ``field`` on the document whose ``data_name`` matches ``record``."""
    selector = {'data_name': record['data_name']}
    self.__collection.update_one(selector, {'$unset': {field: 1}})
274
+
275
def remove_field_from_db(self, field):
    """Unset ``field`` on every document of the collection."""
    match_all = {}
    self.__collection.update_many(match_all, {'$unset': {field: 1}})
277
+
278
@property
def collection(self):
    """Read-only access to the underlying collection object."""
    underlying = self.__collection
    return underlying
281
+
282
class EEGDashDataset(BaseConcatDataset):
    """Concatenation of ``EEGDashBaseDataset`` objects.

    The datasets are built either from the records matching a database query
    or from the files of one or several local BIDS datasets.
    """

    def __init__(
        self,
        query:dict=None,
        data_dir:str | list =None,
        dataset:str | list =None,
        description_fields: list[str]=['subject', 'session', 'run', 'task', 'age', 'gender', 'sex'],
        cache_dir:str='.eegdash_cache',
        **kwargs
    ):
        """Build the concatenated dataset.

        Parameters
        ----------
        query: dict | None
            Database query; when given (and non-empty) it takes precedence
            over ``data_dir``.
        data_dir: str | list | None
            Directory of a local BIDS dataset, or a list of such directories.
        dataset: str | list | None
            Dataset name matching ``data_dir``, or a list of names with one
            entry per directory.
        description_fields: list of str
            Record keys copied into each dataset's description. The default
            list is only read, never modified.
        cache_dir: str
            Passed to every ``EEGDashBaseDataset``.
        **kwargs
            Forwarded to every ``EEGDashBaseDataset``.

        Raises
        ------
        ValueError
            If neither ``query`` nor ``data_dir`` is given, or if the numbers
            of directories and dataset names differ.
        """
        self.cache_dir = cache_dir
        if query:
            datasets = self.find_datasets(query, description_fields, **kwargs)
        elif data_dir:
            if isinstance(data_dir, str):
                datasets = self.load_bids_dataset(dataset, data_dir, description_fields, **kwargs)
            else:
                if len(data_dir) != len(dataset):
                    raise ValueError('Number of datasets and their directories must match')
                datasets = []
                for name, directory in zip(dataset, data_dir):
                    datasets.extend(self.load_bids_dataset(name, directory, description_fields, **kwargs))
        else:
            # Fail with a clear message instead of an UnboundLocalError below.
            raise ValueError('Either query or data_dir must be provided')
        super().__init__(datasets)

    def find_key_in_nested_dict(self, data, target_key):
        """Return the first value stored under ``target_key`` in ``data``.

        The top level is checked first, then nested dict values are searched
        depth-first in iteration order. Returns None when ``data`` is not a
        dict or the key is not found (or only found with a None value).
        """
        if isinstance(data, dict):
            if target_key in data:
                return data[target_key]
            for value in data.values():
                result = self.find_key_in_nested_dict(value, target_key)
                if result is not None:
                    return result
        return None

    def _build_description(self, record, description_fields):
        """Collect the truthy ``description_fields`` values found in ``record``."""
        description = {}
        for field in description_fields:
            value = self.find_key_in_nested_dict(record, field)
            # Missing and falsy values are left out of the description.
            if value:
                description[field] = value
        return description

    def find_datasets(self, query:dict, description_fields:list[str], **kwargs):
        """Create one ``EEGDashBaseDataset`` per database record matching ``query``."""
        eegdashObj = EEGDash()
        return [
            EEGDashBaseDataset(
                record,
                self.cache_dir,
                description=self._build_description(record, description_fields),
                **kwargs
            )
            for record in eegdashObj.find(query)
        ]

    def load_bids_dataset(self, dataset, data_dir, description_fields: list[str],raw_format='eeglab', **kwargs):
        """Create one ``EEGDashBaseDataset`` per file of a local BIDS dataset.

        The record of each file is produced by
        ``EEGDash.load_eeg_attrs_from_bids_file``; files are processed in
        parallel threads.
        """
        bids_dataset = EEGBIDSDataset(
            data_dir=data_dir,
            dataset=dataset,
            raw_format=raw_format,
        )
        eegdashObj = EEGDash()

        def get_base_dataset_from_bids_file(bids_file):
            record = eegdashObj.load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
            description = self._build_description(record, description_fields)
            return EEGDashBaseDataset(record, self.cache_dir, description=description, **kwargs)

        datasets = Parallel(n_jobs=-1, prefer="threads", verbose=1)(
            delayed(get_base_dataset_from_bids_file)(bids_file) for bids_file in bids_dataset.get_files()
        )
        return datasets
352
+
193
353
  def main():
194
354
  eegdash = EEGDash()
195
355
  record = eegdash.find({'dataset': 'ds005511', 'subject': 'NDARUF236HM7'})
@@ -0,0 +1,123 @@
1
+ Metadata-Version: 2.4
2
+ Name: eegdash
3
+ Version: 0.0.9
4
+ Summary: EEG data for machine learning
5
+ Author-email: Young Truong <dt.young112@gmail.com>, Arnaud Delorme <adelorme@gmail.com>
6
+ License: GNU General Public License
7
+
8
+ Copyright (C) 2024-2025
9
+
10
+ Young Truong, UCSD, dt.young112@gmail.com
11
+ Arnaud Delorme, UCSD, adelorme@ucsd.edu
12
+
13
+ This program is free software; you can redistribute it and/or modify
14
+ it under the terms of the GNU General Public License as published by
15
+ the Free Software Foundation; either version 2 of the License, or
16
+ (at your option) any later version.
17
+
18
+ This program is distributed in the hope that it will be useful,
19
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
20
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21
+ GNU General Public License for more details.
22
+
23
+ You should have received a copy of the GNU General Public License
24
+ along with this program; if not, write to the Free Software
25
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
26
+
27
+ Project-URL: Homepage, https://eegdash.org
28
+ Project-URL: Issues, https://github.com/sccn/EEGDash/issues
29
+ Classifier: Programming Language :: Python :: 3
30
+ Classifier: License :: OSI Approved :: GNU General Public License v2 or later (GPLv2+)
31
+ Classifier: Operating System :: OS Independent
32
+ Requires-Python: >=3.8
33
+ Description-Content-Type: text/markdown
34
+ License-File: LICENSE
35
+ Requires-Dist: xarray
36
+ Requires-Dist: python-dotenv
37
+ Requires-Dist: s3fs
38
+ Requires-Dist: mne
39
+ Requires-Dist: pynwb
40
+ Requires-Dist: h5py
41
+ Requires-Dist: pymongo
42
+ Requires-Dist: joblib
43
+ Requires-Dist: braindecode
44
+ Requires-Dist: mne-bids
45
+ Requires-Dist: pybids
46
+ Requires-Dist: pymatreader
47
+ Requires-Dist: pyarrow
48
+ Requires-Dist: tqdm
49
+ Requires-Dist: numba
50
+ Dynamic: license-file
51
+
52
+ # EEG-Dash
53
+ To leverage recent and ongoing advancements in large-scale computational methods and to ensure the preservation of scientific data generated from publicly funded research, the EEG-DaSh data archive will create a data-sharing resource for MEEG (EEG, MEG) data contributed by collaborators for machine learning (ML) and deep learning (DL) applications.
54
+
55
+ ## Data source
56
+ The data in EEG-DaSh originates from a collaboration involving 25 laboratories, encompassing 27,053 participants. This extensive collection includes MEEG data, which is a combination of EEG and MEG signals. The data is sourced from various studies conducted by these labs, involving both healthy subjects and clinical populations with conditions such as ADHD, depression, schizophrenia, dementia, autism, and psychosis. Additionally, data spans different mental states like sleep, meditation, and cognitive tasks. In addition, EEG-DaSh will incorporate a subset of the data converted from NEMAR, which includes 330 MEEG BIDS-formatted datasets, further expanding the archive with well-curated, standardized neuroelectromagnetic data.
57
+
58
+ ## Featured data
59
+
60
+ The following HBN datasets are currently featured on EEGDash. Documentation about these datasets is available [here](https://neuromechanist.github.io/data/hbn/).
61
+
62
+ | DatasetID | Participants | Files | Sessions | Population | Channels | Is 10-20? | Modality | Size |
63
+ |---|---|---|---|---|---|---|---|---|
64
+ | [ds005505](https://nemar.org/dataexplorer/detail?dataset_id=ds005505) | 136 | 5393 | 1 | Healthy | 129 | other | Visual | 103 GB |
65
+ | [ds005506](https://nemar.org/dataexplorer/detail?dataset_id=ds005506) | 150 | 5645 | 1 | Healthy | 129 | other | Visual | 112 GB |
66
+ | [ds005507](https://nemar.org/dataexplorer/detail?dataset_id=ds005507) | 184 | 7273 | 1 | Healthy | 129 | other | Visual | 140 GB |
67
+ | [ds005508](https://nemar.org/dataexplorer/detail?dataset_id=ds005508) | 324 | 13393 | 1 | Healthy | 129 | other | Visual | 230 GB |
68
+ | [ds005510](https://nemar.org/dataexplorer/detail?dataset_id=ds005510) | 135 | 4933 | 1 | Healthy | 129 | other | Visual | 91 GB |
69
+ | [ds005512](https://nemar.org/dataexplorer/detail?dataset_id=ds005512) | 257 | 9305 | 1 | Healthy | 129 | other | Visual | 157 GB |
70
+ | [ds005514](https://nemar.org/dataexplorer/detail?dataset_id=ds005514) | 295 | 11565 | 1 | Healthy | 129 | other | Visual | 185 GB |
71
+
72
+ A total of [246 other datasets](datasets.md) are also available through EEGDash.
73
+
74
+ ## Data format
75
+ EEGDash queries return a **PyTorch Dataset** formatted to facilitate machine learning (ML) and deep learning (DL) applications. PyTorch Datasets are the best format for EEGDash queries because they provide an efficient, scalable, and flexible structure for machine learning (ML) and deep learning (DL) applications. They allow seamless integration with PyTorch’s DataLoader, enabling efficient batching, shuffling, and parallel data loading, which is essential for training deep learning models on large EEG datasets.
76
+
77
+ ## Data preprocessing
78
+ EEGDash datasets are processed using the popular [BrainDecode](https://braindecode.org/stable/index.html) library. In fact, EEGDash datasets are BrainDecode datasets, which are themselves PyTorch datasets. This means that any preprocessing possible on BrainDecode datasets is also possible on EEGDash datasets. Refer to [BrainDecode](https://braindecode.org/stable/index.html) tutorials for guidance on preprocessing EEG data.
79
+
80
+ ## EEG-Dash usage
81
+
82
+ ### Install
83
+ Use your preferred Python environment manager with Python > 3.9 to install the package.
84
+ * To install the eegdash package, use the following command: `pip install eegdash`
85
+ * To verify the installation, start a Python session and type: `from eegdash import EEGDash`
86
+
87
+ ### Data access
88
+
89
+ To use the data from a single subject, enter:
90
+
91
+ ```python
92
+ from eegdash import EEGDashDataset
93
+ ds_NDARDB033FW5 = EEGDashDataset({'dataset': 'ds005514', 'task': 'RestingState', 'subject': 'NDARDB033FW5'})
94
+ ```
95
+
96
+ This will search and download the metadata for the task **RestingState** for subject **NDARDB033FW5** in BIDS dataset **ds005514**. The actual data will not be downloaded at this stage. Following standard practice, data is only downloaded once it is processed. The **ds_NDARDB033FW5** object is a fully functional BrainDecode dataset, which is itself a PyTorch dataset. This [tutorial](https://github.com/sccn/EEGDash/blob/develop/notebooks/tutorial_eoec.ipynb) shows how to preprocess the EEG data, extracting portions of the data containing eyes-open and eyes-closed segments, then perform eyes-open vs. eyes-closed classification using a (shallow) deep-learning model.
97
+
98
+ To use the data from multiple subjects, enter:
99
+
100
+ ```python
101
+ from eegdash import EEGDashDataset
102
+ ds_ds005505rest = EEGDashDataset({'dataset': 'ds005505', 'task': 'RestingState'}, target_name='sex')
103
+ ```
104
+
105
+ This will search and download the metadata for the task 'RestingState' for all subjects in BIDS dataset 'ds005505' (a total of 136). As above, the actual data will not be downloaded at this stage so this command is quick to execute. Also, the target class for each subject is assigned using the target_name parameter. This means that this object is ready to be directly fed to a deep learning model, although the [tutorial script](https://github.com/sccn/EEGDash/blob/develop/notebooks/tutorial_sex_classification.ipynb) performs minimal processing on it, prior to training a deep-learning model. Because 14 gigabytes of data are downloaded, this tutorial takes about 10 minutes to execute.
106
+
107
+ ### Automatic caching
108
+
109
+ EEGDash automatically caches the downloaded data in the .eegdash_cache folder of the current directory from which the script is called. This means that if you run the tutorial [scripts](https://github.com/sccn/EEGDash/tree/develop/notebooks), the data will only be downloaded the first time the script is executed.
110
+
111
+ ## Education -- Coming soon...
112
+
113
+ We organize workshops and educational events to foster cross-cultural education and student training, offering both online and in-person opportunities in collaboration with US and Israeli partners. Events for 2025 will be announced via the EEGLABNEWS mailing list. Be sure to [subscribe](https://sccn.ucsd.edu/mailman/listinfo/eeglabnews).
114
+
115
+ ## About EEG-DaSh
116
+
117
+ EEG-DaSh is a collaborative initiative between the United States and Israel, supported by the National Science Foundation (NSF). The partnership brings together experts from the Swartz Center for Computational Neuroscience (SCCN) at the University of California San Diego (UCSD) and Ben-Gurion University (BGU) in Israel.
118
+
119
+ ![Screenshot 2024-10-03 at 09 14 06](https://github.com/user-attachments/assets/327639d3-c3b4-46b1-9335-37803209b0d3)
120
+
121
+
122
+
123
+