eegdash 0.3.7.dev104__py3-none-any.whl → 0.3.7.dev105__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of eegdash might be problematic.
- eegdash/__init__.py +4 -4
- eegdash/api.py +429 -422
- eegdash/bids_eeg_metadata.py +184 -0
- eegdash/const.py +48 -0
- eegdash/data_utils.py +68 -28
- eegdash/dataset/__init__.py +4 -0
- eegdash/{dataset.py → dataset/dataset.py} +53 -10
- eegdash/{registry.py → dataset/registry.py} +3 -3
- eegdash/utils.py +1 -1
- {eegdash-0.3.7.dev104.dist-info → eegdash-0.3.7.dev105.dist-info}/METADATA +1 -1
- {eegdash-0.3.7.dev104.dist-info → eegdash-0.3.7.dev105.dist-info}/RECORD +14 -14
- eegdash/data_config.py +0 -34
- eegdash/dataset_summary.csv +0 -256
- {eegdash-0.3.7.dev104.dist-info → eegdash-0.3.7.dev105.dist-info}/WHEEL +0 -0
- {eegdash-0.3.7.dev104.dist-info → eegdash-0.3.7.dev105.dist-info}/licenses/LICENSE +0 -0
- {eegdash-0.3.7.dev104.dist-info → eegdash-0.3.7.dev105.dist-info}/top_level.txt +0 -0
eegdash/bids_eeg_metadata.py ADDED
@@ -0,0 +1,184 @@
+import logging
+from pathlib import Path
+from typing import Any
+
+from .const import ALLOWED_QUERY_FIELDS
+from .const import config as data_config
+from .data_utils import EEGBIDSDataset
+
+logger = logging.getLogger("eegdash")
+
+__all__ = [
+    "build_query_from_kwargs",
+    "load_eeg_attrs_from_bids_file",
+]
+
+
+def build_query_from_kwargs(**kwargs) -> dict[str, Any]:
+    """Build and validate a MongoDB query from user-friendly keyword arguments.
+
+    Improvements:
+    - Reject None values and empty/whitespace-only strings
+    - For list/tuple/set values: strip strings, drop None/empties, deduplicate, and use `$in`
+    - Preserve scalars as exact matches
+    """
+    # 1. Validate that all provided keys are allowed for querying
+    unknown_fields = set(kwargs.keys()) - ALLOWED_QUERY_FIELDS
+    if unknown_fields:
+        raise ValueError(
+            f"Unsupported query field(s): {', '.join(sorted(unknown_fields))}. "
+            f"Allowed fields are: {', '.join(sorted(ALLOWED_QUERY_FIELDS))}"
+        )
+
+    # 2. Construct the query dictionary
+    query = {}
+    for key, value in kwargs.items():
+        # None is not a valid constraint
+        if value is None:
+            raise ValueError(
+                f"Received None for query parameter '{key}'. Provide a concrete value."
+            )
+
+        # Handle list-like values as multi-constraints
+        if isinstance(value, (list, tuple, set)):
+            cleaned: list[Any] = []
+            for item in value:
+                if item is None:
+                    continue
+                if isinstance(item, str):
+                    item = item.strip()
+                    if not item:
+                        continue
+                cleaned.append(item)
+            # Deduplicate while preserving order
+            cleaned = list(dict.fromkeys(cleaned))
+            if not cleaned:
+                raise ValueError(
+                    f"Received an empty list for query parameter '{key}'. This is not supported."
+                )
+            query[key] = {"$in": cleaned}
+        else:
+            # Scalars: trim strings and validate
+            if isinstance(value, str):
+                value = value.strip()
+                if not value:
+                    raise ValueError(
+                        f"Received an empty string for query parameter '{key}'."
+                    )
+            query[key] = value
+
+    return query
+
+
+def _get_raw_extensions(bids_file: str, bids_dataset: EEGBIDSDataset) -> list[str]:
+    """Helper to find paths to additional "sidecar" files that may be associated
+    with a given main data file in a BIDS dataset; paths are returned as relative to
+    the parent dataset path.
+
+    For example, if the input file is a .set file, this will return the relative path
+    to a corresponding .fdt file (if any).
+    """
+    bids_file = Path(bids_file)
+    extensions = {
+        ".set": [".set", ".fdt"],  # eeglab
+        ".edf": [".edf"],  # european
+        ".vhdr": [".eeg", ".vhdr", ".vmrk", ".dat", ".raw"],  # brainvision
+        ".bdf": [".bdf"],  # biosemi
+    }
+    return [
+        str(bids_dataset._get_relative_bidspath(bids_file.with_suffix(suffix)))
+        for suffix in extensions[bids_file.suffix]
+        if bids_file.with_suffix(suffix).exists()
+    ]
+
+
+def load_eeg_attrs_from_bids_file(
+    bids_dataset: EEGBIDSDataset, bids_file: str
+) -> dict[str, Any]:
+    """Build the metadata record for a given BIDS file (single recording) in a BIDS dataset.
+
+    Attributes are at least the ones defined in data_config attributes (set to None if missing),
+    but are typically a superset, and include, among others, the paths to relevant
+    meta-data files needed to load and interpret the file in question.
+
+    Parameters
+    ----------
+    bids_dataset : EEGBIDSDataset
+        The BIDS dataset object containing the file.
+    bids_file : str
+        The path to the BIDS file within the dataset.
+
+    Returns
+    -------
+    dict:
+        A dictionary representing the metadata record for the given file. This is the
+        same format as the records stored in the database.
+
+    """
+    if bids_file not in bids_dataset.files:
+        raise ValueError(f"{bids_file} not in {bids_dataset.dataset}")
+
+    # Initialize attrs with None values for all expected fields
+    attrs = {field: None for field in data_config["attributes"].keys()}
+
+    file = Path(bids_file).name
+    dsnumber = bids_dataset.dataset
+    # extract openneuro path by finding the first occurrence of the dataset name in the filename and remove the path before that
+    openneuro_path = dsnumber + bids_file.split(dsnumber)[1]
+
+    # Update with actual values where available
+    try:
+        participants_tsv = bids_dataset.subject_participant_tsv(bids_file)
+    except Exception as e:
+        logger.error("Error getting participants_tsv: %s", str(e))
+        participants_tsv = None
+
+    try:
+        eeg_json = bids_dataset.eeg_json(bids_file)
+    except Exception as e:
+        logger.error("Error getting eeg_json: %s", str(e))
+        eeg_json = None
+
+    bids_dependencies_files = data_config["bids_dependencies_files"]
+    bidsdependencies = []
+    for extension in bids_dependencies_files:
+        try:
+            dep_path = bids_dataset.get_bids_metadata_files(bids_file, extension)
+            dep_path = [
+                str(bids_dataset.get_relative_bidspath(dep)) for dep in dep_path
+            ]
+            bidsdependencies.extend(dep_path)
+        except Exception:
+            pass
+
+    bidsdependencies.extend(_get_raw_extensions(bids_file, bids_dataset))
+
+    # Define field extraction functions with error handling
+    field_extractors = {
+        "data_name": lambda: f"{bids_dataset.dataset}_{file}",
+        "dataset": lambda: bids_dataset.dataset,
+        "bidspath": lambda: openneuro_path,
+        "subject": lambda: bids_dataset.get_bids_file_attribute("subject", bids_file),
+        "task": lambda: bids_dataset.get_bids_file_attribute("task", bids_file),
+        "session": lambda: bids_dataset.get_bids_file_attribute("session", bids_file),
+        "run": lambda: bids_dataset.get_bids_file_attribute("run", bids_file),
+        "modality": lambda: bids_dataset.get_bids_file_attribute("modality", bids_file),
+        "sampling_frequency": lambda: bids_dataset.get_bids_file_attribute(
+            "sfreq", bids_file
+        ),
+        "nchans": lambda: bids_dataset.get_bids_file_attribute("nchans", bids_file),
+        "ntimes": lambda: bids_dataset.get_bids_file_attribute("ntimes", bids_file),
+        "participant_tsv": lambda: participants_tsv,
+        "eeg_json": lambda: eeg_json,
+        "bidsdependencies": lambda: bidsdependencies,
+    }
+
+    # Dynamically populate attrs with error handling
+    for field, extractor in field_extractors.items():
+        try:
+            attrs[field] = extractor()
+        except Exception as e:
+            logger.error("Error extracting %s : %s", field, str(e))
+            attrs[field] = None
+
+    return attrs
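The new query builder normalizes inputs before anything reaches the database. A minimal sketch of the resulting behavior (the field values below are hypothetical; the import path comes from the file list above):

```python
from eegdash.bids_eeg_metadata import build_query_from_kwargs

# Scalars stay exact matches; list-likes are stripped, deduplicated,
# and turned into a Mongo-style `$in` clause (None/empties dropped).
q = build_query_from_kwargs(dataset="ds005516", subject=["sub-01", " sub-01 ", None])
assert q == {"dataset": "ds005516", "subject": {"$in": ["sub-01"]}}

# Unknown fields fail fast with the list of allowed fields.
try:
    build_query_from_kwargs(flavor="salty")
except ValueError as e:
    print(e)  # Unsupported query field(s): flavor. Allowed fields are: ...
```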
eegdash/const.py CHANGED
@@ -1,3 +1,16 @@
+ALLOWED_QUERY_FIELDS = {
+    "data_name",
+    "dataset",
+    "subject",
+    "task",
+    "session",
+    "run",
+    "modality",
+    "sampling_frequency",
+    "nchans",
+    "ntimes",
+}
+
 RELEASE_TO_OPENNEURO_DATASET_MAP = {
     "R11": "ds005516",
     "R10": "ds005515",
@@ -256,3 +269,38 @@ SUBJECT_MINI_RELEASE_MAP = {
         "NDARFW972KFQ",
     ],
 }
+
+config = {
+    "required_fields": ["data_name"],
+    # Default set of user-facing primary record attributes expected in the database. Records
+    # where any of these are missing will be loaded with the respective attribute set to None.
+    # Additional fields may be returned if they are present in the database, notably bidsdependencies.
+    "attributes": {
+        "data_name": "str",
+        "dataset": "str",
+        "bidspath": "str",
+        "subject": "str",
+        "task": "str",
+        "session": "str",
+        "run": "str",
+        "sampling_frequency": "float",
+        "modality": "str",
+        "nchans": "int",
+        "ntimes": "int",  # note: this is really the number of seconds in the data, rounded down
+    },
+    # queryable descriptive fields for a given recording
+    "description_fields": ["subject", "session", "run", "task", "age", "gender", "sex"],
+    # list of filenames that may be present in the BIDS dataset directory that are used
+    # to load and interpret a given BIDS recording.
+    "bids_dependencies_files": [
+        "dataset_description.json",
+        "participants.tsv",
+        "events.tsv",
+        "events.json",
+        "eeg.json",
+        "electrodes.tsv",
+        "channels.tsv",
+        "coordsystem.json",
+    ],
+    "accepted_query_fields": ["data_name", "dataset"],
+}
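The relationship between the new `ALLOWED_QUERY_FIELDS` set and the `config["attributes"]` mapping can be checked directly; a quick sanity check (only the import path is taken from the diff):

```python
from eegdash.const import ALLOWED_QUERY_FIELDS, config

# Every queryable field is also a typed record attribute...
assert ALLOWED_QUERY_FIELDS <= set(config["attributes"])
# ...and the only attribute that is not queryable is the storage path.
assert set(config["attributes"]) - ALLOWED_QUERY_FIELDS == {"bidspath"}
```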
eegdash/data_utils.py CHANGED
@@ -57,7 +57,7 @@ class EEGDashBaseDataset(BaseDataset):
         super().__init__(None, **kwargs)
         self.record = record
         self.cache_dir = Path(cache_dir)
-        self.bids_kwargs = self.
+        self.bids_kwargs = self._get_raw_bids_args()
 
         if s3_bucket:
             self.s3_bucket = s3_bucket
@@ -66,16 +66,46 @@ class EEGDashBaseDataset(BaseDataset):
             self.s3_bucket = self._AWS_BUCKET
             self.s3_open_neuro = True
 
-
-
+        # Compute a dataset folder name under cache_dir that encodes preprocessing
+        # (e.g., bdf, mini) to avoid overlapping with the original dataset cache.
+        self.dataset_folder = record.get("dataset", "")
+        if s3_bucket:
+            suffixes: list[str] = []
+            bucket_lower = str(s3_bucket).lower()
+            if "bdf" in bucket_lower:
+                suffixes.append("bdf")
+            if "mini" in bucket_lower:
+                suffixes.append("mini")
+            if suffixes:
+                self.dataset_folder = f"{self.dataset_folder}-{'-'.join(suffixes)}"
+
+        # Place files under the dataset-specific folder (with suffix if any)
+        rel = Path(record["bidspath"])  # usually starts with dataset id
+        if rel.parts and rel.parts[0] == record.get("dataset"):
+            rel = Path(self.dataset_folder, *rel.parts[1:])
+        else:
+            rel = Path(self.dataset_folder) / rel
+        self.filecache = self.cache_dir / rel
+        self.bids_root = self.cache_dir / self.dataset_folder
         self.bidspath = BIDSPath(
             root=self.bids_root,
             datatype="eeg",
             suffix="eeg",
+            # extension='.bdf',
             **self.bids_kwargs,
         )
+        # TO-DO: remove this once find a better solution using mne-bids or update competition dataset
+        try:
+            _ = str(self.bidspath)
+        except RuntimeError:
+            try:
+                self.bidspath = self.bidspath.update(extension=".bdf")
+                self.filecache = self.filecache.with_suffix(".bdf")
+            except Exception as e:
+                logger.error(f"Error while updating BIDS path: {e}")
+                raise e
 
-        self.s3file = self.
+        self.s3file = self._get_s3path(record["bidspath"])
         self.bids_dependencies = record["bidsdependencies"]
         # Temporary fix for BIDS dependencies path
         # just to release to the competition
@@ -87,7 +117,7 @@ class EEGDashBaseDataset(BaseDataset):
 
         self._raw = None
 
-    def
+    def _get_s3path(self, filepath: str) -> str:
         """Helper to form an AWS S3 URI for the given relative filepath."""
         return f"{self.s3_bucket}/{filepath}"
 
@@ -141,11 +171,16 @@ class EEGDashBaseDataset(BaseDataset):
             if dep.endswith(".set"):
                 dep = dep[:-4] + ".bdf"
 
-            s3path = self.
+            s3path = self._get_s3path(dep)
             if not self.s3_open_neuro:
                 dep = self.bids_dependencies_original[i]
 
-
+            dep_path = Path(dep)
+            if dep_path.parts and dep_path.parts[0] == self.record.get("dataset"):
+                dep_local = Path(self.dataset_folder, *dep_path.parts[1:])
+            else:
+                dep_local = Path(self.dataset_folder) / dep_path
+            filepath = self.cache_dir / dep_local
             if not self.s3_open_neuro:
                 if self.filecache.suffix == ".set":
                     self.filecache = self.filecache.with_suffix(".bdf")
@@ -174,14 +209,14 @@ class EEGDashBaseDataset(BaseDataset):
             )
             filesystem.get(s3path, filepath, callback=callback)
 
-    def
+    def _get_raw_bids_args(self) -> dict[str, Any]:
         """Helper to restrict the metadata record to the fields needed to locate a BIDS
         recording.
         """
         desired_fields = ["subject", "session", "task", "run"]
         return {k: self.record[k] for k in desired_fields if self.record[k]}
 
-    def
+    def _ensure_raw(self) -> None:
         """Download the S3 file and BIDS dependencies if not already cached."""
         if not os.path.exists(self.filecache):  # not preload
             if self.bids_dependencies:
@@ -195,7 +230,6 @@ class EEGDashBaseDataset(BaseDataset):
             # TO-DO: remove this once is fixed on the our side
             if not self.s3_open_neuro:
                 self.bidspath = self.bidspath.update(extension=".bdf")
-
             self._raw = mne_bids.read_raw_bids(
                 bids_path=self.bidspath, verbose="ERROR"
             )
@@ -242,7 +276,7 @@ class EEGDashBaseDataset(BaseDataset):
         retrieval if not yet done so.
         """
         if self._raw is None:
-            self.
+            self._ensure_raw()
         return self._raw
 
     @raw.setter
@@ -300,7 +334,7 @@ class EEGDashBaseRaw(BaseRaw):
                 chtype = "eog"
             ch_types.append(chtype)
         info = mne.create_info(ch_names=ch_names, sfreq=sfreq, ch_types=ch_types)
-        self.s3file = self.
+        self.s3file = self._get_s3path(input_fname)
         self.cache_dir = Path(cache_dir)
         self.filecache = self.cache_dir / input_fname
         self.bids_dependencies = bids_dependencies
@@ -317,7 +351,7 @@ class EEGDashBaseRaw(BaseRaw):
             verbose=verbose,
         )
 
-    def
+    def _get_s3path(self, filepath):
         return f"{self._AWS_BUCKET}/{filepath}"
 
     def _download_s3(self) -> None:
@@ -333,7 +367,7 @@ class EEGDashBaseRaw(BaseRaw):
             anon=True, client_kwargs={"region_name": "us-east-2"}
         )
         for dep in self.bids_dependencies:
-            s3path = self.
+            s3path = self._get_s3path(dep)
             filepath = self.cache_dir / dep
             if not filepath.exists():
                 filepath.parent.mkdir(parents=True, exist_ok=True)
@@ -394,11 +428,17 @@ class EEGBIDSDataset:
             raise ValueError("data_dir must be specified and must exist")
         self.bidsdir = Path(data_dir)
         self.dataset = dataset
-
+        # Accept exact dataset folder or a variant with informative suffixes
+        # (e.g., dsXXXXX-bdf, dsXXXXX-bdf-mini) to avoid collisions.
+        dir_name = self.bidsdir.name
+        if not (dir_name == self.dataset or dir_name.startswith(self.dataset + "-")):
+            raise AssertionError(
+                f"BIDS directory '{dir_name}' does not correspond to dataset '{self.dataset}'"
+            )
         self.layout = BIDSLayout(data_dir)
 
         # get all recording files in the bids directory
-        self.files = self.
+        self.files = self._get_recordings(self.layout)
         assert len(self.files) > 0, ValueError(
             "Unable to construct EEG dataset. No EEG recordings found."
         )
@@ -408,7 +448,7 @@ class EEGBIDSDataset:
         """Check if the dataset is EEG."""
         return self.get_bids_file_attribute("modality", self.files[0]).lower() == "eeg"
 
-    def
+    def _get_recordings(self, layout: BIDSLayout) -> list[str]:
         """Get a list of all EEG recording files in the BIDS layout."""
         files = []
         for ext, exts in self.RAW_EXTENSIONS.items():
@@ -417,12 +457,12 @@ class EEGBIDSDataset:
                 break
         return files
 
-    def
+    def _get_relative_bidspath(self, filename: str) -> str:
         """Make the given file path relative to the BIDS directory."""
         bids_parent_dir = self.bidsdir.parent.absolute()
         return str(Path(filename).relative_to(bids_parent_dir))
 
-    def
+    def _get_property_from_filename(self, property: str, filename: str) -> str:
         """Parse a property out of a BIDS-compliant filename. Returns an empty string
         if not found.
         """
@@ -434,7 +474,7 @@ class EEGBIDSDataset:
         lookup = re.search(rf"{property}-(.*?)[_\/]", filename)
         return lookup.group(1) if lookup else ""
 
-    def
+    def _merge_json_inheritance(self, json_files: list[str | Path]) -> dict:
         """Internal helper to merge list of json files found by get_bids_file_inheritance,
         expecting the order (from left to right) is from lowest
         level to highest level, and return a merged dictionary
@@ -445,7 +485,7 @@ class EEGBIDSDataset:
             json_dict.update(json.load(open(f)))  # FIXME: should close file
         return json_dict
 
-    def
+    def _get_bids_file_inheritance(
         self, path: str | Path, basename: str, extension: str
     ) -> list[Path]:
         """Get all file paths that apply to the basename file in the specified directory
@@ -492,7 +532,7 @@ class EEGBIDSDataset:
         else:
             # call get_bids_file_inheritance recursively with parent directory
             bids_files.extend(
-                self.
+                self._get_bids_file_inheritance(path.parent, basename, extension)
             )
         return bids_files
 
@@ -523,12 +563,12 @@ class EEGBIDSDataset:
         path, filename = os.path.split(filepath)
         basename = filename[: filename.rfind("_")]
         # metadata files
-        meta_files = self.
+        meta_files = self._get_bids_file_inheritance(
             path, basename, metadata_file_extension
         )
         return meta_files
 
-    def
+    def _scan_directory(self, directory: str, extension: str) -> list[Path]:
         """Return a list of file paths that end with the given extension in the specified
         directory. Ignores certain special directories like .git, .datalad, derivatives,
         and code.
@@ -545,7 +585,7 @@ class EEGBIDSDataset:
                 result_files.append(entry.path)  # Add directory to scan later
         return result_files
 
-    def
+    def _get_files_with_extension_parallel(
         self, directory: str, extension: str = ".set", max_workers: int = -1
     ) -> list[Path]:
         """Efficiently scan a directory and its subdirectories for files that end with
@@ -577,7 +617,7 @@ class EEGBIDSDataset:
         )
         # Run the scan_directory function in parallel across directories
         results = Parallel(n_jobs=max_workers, prefer="threads", verbose=1)(
-            delayed(self.
+            delayed(self._scan_directory)(d, extension) for d in dirs_to_scan
         )
 
         # Reset the directories to scan and process the results
@@ -682,7 +722,7 @@ class EEGBIDSDataset:
     def num_times(self, data_filepath: str) -> int:
         """Get the approximate number of time points in the EEG recording based on the BIDS metadata."""
         eeg_jsons = self.get_bids_metadata_files(data_filepath, "eeg.json")
-        eeg_json_dict = self.
+        eeg_json_dict = self._merge_json_inheritance(eeg_jsons)
         return int(
             eeg_json_dict["SamplingFrequency"] * eeg_json_dict["RecordingDuration"]
         )
@@ -705,7 +745,7 @@ class EEGBIDSDataset:
     def eeg_json(self, data_filepath: str) -> dict[str, Any]:
         """Get BIDS eeg.json metadata for the given data file path."""
         eeg_jsons = self.get_bids_metadata_files(data_filepath, "eeg.json")
-        eeg_json_dict = self.
+        eeg_json_dict = self._merge_json_inheritance(eeg_jsons)
         return eeg_json_dict
 
     def channel_tsv(self, data_filepath: str) -> dict[str, Any]:
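The new folder-suffix logic decides where a recording lands in the local cache. A standalone sketch of the rule as added above (the helper name, dataset id, and bucket string here are hypothetical; the logic mirrors the diff):

```python
from pathlib import Path

def cache_location(cache_dir: str, dataset: str, bidspath: str, s3_bucket: str | None) -> Path:
    """Mirror of the added cache-layout rule: suffix the dataset folder with
    'bdf'/'mini' when the bucket name contains them, then re-root the BIDS path."""
    folder = dataset
    suffixes = [s for s in ("bdf", "mini") if s3_bucket and s in s3_bucket.lower()]
    if suffixes:
        folder = f"{folder}-{'-'.join(suffixes)}"
    rel = Path(bidspath)
    # Drop the leading dataset id if present, so the path is re-rooted cleanly.
    parts = rel.parts[1:] if rel.parts and rel.parts[0] == dataset else rel.parts
    return Path(cache_dir, folder, *parts)

# e.g. a mini+bdf competition bucket (hypothetical values):
# cache_location("cache", "ds005505", "ds005505/sub-01/eeg/sub-01_eeg.set",
#                "s3://example-bucket/R5_mini_L100_bdf")
# -> cache/ds005505-bdf-mini/sub-01/eeg/sub-01_eeg.set
```

This keeps preprocessed competition copies from colliding with a previously cached copy of the original OpenNeuro dataset.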
eegdash/dataset.py → eegdash/dataset/dataset.py RENAMED
@@ -3,8 +3,9 @@ from pathlib import Path
 
 from mne.utils import warn
 
-from
-from
+from ..api import EEGDashDataset
+from ..bids_eeg_metadata import build_query_from_kwargs
+from ..const import RELEASE_TO_OPENNEURO_DATASET_MAP, SUBJECT_MINI_RELEASE_MAP
 from .registry import register_openneuro_datasets
 
 logger = logging.getLogger("eegdash")
@@ -68,15 +69,56 @@ class EEGChallengeDataset(EEGDashDataset):
         )
 
         if self.mini:
-            #
-            #
-
-
-
-
-
+            # When using the mini release, restrict subjects to the predefined subset.
+            # If the user specifies subject(s), ensure they all belong to the mini subset;
+            # otherwise, default to the full mini subject list for this release.
+
+            allowed_subjects = set(SUBJECT_MINI_RELEASE_MAP[release])
+
+            # Normalize potential 'subjects' -> 'subject' for convenience
+            if "subjects" in kwargs and "subject" not in kwargs:
+                kwargs["subject"] = kwargs.pop("subjects")
+
+            # Collect user-requested subjects from kwargs/query. We canonicalize
+            # kwargs via build_query_from_kwargs to leverage existing validation,
+            # and support Mongo-style {"$in": [...]} shapes from a raw query.
+            requested_subjects: list[str] = []
+
+            # From kwargs
+            if "subject" in kwargs and kwargs["subject"] is not None:
+                # Use the shared query builder to normalize scalars/lists
+                built = build_query_from_kwargs(subject=kwargs["subject"])
+                s_val = built.get("subject")
+                if isinstance(s_val, dict) and "$in" in s_val:
+                    requested_subjects.extend(list(s_val["$in"]))
+                elif s_val is not None:
+                    requested_subjects.append(s_val)  # type: ignore[arg-type]
+
+            # From query (top-level only)
+            if query and isinstance(query, dict) and "subject" in query:
+                qval = query["subject"]
+                if isinstance(qval, dict) and "$in" in qval:
+                    requested_subjects.extend(list(qval["$in"]))
+                elif isinstance(qval, (list, tuple, set)):
+                    requested_subjects.extend(list(qval))
+                elif qval is not None:
+                    requested_subjects.append(qval)
+
+            # Validate if any subjects were explicitly requested
+            if requested_subjects:
+                invalid = sorted(
+                    {s for s in requested_subjects if s not in allowed_subjects}
                 )
-
+                if invalid:
+                    raise ValueError(
+                        "Some requested subject(s) are not part of the mini release for "
+                        f"{release}: {invalid}. Allowed subjects: {sorted(allowed_subjects)}"
+                    )
+                # Do not override user selection; keep their (validated) subjects as-is.
+            else:
+                # No subject specified by the user: default to the full mini subset
+                kwargs["subject"] = sorted(allowed_subjects)
 
             s3_bucket = f"{s3_bucket}/{release}_mini_L100_bdf"
         else:
             s3_bucket = f"{s3_bucket}/{release}_L100_bdf"
@@ -104,6 +146,7 @@ class EEGChallengeDataset(EEGDashDataset):
             query=query,
             cache_dir=cache_dir,
             s3_bucket=s3_bucket,
+            _suppress_comp_warning=True,
             **kwargs,
         )
 
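Taken together, the guard means a mini-release dataset now validates subjects up front. A hedged sketch of how this looks from user code (the constructor arguments shown are inferred from this hunk — release, mini, cache_dir, and a subject kwarg — and the import path assumes the new eegdash.dataset package re-exports the class; neither is confirmed by the diff):

```python
from eegdash.dataset import EEGChallengeDataset

# No subject given: the class fills kwargs["subject"] with the full mini
# subset for this release, i.e. sorted(SUBJECT_MINI_RELEASE_MAP["R11"]).
ds = EEGChallengeDataset(release="R11", mini=True, cache_dir="data")

# A subject outside the mini subset now raises ValueError up front
# instead of silently matching no recordings.
EEGChallengeDataset(release="R11", mini=True, cache_dir="data", subject="sub-xyz")
```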
eegdash/registry.py → eegdash/dataset/registry.py RENAMED
@@ -16,7 +16,7 @@ def register_openneuro_datasets(
 ) -> Dict[str, type]:
     """Dynamically create dataset classes from a summary file."""
     if base_class is None:
-        from
+        from ..api import EEGDashDataset as base_class  # lazy import
 
     summary_path = Path(summary_file)
     namespace = namespace if namespace is not None else globals()
@@ -59,7 +59,7 @@ def register_openneuro_datasets(
 
     doc = f"""OpenNeuro dataset ``{dataset_id}``.
 
-    {
+    {_markdown_table(row_series)}
 
     Parameters
     ----------
@@ -101,7 +101,7 @@ def register_openneuro_datasets(
     return registered
 
 
-def
+def _markdown_table(row_series: pd.Series) -> str:
     """Create a reStructuredText grid table from a pandas Series."""
     if row_series.empty:
        return ""
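With the lazy import of EEGDashDataset and the per-dataset doc table from `_markdown_table`, a call looks roughly like this (a sketch: the summary-file path is hypothetical, and only the `summary_file` and `namespace` parameters are visible in the hunks):

```python
from eegdash.dataset.registry import register_openneuro_datasets

# Creates one EEGDashDataset subclass per row of the summary file and
# returns a mapping of class name -> class; each generated docstring embeds
# the reStructuredText table produced by _markdown_table(row_series).
registered = register_openneuro_datasets(
    summary_file="dataset_summary.csv",  # hypothetical path
    namespace=globals(),
)
```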
eegdash/utils.py CHANGED
{eegdash-0.3.7.dev104.dist-info → eegdash-0.3.7.dev105.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: eegdash
-Version: 0.3.7.dev104
+Version: 0.3.7.dev105
 Summary: EEG data for machine learning
 Author-email: Young Truong <dt.young112@gmail.com>, Arnaud Delorme <adelorme@gmail.com>, Aviv Dotan <avivd220@gmail.com>, Oren Shriki <oren70@gmail.com>, Bruno Aristimunha <b.aristimunha@gmail.com>
 License-Expression: GPL-3.0-only