eegdash 0.3.9.dev170082126__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of eegdash might be problematic.
- eegdash/__init__.py +12 -1
- eegdash/api.py +297 -295
- eegdash/bids_eeg_metadata.py +297 -56
- eegdash/const.py +43 -0
- eegdash/data_utils.py +327 -430
- eegdash/dataset/__init__.py +19 -1
- eegdash/dataset/dataset.py +61 -33
- eegdash/dataset/dataset_summary.csv +255 -256
- eegdash/dataset/registry.py +163 -11
- eegdash/downloader.py +197 -0
- eegdash/features/datasets.py +323 -138
- eegdash/features/decorators.py +88 -3
- eegdash/features/extractors.py +203 -55
- eegdash/features/feature_bank/complexity.py +7 -3
- eegdash/features/feature_bank/dimensionality.py +1 -1
- eegdash/features/inspect.py +80 -5
- eegdash/features/serialization.py +49 -17
- eegdash/features/utils.py +75 -8
- eegdash/hbn/__init__.py +11 -0
- eegdash/hbn/preprocessing.py +61 -19
- eegdash/hbn/windows.py +157 -34
- eegdash/logging.py +54 -0
- eegdash/mongodb.py +55 -24
- eegdash/paths.py +28 -5
- eegdash/utils.py +29 -1
- {eegdash-0.3.9.dev170082126.dist-info → eegdash-0.4.0.dist-info}/METADATA +11 -59
- eegdash-0.4.0.dist-info/RECORD +37 -0
- eegdash-0.3.9.dev170082126.dist-info/RECORD +0 -35
- {eegdash-0.3.9.dev170082126.dist-info → eegdash-0.4.0.dist-info}/WHEEL +0 -0
- {eegdash-0.3.9.dev170082126.dist-info → eegdash-0.4.0.dist-info}/licenses/LICENSE +0 -0
- {eegdash-0.3.9.dev170082126.dist-info → eegdash-0.4.0.dist-info}/top_level.txt +0 -0
eegdash/dataset/registry.py
CHANGED
@@ -14,7 +14,35 @@ def register_openneuro_datasets(
     namespace: Dict[str, Any] | None = None,
     add_to_all: bool = True,
 ) -> Dict[str, type]:
-    """Dynamically create dataset classes from a summary file.
+    """Dynamically create and register dataset classes from a summary file.
+
+    This function reads a CSV file containing summaries of OpenNeuro datasets
+    and dynamically creates a Python class for each dataset. These classes
+    inherit from a specified base class and are pre-configured with the
+    dataset's ID.
+
+    Parameters
+    ----------
+    summary_file : str or pathlib.Path
+        The path to the CSV file containing the dataset summaries.
+    base_class : type, optional
+        The base class from which the new dataset classes will inherit. If not
+        provided, :class:`eegdash.api.EEGDashDataset` is used.
+    namespace : dict, optional
+        The namespace (e.g., `globals()`) into which the newly created classes
+        will be injected. Defaults to the local `globals()` of this module.
+    add_to_all : bool, default True
+        If True, the names of the newly created classes are added to the
+        `__all__` list of the target namespace, making them importable with
+        `from ... import *`.
+
+    Returns
+    -------
+    dict[str, type]
+        A dictionary mapping the names of the registered classes to the class
+        types themselves.
+
+    """
     if base_class is None:
         from ..api import EEGDashDataset as base_class  # lazy import
 
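For orientation, here is a minimal sketch of how the documented function can be driven; the CSV path is illustrative (the wheel ships a per-dataset summary in eegdash/dataset/dataset_summary.csv), not code taken from this diff:

# Minimal sketch; the CSV path below is illustrative.
from eegdash.dataset.registry import register_openneuro_datasets

registered = register_openneuro_datasets(
    summary_file="dataset_summary.csv",  # one row per OpenNeuro dataset ID
    namespace=globals(),                 # inject the generated classes here
    add_to_all=True,                     # extend __all__ for star-imports
)
print(sorted(registered))  # generated class names mapped to the class types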
@@ -57,14 +85,8 @@ def register_openneuro_datasets(
 
         init = make_init(dataset_id)
 
-
-
-    {_markdown_table(row_series)}
-
-    This class is a thin convenience wrapper for the dataset ``{dataset_id}``.
-    Constructor arguments are forwarded to :class:`{base_class.__name__}`; see the
-    base class documentation for parameter details and examples.
-    """
+        # Generate rich docstring with dataset metadata
+        doc = _generate_rich_docstring(dataset_id, row_series, base_class)
 
         # init.__doc__ = doc
 
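The hunk above only changes how `doc` is produced; the class creation itself happens later in the function, outside this diff. As a rough, self-contained illustration of the overall pattern (one subclass per dataset ID carrying the generated docstring), under the assumption that classes are built with `type()`:

# Illustration only: not the package's exact factory code.
class _Base:  # stand-in for EEGDashDataset
    pass

def _make_dataset_class(dataset_id: str, doc: str) -> type:
    # Upper-cased ID matches the import example in the generated docstring.
    return type(dataset_id.upper(), (_Base,), {"__doc__": doc})

DS002718 = _make_dataset_class("ds002718", "OpenNeuro dataset ``ds002718``.")
print(DS002718.__doc__)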
@@ -90,8 +112,133 @@ def register_openneuro_datasets(
     return registered
 
 
+def _generate_rich_docstring(
+    dataset_id: str, row_series: pd.Series, base_class: type
+) -> str:
+    """Generate a comprehensive, well-formatted docstring for a dataset class.
+
+    Parameters
+    ----------
+    dataset_id : str
+        The identifier of the dataset (e.g., "ds002718").
+    row_series : pandas.Series
+        A pandas Series containing the metadata for the dataset, extracted
+        from the summary CSV file.
+    base_class : type
+        The base class from which the new dataset class inherits. Used to
+        generate the "See Also" section of the docstring.
+
+    Returns
+    -------
+    str
+        A formatted docstring.
+
+    """
+    # Extract metadata with safe defaults
+    n_subjects = row_series.get("n_subjects", "Unknown")
+    n_records = row_series.get("n_records", "Unknown")
+    n_tasks = row_series.get("n_tasks", "Unknown")
+    modality = row_series.get("modality of exp", "")
+    exp_type = row_series.get("type of exp", "")
+    subject_type = row_series.get("Type Subject", "")
+    duration = row_series.get("duration_hours_total", "Unknown")
+    size = row_series.get("size", "Unknown")
+
+    # Create description based on available metadata
+    description_parts = []
+    if modality and str(modality).strip():
+        description_parts.append(f"**Modality**: {modality}")
+    if exp_type and str(exp_type).strip():
+        description_parts.append(f"**Type**: {exp_type}")
+    if subject_type and str(subject_type).strip():
+        description_parts.append(f"**Subjects**: {subject_type}")
+
+    description = (
+        " | ".join(description_parts)
+        if description_parts
+        else "EEG dataset from OpenNeuro"
+    )
+
+    # Generate the docstring
+    docstring = f"""OpenNeuro dataset ``{dataset_id}``.
+
+    {description}
+
+    This dataset contains {n_subjects} subjects with {n_records} recordings across {n_tasks} tasks.
+    Total duration: {duration} hours. Dataset size: {size}.
+
+    {_markdown_table(row_series)}
+
+    This dataset class provides convenient access to the ``{dataset_id}`` dataset through the EEGDash interface.
+    It inherits all functionality from :class:`~{base_class.__module__}.{base_class.__name__}` with the dataset filter pre-configured.
+
+    Parameters
+    ----------
+    cache_dir : str
+        Directory to cache downloaded data.
+    query : dict, optional
+        Additional MongoDB-style filters to AND with the dataset selection.
+        Must not contain the key ``dataset``.
+    s3_bucket : str, optional
+        Base S3 bucket used to locate the data.
+    **kwargs
+        Additional arguments passed to the base dataset class.
+
+    Examples
+    --------
+    Basic usage:
+
+    >>> from eegdash.dataset import {dataset_id.upper()}
+    >>> dataset = {dataset_id.upper()}(cache_dir="./data")
+    >>> print(f"Number of recordings: {{len(dataset)}}")
+
+    Load a specific recording:
+
+    >>> if len(dataset) > 0:
+    ...     recording = dataset[0]
+    ...     raw = recording.load()
+    ...     print(f"Sampling rate: {{raw.info['sfreq']}} Hz")
+    ...     print(f"Number of channels: {{len(raw.ch_names)}}")
+
+    Filter by additional criteria:
+
+    >>> # Get subset with specific task or subject
+    >>> filtered_dataset = {dataset_id.upper()}(
+    ...     cache_dir="./data",
+    ...     query={{"task": "RestingState"}}  # if applicable
+    ... )
+
+    Notes
+    -----
+    More details available in the `NEMAR documentation <https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}>`__.
+
+    See Also
+    --------
+    {base_class.__name__} : Base dataset class with full API documentation
+    """
+
+    return docstring
+
+
 def _markdown_table(row_series: pd.Series) -> str:
-    """Create a reStructuredText grid table from a pandas Series.
+    """Create a reStructuredText grid table from a pandas Series.
+
+    This helper function takes a pandas Series containing dataset metadata
+    and formats it into a reStructuredText grid table for inclusion in
+    docstrings.
+
+    Parameters
+    ----------
+    row_series : pandas.Series
+        A Series where each index is a metadata field and each value is the
+        corresponding metadata value.
+
+    Returns
+    -------
+    str
+        A string containing the formatted reStructuredText table.
+
+    """
     if row_series.empty:
         return ""
     dataset_id = row_series["dataset"]
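A quick sketch of how the new helper can be exercised; the field names mirror the `row_series.get(...)` calls above, the values are invented, and `_markdown_table` may consult further columns that this diff does not show:

import pandas as pd

from eegdash.api import EEGDashDataset
from eegdash.dataset.registry import _generate_rich_docstring

row = pd.Series({
    "dataset": "ds002718",          # required by _markdown_table
    "n_subjects": 18,               # invented values from here on
    "n_records": 18,
    "n_tasks": 1,
    "modality of exp": "Visual",
    "type of exp": "Perception",
    "Type Subject": "Healthy",
    "duration_hours_total": 12.5,
    "size": "20 GB",
})
doc = _generate_rich_docstring("ds002718", row, EEGDashDataset)
print(doc.splitlines()[0])  # -> OpenNeuro dataset ``ds002718``.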
@@ -128,7 +275,12 @@ def _markdown_table(row_series: pd.Series) -> str:
     table = tabulate(df, headers="keys", tablefmt="rst", showindex=False)
 
     # Add a caption for the table
-
+    # Use an anonymous external link (double underscore) to avoid duplicate
+    # target warnings when this docstring is repeated across many classes.
+    caption = (
+        f"Short overview of dataset {dataset_id} more details in the "
+        f"`NeMAR documentation <https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}>`__."
+    )
     # adding caption below the table
     # Indent the table to fit within the admonition block
     indented_table = "\n".join("   " + line for line in table.split("\n"))
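For reference, the tabulate(..., tablefmt="rst") call at the heart of `_markdown_table` renders a pandas frame as an RST simple table, roughly like this (invented values; alignment approximate):

import pandas as pd
from tabulate import tabulate

df = pd.DataFrame([{"dataset": "ds002718", "n_subjects": 18, "size": "20 GB"}])
print(tabulate(df, headers="keys", tablefmt="rst", showindex=False))
# =========  ============  ======
# dataset      n_subjects  size
# =========  ============  ======
# ds002718             18  20 GB
# =========  ============  ======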
eegdash/downloader.py
ADDED
@@ -0,0 +1,197 @@
+# Authors: The EEGDash contributors.
+# License: GNU General Public License
+# Copyright the EEGDash contributors.
+
+"""File downloading utilities for EEG data from cloud storage.
+
+This module provides functions for downloading EEG data files and BIDS dependencies from
+AWS S3 storage, with support for caching and progress tracking. It handles the communication
+between the EEGDash metadata database and the actual EEG data stored in the cloud.
+"""
+
+import re
+from pathlib import Path
+from typing import Any
+
+import s3fs
+from fsspec.callbacks import TqdmCallback
+
+
+def get_s3_filesystem() -> s3fs.S3FileSystem:
+    """Get an anonymous S3 filesystem object.
+
+    Initializes and returns an ``s3fs.S3FileSystem`` for anonymous access
+    to public S3 buckets, configured for the 'us-east-2' region.
+
+    Returns
+    -------
+    s3fs.S3FileSystem
+        An S3 filesystem object.
+
+    """
+    return s3fs.S3FileSystem(anon=True, client_kwargs={"region_name": "us-east-2"})
+
+
+def get_s3path(s3_bucket: str, filepath: str) -> str:
+    """Construct an S3 URI from a bucket and file path.
+
+    Parameters
+    ----------
+    s3_bucket : str
+        The S3 bucket name (e.g., "s3://my-bucket").
+    filepath : str
+        The path to the file within the bucket.
+
+    Returns
+    -------
+    str
+        The full S3 URI (e.g., "s3://my-bucket/path/to/file").
+
+    """
+    return f"{s3_bucket}/{filepath}"
+
+
+def download_s3_file(s3_path: str, local_path: Path, s3_open_neuro: bool) -> Path:
+    """Download a single file from S3 to a local path.
+
+    Handles the download of a raw EEG data file from an S3 bucket, caching it
+    at the specified local path. Creates parent directories if they do not exist.
+
+    Parameters
+    ----------
+    s3_path : str
+        The full S3 URI of the file to download.
+    local_path : pathlib.Path
+        The local file path where the downloaded file will be saved.
+    s3_open_neuro : bool
+        A flag indicating if the S3 bucket is the OpenNeuro main bucket, which
+        may affect path handling.
+
+    Returns
+    -------
+    pathlib.Path
+        The local path to the downloaded file.
+
+    """
+    filesystem = get_s3_filesystem()
+    if not s3_open_neuro:
+        s3_path = re.sub(r"(^|/)ds\d{6}/", r"\1", s3_path, count=1)
+        # TODO: remove this hack when competition is over
+        if s3_path.endswith(".set"):
+            s3_path = s3_path[:-4] + ".bdf"
+            local_path = local_path.with_suffix(".bdf")
+
+    local_path.parent.mkdir(parents=True, exist_ok=True)
+    _filesystem_get(filesystem=filesystem, s3path=s3_path, filepath=local_path)
+
+    return local_path
+
+
+def download_dependencies(
+    s3_bucket: str,
+    bids_dependencies: list[str],
+    bids_dependencies_original: list[str],
+    cache_dir: Path,
+    dataset_folder: Path,
+    record: dict[str, Any],
+    s3_open_neuro: bool,
+) -> None:
+    """Download all BIDS dependency files from S3.
+
+    Iterates through a list of BIDS dependency files, downloads each from the
+    specified S3 bucket, and caches them in the appropriate local directory
+    structure.
+
+    Parameters
+    ----------
+    s3_bucket : str
+        The S3 bucket to download from.
+    bids_dependencies : list of str
+        A list of dependency file paths relative to the S3 bucket root.
+    bids_dependencies_original : list of str
+        The original dependency paths, used for resolving local cache paths.
+    cache_dir : pathlib.Path
+        The root directory for caching.
+    dataset_folder : pathlib.Path
+        The specific folder for the dataset within the cache directory.
+    record : dict
+        The metadata record for the main data file, used to resolve paths.
+    s3_open_neuro : bool
+        Flag for OpenNeuro-specific path handling.
+
+    """
+    filesystem = get_s3_filesystem()
+    for i, dep in enumerate(bids_dependencies):
+        if not s3_open_neuro:
+            if dep.endswith(".set"):
+                dep = dep[:-4] + ".bdf"
+
+        s3path = get_s3path(s3_bucket, dep)
+        if not s3_open_neuro:
+            dep = bids_dependencies_original[i]
+
+        dep_path = Path(dep)
+        if dep_path.parts and dep_path.parts[0] == record.get("dataset"):
+            dep_local = Path(dataset_folder, *dep_path.parts[1:])
+        else:
+            dep_local = Path(dataset_folder) / dep_path
+        filepath = cache_dir / dep_local
+        if not s3_open_neuro:
+            if filepath.suffix == ".set":
+                filepath = filepath.with_suffix(".bdf")
+
+        if not filepath.exists():
+            filepath.parent.mkdir(parents=True, exist_ok=True)
+            _filesystem_get(filesystem=filesystem, s3path=s3path, filepath=filepath)
+
+
+def _filesystem_get(filesystem: s3fs.S3FileSystem, s3path: str, filepath: Path) -> Path:
+    """Perform the file download using fsspec with a progress bar.
+
+    Internal helper function that wraps the ``filesystem.get`` call to include
+    a TQDM progress bar.
+
+    Parameters
+    ----------
+    filesystem : s3fs.S3FileSystem
+        The filesystem object to use for the download.
+    s3path : str
+        The full S3 URI of the source file.
+    filepath : pathlib.Path
+        The local destination path.
+
+    Returns
+    -------
+    pathlib.Path
+        The local path to the downloaded file.
+
+    """
+    info = filesystem.info(s3path)
+    size = info.get("size") or info.get("Size")
+
+    callback = TqdmCallback(
+        size=size,
+        tqdm_kwargs=dict(
+            desc=f"Downloading {Path(s3path).name}",
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            dynamic_ncols=True,
+            leave=True,
+            mininterval=0.2,
+            smoothing=0.1,
+            miniters=1,
+            bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} "
+            "[{elapsed}<{remaining}, {rate_fmt}]",
+        ),
+    )
+    filesystem.get(s3path, str(filepath), callback=callback)
+    return filepath
+
+
+__all__ = [
+    "download_s3_file",
+    "download_dependencies",
+    "get_s3path",
+    "get_s3_filesystem",
+]
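Taken together, a typical single-file download composes `get_s3path` and `download_s3_file`; the bucket and key below are illustrative, and `s3_open_neuro=True` leaves the path untouched (with `False`, the code above strips the `dsXXXXXX/` prefix and swaps `.set` for `.bdf`):

from pathlib import Path

from eegdash.downloader import download_s3_file, get_s3path

# Illustrative bucket/key; anonymous access is configured by get_s3_filesystem().
s3_path = get_s3path("s3://openneuro.org", "ds002718/sub-012/eeg/sub-012_task-rest_eeg.set")
local = download_s3_file(
    s3_path,
    Path("data/ds002718/sub-012/eeg/sub-012_task-rest_eeg.set"),
    s3_open_neuro=True,  # OpenNeuro layout: no prefix stripping, no .bdf swap
)
print(local)  # path to the cached file; a tqdm bar is shown during download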