eegdash 0.3.9.dev182388821__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -14,7 +14,35 @@ def register_openneuro_datasets(
     namespace: Dict[str, Any] | None = None,
     add_to_all: bool = True,
 ) -> Dict[str, type]:
-    """Dynamically create dataset classes from a summary file."""
+    """Dynamically create and register dataset classes from a summary file.
+
+    This function reads a CSV file containing summaries of OpenNeuro datasets
+    and dynamically creates a Python class for each dataset. These classes
+    inherit from a specified base class and are pre-configured with the
+    dataset's ID.
+
+    Parameters
+    ----------
+    summary_file : str or pathlib.Path
+        The path to the CSV file containing the dataset summaries.
+    base_class : type, optional
+        The base class from which the new dataset classes will inherit. If not
+        provided, :class:`eegdash.api.EEGDashDataset` is used.
+    namespace : dict, optional
+        The namespace (e.g., `globals()`) into which the newly created classes
+        will be injected. Defaults to the local `globals()` of this module.
+    add_to_all : bool, default True
+        If True, the names of the newly created classes are added to the
+        `__all__` list of the target namespace, making them importable with
+        `from ... import *`.
+
+    Returns
+    -------
+    dict[str, type]
+        A dictionary mapping the names of the registered classes to the class
+        types themselves.
+
+    """
     if base_class is None:
         from ..api import EEGDashDataset as base_class  # lazy import
 
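The expanded docstring above fully specifies the registration contract. As a quick orientation, here is a minimal usage sketch; the import path is an assumption (this diff does not show which file defines `register_openneuro_datasets`), and the CSV path is hypothetical:

```python
from pathlib import Path

# Assumed module path; adjust to wherever register_openneuro_datasets lives.
from eegdash.dataset.registry import register_openneuro_datasets

# Hypothetical summary CSV with one row per OpenNeuro dataset.
registered = register_openneuro_datasets(
    summary_file=Path("openneuro_datasets.csv"),
    namespace=globals(),  # inject the generated classes into this module
    add_to_all=True,      # and expose them via `from ... import *`
)
print(sorted(registered))  # one generated class name per dataset row
```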
@@ -57,14 +85,8 @@ def register_openneuro_datasets(
 
     init = make_init(dataset_id)
 
-    doc = f"""OpenNeuro dataset ``{dataset_id}``.
-
-    {_markdown_table(row_series)}
-
-    This class is a thin convenience wrapper for the dataset ``{dataset_id}``.
-    Constructor arguments are forwarded to :class:`{base_class.__name__}`; see the
-    base class documentation for parameter details and examples.
-    """
+    # Generate rich docstring with dataset metadata
+    doc = _generate_rich_docstring(dataset_id, row_series, base_class)
 
     # init.__doc__ = doc
 
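The replacement delegates docstring assembly to `_generate_rich_docstring` (defined in the next hunk); note that `init.__doc__ = doc` stays commented out, so the text presumably ends up on the generated class itself. A minimal sketch of that dynamic-class pattern, with stand-in names (the real base is `EEGDashDataset` and the real text comes from `_generate_rich_docstring`):

```python
# Stand-ins for illustration only.
class Base:
    def __init__(self, cache_dir: str, **kwargs):
        self.cache_dir = cache_dir

doc = "OpenNeuro dataset ``ds002718``. ..."  # generated docstring text

# type(name, bases, namespace) builds the class and attaches the docstring.
DS002718 = type("DS002718", (Base,), {"__doc__": doc})
print(DS002718("./data").cache_dir)  # ./data
print(DS002718.__doc__)
```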
@@ -90,8 +112,133 @@ def register_openneuro_datasets(
     return registered
 
 
+def _generate_rich_docstring(
+    dataset_id: str, row_series: pd.Series, base_class: type
+) -> str:
+    """Generate a comprehensive, well-formatted docstring for a dataset class.
+
+    Parameters
+    ----------
+    dataset_id : str
+        The identifier of the dataset (e.g., "ds002718").
+    row_series : pandas.Series
+        A pandas Series containing the metadata for the dataset, extracted
+        from the summary CSV file.
+    base_class : type
+        The base class from which the new dataset class inherits. Used to
+        generate the "See Also" section of the docstring.
+
+    Returns
+    -------
+    str
+        A formatted docstring.
+
+    """
+    # Extract metadata with safe defaults
+    n_subjects = row_series.get("n_subjects", "Unknown")
+    n_records = row_series.get("n_records", "Unknown")
+    n_tasks = row_series.get("n_tasks", "Unknown")
+    modality = row_series.get("modality of exp", "")
+    exp_type = row_series.get("type of exp", "")
+    subject_type = row_series.get("Type Subject", "")
+    duration = row_series.get("duration_hours_total", "Unknown")
+    size = row_series.get("size", "Unknown")
+
+    # Create description based on available metadata
+    description_parts = []
+    if modality and str(modality).strip():
+        description_parts.append(f"**Modality**: {modality}")
+    if exp_type and str(exp_type).strip():
+        description_parts.append(f"**Type**: {exp_type}")
+    if subject_type and str(subject_type).strip():
+        description_parts.append(f"**Subjects**: {subject_type}")
+
+    description = (
+        " | ".join(description_parts)
+        if description_parts
+        else "EEG dataset from OpenNeuro"
+    )
+
+    # Generate the docstring
+    docstring = f"""OpenNeuro dataset ``{dataset_id}``.
+
+    {description}
+
+    This dataset contains {n_subjects} subjects with {n_records} recordings across {n_tasks} tasks.
+    Total duration: {duration} hours. Dataset size: {size}.
+
+    {_markdown_table(row_series)}
+
+    This dataset class provides convenient access to the ``{dataset_id}`` dataset through the EEGDash interface.
+    It inherits all functionality from :class:`~{base_class.__module__}.{base_class.__name__}` with the dataset filter pre-configured.
+
+    Parameters
+    ----------
+    cache_dir : str
+        Directory to cache downloaded data.
+    query : dict, optional
+        Additional MongoDB-style filters to AND with the dataset selection.
+        Must not contain the key ``dataset``.
+    s3_bucket : str, optional
+        Base S3 bucket used to locate the data.
+    **kwargs
+        Additional arguments passed to the base dataset class.
+
+    Examples
+    --------
+    Basic usage:
+
+    >>> from eegdash.dataset import {dataset_id.upper()}
+    >>> dataset = {dataset_id.upper()}(cache_dir="./data")
+    >>> print(f"Number of recordings: {{len(dataset)}}")
+
+    Load a specific recording:
+
+    >>> if len(dataset) > 0:
+    ...     recording = dataset[0]
+    ...     raw = recording.load()
+    ...     print(f"Sampling rate: {{raw.info['sfreq']}} Hz")
+    ...     print(f"Number of channels: {{len(raw.ch_names)}}")
+
+    Filter by additional criteria:
+
+    >>> # Get subset with specific task or subject
+    >>> filtered_dataset = {dataset_id.upper()}(
+    ...     cache_dir="./data",
+    ...     query={{"task": "RestingState"}}  # if applicable
+    ... )
+
+    Notes
+    -----
+    More details available in the `NEMAR documentation <https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}>`__.
+
+    See Also
+    --------
+    {base_class.__name__} : Base dataset class with full API documentation
+    """
+
+    return docstring
+
+
 def _markdown_table(row_series: pd.Series) -> str:
-    """Create a reStructuredText grid table from a pandas Series."""
+    """Create a reStructuredText grid table from a pandas Series.
+
+    This helper function takes a pandas Series containing dataset metadata
+    and formats it into a reStructuredText grid table for inclusion in
+    docstrings.
+
+    Parameters
+    ----------
+    row_series : pandas.Series
+        A Series where each index is a metadata field and each value is the
+        corresponding metadata value.
+
+    Returns
+    -------
+    str
+        A string containing the formatted reStructuredText table.
+
+    """
     if row_series.empty:
         return ""
     dataset_id = row_series["dataset"]
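The `row_series.get(..., default)` lookups above are what keep a sparsely filled summary CSV from breaking docstring generation. A quick illustration of the fallback behavior:

```python
import pandas as pd

# A summary row with some metadata columns missing.
row = pd.Series({"dataset": "ds002718", "n_subjects": 18})
print(row.get("n_subjects", "Unknown"))  # 18
print(row.get("n_tasks", "Unknown"))     # "Unknown" -- column absent
```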
@@ -128,7 +275,12 @@ def _markdown_table(row_series: pd.Series) -> str:
     table = tabulate(df, headers="keys", tablefmt="rst", showindex=False)
 
     # Add a caption for the table
-    caption = f"Short overview of dataset {dataset_id} more details in the `Nemar documentation <https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}>`_."
+    # Use an anonymous external link (double underscore) to avoid duplicate
+    # target warnings when this docstring is repeated across many classes.
+    caption = (
+        f"Short overview of dataset {dataset_id} more details in the "
+        f"`NeMAR documentation <https://nemar.org/dataexplorer/detail?dataset_id={dataset_id}>`__."
+    )
     # adding caption below the table
     # Indent the table to fit within the admonition block
     indented_table = "\n".join(" " + line for line in table.split("\n"))
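The move to a double-underscore (anonymous) reStructuredText link is deliberate: Sphinx warns about duplicate explicit targets when the same link name ("NeMAR documentation") resolves to different URLs across many generated docstrings, and anonymous targets avoid that. The table generation itself can be previewed in isolation; a small sketch using the same `tabulate` call as above, with an illustrative one-row frame:

```python
import pandas as pd
from tabulate import tabulate

# Illustrative row; the real DataFrame is built from the summary CSV.
df = pd.DataFrame([{"dataset": "ds002718", "n_subjects": 18, "n_tasks": 1}])
print(tabulate(df, headers="keys", tablefmt="rst", showindex=False))
```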
eegdash/downloader.py ADDED
@@ -0,0 +1,197 @@
+# Authors: The EEGDash contributors.
+# License: GNU General Public License
+# Copyright the EEGDash contributors.
+
+"""File downloading utilities for EEG data from cloud storage.
+
+This module provides functions for downloading EEG data files and BIDS dependencies from
+AWS S3 storage, with support for caching and progress tracking. It handles the communication
+between the EEGDash metadata database and the actual EEG data stored in the cloud.
+"""
+
+import re
+from pathlib import Path
+from typing import Any
+
+import s3fs
+from fsspec.callbacks import TqdmCallback
+
+
+def get_s3_filesystem() -> s3fs.S3FileSystem:
+    """Get an anonymous S3 filesystem object.
+
+    Initializes and returns an ``s3fs.S3FileSystem`` for anonymous access
+    to public S3 buckets, configured for the 'us-east-2' region.
+
+    Returns
+    -------
+    s3fs.S3FileSystem
+        An S3 filesystem object.
+
+    """
+    return s3fs.S3FileSystem(anon=True, client_kwargs={"region_name": "us-east-2"})
+
+
+def get_s3path(s3_bucket: str, filepath: str) -> str:
+    """Construct an S3 URI from a bucket and file path.
+
+    Parameters
+    ----------
+    s3_bucket : str
+        The S3 bucket name (e.g., "s3://my-bucket").
+    filepath : str
+        The path to the file within the bucket.
+
+    Returns
+    -------
+    str
+        The full S3 URI (e.g., "s3://my-bucket/path/to/file").
+
+    """
+    return f"{s3_bucket}/{filepath}"
+
+
+def download_s3_file(s3_path: str, local_path: Path, s3_open_neuro: bool) -> Path:
+    """Download a single file from S3 to a local path.
+
+    Handles the download of a raw EEG data file from an S3 bucket, caching it
+    at the specified local path. Creates parent directories if they do not exist.
+
+    Parameters
+    ----------
+    s3_path : str
+        The full S3 URI of the file to download.
+    local_path : pathlib.Path
+        The local file path where the downloaded file will be saved.
+    s3_open_neuro : bool
+        A flag indicating if the S3 bucket is the OpenNeuro main bucket, which
+        may affect path handling.
+
+    Returns
+    -------
+    pathlib.Path
+        The local path to the downloaded file.
+
+    """
+    filesystem = get_s3_filesystem()
+    if not s3_open_neuro:
+        s3_path = re.sub(r"(^|/)ds\d{6}/", r"\1", s3_path, count=1)
+        # TODO: remove this hack when competition is over
+        if s3_path.endswith(".set"):
+            s3_path = s3_path[:-4] + ".bdf"
+            local_path = local_path.with_suffix(".bdf")
+
+    local_path.parent.mkdir(parents=True, exist_ok=True)
+    _filesystem_get(filesystem=filesystem, s3path=s3_path, filepath=local_path)
+
+    return local_path
+
+
+def download_dependencies(
+    s3_bucket: str,
+    bids_dependencies: list[str],
+    bids_dependencies_original: list[str],
+    cache_dir: Path,
+    dataset_folder: Path,
+    record: dict[str, Any],
+    s3_open_neuro: bool,
+) -> None:
+    """Download all BIDS dependency files from S3.
+
+    Iterates through a list of BIDS dependency files, downloads each from the
+    specified S3 bucket, and caches them in the appropriate local directory
+    structure.
+
+    Parameters
+    ----------
+    s3_bucket : str
+        The S3 bucket to download from.
+    bids_dependencies : list of str
+        A list of dependency file paths relative to the S3 bucket root.
+    bids_dependencies_original : list of str
+        The original dependency paths, used for resolving local cache paths.
+    cache_dir : pathlib.Path
+        The root directory for caching.
+    dataset_folder : pathlib.Path
+        The specific folder for the dataset within the cache directory.
+    record : dict
+        The metadata record for the main data file, used to resolve paths.
+    s3_open_neuro : bool
+        Flag for OpenNeuro-specific path handling.
+
+    """
+    filesystem = get_s3_filesystem()
+    for i, dep in enumerate(bids_dependencies):
+        if not s3_open_neuro:
+            if dep.endswith(".set"):
+                dep = dep[:-4] + ".bdf"
+
+        s3path = get_s3path(s3_bucket, dep)
+        if not s3_open_neuro:
+            dep = bids_dependencies_original[i]
+
+        dep_path = Path(dep)
+        if dep_path.parts and dep_path.parts[0] == record.get("dataset"):
+            dep_local = Path(dataset_folder, *dep_path.parts[1:])
+        else:
+            dep_local = Path(dataset_folder) / dep_path
+        filepath = cache_dir / dep_local
+        if not s3_open_neuro:
+            if filepath.suffix == ".set":
+                filepath = filepath.with_suffix(".bdf")
+
+        if not filepath.exists():
+            filepath.parent.mkdir(parents=True, exist_ok=True)
+            _filesystem_get(filesystem=filesystem, s3path=s3path, filepath=filepath)
+
+
+def _filesystem_get(filesystem: s3fs.S3FileSystem, s3path: str, filepath: Path) -> Path:
+    """Perform the file download using fsspec with a progress bar.
+
+    Internal helper function that wraps the ``filesystem.get`` call to include
+    a TQDM progress bar.
+
+    Parameters
+    ----------
+    filesystem : s3fs.S3FileSystem
+        The filesystem object to use for the download.
+    s3path : str
+        The full S3 URI of the source file.
+    filepath : pathlib.Path
+        The local destination path.
+
+    Returns
+    -------
+    pathlib.Path
+        The local path to the downloaded file.
+
+    """
+    info = filesystem.info(s3path)
+    size = info.get("size") or info.get("Size")
+
+    callback = TqdmCallback(
+        size=size,
+        tqdm_kwargs=dict(
+            desc=f"Downloading {Path(s3path).name}",
+            unit="B",
+            unit_scale=True,
+            unit_divisor=1024,
+            dynamic_ncols=True,
+            leave=True,
+            mininterval=0.2,
+            smoothing=0.1,
+            miniters=1,
+            bar_format="{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} "
+            "[{elapsed}<{remaining}, {rate_fmt}]",
+        ),
+    )
+    filesystem.get(s3path, str(filepath), callback=callback)
+    return filepath
+
+
+__all__ = [
+    "download_s3_file",
+    "download_dependencies",
+    "get_s3path",
+    "get_s3_filesystem",
+]
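Because `eegdash/downloader.py` exports its helpers via `__all__`, the new module can be exercised directly. A hedged end-to-end sketch; the object key below is illustrative, although `s3://openneuro.org` is OpenNeuro's public bucket:

```python
from pathlib import Path

from eegdash.downloader import download_s3_file, get_s3path

# Illustrative key; substitute a real recording path from the metadata DB.
uri = get_s3path("s3://openneuro.org", "ds002718/sub-001/eeg/sub-001_task-rest_eeg.set")
local = download_s3_file(
    s3_path=uri,
    local_path=Path("cache/ds002718/sub-001/eeg/sub-001_task-rest_eeg.set"),
    s3_open_neuro=True,  # main OpenNeuro bucket: no prefix/suffix rewriting
)
print(local)
```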