eegdash 0.3.9.dev170082126__py3-none-any.whl → 0.4.0__py3-none-any.whl

@@ -1,28 +1,62 @@
-import logging
+# Authors: The EEGDash contributors.
+# License: GNU General Public License
+# Copyright the EEGDash contributors.
+
+"""BIDS metadata processing and query building utilities.
+
+This module provides functions for processing BIDS-formatted EEG metadata, building database
+queries from user parameters, and enriching metadata records with participant information.
+It handles the translation between user-friendly query parameters and MongoDB query syntax.
+"""
+
 import re
 from pathlib import Path
 from typing import Any
 
+import pandas as pd
+from mne_bids import BIDSPath
+
 from .const import ALLOWED_QUERY_FIELDS
 from .const import config as data_config
-
-logger = logging.getLogger("eegdash")
+from .logging import logger
 
 __all__ = [
     "build_query_from_kwargs",
     "load_eeg_attrs_from_bids_file",
     "merge_participants_fields",
     "normalize_key",
+    "participants_row_for_subject",
+    "participants_extras_from_tsv",
+    "attach_participants_extras",
+    "enrich_from_participants",
 ]
 
 
 def build_query_from_kwargs(**kwargs) -> dict[str, Any]:
-    """Build and validate a MongoDB query from user-friendly keyword arguments.
+    """Build and validate a MongoDB query from keyword arguments.
+
+    This function converts user-friendly keyword arguments into a valid
+    MongoDB query dictionary. It handles scalar values as exact matches and
+    list-like values as ``$in`` queries. It also performs validation to
+    reject unsupported fields and empty values.
+
+    Parameters
+    ----------
+    **kwargs
+        Keyword arguments representing query filters. Allowed keys are defined
+        in ``eegdash.const.ALLOWED_QUERY_FIELDS``.
+
+    Returns
+    -------
+    dict
+        A MongoDB query dictionary.
+
+    Raises
+    ------
+    ValueError
+        If an unsupported query field is provided, or if a value is None or
+        an empty string/list.
 
-    Improvements:
-    - Reject None values and empty/whitespace-only strings
-    - For list/tuple/set values: strip strings, drop None/empties, deduplicate, and use `$in`
-    - Preserve scalars as exact matches
     """
     # 1. Validate that all provided keys are allowed for querying
     unknown_fields = set(kwargs.keys()) - ALLOWED_QUERY_FIELDS
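
The rewritten docstring spells out the translation rules: scalars become exact matches, list-likes become ``$in`` clauses, and None/empty values are rejected. A minimal sketch of that documented behavior (the function body is only partially visible in this diff, so the expected outputs below follow the docstring, not executed code):

    # Sketch assuming eegdash 0.4.0; duplicates in list values are dropped,
    # per the notes in the removed docstring.
    query = build_query_from_kwargs(dataset=["ds005505", "ds005506", "ds005505"])
    # -> {"dataset": {"$in": ["ds005505", "ds005506"]}}

    build_query_from_kwargs(not_a_field="x")  # ValueError: unsupported field
    build_query_from_kwargs(dataset="   ")    # ValueError: empty/whitespace value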
@@ -72,47 +106,30 @@ def build_query_from_kwargs(**kwargs) -> dict[str, Any]:
     return query
 
 
-def _get_raw_extensions(bids_file: str, bids_dataset) -> list[str]:
-    """Helper to find paths to additional "sidecar" files that may be associated
-    with a given main data file in a BIDS dataset; paths are returned as relative to
-    the parent dataset path.
-
-    For example, if the input file is a .set file, this will return the relative path
-    to a corresponding .fdt file (if any).
-    """
-    bids_file = Path(bids_file)
-    extensions = {
-        ".set": [".set", ".fdt"],  # eeglab
-        ".edf": [".edf"],  # european
-        ".vhdr": [".eeg", ".vhdr", ".vmrk", ".dat", ".raw"],  # brainvision
-        ".bdf": [".bdf"],  # biosemi
-    }
-    return [
-        str(bids_dataset._get_relative_bidspath(bids_file.with_suffix(suffix)))
-        for suffix in extensions[bids_file.suffix]
-        if bids_file.with_suffix(suffix).exists()
-    ]
-
-
 def load_eeg_attrs_from_bids_file(bids_dataset, bids_file: str) -> dict[str, Any]:
-    """Build the metadata record for a given BIDS file (single recording) in a BIDS dataset.
+    """Build a metadata record for a BIDS file.
 
-    Attributes are at least the ones defined in data_config attributes (set to None if missing),
-    but are typically a superset, and include, among others, the paths to relevant
-    meta-data files needed to load and interpret the file in question.
+    Extracts metadata attributes from a single BIDS EEG file within a given
+    BIDS dataset. The extracted attributes include BIDS entities, file paths,
+    and technical metadata required for database indexing.
 
     Parameters
     ----------
     bids_dataset : EEGBIDSDataset
         The BIDS dataset object containing the file.
     bids_file : str
-        The path to the BIDS file within the dataset.
+        The path to the BIDS file to process.
 
     Returns
     -------
-    dict:
-        A dictionary representing the metadata record for the given file. This is the
-        same format as the records stored in the database.
+    dict
+        A dictionary of metadata attributes for the file, suitable for
+        insertion into the database.
+
+    Raises
+    ------
+    ValueError
+        If ``bids_file`` is not found in the ``bids_dataset``.
 
     """
     if bids_file not in bids_dataset.files:
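
As the added Raises section notes, membership in ``bids_dataset.files`` is now an explicit contract; a hedged usage sketch (``EEGBIDSDataset`` construction is outside this diff):

    # Hypothetical objects; eegdash provides EEGBIDSDataset elsewhere.
    record = load_eeg_attrs_from_bids_file(bids_dataset, bids_file)
    # record is a plain dict matching the database schema; passing a path
    # that is not in bids_dataset.files raises ValueError up front.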
@@ -140,7 +157,7 @@ def load_eeg_attrs_from_bids_file(bids_dataset, bids_file: str) -> dict[str, Any
         eeg_json = None
 
     bids_dependencies_files = data_config["bids_dependencies_files"]
-    bidsdependencies = []
+    bidsdependencies: list[str] = []
     for extension in bids_dependencies_files:
         try:
             dep_path = bids_dataset.get_bids_metadata_files(bids_file, extension)
@@ -151,7 +168,26 @@ def load_eeg_attrs_from_bids_file(bids_dataset, bids_file: str) -> dict[str, Any
         except Exception:
             pass
 
-    bidsdependencies.extend(_get_raw_extensions(bids_file, bids_dataset))
+    bids_path = BIDSPath(
+        subject=bids_dataset.get_bids_file_attribute("subject", bids_file),
+        session=bids_dataset.get_bids_file_attribute("session", bids_file),
+        task=bids_dataset.get_bids_file_attribute("task", bids_file),
+        run=bids_dataset.get_bids_file_attribute("run", bids_file),
+        root=bids_dataset.bidsdir,
+        datatype=bids_dataset.get_bids_file_attribute("modality", bids_file),
+        suffix="eeg",
+        extension=Path(bids_file).suffix,
+        check=False,
+    )
+
+    sidecars_map = {
+        ".set": [".fdt"],
+        ".vhdr": [".eeg", ".vmrk", ".dat", ".raw"],
+    }
+    for ext in sidecars_map.get(bids_path.extension, []):
+        sidecar = bids_path.find_matching_sidecar(extension=ext, on_error="ignore")
+        if sidecar is not None:
+            bidsdependencies.append(str(bids_dataset._get_relative_bidspath(sidecar)))
 
     # Define field extraction functions with error handling
     field_extractors = {
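
This hunk replaces the deleted ``_get_raw_extensions`` helper with mne-bids machinery: a non-validated ``BIDSPath`` (``check=False``) is built from the recording's entities, and companion files are resolved with ``find_matching_sidecar(..., on_error="ignore")``, which returns None instead of raising when a sidecar is absent. A standalone sketch of the same pattern, assuming a BrainVision recording under a hypothetical ``bids_root``:

    from pathlib import Path

    from mne_bids import BIDSPath

    bids_path = BIDSPath(
        subject="01", task="rest", root=Path("bids_root"),
        datatype="eeg", suffix="eeg", extension=".vhdr", check=False,
    )
    deps: list[str] = []
    for ext in [".eeg", ".vmrk", ".dat", ".raw"]:
        sidecar = bids_path.find_matching_sidecar(extension=ext, on_error="ignore")
        if sidecar is not None:
            deps.append(str(sidecar))  # e.g. the .eeg data file next to the .vhdr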
@@ -185,11 +221,23 @@
 
 
 def normalize_key(key: str) -> str:
-    """Normalize a metadata key for robust matching.
+    """Normalize a string key for robust matching.
+
+    Converts the key to lowercase, replaces non-alphanumeric characters with
+    underscores, and removes leading/trailing underscores. This allows for
+    tolerant matching of keys that may have different capitalization or
+    separators (e.g., "p-factor" becomes "p_factor").
+
+    Parameters
+    ----------
+    key : str
+        The key to normalize.
+
+    Returns
+    -------
+    str
+        The normalized key.
 
-    Lowercase and replace non-alphanumeric characters with underscores, then strip
-    leading/trailing underscores. This allows tolerant matching such as
-    "p-factor" ≈ "p_factor" ≈ "P Factor".
     """
     return re.sub(r"[^a-z0-9]+", "_", str(key).lower()).strip("_")
 
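Since the implementation is the single regex shown above, the examples in the new docstring can be verified directly:

    import re

    def normalize_key(key: str) -> str:
        return re.sub(r"[^a-z0-9]+", "_", str(key).lower()).strip("_")

    assert normalize_key("p-factor") == "p_factor"
    assert normalize_key("P Factor") == "p_factor"
    assert normalize_key("__Age (years)__") == "age_years"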
@@ -199,27 +247,27 @@ def merge_participants_fields(
     participants_row: dict[str, Any] | None,
     description_fields: list[str] | None = None,
 ) -> dict[str, Any]:
-    """Merge participants.tsv fields into a dataset description dictionary.
+    """Merge fields from a participants.tsv row into a description dict.
 
-    - Preserves existing entries in ``description`` (no overwrites).
-    - Fills requested ``description_fields`` first, preserving their original names.
-    - Adds all remaining participants columns generically using normalized keys
-      unless a matching requested field already captured them.
+    Enriches a description dictionary with data from a subject's row in
+    ``participants.tsv``. It avoids overwriting existing keys in the
+    description.
 
     Parameters
     ----------
     description : dict
-        Current description to be enriched in-place and returned.
-    participants_row : dict | None
-        A mapping of participants.tsv columns for the current subject.
-    description_fields : list[str] | None
-        Optional list of requested description fields. When provided, matching is
-        performed by normalized names; the original requested field names are kept.
+        The description dictionary to enrich.
+    participants_row : dict or None
+        A dictionary representing a row from ``participants.tsv``. If None,
+        the original description is returned unchanged.
+    description_fields : list of str, optional
+        A list of specific fields to include in the description. Matching is
+        done using normalized keys.
 
     Returns
     -------
     dict
-        The enriched description (same object as input for convenience).
+        The enriched description dictionary.
 
     """
     if not isinstance(description, dict) or not isinstance(participants_row, dict):
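
The no-overwrite contract in the new docstring, combined with the normalized-key fallback visible at the start of the next hunk, implies behavior along these lines (a sketch; most of the function body lies outside this diff):

    description = {"age": 30}
    row = {"age": 31, "sex": "F", "p-factor": "0.5"}
    merged = merge_participants_fields(description, row)
    # Expected: existing "age" is kept (30), new columns are added, and
    # unrequested columns land under normalized keys, e.g. "p_factor".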
@@ -252,3 +300,196 @@
         if norm_key not in description:
             description[norm_key] = part_value
     return description
+
+
+def participants_row_for_subject(
+    bids_root: str | Path,
+    subject: str,
+    id_columns: tuple[str, ...] = ("participant_id", "participant", "subject"),
+) -> pd.Series | None:
+    """Load participants.tsv and return the row for a specific subject.
+
+    Searches for a subject's data in the ``participants.tsv`` file within a
+    BIDS dataset. It can identify the subject with or without the "sub-"
+    prefix.
+
+    Parameters
+    ----------
+    bids_root : str or Path
+        The root directory of the BIDS dataset.
+    subject : str
+        The subject identifier (e.g., "01" or "sub-01").
+    id_columns : tuple of str, default ("participant_id", "participant", "subject")
+        A tuple of column names to search for the subject identifier.
+
+    Returns
+    -------
+    pandas.Series or None
+        A pandas Series containing the subject's data if found, otherwise None.
+
+    """
+    try:
+        participants_tsv = Path(bids_root) / "participants.tsv"
+        if not participants_tsv.exists():
+            return None
+
+        df = pd.read_csv(
+            participants_tsv, sep="\t", dtype="string", keep_default_na=False
+        )
+        if df.empty:
+            return None
+
+        candidates = {str(subject), f"sub-{subject}"}
+        present_cols = [c for c in id_columns if c in df.columns]
+        if not present_cols:
+            return None
+
+        mask = pd.Series(False, index=df.index)
+        for col in present_cols:
+            mask |= df[col].isin(candidates)
+        match = df.loc[mask]
+        if match.empty:
+            return None
+        return match.iloc[0]
+    except Exception:
+        return None
+
+
+def participants_extras_from_tsv(
+    bids_root: str | Path,
+    subject: str,
+    *,
+    id_columns: tuple[str, ...] = ("participant_id", "participant", "subject"),
+    na_like: tuple[str, ...] = ("", "n/a", "na", "nan", "unknown", "none"),
+) -> dict[str, Any]:
+    """Extract additional participant information from participants.tsv.
+
+    Retrieves all non-identifier and non-empty fields for a subject from
+    the ``participants.tsv`` file.
+
+    Parameters
+    ----------
+    bids_root : str or Path
+        The root directory of the BIDS dataset.
+    subject : str
+        The subject identifier.
+    id_columns : tuple of str, default ("participant_id", "participant", "subject")
+        Column names to be treated as identifiers and excluded from the
+        output.
+    na_like : tuple of str, default ("", "n/a", "na", "nan", "unknown", "none")
+        Values to be considered as "Not Available" and excluded.
+
+    Returns
+    -------
+    dict
+        A dictionary of extra participant information.
+
+    """
+    row = participants_row_for_subject(bids_root, subject, id_columns=id_columns)
+    if row is None:
+        return {}
+
+    # Drop identifier columns and clean values
+    extras = row.drop(labels=[c for c in id_columns if c in row.index], errors="ignore")
+    s = extras.astype("string").str.strip()
+    valid = ~s.isna() & ~s.str.lower().isin(na_like)
+    return s[valid].to_dict()
+
+
+def attach_participants_extras(
+    raw: Any,
+    description: Any,
+    extras: dict[str, Any],
+) -> None:
+    """Attach extra participant data to a raw object and its description.
+
+    Updates the ``raw.info['subject_info']`` and the description object
+    (dict or pandas Series) with extra data from ``participants.tsv``.
+    It does not overwrite existing keys.
+
+    Parameters
+    ----------
+    raw : mne.io.Raw
+        The MNE Raw object to be updated.
+    description : dict or pandas.Series
+        The description object to be updated.
+    extras : dict
+        A dictionary of extra participant information to attach.
+
+    """
+    if not extras:
+        return
+
+    # Raw.info enrichment
+    try:
+        subject_info = raw.info.get("subject_info") or {}
+        if not isinstance(subject_info, dict):
+            subject_info = {}
+        pe = subject_info.get("participants_extras") or {}
+        if not isinstance(pe, dict):
+            pe = {}
+        for k, v in extras.items():
+            pe.setdefault(k, v)
+        subject_info["participants_extras"] = pe
+        raw.info["subject_info"] = subject_info
+    except Exception:
+        pass
+
+    # Description enrichment
+    try:
+        import pandas as _pd  # local import to avoid hard dependency at import time
+
+        if isinstance(description, dict):
+            for k, v in extras.items():
+                description.setdefault(k, v)
+        elif isinstance(description, _pd.Series):
+            missing = [k for k in extras.keys() if k not in description.index]
+            if missing:
+                description.loc[missing] = [extras[m] for m in missing]
+    except Exception:
+        pass
+
+
+def enrich_from_participants(
+    bids_root: str | Path,
+    bidspath: BIDSPath,
+    raw: Any,
+    description: Any,
+) -> dict[str, Any]:
+    """Read participants.tsv and attach extra info for the subject.
+
+    This is a convenience function that finds the subject from the
+    ``bidspath``, retrieves extra information from ``participants.tsv``,
+    and attaches it to the raw object and its description.
+
+    Parameters
+    ----------
+    bids_root : str or Path
+        The root directory of the BIDS dataset.
+    bidspath : mne_bids.BIDSPath
+        The BIDSPath object for the current data file.
+    raw : mne.io.Raw
+        The MNE Raw object to be updated.
+    description : dict or pandas.Series
+        The description object to be updated.
+
+    Returns
+    -------
+    dict
+        The dictionary of extras that were attached.
+
+    """
+    subject = getattr(bidspath, "subject", None)
+    if not subject:
+        return {}
+    extras = participants_extras_from_tsv(bids_root, subject)
+    attach_participants_extras(raw, description, extras)
+    return extras
+
+
+__all__ = [
+    "participants_row_for_subject",
+    "participants_extras_from_tsv",
+    "attach_participants_extras",
+    "enrich_from_participants",
+]
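
The four new helpers form a small pipeline: locate the subject's row, drop identifier columns and NA-like values, then attach what remains to ``raw.info["subject_info"]["participants_extras"]`` and to the description. The TSV half can be sketched self-contained (the attach step needs an MNE ``Raw`` object and is omitted):

    import tempfile
    from pathlib import Path

    root = Path(tempfile.mkdtemp())
    (root / "participants.tsv").write_text(
        "participant_id\tage\tsex\thandedness\n"
        "sub-01\t23\tF\tn/a\n"
    )
    # With the helpers above in scope:
    # participants_row_for_subject(root, "01") matches "01" or "sub-01".
    # participants_extras_from_tsv(root, "01") -> {"age": "23", "sex": "F"}
    #   ("handedness" is dropped since "n/a" is in the na_like defaults, and
    #   values stay strings because the TSV is read with dtype="string").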
eegdash/const.py CHANGED
@@ -1,3 +1,21 @@
+# Authors: The EEGDash contributors.
+# License: GNU General Public License
+# Copyright the EEGDash contributors.
+
+"""Configuration constants and mappings for EEGDash.
+
+This module contains global configuration settings, allowed query fields, and mapping
+constants used throughout the EEGDash package. It defines the interface between EEGDash
+releases and OpenNeuro dataset identifiers, as well as validation rules for database queries.
+"""
+
+__all__ = [
+    "config",
+    "ALLOWED_QUERY_FIELDS",
+    "RELEASE_TO_OPENNEURO_DATASET_MAP",
+    "SUBJECT_MINI_RELEASE_MAP",
+]
+
 ALLOWED_QUERY_FIELDS = {
     "data_name",
     "dataset",
@@ -10,6 +28,8 @@ ALLOWED_QUERY_FIELDS = {
     "nchans",
     "ntimes",
 }
+"""set: A set of field names that are permitted in database queries constructed
+via :func:`~eegdash.api.EEGDash.find` with keyword arguments."""
 
 RELEASE_TO_OPENNEURO_DATASET_MAP = {
     "R11": "ds005516",
@@ -24,6 +44,8 @@ RELEASE_TO_OPENNEURO_DATASET_MAP = {
     "R2": "ds005506",
     "R1": "ds005505",
 }
+"""dict: A mapping from Healthy Brain Network (HBN) release identifiers (e.g., "R11")
+to their corresponding OpenNeuro dataset identifiers (e.g., "ds005516")."""
 
 SUBJECT_MINI_RELEASE_MAP = {
     "R11": [
@@ -269,6 +291,9 @@ SUBJECT_MINI_RELEASE_MAP = {
         "NDARFW972KFQ",
     ],
 }
+"""dict: A mapping from HBN release identifiers to a list of subject IDs.
+This is used to select a small, representative subset of subjects for creating
+"mini" datasets for testing and demonstration purposes."""
 
 config = {
     "required_fields": ["data_name"],
@@ -304,3 +329,21 @@ config = {
     ],
     "accepted_query_fields": ["data_name", "dataset"],
 }
+"""dict: A global configuration dictionary for the EEGDash package.
+
+Keys
+----
+required_fields : list
+    Fields that must be present in every database record.
+attributes : dict
+    A schema defining the expected primary attributes and their types for a
+    database record.
+description_fields : list
+    A list of fields considered to be descriptive metadata for a recording,
+    which can be used for filtering and display.
+bids_dependencies_files : list
+    A list of BIDS metadata filenames that are relevant for interpreting an
+    EEG recording.
+accepted_query_fields : list
+    Fields that are accepted for lightweight existence checks in the database.
+"""