bids-manager 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bids_manager/__init__.py +11 -0
- bids_manager/build_heuristic_from_tsv.py +214 -0
- bids_manager/dicom_inventory.py +463 -0
- bids_manager/fill_bids_ignore.py +51 -0
- bids_manager/gui.py +4115 -0
- bids_manager/miscellaneous/images/ANCP_lab.png +0 -0
- bids_manager/miscellaneous/images/Icon.png +0 -0
- bids_manager/miscellaneous/images/Jochem.jpg +0 -0
- bids_manager/miscellaneous/images/Karel.jpeg +0 -0
- bids_manager/miscellaneous/images/Logo.png +0 -0
- bids_manager/post_conv_renamer.py +224 -0
- bids_manager/run_heudiconv_from_heuristic.py +248 -0
- bids_manager/scans_utils.py +80 -0
- bids_manager-0.1.0.dist-info/METADATA +142 -0
- bids_manager-0.1.0.dist-info/RECORD +19 -0
- bids_manager-0.1.0.dist-info/WHEEL +5 -0
- bids_manager-0.1.0.dist-info/entry_points.txt +7 -0
- bids_manager-0.1.0.dist-info/licenses/LICENSE +21 -0
- bids_manager-0.1.0.dist-info/top_level.txt +1 -0
bids_manager/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""BIDS Manager package."""
|
|
2
|
+
|
|
3
|
+
from importlib import metadata
|
|
4
|
+
|
|
5
|
+
__all__ = ["__version__"]


def _installed_version() -> str:  # pragma: no cover - version resolution
    """Return the installed ``bids-manager`` version, or ``"0.0.0"`` if absent."""
    try:
        return metadata.version("bids-manager")
    except metadata.PackageNotFoundError:
        # Running from a source tree that was never installed.
        return "0.0.0"


__version__ = _installed_version()
|
|
11
|
+
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
build_heuristic_from_tsv.py — **v10**
|
|
4
|
+
====================================
|
|
5
|
+
Simple heuristic that:
|
|
6
|
+
1. **Keeps every sequence**, including SBRef.
|
|
7
|
+
2. **Uses the raw SeriesDescription** (cleaned) as the filename stem – no
|
|
8
|
+
added task or echo logic (a `rep-N` suffix is appended only for repeated sequences).
|
|
9
|
+
3. Skips only modalities listed in `SKIP_BY_DEFAULT` (`report`,
|
|
10
|
+
`physio`, `refscan`).
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from textwrap import dedent
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import re
|
|
18
|
+
|
|
19
|
+
# -----------------------------------------------------------------------------
|
|
20
|
+
# Configuration
|
|
21
|
+
# -----------------------------------------------------------------------------
|
|
22
|
+
# Fine-grained modality labels whose rows ``generate`` forces to ``include = 0``.
SKIP_BY_DEFAULT = {"report", "physio", "refscan"}
|
|
23
|
+
|
|
24
|
+
# -----------------------------------------------------------------------------
|
|
25
|
+
# Helper functions
|
|
26
|
+
# -----------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
def clean(text: str) -> str:
    """Drop every character that is not an ASCII letter or digit.

    The argument is converted with ``str()`` first, so any object is accepted.
    The result is used to build variable names in the generated heuristic.
    """
    as_text = str(text)
    return re.sub(r"[^0-9A-Za-z]+", "", as_text)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def safe_stem(seq: str) -> str:
    """Clean a SeriesDescription for use in a filename.

    Every run of characters other than ASCII letters, digits, ``_`` and ``-``
    is replaced by a single underscore; leading and trailing underscores are
    removed.

    Parameters
    ----------
    seq : str
        Text to sanitise. Non-string values (for example a purely numeric
        description that pandas parsed as ``int``) are converted with
        ``str()`` instead of raising ``AttributeError``.

    Returns
    -------
    str
        The sanitised stem; may be empty when *seq* holds no usable character.
    """
    text = str(seq).strip()
    return re.sub(r"[^0-9A-Za-z_-]+", "_", text).strip("_")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def dedup_parts(*parts: str) -> str:
    """Join *parts* with underscores, collapsing consecutive duplicate tokens.

    Each part is split on ``_``; empty tokens are ignored and a token equal to
    the one kept immediately before it is skipped.
    """
    kept: list[str] = []
    for piece in parts:
        for token in str(piece).split("_"):
            if not token:
                continue
            if kept and kept[-1] == token:
                continue
            kept.append(token)
    return "_".join(kept)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# -----------------------------------------------------------------------------
|
|
49
|
+
# Core writer
|
|
50
|
+
# -----------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
def write_heuristic(df: pd.DataFrame, dst: Path) -> None:
    """Write a HeuDiConv heuristic from ``df`` to ``dst``.

    The generated module contains a ``create_key`` helper, a ``SID_MAP``
    dictionary, one ``key_*`` template per output file and an
    ``infotodict(seqinfo)`` function that matches series on
    ``series_description``, ``dcm_dir_name`` and ``series_uid``.

    Every value taken from ``df`` is embedded with ``repr()``, so quotes,
    backslashes or newlines in a SeriesDescription cannot break the generated
    source.

    Parameters
    ----------
    df : pandas.DataFrame
        Table generated by :mod:`dicom_inventory` describing the DICOM series.
        The columns ``BIDS_name``, ``session``, ``sequence`` and
        ``source_folder`` are read; ``series_uid`` and ``modality_bids`` are
        used when present.
    dst : Path
        Destination ``heuristic_<name>.py`` file.
    """

    print("Building heuristic (v10)…")
    buf: list[str] = []

    # 1 ─ header -----------------------------------------------------------
    buf.append(
        dedent(
            '''\
            """AUTO-GENERATED HeuDiConv heuristic (v10)."""
            from typing import Tuple

            def create_key(template: str,
                           outtype: Tuple[str, ...] = ("nii.gz",),
                           annotation_classes=None):
                if not template:
                    raise ValueError("Template must be non-empty")
                return template, outtype, annotation_classes
            '''
        )
    )

    # 2 ─ SID_MAP ----------------------------------------------------------
    sid_pairs = {
        (
            clean(str(r.source_folder)) or clean(Path(r.source_folder or ".").name),
            r.BIDS_name,
        )
        for r in df.itertuples()
    }
    buf.append("\nSID_MAP = {\n")
    for folder, bids in sorted(sid_pairs):
        buf.append(f"    {folder!r}: {str(bids)!r},\n")
    buf.append("}\n\n")

    # 3 ─ template keys ----------------------------------------------------
    # Maps (sequence, BIDS_name, session, folder name, series UID) to the key
    # variable; the UID keeps repeated sequences apart.
    seq2key: dict[tuple[str, str, str, str, str], str] = {}
    # key variable -> output template, in first-seen order
    key_templates: dict[str, str] = {}

    grouped = df.groupby(["BIDS_name", "session", "sequence"], dropna=False)
    rep_counts = grouped["sequence"].transform("count")
    rep_index = grouped.cumcount() + 1

    for idx, row in df.iterrows():
        ses_raw = row.get("session", "")
        ses = "" if pd.isna(ses_raw) else str(ses_raw).strip()
        folder = Path(str(row.get("source_folder", "."))).name
        # A purely numeric description may have been parsed as a number.
        seq = str(row["sequence"])
        bids = row["BIDS_name"]
        container = row.get("modality_bids", "misc") or "misc"
        stem = safe_stem(seq)

        name_parts = [bids, ses, stem]
        if rep_counts.loc[idx] > 1:
            # Only repeated sequences receive a rep-N suffix.
            name_parts.append(f"rep-{rep_index.loc[idx]}")
        base = dedup_parts(*name_parts)
        path = "/".join(p for p in [bids, ses, container] if p)

        key_var = "key_" + clean("_".join(p for p in name_parts if p))
        # The first template registered for a variable name wins.
        key_templates.setdefault(key_var, f"{path}/{base}")

        # Collapsed field-map rows carry several UIDs joined with '|'.
        uid_field = str(row.get("series_uid", ""))
        for uid in [u for u in uid_field.split("|") if u] or [""]:
            seq2key.setdefault((seq, bids, ses, folder, uid), key_var)

    for var, tpl in key_templates.items():
        buf.append(f"{var} = create_key({tpl!r})\n")
    buf.append("\n")

    # 4 ─ infotodict() ----------------------------------------------------
    buf.append("def infotodict(seqinfo):\n    \"\"\"Return mapping SeriesDescription → key list.\"\"\"\n")
    # Several UIDs may share one key; declare each list only once.
    used_keys = list(dict.fromkeys(seq2key.values()))
    for var in used_keys:
        buf.append(f"    {var}_list = []\n")
    buf.append("    info = {\n")
    for var in used_keys:
        buf.append(f"        {var}: {var}_list,\n")
    buf.append("    }\n\n")

    buf.append("    for s in seqinfo:\n")
    if not seq2key:
        # Keep the generated module syntactically valid without any rule.
        buf.append("        pass\n")
    for (seq, _b, _s, folder, uid), var in seq2key.items():
        buf.append(
            f"        if s.series_description == {seq!r} and s.dcm_dir_name == {folder!r} "
            f"and getattr(s, 'series_uid', '') == {str(uid)!r}:\n"
        )
        buf.append(f"            {var}_list.append(s.series_id)\n")
    buf.append("    return info\n")

    dst.write_text("".join(buf), encoding="utf-8")
    print("Heuristic written →", dst.resolve())
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# -----------------------------------------------------------------------------
|
|
164
|
+
# Driver
|
|
165
|
+
# -----------------------------------------------------------------------------
|
|
166
|
+
|
|
167
|
+
def generate(tsv: Path, out_dir: Path) -> None:
    """Generate heuristic files for each study described in ``tsv``.

    Rows whose ``modality`` is listed in ``SKIP_BY_DEFAULT`` are forced to
    ``include = 0``. Only rows whose ``include`` value is numerically ``1``
    are kept. One ``heuristic_<study>.py`` file is written per
    ``StudyDescription`` and a matching ``heudiconv`` command line is printed.

    Parameters
    ----------
    tsv : Path
        Path to ``subject_summary.tsv`` produced by :mod:`dicom_inventory`.
    out_dir : Path
        Directory where the heuristic files will be written; it is created
        when missing.
    """

    df = pd.read_csv(tsv, sep="\t", keep_default_na=False)

    # Drop rows with unwanted modalities
    mask = df.modality.isin(SKIP_BY_DEFAULT)
    if mask.any():
        df.loc[mask, "include"] = 0
        # sorted() keeps the message stable; set order varies between runs.
        print(f"Auto‑skipped {mask.sum()} rows ({', '.join(sorted(SKIP_BY_DEFAULT))})")

    # With keep_default_na=False a single blank cell turns the whole column
    # into strings, so compare numerically; blank/invalid cells count as 0.
    include = pd.to_numeric(df["include"], errors="coerce").fillna(0)
    df = df[include == 1]

    out_dir.mkdir(parents=True, exist_ok=True)

    for study, sub_df in df.groupby("StudyDescription"):
        # The study may be numeric, and may sanitise to an empty stem.
        fname = safe_stem(str(study or "unknown")) or "unknown"
        heur = out_dir / f"heuristic_{fname}.py"
        write_heuristic(sub_df, heur)
        folders = " ".join(
            sorted(
                {
                    clean(f) or clean(Path(f or ".").name)
                    for f in sub_df.source_folder.unique()
                }
            )
        )
        command = (
            f'heudiconv -d "<RAW_ROOT>/{{subject}}/**/*.*" -s {folders} '
            f"-f {heur.name} -c dcm2niix -o <BIDS_OUT>/{fname} -b --minmeta --overwrite"
        )
        print(f"\n{command}")
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def main() -> None:
    """Run the ``build-heuristic`` command line utility.

    Parses the two positional arguments ``tsv`` and ``out_dir`` and passes
    them to :func:`generate` as ``Path`` objects.
    """

    import argparse

    cli = argparse.ArgumentParser(description="Generate HeuDiConv heuristic(s) from TSV")
    positional = (
        ("tsv", "Path to subject_summary.tsv file"),
        ("out_dir", "Directory to write heuristic files"),
    )
    for arg_name, arg_help in positional:
        cli.add_argument(arg_name, help=arg_help)
    opts = cli.parse_args()

    generate(Path(opts.tsv), Path(opts.out_dir))


if __name__ == "__main__":
    main()
|
|
214
|
+
|
|
@@ -0,0 +1,463 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
dicom_inventory.py — fully-commented, no-emoji version
|
|
4
|
+
------------------------------------------------------
|
|
5
|
+
|
|
6
|
+
Creates a long-format TSV describing every DICOM series in *root_dir*.
|
|
7
|
+
|
|
8
|
+
Why you want this
|
|
9
|
+
-----------------
|
|
10
|
+
* Lets you review **all** SeriesDescriptions, subjects, sessions and file counts
|
|
11
|
+
before converting anything.
|
|
12
|
+
* Column `include` defaults to 1 except for scout/report/physlog sequences,
|
|
13
|
+
which start at 0 so they are skipped by default.
|
|
14
|
+
* Generated table is the single source of truth you feed into a helper script
|
|
15
|
+
that writes the HeuDiConv heuristic.
|
|
16
|
+
|
|
17
|
+
Output columns (ordered as they appear)
|
|
18
|
+
---------------------------------------
|
|
19
|
+
subject – GivenName shown only on the first row of each subject block
|
|
20
|
+
BIDS_name – auto-assigned `sub-001`, `sub-002`, … (same GivenName → same ID)
|
|
21
|
+
session – `ses-<label>` if exactly one unique session tag is present in
|
|
22
|
+
that folder, otherwise blank
|
|
23
|
+
source_folder – relative path from the DICOM root to the folder containing the
|
|
24
|
+
series
|
|
25
|
+
include – defaults to 1 but scout/report/physlog rows start at 0
|
|
26
|
+
sequence – original SeriesDescription
|
|
27
|
+
series_uid – DICOM SeriesInstanceUID identifying a specific acquisition
|
|
28
|
+
rep – 1, 2, … if multiple SeriesInstanceUIDs share the same description
|
|
29
|
+
acq_time – AcquisitionTime of the first file in that series
|
|
30
|
+
modality – fine label inferred from patterns (T1w, bold, dwi, …)
|
|
31
|
+
modality_bids – top-level container (anat, func, dwi, fmap) derived from
|
|
32
|
+
*modality*
|
|
33
|
+
n_files – number of DICOM files (.dcm or .ima) with that SeriesDescription
|
|
34
|
+
GivenName … StudyDescription – demographics copied from the first header seen
|
|
35
|
+
"""
|
|
36
|
+
|
|
37
|
+
import os
|
|
38
|
+
import re
|
|
39
|
+
from collections import defaultdict
|
|
40
|
+
from typing import Optional
|
|
41
|
+
from pathlib import Path
|
|
42
|
+
from joblib import Parallel, delayed
|
|
43
|
+
|
|
44
|
+
import pandas as pd
|
|
45
|
+
import pydicom
|
|
46
|
+
from pydicom.multival import MultiValue
|
|
47
|
+
|
|
48
|
+
# Persistent user preferences are stored in a folder next to this module.
PREF_DIR = Path(__file__).resolve().parent.joinpath("user_preferences")
SEQ_DICT_FILE = PREF_DIR.joinpath("sequence_dictionary.tsv")

# File-name suffixes (lower case) that mark a file as DICOM.
DICOM_EXTS = (".dcm", ".ima")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# ----------------------------------------------------------------------
|
|
57
|
+
# 1. Patterns: SeriesDescription → fine-grained modality label
|
|
58
|
+
# (order matters: first match wins)
|
|
59
|
+
# ----------------------------------------------------------------------
|
|
60
|
+
BIDS_PATTERNS = {
    # anatomy
    "T1w": ("t1w", "t1-weight", "t1_", "t1 ", "mprage", "tfl3d", "fspgr"),
    "T2w": ("t2w", "space", "tse"),
    "FLAIR": ("flair",),
    "MTw": ("gre-mt", "gre_mt", "mt"),
    "PDw": ("gre-nm", "gre_nm"),
    "scout": ("localizer", "scout"),
    "report": ("phoenixzipreport", "phoenix document", ".pdf", "report"),
    "refscan": ("type-ref", "reference", "refscan"),
    # functional
    "bold": ("fmri", "bold", "task-"),
    "SBRef": ("sbref",),
    # diffusion
    "dwi": ("dti", "dwi", "diff"),
    # field maps
    "fmap": (
        "gre_field",
        "fieldmapping",
        "_fmap",
        "fmap",
        "phase",
        "magnitude",
        "b0rf",
        "b0_map",
        "b0map",
    ),
    # misc (kept for completeness)
    "physio": ("physiolog", "physio", "pulse", "resp"),
}

# Untouched copy of the bundled patterns so they can be restored later.
DEFAULT_BIDS_PATTERNS = {
    label: tuple(needles) for label, needles in BIDS_PATTERNS.items()
}
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def load_sequence_dictionary() -> None:
    """Load user-modified sequence patterns from :data:`SEQ_DICT_FILE`.

    The file is read as a TSV with the columns ``modality`` and ``pattern``.
    When at least one non-empty pair is found, :data:`BIDS_PATTERNS` is
    replaced by the loaded mapping (patterns are lower-cased). A missing file
    leaves the current patterns untouched. An unreadable file also leaves
    them untouched, but a warning is printed instead of failing silently.
    """
    global BIDS_PATTERNS
    if not SEQ_DICT_FILE.exists():
        return
    try:
        df = pd.read_csv(SEQ_DICT_FILE, sep="\t", keep_default_na=False)
    except Exception as exc:
        # Best effort: a broken preferences file must not prevent the module
        # from loading, but the user should learn why it was ignored.
        print(f"Warning: could not read {SEQ_DICT_FILE}: {exc}")
        return
    patterns: defaultdict[str, list[str]] = defaultdict(list)
    for _, row in df.iterrows():
        mod = str(row.get("modality", "")).strip()
        pat = str(row.get("pattern", "")).strip().lower()
        if mod and pat:
            patterns[mod].append(pat)
    if patterns:
        BIDS_PATTERNS = {m: tuple(pats) for m, pats in patterns.items()}
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def restore_sequence_dictionary() -> None:
    """Revert :data:`BIDS_PATTERNS` to the bundled defaults.

    The in-memory patterns are rebuilt from :data:`DEFAULT_BIDS_PATTERNS` and
    the user file :data:`SEQ_DICT_FILE` is deleted. A file that does not exist
    is ignored; any other deletion failure is reported with a warning and does
    not raise.
    """
    global BIDS_PATTERNS
    BIDS_PATTERNS = {m: tuple(pats) for m, pats in DEFAULT_BIDS_PATTERNS.items()}
    try:
        SEQ_DICT_FILE.unlink()
    except FileNotFoundError:
        # Nothing to remove: no user dictionary had been saved.
        pass
    except OSError as exc:
        # Otherwise the stale file would silently be loaded on next import.
        print(f"Warning: could not remove {SEQ_DICT_FILE}: {exc}")
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
load_sequence_dictionary()
|
|
133
|
+
|
|
134
|
+
def guess_modality(series: str) -> str:
    """Return the label of the first pattern group matching *series*.

    Matching is a case-insensitive substring test against the patterns in
    ``BIDS_PATTERNS``, tried in the dictionary's order. ``'unknown'`` is
    returned when no pattern matches.
    """
    lowered = series.lower()
    matching_labels = (
        label
        for label, needles in BIDS_PATTERNS.items()
        if any(needle in lowered for needle in needles)
    )
    return next(matching_labels, "unknown")
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# Exact ImageType component lists that classify_fieldmap_type() treats as a
# magnitude image and as a phase image.
MAGNITUDE_IMGTYPE = [
    "ORIGINAL",
    "PRIMARY",
    "M",
    "ND",
    "NORM",
]
PHASE_IMGTYPE = [
    "ORIGINAL",
    "PRIMARY",
    "P",
    "ND",
]
|
|
145
|
+
|
|
146
|
+
def normalize_image_type(value) -> list:
    """Return ImageType components as a list of strings.

    Accepted inputs:

    * ``None``: returns ``[]``.
    * ``list`` / ``tuple`` / pydicom ``MultiValue``: each element is
      converted with ``str()`` and stripped.
    * a backslash-separated string (``"ORIGINAL\\PRIMARY\\M"``).
    * the ``repr`` of a list (``"['ORIGINAL', 'PRIMARY']"``); surrounding
      single *or* double quotes are removed and ``"[]"`` yields ``[]``.
    * any other value: its stripped string form as a one-element list, or
      ``[]`` when that string is empty.
    """
    if value is None:
        return []
    if not isinstance(value, str):
        # Plain sequences are tested first so MultiValue is consulted only for
        # values that are neither str, list nor tuple.
        if isinstance(value, (list, tuple)) or isinstance(value, MultiValue):
            return [str(x).strip() for x in value]
        value = str(value)
    if "\\" in value:
        return [p.strip() for p in value.split("\\")]
    text = value.strip()
    if text.startswith("[") and text.endswith("]"):
        inner = text[1:-1].strip()
        if not inner:
            # "[]" used to come back as [''], a bogus single component.
            return []
        return [p.strip().strip("'\"") for p in inner.split(",")]
    return [text] if text else []
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def classify_fieldmap_type(img_list: list) -> str:
    """Classify an ImageType list as magnitude (``'M'``) or phase (``'P'``).

    The list must equal ``MAGNITUDE_IMGTYPE`` or ``PHASE_IMGTYPE`` exactly;
    anything else yields an empty string.
    """
    references = (("M", MAGNITUDE_IMGTYPE), ("P", PHASE_IMGTYPE))
    for code, reference in references:
        if img_list == reference:
            return code
    return ""
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# ----------------------------------------------------------------------
|
|
172
|
+
# 2. Map fine label → top-level BIDS container (anat, func, …)
|
|
173
|
+
# ----------------------------------------------------------------------
|
|
174
|
+
BIDS_CONTAINER = {
    label: container
    for container, labels in (
        ("anat", ("T1w", "T2w", "FLAIR", "MTw", "PDw", "scout", "report", "refscan")),
        ("func", ("bold", "SBRef")),
        ("dwi", ("dwi",)),
        ("fmap", ("fmap",)),
    )
    for label in labels
}


def modality_to_container(mod: str) -> str:
    """Map a fine modality label to its container; unlisted labels give ``''``."""
    try:
        return BIDS_CONTAINER[mod]
    except KeyError:
        return ""


# Session tag detector (e.g. ses-pre, ses-01); matching ignores case.
SESSION_RE = re.compile(r"ses-([a-zA-Z0-9]+)", re.IGNORECASE)
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# ----------------------------------------------------------------------
|
|
191
|
+
# 3. Main scanner
|
|
192
|
+
# ----------------------------------------------------------------------
|
|
193
|
+
def scan_dicoms_long(
    root_dir: str,
    output_tsv: Optional[str] = None,
    n_jobs: int = 1,
) -> pd.DataFrame:
    """
    Build a long-format inventory of the DICOM series found below *root_dir*.

    Parameters
    ----------
    root_dir : str
        Folder that is walked recursively; every file whose lower-cased name
        ends with one of ``DICOM_EXTS`` has its header read.
    output_tsv : str | None
        When given, the inventory is also written to this path as a TSV.
    n_jobs : int
        Worker count handed to ``joblib.Parallel`` for reading the headers.

    Returns
    -------
    pandas.DataFrame
        One row per (subject, folder, SeriesDescription, SeriesInstanceUID).
        Rows labelled ``fmap`` are merged when they share subject, session,
        folder, description and the first four characters of ``acq_time``.
    """

    root_dir = Path(root_dir)
    print(f"Scanning DICOM headers under: {root_dir}")

    # Accumulators. Except for ``demo`` (keyed by subject key only) they are
    # nested as subject key -> folder -> (series, uid); ``sessset`` stops at
    # the folder level and holds a set of session tags.
    demo = {}
    counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    mods = defaultdict(lambda: defaultdict(dict))
    acq_times = defaultdict(lambda: defaultdict(dict))
    imgtypes = defaultdict(lambda: defaultdict(dict))
    sessset = defaultdict(lambda: defaultdict(set))

    # PASS 1: collect candidate files, then read their headers in parallel
    file_list = [
        os.path.join(dirpath, fname)
        for dirpath, _dirs, fnames in os.walk(root_dir)
        for fname in fnames
        if fname.lower().endswith(DICOM_EXTS)
    ]

    def _read_one(fpath: str):
        """Return the header fields of one file, or None if it is unreadable."""
        try:
            ds = pydicom.dcmread(fpath, stop_before_pixels=True, force=True)
        except Exception as exc:  # pragma: no cover - I/O errors
            print(f"Warning: could not read {fpath}: {exc}")
            return None

        parent = os.path.dirname(fpath)
        pn = getattr(ds, "PatientName", None)
        given = pn.given_name.strip() if pn and pn.given_name else ""
        pid = getattr(ds, "PatientID", "").strip()
        # Subject identity: given name, else patient ID, else a placeholder.
        subj = given or pid or "UNKNOWN"
        study = str(
            getattr(ds, "StudyDescription", None)
            or getattr(ds, "StudyName", None)
            or "n/a"
        ).strip()
        subj_key = f"{subj}||{study}"
        rel = os.path.relpath(parent, root_dir)
        # Files directly in the root are reported under the root's own name.
        folder = root_dir.name if rel == "." else rel
        series = getattr(ds, "SeriesDescription", "n/a").strip()
        uid = getattr(ds, "SeriesInstanceUID", "")
        img_list = normalize_image_type(getattr(ds, "ImageType", None))
        # Prefer the M/P classification; otherwise use the third component.
        img3 = classify_fieldmap_type(img_list)
        if not img3:
            img3 = img_list[2] if len(img_list) >= 3 else ""
        acq_time = str(getattr(ds, "AcquisitionTime", "")).strip()
        ses_match = SESSION_RE.search(series)
        sess_tag = f"ses-{ses_match.group(1)}" if ses_match else None
        demographics = dict(
            GivenName=given,
            FamilyName=getattr(pn, "family_name", "").strip(),
            PatientID=pid,
            PatientSex=getattr(ds, "PatientSex", "n/a").strip(),
            PatientAge=getattr(ds, "PatientAge", "n/a").strip(),
            StudyDescription=study,
        )
        return dict(
            subj_key=subj_key,
            folder=folder,
            series=series,
            uid=uid,
            modality=guess_modality(series),
            img3=img3,
            acq_time=acq_time,
            sess_tag=sess_tag,
            demo=demographics,
        )

    results = Parallel(n_jobs=n_jobs)(delayed(_read_one)(fp) for fp in file_list)
    for res in results:
        if not res:
            continue
        subj_key = res["subj_key"]
        folder = res["folder"]
        key = (res["series"], res["uid"])
        counts[subj_key][folder][key] += 1
        mods[subj_key][folder][key] = res["modality"]
        # Image type and acquisition time come from the first file seen.
        img_slot = imgtypes[subj_key][folder]
        if key not in img_slot:
            img_slot[key] = res["img3"]
        time_slot = acq_times[subj_key][folder]
        if key not in time_slot and res["acq_time"]:
            time_slot[key] = res["acq_time"]
        if res["sess_tag"]:
            sessset[subj_key][folder].add(res["sess_tag"])
        if subj_key not in demo:
            demo[subj_key] = res["demo"]

    print(f"Subjects found : {len(demo)}")
    total_series = sum(
        len(seq_dict)
        for per_folder in counts.values()
        for _folder, seq_dict in per_folder.items()
    )
    print(f"Unique Series instances : {total_series}")

    # PASS 2: assign BIDS subject numbers PER STUDY
    study_subjects = defaultdict(set)
    for subj_key in demo:
        subj, stud = subj_key.split("||", 1)
        study_subjects[stud].add(subj)

    bids_map = {}
    for study, subj_set in study_subjects.items():
        for number, sid in enumerate(sorted(subj_set), start=1):
            bids_map[f"{sid}||{study}"] = f"sub-{number:03d}"

    print("Assigned BIDS IDs:", bids_map)

    # PASS 3: build DataFrame rows
    rows = []
    for subj_key in sorted(counts):
        first_row = True
        person = demo[subj_key]
        for folder in sorted(counts[subj_key]):

            # A session label is used only when the folder has exactly one.
            ses_labels = sorted(sessset[subj_key][folder])
            session = ses_labels[0] if len(ses_labels) == 1 else ""

            rep_counter = defaultdict(int)
            for (series, uid), n_files in sorted(counts[subj_key][folder].items()):
                fine_mod = mods[subj_key][folder][(series, uid)]
                img3 = imgtypes[subj_key][folder].get((series, uid), "")
                skipped = fine_mod in {"scout", "report"} or "physlog" in series.lower()
                include = 0 if skipped else 1
                # Do not consider image type when counting scout duplicates
                rep_key = series if fine_mod == "scout" else (series, img3)
                rep_counter[rep_key] += 1
                seen = rep_counter[rep_key]
                rows.append({
                    "subject": person["GivenName"] if first_row else "",
                    "BIDS_name": bids_map[subj_key],
                    "session": session,
                    "source_folder": folder,
                    "include": include,
                    "sequence": series,
                    "series_uid": uid,
                    "rep": seen if seen > 1 else "",
                    "image_type": img3,
                    "acq_time": acq_times[subj_key][folder].get((series, uid), ""),
                    "modality": fine_mod,
                    "modality_bids": modality_to_container(fine_mod),
                    "n_files": n_files,
                    **person,  # demographics
                })
                first_row = False

    # Final column order
    columns = [
        "subject", "BIDS_name", "session", "source_folder",
        "include", "sequence", "series_uid", "rep", "acq_time",
        "image_type", "modality", "modality_bids", "n_files",
        "GivenName", "FamilyName", "PatientID",
        "PatientSex", "PatientAge", "StudyDescription",
    ]
    df = pd.DataFrame(rows, columns=columns)

    # Collapse magnitude/phase rows for fieldmaps
    fmap_mask = df.modality == "fmap"
    if fmap_mask.any():
        base_cols = ["BIDS_name", "session", "source_folder", "sequence"]
        fmap_df = df[fmap_mask].copy()
        # The first four characters of acq_time (HHMM) form the grouping key,
        # so series whose timestamps differ only in seconds are merged.
        fmap_df["acq_group"] = fmap_df["acq_time"].apply(lambda t: str(t)[:4])
        group_cols = base_cols + ["acq_group"]
        # All UIDs of a group are kept and joined with '|' so that both the
        # magnitude and the phase series remain addressable.
        fmap_df["uid_list"] = fmap_df["series_uid"]
        fmap_df["img_set"] = fmap_df["image_type"]
        fmap_df = (
            fmap_df.groupby(group_cols, as_index=False)
            .agg(
                {
                    "subject": "first",
                    "BIDS_name": "first",
                    "session": "first",
                    "source_folder": "first",
                    "include": "max",
                    "sequence": "first",
                    "uid_list": lambda x: "|".join(sorted(set(str(v) for v in x))),
                    "img_set": lambda x: "".join(sorted(set(str(v) for v in x))),
                    "acq_time": "first",
                    "modality": "first",
                    "modality_bids": "first",
                    "n_files": "sum",
                    "GivenName": "first",
                    "FamilyName": "first",
                    "PatientID": "first",
                    "PatientSex": "first",
                    "PatientAge": "first",
                    "StudyDescription": "first",
                }
            )
        )
        fmap_df.rename(columns={"uid_list": "series_uid", "img_set": "image_type"}, inplace=True)
        fmap_df.drop(columns=["acq_group"], inplace=True)
        fmap_df.sort_values(base_cols + ["acq_time"], inplace=True)
        # Number merged field maps 1, 2, …; blank when there is only one.
        fmap_df["rep"] = fmap_df.groupby(base_cols).cumcount() + 1
        repeat_mask = fmap_df.groupby(base_cols)["rep"].transform("count") > 1
        fmap_df.loc[~repeat_mask, "rep"] = ""

        df = pd.concat([df[~fmap_mask], fmap_df], ignore_index=True, sort=False)

    df.sort_values(["StudyDescription", "BIDS_name"], inplace=True)

    # optional TSV export
    if output_tsv:
        df.to_csv(output_tsv, sep="\t", index=False)
        print(f"Inventory written to: {output_tsv}")

    return df
|
|
434
|
+
|
|
435
|
+
|
|
436
|
+
# ----------------------------------------------------------------------
|
|
437
|
+
# Command-line test
|
|
438
|
+
# ----------------------------------------------------------------------
|
|
439
|
+
def main() -> None:
    """Run :func:`scan_dicoms_long` from the command line.

    Takes the positional arguments ``dicom_dir`` and ``output_tsv`` and the
    option ``--jobs``, then prints the first ten rows of the inventory.
    """

    import argparse

    # Use ~80% of available CPUs to provide a sensible default while
    # leaving some resources free for the rest of the system.
    default_jobs = max(1, round((os.cpu_count() or 1) * 0.8))

    cli = argparse.ArgumentParser(description="Generate TSV inventory for a DICOM folder")
    cli.add_argument("dicom_dir", help="Path to the directory containing DICOM files")
    cli.add_argument("output_tsv", help="Destination TSV file")
    cli.add_argument(
        "--jobs",
        type=int,
        default=default_jobs,
        help="Number of parallel workers to use",
    )
    opts = cli.parse_args()

    inventory = scan_dicoms_long(opts.dicom_dir, opts.output_tsv, n_jobs=opts.jobs)
    print("\nPreview (first 10 rows):\n")
    print(inventory.head(10).to_string(index=False))


if __name__ == "__main__":
    main()
|