bids-manager 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,11 @@
1
+ """BIDS Manager package."""
2
+
3
+ from importlib import metadata
4
+
5
__all__ = ["__version__"]

# Start from the placeholder and overwrite it when the installed
# distribution metadata can be found (i.e. the package is installed).
__version__ = "0.0.0"
try:  # pragma: no cover - version resolution
    __version__ = metadata.version("bids-manager")
except metadata.PackageNotFoundError:  # pragma: no cover
    pass
11
+
@@ -0,0 +1,214 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ build_heuristic_from_tsv.py — **v10**
4
+ ====================================
5
+ Simple heuristic that:
6
+ 1. **Keeps every sequence**, including SBRef.
7
+ 2. **Uses the raw SeriesDescription** (cleaned) as the filename stem – no
8
+ task or echo logic; `rep-N` is appended only when a sequence is repeated.
9
+ 3. Skips only modalities listed in `SKIP_BY_DEFAULT` (`report`,
10
+ `physio`, `refscan`).
11
+ """
12
+
13
+ from __future__ import annotations
14
+ from pathlib import Path
15
+ from textwrap import dedent
16
+ import pandas as pd
17
+ import re
18
+
19
+ # -----------------------------------------------------------------------------
20
+ # Configuration
21
+ # -----------------------------------------------------------------------------
22
# Fine-grained modality labels that are excluded unless the user opts in.
SKIP_BY_DEFAULT = {
    "report",
    "physio",
    "refscan",
}
23
+
24
+ # -----------------------------------------------------------------------------
25
+ # Helper functions
26
+ # -----------------------------------------------------------------------------
27
+
28
def clean(text: str) -> str:
    """Strip everything except ASCII letters and digits from *text*.

    The argument is passed through ``str`` first, so non-string values are
    accepted. The result is suitable as part of a Python identifier.
    """
    non_alnum = re.compile(r"[^0-9A-Za-z]+")
    return non_alnum.sub("", str(text))
31
+
32
+
33
def safe_stem(seq: str) -> str:
    """Return *seq* sanitised for use as a filename stem.

    Surrounding whitespace is removed, every run of characters other than
    ASCII letters, digits, ``_`` and ``-`` is replaced by a single ``_``, and
    leading/trailing underscores are trimmed.

    Parameters
    ----------
    seq : str
        Text to sanitise, typically a SeriesDescription. Non-string values
        are coerced with ``str`` (as :func:`clean` does) instead of raising
        ``AttributeError``; pandas yields ints/floats for all-numeric columns.

    Returns
    -------
    str
        The sanitised stem; may be empty when *seq* has no usable characters.
    """
    text = str(seq).strip()
    return re.sub(r"[^0-9A-Za-z_-]+", "_", text).strip("_")
36
+
37
+
38
def dedup_parts(*parts: str) -> str:
    """Join *parts* with underscores, dropping immediately repeated tokens.

    Each part is split on ``_``; empty tokens are ignored and a token equal
    to the one just kept is skipped. Non-adjacent repeats are preserved.
    """
    kept: list[str] = []
    for chunk in parts:
        for token in filter(None, str(chunk).split("_")):
            if kept and kept[-1] == token:
                continue
            kept.append(token)
    return "_".join(kept)
46
+
47
+
48
+ # -----------------------------------------------------------------------------
49
+ # Core writer
50
+ # -----------------------------------------------------------------------------
51
+
52
def write_heuristic(df: pd.DataFrame, dst: Path) -> None:
    """Write a HeuDiConv heuristic from ``df`` to ``dst``.

    The generated module contains a ``create_key`` helper, a ``SID_MAP``
    dictionary (cleaned source folder -> BIDS name), one ``key_*`` template
    per output file and an ``infotodict(seqinfo)`` function that assigns each
    series to its key by SeriesDescription, DICOM folder name and series UID.

    Parameters
    ----------
    df : pandas.DataFrame
        Table generated by :mod:`dicom_inventory` describing the DICOM series.
        The columns ``BIDS_name``, ``session``, ``sequence`` and
        ``source_folder`` are read directly; ``series_uid`` and
        ``modality_bids`` are optional (``series_uid`` may hold several UIDs
        joined with ``|``).
    dst : Path
        Destination ``heuristic_<name>.py`` file.
    """

    def _lit(value) -> str:
        """Return *value* as a Python string literal for the generated file."""
        # repr() escapes quotes, backslashes and control characters alike, so
        # arbitrary SeriesDescriptions cannot break the generated source.
        return repr(str(value))

    print("Building heuristic (v10)…")
    buf: list[str] = []

    # 1 ─ header -----------------------------------------------------------
    buf.append(
        dedent(
            '''\
            """AUTO-GENERATED HeuDiConv heuristic (v10)."""
            from typing import Tuple

            def create_key(template: str,
                           outtype: Tuple[str, ...] = ("nii.gz",),
                           annotation_classes=None):
                if not template:
                    raise ValueError("Template must be non-empty")
                return template, outtype, annotation_classes
            '''
        )
    )

    # 2 ─ SID_MAP ----------------------------------------------------------
    sid_pairs = {
        (
            clean(str(r.source_folder)) or clean(Path(r.source_folder or ".").name),
            r.BIDS_name,
        )
        for r in df.itertuples()
    }
    buf.append("\nSID_MAP = {\n")
    for folder, bids in sorted(sid_pairs):
        buf.append(f"    {_lit(folder)}: {_lit(bids)},\n")
    buf.append("}\n\n")

    # 3 ─ template keys ----------------------------------------------------
    # Include series UID (or rep) in the key to handle repeated sequences
    seq2key: dict[tuple[str, str, str, str, str], str] = {}
    key_defs: list[tuple[str, str]] = []
    # (wanted variable name, template) -> variable name actually emitted
    var_for: dict[tuple[str, str], str] = {}
    used_vars: set[str] = set()

    grouped = df.groupby(["BIDS_name", "session", "sequence"], dropna=False)
    rep_counts = grouped["sequence"].transform("count")
    rep_index = grouped.cumcount() + 1

    # Both helper series keep the row order of ``df``, so they are read by
    # position; label lookups would be ambiguous on a duplicated index.
    for pos, (_idx, row) in enumerate(df.iterrows()):
        ses_raw = row.get("session", "")
        ses = "" if pd.isna(ses_raw) else str(ses_raw).strip()
        folder = Path(str(row.get("source_folder", "."))).name
        uid_field = str(row.get("series_uid", ""))
        bids = row["BIDS_name"]
        container = row.get("modality_bids", "misc") or "misc"
        stem = safe_stem(str(row["sequence"]))

        name_parts = [bids, ses, stem]
        if rep_counts.iloc[pos] > 1:
            name_parts.append(f"rep-{rep_index.iloc[pos]}")
        base = dedup_parts(*name_parts)
        path = "/".join(p for p in [bids, ses, container] if p)
        template = f"{path}/{base}"

        wanted_var = "key_" + clean("_".join(p for p in name_parts if p))
        key_var = var_for.get((wanted_var, template))
        if key_var is None:
            # clean() drops '_' and '-', so two different templates can ask
            # for the same variable name. Give the later one a numeric suffix
            # instead of silently routing its series to the first template.
            key_var, suffix = wanted_var, 2
            while key_var in used_vars:
                key_var = f"{wanted_var}_{suffix}"
                suffix += 1
            used_vars.add(key_var)
            var_for[(wanted_var, template)] = key_var
            key_defs.append((key_var, template))

        uid_list = [u for u in uid_field.split("|") if u] or [""]
        for uid in uid_list:
            key_id = (row["sequence"], row["BIDS_name"], ses, folder, uid)
            # First row wins when the same series is listed twice.
            seq2key.setdefault(key_id, key_var)

    for var, tpl in key_defs:
        buf.append(f"{var} = create_key({_lit(tpl)})\n")
    buf.append("\n")

    # 4 ─ infotodict() ----------------------------------------------------
    # Several UIDs may share one key; emit each list only once.
    used_keys = list(dict.fromkeys(seq2key.values()))

    buf.append(
        "def infotodict(seqinfo):\n"
        "    \"\"\"Return mapping SeriesDescription → key list.\"\"\"\n"
    )
    for var in used_keys:
        buf.append(f"    {var}_list = []\n")
    buf.append("    info = {\n")
    for var in used_keys:
        buf.append(f"        {var}: {var}_list,\n")
    buf.append("    }\n\n")

    if seq2key:  # a ``for`` statement without a body would not compile
        buf.append("    for s in seqinfo:\n")
        for (seq, _b, _s, folder, uid), var in seq2key.items():
            buf.append(
                f"        if s.series_description == {_lit(seq)}"
                f" and s.dcm_dir_name == {_lit(folder)}"
                f" and getattr(s, 'series_uid', '') == {_lit(uid)}:\n"
            )
            buf.append(f"            {var}_list.append(s.series_id)\n")
    buf.append("    return info\n")

    dst.write_text("".join(buf), encoding="utf-8")
    print("Heuristic written →", dst.resolve())
161
+
162
+
163
+ # -----------------------------------------------------------------------------
164
+ # Driver
165
+ # -----------------------------------------------------------------------------
166
+
167
def generate(tsv: Path, out_dir: Path) -> None:
    """Generate heuristic files for each study described in ``tsv``.

    Rows whose ``modality`` is listed in ``SKIP_BY_DEFAULT`` are forced to
    ``include = 0``. Only rows with ``include == 1`` are kept, and one
    ``heuristic_<study>.py`` file is written per ``StudyDescription``. A
    matching ``heudiconv`` command line is printed for each file.

    Parameters
    ----------
    tsv : Path
        Path to ``subject_summary.tsv`` produced by :mod:`dicom_inventory`.
    out_dir : Path
        Directory where the heuristic files will be written (created if
        missing).
    """

    df = pd.read_csv(tsv, sep="\t", keep_default_na=False)

    # Drop rows with unwanted modalities
    mask = df.modality.isin(SKIP_BY_DEFAULT)
    if mask.any():
        df.loc[mask, "include"] = 0
        # sorted() keeps the message stable; set order is arbitrary
        print(f"Auto‑skipped {mask.sum()} rows ({', '.join(sorted(SKIP_BY_DEFAULT))})")

    # With keep_default_na=False a single blank cell turns the whole column
    # into strings, and "1" == 1 is False for every row. Compare numerically
    # so hand-edited tables still work; unparsable cells count as excluded.
    include_flag = pd.to_numeric(df.include, errors="coerce")
    df = df[include_flag == 1]

    out_dir.mkdir(parents=True, exist_ok=True)

    for study, sub_df in df.groupby("StudyDescription"):
        # Fall back again after sanitising: a name made only of punctuation
        # would otherwise give "heuristic_.py".
        fname = safe_stem(str(study or "unknown")) or "unknown"
        heur = out_dir / f"heuristic_{fname}.py"
        write_heuristic(sub_df, heur)
        folders = " ".join(
            sorted(
                {
                    clean(f) or clean(Path(f or ".").name)
                    for f in sub_df.source_folder.unique()
                }
            )
        )
        print(
            f'\nheudiconv -d "<RAW_ROOT>/{{subject}}/**/*.*" -s {folders} '
            f"-f {heur.name} -c dcm2niix -o <BIDS_OUT>/{fname} "
            "-b --minmeta --overwrite"
        )
197
+
198
+
199
def main() -> None:
    """Entry point for the ``build-heuristic`` command line utility."""

    import argparse

    cli = argparse.ArgumentParser(description="Generate HeuDiConv heuristic(s) from TSV")
    positional = (
        ("tsv", "Path to subject_summary.tsv file"),
        ("out_dir", "Directory to write heuristic files"),
    )
    for arg_name, arg_help in positional:
        cli.add_argument(arg_name, help=arg_help)
    opts = cli.parse_args()

    generate(Path(opts.tsv), Path(opts.out_dir))


# Run the command line interface when executed as a script.
if __name__ == "__main__":
    main()
214
+
@@ -0,0 +1,463 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ dicom_inventory.py — fully-commented, no-emoji version
4
+ ------------------------------------------------------
5
+
6
+ Creates a long-format TSV describing every DICOM series in *root_dir*.
7
+
8
+ Why you want this
9
+ -----------------
10
+ * Lets you review **all** SeriesDescriptions, subjects, sessions and file counts
11
+ before converting anything.
12
+ * Column `include` defaults to 1 except for scout/report/physlog sequences,
13
+ which start at 0 so they are skipped by default.
14
+ * Generated table is the single source of truth you feed into a helper script
15
+ that writes the HeuDiConv heuristic.
16
+
17
+ Output columns (ordered as they appear)
18
+ ---------------------------------------
19
+ subject – GivenName shown only on the first row of each subject block
20
+ BIDS_name – auto-assigned `sub-001`, `sub-002`, … (same GivenName → same ID)
21
+ session – `ses-<label>` if exactly one unique session tag is present in
22
+ that folder, otherwise blank
23
+ source_folder – relative path from the DICOM root to the folder containing the
24
+ series
25
+ include – defaults to 1 but scout/report/physlog rows start at 0
26
+ sequence – original SeriesDescription
27
+ series_uid – DICOM SeriesInstanceUID identifying a specific acquisition
28
+ rep – blank for the first occurrence, then 2, 3, … when several SeriesInstanceUIDs share the same description (repeated field maps are numbered from 1)
29
+ acq_time – AcquisitionTime of the first file in that series
30
+ modality – fine label inferred from patterns (T1w, bold, dwi, …)
31
+ modality_bids – top-level container (anat, func, dwi, fmap) derived from
32
+ *modality*
33
+ n_files – number of DICOM files (.dcm or .ima) in that series (same SeriesDescription and SeriesInstanceUID)
34
+ GivenName … StudyDescription – demographics copied from the first header seen
35
+ """
36
+
37
+ import os
38
+ import re
39
+ from collections import defaultdict
40
+ from typing import Optional
41
+ from pathlib import Path
42
+ from joblib import Parallel, delayed
43
+
44
+ import pandas as pd
45
+ import pydicom
46
+ from pydicom.multival import MultiValue
47
+
48
# Persistent user preferences are stored in a folder next to this module
PREF_DIR = Path(__file__).resolve().with_name("user_preferences")
SEQ_DICT_FILE = PREF_DIR.joinpath("sequence_dictionary.tsv")

# File extensions treated as DICOM (compared against lower-cased names)
DICOM_EXTS = (".dcm", ".ima")
54
+
55
+
56
+ # ----------------------------------------------------------------------
57
+ # 1. Patterns: SeriesDescription → fine-grained modality label
58
+ # (order matters: first match wins)
59
+ # ----------------------------------------------------------------------
60
# Lower-case substrings searched for in the SeriesDescription. Entries are
# tried in insertion order and the first label with a hit wins, so the order
# of the keys below is significant.
BIDS_PATTERNS = {
    # --- anatomy ---
    "T1w": ("t1w", "t1-weight", "t1_", "t1 ", "mprage", "tfl3d", "fspgr"),
    "T2w": ("t2w", "space", "tse"),
    "FLAIR": ("flair",),
    "MTw": ("gre-mt", "gre_mt", "mt"),
    "PDw": ("gre-nm", "gre_nm"),
    "scout": ("localizer", "scout"),
    "report": ("phoenixzipreport", "phoenix document", ".pdf", "report"),
    "refscan": ("type-ref", "reference", "refscan"),
    # --- functional ---
    "bold": ("fmri", "bold", "task-"),
    "SBRef": ("sbref",),
    # --- diffusion ---
    "dwi": ("dti", "dwi", "diff"),
    # --- field maps ---
    "fmap": (
        "gre_field", "fieldmapping", "_fmap", "fmap", "phase",
        "magnitude", "b0rf", "b0_map", "b0map",
    ),
    # --- misc (kept for completeness) ---
    "physio": ("physiolog", "physio", "pulse", "resp"),
}

# Untouched copy of the defaults so they can be restored after user edits
DEFAULT_BIDS_PATTERNS = {
    label: tuple(substrings) for label, substrings in BIDS_PATTERNS.items()
}
101
+
102
+
103
def load_sequence_dictionary() -> None:
    """Load user-modified sequence patterns from :data:`SEQ_DICT_FILE`.

    The file is a TSV with ``modality`` and ``pattern`` columns. Patterns are
    lower-cased, and rows with an empty modality or pattern are ignored. When
    at least one valid row is found, the user table *replaces*
    :data:`BIDS_PATTERNS` entirely; it is not merged with the defaults.

    Loading is best-effort. A missing file is a no-op, and an unreadable file
    leaves the current patterns untouched but prints a warning so the user
    knows their customisation was not applied.
    """
    global BIDS_PATTERNS
    if not SEQ_DICT_FILE.exists():
        return
    try:
        df = pd.read_csv(SEQ_DICT_FILE, sep="\t", keep_default_na=False)
    except Exception as exc:  # never let a bad preference file break import
        print(f"Warning: could not read {SEQ_DICT_FILE}: {exc}")
        return
    patterns: defaultdict[str, list[str]] = defaultdict(list)
    for _, row in df.iterrows():
        mod = str(row.get("modality", "")).strip()
        pat = str(row.get("pattern", "")).strip().lower()
        if mod and pat:
            patterns[mod].append(pat)
    if patterns:
        BIDS_PATTERNS = {m: tuple(pats) for m, pats in patterns.items()}
120
+
121
+
122
def restore_sequence_dictionary() -> None:
    """Revert :data:`BIDS_PATTERNS` to the bundled defaults.

    The in-memory table is rebuilt from :data:`DEFAULT_BIDS_PATTERNS` and the
    user file :data:`SEQ_DICT_FILE` is deleted. A missing file is fine. Any
    other failure to delete is reported rather than raised, because a file
    left behind would be loaded again by :func:`load_sequence_dictionary`.
    """
    global BIDS_PATTERNS
    BIDS_PATTERNS = {m: tuple(pats) for m, pats in DEFAULT_BIDS_PATTERNS.items()}
    try:
        SEQ_DICT_FILE.unlink()
    except FileNotFoundError:
        pass  # nothing was saved, so there is nothing to remove
    except OSError as exc:
        print(f"Warning: could not remove {SEQ_DICT_FILE}: {exc}")
130
+
131
+
132
+ load_sequence_dictionary()
133
+
134
def guess_modality(series: str) -> str:
    """Map a SeriesDescription to a fine-grained modality label.

    The description is lower-cased and checked against each entry of
    ``BIDS_PATTERNS`` in insertion order. The label of the first entry with
    a substring hit is returned, or ``'unknown'`` when nothing matches.
    """
    lowered = series.lower()
    hits = (
        label
        for label, needles in BIDS_PATTERNS.items()
        if any(needle in lowered for needle in needles)
    )
    return next(hits, "unknown")
141
+
142
+
143
# Exact ImageType component lists that are compared against when
# classifying field-map series as magnitude or phase.
MAGNITUDE_IMGTYPE = [
    "ORIGINAL",
    "PRIMARY",
    "M",
    "ND",
    "NORM",
]
PHASE_IMGTYPE = [
    "ORIGINAL",
    "PRIMARY",
    "P",
    "ND",
]
145
+
146
def normalize_image_type(value) -> list:
    """Return ImageType components as a list of strings.

    Accepted inputs:

    * ``None`` -> ``[]``
    * ``list`` / ``tuple`` / pydicom ``MultiValue`` -> each item as a
      stripped string
    * backslash-separated text (``"ORIGINAL\\PRIMARY\\M"``) -> split on the
      backslash
    * a stringified list (``"['ORIGINAL', 'PRIMARY']"``) -> split on commas
      with surrounding single or double quotes removed; ``"[]"`` gives ``[]``
    * any other non-empty text -> a one-element list

    Component positions are preserved (empty components inside a list are
    kept), because callers index into the result.
    """
    if value is None:
        return []
    if not isinstance(value, str):
        # Plain sequences are checked before MultiValue so ordinary
        # lists/tuples never depend on the pydicom type.
        if isinstance(value, (list, tuple)) or isinstance(value, MultiValue):
            return [str(x).strip() for x in value]
        value = str(value)
    text = value
    if "\\" in text:
        return [p.strip() for p in text.split("\\")]
    text = text.strip()
    if text.startswith("[") and text.endswith("]"):
        inner = text[1:-1]
        if not inner.strip():
            # "[]" is an empty list, not a list holding one empty string
            return []
        return [p.strip().strip("'\"") for p in inner.split(",")]
    return [text] if text else []
160
+
161
+
162
def classify_fieldmap_type(img_list: list) -> str:
    """Classify an ImageType component list as magnitude or phase.

    Returns ``'M'`` when *img_list* equals ``MAGNITUDE_IMGTYPE`` exactly,
    ``'P'`` when it equals ``PHASE_IMGTYPE`` exactly, and ``''`` otherwise.
    """
    for code, reference in (("M", MAGNITUDE_IMGTYPE), ("P", PHASE_IMGTYPE)):
        if img_list == reference:
            return code
    return ""
169
+
170
+
171
+ # ----------------------------------------------------------------------
172
+ # 2. Map fine label → top-level BIDS container (anat, func, …)
173
+ # ----------------------------------------------------------------------
174
BIDS_CONTAINER = {
    # anatomical (scout/report/refscan are filed under anat as well)
    **dict.fromkeys(
        ("T1w", "T2w", "FLAIR", "MTw", "PDw", "scout", "report", "refscan"),
        "anat",
    ),
    # functional
    **dict.fromkeys(("bold", "SBRef"), "func"),
    # diffusion and field maps use their own name as the container
    "dwi": "dwi",
    "fmap": "fmap",
}


def modality_to_container(mod: str) -> str:
    """Return the top-level BIDS folder for a fine-grained modality label.

    For example ``T1w`` gives ``anat`` and ``bold`` gives ``func``. Labels
    without an entry in ``BIDS_CONTAINER`` give an empty string.
    """
    if mod in BIDS_CONTAINER:
        return BIDS_CONTAINER[mod]
    return ""
185
+
186
+ # session detector (e.g. ses-pre, ses-01) -- case-insensitive
187
SESSION_RE = re.compile(
    r"ses-([a-zA-Z0-9]+)",  # group 1 captures the session label
    flags=re.IGNORECASE,
)
188
+
189
+
190
+ # ----------------------------------------------------------------------
191
+ # 3. Main scanner
192
+ # ----------------------------------------------------------------------
193
def scan_dicoms_long(
    root_dir: str,
    output_tsv: Optional[str] = None,
    n_jobs: int = 1,
) -> pd.DataFrame:
    """
    Walk *root_dir*, read DICOM headers, return long-format DataFrame.

    Every file whose name ends with one of ``DICOM_EXTS`` is read (header
    only). Files are grouped by subject/study, containing folder and
    (SeriesDescription, SeriesInstanceUID). Subjects are numbered
    ``sub-001``, ``sub-002``, … separately within each StudyDescription.
    Field-map rows sharing subject, session, folder, description and
    acquisition minute are collapsed into one row whose ``series_uid`` holds
    all UIDs joined with ``|``.

    Parameters
    ----------
    root_dir : str
        Path with raw DICOMs organised in sub-folders.
    output_tsv : str | None
        If provided, write the TSV to that path.
    n_jobs : int
        Number of parallel workers to use when reading DICOM files.

    Returns
    -------
    pandas.DataFrame
        Inventory as described in module docstring.
    """

    root_dir = Path(root_dir)
    print(f"Scanning DICOM headers under: {root_dir}")

    # in-memory stores, all keyed subj_key -> folder -> (series, uid)
    demo = {}
    counts = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))
    mods = defaultdict(lambda: defaultdict(dict))
    acq_times = defaultdict(lambda: defaultdict(dict))
    imgtypes = defaultdict(lambda: defaultdict(dict))
    sessset = defaultdict(lambda: defaultdict(set))

    # PASS 1: Walk filesystem and collect info in parallel
    file_list = []
    for root, _dirs, files in os.walk(root_dir):
        for fname in files:
            if fname.lower().endswith(DICOM_EXTS):
                file_list.append(os.path.join(root, fname))

    def _read_one(fpath: str):
        """Read one header and return its fields, or None if unreadable."""
        try:
            ds = pydicom.dcmread(fpath, stop_before_pixels=True, force=True)
        except Exception as exc:  # pragma: no cover - I/O errors
            print(f"Warning: could not read {fpath}: {exc}")
            return None

        root = os.path.dirname(fpath)
        pn = getattr(ds, "PatientName", None)
        given = pn.given_name.strip() if pn and pn.given_name else ""
        pid = getattr(ds, "PatientID", "").strip()
        # Subject identity: given name, else patient ID, else a placeholder
        subj = given or pid or "UNKNOWN"
        study = (
            getattr(ds, "StudyDescription", None)
            or getattr(ds, "StudyName", None)
            or "n/a"
        )
        study = str(study).strip()
        subj_key = f"{subj}||{study}"
        rel = os.path.relpath(root, root_dir)
        folder = root_dir.name if rel == "." else rel
        series = getattr(ds, "SeriesDescription", "n/a").strip()
        uid = getattr(ds, "SeriesInstanceUID", "")
        raw_img_type = getattr(ds, "ImageType", None)
        img_list = normalize_image_type(raw_img_type)
        # Prefer the M/P classification; otherwise use the third component
        img3 = classify_fieldmap_type(img_list)
        if not img3:
            img3 = img_list[2] if len(img_list) >= 3 else ""
        acq_time = str(getattr(ds, "AcquisitionTime", "")).strip()
        m = SESSION_RE.search(series)
        sess_tag = f"ses-{m.group(1)}" if m else None
        demo_dict = dict(
            GivenName=given,
            FamilyName=getattr(pn, "family_name", "").strip(),
            PatientID=pid,
            PatientSex=getattr(ds, "PatientSex", "n/a").strip(),
            PatientAge=getattr(ds, "PatientAge", "n/a").strip(),
            StudyDescription=study,
        )
        return dict(
            subj_key=subj_key,
            folder=folder,
            series=series,
            uid=uid,
            modality=guess_modality(series),
            img3=img3,
            acq_time=acq_time,
            sess_tag=sess_tag,
            demo=demo_dict,
        )

    results = Parallel(n_jobs=n_jobs)(delayed(_read_one)(fp) for fp in file_list)
    for res in results:
        if not res:
            continue
        subj_key = res["subj_key"]
        folder = res["folder"]
        series = res["series"]
        uid = res["uid"]
        key = (series, uid)
        counts[subj_key][folder][key] += 1
        mods[subj_key][folder][key] = res["modality"]
        # image type and acquisition time come from the first file seen
        if key not in imgtypes[subj_key][folder]:
            imgtypes[subj_key][folder][key] = res["img3"]
        if key not in acq_times[subj_key][folder] and res["acq_time"]:
            acq_times[subj_key][folder][key] = res["acq_time"]
        if res["sess_tag"]:
            sessset[subj_key][folder].add(res["sess_tag"])
        if subj_key not in demo:
            demo[subj_key] = res["demo"]

    print(f"Subjects found : {len(demo)}")
    total_series = sum(
        len(seq_dict)
        for subj in counts.values()
        for seq_dict in subj.values()
    )
    print(f"Unique Series instances : {total_series}")

    # PASS 2: assign BIDS subject numbers PER STUDY
    study_subjects = defaultdict(set)
    for subj_key in demo:
        subj, stud = subj_key.split("||", 1)
        study_subjects[stud].add(subj)

    bids_map = {}
    for study, subj_set in study_subjects.items():
        for i, sid in enumerate(sorted(subj_set)):
            bids_map[f"{sid}||{study}"] = f"sub-{i+1:03d}"

    print("Assigned BIDS IDs:", bids_map)

    # PASS 3: build DataFrame rows
    rows = []
    for subj_key in sorted(counts):
        first_row = True
        for folder in sorted(counts[subj_key]):

            # decide session label for this folder: only an unambiguous
            # (single) tag is used
            ses_labels = sorted(sessset[subj_key][folder])
            session = ses_labels[0] if len(ses_labels) == 1 else ""

            rep_counter = defaultdict(int)
            for (series, uid), n_files in sorted(counts[subj_key][folder].items()):
                fine_mod = mods[subj_key][folder][(series, uid)]
                img3 = imgtypes[subj_key][folder].get((series, uid), "")
                include = 1
                if fine_mod in {"scout", "report"} or "physlog" in series.lower():
                    include = 0
                # Do not consider image type when counting scout duplicates
                rep_key = series if fine_mod == "scout" else (series, img3)
                rep_counter[rep_key] += 1
                rows.append({
                    "subject": demo[subj_key]["GivenName"] if first_row else "",
                    "BIDS_name": bids_map[subj_key],
                    "session": session,
                    "source_folder": folder,
                    "include": include,
                    "sequence": series,
                    "series_uid": uid,
                    "rep": rep_counter[rep_key] if rep_counter[rep_key] > 1 else "",
                    "image_type": img3,
                    "acq_time": acq_times[subj_key][folder].get((series, uid), ""),
                    "modality": fine_mod,
                    "modality_bids": modality_to_container(fine_mod),
                    "n_files": n_files,
                    **demo[subj_key],  # demographics
                })
                first_row = False

    # Final column order
    columns = [
        "subject", "BIDS_name", "session", "source_folder",
        "include", "sequence", "series_uid", "rep", "acq_time",
        "image_type", "modality", "modality_bids", "n_files",
        "GivenName", "FamilyName", "PatientID",
        "PatientSex", "PatientAge", "StudyDescription",
    ]
    df = pd.DataFrame(rows, columns=columns)

    # Collapse magnitude/phase rows for fieldmaps
    fmap_mask = df.modality == "fmap"
    if fmap_mask.any():
        base_cols = [
            "BIDS_name",
            "session",
            "source_folder",
            "sequence",
        ]
        # Group on the acquisition time truncated to the minute (its first
        # four characters, HHMM) so magnitude and phase series from the same
        # fieldmap merge even if their timestamps differ by a few seconds.
        # Series that straddle a minute boundary are not merged.
        fmap_df = df[fmap_mask].copy()
        fmap_df["acq_group"] = fmap_df["acq_time"].apply(lambda t: str(t)[:4])
        group_cols = base_cols + ["acq_group"]
        fmap_df["uid_list"] = fmap_df["series_uid"]
        # keep all UIDs within each group so both magnitude and phase series
        # are converted; they will be joined with '|' below
        fmap_df["img_set"] = fmap_df["image_type"]
        fmap_df = (
            fmap_df.groupby(group_cols, as_index=False)
            .agg(
                {
                    "subject": "first",
                    "BIDS_name": "first",
                    "session": "first",
                    "source_folder": "first",
                    "include": "max",
                    "sequence": "first",
                    "uid_list": lambda x: "|".join(sorted(set(str(v) for v in x))),
                    "img_set": lambda x: "".join(sorted(set(str(v) for v in x))),
                    "acq_time": "first",
                    "modality": "first",
                    "modality_bids": "first",
                    "n_files": "sum",
                    "GivenName": "first",
                    "FamilyName": "first",
                    "PatientID": "first",
                    "PatientSex": "first",
                    "PatientAge": "first",
                    "StudyDescription": "first",
                }
            )
        )
        fmap_df.rename(columns={"uid_list": "series_uid", "img_set": "image_type"}, inplace=True)
        fmap_df.drop(columns=["acq_group"], inplace=True)
        sort_cols = base_cols + ["acq_time"]
        fmap_df.sort_values(sort_cols, inplace=True)
        rep_numbers = fmap_df.groupby(base_cols).cumcount() + 1
        fmap_df["rep"] = rep_numbers
        repeat_mask = fmap_df.groupby(base_cols)["rep"].transform("count") > 1
        # Blank out the counter for non-repeated fieldmaps. Convert to object
        # dtype first: writing "" into an integer column is an
        # incompatible-dtype assignment, which recent pandas deprecates.
        fmap_df["rep"] = rep_numbers.astype(object).where(repeat_mask, "")

        df = pd.concat([df[~fmap_mask], fmap_df], ignore_index=True, sort=False)

    df.sort_values(["StudyDescription", "BIDS_name"], inplace=True)

    # optional TSV export
    if output_tsv:
        df.to_csv(output_tsv, sep="\t", index=False)
        print(f"Inventory written to: {output_tsv}")

    return df
434
+
435
+
436
+ # ----------------------------------------------------------------------
437
+ # Command-line test
438
+ # ----------------------------------------------------------------------
439
def main() -> None:
    """Command line interface for :func:`scan_dicoms_long`."""

    import argparse

    # Use ~80% of available CPUs to provide a sensible default while
    # leaving some resources free for the rest of the system.
    default_jobs = max(1, round((os.cpu_count() or 1) * 0.8))

    cli = argparse.ArgumentParser(description="Generate TSV inventory for a DICOM folder")
    cli.add_argument("dicom_dir", help="Path to the directory containing DICOM files")
    cli.add_argument("output_tsv", help="Destination TSV file")
    cli.add_argument(
        "--jobs",
        type=int,
        default=default_jobs,
        help="Number of parallel workers to use",
    )
    opts = cli.parse_args()

    inventory = scan_dicoms_long(opts.dicom_dir, opts.output_tsv, n_jobs=opts.jobs)
    print("\nPreview (first 10 rows):\n")
    print(inventory.head(10).to_string(index=False))


# Run the command line interface when executed as a script.
if __name__ == "__main__":
    main()