smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,8 +1,20 @@
1
+ from __future__ import annotations
2
+
3
+ import glob
4
+ import os
5
+ from concurrent.futures import ProcessPoolExecutor, as_completed
1
6
  from pathlib import Path
2
- import pandas as pd
7
+ from typing import Dict, List, Optional, Union
8
+
3
9
  import numpy as np
10
+ import pandas as pd
4
11
  import scipy.sparse as sp
5
- from typing import Optional, List, Dict, Union
12
+
13
+ from smftools.logging_utils import get_logger
14
+ from smftools.optional_imports import require
15
+
16
+ logger = get_logger(__name__)
17
+
6
18
 
7
19
  def add_demux_type_annotation(
8
20
  adata,
@@ -71,14 +83,16 @@ def add_demux_type_annotation(
71
83
 
72
84
  return adata
73
85
 
86
+
74
87
  def add_read_length_and_mapping_qc(
75
88
  adata,
76
89
  bam_files: Optional[List[str]] = None,
77
90
  read_metrics: Optional[Dict[str, Union[list, tuple]]] = None,
78
91
  uns_flag: str = "add_read_length_and_mapping_qc_performed",
79
- extract_read_features_from_bam_callable = None,
92
+ extract_read_features_from_bam_callable=None,
80
93
  bypass: bool = False,
81
- force_redo: bool = True
94
+ force_redo: bool = True,
95
+ samtools_backend: str | None = "auto",
82
96
  ):
83
97
  """
84
98
  Populate adata.obs with read/mapping QC columns.
@@ -98,6 +112,7 @@ def add_read_length_and_mapping_qc(
98
112
  Optional callable(bam_path) -> dict mapping read_name -> list/tuple of metrics.
99
113
  If not provided and bam_files is given, function will attempt to call `extract_read_features_from_bam`
100
114
  from the global namespace (your existing helper).
115
+
101
116
  Returns
102
117
  -------
103
118
  None (mutates final_adata in-place)
@@ -113,11 +128,15 @@ def add_read_length_and_mapping_qc(
113
128
  if read_metrics is None:
114
129
  read_metrics = {}
115
130
  if bam_files:
116
- extractor = extract_read_features_from_bam_callable or globals().get("extract_read_features_from_bam")
131
+ extractor = extract_read_features_from_bam_callable or globals().get(
132
+ "extract_read_features_from_bam"
133
+ )
117
134
  if extractor is None:
118
- raise ValueError("No `read_metrics` provided and `extract_read_features_from_bam` not found.")
135
+ raise ValueError(
136
+ "No `read_metrics` provided and `extract_read_features_from_bam` not found."
137
+ )
119
138
  for bam in bam_files:
120
- bam_read_metrics = extractor(bam)
139
+ bam_read_metrics = extractor(bam, samtools_backend)
121
140
  if not isinstance(bam_read_metrics, dict):
122
141
  raise ValueError(f"extract_read_features_from_bam returned non-dict for {bam}")
123
142
  read_metrics.update(bam_read_metrics)
@@ -130,11 +149,11 @@ def add_read_length_and_mapping_qc(
130
149
  if len(read_metrics) == 0:
131
150
  # fill with NaNs
132
151
  n = adata.n_obs
133
- adata.obs['read_length'] = np.full(n, np.nan)
134
- adata.obs['mapped_length'] = np.full(n, np.nan)
135
- adata.obs['reference_length'] = np.full(n, np.nan)
136
- adata.obs['read_quality'] = np.full(n, np.nan)
137
- adata.obs['mapping_quality'] = np.full(n, np.nan)
152
+ adata.obs["read_length"] = np.full(n, np.nan)
153
+ adata.obs["mapped_length"] = np.full(n, np.nan)
154
+ adata.obs["reference_length"] = np.full(n, np.nan)
155
+ adata.obs["read_quality"] = np.full(n, np.nan)
156
+ adata.obs["mapping_quality"] = np.full(n, np.nan)
138
157
  else:
139
158
  # Build DF robustly
140
159
  # Convert values to lists where possible, else to [val, val, val...]
@@ -151,35 +170,45 @@ def add_read_length_and_mapping_qc(
151
170
  vals = vals + [np.nan] * (max_cols - len(vals))
152
171
  rows[k] = vals[:max_cols]
153
172
 
154
- df = pd.DataFrame.from_dict(rows, orient='index', columns=[
155
- 'read_length', 'read_quality', 'reference_length', 'mapped_length', 'mapping_quality'
156
- ])
173
+ df = pd.DataFrame.from_dict(
174
+ rows,
175
+ orient="index",
176
+ columns=[
177
+ "read_length",
178
+ "read_quality",
179
+ "reference_length",
180
+ "mapped_length",
181
+ "mapping_quality",
182
+ ],
183
+ )
157
184
 
158
185
  # Reindex to final_adata.obs_names so order matches adata
159
186
  # If obs_names are not present as keys in df, the results will be NaN
160
187
  df_reindexed = df.reindex(adata.obs_names).astype(float)
161
188
 
162
- adata.obs['read_length'] = df_reindexed['read_length'].values
163
- adata.obs['mapped_length'] = df_reindexed['mapped_length'].values
164
- adata.obs['reference_length'] = df_reindexed['reference_length'].values
165
- adata.obs['read_quality'] = df_reindexed['read_quality'].values
166
- adata.obs['mapping_quality'] = df_reindexed['mapping_quality'].values
189
+ adata.obs["read_length"] = df_reindexed["read_length"].values
190
+ adata.obs["mapped_length"] = df_reindexed["mapped_length"].values
191
+ adata.obs["reference_length"] = df_reindexed["reference_length"].values
192
+ adata.obs["read_quality"] = df_reindexed["read_quality"].values
193
+ adata.obs["mapping_quality"] = df_reindexed["mapping_quality"].values
167
194
 
168
195
  # Compute ratio columns safely (avoid divide-by-zero and preserve NaN)
169
196
  # read_length_to_reference_length_ratio
170
- rl = pd.to_numeric(adata.obs['read_length'], errors='coerce').to_numpy(dtype=float)
171
- ref_len = pd.to_numeric(adata.obs['reference_length'], errors='coerce').to_numpy(dtype=float)
172
- mapped_len = pd.to_numeric(adata.obs['mapped_length'], errors='coerce').to_numpy(dtype=float)
197
+ rl = pd.to_numeric(adata.obs["read_length"], errors="coerce").to_numpy(dtype=float)
198
+ ref_len = pd.to_numeric(adata.obs["reference_length"], errors="coerce").to_numpy(dtype=float)
199
+ mapped_len = pd.to_numeric(adata.obs["mapped_length"], errors="coerce").to_numpy(dtype=float)
173
200
 
174
201
  # safe divisions: use np.where to avoid warnings and replace inf with nan
175
- with np.errstate(divide='ignore', invalid='ignore'):
202
+ with np.errstate(divide="ignore", invalid="ignore"):
176
203
  rl_to_ref = np.where((ref_len != 0) & np.isfinite(ref_len), rl / ref_len, np.nan)
177
- mapped_to_ref = np.where((ref_len != 0) & np.isfinite(ref_len), mapped_len / ref_len, np.nan)
204
+ mapped_to_ref = np.where(
205
+ (ref_len != 0) & np.isfinite(ref_len), mapped_len / ref_len, np.nan
206
+ )
178
207
  mapped_to_read = np.where((rl != 0) & np.isfinite(rl), mapped_len / rl, np.nan)
179
208
 
180
- adata.obs['read_length_to_reference_length_ratio'] = rl_to_ref
181
- adata.obs['mapped_length_to_reference_length_ratio'] = mapped_to_ref
182
- adata.obs['mapped_length_to_read_length_ratio'] = mapped_to_read
209
+ adata.obs["read_length_to_reference_length_ratio"] = rl_to_ref
210
+ adata.obs["mapped_length_to_reference_length_ratio"] = mapped_to_ref
211
+ adata.obs["mapped_length_to_read_length_ratio"] = mapped_to_read
183
212
 
184
213
  # Add read level raw modification signal: sum over X rows
185
214
  X = adata.X
@@ -189,9 +218,152 @@ def add_read_length_and_mapping_qc(
189
218
  else:
190
219
  raw_sig = np.asarray(X.sum(axis=1)).ravel()
191
220
 
192
- adata.obs['Raw_modification_signal'] = raw_sig
221
+ adata.obs["Raw_modification_signal"] = raw_sig
193
222
 
194
223
  # mark as done
195
224
  adata.uns[uns_flag] = True
196
225
 
197
- return None
226
+ return None
227
+
228
+
229
def _collect_read_origins_from_pod5(pod5_path: str, target_ids: set[str]) -> dict[str, str]:
    """
    Scan a single POD5 file for the reads named in `target_ids`.

    Written as a standalone worker so it can be called directly or submitted
    to a process pool.

    Parameters
    ----------
    pod5_path
        Path to the POD5 file to scan.
    target_ids
        Read IDs (as strings) to look for.

    Returns
    -------
    dict
        ``{read_id: pod5_basename}`` for every read in the file whose ID is
        in `target_ids`. Reads not in `target_ids` are ignored.
    """
    # `require` is the project's optional-import helper; presumably it returns
    # the imported `pod5` module or raises a descriptive error -- confirm in
    # smftools.optional_imports.
    pod5 = require("pod5", extra="ont", purpose="POD5 metadata")
    file_name = os.path.basename(pod5_path)

    with pod5.Reader(pod5_path) as reader:
        # Read IDs are cast to str before the membership test.
        read_ids = (str(record.read_id) for record in reader.reads())
        return {read_id: file_name for read_id in read_ids if read_id in target_ids}
248
+
249
+
250
def annotate_pod5_origin(
    adata,
    pod5_path_or_dir: str | Path,
    pattern: str = "*.pod5",
    n_jobs: int | None = None,
    fill_value: str | None = "unknown",
    verbose: bool = True,
    csv_path: str | Path | None = None,
) -> dict[str, str]:
    """
    Add a `pod5_origin` column to `adata.obs`, containing the POD5 basename
    each read came from.

    Parameters
    ----------
    adata
        AnnData with obs_names == read_ids (as strings).
    pod5_path_or_dir
        Directory containing POD5 files or path to a single POD5 file.
    pattern
        Glob pattern for POD5 files inside `pod5_path_or_dir`. Only used when
        `pod5_path_or_dir` is a directory.
    n_jobs
        Number of worker processes. If None or <=1, runs serially. The pool
        is never larger than the number of POD5 files.
    fill_value
        Value to use when a read_id is not found in any POD5 file.
        If None, missing entries are stored as `None`.
    verbose
        Emit progress messages through the module logger.
    csv_path
        Optional path; when given, the read_id -> pod5_origin table (in
        AnnData order) is written there as CSV.

    Returns
    -------
    dict
        Mapping ``{read_id: pod5_basename}`` for the reads that were found.
        `adata.obs["pod5_origin"]` is also set in-place.

    Raises
    ------
    FileNotFoundError
        If the path does not exist, or a directory contains no file matching
        `pattern`.
    ValueError
        If a single file is given and its suffix is not ``.pod5``.

    Notes
    -----
    In serial mode an error while scanning a file propagates to the caller.
    In parallel mode the error is logged as a warning and that file is
    skipped.
    """
    pod5_path_or_dir = Path(pod5_path_or_dir)

    # --- Resolve input into a list of pod5 files ---
    if pod5_path_or_dir.is_dir():
        pod5_files = sorted(str(p) for p in pod5_path_or_dir.glob(pattern))
        if not pod5_files:
            raise FileNotFoundError(
                f"No POD5 files matching {pattern!r} in {str(pod5_path_or_dir)!r}"
            )
    elif pod5_path_or_dir.is_file():
        if pod5_path_or_dir.suffix.lower() != ".pod5":
            raise ValueError(f"Expected a .pod5 file, got: {pod5_path_or_dir}")
        pod5_files = [str(pod5_path_or_dir)]
    else:
        raise FileNotFoundError(f"Path does not exist: {pod5_path_or_dir}")

    # Make sure obs_names are strings
    obs_names = adata.obs_names.astype(str)
    target_ids = set(obs_names)  # only these are interesting

    if verbose:
        logger.info(f"Found {len(pod5_files)} POD5 files.")
        logger.info(f"Tracking {len(target_ids)} read IDs from AnnData.")

    # --- Collect mappings (possibly multiprocessed) ---
    global_mapping: dict[str, str] = {}

    if n_jobs is None or n_jobs <= 1:
        # Serial version (less overhead, useful for debugging)
        if verbose:
            logger.debug("Running in SERIAL mode.")
        for f in pod5_files:
            if verbose:
                logger.debug(f" Scanning {os.path.basename(f)} ...")
            global_mapping.update(_collect_read_origins_from_pod5(f, target_ids))
    else:
        # More workers than files would only spawn idle processes.
        n_workers = min(n_jobs, len(pod5_files))
        if verbose:
            logger.debug(f"Running in PARALLEL mode with {n_workers} workers.")

        # Results are held per file and merged afterwards in `pod5_files`
        # order, so a read ID present in several files resolves the same way
        # as in serial mode instead of depending on completion order.
        parts_by_file: dict[str, dict[str, str]] = {}
        with ProcessPoolExecutor(max_workers=n_workers) as ex:
            futures = {
                ex.submit(_collect_read_origins_from_pod5, f, target_ids): f for f in pod5_files
            }
            for fut in as_completed(futures):
                f = futures[fut]
                try:
                    part = fut.result()
                except Exception as e:
                    # Best-effort: a failing file is reported and skipped.
                    logger.warning(f"Error while processing {f}: {e}")
                    continue
                parts_by_file[f] = part
                if verbose:
                    logger.info(f" Finished {os.path.basename(f)} ({len(part)} matching reads)")

        for f in pod5_files:
            global_mapping.update(parts_by_file.get(f, {}))

    if verbose:
        logger.info(f"Total reads matched: {len(global_mapping)}")

    # --- Populate obs['pod5_origin'] in AnnData order, memory-efficiently ---
    origin = np.empty(adata.n_obs, dtype=object)
    for i, rid in enumerate(obs_names):
        origin[i] = global_mapping.get(rid, fill_value)

    adata.obs["pod5_origin"] = origin
    if verbose:
        logger.info("Assigned `pod5_origin` to adata.obs.")

    # --- Optionally write a CSV ---
    if csv_path is not None:
        if verbose:
            logger.info(f"Writing CSV mapping to: {csv_path}")

        # Create DataFrame in AnnData order for easier cross-referencing
        df = pd.DataFrame(
            {
                "read_id": obs_names,
                "pod5_origin": origin,
            }
        )
        df.to_csv(csv_path, index=False)

        if verbose:
            logger.info("CSV saved.")

    return global_mapping