smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
from typing import Dict, Optional, Any, Union, Sequence
from pathlib import Path


def call_hmm_peaks(
    adata,
    feature_configs: Dict[str, Dict[str, Any]],
    ref_column: str = "Reference_strand",
    site_types: Sequence[str] = ("GpC", "CpG"),
    save_plot: bool = False,
    output_dir: Optional[Union[str, "Path"]] = None,
    date_tag: Optional[str] = None,
    inplace: bool = True,
    index_col_suffix: Optional[str] = None,
    alternate_labels: bool = False,
):
    """
    Call peaks on one or more HMM-derived (or other) layers and annotate adata.var / adata.obs,
    doing peak calling *within each reference subset*.

    Parameters
    ----------
    adata : AnnData
        Input AnnData with layers already containing feature tracks (e.g. HMM-derived masks).
    feature_configs : dict
        Mapping: feature_type_or_layer_suffix -> {
            "min_distance": int (default 200),
            "peak_width": int (default 200),
            "peak_prominence": float (default 0.2),
            "peak_threshold": float (default 0.8),
            "rolling_window": int (default 1; values > 1 smooth the per-position
                mean with a centered rolling mean before peak detection),
        }

        Keys are usually *feature types* like "all_accessible_features" or
        "small_bound_stretch". These are matched against existing HMM layers
        (e.g. "GpC_all_accessible_features", "Combined_small_bound_stretch")
        using a suffix match. You can also pass full layer names if you wish.
        The search space is adata.uns["hmm_appended_layers"] (minus "*_lengths"
        entries) when that is non-empty, otherwise all of adata.layers.
    ref_column : str
        Column in adata.obs defining reference groups (e.g. "Reference_strand").
        It is converted to a categorical column if it is not one already.
    site_types : sequence of str
        Site types (without "_site"); expects var columns like f"{ref}_{site_type}_site".
        e.g. ("GpC", "CpG") -> "6B6_top_GpC_site", etc.
    save_plot : bool
        If True (and output_dir is given), save peak diagnostic plots instead of
        showing them.
    output_dir : path-like or None
        Directory for saved plots (created if needed).
    date_tag : str or None
        Optional tag to prefix plot filenames ("output" is used when None/empty).
    inplace : bool
        If False, operate on a copy and return it. If True, modify adata and return None.
    index_col_suffix : str or None
        If None, coordinates come from adata.var_names (cast to int when possible,
        otherwise 0..n_vars-1).
        If set, for each ref we use adata.var[f"{ref}_{index_col_suffix}"] as the
        coordinate system (e.g. a reindexed coordinate).
    alternate_labels : bool
        If True, peak labels in the plot alternate between the left and right
        edge of the peak window; otherwise every label sits at the left edge.

    Returns
    -------
    None or AnnData
        None when ``inplace`` is True, otherwise the annotated copy.

    Raises
    ------
    KeyError
        If ``index_col_suffix`` is set but the per-reference coordinate column
        is missing from adata.var.

    Notes
    -----
    For every (layer, ref) pair with at least one peak the function writes:

    * ``adata.uns[f"{layer}_{ref}_peak_centers"]`` - list of peak centers.
    * ``adata.var[f"{layer}_{ref}_peak_{center}"]`` - boolean window mask per peak.
    * ``adata.var[f"is_in_any_{layer}_peak_{ref}"]`` and ``adata.var["is_in_any_peak"]``.
    * ``adata.obs`` columns ``mean_{layer}_{ref}_around_{center}``,
      ``sum_{layer}_{ref}_around_{center}``, ``{layer}_{ref}_present_at_{center}``
      and, per site type, ``{site_type}_{ref}_sum_around_{center}`` /
      ``{site_type}_{ref}_mean_around_{center}`` (filled only for reads of ``ref``).
    """
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    from scipy.signal import find_peaks
    from scipy.sparse import issparse

    if not inplace:
        adata = adata.copy()

    # Ensure ref_column is categorical.
    # (isinstance on the dtype replaces the deprecated pd.api.types.is_categorical_dtype.)
    if not isinstance(adata.obs[ref_column].dtype, pd.CategoricalDtype):
        adata.obs[ref_column] = adata.obs[ref_column].astype("category")

    # Base coordinates (fallback): positional indices when var_names are not integers
    try:
        base_coordinates = adata.var_names.astype(int).values
    except (TypeError, ValueError):
        base_coordinates = np.arange(adata.n_vars, dtype=int)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # HMM layers known to the object (if present); tolerate a missing or None entry.
    # Keep only the binary masks, not *_lengths.
    recorded_layers = adata.uns.get("hmm_appended_layers")
    hmm_layers = [
        layer
        for layer in ([] if recorded_layers is None else recorded_layers)
        if not layer.endswith("_lengths")
    ]

    # Candidate search space: HMM layers if present, else all layers.
    # This does not depend on the reference or feature key, so resolve it once.
    search_layers = hmm_layers if hmm_layers else list(adata.layers.keys())

    all_peak_var_cols = []

    # Iterate over each reference separately
    for ref in adata.obs[ref_column].cat.categories:
        ref_mask = (adata.obs[ref_column] == ref).values
        if not ref_mask.any():
            continue

        # Per-ref coordinates: either from a reindexed column or global fallback
        if index_col_suffix is not None:
            coord_col = f"{ref}_{index_col_suffix}"
            if coord_col not in adata.var:
                raise KeyError(
                    f"index_col_suffix='{index_col_suffix}' requested, "
                    f"but var column '{coord_col}' is missing for ref '{ref}'."
                )
            coord_vals = adata.var[coord_col].values
            # Try to coerce to integer coordinates; fall back to float
            try:
                coordinates = coord_vals.astype(int)
            except (TypeError, ValueError):
                coordinates = np.asarray(coord_vals, dtype=float)
        else:
            coordinates = base_coordinates

        # Resolve each feature_config key to one or more actual layer names
        for feature_key, config in feature_configs.items():
            # First: exact match
            candidate_layers = [lname for lname in search_layers if lname == feature_key]

            # Second: suffix match (e.g. "all_accessible_features" ->
            # "GpC_all_accessible_features", "Combined_all_accessible_features", etc.)
            if not candidate_layers:
                candidate_layers = [
                    lname for lname in search_layers if lname.endswith(feature_key)
                ]

            # Third: if user passed a full layer name that wasn't in hmm_layers,
            # but does exist in adata.layers, allow it.
            if not candidate_layers and feature_key in adata.layers:
                candidate_layers.append(feature_key)

            if not candidate_layers:
                print(
                    f"[call_hmm_peaks] WARNING: no layers found matching feature key "
                    f"'{feature_key}' in ref '{ref}'. Skipping."
                )
                continue

            min_distance = int(config.get("min_distance", 200))
            peak_width = int(config.get("peak_width", 200))
            peak_prominence = float(config.get("peak_prominence", 0.2))
            peak_threshold = float(config.get("peak_threshold", 0.8))
            rolling_window = int(config.get("rolling_window", 1))
            half_width = peak_width // 2

            # Run peak calling on each resolved layer for this ref
            for layer_name in candidate_layers:
                if layer_name not in adata.layers:
                    print(
                        f"[call_hmm_peaks] WARNING: resolved layer '{layer_name}' "
                        f"not found in adata.layers; skipping."
                    )
                    continue

                layer_data = adata.layers[layer_name]
                if issparse(layer_data):
                    layer_data = layer_data.toarray()
                else:
                    layer_data = np.asarray(layer_data)

                # Subset rows for this ref
                matrix = layer_data[ref_mask, :]  # (n_ref_reads, n_vars)
                if matrix.shape[0] == 0:
                    continue

                # Mean signal along positions (within this ref only)
                means = np.nanmean(matrix, axis=0)

                # Optional rolling-mean smoothing before peak detection
                if rolling_window > 1:
                    # Simple centered rolling mean via convolution
                    kernel = np.ones(rolling_window, dtype=float) / float(rolling_window)
                    peak_metric = np.convolve(means, kernel, mode="same")
                else:
                    peak_metric = means

                # Peak detection
                peak_indices, _ = find_peaks(
                    peak_metric, prominence=peak_prominence, distance=min_distance
                )
                if peak_indices.size == 0:
                    print(
                        f"[call_hmm_peaks] No peaks found for layer '{layer_name}' "
                        f"in ref '{ref}'."
                    )
                    continue

                peak_centers = coordinates[peak_indices]
                # Store per-ref peak centers
                adata.uns[f"{layer_name}_{ref}_peak_centers"] = peak_centers.tolist()

                # ---- Plot ----
                fig = plt.figure(figsize=(6, 3))
                plt.plot(coordinates, peak_metric, linewidth=1)
                plt.title(f"{layer_name} peaks in {ref}")
                plt.xlabel("Coordinate")
                plt.ylabel(f"Rolling Mean - roll size {rolling_window}")

                for i, (peak_idx, center) in enumerate(zip(peak_indices, peak_centers)):
                    start = center - half_width
                    end = center + half_width
                    height = peak_metric[peak_idx]
                    plt.axvspan(start, end, color="purple", alpha=0.2)
                    plt.axvline(center, color="red", linestyle="--", linewidth=0.8)

                    # Optionally alternate label placement left/right of the window
                    if alternate_labels and i % 2 == 1:
                        x_text, ha = end, "left"
                    else:
                        x_text, ha = start, "right"

                    plt.text(
                        x_text,
                        height * 0.8,
                        f"Peak {i}\n{center}",
                        color="red",
                        ha=ha,
                        va="bottom",
                        fontsize=8,
                    )

                if save_plot and output_dir is not None:
                    tag = date_tag or "output"
                    # include ref in filename
                    safe_ref = str(ref).replace("/", "_")
                    safe_layer = str(layer_name).replace("/", "_")
                    fname = output_dir / f"{tag}_{safe_layer}_{safe_ref}_peaks.png"
                    plt.savefig(fname, bbox_inches="tight", dpi=200)
                    print(f"[call_hmm_peaks] Saved plot to {fname}")
                else:
                    plt.tight_layout()
                    plt.show()
                # Always release the figure; otherwise one figure per (ref, layer)
                # accumulates when plots are shown rather than saved.
                plt.close(fig)

                feature_peak_cols = []

                # ---- Per-peak annotations (within this ref) ----
                for center in peak_centers:
                    start = center - half_width
                    end = center + half_width

                    # Make column names ref- and layer-specific so they don't collide
                    colname = f"{layer_name}_{ref}_peak_{center}"
                    feature_peak_cols.append(colname)
                    all_peak_var_cols.append(colname)

                    # Var-level mask: is this position in the window?
                    peak_mask = (coordinates >= start) & (coordinates <= end)
                    adata.var[colname] = peak_mask

                    # Extract signal in that window from the *ref subset* matrix
                    region = matrix[:, peak_mask]  # (n_ref_reads, n_positions_in_window)

                    # Per-read summary in this window for the feature layer itself
                    mean_col = f"mean_{layer_name}_{ref}_around_{center}"
                    sum_col = f"sum_{layer_name}_{ref}_around_{center}"
                    present_col = f"{layer_name}_{ref}_present_at_{center}"

                    # Create columns if missing, then fill only the ref rows
                    if mean_col not in adata.obs:
                        adata.obs[mean_col] = np.nan
                    if sum_col not in adata.obs:
                        adata.obs[sum_col] = 0.0
                    if present_col not in adata.obs:
                        adata.obs[present_col] = False

                    region_mean = np.nanmean(region, axis=1)
                    adata.obs.loc[ref_mask, mean_col] = region_mean
                    adata.obs.loc[ref_mask, sum_col] = np.nansum(region, axis=1)
                    adata.obs.loc[ref_mask, present_col] = region_mean > peak_threshold

                    # Per-site-type summaries for this ref
                    for site_type in site_types:
                        sum_site_col = f"{site_type}_{ref}_sum_around_{center}"
                        mean_site_col = f"{site_type}_{ref}_mean_around_{center}"

                        # Initialize the columns even if no sites fall in the window
                        if sum_site_col not in adata.obs:
                            adata.obs[sum_site_col] = 0.0
                        if mean_site_col not in adata.obs:
                            adata.obs[mean_site_col] = np.nan

                        mask_key = f"{ref}_{site_type}_site"
                        if mask_key not in adata.var:
                            continue

                        site_mask = adata.var[mask_key].values.astype(bool)
                        if not site_mask.any():
                            continue

                        site_coords = coordinates[site_mask]
                        region_mask = (site_coords >= start) & (site_coords <= end)
                        if not region_mask.any():
                            continue

                        # Positions that are both a site of this type and inside the window
                        full_mask = np.zeros_like(site_mask, dtype=bool)
                        full_mask[site_mask] = region_mask

                        site_region = adata[ref_mask, full_mask].X
                        # Densify explicitly: the ``.A`` shortcut is not available on
                        # every SciPy sparse container/version, and a sparse input
                        # would otherwise reach np.nansum/np.nanmean unchanged.
                        if issparse(site_region):
                            site_region = site_region.toarray()
                        else:
                            site_region = np.asarray(site_region)

                        if site_region.shape[1] == 0:
                            continue

                        adata.obs.loc[ref_mask, sum_site_col] = np.nansum(site_region, axis=1)
                        adata.obs.loc[ref_mask, mean_site_col] = np.nanmean(site_region, axis=1)

                # Mark "any peak" for this (layer, ref)
                any_col = f"is_in_any_{layer_name}_peak_{ref}"
                adata.var[any_col] = adata.var[feature_peak_cols].any(axis=1)
                print(
                    f"[call_hmm_peaks] Annotated {len(peak_centers)} peaks "
                    f"for layer '{layer_name}' in ref '{ref}'."
                )

    # Global any-peak flag across all feature layers and references
    if all_peak_var_cols:
        adata.var["is_in_any_peak"] = adata.var[all_peak_var_cols].any(axis=1)

    return None if inplace else adata
# Public API of the informatics subpackage: re-export the functions from the
# consolidated *_functions modules so callers can import them from one place.
from .bam_functions import align_and_sort_BAM, bam_qc, concatenate_fastqs_to_bam, count_aligned_reads, demux_and_index_BAM, extract_base_identities, extract_read_features_from_bam, extract_readnames_from_bam, separate_bam_by_bc, split_and_index_BAM
from .basecalling import canoncall, modcall
from .bed_functions import aligned_BAM_to_bed, _bed_to_bigwig, extract_read_lengths_from_bed, _plot_bed_histograms
from .converted_BAM_to_adata import converted_BAM_to_adata
from .fasta_functions import find_conversion_sites, generate_converted_FASTA, get_chromosome_lengths, get_native_references, index_fasta, subsample_fasta_from_bed
from .h5ad_functions import add_demux_type_annotation, add_read_length_and_mapping_qc
from .modkit_functions import extract_mods, make_modbed, modQC
from .modkit_extract_to_adata import modkit_extract_to_adata
from .ohe import one_hot_encode, one_hot_decode, ohe_layers_decode, ohe_batching
from .pod5_functions import basecall_pod5s, fast5_to_pod5, subsample_pod5
from .run_multiqc import run_multiqc

# Every public (non-underscore) name imported above is listed, so that
# ``from smftools.informatics import *`` matches what the module exposes.
# The underscore-prefixed bed helpers stay importable but are not exported.
__all__ = [
    # bam_functions
    "align_and_sort_BAM",
    "bam_qc",
    "concatenate_fastqs_to_bam",
    "count_aligned_reads",
    "demux_and_index_BAM",
    "extract_base_identities",
    "extract_read_features_from_bam",
    "extract_readnames_from_bam",
    "separate_bam_by_bc",
    "split_and_index_BAM",
    # basecalling
    "canoncall",
    "modcall",
    # bed_functions
    "aligned_BAM_to_bed",
    "extract_read_lengths_from_bed",
    # converted_BAM_to_adata
    "converted_BAM_to_adata",
    # fasta_functions
    "find_conversion_sites",
    "generate_converted_FASTA",
    "get_chromosome_lengths",
    "get_native_references",
    "index_fasta",
    "subsample_fasta_from_bed",
    # h5ad_functions
    "add_demux_type_annotation",
    "add_read_length_and_mapping_qc",
    # modkit
    "extract_mods",
    "make_modbed",
    "modQC",
    "modkit_extract_to_adata",
    # ohe
    "one_hot_encode",
    "one_hot_decode",
    "ohe_layers_decode",
    "ohe_batching",
    # pod5_functions
    "basecall_pod5s",
    "fast5_to_pod5",
    "subsample_pod5",
    # run_multiqc
    "run_multiqc",
]
@@ -0,0 +1,43 @@
1
+ from pathlib import Path
2
+ import subprocess
3
+ from typing import Union, List
4
+
5
+ def fast5_to_pod5(
6
+ fast5_dir: Union[str, Path, List[Union[str, Path]]],
7
+ output_pod5: Union[str, Path] = "FAST5s_to_POD5.pod5"
8
+ ) -> None:
9
+ """
10
+ Convert Nanopore FAST5 files (single file, list of files, or directory)
11
+ into a single .pod5 output using the 'pod5 convert fast5' CLI tool.
12
+ """
13
+
14
+ output_pod5 = str(output_pod5) # ensure string
15
+
16
+ # 1) If user gives a list of FAST5 files
17
+ if isinstance(fast5_dir, (list, tuple)):
18
+ fast5_paths = [str(Path(f)) for f in fast5_dir]
19
+ cmd = ["pod5", "convert", "fast5", *fast5_paths, "--output", output_pod5]
20
+ subprocess.run(cmd, check=True)
21
+ return
22
+
23
+ # Ensure Path object
24
+ p = Path(fast5_dir)
25
+
26
+ # 2) If user gives a single file
27
+ if p.is_file():
28
+ cmd = ["pod5", "convert", "fast5", str(p), "--output", output_pod5]
29
+ subprocess.run(cmd, check=True)
30
+ return
31
+
32
+ # 3) If user gives a directory → collect FAST5s
33
+ if p.is_dir():
34
+ fast5_paths = sorted(str(f) for f in p.glob("*.fast5"))
35
+ if not fast5_paths:
36
+ raise FileNotFoundError(f"No FAST5 files found in {p}")
37
+
38
+ cmd = ["pod5", "convert", "fast5", *fast5_paths, "--output", output_pod5]
39
+ subprocess.run(cmd, check=True)
40
+ return
41
+
42
+ raise FileNotFoundError(f"Input path invalid: {fast5_dir}")
43
+
@@ -0,0 +1,71 @@
1
+ # from .align_and_sort_BAM import align_and_sort_BAM
2
+ # from .aligned_BAM_to_bed import aligned_BAM_to_bed
3
+ # from .bam_qc import bam_qc
4
+ # from .bed_to_bigwig import bed_to_bigwig
5
+ # from .binarize_converted_base_identities import binarize_converted_base_identities
6
+ # from .canoncall import canoncall
7
+ # from .complement_base_list import complement_base_list
8
+ # from .converted_BAM_to_adata_II import converted_BAM_to_adata_II
9
+ # from .concatenate_fastqs_to_bam import concatenate_fastqs_to_bam
10
+ # from .count_aligned_reads import count_aligned_reads
11
+ # from .demux_and_index_BAM import demux_and_index_BAM
12
+ # from .discover_input_files import *
13
+ # from .extract_base_identities import extract_base_identities
14
+ # from .extract_mods import extract_mods
15
+ # from .extract_read_features_from_bam import extract_read_features_from_bam
16
+ # from .extract_read_lengths_from_bed import extract_read_lengths_from_bed
17
+ # from .extract_readnames_from_BAM import extract_readnames_from_BAM
18
+ # from .find_conversion_sites import find_conversion_sites
19
+ # from .generate_converted_FASTA import convert_FASTA_record, generate_converted_FASTA
20
+ # from .get_chromosome_lengths import get_chromosome_lengths
21
+ # from .get_native_references import get_native_references
22
+ # from .index_fasta import index_fasta
23
+ # from .make_modbed import make_modbed
24
+ # from .modcall import modcall
25
+ # from .modkit_extract_to_adata import modkit_extract_to_adata
26
+ # from .modQC import modQC
27
+ # from .one_hot_encode import one_hot_encode
28
+ # from .ohe_batching import ohe_batching
29
+ # from .one_hot_decode import one_hot_decode
30
+ # from .ohe_layers_decode import ohe_layers_decode
31
+ # from .plot_bed_histograms import plot_bed_histograms
32
+ # from .run_multiqc import run_multiqc
33
+ # from .separate_bam_by_bc import separate_bam_by_bc
34
+ # from .split_and_index_BAM import split_and_index_BAM
35
+
36
+ # __all__ = [
37
+ # "align_and_sort_BAM",
38
+ # "aligned_BAM_to_bed",
39
+ # "bam_qc",
40
+ # "bed_to_bigwig",
41
+ # "binarize_converted_base_identities",
42
+ # "canoncall",
43
+ # "complement_base_list",
44
+ # "converted_BAM_to_adata_II",
45
+ # "concatenate_fastqs_to_bam",
46
+ # "count_aligned_reads",
47
+ # "demux_and_index_BAM",
48
+ # "extract_base_identities",
49
+ # "extract_mods",
50
+ # "extract_read_features_from_bam",
51
+ # "extract_read_lengths_from_bed",
52
+ # "extract_readnames_from_BAM",
53
+ # "find_conversion_sites",
54
+ # "convert_FASTA_record",
55
+ # "generate_converted_FASTA",
56
+ # "get_chromosome_lengths",
57
+ # "get_native_references",
58
+ # "index_fasta",
59
+ # "make_modbed",
60
+ # "modcall",
61
+ # "modkit_extract_to_adata",
62
+ # "modQC",
63
+ # "one_hot_encode",
64
+ # "ohe_batching",
65
+ # "one_hot_decode",
66
+ # "ohe_layers_decode",
67
+ # "plot_bed_histograms",
68
+ # "run_multiqc",
69
+ # "separate_bam_by_bc",
70
+ # "split_and_index_BAM"
71
+ # ]