smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. smftools/_version.py +1 -1
  2. smftools/cli/chimeric_adata.py +1563 -0
  3. smftools/cli/helpers.py +49 -7
  4. smftools/cli/hmm_adata.py +250 -32
  5. smftools/cli/latent_adata.py +773 -0
  6. smftools/cli/load_adata.py +78 -74
  7. smftools/cli/preprocess_adata.py +122 -58
  8. smftools/cli/recipes.py +26 -0
  9. smftools/cli/spatial_adata.py +74 -112
  10. smftools/cli/variant_adata.py +423 -0
  11. smftools/cli_entry.py +52 -4
  12. smftools/config/conversion.yaml +1 -1
  13. smftools/config/deaminase.yaml +3 -0
  14. smftools/config/default.yaml +85 -12
  15. smftools/config/experiment_config.py +146 -1
  16. smftools/constants.py +69 -0
  17. smftools/hmm/HMM.py +88 -0
  18. smftools/hmm/call_hmm_peaks.py +1 -1
  19. smftools/informatics/__init__.py +6 -0
  20. smftools/informatics/bam_functions.py +358 -8
  21. smftools/informatics/binarize_converted_base_identities.py +2 -89
  22. smftools/informatics/converted_BAM_to_adata.py +636 -175
  23. smftools/informatics/h5ad_functions.py +198 -2
  24. smftools/informatics/modkit_extract_to_adata.py +1007 -425
  25. smftools/informatics/sequence_encoding.py +72 -0
  26. smftools/logging_utils.py +21 -2
  27. smftools/metadata.py +1 -1
  28. smftools/plotting/__init__.py +26 -3
  29. smftools/plotting/autocorrelation_plotting.py +22 -4
  30. smftools/plotting/chimeric_plotting.py +1893 -0
  31. smftools/plotting/classifiers.py +28 -14
  32. smftools/plotting/general_plotting.py +62 -1583
  33. smftools/plotting/hmm_plotting.py +1670 -8
  34. smftools/plotting/latent_plotting.py +804 -0
  35. smftools/plotting/plotting_utils.py +243 -0
  36. smftools/plotting/position_stats.py +16 -8
  37. smftools/plotting/preprocess_plotting.py +281 -0
  38. smftools/plotting/qc_plotting.py +8 -3
  39. smftools/plotting/spatial_plotting.py +1134 -0
  40. smftools/plotting/variant_plotting.py +1231 -0
  41. smftools/preprocessing/__init__.py +4 -0
  42. smftools/preprocessing/append_base_context.py +18 -18
  43. smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
  44. smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
  45. smftools/preprocessing/append_variant_call_layer.py +480 -0
  46. smftools/preprocessing/calculate_consensus.py +1 -1
  47. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  48. smftools/preprocessing/flag_duplicate_reads.py +4 -4
  49. smftools/preprocessing/invert_adata.py +1 -0
  50. smftools/readwrite.py +159 -99
  51. smftools/schema/anndata_schema_v1.yaml +15 -1
  52. smftools/tools/__init__.py +10 -0
  53. smftools/tools/calculate_knn.py +121 -0
  54. smftools/tools/calculate_leiden.py +57 -0
  55. smftools/tools/calculate_nmf.py +130 -0
  56. smftools/tools/calculate_pca.py +180 -0
  57. smftools/tools/calculate_umap.py +79 -80
  58. smftools/tools/position_stats.py +4 -4
  59. smftools/tools/rolling_nn_distance.py +872 -0
  60. smftools/tools/sequence_alignment.py +140 -0
  61. smftools/tools/tensor_factorization.py +217 -0
  62. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
  63. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
  64. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
  65. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
  66. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -10,6 +10,7 @@ import numpy as np
10
10
  import pandas as pd
11
11
  import scipy.sparse as sp
12
12
 
13
+ from smftools.constants import BASE_QUALITY_SCORES, READ_SPAN_MASK, REFERENCE_STRAND
13
14
  from smftools.logging_utils import get_logger
14
15
  from smftools.optional_imports import require
15
16
 
@@ -84,6 +85,194 @@ def add_demux_type_annotation(
84
85
  return adata
85
86
 
86
87
 
88
def append_reference_strand_quality_stats(
    adata,
    ref_column: str = REFERENCE_STRAND,
    quality_layer: str = BASE_QUALITY_SCORES,
    read_span_layer: str = READ_SPAN_MASK,
    uns_flag: str = "append_reference_strand_quality_stats_performed",
    force_redo: bool = False,
    bypass: bool = False,
) -> None:
    """Append per-position quality and error rate stats for each reference strand.

    For every reference value found in ``adata.obs[ref_column]`` four columns are
    written to ``adata.var``: ``{ref}_mean_base_quality``, ``{ref}_std_base_quality``,
    ``{ref}_mean_error_rate`` and ``{ref}_std_error_rate``. Error rates are derived
    from the quality scores as ``10 ** (-Q / 10)``.

    Values excluded from the statistics (treated as NaN):
        * quality scores below zero,
        * entries where ``read_span_layer`` is present and not greater than zero,
        * positions whose ``adata.var["position_in_{ref}"]`` entry is falsy (when
          that column is absent every position is kept).

    Positions with no usable observations end up as NaN in all four columns.

    Args:
        adata: AnnData object to annotate in-place.
        ref_column: Obs column defining reference strand groups.
        quality_layer: Layer containing base quality scores.
        read_span_layer: Optional layer marking covered positions (1=covered, 0=not covered).
        uns_flag: Flag in ``adata.uns`` indicating prior completion.
        force_redo: Whether to rerun even if ``uns_flag`` is set.
        bypass: Whether to skip this step.

    Returns:
        None. ``adata.var`` and ``adata.uns[uns_flag]`` are updated in-place. Nothing
        is written (and the flag is not set) when the step is bypassed, already
        done, or when ``ref_column`` / ``quality_layer`` is missing.
    """
    # Local import keeps this block self-contained; only needed for the scoped filter below.
    import warnings

    if bypass:
        return

    already = bool(adata.uns.get(uns_flag, False))
    if already and not force_redo:
        return

    if ref_column not in adata.obs:
        logger.debug("Reference column '%s' not found; skipping quality stats.", ref_column)
        return

    if quality_layer not in adata.layers:
        logger.debug("Quality layer '%s' not found; skipping quality stats.", quality_layer)
        return

    ref_values = adata.obs[ref_column]
    # Categorical columns expose their categories directly (including unused ones,
    # which simply yield all-NaN columns below); otherwise fall back to unique values.
    references = (
        ref_values.cat.categories if hasattr(ref_values, "cat") else pd.Index(pd.unique(ref_values))
    )
    n_vars = adata.shape[1]
    has_span_mask = read_span_layer in adata.layers

    for ref in references:
        # Plain boolean ndarray so layer indexing does not depend on pandas alignment.
        ref_mask = np.asarray(ref_values == ref, dtype=bool)

        position_flags = adata.var.get(f"position_in_{ref}")
        if position_flags is None:
            in_reference = np.ones(n_vars, dtype=bool)
        else:
            in_reference = position_flags.astype(bool).to_numpy()

        mean_quality = np.full(n_vars, np.nan, dtype=float)
        std_quality = np.full(n_vars, np.nan, dtype=float)
        mean_error = np.full(n_vars, np.nan, dtype=float)
        std_error = np.full(n_vars, np.nan, dtype=float)

        if ref_mask.any():
            quality_matrix = np.asarray(adata.layers[quality_layer][ref_mask]).astype(float)
            # Negative scores are treated as "no quality available".
            quality_matrix[quality_matrix < 0] = np.nan
            if has_span_mask:
                coverage_mask = np.asarray(adata.layers[read_span_layer][ref_mask]) > 0
                quality_matrix = np.where(coverage_mask, quality_matrix, np.nan)

            error_matrix = np.power(10.0, -quality_matrix / 10.0)

            # Columns with no usable observations are expected here; NumPy reports each
            # one with a RuntimeWarning ("Mean of empty slice" / "Degrees of freedom <= 0").
            # The NaN result is exactly what we want, so silence the warning locally.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore", category=RuntimeWarning)
                mean_quality = np.nanmean(quality_matrix, axis=0)
                std_quality = np.nanstd(quality_matrix, axis=0)
                mean_error = np.nanmean(error_matrix, axis=0)
                std_error = np.nanstd(error_matrix, axis=0)

        # Blank out positions that do not belong to this reference.
        mean_quality = np.where(in_reference, mean_quality, np.nan)
        std_quality = np.where(in_reference, std_quality, np.nan)
        mean_error = np.where(in_reference, mean_error, np.nan)
        std_error = np.where(in_reference, std_error, np.nan)

        adata.var[f"{ref}_mean_base_quality"] = pd.Series(mean_quality, index=adata.var.index)
        adata.var[f"{ref}_std_base_quality"] = pd.Series(std_quality, index=adata.var.index)
        adata.var[f"{ref}_mean_error_rate"] = pd.Series(mean_error, index=adata.var.index)
        adata.var[f"{ref}_std_error_rate"] = pd.Series(std_error, index=adata.var.index)

    adata.uns[uns_flag] = True
168
+
169
+
170
+ def add_read_tag_annotations(
171
+ adata,
172
+ bam_files: Optional[List[str]] = None,
173
+ read_tags: Optional[Dict[str, Dict[str, object]]] = None,
174
+ tag_names: Optional[List[str]] = None,
175
+ include_flags: bool = True,
176
+ include_cigar: bool = True,
177
+ extract_read_tags_from_bam_callable=None,
178
+ samtools_backend: str | None = "auto",
179
+ ):
180
+ """Populate adata.obs with read tag metadata.
181
+
182
+ Args:
183
+ adata: AnnData to annotate (modified in-place).
184
+ bam_files: Optional list of BAM files to extract tags from.
185
+ read_tags: Optional mapping of read name to tag dict.
186
+ tag_names: Optional list of BAM tag names to extract (e.g. ["NM", "MD", "MM", "ML"]).
187
+ include_flags: Whether to add a FLAGS list column.
188
+ include_cigar: Whether to add the CIGAR string column.
189
+ extract_read_tags_from_bam_callable: Optional callable to extract tags from a BAM.
190
+ samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
191
+
192
+ Returns:
193
+ None (mutates adata in-place).
194
+ """
195
+ if read_tags is None:
196
+ read_tags = {}
197
+ if bam_files:
198
+ extractor = extract_read_tags_from_bam_callable or globals().get(
199
+ "extract_read_tags_from_bam"
200
+ )
201
+ if extractor is None:
202
+ raise ValueError(
203
+ "No `read_tags` provided and `extract_read_tags_from_bam` not found."
204
+ )
205
+ for bam in bam_files:
206
+ bam_read_tags = extractor(
207
+ bam,
208
+ tag_names=tag_names,
209
+ include_flags=include_flags,
210
+ include_cigar=include_cigar,
211
+ samtools_backend=samtools_backend,
212
+ )
213
+ if not isinstance(bam_read_tags, dict):
214
+ raise ValueError(f"extract_read_tags_from_bam returned non-dict for {bam}")
215
+ read_tags.update(bam_read_tags)
216
+
217
+ if not read_tags:
218
+ return
219
+
220
+ df = pd.DataFrame.from_dict(read_tags, orient="index")
221
+ df_reindexed = df.reindex(adata.obs_names)
222
+ for column in df_reindexed.columns:
223
+ adata.obs[column] = df_reindexed[column].values
224
+
225
+
226
+ def add_secondary_supplementary_alignment_flags(
227
+ adata,
228
+ bam_path: str | Path,
229
+ *,
230
+ uns_flag: str = "add_secondary_supplementary_flags_performed",
231
+ bypass: bool = False,
232
+ force_redo: bool = False,
233
+ samtools_backend: str | None = "auto",
234
+ ) -> None:
235
+ """Annotate whether reads have secondary/supplementary alignments.
236
+
237
+ Args:
238
+ adata: AnnData to annotate (modified in-place).
239
+ bam_path: Path to the aligned/sorted BAM to scan.
240
+ uns_flag: Flag in ``adata.uns`` indicating prior completion.
241
+ bypass: Whether to skip annotation.
242
+ force_redo: Whether to recompute even if ``uns_flag`` is set.
243
+ samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
244
+ """
245
+ already = bool(adata.uns.get(uns_flag, False))
246
+ if (already and not force_redo) or bypass:
247
+ return
248
+
249
+ from .bam_functions import (
250
+ extract_secondary_supplementary_alignment_spans,
251
+ find_secondary_supplementary_read_names,
252
+ )
253
+
254
+ secondary_reads, supplementary_reads = find_secondary_supplementary_read_names(
255
+ bam_path,
256
+ adata.obs_names,
257
+ samtools_backend=samtools_backend,
258
+ )
259
+ secondary_spans, supplementary_spans = extract_secondary_supplementary_alignment_spans(
260
+ bam_path,
261
+ adata.obs_names,
262
+ samtools_backend=samtools_backend,
263
+ )
264
+
265
+ adata.obs["has_secondary_alignment"] = adata.obs_names.isin(secondary_reads)
266
+ adata.obs["has_supplementary_alignment"] = adata.obs_names.isin(supplementary_reads)
267
+ adata.obs["secondary_alignment_spans"] = [
268
+ secondary_spans.get(read_name) for read_name in adata.obs_names
269
+ ]
270
+ adata.obs["supplementary_alignment_spans"] = [
271
+ supplementary_spans.get(read_name) for read_name in adata.obs_names
272
+ ]
273
+ adata.uns[uns_flag] = True
274
+
275
+
87
276
  def add_read_length_and_mapping_qc(
88
277
  adata,
89
278
  bam_files: Optional[List[str]] = None,
@@ -104,7 +293,8 @@ def add_read_length_and_mapping_qc(
104
293
  bam_files
105
294
  Optional list of BAM files to extract metrics from. Ignored if read_metrics supplied.
106
295
  read_metrics
107
- Optional dict mapping obs_name -> [read_length, read_quality, reference_length, mapped_length, mapping_quality]
296
+ Optional dict mapping obs_name -> [read_length, read_quality, reference_length, mapped_length,
297
+ mapping_quality, reference_start, reference_end]
108
298
  If provided, this will be used directly and bam_files will be ignored.
109
299
  uns_flag
110
300
  key in final_adata.uns used to record that QC was performed (kept the name with original misspelling).
@@ -154,10 +344,12 @@ def add_read_length_and_mapping_qc(
154
344
  adata.obs["reference_length"] = np.full(n, np.nan)
155
345
  adata.obs["read_quality"] = np.full(n, np.nan)
156
346
  adata.obs["mapping_quality"] = np.full(n, np.nan)
347
+ adata.obs["reference_start"] = np.full(n, np.nan)
348
+ adata.obs["reference_end"] = np.full(n, np.nan)
157
349
  else:
158
350
  # Build DF robustly
159
351
  # Convert values to lists where possible, else to [val, val, val...]
160
- max_cols = 5
352
+ max_cols = 7
161
353
  rows = {}
162
354
  for k, v in read_metrics.items():
163
355
  if isinstance(v, (list, tuple, np.ndarray)):
@@ -179,6 +371,8 @@ def add_read_length_and_mapping_qc(
179
371
  "reference_length",
180
372
  "mapped_length",
181
373
  "mapping_quality",
374
+ "reference_start",
375
+ "reference_end",
182
376
  ],
183
377
  )
184
378
 
@@ -191,6 +385,8 @@ def add_read_length_and_mapping_qc(
191
385
  adata.obs["reference_length"] = df_reindexed["reference_length"].values
192
386
  adata.obs["read_quality"] = df_reindexed["read_quality"].values
193
387
  adata.obs["mapping_quality"] = df_reindexed["mapping_quality"].values
388
+ adata.obs["reference_start"] = df_reindexed["reference_start"].values
389
+ adata.obs["reference_end"] = df_reindexed["reference_end"].values
194
390
 
195
391
  # Compute ratio columns safely (avoid divide-by-zero and preserve NaN)
196
392
  # read_length_to_reference_length_ratio