smftools 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. smftools/_version.py +1 -1
  2. smftools/cli/helpers.py +32 -6
  3. smftools/cli/hmm_adata.py +232 -31
  4. smftools/cli/latent_adata.py +318 -0
  5. smftools/cli/load_adata.py +77 -73
  6. smftools/cli/preprocess_adata.py +178 -53
  7. smftools/cli/spatial_adata.py +149 -101
  8. smftools/cli_entry.py +12 -0
  9. smftools/config/conversion.yaml +11 -1
  10. smftools/config/default.yaml +38 -1
  11. smftools/config/experiment_config.py +53 -1
  12. smftools/constants.py +65 -0
  13. smftools/hmm/HMM.py +88 -0
  14. smftools/informatics/__init__.py +6 -0
  15. smftools/informatics/bam_functions.py +358 -8
  16. smftools/informatics/converted_BAM_to_adata.py +584 -163
  17. smftools/informatics/h5ad_functions.py +115 -2
  18. smftools/informatics/modkit_extract_to_adata.py +1003 -425
  19. smftools/informatics/sequence_encoding.py +72 -0
  20. smftools/logging_utils.py +21 -2
  21. smftools/metadata.py +1 -1
  22. smftools/plotting/__init__.py +9 -0
  23. smftools/plotting/general_plotting.py +2411 -628
  24. smftools/plotting/hmm_plotting.py +85 -7
  25. smftools/preprocessing/__init__.py +1 -0
  26. smftools/preprocessing/append_base_context.py +17 -17
  27. smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
  28. smftools/preprocessing/calculate_consensus.py +1 -1
  29. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  30. smftools/readwrite.py +53 -17
  31. smftools/schema/anndata_schema_v1.yaml +15 -1
  32. smftools/tools/__init__.py +4 -0
  33. smftools/tools/calculate_leiden.py +57 -0
  34. smftools/tools/calculate_nmf.py +119 -0
  35. smftools/tools/calculate_umap.py +91 -8
  36. smftools/tools/rolling_nn_distance.py +235 -0
  37. smftools/tools/tensor_factorization.py +169 -0
  38. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/METADATA +8 -6
  39. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/RECORD +42 -35
  40. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
  41. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
  42. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -84,6 +84,112 @@ def add_demux_type_annotation(
84
84
  return adata
85
85
 
86
86
 
87
def add_read_tag_annotations(
    adata,
    bam_files: Optional[List[str]] = None,
    read_tags: Optional[Dict[str, Dict[str, object]]] = None,
    tag_names: Optional[List[str]] = None,
    include_flags: bool = True,
    include_cigar: bool = True,
    extract_read_tags_from_bam_callable=None,
    samtools_backend: str | None = "auto",
):
    """Copy per-read tag metadata into ``adata.obs``, one column per tag key.

    A caller-supplied ``read_tags`` mapping takes precedence: ``bam_files`` are
    only scanned when ``read_tags`` is ``None``. Rows of ``adata.obs`` whose
    read name has no entry in the mapping receive missing values.

    Args:
        adata: AnnData to annotate (modified in-place).
        bam_files: Optional list of BAM files to extract tags from. Only used
            when ``read_tags`` is ``None``.
        read_tags: Optional mapping of read name to tag dict. Used as-is when
            provided; it is never mutated.
        tag_names: Optional list of BAM tag names to extract
            (e.g. ["NM", "MD", "MM", "ML"]). Forwarded to the extractor.
        include_flags: Whether to add a FLAGS list column. Forwarded to the
            extractor.
        include_cigar: Whether to add the CIGAR string column. Forwarded to the
            extractor.
        extract_read_tags_from_bam_callable: Optional callable to extract tags
            from a BAM. When falsy, a module-level
            ``extract_read_tags_from_bam`` is looked up instead.
        samtools_backend: Backend selection for samtools-compatible operations
            (auto|python|cli). Forwarded to the extractor.

    Returns:
        None (mutates adata in-place). Nothing is written when no tags are
        available.

    Raises:
        ValueError: If BAM extraction is needed but no extractor can be
            resolved, or if the extractor returns something other than a dict.
    """
    if read_tags is None:
        # Build a fresh mapping so results from several BAMs can be merged
        # without touching anything owned by the caller.
        collected = {}
        if bam_files:
            tag_extractor = extract_read_tags_from_bam_callable
            if not tag_extractor:
                # Fall back to a module-level extractor, if one exists.
                tag_extractor = globals().get("extract_read_tags_from_bam")
            if tag_extractor is None:
                raise ValueError(
                    "No `read_tags` provided and `extract_read_tags_from_bam` not found."
                )
            for bam_file in bam_files:
                per_bam_tags = tag_extractor(
                    bam_file,
                    tag_names=tag_names,
                    include_flags=include_flags,
                    include_cigar=include_cigar,
                    samtools_backend=samtools_backend,
                )
                if not isinstance(per_bam_tags, dict):
                    raise ValueError(
                        f"extract_read_tags_from_bam returned non-dict for {bam_file}"
                    )
                # Later BAMs overwrite entries for read names seen earlier.
                collected.update(per_bam_tags)
        read_tags = collected

    if not read_tags:
        return

    # One row per read name, aligned to the AnnData observation order.
    tag_frame = pd.DataFrame.from_dict(read_tags, orient="index").reindex(adata.obs_names)
    for tag_column, tag_values in tag_frame.items():
        adata.obs[tag_column] = tag_values.values
141
+
142
+
143
def add_secondary_supplementary_alignment_flags(
    adata,
    bam_path: str | Path,
    *,
    uns_flag: str = "add_secondary_supplementary_flags_performed",
    bypass: bool = False,
    force_redo: bool = False,
    samtools_backend: str | None = "auto",
) -> None:
    """Record secondary/supplementary alignment information in ``adata.obs``.

    Four columns are written: ``has_secondary_alignment`` and
    ``has_supplementary_alignment`` (membership of each read name in the
    corresponding collection returned by the BAM helper), plus
    ``secondary_alignment_spans`` and ``supplementary_alignment_spans``
    (the per-read value looked up in the helper's mapping, ``None`` when the
    read has no entry). ``adata.uns[uns_flag]`` is set to ``True`` afterwards.

    Args:
        adata: AnnData to annotate (modified in-place).
        bam_path: Path to the aligned/sorted BAM to scan.
        uns_flag: Flag in ``adata.uns`` indicating prior completion.
        bypass: Whether to skip annotation.
        force_redo: Whether to recompute even if ``uns_flag`` is set.
        samtools_backend: Backend selection for samtools-compatible operations
            (auto|python|cli).
    """
    previously_done = bool(adata.uns.get(uns_flag, False))
    must_compute = force_redo or not previously_done
    if bypass or not must_compute:
        return

    # Imported lazily, only once we know the BAM actually has to be scanned.
    from .bam_functions import (
        extract_secondary_supplementary_alignment_spans,
        find_secondary_supplementary_read_names,
    )

    read_names = adata.obs_names

    names_result = find_secondary_supplementary_read_names(
        bam_path,
        read_names,
        samtools_backend=samtools_backend,
    )
    secondary_reads, supplementary_reads = names_result

    spans_result = extract_secondary_supplementary_alignment_spans(
        bam_path,
        read_names,
        samtools_backend=samtools_backend,
    )
    secondary_spans, supplementary_spans = spans_result

    # Boolean membership flags, aligned to the observation order.
    adata.obs["has_secondary_alignment"] = read_names.isin(secondary_reads)
    adata.obs["has_supplementary_alignment"] = read_names.isin(supplementary_reads)

    # Per-read span lookups; reads absent from a mapping get None.
    adata.obs["secondary_alignment_spans"] = list(map(secondary_spans.get, read_names))
    adata.obs["supplementary_alignment_spans"] = list(
        map(supplementary_spans.get, read_names)
    )

    adata.uns[uns_flag] = True
191
+
192
+
87
193
  def add_read_length_and_mapping_qc(
88
194
  adata,
89
195
  bam_files: Optional[List[str]] = None,
@@ -104,7 +210,8 @@ def add_read_length_and_mapping_qc(
104
210
  bam_files
105
211
  Optional list of BAM files to extract metrics from. Ignored if read_metrics supplied.
106
212
  read_metrics
107
- Optional dict mapping obs_name -> [read_length, read_quality, reference_length, mapped_length, mapping_quality]
213
+ Optional dict mapping obs_name -> [read_length, read_quality, reference_length, mapped_length,
214
+ mapping_quality, reference_start, reference_end]
108
215
  If provided, this will be used directly and bam_files will be ignored.
109
216
  uns_flag
110
217
  key in final_adata.uns used to record that QC was performed (kept the name with original misspelling).
@@ -154,10 +261,12 @@ def add_read_length_and_mapping_qc(
154
261
  adata.obs["reference_length"] = np.full(n, np.nan)
155
262
  adata.obs["read_quality"] = np.full(n, np.nan)
156
263
  adata.obs["mapping_quality"] = np.full(n, np.nan)
264
+ adata.obs["reference_start"] = np.full(n, np.nan)
265
+ adata.obs["reference_end"] = np.full(n, np.nan)
157
266
  else:
158
267
  # Build DF robustly
159
268
  # Convert values to lists where possible, else to [val, val, val...]
160
- max_cols = 5
269
+ max_cols = 7
161
270
  rows = {}
162
271
  for k, v in read_metrics.items():
163
272
  if isinstance(v, (list, tuple, np.ndarray)):
@@ -179,6 +288,8 @@ def add_read_length_and_mapping_qc(
179
288
  "reference_length",
180
289
  "mapped_length",
181
290
  "mapping_quality",
291
+ "reference_start",
292
+ "reference_end",
182
293
  ],
183
294
  )
184
295
 
@@ -191,6 +302,8 @@ def add_read_length_and_mapping_qc(
191
302
  adata.obs["reference_length"] = df_reindexed["reference_length"].values
192
303
  adata.obs["read_quality"] = df_reindexed["read_quality"].values
193
304
  adata.obs["mapping_quality"] = df_reindexed["mapping_quality"].values
305
+ adata.obs["reference_start"] = df_reindexed["reference_start"].values
306
+ adata.obs["reference_end"] = df_reindexed["reference_end"].values
194
307
 
195
308
  # Compute ratio columns safely (avoid divide-by-zero and preserve NaN)
196
309
  # read_length_to_reference_length_ratio