smftools 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +32 -6
- smftools/cli/hmm_adata.py +232 -31
- smftools/cli/latent_adata.py +318 -0
- smftools/cli/load_adata.py +77 -73
- smftools/cli/preprocess_adata.py +178 -53
- smftools/cli/spatial_adata.py +149 -101
- smftools/cli_entry.py +12 -0
- smftools/config/conversion.yaml +11 -1
- smftools/config/default.yaml +38 -1
- smftools/config/experiment_config.py +53 -1
- smftools/constants.py +65 -0
- smftools/hmm/HMM.py +88 -0
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/converted_BAM_to_adata.py +584 -163
- smftools/informatics/h5ad_functions.py +115 -2
- smftools/informatics/modkit_extract_to_adata.py +1003 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +9 -0
- smftools/plotting/general_plotting.py +2411 -628
- smftools/plotting/hmm_plotting.py +85 -7
- smftools/preprocessing/__init__.py +1 -0
- smftools/preprocessing/append_base_context.py +17 -17
- smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/readwrite.py +53 -17
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +4 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +119 -0
- smftools/tools/calculate_umap.py +91 -8
- smftools/tools/rolling_nn_distance.py +235 -0
- smftools/tools/tensor_factorization.py +169 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/METADATA +8 -6
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/RECORD +42 -35
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -84,6 +84,112 @@ def add_demux_type_annotation(
|
|
|
84
84
|
return adata
|
|
85
85
|
|
|
86
86
|
|
|
87
|
+
def add_read_tag_annotations(
|
|
88
|
+
adata,
|
|
89
|
+
bam_files: Optional[List[str]] = None,
|
|
90
|
+
read_tags: Optional[Dict[str, Dict[str, object]]] = None,
|
|
91
|
+
tag_names: Optional[List[str]] = None,
|
|
92
|
+
include_flags: bool = True,
|
|
93
|
+
include_cigar: bool = True,
|
|
94
|
+
extract_read_tags_from_bam_callable=None,
|
|
95
|
+
samtools_backend: str | None = "auto",
|
|
96
|
+
):
|
|
97
|
+
"""Populate adata.obs with read tag metadata.
|
|
98
|
+
|
|
99
|
+
Args:
|
|
100
|
+
adata: AnnData to annotate (modified in-place).
|
|
101
|
+
bam_files: Optional list of BAM files to extract tags from.
|
|
102
|
+
read_tags: Optional mapping of read name to tag dict.
|
|
103
|
+
tag_names: Optional list of BAM tag names to extract (e.g. ["NM", "MD", "MM", "ML"]).
|
|
104
|
+
include_flags: Whether to add a FLAGS list column.
|
|
105
|
+
include_cigar: Whether to add the CIGAR string column.
|
|
106
|
+
extract_read_tags_from_bam_callable: Optional callable to extract tags from a BAM.
|
|
107
|
+
samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
|
|
108
|
+
|
|
109
|
+
Returns:
|
|
110
|
+
None (mutates adata in-place).
|
|
111
|
+
"""
|
|
112
|
+
if read_tags is None:
|
|
113
|
+
read_tags = {}
|
|
114
|
+
if bam_files:
|
|
115
|
+
extractor = extract_read_tags_from_bam_callable or globals().get(
|
|
116
|
+
"extract_read_tags_from_bam"
|
|
117
|
+
)
|
|
118
|
+
if extractor is None:
|
|
119
|
+
raise ValueError(
|
|
120
|
+
"No `read_tags` provided and `extract_read_tags_from_bam` not found."
|
|
121
|
+
)
|
|
122
|
+
for bam in bam_files:
|
|
123
|
+
bam_read_tags = extractor(
|
|
124
|
+
bam,
|
|
125
|
+
tag_names=tag_names,
|
|
126
|
+
include_flags=include_flags,
|
|
127
|
+
include_cigar=include_cigar,
|
|
128
|
+
samtools_backend=samtools_backend,
|
|
129
|
+
)
|
|
130
|
+
if not isinstance(bam_read_tags, dict):
|
|
131
|
+
raise ValueError(f"extract_read_tags_from_bam returned non-dict for {bam}")
|
|
132
|
+
read_tags.update(bam_read_tags)
|
|
133
|
+
|
|
134
|
+
if not read_tags:
|
|
135
|
+
return
|
|
136
|
+
|
|
137
|
+
df = pd.DataFrame.from_dict(read_tags, orient="index")
|
|
138
|
+
df_reindexed = df.reindex(adata.obs_names)
|
|
139
|
+
for column in df_reindexed.columns:
|
|
140
|
+
adata.obs[column] = df_reindexed[column].values
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def add_secondary_supplementary_alignment_flags(
|
|
144
|
+
adata,
|
|
145
|
+
bam_path: str | Path,
|
|
146
|
+
*,
|
|
147
|
+
uns_flag: str = "add_secondary_supplementary_flags_performed",
|
|
148
|
+
bypass: bool = False,
|
|
149
|
+
force_redo: bool = False,
|
|
150
|
+
samtools_backend: str | None = "auto",
|
|
151
|
+
) -> None:
|
|
152
|
+
"""Annotate whether reads have secondary/supplementary alignments.
|
|
153
|
+
|
|
154
|
+
Args:
|
|
155
|
+
adata: AnnData to annotate (modified in-place).
|
|
156
|
+
bam_path: Path to the aligned/sorted BAM to scan.
|
|
157
|
+
uns_flag: Flag in ``adata.uns`` indicating prior completion.
|
|
158
|
+
bypass: Whether to skip annotation.
|
|
159
|
+
force_redo: Whether to recompute even if ``uns_flag`` is set.
|
|
160
|
+
samtools_backend: Backend selection for samtools-compatible operations (auto|python|cli).
|
|
161
|
+
"""
|
|
162
|
+
already = bool(adata.uns.get(uns_flag, False))
|
|
163
|
+
if (already and not force_redo) or bypass:
|
|
164
|
+
return
|
|
165
|
+
|
|
166
|
+
from .bam_functions import (
|
|
167
|
+
extract_secondary_supplementary_alignment_spans,
|
|
168
|
+
find_secondary_supplementary_read_names,
|
|
169
|
+
)
|
|
170
|
+
|
|
171
|
+
secondary_reads, supplementary_reads = find_secondary_supplementary_read_names(
|
|
172
|
+
bam_path,
|
|
173
|
+
adata.obs_names,
|
|
174
|
+
samtools_backend=samtools_backend,
|
|
175
|
+
)
|
|
176
|
+
secondary_spans, supplementary_spans = extract_secondary_supplementary_alignment_spans(
|
|
177
|
+
bam_path,
|
|
178
|
+
adata.obs_names,
|
|
179
|
+
samtools_backend=samtools_backend,
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
adata.obs["has_secondary_alignment"] = adata.obs_names.isin(secondary_reads)
|
|
183
|
+
adata.obs["has_supplementary_alignment"] = adata.obs_names.isin(supplementary_reads)
|
|
184
|
+
adata.obs["secondary_alignment_spans"] = [
|
|
185
|
+
secondary_spans.get(read_name) for read_name in adata.obs_names
|
|
186
|
+
]
|
|
187
|
+
adata.obs["supplementary_alignment_spans"] = [
|
|
188
|
+
supplementary_spans.get(read_name) for read_name in adata.obs_names
|
|
189
|
+
]
|
|
190
|
+
adata.uns[uns_flag] = True
|
|
191
|
+
|
|
192
|
+
|
|
87
193
|
def add_read_length_and_mapping_qc(
|
|
88
194
|
adata,
|
|
89
195
|
bam_files: Optional[List[str]] = None,
|
|
@@ -104,7 +210,8 @@ def add_read_length_and_mapping_qc(
|
|
|
104
210
|
bam_files
|
|
105
211
|
Optional list of BAM files to extract metrics from. Ignored if read_metrics supplied.
|
|
106
212
|
read_metrics
|
|
107
|
-
Optional dict mapping obs_name -> [read_length, read_quality, reference_length, mapped_length,
|
|
213
|
+
Optional dict mapping obs_name -> [read_length, read_quality, reference_length, mapped_length,
|
|
214
|
+
mapping_quality, reference_start, reference_end]
|
|
108
215
|
If provided, this will be used directly and bam_files will be ignored.
|
|
109
216
|
uns_flag
|
|
110
217
|
key in final_adata.uns used to record that QC was performed (kept the name with original misspelling).
|
|
@@ -154,10 +261,12 @@ def add_read_length_and_mapping_qc(
|
|
|
154
261
|
adata.obs["reference_length"] = np.full(n, np.nan)
|
|
155
262
|
adata.obs["read_quality"] = np.full(n, np.nan)
|
|
156
263
|
adata.obs["mapping_quality"] = np.full(n, np.nan)
|
|
264
|
+
adata.obs["reference_start"] = np.full(n, np.nan)
|
|
265
|
+
adata.obs["reference_end"] = np.full(n, np.nan)
|
|
157
266
|
else:
|
|
158
267
|
# Build DF robustly
|
|
159
268
|
# Convert values to lists where possible, else to [val, val, val...]
|
|
160
|
-
max_cols =
|
|
269
|
+
max_cols = 7
|
|
161
270
|
rows = {}
|
|
162
271
|
for k, v in read_metrics.items():
|
|
163
272
|
if isinstance(v, (list, tuple, np.ndarray)):
|
|
@@ -179,6 +288,8 @@ def add_read_length_and_mapping_qc(
|
|
|
179
288
|
"reference_length",
|
|
180
289
|
"mapped_length",
|
|
181
290
|
"mapping_quality",
|
|
291
|
+
"reference_start",
|
|
292
|
+
"reference_end",
|
|
182
293
|
],
|
|
183
294
|
)
|
|
184
295
|
|
|
@@ -191,6 +302,8 @@ def add_read_length_and_mapping_qc(
|
|
|
191
302
|
adata.obs["reference_length"] = df_reindexed["reference_length"].values
|
|
192
303
|
adata.obs["read_quality"] = df_reindexed["read_quality"].values
|
|
193
304
|
adata.obs["mapping_quality"] = df_reindexed["mapping_quality"].values
|
|
305
|
+
adata.obs["reference_start"] = df_reindexed["reference_start"].values
|
|
306
|
+
adata.obs["reference_end"] = df_reindexed["reference_end"].values
|
|
194
307
|
|
|
195
308
|
# Compute ratio columns safely (avoid divide-by-zero and preserve NaN)
|
|
196
309
|
# read_length_to_reference_length_ratio
|