smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -15,7 +15,7 @@ def filter_reads_on_modification_thresholds(
|
|
|
15
15
|
a_thresholds: Optional[Sequence[float]] = None,
|
|
16
16
|
use_other_c_as_background: bool = False,
|
|
17
17
|
min_valid_fraction_positions_in_read_vs_ref: Optional[float] = None,
|
|
18
|
-
uns_flag: str = '
|
|
18
|
+
uns_flag: str = 'filter_reads_on_modification_thresholds_performed',
|
|
19
19
|
bypass: bool = False,
|
|
20
20
|
force_redo: bool = False,
|
|
21
21
|
reference_column: str = 'Reference_strand',
|
|
@@ -31,9 +31,9 @@ def filter_reads_on_modification_thresholds(
|
|
|
31
31
|
- Otherwise, computes the relevant per-read metrics per-reference in batches
|
|
32
32
|
and writes them into adata.obs before filtering.
|
|
33
33
|
|
|
34
|
-
Parameters of interest
|
|
34
|
+
Parameters of interest :
|
|
35
35
|
- gpc_thresholds, cpg_thresholds, any_c_thresholds, a_thresholds:
|
|
36
|
-
each should be [min, max] (floats 0..1) or None.
|
|
36
|
+
each should be [min, max] (floats 0..1) or None. Thresholds are inclusive.
|
|
37
37
|
- use_other_c_as_background: require GpC/CpG > other_C background (if present).
|
|
38
38
|
- min_valid_fraction_positions_in_read_vs_ref: minimum fraction of valid sites
|
|
39
39
|
in the read vs reference (0..1). If None, this check is skipped.
|
|
@@ -53,7 +53,7 @@ def filter_reads_on_modification_thresholds(
|
|
|
53
53
|
col_pref = {
|
|
54
54
|
"GpC": ("Fraction_GpC_site_modified", f"Valid_GpC_site_in_read_vs_reference"),
|
|
55
55
|
"CpG": ("Fraction_CpG_site_modified", f"Valid_CpG_site_in_read_vs_reference"),
|
|
56
|
-
"C": ("
|
|
56
|
+
"C": ("Fraction_C_site_modified", f"Valid_C_site_in_read_vs_reference"),
|
|
57
57
|
"A": ("Fraction_A_site_modified", f"Valid_A_site_in_read_vs_reference"),
|
|
58
58
|
}.get(mod_type, (None, None))
|
|
59
59
|
return (col_pref[0] in adata.obs.columns) and (col_pref[1] in adata.obs.columns)
|
|
@@ -99,8 +99,8 @@ def filter_reads_on_modification_thresholds(
|
|
|
99
99
|
create_cols["Valid_CpG_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
|
|
100
100
|
create_cols["CpG_to_other_C_mod_ratio"] = np.full((n_obs,), np.nan)
|
|
101
101
|
if "C" in mod_target_bases:
|
|
102
|
-
create_cols["
|
|
103
|
-
create_cols["
|
|
102
|
+
create_cols["Fraction_C_site_modified"] = np.full((n_obs,), np.nan)
|
|
103
|
+
create_cols["Valid_C_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
|
|
104
104
|
if "A" in mod_target_bases:
|
|
105
105
|
create_cols["Fraction_A_site_modified"] = np.full((n_obs,), np.nan)
|
|
106
106
|
create_cols["Valid_A_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
|
|
@@ -227,7 +227,7 @@ def filter_reads_on_modification_thresholds(
|
|
|
227
227
|
# any C
|
|
228
228
|
if "C" in mod_target_bases:
|
|
229
229
|
for ref in refs:
|
|
230
|
-
_compute_for_ref_and_suffix(ref, "
|
|
230
|
+
_compute_for_ref_and_suffix(ref, "C_site", create_cols["Fraction_C_site_modified"], create_cols["Valid_C_site_in_read_vs_reference"])
|
|
231
231
|
|
|
232
232
|
# A
|
|
233
233
|
if "A" in mod_target_bases:
|
|
@@ -283,15 +283,15 @@ def filter_reads_on_modification_thresholds(
|
|
|
283
283
|
filtered = filtered[filtered.obs["GpC_to_other_C_mod_ratio"].astype(float) > 1]
|
|
284
284
|
if lo is not None:
|
|
285
285
|
s0 = filtered.n_obs
|
|
286
|
-
filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float)
|
|
286
|
+
filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) >= lo]
|
|
287
287
|
print(f"Removed {s0 - filtered.n_obs} reads below min GpC fraction {lo}")
|
|
288
288
|
if hi is not None:
|
|
289
289
|
s0 = filtered.n_obs
|
|
290
|
-
filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float)
|
|
290
|
+
filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) <= hi]
|
|
291
291
|
print(f"Removed {s0 - filtered.n_obs} reads above max GpC fraction {hi}")
|
|
292
292
|
if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_GpC_site_in_read_vs_reference" in filtered.obs.columns):
|
|
293
293
|
s0 = filtered.n_obs
|
|
294
|
-
filtered = filtered[filtered.obs["Valid_GpC_site_in_read_vs_reference"].astype(float)
|
|
294
|
+
filtered = filtered[filtered.obs["Valid_GpC_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
|
|
295
295
|
print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid GpC site fraction vs ref")
|
|
296
296
|
|
|
297
297
|
# CpG thresholds
|
|
@@ -301,15 +301,15 @@ def filter_reads_on_modification_thresholds(
|
|
|
301
301
|
filtered = filtered[filtered.obs["CpG_to_other_C_mod_ratio"].astype(float) > 1]
|
|
302
302
|
if lo is not None:
|
|
303
303
|
s0 = filtered.n_obs
|
|
304
|
-
filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float)
|
|
304
|
+
filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) >= lo]
|
|
305
305
|
print(f"Removed {s0 - filtered.n_obs} reads below min CpG fraction {lo}")
|
|
306
306
|
if hi is not None:
|
|
307
307
|
s0 = filtered.n_obs
|
|
308
|
-
filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float)
|
|
308
|
+
filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) <= hi]
|
|
309
309
|
print(f"Removed {s0 - filtered.n_obs} reads above max CpG fraction {hi}")
|
|
310
310
|
if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_CpG_site_in_read_vs_reference" in filtered.obs.columns):
|
|
311
311
|
s0 = filtered.n_obs
|
|
312
|
-
filtered = filtered[filtered.obs["Valid_CpG_site_in_read_vs_reference"].astype(float)
|
|
312
|
+
filtered = filtered[filtered.obs["Valid_CpG_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
|
|
313
313
|
print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid CpG site fraction vs ref")
|
|
314
314
|
|
|
315
315
|
# any C thresholds
|
|
@@ -317,15 +317,15 @@ def filter_reads_on_modification_thresholds(
|
|
|
317
317
|
lo, hi = _unpack_minmax(any_c_thresholds)
|
|
318
318
|
if lo is not None:
|
|
319
319
|
s0 = filtered.n_obs
|
|
320
|
-
filtered = filtered[filtered.obs["
|
|
320
|
+
filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) >= lo]
|
|
321
321
|
print(f"Removed {s0 - filtered.n_obs} reads below min any-C fraction {lo}")
|
|
322
322
|
if hi is not None:
|
|
323
323
|
s0 = filtered.n_obs
|
|
324
|
-
filtered = filtered[filtered.obs["
|
|
324
|
+
filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) <= hi]
|
|
325
325
|
print(f"Removed {s0 - filtered.n_obs} reads above max any-C fraction {hi}")
|
|
326
|
-
if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("
|
|
326
|
+
if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_C_site_in_read_vs_reference" in filtered.obs.columns):
|
|
327
327
|
s0 = filtered.n_obs
|
|
328
|
-
filtered = filtered[filtered.obs["
|
|
328
|
+
filtered = filtered[filtered.obs["Valid_C_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
|
|
329
329
|
print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid any-C site fraction vs ref")
|
|
330
330
|
|
|
331
331
|
# A thresholds
|
|
@@ -333,15 +333,15 @@ def filter_reads_on_modification_thresholds(
|
|
|
333
333
|
lo, hi = _unpack_minmax(a_thresholds)
|
|
334
334
|
if lo is not None:
|
|
335
335
|
s0 = filtered.n_obs
|
|
336
|
-
filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float)
|
|
336
|
+
filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) >= lo]
|
|
337
337
|
print(f"Removed {s0 - filtered.n_obs} reads below min A fraction {lo}")
|
|
338
338
|
if hi is not None:
|
|
339
339
|
s0 = filtered.n_obs
|
|
340
|
-
filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float)
|
|
340
|
+
filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) <= hi]
|
|
341
341
|
print(f"Removed {s0 - filtered.n_obs} reads above max A fraction {hi}")
|
|
342
342
|
if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_A_site_in_read_vs_reference" in filtered.obs.columns):
|
|
343
343
|
s0 = filtered.n_obs
|
|
344
|
-
filtered = filtered[filtered.obs["Valid_A_site_in_read_vs_reference"].astype(float)
|
|
344
|
+
filtered = filtered[filtered.obs["Valid_A_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
|
|
345
345
|
print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid A site fraction vs ref")
|
|
346
346
|
|
|
347
347
|
filtered = filtered.copy()
|
|
@@ -13,7 +13,7 @@ import pandas as pd
|
|
|
13
13
|
import matplotlib.pyplot as plt
|
|
14
14
|
from tqdm import tqdm
|
|
15
15
|
|
|
16
|
-
from ..
|
|
16
|
+
from ..readwrite import make_dirs
|
|
17
17
|
|
|
18
18
|
# optional imports for clustering / PCA / KDE
|
|
19
19
|
try:
|
|
@@ -77,7 +77,7 @@ def flag_duplicate_reads(
|
|
|
77
77
|
sample_col: str = "Barcode",
|
|
78
78
|
output_directory: Optional[str] = None,
|
|
79
79
|
metric_keys: Union[str, List[str]] = ("Fraction_any_C_site_modified",),
|
|
80
|
-
uns_flag: str = "
|
|
80
|
+
uns_flag: str = "flag_duplicate_reads_performed",
|
|
81
81
|
uns_filtered_flag: str = "read_duplicates_removed",
|
|
82
82
|
bypass: bool = False,
|
|
83
83
|
force_redo: bool = False,
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
## reindex_references_adata
|
|
2
|
+
|
|
3
|
+
def reindex_references_adata(adata,
                             reference_col="Reference_strand",
                             offsets=None,
                             new_col="reindexed",
                             uns_flag='reindex_references_adata_performed',
                             force_redo=False):
    """Add per-reference, offset-shifted coordinate columns to ``adata.var``.

    For each unique value ``ref`` found in ``adata.obs[reference_col]`` that
    is also present in ``offsets``, a column named ``f"{ref}_{new_col}"`` is
    written to ``adata.var``. Its values are ``adata.var_names`` cast to
    ``int`` plus ``offsets[ref]``. References that are absent from
    ``offsets`` are skipped without error.

    Parameters
    ----------
    adata
        Object exposing ``uns``, ``obs``, ``var`` and ``var_names``
        (presumably an AnnData object -- confirm against callers).
        It is modified in place.
    reference_col
        Name of the ``adata.obs`` column whose unique values identify the
        references to reindex.
    offsets
        Container supporting ``in`` and ``[]`` lookups by reference value
        (e.g. a dict) giving the amount to add for each reference. When
        ``None``, no columns are written.
    new_col
        Suffix used to build the name of each new ``adata.var`` column.
    uns_flag
        Key in ``adata.uns`` recording that this step has been run.
    force_redo
        When true, run even if ``adata.uns[uns_flag]`` is already truthy.

    Returns
    -------
    None

    Notes
    -----
    ``uns_flag`` is set to ``True`` on every run that is not skipped,
    including runs where ``offsets`` is ``None`` or matches no reference.
    The ``int`` cast of ``adata.var_names`` is not guarded, so any error it
    raises for non-numeric names propagates to the caller.
    """
    # Only run if not already performed (unless a redo is explicitly forced).
    already = bool(adata.uns.get(uns_flag, False))
    if already and not force_redo:
        return None

    if offsets is not None:
        # Cast once, outside the loop: the same coordinates are reused for
        # every reference and only the added offset differs.
        var_coords = adata.var_names.astype(int)

        for ref in adata.obs[reference_col].unique():
            if ref not in offsets:
                # No offset supplied for this reference; leave var untouched.
                continue

            # One new var column per reference: position + that ref's offset.
            adata.var[f"{ref}_{new_col}"] = var_coords + offsets[ref]

    # Mark as done so later calls are skipped unless force_redo is set.
    adata.uns[uns_flag] = True

    print("Reindexing complete!")
    return None
|