smftools: smftools-0.2.1-py3-none-any.whl → smftools-0.2.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/conversion.yaml +11 -6
- smftools/config/deaminase.yaml +12 -7
- smftools/config/default.yaml +36 -25
- smftools/config/direct.yaml +25 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +109 -12
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1037 -362
- smftools/preprocessing/__init__.py +2 -0
- smftools/preprocessing/append_base_context.py +3 -3
- smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
- smftools/preprocessing/flag_duplicate_reads.py +1 -1
- smftools/readwrite.py +266 -140
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -2,6 +2,7 @@ from .add_read_length_and_mapping_qc import add_read_length_and_mapping_qc
|
|
|
2
2
|
from .append_base_context import append_base_context
|
|
3
3
|
from .append_binary_layer_by_base_context import append_binary_layer_by_base_context
|
|
4
4
|
from .binarize_on_Youden import binarize_on_Youden
|
|
5
|
+
from .binarize import binarize_adata
|
|
5
6
|
from .calculate_complexity import calculate_complexity
|
|
6
7
|
from .calculate_complexity_II import calculate_complexity_II
|
|
7
8
|
from .calculate_read_modification_stats import calculate_read_modification_stats
|
|
@@ -22,6 +23,7 @@ __all__ = [
|
|
|
22
23
|
"append_base_context",
|
|
23
24
|
"append_binary_layer_by_base_context",
|
|
24
25
|
"binarize_on_Youden",
|
|
26
|
+
"binarize_adata",
|
|
25
27
|
"calculate_complexity",
|
|
26
28
|
"calculate_read_modification_stats",
|
|
27
29
|
"calculate_coverage",
|
|
@@ -34,7 +34,7 @@ def append_base_context(adata,
|
|
|
34
34
|
site_types = []
|
|
35
35
|
|
|
36
36
|
if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
|
|
37
|
-
site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', '
|
|
37
|
+
site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'C_site']
|
|
38
38
|
|
|
39
39
|
if 'A' in mod_target_bases:
|
|
40
40
|
site_types += ['A_site']
|
|
@@ -70,7 +70,7 @@ def append_base_context(adata,
|
|
|
70
70
|
# Iterate through the sequence and apply the criteria
|
|
71
71
|
for i in range(1, len(sequence) - 1):
|
|
72
72
|
if sequence[i] == 'C':
|
|
73
|
-
boolean_dict[f'{cat}
|
|
73
|
+
boolean_dict[f'{cat}_C_site'][i] = True
|
|
74
74
|
if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
|
|
75
75
|
boolean_dict[f'{cat}_GpC_site'][i] = True
|
|
76
76
|
elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
|
|
@@ -83,7 +83,7 @@ def append_base_context(adata,
|
|
|
83
83
|
# Iterate through the sequence and apply the criteria
|
|
84
84
|
for i in range(1, len(sequence) - 1):
|
|
85
85
|
if sequence[i] == 'G':
|
|
86
|
-
boolean_dict[f'{cat}
|
|
86
|
+
boolean_dict[f'{cat}_C_site'][i] = True
|
|
87
87
|
if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
|
|
88
88
|
boolean_dict[f'{cat}_GpC_site'][i] = True
|
|
89
89
|
elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
|
|
@@ -15,7 +15,7 @@ def append_binary_layer_by_base_context(
|
|
|
15
15
|
- GpC_site_binary
|
|
16
16
|
- CpG_site_binary
|
|
17
17
|
- GpC_CpG_combined_site_binary (numeric sum where present; NaN where neither present)
|
|
18
|
-
-
|
|
18
|
+
- C_site_binary
|
|
19
19
|
- other_C_site_binary
|
|
20
20
|
|
|
21
21
|
Behavior:
|
|
@@ -48,7 +48,7 @@ def append_binary_layer_by_base_context(
|
|
|
48
48
|
references = adata.obs[reference_column].astype("category").cat.categories
|
|
49
49
|
reference_to_gpc_column = {ref: f"{ref}_GpC_site" for ref in references}
|
|
50
50
|
reference_to_cpg_column = {ref: f"{ref}_CpG_site" for ref in references}
|
|
51
|
-
reference_to_c_column = {ref: f"{ref}
|
|
51
|
+
reference_to_c_column = {ref: f"{ref}_C_site" for ref in references}
|
|
52
52
|
reference_to_other_c_column = {ref: f"{ref}_other_C_site" for ref in references}
|
|
53
53
|
|
|
54
54
|
# verify var columns exist and build boolean masks per ref (len = n_vars)
|
|
@@ -124,7 +124,7 @@ def append_binary_layer_by_base_context(
|
|
|
124
124
|
adata.layers['GpC_site_binary'] = masked_gpc
|
|
125
125
|
adata.layers['CpG_site_binary'] = masked_cpg
|
|
126
126
|
adata.layers['GpC_CpG_combined_site_binary'] = combined_sum
|
|
127
|
-
adata.layers['
|
|
127
|
+
adata.layers['C_site_binary'] = masked_any_c
|
|
128
128
|
adata.layers['other_C_site_binary'] = masked_other_c
|
|
129
129
|
|
|
130
130
|
if verbose:
|
|
@@ -134,7 +134,7 @@ def append_binary_layer_by_base_context(
|
|
|
134
134
|
print(f" GpC: {_filled_positions(masked_gpc)}")
|
|
135
135
|
print(f" CpG: {_filled_positions(masked_cpg)}")
|
|
136
136
|
print(f" GpC+CpG combined: {_filled_positions(combined_sum)}")
|
|
137
|
-
print(f"
|
|
137
|
+
print(f" C: {_filled_positions(masked_any_c)}")
|
|
138
138
|
print(f" other_C: {_filled_positions(masked_other_c)}")
|
|
139
139
|
|
|
140
140
|
# mark as done
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
def binarize_adata(adata, source="X", target_layer="binary", threshold=0.8):
    """
    Binarize a matrix against a threshold while preserving NaN entries.

    Non-NaN values strictly greater than ``threshold`` become 1, all other
    non-NaN values become 0, and NaN entries are left as NaN. The result is
    stored in ``adata.layers[target_layer]``; the source matrix itself is
    not modified and nothing is returned.

    Parameters:
        adata: object exposing ``.X`` and a mapping ``.layers``.
        source: "X" to read ``adata.X``, otherwise the name of a layer in
            ``adata.layers``.
        target_layer: key in ``adata.layers`` that receives the result.
        threshold: cut-off; the comparison is strict (``value > threshold``).

    Raises:
        KeyError: if ``source`` is not "X" and ``adata.layers`` has no such key.
    """
    matrix = adata.X if source == "X" else adata.layers[source]

    # np.isnan cannot operate on sparse containers, so anything exposing
    # ``toarray`` (the scipy.sparse convention) is densified first. Dense
    # input is copied so the source matrix is never modified in place.
    if hasattr(matrix, "toarray"):
        binarized = matrix.toarray()
    else:
        binarized = np.array(matrix, copy=True)

    # Only observed (non-NaN) entries are thresholded; NaN stays NaN.
    # The boolean result is cast to the array's own dtype on assignment.
    observed = ~np.isnan(binarized)
    binarized[observed] = binarized[observed] > threshold

    adata.layers[target_layer] = binarized
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
def binarize_on_Youden(adata, obs_column='Reference'):
|
|
1
|
+
def binarize_on_Youden(adata, obs_column='Reference', output_layer_name='binarized_methylation'):
|
|
2
2
|
"""
|
|
3
3
|
Binarize SMF values based on position thresholds determined by calculate_position_Youden.
|
|
4
4
|
|
|
@@ -42,4 +42,4 @@ def binarize_on_Youden(adata, obs_column='Reference'):
|
|
|
42
42
|
binarized_methylation[cat_mask, :] = binarized_matrix
|
|
43
43
|
|
|
44
44
|
# Store the binarized matrix in a new layer
|
|
45
|
-
adata.layers[
|
|
45
|
+
adata.layers[output_layer_name] = binarized_methylation
|
|
@@ -103,7 +103,7 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
|
|
|
103
103
|
probability_thresholding_list[position] = (0.8, np.nan)
|
|
104
104
|
title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {cat}'
|
|
105
105
|
plt.title(title)
|
|
106
|
-
save_name = output_directory
|
|
106
|
+
save_name = output_directory / f"{title}.png"
|
|
107
107
|
if save:
|
|
108
108
|
plt.savefig(save_name)
|
|
109
109
|
plt.close()
|
|
@@ -36,7 +36,7 @@ def calculate_read_modification_stats(adata,
|
|
|
36
36
|
site_types = []
|
|
37
37
|
|
|
38
38
|
if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
|
|
39
|
-
site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', '
|
|
39
|
+
site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'C_site']
|
|
40
40
|
|
|
41
41
|
if 'A' in mod_target_bases:
|
|
42
42
|
site_types += ['A_site']
|
|
@@ -31,9 +31,9 @@ def filter_reads_on_modification_thresholds(
|
|
|
31
31
|
- Otherwise, computes the relevant per-read metrics per-reference in batches
|
|
32
32
|
and writes them into adata.obs before filtering.
|
|
33
33
|
|
|
34
|
-
Parameters of interest
|
|
34
|
+
Parameters of interest :
|
|
35
35
|
- gpc_thresholds, cpg_thresholds, any_c_thresholds, a_thresholds:
|
|
36
|
-
each should be [min, max] (floats 0..1) or None.
|
|
36
|
+
each should be [min, max] (floats 0..1) or None. Thresholds are inclusive.
|
|
37
37
|
- use_other_c_as_background: require GpC/CpG > other_C background (if present).
|
|
38
38
|
- min_valid_fraction_positions_in_read_vs_ref: minimum fraction of valid sites
|
|
39
39
|
in the read vs reference (0..1). If None, this check is skipped.
|
|
@@ -53,7 +53,7 @@ def filter_reads_on_modification_thresholds(
|
|
|
53
53
|
col_pref = {
|
|
54
54
|
"GpC": ("Fraction_GpC_site_modified", f"Valid_GpC_site_in_read_vs_reference"),
|
|
55
55
|
"CpG": ("Fraction_CpG_site_modified", f"Valid_CpG_site_in_read_vs_reference"),
|
|
56
|
-
"C": ("
|
|
56
|
+
"C": ("Fraction_C_site_modified", f"Valid_C_site_in_read_vs_reference"),
|
|
57
57
|
"A": ("Fraction_A_site_modified", f"Valid_A_site_in_read_vs_reference"),
|
|
58
58
|
}.get(mod_type, (None, None))
|
|
59
59
|
return (col_pref[0] in adata.obs.columns) and (col_pref[1] in adata.obs.columns)
|
|
@@ -99,8 +99,8 @@ def filter_reads_on_modification_thresholds(
|
|
|
99
99
|
create_cols["Valid_CpG_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
|
|
100
100
|
create_cols["CpG_to_other_C_mod_ratio"] = np.full((n_obs,), np.nan)
|
|
101
101
|
if "C" in mod_target_bases:
|
|
102
|
-
create_cols["
|
|
103
|
-
create_cols["
|
|
102
|
+
create_cols["Fraction_C_site_modified"] = np.full((n_obs,), np.nan)
|
|
103
|
+
create_cols["Valid_C_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
|
|
104
104
|
if "A" in mod_target_bases:
|
|
105
105
|
create_cols["Fraction_A_site_modified"] = np.full((n_obs,), np.nan)
|
|
106
106
|
create_cols["Valid_A_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
|
|
@@ -227,7 +227,7 @@ def filter_reads_on_modification_thresholds(
|
|
|
227
227
|
# any C
|
|
228
228
|
if "C" in mod_target_bases:
|
|
229
229
|
for ref in refs:
|
|
230
|
-
_compute_for_ref_and_suffix(ref, "
|
|
230
|
+
_compute_for_ref_and_suffix(ref, "C_site", create_cols["Fraction_C_site_modified"], create_cols["Valid_C_site_in_read_vs_reference"])
|
|
231
231
|
|
|
232
232
|
# A
|
|
233
233
|
if "A" in mod_target_bases:
|
|
@@ -283,15 +283,15 @@ def filter_reads_on_modification_thresholds(
|
|
|
283
283
|
filtered = filtered[filtered.obs["GpC_to_other_C_mod_ratio"].astype(float) > 1]
|
|
284
284
|
if lo is not None:
|
|
285
285
|
s0 = filtered.n_obs
|
|
286
|
-
filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float)
|
|
286
|
+
filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) >= lo]
|
|
287
287
|
print(f"Removed {s0 - filtered.n_obs} reads below min GpC fraction {lo}")
|
|
288
288
|
if hi is not None:
|
|
289
289
|
s0 = filtered.n_obs
|
|
290
|
-
filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float)
|
|
290
|
+
filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) <= hi]
|
|
291
291
|
print(f"Removed {s0 - filtered.n_obs} reads above max GpC fraction {hi}")
|
|
292
292
|
if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_GpC_site_in_read_vs_reference" in filtered.obs.columns):
|
|
293
293
|
s0 = filtered.n_obs
|
|
294
|
-
filtered = filtered[filtered.obs["Valid_GpC_site_in_read_vs_reference"].astype(float)
|
|
294
|
+
filtered = filtered[filtered.obs["Valid_GpC_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
|
|
295
295
|
print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid GpC site fraction vs ref")
|
|
296
296
|
|
|
297
297
|
# CpG thresholds
|
|
@@ -301,15 +301,15 @@ def filter_reads_on_modification_thresholds(
|
|
|
301
301
|
filtered = filtered[filtered.obs["CpG_to_other_C_mod_ratio"].astype(float) > 1]
|
|
302
302
|
if lo is not None:
|
|
303
303
|
s0 = filtered.n_obs
|
|
304
|
-
filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float)
|
|
304
|
+
filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) >= lo]
|
|
305
305
|
print(f"Removed {s0 - filtered.n_obs} reads below min CpG fraction {lo}")
|
|
306
306
|
if hi is not None:
|
|
307
307
|
s0 = filtered.n_obs
|
|
308
|
-
filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float)
|
|
308
|
+
filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) <= hi]
|
|
309
309
|
print(f"Removed {s0 - filtered.n_obs} reads above max CpG fraction {hi}")
|
|
310
310
|
if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_CpG_site_in_read_vs_reference" in filtered.obs.columns):
|
|
311
311
|
s0 = filtered.n_obs
|
|
312
|
-
filtered = filtered[filtered.obs["Valid_CpG_site_in_read_vs_reference"].astype(float)
|
|
312
|
+
filtered = filtered[filtered.obs["Valid_CpG_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
|
|
313
313
|
print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid CpG site fraction vs ref")
|
|
314
314
|
|
|
315
315
|
# any C thresholds
|
|
@@ -317,15 +317,15 @@ def filter_reads_on_modification_thresholds(
|
|
|
317
317
|
lo, hi = _unpack_minmax(any_c_thresholds)
|
|
318
318
|
if lo is not None:
|
|
319
319
|
s0 = filtered.n_obs
|
|
320
|
-
filtered = filtered[filtered.obs["
|
|
320
|
+
filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) >= lo]
|
|
321
321
|
print(f"Removed {s0 - filtered.n_obs} reads below min any-C fraction {lo}")
|
|
322
322
|
if hi is not None:
|
|
323
323
|
s0 = filtered.n_obs
|
|
324
|
-
filtered = filtered[filtered.obs["
|
|
324
|
+
filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) <= hi]
|
|
325
325
|
print(f"Removed {s0 - filtered.n_obs} reads above max any-C fraction {hi}")
|
|
326
|
-
if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("
|
|
326
|
+
if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_C_site_in_read_vs_reference" in filtered.obs.columns):
|
|
327
327
|
s0 = filtered.n_obs
|
|
328
|
-
filtered = filtered[filtered.obs["
|
|
328
|
+
filtered = filtered[filtered.obs["Valid_C_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
|
|
329
329
|
print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid any-C site fraction vs ref")
|
|
330
330
|
|
|
331
331
|
# A thresholds
|
|
@@ -333,15 +333,15 @@ def filter_reads_on_modification_thresholds(
|
|
|
333
333
|
lo, hi = _unpack_minmax(a_thresholds)
|
|
334
334
|
if lo is not None:
|
|
335
335
|
s0 = filtered.n_obs
|
|
336
|
-
filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float)
|
|
336
|
+
filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) >= lo]
|
|
337
337
|
print(f"Removed {s0 - filtered.n_obs} reads below min A fraction {lo}")
|
|
338
338
|
if hi is not None:
|
|
339
339
|
s0 = filtered.n_obs
|
|
340
|
-
filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float)
|
|
340
|
+
filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) <= hi]
|
|
341
341
|
print(f"Removed {s0 - filtered.n_obs} reads above max A fraction {hi}")
|
|
342
342
|
if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_A_site_in_read_vs_reference" in filtered.obs.columns):
|
|
343
343
|
s0 = filtered.n_obs
|
|
344
|
-
filtered = filtered[filtered.obs["Valid_A_site_in_read_vs_reference"].astype(float)
|
|
344
|
+
filtered = filtered[filtered.obs["Valid_A_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
|
|
345
345
|
print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid A site fraction vs ref")
|
|
346
346
|
|
|
347
347
|
filtered = filtered.copy()
|