smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -90,7 +90,7 @@ def plot_volcano_relative_risk(
|
|
|
90
90
|
safe_name = f"{ref}_{group_label}".replace("=", "").replace("__", "_").replace(",", "_").replace(" ", "_")
|
|
91
91
|
out_file = os.path.join(save_path, f"{safe_name}.png")
|
|
92
92
|
plt.savefig(out_file, dpi=300)
|
|
93
|
-
print(f"
|
|
93
|
+
print(f"Saved: {out_file}")
|
|
94
94
|
|
|
95
95
|
plt.show()
|
|
96
96
|
|
|
@@ -449,7 +449,7 @@ def plot_positionwise_matrix_grid(
|
|
|
449
449
|
os.makedirs(save_path, exist_ok=True)
|
|
450
450
|
fname = outer_label.replace("_", "").replace("=", "") + ".png"
|
|
451
451
|
plt.savefig(os.path.join(save_path, fname), dpi=300, bbox_inches='tight')
|
|
452
|
-
print(f"
|
|
452
|
+
print(f"Saved {fname}")
|
|
453
453
|
|
|
454
454
|
plt.close(fig)
|
|
455
455
|
|
|
@@ -459,4 +459,4 @@ def plot_positionwise_matrix_grid(
|
|
|
459
459
|
for outer_label in parsed['outer'].unique():
|
|
460
460
|
plot_one_grid(outer_label)
|
|
461
461
|
|
|
462
|
-
print("
|
|
462
|
+
print("Finished plotting all grids.")
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
from .add_read_length_and_mapping_qc import add_read_length_and_mapping_qc
|
|
2
1
|
from .append_base_context import append_base_context
|
|
3
2
|
from .append_binary_layer_by_base_context import append_binary_layer_by_base_context
|
|
4
3
|
from .binarize_on_Youden import binarize_on_Youden
|
|
5
|
-
from .
|
|
4
|
+
from .binarize import binarize_adata
|
|
6
5
|
from .calculate_complexity_II import calculate_complexity_II
|
|
7
6
|
from .calculate_read_modification_stats import calculate_read_modification_stats
|
|
8
7
|
from .calculate_coverage import calculate_coverage
|
|
@@ -15,14 +14,15 @@ from .filter_reads_on_length_quality_mapping import filter_reads_on_length_quali
|
|
|
15
14
|
from .invert_adata import invert_adata
|
|
16
15
|
from .load_sample_sheet import load_sample_sheet
|
|
17
16
|
from .flag_duplicate_reads import flag_duplicate_reads
|
|
17
|
+
from .reindex_references_adata import reindex_references_adata
|
|
18
18
|
from .subsample_adata import subsample_adata
|
|
19
19
|
|
|
20
20
|
__all__ = [
|
|
21
|
-
"add_read_length_and_mapping_qc",
|
|
22
21
|
"append_base_context",
|
|
23
22
|
"append_binary_layer_by_base_context",
|
|
24
23
|
"binarize_on_Youden",
|
|
25
|
-
"
|
|
24
|
+
"binarize_adata",
|
|
25
|
+
"calculate_complexity_II",
|
|
26
26
|
"calculate_read_modification_stats",
|
|
27
27
|
"calculate_coverage",
|
|
28
28
|
"calculate_position_Youden",
|
|
@@ -1,18 +1,19 @@
|
|
|
1
1
|
def append_base_context(adata,
|
|
2
|
-
|
|
2
|
+
ref_column='Reference_strand',
|
|
3
3
|
use_consensus=False,
|
|
4
4
|
native=False,
|
|
5
5
|
mod_target_bases=['GpC', 'CpG'],
|
|
6
6
|
bypass=False,
|
|
7
7
|
force_redo=False,
|
|
8
|
-
uns_flag='
|
|
8
|
+
uns_flag='append_base_context_performed'
|
|
9
9
|
):
|
|
10
10
|
"""
|
|
11
11
|
Adds nucleobase context to the position within the given category. When use_consensus is True, it uses the consensus sequence, otherwise it defaults to the FASTA sequence.
|
|
12
|
+
This needs to be performed prior to AnnData inversion step.
|
|
12
13
|
|
|
13
14
|
Parameters:
|
|
14
15
|
adata (AnnData): The input adata object.
|
|
15
|
-
|
|
16
|
+
ref_column (str): The observation column in which to stratify on. Default is 'Reference_strand', which should not be changed for most purposes.
|
|
16
17
|
use_consensus (bool): A truth statement indicating whether to use the consensus sequence from the reads mapped to the reference. If False, the reference FASTA is used instead.
|
|
17
18
|
native (bool): If False, perform conversion SMF assumptions. If True, perform native SMF assumptions
|
|
18
19
|
mod_target_bases (list): Base contexts that may be modified.
|
|
@@ -30,68 +31,69 @@ def append_base_context(adata,
|
|
|
30
31
|
return
|
|
31
32
|
|
|
32
33
|
print('Adding base context based on reference FASTA sequence for sample')
|
|
33
|
-
|
|
34
|
+
references = adata.obs[ref_column].cat.categories
|
|
34
35
|
site_types = []
|
|
35
36
|
|
|
36
37
|
if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
|
|
37
|
-
site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', '
|
|
38
|
+
site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'C_site']
|
|
38
39
|
|
|
39
40
|
if 'A' in mod_target_bases:
|
|
40
41
|
site_types += ['A_site']
|
|
41
42
|
|
|
42
|
-
for
|
|
43
|
+
for ref in references:
|
|
43
44
|
# Assess if the strand is the top or bottom strand converted
|
|
44
|
-
if 'top' in
|
|
45
|
+
if 'top' in ref:
|
|
45
46
|
strand = 'top'
|
|
46
|
-
elif 'bottom' in
|
|
47
|
+
elif 'bottom' in ref:
|
|
47
48
|
strand = 'bottom'
|
|
48
49
|
|
|
49
50
|
if native:
|
|
50
|
-
basename =
|
|
51
|
+
basename = ref.split(f"_{strand}")[0]
|
|
51
52
|
if use_consensus:
|
|
52
53
|
sequence = adata.uns[f'{basename}_consensus_sequence']
|
|
53
54
|
else:
|
|
54
55
|
# This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
|
|
55
56
|
sequence = adata.uns[f'{basename}_FASTA_sequence']
|
|
56
57
|
else:
|
|
57
|
-
basename =
|
|
58
|
+
basename = ref.split(f"_{strand}")[0]
|
|
58
59
|
if use_consensus:
|
|
59
60
|
sequence = adata.uns[f'{basename}_consensus_sequence']
|
|
60
61
|
else:
|
|
61
62
|
# This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
|
|
62
63
|
sequence = adata.uns[f'{basename}_FASTA_sequence']
|
|
64
|
+
|
|
63
65
|
# Init a dict keyed by reference site type that points to a bool of whether the position is that site type.
|
|
64
66
|
boolean_dict = {}
|
|
65
67
|
for site_type in site_types:
|
|
66
|
-
boolean_dict[f'{
|
|
68
|
+
boolean_dict[f'{ref}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
|
|
67
69
|
|
|
68
70
|
if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
|
|
69
71
|
if strand == 'top':
|
|
70
72
|
# Iterate through the sequence and apply the criteria
|
|
71
73
|
for i in range(1, len(sequence) - 1):
|
|
72
74
|
if sequence[i] == 'C':
|
|
73
|
-
boolean_dict[f'{
|
|
75
|
+
boolean_dict[f'{ref}_C_site'][i] = True
|
|
74
76
|
if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
|
|
75
|
-
boolean_dict[f'{
|
|
77
|
+
boolean_dict[f'{ref}_GpC_site'][i] = True
|
|
76
78
|
elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
|
|
77
|
-
boolean_dict[f'{
|
|
79
|
+
boolean_dict[f'{ref}_ambiguous_GpC_CpG_site'][i] = True
|
|
78
80
|
elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
|
|
79
|
-
boolean_dict[f'{
|
|
81
|
+
boolean_dict[f'{ref}_CpG_site'][i] = True
|
|
80
82
|
elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
|
|
81
|
-
boolean_dict[f'{
|
|
83
|
+
boolean_dict[f'{ref}_other_C_site'][i] = True
|
|
82
84
|
elif strand == 'bottom':
|
|
83
85
|
# Iterate through the sequence and apply the criteria
|
|
84
86
|
for i in range(1, len(sequence) - 1):
|
|
85
87
|
if sequence[i] == 'G':
|
|
86
|
-
boolean_dict[f'{
|
|
88
|
+
boolean_dict[f'{ref}_C_site'][i] = True
|
|
87
89
|
if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
|
|
88
|
-
boolean_dict[f'{
|
|
90
|
+
boolean_dict[f'{ref}_GpC_site'][i] = True
|
|
89
91
|
elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
|
|
90
|
-
boolean_dict[f'{
|
|
92
|
+
boolean_dict[f'{ref}_ambiguous_GpC_CpG_site'][i] = True
|
|
91
93
|
elif sequence[i - 1] == 'C' and sequence[i + 1] != 'C':
|
|
92
|
-
boolean_dict[f'{
|
|
94
|
+
boolean_dict[f'{ref}_CpG_site'][i] = True
|
|
93
95
|
elif sequence[i - 1] != 'C' and sequence[i + 1] != 'C':
|
|
94
|
-
boolean_dict[f'{
|
|
96
|
+
boolean_dict[f'{ref}_other_C_site'][i] = True
|
|
95
97
|
else:
|
|
96
98
|
print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
|
|
97
99
|
|
|
@@ -100,21 +102,28 @@ def append_base_context(adata,
|
|
|
100
102
|
# Iterate through the sequence and apply the criteria
|
|
101
103
|
for i in range(1, len(sequence) - 1):
|
|
102
104
|
if sequence[i] == 'A':
|
|
103
|
-
boolean_dict[f'{
|
|
105
|
+
boolean_dict[f'{ref}_A_site'][i] = True
|
|
104
106
|
elif strand == 'bottom':
|
|
105
107
|
# Iterate through the sequence and apply the criteria
|
|
106
108
|
for i in range(1, len(sequence) - 1):
|
|
107
109
|
if sequence[i] == 'T':
|
|
108
|
-
boolean_dict[f'{
|
|
110
|
+
boolean_dict[f'{ref}_A_site'][i] = True
|
|
109
111
|
else:
|
|
110
112
|
print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
|
|
111
113
|
|
|
112
114
|
for site_type in site_types:
|
|
113
|
-
|
|
115
|
+
# Site context annotations for each reference
|
|
116
|
+
adata.var[f'{ref}_{site_type}'] = boolean_dict[f'{ref}_{site_type}'].astype(bool)
|
|
117
|
+
# Restrict the site type labels to only be in positions that occur at a high enough frequency in the dataset
|
|
118
|
+
if adata.uns["calculate_coverage_performed"] == True:
|
|
119
|
+
adata.var[f'{ref}_{site_type}'] = (adata.var[f'{ref}_{site_type}']) & (adata.var[f'position_in_{ref}'])
|
|
120
|
+
else:
|
|
121
|
+
pass
|
|
122
|
+
|
|
114
123
|
if native:
|
|
115
|
-
adata.obsm[f'{
|
|
124
|
+
adata.obsm[f'{ref}_{site_type}'] = adata[:, adata.var[f'{ref}_{site_type}'] == True].layers['binarized_methylation']
|
|
116
125
|
else:
|
|
117
|
-
adata.obsm[f'{
|
|
126
|
+
adata.obsm[f'{ref}_{site_type}'] = adata[:, adata.var[f'{ref}_{site_type}'] == True].X
|
|
118
127
|
|
|
119
128
|
# mark as done
|
|
120
129
|
adata.uns[uns_flag] = True
|
|
@@ -6,7 +6,7 @@ def append_binary_layer_by_base_context(
|
|
|
6
6
|
reference_column: str,
|
|
7
7
|
smf_modality: str = "conversion",
|
|
8
8
|
verbose: bool = True,
|
|
9
|
-
uns_flag: str = "
|
|
9
|
+
uns_flag: str = "append_binary_layer_by_base_context_performed",
|
|
10
10
|
bypass: bool = False,
|
|
11
11
|
force_redo: bool = False
|
|
12
12
|
):
|
|
@@ -15,7 +15,7 @@ def append_binary_layer_by_base_context(
|
|
|
15
15
|
- GpC_site_binary
|
|
16
16
|
- CpG_site_binary
|
|
17
17
|
- GpC_CpG_combined_site_binary (numeric sum where present; NaN where neither present)
|
|
18
|
-
-
|
|
18
|
+
- C_site_binary
|
|
19
19
|
- other_C_site_binary
|
|
20
20
|
|
|
21
21
|
Behavior:
|
|
@@ -27,7 +27,7 @@ def append_binary_layer_by_base_context(
|
|
|
27
27
|
|
|
28
28
|
# Only run if not already performed
|
|
29
29
|
already = bool(adata.uns.get(uns_flag, False))
|
|
30
|
-
if (already and not force_redo) or bypass or ("
|
|
30
|
+
if (already and not force_redo) or bypass or ("append_base_context_performed" not in adata.uns):
|
|
31
31
|
# QC already performed; nothing to do
|
|
32
32
|
return adata
|
|
33
33
|
|
|
@@ -48,7 +48,7 @@ def append_binary_layer_by_base_context(
|
|
|
48
48
|
references = adata.obs[reference_column].astype("category").cat.categories
|
|
49
49
|
reference_to_gpc_column = {ref: f"{ref}_GpC_site" for ref in references}
|
|
50
50
|
reference_to_cpg_column = {ref: f"{ref}_CpG_site" for ref in references}
|
|
51
|
-
reference_to_c_column = {ref: f"{ref}
|
|
51
|
+
reference_to_c_column = {ref: f"{ref}_C_site" for ref in references}
|
|
52
52
|
reference_to_other_c_column = {ref: f"{ref}_other_C_site" for ref in references}
|
|
53
53
|
|
|
54
54
|
# verify var columns exist and build boolean masks per ref (len = n_vars)
|
|
@@ -124,7 +124,7 @@ def append_binary_layer_by_base_context(
|
|
|
124
124
|
adata.layers['GpC_site_binary'] = masked_gpc
|
|
125
125
|
adata.layers['CpG_site_binary'] = masked_cpg
|
|
126
126
|
adata.layers['GpC_CpG_combined_site_binary'] = combined_sum
|
|
127
|
-
adata.layers['
|
|
127
|
+
adata.layers['C_site_binary'] = masked_any_c
|
|
128
128
|
adata.layers['other_C_site_binary'] = masked_other_c
|
|
129
129
|
|
|
130
130
|
if verbose:
|
|
@@ -134,7 +134,7 @@ def append_binary_layer_by_base_context(
|
|
|
134
134
|
print(f" GpC: {_filled_positions(masked_gpc)}")
|
|
135
135
|
print(f" CpG: {_filled_positions(masked_cpg)}")
|
|
136
136
|
print(f" GpC+CpG combined: {_filled_positions(combined_sum)}")
|
|
137
|
-
print(f"
|
|
137
|
+
print(f" C: {_filled_positions(masked_any_c)}")
|
|
138
138
|
print(f" other_C: {_filled_positions(masked_other_c)}")
|
|
139
139
|
|
|
140
140
|
# mark as done
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
|
|
3
|
+
def binarize_adata(adata, source="X", target_layer="binary", threshold=0.8):
|
|
4
|
+
"""
|
|
5
|
+
Binarize a dense matrix and preserve NaN.
|
|
6
|
+
source: "X" or layer name
|
|
7
|
+
"""
|
|
8
|
+
X = adata.X if source == "X" else adata.layers[source]
|
|
9
|
+
|
|
10
|
+
# Copy to avoid modifying original in-place
|
|
11
|
+
X_bin = X.copy()
|
|
12
|
+
|
|
13
|
+
# Where not NaN: apply threshold
|
|
14
|
+
mask = ~np.isnan(X_bin)
|
|
15
|
+
X_bin[mask] = (X_bin[mask] > threshold).astype(np.int8)
|
|
16
|
+
|
|
17
|
+
adata.layers[target_layer] = X_bin
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
def binarize_on_Youden(adata,
|
|
1
|
+
def binarize_on_Youden(adata,
|
|
2
|
+
ref_column='Reference_strand',
|
|
3
|
+
output_layer_name='binarized_methylation'):
|
|
2
4
|
"""
|
|
3
5
|
Binarize SMF values based on position thresholds determined by calculate_position_Youden.
|
|
4
6
|
|
|
@@ -16,18 +18,18 @@ def binarize_on_Youden(adata, obs_column='Reference'):
|
|
|
16
18
|
binarized_methylation = np.full_like(adata.X, np.nan, dtype=float) # Keeps same shape as adata.X
|
|
17
19
|
|
|
18
20
|
# Get unique categories
|
|
19
|
-
|
|
21
|
+
references = adata.obs[ref_column].cat.categories
|
|
20
22
|
|
|
21
|
-
for
|
|
23
|
+
for ref in references:
|
|
22
24
|
# Select subset for this category
|
|
23
|
-
|
|
24
|
-
|
|
25
|
+
ref_mask = adata.obs[ref_column] == ref
|
|
26
|
+
ref_subset = adata[ref_mask]
|
|
25
27
|
|
|
26
28
|
# Extract the probability matrix
|
|
27
|
-
original_matrix =
|
|
29
|
+
original_matrix = ref_subset.X.copy()
|
|
28
30
|
|
|
29
31
|
# Extract the thresholds for each position efficiently
|
|
30
|
-
thresholds = np.array(
|
|
32
|
+
thresholds = np.array(ref_subset.var[f'{ref}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))
|
|
31
33
|
|
|
32
34
|
# Identify NaN values
|
|
33
35
|
nan_mask = np.isnan(original_matrix)
|
|
@@ -39,7 +41,7 @@ def binarize_on_Youden(adata, obs_column='Reference'):
|
|
|
39
41
|
binarized_matrix[nan_mask] = np.nan
|
|
40
42
|
|
|
41
43
|
# Assign the binarized values back into the preallocated storage
|
|
42
|
-
binarized_methylation[
|
|
44
|
+
binarized_methylation[ref_subset, :] = binarized_matrix
|
|
43
45
|
|
|
44
46
|
# Store the binarized matrix in a new layer
|
|
45
|
-
adata.layers[
|
|
47
|
+
adata.layers[output_layer_name] = binarized_methylation
|
|
@@ -1,4 +1,7 @@
|
|
|
1
|
-
def calculate_coverage(adata,
|
|
1
|
+
def calculate_coverage(adata,
|
|
2
|
+
ref_column='Reference_strand',
|
|
3
|
+
position_nan_threshold=0.01,
|
|
4
|
+
uns_flag='calculate_coverage_performed'):
|
|
2
5
|
"""
|
|
3
6
|
Append position-level metadata regarding whether the position is informative within the given observation category.
|
|
4
7
|
|
|
@@ -20,32 +23,32 @@ def calculate_coverage(adata, obs_column='Reference_strand', position_nan_thresh
|
|
|
20
23
|
# QC already performed; nothing to do
|
|
21
24
|
return
|
|
22
25
|
|
|
23
|
-
|
|
26
|
+
references = adata.obs[ref_column].cat.categories
|
|
24
27
|
n_categories_with_position = np.zeros(adata.shape[1])
|
|
25
28
|
|
|
26
|
-
# Loop over
|
|
27
|
-
for
|
|
28
|
-
print(f'Assessing positional coverage across samples for {
|
|
29
|
+
# Loop over references
|
|
30
|
+
for ref in references:
|
|
31
|
+
print(f'Assessing positional coverage across samples for {ref} reference')
|
|
29
32
|
|
|
30
33
|
# Subset to current category
|
|
31
|
-
|
|
32
|
-
|
|
34
|
+
ref_mask = adata.obs[ref_column] == ref
|
|
35
|
+
temp_ref_adata = adata[ref_mask]
|
|
33
36
|
|
|
34
37
|
# Compute fraction of valid coverage
|
|
35
|
-
|
|
36
|
-
|
|
38
|
+
ref_valid_coverage = np.sum(~np.isnan(temp_ref_adata.X), axis=0)
|
|
39
|
+
ref_valid_fraction = ref_valid_coverage / temp_ref_adata.shape[0] # Avoid extra computation
|
|
37
40
|
|
|
38
41
|
# Store coverage stats
|
|
39
|
-
adata.var[f'{
|
|
42
|
+
adata.var[f'{ref}_valid_fraction'] = pd.Series(ref_valid_fraction, index=adata.var.index)
|
|
40
43
|
|
|
41
44
|
# Assign whether the position is covered based on threshold
|
|
42
|
-
adata.var[f'position_in_{
|
|
45
|
+
adata.var[f'position_in_{ref}'] = ref_valid_fraction >= position_nan_threshold
|
|
43
46
|
|
|
44
47
|
# Sum the number of categories covering each position
|
|
45
|
-
n_categories_with_position += adata.var[f'position_in_{
|
|
48
|
+
n_categories_with_position += adata.var[f'position_in_{ref}'].values
|
|
46
49
|
|
|
47
50
|
# Store final category count
|
|
48
|
-
adata.var[f'N_{
|
|
51
|
+
adata.var[f'N_{ref_column}_with_position'] = n_categories_with_position.astype(int)
|
|
49
52
|
|
|
50
53
|
# mark as done
|
|
51
54
|
adata.uns[uns_flag] = True
|
|
@@ -1,7 +1,15 @@
|
|
|
1
1
|
## calculate_position_Youden
|
|
2
|
-
|
|
3
2
|
## Calculating and applying position level thresholds for methylation calls to binarize the SMF data
|
|
4
|
-
def calculate_position_Youden(adata,
|
|
3
|
+
def calculate_position_Youden(adata,
|
|
4
|
+
positive_control_sample=None,
|
|
5
|
+
negative_control_sample=None,
|
|
6
|
+
J_threshold=0.5,
|
|
7
|
+
ref_column='Reference_strand',
|
|
8
|
+
sample_column='Sample_names',
|
|
9
|
+
infer_on_percentile=True,
|
|
10
|
+
inference_variable='Raw_modification_signal',
|
|
11
|
+
save=False,
|
|
12
|
+
output_directory=''):
|
|
5
13
|
"""
|
|
6
14
|
Adds new variable metadata to each position indicating whether the position provides reliable SMF methylation calls. Also outputs plots of the positional ROC curves.
|
|
7
15
|
|
|
@@ -26,28 +34,36 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
|
|
|
26
34
|
from sklearn.metrics import roc_curve, roc_auc_score
|
|
27
35
|
|
|
28
36
|
control_samples = [positive_control_sample, negative_control_sample]
|
|
29
|
-
|
|
37
|
+
references = adata.obs[ref_column].cat.categories
|
|
30
38
|
# Iterate over each category in the specified obs_column
|
|
31
|
-
for
|
|
32
|
-
print(f"Calculating position Youden statistics for {
|
|
39
|
+
for ref in references:
|
|
40
|
+
print(f"Calculating position Youden statistics for {ref}")
|
|
33
41
|
# Subset to keep only reads associated with the category
|
|
34
|
-
|
|
42
|
+
ref_subset = adata[adata.obs[ref_column] == ref]
|
|
35
43
|
# Iterate over positive and negative control samples
|
|
36
|
-
for control in control_samples:
|
|
44
|
+
for i, control in enumerate(control_samples):
|
|
37
45
|
# Initialize a dictionary for the given control sample. This will be keyed by dataset and position to point to a tuple of coordinate position and an array of methylation probabilities
|
|
38
|
-
adata.uns[f'{
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
46
|
+
adata.uns[f'{ref}_position_methylation_dict_{control}'] = {}
|
|
47
|
+
# If controls are not passed and infer on percentile is True, infer thresholds based on top and bottom percentile windows for a given obs column metric.
|
|
48
|
+
if infer_on_percentile and not control:
|
|
49
|
+
sorted_column = ref_subset.obs[inference_variable].sort_values(ascending=False)
|
|
50
|
+
if i == 0:
|
|
51
|
+
control == 'positive'
|
|
52
|
+
positive_control_sample = control
|
|
42
53
|
threshold = np.percentile(sorted_column, 100 - infer_on_percentile)
|
|
43
|
-
control_subset =
|
|
54
|
+
control_subset = ref_subset[ref_subset.obs[inference_variable] >= threshold, :]
|
|
44
55
|
else:
|
|
56
|
+
control == 'negative'
|
|
57
|
+
negative_control_sample = control
|
|
45
58
|
threshold = np.percentile(sorted_column, infer_on_percentile)
|
|
46
|
-
control_subset =
|
|
59
|
+
control_subset = ref_subset[ref_subset.obs[inference_variable] <= threshold, :]
|
|
60
|
+
elif not infer_on_percentile and not control:
|
|
61
|
+
print("Can not threshold Anndata on Youden threshold. Need to either provide control samples or set infer_on_percentile to True")
|
|
62
|
+
return
|
|
47
63
|
else:
|
|
48
64
|
# get the current control subset on the given category
|
|
49
|
-
filtered_obs =
|
|
50
|
-
control_subset =
|
|
65
|
+
filtered_obs = ref_subset.obs[ref_subset.obs[sample_column] == control]
|
|
66
|
+
control_subset = ref_subset[filtered_obs.index]
|
|
51
67
|
# Iterate through every position in the control subset
|
|
52
68
|
for position in range(control_subset.shape[1]):
|
|
53
69
|
# Get the coordinate name associated with that position
|
|
@@ -63,9 +79,9 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
|
|
|
63
79
|
# Get fraction coverage
|
|
64
80
|
fraction_coverage = position_coverage / control_subset.shape[0]
|
|
65
81
|
# Save the position and the position methylation data for the control subset
|
|
66
|
-
adata.uns[f'{
|
|
82
|
+
adata.uns[f'{ref}_position_methylation_dict_{control}'][f'{position}'] = (position, position_data, fraction_coverage)
|
|
67
83
|
|
|
68
|
-
for
|
|
84
|
+
for ref in references:
|
|
69
85
|
fig, ax = plt.subplots(figsize=(6, 4))
|
|
70
86
|
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
|
|
71
87
|
plt.xlabel('False Positive Rate')
|
|
@@ -76,13 +92,13 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
|
|
|
76
92
|
n_total_positions = 0
|
|
77
93
|
# Initialize a list that will hold the positional thresholds for the category
|
|
78
94
|
probability_thresholding_list = [(np.nan, np.nan)] * adata.shape[1]
|
|
79
|
-
for i, key in enumerate(adata.uns[f'{
|
|
80
|
-
position = int(adata.uns[f'{
|
|
81
|
-
positive_position_array = adata.uns[f'{
|
|
82
|
-
fraction_coverage = adata.uns[f'{
|
|
95
|
+
for i, key in enumerate(adata.uns[f'{ref}_position_methylation_dict_{positive_control_sample}'].keys()):
|
|
96
|
+
position = int(adata.uns[f'{ref}_position_methylation_dict_{positive_control_sample}'][key][0])
|
|
97
|
+
positive_position_array = adata.uns[f'{ref}_position_methylation_dict_{positive_control_sample}'][key][1]
|
|
98
|
+
fraction_coverage = adata.uns[f'{ref}_position_methylation_dict_{positive_control_sample}'][key][2]
|
|
83
99
|
if fraction_coverage > 0.2:
|
|
84
100
|
try:
|
|
85
|
-
negative_position_array = adata.uns[f'{
|
|
101
|
+
negative_position_array = adata.uns[f'{ref}_position_methylation_dict_{negative_control_sample}'][key][1]
|
|
86
102
|
# Combine the negative and positive control data
|
|
87
103
|
data = np.concatenate([negative_position_array, positive_position_array])
|
|
88
104
|
labels = np.array([0] * len(negative_position_array) + [1] * len(positive_position_array))
|
|
@@ -101,15 +117,15 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
|
|
|
101
117
|
plt.plot(fpr, tpr, label='ROC curve')
|
|
102
118
|
except:
|
|
103
119
|
probability_thresholding_list[position] = (0.8, np.nan)
|
|
104
|
-
title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {
|
|
120
|
+
title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {ref}'
|
|
105
121
|
plt.title(title)
|
|
106
|
-
save_name = output_directory
|
|
122
|
+
save_name = output_directory / f"{title}.png"
|
|
107
123
|
if save:
|
|
108
124
|
plt.savefig(save_name)
|
|
109
125
|
plt.close()
|
|
110
126
|
else:
|
|
111
127
|
plt.show()
|
|
112
128
|
|
|
113
|
-
adata.var[f'{
|
|
129
|
+
adata.var[f'{ref}_position_methylation_thresholding_Youden_stats'] = probability_thresholding_list
|
|
114
130
|
J_max_list = [probability_thresholding_list[i][1] for i in range(adata.shape[1])]
|
|
115
|
-
adata.var[f'{
|
|
131
|
+
adata.var[f'{ref}_position_passed_QC'] = [True if i > J_threshold else False for i in J_max_list]
|
|
@@ -2,7 +2,7 @@ def calculate_read_modification_stats(adata,
|
|
|
2
2
|
reference_column,
|
|
3
3
|
sample_names_col,
|
|
4
4
|
mod_target_bases,
|
|
5
|
-
uns_flag="
|
|
5
|
+
uns_flag="calculate_read_modification_stats_performed",
|
|
6
6
|
bypass=False,
|
|
7
7
|
force_redo=False
|
|
8
8
|
):
|
|
@@ -36,7 +36,7 @@ def calculate_read_modification_stats(adata,
|
|
|
36
36
|
site_types = []
|
|
37
37
|
|
|
38
38
|
if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
|
|
39
|
-
site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', '
|
|
39
|
+
site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'C_site']
|
|
40
40
|
|
|
41
41
|
if 'A' in mod_target_bases:
|
|
42
42
|
site_types += ['A_site']
|
|
@@ -11,7 +11,7 @@ def filter_reads_on_length_quality_mapping(
|
|
|
11
11
|
length_ratio: Optional[Sequence[float]] = None, # e.g. [min, max]
|
|
12
12
|
read_quality: Optional[Sequence[float]] = None, # e.g. [min, max] (commonly min only)
|
|
13
13
|
mapping_quality: Optional[Sequence[float]] = None, # e.g. [min, max] (commonly min only)
|
|
14
|
-
uns_flag: str = "
|
|
14
|
+
uns_flag: str = "filter_reads_on_length_quality_mapping_performed",
|
|
15
15
|
bypass: bool = False,
|
|
16
16
|
force_redo: bool = True
|
|
17
17
|
) -> ad.AnnData:
|