smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff shows the content changes between publicly released package versions as they appear in their public registries. It is provided for informational purposes only.
- smftools/__init__.py +7 -6
- smftools/_version.py +1 -1
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +38 -0
- smftools/config/deaminase.yaml +61 -0
- smftools/config/default.yaml +264 -0
- smftools/config/direct.yaml +41 -0
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +1288 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +13 -9
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
- smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/binarize_converted_base_identities.py +172 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +609 -0
- smftools/plotting/general_plotting.py +1292 -140
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +15 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +1021 -89
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
- smftools-0.2.3.dist-info/RECORD +173 -0
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/informatics/fast5_to_pod5.py +0 -21
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/__init__.py +0 -74
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
- smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools/evaluation → cli}/__init__.py +0 -0
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
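Note on import paths: several modules were relocated in this release (tools → hmm, tools → machine_learning, informatics/helpers → informatics or archived/helpers), so code that imported the old 0.1.7 locations will need its import paths updated. Below is a minimal sketch of the likely changes, inferred only from the renamed paths listed above; the names each package re-exports in its __init__.py are not visible in this diff.

    # Hypothetical import updates inferred from the renamed paths above (0.1.7 -> 0.2.3).
    # Old locations (0.1.7):
    #   from smftools.tools import apply_hmm_batched
    #   from smftools.tools.models import rnn
    # New locations (0.2.3):
    from smftools.hmm import apply_hmm_batched          # smftools/{tools -> hmm}/apply_hmm_batched.py
    from smftools.machine_learning.models import rnn    # smftools/{tools -> machine_learning}/models/rnn.py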
smftools/cli/preprocess_adata.py (new file)
@@ -0,0 +1,363 @@
+def preprocess_adata(config_path):
+    """
+    High-level function to call for preprocessing an adata object.
+    Command line accesses this through smftools preprocess <config_path>
+
+    Parameters:
+        config_path (str): A string representing the file path to the experiment configuration CSV file.
+
+    Returns:
+        (pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path)
+    """
+    from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs, add_or_update_column_in_csv
+    from .load_adata import load_adata
+
+    import numpy as np
+    import pandas as pd
+    import anndata as ad
+    import scanpy as sc
+
+    import os
+    from importlib import resources
+    from pathlib import Path
+
+    from datetime import datetime
+    date_str = datetime.today().strftime("%y%m%d")
+
+    ################################### 1) Load existing ###################################
+    adata, adata_path, cfg = load_adata(config_path)
+
+    # General config variable init - required user-passed inputs
+    smf_modality = cfg.smf_modality  # Required. Specifies whether the data is conversion SMF, direct methylation detection SMF, or deaminase SMF.
+    output_directory = Path(cfg.output_directory)  # Required. Path to the output directory to make for the analysis.
+
+    # Make initial output directory
+    make_dirs([output_directory])
+
+    input_manager_df = pd.read_csv(cfg.summary_file)
+    initial_adata_path = Path(input_manager_df['load_adata'][0])
+    pp_adata_path = Path(input_manager_df['pp_adata'][0])
+    pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
+    spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
+    hmm_adata_path = Path(input_manager_df['hmm_adata'][0])
+
+    if adata:
+        # This happens on first run of the load pipeline
+        pass
+    else:
+        # If an anndata is saved, check which stages of the anndata are available
+        initial_version_available = initial_adata_path.exists()
+        preprocessed_version_available = pp_adata_path.exists()
+        preprocessed_dup_removed_version_available = pp_dup_rem_adata_path.exists()
+        spatial_adata_exists = spatial_adata_path.exists()
+        hmm_adata_exists = hmm_adata_path.exists()
+
+        if cfg.force_redo_preprocessing:
+            print("Forcing full redo of the preprocessing workflow, starting from the earliest-stage adata available.")
+            if initial_version_available:
+                adata, load_report = safe_read_h5ad(initial_adata_path)
+            elif preprocessed_version_available:
+                adata, load_report = safe_read_h5ad(pp_adata_path)
+            elif preprocessed_dup_removed_version_available:
+                adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
+            else:
+                print("Cannot redo preprocessing when there is no adata available.")
+                return
+        elif cfg.force_redo_flag_duplicate_reads:
+            print("Forcing redo of the duplicate detection workflow, starting from the preprocessed adata if available. Otherwise, the raw adata will be used.")
+            if preprocessed_version_available:
+                adata, load_report = safe_read_h5ad(pp_adata_path)
+            elif initial_version_available:
+                adata, load_report = safe_read_h5ad(initial_adata_path)
+            else:
+                print("Cannot redo duplicate detection when there is no compatible adata available: either a raw or preprocessed adata is required.")
+                return
+        elif cfg.force_redo_basic_analyses:
+            print("Forcing redo of the basic analysis workflow, starting from the preprocessed adata if available. Otherwise, the raw adata will be used.")
+            if preprocessed_version_available:
+                adata, load_report = safe_read_h5ad(pp_adata_path)
+            elif initial_version_available:
+                adata, load_report = safe_read_h5ad(initial_adata_path)
+            else:
+                print("Cannot redo basic analyses when there is no compatible adata available: either a raw or preprocessed adata is required.")
+        elif hmm_adata_exists:
+            print(f"HMM anndata found: {hmm_adata_path}")
+            return (None, None, None, None)
+        elif spatial_adata_exists:
+            print(f"Spatial anndata found: {spatial_adata_path}")
+            return (None, None, None, None)
+        elif preprocessed_dup_removed_version_available:
+            print(f"Preprocessed deduplicated anndata found: {pp_dup_rem_adata_path}")
+            return (None, pp_adata_path, None, pp_dup_rem_adata_path)
+        elif preprocessed_version_available:
+            print(f"Preprocessed anndata found: {pp_adata_path}")
+            adata, load_report = safe_read_h5ad(pp_adata_path)
+        elif initial_version_available:
+            adata, load_report = safe_read_h5ad(initial_adata_path)
+        else:
+            print("No adata available.")
+            return
+
+    ######### Begin Preprocessing #########
+    pp_dir = output_directory / "preprocessed"
+
+    ## Load sample sheet metadata based on barcode mapping ##
+    if cfg.sample_sheet_path:
+        from ..preprocessing import load_sample_sheet
+        load_sample_sheet(adata,
+                          cfg.sample_sheet_path,
+                          mapping_key_column=cfg.sample_sheet_mapping_column,
+                          as_category=True,
+                          force_reload=cfg.force_reload_sample_sheet)
+    else:
+        pass
+
+    # Add read length, read quality, reference length, mapped_length, and mapping quality metadata to the adata object.
+    pp_length_qc_dir = pp_dir / "01_Read_length_and_quality_QC_metrics"
+
+    if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
+        print(f'{pp_length_qc_dir} already exists. Skipping read level QC plotting.')
+    else:
+        from ..plotting import plot_read_qc_histograms
+        make_dirs([pp_dir, pp_length_qc_dir])
+        obs_to_plot = ['read_length', 'mapped_length', 'read_quality', 'mapping_quality', 'mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal']
+        plot_read_qc_histograms(adata,
+                                pp_length_qc_dir,
+                                obs_to_plot,
+                                sample_key=cfg.sample_name_col_for_plotting,
+                                rows_per_fig=cfg.rows_per_qc_histogram_grid)
+
+    # Filter on read length, read quality, reference length, mapped_length, and mapping quality metadata.
+    from ..preprocessing import filter_reads_on_length_quality_mapping
+    print(adata.shape)
+    adata = filter_reads_on_length_quality_mapping(adata,
+                                                   filter_on_coordinates=cfg.read_coord_filter,
+                                                   read_length=cfg.read_len_filter_thresholds,
+                                                   length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds,
+                                                   read_quality=cfg.read_quality_filter_thresholds,
+                                                   mapping_quality=cfg.read_mapping_quality_filter_thresholds,
+                                                   bypass=None,
+                                                   force_redo=None)
+    print(adata.shape)
+
+    pp_length_qc_dir = pp_dir / "02_Read_length_and_quality_QC_metrics_post_filtering"
+
+    if pp_length_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
+        print(f'{pp_length_qc_dir} already exists. Skipping read level QC plotting.')
+    else:
+        from ..plotting import plot_read_qc_histograms
+        make_dirs([pp_dir, pp_length_qc_dir])
+        obs_to_plot = ['read_length', 'mapped_length', 'read_quality', 'mapping_quality', 'mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal']
+        plot_read_qc_histograms(adata,
+                                pp_length_qc_dir,
+                                obs_to_plot,
+                                sample_key=cfg.sample_name_col_for_plotting,
+                                rows_per_fig=cfg.rows_per_qc_histogram_grid)
+
+    ############## Binarize direct modcall data and store in a new layer. Clean NaNs and store as new layers with various NaN replacement strategies ##########
+    from ..preprocessing import clean_NaN
+    if smf_modality == 'direct':
+        from ..preprocessing import calculate_position_Youden, binarize_on_Youden, binarize_adata
+        native = True
+        if cfg.fit_position_methylation_thresholds:
+            pp_Youden_dir = pp_dir / "02B_Position_wide_Youden_threshold_performance"
+            make_dirs([pp_Youden_dir])
+            # Calculate positional methylation thresholds for mod calls
+            calculate_position_Youden(adata,
+                                      positive_control_sample=cfg.positive_control_sample_methylation_fitting,
+                                      negative_control_sample=cfg.negative_control_sample_methylation_fitting,
+                                      J_threshold=cfg.fit_j_threshold,
+                                      obs_column=cfg.reference_column,
+                                      infer_on_percentile=cfg.infer_on_percentile_sample_methylation_fitting,
+                                      inference_variable=cfg.inference_variable_sample_methylation_fitting,
+                                      save=True,
+                                      output_directory=pp_Youden_dir
+                                      )
+            # Binarize the modcalls based on the determined thresholds
+            binarize_on_Youden(adata,
+                               obs_column=cfg.reference_column,
+                               output_layer_name=cfg.output_binary_layer_name
+                               )
+        else:
+            binarize_adata(adata,
+                           source="X",
+                           target_layer=cfg.output_binary_layer_name,
+                           threshold=cfg.binarize_on_fixed_methlyation_threshold)
+
+        clean_NaN(adata,
+                  layer=cfg.output_binary_layer_name,
+                  bypass=cfg.bypass_clean_nan,
+                  force_redo=cfg.force_redo_clean_nan
+                  )
+    else:
+        native = False
+        clean_NaN(adata,
+                  bypass=cfg.bypass_clean_nan,
+                  force_redo=cfg.force_redo_clean_nan
+                  )
+
+    ############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
+    from ..preprocessing import append_base_context, append_binary_layer_by_base_context
+    # Additionally, store base_context level binary modification arrays in adata.obsm
+    append_base_context(adata,
+                        obs_column=cfg.reference_column,
+                        use_consensus=False,
+                        native=native,
+                        mod_target_bases=cfg.mod_target_bases,
+                        bypass=cfg.bypass_append_base_context,
+                        force_redo=cfg.force_redo_append_base_context)
+
+    adata = append_binary_layer_by_base_context(adata,
+                                                cfg.reference_column,
+                                                smf_modality,
+                                                bypass=cfg.bypass_append_binary_layer_by_base_context,
+                                                force_redo=cfg.force_redo_append_binary_layer_by_base_context)
+
+    ############### Optional inversion of the adata along the positions axis ###################
+    if cfg.invert_adata:
+        from ..preprocessing import invert_adata
+        adata = invert_adata(adata)
+
+    ############### Calculate read methylation/deamination statistics for the specific base contexts defined above ###############
+    from ..preprocessing import calculate_read_modification_stats
+    calculate_read_modification_stats(adata,
+                                      cfg.reference_column,
+                                      cfg.sample_column,
+                                      cfg.mod_target_bases,
+                                      bypass=cfg.bypass_calculate_read_modification_stats,
+                                      force_redo=cfg.force_redo_calculate_read_modification_stats)
+
+    ### Make a dir for outputting sample level read modification metrics before filtering ###
+    pp_meth_qc_dir = pp_dir / "03_read_modification_QC_metrics"
+
+    if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
+        print(f'{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting.')
+    else:
+        from ..plotting import plot_read_qc_histograms
+        make_dirs([pp_dir, pp_meth_qc_dir])
+        obs_to_plot = ['Raw_modification_signal']
+        if any(base in cfg.mod_target_bases for base in ['GpC', 'CpG', 'C']):
+            obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_any_C_site_modified']
+        if 'A' in cfg.mod_target_bases:
+            obs_to_plot += ['Fraction_A_site_modified']
+        plot_read_qc_histograms(adata,
+                                pp_meth_qc_dir, obs_to_plot,
+                                sample_key=cfg.sample_name_col_for_plotting,
+                                rows_per_fig=cfg.rows_per_qc_histogram_grid)
+
+    ##### Optionally filter reads on modification metrics
+    from ..preprocessing import filter_reads_on_modification_thresholds
+    adata = filter_reads_on_modification_thresholds(adata,
+                                                    smf_modality=smf_modality,
+                                                    mod_target_bases=cfg.mod_target_bases,
+                                                    gpc_thresholds=cfg.read_mod_filtering_gpc_thresholds,
+                                                    cpg_thresholds=cfg.read_mod_filtering_cpg_thresholds,
+                                                    any_c_thresholds=cfg.read_mod_filtering_any_c_thresholds,
+                                                    a_thresholds=cfg.read_mod_filtering_a_thresholds,
+                                                    use_other_c_as_background=cfg.read_mod_filtering_use_other_c_as_background,
+                                                    min_valid_fraction_positions_in_read_vs_ref=cfg.min_valid_fraction_positions_in_read_vs_ref,
+                                                    bypass=cfg.bypass_filter_reads_on_modification_thresholds,
+                                                    force_redo=cfg.force_redo_filter_reads_on_modification_thresholds)
+
+    pp_meth_qc_dir = pp_dir / "04_read_modification_QC_metrics_post_filtering"
+
+    if pp_meth_qc_dir.is_dir() and not cfg.force_redo_preprocessing:
+        print(f'{pp_meth_qc_dir} already exists. Skipping read level methylation QC plotting.')
+    else:
+        from ..plotting import plot_read_qc_histograms
+        make_dirs([pp_dir, pp_meth_qc_dir])
+        obs_to_plot = ['Raw_modification_signal']
+        if any(base in cfg.mod_target_bases for base in ['GpC', 'CpG', 'C']):
+            obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_any_C_site_modified']
+        if 'A' in cfg.mod_target_bases:
+            obs_to_plot += ['Fraction_A_site_modified']
+        plot_read_qc_histograms(adata,
+                                pp_meth_qc_dir, obs_to_plot,
+                                sample_key=cfg.sample_name_col_for_plotting,
+                                rows_per_fig=cfg.rows_per_qc_histogram_grid)
+
+    ############### Calculate positional coverage in the dataset ###############
+    from ..preprocessing import calculate_coverage
+    calculate_coverage(adata,
+                       obs_column=cfg.reference_column,
+                       position_nan_threshold=cfg.position_max_nan_threshold)
+
+    ############### Duplicate detection for conversion/deamination SMF ###############
+    if smf_modality != 'direct':
+        from ..preprocessing import flag_duplicate_reads, calculate_complexity_II
+        references = adata.obs[cfg.reference_column].cat.categories
+
+        var_filters_sets = []
+        for ref in references:
+            for site_type in cfg.duplicate_detection_site_types:
+                var_filters_sets += [[f"{ref}_{site_type}_site", f"position_in_{ref}"]]
+
+        pp_dup_qc_dir = pp_dir / "05_read_duplication_QC_metrics"
+
+        make_dirs([pp_dup_qc_dir])
+
+        # Flag duplicate reads and plot duplicate detection QC
+        adata_unique, adata = flag_duplicate_reads(adata,
+                                                   var_filters_sets,
+                                                   distance_threshold=cfg.duplicate_detection_distance_threshold,
+                                                   obs_reference_col=cfg.reference_column,
+                                                   sample_col=cfg.sample_name_col_for_plotting,
+                                                   output_directory=pp_dup_qc_dir,
+                                                   metric_keys=cfg.hamming_vs_metric_keys,
+                                                   keep_best_metric=cfg.duplicate_detection_keep_best_metric,
+                                                   bypass=cfg.bypass_flag_duplicate_reads,
+                                                   force_redo=cfg.force_redo_flag_duplicate_reads,
+                                                   window_size=cfg.duplicate_detection_window_size_for_hamming_neighbors,
+                                                   min_overlap_positions=cfg.duplicate_detection_min_overlapping_positions,
+                                                   do_pca=cfg.duplicate_detection_do_pca,
+                                                   pca_n_components=50,
+                                                   pca_center=True,
+                                                   do_hierarchical=cfg.duplicate_detection_do_hierarchical,
+                                                   hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
+                                                   hierarchical_metric="euclidean",
+                                                   hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors
+                                                   )
+
+        # Use the flagged duplicate read groups and perform complexity analysis
+        complexity_outs = pp_dup_qc_dir / "sample_complexity_analyses"
+        make_dirs([complexity_outs])
+        calculate_complexity_II(
+            adata=adata,
+            output_directory=complexity_outs,
+            sample_col=cfg.sample_name_col_for_plotting,
+            ref_col=cfg.reference_column,
+            cluster_col='sequence__merged_cluster_id',
+            plot=True,
+            save_plot=True,  # set False to display instead
+            n_boot=30,
+            n_depths=12,
+            random_state=42,
+            csv_summary=True,
+            bypass=cfg.bypass_complexity_analysis,
+            force_redo=cfg.force_redo_complexity_analysis
+        )
+
+    else:
+        adata_unique = adata
+    ########################################################################################################################
+
+    ############################################### Save preprocessed adata with duplicate detection ###############################################
+    from ..readwrite import safe_write_h5ad
+    if not pp_adata_path.exists() or cfg.force_redo_preprocessing:
+        print('Saving preprocessed adata.')
+        if pp_adata_path.suffix == ".gz":
+            safe_write_h5ad(adata, pp_adata_path, compression='gzip', backup=True)
+        else:
+            pp_adata_path = pp_adata_path.with_name(pp_adata_path.name + '.gz')
+            safe_write_h5ad(adata, pp_adata_path, compression='gzip', backup=True)
+
+    if not pp_dup_rem_adata_path.exists() or cfg.force_redo_preprocessing:
+        print('Saving preprocessed adata with duplicates removed.')
+        if pp_dup_rem_adata_path.suffix == ".gz":
+            safe_write_h5ad(adata_unique, pp_dup_rem_adata_path, compression='gzip', backup=True)
+        else:
+            pp_dup_rem_adata_path = pp_dup_rem_adata_path.with_name(pp_dup_rem_adata_path.name + '.gz')
+            safe_write_h5ad(adata_unique, pp_dup_rem_adata_path, compression='gzip', backup=True)
+    ########################################################################################################################
+
+    return (adata, pp_adata_path, adata_unique, pp_dup_rem_adata_path)
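For orientation, here is a minimal usage sketch of the new preprocessing entry point shown above. The return tuple follows the function's docstring; the config filename and the surrounding handling are illustrative assumptions, not part of the package.

    # Illustrative only: "experiment_config.csv" is a placeholder path.
    from smftools.cli.preprocess_adata import preprocess_adata

    result = preprocess_adata("experiment_config.csv")
    if result is not None:
        # (pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path), per the docstring
        pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = result
        print(pp_adata_path, pp_dedup_adata_path)

    # Equivalent command-line form, per the docstring: smftools preprocess <config_path>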