smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +7 -6
- smftools/_version.py +1 -1
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +38 -0
- smftools/config/deaminase.yaml +61 -0
- smftools/config/default.yaml +264 -0
- smftools/config/direct.yaml +41 -0
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +1288 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +13 -9
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
- smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/binarize_converted_base_identities.py +172 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +609 -0
- smftools/plotting/general_plotting.py +1292 -140
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +15 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +1021 -89
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
- smftools-0.2.3.dist-info/RECORD +173 -0
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/informatics/fast5_to_pod5.py +0 -21
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/__init__.py +0 -74
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
- smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools/evaluation → cli}/__init__.py +0 -0
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
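Beyond the new modules, much of the churn above is a package reorganization: the old `tools` helpers move into dedicated `hmm` and `machine_learning` subpackages, and the legacy `informatics/helpers` modules move under `archived`. A minimal sketch of what the moves imply for import paths, assuming the relocated modules keep their names and are importable from the new locations (the symbols inside each module are not visible in this diff):

```python
# Sketch only: import paths implied by the file moves listed above.
# Whether each module exposes the same symbols in 0.2.3 is an assumption.

# 0.1.7 layout
# from smftools.tools import apply_hmm_batched          # smftools/tools/apply_hmm_batched.py
# from smftools.tools.models import rnn                  # smftools/tools/models/rnn.py

# 0.2.3 layout
from smftools.hmm import apply_hmm_batched               # moved to smftools/hmm/
from smftools.machine_learning.models import rnn         # moved to smftools/machine_learning/models/
```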
smftools/__init__.py
CHANGED

@@ -4,12 +4,13 @@ import logging
 import warnings
 
 from . import informatics as inform
+from . import machine_learning as ml
+from . import plotting as pl
 from . import preprocessing as pp
 from . import tools as tl
-from . import plotting as pl
-from . import readwrite, datasets
-from .readwrite import adata_to_df, safe_write_h5ad, merge_barcoded_anndatas
 
+from . import cli, config, datasets, hmm
+from .readwrite import adata_to_df, safe_write_h5ad, safe_read_h5ad, merge_barcoded_anndatas_core
 
 from importlib.metadata import version
 
@@ -19,11 +20,11 @@ __version__ = version(package_name)
 __all__ = [
     "adata_to_df",
     "inform",
+    "ml",
     "pp",
     "tl",
     "pl",
-    "
-    "datasets",
+    "datasets"
     "safe_write_h5ad",
-    "
+    "safe_read_h5ad"
 ]
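For orientation, a small sketch of the top-level surface this `__init__.py` exposes after the change (only names visible in the diff above; nothing else is implied about their contents or signatures):

```python
# Sketch of the 0.2.3 package root, assuming the wheel is installed as `smftools`.
import smftools

print(smftools.__version__)            # "0.2.3"
print(smftools.ml, smftools.hmm)       # new subpackages: machine_learning (aliased ml) and hmm
print(smftools.cli, smftools.config)   # new cli and config subpackages

# safe_read_h5ad is now re-exported at the package root next to safe_write_h5ad
from smftools import adata_to_df, safe_read_h5ad, safe_write_h5ad
```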
smftools/_version.py
CHANGED

@@ -1 +1 @@
-__version__ = "0.1.7"
+__version__ = "0.2.3"
smftools/cli/cli_flows.py
ADDED

@@ -0,0 +1,94 @@
+def flow_I(config_path):
+    """
+    High-level function to call for converting raw sequencing data to an adata object.
+    Command line accesses this through smftools load <config_path>
+    Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
+    Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
+    Also works for illumina fastq and unaligned BAM for conversion SMF workflows.
+
+    Parameters:
+        config_path (str): A string representing the file path to the experiment configuration csv file.
+
+    Returns:
+        None
+    """
+    from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs
+    from ..config import LoadExperimentConfig, ExperimentConfig
+    from .load_adata import load_adata
+    from .preprocess_adata import preprocess_adata
+    from .spatial_adata import spatial_adata
+
+    import numpy as np
+    import pandas as pd
+    import anndata as ad
+    import scanpy as sc
+
+    import os
+    from importlib import resources
+    from pathlib import Path
+
+    from datetime import datetime
+    date_str = datetime.today().strftime("%y%m%d")
+    ################################### 1) General params and input organization ###################################
+    # Load experiment config parameters into global variables
+    loader = LoadExperimentConfig(config_path)
+    defaults_dir = resources.files("smftools").joinpath("config")
+    cfg, report = ExperimentConfig.from_var_dict(loader.var_dict, date_str=date_str, defaults_dir=defaults_dir)
+
+    # General config variable init - Necessary user passed inputs
+    smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
+    input_data_path = Path(cfg.input_data_path) # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
+    output_directory = Path(cfg.output_directory) # Path to the output directory to make for the analysis. Necessary.
+    fasta = Path(cfg.fasta) # Path to reference FASTA. Necessary.
+    split_dir = Path(cfg.split_dir) # Relative path to directory for demultiplexing reads
+    split_path = output_directory / split_dir # Absolute path to directory for demultiplexing reads
+
+    # Make initial output directory
+    make_dirs([output_directory])
+
+    bam_suffix = cfg.bam_suffix
+    strands = cfg.strands
+
+    # General config variable init - Optional user passed inputs for enzyme base specificity
+    mod_target_bases = cfg.mod_target_bases # Nucleobases of interest that may be modified. ['GpC', 'CpG', 'C', 'A']
+
+    # Conversion/deamination specific variable init
+    conversion_types = cfg.conversion_types # 5mC
+    conversions = cfg.conversions
+
+    # Common Anndata accession params
+    reference_column = cfg.reference_column
+
+    # If conversion_types is passed:
+    if conversion_types:
+        conversions += conversion_types
+
+    ############################################### smftools load start ###############################################
+    initial_adata, initial_adata_path = load_adata(config_path)
+
+    # Initial adata path info
+    initial_backup_dir = initial_adata_path.parent / 'adata_accessory_data'
+    ############################################### smftools load end ###############################################
+
+    ############################################### smftools preprocess start ###############################################
+    pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)
+
+    # Preprocessed adata path info
+    pp_adata_basename = initial_adata_path.with_suffix("").name + '_preprocessed.h5ad.gz'
+    pp_adata_path = initial_adata_path / pp_adata_basename
+    pp_backup_dir = pp_adata_path.parent / 'pp_adata_accessory_data'
+
+    # Preprocessed duplicate removed adata path info
+    pp_dup_rem_adata_basename = pp_adata_path.with_suffix("").name + '_duplicates_removed.h5ad.gz'
+    pp_dup_rem_adata_path = pp_adata_path / pp_dup_rem_adata_basename
+    pp_dup_rem_backup_dir = pp_adata_path.parent / 'pp_dup_rem_adata_accessory_data'
+    ############################################### smftools preprocess end ###############################################
+
+    ############################################### smftools spatial start ###############################################
+    # Preprocessed duplicate removed adata with basic analyses appended path info
+    basic_analyzed_adata_basename = pp_dup_rem_adata_path.with_suffix("").name + '_analyzed_I.h5ad.gz'
+    basic_analyzed_adata_path = pp_dup_rem_adata_path / basic_analyzed_adata_basename
+    basic_analyzed_backup_dir = pp_dup_rem_adata_path.parent / 'duplicate_removed_analyzed_adata_I_accessory_data'
+
+    spatial_adata, spatial_adata_path = spatial_adata(config_path)
+    ############################################### smftools spatial end ###############################################
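`flow_I` chains the three new CLI stages (`load_adata`, `preprocess_adata`, `spatial_adata`) over a single experiment configuration file. A hedged usage sketch; the config filename is a placeholder, and the shell form quoted in the docstring is `smftools load <config_path>`:

```python
# Illustrative Python-level invocation of the new flow (placeholder config path).
from smftools.cli.cli_flows import flow_I

flow_I("experiment_config.csv")   # runs load -> preprocess -> spatial end to end
```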
smftools/cli/hmm_adata.py
ADDED

@@ -0,0 +1,338 @@
+def hmm_adata(config_path):
+    """
+    High-level function to call for hmm analysis of an adata object.
+    Command line accesses this through smftools hmm <config_path>
+
+    Parameters:
+        config_path (str): A string representing the file path to the experiment configuration csv file.
+
+    Returns:
+        (pp_dedup_spatial_hmm_adata, pp_dedup_spatial_hmm_adata_path)
+    """
+    from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs, add_or_update_column_in_csv
+    from .load_adata import load_adata
+    from .preprocess_adata import preprocess_adata
+    from .spatial_adata import spatial_adata
+
+    import numpy as np
+    import pandas as pd
+    import anndata as ad
+    import scanpy as sc
+
+    import os
+    from importlib import resources
+    from pathlib import Path
+
+    from datetime import datetime
+    date_str = datetime.today().strftime("%y%m%d")
+
+    ############################################### smftools load start ###############################################
+    adata, adata_path, cfg = load_adata(config_path)
+    # General config variable init - Necessary user passed inputs
+    smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
+    output_directory = Path(cfg.output_directory) # Path to the output directory to make for the analysis. Necessary.
+
+    # Make initial output directory
+    make_dirs([output_directory])
+    ############################################### smftools load end ###############################################
+
+    ############################################### smftools preprocess start ###############################################
+    pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)
+    ############################################### smftools preprocess end ###############################################
+
+    ############################################### smftools spatial start ###############################################
+    spatial_ad, spatial_adata_path = spatial_adata(config_path)
+    ############################################### smftools spatial end ###############################################
+
+    ############################################### smftools hmm start ###############################################
+    input_manager_df = pd.read_csv(cfg.summary_file)
+    initial_adata_path = Path(input_manager_df['load_adata'][0])
+    pp_adata_path = Path(input_manager_df['pp_adata'][0])
+    pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
+    spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
+    hmm_adata_path = Path(input_manager_df['hmm_adata'][0])
+
+    if spatial_ad:
+        # This happens on first run of the pipeline
+        adata = spatial_ad
+    else:
+        # If an anndata is saved, check which stages of the anndata are available
+        initial_version_available = initial_adata_path.exists()
+        preprocessed_version_available = pp_adata_path.exists()
+        preprocessed_dup_removed_version_available = pp_dup_rem_adata_path.exists()
+        preprocessed_dedup_spatial_version_available = spatial_adata_path.exists()
+        preprocessed_dedup_spatial_hmm_version_available = hmm_adata_path.exists()
+
+        if cfg.force_redo_hmm_fit:
+            print(f"Forcing redo of basic analysis workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
+            if preprocessed_dedup_spatial_version_available:
+                adata, load_report = safe_read_h5ad(spatial_adata_path)
+            elif preprocessed_dup_removed_version_available:
+                adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
+            elif initial_version_available:
+                adata, load_report = safe_read_h5ad(initial_adata_path)
+            else:
+                print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
+        elif preprocessed_dedup_spatial_hmm_version_available:
+            return (None, hmm_adata_path)
+        else:
+            if preprocessed_dedup_spatial_version_available:
+                adata, load_report = safe_read_h5ad(spatial_adata_path)
+            elif preprocessed_dup_removed_version_available:
+                adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
+            elif initial_version_available:
+                adata, load_report = safe_read_h5ad(initial_adata_path)
+            else:
+                print(f"No adata available.")
+                return
+    references = adata.obs[cfg.reference_column].cat.categories
+    deaminase = smf_modality == 'deaminase'
+    ############################################### HMM based feature annotations ###############################################
+    if not (cfg.bypass_hmm_fit and cfg.bypass_hmm_apply):
+        from ..hmm.HMM import HMM
+        from scipy.sparse import issparse, csr_matrix
+        import warnings
+
+        pp_dir = output_directory / "preprocessed"
+        pp_dir = pp_dir / "deduplicated"
+        hmm_dir = pp_dir / "10_hmm_models"
+
+        if hmm_dir.is_dir():
+            print(f'{hmm_dir} already exists.')
+        else:
+            make_dirs([pp_dir, hmm_dir])
+
+        samples = adata.obs[cfg.sample_name_col_for_plotting].cat.categories
+        references = adata.obs[cfg.reference_column].cat.categories
+        uns_key = "hmm_appended_layers"
+
+        # ensure uns key exists (avoid KeyError later)
+        if adata.uns.get(uns_key) is None:
+            adata.uns[uns_key] = []
+
+        for sample in samples:
+            for ref in references:
+                mask = (adata.obs[cfg.sample_name_col_for_plotting] == sample) & (adata.obs[cfg.reference_column] == ref)
+                subset = adata[mask].copy()
+                if subset.shape[0] < 1:
+                    continue
+
+                for mod_site in cfg.hmm_methbases:
+                    mod_label = {'C': 'C'}.get(mod_site, mod_site)
+                    hmm_path = hmm_dir / f"{sample}_{ref}_{mod_label}_hmm_model.pth"
+
+                    # ensure the input obsm exists
+                    obsm_key = f'{ref}_{mod_label}_site'
+                    if obsm_key not in subset.obsm:
+                        print(f"Skipping {sample} {ref} {mod_label}: missing obsm '{obsm_key}'")
+                        continue
+
+                    # Fit or load model
+                    if os.path.exists(hmm_path) and not cfg.force_redo_hmm_fit:
+                        hmm = HMM.load(hmm_path)
+                        hmm.print_params()
+                    else:
+                        print(f"Fitting HMM for {sample} {ref} {mod_label}")
+                        hmm = HMM.from_config(cfg)
+                        # fit expects a list-of-seqs or 2D ndarray in the obsm
+                        seqs = subset.obsm[obsm_key]
+                        hmm.fit(seqs)
+                        hmm.print_params()
+                        hmm.save(hmm_path)
+
+                    # Apply / annotate on the subset, then copy layers back to final_adata
+                    if (not cfg.bypass_hmm_apply) or cfg.force_redo_hmm_apply:
+                        print(f"Applying HMM on subset for {sample} {ref} {mod_label}")
+                        # Use the new uns_key argument so subset will record appended layer names
+                        # (annotate_adata modifies subset.obs/layers in-place and should write subset.uns[uns_key])
+                        hmm.annotate_adata(subset,
+                                           obs_column=cfg.reference_column,
+                                           layer=cfg.layer_for_umap_plotting,
+                                           config=cfg)
+
+                        #to_merge = [("C_all_accessible_features", 80)]
+                        to_merge = cfg.hmm_merge_layer_features
+                        for layer_to_merge, merge_distance in to_merge:
+                            if layer_to_merge:
+                                hmm.merge_intervals_in_layer(subset,
+                                                             layer=layer_to_merge,
+                                                             distance_threshold=merge_distance,
+                                                             overwrite=True
+                                                             )
+                            else:
+                                pass
+
+                        # collect appended layers from subset.uns
+                        appended = list(subset.uns.get(uns_key, []))
+                        print(appended)
+                        if len(appended) == 0:
+                            # nothing appended for this subset; continue
+                            continue
+
+                        # copy each appended layer into adata
+                        subset_mask_bool = mask.values if hasattr(mask, "values") else np.asarray(mask)
+                        for layer_name in appended:
+                            if layer_name not in subset.layers:
+                                # defensive: skip
+                                warnings.warn(f"Expected layer {layer_name} in subset but not found; skipping copy.")
+                                continue
+                            sub_layer = subset.layers[layer_name]
+                            # ensure final layer exists and assign rows
+                            try:
+                                hmm._ensure_final_layer_and_assign(adata, layer_name, subset_mask_bool, sub_layer)
+                            except Exception as e:
+                                warnings.warn(f"Failed to copy layer {layer_name} into adata: {e}", stacklevel=2)
+                                # fallback: if dense and small, try to coerce
+                                if issparse(sub_layer):
+                                    arr = sub_layer.toarray()
+                                else:
+                                    arr = np.asarray(sub_layer)
+                                adata.layers[layer_name] = adata.layers.get(layer_name, np.zeros((adata.shape[0], arr.shape[1]), dtype=arr.dtype))
+                                final_idx = np.nonzero(subset_mask_bool)[0]
+                                adata.layers[layer_name][final_idx, :] = arr
+
+                        # merge appended layer names into adata.uns
+                        existing = list(adata.uns.get(uns_key, []))
+                        for ln in appended:
+                            if ln not in existing:
+                                existing.append(ln)
+                        adata.uns[uns_key] = existing
+
+                    else:
+                        pass
+
+        ## Save HMM annotated adata
+        if not hmm_adata_path.exists():
+            print('Saving hmm analyzed adata post preprocessing and duplicate removal')
+            if ".gz" == hmm_adata_path.suffix:
+                safe_write_h5ad(adata, hmm_adata_path, compression='gzip', backup=True)
+            else:
+                hmm_adata_path = hmm_adata_path.with_name(hmm_adata_path.name + '.gz')
+                safe_write_h5ad(adata, hmm_adata_path, compression='gzip', backup=True)
+
+        add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", hmm_adata_path)
+
+        ########################################################################################################################
+
+        ############################################### HMM based feature plotting ###############################################
+
+        hmm_dir = pp_dir / "11_hmm_clustermaps"
+
+        if hmm_dir.is_dir():
+            print(f'{hmm_dir} already exists.')
+        else:
+            make_dirs([pp_dir, hmm_dir])
+        from ..plotting import combined_hmm_raw_clustermap
+        feature_layers = [
+            "all_accessible_features",
+            "large_accessible_patch",
+            "small_bound_stretch",
+            "medium_bound_stretch",
+            "putative_nucleosome",
+            "all_accessible_features_merged",
+        ]
+
+        layers: list[str] = []
+
+        if any(base in ["C", "CpG", "GpC"] for base in cfg.mod_target_bases):
+            if smf_modality == 'deaminase':
+                layers.extend([f"C_{layer}" for layer in feature_layers])
+            elif smf_modality == 'conversion':
+                layers.extend([f"GpC_{layer}" for layer in feature_layers])
+
+        if 'A' in cfg.mod_target_bases:
+            layers.extend([f"A_{layer}" for layer in feature_layers])
+
+        if not layers:
+            raise ValueError(
+                f"No HMM feature layers matched mod_target_bases={cfg.mod_target_bases} "
+                f"and smf_modality={smf_modality}"
+            )
+
+        if smf_modality == 'direct':
+            sort_by = "any_a"
+        else:
+            sort_by = 'gpc'
+
+        for layer in layers:
+            save_path = hmm_dir / layer
+            make_dirs([save_path])
+
+            combined_hmm_raw_clustermap(
+                adata,
+                sample_col=cfg.sample_name_col_for_plotting,
+                reference_col=cfg.reference_column,
+                hmm_feature_layer=layer,
+                layer_gpc="nan0_0minus1",
+                layer_cpg="nan0_0minus1",
+                layer_any_c="nan0_0minus1",
+                layer_a="nan0_0minus1",
+                cmap_hmm="coolwarm",
+                cmap_gpc="coolwarm",
+                cmap_cpg="viridis",
+                cmap_any_c='coolwarm',
+                cmap_a="coolwarm",
+                min_quality=cfg.read_quality_filter_thresholds[0],
+                min_length=cfg.read_len_filter_thresholds[0],
+                min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
+                min_position_valid_fraction=1-cfg.position_max_nan_threshold,
+                save_path=save_path,
+                normalize_hmm=False,
+                sort_by=sort_by, # options: 'gpc', 'cpg', 'gpc_cpg', 'none', or 'obs:<column>'
+                bins=None,
+                deaminase=deaminase,
+                min_signal=0
+            )
+
+        hmm_dir = pp_dir / "12_hmm_bulk_traces"
+
+        if hmm_dir.is_dir():
+            print(f'{hmm_dir} already exists.')
+        else:
+            make_dirs([pp_dir, hmm_dir])
+        from ..plotting import plot_hmm_layers_rolling_by_sample_ref
+        saved = plot_hmm_layers_rolling_by_sample_ref(
+            adata,
+            layers=adata.uns['hmm_appended_layers'],
+            sample_col=cfg.sample_name_col_for_plotting,
+            ref_col=cfg.reference_column,
+            window=101,
+            rows_per_page=4,
+            figsize_per_cell=(4, 2.5),
+            output_dir=hmm_dir,
+            save=True,
+            show_raw=False
+        )
+
+        hmm_dir = pp_dir / "13_hmm_fragment_distributions"
+
+        if hmm_dir.is_dir():
+            print(f'{hmm_dir} already exists.')
+        else:
+            make_dirs([pp_dir, hmm_dir])
+        from ..plotting import plot_hmm_size_contours
+
+        for layer, max in [('C_all_accessible_features_lengths', 400), ('C_all_footprint_features_lengths', 160), ('C_all_accessible_features_merged_lengths', 800)]:
+            save_path = hmm_dir / layer
+            make_dirs([save_path])
+
+            figs = plot_hmm_size_contours(
+                adata,
+                length_layer=layer,
+                sample_col=cfg.sample_name_col_for_plotting,
+                ref_obs_col=cfg.reference_column,
+                rows_per_page=6,
+                max_length_cap=max,
+                figsize_per_cell=(3.5, 2.2),
+                save_path=save_path,
+                save_pdf=False,
+                save_each_page=True,
+                dpi=200,
+                smoothing_sigma=None,
+                normalize_after_smoothing=False,
+                cmap='viridis',
+                log_scale_z=True
+            )
+    ########################################################################################################################
+
+    return (adata, hmm_adata_path)
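`hmm_adata` is the last stage of the pipeline: it re-runs (or reloads) the load, preprocess, and spatial stages, fits or loads per-sample, per-reference HMMs, copies the annotated layers back into the full AnnData, and writes the result. A hedged usage sketch with a placeholder config path; per the docstring, the CLI route is `smftools hmm <config_path>`:

```python
# Illustrative Python-level call into the new HMM stage (placeholder config path).
from smftools.cli.hmm_adata import hmm_adata

adata, hmm_path = hmm_adata("experiment_config.csv")
if adata is not None:                                     # an existing HMM adata short-circuits with (None, path)
    print(adata.uns.get("hmm_appended_layers", []))       # layer names appended during annotation
    print(hmm_path)                                       # gzipped .h5ad written via safe_write_h5ad
```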