smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +2 -6
- smftools/_version.py +1 -1
- smftools/cli/__init__.py +0 -0
- smftools/cli/archived/cli_flows.py +94 -0
- smftools/cli/helpers.py +48 -0
- smftools/cli/hmm_adata.py +361 -0
- smftools/cli/load_adata.py +637 -0
- smftools/cli/preprocess_adata.py +455 -0
- smftools/cli/spatial_adata.py +697 -0
- smftools/cli_entry.py +434 -0
- smftools/config/conversion.yaml +18 -6
- smftools/config/deaminase.yaml +18 -11
- smftools/config/default.yaml +151 -36
- smftools/config/direct.yaml +28 -1
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +225 -27
- smftools/hmm/HMM.py +12 -1
- smftools/hmm/__init__.py +0 -6
- smftools/hmm/archived/call_hmm_peaks.py +106 -0
- smftools/hmm/call_hmm_peaks.py +318 -90
- smftools/informatics/__init__.py +13 -7
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +811 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/plotting/autocorrelation_plotting.py +1 -3
- smftools/plotting/general_plotting.py +1084 -363
- smftools/plotting/position_stats.py +3 -3
- smftools/preprocessing/__init__.py +4 -4
- smftools/preprocessing/append_base_context.py +35 -26
- smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools/preprocessing/calculate_complexity_II.py +1 -1
- smftools/preprocessing/calculate_coverage.py +16 -13
- smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools/preprocessing/calculate_read_modification_stats.py +2 -2
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
- smftools/preprocessing/flag_duplicate_reads.py +2 -2
- smftools/preprocessing/invert_adata.py +1 -1
- smftools/preprocessing/load_sample_sheet.py +1 -1
- smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools/readwrite.py +360 -140
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
- smftools-0.2.4.dist-info/RECORD +176 -0
- smftools-0.2.4.dist-info/entry_points.txt +2 -0
- smftools/cli.py +0 -184
- smftools/informatics/fast5_to_pod5.py +0 -24
- smftools/informatics/helpers/__init__.py +0 -73
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
- smftools/informatics/helpers/discover_input_files.py +0 -100
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/load_adata.py +0 -1346
- smftools-0.2.1.dist-info/RECORD +0 -161
- smftools-0.2.1.dist-info/entry_points.txt +0 -2
- /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
- /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
- /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
- /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
- {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
from . import preprocessing as pp
from . import tools as tl

from . import cli, config, datasets, hmm
from .readwrite import adata_to_df, safe_write_h5ad, safe_read_h5ad, merge_barcoded_anndatas_core

from importlib.metadata import version

# Resolve the installed distribution version at import time.
package_name = "smftools"
__version__ = version(package_name)

__all__ = [
    "adata_to_df",
    "inform",
    "ml",
    "pp",
    "tl",
    "pl",
    # FIX: the missing trailing comma here caused implicit string concatenation,
    # silently exporting the bogus name "datasetssafe_write_h5ad" instead of
    # "datasets" and "safe_write_h5ad".
    "datasets",
    "safe_write_h5ad",
    "safe_read_h5ad",
]
smftools/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.2.
|
|
1
|
+
__version__ = "0.2.4"
|
smftools/cli/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
def flow_I(config_path):
    """
    High-level function to call for converting raw sequencing data to an adata object.
    Command line accesses this through smftools load <config_path>
    Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
    Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
    Also works for illumina fastq and unaligned BAM for conversion SMF workflows.

    Parameters:
        config_path (str): A string representing the file path to the experiment configuration csv file.

    Returns:
        None
    """
    from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs
    from ..config import LoadExperimentConfig, ExperimentConfig
    from .load_adata import load_adata
    from .preprocess_adata import preprocess_adata
    from .spatial_adata import spatial_adata

    import numpy as np
    import pandas as pd
    import anndata as ad
    import scanpy as sc

    import os
    from importlib import resources
    from pathlib import Path

    from datetime import datetime
    date_str = datetime.today().strftime("%y%m%d")
    ################################### 1) General params and input organization ###################################
    # Load experiment config parameters into global variables
    loader = LoadExperimentConfig(config_path)
    defaults_dir = resources.files("smftools").joinpath("config")
    cfg, report = ExperimentConfig.from_var_dict(loader.var_dict, date_str=date_str, defaults_dir=defaults_dir)

    # General config variable init - Necessary user passed inputs
    smf_modality = cfg.smf_modality  # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
    input_data_path = Path(cfg.input_data_path)  # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
    output_directory = Path(cfg.output_directory)  # Path to the output directory to make for the analysis. Necessary.
    fasta = Path(cfg.fasta)  # Path to reference FASTA. Necessary.
    split_dir = Path(cfg.split_dir)  # Relative path to directory for demultiplexing reads
    split_path = output_directory / split_dir  # Absolute path to directory for demultiplexing reads

    # Make initial output directory
    make_dirs([output_directory])

    bam_suffix = cfg.bam_suffix
    strands = cfg.strands

    # General config variable init - Optional user passed inputs for enzyme base specificity
    mod_target_bases = cfg.mod_target_bases  # Nucleobases of interest that may be modified. ['GpC', 'CpG', 'C', 'A']

    # Conversion/deamination specific variable init
    conversion_types = cfg.conversion_types  # 5mC
    conversions = cfg.conversions

    # Common Anndata accession params
    reference_column = cfg.reference_column

    # If conversion_types is passed:
    if conversion_types:
        conversions += conversion_types

    ############################################### smftools load start ###############################################
    # NOTE(review): load_adata is assumed to return (adata, path) here; confirm its
    # signature, since other call sites unpack a third `cfg` element.
    initial_adata, initial_adata_path = load_adata(config_path)

    # Initial adata path info
    initial_backup_dir = initial_adata_path.parent / 'adata_accessory_data'
    ############################################### smftools load end ###############################################

    ############################################### smftools preprocess start ###############################################
    pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)

    # Preprocessed adata path info.
    # FIX: these derived paths previously joined a *file* path with `/` as if it
    # were a directory (e.g. <initial.h5ad.gz>/<basename>), which put every
    # accessory/backup dir underneath a file path. Join against .parent instead
    # so siblings land next to the source .h5ad.gz file.
    pp_adata_basename = initial_adata_path.with_suffix("").name + '_preprocessed.h5ad.gz'
    pp_adata_path = initial_adata_path.parent / pp_adata_basename
    pp_backup_dir = pp_adata_path.parent / 'pp_adata_accessory_data'

    # Preprocessed duplicate removed adata path info
    pp_dup_rem_adata_basename = pp_adata_path.with_suffix("").name + '_duplicates_removed.h5ad.gz'
    pp_dup_rem_adata_path = pp_adata_path.parent / pp_dup_rem_adata_basename
    pp_dup_rem_backup_dir = pp_adata_path.parent / 'pp_dup_rem_adata_accessory_data'
    ############################################### smftools preprocess end ###############################################

    ############################################### smftools spatial start ###############################################
    # Preprocessed duplicate removed adata with basic analyses appended path info
    basic_analyzed_adata_basename = pp_dup_rem_adata_path.with_suffix("").name + '_analyzed_I.h5ad.gz'
    basic_analyzed_adata_path = pp_dup_rem_adata_path.parent / basic_analyzed_adata_basename
    basic_analyzed_backup_dir = pp_dup_rem_adata_path.parent / 'duplicate_removed_analyzed_adata_I_accessory_data'

    # FIX: the result was previously bound to the name `spatial_adata`, shadowing
    # the imported spatial_adata() function; renamed to `spatial_ad` for clarity
    # (matching the naming used by the hmm workflow).
    spatial_ad, spatial_adata_path = spatial_adata(config_path)
    ############################################### smftools spatial end ###############################################
|
smftools/cli/helpers.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import anndata as ad
|
|
4
|
+
from ..readwrite import safe_write_h5ad
|
|
5
|
+
|
|
6
|
+
@dataclass
class AdataPaths:
    """Standard on-disk AnnData file locations for one experiment, in pipeline order.

    All fields are populated by get_adata_paths() as gzip-compressed .h5ad files
    under <output_directory>/h5ads.
    """
    raw: Path       # freshly loaded AnnData
    pp: Path        # after preprocessing
    pp_dedup: Path  # after duplicate removal (aliases `pp` for direct SMF)
    spatial: Path   # after spatial analyses
    hmm: Path       # after HMM annotation
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_adata_paths(cfg) -> AdataPaths:
    """
    Central helper: given cfg, compute all standard AnnData paths.
    """
    h5_dir = Path(cfg.output_directory) / "h5ads"
    name = cfg.experiment_name

    raw = h5_dir / f"{name}.h5ad.gz"
    pp = h5_dir / f"{name}_preprocessed.h5ad.gz"

    # Direct SMF skips duplicate removal, so its dedup path aliases the
    # preprocessed one; every other modality gets a dedicated file.
    if cfg.smf_modality == "direct":
        pp_dedup = pp
    else:
        pp_dedup = h5_dir / f"{name}_preprocessed_duplicates_removed.h5ad.gz"

    # Downstream stage filenames are derived from the dedup basename.
    stem = pp_dedup.name.removesuffix(".h5ad.gz")

    return AdataPaths(
        raw=raw,
        pp=pp,
        pp_dedup=pp_dedup,
        spatial=h5_dir / f"{stem}_spatial.h5ad.gz",
        hmm=h5_dir / f"{stem}_spatial_hmm.h5ad.gz",
    )
|
|
43
|
+
|
|
44
|
+
def write_gz_h5ad(adata: ad.AnnData, path: Path) -> Path:
    """Write *adata* gzip-compressed, forcing a ``.gz`` filename; return the path used."""
    target = path if path.suffix == ".gz" else path.with_name(f"{path.name}.gz")
    safe_write_h5ad(adata, target, compression="gzip", backup=True)
    return target
|
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
def hmm_adata(config_path):
    """
    High-level function to call for hmm analysis of an adata object.
    Command line accesses this through smftools hmm <config_path>

    Runs (or resumes) the full pipeline — load, preprocess, spatial — then fits
    and applies per-(sample, reference, methbase) HMMs, calls peaks, saves the
    annotated AnnData, and produces clustermap / bulk-trace / fragment plots.

    Parameters:
        config_path (str): A string representing the file path to the experiment configuration csv file.

    Returns:
        (pp_dedup_spatial_hmm_adata, pp_dedup_spatial_hmm_adata_path)
    """
    # NOTE(review): this body was re-indented from a whitespace-mangled diff
    # rendering; the nesting below was reconstructed from control-flow semantics
    # (continue/return placement, loop variables) — confirm against the original file.
    from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs, add_or_update_column_in_csv
    from .load_adata import load_adata
    from .preprocess_adata import preprocess_adata
    from .spatial_adata import spatial_adata

    import numpy as np
    import pandas as pd
    import anndata as ad
    import scanpy as sc

    import os
    from importlib import resources
    from pathlib import Path

    from datetime import datetime
    date_str = datetime.today().strftime("%y%m%d")

    ############################################### smftools load start ###############################################
    adata, adata_path, cfg = load_adata(config_path)
    # General config variable init - Necessary user passed inputs
    smf_modality = cfg.smf_modality  # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
    output_directory = Path(cfg.output_directory)  # Path to the output directory to make for the analysis. Necessary.

    # Make initial output directory
    make_dirs([output_directory])
    ############################################### smftools load end ###############################################

    ############################################### smftools preprocess start ###############################################
    pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)
    ############################################### smftools preprocess end ###############################################

    ############################################### smftools spatial start ###############################################
    spatial_ad, spatial_adata_path = spatial_adata(config_path)
    ############################################### smftools spatial end ###############################################

    ############################################### smftools hmm start ###############################################
    # The summary CSV is the pipeline's manifest: each stage records the path of
    # the h5ad it produced. Re-read it so this stage can resume from disk.
    input_manager_df = pd.read_csv(cfg.summary_file)
    initial_adata_path = Path(input_manager_df['load_adata'][0])
    pp_adata_path = Path(input_manager_df['pp_adata'][0])
    pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
    spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
    hmm_adata_path = Path(input_manager_df['hmm_adata'][0])

    if spatial_ad:
        # This happens on first run of the pipeline
        adata = spatial_ad
    else:
        # If an anndata is saved, check which stages of the anndata are available
        initial_version_available = initial_adata_path.exists()
        preprocessed_version_available = pp_adata_path.exists()
        preprocessed_dup_removed_version_available = pp_dup_rem_adata_path.exists()
        preprocessed_dedup_spatial_version_available = spatial_adata_path.exists()
        preprocessed_dedup_spatial_hmm_version_available = hmm_adata_path.exists()

        if cfg.force_redo_hmm_fit or cfg.force_redo_hmm_apply:
            print(f"Forcing redo of hmm analysis workflow.")
            # Resume from the most advanced stage available on disk.
            if preprocessed_dedup_spatial_hmm_version_available:
                adata, load_report = safe_read_h5ad(hmm_adata_path)
            elif preprocessed_dedup_spatial_version_available:
                adata, load_report = safe_read_h5ad(spatial_adata_path)
            elif preprocessed_dup_removed_version_available:
                adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
            elif initial_version_available:
                adata, load_report = safe_read_h5ad(initial_adata_path)
            else:
                # NOTE(review): no early return here — `adata` stays unbound if
                # nothing exists on disk; confirm whether a `return` was intended.
                print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
        elif preprocessed_dedup_spatial_hmm_version_available:
            adata, load_report = safe_read_h5ad(hmm_adata_path)
        else:
            if preprocessed_dedup_spatial_version_available:
                adata, load_report = safe_read_h5ad(spatial_adata_path)
            elif preprocessed_dup_removed_version_available:
                adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
            elif initial_version_available:
                adata, load_report = safe_read_h5ad(initial_adata_path)
            else:
                print(f"No adata available.")
                return
    references = adata.obs[cfg.reference_column].cat.categories
    deaminase = smf_modality == 'deaminase'
    ############################################### HMM based feature annotations ###############################################
    if not (cfg.bypass_hmm_fit and cfg.bypass_hmm_apply):
        from ..hmm.HMM import HMM
        from scipy.sparse import issparse, csr_matrix
        import warnings

        pp_dir = output_directory / "preprocessed"
        pp_dir = pp_dir / "deduplicated"
        hmm_dir = pp_dir / "10_hmm_models"

        if hmm_dir.is_dir():
            print(f'{hmm_dir} already exists.')
        else:
            make_dirs([pp_dir, hmm_dir])

        samples = adata.obs[cfg.sample_name_col_for_plotting].cat.categories
        references = adata.obs[cfg.reference_column].cat.categories
        uns_key = "hmm_appended_layers"

        # ensure uns key exists (avoid KeyError later)
        if adata.uns.get(uns_key) is None:
            adata.uns[uns_key] = []

        if adata.uns.get('hmm_annotated', False) and not cfg.force_redo_hmm_fit and not cfg.force_redo_hmm_apply:
            # Already annotated and no forced redo: skip the fit/apply loop.
            pass
        else:
            # One HMM per (sample, reference, methylation-base) combination.
            for sample in samples:
                for ref in references:
                    mask = (adata.obs[cfg.sample_name_col_for_plotting] == sample) & (adata.obs[cfg.reference_column] == ref)
                    subset = adata[mask].copy()
                    if subset.shape[0] < 1:
                        continue

                    for mod_site in cfg.hmm_methbases:
                        mod_label = {'C': 'C'}.get(mod_site, mod_site)
                        hmm_path = hmm_dir / f"{sample}_{ref}_{mod_label}_hmm_model.pth"

                        # ensure the input obsm exists
                        obsm_key = f'{ref}_{mod_label}_site'
                        if obsm_key not in subset.obsm:
                            print(f"Skipping {sample} {ref} {mod_label}: missing obsm '{obsm_key}'")
                            continue

                        # Fit or load model
                        if hmm_path.exists() and not cfg.force_redo_hmm_fit:
                            hmm = HMM.load(hmm_path)
                            hmm.print_params()
                        else:
                            print(f"Fitting HMM for {sample} {ref} {mod_label}")
                            hmm = HMM.from_config(cfg)
                            # fit expects a list-of-seqs or 2D ndarray in the obsm
                            seqs = subset.obsm[obsm_key]
                            hmm.fit(seqs)
                            hmm.print_params()
                            hmm.save(hmm_path)

                        # Apply / annotate on the subset, then copy layers back to final_adata
                        if cfg.bypass_hmm_apply:
                            pass
                        else:
                            print(f"Applying HMM on subset for {sample} {ref} {mod_label}")
                            # Use the new uns_key argument so subset will record appended layer names
                            # (annotate_adata modifies subset.obs/layers in-place and should write subset.uns[uns_key])
                            if smf_modality == "direct":
                                hmm_layer = cfg.output_binary_layer_name
                            else:
                                hmm_layer = None

                            hmm.annotate_adata(subset,
                                               obs_column=cfg.reference_column,
                                               layer=hmm_layer,
                                               config=cfg,
                                               force_redo=cfg.force_redo_hmm_apply
                                               )

                            if adata.uns.get('hmm_annotated', False) and not cfg.force_redo_hmm_apply:
                                pass
                            else:
                                # Optionally merge nearby intervals within configured layers.
                                to_merge = cfg.hmm_merge_layer_features
                                for layer_to_merge, merge_distance in to_merge:
                                    if layer_to_merge:
                                        hmm.merge_intervals_in_layer(subset,
                                                                     layer=layer_to_merge,
                                                                     distance_threshold=merge_distance,
                                                                     overwrite=True
                                                                     )
                                    else:
                                        pass

                            # collect appended layers from subset.uns
                            appended = list(subset.uns.get(uns_key, []))
                            print(appended)
                            if len(appended) == 0:
                                # nothing appended for this subset; continue
                                continue

                            # copy each appended layer into adata
                            subset_mask_bool = mask.values if hasattr(mask, "values") else np.asarray(mask)
                            for layer_name in appended:
                                if layer_name not in subset.layers:
                                    # defensive: skip
                                    warnings.warn(f"Expected layer {layer_name} in subset but not found; skipping copy.")
                                    continue
                                sub_layer = subset.layers[layer_name]
                                # ensure final layer exists and assign rows
                                try:
                                    hmm._ensure_final_layer_and_assign(adata, layer_name, subset_mask_bool, sub_layer)
                                except Exception as e:
                                    warnings.warn(f"Failed to copy layer {layer_name} into adata: {e}", stacklevel=2)
                                    # fallback: if dense and small, try to coerce
                                    if issparse(sub_layer):
                                        arr = sub_layer.toarray()
                                    else:
                                        arr = np.asarray(sub_layer)
                                    adata.layers[layer_name] = adata.layers.get(layer_name, np.zeros((adata.shape[0], arr.shape[1]), dtype=arr.dtype))
                                    final_idx = np.nonzero(subset_mask_bool)[0]
                                    adata.layers[layer_name][final_idx, :] = arr

                            # merge appended layer names into adata.uns
                            existing = list(adata.uns.get(uns_key, []))
                            for ln in appended:
                                if ln not in existing:
                                    existing.append(ln)
                            adata.uns[uns_key] = existing

    else:
        pass

    # NOTE(review): `pp_dir` (and `hmm`) are only defined inside the branch above;
    # if both bypass_hmm_fit and bypass_hmm_apply are set, the lines below raise
    # NameError on `pp_dir` — confirm intended behavior.
    from ..hmm import call_hmm_peaks
    hmm_dir = pp_dir / "11_hmm_peak_calling"
    if hmm_dir.is_dir():
        pass
    else:
        make_dirs([pp_dir, hmm_dir])

    call_hmm_peaks(
        adata,
        feature_configs=cfg.hmm_peak_feature_configs,
        ref_column=cfg.reference_column,
        site_types=cfg.mod_target_bases,
        save_plot=True,
        output_dir=hmm_dir,
        index_col_suffix=cfg.reindexed_var_suffix)

    ## Save HMM annotated adata
    if not hmm_adata_path.exists():
        print('Saving hmm analyzed adata post preprocessing and duplicate removal')
        # Force a gzip-compressed filename before writing.
        if ".gz" == hmm_adata_path.suffix:
            safe_write_h5ad(adata, hmm_adata_path, compression='gzip', backup=True)
        else:
            hmm_adata_path = hmm_adata_path.with_name(hmm_adata_path.name + '.gz')
            safe_write_h5ad(adata, hmm_adata_path, compression='gzip', backup=True)

    # Record this stage's output in the pipeline manifest CSV.
    add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", hmm_adata_path)

    ########################################################################################################################

    ############################################### HMM based feature plotting ###############################################
    from ..plotting import combined_hmm_raw_clustermap
    hmm_dir = pp_dir / "12_hmm_clustermaps"
    make_dirs([pp_dir, hmm_dir])

    layers: list[str] = []

    # Build the set of HMM feature layers to plot, one group per methbase.
    for base in cfg.hmm_methbases:
        layers.extend([f"{base}_{layer}" for layer in cfg.hmm_clustermap_feature_layers])

    if cfg.cpg:
        layers.extend(["CpG_cpg_patch"])

    if not layers:
        raise ValueError(
            f"No HMM feature layers matched mod_target_bases={cfg.mod_target_bases} "
            f"and smf_modality={smf_modality}"
        )

    for layer in layers:
        hmm_cluster_save_dir = hmm_dir / layer
        if hmm_cluster_save_dir.is_dir():
            pass
        else:
            make_dirs([hmm_cluster_save_dir])

        combined_hmm_raw_clustermap(
            adata,
            sample_col=cfg.sample_name_col_for_plotting,
            reference_col=cfg.reference_column,
            hmm_feature_layer=layer,
            layer_gpc=cfg.layer_for_clustermap_plotting,
            layer_cpg=cfg.layer_for_clustermap_plotting,
            layer_c=cfg.layer_for_clustermap_plotting,
            layer_a=cfg.layer_for_clustermap_plotting,
            cmap_hmm=cfg.clustermap_cmap_hmm,
            cmap_gpc=cfg.clustermap_cmap_gpc,
            cmap_cpg=cfg.clustermap_cmap_cpg,
            cmap_c=cfg.clustermap_cmap_c,
            cmap_a=cfg.clustermap_cmap_a,
            min_quality=cfg.read_quality_filter_thresholds[0],
            min_length=cfg.read_len_filter_thresholds[0],
            min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
            min_position_valid_fraction=1-cfg.position_max_nan_threshold,
            save_path=hmm_cluster_save_dir,
            normalize_hmm=False,
            sort_by=cfg.hmm_clustermap_sortby,  # options: 'gpc', 'cpg', 'gpc_cpg', 'none', or 'obs:<column>'
            bins=None,
            deaminase=deaminase,
            min_signal=0,
            index_col_suffix=cfg.reindexed_var_suffix
        )

    hmm_dir = pp_dir / "13_hmm_bulk_traces"

    if hmm_dir.is_dir():
        print(f'{hmm_dir} already exists.')
    else:
        make_dirs([pp_dir, hmm_dir])
    from ..plotting import plot_hmm_layers_rolling_by_sample_ref
    # Only plot state layers; "_lengths" layers hold fragment sizes, not traces.
    bulk_hmm_layers = [layer for layer in adata.uns['hmm_appended_layers'] if "_lengths" not in layer]
    saved = plot_hmm_layers_rolling_by_sample_ref(
        adata,
        layers=bulk_hmm_layers,
        sample_col=cfg.sample_name_col_for_plotting,
        ref_col=cfg.reference_column,
        window=101,
        rows_per_page=4,
        figsize_per_cell=(4,2.5),
        output_dir=hmm_dir,
        save=True,
        show_raw=False
    )

    hmm_dir = pp_dir / "14_hmm_fragment_distributions"

    if hmm_dir.is_dir():
        print(f'{hmm_dir} already exists.')
    else:
        make_dirs([pp_dir, hmm_dir])
    from ..plotting import plot_hmm_size_contours

    # (length-layer name, plotting cap in bp) per modality.
    if smf_modality == 'deaminase':
        fragments = [('C_all_accessible_features_lengths', 400), ('C_all_footprint_features_lengths', 250), ('C_all_accessible_features_merged_lengths', 800)]
    elif smf_modality == 'conversion':
        fragments = [('GpC_all_accessible_features_lengths', 400), ('GpC_all_footprint_features_lengths', 250), ('GpC_all_accessible_features_merged_lengths', 800)]
    elif smf_modality == "direct":
        fragments = [('A_all_accessible_features_lengths', 400), ('A_all_footprint_features_lengths', 200), ('A_all_accessible_features_merged_lengths', 800)]
    # NOTE(review): `fragments` is unbound for any other modality (NameError below),
    # and the loop variable `max` shadows the builtin — consider renaming.

    for layer, max in fragments:
        save_path = hmm_dir / layer
        make_dirs([save_path])

        figs = plot_hmm_size_contours(
            adata,
            length_layer=layer,
            sample_col=cfg.sample_name_col_for_plotting,
            ref_obs_col=cfg.reference_column,
            rows_per_page=6,
            max_length_cap=max,
            figsize_per_cell=(3.5, 2.2),
            save_path=save_path,
            save_pdf=False,
            save_each_page=True,
            dpi=200,
            smoothing_sigma=(10, 10),
            normalize_after_smoothing=True,
            cmap='Greens',
            log_scale_z=True
        )
    ########################################################################################################################

    return (adata, hmm_adata_path)
|