smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/__init__.py CHANGED
@@ -9,26 +9,22 @@ from . import plotting as pl
  from . import preprocessing as pp
  from . import tools as tl
 
- from . import config, datasets, hmm, readwrite
+ from . import cli, config, datasets, hmm
  from .readwrite import adata_to_df, safe_write_h5ad, safe_read_h5ad, merge_barcoded_anndatas_core
 
- from .load_adata import load_adata
-
  from importlib.metadata import version
 
  package_name = "smftools"
  __version__ = version(package_name)
 
  __all__ = [
- "load_adata"
  "adata_to_df",
  "inform",
  "ml",
  "pp",
  "tl",
  "pl",
- "readwrite",
- "datasets",
+ "datasets"
  "safe_write_h5ad",
  "safe_read_h5ad"
  ]
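For orientation, a short sketch of the import surface implied by the diff above; the smftools.cli.load_adata location mentioned in the final comment is an assumption drawn from the new file list (smftools/cli/load_adata.py), not a documented API.

# Sketch only: what the 0.2.4 __init__.py above exposes at the package root.
import smftools

print(smftools.__version__)                                          # "0.2.4", via importlib.metadata

from smftools import cli, config, datasets, hmm                      # newly includes cli
from smftools import adata_to_df, safe_write_h5ad, safe_read_h5ad    # re-exported from .readwrite

# Removed in this diff: the top-level re-export
#   from smftools import load_adata
# presumably superseded by the new CLI module (assumption):
#   from smftools.cli.load_adata import load_adata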
smftools/_version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.1"
+ __version__ = "0.2.4"
smftools/cli/__init__.py ADDED
File without changes
smftools/cli/archived/cli_flows.py ADDED
@@ -0,0 +1,94 @@
+ def flow_I(config_path):
+ """
+ High-level function to call for converting raw sequencing data to an adata object.
+ Command line accesses this through smftools load <config_path>
+ Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
+ Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
+ Also works for illumina fastq and unaligned BAM for conversion SMF workflows.
+
+ Parameters:
+ config_path (str): A string representing the file path to the experiment configuration csv file.
+
+ Returns:
+ None
+ """
+ from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs
+ from ..config import LoadExperimentConfig, ExperimentConfig
+ from .load_adata import load_adata
+ from .preprocess_adata import preprocess_adata
+ from .spatial_adata import spatial_adata
+
+ import numpy as np
+ import pandas as pd
+ import anndata as ad
+ import scanpy as sc
+
+ import os
+ from importlib import resources
+ from pathlib import Path
+
+ from datetime import datetime
+ date_str = datetime.today().strftime("%y%m%d")
+ ################################### 1) General params and input organization ###################################
+ # Load experiment config parameters into global variables
+ loader = LoadExperimentConfig(config_path)
+ defaults_dir = resources.files("smftools").joinpath("config")
+ cfg, report = ExperimentConfig.from_var_dict(loader.var_dict, date_str=date_str, defaults_dir=defaults_dir)
+
+ # General config variable init - Necessary user passed inputs
+ smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
+ input_data_path = Path(cfg.input_data_path) # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
+ output_directory = Path(cfg.output_directory) # Path to the output directory to make for the analysis. Necessary.
+ fasta = Path(cfg.fasta) # Path to reference FASTA. Necessary.
+ split_dir = Path(cfg.split_dir) # Relative path to directory for demultiplexing reads
+ split_path = output_directory / split_dir # Absolute path to directory for demultiplexing reads
+
+ # Make initial output directory
+ make_dirs([output_directory])
+
+ bam_suffix = cfg.bam_suffix
+ strands = cfg.strands
+
+ # General config variable init - Optional user passed inputs for enzyme base specificity
+ mod_target_bases = cfg.mod_target_bases # Nucleobases of interest that may be modified. ['GpC', 'CpG', 'C', 'A']
+
+ # Conversion/deamination specific variable init
+ conversion_types = cfg.conversion_types # 5mC
+ conversions = cfg.conversions
+
+ # Common Anndata accession params
+ reference_column = cfg.reference_column
+
+ # If conversion_types is passed:
+ if conversion_types:
+ conversions += conversion_types
+
+ ############################################### smftools load start ###############################################
+ initial_adata, initial_adata_path = load_adata(config_path)
+
+ # Initial adata path info
+ initial_backup_dir = initial_adata_path.parent / 'adata_accessory_data'
+ ############################################### smftools load end ###############################################
+
+ ############################################### smftools preprocess start ###############################################
+ pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)
+
+ # Preprocessed adata path info
+ pp_adata_basename = initial_adata_path.with_suffix("").name + '_preprocessed.h5ad.gz'
+ pp_adata_path = initial_adata_path / pp_adata_basename
+ pp_backup_dir = pp_adata_path.parent / 'pp_adata_accessory_data'
+
+ # Preprocessed duplicate removed adata path info
+ pp_dup_rem_adata_basename = pp_adata_path.with_suffix("").name + '_duplicates_removed.h5ad.gz'
+ pp_dup_rem_adata_path = pp_adata_path / pp_dup_rem_adata_basename
+ pp_dup_rem_backup_dir= pp_adata_path.parent / 'pp_dup_rem_adata_accessory_data'
+ ############################################### smftools preprocess end ###############################################
+
+ ############################################### smftools spatial start ###############################################
+ # Preprocessed duplicate removed adata with basic analyses appended path info
+ basic_analyzed_adata_basename = pp_dup_rem_adata_path.with_suffix("").name + '_analyzed_I.h5ad.gz'
+ basic_analyzed_adata_path = pp_dup_rem_adata_path / basic_analyzed_adata_basename
+ basic_analyzed_backup_dir= pp_dup_rem_adata_path.parent /'duplicate_removed_analyzed_adata_I_accessory_data'
+
+ spatial_adata, spatial_adata_path = spatial_adata(config_path)
+ ############################################### smftools spatial end ###############################################
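As a reading aid: the archived flow above simply chains the three stage functions that now live under smftools/cli, each taking the config path and returning what flow_I unpacks. A minimal sketch of that sequence, assuming the absolute import paths mirror the relative imports used in flow_I:

# Hedged sketch of the load -> preprocess -> spatial chain wired together by flow_I.
from smftools.cli.load_adata import load_adata                # assumed import paths,
from smftools.cli.preprocess_adata import preprocess_adata    # mirroring flow_I's
from smftools.cli.spatial_adata import spatial_adata          # relative imports

config_path = "experiment_config.csv"                         # hypothetical config file

initial_adata, initial_adata_path = load_adata(config_path)
pp_adata, pp_adata_path, pp_dedup_adata, pp_dedup_adata_path = preprocess_adata(config_path)
spatial_ad, spatial_adata_path = spatial_adata(config_path)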
smftools/cli/helpers.py ADDED
@@ -0,0 +1,48 @@
+ from dataclasses import dataclass
+ from pathlib import Path
+ import anndata as ad
+ from ..readwrite import safe_write_h5ad
+
+ @dataclass
+ class AdataPaths:
+ raw: Path
+ pp: Path
+ pp_dedup: Path
+ spatial: Path
+ hmm: Path
+
+
+ def get_adata_paths(cfg) -> AdataPaths:
+ """
+ Central helper: given cfg, compute all standard AnnData paths.
+ """
+ h5_dir = Path(cfg.output_directory) / "h5ads"
+
+ raw = h5_dir / f"{cfg.experiment_name}.h5ad.gz"
+
+ pp = h5_dir / f"{cfg.experiment_name}_preprocessed.h5ad.gz"
+
+ if cfg.smf_modality == "direct":
+ # direct SMF: duplicate-removed path is just preprocessed path
+ pp_dedup = pp
+ else:
+ pp_dedup = h5_dir / f"{cfg.experiment_name}_preprocessed_duplicates_removed.h5ad.gz"
+
+ pp_dedup_base = pp_dedup.name.removesuffix(".h5ad.gz")
+
+ spatial = h5_dir / f"{pp_dedup_base}_spatial.h5ad.gz"
+ hmm = h5_dir / f"{pp_dedup_base}_spatial_hmm.h5ad.gz"
+
+ return AdataPaths(
+ raw=raw,
+ pp=pp,
+ pp_dedup=pp_dedup,
+ spatial=spatial,
+ hmm=hmm,
+ )
+
+ def write_gz_h5ad(adata: ad.AnnData, path: Path) -> Path:
+ if path.suffix != ".gz":
+ path = path.with_name(path.name + ".gz")
+ safe_write_h5ad(adata, path, compression="gzip", backup=True)
+ return path
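A small usage sketch for the helpers above; the cfg object is a stand-in built with types.SimpleNamespace and its values are hypothetical, but the attributes read (output_directory, experiment_name, smf_modality) are exactly the ones get_adata_paths uses:

# Hypothetical cfg exercising get_adata_paths from smftools/cli/helpers.py (import path assumed).
from types import SimpleNamespace
from smftools.cli.helpers import get_adata_paths

cfg = SimpleNamespace(
    output_directory="results/run1",   # hypothetical output directory
    experiment_name="exp01",           # hypothetical experiment name
    smf_modality="conversion",         # any non-"direct" modality gets its own dedup path
)

paths = get_adata_paths(cfg)
print(paths.raw)       # results/run1/h5ads/exp01.h5ad.gz
print(paths.pp_dedup)  # results/run1/h5ads/exp01_preprocessed_duplicates_removed.h5ad.gz
print(paths.hmm)       # results/run1/h5ads/exp01_preprocessed_duplicates_removed_spatial_hmm.h5ad.gz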
smftools/cli/hmm_adata.py ADDED
@@ -0,0 +1,361 @@
+ def hmm_adata(config_path):
+ """
+ High-level function to call for hmm analysis of an adata object.
+ Command line accesses this through smftools hmm <config_path>
+
+ Parameters:
+ config_path (str): A string representing the file path to the experiment configuration csv file.
+
+ Returns:
+ (pp_dedup_spatial_hmm_adata, pp_dedup_spatial_hmm_adata_path)
+ """
+ from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs, add_or_update_column_in_csv
+ from .load_adata import load_adata
+ from .preprocess_adata import preprocess_adata
+ from .spatial_adata import spatial_adata
+
+ import numpy as np
+ import pandas as pd
+ import anndata as ad
+ import scanpy as sc
+
+ import os
+ from importlib import resources
+ from pathlib import Path
+
+ from datetime import datetime
+ date_str = datetime.today().strftime("%y%m%d")
+
+ ############################################### smftools load start ###############################################
+ adata, adata_path, cfg = load_adata(config_path)
+ # General config variable init - Necessary user passed inputs
+ smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
+ output_directory = Path(cfg.output_directory) # Path to the output directory to make for the analysis. Necessary.
+
+ # Make initial output directory
+ make_dirs([output_directory])
+ ############################################### smftools load end ###############################################
+
+ ############################################### smftools preprocess start ###############################################
+ pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)
+ ############################################### smftools preprocess end ###############################################
+
+ ############################################### smftools spatial start ###############################################
+ spatial_ad, spatial_adata_path = spatial_adata(config_path)
+ ############################################### smftools spatial end ###############################################
+
+ ############################################### smftools hmm start ###############################################
+ input_manager_df = pd.read_csv(cfg.summary_file)
+ initial_adata_path = Path(input_manager_df['load_adata'][0])
+ pp_adata_path = Path(input_manager_df['pp_adata'][0])
+ pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
+ spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
+ hmm_adata_path = Path(input_manager_df['hmm_adata'][0])
+
+ if spatial_ad:
+ # This happens on first run of the pipeline
+ adata = spatial_ad
+ else:
+ # If an anndata is saved, check which stages of the anndata are available
+ initial_version_available = initial_adata_path.exists()
+ preprocessed_version_available = pp_adata_path.exists()
+ preprocessed_dup_removed_version_available = pp_dup_rem_adata_path.exists()
+ preprocessed_dedup_spatial_version_available = spatial_adata_path.exists()
+ preprocessed_dedup_spatial_hmm_version_available = hmm_adata_path.exists()
+
+ if cfg.force_redo_hmm_fit or cfg.force_redo_hmm_apply:
+ print(f"Forcing redo of hmm analysis workflow.")
+ if preprocessed_dedup_spatial_hmm_version_available:
+ adata, load_report = safe_read_h5ad(hmm_adata_path)
+ elif preprocessed_dedup_spatial_version_available:
+ adata, load_report = safe_read_h5ad(spatial_adata_path)
+ elif preprocessed_dup_removed_version_available:
+ adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
+ elif initial_version_available:
+ adata, load_report = safe_read_h5ad(initial_adata_path)
+ else:
+ print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
+ elif preprocessed_dedup_spatial_hmm_version_available:
+ adata, load_report = safe_read_h5ad(hmm_adata_path)
+ else:
+ if preprocessed_dedup_spatial_version_available:
+ adata, load_report = safe_read_h5ad(spatial_adata_path)
+ elif preprocessed_dup_removed_version_available:
+ adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
+ elif initial_version_available:
+ adata, load_report = safe_read_h5ad(initial_adata_path)
+ else:
+ print(f"No adata available.")
+ return
+ references = adata.obs[cfg.reference_column].cat.categories
+ deaminase = smf_modality == 'deaminase'
+ ############################################### HMM based feature annotations ###############################################
+ if not (cfg.bypass_hmm_fit and cfg.bypass_hmm_apply):
+ from ..hmm.HMM import HMM
+ from scipy.sparse import issparse, csr_matrix
+ import warnings
+
+ pp_dir = output_directory / "preprocessed"
+ pp_dir = pp_dir / "deduplicated"
+ hmm_dir = pp_dir / "10_hmm_models"
+
+ if hmm_dir.is_dir():
+ print(f'{hmm_dir} already exists.')
+ else:
+ make_dirs([pp_dir, hmm_dir])
+
+ samples = adata.obs[cfg.sample_name_col_for_plotting].cat.categories
+ references = adata.obs[cfg.reference_column].cat.categories
+ uns_key = "hmm_appended_layers"
+
+ # ensure uns key exists (avoid KeyError later)
+ if adata.uns.get(uns_key) is None:
+ adata.uns[uns_key] = []
+
+ if adata.uns.get('hmm_annotated', False) and not cfg.force_redo_hmm_fit and not cfg.force_redo_hmm_apply:
+ pass
+ else:
+ for sample in samples:
+ for ref in references:
+ mask = (adata.obs[cfg.sample_name_col_for_plotting] == sample) & (adata.obs[cfg.reference_column] == ref)
+ subset = adata[mask].copy()
+ if subset.shape[0] < 1:
+ continue
+
+ for mod_site in cfg.hmm_methbases:
+ mod_label = {'C': 'C'}.get(mod_site, mod_site)
+ hmm_path = hmm_dir / f"{sample}_{ref}_{mod_label}_hmm_model.pth"
+
+ # ensure the input obsm exists
+ obsm_key = f'{ref}_{mod_label}_site'
+ if obsm_key not in subset.obsm:
+ print(f"Skipping {sample} {ref} {mod_label}: missing obsm '{obsm_key}'")
+ continue
+
+ # Fit or load model
+ if hmm_path.exists() and not cfg.force_redo_hmm_fit:
+ hmm = HMM.load(hmm_path)
+ hmm.print_params()
+ else:
+ print(f"Fitting HMM for {sample} {ref} {mod_label}")
+ hmm = HMM.from_config(cfg)
+ # fit expects a list-of-seqs or 2D ndarray in the obsm
+ seqs = subset.obsm[obsm_key]
+ hmm.fit(seqs)
+ hmm.print_params()
+ hmm.save(hmm_path)
+
+ # Apply / annotate on the subset, then copy layers back to final_adata
+ if cfg.bypass_hmm_apply:
+ pass
+ else:
+ print(f"Applying HMM on subset for {sample} {ref} {mod_label}")
+ # Use the new uns_key argument so subset will record appended layer names
+ # (annotate_adata modifies subset.obs/layers in-place and should write subset.uns[uns_key])
+ if smf_modality == "direct":
+ hmm_layer = cfg.output_binary_layer_name
+ else:
+ hmm_layer = None
+
+ hmm.annotate_adata(subset,
+ obs_column=cfg.reference_column,
+ layer=hmm_layer,
+ config=cfg,
+ force_redo=cfg.force_redo_hmm_apply
+ )
+
+ if adata.uns.get('hmm_annotated', False) and not cfg.force_redo_hmm_apply:
+ pass
+ else:
+ to_merge = cfg.hmm_merge_layer_features
+ for layer_to_merge, merge_distance in to_merge:
+ if layer_to_merge:
+ hmm.merge_intervals_in_layer(subset,
+ layer=layer_to_merge,
+ distance_threshold=merge_distance,
+ overwrite=True
+ )
+ else:
+ pass
+
+ # collect appended layers from subset.uns
+ appended = list(subset.uns.get(uns_key, []))
+ print(appended)
+ if len(appended) == 0:
+ # nothing appended for this subset; continue
+ continue
+
+ # copy each appended layer into adata
+ subset_mask_bool = mask.values if hasattr(mask, "values") else np.asarray(mask)
+ for layer_name in appended:
+ if layer_name not in subset.layers:
+ # defensive: skip
+ warnings.warn(f"Expected layer {layer_name} in subset but not found; skipping copy.")
+ continue
+ sub_layer = subset.layers[layer_name]
+ # ensure final layer exists and assign rows
+ try:
+ hmm._ensure_final_layer_and_assign(adata, layer_name, subset_mask_bool, sub_layer)
+ except Exception as e:
+ warnings.warn(f"Failed to copy layer {layer_name} into adata: {e}", stacklevel=2)
+ # fallback: if dense and small, try to coerce
+ if issparse(sub_layer):
+ arr = sub_layer.toarray()
+ else:
+ arr = np.asarray(sub_layer)
+ adata.layers[layer_name] = adata.layers.get(layer_name, np.zeros((adata.shape[0], arr.shape[1]), dtype=arr.dtype))
+ final_idx = np.nonzero(subset_mask_bool)[0]
+ adata.layers[layer_name][final_idx, :] = arr
+
+ # merge appended layer names into adata.uns
+ existing = list(adata.uns.get(uns_key, []))
+ for ln in appended:
+ if ln not in existing:
+ existing.append(ln)
+ adata.uns[uns_key] = existing
+
+ else:
+ pass
+
+ from ..hmm import call_hmm_peaks
+ hmm_dir = pp_dir / "11_hmm_peak_calling"
+ if hmm_dir.is_dir():
+ pass
+ else:
+ make_dirs([pp_dir, hmm_dir])
+
+ call_hmm_peaks(
+ adata,
+ feature_configs=cfg.hmm_peak_feature_configs,
+ ref_column=cfg.reference_column,
+ site_types=cfg.mod_target_bases,
+ save_plot=True,
+ output_dir=hmm_dir,
+ index_col_suffix=cfg.reindexed_var_suffix)
+
+ ## Save HMM annotated adata
+ if not hmm_adata_path.exists():
+ print('Saving hmm analyzed adata post preprocessing and duplicate removal')
+ if ".gz" == hmm_adata_path.suffix:
+ safe_write_h5ad(adata, hmm_adata_path, compression='gzip', backup=True)
+ else:
+ hmm_adata_path = hmm_adata_path.with_name(hmm_adata_path.name + '.gz')
+ safe_write_h5ad(adata, hmm_adata_path, compression='gzip', backup=True)
+
+ add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", hmm_adata_path)
+
+ ########################################################################################################################
+
+ ############################################### HMM based feature plotting ###############################################
+ from ..plotting import combined_hmm_raw_clustermap
+ hmm_dir = pp_dir / "12_hmm_clustermaps"
+ make_dirs([pp_dir, hmm_dir])
+
+ layers: list[str] = []
+
+ for base in cfg.hmm_methbases:
+ layers.extend([f"{base}_{layer}" for layer in cfg.hmm_clustermap_feature_layers])
+
+ if cfg.cpg:
+ layers.extend(["CpG_cpg_patch"])
+
+ if not layers:
+ raise ValueError(
+ f"No HMM feature layers matched mod_target_bases={cfg.mod_target_bases} "
+ f"and smf_modality={smf_modality}"
+ )
+
+ for layer in layers:
+ hmm_cluster_save_dir = hmm_dir / layer
+ if hmm_cluster_save_dir.is_dir():
+ pass
+ else:
+ make_dirs([hmm_cluster_save_dir])
+
+ combined_hmm_raw_clustermap(
+ adata,
+ sample_col=cfg.sample_name_col_for_plotting,
+ reference_col=cfg.reference_column,
+ hmm_feature_layer=layer,
+ layer_gpc=cfg.layer_for_clustermap_plotting,
+ layer_cpg=cfg.layer_for_clustermap_plotting,
+ layer_c=cfg.layer_for_clustermap_plotting,
+ layer_a=cfg.layer_for_clustermap_plotting,
+ cmap_hmm=cfg.clustermap_cmap_hmm,
+ cmap_gpc=cfg.clustermap_cmap_gpc,
+ cmap_cpg=cfg.clustermap_cmap_cpg,
+ cmap_c=cfg.clustermap_cmap_c,
+ cmap_a=cfg.clustermap_cmap_a,
+ min_quality=cfg.read_quality_filter_thresholds[0],
+ min_length=cfg.read_len_filter_thresholds[0],
+ min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
+ min_position_valid_fraction=1-cfg.position_max_nan_threshold,
+ save_path=hmm_cluster_save_dir,
+ normalize_hmm=False,
+ sort_by=cfg.hmm_clustermap_sortby, # options: 'gpc', 'cpg', 'gpc_cpg', 'none', or 'obs:<column>'
+ bins=None,
+ deaminase=deaminase,
+ min_signal=0,
+ index_col_suffix=cfg.reindexed_var_suffix
+ )
+
+ hmm_dir = pp_dir / "13_hmm_bulk_traces"
+
+ if hmm_dir.is_dir():
+ print(f'{hmm_dir} already exists.')
+ else:
+ make_dirs([pp_dir, hmm_dir])
+ from ..plotting import plot_hmm_layers_rolling_by_sample_ref
+ bulk_hmm_layers = [layer for layer in adata.uns['hmm_appended_layers'] if "_lengths" not in layer]
+ saved = plot_hmm_layers_rolling_by_sample_ref(
+ adata,
+ layers=bulk_hmm_layers,
+ sample_col=cfg.sample_name_col_for_plotting,
+ ref_col=cfg.reference_column,
+ window=101,
+ rows_per_page=4,
+ figsize_per_cell=(4,2.5),
+ output_dir=hmm_dir,
+ save=True,
+ show_raw=False
+ )
+
+ hmm_dir = pp_dir / "14_hmm_fragment_distributions"
+
+ if hmm_dir.is_dir():
+ print(f'{hmm_dir} already exists.')
+ else:
+ make_dirs([pp_dir, hmm_dir])
+ from ..plotting import plot_hmm_size_contours
+
+ if smf_modality == 'deaminase':
+ fragments = [('C_all_accessible_features_lengths', 400), ('C_all_footprint_features_lengths', 250), ('C_all_accessible_features_merged_lengths', 800)]
+ elif smf_modality == 'conversion':
+ fragments = [('GpC_all_accessible_features_lengths', 400), ('GpC_all_footprint_features_lengths', 250), ('GpC_all_accessible_features_merged_lengths', 800)]
+ elif smf_modality == "direct":
+ fragments = [('A_all_accessible_features_lengths', 400), ('A_all_footprint_features_lengths', 200), ('A_all_accessible_features_merged_lengths', 800)]
+
+ for layer, max in fragments:
+ save_path = hmm_dir / layer
+ make_dirs([save_path])
+
+ figs = plot_hmm_size_contours(
+ adata,
+ length_layer=layer,
+ sample_col=cfg.sample_name_col_for_plotting,
+ ref_obs_col=cfg.reference_column,
+ rows_per_page=6,
+ max_length_cap=max,
+ figsize_per_cell=(3.5, 2.2),
+ save_path=save_path,
+ save_pdf=False,
+ save_each_page=True,
+ dpi=200,
+ smoothing_sigma=(10, 10),
+ normalize_after_smoothing=True,
+ cmap='Greens',
+ log_scale_z=True
+ )
+ ########################################################################################################################
+
+ return (adata, hmm_adata_path)
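For completeness, the two entry points described by the docstring at the top of this file: the console form, smftools hmm <config_path> (presumably wired up through the new cli_entry.py and entry_points.txt also added in this release), and the Python form sketched below, with the import path assumed from the new file layout (smftools/cli/hmm_adata.py):

# Hedged sketch of calling the hmm stage directly from Python (import path assumed).
from smftools.cli.hmm_adata import hmm_adata

adata, hmm_adata_path = hmm_adata("experiment_config.csv")   # hypothetical config csv
print(hmm_adata_path)   # gzipped .h5ad path, also recorded under "hmm_adata" in cfg.summary_file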