smftools 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96)
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/cli_flows.py +94 -0
  5. smftools/cli/hmm_adata.py +338 -0
  6. smftools/cli/load_adata.py +577 -0
  7. smftools/cli/preprocess_adata.py +363 -0
  8. smftools/cli/spatial_adata.py +564 -0
  9. smftools/cli_entry.py +435 -0
  10. smftools/config/conversion.yaml +11 -6
  11. smftools/config/deaminase.yaml +12 -7
  12. smftools/config/default.yaml +36 -25
  13. smftools/config/direct.yaml +25 -1
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +109 -12
  16. smftools/informatics/__init__.py +13 -7
  17. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  18. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  19. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  20. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  21. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  22. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  23. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  24. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  25. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  26. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  27. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  28. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  30. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  31. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  32. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  34. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  35. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  36. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  37. smftools/informatics/bam_functions.py +812 -0
  38. smftools/informatics/basecalling.py +67 -0
  39. smftools/informatics/bed_functions.py +366 -0
  40. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  41. smftools/informatics/fasta_functions.py +255 -0
  42. smftools/informatics/h5ad_functions.py +197 -0
  43. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  44. smftools/informatics/modkit_functions.py +129 -0
  45. smftools/informatics/ohe.py +160 -0
  46. smftools/informatics/pod5_functions.py +224 -0
  47. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  48. smftools/plotting/autocorrelation_plotting.py +1 -3
  49. smftools/plotting/general_plotting.py +1037 -362
  50. smftools/preprocessing/__init__.py +2 -0
  51. smftools/preprocessing/append_base_context.py +3 -3
  52. smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
  53. smftools/preprocessing/binarize.py +17 -0
  54. smftools/preprocessing/binarize_on_Youden.py +2 -2
  55. smftools/preprocessing/calculate_position_Youden.py +1 -1
  56. smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  57. smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
  58. smftools/preprocessing/flag_duplicate_reads.py +1 -1
  59. smftools/readwrite.py +266 -140
  60. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
  61. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
  62. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  63. smftools/cli.py +0 -184
  64. smftools/informatics/fast5_to_pod5.py +0 -24
  65. smftools/informatics/helpers/__init__.py +0 -73
  66. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  67. smftools/informatics/helpers/bam_qc.py +0 -66
  68. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  69. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  70. smftools/informatics/helpers/discover_input_files.py +0 -100
  71. smftools/informatics/helpers/index_fasta.py +0 -12
  72. smftools/informatics/helpers/make_dirs.py +0 -21
  73. smftools/informatics/readwrite.py +0 -106
  74. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  75. smftools/load_adata.py +0 -1346
  76. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  77. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  78. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  79. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  80. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  81. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  82. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  83. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  84. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  85. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  86. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  87. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  88. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  89. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  90. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  91. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  92. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  93. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  94. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  95. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  96. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/__init__.py CHANGED
@@ -9,26 +9,22 @@ from . import plotting as pl
  from . import preprocessing as pp
  from . import tools as tl
 
- from . import config, datasets, hmm, readwrite
+ from . import cli, config, datasets, hmm
  from .readwrite import adata_to_df, safe_write_h5ad, safe_read_h5ad, merge_barcoded_anndatas_core
 
- from .load_adata import load_adata
-
  from importlib.metadata import version
 
  package_name = "smftools"
  __version__ = version(package_name)
 
  __all__ = [
- "load_adata"
  "adata_to_df",
  "inform",
  "ml",
  "pp",
  "tl",
  "pl",
- "readwrite",
- "datasets",
+ "datasets"
  "safe_write_h5ad",
  "safe_read_h5ad"
  ]
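
For downstream users, the practical effect of this __init__.py change is that load_adata is no longer exported at the package top level. A minimal migration sketch follows; the 0.2.3 import path is an assumption based on the new smftools/cli/load_adata.py in this diff and the "from .load_adata import load_adata" statement inside smftools/cli/cli_flows.py, not something the diff states directly.

# Sketch only: how a downstream import might change between 0.2.1 and 0.2.3.
# 0.2.1 (removed in this diff):
#     from smftools import load_adata
# 0.2.3 (assumed path):
from smftools.cli.load_adata import load_adata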
smftools/_version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.2.1"
+ __version__ = "0.2.3"
smftools/cli/__init__.py ADDED (file without changes)

smftools/cli/cli_flows.py ADDED
@@ -0,0 +1,94 @@
+ def flow_I(config_path):
+ """
+ High-level function to call for converting raw sequencing data to an adata object.
+ Command line accesses this through smftools load <config_path>
+ Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
+ Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
+ Also works for illumina fastq and unaligned BAM for conversion SMF workflows.
+
+ Parameters:
+ config_path (str): A string representing the file path to the experiment configuration csv file.
+
+ Returns:
+ None
+ """
+ from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs
+ from ..config import LoadExperimentConfig, ExperimentConfig
+ from .load_adata import load_adata
+ from .preprocess_adata import preprocess_adata
+ from .spatial_adata import spatial_adata
+
+ import numpy as np
+ import pandas as pd
+ import anndata as ad
+ import scanpy as sc
+
+ import os
+ from importlib import resources
+ from pathlib import Path
+
+ from datetime import datetime
+ date_str = datetime.today().strftime("%y%m%d")
+ ################################### 1) General params and input organization ###################################
+ # Load experiment config parameters into global variables
+ loader = LoadExperimentConfig(config_path)
+ defaults_dir = resources.files("smftools").joinpath("config")
+ cfg, report = ExperimentConfig.from_var_dict(loader.var_dict, date_str=date_str, defaults_dir=defaults_dir)
+
+ # General config variable init - Necessary user passed inputs
+ smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
+ input_data_path = Path(cfg.input_data_path) # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
+ output_directory = Path(cfg.output_directory) # Path to the output directory to make for the analysis. Necessary.
+ fasta = Path(cfg.fasta) # Path to reference FASTA. Necessary.
+ split_dir = Path(cfg.split_dir) # Relative path to directory for demultiplexing reads
+ split_path = output_directory / split_dir # Absolute path to directory for demultiplexing reads
+
+ # Make initial output directory
+ make_dirs([output_directory])
+
+ bam_suffix = cfg.bam_suffix
+ strands = cfg.strands
+
+ # General config variable init - Optional user passed inputs for enzyme base specificity
+ mod_target_bases = cfg.mod_target_bases # Nucleobases of interest that may be modified. ['GpC', 'CpG', 'C', 'A']
+
+ # Conversion/deamination specific variable init
+ conversion_types = cfg.conversion_types # 5mC
+ conversions = cfg.conversions
+
+ # Common Anndata accession params
+ reference_column = cfg.reference_column
+
+ # If conversion_types is passed:
+ if conversion_types:
+ conversions += conversion_types
+
+ ############################################### smftools load start ###############################################
+ initial_adata, initial_adata_path = load_adata(config_path)
+
+ # Initial adata path info
+ initial_backup_dir = initial_adata_path.parent / 'adata_accessory_data'
+ ############################################### smftools load end ###############################################
+
+ ############################################### smftools preprocess start ###############################################
+ pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)
+
+ # Preprocessed adata path info
+ pp_adata_basename = initial_adata_path.with_suffix("").name + '_preprocessed.h5ad.gz'
+ pp_adata_path = initial_adata_path / pp_adata_basename
+ pp_backup_dir = pp_adata_path.parent / 'pp_adata_accessory_data'
+
+ # Preprocessed duplicate removed adata path info
+ pp_dup_rem_adata_basename = pp_adata_path.with_suffix("").name + '_duplicates_removed.h5ad.gz'
+ pp_dup_rem_adata_path = pp_adata_path / pp_dup_rem_adata_basename
+ pp_dup_rem_backup_dir= pp_adata_path.parent / 'pp_dup_rem_adata_accessory_data'
+ ############################################### smftools preprocess end ###############################################
+
+ ############################################### smftools spatial start ###############################################
+ # Preprocessed duplicate removed adata with basic analyses appended path info
+ basic_analyzed_adata_basename = pp_dup_rem_adata_path.with_suffix("").name + '_analyzed_I.h5ad.gz'
+ basic_analyzed_adata_path = pp_dup_rem_adata_path / basic_analyzed_adata_basename
+ basic_analyzed_backup_dir= pp_dup_rem_adata_path.parent /'duplicate_removed_analyzed_adata_I_accessory_data'
+
+ spatial_adata, spatial_adata_path = spatial_adata(config_path)
+ ############################################### smftools spatial end ###############################################
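
The docstring above states that the command line reaches flow_I through smftools load <config_path>, and the release also ships a new entry_points.txt. A hedged sketch of driving the same flow directly from Python; the config filename is a placeholder and the import path is assumed from the file location.

# Sketch only: calling the new flow without the console script.
# flow_I comes from smftools/cli/cli_flows.py above; "experiment_config.csv" is a placeholder path.
from smftools.cli.cli_flows import flow_I

flow_I("experiment_config.csv")  # runs the load, preprocess, and spatial stages in sequence, per the code above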
smftools/cli/hmm_adata.py ADDED
@@ -0,0 +1,338 @@
+ def hmm_adata(config_path):
+ """
+ High-level function to call for hmm analysis of an adata object.
+ Command line accesses this through smftools hmm <config_path>
+
+ Parameters:
+ config_path (str): A string representing the file path to the experiment configuration csv file.
+
+ Returns:
+ (pp_dedup_spatial_hmm_adata, pp_dedup_spatial_hmm_adata_path)
+ """
+ from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs, add_or_update_column_in_csv
+ from .load_adata import load_adata
+ from .preprocess_adata import preprocess_adata
+ from .spatial_adata import spatial_adata
+
+ import numpy as np
+ import pandas as pd
+ import anndata as ad
+ import scanpy as sc
+
+ import os
+ from importlib import resources
+ from pathlib import Path
+
+ from datetime import datetime
+ date_str = datetime.today().strftime("%y%m%d")
+
+ ############################################### smftools load start ###############################################
+ adata, adata_path, cfg = load_adata(config_path)
+ # General config variable init - Necessary user passed inputs
+ smf_modality = cfg.smf_modality # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Or deaminase smf Necessary.
+ output_directory = Path(cfg.output_directory) # Path to the output directory to make for the analysis. Necessary.
+
+ # Make initial output directory
+ make_dirs([output_directory])
+ ############################################### smftools load end ###############################################
+
+ ############################################### smftools preprocess start ###############################################
+ pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)
+ ############################################### smftools preprocess end ###############################################
+
+ ############################################### smftools spatial start ###############################################
+ spatial_ad, spatial_adata_path = spatial_adata(config_path)
+ ############################################### smftools spatial end ###############################################
+
+ ############################################### smftools hmm start ###############################################
+ input_manager_df = pd.read_csv(cfg.summary_file)
+ initial_adata_path = Path(input_manager_df['load_adata'][0])
+ pp_adata_path = Path(input_manager_df['pp_adata'][0])
+ pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
+ spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
+ hmm_adata_path = Path(input_manager_df['hmm_adata'][0])
+
+ if spatial_ad:
+ # This happens on first run of the pipeline
+ adata = spatial_ad
+ else:
+ # If an anndata is saved, check which stages of the anndata are available
+ initial_version_available = initial_adata_path.exists()
+ preprocessed_version_available = pp_adata_path.exists()
+ preprocessed_dup_removed_version_available = pp_dup_rem_adata_path.exists()
+ preprocessed_dedup_spatial_version_available = spatial_adata_path.exists()
+ preprocessed_dedup_spatial_hmm_version_available = hmm_adata_path.exists()
+
+ if cfg.force_redo_hmm_fit:
+ print(f"Forcing redo of basic analysis workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
+ if preprocessed_dedup_spatial_version_available:
+ adata, load_report = safe_read_h5ad(spatial_adata_path)
+ elif preprocessed_dup_removed_version_available:
+ adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
+ elif initial_version_available:
+ adata, load_report = safe_read_h5ad(initial_adata_path)
+ else:
+ print(f"Can not redo duplicate detection when there is no compatible adata available: either raw or preprocessed are required")
+ elif preprocessed_dedup_spatial_hmm_version_available:
+ return (None, hmm_adata_path)
+ else:
+ if preprocessed_dedup_spatial_version_available:
+ adata, load_report = safe_read_h5ad(spatial_adata_path)
+ elif preprocessed_dup_removed_version_available:
+ adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path)
+ elif initial_version_available:
+ adata, load_report = safe_read_h5ad(initial_adata_path)
+ else:
+ print(f"No adata available.")
+ return
+ references = adata.obs[cfg.reference_column].cat.categories
+ deaminase = smf_modality == 'deaminase'
+ ############################################### HMM based feature annotations ###############################################
+ if not (cfg.bypass_hmm_fit and cfg.bypass_hmm_apply):
+ from ..hmm.HMM import HMM
+ from scipy.sparse import issparse, csr_matrix
+ import warnings
+
+ pp_dir = output_directory / "preprocessed"
+ pp_dir = pp_dir / "deduplicated"
+ hmm_dir = pp_dir / "10_hmm_models"
+
+ if hmm_dir.is_dir():
+ print(f'{hmm_dir} already exists.')
+ else:
+ make_dirs([pp_dir, hmm_dir])
+
+ samples = adata.obs[cfg.sample_name_col_for_plotting].cat.categories
+ references = adata.obs[cfg.reference_column].cat.categories
+ uns_key = "hmm_appended_layers"
+
+ # ensure uns key exists (avoid KeyError later)
+ if adata.uns.get(uns_key) is None:
+ adata.uns[uns_key] = []
+
+ for sample in samples:
+ for ref in references:
+ mask = (adata.obs[cfg.sample_name_col_for_plotting] == sample) & (adata.obs[cfg.reference_column] == ref)
+ subset = adata[mask].copy()
+ if subset.shape[0] < 1:
+ continue
+
+ for mod_site in cfg.hmm_methbases:
+ mod_label = {'C': 'C'}.get(mod_site, mod_site)
+ hmm_path = hmm_dir / f"{sample}_{ref}_{mod_label}_hmm_model.pth"
+
+ # ensure the input obsm exists
+ obsm_key = f'{ref}_{mod_label}_site'
+ if obsm_key not in subset.obsm:
+ print(f"Skipping {sample} {ref} {mod_label}: missing obsm '{obsm_key}'")
+ continue
+
+ # Fit or load model
+ if os.path.exists(hmm_path) and not cfg.force_redo_hmm_fit:
+ hmm = HMM.load(hmm_path)
+ hmm.print_params()
+ else:
+ print(f"Fitting HMM for {sample} {ref} {mod_label}")
+ hmm = HMM.from_config(cfg)
+ # fit expects a list-of-seqs or 2D ndarray in the obsm
+ seqs = subset.obsm[obsm_key]
+ hmm.fit(seqs)
+ hmm.print_params()
+ hmm.save(hmm_path)
+
+ # Apply / annotate on the subset, then copy layers back to final_adata
+ if (not cfg.bypass_hmm_apply) or cfg.force_redo_hmm_apply:
+ print(f"Applying HMM on subset for {sample} {ref} {mod_label}")
+ # Use the new uns_key argument so subset will record appended layer names
+ # (annotate_adata modifies subset.obs/layers in-place and should write subset.uns[uns_key])
+ hmm.annotate_adata(subset,
+ obs_column=cfg.reference_column,
+ layer=cfg.layer_for_umap_plotting,
+ config=cfg)
+
+ #to_merge = [("C_all_accessible_features", 80)]
+ to_merge = cfg.hmm_merge_layer_features
+ for layer_to_merge, merge_distance in to_merge:
+ if layer_to_merge:
+ hmm.merge_intervals_in_layer(subset,
+ layer=layer_to_merge,
+ distance_threshold=merge_distance,
+ overwrite=True
+ )
+ else:
+ pass
+
+ # collect appended layers from subset.uns
+ appended = list(subset.uns.get(uns_key, []))
+ print(appended)
+ if len(appended) == 0:
+ # nothing appended for this subset; continue
+ continue
+
+ # copy each appended layer into adata
+ subset_mask_bool = mask.values if hasattr(mask, "values") else np.asarray(mask)
+ for layer_name in appended:
+ if layer_name not in subset.layers:
+ # defensive: skip
+ warnings.warn(f"Expected layer {layer_name} in subset but not found; skipping copy.")
+ continue
+ sub_layer = subset.layers[layer_name]
+ # ensure final layer exists and assign rows
+ try:
+ hmm._ensure_final_layer_and_assign(adata, layer_name, subset_mask_bool, sub_layer)
+ except Exception as e:
+ warnings.warn(f"Failed to copy layer {layer_name} into adata: {e}", stacklevel=2)
+ # fallback: if dense and small, try to coerce
+ if issparse(sub_layer):
+ arr = sub_layer.toarray()
+ else:
+ arr = np.asarray(sub_layer)
+ adata.layers[layer_name] = adata.layers.get(layer_name, np.zeros((adata.shape[0], arr.shape[1]), dtype=arr.dtype))
+ final_idx = np.nonzero(subset_mask_bool)[0]
+ adata.layers[layer_name][final_idx, :] = arr
+
+ # merge appended layer names into adata.uns
+ existing = list(adata.uns.get(uns_key, []))
+ for ln in appended:
+ if ln not in existing:
+ existing.append(ln)
+ adata.uns[uns_key] = existing
+
+ else:
+ pass
+
+ ## Save HMM annotated adata
+ if not hmm_adata_path.exists():
+ print('Saving hmm analyzed adata post preprocessing and duplicate removal')
+ if ".gz" == hmm_adata_path.suffix:
+ safe_write_h5ad(adata, hmm_adata_path, compression='gzip', backup=True)
+ else:
+ hmm_adata_path = hmm_adata_path.with_name(hmm_adata_path.name + '.gz')
+ safe_write_h5ad(adata, hmm_adata_path, compression='gzip', backup=True)
+
+ add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", hmm_adata_path)
+
+ ########################################################################################################################
+
+ ############################################### HMM based feature plotting ###############################################
+
+ hmm_dir = pp_dir / "11_hmm_clustermaps"
+
+ if hmm_dir.is_dir():
+ print(f'{hmm_dir} already exists.')
+ else:
+ make_dirs([pp_dir, hmm_dir])
+ from ..plotting import combined_hmm_raw_clustermap
+ feature_layers = [
+ "all_accessible_features",
+ "large_accessible_patch",
+ "small_bound_stretch",
+ "medium_bound_stretch",
+ "putative_nucleosome",
+ "all_accessible_features_merged",
+ ]
+
+ layers: list[str] = []
+
+ if any(base in ["C", "CpG", "GpC"] for base in cfg.mod_target_bases):
+ if smf_modality == 'deaminase':
+ layers.extend([f"C_{layer}" for layer in feature_layers])
+ elif smf_modality == 'conversion':
+ layers.extend([f"GpC_{layer}" for layer in feature_layers])
+
+ if 'A' in cfg.mod_target_bases:
+ layers.extend([f"A_{layer}" for layer in feature_layers])
+
+ if not layers:
+ raise ValueError(
+ f"No HMM feature layers matched mod_target_bases={cfg.mod_target_bases} "
+ f"and smf_modality={smf_modality}"
+ )
+
+ if smf_modality == 'direct':
+ sort_by = "any_a"
+ else:
+ sort_by = 'gpc'
+
+ for layer in layers:
+ save_path = hmm_dir / layer
+ make_dirs([save_path])
+
+ combined_hmm_raw_clustermap(
+ adata,
+ sample_col=cfg.sample_name_col_for_plotting,
+ reference_col=cfg.reference_column,
+ hmm_feature_layer=layer,
+ layer_gpc="nan0_0minus1",
+ layer_cpg="nan0_0minus1",
+ layer_any_c="nan0_0minus1",
+ layer_a= "nan0_0minus1",
+ cmap_hmm="coolwarm",
+ cmap_gpc="coolwarm",
+ cmap_cpg="viridis",
+ cmap_any_c='coolwarm',
+ cmap_a= "coolwarm",
+ min_quality=cfg.read_quality_filter_thresholds[0],
+ min_length=cfg.read_len_filter_thresholds[0],
+ min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
+ min_position_valid_fraction=1-cfg.position_max_nan_threshold,
+ save_path=save_path,
+ normalize_hmm=False,
+ sort_by=sort_by, # options: 'gpc', 'cpg', 'gpc_cpg', 'none', or 'obs:<column>'
+ bins=None,
+ deaminase=deaminase,
+ min_signal=0
+ )
+
+ hmm_dir = pp_dir / "12_hmm_bulk_traces"
+
+ if hmm_dir.is_dir():
+ print(f'{hmm_dir} already exists.')
+ else:
+ make_dirs([pp_dir, hmm_dir])
+ from ..plotting import plot_hmm_layers_rolling_by_sample_ref
+ saved = plot_hmm_layers_rolling_by_sample_ref(
+ adata,
+ layers=adata.uns['hmm_appended_layers'],
+ sample_col=cfg.sample_name_col_for_plotting,
+ ref_col=cfg.reference_column,
+ window=101,
+ rows_per_page=4,
+ figsize_per_cell=(4,2.5),
+ output_dir=hmm_dir,
+ save=True,
+ show_raw=False
+ )
+
+ hmm_dir = pp_dir / "13_hmm_fragment_distributions"
+
+ if hmm_dir.is_dir():
+ print(f'{hmm_dir} already exists.')
+ else:
+ make_dirs([pp_dir, hmm_dir])
+ from ..plotting import plot_hmm_size_contours
+
+ for layer, max in [('C_all_accessible_features_lengths', 400), ('C_all_footprint_features_lengths', 160), ('C_all_accessible_features_merged_lengths', 800)]:
+ save_path = hmm_dir / layer
+ make_dirs([save_path])
+
+ figs = plot_hmm_size_contours(
+ adata,
+ length_layer=layer,
+ sample_col=cfg.sample_name_col_for_plotting,
+ ref_obs_col=cfg.reference_column,
+ rows_per_page=6,
+ max_length_cap=max,
+ figsize_per_cell=(3.5, 2.2),
+ save_path=save_path,
+ save_pdf=False,
+ save_each_page=True,
+ dpi=200,
+ smoothing_sigma=None,
+ normalize_after_smoothing=False,
+ cmap='viridis',
+ log_scale_z=True
+ )
+ ########################################################################################################################
+
+ return (adata, hmm_adata_path)
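
Per the code above, hmm_adata chains the load, preprocess, and spatial stages, fits or loads one HMM per sample/reference/modified-base combination, writes the annotated object to hmm_adata_path, and returns (adata, hmm_adata_path); if an HMM-annotated file already exists it short-circuits and returns (None, hmm_adata_path). A hedged usage sketch follows; the import path and config filename are assumptions.

# Sketch only: invoking the HMM stage directly from Python instead of `smftools hmm <config_path>`.
from smftools.cli.hmm_adata import hmm_adata  # assumed import path for smftools/cli/hmm_adata.py

annotated, annotated_path = hmm_adata("experiment_config.csv")  # placeholder config path
if annotated is None:
    # An existing HMM-annotated .h5ad short-circuits the run (see the early return above).
    print(f"HMM-annotated AnnData already exists at {annotated_path}")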