smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
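Among the notable changes, 0.2.4 adds smftools/cli_entry.py and a two-line entry_points.txt to the wheel metadata, so the package now registers a console script. A hedged way to inspect what got registered after installing 0.2.4 (assumes Python >= 3.10 for the group= keyword; the actual script names are not shown in this diff):

from importlib.metadata import entry_points, version

# Print the installed smftools version and any console scripts that
# resolve into the smftools package.
print("smftools", version("smftools"))
for ep in entry_points(group="console_scripts"):
    if ep.value.startswith("smftools"):
        print(f"{ep.name} -> {ep.value}")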
smftools/load_adata.py DELETED
@@ -1,1346 +0,0 @@
- ## load_adata
-
- def load_adata(config_path):
-     """
-     High-level function to call for converting raw sequencing data to an adata object.
-     Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
-     Works for nanopore pod5, fast5, and unaligned BAM for conversion SMF workflows.
-     Also works for Illumina fastq and unaligned BAM for conversion SMF workflows.
-
-     Parameters:
-         config_path (str): A string representing the file path to the experiment configuration csv file.
-
-     Returns:
-         None
-     """
-     from .readwrite import safe_read_h5ad, safe_write_h5ad
-     from .config import LoadExperimentConfig, ExperimentConfig
-     from .informatics.helpers import discover_input_files, make_dirs, concatenate_fastqs_to_bam, extract_read_features_from_bam
-     from .informatics.fast5_to_pod5 import fast5_to_pod5
-     from .informatics.subsample_fasta_from_bed import subsample_fasta_from_bed
-
-     import numpy as np
-     import pandas as pd
-     import anndata as ad
-     import scanpy as sc
-
-     import os
-     from importlib import resources
-     from pathlib import Path
-
-     from datetime import datetime
-     date_str = datetime.today().strftime("%y%m%d")
-
-     ################################### 1) General params and input organization ###################################
-
-     # Load experiment config parameters into global variables
-     loader = LoadExperimentConfig(config_path)
-     defaults_dir = resources.files("smftools").joinpath("config")
-     cfg, report = ExperimentConfig.from_var_dict(loader.var_dict, date_str=date_str, defaults_dir=defaults_dir)
-
-     # General config variable init - Necessary user passed inputs
-     smf_modality = cfg.smf_modality # Specifies whether the data is conversion SMF, direct methylation detection SMF, or deaminase SMF. Necessary.
-     input_data_path = cfg.input_data_path # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
-     output_directory = cfg.output_directory # Path to the output directory to make for the analysis. Necessary.
-     fasta = cfg.fasta # Path to reference FASTA. Necessary.
-
-     bam_suffix = cfg.bam_suffix
-     split_dir = cfg.split_dir
-     strands = cfg.strands
-
-     # General config variable init - Optional user passed inputs for enzyme base specificity
-     mod_target_bases = cfg.mod_target_bases # Nucleobases of interest that may be modified. ['GpC', 'CpG', 'C', 'A']
-
-     # Conversion/deamination specific variable init
-     conversion_types = cfg.conversion_types # 5mC
-     conversions = cfg.conversions
-
-     # Common Anndata accession params
-     reference_column = cfg.reference_column
-
-     # Make initial output directory
-     make_dirs([output_directory])
-     os.chdir(output_directory)
-
-     # Define the pathname to split BAMs into later during demultiplexing.
-     split_path = os.path.join(output_directory, split_dir)
-
-     # If conversion_types is passed:
-     if conversion_types:
-         conversions += conversion_types
-
-     # Detect the input filetype
-     if Path(input_data_path).is_file():
-         input_data_filetype = '.' + os.path.basename(input_data_path).split('.')[1].lower()
-         input_is_pod5 = input_data_filetype in ['.pod5','.p5']
-         input_is_fast5 = input_data_filetype in ['.fast5','.f5']
-         input_is_fastq = input_data_filetype in ['.fastq', '.fq']
-         input_is_bam = input_data_filetype == bam_suffix
-         if input_is_fastq:
-             fastq_paths = [input_data_path]
-     elif Path(input_data_path).is_dir():
-         found = discover_input_files(input_data_path, bam_suffix=bam_suffix, recursive=cfg.recursive_input_search)
-
-         input_is_pod5 = found["input_is_pod5"]
-         input_is_fast5 = found["input_is_fast5"]
-         input_is_fastq = found["input_is_fastq"]
-         input_is_bam = found["input_is_bam"]
-
-         pod5_paths = found["pod5_paths"]
-         fast5_paths = found["fast5_paths"]
-         fastq_paths = found["fastq_paths"]
-         bam_paths = found["bam_paths"]
-
-         print(f"Found {found['all_files_searched']} files; fastq={len(fastq_paths)}, bam={len(bam_paths)}, pod5={len(pod5_paths)}, fast5={len(fast5_paths)}")
-
-     # If the inputs are fast5 files rather than pod5 files, convert them to a single pod5 file before proceeding.
-     if input_is_fast5 and not input_is_pod5:
-         # Take the input directory of fast5 files and write out a single pod5 file into the output directory.
-         output_pod5 = os.path.join(output_directory, 'FAST5s_to_POD5.pod5')
-         print(f'Input directory contains fast5 files; converting and concatenating them into a single pod5 file at {output_pod5}')
-         fast5_to_pod5(input_data_path, output_pod5)
-         # Reassign the pod5_dir variable to point to the new pod5 file.
-         input_data_path = output_pod5
-         input_is_pod5 = True
-         input_is_fast5 = False
-
-     # If the input is a fastq or a directory of fastqs, concatenate them into an unaligned BAM and record barcodes in the BC tag.
-     elif input_is_fastq:
-         # Output file for FASTQ concatenation.
-         output_bam = os.path.join(output_directory, 'FASTQs_concatenated_into_BAM.bam')
-
-         summary = concatenate_fastqs_to_bam(
-             fastq_paths,
-             output_bam,
-             barcode_tag='BC',
-             gzip_suffixes=('.gz','.gzip'),
-             barcode_map=cfg.fastq_barcode_map,
-             add_read_group=True,
-             rg_sample_field=None,
-             progress=False,
-             auto_pair=cfg.fastq_auto_pairing)
-
-         print(f"Found the following barcodes: {summary['barcodes']}")
-
-         # Set the input data path to the concatenated BAM.
-         input_data_path = output_bam
-         input_is_bam = True
-         input_is_fastq = False
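# Hedged pysam sketch of the idea behind concatenate_fastqs_to_bam (not the
# package's implementation): write FASTQ records into an unaligned BAM and
# stamp each read with a BC barcode tag. Filenames and the barcode value
# are placeholders.
import pysam

header = {"HD": {"VN": "1.6", "SO": "unsorted"}}
with pysam.FastxFile("reads.fastq") as fq, \
     pysam.AlignmentFile("unaligned.bam", "wb", header=header) as out:
    for rec in fq:
        a = pysam.AlignedSegment()
        a.query_name = rec.name
        a.query_sequence = rec.sequence
        a.flag = 4  # unmapped
        a.query_qualities = pysam.qualitystring_to_array(rec.quality)
        a.set_tag("BC", "barcode01")
        out.write(a)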
-
-     # Determine if the input data needs to be basecalled
-     if input_is_pod5:
-         basecall = True
-     elif input_is_bam:
-         basecall = False
-     else:
-         print('Error: cannot find an input BAM or POD5.')
-
-     # Generate the base name of the unaligned bam without the .bam suffix
-     if basecall:
-         model_basename = os.path.basename(cfg.model)
-         model_basename = model_basename.replace('.', '_')
-         if smf_modality == 'direct':
-             mod_string = "_".join(cfg.mod_list)
-             bam = f"{output_directory}/{model_basename}_{mod_string}_calls"
-         else:
-             bam = f"{output_directory}/{model_basename}_canonical_basecalls"
-     else:
-         bam_base = os.path.basename(input_data_path).split('.bam')[0]
-         bam = os.path.join(output_directory, bam_base)
-
-     # Generate path names for the unaligned, aligned, and aligned/sorted BAMs.
-     unaligned_output = bam + bam_suffix
-     aligned_BAM = f"{bam}_aligned"
-     aligned_output = aligned_BAM + bam_suffix
-     aligned_sorted_BAM = f"{aligned_BAM}_sorted"
-     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
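# The naming scheme above, condensed into a sketch:
# <bam>.bam -> <bam>_aligned.bam -> <bam>_aligned_sorted.bam
def derived_bam_paths(bam, bam_suffix=".bam"):
    unaligned = bam + bam_suffix
    aligned = f"{bam}_aligned{bam_suffix}"
    aligned_sorted = f"{bam}_aligned_sorted{bam_suffix}"
    return unaligned, aligned, aligned_sorted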
-
-     # Naming of the demultiplexed output directory
-     if cfg.barcode_both_ends:
-         split_dir = split_dir + '_both_ends_barcoded'
-     else:
-         split_dir = split_dir + '_at_least_one_end_barcoded'
-
-     # Direct methylation detection SMF specific parameters
-     if smf_modality == 'direct':
-         mod_bed_dir = f"{split_dir}/split_mod_beds"
-         mod_tsv_dir = f"{split_dir}/split_mod_tsvs"
-         bam_qc_dir = f"{split_dir}/bam_qc"
-         mod_map = {'6mA': '6mA', '5mC_5hmC': '5mC'}
-         mods = [mod_map[mod] for mod in cfg.mod_list]
-
-     os.chdir(output_directory)
-     ########################################################################################################################
-
-     ################################### 2) FASTA Handling ###################################
-     from .informatics.helpers import generate_converted_FASTA, get_chromosome_lengths
-
-     # If fasta_regions_of_interest bed is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
-     if cfg.fasta_regions_of_interest and '.bed' in cfg.fasta_regions_of_interest:
-         fasta_basename = os.path.basename(fasta).split('.fa')[0]
-         bed_basename_minus_suffix = os.path.basename(cfg.fasta_regions_of_interest).split('.bed')[0]
-         output_FASTA = fasta_basename + '_subsampled_by_' + bed_basename_minus_suffix + '.fasta'
-         subsample_fasta_from_bed(fasta, cfg.fasta_regions_of_interest, output_directory, output_FASTA)
-         fasta = os.path.join(output_directory, output_FASTA)
-
-     # For conversion style SMF, make a converted reference FASTA
-     if smf_modality == 'conversion':
-         fasta_basename = os.path.basename(fasta)
-         converted_FASTA_basename = fasta_basename.split('.fa')[0] + '_converted.fasta'
-         converted_FASTA = os.path.join(output_directory, converted_FASTA_basename)
-         if 'converted.fa' in fasta:
-             print(fasta + ' is already converted. Using existing converted FASTA.')
-             converted_FASTA = fasta
-         elif os.path.exists(converted_FASTA):
-             print(converted_FASTA + ' already exists. Using existing converted FASTA.')
-         else:
-             generate_converted_FASTA(fasta, conversions, strands, converted_FASTA)
-         fasta = converted_FASTA
-
-     # Make a FAI and .chrom.names file for the fasta
-     get_chromosome_lengths(fasta)
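# Conceptual sketch of reference "conversion" (not the package's
# generate_converted_FASTA): in conversion SMF an unmethylated C is read
# as T, so the top strand of the reference gets a C->T converted
# counterpart (and the bottom strand a G->A one).
def convert_reference(seq: str, strand: str = "top") -> str:
    seq = seq.upper()
    return seq.replace("C", "T") if strand == "top" else seq.replace("G", "A")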
-     ########################################################################################################################
-
-     ################################### 3) Basecalling ###################################
-     from .informatics.helpers import modcall, canoncall
-     # Basecall using dorado
-     if basecall and cfg.sequencer == 'ont':
-         if os.path.exists(unaligned_output):
-             print(unaligned_output + ' already exists. Using existing basecalled BAM.')
-         elif smf_modality != 'direct':
-             canoncall(cfg.model_dir, cfg.model, input_data_path, cfg.barcode_kit, bam, bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
-         else:
-             modcall(cfg.model_dir, cfg.model, input_data_path, cfg.barcode_kit, cfg.mod_list, bam, bam_suffix, cfg.barcode_both_ends, cfg.trim, cfg.device)
-     elif basecall:
-         print(f"Basecalling is currently only supported for ont sequencers and not pacbio.")
-     else:
-         pass
-     ########################################################################################################################
-
-     ################################### 4) Alignment and sorting #############################################
-     from .informatics.helpers import align_and_sort_BAM, aligned_BAM_to_bed
-     # Align the BAM to the reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads.
-     if os.path.exists(aligned_output) and os.path.exists(aligned_sorted_output):
-         print(aligned_sorted_output + ' already exists. Using existing aligned/sorted BAM.')
-     else:
-         align_and_sort_BAM(fasta, unaligned_output, bam_suffix, output_directory, cfg.make_bigwigs, cfg.threads, cfg.aligner, cfg.aligner_args)
-
-     # Make beds and provide basic histograms
-     bed_dir = os.path.join(output_directory, 'beds')
-     if os.path.isdir(bed_dir):
-         print(bed_dir + ' already exists. Skipping BAM -> BED conversion for ' + aligned_sorted_output)
-     else:
-         aligned_BAM_to_bed(aligned_output, output_directory, fasta, cfg.make_bigwigs, cfg.threads)
-     ########################################################################################################################
-
-     ################################### 5) Demultiplexing ######################################################################
-     from .informatics.helpers import demux_and_index_BAM, split_and_index_BAM
-     # Split the aligned and sorted BAM files by barcode (BC tag) into the split_BAM directory
-     if os.path.isdir(split_dir):
-         print(split_dir + ' already exists. Using existing demultiplexed BAMs.')
-         bam_files = os.listdir(split_dir)
-         bam_files = [os.path.join(split_dir, file) for file in bam_files if '.bam' in file and '.bai' not in file and 'unclassified' not in file]
-         bam_files.sort()
-     else:
-         make_dirs([split_dir])
-         if cfg.input_already_demuxed:
-             split_and_index_BAM(aligned_sorted_BAM,
-                                 split_dir,
-                                 bam_suffix)
-         else:
-             bam_files = demux_and_index_BAM(aligned_sorted_BAM,
-                                             split_dir, bam_suffix,
-                                             cfg.barcode_kit,
-                                             cfg.barcode_both_ends,
-                                             cfg.trim,
-                                             fasta,
-                                             cfg.make_bigwigs,
-                                             cfg.threads)
-
-     # Make beds and provide basic histograms
-     bed_dir = os.path.join(split_dir, 'beds')
-     if os.path.isdir(bed_dir):
-         print(bed_dir + ' already exists. Skipping BAM -> BED conversion for demultiplexed bams')
-     else:
-         for bam in bam_files:
-             aligned_BAM_to_bed(bam, split_dir, fasta, cfg.make_bigwigs, cfg.threads)
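# Minimal pysam sketch of barcode demultiplexing by BC tag, analogous in
# spirit to demux_and_index_BAM above (not the package's implementation).
# Paths and the tag name are placeholders.
import pysam

def split_bam_by_barcode(in_bam, out_prefix, tag="BC"):
    outs = {}
    with pysam.AlignmentFile(in_bam, "rb") as bam:
        for read in bam:
            bc = read.get_tag(tag) if read.has_tag(tag) else "unclassified"
            if bc not in outs:
                outs[bc] = pysam.AlignmentFile(f"{out_prefix}_{bc}.bam", "wb", template=bam)
            outs[bc].write(read)
    for f in outs.values():
        f.close()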
-     ########################################################################################################################
-
-     ################################### 6) SAMTools based BAM QC ######################################################################
-     from .informatics.helpers import bam_qc
-     # Samtools QC metrics on split BAM files
-     bam_qc_dir = f"{split_dir}/bam_qc"
-     if os.path.isdir(bam_qc_dir):
-         print(bam_qc_dir + ' already exists. Using existing BAM QC calculations.')
-     else:
-         make_dirs([bam_qc_dir])
-         bam_qc(bam_files, bam_qc_dir, cfg.threads, modality=smf_modality)
-     ########################################################################################################################
-
-     ################################### 7) AnnData loading ######################################################################
-     if smf_modality != 'direct':
-         from .informatics.helpers import converted_BAM_to_adata_II
-         # Take the converted BAM and load it into an adata object.
-         if smf_modality == 'deaminase':
-             deaminase_footprinting = True
-         else:
-             deaminase_footprinting = False
-         raw_adata, raw_adata_path = converted_BAM_to_adata_II(fasta,
-                                                               split_dir,
-                                                               cfg.mapping_threshold,
-                                                               cfg.experiment_name,
-                                                               conversions,
-                                                               bam_suffix,
-                                                               cfg.device,
-                                                               cfg.threads,
-                                                               deaminase_footprinting,
-                                                               delete_intermediates=cfg.delete_intermediate_hdfs)
-     else:
-         if os.path.isdir(mod_bed_dir):
-             print(mod_bed_dir + ' already exists, skipping making modbeds')
-         else:
-             from .informatics.helpers import modQC, make_modbed, extract_mods, modkit_extract_to_adata
-             make_dirs([mod_bed_dir])
-
-             modQC(aligned_sorted_output,
-                   cfg.thresholds) # get QC metrics for mod calls
-
-             make_modbed(aligned_sorted_output,
-                         cfg.thresholds,
-                         mod_bed_dir) # Generate bed files of position methylation summaries for every sample
-
-             make_dirs([mod_tsv_dir])
-             extract_mods(cfg.thresholds,
-                          mod_tsv_dir,
-                          split_dir,
-                          bam_suffix,
-                          cfg.skip_unclassified,
-                          cfg.threads) # Extract methylation calls for split BAM files into split TSV files
-
-         # Load the modification data from TSVs into an adata object
-         raw_adata, raw_adata_path = modkit_extract_to_adata(fasta,
-                                                             split_dir,
-                                                             cfg.mapping_threshold,
-                                                             cfg.experiment_name,
-                                                             mods,
-                                                             cfg.batch_size,
-                                                             mod_tsv_dir,
-                                                             cfg.delete_batch_hdfs,
-                                                             cfg.threads)
-
-     ########################################################################################################################
-
-     ############################################### 8) Basic Read quality metrics: read length, read quality, mapping quality, etc. #################################################
-
-     # Raw adata path info
-     raw_backup_dir = os.path.join(os.path.dirname(raw_adata_path), 'adata_accessory_data')
-
-     # Preprocessed adata path info
-     pp_adata_basename = os.path.basename(raw_adata_path).split('.h5ad')[0] + '_preprocessed.h5ad.gz'
-     pp_adata_path = os.path.join(os.path.dirname(raw_adata_path), pp_adata_basename)
-     pp_backup_dir = os.path.join(os.path.dirname(pp_adata_path), 'pp_adata_accessory_data')
-
-     # Preprocessed, duplicate-removed adata path info
-     pp_dup_rem_adata_basename = os.path.basename(pp_adata_basename).split('.h5ad')[0] + '_duplicates_removed.h5ad.gz'
-     pp_dup_rem_adata_path = os.path.join(os.path.dirname(pp_adata_path), pp_dup_rem_adata_basename)
-     pp_dup_rem_backup_dir = os.path.join(os.path.dirname(pp_dup_rem_adata_path), 'pp_dup_rem_adata_accessory_data')
-
-     # Preprocessed, duplicate-removed adata with basic analyses appended
-     basic_analyzed_adata_basename = os.path.basename(pp_dup_rem_adata_path).split('.h5ad')[0] + '_analyzed_I.h5ad.gz'
-     basic_analyzed_adata_path = os.path.join(os.path.dirname(pp_dup_rem_adata_path), basic_analyzed_adata_basename)
-     basic_analyzed_backup_dir = os.path.join(os.path.dirname(pp_dup_rem_adata_path), 'duplicate_removed_analyzed_adata_I_accessory_data')
-
-     # As above, with additional HMM feature layers added
-     hmm_adata_basename = os.path.basename(basic_analyzed_adata_path).split('.h5ad')[0] + '_hmm.h5ad.gz'
-     hmm_adata_path = os.path.join(os.path.dirname(basic_analyzed_adata_path), hmm_adata_basename)
-     hmm_backup_dir = os.path.join(os.path.dirname(hmm_adata_path), 'duplicate_removed_analyzed_adata_I_hmm_accessory_data')
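# The stage naming above follows one pattern; a sketch of it:
# each stage strips ".h5ad*" and appends "_<stage>.h5ad.gz" in place.
import os

def next_stage_path(path, stage):
    base = os.path.basename(path).split('.h5ad')[0]
    return os.path.join(os.path.dirname(path), f"{base}_{stage}.h5ad.gz")

# e.g. "exp.h5ad" -> "exp_preprocessed.h5ad.gz"
#   -> "exp_preprocessed_duplicates_removed.h5ad.gz"
#   -> "..._analyzed_I.h5ad.gz" -> "..._analyzed_I_hmm.h5ad.gz"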
-
-     if raw_adata:
-         # This happens on the first run of the pipeline
-         adata = raw_adata
-     else:
-         # If an anndata is saved, check which stages of the anndata are available
-         raw_version_available = os.path.exists(raw_adata_path) and os.path.isdir(raw_backup_dir)
-         preprocessed_version_available = os.path.exists(pp_adata_path) and os.path.isdir(pp_backup_dir)
-         preprocessed_dup_removed_version_available = os.path.exists(pp_dup_rem_adata_path) and os.path.isdir(pp_dup_rem_backup_dir)
-         preprocessed_dup_removed_analyzed_I_version_available = os.path.exists(basic_analyzed_adata_path) and os.path.isdir(basic_analyzed_backup_dir)
-         hmm_version_available = os.path.exists(hmm_adata_path) and os.path.isdir(hmm_backup_dir)
-
-         if cfg.force_redo_preprocessing:
-             print(f"Forcing full redo of preprocessing workflow, starting from the earliest stage adata available.")
-             if raw_version_available:
-                 adata, load_report = safe_read_h5ad(raw_adata_path, backup_dir=raw_backup_dir)
-             elif preprocessed_version_available:
-                 adata, load_report = safe_read_h5ad(pp_adata_path, backup_dir=pp_backup_dir)
-             elif preprocessed_dup_removed_version_available:
-                 adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path, backup_dir=pp_dup_rem_backup_dir)
-             elif preprocessed_dup_removed_analyzed_I_version_available:
-                 adata, load_report = safe_read_h5ad(basic_analyzed_adata_path, backup_dir=basic_analyzed_backup_dir)
-             elif hmm_version_available:
-                 adata, load_report = safe_read_h5ad(hmm_adata_path, backup_dir=hmm_backup_dir)
-             else:
-                 print(f"Cannot redo preprocessing when there is no adata available.")
-         elif cfg.force_redo_flag_duplicate_reads:
-             print(f"Forcing redo of duplicate detection workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
-             if preprocessed_version_available:
-                 adata, load_report = safe_read_h5ad(pp_adata_path, backup_dir=pp_backup_dir)
-             elif raw_version_available:
-                 adata, load_report = safe_read_h5ad(raw_adata_path, backup_dir=raw_backup_dir)
-             else:
-                 print(f"Cannot redo duplicate detection when there is no compatible adata available: either raw or preprocessed is required.")
-         elif cfg.force_redo_basic_analyses:
-             print(f"Forcing redo of basic analysis workflow, starting from the preprocessed adata if available. Otherwise, will use the raw adata.")
-             if preprocessed_version_available:
-                 adata, load_report = safe_read_h5ad(pp_adata_path, backup_dir=pp_backup_dir)
-             elif raw_version_available:
-                 adata, load_report = safe_read_h5ad(raw_adata_path, backup_dir=raw_backup_dir)
-             else:
-                 print(f"Cannot redo basic analyses when there is no compatible adata available: either raw or preprocessed is required.")
-         elif hmm_version_available:
-             adata, load_report = safe_read_h5ad(hmm_adata_path, backup_dir=hmm_backup_dir)
-         elif preprocessed_dup_removed_analyzed_I_version_available:
-             adata, load_report = safe_read_h5ad(basic_analyzed_adata_path, backup_dir=basic_analyzed_backup_dir)
-         elif preprocessed_dup_removed_version_available:
-             adata, load_report = safe_read_h5ad(pp_dup_rem_adata_path, backup_dir=pp_dup_rem_backup_dir)
-         elif preprocessed_version_available:
-             adata, load_report = safe_read_h5ad(pp_adata_path, backup_dir=pp_backup_dir)
-         elif raw_version_available:
-             adata, load_report = safe_read_h5ad(raw_adata_path, backup_dir=raw_backup_dir)
-         else:
-             print(f"No adata available.")
-
-     ## Load sample sheet metadata based on barcode mapping ##
-     if cfg.sample_sheet_path:
-         from .preprocessing import load_sample_sheet
-         load_sample_sheet(adata,
-                           cfg.sample_sheet_path,
-                           mapping_key_column=cfg.sample_sheet_mapping_column,
-                           as_category=True,
-                           force_reload=cfg.force_reload_sample_sheet)
-     else:
-         pass
-
-     # Add read length, read quality, reference length, mapped length, and mapping quality metadata to the adata object.
-     from .preprocessing import add_read_length_and_mapping_qc
-     add_read_length_and_mapping_qc(adata, bam_files,
-                                    extract_read_features_from_bam_callable=extract_read_features_from_bam,
-                                    bypass=cfg.bypass_add_read_length_and_mapping_qc,
-                                    force_redo=cfg.force_redo_add_read_length_and_mapping_qc)
-
-     adata.obs['Raw_modification_signal'] = np.nansum(adata.X, axis=1)
-
-     pp_dir = f"{split_dir}/preprocessed"
-     pp_length_qc_dir = f"{pp_dir}/01_Read_length_and_quality_QC_metrics"
-
-     if os.path.isdir(pp_length_qc_dir) and not cfg.force_redo_preprocessing:
-         print(pp_length_qc_dir + ' already exists. Skipping read level QC plotting.')
-     else:
-         from .plotting import plot_read_qc_histograms
-         make_dirs([pp_dir, pp_length_qc_dir])
-         obs_to_plot = ['read_length', 'mapped_length', 'read_quality', 'mapping_quality', 'mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal']
-         plot_read_qc_histograms(adata,
-                                 pp_length_qc_dir,
-                                 obs_to_plot,
-                                 sample_key=cfg.sample_name_col_for_plotting,
-                                 rows_per_fig=cfg.rows_per_qc_histogram_grid)
-
-     ## Read length, quality, and mapping filtering
-     from .preprocessing import filter_reads_on_length_quality_mapping
-     print(adata.shape)
-     adata = filter_reads_on_length_quality_mapping(adata,
-                                                    filter_on_coordinates=cfg.read_coord_filter,
-                                                    read_length=cfg.read_len_filter_thresholds,
-                                                    length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds,
-                                                    read_quality=cfg.read_quality_filter_thresholds,
-                                                    mapping_quality=cfg.read_mapping_quality_filter_thresholds,
-                                                    bypass=None,
-                                                    force_redo=None)
-     print(adata.shape)
-
-     pp_length_qc_dir = f"{pp_dir}/02_Read_length_and_quality_QC_metrics_post_filtering"
-     if os.path.isdir(pp_length_qc_dir) and not cfg.force_redo_preprocessing:
-         print(pp_length_qc_dir + ' already exists. Skipping read level QC plotting.')
-     else:
-         from .plotting import plot_read_qc_histograms
-         make_dirs([pp_length_qc_dir])
-         obs_to_plot = ['read_length', 'mapped_length', 'read_quality', 'mapping_quality', 'mapped_length_to_reference_length_ratio', 'mapped_length_to_read_length_ratio', 'Raw_modification_signal']
-         plot_read_qc_histograms(adata,
-                                 pp_length_qc_dir,
-                                 obs_to_plot,
-                                 sample_key=cfg.sample_name_col_for_plotting,
-                                 rows_per_fig=cfg.rows_per_qc_histogram_grid)
-     ########################################################################################################################
-
-     ############################################### 9) Basic Preprocessing ###############################################
-
-     ############## Binarize direct modcall data and store in a new layer. Clean NaNs and store as new layers with various NaN replacement strategies ##########
-     from .preprocessing import clean_NaN
-     if smf_modality == 'direct':
-         from .preprocessing import calculate_position_Youden, binarize_on_Youden
-         native = True
-         # Calculate positional methylation thresholds for mod calls
-         calculate_position_Youden(adata,
-                                   positive_control_sample=None,
-                                   negative_control_sample=None,
-                                   J_threshold=0.5,
-                                   obs_column=reference_column,
-                                   infer_on_percentile=10,
-                                   inference_variable='Raw_modification_signal',
-                                   save=False,
-                                   output_directory='')
-         # Binarize the modcalls based on the determined thresholds
-         binarize_on_Youden(adata,
-                            obs_column=reference_column)
-         clean_NaN(adata,
-                   layer='binarized_methylation',
-                   bypass=cfg.bypass_clean_nan,
-                   force_redo=cfg.force_redo_clean_nan)
-     else:
-         native = False
-         clean_NaN(adata,
-                   bypass=cfg.bypass_clean_nan,
-                   force_redo=cfg.force_redo_clean_nan)
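# Hedged sketch of a signed NaN-aware encoding like the "nan0_0minus1"
# layer consumed later by compute_positionwise_statistics, assuming the
# layer name encodes: NaN -> 0, unmethylated 0 -> -1, methylated 1 -> +1.
import numpy as np

def signed_encode(x):
    x = np.asarray(x, dtype=float)
    return np.where(np.isnan(x), 0.0, np.where(x > 0, 1.0, -1.0)).astype(np.float32)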
-
-     ############### Add base context to each position for each Reference_strand and calculate read level methylation/deamination stats ###############
-     from .preprocessing import append_base_context, append_binary_layer_by_base_context
-     # Additionally, store base_context level binary modification arrays in adata.obsm
-     append_base_context(adata,
-                         obs_column=reference_column,
-                         use_consensus=False,
-                         native=native,
-                         mod_target_bases=mod_target_bases,
-                         bypass=cfg.bypass_append_base_context,
-                         force_redo=cfg.force_redo_append_base_context)
-
-     adata = append_binary_layer_by_base_context(adata,
-                                                 reference_column,
-                                                 smf_modality,
-                                                 bypass=cfg.bypass_append_binary_layer_by_base_context,
-                                                 force_redo=cfg.force_redo_append_binary_layer_by_base_context)
-
-     ############### Optional inversion of the adata along positions axis ###################
-     if cfg.invert_adata:
-         from .preprocessing import invert_adata
-         adata = invert_adata(adata)
-
-     ############### Calculate read methylation/deamination statistics for specific base contexts defined above ###############
-     from .preprocessing import calculate_read_modification_stats
-     calculate_read_modification_stats(adata,
-                                       reference_column,
-                                       cfg.sample_column,
-                                       mod_target_bases,
-                                       bypass=cfg.bypass_calculate_read_modification_stats,
-                                       force_redo=cfg.force_redo_calculate_read_modification_stats)
-
-     ### Make a dir for outputting sample level read modification metrics before filtering ###
-     pp_dir = f"{split_dir}/preprocessed"
-     pp_meth_qc_dir = f"{pp_dir}/03_read_modification_QC_metrics"
-
-     if os.path.isdir(pp_meth_qc_dir) and not cfg.force_redo_preprocessing:
-         print(pp_meth_qc_dir + ' already exists. Skipping read level methylation QC plotting.')
-     else:
-         from .plotting import plot_read_qc_histograms
-         make_dirs([pp_dir, pp_meth_qc_dir])
-         obs_to_plot = ['Raw_modification_signal']
-         if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
-             obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_any_C_site_modified']
-         if 'A' in mod_target_bases:
-             obs_to_plot += ['Fraction_A_site_modified']
-         plot_read_qc_histograms(adata,
-                                 pp_meth_qc_dir, obs_to_plot,
-                                 sample_key=cfg.sample_name_col_for_plotting,
-                                 rows_per_fig=cfg.rows_per_qc_histogram_grid)
-
-     ##### Optionally filter reads on modification metrics
-     from .preprocessing import filter_reads_on_modification_thresholds
-     adata = filter_reads_on_modification_thresholds(adata,
-                                                     smf_modality=smf_modality,
-                                                     mod_target_bases=mod_target_bases,
-                                                     gpc_thresholds=cfg.read_mod_filtering_gpc_thresholds,
-                                                     cpg_thresholds=cfg.read_mod_filtering_cpg_thresholds,
-                                                     any_c_thresholds=cfg.read_mod_filtering_any_c_thresholds,
-                                                     a_thresholds=cfg.read_mod_filtering_a_thresholds,
-                                                     use_other_c_as_background=cfg.read_mod_filtering_use_other_c_as_background,
-                                                     min_valid_fraction_positions_in_read_vs_ref=cfg.min_valid_fraction_positions_in_read_vs_ref,
-                                                     bypass=cfg.bypass_filter_reads_on_modification_thresholds,
-                                                     force_redo=cfg.force_redo_filter_reads_on_modification_thresholds)
-
-     ## Plot post filtering read methylation metrics
-     pp_meth_qc_dir = f"{pp_dir}/04_read_modification_QC_metrics_post_filtering"
-
-     if os.path.isdir(pp_meth_qc_dir) and not cfg.force_redo_preprocessing:
-         print(pp_meth_qc_dir + ' already exists. Skipping read level methylation QC plotting.')
-     else:
-         from .plotting import plot_read_qc_histograms
-         make_dirs([pp_dir, pp_meth_qc_dir])
-         obs_to_plot = ['Raw_modification_signal']
-         if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
-             obs_to_plot += ['Fraction_GpC_site_modified', 'Fraction_CpG_site_modified', 'Fraction_other_C_site_modified', 'Fraction_any_C_site_modified']
-         if 'A' in mod_target_bases:
-             obs_to_plot += ['Fraction_A_site_modified']
-         plot_read_qc_histograms(adata,
-                                 pp_meth_qc_dir,
-                                 obs_to_plot,
-                                 sample_key=cfg.sample_name_col_for_plotting,
-                                 rows_per_fig=cfg.rows_per_qc_histogram_grid)
-
-     ############### Calculate positional coverage in dataset ###############
-     from .preprocessing import calculate_coverage
-     calculate_coverage(adata,
-                        obs_column=reference_column,
-                        position_nan_threshold=0.1)
-
-     ############### Duplicate detection for conversion/deamination SMF ###############
-     if smf_modality != 'direct':
-         from .preprocessing import flag_duplicate_reads, calculate_complexity_II
-         references = adata.obs[reference_column].cat.categories
-
-         var_filters_sets = []
-         for ref in references:
-             for site_type in cfg.duplicate_detection_site_types:
-                 var_filters_sets += [[f"{ref}_{site_type}_site", f"position_in_{ref}"]]
-
-         pp_dir = f"{split_dir}/preprocessed"
-         pp_dup_qc_dir = f"{pp_dir}/05_read_duplication_QC_metrics"
-
-         make_dirs([pp_dir, pp_dup_qc_dir])
-
-         # Flag duplicate reads and plot duplicate detection QC
-         adata_unique, adata = flag_duplicate_reads(adata,
-                                                    var_filters_sets,
-                                                    distance_threshold=cfg.duplicate_detection_distance_threshold,
-                                                    obs_reference_col=reference_column,
-                                                    sample_col=cfg.sample_name_col_for_plotting,
-                                                    output_directory=pp_dup_qc_dir,
-                                                    metric_keys=cfg.hamming_vs_metric_keys,
-                                                    keep_best_metric=cfg.duplicate_detection_keep_best_metric,
-                                                    bypass=cfg.bypass_flag_duplicate_reads,
-                                                    force_redo=cfg.force_redo_flag_duplicate_reads,
-                                                    window_size=cfg.duplicate_detection_window_size_for_hamming_neighbors,
-                                                    min_overlap_positions=cfg.duplicate_detection_min_overlapping_positions,
-                                                    do_pca=cfg.duplicate_detection_do_pca,
-                                                    pca_n_components=50,
-                                                    pca_center=True,
-                                                    do_hierarchical=cfg.duplicate_detection_do_hierarchical,
-                                                    hierarchical_linkage=cfg.duplicate_detection_hierarchical_linkage,
-                                                    hierarchical_metric="euclidean",
-                                                    hierarchical_window=cfg.duplicate_detection_window_size_for_hamming_neighbors)
-
-         # Use the flagged duplicate read groups and perform complexity analysis
-         complexity_outs = os.path.join(pp_dup_qc_dir, "sample_complexity_analyses")
-         make_dirs([complexity_outs])
-         calculate_complexity_II(
-             adata=adata,
-             output_directory=complexity_outs,
-             sample_col=cfg.sample_name_col_for_plotting,
-             ref_col=reference_column,
-             cluster_col='sequence__merged_cluster_id',
-             plot=True,
-             save_plot=True,  # set False to display instead
-             n_boot=30,
-             n_depths=12,
-             random_state=42,
-             csv_summary=True,
-             bypass=cfg.bypass_complexity_analysis,
-             force_redo=cfg.force_redo_complexity_analysis
-         )
-
-     else:
-         adata_unique = adata
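# Hedged sketch of the distance idea behind flag_duplicate_reads (not the
# package code): a Hamming-style mismatch fraction computed only over
# positions informative in both reads, with a minimum-overlap guard like
# min_overlap_positions above.
import numpy as np

def hamming_fraction(a, b, min_overlap=10):
    both = ~np.isnan(a) & ~np.isnan(b)
    if both.sum() < min_overlap:
        return np.nan  # too little shared information to compare
    return float(np.mean(a[both] != b[both]))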
-
-     ########################################################################################################################
-
-     ############################################### Save preprocessed adata with duplicate detection ###############################################
-     from .readwrite import safe_write_h5ad
-     if not os.path.exists(pp_adata_path) or cfg.force_redo_preprocessing:
-         print('Saving preprocessed adata post duplicate detection.')
-         if ".gz" in pp_adata_path:
-             safe_write_h5ad(adata, f"{pp_adata_path}", compression='gzip', backup=True, backup_dir=pp_backup_dir)
-         else:
-             safe_write_h5ad(adata, f"{pp_adata_path}.gz", compression='gzip', backup=True, backup_dir=pp_backup_dir)
-
-     if not os.path.exists(pp_dup_rem_adata_path) or cfg.force_redo_preprocessing:
-         print('Saving preprocessed adata with duplicates removed.')
-         if ".gz" in pp_dup_rem_adata_path:
-             safe_write_h5ad(adata_unique, f"{pp_dup_rem_adata_path}", compression='gzip', backup=True, backup_dir=pp_dup_rem_backup_dir)
-         else:
-             safe_write_h5ad(adata_unique, f"{pp_dup_rem_adata_path}.gz", compression='gzip', backup=True, backup_dir=pp_dup_rem_backup_dir)
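# safe_write_h5ad is a package wrapper; with plain anndata the equivalent
# gzip-compressed write (minus the backup bookkeeping) is:
import anndata as ad
import numpy as np

adata_demo = ad.AnnData(np.zeros((2, 3), dtype=np.float32))
adata_demo.write_h5ad("demo.h5ad.gz", compression="gzip")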
-     ########################################################################################################################
-
-     ############################################### Basic Analyses ###############################################
-     if smf_modality != 'direct':
-         if smf_modality == 'conversion':
-             deaminase = False
-         else:
-             deaminase = True
-         references = adata.obs[reference_column].cat.categories
-
-         pp_dir = f"{split_dir}/preprocessed"
-         pp_clustermap_dir = f"{pp_dir}/06_clustermaps"
-
-         ## Basic clustermap plotting
-         if os.path.isdir(pp_clustermap_dir):
-             print(pp_clustermap_dir + ' already exists. Skipping clustermap plotting.')
-         else:
-             from .plotting import combined_raw_clustermap
-             make_dirs([pp_dir, pp_clustermap_dir])
-             clustermap_results = combined_raw_clustermap(adata,
-                                                          sample_col=cfg.sample_name_col_for_plotting,
-                                                          reference_col=cfg.reference_column,
-                                                          layer_any_c=cfg.layer_for_clustermap_plotting,
-                                                          layer_gpc=cfg.layer_for_clustermap_plotting,
-                                                          layer_cpg=cfg.layer_for_clustermap_plotting,
-                                                          cmap_any_c="coolwarm",
-                                                          cmap_gpc="coolwarm",
-                                                          cmap_cpg="viridis",
-                                                          min_quality=cfg.read_quality_filter_thresholds[0],
-                                                          min_length=cfg.read_len_filter_thresholds[0],
-                                                          min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
-                                                          min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
-                                                          bins=None,
-                                                          sample_mapping=None,
-                                                          save_path=pp_clustermap_dir,
-                                                          sort_by='gpc',
-                                                          deaminase=deaminase)
-
-         # Switch the main adata moving forward to be the one with duplicates removed.
-         adata = adata_unique
-
-         #### Repeat on the duplicate-scrubbed anndata ###
-
-         pp_dir = f"{split_dir}/preprocessed_duplicates_removed"
-         pp_clustermap_dir = f"{pp_dir}/06_clustermaps"
-         pp_umap_dir = f"{pp_dir}/07_umaps"
-
-         ## Basic clustermap plotting
-         if os.path.isdir(pp_clustermap_dir):
-             print(pp_clustermap_dir + ' already exists. Skipping clustermap plotting.')
-         else:
-             from .plotting import combined_raw_clustermap
-             make_dirs([pp_dir, pp_clustermap_dir])
-             clustermap_results = combined_raw_clustermap(adata,
-                                                          sample_col=cfg.sample_name_col_for_plotting,
-                                                          reference_col=cfg.reference_column,
-                                                          layer_any_c=cfg.layer_for_clustermap_plotting,
-                                                          layer_gpc=cfg.layer_for_clustermap_plotting,
-                                                          layer_cpg=cfg.layer_for_clustermap_plotting,
-                                                          cmap_any_c="coolwarm",
-                                                          cmap_gpc="coolwarm",
-                                                          cmap_cpg="viridis",
-                                                          min_quality=cfg.read_quality_filter_thresholds[0],
-                                                          min_length=cfg.read_len_filter_thresholds[0],
-                                                          min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
-                                                          min_position_valid_fraction=cfg.min_valid_fraction_positions_in_read_vs_ref,
-                                                          bins=None,
-                                                          sample_mapping=None,
-                                                          save_path=pp_clustermap_dir,
-                                                          sort_by='gpc',
-                                                          deaminase=deaminase)
-
-         ## Basic PCA/UMAP
-         if os.path.isdir(pp_umap_dir):
-             print(pp_umap_dir + ' already exists. Skipping UMAP plotting.')
-         else:
-             from .tools import calculate_umap
-             make_dirs([pp_dir, pp_umap_dir])
-             var_filters = []
-             for ref in references:
-                 var_filters += [f'{ref}_any_C_site']
-             adata = calculate_umap(adata,
-                                    layer=cfg.layer_for_umap_plotting,
-                                    var_filters=var_filters,
-                                    n_pcs=10,
-                                    knn_neighbors=15)
-
-             ## Clustering
-             sc.tl.leiden(adata, resolution=0.1, flavor="igraph", n_iterations=2)
-
-             # Plotting UMAP
-             sc.settings.figdir = pp_umap_dir
-             umap_layers = ['leiden', cfg.sample_name_col_for_plotting]
-             umap_layers += cfg.umap_layers_to_plot
-             sc.pl.umap(adata, color=umap_layers, show=False, save=True)
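# calculate_umap is package code; a hedged scanpy-only sketch of the same
# chain (PCA -> kNN graph -> UMAP -> leiden) with the parameters used
# above, run on a synthetic AnnData:
import numpy as np
import scanpy as sc
import anndata as ad

demo = ad.AnnData(np.random.rand(100, 50).astype(np.float32))
sc.pp.pca(demo, n_comps=10)
sc.pp.neighbors(demo, n_neighbors=15)
sc.tl.umap(demo)
sc.tl.leiden(demo, resolution=0.1, flavor="igraph", n_iterations=2)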
-
-     ########################################################################################################################
-
-     ############################################### Spatial autocorrelation analyses ########################################
-     from .tools.spatial_autocorrelation import binary_autocorrelation_with_spacing, analyze_autocorr_matrix, bootstrap_periodicity, rolling_autocorr_metrics
-     from .plotting import plot_rolling_grid
-     import warnings
-
-     pp_dir = f"{split_dir}/preprocessed_duplicates_removed"
-     pp_autocorr_dir = f"{pp_dir}/08_autocorrelations"
-
-     if os.path.isdir(pp_autocorr_dir):
-         print(pp_autocorr_dir + ' already exists. Skipping autocorrelation plotting.')
-     else:
-         positions = adata.var_names.astype(int).values
-         lags = np.arange(cfg.autocorr_max_lag + 1)
-
-         # optional: try to parallelize autocorr per-row with joblib
-         try:
-             from joblib import Parallel, delayed
-             _have_joblib = True
-         except Exception:
-             _have_joblib = False
-
-         for site_type in cfg.autocorr_site_types:
-             layer_key = f"{site_type}_site_binary"
-             if layer_key not in adata.layers:
-                 print(f"Layer {layer_key} not found in adata.layers — skipping {site_type}.")
-                 continue
-
-             X = adata.layers[layer_key]
-             if getattr(X, "shape", (0,))[0] == 0:
-                 print(f"Layer {layer_key} empty — skipping {site_type}.")
-                 continue
-
-             # compute per-molecule autocorrs (and counts)
-             rows = []
-             counts = []
-             if _have_joblib:
-                 # parallel map
-                 def _worker(row):
-                     try:
-                         ac, cnts = binary_autocorrelation_with_spacing(
-                             row, positions, max_lag=cfg.autocorr_max_lag, return_counts=True
-                         )
-                     except Exception as e:
-                         # on error return NaN arrays
-                         ac = np.full(cfg.autocorr_max_lag + 1, np.nan, dtype=np.float32)
-                         cnts = np.zeros(cfg.autocorr_max_lag + 1, dtype=np.int32)
-                     return ac, cnts
-
-                 res = Parallel(n_jobs=cfg.n_jobs if hasattr(cfg, "n_jobs") else -1)(
-                     delayed(_worker)(X[i]) for i in range(X.shape[0])
-                 )
-                 for ac, cnts in res:
-                     rows.append(ac)
-                     counts.append(cnts)
-             else:
-                 # sequential fallback
-                 for i in range(X.shape[0]):
-                     ac, cnts = binary_autocorrelation_with_spacing(
-                         X[i], positions, max_lag=cfg.autocorr_max_lag, return_counts=True
-                     )
-                     rows.append(ac)
-                     counts.append(cnts)
-
-             autocorr_matrix = np.asarray(rows, dtype=np.float32)
-             counts_matrix = np.asarray(counts, dtype=np.int32)
-
-             # store raw per-molecule arrays (keep memory format compact)
-             adata.obsm[f"{site_type}_spatial_autocorr"] = autocorr_matrix
-             adata.obsm[f"{site_type}_spatial_autocorr_counts"] = counts_matrix
-             adata.uns[f"{site_type}_spatial_autocorr_lags"] = lags
-
-             # compute global periodicity metrics across all molecules for this site_type
-             try:
-                 results = analyze_autocorr_matrix(
-                     autocorr_matrix, counts_matrix, lags,
-                     nrl_search_bp=(120, 260), pad_factor=4, min_count=20, max_harmonics=6
-                 )
-             except Exception as e:
-                 results = {"error": str(e)}
-
-             # store global metrics
-             global_metrics = {
-                 "nrl_bp": results.get("nrl_bp", np.nan),
-                 "xi": results.get("xi", np.nan),
-                 "snr": results.get("snr", np.nan),
-                 "fwhm_bp": results.get("fwhm_bp", np.nan),
-                 "envelope_sample_lags": results.get("envelope_sample_lags", np.array([])).tolist(),
-                 "envelope_heights": results.get("envelope_heights", np.array([])).tolist(),
-                 "analyzer_error": results.get("error", None),
-             }
-             adata.uns[f"{site_type}_spatial_periodicity_metrics"] = global_metrics
-
-             # bootstrap for CI (use a reasonable default; set low only for debugging)
-             n_boot = getattr(cfg, "autocorr_bootstrap_n", 200)
-             # if the user intentionally set a very low n_boot in cfg, keep it; otherwise default to 200
-             try:
-                 bs = bootstrap_periodicity(
-                     autocorr_matrix, counts_matrix, lags,
-                     n_boot=n_boot, nrl_search_bp=(120, 260), pad_factor=4, min_count=20
-                 )
-                 adata.uns[f"{site_type}_spatial_periodicity_boot"] = {
-                     "nrl_boot": np.asarray(bs["nrl_boot"]).tolist(),
-                     "xi_boot": np.asarray(bs["xi_boot"]).tolist(),
-                 }
-             except Exception as e:
-                 adata.uns[f"{site_type}_spatial_periodicity_boot_error"] = str(e)
-
-             # ----------------------------
-             # Compute group-level metrics for plotting (per sample × reference)
-             # ----------------------------
-             metrics_by_group = {}
-             sample_col = cfg.sample_name_col_for_plotting
-             ref_col = cfg.reference_strand_col if hasattr(cfg, "reference_strand_col") else "Reference_strand"
-             samples = adata.obs[sample_col].astype("category").cat.categories.tolist()
-             refs = adata.obs[ref_col].astype("category").cat.categories.tolist()
-
-             # iterate groups and run the analyzer on each group's subset; cache errors
-             for sample_name in samples:
-                 sample_mask = (adata.obs[sample_col].values == sample_name)
-                 # combined group
-                 mask = sample_mask
-                 ac_sel = autocorr_matrix[mask, :]
-                 cnt_sel = counts_matrix[mask, :] if counts_matrix is not None else None
-                 if ac_sel.size:
-                     try:
-                         r = analyze_autocorr_matrix(ac_sel, cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
-                                                     lags, nrl_search_bp=(120, 260), pad_factor=4, min_count=10, max_harmonics=6)
-                     except Exception as e:
-                         r = {"error": str(e)}
-                 else:
-                     r = {"error": "no_data"}
-                 metrics_by_group[(sample_name, None)] = r
-
-                 # per-reference groups
-                 for ref in refs:
-                     mask_ref = sample_mask & (adata.obs[ref_col].values == ref)
-                     ac_sel = autocorr_matrix[mask_ref, :]
-                     cnt_sel = counts_matrix[mask_ref, :] if counts_matrix is not None else None
-                     if ac_sel.size:
-                         try:
-                             r = analyze_autocorr_matrix(ac_sel, cnt_sel if cnt_sel is not None else np.zeros_like(ac_sel, dtype=int),
-                                                         lags, nrl_search_bp=(120, 260), pad_factor=4, min_count=10, max_harmonics=6)
-                         except Exception as e:
-                             r = {"error": str(e)}
-                     else:
-                         r = {"error": "no_data"}
-                     metrics_by_group[(sample_name, ref)] = r
-
-             # persist group metrics
-             adata.uns[f"{site_type}_spatial_periodicity_metrics_by_group"] = metrics_by_group
-
-             global_nrl = adata.uns.get(f"{site_type}_spatial_periodicity_metrics", {}).get("nrl_bp", None)
-
-             # configuration / sensible defaults (override in cfg if present)
-             rolling_cfg = {
-                 "window_size": getattr(cfg, "rolling_window_size", getattr(cfg, "autocorr_rolling_window_size", 600)),
-                 "step": getattr(cfg, "rolling_step", 100),
-                 "max_lag": getattr(cfg, "rolling_max_lag", cfg.autocorr_max_lag if hasattr(cfg, "autocorr_max_lag") else 500),
-                 "min_molecules_per_window": getattr(cfg, "rolling_min_molecules_per_window", 10),
-                 "nrl_search_bp": getattr(cfg, "rolling_nrl_search_bp", (120, 240)),
-                 "pad_factor": getattr(cfg, "rolling_pad_factor", 4),
-                 "min_count_for_mean": getattr(cfg, "rolling_min_count_for_mean", 10),
-                 "max_harmonics": getattr(cfg, "rolling_max_harmonics", 6),
-                 "n_jobs": getattr(cfg, "rolling_n_jobs", 4),
-             }
-
-             write_plots = getattr(cfg, "rolling_write_plots", True)
-             write_csvs = getattr(cfg, "rolling_write_csvs", True)
-             min_molecules_for_group = getattr(cfg, "rolling_min_molecules_for_group", 30)  # only run rolling if the group has >= this many molecules
-
-             rolling_out_dir = os.path.join(pp_autocorr_dir, "rolling_metrics")
-             os.makedirs(rolling_out_dir, exist_ok=True)
-             # also a per-site subfolder
-             site_out_dir = os.path.join(rolling_out_dir, site_type)
-             os.makedirs(site_out_dir, exist_ok=True)
-
-             combined_rows = []  # accumulate one row per window for the combined CSV
-             rolling_results_by_group = {}  # store a DataFrame per group in memory (persisted later to adata.uns)
-
-             # iterate groups (samples × refs); `samples` and `refs` were computed above
-             for sample_name in samples:
-                 sample_mask = (adata.obs[sample_col].values == sample_name)
-                 # first the combined group ("all refs")
-                 group_masks = [("all", sample_mask)]
-                 # then per-reference groups
-                 for ref in refs:
-                     ref_mask = sample_mask & (adata.obs[ref_col].values == ref)
-                     group_masks.append((ref, ref_mask))
-
-                 for ref_label, mask in group_masks:
-                     n_group = int(mask.sum())
-                     if n_group < min_molecules_for_group:
-                         # skip tiny groups
-                         if (cfg.get("verbosity", 0) if hasattr(cfg, "get") else False):
-                             print(f"Skipping rolling for {site_type} {sample_name} {ref_label}: only {n_group} molecules (<{min_molecules_for_group})")
-                         # an empty CSV row set could still be written here if desired; we skip
-                         continue
-
-                     # extract the group matrix X_group (works with dense or sparse adata.layers)
-                     X_group = X[mask, :]
-                     # positions were already set above
-                     try:
-                         # call the rolling function (this may be slow; it uses cfg.n_jobs)
-                         df_roll = rolling_autocorr_metrics(
-                             X_group,
-                             positions,
-                             site_label=site_type,
-                             window_size=rolling_cfg["window_size"],
-                             step=rolling_cfg["step"],
-                             max_lag=rolling_cfg["max_lag"],
-                             min_molecules_per_window=rolling_cfg["min_molecules_per_window"],
-                             nrl_search_bp=rolling_cfg["nrl_search_bp"],
-                             pad_factor=rolling_cfg["pad_factor"],
-                             min_count_for_mean=rolling_cfg["min_count_for_mean"],
-                             max_harmonics=rolling_cfg["max_harmonics"],
-                             n_jobs=rolling_cfg["n_jobs"],
-                             verbose=False,
-                             fixed_nrl_bp=global_nrl
-                         )
-                     except Exception as e:
-                         warnings.warn(f"rolling_autocorr_metrics failed for {site_type} {sample_name} {ref_label}: {e}")
-                         continue
-
-                     # normalize column names and keep only the compact set:
-                     # center, n_molecules, nrl_bp, snr, xi, fwhm_bp
-                     if "center" not in df_roll.columns:
-                         # defensive: if the rolling function returned a different schema, skip
-                         warnings.warn(f"rolling_autocorr_metrics returned unexpected schema for {site_type} {sample_name} {ref_label}")
-                         continue
-
-                     compact_df = df_roll[["center", "n_molecules", "nrl_bp", "snr", "xi", "fwhm_bp"]].copy()
-                     compact_df["site"] = site_type
-                     compact_df["sample"] = sample_name
-                     compact_df["reference"] = ref_label if ref_label != "all" else "all"
-
-                     # save per-group CSV
-                     if write_csvs:
-                         safe_sample = str(sample_name).replace(os.sep, "_")
-                         safe_ref = str(ref_label if ref_label != "all" else "all").replace(os.sep, "_")
-                         out_csv = os.path.join(site_out_dir, f"{safe_sample}__{safe_ref}__rolling_metrics.csv")
-                         try:
-                             compact_df.to_csv(out_csv, index=False)
-                         except Exception as e:
-                             warnings.warn(f"Failed to write rolling CSV {out_csv}: {e}")
-
-                     # save a plot per group (NRL and SNR vs window center)
-                     if write_plots:
-                         try:
-                             # use the plot helper; if it lives in a different module, import accordingly
-                             from .plotting import plot_rolling_metrics as _plot_roll
-                         except Exception:
-                             _plot_roll = globals().get("plot_rolling_metrics", None)
-                         if _plot_roll is not None:
-                             plot_png = os.path.join(site_out_dir, f"{safe_sample}__{safe_ref}__rolling_metrics.png")
-                             try:
-                                 _plot_roll(compact_df, out_png=plot_png,
-                                            title=f"{site_type} {sample_name} {ref_label}",
-                                            figsize=(10, 3.5), dpi=160, show=False)
-                             except Exception as e:
-                                 warnings.warn(f"Failed to create rolling plot for {site_type} {sample_name} {ref_label}: {e}")
-
-                     # store in combined_rows and the in-memory dict
-                     combined_rows.append(compact_df.assign(site=site_type, sample=sample_name, reference=ref_label))
-                     rolling_results_by_group[(sample_name, None if ref_label == "all" else ref_label)] = compact_df
-
-             # persist per-site rolling metrics into adata.uns as a dict of DataFrames (or an empty dict)
-             adata.uns[f"{site_type}_rolling_metrics_by_group"] = rolling_results_by_group
-
-             # write a combined CSV for this site across all groups
-             if len(combined_rows):
-                 combined_df_site = pd.concat(combined_rows, ignore_index=True, sort=False)
-                 combined_out_csv = os.path.join(rolling_out_dir, f"{site_type}__rolling_metrics_combined.csv")
-                 try:
-                     combined_df_site.to_csv(combined_out_csv, index=False)
-                 except Exception as e:
-                     warnings.warn(f"Failed to write combined rolling CSV for {site_type}: {e}")
-
-             rolling_dict = adata.uns[f"{site_type}_rolling_metrics_by_group"]
-             plot_out_dir = os.path.join(pp_autocorr_dir, "rolling_plots")
-             os.makedirs(plot_out_dir, exist_ok=True)
-             pages = plot_rolling_grid(rolling_dict, plot_out_dir, site_type,
-                                       rows_per_page=cfg.rows_per_qc_autocorr_grid,
-                                       cols_per_page=len(refs),
-                                       dpi=160,
-                                       metrics=("nrl_bp", "snr", "xi"),
-                                       per_metric_ylim={"snr": (0, 25)})
-
-         from .plotting import plot_spatial_autocorr_grid
-         make_dirs([pp_autocorr_dir, pp_autocorr_dir])
-
-         plot_spatial_autocorr_grid(adata,
-                                    pp_autocorr_dir,
-                                    site_types=cfg.autocorr_site_types,
-                                    sample_col=cfg.sample_name_col_for_plotting,
-                                    window=cfg.autocorr_rolling_window_size,
-                                    rows_per_fig=cfg.rows_per_qc_autocorr_grid)
1070
-
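Editor's note: the window loop above delegates to rolling_autocorr_metrics, whose internals are not shown in this diff. As a rough illustration of the underlying idea only (the package version also takes pad_factor and max_harmonics, so it likely does something more elaborate), a single window's NRL-like periodicity can be estimated by autocorrelating the mean accessibility profile and taking the strongest lag inside the search range. window_nrl_estimate and its arguments are illustrative names, not package API:

import numpy as np

def window_nrl_estimate(X_win, positions, nrl_search_bp=(120, 250)):
    # X_win: (n_molecules, n_positions) methylation calls (1/0/NaN);
    # positions: genomic bp coordinates of the columns, roughly uniform.
    prof = np.nanmean(X_win, axis=0)                  # bulk profile over molecules
    prof = np.nan_to_num(prof - np.nanmean(prof))     # remove the DC offset
    ac = np.correlate(prof, prof, mode="full")[prof.size - 1:]  # lags >= 0
    bp_per_col = float(np.median(np.diff(positions)))
    lags_bp = np.arange(ac.size) * bp_per_col
    idx = np.where((lags_bp >= nrl_search_bp[0]) & (lags_bp <= nrl_search_bp[1]))[0]
    if idx.size == 0:
        return float("nan")
    return float(lags_bp[idx[np.argmax(ac[idx])]])    # strongest in-range lag

Fixing the repeat length per site (the fixed_nrl_bp=global_nrl argument above) then reduces the per-window problem to scoring amplitude and decay at a known lag, which is presumably where snr, xi, and fwhm_bp come from.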
- ########################################################################################################################
-
- ############################################### Pearson analyses ########################################
- if smf_modality != 'direct':
-     from .tools.position_stats import compute_positionwise_statistics, plot_positionwise_matrices
-
-     pp_dir = f"{split_dir}/preprocessed_duplicates_removed"
-     pp_corr_dir = f"{pp_dir}/09_correlation_matrices"
-
-     if os.path.isdir(pp_corr_dir):
-         print(pp_corr_dir + ' already exists. Skipping correlation matrix plotting.')
-     else:
-         compute_positionwise_statistics(
-             adata,
-             layer="nan0_0minus1",
-             methods=cfg.correlation_matrix_types,
-             sample_col=cfg.sample_name_col_for_plotting,
-             ref_col=reference_column,
-             output_key="positionwise_result",
-             site_types=cfg.correlation_matrix_site_types,
-             encoding="signed",
-             max_threads=cfg.threads,
-             min_count_for_pairwise=10,
-         )
-
-         plot_positionwise_matrices(
-             adata,
-             methods=cfg.correlation_matrix_types,
-             sample_col=cfg.sample_name_col_for_plotting,
-             ref_col=reference_column,
-             figsize_per_cell=(4.0, 3.0),
-             dpi=160,
-             cmaps=cfg.correlation_matrix_cmaps,
-             vmin=None,
-             vmax=None,
-             output_dir=pp_corr_dir,
-             output_key="positionwise_result",
-         )
-
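Editor's note: compute_positionwise_statistics is called with encoding="signed" on the "nan0_0minus1" layer, whose name implies methylated = +1, unmethylated = -1, missing = 0. Under that assumption (the function body is not part of this diff), a pairwise position-by-position Pearson with the min_count_for_pairwise gate can be sketched as follows; pairwise_pearson is an illustrative helper, not package API:

import numpy as np

def pairwise_pearson(M, i, j, min_count=10):
    # M: (n_molecules, n_positions) signed matrix; 0 means "not observed".
    obs = (M[:, i] != 0) & (M[:, j] != 0)     # molecules covering both sites
    if obs.sum() < min_count:
        return np.nan                          # too few co-observations
    x, y = M[obs, i], M[obs, j]
    if x.std() == 0 or y.std() == 0:
        return np.nan                          # constant column: r undefined
    return float(np.corrcoef(x, y)[0, 1])

The min-count gate matters because with only a handful of co-observed molecules the sample correlation is dominated by noise.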
- ########################################################################################################################
-
- ############################################### Save basic analysis adata - post preprocessing and duplicate removal ###############################################
- from .readwrite import safe_write_h5ad
-
- if not os.path.exists(basic_analyzed_adata_path) or cfg.force_redo_preprocessing:
-     print('Saving basic analyzed adata post preprocessing and duplicate removal')
-     if basic_analyzed_adata_path.endswith(".gz"):
-         safe_write_h5ad(adata, basic_analyzed_adata_path, compression='gzip', backup=True, backup_dir=basic_analyzed_backup_dir)
-     else:
-         safe_write_h5ad(adata, f"{basic_analyzed_adata_path}.gz", compression='gzip', backup=True, backup_dir=basic_analyzed_backup_dir)
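Editor's note: safe_write_h5ad lives in .readwrite and its body is not shown here; the call sites normalize the .gz suffix and request a backup. A minimal sketch of the backup-then-write pattern those arguments imply (names and behavior are assumptions, not the package implementation):

import os
import shutil

def safe_write_h5ad_sketch(adata, path, backup_dir=None):
    # adata: anndata.AnnData. Normalize the suffix, as the caller above does.
    if not path.endswith(".gz"):
        path += ".gz"
    # back up the existing file before overwriting it
    if backup_dir and os.path.exists(path):
        os.makedirs(backup_dir, exist_ok=True)
        shutil.copy2(path, os.path.join(backup_dir, os.path.basename(path)))
    adata.write_h5ad(path, compression="gzip")  # standard anndata API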
- ########################################################################################################################
-
- ############################################### HMM based feature annotations ###############################################
- if not (cfg.bypass_hmm_fit and cfg.bypass_hmm_apply):
-     from .hmm.HMM import HMM
-     from scipy.sparse import issparse, csr_matrix
-     import warnings
-
-     pp_dir = f"{split_dir}/preprocessed_duplicates_removed"
-     hmm_dir = f"{pp_dir}/10_hmm_models"
-
-     if os.path.isdir(hmm_dir):
-         print(hmm_dir + ' already exists.')
-     else:
-         make_dirs([pp_dir, hmm_dir])
-
-     samples = adata.obs[cfg.sample_name_col_for_plotting].cat.categories
-     references = adata.obs[reference_column].cat.categories
-     uns_key = "hmm_appended_layers"
-
-     # ensure the uns key exists (avoids a KeyError later)
-     if adata.uns.get(uns_key) is None:
-         adata.uns[uns_key] = []
-
-     for sample in samples:
-         for ref in references:
-             mask = (adata.obs[cfg.sample_name_col_for_plotting] == sample) & (adata.obs[reference_column] == ref)
-             subset = adata[mask].copy()
-             if subset.shape[0] < 1:
-                 continue
-
-             for mod_site in cfg.hmm_methbases:
-                 mod_label = {'C': 'any_C'}.get(mod_site, mod_site)
-                 hmm_path = os.path.join(hmm_dir, f"{sample}_{ref}_{mod_label}_hmm_model.pth")
-
-                 # ensure the input obsm exists
-                 obsm_key = f'{ref}_{mod_label}_site'
-                 if obsm_key not in subset.obsm:
-                     print(f"Skipping {sample} {ref} {mod_label}: missing obsm '{obsm_key}'")
-                     continue
-
-                 # Fit or load the model
-                 if os.path.exists(hmm_path) and not cfg.force_redo_hmm_fit:
-                     hmm = HMM.load(hmm_path)
-                     hmm.print_params()
-                 else:
-                     print(f"Fitting HMM for {sample} {ref} {mod_label}")
-                     hmm = HMM.from_config(cfg)
-                     # fit expects a list of sequences or a 2D ndarray from the obsm
-                     seqs = subset.obsm[obsm_key]
-                     hmm.fit(seqs)
-                     hmm.print_params()
-                     hmm.save(hmm_path)
-
-                 # Apply / annotate on the subset, then copy layers back into adata
-                 if (not cfg.bypass_hmm_apply) or cfg.force_redo_hmm_apply:
-                     print(f"Applying HMM on subset for {sample} {ref} {mod_label}")
-                     # annotate_adata modifies subset.obs/layers in place and records
-                     # the appended layer names in subset.uns[uns_key]
-                     hmm.annotate_adata(subset,
-                                        obs_column=cfg.reference_column,
-                                        layer=cfg.layer_for_umap_plotting,
-                                        config=cfg)
-
-                     # e.g. to_merge = [("C_all_accessible_features", 80)]
-                     to_merge = cfg.hmm_merge_layer_features
-                     for layer_to_merge, merge_distance in to_merge:
-                         if layer_to_merge:
-                             hmm.merge_intervals_in_layer(subset,
-                                                          layer=layer_to_merge,
-                                                          distance_threshold=merge_distance,
-                                                          overwrite=True)
-
-                     # collect appended layers from subset.uns
-                     appended = list(subset.uns.get(uns_key, []))
-                     print(f"Appended HMM layers: {appended}")
-                     if len(appended) == 0:
-                         # nothing appended for this subset
-                         continue
-
-                     # copy each appended layer into adata
-                     subset_mask_bool = mask.values if hasattr(mask, "values") else np.asarray(mask)
-                     for layer_name in appended:
-                         if layer_name not in subset.layers:
-                             # defensive: skip layers that were announced but not created
-                             warnings.warn(f"Expected layer {layer_name} in subset but not found; skipping copy.")
-                             continue
-                         sub_layer = subset.layers[layer_name]
-                         # ensure the full-size layer exists, then assign the subset rows
-                         try:
-                             hmm._ensure_final_layer_and_assign(adata, layer_name, subset_mask_bool, sub_layer)
-                         except Exception as e:
-                             warnings.warn(f"Failed to copy layer {layer_name} into adata: {e}", stacklevel=2)
-                             # fallback: densify and assign by row index
-                             arr = sub_layer.toarray() if issparse(sub_layer) else np.asarray(sub_layer)
-                             if layer_name not in adata.layers:
-                                 adata.layers[layer_name] = np.zeros((adata.shape[0], arr.shape[1]), dtype=arr.dtype)
-                             final_idx = np.nonzero(subset_mask_bool)[0]
-                             adata.layers[layer_name][final_idx, :] = arr
-
-                     # merge the appended layer names into adata.uns
-                     existing = list(adata.uns.get(uns_key, []))
-                     for ln in appended:
-                         if ln not in existing:
-                             existing.append(ln)
-                     adata.uns[uns_key] = existing
-
-     ## Save the HMM-annotated adata
-     if not os.path.exists(hmm_adata_path):
-         print('Saving hmm analyzed adata post preprocessing and duplicate removal')
-         if hmm_adata_path.endswith(".gz"):
-             safe_write_h5ad(adata, hmm_adata_path, compression='gzip', backup=True, backup_dir=hmm_backup_dir)
-         else:
-             safe_write_h5ad(adata, f"{hmm_adata_path}.gz", compression='gzip', backup=True, backup_dir=hmm_backup_dir)
-
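Editor's note: the inline fallback above copies a subset's layer rows back into the full AnnData when HMM._ensure_final_layer_and_assign raises. Factored out, the same mask-based assignment looks like this; an illustrative sketch that always densifies sparse layers, whereas the package method may preserve sparsity:

import numpy as np
from scipy.sparse import issparse

def assign_subset_layer(adata, layer_name, mask_bool, sub_layer):
    # Densify if needed; arr.shape[1] must equal adata.n_vars for anndata
    # to accept the new layer.
    arr = sub_layer.toarray() if issparse(sub_layer) else np.asarray(sub_layer)
    if layer_name not in adata.layers:
        # allocate a full-size layer of zeros, matching the subset's dtype
        adata.layers[layer_name] = np.zeros((adata.n_obs, arr.shape[1]), dtype=arr.dtype)
    rows = np.flatnonzero(mask_bool)          # row indices of the subset in adata
    adata.layers[layer_name][rows, :] = arr

One design caveat of the zero-fill: rows never visited by any subset stay 0, which is indistinguishable from a genuine "no feature" call, so downstream code should treat 0 as "absent or unannotated".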
- ########################################################################################################################
-
- ############################################### HMM based feature plotting ###############################################
-
- pp_dir = f"{split_dir}/preprocessed_duplicates_removed"
- hmm_dir = f"{pp_dir}/11_hmm_clustermaps"
-
- if os.path.isdir(hmm_dir):
-     print(hmm_dir + ' already exists.')
- else:
-     make_dirs([pp_dir, hmm_dir])
-     from .plotting import combined_hmm_raw_clustermap
-
-     for layer in ['C_all_accessible_features', 'C_small_bound_stretch', 'C_medium_bound_stretch', 'C_putative_nucleosome', 'C_all_accessible_features_merged']:
-         save_path = os.path.join(hmm_dir, layer)
-         make_dirs([save_path])
-
-         combined_hmm_raw_clustermap(
-             adata,
-             sample_col=cfg.sample_name_col_for_plotting,
-             reference_col=reference_column,
-             hmm_feature_layer=layer,
-             layer_gpc="nan0_0minus1",
-             layer_cpg="nan0_0minus1",
-             layer_any_c="nan0_0minus1",
-             cmap_hmm="coolwarm",
-             cmap_gpc="coolwarm",
-             cmap_cpg="viridis",
-             cmap_any_c='coolwarm',
-             min_quality=20,
-             min_length=80,
-             min_mapped_length_to_reference_length_ratio=0.2,
-             min_position_valid_fraction=0.2,
-             sample_mapping=None,
-             save_path=save_path,
-             normalize_hmm=False,
-             sort_by="gpc",  # options: 'gpc', 'cpg', 'gpc_cpg', 'none', or 'obs:<column>'
-             bins=None,
-             deaminase=True,
-             min_signal=0
-         )
-
- pp_dir = f"{split_dir}/preprocessed_duplicates_removed"
- hmm_dir = f"{pp_dir}/12_hmm_bulk_traces"
-
- if os.path.isdir(hmm_dir):
-     print(hmm_dir + ' already exists.')
- else:
-     make_dirs([pp_dir, hmm_dir])
-     from .plotting import plot_hmm_layers_rolling_by_sample_ref
-     saved = plot_hmm_layers_rolling_by_sample_ref(
-         adata,
-         layers=adata.uns['hmm_appended_layers'],
-         sample_col=cfg.sample_name_col_for_plotting,
-         ref_col=reference_column,
-         window=101,
-         rows_per_page=4,
-         figsize_per_cell=(4, 2.5),
-         output_dir=hmm_dir,
-         save=True,
-         show_raw=False
-     )
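Editor's note: plot_hmm_layers_rolling_by_sample_ref is called with window=101, which suggests a centered rolling mean over per-position layer means; its actual smoothing is not shown in this diff. A minimal sketch of that interpretation, assuming each HMM layer is a molecules x positions matrix (rolling_layer_trace is an illustrative name):

import numpy as np
import pandas as pd

def rolling_layer_trace(layer_matrix, window=101):
    # bulk per-position mean of the HMM layer across molecules
    prof = np.nanmean(layer_matrix, axis=0)
    # centered rolling mean; min_periods=1 keeps the trace defined at the edges
    return pd.Series(prof).rolling(window, center=True, min_periods=1).mean().to_numpy()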
-
- pp_dir = f"{split_dir}/preprocessed_duplicates_removed"
- hmm_dir = f"{pp_dir}/13_hmm_fragment_distributions"
- if os.path.isdir(hmm_dir):
-     print(hmm_dir + ' already exists.')
- else:
-     make_dirs([pp_dir, hmm_dir])
-     from .plotting import plot_hmm_size_contours
-
-     for layer, max_len in [('C_all_accessible_features_lengths', 400), ('C_all_footprint_features_lengths', 160), ('C_all_accessible_features_merged_lengths', 800)]:
-         save_path = os.path.join(hmm_dir, layer)
-         make_dirs([save_path])
-
-         figs = plot_hmm_size_contours(
-             adata,
-             length_layer=layer,
-             sample_col=cfg.sample_name_col_for_plotting,
-             ref_obs_col=reference_column,
-             rows_per_page=6,
-             max_length_cap=max_len,
-             figsize_per_cell=(3.5, 2.2),
-             save_path=save_path,
-             save_pdf=False,
-             save_each_page=True,
-             dpi=200,
-             smoothing_sigma=None,
-             normalize_after_smoothing=False,
-             cmap='viridis',
-             log_scale_z=True
-         )
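Editor's note: plot_hmm_size_contours reads the "*_lengths" layers with per-layer caps (400/160/800 bp above). Assuming those layers store per-molecule feature lengths in bp with zeros elsewhere, an inference from the layer names rather than a documented contract, one panel's length distribution reduces to a capped histogram of the nonzero values (fragment_length_hist is an illustrative helper):

import numpy as np

def fragment_length_hist(length_layer, max_length_cap=400, bin_size=10):
    vals = np.asarray(length_layer).ravel()
    vals = vals[(vals > 0) & (vals <= max_length_cap)]  # drop zeros, cap the tail
    edges = np.arange(0, max_length_cap + bin_size, bin_size)
    counts, _ = np.histogram(vals, bins=edges)
    return edges[:-1], counts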
-
- ########################################################################################################################
-
- ############################################### MultiQC HTML Report ###############################################
- from .informatics.helpers import run_multiqc
-
- if os.path.isdir(f"{split_dir}/multiqc"):
-     print(f"{split_dir}/multiqc" + ' already exists, skipping multiqc')
- else:
-     run_multiqc(split_dir, f"{split_dir}/multiqc")
- ########################################################################################################################