smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
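Note the module reorganization in the list above: HMM code moved from smftools/tools to smftools/hmm, model code to smftools/machine_learning, and most smftools/informatics/helpers utilities were either archived or promoted to top-level smftools/informatics modules. A minimal sketch of the corresponding import updates (the exact re-exported function names are an assumption inferred from the file moves, not confirmed by this diff):

    # 0.1.7 (old paths)
    # from smftools.tools.train_hmm import train_hmm
    # from smftools.informatics.helpers.find_conversion_sites import find_conversion_sites

    # 0.2.3 (new paths, per the moves listed above)
    from smftools.hmm.train_hmm import train_hmm
    from smftools.informatics.fasta_functions import find_conversion_sites

The hunks below detail two of these informatics changes.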
smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py}

@@ -11,29 +11,48 @@ import traceback
 import gzip
 import torch
 
-from .. import readwrite
+import shutil
+from pathlib import Path
+from typing import Union, Iterable, Optional
+
+from ..readwrite import make_dirs, safe_write_h5ad
 from .binarize_converted_base_identities import binarize_converted_base_identities
-from .find_conversion_sites import find_conversion_sites
-from .count_aligned_reads import count_aligned_reads
-from .extract_base_identities import extract_base_identities
-from .make_dirs import make_dirs
-from .ohe_batching import ohe_batching
+from .fasta_functions import find_conversion_sites
+from .bam_functions import count_aligned_reads, extract_base_identities
+from .ohe import ohe_batching
 
 if __name__ == "__main__":
     multiprocessing.set_start_method("forkserver", force=True)
 
-def converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix, device='cpu', num_threads=8):
+def converted_BAM_to_adata(converted_FASTA,
+                           split_dir,
+                           output_dir,
+                           input_already_demuxed,
+                           mapping_threshold,
+                           experiment_name,
+                           conversions,
+                           bam_suffix,
+                           device='cpu',
+                           num_threads=8,
+                           deaminase_footprinting=False,
+                           delete_intermediates=True,
+                           double_barcoded_path=None,
+                           ):
     """
     Converts BAM files into an AnnData object by binarizing modified base identities.
 
     Parameters:
-        converted_FASTA (str): Path to the converted FASTA reference.
-        split_dir (str): Directory containing converted BAM files.
+        converted_FASTA (Path): Path to the converted FASTA reference.
+        split_dir (Path): Directory containing converted BAM files.
+        output_dir (Path): Path to the output directory.
+        input_already_demuxed (bool): Whether the input reads were already demultiplexed.
         mapping_threshold (float): Minimum fraction of aligned reads required for inclusion.
         experiment_name (str): Name for the output AnnData object.
-        conversion_types (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
+        conversions (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
        bam_suffix (str): File suffix for BAM files.
        num_threads (int): Number of parallel processing threads.
+        deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
+        double_barcoded_path (Path): Path to the dorado demux summary file for double-ended barcodes.
 
     Returns:
        str: Path to the final AnnData object.
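The new keyword surface is easiest to see in a call. A minimal invocation sketch (all paths and values here are hypothetical):

    from pathlib import Path
    from smftools.informatics.converted_BAM_to_adata import converted_BAM_to_adata

    adata, adata_path = converted_BAM_to_adata(
        converted_FASTA=Path("refs/converted.fasta"),
        split_dir=Path("out/split_bams"),
        output_dir=Path("out"),
        input_already_demuxed=True,   # skips the dorado double-barcode annotation step
        mapping_threshold=0.01,
        experiment_name="exp1",
        conversions=["unconverted", "5mC"],
        bam_suffix=".bam",
        device="cpu",
        num_threads=8,
    )

As the hunks below show, the function now returns both the in-memory AnnData and its path, although the docstring's Returns entry still describes only the path.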
@@ -48,50 +67,73 @@ def converted_BAM_to_adata_II(converted_FASTA, split_dir, mapping_threshold, exp
     print(f"Using device: {device}")
 
     ## Set Up Directories and File Paths
-    parent_dir = os.path.dirname(split_dir)
-    h5_dir = os.path.join(parent_dir, 'h5ads')
-    tmp_dir = os.path.join(parent_dir, 'tmp')
-    final_adata_path = os.path.join(h5_dir, f'{experiment_name}_{os.path.basename(split_dir)}.h5ad.gz')
+    h5_dir = output_dir / 'h5ads'
+    tmp_dir = output_dir / 'tmp'
+    final_adata = None
+    final_adata_path = h5_dir / f'{experiment_name}.h5ad.gz'
 
-    if os.path.exists(final_adata_path):
+    if final_adata_path.exists():
        print(f"{final_adata_path} already exists. Using existing AnnData object.")
-        return final_adata_path
+        return final_adata, final_adata_path
 
     make_dirs([h5_dir, tmp_dir])
 
-    ## Get BAM Files ##
-    bam_files = [f for f in os.listdir(split_dir) if f.endswith(bam_suffix) and not f.endswith('.bai') and 'unclassified' not in f]
-    bam_files.sort()
-    bam_path_list = [os.path.join(split_dir, f) for f in bam_files]
+    bam_files = sorted(
+        p for p in split_dir.iterdir()
+        if p.is_file()
+        and p.suffix == ".bam"
+        and "unclassified" not in p.name
+    )
+
+    bam_path_list = [split_dir / f for f in bam_files]
     print(f"Found {len(bam_files)} BAM files: {bam_files}")
 
     ## Process Conversion Sites
-    max_reference_length, record_FASTA_dict, chromosome_FASTA_dict = process_conversion_sites(converted_FASTA, conversion_types)
+    max_reference_length, record_FASTA_dict, chromosome_FASTA_dict = process_conversion_sites(converted_FASTA, conversions, deaminase_footprinting)
 
     ## Filter BAM Files by Mapping Threshold
     records_to_analyze = filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold)
 
     ## Process BAMs in Parallel
-    final_adata = process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device)
+    final_adata = process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device, deaminase_footprinting)
 
+    final_adata.uns['References'] = {}
     for chromosome, [seq, comp] in chromosome_FASTA_dict.items():
        final_adata.var[f'{chromosome}_top_strand_FASTA_base'] = list(seq)
        final_adata.var[f'{chromosome}_bottom_strand_FASTA_base'] = list(comp)
        final_adata.uns[f'{chromosome}_FASTA_sequence'] = seq
+        final_adata.uns['References'][f'{chromosome}_FASTA_sequence'] = seq
+
+    final_adata.obs_names_make_unique()
+    cols = final_adata.obs.columns
 
-    ## Save Final AnnData
-    # print(f"Saving AnnData to {final_adata_path}")
-    # final_adata.write_h5ad(final_adata_path, compression='gzip')
+    # Make obs cols categorical
+    for col in cols:
+        final_adata.obs[col] = final_adata.obs[col].astype('category')
+
+    if input_already_demuxed:
+        final_adata.obs["demux_type"] = ["already"] * final_adata.shape[0]
+        final_adata.obs["demux_type"] = final_adata.obs["demux_type"].astype("category")
+    else:
+        from .h5ad_functions import add_demux_type_annotation
+        double_barcoded_reads = double_barcoded_path / "barcoding_summary.txt"
+        add_demux_type_annotation(final_adata, double_barcoded_reads)
+
+    ## Delete intermediate h5ad files and temp directories
+    if delete_intermediates:
+        delete_intermediate_h5ads_and_tmpdir(h5_dir, tmp_dir)
+
     return final_adata, final_adata_path
 
 
-def process_conversion_sites(converted_FASTA, conversion_types):
+def process_conversion_sites(converted_FASTA, conversions=['unconverted', '5mC'], deaminase_footprinting=False):
     """
     Extracts conversion sites and determines the max reference length.
 
     Parameters:
        converted_FASTA (str): Path to the converted reference FASTA.
-        conversion_types (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
+        conversions (list): List of modification types (e.g., ['unconverted', '5mC', '6mA']).
+        deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
 
     Returns:
        max_reference_length (int): The length of the longest sequence.
@@ -101,11 +143,11 @@ def process_conversion_sites(converted_FASTA, conversion_types):
     record_FASTA_dict = {}
     chromosome_FASTA_dict = {}
     max_reference_length = 0
-    unconverted = conversion_types[0]
-    conversions = conversion_types[1:]
+    unconverted = conversions[0]
+    conversion_types = conversions[1:]
 
     # Process the unconverted sequence once
-    modification_dict[unconverted] = find_conversion_sites(converted_FASTA, unconverted, conversion_types)
+    modification_dict[unconverted] = find_conversion_sites(converted_FASTA, unconverted, conversions, deaminase_footprinting)
     # Above points to record_dict[record.id] = [sequence_length, [], [], sequence, complement] with only unconverted record.id keys
 
     # Get **max sequence length** from unconverted records
@@ -114,7 +156,11 @@ def process_conversion_sites(converted_FASTA, conversion_types):
     # Add **unconverted records** to `record_FASTA_dict`
     for record, values in modification_dict[unconverted].items():
        sequence_length, top_coords, bottom_coords, sequence, complement = values
-        chromosome = record.replace(f"_{unconverted}_top", "")
+
+        if not deaminase_footprinting:
+            chromosome = record.replace(f"_{unconverted}_top", "")
+        else:
+            chromosome = record
 
        # Store **original sequence**
        record_FASTA_dict[record] = [
@@ -127,13 +173,17 @@ def process_conversion_sites(converted_FASTA, conversion_types):
        chromosome_FASTA_dict[chromosome] = [sequence + "N" * (max_reference_length - sequence_length), complement + "N" * (max_reference_length - sequence_length)]
 
     # Process converted records
-    for conversion in conversions:
-        modification_dict[conversion] = find_conversion_sites(converted_FASTA, conversion, conversion_types)
+    for conversion in conversion_types:
+        modification_dict[conversion] = find_conversion_sites(converted_FASTA, conversion, conversions, deaminase_footprinting)
        # Above points to record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement] with only unconverted record.id keys
 
        for record, values in modification_dict[conversion].items():
            sequence_length, top_coords, bottom_coords, sequence, complement = values
-            chromosome = record.split(f"_{unconverted}_")[0]  # Extract chromosome name
+
+            if not deaminase_footprinting:
+                chromosome = record.split(f"_{unconverted}_")[0]  # Extract chromosome name
+            else:
+                chromosome = record
 
            # Add **both strands** for converted records
            for strand in ["top", "bottom"]:
@@ -168,18 +218,20 @@ def filter_bams_by_mapping_threshold(bam_path_list, bam_files, mapping_threshold
     return records_to_analyze
 
 
-def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, tmp_dir, max_reference_length, device):
+def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, max_reference_length, device, deaminase_footprinting):
     """Worker function to process a single BAM file (must be at top-level for multiprocessing)."""
     adata_list = []
 
     for record in records_to_analyze:
-        sample = os.path.basename(bam).split(sep=".bam")[0]
+        sample = bam.stem
        chromosome = record_FASTA_dict[record][2]
        current_length = record_FASTA_dict[record][4]
        mod_type, strand = record_FASTA_dict[record][6], record_FASTA_dict[record][7]
+        sequence = chromosome_FASTA_dict[chromosome][0]
 
        # Extract Base Identities
-        fwd_bases, rev_bases = extract_base_identities(bam, record, range(current_length), max_reference_length)
+        fwd_bases, rev_bases, mismatch_counts_per_read, mismatch_trend_per_read = extract_base_identities(bam, record, range(current_length), max_reference_length, sequence)
+        mismatch_trend_series = pd.Series(mismatch_trend_per_read)
 
        # Skip processing if both forward and reverse base identities are empty
        if not fwd_bases and not rev_bases:
@@ -190,11 +242,11 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, tm
 
        # Binarize the Base Identities if they exist
        if fwd_bases:
-            fwd_bin = binarize_converted_base_identities(fwd_bases, strand, mod_type, bam, device)
+            fwd_bin = binarize_converted_base_identities(fwd_bases, strand, mod_type, bam, device, deaminase_footprinting, mismatch_trend_per_read)
            merged_bin.update(fwd_bin)
 
        if rev_bases:
-            rev_bin = binarize_converted_base_identities(rev_bases, strand, mod_type, bam, device)
+            rev_bin = binarize_converted_base_identities(rev_bases, strand, mod_type, bam, device, deaminase_footprinting, mismatch_trend_per_read)
            merged_bin.update(rev_bin)
 
        # Skip if merged_bin is empty (no valid binarized data)
@@ -257,11 +309,18 @@ def process_single_bam(bam_index, bam, records_to_analyze, record_FASTA_dict, tm
        adata.obs_names = bin_df.index.astype(str)
        adata.var_names = bin_df.columns.astype(str)
        adata.obs["Sample"] = [sample] * len(adata)
+        try:
+            barcode = sample.split('barcode')[1]
+        except:
+            barcode = np.nan
+        adata.obs["Barcode"] = [int(barcode)] * len(adata)
+        adata.obs["Barcode"] = adata.obs["Barcode"].astype(str)
        adata.obs["Reference"] = [chromosome] * len(adata)
        adata.obs["Strand"] = [strand] * len(adata)
        adata.obs["Dataset"] = [mod_type] * len(adata)
        adata.obs["Reference_dataset_strand"] = [f"{chromosome}_{mod_type}_{strand}"] * len(adata)
        adata.obs["Reference_strand"] = [f"{chromosome}_{strand}"] * len(adata)
+        adata.obs["Read_mismatch_trend"] = adata.obs_names.map(mismatch_trend_series)
 
        # Attach One-Hot Encodings to Layers
        adata.layers["A_binary_encoding"] = df_A
@@ -279,16 +338,16 @@ def timestamp():
     return time.strftime("[%Y-%m-%d %H:%M:%S]")
 
 
-def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, progress_queue):
+def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, deaminase_footprinting, progress_queue):
     """Worker function that processes a single BAM and writes the output to an H5AD file."""
     worker_id = current_process().pid  # Get worker process ID
-    sample = os.path.basename(bam).split(sep=".bam")[0]
+    sample = bam.stem
 
     try:
        print(f"{timestamp()} [Worker {worker_id}] Processing BAM: {sample}")
 
-        h5ad_path = os.path.join(h5_dir, f"{sample}.h5ad")
-        if os.path.exists(h5ad_path):
+        h5ad_path = h5_dir / bam.with_suffix(".h5ad").name
+        if h5ad_path.exists():
            print(f"{timestamp()} [Worker {worker_id}] Skipping {sample}: Already processed.")
            progress_queue.put(sample)
            return
@@ -302,10 +361,10 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict
            return
 
        # Process BAM
-        adata = process_single_bam(bam_index, bam, bam_records_to_analyze, shared_record_FASTA_dict, tmp_dir, max_reference_length, device)
+        adata = process_single_bam(bam_index, bam, bam_records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, max_reference_length, device, deaminase_footprinting)
 
        if adata is not None:
-            adata.write_h5ad(h5ad_path)
+            adata.write_h5ad(str(h5ad_path))
            print(f"{timestamp()} [Worker {worker_id}] Completed processing for BAM: {sample}")
 
            # Free memory
@@ -318,9 +377,9 @@ def worker_function(bam_index, bam, records_to_analyze, shared_record_FASTA_dict
        print(f"{timestamp()} [Worker {worker_id}] ERROR while processing {sample}:\n{traceback.format_exc()}")
        progress_queue.put(sample)  # Still signal completion to prevent deadlock
 
-def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device):
+def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, num_threads, max_reference_length, device, deaminase_footprinting):
     """Processes BAM files in parallel, writes each H5AD to disk, and concatenates them at the end."""
-    os.makedirs(h5_dir, exist_ok=True)  # Ensure h5_dir exists
+    make_dirs(h5_dir)  # Ensure h5_dir exists
 
     print(f"{timestamp()} Starting parallel BAM processing with {num_threads} threads...")
 
@@ -337,7 +396,7 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
 
     with Pool(processes=num_threads) as pool:
        results = [
-            pool.apply_async(worker_function, (i, bam, records_to_analyze, shared_record_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, progress_queue))
+            pool.apply_async(worker_function, (i, bam, records_to_analyze, shared_record_FASTA_dict, chromosome_FASTA_dict, tmp_dir, h5_dir, max_reference_length, device, deaminase_footprinting, progress_queue))
            for i, bam in enumerate(bam_path_list)
        ]
 
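The shared_record_FASTA_dict and progress_queue presumably come from a multiprocessing.Manager set up earlier in this function (not shown in these hunks). A self-contained sketch of the same report-on-completion pattern, with a hypothetical worker and inputs:

    from multiprocessing import Manager, Pool

    def _worker(item, progress_queue):
        # ... per-item work would happen here ...
        progress_queue.put(item)  # always signal completion, even on failure

    if __name__ == "__main__":
        items = ["barcode01.bam", "barcode02.bam"]
        with Manager() as manager:
            progress_queue = manager.Queue()
            with Pool(processes=2) as pool:
                for item in items:
                    pool.apply_async(_worker, (item, progress_queue))
                for _ in items:  # block until every worker has reported in
                    print(f"finished {progress_queue.get()}")
                pool.close()
                pool.join()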
@@ -356,7 +415,7 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
        pool.join()  # Ensure all workers finish
 
     # Final Concatenation Step
-    h5ad_files = [os.path.join(h5_dir, f) for f in os.listdir(h5_dir) if f.endswith(".h5ad")]
+    h5ad_files = [h5_dir / f for f in h5_dir.iterdir() if f.suffix == ".h5ad"]
 
     if not h5ad_files:
        print(f"{timestamp()} No valid H5AD files generated. Exiting.")
@@ -366,4 +425,93 @@ def process_bams_parallel(bam_path_list, records_to_analyze, record_FASTA_dict,
     final_adata = ad.concat([ad.read_h5ad(f) for f in h5ad_files], join="outer")
 
     print(f"{timestamp()} Successfully generated final AnnData object.")
-    return final_adata
+    return final_adata
+
+def delete_intermediate_h5ads_and_tmpdir(
+    h5_dir: Union[str, Path, Iterable[str], None],
+    tmp_dir: Optional[Union[str, Path]] = None,
+    *,
+    dry_run: bool = False,
+    verbose: bool = True,
+):
+    """
+    Delete intermediate .h5ad files and a temporary directory.
+
+    Parameters
+    ----------
+    h5_dir : str | Path | iterable[str] | None
+        If a directory path is given, all files directly inside it will be considered.
+        If an iterable of file paths is given, those files will be considered.
+        Only files ending with '.h5ad' (and not ending with '.gz') are removed.
+    tmp_dir : str | Path | None
+        Path to a directory to remove recursively (e.g. a temp dir created earlier).
+    dry_run : bool
+        If True, print what *would* be removed but do not actually delete.
+    verbose : bool
+        Print progress / warnings.
+    """
+    # Helper: remove a single file path (Path-like or string)
+    def _maybe_unlink(p: Path):
+        if not p.exists():
+            if verbose:
+                print(f"[skip] not found: {p}")
+            return
+        if not p.is_file():
+            if verbose:
+                print(f"[skip] not a file: {p}")
+            return
+        if dry_run:
+            print(f"[dry-run] would remove file: {p}")
+            return
+        try:
+            p.unlink()
+            if verbose:
+                print(f"Removed file: {p}")
+        except Exception as e:
+            print(f"[error] failed to remove file {p}: {e}")
+
+    # Handle h5_dir input (directory OR iterable of file paths)
+    if h5_dir is not None:
+        # If it's a path to a directory, iterate its children
+        if isinstance(h5_dir, (str, Path)) and Path(h5_dir).is_dir():
+            dpath = Path(h5_dir)
+            for p in dpath.iterdir():
+                # only target top-level files (not recursing); require '.h5ad' suffix and exclude gz
+                name = p.name.lower()
+                if name.endswith(".h5ad") and not name.endswith(".gz"):
+                    _maybe_unlink(p)
+                else:
+                    if verbose:
+                        # optional: comment this out if too noisy
+                        print(f"[skip] not matching pattern: {p.name}")
+        else:
+            # treat as iterable of file paths
+            for f in h5_dir:
+                p = Path(f)
+                name = p.name.lower()
+                if name.endswith(".h5ad") and not name.endswith(".gz"):
+                    _maybe_unlink(p)
+                else:
+                    if verbose:
+                        print(f"[skip] not matching pattern or not a file: {p}")
+
+    # Remove tmp_dir recursively (if provided)
+    if tmp_dir is not None:
+        td = Path(tmp_dir)
+        if not td.exists():
+            if verbose:
+                print(f"[skip] tmp_dir not found: {td}")
+        else:
+            if not td.is_dir():
+                if verbose:
+                    print(f"[skip] tmp_dir is not a directory: {td}")
+            else:
+                if dry_run:
+                    print(f"[dry-run] would remove directory tree: {td}")
+                else:
+                    try:
+                        shutil.rmtree(td)
+                        if verbose:
+                            print(f"Removed directory tree: {td}")
+                    except Exception as e:
+                        print(f"[error] failed to remove tmp dir {td}: {e}")
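The cleanup helper can be exercised safely before committing to deletion; a short usage sketch (hypothetical paths):

    from pathlib import Path

    # Preview what would be removed, without deleting anything.
    delete_intermediate_h5ads_and_tmpdir(Path("out/h5ads"), Path("out/tmp"), dry_run=True)

    # Actual cleanup: per-sample intermediate '.h5ad' files are unlinked, the
    # 'tmp' tree is removed recursively, and the final '.h5ad.gz' is untouched.
    delete_intermediate_h5ads_and_tmpdir(Path("out/h5ads"), Path("out/tmp"))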
smftools/informatics/fasta_functions.py (new file)

@@ -0,0 +1,255 @@
+from ..readwrite import make_dirs, time_string
+
+import os
+import subprocess
+from pathlib import Path
+
+from typing import Union, List, Dict, Tuple
+
+import numpy as np
+import gzip
+
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+from Bio.Seq import Seq
+from pyfaidx import Fasta
+import pysam
+
+from concurrent.futures import ProcessPoolExecutor
+from itertools import chain
+
+def _convert_FASTA_record(record, modification_type, strand, unconverted):
+    """Converts a FASTA record based on modification type and strand."""
+    conversion_maps = {
+        ('5mC', 'top'): ('C', 'T'),
+        ('5mC', 'bottom'): ('G', 'A'),
+        ('6mA', 'top'): ('A', 'G'),
+        ('6mA', 'bottom'): ('T', 'C')
+    }
+
+    sequence = str(record.seq).upper()
+
+    if modification_type == unconverted:
+        return SeqRecord(Seq(sequence), id=f"{record.id}_{modification_type}_top", description=record.description)
+
+    if (modification_type, strand) not in conversion_maps:
+        raise ValueError(f"Invalid combination: {modification_type}, {strand}")
+
+    original_base, converted_base = conversion_maps[(modification_type, strand)]
+    new_seq = sequence.replace(original_base, converted_base)
+
+    return SeqRecord(Seq(new_seq), id=f"{record.id}_{modification_type}_{strand}", description=record.description)
+
+def _process_fasta_record(args):
+    """
+    Processes a single FASTA record for parallel execution.
+    Args:
+        args (tuple): (record, modification_types, strands, unconverted)
+    Returns:
+        list of modified SeqRecord objects.
+    """
+    record, modification_types, strands, unconverted = args
+    modified_records = []
+
+    for modification_type in modification_types:
+        for i, strand in enumerate(strands):
+            if i > 0 and modification_type == unconverted:
+                continue  # Ensure unconverted is added only once
+
+            modified_records.append(_convert_FASTA_record(record, modification_type, strand, unconverted))
+
+    return modified_records
+
+def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta, num_threads=4, chunk_size=500):
+    """
+    Converts an input FASTA file and writes a new converted FASTA file efficiently.
+
+    Parameters:
+        input_fasta (str): Path to the unconverted FASTA file.
+        modification_types (list): List of modification types ('5mC', '6mA', or unconverted).
+        strands (list): List of strands ('top', 'bottom').
+        output_fasta (str): Path to the converted FASTA output file.
+        num_threads (int): Number of parallel threads to use.
+        chunk_size (int): Number of records to process per write batch.
+
+    Returns:
+        None (Writes the converted FASTA file).
+    """
+    unconverted = modification_types[0]
+    input_fasta = str(input_fasta)
+    output_fasta = str(output_fasta)
+
+    # Detect if input is gzipped
+    open_func = gzip.open if input_fasta.endswith('.gz') else open
+    file_mode = 'rt' if input_fasta.endswith('.gz') else 'r'
+
+    def _fasta_record_generator():
+        """Lazily yields FASTA records from file."""
+        with open_func(input_fasta, file_mode) as handle:
+            for record in SeqIO.parse(handle, 'fasta'):
+                yield record
+
+    with open(output_fasta, 'w') as output_handle, ProcessPoolExecutor(max_workers=num_threads) as executor:
+        # Process records in parallel using a named function (avoiding lambda)
+        results = executor.map(
+            _process_fasta_record,
+            ((record, modification_types, strands, unconverted) for record in _fasta_record_generator())
+        )
+
+        buffer = []
+        for modified_records in results:
+            buffer.extend(modified_records)
+
+            # Write out in chunks to save memory
+            if len(buffer) >= chunk_size:
+                SeqIO.write(buffer, output_handle, 'fasta')
+                buffer.clear()
+
+        # Write any remaining records
+        if buffer:
+            SeqIO.write(buffer, output_handle, 'fasta')
+
+ def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
113
+ fasta = Path(fasta)
114
+ pysam.faidx(str(fasta)) # creates <fasta>.fai
115
+
116
+ fai = fasta.with_suffix(fasta.suffix + ".fai")
117
+ if write_chrom_sizes:
118
+ chrom_sizes = fasta.with_suffix(".chrom.sizes")
119
+ with fai.open() as f_in, chrom_sizes.open("w") as out:
120
+ for line in f_in:
121
+ chrom, size = line.split()[:2]
122
+ out.write(f"{chrom}\t{size}\n")
123
+ return chrom_sizes
124
+ return fai
125
+
126
+ def get_chromosome_lengths(fasta: str | Path) -> Path:
127
+ """
128
+ Create (or reuse) <fasta>.chrom.sizes, derived from the FASTA index.
129
+ """
130
+ fasta = Path(fasta)
131
+ fai = fasta.with_suffix(fasta.suffix + ".fai")
132
+ if not fai.exists():
133
+ index_fasta(fasta, write_chrom_sizes=True) # will also create .chrom.sizes
134
+ chrom_sizes = fasta.with_suffix(".chrom.sizes")
135
+ if chrom_sizes.exists():
136
+ print(f"Using existing chrom length file: {chrom_sizes}")
137
+ return chrom_sizes
138
+
139
+ # Build chrom.sizes from .fai
140
+ with fai.open() as f_in, chrom_sizes.open("w") as out:
141
+ for line in f_in:
142
+ chrom, size = line.split()[:2]
143
+ out.write(f"{chrom}\t{size}\n")
144
+ return chrom_sizes
145
+
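The two indexing helpers chain together: index_fasta calls pysam.faidx and optionally derives a chrom.sizes table, which get_chromosome_lengths reuses on later calls. A sketch (hypothetical path):

    from pathlib import Path

    # First call builds refs/genome.fa.fai and refs/genome.chrom.sizes.
    chrom_sizes = index_fasta(Path("refs/genome.fa"))

    # Subsequent calls find the existing chrom.sizes file and return it directly.
    chrom_sizes = get_chromosome_lengths(Path("refs/genome.fa"))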
+def get_native_references(fasta_file: str | Path) -> Dict[str, Tuple[int, str]]:
+    """
+    Return {record_id: (length, sequence)} from a FASTA.
+    Direct methylation specific.
+    """
+    fasta_file = Path(fasta_file)
+    print(f"{time_string()}: Opening FASTA file {fasta_file}")
+    record_dict: Dict[str, Tuple[int, str]] = {}
+    with fasta_file.open("r") as f:
+        for rec in SeqIO.parse(f, "fasta"):
+            seq = str(rec.seq).upper()
+            record_dict[rec.id] = (len(seq), seq)
+    return record_dict
+
+def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_footprinting=False):
+    """
+    Finds genomic coordinates of modified bases (5mC or 6mA) in a reference FASTA file.
+
+    Parameters:
+        fasta_file (str): Path to the converted reference FASTA.
+        modification_type (str): Modification type ('5mC' or '6mA') or 'unconverted'.
+        conversions (list): List of conversion types. The first element is the unconverted record type.
+        deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
+
+    Returns:
+        dict: Dictionary where keys are **both unconverted & converted record names**.
+            Values contain:
+            [sequence length, top strand coordinates, bottom strand coordinates, sequence, complement sequence].
+    """
+    unconverted = conversions[0]
+    record_dict = {}
+
+    # Define base mapping based on modification type
+    base_mappings = {
+        '5mC': ('C', 'G'),  # Cytosine and Guanine
+        '6mA': ('A', 'T')   # Adenine and Thymine
+    }
+
+    # Read FASTA file and process records
+    with open(fasta_file, "r") as f:
+        for record in SeqIO.parse(f, "fasta"):
+            if unconverted in record.id or deaminase_footprinting:
+                sequence = str(record.seq).upper()
+                complement = str(record.seq.complement()).upper()
+                sequence_length = len(sequence)
+
+                # Unconverted case: store the full sequence without coordinate filtering
+                if modification_type == unconverted:
+                    record_dict[record.id] = [sequence_length, [], [], sequence, complement]
+
+                # Process converted records: extract modified base positions
+                elif modification_type in base_mappings:
+                    top_base, bottom_base = base_mappings[modification_type]
+                    seq_array = np.array(list(sequence))
+                    top_strand_coordinates = np.where(seq_array == top_base)[0].tolist()
+                    bottom_strand_coordinates = np.where(seq_array == bottom_base)[0].tolist()
+
+                    record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement]
+
+                else:
+                    raise ValueError(f"Invalid modification_type: {modification_type}. Choose '5mC', '6mA', or 'unconverted'.")
+
+    return record_dict
+
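Each entry of the returned dict bundles length, per-strand coordinates, and both sequences, so callers unpack it positionally; a consuming sketch (hypothetical reference):

    sites = find_conversion_sites(
        "refs/genome_converted.fa",
        modification_type="5mC",
        conversions=["unconverted", "5mC"],
    )
    for record_id, (length, top_coords, bottom_coords, seq, comp) in sites.items():
        # top_coords: 0-based positions of 'C' on the top strand;
        # bottom_coords: positions of 'G', i.e. bottom-strand cytosines.
        print(record_id, length, len(top_coords), len(bottom_coords))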
+def subsample_fasta_from_bed(
+    input_FASTA: str | Path,
+    input_bed: str | Path,
+    output_directory: str | Path,
+    output_FASTA: str | Path
+) -> None:
+    """
+    Take a genome-wide FASTA file and a BED file containing
+    coordinate windows of interest. Outputs a subsampled FASTA.
+    """
+
+    # Normalize everything to Path
+    input_FASTA = Path(input_FASTA)
+    input_bed = Path(input_bed)
+    output_directory = Path(output_directory)
+    output_FASTA = Path(output_FASTA)
+
+    # Ensure output directory exists
+    output_directory.mkdir(parents=True, exist_ok=True)
+
+    output_FASTA_path = output_directory / output_FASTA
+
+    # Load the FASTA file using pyfaidx
+    fasta = Fasta(str(input_FASTA))  # pyfaidx requires string paths
+
+    # Open BED + output FASTA
+    with input_bed.open("r") as bed, output_FASTA_path.open("w") as out_fasta:
+        for line in bed:
+            fields = line.strip().split()
+            chrom = fields[0]
+            start = int(fields[1])  # BED is 0-based
+            end = int(fields[2])    # BED is 0-based and end is exclusive
+            desc = " ".join(fields[3:]) if len(fields) > 3 else ""
+
+            if chrom not in fasta:
+                print(f"Warning: {chrom} not found in FASTA")
+                continue
+
+            # pyfaidx is 1-based indexing internally, but [start:end] works with BED coords
+            sequence = fasta[chrom][start:end].seq
+
+            header = f">{chrom}:{start}-{end}"
+            if desc:
+                header += f" {desc}"
+
+            out_fasta.write(f"{header}\n{sequence}\n")
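A closing usage sketch (hypothetical inputs); each BED interval becomes one output record whose header encodes its coordinates:

    from pathlib import Path

    # A windows.bed line such as: chr1<TAB>1000<TAB>2000<TAB>promoter
    subsample_fasta_from_bed(
        input_FASTA=Path("refs/genome.fa"),
        input_bed=Path("regions/windows.bed"),
        output_directory=Path("refs/subsampled"),
        output_FASTA=Path("windows.fa"),
    )
    # Writes '>chr1:1000-2000 promoter' followed by the 1 kb sequence.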