smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -18,13 +18,12 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
         bam_files (list): List of split BAM file path strings
     Splits an input BAM file on barcode value and makes a BAM index file.
     """
-    from .. import readwrite
+    from ...readwrite import make_dirs
     import os
     import subprocess
     import glob
-    from .make_dirs import make_dirs
 
-    input_bam = aligned_sorted_BAM + bam_suffix
+    input_bam = aligned_sorted_BAM.with_suffix(bam_suffix)
     command = ["dorado", "demux", "--kit-name", barcode_kit]
     if barcode_both_ends:
         command.append("--barcode-both-ends")
@@ -34,17 +33,16 @@ def demux_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, barcode_kit,
         command += ["-t", str(threads)]
     else:
         pass
-    command += ["--emit-summary", "--sort-bam", "--output-dir", split_dir]
-    command.append(input_bam)
+    command += ["--emit-summary", "--sort-bam", "--output-dir", str(split_dir)]
+    command.append(str(input_bam))
     command_string = ' '.join(command)
     print(f"Running: {command_string}")
     subprocess.run(command)
 
-    # Make a BAM index file for the BAMs in that directory
-    bam_pattern = '*' + bam_suffix
-    bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
-    bam_files = [bam for bam in bam_files if '.bai' not in bam and 'unclassified' not in bam]
-    bam_files.sort()
+    bam_files = sorted(
+        p for p in split_dir.glob(f"*{bam_suffix}")
+        if p.is_file() and p.suffix == bam_suffix and "unclassified" not in p.name
+    )
 
     if not bam_files:
         raise FileNotFoundError(f"No BAM files found in {split_dir} with suffix {bam_suffix}")
@@ -1,4 +1,4 @@
-def extract_base_identities(bam_file, chromosome, positions, max_reference_length):
+def extract_base_identities(bam_file, chromosome, positions, max_reference_length, sequence):
     """
     Efficiently extracts base identities from mapped reads with reference coordinates.
 
@@ -7,6 +7,7 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
         chromosome (str): Name of the reference chromosome.
         positions (list): Positions to extract (0-based).
         max_reference_length (int): Maximum reference length for padding.
+        sequence (str): The sequence of the record fasta
 
     Returns:
         dict: Base identities from forward mapped reads.
@@ -16,16 +17,19 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
     import numpy as np
     from collections import defaultdict
     import time
+    from collections import defaultdict, Counter
 
     timestamp = time.strftime("[%Y-%m-%d %H:%M:%S]")
 
     positions = set(positions)
     fwd_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
     rev_base_identities = defaultdict(lambda: np.full(max_reference_length, 'N', dtype='<U1'))
+    mismatch_counts_per_read = defaultdict(lambda: defaultdict(Counter))
 
     #print(f"{timestamp} Reading reads from {chromosome} BAM file: {bam_file}")
-    with pysam.AlignmentFile(bam_file, "rb") as bam:
+    with pysam.AlignmentFile(str(bam_file), "rb") as bam:
         total_reads = bam.mapped
+        ref_seq = sequence.upper()
         for read in bam.fetch(chromosome):
             if not read.is_mapped:
                 continue  # Skip unmapped reads
@@ -39,6 +43,28 @@ def extract_base_identities(bam_file, chromosome, positions, max_reference_lengt
 
             for read_position, reference_position in aligned_pairs:
                 if reference_position in positions:
-                    base_dict[read_name][reference_position] = query_sequence[read_position]
+                    read_base = query_sequence[read_position]
+                    ref_base = ref_seq[reference_position]
 
-    return dict(fwd_base_identities), dict(rev_base_identities)
+                    base_dict[read_name][reference_position] = read_base
+
+                    # Track mismatches (excluding Ns)
+                    if read_base != ref_base and read_base != 'N' and ref_base != 'N':
+                        mismatch_counts_per_read[read_name][ref_base][read_base] += 1
+
+    # Determine C→T vs G→A dominance per read
+    mismatch_trend_per_read = {}
+    for read_name, ref_dict in mismatch_counts_per_read.items():
+        c_to_t = ref_dict.get("C", {}).get("T", 0)
+        g_to_a = ref_dict.get("G", {}).get("A", 0)
+
+        if abs(c_to_t - g_to_a) < 0.01 and c_to_t > 0:
+            mismatch_trend_per_read[read_name] = "equal"
+        elif c_to_t > g_to_a:
+            mismatch_trend_per_read[read_name] = "C->T"
+        elif g_to_a > c_to_t:
+            mismatch_trend_per_read[read_name] = "G->A"
+        else:
+            mismatch_trend_per_read[read_name] = "none"
+
+    return dict(fwd_base_identities), dict(rev_base_identities), dict(mismatch_counts_per_read), mismatch_trend_per_read
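Note: the new per-read mismatch tallies drive the C->T vs G->A trend call; since the counts are integers, the `abs(c_to_t - g_to_a) < 0.01` test is effectively an equality check. A standalone sketch of the bookkeeping on toy data (not the package function itself):

    from collections import Counter, defaultdict

    # Toy mismatch stream for one read: (reference base, read base) pairs
    observed = [("C", "T"), ("C", "T"), ("G", "A"), ("C", "G")]

    mismatches = defaultdict(Counter)  # ref_base -> Counter of observed read bases
    for ref_base, read_base in observed:
        if ref_base != read_base and "N" not in (ref_base, read_base):
            mismatches[ref_base][read_base] += 1

    c_to_t = mismatches["C"]["T"]  # 2
    g_to_a = mismatches["G"]["A"]  # 1
    if c_to_t == g_to_a and c_to_t > 0:
        trend = "equal"
    elif c_to_t > g_to_a:
        trend = "C->T"
    elif g_to_a > c_to_t:
        trend = "G->A"
    else:
        trend = "none"
    print(trend)  # C->T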
@@ -23,9 +23,9 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
     import glob
     import zipfile
 
-    os.chdir(mod_tsv_dir)
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    bam_files = glob.glob(os.path.join(split_dir, f"*{bam_suffix}"))
+    bam_files = glob.glob(split_dir / f"*{bam_suffix}")
+    print(f"Running modkit extract for the following bam files: {bam_files}")
 
     if threads:
         threads = str(threads)
@@ -35,20 +35,20 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
     for input_file in bam_files:
        print(input_file)
        # Extract the file basename
-        file_name = os.path.basename(input_file)
+        file_name = input_file.name
        if skip_unclassified and "unclassified" in file_name:
            print("Skipping modkit extract on unclassified reads")
        else:
            # Construct the output TSV file path
-            output_tsv_temp = os.path.join(mod_tsv_dir, file_name)
-            output_tsv = output_tsv_temp.replace(bam_suffix, "") + "_extract.tsv"
-            if os.path.exists(f"{output_tsv}.gz"):
-                print(f"{output_tsv}.gz already exists, skipping modkit extract")
+            output_tsv = mod_tsv_dir / file_name.stem + "_extract.tsv"
+            output_tsv_gz = output_tsv + '.gz'
+            if output_tsv_gz.exists():
+                print(f"{output_tsv_gz} already exists, skipping modkit extract")
            else:
                print(f"Extracting modification data from {input_file}")
                if modkit_summary:
                    # Run modkit summary
-                    subprocess.run(["modkit", "summary", input_file])
+                    subprocess.run(["modkit", "summary", str(input_file)])
                else:
                    pass
                # Run modkit extract
@@ -61,7 +61,7 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
                        "--mod-thresholds", f"a:{m6A_threshold}",
                        "--mod-thresholds", f"h:{hm5C_threshold}",
                        "-t", threads,
-                        input_file, output_tsv
+                        str(input_file), str(output_tsv)
                    ]
                else:
                    extract_command = [
@@ -71,13 +71,15 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
                        "--mod-thresholds", f"m:{m5C_threshold}",
                        "--mod-thresholds", f"a:{m6A_threshold}",
                        "--mod-thresholds", f"h:{hm5C_threshold}",
-                        input_file, output_tsv
+                        str(input_file), str(output_tsv)
                    ]
                subprocess.run(extract_command)
                # Zip the output TSV
                print(f'zipping {output_tsv}')
                if threads:
-                    zip_command = ["pigz", "-f", "-p", threads, output_tsv]
+                    zip_command = ["pigz", "-f", "-p", threads, str(output_tsv)]
                else:
-                    zip_command = ["pigz", "-f", output_tsv]
-                subprocess.run(zip_command, check=True)
+                    zip_command = ["pigz", "-f", str(output_tsv)]
+                subprocess.run(zip_command, check=True)
+
+    return
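Note: two of the new expressions above look fragile as released. `glob.glob()` returns plain strings even when given a `Path`, so the later `input_file.name` / `file_name.stem` attribute lookups would fail on `str`, and `Path + str` concatenation raises `TypeError`. A hedged sketch of how the intended output-path construction can be spelled with pathlib alone (paths hypothetical):

    from pathlib import Path

    input_file = Path("demux/sample_barcode01.bam")  # hypothetical split BAM
    mod_tsv_dir = Path("mod_tsvs")

    output_tsv = mod_tsv_dir / (input_file.stem + "_extract.tsv")
    output_tsv_gz = output_tsv.with_suffix(output_tsv.suffix + ".gz")
    print(output_tsv)     # mod_tsvs/sample_barcode01_extract.tsv
    print(output_tsv_gz)  # mod_tsvs/sample_barcode01_extract.tsv.gz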
@@ -2,7 +2,7 @@
 
 def extract_read_features_from_bam(bam_file_path):
     """
-    Make a dict of reads from a bam that points to a list of read metrics: read length, read median Q-score, reference length.
+    Make a dict of reads from a bam that points to a list of read metrics: read length, read median Q-score, reference length, mapped length, mapping quality
     Params:
         bam_file_path (str):
     Returns:
@@ -26,6 +26,8 @@ def extract_read_features_from_bam(bam_file_path):
         reference_name = read.reference_name
         reference_index = bam_file.references.index(reference_name)
         reference_length = reference_lengths[reference_index]
-        read_metrics[read.query_name] = [read.query_length, median_read_quality, reference_length]
+        mapped_length = sum(end - start for start, end in read.get_blocks())
+        mapping_quality = read.mapping_quality  # Phred-scaled MAPQ
+        read_metrics[read.query_name] = [read.query_length, median_read_quality, reference_length, mapped_length, mapping_quality]
 
     return read_metrics
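Note: pysam's `AlignedSegment.get_blocks()` returns the gapless reference blocks an alignment covers, so summing their spans counts aligned bases only (insertions, deletions, clips, and skips do not contribute). A minimal standalone sketch of the new metric (BAM path hypothetical; fetch() needs a .bai index):

    import pysam

    with pysam.AlignmentFile("aligned_sorted.bam", "rb") as bam:
        for read in bam.fetch():
            if read.is_unmapped:
                continue
            # (start, end) per gapless aligned block, summed to a mapped length
            mapped_length = sum(end - start for start, end in read.get_blocks())
            print(read.query_name, read.query_length, mapped_length, read.mapping_quality)
            break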
@@ -1,11 +1,12 @@
-def find_conversion_sites(fasta_file, modification_type, conversion_types):
+def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_footprinting=False):
     """
     Finds genomic coordinates of modified bases (5mC or 6mA) in a reference FASTA file.
 
     Parameters:
         fasta_file (str): Path to the converted reference FASTA.
         modification_type (str): Modification type ('5mC' or '6mA') or 'unconverted'.
-        conversion_types (list): List of conversion types. The first element is the unconverted record type.
+        conversions (list): List of conversion types. The first element is the unconverted record type.
+        deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
 
     Returns:
         dict: Dictionary where keys are **both unconverted & converted record names**.
@@ -14,7 +15,7 @@ def find_conversion_sites(fasta_file, modification_type, conversion_types):
     """
     import numpy as np
     from Bio import SeqIO
-    unconverted = conversion_types[0]
+    unconverted = conversions[0]
     record_dict = {}
 
     # Define base mapping based on modification type
@@ -26,7 +27,7 @@ def find_conversion_sites(fasta_file, modification_type, conversion_types):
     # Read FASTA file and process records
     with open(fasta_file, "r") as f:
         for record in SeqIO.parse(f, "fasta"):
-            if unconverted in record.id:
+            if unconverted in record.id or deaminase_footprinting:
                 sequence = str(record.seq).upper()
                 complement = str(record.seq.complement()).upper()
                 sequence_length = len(sequence)
@@ -67,6 +67,8 @@ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fa
         None (Writes the converted FASTA file).
     """
     unconverted = modification_types[0]
+    input_fasta = str(input_fasta)
+    output_fasta = str(output_fasta)
 
     # Detect if input is gzipped
     open_func = gzip.open if input_fasta.endswith('.gz') else open
@@ -8,25 +8,26 @@ def get_chromosome_lengths(fasta):
         fasta (str): Path to the input fasta
     """
     import os
+    from pathlib import Path
     import subprocess
     from .index_fasta import index_fasta
 
     # Make a fasta index file if one isn't already available
-    index_path = f'{fasta}.fai'
-    if os.path.exists(index_path):
+    index_path = fasta / '.fai'
+    if index_path.exists():
         print(f'Using existing fasta index file: {index_path}')
     else:
         index_fasta(fasta)
 
-    parent_dir = os.path.dirname(fasta)
-    fasta_basename = os.path.basename(fasta)
-    chrom_basename = fasta_basename.split('.fa')[0] + '.chrom.sizes'
-    chrom_path = os.path.join(parent_dir, chrom_basename)
+    parent_dir = fasta.parent
+    fasta_basename = fasta.name
+    chrom_basename = fasta.stem + '.chrom.sizes'
+    chrom_path = parent_dir / chrom_basename
 
     # Make a chromosome length file
-    if os.path.exists(chrom_path):
+    if chrom_path.exists():
         print(f'Using existing chrom length index file: {chrom_path}')
     else:
         with open(chrom_path, 'w') as outfile:
-            command = ["cut", "-f1,2", index_path]
+            command = ["cut", "-f1,2", str(index_path)]
             subprocess.run(command, stdout=outfile)
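Note: one pathlib subtlety in this hunk. The `/` operator joins path components, so `fasta / '.fai'` produces a child path, not the sibling index file the old f-string built. A short sketch of the distinction (path illustrative):

    from pathlib import Path

    fasta = Path("refs/genome.fa")
    print(fasta / ".fai")                            # refs/genome.fa/.fai (child path)
    print(fasta.with_suffix(fasta.suffix + ".fai"))  # refs/genome.fa.fai (sibling index)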
@@ -0,0 +1,24 @@
+import pysam
+from pathlib import Path
+
+def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
+    """
+    Index a FASTA and optionally write <fasta>.chrom.sizes for bigwig/bedgraph work.
+
+    Returns
+    -------
+    Path: path to chrom.sizes file (if requested), else .fai
+    """
+    fasta = Path(fasta)
+    pysam.faidx(str(fasta))  # makes fasta.fai
+
+    if write_chrom_sizes:
+        fai = fasta.with_suffix(fasta.suffix + ".fai")
+        chrom_sizes = fasta.with_suffix(".chrom.sizes")
+        with open(fai) as f_in, open(chrom_sizes, "w") as out:
+            for line in f_in:
+                chrom, size = line.split()[:2]
+                out.write(f"{chrom}\t{size}\n")
+        return chrom_sizes
+
+    return fasta.with_suffix(fasta.suffix + ".fai")
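A usage sketch with the definition above in scope (paths illustrative). Note that `with_suffix(".chrom.sizes")` replaces the FASTA's final suffix, so `genome.fa` maps to `genome.chrom.sizes`:

    chrom_sizes = index_fasta("refs/genome.fa")
    # writes refs/genome.fa.fai and refs/genome.chrom.sizes, returns the latter

    fai = index_fasta("refs/genome.fa", write_chrom_sizes=False)
    # returns refs/genome.fa.fai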
@@ -13,10 +13,9 @@ def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
     import os
     import subprocess
 
-    os.chdir(mod_bed_dir)
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
     command = [
-        "modkit", "pileup", aligned_sorted_output, mod_bed_dir,
+        "modkit", "pileup", str(aligned_sorted_output), str(mod_bed_dir),
         "--partition-tag", "BC",
         "--only-tabs",
         "--filter-threshold", f'{filter_threshold}',
@@ -16,9 +16,9 @@ def modQC(aligned_sorted_output, thresholds):
     import subprocess
 
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    subprocess.run(["modkit", "sample-probs", aligned_sorted_output])
+    subprocess.run(["modkit", "sample-probs", str(aligned_sorted_output)])
     command = [
-        "modkit", "summary", aligned_sorted_output,
+        "modkit", "summary", str(aligned_sorted_output),
         "--filter-threshold", f"{filter_threshold}",
         "--mod-thresholds", f"m:{m5C_threshold}",
         "--mod-thresholds", f"a:{m6A_threshold}",
@@ -0,0 +1,250 @@
+# plot_bed_histograms
+
+def plot_bed_histograms(
+    bed_file,
+    plotting_directory,
+    fasta,
+    *,
+    bins=60,
+    clip_quantiles=(0.0, 0.995),
+    cov_bin_size=1000,            # coverage bin size in bp
+    rows_per_fig=6,               # paginate if many chromosomes
+    include_mapq_quality=True,    # add MAPQ + avg read quality columns to grid
+    coordinate_mode="one_based",  # "one_based" (your BED-like) or "zero_based"
+):
+    """
+    Plot per-chromosome QC grids from a BED-like file.
+
+    Expects columns:
+        chrom, start, end, read_len, qname, mapq, avg_base_qual
+
+    For each chromosome:
+      - Column 1: Read length histogram
+      - Column 2: Coverage across the chromosome (binned)
+      - (optional) Column 3: MAPQ histogram
+      - (optional) Column 4: Avg base quality histogram
+
+    The figure is paginated: rows = chromosomes (up to rows_per_fig), columns depend on include_mapq_quality.
+    Saves one PNG per page under `plotting_directory`.
+
+    Parameters
+    ----------
+    bed_file : str
+    plotting_directory : str
+    fasta : str
+        Reference FASTA (used to get chromosome lengths).
+    bins : int
+        Histogram bins for read length / MAPQ / quality.
+    clip_quantiles : (float, float)
+        Clip hist tails for readability (e.g., (0, 0.995)).
+    cov_bin_size : int
+        Bin size (bp) for coverage plot; bigger = faster/coarser.
+    rows_per_fig : int
+        Number of chromosomes per page.
+    include_mapq_quality : bool
+        If True, add MAPQ and avg base quality histograms as extra columns.
+    coordinate_mode : {"one_based","zero_based"}
+        One-based, inclusive (your file) vs BED-standard zero-based, half-open.
+    """
+    import os
+    import numpy as np
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    import pysam
+
+    os.makedirs(plotting_directory, exist_ok=True)
+
+    bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
+    print(f"[plot_bed_histograms] Loading: {bed_file}")
+
+    # Load BED-like table
+    cols = ['chrom', 'start', 'end', 'read_len', 'qname', 'mapq', 'avg_q']
+    df = pd.read_csv(bed_file, sep="\t", header=None, names=cols, dtype={
+        'chrom': str, 'start': int, 'end': int, 'read_len': int, 'qname': str,
+        'mapq': float, 'avg_q': float
+    })
+
+    # Drop unaligned records (chrom == '*') if present
+    df = df[df['chrom'] != '*'].copy()
+    if df.empty:
+        print("[plot_bed_histograms] No aligned reads found; nothing to plot.")
+        return
+
+    # Ensure coordinate mode consistent; convert to 0-based half-open for bin math internally
+    # Input is typically one_based inclusive (from your writer).
+    if coordinate_mode not in {"one_based", "zero_based"}:
+        raise ValueError("coordinate_mode must be 'one_based' or 'zero_based'")
+
+    if coordinate_mode == "one_based":
+        # convert to 0-based half-open [start0, end0)
+        start0 = df['start'].to_numpy() - 1
+        end0 = df['end'].to_numpy()  # inclusive in input -> +1 already handled by not subtracting
+    else:
+        # already 0-based half-open (assumption)
+        start0 = df['start'].to_numpy()
+        end0 = df['end'].to_numpy()
+
+    # Clip helper for hist tails
+    def _clip_series(s, q=(0.0, 0.995)):
+        if q is None:
+            return s.to_numpy()
+        lo = s.quantile(q[0]) if q[0] is not None else s.min()
+        hi = s.quantile(q[1]) if q[1] is not None else s.max()
+        x = s.to_numpy(dtype=float)
+        return np.clip(x, lo, hi)
+
+    # Load chromosome order/lengths from FASTA
+    with pysam.FastaFile(fasta) as fa:
+        ref_names = list(fa.references)
+        ref_lengths = dict(zip(ref_names, fa.lengths))
+
+    # Keep only chroms present in FASTA and with at least one read
+    chroms = [c for c in df['chrom'].unique() if c in ref_lengths]
+    # Order chromosomes by FASTA order
+    chrom_order = [c for c in ref_names if c in chroms]
+
+    if not chrom_order:
+        print("[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting.")
+        return
+
+    # Pagination
+    def _sanitize(name: str) -> str:
+        return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
+
+    cols_per_fig = 4 if include_mapq_quality else 2
+
+    for start_idx in range(0, len(chrom_order), rows_per_fig):
+        chunk = chrom_order[start_idx:start_idx + rows_per_fig]
+        nrows = len(chunk)
+        ncols = cols_per_fig
+
+        fig, axes = plt.subplots(
+            nrows=nrows, ncols=ncols,
+            figsize=(4.0 * ncols, 2.6 * nrows),
+            dpi=160,
+            squeeze=False
+        )
+
+        for r, chrom in enumerate(chunk):
+            chrom_len = ref_lengths[chrom]
+            mask = (df['chrom'].to_numpy() == chrom)
+
+            # Slice per-chrom arrays for speed
+            s0 = start0[mask]
+            e0 = end0[mask]
+            len_arr = df.loc[mask, 'read_len']
+            mapq_arr = df.loc[mask, 'mapq']
+            q_arr = df.loc[mask, 'avg_q']
+
+            # --- Col 1: Read length histogram (clipped) ---
+            ax = axes[r, 0]
+            ax.hist(_clip_series(len_arr, clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+            if r == 0:
+                ax.set_title("Read length")
+            ax.set_ylabel(f"{chrom}\n(n={mask.sum()})")
+            ax.set_xlabel("bp")
+            ax.grid(alpha=0.25)
+
+            # --- Col 2: Coverage (binned over genome) ---
+            ax = axes[r, 1]
+            nb = max(1, int(np.ceil(chrom_len / cov_bin_size)))
+            # Bin edges in 0-based coords
+            edges = np.linspace(0, chrom_len, nb + 1, dtype=int)
+
+            # Compute per-bin "read count coverage": number of reads overlapping each bin.
+            # Approximate by incrementing all bins touched by the interval.
+            # (Fast and memory-light; for exact base coverage use smaller cov_bin_size.)
+            cov = np.zeros(nb, dtype=np.int32)
+            # bin indices overlapped by each read (0-based half-open)
+            b0 = np.minimum(np.searchsorted(edges, s0, side="right") - 1, nb - 1)
+            b1 = np.maximum(np.searchsorted(edges, np.maximum(e0 - 1, 0), side="right") - 1, 0)
+            # ensure valid ordering
+            b_lo = np.minimum(b0, b1)
+            b_hi = np.maximum(b0, b1)
+
+            # Increment all bins in range; loop but at bin resolution (fast for reasonable cov_bin_size).
+            for lo, hi in zip(b_lo, b_hi):
+                cov[lo:hi + 1] += 1
+
+            x_mid = (edges[:-1] + edges[1:]) / 2.0
+            ax.plot(x_mid, cov)
+            if r == 0:
+                ax.set_title(f"Coverage (~{cov_bin_size} bp bins)")
+            ax.set_xlim(0, chrom_len)
+            ax.set_xlabel("Position (bp)")
+            ax.set_ylabel("")  # already show chrom on col 1
+            ax.grid(alpha=0.25)
+
+            if include_mapq_quality:
+                # --- Col 3: MAPQ ---
+                ax = axes[r, 2]
+                # Clip MAPQ upper tail if needed (usually 60)
+                ax.hist(_clip_series(mapq_arr.fillna(0), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+                if r == 0:
+                    ax.set_title("MAPQ")
+                ax.set_xlabel("MAPQ")
+                ax.grid(alpha=0.25)
+
+                # --- Col 4: Avg base quality ---
+                ax = axes[r, 3]
+                ax.hist(_clip_series(q_arr.fillna(np.nan), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+                if r == 0:
+                    ax.set_title("Avg base qual")
+                ax.set_xlabel("Phred")
+                ax.grid(alpha=0.25)
+
+        fig.suptitle(
+            f"{bed_basename} — per-chromosome QC "
+            f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
+            y=0.995, fontsize=11
+        )
+        fig.tight_layout(rect=[0, 0, 1, 0.98])
+
+        page = start_idx // rows_per_fig + 1
+        out_png = os.path.join(plotting_directory, f"{_sanitize(bed_basename)}_qc_page{page}.png")
+        plt.savefig(out_png, bbox_inches="tight")
+        plt.close(fig)
+
+    print("[plot_bed_histograms] Done.")
+
+
+# bed_basename = os.path.basename(bed_file).split('.bed')[0]
+# # Load the BED file into a DataFrame
+# print(f"Loading BED to plot read length and coverage histograms: {bed_file}")
+# df = pd.read_csv(bed_file, sep='\t', header=None, names=['chromosome', 'start', 'end', 'length', 'read_name', 'mapq', 'read_quality'])
+
+# # Group by chromosome
+# grouped = df.groupby('chromosome')
+
+# # for each chromosome, get the record length of that chromosome from the fasta. Use from 0 to this length for the positional coverage plot.
+
+# # Change below and make a plot grid instead. For each, make row for chromsome, col for read length and coverage
+# # Clip the outliers to make plots cleaner
+
+# for chrom, group in grouped:
+#     # Plot read length histogram
+#     plt.figure(figsize=(12, 6))
+#     plt.hist(group['length'], bins=50, edgecolor='k', alpha=0.7)
+#     plt.title(f'Read Length Histogram of reads aligned to {chrom}')
+#     plt.xlabel('Read Length')
+#     plt.ylabel('Count')
+#     plt.grid(True)
+#     save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_read_length_histogram.png')
+#     plt.savefig(save_name)
+#     plt.close()
+
+#     # Compute coverage
+#     coverage = np.zeros(group['end'].max())
+#     for _, row in group.iterrows():
+#         coverage[row['start']:row['end']] += 1
+
+#     # Plot coverage histogram
+#     plt.figure(figsize=(12, 6))
+#     plt.plot(coverage, color='b')
+#     plt.title(f'Coverage Histogram for {chrom}')
+#     plt.xlabel('Position')
+#     plt.ylabel('Coverage')
+#     plt.grid(True)
+#     save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_coverage_histogram.png')
+#     plt.savefig(save_name)
+#     plt.close()
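With the full definition above in scope, a call might look like the following (inputs illustrative); each page PNG holds up to `rows_per_fig` chromosomes with read-length, coverage, and optional MAPQ/quality panels:

    plot_bed_histograms(
        "qc/sample_alignments.bed",  # chrom, start, end, read_len, qname, mapq, avg_q
        "qc/plots",
        "refs/genome.fa",
        cov_bin_size=500,            # finer coverage bins than the 1000 bp default
        rows_per_fig=6,
    )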
@@ -1,6 +1,5 @@
 ## separate_bam_by_bc
 
-# General
 def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
     """
     Separates an input BAM file on the BC SAM tag values.
@@ -16,24 +15,26 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
     Writes out split BAM files.
     """
     import pysam
+    from pathlib import Path
     import os
 
-    bam_base = os.path.basename(input_bam)
-    bam_base_minus_suffix = bam_base.split(bam_suffix)[0]
+    bam_base = input_bam.name
+    bam_base_minus_suffix = input_bam.stem
 
     # Open the input BAM file for reading
-    with pysam.AlignmentFile(input_bam, "rb") as bam:
+    with pysam.AlignmentFile(str(input_bam), "rb") as bam:
         # Create a dictionary to store output BAM files
         output_files = {}
         # Iterate over each read in the BAM file
         for read in bam:
             try:
                 # Get the barcode tag value
-                bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
+                bc_tag = read.get_tag("BC", with_value_type=True)[0]
+                #bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
                 # Open the output BAM file corresponding to the barcode
                 if bc_tag not in output_files:
-                    output_path = os.path.join(split_dir, f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}")
-                    output_files[bc_tag] = pysam.AlignmentFile(output_path, "wb", header=bam.header)
+                    output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
+                    output_files[bc_tag] = pysam.AlignmentFile(str(output_path), "wb", header=bam.header)
                 # Write the read to the corresponding output BAM file
                 output_files[bc_tag].write(read)
             except KeyError:
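Note: `read.get_tag("BC", with_value_type=True)` returns a `(value, type_code)` tuple, and the revision now keeps the full tag value (e.g. `barcode01`) instead of splitting on the `barcode` prefix. A minimal sketch (BAM path hypothetical; reads lacking a BC tag raise KeyError, as handled above):

    import pysam

    with pysam.AlignmentFile("demux/sample.bam", "rb") as bam:
        read = next(iter(bam))
        value, type_code = read.get_tag("BC", with_value_type=True)
        print(value, type_code)  # e.g. "barcode01" "Z"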
@@ -1,36 +1,32 @@
 ## split_and_index_BAM
 
-def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory):
+def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
     """
     A wrapper function for splitting BAMS and indexing them.
     Parameters:
         aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
         split_dir (str): A string representing the file path to the directory to split the BAMs into.
         bam_suffix (str): A suffix to add to the bam file.
-        output_directory (str): A file path to the directory to output all the analyses.
 
     Returns:
         None
     Splits an input BAM file on barcode value and makes a BAM index file.
     """
-    from .. import readwrite
+    from ...readwrite import date_string, make_dirs
+    from pathlib import Path
     import os
-    import subprocess
+    import pysam
     import glob
     from .separate_bam_by_bc import separate_bam_by_bc
-    from .make_dirs import make_dirs
 
-    plotting_dir = os.path.join(output_directory, 'demultiplexed_bed_histograms')
-    bed_dir = os.path.join(output_directory, 'demultiplexed_read_alignment_coordinates')
-    make_dirs([plotting_dir, bed_dir])
     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
-    file_prefix = readwrite.date_string()
+    file_prefix = date_string()
     separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
     # Make a BAM index file for the BAMs in that directory
     bam_pattern = '*' + bam_suffix
-    bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
-    bam_files = [bam for bam in bam_files if '.bai' not in bam]
+    bam_files = glob.glob(split_dir / bam_pattern)
+    bam_files = [str(bam) for bam in bam_files if '.bai' not in str(bam)]
     for input_file in bam_files:
-        subprocess.run(["samtools", "index", input_file])
+        pysam.index(input_file)
 
     return bam_files
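Note: `pysam.index` wraps `samtools index` in-process, which is what replaces the `subprocess` call here. An equivalent one-liner (path illustrative):

    import pysam

    pysam.index("demux/run_barcode01.bam")  # writes demux/run_barcode01.bam.bai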