smftools 0.2.3__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +54 -0
  5. smftools/cli/hmm_adata.py +937 -256
  6. smftools/cli/load_adata.py +448 -268
  7. smftools/cli/preprocess_adata.py +469 -263
  8. smftools/cli/spatial_adata.py +536 -319
  9. smftools/cli_entry.py +97 -182
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +17 -6
  12. smftools/config/deaminase.yaml +12 -10
  13. smftools/config/default.yaml +142 -33
  14. smftools/config/direct.yaml +11 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +594 -264
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2128 -1418
  21. smftools/hmm/__init__.py +2 -9
  22. smftools/hmm/archived/call_hmm_peaks.py +121 -0
  23. smftools/hmm/call_hmm_peaks.py +299 -91
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +397 -175
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +196 -30
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +422 -197
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +147 -87
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +10 -12
  84. smftools/preprocessing/append_base_context.py +115 -80
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +77 -39
  86. smftools/preprocessing/{calculate_complexity.py → archived/calculate_complexity.py} +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +129 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +50 -25
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +118 -54
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +71 -38
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +689 -272
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +103 -0
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +331 -82
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/METADATA +17 -39
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.3.dist-info/RECORD +0 -173
  128. /smftools/cli/{cli_flows.py → archived/cli_flows.py} +0 -0
  129. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  130. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  131. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  132. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archived/add_read_length_and_mapping_qc.py} +0 -0
  133. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  134. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  135. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  136. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  137. {smftools-0.2.3.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/basecalling.py +51 -9
@@ -1,7 +1,17 @@
 import subprocess
-from pathlib import Path
 
-def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+
+def canoncall(
+    model_dir,
+    model,
+    pod5_dir,
+    barcode_kit,
+    bam,
+    bam_suffix,
+    barcode_both_ends=True,
+    trim=False,
+    device="auto",
+):
     """
     Wrapper function for dorado canonical base calling.
 
@@ -15,13 +25,24 @@ def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_
     barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
     trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
     device (str): The device to use. 'auto' is default, which can detect device to use. Can also specify metal, cpu, cuda.
-
+
     Returns:
     None
     Outputs a BAM file holding the canonical base calls output by the dorado basecaller.
     """
     output = bam + bam_suffix
-    command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--device", device, "--batchsize", "0"]
+    command = [
+        "dorado",
+        "basecaller",
+        "--models-directory",
+        model_dir,
+        "--kit-name",
+        barcode_kit,
+        "--device",
+        device,
+        "--batchsize",
+        "0",
+    ]
     if barcode_both_ends:
         command.append("--barcode-both-ends")
     if not trim:
@@ -32,7 +53,19 @@ def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_
     with open(output, "w") as outfile:
         subprocess.run(command, stdout=outfile)
 
-def modcall(model_dir, model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
+
+def modcall(
+    model_dir,
+    model,
+    pod5_dir,
+    barcode_kit,
+    mod_list,
+    bam,
+    bam_suffix,
+    barcode_both_ends=True,
+    trim=False,
+    device="auto",
+):
     """
     Wrapper function for dorado modified base calling.
 
@@ -47,14 +80,23 @@ def modcall(model_dir, model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix,
     barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
     trim (bool): Whether to trim barcodes, adapters, and primers from read ends
     device (str): Device to use for basecalling. auto, metal, cpu, cuda.
-
+
     Returns:
     None
     Outputs a BAM file holding the modified base calls output by the dorado basecaller.
     """
     import subprocess
+
     output = bam + bam_suffix
-    command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--modified-bases"]
+    command = [
+        "dorado",
+        "basecaller",
+        "--models-directory",
+        model_dir,
+        "--kit-name",
+        barcode_kit,
+        "--modified-bases",
+    ]
     command += mod_list
     command += ["--device", device, "--batchsize", "0"]
     if barcode_both_ends:
@@ -62,6 +104,6 @@ def modcall(model_dir, model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix,
     if not trim:
         command.append("--no-trim")
     command += [model, pod5_dir]
-    print(f'Running: {" ".join(command)}')
+    print(f"Running: {' '.join(command)}")
     with open(output, "w") as outfile:
-        subprocess.run(command, stdout=outfile)
\ No newline at end of file
+        subprocess.run(command, stdout=outfile)
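The changes above are formatting and flag layout only: each wrapper still builds a dorado argument list, appends the optional barcode and trim flags, and streams dorado's stdout into bam + bam_suffix. A minimal usage sketch of the refactored canoncall (the model name, kit, and paths below are hypothetical, not taken from the diff):

    from smftools.informatics.basecalling import canoncall

    # Hypothetical inputs; canoncall writes dorado's stdout to
    # bam + bam_suffix, i.e. "basecalls.bam" here.
    canoncall(
        model_dir="models/",
        model="dna_r10.4.1_e8.2_400bps_hac@v5.0.0",  # assumed dorado model name
        pod5_dir="pod5/",
        barcode_kit="SQK-NBD114-24",  # assumed barcoding kit
        bam="basecalls",
        bam_suffix=".bam",
        barcode_both_ends=True,  # appends --barcode-both-ends
        trim=False,  # appends --no-trim
        device="auto",
    )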
smftools/informatics/bed_functions.py +90 -57
@@ -1,20 +1,22 @@
-from pathlib import Path
+import concurrent.futures
 import os
-import subprocess
-from typing import List, Optional, Union
-import pysam
-import pybedtools
-import pyBigWig
+from concurrent.futures import ProcessPoolExecutor
+from pathlib import Path
 
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-import concurrent.futures
-from concurrent.futures import ProcessPoolExecutor
+import pybedtools
+import pyBigWig
+import pysam
 
-import matplotlib.pyplot as plt
+from smftools.logging_utils import get_logger
 
 from ..readwrite import make_dirs
 
+logger = get_logger(__name__)
+
+
 def _bed_to_bigwig(fasta: str, bed: str) -> str:
     """
     BED → bedGraph → bigWig
@@ -33,14 +35,14 @@ def _bed_to_bigwig(fasta: str, bed: str) -> str:
     bigwig = parent / f"{stem}.bw"
 
     # 1) Compute coverage → bedGraph
-    print(f"[pybedtools] generating coverage bedgraph from {bed}")
+    logger.debug(f"[pybedtools] generating coverage bedgraph from {bed}")
     bt = pybedtools.BedTool(str(bed))
     # bedtools genomecov -bg
     coverage = bt.genome_coverage(bg=True, genome=str(fai))
     coverage.saveas(str(bedgraph))
 
     # 2) Convert bedGraph → BigWig via pyBigWig
-    print(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
+    logger.debug(f"[pyBigWig] converting bedgraph → bigwig: {bigwig}")
 
     # read chrom sizes from the FASTA .fai index
     chrom_sizes = {}
@@ -61,9 +63,10 @@ def _bed_to_bigwig(fasta: str, bed: str) -> str:
 
     bw.close()
 
-    print(f"BigWig written: {bigwig}")
+    logger.debug(f"BigWig written: {bigwig}")
     return str(bigwig)
 
+
 def _plot_bed_histograms(
     bed_file,
     plotting_directory,
@@ -71,9 +74,9 @@
     *,
     bins=60,
     clip_quantiles=(0.0, 0.995),
-    cov_bin_size=1000,         # coverage bin size in bp
-    rows_per_fig=6,            # paginate if many chromosomes
-    include_mapq_quality=True, # add MAPQ + avg read quality columns to grid
+    cov_bin_size=1000,  # coverage bin size in bp
+    rows_per_fig=6,  # paginate if many chromosomes
+    include_mapq_quality=True,  # add MAPQ + avg read quality columns to grid
     coordinate_mode="one_based", # "one_based" (your BED-like) or "zero_based"
 ):
     """
@@ -113,19 +116,30 @@ def _plot_bed_histograms(
     os.makedirs(plotting_directory, exist_ok=True)
 
     bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
-    print(f"[plot_bed_histograms] Loading: {bed_file}")
+    logger.debug(f"[plot_bed_histograms] Loading: {bed_file}")
 
     # Load BED-like table
-    cols = ['chrom', 'start', 'end', 'read_len', 'qname', 'mapq', 'avg_q']
-    df = pd.read_csv(bed_file, sep="\t", header=None, names=cols, dtype={
-        'chrom': str, 'start': int, 'end': int, 'read_len': int, 'qname': str,
-        'mapq': float, 'avg_q': float
-    })
+    cols = ["chrom", "start", "end", "read_len", "qname", "mapq", "avg_q"]
+    df = pd.read_csv(
+        bed_file,
+        sep="\t",
+        header=None,
+        names=cols,
+        dtype={
+            "chrom": str,
+            "start": int,
+            "end": int,
+            "read_len": int,
+            "qname": str,
+            "mapq": float,
+            "avg_q": float,
+        },
+    )
 
     # Drop unaligned records (chrom == '*') if present
-    df = df[df['chrom'] != '*'].copy()
+    df = df[df["chrom"] != "*"].copy()
     if df.empty:
-        print("[plot_bed_histograms] No aligned reads found; nothing to plot.")
+        logger.debug("[plot_bed_histograms] No aligned reads found; nothing to plot.")
         return
 
     # Ensure coordinate mode consistent; convert to 0-based half-open for bin math internally
@@ -135,15 +149,16 @@ def _plot_bed_histograms(
 
     if coordinate_mode == "one_based":
         # convert to 0-based half-open [start0, end0)
-        start0 = df['start'].to_numpy() - 1
-        end0 = df['end'].to_numpy()  # inclusive in input -> +1 already handled by not subtracting
+        start0 = df["start"].to_numpy() - 1
+        end0 = df["end"].to_numpy()  # inclusive in input -> +1 already handled by not subtracting
     else:
         # already 0-based half-open (assumption)
-        start0 = df['start'].to_numpy()
-        end0 = df['end'].to_numpy()
+        start0 = df["start"].to_numpy()
+        end0 = df["end"].to_numpy()
 
     # Clip helper for hist tails
     def _clip_series(s, q=(0.0, 0.995)):
+        """Clip a Series to quantile bounds for plotting."""
         if q is None:
             return s.to_numpy()
         lo = s.quantile(q[0]) if q[0] is not None else s.min()
@@ -157,42 +172,42 @@ def _plot_bed_histograms(
     ref_lengths = dict(zip(ref_names, fa.lengths))
 
     # Keep only chroms present in FASTA and with at least one read
-    chroms = [c for c in df['chrom'].unique() if c in ref_lengths]
+    chroms = [c for c in df["chrom"].unique() if c in ref_lengths]
     # Order chromosomes by FASTA order
     chrom_order = [c for c in ref_names if c in chroms]
 
     if not chrom_order:
-        print("[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting.")
+        logger.debug(
+            "[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting."
+        )
         return
 
     # Pagination
     def _sanitize(name: str) -> str:
+        """Sanitize a string for use in filenames."""
        return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)
 
     cols_per_fig = 4 if include_mapq_quality else 2
 
     for start_idx in range(0, len(chrom_order), rows_per_fig):
-        chunk = chrom_order[start_idx:start_idx + rows_per_fig]
+        chunk = chrom_order[start_idx : start_idx + rows_per_fig]
         nrows = len(chunk)
         ncols = cols_per_fig
 
         fig, axes = plt.subplots(
-            nrows=nrows, ncols=ncols,
-            figsize=(4.0 * ncols, 2.6 * nrows),
-            dpi=160,
-            squeeze=False
+            nrows=nrows, ncols=ncols, figsize=(4.0 * ncols, 2.6 * nrows), dpi=160, squeeze=False
         )
 
         for r, chrom in enumerate(chunk):
            chrom_len = ref_lengths[chrom]
-            mask = (df['chrom'].to_numpy() == chrom)
+            mask = df["chrom"].to_numpy() == chrom
 
             # Slice per-chrom arrays for speed
             s0 = start0[mask]
             e0 = end0[mask]
-            len_arr = df.loc[mask, 'read_len']
-            mapq_arr = df.loc[mask, 'mapq']
-            q_arr = df.loc[mask, 'avg_q']
+            len_arr = df.loc[mask, "read_len"]
+            mapq_arr = df.loc[mask, "mapq"]
+            q_arr = df.loc[mask, "avg_q"]
 
             # --- Col 1: Read length histogram (clipped) ---
             ax = axes[r, 0]
@@ -222,7 +237,7 @@ def _plot_bed_histograms(
 
             # Increment all bins in range; loop but at bin resolution (fast for reasonable cov_bin_size).
             for lo, hi in zip(b_lo, b_hi):
-                cov[lo:hi + 1] += 1
+                cov[lo : hi + 1] += 1
 
             x_mid = (edges[:-1] + edges[1:]) / 2.0
             ax.plot(x_mid, cov)
@@ -237,7 +252,12 @@
             # --- Col 3: MAPQ ---
             ax = axes[r, 2]
             # Clip MAPQ upper tail if needed (usually 60)
-            ax.hist(_clip_series(mapq_arr.fillna(0), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+            ax.hist(
+                _clip_series(mapq_arr.fillna(0), clip_quantiles),
+                bins=bins,
+                edgecolor="black",
+                alpha=0.7,
+            )
             if r == 0:
                 ax.set_title("MAPQ")
                 ax.set_xlabel("MAPQ")
@@ -245,7 +265,12 @@
 
             # --- Col 4: Avg base quality ---
             ax = axes[r, 3]
-            ax.hist(_clip_series(q_arr.fillna(np.nan), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
+            ax.hist(
+                _clip_series(q_arr.fillna(np.nan), clip_quantiles),
+                bins=bins,
+                edgecolor="black",
+                alpha=0.7,
+            )
             if r == 0:
                 ax.set_title("Avg base qual")
                 ax.set_xlabel("Phred")
@@ -254,7 +279,8 @@
         fig.suptitle(
             f"{bed_basename} — per-chromosome QC "
             f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
-            y=0.995, fontsize=11
+            y=0.995,
+            fontsize=11,
         )
         fig.tight_layout(rect=[0, 0, 1, 0.98])
 
@@ -263,7 +289,8 @@
         plt.savefig(out_png, bbox_inches="tight")
         plt.close(fig)
 
-    print("[plot_bed_histograms] Done.")
+    logger.debug("[plot_bed_histograms] Done.")
+
 
 def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
     """
@@ -287,9 +314,9 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
     bed_dir = out_dir / "beds"
     make_dirs([plotting_dir, bed_dir])
 
-    bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
+    bed_output = bed_dir / str(aligned_BAM.name).replace(".bam", "_bed.bed")
 
-    print(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
+    logger.debug(f"Creating BED-like file from BAM (with MAPQ and avg base quality): {aligned_BAM}")
 
     with pysam.AlignmentFile(aligned_BAM, "rb") as bam, open(bed_output, "w") as out:
         for read in bam.fetch(until_eof=True):
@@ -317,20 +344,24 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
 
             out.write(f"{chrom}\t{start1}\t{end1}\t{rl}\t{qname}\t{mapq}\t{avg_q:.3f}\n")
 
-    print(f"BED-like file created: {bed_output}")
+    logger.debug(f"BED-like file created: {bed_output}")
 
     def split_bed(bed):
         """Splits into aligned and unaligned reads (chrom == '*')."""
         bed = str(bed)
         aligned = bed.replace(".bed", "_aligned.bed")
         unaligned = bed.replace(".bed", "_unaligned.bed")
-        with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
+        with (
+            open(bed, "r") as infile,
+            open(aligned, "w") as aligned_out,
+            open(unaligned, "w") as unaligned_out,
+        ):
            for line in infile:
                (unaligned_out if line.startswith("*\t") else aligned_out).write(line)
         os.remove(bed)
         return aligned
 
-    print(f"Splitting: {bed_output}")
+    logger.debug(f"Splitting: {bed_output}")
     aligned_bed = split_bed(bed_output)
 
     with ProcessPoolExecutor() as executor:
@@ -340,7 +371,8 @@ def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
             futures.append(executor.submit(_bed_to_bigwig, fasta, aligned_bed))
         concurrent.futures.wait(futures)
 
-    print("Processing completed successfully.")
+    logger.debug("Processing completed successfully.")
+
 
 def extract_read_lengths_from_bed(file_path):
     """
@@ -352,15 +384,16 @@ def extract_read_lengths_from_bed(file_path):
         read_dict (dict)
     """
     import pandas as pd
-    columns = ['chrom', 'start', 'end', 'length', 'name']
-    df = pd.read_csv(file_path, sep='\t', header=None, names=columns, comment='#')
+
+    columns = ["chrom", "start", "end", "length", "name"]
+    df = pd.read_csv(file_path, sep="\t", header=None, names=columns, comment="#")
     read_dict = {}
     for _, row in df.iterrows():
-        chrom = row['chrom']
-        start = row['start']
-        end = row['end']
-        name = row['name']
-        length = row['length']
+        chrom = row["chrom"]
+        start = row["start"]
+        end = row["end"]
+        name = row["name"]
+        length = row["length"]
         read_dict[name] = length
 
-    return read_dict
\ No newline at end of file
+    return read_dict
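Beyond Black-style reformatting, the substantive change in bed_functions.py is the move from print to a module logger created with get_logger(__name__). Assuming get_logger returns a logger in the standard library logging hierarchy (the diff does not show its implementation), the new debug messages can be surfaced when driving the top-level entry point. The paths below are hypothetical, and Path objects are used because aligned_BAM_to_bed applies the / operator and .name to its arguments:

    import logging
    from pathlib import Path

    from smftools.informatics.bed_functions import aligned_BAM_to_bed

    # Assumption: get_logger(__name__) hangs off the stdlib logging hierarchy,
    # so configuring the root logger reveals the logger.debug(...) calls above.
    logging.basicConfig(level=logging.DEBUG, format="%(name)s: %(message)s")

    # Hypothetical inputs: a coordinate-sorted BAM and a FASTA with a .fai
    # index (required for the bedGraph -> bigWig conversion step).
    aligned_BAM_to_bed(
        aligned_BAM=Path("sample_aligned.bam"),
        out_dir=Path("qc_out"),
        fasta="reference.fa",
        make_bigwigs=True,
    )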
smftools/informatics/binarize_converted_base_identities.py +18 -7
@@ -1,4 +1,13 @@
-def binarize_converted_base_identities(base_identities, strand, modification_type, bam, device='cpu', deaminase_footprinting=False, mismatch_trend_per_read={}, on_missing="nan"):
+def binarize_converted_base_identities(
+    base_identities,
+    strand,
+    modification_type,
+    bam,
+    device="cpu",
+    deaminase_footprinting=False,
+    mismatch_trend_per_read={},
+    on_missing="nan",
+):
     """
     Efficiently binarizes conversion SMF data within a sequence string using NumPy arrays.
 
@@ -10,7 +19,7 @@ def binarize_converted_base_identities(base_identities, strand, modification_typ
     deaminase_footprinting (bool): Whether direct deaminase footprinting chemistry was used.
     mismatch_trend_per_read (dict): For deaminase footprinting, indicates the type of conversion relative to the top strand reference for each read. (C->T or G->A if bottom strand was converted)
     on_missing (str): Error handling if a read is missing
-
+
     Returns:
     dict: A dictionary where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site without methylation info.
     If deaminase_footprinting, 1 represents deaminated sites, while 0 represents non-deaminated sites.
@@ -64,14 +73,16 @@ def binarize_converted_base_identities(base_identities, strand, modification_typ
 
     # Non-deaminase mapping (bisulfite-style for 5mC; 6mA mapping is protocol dependent)
     bin_maps = {
-        ("top", "5mC"):    {"C": 1.0, "T": 0.0},
+        ("top", "5mC"): {"C": 1.0, "T": 0.0},
         ("bottom", "5mC"): {"G": 1.0, "A": 0.0},
-        ("top", "6mA"):    {"A": 1.0, "G": 0.0},
+        ("top", "6mA"): {"A": 1.0, "G": 0.0},
         ("bottom", "6mA"): {"T": 1.0, "C": 0.0},
     }
     key = (strand, modification_type)
     if key not in bin_maps:
-        raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
+        raise ValueError(
+            f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'"
+        )
 
     base_map = bin_maps[key]
 
@@ -110,7 +121,7 @@ def binarize_converted_base_identities(base_identities, strand, modification_typ
     #         binarized_base_identities[key] = binarized
 
     #     return binarized_base_identities
-
+
     # else:
     #     binarization_maps = {
     #         ('top', '5mC'): {'C': 1, 'T': 0},
@@ -152,7 +163,7 @@ def binarize_converted_base_identities(base_identities, strand, modification_typ
 
     #     # Fetch the appropriate mapping
     #     base_map = binarization_maps[(strand, modification_type)]
-
+
     #     # Convert mapping to tensor
     #     base_keys = list(base_map.keys())
     #     base_values = torch.tensor(list(base_map.values()), dtype=torch.float32, device=device)
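For orientation, the bin_maps table above encodes the whole conversion logic: on the given strand, the retained base reads as modified (1.0), the converted base as unmodified (0.0), and every other base carries no information (NaN). A standalone sketch of that lookup, mirroring the diff but not the library function itself:

    import numpy as np

    # (strand, modification_type) -> base map, reproduced from the diff above.
    bin_maps = {
        ("top", "5mC"): {"C": 1.0, "T": 0.0},
        ("bottom", "5mC"): {"G": 1.0, "A": 0.0},
        ("top", "6mA"): {"A": 1.0, "G": 0.0},
        ("bottom", "6mA"): {"T": 1.0, "C": 0.0},
    }

    def binarize_bases(bases, strand="top", modification_type="5mC"):
        """Toy version: 1.0 = retained (modified), 0.0 = converted, NaN = uninformative."""
        key = (strand, modification_type)
        if key not in bin_maps:
            raise ValueError(
                f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'"
            )
        base_map = bin_maps[key]
        return np.array([base_map.get(b, np.nan) for b in bases])

    print(binarize_bases(list("CTCAG")))  # [ 1.  0.  1. nan nan]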
smftools/informatics/complement_base_list.py +7 -6
@@ -1,5 +1,6 @@
 # complement_base_list
 
+
 def complement_base_list(sequence):
     """
     Takes a list of DNA base identities and returns their complement.
@@ -11,11 +12,11 @@ def complement_base_list(sequence):
        complement (list): A list of complementary DNA bases.
     """
     complement_mapping = {
-        'A': 'T',
-        'T': 'A',
-        'C': 'G',
-        'G': 'C',
-        'N': 'N' # Handling ambiguous bases like 'N'
+        "A": "T",
+        "T": "A",
+        "C": "G",
+        "G": "C",
+        "N": "N",  # Handling ambiguous bases like 'N'
     }
 
-    return [complement_mapping[base] for base in sequence]
\ No newline at end of file
+    return [complement_mapping[base] for base in sequence]
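The complement_base_list change is cosmetic (quote style, spacing, and a trailing newline), so usage is unchanged; a quick example, with output following directly from the mapping above:

    from smftools.informatics.complement_base_list import complement_base_list

    print(complement_base_list(["A", "C", "G", "T", "N"]))
    # ['T', 'G', 'C', 'A', 'N']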