smftools 0.1.7__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. smftools/__init__.py +9 -4
  2. smftools/_version.py +1 -1
  3. smftools/cli.py +184 -0
  4. smftools/config/__init__.py +1 -0
  5. smftools/config/conversion.yaml +33 -0
  6. smftools/config/deaminase.yaml +56 -0
  7. smftools/config/default.yaml +253 -0
  8. smftools/config/direct.yaml +17 -0
  9. smftools/config/experiment_config.py +1191 -0
  10. smftools/hmm/HMM.py +1576 -0
  11. smftools/hmm/__init__.py +20 -0
  12. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  13. smftools/hmm/call_hmm_peaks.py +106 -0
  14. smftools/{tools → hmm}/display_hmm.py +3 -3
  15. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  16. smftools/{tools → hmm}/train_hmm.py +1 -1
  17. smftools/informatics/__init__.py +0 -2
  18. smftools/informatics/archived/deaminase_smf.py +132 -0
  19. smftools/informatics/fast5_to_pod5.py +4 -1
  20. smftools/informatics/helpers/__init__.py +3 -4
  21. smftools/informatics/helpers/align_and_sort_BAM.py +34 -7
  22. smftools/informatics/helpers/aligned_BAM_to_bed.py +35 -24
  23. smftools/informatics/helpers/binarize_converted_base_identities.py +116 -23
  24. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +365 -42
  25. smftools/informatics/helpers/converted_BAM_to_adata_II.py +165 -29
  26. smftools/informatics/helpers/discover_input_files.py +100 -0
  27. smftools/informatics/helpers/extract_base_identities.py +29 -3
  28. smftools/informatics/helpers/extract_read_features_from_bam.py +4 -2
  29. smftools/informatics/helpers/find_conversion_sites.py +5 -4
  30. smftools/informatics/helpers/modkit_extract_to_adata.py +6 -3
  31. smftools/informatics/helpers/plot_bed_histograms.py +269 -0
  32. smftools/informatics/helpers/separate_bam_by_bc.py +2 -2
  33. smftools/informatics/helpers/split_and_index_BAM.py +1 -5
  34. smftools/load_adata.py +1346 -0
  35. smftools/machine_learning/__init__.py +12 -0
  36. smftools/machine_learning/data/__init__.py +2 -0
  37. smftools/machine_learning/data/anndata_data_module.py +234 -0
  38. smftools/machine_learning/evaluation/__init__.py +2 -0
  39. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  40. smftools/machine_learning/evaluation/evaluators.py +223 -0
  41. smftools/machine_learning/inference/__init__.py +3 -0
  42. smftools/machine_learning/inference/inference_utils.py +27 -0
  43. smftools/machine_learning/inference/lightning_inference.py +68 -0
  44. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  45. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  46. smftools/machine_learning/models/base.py +295 -0
  47. smftools/machine_learning/models/cnn.py +138 -0
  48. smftools/machine_learning/models/lightning_base.py +345 -0
  49. smftools/machine_learning/models/mlp.py +26 -0
  50. smftools/{tools → machine_learning}/models/positional.py +3 -2
  51. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  52. smftools/machine_learning/models/sklearn_models.py +273 -0
  53. smftools/machine_learning/models/transformer.py +303 -0
  54. smftools/machine_learning/training/__init__.py +2 -0
  55. smftools/machine_learning/training/train_lightning_model.py +135 -0
  56. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  57. smftools/plotting/__init__.py +4 -1
  58. smftools/plotting/autocorrelation_plotting.py +611 -0
  59. smftools/plotting/general_plotting.py +566 -89
  60. smftools/plotting/hmm_plotting.py +260 -0
  61. smftools/plotting/qc_plotting.py +270 -0
  62. smftools/preprocessing/__init__.py +13 -8
  63. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  64. smftools/preprocessing/append_base_context.py +122 -0
  65. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  66. smftools/preprocessing/calculate_complexity_II.py +248 -0
  67. smftools/preprocessing/calculate_coverage.py +10 -1
  68. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  69. smftools/preprocessing/clean_NaN.py +17 -1
  70. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  71. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  72. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  73. smftools/preprocessing/invert_adata.py +12 -5
  74. smftools/preprocessing/load_sample_sheet.py +19 -4
  75. smftools/readwrite.py +849 -43
  76. smftools/tools/__init__.py +3 -32
  77. smftools/tools/calculate_umap.py +5 -5
  78. smftools/tools/general_tools.py +3 -3
  79. smftools/tools/position_stats.py +468 -106
  80. smftools/tools/read_stats.py +115 -1
  81. smftools/tools/spatial_autocorrelation.py +562 -0
  82. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/METADATA +5 -1
  83. smftools-0.2.1.dist-info/RECORD +161 -0
  84. smftools-0.2.1.dist-info/entry_points.txt +2 -0
  85. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  86. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  87. smftools/informatics/load_adata.py +0 -182
  88. smftools/preprocessing/append_C_context.py +0 -82
  89. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  90. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  91. smftools/preprocessing/filter_reads_on_length.py +0 -51
  92. smftools/tools/call_hmm_peaks.py +0 -105
  93. smftools/tools/data/__init__.py +0 -2
  94. smftools/tools/data/anndata_data_module.py +0 -90
  95. smftools/tools/evaluation/__init__.py +0 -0
  96. smftools/tools/inference/__init__.py +0 -1
  97. smftools/tools/inference/lightning_inference.py +0 -41
  98. smftools/tools/models/base.py +0 -14
  99. smftools/tools/models/cnn.py +0 -34
  100. smftools/tools/models/lightning_base.py +0 -41
  101. smftools/tools/models/mlp.py +0 -17
  102. smftools/tools/models/sklearn_models.py +0 -40
  103. smftools/tools/models/transformer.py +0 -133
  104. smftools/tools/training/__init__.py +0 -1
  105. smftools/tools/training/train_lightning_model.py +0 -47
  106. smftools-0.1.7.dist-info/RECORD +0 -136
  107. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  108. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  109. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  110. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  111. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  112. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  113. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  114. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  115. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  116. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  117. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  118. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  119. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
  120. {smftools-0.1.7.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,269 @@
1
+ # plot_bed_histograms
2
+
3
def plot_bed_histograms(
    bed_file,
    plotting_directory,
    fasta,
    *,
    bins=60,
    clip_quantiles=(0.0, 0.995),
    cov_bin_size=1000,  # coverage bin size in bp
    rows_per_fig=6,  # paginate if many chromosomes
    include_mapq_quality=True,  # add MAPQ + avg read quality columns to grid
    coordinate_mode="one_based",  # "one_based" (inclusive) or "zero_based" (half-open)
):
    """
    Plot per-chromosome QC grids from a BED-like file.

    The input is read as a headerless, tab-separated table with columns:
        chrom, start, end, read_len, qname, mapq, avg_base_qual

    For each chromosome that appears in both the table and the FASTA:
      - Column 1: Read length histogram
      - Column 2: Read-count coverage across the chromosome (binned)
      - (optional) Column 3: MAPQ histogram
      - (optional) Column 4: Avg base quality histogram

    The figure is paginated: rows = chromosomes (up to rows_per_fig), columns
    depend on include_mapq_quality. One PNG per page is written to
    `plotting_directory`, named `<bed basename>_qc_page<N>.png`.

    Parameters
    ----------
    bed_file : str
        Path to the BED-like file to derive metrics from.
    plotting_directory : str
        Directory to write the histograms to (created if missing).
    fasta : str
        Reference FASTA (used to get chromosome order and lengths).
    bins : int
        Histogram bins for read length / MAPQ / quality.
    clip_quantiles : (float, float) or None
        Clip hist tails for readability (e.g., (0, 0.995)). None disables
        clipping; a None element falls back to the series min / max.
    cov_bin_size : int
        Bin size (bp) for coverage plot; bigger = faster/coarser.
    rows_per_fig : int
        Number of chromosomes per page.
    include_mapq_quality : bool
        If True, add MAPQ and avg base quality histograms as extra columns.
    coordinate_mode : {"one_based","zero_based"}
        One-based, inclusive vs BED-standard zero-based, half-open.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If coordinate_mode is not recognised, or cov_bin_size / rows_per_fig
        is smaller than 1.
    """
    import os

    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    import pysam

    # Validate arguments before touching the filesystem so bad calls fail fast.
    if coordinate_mode not in {"one_based", "zero_based"}:
        raise ValueError("coordinate_mode must be 'one_based' or 'zero_based'")
    if cov_bin_size < 1:
        raise ValueError("cov_bin_size must be a positive integer")
    if rows_per_fig < 1:
        raise ValueError("rows_per_fig must be a positive integer")

    os.makedirs(plotting_directory, exist_ok=True)

    bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
    print(f"[plot_bed_histograms] Loading: {bed_file}")

    # Load BED-like table
    cols = ['chrom', 'start', 'end', 'read_len', 'qname', 'mapq', 'avg_q']
    df = pd.read_csv(bed_file, sep="\t", header=None, names=cols, dtype={
        'chrom': str, 'start': int, 'end': int, 'read_len': int, 'qname': str,
        'mapq': float, 'avg_q': float
    })

    # Drop unaligned records (chrom == '*') if present
    df = df[df['chrom'] != '*'].copy()
    if df.empty:
        print("[plot_bed_histograms] No aligned reads found; nothing to plot.")
        return

    # Work internally in 0-based half-open coordinates [start0, end0).
    if coordinate_mode == "one_based":
        start0 = df['start'].to_numpy() - 1
        # An inclusive 1-based end equals the exclusive 0-based end.
        end0 = df['end'].to_numpy()
    else:
        # already 0-based half-open (assumption)
        start0 = df['start'].to_numpy()
        end0 = df['end'].to_numpy()

    def _clip_series(s, q=clip_quantiles):
        """Return the non-missing values of `s`, clipped to the quantiles `q`."""
        # NaNs carry no information for a histogram and an all-NaN column
        # cannot be binned, so drop them up front.
        s = s.dropna()
        if q is None:
            return s.to_numpy()
        lo = s.quantile(q[0]) if q[0] is not None else s.min()
        hi = s.quantile(q[1]) if q[1] is not None else s.max()
        return np.clip(s.to_numpy(dtype=float), lo, hi)

    def _plot_hist(ax, series):
        """Draw a clipped histogram of `series`, or a placeholder if it is empty."""
        values = _clip_series(series)
        if values.size:
            ax.hist(values, bins=bins, edgecolor="black", alpha=0.7)
        else:
            ax.text(0.5, 0.5, "no data", ha="center", va="center", transform=ax.transAxes)

    def _sanitize(name: str) -> str:
        """Replace characters that are awkward in file names with underscores."""
        return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)

    # Load chromosome order/lengths from FASTA
    with pysam.FastaFile(fasta) as fa:
        ref_names = list(fa.references)
        ref_lengths = dict(zip(ref_names, fa.lengths))

    # Keep only chroms present in FASTA with at least one read, in FASTA order.
    chrom_col = df['chrom'].to_numpy()
    chroms_with_reads = set(df['chrom'].unique())
    chrom_order = [c for c in ref_names if c in chroms_with_reads]

    if not chrom_order:
        print("[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting.")
        return

    ncols = 4 if include_mapq_quality else 2

    # Pagination: one figure per chunk of `rows_per_fig` chromosomes.
    for start_idx in range(0, len(chrom_order), rows_per_fig):
        chunk = chrom_order[start_idx:start_idx + rows_per_fig]
        nrows = len(chunk)

        fig, axes = plt.subplots(
            nrows=nrows, ncols=ncols,
            figsize=(4.0 * ncols, 2.6 * nrows),
            dpi=160,
            squeeze=False
        )

        try:
            for r, chrom in enumerate(chunk):
                chrom_len = ref_lengths[chrom]
                mask = (chrom_col == chrom)

                # Slice per-chrom arrays for speed
                s0 = start0[mask]
                e0 = end0[mask]
                len_arr = df.loc[mask, 'read_len']
                mapq_arr = df.loc[mask, 'mapq']
                q_arr = df.loc[mask, 'avg_q']

                # --- Col 1: Read length histogram (clipped) ---
                ax = axes[r, 0]
                _plot_hist(ax, len_arr)
                if r == 0:
                    ax.set_title("Read length")
                ax.set_ylabel(f"{chrom}\n(n={mask.sum()})")
                ax.set_xlabel("bp")
                ax.grid(alpha=0.25)

                # --- Col 2: Coverage (binned over genome) ---
                ax = axes[r, 1]
                nb = max(1, int(np.ceil(chrom_len / cov_bin_size)))
                # Bin edges in 0-based coords
                edges = np.linspace(0, chrom_len, nb + 1, dtype=int)

                # Per-bin "read count coverage": number of reads touching each bin.
                # First/last bin touched by each read; both are clamped into
                # [0, nb - 1] so reads starting before 0 or ending past the
                # reference never produce a negative (wrap-around) index.
                first_bin = np.searchsorted(edges, s0, side="right") - 1
                last_bin = np.searchsorted(edges, np.maximum(e0 - 1, 0), side="right") - 1
                first_bin = np.clip(first_bin, 0, nb - 1)
                last_bin = np.clip(last_bin, 0, nb - 1)
                # ensure valid ordering
                b_lo = np.minimum(first_bin, last_bin)
                b_hi = np.maximum(first_bin, last_bin)

                # Difference array: +1 where a read enters, -1 just past where
                # it leaves; the running sum is the number of reads per bin.
                delta = (
                    np.bincount(b_lo, minlength=nb + 1)
                    - np.bincount(b_hi + 1, minlength=nb + 1)
                )
                cov = np.cumsum(delta)[:nb]

                x_mid = (edges[:-1] + edges[1:]) / 2.0
                ax.plot(x_mid, cov)
                if r == 0:
                    ax.set_title(f"Coverage (~{cov_bin_size} bp bins)")
                ax.set_xlim(0, chrom_len)
                ax.set_xlabel("Position (bp)")
                ax.set_ylabel("")  # already show chrom on col 1
                ax.grid(alpha=0.25)

                if include_mapq_quality:
                    # --- Col 3: MAPQ (missing values are shown as 0) ---
                    ax = axes[r, 2]
                    _plot_hist(ax, mapq_arr.fillna(0))
                    if r == 0:
                        ax.set_title("MAPQ")
                    ax.set_xlabel("MAPQ")
                    ax.grid(alpha=0.25)

                    # --- Col 4: Avg base quality (missing values are skipped) ---
                    ax = axes[r, 3]
                    _plot_hist(ax, q_arr)
                    if r == 0:
                        ax.set_title("Avg base qual")
                    ax.set_xlabel("Phred")
                    ax.grid(alpha=0.25)

            fig.suptitle(
                f"{bed_basename} — per-chromosome QC "
                f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
                y=0.995, fontsize=11
            )
            fig.tight_layout(rect=[0, 0, 1, 0.98])

            page = start_idx // rows_per_fig + 1
            out_png = os.path.join(plotting_directory, f"{_sanitize(bed_basename)}_qc_page{page}.png")
            # Save this figure explicitly rather than relying on pyplot's current figure.
            fig.savefig(out_png, bbox_inches="tight")
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

    print("[plot_bed_histograms] Done.")
228
+
229
+
230
+ # bed_basename = os.path.basename(bed_file).split('.bed')[0]
231
+ # # Load the BED file into a DataFrame
232
+ # print(f"Loading BED to plot read length and coverage histograms: {bed_file}")
233
+ # df = pd.read_csv(bed_file, sep='\t', header=None, names=['chromosome', 'start', 'end', 'length', 'read_name', 'mapq', 'read_quality'])
234
+
235
+ # # Group by chromosome
236
+ # grouped = df.groupby('chromosome')
237
+
238
+ # # for each chromosome, get the record length of that chromosome from the fasta. Use from 0 to this length for the positional coverage plot.
239
+
240
+ # # Change below and make a plot grid instead. For each, make row for chromosome, col for read length and coverage
241
+ # # Clip the outliers to make plots cleaner
242
+
243
+ # for chrom, group in grouped:
244
+ # # Plot read length histogram
245
+ # plt.figure(figsize=(12, 6))
246
+ # plt.hist(group['length'], bins=50, edgecolor='k', alpha=0.7)
247
+ # plt.title(f'Read Length Histogram of reads aligned to {chrom}')
248
+ # plt.xlabel('Read Length')
249
+ # plt.ylabel('Count')
250
+ # plt.grid(True)
251
+ # save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_read_length_histogram.png')
252
+ # plt.savefig(save_name)
253
+ # plt.close()
254
+
255
+ # # Compute coverage
256
+ # coverage = np.zeros(group['end'].max())
257
+ # for _, row in group.iterrows():
258
+ # coverage[row['start']:row['end']] += 1
259
+
260
+ # # Plot coverage histogram
261
+ # plt.figure(figsize=(12, 6))
262
+ # plt.plot(coverage, color='b')
263
+ # plt.title(f'Coverage Histogram for {chrom}')
264
+ # plt.xlabel('Position')
265
+ # plt.ylabel('Coverage')
266
+ # plt.grid(True)
267
+ # save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_coverage_histogram.png')
268
+ # plt.savefig(save_name)
269
+ # plt.close()
@@ -1,6 +1,5 @@
1
1
  ## separate_bam_by_bc
2
2
 
3
- # General
4
3
  def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
5
4
  """
6
5
  Separates an input BAM file on the BC SAM tag values.
@@ -29,7 +28,8 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
29
28
  for read in bam:
30
29
  try:
31
30
  # Get the barcode tag value
32
- bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
31
+ bc_tag = read.get_tag("BC", with_value_type=True)[0]
32
+ #bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
33
33
  # Open the output BAM file corresponding to the barcode
34
34
  if bc_tag not in output_files:
35
35
  output_path = os.path.join(split_dir, f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}")
@@ -1,13 +1,12 @@
1
1
  ## split_and_index_BAM
2
2
 
3
- def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory):
3
+ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
4
4
  """
5
5
  A wrapper function for splitting BAMS and indexing them.
6
6
  Parameters:
7
7
  aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
8
8
  split_dir (str): A string representing the file path to the directory to split the BAMs into.
9
9
  bam_suffix (str): A suffix to add to the bam file.
10
- output_directory (str): A file path to the directory to output all the analyses.
11
10
 
12
11
  Returns:
13
12
  None
@@ -20,9 +19,6 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_direct
20
19
  from .separate_bam_by_bc import separate_bam_by_bc
21
20
  from .make_dirs import make_dirs
22
21
 
23
- plotting_dir = os.path.join(output_directory, 'demultiplexed_bed_histograms')
24
- bed_dir = os.path.join(output_directory, 'demultiplexed_read_alignment_coordinates')
25
- make_dirs([plotting_dir, bed_dir])
26
22
  aligned_sorted_output = aligned_sorted_BAM + bam_suffix
27
23
  file_prefix = readwrite.date_string()
28
24
  separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)