smftools 0.2.1-py3-none-any.whl → 0.2.3-py3-none-any.whl

This diff compares the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registry.
Files changed (96)
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/cli_flows.py +94 -0
  5. smftools/cli/hmm_adata.py +338 -0
  6. smftools/cli/load_adata.py +577 -0
  7. smftools/cli/preprocess_adata.py +363 -0
  8. smftools/cli/spatial_adata.py +564 -0
  9. smftools/cli_entry.py +435 -0
  10. smftools/config/conversion.yaml +11 -6
  11. smftools/config/deaminase.yaml +12 -7
  12. smftools/config/default.yaml +36 -25
  13. smftools/config/direct.yaml +25 -1
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +109 -12
  16. smftools/informatics/__init__.py +13 -7
  17. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  18. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  19. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  20. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  21. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  22. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  23. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  24. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  25. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  26. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  27. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  28. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  30. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  31. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  32. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  34. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  35. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  36. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  37. smftools/informatics/bam_functions.py +812 -0
  38. smftools/informatics/basecalling.py +67 -0
  39. smftools/informatics/bed_functions.py +366 -0
  40. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  41. smftools/informatics/fasta_functions.py +255 -0
  42. smftools/informatics/h5ad_functions.py +197 -0
  43. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  44. smftools/informatics/modkit_functions.py +129 -0
  45. smftools/informatics/ohe.py +160 -0
  46. smftools/informatics/pod5_functions.py +224 -0
  47. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  48. smftools/plotting/autocorrelation_plotting.py +1 -3
  49. smftools/plotting/general_plotting.py +1037 -362
  50. smftools/preprocessing/__init__.py +2 -0
  51. smftools/preprocessing/append_base_context.py +3 -3
  52. smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
  53. smftools/preprocessing/binarize.py +17 -0
  54. smftools/preprocessing/binarize_on_Youden.py +2 -2
  55. smftools/preprocessing/calculate_position_Youden.py +1 -1
  56. smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  57. smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
  58. smftools/preprocessing/flag_duplicate_reads.py +1 -1
  59. smftools/readwrite.py +266 -140
  60. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
  61. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
  62. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  63. smftools/cli.py +0 -184
  64. smftools/informatics/fast5_to_pod5.py +0 -24
  65. smftools/informatics/helpers/__init__.py +0 -73
  66. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  67. smftools/informatics/helpers/bam_qc.py +0 -66
  68. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  69. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  70. smftools/informatics/helpers/discover_input_files.py +0 -100
  71. smftools/informatics/helpers/index_fasta.py +0 -12
  72. smftools/informatics/helpers/make_dirs.py +0 -21
  73. smftools/informatics/readwrite.py +0 -106
  74. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  75. smftools/load_adata.py +0 -1346
  76. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  77. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  78. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  79. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  80. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  81. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  82. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  83. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  84. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  85. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  86. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  87. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  88. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  89. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  90. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  91. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  92. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  93. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  94. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  95. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  96. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py
@@ -23,9 +23,9 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
     import glob
     import zipfile
 
-    os.chdir(mod_tsv_dir)
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    bam_files = glob.glob(os.path.join(split_dir, f"*{bam_suffix}"))
+    bam_files = glob.glob(split_dir / f"*{bam_suffix}")
+    print(f"Running modkit extract for the following bam files: {bam_files}")
 
     if threads:
         threads = str(threads)
@@ -35,20 +35,20 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
     for input_file in bam_files:
         print(input_file)
         # Extract the file basename
-        file_name = os.path.basename(input_file)
+        file_name = input_file.name
         if skip_unclassified and "unclassified" in file_name:
             print("Skipping modkit extract on unclassified reads")
         else:
             # Construct the output TSV file path
-            output_tsv_temp = os.path.join(mod_tsv_dir, file_name)
-            output_tsv = output_tsv_temp.replace(bam_suffix, "") + "_extract.tsv"
-            if os.path.exists(f"{output_tsv}.gz"):
-                print(f"{output_tsv}.gz already exists, skipping modkit extract")
+            output_tsv = mod_tsv_dir / file_name.stem + "_extract.tsv"
+            output_tsv_gz = output_tsv + '.gz'
+            if output_tsv_gz.exists():
+                print(f"{output_tsv_gz} already exists, skipping modkit extract")
             else:
                 print(f"Extracting modification data from {input_file}")
                 if modkit_summary:
                     # Run modkit summary
-                    subprocess.run(["modkit", "summary", input_file])
+                    subprocess.run(["modkit", "summary", str(input_file)])
                 else:
                     pass
                 # Run modkit extract
@@ -61,7 +61,7 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
                         "--mod-thresholds", f"a:{m6A_threshold}",
                         "--mod-thresholds", f"h:{hm5C_threshold}",
                         "-t", threads,
-                        input_file, output_tsv
+                        str(input_file), str(output_tsv)
                     ]
                 else:
                     extract_command = [
@@ -71,13 +71,15 @@ def extract_mods(thresholds, mod_tsv_dir, split_dir, bam_suffix, skip_unclassifi
                         "--mod-thresholds", f"m:{m5C_threshold}",
                         "--mod-thresholds", f"a:{m6A_threshold}",
                         "--mod-thresholds", f"h:{hm5C_threshold}",
-                        input_file, output_tsv
+                        str(input_file), str(output_tsv)
                     ]
                 subprocess.run(extract_command)
                 # Zip the output TSV
                 print(f'zipping {output_tsv}')
                 if threads:
-                    zip_command = ["pigz", "-f", "-p", threads, output_tsv]
+                    zip_command = ["pigz", "-f", "-p", threads, str(output_tsv)]
                 else:
-                    zip_command = ["pigz", "-f", output_tsv]
-                    subprocess.run(zip_command, check=True)
+                    zip_command = ["pigz", "-f", str(output_tsv)]
+                subprocess.run(zip_command, check=True)
+
+    return
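For reference, the pathlib-based output-path handling this hunk moves toward can be exercised on its own. A minimal sketch, using hypothetical directory and file names rather than anything from the package:

from pathlib import Path

mod_tsv_dir = Path("mod_tsvs")                        # hypothetical output directory for extract TSVs
input_file = Path("split_bams/sample_barcode01.bam")  # hypothetical demultiplexed BAM

# One way to derive "<bam stem>_extract.tsv" and its gzipped counterpart with pathlib
output_tsv = mod_tsv_dir / f"{input_file.stem}_extract.tsv"
output_tsv_gz = output_tsv.with_suffix(output_tsv.suffix + ".gz")
print(output_tsv, "already compressed:", output_tsv_gz.exists())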
smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py
@@ -67,6 +67,8 @@ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fa
         None (Writes the converted FASTA file).
     """
     unconverted = modification_types[0]
+    input_fasta = str(input_fasta)
+    output_fasta = str(output_fasta)
 
     # Detect if input is gzipped
     open_func = gzip.open if input_fasta.endswith('.gz') else open
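The gzip-aware open dispatch in the last context line above is a small pattern worth seeing in isolation. A sketch assuming a hypothetical input file name (not part of the package):

import gzip

def open_text(path: str):
    # gzip.open in text mode for .gz inputs, plain open otherwise
    return gzip.open(path, "rt") if path.endswith(".gz") else open(path, "r")

# Hypothetical usage: count FASTA records in a possibly gzipped file
with open_text("example.fa.gz") as handle:
    n_records = sum(1 for line in handle if line.startswith(">"))
print(n_records)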
smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py
@@ -8,25 +8,26 @@ def get_chromosome_lengths(fasta):
         fasta (str): Path to the input fasta
     """
     import os
+    from pathlib import Path
     import subprocess
     from .index_fasta import index_fasta
 
     # Make a fasta index file if one isn't already available
-    index_path = f'{fasta}.fai'
-    if os.path.exists(index_path):
+    index_path = fasta / '.fai'
+    if index_path.exists():
         print(f'Using existing fasta index file: {index_path}')
     else:
         index_fasta(fasta)
 
-    parent_dir = os.path.dirname(fasta)
-    fasta_basename = os.path.basename(fasta)
-    chrom_basename = fasta_basename.split('.fa')[0] + '.chrom.sizes'
-    chrom_path = os.path.join(parent_dir, chrom_basename)
+    parent_dir = fasta.parent
+    fasta_basename = fasta.name
+    chrom_basename = fasta.stem + '.chrom.sizes'
+    chrom_path = parent_dir / chrom_basename
 
     # Make a chromosome length file
-    if os.path.exists(chrom_path):
+    if chrom_path.exists():
         print(f'Using existing chrom length index file: {chrom_path}')
     else:
         with open(chrom_path, 'w') as outfile:
-            command = ["cut", "-f1,2", index_path]
+            command = ["cut", "-f1,2", str(index_path)]
             subprocess.run(command, stdout=outfile)
smftools/informatics/archived/helpers/archived/index_fasta.py
@@ -0,0 +1,24 @@
+import pysam
+from pathlib import Path
+
+def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
+    """
+    Index a FASTA and optionally write <fasta>.chrom.sizes for bigwig/bedgraph work.
+
+    Returns
+    -------
+    Path: path to chrom.sizes file (if requested), else .fai
+    """
+    fasta = Path(fasta)
+    pysam.faidx(str(fasta)) # makes fasta.fai
+
+    if write_chrom_sizes:
+        fai = fasta.with_suffix(fasta.suffix + ".fai")
+        chrom_sizes = fasta.with_suffix(".chrom.sizes")
+        with open(fai) as f_in, open(chrom_sizes, "w") as out:
+            for line in f_in:
+                chrom, size = line.split()[:2]
+                out.write(f"{chrom}\t{size}\n")
+        return chrom_sizes
+
+    return fasta.with_suffix(fasta.suffix + ".fai")
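An illustrative call of the helper defined above, assuming index_fasta is in scope and a reference FASTA named reference.fa exists (both are assumptions for the example, not package defaults):

from pathlib import Path

# Writes reference.fa.fai via pysam.faidx plus a reference.chrom.sizes file, and returns the latter
chrom_sizes = index_fasta(Path("reference.fa"), write_chrom_sizes=True)
print(chrom_sizes.read_text())  # tab-separated "<chrom>\t<length>" lines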
smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py
@@ -13,10 +13,9 @@ def make_modbed(aligned_sorted_output, thresholds, mod_bed_dir):
     import os
     import subprocess
 
-    os.chdir(mod_bed_dir)
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
     command = [
-        "modkit", "pileup", aligned_sorted_output, mod_bed_dir,
+        "modkit", "pileup", str(aligned_sorted_output), str(mod_bed_dir),
         "--partition-tag", "BC",
         "--only-tabs",
         "--filter-threshold", f'{filter_threshold}',
smftools/informatics/{helpers → archived/helpers/archived}/modQC.py
@@ -16,9 +16,9 @@ def modQC(aligned_sorted_output, thresholds):
     import subprocess
 
     filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold = thresholds
-    subprocess.run(["modkit", "sample-probs", aligned_sorted_output])
+    subprocess.run(["modkit", "sample-probs", str(aligned_sorted_output)])
     command = [
-        "modkit", "summary", aligned_sorted_output,
+        "modkit", "summary", str(aligned_sorted_output),
         "--filter-threshold", f"{filter_threshold}",
         "--mod-thresholds", f"m:{m5C_threshold}",
         "--mod-thresholds", f"a:{m6A_threshold}",
smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py
@@ -1,24 +1,5 @@
 # plot_bed_histograms
 
-def plot_bed_histograms(bed_file, plotting_directory, fasta):
-    """
-    Plots read length, coverage, mapq, read quality stats for each record.
-
-    Parameters:
-        bed_file (str): Path to the bed file to derive metrics from.
-        plot_directory (str): Path to the directory to write out historgrams.
-        fasta (str): Path to FASTA corresponding to bed
-
-    Returns:
-        None
-    """
-    import pandas as pd
-    import matplotlib.pyplot as plt
-    import numpy as np
-    import os
-
-    # plot_bed_histograms.py
-
 def plot_bed_histograms(
     bed_file,
     plotting_directory,
smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py
@@ -15,13 +15,14 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
     Writes out split BAM files.
     """
     import pysam
+    from pathlib import Path
     import os
 
-    bam_base = os.path.basename(input_bam)
-    bam_base_minus_suffix = bam_base.split(bam_suffix)[0]
+    bam_base = input_bam.name
+    bam_base_minus_suffix = input_bam.stem
 
     # Open the input BAM file for reading
-    with pysam.AlignmentFile(input_bam, "rb") as bam:
+    with pysam.AlignmentFile(str(input_bam), "rb") as bam:
         # Create a dictionary to store output BAM files
         output_files = {}
         # Iterate over each read in the BAM file
@@ -32,8 +33,8 @@ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
                 #bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
                 # Open the output BAM file corresponding to the barcode
                 if bc_tag not in output_files:
-                    output_path = os.path.join(split_dir, f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}")
-                    output_files[bc_tag] = pysam.AlignmentFile(output_path, "wb", header=bam.header)
+                    output_path = split_dir / f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}"
+                    output_files[bc_tag] = pysam.AlignmentFile(str(output_path), "wb", header=bam.header)
                 # Write the read to the corresponding output BAM file
                 output_files[bc_tag].write(read)
             except KeyError:
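The per-read barcode lookup that drives the split (read.get_tag plus the KeyError fallback) can be tried in isolation. A short sketch with a hypothetical BAM path:

import pysam

# Hypothetical input BAM; reads without a "BC" tag raise KeyError and are skipped
with pysam.AlignmentFile("demuxed.bam", "rb") as bam:
    for read in bam:
        try:
            bc_tag = read.get_tag("BC")
        except KeyError:
            continue
        print(read.query_name, bc_tag)
        break  # one example read is enough here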
smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py
@@ -12,21 +12,21 @@ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
        None
        Splits an input BAM file on barcode value and makes a BAM index file.
     """
-    from .. import readwrite
+    from ...readwrite import date_string, make_dirs
+    from pathlib import Path
     import os
-    import subprocess
+    import pysam
     import glob
     from .separate_bam_by_bc import separate_bam_by_bc
-    from .make_dirs import make_dirs
 
     aligned_sorted_output = aligned_sorted_BAM + bam_suffix
-    file_prefix = readwrite.date_string()
+    file_prefix = date_string()
     separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
     # Make a BAM index file for the BAMs in that directory
     bam_pattern = '*' + bam_suffix
-    bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
-    bam_files = [bam for bam in bam_files if '.bai' not in bam]
+    bam_files = glob.glob(split_dir / bam_pattern)
+    bam_files = [str(bam) for bam in bam_files if '.bai' not in str(bam)]
     for input_file in bam_files:
-        subprocess.run(["samtools", "index", input_file])
+        pysam.index(input_file)
 
     return bam_files
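The samtools-to-pysam switch in this hunk comes down to calling pysam's bundled index wrapper directly instead of shelling out. A minimal sketch with a hypothetical path:

import pysam

bam_path = "split_bams/sample_barcode01.bam"  # hypothetical coordinate-sorted split BAM
pysam.index(bam_path)                         # writes sample_barcode01.bam.bai next to the BAM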
smftools/informatics/archived/subsample_fasta_from_bed.py
@@ -0,0 +1,49 @@
+from pathlib import Path
+from pyfaidx import Fasta
+
+def subsample_fasta_from_bed(
+    input_FASTA: str | Path,
+    input_bed: str | Path,
+    output_directory: str | Path,
+    output_FASTA: str | Path
+) -> None:
+    """
+    Take a genome-wide FASTA file and a BED file containing
+    coordinate windows of interest. Outputs a subsampled FASTA.
+    """
+
+    # Normalize everything to Path
+    input_FASTA = Path(input_FASTA)
+    input_bed = Path(input_bed)
+    output_directory = Path(output_directory)
+    output_FASTA = Path(output_FASTA)
+
+    # Ensure output directory exists
+    output_directory.mkdir(parents=True, exist_ok=True)
+
+    output_FASTA_path = output_directory / output_FASTA
+
+    # Load the FASTA file using pyfaidx
+    fasta = Fasta(str(input_FASTA)) # pyfaidx requires string paths
+
+    # Open BED + output FASTA
+    with input_bed.open("r") as bed, output_FASTA_path.open("w") as out_fasta:
+        for line in bed:
+            fields = line.strip().split()
+            chrom = fields[0]
+            start = int(fields[1]) # BED is 0-based
+            end = int(fields[2]) # BED is 0-based and end is exclusive
+            desc = " ".join(fields[3:]) if len(fields) > 3 else ""
+
+            if chrom not in fasta:
+                print(f"Warning: {chrom} not found in FASTA")
+                continue
+
+            # pyfaidx is 1-based indexing internally, but [start:end] works with BED coords
+            sequence = fasta[chrom][start:end].seq
+
+            header = f">{chrom}:{start}-{end}"
+            if desc:
+                header += f" {desc}"
+
+            out_fasta.write(f"{header}\n{sequence}\n")
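An illustrative invocation of the new helper, assuming the function shown above is in scope and using hypothetical file names:

subsample_fasta_from_bed(
    input_FASTA="genome.fa",
    input_bed="regions_of_interest.bed",
    output_directory="subsampled_fastas",
    output_FASTA="regions_of_interest.fa",
)
# Writes subsampled_fastas/regions_of_interest.fa with one record per BED interval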