smftools 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. smftools/__init__.py +0 -2
  2. smftools/_settings.py +3 -2
  3. smftools/_version.py +1 -0
  4. smftools/datasets/F1_sample_sheet.csv +5 -0
  5. smftools/datasets/datasets.py +14 -11
  6. smftools/informatics/__init__.py +10 -7
  7. smftools/informatics/archived/bam_conversion.py +59 -0
  8. smftools/informatics/archived/bam_direct.py +63 -0
  9. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  10. smftools/informatics/conversion_smf.py +79 -0
  11. smftools/informatics/direct_smf.py +89 -0
  12. smftools/informatics/fast5_to_pod5.py +21 -0
  13. smftools/informatics/helpers/LoadExperimentConfig.py +74 -0
  14. smftools/informatics/helpers/__init__.py +22 -4
  15. smftools/informatics/helpers/align_and_sort_BAM.py +48 -0
  16. smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
  17. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  18. smftools/informatics/helpers/binarize_converted_base_identities.py +11 -4
  19. smftools/informatics/helpers/canoncall.py +14 -1
  20. smftools/informatics/helpers/complement_base_list.py +21 -0
  21. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
  22. smftools/informatics/helpers/converted_BAM_to_adata.py +183 -97
  23. smftools/informatics/helpers/count_aligned_reads.py +25 -14
  24. smftools/informatics/helpers/extract_base_identities.py +44 -23
  25. smftools/informatics/helpers/extract_mods.py +17 -5
  26. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  27. smftools/informatics/helpers/find_conversion_sites.py +24 -16
  28. smftools/informatics/helpers/generate_converted_FASTA.py +60 -21
  29. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  30. smftools/informatics/helpers/get_native_references.py +10 -7
  31. smftools/informatics/helpers/index_fasta.py +12 -0
  32. smftools/informatics/helpers/make_dirs.py +9 -3
  33. smftools/informatics/helpers/make_modbed.py +10 -4
  34. smftools/informatics/helpers/modQC.py +10 -2
  35. smftools/informatics/helpers/modcall.py +16 -2
  36. smftools/informatics/helpers/modkit_extract_to_adata.py +486 -323
  37. smftools/informatics/helpers/ohe_batching.py +52 -0
  38. smftools/informatics/helpers/one_hot_encode.py +15 -8
  39. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
  40. smftools/informatics/helpers/separate_bam_by_bc.py +20 -5
  41. smftools/informatics/helpers/split_and_index_BAM.py +31 -11
  42. smftools/informatics/load_adata.py +127 -0
  43. smftools/informatics/readwrite.py +13 -16
  44. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  45. smftools/informatics/subsample_pod5.py +104 -0
  46. smftools/preprocessing/__init__.py +6 -7
  47. smftools/preprocessing/append_C_context.py +52 -22
  48. smftools/preprocessing/binarize_on_Youden.py +8 -4
  49. smftools/preprocessing/binary_layers_to_ohe.py +9 -4
  50. smftools/preprocessing/calculate_complexity.py +26 -14
  51. smftools/preprocessing/calculate_consensus.py +47 -0
  52. smftools/preprocessing/calculate_converted_read_methylation_stats.py +69 -11
  53. smftools/preprocessing/calculate_coverage.py +14 -8
  54. smftools/preprocessing/calculate_pairwise_hamming_distances.py +11 -6
  55. smftools/preprocessing/calculate_position_Youden.py +21 -12
  56. smftools/preprocessing/calculate_read_length_stats.py +67 -8
  57. smftools/preprocessing/clean_NaN.py +13 -6
  58. smftools/preprocessing/filter_converted_reads_on_methylation.py +15 -6
  59. smftools/preprocessing/filter_reads_on_length.py +16 -6
  60. smftools/preprocessing/invert_adata.py +10 -5
  61. smftools/preprocessing/load_sample_sheet.py +24 -0
  62. smftools/preprocessing/make_dirs.py +21 -0
  63. smftools/preprocessing/mark_duplicates.py +54 -30
  64. smftools/preprocessing/min_non_diagonal.py +9 -4
  65. smftools/preprocessing/recipes.py +125 -0
  66. smftools/preprocessing/remove_duplicates.py +15 -6
  67. smftools/readwrite.py +13 -16
  68. smftools/tools/apply_HMM.py +1 -0
  69. smftools/tools/cluster.py +0 -0
  70. smftools/tools/read_HMM.py +1 -0
  71. smftools/tools/subset_adata.py +32 -0
  72. smftools/tools/train_HMM.py +43 -0
  73. smftools-0.1.3.dist-info/METADATA +94 -0
  74. smftools-0.1.3.dist-info/RECORD +84 -0
  75. smftools/informatics/helpers/align_BAM.py +0 -49
  76. smftools/informatics/helpers/load_experiment_config.py +0 -17
  77. smftools/informatics/pod5_conversion.py +0 -26
  78. smftools/informatics/pod5_direct.py +0 -29
  79. smftools/informatics/pod5_to_adata.py +0 -17
  80. smftools-0.1.0.dist-info/METADATA +0 -75
  81. smftools-0.1.0.dist-info/RECORD +0 -58
  82. /smftools/informatics/helpers/{informatics.py → archived/informatics.py} +0 -0
  83. /smftools/informatics/helpers/{load_adata.py → archived/load_adata.py} +0 -0
  84. /smftools/preprocessing/{preprocessing.py → archives/preprocessing.py} +0 -0
  85. {smftools-0.1.0.dist-info → smftools-0.1.3.dist-info}/WHEEL +0 -0
  86. {smftools-0.1.0.dist-info → smftools-0.1.3.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,52 @@
+ # ohe_batching
+
+ def ohe_batching(base_identities, tmp_dir, record, prefix='', batch_size=100000):
+     """
+     Processes base identities to one-hot encoded matrices and writes to a h5ad file in batches.
+
+     Parameters:
+         base_identities (dict): A dictionary of read names and sequences.
+         tmp_dir (str): Path to directory where the files will be saved.
+         record (str): Name of the record.
+         prefix (str): Prefix to add to the output file name
+         batch_size (int): Number of reads to process in each batch.
+
+     Returns:
+         ohe_file (list): list of output file names
+     """
+     import os
+     import anndata as ad
+     import numpy as np
+     from tqdm import tqdm
+     from .one_hot_encode import one_hot_encode
+
+     batch = {}
+     count = 0
+     batch_number = 0
+     total_reads = len(base_identities)
+     file_names = []
+
+     for read_name, seq in tqdm(base_identities.items(), desc="Encoding and writing one hot encoded reads", total=total_reads):
+         one_hot_matrix = one_hot_encode(seq)
+         batch[read_name] = one_hot_matrix
+         count += 1
+         # If the batch size is reached, write out the batch and reset
+         if count >= batch_size:
+             save_name = os.path.join(tmp_dir, f'tmp_{prefix}_{record}_{batch_number}.h5ad.gz')
+             X = np.random.rand(1, 1)
+             tmp_ad = ad.AnnData(X=X, uns=batch)
+             tmp_ad.write_h5ad(save_name, compression='gzip')
+             file_names.append(save_name)
+             batch.clear()
+             count = 0
+             batch_number += 1
+
+     # Write out any remaining reads in the final batch
+     if batch:
+         save_name = os.path.join(tmp_dir, f'tmp_{prefix}_{record}_{batch_number}.h5ad.gz')
+         X = np.random.rand(1, 1)
+         tmp_ad = ad.AnnData(X=X, uns=batch)
+         tmp_ad.write_h5ad(save_name, compression='gzip')
+         file_names.append(save_name)
+
+     return file_names
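The new ohe_batching helper writes each batch of one-hot encoded reads into the uns slot of a throwaway AnnData (X is a 1x1 dummy matrix), so large runs never hold every encoded read in memory at once. A minimal usage sketch, assuming the helper is importable from its file path and using made-up reads and a temporary directory:

    # Hypothetical usage; read names, sequences, and the tmp directory are placeholders.
    import os
    from smftools.informatics.helpers.ohe_batching import ohe_batching

    os.makedirs('tmp', exist_ok=True)
    base_identities = {'read_1': list('ACGTN'), 'read_2': list('TTACG')}
    files = ohe_batching(base_identities, tmp_dir='tmp', record='chr1', prefix='demo', batch_size=1)
    # With batch_size=1, each read lands in its own tmp_demo_chr1_<n>.h5ad.gz file.
    print(files)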
@@ -1,14 +1,21 @@
  # one_hot_encode
- from .. import readwrite
 
  # String encodings
  def one_hot_encode(sequence):
      """
-     Input: A sequence string of a read.
-     Output: One hot encoding of the sequence string.
+     One hot encodes a sequence list.
+     Parameters:
+         sequence (list): A list of DNA base sequences.
+
+     Returns:
+         flattened (ndarray): A numpy ndarray holding a flattened one hot encoding of the input sequence string.
      """
-     mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'N': 4}
-     one_hot_matrix = np.zeros((len(sequence), 5), dtype=int)
-     for i, nucleotide in enumerate(sequence):
-         one_hot_matrix[i, mapping[nucleotide]] = 1
-     return one_hot_matrix
+     import numpy as np
+
+     seq_array = np.array(sequence, dtype='<U1') # String dtype
+     mapping = np.array(['A', 'C', 'G', 'T', 'N'])
+     seq_array[~np.isin(seq_array, mapping)] = 'N'
+     one_hot_matrix = (seq_array[:, None] == mapping).astype(int)
+     flattened = one_hot_matrix.flatten()
+
+     return flattened
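one_hot_encode now vectorizes the encoding with NumPy instead of looping over nucleotides, coerces any base outside A/C/G/T/N to 'N', and returns a flattened vector of length 5 x sequence length rather than a 2-D matrix, matching what ohe_batching stores per read. A small sketch of the new behavior, assuming the function is imported from its file path:

    # Hypothetical check of the vectorized encoder; the input list is made up.
    from smftools.informatics.helpers.one_hot_encode import one_hot_encode

    ohe = one_hot_encode(list('ACGTX'))  # 'X' is not in the alphabet, so it is treated as 'N'
    print(ohe.shape)                     # (25,): 5 positions x 5 channels (A, C, G, T, N), flattened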
@@ -0,0 +1,52 @@
+ # plot_read_length_and_coverage_histograms
+
+ def plot_read_length_and_coverage_histograms(bed_file, plotting_directory):
+     """
+     Plots read length and coverage statistics for each record.
+
+     Parameters:
+         bed_file (str): Path to the bed file to derive read lengths and coverage from.
+         plot_directory (str): Path to the directory to write out historgrams.
+
+     Returns:
+         None
+     """
+     import pandas as pd
+     import matplotlib.pyplot as plt
+     import numpy as np
+     import os
+
+     bed_basename = os.path.basename(bed_file).split('.bed')[0]
+     # Load the BED file into a DataFrame
+     df = pd.read_csv(bed_file, sep='\t', header=None, names=['chromosome', 'start', 'end', 'length', 'read_name'])
+
+     # Group by chromosome
+     grouped = df.groupby('chromosome')
+
+     for chrom, group in grouped:
+         # Plot read length histogram
+         plt.figure(figsize=(12, 6))
+         plt.hist(group['length'], bins=50, edgecolor='k', alpha=0.7)
+         plt.title(f'Read Length Histogram of reads aligned to {chrom}')
+         plt.xlabel('Read Length')
+         plt.ylabel('Count')
+         plt.grid(True)
+         save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_read_length_histogram.png')
+         plt.savefig(save_name)
+         plt.close()
+
+         # Compute coverage
+         coverage = np.zeros(group['end'].max())
+         for _, row in group.iterrows():
+             coverage[row['start']:row['end']] += 1
+
+         # Plot coverage histogram
+         plt.figure(figsize=(12, 6))
+         plt.plot(coverage, color='b')
+         plt.title(f'Coverage Histogram for {chrom}')
+         plt.xlabel('Position')
+         plt.ylabel('Coverage')
+         plt.grid(True)
+         save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_coverage_histogram.png')
+         plt.savefig(save_name)
+         plt.close()
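plot_read_length_and_coverage_histograms reads a five-column BED (chromosome, start, end, length, read_name), then writes one read-length histogram and one per-position coverage plot per chromosome into the plotting directory. A hedged usage sketch with placeholder paths:

    # Hypothetical call; the BED file and output directory are placeholders and must already exist.
    from smftools.informatics.helpers.plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms

    plot_read_length_and_coverage_histograms(
        bed_file='aligned_reads.bed',
        plotting_directory='bed_histograms',
    )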
@@ -1,12 +1,26 @@
  ## separate_bam_by_bc
- import pysam
 
  # General
- def separate_bam_by_bc(input_bam, output_prefix):
+ def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
      """
-     Input: Takes a single BAM input. Also takes an output prefix to append to the output file.
-     Output: Splits the BAM based on the BC SAM tag value.
+     Separates an input BAM file on the BC SAM tag values.
+
+     Parameters:
+         input_bam (str): File path to the BAM file to split.
+         output_prefix (str): A prefix to append to the output BAM.
+         bam_suffix (str): A suffix to add to the bam file.
+         split_dir (str): String indicating path to directory to split BAMs into
+
+     Returns:
+         None
+         Writes out split BAM files.
      """
+     import pysam
+     import os
+
+     bam_base = os.path.basename(input_bam)
+     bam_base_minus_suffix = bam_base.split(bam_suffix)[0]
+
      # Open the input BAM file for reading
      with pysam.AlignmentFile(input_bam, "rb") as bam:
          # Create a dictionary to store output BAM files
@@ -18,7 +32,8 @@ def separate_bam_by_bc(input_bam, output_prefix):
                  bc_tag = read.get_tag("BC", with_value_type=True)[0].split('barcode')[1]
                  # Open the output BAM file corresponding to the barcode
                  if bc_tag not in output_files:
-                     output_files[bc_tag] = pysam.AlignmentFile(f"{output_prefix}_{bc_tag}.bam", "wb", header=bam.header)
+                     output_path = os.path.join(split_dir, f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}")
+                     output_files[bc_tag] = pysam.AlignmentFile(output_path, "wb", header=bam.header)
                  # Write the read to the corresponding output BAM file
                  output_files[bc_tag].write(read)
              except KeyError:
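separate_bam_by_bc now takes the BAM suffix and an explicit split directory, and derives output names from the input BAM basename, so demultiplexed files land in split_dir as <prefix>_<input-basename>_<barcode><suffix>. A usage sketch with placeholder names:

    # Hypothetical demultiplexing call; file and directory names are placeholders.
    from smftools.informatics.helpers.separate_bam_by_bc import separate_bam_by_bc

    separate_bam_by_bc(
        input_bam='aligned_sorted.bam',
        output_prefix='240101',            # split_and_index_BAM passes a date string here
        bam_suffix='.bam',
        split_dir='demultiplexed_BAMs',    # assumed to already exist
    )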
@@ -1,21 +1,41 @@
  ## split_and_index_BAM
- from .. import readwrite
- import os
- import subprocess
- import glob
- from .separate_bam_by_bc import separate_bam_by_bc
 
- def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
+ def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory, fasta):
      """
-     A wrapper function for splitting BAMS and indexing them
+     A wrapper function for splitting BAMS and indexing them.
+     Parameters:
+         aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file.
+         split_dir (str): A string representing the file path to the directory to split the BAMs into.
+         bam_suffix (str): A suffix to add to the bam file.
+         output_directory (str): A file path to the directory to output all the analyses.
+         fasta (str): File path to the reference genome to align to.
+
+     Returns:
+         None
+         Splits an input BAM file on barcode value and makes a BAM index file.
      """
-     os.chdir(split_dir)
+     from .. import readwrite
+     import os
+     import subprocess
+     import glob
+     from .separate_bam_by_bc import separate_bam_by_bc
+     from .aligned_BAM_to_bed import aligned_BAM_to_bed
+     from .extract_readnames_from_BAM import extract_readnames_from_BAM
+     from .make_dirs import make_dirs
+
+     plotting_dir = os.path.join(output_directory, 'demultiplexed_bed_histograms')
+     bed_dir = os.path.join(output_directory, 'demultiplexed_read_alignment_coordinates')
+     make_dirs([plotting_dir, bed_dir])
      aligned_sorted_output = aligned_sorted_BAM + bam_suffix
-     file_prefix = readwrite.datestring()
-     separate_bam_by_bc(aligned_sorted_output, file_prefix)
+     file_prefix = readwrite.date_string()
+     separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)
      # Make a BAM index file for the BAMs in that directory
      bam_pattern = '*' + bam_suffix
      bam_files = glob.glob(os.path.join(split_dir, bam_pattern))
+     bam_files = [bam for bam in bam_files if '.bai' not in bam]
      for input_file in bam_files:
          subprocess.run(["samtools", "index", input_file])
-         print(f"Indexed {input_file}")
+         # Make a bed file of coordinates for the BAM
+         aligned_BAM_to_bed(input_file, plotting_dir, bed_dir, fasta)
+         # Make a text file of reads for the BAM
+         extract_readnames_from_BAM(input_file)
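split_and_index_BAM now also produces per-barcode BED files, coverage/length histograms, and read-name lists for each demultiplexed BAM, rather than just indexing them. A hedged call sketch; paths are placeholders, samtools must be on PATH, and the sibling helpers shown in the imports above must be available:

    # Hypothetical wrapper call; note that bam_suffix is appended to aligned_sorted_BAM inside the function.
    from smftools.informatics.helpers.split_and_index_BAM import split_and_index_BAM

    split_and_index_BAM(
        aligned_sorted_BAM='sample_aligned_sorted',
        split_dir='demultiplexed_BAMs',
        bam_suffix='.bam',
        output_directory='analysis_output',
        fasta='reference.fasta',
    )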
@@ -0,0 +1,127 @@
+ ## load_adata
+
+ def load_adata(config_path):
+     """
+     High-level function to call for converting raw sequencing data to an adata object.
+     Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
+     Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
+     Also works for illumina fastq and unaligned BAM for conversion SMF workflows.
+
+     Parameters:
+         config_path (str): A string representing the file path to the experiment configuration csv file.
+
+     Returns:
+         None
+     """
+     # Lazy importing of packages
+     from .helpers import LoadExperimentConfig, make_dirs, concatenate_fastqs_to_bam
+     from .fast5_to_pod5 import fast5_to_pod5
+     from .subsample_fasta_from_bed import subsample_fasta_from_bed
+     import os
+     import numpy as np
+     from pathlib import Path
+
+     # Default params
+     bam_suffix = '.bam' # If different, change from here.
+     split_dir = 'demultiplexed_BAMs' # If different, change from here.
+     strands = ['bottom', 'top'] # If different, change from here. Having both listed generally doesn't slow things down too much.
+     conversions = ['unconverted'] # The name to use for the unconverted files. If different, change from here.
+
+     # Load experiment config parameters into global variables
+     experiment_config = LoadExperimentConfig(config_path)
+     var_dict = experiment_config.var_dict
+
+     # These below variables will point to default_value if they are empty in the experiment_config.csv or if the variable is fully omitted from the csv.
+     default_value = None
+
+     # General config variable init
+     smf_modality = var_dict.get('smf_modality', default_value) # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Necessary.
+     input_data_path = var_dict.get('input_data_path', default_value) # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
+     output_directory = var_dict.get('output_directory', default_value) # Path to the output directory to make for the analysis. Necessary.
+     fasta = var_dict.get('fasta', default_value) # Path to reference FASTA.
+     fasta_regions_of_interest = var_dict.get("fasta_regions_of_interest", default_value) # Path to a bed file listing coordinate regions of interest within the FASTA to include. Optional.
+     mapping_threshold = var_dict.get('mapping_threshold', default_value) # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
+     experiment_name = var_dict.get('experiment_name', default_value) # A key term to add to the AnnData file name.
+     model = var_dict.get('model', default_value) # needed for dorado basecaller
+     barcode_kit = var_dict.get('barcode_kit', default_value) # needed for dorado basecaller
+     # Conversion specific variable init
+     conversion_types = var_dict.get('conversion_types', default_value)
+     # Direct methylation specific variable init
+     filter_threshold = var_dict.get('filter_threshold', default_value)
+     m6A_threshold = var_dict.get('m6A_threshold', default_value)
+     m5C_threshold = var_dict.get('m5C_threshold', default_value)
+     hm5C_threshold = var_dict.get('hm5C_threshold', default_value)
+     thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
+     mod_list = var_dict.get('mod_list', default_value)
+     batch_size = var_dict.get('batch_size', default_value)
+
+     # Make initial output directory
+     make_dirs([output_directory])
+     os.chdir(output_directory)
+     # Define the pathname to split BAMs into later during demultiplexing.
+     split_path = os.path.join(output_directory, split_dir)
+
+     # If fasta_regions_of_interest is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
+     if fasta_regions_of_interest and '.bed' in fasta_regions_of_interest:
+         fasta_basename = os.path.basename(fasta).split('.fa')[0]
+         bed_basename_minus_suffix = os.path.basename(fasta_regions_of_interest).split('.bed')[0]
+         output_FASTA = fasta_basename + '_subsampled_by_' + bed_basename_minus_suffix + '.fasta'
+         subsample_fasta_from_bed(fasta, fasta_regions_of_interest, output_directory, output_FASTA)
+         fasta = os.path.join(output_directory, output_FASTA)
+
+     # If conversion_types is passed:
+     if conversion_types:
+         conversions += conversion_types
+
+     # Get the input filetype
+     if Path(input_data_path).is_file():
+         input_data_filetype = '.' + os.path.basename(input_data_path).split('.')[1].lower()
+         input_is_pod5 = input_data_filetype in ['.pod5','.p5']
+         input_is_fast5 = input_data_filetype in ['.fast5','.f5']
+         input_is_fastq = input_data_filetype in ['.fastq', '.fq']
+         input_is_bam = input_data_filetype == bam_suffix
+         if input_is_fastq:
+             fastq_paths = [input_data_path]
+     elif Path(input_data_path).is_dir():
+         # Get the file names in the input data dir
+         input_files = os.listdir(input_data_path)
+         input_is_pod5 = sum([True for file in input_files if '.pod5' in file or '.p5' in file])
+         input_is_fast5 = sum([True for file in input_files if '.fast5' in file or '.f5' in file])
+         input_is_fastq = sum([True for file in input_files if '.fastq' in file or '.fq' in file])
+         input_is_bam = sum([True for file in input_files if bam_suffix in file])
+         if input_is_fastq:
+             fastq_paths = [os.path.join(input_data_path, file) for file in input_files if '.fastq' in file or '.fq' in file]
+
+     # If the input files are not pod5 files, and they are fast5 files, convert the files to a pod5 file before proceeding.
+     if input_is_fast5 and not input_is_pod5:
+         # take the input directory of fast5 files and write out a single pod5 file into the output directory.
+         output_pod5 = os.path.join(output_directory, 'FAST5s_to_POD5.pod5')
+         print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
+         fast5_to_pod5(input_data_path, output_pod5)
+         # Reassign the pod5_dir variable to point to the new pod5 file.
+         input_data_path = output_pod5
+         input_is_pod5 = True
+         input_is_fast5 = False
+
+     elif input_is_fastq:
+         output_bam = os.path.join(output_directory, 'FASTQs_concatenated_into_BAM.bam')
+         concatenate_fastqs_to_bam(fastq_paths, output_bam, barcode_tag='BC', gzip_suffix='.gz')
+         input_data_path = output_bam
+         input_is_bam = True
+         input_is_fastq = False
+
+     if input_is_pod5:
+         basecall = True
+     elif input_is_bam:
+         basecall = False
+     else:
+         print('Error, can not find input bam or pod5')
+
+     if smf_modality == 'conversion':
+         from .conversion_smf import conversion_smf
+         conversion_smf(fasta, output_directory, conversions, strands, model, input_data_path, split_path, barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall)
+     elif smf_modality == 'direct':
+         from .direct_smf import direct_smf
+         direct_smf(fasta, output_directory, mod_list, model, thresholds, input_data_path, split_path, barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size, basecall)
+     else:
+         print("Error")
@@ -1,27 +1,12 @@
  ## readwrite ##
 
- # Basic I/O
- import os
- # Datetime
- from datetime import datetime
- # Data structures and basic operations
- import math
- import numpy as np
- import pandas as pd
- import anndata as ad
- import scipy.sparse as sp
-
- # Runtime warnings
- import warnings
- warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
- warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
-
  ######################################################################################################
  ## Datetime functionality
  def date_string():
      """
      Each time this is called, it returns the current date string
      """
+     from datetime import datetime
      current_date = datetime.now()
      date_string = current_date.strftime("%Y%m%d")
      date_string = date_string[2:]
@@ -31,6 +16,7 @@ def time_string():
      """
      Each time this is called, it returns the current time string
      """
+     from datetime import datetime
      current_time = datetime.now()
      return current_time.strftime("%H:%M:%S")
  ######################################################################################################
@@ -42,6 +28,9 @@ def adata_to_df(adata, layer=None):
      Input: An adata object with a specified layer.
      Output: A dataframe for the specific layer.
      """
+     import pandas as pd
+     import anndata as ad
+
      # Extract the data matrix from the given layer
      if layer:
          data_matrix = adata.layers[layer]
@@ -60,6 +49,7 @@ def save_matrix(matrix, save_name):
      Input: A numpy matrix and a save_name
      Output: A txt file representation of the data matrix
      """
+     import numpy as np
      np.savetxt(f'{save_name}.txt', matrix)
 
  def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
@@ -67,6 +57,13 @@ def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
      Concatenate all h5ad files in a directory and delete them after the final adata is written out.
      Input: an output file path relative to the directory in which the function is called
      """
+     import os
+     import anndata as ad
+     # Runtime warnings
+     import warnings
+     warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
+     warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
+
      # List all files in the directory
      files = os.listdir(os.getcwd())
      # get current working directory
@@ -0,0 +1,47 @@
+ # subsample_fasta_from_bed
+
+ def subsample_fasta_from_bed(input_FASTA, input_bed, output_directory, output_FASTA):
+     """
+     Take a genome-wide FASTA file and a bed file containing coordinate windows of interest. Outputs a subsampled FASTA.
+
+     Parameters:
+         input_FASTA (str): String representing the path to the input FASTA file.
+         input_bed (str): String representing the path to the input BED file.
+         output_directory (str): String representing the path to the output directory for the new FASTA file.
+         output_FASTA (str): Name of the output FASTA.
+
+     Returns:
+         None
+     """
+     from pyfaidx import Fasta
+     import os
+
+     # Load the FASTA file using pyfaidx
+     fasta = Fasta(input_FASTA)
+
+     output_FASTA_path = os.path.join(output_directory, output_FASTA)
+
+     # Open the BED file
+     with open(input_bed, 'r') as bed, open(output_FASTA_path, 'w') as out_fasta:
+         for line in bed:
+             # Each line in BED file contains: chrom, start, end (and possibly more columns)
+             fields = line.strip().split()
+             n_fields = len(fields)
+             chrom = fields[0]
+             start = int(fields[1]) # BED is 0-based
+             end = int(fields[2]) # BED is 0-based and end is exclusive
+             if n_fields > 3:
+                 description = " ".join(fields[3:])
+
+             # Check if the chromosome exists in the FASTA file
+             if chrom in fasta:
+                 # pyfaidx is 1-based, so convert coordinates accordingly
+                 sequence = fasta[chrom][start:end].seq
+                 # Write the sequence to the output FASTA file
+                 if n_fields > 3:
+                     out_fasta.write(f">{chrom}:{start}-{end} {description}\n")
+                 else:
+                     out_fasta.write(f">{chrom}:{start}-{end}\n")
+                 out_fasta.write(f"{sequence}\n")
+             else:
+                 print(f"Warning: {chrom} not found in the FASTA file")
@@ -0,0 +1,104 @@
+ # subsample_pod5
+
+ def subsample_pod5(pod5_path, read_name_path, output_directory):
+     """
+     Takes a POD5 file and a text file containing read names of interest and writes out a subsampled POD5 for just those reads.
+     This is a useful function when you have a list of read names that mapped to a region of interest that you want to reanalyze from the pod5 level.
+
+     Parameters:
+         pod5_path (str): File path to the POD5 file (or directory of multiple pod5 files) to subsample.
+         read_name_path (str | int): File path to a text file of read names. One read name per line. If an int value is passed, a random subset of that many reads will occur
+         output_directory (str): A file path to the directory to output the file.
+
+     Returns:
+         None
+     """
+     import pod5 as p5
+     import os
+
+     if os.path.isdir(pod5_path):
+         pod5_path_is_dir = True
+         input_pod5_base = 'input_pod5s.pod5'
+         files = os.listdir(pod5_path)
+         pod5_files = [os.path.join(pod5_path, file) for file in files if '.pod5' in file]
+         pod5_files.sort()
+         print(f'Found input pod5s: {pod5_files}')
+
+     elif os.path.exists(pod5_path):
+         pod5_path_is_dir = False
+         input_pod5_base = os.path.basename(pod5_path)
+
+     else:
+         print('Error: pod5_path passed does not exist')
+         return None
+
+     if type(read_name_path) == str:
+         input_read_name_base = os.path.basename(read_name_path)
+         output_base = input_pod5_base.split('.pod5')[0] + '_' + input_read_name_base.split('.txt')[0] + '_subsampled.pod5'
+
+         # extract read names into a list of strings
+         with open(read_name_path, 'r') as file:
+             read_names = [line.strip() for line in file]
+
+         print(f'Looking for read_ids: {read_names}')
+         read_records = []
+
+         if pod5_path_is_dir:
+             for input_pod5 in pod5_files:
+                 with p5.Reader(input_pod5) as reader:
+                     try:
+                         for read_record in reader.reads(selection=read_names, missing_ok=True):
+                             read_records.append(read_record.to_read())
+                             print(f'Found read in {input_pod5}: {read_record.read_id}')
+                     except:
+                         print('Skipping pod5, could not find reads')
+         else:
+             with p5.Reader(pod5_path) as reader:
+                 try:
+                     for read_record in reader.reads(selection=read_names):
+                         read_records.append(read_record.to_read())
+                         print(f'Found read in {input_pod5}: {read_record}')
+                 except:
+                     print('Could not find reads')
+
+     elif type(read_name_path) == int:
+         import random
+         output_base = input_pod5_base.split('.pod5')[0] + f'_{read_name_path}_randomly_subsampled.pod5'
+         all_read_records = []
+
+         if pod5_path_is_dir:
+             # Shuffle the list of input pod5 paths
+             random.shuffle(pod5_files)
+             for input_pod5 in pod5_files:
+                 # iterate over the input pod5s
+                 print(f'Opening pod5 file {input_pod5}')
+                 with p5.Reader(pod5_path) as reader:
+                     for read_record in reader.reads():
+                         all_read_records.append(read_record.to_read())
+                         # When enough reads are in all_read_records, stop accumulating reads.
+                         if len(all_read_records) >= read_name_path:
+                             break
+
+             if read_name_path <= len(all_read_records):
+                 read_records = random.sample(all_read_records, read_name_path)
+             else:
+                 print('Trying to sample more reads than are contained in the input pod5s, taking all reads')
+                 read_records = all_read_records
+
+         else:
+             with p5.Reader(pod5_path) as reader:
+                 for read_record in reader.reads():
+                     # get all read records from the input pod5
+                     all_read_records.append(read_record.to_read())
+             if read_name_path <= len(all_read_records):
+                 # if the subsampling amount is less than the record amount in the file, randomly subsample the reads
+                 read_records = random.sample(all_read_records, read_name_path)
+             else:
+                 print('Trying to sample more reads than are contained in the input pod5s, taking all reads')
+                 read_records = all_read_records
+
+     output_pod5 = os.path.join(output_directory, output_base)
+
+     # Write the subsampled POD5
+     with p5.Writer(output_pod5) as writer:
+         writer.add_reads(read_records)
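subsample_pod5 supports two modes: pass a text file of read ids to pull just those reads, or pass an int to randomly keep that many reads. A hedged sketch of both calls against a single placeholder POD5 file:

    # Hypothetical calls; run1.pod5 and region_read_names.txt are placeholders.
    from smftools.informatics.subsample_pod5 import subsample_pod5

    # By read name: one read id per line in the text file.
    subsample_pod5('run1.pod5', 'region_read_names.txt', output_directory='.')

    # By count: randomly retain 500 reads from the input.
    subsample_pod5('run1.pod5', 500, output_directory='.')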
@@ -1,35 +1,34 @@
  from .append_C_context import append_C_context
  from .binarize_on_Youden import binarize_on_Youden
- from .binary_layers_to_ohe import binary_layers_to_ohe
  from .calculate_complexity import calculate_complexity
  from .calculate_converted_read_methylation_stats import calculate_converted_read_methylation_stats
  from .calculate_coverage import calculate_coverage
- from .calculate_pairwise_hamming_distances import calculate_pairwise_hamming_distances
  from .calculate_position_Youden import calculate_position_Youden
  from .calculate_read_length_stats import calculate_read_length_stats
  from .clean_NaN import clean_NaN
  from .filter_converted_reads_on_methylation import filter_converted_reads_on_methylation
  from .filter_reads_on_length import filter_reads_on_length
  from .invert_adata import invert_adata
+ from .load_sample_sheet import load_sample_sheet
  from .mark_duplicates import mark_duplicates
- from .min_non_diagonal import min_non_diagonal
  from .remove_duplicates import remove_duplicates
+ from .recipes import recipe_1_Kissiov_and_McKenna_2025, recipe_2_Kissiov_and_McKenna_2025
 
  __all__ = [
      "append_C_context",
      "binarize_on_Youden",
-     "binary_layers_to_ohe",
      "calculate_complexity",
      "calculate_converted_read_methylation_stats",
      "calculate_coverage",
-     "calculate_pairwise_hamming_distances",
      "calculate_position_Youden",
      "calculate_read_length_stats",
      "clean_NaN",
      "filter_converted_reads_on_methylation",
      "filter_reads_on_length",
      "invert_adata",
+     "load_sample_sheet",
      "mark_duplicates",
-     "min_non_diagonal",
-     "remove_duplicates"
+     "remove_duplicates",
+     "recipe_1_Kissiov_and_McKenna_2025",
+     "recipe_2_Kissiov_and_McKenna_2025"
  ]