smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,224 @@
1
+ from ..config import LoadExperimentConfig
2
+ from ..readwrite import make_dirs
3
+
4
+ import os
5
+ import subprocess
6
+ from pathlib import Path
7
+
8
+ import pod5 as p5
9
+
10
+ from typing import Union, List
11
+
12
def _detect_signal_formats(input_data_path):
    """
    Report which raw-signal formats are present at a path.

    Parameters:
        input_data_path (Path): A single file, or a directory whose direct children are inspected.

    Returns:
        tuple[bool, bool]: (has_pod5, has_fast5), decided from file suffixes only.

    Raises:
        FileNotFoundError: If the path is neither an existing file nor a directory.
    """
    pod5_suffixes = {'.pod5', '.p5'}
    fast5_suffixes = {'.fast5', '.f5'}

    if input_data_path.is_file():
        candidates = [input_data_path]
    elif input_data_path.is_dir():
        # Materialise the listing once: iterdir() is a one-shot generator.
        candidates = list(input_data_path.iterdir())
    else:
        raise FileNotFoundError(f"Input path invalid: {input_data_path}")

    # Use the final suffix so names such as 'run.1.pod5' are classified correctly.
    found_suffixes = {candidate.suffix.lower() for candidate in candidates}
    return bool(found_suffixes & pod5_suffixes), bool(found_suffixes & fast5_suffixes)


def basecall_pod5s(config_path):
    """
    Basecall from pod5s given a config file.

    Loads the experiment configuration, creates the output directory, converts
    FAST5 input into a single POD5 file when no POD5 input is present, and then
    dispatches to modcall (when 'mod_list' is set) or canoncall.

    Parameters:
        config_path (str): File path to the basecall configuration file

    Returns:
        None

    Raises:
        ValueError: If 'input_data_path', 'output_directory' or 'model' is missing from the configuration.
        FileNotFoundError: If the configured input path is neither a file nor a directory.
    """
    # Default params
    bam_suffix = '.bam'  # If different, change from here.

    # Load experiment config parameters
    experiment_config = LoadExperimentConfig(config_path)
    var_dict = experiment_config.var_dict

    # Fail with a clear message instead of an opaque TypeError from Path(None).
    required_keys = ('input_data_path', 'output_directory', 'model')
    missing_keys = [key for key in required_keys if var_dict.get(key) is None]
    if missing_keys:
        raise ValueError(f"Missing required config value(s): {', '.join(missing_keys)}")

    # General config variable init
    input_data_path = Path(var_dict.get('input_data_path'))  # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
    output_directory = Path(var_dict.get('output_directory'))  # Path to the output directory to make for the analysis. Necessary.
    model = var_dict.get('model')  # needed for dorado basecaller
    barcode_kit = var_dict.get('barcode_kit', None)  # needed for dorado basecaller
    barcode_both_ends = var_dict.get('barcode_both_ends', None)  # dorado demultiplexing
    trim = var_dict.get('trim', None)  # dorado adapter and barcode removal
    device = var_dict.get('device', 'auto')
    mod_list = var_dict.get('mod_list', None)
    # 'model_dir' and the modification-threshold settings are not consumed by
    # this function, so they are no longer read here.

    # Make initial output directory
    make_dirs([output_directory])

    # Get the input filetype
    input_is_pod5, input_is_fast5 = _detect_signal_formats(input_data_path)

    # If the input files are not pod5 files, and they are fast5 files, convert the files to a pod5 file before proceeding.
    if input_is_fast5 and not input_is_pod5:
        # take the input fast5 file(s) and write out a single pod5 file into the output directory.
        output_pod5 = output_directory / 'FAST5s_to_POD5.pod5'
        print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
        fast5_to_pod5(input_data_path, output_pod5)
        # Point the input at the newly written pod5 file.
        input_data_path = output_pod5

    # The model may be configured as a plain string or a path; Path() accepts both.
    model_basename = Path(model).name.replace('.', '_')

    # NOTE(review): modcall/canoncall are not imported anywhere in this module.
    # This assumes the sibling module `basecalling` exports them - confirm.
    from .basecalling import canoncall, modcall

    if mod_list:
        mod_string = "_".join(mod_list)
        bam = output_directory / f"{model_basename}_{mod_string}_calls"
        modcall(model, input_data_path, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends, trim, device)
    else:
        bam = output_directory / f"{model_basename}_canonical_basecalls"
        canoncall(model, input_data_path, barcode_kit, bam, bam_suffix, barcode_both_ends, trim, device)
84
+
85
+
86
def fast5_to_pod5(
    fast5_dir: Union[str, Path, List[Union[str, Path]]],
    output_pod5: Union[str, Path] = "FAST5s_to_POD5.pod5"
) -> None:
    """
    Convert Nanopore FAST5 files (single file, list of files, or directory)
    into a single .pod5 output using the 'pod5 convert fast5' CLI tool.

    Parameters:
        fast5_dir (str | Path | list): A FAST5 file, a list/tuple of FAST5 files, or a
            directory whose direct children ending in '.fast5' or '.f5' are converted.
        output_pod5 (str | Path): Path of the POD5 file to write.

    Returns:
        None

    Raises:
        FileNotFoundError: If the list is empty, the directory holds no FAST5 files,
            or the path is neither a file nor a directory.
        subprocess.CalledProcessError: If the 'pod5' command exits with a non-zero status.
    """
    if isinstance(fast5_dir, (list, tuple)):
        # 1) Explicit list of FAST5 files
        fast5_paths = [str(Path(f)) for f in fast5_dir]
        if not fast5_paths:
            # Fail here rather than invoking the CLI with no inputs.
            raise FileNotFoundError("No FAST5 files provided")
    else:
        source = Path(fast5_dir)
        if source.is_file():
            # 2) Single file
            fast5_paths = [str(source)]
        elif source.is_dir():
            # 3) Directory: collect FAST5s, accepting the short '.f5' suffix as well.
            fast5_paths = sorted(
                str(entry)
                for entry in source.iterdir()
                if entry.is_file() and entry.suffix.lower() in ('.fast5', '.f5')
            )
            if not fast5_paths:
                raise FileNotFoundError(f"No FAST5 files found in {source}")
        else:
            raise FileNotFoundError(f"Input path invalid: {fast5_dir}")

    # Argument list (no shell), so paths containing spaces are passed through safely.
    cmd = ["pod5", "convert", "fast5", *fast5_paths, "--output", str(output_pod5)]
    subprocess.run(cmd, check=True)
124
+
125
def subsample_pod5(pod5_path, read_name_path, output_directory):
    """
    Takes a POD5 file and a text file containing read names of interest and writes out a subsampled POD5 for just those reads.
    This is a useful function when you have a list of read names that mapped to a region of interest that you want to reanalyze from the pod5 level.

    Parameters:
        pod5_path (str): File path to the POD5 file (or directory of multiple pod5 files) to subsample.
        read_name_path (str | os.PathLike | int): File path to a text file of read names. One read name per line
            (blank lines are ignored). If an int value is passed, that many reads are subsampled instead.
        output_directory (str): A file path to the directory to output the file. Created if it does not exist.

    Returns:
        None. A message is printed and nothing is written when pod5_path does not exist.

    Raises:
        TypeError: If read_name_path is neither a path nor an int.
        ValueError: If read_name_path is a negative int.
    """
    # Resolve the input into a list of pod5 files plus a base name for the output.
    if os.path.isdir(pod5_path):
        pod5_path_is_dir = True
        input_pod5_base = 'input_pod5s.pod5'
        pod5_files = sorted(
            os.path.join(pod5_path, name)
            for name in os.listdir(pod5_path)
            if name.endswith('.pod5')
        )
        print(f'Found input pod5s: {pod5_files}')
    elif os.path.exists(pod5_path):
        pod5_path_is_dir = False
        input_pod5_base = os.path.basename(pod5_path)
        pod5_files = [pod5_path]
    else:
        print('Error: pod5_path passed does not exist')
        return None

    # Validate the selector up front so an unsupported type fails clearly
    # before any file is opened. bool is excluded although it subclasses int.
    select_by_name = isinstance(read_name_path, (str, os.PathLike))
    select_by_count = isinstance(read_name_path, int) and not isinstance(read_name_path, bool)
    if not (select_by_name or select_by_count):
        raise TypeError(
            f'read_name_path must be a path to a text file or an int, got {type(read_name_path).__name__}'
        )
    if select_by_count and read_name_path < 0:
        raise ValueError('read_name_path must be non-negative when given as an int')

    pod5_stem = input_pod5_base.split('.pod5')[0]

    if select_by_name:
        input_read_name_base = os.path.basename(read_name_path)
        output_base = pod5_stem + '_' + input_read_name_base.split('.txt')[0] + '_subsampled.pod5'

        # extract read names into a list of strings, skipping blank lines
        with open(read_name_path, 'r') as file:
            read_names = [line.strip() for line in file if line.strip()]

        print(f'Looking for read_ids: {read_names}')
        read_records = []
        failure_message = 'Skipping pod5, could not find reads' if pod5_path_is_dir else 'Could not find reads'

        for input_pod5 in pod5_files:
            try:
                with p5.Reader(input_pod5) as reader:
                    # missing_ok=True: a file need not contain every requested read.
                    for read_record in reader.reads(selection=read_names, missing_ok=True):
                        read_records.append(read_record.to_read())
                        print(f'Found read in {input_pod5}: {read_record.read_id}')
            except Exception as err:
                # Best-effort: report the problem and carry on with the next file.
                print(f'{failure_message}: {err}')

    else:
        import random

        sample_size = read_name_path
        output_base = pod5_stem + f'_{sample_size}_randomly_subsampled.pod5'
        all_read_records = []

        if pod5_path_is_dir:
            # Visit the files in random order and stop as soon as enough reads are collected.
            random.shuffle(pod5_files)
            for input_pod5 in pod5_files:
                if len(all_read_records) >= sample_size:
                    break
                print(f'Opening pod5 file {input_pod5}')
                with p5.Reader(input_pod5) as reader:
                    for read_record in reader.reads():
                        all_read_records.append(read_record.to_read())
                        if len(all_read_records) >= sample_size:
                            break
        else:
            # Single file: load every read so the sample is drawn from the whole file.
            with p5.Reader(pod5_path) as reader:
                all_read_records = [read_record.to_read() for read_record in reader.reads()]

        if sample_size <= len(all_read_records):
            read_records = random.sample(all_read_records, sample_size)
        else:
            print('Trying to sample more reads than are contained in the input pod5s, taking all reads')
            read_records = all_read_records

    os.makedirs(output_directory, exist_ok=True)
    output_pod5 = os.path.join(output_directory, output_base)

    # Write the subsampled POD5
    with p5.Writer(output_pod5) as writer:
        writer.add_reads(read_records)
@@ -9,10 +9,13 @@ def run_multiqc(input_dir, output_dir):
9
9
  Returns:
10
10
  - None: The function executes MultiQC and prints the status.
11
11
  """
12
- import os
12
+ from ..readwrite import make_dirs
13
13
  import subprocess
14
14
  # Ensure the output directory exists
15
- os.makedirs(output_dir, exist_ok=True)
15
+ make_dirs(output_dir)
16
+
17
+ input_dir = str(input_dir)
18
+ output_dir = str(output_dir)
16
19
 
17
20
  # Construct MultiQC command
18
21
  command = ["multiqc", input_dir, "-o", output_dir]
@@ -166,7 +166,7 @@ def plot_spatial_autocorr_grid(
166
166
  ax.set_xlabel("Lag (bp)", fontsize=7)
167
167
  ax.tick_params(axis='both', which='major', labelsize=6)
168
168
  ax.grid(True, alpha=0.22)
169
- col_idx += 1
169
+ #col_idx += 1
170
170
  continue
171
171
 
172
172
  # mask low-support lags if counts available
@@ -417,7 +417,6 @@ def plot_spatial_autocorr_grid(
417
417
 
418
418
  return saved_pages
419
419
 
420
-
421
420
  def plot_rolling_metrics(df, out_png=None, title=None, figsize=(10, 3.5), dpi=160, show=False):
422
421
  """
423
422
  Plot NRL and SNR vs window center from the dataframe returned by rolling_autocorr_metrics.
@@ -608,4 +607,3 @@ def plot_rolling_grid(
608
607
  pages_by_metric[metric] = saved_pages
609
608
 
610
609
  return pages_by_metric
611
-