smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/helpers/bed_to_bigwig.py
@@ -1,39 +0,0 @@
- # bed_to_bigwig
-
- def bed_to_bigwig(fasta, bed):
-     """
-     Takes a bed file of reads and makes a bedgraph plus a bigwig
-
-     Parameters:
-         fasta (str): File path to the reference genome to align to.
-         bed (str): File path to the input bed.
-     Returns:
-         None
-     """
-     import os
-     import subprocess
-
-     bed_basename = os.path.basename(bed)
-     parent_dir = os.path.dirname(bed)
-     bed_basename_minus_suffix = bed_basename.split('.bed')[0]
-     fasta_basename = os.path.basename(fasta)
-     fasta_dir = os.path.dirname(fasta)
-     fasta_basename_minus_suffix = fasta_basename.split('.fa')[0]
-     chrom_basename = fasta_basename_minus_suffix + '.chrom.sizes'
-     chrom_path = os.path.join(fasta_dir, chrom_basename)
-     bedgraph_basename = bed_basename_minus_suffix + '_bedgraph.bedgraph'
-     bedgraph_output = os.path.join(parent_dir, bedgraph_basename)
-     bigwig_basename = bed_basename_minus_suffix + '_bigwig.bw'
-     bigwig_output = os.path.join(parent_dir, bigwig_basename)
-
-     # Make the bedgraph
-     with open(bedgraph_output, 'w') as outfile:
-         # Command as a list
-         command = ["bedtools", "genomecov", "-i", bed, "-g", chrom_path, "-bg"]
-         print(f'Making bedgraph from {bed_basename}')
-         subprocess.run(command, stdout=outfile)
-
-     # Make the bigwig
-     command = ["bedGraphToBigWig", bedgraph_output, chrom_path, bigwig_output]
-     print(f'Making bigwig from {bedgraph_basename}')
-     subprocess.run(command)
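
For reference, a minimal sketch of how this removed helper was invoked, assuming bedtools and the UCSC bedGraphToBigWig binary are on PATH and that a matching .chrom.sizes file sits next to the reference FASTA (both requirements follow from the code above). The import path reflects the pre-0.2.4 layout and the file names are hypothetical:

# Pre-0.2.4 import path (the helper lived under informatics/helpers).
from smftools.informatics.helpers.bed_to_bigwig import bed_to_bigwig

# Writes reads_bedgraph.bedgraph and reads_bigwig.bw next to reads.bed,
# reading chromosome sizes from genome.chrom.sizes next to genome.fa.
bed_to_bigwig(fasta="genome.fa", bed="reads.bed")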
smftools/informatics/helpers/concatenate_fastqs_to_bam.py
@@ -1,378 +0,0 @@
- # concatenate_fastqs_to_bam
-
- def concatenate_fastqs_to_bam(
-     fastq_files,
-     output_bam,
-     barcode_tag='BC',
-     gzip_suffixes=('.gz',),
-     barcode_map=None,
-     add_read_group=True,
-     rg_sample_field=None,
-     progress=True,
-     auto_pair=True,
- ):
-     """
-     Concatenate FASTQ(s) into an unaligned BAM. Supports single-end and paired-end (auto-detect or explicit).
-
-     Parameters
-     ----------
-     fastq_files : list[str] or list[(str,str)]
-         If list of tuples: each tuple is (R1_path, R2_path).
-         If list of strings and auto_pair=True: the function will attempt to automatically pair files.
-     output_bam : str
-         Path to output BAM (will be overwritten).
-     barcode_tag : str
-         SAM tag used for barcode (default 'BC').
-     gzip_suffixes : tuple
-         Compressed suffixes to consider (default ('.gz',)).
-     barcode_map : dict or None
-         Optional mapping {path: barcode} to override automatic extraction.
-     add_read_group : bool
-         If True, add RG entries and set RG tag per-read (ID = barcode).
-     rg_sample_field : str or None
-         If set, includes SM field in RG header entries.
-     progress : bool
-         Show tqdm progress bar.
-     auto_pair : bool
-         If True and `fastq_files` is a list of strings, attempt to auto-pair R1/R2 by filename patterns.
-
-     Returns
-     -------
-     dict
-         Summary: {'total_reads', 'per_file_counts', 'paired_count', 'unpaired_count', 'barcodes'}
-     """
-     import os
-     import re
-     import gzip
-     from itertools import zip_longest
-     from Bio import SeqIO
-     import pysam
-     from tqdm import tqdm
-
-     # ---------- helpers ----------
-     def _is_gz(path):
-         pl = path.lower()
-         return any(pl.endswith(suf) for suf in gzip_suffixes)
-
-     def _strip_fastq_ext(basn):
-         # remove .fastq.gz .fq.gz .fastq .fq
-         for ext in ('.fastq.gz', '.fq.gz', '.fastq', '.fq'):
-             if basn.lower().endswith(ext):
-                 return basn[:-len(ext)]
-         # fallback remove last suffix
-         return os.path.splitext(basn)[0]
-
-     def _extract_barcode_from_filename(path):
-         # heuristic: barcode is last underscore-separated token in filename (before ext)
-         stem = _strip_fastq_ext(os.path.basename(path))
-         if '_' in stem:
-             token = stem.split('_')[-1]
-             if token:
-                 return token
-         # fallback to whole stem
-         return stem
-
-     # pairing heuristics: try to identify suffix that marks read number
-     def _classify_read_token(stem):
-         # returns (prefix, readnum) if matches, else (None, None)
-         patterns = [
-             r'(?i)(.*?)[._-]r?([12])$',  # prefix_R1 or prefix.r1 or prefix-1
-             r'(?i)(.*?)[._-]read[_-]?([12])$',
-             r'(?i)(.*?)[/_]([12])$',  # sometimes /1 is used (rare in filenames)
-         ]
-         for pat in patterns:
-             m = re.match(pat, stem)
-             if m:
-                 prefix = m.group(1)
-                 num = m.group(2)
-                 return prefix, int(num)
-         return None, None
-
-     def pair_by_filename(paths):
-         # paths: list of strings
-         map_pref = {}  # prefix -> {1: path, 2: path, 'orphans': [..]}
-         unpaired = []
-         for p in paths:
-             name = os.path.basename(p)
-             stem = _strip_fastq_ext(name)
-             pref, num = _classify_read_token(stem)
-             if pref is not None:
-                 entry = map_pref.setdefault(pref, {})
-                 entry[num] = p
-             else:
-                 # try fallback: split by last underscore or dot and check last token is 1/2 or R1/R2
-                 toks = re.split(r'[_\.]', stem)
-                 if toks and toks[-1] in ('1', '2', 'R1', 'R2', 'r1', 'r2'):
-                     last = toks[-1]
-                     basepref = "_".join(toks[:-1]) if len(toks) > 1 else toks[0]
-                     num = 1 if last.endswith('1') else 2
-                     entry = map_pref.setdefault(basepref, {})
-                     entry[num] = p
-                 else:
-                     unpaired.append(p)
-         pairs = []
-         leftovers = []
-         for k, d in map_pref.items():
-             if 1 in d and 2 in d:
-                 pairs.append((d[1], d[2]))
-             else:
-                 # put whoever exists into leftovers
-                 leftovers.extend([v for kk, v in d.items()])
-         # append also unpaired
-         leftovers.extend(unpaired)
-         return pairs, leftovers
-
-     # ---------- normalize input ----------
-     explicit_pairs = []
-     singles = []
-     if not isinstance(fastq_files, (list, tuple)):
-         raise ValueError("fastq_files must be a list of paths or list of (R1,R2) tuples.")
-
-     # mixture: if user supplied tuples -> treat as explicit pairs
-     if all(isinstance(x, (list, tuple)) and len(x) == 2 for x in fastq_files):
-         explicit_pairs = [(str(a), str(b)) for a, b in fastq_files]
-     else:
-         # flatten and coerce to strings, ignore None
-         paths = [str(x) for x in fastq_files if x is not None]
-         if auto_pair:
-             explicit_pairs, leftovers = pair_by_filename(paths)
-             singles = leftovers
-         else:
-             singles = paths
-
-     # Build barcode map and ordered barcodes
-     barcode_map = barcode_map or {}
-     per_path_barcode = {}
-     barcodes_in_order = []
-
-     # pairs: assign barcode per pair from either provided barcode_map for first file or from filenames
-     for r1, r2 in explicit_pairs:
-         bc = barcode_map.get(r1) or barcode_map.get(r2) or _extract_barcode_from_filename(r1)
-         per_path_barcode[r1] = bc
-         per_path_barcode[r2] = bc
-         if bc not in barcodes_in_order:
-             barcodes_in_order.append(bc)
-     for p in singles:
-         bc = barcode_map.get(p) or _extract_barcode_from_filename(p)
-         per_path_barcode[p] = bc
-         if bc not in barcodes_in_order:
-             barcodes_in_order.append(bc)
-
-     # prepare BAM header
-     header = {"HD": {"VN": "1.0"}, "SQ": []}
-     if add_read_group:
-         rg_list = []
-         for bc in barcodes_in_order:
-             rg = {"ID": bc}
-             if rg_sample_field:
-                 rg["SM"] = rg_sample_field
-             rg_list.append(rg)
-         header["RG"] = rg_list
-
-     # ---------- write BAM ----------
-     per_file_counts = {}
-     total_written = 0
-     paired_count = 0
-     unpaired_count = 0
-
-     def _open_fh(path):
-         return gzip.open(path, "rt") if _is_gz(path) else open(path, "rt")
-
-     with pysam.AlignmentFile(output_bam, "wb", header=header) as bam_out:
-         # process paired files first
-         seq_iter = list(explicit_pairs)  # list of (r1,r2)
-         if progress:
-             seq_iter = tqdm(seq_iter, desc="Paired FASTQ->BAM")
-         for r1_path, r2_path in seq_iter:
-             if not (os.path.exists(r1_path) and os.path.exists(r2_path)):
-                 raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
-             bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
-             # open both and iterate in parallel
-             with _open_fh(r1_path) as fh1, _open_fh(r2_path) as fh2:
-                 it1 = SeqIO.parse(fh1, "fastq")
-                 it2 = SeqIO.parse(fh2, "fastq")
-                 # iterate in lockstep; if one shorter we still write remaining as unpaired (zip_longest)
-                 for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
-                     # determine a common read name
-                     if rec1 is not None:
-                         id1 = rec1.id
-                     else:
-                         id1 = None
-                     if rec2 is not None:
-                         id2 = rec2.id
-                     else:
-                         id2 = None
-                     # try to derive a common name (strip /1 or /2 if present)
-                     def _strip_end_id(s):
-                         if s is None:
-                             return None
-                         return re.sub(r'(?:/1$|/2$|\s[12]$)', '', s)
-                     common_name = _strip_end_id(id1) or _strip_end_id(id2) or (id1 or id2)
-
-                     # create AlignedSegment for read1
-                     if rec1 is not None:
-                         a1 = pysam.AlignedSegment()
-                         a1.query_name = common_name
-                         a1.query_sequence = str(rec1.seq)
-                         a1.is_paired = True
-                         a1.is_read1 = True
-                         a1.is_read2 = False
-                         a1.is_unmapped = True
-                         a1.mate_is_unmapped = True
-                         # reference fields for unmapped
-                         a1.reference_id = -1
-                         a1.reference_start = -1
-                         a1.next_reference_id = -1
-                         a1.next_reference_start = -1
-                         a1.template_length = 0
-                         # qualities
-                         if "phred_quality" in rec1.letter_annotations:
-                             try:
-                                 a1.query_qualities = [int(x) for x in rec1.letter_annotations["phred_quality"]]
-                             except Exception:
-                                 a1.query_qualities = None
-                         # tags
-                         a1.set_tag(barcode_tag, str(bc), value_type='Z')
-                         if add_read_group:
-                             a1.set_tag("RG", str(bc), value_type='Z')
-                         bam_out.write(a1)
-                         per_file_counts.setdefault(r1_path, 0)
-                         per_file_counts[r1_path] += 1
-                         total_written += 1
-                     # create AlignedSegment for read2
-                     if rec2 is not None:
-                         a2 = pysam.AlignedSegment()
-                         a2.query_name = common_name
-                         a2.query_sequence = str(rec2.seq)
-                         a2.is_paired = True
-                         a2.is_read1 = False
-                         a2.is_read2 = True
-                         a2.is_unmapped = True
-                         a2.mate_is_unmapped = True
-                         a2.reference_id = -1
-                         a2.reference_start = -1
-                         a2.next_reference_id = -1
-                         a2.next_reference_start = -1
-                         a2.template_length = 0
-                         if "phred_quality" in rec2.letter_annotations:
-                             try:
-                                 a2.query_qualities = [int(x) for x in rec2.letter_annotations["phred_quality"]]
-                             except Exception:
-                                 a2.query_qualities = None
-                         a2.set_tag(barcode_tag, str(bc), value_type='Z')
-                         if add_read_group:
-                             a2.set_tag("RG", str(bc), value_type='Z')
-                         bam_out.write(a2)
-                         per_file_counts.setdefault(r2_path, 0)
-                         per_file_counts[r2_path] += 1
-                         total_written += 1
-                     # count paired/unpaired bookkeeping
-                     if rec1 is not None and rec2 is not None:
-                         paired_count += 1
-                     else:
-                         # one side missing -> counted as unpaired for whichever exists
-                         if rec1 is not None:
-                             unpaired_count += 1
-                         if rec2 is not None:
-                             unpaired_count += 1
-
-         # process singletons
-         single_iter = list(singles)
-         if progress:
-             single_iter = tqdm(single_iter, desc="Single FASTQ->BAM")
-         for p in single_iter:
-             if not os.path.exists(p):
-                 raise FileNotFoundError(p)
-             bc = per_path_barcode.get(p, "barcode")
-             with _open_fh(p) as fh:
-                 for rec in SeqIO.parse(fh, "fastq"):
-                     a = pysam.AlignedSegment()
-                     a.query_name = rec.id
-                     a.query_sequence = str(rec.seq)
-                     a.is_paired = False
-                     a.is_read1 = False
-                     a.is_read2 = False
-                     a.is_unmapped = True
-                     a.mate_is_unmapped = True
-                     a.reference_id = -1
-                     a.reference_start = -1
-                     a.next_reference_id = -1
-                     a.next_reference_start = -1
-                     a.template_length = 0
-                     if "phred_quality" in rec.letter_annotations:
-                         try:
-                             a.query_qualities = [int(x) for x in rec.letter_annotations["phred_quality"]]
-                         except Exception:
-                             a.query_qualities = None
-                     a.set_tag(barcode_tag, str(bc), value_type='Z')
-                     if add_read_group:
-                         a.set_tag("RG", str(bc), value_type='Z')
-                     bam_out.write(a)
-                     per_file_counts.setdefault(p, 0)
-                     per_file_counts[p] += 1
-                     total_written += 1
-                     unpaired_count += 1
-
-     summary = {
-         "total_reads": total_written,
-         "per_file": per_file_counts,
-         "paired_pairs_written": paired_count,
-         "singletons_written": unpaired_count,
-         "barcodes": barcodes_in_order
-     }
-     return summary
-
-
- # def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_suffix='.gz'):
- #     """
- #     Concatenate multiple demultiplexed FASTQ (.fastq or .fq) files into an unaligned BAM and add the FASTQ barcode suffix to the BC tag.
-
- #     Parameters:
- #         fastq_files (list): List of paths to demultiplexed FASTQ files.
- #         output_bam (str): Path to the output BAM file.
- #         barcode_tag (str): The SAM tag for storing the barcode (default: 'BC').
- #         gzip_suffix (str): Suffix to use for input gzip files (Defaul: '.gz')
-
- #     Returns:
- #         None
- #     """
- #     import os
- #     import pysam
- #     import gzip
- #     from Bio import SeqIO
- #     from tqdm import tqdm
-
- #     n_fastqs = len(fastq_files)
-
- #     with pysam.AlignmentFile(output_bam, "wb", header={"HD": {"VN": "1.0"}, "SQ": []}) as bam_out:
- #         for fastq_file in tqdm(fastq_files, desc="Processing FASTQ files"):
- #             # Extract barcode from the FASTQ filename (handles .fq, .fastq, .fq.gz, and .fastq.gz extensions)
- #             base_name = os.path.basename(fastq_file)
- #             if n_fastqs > 1:
- #                 if base_name.endswith('.fastq.gz'):
- #                     barcode = base_name.split('_')[-1].replace(f'.fastq{gzip_suffix}', '')
- #                 elif base_name.endswith('.fq.gz'):
- #                     barcode = base_name.split('_')[-1].replace(f'.fq{gzip_suffix}', '')
- #                 elif base_name.endswith('.fastq'):
- #                     barcode = base_name.split('_')[-1].replace('.fastq', '')
- #                 elif base_name.endswith('.fq'):
- #                     barcode = base_name.split('_')[-1].replace('.fq', '')
- #                 else:
- #                     raise ValueError(f"Unexpected file extension for {fastq_file}. Only .fq, .fastq, .fq{gzip_suffix}, and .fastq{gzip_suffix} are supported.")
- #             else:
- #                 barcode = 'barcode0'
-
- #             # Read the FASTQ file (handle gzipped and non-gzipped files)
- #             open_func = gzip.open if fastq_file.endswith(gzip_suffix) else open
- #             with open_func(fastq_file, 'rt') as fq_in:
- #                 for record in SeqIO.parse(fq_in, 'fastq'):
- #                     # Create an unaligned BAM entry for each FASTQ record
- #                     aln = pysam.AlignedSegment()
- #                     aln.query_name = record.id
- #                     aln.query_sequence = str(record.seq)
- #                     aln.flag = 4  # Unmapped
- #                     aln.query_qualities = pysam.qualitystring_to_array(record.letter_annotations["phred_quality"])
- #                     # Add the barcode to the BC tag
- #                     aln.set_tag(barcode_tag, barcode)
- #                     # Write to BAM file
- #                     bam_out.write(aln)
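
For reference, a usage sketch of the removed function under its pre-0.2.4 import path, assuming Biopython, pysam, and tqdm are installed. The file names are hypothetical; with auto_pair=True the _R1/_R2 names below would be paired by the filename heuristics shown above:

# Pre-0.2.4 import path (the helper lived under informatics/helpers).
from smftools.informatics.helpers.concatenate_fastqs_to_bam import concatenate_fastqs_to_bam

summary = concatenate_fastqs_to_bam(
    fastq_files=["sampleA_R1.fastq.gz", "sampleA_R2.fastq.gz", "sampleB.fastq"],
    output_bam="unaligned.bam",
    rg_sample_field="run1",  # adds an SM field to each RG header entry
)
# Keys taken from the summary dict built at the end of the function above.
print(summary["total_reads"], summary["barcodes"])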
smftools/informatics/helpers/discover_input_files.py
@@ -1,100 +0,0 @@
- from pathlib import Path
- from typing import Dict, List, Any, Tuple
-
- def discover_input_files(
-     input_data_path: str,
-     bam_suffix: str = ".bam",
-     recursive: bool = False,
-     follow_symlinks: bool = False,
- ) -> Dict[str, Any]:
-     """
-     Discover input files under `input_data_path`.
-
-     Returns a dict with:
-       - pod5_paths, fast5_paths, fastq_paths, bam_paths (lists of str)
-       - input_is_pod5, input_is_fast5, input_is_fastq, input_is_bam (bools)
-       - all_files_searched (int)
-     Behavior:
-       - If `input_data_path` is a file, returns that single file categorized.
-       - If it is a directory, scans either immediate children (recursive=False)
-         or entire tree (recursive=True). Uses Path.suffixes to detect .fastq.gz etc.
-     """
-     p = Path(input_data_path)
-     pod5_exts = {".pod5", ".p5"}
-     fast5_exts = {".fast5", ".f5"}
-     fastq_exts = {".fastq", ".fq", ".fastq.gz", ".fq.gz", ".fastq.xz", ".fq.xz"}
-     # normalize bam suffix with leading dot
-     if not bam_suffix.startswith("."):
-         bam_suffix = "." + bam_suffix
-     bam_suffix = bam_suffix.lower()
-
-     pod5_paths: List[str] = []
-     fast5_paths: List[str] = []
-     fastq_paths: List[str] = []
-     bam_paths: List[str] = []
-     other_paths: List[str] = []
-
-     def _file_ext_key(pp: Path) -> str:
-         # join suffixes to handle .fastq.gz
-         return "".join(pp.suffixes).lower() if pp.suffixes else pp.suffix.lower()
-
-     if p.exists() and p.is_file():
-         ext_key = _file_ext_key(p)
-         if ext_key in pod5_exts:
-             pod5_paths.append(str(p))
-         elif ext_key in fast5_exts:
-             fast5_paths.append(str(p))
-         elif ext_key in fastq_exts:
-             fastq_paths.append(str(p))
-         elif ext_key == bam_suffix:
-             bam_paths.append(str(p))
-         else:
-             other_paths.append(str(p))
-         total_searched = 1
-     elif p.exists() and p.is_dir():
-         if recursive:
-             iterator = p.rglob("*")
-         else:
-             iterator = p.iterdir()
-         total_searched = 0
-         for fp in iterator:
-             if not fp.is_file():
-                 continue
-             total_searched += 1
-             ext_key = _file_ext_key(fp)
-             if ext_key in pod5_exts:
-                 pod5_paths.append(str(fp))
-             elif ext_key in fast5_exts:
-                 fast5_paths.append(str(fp))
-             elif ext_key in fastq_exts:
-                 fastq_paths.append(str(fp))
-             elif ext_key == bam_suffix:
-                 bam_paths.append(str(fp))
-             else:
-                 # additional heuristic: check filename contains extension fragments (.pod5 etc)
-                 name = fp.name.lower()
-                 if any(e in name for e in pod5_exts):
-                     pod5_paths.append(str(fp))
-                 elif any(e in name for e in fast5_exts):
-                     fast5_paths.append(str(fp))
-                 elif any(e in name for e in [".fastq", ".fq"]):
-                     fastq_paths.append(str(fp))
-                 elif name.endswith(bam_suffix):
-                     bam_paths.append(str(fp))
-                 else:
-                     other_paths.append(str(fp))
-     else:
-         raise FileNotFoundError(f"input_data_path does not exist: {input_data_path}")
-
-     return {
-         "pod5_paths": sorted(pod5_paths),
-         "fast5_paths": sorted(fast5_paths),
-         "fastq_paths": sorted(fastq_paths),
-         "bam_paths": sorted(bam_paths),
-         "other_paths": sorted(other_paths),
-         "input_is_pod5": len(pod5_paths) > 0,
-         "input_is_fast5": len(fast5_paths) > 0,
-         "input_is_fastq": len(fastq_paths) > 0,
-         "input_is_bam": len(bam_paths) > 0,
-         "all_files_searched": total_searched,
-     }
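
For reference, a usage sketch of the removed discovery helper under its pre-0.2.4 import path (a successor module appears as smftools/config/discover_input_files.py, entry 15 in the file list above). The directory name is hypothetical:

# Pre-0.2.4 import path (the helper lived under informatics/helpers).
from smftools.informatics.helpers.discover_input_files import discover_input_files

found = discover_input_files("runs/run1", bam_suffix=".bam", recursive=True)
if found["input_is_pod5"]:
    print(len(found["pod5_paths"]), "POD5 files of", found["all_files_searched"], "scanned")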
smftools/informatics/helpers/index_fasta.py
@@ -1,12 +0,0 @@
- # index_fasta
-
- def index_fasta(fasta):
-     """
-     Generate a FASTA index file for an input fasta.
-
-     Parameters:
-         fasta (str): Path to the input fasta to make an index file for.
-     """
-     import subprocess
-
-     subprocess.run(["samtools", "faidx", fasta])
smftools/informatics/helpers/make_dirs.py
@@ -1,21 +0,0 @@
- ## make_dirs
-
- # General
- def make_dirs(directories):
-     """
-     Takes a list of file paths and makes new directories if the directory does not already exist.
-
-     Parameters:
-         directories (list): A list of directories to make
-
-     Returns:
-         None
-     """
-     import os
-
-     for directory in directories:
-         if not os.path.isdir(directory):
-             os.mkdir(directory)
-             print(f"Directory '{directory}' created successfully.")
-         else:
-             print(f"Directory '{directory}' already exists.")
smftools/informatics/readwrite.py
@@ -1,106 +0,0 @@
- ## readwrite ##
-
- ######################################################################################################
- ## Datetime functionality
- def date_string():
-     """
-     Each time this is called, it returns the current date string
-     """
-     from datetime import datetime
-     current_date = datetime.now()
-     date_string = current_date.strftime("%Y%m%d")
-     date_string = date_string[2:]
-     return date_string
-
- def time_string():
-     """
-     Each time this is called, it returns the current time string
-     """
-     from datetime import datetime
-     current_time = datetime.now()
-     return current_time.strftime("%H:%M:%S")
- ######################################################################################################
-
- ######################################################################################################
- ## Numpy, Pandas, Anndata functionality
- def adata_to_df(adata, layer=None):
-     """
-     Input: An adata object with a specified layer.
-     Output: A dataframe for the specific layer.
-     """
-     import pandas as pd
-     import anndata as ad
-
-     # Extract the data matrix from the given layer
-     if layer:
-         data_matrix = adata.layers[layer]
-     else:
-         data_matrix = adata.X
-     # Extract observation (read) annotations
-     obs_df = adata.obs
-     # Extract variable (position) annotations
-     var_df = adata.var
-     # Convert data matrix and annotations to pandas DataFrames
-     df = pd.DataFrame(data_matrix, index=obs_df.index, columns=var_df.index)
-     return df
-
- def save_matrix(matrix, save_name):
-     """
-     Input: A numpy matrix and a save_name
-     Output: A txt file representation of the data matrix
-     """
-     import numpy as np
-     np.savetxt(f'{save_name}.txt', matrix)
-
- def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
-     """
-     Concatenate all h5ad files in a directory and delete them after the final adata is written out.
-     Input: an output file path relative to the directory in which the function is called
-     """
-     import os
-     import anndata as ad
-     # Runtime warnings
-     import warnings
-     warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
-     warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
-
-     # List all files in the directory
-     files = os.listdir(os.getcwd())
-     # get current working directory
-     cwd = os.getcwd()
-     suffix = file_suffix
-     # Filter file names that contain the search string in their filename and keep them in a list
-     hdfs = [hdf for hdf in files if suffix in hdf]
-     # Sort file list by names and print the list of file names
-     hdfs.sort()
-     print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
-     # Iterate over all of the hdf5 files and concatenate them.
-     final_adata = None
-     for hdf in hdfs:
-         print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
-         temp_adata = ad.read_h5ad(hdf)
-         if final_adata:
-             print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
-             final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
-         else:
-             print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
-             final_adata = temp_adata
-     print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
-     final_adata.write_h5ad(output_file, compression='gzip')
-
-     # Delete the individual h5ad files and only keep the final concatenated file
-     if delete_inputs:
-         files = os.listdir(os.getcwd())
-         hdfs = [hdf for hdf in files if suffix in hdf]
-         if output_file in hdfs:
-             hdfs.remove(output_file)
-         # Iterate over the files and delete them
-         for hdf in hdfs:
-             try:
-                 os.remove(hdf)
-                 print(f"Deleted file: {hdf}")
-             except OSError as e:
-                 print(f"Error deleting file {hdf}: {e}")
-     else:
-         print('Keeping input files')
- ######################################################################################################