smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. smftools/__init__.py +34 -0
  2. smftools/_settings.py +20 -0
  3. smftools/_version.py +1 -0
  4. smftools/cli.py +184 -0
  5. smftools/config/__init__.py +1 -0
  6. smftools/config/conversion.yaml +33 -0
  7. smftools/config/deaminase.yaml +56 -0
  8. smftools/config/default.yaml +253 -0
  9. smftools/config/direct.yaml +17 -0
  10. smftools/config/experiment_config.py +1191 -0
  11. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  12. smftools/datasets/F1_sample_sheet.csv +5 -0
  13. smftools/datasets/__init__.py +9 -0
  14. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  15. smftools/datasets/datasets.py +28 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/hmm/apply_hmm_batched.py +242 -0
  19. smftools/hmm/calculate_distances.py +18 -0
  20. smftools/hmm/call_hmm_peaks.py +106 -0
  21. smftools/hmm/display_hmm.py +18 -0
  22. smftools/hmm/hmm_readwrite.py +16 -0
  23. smftools/hmm/nucleosome_hmm_refinement.py +104 -0
  24. smftools/hmm/train_hmm.py +78 -0
  25. smftools/informatics/__init__.py +14 -0
  26. smftools/informatics/archived/bam_conversion.py +59 -0
  27. smftools/informatics/archived/bam_direct.py +63 -0
  28. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  29. smftools/informatics/archived/conversion_smf.py +132 -0
  30. smftools/informatics/archived/deaminase_smf.py +132 -0
  31. smftools/informatics/archived/direct_smf.py +137 -0
  32. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  33. smftools/informatics/basecall_pod5s.py +80 -0
  34. smftools/informatics/fast5_to_pod5.py +24 -0
  35. smftools/informatics/helpers/__init__.py +73 -0
  36. smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
  37. smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
  38. smftools/informatics/helpers/archived/informatics.py +260 -0
  39. smftools/informatics/helpers/archived/load_adata.py +516 -0
  40. smftools/informatics/helpers/bam_qc.py +66 -0
  41. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  42. smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
  43. smftools/informatics/helpers/canoncall.py +34 -0
  44. smftools/informatics/helpers/complement_base_list.py +21 -0
  45. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
  46. smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
  47. smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
  48. smftools/informatics/helpers/count_aligned_reads.py +43 -0
  49. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  50. smftools/informatics/helpers/discover_input_files.py +100 -0
  51. smftools/informatics/helpers/extract_base_identities.py +70 -0
  52. smftools/informatics/helpers/extract_mods.py +83 -0
  53. smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
  54. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  55. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  56. smftools/informatics/helpers/find_conversion_sites.py +51 -0
  57. smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
  58. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  59. smftools/informatics/helpers/get_native_references.py +28 -0
  60. smftools/informatics/helpers/index_fasta.py +12 -0
  61. smftools/informatics/helpers/make_dirs.py +21 -0
  62. smftools/informatics/helpers/make_modbed.py +27 -0
  63. smftools/informatics/helpers/modQC.py +27 -0
  64. smftools/informatics/helpers/modcall.py +36 -0
  65. smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
  66. smftools/informatics/helpers/ohe_batching.py +76 -0
  67. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  68. smftools/informatics/helpers/one_hot_decode.py +27 -0
  69. smftools/informatics/helpers/one_hot_encode.py +57 -0
  70. smftools/informatics/helpers/plot_bed_histograms.py +269 -0
  71. smftools/informatics/helpers/run_multiqc.py +28 -0
  72. smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
  73. smftools/informatics/helpers/split_and_index_BAM.py +32 -0
  74. smftools/informatics/readwrite.py +106 -0
  75. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  76. smftools/informatics/subsample_pod5.py +104 -0
  77. smftools/load_adata.py +1346 -0
  78. smftools/machine_learning/__init__.py +12 -0
  79. smftools/machine_learning/data/__init__.py +2 -0
  80. smftools/machine_learning/data/anndata_data_module.py +234 -0
  81. smftools/machine_learning/data/preprocessing.py +6 -0
  82. smftools/machine_learning/evaluation/__init__.py +2 -0
  83. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  84. smftools/machine_learning/evaluation/evaluators.py +223 -0
  85. smftools/machine_learning/inference/__init__.py +3 -0
  86. smftools/machine_learning/inference/inference_utils.py +27 -0
  87. smftools/machine_learning/inference/lightning_inference.py +68 -0
  88. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  89. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  90. smftools/machine_learning/models/__init__.py +9 -0
  91. smftools/machine_learning/models/base.py +295 -0
  92. smftools/machine_learning/models/cnn.py +138 -0
  93. smftools/machine_learning/models/lightning_base.py +345 -0
  94. smftools/machine_learning/models/mlp.py +26 -0
  95. smftools/machine_learning/models/positional.py +18 -0
  96. smftools/machine_learning/models/rnn.py +17 -0
  97. smftools/machine_learning/models/sklearn_models.py +273 -0
  98. smftools/machine_learning/models/transformer.py +303 -0
  99. smftools/machine_learning/models/wrappers.py +20 -0
  100. smftools/machine_learning/training/__init__.py +2 -0
  101. smftools/machine_learning/training/train_lightning_model.py +135 -0
  102. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  103. smftools/machine_learning/utils/__init__.py +2 -0
  104. smftools/machine_learning/utils/device.py +10 -0
  105. smftools/machine_learning/utils/grl.py +14 -0
  106. smftools/plotting/__init__.py +18 -0
  107. smftools/plotting/autocorrelation_plotting.py +611 -0
  108. smftools/plotting/classifiers.py +355 -0
  109. smftools/plotting/general_plotting.py +682 -0
  110. smftools/plotting/hmm_plotting.py +260 -0
  111. smftools/plotting/position_stats.py +462 -0
  112. smftools/plotting/qc_plotting.py +270 -0
  113. smftools/preprocessing/__init__.py +38 -0
  114. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  115. smftools/preprocessing/append_base_context.py +122 -0
  116. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  117. smftools/preprocessing/archives/mark_duplicates.py +146 -0
  118. smftools/preprocessing/archives/preprocessing.py +614 -0
  119. smftools/preprocessing/archives/remove_duplicates.py +21 -0
  120. smftools/preprocessing/binarize_on_Youden.py +45 -0
  121. smftools/preprocessing/binary_layers_to_ohe.py +40 -0
  122. smftools/preprocessing/calculate_complexity.py +72 -0
  123. smftools/preprocessing/calculate_complexity_II.py +248 -0
  124. smftools/preprocessing/calculate_consensus.py +47 -0
  125. smftools/preprocessing/calculate_coverage.py +51 -0
  126. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  127. smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
  128. smftools/preprocessing/calculate_position_Youden.py +115 -0
  129. smftools/preprocessing/calculate_read_length_stats.py +79 -0
  130. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  131. smftools/preprocessing/clean_NaN.py +62 -0
  132. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  133. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  134. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  135. smftools/preprocessing/flag_duplicate_reads.py +1351 -0
  136. smftools/preprocessing/invert_adata.py +37 -0
  137. smftools/preprocessing/load_sample_sheet.py +53 -0
  138. smftools/preprocessing/make_dirs.py +21 -0
  139. smftools/preprocessing/min_non_diagonal.py +25 -0
  140. smftools/preprocessing/recipes.py +127 -0
  141. smftools/preprocessing/subsample_adata.py +58 -0
  142. smftools/readwrite.py +1004 -0
  143. smftools/tools/__init__.py +20 -0
  144. smftools/tools/archived/apply_hmm.py +202 -0
  145. smftools/tools/archived/classifiers.py +787 -0
  146. smftools/tools/archived/classify_methylated_features.py +66 -0
  147. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  148. smftools/tools/archived/subset_adata_v1.py +32 -0
  149. smftools/tools/archived/subset_adata_v2.py +46 -0
  150. smftools/tools/calculate_umap.py +62 -0
  151. smftools/tools/cluster_adata_on_methylation.py +105 -0
  152. smftools/tools/general_tools.py +69 -0
  153. smftools/tools/position_stats.py +601 -0
  154. smftools/tools/read_stats.py +184 -0
  155. smftools/tools/spatial_autocorrelation.py +562 -0
  156. smftools/tools/subset_adata.py +28 -0
  157. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
  158. smftools-0.2.1.dist-info/RECORD +161 -0
  159. smftools-0.2.1.dist-info/entry_points.txt +2 -0
  160. smftools-0.1.6.dist-info/RECORD +0 -4
  161. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
  162. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,172 @@
1
def binarize_converted_base_identities(base_identities, strand, modification_type, bam, device='cpu', deaminase_footprinting=False, mismatch_trend_per_read=None, on_missing="nan"):
    """
    Efficiently binarizes conversion SMF data within a sequence string using NumPy arrays.

    Parameters:
        base_identities (dict): A dictionary returned by extract_base_identities. Keyed by read name. Points to a list of base identities.
        strand (str): A string indicating which strand was converted in the experiment (options are 'top' and 'bottom').
        modification_type (str): A string indicating the modification type of interest (options are '5mC' and '6mA').
        bam (str): The bam file path
        device (str): Unused here; kept for interface compatibility with callers.
        deaminase_footprinting (bool): Whether direct deaminase footprinting chemistry was used.
        mismatch_trend_per_read (dict or None): For deaminase footprinting, indicates the type of conversion relative to the top strand reference for each read. (C->T or G->A if bottom strand was converted)
        on_missing (str): Error handling if a read is missing a valid trend: 'nan' (default) fills the read with NaN, 'error' raises KeyError.

    Returns:
        dict: A dictionary where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site without methylation info.
             If deaminase_footprinting, 1 represents deaminated sites, while 0 represents non-deaminated sites.

    Raises:
        KeyError: If on_missing='error' and a read lacks a valid mismatch trend (deaminase mode only).
        ValueError: If (strand, modification_type) is not a supported combination (non-deaminase mode).
    """
    import numpy as np

    # Bug fix: the default used to be a shared mutable dict ({}), which also made
    # this None-check dead code. Use the None sentinel idiom instead.
    if mismatch_trend_per_read is None:
        mismatch_trend_per_read = {}

    # Fast path: unconverted reads carry no methylation signal unless deaminase chemistry is used.
    if modification_type == "unconverted" and not deaminase_footprinting:
        return {k: np.full(len(v), np.nan, dtype=np.float32) for k, v in base_identities.items()}

    out = {}

    if deaminase_footprinting:
        valid_trends = {"C->T", "G->A"}

        for read_id, bases in base_identities.items():
            trend_raw = mismatch_trend_per_read.get(read_id)
            if trend_raw is None:
                if on_missing == "error":
                    raise KeyError(f"Missing mismatch trend for read '{read_id}'")
                # Default ('nan'): mark the whole read as uninformative.
                out[read_id] = np.full(len(bases), np.nan, dtype=np.float32)
                continue

            # Normalize user-supplied trend strings (e.g. 'c -> t' -> 'C->T').
            trend = trend_raw.replace(" ", "").upper()
            if trend not in valid_trends:
                if on_missing == "error":
                    raise KeyError(
                        f"Invalid mismatch trend '{trend_raw}' for read '{read_id}'. "
                        f"Expected one of {sorted(valid_trends)}"
                    )
                out[read_id] = np.full(len(bases), np.nan, dtype=np.float32)
                continue

            arr = np.asarray(bases, dtype="<U1")
            res = np.full(arr.shape, np.nan, dtype=np.float32)

            if trend == "C->T":
                # C (unconverted) -> 0, T (converted/deaminated) -> 1
                res[arr == "C"] = 0.0
                res[arr == "T"] = 1.0
            else:  # "G->A"
                res[arr == "G"] = 0.0
                res[arr == "A"] = 1.0

            out[read_id] = res

        return out

    # Non-deaminase mapping (bisulfite-style for 5mC; 6mA mapping is protocol dependent)
    bin_maps = {
        ("top", "5mC"): {"C": 1.0, "T": 0.0},
        ("bottom", "5mC"): {"G": 1.0, "A": 0.0},
        ("top", "6mA"): {"A": 1.0, "G": 0.0},
        ("bottom", "6mA"): {"T": 1.0, "C": 0.0},
    }
    key = (strand, modification_type)
    if key not in bin_maps:
        raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")

    base_map = bin_maps[key]

    for read_id, bases in base_identities.items():
        arr = np.asarray(bases, dtype="<U1")
        res = np.full(arr.shape, np.nan, dtype=np.float32)
        # mask-assign; unknown characters (N, -, etc.) remain NaN
        for b, v in base_map.items():
            res[arr == b] = v
        out[read_id] = res

    return out
87
+
88
+ # if mismatch_trend_per_read is None:
89
+ # mismatch_trend_per_read = {}
90
+
91
+ # # If the modification type is 'unconverted', return NaN for all positions if the deaminase_footprinting strategy is not being used.
92
+ # if modification_type == "unconverted" and not deaminase_footprinting:
93
+ # #print(f"Skipping binarization for unconverted {strand} reads on bam: {bam}.")
94
+ # return {key: np.full(len(bases), np.nan) for key, bases in base_identities.items()}
95
+
96
+ # # Define mappings for binarization based on strand and modification type
97
+ # if deaminase_footprinting:
98
+ # binarization_maps = {
99
+ # ('C->T'): {'C': 0, 'T': 1},
100
+ # ('G->A'): {'G': 0, 'A': 1},
101
+ # }
102
+
103
+ # binarized_base_identities = {}
104
+ # for key, bases in base_identities.items():
105
+ # arr = np.array(bases, dtype='<U1')
106
+ # # Fetch the appropriate mapping
107
+ # conversion_type = mismatch_trend_per_read[key]
108
+ # base_map = binarization_maps.get(conversion_type, None)
109
+ # binarized = np.vectorize(lambda x: base_map.get(x, np.nan))(arr) # Apply mapping with fallback to NaN
110
+ # binarized_base_identities[key] = binarized
111
+
112
+ # return binarized_base_identities
113
+
114
+ # else:
115
+ # binarization_maps = {
116
+ # ('top', '5mC'): {'C': 1, 'T': 0},
117
+ # ('top', '6mA'): {'A': 1, 'G': 0},
118
+ # ('bottom', '5mC'): {'G': 1, 'A': 0},
119
+ # ('bottom', '6mA'): {'T': 1, 'C': 0}
120
+ # }
121
+
122
+ # if (strand, modification_type) not in binarization_maps:
123
+ # raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
124
+
125
+ # # Fetch the appropriate mapping
126
+ # base_map = binarization_maps[(strand, modification_type)]
127
+
128
+ # binarized_base_identities = {}
129
+ # for key, bases in base_identities.items():
130
+ # arr = np.array(bases, dtype='<U1')
131
+ # binarized = np.vectorize(lambda x: base_map.get(x, np.nan))(arr) # Apply mapping with fallback to NaN
132
+ # binarized_base_identities[key] = binarized
133
+
134
+ # return binarized_base_identities
135
+ # import torch
136
+
137
+ # # If the modification type is 'unconverted', return NaN for all positions
138
+ # if modification_type == "unconverted":
139
+ # print(f"Skipping binarization for unconverted {strand} reads on bam: {bam}.")
140
+ # return {key: torch.full((len(bases),), float('nan'), device=device) for key, bases in base_identities.items()}
141
+
142
+ # # Define mappings for binarization based on strand and modification type
143
+ # binarization_maps = {
144
+ # ('top', '5mC'): {'C': 1, 'T': 0},
145
+ # ('top', '6mA'): {'A': 1, 'G': 0},
146
+ # ('bottom', '5mC'): {'G': 1, 'A': 0},
147
+ # ('bottom', '6mA'): {'T': 1, 'C': 0}
148
+ # }
149
+
150
+ # if (strand, modification_type) not in binarization_maps:
151
+ # raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
152
+
153
+ # # Fetch the appropriate mapping
154
+ # base_map = binarization_maps[(strand, modification_type)]
155
+
156
+ # # Convert mapping to tensor
157
+ # base_keys = list(base_map.keys())
158
+ # base_values = torch.tensor(list(base_map.values()), dtype=torch.float32, device=device)
159
+
160
+ # # Create a lookup dictionary (ASCII-based for fast mapping)
161
+ # lookup_table = torch.full((256,), float('nan'), dtype=torch.float32, device=device)
162
+ # for k, v in zip(base_keys, base_values):
163
+ # lookup_table[ord(k)] = v
164
+
165
+ # # Process reads
166
+ # binarized_base_identities = {}
167
+ # for key, bases in base_identities.items():
168
+ # bases_tensor = torch.tensor([ord(c) for c in bases], dtype=torch.uint8, device=device) # Convert chars to ASCII
169
+ # binarized = lookup_table[bases_tensor] # Efficient lookup
170
+ # binarized_base_identities[key] = binarized
171
+
172
+ # return binarized_base_identities
@@ -0,0 +1,34 @@
1
## canoncall

# Conversion SMF specific
def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
    """
    Wrapper function for dorado canonical base calling.

    Parameters:
        model_dir (str): a string representing the file path to the dorado basecalling model directory.
        model (str): a string representing the dorado basecalling model.
        pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
        barcode_kit (str): A string representing the barcoding kit used in the experiment.
        bam (str): File path to the BAM file to output.
        bam_suffix (str): The suffix to use for the BAM file.
        barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
        trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
        device (str): The device to use. 'auto' is default, which can detect device to use. Can also specify metal, cpu, cuda.

    Returns:
        None
        Outputs a BAM file holding the canonical base calls output by the dorado basecaller.

    Raises:
        subprocess.CalledProcessError: If the dorado basecaller exits with a non-zero status.
    """
    import subprocess
    output = bam + bam_suffix
    # --batchsize 0 lets dorado pick the batch size automatically.
    command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--device", device, "--batchsize", "0"]
    if barcode_both_ends:
        command.append("--barcode-both-ends")
    if not trim:
        command.append("--no-trim")
    command += [model, pod5_dir]
    command_string = " ".join(command)
    print(f"Running {command_string}\n to generate {output}")
    with open(output, "w") as outfile:
        # Bug fix: the exit status was previously ignored, so a failed basecall
        # silently left an empty/truncated BAM. check=True fails loudly instead.
        subprocess.run(command, stdout=outfile, check=True)
@@ -0,0 +1,21 @@
1
# complement_base_list

# Watson-Crick complements for upper- and lowercase bases; 'N'/'n' (ambiguous)
# map to themselves. Module-level so the table is built once, not per call.
_COMPLEMENT_MAPPING = {
    'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N',
    'a': 't', 't': 'a', 'c': 'g', 'g': 'c', 'n': 'n',
}


def complement_base_list(sequence):
    """
    Takes a list of DNA base identities and returns their complement.

    Generalized to accept lowercase bases (complemented case-preservingly),
    e.g. soft-masked sequence. Uppercase behavior is unchanged.

    Parameters:
        sequence (list): A list of DNA bases (e.g., ['A', 'C', 'G', 'T']).

    Returns:
        complement (list): A list of complementary DNA bases.

    Raises:
        KeyError: If the sequence contains a character other than A/C/G/T/N in either case.
    """
    return [_COMPLEMENT_MAPPING[base] for base in sequence]
@@ -0,0 +1,378 @@
1
# concatenate_fastqs_to_bam

def concatenate_fastqs_to_bam(
    fastq_files,
    output_bam,
    barcode_tag='BC',
    gzip_suffixes=('.gz',),
    barcode_map=None,
    add_read_group=True,
    rg_sample_field=None,
    progress=True,
    auto_pair=True,
):
    """
    Concatenate FASTQ(s) into an unaligned BAM. Supports single-end and paired-end (auto-detect or explicit).

    All records are written as unmapped reads; paired records get is_read1/is_read2
    flags, and the barcode (from `barcode_map` or the filename) is stored in the
    `barcode_tag` SAM tag and optionally as the read group (RG).

    Parameters
    ----------
    fastq_files : list[str] or list[(str,str)]
        If list of tuples: each tuple is (R1_path, R2_path).
        If list of strings and auto_pair=True: the function will attempt to automatically pair files.
    output_bam : str
        Path to output BAM (will be overwritten).
    barcode_tag : str
        SAM tag used for barcode (default 'BC').
    gzip_suffixes : tuple
        Compressed suffixes to consider (default ('.gz',)).
    barcode_map : dict or None
        Optional mapping {path: barcode} to override automatic extraction.
    add_read_group : bool
        If True, add RG entries and set RG tag per-read (ID = barcode).
    rg_sample_field : str or None
        If set, includes SM field in RG header entries.
    progress : bool
        Show tqdm progress bar.
    auto_pair : bool
        If True and `fastq_files` is a list of strings, attempt to auto-pair R1/R2 by filename patterns.

    Returns
    -------
    dict
        Summary: {'total_reads', 'per_file', 'paired_pairs_written', 'singletons_written', 'barcodes'}
    """
    import os
    import re
    import gzip
    from itertools import zip_longest
    from Bio import SeqIO
    import pysam
    from tqdm import tqdm

    # ---------- helpers ----------
    def _is_gz(path):
        # True if the path ends with any configured compressed suffix (case-insensitive).
        pl = path.lower()
        return any(pl.endswith(suf) for suf in gzip_suffixes)

    def _strip_fastq_ext(basn):
        # remove .fastq.gz .fq.gz .fastq .fq
        for ext in ('.fastq.gz', '.fq.gz', '.fastq', '.fq'):
            if basn.lower().endswith(ext):
                return basn[:-len(ext)]
        # fallback remove last suffix
        return os.path.splitext(basn)[0]

    def _extract_barcode_from_filename(path):
        # heuristic: barcode is last underscore-separated token in filename (before ext)
        stem = _strip_fastq_ext(os.path.basename(path))
        if '_' in stem:
            token = stem.split('_')[-1]
            if token:
                return token
        # fallback to whole stem
        return stem

    # pairing heuristics: try to identify suffix that marks read number
    def _classify_read_token(stem):
        # returns (prefix, readnum) if matches, else (None, None)
        patterns = [
            r'(?i)(.*?)[._-]r?([12])$',  # prefix_R1 or prefix.r1 or prefix-1
            r'(?i)(.*?)[._-]read[_-]?([12])$',
            r'(?i)(.*?)[/_]([12])$',  # sometimes /1 is used (rare in filenames)
        ]
        for pat in patterns:
            m = re.match(pat, stem)
            if m:
                prefix = m.group(1)
                num = m.group(2)
                return prefix, int(num)
        return None, None

    def pair_by_filename(paths):
        # paths: list of strings
        # Group by shared prefix; a prefix with both read 1 and read 2 becomes a pair,
        # anything else (orphan halves, unclassifiable names) becomes a singleton.
        map_pref = {}  # prefix -> {1: path, 2: path}
        unpaired = []
        for p in paths:
            name = os.path.basename(p)
            stem = _strip_fastq_ext(name)
            pref, num = _classify_read_token(stem)
            if pref is not None:
                entry = map_pref.setdefault(pref, {})
                entry[num] = p
            else:
                # try fallback: split by last underscore or dot and check last token is 1/2 or R1/R2
                toks = re.split(r'[_\.]', stem)
                if toks and toks[-1] in ('1', '2', 'R1', 'R2', 'r1', 'r2'):
                    last = toks[-1]
                    basepref = "_".join(toks[:-1]) if len(toks) > 1 else toks[0]
                    num = 1 if last.endswith('1') else 2
                    entry = map_pref.setdefault(basepref, {})
                    entry[num] = p
                else:
                    unpaired.append(p)
        pairs = []
        leftovers = []
        for k, d in map_pref.items():
            if 1 in d and 2 in d:
                pairs.append((d[1], d[2]))
            else:
                # put whoever exists into leftovers
                leftovers.extend([v for kk, v in d.items()])
        # append also unpaired
        leftovers.extend(unpaired)
        return pairs, leftovers

    # ---------- normalize input ----------
    explicit_pairs = []
    singles = []
    if not isinstance(fastq_files, (list, tuple)):
        raise ValueError("fastq_files must be a list of paths or list of (R1,R2) tuples.")

    # mixture: if user supplied tuples -> treat as explicit pairs
    if all(isinstance(x, (list, tuple)) and len(x) == 2 for x in fastq_files):
        explicit_pairs = [(str(a), str(b)) for a, b in fastq_files]
    else:
        # flatten and coerce to strings, ignore None
        paths = [str(x) for x in fastq_files if x is not None]
        if auto_pair:
            explicit_pairs, leftovers = pair_by_filename(paths)
            singles = leftovers
        else:
            singles = paths

    # Build barcode map and ordered barcodes
    barcode_map = barcode_map or {}
    per_path_barcode = {}
    barcodes_in_order = []  # preserves first-seen order for RG header entries

    # pairs: assign barcode per pair from either provided barcode_map for first file or from filenames
    # NOTE(review): `or`-chaining skips falsy barcodes (e.g. ''); assumes barcodes are non-empty strings.
    for r1, r2 in explicit_pairs:
        bc = barcode_map.get(r1) or barcode_map.get(r2) or _extract_barcode_from_filename(r1)
        per_path_barcode[r1] = bc
        per_path_barcode[r2] = bc
        if bc not in barcodes_in_order:
            barcodes_in_order.append(bc)
    for p in singles:
        bc = barcode_map.get(p) or _extract_barcode_from_filename(p)
        per_path_barcode[p] = bc
        if bc not in barcodes_in_order:
            barcodes_in_order.append(bc)

    # prepare BAM header (no SQ entries: all reads are unaligned)
    header = {"HD": {"VN": "1.0"}, "SQ": []}
    if add_read_group:
        rg_list = []
        for bc in barcodes_in_order:
            rg = {"ID": bc}
            if rg_sample_field:
                rg["SM"] = rg_sample_field
            rg_list.append(rg)
        header["RG"] = rg_list

    # ---------- write BAM ----------
    per_file_counts = {}
    total_written = 0
    paired_count = 0
    unpaired_count = 0

    def _open_fh(path):
        # Text-mode handle, transparently decompressing gzip inputs.
        return gzip.open(path, "rt") if _is_gz(path) else open(path, "rt")

    with pysam.AlignmentFile(output_bam, "wb", header=header) as bam_out:
        # process paired files first
        seq_iter = list(explicit_pairs)  # list of (r1,r2)
        if progress:
            seq_iter = tqdm(seq_iter, desc="Paired FASTQ->BAM")
        for r1_path, r2_path in seq_iter:
            if not (os.path.exists(r1_path) and os.path.exists(r2_path)):
                raise FileNotFoundError(f"Paired file missing: {r1_path} or {r2_path}")
            bc = per_path_barcode.get(r1_path) or per_path_barcode.get(r2_path) or "barcode"
            # open both and iterate in parallel
            with _open_fh(r1_path) as fh1, _open_fh(r2_path) as fh2:
                it1 = SeqIO.parse(fh1, "fastq")
                it2 = SeqIO.parse(fh2, "fastq")
                # iterate in lockstep; if one shorter we still write remaining as unpaired (zip_longest)
                # NOTE(review): assumes R1/R2 files are record-order aligned; no name-based re-pairing is done.
                for rec1, rec2 in zip_longest(it1, it2, fillvalue=None):
                    # determine a common read name
                    if rec1 is not None:
                        id1 = rec1.id
                    else:
                        id1 = None
                    if rec2 is not None:
                        id2 = rec2.id
                    else:
                        id2 = None
                    # try to derive a common name (strip /1 or /2 if present)
                    def _strip_end_id(s):
                        if s is None:
                            return None
                        return re.sub(r'(?:/1$|/2$|\s[12]$)', '', s)
                    common_name = _strip_end_id(id1) or _strip_end_id(id2) or (id1 or id2)

                    # create AlignedSegment for read1
                    if rec1 is not None:
                        a1 = pysam.AlignedSegment()
                        a1.query_name = common_name
                        a1.query_sequence = str(rec1.seq)
                        a1.is_paired = True
                        a1.is_read1 = True
                        a1.is_read2 = False
                        a1.is_unmapped = True
                        a1.mate_is_unmapped = True
                        # reference fields for unmapped
                        a1.reference_id = -1
                        a1.reference_start = -1
                        a1.next_reference_id = -1
                        a1.next_reference_start = -1
                        a1.template_length = 0
                        # qualities
                        if "phred_quality" in rec1.letter_annotations:
                            try:
                                a1.query_qualities = [int(x) for x in rec1.letter_annotations["phred_quality"]]
                            except Exception:
                                # best-effort: drop qualities rather than abort the whole conversion
                                a1.query_qualities = None
                        # tags
                        a1.set_tag(barcode_tag, str(bc), value_type='Z')
                        if add_read_group:
                            a1.set_tag("RG", str(bc), value_type='Z')
                        bam_out.write(a1)
                        per_file_counts.setdefault(r1_path, 0)
                        per_file_counts[r1_path] += 1
                        total_written += 1
                    # create AlignedSegment for read2
                    if rec2 is not None:
                        a2 = pysam.AlignedSegment()
                        a2.query_name = common_name
                        a2.query_sequence = str(rec2.seq)
                        a2.is_paired = True
                        a2.is_read1 = False
                        a2.is_read2 = True
                        a2.is_unmapped = True
                        a2.mate_is_unmapped = True
                        a2.reference_id = -1
                        a2.reference_start = -1
                        a2.next_reference_id = -1
                        a2.next_reference_start = -1
                        a2.template_length = 0
                        if "phred_quality" in rec2.letter_annotations:
                            try:
                                a2.query_qualities = [int(x) for x in rec2.letter_annotations["phred_quality"]]
                            except Exception:
                                a2.query_qualities = None
                        a2.set_tag(barcode_tag, str(bc), value_type='Z')
                        if add_read_group:
                            a2.set_tag("RG", str(bc), value_type='Z')
                        bam_out.write(a2)
                        per_file_counts.setdefault(r2_path, 0)
                        per_file_counts[r2_path] += 1
                        total_written += 1
                    # count paired/unpaired bookkeeping
                    if rec1 is not None and rec2 is not None:
                        paired_count += 1
                    else:
                        # one side missing -> counted as unpaired for whichever exists
                        if rec1 is not None:
                            unpaired_count += 1
                        if rec2 is not None:
                            unpaired_count += 1

        # process singletons
        single_iter = list(singles)
        if progress:
            single_iter = tqdm(single_iter, desc="Single FASTQ->BAM")
        for p in single_iter:
            if not os.path.exists(p):
                raise FileNotFoundError(p)
            bc = per_path_barcode.get(p, "barcode")
            with _open_fh(p) as fh:
                for rec in SeqIO.parse(fh, "fastq"):
                    a = pysam.AlignedSegment()
                    a.query_name = rec.id
                    a.query_sequence = str(rec.seq)
                    a.is_paired = False
                    a.is_read1 = False
                    a.is_read2 = False
                    a.is_unmapped = True
                    a.mate_is_unmapped = True
                    a.reference_id = -1
                    a.reference_start = -1
                    a.next_reference_id = -1
                    a.next_reference_start = -1
                    a.template_length = 0
                    if "phred_quality" in rec.letter_annotations:
                        try:
                            a.query_qualities = [int(x) for x in rec.letter_annotations["phred_quality"]]
                        except Exception:
                            a.query_qualities = None
                    a.set_tag(barcode_tag, str(bc), value_type='Z')
                    if add_read_group:
                        a.set_tag("RG", str(bc), value_type='Z')
                    bam_out.write(a)
                    per_file_counts.setdefault(p, 0)
                    per_file_counts[p] += 1
                    total_written += 1
                    unpaired_count += 1

    summary = {
        "total_reads": total_written,
        "per_file": per_file_counts,
        "paired_pairs_written": paired_count,
        "singletons_written": unpaired_count,
        "barcodes": barcodes_in_order
    }
    return summary
324
+
325
+
326
+ # def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_suffix='.gz'):
327
+ # """
328
+ # Concatenate multiple demultiplexed FASTQ (.fastq or .fq) files into an unaligned BAM and add the FASTQ barcode suffix to the BC tag.
329
+
330
+ # Parameters:
331
+ # fastq_files (list): List of paths to demultiplexed FASTQ files.
332
+ # output_bam (str): Path to the output BAM file.
333
+ # barcode_tag (str): The SAM tag for storing the barcode (default: 'BC').
334
+ # gzip_suffix (str): Suffix to use for input gzip files (Defaul: '.gz')
335
+
336
+ # Returns:
337
+ # None
338
+ # """
339
+ # import os
340
+ # import pysam
341
+ # import gzip
342
+ # from Bio import SeqIO
343
+ # from tqdm import tqdm
344
+
345
+ # n_fastqs = len(fastq_files)
346
+
347
+ # with pysam.AlignmentFile(output_bam, "wb", header={"HD": {"VN": "1.0"}, "SQ": []}) as bam_out:
348
+ # for fastq_file in tqdm(fastq_files, desc="Processing FASTQ files"):
349
+ # # Extract barcode from the FASTQ filename (handles .fq, .fastq, .fq.gz, and .fastq.gz extensions)
350
+ # base_name = os.path.basename(fastq_file)
351
+ # if n_fastqs > 1:
352
+ # if base_name.endswith('.fastq.gz'):
353
+ # barcode = base_name.split('_')[-1].replace(f'.fastq{gzip_suffix}', '')
354
+ # elif base_name.endswith('.fq.gz'):
355
+ # barcode = base_name.split('_')[-1].replace(f'.fq{gzip_suffix}', '')
356
+ # elif base_name.endswith('.fastq'):
357
+ # barcode = base_name.split('_')[-1].replace('.fastq', '')
358
+ # elif base_name.endswith('.fq'):
359
+ # barcode = base_name.split('_')[-1].replace('.fq', '')
360
+ # else:
361
+ # raise ValueError(f"Unexpected file extension for {fastq_file}. Only .fq, .fastq, .fq{gzip_suffix}, and .fastq{gzip_suffix} are supported.")
362
+ # else:
363
+ # barcode = 'barcode0'
364
+
365
+ # # Read the FASTQ file (handle gzipped and non-gzipped files)
366
+ # open_func = gzip.open if fastq_file.endswith(gzip_suffix) else open
367
+ # with open_func(fastq_file, 'rt') as fq_in:
368
+ # for record in SeqIO.parse(fq_in, 'fastq'):
369
+ # # Create an unaligned BAM entry for each FASTQ record
370
+ # aln = pysam.AlignedSegment()
371
+ # aln.query_name = record.id
372
+ # aln.query_sequence = str(record.seq)
373
+ # aln.flag = 4 # Unmapped
374
+ # aln.query_qualities = pysam.qualitystring_to_array(record.letter_annotations["phred_quality"])
375
+ # # Add the barcode to the BC tag
376
+ # aln.set_tag(barcode_tag, barcode)
377
+ # # Write to BAM file
378
+ # bam_out.write(aln)