smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. smftools/__init__.py +34 -0
  2. smftools/_settings.py +20 -0
  3. smftools/_version.py +1 -0
  4. smftools/cli.py +184 -0
  5. smftools/config/__init__.py +1 -0
  6. smftools/config/conversion.yaml +33 -0
  7. smftools/config/deaminase.yaml +56 -0
  8. smftools/config/default.yaml +253 -0
  9. smftools/config/direct.yaml +17 -0
  10. smftools/config/experiment_config.py +1191 -0
  11. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  12. smftools/datasets/F1_sample_sheet.csv +5 -0
  13. smftools/datasets/__init__.py +9 -0
  14. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  15. smftools/datasets/datasets.py +28 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/hmm/apply_hmm_batched.py +242 -0
  19. smftools/hmm/calculate_distances.py +18 -0
  20. smftools/hmm/call_hmm_peaks.py +106 -0
  21. smftools/hmm/display_hmm.py +18 -0
  22. smftools/hmm/hmm_readwrite.py +16 -0
  23. smftools/hmm/nucleosome_hmm_refinement.py +104 -0
  24. smftools/hmm/train_hmm.py +78 -0
  25. smftools/informatics/__init__.py +14 -0
  26. smftools/informatics/archived/bam_conversion.py +59 -0
  27. smftools/informatics/archived/bam_direct.py +63 -0
  28. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  29. smftools/informatics/archived/conversion_smf.py +132 -0
  30. smftools/informatics/archived/deaminase_smf.py +132 -0
  31. smftools/informatics/archived/direct_smf.py +137 -0
  32. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  33. smftools/informatics/basecall_pod5s.py +80 -0
  34. smftools/informatics/fast5_to_pod5.py +24 -0
  35. smftools/informatics/helpers/__init__.py +73 -0
  36. smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
  37. smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
  38. smftools/informatics/helpers/archived/informatics.py +260 -0
  39. smftools/informatics/helpers/archived/load_adata.py +516 -0
  40. smftools/informatics/helpers/bam_qc.py +66 -0
  41. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  42. smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
  43. smftools/informatics/helpers/canoncall.py +34 -0
  44. smftools/informatics/helpers/complement_base_list.py +21 -0
  45. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
  46. smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
  47. smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
  48. smftools/informatics/helpers/count_aligned_reads.py +43 -0
  49. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  50. smftools/informatics/helpers/discover_input_files.py +100 -0
  51. smftools/informatics/helpers/extract_base_identities.py +70 -0
  52. smftools/informatics/helpers/extract_mods.py +83 -0
  53. smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
  54. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  55. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  56. smftools/informatics/helpers/find_conversion_sites.py +51 -0
  57. smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
  58. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  59. smftools/informatics/helpers/get_native_references.py +28 -0
  60. smftools/informatics/helpers/index_fasta.py +12 -0
  61. smftools/informatics/helpers/make_dirs.py +21 -0
  62. smftools/informatics/helpers/make_modbed.py +27 -0
  63. smftools/informatics/helpers/modQC.py +27 -0
  64. smftools/informatics/helpers/modcall.py +36 -0
  65. smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
  66. smftools/informatics/helpers/ohe_batching.py +76 -0
  67. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  68. smftools/informatics/helpers/one_hot_decode.py +27 -0
  69. smftools/informatics/helpers/one_hot_encode.py +57 -0
  70. smftools/informatics/helpers/plot_bed_histograms.py +269 -0
  71. smftools/informatics/helpers/run_multiqc.py +28 -0
  72. smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
  73. smftools/informatics/helpers/split_and_index_BAM.py +32 -0
  74. smftools/informatics/readwrite.py +106 -0
  75. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  76. smftools/informatics/subsample_pod5.py +104 -0
  77. smftools/load_adata.py +1346 -0
  78. smftools/machine_learning/__init__.py +12 -0
  79. smftools/machine_learning/data/__init__.py +2 -0
  80. smftools/machine_learning/data/anndata_data_module.py +234 -0
  81. smftools/machine_learning/data/preprocessing.py +6 -0
  82. smftools/machine_learning/evaluation/__init__.py +2 -0
  83. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  84. smftools/machine_learning/evaluation/evaluators.py +223 -0
  85. smftools/machine_learning/inference/__init__.py +3 -0
  86. smftools/machine_learning/inference/inference_utils.py +27 -0
  87. smftools/machine_learning/inference/lightning_inference.py +68 -0
  88. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  89. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  90. smftools/machine_learning/models/__init__.py +9 -0
  91. smftools/machine_learning/models/base.py +295 -0
  92. smftools/machine_learning/models/cnn.py +138 -0
  93. smftools/machine_learning/models/lightning_base.py +345 -0
  94. smftools/machine_learning/models/mlp.py +26 -0
  95. smftools/machine_learning/models/positional.py +18 -0
  96. smftools/machine_learning/models/rnn.py +17 -0
  97. smftools/machine_learning/models/sklearn_models.py +273 -0
  98. smftools/machine_learning/models/transformer.py +303 -0
  99. smftools/machine_learning/models/wrappers.py +20 -0
  100. smftools/machine_learning/training/__init__.py +2 -0
  101. smftools/machine_learning/training/train_lightning_model.py +135 -0
  102. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  103. smftools/machine_learning/utils/__init__.py +2 -0
  104. smftools/machine_learning/utils/device.py +10 -0
  105. smftools/machine_learning/utils/grl.py +14 -0
  106. smftools/plotting/__init__.py +18 -0
  107. smftools/plotting/autocorrelation_plotting.py +611 -0
  108. smftools/plotting/classifiers.py +355 -0
  109. smftools/plotting/general_plotting.py +682 -0
  110. smftools/plotting/hmm_plotting.py +260 -0
  111. smftools/plotting/position_stats.py +462 -0
  112. smftools/plotting/qc_plotting.py +270 -0
  113. smftools/preprocessing/__init__.py +38 -0
  114. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  115. smftools/preprocessing/append_base_context.py +122 -0
  116. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  117. smftools/preprocessing/archives/mark_duplicates.py +146 -0
  118. smftools/preprocessing/archives/preprocessing.py +614 -0
  119. smftools/preprocessing/archives/remove_duplicates.py +21 -0
  120. smftools/preprocessing/binarize_on_Youden.py +45 -0
  121. smftools/preprocessing/binary_layers_to_ohe.py +40 -0
  122. smftools/preprocessing/calculate_complexity.py +72 -0
  123. smftools/preprocessing/calculate_complexity_II.py +248 -0
  124. smftools/preprocessing/calculate_consensus.py +47 -0
  125. smftools/preprocessing/calculate_coverage.py +51 -0
  126. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  127. smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
  128. smftools/preprocessing/calculate_position_Youden.py +115 -0
  129. smftools/preprocessing/calculate_read_length_stats.py +79 -0
  130. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  131. smftools/preprocessing/clean_NaN.py +62 -0
  132. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  133. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  134. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  135. smftools/preprocessing/flag_duplicate_reads.py +1351 -0
  136. smftools/preprocessing/invert_adata.py +37 -0
  137. smftools/preprocessing/load_sample_sheet.py +53 -0
  138. smftools/preprocessing/make_dirs.py +21 -0
  139. smftools/preprocessing/min_non_diagonal.py +25 -0
  140. smftools/preprocessing/recipes.py +127 -0
  141. smftools/preprocessing/subsample_adata.py +58 -0
  142. smftools/readwrite.py +1004 -0
  143. smftools/tools/__init__.py +20 -0
  144. smftools/tools/archived/apply_hmm.py +202 -0
  145. smftools/tools/archived/classifiers.py +787 -0
  146. smftools/tools/archived/classify_methylated_features.py +66 -0
  147. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  148. smftools/tools/archived/subset_adata_v1.py +32 -0
  149. smftools/tools/archived/subset_adata_v2.py +46 -0
  150. smftools/tools/calculate_umap.py +62 -0
  151. smftools/tools/cluster_adata_on_methylation.py +105 -0
  152. smftools/tools/general_tools.py +69 -0
  153. smftools/tools/position_stats.py +601 -0
  154. smftools/tools/read_stats.py +184 -0
  155. smftools/tools/spatial_autocorrelation.py +562 -0
  156. smftools/tools/subset_adata.py +28 -0
  157. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
  158. smftools-0.2.1.dist-info/RECORD +161 -0
  159. smftools-0.2.1.dist-info/entry_points.txt +2 -0
  160. smftools-0.1.6.dist-info/RECORD +0 -4
  161. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
  162. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,76 @@
1
+ import os
2
+ import anndata as ad
3
+ import numpy as np
4
+ import concurrent.futures
5
+ from .one_hot_encode import one_hot_encode
6
+
7
def encode_sequence(args):
    """
    One-hot encode a single read; written as the per-item worker for a parallel map.

    Parameters:
        args (tuple): A ``(read_name, seq, device)`` triple. ``seq`` and ``device`` are
            passed unchanged to ``one_hot_encode``.

    Returns:
        tuple or None: ``(read_name, encoding)`` when encoding succeeds, or None when
        ``one_hot_encode`` raises any Exception so the caller can skip that read.
    """
    read_name, seq, device = args
    try:
        encoded = one_hot_encode(seq, device)
    except Exception:
        # Best-effort: a read that cannot be encoded is dropped instead of aborting the run.
        return None
    return read_name, encoded
15
+
16
def encode_and_save_batch(batch_data, tmp_dir, prefix, record, batch_number):
    """
    Write one batch of encoded reads to a temporary H5AD file.

    Parameters:
        batch_data (iterable): ``(read_name, matrix)`` pairs; pairs whose matrix is None are ignored.
        tmp_dir (str): Directory the temporary file is written into.
        prefix (str): Prefix used in the file name.
        record (str): Record name used in the file name.
        batch_number (int): Batch index used in the file name.

    Returns:
        str or None: Path of the written file, or None when the batch held no usable reads
        (in which case nothing is written).
    """
    encoded_reads = {}
    for name, encoding in batch_data:
        if encoding is not None:
            encoded_reads[name] = encoding

    # Nothing usable in this batch: do not create a file.
    if not encoded_reads:
        return None

    out_path = os.path.join(tmp_dir, f'tmp_{prefix}_{record}_{batch_number}.h5ad')
    # The encodings are carried in .uns; X is only a 1x1 placeholder.
    holder = ad.AnnData(X=np.zeros((1, 1)), uns=encoded_reads)
    holder.write_h5ad(out_path)
    return out_path
26
+
27
def ohe_batching(base_identities, tmp_dir, record, prefix='', batch_size=100000, progress_bar=None, device='auto', threads=None):
    """
    One-hot encode sequences with a thread pool and write them to disk in batches.

    Parameters:
        base_identities (dict): Dictionary mapping read names to sequences. Entries whose
            sequence is None are skipped before encoding.
        tmp_dir (str): Directory for storing temporary files.
        record (str): Record name (used in the temporary file names).
        prefix (str): Prefix for file naming.
        batch_size (int): Number of reads per batch.
        progress_bar (tqdm instance, optional): Shared progress bar; advanced by one for
            every sequence submitted for encoding.
        device (str): Device argument forwarded to the encoder.
        threads (int, optional): Number of parallel workers. Defaults to os.cpu_count().

    Returns:
        list: Paths of the H5AD files that were written, in batch order.
    """
    threads = threads or os.cpu_count()  # Default to max available CPU cores
    pending = []
    batch_number = 0
    file_names = []

    # Step 1: Prepare data for parallel encoding
    encoding_args = [(read_name, seq, device) for read_name, seq in base_identities.items() if seq is not None]

    # Step 2: Parallel one-hot encoding using threads (to avoid nested processes)
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        for result in executor.map(encode_sequence, encoding_args):
            # encode_sequence returns None for reads it could not encode
            if result:
                pending.append(result)

            if len(pending) >= batch_size:
                # Step 3: Write the full batch immediately. The write is synchronous,
                # so the list can be handed over directly and cleared afterwards.
                file_name = encode_and_save_batch(pending, tmp_dir, prefix, record, batch_number)
                if file_name:
                    file_names.append(file_name)

                pending.clear()
                batch_number += 1

            # Compare against None explicitly: a tqdm bar's truth value depends on its
            # total (False for total=0, TypeError for a manual bar without a total).
            if progress_bar is not None:
                progress_bar.update(1)

    # Step 4: Write whatever is left over
    if pending:
        file_name = encode_and_save_batch(pending, tmp_dir, prefix, record, batch_number)
        if file_name:
            file_names.append(file_name)

    return file_names
@@ -0,0 +1,32 @@
1
+ # ohe_layers_decode
2
+
3
def ohe_layers_decode(adata, obs_names):
    """
    Takes an anndata object and a list of observation names. Returns a list of sequence strings for the reads of interest.

    Each read is rebuilt from the five per-base layers named ``A_binary_encoding``,
    ``C_binary_encoding``, ``G_binary_encoding``, ``T_binary_encoding`` and ``N_binary_encoding``.

    Parameters:
        adata (AnnData): An anndata object containing the five ``<base>_binary_encoding`` layers.
        obs_names (list): A list of observation name strings to retrieve sequences for.

    Returns:
        sequences (list of str): One decoded sequence string per entry of obs_names, in order.
    """
    import numpy as np
    # The decoder lives in one_hot_decode.py; the package has no ohe_decode module.
    from .one_hot_decode import one_hot_decode

    # Define the mapping of one-hot encoded indices to DNA bases
    mapping = ['A', 'C', 'G', 'T', 'N']

    ohe_layers = [f"{base}_binary_encoding" for base in mapping]
    sequences = []

    for obs_name in obs_names:
        obs_subset = adata[obs_name]
        ohe_list = []
        for layer in ohe_layers:
            ohe_list += list(obs_subset.layers[layer])
        # Stacking the layers gives one row per base (bases x positions). The decoder
        # reshapes to rows of five values per position, so transpose to positions x bases.
        # NOTE(review): assumes each obs name selects exactly one row - confirm for
        # datasets with duplicated observation names.
        ohe_array = np.array(ohe_list).T
        sequences.append(one_hot_decode(ohe_array))

    return sequences
@@ -0,0 +1,27 @@
1
+ # one_hot_decode
2
+
3
+ # String encodings
4
def one_hot_decode(ohe_array):
    """
    Decode a one hot encoded array back into its sequence string.

    The array is viewed as rows of five values in the column order A, C, G, T, N and the
    base with the largest value in each row is emitted (ties resolve to the first column).

    Parameters:
        ohe_array (np.array): A one hot encoded array whose size is a multiple of five.

    Returns:
        sequence (str): Sequence string of the one hot encoded array
    """
    import numpy as np

    bases = ('A', 'C', 'G', 'T', 'N')

    # One row per position, one column per base
    per_position = ohe_array.reshape(-1, 5)

    # argmax picks the column holding the 1 in each row
    return ''.join(bases[idx] for idx in np.argmax(per_position, axis=1))
@@ -0,0 +1,57 @@
1
+ # one_hot_encode
2
+
3
def one_hot_encode(sequence, device='auto'):
    """
    One-hot encodes a DNA sequence over the alphabet A, C, G, T, N.

    Parameters:
        sequence (str or list): DNA sequence (e.g., "ACGTN" or ['A', 'C', 'G', 'T', 'N']).
            Any character outside the alphabet is encoded as 'N'.
        device (str): Accepted for interface compatibility; not used by this NumPy implementation.

    Returns:
        ndarray: Flattened one-hot encoding with five values per position (order A, C, G, T, N).
        An empty sequence prints a warning and yields five zeros.
    """
    import numpy as np

    alphabet = np.array(list("ACGTN"))

    # Work on a list of single characters
    bases = sequence if isinstance(sequence, list) else list(sequence)

    # Empty input: warn and hand back an all-zero encoding instead of failing
    if not bases:
        print("Warning: Empty sequence encountered in one_hot_encode()")
        return np.zeros(alphabet.size)

    chars = np.array(bases, dtype='<U1')

    # Anything that is not a known base becomes 'N'
    chars[~np.isin(chars, alphabet)] = 'N'

    # Compare every position against every base: rows = positions, columns = bases
    return (chars[:, np.newaxis] == alphabet).astype(int).ravel()
37
+
38
+ # import torch
39
+ # bases = torch.tensor([ord('A'), ord('C'), ord('G'), ord('T'), ord('N')], dtype=torch.int8, device=device)
40
+
41
+ # # Convert input to tensor of character ASCII codes
42
+ # seq_tensor = torch.tensor([ord(c) for c in sequence], dtype=torch.int8, device=device)
43
+
44
+ # # Handle empty sequence
45
+ # if seq_tensor.numel() == 0:
46
+ # print("Warning: Empty sequence encountered in one_hot_encode_torch()")
47
+ # return torch.zeros(len(bases), device=device)
48
+
49
+ # # Replace invalid bases with 'N'
50
+ # is_valid = (seq_tensor[:, None] == bases) # Compare each base with mapping
51
+ # seq_tensor = torch.where(is_valid.any(dim=1), seq_tensor, ord('N'))
52
+
53
+ # # Create one-hot encoding matrix
54
+ # one_hot_matrix = (seq_tensor[:, None] == bases).int()
55
+
56
+ # # Flatten and return
57
+ # return one_hot_matrix.flatten()
@@ -0,0 +1,269 @@
1
+ # plot_bed_histograms
2
+
3
def plot_bed_histograms(
    bed_file,
    plotting_directory,
    fasta,
    *,
    bins=60,
    clip_quantiles=(0.0, 0.995),
    cov_bin_size=1000,  # coverage bin size in bp
    rows_per_fig=6,  # paginate if many chromosomes
    include_mapq_quality=True,  # add MAPQ + avg read quality columns to grid
    coordinate_mode="one_based",  # "one_based" or "zero_based"
):
    """
    Plot per-chromosome QC grids (read length, coverage, MAPQ, read quality) from a BED-like file.

    The file is read as tab-separated, header-less, with the columns:
        chrom, start, end, read_len, qname, mapq, avg_base_qual

    For each chromosome (one grid row):
        - Column 1: Read length histogram
        - Column 2: Number of reads overlapping each coverage bin along the chromosome
        - (optional) Column 3: MAPQ histogram
        - (optional) Column 4: Avg base quality histogram

    Rows with chrom == '*' are dropped, and only chromosomes that are present in the FASTA
    are plotted, in FASTA order. The figure is paginated (up to rows_per_fig chromosomes per
    page) and one PNG per page is saved under `plotting_directory`.

    Parameters:
        bed_file (str): Path to the bed file to derive metrics from.
        plotting_directory (str): Path to the directory to write out histograms (created if missing).
        fasta (str): Path to the reference FASTA corresponding to the bed (used for chromosome lengths).
        bins (int): Histogram bins for read length / MAPQ / quality.
        clip_quantiles (tuple or None): (low, high) quantiles used to clip histogram tails for
            readability (e.g., (0, 0.995)); None disables clipping.
        cov_bin_size (int): Bin size (bp) for the coverage plot; bigger = faster/coarser.
        rows_per_fig (int): Number of chromosomes per page.
        include_mapq_quality (bool): If True, add MAPQ and avg base quality histograms as extra columns.
        coordinate_mode (str): "one_based" (one-based, inclusive coordinates) or "zero_based"
            (BED-standard zero-based, half-open).

    Returns:
        None

    Raises:
        ValueError: If coordinate_mode is not "one_based" or "zero_based".
    """
    import os
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import pysam

    os.makedirs(plotting_directory, exist_ok=True)

    bed_basename = os.path.basename(bed_file).rsplit(".bed", 1)[0]
    print(f"[plot_bed_histograms] Loading: {bed_file}")

    # Load BED-like table
    cols = ['chrom', 'start', 'end', 'read_len', 'qname', 'mapq', 'avg_q']
    df = pd.read_csv(bed_file, sep="\t", header=None, names=cols, dtype={
        'chrom': str, 'start': int, 'end': int, 'read_len': int, 'qname': str,
        'mapq': float, 'avg_q': float
    })

    # Drop unaligned records (chrom == '*') if present
    df = df[df['chrom'] != '*'].copy()
    if df.empty:
        print("[plot_bed_histograms] No aligned reads found; nothing to plot.")
        return

    if coordinate_mode not in {"one_based", "zero_based"}:
        raise ValueError("coordinate_mode must be 'one_based' or 'zero_based'")

    # Work internally in 0-based half-open coordinates [start0, end0) for the bin math.
    if coordinate_mode == "one_based":
        start0 = df['start'].to_numpy() - 1
        end0 = df['end'].to_numpy()  # inclusive 1-based end == exclusive 0-based end
    else:
        # already 0-based half-open (assumption)
        start0 = df['start'].to_numpy()
        end0 = df['end'].to_numpy()

    def _clip_series(s, q=(0.0, 0.995)):
        """Return the series as an array with its tails clipped to the given quantiles."""
        if q is None:
            return s.to_numpy()
        lo = s.quantile(q[0]) if q[0] is not None else s.min()
        hi = s.quantile(q[1]) if q[1] is not None else s.max()
        x = s.to_numpy(dtype=float)
        return np.clip(x, lo, hi)

    def _sanitize(name: str) -> str:
        """Replace characters that are awkward in file names with underscores."""
        return "".join(ch if ch.isalnum() or ch in "-._" else "_" for ch in name)

    # Load chromosome order/lengths from FASTA
    with pysam.FastaFile(fasta) as fa:
        ref_names = list(fa.references)
        ref_lengths = dict(zip(ref_names, fa.lengths))

    # Keep only chroms present in FASTA and with at least one read, in FASTA order
    chroms = {c for c in df['chrom'].unique() if c in ref_lengths}
    chrom_order = [c for c in ref_names if c in chroms]

    if not chrom_order:
        print("[plot_bed_histograms] No chromosomes from BED are present in FASTA; aborting.")
        return

    chrom_values = df['chrom'].to_numpy()
    ncols = 4 if include_mapq_quality else 2

    # Pagination: one figure per chunk of rows_per_fig chromosomes
    for start_idx in range(0, len(chrom_order), rows_per_fig):
        chunk = chrom_order[start_idx:start_idx + rows_per_fig]
        nrows = len(chunk)

        fig, axes = plt.subplots(
            nrows=nrows, ncols=ncols,
            figsize=(4.0 * ncols, 2.6 * nrows),
            dpi=160,
            squeeze=False
        )

        for r, chrom in enumerate(chunk):
            chrom_len = ref_lengths[chrom]
            mask = (chrom_values == chrom)

            # Slice per-chrom arrays
            s0 = start0[mask]
            e0 = end0[mask]
            len_arr = df.loc[mask, 'read_len']
            mapq_arr = df.loc[mask, 'mapq']
            q_arr = df.loc[mask, 'avg_q']

            # --- Col 1: Read length histogram (clipped) ---
            ax = axes[r, 0]
            ax.hist(_clip_series(len_arr, clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
            if r == 0:
                ax.set_title("Read length")
            ax.set_ylabel(f"{chrom}\n(n={mask.sum()})")
            ax.set_xlabel("bp")
            ax.grid(alpha=0.25)

            # --- Col 2: Coverage (binned over the chromosome) ---
            ax = axes[r, 1]
            nb = max(1, int(np.ceil(chrom_len / cov_bin_size)))
            # Bin edges in 0-based coords
            edges = np.linspace(0, chrom_len, nb + 1, dtype=int)

            # Per-bin "read count coverage": every bin touched by a read's interval is
            # incremented once (an approximation of base-level coverage).
            cov = np.zeros(nb, dtype=np.int32)
            b0 = np.searchsorted(edges, s0, side="right") - 1
            b1 = np.searchsorted(edges, np.maximum(e0 - 1, 0), side="right") - 1
            # Keep both indices inside [0, nb - 1]: a start below 0 would otherwise give
            # index -1, which as a slice start addresses the last bin instead of the first.
            b0 = np.clip(b0, 0, nb - 1)
            b1 = np.clip(b1, 0, nb - 1)
            # ensure valid ordering
            b_lo = np.minimum(b0, b1)
            b_hi = np.maximum(b0, b1)

            for lo, hi in zip(b_lo, b_hi):
                cov[lo:hi + 1] += 1

            x_mid = (edges[:-1] + edges[1:]) / 2.0
            ax.plot(x_mid, cov)
            if r == 0:
                ax.set_title(f"Coverage (~{cov_bin_size} bp bins)")
            ax.set_xlim(0, chrom_len)
            ax.set_xlabel("Position (bp)")
            ax.set_ylabel("")  # chromosome label is already shown on column 1
            ax.grid(alpha=0.25)

            if include_mapq_quality:
                # --- Col 3: MAPQ (missing values counted as 0) ---
                ax = axes[r, 2]
                ax.hist(_clip_series(mapq_arr.fillna(0), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
                if r == 0:
                    ax.set_title("MAPQ")
                ax.set_xlabel("MAPQ")
                ax.grid(alpha=0.25)

                # --- Col 4: Avg base quality ---
                # Missing qualities are left out of the histogram rather than passed on as NaN.
                ax = axes[r, 3]
                ax.hist(_clip_series(q_arr.dropna(), clip_quantiles), bins=bins, edgecolor="black", alpha=0.7)
                if r == 0:
                    ax.set_title("Avg base qual")
                ax.set_xlabel("Phred")
                ax.grid(alpha=0.25)

        fig.suptitle(
            f"{bed_basename} — per-chromosome QC "
            f"({'len,cov,MAPQ,qual' if include_mapq_quality else 'len,cov'})",
            y=0.995, fontsize=11
        )
        fig.tight_layout(rect=[0, 0, 1, 0.98])

        page = start_idx // rows_per_fig + 1
        out_png = os.path.join(plotting_directory, f"{_sanitize(bed_basename)}_qc_page{page}.png")
        # Save through the figure handle so the right figure is written regardless of
        # which figure pyplot currently considers active.
        fig.savefig(out_png, bbox_inches="tight")
        plt.close(fig)

    print("[plot_bed_histograms] Done.")
228
+
229
+
230
+ # bed_basename = os.path.basename(bed_file).split('.bed')[0]
231
+ # # Load the BED file into a DataFrame
232
+ # print(f"Loading BED to plot read length and coverage histograms: {bed_file}")
233
+ # df = pd.read_csv(bed_file, sep='\t', header=None, names=['chromosome', 'start', 'end', 'length', 'read_name', 'mapq', 'read_quality'])
234
+
235
+ # # Group by chromosome
236
+ # grouped = df.groupby('chromosome')
237
+
238
+ # # for each chromosome, get the record length of that chromosome from the fasta. Use from 0 to this length for the positional coverage plot.
239
+
240
+ # # Change below and make a plot grid instead. For each, make row for chromsome, col for read length and coverage
241
+ # # Clip the outliers to make plots cleaner
242
+
243
+ # for chrom, group in grouped:
244
+ # # Plot read length histogram
245
+ # plt.figure(figsize=(12, 6))
246
+ # plt.hist(group['length'], bins=50, edgecolor='k', alpha=0.7)
247
+ # plt.title(f'Read Length Histogram of reads aligned to {chrom}')
248
+ # plt.xlabel('Read Length')
249
+ # plt.ylabel('Count')
250
+ # plt.grid(True)
251
+ # save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_read_length_histogram.png')
252
+ # plt.savefig(save_name)
253
+ # plt.close()
254
+
255
+ # # Compute coverage
256
+ # coverage = np.zeros(group['end'].max())
257
+ # for _, row in group.iterrows():
258
+ # coverage[row['start']:row['end']] += 1
259
+
260
+ # # Plot coverage histogram
261
+ # plt.figure(figsize=(12, 6))
262
+ # plt.plot(coverage, color='b')
263
+ # plt.title(f'Coverage Histogram for {chrom}')
264
+ # plt.xlabel('Position')
265
+ # plt.ylabel('Coverage')
266
+ # plt.grid(True)
267
+ # save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_coverage_histogram.png')
268
+ # plt.savefig(save_name)
269
+ # plt.close()
@@ -0,0 +1,28 @@
1
def run_multiqc(input_dir, output_dir):
    """
    Runs MultiQC on a given directory and saves the report to the specified output directory.

    Failures are reported on stdout rather than raised: both a non-zero MultiQC exit status
    and a missing ``multiqc`` executable print an error message.

    Parameters:
    - input_dir (str): Path to the directory containing QC reports (e.g., FastQC, Samtools, bcftools outputs).
    - output_dir (str): Path to the directory where MultiQC reports should be saved (created if missing).

    Returns:
    - None: The function executes MultiQC and prints the status.
    """
    import os
    import subprocess

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Construct MultiQC command (argument list, no shell involved)
    command = ["multiqc", input_dir, "-o", output_dir]

    print(f"Running MultiQC on '{input_dir}' and saving results to '{output_dir}'...")

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error running MultiQC: {e}")
    except FileNotFoundError as e:
        # Raised when the multiqc executable is not on PATH; report it like any other failure.
        print(f"Error running MultiQC: {e}")
    else:
        print(f"MultiQC report generated successfully in: {output_dir}")
28
+
@@ -0,0 +1,43 @@
1
+ ## separate_bam_by_bc
2
+
3
def separate_bam_by_bc(input_bam, output_prefix, bam_suffix, split_dir):
    """
    Separates an input BAM file on the BC SAM tag values.

    One output BAM is written per distinct BC value, named
    ``{output_prefix}_{input base name without suffix}_{BC value}{bam_suffix}`` inside split_dir.
    Reads without a BC tag are reported on stdout and not written anywhere.

    Parameters:
        input_bam (str): File path to the BAM file to split.
        output_prefix (str): A prefix to append to the output BAM.
        bam_suffix (str): A suffix to add to the bam file.
        split_dir (str): String indicating path to directory to split BAMs into

    Returns:
        None
            Writes out split BAM files.
    """
    import os
    import pysam

    bam_base = os.path.basename(input_bam)
    bam_base_minus_suffix = bam_base.split(bam_suffix)[0]

    # One open writer per barcode value seen so far
    output_files = {}
    try:
        with pysam.AlignmentFile(input_bam, "rb") as bam:
            for read in bam:
                try:
                    # Get the barcode tag value
                    bc_tag = read.get_tag("BC", with_value_type=True)[0]
                except KeyError:
                    print(f"BC tag not present for read: {read.query_name}")
                    continue

                # Open the output BAM for this barcode the first time it is seen
                if bc_tag not in output_files:
                    output_path = os.path.join(split_dir, f"{output_prefix}_{bam_base_minus_suffix}_{bc_tag}{bam_suffix}")
                    output_files[bc_tag] = pysam.AlignmentFile(output_path, "wb", header=bam.header)
                output_files[bc_tag].write(read)
    finally:
        # Close every writer even if reading or writing failed part-way through,
        # so no file handle is leaked.
        for output_file in output_files.values():
            output_file.close()
@@ -0,0 +1,32 @@
1
+ ## split_and_index_BAM
2
+
3
def split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix):
    """
    A wrapper function for splitting BAMS and indexing them.

    Splits ``aligned_sorted_BAM + bam_suffix`` on barcode value into split_dir (output names
    are prefixed with the current date string), then runs ``samtools index`` on every file in
    split_dir whose name ends with bam_suffix.

    Parameters:
        aligned_sorted_BAM (str): A string representing the file path of the aligned_sorted BAM file (without the suffix).
        split_dir (str): A string representing the file path to the directory to split the BAMs into.
        bam_suffix (str): A suffix to add to the bam file.

    Returns:
        bam_files (list of str): Paths of the BAM files found in split_dir that were passed to samtools index.
    """
    import glob
    import os
    import subprocess
    from .. import readwrite
    from .separate_bam_by_bc import separate_bam_by_bc

    aligned_sorted_output = aligned_sorted_BAM + bam_suffix
    file_prefix = readwrite.date_string()
    separate_bam_by_bc(aligned_sorted_output, file_prefix, bam_suffix, split_dir)

    # Make a BAM index file for the BAMs in that directory
    bam_pattern = '*' + bam_suffix
    candidates = glob.glob(os.path.join(split_dir, bam_pattern))
    # Skip index files; test the file name ending, not the whole path, so a directory
    # name that happens to contain '.bai' does not exclude every BAM.
    bam_files = [bam for bam in candidates if not bam.endswith('.bai')]
    for input_file in bam_files:
        result = subprocess.run(["samtools", "index", input_file])
        if result.returncode != 0:
            # Keep going (best-effort), but do not hide the failure.
            print(f"Warning: samtools index failed for {input_file} (exit code {result.returncode})")

    return bam_files
@@ -0,0 +1,106 @@
1
+ ## readwrite ##
2
+
3
+ ######################################################################################################
4
+ ## Datetime functionality
5
def date_string():
    """
    Return today's local date as a six-digit YYMMDD string (two-digit year).
    """
    from datetime import datetime
    # %y is the two-digit year, i.e. the four-digit year with the century removed
    return datetime.now().strftime("%y%m%d")
14
+
15
def time_string():
    """
    Return the current local time as an HH:MM:SS string.
    """
    from datetime import datetime
    return f"{datetime.now():%H:%M:%S}"
22
+ ######################################################################################################
23
+
24
+ ######################################################################################################
25
+ ## Numpy, Pandas, Anndata functionality
26
def adata_to_df(adata, layer=None):
    """
    Convert the data matrix of an adata object into a pandas DataFrame.

    Input: An adata object and, optionally, the name of the layer to use. When no layer is
        given, adata.X is used.
    Output: A dataframe of that matrix, indexed by adata.obs.index (rows) and
        adata.var.index (columns). Sparse matrices are converted to dense first.
    """
    import pandas as pd

    # Extract the data matrix from the given layer, falling back to X
    data_matrix = adata.layers[layer] if layer else adata.X

    # Matrices exposing toarray() (e.g. scipy sparse) must be densified before they can
    # be wrapped in a DataFrame with the obs/var labels.
    if hasattr(data_matrix, "toarray"):
        data_matrix = data_matrix.toarray()

    # Rows are observations (reads), columns are variables (positions)
    return pd.DataFrame(data_matrix, index=adata.obs.index, columns=adata.var.index)
46
+
47
def save_matrix(matrix, save_name):
    """
    Write a numpy matrix to disk as text.

    Input: A numpy matrix and a save_name (output path without the extension)
    Output: A txt file representation of the data matrix, written to "<save_name>.txt"
    """
    import numpy as np
    target = '{}.txt'.format(save_name)
    np.savetxt(target, matrix)
54
+
55
def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
    """
    Concatenate all h5ad files in the current working directory and, optionally, delete
    them after the final adata is written out.

    Files are selected when file_suffix occurs anywhere in their name, read in sorted name
    order, and combined with an outer join. The result is written gzip-compressed.

    Input:
        output_file: an output file path relative to the directory in which the function is called
        file_suffix: text a file name must contain to be treated as an input
        delete_inputs: when True, delete every matching file in the working directory
            except the output file after writing

    Raises:
        FileNotFoundError: If no file in the working directory matches file_suffix.
    """
    import os
    import warnings
    import anndata as ad

    # Silence anndata's runtime warnings (note: this changes the process-wide warning filters)
    warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
    warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')

    cwd = os.getcwd()
    suffix = file_suffix
    # Resolve the output once so it can be recognised however the caller spelled the path
    output_abspath = os.path.abspath(output_file)

    # Collect the matching file names in the working directory, sorted by name
    hdfs = sorted(hdf for hdf in os.listdir(cwd) if suffix in hdf)
    print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
    if not hdfs:
        raise FileNotFoundError(
            "No files containing '{0}' found in {1}; nothing to concatenate".format(suffix, cwd)
        )

    # Iterate over all of the hdf5 files and concatenate them.
    final_adata = None
    for hdf in hdfs:
        print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
        temp_adata = ad.read_h5ad(hdf)
        # Explicit None check: an AnnData object's truth value is not a reliable
        # "has been initialised" signal.
        if final_adata is not None:
            print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
            final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
        else:
            print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
            final_adata = temp_adata
    print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
    final_adata.write_h5ad(output_file, compression='gzip')

    # Delete the individual h5ad files and only keep the final concatenated file
    if delete_inputs:
        for hdf in os.listdir(cwd):
            if suffix not in hdf:
                continue
            # Compare absolute paths so the freshly written output is kept even when
            # output_file was given as './name' or as an absolute path.
            if os.path.abspath(hdf) == output_abspath:
                continue
            try:
                os.remove(hdf)
                print(f"Deleted file: {hdf}")
            except OSError as e:
                print(f"Error deleting file {hdf}: {e}")
    else:
        print('Keeping input files')
106
+ ######################################################################################################