smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/fasta_functions.py (new file)
@@ -0,0 +1,255 @@
+ from ..readwrite import make_dirs, time_string
+
+ import os
+ import subprocess
+ from pathlib import Path
+
+ from typing import Union, List, Dict, Tuple
+
+ import numpy as np
+ import gzip
+
+ from Bio import SeqIO
+ from Bio.SeqRecord import SeqRecord
+ from Bio.Seq import Seq
+ from pyfaidx import Fasta
+ import pysam
+
+ from concurrent.futures import ProcessPoolExecutor
+ from itertools import chain
+
+ def _convert_FASTA_record(record, modification_type, strand, unconverted):
+     """Converts a FASTA record based on modification type and strand."""
+     conversion_maps = {
+         ('5mC', 'top'): ('C', 'T'),
+         ('5mC', 'bottom'): ('G', 'A'),
+         ('6mA', 'top'): ('A', 'G'),
+         ('6mA', 'bottom'): ('T', 'C')
+     }
+
+     sequence = str(record.seq).upper()
+
+     if modification_type == unconverted:
+         return SeqRecord(Seq(sequence), id=f"{record.id}_{modification_type}_top", description=record.description)
+
+     if (modification_type, strand) not in conversion_maps:
+         raise ValueError(f"Invalid combination: {modification_type}, {strand}")
+
+     original_base, converted_base = conversion_maps[(modification_type, strand)]
+     new_seq = sequence.replace(original_base, converted_base)
+
+     return SeqRecord(Seq(new_seq), id=f"{record.id}_{modification_type}_{strand}", description=record.description)
+
+ def _process_fasta_record(args):
+     """
+     Processes a single FASTA record for parallel execution.
+     Args:
+         args (tuple): (record, modification_types, strands, unconverted)
+     Returns:
+         list of modified SeqRecord objects.
+     """
+     record, modification_types, strands, unconverted = args
+     modified_records = []
+
+     for modification_type in modification_types:
+         for i, strand in enumerate(strands):
+             if i > 0 and modification_type == unconverted:
+                 continue  # Ensure the unconverted record is added only once
+
+             modified_records.append(_convert_FASTA_record(record, modification_type, strand, unconverted))
+
+     return modified_records
+
+ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta, num_threads=4, chunk_size=500):
+     """
+     Converts an input FASTA file and writes a new converted FASTA file efficiently.
+
+     Parameters:
+         input_fasta (str): Path to the unconverted FASTA file.
+         modification_types (list): List of modification types ('5mC', '6mA', or unconverted).
+         strands (list): List of strands ('top', 'bottom').
+         output_fasta (str): Path to the converted FASTA output file.
+         num_threads (int): Number of parallel worker processes to use.
+         chunk_size (int): Number of records to process per write batch.
+
+     Returns:
+         None (writes the converted FASTA file).
+     """
+     unconverted = modification_types[0]
+     input_fasta = str(input_fasta)
+     output_fasta = str(output_fasta)
+
+     # Detect if input is gzipped
+     open_func = gzip.open if input_fasta.endswith('.gz') else open
+     file_mode = 'rt' if input_fasta.endswith('.gz') else 'r'
+
+     def _fasta_record_generator():
+         """Lazily yields FASTA records from file."""
+         with open_func(input_fasta, file_mode) as handle:
+             for record in SeqIO.parse(handle, 'fasta'):
+                 yield record
+
+     with open(output_fasta, 'w') as output_handle, ProcessPoolExecutor(max_workers=num_threads) as executor:
+         # Process records in parallel using a named function (avoiding lambda)
+         results = executor.map(
+             _process_fasta_record,
+             ((record, modification_types, strands, unconverted) for record in _fasta_record_generator())
+         )
+
+         buffer = []
+         for modified_records in results:
+             buffer.extend(modified_records)
+
+             # Write out in chunks to save memory
+             if len(buffer) >= chunk_size:
+                 SeqIO.write(buffer, output_handle, 'fasta')
+                 buffer.clear()
+
+         # Write any remaining records
+         if buffer:
+             SeqIO.write(buffer, output_handle, 'fasta')
+
+ def index_fasta(fasta: str | Path, write_chrom_sizes: bool = True) -> Path:
+     fasta = Path(fasta)
+     pysam.faidx(str(fasta))  # creates <fasta>.fai
+
+     fai = fasta.with_suffix(fasta.suffix + ".fai")
+     if write_chrom_sizes:
+         chrom_sizes = fasta.with_suffix(".chrom.sizes")
+         with fai.open() as f_in, chrom_sizes.open("w") as out:
+             for line in f_in:
+                 chrom, size = line.split()[:2]
+                 out.write(f"{chrom}\t{size}\n")
+         return chrom_sizes
+     return fai
+
+ def get_chromosome_lengths(fasta: str | Path) -> Path:
+     """
+     Create (or reuse) <fasta>.chrom.sizes, derived from the FASTA index.
+     """
+     fasta = Path(fasta)
+     fai = fasta.with_suffix(fasta.suffix + ".fai")
+     if not fai.exists():
+         index_fasta(fasta, write_chrom_sizes=True)  # will also create .chrom.sizes
+     chrom_sizes = fasta.with_suffix(".chrom.sizes")
+     if chrom_sizes.exists():
+         print(f"Using existing chrom length file: {chrom_sizes}")
+         return chrom_sizes
+
+     # Build chrom.sizes from the .fai index
+     with fai.open() as f_in, chrom_sizes.open("w") as out:
+         for line in f_in:
+             chrom, size = line.split()[:2]
+             out.write(f"{chrom}\t{size}\n")
+     return chrom_sizes
+
+ def get_native_references(fasta_file: str | Path) -> Dict[str, Tuple[int, str]]:
+     """
+     Return {record_id: (length, sequence)} from a FASTA.
+     Specific to direct methylation workflows.
+     """
+     fasta_file = Path(fasta_file)
+     print(f"{time_string()}: Opening FASTA file {fasta_file}")
+     record_dict: Dict[str, Tuple[int, str]] = {}
+     with fasta_file.open("r") as f:
+         for rec in SeqIO.parse(f, "fasta"):
+             seq = str(rec.seq).upper()
+             record_dict[rec.id] = (len(seq), seq)
+     return record_dict
+
+ def find_conversion_sites(fasta_file, modification_type, conversions, deaminase_footprinting=False):
+     """
+     Finds genomic coordinates of modified bases (5mC or 6mA) in a reference FASTA file.
+
+     Parameters:
+         fasta_file (str): Path to the converted reference FASTA.
+         modification_type (str): Modification type ('5mC' or '6mA') or 'unconverted'.
+         conversions (list): List of conversion types. The first element is the unconverted record type.
+         deaminase_footprinting (bool): Whether the footprinting was done with a direct deamination chemistry.
+
+     Returns:
+         dict: Dictionary where keys are **both unconverted & converted record names**.
+             Values contain:
+             [sequence length, top strand coordinates, bottom strand coordinates, sequence, complement sequence].
+     """
+     unconverted = conversions[0]
+     record_dict = {}
+
+     # Define base mapping based on modification type
+     base_mappings = {
+         '5mC': ('C', 'G'),  # Cytosine and Guanine
+         '6mA': ('A', 'T')   # Adenine and Thymine
+     }
+
+     # Read FASTA file and process records
+     with open(fasta_file, "r") as f:
+         for record in SeqIO.parse(f, "fasta"):
+             if unconverted in record.id or deaminase_footprinting:
+                 sequence = str(record.seq).upper()
+                 complement = str(record.seq.complement()).upper()
+                 sequence_length = len(sequence)
+
+                 # Unconverted case: store the full sequence without coordinate filtering
+                 if modification_type == unconverted:
+                     record_dict[record.id] = [sequence_length, [], [], sequence, complement]
+
+                 # Process converted records: extract modified base positions
+                 elif modification_type in base_mappings:
+                     top_base, bottom_base = base_mappings[modification_type]
+                     seq_array = np.array(list(sequence))
+                     top_strand_coordinates = np.where(seq_array == top_base)[0].tolist()
+                     bottom_strand_coordinates = np.where(seq_array == bottom_base)[0].tolist()
+
+                     record_dict[record.id] = [sequence_length, top_strand_coordinates, bottom_strand_coordinates, sequence, complement]
+
+                 else:
+                     raise ValueError(f"Invalid modification_type: {modification_type}. Choose '5mC', '6mA', or 'unconverted'.")
+
+     return record_dict
+
+ def subsample_fasta_from_bed(
+     input_FASTA: str | Path,
+     input_bed: str | Path,
+     output_directory: str | Path,
+     output_FASTA: str | Path
+ ) -> None:
+     """
+     Take a genome-wide FASTA file and a BED file containing
+     coordinate windows of interest. Outputs a subsampled FASTA.
+     """
+
+     # Normalize everything to Path
+     input_FASTA = Path(input_FASTA)
+     input_bed = Path(input_bed)
+     output_directory = Path(output_directory)
+     output_FASTA = Path(output_FASTA)
+
+     # Ensure output directory exists
+     output_directory.mkdir(parents=True, exist_ok=True)
+
+     output_FASTA_path = output_directory / output_FASTA
+
+     # Load the FASTA file using pyfaidx
+     fasta = Fasta(str(input_FASTA))  # pyfaidx requires string paths
+
+     # Open BED + output FASTA
+     with input_bed.open("r") as bed, output_FASTA_path.open("w") as out_fasta:
+         for line in bed:
+             fields = line.strip().split()
+             chrom = fields[0]
+             start = int(fields[1])  # BED is 0-based
+             end = int(fields[2])    # BED end is exclusive
+             desc = " ".join(fields[3:]) if len(fields) > 3 else ""
+
+             if chrom not in fasta:
+                 print(f"Warning: {chrom} not found in FASTA")
+                 continue
+
+             # pyfaidx slicing is 0-based and half-open, so BED coordinates map directly
+             sequence = fasta[chrom][start:end].seq
+
+             header = f">{chrom}:{start}-{end}"
+             if desc:
+                 header += f" {desc}"
+
+             out_fasta.write(f"{header}\n{sequence}\n")
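
For orientation, a minimal usage sketch of the new fasta_functions helpers added in this hunk. The file paths (ref.fa, windows.bed, the out/ directory) are hypothetical, and the sketch assumes the functions are importable from smftools.informatics.fasta_functions as shown above:

# Hypothetical paths; illustrates the fasta_functions API from the hunk above.
from smftools.informatics.fasta_functions import (
    generate_converted_FASTA,
    get_chromosome_lengths,
    subsample_fasta_from_bed,
)

# Restrict the reference to BED windows of interest before conversion.
subsample_fasta_from_bed("ref.fa", "windows.bed", "out", "ref_subset.fa")

# Write unconverted records plus 5mC-converted top/bottom strand records;
# the first entry of modification_types is treated as the unconverted type.
generate_converted_FASTA(
    input_fasta="out/ref_subset.fa",
    modification_types=["unconverted", "5mC"],
    strands=["top", "bottom"],
    output_fasta="out/ref_subset_converted.fa",
    num_threads=4,
)

# Index the converted reference and emit a .chrom.sizes file next to it.
chrom_sizes = get_chromosome_lengths("out/ref_subset_converted.fa")
print(f"chrom sizes written to {chrom_sizes}")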
smftools/informatics/h5ad_functions.py (new file)
@@ -0,0 +1,197 @@
+ from pathlib import Path
+ import pandas as pd
+ import numpy as np
+ import scipy.sparse as sp
+ from typing import Optional, List, Dict, Union
+
+ def add_demux_type_annotation(
+     adata,
+     double_demux_source,
+     sep: str = "\t",
+     read_id_col: str = "read_id",
+     barcode_col: str = "barcode",
+ ):
+     """
+     Add adata.obs["demux_type"]:
+       - "double" if the read_id appears in the *double demux* TSV
+       - "single" otherwise
+
+     Rows where barcode == "unclassified" in the demux TSV are ignored.
+
+     Parameters
+     ----------
+     adata : AnnData
+         AnnData object whose obs_names are read_ids.
+     double_demux_source : str | Path | list[str]
+         Either:
+           - path to a TSV/TXT of dorado demux results
+           - a list of read_ids
+     """
+
+     # -----------------------------
+     # If it's a file → load TSV
+     # -----------------------------
+     if isinstance(double_demux_source, (str, Path)):
+         file_path = Path(double_demux_source)
+         if not file_path.exists():
+             raise FileNotFoundError(f"File does not exist: {file_path}")
+
+         df = pd.read_csv(file_path, sep=sep, dtype=str)
+
+         # If the file has only one column → treat it as a simple read list
+         if df.shape[1] == 1:
+             read_ids = df.iloc[:, 0].tolist()
+         else:
+             # Validate columns
+             if read_id_col not in df.columns:
+                 raise ValueError(f"TSV must contain a '{read_id_col}' column.")
+             if barcode_col not in df.columns:
+                 raise ValueError(f"TSV must contain a '{barcode_col}' column.")
+
+             # Drop unclassified reads
+             df = df[df[barcode_col].str.lower() != "unclassified"]
+
+             # Extract read_ids
+             read_ids = df[read_id_col].tolist()
+
+     # -----------------------------
+     # If the user supplied a list of ids
+     # -----------------------------
+     else:
+         read_ids = list(double_demux_source)
+
+     # Deduplicate for fast membership checks
+     double_set = set(read_ids)
+
+     # Boolean lookup in AnnData
+     is_double = adata.obs_names.isin(double_set)
+
+     adata.obs["demux_type"] = np.where(is_double, "double", "single")
+     adata.obs["demux_type"] = adata.obs["demux_type"].astype("category")
+
+     return adata
+
+ def add_read_length_and_mapping_qc(
+     adata,
+     bam_files: Optional[List[str]] = None,
+     read_metrics: Optional[Dict[str, Union[list, tuple]]] = None,
+     uns_flag: str = "add_read_length_and_mapping_qc_performed",
+     extract_read_features_from_bam_callable = None,
+     bypass: bool = False,
+     force_redo: bool = True
+ ):
+     """
+     Populate adata.obs with read/mapping QC columns.
+
+     Parameters
+     ----------
+     adata
+         AnnData to annotate (modified in-place).
+     bam_files
+         Optional list of BAM files to extract metrics from. Ignored if read_metrics is supplied.
+     read_metrics
+         Optional dict mapping obs_name -> [read_length, read_quality, reference_length, mapped_length, mapping_quality].
+         If provided, it is used directly and bam_files is ignored.
+     uns_flag
+         Key in adata.uns used to record that QC was performed.
+     extract_read_features_from_bam_callable
+         Optional callable(bam_path) -> dict mapping read_name -> list/tuple of metrics.
+         If not provided and bam_files is given, the function will attempt to call
+         `extract_read_features_from_bam` from the global namespace (your existing helper).
+     bypass
+         If True, skip annotation entirely.
+     force_redo
+         If True, recompute even if uns_flag is already set.
+
+     Returns
+     -------
+     None (mutates adata in-place)
+     """
+
+     # Only run if not already performed
+     already = bool(adata.uns.get(uns_flag, False))
+     if (already and not force_redo) or bypass:
+         # QC already performed; nothing to do
+         return
+
+     # Build the read_metrics dict either from the provided arg or by extracting from BAM files
+     if read_metrics is None:
+         read_metrics = {}
+         if bam_files:
+             extractor = extract_read_features_from_bam_callable or globals().get("extract_read_features_from_bam")
+             if extractor is None:
+                 raise ValueError("No `read_metrics` provided and `extract_read_features_from_bam` not found.")
+             for bam in bam_files:
+                 bam_read_metrics = extractor(bam)
+                 if not isinstance(bam_read_metrics, dict):
+                     raise ValueError(f"extract_read_features_from_bam returned non-dict for {bam}")
+                 read_metrics.update(bam_read_metrics)
+
+     # Convert read_metrics dict -> DataFrame (rows = read id).
+     # Values may be lists/tuples or scalars; prefer lists/tuples with 5 entries.
+     if len(read_metrics) == 0:
+         # Fill with NaNs
+         n = adata.n_obs
+         adata.obs['read_length'] = np.full(n, np.nan)
+         adata.obs['mapped_length'] = np.full(n, np.nan)
+         adata.obs['reference_length'] = np.full(n, np.nan)
+         adata.obs['read_quality'] = np.full(n, np.nan)
+         adata.obs['mapping_quality'] = np.full(n, np.nan)
+     else:
+         # Build the DataFrame robustly:
+         # convert values to lists where possible, else replicate the scalar
+         max_cols = 5
+         rows = {}
+         for k, v in read_metrics.items():
+             if isinstance(v, (list, tuple, np.ndarray)):
+                 vals = list(v)
+             else:
+                 # scalar -> replicate into 5 columns to preserve original behavior
+                 vals = [v] * max_cols
+             # Ensure length >= 5
+             if len(vals) < max_cols:
+                 vals = vals + [np.nan] * (max_cols - len(vals))
+             rows[k] = vals[:max_cols]
+
+         df = pd.DataFrame.from_dict(rows, orient='index', columns=[
+             'read_length', 'read_quality', 'reference_length', 'mapped_length', 'mapping_quality'
+         ])
+
+         # Reindex to adata.obs_names so order matches adata.
+         # obs_names not present as keys in df become NaN.
+         df_reindexed = df.reindex(adata.obs_names).astype(float)
+
+         adata.obs['read_length'] = df_reindexed['read_length'].values
+         adata.obs['mapped_length'] = df_reindexed['mapped_length'].values
+         adata.obs['reference_length'] = df_reindexed['reference_length'].values
+         adata.obs['read_quality'] = df_reindexed['read_quality'].values
+         adata.obs['mapping_quality'] = df_reindexed['mapping_quality'].values
+
+     # Compute ratio columns safely (avoid divide-by-zero and preserve NaN)
+     rl = pd.to_numeric(adata.obs['read_length'], errors='coerce').to_numpy(dtype=float)
+     ref_len = pd.to_numeric(adata.obs['reference_length'], errors='coerce').to_numpy(dtype=float)
+     mapped_len = pd.to_numeric(adata.obs['mapped_length'], errors='coerce').to_numpy(dtype=float)
+
+     # Safe divisions: use np.where to avoid warnings and replace inf with nan
+     with np.errstate(divide='ignore', invalid='ignore'):
+         rl_to_ref = np.where((ref_len != 0) & np.isfinite(ref_len), rl / ref_len, np.nan)
+         mapped_to_ref = np.where((ref_len != 0) & np.isfinite(ref_len), mapped_len / ref_len, np.nan)
+         mapped_to_read = np.where((rl != 0) & np.isfinite(rl), mapped_len / rl, np.nan)
+
+     adata.obs['read_length_to_reference_length_ratio'] = rl_to_ref
+     adata.obs['mapped_length_to_reference_length_ratio'] = mapped_to_ref
+     adata.obs['mapped_length_to_read_length_ratio'] = mapped_to_read
+
+     # Add read-level raw modification signal: sum over X rows.
+     # X.sum(axis=1) works for both scipy.sparse and dense X; for sparse it
+     # returns an (n_obs, 1) matrix, which np.asarray(...).ravel() flattens.
+     X = adata.X
+     raw_sig = np.asarray(X.sum(axis=1)).ravel()
+
+     adata.obs['Raw_modification_signal'] = raw_sig
+
+     # Mark as done
+     adata.uns[uns_flag] = True
+
+     return None
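
A corresponding minimal sketch for the new h5ad_functions helpers, assuming an AnnData object whose obs_names are read ids; the input file names (reads.h5ad, double_demux.tsv) and the constant metrics values are hypothetical:

# Hypothetical inputs; illustrates the h5ad_functions API from the hunk above.
import anndata as ad
from smftools.informatics.h5ad_functions import (
    add_demux_type_annotation,
    add_read_length_and_mapping_qc,
)

adata = ad.read_h5ad("reads.h5ad")

# Flag reads that were also assigned a barcode in a second (double) demux pass.
add_demux_type_annotation(adata, "double_demux.tsv")

# Annotate QC columns from a precomputed per-read metrics dict:
# obs_name -> [read_length, read_quality, reference_length, mapped_length, mapping_quality]
metrics = {rid: [1000.0, 30.0, 1200.0, 950.0, 60.0] for rid in adata.obs_names}
add_read_length_and_mapping_qc(adata, read_metrics=metrics)

print(adata.obs[["demux_type", "read_length", "mapped_length_to_read_length_ratio"]].head())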