smftools 0.3.1__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. smftools/_version.py +1 -1
  2. smftools/cli/chimeric_adata.py +1563 -0
  3. smftools/cli/helpers.py +18 -2
  4. smftools/cli/hmm_adata.py +18 -1
  5. smftools/cli/latent_adata.py +522 -67
  6. smftools/cli/load_adata.py +2 -2
  7. smftools/cli/preprocess_adata.py +32 -93
  8. smftools/cli/recipes.py +26 -0
  9. smftools/cli/spatial_adata.py +23 -109
  10. smftools/cli/variant_adata.py +423 -0
  11. smftools/cli_entry.py +41 -5
  12. smftools/config/conversion.yaml +0 -10
  13. smftools/config/deaminase.yaml +3 -0
  14. smftools/config/default.yaml +49 -13
  15. smftools/config/experiment_config.py +96 -3
  16. smftools/constants.py +4 -0
  17. smftools/hmm/call_hmm_peaks.py +1 -1
  18. smftools/informatics/binarize_converted_base_identities.py +2 -89
  19. smftools/informatics/converted_BAM_to_adata.py +53 -13
  20. smftools/informatics/h5ad_functions.py +83 -0
  21. smftools/informatics/modkit_extract_to_adata.py +4 -0
  22. smftools/plotting/__init__.py +26 -12
  23. smftools/plotting/autocorrelation_plotting.py +22 -4
  24. smftools/plotting/chimeric_plotting.py +1893 -0
  25. smftools/plotting/classifiers.py +28 -14
  26. smftools/plotting/general_plotting.py +58 -3362
  27. smftools/plotting/hmm_plotting.py +1586 -2
  28. smftools/plotting/latent_plotting.py +804 -0
  29. smftools/plotting/plotting_utils.py +243 -0
  30. smftools/plotting/position_stats.py +16 -8
  31. smftools/plotting/preprocess_plotting.py +281 -0
  32. smftools/plotting/qc_plotting.py +8 -3
  33. smftools/plotting/spatial_plotting.py +1134 -0
  34. smftools/plotting/variant_plotting.py +1231 -0
  35. smftools/preprocessing/__init__.py +3 -0
  36. smftools/preprocessing/append_base_context.py +1 -1
  37. smftools/preprocessing/append_mismatch_frequency_sites.py +35 -6
  38. smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
  39. smftools/preprocessing/append_variant_call_layer.py +480 -0
  40. smftools/preprocessing/flag_duplicate_reads.py +4 -4
  41. smftools/preprocessing/invert_adata.py +1 -0
  42. smftools/readwrite.py +109 -85
  43. smftools/tools/__init__.py +6 -0
  44. smftools/tools/calculate_knn.py +121 -0
  45. smftools/tools/calculate_nmf.py +18 -7
  46. smftools/tools/calculate_pca.py +180 -0
  47. smftools/tools/calculate_umap.py +70 -154
  48. smftools/tools/position_stats.py +4 -4
  49. smftools/tools/rolling_nn_distance.py +640 -3
  50. smftools/tools/sequence_alignment.py +140 -0
  51. smftools/tools/tensor_factorization.py +52 -4
  52. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/METADATA +3 -1
  53. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/RECORD +56 -42
  54. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
  55. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
  56. {smftools-0.3.1.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,480 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ import numpy as np
6
+
7
+ from smftools.constants import MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT
8
+ from smftools.logging_utils import get_logger
9
+
10
+ if TYPE_CHECKING:
11
+ import anndata as ad
12
+
13
+ logger = get_logger(__name__)
14
+
15
+
16
def append_variant_call_layer(
    adata: "ad.AnnData",
    seq1_column: str,
    seq2_column: str,
    seq1_converted_column: str | None = None,
    seq2_converted_column: str | None = None,
    sequence_layer: str = "sequence_integer_encoding",
    read_span_layer: str = "read_span_mask",
    reference_col: str = "Reference_strand",
    output_prefix: str | None = None,
    uns_flag: str = "append_variant_call_layer_performed",
    force_redo: bool = False,
    bypass: bool = False,
) -> None:
    """Append a layer recording per-read, per-position variant calls at reference mismatch sites.

    Uses the substitution map from ``append_sequence_mismatch_annotations`` to
    correctly handle coordinate shifts caused by indels between references.
    For each substitution, reads aligned to ref1 are checked at ref1's var index,
    and reads aligned to ref2 are checked at ref2's var index.

    For conversion SMF, reads are mapped to *converted* references while the
    alignment that identifies mismatch positions uses *unconverted* sequences.
    When ``seq1_converted_column`` / ``seq2_converted_column`` are provided, each
    reference gets a **set** of acceptable bases at each mismatch position
    (unconverted + converted), since not every base converts in every read.
    A position is informative only if the two acceptable-base sets are disjoint.
    A read base matching either the unconverted or converted form of a reference
    counts as a match for that reference.

    Values in the output layer:
        1 = matches seq1 base(s)
        2 = matches seq2 base(s)
        0 = unknown (N, PAD, no coverage, or matches neither)
        -1 = not a mismatch position (or not informative after conversion)

    Args:
        adata: AnnData object. Mutated in place: adds one layer, three var
            columns, and sets ``uns_flag``.
        seq1_column: Column in ``adata.var`` with the first reference base per position (unconverted).
        seq2_column: Column in ``adata.var`` with the second reference base per position (unconverted).
        seq1_converted_column: Optional column in ``adata.var`` with the converted seq1 bases.
            When provided, both unconverted and converted bases are accepted as ref1 matches.
        seq2_converted_column: Optional column in ``adata.var`` with the converted seq2 bases.
        sequence_layer: Layer containing integer-encoded actual read bases.
        read_span_layer: Layer containing read span masks.
        reference_col: Obs column defining which reference each read is aligned to.
            NOTE(review): accessed via ``.cat.categories`` below, so it is assumed
            to be categorical — confirm upstream always stores it as category dtype.
        output_prefix: Prefix for the output layer name. Defaults to ``{seq1_column}__{seq2_column}``.
        uns_flag: Flag in ``adata.uns`` indicating prior completion.
        force_redo: Whether to rerun even if ``uns_flag`` is set.
        bypass: Whether to skip processing.
    """
    # Early exits: explicit bypass, or work already recorded in uns and no redo requested.
    if bypass:
        return

    already = bool(adata.uns.get(uns_flag, False))
    if already and not force_redo:
        return

    if sequence_layer not in adata.layers:
        logger.debug("Sequence layer '%s' not found; skipping variant call layer.", sequence_layer)
        return

    output_prefix = output_prefix or f"{seq1_column}__{seq2_column}"
    layer_name = f"{output_prefix}_variant_call"

    # Get the substitution map from alignment annotations
    # (written by append_sequence_mismatch_annotations under this key).
    sub_map_key = f"{output_prefix}_substitution_map"
    sub_map = adata.uns.get(sub_map_key)
    # hasattr guard: sub_map may be a DataFrame, dict, or other container.
    if sub_map is None or (hasattr(sub_map, "__len__") and len(sub_map) == 0):
        logger.warning(
            "Substitution map '%s' not found or empty; skipping variant call layer.",
            sub_map_key,
        )
        return

    import pandas as pd

    # The substitution map may round-trip through h5ad as either a DataFrame
    # or a plain dict of sequences; support both shapes.
    if isinstance(sub_map, pd.DataFrame):
        vi1_arr = sub_map["seq1_var_idx"].values
        vi2_arr = sub_map["seq2_var_idx"].values
        b1_arr = sub_map["seq1_base"].values
        b2_arr = sub_map["seq2_base"].values
    else:
        vi1_arr = np.asarray(sub_map.get("seq1_var_idx", []))
        vi2_arr = np.asarray(sub_map.get("seq2_var_idx", []))
        b1_arr = np.asarray(sub_map.get("seq1_base", []))
        b2_arr = np.asarray(sub_map.get("seq2_base", []))
    n_subs = len(vi1_arr)
    if n_subs == 0:
        logger.warning("Substitution map is empty; skipping variant call layer.")
        return

    mismatch_map = adata.uns.get("mismatch_integer_encoding_map", {})
    if not mismatch_map:
        logger.debug("Mismatch encoding map not found; skipping variant call layer.")
        return

    # Integer codes that can never support a call: ambiguous base and padding.
    n_value = int(mismatch_map.get("N", MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["N"]))
    pad_value = int(mismatch_map.get("PAD", MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT["PAD"]))
    uninformative = {n_value, pad_value}

    # Build base -> int lookup (real bases only; N/PAD are excluded so they can
    # never be mistaken for a reference match below).
    base_to_int: dict[str, int] = {}
    for base, value in mismatch_map.items():
        if base not in {"N", "PAD"} and isinstance(value, (int, np.integer)):
            base_to_int[base.upper()] = int(value)

    # Reverse lookup: int -> base letter (for storing readable annotations in var)
    int_to_base: dict[int, str] = {v: k for k, v in base_to_int.items()}

    n_obs, n_vars = adata.shape
    # Default -1 = "not a mismatch position"; only substitution columns are overwritten.
    result = np.full((n_obs, n_vars), -1, dtype=np.int8)

    # Per-position var annotations
    ref1_acceptable_bases = [""] * n_vars
    ref2_acceptable_bases = [""] * n_vars
    is_informative = np.zeros(n_vars, dtype=bool)

    seq_matrix = np.asarray(adata.layers[sequence_layer])
    has_span = read_span_layer in adata.layers
    if has_span:
        span_matrix = np.asarray(adata.layers[read_span_layer])

    # Determine which reference each read belongs to
    ref_labels = adata.obs[reference_col].values
    ref_categories = adata.obs[reference_col].cat.categories

    # Map each reference category to seq1 or seq2.
    # Column names like "6B6_top_strand_FASTA_base" have stem "6B6_top" matching ref categories.
    suffix = "_strand_FASTA_base"
    seq1_stem = seq1_column[: -len(suffix)] if seq1_column.endswith(suffix) else seq1_column
    seq2_stem = seq2_column[: -len(suffix)] if seq2_column.endswith(suffix) else seq2_column
    ref_to_seq: dict[str, int] = {}  # ref_category -> 1 or 2
    for ref in ref_categories:
        if ref == seq1_stem:
            ref_to_seq[ref] = 1
        elif ref == seq2_stem:
            ref_to_seq[ref] = 2
        else:
            # Unmatched categories are skipped later (their reads stay -1/uncalled).
            logger.debug(
                "Reference '%s' does not match seq1 stem '%s' or seq2 stem '%s'.",
                ref,
                seq1_stem,
                seq2_stem,
            )
    logger.info("Reference-to-sequence mapping: %s", ref_to_seq)

    # Build per-reference acceptable base sets.
    # For conversion SMF, a read base can match either the unconverted or converted
    # form of a reference. A substitution is informative only when the two sets are disjoint.
    use_converted = bool(seq1_converted_column and seq2_converted_column)
    if use_converted:
        # Degrade gracefully if either converted column is missing rather than raising.
        if seq1_converted_column not in adata.var:
            logger.warning(
                "Converted column '%s' not in adata.var; falling back to unconverted.",
                seq1_converted_column,
            )
            use_converted = False
        elif seq2_converted_column not in adata.var:
            logger.warning(
                "Converted column '%s' not in adata.var; falling back to unconverted.",
                seq2_converted_column,
            )
            use_converted = False
        else:
            # conv1_bases/conv2_bases are only bound when use_converted survives
            # the checks above — they are only read under `if use_converted` below.
            conv1_bases = adata.var[seq1_converted_column].values
            conv2_bases = adata.var[seq2_converted_column].values
            logger.info(
                "Using converted columns for variant calling: '%s', '%s'.",
                seq1_converted_column,
                seq2_converted_column,
            )

    logger.info("Processing %d substitutions for variant calling.", n_subs)

    n_informative = 0
    n_collapsed = 0
    for i in range(n_subs):
        # Each substitution carries its own var index per reference so indels
        # between the two references are handled correctly.
        vi1 = int(vi1_arr[i])
        vi2 = int(vi2_arr[i])

        # Unconverted bases (always available from substitution map)
        ub1 = base_to_int.get(str(b1_arr[i]).upper())
        ub2 = base_to_int.get(str(b2_arr[i]).upper())
        if ub1 is None or ub2 is None:
            # Base not in the encoding map (e.g. ambiguity code) — cannot call here.
            continue

        # Build sets of acceptable integer-encoded bases for each reference
        ref1_ints: set[int] = {ub1}
        ref2_ints: set[int] = {ub2}
        if use_converted:
            # Converted base at each reference's own coordinate; conversion may
            # map a base to one absent from the encoding map, hence the None checks.
            cb1 = base_to_int.get(str(conv1_bases[vi1]).upper())
            cb2 = base_to_int.get(str(conv2_bases[vi2]).upper())
            if cb1 is not None:
                ref1_ints.add(cb1)
            if cb2 is not None:
                ref2_ints.add(cb2)

        # Store acceptable bases at the primary var index for this substitution
        ref1_bases_str = ",".join(sorted(int_to_base.get(v, "?") for v in ref1_ints))
        ref2_bases_str = ",".join(sorted(int_to_base.get(v, "?") for v in ref2_ints))
        ref1_acceptable_bases[vi1] = ref1_bases_str
        ref2_acceptable_bases[vi2] = ref2_bases_str

        # Position is informative only if the acceptable base sets are disjoint
        # (conversion can collapse a substitution, e.g. C->T vs T).
        if ref1_ints & ref2_ints:
            n_collapsed += 1
            continue
        n_informative += 1
        is_informative[vi1] = True
        if vi2 != vi1:
            is_informative[vi2] = True

        # Pre-compute numpy arrays for fast membership testing
        ref1_arr = np.array(list(ref1_ints), dtype=seq_matrix.dtype)
        ref2_arr = np.array(list(ref2_ints), dtype=seq_matrix.dtype)

        # For each reference, use that reference's var index from the substitution map.
        # Reads aligned to seq1's reference use vi1; reads aligned to seq2's reference use vi2.
        # NOTE(review): ref_mask is recomputed for every substitution x reference
        # pair; hoisting the per-reference masks out of the loop would be an
        # equivalent speedup, deliberately left unchanged here.
        for ref in ref_categories:
            seq_id = ref_to_seq.get(ref)
            if seq_id is None:
                continue
            var_idx = vi1 if seq_id == 1 else vi2

            ref_mask = ref_labels == ref

            read_bases = seq_matrix[ref_mask, var_idx]
            if has_span:
                covered = span_matrix[ref_mask, var_idx] > 0
            else:
                # No span layer: assume every read covers every position.
                covered = np.ones(ref_mask.sum(), dtype=bool)

            # Assignment order matters: matches are written first, then the
            # final line forces uncovered and N/PAD bases back to 0 (unknown).
            calls = np.zeros(ref_mask.sum(), dtype=np.int8)
            calls[np.isin(read_bases, ref1_arr) & covered] = 1
            calls[np.isin(read_bases, ref2_arr) & covered] = 2
            calls[~covered | np.isin(read_bases, list(uninformative))] = 0

            result[ref_mask, var_idx] = calls

    logger.info(
        "Variant calling complete: %d informative, %d collapsed (overlapping base sets).",
        n_informative,
        n_collapsed,
    )

    # Persist per-position annotations and the call layer on the AnnData.
    adata.var[f"{output_prefix}_seq1_acceptable_bases"] = pd.Categorical(ref1_acceptable_bases)
    adata.var[f"{output_prefix}_seq2_acceptable_bases"] = pd.Categorical(ref2_acceptable_bases)
    adata.var[f"{output_prefix}_informative_site"] = is_informative

    adata.layers[layer_name] = result

    adata.uns[uns_flag] = True
    logger.info("Added variant call layer '%s'.", layer_name)
270
+
271
+
272
def append_variant_segment_layer(
    adata: "ad.AnnData",
    seq1_column: str,
    seq2_column: str,
    variant_call_layer: str | None = None,
    read_span_layer: str = "read_span_mask",
    reference_col: str = "Reference_strand",
    output_prefix: str | None = None,
    uns_flag: str = "append_variant_segment_layer_performed",
    force_redo: bool = False,
    bypass: bool = False,
) -> None:
    """Segment each read span into contiguous seq1/seq2 regions based on variant calls.

    Uses the per-position variant calls (1=seq1, 2=seq2) at informative mismatch
    sites to segment each read into contiguous regions. At boundaries where the
    class switches, the entire gap between the two flanking mismatch positions
    is marked as a transition zone (value 3), and one breakpoint is counted.

    Values in the output layer:
        0 = outside read span (no coverage), or a covered read with no
            informative sites at all
        1 = seq1 segment
        2 = seq2 segment
        3 = transition zone between different-class segments

    Args:
        adata: AnnData object. Mutated in place: adds one layer, four obs
            columns, and sets ``uns_flag``.
        seq1_column: Column in ``adata.var`` with the first reference base.
        seq2_column: Column in ``adata.var`` with the second reference base.
        variant_call_layer: Layer with per-position variant calls. Auto-derived if None.
        read_span_layer: Layer containing read span masks.
        reference_col: Obs column defining which reference each read is aligned to.
            NOTE(review): accessed via ``.cat.categories`` below, so it is assumed
            to be categorical — confirm upstream always stores it as category dtype.
        output_prefix: Prefix for output layer/obs names. Defaults to ``{seq1_column}__{seq2_column}``.
        uns_flag: Flag in ``adata.uns`` indicating prior completion.
        force_redo: Whether to rerun even if ``uns_flag`` is set.
        bypass: Whether to skip processing.
    """
    # Early exits: explicit bypass, or work already recorded in uns and no redo requested.
    if bypass:
        return

    already = bool(adata.uns.get(uns_flag, False))
    if already and not force_redo:
        return

    import pandas as pd

    output_prefix = output_prefix or f"{seq1_column}__{seq2_column}"
    if variant_call_layer is None:
        # Matches the layer name written by append_variant_call_layer.
        variant_call_layer = f"{output_prefix}_variant_call"

    if variant_call_layer not in adata.layers:
        logger.warning(
            "Variant call layer '%s' not found; skipping segment layer.", variant_call_layer
        )
        return

    # Unlike the call layer, segmentation cannot proceed without span info:
    # span boundaries define where segments start and end.
    has_span = read_span_layer in adata.layers
    if not has_span:
        logger.warning("Read span layer '%s' not found; skipping segment layer.", read_span_layer)
        return

    call_matrix = np.asarray(adata.layers[variant_call_layer])
    span_matrix = np.asarray(adata.layers[read_span_layer])
    n_obs, n_vars = adata.shape

    segment_layer = np.zeros((n_obs, n_vars), dtype=np.int8)
    breakpoint_counts = np.zeros(n_obs, dtype=np.int32)

    # Pass 1: per-read segment fill.
    for i in range(n_obs):
        span_row = span_matrix[i]
        call_row = call_matrix[i]

        # Find read span boundaries
        covered = np.where(span_row > 0)[0]
        if len(covered) == 0:
            continue
        span_start = int(covered[0])
        span_end = int(covered[-1])

        # Collect informative positions (call == 1 or 2) within span
        informative_mask = (call_row == 1) | (call_row == 2)
        informative_positions = np.where(informative_mask)[0]
        # Restrict to within span
        informative_positions = informative_positions[
            (informative_positions >= span_start) & (informative_positions <= span_end)
        ]

        if len(informative_positions) == 0:
            # No informative sites — leave as 0 (no segment info)
            continue

        # Sort by position (should already be sorted)
        informative_positions = np.sort(informative_positions)
        classes = call_row[informative_positions]  # 1 or 2

        n_bp = 0
        # Walk through consecutive informative positions and fill segments
        prev_pos = informative_positions[0]
        prev_cls = int(classes[0])

        # Extend first class leftward to span start
        segment_layer[i, span_start:prev_pos] = prev_cls

        for k in range(1, len(informative_positions)):
            cur_pos = informative_positions[k]
            cur_cls = int(classes[k])

            if cur_cls == prev_cls:
                # Same class — fill from prev_pos to cur_pos
                segment_layer[i, prev_pos:cur_pos] = prev_cls
            else:
                # Class transition — fill gap between informative sites with transition value
                segment_layer[i, prev_pos] = prev_cls
                segment_layer[i, prev_pos + 1 : cur_pos] = 3
                n_bp += 1

            prev_pos = cur_pos
            prev_cls = cur_cls

        # Fill the last informative position itself
        # (redundant with the slice below, which includes prev_pos; kept as-is).
        segment_layer[i, prev_pos] = prev_cls
        # Extend last class rightward to span end (inclusive)
        segment_layer[i, prev_pos : span_end + 1] = prev_cls
        # But re-mark breakpoints that may have been overwritten — they weren't,
        # since we only extend from prev_pos forward and breakpoints are before prev_pos.

        breakpoint_counts[i] = n_bp

    layer_name = f"{output_prefix}_variant_segments"
    adata.layers[layer_name] = segment_layer

    adata.obs[f"{output_prefix}_breakpoint_count"] = breakpoint_counts
    adata.obs[f"{output_prefix}_is_chimeric"] = breakpoint_counts > 0

    # Per-read chimeric flags from mismatch segments relative to each read's own reference.
    # A mismatch segment is a contiguous run where a seq1-aligned read is labeled as seq2,
    # or vice versa, within the read span.
    ref_labels = adata.obs[reference_col].values
    ref_categories = adata.obs[reference_col].cat.categories
    # Same stem-matching convention as append_variant_call_layer.
    suffix = "_strand_FASTA_base"
    seq1_stem = seq1_column[: -len(suffix)] if seq1_column.endswith(suffix) else seq1_column
    seq2_stem = seq2_column[: -len(suffix)] if seq2_column.endswith(suffix) else seq2_column

    ref_to_seq: dict[str, int] = {}
    for ref in ref_categories:
        if ref == seq1_stem:
            ref_to_seq[ref] = 1
        elif ref == seq2_stem:
            ref_to_seq[ref] = 2

    chimeric_flags = np.zeros(n_obs, dtype=bool)
    chimeric_types: list[str] = ["no_segment_mismatch"] * n_obs

    # Pass 2: classify each read by where its mismatch segments fall in the span.
    for i in range(n_obs):
        covered = np.where(span_matrix[i] > 0)[0]
        if len(covered) == 0:
            continue

        span_start = int(covered[0])
        span_end = int(covered[-1])
        in_span = segment_layer[i, span_start : span_end + 1]

        seq_id = ref_to_seq.get(ref_labels[i])
        if seq_id is None:
            # Read's reference matched neither stem — cannot define "mismatch" for it.
            continue

        # A mismatch segment is the *other* reference's class.
        mismatch_value = 2 if seq_id == 1 else 1
        mismatch_mask = in_span == mismatch_value
        if not np.any(mismatch_mask):
            continue

        # Run-boundary detection: starts where mask turns on, ends where it turns off.
        starts = np.where(mismatch_mask & ~np.r_[False, mismatch_mask[:-1]])[0]
        ends = np.where(mismatch_mask & ~np.r_[mismatch_mask[1:], False])[0]
        n_segments = len(starts)
        chimeric_flags[i] = True

        if n_segments >= 2:
            chimeric_types[i] = "multi_segment_mismatch"
        else:
            # Single mismatch run: classify by whether it touches a span edge.
            # Indices are relative to in_span, so 0 / len(in_span)-1 are the edges.
            start = int(starts[0])
            end = int(ends[0])
            if start == 0:
                chimeric_types[i] = "left_segment_mismatch"
            elif end == (len(in_span) - 1):
                chimeric_types[i] = "right_segment_mismatch"
            else:
                chimeric_types[i] = "middle_segment_mismatch"

    adata.obs["chimeric_variant_sites"] = chimeric_flags
    # Fixed category order so downstream plots/tabulations are stable even
    # when some categories are absent from this dataset.
    adata.obs["chimeric_variant_sites_type"] = pd.Categorical(
        chimeric_types,
        categories=[
            "no_segment_mismatch",
            "left_segment_mismatch",
            "right_segment_mismatch",
            "middle_segment_mismatch",
            "multi_segment_mismatch",
        ],
    )

    n_chimeric = int(np.sum(breakpoint_counts > 0))
    logger.info(
        "Variant segmentation complete: %d reads with breakpoints out of %d total.",
        n_chimeric,
        n_obs,
    )

    adata.uns[uns_flag] = True
    logger.info("Added variant segment layer '%s'.", layer_name)
@@ -844,11 +844,11 @@ def plot_histogram_pages(
844
844
  if adata is not None and sample_key in adata.obs.columns and ref_key in adata.obs.columns:
845
845
  obs = adata.obs
846
846
  sseries = obs[sample_key]
847
- if not pd.api.types.is_categorical_dtype(sseries):
847
+ if not isinstance(sseries.dtype, pd.CategoricalDtype):
848
848
  sseries = sseries.astype("category")
849
849
  samples = list(sseries.cat.categories)
850
850
  rseries = obs[ref_key]
851
- if not pd.api.types.is_categorical_dtype(rseries):
851
+ if not isinstance(rseries.dtype, pd.CategoricalDtype):
852
852
  rseries = rseries.astype("category")
853
853
  references = list(rseries.cat.categories)
854
854
  use_adata = True
@@ -1189,7 +1189,7 @@ def plot_hamming_vs_metric_pages(
1189
1189
  # canonicalize samples and refs
1190
1190
  if samples is None:
1191
1191
  sseries = obs[sample_col]
1192
- if not pd.api.types.is_categorical_dtype(sseries):
1192
+ if not isinstance(sseries.dtype, pd.CategoricalDtype):
1193
1193
  sseries = sseries.astype("category")
1194
1194
  samples_all = list(sseries.cat.categories)
1195
1195
  else:
@@ -1197,7 +1197,7 @@ def plot_hamming_vs_metric_pages(
1197
1197
 
1198
1198
  if references is None:
1199
1199
  rseries = obs[ref_col]
1200
- if not pd.api.types.is_categorical_dtype(rseries):
1200
+ if not isinstance(rseries.dtype, pd.CategoricalDtype):
1201
1201
  rseries = rseries.astype("category")
1202
1202
  refs_all = list(rseries.cat.categories)
1203
1203
  else:
@@ -32,6 +32,7 @@ def invert_adata(
32
32
  already = bool(adata.uns.get(uns_flag, False))
33
33
  if already and not force_redo:
34
34
  # QC already performed; nothing to do
35
+ logger.info("Inversion already performed")
35
36
  return adata
36
37
 
37
38
  logger.info("Inverting AnnData along the column axis...")