smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. smftools/_version.py +1 -1
  2. smftools/cli/chimeric_adata.py +1563 -0
  3. smftools/cli/helpers.py +49 -7
  4. smftools/cli/hmm_adata.py +250 -32
  5. smftools/cli/latent_adata.py +773 -0
  6. smftools/cli/load_adata.py +78 -74
  7. smftools/cli/preprocess_adata.py +122 -58
  8. smftools/cli/recipes.py +26 -0
  9. smftools/cli/spatial_adata.py +74 -112
  10. smftools/cli/variant_adata.py +423 -0
  11. smftools/cli_entry.py +52 -4
  12. smftools/config/conversion.yaml +1 -1
  13. smftools/config/deaminase.yaml +3 -0
  14. smftools/config/default.yaml +85 -12
  15. smftools/config/experiment_config.py +146 -1
  16. smftools/constants.py +69 -0
  17. smftools/hmm/HMM.py +88 -0
  18. smftools/hmm/call_hmm_peaks.py +1 -1
  19. smftools/informatics/__init__.py +6 -0
  20. smftools/informatics/bam_functions.py +358 -8
  21. smftools/informatics/binarize_converted_base_identities.py +2 -89
  22. smftools/informatics/converted_BAM_to_adata.py +636 -175
  23. smftools/informatics/h5ad_functions.py +198 -2
  24. smftools/informatics/modkit_extract_to_adata.py +1007 -425
  25. smftools/informatics/sequence_encoding.py +72 -0
  26. smftools/logging_utils.py +21 -2
  27. smftools/metadata.py +1 -1
  28. smftools/plotting/__init__.py +26 -3
  29. smftools/plotting/autocorrelation_plotting.py +22 -4
  30. smftools/plotting/chimeric_plotting.py +1893 -0
  31. smftools/plotting/classifiers.py +28 -14
  32. smftools/plotting/general_plotting.py +62 -1583
  33. smftools/plotting/hmm_plotting.py +1670 -8
  34. smftools/plotting/latent_plotting.py +804 -0
  35. smftools/plotting/plotting_utils.py +243 -0
  36. smftools/plotting/position_stats.py +16 -8
  37. smftools/plotting/preprocess_plotting.py +281 -0
  38. smftools/plotting/qc_plotting.py +8 -3
  39. smftools/plotting/spatial_plotting.py +1134 -0
  40. smftools/plotting/variant_plotting.py +1231 -0
  41. smftools/preprocessing/__init__.py +4 -0
  42. smftools/preprocessing/append_base_context.py +18 -18
  43. smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
  44. smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
  45. smftools/preprocessing/append_variant_call_layer.py +480 -0
  46. smftools/preprocessing/calculate_consensus.py +1 -1
  47. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  48. smftools/preprocessing/flag_duplicate_reads.py +4 -4
  49. smftools/preprocessing/invert_adata.py +1 -0
  50. smftools/readwrite.py +159 -99
  51. smftools/schema/anndata_schema_v1.yaml +15 -1
  52. smftools/tools/__init__.py +10 -0
  53. smftools/tools/calculate_knn.py +121 -0
  54. smftools/tools/calculate_leiden.py +57 -0
  55. smftools/tools/calculate_nmf.py +130 -0
  56. smftools/tools/calculate_pca.py +180 -0
  57. smftools/tools/calculate_umap.py +79 -80
  58. smftools/tools/position_stats.py +4 -4
  59. smftools/tools/rolling_nn_distance.py +872 -0
  60. smftools/tools/sequence_alignment.py +140 -0
  61. smftools/tools/tensor_factorization.py +217 -0
  62. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
  63. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
  64. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
  65. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
  66. {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,72 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable, Mapping
4
+
5
+ import numpy as np
6
+
7
+ from smftools.constants import (
8
+ MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT,
9
+ MODKIT_EXTRACT_SEQUENCE_INT_TO_BASE,
10
+ )
11
+
12
+
13
+ def encode_sequence_to_int(
14
+ sequence: str | Iterable[str],
15
+ *,
16
+ base_to_int: Mapping[str, int] = MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT,
17
+ unknown_base: str = "N",
18
+ ) -> np.ndarray:
19
+ """Encode a base sequence into integer values using constant mappings.
20
+
21
+ Args:
22
+ sequence: Sequence string or iterable of base characters.
23
+ base_to_int: Mapping of base characters to integer encodings.
24
+ unknown_base: Base to use when a character is not in the encoding map.
25
+
26
+ Returns:
27
+ np.ndarray: Integer-encoded sequence array.
28
+
29
+ Raises:
30
+ ValueError: If an unknown base is encountered and ``unknown_base`` is not mapped.
31
+ """
32
+ if unknown_base not in base_to_int:
33
+ raise ValueError(f"Unknown base '{unknown_base}' not present in encoding map.")
34
+
35
+ if isinstance(sequence, str):
36
+ sequence_iter = sequence
37
+ else:
38
+ sequence_iter = list(sequence)
39
+
40
+ fallback = base_to_int[unknown_base]
41
+ encoded = np.fromiter(
42
+ (base_to_int.get(base, fallback) for base in sequence_iter),
43
+ dtype=np.int16,
44
+ count=len(sequence_iter),
45
+ )
46
+ return encoded
47
+
48
+
49
+ def decode_int_sequence(
50
+ encoded_sequence: Iterable[int] | np.ndarray,
51
+ *,
52
+ int_to_base: Mapping[int, str] = MODKIT_EXTRACT_SEQUENCE_INT_TO_BASE,
53
+ unknown_base: str = "N",
54
+ ) -> list[str]:
55
+ """Decode integer-encoded bases into characters using constant mappings.
56
+
57
+ Args:
58
+ encoded_sequence: Iterable of integer-encoded bases.
59
+ int_to_base: Mapping of integer encodings to base characters.
60
+ unknown_base: Base to use when an integer is not in the decoding map.
61
+
62
+ Returns:
63
+ list[str]: Decoded base characters.
64
+
65
+ Raises:
66
+ ValueError: If ``unknown_base`` is not available for fallback.
67
+ """
68
+ if unknown_base not in int_to_base.values():
69
+ raise ValueError(f"Unknown base '{unknown_base}' not present in decoding map.")
70
+
71
+ fallback = unknown_base
72
+ return [int_to_base.get(int(value), fallback) for value in encoded_sequence]
smftools/logging_utils.py CHANGED
@@ -15,18 +15,37 @@ def setup_logging(
15
15
  fmt: str = DEFAULT_LOG_FORMAT,
16
16
  datefmt: str = DEFAULT_DATE_FORMAT,
17
17
  log_file: Optional[Union[str, Path]] = None,
18
+ reconfigure: bool = False,
18
19
  ) -> None:
19
20
  """
20
21
  Configure logging for smftools.
21
22
 
22
23
  Should be called once by the CLI entrypoint.
23
- Safe to call multiple times.
24
+ Safe to call multiple times, with optional reconfiguration.
24
25
  """
25
26
  logger = logging.getLogger("smftools")
26
27
 
27
- if logger.handlers:
28
+ if logger.handlers and not reconfigure:
29
+ if log_file is not None:
30
+ log_path = Path(log_file)
31
+ has_file_handler = any(
32
+ isinstance(handler, logging.FileHandler)
33
+ and Path(getattr(handler, "baseFilename", "")) == log_path
34
+ for handler in logger.handlers
35
+ )
36
+ if not has_file_handler:
37
+ log_path.parent.mkdir(parents=True, exist_ok=True)
38
+ file_handler = logging.FileHandler(log_path)
39
+ file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt=datefmt))
40
+ logger.addHandler(file_handler)
41
+ logger.setLevel(level)
28
42
  return
29
43
 
44
+ if logger.handlers and reconfigure:
45
+ for handler in list(logger.handlers):
46
+ logger.removeHandler(handler)
47
+ handler.close()
48
+
30
49
  formatter = logging.Formatter(fmt=fmt, datefmt=datefmt)
31
50
 
32
51
  # Console handler (stderr)
smftools/metadata.py CHANGED
@@ -12,7 +12,7 @@ from typing import Any, Iterable, Optional
12
12
  from ._version import __version__
13
13
  from .schema import SCHEMA_REGISTRY_RESOURCE, SCHEMA_REGISTRY_VERSION
14
14
 
15
- _DEPENDENCIES = ("anndata", "numpy", "pandas", "scanpy", "torch")
15
+ _DEPENDENCIES = ("anndata", "numpy", "pandas", "umap-learn", "pynndescent", "torch")
16
16
 
17
17
 
18
18
  def _iso_timestamp() -> str:
smftools/plotting/__init__.py CHANGED
@@ -3,9 +3,32 @@ from __future__ import annotations
3
3
  from importlib import import_module
4
4
 
5
5
  _LAZY_ATTRS = {
6
- "combined_hmm_raw_clustermap": "smftools.plotting.general_plotting",
7
- "combined_raw_clustermap": "smftools.plotting.general_plotting",
8
- "plot_hmm_layers_rolling_by_sample_ref": "smftools.plotting.general_plotting",
6
+ "combined_hmm_length_clustermap": "smftools.plotting.hmm_plotting",
7
+ "combined_hmm_raw_clustermap": "smftools.plotting.hmm_plotting",
8
+ "combined_raw_clustermap": "smftools.plotting.spatial_plotting",
9
+ "plot_delta_hamming_summary": "smftools.plotting.chimeric_plotting",
10
+ "plot_hamming_span_trio": "smftools.plotting.chimeric_plotting",
11
+ "plot_rolling_nn_and_layer": "smftools.plotting.chimeric_plotting",
12
+ "plot_rolling_nn_and_two_layers": "smftools.plotting.chimeric_plotting",
13
+ "plot_segment_length_histogram": "smftools.plotting.chimeric_plotting",
14
+ "plot_span_length_distributions": "smftools.plotting.chimeric_plotting",
15
+ "plot_zero_hamming_pair_counts": "smftools.plotting.chimeric_plotting",
16
+ "plot_zero_hamming_span_and_layer": "smftools.plotting.chimeric_plotting",
17
+ "plot_hmm_layers_rolling_by_sample_ref": "smftools.plotting.hmm_plotting",
18
+ "plot_nmf_components": "smftools.plotting.latent_plotting",
19
+ "plot_pca_components": "smftools.plotting.latent_plotting",
20
+ "plot_cp_sequence_components": "smftools.plotting.latent_plotting",
21
+ "plot_embedding": "smftools.plotting.latent_plotting",
22
+ "plot_embedding_grid": "smftools.plotting.latent_plotting",
23
+ "plot_read_span_quality_clustermaps": "smftools.plotting.preprocess_plotting",
24
+ "plot_mismatch_base_frequency_by_position": "smftools.plotting.variant_plotting",
25
+ "plot_pca": "smftools.plotting.latent_plotting",
26
+ "plot_pca_grid": "smftools.plotting.latent_plotting",
27
+ "plot_pca_explained_variance": "smftools.plotting.latent_plotting",
28
+ "plot_sequence_integer_encoding_clustermaps": "smftools.plotting.variant_plotting",
29
+ "plot_variant_segment_clustermaps": "smftools.plotting.variant_plotting",
30
+ "plot_umap": "smftools.plotting.latent_plotting",
31
+ "plot_umap_grid": "smftools.plotting.latent_plotting",
9
32
  "plot_bar_relative_risk": "smftools.plotting.position_stats",
10
33
  "plot_positionwise_matrix": "smftools.plotting.position_stats",
11
34
  "plot_positionwise_matrix_grid": "smftools.plotting.position_stats",
smftools/plotting/autocorrelation_plotting.py CHANGED
@@ -5,8 +5,11 @@ from typing import Optional
5
5
  import numpy as np
6
6
  import pandas as pd
7
7
 
8
+ from smftools.logging_utils import get_logger
8
9
  from smftools.optional_imports import require
9
10
 
11
+ logger = get_logger(__name__)
12
+
10
13
 
11
14
  def plot_spatial_autocorr_grid(
12
15
  adata,
@@ -39,6 +42,12 @@ def plot_spatial_autocorr_grid(
39
42
  import os
40
43
  import warnings
41
44
 
45
+ logger.info(
46
+ "Plotting spatial autocorrelation grid to %s for site_types=%s.",
47
+ out_dir,
48
+ site_types,
49
+ )
50
+
42
51
  plt = require("matplotlib.pyplot", extra="plotting", purpose="autocorrelation plots")
43
52
 
44
53
  # Try importing analyzer (used only as fallback)
@@ -98,7 +107,7 @@ def plot_spatial_autocorr_grid(
98
107
  if sample_col not in adata.obs:
99
108
  raise KeyError(f"sample_col '{sample_col}' not present in adata.obs")
100
109
  samples = adata.obs[sample_col]
101
- if not pd.api.types.is_categorical_dtype(samples):
110
+ if not isinstance(samples.dtype, pd.CategoricalDtype):
102
111
  samples = samples.astype("category")
103
112
  sample_levels = list(samples.cat.categories)
104
113
 
@@ -107,7 +116,7 @@ def plot_spatial_autocorr_grid(
107
116
  raise KeyError(f"reference_col '{reference_col}' not present in adata.obs")
108
117
  if references is None:
109
118
  refs_series = adata.obs[reference_col]
110
- if not pd.api.types.is_categorical_dtype(refs_series):
119
+ if not isinstance(refs_series.dtype, pd.CategoricalDtype):
111
120
  refs_series = refs_series.astype("category")
112
121
  references = list(refs_series.cat.categories)
113
122
  references = list(references)
@@ -510,10 +519,10 @@ def plot_spatial_autocorr_grid(
510
519
  try:
511
520
  combined_df.to_csv(combined_out, index=False)
512
521
  except Exception as e:
513
- import warnings
514
-
515
522
  warnings.warn(f"Failed to write combined CSV {combined_out}: {e}")
523
+ logger.warning("Failed to write combined CSV %s: %s", combined_out, e)
516
524
 
525
+ logger.info("Saved %s autocorrelation grid pages to %s.", len(saved_pages), out_dir)
517
526
  return saved_pages
518
527
 
519
528
 
@@ -522,6 +531,7 @@ def plot_rolling_metrics(df, out_png=None, title=None, figsize=(10, 3.5), dpi=16
522
531
  Plot NRL and SNR vs window center from the dataframe returned by rolling_autocorr_metrics.
523
532
  If out_png is None, returns the matplotlib Figure object; otherwise saves PNG and returns path.
524
533
  """
534
+ logger.info("Plotting rolling metrics%s.", f" -> {out_png}" if out_png else "")
525
535
  plt = require("matplotlib.pyplot", extra="plotting", purpose="autocorrelation plots")
526
536
 
527
537
  # sort by center
@@ -546,6 +556,7 @@ def plot_rolling_metrics(df, out_png=None, title=None, figsize=(10, 3.5), dpi=16
546
556
 
547
557
  if out_png:
548
558
  fig.savefig(out_png, bbox_inches="tight")
559
+ logger.info("Saved rolling metrics plot to %s.", out_png)
549
560
  if not show:
550
561
  matplotlib = require("matplotlib", extra="plotting", purpose="autocorrelation plots")
551
562
 
@@ -604,6 +615,12 @@ def plot_rolling_grid(
604
615
  """
605
616
  import os
606
617
 
618
+ logger.info(
619
+ "Plotting rolling metric grids for site=%s to %s (metrics=%s).",
620
+ site,
621
+ out_dir,
622
+ metrics,
623
+ )
607
624
  plt = require("matplotlib.pyplot", extra="plotting", purpose="autocorrelation plots")
608
625
 
609
626
  if per_metric_ylim is None:
@@ -708,6 +725,7 @@ def plot_rolling_grid(
708
725
  fig.savefig(out_png, bbox_inches="tight")
709
726
  plt.close(fig)
710
727
  saved_pages.append(out_png)
728
+ logger.info("Saved rolling grid page to %s.", out_png)
711
729
 
712
730
  pages_by_metric[metric] = saved_pages
713
731