smftools 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. smftools/_version.py +1 -1
  2. smftools/cli/helpers.py +32 -6
  3. smftools/cli/hmm_adata.py +232 -31
  4. smftools/cli/latent_adata.py +318 -0
  5. smftools/cli/load_adata.py +77 -73
  6. smftools/cli/preprocess_adata.py +178 -53
  7. smftools/cli/spatial_adata.py +149 -101
  8. smftools/cli_entry.py +12 -0
  9. smftools/config/conversion.yaml +11 -1
  10. smftools/config/default.yaml +38 -1
  11. smftools/config/experiment_config.py +53 -1
  12. smftools/constants.py +65 -0
  13. smftools/hmm/HMM.py +88 -0
  14. smftools/informatics/__init__.py +6 -0
  15. smftools/informatics/bam_functions.py +358 -8
  16. smftools/informatics/converted_BAM_to_adata.py +584 -163
  17. smftools/informatics/h5ad_functions.py +115 -2
  18. smftools/informatics/modkit_extract_to_adata.py +1003 -425
  19. smftools/informatics/sequence_encoding.py +72 -0
  20. smftools/logging_utils.py +21 -2
  21. smftools/metadata.py +1 -1
  22. smftools/plotting/__init__.py +9 -0
  23. smftools/plotting/general_plotting.py +2411 -628
  24. smftools/plotting/hmm_plotting.py +85 -7
  25. smftools/preprocessing/__init__.py +1 -0
  26. smftools/preprocessing/append_base_context.py +17 -17
  27. smftools/preprocessing/append_mismatch_frequency_sites.py +158 -0
  28. smftools/preprocessing/calculate_consensus.py +1 -1
  29. smftools/preprocessing/calculate_read_modification_stats.py +6 -1
  30. smftools/readwrite.py +53 -17
  31. smftools/schema/anndata_schema_v1.yaml +15 -1
  32. smftools/tools/__init__.py +4 -0
  33. smftools/tools/calculate_leiden.py +57 -0
  34. smftools/tools/calculate_nmf.py +119 -0
  35. smftools/tools/calculate_umap.py +91 -8
  36. smftools/tools/rolling_nn_distance.py +235 -0
  37. smftools/tools/tensor_factorization.py +169 -0
  38. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/METADATA +8 -6
  39. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/RECORD +42 -35
  40. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/WHEEL +0 -0
  41. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/entry_points.txt +0 -0
  42. {smftools-0.3.0.dist-info → smftools-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,72 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Iterable, Mapping
4
+
5
+ import numpy as np
6
+
7
+ from smftools.constants import (
8
+ MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT,
9
+ MODKIT_EXTRACT_SEQUENCE_INT_TO_BASE,
10
+ )
11
+
12
+
13
def encode_sequence_to_int(
    sequence: str | Iterable[str],
    *,
    base_to_int: Mapping[str, int] = MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT,
    unknown_base: str = "N",
) -> np.ndarray:
    """Translate a base sequence into an array of integer codes.

    Every element of ``sequence`` is looked up in ``base_to_int``; elements
    missing from the mapping are encoded with the code of ``unknown_base``.

    Args:
        sequence: A string of bases, or any iterable yielding one base per item.
        base_to_int: Lookup table from base character to integer code.
        unknown_base: Base whose code stands in for characters absent from
            ``base_to_int``.

    Returns:
        np.ndarray: One ``np.int16`` code per input base, in input order.

    Raises:
        ValueError: If ``unknown_base`` is not a key of ``base_to_int``. The
            check runs up front, whether or not the sequence actually
            contains an unmapped character.
    """
    if unknown_base not in base_to_int:
        raise ValueError(f"Unknown base '{unknown_base}' not present in encoding map.")

    # np.fromiter is given an explicit count, so a non-string iterable is
    # materialised first to learn its length; strings already support len().
    bases = sequence if isinstance(sequence, str) else list(sequence)
    default_code = base_to_int[unknown_base]

    return np.fromiter(
        (base_to_int.get(symbol, default_code) for symbol in bases),
        dtype=np.int16,
        count=len(bases),
    )
47
+
48
+
49
def decode_int_sequence(
    encoded_sequence: Iterable[int] | np.ndarray,
    *,
    int_to_base: Mapping[int, str] = MODKIT_EXTRACT_SEQUENCE_INT_TO_BASE,
    unknown_base: str = "N",
) -> list[str]:
    """Translate integer codes back into base characters.

    Each value of ``encoded_sequence`` is converted with ``int()`` and looked
    up in ``int_to_base``; codes missing from the mapping decode to
    ``unknown_base``.

    Args:
        encoded_sequence: Iterable (or array) of integer-encoded bases.
        int_to_base: Lookup table from integer code to base character.
        unknown_base: Character emitted for codes absent from ``int_to_base``.

    Returns:
        list[str]: One decoded base per input code, in input order.

    Raises:
        ValueError: If ``unknown_base`` is not among the values of
            ``int_to_base``.
    """
    if unknown_base not in int_to_base.values():
        raise ValueError(f"Unknown base '{unknown_base}' not present in decoding map.")

    decoded: list[str] = []
    for code in encoded_sequence:
        # int() normalises NumPy scalar types to plain ints before the lookup.
        decoded.append(int_to_base.get(int(code), unknown_base))
    return decoded
smftools/logging_utils.py CHANGED
@@ -15,18 +15,37 @@ def setup_logging(
15
15
  fmt: str = DEFAULT_LOG_FORMAT,
16
16
  datefmt: str = DEFAULT_DATE_FORMAT,
17
17
  log_file: Optional[Union[str, Path]] = None,
18
+ reconfigure: bool = False,
18
19
  ) -> None:
19
20
  """
20
21
  Configure logging for smftools.
21
22
 
22
23
  Should be called once by the CLI entrypoint.
23
- Safe to call multiple times.
24
+ Safe to call multiple times, with optional reconfiguration.
24
25
  """
25
26
  logger = logging.getLogger("smftools")
26
27
 
27
- if logger.handlers:
28
+ if logger.handlers and not reconfigure:
29
+ if log_file is not None:
30
+ log_path = Path(log_file)
31
+ has_file_handler = any(
32
+ isinstance(handler, logging.FileHandler)
33
+ and Path(getattr(handler, "baseFilename", "")) == log_path
34
+ for handler in logger.handlers
35
+ )
36
+ if not has_file_handler:
37
+ log_path.parent.mkdir(parents=True, exist_ok=True)
38
+ file_handler = logging.FileHandler(log_path)
39
+ file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt=datefmt))
40
+ logger.addHandler(file_handler)
41
+ logger.setLevel(level)
28
42
  return
29
43
 
44
+ if logger.handlers and reconfigure:
45
+ for handler in list(logger.handlers):
46
+ logger.removeHandler(handler)
47
+ handler.close()
48
+
30
49
  formatter = logging.Formatter(fmt=fmt, datefmt=datefmt)
31
50
 
32
51
  # Console handler (stderr)
smftools/metadata.py CHANGED
@@ -12,7 +12,7 @@ from typing import Any, Iterable, Optional
12
12
  from ._version import __version__
13
13
  from .schema import SCHEMA_REGISTRY_RESOURCE, SCHEMA_REGISTRY_VERSION
14
14
 
15
- _DEPENDENCIES = ("anndata", "numpy", "pandas", "scanpy", "torch")
15
+ _DEPENDENCIES = ("anndata", "numpy", "pandas", "umap-learn", "pynndescent", "torch")
16
16
 
17
17
 
18
18
  def _iso_timestamp() -> str:
@@ -3,9 +3,18 @@ from __future__ import annotations
3
3
  from importlib import import_module
4
4
 
5
5
  _LAZY_ATTRS = {
6
+ "combined_hmm_length_clustermap": "smftools.plotting.general_plotting",
6
7
  "combined_hmm_raw_clustermap": "smftools.plotting.general_plotting",
7
8
  "combined_raw_clustermap": "smftools.plotting.general_plotting",
9
+ "plot_rolling_nn_and_layer": "smftools.plotting.general_plotting",
8
10
  "plot_hmm_layers_rolling_by_sample_ref": "smftools.plotting.general_plotting",
11
+ "plot_nmf_components": "smftools.plotting.general_plotting",
12
+ "plot_cp_sequence_components": "smftools.plotting.general_plotting",
13
+ "plot_embedding": "smftools.plotting.general_plotting",
14
+ "plot_read_span_quality_clustermaps": "smftools.plotting.general_plotting",
15
+ "plot_pca": "smftools.plotting.general_plotting",
16
+ "plot_sequence_integer_encoding_clustermaps": "smftools.plotting.general_plotting",
17
+ "plot_umap": "smftools.plotting.general_plotting",
9
18
  "plot_bar_relative_risk": "smftools.plotting.position_stats",
10
19
  "plot_positionwise_matrix": "smftools.plotting.position_stats",
11
20
  "plot_positionwise_matrix_grid": "smftools.plotting.position_stats",