smftools 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/_version.py +1 -1
- smftools/cli/chimeric_adata.py +1563 -0
- smftools/cli/helpers.py +49 -7
- smftools/cli/hmm_adata.py +250 -32
- smftools/cli/latent_adata.py +773 -0
- smftools/cli/load_adata.py +78 -74
- smftools/cli/preprocess_adata.py +122 -58
- smftools/cli/recipes.py +26 -0
- smftools/cli/spatial_adata.py +74 -112
- smftools/cli/variant_adata.py +423 -0
- smftools/cli_entry.py +52 -4
- smftools/config/conversion.yaml +1 -1
- smftools/config/deaminase.yaml +3 -0
- smftools/config/default.yaml +85 -12
- smftools/config/experiment_config.py +146 -1
- smftools/constants.py +69 -0
- smftools/hmm/HMM.py +88 -0
- smftools/hmm/call_hmm_peaks.py +1 -1
- smftools/informatics/__init__.py +6 -0
- smftools/informatics/bam_functions.py +358 -8
- smftools/informatics/binarize_converted_base_identities.py +2 -89
- smftools/informatics/converted_BAM_to_adata.py +636 -175
- smftools/informatics/h5ad_functions.py +198 -2
- smftools/informatics/modkit_extract_to_adata.py +1007 -425
- smftools/informatics/sequence_encoding.py +72 -0
- smftools/logging_utils.py +21 -2
- smftools/metadata.py +1 -1
- smftools/plotting/__init__.py +26 -3
- smftools/plotting/autocorrelation_plotting.py +22 -4
- smftools/plotting/chimeric_plotting.py +1893 -0
- smftools/plotting/classifiers.py +28 -14
- smftools/plotting/general_plotting.py +62 -1583
- smftools/plotting/hmm_plotting.py +1670 -8
- smftools/plotting/latent_plotting.py +804 -0
- smftools/plotting/plotting_utils.py +243 -0
- smftools/plotting/position_stats.py +16 -8
- smftools/plotting/preprocess_plotting.py +281 -0
- smftools/plotting/qc_plotting.py +8 -3
- smftools/plotting/spatial_plotting.py +1134 -0
- smftools/plotting/variant_plotting.py +1231 -0
- smftools/preprocessing/__init__.py +4 -0
- smftools/preprocessing/append_base_context.py +18 -18
- smftools/preprocessing/append_mismatch_frequency_sites.py +187 -0
- smftools/preprocessing/append_sequence_mismatch_annotations.py +171 -0
- smftools/preprocessing/append_variant_call_layer.py +480 -0
- smftools/preprocessing/calculate_consensus.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +6 -1
- smftools/preprocessing/flag_duplicate_reads.py +4 -4
- smftools/preprocessing/invert_adata.py +1 -0
- smftools/readwrite.py +159 -99
- smftools/schema/anndata_schema_v1.yaml +15 -1
- smftools/tools/__init__.py +10 -0
- smftools/tools/calculate_knn.py +121 -0
- smftools/tools/calculate_leiden.py +57 -0
- smftools/tools/calculate_nmf.py +130 -0
- smftools/tools/calculate_pca.py +180 -0
- smftools/tools/calculate_umap.py +79 -80
- smftools/tools/position_stats.py +4 -4
- smftools/tools/rolling_nn_distance.py +872 -0
- smftools/tools/sequence_alignment.py +140 -0
- smftools/tools/tensor_factorization.py +217 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/METADATA +9 -5
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/RECORD +66 -45
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/WHEEL +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/entry_points.txt +0 -0
- {smftools-0.3.0.dist-info → smftools-0.3.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Iterable, Mapping
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
|
|
7
|
+
from smftools.constants import (
|
|
8
|
+
MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT,
|
|
9
|
+
MODKIT_EXTRACT_SEQUENCE_INT_TO_BASE,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def encode_sequence_to_int(
    sequence: str | Iterable[str],
    *,
    base_to_int: Mapping[str, int] = MODKIT_EXTRACT_SEQUENCE_BASE_TO_INT,
    unknown_base: str = "N",
) -> np.ndarray:
    """Translate base characters into their integer codes.

    Every character of ``sequence`` is looked up in ``base_to_int``; a
    character that has no entry receives the code assigned to
    ``unknown_base`` instead.

    Args:
        sequence: A string of bases, or any iterable yielding base characters.
        base_to_int: Lookup table from base character to integer code.
        unknown_base: Base whose code stands in for unmapped characters.

    Returns:
        np.ndarray: ``np.int16`` array holding one code per input base.

    Raises:
        ValueError: If ``unknown_base`` is not a key of ``base_to_int``. The
            check runs up front, whether or not the sequence contains any
            unmapped character.
    """
    if unknown_base not in base_to_int:
        raise ValueError(f"Unknown base '{unknown_base}' not present in encoding map.")

    # ``np.fromiter`` is called with an explicit ``count``, so a non-string
    # iterable is materialised into a list to make its length available.
    bases = sequence if isinstance(sequence, str) else list(sequence)

    default_code = base_to_int[unknown_base]
    codes = (base_to_int.get(symbol, default_code) for symbol in bases)
    return np.fromiter(codes, dtype=np.int16, count=len(bases))
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def decode_int_sequence(
    encoded_sequence: Iterable[int] | np.ndarray,
    *,
    int_to_base: Mapping[int, str] = MODKIT_EXTRACT_SEQUENCE_INT_TO_BASE,
    unknown_base: str = "N",
) -> list[str]:
    """Translate integer codes back into base characters.

    Each value of ``encoded_sequence`` is converted with ``int()`` and looked
    up in ``int_to_base``; a value that has no entry is decoded as
    ``unknown_base``.

    Args:
        encoded_sequence: Iterable (or array) of integer-encoded bases.
        int_to_base: Lookup table from integer code to base character.
        unknown_base: Character emitted for codes missing from the table.

    Returns:
        list[str]: One decoded base character per input value, in order.

    Raises:
        ValueError: If ``unknown_base`` is not one of the values of
            ``int_to_base``.
    """
    if unknown_base not in int_to_base.values():
        raise ValueError(f"Unknown base '{unknown_base}' not present in decoding map.")

    decoded: list[str] = []
    for code in encoded_sequence:
        decoded.append(int_to_base.get(int(code), unknown_base))
    return decoded
|
smftools/logging_utils.py
CHANGED
|
@@ -15,18 +15,37 @@ def setup_logging(
|
|
|
15
15
|
fmt: str = DEFAULT_LOG_FORMAT,
|
|
16
16
|
datefmt: str = DEFAULT_DATE_FORMAT,
|
|
17
17
|
log_file: Optional[Union[str, Path]] = None,
|
|
18
|
+
reconfigure: bool = False,
|
|
18
19
|
) -> None:
|
|
19
20
|
"""
|
|
20
21
|
Configure logging for smftools.
|
|
21
22
|
|
|
22
23
|
Should be called once by the CLI entrypoint.
|
|
23
|
-
Safe to call multiple times.
|
|
24
|
+
Safe to call multiple times, with optional reconfiguration.
|
|
24
25
|
"""
|
|
25
26
|
logger = logging.getLogger("smftools")
|
|
26
27
|
|
|
27
|
-
if logger.handlers:
|
|
28
|
+
if logger.handlers and not reconfigure:
|
|
29
|
+
if log_file is not None:
|
|
30
|
+
log_path = Path(log_file)
|
|
31
|
+
has_file_handler = any(
|
|
32
|
+
isinstance(handler, logging.FileHandler)
|
|
33
|
+
and Path(getattr(handler, "baseFilename", "")) == log_path
|
|
34
|
+
for handler in logger.handlers
|
|
35
|
+
)
|
|
36
|
+
if not has_file_handler:
|
|
37
|
+
log_path.parent.mkdir(parents=True, exist_ok=True)
|
|
38
|
+
file_handler = logging.FileHandler(log_path)
|
|
39
|
+
file_handler.setFormatter(logging.Formatter(fmt=fmt, datefmt=datefmt))
|
|
40
|
+
logger.addHandler(file_handler)
|
|
41
|
+
logger.setLevel(level)
|
|
28
42
|
return
|
|
29
43
|
|
|
44
|
+
if logger.handlers and reconfigure:
|
|
45
|
+
for handler in list(logger.handlers):
|
|
46
|
+
logger.removeHandler(handler)
|
|
47
|
+
handler.close()
|
|
48
|
+
|
|
30
49
|
formatter = logging.Formatter(fmt=fmt, datefmt=datefmt)
|
|
31
50
|
|
|
32
51
|
# Console handler (stderr)
|
smftools/metadata.py
CHANGED
|
@@ -12,7 +12,7 @@ from typing import Any, Iterable, Optional
|
|
|
12
12
|
from ._version import __version__
|
|
13
13
|
from .schema import SCHEMA_REGISTRY_RESOURCE, SCHEMA_REGISTRY_VERSION
|
|
14
14
|
|
|
15
|
-
_DEPENDENCIES = ("anndata", "numpy", "pandas", "
|
|
15
|
+
_DEPENDENCIES = ("anndata", "numpy", "pandas", "umap-learn", "pynndescent", "torch")
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
def _iso_timestamp() -> str:
|
smftools/plotting/__init__.py
CHANGED
|
@@ -3,9 +3,32 @@ from __future__ import annotations
|
|
|
3
3
|
from importlib import import_module
|
|
4
4
|
|
|
5
5
|
_LAZY_ATTRS = {
|
|
6
|
-
"
|
|
7
|
-
"
|
|
8
|
-
"
|
|
6
|
+
"combined_hmm_length_clustermap": "smftools.plotting.hmm_plotting",
|
|
7
|
+
"combined_hmm_raw_clustermap": "smftools.plotting.hmm_plotting",
|
|
8
|
+
"combined_raw_clustermap": "smftools.plotting.spatial_plotting",
|
|
9
|
+
"plot_delta_hamming_summary": "smftools.plotting.chimeric_plotting",
|
|
10
|
+
"plot_hamming_span_trio": "smftools.plotting.chimeric_plotting",
|
|
11
|
+
"plot_rolling_nn_and_layer": "smftools.plotting.chimeric_plotting",
|
|
12
|
+
"plot_rolling_nn_and_two_layers": "smftools.plotting.chimeric_plotting",
|
|
13
|
+
"plot_segment_length_histogram": "smftools.plotting.chimeric_plotting",
|
|
14
|
+
"plot_span_length_distributions": "smftools.plotting.chimeric_plotting",
|
|
15
|
+
"plot_zero_hamming_pair_counts": "smftools.plotting.chimeric_plotting",
|
|
16
|
+
"plot_zero_hamming_span_and_layer": "smftools.plotting.chimeric_plotting",
|
|
17
|
+
"plot_hmm_layers_rolling_by_sample_ref": "smftools.plotting.hmm_plotting",
|
|
18
|
+
"plot_nmf_components": "smftools.plotting.latent_plotting",
|
|
19
|
+
"plot_pca_components": "smftools.plotting.latent_plotting",
|
|
20
|
+
"plot_cp_sequence_components": "smftools.plotting.latent_plotting",
|
|
21
|
+
"plot_embedding": "smftools.plotting.latent_plotting",
|
|
22
|
+
"plot_embedding_grid": "smftools.plotting.latent_plotting",
|
|
23
|
+
"plot_read_span_quality_clustermaps": "smftools.plotting.preprocess_plotting",
|
|
24
|
+
"plot_mismatch_base_frequency_by_position": "smftools.plotting.variant_plotting",
|
|
25
|
+
"plot_pca": "smftools.plotting.latent_plotting",
|
|
26
|
+
"plot_pca_grid": "smftools.plotting.latent_plotting",
|
|
27
|
+
"plot_pca_explained_variance": "smftools.plotting.latent_plotting",
|
|
28
|
+
"plot_sequence_integer_encoding_clustermaps": "smftools.plotting.variant_plotting",
|
|
29
|
+
"plot_variant_segment_clustermaps": "smftools.plotting.variant_plotting",
|
|
30
|
+
"plot_umap": "smftools.plotting.latent_plotting",
|
|
31
|
+
"plot_umap_grid": "smftools.plotting.latent_plotting",
|
|
9
32
|
"plot_bar_relative_risk": "smftools.plotting.position_stats",
|
|
10
33
|
"plot_positionwise_matrix": "smftools.plotting.position_stats",
|
|
11
34
|
"plot_positionwise_matrix_grid": "smftools.plotting.position_stats",
|
|
@@ -5,8 +5,11 @@ from typing import Optional
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
7
7
|
|
|
8
|
+
from smftools.logging_utils import get_logger
|
|
8
9
|
from smftools.optional_imports import require
|
|
9
10
|
|
|
11
|
+
logger = get_logger(__name__)
|
|
12
|
+
|
|
10
13
|
|
|
11
14
|
def plot_spatial_autocorr_grid(
|
|
12
15
|
adata,
|
|
@@ -39,6 +42,12 @@ def plot_spatial_autocorr_grid(
|
|
|
39
42
|
import os
|
|
40
43
|
import warnings
|
|
41
44
|
|
|
45
|
+
logger.info(
|
|
46
|
+
"Plotting spatial autocorrelation grid to %s for site_types=%s.",
|
|
47
|
+
out_dir,
|
|
48
|
+
site_types,
|
|
49
|
+
)
|
|
50
|
+
|
|
42
51
|
plt = require("matplotlib.pyplot", extra="plotting", purpose="autocorrelation plots")
|
|
43
52
|
|
|
44
53
|
# Try importing analyzer (used only as fallback)
|
|
@@ -98,7 +107,7 @@ def plot_spatial_autocorr_grid(
|
|
|
98
107
|
if sample_col not in adata.obs:
|
|
99
108
|
raise KeyError(f"sample_col '{sample_col}' not present in adata.obs")
|
|
100
109
|
samples = adata.obs[sample_col]
|
|
101
|
-
if not pd.
|
|
110
|
+
if not isinstance(samples.dtype, pd.CategoricalDtype):
|
|
102
111
|
samples = samples.astype("category")
|
|
103
112
|
sample_levels = list(samples.cat.categories)
|
|
104
113
|
|
|
@@ -107,7 +116,7 @@ def plot_spatial_autocorr_grid(
|
|
|
107
116
|
raise KeyError(f"reference_col '{reference_col}' not present in adata.obs")
|
|
108
117
|
if references is None:
|
|
109
118
|
refs_series = adata.obs[reference_col]
|
|
110
|
-
if not pd.
|
|
119
|
+
if not isinstance(refs_series.dtype, pd.CategoricalDtype):
|
|
111
120
|
refs_series = refs_series.astype("category")
|
|
112
121
|
references = list(refs_series.cat.categories)
|
|
113
122
|
references = list(references)
|
|
@@ -510,10 +519,10 @@ def plot_spatial_autocorr_grid(
|
|
|
510
519
|
try:
|
|
511
520
|
combined_df.to_csv(combined_out, index=False)
|
|
512
521
|
except Exception as e:
|
|
513
|
-
import warnings
|
|
514
|
-
|
|
515
522
|
warnings.warn(f"Failed to write combined CSV {combined_out}: {e}")
|
|
523
|
+
logger.warning("Failed to write combined CSV %s: %s", combined_out, e)
|
|
516
524
|
|
|
525
|
+
logger.info("Saved %s autocorrelation grid pages to %s.", len(saved_pages), out_dir)
|
|
517
526
|
return saved_pages
|
|
518
527
|
|
|
519
528
|
|
|
@@ -522,6 +531,7 @@ def plot_rolling_metrics(df, out_png=None, title=None, figsize=(10, 3.5), dpi=16
|
|
|
522
531
|
Plot NRL and SNR vs window center from the dataframe returned by rolling_autocorr_metrics.
|
|
523
532
|
If out_png is None, returns the matplotlib Figure object; otherwise saves PNG and returns path.
|
|
524
533
|
"""
|
|
534
|
+
logger.info("Plotting rolling metrics%s.", f" -> {out_png}" if out_png else "")
|
|
525
535
|
plt = require("matplotlib.pyplot", extra="plotting", purpose="autocorrelation plots")
|
|
526
536
|
|
|
527
537
|
# sort by center
|
|
@@ -546,6 +556,7 @@ def plot_rolling_metrics(df, out_png=None, title=None, figsize=(10, 3.5), dpi=16
|
|
|
546
556
|
|
|
547
557
|
if out_png:
|
|
548
558
|
fig.savefig(out_png, bbox_inches="tight")
|
|
559
|
+
logger.info("Saved rolling metrics plot to %s.", out_png)
|
|
549
560
|
if not show:
|
|
550
561
|
matplotlib = require("matplotlib", extra="plotting", purpose="autocorrelation plots")
|
|
551
562
|
|
|
@@ -604,6 +615,12 @@ def plot_rolling_grid(
|
|
|
604
615
|
"""
|
|
605
616
|
import os
|
|
606
617
|
|
|
618
|
+
logger.info(
|
|
619
|
+
"Plotting rolling metric grids for site=%s to %s (metrics=%s).",
|
|
620
|
+
site,
|
|
621
|
+
out_dir,
|
|
622
|
+
metrics,
|
|
623
|
+
)
|
|
607
624
|
plt = require("matplotlib.pyplot", extra="plotting", purpose="autocorrelation plots")
|
|
608
625
|
|
|
609
626
|
if per_metric_ylim is None:
|
|
@@ -708,6 +725,7 @@ def plot_rolling_grid(
|
|
|
708
725
|
fig.savefig(out_png, bbox_inches="tight")
|
|
709
726
|
plt.close(fig)
|
|
710
727
|
saved_pages.append(out_png)
|
|
728
|
+
logger.info("Saved rolling grid page to %s.", out_png)
|
|
711
729
|
|
|
712
730
|
pages_by_metric[metric] = saved_pages
|
|
713
731
|
|