smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +7 -1
- smftools/cli/hmm_adata.py +902 -244
- smftools/cli/load_adata.py +318 -198
- smftools/cli/preprocess_adata.py +285 -171
- smftools/cli/spatial_adata.py +137 -53
- smftools/cli_entry.py +94 -178
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +22 -17
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +505 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2125 -1426
- smftools/hmm/__init__.py +2 -3
- smftools/hmm/archived/call_hmm_peaks.py +16 -1
- smftools/hmm/call_hmm_peaks.py +173 -193
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +379 -156
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +195 -29
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +347 -168
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +145 -85
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +8 -8
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +103 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +688 -271
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.4.dist-info/RECORD +0 -176
- /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/binarize_on_Youden.py
@@ -1,47 +1,143 @@
-
-
-
-
-
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from smftools.logging_utils import get_logger
+
+if TYPE_CHECKING:
+    import anndata as ad
 
-
-        adata (AnnData): The anndata object to binarize. `calculate_position_Youden` must have been run first.
-        obs_column (str): The obs column to stratify on. Needs to match what was passed in `calculate_position_Youden`.
+logger = get_logger(__name__)
 
-
-
+
+def binarize_on_Youden(
+    adata: "ad.AnnData",
+    ref_column: str = "Reference_strand",
+    output_layer_name: str = "binarized_methylation",
+    mask_failed_positions: bool = True,
+) -> None:
+    """Binarize SMF values using thresholds from ``calculate_position_Youden``.
+
+    Args:
+        adata: AnnData object to binarize.
+        ref_column: Obs column denoting reference/strand categories.
+        output_layer_name: Layer in which to store the binarized matrix.
+        mask_failed_positions: If ``True``, positions that failed Youden QC are set to NaN;
+            otherwise all positions are binarized.
     """
+
     import numpy as np
-    import anndata as ad
 
-    #
-
+    # Extract dense X once
+    X = adata.X
+    if hasattr(X, "toarray"):  # sparse → dense
+        X = X.toarray()
+
+    n_obs, n_var = X.shape
+    binarized = np.full((n_obs, n_var), np.nan, dtype=float)
 
-    # Get unique categories
     references = adata.obs[ref_column].cat.categories
+    ref_labels = adata.obs[ref_column].to_numpy()
 
     for ref in references:
-
-
-
+        logger.info("Binarizing on Youden statistics for %s", ref)
+
+        ref_mask = ref_labels == ref
+        if not np.any(ref_mask):
+            continue
+
+        X_block = X[ref_mask, :].astype(float, copy=True)
+
+        # thresholds: list of (threshold, J)
+        youden_stats = adata.var[f"{ref}_position_methylation_thresholding_Youden_stats"].to_numpy()
+
+        thresholds = np.array(
+            [t[0] if isinstance(t, (tuple, list)) else np.nan for t in youden_stats],
+            dtype=float,
+        )
+
+        # QC mask
+        qc_mask = adata.var[f"{ref}_position_passed_Youden_thresholding_QC"].to_numpy().astype(bool)
+
+        if mask_failed_positions:
+            # Only binarize positions passing QC
+            cols_to_binarize = np.where(qc_mask)[0]
+        else:
+            # Binarize all positions
+            cols_to_binarize = np.arange(n_var)
+
+        # Prepare result block
+        block_out = np.full_like(X_block, np.nan, dtype=float)
+
+        if len(cols_to_binarize) > 0:
+            sub_X = X_block[:, cols_to_binarize]
+            sub_thresh = thresholds[cols_to_binarize]
+
+            nan_mask = np.isnan(sub_X)
+
+            bin_sub = (sub_X > sub_thresh[None, :]).astype(float)
+            bin_sub[nan_mask] = np.nan
+
+            block_out[:, cols_to_binarize] = bin_sub
+
+        # Write into full output matrix
+        binarized[ref_mask, :] = block_out
+
+    adata.layers[output_layer_name] = binarized
+    logger.info(
+        "Finished binarization → stored in adata.layers['%s'] (mask_failed_positions=%s)",
+        output_layer_name,
+        mask_failed_positions,
+    )
+
+
+# def binarize_on_Youden(adata,
+#                        ref_column='Reference_strand',
+#                        output_layer_name='binarized_methylation'):
+#     """
+#     Binarize SMF values based on position thresholds determined by calculate_position_Youden.
+
+#     Parameters:
+#         adata (AnnData): The anndata object to binarize. `calculate_position_Youden` must have been run first.
+#         obs_column (str): The obs column to stratify on. Needs to match what was passed in `calculate_position_Youden`.
+
+#     Modifies:
+#         Adds a new layer to `adata.layers['binarized_methylation']` containing the binarized methylation matrix.
+#     """
+#     import numpy as np
+#     import anndata as ad
+
+#     # Initialize an empty matrix to store the binarized methylation values
+#     binarized_methylation = np.full_like(adata.X, np.nan, dtype=float)  # Keeps same shape as adata.X
+
+#     # Get unique categories
+#     references = adata.obs[ref_column].cat.categories
+
+#     for ref in references:
+#         print(f"Binarizing adata on Youden statistics for {ref}")
+#         # Select subset for this category
+#         ref_mask = adata.obs[ref_column] == ref
+#         ref_subset = adata[ref_mask]
+
+#         # Extract the probability matrix
+#         original_matrix = ref_subset.X.copy()
 
-
-
+#         # Extract the thresholds for each position efficiently
+#         thresholds = np.array(ref_subset.var[f'{ref}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))
 
-
-
+#         # Identify NaN values
+#         nan_mask = np.isnan(original_matrix)
 
-
-
+#         # Binarize based on threshold
+#         binarized_matrix = (original_matrix > thresholds).astype(float)
 
-
-
+#         # Restore NaN values
+#         binarized_matrix[nan_mask] = np.nan
 
-
-
+#         # Assign the binarized values back into the preallocated storage
+#         binarized_methylation[ref_subset, :] = binarized_matrix
 
-
-
+#         # Store the binarized matrix in a new layer
+#         adata.layers[output_layer_name] = binarized_methylation
 
-
-    adata.layers[output_layer_name] = binarized_methylation
+#     print(f"Finished binarizing adata on Youden statistics")
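For orientation, here is a minimal usage sketch of the refactored `binarize_on_Youden` API above. The toy AnnData, the reference name `refA_top`, and the hand-built Youden columns are all illustrative; in a real run `calculate_position_Youden` populates the `*_Youden_stats` and `*_QC` var columns.

```python
import anndata as ad
import numpy as np
import pandas as pd

from smftools.preprocessing.binarize_on_Youden import binarize_on_Youden

# Toy object: 4 reads x 3 positions of methylation probabilities.
X = np.array(
    [[0.9, 0.2, np.nan],
     [0.1, 0.8, 0.7],
     [0.6, 0.4, 0.3],
     [0.2, 0.9, 0.5]]
)
adata = ad.AnnData(X=X)
adata.obs["Reference_strand"] = pd.Categorical(["refA_top"] * 4)

# Columns normally produced by calculate_position_Youden:
# per-position (threshold, J-statistic) tuples plus a QC pass/fail flag.
adata.var["refA_top_position_methylation_thresholding_Youden_stats"] = pd.Series(
    [(0.5, 0.8), (0.5, 0.7), (0.5, 0.9)], index=adata.var_names, dtype=object
)
adata.var["refA_top_position_passed_Youden_thresholding_QC"] = [True, True, False]

binarize_on_Youden(adata, ref_column="Reference_strand")

# 0/1 calls per read/position; the failed-QC column stays NaN by default.
print(adata.layers["binarized_methylation"])
```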
smftools/preprocessing/binary_layers_to_ohe.py
@@ -1,28 +1,34 @@
 ## binary_layers_to_ohe
 
-
-
+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
+
+## Conversion SMF Specific
+def binary_layers_to_ohe(adata, binary_layers, stack="hstack"):
     """
     Parameters:
         adata (AnnData): Anndata object.
-        binary_layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix.
+        binary_layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix.
         stack (str): Dimension to stack the one-hot-encoding. Options include 'hstack' and 'vstack'. Default is 'hstack', since this is more efficient.
-
+
     Returns:
         ohe_dict (dict): A dictionary keyed by obs_name that points to a stacked (hstack or vstack) one-hot encoding of the binary layers
     Input: An adata object and a list of layers containing a binary encoding.
     """
     import numpy as np
-    import anndata as ad
 
     # Ensure that the N layer is last!
     # Grab all binary layers that are not encoding N
-    ACGT_binary_layers = [
+    ACGT_binary_layers = [
+        layer for layer in binary_layers if "binary" in layer and layer != "N_binary_encoding"
+    ]
     # If there is a binary layer encoding N, hold it in N_binary_layer
-    N_binary_layer = [layer for layer in binary_layers if layer ==
+    N_binary_layer = [layer for layer in binary_layers if layer == "N_binary_encoding"]
     # Add the N_binary_encoding layer to the end of the list of binary layers
     all_binary_layers = ACGT_binary_layers + N_binary_layer
-
+    logger.info("Found %s layers in adata", all_binary_layers)
 
     # Extract the layers
     layers = [adata.layers[layer_name] for layer_name in all_binary_layers]
@@ -33,8 +39,8 @@ def binary_layers_to_ohe(adata, binary_layers, stack='hstack'):
         for layer in layers:
             read_ohe.append(layer[i])
         read_name = adata.obs_names[i]
-        if stack ==
+        if stack == "hstack":
            ohe_dict[read_name] = np.hstack(read_ohe)
-        elif stack ==
+        elif stack == "vstack":
            ohe_dict[read_name] = np.vstack(read_ohe)
-    return ohe_dict
+    return ohe_dict
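A sketch of how the cleaned-up `binary_layers_to_ohe` is typically called, assuming an AnnData whose per-base binary layers follow the `<base>_binary_encoding` naming the function filters on; the toy matrices and layer names are illustrative.

```python
import anndata as ad
import numpy as np

from smftools.preprocessing.binary_layers_to_ohe import binary_layers_to_ohe

adata = ad.AnnData(X=np.zeros((2, 3)))
# One binary layer per base identity; the function moves N_binary_encoding to the end.
adata.layers["A_binary_encoding"] = np.array([[1, 0, 0], [0, 1, 0]], dtype=float)
adata.layers["C_binary_encoding"] = np.array([[0, 1, 0], [1, 0, 0]], dtype=float)
adata.layers["N_binary_encoding"] = np.array([[0, 0, 1], [0, 0, 1]], dtype=float)

ohe = binary_layers_to_ohe(
    adata,
    ["N_binary_encoding", "A_binary_encoding", "C_binary_encoding"],
    stack="hstack",
)

# Each read name maps to a 1-D vector of length n_layers * n_positions.
for read_name, vec in ohe.items():
    print(read_name, vec)
```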
smftools/preprocessing/calculate_complexity_II.py
@@ -1,42 +1,59 @@
-from
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+
 def calculate_complexity_II(
-    adata,
-    output_directory=
-    sample_col=
-    ref_col: Optional[str] =
-    cluster_col=
-    plot=True,
-    save_plot=False,
-    n_boot=30,
-    n_depths=12,
-    random_state=0,
-    csv_summary=True,
-    uns_flag=
-    force_redo=False,
-    bypass=False
-):
-    """
-    Estimate and plot library complexity.
+    adata: "ad.AnnData",
+    output_directory: str | Path = "",
+    sample_col: str = "Sample_names",
+    ref_col: Optional[str] = "Reference_strand",
+    cluster_col: str = "sequence__merged_cluster_id",
+    plot: bool = True,
+    save_plot: bool = False,
+    n_boot: int = 30,
+    n_depths: int = 12,
+    random_state: int = 0,
+    csv_summary: bool = True,
+    uns_flag: str = "calculate_complexity_II_performed",
+    force_redo: bool = False,
+    bypass: bool = False,
+) -> None:
+    """Estimate and optionally plot library complexity.
 
-    If ref_col is None
-
+    If ``ref_col`` is ``None``, the calculation is performed per sample. If provided,
+    complexity is computed for each ``(sample, reference)`` pair.
 
-
-
-
-
-
+    Args:
+        adata: AnnData object containing read metadata.
+        output_directory: Directory for output plots/CSVs.
+        sample_col: Obs column containing sample names.
+        ref_col: Obs column with reference/strand categories, or ``None``.
+        cluster_col: Obs column with merged cluster IDs.
+        plot: Whether to generate plots.
+        save_plot: Whether to save plots to disk.
+        n_boot: Number of bootstrap iterations per depth.
+        n_depths: Number of subsampling depths to evaluate.
+        random_state: Random seed for bootstrapping.
+        csv_summary: Whether to write CSV summary files.
+        uns_flag: Flag in ``adata.uns`` indicating prior completion.
+        force_redo: Whether to rerun even if ``uns_flag`` is present.
+        bypass: Whether to skip processing.
     """
     import os
+
+    import matplotlib.pyplot as plt
     import numpy as np
     import pandas as pd
-    import matplotlib.pyplot as plt
     from scipy.optimize import curve_fit
-    from datetime import datetime
 
     # early exits
     already = bool(adata.uns.get(uns_flag, False))
-    if
+    if already and not force_redo:
         return None
     if bypass:
         return None
@@ -44,9 +61,11 @@ def calculate_complexity_II(
     rng = np.random.default_rng(random_state)
 
     def lw(x, C0):
+        """Lander-Waterman curve for complexity estimation."""
         return C0 * (1.0 - np.exp(-x / C0))
 
     def sanitize(name: str) -> str:
+        """Sanitize a string for safe filenames."""
         return "".join(c if c.isalnum() or c in "-._" else "_" for c in str(name))
 
     # checks
@@ -77,7 +96,7 @@ def calculate_complexity_II(
     group_keys = []
     # iterate only pairs that exist in data to avoid empty processing
     for s in samples:
-        mask_s =
+        mask_s = adata.obs[sample_col] == s
         # find references present for this sample
         ref_present = pd.Categorical(adata.obs.loc[mask_s, ref_col]).categories
         # Use intersection of known reference categories and those present for sample
@@ -109,7 +128,7 @@ def calculate_complexity_II(
                 "ci_high": np.array([], dtype=float),
             }
             # also store back-compat key
-            adata.uns[f
+            adata.uns[f"Library_complexity_{sanitize(group_label)}"] = results[g]
             continue
 
         # cluster ids array for this group
@@ -175,39 +194,45 @@ def calculate_complexity_II(
         }
 
         # save per-group in adata.uns for backward compatibility
-        adata.uns[f
+        adata.uns[f"Library_complexity_{sanitize(group_label)}"] = results[g]
 
         # prepare curve and fit records for CSV
-        fit_records.append(
-
-
-
-
-
+        fit_records.append(
+            {
+                "sample": sample,
+                "reference": ref if ref_col is not None else "",
+                "C0": float(C0),
+                "n_reads": int(n_reads),
+                "n_unique_observed": int(observed_unique),
+            }
+        )
 
         x_fit = np.linspace(0, max(n_reads, int(depths[-1]) if depths.size else n_reads), 200)
         y_fit = lw(x_fit, C0)
         for d, mu, lo, hi in zip(depths, mean_unique, lo_ci, hi_ci):
-            curve_records.append(
-
-
-
-
-
-
-
+            curve_records.append(
+                {
+                    "sample": sample,
+                    "reference": ref if ref_col is not None else "",
+                    "type": "bootstrap",
+                    "depth": int(d),
+                    "mean_unique": float(mu),
+                    "ci_low": float(lo),
+                    "ci_high": float(hi),
+                }
+            )
         for xf, yf in zip(x_fit, y_fit):
-            curve_records.append(
-
-
-
-
-
-
-
+            curve_records.append(
+                {
+                    "sample": sample,
+                    "reference": ref if ref_col is not None else "",
+                    "type": "fit",
+                    "depth": float(xf),
+                    "mean_unique": float(yf),
+                    "ci_low": np.nan,
+                    "ci_high": np.nan,
+                }
+            )
 
         # plotting for this group
         if plot:
@@ -226,7 +251,9 @@ def calculate_complexity_II(
 
             if save_plot:
                 fname = f"complexity_{sanitize(group_label)}.png"
-                plt.savefig(
+                plt.savefig(
+                    os.path.join(output_directory or ".", fname), dpi=160, bbox_inches="tight"
+                )
                 plt.close()
             else:
                 plt.show()
@@ -242,7 +269,7 @@ def calculate_complexity_II(
         fit_df = pd.DataFrame(fit_records)
         curve_df = pd.DataFrame(curve_records)
         base = output_directory or "."
-        fit_df.to_csv(os.path.join(base,
-        curve_df.to_csv(os.path.join(base,
+        fit_df.to_csv(os.path.join(base, "complexity_fit_summary.csv"), index=False)
+        curve_df.to_csv(os.path.join(base, "complexity_curves.csv"), index=False)
 
     return results
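The core of `calculate_complexity_II` is the Lander-Waterman saturation model visible in the hunk above: the number of unique molecules observed at read depth x follows C0 * (1 - exp(-x / C0)), and the fitted C0 estimates total library complexity. Below is a self-contained sketch of that fit on simulated reads; the simulated library size, depths, and bootstrap counts are illustrative, not the package's defaults.

```python
import numpy as np
from scipy.optimize import curve_fit

def lw(x, C0):
    # Lander-Waterman curve: expected unique molecules at sequencing depth x.
    return C0 * (1.0 - np.exp(-x / C0))

rng = np.random.default_rng(0)
# 100k reads drawn from a library of ~5k unique molecules.
library = rng.integers(0, 5_000, size=100_000)

# Bootstrap the mean number of unique molecules at each subsampling depth.
depths = np.linspace(1_000, 100_000, 12, dtype=int)
mean_unique = [
    np.mean([
        len(np.unique(rng.choice(library, size=d, replace=False)))
        for _ in range(10)
    ])
    for d in depths
]

# Fit C0 to the bootstrapped saturation curve.
(C0,), _ = curve_fit(lw, depths, mean_unique, p0=[max(mean_unique)])
print(f"Estimated library complexity C0 ≈ {C0:,.0f} unique molecules")
```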
smftools/preprocessing/calculate_consensus.py
@@ -1,19 +1,28 @@
 # calculate_consensus
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+
+def calculate_consensus(
+    adata: "ad.AnnData",
+    reference: str,
+    sample: str | bool = False,
+    reference_column: str = "Reference",
+    sample_column: str = "Sample",
+) -> None:
+    """Calculate a consensus sequence for a reference (and optional sample).
+
+    Args:
+        adata: AnnData object to append consensus metadata to.
+        reference: Reference name to subset on.
+        sample: If ``False``, uses all samples. If a string is passed, subsets to that sample.
+        reference_column: Obs column with reference names.
+        sample_column: Obs column with sample names.
     """
     import numpy as np
 
@@ -25,11 +34,11 @@ def calculate_consensus(adata, reference, sample=False, reference_column='Refere
         pass
 
     # Grab layer names from the adata object that correspond to the binary encodings of the read sequences.
-    layers = [layer for layer in record_subset.layers if
+    layers = [layer for layer in record_subset.layers if "_binary_" in layer]
     layer_map, layer_counts = {}, []
     for i, layer in enumerate(layers):
         # Gives an integer mapping to access which sequence base the binary layer is encoding
-        layer_map[i] = layer.split(
+        layer_map[i] = layer.split("_")[0]
         # Get the positional counts from all reads for the given base identity.
         layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
     # Combine the positional counts array derived from each binary base layer into an ndarray
@@ -40,8 +49,8 @@ def calculate_consensus(adata, reference, sample=False, reference_column='Refere
     consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
 
     if sample:
-        adata.var[f
+        adata.var[f"{reference}_consensus_from_{sample}"] = consensus_sequence_list
     else:
-        adata.var[f
+        adata.var[f"{reference}_consensus_across_samples"] = consensus_sequence_list
 
-    adata.uns[f
+    adata.uns[f"{reference}_consensus_sequence"] = consensus_sequence_list
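A hypothetical call pattern for the updated `calculate_consensus` signature, assuming `adata` was produced upstream by the load/preprocess pipeline and already carries the per-base `*_binary_*` layers the function scans; the reference and sample names are placeholders.

```python
from smftools.preprocessing.calculate_consensus import calculate_consensus

# Consensus across all samples for one reference; writes
# adata.var["refA_consensus_across_samples"] and adata.uns["refA_consensus_sequence"].
calculate_consensus(adata, reference="refA", reference_column="Reference")

# Consensus restricted to one sample; writes adata.var["refA_consensus_from_sample1"].
calculate_consensus(
    adata,
    reference="refA",
    sample="sample1",
    reference_column="Reference",
    sample_column="Sample",
)
```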
smftools/preprocessing/calculate_coverage.py
@@ -1,54 +1,76 @@
-
-
-
-
-
-
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from smftools.logging_utils import get_logger
+
+if TYPE_CHECKING:
+    import anndata as ad
 
-
-    adata (AnnData): An AnnData object
-    obs_column (str): Observation column value to subset on prior to calculating position statistics for that category.
-    position_nan_threshold (float): A minimal fractional threshold of coverage within the obs_column category to call the position as valid.
+logger = get_logger(__name__)
 
-
-
+
+def calculate_coverage(
+    adata: "ad.AnnData",
+    ref_column: str = "Reference_strand",
+    position_nan_threshold: float = 0.01,
+    smf_modality: str = "deaminase",
+    target_layer: str = "binarized_methylation",
+    uns_flag: str = "calculate_coverage_performed",
+    force_redo: bool = False,
+) -> None:
+    """Append position-level coverage metadata per reference category.
+
+    Args:
+        adata: AnnData object.
+        ref_column: Obs column used to define reference/strand categories.
+        position_nan_threshold: Minimum fraction of coverage to mark a position as valid.
+        smf_modality: SMF modality. Use ``adata.X`` for conversion/deaminase or ``target_layer`` for direct.
+        target_layer: Layer used for direct SMF coverage calculations.
+        uns_flag: Flag in ``adata.uns`` indicating prior completion.
+        force_redo: Whether to rerun even if ``uns_flag`` is set.
    """
    import numpy as np
    import pandas as pd
-    import anndata as ad
 
    # Only run if not already performed
    already = bool(adata.uns.get(uns_flag, False))
-    if already:
+    if already and not force_redo:
        # QC already performed; nothing to do
        return
-
+
    references = adata.obs[ref_column].cat.categories
    n_categories_with_position = np.zeros(adata.shape[1])
 
    # Loop over references
    for ref in references:
-
+        logger.info("Assessing positional coverage across samples for %s reference", ref)
 
        # Subset to current category
        ref_mask = adata.obs[ref_column] == ref
        temp_ref_adata = adata[ref_mask]
 
+        if smf_modality == "direct":
+            matrix = temp_ref_adata.layers[target_layer]
+        else:
+            matrix = temp_ref_adata.X
+
        # Compute fraction of valid coverage
-        ref_valid_coverage = np.sum(~np.isnan(
+        ref_valid_coverage = np.sum(~np.isnan(matrix), axis=0)
        ref_valid_fraction = ref_valid_coverage / temp_ref_adata.shape[0]  # Avoid extra computation
 
        # Store coverage stats
-        adata.var[f
+        adata.var[f"{ref}_valid_count"] = pd.Series(ref_valid_coverage, index=adata.var.index)
+        adata.var[f"{ref}_valid_fraction"] = pd.Series(ref_valid_fraction, index=adata.var.index)
 
        # Assign whether the position is covered based on threshold
-        adata.var[f
+        adata.var[f"position_in_{ref}"] = ref_valid_fraction >= position_nan_threshold
 
        # Sum the number of categories covering each position
-        n_categories_with_position += adata.var[f
+        n_categories_with_position += adata.var[f"position_in_{ref}"].values
 
    # Store final category count
-    adata.var[f
+    adata.var[f"N_{ref_column}_with_position"] = n_categories_with_position.astype(int)
 
    # mark as done
-    adata.uns[uns_flag] = True
+    adata.uns[uns_flag] = True
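A minimal sketch of `calculate_coverage` on a toy AnnData, following the column names in the hunk above; the data and reference name are illustrative.

```python
import anndata as ad
import numpy as np
import pandas as pd

from smftools.preprocessing.calculate_coverage import calculate_coverage

# 3 reads x 3 positions; NaN marks positions without valid calls.
X = np.array(
    [[0.9, np.nan, 0.1],
     [0.8, np.nan, np.nan],
     [np.nan, np.nan, 0.3]]
)
adata = ad.AnnData(X=X)
adata.obs["Reference_strand"] = pd.Categorical(["refA_top"] * 3)

calculate_coverage(adata, position_nan_threshold=0.5, smf_modality="deaminase")

# Per-position coverage fractions and the >= 0.5 validity calls.
print(adata.var[["refA_top_valid_fraction", "position_in_refA_top"]])
# The uns flag now guards reruns; pass force_redo=True to recompute.
print(adata.uns["calculate_coverage_performed"])
```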
smftools/preprocessing/calculate_pairwise_differences.py
@@ -1,5 +1,6 @@
 # calculate_pairwise_differences
 
+
 def calculate_pairwise_differences(arrays):
     """
     Calculate the pairwise differences for a list of h-stacked ndarrays. Ignore N-positions
@@ -41,7 +42,7 @@ def calculate_pairwise_differences(arrays):
             # Calculate the hamming distance directly with boolean operations
             differences = (array_i != array_j) & ~combined_mask
             distance = np.sum(differences) / np.sum(~combined_mask)
-
+
             # Store the symmetric distances
             distance_matrix[i, j] = distance
             distance_matrix[j, i] = distance
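A worked example of the masked-difference formula in the hunk above: positions where either read carries an N (the combined mask) are excluded from both the numerator and the denominator. The toy vectors are illustrative.

```python
import numpy as np

array_i = np.array([1, 0, 1, 1])
array_j = np.array([1, 1, 0, 1])
combined_mask = np.array([False, False, True, False])  # e.g. an N position in one read

# Count mismatches only at unmasked positions, then normalize by valid positions.
differences = (array_i != array_j) & ~combined_mask
distance = np.sum(differences) / np.sum(~combined_mask)
print(distance)  # 1 mismatch over 3 valid positions -> 0.3333...
```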
smftools/preprocessing/calculate_pairwise_hamming_distances.py
@@ -1,6 +1,6 @@
 ## calculate_pairwise_hamming_distances
 
-## Conversion SMF Specific
+## Conversion SMF Specific
 def calculate_pairwise_hamming_distances(arrays):
     """
     Calculate the pairwise Hamming distances for a list of h-stacked ndarrays.
@@ -13,8 +13,9 @@ def calculate_pairwise_hamming_distances(arrays):
 
     """
     import numpy as np
-    from tqdm import tqdm
     from scipy.spatial.distance import hamming
+    from tqdm import tqdm
+
     num_arrays = len(arrays)
     # Initialize an empty distance matrix
     distance_matrix = np.zeros((num_arrays, num_arrays))
@@ -24,4 +25,4 @@ def calculate_pairwise_hamming_distances(arrays):
             distance = hamming(arrays[i], arrays[j])
             distance_matrix[i, j] = distance
             distance_matrix[j, i] = distance
-    return distance_matrix
+    return distance_matrix