smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +6 -8
- smftools/_settings.py +4 -6
- smftools/_version.py +1 -1
- smftools/cli/helpers.py +7 -1
- smftools/cli/hmm_adata.py +902 -244
- smftools/cli/load_adata.py +318 -198
- smftools/cli/preprocess_adata.py +285 -171
- smftools/cli/spatial_adata.py +137 -53
- smftools/cli_entry.py +94 -178
- smftools/config/__init__.py +1 -1
- smftools/config/conversion.yaml +5 -1
- smftools/config/deaminase.yaml +1 -1
- smftools/config/default.yaml +22 -17
- smftools/config/direct.yaml +8 -3
- smftools/config/discover_input_files.py +19 -5
- smftools/config/experiment_config.py +505 -276
- smftools/constants.py +37 -0
- smftools/datasets/__init__.py +2 -8
- smftools/datasets/datasets.py +32 -18
- smftools/hmm/HMM.py +2125 -1426
- smftools/hmm/__init__.py +2 -3
- smftools/hmm/archived/call_hmm_peaks.py +16 -1
- smftools/hmm/call_hmm_peaks.py +173 -193
- smftools/hmm/display_hmm.py +19 -6
- smftools/hmm/hmm_readwrite.py +13 -4
- smftools/hmm/nucleosome_hmm_refinement.py +102 -14
- smftools/informatics/__init__.py +30 -7
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
- smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
- smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
- smftools/informatics/archived/print_bam_query_seq.py +7 -1
- smftools/informatics/bam_functions.py +379 -156
- smftools/informatics/basecalling.py +51 -9
- smftools/informatics/bed_functions.py +90 -57
- smftools/informatics/binarize_converted_base_identities.py +18 -7
- smftools/informatics/complement_base_list.py +7 -6
- smftools/informatics/converted_BAM_to_adata.py +265 -122
- smftools/informatics/fasta_functions.py +161 -83
- smftools/informatics/h5ad_functions.py +195 -29
- smftools/informatics/modkit_extract_to_adata.py +609 -270
- smftools/informatics/modkit_functions.py +85 -44
- smftools/informatics/ohe.py +44 -21
- smftools/informatics/pod5_functions.py +112 -73
- smftools/informatics/run_multiqc.py +20 -14
- smftools/logging_utils.py +51 -0
- smftools/machine_learning/__init__.py +2 -7
- smftools/machine_learning/data/anndata_data_module.py +143 -50
- smftools/machine_learning/data/preprocessing.py +2 -1
- smftools/machine_learning/evaluation/__init__.py +1 -1
- smftools/machine_learning/evaluation/eval_utils.py +11 -14
- smftools/machine_learning/evaluation/evaluators.py +46 -33
- smftools/machine_learning/inference/__init__.py +1 -1
- smftools/machine_learning/inference/inference_utils.py +7 -4
- smftools/machine_learning/inference/lightning_inference.py +9 -13
- smftools/machine_learning/inference/sklearn_inference.py +6 -8
- smftools/machine_learning/inference/sliding_window_inference.py +35 -25
- smftools/machine_learning/models/__init__.py +10 -5
- smftools/machine_learning/models/base.py +28 -42
- smftools/machine_learning/models/cnn.py +15 -11
- smftools/machine_learning/models/lightning_base.py +71 -40
- smftools/machine_learning/models/mlp.py +13 -4
- smftools/machine_learning/models/positional.py +3 -2
- smftools/machine_learning/models/rnn.py +3 -2
- smftools/machine_learning/models/sklearn_models.py +39 -22
- smftools/machine_learning/models/transformer.py +68 -53
- smftools/machine_learning/models/wrappers.py +2 -1
- smftools/machine_learning/training/__init__.py +2 -2
- smftools/machine_learning/training/train_lightning_model.py +29 -20
- smftools/machine_learning/training/train_sklearn_model.py +9 -15
- smftools/machine_learning/utils/__init__.py +1 -1
- smftools/machine_learning/utils/device.py +7 -4
- smftools/machine_learning/utils/grl.py +3 -1
- smftools/metadata.py +443 -0
- smftools/plotting/__init__.py +19 -5
- smftools/plotting/autocorrelation_plotting.py +145 -44
- smftools/plotting/classifiers.py +162 -72
- smftools/plotting/general_plotting.py +347 -168
- smftools/plotting/hmm_plotting.py +42 -13
- smftools/plotting/position_stats.py +145 -85
- smftools/plotting/qc_plotting.py +20 -12
- smftools/preprocessing/__init__.py +8 -8
- smftools/preprocessing/append_base_context.py +105 -79
- smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
- smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
- smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
- smftools/preprocessing/binarize.py +21 -4
- smftools/preprocessing/binarize_on_Youden.py +127 -31
- smftools/preprocessing/binary_layers_to_ohe.py +17 -11
- smftools/preprocessing/calculate_complexity_II.py +86 -59
- smftools/preprocessing/calculate_consensus.py +28 -19
- smftools/preprocessing/calculate_coverage.py +44 -22
- smftools/preprocessing/calculate_pairwise_differences.py +2 -1
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
- smftools/preprocessing/calculate_position_Youden.py +103 -55
- smftools/preprocessing/calculate_read_length_stats.py +52 -23
- smftools/preprocessing/calculate_read_modification_stats.py +91 -57
- smftools/preprocessing/clean_NaN.py +38 -28
- smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
- smftools/preprocessing/flag_duplicate_reads.py +688 -271
- smftools/preprocessing/invert_adata.py +26 -11
- smftools/preprocessing/load_sample_sheet.py +40 -22
- smftools/preprocessing/make_dirs.py +8 -3
- smftools/preprocessing/min_non_diagonal.py +2 -1
- smftools/preprocessing/recipes.py +56 -23
- smftools/preprocessing/reindex_references_adata.py +93 -27
- smftools/preprocessing/subsample_adata.py +33 -16
- smftools/readwrite.py +264 -109
- smftools/schema/__init__.py +11 -0
- smftools/schema/anndata_schema_v1.yaml +227 -0
- smftools/tools/__init__.py +3 -4
- smftools/tools/archived/classifiers.py +163 -0
- smftools/tools/archived/subset_adata_v1.py +10 -1
- smftools/tools/archived/subset_adata_v2.py +12 -1
- smftools/tools/calculate_umap.py +54 -15
- smftools/tools/cluster_adata_on_methylation.py +115 -46
- smftools/tools/general_tools.py +70 -25
- smftools/tools/position_stats.py +229 -98
- smftools/tools/read_stats.py +50 -29
- smftools/tools/spatial_autocorrelation.py +365 -192
- smftools/tools/subset_adata.py +23 -21
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
- smftools-0.2.5.dist-info/RECORD +181 -0
- smftools-0.2.4.dist-info/RECORD +0 -176
- /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
- /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
- /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
- {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,26 +1,38 @@
|
|
|
1
1
|
## filter_adata_by_nan_proportion
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
import anndata as ad
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def filter_adata_by_nan_proportion(
|
|
12
|
+
adata: "ad.AnnData", threshold: float, axis: str = "obs"
|
|
13
|
+
) -> "ad.AnnData":
|
|
14
|
+
"""Filter an AnnData object on NaN proportion in a matrix axis.
|
|
15
|
+
|
|
16
|
+
Args:
|
|
17
|
+
adata: AnnData object to filter.
|
|
18
|
+
threshold: Maximum allowed NaN proportion.
|
|
19
|
+
axis: Whether to filter based on ``"obs"`` or ``"var"`` NaN content.
|
|
6
20
|
|
|
7
|
-
Parameters:
|
|
8
|
-
adata (AnnData):
|
|
9
|
-
threshold (float): The max np.nan content to allow in the given axis.
|
|
10
|
-
axis (str): Whether to filter the adata based on obs or var np.nan content
|
|
11
21
|
Returns:
|
|
12
|
-
|
|
22
|
+
anndata.AnnData: Filtered AnnData object.
|
|
23
|
+
|
|
24
|
+
Raises:
|
|
25
|
+
ValueError: If ``axis`` is not ``"obs"`` or ``"var"``.
|
|
13
26
|
"""
|
|
14
27
|
import numpy as np
|
|
15
|
-
import anndata as ad
|
|
16
28
|
|
|
17
|
-
if axis ==
|
|
29
|
+
if axis == "obs":
|
|
18
30
|
# Calculate the proportion of NaN values in each read
|
|
19
31
|
nan_proportion = np.isnan(adata.X).mean(axis=1)
|
|
20
32
|
# Filter reads to keep reads with less than a certain NaN proportion
|
|
21
33
|
filtered_indices = np.where(nan_proportion <= threshold)[0]
|
|
22
34
|
filtered_adata = adata[filtered_indices, :].copy()
|
|
23
|
-
elif axis ==
|
|
35
|
+
elif axis == "var":
|
|
24
36
|
# Calculate the proportion of NaN values at a given position
|
|
25
37
|
nan_proportion = np.isnan(adata.X).mean(axis=0)
|
|
26
38
|
# Filter positions to keep positions with less than a certain NaN proportion
|
|
@@ -28,4 +40,4 @@ def filter_adata_by_nan_proportion(adata, threshold, axis='obs'):
|
|
|
28
40
|
filtered_adata = adata[:, filtered_indices].copy()
|
|
29
41
|
else:
|
|
30
42
|
raise ValueError("Axis must be either 'obs' or 'var'")
|
|
31
|
-
return filtered_adata
|
|
43
|
+
return filtered_adata
|
|
@@ -1,28 +1,41 @@
|
|
|
1
|
-
from typing import Optional,
|
|
1
|
+
from typing import Optional, Sequence, Union
|
|
2
|
+
|
|
3
|
+
import anndata as ad
|
|
2
4
|
import numpy as np
|
|
3
5
|
import pandas as pd
|
|
4
|
-
|
|
6
|
+
|
|
7
|
+
from smftools.logging_utils import get_logger
|
|
8
|
+
|
|
9
|
+
logger = get_logger(__name__)
|
|
10
|
+
|
|
5
11
|
|
|
6
12
|
def filter_reads_on_length_quality_mapping(
|
|
7
13
|
adata: ad.AnnData,
|
|
8
14
|
filter_on_coordinates: Union[bool, Sequence] = False,
|
|
9
15
|
# New single-range params (preferred):
|
|
10
|
-
read_length: Optional[Sequence[float]] = None,
|
|
11
|
-
length_ratio: Optional[Sequence[float]] = None,
|
|
12
|
-
read_quality: Optional[Sequence[float]] = None,
|
|
13
|
-
mapping_quality: Optional[Sequence[float]] = None,
|
|
16
|
+
read_length: Optional[Sequence[float]] = None, # e.g. [min, max]
|
|
17
|
+
length_ratio: Optional[Sequence[float]] = None, # e.g. [min, max]
|
|
18
|
+
read_quality: Optional[Sequence[float]] = None, # e.g. [min, max] (commonly min only)
|
|
19
|
+
mapping_quality: Optional[Sequence[float]] = None, # e.g. [min, max] (commonly min only)
|
|
14
20
|
uns_flag: str = "filter_reads_on_length_quality_mapping_performed",
|
|
15
21
|
bypass: bool = False,
|
|
16
|
-
force_redo: bool = True
|
|
22
|
+
force_redo: bool = True,
|
|
17
23
|
) -> ad.AnnData:
|
|
18
|
-
"""
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
24
|
+
"""Filter AnnData by coordinates, read length, quality, and mapping metrics.
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
adata: AnnData object to filter.
|
|
28
|
+
filter_on_coordinates: Optional coordinate window as a two-value sequence.
|
|
29
|
+
read_length: Read length range as ``[min, max]``.
|
|
30
|
+
length_ratio: Length ratio range as ``[min, max]``.
|
|
31
|
+
read_quality: Read quality range as ``[min, max]``.
|
|
32
|
+
mapping_quality: Mapping quality range as ``[min, max]``.
|
|
33
|
+
uns_flag: Flag in ``adata.uns`` indicating prior completion.
|
|
34
|
+
bypass: Whether to skip processing.
|
|
35
|
+
force_redo: Whether to rerun even if ``uns_flag`` is set.
|
|
36
|
+
|
|
37
|
+
Returns:
|
|
38
|
+
anndata.AnnData: Filtered copy of the input AnnData.
|
|
26
39
|
"""
|
|
27
40
|
# early exit
|
|
28
41
|
already = bool(adata.uns.get(uns_flag, False))
|
|
@@ -37,7 +50,9 @@ def filter_reads_on_length_quality_mapping(
|
|
|
37
50
|
try:
|
|
38
51
|
low, high = tuple(filter_on_coordinates)
|
|
39
52
|
except Exception:
|
|
40
|
-
raise ValueError(
|
|
53
|
+
raise ValueError(
|
|
54
|
+
"filter_on_coordinates must be False or an iterable of two numbers (low, high)."
|
|
55
|
+
)
|
|
41
56
|
try:
|
|
42
57
|
var_coords = np.array([float(v) for v in adata_work.var_names])
|
|
43
58
|
if low > high:
|
|
@@ -50,10 +65,17 @@ def filter_reads_on_length_quality_mapping(
|
|
|
50
65
|
selected_cols = list(adata_work.var_names[lo_idx : hi_idx + 1])
|
|
51
66
|
else:
|
|
52
67
|
selected_cols = list(adata_work.var_names[col_mask_bool])
|
|
53
|
-
|
|
68
|
+
logger.info(
|
|
69
|
+
"Subsetting adata to coordinates between %s and %s: keeping %s variables.",
|
|
70
|
+
low,
|
|
71
|
+
high,
|
|
72
|
+
len(selected_cols),
|
|
73
|
+
)
|
|
54
74
|
adata_work = adata_work[:, selected_cols].copy()
|
|
55
75
|
except Exception:
|
|
56
|
-
|
|
76
|
+
logger.warning(
|
|
77
|
+
"Could not interpret adata.var_names as numeric coordinates — skipping coordinate filtering."
|
|
78
|
+
)
|
|
57
79
|
|
|
58
80
|
# --- helper to coerce range inputs ---
|
|
59
81
|
def _coerce_range(range_arg):
|
|
@@ -85,72 +107,83 @@ def filter_reads_on_length_quality_mapping(
|
|
|
85
107
|
# read length filter
|
|
86
108
|
if (rl_min is not None) or (rl_max is not None):
|
|
87
109
|
if "mapped_length" not in adata_work.obs.columns:
|
|
88
|
-
|
|
110
|
+
logger.warning("'mapped_length' not found in adata.obs — skipping read_length filter.")
|
|
89
111
|
else:
|
|
90
112
|
vals = pd.to_numeric(adata_work.obs["mapped_length"], errors="coerce")
|
|
91
113
|
mask = pd.Series(True, index=adata_work.obs.index)
|
|
92
114
|
if rl_min is not None:
|
|
93
|
-
mask &=
|
|
115
|
+
mask &= vals >= rl_min
|
|
94
116
|
if rl_max is not None:
|
|
95
|
-
mask &=
|
|
117
|
+
mask &= vals <= rl_max
|
|
96
118
|
mask &= vals.notna()
|
|
97
119
|
combined_mask &= mask
|
|
98
|
-
|
|
120
|
+
logger.info("Planned read_length filter: min=%s, max=%s", rl_min, rl_max)
|
|
99
121
|
|
|
100
122
|
# length ratio filter
|
|
101
123
|
if (lr_min is not None) or (lr_max is not None):
|
|
102
124
|
if "mapped_length_to_reference_length_ratio" not in adata_work.obs.columns:
|
|
103
|
-
|
|
125
|
+
logger.warning(
|
|
126
|
+
"'mapped_length_to_reference_length_ratio' not found in adata.obs — skipping length_ratio filter."
|
|
127
|
+
)
|
|
104
128
|
else:
|
|
105
|
-
vals = pd.to_numeric(
|
|
129
|
+
vals = pd.to_numeric(
|
|
130
|
+
adata_work.obs["mapped_length_to_reference_length_ratio"], errors="coerce"
|
|
131
|
+
)
|
|
106
132
|
mask = pd.Series(True, index=adata_work.obs.index)
|
|
107
133
|
if lr_min is not None:
|
|
108
|
-
mask &=
|
|
134
|
+
mask &= vals >= lr_min
|
|
109
135
|
if lr_max is not None:
|
|
110
|
-
mask &=
|
|
136
|
+
mask &= vals <= lr_max
|
|
111
137
|
mask &= vals.notna()
|
|
112
138
|
combined_mask &= mask
|
|
113
|
-
|
|
139
|
+
logger.info("Planned length_ratio filter: min=%s, max=%s", lr_min, lr_max)
|
|
114
140
|
|
|
115
141
|
# read quality filter (supporting optional range but typically min only)
|
|
116
142
|
if (rq_min is not None) or (rq_max is not None):
|
|
117
143
|
if "read_quality" not in adata_work.obs.columns:
|
|
118
|
-
|
|
144
|
+
logger.warning("'read_quality' not found in adata.obs — skipping read_quality filter.")
|
|
119
145
|
else:
|
|
120
146
|
vals = pd.to_numeric(adata_work.obs["read_quality"], errors="coerce")
|
|
121
147
|
mask = pd.Series(True, index=adata_work.obs.index)
|
|
122
148
|
if rq_min is not None:
|
|
123
|
-
mask &=
|
|
149
|
+
mask &= vals >= rq_min
|
|
124
150
|
if rq_max is not None:
|
|
125
|
-
mask &=
|
|
151
|
+
mask &= vals <= rq_max
|
|
126
152
|
mask &= vals.notna()
|
|
127
153
|
combined_mask &= mask
|
|
128
|
-
|
|
154
|
+
logger.info("Planned read_quality filter: min=%s, max=%s", rq_min, rq_max)
|
|
129
155
|
|
|
130
156
|
# mapping quality filter (supporting optional range but typically min only)
|
|
131
157
|
if (mq_min is not None) or (mq_max is not None):
|
|
132
158
|
if "mapping_quality" not in adata_work.obs.columns:
|
|
133
|
-
|
|
159
|
+
logger.warning(
|
|
160
|
+
"'mapping_quality' not found in adata.obs — skipping mapping_quality filter."
|
|
161
|
+
)
|
|
134
162
|
else:
|
|
135
163
|
vals = pd.to_numeric(adata_work.obs["mapping_quality"], errors="coerce")
|
|
136
164
|
mask = pd.Series(True, index=adata_work.obs.index)
|
|
137
165
|
if mq_min is not None:
|
|
138
|
-
mask &=
|
|
166
|
+
mask &= vals >= mq_min
|
|
139
167
|
if mq_max is not None:
|
|
140
|
-
mask &=
|
|
168
|
+
mask &= vals <= mq_max
|
|
141
169
|
mask &= vals.notna()
|
|
142
170
|
combined_mask &= mask
|
|
143
|
-
|
|
171
|
+
logger.info("Planned mapping_quality filter: min=%s, max=%s", mq_min, mq_max)
|
|
144
172
|
|
|
145
173
|
# Apply combined mask and report
|
|
146
174
|
s0 = adata_work.n_obs
|
|
147
175
|
combined_mask_bool = combined_mask.astype(bool).values
|
|
148
176
|
adata_work = adata_work[combined_mask_bool].copy()
|
|
149
177
|
s1 = adata_work.n_obs
|
|
150
|
-
|
|
178
|
+
logger.info("Combined filters applied: kept %s / %s reads (removed %s)", s1, s0, s0 - s1)
|
|
151
179
|
|
|
152
180
|
final_n = adata_work.n_obs
|
|
153
|
-
|
|
181
|
+
logger.info(
|
|
182
|
+
"Filtering complete: start=%s, final=%s, removed=%s",
|
|
183
|
+
start_n,
|
|
184
|
+
final_n,
|
|
185
|
+
start_n - final_n,
|
|
186
|
+
)
|
|
154
187
|
|
|
155
188
|
# mark as done
|
|
156
189
|
adata_work.uns[uns_flag] = True
|