smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +7 -6
- smftools/_version.py +1 -1
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +38 -0
- smftools/config/deaminase.yaml +61 -0
- smftools/config/default.yaml +264 -0
- smftools/config/direct.yaml +41 -0
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +1288 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +13 -9
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
- smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/binarize_converted_base_identities.py +172 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +609 -0
- smftools/plotting/general_plotting.py +1292 -140
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +15 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +1021 -89
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
- smftools-0.2.3.dist-info/RECORD +173 -0
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/informatics/fast5_to_pod5.py +0 -21
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/__init__.py +0 -74
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
- smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools/evaluation → cli}/__init__.py +0 -0
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
def calculate_coverage(adata, obs_column='Reference_strand', position_nan_threshold=0.
|
|
1
|
+
def calculate_coverage(adata, obs_column='Reference_strand', position_nan_threshold=0.00001, uns_flag='positional_coverage_calculated'):
|
|
2
2
|
"""
|
|
3
3
|
Append position-level metadata regarding whether the position is informative within the given observation category.
|
|
4
4
|
|
|
@@ -13,6 +13,12 @@ def calculate_coverage(adata, obs_column='Reference_strand', position_nan_thresh
|
|
|
13
13
|
import numpy as np
|
|
14
14
|
import pandas as pd
|
|
15
15
|
import anndata as ad
|
|
16
|
+
|
|
17
|
+
# Only run if not already performed
|
|
18
|
+
already = bool(adata.uns.get(uns_flag, False))
|
|
19
|
+
if already:
|
|
20
|
+
# QC already performed; nothing to do
|
|
21
|
+
return
|
|
16
22
|
|
|
17
23
|
categories = adata.obs[obs_column].cat.categories
|
|
18
24
|
n_categories_with_position = np.zeros(adata.shape[1])
|
|
@@ -40,3 +46,6 @@ def calculate_coverage(adata, obs_column='Reference_strand', position_nan_thresh
|
|
|
40
46
|
|
|
41
47
|
# Store final category count
|
|
42
48
|
adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
|
|
49
|
+
|
|
50
|
+
# mark as done
|
|
51
|
+
adata.uns[uns_flag] = True
|
|
@@ -103,7 +103,7 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
|
|
|
103
103
|
probability_thresholding_list[position] = (0.8, np.nan)
|
|
104
104
|
title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {cat}'
|
|
105
105
|
plt.title(title)
|
|
106
|
-
save_name = output_directory
|
|
106
|
+
save_name = output_directory / f"{title}.png"
|
|
107
107
|
if save:
|
|
108
108
|
plt.savefig(save_name)
|
|
109
109
|
plt.close()
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
def calculate_read_modification_stats(adata,
                                      reference_column,
                                      sample_names_col,
                                      mod_target_bases,
                                      uns_flag="read_modification_stats_calculated",
                                      bypass=False,
                                      force_redo=False
                                      ):
    """
    Add per-read methylation/deamination statistics to ``adata.obs``.

    For each reference and site type, records the number of valid (non-NaN)
    positions in the read, the number of modified positions, the fraction
    modified, and the fraction of reference sites covered by the read.
    For cytosine targets also records the read GpC and CpG modification
    ratios relative to other-C modification (a background false-positive
    metric for cytosine-MTase SMF).

    Parameters:
        adata (AnnData): An adata object with per-site matrices in
            ``adata.obsm['{reference}_{site_type}']`` (binary modification
            calls; NaN marks positions with no data).
        reference_column (str): Name of the obs column holding the reference.
        sample_names_col (str): Name of the obs column holding sample names.
            NOTE(review): currently unused by this function.
        mod_target_bases (iterable of str): Modified base targets, e.g.
            ['GpC', 'CpG', 'C'] and/or ['A'].
        uns_flag (str): ``adata.uns`` key marking this computation as done.
        bypass (bool): If True, skip the computation entirely.
        force_redo (bool): If True, recompute even if ``uns_flag`` is set.

    Returns:
        None
    """
    import numpy as np
    import pandas as pd

    # Only run if not already performed (or explicitly forced).
    already = bool(adata.uns.get(uns_flag, False))
    if (already and not force_redo) or bypass:
        # QC already performed; nothing to do
        return

    print('Calculating read level Modification statistics')

    references = set(adata.obs[reference_column])
    site_types = []

    if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
        site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'C_site']

    if 'A' in mod_target_bases:
        site_types += ['A_site']

    # Initialize output columns so every read has a value even if its
    # reference subset is never visited below.
    for site_type in site_types:
        adata.obs[f'Modified_{site_type}_count'] = pd.Series(0, index=adata.obs_names, dtype=int)
        adata.obs[f'Total_{site_type}_in_read'] = pd.Series(0, index=adata.obs_names, dtype=int)
        adata.obs[f'Fraction_{site_type}_modified'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
        # dtype must be float here: pandas raises when casting NaN to int.
        adata.obs[f'Total_{site_type}_in_reference'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
        adata.obs[f'Valid_{site_type}_in_read_vs_reference'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)

    for ref in references:
        ref_subset = adata[adata.obs[reference_column] == ref]
        for site_type in site_types:
            print(f'Iterating over {ref}_{site_type}')
            observation_matrix = ref_subset.obsm[f'{ref}_{site_type}']
            # Valid positions in a read are the non-NaN entries of its row.
            total_positions_in_read = np.sum(~np.isnan(observation_matrix), axis=1)
            total_positions_in_reference = observation_matrix.shape[1]
            fraction_valid_positions_in_read_vs_ref = total_positions_in_read / total_positions_in_reference
            number_mods_in_read = np.nansum(observation_matrix, axis=1)

            # Guarded division: reads with zero valid positions get NaN
            # instead of triggering a divide-by-zero warning.
            fraction_modified = np.divide(
                number_mods_in_read,
                total_positions_in_read,
                out=np.full_like(number_mods_in_read, np.nan, dtype=float),
                where=total_positions_in_read != 0
            )

            temp_obs_data = pd.DataFrame({f'Total_{site_type}_in_read': total_positions_in_read,
                                          f'Modified_{site_type}_count': number_mods_in_read,
                                          f'Fraction_{site_type}_modified': fraction_modified,
                                          f'Total_{site_type}_in_reference': total_positions_in_reference,
                                          f'Valid_{site_type}_in_read_vs_reference': fraction_valid_positions_in_read_vs_ref},
                                         index=ref_subset.obs.index)

            # update() aligns on the read index, leaving other references' rows untouched.
            adata.obs.update(temp_obs_data)

    if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
        # Work on plain float arrays so np.divide's out=/where= guards apply cleanly.
        gpc = adata.obs['Fraction_GpC_site_modified'].to_numpy(dtype=float)
        cpg = adata.obs['Fraction_CpG_site_modified'].to_numpy(dtype=float)
        background = adata.obs['Fraction_other_C_site_modified'].to_numpy(dtype=float)

        with np.errstate(divide='ignore', invalid='ignore'):
            gpc_to_c_ratio = np.divide(
                gpc,
                background,
                out=np.full_like(gpc, np.nan, dtype=float),
                where=background != 0
            )

            cpg_to_c_ratio = np.divide(
                cpg,
                background,
                out=np.full_like(cpg, np.nan, dtype=float),
                where=background != 0
            )

        adata.obs['GpC_to_other_C_mod_ratio'] = gpc_to_c_ratio
        adata.obs['CpG_to_other_C_mod_ratio'] = cpg_to_c_ratio

    # mark as done
    adata.uns[uns_flag] = True

    return
|
|
@@ -1,4 +1,9 @@
|
|
|
1
|
-
def clean_NaN(adata,
|
|
1
|
+
def clean_NaN(adata,
|
|
2
|
+
layer=None,
|
|
3
|
+
uns_flag='clean_NaN_performed',
|
|
4
|
+
bypass=False,
|
|
5
|
+
force_redo=True
|
|
6
|
+
):
|
|
2
7
|
"""
|
|
3
8
|
Append layers to adata that contain NaN cleaning strategies.
|
|
4
9
|
|
|
@@ -14,6 +19,12 @@ def clean_NaN(adata, layer=None):
|
|
|
14
19
|
import anndata as ad
|
|
15
20
|
from ..readwrite import adata_to_df
|
|
16
21
|
|
|
22
|
+
# Only run if not already performed
|
|
23
|
+
already = bool(adata.uns.get(uns_flag, False))
|
|
24
|
+
if (already and not force_redo) or bypass:
|
|
25
|
+
# QC already performed; nothing to do
|
|
26
|
+
return
|
|
27
|
+
|
|
17
28
|
# Ensure the specified layer exists
|
|
18
29
|
if layer and layer not in adata.layers:
|
|
19
30
|
raise ValueError(f"Layer '{layer}' not found in adata.layers.")
|
|
@@ -44,3 +55,8 @@ def clean_NaN(adata, layer=None):
|
|
|
44
55
|
print('Making layer: nan_half')
|
|
45
56
|
df_nan_half = df.fillna(0.5)
|
|
46
57
|
adata.layers['nan_half'] = df_nan_half.values
|
|
58
|
+
|
|
59
|
+
# mark as done
|
|
60
|
+
adata.uns[uns_flag] = True
|
|
61
|
+
|
|
62
|
+
return None
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
from typing import Optional, Union, Sequence
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import anndata as ad
|
|
5
|
+
|
|
6
|
+
def filter_reads_on_length_quality_mapping(
    adata: "ad.AnnData",
    filter_on_coordinates: "Union[bool, Sequence]" = False,
    # New single-range params (preferred):
    read_length: "Optional[Sequence[float]]" = None,      # e.g. [min, max]
    length_ratio: "Optional[Sequence[float]]" = None,     # e.g. [min, max]
    read_quality: "Optional[Sequence[float]]" = None,     # e.g. [min, max] (commonly min only)
    mapping_quality: "Optional[Sequence[float]]" = None,  # e.g. [min, max] (commonly min only)
    uns_flag: str = "reads_removed_failing_length_quality_mapping_qc",
    bypass: bool = False,
    force_redo: bool = True
) -> "ad.AnnData":
    """
    Filter AnnData by coordinate window, read length, length ratio, read quality
    and mapping quality.

    Each of `read_length`, `length_ratio`, `read_quality`, `mapping_quality`
    may be None (no restriction) or a 2-sequence ``[min, max]`` where either
    bound may be None; reversed bounds are swapped. Rows with NaN in a
    filtered column are always removed by that filter.

    Parameters:
        adata: Input AnnData.
        filter_on_coordinates: False, or an iterable ``(low, high)``; variables
            whose (numeric) names fall within the window are kept. If no
            variable falls inside, the closest variables to each boundary are
            used instead so the result is non-empty.
        uns_flag: ``adata.uns`` key marking this QC as done.
        bypass: If True, return `adata` unchanged.
        force_redo: If True, re-run even when `uns_flag` is already set.

    Returns:
        A filtered copy of the input AnnData (or `adata` itself when skipped),
        with ``adata.uns[uns_flag] = True`` on the returned object.
    """
    # early exit
    already = bool(adata.uns.get(uns_flag, False))
    if bypass or (already and not force_redo):
        return adata

    adata_work = adata
    start_n = adata_work.n_obs

    # --- coordinate filtering ---
    if filter_on_coordinates:
        try:
            low, high = tuple(filter_on_coordinates)
        except Exception:
            raise ValueError("filter_on_coordinates must be False or an iterable of two numbers (low, high).")
        try:
            var_coords = np.array([float(v) for v in adata_work.var_names])
            if low > high:
                low, high = high, low
            col_mask_bool = (var_coords >= float(low)) & (var_coords <= float(high))
            if not col_mask_bool.any():
                # Empty window: fall back to the variables nearest each boundary.
                start_idx = int(np.argmin(np.abs(var_coords - float(low))))
                end_idx = int(np.argmin(np.abs(var_coords - float(high))))
                lo_idx, hi_idx = min(start_idx, end_idx), max(start_idx, end_idx)
                selected_cols = list(adata_work.var_names[lo_idx : hi_idx + 1])
            else:
                selected_cols = list(adata_work.var_names[col_mask_bool])
            print(f"Subsetting adata to coordinates between {low} and {high}: keeping {len(selected_cols)} variables.")
            adata_work = adata_work[:, selected_cols].copy()
        except Exception:
            print("Warning: could not interpret adata.var_names as numeric coordinates — skipping coordinate filtering.")

    def _coerce_range(range_arg):
        """
        Given range_arg which may be None or a 2-seq [min,max], return
        (min_or_None, max_or_None). If both present and min>max they are swapped.
        Anything that is not a 2-element sequence means "no restriction".
        """
        if range_arg is None:
            return None, None
        if not isinstance(range_arg, (list, tuple, np.ndarray)) or len(range_arg) != 2:
            return None, None
        lo_raw, hi_raw = range_arg[0], range_arg[1]
        lo = None if lo_raw is None else float(lo_raw)
        hi = None if hi_raw is None else float(hi_raw)
        if (lo is not None) and (hi is not None) and lo > hi:
            lo, hi = hi, lo
        return lo, hi

    # --- build combined mask ---
    combined_mask = pd.Series(True, index=adata_work.obs.index)

    def _apply_numeric_filter(col, bounds, label):
        """AND a [min,max] filter on adata_work.obs[col] into combined_mask (NaNs dropped)."""
        nonlocal combined_mask
        lo, hi = _coerce_range(bounds)
        if lo is None and hi is None:
            return
        if col not in adata_work.obs.columns:
            print(f"Warning: '{col}' not found in adata.obs — skipping {label} filter.")
            return
        vals = pd.to_numeric(adata_work.obs[col], errors="coerce")
        mask = pd.Series(True, index=adata_work.obs.index)
        if lo is not None:
            mask &= (vals >= lo)
        if hi is not None:
            mask &= (vals <= hi)
        mask &= vals.notna()
        combined_mask &= mask
        print(f"Planned {label} filter: min={lo}, max={hi}")

    # The four per-column filters share the same shape; run them via the helper.
    _apply_numeric_filter("mapped_length", read_length, "read_length")
    _apply_numeric_filter("mapped_length_to_reference_length_ratio", length_ratio, "length_ratio")
    _apply_numeric_filter("read_quality", read_quality, "read_quality")
    _apply_numeric_filter("mapping_quality", mapping_quality, "mapping_quality")

    # Apply combined mask and report
    s0 = adata_work.n_obs
    combined_mask_bool = combined_mask.astype(bool).values
    adata_work = adata_work[combined_mask_bool].copy()
    s1 = adata_work.n_obs
    print(f"Combined filters applied: kept {s1} / {s0} reads (removed {s0 - s1})")

    final_n = adata_work.n_obs
    print(f"Filtering complete: start={start_n}, final={final_n}, removed={start_n - final_n}")

    # mark as done
    adata_work.uns[uns_flag] = True

    return adata_work
|