smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- smftools/__init__.py +34 -0
- smftools/_settings.py +20 -0
- smftools/_version.py +1 -0
- smftools/cli.py +184 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +33 -0
- smftools/config/deaminase.yaml +56 -0
- smftools/config/default.yaml +253 -0
- smftools/config/direct.yaml +17 -0
- smftools/config/experiment_config.py +1191 -0
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- smftools/datasets/F1_sample_sheet.csv +5 -0
- smftools/datasets/__init__.py +9 -0
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- smftools/datasets/datasets.py +28 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/hmm/apply_hmm_batched.py +242 -0
- smftools/hmm/calculate_distances.py +18 -0
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/hmm/display_hmm.py +18 -0
- smftools/hmm/hmm_readwrite.py +16 -0
- smftools/hmm/nucleosome_hmm_refinement.py +104 -0
- smftools/hmm/train_hmm.py +78 -0
- smftools/informatics/__init__.py +14 -0
- smftools/informatics/archived/bam_conversion.py +59 -0
- smftools/informatics/archived/bam_direct.py +63 -0
- smftools/informatics/archived/basecalls_to_adata.py +71 -0
- smftools/informatics/archived/conversion_smf.py +132 -0
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/direct_smf.py +137 -0
- smftools/informatics/archived/print_bam_query_seq.py +29 -0
- smftools/informatics/basecall_pod5s.py +80 -0
- smftools/informatics/fast5_to_pod5.py +24 -0
- smftools/informatics/helpers/__init__.py +73 -0
- smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
- smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
- smftools/informatics/helpers/archived/informatics.py +260 -0
- smftools/informatics/helpers/archived/load_adata.py +516 -0
- smftools/informatics/helpers/bam_qc.py +66 -0
- smftools/informatics/helpers/bed_to_bigwig.py +39 -0
- smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
- smftools/informatics/helpers/canoncall.py +34 -0
- smftools/informatics/helpers/complement_base_list.py +21 -0
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
- smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
- smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
- smftools/informatics/helpers/count_aligned_reads.py +43 -0
- smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
- smftools/informatics/helpers/discover_input_files.py +100 -0
- smftools/informatics/helpers/extract_base_identities.py +70 -0
- smftools/informatics/helpers/extract_mods.py +83 -0
- smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
- smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
- smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
- smftools/informatics/helpers/find_conversion_sites.py +51 -0
- smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
- smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
- smftools/informatics/helpers/get_native_references.py +28 -0
- smftools/informatics/helpers/index_fasta.py +12 -0
- smftools/informatics/helpers/make_dirs.py +21 -0
- smftools/informatics/helpers/make_modbed.py +27 -0
- smftools/informatics/helpers/modQC.py +27 -0
- smftools/informatics/helpers/modcall.py +36 -0
- smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
- smftools/informatics/helpers/ohe_batching.py +76 -0
- smftools/informatics/helpers/ohe_layers_decode.py +32 -0
- smftools/informatics/helpers/one_hot_decode.py +27 -0
- smftools/informatics/helpers/one_hot_encode.py +57 -0
- smftools/informatics/helpers/plot_bed_histograms.py +269 -0
- smftools/informatics/helpers/run_multiqc.py +28 -0
- smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
- smftools/informatics/helpers/split_and_index_BAM.py +32 -0
- smftools/informatics/readwrite.py +106 -0
- smftools/informatics/subsample_fasta_from_bed.py +47 -0
- smftools/informatics/subsample_pod5.py +104 -0
- smftools/load_adata.py +1346 -0
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/data/preprocessing.py +6 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/__init__.py +9 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/machine_learning/models/positional.py +18 -0
- smftools/machine_learning/models/rnn.py +17 -0
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/models/wrappers.py +20 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/machine_learning/utils/__init__.py +2 -0
- smftools/machine_learning/utils/device.py +10 -0
- smftools/machine_learning/utils/grl.py +14 -0
- smftools/plotting/__init__.py +18 -0
- smftools/plotting/autocorrelation_plotting.py +611 -0
- smftools/plotting/classifiers.py +355 -0
- smftools/plotting/general_plotting.py +682 -0
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/position_stats.py +462 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +38 -0
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/archives/mark_duplicates.py +146 -0
- smftools/preprocessing/archives/preprocessing.py +614 -0
- smftools/preprocessing/archives/remove_duplicates.py +21 -0
- smftools/preprocessing/binarize_on_Youden.py +45 -0
- smftools/preprocessing/binary_layers_to_ohe.py +40 -0
- smftools/preprocessing/calculate_complexity.py +72 -0
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_consensus.py +47 -0
- smftools/preprocessing/calculate_coverage.py +51 -0
- smftools/preprocessing/calculate_pairwise_differences.py +49 -0
- smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
- smftools/preprocessing/calculate_position_Youden.py +115 -0
- smftools/preprocessing/calculate_read_length_stats.py +79 -0
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +62 -0
- smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1351 -0
- smftools/preprocessing/invert_adata.py +37 -0
- smftools/preprocessing/load_sample_sheet.py +53 -0
- smftools/preprocessing/make_dirs.py +21 -0
- smftools/preprocessing/min_non_diagonal.py +25 -0
- smftools/preprocessing/recipes.py +127 -0
- smftools/preprocessing/subsample_adata.py +58 -0
- smftools/readwrite.py +1004 -0
- smftools/tools/__init__.py +20 -0
- smftools/tools/archived/apply_hmm.py +202 -0
- smftools/tools/archived/classifiers.py +787 -0
- smftools/tools/archived/classify_methylated_features.py +66 -0
- smftools/tools/archived/classify_non_methylated_features.py +75 -0
- smftools/tools/archived/subset_adata_v1.py +32 -0
- smftools/tools/archived/subset_adata_v2.py +46 -0
- smftools/tools/calculate_umap.py +62 -0
- smftools/tools/cluster_adata_on_methylation.py +105 -0
- smftools/tools/general_tools.py +69 -0
- smftools/tools/position_stats.py +601 -0
- smftools/tools/read_stats.py +184 -0
- smftools/tools/spatial_autocorrelation.py +562 -0
- smftools/tools/subset_adata.py +28 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
- smftools-0.2.1.dist-info/RECORD +161 -0
- smftools-0.2.1.dist-info/entry_points.txt +2 -0
- smftools-0.1.6.dist-info/RECORD +0 -4
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
- {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
smftools/__init__.py
ADDED
@@ -0,0 +1,34 @@
+"""smftools"""
+
+import logging
+import warnings
+
+from . import informatics as inform
+from . import machine_learning as ml
+from . import plotting as pl
+from . import preprocessing as pp
+from . import tools as tl
+
+from . import config, datasets, hmm, readwrite
+from .readwrite import adata_to_df, safe_write_h5ad, safe_read_h5ad, merge_barcoded_anndatas_core
+
+from .load_adata import load_adata
+
+from importlib.metadata import version
+
+package_name = "smftools"
+__version__ = version(package_name)
+
+__all__ = [
+    "load_adata",  # comma added: without it, implicit concatenation yields "load_adataadata_to_df"
+    "adata_to_df",
+    "inform",
+    "ml",
+    "pp",
+    "tl",
+    "pl",
+    "readwrite",
+    "datasets",
+    "safe_write_h5ad",
+    "safe_read_h5ad"
+]
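The top-level module mirrors the scanpy convention of short namespace aliases. A minimal usage sketch, assuming only what the imports above re-export (the config filename below is a placeholder, not a file shipped with the package):

    import smftools as smf

    print(smf.__version__)  # resolved at import time via importlib.metadata, e.g. "0.2.1"

    # scanpy-style aliases exposed at the top level:
    # smf.pp (preprocessing), smf.tl (tools), smf.pl (plotting),
    # smf.ml (machine_learning), smf.inform (informatics)

    adata = smf.load_adata("experiment_config.yaml")  # placeholder config path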
smftools/_settings.py
ADDED
@@ -0,0 +1,20 @@
+from pathlib import Path
+from typing import Union
+
+class SMFConfig:
+    """\
+    Config for smftools.
+    """
+
+    def __init__(
+        self,
+        *,
+        datasetdir: Union[Path, str] = "./datasets/"
+    ):
+        self._datasetdir = Path(datasetdir) if isinstance(datasetdir, str) else datasetdir
+
+    @property
+    def datasetdir(self) -> Path:
+        return self._datasetdir
+
+settings = SMFConfig()
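For illustration, the settings singleton currently carries only the dataset directory; since no setter is defined, the value is fixed at construction:

    from smftools._settings import settings

    # datasetdir is a read-only property returning a pathlib.Path,
    # defaulting to Path("./datasets/")
    print(settings.datasetdir)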
smftools/_version.py
ADDED
@@ -0,0 +1 @@
+__version__ = "0.2.1"
smftools/cli.py
ADDED
@@ -0,0 +1,184 @@
+import click
+import pandas as pd
+from pathlib import Path
+from typing import Dict, Optional
+
+from . import load_adata
+from .readwrite import merge_barcoded_anndatas_core, safe_read_h5ad, safe_write_h5ad
+
+@click.group()
+def cli():
+    """Command-line interface for smftools."""
+    pass
+
+####### Main processing workflow ###########
+@cli.command()
+@click.argument("config_path", type=click.Path(exists=True))
+def load(config_path):
+    """Load and process data from CONFIG_PATH."""
+    load_adata(config_path)
+##########################################
+
+
+####### Merging existing anndatas from an experiment that used two different demultiplexing rules #######
+REQUIRED_KEYS = ("adata_single_path", "adata_double_path")
+OPTIONAL_KEYS = (
+    "adata_single_backups_path",
+    "adata_double_backups_path",
+    "output_path",
+    "merged_filename",
+)
+
+def _read_config_csv(csv_path: Path) -> Dict[str, str]:
+    """
+    Read a multi-row, two-column CSV of key,value pairs into a dict.
+
+    Supported features:
+    - Optional header ("key,value") or none.
+    - Comments starting with '#' and blank lines are ignored.
+    - If duplicate keys occur, the last one wins.
+    - Keys are matched literally against REQUIRED_KEYS/OPTIONAL_KEYS.
+    """
+    try:
+        # Read as two columns regardless of header; comments ignored.
+        df = pd.read_csv(
+            csv_path,
+            dtype=str,
+            comment="#",
+            header=None,  # treat everything as rows; we'll normalize below
+            usecols=[0, 1],
+            names=["key", "value"]
+        )
+    except Exception as e:
+        raise click.ClickException(f"Failed to read CSV: {e}") from e
+
+    # Drop completely empty rows
+    df = df.fillna("").astype(str)
+    df["key"] = df["key"].str.strip()
+    df["value"] = df["value"].str.strip()
+    df = df[(df["key"] != "") & (df["key"].notna())]
+
+    if df.empty:
+        raise click.ClickException("Config CSV is empty after removing comments/blank lines.")
+
+    # Remove an optional header row if present
+    if df.iloc[0]["key"].lower() in {"key", "keys"}:
+        df = df.iloc[1:]
+        df = df[(df["key"] != "") & (df["key"].notna())]
+        if df.empty:
+            raise click.ClickException("Config CSV contains only a header row.")
+
+    # Build dict; last occurrence of a key wins
+    cfg = {}
+    for k, v in zip(df["key"], df["value"]):
+        cfg[k] = v
+
+    # Validate required keys
+    missing = [k for k in REQUIRED_KEYS if not cfg.get(k)]
+    if missing:
+        raise click.ClickException(
+            "Missing required keys in CSV: "
+            + ", ".join(missing)
+            + "\nExpected keys:\n - "
+            + "\n - ".join(REQUIRED_KEYS)
+            + "\nOptional keys:\n - "
+            + "\n - ".join(OPTIONAL_KEYS)
+        )
+
+    return cfg
+
+def _resolve_output_path(cfg: Dict[str, str], single_path: Path, double_path: Path) -> Path:
+    """Decide on the output .h5ad path based on CSV; create directories if needed."""
+    merged_filename = cfg.get("merged_filename") or f"merged_{single_path.stem}__{double_path.stem}.h5ad"
+    if not merged_filename.endswith(".h5ad"):
+        merged_filename += ".h5ad"
+
+    output_path_raw = cfg.get("output_path", "").strip()
+
+    if not output_path_raw:
+        out_dir = Path.cwd() / "merged_output"
+        out_dir.mkdir(parents=True, exist_ok=True)
+        return out_dir / merged_filename
+
+    output_path = Path(output_path_raw)
+
+    if output_path.suffix.lower() == ".h5ad":
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        return output_path
+
+    # Treat as directory
+    output_path.mkdir(parents=True, exist_ok=True)
+    return output_path / merged_filename
+
+def _maybe_read_adata(label: str, primary: Path, backups: Optional[Path]):
+    """Read an AnnData via safe_read_h5ad, restoring backups when a backup path is given."""
+    if backups:
+        click.echo(f"Loading {label} from {primary} with backups at {backups} ...")
+        return safe_read_h5ad(primary, backups_path=backups, restore_backups=True)
+    else:
+        click.echo(f"Loading {label} from {primary} with backups disabled ...")
+        return safe_read_h5ad(primary, restore_backups=False)
+
+
+@cli.command()
+@click.argument("config_path", type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path))
+def merge_barcoded_anndatas(config_path: Path):
+    """
+    Merge two AnnData objects from the same experiment that were demultiplexed
+    under different end-barcoding requirements, using a key,value CSV for config.
+
+    CSV must include the keys:
+    - adata_single_path
+    - adata_double_path
+
+    Optional keys:
+    - adata_single_backups_path
+    - adata_double_backups_path
+    - output_path (file or directory; default: ./merged_output/)
+    - merged_filename (default: merged_<single>__<double>.h5ad)
+
+    Example CSV (key,value rows, matching _read_config_csv):
+
+        key,value
+        adata_single_path,/path/single.h5ad
+        adata_double_path,/path/double.h5ad
+        output_path,merged_output
+        merged_filename,merged_run.h5ad
+    """
+    try:
+        cfg = _read_config_csv(config_path)
+
+        single_path = Path(cfg["adata_single_path"]).expanduser().resolve()
+        double_path = Path(cfg["adata_double_path"]).expanduser().resolve()
+
+        for p, label in [(single_path, "adata_single_path"), (double_path, "adata_double_path")]:
+            if not p.exists():
+                raise click.ClickException(f"{label} does not exist: {p}")
+
+        single_backups = Path(cfg["adata_single_backups_path"]).expanduser().resolve() if cfg.get("adata_single_backups_path") else None
+        double_backups = Path(cfg["adata_double_backups_path"]).expanduser().resolve() if cfg.get("adata_double_backups_path") else None
+
+        if single_backups and not single_backups.exists():
+            raise click.ClickException(f"adata_single_backups_path does not exist: {single_backups}")
+        if double_backups and not double_backups.exists():
+            raise click.ClickException(f"adata_double_backups_path does not exist: {double_backups}")
+
+        output_path = _resolve_output_path(cfg, single_path, double_path)
+
+        # Load
+        adata_single, read_report_single = _maybe_read_adata("single-barcoded AnnData", single_path, single_backups)
+        adata_double, read_report_double = _maybe_read_adata("double-barcoded AnnData", double_path, double_backups)
+
+        click.echo("Merging AnnDatas ...")
+        merged = merge_barcoded_anndatas_core(adata_single, adata_double)
+
+        click.echo(f"Writing merged AnnData to: {output_path}")
+        backup_dir = Path.cwd() / "merged_backups"  # was output_path.cwd(); Path.cwd() is the classmethod actually called
+        safe_write_h5ad(merged, output_path, backup=True, backup_dir=backup_dir)
+
+        click.secho(f"Done. Merged AnnData saved to {output_path}", fg="green")
+
+    except click.ClickException:
+        raise
+    except Exception as e:
+        # Surface unexpected errors cleanly
+        raise click.ClickException(f"Unexpected error: {e}") from e
+
+################################################################################################################
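The group can be exercised in-process with click's test runner, which is handy for checking the key,value config format accepted by _read_config_csv. A sketch under stated assumptions: the console-script name comes from entry_points.txt (not shown here), so this drives the click group directly, and the .h5ad paths are placeholders that the command will report as missing:

    from click.testing import CliRunner
    from smftools.cli import cli

    # Write a key,value config accepted by _read_config_csv
    with open("merge_config.csv", "w") as fh:
        fh.write("key,value\n")
        fh.write("adata_single_path,/path/single.h5ad\n")
        fh.write("adata_double_path,/path/double.h5ad\n")
        fh.write("merged_filename,merged_run.h5ad\n")

    runner = CliRunner()
    # click (>= 7.0) derives the subcommand name from the function name,
    # replacing underscores with dashes
    result = runner.invoke(cli, ["merge-barcoded-anndatas", "merge_config.csv"])
    print(result.output)  # placeholder paths above will be reported as missing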
smftools/config/__init__.py
ADDED
@@ -0,0 +1 @@
+from .experiment_config import LoadExperimentConfig, ExperimentConfig
smftools/config/conversion.yaml
ADDED
@@ -0,0 +1,33 @@
+# Conversion (Bisulfite/APOBEC) footprinting defaults
+extends: default
+conversion_types:
+  - '5mC' # 5mC
+
+# Read QC Params
+read_mod_filtering_use_other_c_as_background: True
+
+# HMM
+cpg: True # whether to use the default HMM endogenous CpG patch params
+hmm_methbases:
+  - "GpC"
+hmm_feature_sets:
+  footprint:
+    state: "Non-Modified"
+    features:
+      small_bound_stretch: [0, 20]
+      medium_bound_stretch: [20, 50]
+      putative_nucleosome: [50, 200]
+      large_bound_stretch: [200, inf]
+  accessible:
+    state: "Modified"
+    features:
+      small_accessible_patch: [0, 20]
+      mid_accessible_patch: [20, 80]
+      large_accessible_patch: [80, inf]
+  cpg:
+    state: "Modified"
+    features:
+      cpg_patch: [0, inf]
+
+hmm_merge_layer_features:
+  - ["GpC_all_accessible_features", 80]
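The `extends: default` key indicates layered configs; the actual resolution logic lives in smftools/config/experiment_config.py (not shown in this diff). A minimal sketch of one plausible merge, with a hypothetical resolve_config helper and a shallow child-wins merge as assumptions:

    import yaml  # PyYAML
    from pathlib import Path

    def resolve_config(name: str, config_dir: Path) -> dict:
        """Hypothetical sketch: load <name>.yaml, recursively resolve 'extends',
        and let the child's keys override the parent's (shallow merge)."""
        cfg = yaml.safe_load((config_dir / f"{name}.yaml").read_text()) or {}
        parent_name = cfg.pop("extends", None)
        if parent_name:
            merged = resolve_config(parent_name, config_dir)
            merged.update(cfg)  # child keys win
            return merged
        return cfg

    # e.g. conversion.yaml would override hmm_methbases and cpg from default.yaml:
    # cfg = resolve_config("conversion", Path("smftools/config"))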
smftools/config/deaminase.yaml
ADDED
@@ -0,0 +1,56 @@
+# Deaminase footprinting defaults
+extends: default
+conversion_types:
+  - '5mC' # 5mC
+
+mod_target_bases:
+  - "C"
+
+read_mod_filtering_gpc_thresholds:
+  - null
+  - null
+read_mod_filtering_cpg_thresholds:
+  - null
+  - null
+read_mod_filtering_any_c_thresholds:
+  - 0.01
+  - 0.99
+read_mod_filtering_a_thresholds:
+  - null
+  - null
+
+read_mod_filtering_use_other_c_as_background: False
+
+# Duplicate Detection Params
+duplicate_detection_site_types:
+  - "any_C"
+
+# Autocorrelation params
+autocorr_site_types:
+  - "any_C"
+
+# Correlation matrix params
+correlation_matrix_site_types:
+  - "any_C_site"
+
+# HMM
+cpg: False # whether to use the default HMM endogenous CpG patch params
+hmm_methbases:
+  - "C"
+hmm_feature_sets:
+  footprint:
+    state: "Non-Modified"
+    features:
+      small_bound_stretch: [0, 25]
+      medium_bound_stretch: [25, 80]
+      putative_nucleosome: [80, 200]
+      large_bound_stretch: [200, inf]
+  accessible:
+    state: "Modified"
+    features:
+      small_accessible_patch: [0, 20]
+      mid_accessible_patch: [20, 100]
+      large_accessible_patch: [100, inf]
+
+hmm_merge_layer_features:
+  - ["C_all_accessible_features", 80]
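Both modality configs bin HMM state runs by length: each feature set names a state ("Modified"/"Non-Modified") and [min, max) length windows. A minimal sketch of that classification over a binary per-position state vector; the names mirror the YAML above, but the run-length logic is an assumption (the real implementation lives in smftools/hmm):

    import math

    # Deaminase footprint bins from the YAML above ([min, max))
    FOOTPRINT_FEATURES = {
        "small_bound_stretch": (0, 25),
        "medium_bound_stretch": (25, 80),
        "putative_nucleosome": (80, 200),
        "large_bound_stretch": (200, math.inf),
    }

    def label_runs(states, target=0):
        """Label contiguous runs of `target` (0 = Non-Modified) by run length."""
        labels, i = [], 0
        while i < len(states):
            if states[i] != target:
                i += 1
                continue
            j = i
            while j < len(states) and states[j] == target:
                j += 1  # extend the run of `target` states
            length = j - i
            for name, (lo, hi) in FOOTPRINT_FEATURES.items():
                if lo <= length < hi:
                    labels.append((i, j, name))
                    break
            i = j
        return labels

    # A 100-position protected stretch classifies as a putative nucleosome:
    print(label_runs([1] * 10 + [0] * 100 + [1] * 10))  # [(10, 110, 'putative_nucleosome')]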
smftools/config/default.yaml
ADDED
@@ -0,0 +1,253 @@
+# Generic i/o
+bam_suffix: ".bam"
+recursive_input_search: True
+split_dir: "demultiplexed_BAMs"
+strands:
+  - bottom
+  - top
+conversions:
+  - unconverted
+sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
+sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
+fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barcodes can be provided. Default is autodetecting barcodes.
+fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
+input_already_demuxed: False # If the input files are already demultiplexed.
+delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
+
+# Compute params
+threads: 4
+device: "auto"
+
+# Sequencing modality and general experiment params
+smf_modality: 'conversion' # conversion, deaminase, direct
+sequencer: 'ont' # ont, pacbio, illumina
+barcode_kit: 'SQK-RBK114-96' # SQK-RBK114-96, SQK-NBD114-24, etc
+mod_target_bases:
+  - "GpC"
+  - "CpG"
+enzyme_target_bases:
+  - "GpC"
+
+# Nanopore specific basecalling params
+model_dir: null # Directory where dorado basecalling models are stored.
+model: "hac" # needed for dorado basecaller
+filter_threshold: 0.8 # Dorado probability filter threshold for base calling.
+
+# Alignment params
+aligner: "minimap2" # Aligner to use: dorado, minimap2
+aligner_args:
+  minimap2:
+    ont:
+      - '-a'
+      - '-x'
+      - 'map-ont'
+      - '--MD'
+      - '-Y'
+      - '-y'
+      - '-N'
+      - '5'
+      - '--secondary=no'
+    pacbio:
+      - '-a'
+      - '-x'
+      - 'map-hifi'
+      - '--MD'
+      - '-Y'
+      - '-y'
+      - '-N'
+      - '5'
+      - '--secondary=no'
+    illumina:
+      - '-a'
+      - '-x'
+      - 'sr'
+      - '--MD'
+      - '-Y'
+      - '-y'
+      - '-N'
+      - '5'
+      - '--secondary=no'
+  dorado:
+    ont:
+      - "--mm2-opts"
+      - "-N"
+      - "5"
+
+# Sorted BAM and BED specific handling
+make_bigwigs: False # Whether to make coverage bigwigs
+
+# Nanopore specific demultiplexing
+barcode_both_ends: False # dorado demultiplexing
+trim: False # dorado adapter and barcode removal during demultiplexing
+
+# Anndata structure
+mapping_threshold: 0.01 # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
+reference_column: 'Reference_strand'
+sample_column: 'Barcode'
+
+# Preprocessing - Read length, quality, and mapping filtering params
+read_coord_filter:
+  - null
+  - null
+read_len_filter_thresholds:
+  - 200
+  - null
+read_len_to_ref_ratio_filter_thresholds:
+  - 0.8
+  - null
+read_quality_filter_thresholds:
+  - 20
+  - null
+read_mapping_quality_filter_thresholds:
+  - null
+  - null
+
+# Preprocessing - Read modification filtering params
+read_mod_filtering_gpc_thresholds:
+  - 0.025
+  - 0.975
+read_mod_filtering_cpg_thresholds:
+  - 0.0
+  - 1.0
+read_mod_filtering_any_c_thresholds:
+  - 0.025
+  - 0.975
+read_mod_filtering_a_thresholds:
+  - 0.025
+  - 0.975
+read_mod_filtering_use_other_c_as_background: False
+min_valid_fraction_positions_in_read_vs_ref: 0.8
+
+# Preprocessing - Duplicate detection params
+duplicate_detection_site_types: # Site types to consider for duplicate detection workflow
+  - "GpC"
+  - "CpG"
+  - "ambiguous_GpC_CpG"
+duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
+hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
+  - Fraction_any_C_site_modified
+duplicate_detection_keep_best_metric: "read_quality" # Obs metric used to keep a representative read from a duplicate read cluster
+duplicate_detection_window_size_for_hamming_neighbors: 50 # How many neighboring reads to look at when calculating hamming distance pairs
+duplicate_detection_min_overlapping_positions: 20 # The minimum number of valid overlapping positions required for duplicate detection to work
+duplicate_detection_do_hierarchical: True # Whether to follow up fwd/rev lexicographic duplicate detection with a hierarchical clustering based method
+duplicate_detection_hierarchical_linkage: "average" # Method for hierarchical clustering distance calculation
+duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkage based duplicate detection.
+
+# Preprocessing - Complexity analysis params
+
+# General Plotting params
+sample_name_col_for_plotting: 'Barcode'
+
+# Basic Analysis - QC Plotting params
+rows_per_qc_histogram_grid: 12
+
+# Basic Analysis - Clustermap params
+layer_for_clustermap_plotting: 'nan0_0minus1'
+
+# Basic Analysis - UMAP/Leiden params
+layer_for_umap_plotting: 'nan_half'
+umap_layers_to_plot:
+  - "mapped_length"
+  - "Raw_modification_signal"
+
+# Basic Analysis - Spatial Autocorrelation params
+rows_per_qc_autocorr_grid: 6
+autocorr_rolling_window_size: 25
+autocorr_max_lag: 800
+autocorr_site_types:
+  - "GpC"
+  - "CpG"
+  - "any_C"
+
+# Basic Analysis - Correlation Matrix params
+correlation_matrix_types:
+  - "pearson"
+  - "binary_covariance"
+correlation_matrix_cmaps:
+  - "seismic"
+  - "viridis"
+correlation_matrix_site_types:
+  - "GpC_site"
+
+# HMM params
+hmm_n_states: 2 # Number of HMM states
+hmm_init_emission_probs:
+  - [0.8, 0.2]
+  - [0.2, 0.8]
+hmm_init_transition_probs:
+  - [0.9, 0.1]
+  - [0.1, 0.9]
+hmm_init_start_probs:
+  - 0.5
+  - 0.5
+hmm_eps: 1e-8
+hmm_dtype: "float64"
+hmm_annotation_threshold: 0.5
+hmm_batch_size: 1024
+hmm_use_viterbi: False
+footprints: True # whether to use the default HMM footprint params
+accessible_patches: True # whether to use the default HMM accessible patch params
+cpg: False # whether to use the default HMM endogenous CpG patch params
+hmm_methbases:
+  - "GpC"
+  - "CpG"
+  - "C"
+  - "A"
+hmm_feature_sets:
+  footprint:
+    state: "Non-Modified"
+    features:
+      small_bound_stretch: [0, 25]
+      medium_bound_stretch: [25, 80]
+      putative_nucleosome: [80, 200]
+      large_bound_stretch: [200, inf]
+  accessible:
+    state: "Modified"
+    features:
+      small_accessible_patch: [0, 20]
+      mid_accessible_patch: [20, 100]
+      large_accessible_patch: [100, inf]
+hmm_merge_layer_features:
+  - [null, 80]
+
+# Pipeline control flow - Preprocessing and QC
+force_redo_preprocessing: False # Whether to force redo the entire preprocessing workflow from the initial raw anndata.
+force_reload_sample_sheet: True # Whether to force redo sample sheet loading
+bypass_add_read_length_and_mapping_qc: False # Whether to skip read length, quality, and mapping qc.
+force_redo_add_read_length_and_mapping_qc: False # Whether to force redo read length, quality, and mapping qc.
+bypass_clean_nan: False # Whether to skip NaN cleaning
+force_redo_clean_nan: False # Whether to redo NaN cleaning
+bypass_append_base_context: False # Whether to skip adding per reference base context additions.
+force_redo_append_base_context: False # Whether to redo per reference base context additions.
+invert_adata: False # Whether to invert the AnnData along the positions axis.
+bypass_append_binary_layer_by_base_context: False # Whether to skip adding new binary layers for each specific base context.
+force_redo_append_binary_layer_by_base_context: False # Whether to redo adding new binary layers for each specific base context.
+bypass_calculate_read_modification_stats: False # Whether to skip adding read level modification statistics.
+force_redo_calculate_read_modification_stats: False # Whether to force redo adding read level modification statistics.
+bypass_filter_reads_on_modification_thresholds: False # Whether to skip filtering reads based on read level modification statistics.
+force_redo_filter_reads_on_modification_thresholds: False # Whether to redo filtering reads based on read level modification statistics.
+bypass_flag_duplicate_reads: False # Whether to skip flagging duplicate reads based on modification similarity.
+force_redo_flag_duplicate_reads: False # Whether to redo flagging duplicate reads based on modification similarity.
+bypass_complexity_analysis: False # Whether to skip complexity analysis
+force_redo_complexity_analysis: False # Whether to redo complexity analysis
+
+# Pipeline control flow - Basic Analyses
+force_redo_basic_analyses: False # Whether to force redo the entire basic analysis pipeline from the AnnData
+bypass_basic_clustermaps: False # Whether to skip basic clustermap plotting
+force_redo_basic_clustermaps: False # Whether to redo basic clustermap plotting
+bypass_basic_umap: False # Whether to skip basic UMAP calculation/plotting
+force_redo_basic_umap: False # Whether to redo basic UMAP calculation/plotting
+bypass_spatial_autocorr_calculations: False # Whether to skip basic spatial autocorrelation calculation
+force_redo_spatial_autocorr_calculations: False # Whether to redo basic spatial autocorrelation calculation
+bypass_spatial_autocorr_plotting: False # Whether to skip basic spatial autocorrelation plotting
+force_redo_spatial_autocorr_plotting: False # Whether to redo basic spatial autocorrelation plotting
+bypass_matrix_corr_calculations: False # Whether to skip basic correlation matrix calculation
+force_redo_matrix_corr_calculations: False # Whether to force redo basic correlation matrix calculation
+bypass_matrix_corr_plotting: False # Whether to skip basic correlation matrix plotting
+force_redo_matrix_corr_plotting: False # Whether to force redo basic correlation matrix plotting
+
+# Pipeline control flow - HMMs
+bypass_hmm_fit: False # Whether to skip HMM fitting for each sample/reference
+force_redo_hmm_fit: False # Whether to redo HMM fitting for each sample/reference
+bypass_hmm_apply: False # Whether to skip HMM application for each sample/reference
+force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
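Each pipeline stage above is gated by a bypass_*/force_redo_* pair. One plausible reading of that convention, as a sketch rather than the package's actual control flow: bypass skips the stage entirely, force_redo reruns it even when its outputs already exist, and otherwise the stage runs once:

    def should_run(stage: str, cfg: dict, already_done: bool) -> bool:
        """Hypothetical gate matching the bypass_*/force_redo_* naming convention."""
        if cfg.get(f"bypass_{stage}", False):
            return False           # stage explicitly skipped
        if cfg.get(f"force_redo_{stage}", False):
            return True            # rerun even if outputs exist
        return not already_done    # otherwise run only on the first pass

    cfg = {"bypass_hmm_fit": False, "force_redo_hmm_fit": False}
    print(should_run("hmm_fit", cfg, already_done=True))   # False: done, no force
    print(should_run("hmm_fit", cfg, already_done=False))  # True: first run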
smftools/config/direct.yaml
ADDED
@@ -0,0 +1,17 @@
+# Direct (Nanopore modified base calling) footprinting defaults
+extends: default
+filter_threshold: 0.8 # min threshold to call a canonical base
+m6A_threshold: 0.7 # min threshold to call a modified m6A base
+m5C_threshold: 0.7 # min threshold to call a modified 5mC base
+hm5C_threshold: 0.7 # min threshold to call a modified 5hmC base
+thresholds:
+  - filter_threshold
+  - m6A_threshold
+  - m5C_threshold
+  - hm5C_threshold
+mod_list:
+  - '5mC_5hmC'
+  - '6mA' # mods to detect
+batch_size: 4 # How many mod TSVs to load into memory at a time when making anndata batches
+skip_unclassified: True # Whether to skip unclassified barcodes
+delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata