smftools 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. smftools/__init__.py +34 -0
  2. smftools/_settings.py +20 -0
  3. smftools/_version.py +1 -0
  4. smftools/cli.py +184 -0
  5. smftools/config/__init__.py +1 -0
  6. smftools/config/conversion.yaml +33 -0
  7. smftools/config/deaminase.yaml +56 -0
  8. smftools/config/default.yaml +253 -0
  9. smftools/config/direct.yaml +17 -0
  10. smftools/config/experiment_config.py +1191 -0
  11. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  12. smftools/datasets/F1_sample_sheet.csv +5 -0
  13. smftools/datasets/__init__.py +9 -0
  14. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  15. smftools/datasets/datasets.py +28 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/hmm/apply_hmm_batched.py +242 -0
  19. smftools/hmm/calculate_distances.py +18 -0
  20. smftools/hmm/call_hmm_peaks.py +106 -0
  21. smftools/hmm/display_hmm.py +18 -0
  22. smftools/hmm/hmm_readwrite.py +16 -0
  23. smftools/hmm/nucleosome_hmm_refinement.py +104 -0
  24. smftools/hmm/train_hmm.py +78 -0
  25. smftools/informatics/__init__.py +14 -0
  26. smftools/informatics/archived/bam_conversion.py +59 -0
  27. smftools/informatics/archived/bam_direct.py +63 -0
  28. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  29. smftools/informatics/archived/conversion_smf.py +132 -0
  30. smftools/informatics/archived/deaminase_smf.py +132 -0
  31. smftools/informatics/archived/direct_smf.py +137 -0
  32. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  33. smftools/informatics/basecall_pod5s.py +80 -0
  34. smftools/informatics/fast5_to_pod5.py +24 -0
  35. smftools/informatics/helpers/__init__.py +73 -0
  36. smftools/informatics/helpers/align_and_sort_BAM.py +86 -0
  37. smftools/informatics/helpers/aligned_BAM_to_bed.py +85 -0
  38. smftools/informatics/helpers/archived/informatics.py +260 -0
  39. smftools/informatics/helpers/archived/load_adata.py +516 -0
  40. smftools/informatics/helpers/bam_qc.py +66 -0
  41. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  42. smftools/informatics/helpers/binarize_converted_base_identities.py +172 -0
  43. smftools/informatics/helpers/canoncall.py +34 -0
  44. smftools/informatics/helpers/complement_base_list.py +21 -0
  45. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +378 -0
  46. smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
  47. smftools/informatics/helpers/converted_BAM_to_adata_II.py +505 -0
  48. smftools/informatics/helpers/count_aligned_reads.py +43 -0
  49. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  50. smftools/informatics/helpers/discover_input_files.py +100 -0
  51. smftools/informatics/helpers/extract_base_identities.py +70 -0
  52. smftools/informatics/helpers/extract_mods.py +83 -0
  53. smftools/informatics/helpers/extract_read_features_from_bam.py +33 -0
  54. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  55. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  56. smftools/informatics/helpers/find_conversion_sites.py +51 -0
  57. smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
  58. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  59. smftools/informatics/helpers/get_native_references.py +28 -0
  60. smftools/informatics/helpers/index_fasta.py +12 -0
  61. smftools/informatics/helpers/make_dirs.py +21 -0
  62. smftools/informatics/helpers/make_modbed.py +27 -0
  63. smftools/informatics/helpers/modQC.py +27 -0
  64. smftools/informatics/helpers/modcall.py +36 -0
  65. smftools/informatics/helpers/modkit_extract_to_adata.py +887 -0
  66. smftools/informatics/helpers/ohe_batching.py +76 -0
  67. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  68. smftools/informatics/helpers/one_hot_decode.py +27 -0
  69. smftools/informatics/helpers/one_hot_encode.py +57 -0
  70. smftools/informatics/helpers/plot_bed_histograms.py +269 -0
  71. smftools/informatics/helpers/run_multiqc.py +28 -0
  72. smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
  73. smftools/informatics/helpers/split_and_index_BAM.py +32 -0
  74. smftools/informatics/readwrite.py +106 -0
  75. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  76. smftools/informatics/subsample_pod5.py +104 -0
  77. smftools/load_adata.py +1346 -0
  78. smftools/machine_learning/__init__.py +12 -0
  79. smftools/machine_learning/data/__init__.py +2 -0
  80. smftools/machine_learning/data/anndata_data_module.py +234 -0
  81. smftools/machine_learning/data/preprocessing.py +6 -0
  82. smftools/machine_learning/evaluation/__init__.py +2 -0
  83. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  84. smftools/machine_learning/evaluation/evaluators.py +223 -0
  85. smftools/machine_learning/inference/__init__.py +3 -0
  86. smftools/machine_learning/inference/inference_utils.py +27 -0
  87. smftools/machine_learning/inference/lightning_inference.py +68 -0
  88. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  89. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  90. smftools/machine_learning/models/__init__.py +9 -0
  91. smftools/machine_learning/models/base.py +295 -0
  92. smftools/machine_learning/models/cnn.py +138 -0
  93. smftools/machine_learning/models/lightning_base.py +345 -0
  94. smftools/machine_learning/models/mlp.py +26 -0
  95. smftools/machine_learning/models/positional.py +18 -0
  96. smftools/machine_learning/models/rnn.py +17 -0
  97. smftools/machine_learning/models/sklearn_models.py +273 -0
  98. smftools/machine_learning/models/transformer.py +303 -0
  99. smftools/machine_learning/models/wrappers.py +20 -0
  100. smftools/machine_learning/training/__init__.py +2 -0
  101. smftools/machine_learning/training/train_lightning_model.py +135 -0
  102. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  103. smftools/machine_learning/utils/__init__.py +2 -0
  104. smftools/machine_learning/utils/device.py +10 -0
  105. smftools/machine_learning/utils/grl.py +14 -0
  106. smftools/plotting/__init__.py +18 -0
  107. smftools/plotting/autocorrelation_plotting.py +611 -0
  108. smftools/plotting/classifiers.py +355 -0
  109. smftools/plotting/general_plotting.py +682 -0
  110. smftools/plotting/hmm_plotting.py +260 -0
  111. smftools/plotting/position_stats.py +462 -0
  112. smftools/plotting/qc_plotting.py +270 -0
  113. smftools/preprocessing/__init__.py +38 -0
  114. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  115. smftools/preprocessing/append_base_context.py +122 -0
  116. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  117. smftools/preprocessing/archives/mark_duplicates.py +146 -0
  118. smftools/preprocessing/archives/preprocessing.py +614 -0
  119. smftools/preprocessing/archives/remove_duplicates.py +21 -0
  120. smftools/preprocessing/binarize_on_Youden.py +45 -0
  121. smftools/preprocessing/binary_layers_to_ohe.py +40 -0
  122. smftools/preprocessing/calculate_complexity.py +72 -0
  123. smftools/preprocessing/calculate_complexity_II.py +248 -0
  124. smftools/preprocessing/calculate_consensus.py +47 -0
  125. smftools/preprocessing/calculate_coverage.py +51 -0
  126. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  127. smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
  128. smftools/preprocessing/calculate_position_Youden.py +115 -0
  129. smftools/preprocessing/calculate_read_length_stats.py +79 -0
  130. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  131. smftools/preprocessing/clean_NaN.py +62 -0
  132. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  133. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  134. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  135. smftools/preprocessing/flag_duplicate_reads.py +1351 -0
  136. smftools/preprocessing/invert_adata.py +37 -0
  137. smftools/preprocessing/load_sample_sheet.py +53 -0
  138. smftools/preprocessing/make_dirs.py +21 -0
  139. smftools/preprocessing/min_non_diagonal.py +25 -0
  140. smftools/preprocessing/recipes.py +127 -0
  141. smftools/preprocessing/subsample_adata.py +58 -0
  142. smftools/readwrite.py +1004 -0
  143. smftools/tools/__init__.py +20 -0
  144. smftools/tools/archived/apply_hmm.py +202 -0
  145. smftools/tools/archived/classifiers.py +787 -0
  146. smftools/tools/archived/classify_methylated_features.py +66 -0
  147. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  148. smftools/tools/archived/subset_adata_v1.py +32 -0
  149. smftools/tools/archived/subset_adata_v2.py +46 -0
  150. smftools/tools/calculate_umap.py +62 -0
  151. smftools/tools/cluster_adata_on_methylation.py +105 -0
  152. smftools/tools/general_tools.py +69 -0
  153. smftools/tools/position_stats.py +601 -0
  154. smftools/tools/read_stats.py +184 -0
  155. smftools/tools/spatial_autocorrelation.py +562 -0
  156. smftools/tools/subset_adata.py +28 -0
  157. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/METADATA +9 -2
  158. smftools-0.2.1.dist-info/RECORD +161 -0
  159. smftools-0.2.1.dist-info/entry_points.txt +2 -0
  160. smftools-0.1.6.dist-info/RECORD +0 -4
  161. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/WHEEL +0 -0
  162. {smftools-0.1.6.dist-info → smftools-0.2.1.dist-info}/licenses/LICENSE +0 -0
smftools/__init__.py ADDED
@@ -0,0 +1,34 @@
1
+ """smftools"""
2
+
3
+ import logging
4
+ import warnings
5
+
6
+ from . import informatics as inform
7
+ from . import machine_learning as ml
8
+ from . import plotting as pl
9
+ from . import preprocessing as pp
10
+ from . import tools as tl
11
+
12
+ from . import config, datasets, hmm, readwrite
13
+ from .readwrite import adata_to_df, safe_write_h5ad, safe_read_h5ad, merge_barcoded_anndatas_core
14
+
15
+ from .load_adata import load_adata
16
+
17
+ from importlib.metadata import version
18
+
19
+ package_name = "smftools"
20
+ __version__ = version(package_name)
21
+
22
# Public API of the smftools package.
# BUG FIX: the original list was missing a comma after "load_adata", which made
# Python concatenate the adjacent string literals into "load_adataadata_to_df" —
# neither "load_adata" nor "adata_to_df" was actually exported, and
# `from smftools import *` would fail on the fused, nonexistent name.
__all__ = [
    "load_adata",
    "adata_to_df",
    "inform",
    "ml",
    "pp",
    "tl",
    "pl",
    "readwrite",
    "datasets",
    "safe_write_h5ad",
    "safe_read_h5ad",
]
smftools/_settings.py ADDED
@@ -0,0 +1,20 @@
1
+ from pathlib import Path
2
+ from typing import Union
3
+
4
class SMFConfig:
    """Runtime configuration holder for smftools.

    Parameters
    ----------
    datasetdir
        Directory where datasets are stored. Accepts a ``str`` or a
        :class:`pathlib.Path`; stored internally as a ``Path``.
    """

    def __init__(self, *, datasetdir: Union[Path, str] = "./datasets/"):
        # Path(...) accepts both str and Path, so a single normalization
        # call replaces the original isinstance branch with identical results.
        self._datasetdir = Path(datasetdir)

    @property
    def datasetdir(self) -> Path:
        """Directory used for dataset storage."""
        return self._datasetdir


settings = SMFConfig()
smftools/_version.py ADDED
@@ -0,0 +1 @@
1
# Static version string for this release.
# NOTE(review): smftools/__init__.py derives __version__ via importlib.metadata
# instead of importing this constant — confirm the two are kept in sync (or that
# this file is generated at build time) to avoid version drift.
__version__ = "0.2.1"
smftools/cli.py ADDED
@@ -0,0 +1,184 @@
1
+ import click
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from typing import Dict, Optional
5
+
6
+ from . import load_adata
7
+ from .readwrite import merge_barcoded_anndatas_core, safe_read_h5ad, safe_write_h5ad
8
+
9
@click.group()
def cli():
    """Command-line interface for smftools.

    Root click group; subcommands are attached below via ``@cli.command()``.
    """
    pass
13
+
14
+ ####### Main processing workflow ###########
15
@cli.command()
@click.argument("config_path", type=click.Path(exists=True))
def load(config_path):
    """Load and process data from CONFIG_PATH.

    Thin wrapper that forwards CONFIG_PATH (an existing file, enforced by
    click) to :func:`smftools.load_adata`, which runs the full processing
    workflow.
    """
    load_adata(config_path)
20
+ ##########################################
21
+
22
+
23
+ ####### Merging existing anndatas from an experiment that used two different demultiplexing rules #######
24
# Keys that must be present with non-empty values in the merge config CSV
# (validated by _read_config_csv).
REQUIRED_KEYS = ("adata_single_path", "adata_double_path")
# Keys that may appear in the config CSV; absent/empty values fall back to
# defaults inside _resolve_output_path / _maybe_read_adata.
OPTIONAL_KEYS = (
    "adata_single_backups_path",
    "adata_double_backups_path",
    "output_path",
    "merged_filename",
)
31
+
32
def _read_config_csv(csv_path: Path) -> Dict[str, str]:
    """
    Read a multi-row, two-column CSV of key,value pairs into a dict.

    Supported features:
      - Optional header ("key,value") or none.
      - Comments starting with '#' and blank lines are ignored.
      - If duplicate keys occur, the last one wins.
      - Keys are matched literally against REQUIRED_KEYS/OPTIONAL_KEYS.
    """
    try:
        # Always parse as two headerless columns; '#' comment lines are dropped
        # by pandas itself. A possible header row is stripped below.
        frame = pd.read_csv(
            csv_path,
            dtype=str,
            comment="#",
            header=None,
            usecols=[0, 1],
            names=["key", "value"],
        )
    except Exception as e:
        raise click.ClickException(f"Failed to read CSV: {e}") from e

    # Normalize cells: NaN -> "", everything to stripped strings.
    frame = frame.fillna("").astype(str)
    for col in ("key", "value"):
        frame[col] = frame[col].str.strip()

    def _drop_blank_keys(df):
        # Rows without a key carry no information.
        return df[(df["key"] != "") & (df["key"].notna())]

    frame = _drop_blank_keys(frame)
    if frame.empty:
        raise click.ClickException("Config CSV is empty after removing comments/blank lines.")

    # Tolerate an optional "key,value" header row.
    if frame.iloc[0]["key"].lower() in {"key", "keys"}:
        frame = _drop_blank_keys(frame.iloc[1:])
        if frame.empty:
            raise click.ClickException("Config CSV contains only a header row.")

    # dict(zip(...)) keeps the last occurrence of any duplicated key.
    cfg = dict(zip(frame["key"], frame["value"]))

    # Reject configs missing any required key (empty values count as missing).
    missing = [k for k in REQUIRED_KEYS if not cfg.get(k)]
    if missing:
        raise click.ClickException(
            "Missing required keys in CSV: "
            + ", ".join(missing)
            + "\nExpected keys:\n - "
            + "\n - ".join(REQUIRED_KEYS)
            + "\nOptional keys:\n - "
            + "\n - ".join(OPTIONAL_KEYS)
        )

    return cfg
89
+
90
+ def _resolve_output_path(cfg: Dict[str, str], single_path: Path, double_path: Path) -> Path:
91
+ """Decide on the output .h5ad path based on CSV; create directories if needed."""
92
+ merged_filename = cfg.get("merged_filename") or f"merged_{single_path.stem}__{double_path.stem}.h5ad"
93
+ if not merged_filename.endswith(".h5ad"):
94
+ merged_filename += ".h5ad"
95
+
96
+ output_path_raw = cfg.get("output_path", "").strip()
97
+
98
+ if not output_path_raw:
99
+ out_dir = Path.cwd() / "merged_output"
100
+ out_dir.mkdir(parents=True, exist_ok=True)
101
+ return out_dir / merged_filename
102
+
103
+ output_path = Path(output_path_raw)
104
+
105
+ if output_path.suffix.lower() == ".h5ad":
106
+ output_path.parent.mkdir(parents=True, exist_ok=True)
107
+ return output_path
108
+
109
+ # Treat as directory
110
+ output_path.mkdir(parents=True, exist_ok=True)
111
+ return output_path / merged_filename
112
+
113
def _maybe_read_adata(label: str, primary: Path, backups: Optional[Path]):
    """Load an AnnData from *primary*, restoring from *backups* when provided.

    Returns whatever ``safe_read_h5ad`` returns (used by the caller as an
    (adata, report) pair).
    """
    # Guard clause: no backups directory configured -> plain read.
    # (bool(Path) is always True, so truthiness here only distinguishes None.)
    if not backups:
        click.echo(f"Loading {label} from {primary} with backups disabled ...")
        return safe_read_h5ad(primary, restore_backups=False)

    click.echo(f"Loading {label} from {primary} with backups at {backups} ...")
    return safe_read_h5ad(primary, backups_path=backups, restore_backups=True)
121
+
122
+
123
@cli.command()
@click.argument("config_path", type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path))
def merge_barcoded_anndatas(config_path: Path):
    """
    Merge two AnnData objects from the same experiment that were demultiplexed
    under different end-barcoding requirements, using a key,value CSV for config.

    The CSV is parsed by ``_read_config_csv`` as one ``key,value`` pair per row
    (comments starting with '#' and blank lines are ignored).

    Required keys:
      - adata_single_path
      - adata_double_path

    Optional keys:
      - adata_single_backups_path
      - adata_double_backups_path
      - output_path (file or directory; default: ./merged_output/)
      - merged_filename (default: merged_<single>__<double>.h5ad)

    Example CSV (one pair per row — NOT a single wide header row):

        adata_single_path,/path/single.h5ad
        adata_double_path,/path/double.h5ad
        output_path,merged_output
        merged_filename,merged_run.h5ad

    Raises click.ClickException on missing keys, missing files, or any
    unexpected failure during load/merge/write.
    """
    try:
        cfg = _read_config_csv(config_path)

        single_path = Path(cfg["adata_single_path"]).expanduser().resolve()
        double_path = Path(cfg["adata_double_path"]).expanduser().resolve()

        for p, label in [(single_path, "adata_single_path"), (double_path, "adata_double_path")]:
            if not p.exists():
                raise click.ClickException(f"{label} does not exist: {p}")

        single_backups = Path(cfg["adata_single_backups_path"]).expanduser().resolve() if cfg.get("adata_single_backups_path") else None
        double_backups = Path(cfg["adata_double_backups_path"]).expanduser().resolve() if cfg.get("adata_double_backups_path") else None

        if single_backups and not single_backups.exists():
            raise click.ClickException(f"adata_single_backups_path does not exist: {single_backups}")
        if double_backups and not double_backups.exists():
            raise click.ClickException(f"adata_double_backups_path does not exist: {double_backups}")

        output_path = _resolve_output_path(cfg, single_path, double_path)

        # Load both AnnDatas (optionally restoring from their backup dirs).
        adata_single, read_report_single = _maybe_read_adata("single-barcoded AnnData", single_path, single_backups)
        adata_double, read_report_double = _maybe_read_adata("double-barcoded AnnData", double_path, double_backups)

        click.echo("Merging AnnDatas ...")
        merged = merge_barcoded_anndatas_core(adata_single, adata_double)

        click.echo(f"Writing merged AnnData to: {output_path}")
        # BUG FIX: the original used `output_path.cwd()`, which invokes the
        # classmethod Path.cwd() and ignores output_path entirely, silently
        # dropping backups into the process working directory. Keep backups
        # alongside the merged output instead.
        backup_dir = output_path.parent / "merged_backups"
        safe_write_h5ad(merged, output_path, backup=True, backup_dir=backup_dir)

        click.secho(f"Done. Merged AnnData saved to {output_path}", fg="green")

    except click.ClickException:
        raise
    except Exception as e:
        # Surface unexpected errors cleanly
        raise click.ClickException(f"Unexpected error: {e}") from e
183
+
184
+ ################################################################################################################
@@ -0,0 +1 @@
1
+ from .experiment_config import LoadExperimentConfig, ExperimentConfig
@@ -0,0 +1,33 @@
1
+ # Conversion (Bisulfite/APOBEC) footprinting defaults
2
+ extends: default
3
+ conversion_types:
4
+ - '5mC' # 5mC
5
+
6
+ # Read QC Params
7
+ read_mod_filtering_use_other_c_as_background: True
8
+
9
+ # HMM
10
+ cpg: True # whether to use the default HMM endogenous CpG patch params
11
+ hmm_methbases:
12
+ - "GpC"
13
+ hmm_feature_sets:
14
+ footprint:
15
+ state: "Non-Modified"
16
+ features:
17
+ small_bound_stretch: [0, 20]
18
+ medium_bound_stretch: [20, 50]
19
+ putative_nucleosome: [50, 200]
20
+ large_bound_stretch: [200, inf]
21
+ accessible:
22
+ state: "Modified"
23
+ features:
24
+ small_accessible_patch: [0, 20]
25
+ mid_accessible_patch: [20, 80]
26
+ large_accessible_patch: [80, inf]
27
+ cpg:
28
+ state: "Modified"
29
+ features:
30
+ cpg_patch: [0, inf]
31
+
32
+ hmm_merge_layer_features:
33
+ - ["GpC_all_accessible_features", 80]
@@ -0,0 +1,56 @@
1
+ # Deaminase footprinting defaults
2
+ extends: default
3
+ conversion_types:
4
+ - '5mC' # 5mC
5
+
6
+ mod_target_bases:
7
+ - "C"
8
+
9
+ read_mod_filtering_gpc_thresholds:
10
+ - null
11
+ - null
12
+ read_mod_filtering_cpg_thresholds:
13
+ - null
14
+ - null
15
+ read_mod_filtering_any_c_thresholds:
16
+ - 0.01
17
+ - 0.99
18
+ read_mod_filtering_a_thresholds:
19
+ - null
20
+ - null
21
+
22
+ read_mod_filtering_use_other_c_as_background: False
23
+
24
+ # Duplicate Detection Params
25
+ duplicate_detection_site_types:
26
+ - "any_C"
27
+
28
+ # Autocorrelation params
29
+ autocorr_site_types:
30
+ - "any_C"
31
+
32
+ # Correlation matrix params
33
+ correlation_matrix_site_types:
34
+ - "any_C_site"
35
+
36
+ # HMM
37
+ cpg: False # whether to use the default HMM endogenous CpG patch params
38
+ hmm_methbases:
39
+ - "C"
40
+ hmm_feature_sets:
41
+ footprint:
42
+ state: "Non-Modified"
43
+ features:
44
+ small_bound_stretch: [0, 25]
45
+ medium_bound_stretch: [25, 80]
46
+ putative_nucleosome: [80, 200]
47
+ large_bound_stretch: [200, inf]
48
+ accessible:
49
+ state: "Modified"
50
+ features:
51
+ small_accessible_patch: [0, 20]
52
+ mid_accessible_patch: [20, 100]
53
+ large_accessible_patch: [100, inf]
54
+
55
+ hmm_merge_layer_features:
56
+ - ["C_all_accessible_features", 80]
@@ -0,0 +1,253 @@
1
+ # Generic i/o
2
+ bam_suffix: ".bam"
3
+ recursive_input_search: True
4
+ split_dir: "demultiplexed_BAMs"
5
+ strands:
6
+ - bottom
7
+ - top
8
+ conversions:
9
+ - unconverted
10
+ sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
11
+ sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
12
+ fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barcodes can be provided. Default is autodetecting barcodes.
13
+ fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
14
+ input_already_demuxed: False # If the input files are already demultiplexed.
15
+ delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
16
+
17
+ # Compute params
18
+ threads: 4
19
+ device: "auto"
20
+
21
+ # Sequencing modality and general experiment params
22
+ smf_modality: 'conversion' # conversion, deaminase, direct
23
+ sequencer: 'ont' # ont, pacbio, illumina
24
+ barcode_kit: 'SQK-RBK114-96' # SQK-RBK114-96, SQK-NBD114-24, etc
25
+ mod_target_bases:
26
+ - "GpC"
27
+ - "CpG"
28
+ enzyme_target_bases:
29
+ - "GpC"
30
+
31
+ # Nanopore specific basecalling params
32
+ model_dir: null # Directory where dorado basecalling models are stored.
33
+ model: "hac" # needed for dorado basecaller
34
+ filter_threshold: 0.8 # Dorado probability filter threshold for base calling.
35
+
36
+ # Alignment params
37
+ aligner: "minimap2" # Aligner to use: dorado, minimap2
38
+ aligner_args:
39
+ minimap2:
40
+ ont:
41
+ - '-a'
42
+ - '-x'
43
+ - 'map-ont'
44
+ - '--MD'
45
+ - '-Y'
46
+ - '-y'
47
+ - '-N'
48
+ - '5'
49
+ - '--secondary=no'
50
+ pacbio:
51
+ - '-a'
52
+ - '-x'
53
+ - 'map-hifi'
54
+ - '--MD'
55
+ - '-Y'
56
+ - '-y'
57
+ - '-N'
58
+ - '5'
59
+ - '--secondary=no'
60
+ illumina:
61
+ - '-a'
62
+ - '-x'
63
+ - 'sr'
64
+ - '--MD'
65
+ - '-Y'
66
+ - '-y'
67
+ - '-N'
68
+ - '5'
69
+ - '--secondary=no'
70
+ dorado:
71
+ ont:
72
+ - "--mm2-opts"
73
+ - "-N"
74
+ - "5"
75
+
76
+ # Sorted BAM and BED specific handling
77
+ make_bigwigs: False # Whether to make coverage bigwigs
78
+
79
+ # Nanopore specific demultiplexing
80
+ barcode_both_ends: False # dorado demultiplexing
81
+ trim: False # dorado adapter and barcode removal during demultiplexing
82
+
83
+ # Anndata structure
84
+ mapping_threshold: 0.01 # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
85
+ reference_column: 'Reference_strand'
86
+ sample_column: 'Barcode'
87
+
88
+ # Preprocessing - Read length, quality, and mapping filtering params
89
+ read_coord_filter:
90
+ - null
91
+ - null
92
+ read_len_filter_thresholds:
93
+ - 200
94
+ - null
95
+ read_len_to_ref_ratio_filter_thresholds:
96
+ - 0.8
97
+ - null
98
+ read_quality_filter_thresholds:
99
+ - 20
100
+ - null
101
+ read_mapping_quality_filter_thresholds:
102
+ - null
103
+ - null
104
+
105
+ # Preprocessing - Read modification filtering params
106
+ read_mod_filtering_gpc_thresholds:
107
+ - 0.025
108
+ - 0.975
109
+ read_mod_filtering_cpg_thresholds:
110
+ - 0.0
111
+ - 1.0
112
+ read_mod_filtering_any_c_thresholds:
113
+ - 0.025
114
+ - 0.975
115
+ read_mod_filtering_a_thresholds:
116
+ - 0.025
117
+ - 0.975
118
+ read_mod_filtering_use_other_c_as_background: False
119
+ min_valid_fraction_positions_in_read_vs_ref: 0.8
120
+
121
+ # Preprocessing - Duplicate detection params
122
+ duplicate_detection_site_types: # Site types to consider for duplicate detection workflow
123
+ - "GpC"
124
+ - "CpG"
125
+ - "ambiguous_GpC_CpG"
126
+ duplicate_detection_distance_threshold: 0.07 # Hamming distance based similarity threshold to use for marking duplicate reads.
127
+ hamming_vs_metric_keys: # Metrics to plot the hamming distance against.
128
+ - Fraction_any_C_site_modified
129
+ duplicate_detection_keep_best_metric: "read_quality" # Obs metric to use to keep a representative read from a read duplicate cluster
130
+ duplicate_detection_window_size_for_hamming_neighbors: 50 # How many neighboring reads to look at for calculating hamming distance pairs
131
+ duplicate_detection_min_overlapping_positions: 20 # The minimum amount of valid overlapping positions that will allow duplicate detection to work
132
+ duplicate_detection_do_hierarchical: True # Whether to follow up fwd/rev lexicographic duplicate detection with hierarchical clustering based method
133
+ duplicate_detection_hierarchical_linkage: "average" # Method for hierarchical clustering distance calculation
134
+ duplicate_detection_do_pca: False # Whether to do PCA before hierarchical linkage based duplicate detection.
135
+
136
+ # Preprocessing - Complexity analysis params
137
+
138
+ # General Plotting params
139
+ sample_name_col_for_plotting: 'Barcode'
140
+
141
+ # Basic Analysis - QC Plotting params
142
+ rows_per_qc_histogram_grid: 12
143
+
144
+ # Basic Analysis - Clustermap params
145
+ layer_for_clustermap_plotting: 'nan0_0minus1'
146
+
147
+ # Basic Analysis - UMAP/Leiden params
148
+ layer_for_umap_plotting: 'nan_half'
149
+ umap_layers_to_plot:
150
+ - "mapped_length"
151
+ - "Raw_modification_signal"
152
+
153
+ # Basic Analysis - Spatial Autocorrelation params
154
+ rows_per_qc_autocorr_grid: 6
155
+ autocorr_rolling_window_size: 25
156
+ autocorr_max_lag: 800
157
+ autocorr_site_types:
158
+ - "GpC"
159
+ - "CpG"
160
+ - "any_C"
161
+
162
+ # Basic Analysis - Correlation Matrix params
163
+ correlation_matrix_types:
164
+ - "pearson"
165
+ - "binary_covariance"
166
+ correlation_matrix_cmaps:
167
+ - "seismic"
168
+ - "viridis"
169
+ correlation_matrix_site_types:
170
+ - "GpC_site"
171
+
172
+ # HMM params
173
+ hmm_n_states: 2 # Number of HMM states
174
+ hmm_init_emission_probs:
175
+ - [0.8, 0.2]
176
+ - [0.2, 0.8]
177
+ hmm_init_transition_probs:
178
+ - [0.9, 0.1]
179
+ - [0.1, 0.9]
180
+ hmm_init_start_probs:
181
+ - 0.5
182
+ - 0.5
183
+ hmm_eps: 1e-8
184
+ hmm_dtype: "float64"
185
+ hmm_annotation_threshold: 0.5
186
+ hmm_batch_size: 1024
187
+ hmm_use_viterbi: False
188
+ footprints: True # whether to use the default HMM footprint params
189
+ accessible_patches: True # whether to use the default HMM accessible patch params
190
+ cpg: False # whether to use the default HMM endogenous CpG patch params
191
+ hmm_methbases:
192
+ - "GpC"
193
+ - "CpG"
194
+ - "C"
195
+ - "A"
196
+ hmm_feature_sets:
197
+ footprint:
198
+ state: "Non-Modified"
199
+ features:
200
+ small_bound_stretch: [0, 25]
201
+ medium_bound_stretch: [25, 80]
202
+ putative_nucleosome: [80, 200]
203
+ large_bound_stretch: [200, inf]
204
+ accessible:
205
+ state: "Modified"
206
+ features:
207
+ small_accessible_patch: [0, 20]
208
+ mid_accessible_patch: [20, 100]
209
+ large_accessible_patch: [100, inf]
210
+ hmm_merge_layer_features:
211
+ - [null, 80]
212
+
213
+ # Pipeline control flow - Preprocessing and QC
214
+ force_redo_preprocessing: False # Whether to force redo the entire preprocessing workflow from the initial raw anndata.
215
+ force_reload_sample_sheet: True # Whether to force redo sample sheet loading
216
+ bypass_add_read_length_and_mapping_qc: False # Whether to skip read length, quality, and mapping qc.
217
+ force_redo_add_read_length_and_mapping_qc: False # Whether to force redo read length, quality, and mapping qc.
218
+ bypass_clean_nan: False # Whether to skip NaN cleaning
219
+ force_redo_clean_nan: False # Whether to redo NaN cleaning
220
+ bypass_append_base_context: False # Whether to skip adding per reference base context additions.
221
+ force_redo_append_base_context: False # Whether to redo per reference base context additions.
222
+ invert_adata: False # Whether to invert the AnnData along the positions axis.
223
+ bypass_append_binary_layer_by_base_context: False # Whether to skip adding new binary layers for each specific base context.
224
+ force_redo_append_binary_layer_by_base_context: False # Whether to redo adding new binary layers for each specific base context.
225
+ bypass_calculate_read_modification_stats: False # Whether to skip adding read level modification statistics.
226
+ force_redo_calculate_read_modification_stats: False # Whether to force redo adding read level modification statistics.
227
+ bypass_filter_reads_on_modification_thresholds: False # Whether to skip filtering reads based on read level modification statistics.
228
+ force_redo_filter_reads_on_modification_thresholds: False # Whether to redo filtering reads based on read level modification statistics.
229
+ bypass_flag_duplicate_reads: False # Whether to skip flagging duplicate reads based on modification similarity.
230
+ force_redo_flag_duplicate_reads: False # Whether to redo flagging duplicate reads based on modification similarity.
231
+ bypass_complexity_analysis: False # Whether to skip complexity analysis
232
+ force_redo_complexity_analysis: False # Whether to redo complexity analysis
233
+
234
+ # Pipeline control flow - Basic Analyses
235
+ force_redo_basic_analyses: False # Whether to force redo the entire basic analysis pipeline from the AnnData
236
+ bypass_basic_clustermaps: False # Whether to skip basic clustermap plotting
237
+ force_redo_basic_clustermaps: False # Whether to redo basic clustermap plotting
238
+ bypass_basic_umap: False # Whether to skip basic UMAP calculation/plotting
239
+ force_redo_basic_umap: False # Whether to redo basic UMAP calculation/plotting
240
+ bypass_spatial_autocorr_calculations: False # Whether to skip basic spatial autocorrelation calculation
241
+ force_redo_spatial_autocorr_calculations: False # Whether to redo basic spatial autocorrelation calculation
242
+ bypass_spatial_autocorr_plotting: False # Whether to skip basic spatial autocorrelation plotting
243
+ force_redo_spatial_autocorr_plotting: False # Whether to redo basic spatial autocorrelation plotting
244
+ bypass_matrix_corr_calculations: False # Whether to skip basic correlation matrix calculation
245
+ force_redo_matrix_corr_calculations: False # Whether to force redo basic correlation matrix calculation
246
+ bypass_matrix_corr_plotting: False # Whether to skip basic correlation matrix plotting
247
+ force_redo_matrix_corr_plotting: False # Whether to force redo basic correlation matrix plotting
248
+
249
+ # Pipeline control flow - HMMs
250
+ bypass_hmm_fit: False # Whether to skip HMM fitting for each sample/reference
251
+ force_redo_hmm_fit: False # Whether to redo HMM fitting for each sample/reference
252
+ bypass_hmm_apply: False # Whether to skip HMM application for each sample/reference
253
+ force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
@@ -0,0 +1,17 @@
1
+ # Direct (Nanopore modified base calling) footprinting defaults
2
+ extends: default
3
+ filter_threshold: 0.8 # min threshold to call a canonical base
4
+ m6A_threshold: 0.7 # min threshold to call a modified m6a base
5
+ m5C_threshold: 0.7 # min threshold to call a modified 5mC base
6
+ hm5C_threshold: 0.7 # min threshold to call a modified 5hmC base
7
+ thresholds:
8
+ - filter_threshold
9
+ - m6A_threshold
10
+ - m5C_threshold
11
+ - hm5C_threshold
12
+ mod_list:
13
+ - '5mC_5hmC'
14
+ - '6mA' # mods to detect
15
+ batch_size: 4 # How many mod TSVs to load into memory at a time when making anndata batches
16
+ skip_unclassified: True # Whether to skip unclassified barcodes
17
+ delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs after making final anndata