smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174) hide show
  1. smftools/__init__.py +7 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/cli_flows.py +94 -0
  4. smftools/cli/hmm_adata.py +338 -0
  5. smftools/cli/load_adata.py +577 -0
  6. smftools/cli/preprocess_adata.py +363 -0
  7. smftools/cli/spatial_adata.py +564 -0
  8. smftools/cli_entry.py +435 -0
  9. smftools/config/__init__.py +1 -0
  10. smftools/config/conversion.yaml +38 -0
  11. smftools/config/deaminase.yaml +61 -0
  12. smftools/config/default.yaml +264 -0
  13. smftools/config/direct.yaml +41 -0
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +1288 -0
  16. smftools/hmm/HMM.py +1576 -0
  17. smftools/hmm/__init__.py +20 -0
  18. smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
  19. smftools/hmm/call_hmm_peaks.py +106 -0
  20. smftools/{tools → hmm}/display_hmm.py +3 -3
  21. smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
  22. smftools/{tools → hmm}/train_hmm.py +1 -1
  23. smftools/informatics/__init__.py +13 -9
  24. smftools/informatics/archived/deaminase_smf.py +132 -0
  25. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  26. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  27. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  28. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  30. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  31. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  32. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  34. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
  35. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  36. smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
  38. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  39. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  40. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  41. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  42. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  43. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  44. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  45. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
  46. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  47. smftools/informatics/bam_functions.py +812 -0
  48. smftools/informatics/basecalling.py +67 -0
  49. smftools/informatics/bed_functions.py +366 -0
  50. smftools/informatics/binarize_converted_base_identities.py +172 -0
  51. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
  52. smftools/informatics/fasta_functions.py +255 -0
  53. smftools/informatics/h5ad_functions.py +197 -0
  54. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
  55. smftools/informatics/modkit_functions.py +129 -0
  56. smftools/informatics/ohe.py +160 -0
  57. smftools/informatics/pod5_functions.py +224 -0
  58. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  59. smftools/machine_learning/__init__.py +12 -0
  60. smftools/machine_learning/data/__init__.py +2 -0
  61. smftools/machine_learning/data/anndata_data_module.py +234 -0
  62. smftools/machine_learning/evaluation/__init__.py +2 -0
  63. smftools/machine_learning/evaluation/eval_utils.py +31 -0
  64. smftools/machine_learning/evaluation/evaluators.py +223 -0
  65. smftools/machine_learning/inference/__init__.py +3 -0
  66. smftools/machine_learning/inference/inference_utils.py +27 -0
  67. smftools/machine_learning/inference/lightning_inference.py +68 -0
  68. smftools/machine_learning/inference/sklearn_inference.py +55 -0
  69. smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  70. smftools/machine_learning/models/base.py +295 -0
  71. smftools/machine_learning/models/cnn.py +138 -0
  72. smftools/machine_learning/models/lightning_base.py +345 -0
  73. smftools/machine_learning/models/mlp.py +26 -0
  74. smftools/{tools → machine_learning}/models/positional.py +3 -2
  75. smftools/{tools → machine_learning}/models/rnn.py +2 -1
  76. smftools/machine_learning/models/sklearn_models.py +273 -0
  77. smftools/machine_learning/models/transformer.py +303 -0
  78. smftools/machine_learning/training/__init__.py +2 -0
  79. smftools/machine_learning/training/train_lightning_model.py +135 -0
  80. smftools/machine_learning/training/train_sklearn_model.py +114 -0
  81. smftools/plotting/__init__.py +4 -1
  82. smftools/plotting/autocorrelation_plotting.py +609 -0
  83. smftools/plotting/general_plotting.py +1292 -140
  84. smftools/plotting/hmm_plotting.py +260 -0
  85. smftools/plotting/qc_plotting.py +270 -0
  86. smftools/preprocessing/__init__.py +15 -8
  87. smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
  88. smftools/preprocessing/append_base_context.py +122 -0
  89. smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  90. smftools/preprocessing/binarize.py +17 -0
  91. smftools/preprocessing/binarize_on_Youden.py +2 -2
  92. smftools/preprocessing/calculate_complexity_II.py +248 -0
  93. smftools/preprocessing/calculate_coverage.py +10 -1
  94. smftools/preprocessing/calculate_position_Youden.py +1 -1
  95. smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  96. smftools/preprocessing/clean_NaN.py +17 -1
  97. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  98. smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  99. smftools/preprocessing/flag_duplicate_reads.py +1326 -124
  100. smftools/preprocessing/invert_adata.py +12 -5
  101. smftools/preprocessing/load_sample_sheet.py +19 -4
  102. smftools/readwrite.py +1021 -89
  103. smftools/tools/__init__.py +3 -32
  104. smftools/tools/calculate_umap.py +5 -5
  105. smftools/tools/general_tools.py +3 -3
  106. smftools/tools/position_stats.py +468 -106
  107. smftools/tools/read_stats.py +115 -1
  108. smftools/tools/spatial_autocorrelation.py +562 -0
  109. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
  110. smftools-0.2.3.dist-info/RECORD +173 -0
  111. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  112. smftools/informatics/fast5_to_pod5.py +0 -21
  113. smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  114. smftools/informatics/helpers/__init__.py +0 -74
  115. smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  116. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  117. smftools/informatics/helpers/bam_qc.py +0 -66
  118. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  119. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  120. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  121. smftools/informatics/helpers/index_fasta.py +0 -12
  122. smftools/informatics/helpers/make_dirs.py +0 -21
  123. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  124. smftools/informatics/load_adata.py +0 -182
  125. smftools/informatics/readwrite.py +0 -106
  126. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  127. smftools/preprocessing/append_C_context.py +0 -82
  128. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  129. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  130. smftools/preprocessing/filter_reads_on_length.py +0 -51
  131. smftools/tools/call_hmm_peaks.py +0 -105
  132. smftools/tools/data/__init__.py +0 -2
  133. smftools/tools/data/anndata_data_module.py +0 -90
  134. smftools/tools/inference/__init__.py +0 -1
  135. smftools/tools/inference/lightning_inference.py +0 -41
  136. smftools/tools/models/base.py +0 -14
  137. smftools/tools/models/cnn.py +0 -34
  138. smftools/tools/models/lightning_base.py +0 -41
  139. smftools/tools/models/mlp.py +0 -17
  140. smftools/tools/models/sklearn_models.py +0 -40
  141. smftools/tools/models/transformer.py +0 -133
  142. smftools/tools/training/__init__.py +0 -1
  143. smftools/tools/training/train_lightning_model.py +0 -47
  144. smftools-0.1.7.dist-info/RECORD +0 -136
  145. /smftools/{tools/evaluation → cli}/__init__.py +0 -0
  146. /smftools/{tools → hmm}/calculate_distances.py +0 -0
  147. /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
  148. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  149. /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
  150. /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
  151. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  152. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  153. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  154. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  155. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  156. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  157. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  158. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  159. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  160. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  161. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  162. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  163. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  164. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  165. /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
  166. /smftools/{tools → machine_learning}/models/__init__.py +0 -0
  167. /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
  168. /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
  169. /smftools/{tools → machine_learning}/utils/device.py +0 -0
  170. /smftools/{tools → machine_learning}/utils/grl.py +0 -0
  171. /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
  172. /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
  173. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  174. {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/cli_entry.py ADDED
@@ -0,0 +1,435 @@
1
+ import click
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from typing import Dict, Optional, Sequence
5
+
6
+ from .cli.load_adata import load_adata
7
+ from .cli.cli_flows import flow_I
8
+ from .cli.preprocess_adata import preprocess_adata
9
+ from .cli.spatial_adata import spatial_adata
10
+ from .cli.hmm_adata import hmm_adata
11
+
12
+ from .readwrite import merge_barcoded_anndatas_core, safe_read_h5ad, safe_write_h5ad, concatenate_h5ads
13
+
14
@click.group()
def cli():
    """Command-line interface for smftools."""
    # Root command group: the sub-commands defined in this module attach
    # themselves to it through the ``@cli.command`` decorator.
18
+
19
+ ####### Load anndata from raw data ###########
20
@cli.command(name="load")
@click.argument(
    "config_path",
    type=click.Path(exists=True),
)
def load(config_path):
    """Load and process data from CONFIG_PATH."""
    # click has already checked that CONFIG_PATH exists; hand it straight to
    # the load workflow.
    load_adata(config_path)
25
+ ##########################################
26
+
27
+ ####### Preprocessing ###########
28
@cli.command(name="preprocess")
@click.argument(
    "config_path",
    type=click.Path(exists=True),
)
def preprocess(config_path):
    """Preprocess data from CONFIG_PATH."""
    # click has already checked that CONFIG_PATH exists; hand it straight to
    # the preprocessing workflow.
    preprocess_adata(config_path)
33
+ ##########################################
34
+
35
+ ####### Spatial ###########
36
@cli.command()
@click.argument("config_path", type=click.Path(exists=True))
def spatial(config_path):
    """Run the spatial analysis step using the config at CONFIG_PATH."""
    # The docstring doubles as the ``--help`` text, so it names the step
    # explicitly instead of sharing a generic description with ``hmm``.
    spatial_adata(config_path)
41
+ ##########################################
42
+
43
+ ####### HMM ###########
44
@cli.command()
@click.argument("config_path", type=click.Path(exists=True))
def hmm(config_path):
    """Run the HMM step using the config at CONFIG_PATH."""
    # The docstring doubles as the ``--help`` text, so it names the step
    # explicitly instead of sharing a generic description with ``spatial``.
    hmm_adata(config_path)
49
+ ##########################################
50
+
51
+ ####### batch command ###########
52
def _read_config_paths(config_table: Path, column: str, sep: str | None) -> list[Path]:
    """Collect the config paths listed in ``config_table``.

    ``.txt``/``.list`` files are read as one path per non-blank line. Any
    other file is parsed with pandas as a delimited table and the paths are
    taken from ``column``; a single-column table whose header is not
    ``column`` is treated as a headerless list of paths.

    Raises:
        click.ClickException: if the file cannot be parsed, ``column`` is
            missing, or no paths are found.
    """
    suffix = config_table.suffix.lower()

    # Plain-text mode: each non-blank line is a config path.
    if suffix in {".txt", ".list"}:
        with config_table.open() as handle:
            stripped = (line.strip() for line in handle)
            paths = [Path(entry).expanduser() for entry in stripped if entry]
        if not paths:
            raise click.ClickException(f"No config paths found in text file: {config_table}")
        return paths

    # Delimited-table mode: pick the separator from the extension unless given.
    if sep is None:
        sep = "\t" if suffix in {".tsv", ".tab"} else ","

    try:
        df = pd.read_csv(config_table, sep=sep, dtype=str)
    except Exception as e:
        raise click.ClickException(f"Failed to read table {config_table}: {e}") from e

    # A single column without the expected header is a bare list of paths.
    # Re-read it headerless so the first path is not consumed as a header.
    # This must happen BEFORE the emptiness check: a one-line headerless file
    # parses as "header only, zero rows" on the first read.
    if df.shape[1] == 1 and column not in df.columns:
        try:
            df = pd.read_csv(
                config_table,
                sep=sep,
                header=None,
                names=[column],
                dtype=str,
            )
        except Exception as e:
            raise click.ClickException(
                f"Failed to read {config_table} as headerless list: {e}"
            ) from e

    if df.empty:
        raise click.ClickException(f"Config table is empty: {config_table}")

    if column not in df.columns:
        raise click.ClickException(
            f"Column '{column}' not found in {config_table}. "
            f"Available columns: {', '.join(df.columns)}"
        )

    # Strip surrounding whitespace (as TXT mode does) and drop blank cells.
    entries = (str(value).strip() for value in df[column].dropna())
    paths = [Path(entry).expanduser() for entry in entries if entry]
    if not paths:
        raise click.ClickException("No config paths found.")
    return paths


@cli.command()
@click.argument(
    "task",
    type=click.Choice(["load", "preprocess", "spatial", "hmm"], case_sensitive=False),
)
@click.argument(
    "config_table",
    type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path),
)
@click.option(
    "--column",
    "-c",
    default="config_path",
    show_default=True,
    help="Column name containing config paths (ignored for plain TXT).",
)
@click.option(
    "--sep",
    default=None,
    help=(
        "Field separator: default auto-detect (.tsv/.tab -> '\\t', anything else -> ','). "
        "Not used for .txt/.list files."
    ),
)
def batch(task, config_table: Path, column: str, sep: str | None):
    """
    Run a TASK (load, preprocess, spatial, hmm) on multiple CONFIG_PATHs
    listed in a CSV/TSV or plain TXT file.

    Plain text format: one config path per line, no header.

    Missing config files are skipped and a failure in one config does not
    stop the remaining ones; a summary is printed at the end.
    """
    config_paths = _read_config_paths(config_table, column, sep)

    # Map the (case-insensitive) task name to the workflow function.
    task = task.lower()
    task_funcs = {
        "load": load_adata,
        "preprocess": preprocess_adata,
        "spatial": spatial_adata,
        "hmm": hmm_adata,
    }
    func = task_funcs[task]

    total = len(config_paths)
    click.echo(f"Running task '{task}' on {total} config paths from {config_table}")

    skipped = []
    failed = []
    for i, cfg in enumerate(config_paths, start=1):
        if not cfg.exists():
            click.echo(f"[{i}/{total}] SKIP (missing): {cfg}")
            skipped.append(cfg)
            continue

        click.echo(f"[{i}/{total}] {task} → {cfg}")

        try:
            func(str(cfg))  # underlying functions take a string path
        except Exception as e:
            # Deliberately best-effort: report the failure and keep going so
            # one bad config does not abort the whole batch.
            click.echo(f" ERROR on {cfg}: {e}")
            failed.append(cfg)

    click.echo("Batch processing complete.")
    if failed or skipped:
        succeeded = total - len(failed) - len(skipped)
        click.echo(
            f"Summary: {succeeded} succeeded, {len(failed)} failed, {len(skipped)} skipped."
        )
186
+ ##########################################
187
+
188
+ ####### concatenate command ###########
189
@cli.command("concatenate")
@click.argument(
    "output_path",
    type=click.Path(path_type=Path, dir_okay=False),
)
@click.option(
    "--input-dir",
    "-d",
    type=click.Path(path_type=Path, file_okay=False),
    default=None,
    help="Directory containing .h5ad/.h5ad.gz files to concatenate.",
)
@click.option(
    "--csv-path",
    "-c",
    type=click.Path(path_type=Path, dir_okay=False),
    default=None,
    help="CSV/TSV/TXT containing file paths of h5ad files.",
)
@click.option(
    "--csv-column",
    "-C",
    default="h5ad_path",
    help="Column in the CSV containing file paths (ignored for TXT).",
    show_default=True,
)
@click.option(
    "--suffix",
    "-s",
    multiple=True,
    # Immutable default; click passes the collected values as a tuple anyway.
    default=(".h5ad", ".h5ad.gz"),
    help="Allowed file suffixes (repeatable).",
    show_default=True,
)
@click.option(
    "--delete",
    # Must be a boolean flag: with is_flag=False click would require a value
    # after --delete and pass a string (or None) instead of True/False.
    is_flag=True,
    help="Delete input .h5ad files after concatenation.",
)
@click.option(
    "--restore",
    is_flag=True,
    help="Restore .h5ad backups during reading.",
)
def concatenate_cmd(
    output_path: Path,
    input_dir: Path | None,
    csv_path: Path | None,
    csv_column: str,
    suffix: Sequence[str],
    delete: bool,
    restore: bool,
):
    """
    Concatenate multiple .h5ad files into a single output file.

    Two modes:

        smftools concatenate out.h5ad --input-dir ./dir

        smftools concatenate out.h5ad --csv-path paths.csv --csv-column h5ad_path

    TXT input also works (one file path per line).

    Uses safe_read_h5ad() and safe_write_h5ad().
    """
    # The two input modes are mutually exclusive.
    if input_dir is not None and csv_path is not None:
        raise click.ClickException("Provide only ONE of --input-dir or --csv-path.")

    try:
        out = concatenate_h5ads(
            output_path=output_path,
            input_dir=input_dir,
            csv_path=csv_path,
            csv_column=csv_column,
            file_suffixes=tuple(suffix),
            delete_inputs=delete,
            restore_backups=restore,
        )
    except click.ClickException:
        # Already a user-facing error; do not re-wrap it.
        raise
    except Exception as e:
        # Surface any other failure as a clean CLI error rather than a traceback.
        raise click.ClickException(str(e)) from e

    click.echo(f"✓ Concatenated file written to: {out}")
273
+ ##########################################
274
+
275
+ ####### Merging existing anndatas from an experiment that used two different demultiplexing rules #######
276
+ # REQUIRED_KEYS = ("adata_single_path", "adata_double_path")
277
+ # OPTIONAL_KEYS = (
278
+ # "adata_single_backups_path",
279
+ # "adata_double_backups_path",
280
+ # "output_path",
281
+ # "merged_filename",
282
+ # )
283
+
284
+ # def _read_config_csv(csv_path: Path) -> Dict[str, str]:
285
+ # """
286
+ # Read a multi-row, two-column CSV of key,value pairs into a dict.
287
+
288
+ # Supported features:
289
+ # - Optional header ("key,value") or none.
290
+ # - Comments starting with '#' and blank lines are ignored.
291
+ # - If duplicate keys occur, the last one wins.
292
+ # - Keys are matched literally against REQUIRED_KEYS/OPTIONAL_KEYS.
293
+ # """
294
+ # try:
295
+ # # Read as two columns regardless of header; comments ignored.
296
+ # df = pd.read_csv(
297
+ # csv_path,
298
+ # dtype=str,
299
+ # comment="#",
300
+ # header=None, # treat everything as rows; we'll normalize below
301
+ # usecols=[0, 1],
302
+ # names=["key", "value"]
303
+ # )
304
+ # except Exception as e:
305
+ # raise click.ClickException(f"Failed to read CSV: {e}") from e
306
+
307
+ # # Drop completely empty rows
308
+ # df = df.fillna("").astype(str)
309
+ # df["key"] = df["key"].str.strip()
310
+ # df["value"] = df["value"].str.strip()
311
+ # df = df[(df["key"] != "") & (df["key"].notna())]
312
+
313
+ # if df.empty:
314
+ # raise click.ClickException("Config CSV is empty after removing comments/blank lines.")
315
+
316
+ # # Remove an optional header row if present
317
+ # if df.iloc[0]["key"].lower() in {"key", "keys"}:
318
+ # df = df.iloc[1:]
319
+ # df = df[(df["key"] != "") & (df["key"].notna())]
320
+ # if df.empty:
321
+ # raise click.ClickException("Config CSV contains only a header row.")
322
+
323
+ # # Build dict; last occurrence of a key wins
324
+ # cfg = {}
325
+ # for k, v in zip(df["key"], df["value"]):
326
+ # cfg[k] = v
327
+
328
+ # # Validate required keys
329
+ # missing = [k for k in REQUIRED_KEYS if not cfg.get(k)]
330
+ # if missing:
331
+ # raise click.ClickException(
332
+ # "Missing required keys in CSV: "
333
+ # + ", ".join(missing)
334
+ # + "\nExpected keys:\n - "
335
+ # + "\n - ".join(REQUIRED_KEYS)
336
+ # + "\nOptional keys:\n - "
337
+ # + "\n - ".join(OPTIONAL_KEYS)
338
+ # )
339
+
340
+ # return cfg
341
+
342
+ # def _resolve_output_path(cfg: Dict[str, str], single_path: Path, double_path: Path) -> Path:
343
+ # """Decide on the output .h5ad path based on CSV; create directories if needed."""
344
+ # merged_filename = cfg.get("merged_filename") or f"merged_{single_path.stem}__{double_path.stem}.h5ad"
345
+ # if not merged_filename.endswith(".h5ad"):
346
+ # merged_filename += ".h5ad"
347
+
348
+ # output_path_raw = cfg.get("output_path", "").strip()
349
+
350
+ # if not output_path_raw:
351
+ # out_dir = Path.cwd() / "merged_output"
352
+ # out_dir.mkdir(parents=True, exist_ok=True)
353
+ # return out_dir / merged_filename
354
+
355
+ # output_path = Path(output_path_raw)
356
+
357
+ # if output_path.suffix.lower() == ".h5ad":
358
+ # output_path.parent.mkdir(parents=True, exist_ok=True)
359
+ # return output_path
360
+
361
+ # # Treat as directory
362
+ # output_path.mkdir(parents=True, exist_ok=True)
363
+ # return output_path / merged_filename
364
+
365
+ # def _maybe_read_adata(label: str, primary: Path, backups: Optional[Path]):
366
+
367
+ # if backups:
368
+ # click.echo(f"Loading {label} from {primary} with backups at {backups} ...")
369
+ # return safe_read_h5ad(primary, backups_path=backups, restore_backups=True)
370
+ # else:
371
+ # click.echo(f"Loading {label} from {primary} with backups disabled ...")
372
+ # return safe_read_h5ad(primary, restore_backups=False)
373
+
374
+
375
+ # @cli.command()
376
+ # @click.argument("config_path", type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path))
377
+ # def merge_barcoded_anndatas(config_path: Path):
378
+ # """
379
+ # Merge two AnnData objects from the same experiment that were demultiplexed
380
+ # under different end-barcoding requirements, using a 1-row CSV for config.
381
+
382
+ # CSV must include:
383
+ # - adata_single_path
384
+ # - adata_double_path
385
+
386
+ # Optional columns:
387
+ # - adata_single_backups_path
388
+ # - adata_double_backups_path
389
+ # - output_path (file or directory; default: ./merged_output/)
390
+ # - merged_filename (default: merged_<single>__<double>.h5ad)
391
+
392
+ # Example CSV:
393
+
394
+ # adata_single_path,adata_double_path,adata_single_backups_path,adata_double_backups_path,output_path,merged_filename
395
+ # /path/single.h5ad,/path/double.h5ad,,,,merged_output,merged_run.h5ad
396
+ # """
397
+ # try:
398
+ # cfg = _read_config_csv(config_path)
399
+
400
+ # single_path = Path(cfg["adata_single_path"]).expanduser().resolve()
401
+ # double_path = Path(cfg["adata_double_path"]).expanduser().resolve()
402
+
403
+ # for p, label in [(single_path, "adata_single_path"), (double_path, "adata_double_path")]:
404
+ # if not p.exists():
405
+ # raise click.ClickException(f"{label} does not exist: {p}")
406
+
407
+ # single_backups = Path(cfg["adata_single_backups_path"]).expanduser().resolve() if cfg.get("adata_single_backups_path") else None
408
+ # double_backups = Path(cfg["adata_double_backups_path"]).expanduser().resolve() if cfg.get("adata_double_backups_path") else None
409
+
410
+ # if single_backups and not single_backups.exists():
411
+ # raise click.ClickException(f"adata_single_backups_path does not exist: {single_backups}")
412
+ # if double_backups and not double_backups.exists():
413
+ # raise click.ClickException(f"adata_double_backups_path does not exist: {double_backups}")
414
+
415
+ # output_path = _resolve_output_path(cfg, single_path, double_path)
416
+
417
+ # # Load
418
+ # adata_single, read_report_single = _maybe_read_adata("single-barcoded AnnData", single_path, single_backups)
419
+ # adata_double, read_report_double = _maybe_read_adata("double-barcoded AnnData", double_path, double_backups)
420
+
421
+ # click.echo("Merging AnnDatas ...")
422
+ # merged = merge_barcoded_anndatas_core(adata_single, adata_double)
423
+
424
+ # click.echo(f"Writing merged AnnData to: {output_path}")
425
+ # backup_dir = output_path.cwd() / "merged_backups"
426
+ # safe_write_h5ad(merged, output_path, backup=True, backup_dir=backup_dir)
427
+
428
+ # click.secho(f"Done. Merged AnnData saved to {output_path}", fg="green")
429
+
430
+ # except click.ClickException:
431
+ # raise
432
+ # except Exception as e:
433
+ # # Surface unexpected errors cleanly
434
+ # raise click.ClickException(f"Unexpected error: {e}") from e
435
+ ################################################################################################################
@@ -0,0 +1 @@
1
+ from .experiment_config import LoadExperimentConfig, ExperimentConfig
@@ -0,0 +1,38 @@
1
+ # Conversion (Bisulfite/APOBEC) footprinting defaults
2
+ extends: default
3
+
4
+ ######## smftools load params #########
5
+ conversion_types:
6
+ - '5mC' # 5mC
7
+
8
+ ######## smftools preprocess params #########
9
+ # Read QC Params
10
+ read_mod_filtering_use_other_c_as_background: True
11
+
12
+ ######## smftools hmm params #########
13
+ # HMM
14
+ cpg: True # whether to use the default HMM endogenous CpG patch params
15
+ hmm_methbases:
16
+ - "GpC"
17
+ hmm_feature_sets:
18
+ footprint:
19
+ state: "Non-Modified"
20
+ features:
21
+ small_bound_stretch: [10, 30]
22
+ medium_bound_stretch: [30, 110]
23
+ putative_nucleosome: [110, 200]
24
+ large_bound_stretch: [200, inf]
25
+ accessible:
26
+ state: "Modified"
27
+ features:
28
+ small_accessible_patch: [3, 20]
29
+ mid_accessible_patch: [20, 40]
30
+ mid_large_accessible_patch: [40, 130]
31
+ large_accessible_patch: [130, inf]
32
+ cpg:
33
+ state: "Modified"
34
+ features:
35
+ cpg_patch: [0, inf]
36
+
37
+ hmm_merge_layer_features:
38
+ - ["GpC_all_accessible_features", 80]
@@ -0,0 +1,61 @@
1
+ # Deaminase footprinting defaults
2
+ extends: default
3
+
4
+ ######## smftools load params #########
5
+ conversion_types:
6
+ - '5mC' # 5mC
7
+
8
+ mod_target_bases:
9
+ - "C"
10
+
11
+ ######## smftools preprocess params #########
12
+ read_mod_filtering_gpc_thresholds:
13
+ - null
14
+ - null
15
+ read_mod_filtering_cpg_thresholds:
16
+ - null
17
+ - null
18
+ read_mod_filtering_any_c_thresholds:
19
+ - 0.01
20
+ - 0.99
21
+ read_mod_filtering_a_thresholds:
22
+ - null
23
+ - null
24
+
25
+ read_mod_filtering_use_other_c_as_background: False
26
+
27
+ # Duplicate Detection Params
28
+ duplicate_detection_site_types:
29
+ - "any_C"
30
+
31
+ ######## smftools analyze params #########
32
+ # Autocorrelation params
33
+ autocorr_site_types:
34
+ - "any_C"
35
+
36
+ # Correlation matrix params
37
+ correlation_matrix_site_types:
38
+ - "any_C_site"
39
+
40
+ ######## smftools hmm params #########
41
+ cpg: False # whether to use the default HMM endogenous CpG patch params
42
+ hmm_methbases:
43
+ - "C"
44
+ hmm_feature_sets:
45
+ footprint:
46
+ state: "Non-Modified"
47
+ features:
48
+ small_bound_stretch: [10, 30]
49
+ medium_bound_stretch: [30, 110]
50
+ putative_nucleosome: [110, 200]
51
+ large_bound_stretch: [200, inf]
52
+ accessible:
53
+ state: "Modified"
54
+ features:
55
+ small_accessible_patch: [3, 20]
56
+ mid_accessible_patch: [20, 40]
57
+ mid_large_accessible_patch: [40, 130]
58
+ large_accessible_patch: [130, inf]
59
+
60
+ hmm_merge_layer_features:
61
+ - ["C_all_accessible_features", 80]