smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
smftools/cli_entry.py CHANGED
@@ -1,19 +1,63 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ from pathlib import Path
5
+ from typing import Sequence
6
+
1
7
  import click
2
8
  import pandas as pd
3
- from pathlib import Path
4
- from typing import Dict, Optional, Sequence
5
9
 
10
+ from .cli.hmm_adata import hmm_adata
6
11
  from .cli.load_adata import load_adata
7
12
  from .cli.preprocess_adata import preprocess_adata
8
13
  from .cli.spatial_adata import spatial_adata
9
- from .cli.hmm_adata import hmm_adata
14
+ from .informatics.pod5_functions import subsample_pod5
15
+ from .logging_utils import get_logger, setup_logging
16
+ from .readwrite import concatenate_h5ads
17
+
18
+
19
+ def _configure_multiprocessing() -> None:
20
+ import multiprocessing as mp
21
+ import sys
22
+
23
+ logger = get_logger(__name__)
24
+
25
+ try:
26
+ if sys.platform == "win32":
27
+ mp.set_start_method("spawn")
28
+ logger.debug("Setting multiprocessing start method to spawn")
29
+ else:
30
+ # try forkserver first, fallback to spawn
31
+ try:
32
+ mp.set_start_method("forkserver")
33
+ logger.debug("Setting multiprocessing start method to forkserver")
34
+ except ValueError:
35
+ mp.set_start_method("spawn")
36
+ logger.debug("Setting multiprocessing start method to spawn")
37
+ except RuntimeError:
38
+ logger.warning("Could not set multiprocessing start method")
10
39
 
11
- from .readwrite import safe_read_h5ad, safe_write_h5ad, concatenate_h5ads
12
40
 
13
41
  @click.group()
14
- def cli():
42
+ @click.option(
43
+ "--log-file",
44
+ type=click.Path(dir_okay=False, writable=True, path_type=Path),
45
+ default=None,
46
+ help="Optional file path to write smftools logs.",
47
+ )
48
+ @click.option(
49
+ "--log-level",
50
+ type=click.Choice(["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"], case_sensitive=False),
51
+ default="INFO",
52
+ show_default=True,
53
+ help="Logging level for smftools output.",
54
+ )
55
+ def cli(log_file: Path | None, log_level: str):
15
56
  """Command-line interface for smftools."""
16
- pass
57
+ level = getattr(logging, log_level.upper(), logging.INFO)
58
+ setup_logging(level=level, log_file=log_file)
59
+ _configure_multiprocessing()
60
+
17
61
 
18
62
  ####### Load anndata from raw data ###########
19
63
  @cli.command()
@@ -21,32 +65,44 @@ def cli():
21
65
  def load(config_path):
22
66
  """Load and process data from CONFIG_PATH."""
23
67
  load_adata(config_path)
68
+
69
+
24
70
  ##########################################
25
71
 
72
+
26
73
  ####### Preprocessing ###########
27
74
  @cli.command()
28
75
  @click.argument("config_path", type=click.Path(exists=True))
29
76
  def preprocess(config_path):
30
77
  """Preprocess data from CONFIG_PATH."""
31
78
  preprocess_adata(config_path)
79
+
80
+
32
81
  ##########################################
33
82
 
83
+
34
84
  ####### Spatial ###########
35
85
  @cli.command()
36
86
  @click.argument("config_path", type=click.Path(exists=True))
37
87
  def spatial(config_path):
38
88
  """Process data from CONFIG_PATH."""
39
89
  spatial_adata(config_path)
90
+
91
+
40
92
  ##########################################
41
93
 
94
+
42
95
  ####### HMM ###########
43
96
  @cli.command()
44
97
  @click.argument("config_path", type=click.Path(exists=True))
45
98
  def hmm(config_path):
46
99
  """Process data from CONFIG_PATH."""
47
100
  hmm_adata(config_path)
101
+
102
+
48
103
  ##########################################
49
104
 
105
+
50
106
  ####### batch command ###########
51
107
  @cli.command()
52
108
  @click.argument(
@@ -125,7 +181,9 @@ def batch(task, config_table: Path, column: str, sep: str | None):
125
181
  dtype=str,
126
182
  )
127
183
  except Exception as e:
128
- raise click.ClickException(f"Failed to read {config_table} as headerless list: {e}") from e
184
+ raise click.ClickException(
185
+ f"Failed to read {config_table} as headerless list: {e}"
186
+ ) from e
129
187
 
130
188
  config_series = df[column]
131
189
  else:
@@ -136,12 +194,7 @@ def batch(task, config_table: Path, column: str, sep: str | None):
136
194
  )
137
195
  config_series = df[column]
138
196
 
139
- config_paths = (
140
- config_series.dropna()
141
- .map(str)
142
- .map(lambda p: Path(p).expanduser())
143
- .tolist()
144
- )
197
+ config_paths = config_series.dropna().map(str).map(lambda p: Path(p).expanduser()).tolist()
145
198
 
146
199
  # ----------------------------
147
200
  # Validate config paths
@@ -162,9 +215,7 @@ def batch(task, config_table: Path, column: str, sep: str | None):
162
215
 
163
216
  func = task_funcs[task]
164
217
 
165
- click.echo(
166
- f"Running task '{task}' on {len(config_paths)} config paths from {config_table}"
167
- )
218
+ click.echo(f"Running task '{task}' on {len(config_paths)} config paths from {config_table}")
168
219
 
169
220
  # ----------------------------
170
221
  # Loop over paths
@@ -177,13 +228,16 @@ def batch(task, config_table: Path, column: str, sep: str | None):
177
228
  click.echo(f"[{i}/{len(config_paths)}] {task} → {cfg}")
178
229
 
179
230
  try:
180
- func(str(cfg)) # underlying functions take a string path
231
+ func(str(cfg)) # underlying functions take a string path
181
232
  except Exception as e:
182
233
  click.echo(f" ERROR on {cfg}: {e}")
183
234
 
184
235
  click.echo("Batch processing complete.")
236
+
237
+
185
238
  ##########################################
186
239
 
240
+
187
241
  ####### concatenate command ###########
188
242
  @cli.command("concatenate")
189
243
  @click.argument(
@@ -269,166 +323,53 @@ def concatenate_cmd(
269
323
 
270
324
  except Exception as e:
271
325
  raise click.ClickException(str(e)) from e
326
+
327
+
272
328
  ##########################################
273
329
 
274
- ####### Merging existing anndatas from an experiment that used two different demultiplexing rules #######
275
- # REQUIRED_KEYS = ("adata_single_path", "adata_double_path")
276
- # OPTIONAL_KEYS = (
277
- # "adata_single_backups_path",
278
- # "adata_double_backups_path",
279
- # "output_path",
280
- # "merged_filename",
281
- # )
282
-
283
- # def _read_config_csv(csv_path: Path) -> Dict[str, str]:
284
- # """
285
- # Read a multi-row, two-column CSV of key,value pairs into a dict.
286
-
287
- # Supported features:
288
- # - Optional header ("key,value") or none.
289
- # - Comments starting with '#' and blank lines are ignored.
290
- # - If duplicate keys occur, the last one wins.
291
- # - Keys are matched literally against REQUIRED_KEYS/OPTIONAL_KEYS.
292
- # """
293
- # try:
294
- # # Read as two columns regardless of header; comments ignored.
295
- # df = pd.read_csv(
296
- # csv_path,
297
- # dtype=str,
298
- # comment="#",
299
- # header=None, # treat everything as rows; we'll normalize below
300
- # usecols=[0, 1],
301
- # names=["key", "value"]
302
- # )
303
- # except Exception as e:
304
- # raise click.ClickException(f"Failed to read CSV: {e}") from e
305
-
306
- # # Drop completely empty rows
307
- # df = df.fillna("").astype(str)
308
- # df["key"] = df["key"].str.strip()
309
- # df["value"] = df["value"].str.strip()
310
- # df = df[(df["key"] != "") & (df["key"].notna())]
311
-
312
- # if df.empty:
313
- # raise click.ClickException("Config CSV is empty after removing comments/blank lines.")
314
-
315
- # # Remove an optional header row if present
316
- # if df.iloc[0]["key"].lower() in {"key", "keys"}:
317
- # df = df.iloc[1:]
318
- # df = df[(df["key"] != "") & (df["key"].notna())]
319
- # if df.empty:
320
- # raise click.ClickException("Config CSV contains only a header row.")
321
-
322
- # # Build dict; last occurrence of a key wins
323
- # cfg = {}
324
- # for k, v in zip(df["key"], df["value"]):
325
- # cfg[k] = v
326
-
327
- # # Validate required keys
328
- # missing = [k for k in REQUIRED_KEYS if not cfg.get(k)]
329
- # if missing:
330
- # raise click.ClickException(
331
- # "Missing required keys in CSV: "
332
- # + ", ".join(missing)
333
- # + "\nExpected keys:\n - "
334
- # + "\n - ".join(REQUIRED_KEYS)
335
- # + "\nOptional keys:\n - "
336
- # + "\n - ".join(OPTIONAL_KEYS)
337
- # )
338
-
339
- # return cfg
340
-
341
- # def _resolve_output_path(cfg: Dict[str, str], single_path: Path, double_path: Path) -> Path:
342
- # """Decide on the output .h5ad path based on CSV; create directories if needed."""
343
- # merged_filename = cfg.get("merged_filename") or f"merged_{single_path.stem}__{double_path.stem}.h5ad"
344
- # if not merged_filename.endswith(".h5ad"):
345
- # merged_filename += ".h5ad"
346
-
347
- # output_path_raw = cfg.get("output_path", "").strip()
348
-
349
- # if not output_path_raw:
350
- # out_dir = Path.cwd() / "merged_output"
351
- # out_dir.mkdir(parents=True, exist_ok=True)
352
- # return out_dir / merged_filename
353
-
354
- # output_path = Path(output_path_raw)
355
-
356
- # if output_path.suffix.lower() == ".h5ad":
357
- # output_path.parent.mkdir(parents=True, exist_ok=True)
358
- # return output_path
359
-
360
- # # Treat as directory
361
- # output_path.mkdir(parents=True, exist_ok=True)
362
- # return output_path / merged_filename
363
-
364
- # def _maybe_read_adata(label: str, primary: Path, backups: Optional[Path]):
365
-
366
- # if backups:
367
- # click.echo(f"Loading {label} from {primary} with backups at {backups} ...")
368
- # return safe_read_h5ad(primary, backups_path=backups, restore_backups=True)
369
- # else:
370
- # click.echo(f"Loading {label} from {primary} with backups disabled ...")
371
- # return safe_read_h5ad(primary, restore_backups=False)
372
-
373
-
374
- # @cli.command()
375
- # @click.argument("config_path", type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path))
376
- # def merge_barcoded_anndatas(config_path: Path):
377
- # """
378
- # Merge two AnnData objects from the same experiment that were demultiplexed
379
- # under different end-barcoding requirements, using a 1-row CSV for config.
380
-
381
- # CSV must include:
382
- # - adata_single_path
383
- # - adata_double_path
384
-
385
- # Optional columns:
386
- # - adata_single_backups_path
387
- # - adata_double_backups_path
388
- # - output_path (file or directory; default: ./merged_output/)
389
- # - merged_filename (default: merged_<single>__<double>.h5ad)
390
-
391
- # Example CSV:
392
-
393
- # adata_single_path,adata_double_path,adata_single_backups_path,adata_double_backups_path,output_path,merged_filename
394
- # /path/single.h5ad,/path/double.h5ad,,,,merged_output,merged_run.h5ad
395
- # """
396
- # try:
397
- # cfg = _read_config_csv(config_path)
398
-
399
- # single_path = Path(cfg["adata_single_path"]).expanduser().resolve()
400
- # double_path = Path(cfg["adata_double_path"]).expanduser().resolve()
401
-
402
- # for p, label in [(single_path, "adata_single_path"), (double_path, "adata_double_path")]:
403
- # if not p.exists():
404
- # raise click.ClickException(f"{label} does not exist: {p}")
405
-
406
- # single_backups = Path(cfg["adata_single_backups_path"]).expanduser().resolve() if cfg.get("adata_single_backups_path") else None
407
- # double_backups = Path(cfg["adata_double_backups_path"]).expanduser().resolve() if cfg.get("adata_double_backups_path") else None
408
-
409
- # if single_backups and not single_backups.exists():
410
- # raise click.ClickException(f"adata_single_backups_path does not exist: {single_backups}")
411
- # if double_backups and not double_backups.exists():
412
- # raise click.ClickException(f"adata_double_backups_path does not exist: {double_backups}")
413
-
414
- # output_path = _resolve_output_path(cfg, single_path, double_path)
415
-
416
- # # Load
417
- # adata_single, read_report_single = _maybe_read_adata("single-barcoded AnnData", single_path, single_backups)
418
- # adata_double, read_report_double = _maybe_read_adata("double-barcoded AnnData", double_path, double_backups)
419
-
420
- # click.echo("Merging AnnDatas ...")
421
- # merged = merge_barcoded_anndatas_core(adata_single, adata_double)
422
-
423
- # click.echo(f"Writing merged AnnData to: {output_path}")
424
- # backup_dir = output_path.cwd() / "merged_backups"
425
- # safe_write_h5ad(merged, output_path, backup=True, backup_dir=backup_dir)
426
-
427
- # click.secho(f"Done. Merged AnnData saved to {output_path}", fg="green")
428
-
429
- # except click.ClickException:
430
- # raise
431
- # except Exception as e:
432
- # # Surface unexpected errors cleanly
433
- # raise click.ClickException(f"Unexpected error: {e}") from e
434
- ################################################################################################################
330
+
331
+ ####### subsample pod5 command ###########
332
+ @cli.command("subsample-pod5")
333
+ @click.argument(
334
+ "pod5_path",
335
+ type=click.Path(exists=True, path_type=Path),
336
+ )
337
+ @click.option(
338
+ "--read-names",
339
+ "-r",
340
+ type=click.Path(exists=True, path_type=Path),
341
+ default=None,
342
+ help="Text file with one read_id per line.",
343
+ )
344
+ @click.option(
345
+ "--n-reads",
346
+ "-n",
347
+ type=int,
348
+ default=None,
349
+ help="Randomly subsample N reads.",
350
+ )
351
+ @click.option(
352
+ "--outdir",
353
+ "-o",
354
+ type=click.Path(path_type=Path, file_okay=False),
355
+ required=True,
356
+ help="Output directory for subsampled POD5.",
357
+ )
358
+ def subsample_pod5_cmd(pod5_path, read_names, n_reads, outdir):
359
+ """
360
+ Subsample POD5 file(s) by read ID list or random sampling.
361
+ """
362
+
363
+ # --- Validate mutually exclusive options ---
364
+ if (read_names is None and n_reads is None) or (read_names and n_reads):
365
+ raise click.UsageError("You must specify exactly ONE of --read-names or --n-reads.")
366
+
367
+ outdir.mkdir(parents=True, exist_ok=True)
368
+
369
+ subsample_arg = str(read_names) if read_names else n_reads
370
+
371
+ subsample_pod5(
372
+ pod5_path=str(pod5_path),
373
+ read_name_path=subsample_arg,
374
+ output_directory=str(outdir),
375
+ )
@@ -1 +1,3 @@
1
- from .experiment_config import LoadExperimentConfig, ExperimentConfig
1
+ from __future__ import annotations
2
+
3
+ from .experiment_config import ExperimentConfig, LoadExperimentConfig
@@ -9,6 +9,10 @@ conversion_types:
9
9
  # Read QC Params
10
10
  read_mod_filtering_use_other_c_as_background: True
11
11
 
12
+ # Spatial Analysis - Autocorr params
13
+ autocorr_site_types:
14
+ - "GpC"
15
+
12
16
  # Spatial Analysis - Clustermap params
13
17
  layer_for_clustermap_plotting: 'nan0_0minus1'
14
18
  clustermap_cmap_c: "coolwarm"
@@ -42,4 +46,4 @@ hmm_feature_sets:
42
46
  cpg_patch: [0, inf]
43
47
 
44
48
  hmm_merge_layer_features:
45
- - ["GpC_all_accessible_features", 80]
49
+ - ["all_accessible_features", 60]
@@ -60,4 +60,4 @@ hmm_feature_sets:
60
60
  nucleosome_depleted_region: [110, inf]
61
61
 
62
62
  hmm_merge_layer_features:
63
- - ["C_all_accessible_features", 80]
63
+ - ["all_accessible_features", 60]
@@ -1,7 +1,7 @@
1
1
  # General
2
2
  sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
3
- sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
4
- sample_name_col_for_plotting: 'Barcode'
3
+ sample_sheet_mapping_column: 'Experiment_name_and_barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
4
+ sample_name_col_for_plotting: 'Experiment_name_and_barcode'
5
5
 
6
6
  # Compute params
7
7
  threads: 4
@@ -9,9 +9,7 @@ device: "auto"
9
9
 
10
10
  ######## smftools load params #########
11
11
  # Generic i/o
12
- bam_suffix: ".bam"
13
12
  recursive_input_search: True
14
- split_dir: "demultiplexed_BAMs"
15
13
  strands:
16
14
  - bottom
17
15
  - top
@@ -21,7 +19,7 @@ fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barc
21
19
  fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
22
20
  input_already_demuxed: False # If the input files are already demultiplexed.
23
21
  delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
24
- delete_intermediate_bams: False # Whether to delete intermediate BAM files.
22
+ delete_intermediate_bams: True # Whether to delete intermediate BAM files.
25
23
  delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
26
24
 
27
25
  # Sequencing modality and general experiment params
@@ -53,7 +51,6 @@ aligner_args:
53
51
  - '-y'
54
52
  - '-N'
55
53
  - '5'
56
- - '--secondary=no'
57
54
  pacbio:
58
55
  - '-a'
59
56
  - '-x'
@@ -63,7 +60,6 @@ aligner_args:
63
60
  - '-y'
64
61
  - '-N'
65
62
  - '5'
66
- - '--secondary=no'
67
63
  illumina:
68
64
  - '-a'
69
65
  - '-x'
@@ -73,7 +69,6 @@ aligner_args:
73
69
  - '-y'
74
70
  - '-N'
75
71
  - '5'
76
- - '--secondary=no'
77
72
  dorado:
78
73
  ont:
79
74
  - "--mm2-opts"
@@ -82,15 +77,18 @@ aligner_args:
82
77
  # Sorted BAM and BED specific handling
83
78
  make_bigwigs: False # Whether to make coverage bigwigs
84
79
  make_beds: False # Whether to make beds from the aligned bams
80
+ samtools_backend: auto # auto|python|cli for samtools-compatible operations
81
+ bedtools_backend: auto # auto|python|cli for bedtools-compatible operations
82
+ bigwig_backend: auto # auto|python|cli for bedGraphToBigWig conversion
85
83
 
86
84
  # Nanopore specific demultiplexing
87
85
  barcode_both_ends: False # dorado demultiplexing
88
86
  trim: False # dorado adapter and barcode removal during demultiplexing
89
87
 
90
88
  # Anndata structure
91
- mapping_threshold: 0.01 # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
89
+ mapping_threshold: 0.10 # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
92
90
  reference_column: 'Reference_strand'
93
- sample_column: 'Barcode'
91
+ sample_column: 'Experiment_name_and_barcode'
94
92
 
95
93
  ######## smftools preprocess params #########
96
94
  # Read length, quality, and mapping filtering params
@@ -101,7 +99,7 @@ read_len_filter_thresholds:
101
99
  - 100
102
100
  - null
103
101
  read_len_to_ref_ratio_filter_thresholds:
104
- - 0.5
102
+ - null
105
103
  - null
106
104
  read_quality_filter_thresholds:
107
105
  - 15
@@ -179,13 +177,12 @@ umap_layers_to_plot:
179
177
  - "Raw_modification_signal"
180
178
 
181
179
  # Spatial Analysis - Spatial Autocorrelation params
180
+ autocorr_normalization_method: "pearson" # options are pearson or sum
182
181
  rows_per_qc_autocorr_grid: 6
183
182
  autocorr_rolling_window_size: 25
184
183
  autocorr_max_lag: 800
185
184
  autocorr_site_types:
186
185
  - "GpC"
187
- - "CpG"
188
- - "C"
189
186
 
190
187
  # Spatial Analysis - Correlation Matrix params
191
188
  correlation_matrix_types:
@@ -210,10 +207,19 @@ hmm_init_start_probs:
210
207
  - 0.5
211
208
  - 0.5
212
209
  hmm_eps: 1e-8
210
+ # Fitting strategy
211
+ hmm_fit_strategy: "per_group" # "per_group" | "shared_transitions"
212
+ hmm_shared_scope: ["reference", "methbase"]
213
+ hmm_groupby: ["sample", "reference", "methbase"]
214
+ # If hmm_fit_strategy == shared_transitions
215
+ hmm_adapt_emissions: true
216
+ hmm_adapt_startprobs: true
217
+ hmm_emission_adapt_iters: 5
218
+ hmm_emission_adapt_tol: 1.0e-4
213
219
  hmm_dtype: "float64"
214
- hmm_annotation_threshold: 0.5
215
- hmm_batch_size: 1024
216
- hmm_use_viterbi: False
220
+ hmm_annotation_threshold: 0.5 # The minimum probability threshold of a feature interval to accept it for layer annotation.
221
+ hmm_batch_size: 1024 # hmm batch size
222
+ hmm_use_viterbi: False # Whether to use viterbi decoding. If False, uses forward-backward gammas. Viterbi is smoother, but less sensitive.
217
223
  footprints: True # whether to use the default HMM footprint params
218
224
  accessible_patches: True # whether to use the default HMM accessible patch params
219
225
  cpg: False # whether to use the default HMM endogenous CpG patch params
@@ -238,7 +244,7 @@ hmm_feature_sets:
238
244
  large_accessible_patch: [40, 110]
239
245
  nucleosome_depleted_region: [110, inf]
240
246
  hmm_merge_layer_features:
241
- - [null, 80]
247
+ - ["all_accessible_features", 60]
242
248
  clustermap_cmap_hmm: "coolwarm"
243
249
  hmm_clustermap_feature_layers:
244
250
  - all_accessible_features
@@ -246,7 +252,9 @@ hmm_clustermap_feature_layers:
246
252
  - small_accessible_patch
247
253
  - mid_accessible_patch
248
254
  - large_accessible_patch
255
+ - large_accessible_patch_merged
249
256
  - nucleosome_depleted_region
257
+ - nucleosome_depleted_region_merged
250
258
  - small_bound_stretch
251
259
  - medium_bound_stretch
252
260
  - putative_nucleosome
@@ -365,4 +373,4 @@ force_redo_matrix_corr_plotting: False # Whether to force redo basic correlation
365
373
  bypass_hmm_fit: False # Whether to skip HMM fitting for each sample/reference
366
374
  force_redo_hmm_fit: False # Whether to redo HMM fitting for each sample/reference
367
375
  bypass_hmm_apply: False # Whether to skip HMM application for each sample/reference
368
- force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
376
+ force_redo_hmm_apply: False # Whether to redo HMM application for each sample/reference
@@ -27,10 +27,10 @@ delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs afte
27
27
 
28
28
  ######## smftools preprocess params ########
29
29
  fit_position_methylation_thresholds: False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
30
- binarize_on_fixed_methlyation_threshold: 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
30
+ binarize_on_fixed_methlyation_threshold: 0.5 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
31
31
  positive_control_sample_methylation_fitting: null # A positive control Sample_name to use for fully modified template data
32
32
  negative_control_sample_methylation_fitting: null # A negative control Sample_name to use for fully unmodified template data
33
- infer_on_percentile_sample_methylation_fitting: 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
33
+ infer_on_percentile_sample_methylation_fitting: 5 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
34
34
  inference_variable_sample_methylation_fitting: "Raw_modification_signal" # The obs column value used for the percentile metric above.
35
35
  fit_j_threshold: 0.5 # The J-statistic threhold to use for determining which positions pass qc for mod detection thresholding
36
36
  output_binary_layer_name: "binarized_methylation" # The layer to store the binarized methylation data in
@@ -39,6 +39,11 @@ output_binary_layer_name: "binarized_methylation" # The layer to store the binar
39
39
  autocorr_site_types:
40
40
  - "A"
41
41
 
42
+ spatial_clustermap_sortby: "a"
43
+
42
44
  ######## smftools hmm params #########
43
45
  hmm_methbases:
44
- - "A"
46
+ - "A"
47
+
48
+ hmm_merge_layer_features:
49
+ - ["A_all_accessible_features", 60]
@@ -1,11 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  from pathlib import Path
4
- from typing import Dict, List, Any, Iterable, Union
4
+ from typing import Any, Dict, List, Union
5
+
6
+ from smftools.constants import BAM_SUFFIX
7
+
5
8
 
6
9
  def discover_input_files(
7
10
  input_data_path: Union[str, Path],
8
- bam_suffix: str = ".bam",
11
+ bam_suffix: str = BAM_SUFFIX,
9
12
  recursive: bool = False,
10
13
  follow_symlinks: bool = False,
11
14
  ) -> Dict[str, Any]:
@@ -30,10 +33,21 @@ def discover_input_files(
30
33
  bam_suffix = bam_suffix.lower()
31
34
 
32
35
  # Sets of canonical extension keys we’ll compare against
33
- pod5_exts = {".pod5", ".p5"}
36
+ pod5_exts = {".pod5", ".p5"}
34
37
  fast5_exts = {".fast5", ".f5"}
35
- fastq_exts = {".fastq", ".fq", ".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq.zst", ".fq.zst"}
36
- h5ad_exts = {".h5ad", ".h5"}
38
+ fastq_exts = {
39
+ ".fastq",
40
+ ".fq",
41
+ ".fastq.gz",
42
+ ".fq.gz",
43
+ ".fastq.bz2",
44
+ ".fq.bz2",
45
+ ".fastq.xz",
46
+ ".fq.xz",
47
+ ".fastq.zst",
48
+ ".fq.zst",
49
+ }
50
+ h5ad_exts = {".h5ad", ".h5"}
37
51
  compressed_exts = {".gz", ".bz2", ".xz", ".zst"}
38
52
 
39
53
  def ext_key(pp: Path) -> str: