smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (133)
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +7 -1
  5. smftools/cli/hmm_adata.py +902 -244
  6. smftools/cli/load_adata.py +318 -198
  7. smftools/cli/preprocess_adata.py +285 -171
  8. smftools/cli/spatial_adata.py +137 -53
  9. smftools/cli_entry.py +94 -178
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +5 -1
  12. smftools/config/deaminase.yaml +1 -1
  13. smftools/config/default.yaml +22 -17
  14. smftools/config/direct.yaml +8 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +505 -276
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2125 -1426
  21. smftools/hmm/__init__.py +2 -3
  22. smftools/hmm/archived/call_hmm_peaks.py +16 -1
  23. smftools/hmm/call_hmm_peaks.py +173 -193
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +379 -156
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +195 -29
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +347 -168
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +145 -85
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +8 -8
  84. smftools/preprocessing/append_base_context.py +105 -79
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  86. smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +127 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +44 -22
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +103 -55
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +688 -271
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +93 -27
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +264 -109
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.4.dist-info/RECORD +0 -176
  128. /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
  129. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  130. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  131. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  132. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  133. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0
smftools/cli_entry.py CHANGED
@@ -1,19 +1,38 @@
+import logging
+from pathlib import Path
+from typing import Sequence
+
 import click
 import pandas as pd
-from pathlib import Path
-from typing import Dict, Optional, Sequence
 
+from .cli.hmm_adata import hmm_adata
 from .cli.load_adata import load_adata
 from .cli.preprocess_adata import preprocess_adata
 from .cli.spatial_adata import spatial_adata
-from .cli.hmm_adata import hmm_adata
+from .informatics.pod5_functions import subsample_pod5
+from .logging_utils import setup_logging
+from .readwrite import concatenate_h5ads
 
-from .readwrite import safe_read_h5ad, safe_write_h5ad, concatenate_h5ads
 
 @click.group()
-def cli():
+@click.option(
+    "--log-file",
+    type=click.Path(dir_okay=False, writable=True, path_type=Path),
+    default=None,
+    help="Optional file path to write smftools logs.",
+)
+@click.option(
+    "--log-level",
+    type=click.Choice(["CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"], case_sensitive=False),
+    default="INFO",
+    show_default=True,
+    help="Logging level for smftools output.",
+)
+def cli(log_file: Path | None, log_level: str):
     """Command-line interface for smftools."""
-    pass
+    level = getattr(logging, log_level.upper(), logging.INFO)
+    setup_logging(level=level, log_file=log_file)
+
 
 ####### Load anndata from raw data ###########
 @cli.command()
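Both new options live on the click group, so they apply to every subcommand and must precede the subcommand name. A minimal sketch using click's test runner (only help text is produced here; the log file name is arbitrary):

    from click.testing import CliRunner

    from smftools.cli_entry import cli

    runner = CliRunner()
    # Group-level flags come before the subcommand name.
    result = runner.invoke(cli, ["--log-level", "DEBUG", "--log-file", "smftools.log", "load", "--help"])
    print(result.output)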
@@ -21,32 +40,44 @@ def cli():
 def load(config_path):
     """Load and process data from CONFIG_PATH."""
     load_adata(config_path)
+
+
 ##########################################
 
+
 ####### Preprocessing ###########
 @cli.command()
 @click.argument("config_path", type=click.Path(exists=True))
 def preprocess(config_path):
     """Preprocess data from CONFIG_PATH."""
     preprocess_adata(config_path)
+
+
 ##########################################
 
+
 ####### Spatial ###########
 @cli.command()
 @click.argument("config_path", type=click.Path(exists=True))
 def spatial(config_path):
     """Process data from CONFIG_PATH."""
     spatial_adata(config_path)
+
+
 ##########################################
 
+
 ####### HMM ###########
 @cli.command()
 @click.argument("config_path", type=click.Path(exists=True))
 def hmm(config_path):
     """Process data from CONFIG_PATH."""
     hmm_adata(config_path)
+
+
 ##########################################
 
+
 ####### batch command ###########
 @cli.command()
 @click.argument(
@@ -125,7 +156,9 @@ def batch(task, config_table: Path, column: str, sep: str | None):
                 dtype=str,
             )
         except Exception as e:
-            raise click.ClickException(f"Failed to read {config_table} as headerless list: {e}") from e
+            raise click.ClickException(
+                f"Failed to read {config_table} as headerless list: {e}"
+            ) from e
 
         config_series = df[column]
     else:
@@ -136,12 +169,7 @@
         )
         config_series = df[column]
 
-    config_paths = (
-        config_series.dropna()
-        .map(str)
-        .map(lambda p: Path(p).expanduser())
-        .tolist()
-    )
+    config_paths = config_series.dropna().map(str).map(lambda p: Path(p).expanduser()).tolist()
 
     # ----------------------------
     # Validate config paths
@@ -162,9 +190,7 @@
 
     func = task_funcs[task]
 
-    click.echo(
-        f"Running task '{task}' on {len(config_paths)} config paths from {config_table}"
-    )
+    click.echo(f"Running task '{task}' on {len(config_paths)} config paths from {config_table}")
 
     # ----------------------------
     # Loop over paths
@@ -177,13 +203,16 @@
         click.echo(f"[{i}/{len(config_paths)}] {task} → {cfg}")
 
         try:
-            func(str(cfg)) # underlying functions take a string path
+            func(str(cfg))  # underlying functions take a string path
         except Exception as e:
            click.echo(f" ERROR on {cfg}: {e}")
 
    click.echo("Batch processing complete.")
+
+
 ##########################################
 
+
 ####### concatenate command ###########
 @cli.command("concatenate")
 @click.argument(
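The batch hunks above are formatting cleanups to the existing command, which reads config paths from a table and dispatches one of the single-config tasks over them. A hedged sketch of an invocation (the CSV file and column name are illustrative assumptions, not taken from this diff):

    from click.testing import CliRunner

    from smftools.cli_entry import cli

    runner = CliRunner()
    # Run the 'load' task once per path found in the chosen column.
    result = runner.invoke(cli, ["batch", "load", "configs.csv", "--column", "config_path"])
    print(result.output)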
@@ -269,166 +298,53 @@ def concatenate_cmd(
 
     except Exception as e:
         raise click.ClickException(str(e)) from e
+
+
 ##########################################
 
-####### Merging existing anndatas from an experiment that used two different demultiplexing rules #######
-# REQUIRED_KEYS = ("adata_single_path", "adata_double_path")
-# OPTIONAL_KEYS = (
-#     "adata_single_backups_path",
-#     "adata_double_backups_path",
-#     "output_path",
-#     "merged_filename",
-# )
-
-# def _read_config_csv(csv_path: Path) -> Dict[str, str]:
-#     """
-#     Read a multi-row, two-column CSV of key,value pairs into a dict.
-
-#     Supported features:
-#     - Optional header ("key,value") or none.
-#     - Comments starting with '#' and blank lines are ignored.
-#     - If duplicate keys occur, the last one wins.
-#     - Keys are matched literally against REQUIRED_KEYS/OPTIONAL_KEYS.
-#     """
-#     try:
-#         # Read as two columns regardless of header; comments ignored.
-#         df = pd.read_csv(
-#             csv_path,
-#             dtype=str,
-#             comment="#",
-#             header=None, # treat everything as rows; we'll normalize below
-#             usecols=[0, 1],
-#             names=["key", "value"]
-#         )
-#     except Exception as e:
-#         raise click.ClickException(f"Failed to read CSV: {e}") from e
-
-#     # Drop completely empty rows
-#     df = df.fillna("").astype(str)
-#     df["key"] = df["key"].str.strip()
-#     df["value"] = df["value"].str.strip()
-#     df = df[(df["key"] != "") & (df["key"].notna())]
-
-#     if df.empty:
-#         raise click.ClickException("Config CSV is empty after removing comments/blank lines.")
-
-#     # Remove an optional header row if present
-#     if df.iloc[0]["key"].lower() in {"key", "keys"}:
-#         df = df.iloc[1:]
-#         df = df[(df["key"] != "") & (df["key"].notna())]
-#         if df.empty:
-#             raise click.ClickException("Config CSV contains only a header row.")
-
-#     # Build dict; last occurrence of a key wins
-#     cfg = {}
-#     for k, v in zip(df["key"], df["value"]):
-#         cfg[k] = v
-
-#     # Validate required keys
-#     missing = [k for k in REQUIRED_KEYS if not cfg.get(k)]
-#     if missing:
-#         raise click.ClickException(
-#             "Missing required keys in CSV: "
-#             + ", ".join(missing)
-#             + "\nExpected keys:\n - "
-#             + "\n - ".join(REQUIRED_KEYS)
-#             + "\nOptional keys:\n - "
-#             + "\n - ".join(OPTIONAL_KEYS)
-#         )
-
-#     return cfg
-
-# def _resolve_output_path(cfg: Dict[str, str], single_path: Path, double_path: Path) -> Path:
-#     """Decide on the output .h5ad path based on CSV; create directories if needed."""
-#     merged_filename = cfg.get("merged_filename") or f"merged_{single_path.stem}__{double_path.stem}.h5ad"
-#     if not merged_filename.endswith(".h5ad"):
-#         merged_filename += ".h5ad"
-
-#     output_path_raw = cfg.get("output_path", "").strip()
-
-#     if not output_path_raw:
-#         out_dir = Path.cwd() / "merged_output"
-#         out_dir.mkdir(parents=True, exist_ok=True)
-#         return out_dir / merged_filename
-
-#     output_path = Path(output_path_raw)
-
-#     if output_path.suffix.lower() == ".h5ad":
-#         output_path.parent.mkdir(parents=True, exist_ok=True)
-#         return output_path
-
-#     # Treat as directory
-#     output_path.mkdir(parents=True, exist_ok=True)
-#     return output_path / merged_filename
-
-# def _maybe_read_adata(label: str, primary: Path, backups: Optional[Path]):
-
-#     if backups:
-#         click.echo(f"Loading {label} from {primary} with backups at {backups} ...")
-#         return safe_read_h5ad(primary, backups_path=backups, restore_backups=True)
-#     else:
-#         click.echo(f"Loading {label} from {primary} with backups disabled ...")
-#         return safe_read_h5ad(primary, restore_backups=False)
-
-
-# @cli.command()
-# @click.argument("config_path", type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path))
-# def merge_barcoded_anndatas(config_path: Path):
-#     """
-#     Merge two AnnData objects from the same experiment that were demultiplexed
-#     under different end-barcoding requirements, using a 1-row CSV for config.
-
-#     CSV must include:
-#     - adata_single_path
-#     - adata_double_path
-
-#     Optional columns:
-#     - adata_single_backups_path
-#     - adata_double_backups_path
-#     - output_path (file or directory; default: ./merged_output/)
-#     - merged_filename (default: merged_<single>__<double>.h5ad)
-
-#     Example CSV:
-
-#     adata_single_path,adata_double_path,adata_single_backups_path,adata_double_backups_path,output_path,merged_filename
-#     /path/single.h5ad,/path/double.h5ad,,,,merged_output,merged_run.h5ad
-#     """
-#     try:
-#         cfg = _read_config_csv(config_path)
-
-#         single_path = Path(cfg["adata_single_path"]).expanduser().resolve()
-#         double_path = Path(cfg["adata_double_path"]).expanduser().resolve()
-
-#         for p, label in [(single_path, "adata_single_path"), (double_path, "adata_double_path")]:
-#             if not p.exists():
-#                 raise click.ClickException(f"{label} does not exist: {p}")
-
-#         single_backups = Path(cfg["adata_single_backups_path"]).expanduser().resolve() if cfg.get("adata_single_backups_path") else None
-#         double_backups = Path(cfg["adata_double_backups_path"]).expanduser().resolve() if cfg.get("adata_double_backups_path") else None
-
-#         if single_backups and not single_backups.exists():
-#             raise click.ClickException(f"adata_single_backups_path does not exist: {single_backups}")
-#         if double_backups and not double_backups.exists():
-#             raise click.ClickException(f"adata_double_backups_path does not exist: {double_backups}")
-
-#         output_path = _resolve_output_path(cfg, single_path, double_path)
-
-#         # Load
-#         adata_single, read_report_single = _maybe_read_adata("single-barcoded AnnData", single_path, single_backups)
-#         adata_double, read_report_double = _maybe_read_adata("double-barcoded AnnData", double_path, double_backups)
-
-#         click.echo("Merging AnnDatas ...")
-#         merged = merge_barcoded_anndatas_core(adata_single, adata_double)
-
-#         click.echo(f"Writing merged AnnData to: {output_path}")
-#         backup_dir = output_path.cwd() / "merged_backups"
-#         safe_write_h5ad(merged, output_path, backup=True, backup_dir=backup_dir)
-
-#         click.secho(f"Done. Merged AnnData saved to {output_path}", fg="green")
-
-#     except click.ClickException:
-#         raise
-#     except Exception as e:
-#         # Surface unexpected errors cleanly
-#         raise click.ClickException(f"Unexpected error: {e}") from e
-################################################################################################################
+
+####### subsample pod5 command ###########
+@cli.command("subsample-pod5")
+@click.argument(
+    "pod5_path",
+    type=click.Path(exists=True, path_type=Path),
+)
+@click.option(
+    "--read-names",
+    "-r",
+    type=click.Path(exists=True, path_type=Path),
+    default=None,
+    help="Text file with one read_id per line.",
+)
+@click.option(
+    "--n-reads",
+    "-n",
+    type=int,
+    default=None,
+    help="Randomly subsample N reads.",
+)
+@click.option(
+    "--outdir",
+    "-o",
+    type=click.Path(path_type=Path, file_okay=False),
+    required=True,
+    help="Output directory for subsampled POD5.",
+)
+def subsample_pod5_cmd(pod5_path, read_names, n_reads, outdir):
+    """
+    Subsample POD5 file(s) by read ID list or random sampling.
+    """
+
+    # --- Validate mutually exclusive options ---
+    if (read_names is None and n_reads is None) or (read_names and n_reads):
+        raise click.UsageError("You must specify exactly ONE of --read-names or --n-reads.")
+
+    outdir.mkdir(parents=True, exist_ok=True)
+
+    subsample_arg = str(read_names) if read_names else n_reads
+
+    subsample_pod5(
+        pod5_path=str(pod5_path),
+        read_name_path=subsample_arg,
+        output_directory=str(outdir),
+    )
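The new command is a thin wrapper over subsample_pod5 with exactly one selection mode per run. A sketch (the POD5 path is hypothetical and must exist, since click.Path(exists=True) validates it before the callback runs):

    from click.testing import CliRunner

    from smftools.cli_entry import cli

    runner = CliRunner()
    # Random mode: keep 1000 reads. Swap in --read-names ids.txt for an explicit
    # read list; passing both modes (or neither) raises a UsageError.
    result = runner.invoke(
        cli, ["subsample-pod5", "reads.pod5", "--n-reads", "1000", "--outdir", "subsampled"]
    )
    print(result.output)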
smftools/config/__init__.py CHANGED
@@ -1 +1 @@
-from .experiment_config import LoadExperimentConfig, ExperimentConfig
+from .experiment_config import ExperimentConfig, LoadExperimentConfig
smftools/config/conversion.yaml CHANGED
@@ -9,6 +9,10 @@ conversion_types:
 # Read QC Params
 read_mod_filtering_use_other_c_as_background: True
 
+# Spatial Analysis - Autocorr params
+autocorr_site_types:
+  - "GpC"
+
 # Spatial Analysis - Clustermap params
 layer_for_clustermap_plotting: 'nan0_0minus1'
 clustermap_cmap_c: "coolwarm"
@@ -42,4 +46,4 @@ hmm_feature_sets:
     cpg_patch: [0, inf]
 
 hmm_merge_layer_features:
-  - ["GpC_all_accessible_features", 80]
+  - ["all_accessible_features", 60]
smftools/config/deaminase.yaml CHANGED
@@ -60,4 +60,4 @@ hmm_feature_sets:
     nucleosome_depleted_region: [110, inf]
 
 hmm_merge_layer_features:
-  - ["C_all_accessible_features", 80]
+  - ["all_accessible_features", 60]
smftools/config/default.yaml CHANGED
@@ -1,7 +1,7 @@
 # General
 sample_sheet_path: null # path to sample_sheet to load metadata into anndata.
-sample_sheet_mapping_column: 'Barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
-sample_name_col_for_plotting: 'Barcode'
+sample_sheet_mapping_column: 'Experiment_name_and_barcode' # The column in the sample sheet and current anndata to use for mapping metadata.
+sample_name_col_for_plotting: 'Experiment_name_and_barcode'
 
 # Compute params
 threads: 4
@@ -9,9 +9,7 @@ device: "auto"
 
 ######## smftools load params #########
 # Generic i/o
-bam_suffix: ".bam"
 recursive_input_search: True
-split_dir: "demultiplexed_BAMs"
 strands:
   - bottom
   - top
@@ -21,7 +19,7 @@ fastq_barcode_map: null # For FASTQ files, an optional map of file paths to barc
 fastq_auto_pairing: True # For FASTQ files, attempt to find read pair files automatically.
 input_already_demuxed: False # If the input files are already demultiplexed.
 delete_intermediate_hdfs: True # Whether to delete the intermediate hdfs from the conversion/deamination workflows.
-delete_intermediate_bams: False # Whether to delete intermediate BAM files.
+delete_intermediate_bams: True # Whether to delete intermediate BAM files.
 delete_intermediate_tsvs: True # Whether to delete intermediate TSV files.
 
 # Sequencing modality and general experiment params
@@ -53,7 +51,6 @@
     - '-y'
    - '-N'
    - '5'
-    - '--secondary=no'
   pacbio:
     - '-a'
     - '-x'
@@ -63,7 +60,6 @@
     - '-y'
     - '-N'
     - '5'
-    - '--secondary=no'
   illumina:
     - '-a'
     - '-x'
@@ -73,7 +69,6 @@
     - '-y'
     - '-N'
     - '5'
-    - '--secondary=no'
   dorado:
     ont:
       - "--mm2-opts"
@@ -88,9 +83,9 @@ barcode_both_ends: False # dorado demultiplexing
 trim: False # dorado adapter and barcode removal during demultiplexing
 
 # Anndata structure
-mapping_threshold: 0.01 # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
+mapping_threshold: 0.10 # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
 reference_column: 'Reference_strand'
-sample_column: 'Barcode'
+sample_column: 'Experiment_name_and_barcode'
 
 ######## smftools preprocess params #########
 # Read length, quality, and mapping filtering params
@@ -101,7 +96,7 @@ read_len_filter_thresholds:
   - 100
   - null
 read_len_to_ref_ratio_filter_thresholds:
-  - 0.5
+  - null
   - null
 read_quality_filter_thresholds:
   - 15
@@ -179,13 +174,12 @@ umap_layers_to_plot:
   - "Raw_modification_signal"
 
 # Spatial Analysis - Spatial Autocorrelation params
+autocorr_normalization_method: "pearson" # options are pearson or sum
 rows_per_qc_autocorr_grid: 6
 autocorr_rolling_window_size: 25
 autocorr_max_lag: 800
 autocorr_site_types:
   - "GpC"
-  - "CpG"
-  - "C"
 
 # Spatial Analysis - Correlation Matrix params
 correlation_matrix_types:
@@ -210,10 +204,19 @@
   - 0.5
   - 0.5
 hmm_eps: 1e-8
+# Fitting strategy
+hmm_fit_strategy: "per_group" # "per_group" | "shared_transitions"
+hmm_shared_scope: ["reference", "methbase"]
+hmm_groupby: ["sample", "reference", "methbase"]
+# If hmm_fit_strategy == shared_transitions
+hmm_adapt_emissions: true
+hmm_adapt_startprobs: true
+hmm_emission_adapt_iters: 5
+hmm_emission_adapt_tol: 1.0e-4
 hmm_dtype: "float64"
-hmm_annotation_threshold: 0.5
-hmm_batch_size: 1024
-hmm_use_viterbi: False
+hmm_annotation_threshold: 0.5 # The minimum probability threshold of a feature interval to accept it for layer annotation.
+hmm_batch_size: 1024 # hmm batch size
+hmm_use_viterbi: False # Whether to use viterbi decoding. If False, uses forward-backward gammas. Viterbi is smoother, but less sensitive.
 footprints: True # whether to use the default HMM footprint params
 accessible_patches: True # whether to use the default HMM accessible patch params
 cpg: False # whether to use the default HMM endogenous CpG patch params
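Per the inline comments, per_group fits an HMM independently for each hmm_groupby group, while shared_transitions shares transition parameters across the hmm_shared_scope and then lets each group re-estimate emissions and start probabilities under the iteration and tolerance caps. A sketch that only parses an override to show the expected value shapes (the values are illustrative):

    import yaml

    # Share transitions across (reference, methbase); adapt emissions and
    # start probabilities per (sample, reference, methbase) group.
    override = yaml.safe_load("""
    hmm_fit_strategy: "shared_transitions"
    hmm_shared_scope: ["reference", "methbase"]
    hmm_groupby: ["sample", "reference", "methbase"]
    hmm_adapt_emissions: true
    hmm_adapt_startprobs: true
    hmm_emission_adapt_iters: 5
    hmm_emission_adapt_tol: 1.0e-4
    """)
    assert override["hmm_fit_strategy"] in {"per_group", "shared_transitions"}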
@@ -238,7 +241,7 @@
     large_accessible_patch: [40, 110]
     nucleosome_depleted_region: [110, inf]
 hmm_merge_layer_features:
-  - [null, 80]
+  - ["all_accessible_features", 60]
 clustermap_cmap_hmm: "coolwarm"
 hmm_clustermap_feature_layers:
   - all_accessible_features
@@ -246,7 +249,9 @@ hmm_clustermap_feature_layers:
   - small_accessible_patch
   - mid_accessible_patch
   - large_accessible_patch
+  - large_accessible_patch_merged
   - nucleosome_depleted_region
+  - nucleosome_depleted_region_merged
   - small_bound_stretch
   - medium_bound_stretch
   - putative_nucleosome
smftools/config/direct.yaml CHANGED
@@ -27,10 +27,10 @@ delete_batch_hdfs: True # Whether to delete intermediate barcode level hdfs afte
 
 ######## smftools preprocess params ########
 fit_position_methylation_thresholds: False # Whether to use Youden J-stat to determine position by positions thresholds for modification binarization.
-binarize_on_fixed_methlyation_threshold: 0.7 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
+binarize_on_fixed_methlyation_threshold: 0.5 # The threshold used to binarize the anndata using a fixed value if fitting parameter above is False.
 positive_control_sample_methylation_fitting: null # A positive control Sample_name to use for fully modified template data
 negative_control_sample_methylation_fitting: null # A negative control Sample_name to use for fully unmodified template data
-infer_on_percentile_sample_methylation_fitting: 10 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
+infer_on_percentile_sample_methylation_fitting: 5 # If a positive/negative control are not provided and fitting the data is requested, use the indicated percentile windows from the top and bottom of the dataset.
 inference_variable_sample_methylation_fitting: "Raw_modification_signal" # The obs column value used for the percentile metric above.
 fit_j_threshold: 0.5 # The J-statistic threhold to use for determining which positions pass qc for mod detection thresholding
 output_binary_layer_name: "binarized_methylation" # The layer to store the binarized methylation data in
@@ -39,6 +39,11 @@ output_binary_layer_name: "binarized_methylation" # The layer to store the binar
 autocorr_site_types:
   - "A"
 
+spatial_clustermap_sortby: "a"
+
 ######## smftools hmm params #########
 hmm_methbases:
-- "A"
+  - "A"
+
+hmm_merge_layer_features:
+  - ["A_all_accessible_features", 60]
smftools/config/discover_input_files.py CHANGED
@@ -1,11 +1,14 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Dict, List, Any, Iterable, Union
+from typing import Any, Dict, List, Union
+
+from smftools.constants import BAM_SUFFIX
+
 
 def discover_input_files(
     input_data_path: Union[str, Path],
-    bam_suffix: str = ".bam",
+    bam_suffix: str = BAM_SUFFIX,
     recursive: bool = False,
     follow_symlinks: bool = False,
 ) -> Dict[str, Any]:
@@ -30,10 +33,21 @@
     bam_suffix = bam_suffix.lower()
 
     # Sets of canonical extension keys we’ll compare against
-    pod5_exts  = {".pod5", ".p5"}
+    pod5_exts = {".pod5", ".p5"}
     fast5_exts = {".fast5", ".f5"}
-    fastq_exts = {".fastq", ".fq", ".fastq.gz", ".fq.gz", ".fastq.bz2", ".fq.bz2", ".fastq.xz", ".fq.xz", ".fastq.zst", ".fq.zst"}
-    h5ad_exts  = {".h5ad", ".h5"}
+    fastq_exts = {
+        ".fastq",
+        ".fq",
+        ".fastq.gz",
+        ".fq.gz",
+        ".fastq.bz2",
+        ".fq.bz2",
+        ".fastq.xz",
+        ".fq.xz",
+        ".fastq.zst",
+        ".fq.zst",
+    }
+    h5ad_exts = {".h5ad", ".h5"}
     compressed_exts = {".gz", ".bz2", ".xz", ".zst"}
 
     def ext_key(pp: Path) -> str:
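discover_input_files now sources its default BAM suffix from smftools.constants.BAM_SUFFIX instead of a hard-coded ".bam". A minimal usage sketch (the directory is hypothetical; this hunk shows the category sets being built but not the exact keys of the returned dict):

    from smftools.config.discover_input_files import discover_input_files

    # Recursively scan a run directory; extension matching is lower-cased and
    # compression-aware, so ".fq.gz" counts as FASTQ and ".p5" as POD5.
    found = discover_input_files("/data/run01", recursive=True)
    print(sorted(found))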