smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
@@ -15,7 +15,7 @@ def filter_reads_on_modification_thresholds(
15
15
  a_thresholds: Optional[Sequence[float]] = None,
16
16
  use_other_c_as_background: bool = False,
17
17
  min_valid_fraction_positions_in_read_vs_ref: Optional[float] = None,
18
- uns_flag: str = 'reads_filtered_on_modification_thresholds',
18
+ uns_flag: str = 'filter_reads_on_modification_thresholds_performed',
19
19
  bypass: bool = False,
20
20
  force_redo: bool = False,
21
21
  reference_column: str = 'Reference_strand',
@@ -31,9 +31,9 @@ def filter_reads_on_modification_thresholds(
31
31
  - Otherwise, computes the relevant per-read metrics per-reference in batches
32
32
  and writes them into adata.obs before filtering.
33
33
 
34
- Parameters of interest (same semantics as your original function):
34
+ Parameters of interest :
35
35
  - gpc_thresholds, cpg_thresholds, any_c_thresholds, a_thresholds:
36
- each should be [min, max] (floats 0..1) or None.
36
+ each should be [min, max] (floats 0..1) or None. Thresholds are inclusive.
37
37
  - use_other_c_as_background: require GpC/CpG > other_C background (if present).
38
38
  - min_valid_fraction_positions_in_read_vs_ref: minimum fraction of valid sites
39
39
  in the read vs reference (0..1). If None, this check is skipped.
@@ -53,7 +53,7 @@ def filter_reads_on_modification_thresholds(
53
53
  col_pref = {
54
54
  "GpC": ("Fraction_GpC_site_modified", f"Valid_GpC_site_in_read_vs_reference"),
55
55
  "CpG": ("Fraction_CpG_site_modified", f"Valid_CpG_site_in_read_vs_reference"),
56
- "C": ("Fraction_any_C_site_modified", f"Valid_any_C_site_in_read_vs_reference"),
56
+ "C": ("Fraction_C_site_modified", f"Valid_C_site_in_read_vs_reference"),
57
57
  "A": ("Fraction_A_site_modified", f"Valid_A_site_in_read_vs_reference"),
58
58
  }.get(mod_type, (None, None))
59
59
  return (col_pref[0] in adata.obs.columns) and (col_pref[1] in adata.obs.columns)
@@ -99,8 +99,8 @@ def filter_reads_on_modification_thresholds(
99
99
  create_cols["Valid_CpG_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
100
100
  create_cols["CpG_to_other_C_mod_ratio"] = np.full((n_obs,), np.nan)
101
101
  if "C" in mod_target_bases:
102
- create_cols["Fraction_any_C_site_modified"] = np.full((n_obs,), np.nan)
103
- create_cols["Valid_any_C_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
102
+ create_cols["Fraction_C_site_modified"] = np.full((n_obs,), np.nan)
103
+ create_cols["Valid_C_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
104
104
  if "A" in mod_target_bases:
105
105
  create_cols["Fraction_A_site_modified"] = np.full((n_obs,), np.nan)
106
106
  create_cols["Valid_A_site_in_read_vs_reference"] = np.full((n_obs,), np.nan)
@@ -227,7 +227,7 @@ def filter_reads_on_modification_thresholds(
227
227
  # any C
228
228
  if "C" in mod_target_bases:
229
229
  for ref in refs:
230
- _compute_for_ref_and_suffix(ref, "any_C_site", create_cols["Fraction_any_C_site_modified"], create_cols["Valid_any_C_site_in_read_vs_reference"])
230
+ _compute_for_ref_and_suffix(ref, "C_site", create_cols["Fraction_C_site_modified"], create_cols["Valid_C_site_in_read_vs_reference"])
231
231
 
232
232
  # A
233
233
  if "A" in mod_target_bases:
@@ -283,15 +283,15 @@ def filter_reads_on_modification_thresholds(
283
283
  filtered = filtered[filtered.obs["GpC_to_other_C_mod_ratio"].astype(float) > 1]
284
284
  if lo is not None:
285
285
  s0 = filtered.n_obs
286
- filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) > lo]
286
+ filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) >= lo]
287
287
  print(f"Removed {s0 - filtered.n_obs} reads below min GpC fraction {lo}")
288
288
  if hi is not None:
289
289
  s0 = filtered.n_obs
290
- filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) < hi]
290
+ filtered = filtered[filtered.obs["Fraction_GpC_site_modified"].astype(float) <= hi]
291
291
  print(f"Removed {s0 - filtered.n_obs} reads above max GpC fraction {hi}")
292
292
  if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_GpC_site_in_read_vs_reference" in filtered.obs.columns):
293
293
  s0 = filtered.n_obs
294
- filtered = filtered[filtered.obs["Valid_GpC_site_in_read_vs_reference"].astype(float) > float(min_valid_fraction_positions_in_read_vs_ref)]
294
+ filtered = filtered[filtered.obs["Valid_GpC_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
295
295
  print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid GpC site fraction vs ref")
296
296
 
297
297
  # CpG thresholds
@@ -301,15 +301,15 @@ def filter_reads_on_modification_thresholds(
301
301
  filtered = filtered[filtered.obs["CpG_to_other_C_mod_ratio"].astype(float) > 1]
302
302
  if lo is not None:
303
303
  s0 = filtered.n_obs
304
- filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) > lo]
304
+ filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) >= lo]
305
305
  print(f"Removed {s0 - filtered.n_obs} reads below min CpG fraction {lo}")
306
306
  if hi is not None:
307
307
  s0 = filtered.n_obs
308
- filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) < hi]
308
+ filtered = filtered[filtered.obs["Fraction_CpG_site_modified"].astype(float) <= hi]
309
309
  print(f"Removed {s0 - filtered.n_obs} reads above max CpG fraction {hi}")
310
310
  if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_CpG_site_in_read_vs_reference" in filtered.obs.columns):
311
311
  s0 = filtered.n_obs
312
- filtered = filtered[filtered.obs["Valid_CpG_site_in_read_vs_reference"].astype(float) > float(min_valid_fraction_positions_in_read_vs_ref)]
312
+ filtered = filtered[filtered.obs["Valid_CpG_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
313
313
  print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid CpG site fraction vs ref")
314
314
 
315
315
  # any C thresholds
@@ -317,15 +317,15 @@ def filter_reads_on_modification_thresholds(
317
317
  lo, hi = _unpack_minmax(any_c_thresholds)
318
318
  if lo is not None:
319
319
  s0 = filtered.n_obs
320
- filtered = filtered[filtered.obs["Fraction_any_C_site_modified"].astype(float) > lo]
320
+ filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) >= lo]
321
321
  print(f"Removed {s0 - filtered.n_obs} reads below min any-C fraction {lo}")
322
322
  if hi is not None:
323
323
  s0 = filtered.n_obs
324
- filtered = filtered[filtered.obs["Fraction_any_C_site_modified"].astype(float) < hi]
324
+ filtered = filtered[filtered.obs["Fraction_C_site_modified"].astype(float) <= hi]
325
325
  print(f"Removed {s0 - filtered.n_obs} reads above max any-C fraction {hi}")
326
- if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_any_C_site_in_read_vs_reference" in filtered.obs.columns):
326
+ if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_C_site_in_read_vs_reference" in filtered.obs.columns):
327
327
  s0 = filtered.n_obs
328
- filtered = filtered[filtered.obs["Valid_any_C_site_in_read_vs_reference"].astype(float) > float(min_valid_fraction_positions_in_read_vs_ref)]
328
+ filtered = filtered[filtered.obs["Valid_C_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
329
329
  print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid any-C site fraction vs ref")
330
330
 
331
331
  # A thresholds
@@ -333,15 +333,15 @@ def filter_reads_on_modification_thresholds(
333
333
  lo, hi = _unpack_minmax(a_thresholds)
334
334
  if lo is not None:
335
335
  s0 = filtered.n_obs
336
- filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) > lo]
336
+ filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) >= lo]
337
337
  print(f"Removed {s0 - filtered.n_obs} reads below min A fraction {lo}")
338
338
  if hi is not None:
339
339
  s0 = filtered.n_obs
340
- filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) < hi]
340
+ filtered = filtered[filtered.obs["Fraction_A_site_modified"].astype(float) <= hi]
341
341
  print(f"Removed {s0 - filtered.n_obs} reads above max A fraction {hi}")
342
342
  if (min_valid_fraction_positions_in_read_vs_ref is not None) and ("Valid_A_site_in_read_vs_reference" in filtered.obs.columns):
343
343
  s0 = filtered.n_obs
344
- filtered = filtered[filtered.obs["Valid_A_site_in_read_vs_reference"].astype(float) > float(min_valid_fraction_positions_in_read_vs_ref)]
344
+ filtered = filtered[filtered.obs["Valid_A_site_in_read_vs_reference"].astype(float) >= float(min_valid_fraction_positions_in_read_vs_ref)]
345
345
  print(f"Removed {s0 - filtered.n_obs} reads with insufficient valid A site fraction vs ref")
346
346
 
347
347
  filtered = filtered.copy()
@@ -13,7 +13,7 @@ import pandas as pd
13
13
  import matplotlib.pyplot as plt
14
14
  from tqdm import tqdm
15
15
 
16
- from ..informatics.helpers import make_dirs
16
+ from ..readwrite import make_dirs
17
17
 
18
18
  # optional imports for clustering / PCA / KDE
19
19
  try:
@@ -77,7 +77,7 @@ def flag_duplicate_reads(
77
77
  sample_col: str = "Barcode",
78
78
  output_directory: Optional[str] = None,
79
79
  metric_keys: Union[str, List[str]] = ("Fraction_any_C_site_modified",),
80
- uns_flag: str = "read_duplicate_detection_performed",
80
+ uns_flag: str = "flag_duplicate_reads_performed",
81
81
  uns_filtered_flag: str = "read_duplicates_removed",
82
82
  bypass: bool = False,
83
83
  force_redo: bool = False,
@@ -1,6 +1,6 @@
1
1
  ## invert_adata
2
2
 
3
- def invert_adata(adata, uns_flag='adata_positions_inverted', force_redo=False):
3
+ def invert_adata(adata, uns_flag='invert_adata_performed', force_redo=False):
4
4
  """
5
5
  Inverts the AnnData object along the column (variable) axis.
6
6
 
@@ -2,7 +2,7 @@ def load_sample_sheet(adata,
2
2
  sample_sheet_path,
3
3
  mapping_key_column='obs_names',
4
4
  as_category=True,
5
- uns_flag='sample_sheet_loaded',
5
+ uns_flag='load_sample_sheet_performed',
6
6
  force_reload=True
7
7
  ):
8
8
  """
@@ -0,0 +1,37 @@
1
+ ## reindex_references_adata
2
+
3
def reindex_references_adata(adata,
                             reference_col="Reference_strand",
                             offsets=None,
                             new_col="reindexed",
                             uns_flag='reindex_references_adata_performed',
                             force_redo=False):
    """
    Add offset-shifted coordinate columns to adata.var, one per reference.

    For every unique value ``ref`` found in ``adata.obs[reference_col]`` that
    is also a key of ``offsets``, a column named ``f"{ref}_{new_col}"`` is
    written to ``adata.var`` holding ``int(var_name) + offsets[ref]`` for each
    variable. References without an entry in ``offsets`` are left untouched.

    Parameters:
        adata: Object exposing ``uns``, ``obs``, ``var`` and ``var_names``
            (presumably an AnnData object -- confirm against callers).
            It is modified in place.
        reference_col (str): Column of ``adata.obs`` holding the reference labels.
        offsets (dict | None): Mapping of reference label -> value added to
            every var position. If None, no columns are created.
        new_col (str): Suffix used when naming the new ``adata.var`` columns.
        uns_flag (str): Key in ``adata.uns`` recording that this step has run.
        force_redo (bool): Run even if ``adata.uns[uns_flag]`` is already truthy.

    Returns:
        None. ``adata.var`` and ``adata.uns`` are updated in place.

    Notes:
        - ``adata.var_names`` is cast with ``astype(int)``; that cast raises if
          the names are not integer-like.
        - The flag is set and the completion message is printed even when
          ``offsets`` is None, i.e. when nothing was reindexed.
    """
    # Only run if not already performed
    if adata.uns.get(uns_flag, False) and not force_redo:
        return None

    if offsets is not None:
        # Positions are derived from var_names, so they must be integer-like
        var_coords = adata.var_names.astype(int)

        for ref in adata.obs[reference_col].unique():
            # References with no configured offset get no column
            if ref not in offsets:
                continue

            # One var column per reference: all var positions shifted by its offset
            adata.var[f"{ref}_{new_col}"] = var_coords + offsets[ref]

    # mark as done
    adata.uns[uns_flag] = True

    print("Reindexing complete!")
    return None