smftools 0.2.4__py3-none-any.whl → 0.2.5__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (133)
  1. smftools/__init__.py +6 -8
  2. smftools/_settings.py +4 -6
  3. smftools/_version.py +1 -1
  4. smftools/cli/helpers.py +7 -1
  5. smftools/cli/hmm_adata.py +902 -244
  6. smftools/cli/load_adata.py +318 -198
  7. smftools/cli/preprocess_adata.py +285 -171
  8. smftools/cli/spatial_adata.py +137 -53
  9. smftools/cli_entry.py +94 -178
  10. smftools/config/__init__.py +1 -1
  11. smftools/config/conversion.yaml +5 -1
  12. smftools/config/deaminase.yaml +1 -1
  13. smftools/config/default.yaml +22 -17
  14. smftools/config/direct.yaml +8 -3
  15. smftools/config/discover_input_files.py +19 -5
  16. smftools/config/experiment_config.py +505 -276
  17. smftools/constants.py +37 -0
  18. smftools/datasets/__init__.py +2 -8
  19. smftools/datasets/datasets.py +32 -18
  20. smftools/hmm/HMM.py +2125 -1426
  21. smftools/hmm/__init__.py +2 -3
  22. smftools/hmm/archived/call_hmm_peaks.py +16 -1
  23. smftools/hmm/call_hmm_peaks.py +173 -193
  24. smftools/hmm/display_hmm.py +19 -6
  25. smftools/hmm/hmm_readwrite.py +13 -4
  26. smftools/hmm/nucleosome_hmm_refinement.py +102 -14
  27. smftools/informatics/__init__.py +30 -7
  28. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +14 -1
  29. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  30. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  31. smftools/informatics/archived/helpers/archived/load_adata.py +3 -3
  32. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +3 -1
  33. smftools/informatics/archived/print_bam_query_seq.py +7 -1
  34. smftools/informatics/bam_functions.py +379 -156
  35. smftools/informatics/basecalling.py +51 -9
  36. smftools/informatics/bed_functions.py +90 -57
  37. smftools/informatics/binarize_converted_base_identities.py +18 -7
  38. smftools/informatics/complement_base_list.py +7 -6
  39. smftools/informatics/converted_BAM_to_adata.py +265 -122
  40. smftools/informatics/fasta_functions.py +161 -83
  41. smftools/informatics/h5ad_functions.py +195 -29
  42. smftools/informatics/modkit_extract_to_adata.py +609 -270
  43. smftools/informatics/modkit_functions.py +85 -44
  44. smftools/informatics/ohe.py +44 -21
  45. smftools/informatics/pod5_functions.py +112 -73
  46. smftools/informatics/run_multiqc.py +20 -14
  47. smftools/logging_utils.py +51 -0
  48. smftools/machine_learning/__init__.py +2 -7
  49. smftools/machine_learning/data/anndata_data_module.py +143 -50
  50. smftools/machine_learning/data/preprocessing.py +2 -1
  51. smftools/machine_learning/evaluation/__init__.py +1 -1
  52. smftools/machine_learning/evaluation/eval_utils.py +11 -14
  53. smftools/machine_learning/evaluation/evaluators.py +46 -33
  54. smftools/machine_learning/inference/__init__.py +1 -1
  55. smftools/machine_learning/inference/inference_utils.py +7 -4
  56. smftools/machine_learning/inference/lightning_inference.py +9 -13
  57. smftools/machine_learning/inference/sklearn_inference.py +6 -8
  58. smftools/machine_learning/inference/sliding_window_inference.py +35 -25
  59. smftools/machine_learning/models/__init__.py +10 -5
  60. smftools/machine_learning/models/base.py +28 -42
  61. smftools/machine_learning/models/cnn.py +15 -11
  62. smftools/machine_learning/models/lightning_base.py +71 -40
  63. smftools/machine_learning/models/mlp.py +13 -4
  64. smftools/machine_learning/models/positional.py +3 -2
  65. smftools/machine_learning/models/rnn.py +3 -2
  66. smftools/machine_learning/models/sklearn_models.py +39 -22
  67. smftools/machine_learning/models/transformer.py +68 -53
  68. smftools/machine_learning/models/wrappers.py +2 -1
  69. smftools/machine_learning/training/__init__.py +2 -2
  70. smftools/machine_learning/training/train_lightning_model.py +29 -20
  71. smftools/machine_learning/training/train_sklearn_model.py +9 -15
  72. smftools/machine_learning/utils/__init__.py +1 -1
  73. smftools/machine_learning/utils/device.py +7 -4
  74. smftools/machine_learning/utils/grl.py +3 -1
  75. smftools/metadata.py +443 -0
  76. smftools/plotting/__init__.py +19 -5
  77. smftools/plotting/autocorrelation_plotting.py +145 -44
  78. smftools/plotting/classifiers.py +162 -72
  79. smftools/plotting/general_plotting.py +347 -168
  80. smftools/plotting/hmm_plotting.py +42 -13
  81. smftools/plotting/position_stats.py +145 -85
  82. smftools/plotting/qc_plotting.py +20 -12
  83. smftools/preprocessing/__init__.py +8 -8
  84. smftools/preprocessing/append_base_context.py +105 -79
  85. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  86. smftools/preprocessing/{archives → archived}/calculate_complexity.py +3 -1
  87. smftools/preprocessing/{archives → archived}/preprocessing.py +8 -6
  88. smftools/preprocessing/binarize.py +21 -4
  89. smftools/preprocessing/binarize_on_Youden.py +127 -31
  90. smftools/preprocessing/binary_layers_to_ohe.py +17 -11
  91. smftools/preprocessing/calculate_complexity_II.py +86 -59
  92. smftools/preprocessing/calculate_consensus.py +28 -19
  93. smftools/preprocessing/calculate_coverage.py +44 -22
  94. smftools/preprocessing/calculate_pairwise_differences.py +2 -1
  95. smftools/preprocessing/calculate_pairwise_hamming_distances.py +4 -3
  96. smftools/preprocessing/calculate_position_Youden.py +103 -55
  97. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  98. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  99. smftools/preprocessing/clean_NaN.py +38 -28
  100. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  101. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +70 -37
  102. smftools/preprocessing/filter_reads_on_modification_thresholds.py +181 -73
  103. smftools/preprocessing/flag_duplicate_reads.py +688 -271
  104. smftools/preprocessing/invert_adata.py +26 -11
  105. smftools/preprocessing/load_sample_sheet.py +40 -22
  106. smftools/preprocessing/make_dirs.py +8 -3
  107. smftools/preprocessing/min_non_diagonal.py +2 -1
  108. smftools/preprocessing/recipes.py +56 -23
  109. smftools/preprocessing/reindex_references_adata.py +93 -27
  110. smftools/preprocessing/subsample_adata.py +33 -16
  111. smftools/readwrite.py +264 -109
  112. smftools/schema/__init__.py +11 -0
  113. smftools/schema/anndata_schema_v1.yaml +227 -0
  114. smftools/tools/__init__.py +3 -4
  115. smftools/tools/archived/classifiers.py +163 -0
  116. smftools/tools/archived/subset_adata_v1.py +10 -1
  117. smftools/tools/archived/subset_adata_v2.py +12 -1
  118. smftools/tools/calculate_umap.py +54 -15
  119. smftools/tools/cluster_adata_on_methylation.py +115 -46
  120. smftools/tools/general_tools.py +70 -25
  121. smftools/tools/position_stats.py +229 -98
  122. smftools/tools/read_stats.py +50 -29
  123. smftools/tools/spatial_autocorrelation.py +365 -192
  124. smftools/tools/subset_adata.py +23 -21
  125. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/METADATA +15 -43
  126. smftools-0.2.5.dist-info/RECORD +181 -0
  127. smftools-0.2.4.dist-info/RECORD +0 -176
  128. /smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +0 -0
  129. /smftools/preprocessing/{archives → archived}/mark_duplicates.py +0 -0
  130. /smftools/preprocessing/{archives → archived}/remove_duplicates.py +0 -0
  131. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/WHEEL +0 -0
  132. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/entry_points.txt +0 -0
  133. {smftools-0.2.4.dist-info → smftools-0.2.5.dist-info}/licenses/LICENSE +0 -0

smftools/preprocessing/binarize_on_Youden.py
@@ -1,47 +1,143 @@
- def binarize_on_Youden(adata,
-                        ref_column='Reference_strand',
-                        output_layer_name='binarized_methylation'):
-     """
-     Binarize SMF values based on position thresholds determined by calculate_position_Youden.
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ from smftools.logging_utils import get_logger
+
+ if TYPE_CHECKING:
+     import anndata as ad

-     Parameters:
-         adata (AnnData): The anndata object to binarize. `calculate_position_Youden` must have been run first.
-         obs_column (str): The obs column to stratify on. Needs to match what was passed in `calculate_position_Youden`.
+ logger = get_logger(__name__)

-     Modifies:
-         Adds a new layer to `adata.layers['binarized_methylation']` containing the binarized methylation matrix.
+
+ def binarize_on_Youden(
+     adata: "ad.AnnData",
+     ref_column: str = "Reference_strand",
+     output_layer_name: str = "binarized_methylation",
+     mask_failed_positions: bool = True,
+ ) -> None:
+     """Binarize SMF values using thresholds from ``calculate_position_Youden``.
+
+     Args:
+         adata: AnnData object to binarize.
+         ref_column: Obs column denoting reference/strand categories.
+         output_layer_name: Layer in which to store the binarized matrix.
+         mask_failed_positions: If ``True``, positions that failed Youden QC are set to NaN;
+             otherwise all positions are binarized.
      """
+
      import numpy as np
-     import anndata as ad

-     # Initialize an empty matrix to store the binarized methylation values
-     binarized_methylation = np.full_like(adata.X, np.nan, dtype=float) # Keeps same shape as adata.X
+     # Extract dense X once
+     X = adata.X
+     if hasattr(X, "toarray"):  # sparse → dense
+         X = X.toarray()
+
+     n_obs, n_var = X.shape
+     binarized = np.full((n_obs, n_var), np.nan, dtype=float)

-     # Get unique categories
      references = adata.obs[ref_column].cat.categories
+     ref_labels = adata.obs[ref_column].to_numpy()

      for ref in references:
-         # Select subset for this category
-         ref_mask = adata.obs[ref_column] == ref
-         ref_subset = adata[ref_mask]
+         logger.info("Binarizing on Youden statistics for %s", ref)
+
+         ref_mask = ref_labels == ref
+         if not np.any(ref_mask):
+             continue
+
+         X_block = X[ref_mask, :].astype(float, copy=True)
+
+         # thresholds: list of (threshold, J)
+         youden_stats = adata.var[f"{ref}_position_methylation_thresholding_Youden_stats"].to_numpy()
+
+         thresholds = np.array(
+             [t[0] if isinstance(t, (tuple, list)) else np.nan for t in youden_stats],
+             dtype=float,
+         )
+
+         # QC mask
+         qc_mask = adata.var[f"{ref}_position_passed_Youden_thresholding_QC"].to_numpy().astype(bool)
+
+         if mask_failed_positions:
+             # Only binarize positions passing QC
+             cols_to_binarize = np.where(qc_mask)[0]
+         else:
+             # Binarize all positions
+             cols_to_binarize = np.arange(n_var)
+
+         # Prepare result block
+         block_out = np.full_like(X_block, np.nan, dtype=float)
+
+         if len(cols_to_binarize) > 0:
+             sub_X = X_block[:, cols_to_binarize]
+             sub_thresh = thresholds[cols_to_binarize]
+
+             nan_mask = np.isnan(sub_X)
+
+             bin_sub = (sub_X > sub_thresh[None, :]).astype(float)
+             bin_sub[nan_mask] = np.nan
+
+             block_out[:, cols_to_binarize] = bin_sub
+
+         # Write into full output matrix
+         binarized[ref_mask, :] = block_out
+
+     adata.layers[output_layer_name] = binarized
+     logger.info(
+         "Finished binarization → stored in adata.layers['%s'] (mask_failed_positions=%s)",
+         output_layer_name,
+         mask_failed_positions,
+     )
+
+
+ # def binarize_on_Youden(adata,
+ #                        ref_column='Reference_strand',
+ #                        output_layer_name='binarized_methylation'):
+ #     """
+ #     Binarize SMF values based on position thresholds determined by calculate_position_Youden.
+
+ #     Parameters:
+ #         adata (AnnData): The anndata object to binarize. `calculate_position_Youden` must have been run first.
+ #         obs_column (str): The obs column to stratify on. Needs to match what was passed in `calculate_position_Youden`.
+
+ #     Modifies:
+ #         Adds a new layer to `adata.layers['binarized_methylation']` containing the binarized methylation matrix.
+ #     """
+ #     import numpy as np
+ #     import anndata as ad
+
+ #     # Initialize an empty matrix to store the binarized methylation values
+ #     binarized_methylation = np.full_like(adata.X, np.nan, dtype=float) # Keeps same shape as adata.X
+
+ #     # Get unique categories
+ #     references = adata.obs[ref_column].cat.categories
+
+ #     for ref in references:
+ #         print(f"Binarizing adata on Youden statistics for {ref}")
+ #         # Select subset for this category
+ #         ref_mask = adata.obs[ref_column] == ref
+ #         ref_subset = adata[ref_mask]
+
+ #         # Extract the probability matrix
+ #         original_matrix = ref_subset.X.copy()

-         # Extract the probability matrix
-         original_matrix = ref_subset.X.copy()
+ #         # Extract the thresholds for each position efficiently
+ #         thresholds = np.array(ref_subset.var[f'{ref}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))

-         # Extract the thresholds for each position efficiently
-         thresholds = np.array(ref_subset.var[f'{ref}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))
+ #         # Identify NaN values
+ #         nan_mask = np.isnan(original_matrix)

-         # Identify NaN values
-         nan_mask = np.isnan(original_matrix)
+ #         # Binarize based on threshold
+ #         binarized_matrix = (original_matrix > thresholds).astype(float)

-         # Binarize based on threshold
-         binarized_matrix = (original_matrix > thresholds).astype(float)
+ #         # Restore NaN values
+ #         binarized_matrix[nan_mask] = np.nan

-         # Restore NaN values
-         binarized_matrix[nan_mask] = np.nan
+ #         # Assign the binarized values back into the preallocated storage
+ #         binarized_methylation[ref_subset, :] = binarized_matrix

-         # Assign the binarized values back into the preallocated storage
-         binarized_methylation[ref_subset, :] = binarized_matrix
+ #         # Store the binarized matrix in a new layer
+ #         adata.layers[output_layer_name] = binarized_methylation

-         # Store the binarized matrix in a new layer
-         adata.layers[output_layer_name] = binarized_methylation
+ #     print(f"Finished binarizing adata on Youden statistics")

smftools/preprocessing/binary_layers_to_ohe.py
@@ -1,28 +1,34 @@
  ## binary_layers_to_ohe

- ## Conversion SMF Specific
- def binary_layers_to_ohe(adata, binary_layers, stack='hstack'):
+ from smftools.logging_utils import get_logger
+
+ logger = get_logger(__name__)
+
+
+ ## Conversion SMF Specific
+ def binary_layers_to_ohe(adata, binary_layers, stack="hstack"):
      """
      Parameters:
          adata (AnnData): Anndata object.
-         binary_layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix.
+         binary_layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix.
          stack (str): Dimension to stack the one-hot-encoding. Options include 'hstack' and 'vstack'. Default is 'hstack', since this is more efficient.
-
+
      Returns:
          ohe_dict (dict): A dictionary keyed by obs_name that points to a stacked (hstack or vstack) one-hot encoding of the binary layers
      Input: An adata object and a list of layers containing a binary encoding.
      """
      import numpy as np
-     import anndata as ad

      # Ensure that the N layer is last!
      # Grab all binary layers that are not encoding N
-     ACGT_binary_layers = [layer for layer in binary_layers if 'binary' in layer and layer != 'N_binary_encoding']
+     ACGT_binary_layers = [
+         layer for layer in binary_layers if "binary" in layer and layer != "N_binary_encoding"
+     ]
      # If there is a binary layer encoding N, hold it in N_binary_layer
-     N_binary_layer = [layer for layer in binary_layers if layer == 'N_binary_encoding']
+     N_binary_layer = [layer for layer in binary_layers if layer == "N_binary_encoding"]
      # Add the N_binary_encoding layer to the end of the list of binary layers
      all_binary_layers = ACGT_binary_layers + N_binary_layer
-     print(f'Found {all_binary_layers} layers in adata')
+     logger.info("Found %s layers in adata", all_binary_layers)

      # Extract the layers
      layers = [adata.layers[layer_name] for layer_name in all_binary_layers]
@@ -33,8 +39,8 @@ def binary_layers_to_ohe(adata, binary_layers, stack='hstack'):
          for layer in layers:
              read_ohe.append(layer[i])
          read_name = adata.obs_names[i]
-         if stack == 'hstack':
+         if stack == "hstack":
              ohe_dict[read_name] = np.hstack(read_ohe)
-         elif stack == 'vstack':
+         elif stack == "vstack":
              ohe_dict[read_name] = np.vstack(read_ohe)
-     return ohe_dict
+     return ohe_dict
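
For reference, the stacking behaviour of binary_layers_to_ohe can be reproduced with plain numpy. A standalone sketch with three toy per-base layers (the layer contents are invented for illustration):

import numpy as np

# Toy per-base binary layers (reads x positions); the N layer is kept last,
# mirroring the ordering enforced in binary_layers_to_ohe.
A = np.array([[1, 0, 0, 1], [0, 1, 0, 0]])
C = np.array([[0, 1, 0, 0], [1, 0, 0, 1]])
N = np.array([[0, 0, 1, 0], [0, 0, 1, 0]])
layers = [A, C, N]
read_names = ["read_1", "read_2"]

ohe_hstack = {name: np.hstack([layer[i] for layer in layers]) for i, name in enumerate(read_names)}
ohe_vstack = {name: np.vstack([layer[i] for layer in layers]) for i, name in enumerate(read_names)}

print(ohe_hstack["read_1"].shape)  # (12,) -> one long vector per read
print(ohe_vstack["read_1"].shape)  # (3, 4) -> bases x positions per read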

smftools/preprocessing/calculate_complexity_II.py
@@ -1,42 +1,59 @@
- from typing import Optional
+ from __future__ import annotations
+
+ from pathlib import Path
+ from typing import TYPE_CHECKING, Optional
+
+ if TYPE_CHECKING:
+     import anndata as ad
+
+
  def calculate_complexity_II(
-     adata,
-     output_directory='',
-     sample_col='Sample_names',
-     ref_col: Optional[str] = 'Reference_strand',
-     cluster_col='sequence__merged_cluster_id',
-     plot=True,
-     save_plot=False,
-     n_boot=30,
-     n_depths=12,
-     random_state=0,
-     csv_summary=True,
-     uns_flag='calculate_complexity_II_performed',
-     force_redo=False,
-     bypass=False
- ):
-     """
-     Estimate and plot library complexity.
+     adata: "ad.AnnData",
+     output_directory: str | Path = "",
+     sample_col: str = "Sample_names",
+     ref_col: Optional[str] = "Reference_strand",
+     cluster_col: str = "sequence__merged_cluster_id",
+     plot: bool = True,
+     save_plot: bool = False,
+     n_boot: int = 30,
+     n_depths: int = 12,
+     random_state: int = 0,
+     csv_summary: bool = True,
+     uns_flag: str = "calculate_complexity_II_performed",
+     force_redo: bool = False,
+     bypass: bool = False,
+ ) -> None:
+     """Estimate and optionally plot library complexity.

-     If ref_col is None (default), behaves as before: one calculation per sample.
-     If ref_col is provided, computes complexity for each (sample, ref) pair.
+     If ``ref_col`` is ``None``, the calculation is performed per sample. If provided,
+     complexity is computed for each ``(sample, reference)`` pair.

-     Results:
-       - adata.uns['Library_complexity_results'] : dict keyed by (sample,) or (sample, ref) -> dict with fields
-         C0, n_reads, n_unique, depths, mean_unique, ci_low, ci_high
-       - Also stores per-entity record in adata.uns[f'Library_complexity_{sanitized_name}'] (backwards compatible)
-       - Optionally saves PNGs and CSVs (curve points + fit summary)
+     Args:
+         adata: AnnData object containing read metadata.
+         output_directory: Directory for output plots/CSVs.
+         sample_col: Obs column containing sample names.
+         ref_col: Obs column with reference/strand categories, or ``None``.
+         cluster_col: Obs column with merged cluster IDs.
+         plot: Whether to generate plots.
+         save_plot: Whether to save plots to disk.
+         n_boot: Number of bootstrap iterations per depth.
+         n_depths: Number of subsampling depths to evaluate.
+         random_state: Random seed for bootstrapping.
+         csv_summary: Whether to write CSV summary files.
+         uns_flag: Flag in ``adata.uns`` indicating prior completion.
+         force_redo: Whether to rerun even if ``uns_flag`` is present.
+         bypass: Whether to skip processing.
      """
      import os
+
+     import matplotlib.pyplot as plt
      import numpy as np
      import pandas as pd
-     import matplotlib.pyplot as plt
      from scipy.optimize import curve_fit
-     from datetime import datetime

      # early exits
      already = bool(adata.uns.get(uns_flag, False))
-     if (already and not force_redo):
+     if already and not force_redo:
          return None
      if bypass:
          return None
@@ -44,9 +61,11 @@ def calculate_complexity_II(
      rng = np.random.default_rng(random_state)

      def lw(x, C0):
+         """Lander-Waterman curve for complexity estimation."""
          return C0 * (1.0 - np.exp(-x / C0))

      def sanitize(name: str) -> str:
+         """Sanitize a string for safe filenames."""
          return "".join(c if c.isalnum() or c in "-._" else "_" for c in str(name))

      # checks
@@ -77,7 +96,7 @@ def calculate_complexity_II(
      group_keys = []
      # iterate only pairs that exist in data to avoid empty processing
      for s in samples:
-         mask_s = (adata.obs[sample_col] == s)
+         mask_s = adata.obs[sample_col] == s
          # find references present for this sample
          ref_present = pd.Categorical(adata.obs.loc[mask_s, ref_col]).categories
          # Use intersection of known reference categories and those present for sample
@@ -109,7 +128,7 @@ def calculate_complexity_II(
                  "ci_high": np.array([], dtype=float),
              }
              # also store back-compat key
-             adata.uns[f'Library_complexity_{sanitize(group_label)}'] = results[g]
+             adata.uns[f"Library_complexity_{sanitize(group_label)}"] = results[g]
              continue

          # cluster ids array for this group
@@ -175,39 +194,45 @@ def calculate_complexity_II(
          }

          # save per-group in adata.uns for backward compatibility
-         adata.uns[f'Library_complexity_{sanitize(group_label)}'] = results[g]
+         adata.uns[f"Library_complexity_{sanitize(group_label)}"] = results[g]

          # prepare curve and fit records for CSV
-         fit_records.append({
-             "sample": sample,
-             "reference": ref if ref_col is not None else "",
-             "C0": float(C0),
-             "n_reads": int(n_reads),
-             "n_unique_observed": int(observed_unique),
-         })
+         fit_records.append(
+             {
+                 "sample": sample,
+                 "reference": ref if ref_col is not None else "",
+                 "C0": float(C0),
+                 "n_reads": int(n_reads),
+                 "n_unique_observed": int(observed_unique),
+             }
+         )

          x_fit = np.linspace(0, max(n_reads, int(depths[-1]) if depths.size else n_reads), 200)
          y_fit = lw(x_fit, C0)
          for d, mu, lo, hi in zip(depths, mean_unique, lo_ci, hi_ci):
-             curve_records.append({
-                 "sample": sample,
-                 "reference": ref if ref_col is not None else "",
-                 "type": "bootstrap",
-                 "depth": int(d),
-                 "mean_unique": float(mu),
-                 "ci_low": float(lo),
-                 "ci_high": float(hi),
-             })
+             curve_records.append(
+                 {
+                     "sample": sample,
+                     "reference": ref if ref_col is not None else "",
+                     "type": "bootstrap",
+                     "depth": int(d),
+                     "mean_unique": float(mu),
+                     "ci_low": float(lo),
+                     "ci_high": float(hi),
+                 }
+             )
          for xf, yf in zip(x_fit, y_fit):
-             curve_records.append({
-                 "sample": sample,
-                 "reference": ref if ref_col is not None else "",
-                 "type": "fit",
-                 "depth": float(xf),
-                 "mean_unique": float(yf),
-                 "ci_low": np.nan,
-                 "ci_high": np.nan,
-             })
+             curve_records.append(
+                 {
+                     "sample": sample,
+                     "reference": ref if ref_col is not None else "",
+                     "type": "fit",
+                     "depth": float(xf),
+                     "mean_unique": float(yf),
+                     "ci_low": np.nan,
+                     "ci_high": np.nan,
+                 }
+             )

          # plotting for this group
          if plot:
@@ -226,7 +251,9 @@ def calculate_complexity_II(

          if save_plot:
              fname = f"complexity_{sanitize(group_label)}.png"
-             plt.savefig(os.path.join(output_directory or ".", fname), dpi=160, bbox_inches="tight")
+             plt.savefig(
+                 os.path.join(output_directory or ".", fname), dpi=160, bbox_inches="tight"
+             )
              plt.close()
          else:
              plt.show()
@@ -242,7 +269,7 @@ def calculate_complexity_II(
      fit_df = pd.DataFrame(fit_records)
      curve_df = pd.DataFrame(curve_records)
      base = output_directory or "."
-     fit_df.to_csv(os.path.join(base, f"complexity_fit_summary.csv"), index=False)
-     curve_df.to_csv(os.path.join(base, f"complexity_curves.csv"), index=False)
+     fit_df.to_csv(os.path.join(base, "complexity_fit_summary.csv"), index=False)
+     curve_df.to_csv(os.path.join(base, "complexity_curves.csv"), index=False)

      return results
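
The complexity estimate fits the Lander-Waterman saturation curve lw(x, C0) = C0 * (1 - exp(-x / C0)) to bootstrapped unique-molecule counts at increasing subsampling depths; the fitted C0 is the estimated library size. A self-contained sketch of that fit on synthetic data (the true library size of 5,000 is an assumption for the demo, not a value from the package):

import numpy as np
from scipy.optimize import curve_fit

def lw(x, C0):
    # Lander-Waterman: expected unique molecules after x reads from a library of size C0
    return C0 * (1.0 - np.exp(-x / C0))

rng = np.random.default_rng(0)
true_C0 = 5_000
depths = np.linspace(500, 20_000, 12).astype(int)

# Simulate "unique clusters observed" at each depth by drawing reads uniformly
# from true_C0 molecules and counting distinct ones (30 bootstraps per depth).
mean_unique = np.array(
    [np.mean([np.unique(rng.integers(0, true_C0, d)).size for _ in range(30)]) for d in depths]
)

(C0_fit,), _ = curve_fit(lw, depths, mean_unique, p0=[mean_unique[-1]])
print(f"estimated C0 ~ {C0_fit:.0f} (true value 5000)")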

smftools/preprocessing/calculate_consensus.py
@@ -1,19 +1,28 @@
  # calculate_consensus

- def calculate_consensus(adata, reference, sample=False, reference_column='Reference', sample_column='Sample'):
-     """
-     Takes an input AnnData object, the reference to subset on, and the sample name to subset on to calculate the consensus sequence of the read set.
-
-     Parameters:
-         adata (AnnData): The input adata to append consensus metadata to.
-         reference (str): The name of the reference to subset the adata on.
-         sample (bool | str): If False, uses all samples. If a string is passed, the adata is further subsetted to only analyze that sample.
-         reference_column (str): The name of the reference column (Default is 'Reference')
-         sample_column (str): The name of the sample column (Default is 'Sample)
-
-     Returns:
-         None
-
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     import anndata as ad
+
+
+ def calculate_consensus(
+     adata: "ad.AnnData",
+     reference: str,
+     sample: str | bool = False,
+     reference_column: str = "Reference",
+     sample_column: str = "Sample",
+ ) -> None:
+     """Calculate a consensus sequence for a reference (and optional sample).
+
+     Args:
+         adata: AnnData object to append consensus metadata to.
+         reference: Reference name to subset on.
+         sample: If ``False``, uses all samples. If a string is passed, subsets to that sample.
+         reference_column: Obs column with reference names.
+         sample_column: Obs column with sample names.
      """
      import numpy as np

@@ -25,11 +34,11 @@ def calculate_consensus(adata, reference, sample=False, reference_column='Refere
          pass

      # Grab layer names from the adata object that correspond to the binary encodings of the read sequences.
-     layers = [layer for layer in record_subset.layers if '_binary_' in layer]
+     layers = [layer for layer in record_subset.layers if "_binary_" in layer]
      layer_map, layer_counts = {}, []
      for i, layer in enumerate(layers):
          # Gives an integer mapping to access which sequence base the binary layer is encoding
-         layer_map[i] = layer.split('_')[0]
+         layer_map[i] = layer.split("_")[0]
          # Get the positional counts from all reads for the given base identity.
          layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
      # Combine the positional counts array derived from each binary base layer into an ndarray
@@ -40,8 +49,8 @@ def calculate_consensus(adata, reference, sample=False, reference_column='Refere
      consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]

      if sample:
-         adata.var[f'{reference}_consensus_from_{sample}'] = consensus_sequence_list
+         adata.var[f"{reference}_consensus_from_{sample}"] = consensus_sequence_list
      else:
-         adata.var[f'{reference}_consensus_across_samples'] = consensus_sequence_list
+         adata.var[f"{reference}_consensus_across_samples"] = consensus_sequence_list

-     adata.uns[f'{reference}_consensus_sequence'] = consensus_sequence_list
+     adata.uns[f"{reference}_consensus_sequence"] = consensus_sequence_list

smftools/preprocessing/calculate_coverage.py
@@ -1,54 +1,76 @@
- def calculate_coverage(adata,
-                        ref_column='Reference_strand',
-                        position_nan_threshold=0.01,
-                        uns_flag='calculate_coverage_performed'):
-     """
-     Append position-level metadata regarding whether the position is informative within the given observation category.
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING
+
+ from smftools.logging_utils import get_logger
+
+ if TYPE_CHECKING:
+     import anndata as ad

-     Parameters:
-         adata (AnnData): An AnnData object
-         obs_column (str): Observation column value to subset on prior to calculating position statistics for that category.
-         position_nan_threshold (float): A minimal fractional threshold of coverage within the obs_column category to call the position as valid.
+ logger = get_logger(__name__)

-     Modifies:
-         - Adds new columns to `adata.var` containing coverage statistics.
+
+ def calculate_coverage(
+     adata: "ad.AnnData",
+     ref_column: str = "Reference_strand",
+     position_nan_threshold: float = 0.01,
+     smf_modality: str = "deaminase",
+     target_layer: str = "binarized_methylation",
+     uns_flag: str = "calculate_coverage_performed",
+     force_redo: bool = False,
+ ) -> None:
+     """Append position-level coverage metadata per reference category.
+
+     Args:
+         adata: AnnData object.
+         ref_column: Obs column used to define reference/strand categories.
+         position_nan_threshold: Minimum fraction of coverage to mark a position as valid.
+         smf_modality: SMF modality. Use ``adata.X`` for conversion/deaminase or ``target_layer`` for direct.
+         target_layer: Layer used for direct SMF coverage calculations.
+         uns_flag: Flag in ``adata.uns`` indicating prior completion.
+         force_redo: Whether to rerun even if ``uns_flag`` is set.
      """
      import numpy as np
      import pandas as pd
-     import anndata as ad

      # Only run if not already performed
      already = bool(adata.uns.get(uns_flag, False))
-     if already:
+     if already and not force_redo:
          # QC already performed; nothing to do
          return
-
+
      references = adata.obs[ref_column].cat.categories
      n_categories_with_position = np.zeros(adata.shape[1])

      # Loop over references
      for ref in references:
-         print(f'Assessing positional coverage across samples for {ref} reference')
+         logger.info("Assessing positional coverage across samples for %s reference", ref)

          # Subset to current category
          ref_mask = adata.obs[ref_column] == ref
          temp_ref_adata = adata[ref_mask]

+         if smf_modality == "direct":
+             matrix = temp_ref_adata.layers[target_layer]
+         else:
+             matrix = temp_ref_adata.X
+
          # Compute fraction of valid coverage
-         ref_valid_coverage = np.sum(~np.isnan(temp_ref_adata.X), axis=0)
+         ref_valid_coverage = np.sum(~np.isnan(matrix), axis=0)
          ref_valid_fraction = ref_valid_coverage / temp_ref_adata.shape[0]  # Avoid extra computation

          # Store coverage stats
-         adata.var[f'{ref}_valid_fraction'] = pd.Series(ref_valid_fraction, index=adata.var.index)
+         adata.var[f"{ref}_valid_count"] = pd.Series(ref_valid_coverage, index=adata.var.index)
+         adata.var[f"{ref}_valid_fraction"] = pd.Series(ref_valid_fraction, index=adata.var.index)

          # Assign whether the position is covered based on threshold
-         adata.var[f'position_in_{ref}'] = ref_valid_fraction >= position_nan_threshold
+         adata.var[f"position_in_{ref}"] = ref_valid_fraction >= position_nan_threshold

          # Sum the number of categories covering each position
-         n_categories_with_position += adata.var[f'position_in_{ref}'].values
+         n_categories_with_position += adata.var[f"position_in_{ref}"].values

      # Store final category count
-     adata.var[f'N_{ref_column}_with_position'] = n_categories_with_position.astype(int)
+     adata.var[f"N_{ref_column}_with_position"] = n_categories_with_position.astype(int)

      # mark as done
-     adata.uns[uns_flag] = True
+     adata.uns[uns_flag] = True
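
Coverage per reference is the fraction of non-NaN calls at each position, now also stored as a raw count and, for the direct modality, computed from target_layer instead of adata.X. A toy sketch of the per-position calculation (the matrix and threshold are invented for the demo):

import numpy as np

matrix = np.array([
    [0.9, np.nan, 0.1],
    [0.8, 0.2, np.nan],
    [np.nan, 0.7, 0.3],
    [0.4, 0.6, 0.5],
])
position_nan_threshold = 0.75

valid_count = np.sum(~np.isnan(matrix), axis=0)   # [3 3 3]
valid_fraction = valid_count / matrix.shape[0]    # [0.75 0.75 0.75]
position_covered = valid_fraction >= position_nan_threshold
print(valid_count, valid_fraction, position_covered)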

smftools/preprocessing/calculate_pairwise_differences.py
@@ -1,5 +1,6 @@
  # calculate_pairwise_differences

+
  def calculate_pairwise_differences(arrays):
      """
      Calculate the pairwise differences for a list of h-stacked ndarrays. Ignore N-positions
@@ -41,7 +42,7 @@ def calculate_pairwise_differences(arrays):
              # Calculate the hamming distance directly with boolean operations
              differences = (array_i != array_j) & ~combined_mask
              distance = np.sum(differences) / np.sum(~combined_mask)
-
+
              # Store the symmetric distances
              distance_matrix[i, j] = distance
              distance_matrix[j, i] = distance
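
The pairwise difference here is a Hamming-style distance that excludes positions masked (e.g. N calls) in either read from both the numerator and the denominator. A small sketch for two toy reads (arrays and masks invented for illustration):

import numpy as np

array_i = np.array([1, 0, 1, 1, 0])
array_j = np.array([1, 1, 1, 0, 0])
mask_i = np.array([False, False, True, False, False])   # masked positions in read i
mask_j = np.array([False, False, False, False, True])   # masked positions in read j

combined_mask = mask_i | mask_j
differences = (array_i != array_j) & ~combined_mask
distance = np.sum(differences) / np.sum(~combined_mask)
print(distance)  # 2 mismatches over 3 comparable positions ~ 0.667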

smftools/preprocessing/calculate_pairwise_hamming_distances.py
@@ -1,6 +1,6 @@
  ## calculate_pairwise_hamming_distances

- ## Conversion SMF Specific
+ ## Conversion SMF Specific
  def calculate_pairwise_hamming_distances(arrays):
      """
      Calculate the pairwise Hamming distances for a list of h-stacked ndarrays.
@@ -13,8 +13,9 @@ def calculate_pairwise_hamming_distances(arrays):

      """
      import numpy as np
-     from tqdm import tqdm
      from scipy.spatial.distance import hamming
+     from tqdm import tqdm
+
      num_arrays = len(arrays)
      # Initialize an empty distance matrix
      distance_matrix = np.zeros((num_arrays, num_arrays))
@@ -24,4 +25,4 @@ def calculate_pairwise_hamming_distances(arrays):
              distance = hamming(arrays[i], arrays[j])
              distance_matrix[i, j] = distance
              distance_matrix[j, i] = distance
-     return distance_matrix
+     return distance_matrix
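
calculate_pairwise_hamming_distances fills a symmetric matrix with scipy's hamming(), which returns the fraction of mismatching positions. A toy example with three short binary reads (data invented for illustration):

import numpy as np
from scipy.spatial.distance import hamming

arrays = [
    np.array([1, 0, 1, 1]),
    np.array([1, 1, 1, 0]),
    np.array([0, 0, 1, 1]),
]
n = len(arrays)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        d = hamming(arrays[i], arrays[j])  # fraction of mismatching positions
        distance_matrix[i, j] = d
        distance_matrix[j, i] = d
print(distance_matrix)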