smftools 0.2.4__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. smftools/__init__.py +43 -13
  2. smftools/_settings.py +6 -6
  3. smftools/_version.py +3 -1
  4. smftools/cli/__init__.py +1 -0
  5. smftools/cli/archived/cli_flows.py +2 -0
  6. smftools/cli/helpers.py +9 -1
  7. smftools/cli/hmm_adata.py +905 -242
  8. smftools/cli/load_adata.py +432 -280
  9. smftools/cli/preprocess_adata.py +287 -171
  10. smftools/cli/spatial_adata.py +141 -53
  11. smftools/cli_entry.py +119 -178
  12. smftools/config/__init__.py +3 -1
  13. smftools/config/conversion.yaml +5 -1
  14. smftools/config/deaminase.yaml +1 -1
  15. smftools/config/default.yaml +26 -18
  16. smftools/config/direct.yaml +8 -3
  17. smftools/config/discover_input_files.py +19 -5
  18. smftools/config/experiment_config.py +511 -276
  19. smftools/constants.py +37 -0
  20. smftools/datasets/__init__.py +4 -8
  21. smftools/datasets/datasets.py +32 -18
  22. smftools/hmm/HMM.py +2133 -1428
  23. smftools/hmm/__init__.py +24 -14
  24. smftools/hmm/archived/apply_hmm_batched.py +2 -0
  25. smftools/hmm/archived/calculate_distances.py +2 -0
  26. smftools/hmm/archived/call_hmm_peaks.py +18 -1
  27. smftools/hmm/archived/train_hmm.py +2 -0
  28. smftools/hmm/call_hmm_peaks.py +176 -193
  29. smftools/hmm/display_hmm.py +23 -7
  30. smftools/hmm/hmm_readwrite.py +20 -6
  31. smftools/hmm/nucleosome_hmm_refinement.py +104 -14
  32. smftools/informatics/__init__.py +55 -13
  33. smftools/informatics/archived/bam_conversion.py +2 -0
  34. smftools/informatics/archived/bam_direct.py +2 -0
  35. smftools/informatics/archived/basecall_pod5s.py +2 -0
  36. smftools/informatics/archived/basecalls_to_adata.py +2 -0
  37. smftools/informatics/archived/conversion_smf.py +2 -0
  38. smftools/informatics/archived/deaminase_smf.py +1 -0
  39. smftools/informatics/archived/direct_smf.py +2 -0
  40. smftools/informatics/archived/fast5_to_pod5.py +2 -0
  41. smftools/informatics/archived/helpers/archived/__init__.py +2 -0
  42. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +16 -1
  43. smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +2 -0
  44. smftools/informatics/archived/helpers/archived/bam_qc.py +14 -1
  45. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +2 -0
  46. smftools/informatics/archived/helpers/archived/canoncall.py +2 -0
  47. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +8 -1
  48. smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +2 -0
  49. smftools/informatics/archived/helpers/archived/count_aligned_reads.py +2 -0
  50. smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +2 -0
  51. smftools/informatics/archived/helpers/archived/extract_base_identities.py +2 -0
  52. smftools/informatics/archived/helpers/archived/extract_mods.py +2 -0
  53. smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +2 -0
  54. smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +2 -0
  55. smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +2 -0
  56. smftools/informatics/archived/helpers/archived/find_conversion_sites.py +2 -0
  57. smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +2 -0
  58. smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +2 -0
  59. smftools/informatics/archived/helpers/archived/get_native_references.py +2 -0
  60. smftools/informatics/archived/helpers/archived/index_fasta.py +2 -0
  61. smftools/informatics/archived/helpers/archived/informatics.py +2 -0
  62. smftools/informatics/archived/helpers/archived/load_adata.py +5 -3
  63. smftools/informatics/archived/helpers/archived/make_modbed.py +2 -0
  64. smftools/informatics/archived/helpers/archived/modQC.py +2 -0
  65. smftools/informatics/archived/helpers/archived/modcall.py +2 -0
  66. smftools/informatics/archived/helpers/archived/ohe_batching.py +2 -0
  67. smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +2 -0
  68. smftools/informatics/archived/helpers/archived/one_hot_decode.py +2 -0
  69. smftools/informatics/archived/helpers/archived/one_hot_encode.py +2 -0
  70. smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +5 -1
  71. smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +2 -0
  72. smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +2 -0
  73. smftools/informatics/archived/print_bam_query_seq.py +9 -1
  74. smftools/informatics/archived/subsample_fasta_from_bed.py +2 -0
  75. smftools/informatics/archived/subsample_pod5.py +2 -0
  76. smftools/informatics/bam_functions.py +1059 -269
  77. smftools/informatics/basecalling.py +53 -9
  78. smftools/informatics/bed_functions.py +357 -114
  79. smftools/informatics/binarize_converted_base_identities.py +21 -7
  80. smftools/informatics/complement_base_list.py +9 -6
  81. smftools/informatics/converted_BAM_to_adata.py +324 -137
  82. smftools/informatics/fasta_functions.py +251 -89
  83. smftools/informatics/h5ad_functions.py +202 -30
  84. smftools/informatics/modkit_extract_to_adata.py +623 -274
  85. smftools/informatics/modkit_functions.py +87 -44
  86. smftools/informatics/ohe.py +46 -21
  87. smftools/informatics/pod5_functions.py +114 -74
  88. smftools/informatics/run_multiqc.py +20 -14
  89. smftools/logging_utils.py +51 -0
  90. smftools/machine_learning/__init__.py +23 -12
  91. smftools/machine_learning/data/__init__.py +2 -0
  92. smftools/machine_learning/data/anndata_data_module.py +157 -50
  93. smftools/machine_learning/data/preprocessing.py +4 -1
  94. smftools/machine_learning/evaluation/__init__.py +3 -1
  95. smftools/machine_learning/evaluation/eval_utils.py +13 -14
  96. smftools/machine_learning/evaluation/evaluators.py +52 -34
  97. smftools/machine_learning/inference/__init__.py +3 -1
  98. smftools/machine_learning/inference/inference_utils.py +9 -4
  99. smftools/machine_learning/inference/lightning_inference.py +14 -13
  100. smftools/machine_learning/inference/sklearn_inference.py +8 -8
  101. smftools/machine_learning/inference/sliding_window_inference.py +37 -25
  102. smftools/machine_learning/models/__init__.py +12 -5
  103. smftools/machine_learning/models/base.py +34 -43
  104. smftools/machine_learning/models/cnn.py +22 -13
  105. smftools/machine_learning/models/lightning_base.py +78 -42
  106. smftools/machine_learning/models/mlp.py +18 -5
  107. smftools/machine_learning/models/positional.py +10 -4
  108. smftools/machine_learning/models/rnn.py +8 -3
  109. smftools/machine_learning/models/sklearn_models.py +46 -24
  110. smftools/machine_learning/models/transformer.py +75 -55
  111. smftools/machine_learning/models/wrappers.py +8 -3
  112. smftools/machine_learning/training/__init__.py +4 -2
  113. smftools/machine_learning/training/train_lightning_model.py +42 -23
  114. smftools/machine_learning/training/train_sklearn_model.py +11 -15
  115. smftools/machine_learning/utils/__init__.py +3 -1
  116. smftools/machine_learning/utils/device.py +12 -5
  117. smftools/machine_learning/utils/grl.py +8 -2
  118. smftools/metadata.py +443 -0
  119. smftools/optional_imports.py +31 -0
  120. smftools/plotting/__init__.py +32 -17
  121. smftools/plotting/autocorrelation_plotting.py +153 -48
  122. smftools/plotting/classifiers.py +175 -73
  123. smftools/plotting/general_plotting.py +350 -168
  124. smftools/plotting/hmm_plotting.py +53 -14
  125. smftools/plotting/position_stats.py +155 -87
  126. smftools/plotting/qc_plotting.py +25 -12
  127. smftools/preprocessing/__init__.py +35 -37
  128. smftools/preprocessing/append_base_context.py +105 -79
  129. smftools/preprocessing/append_binary_layer_by_base_context.py +75 -37
  130. smftools/preprocessing/{archives → archived}/add_read_length_and_mapping_qc.py +2 -0
  131. smftools/preprocessing/{archives → archived}/calculate_complexity.py +5 -1
  132. smftools/preprocessing/{archives → archived}/mark_duplicates.py +2 -0
  133. smftools/preprocessing/{archives → archived}/preprocessing.py +10 -6
  134. smftools/preprocessing/{archives → archived}/remove_duplicates.py +2 -0
  135. smftools/preprocessing/binarize.py +21 -4
  136. smftools/preprocessing/binarize_on_Youden.py +127 -31
  137. smftools/preprocessing/binary_layers_to_ohe.py +18 -11
  138. smftools/preprocessing/calculate_complexity_II.py +89 -59
  139. smftools/preprocessing/calculate_consensus.py +28 -19
  140. smftools/preprocessing/calculate_coverage.py +44 -22
  141. smftools/preprocessing/calculate_pairwise_differences.py +4 -1
  142. smftools/preprocessing/calculate_pairwise_hamming_distances.py +7 -3
  143. smftools/preprocessing/calculate_position_Youden.py +110 -55
  144. smftools/preprocessing/calculate_read_length_stats.py +52 -23
  145. smftools/preprocessing/calculate_read_modification_stats.py +91 -57
  146. smftools/preprocessing/clean_NaN.py +38 -28
  147. smftools/preprocessing/filter_adata_by_nan_proportion.py +24 -12
  148. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +72 -37
  149. smftools/preprocessing/filter_reads_on_modification_thresholds.py +183 -73
  150. smftools/preprocessing/flag_duplicate_reads.py +708 -303
  151. smftools/preprocessing/invert_adata.py +26 -11
  152. smftools/preprocessing/load_sample_sheet.py +40 -22
  153. smftools/preprocessing/make_dirs.py +9 -3
  154. smftools/preprocessing/min_non_diagonal.py +4 -1
  155. smftools/preprocessing/recipes.py +58 -23
  156. smftools/preprocessing/reindex_references_adata.py +93 -27
  157. smftools/preprocessing/subsample_adata.py +33 -16
  158. smftools/readwrite.py +264 -109
  159. smftools/schema/__init__.py +11 -0
  160. smftools/schema/anndata_schema_v1.yaml +227 -0
  161. smftools/tools/__init__.py +25 -18
  162. smftools/tools/archived/apply_hmm.py +2 -0
  163. smftools/tools/archived/classifiers.py +165 -0
  164. smftools/tools/archived/classify_methylated_features.py +2 -0
  165. smftools/tools/archived/classify_non_methylated_features.py +2 -0
  166. smftools/tools/archived/subset_adata_v1.py +12 -1
  167. smftools/tools/archived/subset_adata_v2.py +14 -1
  168. smftools/tools/calculate_umap.py +56 -15
  169. smftools/tools/cluster_adata_on_methylation.py +122 -47
  170. smftools/tools/general_tools.py +70 -25
  171. smftools/tools/position_stats.py +220 -99
  172. smftools/tools/read_stats.py +50 -29
  173. smftools/tools/spatial_autocorrelation.py +365 -192
  174. smftools/tools/subset_adata.py +23 -21
  175. smftools-0.3.0.dist-info/METADATA +147 -0
  176. smftools-0.3.0.dist-info/RECORD +182 -0
  177. smftools-0.2.4.dist-info/METADATA +0 -141
  178. smftools-0.2.4.dist-info/RECORD +0 -176
  179. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/WHEEL +0 -0
  180. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/entry_points.txt +0 -0
  181. {smftools-0.2.4.dist-info → smftools-0.3.0.dist-info}/licenses/LICENSE +0 -0
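The hunks below come from the preprocessing modules (files 151–157 in the list above). A recurring change across them is the replacement of bare print() calls with a module-level logger obtained from the new smftools/logging_utils.py (+51 lines). That module's body is not part of the hunks shown here; as a rough sketch, a helper of the following shape would satisfy the way the callers use it (get_logger(__name__), then logger.info/logger.warning with %-style arguments):

# Hypothetical sketch of a get_logger helper; the real smftools.logging_utils
# ships in this release but its implementation is not shown in this diff.
import logging

def get_logger(name: str) -> logging.Logger:
    """Return a package-namespaced logger, configured at most once."""
    logger = logging.getLogger(name)
    if not logger.handlers:  # avoid stacking duplicate handlers on re-import
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s")
        )
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
    return logger

Guarding on logger.handlers matters here because every preprocessing module calls get_logger at import time.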
smftools/preprocessing/invert_adata.py
@@ -1,25 +1,40 @@
 ## invert_adata
 
-def invert_adata(adata, uns_flag='invert_adata_performed', force_redo=False):
-    """
-    Inverts the AnnData object along the column (variable) axis.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from smftools.logging_utils import get_logger
+
+if TYPE_CHECKING:
+    import anndata as ad
 
-    Parameters:
-        adata (AnnData): An AnnData object.
+logger = get_logger(__name__)
+
+
+def invert_adata(
+    adata: "ad.AnnData",
+    uns_flag: str = "invert_adata_performed",
+    force_redo: bool = False,
+) -> "ad.AnnData":
+    """Invert the AnnData object along the column axis.
+
+    Args:
+        adata: AnnData object.
+        uns_flag: Flag in ``adata.uns`` indicating prior completion.
+        force_redo: Whether to rerun even if ``uns_flag`` is set.
 
     Returns:
-        AnnData: A new AnnData object with inverted column ordering.
+        anndata.AnnData: New AnnData object with inverted column ordering.
     """
-    import numpy as np
-    import anndata as ad
 
     # Only run if not already performed
     already = bool(adata.uns.get(uns_flag, False))
-    if (already and not force_redo):
+    if already and not force_redo:
         # QC already performed; nothing to do
         return adata
 
-    print("Inverting AnnData along the column axis...")
+    logger.info("Inverting AnnData along the column axis...")
 
     # Reverse the order of columns (variables)
     inverted_adata = adata[:, ::-1].copy()
@@ -33,5 +48,5 @@ def invert_adata(adata, uns_flag='invert_adata_performed', force_redo=False):
     # mark as done
     inverted_adata.uns[uns_flag] = True
 
-    print("Inversion complete!")
+    logger.info("Inversion complete!")
     return inverted_adata
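A hedged usage sketch of the new invert_adata signature (the AnnData here is synthetic, not from the package's datasets); note the uns flag makes a second call a no-op unless force_redo=True:

import anndata as ad
import numpy as np

from smftools.preprocessing.invert_adata import invert_adata

adata = ad.AnnData(X=np.arange(12, dtype=float).reshape(3, 4))
inverted = invert_adata(adata)                          # columns reversed, flag set in .uns
unchanged = invert_adata(inverted)                      # no-op: uns flag short-circuits
re_inverted = invert_adata(inverted, force_redo=True)   # reversed back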
smftools/preprocessing/load_sample_sheet.py
@@ -1,21 +1,36 @@
-def load_sample_sheet(adata,
-                      sample_sheet_path,
-                      mapping_key_column='obs_names',
-                      as_category=True,
-                      uns_flag='load_sample_sheet_performed',
-                      force_reload=True
-                      ):
-    """
-    Loads a sample sheet CSV and maps metadata into the AnnData object as categorical columns.
+from __future__ import annotations
+
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from smftools.logging_utils import get_logger
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+logger = get_logger(__name__)
 
-    Parameters:
-        adata (AnnData): The AnnData object to append sample information to.
-        sample_sheet_path (str): Path to the CSV file.
-        mapping_key_column (str): Column name in the CSV to map against adata.obs_names or an existing obs column.
-        as_category (bool): If True, added columns will be cast as pandas Categorical.
+
+def load_sample_sheet(
+    adata: "ad.AnnData",
+    sample_sheet_path: str | Path,
+    mapping_key_column: str = "obs_names",
+    as_category: bool = True,
+    uns_flag: str = "load_sample_sheet_performed",
+    force_reload: bool = True,
+) -> "ad.AnnData":
+    """Load a sample sheet CSV and map metadata into ``adata.obs``.
+
+    Args:
+        adata: AnnData object to append sample information to.
+        sample_sheet_path: Path to the CSV file.
+        mapping_key_column: Column name to map against ``adata.obs_names`` or an obs column.
+        as_category: Whether to cast added columns as pandas Categoricals.
+        uns_flag: Flag in ``adata.uns`` indicating prior completion.
+        force_reload: Whether to reload even if ``uns_flag`` is set.
 
     Returns:
-        AnnData: Updated AnnData object.
+        anndata.AnnData: Updated AnnData object.
     """
     import pandas as pd
 
@@ -25,29 +40,32 @@ def load_sample_sheet(adata,
         # QC already performed; nothing to do
         return
 
-    print('Loading sample sheet...')
+    logger.info("Loading sample sheet...")
     df = pd.read_csv(sample_sheet_path)
     df[mapping_key_column] = df[mapping_key_column].astype(str)
-
+
     # If matching against obs_names directly
-    if mapping_key_column == 'obs_names':
+    if mapping_key_column == "obs_names":
         key_series = adata.obs_names.astype(str)
     else:
         key_series = adata.obs[mapping_key_column].astype(str)
 
     value_columns = [col for col in df.columns if col != mapping_key_column]
-
-    print(f'Appending metadata columns: {value_columns}')
+
+    logger.info("Appending metadata columns: %s", value_columns)
     df = df.set_index(mapping_key_column)
 
     for col in value_columns:
         mapped = key_series.map(df[col])
         if as_category:
-            mapped = mapped.astype('category')
+            mapped = mapped.astype("category")
         adata.obs[col] = mapped
 
     # mark as done
     adata.uns[uns_flag] = True
 
-    print('Sample sheet metadata successfully added as categories.' if as_category else 'Metadata added.')
+    if as_category:
+        logger.info("Sample sheet metadata successfully added as categories.")
+    else:
+        logger.info("Metadata added.")
     return adata
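For reference, a minimal sketch of the CSV-to-obs mapping this function performs (the file contents and column names below are illustrative, not from the package):

from pathlib import Path

import anndata as ad
import numpy as np
import pandas as pd

from smftools.preprocessing.load_sample_sheet import load_sample_sheet

# Illustrative sample sheet keyed on an existing obs column named "Sample".
Path("sample_sheet.csv").write_text(
    "Sample,Condition,Replicate\ns1,treated,1\ns2,control,1\n"
)

adata = ad.AnnData(X=np.zeros((2, 3)), obs=pd.DataFrame({"Sample": ["s1", "s2"]}))
load_sample_sheet(adata, "sample_sheet.csv", mapping_key_column="Sample")
print(adata.obs["Condition"].dtype)  # category (as_category defaults to True)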
smftools/preprocessing/make_dirs.py
@@ -1,4 +1,10 @@
+from __future__ import annotations
+
 ## make_dirs
+from smftools.logging_utils import get_logger
+
+logger = get_logger(__name__)
+
 
 # General
 def make_dirs(directories):
@@ -7,7 +13,7 @@ def make_dirs(directories):
 
     Parameters:
         directories (list): A list of directories to make
-
+
     Returns:
         None
     """
@@ -16,6 +22,6 @@ def make_dirs(directories):
     for directory in directories:
         if not os.path.isdir(directory):
             os.mkdir(directory)
-            print(f"Directory '{directory}' created successfully.")
+            logger.info("Directory '%s' created successfully.", directory)
         else:
-            print(f"Directory '{directory}' already exists.")
+            logger.info("Directory '%s' already exists.", directory)
smftools/preprocessing/min_non_diagonal.py
@@ -1,5 +1,8 @@
+from __future__ import annotations
+
 ## min_non_diagonal
 
+
 def min_non_diagonal(matrix):
     """
     Takes a matrix and returns the smallest value from each row with the diagonal masked.
@@ -22,4 +25,4 @@ def min_non_diagonal(matrix):
         row = matrix[i, row_mask]
         # Find the minimum value in the row
         min_values.append(np.min(row))
-    return min_values
+    return min_values
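A quick worked example of the contract (row-wise minimum with the diagonal excluded), using a synthetic matrix:

import numpy as np

from smftools.preprocessing.min_non_diagonal import min_non_diagonal

matrix = np.array([[0, 5, 2],
                   [7, 0, 1],
                   [4, 9, 0]])
# Row 0 ignores matrix[0, 0] -> min(5, 2) = 2
# Row 1 ignores matrix[1, 1] -> min(7, 1) = 1
# Row 2 ignores matrix[2, 2] -> min(4, 9) = 4
assert min_non_diagonal(matrix) == [2, 1, 4]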
smftools/preprocessing/recipes.py
@@ -1,6 +1,17 @@
+from __future__ import annotations
+
 # recipes
 
-def recipe_1_Kissiov_and_McKenna_2025(adata, sample_sheet_path, output_directory, mapping_key_column='Sample', reference_column = 'Reference', sample_names_col='Sample_names', invert=True):
+
+def recipe_1_Kissiov_and_McKenna_2025(
+    adata,
+    sample_sheet_path,
+    output_directory,
+    mapping_key_column="Sample",
+    reference_column="Reference",
+    sample_names_col="Sample_names",
+    invert=True,
+):
     """
     The first part of the preprocessing workflow applied to the smf.inform.pod_to_adata() output derived from Kissiov_and_McKenna_2025.
 
@@ -26,36 +37,38 @@ def recipe_1_Kissiov_and_McKenna_2025(adata, sample_sheet_path, output_directory
     Returns:
         variables (dict): A dictionary of variables to append to the parent scope.
     """
-    import anndata as ad
-    import pandas as pd
-    import numpy as np
-    from .load_sample_sheet import load_sample_sheet
-    from .calculate_coverage import calculate_coverage
+
     from .append_C_context import append_C_context
-    from .calculate_converted_read_methylation_stats import calculate_converted_read_methylation_stats
-    from .invert_adata import invert_adata
+    from .calculate_converted_read_methylation_stats import (
+        calculate_converted_read_methylation_stats,
+    )
+    from .calculate_coverage import calculate_coverage
     from .calculate_read_length_stats import calculate_read_length_stats
     from .clean_NaN import clean_NaN
+    from .invert_adata import invert_adata
+    from .load_sample_sheet import load_sample_sheet
 
     # Clean up some of the Reference metadata and save variable names that point to sets of values in the column.
-    adata.obs[reference_column] = adata.obs[reference_column].astype('category')
+    adata.obs[reference_column] = adata.obs[reference_column].astype("category")
     references = adata.obs[reference_column].cat.categories
-    split_references = [(reference, reference.split('_')[0][1:]) for reference in references]
+    split_references = [(reference, reference.split("_")[0][1:]) for reference in references]
     reference_mapping = {k: v for k, v in split_references}
-    adata.obs[f'{reference_column}_short'] = adata.obs[reference_column].map(reference_mapping)
-    short_references = set(adata.obs[f'{reference_column}_short'])
+    adata.obs[f"{reference_column}_short"] = adata.obs[reference_column].map(reference_mapping)
+    short_references = set(adata.obs[f"{reference_column}_short"])
     binary_layers = list(adata.layers.keys())
 
     # load sample sheet metadata
     load_sample_sheet(adata, sample_sheet_path, mapping_key_column)
 
     # hold sample names set
-    adata.obs[sample_names_col] = adata.obs[sample_names_col].astype('category')
+    adata.obs[sample_names_col] = adata.obs[sample_names_col].astype("category")
     sample_names = adata.obs[sample_names_col].cat.categories
 
     # Add position level metadata
     calculate_coverage(adata, obs_column=reference_column)
-    adata.var['SNP_position'] = (adata.var[f'N_{reference_column}_with_position'] > 0) & (adata.var[f'N_{reference_column}_with_position'] < len(references)).astype(bool)
+    adata.var["SNP_position"] = (adata.var[f"N_{reference_column}_with_position"] > 0) & (
+        adata.var[f"N_{reference_column}_with_position"] < len(references)
+    ).astype(bool)
 
     # Append cytosine context to the reference positions based on the conversion strand.
     append_C_context(adata, obs_column=reference_column, use_consensus=False)
@@ -64,7 +77,9 @@ def recipe_1_Kissiov_and_McKenna_2025(adata, sample_sheet_path, output_directory
     calculate_converted_read_methylation_stats(adata, reference_column, sample_names_col)
 
     # Calculate read length statistics
-    upper_bound, lower_bound = calculate_read_length_stats(adata, reference_column, sample_names_col)
+    upper_bound, lower_bound = calculate_read_length_stats(
+        adata, reference_column, sample_names_col
+    )
 
     # Invert the adata object (ie flip the strand orientation for visualization)
     if invert:
@@ -81,11 +96,19 @@
         "sample_names": sample_names,
         "upper_bound": upper_bound,
         "lower_bound": lower_bound,
-        "references": references
+        "references": references,
     }
     return variables
 
-def recipe_2_Kissiov_and_McKenna_2025(adata, output_directory, binary_layers, distance_thresholds={}, reference_column = 'Reference', sample_names_col='Sample_names'):
+
+def recipe_2_Kissiov_and_McKenna_2025(
+    adata,
+    output_directory,
+    binary_layers,
+    distance_thresholds={},
+    reference_column="Reference",
+    sample_names_col="Sample_names",
+):
     """
     The second part of the preprocessing workflow applied to the adata that has already been preprocessed by recipe_1_Kissiov_and_McKenna_2025.
 
@@ -107,20 +130,32 @@ def recipe_2_Kissiov_and_McKenna_2025(adata, output_directory, binary_layers, di
         filtered_adata (AnnData): An AnnData object containing the filtered reads
         duplicates (AnnData): An AnnData object containing the duplicate reads
     """
-    import anndata as ad
-    import pandas as pd
-    import numpy as np
-    from .mark_duplicates import mark_duplicates
+
     from .calculate_complexity import calculate_complexity
+    from .mark_duplicates import mark_duplicates
     from .remove_duplicates import remove_duplicates
 
     # Add here a way to remove reads below a given read quality (based on nan content). Need to also add a way to pull from BAM files the read quality from each read
 
     # Duplicate detection using pairwise hamming distance across reads
-    mark_duplicates(adata, binary_layers, obs_column=reference_column, sample_col=sample_names_col, distance_thresholds=distance_thresholds, method='N_masked_distances')
+    mark_duplicates(
+        adata,
+        binary_layers,
+        obs_column=reference_column,
+        sample_col=sample_names_col,
+        distance_thresholds=distance_thresholds,
+        method="N_masked_distances",
+    )
 
     # Complexity analysis using the marked duplicates and the lander-watermann algorithm
-    calculate_complexity(adata, output_directory, obs_column=reference_column, sample_col=sample_names_col, plot=True, save_plot=False)
+    calculate_complexity(
+        adata,
+        output_directory,
+        obs_column=reference_column,
+        sample_col=sample_names_col,
+        plot=True,
+        save_plot=False,
+    )
 
     # Remove duplicate reads and store the duplicate reads in a new AnnData object named duplicates.
     filtered_adata, duplicates = remove_duplicates(adata)
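How the two recipes chain, per their docstrings and the code above, in a hedged sketch (adata is assumed to be the smf.inform.pod_to_adata() output the recipe expects; paths are placeholders):

from smftools.preprocessing.recipes import (
    recipe_1_Kissiov_and_McKenna_2025,
    recipe_2_Kissiov_and_McKenna_2025,
)

# adata: assumed output of smf.inform.pod_to_adata() with the default
# 'Reference' and 'Sample_names' obs columns and binary layers attached.
variables = recipe_1_Kissiov_and_McKenna_2025(
    adata, sample_sheet_path="sample_sheet.csv", output_directory="results"
)

# recipe_1 snapshots the layer names before adding anything; the same value
# feeds recipe_2's duplicate marking.
binary_layers = list(adata.layers.keys())
filtered_adata, duplicates = recipe_2_Kissiov_and_McKenna_2025(
    adata, output_directory="results", binary_layers=binary_layers
)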
smftools/preprocessing/reindex_references_adata.py
@@ -1,37 +1,103 @@
-## reindex_references_adata
-
-def reindex_references_adata(adata,
-                             reference_col="Reference_strand",
-                             offsets=None,
-                             new_col="reindexed",
-                             uns_flag='reindex_references_adata_performed',
-                             force_redo=False):
-
-    # Only run if not already performed
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from smftools.logging_utils import get_logger
+
+if TYPE_CHECKING:
+    import anndata as ad
+
+logger = get_logger(__name__)
+
+
+def reindex_references_adata(
+    adata: "ad.AnnData",
+    reference_col: str = "Reference_strand",
+    offsets: dict | None = None,
+    new_col: str = "reindexed",
+    uns_flag: str = "reindex_references_adata_performed",
+    force_redo: bool = False,
+) -> None:
+    """Reindex genomic coordinates by adding per-reference offsets.
+
+    Args:
+        adata: AnnData object.
+        reference_col: Obs column containing reference identifiers.
+        offsets: Mapping of reference to integer offset.
+        new_col: Suffix for generated reindexed columns.
+        uns_flag: Flag in ``adata.uns`` indicating prior completion.
+        force_redo: Whether to rerun even if ``uns_flag`` is set.
+
+    Notes:
+        If ``offsets`` is ``None`` or missing a reference, the new column mirrors
+        the existing ``var_names`` values.
+    """
+
+    import numpy as np
+
+    # ============================================================
+    # 1. Skip if already done
+    # ============================================================
     already = bool(adata.uns.get(uns_flag, False))
-    if (already and not force_redo):
+    if already and not force_redo:
+        logger.info("%s already set; skipping. Use force_redo=True to recompute.", uns_flag)
         return None
-
+
+    # Normalize offsets
     if offsets is None:
-        pass
-    else:
-        # Ensure var_names are numeric
+        offsets = {}
+    elif not isinstance(offsets, dict):
+        raise TypeError("offsets must be a dict {ref: int} or None.")
+
+    # ============================================================
+    # 2. Ensure var_names are numeric
+    # ============================================================
+    try:
         var_coords = adata.var_names.astype(int)
+    except Exception as e:
+        raise ValueError(
+            "reindex_references_adata requires adata.var_names to be integer-like."
+        ) from e
+
+    # ============================================================
+    # 3. Gather all references
+    # ============================================================
+    ref_series = adata.obs[reference_col]
+    references = ref_series.cat.categories if hasattr(ref_series, "cat") else ref_series.unique()
+
+    # ============================================================
+    # 4. Create reindexed columns
+    # ============================================================
+    for ref in references:
+        colname = f"{ref}_{new_col}"
+
+        # Case 1: No offset provided → identity mapping
+        if ref not in offsets:
+            logger.info("No offset for ref=%r; using identity positions.", ref)
+            adata.var[colname] = var_coords
+            continue
+
+        offset_value = offsets[ref]
 
-    for ref in adata.obs[reference_col].unique():
-        if ref not in offsets:
-            pass
-        else:
-            offset_value = offsets[ref]
+        # Case 2: offset explicitly None → identity mapping
+        if offset_value is None:
+            logger.info("Offset for ref=%r is None; using identity positions.", ref)
+            adata.var[colname] = var_coords
+            continue
 
-            # Create a new var column for this reference
-            colname = f"{ref}_{new_col}"
+        # Case 3: real shift
+        if not isinstance(offset_value, (int, np.integer)):
+            raise TypeError(
+                f"Offset for reference {ref!r} must be an integer or None. Got {offset_value!r}"
+            )
 
-            # Add offset to all var positions
-            adata.var[colname] = var_coords + offset_value
+        adata.var[colname] = var_coords + offset_value
+        logger.info("Added reindexed column '%s' (offset=%s).", colname, offset_value)
 
-    # mark as done
+    # ============================================================
+    # 5. Mark complete
+    # ============================================================
     adata.uns[uns_flag] = True
+    logger.info("Reindexing complete!")
 
-    print("Reindexing complete!")
-    return None
+    return None
1
- def subsample_adata(adata, obs_columns=None, max_samples=2000, random_seed=42):
2
- """
3
- Subsamples an AnnData object so that each unique combination of categories
4
- in the given `obs_columns` has at most `max_samples` observations.
5
- If `obs_columns` is None or empty, the function randomly subsamples the entire dataset.
6
-
7
- Parameters:
8
- adata (AnnData): The AnnData object to subsample.
9
- obs_columns (list of str, optional): List of observation column names to group by.
10
- max_samples (int): The maximum number of observations per category combination.
11
- random_seed (int): Random seed for reproducibility.
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Sequence
4
+
5
+ from smftools.logging_utils import get_logger
6
+
7
+ if TYPE_CHECKING:
8
+ import anndata as ad
9
+
10
+ logger = get_logger(__name__)
11
+
12
+
13
+ def subsample_adata(
14
+ adata: "ad.AnnData",
15
+ obs_columns: Sequence[str] | None = None,
16
+ max_samples: int = 2000,
17
+ random_seed: int = 42,
18
+ ) -> "ad.AnnData":
19
+ """Subsample an AnnData object by observation categories.
20
+
21
+ Each unique combination of categories in ``obs_columns`` is capped at
22
+ ``max_samples`` observations. If ``obs_columns`` is ``None``, the function
23
+ randomly subsamples the entire dataset.
24
+
25
+ Args:
26
+ adata: AnnData object to subsample.
27
+ obs_columns: Observation column names to group by.
28
+ max_samples: Maximum observations per category combination.
29
+ random_seed: Random seed for reproducibility.
12
30
 
13
31
  Returns:
14
- AnnData: A new AnnData object with subsampled observations.
32
+ anndata.AnnData: Subsampled AnnData object.
15
33
  """
16
- import anndata as ad
17
34
  import numpy as np
18
35
 
19
36
  np.random.seed(random_seed) # Ensure reproducibility
@@ -23,7 +40,7 @@ def subsample_adata(adata, obs_columns=None, max_samples=2000, random_seed=42):
23
40
  sampled_indices = np.random.choice(adata.obs.index, max_samples, replace=False)
24
41
  else:
25
42
  sampled_indices = adata.obs.index # Keep all if fewer than max_samples
26
-
43
+
27
44
  return adata[sampled_indices].copy()
28
45
 
29
46
  sampled_indices = []
@@ -34,7 +51,7 @@ def subsample_adata(adata, obs_columns=None, max_samples=2000, random_seed=42):
34
51
  for _, row in unique_combinations.iterrows():
35
52
  # Build filter condition dynamically for multiple columns
36
53
  condition = (adata.obs[obs_columns] == row.values).all(axis=1)
37
-
54
+
38
55
  # Get indices for the current category combination
39
56
  subset_indices = adata.obs[condition].index.to_numpy()
40
57
 
@@ -48,7 +65,7 @@ def subsample_adata(adata, obs_columns=None, max_samples=2000, random_seed=42):
48
65
 
49
66
  # ⚠ Handle backed mode detection
50
67
  if adata.isbacked:
51
- print("Detected backed mode. Subset will be loaded fully into memory.")
68
+ logger.warning("Detected backed mode. Subset will be loaded fully into memory.")
52
69
  subset = adata[sampled_indices]
53
70
  subset = subset.to_memory()
54
71
  else:
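Per the docstring, grouped subsampling looks like this hedged sketch (synthetic data; the column names are illustrative):

import anndata as ad
import numpy as np
import pandas as pd

from smftools.preprocessing.subsample_adata import subsample_adata

# Synthetic reads: two references, one sample.
obs = pd.DataFrame(
    {"Reference": ["r1"] * 600 + ["r2"] * 600, "Sample_names": ["s1"] * 1200},
    index=[str(i) for i in range(1200)],
)
adata = ad.AnnData(X=np.zeros((1200, 4)), obs=obs)

subsampled = subsample_adata(adata, obs_columns=["Reference", "Sample_names"], max_samples=500)
print(subsampled.n_obs)  # 1000: each (Reference, Sample_names) pair capped at 500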