smftools 0.2.1__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/archived/cli_flows.py +94 -0
  5. smftools/cli/helpers.py +48 -0
  6. smftools/cli/hmm_adata.py +361 -0
  7. smftools/cli/load_adata.py +637 -0
  8. smftools/cli/preprocess_adata.py +455 -0
  9. smftools/cli/spatial_adata.py +697 -0
  10. smftools/cli_entry.py +434 -0
  11. smftools/config/conversion.yaml +18 -6
  12. smftools/config/deaminase.yaml +18 -11
  13. smftools/config/default.yaml +151 -36
  14. smftools/config/direct.yaml +28 -1
  15. smftools/config/discover_input_files.py +115 -0
  16. smftools/config/experiment_config.py +225 -27
  17. smftools/hmm/HMM.py +12 -1
  18. smftools/hmm/__init__.py +0 -6
  19. smftools/hmm/archived/call_hmm_peaks.py +106 -0
  20. smftools/hmm/call_hmm_peaks.py +318 -90
  21. smftools/informatics/__init__.py +13 -7
  22. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  23. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  24. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  25. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  26. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  27. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  28. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  30. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  31. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  32. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  33. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  34. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  35. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  36. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  37. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  38. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  39. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  40. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  41. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  42. smftools/informatics/bam_functions.py +811 -0
  43. smftools/informatics/basecalling.py +67 -0
  44. smftools/informatics/bed_functions.py +366 -0
  45. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  46. smftools/informatics/fasta_functions.py +255 -0
  47. smftools/informatics/h5ad_functions.py +197 -0
  48. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  49. smftools/informatics/modkit_functions.py +129 -0
  50. smftools/informatics/ohe.py +160 -0
  51. smftools/informatics/pod5_functions.py +224 -0
  52. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  53. smftools/plotting/autocorrelation_plotting.py +1 -3
  54. smftools/plotting/general_plotting.py +1084 -363
  55. smftools/plotting/position_stats.py +3 -3
  56. smftools/preprocessing/__init__.py +4 -4
  57. smftools/preprocessing/append_base_context.py +35 -26
  58. smftools/preprocessing/append_binary_layer_by_base_context.py +6 -6
  59. smftools/preprocessing/binarize.py +17 -0
  60. smftools/preprocessing/binarize_on_Youden.py +11 -9
  61. smftools/preprocessing/calculate_complexity_II.py +1 -1
  62. smftools/preprocessing/calculate_coverage.py +16 -13
  63. smftools/preprocessing/calculate_position_Youden.py +42 -26
  64. smftools/preprocessing/calculate_read_modification_stats.py +2 -2
  65. smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  66. smftools/preprocessing/filter_reads_on_modification_thresholds.py +20 -20
  67. smftools/preprocessing/flag_duplicate_reads.py +2 -2
  68. smftools/preprocessing/invert_adata.py +1 -1
  69. smftools/preprocessing/load_sample_sheet.py +1 -1
  70. smftools/preprocessing/reindex_references_adata.py +37 -0
  71. smftools/readwrite.py +360 -140
  72. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/METADATA +26 -19
  73. smftools-0.2.4.dist-info/RECORD +176 -0
  74. smftools-0.2.4.dist-info/entry_points.txt +2 -0
  75. smftools/cli.py +0 -184
  76. smftools/informatics/fast5_to_pod5.py +0 -24
  77. smftools/informatics/helpers/__init__.py +0 -73
  78. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  79. smftools/informatics/helpers/bam_qc.py +0 -66
  80. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  81. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  82. smftools/informatics/helpers/discover_input_files.py +0 -100
  83. smftools/informatics/helpers/index_fasta.py +0 -12
  84. smftools/informatics/helpers/make_dirs.py +0 -21
  85. smftools/informatics/readwrite.py +0 -106
  86. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  87. smftools/load_adata.py +0 -1346
  88. smftools-0.2.1.dist-info/RECORD +0 -161
  89. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  90. /smftools/hmm/{apply_hmm_batched.py → archived/apply_hmm_batched.py} +0 -0
  91. /smftools/hmm/{calculate_distances.py → archived/calculate_distances.py} +0 -0
  92. /smftools/hmm/{train_hmm.py → archived/train_hmm.py} +0 -0
  93. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  94. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  95. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  96. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  97. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  98. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  99. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  100. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  101. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  102. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  103. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  104. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  105. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  106. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  107. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  108. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  109. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  110. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  111. /smftools/preprocessing/{add_read_length_and_mapping_qc.py → archives/add_read_length_and_mapping_qc.py} +0 -0
  112. /smftools/preprocessing/{calculate_complexity.py → archives/calculate_complexity.py} +0 -0
  113. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/WHEEL +0 -0
  114. {smftools-0.2.1.dist-info → smftools-0.2.4.dist-info}/licenses/LICENSE +0 -0
smftools/plotting/position_stats.py

@@ -90,7 +90,7 @@ def plot_volcano_relative_risk(
  safe_name = f"{ref}_{group_label}".replace("=", "").replace("__", "_").replace(",", "_").replace(" ", "_")
  out_file = os.path.join(save_path, f"{safe_name}.png")
  plt.savefig(out_file, dpi=300)
- print(f"📁 Saved: {out_file}")
+ print(f"Saved: {out_file}")

  plt.show()

@@ -449,7 +449,7 @@ def plot_positionwise_matrix_grid(
  os.makedirs(save_path, exist_ok=True)
  fname = outer_label.replace("_", "").replace("=", "") + ".png"
  plt.savefig(os.path.join(save_path, fname), dpi=300, bbox_inches='tight')
- print(f"Saved {fname}")
+ print(f"Saved {fname}")

  plt.close(fig)

@@ -459,4 +459,4 @@ def plot_positionwise_matrix_grid(
  for outer_label in parsed['outer'].unique():
  plot_one_grid(outer_label)

- print("Finished plotting all grids.")
+ print("Finished plotting all grids.")
smftools/preprocessing/__init__.py

@@ -1,8 +1,7 @@
- from .add_read_length_and_mapping_qc import add_read_length_and_mapping_qc
  from .append_base_context import append_base_context
  from .append_binary_layer_by_base_context import append_binary_layer_by_base_context
  from .binarize_on_Youden import binarize_on_Youden
- from .calculate_complexity import calculate_complexity
+ from .binarize import binarize_adata
  from .calculate_complexity_II import calculate_complexity_II
  from .calculate_read_modification_stats import calculate_read_modification_stats
  from .calculate_coverage import calculate_coverage

@@ -15,14 +14,15 @@ from .filter_reads_on_length_quality_mapping import filter_reads_on_length_quali
  from .invert_adata import invert_adata
  from .load_sample_sheet import load_sample_sheet
  from .flag_duplicate_reads import flag_duplicate_reads
+ from .reindex_references_adata import reindex_references_adata
  from .subsample_adata import subsample_adata

  __all__ = [
- "add_read_length_and_mapping_qc",
  "append_base_context",
  "append_binary_layer_by_base_context",
  "binarize_on_Youden",
- "calculate_complexity",
+ "binarize_adata",
+ "calculate_complexity_II",
  "calculate_read_modification_stats",
  "calculate_coverage",
  "calculate_position_Youden",
smftools/preprocessing/append_base_context.py

@@ -1,18 +1,19 @@
  def append_base_context(adata,
- obs_column='Reference_strand',
+ ref_column='Reference_strand',
  use_consensus=False,
  native=False,
  mod_target_bases=['GpC', 'CpG'],
  bypass=False,
  force_redo=False,
- uns_flag='base_context_added'
+ uns_flag='append_base_context_performed'
  ):
  """
  Adds nucleobase context to the position within the given category. When use_consensus is True, it uses the consensus sequence; otherwise it defaults to the FASTA sequence.
+ This needs to be performed prior to the AnnData inversion step.

  Parameters:
  adata (AnnData): The input adata object.
- obs_column (str): The observation column on which to stratify. Default is 'Reference_strand', which should not be changed for most purposes.
+ ref_column (str): The observation column on which to stratify. Default is 'Reference_strand', which should not be changed for most purposes.
  use_consensus (bool): Whether to use the consensus sequence from the reads mapped to the reference. If False, the reference FASTA is used instead.
  native (bool): If False, apply conversion SMF assumptions. If True, apply native SMF assumptions.
  mod_target_bases (list): Base contexts that may be modified.

@@ -30,68 +31,69 @@ def append_base_context(adata,
  return

  print('Adding base context based on reference FASTA sequence for sample')
- categories = adata.obs[obs_column].cat.categories
+ references = adata.obs[ref_column].cat.categories
  site_types = []

  if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
- site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'any_C_site']
+ site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'C_site']

  if 'A' in mod_target_bases:
  site_types += ['A_site']

- for cat in categories:
+ for ref in references:
  # Assess if the strand is the top or bottom strand converted
- if 'top' in cat:
+ if 'top' in ref:
  strand = 'top'
- elif 'bottom' in cat:
+ elif 'bottom' in ref:
  strand = 'bottom'

  if native:
- basename = cat.split(f"_{strand}")[0]
+ basename = ref.split(f"_{strand}")[0]
  if use_consensus:
  sequence = adata.uns[f'{basename}_consensus_sequence']
  else:
  # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
  sequence = adata.uns[f'{basename}_FASTA_sequence']
  else:
- basename = cat.split(f"_{strand}")[0]
+ basename = ref.split(f"_{strand}")[0]
  if use_consensus:
  sequence = adata.uns[f'{basename}_consensus_sequence']
  else:
  # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
  sequence = adata.uns[f'{basename}_FASTA_sequence']
+
  # Init a dict keyed by reference site type that points to a bool of whether the position is that site type.
  boolean_dict = {}
  for site_type in site_types:
- boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
+ boolean_dict[f'{ref}_{site_type}'] = np.full(len(sequence), False, dtype=bool)

  if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
  if strand == 'top':
  # Iterate through the sequence and apply the criteria
  for i in range(1, len(sequence) - 1):
  if sequence[i] == 'C':
- boolean_dict[f'{cat}_any_C_site'][i] = True
+ boolean_dict[f'{ref}_C_site'][i] = True
  if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
- boolean_dict[f'{cat}_GpC_site'][i] = True
+ boolean_dict[f'{ref}_GpC_site'][i] = True
  elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
- boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
+ boolean_dict[f'{ref}_ambiguous_GpC_CpG_site'][i] = True
  elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
- boolean_dict[f'{cat}_CpG_site'][i] = True
+ boolean_dict[f'{ref}_CpG_site'][i] = True
  elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
- boolean_dict[f'{cat}_other_C_site'][i] = True
+ boolean_dict[f'{ref}_other_C_site'][i] = True
  elif strand == 'bottom':
  # Iterate through the sequence and apply the criteria
  for i in range(1, len(sequence) - 1):
  if sequence[i] == 'G':
- boolean_dict[f'{cat}_any_C_site'][i] = True
+ boolean_dict[f'{ref}_C_site'][i] = True
  if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
- boolean_dict[f'{cat}_GpC_site'][i] = True
+ boolean_dict[f'{ref}_GpC_site'][i] = True
  elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
- boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
+ boolean_dict[f'{ref}_ambiguous_GpC_CpG_site'][i] = True
  elif sequence[i - 1] == 'C' and sequence[i + 1] != 'C':
- boolean_dict[f'{cat}_CpG_site'][i] = True
+ boolean_dict[f'{ref}_CpG_site'][i] = True
  elif sequence[i - 1] != 'C' and sequence[i + 1] != 'C':
- boolean_dict[f'{cat}_other_C_site'][i] = True
+ boolean_dict[f'{ref}_other_C_site'][i] = True
  else:
  print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')

@@ -100,21 +102,28 @@ def append_base_context(adata,
  # Iterate through the sequence and apply the criteria
  for i in range(1, len(sequence) - 1):
  if sequence[i] == 'A':
- boolean_dict[f'{cat}_A_site'][i] = True
+ boolean_dict[f'{ref}_A_site'][i] = True
  elif strand == 'bottom':
  # Iterate through the sequence and apply the criteria
  for i in range(1, len(sequence) - 1):
  if sequence[i] == 'T':
- boolean_dict[f'{cat}_A_site'][i] = True
+ boolean_dict[f'{ref}_A_site'][i] = True
  else:
  print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')

  for site_type in site_types:
- adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
+ # Site context annotations for each reference
+ adata.var[f'{ref}_{site_type}'] = boolean_dict[f'{ref}_{site_type}'].astype(bool)
+ # Restrict the site type labels to only be in positions that occur at a high enough frequency in the dataset
+ if adata.uns["calculate_coverage_performed"] == True:
+ adata.var[f'{ref}_{site_type}'] = (adata.var[f'{ref}_{site_type}']) & (adata.var[f'position_in_{ref}'])
+ else:
+ pass
+
  if native:
- adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].layers['binarized_methylation']
+ adata.obsm[f'{ref}_{site_type}'] = adata[:, adata.var[f'{ref}_{site_type}'] == True].layers['binarized_methylation']
  else:
- adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].X
+ adata.obsm[f'{ref}_{site_type}'] = adata[:, adata.var[f'{ref}_{site_type}'] == True].X

  # mark as done
  adata.uns[uns_flag] = True
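
Two things change for callers of this function in 0.2.4: the stratification argument is now named `ref_column`, and the per-reference site annotations are intersected with positional coverage, so `calculate_coverage` must run first (the new branch reads `adata.uns["calculate_coverage_performed"]` and `adata.var[f'position_in_{ref}']`). A hedged sketch of the updated ordering, assuming a conversion-SMF AnnData is already loaded as `adata`:

    import smftools.preprocessing as pp

    # Coverage first, so the site labels can be gated on position_in_{ref}
    pp.calculate_coverage(adata, ref_column='Reference_strand')
    pp.append_base_context(adata, ref_column='Reference_strand', mod_target_bases=['GpC', 'CpG'])
    assert adata.uns['append_base_context_performed']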
smftools/preprocessing/append_binary_layer_by_base_context.py

@@ -6,7 +6,7 @@ def append_binary_layer_by_base_context(
  reference_column: str,
  smf_modality: str = "conversion",
  verbose: bool = True,
- uns_flag: str = "binary_layers_by_base_context_added",
+ uns_flag: str = "append_binary_layer_by_base_context_performed",
  bypass: bool = False,
  force_redo: bool = False
  ):

@@ -15,7 +15,7 @@
  - GpC_site_binary
  - CpG_site_binary
  - GpC_CpG_combined_site_binary (numeric sum where present; NaN where neither present)
- - any_C_site_binary
+ - C_site_binary
  - other_C_site_binary

  Behavior:

@@ -27,7 +27,7 @@

  # Only run if not already performed
  already = bool(adata.uns.get(uns_flag, False))
- if (already and not force_redo) or bypass or ("base_context_added" not in adata.uns):
+ if (already and not force_redo) or bypass or ("append_base_context_performed" not in adata.uns):
  # QC already performed; nothing to do
  return adata

@@ -48,7 +48,7 @@
  references = adata.obs[reference_column].astype("category").cat.categories
  reference_to_gpc_column = {ref: f"{ref}_GpC_site" for ref in references}
  reference_to_cpg_column = {ref: f"{ref}_CpG_site" for ref in references}
- reference_to_c_column = {ref: f"{ref}_any_C_site" for ref in references}
+ reference_to_c_column = {ref: f"{ref}_C_site" for ref in references}
  reference_to_other_c_column = {ref: f"{ref}_other_C_site" for ref in references}

  # verify var columns exist and build boolean masks per ref (len = n_vars)

@@ -124,7 +124,7 @@
  adata.layers['GpC_site_binary'] = masked_gpc
  adata.layers['CpG_site_binary'] = masked_cpg
  adata.layers['GpC_CpG_combined_site_binary'] = combined_sum
- adata.layers['any_C_site_binary'] = masked_any_c
+ adata.layers['C_site_binary'] = masked_any_c
  adata.layers['other_C_site_binary'] = masked_other_c

  if verbose:

@@ -134,7 +134,7 @@
  print(f" GpC: {_filled_positions(masked_gpc)}")
  print(f" CpG: {_filled_positions(masked_cpg)}")
  print(f" GpC+CpG combined: {_filled_positions(combined_sum)}")
- print(f" any_C: {_filled_positions(masked_any_c)}")
+ print(f" C: {_filled_positions(masked_any_c)}")
  print(f" other_C: {_filled_positions(masked_other_c)}")

  # mark as done
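
For downstream code, the visible change here is the layer rename: `any_C_site_binary` becomes `C_site_binary`. A small compatibility shim that tolerates both names (a hypothetical helper, not part of smftools):

    def get_c_site_binary(adata):
        # Return the all-C binary layer under either the 0.2.1 or the 0.2.4 name.
        for name in ('C_site_binary', 'any_C_site_binary'):
            if name in adata.layers:
                return adata.layers[name]
        raise KeyError('no C-site binary layer; run append_binary_layer_by_base_context first')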
smftools/preprocessing/binarize.py (new file)

@@ -0,0 +1,17 @@
+ import numpy as np
+
+ def binarize_adata(adata, source="X", target_layer="binary", threshold=0.8):
+ """
+ Binarize a dense matrix and preserve NaN.
+ source: "X" or layer name
+ """
+ X = adata.X if source == "X" else adata.layers[source]
+
+ # Copy to avoid modifying original in-place
+ X_bin = X.copy()
+
+ # Where not NaN: apply threshold
+ mask = ~np.isnan(X_bin)
+ X_bin[mask] = (X_bin[mask] > threshold).astype(np.int8)
+
+ adata.layers[target_layer] = X_bin
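
A quick check of the new helper's contract: values above `threshold` become 1, values at or below become 0, and NaN stays NaN (toy data; the function assumes a dense float matrix, as its `np.isnan` call implies):

    import numpy as np
    import anndata as ad
    from smftools.preprocessing import binarize_adata

    toy = ad.AnnData(X=np.array([[0.9, 0.1], [np.nan, 0.85]]))
    binarize_adata(toy, source='X', target_layer='binary', threshold=0.8)
    print(toy.layers['binary'])  # [[ 1.  0.] [nan  1.]]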
smftools/preprocessing/binarize_on_Youden.py

@@ -1,4 +1,6 @@
- def binarize_on_Youden(adata, obs_column='Reference'):
+ def binarize_on_Youden(adata,
+ ref_column='Reference_strand',
+ output_layer_name='binarized_methylation'):
  """
  Binarize SMF values based on position thresholds determined by calculate_position_Youden.

@@ -16,18 +18,18 @@ def binarize_on_Youden(adata, obs_column='Reference'):
  binarized_methylation = np.full_like(adata.X, np.nan, dtype=float) # Keeps same shape as adata.X

  # Get unique categories
- categories = adata.obs[obs_column].cat.categories
+ references = adata.obs[ref_column].cat.categories

- for cat in categories:
+ for ref in references:
  # Select subset for this category
- cat_mask = adata.obs[obs_column] == cat
- cat_subset = adata[cat_mask]
+ ref_mask = adata.obs[ref_column] == ref
+ ref_subset = adata[ref_mask]

  # Extract the probability matrix
- original_matrix = cat_subset.X.copy()
+ original_matrix = ref_subset.X.copy()

  # Extract the thresholds for each position efficiently
- thresholds = np.array(cat_subset.var[f'{cat}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))
+ thresholds = np.array(ref_subset.var[f'{ref}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))

  # Identify NaN values
  nan_mask = np.isnan(original_matrix)

@@ -39,7 +41,7 @@ def binarize_on_Youden(adata, obs_column='Reference'):
  binarized_matrix[nan_mask] = np.nan

  # Assign the binarized values back into the preallocated storage
- binarized_methylation[cat_mask, :] = binarized_matrix
+ binarized_methylation[ref_mask, :] = binarized_matrix

  # Store the binarized matrix in a new layer
- adata.layers['binarized_methylation'] = binarized_methylation
+ adata.layers[output_layer_name] = binarized_methylation
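
Because the output layer name is now a parameter, a pipeline can route alternative binarizations into separately named layers. A hedged sketch, assuming `calculate_position_Youden` has already written the `{ref}_position_methylation_thresholding_Youden_stats` columns this function reads:

    import smftools.preprocessing as pp

    pp.binarize_on_Youden(adata, ref_column='Reference_strand',
                          output_layer_name='binarized_methylation')
    adata.layers['binarized_methylation']  # NaN wherever the read had no call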
smftools/preprocessing/calculate_complexity_II.py

@@ -11,7 +11,7 @@ def calculate_complexity_II(
  n_depths=12,
  random_state=0,
  csv_summary=True,
- uns_flag='complexity_analysis_complete',
+ uns_flag='calculate_complexity_II_performed',
  force_redo=False,
  bypass=False
  ):
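
The rename follows a convention applied across 0.2.4: each preprocessing step records completion in `adata.uns` under `<function_name>_performed` (the same pattern appears in `append_base_context`, `calculate_coverage`, and the filters below), which makes partially processed objects easy to audit:

    # List the 0.2.4-style completion flags set on this AnnData
    done = sorted(k for k, v in adata.uns.items() if k.endswith('_performed') and v)
    print(done)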
smftools/preprocessing/calculate_coverage.py

@@ -1,4 +1,7 @@
- def calculate_coverage(adata, obs_column='Reference_strand', position_nan_threshold=0.00001, uns_flag='positional_coverage_calculated'):
+ def calculate_coverage(adata,
+ ref_column='Reference_strand',
+ position_nan_threshold=0.01,
+ uns_flag='calculate_coverage_performed'):
  """
  Append position-level metadata regarding whether the position is informative within the given observation category.

@@ -20,32 +23,32 @@
  # QC already performed; nothing to do
  return

- categories = adata.obs[obs_column].cat.categories
+ references = adata.obs[ref_column].cat.categories
  n_categories_with_position = np.zeros(adata.shape[1])

- # Loop over categories
- for cat in categories:
- print(f'Assessing positional coverage across samples for {cat} reference')
+ # Loop over references
+ for ref in references:
+ print(f'Assessing positional coverage across samples for {ref} reference')

  # Subset to current category
- cat_mask = adata.obs[obs_column] == cat
- temp_cat_adata = adata[cat_mask]
+ ref_mask = adata.obs[ref_column] == ref
+ temp_ref_adata = adata[ref_mask]

  # Compute fraction of valid coverage
- cat_valid_coverage = np.sum(~np.isnan(temp_cat_adata.X), axis=0)
- cat_valid_fraction = cat_valid_coverage / temp_cat_adata.shape[0] # Avoid extra computation
+ ref_valid_coverage = np.sum(~np.isnan(temp_ref_adata.X), axis=0)
+ ref_valid_fraction = ref_valid_coverage / temp_ref_adata.shape[0] # Avoid extra computation

  # Store coverage stats
- adata.var[f'{cat}_valid_fraction'] = pd.Series(cat_valid_fraction, index=adata.var.index)
+ adata.var[f'{ref}_valid_fraction'] = pd.Series(ref_valid_fraction, index=adata.var.index)

  # Assign whether the position is covered based on threshold
- adata.var[f'position_in_{cat}'] = cat_valid_fraction >= position_nan_threshold
+ adata.var[f'position_in_{ref}'] = ref_valid_fraction >= position_nan_threshold

  # Sum the number of categories covering each position
- n_categories_with_position += adata.var[f'position_in_{cat}'].values
+ n_categories_with_position += adata.var[f'position_in_{ref}'].values

  # Store final category count
- adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
+ adata.var[f'N_{ref_column}_with_position'] = n_categories_with_position.astype(int)

  # mark as done
  adata.uns[uns_flag] = True
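
Note that the default `position_nan_threshold` tightens from 1e-5 to 0.01, so a position now needs non-NaN signal in at least 1% of a reference's reads to count as present. The `adata.var` columns keep their existing names; for a hypothetical reference called 'locus1_top':

    import smftools.preprocessing as pp

    pp.calculate_coverage(adata, ref_column='Reference_strand', position_nan_threshold=0.01)
    adata.var['locus1_top_valid_fraction']         # fraction of reads with data at each position
    adata.var['position_in_locus1_top']            # bool: valid_fraction >= threshold
    adata.var['N_Reference_strand_with_position']  # int: references covering each position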
smftools/preprocessing/calculate_position_Youden.py

@@ -1,7 +1,15 @@
  ## calculate_position_Youden
-
  ## Calculating and applying position level thresholds for methylation calls to binarize the SMF data
- def calculate_position_Youden(adata, positive_control_sample='positive', negative_control_sample='negative', J_threshold=0.5, obs_column='Reference', infer_on_percentile=False, inference_variable='', save=False, output_directory=''):
+ def calculate_position_Youden(adata,
+ positive_control_sample=None,
+ negative_control_sample=None,
+ J_threshold=0.5,
+ ref_column='Reference_strand',
+ sample_column='Sample_names',
+ infer_on_percentile=True,
+ inference_variable='Raw_modification_signal',
+ save=False,
+ output_directory=''):
  """
  Adds new variable metadata to each position indicating whether the position provides reliable SMF methylation calls. Also outputs plots of the positional ROC curves.

@@ -26,28 +34,36 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
  from sklearn.metrics import roc_curve, roc_auc_score

  control_samples = [positive_control_sample, negative_control_sample]
- categories = adata.obs[obs_column].cat.categories
+ references = adata.obs[ref_column].cat.categories
  # Iterate over each category in the specified obs_column
- for cat in categories:
- print(f"Calculating position Youden statistics for {cat}")
+ for ref in references:
+ print(f"Calculating position Youden statistics for {ref}")
  # Subset to keep only reads associated with the category
- cat_subset = adata[adata.obs[obs_column] == cat]
+ ref_subset = adata[adata.obs[ref_column] == ref]
  # Iterate over positive and negative control samples
- for control in control_samples:
+ for i, control in enumerate(control_samples):
  # Initialize a dictionary for the given control sample. This will be keyed by dataset and position to point to a tuple of coordinate position and an array of methylation probabilities
- adata.uns[f'{cat}_position_methylation_dict_{control}'] = {}
- if infer_on_percentile:
- sorted_column = cat_subset.obs[inference_variable].sort_values(ascending=False)
- if control == "positive":
+ adata.uns[f'{ref}_position_methylation_dict_{control}'] = {}
+ # If controls are not passed and infer_on_percentile is True, infer thresholds based on top and bottom percentile windows for a given obs column metric.
+ if infer_on_percentile and not control:
+ sorted_column = ref_subset.obs[inference_variable].sort_values(ascending=False)
+ if i == 0:
+ control == 'positive'
+ positive_control_sample = control
  threshold = np.percentile(sorted_column, 100 - infer_on_percentile)
- control_subset = cat_subset[cat_subset.obs[inference_variable] >= threshold, :]
+ control_subset = ref_subset[ref_subset.obs[inference_variable] >= threshold, :]
  else:
+ control == 'negative'
+ negative_control_sample = control
  threshold = np.percentile(sorted_column, infer_on_percentile)
- control_subset = cat_subset[cat_subset.obs[inference_variable] <= threshold, :]
+ control_subset = ref_subset[ref_subset.obs[inference_variable] <= threshold, :]
+ elif not infer_on_percentile and not control:
+ print("Cannot threshold the AnnData on a Youden threshold. Either provide control samples or set infer_on_percentile to True.")
+ return
  else:
  # get the current control subset on the given category
- filtered_obs = cat_subset.obs[cat_subset.obs['Sample_names'].str.contains(control, na=False, regex=True)]
- control_subset = cat_subset[filtered_obs.index]
+ filtered_obs = ref_subset.obs[ref_subset.obs[sample_column] == control]
+ control_subset = ref_subset[filtered_obs.index]
  # Iterate through every position in the control subset
  for position in range(control_subset.shape[1]):
  # Get the coordinate name associated with that position

@@ -63,9 +79,9 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
  # Get fraction coverage
  fraction_coverage = position_coverage / control_subset.shape[0]
  # Save the position and the position methylation data for the control subset
- adata.uns[f'{cat}_position_methylation_dict_{control}'][f'{position}'] = (position, position_data, fraction_coverage)
+ adata.uns[f'{ref}_position_methylation_dict_{control}'][f'{position}'] = (position, position_data, fraction_coverage)

- for cat in categories:
+ for ref in references:
  fig, ax = plt.subplots(figsize=(6, 4))
  plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
  plt.xlabel('False Positive Rate')

@@ -76,13 +92,13 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
  n_total_positions = 0
  # Initialize a list that will hold the positional thresholds for the category
  probability_thresholding_list = [(np.nan, np.nan)] * adata.shape[1]
- for i, key in enumerate(adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'].keys()):
- position = int(adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][0])
- positive_position_array = adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][1]
- fraction_coverage = adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][2]
+ for i, key in enumerate(adata.uns[f'{ref}_position_methylation_dict_{positive_control_sample}'].keys()):
+ position = int(adata.uns[f'{ref}_position_methylation_dict_{positive_control_sample}'][key][0])
+ positive_position_array = adata.uns[f'{ref}_position_methylation_dict_{positive_control_sample}'][key][1]
+ fraction_coverage = adata.uns[f'{ref}_position_methylation_dict_{positive_control_sample}'][key][2]
  if fraction_coverage > 0.2:
  try:
- negative_position_array = adata.uns[f'{cat}_position_methylation_dict_{negative_control_sample}'][key][1]
+ negative_position_array = adata.uns[f'{ref}_position_methylation_dict_{negative_control_sample}'][key][1]
  # Combine the negative and positive control data
  data = np.concatenate([negative_position_array, positive_position_array])
  labels = np.array([0] * len(negative_position_array) + [1] * len(positive_position_array))

@@ -101,15 +117,15 @@ def calculate_position_Youden(adata, positive_control_sample='positive', negativ
  plt.plot(fpr, tpr, label='ROC curve')
  except:
  probability_thresholding_list[position] = (0.8, np.nan)
- title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {cat}'
+ title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {ref}'
  plt.title(title)
- save_name = output_directory + f'/{title}'
+ save_name = output_directory / f"{title}.png"
  if save:
  plt.savefig(save_name)
  plt.close()
  else:
  plt.show()

- adata.var[f'{cat}_position_methylation_thresholding_Youden_stats'] = probability_thresholding_list
+ adata.var[f'{ref}_position_methylation_thresholding_Youden_stats'] = probability_thresholding_list
  J_max_list = [probability_thresholding_list[i][1] for i in range(adata.shape[1])]
- adata.var[f'{cat}_position_passed_QC'] = [True if i > J_threshold else False for i in J_max_list]
+ adata.var[f'{ref}_position_passed_QC'] = [True if i > J_threshold else False for i in J_max_list]
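
Two behavioral changes are worth flagging: control samples are now matched by exact equality against `sample_column` rather than a regex substring search over 'Sample_names', and when no controls are passed the function falls back to percentile inference on `inference_variable`. A hedged sketch of both call styles (sample names are illustrative):

    import smftools.preprocessing as pp

    # Explicit controls, matched exactly in adata.obs['Sample_names']
    pp.calculate_position_Youden(adata,
                                 positive_control_sample='methylated_control',
                                 negative_control_sample='unmethylated_control',
                                 ref_column='Reference_strand')

    # No controls: treat the top and bottom 10% of a per-read metric as pseudo-controls
    pp.calculate_position_Youden(adata, infer_on_percentile=10,
                                 inference_variable='Raw_modification_signal')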
smftools/preprocessing/calculate_read_modification_stats.py

@@ -2,7 +2,7 @@ def calculate_read_modification_stats(adata,
  reference_column,
  sample_names_col,
  mod_target_bases,
- uns_flag="read_modification_stats_calculated",
+ uns_flag="calculate_read_modification_stats_performed",
  bypass=False,
  force_redo=False
  ):

@@ -36,7 +36,7 @@ def calculate_read_modification_stats(adata,
  site_types = []

  if any(base in mod_target_bases for base in ['GpC', 'CpG', 'C']):
- site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'any_C_site']
+ site_types += ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C_site', 'C_site']
  if 'A' in mod_target_bases:
  site_types += ['A_site']

smftools/preprocessing/filter_reads_on_length_quality_mapping.py

@@ -11,7 +11,7 @@ def filter_reads_on_length_quality_mapping(
  length_ratio: Optional[Sequence[float]] = None, # e.g. [min, max]
  read_quality: Optional[Sequence[float]] = None, # e.g. [min, max] (commonly min only)
  mapping_quality: Optional[Sequence[float]] = None, # e.g. [min, max] (commonly min only)
- uns_flag: str = "reads_removed_failing_length_quality_mapping_qc",
+ uns_flag: str = "filter_reads_on_length_quality_mapping_performed",
  bypass: bool = False,
  force_redo: bool = True
  ) -> ad.AnnData:
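
The flag rename leaves the filtering interface itself unchanged: each criterion is an optional [min, max] pair. A minimal sketch (threshold values are illustrative):

    import smftools.preprocessing as pp

    adata = pp.filter_reads_on_length_quality_mapping(
        adata,
        length_ratio=[0.8, 1.2],       # [min, max] read-length ratio
        read_quality=[20, 60],         # [min, max] read quality
        mapping_quality=[30, 60],      # [min, max] mapping quality
    )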