smftools 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. smftools/__init__.py +29 -0
  2. smftools/_settings.py +20 -0
  3. smftools/_version.py +1 -0
  4. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  5. smftools/datasets/F1_sample_sheet.csv +5 -0
  6. smftools/datasets/__init__.py +9 -0
  7. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  8. smftools/datasets/datasets.py +28 -0
  9. smftools/informatics/__init__.py +16 -0
  10. smftools/informatics/archived/bam_conversion.py +59 -0
  11. smftools/informatics/archived/bam_direct.py +63 -0
  12. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  13. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  14. smftools/informatics/basecall_pod5s.py +80 -0
  15. smftools/informatics/conversion_smf.py +132 -0
  16. smftools/informatics/direct_smf.py +137 -0
  17. smftools/informatics/fast5_to_pod5.py +21 -0
  18. smftools/informatics/helpers/LoadExperimentConfig.py +75 -0
  19. smftools/informatics/helpers/__init__.py +74 -0
  20. smftools/informatics/helpers/align_and_sort_BAM.py +59 -0
  21. smftools/informatics/helpers/aligned_BAM_to_bed.py +74 -0
  22. smftools/informatics/helpers/archived/informatics.py +260 -0
  23. smftools/informatics/helpers/archived/load_adata.py +516 -0
  24. smftools/informatics/helpers/bam_qc.py +66 -0
  25. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  26. smftools/informatics/helpers/binarize_converted_base_identities.py +79 -0
  27. smftools/informatics/helpers/canoncall.py +34 -0
  28. smftools/informatics/helpers/complement_base_list.py +21 -0
  29. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +55 -0
  30. smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
  31. smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
  32. smftools/informatics/helpers/count_aligned_reads.py +43 -0
  33. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  34. smftools/informatics/helpers/extract_base_identities.py +44 -0
  35. smftools/informatics/helpers/extract_mods.py +83 -0
  36. smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
  37. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  38. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  39. smftools/informatics/helpers/find_conversion_sites.py +50 -0
  40. smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
  41. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  42. smftools/informatics/helpers/get_native_references.py +28 -0
  43. smftools/informatics/helpers/index_fasta.py +12 -0
  44. smftools/informatics/helpers/make_dirs.py +21 -0
  45. smftools/informatics/helpers/make_modbed.py +27 -0
  46. smftools/informatics/helpers/modQC.py +27 -0
  47. smftools/informatics/helpers/modcall.py +36 -0
  48. smftools/informatics/helpers/modkit_extract_to_adata.py +884 -0
  49. smftools/informatics/helpers/ohe_batching.py +76 -0
  50. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  51. smftools/informatics/helpers/one_hot_decode.py +27 -0
  52. smftools/informatics/helpers/one_hot_encode.py +57 -0
  53. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +53 -0
  54. smftools/informatics/helpers/run_multiqc.py +28 -0
  55. smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
  56. smftools/informatics/helpers/split_and_index_BAM.py +36 -0
  57. smftools/informatics/load_adata.py +182 -0
  58. smftools/informatics/readwrite.py +106 -0
  59. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  60. smftools/informatics/subsample_pod5.py +104 -0
  61. smftools/plotting/__init__.py +15 -0
  62. smftools/plotting/classifiers.py +355 -0
  63. smftools/plotting/general_plotting.py +205 -0
  64. smftools/plotting/position_stats.py +462 -0
  65. smftools/preprocessing/__init__.py +33 -0
  66. smftools/preprocessing/append_C_context.py +82 -0
  67. smftools/preprocessing/archives/mark_duplicates.py +146 -0
  68. smftools/preprocessing/archives/preprocessing.py +614 -0
  69. smftools/preprocessing/archives/remove_duplicates.py +21 -0
  70. smftools/preprocessing/binarize_on_Youden.py +45 -0
  71. smftools/preprocessing/binary_layers_to_ohe.py +40 -0
  72. smftools/preprocessing/calculate_complexity.py +72 -0
  73. smftools/preprocessing/calculate_consensus.py +47 -0
  74. smftools/preprocessing/calculate_converted_read_methylation_stats.py +94 -0
  75. smftools/preprocessing/calculate_coverage.py +42 -0
  76. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  77. smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
  78. smftools/preprocessing/calculate_position_Youden.py +115 -0
  79. smftools/preprocessing/calculate_read_length_stats.py +79 -0
  80. smftools/preprocessing/clean_NaN.py +46 -0
  81. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  82. smftools/preprocessing/filter_converted_reads_on_methylation.py +44 -0
  83. smftools/preprocessing/filter_reads_on_length.py +51 -0
  84. smftools/preprocessing/flag_duplicate_reads.py +149 -0
  85. smftools/preprocessing/invert_adata.py +30 -0
  86. smftools/preprocessing/load_sample_sheet.py +38 -0
  87. smftools/preprocessing/make_dirs.py +21 -0
  88. smftools/preprocessing/min_non_diagonal.py +25 -0
  89. smftools/preprocessing/recipes.py +127 -0
  90. smftools/preprocessing/subsample_adata.py +58 -0
  91. smftools/readwrite.py +198 -0
  92. smftools/tools/__init__.py +49 -0
  93. smftools/tools/apply_hmm.py +202 -0
  94. smftools/tools/apply_hmm_batched.py +241 -0
  95. smftools/tools/archived/classify_methylated_features.py +66 -0
  96. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  97. smftools/tools/archived/subset_adata_v1.py +32 -0
  98. smftools/tools/archived/subset_adata_v2.py +46 -0
  99. smftools/tools/calculate_distances.py +18 -0
  100. smftools/tools/calculate_umap.py +62 -0
  101. smftools/tools/call_hmm_peaks.py +105 -0
  102. smftools/tools/classifiers.py +787 -0
  103. smftools/tools/cluster_adata_on_methylation.py +105 -0
  104. smftools/tools/data/__init__.py +2 -0
  105. smftools/tools/data/anndata_data_module.py +90 -0
  106. smftools/tools/data/preprocessing.py +6 -0
  107. smftools/tools/display_hmm.py +18 -0
  108. smftools/tools/evaluation/__init__.py +0 -0
  109. smftools/tools/general_tools.py +69 -0
  110. smftools/tools/hmm_readwrite.py +16 -0
  111. smftools/tools/inference/__init__.py +1 -0
  112. smftools/tools/inference/lightning_inference.py +41 -0
  113. smftools/tools/models/__init__.py +9 -0
  114. smftools/tools/models/base.py +14 -0
  115. smftools/tools/models/cnn.py +34 -0
  116. smftools/tools/models/lightning_base.py +41 -0
  117. smftools/tools/models/mlp.py +17 -0
  118. smftools/tools/models/positional.py +17 -0
  119. smftools/tools/models/rnn.py +16 -0
  120. smftools/tools/models/sklearn_models.py +40 -0
  121. smftools/tools/models/transformer.py +133 -0
  122. smftools/tools/models/wrappers.py +20 -0
  123. smftools/tools/nucleosome_hmm_refinement.py +104 -0
  124. smftools/tools/position_stats.py +239 -0
  125. smftools/tools/read_stats.py +70 -0
  126. smftools/tools/subset_adata.py +28 -0
  127. smftools/tools/train_hmm.py +78 -0
  128. smftools/tools/training/__init__.py +1 -0
  129. smftools/tools/training/train_lightning_model.py +47 -0
  130. smftools/tools/utils/__init__.py +2 -0
  131. smftools/tools/utils/device.py +10 -0
  132. smftools/tools/utils/grl.py +14 -0
  133. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/METADATA +5 -2
  134. smftools-0.1.7.dist-info/RECORD +136 -0
  135. smftools-0.1.6.dist-info/RECORD +0 -4
  136. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
  137. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,45 @@
+ def binarize_on_Youden(adata, obs_column='Reference'):
+     """
+     Binarize SMF values based on position thresholds determined by calculate_position_Youden.
+ 
+     Parameters:
+         adata (AnnData): The anndata object to binarize. `calculate_position_Youden` must have been run first.
+         obs_column (str): The obs column to stratify on. Must match what was passed to `calculate_position_Youden`.
+ 
+     Modifies:
+         Adds a new layer, `adata.layers['binarized_methylation']`, containing the binarized methylation matrix.
+     """
+     import numpy as np
+     import anndata as ad
+ 
+     # Initialize an empty matrix to store the binarized methylation values (same shape as adata.X)
+     binarized_methylation = np.full_like(adata.X, np.nan, dtype=float)
+ 
+     # Get unique categories
+     categories = adata.obs[obs_column].cat.categories
+ 
+     for cat in categories:
+         # Select the subset of reads for this category
+         cat_mask = adata.obs[obs_column] == cat
+         cat_subset = adata[cat_mask]
+ 
+         # Extract the probability matrix
+         original_matrix = cat_subset.X.copy()
+ 
+         # Extract the per-position thresholds (first element of each (threshold, max_J) tuple)
+         thresholds = np.array(cat_subset.var[f'{cat}_position_methylation_thresholding_Youden_stats'].apply(lambda x: x[0]))
+ 
+         # Identify NaN values
+         nan_mask = np.isnan(original_matrix)
+ 
+         # Binarize based on the thresholds
+         binarized_matrix = (original_matrix > thresholds).astype(float)
+ 
+         # Restore NaN values
+         binarized_matrix[nan_mask] = np.nan
+ 
+         # Assign the binarized values back into the preallocated storage
+         binarized_methylation[cat_mask, :] = binarized_matrix
+ 
+     # Store the binarized matrix in a new layer
+     adata.layers['binarized_methylation'] = binarized_methylation
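For orientation, a minimal usage sketch with toy data (the import path and the threshold column are assumptions; the tuple column stands in for real calculate_position_Youden output):

import anndata as ad
import numpy as np
import pandas as pd
from smftools.preprocessing import binarize_on_Youden  # assumes the helper is re-exported here

# Toy object: 4 reads x 3 positions of methylation probabilities.
adata = ad.AnnData(X=np.random.rand(4, 3))
adata.obs['Reference'] = pd.Categorical(['ref1'] * 4)
# Stand-in for calculate_position_Youden output: one (threshold, max_J) tuple per position.
adata.var['ref1_position_methylation_thresholding_Youden_stats'] = [(0.5, 0.8)] * 3

binarize_on_Youden(adata, obs_column='Reference')
print(adata.layers['binarized_methylation'])  # 0/1 calls, thresholded at 0.5 per position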
@@ -0,0 +1,40 @@
+ ## binary_layers_to_ohe
+ 
+ ## Conversion SMF Specific
+ def binary_layers_to_ohe(adata, binary_layers, stack='hstack'):
+     """
+     Stack per-base binary encoding layers into a one-hot encoding for each read.
+ 
+     Parameters:
+         adata (AnnData): Anndata object.
+         binary_layers (list): A list of strings, each naming a layer in the adata object. Each layer should encode a binary matrix.
+         stack (str): Dimension along which to stack the one-hot encoding. Options are 'hstack' and 'vstack'. Default is 'hstack', since this is more efficient.
+ 
+     Returns:
+         ohe_dict (dict): A dictionary keyed by obs_name that points to a stacked (hstack or vstack) one-hot encoding of the binary layers.
+     """
+     import numpy as np
+     import anndata as ad
+ 
+     # Ensure that the N layer is last!
+     # Grab all binary layers that are not encoding N
+     ACGT_binary_layers = [layer for layer in binary_layers if 'binary' in layer and layer != 'N_binary_encoding']
+     # If there is a binary layer encoding N, hold it in N_binary_layer
+     N_binary_layer = [layer for layer in binary_layers if layer == 'N_binary_encoding']
+     # Append the N_binary_encoding layer to the end of the list of binary layers
+     all_binary_layers = ACGT_binary_layers + N_binary_layer
+     print(f'Found {all_binary_layers} layers in adata')
+ 
+     # Extract the layers, then stack each read's rows across layers
+     layers = [adata.layers[layer_name] for layer_name in all_binary_layers]
+     n_reads = layers[0].shape[0]
+     ohe_dict = {}
+     for i in range(n_reads):
+         read_ohe = [layer[i] for layer in layers]
+         read_name = adata.obs_names[i]
+         if stack == 'hstack':
+             ohe_dict[read_name] = np.hstack(read_ohe)
+         elif stack == 'vstack':
+             ohe_dict[read_name] = np.vstack(read_ohe)
+     return ohe_dict
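A brief usage sketch on synthetic encodings (layer names follow the '{base}_binary_encoding' convention the function itself checks for; the toy data and import path are assumptions):

import anndata as ad
import numpy as np
from smftools.preprocessing import binary_layers_to_ohe  # assumed import path

# Toy object: 2 reads x 4 positions, one binary layer per base.
adata = ad.AnnData(X=np.zeros((2, 4)))
bases = ['A', 'C', 'G', 'T', 'N']
for base in bases:
    adata.layers[f'{base}_binary_encoding'] = np.random.randint(0, 2, size=(2, 4))

ohe = binary_layers_to_ohe(adata, [f'{b}_binary_encoding' for b in bases], stack='hstack')
first_read = next(iter(ohe.values()))
print(first_read.shape)  # (20,): 5 layers x 4 positions, with the N layer stacked last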
@@ -0,0 +1,72 @@
+ ## calculate_complexity
+ 
+ def calculate_complexity(adata, output_directory='', obs_column='Reference', sample_col='Sample_names', plot=True, save_plot=False):
+     """
+     A complexity analysis of the library.
+ 
+     Parameters:
+         adata (AnnData): An adata object with mark_duplicates already run.
+         output_directory (str): Path to the output directory.
+         obs_column (str): Name of the obs column to iterate over.
+         sample_col (str): Name of the sample column to iterate over.
+         plot (bool): Whether to plot the complexity model.
+         save_plot (bool): Whether to save the complexity plot.
+ 
+     Returns:
+         None
+     """
+     import numpy as np
+     import pandas as pd
+     from scipy.optimize import curve_fit
+ 
+     def lander_waterman(x, C0):
+         return C0 * (1 - np.exp(-x / C0))
+ 
+     def count_unique_reads(reads, depth):
+         subsample = np.random.choice(reads, depth, replace=False)
+         return len(np.unique(subsample))
+ 
+     categories = adata.obs[obs_column].cat.categories
+     sample_names = adata.obs[sample_col].cat.categories
+ 
+     for cat in categories:
+         for sample in sample_names:
+             unique_reads = adata.uns[f'Hamming_distance_cluster_count_within_{cat}_{sample}']
+             total_reads = adata.uns[f'total_reads_within_{cat}_{sample}']
+             reads = np.concatenate((np.arange(unique_reads), np.random.choice(unique_reads, total_reads - unique_reads, replace=True)))
+             # Subsampling depths
+             subsampling_depths = [total_reads // (i + 1) for i in range(10)]
+             # Arrays to store results
+             subsampled_total_reads = []
+             subsampled_unique_reads = []
+             # Perform subsampling
+             for depth in subsampling_depths:
+                 unique_count = count_unique_reads(reads, depth)
+                 subsampled_total_reads.append(depth)
+                 subsampled_unique_reads.append(unique_count)
+             # Fit the Lander-Waterman model to the data
+             popt, _ = curve_fit(lander_waterman, subsampled_total_reads, subsampled_unique_reads)
+             # Generate data for the complexity curve
+             x_data = np.linspace(0, 5000, 100)
+             y_data = lander_waterman(x_data, *popt)
+             adata.uns[f'Library_complexity_of_{sample}_on_{cat}'] = popt[0]
+             if plot:
+                 import matplotlib.pyplot as plt
+                 # Plot the complexity curve against the observed subsampled points
+                 plt.figure(figsize=(6, 4))
+                 plt.plot(subsampled_total_reads, subsampled_unique_reads, 'o', label='Observed unique reads')
+                 plt.plot(x_data, y_data, '-', label=f'Lander-Waterman fit\nEstimated C0 = {popt[0]:.2f}')
+                 plt.xlabel('Total number of reads')
+                 plt.ylabel('Number of unique reads')
+                 title = f'Library Complexity Analysis for {sample} on {cat}'
+                 plt.title(title)
+                 plt.legend()
+                 plt.grid(True)
+                 if save_plot:
+                     from .. import readwrite  # timestamp helper used elsewhere in the package
+                     save_name = output_directory + f'/{readwrite.date_string()}_{title}'
+                     plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
+                     plt.close()
+                 else:
+                     plt.show()
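A hedged usage sketch; the uns keys mirror those the function reads, with made-up counts standing in for real duplicate-detection output:

import anndata as ad
import numpy as np
import pandas as pd
from smftools.preprocessing import calculate_complexity  # assumed import path

adata = ad.AnnData(X=np.zeros((10, 3)))
adata.obs['Reference'] = pd.Categorical(['ref1'] * 10)
adata.obs['Sample_names'] = pd.Categorical(['s1'] * 10)
# Stand-ins for the counts normally written upstream (e.g., by duplicate flagging).
adata.uns['Hamming_distance_cluster_count_within_ref1_s1'] = 800
adata.uns['total_reads_within_ref1_s1'] = 1000

calculate_complexity(adata, obs_column='Reference', sample_col='Sample_names', plot=False)
print(adata.uns['Library_complexity_of_s1_on_ref1'])  # fitted C0 estimate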
@@ -0,0 +1,47 @@
+ # calculate_consensus
+ 
+ def calculate_consensus(adata, reference, sample=False, reference_column='Reference', sample_column='Sample'):
+     """
+     Takes an input AnnData object, the reference to subset on, and optionally a sample name to subset on, and calculates the consensus sequence of the read set.
+ 
+     Parameters:
+         adata (AnnData): The input adata to append consensus metadata to.
+         reference (str): The name of the reference to subset the adata on.
+         sample (bool | str): If False, uses all samples. If a string is passed, the adata is further subsetted to only analyze that sample.
+         reference_column (str): The name of the reference column (default is 'Reference').
+         sample_column (str): The name of the sample column (default is 'Sample').
+ 
+     Returns:
+         None
+     """
+     import numpy as np
+ 
+     # Subset the adata on the reference of interest. Optionally, subset additionally on a sample of interest.
+     record_subset = adata[adata.obs[reference_column] == reference].copy()
+     if sample:
+         record_subset = record_subset[record_subset.obs[sample_column] == sample].copy()
+ 
+     # Grab layer names from the adata object that correspond to the binary encodings of the read sequences.
+     layers = [layer for layer in record_subset.layers if '_binary_' in layer]
+     layer_map, layer_counts = {}, []
+     for i, layer in enumerate(layers):
+         # Integer mapping from row index to the base identity that the binary layer encodes
+         layer_map[i] = layer.split('_')[0]
+         # Get the positional counts from all reads for the given base identity.
+         layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
+     # Combine the positional counts arrays derived from each binary base layer into an ndarray
+     count_array = np.array(layer_counts)
+     # Determine the row index with the largest count at each position.
+     nucleotide_indexes = np.argmax(count_array, axis=0)
+     # Map the row indexes back to base identities to obtain the consensus sequence as a list.
+     consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
+ 
+     if sample:
+         adata.var[f'{reference}_consensus_from_{sample}'] = consensus_sequence_list
+     else:
+         adata.var[f'{reference}_consensus_across_samples'] = consensus_sequence_list
+ 
+     adata.uns[f'{reference}_consensus_sequence'] = consensus_sequence_list
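A short, illustrative usage sketch (import path assumed; random encodings stand in for real reads):

import anndata as ad
import numpy as np
from smftools.preprocessing import calculate_consensus  # assumed import path

adata = ad.AnnData(X=np.zeros((3, 4)))
adata.obs['Reference'] = ['ref1'] * 3
# One binary layer per base, matching the '_binary_' naming the function filters on.
for base in ['A', 'C', 'G', 'T']:
    adata.layers[f'{base}_binary_encoding'] = np.random.randint(0, 2, size=(3, 4))

calculate_consensus(adata, reference='ref1')
print(adata.uns['ref1_consensus_sequence'])  # e.g. ['A', 'G', 'C', 'T']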
@@ -0,0 +1,94 @@
+ ## calculate_converted_read_methylation_stats
+ 
+ ## Conversion SMF Specific
+ # Read methylation QC
+ 
+ def calculate_converted_read_methylation_stats(adata, reference_column, sample_names_col):
+     """
+     Adds methylation statistics for each read. Indicates whether the read's GpC methylation exceeded its other_C methylation (background false positives).
+ 
+     Parameters:
+         adata (AnnData): An adata object
+         reference_column (str): Name of the Reference column to use
+         sample_names_col (str): Name of the sample name column to use
+ 
+     Returns:
+         None
+     """
+     import numpy as np
+     import anndata as ad
+     import pandas as pd
+ 
+     print('Calculating read level methylation statistics')
+ 
+     references = set(adata.obs[reference_column])
+     sample_names = set(adata.obs[sample_names_col])
+ 
+     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
+ 
+     for site_type in site_types:
+         adata.obs[f'{site_type}_row_methylation_sums'] = pd.Series(0, index=adata.obs_names, dtype=int)
+         adata.obs[f'{site_type}_row_methylation_means'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
+         adata.obs[f'number_valid_{site_type}_in_read'] = pd.Series(0, index=adata.obs_names, dtype=int)
+         adata.obs[f'fraction_valid_{site_type}_in_range'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
+     for cat in references:
+         cat_subset = adata[adata.obs[reference_column] == cat].copy()
+         for site_type in site_types:
+             print(f'Iterating over {cat}_{site_type}')
+             observation_matrix = cat_subset.obsm[f'{cat}_{site_type}']
+             number_valid_positions_in_read = np.sum(~np.isnan(observation_matrix), axis=1)
+             row_methylation_sums = np.nansum(observation_matrix, axis=1)
+             # Guard against division by zero for reads with no valid positions
+             number_valid_positions_in_read[number_valid_positions_in_read == 0] = 1
+             fraction_valid_positions_in_range = number_valid_positions_in_read / np.max(number_valid_positions_in_read)
+             row_methylation_means = np.divide(row_methylation_sums, number_valid_positions_in_read)
+             temp_obs_data = pd.DataFrame({f'number_valid_{site_type}_in_read': number_valid_positions_in_read,
+                                           f'fraction_valid_{site_type}_in_range': fraction_valid_positions_in_range,
+                                           f'{site_type}_row_methylation_sums': row_methylation_sums,
+                                           f'{site_type}_row_methylation_means': row_methylation_means}, index=cat_subset.obs.index)
+             adata.obs.update(temp_obs_data)
+     # Indicate whether the read-level GpC methylation rate exceeds the false methylation rate of the read
+     pass_array = np.array(adata.obs['GpC_site_row_methylation_means'] > adata.obs['other_C_row_methylation_means'])
+     adata.obs['GpC_above_other_C'] = pd.Series(pass_array, index=adata.obs.index, dtype=bool)
+ 
+     # Below should be a plotting function
+     # adata.uns['methylation_dict'] = {}
+     # n_bins = 50
+     # site_types_to_analyze = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
+ 
+     # for reference in references:
+     #     reference_adata = adata[adata.obs[reference_column] == reference].copy()
+     #     split_reference = reference.split('_')[0][1:]
+     #     for sample in sample_names:
+     #         sample_adata = reference_adata[reference_adata.obs[sample_names_col] == sample].copy()
+     #         for site_type in site_types_to_analyze:
+     #             methylation_data = sample_adata.obs[f'{site_type}_row_methylation_means']
+     #             max_meth = np.max(sample_adata.obs[f'{site_type}_row_methylation_sums'])
+     #             if not np.isnan(max_meth):
+     #                 n_bins = int(max_meth // 2)
+     #             else:
+     #                 n_bins = 1
+     #             mean = np.mean(methylation_data)
+     #             median = np.median(methylation_data)
+     #             stdev = np.std(methylation_data)
+     #             adata.uns['methylation_dict'][f'{reference}_{sample}_{site_type}'] = [mean, median, stdev]
+     #             if show_methylation_histogram or save_methylation_histogram:
+     #                 fig, ax = plt.subplots(figsize=(6, 4))
+     #                 count, bins, patches = plt.hist(methylation_data, bins=n_bins, weights=np.ones(len(methylation_data)) / len(methylation_data), alpha=0.7, color='blue', edgecolor='black')
+     #                 plt.axvline(median, color='red', linestyle='dashed', linewidth=1)
+     #                 plt.text(median + stdev, max(count)*0.8, f'Median: {median:.2f}', color='red')
+     #                 plt.axvline(median - stdev, color='green', linestyle='dashed', linewidth=1, label=f'Stdev: {stdev:.2f}')
+     #                 plt.axvline(median + stdev, color='green', linestyle='dashed', linewidth=1)
+     #                 plt.text(median + stdev + 0.05, max(count) / 3, f'+1 Stdev: {stdev:.2f}', color='green')
+     #                 plt.xlabel('Fraction methylated')
+     #                 plt.ylabel('Proportion')
+     #                 title = f'Distribution of {methylation_data.shape[0]} read {site_type} methylation means \nfor {sample} sample on {split_reference} after filtering'
+     #                 plt.title(title, pad=20)
+     #                 plt.xlim(-0.05, 1.05)  # Set x-axis range from 0 to 1
+     #                 ax.spines['right'].set_visible(False)
+     #                 ax.spines['top'].set_visible(False)
+     #                 save_name = output_directory + f'/{readwrite.date_string()} {title}'
+     #                 if save_methylation_histogram:
+     #                     plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
+     #                     plt.close()
+     #                 else:
+     #                     plt.show()
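An illustrative call with stand-in obsm matrices (in a real run these are populated upstream during site-type extraction; the names and values here are toy assumptions):

import anndata as ad
import numpy as np
from smftools.preprocessing import calculate_converted_read_methylation_stats  # assumed import path

rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.random((5, 8)))
adata.obs['Reference'] = ['ref1'] * 5
adata.obs['Sample_names'] = ['s1'] * 5
# Toy per-site-type methylation matrices keyed as '{reference}_{site_type}'.
for site_type in ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']:
    adata.obsm[f'ref1_{site_type}'] = rng.random((5, 4))

calculate_converted_read_methylation_stats(adata, 'Reference', 'Sample_names')
print(adata.obs[['GpC_site_row_methylation_means', 'GpC_above_other_C']])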
@@ -0,0 +1,42 @@
+ def calculate_coverage(adata, obs_column='Reference_strand', position_nan_threshold=0.05):
+     """
+     Append position-level metadata indicating whether each position is informative within the given observation category.
+ 
+     Parameters:
+         adata (AnnData): An AnnData object
+         obs_column (str): Observation column to subset on prior to calculating position statistics for each category.
+         position_nan_threshold (float): Minimal fractional coverage within the obs_column category required to call the position valid.
+ 
+     Modifies:
+         - Adds new columns to `adata.var` containing coverage statistics.
+     """
+     import numpy as np
+     import pandas as pd
+     import anndata as ad
+ 
+     categories = adata.obs[obs_column].cat.categories
+     n_categories_with_position = np.zeros(adata.shape[1])
+ 
+     # Loop over categories
+     for cat in categories:
+         print(f'Assessing positional coverage across samples for {cat} reference')
+ 
+         # Subset to the current category
+         cat_mask = adata.obs[obs_column] == cat
+         temp_cat_adata = adata[cat_mask]
+ 
+         # Compute the fraction of reads with valid (non-NaN) coverage at each position
+         cat_valid_coverage = np.sum(~np.isnan(temp_cat_adata.X), axis=0)
+         cat_valid_fraction = cat_valid_coverage / temp_cat_adata.shape[0]
+ 
+         # Store coverage stats
+         adata.var[f'{cat}_valid_fraction'] = pd.Series(cat_valid_fraction, index=adata.var.index)
+ 
+         # Assign whether the position is covered based on the threshold
+         adata.var[f'position_in_{cat}'] = cat_valid_fraction >= position_nan_threshold
+ 
+         # Sum the number of categories covering each position
+         n_categories_with_position += adata.var[f'position_in_{cat}'].values
+ 
+     # Store the final per-position category count
+     adata.var[f'N_{obs_column}_with_position'] = n_categories_with_position.astype(int)
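A compact sketch showing the coverage flags on toy data (import path assumed):

import anndata as ad
import numpy as np
import pandas as pd
from smftools.preprocessing import calculate_coverage  # assumed import path

X = np.array([[0.1, np.nan, 0.9],
              [0.2, np.nan, np.nan],
              [np.nan, np.nan, 0.4]])
adata = ad.AnnData(X=X)
adata.obs['Reference_strand'] = pd.Categorical(['top'] * 3)

calculate_coverage(adata, obs_column='Reference_strand', position_nan_threshold=0.5)
# Column 1 is NaN in every read, so it fails the 0.5 coverage threshold.
print(adata.var[['top_valid_fraction', 'position_in_top', 'N_Reference_strand_with_position']])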
@@ -0,0 +1,49 @@
+ # calculate_pairwise_differences
+ 
+ def calculate_pairwise_differences(arrays):
+     """
+     Calculate the pairwise differences for a list of h-stacked ndarrays, ignoring N positions.
+ 
+     Parameters:
+         arrays (list): A list of ndarrays, each an h-stacked one-hot encoding.
+ 
+     Returns:
+         distance_matrix (ndarray): A 2D array containing the pairwise differences between all arrays.
+     """
+     import numpy as np
+     from tqdm import tqdm
+ 
+     num_arrays = len(arrays)
+ 
+     # Each h-stacked array encodes five rows: one per base (A, C, G, T) plus N
+     n_rows = 5
+     reshaped_arrays = [array.reshape(n_rows, -1) for array in arrays]
+     N_masks = [array[-1].astype(bool) for array in reshaped_arrays]
+     reshaped_arrays_minus_N = [array[:-1].flatten() for array in reshaped_arrays]
+ 
+     # Precompute the repeated N masks to avoid repeated computations
+     repeated_N_masks = [np.tile(N_mask, n_rows - 1) for N_mask in N_masks]
+ 
+     # Initialize the distance matrix
+     distance_matrix = np.zeros((num_arrays, num_arrays), dtype=np.float32)
+ 
+     # Calculate pairwise distances with a progress bar
+     for i in tqdm(range(num_arrays), desc="Calculating Pairwise Differences"):
+         array_i = reshaped_arrays_minus_N[i]
+         N_mask_i = repeated_N_masks[i]
+ 
+         for j in range(i + 1, num_arrays):
+             array_j = reshaped_arrays_minus_N[j]
+             N_mask_j = repeated_N_masks[j]
+ 
+             # Combined mask to ignore N positions in either read
+             combined_mask = N_mask_i | N_mask_j
+ 
+             # Calculate the normalized difference directly with boolean operations
+             differences = (array_i != array_j) & ~combined_mask
+             distance = np.sum(differences) / np.sum(~combined_mask)
+ 
+             # Store the symmetric distances
+             distance_matrix[i, j] = distance
+             distance_matrix[j, i] = distance
+ 
+     return distance_matrix
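A toy invocation; each input mimics the hstack output of binary_layers_to_ohe (four flattened base rows plus an all-zero N row, so no positions are masked here):

import numpy as np
from smftools.preprocessing import calculate_pairwise_differences  # assumed import path

rng = np.random.default_rng(0)
n_positions = 10
arrays = []
for _ in range(4):
    acgt = rng.integers(0, 2, size=4 * n_positions)  # A, C, G, T rows, flattened
    n_row = np.zeros(n_positions, dtype=int)         # no ambiguous N bases in this toy
    arrays.append(np.concatenate([acgt, n_row]))

dist = calculate_pairwise_differences(arrays)
print(dist.shape)  # (4, 4), symmetric with a zero diagonal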
@@ -0,0 +1,27 @@
+ ## calculate_pairwise_hamming_distances
+ 
+ ## Conversion SMF Specific
+ def calculate_pairwise_hamming_distances(arrays):
+     """
+     Calculate the pairwise Hamming distances for a list of h-stacked ndarrays.
+ 
+     Parameters:
+         arrays (list): A list of ndarrays.
+ 
+     Returns:
+         distance_matrix (ndarray): A 2D array containing the pairwise Hamming distances between all arrays.
+     """
+     import numpy as np
+     from tqdm import tqdm
+     from scipy.spatial.distance import hamming
+ 
+     num_arrays = len(arrays)
+     # Initialize an empty distance matrix
+     distance_matrix = np.zeros((num_arrays, num_arrays))
+     # Calculate pairwise distances with a progress bar
+     for i in tqdm(range(num_arrays), desc="Calculating Hamming Distances"):
+         for j in range(i + 1, num_arrays):
+             distance = hamming(arrays[i], arrays[j])
+             distance_matrix[i, j] = distance
+             distance_matrix[j, i] = distance
+     return distance_matrix
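And the unmasked variant on the same kind of input (toy data; import path assumed):

import numpy as np
from smftools.preprocessing import calculate_pairwise_hamming_distances  # assumed import path

rng = np.random.default_rng(1)
arrays = [rng.integers(0, 2, size=20) for _ in range(3)]
dist = calculate_pairwise_hamming_distances(arrays)
print(dist)  # symmetric 3 x 3 matrix of normalized Hamming distances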
@@ -0,0 +1,115 @@
+ ## calculate_position_Youden
+ 
+ ## Calculating and applying position-level thresholds for methylation calls to binarize the SMF data
+ def calculate_position_Youden(adata, positive_control_sample='positive', negative_control_sample='negative', J_threshold=0.5, obs_column='Reference', infer_on_percentile=False, inference_variable='', save=False, output_directory=''):
+     """
+     Adds new variable metadata to each position indicating whether the position provides reliable SMF methylation calls. Also outputs plots of the positional ROC curves.
+ 
+     Parameters:
+         adata (AnnData): An AnnData object.
+         positive_control_sample (str): Sample name corresponding to the plus-MTase control sample.
+         negative_control_sample (str): Sample name corresponding to the minus-MTase control sample.
+         J_threshold (float): The minimum Youden J statistic a position must reach to pass QC for methylation calls.
+         obs_column (str): The category to iterate over.
+         infer_on_percentile (bool | int): If False, use the defined positive and negative control samples. If an int (0 < int < 100) is passed, use the top and bottom int percentile of methylated reads based on the metric in the inference_variable column.
+         inference_variable (str): If infer_on_percentile is an integer, the AnnData observation column name to use as the metric.
+         save (bool): Whether to save the ROC plots.
+         output_directory (str): Path to the output directory for the ROC curves.
+ 
+     Returns:
+         None
+     """
+     import numpy as np
+     import pandas as pd
+     import anndata as ad
+     import matplotlib.pyplot as plt
+     from sklearn.metrics import roc_curve, roc_auc_score
+ 
+     control_samples = [positive_control_sample, negative_control_sample]
+     categories = adata.obs[obs_column].cat.categories
+     # Iterate over each category in the specified obs_column
+     for cat in categories:
+         print(f"Calculating position Youden statistics for {cat}")
+         # Subset to keep only reads associated with the category
+         cat_subset = adata[adata.obs[obs_column] == cat]
+         # Iterate over the positive and negative control samples
+         for control in control_samples:
+             # Initialize a dictionary for the given control sample, keyed by position, pointing to a tuple of (position, methylation probabilities, fraction coverage)
+             adata.uns[f'{cat}_position_methylation_dict_{control}'] = {}
+             if infer_on_percentile:
+                 sorted_column = cat_subset.obs[inference_variable].sort_values(ascending=False)
+                 if control == "positive":
+                     threshold = np.percentile(sorted_column, 100 - infer_on_percentile)
+                     control_subset = cat_subset[cat_subset.obs[inference_variable] >= threshold, :]
+                 else:
+                     threshold = np.percentile(sorted_column, infer_on_percentile)
+                     control_subset = cat_subset[cat_subset.obs[inference_variable] <= threshold, :]
+             else:
+                 # Get the current control subset for the given category
+                 filtered_obs = cat_subset.obs[cat_subset.obs['Sample_names'].str.contains(control, na=False, regex=True)]
+                 control_subset = cat_subset[filtered_obs.index]
+             # Iterate through every position in the control subset
+             for position in range(control_subset.shape[1]):
+                 # Get the coordinate name associated with that position
+                 coordinate = control_subset.var_names[position]
+                 # Get the array of methylation probabilities for each read in the subset at that position
+                 position_data = control_subset.X[:, position]
+                 # Get the indexes of everywhere that is not a NaN value
+                 nan_mask = ~np.isnan(position_data)
+                 # Keep only the methylation data that has real values
+                 position_data = position_data[nan_mask]
+                 # Get the position data coverage
+                 position_coverage = len(position_data)
+                 # Get the fraction coverage
+                 fraction_coverage = position_coverage / control_subset.shape[0]
+                 # Save the position and the position methylation data for the control subset
+                 adata.uns[f'{cat}_position_methylation_dict_{control}'][f'{position}'] = (position, position_data, fraction_coverage)
+ 
+     for cat in categories:
+         fig, ax = plt.subplots(figsize=(6, 4))
+         plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
+         plt.xlabel('False Positive Rate')
+         plt.ylabel('True Positive Rate')
+         ax.spines['right'].set_visible(False)
+         ax.spines['top'].set_visible(False)
+         n_passed_positions = 0
+         n_total_positions = 0
+         # Initialize a list that will hold the positional thresholds for the category
+         probability_thresholding_list = [(np.nan, np.nan)] * adata.shape[1]
+         for key in adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}']:
+             position = int(adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][0])
+             positive_position_array = adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][1]
+             fraction_coverage = adata.uns[f'{cat}_position_methylation_dict_{positive_control_sample}'][key][2]
+             if fraction_coverage > 0.2:
+                 try:
+                     negative_position_array = adata.uns[f'{cat}_position_methylation_dict_{negative_control_sample}'][key][1]
+                     # Combine the negative and positive control data
+                     data = np.concatenate([negative_position_array, positive_position_array])
+                     labels = np.array([0] * len(negative_position_array) + [1] * len(positive_position_array))
+                     # Calculate the ROC curve
+                     fpr, tpr, thresholds = roc_curve(labels, data)
+                     # Calculate Youden's J statistic
+                     J = tpr - fpr
+                     optimal_idx = np.argmax(J)
+                     optimal_threshold = thresholds[optimal_idx]
+                     max_J = np.max(J)
+                     data_tuple = (optimal_threshold, max_J)
+                     probability_thresholding_list[position] = data_tuple
+                     n_total_positions += 1
+                     if max_J > J_threshold:
+                         n_passed_positions += 1
+                         plt.plot(fpr, tpr, label='ROC curve')
+                 except Exception:
+                     # Fall back to a default threshold when the negative control lacks data for this position
+                     probability_thresholding_list[position] = (0.8, np.nan)
+         title = f'ROC Curve for {n_passed_positions} positions with J-stat greater than {J_threshold}\n out of {n_total_positions} total positions on {cat}'
+         plt.title(title)
+         save_name = output_directory + f'/{title}'
+         if save:
+             plt.savefig(save_name)
+             plt.close()
+         else:
+             plt.show()
+ 
+         adata.var[f'{cat}_position_methylation_thresholding_Youden_stats'] = probability_thresholding_list
+         J_max_list = [probability_thresholding_list[i][1] for i in range(adata.shape[1])]
+         adata.var[f'{cat}_position_passed_QC'] = [J > J_threshold for J in J_max_list]
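A self-contained sketch with synthetic plus/minus MTase controls (sample naming and import path are assumptions; real data comes from the loading pipeline):

import anndata as ad
import numpy as np
import pandas as pd
from smftools.preprocessing import calculate_position_Youden  # assumed import path

rng = np.random.default_rng(0)
n_reads, n_pos = 40, 6
# Well-separated methylation probabilities for the two controls.
X = np.vstack([rng.uniform(0.6, 1.0, (n_reads // 2, n_pos)),
               rng.uniform(0.0, 0.4, (n_reads // 2, n_pos))])
adata = ad.AnnData(X=X)
adata.obs['Reference'] = pd.Categorical(['ref1'] * n_reads)
adata.obs['Sample_names'] = ['positive'] * (n_reads // 2) + ['negative'] * (n_reads // 2)

calculate_position_Youden(adata, positive_control_sample='positive',
                          negative_control_sample='negative',
                          J_threshold=0.5, obs_column='Reference', save=False)
print(adata.var['ref1_position_passed_QC'].all())  # True for cleanly separated controls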
@@ -0,0 +1,79 @@
+ ## calculate_read_length_stats
+ 
+ # Read length QC
+ def calculate_read_length_stats(adata, reference_column='', sample_names_col=''):
+     """
+     Append the first and last valid positions of each read, and from these determine and append the read length.
+ 
+     Parameters:
+         adata (AnnData): An adata object
+         reference_column (str): Name of the Reference column to use
+         sample_names_col (str): Name of the sample name column to use
+ 
+     Returns:
+         upper_bound (int): Last valid position in the dataset
+         lower_bound (int): First valid position in the dataset
+     """
+     import numpy as np
+     import anndata as ad
+     import pandas as pd
+ 
+     print('Calculating read length statistics')
+ 
+     references = set(adata.obs[reference_column])
+     sample_names = set(adata.obs[sample_names_col])
+ 
+     # Add basic observation-level (read-level) metadata to the object: the first and last valid position of each read, and the read length derived from them. Also saves two variables holding the first and last valid positions in the entire dataset.
+     read_first_valid_position = np.array([int(adata.var_names[i]) for i in np.argmax(~np.isnan(adata.X), axis=1)])
+     read_last_valid_position = np.array([int(adata.var_names[i]) for i in (adata.X.shape[1] - 1 - np.argmax(~np.isnan(adata.X[:, ::-1]), axis=1))])
+     read_length = read_last_valid_position - read_first_valid_position + 1
+ 
+     adata.obs['first_valid_position'] = pd.Series(read_first_valid_position, index=adata.obs.index, dtype=int)
+     adata.obs['last_valid_position'] = pd.Series(read_last_valid_position, index=adata.obs.index, dtype=int)
+     adata.obs['read_length'] = pd.Series(read_length, index=adata.obs.index, dtype=int)
+ 
+     # Define variables to hold the first and last valid positions in the dataset
+     upper_bound = int(np.nanmax(adata.obs['last_valid_position']))
+     lower_bound = int(np.nanmin(adata.obs['first_valid_position']))
+ 
+     return upper_bound, lower_bound
+ 
+     # # Add an unstructured element to the anndata object which points to a dictionary of read lengths keyed by reference and sample name. Points to a tuple containing (mean, median, stdev) of the read lengths of the sample for the given reference strand
+     # ## Plot a histogram of read length data and save the median and stdev of the read lengths for each sample.
+     # adata.uns['read_length_dict'] = {}
+ 
+     # for reference in references:
+     #     temp_reference_adata = adata[adata.obs[reference_column] == reference].copy()
+     #     split_reference = reference.split('_')[0][1:]
+     #     for sample in sample_names:
+     #         temp_sample_adata = temp_reference_adata[temp_reference_adata.obs[sample_names_col] == sample].copy()
+     #         temp_data = temp_sample_adata.obs['read_length']
+     #         max_length = np.max(temp_data)
+     #         mean = np.mean(temp_data)
+     #         median = np.median(temp_data)
+     #         stdev = np.std(temp_data)
+     #         adata.uns['read_length_dict'][f'{reference}_{sample}'] = [mean, median, stdev]
+     #         if not np.isnan(max_length):
+     #             n_bins = int(max_length // 100)
+     #         else:
+     #             n_bins = 1
+     #         if show_read_length_histogram or save_read_length_histogram:
+     #             plt.figure(figsize=(10, 6))
+     #             plt.text(median + 0.5, max(plt.hist(temp_data, bins=n_bins)[0]) / 2, f'Median: {median:.2f}', color='red')
+     #             plt.hist(temp_data, bins=n_bins, alpha=0.7, color='blue', edgecolor='black')
+     #             plt.xlabel('Read Length')
+     #             plt.ylabel('Count')
+     #             title = f'Read length distribution of {temp_sample_adata.shape[0]} total reads from {sample} sample on {split_reference} allele'
+     #             plt.title(title)
+     #             # Add a vertical line at the median
+     #             plt.axvline(median, color='red', linestyle='dashed', linewidth=1)
+     #             # Annotate the median
+     #             plt.xlim(lower_bound - 100, upper_bound + 100)
+     #             if save_read_length_histogram:
+     #                 save_name = output_directory + f'/{readwrite.date_string()} {title}'
+     #                 plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
+     #                 plt.close()
+     #             else:
+     #                 plt.show()
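Finally, a toy example of the read-length bookkeeping (var_names must be integer-like genomic coordinates for the int() casts to work; import path assumed):

import anndata as ad
import numpy as np
from smftools.preprocessing import calculate_read_length_stats  # assumed import path

X = np.array([[np.nan, 0.2, 0.8, np.nan],
              [0.1, 0.5, np.nan, np.nan]])
adata = ad.AnnData(X=X)
adata.var_names = ['100', '101', '102', '103']  # genomic coordinates as strings
adata.obs['Reference'] = ['ref1'] * 2
adata.obs['Sample_names'] = ['s1'] * 2

upper, lower = calculate_read_length_stats(adata, reference_column='Reference', sample_names_col='Sample_names')
print(adata.obs[['first_valid_position', 'last_valid_position', 'read_length']])
print(upper, lower)  # 102 100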