smftools 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (137)
  1. smftools/__init__.py +29 -0
  2. smftools/_settings.py +20 -0
  3. smftools/_version.py +1 -0
  4. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  5. smftools/datasets/F1_sample_sheet.csv +5 -0
  6. smftools/datasets/__init__.py +9 -0
  7. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  8. smftools/datasets/datasets.py +28 -0
  9. smftools/informatics/__init__.py +16 -0
  10. smftools/informatics/archived/bam_conversion.py +59 -0
  11. smftools/informatics/archived/bam_direct.py +63 -0
  12. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  13. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  14. smftools/informatics/basecall_pod5s.py +80 -0
  15. smftools/informatics/conversion_smf.py +132 -0
  16. smftools/informatics/direct_smf.py +137 -0
  17. smftools/informatics/fast5_to_pod5.py +21 -0
  18. smftools/informatics/helpers/LoadExperimentConfig.py +75 -0
  19. smftools/informatics/helpers/__init__.py +74 -0
  20. smftools/informatics/helpers/align_and_sort_BAM.py +59 -0
  21. smftools/informatics/helpers/aligned_BAM_to_bed.py +74 -0
  22. smftools/informatics/helpers/archived/informatics.py +260 -0
  23. smftools/informatics/helpers/archived/load_adata.py +516 -0
  24. smftools/informatics/helpers/bam_qc.py +66 -0
  25. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  26. smftools/informatics/helpers/binarize_converted_base_identities.py +79 -0
  27. smftools/informatics/helpers/canoncall.py +34 -0
  28. smftools/informatics/helpers/complement_base_list.py +21 -0
  29. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +55 -0
  30. smftools/informatics/helpers/converted_BAM_to_adata.py +245 -0
  31. smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
  32. smftools/informatics/helpers/count_aligned_reads.py +43 -0
  33. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  34. smftools/informatics/helpers/extract_base_identities.py +44 -0
  35. smftools/informatics/helpers/extract_mods.py +83 -0
  36. smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
  37. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  38. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  39. smftools/informatics/helpers/find_conversion_sites.py +50 -0
  40. smftools/informatics/helpers/generate_converted_FASTA.py +99 -0
  41. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  42. smftools/informatics/helpers/get_native_references.py +28 -0
  43. smftools/informatics/helpers/index_fasta.py +12 -0
  44. smftools/informatics/helpers/make_dirs.py +21 -0
  45. smftools/informatics/helpers/make_modbed.py +27 -0
  46. smftools/informatics/helpers/modQC.py +27 -0
  47. smftools/informatics/helpers/modcall.py +36 -0
  48. smftools/informatics/helpers/modkit_extract_to_adata.py +884 -0
  49. smftools/informatics/helpers/ohe_batching.py +76 -0
  50. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  51. smftools/informatics/helpers/one_hot_decode.py +27 -0
  52. smftools/informatics/helpers/one_hot_encode.py +57 -0
  53. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +53 -0
  54. smftools/informatics/helpers/run_multiqc.py +28 -0
  55. smftools/informatics/helpers/separate_bam_by_bc.py +43 -0
  56. smftools/informatics/helpers/split_and_index_BAM.py +36 -0
  57. smftools/informatics/load_adata.py +182 -0
  58. smftools/informatics/readwrite.py +106 -0
  59. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  60. smftools/informatics/subsample_pod5.py +104 -0
  61. smftools/plotting/__init__.py +15 -0
  62. smftools/plotting/classifiers.py +355 -0
  63. smftools/plotting/general_plotting.py +205 -0
  64. smftools/plotting/position_stats.py +462 -0
  65. smftools/preprocessing/__init__.py +33 -0
  66. smftools/preprocessing/append_C_context.py +82 -0
  67. smftools/preprocessing/archives/mark_duplicates.py +146 -0
  68. smftools/preprocessing/archives/preprocessing.py +614 -0
  69. smftools/preprocessing/archives/remove_duplicates.py +21 -0
  70. smftools/preprocessing/binarize_on_Youden.py +45 -0
  71. smftools/preprocessing/binary_layers_to_ohe.py +40 -0
  72. smftools/preprocessing/calculate_complexity.py +72 -0
  73. smftools/preprocessing/calculate_consensus.py +47 -0
  74. smftools/preprocessing/calculate_converted_read_methylation_stats.py +94 -0
  75. smftools/preprocessing/calculate_coverage.py +42 -0
  76. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  77. smftools/preprocessing/calculate_pairwise_hamming_distances.py +27 -0
  78. smftools/preprocessing/calculate_position_Youden.py +115 -0
  79. smftools/preprocessing/calculate_read_length_stats.py +79 -0
  80. smftools/preprocessing/clean_NaN.py +46 -0
  81. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  82. smftools/preprocessing/filter_converted_reads_on_methylation.py +44 -0
  83. smftools/preprocessing/filter_reads_on_length.py +51 -0
  84. smftools/preprocessing/flag_duplicate_reads.py +149 -0
  85. smftools/preprocessing/invert_adata.py +30 -0
  86. smftools/preprocessing/load_sample_sheet.py +38 -0
  87. smftools/preprocessing/make_dirs.py +21 -0
  88. smftools/preprocessing/min_non_diagonal.py +25 -0
  89. smftools/preprocessing/recipes.py +127 -0
  90. smftools/preprocessing/subsample_adata.py +58 -0
  91. smftools/readwrite.py +198 -0
  92. smftools/tools/__init__.py +49 -0
  93. smftools/tools/apply_hmm.py +202 -0
  94. smftools/tools/apply_hmm_batched.py +241 -0
  95. smftools/tools/archived/classify_methylated_features.py +66 -0
  96. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  97. smftools/tools/archived/subset_adata_v1.py +32 -0
  98. smftools/tools/archived/subset_adata_v2.py +46 -0
  99. smftools/tools/calculate_distances.py +18 -0
  100. smftools/tools/calculate_umap.py +62 -0
  101. smftools/tools/call_hmm_peaks.py +105 -0
  102. smftools/tools/classifiers.py +787 -0
  103. smftools/tools/cluster_adata_on_methylation.py +105 -0
  104. smftools/tools/data/__init__.py +2 -0
  105. smftools/tools/data/anndata_data_module.py +90 -0
  106. smftools/tools/data/preprocessing.py +6 -0
  107. smftools/tools/display_hmm.py +18 -0
  108. smftools/tools/evaluation/__init__.py +0 -0
  109. smftools/tools/general_tools.py +69 -0
  110. smftools/tools/hmm_readwrite.py +16 -0
  111. smftools/tools/inference/__init__.py +1 -0
  112. smftools/tools/inference/lightning_inference.py +41 -0
  113. smftools/tools/models/__init__.py +9 -0
  114. smftools/tools/models/base.py +14 -0
  115. smftools/tools/models/cnn.py +34 -0
  116. smftools/tools/models/lightning_base.py +41 -0
  117. smftools/tools/models/mlp.py +17 -0
  118. smftools/tools/models/positional.py +17 -0
  119. smftools/tools/models/rnn.py +16 -0
  120. smftools/tools/models/sklearn_models.py +40 -0
  121. smftools/tools/models/transformer.py +133 -0
  122. smftools/tools/models/wrappers.py +20 -0
  123. smftools/tools/nucleosome_hmm_refinement.py +104 -0
  124. smftools/tools/position_stats.py +239 -0
  125. smftools/tools/read_stats.py +70 -0
  126. smftools/tools/subset_adata.py +28 -0
  127. smftools/tools/train_hmm.py +78 -0
  128. smftools/tools/training/__init__.py +1 -0
  129. smftools/tools/training/train_lightning_model.py +47 -0
  130. smftools/tools/utils/__init__.py +2 -0
  131. smftools/tools/utils/device.py +10 -0
  132. smftools/tools/utils/grl.py +14 -0
  133. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/METADATA +5 -2
  134. smftools-0.1.7.dist-info/RECORD +136 -0
  135. smftools-0.1.6.dist-info/RECORD +0 -4
  136. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
  137. {smftools-0.1.6.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/__init__.py
@@ -0,0 +1,33 @@
+ from .append_C_context import append_C_context
+ from .binarize_on_Youden import binarize_on_Youden
+ from .calculate_complexity import calculate_complexity
+ from .calculate_converted_read_methylation_stats import calculate_converted_read_methylation_stats
+ from .calculate_coverage import calculate_coverage
+ from .calculate_position_Youden import calculate_position_Youden
+ from .calculate_read_length_stats import calculate_read_length_stats
+ from .clean_NaN import clean_NaN
+ from .filter_adata_by_nan_proportion import filter_adata_by_nan_proportion
+ from .filter_converted_reads_on_methylation import filter_converted_reads_on_methylation
+ from .filter_reads_on_length import filter_reads_on_length
+ from .invert_adata import invert_adata
+ from .load_sample_sheet import load_sample_sheet
+ from .flag_duplicate_reads import flag_duplicate_reads
+ from .subsample_adata import subsample_adata
+
+ __all__ = [
+     "append_C_context",
+     "binarize_on_Youden",
+     "calculate_complexity",
+     "calculate_converted_read_methylation_stats",
+     "calculate_coverage",
+     "calculate_position_Youden",
+     "calculate_read_length_stats",
+     "clean_NaN",
+     "filter_adata_by_nan_proportion",
+     "filter_converted_reads_on_methylation",
+     "filter_reads_on_length",
+     "invert_adata",
+     "load_sample_sheet",
+     "flag_duplicate_reads",
+     "subsample_adata"
+ ]
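
As a quick orientation, the sketch below shows what this flattened namespace buys a caller: every helper is importable directly from smftools.preprocessing rather than from its defining submodule. This is a minimal illustration, not package documentation.

import smftools.preprocessing as pp
from smftools.preprocessing.append_C_context import append_C_context

print(pp.__all__)                                # the fifteen names listed above
assert pp.append_C_context is append_C_context   # a re-export, not a copy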
smftools/preprocessing/append_C_context.py
@@ -0,0 +1,82 @@
+ ## append_C_context
+
+ ## Conversion SMF specific
+ # Read methylation QC
+ def append_C_context(adata, obs_column='Reference', use_consensus=False, native=False):
+     """
+     Adds cytosine-context annotations for each position within the given category. When use_consensus is True, the consensus sequence of the mapped reads is used; otherwise the reference FASTA sequence is used.
+
+     Parameters:
+         adata (AnnData): The input AnnData object.
+         obs_column (str): The observation column to stratify on. Default is 'Reference', which should not be changed for most purposes.
+         use_consensus (bool): Whether to use the consensus sequence of the reads mapped to the reference. If False, the reference FASTA sequence is used instead.
+         native (bool): If False, apply conversion SMF assumptions; if True, apply native SMF assumptions.
+
+     Returns:
+         None
+     """
+     import numpy as np
+
+     print('Adding cytosine context based on the reference sequence for each sample')
+
+     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C', 'any_C_site']
+     categories = adata.obs[obs_column].cat.categories
+     for cat in categories:
+         # Assess whether the top or bottom strand was converted
+         if 'top' in cat:
+             strand = 'top'
+         elif 'bottom' in cat:
+             strand = 'bottom'
+         else:
+             print(f'Error: could not determine the converted strand for {cat}. Ensure "top" or "bottom" appears in the Reference name.')
+             continue
+
+         basename = cat.split(f"_{strand}")[0]
+         # Native and conversion SMF currently resolve the sequence the same way.
+         if use_consensus:
+             sequence = adata.uns[f'{basename}_consensus_sequence']
+         else:
+             # The unconverted sequence of the original input FASTA for the locus
+             sequence = adata.uns[f'{basename}_FASTA_sequence']
+
+         # Init a dict keyed by reference site type that points to a boolean array marking whether each position is that site type.
+         boolean_dict = {}
+         for site_type in site_types:
+             boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
+
+         if strand == 'top':
+             # Iterate through the sequence and classify each cytosine by its neighbors
+             for i in range(1, len(sequence) - 1):
+                 if sequence[i] == 'C':
+                     boolean_dict[f'{cat}_any_C_site'][i] = True
+                     if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
+                         boolean_dict[f'{cat}_GpC_site'][i] = True
+                     elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
+                         boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
+                     elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
+                         boolean_dict[f'{cat}_CpG_site'][i] = True
+                     else:
+                         boolean_dict[f'{cat}_other_C'][i] = True
+         else:  # strand == 'bottom'
+             # On the bottom strand, a top-strand G marks a cytosine on the complement
+             for i in range(1, len(sequence) - 1):
+                 if sequence[i] == 'G':
+                     boolean_dict[f'{cat}_any_C_site'][i] = True
+                     if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
+                         boolean_dict[f'{cat}_GpC_site'][i] = True
+                     elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
+                         boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
+                     elif sequence[i - 1] == 'C' and sequence[i + 1] != 'C':
+                         boolean_dict[f'{cat}_CpG_site'][i] = True
+                     else:
+                         boolean_dict[f'{cat}_other_C'][i] = True
+
+         for site_type in site_types:
+             adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
+             adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}']].X
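
A short, self-contained sketch of the expected inputs and outputs follows. The reference name 'locus1_top', the 10 bp sequence, and the uns key are illustrative, chosen only to match the patterns the function reads (f'{basename}_FASTA_sequence', 'top' in the reference name); they are not taken from the package's data.

import numpy as np
import pandas as pd
import anndata as ad
from smftools.preprocessing import append_C_context

# Four reads over a 10 bp locus; X holds placeholder methylation calls.
adata = ad.AnnData(np.zeros((4, 10)))
adata.obs['Reference'] = pd.Categorical(['locus1_top'] * 4)
adata.uns['locus1_FASTA_sequence'] = 'AGCGCATGCC'

append_C_context(adata)
# Positions 4 and 8 have a G before and no G after, so they are GpC sites;
# position 2 sits between two Gs, so it is ambiguous GpC/CpG.
print(np.where(adata.var['locus1_top_GpC_site'])[0])                 # [4 8]
print(np.where(adata.var['locus1_top_ambiguous_GpC_CpG_site'])[0])   # [2]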
smftools/preprocessing/archives/mark_duplicates.py
@@ -0,0 +1,146 @@
+ ## mark_duplicates
+
+ def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names', method='N_masked_distances', distance_thresholds={}):
+     """
+     Marks duplicate reads in the adata object.
+
+     Parameters:
+         adata (AnnData): An AnnData object.
+         layers (list): A list of strings naming the layers to use.
+         obs_column (str): The obs column name to subset on first. Default is 'Reference'.
+         sample_col (str): The obs column name to subset on second. Default is 'Sample_names'.
+         method (str): The method to use for calculating the distance metric.
+         distance_thresholds (dict): A dictionary keyed by obs_column categories that maps to the float distance threshold to apply. Default is an empty dict.
+
+     Returns:
+         None
+     """
+
+     import numpy as np
+     import pandas as pd
+     import matplotlib.pyplot as plt
+     from scipy.signal import find_peaks
+     import networkx as nx
+     from .binary_layers_to_ohe import binary_layers_to_ohe
+     from .calculate_pairwise_differences import calculate_pairwise_differences
+     from .min_non_diagonal import min_non_diagonal
+
+     categories = adata.obs[obs_column].cat.categories
+     sample_names = adata.obs[sample_col].cat.categories
+
+     # Calculate the pairwise Hamming distances within each reference/sample set and determine a distance threshold for each reference/sample pair
+     adata.obs['Nearest_neighbor_Hamming_distance'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
+     cat_sample_dict = {}
+     for cat in categories:
+         cat_subset = adata[adata.obs[obs_column] == cat].copy()
+         for sample in sample_names:
+             sample_subset = cat_subset[cat_subset.obs[sample_col] == sample].copy()
+             sample_subset = sample_subset[:, sample_subset.var[f'{cat}_any_C_site']].copy()  # only use C sites from the converted strand
+             # Encode sequencing reads as one-hot encodings
+             print(f'One-hot encoding reads from {sample} on {cat}')
+             cat_sample_dict[f'{cat}_{sample}_read_OHE_dict'] = binary_layers_to_ohe(sample_subset, layers, stack='hstack')
+             # Unpack the read names and one-hot encodings into lists
+             read_names = []
+             ohe_list = []
+             for read_name, ohe in cat_sample_dict[f'{cat}_{sample}_read_OHE_dict'].items():
+                 read_names.append(read_name)
+                 ohe_list.append(ohe)
+             # Calculate the pairwise Hamming distances
+             if method == 'N_masked_distances':
+                 print(f'Calculating N_masked_distances for {sample} on {cat} allele')
+                 distance_matrix = calculate_pairwise_differences(ohe_list)
+             else:
+                 raise ValueError(f'{method} for calculating differences is not available')
+             n_reads = distance_matrix.shape[0]
+             # Load the distance matrix into a dataframe indexed by read names on both axes
+             distance_df = pd.DataFrame(distance_matrix, index=read_names, columns=read_names)
+             cat_sample_dict[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
+
+             if n_reads > 1:
+                 # Calculate the minimum non-self distance for every read in the reference and sample
+                 min_distance_values = min_non_diagonal(distance_matrix)
+                 min_distance_df = pd.DataFrame({'Nearest_neighbor_Hamming_distance': min_distance_values}, index=read_names)
+                 adata.obs.update(min_distance_df)
+
+                 if cat in distance_thresholds:
+                     adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = distance_thresholds[cat]
+                 else:  # eventually this should be rewritten to use known PCR duplicate controls for thresholding
+                     # Generate a histogram of minimum non-self distances across reads
+                     if n_reads > 3:
+                         n_bins = n_reads // 4
+                     else:
+                         n_bins = 1
+                     min_distance_bins = plt.hist(min_distance_values, bins=n_bins)
+                     # Normalize the max value in any histogram bin to 1
+                     normalized_min_distance_counts = min_distance_bins[0] / np.max(min_distance_bins[0])
+                     # Extract the bin index of peak centers in the histogram
+                     try:
+                         peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
+                         first_peak_index = peak_centers[0]
+                         offset_index = first_peak_index - 1
+                         # Use the distance of the bin just below the first peak as the threshold distance in graph construction
+                         offset_distance = min_distance_bins[1][offset_index]
+                     except IndexError:
+                         # No peak was found; fall back to the first normalized bin count
+                         offset_distance = normalized_min_distance_counts[0]
+                     adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = offset_distance
+             else:
+                 adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = 0
+
+     ## Detect likely duplicate reads and mark them in the adata object
+     adata.obs['Marked_duplicate'] = pd.Series(False, index=adata.obs_names, dtype=bool)
+     adata.obs['Unique_in_final_read_set'] = pd.Series(False, index=adata.obs_names, dtype=bool)
+     adata.obs[f'Hamming_distance_cluster_within_{obs_column}_and_sample'] = pd.Series(-1, index=adata.obs_names, dtype=int)
+
+     for cat in categories:
+         for sample in sample_names:
+             distance_df = cat_sample_dict[f'Pairwise_Hamming_distance_within_{cat}_{sample}']
+             read_names = distance_df.index
+             distance_matrix = distance_df.values
+             n_reads = distance_matrix.shape[0]
+             distance_threshold = adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}']
+             # Initialize the read distance graph
+             G = nx.Graph()
+             # Add each read as a node to the graph
+             G.add_nodes_from(range(n_reads))
+             # Add edges between reads whose distance falls at or below the threshold
+             for i in range(n_reads):
+                 for j in range(i + 1, n_reads):
+                     if distance_matrix[i, j] <= distance_threshold:
+                         G.add_edge(i, j)
+             # Determine distinct clusters using connected components
+             clusters = [list(cluster) for cluster in nx.connected_components(G)]
+             # Get the number of clusters
+             cluster_count = len(clusters)
+             if n_reads > 0:
+                 fraction_unique = cluster_count / n_reads
+             else:
+                 fraction_unique = 0
+             adata.uns[f'Hamming_distance_cluster_count_within_{cat}_{sample}'] = cluster_count
+             adata.uns[f'total_reads_within_{cat}_{sample}'] = n_reads
+             # Update the adata object: keep the first read of each cluster and mark the rest as duplicates
+             read_cluster_map = {}
+             read_duplicate_map = {}
+             read_keep_map = {}
+             for i, cluster in enumerate(clusters):
+                 for j, read_index in enumerate(cluster):
+                     read_name = read_names[read_index]
+                     read_cluster_map[read_name] = i
+                     if len(cluster) > 1:
+                         read_duplicate_map[read_name] = True
+                         read_keep_map[read_name] = (j == 0)
+                     else:
+                         read_duplicate_map[read_name] = False
+                         read_keep_map[read_name] = True
+             cluster_df = pd.DataFrame.from_dict(read_cluster_map, orient='index', columns=[f'Hamming_distance_cluster_within_{obs_column}_and_sample'], dtype=int)
+             duplicate_df = pd.DataFrame.from_dict(read_duplicate_map, orient='index', columns=['Marked_duplicate'], dtype=bool)
+             keep_df = pd.DataFrame.from_dict(read_keep_map, orient='index', columns=['Unique_in_final_read_set'], dtype=bool)
+             df_combined = pd.concat([cluster_df, duplicate_df, keep_df], axis=1)
+             adata.obs.update(df_combined)
+             adata.obs['Marked_duplicate'] = adata.obs['Marked_duplicate'].astype(bool)
+             adata.obs['Unique_in_final_read_set'] = adata.obs['Unique_in_final_read_set'].astype(bool)
+             print(f'Hamming clusters for {sample} on {cat}\nThreshold: {distance_threshold}\nNumber of clusters: {cluster_count}\nNumber of reads: {n_reads}\nFraction unique: {fraction_unique}')
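
This archived helper expects an AnnData that has already been through the earlier preprocessing steps: binarized methylation layers and the f'{cat}_any_C_site' var columns written by append_C_context. A hedged usage sketch follows; the layer name, reference name, and threshold are illustrative, and the import path assumes the archives folder is importable as a package.

from smftools.preprocessing.archives.mark_duplicates import mark_duplicates

# 'binarized_GpC' is an illustrative layer name; the real names depend on
# how the binarization step was run.
mark_duplicates(
    adata,
    layers=['binarized_GpC'],
    obs_column='Reference',
    sample_col='Sample_names',
    method='N_masked_distances',
    distance_thresholds={'locus1_top': 5.0},  # optional per-reference override
)
# One representative read per Hamming cluster is kept.
unique_reads = adata[adata.obs['Unique_in_final_read_set']].copy()

Because clusters are connected components, a read is folded into a duplicate cluster if it is within the threshold of any member, not necessarily of all members, so thresholds should be set conservatively.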