smftools 0.1.0__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. smftools/__init__.py +0 -2
  2. smftools/_settings.py +3 -2
  3. smftools/_version.py +1 -0
  4. smftools/datasets/F1_sample_sheet.csv +5 -0
  5. smftools/datasets/datasets.py +14 -11
  6. smftools/informatics/__init__.py +10 -7
  7. smftools/informatics/archived/bam_conversion.py +59 -0
  8. smftools/informatics/archived/bam_direct.py +63 -0
  9. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  10. smftools/informatics/conversion_smf.py +79 -0
  11. smftools/informatics/direct_smf.py +89 -0
  12. smftools/informatics/fast5_to_pod5.py +21 -0
  13. smftools/informatics/helpers/LoadExperimentConfig.py +74 -0
  14. smftools/informatics/helpers/__init__.py +22 -4
  15. smftools/informatics/helpers/align_and_sort_BAM.py +48 -0
  16. smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
  17. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  18. smftools/informatics/helpers/binarize_converted_base_identities.py +11 -4
  19. smftools/informatics/helpers/canoncall.py +14 -1
  20. smftools/informatics/helpers/complement_base_list.py +21 -0
  21. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
  22. smftools/informatics/helpers/converted_BAM_to_adata.py +183 -97
  23. smftools/informatics/helpers/count_aligned_reads.py +25 -14
  24. smftools/informatics/helpers/extract_base_identities.py +44 -23
  25. smftools/informatics/helpers/extract_mods.py +17 -5
  26. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  27. smftools/informatics/helpers/find_conversion_sites.py +24 -16
  28. smftools/informatics/helpers/generate_converted_FASTA.py +60 -21
  29. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  30. smftools/informatics/helpers/get_native_references.py +10 -7
  31. smftools/informatics/helpers/index_fasta.py +12 -0
  32. smftools/informatics/helpers/make_dirs.py +9 -3
  33. smftools/informatics/helpers/make_modbed.py +10 -4
  34. smftools/informatics/helpers/modQC.py +10 -2
  35. smftools/informatics/helpers/modcall.py +16 -2
  36. smftools/informatics/helpers/modkit_extract_to_adata.py +486 -323
  37. smftools/informatics/helpers/ohe_batching.py +52 -0
  38. smftools/informatics/helpers/one_hot_encode.py +15 -8
  39. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
  40. smftools/informatics/helpers/separate_bam_by_bc.py +20 -5
  41. smftools/informatics/helpers/split_and_index_BAM.py +31 -11
  42. smftools/informatics/load_adata.py +127 -0
  43. smftools/informatics/readwrite.py +13 -16
  44. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  45. smftools/informatics/subsample_pod5.py +104 -0
  46. smftools/preprocessing/__init__.py +6 -7
  47. smftools/preprocessing/append_C_context.py +52 -22
  48. smftools/preprocessing/binarize_on_Youden.py +8 -4
  49. smftools/preprocessing/binary_layers_to_ohe.py +9 -4
  50. smftools/preprocessing/calculate_complexity.py +26 -14
  51. smftools/preprocessing/calculate_consensus.py +47 -0
  52. smftools/preprocessing/calculate_converted_read_methylation_stats.py +69 -11
  53. smftools/preprocessing/calculate_coverage.py +14 -8
  54. smftools/preprocessing/calculate_pairwise_hamming_distances.py +11 -6
  55. smftools/preprocessing/calculate_position_Youden.py +21 -12
  56. smftools/preprocessing/calculate_read_length_stats.py +67 -8
  57. smftools/preprocessing/clean_NaN.py +13 -6
  58. smftools/preprocessing/filter_converted_reads_on_methylation.py +15 -6
  59. smftools/preprocessing/filter_reads_on_length.py +16 -6
  60. smftools/preprocessing/invert_adata.py +10 -5
  61. smftools/preprocessing/load_sample_sheet.py +24 -0
  62. smftools/preprocessing/make_dirs.py +21 -0
  63. smftools/preprocessing/mark_duplicates.py +54 -30
  64. smftools/preprocessing/min_non_diagonal.py +9 -4
  65. smftools/preprocessing/recipes.py +125 -0
  66. smftools/preprocessing/remove_duplicates.py +15 -6
  67. smftools/readwrite.py +13 -16
  68. smftools/tools/apply_HMM.py +1 -0
  69. smftools/tools/cluster.py +0 -0
  70. smftools/tools/read_HMM.py +1 -0
  71. smftools/tools/subset_adata.py +32 -0
  72. smftools/tools/train_HMM.py +43 -0
  73. smftools-0.1.3.dist-info/METADATA +94 -0
  74. smftools-0.1.3.dist-info/RECORD +84 -0
  75. smftools/informatics/helpers/align_BAM.py +0 -49
  76. smftools/informatics/helpers/load_experiment_config.py +0 -17
  77. smftools/informatics/pod5_conversion.py +0 -26
  78. smftools/informatics/pod5_direct.py +0 -29
  79. smftools/informatics/pod5_to_adata.py +0 -17
  80. smftools-0.1.0.dist-info/METADATA +0 -75
  81. smftools-0.1.0.dist-info/RECORD +0 -58
  82. /smftools/informatics/helpers/{informatics.py → archived/informatics.py} +0 -0
  83. /smftools/informatics/helpers/{load_adata.py → archived/load_adata.py} +0 -0
  84. /smftools/preprocessing/{preprocessing.py → archives/preprocessing.py} +0 -0
  85. {smftools-0.1.0.dist-info → smftools-0.1.3.dist-info}/WHEEL +0 -0
  86. {smftools-0.1.0.dist-info → smftools-0.1.3.dist-info}/licenses/LICENSE +0 -0
--- a/smftools/preprocessing/append_C_context.py
+++ b/smftools/preprocessing/append_C_context.py
@@ -1,38 +1,68 @@
 ## append_C_context
-import numpy as np
-import anndata as ad
-import pandas as pd
 
 ## Conversion SMF Specific
 # Read methylation QC
 def append_C_context(adata, obs_column='Reference', use_consensus=False):
     """
+    Adds Cytosine context to the position within the given category. When use_consensus is True, it uses the consensus sequence, otherwise it defaults to the FASTA sequence.
+
+    Parameters:
+        adata (AnnData): The input adata object.
+        obs_column (str): The observation column in which to stratify on. Default is 'Reference', which should not be changed for most purposes.
+        use_consensus (bool): A truth statement indicating whether to use the consensus sequence from the reads mapped to the reference. If False, the reference FASTA is used instead.
+
     Input: An adata object, the obs_column of interst, and whether to use the consensus sequence from the category.
-    Output: Adds Cytosine context to the position within the given category. When use_consensus is True, it uses the consensus sequence, otherwise it defaults to the FASTA sequence.
+
+    Returns:
+        None
     """
-    site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_site', 'ambiguous_CpG_site', 'other_C']
+    import numpy as np
+    import anndata as ad
+    site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
     categories = adata.obs[obs_column].cat.categories
-    if use_consensus:
-        sequence = adata.uns[f'{cat}_consensus_sequence']
-    else:
-        sequence = adata.uns[f'{cat}_FASTA_sequence']
     for cat in categories:
+        # Assess if the strand is the top or bottom strand converted
+        if 'top' in cat:
+            strand = 'top'
+        elif 'bottom' in cat:
+            strand = 'bottom'
+
+        if use_consensus:
+            sequence = adata.uns[f'{cat}_consensus_sequence']
+        else:
+            # This sequence is the unconverted FASTA sequence of the original input FASTA for the locus
+            sequence = adata.uns[f'{cat}_FASTA_sequence']
+        # Init a dict keyed by reference site type that points to a bool of whether the position is that site type.
         boolean_dict = {}
         for site_type in site_types:
             boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
-        # Iterate through the sequence and apply the criteria
-        for i in range(1, len(sequence) - 1):
-            if sequence[i] == 'C':
-                if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
-                    boolean_dict[f'{cat}_GpC_site'][i] = True
-                elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
-                    boolean_dict[f'{cat}_ambiguous_GpC_site'][i] = True
-                elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
-                    boolean_dict[f'{cat}_CpG_site'][i] = True
-                elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
-                    boolean_dict[f'{cat}_ambiguous_CpG_site'][i] = True
-                elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
-                    boolean_dict[f'{cat}_other_C'][i] = True
+
+        if strand == 'top':
+            # Iterate through the sequence and apply the criteria
+            for i in range(1, len(sequence) - 1):
+                if sequence[i] == 'C':
+                    if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
+                        boolean_dict[f'{cat}_GpC_site'][i] = True
+                    elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
+                        boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
+                    elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
+                        boolean_dict[f'{cat}_CpG_site'][i] = True
+                    elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
+                        boolean_dict[f'{cat}_other_C'][i] = True
+        elif strand == 'bottom':
+            # Iterate through the sequence and apply the criteria
+            for i in range(1, len(sequence) - 1):
+                if sequence[i] == 'G':
+                    if sequence[i + 1] == 'C' and sequence[i - 1] != 'C':
+                        boolean_dict[f'{cat}_GpC_site'][i] = True
+                    elif sequence[i - 1] == 'C' and sequence[i + 1] == 'C':
+                        boolean_dict[f'{cat}_ambiguous_GpC_CpG_site'][i] = True
+                    elif sequence[i - 1] == 'C' and sequence[i + 1] != 'C':
+                        boolean_dict[f'{cat}_CpG_site'][i] = True
+                    elif sequence[i - 1] != 'C' and sequence[i + 1] != 'C':
+                        boolean_dict[f'{cat}_other_C'][i] = True
+        else:
+            print('Error: top or bottom strand of conversion could not be determined. Ensure this value is in the Reference name.')
+
         for site_type in site_types:
             adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
             adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].copy().X
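The 0.1.3 classifier is strand-aware and merges the two ambiguous classes into one (which also removes the unreachable duplicate elif branch present in 0.1.0). The top-strand rule can be exercised in isolation; a minimal sketch on a toy sequence, not smftools code:

import numpy as np

sequence = 'AGCTGCGCCA'  # toy top-strand reference
site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
calls = {s: np.full(len(sequence), False, dtype=bool) for s in site_types}

for i in range(1, len(sequence) - 1):
    if sequence[i] == 'C':
        if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
            calls['GpC_site'][i] = True                # GpC context only
        elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
            calls['ambiguous_GpC_CpG_site'][i] = True  # GCG: GpC and CpG overlap
        elif sequence[i + 1] == 'G':
            calls['CpG_site'][i] = True                # CpG context only
        else:
            calls['other_C'][i] = True                 # background cytosine

for s in site_types:
    print(s, np.flatnonzero(calls[s]))  # GpC_site at [2 7], ambiguous at [5], other_C at [8]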
--- a/smftools/preprocessing/binarize_on_Youden.py
+++ b/smftools/preprocessing/binarize_on_Youden.py
@@ -1,13 +1,17 @@
 ## binarize_on_Youden
-import numpy as np
-import pandas as pd
-import anndata as ad
 
 def binarize_on_Youden(adata, obs_column='Reference'):
     """
+    Add a new layer to the adata object that has binarized SMF values based on the position thresholds determined by calculate_position_Youden
+
+    Parameters:
+        adata (AnnData): The anndata object to binarize. pp.calculate_position_Youden function has to be run first.
+        obs_column (str): The obs_column to stratify on. Needs to be the same as passed in pp.calculate_position_Youden.
     Input: adata object that has had calculate_position_Youden called on it.
-    Output: Add a new layer to the adata object that has binarized SMF values based on the position thresholds determined by calculate_position_Youden
+    Output:
     """
+    import numpy as np
+    import anndata as ad
     temp_adata = None
     categories = adata.obs[obs_column].cat.categories
     for cat in categories:
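Once calculate_position_Youden has stored per-position cutoffs, the binarization itself reduces to an elementwise threshold comparison. A minimal sketch of that operation, with an assumed threshold array standing in for the stored Youden cutoffs (toy values, not smftools code):

import numpy as np

X = np.array([[0.9, 0.2, 0.7],
              [0.1, 0.6, 0.8]])                   # methylation probabilities per read
position_thresholds = np.array([0.5, 0.5, 0.75])  # assumed per-position Youden cutoffs

binarized = (X > position_thresholds).astype(float)
print(binarized)  # [[1. 0. 0.] [0. 1. 1.]]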
--- a/smftools/preprocessing/binary_layers_to_ohe.py
+++ b/smftools/preprocessing/binary_layers_to_ohe.py
@@ -1,14 +1,19 @@
 ## binary_layers_to_ohe
-import numpy as np
-import anndata as ad
-import pandas as pd
 
 ## Conversion SMF Specific
 def binary_layers_to_ohe(adata, layers, stack='hstack'):
     """
+    Parameters:
+        adata (AnnData): Anndata object.
+        layers (list): a list of strings. Each string represents a layer in the adata object. The layer should encode a binary matrix
+        stack (str): Dimension to stack the one-hot-encoding. Options include 'hstack' and 'vstack'. Default is 'hstack', since this is more efficient.
+
+    Returns:
+        ohe_dict (dict): A dictionary keyed by obs_name that points to a stacked (hstack or vstack) one-hot encoding of the binary layers
     Input: An adata object and a list of layers containing a binary encoding.
-    Output: A dictionary keyed by obs_name that points to a stacked (hstack or vstack) one-hot encoding of the binary layers
     """
+    import numpy as np
+    import anndata as ad
     # Extract the layers
     layers = [adata.layers[layer_name] for layer_name in layers]
     n_reads = layers[0].shape[0]
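The hstack/vstack choice is easiest to see on toy layers: hstack concatenates the per-base rows into one long vector per read, while vstack keeps one row per base. An illustrative sketch with assumed layer names, not smftools code:

import numpy as np

# Toy per-base binary layers, each of shape (n_reads, n_positions)
layers = {'A_binary': np.array([[1, 0], [0, 1]]),
          'C_binary': np.array([[0, 1], [1, 0]])}
mats = list(layers.values())
read_names = ['read_0', 'read_1']

ohe_hstack = {name: np.hstack([m[i] for m in mats]) for i, name in enumerate(read_names)}
ohe_vstack = {name: np.vstack([m[i] for m in mats]) for i, name in enumerate(read_names)}
print(ohe_hstack['read_0'])  # [1 0 0 1] -> base encodings side by side
print(ohe_vstack['read_0'])  # [[1 0], [0 1]] -> one row per base layer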
--- a/smftools/preprocessing/calculate_complexity.py
+++ b/smftools/preprocessing/calculate_complexity.py
@@ -1,21 +1,32 @@
 ## calculate_complexity
-import numpy as np
-import pandas as pd
-from scipy.optimize import curve_fit
-import matplotlib.pyplot as plt
 
-def lander_waterman(x, C0):
-    return C0 * (1 - np.exp(-x / C0))
+def calculate_complexity(adata, output_directory='', obs_column='Reference', sample_col='Sample_names', plot=True, save_plot=False):
+    """
+    A complexity analysis of the library.
 
-def count_unique_reads(reads, depth):
-    subsample = np.random.choice(reads, depth, replace=False)
-    return len(np.unique(subsample))
+    Parameters:
+        adata (AnnData): An adata object with mark_duplicates already run.
+        output_directory (str): String representing the path to the output directory.
+        obs_column (str): String of the obs column to iterate over.
+        sample_col (str): String of the sample column to iterate over.
+        plot (bool): Whether to plot the complexity model.
+        save_plot (bool): Whether to save the complexity model.
+
+    Returns:
+        None
 
-def calculate_complexity(adata, obs_column='Reference', sample_col='Sample_names', plot=True, save_plot=False):
-    """
-    Input: adata object with mark_duplicates already run.
-    Output: A complexity analysis of the library
     """
+    import numpy as np
+    import pandas as pd
+    from scipy.optimize import curve_fit
+
+    def lander_waterman(x, C0):
+        return C0 * (1 - np.exp(-x / C0))
+
+    def count_unique_reads(reads, depth):
+        subsample = np.random.choice(reads, depth, replace=False)
+        return len(np.unique(subsample))
+
     categories = adata.obs[obs_column].cat.categories
     sample_names = adata.obs[sample_col].cat.categories
 
@@ -40,6 +51,7 @@ def calculate_complexity(adata, obs_column='Reference', sample_col='Sample_names
             y_data = lander_waterman(x_data, *popt)
             adata.uns[f'Library_complexity_{sample}_on_{cat}'] = popt[0]
             if plot:
+                import matplotlib.pyplot as plt
                 # Plot the complexity curve
                 plt.figure(figsize=(6, 4))
                 plt.plot(total_reads, unique_reads, 'o', label='Observed unique reads')
@@ -52,7 +64,7 @@ def calculate_complexity(adata, obs_column='Reference', sample_col='Sample_names
                 plt.grid(True)
                 if save_plot:
                     date_string = date_string()
-                    save_name = output_directory + f'/{date_string} {title}'
+                    save_name = output_directory + f'/{date_string}_{title}'
                     plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
                     plt.close()
                 else:
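The relocated Lander-Waterman machinery can be demonstrated end to end on synthetic duplicates; C0 is the library-complexity estimate stored in adata.uns above. A self-contained sketch with made-up data, not smftools code:

import numpy as np
from scipy.optimize import curve_fit

def lander_waterman(x, C0):
    return C0 * (1 - np.exp(-x / C0))

rng = np.random.default_rng(0)
reads = rng.integers(0, 1000, size=5000)  # 5000 reads drawn from ~1000 unique molecules

depths = np.linspace(100, 5000, 10, dtype=int)
unique = [len(np.unique(rng.choice(reads, d, replace=False))) for d in depths]

popt, _ = curve_fit(lander_waterman, depths, unique, p0=[1000.0])
print(f'Estimated library complexity C0 ~ {popt[0]:.0f}')  # close to the true 1000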
--- /dev/null
+++ b/smftools/preprocessing/calculate_consensus.py
@@ -0,0 +1,47 @@
+# calculate_consensus
+
+def calculate_consensus(adata, reference, sample=False, reference_column='Reference', sample_column='Sample'):
+    """
+    Takes an input AnnData object, the reference to subset on, and the sample name to subset on to calculate the consensus sequence of the read set.
+
+    Parameters:
+        adata (AnnData): The input adata to append consensus metadata to.
+        reference (str): The name of the reference to subset the adata on.
+        sample (bool | str): If False, uses all samples. If a string is passed, the adata is further subsetted to only analyze that sample.
+        reference_column (str): The name of the reference column (Default is 'Reference')
+        sample_column (str): The name of the sample column (Default is 'Sample)
+
+    Returns:
+        None
+
+    """
+    import numpy as np
+
+    # Subset the adata on the refernce of interest. Optionally, subset additionally on a sample of interest.
+    record_subset = adata[adata.obs[reference_column] == reference].copy()
+    if sample:
+        record_subset = record_subset[record_subset.obs[sample_column] == sample].copy()
+    else:
+        pass
+
+    # Grab layer names from the adata object that correspond to the binary encodings of the read sequences.
+    layers = [layer for layer in record_subset.layers if '_binary_' in layer]
+    layer_map, layer_counts = {}, []
+    for i, layer in enumerate(layers):
+        # Gives an integer mapping to access which sequence base the binary layer is encoding
+        layer_map[i] = layer.split('_')[0]
+        # Get the positional counts from all reads for the given base identity.
+        layer_counts.append(np.sum(record_subset.layers[layer], axis=0))
+    # Combine the positional counts array derived from each binary base layer into an ndarray
+    count_array = np.array(layer_counts)
+    # Determine the row index that contains the largest count for each position and store this in an array.
+    nucleotide_indexes = np.argmax(count_array, axis=0)
+    # Map the base sequence derived from the row index array to attain the consensus sequence in a list.
+    consensus_sequence_list = [layer_map[i] for i in nucleotide_indexes]
+
+    if sample:
+        adata.var[f'{reference}_consensus_from_{sample}'] = consensus_sequence_list
+    else:
+        adata.var[f'{reference}_consensus_across_samples'] = consensus_sequence_list
+
+    adata.uns[f'{reference}_consensus_sequence'] = consensus_sequence_list
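The heart of calculate_consensus is the argmax over stacked per-base count arrays. A toy sketch of just that step:

import numpy as np

layer_map = {0: 'A', 1: 'C', 2: 'G', 3: 'T'}
# Counts of each base observed at 5 positions across all reads
count_array = np.array([[9, 0, 1, 2, 0],   # A
                        [1, 8, 0, 0, 1],   # C
                        [0, 1, 9, 0, 0],   # G
                        [0, 1, 0, 8, 9]])  # T
nucleotide_indexes = np.argmax(count_array, axis=0)
consensus = ''.join(layer_map[i] for i in nucleotide_indexes)
print(consensus)  # ACGTT

Note that np.argmax breaks ties in favor of the lower row index, so tied positions silently resolve to whichever base layer comes first.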
--- a/smftools/preprocessing/calculate_converted_read_methylation_stats.py
+++ b/smftools/preprocessing/calculate_converted_read_methylation_stats.py
@@ -1,25 +1,41 @@
 ## calculate_converted_read_methylation_stats
-import numpy as np
-import anndata as ad
-import pandas as pd
 
 ## Conversion SMF Specific
 # Read methylation QC
 
-def calculate_converted_read_methylation_stats(adata, obs_column='Reference'):
+def calculate_converted_read_methylation_stats(adata, reference_column, sample_names_col, output_directory, show_methylation_histogram=False, save_methylation_histogram=False):
     """
-    Input: adata and the observation category of interest
-    Output: Adds methylation statistics for each read. Indicates whether the read GpC methylation exceeded other_C methylation (background false positives)
+    Adds methylation statistics for each read. Indicates whether the read GpC methylation exceeded other_C methylation (background false positives).
+
+    Parameters:
+        adata (AnnData): An adata object
+        reference_column (str): String representing the name of the Reference column to use
+        sample_names_col (str): String representing the name of the sample name column to use
+        output_directory (str): String representing the output directory to make and write out the histograms.
+        show_methylation_histogram (bool): Whether to display the histograms.
+        save_methylation_histogram (bool): Whether to save the histograms.
+
+    Returns:
+        None
     """
-    site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_site', 'ambiguous_CpG_site', 'other_C']
-    categories = adata.obs[obs_column].cat.categories
+    import numpy as np
+    import anndata as ad
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    from .. import readwrite
+
+    references = set(adata.obs[reference_column])
+    sample_names = set(adata.obs[sample_names_col])
+
+    site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
+
     for site_type in site_types:
         adata.obs[f'{site_type}_row_methylation_sums'] = pd.Series(0, index=adata.obs_names, dtype=int)
         adata.obs[f'{site_type}_row_methylation_means'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
         adata.obs[f'number_valid_{site_type}_in_read'] = pd.Series(0, index=adata.obs_names, dtype=int)
         adata.obs[f'fraction_valid_{site_type}_in_range'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
-    for cat in categories:
-        cat_subset = adata[adata.obs[obs_column] == cat].copy()
+    for cat in references:
+        cat_subset = adata[adata.obs[reference_column] == cat].copy()
         for site_type in site_types:
             print(f'Iterating over {cat}_{site_type}')
             observation_matrix = cat_subset.obsm[f'{cat}_{site_type}']
@@ -35,4 +51,46 @@ def calculate_converted_read_methylation_stats(adata, obs_column='Reference'):
     adata.obs.update(temp_obs_data)
     # Indicate whether the read-level GpC methylation rate exceeds the false methylation rate of the read
     pass_array = np.array(adata.obs[f'GpC_site_row_methylation_means'] > adata.obs[f'other_C_row_methylation_means'])
-    adata.obs['GpC_above_other_C'] = pd.Series(pass_array, index=adata.obs.index, dtype=bool)
\ No newline at end of file
+    adata.obs['GpC_above_other_C'] = pd.Series(pass_array, index=adata.obs.index, dtype=bool)
+
+    adata.uns['methylation_dict'] = {}
+    n_bins = 50
+    site_types_to_analyze = ['GpC_site', 'CpG_site', 'ambiguous_GpC_CpG_site', 'other_C']
+
+    for reference in references:
+        reference_adata = adata[adata.obs[reference_column] == reference].copy()
+        split_reference = reference.split('_')[0][1:]
+        for sample in sample_names:
+            sample_adata = reference_adata[reference_adata.obs[sample_names_col] == sample].copy()
+            for site_type in site_types_to_analyze:
+                methylation_data = sample_adata.obs[f'{site_type}_row_methylation_means']
+                max_meth = np.max(sample_adata.obs[f'{site_type}_row_methylation_sums'])
+                if not np.isnan(max_meth):
+                    n_bins = int(max_meth // 2)
+                else:
+                    n_bins = 1
+                mean = np.mean(methylation_data)
+                median = np.median(methylation_data)
+                stdev = np.std(methylation_data)
+                adata.uns['methylation_dict'][f'{reference}_{sample}_{site_type}'] = [mean, median, stdev]
+                if show_methylation_histogram or save_methylation_histogram:
+                    fig, ax = plt.subplots(figsize=(6, 4))
+                    count, bins, patches = plt.hist(methylation_data, bins=n_bins, weights=np.ones(len(methylation_data)) / len(methylation_data), alpha=0.7, color='blue', edgecolor='black')
+                    plt.axvline(median, color='red', linestyle='dashed', linewidth=1)
+                    plt.text(median + stdev, max(count)*0.8, f'Median: {median:.2f}', color='red')
+                    plt.axvline(median - stdev, color='green', linestyle='dashed', linewidth=1, label=f'Stdev: {stdev:.2f}')
+                    plt.axvline(median + stdev, color='green', linestyle='dashed', linewidth=1)
+                    plt.text(median + stdev + 0.05, max(count) / 3, f'+1 Stdev: {stdev:.2f}', color='green')
+                    plt.xlabel('Fraction methylated')
+                    plt.ylabel('Proportion')
+                    title = f'Distribution of {methylation_data.shape[0]} read {site_type} methylation means \nfor {sample} sample on {split_reference} after filtering'
+                    plt.title(title, pad=20)
+                    plt.xlim(-0.05, 1.05) # Set x-axis range from 0 to 1
+                    ax.spines['right'].set_visible(False)
+                    ax.spines['top'].set_visible(False)
+                    save_name = output_directory + f'/{readwrite.date_string()} {title}'
+                    if save_methylation_histogram:
+                        plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
+                        plt.close()
+                    else:
+                        plt.show()
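The GpC_above_other_C criterion compares each read's GpC methylation mean against its other_C (false-positive) mean. A minimal NaN-aware sketch with toy matrices, not smftools code:

import numpy as np

gpc_calls = np.array([[1, 1, 0, np.nan],
                      [0, 0, 1, 0]])           # per-read GpC methylation calls
other_c_calls = np.array([[0, 0, 0, 0],
                          [1, 1, np.nan, 1]])  # background cytosine calls

gpc_means = np.nanmean(gpc_calls, axis=1)
other_c_means = np.nanmean(other_c_calls, axis=1)
print(gpc_means > other_c_means)  # [ True False] -> only the first read passes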
--- a/smftools/preprocessing/calculate_coverage.py
+++ b/smftools/preprocessing/calculate_coverage.py
@@ -1,21 +1,27 @@
 ## calculate_coverage
-from .. import readwrite
-import numpy as np
-import anndata as ad
-import pandas as pd
-
 
 def calculate_coverage(adata, obs_column='Reference', position_nan_threshold=0.05):
     """
-    Input: An adata object and an observation column of interest. Assess if the position is present in the dataset category.
-    Output: Append position level metadata indicating whether the position is informative within the given observation category.
+    Append position level metadata regarding whether the position is informative within the given observation category.
+
+    Parameters:
+        adata (AnnData): An AnnData object
+        obs_column (str): Observation column value to subset on prior to calculating position statistics for that category.
+        position_nan_threshold (float): A minimal fractional threshold of coverage within the obs_column category to call the position as valid.
+
+    Returns:
+        None
     """
+    import numpy as np
+    import anndata as ad
+    import pandas as pd
+
     categories = adata.obs[obs_column].cat.categories
     n_categories_with_position = np.zeros(adata.shape[1])
     # Loop over categories
     for cat in categories:
         # Look at positional information for each reference
-        temp_cat_adata = adata[adata.obs[obs_column] == cat]
+        temp_cat_adata = adata[adata.obs[obs_column] == cat].copy()
         # Look at read coverage on the given category strand
         cat_valid_coverage = np.sum(~np.isnan(temp_cat_adata.X), axis=0)
         cat_invalid_coverage = np.sum(np.isnan(temp_cat_adata.X), axis=0)
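Reading position_nan_threshold as a minimum fraction of covered (non-NaN) reads, the per-position validity test is a column-wise reduction. A sketch under that assumption (toy matrix and a 0.5 threshold for visibility; the function defaults to 0.05):

import numpy as np

X = np.array([[1.0, np.nan, 0.0],
              [0.0, np.nan, np.nan],
              [1.0, 1.0,    np.nan]])
position_nan_threshold = 0.5

fraction_valid = np.sum(~np.isnan(X), axis=0) / X.shape[0]
print(fraction_valid)                           # [1.    0.333 0.333]
print(fraction_valid > position_nan_threshold)  # [ True False False]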
--- a/smftools/preprocessing/calculate_pairwise_hamming_distances.py
+++ b/smftools/preprocessing/calculate_pairwise_hamming_distances.py
@@ -1,15 +1,20 @@
 ## calculate_pairwise_hamming_distances
-import numpy as np
-import tqdm
-from scipy.spatial.distance import hamming
 
 ## Conversion SMF Specific
 def calculate_pairwise_hamming_distances(arrays):
     """
-    Calculate the pairwise Hamming distances for a list of ndarrays.
-    Input: A list of ndarrays
-    Output: a 2D array containing the pairwise Hamming distances.
+    Calculate the pairwise Hamming distances for a list of h-stacked ndarrays.
+
+    Parameters:
+        arrays (str): A list of ndarrays.
+
+    Returns:
+        distance_matrix (ndarray): a 2D array containing the pairwise Hamming distances between all arrays.
+
     """
+    import numpy as np
+    from tqdm import tqdm
+    from scipy.spatial.distance import hamming
     num_arrays = len(arrays)
     # Initialize an empty distance matrix
     distance_matrix = np.zeros((num_arrays, num_arrays))
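scipy's hamming returns the fraction of mismatched positions, and the matrix is symmetric, so only the upper triangle needs computing. A self-contained sketch of the pairwise loop, not smftools code:

import numpy as np
from scipy.spatial.distance import hamming

arrays = [np.array([0, 1, 1, 0]),
          np.array([0, 1, 0, 0]),
          np.array([1, 1, 1, 1])]

n = len(arrays)
distance_matrix = np.zeros((n, n))
for i in range(n):
    for j in range(i + 1, n):
        d = hamming(arrays[i], arrays[j])  # fraction of disagreeing positions
        distance_matrix[i, j] = distance_matrix[j, i] = d
print(distance_matrix)  # 0.25 for the first pair, 0.5 and 0.75 for the others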
--- a/smftools/preprocessing/calculate_position_Youden.py
+++ b/smftools/preprocessing/calculate_position_Youden.py
@@ -1,20 +1,28 @@
 ## calculate_position_Youden
-import numpy as np
-import pandas as pd
-import anndata as ad
-import matplotlib.pyplot as plt
-from sklearn.metrics import roc_curve, roc_auc_score
-
-
 
 ## Calculating and applying position level thresholds for methylation calls to binarize the SMF data
 def calculate_position_Youden(adata, positive_control_sample, negative_control_sample, J_threshold=0.4, obs_column='Reference', save=False, output_directory=''):
     """
-    Input: An adata object, a plus MTase control, a minus MTase control, the minimal J-statistic threshold, and a categorical observation column to iterate over.
-    Input notes: The control samples are passed as string names of the samples as they appear in the 'Sample_names' obs column
-    Output: Adds new variable metadata to each position indicating whether the position provides reliable SMF methylation calls. Also outputs plots of the positional ROC curves.
-    Can optionally save the output plots of the ROC curve
+    Adds new variable metadata to each position indicating whether the position provides reliable SMF methylation calls. Also outputs plots of the positional ROC curves.
+
+    Parameters:
+        adata (AnnData): An AnnData object.
+        positive_control_sample (str): string representing the sample name corresponding to the Plus MTase control sample.
+        negative_control_sample (str): string representing the sample name corresponding to the Minus MTase control sample.
+        J_threshold (float): A float indicating the J-statistic used to indicate whether a position passes QC for methylation calls.
+        obs_column (str): The category to iterate over.
+        save (bool): Whether to save the ROC plots.
+        output_directory (str): String representing the path to the output directory to output the ROC curves.
+
+    Returns:
+        None
     """
+    import numpy as np
+    import pandas as pd
+    import anndata as ad
+    import matplotlib.pyplot as plt
+    from sklearn.metrics import roc_curve, roc_auc_score
+
     control_samples = [positive_control_sample, negative_control_sample]
     categories = adata.obs[obs_column].cat.categories
     # Iterate over each category in the specified obs_column
@@ -89,7 +97,8 @@ def calculate_position_Youden(adata, positive_control_sample, negative_control_s
             plt.savefig(save_name)
             plt.close()
         else:
-            plt.show()
+            plt.show()
+
         adata.var[f'{cat}_position_methylation_thresholding_Youden_stats'] = probability_thresholding_list
         J_max_list = [probability_thresholding_list[i][1] for i in range(adata.shape[1])]
         adata.var[f'{cat}_position_passed_QC'] = [True if i > J_threshold else False for i in J_max_list]
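The J-statistic gating {cat}_position_passed_QC is Youden's J, the maximum of TPR - FPR over the ROC curve built from the plus/minus MTase controls. A sketch of that computation for a single position with sklearn (toy labels and probabilities):

import numpy as np
from sklearn.metrics import roc_curve

labels = np.array([1, 1, 1, 1, 0, 0, 0, 0])  # 1 = +MTase control, 0 = -MTase control
probs = np.array([0.9, 0.8, 0.7, 0.4, 0.5, 0.3, 0.2, 0.1])

fpr, tpr, thresholds = roc_curve(labels, probs)
J = tpr - fpr
print(f'J_max = {J.max():.2f} at threshold {thresholds[J.argmax()]:.2f}')
# The position passes QC when J_max exceeds J_threshold (0.4 by default above).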
--- a/smftools/preprocessing/calculate_read_length_stats.py
+++ b/smftools/preprocessing/calculate_read_length_stats.py
@@ -1,17 +1,36 @@
 ## calculate_read_length_stats
-import numpy as np
-import anndata as ad
-import pandas as pd
 
 # Read length QC
-def calculate_read_length_stats(adata):
+def calculate_read_length_stats(adata, reference_column, sample_names_col, output_directory, show_read_length_histogram=False, save_read_length_histogram=False):
     """
-    Input: An adata object
-    Output: Append first valid position in a read and last valid position in the read. From this determine and append the read length.
-    Return two new variable which hold the first and last valid positions in the entire dataset
+    Append first valid position in a read and last valid position in the read. From this determine and append the read length.
+
+    Parameters:
+        adata (AnnData): An adata object
+        reference_column (str): String representing the name of the Reference column to use
+        sample_names_col (str): String representing the name of the sample name column to use
+        output_directory (str): String representing the output directory to make and write out the histograms.
+        show_read_length_histogram (bool): Whether to display the histograms.
+        save_read_length_histogram (bool): Whether to save the histograms.
+
+    Returns:
+        upper_bound (int): last valid position in the dataset
+        lower_bound (int): first valid position in the dataset
     """
-    ## Add basic observation-level (read-level) metadata to the object: first valid position in a read and last valid position in the read. From this determine the read length. Save two new variable which hold the first and last valid positions in the entire dataset
+    import numpy as np
+    import anndata as ad
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    from .. import readwrite
+    from .make_dirs import make_dirs
+
+    make_dirs([output_directory])
 
+    references = set(adata.obs[reference_column])
+    sample_names = set(adata.obs[sample_names_col])
+
+    ## Add basic observation-level (read-level) metadata to the object: first valid position in a read and last valid position in the read. From this determine the read length. Save two new variable which hold the first and last valid positions in the entire dataset
+    print('calculating read length stats')
     # Add some basic observation-level (read-level) metadata to the anndata object
     read_first_valid_position = np.array([int(adata.var_names[i]) for i in np.argmax(~np.isnan(adata.X), axis=1)])
     read_last_valid_position = np.array([int(adata.var_names[i]) for i in (adata.X.shape[1] - 1 - np.argmax(~np.isnan(adata.X[:, ::-1]), axis=1))])
@@ -24,4 +43,44 @@ def calculate_read_length_stats(adata):
     # Define variables to hold the first and last valid position in the dataset
     upper_bound = int(np.nanmax(adata.obs['last_valid_position']))
     lower_bound = int(np.nanmin(adata.obs['first_valid_position']))
+
+    # Add an unstructured element to the anndata object which points to a dictionary of read lengths keyed by reference and sample name. Points to a tuple containing (mean, median, stdev) of the read lengths of the sample for the given reference strand
+
+    ## Plot histogram of read length data and save the median and stdev of the read lengths for each sample.
+    adata.uns['read_length_dict'] = {}
+
+    for reference in references:
+        temp_reference_adata = adata[adata.obs[reference_column] == reference].copy()
+        split_reference = reference.split('_')[0][1:]
+        for sample in sample_names:
+            temp_sample_adata = temp_reference_adata[temp_reference_adata.obs[sample_names_col] == sample].copy()
+            temp_data = temp_sample_adata.obs['read_length']
+            max_length = np.max(temp_data)
+            mean = np.mean(temp_data)
+            median = np.median(temp_data)
+            stdev = np.std(temp_data)
+            adata.uns['read_length_dict'][f'{reference}_{sample}'] = [mean, median, stdev]
+            if not np.isnan(max_length):
+                n_bins = int(max_length // 100)
+            else:
+                n_bins = 1
+            if show_read_length_histogram or save_read_length_histogram:
+                plt.figure(figsize=(10, 6))
+                plt.text(median + 0.5, max(plt.hist(temp_data, bins=n_bins)[0]) / 2, f'Median: {median:.2f}', color='red')
+                plt.hist(temp_data, bins=n_bins, alpha=0.7, color='blue', edgecolor='black')
+                plt.xlabel('Read Length')
+                plt.ylabel('Count')
+                title = f'Read length distribution of {temp_sample_adata.shape[0]} total reads from {sample} sample on {split_reference} allele'
+                plt.title(title)
+                # Add a vertical line at the median
+                plt.axvline(median, color='red', linestyle='dashed', linewidth=1)
+                # Annotate the median
+                plt.xlim(lower_bound - 100, upper_bound + 100)
+                if save_read_length_histogram:
+                    save_name = output_directory + f'/{readwrite.date_string()} {title}'
+                    plt.savefig(save_name, bbox_inches='tight', pad_inches=0.1)
+                    plt.close()
+                else:
+                    plt.show()
+
     return upper_bound, lower_bound
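The first/last valid-position bookkeeping uses np.argmax over a non-NaN mask, scanned forward and then reversed. A toy sketch; the final read_length arithmetic shown here is an assumption, since the diff does not include that line:

import numpy as np

X = np.array([[np.nan, 1.0, 0.0, 1.0, np.nan],
              [0.0,    1.0, np.nan, np.nan, np.nan]])
positions = np.array([100, 101, 102, 103, 104])  # var_names as genomic coordinates

valid = ~np.isnan(X)
first = positions[np.argmax(valid, axis=1)]
last = positions[X.shape[1] - 1 - np.argmax(valid[:, ::-1], axis=1)]
print(first, last, last - first + 1)  # [101 100] [103 101] [3 2]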
--- a/smftools/preprocessing/clean_NaN.py
+++ b/smftools/preprocessing/clean_NaN.py
@@ -1,14 +1,21 @@
 ## clean_NaN
-import numpy as np
-import anndata as ad
-import pandas as pd
 
-# NaN handling
 def clean_NaN(adata, layer=None):
     """
-    Input: An adata object and the layer to fill Nan values of
-    Output: Append layers to adata that contain NaN cleaning strategies
+    Append layers to adata that contain NaN cleaning strategies.
+
+    Parameters:
+        adata (AnnData): an adata object
+        layer (str): string representing the layer to fill NaN values in
+
+    Returns:
+        None
     """
+    import numpy as np
+    import anndata as ad
+    import pandas as pd
+    from ..readwrite import adata_to_df
+
     # Fill NaN with closest SMF value
     df = adata_to_df(adata, layer=layer)
     df = df.ffill(axis=1).bfill(axis=1)
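The nearest-value fill is a forward-fill followed by a back-fill along each row, so interior gaps inherit the call to their left and leading gaps inherit the first call on the read. A sketch of that strategy on a toy frame:

import numpy as np
import pandas as pd

df = pd.DataFrame([[np.nan, 1.0, np.nan, 0.0, np.nan],
                   [0.0, np.nan, np.nan, 1.0, 1.0]])
filled = df.ffill(axis=1).bfill(axis=1)
print(filled.values)
# [[1. 1. 1. 0. 0.]
#  [0. 0. 0. 1. 1.]]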
--- a/smftools/preprocessing/filter_converted_reads_on_methylation.py
+++ b/smftools/preprocessing/filter_converted_reads_on_methylation.py
@@ -1,15 +1,22 @@
 ## filter_converted_reads_on_methylation
-import numpy as np
-import anndata as ad
-import pandas as pd
 
 ## Conversion SMF Specific
 # Read methylation QC
 def filter_converted_reads_on_methylation(adata, valid_SMF_site_threshold=0.8, min_SMF_threshold=0.025):
     """
-    Input: Adata object. Minimum thresholds for valid SMF site fraction in read, as well as minimum methylation content in read
-    Output: A subset of the adata object
+    Filter adata object using minimum thresholds for valid SMF site fraction in read, as well as minimum methylation content in read.
+
+    Parameters:
+        adata (AnnData): An adata object.
+        valid_SMF_site_threshold (float): A minimum proportion of valid SMF sites that must be present in the read. Default is 0.8
+        min_SMF_threshold (float): A minimum read methylation level. Default is 0.025
+    Returns:
+        adata (AnnData): The filtered adata object.
     """
+    import numpy as np
+    import anndata as ad
+    import pandas as pd
+
     if valid_SMF_site_threshold:
         # Keep reads that have over a given valid GpC site content
         adata = adata[adata.obs['fraction_valid_GpC_site_in_range'] > valid_SMF_site_threshold].copy()
@@ -17,4 +24,6 @@ def filter_converted_reads_on_methylation(adata, valid_SMF_site_threshold=0.8, m
         # Keep reads with SMF methylation over background methylation.
         adata = adata[adata.obs['GpC_above_other_C'] == True].copy()
         # Keep reads over a defined methylation threshold
-        adata = adata[adata.obs['GpC_site_row_methylation_means'] > min_SMF_threshold].copy()
\ No newline at end of file
+        adata = adata[adata.obs['GpC_site_row_methylation_means'] > min_SMF_threshold].copy()
+
+    return adata
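Chained, the three filters keep only reads that clear every threshold, and the newly added return makes the filtered subset usable by the caller. A toy sketch with a plain DataFrame standing in for adata.obs (column names taken from the diff):

import pandas as pd

obs = pd.DataFrame({
    'fraction_valid_GpC_site_in_range': [0.90, 0.50, 0.95],
    'GpC_above_other_C': [True, True, False],
    'GpC_site_row_methylation_means': [0.30, 0.40, 0.01],
})
keep = ((obs['fraction_valid_GpC_site_in_range'] > 0.8)
        & obs['GpC_above_other_C']
        & (obs['GpC_site_row_methylation_means'] > 0.025))
print(obs.index[keep].tolist())  # [0] -> only the first read survives all three filters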