smftools 0.1.3__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/METADATA +44 -11
  2. smftools-0.1.6.dist-info/RECORD +4 -0
  3. smftools/__init__.py +0 -25
  4. smftools/_settings.py +0 -20
  5. smftools/_version.py +0 -1
  6. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  7. smftools/datasets/F1_sample_sheet.csv +0 -5
  8. smftools/datasets/__init__.py +0 -9
  9. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  10. smftools/datasets/datasets.py +0 -28
  11. smftools/informatics/__init__.py +0 -14
  12. smftools/informatics/archived/bam_conversion.py +0 -59
  13. smftools/informatics/archived/bam_direct.py +0 -63
  14. smftools/informatics/archived/basecalls_to_adata.py +0 -71
  15. smftools/informatics/conversion_smf.py +0 -79
  16. smftools/informatics/direct_smf.py +0 -89
  17. smftools/informatics/fast5_to_pod5.py +0 -21
  18. smftools/informatics/helpers/LoadExperimentConfig.py +0 -74
  19. smftools/informatics/helpers/__init__.py +0 -60
  20. smftools/informatics/helpers/align_and_sort_BAM.py +0 -48
  21. smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -73
  22. smftools/informatics/helpers/archived/informatics.py +0 -260
  23. smftools/informatics/helpers/archived/load_adata.py +0 -516
  24. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  25. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -31
  26. smftools/informatics/helpers/canoncall.py +0 -25
  27. smftools/informatics/helpers/complement_base_list.py +0 -21
  28. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -54
  29. smftools/informatics/helpers/converted_BAM_to_adata.py +0 -233
  30. smftools/informatics/helpers/count_aligned_reads.py +0 -43
  31. smftools/informatics/helpers/extract_base_identities.py +0 -57
  32. smftools/informatics/helpers/extract_mods.py +0 -51
  33. smftools/informatics/helpers/extract_readnames_from_BAM.py +0 -22
  34. smftools/informatics/helpers/find_conversion_sites.py +0 -61
  35. smftools/informatics/helpers/generate_converted_FASTA.py +0 -98
  36. smftools/informatics/helpers/get_chromosome_lengths.py +0 -32
  37. smftools/informatics/helpers/get_native_references.py +0 -28
  38. smftools/informatics/helpers/index_fasta.py +0 -12
  39. smftools/informatics/helpers/make_dirs.py +0 -21
  40. smftools/informatics/helpers/make_modbed.py +0 -27
  41. smftools/informatics/helpers/modQC.py +0 -27
  42. smftools/informatics/helpers/modcall.py +0 -28
  43. smftools/informatics/helpers/modkit_extract_to_adata.py +0 -518
  44. smftools/informatics/helpers/ohe_batching.py +0 -52
  45. smftools/informatics/helpers/one_hot_encode.py +0 -21
  46. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -52
  47. smftools/informatics/helpers/separate_bam_by_bc.py +0 -43
  48. smftools/informatics/helpers/split_and_index_BAM.py +0 -41
  49. smftools/informatics/load_adata.py +0 -127
  50. smftools/informatics/readwrite.py +0 -106
  51. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  52. smftools/informatics/subsample_pod5.py +0 -104
  53. smftools/plotting/__init__.py +0 -0
  54. smftools/preprocessing/__init__.py +0 -34
  55. smftools/preprocessing/append_C_context.py +0 -69
  56. smftools/preprocessing/archives/preprocessing.py +0 -614
  57. smftools/preprocessing/binarize_on_Youden.py +0 -42
  58. smftools/preprocessing/binary_layers_to_ohe.py +0 -30
  59. smftools/preprocessing/calculate_complexity.py +0 -71
  60. smftools/preprocessing/calculate_consensus.py +0 -47
  61. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -96
  62. smftools/preprocessing/calculate_coverage.py +0 -41
  63. smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -27
  64. smftools/preprocessing/calculate_position_Youden.py +0 -104
  65. smftools/preprocessing/calculate_read_length_stats.py +0 -86
  66. smftools/preprocessing/clean_NaN.py +0 -38
  67. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -29
  68. smftools/preprocessing/filter_reads_on_length.py +0 -41
  69. smftools/preprocessing/invert_adata.py +0 -23
  70. smftools/preprocessing/load_sample_sheet.py +0 -24
  71. smftools/preprocessing/make_dirs.py +0 -21
  72. smftools/preprocessing/mark_duplicates.py +0 -134
  73. smftools/preprocessing/min_non_diagonal.py +0 -25
  74. smftools/preprocessing/recipes.py +0 -125
  75. smftools/preprocessing/remove_duplicates.py +0 -21
  76. smftools/readwrite.py +0 -106
  77. smftools/tools/__init__.py +0 -0
  78. smftools/tools/apply_HMM.py +0 -1
  79. smftools/tools/cluster.py +0 -0
  80. smftools/tools/read_HMM.py +0 -1
  81. smftools/tools/subset_adata.py +0 -32
  82. smftools/tools/train_HMM.py +0 -43
  83. smftools-0.1.3.dist-info/RECORD +0 -84
  84. {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/WHEEL +0 -0
  85. {smftools-0.1.3.dist-info → smftools-0.1.6.dist-info}/licenses/LICENSE +0 -0
smftools/preprocessing/load_sample_sheet.py DELETED
@@ -1,24 +0,0 @@
- # load_sample_sheet
-
- def load_sample_sheet(adata, sample_sheet_path, mapping_key_column):
-     """
-     Loads a sample sheet csv and uses one of the columns to map sample information into the AnnData object.
-
-     Parameters:
-         adata (AnnData): The AnnData object to append sample information to.
-         sample_sheet_path (str): Path to the sample sheet csv containing the sample metadata.
-         mapping_key_column (str): Name of the sample sheet column to use as the key for mapping metadata onto adata.obs.
-
-     Returns:
-         None
-     """
-     import pandas as pd
-     import anndata as ad
-     df = pd.read_csv(sample_sheet_path)
-     key_column = mapping_key_column
-     df[key_column] = df[key_column].astype(str)
-     value_columns = [column for column in df.columns if column != key_column]
-     mapping_dict = df.set_index(key_column)[value_columns].to_dict(orient='index')
-     for column in value_columns:
-         column_map = {key: value[column] for key, value in mapping_dict.items()}
-         adata.obs[column] = adata.obs[key_column].map(column_map)
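To make the mapping behavior concrete, here is a minimal, hypothetical usage sketch; it assumes load_sample_sheet is in scope (e.g. imported from smftools.preprocessing in 0.1.3) and that adata.obs already carries a 'Sample' column matching the sample sheet's key column:

import anndata as ad
import numpy as np
import pandas as pd

# Toy AnnData whose reads already carry the mapping key column.
adata = ad.AnnData(
    X=np.zeros((4, 3)),
    obs=pd.DataFrame({"Sample": ["s1", "s1", "s2", "s2"]}, index=[f"read{i}" for i in range(4)]),
)
# Toy sample sheet: one row per sample, extra columns become per-read metadata.
pd.DataFrame({"Sample": ["s1", "s2"], "Condition": ["control", "treated"]}).to_csv("sample_sheet.csv", index=False)

load_sample_sheet(adata, "sample_sheet.csv", mapping_key_column="Sample")
print(adata.obs["Condition"].tolist())  # ['control', 'control', 'treated', 'treated']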
smftools/preprocessing/make_dirs.py DELETED
@@ -1,21 +0,0 @@
- ## make_dirs
-
- # General
- def make_dirs(directories):
-     """
-     Takes a list of directory paths and creates each directory if it does not already exist.
-
-     Parameters:
-         directories (list): A list of directories to make
-
-     Returns:
-         None
-     """
-     import os
-
-     for directory in directories:
-         if not os.path.isdir(directory):
-             os.mkdir(directory)
-             print(f"Directory '{directory}' created successfully.")
-         else:
-             print(f"Directory '{directory}' already exists.")
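Note that the helper calls os.mkdir, which only creates the leaf directory, so parent directories must already exist (nested paths would need os.makedirs). A quick sketch, assuming make_dirs is in scope; the directory names are arbitrary:

import os
import tempfile

base = tempfile.mkdtemp()
make_dirs([os.path.join(base, "plots"), os.path.join(base, "h5ads")])
make_dirs([os.path.join(base, "plots")])  # second call only reports that the directory already exists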
smftools/preprocessing/mark_duplicates.py DELETED
@@ -1,134 +0,0 @@
- ## mark_duplicates
-
- def mark_duplicates(adata, layers, obs_column='Reference', sample_col='Sample_names', hamming_distance_thresholds={}):
-     """
-     Marks duplicates in the adata object.
-
-     Parameters:
-         adata (AnnData): An adata object.
-         layers (list): A list of strings representing the layers to use.
-         obs_column (str): A string representing the obs column name to first subset on. Default is 'Reference'.
-         sample_col (str): A string representing the obs column name to second subset on. Default is 'Sample_names'.
-         hamming_distance_thresholds (dict): A dictionary keyed by obs_column categories that points to a float corresponding to the distance threshold to apply. Default is an empty dict.
-
-     Returns:
-         None
-     """
-
-     import numpy as np
-     import pandas as pd
-     import matplotlib.pyplot as plt
-     from scipy.signal import find_peaks
-     import networkx as nx
-     from .binary_layers_to_ohe import binary_layers_to_ohe
-     from .calculate_pairwise_hamming_distances import calculate_pairwise_hamming_distances
-     from .min_non_diagonal import min_non_diagonal
-
-     categories = adata.obs[obs_column].cat.categories
-     sample_names = adata.obs[sample_col].cat.categories
-
-     # Calculate the pairwise Hamming distances within each reference/sample set. Determine distance thresholds for each reference/sample pair
-     adata.obs['Nearest_neighbor_Hamming_distance'] = pd.Series(np.nan, index=adata.obs_names, dtype=float)
-     for cat in categories:
-         cat_subset = adata[adata.obs[obs_column] == cat].copy()
-         for sample in sample_names:
-             sample_subset = cat_subset[cat_subset.obs[sample_col] == sample].copy()
-             # Encode sequencing reads as one-hot encodings
-             adata.uns[f'{cat}_{sample}_read_OHE_dict'] = binary_layers_to_ohe(sample_subset, layers, stack='hstack')
-             # Unpack the read names and one hot encodings into lists
-             read_names = []
-             ohe_list = []
-             for read_name, ohe in adata.uns[f'{cat}_{sample}_read_OHE_dict'].items():
-                 read_names.append(read_name)
-                 ohe_list.append(ohe)
-             # Calculate the pairwise hamming distances
-             print(f'Calculating hamming distances for {sample} on {cat} allele')
-             distance_matrix = calculate_pairwise_hamming_distances(ohe_list)
-             n_reads = distance_matrix.shape[0]
-             # Load the hamming matrix into a dataframe with index and column names as the read_names
-             distance_df = pd.DataFrame(distance_matrix, index=read_names, columns=read_names)
-             # Save the distance dataframe into an unstructured component of the adata object
-             adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}'] = distance_df
-             if n_reads > 1:
-                 # Calculate the minimum non-self distance for every read in the reference and sample
-                 min_distance_values = min_non_diagonal(distance_matrix)
-                 min_distance_df = pd.DataFrame({'Nearest_neighbor_Hamming_distance': min_distance_values}, index=read_names)
-                 adata.obs.update(min_distance_df)
-                 # Generate a histogram of minimum non-self distances for each read
-                 if n_reads > 3:
-                     n_bins = n_reads // 4
-                 else:
-                     n_bins = 1
-                 min_distance_bins = plt.hist(min_distance_values, bins=n_bins)
-                 if cat in hamming_distance_thresholds:
-                     adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = hamming_distance_thresholds[cat]
-                 else: # eventually this should be written to use known PCR duplicate controls for thresholding.
-                     # Normalize the max value in any histogram bin to 1
-                     normalized_min_distance_counts = min_distance_bins[0] / np.max(min_distance_bins[0])
-                     # Extract the bin index of peak centers in the histogram
-                     peak_centers, _ = find_peaks(normalized_min_distance_counts, prominence=0.2, distance=5)
-                     first_peak_index = peak_centers[0]
-                     offset_index = first_peak_index-1
-                     # Use the distance corresponding to the first peak as the threshold distance in graph construction
-                     first_peak_distance = min_distance_bins[1][first_peak_index]
-                     offset_distance = min_distance_bins[1][offset_index]
-                     adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = offset_distance
-             else:
-                 adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}'] = 0
-
-     ## Detect likely duplicate reads and mark them in the adata object.
-     adata.obs['Marked_duplicate'] = pd.Series(False, index=adata.obs_names, dtype=bool)
-     adata.obs['Unique_in_final_read_set'] = pd.Series(False, index=adata.obs_names, dtype=bool)
-     adata.obs[f'Hamming_distance_cluster_within_{obs_column}_and_sample'] = pd.Series(-1, index=adata.obs_names, dtype=int)
-
-     for cat in categories:
-         for sample in sample_names:
-             distance_df = adata.uns[f'Pairwise_Hamming_distance_within_{cat}_{sample}']
-             read_names = distance_df.index
-             distance_matrix = distance_df.values
-             n_reads = distance_matrix.shape[0]
-             distance_threshold = adata.uns[f'Hamming_distance_threshold_for_{cat}_{sample}']
-             # Initialize the read distance graph
-             G = nx.Graph()
-             # Add each read as a node to the graph
-             G.add_nodes_from(range(n_reads))
-             # Add edges based on the threshold
-             for i in range(n_reads):
-                 for j in range(i + 1, n_reads):
-                     if distance_matrix[i, j] <= distance_threshold:
-                         G.add_edge(i, j)
-             # Determine distinct clusters using connected components
-             clusters = list(nx.connected_components(G))
-             clusters = [list(cluster) for cluster in clusters]
-             # Get the number of clusters
-             cluster_count = len(clusters)
-             if n_reads > 0:
-                 fraction_unique = cluster_count / n_reads
-             else:
-                 fraction_unique = 0
-             adata.uns[f'Hamming_distance_clusters_within_{cat}_{sample}'] = [cluster_count, n_reads, fraction_unique, clusters]
-             # Update the adata object
-             read_cluster_map = {}
-             read_duplicate_map = {}
-             read_keep_map = {}
-             for i, cluster in enumerate(clusters):
-                 for j, read_index in enumerate(cluster):
-                     read_name = read_names[read_index]
-                     read_cluster_map[read_name] = i
-                     if len(cluster) > 1:
-                         read_duplicate_map[read_name] = True
-                         if j == 0:
-                             read_keep_map[read_name] = True
-                         else:
-                             read_keep_map[read_name] = False
-                     elif len(cluster) == 1:
-                         read_duplicate_map[read_name] = False
-                         read_keep_map[read_name] = True
-             cluster_df = pd.DataFrame.from_dict(read_cluster_map, orient='index', columns=[f'Hamming_distance_cluster_within_{obs_column}_and_sample'], dtype=int)
-             duplicate_df = pd.DataFrame.from_dict(read_duplicate_map, orient='index', columns=['Marked_duplicate'], dtype=bool)
-             keep_df = pd.DataFrame.from_dict(read_keep_map, orient='index', columns=['Unique_in_final_read_set'], dtype=bool)
-             df_combined = pd.concat([cluster_df, duplicate_df, keep_df], axis=1)
-             adata.obs.update(df_combined)
-             adata.obs['Marked_duplicate'] = adata.obs['Marked_duplicate'].astype(bool)
-             adata.obs['Unique_in_final_read_set'] = adata.obs['Unique_in_final_read_set'].astype(bool)
-             print(f'Hamming clusters for {sample} on {cat}\nThreshold: {distance_threshold}\nNumber clusters: {cluster_count}\nNumber reads: {n_reads}\nFraction unique: {fraction_unique}')
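The core of the duplicate marking is simple: threshold the pairwise Hamming-distance matrix and take connected components, so that each component is one putative original molecule. A self-contained sketch of just that step (the toy matrix and threshold are made up):

import networkx as nx
import numpy as np

# Toy pairwise Hamming distances for 4 reads; reads 0 and 1 are near-identical.
distance_matrix = np.array([
    [0, 1, 9, 8],
    [1, 0, 9, 8],
    [9, 9, 0, 7],
    [8, 8, 7, 0],
])
threshold = 2

G = nx.Graph()
G.add_nodes_from(range(distance_matrix.shape[0]))
for i in range(distance_matrix.shape[0]):
    for j in range(i + 1, distance_matrix.shape[0]):
        if distance_matrix[i, j] <= threshold:
            G.add_edge(i, j)

clusters = [sorted(c) for c in nx.connected_components(G)]
print(clusters)                                  # [[0, 1], [2], [3]] -> read 1 duplicates read 0
print(len(clusters) / distance_matrix.shape[0])  # fraction unique = 0.75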
smftools/preprocessing/min_non_diagonal.py DELETED
@@ -1,25 +0,0 @@
- ## min_non_diagonal
-
- def min_non_diagonal(matrix):
-     """
-     Takes a matrix and returns the smallest value from each row with the diagonal masked.
-
-     Parameters:
-         matrix (ndarray): A 2D ndarray.
-
-     Returns:
-         min_values (list): A list of minimum values from each row of the matrix
-     """
-     import numpy as np
-
-     n = matrix.shape[0]
-     min_values = []
-     for i in range(n):
-         # Mask to exclude the diagonal element
-         row_mask = np.ones(n, dtype=bool)
-         row_mask[i] = False
-         # Extract the row excluding the diagonal element
-         row = matrix[i, row_mask]
-         # Find the minimum value in the row
-         min_values.append(np.min(row))
-     return min_values
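A quick worked example of the helper above on a 3x3 distance matrix (values are made up; min_non_diagonal is assumed to be in scope):

import numpy as np

m = np.array([
    [0, 5, 2],
    [5, 0, 7],
    [2, 7, 0],
])
# Nearest non-self neighbor per row: row 0 -> 2, row 1 -> 5, row 2 -> 2.
print([int(v) for v in min_non_diagonal(m)])  # [2, 5, 2]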
smftools/preprocessing/recipes.py DELETED
@@ -1,125 +0,0 @@
- # recipes
-
- def recipe_1_Kissiov_and_McKenna_2025(adata, sample_sheet_path, output_directory, mapping_key_column='Sample', reference_column='Reference', sample_names_col='Sample_names', invert=False):
-     """
-     The first part of the preprocessing workflow applied to the smf.inform.pod_to_adata() output derived from Kissiov_and_McKenna_2025.
-
-     Performs the following tasks:
-         1) Loads a sample CSV to append metadata mappings to the adata object.
-         2) Appends a boolean indicating whether each position in var_names is within a given reference.
-         3) Appends the cytosine context to each position from each reference.
-         4) Calculates read level methylation statistics.
-         5) Optionally inverts the adata to flip the position coordinate orientation.
-         6) Calculates read length statistics (start position, end position, read length).
-         7) Returns a dictionary to pass the variable namespace to the parent scope.
-
-     Parameters:
-         adata (AnnData): The AnnData object to use as input.
-         sample_sheet_path (str): String representing the path to the sample sheet csv containing the sample metadata.
-         output_directory (str): String representing the path to the output directory for plots.
-         mapping_key_column (str): The column name to use as the mapping keys for applying the sample sheet metadata.
-         reference_column (str): The name of the reference column to use.
-         sample_names_col (str): The name of the sample name column to use.
-         invert (bool): Whether to invert the positional coordinates of the adata object.
-
-     Returns:
-         variables (dict): A dictionary of variables to append to the parent scope.
-     """
-     import anndata as ad
-     import pandas as pd
-     import numpy as np
-     from .load_sample_sheet import load_sample_sheet
-     from .calculate_coverage import calculate_coverage
-     from .append_C_context import append_C_context
-     from .calculate_converted_read_methylation_stats import calculate_converted_read_methylation_stats
-     from .invert_adata import invert_adata
-     from .calculate_read_length_stats import calculate_read_length_stats
-
-     # Clean up some of the Reference metadata and save variable names that point to sets of values in the column.
-     adata.obs[reference_column] = adata.obs[reference_column].astype('category')
-     references = adata.obs[reference_column].cat.categories
-     split_references = [(reference, reference.split('_')[0][1:]) for reference in references]
-     reference_mapping = {k: v for k, v in split_references}
-     adata.obs[f'{reference_column}_short'] = adata.obs[reference_column].map(reference_mapping)
-     short_references = set(adata.obs[f'{reference_column}_short'])
-     binary_layers = adata.layers.keys()
-
-     # Load sample sheet metadata
-     load_sample_sheet(adata, sample_sheet_path, mapping_key_column)
-
-     # Hold the sample names set
-     adata.obs[sample_names_col] = adata.obs[sample_names_col].astype('category')
-     sample_names = adata.obs[sample_names_col].cat.categories
-
-     # Add position level metadata
-     calculate_coverage(adata, obs_column=reference_column)
-     adata.var['SNP_position'] = (adata.var[f'N_{reference_column}_with_position'] > 0) & (adata.var[f'N_{reference_column}_with_position'] < len(references)).astype(bool)
-
-     # Append cytosine context to the reference positions based on the conversion strand.
-     append_C_context(adata, obs_column=reference_column, use_consensus=False)
-
-     # Calculate read level methylation statistics. Assess if GpC methylation level is above other_C methylation level as a QC.
-     calculate_converted_read_methylation_stats(adata, reference_column, sample_names_col, output_directory, show_methylation_histogram=False, save_methylation_histogram=False)
-
-     # Invert the adata object (ie flip the strand orientation for visualization)
-     if invert:
-         invert_adata(adata)
-     else:
-         pass
-
-     # Calculate read length statistics, with options to display or save the read length histograms
-     upper_bound, lower_bound = calculate_read_length_stats(adata, reference_column, sample_names_col, output_directory, show_read_length_histogram=False, save_read_length_histogram=False)
-
-     variables = {
-         "short_references": short_references,
-         "binary_layers": binary_layers,
-         "sample_names": sample_names,
-         "upper_bound": upper_bound,
-         "lower_bound": lower_bound,
-         "references": references
-     }
-     return variables
-
- def recipe_2_Kissiov_and_McKenna_2025(adata, output_directory, binary_layers, hamming_distance_thresholds={}, reference_column='Reference', sample_names_col='Sample_names'):
-     """
-     The second part of the preprocessing workflow applied to the adata that has already been preprocessed by recipe_1_Kissiov_and_McKenna_2025.
-
-     Performs the following tasks:
-         1) Adds new layers containing NaN replaced variants of adata.X (fill_closest, nan0_0minus1, nan1_12).
-         2) Marks putative PCR duplicates using pairwise hamming distance metrics.
-         3) Performs a complexity analysis of the library based on the PCR duplicate detection rate.
-         4) Removes PCR duplicates from the adata.
-         5) Returns two adata objects: one for the filtered adata and one for the duplicate adata.
-
-     Parameters:
-         adata (AnnData): The AnnData object to use as input.
-         output_directory (str): String representing the path to the output directory for plots.
-         binary_layers (list): A list of layers to use for the binary encoding of read sequences. Used for duplicate detection.
-         hamming_distance_thresholds (dict): A dictionary keyed by obs_column categories that points to a float corresponding to the distance threshold to apply. Default is an empty dict.
-         reference_column (str): The name of the reference column to use.
-         sample_names_col (str): The name of the sample name column to use.
-
-     Returns:
-         filtered_adata (AnnData): An AnnData object containing the filtered reads
-         duplicates (AnnData): An AnnData object containing the duplicate reads
-     """
-     import anndata as ad
-     import pandas as pd
-     import numpy as np
-     from .clean_NaN import clean_NaN
-     from .mark_duplicates import mark_duplicates
-     from .calculate_complexity import calculate_complexity
-     from .remove_duplicates import remove_duplicates
-
-     # NaN replacement strategies stored in additional layers. Having layer=None uses adata.X
-     clean_NaN(adata, layer=None)
-
-     # Duplicate detection using pairwise hamming distance across reads
-     mark_duplicates(adata, binary_layers, obs_column=reference_column, sample_col=sample_names_col, hamming_distance_thresholds=hamming_distance_thresholds)
-
-     # Complexity analysis using the marked duplicates and the Lander-Waterman algorithm
-     calculate_complexity(adata, output_directory, obs_column=reference_column, sample_col=sample_names_col, plot=True, save_plot=False)
-
-     # Remove duplicate reads and store the duplicate reads in a new AnnData object named duplicates.
-     filtered_adata, duplicates = remove_duplicates(adata)
-     return filtered_adata, duplicates
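The two recipes were written to be run back to back. A hedged sketch of a driver script against the 0.1.3 module layout shown in the file list above; the input h5ad, sample sheet path, and output directory are assumptions:

import anndata as ad
from smftools.preprocessing.recipes import (
    recipe_1_Kissiov_and_McKenna_2025,
    recipe_2_Kissiov_and_McKenna_2025,
)

adata = ad.read_h5ad("conversion_smf.h5ad.gz")  # hypothetical output of the load_adata pipeline
variables = recipe_1_Kissiov_and_McKenna_2025(
    adata, "F1_sample_sheet.csv", "plots/", mapping_key_column="Sample", invert=False
)
filtered_adata, duplicates = recipe_2_Kissiov_and_McKenna_2025(
    adata, "plots/", list(variables["binary_layers"])
)
filtered_adata.write_h5ad("filtered.h5ad.gz", compression="gzip")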
smftools/preprocessing/remove_duplicates.py DELETED
@@ -1,21 +0,0 @@
- # remove_duplicates
-
- def remove_duplicates(adata):
-     """
-     Remove duplicates from the adata object.
-
-     Parameters:
-         adata (AnnData): An adata object.
-
-     Returns:
-         filtered_adata (AnnData): An AnnData object of the filtered reads
-         duplicates (AnnData): An AnnData object of the duplicate reads
-     """
-     import anndata as ad
-
-     initial_size = adata.shape[0]
-     filtered_adata = adata[adata.obs['Unique_in_final_read_set'] == True].copy()
-     final_size = filtered_adata.shape[0]
-     print(f'Removed {initial_size-final_size} reads from the dataset')
-     duplicates = adata[adata.obs['Unique_in_final_read_set'] == False].copy()
-     return filtered_adata, duplicates
smftools/readwrite.py DELETED
@@ -1,106 +0,0 @@
- ## readwrite ##
-
- ######################################################################################################
- ## Datetime functionality
- def date_string():
-     """
-     Each time this is called, it returns the current date string
-     """
-     from datetime import datetime
-     current_date = datetime.now()
-     date_string = current_date.strftime("%Y%m%d")
-     date_string = date_string[2:]
-     return date_string
-
- def time_string():
-     """
-     Each time this is called, it returns the current time string
-     """
-     from datetime import datetime
-     current_time = datetime.now()
-     return current_time.strftime("%H:%M:%S")
- ######################################################################################################
-
- ######################################################################################################
- ## Numpy, Pandas, AnnData functionality
- def adata_to_df(adata, layer=None):
-     """
-     Input: An adata object with a specified layer.
-     Output: A dataframe for the specific layer.
-     """
-     import pandas as pd
-     import anndata as ad
-
-     # Extract the data matrix from the given layer
-     if layer:
-         data_matrix = adata.layers[layer]
-     else:
-         data_matrix = adata.X
-     # Extract observation (read) annotations
-     obs_df = adata.obs
-     # Extract variable (position) annotations
-     var_df = adata.var
-     # Convert data matrix and annotations to pandas DataFrames
-     df = pd.DataFrame(data_matrix, index=obs_df.index, columns=var_df.index)
-     return df
-
- def save_matrix(matrix, save_name):
-     """
-     Input: A numpy matrix and a save_name
-     Output: A txt file representation of the data matrix
-     """
-     import numpy as np
-     np.savetxt(f'{save_name}.txt', matrix)
-
- def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
-     """
-     Concatenate all h5ad files in a directory and delete them after the final adata is written out.
-     Input: an output file path relative to the directory in which the function is called
-     """
-     import os
-     import anndata as ad
-     # Runtime warnings
-     import warnings
-     warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
-     warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
-
-     # List all files in the directory
-     files = os.listdir(os.getcwd())
-     # Get current working directory
-     cwd = os.getcwd()
-     suffix = file_suffix
-     # Filter file names that contain the search string in their filename and keep them in a list
-     hdfs = [hdf for hdf in files if suffix in hdf]
-     # Sort file list by names and print the list of file names
-     hdfs.sort()
-     print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
-     # Iterate over all of the hdf5 files and concatenate them.
-     final_adata = None
-     for hdf in hdfs:
-         print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
-         temp_adata = ad.read_h5ad(hdf)
-         if final_adata:
-             print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
-             final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
-         else:
-             print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
-             final_adata = temp_adata
-     print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
-     final_adata.write_h5ad(output_file, compression='gzip')
-
-     # Delete the individual h5ad files and only keep the final concatenated file
-     if delete_inputs:
-         files = os.listdir(os.getcwd())
-         hdfs = [hdf for hdf in files if suffix in hdf]
-         if output_file in hdfs:
-             hdfs.remove(output_file)
-         # Iterate over the files and delete them
-         for hdf in hdfs:
-             try:
-                 os.remove(hdf)
-                 print(f"Deleted file: {hdf}")
-             except OSError as e:
-                 print(f"Error deleting file {hdf}: {e}")
-     else:
-         print('Keeping input files')
- ######################################################################################################
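A brief sketch of how these helpers compose, using a toy AnnData; the 'binary' layer name is made up and the functions are assumed to be in scope from the module above:

import anndata as ad
import numpy as np
import pandas as pd

adata = ad.AnnData(
    X=np.arange(6, dtype=float).reshape(2, 3),
    obs=pd.DataFrame(index=["read1", "read2"]),
    var=pd.DataFrame(index=["pos1", "pos2", "pos3"]),
)
adata.layers["binary"] = (adata.X > 2).astype(float)

df = adata_to_df(adata, layer="binary")                    # reads as rows, positions as columns
save_matrix(df.values, f"{date_string()}_binary_matrix")   # writes e.g. 250101_binary_matrix.txt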
smftools/tools/__init__.py DELETED
File without changes
smftools/tools/apply_HMM.py DELETED
@@ -1 +0,0 @@
- # apply_HMM
smftools/tools/cluster.py DELETED
File without changes
smftools/tools/read_HMM.py DELETED
@@ -1 +0,0 @@
- # read_HMM
smftools/tools/subset_adata.py DELETED
@@ -1,32 +0,0 @@
- # subset_adata
-
- def subset_adata(adata, obs_columns):
-     """
-     Subsets an AnnData object based on categorical values in specified `.obs` columns.
-
-     Parameters:
-         adata (AnnData): The AnnData object to subset.
-         obs_columns (list of str): List of `.obs` column names to subset by. The order matters.
-
-     Returns:
-         dict: A dictionary where keys are tuples of category values and values are corresponding AnnData subsets.
-     """
-
-     def subset_recursive(adata_subset, columns):
-         if not columns:
-             return {(): adata_subset}
-
-         current_column = columns[0]
-         categories = adata_subset.obs[current_column].cat.categories
-
-         subsets = {}
-         for cat in categories:
-             subset = adata_subset[adata_subset.obs[current_column] == cat]
-             subsets.update({(cat,) + key: value for key, value in subset_recursive(subset, columns[1:]).items()})
-
-         return subsets
-
-     # Start the recursive subset process
-     subsets_dict = subset_recursive(adata, obs_columns)
-
-     return subsets_dict
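A hypothetical usage sketch: split reads by reference and then by sample (column names and categories are made up; subset_adata is assumed to be in scope):

import anndata as ad
import numpy as np
import pandas as pd

obs = pd.DataFrame({
    "Reference": pd.Categorical(["chr1", "chr1", "chr2", "chr2"]),
    "Sample_names": pd.Categorical(["s1", "s2", "s1", "s2"]),
}, index=[f"read{i}" for i in range(4)])
adata = ad.AnnData(X=np.zeros((4, 2)), obs=obs)

subsets = subset_adata(adata, ["Reference", "Sample_names"])
print(sorted(subsets))                # [('chr1', 's1'), ('chr1', 's2'), ('chr2', 's1'), ('chr2', 's2')]
print(subsets[("chr1", "s1")].n_obs)  # 1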
smftools/tools/train_HMM.py DELETED
@@ -1,43 +0,0 @@
- # train_HMM
-
- def train_HMM(adata, model_name='trained_HMM', save_hmm=False):
-     """
-     Trains a two-state (bound/unbound) hidden Markov model on the reads in adata.X.
-     Parameters:
-         adata (AnnData): Input AnnData object
-         model_name (str): Name of the model
-         save_hmm (bool): Whether to save the model
-
-     """
-     import numpy as np
-     import anndata as ad
-     from pomegranate.distributions import Categorical
-     from pomegranate.hmm import DenseHMM
-
-     bound = Categorical([[0.95, 0.05]])
-     unbound = Categorical([[0.05, 0.95]])
-
-     edges = [[0.9, 0.1], [0.1, 0.9]]
-     starts = [0.5, 0.5]
-     ends = [0.5, 0.5]
-
-     model = DenseHMM([bound, unbound], edges=edges, starts=starts, ends=ends, max_iter=5, verbose=True)
-
-     # Define training sets and labels
-     # Determine the number of reads to sample
-     n_sample = round(0.7 * adata.X.shape[0])
-     # Generate random indices
-     np.random.seed(0)
-     random_indices = np.random.choice(adata.shape[0], size=n_sample, replace=False)
-     # Subset the AnnData object using the random indices
-     training_adata_subsampled = adata[random_indices, :]
-     training_sequences = training_adata_subsampled.X
-
-     # Train the HMM without labeled data
-     model.fit(training_sequences, algorithm='baum-welch')
-
-     if save_hmm:
-         # Save the model to a file
-         model_json = model.to_json()
-         with open(f'{model_name}.json', 'w') as f:
-             f.write(model_json)
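For orientation, a minimal self-contained sketch of the same two-state bound/unbound model with the pomegranate >= 1.0 API, decoding a single toy binarized read; this is an assumption-laden illustration, not the package's own training routine (which also subsamples reads and persists the model):

import torch
from pomegranate.distributions import Categorical
from pomegranate.hmm import DenseHMM

bound = Categorical([[0.95, 0.05]])    # P(observation = 0 or 1 | bound/protected state)
unbound = Categorical([[0.05, 0.95]])  # P(observation = 0 or 1 | unbound/accessible state)
model = DenseHMM([bound, unbound], edges=[[0.9, 0.1], [0.1, 0.9]], starts=[0.5, 0.5], ends=[0.5, 0.5], max_iter=5)

# One toy read: a protected footprint followed by accessible positions; shape (n_reads, length, 1).
X = torch.tensor([[0, 0, 0, 1, 1, 1, 1]]).reshape(1, -1, 1)
model.fit(X)
print(model.predict(X))  # per-position state assignments, e.g. [[0, 0, 0, 1, 1, 1, 1]]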
smftools-0.1.3.dist-info/RECORD DELETED
@@ -1,84 +0,0 @@
- smftools/__init__.py,sha256=zy4ckT7hKrLrlm6NiZQoupvc6oSN7wJsyOBCYdzukcQ,401
- smftools/_settings.py,sha256=Ed8lzKUA5ncq5ZRfSp0t6_rphEEjMxts6guttwTZP5Y,409
- smftools/_version.py,sha256=R5TtpJu7Qu6sOarfDpp-5Oyy8Pi2Ir3VewCvsCQiAgo,21
- smftools/readwrite.py,sha256=DgVisHYdkjzaO7suPbUvluImeTc3jqGDlioNveHUxPc,4158
- smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz,sha256=q6wJtgFRDln0o20XNCx1qad3lwcdCoylqPN7wskTfI8,2926497
- smftools/datasets/F1_sample_sheet.csv,sha256=9PodIIOXK2eamYPbC6DGnXdzgi9bRDovf296j1aM0ak,259
- smftools/datasets/__init__.py,sha256=xkSTlPuakVYVCuRurif9BceNBDt6bsngJvvjI8757QI,142
- smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz,sha256=niOcVHaYY7h3XyvwSkN-V_NMBaRt2vTP5TrJO0CwMCs,8385050
- smftools/datasets/datasets.py,sha256=0y597Ntp707bOgDwN6O-JEt9yxgplj66p0aj6Zs_IB4,779
- smftools/informatics/__init__.py,sha256=WQiMBr1yjDrlmHg8UNgW2MJsq4fPrVfh-UBr5tYI9x4,326
- smftools/informatics/conversion_smf.py,sha256=PS-TjgMttr3VRrT0zg5L_L01xMOewB_OXSsQyoM7DWI,4333
- smftools/informatics/direct_smf.py,sha256=ue7p7deuRwaZtEh9EFV1YTE8HKRAmOsx9oaRJdjCrbY,4697
- smftools/informatics/fast5_to_pod5.py,sha256=xfdZU3QluaAcR-q2uBRz8hcBwYt73nCnrFeahvi0OKQ,704
- smftools/informatics/load_adata.py,sha256=i-2YCSaeLzbPfNtKPrLwfkv-9u_TrTAZrbtNAj3FRWY,7271
- smftools/informatics/readwrite.py,sha256=DgVisHYdkjzaO7suPbUvluImeTc3jqGDlioNveHUxPc,4158
- smftools/informatics/subsample_fasta_from_bed.py,sha256=YqYV09rvEQdeiS5hTTrKa8xYmJfeM3Vk-UUqwpw0qBk,1983
- smftools/informatics/subsample_pod5.py,sha256=zDw9tRcrFRmPI62xkcy9dh8IfsJcuYm7R-FVeBC_g3s,4701
- smftools/informatics/archived/bam_conversion.py,sha256=I8EzXjQixMmqx2oWnoNSH5NURBhfT-krbWHkoi_M964,3330
- smftools/informatics/archived/bam_direct.py,sha256=jbEFtUIiUR8Wlp3po_sWkr19AUNS9WZjglojb9j28vo,3606
- smftools/informatics/archived/basecalls_to_adata.py,sha256=-Nag6lr_NAtU4t8jo0GSMdgIAIfmDge-5VEUPQbEatE,3692
- smftools/informatics/helpers/LoadExperimentConfig.py,sha256=gsWGoa9cydwY4Kd-hTXF2gtmxc8glRRD2V1JB88e9js,2822
- smftools/informatics/helpers/__init__.py,sha256=KrfyM08_RgDf3Ajvb4KNTvcOqZiWYSIVhEznCr01Gcc,2255
- smftools/informatics/helpers/align_and_sort_BAM.py,sha256=DouG6nGWXtz2ulZD5p0sEShE-4dbPudHaWcHFm4-oJA,2184
- smftools/informatics/helpers/aligned_BAM_to_bed.py,sha256=eYkGQFSM2gPEauASkY_-9Yvy6727vP8Q4wx_st85Dpc,2638
- smftools/informatics/helpers/bed_to_bigwig.py,sha256=AazYEZzKgKgukSFwCpeiApzxh1kbt11X4RFqRIiBIaY,1466
- smftools/informatics/helpers/binarize_converted_base_identities.py,sha256=iJlDah-YJ0zx0UrlHdtgvrALVNSA0TTTdDoKmNCVg0Q,1846
- smftools/informatics/helpers/canoncall.py,sha256=M7HEqhYsWMUB0tLP3hzMM0L7PhcOTXgetl5lV3GgIaw,1062
- smftools/informatics/helpers/complement_base_list.py,sha256=k6EkLtxFoajaIufxw1p0pShJ2nPHyGLTbzZmIFFjB4o,532
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py,sha256=RXPn7e6Dcwol9tnUsfXJu3EuZcMSOJJo5LNWouovvZs,2715
- smftools/informatics/helpers/converted_BAM_to_adata.py,sha256=Rsnydzpf9lMS3TQjXpbXJSSfCzhVTPn3rBDLiK-8utA,13991
- smftools/informatics/helpers/count_aligned_reads.py,sha256=uYyUYglF1asiaoxr-LKxPMUEbfyD7FS-dumTg2hJHzQ,2170
- smftools/informatics/helpers/extract_base_identities.py,sha256=E-_m9W82N52NjX5kz9Af5YH0S2k58hnq9KTrm4S5vgM,4370
- smftools/informatics/helpers/extract_mods.py,sha256=UBFjXDKz_A6ivjcocYT1_pKjvygY2Fdg0RjQmMS8UuA,2269
- smftools/informatics/helpers/extract_readnames_from_BAM.py,sha256=3FxSNqbZ1VsOK2RfHrvevQTzhWATf5E8bZ5yVOqayvk,759
- smftools/informatics/helpers/find_conversion_sites.py,sha256=5AghDQzEoSvE2Og98VsKoeWUFSLnIGY1LnRu1BtQavM,3700
- smftools/informatics/helpers/generate_converted_FASTA.py,sha256=ueaAsFnBuc7zKwkBivBR3DJg4DtkxkHHIQcVVSWzv-w,5161
- smftools/informatics/helpers/get_chromosome_lengths.py,sha256=sLumLrGsU_Xg_oJcdOpQyjUGpJoT2HbcmxWwbwzXUlE,1036
- smftools/informatics/helpers/get_native_references.py,sha256=fRuyEm9UJkfd5DwHmFb1bxEtNvtSI1_BxGRmrCymGkw,981
- smftools/informatics/helpers/index_fasta.py,sha256=N3IErfSiavYldeaat8xcQgA1MpykoQHcE0gHUeWuClE,267
- smftools/informatics/helpers/make_dirs.py,sha256=lWHXpwC76MFM5sSme9i_WeYUaxutzybendokhny03ds,537
- smftools/informatics/helpers/make_modbed.py,sha256=cOQ97gPfRiCcw_fqboxousXIiOYjp78IFYLbu749U1Y,939
- smftools/informatics/helpers/modQC.py,sha256=LeOBObG8gAVVdgESIMceYhd5AW1gfN7ABo91OQtOzTM,1041
- smftools/informatics/helpers/modcall.py,sha256=9PH7Peq4y-VBqQcMkbv0TwgePBlD5aM4_FmI7H4hbQQ,1142
- smftools/informatics/helpers/modkit_extract_to_adata.py,sha256=duPlRAIz4VWM-jm9iaLY7N6JHQcun_L0nhr2VyUjNTI,38184
- smftools/informatics/helpers/ohe_batching.py,sha256=_Mz2p1We5PVIb8S6Hbq_hREKJ9mGQiADwfFK_NgMGhA,1909
- smftools/informatics/helpers/one_hot_encode.py,sha256=hpZAuwa9ndkhyCm9sO65KVHE0lbFDKqRylfliEKyD4o,632
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py,sha256=tAnXFleGzXJNjHRAgZ0NUJuZ0P3aKmUYIrK-V9VoJKY,1860
- smftools/informatics/helpers/separate_bam_by_bc.py,sha256=Fsi8OEmv5Ny13cWoHVV9JmEjVFEXT_ZxbBOlRdmyPbE,1742
- smftools/informatics/helpers/split_and_index_BAM.py,sha256=_TFJ8fcLbIf37JG83hSc1zgs1yxX70-NhA8y-PbhTpo,1966
- smftools/informatics/helpers/archived/informatics.py,sha256=gKb2ZJ_LcAeEXuQqn9e-QDF_sS4tMpMTr2vZlqa7n54,14572
- smftools/informatics/helpers/archived/load_adata.py,sha256=DhvYYqO9VLsZqhL1WjN9sd-e3fgvdXGlgTP18z1h0L0,33654
- smftools/plotting/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- smftools/preprocessing/__init__.py,sha256=5FQNrj51KmaDLeAGGBA8iWMkYiSOe7O91ES8mT4aVtE,1399
- smftools/preprocessing/append_C_context.py,sha256=pP5u9o5U4JmHras0PK6yas65u4-U5KlX3sKLb-duo80,3728
- smftools/preprocessing/binarize_on_Youden.py,sha256=slkkt56DZ1FZWy8Un5mNJEZ49JlPnPKow2zU4GoHEr8,2303
- smftools/preprocessing/binary_layers_to_ohe.py,sha256=931eHuVda6pMZTvC7jVTKkY2a_KQWpSfgi-nkA5NmaI,1238
- smftools/preprocessing/calculate_complexity.py,sha256=ut60et8bmIswtiLhctJWHNseIV4ZRQultYdtJPHcRPs,3224
- smftools/preprocessing/calculate_consensus.py,sha256=6zRpRmb2xdfDu5hctZrReALRb7Pjn8sy8xJZTm3o0nU,2442
- smftools/preprocessing/calculate_converted_read_methylation_stats.py,sha256=Si0DcES0lLMvg3XgdKpedxfPnXQ14tEFKrOAFRn3fHs,6059
- smftools/preprocessing/calculate_coverage.py,sha256=ZgRxQGpydxQg1exkvSiy8nHmzDIPGGqL5vL9XQ2PZQ4,2068
- smftools/preprocessing/calculate_pairwise_hamming_distances.py,sha256=e5Mzyex7pT29H2PY014uU4Fi_eewbut1JkzC1ffBbCg,961
- smftools/preprocessing/calculate_position_Youden.py,sha256=mfQ6nFfUaEaKg_icyHA1zZlhh0wHjpLE56BZDXOdP_4,6364
- smftools/preprocessing/calculate_read_length_stats.py,sha256=6m362JaCKlD0QoBUMnM2qsB6Jo_4shl7xFzqU1uZccU,4945
- smftools/preprocessing/clean_NaN.py,sha256=1vieT026p0gDJCbqB_CiLvAGGxlc-5xufoKJgZuBFFk,1150
- smftools/preprocessing/filter_converted_reads_on_methylation.py,sha256=SN5q0rqYtYW9j3i0sVSyTv9EmR_uLKI7GkjmJixeOU0,1307
- smftools/preprocessing/filter_reads_on_length.py,sha256=sAT66bjuI8ZtXyQc9SuPzq1dPIB1CNVx6VfWqVng4Dg,2191
- smftools/preprocessing/invert_adata.py,sha256=u6Y70EH0B5mXb9-HuukIlzpMgZ6rhzcJuy3YZZTx3SA,684
- smftools/preprocessing/load_sample_sheet.py,sha256=uGjzG9x-1t_1lCooH85P8Tfg80GdvVx8Jv1LPl9XNFM,915
- smftools/preprocessing/make_dirs.py,sha256=lWHXpwC76MFM5sSme9i_WeYUaxutzybendokhny03ds,537
- smftools/preprocessing/mark_duplicates.py,sha256=sQuPcTw8JsQoONOk-kMlAF965sIk2Pu-M7rIyfbyGGs,8145
- smftools/preprocessing/min_non_diagonal.py,sha256=hx1asW8CEmLaIroZISW8EcAf_RnBEC_nofGD8QG0b1E,711
- smftools/preprocessing/recipes.py,sha256=KzSw5JW0WJGzSis5Fm7moQY5PxOYl6-uYYf1NDj6nOE,7117
- smftools/preprocessing/remove_duplicates.py,sha256=Erooi5_1VOUNfWpzddzmMNYMCl1U1jJryt7ZtMhabAs,699
- smftools/preprocessing/archives/preprocessing.py,sha256=4mLT09A7vwRZ78FHmuwtv38mH9TQ9qrZc_WjHRhhkIw,34379
- smftools/tools/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- smftools/tools/apply_HMM.py,sha256=AuVtOki69-Xs4mhjhTXJzd49KCVXwixFyWSUgDjtR6s,11
- smftools/tools/cluster.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- smftools/tools/read_HMM.py,sha256=N0MGG494VjlxYJcCVz1jN4OasGtRITZS98SJ2xB_j8k,10
- smftools/tools/subset_adata.py,sha256=qyU9iCal03edb5aUS3AZ2U4TlL3uQ42jGI9hX3QF7Fc,1047
- smftools/tools/train_HMM.py,sha256=x5ZcXj-heWQqDOX86nuuDoj1tPkYKl04fYA1fCKNQ0c,1380
- smftools-0.1.3.dist-info/METADATA,sha256=u26Og8tpAF2TgXZztotk3Q4EuP7Fvf73s1tlIjBDD-A,6410
- smftools-0.1.3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
- smftools-0.1.3.dist-info/licenses/LICENSE,sha256=F8LwmL6vMPddaCt1z1S83Kh_OZv50alTlY7BvVx1RXw,1066
- smftools-0.1.3.dist-info/RECORD,,