smftools 0.1.1__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. smftools-0.1.6.dist-info/METADATA +127 -0
  2. smftools-0.1.6.dist-info/RECORD +4 -0
  3. smftools/__init__.py +0 -25
  4. smftools/_settings.py +0 -19
  5. smftools/_version.py +0 -1
  6. smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  7. smftools/datasets/__init__.py +0 -9
  8. smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  9. smftools/datasets/datasets.py +0 -27
  10. smftools/informatics/__init__.py +0 -12
  11. smftools/informatics/bam_conversion.py +0 -47
  12. smftools/informatics/bam_direct.py +0 -49
  13. smftools/informatics/basecalls_to_adata.py +0 -42
  14. smftools/informatics/fast5_to_pod5.py +0 -19
  15. smftools/informatics/helpers/LoadExperimentConfig.py +0 -74
  16. smftools/informatics/helpers/__init__.py +0 -42
  17. smftools/informatics/helpers/align_and_sort_BAM.py +0 -52
  18. smftools/informatics/helpers/archived/informatics.py +0 -260
  19. smftools/informatics/helpers/archived/load_adata.py +0 -516
  20. smftools/informatics/helpers/binarize_converted_base_identities.py +0 -31
  21. smftools/informatics/helpers/canoncall.py +0 -23
  22. smftools/informatics/helpers/converted_BAM_to_adata.py +0 -164
  23. smftools/informatics/helpers/count_aligned_reads.py +0 -39
  24. smftools/informatics/helpers/extract_base_identities.py +0 -43
  25. smftools/informatics/helpers/extract_mods.py +0 -51
  26. smftools/informatics/helpers/find_conversion_sites.py +0 -59
  27. smftools/informatics/helpers/generate_converted_FASTA.py +0 -79
  28. smftools/informatics/helpers/get_native_references.py +0 -28
  29. smftools/informatics/helpers/make_dirs.py +0 -21
  30. smftools/informatics/helpers/make_modbed.py +0 -27
  31. smftools/informatics/helpers/modQC.py +0 -27
  32. smftools/informatics/helpers/modcall.py +0 -26
  33. smftools/informatics/helpers/modkit_extract_to_adata.py +0 -367
  34. smftools/informatics/helpers/one_hot_encode.py +0 -19
  35. smftools/informatics/helpers/separate_bam_by_bc.py +0 -41
  36. smftools/informatics/helpers/split_and_index_BAM.py +0 -29
  37. smftools/informatics/pod5_conversion.py +0 -53
  38. smftools/informatics/pod5_direct.py +0 -55
  39. smftools/informatics/pod5_to_adata.py +0 -40
  40. smftools/informatics/readwrite.py +0 -106
  41. smftools/informatics/subsample_pod5.py +0 -48
  42. smftools/plotting/__init__.py +0 -0
  43. smftools/preprocessing/__init__.py +0 -29
  44. smftools/preprocessing/append_C_context.py +0 -46
  45. smftools/preprocessing/archives/preprocessing.py +0 -614
  46. smftools/preprocessing/binarize_on_Youden.py +0 -42
  47. smftools/preprocessing/binary_layers_to_ohe.py +0 -30
  48. smftools/preprocessing/calculate_complexity.py +0 -71
  49. smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -45
  50. smftools/preprocessing/calculate_coverage.py +0 -41
  51. smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -27
  52. smftools/preprocessing/calculate_position_Youden.py +0 -104
  53. smftools/preprocessing/calculate_read_length_stats.py +0 -32
  54. smftools/preprocessing/clean_NaN.py +0 -38
  55. smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -27
  56. smftools/preprocessing/filter_reads_on_length.py +0 -39
  57. smftools/preprocessing/invert_adata.py +0 -22
  58. smftools/preprocessing/mark_duplicates.py +0 -119
  59. smftools/preprocessing/min_non_diagonal.py +0 -25
  60. smftools/preprocessing/remove_duplicates.py +0 -18
  61. smftools/readwrite.py +0 -106
  62. smftools/tools/__init__.py +0 -0
  63. smftools-0.1.1.dist-info/METADATA +0 -88
  64. smftools-0.1.1.dist-info/RECORD +0 -64
  65. {smftools-0.1.1.dist-info → smftools-0.1.6.dist-info}/WHEEL +0 -0
  66. {smftools-0.1.1.dist-info → smftools-0.1.6.dist-info}/licenses/LICENSE +0 -0
@@ -1,106 +0,0 @@
- ## readwrite ##
-
- ######################################################################################################
- ## Datetime functionality
- def date_string():
-     """
-     Each time this is called, it returns the current date string
-     """
-     from datetime import datetime
-     current_date = datetime.now()
-     date_string = current_date.strftime("%Y%m%d")
-     date_string = date_string[2:]
-     return date_string
-
- def time_string():
-     """
-     Each time this is called, it returns the current time string
-     """
-     from datetime import datetime
-     current_time = datetime.now()
-     return current_time.strftime("%H:%M:%S")
- ######################################################################################################
-
- ######################################################################################################
- ## Numpy, Pandas, Anndata functionality
- def adata_to_df(adata, layer=None):
-     """
-     Input: An adata object with a specified layer.
-     Output: A dataframe for the specific layer.
-     """
-     import pandas as pd
-     import anndata as ad
-
-     # Extract the data matrix from the given layer
-     if layer:
-         data_matrix = adata.layers[layer]
-     else:
-         data_matrix = adata.X
-     # Extract observation (read) annotations
-     obs_df = adata.obs
-     # Extract variable (position) annotations
-     var_df = adata.var
-     # Convert data matrix and annotations to pandas DataFrames
-     df = pd.DataFrame(data_matrix, index=obs_df.index, columns=var_df.index)
-     return df
-
- def save_matrix(matrix, save_name):
-     """
-     Input: A numpy matrix and a save_name
-     Output: A txt file representation of the data matrix
-     """
-     import numpy as np
-     np.savetxt(f'{save_name}.txt', matrix)
-
- def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
-     """
-     Concatenate all h5ad files in a directory and delete them after the final adata is written out.
-     Input: an output file path relative to the directory in which the function is called
-     """
-     import os
-     import anndata as ad
-     # Runtime warnings
-     import warnings
-     warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
-     warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
-
-     # List all files in the directory
-     files = os.listdir(os.getcwd())
-     # get current working directory
-     cwd = os.getcwd()
-     suffix = file_suffix
-     # Filter file names that contain the search string in their filename and keep them in a list
-     hdfs = [hdf for hdf in files if suffix in hdf]
-     # Sort file list by names and print the list of file names
-     hdfs.sort()
-     print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
-     # Iterate over all of the hdf5 files and concatenate them.
-     final_adata = None
-     for hdf in hdfs:
-         print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
-         temp_adata = ad.read_h5ad(hdf)
-         if final_adata:
-             print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
-             final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
-         else:
-             print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
-             final_adata = temp_adata
-     print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
-     final_adata.write_h5ad(output_file, compression='gzip')
-
-     # Delete the individual h5ad files and only keep the final concatenated file
-     if delete_inputs:
-         files = os.listdir(os.getcwd())
-         hdfs = [hdf for hdf in files if suffix in hdf]
-         if output_file in hdfs:
-             hdfs.remove(output_file)
-         # Iterate over the files and delete them
-         for hdf in hdfs:
-             try:
-                 os.remove(hdf)
-                 print(f"Deleted file: {hdf}")
-             except OSError as e:
-                 print(f"Error deleting file {hdf}: {e}")
-     else:
-         print('Keeping input files')
- ######################################################################################################
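For orientation, here is a minimal sketch of how these removed readwrite helpers chained together. The import path, file names, and layer name are illustrative assumptions, not taken from the package:

    import anndata as ad
    import numpy as np

    # Assumed import path; the module layout above suggests it but does not confirm it.
    from smftools.readwrite import adata_to_df, save_matrix, date_string, concatenate_h5ads

    # Hypothetical AnnData with a 'methylation' layer.
    adata = ad.AnnData(X=np.random.rand(10, 5))
    adata.layers['methylation'] = np.random.rand(10, 5)

    # Pull the layer out as a reads-by-positions DataFrame.
    df = adata_to_df(adata, layer='methylation')

    # Dump the matrix to text, stamped with the YYMMDD run date.
    save_matrix(df.to_numpy(), f'methylation_{date_string()}')

    # Merge every *.h5ad.gz in the working directory into one file,
    # deleting the per-sample inputs afterwards.
    concatenate_h5ads('combined.h5ad.gz', file_suffix='h5ad.gz', delete_inputs=True)

Since concatenate_h5ads matches files by substring and excludes the output from deletion only by exact name, output_file should be a bare filename in the working directory.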
@@ -1,48 +0,0 @@
- # subsample_pod5
-
- def subsample_pod5(pod5_path, read_name_path, output_directory):
-     """
-     Takes a POD5 file and a text file of read names of interest, and writes out a subsampled POD5 containing just those reads.
-     This is useful when you have a list of read names that mapped to a region of interest and want to reanalyze them from the POD5 level.
-
-     Parameters:
-         pod5_path (str): File path to the POD5 to subsample.
-         read_name_path (str | int): File path to a text file of read names, one per line. If an int is passed instead, a random subset of that many reads is taken.
-         output_directory (str): File path to the directory in which to write the output file.
-
-     Returns:
-         None
-     """
-     import pod5 as p5
-     import os
-
-     input_pod5_base = os.path.basename(pod5_path)
-
-     if type(read_name_path) == str:
-         input_read_name_base = os.path.basename(read_name_path)
-         output_base = input_pod5_base.split('.pod5')[0] + '_' + input_read_name_base.split('.txt')[0] + '_subsampled.pod5'
-         # Extract read names into a list of strings
-         with open(read_name_path, 'r') as file:
-             read_names = [line.strip() for line in file]
-         with p5.Reader(pod5_path) as reader:
-             read_records = []
-             for read_record in reader.reads(selection=read_names):
-                 read_records.append(read_record.to_read())
-
-     elif type(read_name_path) == int:
-         import random
-         output_base = input_pod5_base.split('.pod5')[0] + f'_{read_name_path}_randomly_subsampled.pod5'
-         with p5.Reader(pod5_path) as reader:
-             all_read_records = []
-             for read_record in reader.reads():
-                 all_read_records.append(read_record.to_read())
-         if read_name_path <= len(all_read_records):
-             read_records = random.sample(all_read_records, read_name_path)
-         else:
-             print('Trying to sample more reads than are contained in the input pod5, please try a lower value.')
-
-     output_pod5 = os.path.join(output_directory, output_base)
-
-     # Write the subsampled POD5
-     with p5.Writer(output_pod5) as writer:
-         writer.add_reads(read_records)
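A sketch of the two dispatch modes this removed helper supported, with hypothetical paths (the import location is an assumption):

    # Assumed import path for illustration.
    from smftools.informatics import subsample_pod5

    # String mode: keep only the reads named one-per-line in the text file.
    subsample_pod5('run.pod5', 'reads_of_interest.txt', 'subsampled/')

    # Int mode: draw 500 reads at random from the input POD5.
    subsample_pod5('run.pod5', 500, 'subsampled/')

Note that in the int mode the requested count must not exceed the number of reads in the input POD5: as written, oversampling only prints a message, read_records is then left unassigned, and the final write raises a NameError.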
File without changes
@@ -1,29 +0,0 @@
- from .append_C_context import append_C_context
- from .binarize_on_Youden import binarize_on_Youden
- from .calculate_complexity import calculate_complexity
- from .calculate_converted_read_methylation_stats import calculate_converted_read_methylation_stats
- from .calculate_coverage import calculate_coverage
- from .calculate_position_Youden import calculate_position_Youden
- from .calculate_read_length_stats import calculate_read_length_stats
- from .clean_NaN import clean_NaN
- from .filter_converted_reads_on_methylation import filter_converted_reads_on_methylation
- from .filter_reads_on_length import filter_reads_on_length
- from .invert_adata import invert_adata
- from .mark_duplicates import mark_duplicates
- from .remove_duplicates import remove_duplicates
-
- __all__ = [
-     "append_C_context",
-     "binarize_on_Youden",
-     "calculate_complexity",
-     "calculate_converted_read_methylation_stats",
-     "calculate_coverage",
-     "calculate_position_Youden",
-     "calculate_read_length_stats",
-     "clean_NaN",
-     "filter_converted_reads_on_methylation",
-     "filter_reads_on_length",
-     "invert_adata",
-     "mark_duplicates",
-     "remove_duplicates"
- ]
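This __init__.py defined the public import surface of the preprocessing subpackage: a wildcard import binds exactly the thirteen names in __all__. A small sketch of what that implied for callers (the last import assumes the function shares its module's name, as the other helpers do):

    # Names re-exported above are importable straight from the subpackage.
    from smftools.preprocessing import clean_NaN, mark_duplicates

    # A wildcard import pulls in exactly the names listed in __all__.
    from smftools.preprocessing import *

    # Modules shipped in the package but not re-exported here, such as
    # min_non_diagonal or binary_layers_to_ohe, still needed an explicit
    # submodule import (function name assumed to match the module name).
    from smftools.preprocessing.min_non_diagonal import min_non_diagonal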
@@ -1,46 +0,0 @@
- ## append_C_context
-
- ## Conversion SMF Specific
- # Read methylation QC
- def append_C_context(adata, obs_column='Reference', use_consensus=False):
-     """
-     Adds cytosine context annotations to each position, stratified by the given category. When use_consensus is True, the consensus sequence of the mapped reads is used; otherwise the reference FASTA sequence is used.
-
-     Parameters:
-         adata (AnnData): The input adata object.
-         obs_column (str): The observation column on which to stratify. Default is 'Reference', which should not be changed for most purposes.
-         use_consensus (bool): Whether to use the consensus sequence of the reads mapped to the reference. If False, the reference FASTA sequence is used instead.
-
-     Returns:
-         None
-     """
-     import numpy as np
-     import anndata as ad
-     site_types = ['GpC_site', 'CpG_site', 'ambiguous_GpC_site', 'ambiguous_CpG_site', 'other_C']
-     categories = adata.obs[obs_column].cat.categories
-     for cat in categories:
-         if use_consensus:
-             sequence = adata.uns[f'{cat}_consensus_sequence']
-         else:
-             sequence = adata.uns[f'{cat}_FASTA_sequence']
-         boolean_dict = {}
-         for site_type in site_types:
-             boolean_dict[f'{cat}_{site_type}'] = np.full(len(sequence), False, dtype=bool)
-         # Iterate through the sequence and apply the criteria
-         for i in range(1, len(sequence) - 1):
-             if sequence[i] == 'C':
-                 if sequence[i - 1] == 'G' and sequence[i + 1] != 'G':
-                     boolean_dict[f'{cat}_GpC_site'][i] = True
-                 elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
-                     boolean_dict[f'{cat}_ambiguous_GpC_site'][i] = True
-                 elif sequence[i - 1] != 'G' and sequence[i + 1] == 'G':
-                     boolean_dict[f'{cat}_CpG_site'][i] = True
-                 elif sequence[i - 1] == 'G' and sequence[i + 1] == 'G':
-                     boolean_dict[f'{cat}_ambiguous_CpG_site'][i] = True
-                 elif sequence[i - 1] != 'G' and sequence[i + 1] != 'G':
-                     boolean_dict[f'{cat}_other_C'][i] = True
-         for site_type in site_types:
-             adata.var[f'{cat}_{site_type}'] = boolean_dict[f'{cat}_{site_type}'].astype(bool)
-             adata.obsm[f'{cat}_{site_type}'] = adata[:, adata.var[f'{cat}_{site_type}'] == True].copy().X
-
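A minimal usage sketch, assuming the AnnData object already carries the per-category sequences in .uns that the function looks up ('chr1' stands in for a real reference category here):

    from smftools.preprocessing import append_C_context

    # Requires adata.obs['Reference'] to be categorical and, for each
    # category cat, adata.uns[f'{cat}_FASTA_sequence'] (or the consensus
    # variant when use_consensus=True) to hold that reference's sequence.
    append_C_context(adata, obs_column='Reference', use_consensus=False)

    # Per-position boolean masks land in adata.var ...
    gpc_mask = adata.var['chr1_GpC_site']
    # ... and the matching slices of X land in adata.obsm.
    gpc_matrix = adata.obsm['chr1_GpC_site']

Under this classification a GCG cytosine falls into an ambiguous bin, since it can carry either an enzymatic GpC mark or an endogenous CpG mark and the two cannot be distinguished at that position.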