smftools 0.2.1__py3-none-any.whl → 0.2.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. smftools/__init__.py +2 -6
  2. smftools/_version.py +1 -1
  3. smftools/cli/__init__.py +0 -0
  4. smftools/cli/cli_flows.py +94 -0
  5. smftools/cli/hmm_adata.py +338 -0
  6. smftools/cli/load_adata.py +577 -0
  7. smftools/cli/preprocess_adata.py +363 -0
  8. smftools/cli/spatial_adata.py +564 -0
  9. smftools/cli_entry.py +435 -0
  10. smftools/config/conversion.yaml +11 -6
  11. smftools/config/deaminase.yaml +12 -7
  12. smftools/config/default.yaml +36 -25
  13. smftools/config/direct.yaml +25 -1
  14. smftools/config/discover_input_files.py +115 -0
  15. smftools/config/experiment_config.py +109 -12
  16. smftools/informatics/__init__.py +13 -7
  17. smftools/informatics/archived/fast5_to_pod5.py +43 -0
  18. smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  19. smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  20. smftools/informatics/{helpers → archived/helpers/archived}/aligned_BAM_to_bed.py +6 -4
  21. smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  22. smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  23. smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  24. smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
  25. smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  26. smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +1 -1
  27. smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
  28. smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  29. smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  30. smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  31. smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
  32. smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
  33. smftools/informatics/{helpers → archived/helpers/archived}/plot_bed_histograms.py +0 -19
  34. smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +6 -5
  35. smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +7 -7
  36. smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  37. smftools/informatics/bam_functions.py +812 -0
  38. smftools/informatics/basecalling.py +67 -0
  39. smftools/informatics/bed_functions.py +366 -0
  40. smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +42 -30
  41. smftools/informatics/fasta_functions.py +255 -0
  42. smftools/informatics/h5ad_functions.py +197 -0
  43. smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +142 -59
  44. smftools/informatics/modkit_functions.py +129 -0
  45. smftools/informatics/ohe.py +160 -0
  46. smftools/informatics/pod5_functions.py +224 -0
  47. smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
  48. smftools/plotting/autocorrelation_plotting.py +1 -3
  49. smftools/plotting/general_plotting.py +1037 -362
  50. smftools/preprocessing/__init__.py +2 -0
  51. smftools/preprocessing/append_base_context.py +3 -3
  52. smftools/preprocessing/append_binary_layer_by_base_context.py +4 -4
  53. smftools/preprocessing/binarize.py +17 -0
  54. smftools/preprocessing/binarize_on_Youden.py +2 -2
  55. smftools/preprocessing/calculate_position_Youden.py +1 -1
  56. smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  57. smftools/preprocessing/filter_reads_on_modification_thresholds.py +19 -19
  58. smftools/preprocessing/flag_duplicate_reads.py +1 -1
  59. smftools/readwrite.py +266 -140
  60. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/METADATA +10 -9
  61. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/RECORD +82 -70
  62. smftools-0.2.3.dist-info/entry_points.txt +2 -0
  63. smftools/cli.py +0 -184
  64. smftools/informatics/fast5_to_pod5.py +0 -24
  65. smftools/informatics/helpers/__init__.py +0 -73
  66. smftools/informatics/helpers/align_and_sort_BAM.py +0 -86
  67. smftools/informatics/helpers/bam_qc.py +0 -66
  68. smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  69. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -378
  70. smftools/informatics/helpers/discover_input_files.py +0 -100
  71. smftools/informatics/helpers/index_fasta.py +0 -12
  72. smftools/informatics/helpers/make_dirs.py +0 -21
  73. smftools/informatics/readwrite.py +0 -106
  74. smftools/informatics/subsample_fasta_from_bed.py +0 -47
  75. smftools/load_adata.py +0 -1346
  76. smftools-0.2.1.dist-info/entry_points.txt +0 -2
  77. /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
  78. /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
  79. /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  80. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +0 -0
  81. /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  82. /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  83. /smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +0 -0
  84. /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
  85. /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
  86. /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
  87. /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
  88. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
  89. /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
  90. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
  91. /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
  92. /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
  93. /smftools/informatics/{helpers/binarize_converted_base_identities.py → binarize_converted_base_identities.py} +0 -0
  94. /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
  95. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
  96. {smftools-0.2.1.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
@@ -1,106 +0,0 @@
1
- ## readwrite ##
2
-
3
- ######################################################################################################
4
- ## Datetime functionality
5
def date_string():
    """
    Return today's local date as a compact six-digit ``YYMMDD`` string.

    The value is computed at call time, so each call reflects the current date.

    Returns:
        str: Two-digit year, month and day, e.g. "240131" for 31 January 2024.
    """
    from datetime import datetime

    # "%y" is the two-digit year, which replaces formatting with "%Y" and
    # slicing off the century afterwards. Returning directly also avoids a
    # local variable that shadowed this function's own name.
    return datetime.now().strftime("%y%m%d")
14
-
15
def time_string():
    """
    Return the current local wall-clock time formatted as ``HH:MM:SS``.

    The value is computed at call time, so each call reflects the current time.
    """
    import datetime as _dt

    now = _dt.datetime.now()
    stamp = now.strftime("%H:%M:%S")
    return stamp
22
- ######################################################################################################
23
-
24
- ######################################################################################################
25
- ## Numpy, Pandas, Anndata functionality
26
def adata_to_df(adata, layer=None):
    """
    Convert one data matrix of an AnnData-like object into a pandas DataFrame.

    Parameters:
        adata: Object exposing ``X``, ``layers``, ``obs`` and ``var``.
        layer (str | None): Key into ``adata.layers``. When falsy (the default),
            ``adata.X`` is used instead.

    Returns:
        pandas.DataFrame: The selected matrix, with rows labelled by
        ``adata.obs.index`` and columns labelled by ``adata.var.index``.

    Raises:
        KeyError: If ``layer`` is given but is not present in ``adata.layers``.
    """
    import pandas as pd

    # Pick the requested layer, falling back to the primary matrix.
    data_matrix = adata.layers[layer] if layer else adata.X

    # Matrices stored in a sparse container expose ``toarray``; densify them so
    # the DataFrame constructor receives a regular 2-D array it can label.
    if hasattr(data_matrix, "toarray"):
        data_matrix = data_matrix.toarray()

    return pd.DataFrame(data_matrix, index=adata.obs.index, columns=adata.var.index)
46
-
47
def save_matrix(matrix, save_name):
    """
    Write a numeric matrix to a plain-text file.

    Parameters:
        matrix: Array-like data accepted by ``numpy.savetxt``.
        save_name: Output path without extension; ".txt" is appended.

    Returns:
        None
    """
    from numpy import savetxt

    destination = f'{save_name}.txt'
    savetxt(destination, matrix)
54
-
55
def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
    """
    Concatenate the matching h5ad files in the current working directory into one file.

    Files are read in sorted name order and combined one at a time with an
    outer join, then written gzip-compressed to ``output_file``.

    Parameters:
        output_file (str): Path of the concatenated file to write, resolved
            relative to the directory in which the function is called.
        file_suffix (str): Text a file name must contain to be used as an input.
            Note that this is a substring match, not a strict suffix match.
        delete_inputs (bool): If True, delete the input files that were read
            once the concatenated file has been written. The output file itself
            is never deleted.

    Returns:
        None

    Raises:
        FileNotFoundError: If no file name in the working directory contains
            ``file_suffix``.
    """
    import os
    import warnings

    import anndata as ad

    # Silence anndata's runtime warnings for the duration of the process.
    warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
    warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')

    # Collect the input file names once, in a deterministic order.
    hdfs = sorted(name for name in os.listdir(os.getcwd()) if file_suffix in name)
    print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
    if not hdfs:
        # Without this guard the write step below would fail on a None object.
        raise FileNotFoundError(
            f"No files containing '{file_suffix}' found in {os.getcwd()}"
        )

    # Iterate over all of the hdf5 files and concatenate them.
    final_adata = None
    for hdf in hdfs:
        print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
        temp_adata = ad.read_h5ad(hdf)
        # Compare against None explicitly: object truthiness is not a reliable
        # "already initialised" test (an object with zero observations may
        # evaluate as falsy).
        if final_adata is None:
            print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
            final_adata = temp_adata
        else:
            print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
            final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)

    print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
    final_adata.write_h5ad(output_file, compression='gzip')

    if not delete_inputs:
        print('Keeping input files')
        return

    # Delete only the files that were actually read. Compare absolute paths so
    # the freshly written output survives even when output_file is spelled
    # differently from the directory listing (e.g. "./name" or an absolute path).
    output_path = os.path.abspath(output_file)
    for hdf in hdfs:
        if os.path.abspath(hdf) == output_path:
            continue
        try:
            os.remove(hdf)
            print(f"Deleted file: {hdf}")
        except OSError as e:
            print(f"Error deleting file {hdf}: {e}")
106
- ######################################################################################################
@@ -1,47 +0,0 @@
1
- # subsample_fasta_from_bed
2
-
3
def subsample_fasta_from_bed(input_FASTA, input_bed, output_directory, output_FASTA):
    """
    Take a genome-wide FASTA file and a bed file containing coordinate windows of interest. Outputs a subsampled FASTA.

    One FASTA record is written per BED line whose chromosome exists in the
    input FASTA. Each record is named ``chrom:start-end`` using the BED
    coordinates unchanged, followed by any extra BED columns joined by spaces.
    Blank lines, ``#`` comment lines and ``track``/``browser`` header lines in
    the BED file are skipped.

    Parameters:
        input_FASTA (str): String representing the path to the input FASTA file.
        input_bed (str): String representing the path to the input BED file.
        output_directory (str): String representing the path to the output directory for the new FASTA file.
        output_FASTA (str): Name of the output FASTA.

    Returns:
        None

    Raises:
        IndexError: If a data line has fewer than three whitespace-separated fields.
        ValueError: If the start or end field of a data line is not an integer.
    """
    from pyfaidx import Fasta
    import os

    # Load the FASTA file using pyfaidx
    fasta = Fasta(input_FASTA)

    output_FASTA_path = os.path.join(output_directory, output_FASTA)

    try:
        with open(input_bed, 'r') as bed, open(output_FASTA_path, 'w') as out_fasta:
            for line in bed:
                # Each data line contains: chrom, start, end (and possibly more columns)
                fields = line.split()

                # Skip blank lines and BED header/comment lines instead of
                # failing while parsing them as coordinates.
                if not fields or fields[0].startswith('#') or fields[0] in ('track', 'browser'):
                    continue

                chrom = fields[0]
                start = int(fields[1])  # BED is 0-based
                end = int(fields[2])  # BED is 0-based and end is exclusive

                # Check if the chromosome exists in the FASTA file
                if chrom not in fasta:
                    print(f"Warning: {chrom} not found in the FASTA file")
                    continue

                # The BED coordinates are used directly as a Python-style slice
                # (0-based start, exclusive end); no coordinate shift is applied.
                sequence = fasta[chrom][start:end].seq

                # Write the header (with the optional description) and the sequence
                if len(fields) > 3:
                    description = " ".join(fields[3:])
                    out_fasta.write(f">{chrom}:{start}-{end} {description}\n")
                else:
                    out_fasta.write(f">{chrom}:{start}-{end}\n")
                out_fasta.write(f"{sequence}\n")
    finally:
        # Release the FASTA handle even if parsing or writing fails.
        fasta.close()