smftools 0.1.3__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. smftools/__init__.py +5 -1
  2. smftools/_version.py +1 -1
  3. smftools/informatics/__init__.py +2 -0
  4. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  5. smftools/informatics/basecall_pod5s.py +80 -0
  6. smftools/informatics/conversion_smf.py +63 -10
  7. smftools/informatics/direct_smf.py +66 -18
  8. smftools/informatics/helpers/LoadExperimentConfig.py +1 -0
  9. smftools/informatics/helpers/__init__.py +16 -2
  10. smftools/informatics/helpers/align_and_sort_BAM.py +27 -16
  11. smftools/informatics/helpers/aligned_BAM_to_bed.py +49 -48
  12. smftools/informatics/helpers/bam_qc.py +66 -0
  13. smftools/informatics/helpers/binarize_converted_base_identities.py +69 -21
  14. smftools/informatics/helpers/canoncall.py +12 -3
  15. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +5 -4
  16. smftools/informatics/helpers/converted_BAM_to_adata.py +34 -22
  17. smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
  18. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  19. smftools/informatics/helpers/extract_base_identities.py +33 -46
  20. smftools/informatics/helpers/extract_mods.py +55 -23
  21. smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
  22. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  23. smftools/informatics/helpers/find_conversion_sites.py +33 -44
  24. smftools/informatics/helpers/generate_converted_FASTA.py +87 -86
  25. smftools/informatics/helpers/modcall.py +13 -5
  26. smftools/informatics/helpers/modkit_extract_to_adata.py +762 -396
  27. smftools/informatics/helpers/ohe_batching.py +65 -41
  28. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  29. smftools/informatics/helpers/one_hot_decode.py +27 -0
  30. smftools/informatics/helpers/one_hot_encode.py +45 -9
  31. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +1 -0
  32. smftools/informatics/helpers/run_multiqc.py +28 -0
  33. smftools/informatics/helpers/split_and_index_BAM.py +3 -8
  34. smftools/informatics/load_adata.py +58 -3
  35. smftools/plotting/__init__.py +15 -0
  36. smftools/plotting/classifiers.py +355 -0
  37. smftools/plotting/general_plotting.py +205 -0
  38. smftools/plotting/position_stats.py +462 -0
  39. smftools/preprocessing/__init__.py +6 -7
  40. smftools/preprocessing/append_C_context.py +22 -9
  41. smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py} +38 -26
  42. smftools/preprocessing/binarize_on_Youden.py +35 -32
  43. smftools/preprocessing/binary_layers_to_ohe.py +13 -3
  44. smftools/preprocessing/calculate_complexity.py +3 -2
  45. smftools/preprocessing/calculate_converted_read_methylation_stats.py +44 -46
  46. smftools/preprocessing/calculate_coverage.py +26 -25
  47. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  48. smftools/preprocessing/calculate_position_Youden.py +18 -7
  49. smftools/preprocessing/calculate_read_length_stats.py +39 -46
  50. smftools/preprocessing/clean_NaN.py +33 -25
  51. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  52. smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -5
  53. smftools/preprocessing/filter_reads_on_length.py +14 -4
  54. smftools/preprocessing/flag_duplicate_reads.py +149 -0
  55. smftools/preprocessing/invert_adata.py +18 -11
  56. smftools/preprocessing/load_sample_sheet.py +30 -16
  57. smftools/preprocessing/recipes.py +22 -20
  58. smftools/preprocessing/subsample_adata.py +58 -0
  59. smftools/readwrite.py +105 -13
  60. smftools/tools/__init__.py +49 -0
  61. smftools/tools/apply_hmm.py +202 -0
  62. smftools/tools/apply_hmm_batched.py +241 -0
  63. smftools/tools/archived/classify_methylated_features.py +66 -0
  64. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  65. smftools/tools/archived/subset_adata_v1.py +32 -0
  66. smftools/tools/archived/subset_adata_v2.py +46 -0
  67. smftools/tools/calculate_distances.py +18 -0
  68. smftools/tools/calculate_umap.py +62 -0
  69. smftools/tools/call_hmm_peaks.py +105 -0
  70. smftools/tools/classifiers.py +787 -0
  71. smftools/tools/cluster_adata_on_methylation.py +105 -0
  72. smftools/tools/data/__init__.py +2 -0
  73. smftools/tools/data/anndata_data_module.py +90 -0
  74. smftools/tools/data/preprocessing.py +6 -0
  75. smftools/tools/display_hmm.py +18 -0
  76. smftools/tools/general_tools.py +69 -0
  77. smftools/tools/hmm_readwrite.py +16 -0
  78. smftools/tools/inference/__init__.py +1 -0
  79. smftools/tools/inference/lightning_inference.py +41 -0
  80. smftools/tools/models/__init__.py +9 -0
  81. smftools/tools/models/base.py +14 -0
  82. smftools/tools/models/cnn.py +34 -0
  83. smftools/tools/models/lightning_base.py +41 -0
  84. smftools/tools/models/mlp.py +17 -0
  85. smftools/tools/models/positional.py +17 -0
  86. smftools/tools/models/rnn.py +16 -0
  87. smftools/tools/models/sklearn_models.py +40 -0
  88. smftools/tools/models/transformer.py +133 -0
  89. smftools/tools/models/wrappers.py +20 -0
  90. smftools/tools/nucleosome_hmm_refinement.py +104 -0
  91. smftools/tools/position_stats.py +239 -0
  92. smftools/tools/read_stats.py +70 -0
  93. smftools/tools/subset_adata.py +19 -23
  94. smftools/tools/train_hmm.py +78 -0
  95. smftools/tools/training/__init__.py +1 -0
  96. smftools/tools/training/train_lightning_model.py +47 -0
  97. smftools/tools/utils/__init__.py +2 -0
  98. smftools/tools/utils/device.py +10 -0
  99. smftools/tools/utils/grl.py +14 -0
  100. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/METADATA +47 -11
  101. smftools-0.1.7.dist-info/RECORD +136 -0
  102. smftools/tools/apply_HMM.py +0 -1
  103. smftools/tools/read_HMM.py +0 -1
  104. smftools/tools/train_HMM.py +0 -43
  105. smftools-0.1.3.dist-info/RECORD +0 -84
  106. /smftools/preprocessing/{remove_duplicates.py → archives/remove_duplicates.py} +0 -0
  107. /smftools/tools/{cluster.py → evaluation/__init__.py} +0 -0
  108. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
  109. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
# extract_read_lengths_from_bed

def extract_read_lengths_from_bed(file_path):
    """
    Load a dict mapping read names to read lengths from a BED file.

    Params:
        file_path (str): file path to a BED file with tab-separated columns
            chrom, start, end, length, name. Lines starting with '#' are ignored.
    Returns:
        read_dict (dict): maps read name -> read length. If a name appears more
            than once, the last occurrence wins (same as the original loop).
    """
    import pandas as pd
    columns = ['chrom', 'start', 'end', 'length', 'name']
    df = pd.read_csv(file_path, sep='\t', header=None, names=columns, comment='#')
    # Vectorized dict construction replaces the original iterrows() loop,
    # which read chrom/start/end into unused locals on every row.
    return dict(zip(df['name'], df['length']))
def find_conversion_sites(fasta_file, modification_type, conversion_types):
    """
    Finds genomic coordinates of modified bases (5mC or 6mA) in a reference FASTA file.

    Parameters:
        fasta_file (str): Path to the converted reference FASTA.
        modification_type (str): Modification type ('5mC' or '6mA') or the unconverted type.
        conversion_types (list): List of conversion types. The first element is the
            unconverted record type.

    Returns:
        dict: Keyed by the ids of records whose name contains the unconverted tag.
              Values contain:
              [sequence length, top strand coordinates, bottom strand coordinates,
               sequence, complement sequence].

    Raises:
        ValueError: If modification_type is not '5mC', '6mA', or the unconverted type.
    """
    import numpy as np
    from Bio import SeqIO

    unconverted = conversion_types[0]

    # Base searched on the top strand and its partner base on the bottom strand.
    base_mappings = {
        '5mC': ('C', 'G'),  # Cytosine and Guanine
        '6mA': ('A', 'T')   # Adenine and Thymine
    }

    # Validate up front. The previous version only raised once the first
    # matching record was reached, so a FASTA without matching records
    # silently returned {} for an invalid modification_type.
    if modification_type != unconverted and modification_type not in base_mappings:
        raise ValueError(f"Invalid modification_type: {modification_type}. Choose '5mC', '6mA', or 'unconverted'.")

    record_dict = {}

    with open(fasta_file, "r") as f:
        for record in SeqIO.parse(f, "fasta"):
            # Only records carrying the unconverted tag in their id are indexed.
            if unconverted not in record.id:
                continue

            sequence = str(record.seq).upper()
            complement = str(record.seq.complement()).upper()
            sequence_length = len(sequence)

            if modification_type == unconverted:
                # Unconverted case: store the full sequence without coordinate filtering.
                record_dict[record.id] = [sequence_length, [], [], sequence, complement]
            else:
                # Converted case: extract 0-indexed positions of the modified base
                # on each strand with a vectorized comparison.
                top_base, bottom_base = base_mappings[modification_type]
                seq_array = np.array(list(sequence))
                top_strand_coordinates = np.where(seq_array == top_base)[0].tolist()
                bottom_strand_coordinates = np.where(seq_array == bottom_base)[0].tolist()
                record_dict[record.id] = [sequence_length, top_strand_coordinates,
                                          bottom_strand_coordinates, sequence, complement]

    return record_dict
import numpy as np
import gzip
import os
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from concurrent.futures import ProcessPoolExecutor
from itertools import chain, islice


def convert_FASTA_record(record, modification_type, strand, unconverted):
    """
    Convert a FASTA record based on modification type and strand.

    Parameters:
        record (SeqRecord): input FASTA record.
        modification_type (str): '5mC', '6mA', or the unconverted label.
        strand (str): 'top' or 'bottom'.
        unconverted (str): label used for the unconverted modification type.

    Returns:
        SeqRecord: converted record with id '<id>_<modification_type>_<strand>'.

    Raises:
        ValueError: for an unknown (modification_type, strand) combination.
    """
    # (base to replace, replacement base) for each modification/strand pair.
    conversion_maps = {
        ('5mC', 'top'): ('C', 'T'),
        ('5mC', 'bottom'): ('G', 'A'),
        ('6mA', 'top'): ('A', 'G'),
        ('6mA', 'bottom'): ('T', 'C')
    }

    sequence = str(record.seq).upper()

    # The unconverted reference sequence is emitted unchanged, tagged as 'top'.
    if modification_type == unconverted:
        return SeqRecord(Seq(sequence), id=f"{record.id}_{modification_type}_top", description=record.description)

    if (modification_type, strand) not in conversion_maps:
        raise ValueError(f"Invalid combination: {modification_type}, {strand}")

    original_base, converted_base = conversion_maps[(modification_type, strand)]
    new_seq = sequence.replace(original_base, converted_base)

    return SeqRecord(Seq(new_seq), id=f"{record.id}_{modification_type}_{strand}", description=record.description)


def process_fasta_record(args):
    """
    Process a single FASTA record (top-level function so it is picklable
    for ProcessPoolExecutor workers).

    Args:
        args (tuple): (record, modification_types, strands, unconverted)
    Returns:
        list of modified SeqRecord objects.
    """
    record, modification_types, strands, unconverted = args
    modified_records = []

    for modification_type in modification_types:
        for i, strand in enumerate(strands):
            if i > 0 and modification_type == unconverted:
                continue  # Ensure unconverted is added only once

            modified_records.append(convert_FASTA_record(record, modification_type, strand, unconverted))

    return modified_records


def generate_converted_FASTA(input_fasta, modification_types, strands, output_fasta, num_threads=4, chunk_size=500):
    """
    Converts an input FASTA file and writes a new converted FASTA file efficiently.

    Parameters:
        input_fasta (str): Path to the unconverted FASTA file ('.gz' supported).
        modification_types (list): List of modification types ('5mC', '6mA', or
            unconverted); the first element is the unconverted label.
        strands (list): List of strands ('top', 'bottom').
        output_fasta (str): Path to the converted FASTA output file.
        num_threads (int): Number of parallel worker processes to use.
        chunk_size (int): Number of input records submitted and written per batch.

    Returns:
        None (Writes the converted FASTA file).
    """
    unconverted = modification_types[0]

    # Transparent gzip support based on the file extension.
    open_func = gzip.open if input_fasta.endswith('.gz') else open
    file_mode = 'rt' if input_fasta.endswith('.gz') else 'r'

    def fasta_record_generator():
        """Lazily yield FASTA records from file."""
        with open_func(input_fasta, file_mode) as handle:
            yield from SeqIO.parse(handle, 'fasta')

    with open(output_fasta, 'w') as output_handle, ProcessPoolExecutor(max_workers=num_threads) as executor:
        records = fasta_record_generator()
        while True:
            # Submit bounded batches: Executor.map consumes its whole input
            # iterable up front, so mapping over the raw generator would queue
            # every record in memory and defeat the lazy reader.
            batch = list(islice(records, chunk_size))
            if not batch:
                break
            results = executor.map(
                process_fasta_record,
                [(record, modification_types, strands, unconverted) for record in batch]
            )
            # executor.map preserves input order, so output order matches
            # a purely sequential run.
            SeqIO.write(chain.from_iterable(results), output_handle, 'fasta')
@@ -1,17 +1,21 @@
1
1
  ## modcall
2
2
 
3
3
  # Direct methylation specific
4
- def modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix):
4
+ def modcall(model_dir, model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
5
5
  """
6
6
  Wrapper function for dorado modified base calling.
7
7
 
8
8
  Parameters:
9
- model (str): a string representing the file path to the dorado basecalling model.
9
+ model_dir (str): a string representing the file path to the dorado basecalling model directory.
10
+ model (str): a string representing the dorado basecalling model.
10
11
  pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
11
12
  barcode_kit (str): A string representing the barcoding kit used in the experiment.
12
13
  mod_list (list): A list of modification types to use in the analysis.
13
14
  bam (str): File path to the BAM file to output.
14
15
  bam_suffix (str): The suffix to use for the BAM file.
16
+ barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
17
+ trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
18
+ device (str): Device to use for basecalling. auto, metal, cpu, cuda.
15
19
 
16
20
  Returns:
17
21
  None
@@ -19,10 +23,14 @@ def modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix):
19
23
  """
20
24
  import subprocess
21
25
  output = bam + bam_suffix
22
- command = [
23
- "dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y",
24
- "--modified-bases"]
26
+ command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--modified-bases"]
25
27
  command += mod_list
28
+ command += ["--device", device, "--batchsize", "0"]
29
+ if barcode_both_ends:
30
+ command.append("--barcode-both-ends")
31
+ if not trim:
32
+ command.append("--no-trim")
33
+ command += [model, pod5_dir]
26
34
  print(f'Running: {" ".join(command)}')
27
35
  with open(output, "w") as outfile:
28
36
  subprocess.run(command, stdout=outfile)