smftools 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. smftools/_settings.py +3 -2
  2. smftools/_version.py +1 -1
  3. smftools/datasets/F1_sample_sheet.csv +5 -0
  4. smftools/datasets/datasets.py +8 -7
  5. smftools/informatics/__init__.py +7 -5
  6. smftools/informatics/{bam_conversion.py → archived/bam_conversion.py} +16 -4
  7. smftools/informatics/{bam_direct.py → archived/bam_direct.py} +22 -8
  8. smftools/informatics/archived/basecalls_to_adata.py +71 -0
  9. smftools/informatics/conversion_smf.py +79 -0
  10. smftools/informatics/direct_smf.py +89 -0
  11. smftools/informatics/fast5_to_pod5.py +8 -6
  12. smftools/informatics/helpers/__init__.py +18 -0
  13. smftools/informatics/helpers/align_and_sort_BAM.py +9 -13
  14. smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
  15. smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  16. smftools/informatics/helpers/binarize_converted_base_identities.py +2 -2
  17. smftools/informatics/helpers/canoncall.py +2 -0
  18. smftools/informatics/helpers/complement_base_list.py +21 -0
  19. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
  20. smftools/informatics/helpers/converted_BAM_to_adata.py +161 -92
  21. smftools/informatics/helpers/count_aligned_reads.py +13 -9
  22. smftools/informatics/helpers/extract_base_identities.py +34 -20
  23. smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  24. smftools/informatics/helpers/find_conversion_sites.py +11 -9
  25. smftools/informatics/helpers/generate_converted_FASTA.py +33 -14
  26. smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  27. smftools/informatics/helpers/index_fasta.py +12 -0
  28. smftools/informatics/helpers/modcall.py +3 -1
  29. smftools/informatics/helpers/modkit_extract_to_adata.py +467 -316
  30. smftools/informatics/helpers/ohe_batching.py +52 -0
  31. smftools/informatics/helpers/one_hot_encode.py +10 -8
  32. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
  33. smftools/informatics/helpers/separate_bam_by_bc.py +4 -2
  34. smftools/informatics/helpers/split_and_index_BAM.py +16 -4
  35. smftools/informatics/load_adata.py +127 -0
  36. smftools/informatics/subsample_fasta_from_bed.py +47 -0
  37. smftools/informatics/subsample_pod5.py +69 -13
  38. smftools/preprocessing/__init__.py +6 -1
  39. smftools/preprocessing/append_C_context.py +37 -14
  40. smftools/preprocessing/calculate_complexity.py +2 -2
  41. smftools/preprocessing/calculate_consensus.py +47 -0
  42. smftools/preprocessing/calculate_converted_read_methylation_stats.py +60 -9
  43. smftools/preprocessing/calculate_coverage.py +2 -2
  44. smftools/preprocessing/calculate_pairwise_hamming_distances.py +1 -1
  45. smftools/preprocessing/calculate_read_length_stats.py +56 -2
  46. smftools/preprocessing/clean_NaN.py +2 -2
  47. smftools/preprocessing/filter_converted_reads_on_methylation.py +4 -2
  48. smftools/preprocessing/filter_reads_on_length.py +4 -2
  49. smftools/preprocessing/invert_adata.py +1 -0
  50. smftools/preprocessing/load_sample_sheet.py +24 -0
  51. smftools/preprocessing/make_dirs.py +21 -0
  52. smftools/preprocessing/mark_duplicates.py +34 -19
  53. smftools/preprocessing/recipes.py +125 -0
  54. smftools/preprocessing/remove_duplicates.py +7 -4
  55. smftools/tools/apply_HMM.py +1 -0
  56. smftools/tools/cluster.py +0 -0
  57. smftools/tools/read_HMM.py +1 -0
  58. smftools/tools/subset_adata.py +32 -0
  59. smftools/tools/train_HMM.py +43 -0
  60. {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/METADATA +13 -7
  61. smftools-0.1.3.dist-info/RECORD +84 -0
  62. smftools/informatics/basecalls_to_adata.py +0 -42
  63. smftools/informatics/pod5_conversion.py +0 -53
  64. smftools/informatics/pod5_direct.py +0 -55
  65. smftools/informatics/pod5_to_adata.py +0 -40
  66. smftools-0.1.1.dist-info/RECORD +0 -64
  67. {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/WHEEL +0 -0
  68. {smftools-0.1.1.dist-info → smftools-0.1.3.dist-info}/licenses/LICENSE +0 -0
@@ -57,23 +57,42 @@ def generate_converted_FASTA(input_fasta, modification_types, strands, output_fa
57
57
  from Bio import SeqIO
58
58
  from Bio.SeqRecord import SeqRecord
59
59
  from Bio.Seq import Seq
60
+ import gzip
60
61
  modified_records = []
61
62
  unconverted = modification_types[0]
62
63
  # Iterate over each record in the input FASTA
63
- for record in SeqIO.parse(input_fasta, 'fasta'):
64
- record_description = record.description
65
- # Iterate over each modification type of interest
66
- for modification_type in modification_types:
67
- # Iterate over the strands of interest
68
- for i, strand in enumerate(strands):
69
- if i > 0 and modification_type == unconverted: # This ensures that the unconverted is only added once.
70
- pass
71
- else:
72
- # Add the modified record to the list of modified records
73
- print(f'converting {modification_type} on the {strand} strand of record {record}')
74
- new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
75
- new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
76
- modified_records.append(new_record)
64
+ if '.gz' in input_fasta:
65
+ with gzip.open(input_fasta, 'rt') as handle:
66
+ for record in SeqIO.parse(handle, 'fasta'):
67
+ record_description = record.description
68
+ # Iterate over each modification type of interest
69
+ for modification_type in modification_types:
70
+ # Iterate over the strands of interest
71
+ for i, strand in enumerate(strands):
72
+ if i > 0 and modification_type == unconverted: # This ensures that the unconverted is only added once.
73
+ pass
74
+ else:
75
+ # Add the modified record to the list of modified records
76
+ print(f'converting {modification_type} on the {strand} strand of record {record}')
77
+ new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
78
+ new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
79
+ modified_records.append(new_record)
80
+ else:
81
+ for record in SeqIO.parse(input_fasta, 'fasta'):
82
+ record_description = record.description
83
+ # Iterate over each modification type of interest
84
+ for modification_type in modification_types:
85
+ # Iterate over the strands of interest
86
+ for i, strand in enumerate(strands):
87
+ if i > 0 and modification_type == unconverted: # This ensures that the unconverted is only added once.
88
+ pass
89
+ else:
90
+ # Add the modified record to the list of modified records
91
+ print(f'converting {modification_type} on the {strand} strand of record {record}')
92
+ new_seq, new_id = convert_FASTA_record(record, modification_type, strand, unconverted)
93
+ new_record = SeqRecord(Seq(new_seq), id=new_id, description=record_description)
94
+ modified_records.append(new_record)
95
+
77
96
  with open(output_fasta, 'w') as output_handle:
78
97
  # write out the concatenated FASTA file of modified sequences
79
98
  SeqIO.write(modified_records, output_handle, 'fasta')
@@ -0,0 +1,32 @@
1
+ # get_chromosome_lengths
2
+
3
def get_chromosome_lengths(fasta):
    """
    Generates a file containing chromosome lengths within an input FASTA.

    The lengths are taken from the FASTA index (``<fasta>.fai``); the index is
    created with ``index_fasta`` when it does not exist yet. The first two
    tab-separated columns of every index line are written to a
    ``.chrom.sizes`` file placed next to the FASTA. If that file already
    exists it is reused and left untouched.

    Parameters:
        fasta (str): Path to the input fasta

    Returns:
        None

    Raises:
        OSError: If the FASTA index cannot be read (e.g. indexing failed) or
            the chromosome length file cannot be written.
    """
    import os

    # Make a fasta index file if one isn't already available
    index_path = f'{fasta}.fai'
    if os.path.exists(index_path):
        print(f'Using existing fasta index file: {index_path}')
    else:
        # Imported only when indexing is actually required.
        from .index_fasta import index_fasta
        index_fasta(fasta)

    parent_dir = os.path.dirname(fasta)
    fasta_basename = os.path.basename(fasta)
    # The output name is everything before the first '.fa' in the basename
    # (covers .fa, .fasta, .fa.gz, ...). Kept as-is so existing
    # .chrom.sizes files are still found.
    chrom_basename = fasta_basename.split('.fa')[0] + '.chrom.sizes'
    chrom_path = os.path.join(parent_dir, chrom_basename)

    # Make a chromosome length file
    if os.path.exists(chrom_path):
        print(f'Using existing chrom length index file: {chrom_path}')
        return

    # Read the whole index BEFORE creating the output file. If the index is
    # missing or unreadable, no empty .chrom.sizes file is left behind to be
    # mistaken for a valid one on the next call.
    with open(index_path, 'r') as index_handle:
        size_lines = [
            '\t'.join(line.rstrip('\n').split('\t')[:2]) + '\n'
            for line in index_handle
        ]

    # Same output as `cut -f1,2 <index>`, without needing an external binary.
    with open(chrom_path, 'w') as outfile:
        outfile.writelines(size_lines)
@@ -0,0 +1,12 @@
1
+ # index_fasta
2
+
3
def index_fasta(fasta):
    """
    Generate a FASTA index file for an input fasta.

    Runs ``samtools faidx`` on the given path.

    Parameters:
        fasta (str): Path to the input fasta to make an index file for.

    Returns:
        None

    Raises:
        FileNotFoundError: If the ``samtools`` executable cannot be found.
        subprocess.CalledProcessError: If samtools exits with a non-zero status.
    """
    import subprocess

    # check=True surfaces a failed indexing run immediately instead of letting
    # callers continue without an index.
    subprocess.run(["samtools", "faidx", fasta], check=True)
@@ -21,6 +21,8 @@ def modcall(model, pod5_dir, barcode_kit, mod_list, bam, bam_suffix):
21
21
  output = bam + bam_suffix
22
22
  command = [
23
23
  "dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y",
24
- "--modified-bases", ",".join(mod_list)] # Join MOD_LIST elements with commas
24
+ "--modified-bases"]
25
+ command += mod_list
26
+ print(f'Running: {" ".join(command)}')
25
27
  with open(output, "w") as outfile:
26
28
  subprocess.run(command, stdout=outfile)