smftools 0.1.3__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as published.
Files changed (109)
  1. smftools/__init__.py +5 -1
  2. smftools/_version.py +1 -1
  3. smftools/informatics/__init__.py +2 -0
  4. smftools/informatics/archived/print_bam_query_seq.py +29 -0
  5. smftools/informatics/basecall_pod5s.py +80 -0
  6. smftools/informatics/conversion_smf.py +63 -10
  7. smftools/informatics/direct_smf.py +66 -18
  8. smftools/informatics/helpers/LoadExperimentConfig.py +1 -0
  9. smftools/informatics/helpers/__init__.py +16 -2
  10. smftools/informatics/helpers/align_and_sort_BAM.py +27 -16
  11. smftools/informatics/helpers/aligned_BAM_to_bed.py +49 -48
  12. smftools/informatics/helpers/bam_qc.py +66 -0
  13. smftools/informatics/helpers/binarize_converted_base_identities.py +69 -21
  14. smftools/informatics/helpers/canoncall.py +12 -3
  15. smftools/informatics/helpers/concatenate_fastqs_to_bam.py +5 -4
  16. smftools/informatics/helpers/converted_BAM_to_adata.py +34 -22
  17. smftools/informatics/helpers/converted_BAM_to_adata_II.py +369 -0
  18. smftools/informatics/helpers/demux_and_index_BAM.py +52 -0
  19. smftools/informatics/helpers/extract_base_identities.py +33 -46
  20. smftools/informatics/helpers/extract_mods.py +55 -23
  21. smftools/informatics/helpers/extract_read_features_from_bam.py +31 -0
  22. smftools/informatics/helpers/extract_read_lengths_from_bed.py +25 -0
  23. smftools/informatics/helpers/find_conversion_sites.py +33 -44
  24. smftools/informatics/helpers/generate_converted_FASTA.py +87 -86
  25. smftools/informatics/helpers/modcall.py +13 -5
  26. smftools/informatics/helpers/modkit_extract_to_adata.py +762 -396
  27. smftools/informatics/helpers/ohe_batching.py +65 -41
  28. smftools/informatics/helpers/ohe_layers_decode.py +32 -0
  29. smftools/informatics/helpers/one_hot_decode.py +27 -0
  30. smftools/informatics/helpers/one_hot_encode.py +45 -9
  31. smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +1 -0
  32. smftools/informatics/helpers/run_multiqc.py +28 -0
  33. smftools/informatics/helpers/split_and_index_BAM.py +3 -8
  34. smftools/informatics/load_adata.py +58 -3
  35. smftools/plotting/__init__.py +15 -0
  36. smftools/plotting/classifiers.py +355 -0
  37. smftools/plotting/general_plotting.py +205 -0
  38. smftools/plotting/position_stats.py +462 -0
  39. smftools/preprocessing/__init__.py +6 -7
  40. smftools/preprocessing/append_C_context.py +22 -9
  41. smftools/preprocessing/{mark_duplicates.py → archives/mark_duplicates.py} +38 -26
  42. smftools/preprocessing/binarize_on_Youden.py +35 -32
  43. smftools/preprocessing/binary_layers_to_ohe.py +13 -3
  44. smftools/preprocessing/calculate_complexity.py +3 -2
  45. smftools/preprocessing/calculate_converted_read_methylation_stats.py +44 -46
  46. smftools/preprocessing/calculate_coverage.py +26 -25
  47. smftools/preprocessing/calculate_pairwise_differences.py +49 -0
  48. smftools/preprocessing/calculate_position_Youden.py +18 -7
  49. smftools/preprocessing/calculate_read_length_stats.py +39 -46
  50. smftools/preprocessing/clean_NaN.py +33 -25
  51. smftools/preprocessing/filter_adata_by_nan_proportion.py +31 -0
  52. smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -5
  53. smftools/preprocessing/filter_reads_on_length.py +14 -4
  54. smftools/preprocessing/flag_duplicate_reads.py +149 -0
  55. smftools/preprocessing/invert_adata.py +18 -11
  56. smftools/preprocessing/load_sample_sheet.py +30 -16
  57. smftools/preprocessing/recipes.py +22 -20
  58. smftools/preprocessing/subsample_adata.py +58 -0
  59. smftools/readwrite.py +105 -13
  60. smftools/tools/__init__.py +49 -0
  61. smftools/tools/apply_hmm.py +202 -0
  62. smftools/tools/apply_hmm_batched.py +241 -0
  63. smftools/tools/archived/classify_methylated_features.py +66 -0
  64. smftools/tools/archived/classify_non_methylated_features.py +75 -0
  65. smftools/tools/archived/subset_adata_v1.py +32 -0
  66. smftools/tools/archived/subset_adata_v2.py +46 -0
  67. smftools/tools/calculate_distances.py +18 -0
  68. smftools/tools/calculate_umap.py +62 -0
  69. smftools/tools/call_hmm_peaks.py +105 -0
  70. smftools/tools/classifiers.py +787 -0
  71. smftools/tools/cluster_adata_on_methylation.py +105 -0
  72. smftools/tools/data/__init__.py +2 -0
  73. smftools/tools/data/anndata_data_module.py +90 -0
  74. smftools/tools/data/preprocessing.py +6 -0
  75. smftools/tools/display_hmm.py +18 -0
  76. smftools/tools/general_tools.py +69 -0
  77. smftools/tools/hmm_readwrite.py +16 -0
  78. smftools/tools/inference/__init__.py +1 -0
  79. smftools/tools/inference/lightning_inference.py +41 -0
  80. smftools/tools/models/__init__.py +9 -0
  81. smftools/tools/models/base.py +14 -0
  82. smftools/tools/models/cnn.py +34 -0
  83. smftools/tools/models/lightning_base.py +41 -0
  84. smftools/tools/models/mlp.py +17 -0
  85. smftools/tools/models/positional.py +17 -0
  86. smftools/tools/models/rnn.py +16 -0
  87. smftools/tools/models/sklearn_models.py +40 -0
  88. smftools/tools/models/transformer.py +133 -0
  89. smftools/tools/models/wrappers.py +20 -0
  90. smftools/tools/nucleosome_hmm_refinement.py +104 -0
  91. smftools/tools/position_stats.py +239 -0
  92. smftools/tools/read_stats.py +70 -0
  93. smftools/tools/subset_adata.py +19 -23
  94. smftools/tools/train_hmm.py +78 -0
  95. smftools/tools/training/__init__.py +1 -0
  96. smftools/tools/training/train_lightning_model.py +47 -0
  97. smftools/tools/utils/__init__.py +2 -0
  98. smftools/tools/utils/device.py +10 -0
  99. smftools/tools/utils/grl.py +14 -0
  100. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/METADATA +47 -11
  101. smftools-0.1.7.dist-info/RECORD +136 -0
  102. smftools/tools/apply_HMM.py +0 -1
  103. smftools/tools/read_HMM.py +0 -1
  104. smftools/tools/train_HMM.py +0 -43
  105. smftools-0.1.3.dist-info/RECORD +0 -84
  106. /smftools/preprocessing/{remove_duplicates.py → archives/remove_duplicates.py} +0 -0
  107. /smftools/tools/{cluster.py → evaluation/__init__.py} +0 -0
  108. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/WHEEL +0 -0
  109. {smftools-0.1.3.dist-info → smftools-0.1.7.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/helpers/aligned_BAM_to_bed.py
@@ -1,73 +1,74 @@
-# aligned_BAM_to_bed
-
-def aligned_BAM_to_bed(aligned_BAM, plotting_dir, bed_dir, fasta):
+def aligned_BAM_to_bed(aligned_BAM, out_dir, fasta, make_bigwigs, threads=None):
     """
-    Takes an aligned BAM as input and writes a bed file of reads as output.
-    Bed columns are: Record name, start position, end position, read length, read name
+    Takes an aligned BAM as input and writes a BED file of reads as output.
+    Bed columns are: Record name, start position, end position, read length, read name.
 
     Parameters:
         aligned_BAM (str): Path to an input aligned_BAM to extract to a BED file.
-        plotting_dir (str): Path to write out read alignment length and coverage histograms
-        bed_dir (str): Path to write out read alignment coordinates
-        fasta (str): File path to the reference genome to align to.
+        out_dir (str): Directory to output files.
+        fasta (str): File path to the reference genome.
+        make_bigwigs (bool): Whether to generate bigwig files.
+        threads (int): Number of threads to use.
 
     Returns:
         None
-
     """
     import subprocess
     import os
+    import concurrent.futures
+    from concurrent.futures import ProcessPoolExecutor
     from .bed_to_bigwig import bed_to_bigwig
+    from . import make_dirs
     from .plot_read_length_and_coverage_histograms import plot_read_length_and_coverage_histograms
 
-    bed_output_basename = os.path.basename(aligned_BAM).split('.bam')[0] + '_bed.bed'
-    bed_output = os.path.join(bed_dir, bed_output_basename)
+    threads = threads or os.cpu_count()  # Use max available cores if not specified
+
+    # Create necessary directories
+    plotting_dir = os.path.join(out_dir, "bed_cov_histograms")
+    bed_dir = os.path.join(out_dir, "beds")
+    make_dirs([plotting_dir, bed_dir])
 
-    samtools_view = subprocess.Popen(["samtools", "view", aligned_BAM], stdout=subprocess.PIPE)
+    bed_output = os.path.join(bed_dir, os.path.basename(aligned_BAM).replace(".bam", "_bed.bed"))
+
+    print(f"Creating BED from BAM: {aligned_BAM} using {threads} threads...")
+
+    # Convert BAM to BED format
     with open(bed_output, "w") as output_file:
-        awk_process = subprocess.Popen(["awk", '{print $3 "\t" $4 "\t" $4+length($10)-1 "\t" length($10)-1 "\t" $1}'], stdin=samtools_view.stdout, stdout=output_file)
+        samtools_view = subprocess.Popen(["samtools", "view", "-@", str(threads), aligned_BAM], stdout=subprocess.PIPE)
+        awk_process = subprocess.Popen(
+            ["awk", '{print $3 "\t" $4 "\t" $4+length($10)-1 "\t" length($10)-1 "\t" $1}'],
+            stdin=samtools_view.stdout,
+            stdout=output_file
+        )
+
     samtools_view.stdout.close()
     awk_process.wait()
     samtools_view.wait()
 
-    def split_bed(bed, delete_input=True):
-        """
-        Reads in a BED file and splits it into two separate BED files based on alignment status.
-
-        Parameters:
-            bed (str): Path to the input BED file.
-            delete_input (bool): Whether to delete the input bed file
-
-        Returns:
-            aligned (str): Path to the aligned bed file
-        """
-        unaligned = bed.split('.bed')[0] + '_unaligned.bed'
-        aligned = bed.split('.bed')[0] + '_aligned.bed'
-
-        with open(bed, 'r') as infile, \
-            open(unaligned, 'w') as unaligned_outfile, \
-            open(aligned, 'w') as aligned_outfile:
-
+    print(f"BED file created: {bed_output}")
+
+    def split_bed(bed):
+        """Splits BED into aligned and unaligned reads."""
+        aligned = bed.replace(".bed", "_aligned.bed")
+        unaligned = bed.replace(".bed", "_unaligned.bed")
+
+        with open(bed, "r") as infile, open(aligned, "w") as aligned_out, open(unaligned, "w") as unaligned_out:
             for line in infile:
-                fields = line.strip().split('\t')
-
-                if fields[0] == '*':
-                    unaligned_outfile.write(line)
-                else:
-                    aligned_outfile.write(line)
-
-        if delete_input:
-            os.remove(bed)
-
-        return aligned
-
-    aligned_bed = split_bed(bed_output)
+                (unaligned_out if line.startswith("*") else aligned_out).write(line)
 
-    # Write out basic plots of reference coverage and read lengths
-    plot_read_length_and_coverage_histograms(aligned_bed, plotting_dir)
+        os.remove(bed)
+        return aligned
 
-    # Make a bedgraph and bigwig for the aligned reads
-    bed_to_bigwig(fasta, aligned_bed)
+    print(f"Splitting BED: {bed_output}")
+    aligned_bed = split_bed(bed_output)
 
+    with ProcessPoolExecutor() as executor:  # Use processes instead of threads
+        futures = []
+        futures.append(executor.submit(plot_read_length_and_coverage_histograms, aligned_bed, plotting_dir))
+        if make_bigwigs:
+            futures.append(executor.submit(bed_to_bigwig, fasta, aligned_bed))
 
+        # Wait for all tasks to complete
+        concurrent.futures.wait(futures)
 
+    print("Processing completed successfully.")
smftools/informatics/helpers/bam_qc.py
@@ -0,0 +1,66 @@
+## bam_qc
+
+def bam_qc(bam_files, bam_qc_dir, threads, modality, stats=True, flagstats=True, idxstats=True):
+    """
+    Performs QC on BAM files by running samtools stats, flagstat, and idxstats.
+
+    Parameters:
+        - bam_files: List of BAM file paths.
+        - bam_qc_dir: Directory to save QC reports.
+        - threads: Number threads to use.
+        - modality: 'conversion' or 'direct' (affects processing mode).
+        - stats: Run `samtools stats` if True.
+        - flagstats: Run `samtools flagstat` if True.
+        - idxstats: Run `samtools idxstats` if True.
+    """
+    import os
+    import subprocess
+
+    # Ensure the QC output directory exists
+    os.makedirs(bam_qc_dir, exist_ok=True)
+
+    if threads:
+        threads = str(threads)
+    else:
+        pass
+
+    for bam in bam_files:
+        bam_name = os.path.basename(bam).replace(".bam", "")  # Extract filename without extension
+
+        # Run samtools QC commands based on selected options
+        if stats:
+            stats_out = os.path.join(bam_qc_dir, f"{bam_name}_stats.txt")
+            if threads:
+                command = ["samtools", "stats", "-@", threads, bam]
+            else:
+                command = ["samtools", "stats", bam]
+            print(f"Running: {' '.join(command)} > {stats_out}")
+            with open(stats_out, "w") as out_file:
+                subprocess.run(command, stdout=out_file)
+
+        if flagstats:
+            flagstats_out = os.path.join(bam_qc_dir, f"{bam_name}_flagstat.txt")
+            if threads:
+                command = ["samtools", "flagstat", "-@", threads, bam]
+            else:
+                command = ["samtools", "flagstat", bam]
+            print(f"Running: {' '.join(command)} > {flagstats_out}")
+            with open(flagstats_out, "w") as out_file:
+                subprocess.run(command, stdout=out_file)
+
+        if idxstats:
+            idxstats_out = os.path.join(bam_qc_dir, f"{bam_name}_idxstats.txt")
+            if threads:
+                command = ["samtools", "idxstats", "-@", threads, bam]
+            else:
+                command = ["samtools", "idxstats", bam]
+            print(f"Running: {' '.join(command)} > {idxstats_out}")
+            with open(idxstats_out, "w") as out_file:
+                subprocess.run(command, stdout=out_file)
+
+    if modality == 'conversion':
+        pass
+    elif modality == 'direct':
+        pass
+
+    print("QC processing completed.")
smftools/informatics/helpers/binarize_converted_base_identities.py
@@ -1,31 +1,79 @@
-## binarize_converted_base_identities
-# Conversion SMF specific
-def binarize_converted_base_identities(base_identities, strand, modification_type):
+def binarize_converted_base_identities(base_identities, strand, modification_type, bam, device='cpu'):
     """
-    Binarizes conversion SMF data within a sequence string
+    Efficiently binarizes conversion SMF data within a sequence string using NumPy arrays.
 
     Parameters:
         base_identities (dict): A dictionary returned by extract_base_identities. Keyed by read name. Points to a list of base identities.
         strand (str): A string indicating which strand was converted in the experiment (options are 'top' and 'bottom').
         modification_type (str): A string indicating the modification type of interest (options are '5mC' and '6mA').
-
+        bam (str): The bam file path
+
     Returns:
-        binarized_base_identities (dict): A binarized dictionary, where 1 represents a methylated site. 0 represents an unmethylated site. NaN represents a site that does not carry methylation information.
+        dict: A dictionary where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site without methylation info.
     """
     import numpy as np
+
+    # If the modification type is 'unconverted', return NaN for all positions
+    if modification_type == "unconverted":
+        #print(f"Skipping binarization for unconverted {strand} reads on bam: {bam}.")
+        return {key: np.full(len(bases), np.nan) for key, bases in base_identities.items()}
+
+    # Define mappings for binarization based on strand and modification type
+    binarization_maps = {
+        ('top', '5mC'): {'C': 1, 'T': 0},
+        ('top', '6mA'): {'A': 1, 'G': 0},
+        ('bottom', '5mC'): {'G': 1, 'A': 0},
+        ('bottom', '6mA'): {'T': 1, 'C': 0}
+    }
+
+    if (strand, modification_type) not in binarization_maps:
+        raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
+
+    # Fetch the appropriate mapping
+    base_map = binarization_maps[(strand, modification_type)]
+
     binarized_base_identities = {}
-    # Iterate over base identity keys to binarize the base identities
-    for key in base_identities.keys():
-        if strand == 'top':
-            if modification_type == '5mC':
-                binarized_base_identities[key] = [1 if x == 'C' else 0 if x == 'T' else np.nan for x in base_identities[key]]
-            elif modification_type == '6mA':
-                binarized_base_identities[key] = [1 if x == 'A' else 0 if x == 'G' else np.nan for x in base_identities[key]]
-        elif strand == 'bottom':
-            if modification_type == '5mC':
-                binarized_base_identities[key] = [1 if x == 'G' else 0 if x == 'A' else np.nan for x in base_identities[key]]
-            elif modification_type == '6mA':
-                binarized_base_identities[key] = [1 if x == 'T' else 0 if x == 'C' else np.nan for x in base_identities[key]]
-        else:
-            print(f"{strand} not recognized")
-    return binarized_base_identities
+    for key, bases in base_identities.items():
+        arr = np.array(bases, dtype='<U1')
+        binarized = np.vectorize(lambda x: base_map.get(x, np.nan))(arr)  # Apply mapping with fallback to NaN
+        binarized_base_identities[key] = binarized
+
+    return binarized_base_identities
+    # import torch
+
+    # # If the modification type is 'unconverted', return NaN for all positions
+    # if modification_type == "unconverted":
+    #     print(f"Skipping binarization for unconverted {strand} reads on bam: {bam}.")
+    #     return {key: torch.full((len(bases),), float('nan'), device=device) for key, bases in base_identities.items()}
+
+    # # Define mappings for binarization based on strand and modification type
+    # binarization_maps = {
+    #     ('top', '5mC'): {'C': 1, 'T': 0},
+    #     ('top', '6mA'): {'A': 1, 'G': 0},
+    #     ('bottom', '5mC'): {'G': 1, 'A': 0},
+    #     ('bottom', '6mA'): {'T': 1, 'C': 0}
+    # }
+
+    # if (strand, modification_type) not in binarization_maps:
+    #     raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
+
+    # # Fetch the appropriate mapping
+    # base_map = binarization_maps[(strand, modification_type)]
+
+    # # Convert mapping to tensor
+    # base_keys = list(base_map.keys())
+    # base_values = torch.tensor(list(base_map.values()), dtype=torch.float32, device=device)
+
+    # # Create a lookup dictionary (ASCII-based for fast mapping)
+    # lookup_table = torch.full((256,), float('nan'), dtype=torch.float32, device=device)
+    # for k, v in zip(base_keys, base_values):
+    #     lookup_table[ord(k)] = v
+
+    # # Process reads
+    # binarized_base_identities = {}
+    # for key, bases in base_identities.items():
+    #     bases_tensor = torch.tensor([ord(c) for c in bases], dtype=torch.uint8, device=device)  # Convert chars to ASCII
+    #     binarized = lookup_table[bases_tensor]  # Efficient lookup
+    #     binarized_base_identities[key] = binarized
+
+    # return binarized_base_identities
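To make the lookup table concrete, here is a self-contained sketch of the ('top', '5mC') case that re-implements the mapping outside the package (toy read; a float map is used so NaN fits the output array):

    import numpy as np

    base_map = {'C': 1.0, 'T': 0.0}  # top-strand 5mC: retained C = methylated, converted T = unmethylated
    bases = np.array(list("CCTGA"), dtype='<U1')
    binarized = np.array([base_map.get(b, np.nan) for b in bases])
    print(binarized)  # [ 1.  1.  0. nan nan] -- A and G carry no 5mC signal on this strand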
smftools/informatics/helpers/canoncall.py
@@ -1,16 +1,20 @@
 ## canoncall
 
 # Conversion SMF specific
-def canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix):
+def canoncall(model_dir, model, pod5_dir, barcode_kit, bam, bam_suffix, barcode_both_ends=True, trim=False, device='auto'):
     """
     Wrapper function for dorado canonical base calling.
 
     Parameters:
-        model (str): a string representing the file path to the dorado basecalling model.
+        model_dir (str): a string representing the file path to the dorado basecalling model directory.
+        model (str): a string representing the the dorado basecalling model.
         pod5_dir (str): a string representing the file path to the experiment directory containing the POD5 files.
         barcode_kit (str): A string reppresenting the barcoding kit used in the experiment.
         bam (str): File path to the BAM file to output.
         bam_suffix (str): The suffix to use for the BAM file.
+        barcode_both_ends (bool): Whether to require a barcode detection on both ends for demultiplexing.
+        trim (bool): Whether to trim barcodes, adapters, and primers from read ends.
+        device (str): The device to use. 'auto' is default, which can detect device to use. Can also specify metal, cpu, cuda.
 
     Returns:
         None
@@ -18,7 +22,12 @@ def canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix):
     """
     import subprocess
     output = bam + bam_suffix
-    command = ["dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y"]
+    command = ["dorado", "basecaller", "--models-directory", model_dir, "--kit-name", barcode_kit, "--device", device, "--batchsize", "0"]
+    if barcode_both_ends:
+        command.append("--barcode-both-ends")
+    if not trim:
+        command.append("--no-trim")
+    command += [model, pod5_dir]
     command_string = " ".join(command)
     print(f"Running {command_string}\n to generate {output}")
     with open(output, "w") as outfile:
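With the new defaults (barcode_both_ends=True, trim=False), the list assembled above comes out roughly as follows before being handed to subprocess (model name, kit, and paths are placeholder values):

    command = ["dorado", "basecaller",
               "--models-directory", "/path/to/models",
               "--kit-name", "SQK-NBD114-24",
               "--device", "auto", "--batchsize", "0",
               "--barcode-both-ends", "--no-trim",
               "hac", "/path/to/pod5s"]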
smftools/informatics/helpers/concatenate_fastqs_to_bam.py
@@ -36,7 +36,9 @@ def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_su
                 barcode = base_name.split('_')[-1].replace('.fq', '')
             else:
                 raise ValueError(f"Unexpected file extension for {fastq_file}. Only .fq, .fastq, .fq{gzip_suffix}, and .fastq{gzip_suffix} are supported.")
-
+        else:
+            barcode = 'barcode0'
+
         # Read the FASTQ file (handle gzipped and non-gzipped files)
         open_func = gzip.open if fastq_file.endswith(gzip_suffix) else open
         with open_func(fastq_file, 'rt') as fq_in:
@@ -47,8 +49,7 @@ def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_su
                 aln.query_sequence = str(record.seq)
                 aln.flag = 4  # Unmapped
                 aln.query_qualities = pysam.qualitystring_to_array(record.letter_annotations["phred_quality"])
-                if n_fastqs > 1:
-                    # Add the barcode to the BC tag
-                    aln.set_tag(barcode_tag, barcode)
+                # Add the barcode to the BC tag
+                aln.set_tag(barcode_tag, barcode)
                 # Write to BAM file
                 bam_out.write(aln)
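After this change every record carries the barcode tag, with 'barcode0' as the fallback for a single un-barcoded FASTQ. A quick way to spot-check the resulting unmapped BAM (sketch; the output path is a placeholder):

    import pysam

    with pysam.AlignmentFile("concatenated.bam", "rb", check_sq=False) as bam_in:
        for aln in bam_in:
            print(aln.query_name, aln.get_tag("BC"))
            break  # inspect the first record only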
smftools/informatics/helpers/converted_BAM_to_adata.py
@@ -13,7 +13,7 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
         bam_suffix (str): The suffix to use for the BAM file.
 
     Returns:
-        None
+        final_adata_path (str): File path to the final adata object
     Outputs a single gzipped adata object for the experiment.
     """
     from .. import readwrite
@@ -36,7 +36,14 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
     files = os.listdir(split_dir)
     # Make output dir
     parent_dir = os.path.dirname(split_dir)
+    split_dir_base = os.path.basename(split_dir)
     h5_dir = os.path.join(parent_dir, 'h5ads')
+    final_adata_path = os.path.join(h5_dir, f'{experiment_name}_{split_dir_base}.h5ad')
+
+    if os.path.exists(f"{final_adata_path}.gz"):
+        print(f'{final_adata_path}.gz already exists, using existing adata object')  # Stops here if the final_adata file already exists
+        return final_adata_path
+
     tmp_dir = os.path.join(parent_dir, 'tmp')
     make_dirs([h5_dir, tmp_dir])
     # Filter file names that contain the search string in their filename and keep them in a list
@@ -57,7 +64,8 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
     record_FASTA_dict = {}
     # While populating the dictionary, also extract the longest sequence record in the input references
     max_reference_length = 0
-    for conversion_type in conversion_types:
+    conversions = conversion_types[1:]
+    for conversion_type in conversions:
         # Points to a list containing: 1) sequence length of the record, 2) top strand coordinate list, 3) bottom strand coorinate list, 4) sequence string unconverted , 5) Complement sequence unconverted
         modification_dict[conversion_type] = find_conversion_sites(converted_FASTA, conversion_type, conversion_types)
         # Get the max reference length
@@ -132,10 +140,11 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
         adata.obs_names = binarized_base_identities_df.index.astype(str)
         adata.var_names = binarized_base_identities_df.columns.astype(str)
         adata.obs['Sample'] = [sample] * len(adata)
+        adata.obs['Reference'] = [chromosome] * len(adata)
         adata.obs['Strand'] = [strand] * len(adata)
         adata.obs['Dataset'] = [mod_type] * len(adata)
-        adata.obs['Reference'] = [record] * len(adata)
-        adata.obs['Reference_chromosome'] = [chromosome] * len(adata)
+        adata.obs['Reference_dataset_strand'] = [f'{chromosome}_{mod_type}_{strand}'] * len(adata)
+        adata.obs['Reference_strand'] = [f'{record}'] * len(adata)
 
         read_mapping_direction = []
         for read_id in adata.obs_names:
@@ -162,15 +171,16 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
         del tmp_ohe_dict
 
         read_names = list(one_hot_reads.keys())
-        dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
 
         sequence_length = one_hot_reads[read_names[0]].reshape(n_rows_OHE, -1).shape[1]
-        df_A = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-        df_C = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-        df_G = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-        df_T = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
-        df_N = pd.DataFrame(0, index=sorted_index, columns=range(sequence_length))
+        df_A = np.zeros((len(sorted_index), sequence_length), dtype=int)
+        df_C = np.zeros((len(sorted_index), sequence_length), dtype=int)
+        df_G = np.zeros((len(sorted_index), sequence_length), dtype=int)
+        df_T = np.zeros((len(sorted_index), sequence_length), dtype=int)
+        df_N = np.zeros((len(sorted_index), sequence_length), dtype=int)
 
+        # Process one-hot data into dictionaries
+        dict_A, dict_C, dict_G, dict_T, dict_N = {}, {}, {}, {}, {}
         for read_name, one_hot_array in one_hot_reads.items():
             one_hot_array = one_hot_array.reshape(n_rows_OHE, -1)
             dict_A[read_name] = one_hot_array[0, :]
@@ -182,21 +192,22 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
         del one_hot_reads
         gc.collect()
 
-        for j, read_name in tqdm(enumerate(sorted_index), desc='Loading dataframes of OHE reads', total=len(sorted_index)):
-            df_A.iloc[j] = dict_A[read_name]
-            df_C.iloc[j] = dict_C[read_name]
-            df_G.iloc[j] = dict_G[read_name]
-            df_T.iloc[j] = dict_T[read_name]
-            df_N.iloc[j] = dict_N[read_name]
+        # Fill the arrays
+        for j, read_name in tqdm(enumerate(sorted_index), desc='Loading arrays of OHE reads', total=len(sorted_index)):
+            df_A[j, :] = dict_A[read_name]
+            df_C[j, :] = dict_C[read_name]
+            df_G[j, :] = dict_G[read_name]
+            df_T[j, :] = dict_T[read_name]
+            df_N[j, :] = dict_N[read_name]
 
         del dict_A, dict_C, dict_G, dict_T, dict_N
        gc.collect()
 
+        # Store the results in AnnData layers
         ohe_df_map = {0: df_A, 1: df_C, 2: df_G, 3: df_T, 4: df_N}
-
         for j, base in enumerate(['A', 'C', 'G', 'T', 'N']):
-            adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j].values
-            ohe_df_map[j] = None  # Reassign pointer for memory usage purposes
+            adata.layers[f'{base}_binary_encoding'] = ohe_df_map[j]
+            ohe_df_map[j] = None  # Reassign pointer for memory usage purposes
 
         if final_adata:
             if adata.shape[0] > 0:
@@ -223,11 +234,12 @@ def converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experi
             chromosome = record_FASTA_dict[unconverted_record_name][2]
             final_adata.var[f'{chromosome}_unconverted_top_strand_FASTA_base'] = list(sequence)
             final_adata.var[f'{chromosome}_unconverted_bottom_strand_FASTA_base'] = list(complement)
-            final_adata.uns[f'{record}_FASTA_sequence'] = sequence
+            final_adata.uns[f'{chromosome}_FASTA_sequence'] = sequence
 
     ######################################################################################################
 
     ######################################################################################################
     ## Export the final adata object
-    final_output = os.path.join(h5_dir, f'{readwrite.date_string()}_{experiment_name}.h5ad.gz')
-    final_adata.write_h5ad(final_output, compression='gzip')
+    print('Saving initial draft of final adata')
+    final_adata.write_h5ad(final_adata_path)
+    return final_adata_path
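Since converted_BAM_to_adata now returns the h5ad path (and returns early when a gzipped copy already exists), callers can reload the draft object directly; a sketch assuming anndata is installed (the call's full argument list is elided here, as in the hunk header above):

    import anndata as ad

    final_adata_path = converted_BAM_to_adata(...)  # arguments as documented above
    adata = ad.read_h5ad(final_adata_path)
    print(adata.obs[['Sample', 'Reference', 'Strand', 'Dataset']].head())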