smftools 0.1.7__py3-none-any.whl → 0.2.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- smftools/__init__.py +7 -6
- smftools/_version.py +1 -1
- smftools/cli/cli_flows.py +94 -0
- smftools/cli/hmm_adata.py +338 -0
- smftools/cli/load_adata.py +577 -0
- smftools/cli/preprocess_adata.py +363 -0
- smftools/cli/spatial_adata.py +564 -0
- smftools/cli_entry.py +435 -0
- smftools/config/__init__.py +1 -0
- smftools/config/conversion.yaml +38 -0
- smftools/config/deaminase.yaml +61 -0
- smftools/config/default.yaml +264 -0
- smftools/config/direct.yaml +41 -0
- smftools/config/discover_input_files.py +115 -0
- smftools/config/experiment_config.py +1288 -0
- smftools/hmm/HMM.py +1576 -0
- smftools/hmm/__init__.py +20 -0
- smftools/{tools → hmm}/apply_hmm_batched.py +8 -7
- smftools/hmm/call_hmm_peaks.py +106 -0
- smftools/{tools → hmm}/display_hmm.py +3 -3
- smftools/{tools → hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools/{tools → hmm}/train_hmm.py +1 -1
- smftools/informatics/__init__.py +13 -9
- smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
- smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- smftools/informatics/{helpers → archived/helpers/archived}/count_aligned_reads.py +2 -2
- smftools/informatics/{helpers → archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- smftools/informatics/{helpers → archived/helpers/archived}/extract_base_identities.py +30 -4
- smftools/informatics/{helpers → archived/helpers/archived}/extract_mods.py +15 -13
- smftools/informatics/{helpers → archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
- smftools/informatics/{helpers → archived/helpers/archived}/find_conversion_sites.py +5 -4
- smftools/informatics/{helpers → archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- smftools/informatics/{helpers → archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- smftools/informatics/{helpers → archived/helpers/archived}/make_modbed.py +1 -2
- smftools/informatics/{helpers → archived/helpers/archived}/modQC.py +2 -2
- smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
- smftools/informatics/{helpers → archived/helpers/archived}/separate_bam_by_bc.py +8 -7
- smftools/informatics/{helpers → archived/helpers/archived}/split_and_index_BAM.py +8 -12
- smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools/informatics/bam_functions.py +812 -0
- smftools/informatics/basecalling.py +67 -0
- smftools/informatics/bed_functions.py +366 -0
- smftools/informatics/binarize_converted_base_identities.py +172 -0
- smftools/informatics/{helpers/converted_BAM_to_adata_II.py → converted_BAM_to_adata.py} +198 -50
- smftools/informatics/fasta_functions.py +255 -0
- smftools/informatics/h5ad_functions.py +197 -0
- smftools/informatics/{helpers/modkit_extract_to_adata.py → modkit_extract_to_adata.py} +147 -61
- smftools/informatics/modkit_functions.py +129 -0
- smftools/informatics/ohe.py +160 -0
- smftools/informatics/pod5_functions.py +224 -0
- smftools/informatics/{helpers/run_multiqc.py → run_multiqc.py} +5 -2
- smftools/machine_learning/__init__.py +12 -0
- smftools/machine_learning/data/__init__.py +2 -0
- smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools/machine_learning/inference/__init__.py +3 -0
- smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools/machine_learning/models/base.py +295 -0
- smftools/machine_learning/models/cnn.py +138 -0
- smftools/machine_learning/models/lightning_base.py +345 -0
- smftools/machine_learning/models/mlp.py +26 -0
- smftools/{tools → machine_learning}/models/positional.py +3 -2
- smftools/{tools → machine_learning}/models/rnn.py +2 -1
- smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools/machine_learning/models/transformer.py +303 -0
- smftools/machine_learning/training/__init__.py +2 -0
- smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools/machine_learning/training/train_sklearn_model.py +114 -0
- smftools/plotting/__init__.py +4 -1
- smftools/plotting/autocorrelation_plotting.py +609 -0
- smftools/plotting/general_plotting.py +1292 -140
- smftools/plotting/hmm_plotting.py +260 -0
- smftools/plotting/qc_plotting.py +270 -0
- smftools/preprocessing/__init__.py +15 -8
- smftools/preprocessing/add_read_length_and_mapping_qc.py +129 -0
- smftools/preprocessing/append_base_context.py +122 -0
- smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools/preprocessing/binarize.py +17 -0
- smftools/preprocessing/binarize_on_Youden.py +2 -2
- smftools/preprocessing/calculate_complexity_II.py +248 -0
- smftools/preprocessing/calculate_coverage.py +10 -1
- smftools/preprocessing/calculate_position_Youden.py +1 -1
- smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- smftools/preprocessing/clean_NaN.py +17 -1
- smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools/preprocessing/flag_duplicate_reads.py +1326 -124
- smftools/preprocessing/invert_adata.py +12 -5
- smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools/readwrite.py +1021 -89
- smftools/tools/__init__.py +3 -32
- smftools/tools/calculate_umap.py +5 -5
- smftools/tools/general_tools.py +3 -3
- smftools/tools/position_stats.py +468 -106
- smftools/tools/read_stats.py +115 -1
- smftools/tools/spatial_autocorrelation.py +562 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/METADATA +14 -9
- smftools-0.2.3.dist-info/RECORD +173 -0
- smftools-0.2.3.dist-info/entry_points.txt +2 -0
- smftools/informatics/fast5_to_pod5.py +0 -21
- smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools/informatics/helpers/__init__.py +0 -74
- smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
- smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
- smftools/informatics/helpers/bam_qc.py +0 -66
- smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
- smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
- smftools/informatics/helpers/index_fasta.py +0 -12
- smftools/informatics/helpers/make_dirs.py +0 -21
- smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools/informatics/load_adata.py +0 -182
- smftools/informatics/readwrite.py +0 -106
- smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools/preprocessing/append_C_context.py +0 -82
- smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools/tools/call_hmm_peaks.py +0 -105
- smftools/tools/data/__init__.py +0 -2
- smftools/tools/data/anndata_data_module.py +0 -90
- smftools/tools/inference/__init__.py +0 -1
- smftools/tools/inference/lightning_inference.py +0 -41
- smftools/tools/models/base.py +0 -14
- smftools/tools/models/cnn.py +0 -34
- smftools/tools/models/lightning_base.py +0 -41
- smftools/tools/models/mlp.py +0 -17
- smftools/tools/models/sklearn_models.py +0 -40
- smftools/tools/models/transformer.py +0 -133
- smftools/tools/training/__init__.py +0 -1
- smftools/tools/training/train_lightning_model.py +0 -47
- smftools-0.1.7.dist-info/RECORD +0 -136
- /smftools/{tools/evaluation → cli}/__init__.py +0 -0
- /smftools/{tools → hmm}/calculate_distances.py +0 -0
- /smftools/{tools → hmm}/hmm_readwrite.py +0 -0
- /smftools/informatics/{basecall_pod5s.py → archived/basecall_pod5s.py} +0 -0
- /smftools/informatics/{conversion_smf.py → archived/conversion_smf.py} +0 -0
- /smftools/informatics/{direct_smf.py → archived/direct_smf.py} +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/canoncall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/get_native_references.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/informatics.py +0 -0
- /smftools/informatics/{helpers → archived/helpers}/archived/load_adata.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/modcall.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_batching.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/ohe_layers_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_decode.py +0 -0
- /smftools/informatics/{helpers → archived/helpers/archived}/one_hot_encode.py +0 -0
- /smftools/informatics/{subsample_pod5.py → archived/subsample_pod5.py} +0 -0
- /smftools/informatics/{helpers/complement_base_list.py → complement_base_list.py} +0 -0
- /smftools/{tools → machine_learning}/data/preprocessing.py +0 -0
- /smftools/{tools → machine_learning}/models/__init__.py +0 -0
- /smftools/{tools → machine_learning}/models/wrappers.py +0 -0
- /smftools/{tools → machine_learning}/utils/__init__.py +0 -0
- /smftools/{tools → machine_learning}/utils/device.py +0 -0
- /smftools/{tools → machine_learning}/utils/grl.py +0 -0
- /smftools/tools/{apply_hmm.py → archived/apply_hmm.py} +0 -0
- /smftools/tools/{classifiers.py → archived/classifiers.py} +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/WHEEL +0 -0
- {smftools-0.1.7.dist-info → smftools-0.2.3.dist-info}/licenses/LICENSE +0 -0
smftools/informatics/helpers/bed_to_bigwig.py (removed)

```diff
@@ -1,39 +0,0 @@
-# bed_to_bigwig
-
-def bed_to_bigwig(fasta, bed):
-    """
-    Takes a bed file of reads and makes a bedgraph plus a bigwig
-
-    Parameters:
-        fasta (str): File path to the reference genome to align to.
-        bed (str): File path to the input bed.
-    Returns:
-        None
-    """
-    import os
-    import subprocess
-
-    bed_basename = os.path.basename(bed)
-    parent_dir = os.path.dirname(bed)
-    bed_basename_minus_suffix = bed_basename.split('.bed')[0]
-    fasta_basename = os.path.basename(fasta)
-    fasta_dir = os.path.dirname(fasta)
-    fasta_basename_minus_suffix = fasta_basename.split('.fa')[0]
-    chrom_basename = fasta_basename_minus_suffix + '.chrom.sizes'
-    chrom_path = os.path.join(fasta_dir, chrom_basename)
-    bedgraph_basename = bed_basename_minus_suffix + '_bedgraph.bedgraph'
-    bedgraph_output = os.path.join(parent_dir, bedgraph_basename)
-    bigwig_basename = bed_basename_minus_suffix + '_bigwig.bw'
-    bigwig_output = os.path.join(parent_dir, bigwig_basename)
-
-    # Make the bedgraph
-    with open(bedgraph_output, 'w') as outfile:
-        # Command as a list
-        command = ["bedtools", "genomecov", "-i", bed, "-g", chrom_path, "-bg"]
-        print(f'Making bedgraph from {bed_basename}')
-        subprocess.run(command, stdout=outfile)
-
-    # Make the bigwig
-    command = ["bedGraphToBigWig", bedgraph_output, chrom_path, bigwig_output]
-    print(f'Making bigwig from {bedgraph_basename}')
-    subprocess.run(command)
```
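For orientation, the removed helper simply chained two external tools. A minimal standalone sketch of the same pipeline (assuming `bedtools` and the UCSC `bedGraphToBigWig` binary are on `PATH` and a matching `<reference>.chrom.sizes` file already exists; file names here are hypothetical):

```python
import subprocess

bed = "reads.bed"                      # hypothetical input BED of aligned reads
chrom_sizes = "reference.chrom.sizes"  # chromosome sizes for the same reference FASTA
bedgraph = "reads_bedgraph.bedgraph"
bigwig = "reads_bigwig.bw"

# Coverage bedgraph: bedtools genomecov writes to stdout, so capture it into a file.
# Note that genomecov expects the BED grouped by chromosome.
with open(bedgraph, "w") as out:
    subprocess.run(["bedtools", "genomecov", "-i", bed, "-g", chrom_sizes, "-bg"],
                   stdout=out, check=True)

# Convert the bedgraph into a binary bigwig track.
subprocess.run(["bedGraphToBigWig", bedgraph, chrom_sizes, bigwig], check=True)
```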
smftools/informatics/helpers/binarize_converted_base_identities.py (removed)

```diff
@@ -1,79 +0,0 @@
-def binarize_converted_base_identities(base_identities, strand, modification_type, bam, device='cpu'):
-    """
-    Efficiently binarizes conversion SMF data within a sequence string using NumPy arrays.
-
-    Parameters:
-        base_identities (dict): A dictionary returned by extract_base_identities. Keyed by read name. Points to a list of base identities.
-        strand (str): A string indicating which strand was converted in the experiment (options are 'top' and 'bottom').
-        modification_type (str): A string indicating the modification type of interest (options are '5mC' and '6mA').
-        bam (str): The bam file path
-
-    Returns:
-        dict: A dictionary where 1 represents a methylated site, 0 represents an unmethylated site, and NaN represents a site without methylation info.
-    """
-    import numpy as np
-
-    # If the modification type is 'unconverted', return NaN for all positions
-    if modification_type == "unconverted":
-        #print(f"Skipping binarization for unconverted {strand} reads on bam: {bam}.")
-        return {key: np.full(len(bases), np.nan) for key, bases in base_identities.items()}
-
-    # Define mappings for binarization based on strand and modification type
-    binarization_maps = {
-        ('top', '5mC'): {'C': 1, 'T': 0},
-        ('top', '6mA'): {'A': 1, 'G': 0},
-        ('bottom', '5mC'): {'G': 1, 'A': 0},
-        ('bottom', '6mA'): {'T': 1, 'C': 0}
-    }
-
-    if (strand, modification_type) not in binarization_maps:
-        raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
-
-    # Fetch the appropriate mapping
-    base_map = binarization_maps[(strand, modification_type)]
-
-    binarized_base_identities = {}
-    for key, bases in base_identities.items():
-        arr = np.array(bases, dtype='<U1')
-        binarized = np.vectorize(lambda x: base_map.get(x, np.nan))(arr)  # Apply mapping with fallback to NaN
-        binarized_base_identities[key] = binarized
-
-    return binarized_base_identities
-    # import torch
-
-    # # If the modification type is 'unconverted', return NaN for all positions
-    # if modification_type == "unconverted":
-    #     print(f"Skipping binarization for unconverted {strand} reads on bam: {bam}.")
-    #     return {key: torch.full((len(bases),), float('nan'), device=device) for key, bases in base_identities.items()}
-
-    # # Define mappings for binarization based on strand and modification type
-    # binarization_maps = {
-    #     ('top', '5mC'): {'C': 1, 'T': 0},
-    #     ('top', '6mA'): {'A': 1, 'G': 0},
-    #     ('bottom', '5mC'): {'G': 1, 'A': 0},
-    #     ('bottom', '6mA'): {'T': 1, 'C': 0}
-    # }
-
-    # if (strand, modification_type) not in binarization_maps:
-    #     raise ValueError(f"Invalid combination of strand='{strand}' and modification_type='{modification_type}'")
-
-    # # Fetch the appropriate mapping
-    # base_map = binarization_maps[(strand, modification_type)]
-
-    # # Convert mapping to tensor
-    # base_keys = list(base_map.keys())
-    # base_values = torch.tensor(list(base_map.values()), dtype=torch.float32, device=device)
-
-    # # Create a lookup dictionary (ASCII-based for fast mapping)
-    # lookup_table = torch.full((256,), float('nan'), dtype=torch.float32, device=device)
-    # for k, v in zip(base_keys, base_values):
-    #     lookup_table[ord(k)] = v
-
-    # # Process reads
-    # binarized_base_identities = {}
-    # for key, bases in base_identities.items():
-    #     bases_tensor = torch.tensor([ord(c) for c in bases], dtype=torch.uint8, device=device)  # Convert chars to ASCII
-    #     binarized = lookup_table[bases_tensor]  # Efficient lookup
-    #     binarized_base_identities[key] = binarized
-
-    # return binarized_base_identities
```
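A minimal usage sketch of the removed function (read name and bases are made up). With `strand='top'` and `modification_type='5mC'`, an unconverted C maps to 1, a converted T maps to 0, and any other base falls back to NaN:

```python
# Assumes binarize_converted_base_identities as defined above is in scope.
base_identities = {
    "read_001": ["A", "C", "T", "C", "G"],  # hypothetical per-position base calls
}
binarized = binarize_converted_base_identities(
    base_identities, strand="top", modification_type="5mC", bam="sample.bam"
)
# binarized["read_001"] -> array([nan, 1., 0., 1., nan])
```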
smftools/informatics/helpers/concatenate_fastqs_to_bam.py (removed)

```diff
@@ -1,55 +0,0 @@
-# concatenate_fastqs_to_bam
-
-def concatenate_fastqs_to_bam(fastq_files, output_bam, barcode_tag='BC', gzip_suffix='.gz'):
-    """
-    Concatenate multiple demultiplexed FASTQ (.fastq or .fq) files into an unaligned BAM and add the FASTQ barcode suffix to the BC tag.
-
-    Parameters:
-        fastq_files (list): List of paths to demultiplexed FASTQ files.
-        output_bam (str): Path to the output BAM file.
-        barcode_tag (str): The SAM tag for storing the barcode (default: 'BC').
-        gzip_suffix (str): Suffix to use for input gzip files (Defaul: '.gz')
-
-    Returns:
-        None
-    """
-    import os
-    import pysam
-    import gzip
-    from Bio import SeqIO
-    from tqdm import tqdm
-
-    n_fastqs = len(fastq_files)
-
-    with pysam.AlignmentFile(output_bam, "wb", header={"HD": {"VN": "1.0"}, "SQ": []}) as bam_out:
-        for fastq_file in tqdm(fastq_files, desc="Processing FASTQ files"):
-            # Extract barcode from the FASTQ filename (handles .fq, .fastq, .fq.gz, and .fastq.gz extensions)
-            base_name = os.path.basename(fastq_file)
-            if n_fastqs > 1:
-                if base_name.endswith('.fastq.gz'):
-                    barcode = base_name.split('_')[-1].replace(f'.fastq{gzip_suffix}', '')
-                elif base_name.endswith('.fq.gz'):
-                    barcode = base_name.split('_')[-1].replace(f'.fq{gzip_suffix}', '')
-                elif base_name.endswith('.fastq'):
-                    barcode = base_name.split('_')[-1].replace('.fastq', '')
-                elif base_name.endswith('.fq'):
-                    barcode = base_name.split('_')[-1].replace('.fq', '')
-                else:
-                    raise ValueError(f"Unexpected file extension for {fastq_file}. Only .fq, .fastq, .fq{gzip_suffix}, and .fastq{gzip_suffix} are supported.")
-            else:
-                barcode = 'barcode0'
-
-            # Read the FASTQ file (handle gzipped and non-gzipped files)
-            open_func = gzip.open if fastq_file.endswith(gzip_suffix) else open
-            with open_func(fastq_file, 'rt') as fq_in:
-                for record in SeqIO.parse(fq_in, 'fastq'):
-                    # Create an unaligned BAM entry for each FASTQ record
-                    aln = pysam.AlignedSegment()
-                    aln.query_name = record.id
-                    aln.query_sequence = str(record.seq)
-                    aln.flag = 4  # Unmapped
-                    aln.query_qualities = pysam.qualitystring_to_array(record.letter_annotations["phred_quality"])
-                    # Add the barcode to the BC tag
-                    aln.set_tag(barcode_tag, barcode)
-                    # Write to BAM file
-                    bam_out.write(aln)
```
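A usage sketch with hypothetical file names. Because the barcode is parsed from the text after the last underscore, `run1_barcode01.fastq.gz` tags every record from that file with `barcode01`:

```python
fastqs = ["run1_barcode01.fastq.gz", "run1_barcode02.fastq.gz"]
concatenate_fastqs_to_bam(fastqs, "unaligned_reads.bam")
# With a single input FASTQ, every read instead gets the placeholder 'barcode0'.
```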
smftools/informatics/helpers/index_fasta.py (removed)

```diff
@@ -1,12 +0,0 @@
-# index_fasta
-
-def index_fasta(fasta):
-    """
-    Generate a FASTA index file for an input fasta.
-
-    Parameters:
-        fasta (str): Path to the input fasta to make an index file for.
-    """
-    import subprocess
-
-    subprocess.run(["samtools", "faidx", fasta])
```
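Usage is a one-liner; `samtools faidx` writes the index next to the input (path hypothetical):

```python
index_fasta("reference.fa")  # creates reference.fa.fai alongside the FASTA
```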
smftools/informatics/helpers/make_dirs.py (removed)

```diff
@@ -1,21 +0,0 @@
-## make_dirs
-
-# General
-def make_dirs(directories):
-    """
-    Takes a list of file paths and makes new directories if the directory does not already exist.
-
-    Parameters:
-        directories (list): A list of directories to make
-
-    Returns:
-        None
-    """
-    import os
-
-    for directory in directories:
-        if not os.path.isdir(directory):
-            os.mkdir(directory)
-            print(f"Directory '{directory}' created successfully.")
-        else:
-            print(f"Directory '{directory}' already exists.")
```
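A behavioral caveat: `os.mkdir` creates only the leaf directory and raises `FileNotFoundError` when parents are missing. A sketch of an equivalent helper built on `os.makedirs` (an alternative, not what the package ships) sidesteps that:

```python
import os

def make_dirs_nested(directories):
    """Create each directory, including missing parents; no-op if it exists."""
    for directory in directories:
        # exist_ok=True makes the call idempotent; makedirs creates parents as needed.
        os.makedirs(directory, exist_ok=True)
```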
smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py (removed)

```diff
@@ -1,53 +0,0 @@
-# plot_read_length_and_coverage_histograms
-
-def plot_read_length_and_coverage_histograms(bed_file, plotting_directory):
-    """
-    Plots read length and coverage statistics for each record.
-
-    Parameters:
-        bed_file (str): Path to the bed file to derive read lengths and coverage from.
-        plot_directory (str): Path to the directory to write out historgrams.
-
-    Returns:
-        None
-    """
-    import pandas as pd
-    import matplotlib.pyplot as plt
-    import numpy as np
-    import os
-
-    bed_basename = os.path.basename(bed_file).split('.bed')[0]
-    # Load the BED file into a DataFrame
-    print(f"Loading BED to plot read length and coverage histograms: {bed_file}")
-    df = pd.read_csv(bed_file, sep='\t', header=None, names=['chromosome', 'start', 'end', 'length', 'read_name'])
-
-    # Group by chromosome
-    grouped = df.groupby('chromosome')
-
-    for chrom, group in grouped:
-        # Plot read length histogram
-        plt.figure(figsize=(12, 6))
-        plt.hist(group['length'], bins=50, edgecolor='k', alpha=0.7)
-        plt.title(f'Read Length Histogram of reads aligned to {chrom}')
-        plt.xlabel('Read Length')
-        plt.ylabel('Count')
-        plt.grid(True)
-        save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_read_length_histogram.png')
-        plt.savefig(save_name)
-        plt.close()
-
-        # Compute coverage
-        coverage = np.zeros(group['end'].max())
-        for _, row in group.iterrows():
-            coverage[row['start']:row['end']] += 1
-
-        # Plot coverage histogram
-        plt.figure(figsize=(12, 6))
-        plt.plot(coverage, color='b')
-        plt.title(f'Coverage Histogram for {chrom}')
-        plt.xlabel('Position')
-        plt.ylabel('Coverage')
-        plt.grid(True)
-        save_name = os.path.join(plotting_directory, f'{bed_basename}_{chrom}_coverage_histogram.png')
-        plt.savefig(save_name)
-        plt.close()
```
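The function expects a headerless, tab-separated, five-column BED matching the `pd.read_csv` names above. A hypothetical input line and call:

```python
# reads.bed (tab-separated): chromosome  start  end  length  read_name
# chr1	1000	2500	1500	read_0001

plot_read_length_and_coverage_histograms("reads.bed", "qc_plots")
# Writes reads_<chrom>_read_length_histogram.png and
# reads_<chrom>_coverage_histogram.png into qc_plots/ for each chromosome.
```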
smftools/informatics/load_adata.py (removed)

```diff
@@ -1,182 +0,0 @@
-## load_adata
-
-def load_adata(config_path):
-    """
-    High-level function to call for converting raw sequencing data to an adata object.
-    Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
-    Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
-    Also works for illumina fastq and unaligned BAM for conversion SMF workflows.
-
-    Parameters:
-        config_path (str): A string representing the file path to the experiment configuration csv file.
-
-    Returns:
-        None
-    """
-    # Lazy importing of packages
-    from .helpers import LoadExperimentConfig, make_dirs, concatenate_fastqs_to_bam, extract_read_features_from_bam
-    from .fast5_to_pod5 import fast5_to_pod5
-    from .subsample_fasta_from_bed import subsample_fasta_from_bed
-    import os
-    import numpy as np
-    import anndata as ad
-    from pathlib import Path
-
-    # Default params
-    bam_suffix = '.bam'  # If different, change from here.
-    split_dir = 'demultiplexed_BAMs'  # If different, change from here.
-    strands = ['bottom', 'top']  # If different, change from here. Having both listed generally doesn't slow things down too much.
-    conversions = ['unconverted']  # The name to use for the unconverted files. If different, change from here.
-
-    # Load experiment config parameters into global variables
-    experiment_config = LoadExperimentConfig(config_path)
-    var_dict = experiment_config.var_dict
-
-    # These below variables will point to default_value if they are empty in the experiment_config.csv or if the variable is fully omitted from the csv.
-    default_value = None
-
-    # General config variable init
-    smf_modality = var_dict.get('smf_modality', default_value)  # needed for specifying if the data is conversion SMF or direct methylation detection SMF. Necessary.
-    input_data_path = var_dict.get('input_data_path', default_value)  # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
-    output_directory = var_dict.get('output_directory', default_value)  # Path to the output directory to make for the analysis. Necessary.
-    fasta = var_dict.get('fasta', default_value)  # Path to reference FASTA.
-    fasta_regions_of_interest = var_dict.get("fasta_regions_of_interest", default_value)  # Path to a bed file listing coordinate regions of interest within the FASTA to include. Optional.
-    mapping_threshold = var_dict.get('mapping_threshold', default_value)  # Minimum proportion of mapped reads that need to fall within a region to include in the final AnnData.
-    experiment_name = var_dict.get('experiment_name', default_value)  # A key term to add to the AnnData file name.
-    model_dir = var_dict.get('model_dir', default_value)  # needed for dorado basecaller
-    model = var_dict.get('model', default_value)  # needed for dorado basecaller
-    barcode_kit = var_dict.get('barcode_kit', default_value)  # needed for dorado basecaller
-    barcode_both_ends = var_dict.get('barcode_both_ends', default_value)  # dorado demultiplexing
-    trim = var_dict.get('trim', default_value)  # dorado adapter and barcode removal
-    input_already_demuxed = var_dict.get('input_already_demuxed', default_value)  # If the input files are already demultiplexed.
-    threads = var_dict.get('threads', default_value)  # number of cpu threads available for multiprocessing
-    # Conversion specific variable init
-    conversion_types = var_dict.get('conversion_types', default_value)
-    # Direct methylation specific variable init
-    filter_threshold = var_dict.get('filter_threshold', default_value)
-    m6A_threshold = var_dict.get('m6A_threshold', default_value)
-    m5C_threshold = var_dict.get('m5C_threshold', default_value)
-    hm5C_threshold = var_dict.get('hm5C_threshold', default_value)
-    thresholds = [filter_threshold, m6A_threshold, m5C_threshold, hm5C_threshold]
-    mod_list = var_dict.get('mod_list', default_value)
-    batch_size = var_dict.get('batch_size', default_value)
-    device = var_dict.get('device', 'auto')
-    make_bigwigs = var_dict.get('make_bigwigs', default_value)
-    skip_unclassified = var_dict.get('skip_unclassified', True)
-    delete_batch_hdfs = var_dict.get('delete_batch_hdfs', True)
-
-    # Make initial output directory
-    make_dirs([output_directory])
-    os.chdir(output_directory)
-    # Define the pathname to split BAMs into later during demultiplexing.
-    split_path = os.path.join(output_directory, split_dir)
-
-    # If fasta_regions_of_interest is passed, subsample the input FASTA on regions of interest and use the subsampled FASTA.
-    if fasta_regions_of_interest and '.bed' in fasta_regions_of_interest:
-        fasta_basename = os.path.basename(fasta).split('.fa')[0]
-        bed_basename_minus_suffix = os.path.basename(fasta_regions_of_interest).split('.bed')[0]
-        output_FASTA = fasta_basename + '_subsampled_by_' + bed_basename_minus_suffix + '.fasta'
-        subsample_fasta_from_bed(fasta, fasta_regions_of_interest, output_directory, output_FASTA)
-        fasta = os.path.join(output_directory, output_FASTA)
-
-    # If conversion_types is passed:
-    if conversion_types:
-        conversions += conversion_types
-
-    # Get the input filetype
-    if Path(input_data_path).is_file():
-        input_data_filetype = '.' + os.path.basename(input_data_path).split('.')[1].lower()
-        input_is_pod5 = input_data_filetype in ['.pod5','.p5']
-        input_is_fast5 = input_data_filetype in ['.fast5','.f5']
-        input_is_fastq = input_data_filetype in ['.fastq', '.fq']
-        input_is_bam = input_data_filetype == bam_suffix
-        if input_is_fastq:
-            fastq_paths = [input_data_path]
-    elif Path(input_data_path).is_dir():
-        # Get the file names in the input data dir
-        input_files = os.listdir(input_data_path)
-        input_is_pod5 = sum([True for file in input_files if '.pod5' in file or '.p5' in file])
-        input_is_fast5 = sum([True for file in input_files if '.fast5' in file or '.f5' in file])
-        input_is_fastq = sum([True for file in input_files if '.fastq' in file or '.fq' in file])
-        input_is_bam = sum([True for file in input_files if bam_suffix in file])
-        if input_is_fastq:
-            fastq_paths = [os.path.join(input_data_path, file) for file in input_files if '.fastq' in file or '.fq' in file]
-
-    # If the input files are not pod5 files, and they are fast5 files, convert the files to a pod5 file before proceeding.
-    if input_is_fast5 and not input_is_pod5:
-        # take the input directory of fast5 files and write out a single pod5 file into the output directory.
-        output_pod5 = os.path.join(output_directory, 'FAST5s_to_POD5.pod5')
-        print(f'Input directory contains fast5 files, converting them and concatenating into a single pod5 file in the {output_pod5}')
-        fast5_to_pod5(input_data_path, output_pod5)
-        # Reassign the pod5_dir variable to point to the new pod5 file.
-        input_data_path = output_pod5
-        input_is_pod5 = True
-        input_is_fast5 = False
-
-    elif input_is_fastq:
-        output_bam = os.path.join(output_directory, 'FASTQs_concatenated_into_BAM.bam')
-        concatenate_fastqs_to_bam(fastq_paths, output_bam, barcode_tag='BC', gzip_suffix='.gz')
-        input_data_path = output_bam
-        input_is_bam = True
-        input_is_fastq = False
-
-    if input_is_pod5:
-        basecall = True
-    elif input_is_bam:
-        basecall = False
-    else:
-        print('Error, can not find input bam or pod5')
-
-    if smf_modality == 'conversion':
-        from .conversion_smf import conversion_smf
-        final_adata, final_adata_path, sorted_output, bam_files = conversion_smf(fasta, output_directory, conversions, strands, model_dir, model, input_data_path, split_path
-        , barcode_kit, mapping_threshold, experiment_name, bam_suffix, basecall, barcode_both_ends, trim, device, make_bigwigs, threads, input_already_demuxed)
-    elif smf_modality == 'direct':
-        from .direct_smf import direct_smf
-        # need to add input_already_demuxed workflow here.
-        final_adata, final_adata_path, sorted_output, bam_files = direct_smf(fasta, output_directory, mod_list,model_dir, model, thresholds, input_data_path, split_path
-        , barcode_kit, mapping_threshold, experiment_name, bam_suffix, batch_size, basecall, barcode_both_ends, trim, device, make_bigwigs, skip_unclassified, delete_batch_hdfs, threads)
-    else:
-        print("Error")
-
-    # Read in the final adata object and append final metadata
-    #print(f'Reading in adata from {final_adata_path} to add final metadata')
-    # final_adata = ad.read_h5ad(final_adata_path)
-
-    # Adding read query length metadata to adata object.
-    read_metrics = {}
-    for bam_file in bam_files:
-        bam_read_metrics = extract_read_features_from_bam(bam_file)
-        read_metrics.update(bam_read_metrics)
-    #read_metrics = extract_read_features_from_bam(sorted_output)
-
-    query_read_length_values = []
-    query_read_quality_values = []
-    reference_lengths = []
-    # Iterate over each row of the AnnData object
-    for obs_name in final_adata.obs_names:
-        # Fetch the value from the dictionary using the obs_name as the key
-        value = read_metrics.get(obs_name, np.nan)  # Use np.nan if the key is not found
-        if type(value) is list:
-            query_read_length_values.append(value[0])
-            query_read_quality_values.append(value[1])
-            reference_lengths.append(value[2])
-        else:
-            query_read_length_values.append(value)
-            query_read_quality_values.append(value)
-            reference_lengths.append(value)
-
-    # Add the new column to adata.obs
-    final_adata.obs['query_read_length'] = query_read_length_values
-    final_adata.obs['query_read_quality'] = query_read_quality_values
-    final_adata.obs['query_length_to_reference_length_ratio'] = np.array(query_read_length_values) / np.array(reference_lengths)
-
-    final_adata.obs['Raw_methylation_signal'] = np.nansum(final_adata.X, axis=1)
-    final_adata.obs['Raw_per_base_methylation_average'] = final_adata.obs['Raw_methylation_signal'] / final_adata.obs['query_read_length']
-
-    print('Saving final adata')
-    if ".gz" in final_adata_path:
-        final_adata.write_h5ad(f"{final_adata_path}", compression='gzip')
-    else:
-        final_adata.write_h5ad(f"{final_adata_path}.gz", compression='gzip')
-    print('Final adata saved')
```
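In 0.1.7 this was the single entry point for raw-data-to-AnnData conversion. Invocation is one call on the experiment configuration CSV; the recognized keys are exactly the `var_dict.get(...)` lookups above. A usage sketch (hypothetical path; the import mirrors the deleted module location):

```python
from smftools.informatics.load_adata import load_adata

# smf_modality, input_data_path, and output_directory are flagged 'Necessary'
# in the comments above; omitted keys fall back to None.
load_adata("/experiments/run1/experiment_config.csv")
```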
smftools/informatics/readwrite.py (removed)

```diff
@@ -1,106 +0,0 @@
-## readwrite ##
-
-######################################################################################################
-## Datetime functionality
-def date_string():
-    """
-    Each time this is called, it returns the current date string
-    """
-    from datetime import datetime
-    current_date = datetime.now()
-    date_string = current_date.strftime("%Y%m%d")
-    date_string = date_string[2:]
-    return date_string
-
-def time_string():
-    """
-    Each time this is called, it returns the current time string
-    """
-    from datetime import datetime
-    current_time = datetime.now()
-    return current_time.strftime("%H:%M:%S")
-######################################################################################################
-
-######################################################################################################
-## Numpy, Pandas, Anndata functionality
-def adata_to_df(adata, layer=None):
-    """
-    Input: An adata object with a specified layer.
-    Output: A dataframe for the specific layer.
-    """
-    import pandas as pd
-    import anndata as ad
-
-    # Extract the data matrix from the given layer
-    if layer:
-        data_matrix = adata.layers[layer]
-    else:
-        data_matrix = adata.X
-    # Extract observation (read) annotations
-    obs_df = adata.obs
-    # Extract variable (position) annotations
-    var_df = adata.var
-    # Convert data matrix and annotations to pandas DataFrames
-    df = pd.DataFrame(data_matrix, index=obs_df.index, columns=var_df.index)
-    return df
-
-def save_matrix(matrix, save_name):
-    """
-    Input: A numpy matrix and a save_name
-    Output: A txt file representation of the data matrix
-    """
-    import numpy as np
-    np.savetxt(f'{save_name}.txt', matrix)
-
-def concatenate_h5ads(output_file, file_suffix='h5ad.gz', delete_inputs=True):
-    """
-    Concatenate all h5ad files in a directory and delete them after the final adata is written out.
-    Input: an output file path relative to the directory in which the function is called
-    """
-    import os
-    import anndata as ad
-    # Runtime warnings
-    import warnings
-    warnings.filterwarnings('ignore', category=UserWarning, module='anndata')
-    warnings.filterwarnings('ignore', category=FutureWarning, module='anndata')
-
-    # List all files in the directory
-    files = os.listdir(os.getcwd())
-    # get current working directory
-    cwd = os.getcwd()
-    suffix = file_suffix
-    # Filter file names that contain the search string in their filename and keep them in a list
-    hdfs = [hdf for hdf in files if suffix in hdf]
-    # Sort file list by names and print the list of file names
-    hdfs.sort()
-    print('{0} sample files found: {1}'.format(len(hdfs), hdfs))
-    # Iterate over all of the hdf5 files and concatenate them.
-    final_adata = None
-    for hdf in hdfs:
-        print('{0}: Reading in {1} hdf5 file'.format(time_string(), hdf))
-        temp_adata = ad.read_h5ad(hdf)
-        if final_adata:
-            print('{0}: Concatenating final adata object with {1} hdf5 file'.format(time_string(), hdf))
-            final_adata = ad.concat([final_adata, temp_adata], join='outer', index_unique=None)
-        else:
-            print('{0}: Initializing final adata object with {1} hdf5 file'.format(time_string(), hdf))
-            final_adata = temp_adata
-    print('{0}: Writing final concatenated hdf5 file'.format(time_string()))
-    final_adata.write_h5ad(output_file, compression='gzip')
-
-    # Delete the individual h5ad files and only keep the final concatenated file
-    if delete_inputs:
-        files = os.listdir(os.getcwd())
-        hdfs = [hdf for hdf in files if suffix in hdf]
-        if output_file in hdfs:
-            hdfs.remove(output_file)
-        # Iterate over the files and delete them
-        for hdf in hdfs:
-            try:
-                os.remove(hdf)
-                print(f"Deleted file: {hdf}")
-            except OSError as e:
-                print(f"Error deleting file {hdf}: {e}")
-    else:
-        print('Keeping input files')
-######################################################################################################
```
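A short sketch exercising `adata_to_df` on a toy AnnData object (shapes and layer name invented):

```python
import anndata as ad
import numpy as np

adata = ad.AnnData(X=np.random.rand(3, 5))  # 3 reads x 5 positions
df = adata_to_df(adata)                     # DataFrame of X, indexed by obs/var names
# adata_to_df(adata, layer="binarized") would pull adata.layers["binarized"] instead.
print(df.shape)  # (3, 5)
```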
smftools/informatics/subsample_fasta_from_bed.py (removed)

```diff
@@ -1,47 +0,0 @@
-# subsample_fasta_from_bed
-
-def subsample_fasta_from_bed(input_FASTA, input_bed, output_directory, output_FASTA):
-    """
-    Take a genome-wide FASTA file and a bed file containing coordinate windows of interest. Outputs a subsampled FASTA.
-
-    Parameters:
-        input_FASTA (str): String representing the path to the input FASTA file.
-        input_bed (str): String representing the path to the input BED file.
-        output_directory (str): String representing the path to the output directory for the new FASTA file.
-        output_FASTA (str): Name of the output FASTA.
-
-    Returns:
-        None
-    """
-    from pyfaidx import Fasta
-    import os
-
-    # Load the FASTA file using pyfaidx
-    fasta = Fasta(input_FASTA)
-
-    output_FASTA_path = os.path.join(output_directory, output_FASTA)
-
-    # Open the BED file
-    with open(input_bed, 'r') as bed, open(output_FASTA_path, 'w') as out_fasta:
-        for line in bed:
-            # Each line in BED file contains: chrom, start, end (and possibly more columns)
-            fields = line.strip().split()
-            n_fields = len(fields)
-            chrom = fields[0]
-            start = int(fields[1])  # BED is 0-based
-            end = int(fields[2])  # BED is 0-based and end is exclusive
-            if n_fields > 3:
-                description = " ".join(fields[3:])
-
-            # Check if the chromosome exists in the FASTA file
-            if chrom in fasta:
-                # pyfaidx is 1-based, so convert coordinates accordingly
-                sequence = fasta[chrom][start:end].seq
-                # Write the sequence to the output FASTA file
-                if n_fields > 3:
-                    out_fasta.write(f">{chrom}:{start}-{end} {description}\n")
-                else:
-                    out_fasta.write(f">{chrom}:{start}-{end}\n")
-                out_fasta.write(f"{sequence}\n")
-            else:
-                print(f"Warning: {chrom} not found in the FASTA file")
```
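A usage sketch with hypothetical paths; each BED interval becomes one FASTA record whose header encodes the 0-based, end-exclusive coordinates:

```python
subsample_fasta_from_bed(
    input_FASTA="genome.fa",
    input_bed="regions_of_interest.bed",
    output_directory="subsets",
    output_FASTA="genome_regions.fasta",
)
# A BED line "chr2  5000  6000  promoterA" yields:
#   >chr2:5000-6000 promoterA
#   <the 1 kb sequence for that window>
```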