smftools 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {smftools-0.1.1 → smftools-0.1.3}/.gitignore +4 -0
- {smftools-0.1.1 → smftools-0.1.3}/PKG-INFO +13 -7
- {smftools-0.1.1 → smftools-0.1.3}/README.md +7 -3
- smftools-0.1.3/docs/source/_static/converted_BAM_to_adata.png +0 -0
- smftools-0.1.3/docs/source/_static/modkit_extract_to_adata.png +0 -0
- smftools-0.1.3/docs/source/_static/smftools_informatics_diagram.pdf +0 -0
- smftools-0.1.3/docs/source/_static/smftools_informatics_diagram.png +0 -0
- smftools-0.1.3/docs/source/_static/smftools_preprocessing_diagram.png +0 -0
- smftools-0.1.3/docs/source/api/index.md +26 -0
- smftools-0.1.3/docs/source/api/informatics.md +27 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/source/api/preprocessing.md +5 -0
- smftools-0.1.3/docs/source/basic_usage.md +75 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/source/index.md +1 -0
- smftools-0.1.3/docs/source/installation.md +60 -0
- {smftools-0.1.1 → smftools-0.1.3}/experiment_config.csv +2 -2
- smftools-0.1.3/notebooks/Kissiov_and_McKenna_2025_example_notebook.ipynb +85 -0
- smftools-0.1.3/notebooks/Kissiov_and_McKenna_2025_sample_sheet.csv +11 -0
- {smftools-0.1.1 → smftools-0.1.3}/pyproject.toml +9 -7
- {smftools-0.1.1 → smftools-0.1.3}/requirements.txt +1 -0
- smftools-0.1.3/sample_sheet.csv +11 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/_settings.py +3 -2
- smftools-0.1.3/src/smftools/_version.py +1 -0
- smftools-0.1.3/src/smftools/datasets/F1_sample_sheet.csv +5 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/datasets/datasets.py +8 -7
- smftools-0.1.3/src/smftools/informatics/__init__.py +14 -0
- {smftools-0.1.1/src/smftools/informatics → smftools-0.1.3/src/smftools/informatics/archived}/bam_conversion.py +16 -4
- {smftools-0.1.1/src/smftools/informatics → smftools-0.1.3/src/smftools/informatics/archived}/bam_direct.py +22 -8
- smftools-0.1.3/src/smftools/informatics/archived/basecalls_to_adata.py +71 -0
- smftools-0.1.3/src/smftools/informatics/conversion_smf.py +79 -0
- smftools-0.1.3/src/smftools/informatics/direct_smf.py +89 -0
- smftools-0.1.3/src/smftools/informatics/fast5_to_pod5.py +21 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/__init__.py +18 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/align_and_sort_BAM.py +9 -13
- smftools-0.1.3/src/smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
- smftools-0.1.3/src/smftools/informatics/helpers/bed_to_bigwig.py +39 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/binarize_converted_base_identities.py +2 -2
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/canoncall.py +2 -0
- smftools-0.1.3/src/smftools/informatics/helpers/complement_base_list.py +21 -0
- smftools-0.1.3/src/smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
- smftools-0.1.3/src/smftools/informatics/helpers/converted_BAM_to_adata.py +233 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/count_aligned_reads.py +13 -9
- smftools-0.1.3/src/smftools/informatics/helpers/extract_base_identities.py +57 -0
- smftools-0.1.3/src/smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/find_conversion_sites.py +11 -9
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/generate_converted_FASTA.py +33 -14
- smftools-0.1.3/src/smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
- smftools-0.1.3/src/smftools/informatics/helpers/index_fasta.py +12 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/modcall.py +3 -1
- smftools-0.1.3/src/smftools/informatics/helpers/modkit_extract_to_adata.py +518 -0
- smftools-0.1.3/src/smftools/informatics/helpers/ohe_batching.py +52 -0
- smftools-0.1.3/src/smftools/informatics/helpers/one_hot_encode.py +21 -0
- smftools-0.1.3/src/smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/separate_bam_by_bc.py +4 -2
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/split_and_index_BAM.py +16 -4
- smftools-0.1.3/src/smftools/informatics/load_adata.py +127 -0
- smftools-0.1.3/src/smftools/informatics/subsample_fasta_from_bed.py +47 -0
- smftools-0.1.3/src/smftools/informatics/subsample_pod5.py +104 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/__init__.py +6 -1
- smftools-0.1.3/src/smftools/preprocessing/append_C_context.py +69 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/calculate_complexity.py +2 -2
- smftools-0.1.3/src/smftools/preprocessing/calculate_consensus.py +47 -0
- smftools-0.1.3/src/smftools/preprocessing/calculate_converted_read_methylation_stats.py +96 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/calculate_coverage.py +2 -2
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/calculate_pairwise_hamming_distances.py +1 -1
- smftools-0.1.3/src/smftools/preprocessing/calculate_read_length_stats.py +86 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/clean_NaN.py +2 -2
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/filter_converted_reads_on_methylation.py +4 -2
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/filter_reads_on_length.py +4 -2
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/invert_adata.py +1 -0
- smftools-0.1.3/src/smftools/preprocessing/load_sample_sheet.py +24 -0
- smftools-0.1.3/src/smftools/preprocessing/make_dirs.py +21 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/mark_duplicates.py +34 -19
- smftools-0.1.3/src/smftools/preprocessing/recipes.py +125 -0
- smftools-0.1.3/src/smftools/preprocessing/remove_duplicates.py +21 -0
- smftools-0.1.3/src/smftools/tools/apply_HMM.py +1 -0
- smftools-0.1.3/src/smftools/tools/read_HMM.py +1 -0
- smftools-0.1.3/src/smftools/tools/subset_adata.py +32 -0
- smftools-0.1.3/src/smftools/tools/train_HMM.py +43 -0
- smftools-0.1.3/tests/informatics/helpers/test_LoadExperimentConfig.py +17 -0
- smftools-0.1.1/docs/source/_templates/tmp +0 -1
- smftools-0.1.1/docs/source/api/index.md +0 -16
- smftools-0.1.1/docs/source/api/informatics.md +0 -11
- smftools-0.1.1/docs/source/installation.md +0 -20
- smftools-0.1.1/src/smftools/_version.py +0 -1
- smftools-0.1.1/src/smftools/informatics/__init__.py +0 -12
- smftools-0.1.1/src/smftools/informatics/basecalls_to_adata.py +0 -42
- smftools-0.1.1/src/smftools/informatics/fast5_to_pod5.py +0 -19
- smftools-0.1.1/src/smftools/informatics/helpers/converted_BAM_to_adata.py +0 -164
- smftools-0.1.1/src/smftools/informatics/helpers/extract_base_identities.py +0 -43
- smftools-0.1.1/src/smftools/informatics/helpers/modkit_extract_to_adata.py +0 -367
- smftools-0.1.1/src/smftools/informatics/helpers/one_hot_encode.py +0 -19
- smftools-0.1.1/src/smftools/informatics/pod5_conversion.py +0 -53
- smftools-0.1.1/src/smftools/informatics/pod5_direct.py +0 -55
- smftools-0.1.1/src/smftools/informatics/pod5_to_adata.py +0 -40
- smftools-0.1.1/src/smftools/informatics/subsample_pod5.py +0 -48
- smftools-0.1.1/src/smftools/preprocessing/append_C_context.py +0 -46
- smftools-0.1.1/src/smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -45
- smftools-0.1.1/src/smftools/preprocessing/calculate_read_length_stats.py +0 -32
- smftools-0.1.1/src/smftools/preprocessing/remove_duplicates.py +0 -18
- smftools-0.1.1/tests/informatics/helpers/test_align_BAM.py +0 -49
- smftools-0.1.1/tests/informatics/helpers/test_binarize_converted_base_identities.py +0 -24
- smftools-0.1.1/tests/informatics/helpers/test_canoncall.py +0 -12
- smftools-0.1.1/tests/informatics/helpers/test_converted_BAM_to_adata.py +0 -147
- smftools-0.1.1/tests/informatics/helpers/test_count_aligned_reads.py +0 -32
- smftools-0.1.1/tests/informatics/helpers/test_extract_base_identities.py +0 -36
- smftools-0.1.1/tests/informatics/helpers/test_extract_mods.py +0 -39
- smftools-0.1.1/tests/informatics/helpers/test_find_conversion_sites.py +0 -53
- smftools-0.1.1/tests/informatics/helpers/test_generate_converted_FASTA.py +0 -59
- smftools-0.1.1/tests/informatics/helpers/test_get_native_references.py +0 -25
- smftools-0.1.1/tests/informatics/helpers/test_informatics.py +0 -260
- smftools-0.1.1/tests/informatics/helpers/test_load_adata.py +0 -516
- smftools-0.1.1/tests/informatics/helpers/test_load_experiment_config.py +0 -17
- smftools-0.1.1/tests/informatics/helpers/test_make_dirs.py +0 -15
- smftools-0.1.1/tests/informatics/helpers/test_make_modbed.py +0 -21
- smftools-0.1.1/tests/informatics/helpers/test_modQC.py +0 -19
- smftools-0.1.1/tests/informatics/helpers/test_modcall.py +0 -14
- smftools-0.1.1/tests/informatics/helpers/test_modkit_extract_to_adata.py +0 -355
- smftools-0.1.1/tests/informatics/helpers/test_one_hot_encode.py +0 -14
- smftools-0.1.1/tests/informatics/helpers/test_separate_bam_by_bc.py +0 -28
- smftools-0.1.1/tests/informatics/helpers/test_split_and_index_BAM.py +0 -21
- smftools-0.1.1/tests/informatics/test_pod5_conversion.py +0 -26
- smftools-0.1.1/tests/informatics/test_pod5_direct.py +0 -29
- smftools-0.1.1/tests/informatics/test_pod5_to_adata.py +0 -17
- smftools-0.1.1/tests/preprocessing/test_append_C_context.py +0 -39
- smftools-0.1.1/tests/preprocessing/test_binarize_on_Youden.py +0 -38
- smftools-0.1.1/tests/preprocessing/test_binary_layers_to_ohe.py +0 -25
- smftools-0.1.1/tests/preprocessing/test_calculate_complexity.py +0 -59
- smftools-0.1.1/tests/preprocessing/test_calculate_converted_read_methylation_stats.py +0 -38
- smftools-0.1.1/tests/preprocessing/test_calculate_coverage.py +0 -35
- smftools-0.1.1/tests/preprocessing/test_calculate_pairwise_hamming_distances.py +0 -22
- smftools-0.1.1/tests/preprocessing/test_calculate_position_Youden.py +0 -95
- smftools-0.1.1/tests/preprocessing/test_calculate_read_length_stats.py +0 -27
- smftools-0.1.1/tests/preprocessing/test_clean_NaN.py +0 -31
- smftools-0.1.1/tests/preprocessing/test_filter_converted_reads_on_methylation.py +0 -20
- smftools-0.1.1/tests/preprocessing/test_filter_reads_on_length.py +0 -31
- smftools-0.1.1/tests/preprocessing/test_invert_adata.py +0 -18
- smftools-0.1.1/tests/preprocessing/test_mark_duplicates.py +0 -110
- smftools-0.1.1/tests/preprocessing/test_min_non_diagonal.py +0 -20
- smftools-0.1.1/tests/preprocessing/test_preprocessing.py +0 -614
- smftools-0.1.1/tests/preprocessing/test_remove_duplicates.py +0 -12
- {smftools-0.1.1 → smftools-0.1.3}/.gitattributes +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/.readthedocs.yaml +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/CONTRIBUTING.md +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/LICENSE +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/Makefile +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/make.bat +0 -0
- {smftools-0.1.1/docs/source/_static → smftools-0.1.3/docs/source/_templates}/tmp +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/source/api/datasets.md +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/source/api/tools.md +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/source/conf.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/source/contributors.md +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/source/dev/index.md +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/source/references.bib +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/source/references.rst +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/source/release-notes/0.1.0.md +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/source/release-notes/index.md +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/source/requirements.txt +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/docs/source/tutorials/index.md +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/__init__.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/datasets/__init__.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/LoadExperimentConfig.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/archived/informatics.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/archived/load_adata.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/extract_mods.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/get_native_references.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/make_dirs.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/make_modbed.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/modQC.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/readwrite.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/plotting/__init__.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/archives/preprocessing.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/binarize_on_Youden.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/binary_layers_to_ohe.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/calculate_position_Youden.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/min_non_diagonal.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/readwrite.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/src/smftools/tools/__init__.py +0 -0
- /smftools-0.1.1/tests/__init__.py → /smftools-0.1.3/src/smftools/tools/cluster.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/tests/datasets/test_datasets.py +0 -0
- {smftools-0.1.1 → smftools-0.1.3}/tests/test_readwrite.py +0 -0
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: smftools
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.1.3
|
|
4
4
|
Summary: Single Molecule Footprinting Analysis in Python.
|
|
5
5
|
Project-URL: Source, https://github.com/jkmckenna/smftools
|
|
6
|
+
Project-URL: Documentation, https://smftools.readthedocs.io/
|
|
6
7
|
Author: Joseph McKenna
|
|
7
8
|
Maintainer-email: Joseph McKenna <jkmckenna@berkeley.edu>
|
|
8
9
|
License-Expression: MIT
|
|
@@ -31,6 +32,7 @@ Requires-Dist: numpy<2,>=1.22.0
|
|
|
31
32
|
Requires-Dist: pandas>=1.4.2
|
|
32
33
|
Requires-Dist: pod5>=0.1.21
|
|
33
34
|
Requires-Dist: pomegranate>1.0.0
|
|
35
|
+
Requires-Dist: pyfaidx>=0.8.0
|
|
34
36
|
Requires-Dist: pysam>=0.19.1
|
|
35
37
|
Requires-Dist: scanpy>=1.9
|
|
36
38
|
Requires-Dist: scikit-learn>=1.0.2
|
|
@@ -38,9 +40,6 @@ Requires-Dist: scipy>=1.7.3
|
|
|
38
40
|
Requires-Dist: seaborn>=0.11
|
|
39
41
|
Requires-Dist: torch>=1.9.0
|
|
40
42
|
Requires-Dist: tqdm
|
|
41
|
-
Provides-Extra: base-tests
|
|
42
|
-
Requires-Dist: pytest; extra == 'base-tests'
|
|
43
|
-
Requires-Dist: pytest-cov; extra == 'base-tests'
|
|
44
43
|
Provides-Extra: docs
|
|
45
44
|
Requires-Dist: ipython>=7.20; extra == 'docs'
|
|
46
45
|
Requires-Dist: matplotlib!=3.6.1; extra == 'docs'
|
|
@@ -56,13 +55,16 @@ Requires-Dist: sphinx-design; extra == 'docs'
|
|
|
56
55
|
Requires-Dist: sphinx>=7; extra == 'docs'
|
|
57
56
|
Requires-Dist: sphinxcontrib-bibtex; extra == 'docs'
|
|
58
57
|
Requires-Dist: sphinxext-opengraph; extra == 'docs'
|
|
58
|
+
Provides-Extra: tests
|
|
59
|
+
Requires-Dist: pytest; extra == 'tests'
|
|
60
|
+
Requires-Dist: pytest-cov; extra == 'tests'
|
|
59
61
|
Description-Content-Type: text/markdown
|
|
60
62
|
|
|
61
63
|
[](https://pypi.org/project/smftools)
|
|
62
64
|
[](https://smftools.readthedocs.io/en/latest/?badge=latest)
|
|
63
65
|
|
|
64
66
|
# smftools
|
|
65
|
-
A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, analysis, and visualization.
|
|
67
|
+
A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, analysis, and visualization.
|
|
66
68
|
|
|
67
69
|
## Philosophy
|
|
68
70
|
While most genomic data structures handle low-coverage data (<100X) along large references, smftools prioritizes high-coverage data (scalable to at least 1 million X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
|
|
@@ -73,10 +75,14 @@ The following CLI tools need to be installed and configured before using the inf
|
|
|
73
75
|
2) [Samtools](https://github.com/samtools/samtools) -> For working with SAM/BAM files
|
|
74
76
|
3) [Minimap2](https://github.com/lh3/minimap2) -> The aligner used by Dorado
|
|
75
77
|
4) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting summary statistics and read level methylation calls from modified BAM files
|
|
78
|
+
5) [Bedtools](https://github.com/arq5x/bedtools2) -> For generating Bedgraphs from BAM alignment files.
|
|
79
|
+
6) [BedGraphToBigWig](https://genome.ucsc.edu/goldenPath/help/bigWig.html) -> For converting BedGraphs to BigWig files for IGV sessions.
|
|
76
80
|
|
|
77
81
|
## Modules
|
|
78
|
-
|
|
79
|
-
|
|
82
|
+
### Informatics: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
|
|
83
|
+

|
|
84
|
+
### Preprocessing: Appends QC metrics to the AnnData object and performs filtering.
|
|
85
|
+

|
|
80
86
|
- Tools: Appends various analyses to the AnnData object.
|
|
81
87
|
- Plotting: Visualization of analyses stored within the AnnData object.
|
|
82
88
|
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
[](https://smftools.readthedocs.io/en/latest/?badge=latest)
|
|
3
3
|
|
|
4
4
|
# smftools
|
|
5
|
-
A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, analysis, and visualization.
|
|
5
|
+
A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, analysis, and visualization.
|
|
6
6
|
|
|
7
7
|
## Philosophy
|
|
8
8
|
While most genomic data structures handle low-coverage data (<100X) along large references, smftools prioritizes high-coverage data (scalable to at least 1 million X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
|
|
@@ -13,10 +13,14 @@ The following CLI tools need to be installed and configured before using the inf
|
|
|
13
13
|
2) [Samtools](https://github.com/samtools/samtools) -> For working with SAM/BAM files
|
|
14
14
|
3) [Minimap2](https://github.com/lh3/minimap2) -> The aligner used by Dorado
|
|
15
15
|
4) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting summary statistics and read level methylation calls from modified BAM files
|
|
16
|
+
5) [Bedtools](https://github.com/arq5x/bedtools2) -> For generating Bedgraphs from BAM alignment files.
|
|
17
|
+
6) [BedGraphToBigWig](https://genome.ucsc.edu/goldenPath/help/bigWig.html) -> For converting BedGraphs to BigWig files for IGV sessions.
|
|
16
18
|
|
|
17
19
|
## Modules
|
|
18
|
-
|
|
19
|
-
|
|
20
|
+
### Informatics: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
|
|
21
|
+

|
|
22
|
+
### Preprocessing: Appends QC metrics to the AnnData object and performs filtering.
|
|
23
|
+

|
|
20
24
|
- Tools: Appends various analyses to the AnnData object.
|
|
21
25
|
- Plotting: Visualization of analyses stored within the AnnData object.
|
|
22
26
|
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
# API
|
|
2
|
+
|
|
3
|
+
Import smftools as:
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
import smftools as smf
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
```{toctree}
|
|
10
|
+
:maxdepth: 2
|
|
11
|
+
|
|
12
|
+
informatics
|
|
13
|
+
preprocessing
|
|
14
|
+
tools
|
|
15
|
+
datasets
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Informatics module diagram
|
|
19
|
+
```{image} ../_static/smftools_informatics_diagram.png
|
|
20
|
+
:width: 800px
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Preprocessing module diagram
|
|
24
|
+
```{image} ../_static/smftools_preprocessing_diagram.png
|
|
25
|
+
:width: 800px
|
|
26
|
+
```
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
## Informatics: `inform`
|
|
2
|
+
|
|
3
|
+
## Informatics module diagram
|
|
4
|
+
```{image} ../_static/smftools_informatics_diagram.png
|
|
5
|
+
:width: 1000px
|
|
6
|
+
```
|
|
7
|
+
|
|
8
|
+
```{eval-rst}
|
|
9
|
+
.. module:: smftools.inform
|
|
10
|
+
```
|
|
11
|
+
|
|
12
|
+
```{eval-rst}
|
|
13
|
+
.. currentmodule:: smftools
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
Processes raw sequencing data to load an adata object.
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
### Diagram of final steps of Direct SMF workflow
|
|
20
|
+
```{image} ../_static/modkit_extract_to_adata.png
|
|
21
|
+
:width: 1000px
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
### Diagram of final steps of Conversion SMF workflow
|
|
25
|
+
```{image} ../_static/converted_BAM_to_adata.png
|
|
26
|
+
:width: 1000px
|
|
27
|
+
```
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Basic Usage
|
|
2
|
+
|
|
3
|
+
Import SmfTools:
|
|
4
|
+
|
|
5
|
+
```
|
|
6
|
+
import smftools as smf
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
## Informatics Module Usage
|
|
10
|
+
|
|
11
|
+
Many use cases for smftools begin here. For most users, the call below will be sufficient to convert any raw SMF dataset to an AnnData object:
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
config_path = "/Path_to_experiment_config.csv"
|
|
15
|
+
smf.inform.load_adata(config_path)
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Loading AnnData objects created by the informatics module
|
|
19
|
+
|
|
20
|
+
After creating an AnnData object holding your experiment's SMF data, you can load the AnnData object as so:
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
import anndata as ad
|
|
24
|
+
input_adata = "/Path_to_experiment_AnnData.h5ad.gz"
|
|
25
|
+
adata = ad.read_h5ad(input_adata)
|
|
26
|
+
adata.obs_names_make_unique()
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
If you don't have an AnnData object yet, but want to play with the downstream Preprocessing, Tools, and Plotting modules, you can load a pre-loaded SMF dataset.
|
|
30
|
+
|
|
31
|
+
Currently, you can do this with our lab's in vitro dCas9 binding kinetics dataset, which was generated from a Hia5 SMF dataset using direct m6A high-accuracy basecalls:
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
adata = smf.datasets.dCas9_kinetics()
|
|
35
|
+
adata.obs_names_make_unique()
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
Alternatively, you can do this with our lab's M.CviPI SMF test data in F1-hybrid natural killer cells generated by NEB EMseq conversion followed by canonical basecalling:
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
adata = smf.datasets.Kissiov_and_McKenna_2025()
|
|
42
|
+
adata.obs_names_make_unique()
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Writing out AnnData objects to save analysis progress
|
|
46
|
+
|
|
47
|
+
After preprocessing and downstream analysis of the AnnData object, you can save the AnnData object at any step as so:
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
import anndata as ad
|
|
51
|
+
import os
|
|
52
|
+
|
|
53
|
+
output_dir = '/Path_to_output_directory'
|
|
54
|
+
output_adata = 'analyzed_adata.h5ad.gz'
|
|
55
|
+
final_output = os.path.join(output_dir, output_adata)
|
|
56
|
+
adata.write_h5ad(final_output, compression='gzip')
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
## Troubleshooting
|
|
61
|
+
For more advanced usage and help with troubleshooting, the API and tutorials for each of the modules are still being developed.
|
|
62
|
+
However, you can currently learn about the functions contained within the module by calling:
|
|
63
|
+
|
|
64
|
+
```
|
|
65
|
+
smf.inform.__all__
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
This lists the functions within any given module. If you want to see the associated docstring for a given function, here is an example:
|
|
69
|
+
|
|
70
|
+
```
|
|
71
|
+
print(smf.inform.load_adata.__doc__)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
These docstrings will provide a brief description of the function and also tell you the input parameters and what the function returns.
|
|
75
|
+
In some cases, usage examples will also be provided in the docstring in the form of doctests.
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Installation
|
|
2
|
+
|
|
3
|
+
## PyPi version
|
|
4
|
+
|
|
5
|
+
Pull smftools from [PyPI](https://pypi.org/project/smftools):
|
|
6
|
+
|
|
7
|
+
```shell
|
|
8
|
+
pip install smftools
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
It is recommended to first create and activate a conda environment before installing smftools to ensure dependencies are managed smoothly:
|
|
12
|
+
|
|
13
|
+
```shell
|
|
14
|
+
conda create -n smftools
|
|
15
|
+
conda activate smftools
|
|
16
|
+
pip install smftools
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Ensure that you can access the dorado, samtools, modkit, bedtools, and BedGraphToBigWig executables from the terminal in this environment. These are all necessary for the functionality within the Informatics module.
|
|
20
|
+
You may need to add them to $PATH if they are not globally configured.
|
|
21
|
+
For example, if you want to check if dorado is executable, simply run this in the terminal:
|
|
22
|
+
|
|
23
|
+
```shell
|
|
24
|
+
dorado
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
On Mac OSX, the following can be used to configure bedtools (with brew) and BedGraphToBigWig (with wget). Change the BedGraphToBigWig link to include the correct architecture for your OS.
|
|
28
|
+
|
|
29
|
+
```shell
|
|
30
|
+
brew install bedtools
|
|
31
|
+
wget http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/bedGraphToBigWig
|
|
32
|
+
chmod +x bedGraphToBigWig
|
|
33
|
+
sudo mv bedGraphToBigWig /usr/local/bin/
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Development Version
|
|
37
|
+
|
|
38
|
+
Clone smftools from source and change into the smftools directory:
|
|
39
|
+
|
|
40
|
+
```shell
|
|
41
|
+
git clone https://github.com/jkmckenna/smftools.git
|
|
42
|
+
cd smftools
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
A virtual environment can be created for the current version within the smftools directory:
|
|
46
|
+
|
|
47
|
+
```shell
|
|
48
|
+
python -m venv venv-smftools
|
|
49
|
+
source venv-smftools/bin/activate
|
|
50
|
+
pip install .
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Subsequent use of the installed version of smftools can be run by changing to the smftools directory and activating the venv:
|
|
54
|
+
|
|
55
|
+
```shell
|
|
56
|
+
cd smftools
|
|
57
|
+
source venv-smftools/bin/activate
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
You can now run smftools from the terminal, an IDE, or a notebook within the virtual environment.
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
variable,value,help,options,type
|
|
2
2
|
smf_modality,conversion,Modality of SMF. Can either be conversion or direct.,"conversion, direct",str
|
|
3
|
-
|
|
4
|
-
basecalled_path,/path_to_basecalled_HTS_file.bam,Path to directory containing input BAM file (if doing SMF from an already basecalled experiment). Can also be a path to a FASTQ for conversion SMF.,,str
|
|
3
|
+
input_data_path,/path_to_POD5_directory,Path to directory/file containing input sequencing data,,str
|
|
5
4
|
fasta,/path_to_fasta.fasta,Path to initial FASTA file,,str
|
|
5
|
+
fasta_regions_of_interest,/path_to_bed.bed,Path to a bed file to subsample the fasta on.,,str
|
|
6
6
|
output_directory,/outputs,Directory to act as root for all analysis outputs,,str
|
|
7
7
|
experiment_name,,An experiment name for the final h5ad file,,str
|
|
8
8
|
model,None,The dorado basecalling model to use,,str
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
{
|
|
2
|
+
"cells": [
|
|
3
|
+
{
|
|
4
|
+
"cell_type": "code",
|
|
5
|
+
"execution_count": null,
|
|
6
|
+
"metadata": {},
|
|
7
|
+
"outputs": [],
|
|
8
|
+
"source": [
|
|
9
|
+
"import anndata as ad\n",
|
|
10
|
+
"import pandas as pd\n",
|
|
11
|
+
"import numpy as np\n",
|
|
12
|
+
"import matplotlib.pyplot as plt\n",
|
|
13
|
+
"import smftools as smf\n",
|
|
14
|
+
"import os"
|
|
15
|
+
]
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"cell_type": "code",
|
|
19
|
+
"execution_count": null,
|
|
20
|
+
"metadata": {},
|
|
21
|
+
"outputs": [],
|
|
22
|
+
"source": [
|
|
23
|
+
"# Define file paths\n",
|
|
24
|
+
"adata_path = '/Path_to_input_adata.h5ad.gz'\n",
|
|
25
|
+
"output_directory = '/Path_to_output_directory'\n",
|
|
26
|
+
"output_adata = 'analyzed_adata.h5ad.gz'\n",
|
|
27
|
+
"final_output = os.path.join(output_directory, output_adata)\n",
|
|
28
|
+
"\n",
|
|
29
|
+
"# Load adata\n",
|
|
30
|
+
"adata = ad.read_h5ad(adata_path)\n",
|
|
31
|
+
"adata.obs_names_make_unique()"
|
|
32
|
+
]
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
"cell_type": "code",
|
|
36
|
+
"execution_count": null,
|
|
37
|
+
"metadata": {},
|
|
38
|
+
"outputs": [],
|
|
39
|
+
"source": [
|
|
40
|
+
"# Define path to sample sheet and run first part of preprocessing.\n",
|
|
41
|
+
"sample_sheet_path = '/path_to_sample_sheet.csv'\n",
|
|
42
|
+
"variables = smf.pp.recipe_1_Kissiov_and_McKenna_2025(adata, sample_sheet_path, output_directory)\n",
|
|
43
|
+
"# Update global variables\n",
|
|
44
|
+
"globals().update(variables)"
|
|
45
|
+
]
|
|
46
|
+
},
|
|
47
|
+
{
|
|
48
|
+
"cell_type": "code",
|
|
49
|
+
"execution_count": null,
|
|
50
|
+
"metadata": {},
|
|
51
|
+
"outputs": [],
|
|
52
|
+
"source": [
|
|
53
|
+
"# Filter adata based on defined read length statistics, using the plots from preprocessing part 1 to direct the input parameters here.\n",
|
|
54
|
+
"smf.pp.filter_reads_on_length(adata, filter_on_coordinates=[lower_bound, upper_bound], min_read_length=2700)"
|
|
55
|
+
]
|
|
56
|
+
},
|
|
57
|
+
{
|
|
58
|
+
"cell_type": "code",
|
|
59
|
+
"execution_count": null,
|
|
60
|
+
"metadata": {},
|
|
61
|
+
"outputs": [],
|
|
62
|
+
"source": [
|
|
63
|
+
"# Filter adata on defined read methylation statistics\n",
|
|
64
|
+
"smf.pp.filter_converted_reads_on_methylation(adata, valid_SMF_site_threshold=0.8, min_SMF_threshold=0.025)"
|
|
65
|
+
]
|
|
66
|
+
},
|
|
67
|
+
{
|
|
68
|
+
"cell_type": "code",
|
|
69
|
+
"execution_count": null,
|
|
70
|
+
"metadata": {},
|
|
71
|
+
"outputs": [],
|
|
72
|
+
"source": [
|
|
73
|
+
"# Run second part of preprocessing\n",
|
|
74
|
+
"duplicates = smf.pp.recipe_2_Kissiov_and_McKenna_2025(adata, output_directory, binary_layers)"
|
|
75
|
+
]
|
|
76
|
+
}
|
|
77
|
+
],
|
|
78
|
+
"metadata": {
|
|
79
|
+
"language_info": {
|
|
80
|
+
"name": "python"
|
|
81
|
+
}
|
|
82
|
+
},
|
|
83
|
+
"nbformat": 4,
|
|
84
|
+
"nbformat_minor": 2
|
|
85
|
+
}
|
|
@@ -48,6 +48,7 @@ dependencies = [
|
|
|
48
48
|
"pandas>=1.4.2",
|
|
49
49
|
"pod5>=0.1.21",
|
|
50
50
|
"pomegranate>1.0.0",
|
|
51
|
+
"pyfaidx>=0.8.0",
|
|
51
52
|
"pysam>=0.19.1",
|
|
52
53
|
"scanpy>=1.9",
|
|
53
54
|
"scikit-learn>=1.0.2",
|
|
@@ -60,9 +61,10 @@ dynamic = ["version"]
|
|
|
60
61
|
|
|
61
62
|
[project.urls]
|
|
62
63
|
Source = "https://github.com/jkmckenna/smftools"
|
|
64
|
+
Documentation = "https://smftools.readthedocs.io/"
|
|
63
65
|
|
|
64
66
|
[project.optional-dependencies]
|
|
65
|
-
|
|
67
|
+
tests = [
|
|
66
68
|
"pytest",
|
|
67
69
|
"pytest-cov"
|
|
68
70
|
]
|
|
@@ -91,16 +93,16 @@ packages = ["src/smftools"]
|
|
|
91
93
|
path = "src/smftools/_version.py"
|
|
92
94
|
|
|
93
95
|
[tool.pytest.ini_options]
|
|
96
|
+
addopts = [
|
|
97
|
+
"--import-mode=importlib",
|
|
98
|
+
"--strict-markers",
|
|
99
|
+
"--doctest-modules",
|
|
100
|
+
"--pyargs",
|
|
101
|
+
]
|
|
94
102
|
testpaths = ["tests"]
|
|
95
103
|
pythonpath = ["src"]
|
|
96
104
|
xfail_strict = true
|
|
97
|
-
markers = [
|
|
98
|
-
"internet: mark tests that requires internet access",
|
|
99
|
-
"optional: mark optional tests",
|
|
100
|
-
"private: mark tests that are private",
|
|
101
|
-
]
|
|
102
105
|
|
|
103
106
|
[tool.coverage.run]
|
|
104
|
-
branch = true
|
|
105
107
|
source = ["smftools"]
|
|
106
108
|
omit = ["tests/*"]
|
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
|
+
from typing import Union
|
|
2
3
|
|
|
3
4
|
class SMFConfig:
|
|
4
5
|
"""\
|
|
@@ -8,9 +9,9 @@ class SMFConfig:
|
|
|
8
9
|
def __init__(
|
|
9
10
|
self,
|
|
10
11
|
*,
|
|
11
|
-
datasetdir: Path
|
|
12
|
+
datasetdir: Union[Path, str] = "./datasets/"
|
|
12
13
|
):
|
|
13
|
-
|
|
14
|
+
self._datasetdir = Path(datasetdir) if isinstance(datasetdir, str) else datasetdir
|
|
14
15
|
|
|
15
16
|
@property
|
|
16
17
|
def datasetdir(self) -> Path:
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.3"
|
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
## datasets
|
|
2
2
|
|
|
3
|
-
def
|
|
3
|
+
def import_HERE():
|
|
4
4
|
"""
|
|
5
|
-
|
|
5
|
+
Imports HERE for loading datasets
|
|
6
6
|
"""
|
|
7
|
-
import anndata as ad
|
|
8
7
|
from pathlib import Path
|
|
9
8
|
from .._settings import settings
|
|
10
9
|
HERE = Path(__file__).parent
|
|
@@ -12,16 +11,18 @@ def import_deps():
|
|
|
12
11
|
|
|
13
12
|
def dCas9_kinetics():
|
|
14
13
|
"""
|
|
15
|
-
|
|
14
|
+
in vitro Hia5 dCas9 kinetics SMF dataset. Nanopore HAC m6A modcalls.
|
|
16
15
|
"""
|
|
17
|
-
|
|
16
|
+
import anndata as ad
|
|
17
|
+
HERE = import_HERE()
|
|
18
18
|
filepath = HERE / "dCas9_m6A_invitro_kinetics.h5ad.gz"
|
|
19
19
|
return ad.read_h5ad(filepath)
|
|
20
20
|
|
|
21
21
|
def Kissiov_and_McKenna_2025():
|
|
22
22
|
"""
|
|
23
|
-
|
|
23
|
+
F1 Hybrid M.CviPI natural killer cell SMF. Nanopore canonical calls of NEB EMseq converted SMF gDNA.
|
|
24
24
|
"""
|
|
25
|
-
|
|
25
|
+
import anndata as ad
|
|
26
|
+
HERE = import_HERE()
|
|
26
27
|
filepath = HERE / "F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz"
|
|
27
28
|
return ad.read_h5ad(filepath)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
from . import helpers
|
|
2
|
+
from .load_adata import load_adata
|
|
3
|
+
from .subsample_fasta_from_bed import subsample_fasta_from_bed
|
|
4
|
+
from .subsample_pod5 import subsample_pod5
|
|
5
|
+
from .fast5_to_pod5 import fast5_to_pod5
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
__all__ = [
|
|
9
|
+
"load_adata",
|
|
10
|
+
"subsample_fasta_from_bed",
|
|
11
|
+
"subsample_pod5",
|
|
12
|
+
"fast5_to_pod5",
|
|
13
|
+
"helpers"
|
|
14
|
+
]
|
|
@@ -18,7 +18,7 @@ def bam_conversion(fasta, output_directory, conversion_types, strands, basecalle
|
|
|
18
18
|
Returns:
|
|
19
19
|
None
|
|
20
20
|
"""
|
|
21
|
-
from .helpers import align_and_sort_BAM, converted_BAM_to_adata, generate_converted_FASTA, split_and_index_BAM
|
|
21
|
+
from .helpers import align_and_sort_BAM, converted_BAM_to_adata, generate_converted_FASTA, split_and_index_BAM, make_dirs
|
|
22
22
|
import os
|
|
23
23
|
input_basecalled_basename = os.path.basename(basecalled_path)
|
|
24
24
|
bam_basename = input_basecalled_basename.split(".")[0]
|
|
@@ -32,16 +32,28 @@ def bam_conversion(fasta, output_directory, conversion_types, strands, basecalle
|
|
|
32
32
|
fasta_basename = os.path.basename(fasta)
|
|
33
33
|
converted_FASTA_basename = fasta_basename.split('.fa')[0]+'_converted.fasta'
|
|
34
34
|
converted_FASTA = os.path.join(output_directory, converted_FASTA_basename)
|
|
35
|
-
if
|
|
35
|
+
if 'converted.fa' in fasta:
|
|
36
|
+
print(fasta + ' is already converted. Using existing converted FASTA.')
|
|
37
|
+
converted_FASTA = fasta
|
|
38
|
+
elif os.path.exists(converted_FASTA):
|
|
36
39
|
print(converted_FASTA + ' already exists. Using existing converted FASTA.')
|
|
37
40
|
else:
|
|
38
41
|
generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)
|
|
39
42
|
|
|
40
43
|
# 2) Align the basecalled file to the converted reference FASTA and sort the bam on positional coordinates. Also make an index and a bed file of mapped reads
|
|
41
|
-
|
|
44
|
+
aligned_output = aligned_BAM + bam_suffix
|
|
45
|
+
sorted_output = aligned_sorted_BAM + bam_suffix
|
|
46
|
+
if os.path.exists(aligned_output) and os.path.exists(sorted_output):
|
|
47
|
+
print(sorted_output + ' already exists. Using existing aligned/sorted BAM.')
|
|
48
|
+
else:
|
|
49
|
+
align_and_sort_BAM(converted_FASTA, basecalled_path, bam_suffix, output_directory)
|
|
42
50
|
|
|
43
51
|
### 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory###
|
|
44
|
-
|
|
52
|
+
if os.path.isdir(split_dir):
|
|
53
|
+
print(split_dir + ' already exists. Using existing aligned/sorted/split BAMs.')
|
|
54
|
+
else:
|
|
55
|
+
make_dirs([split_dir])
|
|
56
|
+
split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory)
|
|
45
57
|
|
|
46
58
|
# 4) Take the converted BAM and load it into an adata object.
|
|
47
59
|
converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix)
|