smftools 0.1.7__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {smftools-0.1.7 → smftools-0.2.4}/PKG-INFO +30 -19
- {smftools-0.1.7 → smftools-0.2.4}/README.md +23 -17
- smftools-0.2.4/docs/source/basic_usage.md +114 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/installation.md +6 -3
- {smftools-0.1.7 → smftools-0.2.4}/experiment_config.csv +2 -7
- {smftools-0.1.7 → smftools-0.2.4}/pyproject.toml +10 -2
- {smftools-0.1.7 → smftools-0.2.4}/requirements.txt +7 -2
- {smftools-0.1.7 → smftools-0.2.4}/smftools/__init__.py +7 -6
- smftools-0.2.4/smftools/_version.py +1 -0
- smftools-0.2.4/smftools/cli/archived/cli_flows.py +94 -0
- smftools-0.2.4/smftools/cli/helpers.py +48 -0
- smftools-0.2.4/smftools/cli/hmm_adata.py +361 -0
- smftools-0.2.4/smftools/cli/load_adata.py +637 -0
- smftools-0.2.4/smftools/cli/preprocess_adata.py +455 -0
- smftools-0.2.4/smftools/cli/spatial_adata.py +697 -0
- smftools-0.2.4/smftools/cli_entry.py +434 -0
- smftools-0.2.4/smftools/config/__init__.py +1 -0
- smftools-0.2.4/smftools/config/conversion.yaml +45 -0
- smftools-0.2.4/smftools/config/deaminase.yaml +63 -0
- smftools-0.2.4/smftools/config/default.yaml +368 -0
- smftools-0.2.4/smftools/config/direct.yaml +44 -0
- smftools-0.2.4/smftools/config/discover_input_files.py +115 -0
- smftools-0.2.4/smftools/config/experiment_config.py +1389 -0
- smftools-0.2.4/smftools/hmm/HMM.py +1587 -0
- smftools-0.2.4/smftools/hmm/__init__.py +14 -0
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/hmm/archived}/apply_hmm_batched.py +8 -7
- smftools-0.2.4/smftools/hmm/archived/call_hmm_peaks.py +106 -0
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/hmm/archived}/train_hmm.py +1 -1
- smftools-0.2.4/smftools/hmm/call_hmm_peaks.py +334 -0
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/hmm}/display_hmm.py +3 -3
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/hmm}/nucleosome_hmm_refinement.py +2 -2
- smftools-0.2.4/smftools/informatics/__init__.py +20 -0
- smftools-0.2.4/smftools/informatics/archived/deaminase_smf.py +132 -0
- smftools-0.2.4/smftools/informatics/archived/fast5_to_pod5.py +43 -0
- smftools-0.2.4/smftools/informatics/archived/helpers/archived/__init__.py +71 -0
- smftools-0.2.4/smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
- smftools-0.2.4/smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
- smftools-0.2.4/smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
- smftools-0.2.4/smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
- smftools-0.2.4/smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/count_aligned_reads.py +2 -2
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/demux_and_index_BAM.py +8 -10
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/extract_base_identities.py +30 -4
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/extract_mods.py +15 -13
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/find_conversion_sites.py +5 -4
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/generate_converted_FASTA.py +2 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/get_chromosome_lengths.py +9 -8
- smftools-0.2.4/smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/make_modbed.py +1 -2
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/modQC.py +2 -2
- smftools-0.2.4/smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/separate_bam_by_bc.py +8 -7
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/split_and_index_BAM.py +8 -12
- smftools-0.2.4/smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
- smftools-0.2.4/smftools/informatics/bam_functions.py +811 -0
- smftools-0.2.4/smftools/informatics/basecalling.py +67 -0
- smftools-0.2.4/smftools/informatics/bed_functions.py +366 -0
- smftools-0.2.4/smftools/informatics/binarize_converted_base_identities.py +172 -0
- smftools-0.1.7/smftools/informatics/helpers/converted_BAM_to_adata_II.py → smftools-0.2.4/smftools/informatics/converted_BAM_to_adata.py +198 -50
- smftools-0.2.4/smftools/informatics/fasta_functions.py +255 -0
- smftools-0.2.4/smftools/informatics/h5ad_functions.py +197 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics}/modkit_extract_to_adata.py +147 -61
- smftools-0.2.4/smftools/informatics/modkit_functions.py +129 -0
- smftools-0.2.4/smftools/informatics/ohe.py +160 -0
- smftools-0.2.4/smftools/informatics/pod5_functions.py +224 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics}/run_multiqc.py +5 -2
- smftools-0.2.4/smftools/machine_learning/__init__.py +12 -0
- smftools-0.2.4/smftools/machine_learning/data/__init__.py +2 -0
- smftools-0.2.4/smftools/machine_learning/data/anndata_data_module.py +234 -0
- smftools-0.2.4/smftools/machine_learning/evaluation/__init__.py +2 -0
- smftools-0.2.4/smftools/machine_learning/evaluation/eval_utils.py +31 -0
- smftools-0.2.4/smftools/machine_learning/evaluation/evaluators.py +223 -0
- smftools-0.2.4/smftools/machine_learning/inference/__init__.py +3 -0
- smftools-0.2.4/smftools/machine_learning/inference/inference_utils.py +27 -0
- smftools-0.2.4/smftools/machine_learning/inference/lightning_inference.py +68 -0
- smftools-0.2.4/smftools/machine_learning/inference/sklearn_inference.py +55 -0
- smftools-0.2.4/smftools/machine_learning/inference/sliding_window_inference.py +114 -0
- smftools-0.2.4/smftools/machine_learning/models/base.py +295 -0
- smftools-0.2.4/smftools/machine_learning/models/cnn.py +138 -0
- smftools-0.2.4/smftools/machine_learning/models/lightning_base.py +345 -0
- smftools-0.2.4/smftools/machine_learning/models/mlp.py +26 -0
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/models/positional.py +3 -2
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/models/rnn.py +2 -1
- smftools-0.2.4/smftools/machine_learning/models/sklearn_models.py +273 -0
- smftools-0.2.4/smftools/machine_learning/models/transformer.py +303 -0
- smftools-0.2.4/smftools/machine_learning/training/__init__.py +2 -0
- smftools-0.2.4/smftools/machine_learning/training/train_lightning_model.py +135 -0
- smftools-0.2.4/smftools/machine_learning/training/train_sklearn_model.py +114 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/plotting/__init__.py +4 -1
- smftools-0.2.4/smftools/plotting/autocorrelation_plotting.py +609 -0
- smftools-0.2.4/smftools/plotting/general_plotting.py +1403 -0
- smftools-0.2.4/smftools/plotting/hmm_plotting.py +260 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/plotting/position_stats.py +3 -3
- smftools-0.2.4/smftools/plotting/qc_plotting.py +270 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/__init__.py +15 -10
- smftools-0.2.4/smftools/preprocessing/append_base_context.py +131 -0
- smftools-0.2.4/smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
- smftools-0.2.4/smftools/preprocessing/archives/add_read_length_and_mapping_qc.py +129 -0
- smftools-0.2.4/smftools/preprocessing/binarize.py +17 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/binarize_on_Youden.py +11 -9
- smftools-0.2.4/smftools/preprocessing/calculate_complexity_II.py +248 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/calculate_coverage.py +25 -13
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/calculate_position_Youden.py +42 -26
- smftools-0.2.4/smftools/preprocessing/calculate_read_modification_stats.py +101 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/clean_NaN.py +17 -1
- smftools-0.2.4/smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
- smftools-0.2.4/smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
- smftools-0.2.4/smftools/preprocessing/flag_duplicate_reads.py +1351 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/invert_adata.py +12 -5
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/load_sample_sheet.py +19 -4
- smftools-0.2.4/smftools/preprocessing/reindex_references_adata.py +37 -0
- smftools-0.2.4/smftools/readwrite.py +1224 -0
- smftools-0.2.4/smftools/tools/__init__.py +20 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/calculate_umap.py +5 -5
- {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/general_tools.py +3 -3
- smftools-0.2.4/smftools/tools/position_stats.py +601 -0
- smftools-0.2.4/smftools/tools/read_stats.py +184 -0
- smftools-0.2.4/smftools/tools/spatial_autocorrelation.py +562 -0
- smftools-0.1.7/docs/source/basic_usage.md +0 -75
- smftools-0.1.7/smftools/_version.py +0 -1
- smftools-0.1.7/smftools/informatics/__init__.py +0 -16
- smftools-0.1.7/smftools/informatics/fast5_to_pod5.py +0 -21
- smftools-0.1.7/smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
- smftools-0.1.7/smftools/informatics/helpers/__init__.py +0 -74
- smftools-0.1.7/smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
- smftools-0.1.7/smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
- smftools-0.1.7/smftools/informatics/helpers/bam_qc.py +0 -66
- smftools-0.1.7/smftools/informatics/helpers/bed_to_bigwig.py +0 -39
- smftools-0.1.7/smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
- smftools-0.1.7/smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
- smftools-0.1.7/smftools/informatics/helpers/index_fasta.py +0 -12
- smftools-0.1.7/smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
- smftools-0.1.7/smftools/informatics/load_adata.py +0 -182
- smftools-0.1.7/smftools/informatics/readwrite.py +0 -106
- smftools-0.1.7/smftools/informatics/subsample_fasta_from_bed.py +0 -47
- smftools-0.1.7/smftools/plotting/general_plotting.py +0 -205
- smftools-0.1.7/smftools/preprocessing/append_C_context.py +0 -82
- smftools-0.1.7/smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
- smftools-0.1.7/smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
- smftools-0.1.7/smftools/preprocessing/filter_reads_on_length.py +0 -51
- smftools-0.1.7/smftools/preprocessing/flag_duplicate_reads.py +0 -149
- smftools-0.1.7/smftools/preprocessing/make_dirs.py +0 -21
- smftools-0.1.7/smftools/readwrite.py +0 -198
- smftools-0.1.7/smftools/tools/__init__.py +0 -49
- smftools-0.1.7/smftools/tools/call_hmm_peaks.py +0 -105
- smftools-0.1.7/smftools/tools/data/__init__.py +0 -2
- smftools-0.1.7/smftools/tools/data/anndata_data_module.py +0 -90
- smftools-0.1.7/smftools/tools/inference/__init__.py +0 -1
- smftools-0.1.7/smftools/tools/inference/lightning_inference.py +0 -41
- smftools-0.1.7/smftools/tools/models/base.py +0 -14
- smftools-0.1.7/smftools/tools/models/cnn.py +0 -34
- smftools-0.1.7/smftools/tools/models/lightning_base.py +0 -41
- smftools-0.1.7/smftools/tools/models/mlp.py +0 -17
- smftools-0.1.7/smftools/tools/models/sklearn_models.py +0 -40
- smftools-0.1.7/smftools/tools/models/transformer.py +0 -133
- smftools-0.1.7/smftools/tools/position_stats.py +0 -239
- smftools-0.1.7/smftools/tools/read_stats.py +0 -70
- smftools-0.1.7/smftools/tools/training/__init__.py +0 -1
- smftools-0.1.7/smftools/tools/training/train_lightning_model.py +0 -47
- {smftools-0.1.7 → smftools-0.2.4}/.gitattributes +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/.gitignore +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/.readthedocs.yaml +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/CONTRIBUTING.md +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/LICENSE +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/Makefile +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/make.bat +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/_static/converted_BAM_to_adata.png +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/_static/modkit_extract_to_adata.png +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/_static/smftools-1.svg +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/_static/smftools-1.tif +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/_static/smftools_informatics_diagram.pdf +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/_static/smftools_informatics_diagram.png +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/_static/smftools_preprocessing_diagram.png +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/_templates/tmp +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/api/datasets.md +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/api/index.md +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/api/informatics.md +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/api/preprocessing.md +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/api/tools.md +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/conf.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/contributors.md +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/dev/index.md +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/index.md +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/references.bib +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/references.rst +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/release-notes/0.1.0.md +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/release-notes/index.md +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/requirements.txt +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/docs/source/tutorials/index.md +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/notebooks/Kissiov_and_McKenna_2025_example_notebook.ipynb +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/notebooks/Kissiov_and_McKenna_2025_sample_sheet.csv +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/sample_sheet.csv +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/_settings.py +0 -0
- {smftools-0.1.7/smftools/tools/evaluation → smftools-0.2.4/smftools/cli}/__init__.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/datasets/F1_sample_sheet.csv +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/datasets/__init__.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/datasets/datasets.py +0 -0
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/hmm/archived}/calculate_distances.py +0 -0
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/hmm}/hmm_readwrite.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/informatics/archived/bam_conversion.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/informatics/archived/bam_direct.py +0 -0
- {smftools-0.1.7/smftools/informatics → smftools-0.2.4/smftools/informatics/archived}/basecall_pod5s.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/informatics/archived/basecalls_to_adata.py +0 -0
- {smftools-0.1.7/smftools/informatics → smftools-0.2.4/smftools/informatics/archived}/conversion_smf.py +0 -0
- {smftools-0.1.7/smftools/informatics → smftools-0.2.4/smftools/informatics/archived}/direct_smf.py +0 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/canoncall.py +0 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/get_native_references.py +0 -0
- {smftools-0.1.7/smftools/informatics → smftools-0.2.4/smftools/informatics/archived}/helpers/archived/informatics.py +0 -0
- {smftools-0.1.7/smftools/informatics → smftools-0.2.4/smftools/informatics/archived}/helpers/archived/load_adata.py +0 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/modcall.py +0 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/ohe_batching.py +0 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/ohe_layers_decode.py +0 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/one_hot_decode.py +0 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/one_hot_encode.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/informatics/archived/print_bam_query_seq.py +0 -0
- {smftools-0.1.7/smftools/informatics → smftools-0.2.4/smftools/informatics/archived}/subsample_pod5.py +0 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics}/complement_base_list.py +0 -0
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/data/preprocessing.py +0 -0
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/models/__init__.py +0 -0
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/models/wrappers.py +0 -0
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/utils/__init__.py +0 -0
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/utils/device.py +0 -0
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/utils/grl.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/plotting/classifiers.py +0 -0
- {smftools-0.1.7/smftools/preprocessing → smftools-0.2.4/smftools/preprocessing/archives}/calculate_complexity.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/archives/mark_duplicates.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/archives/preprocessing.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/archives/remove_duplicates.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/binary_layers_to_ohe.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/calculate_consensus.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/calculate_pairwise_differences.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/calculate_read_length_stats.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/filter_adata_by_nan_proportion.py +0 -0
- {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/preprocessing}/make_dirs.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/min_non_diagonal.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/recipes.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/subsample_adata.py +0 -0
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/tools/archived}/apply_hmm.py +0 -0
- {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/tools/archived}/classifiers.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/archived/classify_methylated_features.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/archived/classify_non_methylated_features.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/archived/subset_adata_v1.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/archived/subset_adata_v2.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/cluster_adata_on_methylation.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/subset_adata.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/tests/datasets/test_datasets.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/tests/informatics/helpers/test_LoadExperimentConfig.py +0 -0
- {smftools-0.1.7 → smftools-0.2.4}/tests/test_readwrite.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: smftools
|
|
3
|
-
Version: 0.1.7
|
|
3
|
+
Version: 0.2.4
|
|
4
4
|
Summary: Single Molecule Footprinting Analysis in Python.
|
|
5
5
|
Project-URL: Source, https://github.com/jkmckenna/smftools
|
|
6
6
|
Project-URL: Documentation, https://smftools.readthedocs.io/
|
|
@@ -43,9 +43,11 @@ Classifier: Programming Language :: Python :: 3.11
|
|
|
43
43
|
Classifier: Programming Language :: Python :: 3.12
|
|
44
44
|
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
45
45
|
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
46
|
-
Requires-Python:
|
|
46
|
+
Requires-Python: <3.13,>=3.9
|
|
47
47
|
Requires-Dist: anndata>=0.10.0
|
|
48
48
|
Requires-Dist: biopython>=1.79
|
|
49
|
+
Requires-Dist: captum
|
|
50
|
+
Requires-Dist: click
|
|
49
51
|
Requires-Dist: fastcluster
|
|
50
52
|
Requires-Dist: hydra-core
|
|
51
53
|
Requires-Dist: igraph
|
|
@@ -57,15 +59,18 @@ Requires-Dist: numpy<2,>=1.22.0
|
|
|
57
59
|
Requires-Dist: omegaconf
|
|
58
60
|
Requires-Dist: pandas>=1.4.2
|
|
59
61
|
Requires-Dist: pod5>=0.1.21
|
|
60
|
-
Requires-Dist:
|
|
62
|
+
Requires-Dist: pybedtools>=0.12.0
|
|
63
|
+
Requires-Dist: pybigwig>=0.3.24
|
|
61
64
|
Requires-Dist: pyfaidx>=0.8.0
|
|
62
65
|
Requires-Dist: pysam>=0.19.1
|
|
63
66
|
Requires-Dist: scanpy>=1.9
|
|
64
67
|
Requires-Dist: scikit-learn>=1.0.2
|
|
65
68
|
Requires-Dist: scipy>=1.7.3
|
|
66
69
|
Requires-Dist: seaborn>=0.11
|
|
70
|
+
Requires-Dist: shap
|
|
67
71
|
Requires-Dist: torch>=1.9.0
|
|
68
72
|
Requires-Dist: tqdm
|
|
73
|
+
Requires-Dist: upsetplot
|
|
69
74
|
Requires-Dist: wandb
|
|
70
75
|
Provides-Extra: docs
|
|
71
76
|
Requires-Dist: ipython>=7.20; extra == 'docs'
|
|
@@ -91,33 +96,39 @@ Description-Content-Type: text/markdown
|
|
|
91
96
|
[](https://smftools.readthedocs.io/en/latest/?badge=latest)
|
|
92
97
|
|
|
93
98
|
# smftools
|
|
94
|
-
A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing,
|
|
99
|
+
A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, spatial analyses, and HMM based feature annotation.
|
|
95
100
|
|
|
96
101
|
## Philosophy
|
|
97
|
-
While
|
|
102
|
+
While genomic data structures (SAM/BAM) were built to handle low-coverage data (<1000X) along large references, smftools prioritizes high-coverage data (scalable to >1,000,000X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
|
|
98
103
|
|
|
99
104
|
## Dependencies
|
|
100
105
|
The following CLI tools need to be installed and configured before using the informatics (smftools.inform) module of smftools:
|
|
101
|
-
1) [Dorado](https://github.com/nanoporetech/dorado) ->
|
|
102
|
-
2) [
|
|
103
|
-
3) [
|
|
104
|
-
4) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting summary statistics and read level methylation calls from modified BAM files
|
|
105
|
-
5) [Bedtools](https://github.com/arq5x/bedtools2) -> For generating Bedgraphs from BAM alignment files.
|
|
106
|
-
6) [BedGraphToBigWig](https://genome.ucsc.edu/goldenPath/help/bigWig.html) -> For converting BedGraphs to BigWig files for IGV sessions.
|
|
106
|
+
1) [Dorado](https://github.com/nanoporetech/dorado) -> Basecalling, alignment, demultiplexing.
|
|
107
|
+
2) [Minimap2](https://github.com/lh3/minimap2) -> Alignment if not using dorado.
|
|
108
|
+
3) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting read level methylation metrics from modified BAM files. Only required for direct modification detection (ie methylation).
|
|
107
109
|
|
|
108
|
-
##
|
|
109
|
-
###
|
|
110
|
+
## Main Commands
|
|
111
|
+
### smftools load: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
|
|
110
112
|

|
|
111
|
-
###
|
|
113
|
+
### smftools preprocess: Appends QC metrics to the AnnData object and performs filtering.
|
|
112
114
|

|
|
113
|
-
###
|
|
114
|
-
- Currently Includes: Position X Position correlation matrices,
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
115
|
+
### smftools spatial: Appends spatial analyses to the AnnData object.
|
|
116
|
+
- Currently Includes: Position X Position correlation matrices, clustering, dimensionality reduction, spatial autocorrelation.
|
|
117
|
+
### smftools hmm: Fits a basic HMM to each sample and appends HMM feature layers
|
|
118
|
+
- Main outputs will be stored in adata.layers
|
|
119
|
+
### smftools batch <command>: Performs batch processing on a csv of config file paths for any of the above commands.
|
|
120
|
+
- Nice when analyzing multiple experiments
|
|
121
|
+
### smftools concatenate: Concatenates a list or directory of anndata objects.
|
|
122
|
+
- Mainly used for combining multiple experiments into a single anndata object.
|
|
118
123
|
|
|
119
124
|
## Announcements
|
|
120
125
|
|
|
126
|
+
### 12/02/25 - Version 0.2.3 is available through PyPI
|
|
127
|
+
Version 0.2.3 provides the core smftools functionality through several command line commands (load, preprocess, spatial, hmm).
|
|
128
|
+
|
|
129
|
+
### 11/05/25 - Version 0.2.1 is available through PyPI
|
|
130
|
+
Version 0.2.1 makes the core workflow (smftools load) a command line tool that takes in an experiment_config.csv file for input/output and parameter management.
|
|
131
|
+
|
|
121
132
|
### 05/29/25 - Version 0.1.6 is available through PyPI.
|
|
122
133
|
Informatics, preprocessing, tools, plotting modules have core functionality that is approaching stability on MacOS(Intel/Silicon) and Linux(Ubuntu). I will work on improving documentation/tutorials shortly. The base PyTorch/Scikit-Learn ML-infrastructure is going through some organizational changes to work with PyTorch Lightning, Hydra, and WanDB to facilitate organizational scaling, multi-device usage, and logging.
|
|
123
134
|
|
|
@@ -2,33 +2,39 @@
|
|
|
2
2
|
[](https://smftools.readthedocs.io/en/latest/?badge=latest)
|
|
3
3
|
|
|
4
4
|
# smftools
|
|
5
|
-
A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing,
|
|
5
|
+
A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, spatial analyses, and HMM based feature annotation.
|
|
6
6
|
|
|
7
7
|
## Philosophy
|
|
8
|
-
While
|
|
8
|
+
While genomic data structures (SAM/BAM) were built to handle low-coverage data (<1000X) along large references, smftools prioritizes high-coverage data (scalable to >1,000,000X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
|
|
9
9
|
|
|
10
10
|
## Dependencies
|
|
11
11
|
The following CLI tools need to be installed and configured before using the informatics (smftools.inform) module of smftools:
|
|
12
|
-
1) [Dorado](https://github.com/nanoporetech/dorado) ->
|
|
13
|
-
2) [
|
|
14
|
-
3) [
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
## Modules
|
|
20
|
-
### Informatics: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
|
|
12
|
+
1) [Dorado](https://github.com/nanoporetech/dorado) -> Basecalling, alignment, demultiplexing.
|
|
13
|
+
2) [Minimap2](https://github.com/lh3/minimap2) -> Alignment if not using dorado.
|
|
14
|
+
3) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting read level methylation metrics from modified BAM files. Only required for direct modification detection (ie methylation).
|
|
15
|
+
|
|
16
|
+
## Main Commands
|
|
17
|
+
### smftools load: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
|
|
21
18
|

|
|
22
|
-
###
|
|
19
|
+
### smftools preprocess: Appends QC metrics to the AnnData object and performs filtering.
|
|
23
20
|

|
|
24
|
-
###
|
|
25
|
-
- Currently Includes: Position X Position correlation matrices,
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
21
|
+
### smftools spatial: Appends spatial analyses to the AnnData object.
|
|
22
|
+
- Currently Includes: Position X Position correlation matrices, clustering, dimensionality reduction, spatial autocorrelation.
|
|
23
|
+
### smftools hmm: Fits a basic HMM to each sample and appends HMM feature layers
|
|
24
|
+
- Main outputs will be stored in adata.layers
|
|
25
|
+
### smftools batch <command>: Performs batch processing on a csv of config file paths for any of the above commands.
|
|
26
|
+
- Nice when analyzing multiple experiments
|
|
27
|
+
### smftools concatenate: Concatenates a list or directory of anndata objects.
|
|
28
|
+
- Mainly used for combining multiple experiments into a single anndata object.
|
|
29
29
|
|
|
30
30
|
## Announcements
|
|
31
31
|
|
|
32
|
+
### 12/02/25 - Version 0.2.3 is available through PyPI
|
|
33
|
+
Version 0.2.3 provides the core smftools functionality through several command line commands (load, preprocess, spatial, hmm).
|
|
34
|
+
|
|
35
|
+
### 11/05/25 - Version 0.2.1 is available through PyPI
|
|
36
|
+
Version 0.2.1 makes the core workflow (smftools load) a command line tool that takes in an experiment_config.csv file for input/output and parameter management.
|
|
37
|
+
|
|
32
38
|
### 05/29/25 - Version 0.1.6 is available through PyPI.
|
|
33
39
|
Informatics, preprocessing, tools, plotting modules have core functionality that is approaching stability on MacOS(Intel/Silicon) and Linux(Ubuntu). I will work on improving documentation/tutorials shortly. The base PyTorch/Scikit-Learn ML-infrastructure is going through some organizational changes to work with PyTorch Lightning, Hydra, and WanDB to facilitate organizational scaling, multi-device usage, and logging.
|
|
34
40
|
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# Basic Usage
|
|
2
|
+
|
|
3
|
+
## Load Usage
|
|
4
|
+
|
|
5
|
+
Many use cases for smftools begin here. For most users, the call below will be sufficient to convert any raw SMF dataset from Nanopore/Illumina to an AnnData object:
|
|
6
|
+
|
|
7
|
+
```shell
|
|
8
|
+
smftools load "/Path_to_experiment_config.csv"
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
This command takes a user passed config file handling:
|
|
12
|
+
- I/O paths (with data input path, FASTA path, optional BED path for subsampling FASTA, and a data output path)
|
|
13
|
+
- Experiment info (SMF modality, sequencer type, barcoding kit if nanopore, sample sheet with metadata mapping)
|
|
14
|
+
- Options to override default workflow parameters from smftools/config. Params are handled from default.yaml -> modality_type.yaml -> user passed config.csv.
|
|
15
|
+
|
|
16
|
+
## Preprocess Usage
|
|
17
|
+
|
|
18
|
+
This command performs preprocessing on the anndata object. It automatically runs the load command under the hood if starting from raw data.
|
|
19
|
+
|
|
20
|
+
```shell
|
|
21
|
+
smftools preprocess "/Path_to_experiment_config.csv"
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Spatial Usage
|
|
25
|
+
|
|
26
|
+
This command performs spatial analysis on the anndata object. It automatically runs the load command and preprocessing under the hood if they have not been already run.
|
|
27
|
+
|
|
28
|
+
```shell
|
|
29
|
+
smftools spatial "/Path_to_experiment_config.csv"
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## HMM Usage
|
|
33
|
+
|
|
34
|
+
This command performs hmm based feature annotation on the anndata object. It automatically runs the load command and preprocessing under the hood if they have not been already run.
|
|
35
|
+
|
|
36
|
+
```shell
|
|
37
|
+
smftools hmm "/Path_to_experiment_config.csv"
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Batch Usage
|
|
41
|
+
|
|
42
|
+
This command performs batch processing of any of the above commands across multiple experiments. It takes in a tsv, txt, or csv of experiment specific config csvs.
|
|
43
|
+
```shell
|
|
44
|
+
smftools batch preprocess "/Path_to_experiment_config_path_list.csv"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Concatenate Usage
|
|
48
|
+
|
|
49
|
+
This command concatenates multiple h5ad files and saves them to a new output. The h5ads to concatenate are provided as a txt, tsv, or csv file of paths.
|
|
50
|
+
```shell
|
|
51
|
+
smftools concatenate output.h5ad "/Path_to_h5ad_path_list.csv"
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Reading AnnData objects created by smftools
|
|
55
|
+
|
|
56
|
+
After creating an AnnData object holding your experiment's SMF data, you can load the AnnData object as so:
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
import smftools as smf
|
|
60
|
+
input_adata = "/Path_to_experiment_AnnData.h5ad.gz"
|
|
61
|
+
adata = smf.safe_read_h5ad(input_adata)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
This custom read function can also take an optional directory of pickle files, written by the safe_write_h5ad function, holding data types that cannot normally be saved directly in HDF5 format.
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
If you don't have an AnnData object yet, but want to play with the downstream Preprocessing, Tools, and Plotting modules, you can load a pre-loaded SMF dataset.
|
|
68
|
+
|
|
69
|
+
Currently, you can do this with our lab's in vitro dCas9 binding kinetics dataset, a Hia5 SMF dataset generated with direct m6A high-accuracy basecalls:
|
|
70
|
+
|
|
71
|
+
```
|
|
72
|
+
adata = smf.datasets.dCas9_kinetics()
|
|
73
|
+
adata.obs_names_make_unique()
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Alternatively, you can do this with our lab's M.CviPI SMF test data in F1-hybrid natural killer cells generated by NEB EMseq conversion followed by canonical basecalling:
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
adata = smf.datasets.Kissiov_and_McKenna_2025()
|
|
80
|
+
adata.obs_names_make_unique()
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Writing out AnnData objects to save analysis progress
|
|
84
|
+
|
|
85
|
+
After preprocessing and downstream analysis of the AnnData object, you can save the AnnData object at any step as so:
|
|
86
|
+
|
|
87
|
+
```
|
|
88
|
+
import smftools as smf
|
|
89
|
+
from pathlib import Path
|
|
90
|
+
|
|
91
|
+
output_dir = Path('/Path_to_output_directory')
|
|
92
|
+
output_adata = 'analyzed_adata.h5ad.gz'
|
|
93
|
+
final_output_path = output_dir / output_adata
|
|
94
|
+
smf.safe_write_h5ad(adata, final_output_path, compression='gzip')
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
This custom save function will make a directory of pickle files for data types that can not normally be saved directly in hdf5 formatting.
|
|
98
|
+
|
|
99
|
+
## Troubleshooting
|
|
100
|
+
For more advanced usage and help troubleshooting, the API and tutorials for each of the modules are still being developed.
|
|
101
|
+
However, you can currently learn about the functions contained within the module by calling:
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
smf.inform.__all__
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
This lists the functions within any given module. If you want to see the associated docstring for a given function, here is an example:
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
print(smf.inform.load_adata.__doc__)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
These docstrings will provide a brief description of the function and also tell you the input parameters and what the function returns.
|
|
114
|
+
In some cases, usage examples will also be provided in the docstring in the form of doctests.
|
|
@@ -16,7 +16,7 @@ conda activate smftools
|
|
|
16
16
|
pip install smftools
|
|
17
17
|
```
|
|
18
18
|
|
|
19
|
-
Ensure that you can access dorado,
|
|
19
|
+
Ensure that you can access dorado, modkit, and minimap2 executables from the terminal in this environment.
|
|
20
20
|
You may need to add them to $PATH if they are not globally configured.
|
|
21
21
|
For example, if you want to check if dorado is executable, simply run this in the terminal:
|
|
22
22
|
|
|
@@ -24,10 +24,10 @@ For example, if you want to check if dorado is executable, simply run this in th
|
|
|
24
24
|
dorado
|
|
25
25
|
```
|
|
26
26
|
|
|
27
|
-
On Mac OSX, the following can be used to congigure
|
|
27
|
+
On Mac OSX, the following can be used to configure minimap2 (with brew) and BedGraphToBigWig (with wget).
|
|
28
28
|
|
|
29
29
|
```shell
|
|
30
|
-
brew install
|
|
30
|
+
brew install minimap2
|
|
31
31
|
wget http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/bedGraphToBigWig
|
|
32
32
|
chmod +x bedGraphToBigWig
|
|
33
33
|
sudo mv bedGraphToBigWig /usr/local/bin/
|
|
@@ -47,7 +47,10 @@ A python virtual environment can be created as an alternative to conda. I like t
|
|
|
47
47
|
```shell
|
|
48
48
|
python -m venv venv-smftools
|
|
49
49
|
source venv-smftools/bin/activate
|
|
50
|
+
pip install --upgrade pip
|
|
50
51
|
pip install .
|
|
52
|
+
pip install ipykernel jupyter
|
|
53
|
+
python -m ipykernel install --user --name=venv-smftools --display-name "Python (smftools)"
|
|
51
54
|
```
|
|
52
55
|
|
|
53
56
|
Subsequent use of the installed version of smftools can be run by changing to the smftools directory and activating the venv:
|
|
@@ -5,15 +5,10 @@ fasta,/path_to_fasta.fasta,Path to initial FASTA file,,str
|
|
|
5
5
|
fasta_regions_of_interest,/path_to_bed.bed,Path to a bed file to subsample the fasta on.,,str
|
|
6
6
|
output_directory,/outputs,Directory to act as root for all analysis outputs,,str
|
|
7
7
|
experiment_name,,An experiment name for the final h5ad file,,str
|
|
8
|
+
model_dir,/path_to_dorado_model_dir,Path,,str
|
|
8
9
|
model,None,The dorado basecalling model to use,,str
|
|
9
10
|
barcode_kit,SQK-NBD114-24,The barcoding kit used for the experiment,,str
|
|
10
11
|
mapping_threshold,0.05,Minimum proportion of reads mapping to a reference to further use that reference (Ranges from 0-1 as a proportion of mapped reads),,float
|
|
11
|
-
filter_threshold,0.8,Minimum probability to call a canonical base identity,,float
|
|
12
|
-
m6A_threshold,0.8,Minimum probability to flag m6A as True,,float
|
|
13
|
-
m5C_threshold,0.8,Minimum probability to flag m5C as True,,float
|
|
14
|
-
hm5C_threshold,0.8,Minimum probability to flag hm5C as True,,float
|
|
15
12
|
mod_list,[5mC_5hmC],Modified base names for Dorado,"""6mA"", ""5mC_5hmC""",list
|
|
16
13
|
batch_size,4,number of samples to analyze at a time,,int
|
|
17
|
-
conversion_types,[5mC],Types of modification types to use in conversion SMF,"5mC', '6mA'",list
|
|
18
|
-
barcode_both_ends,TRUE,whether to require both ends of a read to be barcoded for demultiplexing,,bool
|
|
19
|
-
trim,FALSE,whether to trim barcodes and adapters from reads during demultiplexing,,bool
|
|
14
|
+
conversion_types,[5mC],Types of modification types to use in conversion SMF,"5mC', '6mA'",list
|
|
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "smftools"
|
|
7
7
|
description = "Single Molecule Footprinting Analysis in Python."
|
|
8
|
-
requires-python = ">=3.9"
|
|
8
|
+
requires-python = ">=3.9,<3.13"
|
|
9
9
|
license = { file = "LICENSE" }
|
|
10
10
|
authors = [
|
|
11
11
|
{name = "Joseph McKenna"}
|
|
@@ -42,6 +42,8 @@ classifiers = [
|
|
|
42
42
|
dependencies = [
|
|
43
43
|
"anndata>=0.10.0",
|
|
44
44
|
"biopython>=1.79",
|
|
45
|
+
"captum",
|
|
46
|
+
"click",
|
|
45
47
|
"fastcluster",
|
|
46
48
|
"hydra-core",
|
|
47
49
|
"igraph",
|
|
@@ -53,15 +55,18 @@ dependencies = [
|
|
|
53
55
|
"omegaconf",
|
|
54
56
|
"pandas>=1.4.2",
|
|
55
57
|
"pod5>=0.1.21",
|
|
56
|
-
"pomegranate>=1.0.0",
|
|
57
58
|
"pyfaidx>=0.8.0",
|
|
59
|
+
"pybedtools>=0.12.0",
|
|
60
|
+
"pyBigWig>=0.3.24",
|
|
58
61
|
"pysam>=0.19.1",
|
|
59
62
|
"scanpy>=1.9",
|
|
60
63
|
"scikit-learn>=1.0.2",
|
|
61
64
|
"scipy>=1.7.3",
|
|
65
|
+
"shap",
|
|
62
66
|
"seaborn>=0.11",
|
|
63
67
|
"torch>=1.9.0",
|
|
64
68
|
"tqdm",
|
|
69
|
+
"upsetplot",
|
|
65
70
|
"wandb"
|
|
66
71
|
]
|
|
67
72
|
dynamic = ["version"]
|
|
@@ -70,6 +75,9 @@ dynamic = ["version"]
|
|
|
70
75
|
Source = "https://github.com/jkmckenna/smftools"
|
|
71
76
|
Documentation = "https://smftools.readthedocs.io/"
|
|
72
77
|
|
|
78
|
+
[project.scripts]
|
|
79
|
+
smftools = "smftools.cli_entry:cli"
|
|
80
|
+
|
|
73
81
|
[project.optional-dependencies]
|
|
74
82
|
tests = [
|
|
75
83
|
"pytest",
|
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
# Essential packages
|
|
2
2
|
anndata>=0.10.0
|
|
3
3
|
biopython>=1.79
|
|
4
|
+
captum
|
|
5
|
+
click
|
|
4
6
|
fastcluster
|
|
5
7
|
hydra-core
|
|
6
8
|
leidenalg
|
|
@@ -14,13 +16,16 @@ numpy>=1.22.0,<2
|
|
|
14
16
|
omegaconf
|
|
15
17
|
pandas>=1.4.2
|
|
16
18
|
pod5>=0.1.21
|
|
17
|
-
|
|
19
|
+
pybedtools>=0.12.0
|
|
20
|
+
pyBigWig>=0.3.24
|
|
18
21
|
pyfaidx>=0.8.0
|
|
19
22
|
pysam>=0.19.1
|
|
20
|
-
scanpy>=1.
|
|
23
|
+
scanpy>=1.11
|
|
21
24
|
scikit-learn>=1.0.2
|
|
22
25
|
scipy>=1.7.3
|
|
23
26
|
seaborn>=0.11
|
|
27
|
+
shap
|
|
24
28
|
torch>=1.9.0
|
|
25
29
|
tqdm
|
|
30
|
+
upsetplot
|
|
26
31
|
wandb
|
|
@@ -4,12 +4,13 @@ import logging
|
|
|
4
4
|
import warnings
|
|
5
5
|
|
|
6
6
|
from . import informatics as inform
|
|
7
|
+
from . import machine_learning as ml
|
|
8
|
+
from . import plotting as pl
|
|
7
9
|
from . import preprocessing as pp
|
|
8
10
|
from . import tools as tl
|
|
9
|
-
from . import plotting as pl
|
|
10
|
-
from . import readwrite, datasets
|
|
11
|
-
from .readwrite import adata_to_df, safe_write_h5ad, merge_barcoded_anndatas
|
|
12
11
|
|
|
12
|
+
from . import cli, config, datasets, hmm
|
|
13
|
+
from .readwrite import adata_to_df, safe_write_h5ad, safe_read_h5ad, merge_barcoded_anndatas_core
|
|
13
14
|
|
|
14
15
|
from importlib.metadata import version
|
|
15
16
|
|
|
@@ -19,11 +20,11 @@ __version__ = version(package_name)
|
|
|
19
20
|
# Names exported by ``from smftools import *``.
# Every entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which would silently merge two names into one
# nonexistent attribute and make the star-import fail.
__all__ = [
    "adata_to_df",
    "inform",
    "ml",
    "pp",
    "tl",
    "pl",
    "datasets",
    "safe_write_h5ad",
    "safe_read_h5ad",
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.4"
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
def flow_I(config_path):
    """
    High-level function to call for converting raw sequencing data to an adata object.
    Command line accesses this through smftools load <config_path>
    Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
    Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
    Also works for illumina fastq and unaligned BAM for conversion SMF workflows.

    The experiment config is loaded and validated, the output directory is
    created, and then the load, preprocess and spatial stages are run in
    order. Each stage is handed only ``config_path``; their return values are
    not used here.

    Parameters:
        config_path (str): A string representing the file path to the experiment configuration csv file.

    Returns:
        None
    """
    # This module lives in smftools/cli/archived/, so the stage modules are one
    # package up (smftools/cli/) and readwrite/config are two packages up.
    from ...readwrite import make_dirs
    from ...config import LoadExperimentConfig, ExperimentConfig
    from ..load_adata import load_adata
    from ..preprocess_adata import preprocess_adata
    # Aliased so nothing in this function rebinds the imported stage function.
    from ..spatial_adata import spatial_adata as run_spatial_adata

    from datetime import datetime
    from importlib import resources
    from pathlib import Path

    date_str = datetime.today().strftime("%y%m%d")

    ################################### 1) General params and input organization ###################################
    # Load and validate the experiment config (defaults are resolved from smftools/config).
    loader = LoadExperimentConfig(config_path)
    defaults_dir = resources.files("smftools").joinpath("config")
    cfg, _report = ExperimentConfig.from_var_dict(
        loader.var_dict, date_str=date_str, defaults_dir=defaults_dir
    )

    # Make initial output directory
    make_dirs([Path(cfg.output_directory)])

    ############################################### smftools load ###############################################
    load_adata(config_path)

    ############################################### smftools preprocess ###############################################
    preprocess_adata(config_path)

    ############################################### smftools spatial ###############################################
    run_spatial_adata(config_path)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
import anndata as ad
|
|
4
|
+
from ..readwrite import safe_write_h5ad
|
|
5
|
+
|
|
6
|
+
@dataclass
class AdataPaths:
    """Container for the AnnData file paths of one experiment, one per stage."""

    # AnnData written before any preprocessing.
    raw: Path
    # AnnData after preprocessing.
    pp: Path
    # Preprocessed AnnData with duplicates removed.
    pp_dedup: Path
    # AnnData with spatial analyses appended.
    spatial: Path
    # AnnData with HMM results appended.
    hmm: Path
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def get_adata_paths(cfg) -> AdataPaths:
    """
    Derive every standard AnnData path for an experiment from its config.

    All files are placed in ``<cfg.output_directory>/h5ads`` and are named
    after ``cfg.experiment_name``. When ``cfg.smf_modality`` is ``"direct"``
    the duplicate-removed path is the preprocessed path itself; otherwise it
    is a separate ``_preprocessed_duplicates_removed`` file. The spatial and
    hmm file names extend the duplicate-removed file name.

    Parameters:
        cfg: Config object exposing ``output_directory``, ``experiment_name``
            and ``smf_modality``.

    Returns:
        AdataPaths: The raw, pp, pp_dedup, spatial and hmm paths.
    """
    out_root = Path(cfg.output_directory) / "h5ads"

    raw_path = out_root / f"{cfg.experiment_name}.h5ad.gz"
    pp_path = out_root / f"{cfg.experiment_name}_preprocessed.h5ad.gz"

    # direct SMF: duplicate-removed path is just preprocessed path
    dedup_path = (
        pp_path
        if cfg.smf_modality == "direct"
        else out_root / f"{cfg.experiment_name}_preprocessed_duplicates_removed.h5ad.gz"
    )

    # Later stages append their tag to the dedup file name minus its extension.
    stem = dedup_path.name.removesuffix(".h5ad.gz")

    return AdataPaths(
        raw=raw_path,
        pp=pp_path,
        pp_dedup=dedup_path,
        spatial=out_root / f"{stem}_spatial.h5ad.gz",
        hmm=out_root / f"{stem}_spatial_hmm.h5ad.gz",
    )
|
|
43
|
+
|
|
44
|
+
def write_gz_h5ad(adata: ad.AnnData, path: Path) -> Path:
    """
    Write ``adata`` with gzip compression to a path that ends in ``.gz``.

    If ``path`` does not already have a ``.gz`` suffix, ``.gz`` is appended to
    its file name. The write is delegated to ``safe_write_h5ad`` with
    ``compression="gzip"`` and ``backup=True``.

    Parameters:
        adata: The AnnData object to write.
        path: Requested output path.

    Returns:
        Path: The path handed to ``safe_write_h5ad`` (with ``.gz`` ensured).
    """
    target = path if path.suffix == ".gz" else path.with_name(path.name + ".gz")
    safe_write_h5ad(adata, target, compression="gzip", backup=True)
    return target
|