smftools 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (106) hide show
  1. smftools-0.1.0/.gitattributes +2 -0
  2. smftools-0.1.0/.gitignore +45 -0
  3. smftools-0.1.0/LICENSE +21 -0
  4. smftools-0.1.0/PKG-INFO +75 -0
  5. smftools-0.1.0/README.md +9 -0
  6. smftools-0.1.0/experiment_config.csv +20 -0
  7. smftools-0.1.0/pyproject.toml +132 -0
  8. smftools-0.1.0/requirements.txt +14 -0
  9. smftools-0.1.0/src/smftools/__init__.py +27 -0
  10. smftools-0.1.0/src/smftools/_settings.py +19 -0
  11. smftools-0.1.0/src/smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  12. smftools-0.1.0/src/smftools/datasets/__init__.py +9 -0
  13. smftools-0.1.0/src/smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  14. smftools-0.1.0/src/smftools/datasets/datasets.py +25 -0
  15. smftools-0.1.0/src/smftools/informatics/__init__.py +11 -0
  16. smftools-0.1.0/src/smftools/informatics/helpers/__init__.py +42 -0
  17. smftools-0.1.0/src/smftools/informatics/helpers/align_BAM.py +49 -0
  18. smftools-0.1.0/src/smftools/informatics/helpers/binarize_converted_base_identities.py +24 -0
  19. smftools-0.1.0/src/smftools/informatics/helpers/canoncall.py +12 -0
  20. smftools-0.1.0/src/smftools/informatics/helpers/converted_BAM_to_adata.py +147 -0
  21. smftools-0.1.0/src/smftools/informatics/helpers/count_aligned_reads.py +32 -0
  22. smftools-0.1.0/src/smftools/informatics/helpers/extract_base_identities.py +36 -0
  23. smftools-0.1.0/src/smftools/informatics/helpers/extract_mods.py +39 -0
  24. smftools-0.1.0/src/smftools/informatics/helpers/find_conversion_sites.py +53 -0
  25. smftools-0.1.0/src/smftools/informatics/helpers/generate_converted_FASTA.py +59 -0
  26. smftools-0.1.0/src/smftools/informatics/helpers/get_native_references.py +25 -0
  27. smftools-0.1.0/src/smftools/informatics/helpers/informatics.py +260 -0
  28. smftools-0.1.0/src/smftools/informatics/helpers/load_adata.py +516 -0
  29. smftools-0.1.0/src/smftools/informatics/helpers/load_experiment_config.py +17 -0
  30. smftools-0.1.0/src/smftools/informatics/helpers/make_dirs.py +15 -0
  31. smftools-0.1.0/src/smftools/informatics/helpers/make_modbed.py +21 -0
  32. smftools-0.1.0/src/smftools/informatics/helpers/modQC.py +19 -0
  33. smftools-0.1.0/src/smftools/informatics/helpers/modcall.py +14 -0
  34. smftools-0.1.0/src/smftools/informatics/helpers/modkit_extract_to_adata.py +355 -0
  35. smftools-0.1.0/src/smftools/informatics/helpers/one_hot_encode.py +14 -0
  36. smftools-0.1.0/src/smftools/informatics/helpers/separate_bam_by_bc.py +28 -0
  37. smftools-0.1.0/src/smftools/informatics/helpers/split_and_index_BAM.py +21 -0
  38. smftools-0.1.0/src/smftools/informatics/pod5_conversion.py +26 -0
  39. smftools-0.1.0/src/smftools/informatics/pod5_direct.py +29 -0
  40. smftools-0.1.0/src/smftools/informatics/pod5_to_adata.py +17 -0
  41. smftools-0.1.0/src/smftools/informatics/readwrite.py +109 -0
  42. smftools-0.1.0/src/smftools/plotting/__init__.py +0 -0
  43. smftools-0.1.0/src/smftools/preprocessing/__init__.py +35 -0
  44. smftools-0.1.0/src/smftools/preprocessing/append_C_context.py +39 -0
  45. smftools-0.1.0/src/smftools/preprocessing/binarize_on_Youden.py +38 -0
  46. smftools-0.1.0/src/smftools/preprocessing/binary_layers_to_ohe.py +25 -0
  47. smftools-0.1.0/src/smftools/preprocessing/calculate_complexity.py +59 -0
  48. smftools-0.1.0/src/smftools/preprocessing/calculate_converted_read_methylation_stats.py +38 -0
  49. smftools-0.1.0/src/smftools/preprocessing/calculate_coverage.py +35 -0
  50. smftools-0.1.0/src/smftools/preprocessing/calculate_pairwise_hamming_distances.py +22 -0
  51. smftools-0.1.0/src/smftools/preprocessing/calculate_position_Youden.py +95 -0
  52. smftools-0.1.0/src/smftools/preprocessing/calculate_read_length_stats.py +27 -0
  53. smftools-0.1.0/src/smftools/preprocessing/clean_NaN.py +31 -0
  54. smftools-0.1.0/src/smftools/preprocessing/filter_converted_reads_on_methylation.py +20 -0
  55. smftools-0.1.0/src/smftools/preprocessing/filter_reads_on_length.py +31 -0
  56. smftools-0.1.0/src/smftools/preprocessing/invert_adata.py +18 -0
  57. smftools-0.1.0/src/smftools/preprocessing/mark_duplicates.py +110 -0
  58. smftools-0.1.0/src/smftools/preprocessing/min_non_diagonal.py +20 -0
  59. smftools-0.1.0/src/smftools/preprocessing/preprocessing.py +614 -0
  60. smftools-0.1.0/src/smftools/preprocessing/remove_duplicates.py +12 -0
  61. smftools-0.1.0/src/smftools/readwrite.py +109 -0
  62. smftools-0.1.0/src/smftools/tools/__init__.py +0 -0
  63. smftools-0.1.0/tests/__init__.py +0 -0
  64. smftools-0.1.0/tests/datasets/test_datasets.py +2 -0
  65. smftools-0.1.0/tests/informatics/helpers/test_align_BAM.py +49 -0
  66. smftools-0.1.0/tests/informatics/helpers/test_binarize_converted_base_identities.py +24 -0
  67. smftools-0.1.0/tests/informatics/helpers/test_canoncall.py +12 -0
  68. smftools-0.1.0/tests/informatics/helpers/test_converted_BAM_to_adata.py +147 -0
  69. smftools-0.1.0/tests/informatics/helpers/test_count_aligned_reads.py +32 -0
  70. smftools-0.1.0/tests/informatics/helpers/test_extract_base_identities.py +36 -0
  71. smftools-0.1.0/tests/informatics/helpers/test_extract_mods.py +39 -0
  72. smftools-0.1.0/tests/informatics/helpers/test_find_conversion_sites.py +53 -0
  73. smftools-0.1.0/tests/informatics/helpers/test_generate_converted_FASTA.py +59 -0
  74. smftools-0.1.0/tests/informatics/helpers/test_get_native_references.py +25 -0
  75. smftools-0.1.0/tests/informatics/helpers/test_informatics.py +260 -0
  76. smftools-0.1.0/tests/informatics/helpers/test_load_adata.py +516 -0
  77. smftools-0.1.0/tests/informatics/helpers/test_load_experiment_config.py +17 -0
  78. smftools-0.1.0/tests/informatics/helpers/test_make_dirs.py +15 -0
  79. smftools-0.1.0/tests/informatics/helpers/test_make_modbed.py +21 -0
  80. smftools-0.1.0/tests/informatics/helpers/test_modQC.py +19 -0
  81. smftools-0.1.0/tests/informatics/helpers/test_modcall.py +14 -0
  82. smftools-0.1.0/tests/informatics/helpers/test_modkit_extract_to_adata.py +355 -0
  83. smftools-0.1.0/tests/informatics/helpers/test_one_hot_encode.py +14 -0
  84. smftools-0.1.0/tests/informatics/helpers/test_separate_bam_by_bc.py +28 -0
  85. smftools-0.1.0/tests/informatics/helpers/test_split_and_index_BAM.py +21 -0
  86. smftools-0.1.0/tests/informatics/test_pod5_conversion.py +26 -0
  87. smftools-0.1.0/tests/informatics/test_pod5_direct.py +29 -0
  88. smftools-0.1.0/tests/informatics/test_pod5_to_adata.py +17 -0
  89. smftools-0.1.0/tests/preprocessing/test_append_C_context.py +39 -0
  90. smftools-0.1.0/tests/preprocessing/test_binarize_on_Youden.py +38 -0
  91. smftools-0.1.0/tests/preprocessing/test_binary_layers_to_ohe.py +25 -0
  92. smftools-0.1.0/tests/preprocessing/test_calculate_complexity.py +59 -0
  93. smftools-0.1.0/tests/preprocessing/test_calculate_converted_read_methylation_stats.py +38 -0
  94. smftools-0.1.0/tests/preprocessing/test_calculate_coverage.py +35 -0
  95. smftools-0.1.0/tests/preprocessing/test_calculate_pairwise_hamming_distances.py +22 -0
  96. smftools-0.1.0/tests/preprocessing/test_calculate_position_Youden.py +95 -0
  97. smftools-0.1.0/tests/preprocessing/test_calculate_read_length_stats.py +27 -0
  98. smftools-0.1.0/tests/preprocessing/test_clean_NaN.py +31 -0
  99. smftools-0.1.0/tests/preprocessing/test_filter_converted_reads_on_methylation.py +20 -0
  100. smftools-0.1.0/tests/preprocessing/test_filter_reads_on_length.py +31 -0
  101. smftools-0.1.0/tests/preprocessing/test_invert_adata.py +18 -0
  102. smftools-0.1.0/tests/preprocessing/test_mark_duplicates.py +110 -0
  103. smftools-0.1.0/tests/preprocessing/test_min_non_diagonal.py +20 -0
  104. smftools-0.1.0/tests/preprocessing/test_preprocessing.py +614 -0
  105. smftools-0.1.0/tests/preprocessing/test_remove_duplicates.py +12 -0
  106. smftools-0.1.0/tests/test_readwrite.py +12 -0
@@ -0,0 +1,2 @@
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
@@ -0,0 +1,45 @@
1
+ # Python
2
+ __pycache__/
3
+ /src/smftools/_version.py
4
+
5
+ # Build files
6
+ build/
7
+ dist/
8
+ /hatch.toml
9
+ /Pipfile
10
+ /Pipfile.lock
11
+
12
+ # Environments
13
+ .env
14
+ .venv
15
+ /*-env/
16
+ /*-venv/
17
+ /env-*/
18
+ /venv-*/
19
+ /environment.yml
20
+
21
+ # OS
22
+ *.DS_Store
23
+ *.LSOverride
24
+ *Thumbs.db
25
+ *.ipynb_checkpoints/
26
+ *.directory
27
+
28
+ # IDEs and editors
29
+ *.vscode/
30
+ *.idea/
31
+ *.iml
32
+
33
+ # Logs
34
+ *.log
35
+
36
+ # temp files
37
+ temp/
38
+ tmp/
39
+
40
+ # Coverage reports
41
+ .coverage
42
+ htmlcov/
43
+
44
+ # Docs
45
+ /docs
smftools-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 jkmckenna
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,75 @@
1
+ Metadata-Version: 2.3
2
+ Name: smftools
3
+ Version: 0.1.0
4
+ Summary: Single Molecule Footprinting Analysis in Python.
5
+ Project-URL: Source, https://github.com/jkmckenna/smftools
6
+ Author: Joseph McKenna
7
+ Maintainer-email: Joseph McKenna <jkmckenna@berkeley.edu>
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Keywords: anndata,chromatin-accessibility,machine-learning,nanopore,protein-dna-binding,single-locus,single-molecule-footprinting
11
+ Classifier: Development Status :: 2 - Pre-Alpha
12
+ Classifier: Environment :: Console
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Natural Language :: English
17
+ Classifier: Operating System :: MacOS :: MacOS X
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
24
+ Classifier: Topic :: Scientific/Engineering :: Visualization
25
+ Requires-Python: >=3.9
26
+ Requires-Dist: anndata>=0.10.0
27
+ Requires-Dist: biopython>=1.79
28
+ Requires-Dist: cython>=0.29.28
29
+ Requires-Dist: networkx>=3.2
30
+ Requires-Dist: numpy<2,>=1.22.0
31
+ Requires-Dist: pandas>=1.4.2
32
+ Requires-Dist: pomegranate>1.0.0
33
+ Requires-Dist: pysam>=0.19.1
34
+ Requires-Dist: scanpy>=1.9
35
+ Requires-Dist: scikit-learn>=1.0.2
36
+ Requires-Dist: scipy>=1.7.3
37
+ Requires-Dist: seaborn>=0.11
38
+ Requires-Dist: tqdm
39
+ Provides-Extra: base-tests
40
+ Requires-Dist: pytest; extra == 'base-tests'
41
+ Requires-Dist: pytest-cov; extra == 'base-tests'
42
+ Provides-Extra: doc
43
+ Requires-Dist: ipython>=7.20; extra == 'doc'
44
+ Requires-Dist: matplotlib!=3.6.1; extra == 'doc'
45
+ Requires-Dist: myst-nb>=1; extra == 'doc'
46
+ Requires-Dist: myst-parser>=2; extra == 'doc'
47
+ Requires-Dist: nbsphinx>=0.9; extra == 'doc'
48
+ Requires-Dist: readthedocs-sphinx-search; extra == 'doc'
49
+ Requires-Dist: setuptools; extra == 'doc'
50
+ Requires-Dist: sphinx-autodoc-typehints>=1.25.2; extra == 'doc'
51
+ Requires-Dist: sphinx-book-theme>=1.1.0; extra == 'doc'
52
+ Requires-Dist: sphinx-copybutton; extra == 'doc'
53
+ Requires-Dist: sphinx-design; extra == 'doc'
54
+ Requires-Dist: sphinx>=7; extra == 'doc'
55
+ Requires-Dist: sphinxcontrib-bibtex; extra == 'doc'
56
+ Requires-Dist: sphinxext-opengraph; extra == 'doc'
57
+ Provides-Extra: torch
58
+ Requires-Dist: pomegranate>=1.0.0; extra == 'torch'
59
+ Requires-Dist: torch>=1.9.0; extra == 'torch'
60
+ Provides-Extra: torch-tests
61
+ Requires-Dist: pomegranate>=1.0.0; extra == 'torch-tests'
62
+ Requires-Dist: pytest; extra == 'torch-tests'
63
+ Requires-Dist: pytest-cov; extra == 'torch-tests'
64
+ Requires-Dist: torch>=1.9.0; extra == 'torch-tests'
65
+ Description-Content-Type: text/markdown
66
+
67
+ # smftools
68
+ A tool for processing raw sequencing data for single molecule footprinting experiments at single genomic loci.
69
+
70
+ ## Dependencies
71
+ The following tools need to be installed and configured:
72
+ 1) [Dorado](https://github.com/nanoporetech/dorado) -> For standard/modified basecalling and alignment. Can be obtained by downloading and configuring the Nanopore MinKNOW software.
73
+ 2) [Samtools](https://github.com/samtools/samtools) -> For working with SAM/BAM files
74
+ 3) [Minimap2](https://github.com/lh3/minimap2) -> The aligner used by Dorado
75
+ 4) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting summary statistics and read level methylation calls from modified BAM files
@@ -0,0 +1,9 @@
1
+ # smftools
2
+ A tool for processing raw sequencing data for single molecule footprinting experiments at single genomic loci.
3
+
4
+ ## Dependencies
5
+ The following tools need to be installed and configured:
6
+ 1) [Dorado](https://github.com/nanoporetech/dorado) -> For standard/modified basecalling and alignment. Can be obtained by downloading and configuring the Nanopore MinKNOW software.
7
+ 2) [Samtools](https://github.com/samtools/samtools) -> For working with SAM/BAM files
8
+ 3) [Minimap2](https://github.com/lh3/minimap2) -> The aligner used by Dorado
9
+ 4) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting summary statistics and read level methylation calls from modified BAM files
@@ -0,0 +1,20 @@
1
+ variable,value,help,options,type
2
+ base_dir,,Path to directory to act as root directory for analysis,,string
3
+ smf_modality,,Modality of SMF. Can either be conversion or direct.,"conversion, direct",string
4
+ pod5_dir,,Path to directory containing input POD5 files (If doing Nanopore SMF),,string
5
+ fastq_dir,,Path to directory containing input FASTQ files (if doing Illumina based conversion SMF),,string
6
+ fasta,,Path to initial FASTA file,,string
7
+ bam_suffix,,The file suffix used for BAM files.,,string
8
+ output_directory,,Directory within base_dir to create for analysis outputs,,string
9
+ experiment_name,,An experiment name for the final h5ad file,,string
10
+ model,,The dorado basecalling model to use,,string
11
+ barcode_kit,,The barcoding kit used for the experiment,,string
12
+ mapping_threshold,,Minimum proportion of reads mapping to a reference to further use that reference (Ranges from 0-1 as a proportion of mapped reads),,float
13
+ filter_threshold,,Minimum probability to call a canonical base identity,,float
14
+ m6A_threshold,,Minimum probability to flag m6A as True,,float
15
+ m5C_threshold,,Minimum probability to flag m5C as True,,float
16
+ hm5C_threshold,,Minimum probability to flag hm5C as True,,float
17
+ mod_list,,Modified base names for Dorado,"""6mA"", ""5mC_5hmC""",list
18
+ batch_size,,number of samples to analyze at a time,,int
19
+ conversion_types,,Types of modification types to use in conversion SMF,"'unconverted', '5mC', '6mA'",list
20
+ strands,,Converted strands to analyze for conversion SMF,"'top', 'bottom'",list
@@ -0,0 +1,132 @@
1
+ [build-system]
2
+ requires = ["hatchling", "hatch-vcs"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "smftools"
7
+ description = "Single Molecule Footprinting Analysis in Python."
8
+ requires-python = ">=3.9"
9
+ license = "MIT"
10
+ authors = [
11
+ {name = "Joseph McKenna"}
12
+ ]
13
+ maintainers = [
14
+ {name = "Joseph McKenna", email = "jkmckenna@berkeley.edu"}
15
+ ]
16
+ keywords = [
17
+ "single-molecule-footprinting",
18
+ "chromatin-accessibility",
19
+ "protein-dna-binding",
20
+ "nanopore",
21
+ "single-locus",
22
+ "anndata",
23
+ "machine-learning"
24
+ ]
25
+ readme = "README.md"
26
+ classifiers = [
27
+ "License :: OSI Approved :: MIT License",
28
+ "Development Status :: 2 - Pre-Alpha",
29
+ "Environment :: Console",
30
+ "Intended Audience :: Developers",
31
+ "Intended Audience :: Science/Research",
32
+ "Natural Language :: English",
33
+ "Operating System :: MacOS :: MacOS X",
34
+ "Programming Language :: Python :: 3",
35
+ "Programming Language :: Python :: 3.9",
36
+ "Programming Language :: Python :: 3.10",
37
+ "Programming Language :: Python :: 3.11",
38
+ "Programming Language :: Python :: 3.12",
39
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
40
+ "Topic :: Scientific/Engineering :: Visualization"
41
+ ]
42
+ dependencies = [
43
+ "anndata>=0.10.0",
44
+ "biopython>=1.79",
45
+ "Cython>=0.29.28",
46
+ "networkx>=3.2",
47
+ "numpy>=1.22.0,<2",
48
+ "pandas>=1.4.2",
49
+ "pomegranate>1.0.0",
50
+ "pysam>=0.19.1",
51
+ "scanpy>=1.9",
52
+ "scikit-learn>=1.0.2",
53
+ "scipy>=1.7.3",
54
+ "seaborn>=0.11",
55
+ "tqdm"
56
+ ]
57
+ dynamic = ["version"]
58
+
59
+ [project.urls]
60
+ Source = "https://github.com/jkmckenna/smftools"
61
+
62
+ [project.optional-dependencies]
63
+ torch = ["torch>=1.9.0", "pomegranate>=1.0.0"]
64
+ base_tests = [
65
+ "pytest",
66
+ "pytest-cov"
67
+ ]
68
+
69
+ torch_tests = [
70
+ "smftools[base_tests]",
71
+ # Optional dependencies
72
+ "smftools[torch]"
73
+ ]
74
+
75
+ doc = [
76
+ "sphinx>=7",
77
+ "sphinx-book-theme>=1.1.0",
78
+ "sphinx-autodoc-typehints>=1.25.2",
79
+ "myst-parser>=2",
80
+ "myst-nb>=1",
81
+ "sphinx-design",
82
+ "readthedocs-sphinx-search",
83
+ "sphinxext-opengraph", # for nice cards when sharing on social
84
+ "sphinx-copybutton",
85
+ "nbsphinx>=0.9",
86
+ "ipython>=7.20", # for nbsphinx code highlighting
87
+ "matplotlib!=3.6.1",
88
+ "sphinxcontrib-bibtex",
89
+ "setuptools"
90
+ ]
91
+
92
+ [tool.hatch.build.targets.wheel]
93
+ packages = ["src/smftools"]
94
+
95
+ [tool.hatch.version]
96
+ path = "src/smftools/_version.py"
97
+
98
+ [tool.pytest.ini_options]
99
+ testpaths = ["tests"]
100
+ pythonpath = ["src"]
101
+ xfail_strict = true
102
+ markers = [
103
+ "internet: mark tests that require internet access",
104
+ "optional: mark optional tests",
105
+ "private: mark tests that are private",
106
+ ]
107
+
108
+ [tool.coverage.run]
109
+ branch = true
110
+ source = ["smftools"]
111
+ omit = ["tests/*"]
112
+
113
+ [tool.ruff]
114
+ src = ["src"]
115
+ line-length = 99
116
+ indent-width = 4
117
+
118
+ [tool.ruff.lint]
119
+ select = [
120
+ "E", # Error detected by Pycodestyle
121
+ "F", # Errors detected by Pyflakes
122
+ "W", # Warning detected by Pycodestyle
123
+ "UP", # pyupgrade
124
+ "I", # isort
125
+ "TCH", # manage type checking blocks
126
+ "TID251", # Banned imports
127
+ "ICN", # Follow import conventions
128
+ "PTH", # Pathlib instead of os.path
129
+ "PLR0917", # Ban APIs with too many positional parameters
130
+ "FBT", # No positional boolean parameters
131
+ "PT" # Pytest style
132
+ ]
@@ -0,0 +1,14 @@
1
+ # Essential packages
2
+ anndata>=0.10.0
3
+ biopython>=1.79
4
+ Cython>=0.29.28
5
+ networkx>=3
6
+ numpy>=1.22.0,<2
7
+ pandas>=1.4.2
8
+ pomegranate>1.0.0
9
+ pysam>=0.19.1
10
+ scanpy>=1.9
11
+ scikit-learn>=1.0.2
12
+ scipy>=1.7.3
13
+ seaborn>=0.11
14
+ tqdm
@@ -0,0 +1,27 @@
1
+ """smftools"""
2
+
3
+ import logging
4
+ import warnings
5
+
6
+ from anndata import AnnData
7
+ from . import informatics as inform
8
+ from . import preprocessing as pp
9
+ from . import tools as tl
10
+ from . import plotting as pl
11
+ from . import readwrite, datasets
12
+
13
+
14
+ from importlib.metadata import version
15
+
16
+ package_name = "smftools"
17
+ __version__ = version(package_name)
18
+
19
+ __all__ = [
20
+ "AnnData",
21
+ "inform",
22
+ "pp",
23
+ "tl",
24
+ "pl",
25
+ "readwrite",
26
+ "datasets"
27
+ ]
@@ -0,0 +1,19 @@
1
+ from pathlib import Path
2
+
3
class SMFConfig:
    """Config for smftools.

    Parameters
    ----------
    datasetdir
        Value for the ``datasetdir`` setting. Accepts a ``Path`` or a string
        and is stored as a ``pathlib.Path``. Defaults to ``"./datasets/"``.
    """

    def __init__(
        self,
        *,
        # Quoted so the ``|`` union is not evaluated when the function is
        # defined; evaluating ``Path | str`` raises TypeError on Python 3.9.
        datasetdir: "Path | str" = "./datasets/"
    ):
        self.datasetdir = datasetdir

    @property
    def datasetdir(self) -> Path:
        """Return the ``datasetdir`` setting as a ``Path``."""
        return self._datasetdir

    @datasetdir.setter
    def datasetdir(self, datasetdir: "Path | str") -> None:
        # __init__ assigns through this setter. Without it the property is
        # read-only and constructing SMFConfig raises AttributeError.
        self._datasetdir = Path(datasetdir)


# Module-level instance imported by other smftools modules.
settings = SMFConfig()
@@ -0,0 +1,9 @@
1
+ from .datasets import (
2
+ dCas9_kinetics,
3
+ Kissiov_and_McKenna_2025
4
+ )
5
+
6
+ __all__ = [
7
+ "dCas9_kinetics",
8
+ "Kissiov_and_McKenna_2025"
9
+ ]
@@ -0,0 +1,25 @@
1
+ ## datasets
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import anndata as ad
6
+ from pathlib import Path
7
+
8
+ from .._settings import settings
9
+
10
+ HERE = Path(__file__).parent
11
+
12
+
13
def dCas9_kinetics():
    """Read the packaged ``dCas9_m6A_invitro_kinetics.h5ad.gz`` file.

    The file is looked up in the directory that contains this module
    (``HERE``) and handed to ``anndata.read_h5ad``; the object that call
    returns is passed back unchanged.

    NOTE(review): the file name carries a ``.gz`` suffix -- confirm that
    ``read_h5ad`` can open it without decompressing it first.
    """
    return ad.read_h5ad(HERE / "dCas9_m6A_invitro_kinetics.h5ad.gz")
19
+
20
def Kissiov_and_McKenna_2025():
    """Read the packaged ``F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz`` file.

    The file is looked up in the directory that contains this module
    (``HERE``) and handed to ``anndata.read_h5ad``; the object that call
    returns is passed back unchanged.

    NOTE(review): the file name carries a ``.gz`` suffix -- confirm that
    ``read_h5ad`` can open it without decompressing it first.
    """
    return ad.read_h5ad(
        HERE / "F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz"
    )
@@ -0,0 +1,11 @@
1
+ from . import helpers
2
+ from .pod5_conversion import pod5_conversion
3
+ from .pod5_direct import pod5_direct
4
+ from .pod5_to_adata import pod5_to_adata
5
+
6
+ __all__ = [
7
+ "helpers",
8
+ "pod5_conversion",
9
+ "pod5_direct"
10
+ "pod5_to_adata"
11
+ ]
@@ -0,0 +1,42 @@
1
+ from .align_BAM import align_BAM
2
+ from .binarize_converted_base_identities import binarize_converted_base_identities
3
+ from .canoncall import canoncall
4
+ from .converted_BAM_to_adata import converted_BAM_to_adata
5
+ from .count_aligned_reads import count_aligned_reads
6
+ from .extract_base_identities import extract_base_identities
7
+ from .extract_mods import extract_mods
8
+ from .find_conversion_sites import find_conversion_sites
9
+ from .generate_converted_FASTA import convert_FASTA_record, generate_converted_FASTA
10
+ from .get_native_references import get_native_references
11
+ from .load_experiment_config import load_experiment_config
12
+ from .make_dirs import make_dirs
13
+ from .make_modbed import make_modbed
14
+ from .modcall import modcall
15
+ from .modkit_extract_to_adata import modkit_extract_to_adata
16
+ from .modQC import modQC
17
+ from .one_hot_encode import one_hot_encode
18
+ from .separate_bam_by_bc import separate_bam_by_bc
19
+ from .split_and_index_BAM import split_and_index_BAM
20
+
21
+ __all__ = [
22
+ "align_BAM",
23
+ "binarize_converted_base_identities",
24
+ "canoncall",
25
+ "converted_BAM_to_adata",
26
+ "count_aligned_reads",
27
+ "extract_base_identities",
28
+ "extract_mods",
29
+ "find_conversion_sites",
30
+ "convert_FASTA_record",
31
+ "generate_converted_FASTA",
32
+ "get_native_references",
33
+ "load_experiment_config",
34
+ "make_dirs",
35
+ "make_modbed",
36
+ "modcall",
37
+ "modkit_extract_to_adata",
38
+ "modQC",
39
+ "one_hot_encode",
40
+ "separate_bam_by_bc",
41
+ "split_and_index_BAM"
42
+ ]
@@ -0,0 +1,49 @@
1
+ ## align_BAM
2
+ import subprocess
3
+
4
def _pipe_to_file(producer_cmd, consumer_cmd, out_path):
    """Run ``producer_cmd | consumer_cmd`` and write the consumer's stdout to ``out_path``."""
    with open(out_path, "w") as sink:
        producer = subprocess.Popen(producer_cmd, stdout=subprocess.PIPE)
        try:
            subprocess.run(consumer_cmd, stdin=producer.stdout, stdout=sink)
        finally:
            # Drop the parent's copy of the pipe's read end so the producer is
            # not left blocked on a full pipe if the consumer stops reading.
            producer.stdout.close()
            producer.wait()


def align_BAM(fasta, bam, bam_suffix):
    """
    A wrapper for running dorado aligner and samtools functions.

    Parameters:
        fasta (str): Passed to ``dorado aligner`` as its first positional argument.
        bam (str): Path of the input BAM without its suffix. It is also the prefix
            for every output path.
        bam_suffix (str): Suffix appended to ``bam`` and to the derived output names.

    Returns:
        None. The following files are written:
            ``{bam}_aligned{bam_suffix}``          stdout of ``dorado aligner``
            ``{bam}_aligned_sorted{bam_suffix}``   output of ``samtools sort``
            an index created by ``samtools index`` on the sorted file
            ``{bam}_aligned_sorted_bed.bed``       fields 3, 4 and 4+length(field 10)-1
                                                   of each ``samtools view`` line
            ``{bam}_aligned_sorted_read_names.txt`` field 1 of each ``samtools view`` line

    Exit codes of the external tools are not checked.
    """
    aligned_BAM = f"{bam}_aligned"
    aligned_sorted_BAM = f"{aligned_BAM}_sorted"
    output = bam + bam_suffix
    aligned_output = aligned_BAM + bam_suffix
    aligned_sorted_output = aligned_sorted_BAM + bam_suffix

    # Run dorado aligner; the context manager closes the output handle.
    with open(aligned_output, "w") as aligned_handle:
        subprocess.run([
            "dorado", "aligner",
            "--secondary=no",
            fasta,
            output
        ], stdout=aligned_handle)

    # Sort the BAM on positional coordinates
    subprocess.run([
        "samtools", "sort",
        "-o", aligned_sorted_output,
        aligned_output
    ])

    # Create a BAM index file
    subprocess.run([
        "samtools", "index",
        aligned_sorted_output
    ])

    # Make a bed file of coordinates for the BAM.
    # CompletedProcess objects cannot be combined with ``|``, so the two
    # processes are connected through a real pipe instead.
    _pipe_to_file(
        ["samtools", "view", aligned_sorted_output],
        ["awk", '{print $3, $4, $4+length($10)-1}'],
        f"{aligned_sorted_BAM}_bed.bed",
    )

    # Make a text file of reads for the BAM. The prefix is interpolated so the
    # file is written next to the other outputs, not under a literal name.
    _pipe_to_file(
        ["samtools", "view", aligned_sorted_output],
        ["cut", "-f1"],
        f"{aligned_sorted_BAM}_read_names.txt",
    )
@@ -0,0 +1,24 @@
1
+ ## binarize_converted_base_identities
2
+ import numpy as np
3
+ # Conversion SMF specific
4
def binarize_converted_base_identities(base_identities, strand, modification_type):
    """
    Binarize per-key base calls for conversion SMF.

    Input:
        base_identities: Dictionary mapping each key to an iterable of base characters.
        strand: 'top' or 'bottom'.
        modification_type: '5mC' or '6mA'.
    Output:
        A dictionary with the same keys, in the same order, where each base is replaced by
        1 (methylated site), 0 (unmethylated site) or NaN (any other base).
        Any other strand / modification_type combination yields an empty dictionary.
    """
    # (strand, modification type) -> (base scored as 1, base scored as 0)
    scored_bases = {
        ('top', '5mC'): ('C', 'T'),
        ('top', '6mA'): ('A', 'G'),
        ('bottom', '5mC'): ('G', 'A'),
        ('bottom', '6mA'): ('T', 'C'),
    }
    pair = scored_bases.get((strand, modification_type))
    if pair is None:
        return {}

    methylated_base, unmethylated_base = pair
    binarized = {}
    for read_key, bases in base_identities.items():
        calls = []
        for base in bases:
            if base == methylated_base:
                calls.append(1)
            elif base == unmethylated_base:
                calls.append(0)
            else:
                calls.append(np.nan)
        binarized[read_key] = calls
    return binarized
@@ -0,0 +1,12 @@
1
+ ## canoncall
2
+ import subprocess
3
+
4
+ # Conversion SMF specific
5
def canoncall(model, pod5_dir, barcode_kit, bam, bam_suffix):
    """
    Wrapper function for dorado canonical base calling.

    Runs ``dorado basecaller <model> <pod5_dir> --kit-name <barcode_kit> -Y`` and
    redirects its stdout into the file ``bam + bam_suffix``. The exit code of the
    command is not checked and nothing is returned.
    """
    bam_path = bam + bam_suffix
    with open(bam_path, "w") as bam_handle:
        subprocess.run(
            ["dorado", "basecaller", model, pod5_dir, "--kit-name", barcode_kit, "-Y"],
            stdout=bam_handle,
        )