smftools 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182) hide show
  1. {smftools-0.1.1 → smftools-0.1.3}/.gitignore +4 -0
  2. {smftools-0.1.1 → smftools-0.1.3}/PKG-INFO +13 -7
  3. {smftools-0.1.1 → smftools-0.1.3}/README.md +7 -3
  4. smftools-0.1.3/docs/source/_static/converted_BAM_to_adata.png +0 -0
  5. smftools-0.1.3/docs/source/_static/modkit_extract_to_adata.png +0 -0
  6. smftools-0.1.3/docs/source/_static/smftools_informatics_diagram.pdf +0 -0
  7. smftools-0.1.3/docs/source/_static/smftools_informatics_diagram.png +0 -0
  8. smftools-0.1.3/docs/source/_static/smftools_preprocessing_diagram.png +0 -0
  9. smftools-0.1.3/docs/source/api/index.md +26 -0
  10. smftools-0.1.3/docs/source/api/informatics.md +27 -0
  11. {smftools-0.1.1 → smftools-0.1.3}/docs/source/api/preprocessing.md +5 -0
  12. smftools-0.1.3/docs/source/basic_usage.md +75 -0
  13. {smftools-0.1.1 → smftools-0.1.3}/docs/source/index.md +1 -0
  14. smftools-0.1.3/docs/source/installation.md +60 -0
  15. {smftools-0.1.1 → smftools-0.1.3}/experiment_config.csv +2 -2
  16. smftools-0.1.3/notebooks/Kissiov_and_McKenna_2025_example_notebook.ipynb +85 -0
  17. smftools-0.1.3/notebooks/Kissiov_and_McKenna_2025_sample_sheet.csv +11 -0
  18. {smftools-0.1.1 → smftools-0.1.3}/pyproject.toml +9 -7
  19. {smftools-0.1.1 → smftools-0.1.3}/requirements.txt +1 -0
  20. smftools-0.1.3/sample_sheet.csv +11 -0
  21. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/_settings.py +3 -2
  22. smftools-0.1.3/src/smftools/_version.py +1 -0
  23. smftools-0.1.3/src/smftools/datasets/F1_sample_sheet.csv +5 -0
  24. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/datasets/datasets.py +8 -7
  25. smftools-0.1.3/src/smftools/informatics/__init__.py +14 -0
  26. {smftools-0.1.1/src/smftools/informatics → smftools-0.1.3/src/smftools/informatics/archived}/bam_conversion.py +16 -4
  27. {smftools-0.1.1/src/smftools/informatics → smftools-0.1.3/src/smftools/informatics/archived}/bam_direct.py +22 -8
  28. smftools-0.1.3/src/smftools/informatics/archived/basecalls_to_adata.py +71 -0
  29. smftools-0.1.3/src/smftools/informatics/conversion_smf.py +79 -0
  30. smftools-0.1.3/src/smftools/informatics/direct_smf.py +89 -0
  31. smftools-0.1.3/src/smftools/informatics/fast5_to_pod5.py +21 -0
  32. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/__init__.py +18 -0
  33. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/align_and_sort_BAM.py +9 -13
  34. smftools-0.1.3/src/smftools/informatics/helpers/aligned_BAM_to_bed.py +73 -0
  35. smftools-0.1.3/src/smftools/informatics/helpers/bed_to_bigwig.py +39 -0
  36. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/binarize_converted_base_identities.py +2 -2
  37. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/canoncall.py +2 -0
  38. smftools-0.1.3/src/smftools/informatics/helpers/complement_base_list.py +21 -0
  39. smftools-0.1.3/src/smftools/informatics/helpers/concatenate_fastqs_to_bam.py +54 -0
  40. smftools-0.1.3/src/smftools/informatics/helpers/converted_BAM_to_adata.py +233 -0
  41. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/count_aligned_reads.py +13 -9
  42. smftools-0.1.3/src/smftools/informatics/helpers/extract_base_identities.py +57 -0
  43. smftools-0.1.3/src/smftools/informatics/helpers/extract_readnames_from_BAM.py +22 -0
  44. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/find_conversion_sites.py +11 -9
  45. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/generate_converted_FASTA.py +33 -14
  46. smftools-0.1.3/src/smftools/informatics/helpers/get_chromosome_lengths.py +32 -0
  47. smftools-0.1.3/src/smftools/informatics/helpers/index_fasta.py +12 -0
  48. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/modcall.py +3 -1
  49. smftools-0.1.3/src/smftools/informatics/helpers/modkit_extract_to_adata.py +518 -0
  50. smftools-0.1.3/src/smftools/informatics/helpers/ohe_batching.py +52 -0
  51. smftools-0.1.3/src/smftools/informatics/helpers/one_hot_encode.py +21 -0
  52. smftools-0.1.3/src/smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +52 -0
  53. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/separate_bam_by_bc.py +4 -2
  54. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/split_and_index_BAM.py +16 -4
  55. smftools-0.1.3/src/smftools/informatics/load_adata.py +127 -0
  56. smftools-0.1.3/src/smftools/informatics/subsample_fasta_from_bed.py +47 -0
  57. smftools-0.1.3/src/smftools/informatics/subsample_pod5.py +104 -0
  58. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/__init__.py +6 -1
  59. smftools-0.1.3/src/smftools/preprocessing/append_C_context.py +69 -0
  60. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/calculate_complexity.py +2 -2
  61. smftools-0.1.3/src/smftools/preprocessing/calculate_consensus.py +47 -0
  62. smftools-0.1.3/src/smftools/preprocessing/calculate_converted_read_methylation_stats.py +96 -0
  63. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/calculate_coverage.py +2 -2
  64. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/calculate_pairwise_hamming_distances.py +1 -1
  65. smftools-0.1.3/src/smftools/preprocessing/calculate_read_length_stats.py +86 -0
  66. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/clean_NaN.py +2 -2
  67. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/filter_converted_reads_on_methylation.py +4 -2
  68. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/filter_reads_on_length.py +4 -2
  69. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/invert_adata.py +1 -0
  70. smftools-0.1.3/src/smftools/preprocessing/load_sample_sheet.py +24 -0
  71. smftools-0.1.3/src/smftools/preprocessing/make_dirs.py +21 -0
  72. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/mark_duplicates.py +34 -19
  73. smftools-0.1.3/src/smftools/preprocessing/recipes.py +125 -0
  74. smftools-0.1.3/src/smftools/preprocessing/remove_duplicates.py +21 -0
  75. smftools-0.1.3/src/smftools/tools/apply_HMM.py +1 -0
  76. smftools-0.1.3/src/smftools/tools/read_HMM.py +1 -0
  77. smftools-0.1.3/src/smftools/tools/subset_adata.py +32 -0
  78. smftools-0.1.3/src/smftools/tools/train_HMM.py +43 -0
  79. smftools-0.1.3/tests/informatics/helpers/test_LoadExperimentConfig.py +17 -0
  80. smftools-0.1.1/docs/source/_templates/tmp +0 -1
  81. smftools-0.1.1/docs/source/api/index.md +0 -16
  82. smftools-0.1.1/docs/source/api/informatics.md +0 -11
  83. smftools-0.1.1/docs/source/installation.md +0 -20
  84. smftools-0.1.1/src/smftools/_version.py +0 -1
  85. smftools-0.1.1/src/smftools/informatics/__init__.py +0 -12
  86. smftools-0.1.1/src/smftools/informatics/basecalls_to_adata.py +0 -42
  87. smftools-0.1.1/src/smftools/informatics/fast5_to_pod5.py +0 -19
  88. smftools-0.1.1/src/smftools/informatics/helpers/converted_BAM_to_adata.py +0 -164
  89. smftools-0.1.1/src/smftools/informatics/helpers/extract_base_identities.py +0 -43
  90. smftools-0.1.1/src/smftools/informatics/helpers/modkit_extract_to_adata.py +0 -367
  91. smftools-0.1.1/src/smftools/informatics/helpers/one_hot_encode.py +0 -19
  92. smftools-0.1.1/src/smftools/informatics/pod5_conversion.py +0 -53
  93. smftools-0.1.1/src/smftools/informatics/pod5_direct.py +0 -55
  94. smftools-0.1.1/src/smftools/informatics/pod5_to_adata.py +0 -40
  95. smftools-0.1.1/src/smftools/informatics/subsample_pod5.py +0 -48
  96. smftools-0.1.1/src/smftools/preprocessing/append_C_context.py +0 -46
  97. smftools-0.1.1/src/smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -45
  98. smftools-0.1.1/src/smftools/preprocessing/calculate_read_length_stats.py +0 -32
  99. smftools-0.1.1/src/smftools/preprocessing/remove_duplicates.py +0 -18
  100. smftools-0.1.1/tests/informatics/helpers/test_align_BAM.py +0 -49
  101. smftools-0.1.1/tests/informatics/helpers/test_binarize_converted_base_identities.py +0 -24
  102. smftools-0.1.1/tests/informatics/helpers/test_canoncall.py +0 -12
  103. smftools-0.1.1/tests/informatics/helpers/test_converted_BAM_to_adata.py +0 -147
  104. smftools-0.1.1/tests/informatics/helpers/test_count_aligned_reads.py +0 -32
  105. smftools-0.1.1/tests/informatics/helpers/test_extract_base_identities.py +0 -36
  106. smftools-0.1.1/tests/informatics/helpers/test_extract_mods.py +0 -39
  107. smftools-0.1.1/tests/informatics/helpers/test_find_conversion_sites.py +0 -53
  108. smftools-0.1.1/tests/informatics/helpers/test_generate_converted_FASTA.py +0 -59
  109. smftools-0.1.1/tests/informatics/helpers/test_get_native_references.py +0 -25
  110. smftools-0.1.1/tests/informatics/helpers/test_informatics.py +0 -260
  111. smftools-0.1.1/tests/informatics/helpers/test_load_adata.py +0 -516
  112. smftools-0.1.1/tests/informatics/helpers/test_load_experiment_config.py +0 -17
  113. smftools-0.1.1/tests/informatics/helpers/test_make_dirs.py +0 -15
  114. smftools-0.1.1/tests/informatics/helpers/test_make_modbed.py +0 -21
  115. smftools-0.1.1/tests/informatics/helpers/test_modQC.py +0 -19
  116. smftools-0.1.1/tests/informatics/helpers/test_modcall.py +0 -14
  117. smftools-0.1.1/tests/informatics/helpers/test_modkit_extract_to_adata.py +0 -355
  118. smftools-0.1.1/tests/informatics/helpers/test_one_hot_encode.py +0 -14
  119. smftools-0.1.1/tests/informatics/helpers/test_separate_bam_by_bc.py +0 -28
  120. smftools-0.1.1/tests/informatics/helpers/test_split_and_index_BAM.py +0 -21
  121. smftools-0.1.1/tests/informatics/test_pod5_conversion.py +0 -26
  122. smftools-0.1.1/tests/informatics/test_pod5_direct.py +0 -29
  123. smftools-0.1.1/tests/informatics/test_pod5_to_adata.py +0 -17
  124. smftools-0.1.1/tests/preprocessing/test_append_C_context.py +0 -39
  125. smftools-0.1.1/tests/preprocessing/test_binarize_on_Youden.py +0 -38
  126. smftools-0.1.1/tests/preprocessing/test_binary_layers_to_ohe.py +0 -25
  127. smftools-0.1.1/tests/preprocessing/test_calculate_complexity.py +0 -59
  128. smftools-0.1.1/tests/preprocessing/test_calculate_converted_read_methylation_stats.py +0 -38
  129. smftools-0.1.1/tests/preprocessing/test_calculate_coverage.py +0 -35
  130. smftools-0.1.1/tests/preprocessing/test_calculate_pairwise_hamming_distances.py +0 -22
  131. smftools-0.1.1/tests/preprocessing/test_calculate_position_Youden.py +0 -95
  132. smftools-0.1.1/tests/preprocessing/test_calculate_read_length_stats.py +0 -27
  133. smftools-0.1.1/tests/preprocessing/test_clean_NaN.py +0 -31
  134. smftools-0.1.1/tests/preprocessing/test_filter_converted_reads_on_methylation.py +0 -20
  135. smftools-0.1.1/tests/preprocessing/test_filter_reads_on_length.py +0 -31
  136. smftools-0.1.1/tests/preprocessing/test_invert_adata.py +0 -18
  137. smftools-0.1.1/tests/preprocessing/test_mark_duplicates.py +0 -110
  138. smftools-0.1.1/tests/preprocessing/test_min_non_diagonal.py +0 -20
  139. smftools-0.1.1/tests/preprocessing/test_preprocessing.py +0 -614
  140. smftools-0.1.1/tests/preprocessing/test_remove_duplicates.py +0 -12
  141. {smftools-0.1.1 → smftools-0.1.3}/.gitattributes +0 -0
  142. {smftools-0.1.1 → smftools-0.1.3}/.readthedocs.yaml +0 -0
  143. {smftools-0.1.1 → smftools-0.1.3}/CONTRIBUTING.md +0 -0
  144. {smftools-0.1.1 → smftools-0.1.3}/LICENSE +0 -0
  145. {smftools-0.1.1 → smftools-0.1.3}/docs/Makefile +0 -0
  146. {smftools-0.1.1 → smftools-0.1.3}/docs/make.bat +0 -0
  147. {smftools-0.1.1/docs/source/_static → smftools-0.1.3/docs/source/_templates}/tmp +0 -0
  148. {smftools-0.1.1 → smftools-0.1.3}/docs/source/api/datasets.md +0 -0
  149. {smftools-0.1.1 → smftools-0.1.3}/docs/source/api/tools.md +0 -0
  150. {smftools-0.1.1 → smftools-0.1.3}/docs/source/conf.py +0 -0
  151. {smftools-0.1.1 → smftools-0.1.3}/docs/source/contributors.md +0 -0
  152. {smftools-0.1.1 → smftools-0.1.3}/docs/source/dev/index.md +0 -0
  153. {smftools-0.1.1 → smftools-0.1.3}/docs/source/references.bib +0 -0
  154. {smftools-0.1.1 → smftools-0.1.3}/docs/source/references.rst +0 -0
  155. {smftools-0.1.1 → smftools-0.1.3}/docs/source/release-notes/0.1.0.md +0 -0
  156. {smftools-0.1.1 → smftools-0.1.3}/docs/source/release-notes/index.md +0 -0
  157. {smftools-0.1.1 → smftools-0.1.3}/docs/source/requirements.txt +0 -0
  158. {smftools-0.1.1 → smftools-0.1.3}/docs/source/tutorials/index.md +0 -0
  159. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/__init__.py +0 -0
  160. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  161. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/datasets/__init__.py +0 -0
  162. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  163. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/LoadExperimentConfig.py +0 -0
  164. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/archived/informatics.py +0 -0
  165. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/archived/load_adata.py +0 -0
  166. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/extract_mods.py +0 -0
  167. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/get_native_references.py +0 -0
  168. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/make_dirs.py +0 -0
  169. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/make_modbed.py +0 -0
  170. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/helpers/modQC.py +0 -0
  171. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/informatics/readwrite.py +0 -0
  172. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/plotting/__init__.py +0 -0
  173. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/archives/preprocessing.py +0 -0
  174. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/binarize_on_Youden.py +0 -0
  175. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/binary_layers_to_ohe.py +0 -0
  176. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/calculate_position_Youden.py +0 -0
  177. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/preprocessing/min_non_diagonal.py +0 -0
  178. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/readwrite.py +0 -0
  179. {smftools-0.1.1 → smftools-0.1.3}/src/smftools/tools/__init__.py +0 -0
  180. /smftools-0.1.1/tests/__init__.py → /smftools-0.1.3/src/smftools/tools/cluster.py +0 -0
  181. {smftools-0.1.1 → smftools-0.1.3}/tests/datasets/test_datasets.py +0 -0
  182. {smftools-0.1.1 → smftools-0.1.3}/tests/test_readwrite.py +0 -0
@@ -18,6 +18,10 @@ build/
18
18
  venv/
19
19
  /environment.yml
20
20
 
21
+ # Tests
22
+ /tests/_test_inputs/
23
+ /tests/_test_outputs/
24
+
21
25
  # OS
22
26
  .DS_Store
23
27
  .LSOverride
@@ -1,8 +1,9 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: smftools
3
- Version: 0.1.1
3
+ Version: 0.1.3
4
4
  Summary: Single Molecule Footprinting Analysis in Python.
5
5
  Project-URL: Source, https://github.com/jkmckenna/smftools
6
+ Project-URL: Documentation, https://smftools.readthedocs.io/
6
7
  Author: Joseph McKenna
7
8
  Maintainer-email: Joseph McKenna <jkmckenna@berkeley.edu>
8
9
  License-Expression: MIT
@@ -31,6 +32,7 @@ Requires-Dist: numpy<2,>=1.22.0
31
32
  Requires-Dist: pandas>=1.4.2
32
33
  Requires-Dist: pod5>=0.1.21
33
34
  Requires-Dist: pomegranate>1.0.0
35
+ Requires-Dist: pyfaidx>=0.8.0
34
36
  Requires-Dist: pysam>=0.19.1
35
37
  Requires-Dist: scanpy>=1.9
36
38
  Requires-Dist: scikit-learn>=1.0.2
@@ -38,9 +40,6 @@ Requires-Dist: scipy>=1.7.3
38
40
  Requires-Dist: seaborn>=0.11
39
41
  Requires-Dist: torch>=1.9.0
40
42
  Requires-Dist: tqdm
41
- Provides-Extra: base-tests
42
- Requires-Dist: pytest; extra == 'base-tests'
43
- Requires-Dist: pytest-cov; extra == 'base-tests'
44
43
  Provides-Extra: docs
45
44
  Requires-Dist: ipython>=7.20; extra == 'docs'
46
45
  Requires-Dist: matplotlib!=3.6.1; extra == 'docs'
@@ -56,13 +55,16 @@ Requires-Dist: sphinx-design; extra == 'docs'
56
55
  Requires-Dist: sphinx>=7; extra == 'docs'
57
56
  Requires-Dist: sphinxcontrib-bibtex; extra == 'docs'
58
57
  Requires-Dist: sphinxext-opengraph; extra == 'docs'
58
+ Provides-Extra: tests
59
+ Requires-Dist: pytest; extra == 'tests'
60
+ Requires-Dist: pytest-cov; extra == 'tests'
59
61
  Description-Content-Type: text/markdown
60
62
 
61
63
  [![PyPI](https://img.shields.io/pypi/v/smftools.svg)](https://pypi.org/project/smftools)
62
64
  [![Docs](https://readthedocs.org/projects/smftools/badge/?version=latest)](https://smftools.readthedocs.io/en/latest/?badge=latest)
63
65
 
64
66
  # smftools
65
- A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, analysis, and visualization. Data structures are compatible with analyses developed within the [scverse](https://github.com/scverse) project, including [scanpy](https://github.com/scverse/scanpy) and [scvi-tools](https://github.com/scverse/scvi-tools).
67
+ A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, analysis, and visualization.
66
68
 
67
69
  ## Philosophy
68
70
  While most genomic data structures handle low-coverage data (<100X) along large references, smftools prioritizes high-coverage data (scalable to at least 1 million X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
@@ -73,10 +75,14 @@ The following CLI tools need to be installed and configured before using the inf
73
75
  2) [Samtools](https://github.com/samtools/samtools) -> For working with SAM/BAM files
74
76
  3) [Minimap2](https://github.com/lh3/minimap2) -> The aligner used by Dorado
75
77
  4) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting summary statistics and read level methylation calls from modified BAM files
78
+ 5) [Bedtools](https://github.com/arq5x/bedtools2) -> For generating Bedgraphs from BAM alignment files.
79
+ 6) [BedGraphToBigWig](https://genome.ucsc.edu/goldenPath/help/bigWig.html) -> For converting BedGraphs to BigWig files for IGV sessions.
76
80
 
77
81
  ## Modules
78
- - Informatics: Processes raw SMF data coming from Nanopore POD5 files, BAM files, or FASTQ files and organizes it into an AnnData object.
79
- - Preprocessing: Filters the AnnData object on read length, total methylation, and a variety of QC metrics.
82
+ ### Informatics: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
83
+ ![](docs/source/_static/smftools_informatics_diagram.png)
84
+ ### Preprocessing: Appends QC metrics to the AnnData object and performs filtering.
85
+ ![](docs/source/_static/smftools_preprocessing_diagram.png)
80
86
  - Tools: Appends various analyses to the AnnData object.
81
87
  - Plotting: Visualization of analyses stored within the AnnData object.
82
88
 
@@ -2,7 +2,7 @@
2
2
  [![Docs](https://readthedocs.org/projects/smftools/badge/?version=latest)](https://smftools.readthedocs.io/en/latest/?badge=latest)
3
3
 
4
4
  # smftools
5
- A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, analysis, and visualization. Data structures are compatible with analyses developed within the [scverse](https://github.com/scverse) project, including [scanpy](https://github.com/scverse/scanpy) and [scvi-tools](https://github.com/scverse/scvi-tools).
5
+ A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, analysis, and visualization.
6
6
 
7
7
  ## Philosophy
8
8
  While most genomic data structures handle low-coverage data (<100X) along large references, smftools prioritizes high-coverage data (scalable to at least 1 million X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
@@ -13,10 +13,14 @@ The following CLI tools need to be installed and configured before using the inf
13
13
  2) [Samtools](https://github.com/samtools/samtools) -> For working with SAM/BAM files
14
14
  3) [Minimap2](https://github.com/lh3/minimap2) -> The aligner used by Dorado
15
15
  4) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting summary statistics and read level methylation calls from modified BAM files
16
+ 5) [Bedtools](https://github.com/arq5x/bedtools2) -> For generating Bedgraphs from BAM alignment files.
17
+ 6) [BedGraphToBigWig](https://genome.ucsc.edu/goldenPath/help/bigWig.html) -> For converting BedGraphs to BigWig files for IGV sessions.
16
18
 
17
19
  ## Modules
18
- - Informatics: Processes raw SMF data coming from Nanopore POD5 files, BAM files, or FASTQ files and organizes it into an AnnData object.
19
- - Preprocessing: Filters the AnnData object on read length, total methylation, and a variety of QC metrics.
20
+ ### Informatics: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
21
+ ![](docs/source/_static/smftools_informatics_diagram.png)
22
+ ### Preprocessing: Appends QC metrics to the AnnData object and performs filtering.
23
+ ![](docs/source/_static/smftools_preprocessing_diagram.png)
20
24
  - Tools: Appends various analyses to the AnnData object.
21
25
  - Plotting: Visualization of analyses stored within the AnnData object.
22
26
 
@@ -0,0 +1,26 @@
1
+ # API
2
+
3
+ Import smftools as:
4
+
5
+ ```
6
+ import smftools as smf
7
+ ```
8
+
9
+ ```{toctree}
10
+ :maxdepth: 2
11
+
12
+ informatics
13
+ preprocessing
14
+ tools
15
+ datasets
16
+ ```
17
+
18
+ ## Informatics module diagram
19
+ ```{image} ../_static/smftools_informatics_diagram.png
20
+ :width: 800px
21
+ ```
22
+
23
+ ## Preprocessing module diagram
24
+ ```{image} ../_static/smftools_preprocessing_diagram.png
25
+ :width: 800px
26
+ ```
@@ -0,0 +1,27 @@
1
+ ## Informatics: `inform`
2
+
3
+ ## Informatics module diagram
4
+ ```{image} ../_static/smftools_informatics_diagram.png
5
+ :width: 1000px
6
+ ```
7
+
8
+ ```{eval-rst}
9
+ .. module:: smftools.inform
10
+ ```
11
+
12
+ ```{eval-rst}
13
+ .. currentmodule:: smftools
14
+ ```
15
+
16
+ Processes raw sequencing data to load an adata object.
17
+
18
+
19
+ ### Diagram of final steps of Direct SMF workflow
20
+ ```{image} ../_static/modkit_extract_to_adata.png
21
+ :width: 1000px
22
+ ```
23
+
24
+ ### Diagram of final steps of Conversion SMF workflow
25
+ ```{image} ../_static/converted_BAM_to_adata.png
26
+ :width: 1000px
27
+ ```
@@ -1,5 +1,10 @@
1
1
  ## Preprocessing: `pp`
2
2
 
3
+ ## Preprocessing module diagram
4
+ ```{image} ../_static/smftools_preprocessing_diagram.png
5
+ :width: 1000px
6
+ ```
7
+
3
8
  ```{eval-rst}
4
9
  .. module:: smftools.pp
5
10
  ```
@@ -0,0 +1,75 @@
1
+ # Basic Usage
2
+
3
+ Import SmfTools:
4
+
5
+ ```
6
+ import smftools as smf
7
+ ```
8
+
9
+ ## Informatics Module Usage
10
+
11
+ Many use cases for smftools begin here. For most users, the call below will be sufficient to convert any raw SMF dataset to an AnnData object:
12
+
13
+ ```
14
+ config_path = "/Path_to_experiment_config.csv"
15
+ smf.inform.load_adata(config_path)
16
+ ```
17
+
18
+ ## Loading AnnData objects created by the informatics module
19
+
20
+ After creating an AnnData object holding your experiment's SMF data, you can load the AnnData object as so:
21
+
22
+ ```
23
+ import anndata as ad
24
+ input_adata = "/Path_to_experiment_AnnData.h5ad.gz"
25
+ adata = ad.read_h5ad(input_adata)
26
+ adata.obs_names_make_unique()
27
+ ```
28
+
29
+ If you don't have an AnnData object yet, but want to play with the downstream Preprocessing, Tools, and Plotting modules, you can load a pre-loaded SMF dataset.
30
+
31
+ Currently, you can do this with our lab's in vitro dCas9 binding kinetics dataset, a Hia5 SMF dataset generated with direct m6A high-accuracy basecalls:
32
+
33
+ ```
34
+ adata = smf.datasets.dCas9_kinetics()
35
+ adata.obs_names_make_unique()
36
+ ```
37
+
38
+ Alternatively, you can do this with our lab's M.CviPI SMF test data in F1-hybrid natural killer cells generated by NEB EMseq conversion followed by canonical basecalling:
39
+
40
+ ```
41
+ adata = smf.datasets.Kissiov_and_McKenna_2025()
42
+ adata.obs_names_make_unique()
43
+ ```
44
+
45
+ ## Writing out AnnData objects to save analysis progress
46
+
47
+ After preprocessing and downstream analysis of the AnnData object, you can save the AnnData object at any step as so:
48
+
49
+ ```
50
+ import anndata as ad
51
+ import os
52
+
53
+ output_dir = '/Path_to_output_directory'
54
+ output_adata = 'analyzed_adata.h5ad.gz'
55
+ final_output = os.path.join(output_dir, output_adata)
56
+ adata.write_h5ad(final_output, compression='gzip')
57
+ ```
58
+
59
+
60
+ ## Troubleshooting
61
+ For more advanced usage and help troubleshooting, the API and tutorials for each of the modules are still being developed.
62
+ However, you can currently learn about the functions contained within the module by calling:
63
+
64
+ ```
65
+ smf.inform.__all__
66
+ ```
67
+
68
+ This lists the functions within any given module. If you want to see the associated docstring for a given function, here is an example:
69
+
70
+ ```
71
+ print(smf.inform.load_adata.__doc__)
72
+ ```
73
+
74
+ These docstrings will provide a brief description of the function and also tell you the input parameters and what the function returns.
75
+ In some cases, usage examples will also be provided in the docstring in the form of doctests.
@@ -42,6 +42,7 @@ smftools GitHub link
42
42
  :maxdepth: 1
43
43
 
44
44
  installation
45
+ basic_usage
45
46
  tutorials/index
46
47
  api/index
47
48
  release-notes/index
@@ -0,0 +1,60 @@
1
+ # Installation
2
+
3
+ ## PyPI version
4
+
5
+ Pull smftools from [PyPI](https://pypi.org/project/smftools):
6
+
7
+ ```shell
8
+ pip install smftools
9
+ ```
10
+
11
+ It is recommended to first create and activate a conda environment before installing smftools to ensure dependencies are managed smoothly:
12
+
13
+ ```shell
14
+ conda create -n smftools
15
+ conda activate smftools
16
+ pip install smftools
17
+ ```
18
+
19
+ Ensure that you can access dorado, samtools, modkit, bedtools, and BedGraphToBigWig executables from the terminal in this environment. These are all necessary for the functionality within the Informatics module.
20
+ You may need to add them to $PATH if they are not globally configured.
21
+ For example, if you want to check if dorado is executable, simply run this in the terminal:
22
+
23
+ ```shell
24
+ dorado
25
+ ```
26
+
27
+ On Mac OSX, the following can be used to configure bedtools (with brew) and BedGraphToBigWig (with wget). Change the BedGraphToBigWig link to include the correct architecture for your OS.
28
+
29
+ ```shell
30
+ brew install bedtools
31
+ wget http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/bedGraphToBigWig
32
+ chmod +x bedGraphToBigWig
33
+ sudo mv bedGraphToBigWig /usr/local/bin/
34
+ ```
35
+
36
+ ## Development Version
37
+
38
+ Clone smftools from source and change into the smftools directory:
39
+
40
+ ```shell
41
+ git clone https://github.com/jkmckenna/smftools.git
42
+ cd smftools
43
+ ```
44
+
45
+ A virtual environment can be created for the current version within the smftools directory:
46
+
47
+ ```shell
48
+ python -m venv venv-smftools
49
+ source venv-smftools/bin/activate
50
+ pip install .
51
+ ```
52
+
53
+ Subsequent use of the installed version of smftools can be run by changing to the smftools directory and activating the venv:
54
+
55
+ ```shell
56
+ cd smftools
57
+ source venv-smftools/bin/activate
58
+ ```
59
+
60
+ You can now run smftools from the terminal, an IDE, or a notebook within the virtual environment.
@@ -1,8 +1,8 @@
1
1
  variable,value,help,options,type
2
2
  smf_modality,conversion,Modality of SMF. Can either be conversion or direct.,"conversion, direct",str
3
- pod5_dir,/path_to_POD5_directory,Path to directory containing input POD5 files (If doing Nanopore SMF),,str
4
- basecalled_path,/path_to_basecalled_HTS_file.bam,Path to directory containing input BAM file (if doing SMF from an already basecalled experiment). Can also be a path to a FASTQ for conversion SMF.,,str
3
+ input_data_path,/path_to_POD5_directory,Path to directory/file containing input sequencing data,,str
5
4
  fasta,/path_to_fasta.fasta,Path to initial FASTA file,,str
5
+ fasta_regions_of_interest,/path_to_bed.bed,Path to a bed file to subsample the fasta on.,,str
6
6
  output_directory,/outputs,Directory to act as root for all analysis outputs,,str
7
7
  experiment_name,,An experiment name for the final h5ad file,,str
8
8
  model,None,The dorado basecalling model to use,,str
@@ -0,0 +1,85 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import anndata as ad\n",
10
+ "import pandas as pd\n",
11
+ "import numpy as np\n",
12
+ "import matplotlib.pyplot as plt\n",
13
+ "import smftools as smf\n",
14
+ "import os"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": null,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "# Define file paths\n",
24
+ "adata_path = '/Path_to_input_adata.h5ad.gz'\n",
25
+ "output_directory = '/Path_to_output_directory'\n",
26
+ "output_adata = 'analyzed_adata.h5ad.gz'\n",
27
+ "final_output = os.path.join(output_directory, output_adata)\n",
28
+ "\n",
29
+ "# Load adata\n",
30
+ "adata = ad.read_h5ad(adata_path)\n",
31
+ "adata.obs_names_make_unique()"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "code",
36
+ "execution_count": null,
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "# Define path to sample sheet and run first part of preprocessing.\n",
41
+ "sample_sheet_path = '/path_to_sample_sheet.csv'\n",
42
+ "variables = smf.pp.recipe_1_Kissiov_and_McKenna_2025(adata, sample_sheet_path, output_directory)\n",
43
+ "# Update global variables\n",
44
+ "globals().update(variables)"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "metadata": {},
51
+ "outputs": [],
52
+ "source": [
53
+ "# Filter adata based on defined read length statistics, using the plots from preprocessing part 1 to direct the input parameters here.\n",
54
+ "smf.pp.filter_reads_on_length(adata, filter_on_coordinates=[lower_bound, upper_bound], min_read_length=2700)"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": [
63
+ "# Filter adata on defined read methylation statistics\n",
64
+ "smf.pp.filter_converted_reads_on_methylation(adata, valid_SMF_site_threshold=0.8, min_SMF_threshold=0.025)"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": null,
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "# Run second part of preprocessing\n",
74
+ "duplicates = smf.pp.recipe_2_Kissiov_and_McKenna_2025(adata, output_directory, binary_layers)"
75
+ ]
76
+ }
77
+ ],
78
+ "metadata": {
79
+ "language_info": {
80
+ "name": "python"
81
+ }
82
+ },
83
+ "nbformat": 4,
84
+ "nbformat_minor": 2
85
+ }
@@ -0,0 +1,11 @@
1
+ Sample,Sample_names,MTase,Time (sec),Group
2
+ 0,,Hia5,0,0
3
+ 1,,Hia5,15,0
4
+ 2,,Hia5,30,0
5
+ 3,,Hia5,120,0
6
+ 4,,Hia5,300,0
7
+ 5,,Hia5,0,1
8
+ 6,,Hia5,15,1
9
+ 7,,Hia5,30,1
10
+ 8,,Hia5,120,1
11
+ 9,,Hia5,300,1
@@ -48,6 +48,7 @@ dependencies = [
48
48
  "pandas>=1.4.2",
49
49
  "pod5>=0.1.21",
50
50
  "pomegranate>1.0.0",
51
+ "pyfaidx>=0.8.0",
51
52
  "pysam>=0.19.1",
52
53
  "scanpy>=1.9",
53
54
  "scikit-learn>=1.0.2",
@@ -60,9 +61,10 @@ dynamic = ["version"]
60
61
 
61
62
  [project.urls]
62
63
  Source = "https://github.com/jkmckenna/smftools"
64
+ Documentation = "https://smftools.readthedocs.io/"
63
65
 
64
66
  [project.optional-dependencies]
65
- base_tests = [
67
+ tests = [
66
68
  "pytest",
67
69
  "pytest-cov"
68
70
  ]
@@ -91,16 +93,16 @@ packages = ["src/smftools"]
91
93
  path = "src/smftools/_version.py"
92
94
 
93
95
  [tool.pytest.ini_options]
96
+ addopts = [
97
+ "--import-mode=importlib",
98
+ "--strict-markers",
99
+ "--doctest-modules",
100
+ "--pyargs",
101
+ ]
94
102
  testpaths = ["tests"]
95
103
  pythonpath = ["src"]
96
104
  xfail_strict = true
97
- markers = [
98
- "internet: mark tests that requires internet access",
99
- "optional: mark optional tests",
100
- "private: mark tests that are private",
101
- ]
102
105
 
103
106
  [tool.coverage.run]
104
- branch = true
105
107
  source = ["smftools"]
106
108
  omit = ["tests/*"]
@@ -7,6 +7,7 @@ numpy>=1.22.0,<2
7
7
  pandas>=1.4.2
8
8
  pomegranate>1.0.0
9
9
  pod5>=0.1.21
10
+ pyfaidx>=0.8.0
10
11
  pysam>=0.19.1
11
12
  scanpy>=1.9
12
13
  scikit-learn>=1.0.2
@@ -0,0 +1,11 @@
1
+ Sample,Sample_names,MTase,Time (sec),Group
2
+ 0,,Hia5,0,0
3
+ 1,,Hia5,15,0
4
+ 2,,Hia5,30,0
5
+ 3,,Hia5,120,0
6
+ 4,,Hia5,300,0
7
+ 5,,Hia5,0,1
8
+ 6,,Hia5,15,1
9
+ 7,,Hia5,30,1
10
+ 8,,Hia5,120,1
11
+ 9,,Hia5,300,1
@@ -1,4 +1,5 @@
1
1
  from pathlib import Path
2
+ from typing import Union
2
3
 
3
4
  class SMFConfig:
4
5
  """\
@@ -8,9 +9,9 @@ class SMFConfig:
8
9
  def __init__(
9
10
  self,
10
11
  *,
11
- datasetdir: Path | str = "./datasets/"
12
+ datasetdir: Union[Path, str] = "./datasets/"
12
13
  ):
13
- self._datasetdir = Path(datasetdir) if isinstance(datasetdir, str) else datasetdir
14
+ self._datasetdir = Path(datasetdir) if isinstance(datasetdir, str) else datasetdir
14
15
 
15
16
  @property
16
17
  def datasetdir(self) -> Path:
@@ -0,0 +1 @@
1
+ __version__ = "0.1.3"
@@ -0,0 +1,5 @@
1
+ Sample,Sample_names,MTase,Time (min),Notes
2
+ barcode0001_sorted,Neither,M.CviPI,7.5,Cultured in IL2
3
+ barcode0002_sorted,BALBC,M.CviPI,7.5,Cultured in IL2
4
+ barcode0003_sorted,B6,M.CviPI,7.5,Cultured in IL2
5
+ barcode0004_sorted,Both,M.CviPI,7.5,Cultured in IL2
@@ -1,10 +1,9 @@
1
1
  ## datasets
2
2
 
3
- def import_deps():
3
+ def import_HERE():
4
4
  """
5
-
5
+ Returns HERE, the directory containing this module, which is used to locate the bundled dataset files.
6
6
  """
7
- import anndata as ad
8
7
  from pathlib import Path
9
8
  from .._settings import settings
10
9
  HERE = Path(__file__).parent
@@ -12,16 +11,18 @@ def import_deps():
12
11
 
13
12
  def dCas9_kinetics():
14
13
  """
15
-
14
+ in vitro Hia5 dCas9 kinetics SMF dataset. Nanopore HAC m6A modcalls.
16
15
  """
17
- HERE = import_deps()
16
+ import anndata as ad
17
+ HERE = import_HERE()
18
18
  filepath = HERE / "dCas9_m6A_invitro_kinetics.h5ad.gz"
19
19
  return ad.read_h5ad(filepath)
20
20
 
21
21
  def Kissiov_and_McKenna_2025():
22
22
  """
23
-
23
+ F1 Hybrid M.CviPI natural killer cell SMF. Nanopore canonical calls of NEB EMseq converted SMF gDNA.
24
24
  """
25
- HERE = import_deps()
25
+ import anndata as ad
26
+ HERE = import_HERE()
26
27
  filepath = HERE / "F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz"
27
28
  return ad.read_h5ad(filepath)
@@ -0,0 +1,14 @@
1
+ from . import helpers
2
+ from .load_adata import load_adata
3
+ from .subsample_fasta_from_bed import subsample_fasta_from_bed
4
+ from .subsample_pod5 import subsample_pod5
5
+ from .fast5_to_pod5 import fast5_to_pod5
6
+
7
+
8
+ __all__ = [
9
+ "load_adata",
10
+ "subsample_fasta_from_bed",
11
+ "subsample_pod5",
12
+ "fast5_to_pod5",
13
+ "helpers"
14
+ ]
@@ -18,7 +18,7 @@ def bam_conversion(fasta, output_directory, conversion_types, strands, basecalle
18
18
  Returns:
19
19
  None
20
20
  """
21
- from .helpers import align_and_sort_BAM, converted_BAM_to_adata, generate_converted_FASTA, split_and_index_BAM
21
+ from .helpers import align_and_sort_BAM, converted_BAM_to_adata, generate_converted_FASTA, split_and_index_BAM, make_dirs
22
22
  import os
23
23
  input_basecalled_basename = os.path.basename(basecalled_path)
24
24
  bam_basename = input_basecalled_basename.split(".")[0]
@@ -32,16 +32,28 @@ def bam_conversion(fasta, output_directory, conversion_types, strands, basecalle
32
32
  fasta_basename = os.path.basename(fasta)
33
33
  converted_FASTA_basename = fasta_basename.split('.fa')[0]+'_converted.fasta'
34
34
  converted_FASTA = os.path.join(output_directory, converted_FASTA_basename)
35
- if os.path.exists(converted_FASTA):
35
+ if 'converted.fa' in fasta:
36
+ print(fasta + ' is already converted. Using existing converted FASTA.')
37
+ converted_FASTA = fasta
38
+ elif os.path.exists(converted_FASTA):
36
39
  print(converted_FASTA + ' already exists. Using existing converted FASTA.')
37
40
  else:
38
41
  generate_converted_FASTA(fasta, conversion_types, strands, converted_FASTA)
39
42
 
40
43
  # 2) Align the basecalled file to the converted reference FASTA and sort the BAM by positional coordinates. Also make an index and a BED file of mapped reads.
41
- align_and_sort_BAM(converted_FASTA, basecalled_path, bam_suffix, output_directory)
44
+ aligned_output = aligned_BAM + bam_suffix
45
+ sorted_output = aligned_sorted_BAM + bam_suffix
46
+ if os.path.exists(aligned_output) and os.path.exists(sorted_output):
47
+ print(sorted_output + ' already exists. Using existing aligned/sorted BAM.')
48
+ else:
49
+ align_and_sort_BAM(converted_FASTA, basecalled_path, bam_suffix, output_directory)
42
50
 
43
51
  ### 3) Split the aligned and sorted BAM files by barcode (BC Tag) into the split_BAM directory###
44
- split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix)
52
+ if os.path.isdir(split_dir):
53
+ print(split_dir + ' already exists. Using existing aligned/sorted/split BAMs.')
54
+ else:
55
+ make_dirs([split_dir])
56
+ split_and_index_BAM(aligned_sorted_BAM, split_dir, bam_suffix, output_directory)
45
57
 
46
58
  # 4) Take the converted BAM and load it into an adata object.
47
59
  converted_BAM_to_adata(converted_FASTA, split_dir, mapping_threshold, experiment_name, conversion_types, bam_suffix)