smftools 0.1.7__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (255) hide show
  1. {smftools-0.1.7 → smftools-0.2.4}/PKG-INFO +30 -19
  2. {smftools-0.1.7 → smftools-0.2.4}/README.md +23 -17
  3. smftools-0.2.4/docs/source/basic_usage.md +114 -0
  4. {smftools-0.1.7 → smftools-0.2.4}/docs/source/installation.md +6 -3
  5. {smftools-0.1.7 → smftools-0.2.4}/experiment_config.csv +2 -7
  6. {smftools-0.1.7 → smftools-0.2.4}/pyproject.toml +10 -2
  7. {smftools-0.1.7 → smftools-0.2.4}/requirements.txt +7 -2
  8. {smftools-0.1.7 → smftools-0.2.4}/smftools/__init__.py +7 -6
  9. smftools-0.2.4/smftools/_version.py +1 -0
  10. smftools-0.2.4/smftools/cli/archived/cli_flows.py +94 -0
  11. smftools-0.2.4/smftools/cli/helpers.py +48 -0
  12. smftools-0.2.4/smftools/cli/hmm_adata.py +361 -0
  13. smftools-0.2.4/smftools/cli/load_adata.py +637 -0
  14. smftools-0.2.4/smftools/cli/preprocess_adata.py +455 -0
  15. smftools-0.2.4/smftools/cli/spatial_adata.py +697 -0
  16. smftools-0.2.4/smftools/cli_entry.py +434 -0
  17. smftools-0.2.4/smftools/config/__init__.py +1 -0
  18. smftools-0.2.4/smftools/config/conversion.yaml +45 -0
  19. smftools-0.2.4/smftools/config/deaminase.yaml +63 -0
  20. smftools-0.2.4/smftools/config/default.yaml +368 -0
  21. smftools-0.2.4/smftools/config/direct.yaml +44 -0
  22. smftools-0.2.4/smftools/config/discover_input_files.py +115 -0
  23. smftools-0.2.4/smftools/config/experiment_config.py +1389 -0
  24. smftools-0.2.4/smftools/hmm/HMM.py +1587 -0
  25. smftools-0.2.4/smftools/hmm/__init__.py +14 -0
  26. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/hmm/archived}/apply_hmm_batched.py +8 -7
  27. smftools-0.2.4/smftools/hmm/archived/call_hmm_peaks.py +106 -0
  28. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/hmm/archived}/train_hmm.py +1 -1
  29. smftools-0.2.4/smftools/hmm/call_hmm_peaks.py +334 -0
  30. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/hmm}/display_hmm.py +3 -3
  31. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/hmm}/nucleosome_hmm_refinement.py +2 -2
  32. smftools-0.2.4/smftools/informatics/__init__.py +20 -0
  33. smftools-0.2.4/smftools/informatics/archived/deaminase_smf.py +132 -0
  34. smftools-0.2.4/smftools/informatics/archived/fast5_to_pod5.py +43 -0
  35. smftools-0.2.4/smftools/informatics/archived/helpers/archived/__init__.py +71 -0
  36. smftools-0.2.4/smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +126 -0
  37. smftools-0.2.4/smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +87 -0
  38. smftools-0.2.4/smftools/informatics/archived/helpers/archived/bam_qc.py +213 -0
  39. smftools-0.2.4/smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +90 -0
  40. smftools-0.2.4/smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +259 -0
  41. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/count_aligned_reads.py +2 -2
  42. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/demux_and_index_BAM.py +8 -10
  43. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/extract_base_identities.py +30 -4
  44. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/extract_mods.py +15 -13
  45. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/extract_read_features_from_bam.py +4 -2
  46. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/find_conversion_sites.py +5 -4
  47. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/generate_converted_FASTA.py +2 -0
  48. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/get_chromosome_lengths.py +9 -8
  49. smftools-0.2.4/smftools/informatics/archived/helpers/archived/index_fasta.py +24 -0
  50. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/make_modbed.py +1 -2
  51. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/modQC.py +2 -2
  52. smftools-0.2.4/smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +250 -0
  53. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/separate_bam_by_bc.py +8 -7
  54. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/split_and_index_BAM.py +8 -12
  55. smftools-0.2.4/smftools/informatics/archived/subsample_fasta_from_bed.py +49 -0
  56. smftools-0.2.4/smftools/informatics/bam_functions.py +811 -0
  57. smftools-0.2.4/smftools/informatics/basecalling.py +67 -0
  58. smftools-0.2.4/smftools/informatics/bed_functions.py +366 -0
  59. smftools-0.2.4/smftools/informatics/binarize_converted_base_identities.py +172 -0
  60. smftools-0.1.7/smftools/informatics/helpers/converted_BAM_to_adata_II.py → smftools-0.2.4/smftools/informatics/converted_BAM_to_adata.py +198 -50
  61. smftools-0.2.4/smftools/informatics/fasta_functions.py +255 -0
  62. smftools-0.2.4/smftools/informatics/h5ad_functions.py +197 -0
  63. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics}/modkit_extract_to_adata.py +147 -61
  64. smftools-0.2.4/smftools/informatics/modkit_functions.py +129 -0
  65. smftools-0.2.4/smftools/informatics/ohe.py +160 -0
  66. smftools-0.2.4/smftools/informatics/pod5_functions.py +224 -0
  67. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics}/run_multiqc.py +5 -2
  68. smftools-0.2.4/smftools/machine_learning/__init__.py +12 -0
  69. smftools-0.2.4/smftools/machine_learning/data/__init__.py +2 -0
  70. smftools-0.2.4/smftools/machine_learning/data/anndata_data_module.py +234 -0
  71. smftools-0.2.4/smftools/machine_learning/evaluation/__init__.py +2 -0
  72. smftools-0.2.4/smftools/machine_learning/evaluation/eval_utils.py +31 -0
  73. smftools-0.2.4/smftools/machine_learning/evaluation/evaluators.py +223 -0
  74. smftools-0.2.4/smftools/machine_learning/inference/__init__.py +3 -0
  75. smftools-0.2.4/smftools/machine_learning/inference/inference_utils.py +27 -0
  76. smftools-0.2.4/smftools/machine_learning/inference/lightning_inference.py +68 -0
  77. smftools-0.2.4/smftools/machine_learning/inference/sklearn_inference.py +55 -0
  78. smftools-0.2.4/smftools/machine_learning/inference/sliding_window_inference.py +114 -0
  79. smftools-0.2.4/smftools/machine_learning/models/base.py +295 -0
  80. smftools-0.2.4/smftools/machine_learning/models/cnn.py +138 -0
  81. smftools-0.2.4/smftools/machine_learning/models/lightning_base.py +345 -0
  82. smftools-0.2.4/smftools/machine_learning/models/mlp.py +26 -0
  83. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/models/positional.py +3 -2
  84. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/models/rnn.py +2 -1
  85. smftools-0.2.4/smftools/machine_learning/models/sklearn_models.py +273 -0
  86. smftools-0.2.4/smftools/machine_learning/models/transformer.py +303 -0
  87. smftools-0.2.4/smftools/machine_learning/training/__init__.py +2 -0
  88. smftools-0.2.4/smftools/machine_learning/training/train_lightning_model.py +135 -0
  89. smftools-0.2.4/smftools/machine_learning/training/train_sklearn_model.py +114 -0
  90. {smftools-0.1.7 → smftools-0.2.4}/smftools/plotting/__init__.py +4 -1
  91. smftools-0.2.4/smftools/plotting/autocorrelation_plotting.py +609 -0
  92. smftools-0.2.4/smftools/plotting/general_plotting.py +1403 -0
  93. smftools-0.2.4/smftools/plotting/hmm_plotting.py +260 -0
  94. {smftools-0.1.7 → smftools-0.2.4}/smftools/plotting/position_stats.py +3 -3
  95. smftools-0.2.4/smftools/plotting/qc_plotting.py +270 -0
  96. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/__init__.py +15 -10
  97. smftools-0.2.4/smftools/preprocessing/append_base_context.py +131 -0
  98. smftools-0.2.4/smftools/preprocessing/append_binary_layer_by_base_context.py +143 -0
  99. smftools-0.2.4/smftools/preprocessing/archives/add_read_length_and_mapping_qc.py +129 -0
  100. smftools-0.2.4/smftools/preprocessing/binarize.py +17 -0
  101. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/binarize_on_Youden.py +11 -9
  102. smftools-0.2.4/smftools/preprocessing/calculate_complexity_II.py +248 -0
  103. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/calculate_coverage.py +25 -13
  104. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/calculate_position_Youden.py +42 -26
  105. smftools-0.2.4/smftools/preprocessing/calculate_read_modification_stats.py +101 -0
  106. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/clean_NaN.py +17 -1
  107. smftools-0.2.4/smftools/preprocessing/filter_reads_on_length_quality_mapping.py +158 -0
  108. smftools-0.2.4/smftools/preprocessing/filter_reads_on_modification_thresholds.py +352 -0
  109. smftools-0.2.4/smftools/preprocessing/flag_duplicate_reads.py +1351 -0
  110. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/invert_adata.py +12 -5
  111. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/load_sample_sheet.py +19 -4
  112. smftools-0.2.4/smftools/preprocessing/reindex_references_adata.py +37 -0
  113. smftools-0.2.4/smftools/readwrite.py +1224 -0
  114. smftools-0.2.4/smftools/tools/__init__.py +20 -0
  115. {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/calculate_umap.py +5 -5
  116. {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/general_tools.py +3 -3
  117. smftools-0.2.4/smftools/tools/position_stats.py +601 -0
  118. smftools-0.2.4/smftools/tools/read_stats.py +184 -0
  119. smftools-0.2.4/smftools/tools/spatial_autocorrelation.py +562 -0
  120. smftools-0.1.7/docs/source/basic_usage.md +0 -75
  121. smftools-0.1.7/smftools/_version.py +0 -1
  122. smftools-0.1.7/smftools/informatics/__init__.py +0 -16
  123. smftools-0.1.7/smftools/informatics/fast5_to_pod5.py +0 -21
  124. smftools-0.1.7/smftools/informatics/helpers/LoadExperimentConfig.py +0 -75
  125. smftools-0.1.7/smftools/informatics/helpers/__init__.py +0 -74
  126. smftools-0.1.7/smftools/informatics/helpers/align_and_sort_BAM.py +0 -59
  127. smftools-0.1.7/smftools/informatics/helpers/aligned_BAM_to_bed.py +0 -74
  128. smftools-0.1.7/smftools/informatics/helpers/bam_qc.py +0 -66
  129. smftools-0.1.7/smftools/informatics/helpers/bed_to_bigwig.py +0 -39
  130. smftools-0.1.7/smftools/informatics/helpers/binarize_converted_base_identities.py +0 -79
  131. smftools-0.1.7/smftools/informatics/helpers/concatenate_fastqs_to_bam.py +0 -55
  132. smftools-0.1.7/smftools/informatics/helpers/index_fasta.py +0 -12
  133. smftools-0.1.7/smftools/informatics/helpers/plot_read_length_and_coverage_histograms.py +0 -53
  134. smftools-0.1.7/smftools/informatics/load_adata.py +0 -182
  135. smftools-0.1.7/smftools/informatics/readwrite.py +0 -106
  136. smftools-0.1.7/smftools/informatics/subsample_fasta_from_bed.py +0 -47
  137. smftools-0.1.7/smftools/plotting/general_plotting.py +0 -205
  138. smftools-0.1.7/smftools/preprocessing/append_C_context.py +0 -82
  139. smftools-0.1.7/smftools/preprocessing/calculate_converted_read_methylation_stats.py +0 -94
  140. smftools-0.1.7/smftools/preprocessing/filter_converted_reads_on_methylation.py +0 -44
  141. smftools-0.1.7/smftools/preprocessing/filter_reads_on_length.py +0 -51
  142. smftools-0.1.7/smftools/preprocessing/flag_duplicate_reads.py +0 -149
  143. smftools-0.1.7/smftools/preprocessing/make_dirs.py +0 -21
  144. smftools-0.1.7/smftools/readwrite.py +0 -198
  145. smftools-0.1.7/smftools/tools/__init__.py +0 -49
  146. smftools-0.1.7/smftools/tools/call_hmm_peaks.py +0 -105
  147. smftools-0.1.7/smftools/tools/data/__init__.py +0 -2
  148. smftools-0.1.7/smftools/tools/data/anndata_data_module.py +0 -90
  149. smftools-0.1.7/smftools/tools/inference/__init__.py +0 -1
  150. smftools-0.1.7/smftools/tools/inference/lightning_inference.py +0 -41
  151. smftools-0.1.7/smftools/tools/models/base.py +0 -14
  152. smftools-0.1.7/smftools/tools/models/cnn.py +0 -34
  153. smftools-0.1.7/smftools/tools/models/lightning_base.py +0 -41
  154. smftools-0.1.7/smftools/tools/models/mlp.py +0 -17
  155. smftools-0.1.7/smftools/tools/models/sklearn_models.py +0 -40
  156. smftools-0.1.7/smftools/tools/models/transformer.py +0 -133
  157. smftools-0.1.7/smftools/tools/position_stats.py +0 -239
  158. smftools-0.1.7/smftools/tools/read_stats.py +0 -70
  159. smftools-0.1.7/smftools/tools/training/__init__.py +0 -1
  160. smftools-0.1.7/smftools/tools/training/train_lightning_model.py +0 -47
  161. {smftools-0.1.7 → smftools-0.2.4}/.gitattributes +0 -0
  162. {smftools-0.1.7 → smftools-0.2.4}/.gitignore +0 -0
  163. {smftools-0.1.7 → smftools-0.2.4}/.readthedocs.yaml +0 -0
  164. {smftools-0.1.7 → smftools-0.2.4}/CONTRIBUTING.md +0 -0
  165. {smftools-0.1.7 → smftools-0.2.4}/LICENSE +0 -0
  166. {smftools-0.1.7 → smftools-0.2.4}/docs/Makefile +0 -0
  167. {smftools-0.1.7 → smftools-0.2.4}/docs/make.bat +0 -0
  168. {smftools-0.1.7 → smftools-0.2.4}/docs/source/_static/converted_BAM_to_adata.png +0 -0
  169. {smftools-0.1.7 → smftools-0.2.4}/docs/source/_static/modkit_extract_to_adata.png +0 -0
  170. {smftools-0.1.7 → smftools-0.2.4}/docs/source/_static/smftools-1.svg +0 -0
  171. {smftools-0.1.7 → smftools-0.2.4}/docs/source/_static/smftools-1.tif +0 -0
  172. {smftools-0.1.7 → smftools-0.2.4}/docs/source/_static/smftools_informatics_diagram.pdf +0 -0
  173. {smftools-0.1.7 → smftools-0.2.4}/docs/source/_static/smftools_informatics_diagram.png +0 -0
  174. {smftools-0.1.7 → smftools-0.2.4}/docs/source/_static/smftools_preprocessing_diagram.png +0 -0
  175. {smftools-0.1.7 → smftools-0.2.4}/docs/source/_templates/tmp +0 -0
  176. {smftools-0.1.7 → smftools-0.2.4}/docs/source/api/datasets.md +0 -0
  177. {smftools-0.1.7 → smftools-0.2.4}/docs/source/api/index.md +0 -0
  178. {smftools-0.1.7 → smftools-0.2.4}/docs/source/api/informatics.md +0 -0
  179. {smftools-0.1.7 → smftools-0.2.4}/docs/source/api/preprocessing.md +0 -0
  180. {smftools-0.1.7 → smftools-0.2.4}/docs/source/api/tools.md +0 -0
  181. {smftools-0.1.7 → smftools-0.2.4}/docs/source/conf.py +0 -0
  182. {smftools-0.1.7 → smftools-0.2.4}/docs/source/contributors.md +0 -0
  183. {smftools-0.1.7 → smftools-0.2.4}/docs/source/dev/index.md +0 -0
  184. {smftools-0.1.7 → smftools-0.2.4}/docs/source/index.md +0 -0
  185. {smftools-0.1.7 → smftools-0.2.4}/docs/source/references.bib +0 -0
  186. {smftools-0.1.7 → smftools-0.2.4}/docs/source/references.rst +0 -0
  187. {smftools-0.1.7 → smftools-0.2.4}/docs/source/release-notes/0.1.0.md +0 -0
  188. {smftools-0.1.7 → smftools-0.2.4}/docs/source/release-notes/index.md +0 -0
  189. {smftools-0.1.7 → smftools-0.2.4}/docs/source/requirements.txt +0 -0
  190. {smftools-0.1.7 → smftools-0.2.4}/docs/source/tutorials/index.md +0 -0
  191. {smftools-0.1.7 → smftools-0.2.4}/notebooks/Kissiov_and_McKenna_2025_example_notebook.ipynb +0 -0
  192. {smftools-0.1.7 → smftools-0.2.4}/notebooks/Kissiov_and_McKenna_2025_sample_sheet.csv +0 -0
  193. {smftools-0.1.7 → smftools-0.2.4}/sample_sheet.csv +0 -0
  194. {smftools-0.1.7 → smftools-0.2.4}/smftools/_settings.py +0 -0
  195. {smftools-0.1.7/smftools/tools/evaluation → smftools-0.2.4/smftools/cli}/__init__.py +0 -0
  196. {smftools-0.1.7 → smftools-0.2.4}/smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  197. {smftools-0.1.7 → smftools-0.2.4}/smftools/datasets/F1_sample_sheet.csv +0 -0
  198. {smftools-0.1.7 → smftools-0.2.4}/smftools/datasets/__init__.py +0 -0
  199. {smftools-0.1.7 → smftools-0.2.4}/smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  200. {smftools-0.1.7 → smftools-0.2.4}/smftools/datasets/datasets.py +0 -0
  201. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/hmm/archived}/calculate_distances.py +0 -0
  202. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/hmm}/hmm_readwrite.py +0 -0
  203. {smftools-0.1.7 → smftools-0.2.4}/smftools/informatics/archived/bam_conversion.py +0 -0
  204. {smftools-0.1.7 → smftools-0.2.4}/smftools/informatics/archived/bam_direct.py +0 -0
  205. {smftools-0.1.7/smftools/informatics → smftools-0.2.4/smftools/informatics/archived}/basecall_pod5s.py +0 -0
  206. {smftools-0.1.7 → smftools-0.2.4}/smftools/informatics/archived/basecalls_to_adata.py +0 -0
  207. {smftools-0.1.7/smftools/informatics → smftools-0.2.4/smftools/informatics/archived}/conversion_smf.py +0 -0
  208. {smftools-0.1.7/smftools/informatics → smftools-0.2.4/smftools/informatics/archived}/direct_smf.py +0 -0
  209. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/canoncall.py +0 -0
  210. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/converted_BAM_to_adata.py +0 -0
  211. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/extract_read_lengths_from_bed.py +0 -0
  212. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/extract_readnames_from_BAM.py +0 -0
  213. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/get_native_references.py +0 -0
  214. {smftools-0.1.7/smftools/informatics → smftools-0.2.4/smftools/informatics/archived}/helpers/archived/informatics.py +0 -0
  215. {smftools-0.1.7/smftools/informatics → smftools-0.2.4/smftools/informatics/archived}/helpers/archived/load_adata.py +0 -0
  216. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/modcall.py +0 -0
  217. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/ohe_batching.py +0 -0
  218. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/ohe_layers_decode.py +0 -0
  219. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/one_hot_decode.py +0 -0
  220. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics/archived/helpers/archived}/one_hot_encode.py +0 -0
  221. {smftools-0.1.7 → smftools-0.2.4}/smftools/informatics/archived/print_bam_query_seq.py +0 -0
  222. {smftools-0.1.7/smftools/informatics → smftools-0.2.4/smftools/informatics/archived}/subsample_pod5.py +0 -0
  223. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/informatics}/complement_base_list.py +0 -0
  224. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/data/preprocessing.py +0 -0
  225. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/models/__init__.py +0 -0
  226. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/models/wrappers.py +0 -0
  227. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/utils/__init__.py +0 -0
  228. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/utils/device.py +0 -0
  229. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/machine_learning}/utils/grl.py +0 -0
  230. {smftools-0.1.7 → smftools-0.2.4}/smftools/plotting/classifiers.py +0 -0
  231. {smftools-0.1.7/smftools/preprocessing → smftools-0.2.4/smftools/preprocessing/archives}/calculate_complexity.py +0 -0
  232. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/archives/mark_duplicates.py +0 -0
  233. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/archives/preprocessing.py +0 -0
  234. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/archives/remove_duplicates.py +0 -0
  235. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/binary_layers_to_ohe.py +0 -0
  236. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/calculate_consensus.py +0 -0
  237. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/calculate_pairwise_differences.py +0 -0
  238. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -0
  239. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/calculate_read_length_stats.py +0 -0
  240. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/filter_adata_by_nan_proportion.py +0 -0
  241. {smftools-0.1.7/smftools/informatics/helpers → smftools-0.2.4/smftools/preprocessing}/make_dirs.py +0 -0
  242. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/min_non_diagonal.py +0 -0
  243. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/recipes.py +0 -0
  244. {smftools-0.1.7 → smftools-0.2.4}/smftools/preprocessing/subsample_adata.py +0 -0
  245. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/tools/archived}/apply_hmm.py +0 -0
  246. {smftools-0.1.7/smftools/tools → smftools-0.2.4/smftools/tools/archived}/classifiers.py +0 -0
  247. {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/archived/classify_methylated_features.py +0 -0
  248. {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/archived/classify_non_methylated_features.py +0 -0
  249. {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/archived/subset_adata_v1.py +0 -0
  250. {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/archived/subset_adata_v2.py +0 -0
  251. {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/cluster_adata_on_methylation.py +0 -0
  252. {smftools-0.1.7 → smftools-0.2.4}/smftools/tools/subset_adata.py +0 -0
  253. {smftools-0.1.7 → smftools-0.2.4}/tests/datasets/test_datasets.py +0 -0
  254. {smftools-0.1.7 → smftools-0.2.4}/tests/informatics/helpers/test_LoadExperimentConfig.py +0 -0
  255. {smftools-0.1.7 → smftools-0.2.4}/tests/test_readwrite.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: smftools
3
- Version: 0.1.7
3
+ Version: 0.2.4
4
4
  Summary: Single Molecule Footprinting Analysis in Python.
5
5
  Project-URL: Source, https://github.com/jkmckenna/smftools
6
6
  Project-URL: Documentation, https://smftools.readthedocs.io/
@@ -43,9 +43,11 @@ Classifier: Programming Language :: Python :: 3.11
43
43
  Classifier: Programming Language :: Python :: 3.12
44
44
  Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
45
45
  Classifier: Topic :: Scientific/Engineering :: Visualization
46
- Requires-Python: >=3.9
46
+ Requires-Python: <3.13,>=3.9
47
47
  Requires-Dist: anndata>=0.10.0
48
48
  Requires-Dist: biopython>=1.79
49
+ Requires-Dist: captum
50
+ Requires-Dist: click
49
51
  Requires-Dist: fastcluster
50
52
  Requires-Dist: hydra-core
51
53
  Requires-Dist: igraph
@@ -57,15 +59,18 @@ Requires-Dist: numpy<2,>=1.22.0
57
59
  Requires-Dist: omegaconf
58
60
  Requires-Dist: pandas>=1.4.2
59
61
  Requires-Dist: pod5>=0.1.21
60
- Requires-Dist: pomegranate>=1.0.0
62
+ Requires-Dist: pybedtools>=0.12.0
63
+ Requires-Dist: pybigwig>=0.3.24
61
64
  Requires-Dist: pyfaidx>=0.8.0
62
65
  Requires-Dist: pysam>=0.19.1
63
66
  Requires-Dist: scanpy>=1.9
64
67
  Requires-Dist: scikit-learn>=1.0.2
65
68
  Requires-Dist: scipy>=1.7.3
66
69
  Requires-Dist: seaborn>=0.11
70
+ Requires-Dist: shap
67
71
  Requires-Dist: torch>=1.9.0
68
72
  Requires-Dist: tqdm
73
+ Requires-Dist: upsetplot
69
74
  Requires-Dist: wandb
70
75
  Provides-Extra: docs
71
76
  Requires-Dist: ipython>=7.20; extra == 'docs'
@@ -91,33 +96,39 @@ Description-Content-Type: text/markdown
91
96
  [![Docs](https://readthedocs.org/projects/smftools/badge/?version=latest)](https://smftools.readthedocs.io/en/latest/?badge=latest)
92
97
 
93
98
  # smftools
94
- A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, analysis, and visualization.
99
+ A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, spatial analyses, and HMM based feature annotation.
95
100
 
96
101
  ## Philosophy
97
- While most genomic data structures handle low-coverage data (<100X) along large references, smftools prioritizes high-coverage data (scalable to >1,000,000X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
102
+ While genomic data structures (SAM/BAM) were built to handle low-coverage data (<1000X) along large references, smftools prioritizes high-coverage data (scalable to >1,000,000X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
98
103
 
99
104
  ## Dependencies
100
105
  The following CLI tools need to be installed and configured before using the informatics (smftools.inform) module of smftools:
101
- 1) [Dorado](https://github.com/nanoporetech/dorado) -> For standard/modified basecalling and alignment. Can be attained by downloading and configuring nanopore MinKnow software.
102
- 2) [Samtools](https://github.com/samtools/samtools) -> For working with SAM/BAM files
103
- 3) [Minimap2](https://github.com/lh3/minimap2) -> The aligner used by Dorado
104
- 4) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting summary statistics and read level methylation calls from modified BAM files
105
- 5) [Bedtools](https://github.com/arq5x/bedtools2) -> For generating Bedgraphs from BAM alignment files.
106
- 6) [BedGraphToBigWig](https://genome.ucsc.edu/goldenPath/help/bigWig.html) -> For converting BedGraphs to BigWig files for IGV sessions.
106
+ 1) [Dorado](https://github.com/nanoporetech/dorado) -> Basecalling, alignment, demultiplexing.
107
+ 2) [Minimap2](https://github.com/lh3/minimap2) -> Alignment if not using dorado.
108
+ 3) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting read level methylation metrics from modified BAM files. Only required for direct modification detection (i.e., methylation).
107
109
 
108
- ## Modules
109
- ### Informatics: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
110
+ ## Main Commands
111
+ ### smftools load: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
110
112
  ![](docs/source/_static/smftools_informatics_diagram.png)
111
- ### Preprocessing: Appends QC metrics to the AnnData object and performs filtering.
113
+ ### smftools preprocess: Appends QC metrics to the AnnData object and performs filtering.
112
114
  ![](docs/source/_static/smftools_preprocessing_diagram.png)
113
- ### Tools: Appends analyses to the AnnData object.
114
- - Currently Includes: Position X Position correlation matrices, Hidden Markov Model feature detection, clustering, dimensionality reduction, peak calling, train/test workflows for various ML classifiers.
115
- - To do: Additional ML methods for learning predictive single molecule features on condition labels: Autoencoders, Variational Autoencoders, Transformers.
116
- ### Plotting: Visualization of analyses stored within the AnnData object.
117
- - Most analyses appended to the adata object by a tools method have, or will have, an accompanying plotting method.
115
+ ### smftools spatial: Appends spatial analyses to the AnnData object.
116
+ - Currently Includes: Position X Position correlation matrices, clustering, dimensionality reduction, spatial autocorrelation.
117
+ ### smftools hmm: Fits a basic HMM to each sample and appends HMM feature layers
118
+ - Main outputs will be stored in adata.layers
119
+ ### smftools batch <command>: Performs batch processing on a csv of config file paths for any of the above commands.
120
+ - Nice when analyzing multiple experiments
121
+ ### smftools concatenate: Concatenates a list or directory of anndata objects.
122
+ - Mainly used for combining multiple experiments into a single anndata object.
118
123
 
119
124
  ## Announcements
120
125
 
126
+ ### 12/02/25 - Version 0.2.3 is available through PyPI
127
+ Version 0.2.3 provides the core smftools functionality through several command line commands (load, preprocess, spatial, hmm).
128
+
129
+ ### 11/05/25 - Version 0.2.1 is available through PyPI
130
+ Version 0.2.1 makes the core workflow (smftools load) a command line tool that takes in an experiment_config.csv file for input/output and parameter management.
131
+
121
132
  ### 05/29/25 - Version 0.1.6 is available through PyPI.
122
133
  Informatics, preprocessing, tools, plotting modules have core functionality that is approaching stability on MacOS(Intel/Silicon) and Linux(Ubuntu). I will work on improving documentation/tutorials shortly. The base PyTorch/Scikit-Learn ML-infrastructure is going through some organizational changes to work with PyTorch Lightning, Hydra, and WanDB to facilitate organizational scaling, multi-device usage, and logging.
123
134
 
@@ -2,33 +2,39 @@
2
2
  [![Docs](https://readthedocs.org/projects/smftools/badge/?version=latest)](https://smftools.readthedocs.io/en/latest/?badge=latest)
3
3
 
4
4
  # smftools
5
- A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, analysis, and visualization.
5
+ A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, spatial analyses, and HMM based feature annotation.
6
6
 
7
7
  ## Philosophy
8
- While most genomic data structures handle low-coverage data (<100X) along large references, smftools prioritizes high-coverage data (scalable to >1,000,000X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
8
+ While genomic data structures (SAM/BAM) were built to handle low-coverage data (<1000X) along large references, smftools prioritizes high-coverage data (scalable to >1,000,000X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
9
9
 
10
10
  ## Dependencies
11
11
  The following CLI tools need to be installed and configured before using the informatics (smftools.inform) module of smftools:
12
- 1) [Dorado](https://github.com/nanoporetech/dorado) -> For standard/modified basecalling and alignment. Can be attained by downloading and configuring nanopore MinKnow software.
13
- 2) [Samtools](https://github.com/samtools/samtools) -> For working with SAM/BAM files
14
- 3) [Minimap2](https://github.com/lh3/minimap2) -> The aligner used by Dorado
15
- 4) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting summary statistics and read level methylation calls from modified BAM files
16
- 5) [Bedtools](https://github.com/arq5x/bedtools2) -> For generating Bedgraphs from BAM alignment files.
17
- 6) [BedGraphToBigWig](https://genome.ucsc.edu/goldenPath/help/bigWig.html) -> For converting BedGraphs to BigWig files for IGV sessions.
18
-
19
- ## Modules
20
- ### Informatics: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
12
+ 1) [Dorado](https://github.com/nanoporetech/dorado) -> Basecalling, alignment, demultiplexing.
13
+ 2) [Minimap2](https://github.com/lh3/minimap2) -> Alignment if not using dorado.
14
+ 3) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting read level methylation metrics from modified BAM files. Only required for direct modification detection (ie methylation).
15
+
16
+ ## Main Commands
17
+ ### smftools load: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
21
18
  ![](docs/source/_static/smftools_informatics_diagram.png)
22
- ### Preprocessing: Appends QC metrics to the AnnData object and performs filtering.
19
+ ### smftools preprocess: Appends QC metrics to the AnnData object and performs filtering.
23
20
  ![](docs/source/_static/smftools_preprocessing_diagram.png)
24
- ### Tools: Appends analyses to the AnnData object.
25
- - Currently Includes: Position X Position correlation matrices, Hidden Markov Model feature detection, clustering, dimensionality reduction, peak calling, train/test workflows for various ML classifiers.
26
- - To do: Additional ML methods for learning predictive single molecule features on condition labels: Autoencoders, Variational Autoencoders, Transformers.
27
- ### Plotting: Visualization of analyses stored within the AnnData object.
28
- - Most analyses appended to the adata object by a tools method have, or will have, an accompanying plotting method.
21
+ ### smftools spatial: Appends spatial analyses to the AnnData object.
22
+ - Currently Includes: Position X Position correlation matrices, clustering, dimensionality reduction, spatial autocorrelation.
23
+ ### smftools hmm: Fits a basic HMM to each sample and appends HMM feature layers
24
+ - Main outputs will be stored in adata.layers
25
+ ### smftools batch <command>: Performs batch processing on a csv of config file paths for any of the above commands.
26
+ - Nice when analyzing multiple experiments
27
+ ### smftools concatenate: Concatenates a list or directory of anndata objects.
28
+ - Mainly used for combining multiple experiments into a single anndata object.
29
29
 
30
30
  ## Announcements
31
31
 
32
+ ### 12/02/25 - Version 0.2.3 is available through PyPI
33
+ Version 0.2.3 provides the core smftools functionality through several command line commands (load, preprocess, spatial, hmm).
34
+
35
+ ### 11/05/25 - Version 0.2.1 is available through PyPI
36
+ Version 0.2.1 makes the core workflow (smftools load) a command line tool that takes in an experiment_config.csv file for input/output and parameter management.
37
+
32
38
  ### 05/29/25 - Version 0.1.6 is available through PyPI.
33
39
  Informatics, preprocessing, tools, plotting modules have core functionality that is approaching stability on MacOS(Intel/Silicon) and Linux(Ubuntu). I will work on improving documentation/tutorials shortly. The base PyTorch/Scikit-Learn ML-infrastructure is going through some organizational changes to work with PyTorch Lightning, Hydra, and WanDB to facilitate organizational scaling, multi-device usage, and logging.
34
40
 
@@ -0,0 +1,114 @@
1
+ # Basic Usage
2
+
3
+ ## Load Usage
4
+
5
+ Many use cases for smftools begin here. For most users, the call below will be sufficient to convert any raw SMF dataset from Nanopore/Illumina to an AnnData object:
6
+
7
+ ```shell
8
+ smftools load "/Path_to_experiment_config.csv"
9
+ ```
10
+
11
+ This command takes a user passed config file handling:
12
+ - I/O paths (data input path, FASTA path, optional BED path for subsampling the FASTA, and a data output path)
13
+ - Experiment info (SMF modality, sequencer type, barcoding kit if nanopore, sample sheet with metadata mapping)
14
+ - Options to override default workflow parameters from smftools/config. Params are handled from default.yaml -> modality_type.yaml -> user passed config.csv.
15
+
16
+ ## Preprocess Usage
17
+
18
+ This command performs preprocessing on the anndata object. It automatically runs the load command under the hood if starting from raw data.
19
+
20
+ ```shell
21
+ smftools preprocess "/Path_to_experiment_config.csv"
22
+ ```
23
+
24
+ ## Spatial Usage
25
+
26
+ This command performs spatial analysis on the anndata object. It automatically runs the load command and preprocessing under the hood if they have not been already run.
27
+
28
+ ```shell
29
+ smftools spatial "/Path_to_experiment_config.csv"
30
+ ```
31
+
32
+ ## HMM Usage
33
+
34
+ This command performs hmm based feature annotation on the anndata object. It automatically runs the load command and preprocessing under the hood if they have not been already run.
35
+
36
+ ```shell
37
+ smftools hmm "/Path_to_experiment_config.csv"
38
+ ```
39
+
40
+ ## Batch Usage
41
+
42
+ This command performs batch processing of any of the above commands across multiple experiments. It takes in a tsv, txt, or csv of experiment specific config csvs.
43
+ ```shell
44
+ smftools batch preprocess "/Path_to_experiment_config_path_list.csv"
45
+ ```
46
+
47
+ ## Concatenate Usage
48
+
49
+ This command concatenates multiple h5ad files and saves them to a new output. The h5ads to concatenate are provided as a txt, tsv, or csv file of paths.
50
+ ```shell
51
+ smftools concatenate output.h5ad "/Path_to_h5ad_path_list.csv"
52
+ ```
53
+
54
+ ## Reading AnnData objects created by smftools
55
+
56
+ After creating an AnnData object holding your experiment's SMF data, you can load the AnnData object as so:
57
+
58
+ ```
59
+ import smftools as smf
60
+ input_adata = "/Path_to_experiment_AnnData.h5ad.gz"
61
+ adata = smf.safe_read_h5ad(input_adata)
62
+ ```
63
+
64
+ This custom read function can also take an optional directory of pickle files, written by the safe_write_h5ad function, that holds data types which cannot normally be saved directly in HDF5 format.
65
+
66
+
67
+ If you don't have an AnnData object yet, but want to play with the downstream Preprocessing, Tools, and Plotting modules, you can load a pre-loaded SMF dataset.
68
+
69
+ Currently, you can do this with our lab's in vitro dCas9 binding kinetics dataset generated from a Hia5 SMF dataset generated with direct m6A high accuracy basecalls:
70
+
71
+ ```
72
+ adata = smf.datasets.dCas9_kinetics()
73
+ adata.obs_names_make_unique()
74
+ ```
75
+
76
+ Alternatively, you can do this with our lab's M.CviPI SMF test data in F1-hybrid natural killer cells generated by NEB EMseq conversion followed by canonical basecalling:
77
+
78
+ ```
79
+ adata = smf.datasets.Kissiov_and_McKenna_2025()
80
+ adata.obs_names_make_unique()
81
+ ```
82
+
83
+ ## Writing out AnnData objects to save analysis progress
84
+
85
+ After preprocessing and downstream analysis of the AnnData object, you can save the AnnData object at any step as so:
86
+
87
+ ```
88
+ import smftools as smf
89
+ from pathlib import Path
90
+
91
+ output_dir = Path('/Path_to_output_directory')
92
+ output_adata = 'analyzed_adata.h5ad.gz'
93
+ final_output_path = output_dir / output_adata
94
+ smf.safe_write_h5ad(adata, final_output_path, compression='gzip')
95
+ ```
96
+
97
+ This custom save function will make a directory of pickle files for data types that can not normally be saved directly in hdf5 formatting.
98
+
99
+ ## Troubleshooting
100
+ For more advanced usage and help troubleshooting, the API and tutorials for each of the modules are still being developed.
101
+ However, you can currently learn about the functions contained within the module by calling:
102
+
103
+ ```
104
+ smf.inform.__all__
105
+ ```
106
+
107
+ This lists the functions within any given module. If you want to see the associated docstring for a given function, here is an example:
108
+
109
+ ```
110
+ print(smf.inform.load_adata.__doc__)
111
+ ```
112
+
113
+ These docstrings will provide a brief description of the function and also tell you the input parameters and what the function returns.
114
+ In some cases, usage examples will also be provided in the docstring in the form of doctests.
@@ -16,7 +16,7 @@ conda activate smftools
16
16
  pip install smftools
17
17
  ```
18
18
 
19
- Ensure that you can access dorado, samtools, modkit, bedtools, and BedGraphtoBigWig executables from the terminal in this environment. These are all necessary for the functionality within the Informatics module.
19
+ Ensure that you can access dorado, modkit, and minimap2 executables from the terminal in this environment.
20
20
  You may need to add them to $PATH if they are not globally configured.
21
21
  For example, if you want to check if dorado is executable, simply run this in the terminal:
22
22
 
@@ -24,10 +24,10 @@ For example, if you want to check if dorado is executable, simply run this in th
24
24
  dorado
25
25
  ```
26
26
 
27
- On Mac OSX, the following can be used to congigure bedtools (with brew) and BedGraphToBigWig (with wget). Change the BedGraphToBigWig link to include the correct architecture for your OS.
27
+ On Mac OSX, the following can be used to configure minimap2 (with brew) and BedGraphToBigWig (with wget).
28
28
 
29
29
  ```shell
30
- brew install bedtools
30
+ brew install minimap2
31
31
  wget http://hgdownload.soe.ucsc.edu/admin/exe/macOSX.x86_64/bedGraphToBigWig
32
32
  chmod +x bedGraphToBigWig
33
33
  sudo mv bedGraphToBigWig /usr/local/bin/
@@ -47,7 +47,10 @@ A python virtual environment can be created as an alternative to conda. I like t
47
47
  ```shell
48
48
  python -m venv venv-smftools
49
49
  source venv-smftools/bin/activate
50
+ pip install --upgrade pip
50
51
  pip install .
52
+ pip install ipykernel jupyter
53
+ python -m ipykernel install --user --name=venv-smftools --display-name "Python (smftools)"
51
54
  ```
52
55
 
53
56
  Subsequent use of the installed version of smftools can be run by changing to the smftools directory and activating the venv:
@@ -5,15 +5,10 @@ fasta,/path_to_fasta.fasta,Path to initial FASTA file,,str
5
5
  fasta_regions_of_interest,/path_to_bed.bed,Path to a bed file to subsample the fasta on.,,str
6
6
  output_directory,/outputs,Directory to act as root for all analysis outputs,,str
7
7
  experiment_name,,An experiment name for the final h5ad file,,str
8
+ model_dir,/path_to_dorado_model_dir,Path to the directory containing dorado basecalling models,,str
8
9
  model,None,The dorado basecalling model to use,,str
9
10
  barcode_kit,SQK-NBD114-24,The barcoding kit used for the experiment,,str
10
11
  mapping_threshold,0.05,Minimum proportion of reads mapping to a reference to further use that reference (Ranges from 0-1 as a proportion of mapped reads),,float
11
- filter_threshold,0.8,Minimum probability to call a canonical base identity,,float
12
- m6A_threshold,0.8,Minimum probability to flag m6A as True,,float
13
- m5C_threshold,0.8,Minimum probability to flag m5C as True,,float
14
- hm5C_threshold,0.8,Minimum probability to flag hm5C as True,,float
15
12
  mod_list,[5mC_5hmC],Modified base names for Dorado,"""6mA"", ""5mC_5hmC""",list
16
13
  batch_size,4,number of samples to analyze at a time,,int
17
- conversion_types,[5mC],Types of modification types to use in conversion SMF,"5mC', '6mA'",list
18
- barcode_both_ends,TRUE,whether to require both ends of a read to be barcoded for demultiplexing,,bool
19
- trim,FALSE,whether to trim barcodes and adapters from reads during demultiplexing,,bool
14
+ conversion_types,[5mC],Types of modification types to use in conversion SMF,"5mC', '6mA'",list
@@ -5,7 +5,7 @@ build-backend = "hatchling.build"
5
5
  [project]
6
6
  name = "smftools"
7
7
  description = "Single Molecule Footprinting Analysis in Python."
8
- requires-python = ">=3.9"
8
+ requires-python = ">=3.9,<3.13"
9
9
  license = { file = "LICENSE" }
10
10
  authors = [
11
11
  {name = "Joseph McKenna"}
@@ -42,6 +42,8 @@ classifiers = [
42
42
  dependencies = [
43
43
  "anndata>=0.10.0",
44
44
  "biopython>=1.79",
45
+ "captum",
46
+ "click",
45
47
  "fastcluster",
46
48
  "hydra-core",
47
49
  "igraph",
@@ -53,15 +55,18 @@ dependencies = [
53
55
  "omegaconf",
54
56
  "pandas>=1.4.2",
55
57
  "pod5>=0.1.21",
56
- "pomegranate>=1.0.0",
57
58
  "pyfaidx>=0.8.0",
59
+ "pybedtools>=0.12.0",
60
+ "pyBigWig>=0.3.24",
58
61
  "pysam>=0.19.1",
59
62
  "scanpy>=1.9",
60
63
  "scikit-learn>=1.0.2",
61
64
  "scipy>=1.7.3",
65
+ "shap",
62
66
  "seaborn>=0.11",
63
67
  "torch>=1.9.0",
64
68
  "tqdm",
69
+ "upsetplot",
65
70
  "wandb"
66
71
  ]
67
72
  dynamic = ["version"]
@@ -70,6 +75,9 @@ dynamic = ["version"]
70
75
  Source = "https://github.com/jkmckenna/smftools"
71
76
  Documentation = "https://smftools.readthedocs.io/"
72
77
 
78
+ [project.scripts]
79
+ smftools = "smftools.cli_entry:cli"
80
+
73
81
  [project.optional-dependencies]
74
82
  tests = [
75
83
  "pytest",
@@ -1,6 +1,8 @@
1
1
  # Essential packages
2
2
  anndata>=0.10.0
3
3
  biopython>=1.79
4
+ captum
5
+ click
4
6
  fastcluster
5
7
  hydra-core
6
8
  leidenalg
@@ -14,13 +16,16 @@ numpy>=1.22.0,<2
14
16
  omegaconf
15
17
  pandas>=1.4.2
16
18
  pod5>=0.1.21
17
- pomegranate>=1.0.0
19
+ pybedtools>=0.12.0
20
+ pyBigWig>=0.3.24
18
21
  pyfaidx>=0.8.0
19
22
  pysam>=0.19.1
20
- scanpy>=1.9
23
+ scanpy>=1.11
21
24
  scikit-learn>=1.0.2
22
25
  scipy>=1.7.3
23
26
  seaborn>=0.11
27
+ shap
24
28
  torch>=1.9.0
25
29
  tqdm
30
+ upsetplot
26
31
  wandb
@@ -4,12 +4,13 @@ import logging
4
4
  import warnings
5
5
 
6
6
  from . import informatics as inform
7
+ from . import machine_learning as ml
8
+ from . import plotting as pl
7
9
  from . import preprocessing as pp
8
10
  from . import tools as tl
9
- from . import plotting as pl
10
- from . import readwrite, datasets
11
- from .readwrite import adata_to_df, safe_write_h5ad, merge_barcoded_anndatas
12
11
 
12
+ from . import cli, config, datasets, hmm
13
+ from .readwrite import adata_to_df, safe_write_h5ad, safe_read_h5ad, merge_barcoded_anndatas_core
13
14
 
14
15
  from importlib.metadata import version
15
16
 
@@ -19,11 +20,11 @@ __version__ = version(package_name)
19
20
# Public names re-exported by the package.
# Every entry carries its own trailing comma: Python silently concatenates
# adjacent string literals, so a missing comma would fuse neighbouring entries
# (e.g. "datasets" "safe_write_h5ad") into a single, non-existent name.
__all__ = [
    "adata_to_df",
    "inform",
    "ml",
    "pp",
    "tl",
    "pl",
    "datasets",
    "safe_write_h5ad",
    "safe_read_h5ad",
]
@@ -0,0 +1 @@
1
+ __version__ = "0.2.4"
@@ -0,0 +1,94 @@
1
def flow_I(config_path):
    """
    High-level function to call for converting raw sequencing data to an adata object.
    Command line accesses this through smftools load <config_path>
    Works for nanopore pod5, fast5, and unaligned modBAM data types for direct SMF workflows.
    Works for nanopore pod5, fast5, unaligned BAM for conversion SMF workflows.
    Also works for illumina fastq and unaligned BAM for conversion SMF workflows.

    The flow reads the experiment config, creates the output directory, and then
    runs the load, preprocess and spatial stages in that order on the same config.

    Parameters:
        config_path (str): A string representing the file path to the experiment configuration csv file.

    Returns:
        None
    """
    from datetime import datetime
    from importlib import resources
    from pathlib import Path

    from ..config import LoadExperimentConfig, ExperimentConfig
    from ..readwrite import make_dirs
    from .load_adata import load_adata
    from .preprocess_adata import preprocess_adata
    from .spatial_adata import spatial_adata

    date_str = datetime.today().strftime("%y%m%d")

    ################################### 1) General params and input organization ###################################
    # Load experiment config parameters
    loader = LoadExperimentConfig(config_path)
    defaults_dir = resources.files("smftools").joinpath("config")
    cfg, report = ExperimentConfig.from_var_dict(loader.var_dict, date_str=date_str, defaults_dir=defaults_dir)

    # General config variable init - Necessary user passed inputs.
    # Most of these locals are not consumed further down in this archived flow;
    # they are kept as a record of the config fields the pipeline relies on.
    smf_modality = cfg.smf_modality  # conversion SMF, direct methylation detection SMF, or deaminase SMF. Necessary.
    input_data_path = Path(cfg.input_data_path)  # Path to a directory of POD5s/FAST5s or to a BAM/FASTQ file. Necessary.
    output_directory = Path(cfg.output_directory)  # Path to the output directory to make for the analysis. Necessary.
    fasta = Path(cfg.fasta)  # Path to reference FASTA. Necessary.
    split_dir = Path(cfg.split_dir)  # Relative path to directory for demultiplexing reads
    split_path = output_directory / split_dir  # Absolute path to directory for demultiplexing reads

    # Make initial output directory
    make_dirs([output_directory])

    bam_suffix = cfg.bam_suffix
    strands = cfg.strands

    # General config variable init - Optional user passed inputs for enzyme base specificity
    mod_target_bases = cfg.mod_target_bases  # Nucleobases of interest that may be modified. ['GpC', 'CpG', 'C', 'A']

    # Conversion/deamination specific variable init
    conversion_types = cfg.conversion_types  # 5mC
    conversions = cfg.conversions

    # Common Anndata accession params
    reference_column = cfg.reference_column

    # If conversion_types is passed, combine into a new list. An augmented
    # assignment (+=) on a list would extend cfg.conversions itself in place.
    if conversion_types:
        conversions = [*conversions, *conversion_types]

    ############################################### smftools load start ###############################################
    initial_adata, initial_adata_path = load_adata(config_path)

    # Accessory data sits next to the h5ad file, so build from the parent directory.
    initial_backup_dir = initial_adata_path.parent / 'adata_accessory_data'
    ############################################### smftools load end ###############################################

    ############################################### smftools preprocess start ###############################################
    # The paths returned by the stage are used as-is rather than being rebuilt here.
    # NOTE(review): assumes both returned paths are pathlib.Path objects - confirm against preprocess_adata.
    pp_adata, pp_adata_path, pp_dedup_adata, pp_dup_rem_adata_path = preprocess_adata(config_path)

    pp_backup_dir = pp_adata_path.parent / 'pp_adata_accessory_data'
    pp_dup_rem_backup_dir = pp_adata_path.parent / 'pp_dup_rem_adata_accessory_data'
    ############################################### smftools preprocess end ###############################################

    ############################################### smftools spatial start ###############################################
    # Strip the full ".h5ad.gz" extension (with_suffix("") would only drop ".gz")
    # and place the derived file beside the duplicate-removed file, not beneath it.
    dedup_stem = pp_dup_rem_adata_path.name.removesuffix(".h5ad.gz")
    basic_analyzed_adata_basename = dedup_stem + '_analyzed_I.h5ad.gz'
    basic_analyzed_adata_path = pp_dup_rem_adata_path.parent / basic_analyzed_adata_basename
    basic_analyzed_backup_dir = pp_dup_rem_adata_path.parent / 'duplicate_removed_analyzed_adata_I_accessory_data'

    # Bind the result to its own name so the imported spatial_adata function is not shadowed.
    spatial_result, spatial_adata_path = spatial_adata(config_path)
    ############################################### smftools spatial end ###############################################
@@ -0,0 +1,48 @@
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+ import anndata as ad
4
+ from ..readwrite import safe_write_h5ad
5
+
6
@dataclass
class AdataPaths:
    """
    Container for the standard AnnData file locations of one experiment.

    Instances are built by get_adata_paths. The field order below is part of
    the interface because the dataclass can be constructed positionally.
    """

    # AnnData named after the experiment only
    raw: Path
    # "_preprocessed" AnnData
    pp: Path
    # duplicate-removed AnnData (get_adata_paths reuses `pp` for direct SMF)
    pp_dedup: Path
    # "_spatial" AnnData derived from the pp_dedup name
    spatial: Path
    # "_spatial_hmm" AnnData derived from the pp_dedup name
    hmm: Path
13
+
14
+
15
def get_adata_paths(cfg) -> AdataPaths:
    """
    Derive every standard AnnData file location from an experiment config.

    Reads cfg.output_directory, cfg.experiment_name and cfg.smf_modality.
    All files are placed in the "h5ads" folder under the output directory, and
    the spatial and hmm names are built on top of the duplicate-removed name.

    Returns:
        AdataPaths: the raw, preprocessed, duplicate-removed, spatial and hmm paths.
    """
    out_dir = Path(cfg.output_directory) / "h5ads"
    stem = cfg.experiment_name

    preprocessed = out_dir / f"{stem}_preprocessed.h5ad.gz"

    # direct SMF: duplicate-removed path is just preprocessed path
    deduplicated = (
        preprocessed
        if cfg.smf_modality == "direct"
        else out_dir / f"{stem}_preprocessed_duplicates_removed.h5ad.gz"
    )

    # Drop the whole ".h5ad.gz" extension before appending stage suffixes.
    dedup_stem = deduplicated.name.removesuffix(".h5ad.gz")

    return AdataPaths(
        raw=out_dir / f"{stem}.h5ad.gz",
        pp=preprocessed,
        pp_dedup=deduplicated,
        spatial=out_dir / f"{dedup_stem}_spatial.h5ad.gz",
        hmm=out_dir / f"{dedup_stem}_spatial_hmm.h5ad.gz",
    )
43
+
44
def write_gz_h5ad(adata: ad.AnnData, path: Path) -> Path:
    """
    Write `adata` with gzip compression and return the path that was written.

    If `path` does not already end in ".gz", that extension is appended to the
    file name first. The write itself is delegated to safe_write_h5ad with
    compression="gzip" and backup=True.
    """
    already_gz = path.suffix == ".gz"
    target = path if already_gz else path.with_name(path.name + ".gz")
    safe_write_h5ad(adata, target, compression="gzip", backup=True)
    return target