smftools 0.2.3__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217) hide show
  1. {smftools-0.2.3 → smftools-0.2.4}/PKG-INFO +18 -12
  2. {smftools-0.2.3 → smftools-0.2.4}/README.md +17 -11
  3. smftools-0.2.4/smftools/_version.py +1 -0
  4. smftools-0.2.4/smftools/cli/helpers.py +48 -0
  5. smftools-0.2.4/smftools/cli/hmm_adata.py +361 -0
  6. {smftools-0.2.3 → smftools-0.2.4}/smftools/cli/load_adata.py +155 -95
  7. {smftools-0.2.3 → smftools-0.2.4}/smftools/cli/preprocess_adata.py +222 -130
  8. smftools-0.2.4/smftools/cli/spatial_adata.py +697 -0
  9. {smftools-0.2.3 → smftools-0.2.4}/smftools/cli_entry.py +4 -5
  10. {smftools-0.2.3 → smftools-0.2.4}/smftools/config/conversion.yaml +12 -5
  11. {smftools-0.2.3 → smftools-0.2.4}/smftools/config/deaminase.yaml +11 -9
  12. {smftools-0.2.3 → smftools-0.2.4}/smftools/config/default.yaml +123 -19
  13. {smftools-0.2.3 → smftools-0.2.4}/smftools/config/direct.yaml +3 -0
  14. {smftools-0.2.3 → smftools-0.2.4}/smftools/config/experiment_config.py +120 -19
  15. {smftools-0.2.3 → smftools-0.2.4}/smftools/hmm/HMM.py +12 -1
  16. {smftools-0.2.3 → smftools-0.2.4}/smftools/hmm/__init__.py +0 -6
  17. smftools-0.2.4/smftools/hmm/call_hmm_peaks.py +334 -0
  18. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/bam_functions.py +28 -29
  19. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/h5ad_functions.py +1 -1
  20. {smftools-0.2.3 → smftools-0.2.4}/smftools/plotting/general_plotting.py +97 -51
  21. {smftools-0.2.3 → smftools-0.2.4}/smftools/plotting/position_stats.py +3 -3
  22. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/__init__.py +2 -4
  23. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/append_base_context.py +34 -25
  24. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/append_binary_layer_by_base_context.py +2 -2
  25. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/binarize_on_Youden.py +10 -8
  26. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/calculate_complexity_II.py +1 -1
  27. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/calculate_coverage.py +16 -13
  28. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/calculate_position_Youden.py +41 -25
  29. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/calculate_read_modification_stats.py +1 -1
  30. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/filter_reads_on_length_quality_mapping.py +1 -1
  31. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/filter_reads_on_modification_thresholds.py +1 -1
  32. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/flag_duplicate_reads.py +1 -1
  33. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/invert_adata.py +1 -1
  34. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/load_sample_sheet.py +1 -1
  35. smftools-0.2.4/smftools/preprocessing/reindex_references_adata.py +37 -0
  36. {smftools-0.2.3 → smftools-0.2.4}/smftools/readwrite.py +94 -0
  37. smftools-0.2.3/smftools/_version.py +0 -1
  38. smftools-0.2.3/smftools/cli/hmm_adata.py +0 -338
  39. smftools-0.2.3/smftools/cli/spatial_adata.py +0 -564
  40. {smftools-0.2.3 → smftools-0.2.4}/.gitattributes +0 -0
  41. {smftools-0.2.3 → smftools-0.2.4}/.gitignore +0 -0
  42. {smftools-0.2.3 → smftools-0.2.4}/.readthedocs.yaml +0 -0
  43. {smftools-0.2.3 → smftools-0.2.4}/CONTRIBUTING.md +0 -0
  44. {smftools-0.2.3 → smftools-0.2.4}/LICENSE +0 -0
  45. {smftools-0.2.3 → smftools-0.2.4}/docs/Makefile +0 -0
  46. {smftools-0.2.3 → smftools-0.2.4}/docs/make.bat +0 -0
  47. {smftools-0.2.3 → smftools-0.2.4}/docs/source/_static/converted_BAM_to_adata.png +0 -0
  48. {smftools-0.2.3 → smftools-0.2.4}/docs/source/_static/modkit_extract_to_adata.png +0 -0
  49. {smftools-0.2.3 → smftools-0.2.4}/docs/source/_static/smftools-1.svg +0 -0
  50. {smftools-0.2.3 → smftools-0.2.4}/docs/source/_static/smftools-1.tif +0 -0
  51. {smftools-0.2.3 → smftools-0.2.4}/docs/source/_static/smftools_informatics_diagram.pdf +0 -0
  52. {smftools-0.2.3 → smftools-0.2.4}/docs/source/_static/smftools_informatics_diagram.png +0 -0
  53. {smftools-0.2.3 → smftools-0.2.4}/docs/source/_static/smftools_preprocessing_diagram.png +0 -0
  54. {smftools-0.2.3 → smftools-0.2.4}/docs/source/_templates/tmp +0 -0
  55. {smftools-0.2.3 → smftools-0.2.4}/docs/source/api/datasets.md +0 -0
  56. {smftools-0.2.3 → smftools-0.2.4}/docs/source/api/index.md +0 -0
  57. {smftools-0.2.3 → smftools-0.2.4}/docs/source/api/informatics.md +0 -0
  58. {smftools-0.2.3 → smftools-0.2.4}/docs/source/api/preprocessing.md +0 -0
  59. {smftools-0.2.3 → smftools-0.2.4}/docs/source/api/tools.md +0 -0
  60. {smftools-0.2.3 → smftools-0.2.4}/docs/source/basic_usage.md +0 -0
  61. {smftools-0.2.3 → smftools-0.2.4}/docs/source/conf.py +0 -0
  62. {smftools-0.2.3 → smftools-0.2.4}/docs/source/contributors.md +0 -0
  63. {smftools-0.2.3 → smftools-0.2.4}/docs/source/dev/index.md +0 -0
  64. {smftools-0.2.3 → smftools-0.2.4}/docs/source/index.md +0 -0
  65. {smftools-0.2.3 → smftools-0.2.4}/docs/source/installation.md +0 -0
  66. {smftools-0.2.3 → smftools-0.2.4}/docs/source/references.bib +0 -0
  67. {smftools-0.2.3 → smftools-0.2.4}/docs/source/references.rst +0 -0
  68. {smftools-0.2.3 → smftools-0.2.4}/docs/source/release-notes/0.1.0.md +0 -0
  69. {smftools-0.2.3 → smftools-0.2.4}/docs/source/release-notes/index.md +0 -0
  70. {smftools-0.2.3 → smftools-0.2.4}/docs/source/requirements.txt +0 -0
  71. {smftools-0.2.3 → smftools-0.2.4}/docs/source/tutorials/index.md +0 -0
  72. {smftools-0.2.3 → smftools-0.2.4}/experiment_config.csv +0 -0
  73. {smftools-0.2.3 → smftools-0.2.4}/notebooks/Kissiov_and_McKenna_2025_example_notebook.ipynb +0 -0
  74. {smftools-0.2.3 → smftools-0.2.4}/notebooks/Kissiov_and_McKenna_2025_sample_sheet.csv +0 -0
  75. {smftools-0.2.3 → smftools-0.2.4}/pyproject.toml +0 -0
  76. {smftools-0.2.3 → smftools-0.2.4}/requirements.txt +0 -0
  77. {smftools-0.2.3 → smftools-0.2.4}/sample_sheet.csv +0 -0
  78. {smftools-0.2.3 → smftools-0.2.4}/smftools/__init__.py +0 -0
  79. {smftools-0.2.3 → smftools-0.2.4}/smftools/_settings.py +0 -0
  80. {smftools-0.2.3 → smftools-0.2.4}/smftools/cli/__init__.py +0 -0
  81. {smftools-0.2.3/smftools/cli → smftools-0.2.4/smftools/cli/archived}/cli_flows.py +0 -0
  82. {smftools-0.2.3 → smftools-0.2.4}/smftools/config/__init__.py +0 -0
  83. {smftools-0.2.3 → smftools-0.2.4}/smftools/config/discover_input_files.py +0 -0
  84. {smftools-0.2.3 → smftools-0.2.4}/smftools/datasets/F1_hybrid_NKG2A_enhander_promoter_GpC_conversion_SMF.h5ad.gz +0 -0
  85. {smftools-0.2.3 → smftools-0.2.4}/smftools/datasets/F1_sample_sheet.csv +0 -0
  86. {smftools-0.2.3 → smftools-0.2.4}/smftools/datasets/__init__.py +0 -0
  87. {smftools-0.2.3 → smftools-0.2.4}/smftools/datasets/dCas9_m6A_invitro_kinetics.h5ad.gz +0 -0
  88. {smftools-0.2.3 → smftools-0.2.4}/smftools/datasets/datasets.py +0 -0
  89. {smftools-0.2.3/smftools/hmm → smftools-0.2.4/smftools/hmm/archived}/apply_hmm_batched.py +0 -0
  90. {smftools-0.2.3/smftools/hmm → smftools-0.2.4/smftools/hmm/archived}/calculate_distances.py +0 -0
  91. {smftools-0.2.3/smftools/hmm → smftools-0.2.4/smftools/hmm/archived}/call_hmm_peaks.py +0 -0
  92. {smftools-0.2.3/smftools/hmm → smftools-0.2.4/smftools/hmm/archived}/train_hmm.py +0 -0
  93. {smftools-0.2.3 → smftools-0.2.4}/smftools/hmm/display_hmm.py +0 -0
  94. {smftools-0.2.3 → smftools-0.2.4}/smftools/hmm/hmm_readwrite.py +0 -0
  95. {smftools-0.2.3 → smftools-0.2.4}/smftools/hmm/nucleosome_hmm_refinement.py +0 -0
  96. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/__init__.py +0 -0
  97. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/bam_conversion.py +0 -0
  98. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/bam_direct.py +0 -0
  99. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/basecall_pod5s.py +0 -0
  100. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/basecalls_to_adata.py +0 -0
  101. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/conversion_smf.py +0 -0
  102. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/deaminase_smf.py +0 -0
  103. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/direct_smf.py +0 -0
  104. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/fast5_to_pod5.py +0 -0
  105. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/__init__.py +0 -0
  106. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/align_and_sort_BAM.py +0 -0
  107. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/aligned_BAM_to_bed.py +0 -0
  108. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/bam_qc.py +0 -0
  109. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/bed_to_bigwig.py +0 -0
  110. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/canoncall.py +0 -0
  111. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/concatenate_fastqs_to_bam.py +0 -0
  112. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/converted_BAM_to_adata.py +0 -0
  113. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/count_aligned_reads.py +0 -0
  114. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/demux_and_index_BAM.py +0 -0
  115. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/extract_base_identities.py +0 -0
  116. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/extract_mods.py +0 -0
  117. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/extract_read_features_from_bam.py +0 -0
  118. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/extract_read_lengths_from_bed.py +0 -0
  119. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/extract_readnames_from_BAM.py +0 -0
  120. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/find_conversion_sites.py +0 -0
  121. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/generate_converted_FASTA.py +0 -0
  122. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/get_chromosome_lengths.py +0 -0
  123. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/get_native_references.py +0 -0
  124. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/index_fasta.py +0 -0
  125. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/informatics.py +0 -0
  126. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/load_adata.py +0 -0
  127. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/make_modbed.py +0 -0
  128. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/modQC.py +0 -0
  129. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/modcall.py +0 -0
  130. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/ohe_batching.py +0 -0
  131. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/ohe_layers_decode.py +0 -0
  132. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/one_hot_decode.py +0 -0
  133. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/one_hot_encode.py +0 -0
  134. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/plot_bed_histograms.py +0 -0
  135. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/separate_bam_by_bc.py +0 -0
  136. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/helpers/archived/split_and_index_BAM.py +0 -0
  137. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/print_bam_query_seq.py +0 -0
  138. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/subsample_fasta_from_bed.py +0 -0
  139. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/archived/subsample_pod5.py +0 -0
  140. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/basecalling.py +0 -0
  141. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/bed_functions.py +0 -0
  142. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/binarize_converted_base_identities.py +0 -0
  143. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/complement_base_list.py +0 -0
  144. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/converted_BAM_to_adata.py +0 -0
  145. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/fasta_functions.py +0 -0
  146. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/modkit_extract_to_adata.py +0 -0
  147. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/modkit_functions.py +0 -0
  148. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/ohe.py +0 -0
  149. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/pod5_functions.py +0 -0
  150. {smftools-0.2.3 → smftools-0.2.4}/smftools/informatics/run_multiqc.py +0 -0
  151. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/__init__.py +0 -0
  152. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/data/__init__.py +0 -0
  153. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/data/anndata_data_module.py +0 -0
  154. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/data/preprocessing.py +0 -0
  155. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/evaluation/__init__.py +0 -0
  156. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/evaluation/eval_utils.py +0 -0
  157. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/evaluation/evaluators.py +0 -0
  158. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/inference/__init__.py +0 -0
  159. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/inference/inference_utils.py +0 -0
  160. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/inference/lightning_inference.py +0 -0
  161. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/inference/sklearn_inference.py +0 -0
  162. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/inference/sliding_window_inference.py +0 -0
  163. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/models/__init__.py +0 -0
  164. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/models/base.py +0 -0
  165. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/models/cnn.py +0 -0
  166. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/models/lightning_base.py +0 -0
  167. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/models/mlp.py +0 -0
  168. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/models/positional.py +0 -0
  169. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/models/rnn.py +0 -0
  170. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/models/sklearn_models.py +0 -0
  171. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/models/transformer.py +0 -0
  172. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/models/wrappers.py +0 -0
  173. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/training/__init__.py +0 -0
  174. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/training/train_lightning_model.py +0 -0
  175. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/training/train_sklearn_model.py +0 -0
  176. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/utils/__init__.py +0 -0
  177. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/utils/device.py +0 -0
  178. {smftools-0.2.3 → smftools-0.2.4}/smftools/machine_learning/utils/grl.py +0 -0
  179. {smftools-0.2.3 → smftools-0.2.4}/smftools/plotting/__init__.py +0 -0
  180. {smftools-0.2.3 → smftools-0.2.4}/smftools/plotting/autocorrelation_plotting.py +0 -0
  181. {smftools-0.2.3 → smftools-0.2.4}/smftools/plotting/classifiers.py +0 -0
  182. {smftools-0.2.3 → smftools-0.2.4}/smftools/plotting/hmm_plotting.py +0 -0
  183. {smftools-0.2.3 → smftools-0.2.4}/smftools/plotting/qc_plotting.py +0 -0
  184. {smftools-0.2.3/smftools/preprocessing → smftools-0.2.4/smftools/preprocessing/archives}/add_read_length_and_mapping_qc.py +0 -0
  185. {smftools-0.2.3/smftools/preprocessing → smftools-0.2.4/smftools/preprocessing/archives}/calculate_complexity.py +0 -0
  186. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/archives/mark_duplicates.py +0 -0
  187. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/archives/preprocessing.py +0 -0
  188. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/archives/remove_duplicates.py +0 -0
  189. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/binarize.py +0 -0
  190. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/binary_layers_to_ohe.py +0 -0
  191. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/calculate_consensus.py +0 -0
  192. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/calculate_pairwise_differences.py +0 -0
  193. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/calculate_pairwise_hamming_distances.py +0 -0
  194. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/calculate_read_length_stats.py +0 -0
  195. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/clean_NaN.py +0 -0
  196. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/filter_adata_by_nan_proportion.py +0 -0
  197. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/make_dirs.py +0 -0
  198. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/min_non_diagonal.py +0 -0
  199. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/recipes.py +0 -0
  200. {smftools-0.2.3 → smftools-0.2.4}/smftools/preprocessing/subsample_adata.py +0 -0
  201. {smftools-0.2.3 → smftools-0.2.4}/smftools/tools/__init__.py +0 -0
  202. {smftools-0.2.3 → smftools-0.2.4}/smftools/tools/archived/apply_hmm.py +0 -0
  203. {smftools-0.2.3 → smftools-0.2.4}/smftools/tools/archived/classifiers.py +0 -0
  204. {smftools-0.2.3 → smftools-0.2.4}/smftools/tools/archived/classify_methylated_features.py +0 -0
  205. {smftools-0.2.3 → smftools-0.2.4}/smftools/tools/archived/classify_non_methylated_features.py +0 -0
  206. {smftools-0.2.3 → smftools-0.2.4}/smftools/tools/archived/subset_adata_v1.py +0 -0
  207. {smftools-0.2.3 → smftools-0.2.4}/smftools/tools/archived/subset_adata_v2.py +0 -0
  208. {smftools-0.2.3 → smftools-0.2.4}/smftools/tools/calculate_umap.py +0 -0
  209. {smftools-0.2.3 → smftools-0.2.4}/smftools/tools/cluster_adata_on_methylation.py +0 -0
  210. {smftools-0.2.3 → smftools-0.2.4}/smftools/tools/general_tools.py +0 -0
  211. {smftools-0.2.3 → smftools-0.2.4}/smftools/tools/position_stats.py +0 -0
  212. {smftools-0.2.3 → smftools-0.2.4}/smftools/tools/read_stats.py +0 -0
  213. {smftools-0.2.3 → smftools-0.2.4}/smftools/tools/spatial_autocorrelation.py +0 -0
  214. {smftools-0.2.3 → smftools-0.2.4}/smftools/tools/subset_adata.py +0 -0
  215. {smftools-0.2.3 → smftools-0.2.4}/tests/datasets/test_datasets.py +0 -0
  216. {smftools-0.2.3 → smftools-0.2.4}/tests/informatics/helpers/test_LoadExperimentConfig.py +0 -0
  217. {smftools-0.2.3 → smftools-0.2.4}/tests/test_readwrite.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: smftools
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: Single Molecule Footprinting Analysis in Python.
5
5
  Project-URL: Source, https://github.com/jkmckenna/smftools
6
6
  Project-URL: Documentation, https://smftools.readthedocs.io/
@@ -96,30 +96,36 @@ Description-Content-Type: text/markdown
96
96
  [![Docs](https://readthedocs.org/projects/smftools/badge/?version=latest)](https://smftools.readthedocs.io/en/latest/?badge=latest)
97
97
 
98
98
  # smftools
99
- A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, analysis, and visualization.
99
+ A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, spatial analyses, and HMM-based feature annotation.
100
100
 
101
101
  ## Philosophy
102
- While most genomic data structures handle low-coverage data (<100X) along large references, smftools prioritizes high-coverage data (scalable to >1,000,000X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
102
+ While genomic data structures (SAM/BAM) were built to handle low-coverage data (<1000X) along large references, smftools prioritizes high-coverage data (scalable to >1,000,000X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
103
103
 
104
104
  ## Dependencies
105
105
  The following CLI tools need to be installed and configured before using the informatics (smftools.inform) module of smftools:
106
106
  1) [Dorado](https://github.com/nanoporetech/dorado) -> Basecalling, alignment, demultiplexing.
107
107
  2) [Minimap2](https://github.com/lh3/minimap2) -> Alignment if not using dorado.
108
- 3) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting read level methylation metrics from modified BAM files.
108
+ 3) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting read level methylation metrics from modified BAM files. Only required for direct modification detection (i.e., methylation).
109
109
 
110
- ## Modules
111
- ### Informatics: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
110
+ ## Main Commands
111
+ ### smftools load: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
112
112
  ![](docs/source/_static/smftools_informatics_diagram.png)
113
- ### Preprocessing: Appends QC metrics to the AnnData object and performs filtering.
113
+ ### smftools preprocess: Appends QC metrics to the AnnData object and performs filtering.
114
114
  ![](docs/source/_static/smftools_preprocessing_diagram.png)
115
- ### Tools: Appends analyses to the AnnData object.
116
- - Currently Includes: Position X Position correlation matrices, Hidden Markov Model feature detection, clustering, dimensionality reduction, peak calling, train/test workflows for various ML classifiers.
117
- - To do: Additional ML methods for learning predictive single molecule features on condition labels: Autoencoders, Variational Autoencoders, Transformers.
118
- ### Plotting: Visualization of analyses stored within the AnnData object.
119
- - Most analyses appended to the adata object by a tools method have, or will have, an accompanying plotting method.
115
+ ### smftools spatial: Appends spatial analyses to the AnnData object.
116
+ - Currently Includes: Position X Position correlation matrices, clustering, dimensionality reduction, spatial autocorrelation.
117
+ ### smftools hmm: Fits a basic HMM to each sample and appends HMM feature layers
118
+ - Main outputs will be stored in adata.layers
119
+ ### smftools batch <command>: Performs batch processing on a csv of config file paths for any of the above commands.
120
+ - Nice when analyzing multiple experiments
121
+ ### smftools concatenate: Concatenates a list or directory of anndata objects.
122
+ - Mainly used for combining multiple experiments into a single anndata object.
120
123
 
121
124
  ## Announcements
122
125
 
126
+ ### 12/02/25 - Version 0.2.3 is available through PyPI
127
+ Version 0.2.3 provides the core smftools functionality through several command line commands (load, preprocess, spatial, hmm).
128
+
123
129
  ### 11/05/25 - Version 0.2.1 is available through PyPI
124
130
  Version 0.2.1 makes the core workflow (smftools load) a command line tool that takes in an experiment_config.csv file for input/output and parameter management.
125
131
 
@@ -2,30 +2,36 @@
2
2
  [![Docs](https://readthedocs.org/projects/smftools/badge/?version=latest)](https://smftools.readthedocs.io/en/latest/?badge=latest)
3
3
 
4
4
  # smftools
5
- A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, analysis, and visualization.
5
+ A Python tool for processing raw sequencing data derived from single molecule footprinting experiments into [anndata](https://anndata.readthedocs.io/en/latest/) objects. Additional functionality for preprocessing, spatial analyses, and HMM-based feature annotation.
6
6
 
7
7
  ## Philosophy
8
- While most genomic data structures handle low-coverage data (<100X) along large references, smftools prioritizes high-coverage data (scalable to >1,000,000X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
8
+ While genomic data structures (SAM/BAM) were built to handle low-coverage data (<1000X) along large references, smftools prioritizes high-coverage data (scalable to >1,000,000X coverage) of a few genomic loci at a time. This enables efficient data storage, rapid data operations, hierarchical metadata handling, seamless integration with various machine-learning packages, and ease of visualization. Furthermore, functionality is modularized, enabling analysis sessions to be saved, reloaded, and easily shared with collaborators. Analyses are centered around the [anndata](https://anndata.readthedocs.io/en/latest/) object, and are heavily inspired by the work conducted within the single-cell genomics community.
9
9
 
10
10
  ## Dependencies
11
11
  The following CLI tools need to be installed and configured before using the informatics (smftools.inform) module of smftools:
12
12
  1) [Dorado](https://github.com/nanoporetech/dorado) -> Basecalling, alignment, demultiplexing.
13
13
  2) [Minimap2](https://github.com/lh3/minimap2) -> Alignment if not using dorado.
14
- 3) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting read level methylation metrics from modified BAM files.
14
+ 3) [Modkit](https://github.com/nanoporetech/modkit) -> Extracting read level methylation metrics from modified BAM files. Only required for direct modification detection (i.e., methylation).
15
15
 
16
- ## Modules
17
- ### Informatics: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
16
+ ## Main Commands
17
+ ### smftools load: Processes raw Nanopore/Illumina data from SMF experiments into an AnnData object.
18
18
  ![](docs/source/_static/smftools_informatics_diagram.png)
19
- ### Preprocessing: Appends QC metrics to the AnnData object and performs filtering.
19
+ ### smftools preprocess: Appends QC metrics to the AnnData object and performs filtering.
20
20
  ![](docs/source/_static/smftools_preprocessing_diagram.png)
21
- ### Tools: Appends analyses to the AnnData object.
22
- - Currently Includes: Position X Position correlation matrices, Hidden Markov Model feature detection, clustering, dimensionality reduction, peak calling, train/test workflows for various ML classifiers.
23
- - To do: Additional ML methods for learning predictive single molecule features on condition labels: Autoencoders, Variational Autoencoders, Transformers.
24
- ### Plotting: Visualization of analyses stored within the AnnData object.
25
- - Most analyses appended to the adata object by a tools method have, or will have, an accompanying plotting method.
21
+ ### smftools spatial: Appends spatial analyses to the AnnData object.
22
+ - Currently Includes: Position X Position correlation matrices, clustering, dimensionality reduction, spatial autocorrelation.
23
+ ### smftools hmm: Fits a basic HMM to each sample and appends HMM feature layers
24
+ - Main outputs will be stored in adata.layers
25
+ ### smftools batch <command>: Performs batch processing on a csv of config file paths for any of the above commands.
26
+ - Nice when analyzing multiple experiments
27
+ ### smftools concatenate: Concatenates a list or directory of anndata objects.
28
+ - Mainly used for combining multiple experiments into a single anndata object.
26
29
 
27
30
  ## Announcements
28
31
 
32
+ ### 12/02/25 - Version 0.2.3 is available through PyPI
33
+ Version 0.2.3 provides the core smftools functionality through several command line commands (load, preprocess, spatial, hmm).
34
+
29
35
  ### 11/05/25 - Version 0.2.1 is available through PyPI
30
36
  Version 0.2.1 makes the core workflow (smftools load) a command line tool that takes in an experiment_config.csv file for input/output and parameter management.
31
37
 
@@ -0,0 +1 @@
1
+ __version__ = "0.2.4"
@@ -0,0 +1,48 @@
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+ import anndata as ad
4
+ from ..readwrite import safe_write_h5ad
5
+
6
+ @dataclass
7
+ class AdataPaths:
8
+ raw: Path
9
+ pp: Path
10
+ pp_dedup: Path
11
+ spatial: Path
12
+ hmm: Path
13
+
14
+
15
+ def get_adata_paths(cfg) -> AdataPaths:
16
+ """
17
+ Central helper: given cfg, compute all standard AnnData paths.
18
+ """
19
+ h5_dir = Path(cfg.output_directory) / "h5ads"
20
+
21
+ raw = h5_dir / f"{cfg.experiment_name}.h5ad.gz"
22
+
23
+ pp = h5_dir / f"{cfg.experiment_name}_preprocessed.h5ad.gz"
24
+
25
+ if cfg.smf_modality == "direct":
26
+ # direct SMF: duplicate-removed path is just preprocessed path
27
+ pp_dedup = pp
28
+ else:
29
+ pp_dedup = h5_dir / f"{cfg.experiment_name}_preprocessed_duplicates_removed.h5ad.gz"
30
+
31
+ pp_dedup_base = pp_dedup.name.removesuffix(".h5ad.gz")
32
+
33
+ spatial = h5_dir / f"{pp_dedup_base}_spatial.h5ad.gz"
34
+ hmm = h5_dir / f"{pp_dedup_base}_spatial_hmm.h5ad.gz"
35
+
36
+ return AdataPaths(
37
+ raw=raw,
38
+ pp=pp,
39
+ pp_dedup=pp_dedup,
40
+ spatial=spatial,
41
+ hmm=hmm,
42
+ )
43
+
44
+ def write_gz_h5ad(adata: ad.AnnData, path: Path) -> Path:
45
+ if path.suffix != ".gz":
46
+ path = path.with_name(path.name + ".gz")
47
+ safe_write_h5ad(adata, path, compression="gzip", backup=True)
48
+ return path
@@ -0,0 +1,361 @@
1
def hmm_adata(config_path):
    """
    High-level function to call for hmm analysis of an adata object.
    Command line accesses this through smftools hmm <config_path>

    The load, preprocess and spatial stages are invoked first with the same
    config path. The function then fits (or loads) one HMM per sample,
    reference and methylation base, applies it, copies the resulting layers
    into the full AnnData, calls HMM peaks, saves the annotated AnnData and
    writes the HMM plots.

    Parameters:
        config_path (str): A string representing the file path to the experiment configuration csv file.

    Returns:
        (pp_dedup_spatial_hmm_adata, pp_dedup_spatial_hmm_adata_path), or None
        when the spatial stage returned no AnnData and none of the stage files
        listed in the summary csv exists on disk.
    """
    import warnings
    from pathlib import Path

    import numpy as np
    import pandas as pd

    from ..readwrite import safe_read_h5ad, safe_write_h5ad, make_dirs, add_or_update_column_in_csv
    from .load_adata import load_adata
    from .preprocess_adata import preprocess_adata
    from .spatial_adata import spatial_adata

    ############################################### smftools load start ###############################################
    _, _, cfg = load_adata(config_path)
    # General config variable init - Necessary user passed inputs
    smf_modality = cfg.smf_modality  # conversion, deaminase or direct SMF
    output_directory = Path(cfg.output_directory)  # Path to the output directory to make for the analysis.

    # Make initial output directory
    make_dirs([output_directory])
    ############################################### smftools load end ###############################################

    ############################################### smftools preprocess start ###############################################
    # Only called for its effect; the paths are re-read from the summary file below.
    preprocess_adata(config_path)
    ############################################### smftools preprocess end ###############################################

    ############################################### smftools spatial start ###############################################
    spatial_ad, _ = spatial_adata(config_path)
    ############################################### smftools spatial end ###############################################

    ############################################### smftools hmm start ###############################################
    input_manager_df = pd.read_csv(cfg.summary_file)
    initial_adata_path = Path(input_manager_df['load_adata'][0])
    pp_dup_rem_adata_path = Path(input_manager_df['pp_dedup_adata'][0])
    spatial_adata_path = Path(input_manager_df['spatial_adata'][0])
    hmm_adata_path = Path(input_manager_df['hmm_adata'][0])

    # Compare against None explicitly: AnnData truthiness follows its length,
    # so an AnnData without observations would otherwise be treated as missing.
    if spatial_ad is not None:
        # This happens on first run of the pipeline
        adata = spatial_ad
    else:
        forcing_redo = cfg.force_redo_hmm_fit or cfg.force_redo_hmm_apply
        if forcing_redo:
            print("Forcing redo of hmm analysis workflow.")

        # Most processed stage first; the first file found on disk is used.
        candidate_paths = (
            hmm_adata_path,
            spatial_adata_path,
            pp_dup_rem_adata_path,
            initial_adata_path,
        )
        source_path = next((p for p in candidate_paths if p.exists()), None)
        if source_path is None:
            if forcing_redo:
                print("Can not redo hmm analysis when there is no compatible adata available.")
            else:
                print(f"No adata available.")
            return None
        adata, _ = safe_read_h5ad(source_path)

    references = adata.obs[cfg.reference_column].cat.categories
    deaminase = smf_modality == 'deaminase'
    uns_key = "hmm_appended_layers"

    # Needed by every later step, so it must exist even when the HMM fit/apply
    # section is bypassed.
    pp_dir = output_directory / "preprocessed" / "deduplicated"

    # ensure uns key exists (avoid KeyError later)
    if adata.uns.get(uns_key) is None:
        adata.uns[uns_key] = []

    ############################################### HMM based feature annotations ###############################################
    if not (cfg.bypass_hmm_fit and cfg.bypass_hmm_apply):
        from ..hmm.HMM import HMM
        from scipy.sparse import issparse

        hmm_dir = pp_dir / "10_hmm_models"
        if hmm_dir.is_dir():
            print(f'{hmm_dir} already exists.')
        else:
            make_dirs([pp_dir, hmm_dir])

        samples = adata.obs[cfg.sample_name_col_for_plotting].cat.categories

        redo_requested = cfg.force_redo_hmm_fit or cfg.force_redo_hmm_apply
        if redo_requested or not adata.uns.get('hmm_annotated', False):
            for sample in samples:
                for ref in references:
                    mask = (adata.obs[cfg.sample_name_col_for_plotting] == sample) & (adata.obs[cfg.reference_column] == ref)
                    subset = adata[mask].copy()
                    if subset.shape[0] < 1:
                        continue
                    # Row selector used to copy subset layers back into adata.
                    subset_mask_bool = mask.values if hasattr(mask, "values") else np.asarray(mask)

                    for mod_site in cfg.hmm_methbases:
                        mod_label = mod_site
                        hmm_path = hmm_dir / f"{sample}_{ref}_{mod_label}_hmm_model.pth"

                        # ensure the input obsm exists
                        obsm_key = f'{ref}_{mod_label}_site'
                        if obsm_key not in subset.obsm:
                            print(f"Skipping {sample} {ref} {mod_label}: missing obsm '{obsm_key}'")
                            continue

                        # Fit or load model
                        if hmm_path.exists() and not cfg.force_redo_hmm_fit:
                            hmm = HMM.load(hmm_path)
                            hmm.print_params()
                        else:
                            print(f"Fitting HMM for {sample} {ref} {mod_label}")
                            hmm = HMM.from_config(cfg)
                            # fit expects a list-of-seqs or 2D ndarray in the obsm
                            hmm.fit(subset.obsm[obsm_key])
                            hmm.print_params()
                            hmm.save(hmm_path)

                        # Apply / annotate on the subset, then copy layers back to adata
                        if cfg.bypass_hmm_apply:
                            continue

                        print(f"Applying HMM on subset for {sample} {ref} {mod_label}")
                        hmm_layer = cfg.output_binary_layer_name if smf_modality == "direct" else None
                        hmm.annotate_adata(
                            subset,
                            obs_column=cfg.reference_column,
                            layer=hmm_layer,
                            config=cfg,
                            force_redo=cfg.force_redo_hmm_apply,
                        )

                        if cfg.force_redo_hmm_apply or not adata.uns.get('hmm_annotated', False):
                            for layer_to_merge, merge_distance in cfg.hmm_merge_layer_features:
                                if layer_to_merge:
                                    hmm.merge_intervals_in_layer(
                                        subset,
                                        layer=layer_to_merge,
                                        distance_threshold=merge_distance,
                                        overwrite=True,
                                    )

                        # collect appended layers from subset.uns
                        appended = list(subset.uns.get(uns_key, []))
                        print(appended)
                        if not appended:
                            # nothing appended for this subset; continue
                            continue

                        # copy each appended layer into adata
                        for layer_name in appended:
                            if layer_name not in subset.layers:
                                # defensive: skip
                                warnings.warn(f"Expected layer {layer_name} in subset but not found; skipping copy.")
                                continue
                            sub_layer = subset.layers[layer_name]
                            # ensure final layer exists and assign rows
                            try:
                                hmm._ensure_final_layer_and_assign(adata, layer_name, subset_mask_bool, sub_layer)
                            except Exception as e:
                                # Best-effort fallback: densify and assign the rows directly.
                                warnings.warn(f"Failed to copy layer {layer_name} into adata: {e}", stacklevel=2)
                                arr = sub_layer.toarray() if issparse(sub_layer) else np.asarray(sub_layer)
                                adata.layers[layer_name] = adata.layers.get(
                                    layer_name,
                                    np.zeros((adata.shape[0], arr.shape[1]), dtype=arr.dtype),
                                )
                                final_idx = np.nonzero(subset_mask_bool)[0]
                                adata.layers[layer_name][final_idx, :] = arr

                        # merge appended layer names into adata.uns
                        existing = list(adata.uns.get(uns_key, []))
                        for ln in appended:
                            if ln not in existing:
                                existing.append(ln)
                        adata.uns[uns_key] = existing

    # NOTE(review): confirm that smftools.hmm exposes call_hmm_peaks as a
    # callable; if only the submodule is importable, this name binds a module.
    from ..hmm import call_hmm_peaks
    hmm_dir = pp_dir / "11_hmm_peak_calling"
    if not hmm_dir.is_dir():
        make_dirs([pp_dir, hmm_dir])

    call_hmm_peaks(
        adata,
        feature_configs=cfg.hmm_peak_feature_configs,
        ref_column=cfg.reference_column,
        site_types=cfg.mod_target_bases,
        save_plot=True,
        output_dir=hmm_dir,
        index_col_suffix=cfg.reindexed_var_suffix,
    )

    ## Save HMM annotated adata
    if not hmm_adata_path.exists():
        print('Saving hmm analyzed adata post preprocessing and duplicate removal')
        if hmm_adata_path.suffix != ".gz":
            hmm_adata_path = hmm_adata_path.with_name(hmm_adata_path.name + '.gz')
        safe_write_h5ad(adata, hmm_adata_path, compression='gzip', backup=True)

        add_or_update_column_in_csv(cfg.summary_file, "hmm_adata", hmm_adata_path)

    ########################################################################################################################

    ############################################### HMM based feature plotting ###############################################
    from ..plotting import combined_hmm_raw_clustermap
    hmm_dir = pp_dir / "12_hmm_clustermaps"
    make_dirs([pp_dir, hmm_dir])

    layers: list[str] = []

    for base in cfg.hmm_methbases:
        layers.extend([f"{base}_{layer}" for layer in cfg.hmm_clustermap_feature_layers])

    if cfg.cpg:
        layers.extend(["CpG_cpg_patch"])

    if not layers:
        raise ValueError(
            f"No HMM feature layers matched mod_target_bases={cfg.mod_target_bases} "
            f"and smf_modality={smf_modality}"
        )

    for layer in layers:
        hmm_cluster_save_dir = hmm_dir / layer
        if not hmm_cluster_save_dir.is_dir():
            make_dirs([hmm_cluster_save_dir])

        combined_hmm_raw_clustermap(
            adata,
            sample_col=cfg.sample_name_col_for_plotting,
            reference_col=cfg.reference_column,
            hmm_feature_layer=layer,
            layer_gpc=cfg.layer_for_clustermap_plotting,
            layer_cpg=cfg.layer_for_clustermap_plotting,
            layer_c=cfg.layer_for_clustermap_plotting,
            layer_a=cfg.layer_for_clustermap_plotting,
            cmap_hmm=cfg.clustermap_cmap_hmm,
            cmap_gpc=cfg.clustermap_cmap_gpc,
            cmap_cpg=cfg.clustermap_cmap_cpg,
            cmap_c=cfg.clustermap_cmap_c,
            cmap_a=cfg.clustermap_cmap_a,
            min_quality=cfg.read_quality_filter_thresholds[0],
            min_length=cfg.read_len_filter_thresholds[0],
            min_mapped_length_to_reference_length_ratio=cfg.read_len_to_ref_ratio_filter_thresholds[0],
            min_position_valid_fraction=1-cfg.position_max_nan_threshold,
            save_path=hmm_cluster_save_dir,
            normalize_hmm=False,
            sort_by=cfg.hmm_clustermap_sortby,  # options: 'gpc', 'cpg', 'gpc_cpg', 'none', or 'obs:<column>'
            bins=None,
            deaminase=deaminase,
            min_signal=0,
            index_col_suffix=cfg.reindexed_var_suffix,
        )

    hmm_dir = pp_dir / "13_hmm_bulk_traces"

    if hmm_dir.is_dir():
        print(f'{hmm_dir} already exists.')
    else:
        make_dirs([pp_dir, hmm_dir])
        from ..plotting import plot_hmm_layers_rolling_by_sample_ref
        bulk_hmm_layers = [layer for layer in adata.uns['hmm_appended_layers'] if "_lengths" not in layer]
        plot_hmm_layers_rolling_by_sample_ref(
            adata,
            layers=bulk_hmm_layers,
            sample_col=cfg.sample_name_col_for_plotting,
            ref_col=cfg.reference_column,
            window=101,
            rows_per_page=4,
            figsize_per_cell=(4,2.5),
            output_dir=hmm_dir,
            save=True,
            show_raw=False,
        )

    hmm_dir = pp_dir / "14_hmm_fragment_distributions"

    if hmm_dir.is_dir():
        print(f'{hmm_dir} already exists.')
    else:
        make_dirs([pp_dir, hmm_dir])
        from ..plotting import plot_hmm_size_contours

        # (length layer, maximum plotted length) pairs for each modality.
        fragments_by_modality = {
            'deaminase': [
                ('C_all_accessible_features_lengths', 400),
                ('C_all_footprint_features_lengths', 250),
                ('C_all_accessible_features_merged_lengths', 800),
            ],
            'conversion': [
                ('GpC_all_accessible_features_lengths', 400),
                ('GpC_all_footprint_features_lengths', 250),
                ('GpC_all_accessible_features_merged_lengths', 800),
            ],
            'direct': [
                ('A_all_accessible_features_lengths', 400),
                ('A_all_footprint_features_lengths', 200),
                ('A_all_accessible_features_merged_lengths', 800),
            ],
        }
        fragments = fragments_by_modality.get(smf_modality, [])
        if not fragments:
            print(
                f"No fragment length layers defined for smf_modality={smf_modality}; "
                "skipping fragment distribution plots."
            )

        for layer, length_cap in fragments:
            save_path = hmm_dir / layer
            make_dirs([save_path])

            plot_hmm_size_contours(
                adata,
                length_layer=layer,
                sample_col=cfg.sample_name_col_for_plotting,
                ref_obs_col=cfg.reference_column,
                rows_per_page=6,
                max_length_cap=length_cap,
                figsize_per_cell=(3.5, 2.2),
                save_path=save_path,
                save_pdf=False,
                save_each_page=True,
                dpi=200,
                smoothing_sigma=(10, 10),
                normalize_after_smoothing=True,
                cmap='Greens',
                log_scale_z=True,
            )
    ########################################################################################################################

    return (adata, hmm_adata_path)